From aec58b48517c911fbdf2beebba46a347e5910072 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 15 May 2025 14:46:30 +0200
Subject: bugs/core: Extend __WARN_FLAGS() with the 'cond_str' parameter

Push the new parameter down into every architecture that defines __WARN_FLAGS():

  arm64
  loongarch
  parisc
  powerpc
  riscv
  s390
  sh
  x86

Don't pass anything substantial down yet, just propagate the
new parameter with empty strings, without generating it or
using it.

( The string is never NULL, so it can be concatenated at the
  preprocessor level. )

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-2-mingo@kernel.org
---
 include/asm-generic/bug.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 387720933973..af76e4a04b16 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -100,17 +100,18 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 		instrumentation_end();					\
 	} while (0)
 #else
-#define __WARN()		__WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
+#define __WARN()		__WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN))
 #define __WARN_printf(taint, arg...) do {				\
 		instrumentation_begin();				\
 		__warn_printk(arg);					\
-		__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+		__WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
 		instrumentation_end();					\
 	} while (0)
 #define WARN_ON_ONCE(condition) ({				\
 	int __ret_warn_on = !!(condition);			\
 	if (unlikely(__ret_warn_on))				\
-		__WARN_FLAGS(BUGFLAG_ONCE |			\
+		__WARN_FLAGS("",				\
+			     BUGFLAG_ONCE |			\
 			     BUGFLAG_TAINT(TAINT_WARN));	\
 	unlikely(__ret_warn_on);				\
 })
-- 
cgit v1.2.3


From 3bc3c9c3ab6df45a3a3389f74000f8bec1bc96e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 15 May 2025 14:46:31 +0200
Subject: bugs/core: Pass down the condition string of WARN_ON_ONCE(cond)
 warnings to __WARN_FLAGS()

Doing this will allow architecture code to store and print out
this information as part of the WARN_ON and BUG_ON facilities.

The format of the string is '[condition] ' (with a trailing space), e.g.:

  WARN_ON_ONCE(idx < 0 && ptr);

will get the '[idx < 0 && ptr] ' string literal passed down as 'cond_str'
in __WARN_FLAGS().

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-3-mingo@kernel.org
---
 include/asm-generic/bug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index af76e4a04b16..c8e7126bc26e 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -110,7 +110,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #define WARN_ON_ONCE(condition) ({				\
 	int __ret_warn_on = !!(condition);			\
 	if (unlikely(__ret_warn_on))				\
-		__WARN_FLAGS("",				\
+		__WARN_FLAGS("["#condition"] ",			\
 			     BUGFLAG_ONCE |			\
 			     BUGFLAG_TAINT(TAINT_WARN));	\
 	unlikely(__ret_warn_on);				\
-- 
cgit v1.2.3


From 687fac9d1b00fb10421fdd455d60543cc46e42d0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 15 May 2025 14:46:32 +0200
Subject: bugs/core: Introduce the CONFIG_DEBUG_BUGVERBOSE_DETAILED Kconfig
 switch

Allow configurability of the inclusion of more detailed
WARN_ON() strings, to be implemented in subsequent
commits.

Since the full cost will be around 100K more memory on
an x86 defconfig, disable it by default.

Provide the WARN_CONDITION_STR() macro to allow the conditional
passing of extra strings to lower level BUG/WARN handlers.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-4-mingo@kernel.org
---
 include/asm-generic/bug.h |  6 ++++++
 lib/Kconfig.debug         | 10 ++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index c8e7126bc26e..2d9f61346dab 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -17,6 +17,12 @@
 #define BUG_GET_TAINT(bug)	((bug)->flags >> 8)
 #endif
 
+#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
+# define WARN_CONDITION_STR(cond_str) cond_str
+#else
+# define WARN_CONDITION_STR(cond_str)
+#endif
+
 #ifndef __ASSEMBLY__
 #include <linux/panic.h>
 #include <linux/printk.h>
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebe33181b6e6..ef00752a2b67 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -206,6 +206,16 @@ config DEBUG_BUGVERBOSE
 	  of the BUG call as well as the EIP and oops trace.  This aids
 	  debugging but costs about 70-100K of memory.
 
+config DEBUG_BUGVERBOSE_DETAILED
+	bool "Verbose WARN_ON_ONCE() reporting (adds 100K)" if DEBUG_BUGVERBOSE
+	help
+	  Say Y here to make WARN_ON_ONCE() output the condition string of the
+	  warning, in addition to the file name and line number.
+	  This helps debugging, but costs about 100K of memory.
+
+	  Say N if unsure.
+
+
 endmenu # "printk and dmesg options"
 
 config DEBUG_KERNEL
-- 
cgit v1.2.3


From 548fe51740d0f3294e548f654c099e5aefbf4cb7 Mon Sep 17 00:00:00 2001
From: Madhav Bhatt <madhav.bhatt@amd.com>
Date: Thu, 17 Apr 2025 02:45:43 -0700
Subject: firmware: xilinx: Add debugfs support for PM_GET_NODE_STATUS

Add a new debugfs interface for PM_GET_NODE_STATUS to query node
information such as status, requirements and usage.

The debugfs firmware driver interface is only meant for testing and
debugging EEMI APIs. Hence, it is disabled by default in production
systems.

Signed-off-by: Madhav Bhatt <madhav.bhatt@amd.com>
Link: https://lore.kernel.org/r/20250417094543.3873507-1-madhav.bhatt@amd.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp-debug.c | 13 +++++++++++
 drivers/firmware/xilinx/zynqmp.c       | 41 +++++++++++++++++++++++++++++++++-
 include/linux/firmware/xlnx-zynqmp.h   | 12 +++++++++-
 3 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp-debug.c b/drivers/firmware/xilinx/zynqmp-debug.c
index 22853ae0efdf..36efb827f3da 100644
--- a/drivers/firmware/xilinx/zynqmp-debug.c
+++ b/drivers/firmware/xilinx/zynqmp-debug.c
@@ -3,6 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer for debugfs APIs
  *
  *  Copyright (C) 2014-2018 Xilinx, Inc.
+ *  Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -38,6 +39,7 @@ static struct pm_api_info pm_api_list[] = {
 	PM_API(PM_RELEASE_NODE),
 	PM_API(PM_SET_REQUIREMENT),
 	PM_API(PM_GET_API_VERSION),
+	PM_API(PM_GET_NODE_STATUS),
 	PM_API(PM_REGISTER_NOTIFIER),
 	PM_API(PM_RESET_ASSERT),
 	PM_API(PM_RESET_GET_STATUS),
@@ -167,6 +169,17 @@ static int process_api_request(u32 pm_id, u64 *pm_api_arg, u32 *pm_api_ret)
 						pm_api_arg[3] ? pm_api_arg[3] :
 						ZYNQMP_PM_REQUEST_ACK_BLOCKING);
 		break;
+	case PM_GET_NODE_STATUS:
+		ret = zynqmp_pm_get_node_status(pm_api_arg[0],
+						&pm_api_ret[0],
+						&pm_api_ret[1],
+						&pm_api_ret[2]);
+		if (!ret)
+			sprintf(debugfs_buf,
+				"GET_NODE_STATUS:\n\tNodeId: %llu\n\tStatus: %u\n\tRequirements: %u\n\tUsage: %u\n",
+				pm_api_arg[0], pm_api_ret[0],
+				pm_api_ret[1], pm_api_ret[2]);
+		break;
 	case PM_REGISTER_NOTIFIER:
 		ret = zynqmp_pm_register_notifier(pm_api_arg[0],
 						  pm_api_arg[1] ?
diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 02da3e48bc8f..a17c806cd117 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -3,7 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer
  *
  *  Copyright (C) 2014-2022 Xilinx, Inc.
- *  Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc.
+ *  Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -1413,6 +1413,45 @@ int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_tcm_config);
 
+/**
+ * zynqmp_pm_get_node_status - PM call to request a node's current power state
+ * @node:		ID of the component or sub-system in question
+ * @status:		Current operating state of the requested node
+ * @requirements:	Current requirements asserted on the node,
+ *			used for slave nodes only.
+ * @usage:		Usage information, used for slave nodes only:
+ *			PM_USAGE_NO_MASTER	- No master is currently using
+ *						  the node
+ *			PM_USAGE_CURRENT_MASTER	- Only requesting master is
+ *						  currently using the node
+ *			PM_USAGE_OTHER_MASTER	- Only other masters are
+ *						  currently using the node
+ *			PM_USAGE_BOTH_MASTERS	- Both the current and at least
+ *						  one other master are currently
+ *						  using the node
+ *
+ * Return:		Returns status, either success or error+reason
+ */
+int zynqmp_pm_get_node_status(const u32 node, u32 *const status,
+			      u32 *const requirements, u32 *const usage)
+{
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	if (!status || !requirements || !usage)
+		return -EINVAL;
+
+	/* ret_payload is uninitialised stack memory: only read it on success */
+	ret = zynqmp_pm_invoke_fn(PM_GET_NODE_STATUS, ret_payload, 1, node);
+	if (!ret && ret_payload[0] == XST_PM_SUCCESS) {
+		*status = ret_payload[1];
+		*requirements = ret_payload[2];
+		*usage = ret_payload[3];
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_get_node_status);
+
 /**
  * zynqmp_pm_force_pwrdwn - PM call to request for another PU or subsystem to
  *             be powered down forcefully
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index ae48d619c4e0..4699f50465f2 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -3,7 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer
  *
  *  Copyright (C) 2014-2021 Xilinx
- *  Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc.
+ *  Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -164,6 +164,7 @@ enum pm_api_cb_id {
 enum pm_api_id {
 	PM_API_FEATURES = 0,
 	PM_GET_API_VERSION = 1,
+	PM_GET_NODE_STATUS = 3,
 	PM_REGISTER_NOTIFIER = 5,
 	PM_FORCE_POWERDOWN = 8,
 	PM_REQUEST_WAKEUP = 10,
@@ -629,6 +630,8 @@ int zynqmp_pm_request_wake(const u32 node,
 int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode);
 int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode);
 int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode);
+int zynqmp_pm_get_node_status(const u32 node, u32 *const status,
+			      u32 *const requirements, u32 *const usage);
 int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value);
 int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config,
 			     u32 value);
@@ -931,6 +934,13 @@ static inline int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mo
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_get_node_status(const u32 node, u32 *const status,
+					    u32 *const requirements,
+					    u32 *const usage)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_set_sd_config(u32 node,
 					  enum pm_sd_config_type config,
 					  u32 value)
-- 
cgit v1.2.3


From e66f4c35e375346943bfe2a0990e97253f74440f Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Tue, 1 Jul 2025 05:38:50 -0700
Subject: drivers: firmware: xilinx: Add unique family code for all platforms

The family code is currently derived from the PMC_TAP_IDCODE register
value, but there are issues where Versal, Versal NET, and future
platforms share the same family code. Additionally, some platforms
have identical subfamily codes, making it challenging to differentiate
between platforms based on the family and subfamily codes. To resolve
this, a new family code member is added to the platform data, initialized
with unique values. This change enables better platform distinction via
the compatible string.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20250701123851.1314531-3-jay.buddhabhatti@amd.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp.c     | 32 +++++++++++++++++++++++++++++---
 include/linux/firmware/xlnx-zynqmp.h |  5 +++++
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 53b8c88b4178..2d250a16d3dd 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -72,6 +72,15 @@ struct pm_api_feature_data {
 	struct hlist_node hentry;
 };
 
+/* Per-platform firmware data attached to the OF match table entries. */
+struct platform_fw_data {
+	/* Family code for platform (one of the PM_*_FAMILY_CODE values). */
+	const u32 family_code;
+};
+
+/* Data of the matched platform; set in probe, points at read-only tables. */
+static const struct platform_fw_data *active_platform_fw_data;
+
 static const struct mfd_cell firmware_devs[] = {
 	{
 		.name = "zynqmp_power_controller",
@@ -2052,6 +2061,11 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	/* Get platform-specific firmware data from device tree match */
+	active_platform_fw_data = (struct platform_fw_data *)device_get_match_data(dev);
+	if (!active_platform_fw_data)
+		return -EINVAL;
+
 	/* Get SiP SVC version number */
 	ret = zynqmp_pm_get_sip_svc_version(&sip_svc_version);
 	if (ret)
@@ -2152,10 +2166,22 @@ static void zynqmp_firmware_sync_state(struct device *dev)
 		dev_warn(dev, "failed to release power management to firmware\n");
 }
 
+static const struct platform_fw_data platform_fw_data_versal = {
+	.family_code = PM_VERSAL_FAMILY_CODE,
+};
+
+static const struct platform_fw_data platform_fw_data_versal_net = {
+	.family_code = PM_VERSAL_NET_FAMILY_CODE,
+};
+
+static const struct platform_fw_data platform_fw_data_zynqmp = {
+	.family_code = PM_ZYNQMP_FAMILY_CODE,
+};
+
 static const struct of_device_id zynqmp_firmware_of_match[] = {
-	{.compatible = "xlnx,zynqmp-firmware"},
-	{.compatible = "xlnx,versal-firmware"},
-	{.compatible = "xlnx,versal-net-firmware"},
+	{.compatible = "xlnx,zynqmp-firmware", .data = &platform_fw_data_zynqmp},
+	{.compatible = "xlnx,versal-firmware", .data = &platform_fw_data_versal},
+	{.compatible = "xlnx,versal-net-firmware", .data = &platform_fw_data_versal_net},
 	{},
 };
 MODULE_DEVICE_TABLE(of, zynqmp_firmware_of_match);
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 4699f50465f2..6458ef4e04e2 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -54,6 +54,11 @@
 #define ZYNQMP_FAMILY_CODE 0x23
 #define VERSAL_FAMILY_CODE 0x26
 
+/* Family codes */
+#define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */
+#define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */
+#define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */
+
 /* When all subfamily of platform need to support */
 #define ALL_SUB_FAMILY_CODE		0x00
 #define VERSAL_SUB_FAMILY_CODE		0x01
-- 
cgit v1.2.3


From 25e3ae0ce364fa725a6eea68d63d6a2ee09e019f Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Tue, 1 Jul 2025 05:38:51 -0700
Subject: drivers: firmware: xilinx: Switch to new family code in
 zynqmp_pm_get_family_info()

Currently, the family code and subfamily code are derived from the
PMC_TAP_IDCODE register. Versal and Versal NET share the same family
code. Also, some platforms share the same subfamily code, making it
difficult to distinguish between platforms. Update
zynqmp_pm_get_family_info() to use IDs derived from the compatible
string instead of silicon ID codes derived from PMC_TAP_IDCODE register.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20250701123851.1314531-4-jay.buddhabhatti@amd.com
Signed-off-by: Michal Simek <michal.simek@amd.com>
---
 drivers/firmware/xilinx/zynqmp.c        | 42 +++++++++++++--------------------
 drivers/pinctrl/pinctrl-zynqmp.c        |  7 +++---
 drivers/soc/xilinx/xlnx_event_manager.c |  8 +++----
 drivers/soc/xilinx/zynqmp_power.c       | 10 ++++----
 include/linux/firmware/xlnx-zynqmp.h    | 15 ++----------
 5 files changed, 31 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 2d250a16d3dd..835a50c5af46 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -473,8 +473,6 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...)
 
 static u32 pm_api_version;
 static u32 pm_tz_version;
-static u32 pm_family_code;
-static u32 pm_sub_family_code;
 
 int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset)
 {
@@ -541,32 +539,18 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_chipid);
 /**
  * zynqmp_pm_get_family_info() - Get family info of platform
  * @family:	Returned family code value
- * @subfamily:	Returned sub-family code value
  *
  * Return: Returns status, either success or error+reason
  */
-int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
+int zynqmp_pm_get_family_info(u32 *family)
 {
-	u32 ret_payload[PAYLOAD_ARG_CNT];
-	u32 idcode;
-	int ret;
-
-	/* Check is family or sub-family code already received */
-	if (pm_family_code && pm_sub_family_code) {
-		*family = pm_family_code;
-		*subfamily = pm_sub_family_code;
-		return 0;
-	}
+	if (!active_platform_fw_data)
+		return -ENODEV;
 
-	ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, ret_payload, 0);
-	if (ret < 0)
-		return ret;
+	if (!family)
+		return -EINVAL;
 
-	idcode = ret_payload[1];
-	pm_family_code = FIELD_GET(FAMILY_CODE_MASK, idcode);
-	pm_sub_family_code = FIELD_GET(SUB_FAMILY_CODE_MASK, idcode);
-	*family = pm_family_code;
-	*subfamily = pm_sub_family_code;
+	*family = active_platform_fw_data->family_code;
 
 	return 0;
 }
@@ -1247,8 +1231,13 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
 				 u32 value)
 {
 	int ret;
+	u32 pm_family_code;
+
+	ret = zynqmp_pm_get_family_info(&pm_family_code);
+	if (ret)
+		return ret;
 
-	if (pm_family_code == ZYNQMP_FAMILY_CODE &&
+	if (pm_family_code == PM_ZYNQMP_FAMILY_CODE &&
 	    param == PM_PINCTRL_CONFIG_TRI_STATE) {
 		ret = zynqmp_pm_feature(PM_PINCTRL_CONFIG_PARAM_SET);
 		if (ret < PM_PINCTRL_PARAM_SET_VERSION) {
@@ -2055,6 +2044,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct zynqmp_devinfo *devinfo;
+	u32 pm_family_code;
 	int ret;
 
 	ret = get_set_conduit_method(dev->of_node);
@@ -2098,8 +2088,8 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 	pr_info("%s Platform Management API v%d.%d\n", __func__,
 		pm_api_version >> 16, pm_api_version & 0xFFFF);
 
-	/* Get the Family code and sub family code of platform */
-	ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code);
+	/* Get the Family code of platform */
+	ret = zynqmp_pm_get_family_info(&pm_family_code);
 	if (ret < 0)
 		return ret;
 
@@ -2126,7 +2116,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 
 	zynqmp_pm_api_debugfs_init();
 
-	if (pm_family_code == VERSAL_FAMILY_CODE) {
+	if (pm_family_code != PM_ZYNQMP_FAMILY_CODE) {
 		em_dev = platform_device_register_data(&pdev->dev, "xlnx_event_manager",
 						       -1, NULL, 0);
 		if (IS_ERR(em_dev))
diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c
index fddf0fef4b13..71eaac81deb1 100644
--- a/drivers/pinctrl/pinctrl-zynqmp.c
+++ b/drivers/pinctrl/pinctrl-zynqmp.c
@@ -100,7 +100,6 @@ struct zynqmp_pctrl_group {
 
 static struct pinctrl_desc zynqmp_desc;
 static u32 family_code;
-static u32 sub_family_code;
 
 static int zynqmp_pctrl_get_groups_count(struct pinctrl_dev *pctldev)
 {
@@ -605,7 +604,7 @@ static int zynqmp_pinctrl_prepare_func_groups(struct device *dev, u32 fid,
 				return -ENOMEM;
 
 			for (pin = 0; pin < groups[resp[i]].npins; pin++) {
-				if (family_code == ZYNQMP_FAMILY_CODE)
+				if (family_code == PM_ZYNQMP_FAMILY_CODE)
 					__set_bit(groups[resp[i]].pins[pin], used_pins);
 				else
 					__set_bit((u8)groups[resp[i]].pins[pin] - 1, used_pins);
@@ -958,11 +957,11 @@ static int zynqmp_pinctrl_probe(struct platform_device *pdev)
 	if (!pctrl)
 		return -ENOMEM;
 
-	ret = zynqmp_pm_get_family_info(&family_code, &sub_family_code);
+	ret = zynqmp_pm_get_family_info(&family_code);
 	if (ret < 0)
 		return ret;
 
-	if (family_code == ZYNQMP_FAMILY_CODE) {
+	if (family_code == PM_ZYNQMP_FAMILY_CODE) {
 		ret = zynqmp_pinctrl_prepare_pin_desc(&pdev->dev, &zynqmp_desc.pins,
 						      &zynqmp_desc.npins);
 	} else {
diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c
index a572d15f6161..6fdf4d14b7e7 100644
--- a/drivers/soc/xilinx/xlnx_event_manager.c
+++ b/drivers/soc/xilinx/xlnx_event_manager.c
@@ -77,17 +77,17 @@ struct registered_event_data {
 
 static bool xlnx_is_error_event(const u32 node_id)
 {
-	u32 pm_family_code, pm_sub_family_code;
+	u32 pm_family_code;
 
-	zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code);
+	zynqmp_pm_get_family_info(&pm_family_code);
 
-	if (pm_sub_family_code == VERSAL_SUB_FAMILY_CODE) {
+	if (pm_family_code == PM_VERSAL_FAMILY_CODE) {
 		if (node_id == VERSAL_EVENT_ERROR_PMC_ERR1 ||
 		    node_id == VERSAL_EVENT_ERROR_PMC_ERR2 ||
 		    node_id == VERSAL_EVENT_ERROR_PSM_ERR1 ||
 		    node_id == VERSAL_EVENT_ERROR_PSM_ERR2)
 			return true;
-	} else {
+	} else if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE) {
 		if (node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR1 ||
 		    node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR2 ||
 		    node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR3 ||
diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c
index ae59bf16659a..9b7b2858b22a 100644
--- a/drivers/soc/xilinx/zynqmp_power.c
+++ b/drivers/soc/xilinx/zynqmp_power.c
@@ -285,7 +285,7 @@ static int register_event(struct device *dev, const enum pm_api_cb_id cb_type, c
 static int zynqmp_pm_probe(struct platform_device *pdev)
 {
 	int ret, irq;
-	u32 pm_api_version, pm_family_code, pm_sub_family_code, node_id;
+	u32 pm_api_version, pm_family_code, node_id;
 	struct mbox_client *client;
 
 	ret = zynqmp_pm_get_api_version(&pm_api_version);
@@ -315,14 +315,16 @@ static int zynqmp_pm_probe(struct platform_device *pdev)
 		INIT_WORK(&zynqmp_pm_init_suspend_work->callback_work,
 			  zynqmp_pm_init_suspend_work_fn);
 
-		ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code);
+		ret = zynqmp_pm_get_family_info(&pm_family_code);
 		if (ret < 0)
 			return ret;
 
-		if (pm_sub_family_code == VERSALNET_SUB_FAMILY_CODE)
+		if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE)
 			node_id = PM_DEV_ACPU_0_0;
-		else
+		else if (pm_family_code == PM_VERSAL_FAMILY_CODE)
 			node_id = PM_DEV_ACPU_0;
+		else
+			return -ENODEV;
 
 		ret = register_event(&pdev->dev, PM_NOTIFY_CB, node_id, EVENT_SUBSYSTEM_RESTART,
 				     false, subsystem_restart_event_callback);
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 6458ef4e04e2..be6817ac5120 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -51,22 +51,11 @@
 
 #define PM_PINCTRL_PARAM_SET_VERSION	2
 
-#define ZYNQMP_FAMILY_CODE 0x23
-#define VERSAL_FAMILY_CODE 0x26
-
 /* Family codes */
 #define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */
 #define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */
 #define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */
 
-/* When all subfamily of platform need to support */
-#define ALL_SUB_FAMILY_CODE		0x00
-#define VERSAL_SUB_FAMILY_CODE		0x01
-#define VERSALNET_SUB_FAMILY_CODE	0x03
-
-#define FAMILY_CODE_MASK	GENMASK(27, 21)
-#define SUB_FAMILY_CODE_MASK	GENMASK(20, 19)
-
 #define API_ID_MASK		GENMASK(7, 0)
 #define MODULE_ID_MASK		GENMASK(11, 8)
 #define PLM_MODULE_ID_MASK	GENMASK(15, 8)
@@ -570,7 +559,7 @@ int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
 int zynqmp_pm_get_chipid(u32 *idcode, u32 *version);
-int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily);
+int zynqmp_pm_get_family_info(u32 *family);
 int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out);
 int zynqmp_pm_clock_enable(u32 clock_id);
 int zynqmp_pm_clock_disable(u32 clock_id);
@@ -651,7 +640,7 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
+static inline int zynqmp_pm_get_family_info(u32 *family)
 {
 	return -ENODEV;
 }
-- 
cgit v1.2.3


From f233d4855918547f19c5bff95223706d1c836b7c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 7 Oct 2025 22:03:48 +0000
Subject: bpf: Refactor storage_get_func_atomic to generic non_sleepable flag

Rename the storage_get_func_atomic flag to a more generic non_sleepable
flag that tracks whether a helper or kfunc may be called from a
non-sleepable context. This makes the flag more broadly applicable
beyond just storage_get helpers. See [0] for more context.

The flag is now set unconditionally for all helpers and kfuncs when:
- RCU critical section is active.
- Preemption is disabled.
- IRQs are disabled.
- In a non-sleepable context within a sleepable program (e.g., timer
  callbacks), which is indicated by !in_sleepable().

Previously, the flag was only set for storage_get helpers in these
contexts. With this change, it can be used by any code that needs to
differentiate between sleepable and non-sleepable contexts at the
per-instruction level.

The existing usage in do_misc_fixups() for storage_get helpers is
preserved by checking is_storage_get_function() before using the flag.

  [0]: https://lore.kernel.org/bpf/CAP01T76cbaNi4p-y8E0sjE2NXSra2S=Uja8G4hSQDu_SbXxREQ@mail.gmail.com

Cc: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
Link: https://lore.kernel.org/r/20251007220349.3852807-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 33 +++++++++++++++++----------------
 2 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4c497e839526..b57222a25a4a 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -548,7 +548,7 @@ struct bpf_insn_aux_data {
 	bool nospec_result; /* result is unsafe under speculation, nospec must follow */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool needs_zext; /* alu op needs to clear upper bits */
-	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 32123c4b041a..85a953124412 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11371,6 +11371,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
 	return *ptr && (*ptr)->func ? 0 : -EINVAL;
 }
 
+/* Check if we're in a sleepable context. */
+static inline bool in_sleepable_context(struct bpf_verifier_env *env)
+{
+	return !env->cur_state->active_rcu_lock &&
+	       !env->cur_state->active_preempt_locks &&
+	       !env->cur_state->active_irq_id &&
+	       in_sleepable(env);
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -11437,9 +11446,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				func_id_name(func_id), func_id);
 			return -EINVAL;
 		}
-
-		if (is_storage_get_function(func_id))
-			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
 	if (env->cur_state->active_preempt_locks) {
@@ -11448,9 +11454,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				func_id_name(func_id), func_id);
 			return -EINVAL;
 		}
-
-		if (is_storage_get_function(func_id))
-			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
 	if (env->cur_state->active_irq_id) {
@@ -11459,17 +11462,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				func_id_name(func_id), func_id);
 			return -EINVAL;
 		}
-
-		if (is_storage_get_function(func_id))
-			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
-	/*
-	 * Non-sleepable contexts in sleepable programs (e.g., timer callbacks)
-	 * are atomic and must use GFP_ATOMIC for storage_get helpers.
-	 */
-	if (!in_sleepable(env) && is_storage_get_function(func_id))
-		env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+	/* Track non-sleepable context for helpers. */
+	if (!in_sleepable_context(env))
+		env->insn_aux_data[insn_idx].non_sleepable = true;
 
 	meta.func_id = func_id;
 	/* check args */
@@ -13880,6 +13877,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EACCES;
 	}
 
+	/* Track non-sleepable context for kfuncs, same as for helpers. */
+	if (!in_sleepable_context(env))
+		insn_aux->non_sleepable = true;
+
 	/* Check the arguments */
 	err = check_kfunc_args(env, &meta, insn_idx);
 	if (err < 0)
@@ -22502,7 +22503,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		}
 
 		if (is_storage_get_function(insn->imm)) {
-			if (env->insn_aux_data[i + delta].storage_get_func_atomic)
+			if (env->insn_aux_data[i + delta].non_sleepable)
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
 			else
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
-- 
cgit v1.2.3


From 4c97c4b149a019a3b318dc6ea3dc96efe0ee1f39 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Fri, 10 Oct 2025 17:46:06 +0100
Subject: bpf: Extract internal structs validation logic into helpers

The arraymap and hashtab duplicate the logic that checks for and frees
internal structs (timer, workqueue, task_work) based on
BTF record flags. Centralize this by introducing two helpers:

  * bpf_map_has_internal_structs(map)
    Returns true if the map value contains any of internal structs:
    BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK.

  * bpf_map_free_internal_structs(map, obj)
    Frees the internal structs for a single value object.

Convert arraymap and both the prealloc/malloc hashtab paths to use the
new generic functions. This keeps the functionality for when/how to free
these special fields in one place and makes it easier to add support for
new internal structs in the future without touching every map
implementation.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251010164606.147298-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  7 +++++++
 kernel/bpf/arraymap.c | 19 ++++++-------------
 kernel/bpf/hashtab.c  | 36 +++++++++++++-----------------------
 kernel/bpf/helpers.c  | 10 ++++++++++
 4 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a98c83346134..f87fb203aaae 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -663,6 +663,13 @@ int map_check_no_btf(const struct bpf_map *map,
 bool bpf_map_meta_equal(const struct bpf_map *meta0,
 			const struct bpf_map *meta1);
 
+static inline bool bpf_map_has_internal_structs(struct bpf_map *map)
+{
+	return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK);
+}
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *obj);
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* bpf_type_flag contains a set of flags that are applicable to the values of
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 80b1765a3159..0ba790c2d2e5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -448,19 +448,12 @@ static void array_map_free_internal_structs(struct bpf_map *map)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
-	/* We don't reset or free fields other than timer and workqueue
-	 * on uref dropping to zero.
-	 */
-	if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
-		for (i = 0; i < array->map.max_entries; i++) {
-			if (btf_record_has_field(map->record, BPF_TIMER))
-				bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
-			if (btf_record_has_field(map->record, BPF_WORKQUEUE))
-				bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
-			if (btf_record_has_field(map->record, BPF_TASK_WORK))
-				bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
-		}
-	}
+	/* We only free internal structs on uref dropping to zero */
+	if (!bpf_map_has_internal_structs(map))
+		return;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c2fcd0cd51e5..e7a6ba04dc82 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
 	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
 }
 
-static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
-{
-	if (btf_record_has_field(htab->map.record, BPF_TIMER))
-		bpf_obj_free_timer(htab->map.record,
-				   htab_elem_value(elem, htab->map.key_size));
-	if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
-		bpf_obj_free_workqueue(htab->map.record,
-				       htab_elem_value(elem, htab->map.key_size));
-	if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
-		bpf_obj_free_task_work(htab->map.record,
-				       htab_elem_value(elem, htab->map.key_size));
-}
-
 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
@@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
 		struct htab_elem *elem;
 
 		elem = get_htab_elem(htab, i);
-		htab_free_internal_structs(htab, elem);
+		bpf_map_free_internal_structs(&htab->map,
+					      htab_elem_value(elem, htab->map.key_size));
 		cond_resched();
 	}
 }
@@ -1509,8 +1497,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
 		struct htab_elem *l;
 
 		hlist_nulls_for_each_entry(l, n, head, hash_node) {
-			/* We only free timer on uref dropping to zero */
-			htab_free_internal_structs(htab, l);
+			/* We only free internal structs on uref dropping to zero */
+			bpf_map_free_internal_structs(&htab->map,
+						      htab_elem_value(l, htab->map.key_size));
 		}
 		cond_resched_rcu();
 	}
@@ -1521,13 +1510,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
-	/* We only free timer and workqueue on uref dropping to zero */
-	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
-		if (!htab_is_prealloc(htab))
-			htab_free_malloced_internal_structs(htab);
-		else
-			htab_free_prealloced_internal_structs(htab);
-	}
+	/* We only free internal structs on uref dropping to zero */
+	if (!bpf_map_has_internal_structs(map))
+		return;
+
+	if (htab_is_prealloc(htab))
+		htab_free_prealloced_internal_structs(htab);
+	else
+		htab_free_malloced_internal_structs(htab);
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 485f65fbd97f..dea8443f782c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4487,3 +4487,13 @@ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
 		return NULL;
 	return (void *)__bpf_dynptr_data(ptr, len);
 }
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
+{
+	if (btf_record_has_field(map->record, BPF_TIMER))
+		bpf_obj_free_timer(map->record, val);
+	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+		bpf_obj_free_workqueue(map->record, val);
+	if (btf_record_has_field(map->record, BPF_TASK_WORK))
+		bpf_obj_free_task_work(map->record, val);
+}
-- 
cgit v1.2.3


From 4914c17a76047ccbde24397cf9d406558183d756 Mon Sep 17 00:00:00 2001
From: Raghav Sharma <raghav.s@samsung.com>
Date: Mon, 15 Sep 2025 15:23:59 +0530
Subject: dt-bindings: clock: exynosautov920: add m2m clock definitions

Add device tree clock binding definitions for CMU_M2M

Signed-off-by: Raghav Sharma <raghav.s@samsung.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Alim Akhtar <alim.akhtar@samsung.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../clock/samsung,exynosautov920-clock.yaml         | 21 +++++++++++++++++++++
 include/dt-bindings/clock/samsung,exynosautov920.h  |  5 +++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
index 72f59db73f76..b2dfe6ed353a 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
@@ -38,6 +38,7 @@ properties:
       - samsung,exynosautov920-cmu-hsi0
       - samsung,exynosautov920-cmu-hsi1
       - samsung,exynosautov920-cmu-hsi2
+      - samsung,exynosautov920-cmu-m2m
       - samsung,exynosautov920-cmu-misc
       - samsung,exynosautov920-cmu-peric0
       - samsung,exynosautov920-cmu-peric1
@@ -226,6 +227,26 @@ allOf:
             - const: embd
             - const: ethernet
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: samsung,exynosautov920-cmu-m2m
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (38.4 MHz)
+            - description: CMU_M2M NOC clock (from CMU_TOP)
+            - description: CMU_M2M JPEG clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: noc
+            - const: jpeg
+
 required:
   - compatible
   - "#clock-cells"
diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h
index 93e6233d1358..0342a988565a 100644
--- a/include/dt-bindings/clock/samsung,exynosautov920.h
+++ b/include/dt-bindings/clock/samsung,exynosautov920.h
@@ -295,4 +295,9 @@
 #define CLK_DOUT_HSI2_ETHERNET          6
 #define CLK_DOUT_HSI2_ETHERNET_PTP      7
 
+/* CMU_M2M */
+#define CLK_MOUT_M2M_JPEG_USER          1
+#define CLK_MOUT_M2M_NOC_USER           2
+#define CLK_DOUT_M2M_NOCP               3
+
 #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */
-- 
cgit v1.2.3


From 0b94201e327471d034d81cf5fd2131a5529eea19 Mon Sep 17 00:00:00 2001
From: Raghav Sharma <raghav.s@samsung.com>
Date: Thu, 25 Sep 2025 18:34:55 +0530
Subject: dt-bindings: clock: exynosautov920: add mfc clock definitions

Add device tree clock binding definitions for CMU_MFC

Signed-off-by: Raghav Sharma <raghav.s@samsung.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../clock/samsung,exynosautov920-clock.yaml         | 21 +++++++++++++++++++++
 include/dt-bindings/clock/samsung,exynosautov920.h  |  5 +++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
index b2dfe6ed353a..5bf905f88a1a 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
@@ -39,6 +39,7 @@ properties:
       - samsung,exynosautov920-cmu-hsi1
       - samsung,exynosautov920-cmu-hsi2
       - samsung,exynosautov920-cmu-m2m
+      - samsung,exynosautov920-cmu-mfc
       - samsung,exynosautov920-cmu-misc
       - samsung,exynosautov920-cmu-peric0
       - samsung,exynosautov920-cmu-peric1
@@ -247,6 +248,26 @@ allOf:
             - const: noc
             - const: jpeg
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: samsung,exynosautov920-cmu-mfc
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (38.4 MHz)
+            - description: CMU_MFC MFC clock (from CMU_TOP)
+            - description: CMU_MFC WFD clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: mfc
+            - const: wfd
+
 required:
   - compatible
   - "#clock-cells"
diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h
index 0342a988565a..970d05167fc6 100644
--- a/include/dt-bindings/clock/samsung,exynosautov920.h
+++ b/include/dt-bindings/clock/samsung,exynosautov920.h
@@ -300,4 +300,9 @@
 #define CLK_MOUT_M2M_NOC_USER           2
 #define CLK_DOUT_M2M_NOCP               3
 
+/* CMU_MFC */
+#define CLK_MOUT_MFC_MFC_USER           1
+#define CLK_MOUT_MFC_WFD_USER           2
+#define CLK_DOUT_MFC_NOCP               3
+
 #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */
-- 
cgit v1.2.3


From ed4a5c5de56ad4e23c9e5da8981639352b63b8ac Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Tue, 23 Sep 2025 18:16:07 +0000
Subject: usb: typec: class: add typec_get_data_role symbol

Alt Mode drivers are responsible for sending Enter Mode through the TCPM,
but only a DFP is allowed to send Enter Mode. typec_get_data_role gets
the port's data role, which can then be used in altmode drivers via
typec_altmode_get_data_role to know if Enter Mode should be sent.

Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20250923181606.1583584-5-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c         | 13 +++++++++++++
 include/linux/usb/typec.h         |  1 +
 include/linux/usb/typec_altmode.h | 13 +++++++++++++
 3 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 67a533e35150..9b2647cb199b 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -2120,6 +2120,19 @@ void typec_set_data_role(struct typec_port *port, enum typec_data_role role)
 }
 EXPORT_SYMBOL_GPL(typec_set_data_role);
 
+/**
+ * typec_get_data_role - Get port data role
+ * @port: The USB Type-C Port to query
+ *
+ * This routine is used by the altmode drivers to determine if the port is the
+ * DFP before issuing Enter Mode
+ */
+enum typec_data_role typec_get_data_role(struct typec_port *port)
+{
+	return port->data_role;
+}
+EXPORT_SYMBOL_GPL(typec_get_data_role);
+
 /**
  * typec_set_pwr_role - Report power role change
  * @port: The USB Type-C Port where the role was changed
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 252af3f77039..309251572e2e 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -337,6 +337,7 @@ struct typec_plug *typec_register_plug(struct typec_cable *cable,
 void typec_unregister_plug(struct typec_plug *plug);
 
 void typec_set_data_role(struct typec_port *port, enum typec_data_role role);
+enum typec_data_role typec_get_data_role(struct typec_port *port);
 void typec_set_pwr_role(struct typec_port *port, enum typec_role role);
 void typec_set_vconn_role(struct typec_port *port, enum typec_role role);
 void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode);
diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h
index b3c0866ea70f..f7db3bd4c90e 100644
--- a/include/linux/usb/typec_altmode.h
+++ b/include/linux/usb/typec_altmode.h
@@ -172,6 +172,19 @@ typec_altmode_get_svdm_version(struct typec_altmode *altmode)
 	return typec_get_negotiated_svdm_version(typec_altmode2port(altmode));
 }
 
+/**
+ * typec_altmode_get_data_role - Get port data role
+ * @altmode: Handle to the alternate mode
+ *
+ * Alt Mode drivers should only issue Enter Mode through the port if they are
+ * the DFP.
+ */
+static inline enum typec_data_role
+typec_altmode_get_data_role(struct typec_altmode *altmode)
+{
+	return typec_get_data_role(typec_altmode2port(altmode));
+}
+
 /**
  * struct typec_altmode_driver - USB Type-C alternate mode device driver
  * @id_table: Null terminated array of SVIDs
-- 
cgit v1.2.3


From 536bf30d282a6b2f676c6106587f0e1946449aca Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 16 Sep 2025 16:02:53 -0500
Subject: iio: buffer: document iio_push_to_buffers_with_ts()

Document the iio_push_to_buffers_with_ts() function.

This is copied and slightly cleaned up from
iio_push_to_buffers_with_timestamp().

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 5c84ec4a9810..e46b818981aa 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -45,6 +45,22 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 	return iio_push_to_buffers(indio_dev, data);
 }
 
+/**
+ * iio_push_to_buffers_with_ts() - push data and timestamp to buffers
+ * @indio_dev:		iio_dev structure for device.
+ * @data:		Pointer to sample data buffer.
+ * @data_total_len:	The size of @data in bytes.
+ * @timestamp:		Timestamp for the sample data.
+ *
+ * Pushes data to the IIO device's buffers. If timestamps are enabled for the
+ * device the function will store the supplied timestamp as the last element in
+ * the sample data buffer before pushing it to the device buffers. The sample
+ * data buffer needs to be large enough to hold the additional timestamp
+ * (usually the buffer should be at least indio->scan_bytes bytes large).
+ *
+ * Context: Any context.
+ * Return: 0 on success, a negative error code otherwise.
+ */
 static inline int iio_push_to_buffers_with_ts(struct iio_dev *indio_dev,
 					      void *data, size_t data_total_len,
 					      s64 timestamp)
-- 
cgit v1.2.3


From 4992ce003b76ee1629ad4e7332a49ea2619e7523 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 16 Sep 2025 16:02:54 -0500
Subject: iio: buffer: deprecated iio_push_to_buffers_with_timestamp()

Replace the documentation of iio_push_to_buffers_with_timestamp() with
a deprecation notice pointing to the preferred alternative.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index e46b818981aa..d37f82678f71 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -26,11 +26,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data);
  * @data:		sample data
  * @timestamp:		timestamp for the sample data
  *
- * Pushes data to the IIO device's buffers. If timestamps are enabled for the
- * device the function will store the supplied timestamp as the last element in
- * the sample data buffer before pushing it to the device buffers. The sample
- * data buffer needs to be large enough to hold the additional timestamp
- * (usually the buffer should be indio->scan_bytes bytes large).
+ * DEPRECATED: Use iio_push_to_buffers_with_ts() instead.
  *
  * Returns 0 on success, a negative error code otherwise.
  */
-- 
cgit v1.2.3


From 748ed9fc8596015e7e136877465919b89c7d08d6 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 16 Sep 2025 16:02:56 -0500
Subject: iio: buffer: document store_to() callback may be called in any
 context

Document that the struct iio_buffer_access_funcs.store_to() callback
must be safe to call from any context since it is called from
iio_push_to_buffer() which may be called from any context.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer_impl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index e72552e026f3..0daff9ff20ce 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -24,7 +24,8 @@ struct sg_table;
 
 /**
  * struct iio_buffer_access_funcs - access functions for buffers.
- * @store_to:		actually store stuff to the buffer
+ * @store_to:		actually store stuff to the buffer - must be safe to
+ *			call from any context (e.g. must not sleep).
  * @read:		try to get a specified number of bytes (must exist)
  * @data_available:	indicates how much data is available for reading from
  *			the buffer.
-- 
cgit v1.2.3


From 592ae0ccecfac9af8f67444cab11cbb11770f571 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 16 Sep 2025 16:02:57 -0500
Subject: iio: buffer: document that buffer callback must be context safe

Document that the callback registered with iio_channel_get_all_cb()
must be safe to call from any context since it is called by
iio_push_to_buffer() which can be called in any context.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-cb.c | 1 +
 include/linux/iio/consumer.h                | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c
index 3e27385069ed..f4ebff968493 100644
--- a/drivers/iio/buffer/industrialio-buffer-cb.c
+++ b/drivers/iio/buffer/industrialio-buffer-cb.c
@@ -13,6 +13,7 @@
 
 struct iio_cb_buffer {
 	struct iio_buffer buffer;
+	/* Must be safe to call from any context (e.g. must not sleep). */
 	int (*cb)(const void *data, void *private);
 	void *private;
 	struct iio_channel *channels;
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index a38b277c2c02..5039558267e4 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -131,7 +131,8 @@ struct iio_cb_buffer;
 /**
  * iio_channel_get_all_cb() - register callback for triggered capture
  * @dev:		Pointer to client device.
- * @cb:			Callback function.
+ * @cb:			Callback function. Must be safe to call from any context
+ *			(e.g. must not sleep).
  * @private:		Private data passed to callback.
  *
  * NB right now we have no ability to mux data from multiple devices.
-- 
cgit v1.2.3


From a514bb109eada64f798f1c86c17182229cc20fe7 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 7 Oct 2025 10:15:21 +0100
Subject: iio: buffer: support getting dma channel from the buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new buffer accessor .get_dma_dev() in order to get the
struct device responsible for actually providing the dma channel. We
cannot assume that we can use the parent of the IIO device for mapping
the DMA buffer. This becomes important on systems (like the Xilinx/AMD
zynqMP Ultrascale) where memory (or part of it) is mapped above the
32 bit range. On such systems and given that a device by default has
a dma mask of 32 bits we would then need to rely on bounce buffers
(swiotlb) for mapping memory above the dma mask limit.

In the process, add an iio_buffer_get_dma_dev() helper function to get
the proper DMA device.

Cc: stable@vger.kernel.org
Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-buffer.c | 21 ++++++++++++++++-----
 include/linux/iio/buffer_impl.h   |  2 ++
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index a80f7cc25a27..96ea0f039dfb 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1623,19 +1623,28 @@ static int iio_dma_resv_lock(struct dma_buf *dmabuf, bool nonblock)
 	return 0;
 }
 
+static struct device *iio_buffer_get_dma_dev(const struct iio_dev *indio_dev,
+					     struct iio_buffer *buffer)
+{
+	if (buffer->access->get_dma_dev)
+		return buffer->access->get_dma_dev(buffer);
+
+	return indio_dev->dev.parent;
+}
+
 static struct dma_buf_attachment *
 iio_buffer_find_attachment(struct iio_dev_buffer_pair *ib,
 			   struct dma_buf *dmabuf, bool nonblock)
 {
-	struct device *dev = ib->indio_dev->dev.parent;
 	struct iio_buffer *buffer = ib->buffer;
+	struct device *dma_dev = iio_buffer_get_dma_dev(ib->indio_dev, buffer);
 	struct dma_buf_attachment *attach = NULL;
 	struct iio_dmabuf_priv *priv;
 
 	guard(mutex)(&buffer->dmabufs_mutex);
 
 	list_for_each_entry(priv, &buffer->dmabufs, entry) {
-		if (priv->attach->dev == dev
+		if (priv->attach->dev == dma_dev
 		    && priv->attach->dmabuf == dmabuf) {
 			attach = priv->attach;
 			break;
@@ -1653,6 +1662,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
 {
 	struct iio_dev *indio_dev = ib->indio_dev;
 	struct iio_buffer *buffer = ib->buffer;
+	struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer);
 	struct dma_buf_attachment *attach;
 	struct iio_dmabuf_priv *priv, *each;
 	struct dma_buf *dmabuf;
@@ -1679,7 +1689,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
 		goto err_free_priv;
 	}
 
-	attach = dma_buf_attach(dmabuf, indio_dev->dev.parent);
+	attach = dma_buf_attach(dmabuf, dma_dev);
 	if (IS_ERR(attach)) {
 		err = PTR_ERR(attach);
 		goto err_dmabuf_put;
@@ -1719,7 +1729,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
 	 * combo. If we do, refuse to attach.
 	 */
 	list_for_each_entry(each, &buffer->dmabufs, entry) {
-		if (each->attach->dev == indio_dev->dev.parent
+		if (each->attach->dev == dma_dev
 		    && each->attach->dmabuf == dmabuf) {
 			/*
 			 * We unlocked the reservation object, so going through
@@ -1758,6 +1768,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib,
 {
 	struct iio_buffer *buffer = ib->buffer;
 	struct iio_dev *indio_dev = ib->indio_dev;
+	struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer);
 	struct iio_dmabuf_priv *priv;
 	struct dma_buf *dmabuf;
 	int dmabuf_fd, ret = -EPERM;
@@ -1772,7 +1783,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib,
 	guard(mutex)(&buffer->dmabufs_mutex);
 
 	list_for_each_entry(priv, &buffer->dmabufs, entry) {
-		if (priv->attach->dev == indio_dev->dev.parent
+		if (priv->attach->dev == dma_dev
 		    && priv->attach->dmabuf == dmabuf) {
 			list_del(&priv->entry);
 
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index e72552e026f3..8d770ced66b2 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -50,6 +50,7 @@ struct sg_table;
  * @enqueue_dmabuf:	called from userspace via ioctl to queue this DMABUF
  *			object to this buffer. Requires a valid DMABUF fd, that
  *			was previouly attached to this buffer.
+ * @get_dma_dev:	called to get the DMA channel associated with this buffer.
  * @lock_queue:		called when the core needs to lock the buffer queue;
  *                      it is used when enqueueing DMABUF objects.
  * @unlock_queue:       used to unlock a previously locked buffer queue
@@ -90,6 +91,7 @@ struct iio_buffer_access_funcs {
 			      struct iio_dma_buffer_block *block,
 			      struct dma_fence *fence, struct sg_table *sgt,
 			      size_t size, bool cyclic);
+	struct device * (*get_dma_dev)(struct iio_buffer *buffer);
 	void (*lock_queue)(struct iio_buffer *buffer);
 	void (*unlock_queue)(struct iio_buffer *buffer);
 
-- 
cgit v1.2.3


From f9c198c3ccaf90a1a265fb2ffa8d4b093c3b0784 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 7 Oct 2025 10:15:22 +0100
Subject: iio: buffer-dma: support getting the DMA channel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the .get_dma_dev() callback for DMA buffers by returning the
device that owns the DMA channel. This allows the core DMABUF
infrastructure to properly map DMA buffers using the correct device,
avoiding the need for bounce buffers on systems where memory is mapped
above the 32-bit range.

The function returns the DMA queue's device, which is the actual device
responsible for DMA operations in buffer-dma implementations.

Cc: stable@vger.kernel.org
Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dma.c | 6 ++++++
 include/linux/iio/buffer-dma.h               | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index ee294a775e8a..7a7a9d37339b 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -786,6 +786,12 @@ out_end_signalling:
 }
 EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_enqueue_dmabuf, "IIO_DMA_BUFFER");
 
+struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer)
+{
+	return iio_buffer_to_queue(buffer)->dev;
+}
+EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_get_dma_dev, "IIO_DMA_BUFFER");
+
 void iio_dma_buffer_lock_queue(struct iio_buffer *buffer)
 {
 	struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buffer);
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 5eb66a399002..4f33e6a39797 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer,
 				  size_t size, bool cyclic);
 void iio_dma_buffer_lock_queue(struct iio_buffer *buffer);
 void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer);
+struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer);
 
 #endif
-- 
cgit v1.2.3


From d25de16477657f9eddd4be9abd409515edcc3b9e Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Thu, 9 Oct 2025 17:40:16 +0800
Subject: ASoC: soc-acpi: make some variables of acpi adr and link adr
 non-const

Currently, we use predefined snd_soc_acpi_link_adr tables to match the
link adr from the ACPI table in order to select the machine driver and
the topology. However, with this mechanism, we need to create a
snd_soc_acpi_link_adr table for each audio config. The sof_sdw machine
driver is used by almost all Intel platforms with SOF, and today we can
load the required topology file dynamically. In other words, we can use
the sof_sdw machine driver as the default machine driver for Intel SOF
SoundWire codecs, and there is no need to create a
snd_soc_acpi_link_adr table for every new audio config.
To achieve this, we need to drop the const qualifier from some members
and edit the link adr and acpi adr data to match the data from the ACPI
table.

Suggested-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Simon Trimmer <simont@opensource.cirrus.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://patch.msgid.link/20251009094023.3474895-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index b8af309c2683..90d73b9bddab 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -114,8 +114,8 @@ struct snd_soc_acpi_endpoint {
  * @name_prefix: string used for codec controls
  */
 struct snd_soc_acpi_adr_device {
-	const u64 adr;
-	const u8 num_endpoints;
+	u64 adr;
+	u8 num_endpoints;
 	const struct snd_soc_acpi_endpoint *endpoints;
 	const char *name_prefix;
 };
@@ -131,8 +131,8 @@ struct snd_soc_acpi_adr_device {
  */
 
 struct snd_soc_acpi_link_adr {
-	const u32 mask;
-	const u32 num_adr;
+	u32 mask;
+	u32 num_adr;
 	const struct snd_soc_acpi_adr_device *adr_d;
 };
 
-- 
cgit v1.2.3


From ea97713903784286ef1ce45456f404ed288f19b1 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Thu, 9 Oct 2025 17:40:17 +0800
Subject: ASoC: soc_sdw_utils: add name_prefix to asoc_sdw_codec_info struct

Currently, the codec name_prefix used by the Intel SoundWire machine
driver comes from the ACPI match table. Add it to the
asoc_sdw_codec_info struct as well, so that a codec has a default
name_prefix when no corresponding audio config is found in the ACPI
match table.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Simon Trimmer <simont@opensource.cirrus.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://patch.msgid.link/20251009094023.3474895-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h       |  1 +
 sound/soc/sdw_utils/soc_sdw_utils.c | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index 3c5e9b2af7f1..e289b453baba 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -68,6 +68,7 @@ struct asoc_sdw_codec_info {
 	const int part_id;
 	const int version_id;
 	const char *codec_name;
+	const char *name_prefix;
 	int amp_num;
 	const u8 acpi_id[ACPI_ID_LEN];
 	const bool ignore_internal_dmic;
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index 56c72ef27e7b..cd41b0e16777 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -78,6 +78,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x700,
+		.name_prefix = "rt700",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -95,6 +96,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x711,
+		.name_prefix = "rt711",
 		.version_id = 3,
 		.dais = {
 			{
@@ -115,6 +117,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x711,
+		.name_prefix = "rt711",
 		.version_id = 2,
 		.dais = {
 			{
@@ -135,6 +138,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x712,
+		.name_prefix = "rt712",
 		.version_id = 3,
 		.dais =	{
 			{
@@ -176,6 +180,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1712,
+		.name_prefix = "rt712-dmic",
 		.version_id = 3,
 		.dais =	{
 			{
@@ -190,6 +195,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x713,
+		.name_prefix = "rt713",
 		.version_id = 3,
 		.dais =	{
 			{
@@ -217,6 +223,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1713,
+		.name_prefix = "rt713-dmic",
 		.version_id = 3,
 		.dais =	{
 			{
@@ -231,6 +238,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1308,
+		.name_prefix = "rt1308",
 		.acpi_id = "10EC1308",
 		.dais = {
 			{
@@ -253,6 +261,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1316,
+		.name_prefix = "rt1316",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -273,6 +282,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1318,
+		.name_prefix = "rt1318",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -293,6 +303,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x1320,
+		.name_prefix = "rt1320",
 		.dais = {
 			{
 				.direction = {true, false},
@@ -313,6 +324,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x714,
+		.name_prefix = "rt714",
 		.version_id = 3,
 		.ignore_internal_dmic = true,
 		.dais = {
@@ -328,6 +340,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x715,
+		.name_prefix = "rt715",
 		.version_id = 3,
 		.ignore_internal_dmic = true,
 		.dais = {
@@ -343,6 +356,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x714,
+		.name_prefix = "rt714",
 		.version_id = 2,
 		.ignore_internal_dmic = true,
 		.dais = {
@@ -358,6 +372,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x715,
+		.name_prefix = "rt715",
 		.version_id = 2,
 		.ignore_internal_dmic = true,
 		.dais = {
@@ -373,6 +388,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x721,
+		.name_prefix = "rt721",
 		.version_id = 3,
 		.dais = {
 			{
@@ -415,6 +431,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x722,
+		.name_prefix = "rt722",
 		.version_id = 3,
 		.dais = {
 			{
@@ -459,6 +476,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x8373,
+		.name_prefix = "Left",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -478,6 +496,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x8363,
+		.name_prefix = "Left",
 		.dais = {
 			{
 				.direction = {true, false},
@@ -497,6 +516,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x5682,
+		.name_prefix = "rt5682",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -514,6 +534,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x3556,
+		.name_prefix = "AMP",
 		.dais = {
 			{
 				.direction = {true, false},
@@ -566,6 +587,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x4242,
+		.name_prefix = "cs42l42",
 		.dais = {
 			{
 				.direction = {true, true},
@@ -583,6 +605,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x4243,
+		.name_prefix = "cs42l43",
 		.codec_name = "cs42l43-codec",
 		.count_sidecar = asoc_sdw_bridge_cs35l56_count_sidecar,
 		.add_sidecar = asoc_sdw_bridge_cs35l56_add_sidecar,
@@ -634,6 +657,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0xaaaa, /* generic codec mockup */
+		.name_prefix = "sdw_mockup_mmulti-function",
 		.version_id = 0,
 		.dais = {
 			{
@@ -659,6 +683,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0xaa55, /* headset codec mockup */
+		.name_prefix = "sdw_mockup_headset0",
 		.version_id = 0,
 		.dais = {
 			{
@@ -672,6 +697,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x55aa, /* amplifier mockup */
+		.name_prefix = "sdw_mockup_amp1",
 		.version_id = 0,
 		.dais = {
 			{
@@ -685,6 +711,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	},
 	{
 		.part_id = 0x5555,
+		.name_prefix = "sdw_mockup_mic0",
 		.version_id = 0,
 		.dais = {
 			{
-- 
cgit v1.2.3


From 5ed60e45c59d66e61586a10433e2b5527d4d72b5 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Thu, 9 Oct 2025 17:40:19 +0800
Subject: ASoC: soc_sdw_utils: export asoc_sdw_get_dai_type

asoc_sdw_get_dai_type() is quite useful to convert SDCA function types
to SDW DAI types. It can be used by other drivers.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Simon Trimmer <simont@opensource.cirrus.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://patch.msgid.link/20251009094023.3474895-6-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h       | 1 +
 sound/soc/sdw_utils/soc_sdw_utils.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index e289b453baba..76c64c5245d4 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -169,6 +169,7 @@ int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *
 
 struct asoc_sdw_dailink *asoc_sdw_find_dailink(struct asoc_sdw_dailink *dailinks,
 					       const struct snd_soc_acpi_endpoint *new);
+int asoc_sdw_get_dai_type(u32 type);
 
 int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
 				 struct asoc_sdw_dailink *soc_dais,
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index cd41b0e16777..0460e2a8c50a 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -1239,7 +1239,7 @@ struct asoc_sdw_dailink *asoc_sdw_find_dailink(struct asoc_sdw_dailink *dailinks
 }
 EXPORT_SYMBOL_NS(asoc_sdw_find_dailink, "SND_SOC_SDW_UTILS");
 
-static int asoc_sdw_get_dai_type(u32 type)
+int asoc_sdw_get_dai_type(u32 type)
 {
 	switch (type) {
 	case SDCA_FUNCTION_TYPE_SMART_AMP:
@@ -1257,6 +1257,7 @@ static int asoc_sdw_get_dai_type(u32 type)
 		return -EINVAL;
 	}
 }
+EXPORT_SYMBOL_NS(asoc_sdw_get_dai_type, "SND_SOC_SDW_UTILS");
 
 /*
  * Check if the SDCA endpoint is present by the SDW peripheral
-- 
cgit v1.2.3


From 4ebe64f507ca921c5109eb37eae6058b77413d93 Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Fri, 10 Oct 2025 16:53:48 +0800
Subject: ASoC: tas2781: Add TAS5802, TAS5815, and TAS5828

TAS5802/TAS5815/TAS5828 have an on-chip DSP without current/voltage
feedback.

Signed-off-by: Baojun Xu <baojun.xu@ti.com>
Link: https://patch.msgid.link/20251010085349.52951-1-baojun.xu@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781.h        |  3 +++
 sound/soc/codecs/tas2781-i2c.c | 21 +++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index ddd997ac3216..0fbcdb15c74b 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -120,8 +120,11 @@ enum audio_device {
 	TAS2570,
 	TAS2572,
 	TAS2781,
+	TAS5802,
+	TAS5815,
 	TAS5825,
 	TAS5827,
+	TAS5828,
 	TAS_OTHERS,
 };
 
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index 1539b70881d1..ba880b5de7e8 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -108,8 +108,11 @@ static const struct i2c_device_id tasdevice_id[] = {
 	{ "tas2570", TAS2570 },
 	{ "tas2572", TAS2572 },
 	{ "tas2781", TAS2781 },
+	{ "tas5802", TAS5802 },
+	{ "tas5815", TAS5815 },
 	{ "tas5825", TAS5825 },
 	{ "tas5827", TAS5827 },
+	{ "tas5828", TAS5828 },
 	{}
 };
 MODULE_DEVICE_TABLE(i2c, tasdevice_id);
@@ -124,8 +127,11 @@ static const struct of_device_id tasdevice_of_match[] = {
 	{ .compatible = "ti,tas2570" },
 	{ .compatible = "ti,tas2572" },
 	{ .compatible = "ti,tas2781" },
+	{ .compatible = "ti,tas5802" },
+	{ .compatible = "ti,tas5815" },
 	{ .compatible = "ti,tas5825" },
 	{ .compatible = "ti,tas5827" },
+	{ .compatible = "ti,tas5828" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, tasdevice_of_match);
@@ -1665,8 +1671,10 @@ static void tasdevice_fw_ready(const struct firmware *fmw,
 	}
 	tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK;
 
-	/* There is no calibration required for TAS5825/TAS5827. */
-	if (tas_priv->chip_id < TAS5825) {
+	/* There is no calibration required for
+	 * TAS5802/TAS5815/TAS5825/TAS5827/TAS5828.
+	 */
+	if (tas_priv->chip_id < TAS5802) {
 		ret = tasdevice_create_cali_ctrls(tas_priv);
 		if (ret) {
 			dev_err(tas_priv->dev, "cali controls error\n");
@@ -1720,8 +1728,11 @@ out:
 		switch (tas_priv->chip_id) {
 		case TAS2563:
 		case TAS2781:
+		case TAS5802:
+		case TAS5815:
 		case TAS5825:
 		case TAS5827:
+		case TAS5828:
 			/* If DSP FW fail, DSP kcontrol won't be created. */
 			tasdevice_dsp_remove(tas_priv);
 		}
@@ -1882,8 +1893,11 @@ static int tasdevice_codec_probe(struct snd_soc_component *codec)
 		p = (struct snd_kcontrol_new *)tas2781_snd_controls;
 		size = ARRAY_SIZE(tas2781_snd_controls);
 		break;
+	case TAS5802:
+	case TAS5815:
 	case TAS5825:
 	case TAS5827:
+	case TAS5828:
 		p = (struct snd_kcontrol_new *)tas5825_snd_controls;
 		size = ARRAY_SIZE(tas5825_snd_controls);
 		break;
@@ -2054,8 +2068,11 @@ static const struct acpi_device_id tasdevice_acpi_match[] = {
 	{ "TXNW2570", TAS2570 },
 	{ "TXNW2572", TAS2572 },
 	{ "TXNW2781", TAS2781 },
+	{ "TXNW5802", TAS5802 },
+	{ "TXNW5815", TAS5815 },
 	{ "TXNW5825", TAS5825 },
 	{ "TXNW5827", TAS5827 },
+	{ "TXNW5828", TAS5828 },
 	{},
 };
 
-- 
cgit v1.2.3


From b83fb1b14c06bdd765903ac852ba20a14e24f227 Mon Sep 17 00:00:00 2001
From: Axel Haslam <ahaslam@baylibre.com>
Date: Mon, 6 Oct 2025 11:25:41 -0300
Subject: spi: offload: Add offset parameter

Add an offset parameter that can be passed in the periodic trigger.
This is useful for example when ADC drivers implement a separate periodic
signal to trigger conversion and need offload to read the result with
some delay. While at it, add some documentation to offload periodic trigger
parameters.

Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Axel Haslam <ahaslam@baylibre.com>
Signed-off-by: Marcelo Schmitt <marcelo.schmitt@analog.com>
Link: https://patch.msgid.link/cd315e95c0bd8523f00e91c400abcd6a418e5924.1759760519.git.marcelo.schmitt@analog.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-offload-trigger-pwm.c | 3 +++
 include/linux/spi/offload/types.h     | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/spi/spi-offload-trigger-pwm.c b/drivers/spi/spi-offload-trigger-pwm.c
index 805ed41560df..3e8c19227edb 100644
--- a/drivers/spi/spi-offload-trigger-pwm.c
+++ b/drivers/spi/spi-offload-trigger-pwm.c
@@ -51,12 +51,14 @@ static int spi_offload_trigger_pwm_validate(struct spi_offload_trigger *trigger,
 	wf.period_length_ns = DIV_ROUND_UP_ULL(NSEC_PER_SEC, periodic->frequency_hz);
 	/* REVISIT: 50% duty-cycle for now - may add config parameter later */
 	wf.duty_length_ns = wf.period_length_ns / 2;
+	wf.duty_offset_ns = periodic->offset_ns;
 
 	ret = pwm_round_waveform_might_sleep(st->pwm, &wf);
 	if (ret < 0)
 		return ret;
 
 	periodic->frequency_hz = DIV_ROUND_UP_ULL(NSEC_PER_SEC, wf.period_length_ns);
+	periodic->offset_ns = wf.duty_offset_ns;
 
 	return 0;
 }
@@ -77,6 +79,7 @@ static int spi_offload_trigger_pwm_enable(struct spi_offload_trigger *trigger,
 	wf.period_length_ns = DIV_ROUND_UP_ULL(NSEC_PER_SEC, periodic->frequency_hz);
 	/* REVISIT: 50% duty-cycle for now - may add config parameter later */
 	wf.duty_length_ns = wf.period_length_ns / 2;
+	wf.duty_offset_ns = periodic->offset_ns;
 
 	return pwm_set_waveform_might_sleep(st->pwm, &wf, false);
 }
diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h
index 6f7892347871..cd61f8adb7a5 100644
--- a/include/linux/spi/offload/types.h
+++ b/include/linux/spi/offload/types.h
@@ -57,8 +57,17 @@ enum spi_offload_trigger_type {
 	SPI_OFFLOAD_TRIGGER_PERIODIC,
 };
 
+/**
+ * spi_offload_trigger_periodic - configuration parameters for periodic triggers
+ * @frequency_hz: The rate that the trigger should fire in Hz.
+ * @offset_ns: A delay in nanoseconds between when this trigger fires
+ *	       compared to another trigger. This requires specialized hardware
+ *	       that supports such synchronization with a delay between two or
+ *	       more triggers. Set to 0 when not needed.
+ */
 struct spi_offload_trigger_periodic {
 	u64 frequency_hz;
+	u64 offset_ns;
 };
 
 struct spi_offload_trigger_config {
-- 
cgit v1.2.3


From 5a43dc9f4ee0a3624d0598ee14e8ef8468914525 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 13 Oct 2025 23:03:10 +0900
Subject: firewire: core: detect device quirk when reading configuration ROM

Every time the bus manager runs, the cached configuration ROM content of
the IRM device is investigated to detect device-specific quirks. This
detection can be performed in advance when reading the configuration ROM.

This commit adds device quirk flags to the fw_device structure, and
initializes them after reading the bus information block of the
configuration ROM. The quirk flags are immutable once the configuration
ROM has been read. Although they are likely accessed concurrently only by
the bus manager, this commit ensures safe access by preventing torn writes
and reads using the WRITE_ONCE()/READ_ONCE() macros.

Link: https://lore.kernel.org/r/20251013140311.97159-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-card.c   | 21 +++++++--------------
 drivers/firewire/core-device.c | 25 +++++++++++++++++++++++--
 include/linux/firewire.h       | 11 +++++++++++
 3 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index e5e0174a0335..6979d6a88ae2 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -86,8 +86,6 @@ static size_t config_rom_length = 1 + 4 + 1 + 1;
  */
 #define DEFAULT_SPLIT_TIMEOUT	(2 * 8000)
 
-#define CANON_OUI		0x000085
-
 static void generate_config_rom(struct fw_card *card, __be32 *config_rom)
 {
 	struct fw_descriptor *desc;
@@ -308,11 +306,9 @@ __must_hold(&card->lock)
 		cpu_to_be32(local_id),
 	};
 	bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125));
-	bool irm_is_1394_1995_only = false;
-	bool keep_this_irm = false;
 	struct fw_node *irm_node;
 	struct fw_device *irm_device;
-	int irm_node_id;
+	int irm_node_id, irm_device_quirks = 0;
 	int rcode;
 
 	lockdep_assert_held(&card->lock);
@@ -328,15 +324,12 @@ __must_hold(&card->lock)
 		return BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF;
 	}
 
+	// NOTE: It is likely that the quirk detection for IRM device has not done yet.
 	irm_device = fw_node_get_device(irm_node);
-	if (irm_device && irm_device->config_rom) {
-		irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0;
-
-		// Canon MV5i works unreliably if it is not root node.
-		keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI;
-	}
-
-	if (irm_is_1394_1995_only && !keep_this_irm) {
+	if (irm_device)
+		irm_device_quirks = READ_ONCE(irm_device->quirks);
+	if ((irm_device_quirks & FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY) &&
+	    !(irm_device_quirks & FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER)) {
 		fw_notice(card, "IRM is not 1394a compliant, making local node (%02x) root\n",
 			  local_id);
 		return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY;
@@ -373,7 +366,7 @@ __must_hold(&card->lock)
 			return BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM;
 	}
 	default:
-		if (!keep_this_irm) {
+		if (!(irm_device_quirks & FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER)) {
 			fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n",
 				  fw_rcode_string(rcode), local_id);
 			return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY;
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 457a0da024a7..9bab2d594b89 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -542,6 +542,21 @@ static struct device_attribute fw_device_attributes[] = {
 	__ATTR_NULL,
 };
 
+#define CANON_OUI		0x000085
+
+static int detect_quirks_by_bus_information_block(const u32 *bus_information_block)
+{
+	int quirks = 0;
+
+	if ((bus_information_block[2] & 0x000000f0) == 0)
+		quirks |= FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY;
+
+	if ((bus_information_block[3] >> 8) == CANON_OUI)
+		quirks |= FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER;
+
+	return quirks;
+}
+
 static int read_rom(struct fw_device *device,
 		    int generation, int index, u32 *data)
 {
@@ -582,6 +597,7 @@ static int read_config_rom(struct fw_device *device, int generation)
 	u32 *rom, *stack;
 	u32 sp, key;
 	int i, end, length, ret;
+	int quirks;
 
 	rom = kmalloc(sizeof(*rom) * MAX_CONFIG_ROM_SIZE +
 		      sizeof(*stack) * MAX_CONFIG_ROM_SIZE, GFP_KERNEL);
@@ -612,6 +628,11 @@ static int read_config_rom(struct fw_device *device, int generation)
 		}
 	}
 
+	quirks = detect_quirks_by_bus_information_block(rom);
+
+	// Just prevent from torn writing/reading.
+	WRITE_ONCE(device->quirks, quirks);
+
 	device->max_speed = device->node->max_speed;
 
 	/*
@@ -1122,10 +1143,10 @@ static void fw_device_init(struct work_struct *work)
 		device->workfn = fw_device_shutdown;
 		fw_schedule_device_work(device, SHUTDOWN_DELAY);
 	} else {
-		fw_notice(card, "created device %s: GUID %08x%08x, S%d00\n",
+		fw_notice(card, "created device %s: GUID %08x%08x, S%d00, quirks %08x\n",
 			  dev_name(&device->device),
 			  device->config_rom[3], device->config_rom[4],
-			  1 << device->max_speed);
+			  1 << device->max_speed, device->quirks);
 		device->config_rom_retries = 0;
 
 		set_broadcast_channel(device, device->generation);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 6d208769d456..161829cfcc00 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -170,6 +170,14 @@ struct fw_attribute_group {
 	struct attribute *attrs[13];
 };
 
+enum fw_device_quirk {
+	// See afa1282a35d3 ("firewire: core: check for 1394a compliant IRM, fix inaccessibility of Sony camcorder").
+	FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY = BIT(0),
+
+	// See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder").
+	FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1),
+};
+
 enum fw_device_state {
 	FW_DEVICE_INITIALIZING,
 	FW_DEVICE_RUNNING,
@@ -203,6 +211,9 @@ struct fw_device {
 	struct fw_card *card;
 	struct device device;
 
+	// A set of enum fw_device_quirk.
+	int quirks;
+
 	struct mutex client_list_mutex;
 	struct list_head client_list;
 
-- 
cgit v1.2.3


From 15f9610fc96ac6fd2844e63f7bf5a0b08e1c31c8 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 13 Oct 2025 23:03:11 +0900
Subject: firewire: core: handle device quirk of MOTU Audio Express

Commit 3a93d082bacf ("ALSA: firewire-motu: add support for MOTU Audio
Express") describes a quirk of MOTU Audio Express. The device returns an
acknowledge packet with 0x10 as the pending state for any type of
asynchronous request transaction. This is completely out of specification.

This commit implements handling for that device-specific quirk. The quirk
is detected after reading the root directory of configuration ROM. When
processing the acknowledge code in 1394 OHCI AT context event handler,
firewire-ohci module seeks the device instance of destination node by
traversing device hierarchy. If the device has the quirk, the acknowledge
code is replaced with the standard code.

The 1394 OHCI AT context events occur for outgoing asynchronous request
packets. The device traversal is safe since no new request initiators
exist after the fw_card_instance has been invalidated.

Link: https://lore.kernel.org/r/20251013140311.97159-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-device.c | 53 ++++++++++++++++++++++++++++++++++++++++++
 drivers/firewire/ohci.c        | 29 +++++++++++++++++++++++
 include/linux/firewire.h       |  3 +++
 3 files changed, 85 insertions(+)

(limited to 'include')

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 9bab2d594b89..33ce4cd357ed 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -557,6 +557,54 @@ static int detect_quirks_by_bus_information_block(const u32 *bus_information_blo
 	return quirks;
 }
 
+struct entry_match {
+	unsigned int index;
+	u32 value;
+};
+
+static const struct entry_match motu_audio_express_matches[] = {
+	{ 1, 0x030001f2 },
+	{ 3, 0xd1000002 },
+	{ 4, 0x8d000005 },
+	{ 6, 0x120001f2 },
+	{ 7, 0x13000033 },
+	{ 8, 0x17104800 },
+};
+
+static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned int length)
+{
+	static const struct {
+		enum fw_device_quirk quirk;
+		const struct entry_match *matches;
+		unsigned int match_count;
+	} *entry, entries[] = {
+		{
+			.quirk = FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE,
+			.matches = motu_audio_express_matches,
+			.match_count = ARRAY_SIZE(motu_audio_express_matches),
+		},
+	};
+	int quirks = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(entries); ++i) {
+		int j;
+
+		entry = entries + i;
+		for (j = 0; j < entry->match_count; ++j) {
+			unsigned int index = entry->matches[j].index;
+			unsigned int value = entry->matches[j].value;
+
+			if ((length < index) || (root_directory[index] != value))
+				break;
+		}
+		if (j == entry->match_count)
+			quirks |= entry->quirk;
+	}
+
+	return quirks;
+}
+
 static int read_rom(struct fw_device *device,
 		    int generation, int index, u32 *data)
 {
@@ -737,6 +785,11 @@ static int read_config_rom(struct fw_device *device, int generation)
 			length = i;
 	}
 
+	quirks |= detect_quirks_by_root_directory(rom + ROOT_DIR_OFFSET, length - ROOT_DIR_OFFSET);
+
+	// Just prevent from torn writing/reading.
+	WRITE_ONCE(device->quirks, quirks);
+
 	old_rom = device->config_rom;
 	new_rom = kmemdup(rom, length * 4, GFP_KERNEL);
 	if (new_rom == NULL) {
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 030aed5453a1..757dd9c64b1c 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1319,6 +1319,14 @@ static void at_context_flush(struct at_context *ctx)
 	enable_work(&ctx->work);
 }
 
+static int find_fw_device(struct device *dev, const void *data)
+{
+	struct fw_device *device = fw_device(dev);
+	const u32 *params = data;
+
+	return (device->generation == params[0]) && (device->node_id == params[1]);
+}
+
 static int handle_at_packet(struct context *context,
 			    struct descriptor *d,
 			    struct descriptor *last)
@@ -1390,6 +1398,27 @@ static int handle_at_packet(struct context *context,
 		fallthrough;
 
 	default:
+		if (unlikely(evt == 0x10)) {
+			u32 params[2] = {
+				packet->generation,
+				async_header_get_destination(packet->header),
+			};
+			struct device *dev;
+
+			fw_card_get(&ohci->card);
+			dev = device_find_child(ohci->card.device, (const void *)params, find_fw_device);
+			fw_card_put(&ohci->card);
+			if (dev) {
+				struct fw_device *device = fw_device(dev);
+				int quirks = READ_ONCE(device->quirks);
+
+				put_device(dev);
+				if (quirks & FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE) {
+					packet->ack = ACK_PENDING;
+					break;
+				}
+			}
+		}
 		packet->ack = RCODE_SEND_ERROR;
 		break;
 	}
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 161829cfcc00..f1d8734c0ec6 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -176,6 +176,9 @@ enum fw_device_quirk {
 
 	// See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder").
 	FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1),
+
+	// MOTU Audio Express transfers acknowledge packet with 0x10 for pending state.
+	FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2),
 };
 
 enum fw_device_state {
-- 
cgit v1.2.3


From c510368bce39cbaf4cb66f4acf788f5efa8692a6 Mon Sep 17 00:00:00 2001
From: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
Date: Wed, 1 Oct 2025 23:26:52 +0200
Subject: dt-bindings: clock: renesas,r9a09g047-cpg: Add USB2 PHY core clocks

Add definitions for USB2 PHY core clocks in the R9A09G047 CPG DT
bindings header file.

Signed-off-by: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251001212709.579080-9-tommaso.merciai.xr@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g047-cpg.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h
index f165df8a6f5a..dab24740de3c 100644
--- a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h
+++ b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h
@@ -22,5 +22,7 @@
 #define R9A09G047_GBETH_1_CLK_PTP_REF_I		11
 #define R9A09G047_USB3_0_REF_ALT_CLK_P		12
 #define R9A09G047_USB3_0_CLKCORE		13
+#define R9A09G047_USB2_0_CLK_CORE0		14
+#define R9A09G047_USB2_0_CLK_CORE1		15
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ */
-- 
cgit v1.2.3


From 53615ad26e9789bfcdf3a4dccbcecb15294ea024 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Mon, 13 Oct 2025 13:41:33 +0900
Subject: netmem: replace __netmem_clear_lsb() with netmem_to_nmdesc()

Now that we have struct netmem_desc, it is better to access the pp fields
via struct netmem_desc rather than struct net_iov.

Introduce netmem_to_nmdesc() for safely converting a netmem_ref to a
netmem_desc regardless of the underlying type, i.e. netmem_desc or net_iov.

While at it, remove __netmem_clear_lsb() and use netmem_to_nmdesc()
instead.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20251013044133.69472-1-byungchul@sk.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netmem.h   | 66 +++++++++++++++++++++++++-------------------------
 net/core/netmem_priv.h | 16 ++++++------
 2 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index f7dacc9e75fd..651e2c62d1dd 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -247,6 +247,23 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
 	return page_to_pfn(netmem_to_page(netmem));
 }
 
+/* XXX: How to extract netmem_desc from page must be changed, once
+ * netmem_desc no longer overlays on page and will be allocated through
+ * slab.
+ */
+#define __pp_page_to_nmdesc(p)	(_Generic((p),				\
+	const struct page * :	(const struct netmem_desc *)(p),	\
+	struct page * :		(struct netmem_desc *)(p)))
+
+/* CAUTION: Check if the page is a pp page before calling this helper or
+ * know it's a pp page.
+ */
+#define pp_page_to_nmdesc(p)						\
+({									\
+	DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p));		\
+	__pp_page_to_nmdesc(p);						\
+})
+
 /**
  * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing
  * @netmem
@@ -265,42 +282,25 @@ static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem)
 	return (__force struct netmem_desc *)netmem;
 }
 
-/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to
- * common fields.
- * @netmem: netmem reference to extract as net_iov.
- *
- * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic,
- * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access
- * these fields without a type check to make sure that the underlying mem is
- * net_iov or page.
+/* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for
+ * access to common fields.
+ * @netmem: netmem reference to get netmem_desc.
  *
- * The resulting value of this function can only be used to access the fields
- * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in
- * undefined behavior.
+ * All the sub types of netmem_ref (netmem_desc, net_iov) have the same
+ * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc.
  *
- * Return: the netmem_ref cast to net_iov* regardless of its underlying type.
+ * Return: the pointer to struct netmem_desc * regardless of its
+ * underlying type.
  */
-static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
+static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem)
 {
-	return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
-}
+	void *p = (void *)((__force unsigned long)netmem & ~NET_IOV);
 
-/* XXX: How to extract netmem_desc from page must be changed, once
- * netmem_desc no longer overlays on page and will be allocated through
- * slab.
- */
-#define __pp_page_to_nmdesc(p)	(_Generic((p),				\
-	const struct page * :	(const struct netmem_desc *)(p),	\
-	struct page * :		(struct netmem_desc *)(p)))
+	if (netmem_is_net_iov(netmem))
+		return &((struct net_iov *)p)->desc;
 
-/* CAUTION: Check if the page is a pp page before calling this helper or
- * know it's a pp page.
- */
-#define pp_page_to_nmdesc(p)						\
-({									\
-	DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p));		\
-	__pp_page_to_nmdesc(p);						\
-})
+	return __pp_page_to_nmdesc((struct page *)p);
+}
 
 /**
  * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem
@@ -320,12 +320,12 @@ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
 
 static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
 {
-	return __netmem_clear_lsb(netmem)->pp;
+	return netmem_to_nmdesc(netmem)->pp;
 }
 
 static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
 {
-	return &__netmem_clear_lsb(netmem)->pp_ref_count;
+	return &netmem_to_nmdesc(netmem)->pp_ref_count;
 }
 
 static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
@@ -390,7 +390,7 @@ static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
 
 static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
 {
-	return __netmem_clear_lsb(netmem)->dma_addr;
+	return netmem_to_nmdesc(netmem)->dma_addr;
 }
 
 void get_netmem(netmem_ref netmem);
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index cd95394399b4..23175cb2bd86 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -5,19 +5,19 @@
 
 static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
 {
-	return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
+	return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
 }
 
 static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
 {
-	__netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+	netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
 }
 
 static inline void netmem_clear_pp_magic(netmem_ref netmem)
 {
-	WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+	WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
 
-	__netmem_clear_lsb(netmem)->pp_magic = 0;
+	netmem_to_nmdesc(netmem)->pp_magic = 0;
 }
 
 static inline bool netmem_is_pp(netmem_ref netmem)
@@ -27,13 +27,13 @@ static inline bool netmem_is_pp(netmem_ref netmem)
 
 static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
 {
-	__netmem_clear_lsb(netmem)->pp = pool;
+	netmem_to_nmdesc(netmem)->pp = pool;
 }
 
 static inline void netmem_set_dma_addr(netmem_ref netmem,
 				       unsigned long dma_addr)
 {
-	__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+	netmem_to_nmdesc(netmem)->dma_addr = dma_addr;
 }
 
 static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
@@ -43,7 +43,7 @@ static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
 	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
 		return 0;
 
-	magic = __netmem_clear_lsb(netmem)->pp_magic;
+	magic = netmem_to_nmdesc(netmem)->pp_magic;
 
 	return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
 }
@@ -57,6 +57,6 @@ static inline void netmem_set_dma_index(netmem_ref netmem,
 		return;
 
 	magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
-	__netmem_clear_lsb(netmem)->pp_magic = magic;
+	netmem_to_nmdesc(netmem)->pp_magic = magic;
 }
 #endif
-- 
cgit v1.2.3


From 300709fbefd19ff7293c7d0ded9b56e69216e634 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Fri, 10 Oct 2025 10:51:47 +0200
Subject: mm/memory_hotplug: Remove MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE
 notifiers

MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers were introduced
to prepare the transition of memory to and from a physically accessible
state. This enhancement was crucial for implementing the "memmap on memory"
feature for s390.

With the introduction of dynamic (de)configuration of hotpluggable memory,
memory can be brought to an accessible state before add_memory() and to
an inaccessible state before remove_memory(). Hence, there is no need
for the MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers anymore.

This basically reverts commit
c5f1e2d18909 ("mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers").
Additionally, apply minor adjustments to the function parameters of
move_pfn_range_to_zone() and mhp_supports_memmap_on_memory() to ensure
compatibility with the latest branch.

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 drivers/base/memory.c          | 23 +----------------------
 include/linux/memory.h         |  9 ---------
 include/linux/memory_hotplug.h | 18 +-----------------
 include/linux/memremap.h       |  1 -
 mm/memory_hotplug.c            | 17 +++--------------
 mm/sparse.c                    |  3 +--
 6 files changed, 6 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 6d84a02cfa5d..fc43f2703ae0 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -226,7 +226,6 @@ static int memory_block_online(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
-	struct memory_notify arg;
 	struct zone *zone;
 	int ret;
 
@@ -246,19 +245,9 @@ static int memory_block_online(struct memory_block *mem)
 	if (mem->altmap)
 		nr_vmemmap_pages = mem->altmap->free;
 
-	arg.altmap_start_pfn = start_pfn;
-	arg.altmap_nr_pages = nr_vmemmap_pages;
-	arg.start_pfn = start_pfn + nr_vmemmap_pages;
-	arg.nr_pages = nr_pages - nr_vmemmap_pages;
 	mem_hotplug_begin();
-	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
-	ret = notifier_to_errno(ret);
-	if (ret)
-		goto out_notifier;
-
 	if (nr_vmemmap_pages) {
-		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
-						zone, mem->altmap->inaccessible);
+		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
 		if (ret)
 			goto out;
 	}
@@ -280,11 +269,7 @@ static int memory_block_online(struct memory_block *mem)
 					  nr_vmemmap_pages);
 
 	mem->zone = zone;
-	mem_hotplug_done();
-	return ret;
 out:
-	memory_notify(MEM_FINISH_OFFLINE, &arg);
-out_notifier:
 	mem_hotplug_done();
 	return ret;
 }
@@ -297,7 +282,6 @@ static int memory_block_offline(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
-	struct memory_notify arg;
 	int ret;
 
 	if (!mem->zone)
@@ -329,11 +313,6 @@ static int memory_block_offline(struct memory_block *mem)
 		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 
 	mem->zone = NULL;
-	arg.altmap_start_pfn = start_pfn;
-	arg.altmap_nr_pages = nr_vmemmap_pages;
-	arg.start_pfn = start_pfn + nr_vmemmap_pages;
-	arg.nr_pages = nr_pages - nr_vmemmap_pages;
-	memory_notify(MEM_FINISH_OFFLINE, &arg);
 out:
 	mem_hotplug_done();
 	return ret;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0c214256216f..ba1515160894 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order);
 #define	MEM_GOING_ONLINE	(1<<3)
 #define	MEM_CANCEL_ONLINE	(1<<4)
 #define	MEM_CANCEL_OFFLINE	(1<<5)
-#define	MEM_PREPARE_ONLINE	(1<<6)
-#define	MEM_FINISH_OFFLINE	(1<<7)
 
 struct memory_notify {
-	/*
-	 * The altmap_start_pfn and altmap_nr_pages fields are designated for
-	 * specifying the altmap range and are exclusively intended for use in
-	 * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
-	 */
-	unsigned long altmap_start_pfn;
-	unsigned long altmap_nr_pages;
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 };
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 23f038a16231..f2f16cdd73ee 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -58,22 +58,6 @@ typedef int __bitwise mhp_t;
  * implies the node id (nid).
  */
 #define MHP_NID_IS_MGID		((__force mhp_t)BIT(2))
-/*
- * The hotplugged memory is completely inaccessible while the memory is
- * offline. The memory provider will handle MEM_PREPARE_ONLINE /
- * MEM_FINISH_OFFLINE notifications and make the memory accessible.
- *
- * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY,
- * because the altmap cannot be written (e.g., poisoned) when adding
- * memory -- before it is set online.
- *
- * This allows for adding memory with an altmap that is not currently
- * made available by a hypervisor. When onlining that memory, the
- * hypervisor can be instructed to make that memory available, and
- * the onlining phase will not require any memory allocations, which is
- * helpful in low-memory situations.
- */
-#define MHP_OFFLINE_INACCESSIBLE	((__force mhp_t)BIT(3))
 
 /*
  * Extended parameters for memory hotplug:
@@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page,
 				      long nr_pages);
 /* VM interface that may be used by firmware interface */
 extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
-				     struct zone *zone, bool mhp_off_inaccessible);
+				     struct zone *zone);
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e5951ba12a28..30c7aecbd245 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -25,7 +25,6 @@ struct vmem_altmap {
 	unsigned long free;
 	unsigned long align;
 	unsigned long alloc;
-	bool inaccessible;
 };
 
 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0be83039c3b5..238a6712738e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 }
 
 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
-			      struct zone *zone, bool mhp_off_inaccessible)
+			      struct zone *zone)
 {
 	unsigned long end_pfn = pfn + nr_pages;
 	int ret, i;
@@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 	if (ret)
 		return ret;
 
-	/*
-	 * Memory block is accessible at this stage and hence poison the struct
-	 * pages now.  If the memory block is accessible during memory hotplug
-	 * addition phase, then page poisining is already performed in
-	 * sparse_add_section().
-	 */
-	if (mhp_off_inaccessible)
-		page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
-
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
 			       false);
 
@@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
 }
 
 static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
-					    u64 start, u64 size, mhp_t mhp_flags)
+					    u64 start, u64 size)
 {
 	unsigned long memblock_size = memory_block_size_bytes();
 	u64 cur_start;
@@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
 		};
 
 		mhp_altmap.free = memory_block_memmap_on_memory_pages();
-		if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
-			mhp_altmap.inaccessible = true;
 		params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
 					GFP_KERNEL);
 		if (!params.altmap) {
@@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
 	    mhp_supports_memmap_on_memory()) {
-		ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
+		ret = create_altmaps_and_memory_blocks(nid, group, start, size);
 		if (ret)
 			goto error;
 	} else {
diff --git a/mm/sparse.c b/mm/sparse.c
index 17c50a6415c2..b5b2b6f7041b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	if (!altmap || !altmap->inaccessible)
-		page_init_poison(memmap, sizeof(struct page) * nr_pages);
+	page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
 	ms = __nr_to_section(section_nr);
 	set_section_nid(section_nr, nid);
-- 
cgit v1.2.3


From 9de877338a151860c76f194934d53b7b816d339a Mon Sep 17 00:00:00 2001
From: Raphael Gallais-Pou <rgallaispou@gmail.com>
Date: Fri, 12 Sep 2025 13:36:09 +0200
Subject: media: include: remove c8sectpfe header

The c8sectpfe driver is not used anymore. Remove its dt-bindings header
file.

Signed-off-by: Raphael Gallais-Pou <rgallaispou@gmail.com>
Reviewed-by: Patrice Chotard <patrice.chotard@foss.st.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/dt-bindings/media/c8sectpfe.h | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 include/dt-bindings/media/c8sectpfe.h

(limited to 'include')

diff --git a/include/dt-bindings/media/c8sectpfe.h b/include/dt-bindings/media/c8sectpfe.h
deleted file mode 100644
index 6b1fb6f5413b..000000000000
--- a/include/dt-bindings/media/c8sectpfe.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __DT_C8SECTPFE_H
-#define __DT_C8SECTPFE_H
-
-#define STV0367_TDA18212_NIMA_1	0
-#define STV0367_TDA18212_NIMA_2	1
-#define STV0367_TDA18212_NIMB_1	2
-#define STV0367_TDA18212_NIMB_2	3
-
-#define STV0903_6110_LNB24_NIMA	4
-#define STV0903_6110_LNB24_NIMB	5
-
-#endif /* __DT_C8SECTPFE_H */
-- 
cgit v1.2.3


From 082b86919b7a94de01d849021b4da820a6cb89dc Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 8 Oct 2025 12:55:18 +0300
Subject: media: v4l2-mem2mem: Fix outdated documentation

Commit cbd9463da1b1 ("media: v4l2-mem2mem: Avoid calling .device_run in
v4l2_m2m_job_finish") deferred calls to .device_run() to a work queue to
avoid recursive calls when a job is finished right away from
.device_run(). It failed to update the v4l2_m2m_job_finish()
documentation that still states the function must not be called from
.device_run(). Fix it.

Fixes: cbd9463da1b1 ("media: v4l2-mem2mem: Avoid calling .device_run in v4l2_m2m_job_finish")
Cc: stable@vger.kernel.org
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/media/v4l2-mem2mem.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 09c6164577cc..500f81f399df 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -192,8 +192,7 @@ void v4l2_m2m_try_schedule(struct v4l2_m2m_ctx *m2m_ctx);
  * other instances to take control of the device.
  *
  * This function has to be called only after &v4l2_m2m_ops->device_run
- * callback has been called on the driver. To prevent recursion, it should
- * not be called directly from the &v4l2_m2m_ops->device_run callback though.
+ * callback has been called on the driver.
  */
 void v4l2_m2m_job_finish(struct v4l2_m2m_dev *m2m_dev,
 			 struct v4l2_m2m_ctx *m2m_ctx);
-- 
cgit v1.2.3


From 347ed2d566dabb06c7970fff01129c4f59995ed6 Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Sat, 11 Oct 2025 15:16:51 +0800
Subject: sched/ext: Implement cgroup_set_idle() callback

Implement the missing cgroup_set_idle() callback that was marked as a
TODO. This allows BPF schedulers to be notified when a cgroup's idle
state changes, enabling them to adjust their scheduling behavior
accordingly.

The implementation follows the same pattern as other cgroup callbacks
like cgroup_set_weight() and cgroup_set_bandwidth(). It checks if the
BPF scheduler has implemented the callback and invokes it with the
appropriate parameters.

While at it, fix a spelling error ("upto to" -> "up to") in the
cgroup_set_bandwidth() documentation.

tj: s/scx_cgroup_rwsem/scx_cgroup_ops_rwsem/ to fix build breakage.

Signed-off-by: zhidao su <soolaugust@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h   |  1 +
 kernel/sched/ext.c          | 16 +++++++++++++++-
 kernel/sched/ext_internal.h | 13 ++++++++++++-
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index d82b7a9b0658..9848aeab2786 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -228,6 +228,7 @@ struct scx_task_group {
 	u64			bw_period_us;
 	u64			bw_quota_us;
 	u64			bw_burst_us;
+	bool			idle;
 #endif
 };
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4b1467d3541a..430749ce46ab 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3066,6 +3066,7 @@ void scx_tg_init(struct task_group *tg)
 	tg->scx.weight = CGROUP_WEIGHT_DFL;
 	tg->scx.bw_period_us = default_bw_period_us();
 	tg->scx.bw_quota_us = RUNTIME_INF;
+	tg->scx.idle = false;
 }
 
 int scx_tg_online(struct task_group *tg)
@@ -3214,7 +3215,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 void scx_group_set_idle(struct task_group *tg, bool idle)
 {
-	/* TODO: Implement ops->cgroup_set_idle() */
+	struct scx_sched *sch = scx_root;
+
+	percpu_down_read(&scx_cgroup_ops_rwsem);
+
+	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
+		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
+			    tg_cgrp(tg), idle);
+
+	/* Update the task group's idle state */
+	tg->scx.idle = idle;
+
+	percpu_up_read(&scx_cgroup_ops_rwsem);
 }
 
 void scx_group_set_bandwidth(struct task_group *tg,
@@ -5017,6 +5029,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro
 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
+static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
 #endif
 static void sched_ext_ops__cpu_online(s32 cpu) {}
 static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5055,6 +5068,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
 	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
 	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
+	.cgroup_set_idle	= sched_ext_ops__cgroup_set_idle,
 #endif
 	.cpu_online		= sched_ext_ops__cpu_online,
 	.cpu_offline		= sched_ext_ops__cpu_offline,
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index b3617abed510..7d00a0a2456e 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -697,12 +697,23 @@ struct sched_ext_ops {
 	 * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
 	 * interpreted in the same fashion and specifies how much @cgrp can
 	 * burst temporarily. The specific control mechanism and thus the
-	 * interpretation of @period_us and burstiness is upto to the BPF
+	 * interpretation of @period_us and burstiness is up to the BPF
 	 * scheduler.
 	 */
 	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
 				     u64 period_us, u64 quota_us, u64 burst_us);
 
+	/**
+	 * @cgroup_set_idle: A cgroup's idle state is being changed
+	 * @cgrp: cgroup whose idle state is being updated
+	 * @idle: whether the cgroup is entering or exiting idle state
+	 *
+	 * Update @cgrp's idle state to @idle. This callback is invoked when
+	 * a cgroup transitions between idle and non-idle states, allowing the
+	 * BPF scheduler to adjust its behavior accordingly.
+	 */
+	void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
 	/*
-- 
cgit v1.2.3


From 1ba9f8979426590367406c70c1c821f5b943f993 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:10 -0700
Subject: vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros

TEXT_MAIN, DATA_MAIN and friends are defined differently depending on
whether certain config options enable -ffunction-sections and/or
-fdata-sections.

There's no technical reason for that beyond voodoo coding.  Keeping the
separate implementations adds unnecessary complexity, fragments the
logic, and increases the risk of subtle bugs.

Unify the macros by using the same input section patterns across all
configs.

This is a prerequisite for the upcoming livepatch klp-build tooling
which will manually enable -ffunction-sections and -fdata-sections via
KCFLAGS.

Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 40 ++++++++++++---------------------------
 scripts/module.lds.S              | 12 +++++-------
 2 files changed, 17 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a9a2e732a65..5facbc994634 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -87,39 +87,24 @@
 #define ALIGN_FUNCTION()  . = ALIGN(CONFIG_FUNCTION_ALIGNMENT)
 
 /*
- * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which
- * generates .data.identifier sections, which need to be pulled in with
- * .data. We don't want to pull in .data..other sections, which Linux
- * has defined. Same for text and bss.
+ * Support -ffunction-sections by matching .text and .text.*,
+ * but exclude '.text..*'.
  *
- * With LTO_CLANG, the linker also splits sections by default, so we need
- * these macros to combine the sections during the final link.
- *
- * With AUTOFDO_CLANG and PROPELLER_CLANG, by default, the linker splits
- * text sections and regroups functions into subsections.
- *
- * RODATA_MAIN is not used because existing code already defines .rodata.x
- * sections to be brought in with rodata.
+ * Special .text.* sections that are typically grouped separately, such as
+ * .text.unlikely or .text.hot, must be matched explicitly before using
+ * TEXT_MAIN.
  */
-#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) || \
-defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 #define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
-#else
-#define TEXT_MAIN .text
-#endif
-#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG)
+
+/*
+ * Support -fdata-sections by matching .data, .data.*, and others,
+ * but exclude '.data..*'.
+ */
 #define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data.rel.* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L*
 #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]*
 #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L*
 #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral*
 #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]*
-#else
-#define DATA_MAIN .data .data.rel .data.rel.local
-#define SDATA_MAIN .sdata
-#define RODATA_MAIN .rodata
-#define BSS_MAIN .bss
-#define SBSS_MAIN .sbss
-#endif
 
 /*
  * GCC 4.5 and later have a 32 bytes section alignment for structures.
@@ -581,9 +566,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
  * during second ld run in second ld pass when generating System.map
  *
  * TEXT_MAIN here will match symbols with a fixed pattern (for example,
- * .text.hot or .text.unlikely) if dead code elimination or
- * function-section is enabled. Match these symbols first before
- * TEXT_MAIN to ensure they are grouped together.
+ * .text.hot or .text.unlikely).  Match those before TEXT_MAIN to ensure
+ * they get grouped together.
  *
  * Also placing .text.hot section at the beginning of a page, this
  * would help the TLB performance.
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index ee79c41059f3..2632c6cb8ebe 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -38,12 +38,10 @@ SECTIONS {
 	__kcfi_traps 		: { KEEP(*(.kcfi_traps)) }
 #endif
 
-#ifdef CONFIG_LTO_CLANG
-	/*
-	 * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and
-	 * -ffunction-sections, which increases the size of the final module.
-	 * Merge the split sections in the final binary.
-	 */
+	.text : {
+		*(.text .text.[0-9a-zA-Z_]*)
+	}
+
 	.bss : {
 		*(.bss .bss.[0-9a-zA-Z_]*)
 		*(.bss..L*)
@@ -58,7 +56,7 @@ SECTIONS {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
-#endif
+
 	MOD_SEPARATE_CODETAG_SECTIONS()
 }
 
-- 
cgit v1.2.3


From afb026b6d35c79f6f47752147327932827aeac8c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:13 -0700
Subject: compiler: Tweak __UNIQUE_ID() naming

In preparation for the objtool klp diff subcommand, add an underscore
between the name and the counter.  This will make it possible for
objtool to distinguish between the non-unique and unique parts of the
symbol name so it can properly correlate the symbols.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/compiler.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5b45ea7dff3e..6a32250f22f7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__asm__ ("" : "=r" (var) : "0" (var))
 #endif
 
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+/* Format: __UNIQUE_ID_<name>_<__COUNTER__> */
+#define __UNIQUE_ID(name)					\
+	__PASTE(__UNIQUE_ID_,					\
+	__PASTE(name,						\
+	__PASTE(_, __COUNTER__)))
 
 /**
  * data_race - mark an expression as containing intentional data races
-- 
cgit v1.2.3


From 9f14f1f91883aa2bfd6663161d2002c8ce937c43 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:14 -0700
Subject: compiler.h: Make addressable symbols less of an eyesore

Avoid underscore overload by changing:

  __UNIQUE_ID___addressable_loops_per_jiffy_868

to the following:

  __UNIQUE_ID_addressable_loops_per_jiffy_868

This matches the format used by other __UNIQUE_ID()-generated symbols
and improves readability for those who stare at ELF symbol table dumps.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 6a32250f22f7..ab181d87d71d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -287,7 +287,7 @@ static inline void *offset_to_ptr(const int *off)
  */
 #define ___ADDRESSABLE(sym, __attrs)						\
 	static void * __used __attrs						\
-	__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
+	__UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym;
 
 #define __ADDRESSABLE(sym) \
 	___ADDRESSABLE(sym, __section(".discard.addressable"))
-- 
cgit v1.2.3


From c2d420796a427dda71a2400909864e7f8e037fd4 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:15 -0700
Subject: elfnote: Change ELFNOTE() to use __UNIQUE_ID()

In preparation for the objtool klp diff subcommand, replace the custom
unique symbol name generation in ELFNOTE() with __UNIQUE_ID().

This standardizes the naming format for all "unique" symbols, which will
allow objtool to properly correlate them.  Note this also removes the
"one ELF note per line" limitation.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/elfnote.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 69b136e4dd2b..bb3dcded055f 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -60,23 +60,21 @@
 
 #else	/* !__ASSEMBLER__ */
 #include <uapi/linux/elf.h>
+#include <linux/compiler.h>
 /*
  * Use an anonymous structure which matches the shape of
  * Elf{32,64}_Nhdr, but includes the name and desc data.  The size and
  * type of name and desc depend on the macro arguments.  "name" must
- * be a literal string, and "desc" must be passed by value.  You may
- * only define one note per line, since __LINE__ is used to generate
- * unique symbols.
+ * be a literal string, and "desc" must be passed by value.
  */
-#define _ELFNOTE_PASTE(a,b)	a##b
-#define _ELFNOTE(size, name, unique, type, desc)			\
+#define ELFNOTE(size, name, type, desc)					\
 	static const struct {						\
 		struct elf##size##_note _nhdr;				\
 		unsigned char _name[sizeof(name)]			\
 		__attribute__((aligned(sizeof(Elf##size##_Word))));	\
 		typeof(desc) _desc					\
 			     __attribute__((aligned(sizeof(Elf##size##_Word)))); \
-	} _ELFNOTE_PASTE(_note_, unique)				\
+	} __UNIQUE_ID(note)						\
 		__used							\
 		__attribute__((section(".note." name),			\
 			       aligned(sizeof(Elf##size##_Word)),	\
@@ -89,11 +87,10 @@
 		name,							\
 		desc							\
 	}
-#define ELFNOTE(size, name, type, desc)		\
-	_ELFNOTE(size, name, __LINE__, type, desc)
 
 #define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc)
 #define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc)
+
 #endif	/* __ASSEMBLER__ */
 
 #endif /* _LINUX_ELFNOTE_H */
-- 
cgit v1.2.3


From 6717e8f91db71641cb52855ed14c7900972ed0bc Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:16 -0700
Subject: kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME

In preparation for the objtool klp diff subcommand, remove the arbitrary
'kmod_' prefix from __KBUILD_MODNAME and instead add it explicitly in
the __initcall_id() macro.

This change supports the standardization of "unique" symbol naming by
ensuring the non-unique portion of the name comes before the unique
part.  That will enable objtool to properly correlate symbols across
builds.

Cc: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/init.h | 3 ++-
 scripts/Makefile.lib | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/init.h b/include/linux/init.h
index 17c1bc712e23..40331923b9f4 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -200,12 +200,13 @@ extern struct module __this_module;
 
 /* Format: <modname>__<counter>_<line>_<fn> */
 #define __initcall_id(fn)					\
+	__PASTE(kmod_,						\
 	__PASTE(__KBUILD_MODNAME,				\
 	__PASTE(__,						\
 	__PASTE(__COUNTER__,					\
 	__PASTE(_,						\
 	__PASTE(__LINE__,					\
-	__PASTE(_, fn))))))
+	__PASTE(_, fn)))))))
 
 /* Format: __<prefix>__<iid><id> */
 #define __initcall_name(prefix, __iid, id)			\
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 1d581ba5df66..b95560266124 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -20,7 +20,7 @@ name-fix-token = $(subst $(comma),_,$(subst -,_,$1))
 name-fix = $(call stringify,$(call name-fix-token,$1))
 basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
 modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname)) \
-		 -D__KBUILD_MODNAME=kmod_$(call name-fix-token,$(modname))
+		 -D__KBUILD_MODNAME=$(call name-fix-token,$(modname))
 modfile_flags  = -DKBUILD_MODFILE=$(call stringify,$(modfile))
 
 _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \
-- 
cgit v1.2.3


From b37491d72b43c3a322d396c2d8e951a10be70c17 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Thu, 18 Sep 2025 09:30:03 -0700
Subject: interval_tree: Fix ITSTATIC usage for *_subtree_search()

For consistency with the other function templates, change
_subtree_search_*() to use the user-supplied ITSTATIC rather than the
hard-coded 'static'.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h | 4 ++++
 include/linux/interval_tree.h                          | 4 ++++
 include/linux/interval_tree_generic.h                  | 2 +-
 include/linux/mm.h                                     | 2 ++
 lib/interval_tree.c                                    | 1 +
 tools/include/linux/interval_tree_generic.h            | 2 +-
 6 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
index 1d7fc3226bca..cfb42a8f5768 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
@@ -53,6 +53,10 @@ extern void
 usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node,
 					struct rb_root_cached *root);
 extern struct usnic_uiom_interval_node *
+usnic_uiom_interval_tree_subtree_search(struct usnic_uiom_interval_node *node,
+					unsigned long start,
+					unsigned long last);
+extern struct usnic_uiom_interval_node *
 usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root,
 					unsigned long start,
 					unsigned long last);
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
index 2b8026a39906..9d5791e9f737 100644
--- a/include/linux/interval_tree.h
+++ b/include/linux/interval_tree.h
@@ -19,6 +19,10 @@ extern void
 interval_tree_remove(struct interval_tree_node *node,
 		     struct rb_root_cached *root);
 
+extern struct interval_tree_node *
+interval_tree_subtree_search(struct interval_tree_node *node,
+			     unsigned long start, unsigned long last);
+
 extern struct interval_tree_node *
 interval_tree_iter_first(struct rb_root_cached *root,
 			 unsigned long start, unsigned long last);
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index 1b400f26f63d..c5a2fed49eb0 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node,			      \
  *   Cond2: start <= ITLAST(node)					      \
  */									      \
 									      \
-static ITSTRUCT *							      \
+ITSTATIC ITSTRUCT *							      \
 ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 {									      \
 	while (true) {							      \
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..04fa27718cd1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3369,6 +3369,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
 				    struct rb_root_cached *root);
 void vma_interval_tree_remove(struct vm_area_struct *node,
 			      struct rb_root_cached *root);
+struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node,
+				unsigned long start, unsigned long last);
 struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
 				unsigned long start, unsigned long last);
 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
index 324766e9bf63..9ceb084b6b4e 100644
--- a/lib/interval_tree.c
+++ b/lib/interval_tree.c
@@ -13,6 +13,7 @@ INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
 
 EXPORT_SYMBOL_GPL(interval_tree_insert);
 EXPORT_SYMBOL_GPL(interval_tree_remove);
+EXPORT_SYMBOL_GPL(interval_tree_subtree_search);
 EXPORT_SYMBOL_GPL(interval_tree_iter_first);
 EXPORT_SYMBOL_GPL(interval_tree_iter_next);
 
diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h
index 1b400f26f63d..c5a2fed49eb0 100644
--- a/tools/include/linux/interval_tree_generic.h
+++ b/tools/include/linux/interval_tree_generic.h
@@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node,			      \
  *   Cond2: start <= ITLAST(node)					      \
  */									      \
 									      \
-static ITSTRUCT *							      \
+ITSTATIC ITSTRUCT *							      \
 ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 {									      \
 	while (true) {							      \
-- 
cgit v1.2.3


From d2c60bde1c0fcac8b140e527546f80749ccd9c67 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:53 -0700
Subject: objtool: Move ANNOTATE* macros to annotate.h

In preparation for using the objtool annotation macros in higher-level
objtool.h macros like UNWIND_HINT, move them to their own file.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/annotate.h | 109 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/objtool.h  |  90 +-------------------------------------
 2 files changed, 110 insertions(+), 89 deletions(-)
 create mode 100644 include/linux/annotate.h

(limited to 'include')

diff --git a/include/linux/annotate.h b/include/linux/annotate.h
new file mode 100644
index 000000000000..ccb445496331
--- /dev/null
+++ b/include/linux/annotate.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ANNOTATE_H
+#define _LINUX_ANNOTATE_H
+
+#include <linux/objtool_types.h>
+
+#ifdef CONFIG_OBJTOOL
+
+#ifndef __ASSEMBLY__
+
+#define __ASM_ANNOTATE(label, type)					\
+	".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t"	\
+	".long " __stringify(label) " - .\n\t"				\
+	".long " __stringify(type) "\n\t"				\
+	".popsection\n\t"
+
+#define ASM_ANNOTATE(type)						\
+	"911:\n\t"							\
+	__ASM_ANNOTATE(911b, type)
+
+#else /* __ASSEMBLY__ */
+
+.macro ANNOTATE type:req
+.Lhere_\@:
+	.pushsection .discard.annotate_insn,"M",@progbits,8
+	.long	.Lhere_\@ - .
+	.long	\type
+	.popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#else /* !CONFIG_OBJTOOL */
+#ifndef __ASSEMBLY__
+#define __ASM_ANNOTATE(label, type) ""
+#define ASM_ANNOTATE(type)
+#else /* __ASSEMBLY__ */
+.macro ANNOTATE type:req
+.endm
+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_OBJTOOL */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Annotate away the various 'relocation to !ENDBR` complaints; knowing that
+ * these relocations will never be used for indirect calls.
+ */
+#define ANNOTATE_NOENDBR		ASM_ANNOTATE(ANNOTYPE_NOENDBR)
+#define ANNOTATE_NOENDBR_SYM(sym)	asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR))
+
+/*
+ * This should be used immediately before an indirect jump/call. It tells
+ * objtool the subsequent indirect jump/call is vouched safe for retpoline
+ * builds.
+ */
+#define ANNOTATE_RETPOLINE_SAFE		ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE)
+/*
+ * See linux/instrumentation.h
+ */
+#define ANNOTATE_INSTR_BEGIN(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN)
+#define ANNOTATE_INSTR_END(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_END)
+/*
+ * objtool annotation to ignore the alternatives and only consider the original
+ * instruction(s).
+ */
+#define ANNOTATE_IGNORE_ALTERNATIVE	ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS)
+/*
+ * This macro indicates that the following intra-function call is valid.
+ * Any non-annotated intra-function call will cause objtool to issue a warning.
+ */
+#define ANNOTATE_INTRA_FUNCTION_CALL	ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL)
+/*
+ * Use objtool to validate the entry requirement that all code paths do
+ * VALIDATE_UNRET_END before RET.
+ *
+ * NOTE: The macro must be used at the beginning of a global symbol, otherwise
+ * it will be ignored.
+ */
+#define ANNOTATE_UNRET_BEGIN		ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN)
+/*
+ * This should be used to refer to an instruction that is considered
+ * terminating, like a noreturn CALL or UD2 when we know they are not -- eg
+ * WARN using UD2.
+ */
+#define ANNOTATE_REACHABLE(label)	__ASM_ANNOTATE(label, ANNOTYPE_REACHABLE)
+/*
+ * This should not be used; it annotates away CFI violations. There are a few
+ * valid use cases like kexec handover to the next kernel image, and there is
+ * no security concern there.
+ *
+ * There are also a few real issues annotated away, like EFI because we can't
+ * control the EFI code.
+ */
+#define ANNOTATE_NOCFI_SYM(sym)		asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI))
+
+#else /* __ASSEMBLY__ */
+#define ANNOTATE_NOENDBR		ANNOTATE type=ANNOTYPE_NOENDBR
+#define ANNOTATE_RETPOLINE_SAFE		ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE
+/*	ANNOTATE_INSTR_BEGIN		ANNOTATE type=ANNOTYPE_INSTR_BEGIN */
+/*	ANNOTATE_INSTR_END		ANNOTATE type=ANNOTYPE_INSTR_END */
+#define ANNOTATE_IGNORE_ALTERNATIVE	ANNOTATE type=ANNOTYPE_IGNORE_ALTS
+#define ANNOTATE_INTRA_FUNCTION_CALL	ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL
+#define ANNOTATE_UNRET_BEGIN		ANNOTATE type=ANNOTYPE_UNRET_BEGIN
+#define ANNOTATE_REACHABLE		ANNOTATE type=ANNOTYPE_REACHABLE
+#define ANNOTATE_NOCFI_SYM		ANNOTATE type=ANNOTYPE_NOCFI
+#endif /* __ASSEMBLY__ */
+
+#endif /* _LINUX_ANNOTATE_H */
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 46ebaa46e6c5..1973e9f14bf9 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -3,11 +3,10 @@
 #define _LINUX_OBJTOOL_H
 
 #include <linux/objtool_types.h>
+#include <linux/annotate.h>
 
 #ifdef CONFIG_OBJTOOL
 
-#include <asm/asm.h>
-
 #ifndef __ASSEMBLY__
 
 #define UNWIND_HINT(type, sp_reg, sp_offset, signal)	\
@@ -53,16 +52,6 @@
 
 #define __ASM_BREF(label)	label ## b
 
-#define __ASM_ANNOTATE(label, type)					\
-	".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t"	\
-	".long " __stringify(label) " - .\n\t"			\
-	".long " __stringify(type) "\n\t"				\
-	".popsection\n\t"
-
-#define ASM_ANNOTATE(type)						\
-	"911:\n\t"						\
-	__ASM_ANNOTATE(911b, type)
-
 #else /* __ASSEMBLY__ */
 
 /*
@@ -111,14 +100,6 @@
 #endif
 .endm
 
-.macro ANNOTATE type:req
-.Lhere_\@:
-	.pushsection .discard.annotate_insn,"M",@progbits,8
-	.long	.Lhere_\@ - .
-	.long	\type
-	.popsection
-.endm
-
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_OBJTOOL */
@@ -128,84 +109,15 @@
 #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
-#define __ASM_ANNOTATE(label, type) ""
-#define ASM_ANNOTATE(type)
 #else
 .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
-.macro ANNOTATE type:req
-.endm
 #endif
 
 #endif /* CONFIG_OBJTOOL */
 
-#ifndef __ASSEMBLY__
-/*
- * Annotate away the various 'relocation to !ENDBR` complaints; knowing that
- * these relocations will never be used for indirect calls.
- */
-#define ANNOTATE_NOENDBR		ASM_ANNOTATE(ANNOTYPE_NOENDBR)
-#define ANNOTATE_NOENDBR_SYM(sym)	asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR))
-
-/*
- * This should be used immediately before an indirect jump/call. It tells
- * objtool the subsequent indirect jump/call is vouched safe for retpoline
- * builds.
- */
-#define ANNOTATE_RETPOLINE_SAFE		ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE)
-/*
- * See linux/instrumentation.h
- */
-#define ANNOTATE_INSTR_BEGIN(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN)
-#define ANNOTATE_INSTR_END(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_END)
-/*
- * objtool annotation to ignore the alternatives and only consider the original
- * instruction(s).
- */
-#define ANNOTATE_IGNORE_ALTERNATIVE	ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS)
-/*
- * This macro indicates that the following intra-function call is valid.
- * Any non-annotated intra-function call will cause objtool to issue a warning.
- */
-#define ANNOTATE_INTRA_FUNCTION_CALL	ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL)
-/*
- * Use objtool to validate the entry requirement that all code paths do
- * VALIDATE_UNRET_END before RET.
- *
- * NOTE: The macro must be used at the beginning of a global symbol, otherwise
- * it will be ignored.
- */
-#define ANNOTATE_UNRET_BEGIN		ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN)
-/*
- * This should be used to refer to an instruction that is considered
- * terminating, like a noreturn CALL or UD2 when we know they are not -- eg
- * WARN using UD2.
- */
-#define ANNOTATE_REACHABLE(label)	__ASM_ANNOTATE(label, ANNOTYPE_REACHABLE)
-/*
- * This should not be used; it annotates away CFI violations. There are a few
- * valid use cases like kexec handover to the next kernel image, and there is
- * no security concern there.
- *
- * There are also a few real issues annotated away, like EFI because we can't
- * control the EFI code.
- */
-#define ANNOTATE_NOCFI_SYM(sym)		asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI))
-
-#else
-#define ANNOTATE_NOENDBR		ANNOTATE type=ANNOTYPE_NOENDBR
-#define ANNOTATE_RETPOLINE_SAFE		ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE
-/*	ANNOTATE_INSTR_BEGIN		ANNOTATE type=ANNOTYPE_INSTR_BEGIN */
-/*	ANNOTATE_INSTR_END		ANNOTATE type=ANNOTYPE_INSTR_END */
-#define ANNOTATE_IGNORE_ALTERNATIVE	ANNOTATE type=ANNOTYPE_IGNORE_ALTS
-#define ANNOTATE_INTRA_FUNCTION_CALL	ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL
-#define ANNOTATE_UNRET_BEGIN		ANNOTATE type=ANNOTYPE_UNRET_BEGIN
-#define ANNOTATE_REACHABLE		ANNOTATE type=ANNOTYPE_REACHABLE
-#define ANNOTATE_NOCFI_SYM		ANNOTATE type=ANNOTYPE_NOCFI
-#endif
-
 #if defined(CONFIG_NOINSTR_VALIDATION) && \
 	(defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
 #define VALIDATE_UNRET_BEGIN	ANNOTATE_UNRET_BEGIN
-- 
cgit v1.2.3


From 58f36a5756445dcd0a733504cd798955ebe968c1 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:54 -0700
Subject: objtool: Add ANNOTATE_DATA_SPECIAL

In preparation for the objtool klp diff subcommand, add an
ANNOTATE_DATA_SPECIAL macro which annotates special section entries so
that objtool can determine their size and location and extract them
when needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/annotate.h            | 49 ++++++++++++++++++++++++++++---------
 include/linux/objtool_types.h       |  2 ++
 tools/include/linux/objtool_types.h |  2 ++
 3 files changed, 41 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/annotate.h b/include/linux/annotate.h
index ccb445496331..7c10d34d198c 100644
--- a/include/linux/annotate.h
+++ b/include/linux/annotate.h
@@ -8,34 +8,52 @@
 
 #ifndef __ASSEMBLY__
 
-#define __ASM_ANNOTATE(label, type)					\
-	".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t"	\
+#define __ASM_ANNOTATE(section, label, type)				\
+	".pushsection " section ",\"M\", @progbits, 8\n\t"		\
 	".long " __stringify(label) " - .\n\t"				\
 	".long " __stringify(type) "\n\t"				\
 	".popsection\n\t"
 
+#define ASM_ANNOTATE_LABEL(label, type)					\
+	__ASM_ANNOTATE(".discard.annotate_insn", label, type)
+
 #define ASM_ANNOTATE(type)						\
 	"911:\n\t"							\
-	__ASM_ANNOTATE(911b, type)
+	ASM_ANNOTATE_LABEL(911b, type)
+
+#define ASM_ANNOTATE_DATA(type)						\
+	"912:\n\t"							\
+	__ASM_ANNOTATE(".discard.annotate_data", 912b, type)
 
 #else /* __ASSEMBLY__ */
 
-.macro ANNOTATE type:req
+.macro __ANNOTATE section, type
 .Lhere_\@:
-	.pushsection .discard.annotate_insn,"M",@progbits,8
+	.pushsection \section, "M", @progbits, 8
 	.long	.Lhere_\@ - .
 	.long	\type
 	.popsection
 .endm
 
+.macro ANNOTATE type
+	__ANNOTATE ".discard.annotate_insn", \type
+.endm
+
+.macro ANNOTATE_DATA type
+	__ANNOTATE ".discard.annotate_data", \type
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_OBJTOOL */
 #ifndef __ASSEMBLY__
-#define __ASM_ANNOTATE(label, type) ""
+#define ASM_ANNOTATE_LABEL(label, type) ""
 #define ASM_ANNOTATE(type)
+#define ASM_ANNOTATE_DATA(type)
 #else /* __ASSEMBLY__ */
-.macro ANNOTATE type:req
+.macro ANNOTATE type
+.endm
+.macro ANNOTATE_DATA type
 .endm
 #endif /* __ASSEMBLY__ */
 #endif /* !CONFIG_OBJTOOL */
@@ -47,7 +65,7 @@
  * these relocations will never be used for indirect calls.
  */
 #define ANNOTATE_NOENDBR		ASM_ANNOTATE(ANNOTYPE_NOENDBR)
-#define ANNOTATE_NOENDBR_SYM(sym)	asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR))
+#define ANNOTATE_NOENDBR_SYM(sym)	asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR))
 
 /*
  * This should be used immediately before an indirect jump/call. It tells
@@ -58,8 +76,8 @@
 /*
  * See linux/instrumentation.h
  */
-#define ANNOTATE_INSTR_BEGIN(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN)
-#define ANNOTATE_INSTR_END(label)	__ASM_ANNOTATE(label, ANNOTYPE_INSTR_END)
+#define ANNOTATE_INSTR_BEGIN(label)	ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN)
+#define ANNOTATE_INSTR_END(label)	ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END)
 /*
  * objtool annotation to ignore the alternatives and only consider the original
  * instruction(s).
@@ -83,7 +101,7 @@
  * terminating, like a noreturn CALL or UD2 when we know they are not -- eg
  * WARN using UD2.
  */
-#define ANNOTATE_REACHABLE(label)	__ASM_ANNOTATE(label, ANNOTYPE_REACHABLE)
+#define ANNOTATE_REACHABLE(label)	ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE)
 /*
  * This should not be used; it annotates away CFI violations. There are a few
  * valid use cases like kexec handover to the next kernel image, and there is
@@ -92,7 +110,13 @@
  * There are also a few real issues annotated away, like EFI because we can't
  * control the EFI code.
  */
-#define ANNOTATE_NOCFI_SYM(sym)		asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI))
+#define ANNOTATE_NOCFI_SYM(sym)		asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI))
+
+/*
+ * Annotate a special section entry.  This enables livepatch module generation
+ * to find and extract individual special section entries as needed.
+ */
+#define ANNOTATE_DATA_SPECIAL		ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL)
 
 #else /* __ASSEMBLY__ */
 #define ANNOTATE_NOENDBR		ANNOTATE type=ANNOTYPE_NOENDBR
@@ -104,6 +128,7 @@
 #define ANNOTATE_UNRET_BEGIN		ANNOTATE type=ANNOTYPE_UNRET_BEGIN
 #define ANNOTATE_REACHABLE		ANNOTATE type=ANNOTYPE_REACHABLE
 #define ANNOTATE_NOCFI_SYM		ANNOTATE type=ANNOTYPE_NOCFI
+#define ANNOTATE_DATA_SPECIAL		ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL
 #endif /* __ASSEMBLY__ */
 
 #endif /* _LINUX_ANNOTATE_H */
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index aceac94632c8..c6def4049b1a 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -67,4 +67,6 @@ struct unwind_hint {
 #define ANNOTYPE_REACHABLE		8
 #define ANNOTYPE_NOCFI			9
 
+#define ANNOTYPE_DATA_SPECIAL		1
+
 #endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
index aceac94632c8..c6def4049b1a 100644
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -67,4 +67,6 @@ struct unwind_hint {
 #define ANNOTYPE_REACHABLE		8
 #define ANNOTYPE_NOCFI			9
 
+#define ANNOTYPE_DATA_SPECIAL		1
+
 #endif /* _LINUX_OBJTOOL_TYPES_H */
-- 
cgit v1.2.3


From aca282ab7e75dd3c1d14230146357a03bef12194 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:55 -0700
Subject: x86/asm: Annotate special section entries

In preparation for the objtool klp diff subcommand, add annotations for
special section entries.  This will enable objtool to determine the size
and location of the entries and to extract them when needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/include/asm/alternative.h | 4 ++++
 arch/x86/include/asm/asm.h         | 5 +++++
 arch/x86/include/asm/bug.h         | 1 +
 arch/x86/include/asm/cpufeature.h  | 1 +
 arch/x86/include/asm/jump_label.h  | 1 +
 include/linux/objtool.h            | 4 +++-
 6 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 15bc07a5ebb3..b14c045679e1 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 
 #define ALTINSTR_ENTRY(ft_flags)					      \
 	".pushsection .altinstructions,\"a\"\n"				      \
+	ANNOTATE_DATA_SPECIAL						      \
 	" .long 771b - .\n"				/* label           */ \
 	" .long 774f - .\n"				/* new instruction */ \
 	" .4byte " __stringify(ft_flags) "\n"		/* feature + flags */ \
@@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 
 #define ALTINSTR_REPLACEMENT(newinstr)		/* replacement */	\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
+	ANNOTATE_DATA_SPECIAL						\
 	"# ALT: replacement\n"						\
 	"774:\n\t" newinstr "\n775:\n"					\
 	".popsection\n"
@@ -337,6 +339,7 @@ void nop_func(void);
  * instruction. See apply_alternatives().
  */
 .macro altinstr_entry orig alt ft_flags orig_len alt_len
+	ANNOTATE_DATA_SPECIAL
 	.long \orig - .
 	.long \alt - .
 	.4byte \ft_flags
@@ -365,6 +368,7 @@ void nop_func(void);
 	.popsection ;							\
 	.pushsection .altinstr_replacement,"ax"	;			\
 743:									\
+	ANNOTATE_DATA_SPECIAL ;						\
 	newinst	;							\
 744:									\
 	.popsection ;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index d5c8d3afe196..bd62bd87a841 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_ASM_H
 #define _ASM_X86_ASM_H
 
+#include <linux/annotate.h>
+
 #ifdef __ASSEMBLER__
 # define __ASM_FORM(x, ...)		x,## __VA_ARGS__
 # define __ASM_FORM_RAW(x, ...)		x,## __VA_ARGS__
@@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
 # define _ASM_EXTABLE_TYPE(from, to, type)			\
 	.pushsection "__ex_table","a" ;				\
 	.balign 4 ;						\
+	ANNOTATE_DATA_SPECIAL ;					\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
 	.long type ;						\
@@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
 # define _ASM_EXTABLE_TYPE(from, to, type)			\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
 	" .balign 4\n"						\
+	ANNOTATE_DATA_SPECIAL					\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
 	" .long " __stringify(type) " \n"			\
@@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
 # define _ASM_EXTABLE_TYPE_REG(from, to, type, reg)				\
 	" .pushsection \"__ex_table\",\"a\"\n"					\
 	" .balign 4\n"								\
+	ANNOTATE_DATA_SPECIAL							\
 	" .long (" #from ") - .\n"						\
 	" .long (" #to ") - .\n"						\
 	DEFINE_EXTABLE_TYPE_REG							\
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 880ca15073ed..372f4018880c 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -57,6 +57,7 @@
 #define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra)		\
 	"1:\t" ins "\n"							\
 	".pushsection __bug_table,\"aw\"\n"				\
+	ANNOTATE_DATA_SPECIAL						\
 	__BUG_ENTRY(file, line, flags)					\
 	"\t.org 2b + " size "\n"					\
 	".popsection\n"							\
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 893cbca37fe9..fc5f32d4da6e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
 	asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
 		".pushsection .altinstr_aux,\"ax\"\n"
 		"6:\n"
+		ANNOTATE_DATA_SPECIAL
 		" testb %[bitnum], %a[cap_byte]\n"
 		" jnz %l[t_yes]\n"
 		" jmp %l[t_no]\n"
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 61dd1dee7812..e0a6930a4029 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -15,6 +15,7 @@
 #define JUMP_TABLE_ENTRY(key, label)			\
 	".pushsection __jump_table,  \"aw\" \n\t"	\
 	_ASM_ALIGN "\n\t"				\
+	ANNOTATE_DATA_SPECIAL				\
 	".long 1b - . \n\t"				\
 	".long " label " - . \n\t"			\
 	_ASM_PTR " " key " - . \n\t"			\
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 1973e9f14bf9..4fea6a042b28 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -9,9 +9,10 @@
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(type, sp_reg, sp_offset, signal)	\
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal)		\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
+	ANNOTATE_DATA_SPECIAL					\
 	/* struct unwind_hint */				\
 	".long 987b - .\n\t"					\
 	".short " __stringify(sp_offset) "\n\t"			\
@@ -78,6 +79,7 @@
 .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
 .Lhere_\@:
 	.pushsection .discard.unwind_hints
+		ANNOTATE_DATA_SPECIAL
 		/* struct unwind_hint */
 		.long .Lhere_\@ - .
 		.short \sp_offset
-- 
cgit v1.2.3


From f6b740ef5f4724f95363ac0d664e88d221343fa1 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:56 -0700
Subject: objtool: Unify STACK_FRAME_NON_STANDARD entry sizes

The C implementation of STACK_FRAME_NON_STANDARD emits 8-byte entries,
whereas the asm version's entries are only 4 bytes.

Make them consistent by converting the asm version to 8-byte entries.

This is much easier than converting the C version to 4-bytes, which
would require awkwardly putting inline asm in a dummy function in order
to pass the 'func' pointer to the asm.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/objtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 4fea6a042b28..b18ab53561c9 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -92,7 +92,7 @@
 
 .macro STACK_FRAME_NON_STANDARD func:req
 	.pushsection .discard.func_stack_frame_non_standard, "aw"
-	.long \func - .
+	.quad \func
 	.popsection
 .endm
 
-- 
cgit v1.2.3


From dd590d4d57ebeeb826823c288741f2ed20f452af Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:03:59 -0700
Subject: objtool/klp: Introduce klp diff subcommand for diffing object files

Add a new klp diff subcommand which performs a binary diff between two
object files and extracts changed functions into a new object which can
then be linked into a livepatch module.

This builds on concepts from the longstanding out-of-tree kpatch [1]
project which began in 2012 and has been used for many years to generate
livepatch modules for production kernels.  However, this is a complete
rewrite which incorporates hard-earned lessons from 12+ years of
maintaining kpatch.

Key improvements compared to kpatch-build:

  - Integrated with objtool: Leverages objtool's existing control-flow
    graph analysis to help detect changed functions.

  - Works on vmlinux.o: Supports late-linked objects, making it
    compatible with LTO, IBT, and similar.

  - Simplified code base: ~3k fewer lines of code.

  - Upstream: No more out-of-tree #ifdef hacks, far less cruft.

  - Cleaner internals: Vastly simplified logic for symbol/section/reloc
    inclusion and special section extraction.

  - Robust __LINE__ macro handling: Avoids false positive binary diffs
    caused by the __LINE__ macro by introducing a fix-patch-lines script
    (coming in a later patch) which injects #line directives into the
    source .patch to preserve the original line numbers at compile time.

Note the end result of this subcommand is not yet functionally complete.
Livepatch needs some ELF magic which linkers don't like:

  - Two relocation sections (.rela*, .klp.rela*) for the same text
    section.

  - Use of SHN_LIVEPATCH to mark livepatch symbols.

Unfortunately linkers tend to mangle such things.  To work around that,
klp diff generates a linker-compliant intermediate binary which encodes
the relevant KLP section/reloc/symbol metadata.

After module linking, a klp post-link step (coming soon) will clean up
the mess and convert the linked .ko into a fully compliant livepatch
module.

Note this subcommand requires the diffed binaries to have been compiled
with -ffunction-sections and -fdata-sections, and processed with
'objtool --checksum'.  Those constraints will be handled by a klp-build
script introduced in a later patch.

Without '-ffunction-sections -fdata-sections', reliable object diffing
would be infeasible due to toolchain limitations:

  - For intra-file+intra-section references, the compiler might
    occasionally generate hard-coded instruction offsets instead of
    relocations.

  - Section-symbol-based references can be ambiguous:

    - Overlapping or zero-length symbols create ambiguity as to which
      symbol is being referenced.

    - A reference to the end of a symbol (e.g., checking array bounds)
      can be misinterpreted as a reference to the next symbol, or vice
      versa.

A potential future alternative to '-ffunction-sections -fdata-sections'
would be to introduce a toolchain option that forces symbol-based
(non-section) relocations.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 MAINTAINERS                              |    2 +-
 include/linux/livepatch.h                |   25 +-
 include/linux/livepatch_external.h       |   76 ++
 kernel/livepatch/core.c                  |    4 +-
 scripts/module.lds.S                     |   10 +-
 tools/include/linux/livepatch_external.h |   76 ++
 tools/include/linux/string.h             |   14 +
 tools/objtool/Build                      |    4 +-
 tools/objtool/Makefile                   |    3 +-
 tools/objtool/arch/x86/decode.c          |   40 +
 tools/objtool/builtin-klp.c              |   52 +
 tools/objtool/check.c                    |   14 -
 tools/objtool/elf.c                      |   21 +-
 tools/objtool/include/objtool/arch.h     |    1 +
 tools/objtool/include/objtool/builtin.h  |    2 +
 tools/objtool/include/objtool/elf.h      |   56 +-
 tools/objtool/include/objtool/klp.h      |   31 +
 tools/objtool/include/objtool/objtool.h  |    2 +
 tools/objtool/include/objtool/util.h     |   19 +
 tools/objtool/klp-diff.c                 | 1646 ++++++++++++++++++++++++++++++
 tools/objtool/objtool.c                  |   41 +-
 tools/objtool/sync-check.sh              |    1 +
 tools/objtool/weak.c                     |    7 +
 23 files changed, 2088 insertions(+), 59 deletions(-)
 create mode 100644 include/linux/livepatch_external.h
 create mode 100644 tools/include/linux/livepatch_external.h
 create mode 100644 tools/objtool/builtin-klp.c
 create mode 100644 tools/objtool/include/objtool/klp.h
 create mode 100644 tools/objtool/include/objtool/util.h
 create mode 100644 tools/objtool/klp-diff.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..755e2528f839 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14439,7 +14439,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g
 F:	Documentation/ABI/testing/sysfs-kernel-livepatch
 F:	Documentation/livepatch/
 F:	arch/powerpc/include/asm/livepatch.h
-F:	include/linux/livepatch.h
+F:	include/linux/livepatch*.h
 F:	kernel/livepatch/
 F:	kernel/module/livepatch.c
 F:	samples/livepatch/
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 51a258c24ff5..772919e8096a 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
 #include <linux/ftrace.h>
 #include <linux/completion.h>
 #include <linux/list.h>
+#include <linux/livepatch_external.h>
 #include <linux/livepatch_sched.h>
 
 #if IS_ENABLED(CONFIG_LIVEPATCH)
@@ -77,30 +78,6 @@ struct klp_func {
 	bool transition;
 };
 
-struct klp_object;
-
-/**
- * struct klp_callbacks - pre/post live-(un)patch callback structure
- * @pre_patch:		executed before code patching
- * @post_patch:		executed after code patching
- * @pre_unpatch:	executed before code unpatching
- * @post_unpatch:	executed after code unpatching
- * @post_unpatch_enabled:	flag indicating if post-unpatch callback
- * 				should run
- *
- * All callbacks are optional.  Only the pre-patch callback, if provided,
- * will be unconditionally executed.  If the parent klp_object fails to
- * patch for any reason, including a non-zero error status returned from
- * the pre-patch callback, no further callbacks will be executed.
- */
-struct klp_callbacks {
-	int (*pre_patch)(struct klp_object *obj);
-	void (*post_patch)(struct klp_object *obj);
-	void (*pre_unpatch)(struct klp_object *obj);
-	void (*post_unpatch)(struct klp_object *obj);
-	bool post_unpatch_enabled;
-};
-
 /**
  * struct klp_object - kernel object structure for live patching
  * @name:	module name (or NULL for vmlinux)
diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h
new file mode 100644
index 000000000000..138af19b0f5c
--- /dev/null
+++ b/include/linux/livepatch_external.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * External livepatch interfaces for patch creation tooling
+ */
+
+#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_
+#define _LINUX_LIVEPATCH_EXTERNAL_H_
+
+#include <linux/types.h>
+
+#define KLP_RELOC_SEC_PREFIX		".klp.rela."
+#define KLP_SYM_PREFIX			".klp.sym."
+
+#define __KLP_PRE_PATCH_PREFIX		__klp_pre_patch_callback_
+#define __KLP_POST_PATCH_PREFIX		__klp_post_patch_callback_
+#define __KLP_PRE_UNPATCH_PREFIX	__klp_pre_unpatch_callback_
+#define __KLP_POST_UNPATCH_PREFIX	__klp_post_unpatch_callback_
+
+#define KLP_PRE_PATCH_PREFIX		__stringify(__KLP_PRE_PATCH_PREFIX)
+#define KLP_POST_PATCH_PREFIX		__stringify(__KLP_POST_PATCH_PREFIX)
+#define KLP_PRE_UNPATCH_PREFIX		__stringify(__KLP_PRE_UNPATCH_PREFIX)
+#define KLP_POST_UNPATCH_PREFIX		__stringify(__KLP_POST_UNPATCH_PREFIX)
+
+struct klp_object;
+
+typedef int (*klp_pre_patch_t)(struct klp_object *obj);
+typedef void (*klp_post_patch_t)(struct klp_object *obj);
+typedef void (*klp_pre_unpatch_t)(struct klp_object *obj);
+typedef void (*klp_post_unpatch_t)(struct klp_object *obj);
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch:		executed before code patching
+ * @post_patch:		executed after code patching
+ * @pre_unpatch:	executed before code unpatching
+ * @post_unpatch:	executed after code unpatching
+ * @post_unpatch_enabled:	flag indicating if post-unpatch callback
+ *				should run
+ *
+ * All callbacks are optional.  Only the pre-patch callback, if provided,
+ * will be unconditionally executed.  If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+	klp_pre_patch_t		pre_patch;
+	klp_post_patch_t	post_patch;
+	klp_pre_unpatch_t	pre_unpatch;
+	klp_post_unpatch_t	post_unpatch;
+	bool post_unpatch_enabled;
+};
+
+/*
+ * 'struct klp_{func,object}_ext' are compact "external" representations of
+ * 'struct klp_{func,object}'.   They are used by objtool for livepatch
+ * generation.  The structs are then read by the livepatch module and converted
+ * to the real structs before calling klp_enable_patch().
+ *
+ * TODO make these the official API for klp_enable_patch().  That should
+ * simplify livepatch's interface as well as its data structure lifetime
+ * management.
+ */
+struct klp_func_ext {
+	const char *old_name;
+	void *new_func;
+	unsigned long sympos;
+};
+
+struct klp_object_ext {
+	const char *name;
+	struct klp_func_ext *funcs;
+	struct klp_callbacks callbacks;
+	unsigned int nr_funcs;
+};
+
+#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 7e443c2cf7d4..0044a8125013 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -224,7 +224,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
 
 		/* Format: .klp.sym.sym_objname.sym_name,sympos */
 		cnt = sscanf(strtab + sym->st_name,
-			     ".klp.sym.%55[^.].%511[^,],%lu",
+			     KLP_SYM_PREFIX "%55[^.].%511[^,],%lu",
 			     sym_objname, sym_name, &sympos);
 		if (cnt != 3) {
 			pr_err("symbol %s has an incorrectly formatted name\n",
@@ -303,7 +303,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
 	 * See comment in klp_resolve_symbols() for an explanation
 	 * of the selected field width value.
 	 */
-	cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]",
+	cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]",
 		     sec_objname);
 	if (cnt != 1) {
 		pr_err("section %s has an incorrectly formatted name\n",
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 2632c6cb8ebe..3037d5e5527c 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -34,8 +34,16 @@ SECTIONS {
 
 	__patchable_function_entries : { *(__patchable_function_entries) }
 
+	__klp_funcs		0: ALIGN(8) { KEEP(*(__klp_funcs)) }
+
+	__klp_objects		0: ALIGN(8) {
+		__start_klp_objects = .;
+		KEEP(*(__klp_objects))
+		__stop_klp_objects = .;
+	}
+
 #ifdef CONFIG_ARCH_USES_CFI_TRAPS
-	__kcfi_traps 		: { KEEP(*(.kcfi_traps)) }
+	__kcfi_traps		: { KEEP(*(.kcfi_traps)) }
 #endif
 
 	.text : {
diff --git a/tools/include/linux/livepatch_external.h b/tools/include/linux/livepatch_external.h
new file mode 100644
index 000000000000..138af19b0f5c
--- /dev/null
+++ b/tools/include/linux/livepatch_external.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * External livepatch interfaces for patch creation tooling
+ */
+
+#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_
+#define _LINUX_LIVEPATCH_EXTERNAL_H_
+
+#include <linux/types.h>
+
+#define KLP_RELOC_SEC_PREFIX		".klp.rela."
+#define KLP_SYM_PREFIX			".klp.sym."
+
+#define __KLP_PRE_PATCH_PREFIX		__klp_pre_patch_callback_
+#define __KLP_POST_PATCH_PREFIX		__klp_post_patch_callback_
+#define __KLP_PRE_UNPATCH_PREFIX	__klp_pre_unpatch_callback_
+#define __KLP_POST_UNPATCH_PREFIX	__klp_post_unpatch_callback_
+
+#define KLP_PRE_PATCH_PREFIX		__stringify(__KLP_PRE_PATCH_PREFIX)
+#define KLP_POST_PATCH_PREFIX		__stringify(__KLP_POST_PATCH_PREFIX)
+#define KLP_PRE_UNPATCH_PREFIX		__stringify(__KLP_PRE_UNPATCH_PREFIX)
+#define KLP_POST_UNPATCH_PREFIX		__stringify(__KLP_POST_UNPATCH_PREFIX)
+
+struct klp_object;
+
+typedef int (*klp_pre_patch_t)(struct klp_object *obj);
+typedef void (*klp_post_patch_t)(struct klp_object *obj);
+typedef void (*klp_pre_unpatch_t)(struct klp_object *obj);
+typedef void (*klp_post_unpatch_t)(struct klp_object *obj);
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch:		executed before code patching
+ * @post_patch:		executed after code patching
+ * @pre_unpatch:	executed before code unpatching
+ * @post_unpatch:	executed after code unpatching
+ * @post_unpatch_enabled:	flag indicating if post-unpatch callback
+ *				should run
+ *
+ * All callbacks are optional.  Only the pre-patch callback, if provided,
+ * will be unconditionally executed.  If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+	klp_pre_patch_t		pre_patch;
+	klp_post_patch_t	post_patch;
+	klp_pre_unpatch_t	pre_unpatch;
+	klp_post_unpatch_t	post_unpatch;
+	bool post_unpatch_enabled;
+};
+
+/*
+ * 'struct klp_{func,object}_ext' are compact "external" representations of
+ * 'struct klp_{func,object}'.   They are used by objtool for livepatch
+ * generation.  The structs are then read by the livepatch module and converted
+ * to the real structs before calling klp_enable_patch().
+ *
+ * TODO make these the official API for klp_enable_patch().  That should
+ * simplify livepatch's interface as well as its data structure lifetime
+ * management.
+ */
+struct klp_func_ext {
+	const char *old_name;
+	void *new_func;
+	unsigned long sympos;
+};
+
+struct klp_object_ext {
+	const char *name;
+	struct klp_func_ext *funcs;
+	struct klp_callbacks callbacks;
+	unsigned int nr_funcs;
+};
+
+#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */
diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h
index 8499f509f03e..51ad3cf4fa82 100644
--- a/tools/include/linux/string.h
+++ b/tools/include/linux/string.h
@@ -44,6 +44,20 @@ static inline bool strstarts(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 
+/*
+ * Returns true when @str ends with @substr.  An empty @substr matches any
+ * string; a @substr longer than @str never matches.
+ */
+static inline bool str_ends_with(const char *str, const char *substr)
+{
+	size_t str_len = strlen(str);
+	size_t sub_len = strlen(substr);
+
+	/* Only the last sub_len bytes of @str can match; compare that tail. */
+	return sub_len <= str_len &&
+	       strcmp(str + (str_len - sub_len), substr) == 0;
+}
+
 extern char * __must_check skip_spaces(const char *);
 
 extern char *strim(char *);
diff --git a/tools/objtool/Build b/tools/objtool/Build
index a3cdf8af6635..0b01657671d7 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -8,8 +8,8 @@ objtool-y += builtin-check.o
 objtool-y += elf.o
 objtool-y += objtool.o
 
-objtool-$(BUILD_ORC) += orc_gen.o
-objtool-$(BUILD_ORC) += orc_dump.o
+objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o
+objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o
 
 objtool-y += libstring.o
 objtool-y += libctype.o
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 958761c05b7c..48928c9bebef 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -15,13 +15,14 @@ ifeq ($(ARCH_HAS_KLP),y)
 	HAVE_XXHASH = $(shell echo "int main() {}" | \
 		      $(HOSTCC) -xc - -o /dev/null -lxxhash 2> /dev/null && echo y || echo n)
 	ifeq ($(HAVE_XXHASH),y)
+		BUILD_KLP	 := y
 		LIBXXHASH_CFLAGS := $(shell $(HOSTPKG_CONFIG) libxxhash --cflags 2>/dev/null) \
 				    -DBUILD_KLP
 		LIBXXHASH_LIBS   := $(shell $(HOSTPKG_CONFIG) libxxhash --libs 2>/dev/null || echo -lxxhash)
 	endif
 endif
 
-export BUILD_ORC
+export BUILD_ORC BUILD_KLP
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index b2c320f701f9..5c72beeaa3a7 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -88,6 +88,46 @@ s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc)
 	return phys_to_virt(addend);
 }
 
+/*
+ * Decode @sec linearly from its start and report the instruction covering
+ * @offset.  NOTE(review): insn_decode() failures are not checked -- confirm.
+ */
+static void scan_for_insn(struct section *sec, unsigned long offset,
+			  unsigned long *insn_off, unsigned int *insn_len)
+{
+	unsigned long pos = 0;
+	struct insn insn;
+
+	for (;;) {
+		insn_decode(&insn, sec->data->d_buf + pos, sec_size(sec) - pos,
+			    INSN_MODE_64);
+		if (pos + insn.length > offset)
+			break;
+		pos += insn.length;
+	}
+	*insn_off = pos;
+	*insn_len = insn.length;
+}
+
+/* PLT32: addend + 4.  PC32 in text: addend + distance to end of the insn. */
+u64 arch_adjusted_addend(struct reloc *reloc)
+{
+	s64 addend = reloc_addend(reloc);
+	unsigned long start;
+	unsigned int len;
+
+	switch (reloc_type(reloc)) {
+	case R_X86_64_PLT32:
+		return addend + 4;
+	case R_X86_64_PC32:
+		if (!is_text_sec(reloc->sec->base))
+			break;
+		scan_for_insn(reloc->sec->base, reloc_offset(reloc), &start, &len);
+		return addend + start + len - reloc_offset(reloc);
+	}
+	return addend;
+}
+
 unsigned long arch_jump_destination(struct instruction *insn)
 {
 	return insn->offset + insn->len + insn->immediate;
diff --git a/tools/objtool/builtin-klp.c b/tools/objtool/builtin-klp.c
new file mode 100644
index 000000000000..9b13dd1182af
--- /dev/null
+++ b/tools/objtool/builtin-klp.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <subcmd/parse-options.h>
+#include <string.h>
+#include <stdlib.h>
+#include <objtool/builtin.h>
+#include <objtool/objtool.h>
+#include <objtool/klp.h>
+
+struct subcmd {
+	const char *name;
+	const char *description;
+	int (*fn)(int, const char **);
+};
+
+static struct subcmd subcmds[] = {
+	{ "diff",		"Generate binary diff of two object files",		cmd_klp_diff, },
+};
+
+/* Print the "objtool klp" usage and subcommand list to stderr, then exit(1). */
+static void cmd_klp_usage(void)
+{
+	const struct subcmd *cmd = subcmds;
+	const struct subcmd *end = subcmds + ARRAY_SIZE(subcmds);
+
+	fprintf(stderr, "usage: objtool klp <subcommand> [<options>]\n\n");
+	fprintf(stderr, "Subcommands:\n");
+	for (; cmd < end; cmd++)
+		fprintf(stderr, "  %s\t%s\n", cmd->name, cmd->description);
+
+	exit(1);
+}
+
+/*
+ * Dispatch "objtool klp <subcommand>": skip the first argument, then run the
+ * subcommand whose name matches the next one.  With no subcommand, or an
+ * unknown one, print the usage text via cmd_klp_usage(), which exits.
+ */
+int cmd_klp(int argc, const char **argv)
+{
+	argc--;
+	argv++;
+	if (!argc)
+		cmd_klp_usage();
+
+	for (int i = 0; i < ARRAY_SIZE(subcmds); i++) {
+		if (!strcmp(subcmds[i].name, argv[0]))
+			return subcmds[i].fn(argc, argv);
+	}
+
+	cmd_klp_usage();
+	return 0;
+}
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0f5278127f37..8d17d930d0c8 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -185,20 +185,6 @@ static bool is_sibling_call(struct instruction *insn)
 	return (is_static_jump(insn) && insn_call_dest(insn));
 }
 
-/*
- * Checks if a string ends with another.
- */
-static bool str_ends_with(const char *s, const char *sub)
-{
-	const int slen = strlen(s);
-	const int sublen = strlen(sub);
-
-	if (sublen > slen)
-		return 0;
-
-	return !memcmp(s + slen - sublen, sub, sublen);
-}
-
 /*
  * Checks if a function is a Rust "noreturn" one.
  */
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 0119b3b4c554..e1daae0630be 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -288,6 +288,18 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name)
 	return NULL;
 }
 
+/* Look up @name in the symbol-name hash, skipping local symbols; NULL if none. */
+struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name)
+{
+	struct symbol *cand;
+
+	elf_hash_for_each_possible(symbol_name, cand, name_hash, str_hash(name)) {
+		if (!is_local_sym(cand) && !strcmp(cand->name, name))
+			return cand;
+	}
+	return NULL;
+}
+
 struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
 				     unsigned long offset, unsigned int len)
 {
@@ -475,6 +487,8 @@ static int elf_add_symbol(struct elf *elf, struct symbol *sym)
 	else
 		entry = &sym->sec->symbol_list;
 	list_add(&sym->list, entry);
+
+	list_add_tail(&sym->global_list, &elf->symbols);
 	elf_hash_add(symbol, &sym->hash, sym->idx);
 	elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name));
 
@@ -531,6 +545,9 @@ static int read_symbols(struct elf *elf)
 		ERROR_GLIBC("calloc");
 		return -1;
 	}
+
+	INIT_LIST_HEAD(&elf->symbols);
+
 	for (i = 0; i < symbols_nr; i++) {
 		sym = &elf->symbol_data[i];
 
@@ -639,7 +656,7 @@ static int mark_group_syms(struct elf *elf)
 		return -1;
 	}
 
-	list_for_each_entry(sec, &elf->sections, list) {
+	for_each_sec(elf, sec) {
 		if (sec->sh.sh_type == SHT_GROUP &&
 		    sec->sh.sh_link == symtab->idx) {
 			sym = find_symbol_by_index(elf, sec->sh.sh_info);
@@ -1224,6 +1241,8 @@ struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name)
 		return NULL;
 	}
 
+	INIT_LIST_HEAD(&elf->symbols);
+
 	if (!elf_alloc_hash(section,		1000) ||
 	    !elf_alloc_hash(section_name,	1000) ||
 	    !elf_alloc_hash(symbol,		10000) ||
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index a4502947307a..d89f8b5ec14e 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -84,6 +84,7 @@ bool arch_callee_saved_reg(unsigned char reg);
 unsigned long arch_jump_destination(struct instruction *insn);
 
 s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc);
+u64 arch_adjusted_addend(struct reloc *reloc);
 
 const char *arch_nop_insn(int len);
 const char *arch_ret_insn(int len);
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index cee9fc031877..bb0b25eb08ba 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -53,4 +53,6 @@ int objtool_run(int argc, const char **argv);
 
 int make_backup(void);
 
+int cmd_klp(int argc, const char **argv);
+
 #endif /* _BUILTIN_H */
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index a1f1762f89c4..e2cd817fca52 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -18,6 +18,7 @@
 #include <objtool/checksum_types.h>
 #include <arch/elf.h>
 
+#define SEC_NAME_LEN		1024
 #define SYM_NAME_LEN		512
 
 #define bswap_if_needed(elf, val) __bswap_if_needed(&elf->ehdr, val)
@@ -53,10 +54,12 @@ struct section {
 	bool _changed, text, rodata, noinstr, init, truncate;
 	struct reloc *relocs;
 	unsigned long nr_alloc_relocs;
+	struct section *twin;
 };
 
 struct symbol {
 	struct list_head list;
+	struct list_head global_list;
 	struct rb_node node;
 	struct elf_hash_node hash;
 	struct elf_hash_node name_hash;
@@ -83,10 +86,13 @@ struct symbol {
 	u8 cold		     : 1;
 	u8 prefix	     : 1;
 	u8 debug_checksum    : 1;
+	u8 changed	     : 1;
+	u8 included	     : 1;
 	struct list_head pv_target;
 	struct reloc *relocs;
 	struct section *group_sec;
 	struct checksum csum;
+	struct symbol *twin, *clone;
 };
 
 struct reloc {
@@ -104,6 +110,7 @@ struct elf {
 	const char *name, *tmp_name;
 	unsigned int num_files;
 	struct list_head sections;
+	struct list_head symbols;
 	unsigned long num_relocs;
 
 	int symbol_bits;
@@ -179,6 +186,7 @@ struct section *find_section_by_name(const struct elf *elf, const char *name);
 struct symbol *find_func_by_offset(struct section *sec, unsigned long offset);
 struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
 struct symbol *find_symbol_by_name(const struct elf *elf, const char *name);
+struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name);
 struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset);
 int find_symbol_hole_containing(const struct section *sec, unsigned long offset);
 struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
@@ -448,22 +456,48 @@ static inline void set_sym_next_reloc(struct reloc *reloc, struct reloc *next)
 #define sec_for_each_sym(sec, sym)					\
 	list_for_each_entry(sym, &sec->symbol_list, list)
 
+#define sec_prev_sym(sym)						\
+	((sym)->sec && (sym)->list.prev != &(sym)->sec->symbol_list ?	\
+	 list_prev_entry(sym, list) : NULL)	/* NULL for a section's first symbol */
+
 #define for_each_sym(elf, sym)						\
-	for (struct section *__sec, *__fake = (struct section *)1;	\
-	     __fake; __fake = NULL)					\
-		for_each_sec(elf, __sec)				\
-			sec_for_each_sym(__sec, sym)
+	list_for_each_entry(sym, &elf->symbols, global_list)
+
+#define for_each_sym_continue(elf, sym)					\
+	list_for_each_entry_continue(sym, &elf->symbols, global_list)
+
+#define rsec_next_reloc(rsec, reloc)	/* NULL after the last reloc */	\
+	(reloc_idx(reloc) < sec_num_entries(rsec) - 1 ? (reloc) + 1 : NULL)
 
 #define for_each_reloc(rsec, reloc)					\
-	for (int __i = 0, __fake = 1; __fake; __fake = 0)		\
-		for (reloc = rsec->relocs;				\
-		     __i < sec_num_entries(rsec);			\
-		     __i++, reloc++)
+	for (reloc = rsec->relocs; reloc; reloc = rsec_next_reloc(rsec, reloc))
 
 #define for_each_reloc_from(rsec, reloc)				\
-	for (int __i = reloc_idx(reloc);				\
-	     __i < sec_num_entries(rsec);				\
-	     __i++, reloc++)
+	for (; reloc; reloc = rsec_next_reloc(rsec, reloc))
+
+#define for_each_reloc_continue(rsec, reloc)				\
+	for (reloc = rsec_next_reloc(rsec, reloc); reloc;		\
+	     reloc = rsec_next_reloc(rsec, reloc))
+
+#define sym_for_each_reloc(elf, sym, reloc)				\
+	for (reloc = find_reloc_by_dest_range(elf, sym->sec,		\
+					      sym->offset, sym->len);	\
+	     reloc && reloc_offset(reloc) <  sym->offset + sym->len;	\
+	     reloc = rsec_next_reloc(sym->sec->rsec, reloc))
+
+/* Return the prefix symbol directly preceding function @func, or NULL. */
+static inline struct symbol *get_func_prefix(struct symbol *func)
+{
+	struct symbol *pfx = NULL;
+
+	if (is_func_sym(func))
+		pfx = sec_prev_sym(func);
+
+	if (!pfx || !is_prefix_func(pfx))
+		return NULL;
+
+	return pfx;
+}
 
 #define OFFSET_STRIDE_BITS	4
 #define OFFSET_STRIDE		(1UL << OFFSET_STRIDE_BITS)
diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h
new file mode 100644
index 000000000000..07928fac059b
--- /dev/null
+++ b/tools/objtool/include/objtool/klp.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_KLP_H
+#define _OBJTOOL_KLP_H
+
+/*
+ * __klp_objects and __klp_funcs are created by klp diff and used by the patch
+ * module init code to build the klp_patch, klp_object and klp_func structs
+ * needed by the livepatch API.
+ */
+#define KLP_OBJECTS_SEC	"__klp_objects"
+#define KLP_FUNCS_SEC	"__klp_funcs"
+
+/*
+ * __klp_relocs is an intermediate section which is created by klp diff and
+ * converted into KLP symbols/relas by "objtool klp post-link".  This is needed
+ * to work around the linker, which doesn't preserve SHN_LIVEPATCH or
+ * SHF_RELA_LIVEPATCH, nor does it support having two RELA sections for a
+ * single PROGBITS section.
+ */
+#define KLP_RELOCS_SEC	"__klp_relocs"
+#define KLP_STRINGS_SEC	".rodata.klp.str1.1"
+
+struct klp_reloc {
+	void *offset;
+	void *sym;
+	u32 type;
+};
+
+int cmd_klp_diff(int argc, const char **argv);
+
+#endif /* _OBJTOOL_KLP_H */
diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
index c0dc86a78ff6..7f70b41d1b8d 100644
--- a/tools/objtool/include/objtool/objtool.h
+++ b/tools/objtool/include/objtool/objtool.h
@@ -39,6 +39,8 @@ struct objtool_file {
 	struct pv_state *pv_ops;
 };
 
+char *top_level_dir(const char *file);
+
 struct objtool_file *objtool_open_read(const char *_objname);
 
 int objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func);
diff --git a/tools/objtool/include/objtool/util.h b/tools/objtool/include/objtool/util.h
new file mode 100644
index 000000000000..a0180b312f73
--- /dev/null
+++ b/tools/objtool/include/objtool/util.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _UTIL_H
+#define _UTIL_H
+
+#include <objtool/warn.h>
+
+#define snprintf_check(str, size, format, args...)			\
+({									\
+	int __ret = snprintf(str, size, format, ##args);		\
+	if (__ret < 0)							\
+		ERROR_GLIBC("snprintf");				\
+	else if ((size_t)__ret >= (size_t)(size))			\
+		ERROR("snprintf() failed for '" format "'", ##args);	\
+	else								\
+		__ret = 0;						\
+	__ret;	/* 0 on success, non-zero on error or truncation */	\
+})
+
+#endif /* _UTIL_H */
diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
new file mode 100644
index 000000000000..0d69b621a26c
--- /dev/null
+++ b/tools/objtool/klp-diff.c
@@ -0,0 +1,1646 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#define _GNU_SOURCE /* memmem() */
+#include <subcmd/parse-options.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <objtool/objtool.h>
+#include <objtool/warn.h>
+#include <objtool/arch.h>
+#include <objtool/klp.h>
+#include <objtool/util.h>
+#include <arch/special.h>
+
+#include <linux/objtool_types.h>
+#include <linux/livepatch_external.h>
+#include <linux/stringify.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+
+struct elfs {
+	struct elf *orig, *patched, *out;
+	const char *modname;
+};
+
+struct export {
+	struct hlist_node hash;
+	char *mod, *sym;
+};
+
+static const char * const klp_diff_usage[] = {
+	"objtool klp diff [<options>] <in1.o> <in2.o> <out.o>",
+	NULL,
+};
+
+static const struct option klp_diff_options[] = {
+	OPT_END(),
+};
+
+static DEFINE_HASHTABLE(exports, 15);
+
+static inline u32 str_hash(const char *str)
+{
+	return jhash(str, strlen(str), 0);	/* length excludes the terminating NUL */
+}
+
+/* Read Module.symvers and record each exported symbol and its module in 'exports'. */
+static int read_exports(void)
+{
+	const char *symvers = "Module.symvers";
+	char line[1024], *path = NULL;
+	unsigned int line_num = 1;
+	FILE *file;
+
+	file = fopen(symvers, "r");
+	if (!file) {
+		path = top_level_dir(symvers);
+		if (!path) {
+			ERROR("can't open '%s', \"objtool klp diff\" should be run from the kernel tree", symvers);
+			return -1;
+		}
+
+		file = fopen(path, "r");
+		if (!file) {
+			ERROR_GLIBC("fopen");
+			return -1;
+		}
+	}
+
+	while (fgets(line, sizeof(line), file)) {
+		char *sym, *mod, *type;
+		struct export *export;
+
+		/* Tab-separated fields: the first is skipped, then symbol, module, type. */
+		sym = strchr(line, '\t');
+		if (!sym) {
+			ERROR("malformed Module.symvers (sym) at line %d", line_num);
+			return -1;
+		}
+
+		*sym++ = '\0';
+		mod = strchr(sym, '\t');
+		if (!mod) {
+			ERROR("malformed Module.symvers (mod) at line %d", line_num);
+			return -1;
+		}
+
+		*mod++ = '\0';
+		type = strchr(mod, '\t');
+		if (!type) {
+			ERROR("malformed Module.symvers (type) at line %d", line_num);
+			return -1;
+		}
+
+		*type++ = '\0';
+		if (*sym == '\0' || *mod == '\0') {
+			ERROR("malformed Module.symvers at line %d", line_num);
+			return -1;
+		}
+
+		export = calloc(1, sizeof(*export));
+		if (!export) {
+			ERROR_GLIBC("calloc");
+			return -1;
+		}
+
+		export->mod = strdup(mod);
+		if (!export->mod) {
+			ERROR_GLIBC("strdup");
+			return -1;
+		}
+
+		export->sym = strdup(sym);
+		if (!export->sym) {
+			ERROR_GLIBC("strdup");
+			return -1;
+		}
+
+		hash_add(exports, &export->hash, str_hash(sym));
+		line_num++;	/* keep the line number in error messages accurate */
+	}
+
+	free(path);
+	fclose(file);
+
+	return 0;
+}
+
+/* Copy each function's checksum from .discard.sym_checksum into its symbol. */
+static int read_sym_checksums(struct elf *elf)
+{
+	struct section *sec;
+
+	sec = find_section_by_name(elf, ".discard.sym_checksum");
+	if (!sec) {
+		ERROR("'%s' missing .discard.sym_checksum section, file not processed by 'objtool --checksum'?",
+		      elf->name);
+		return -1;
+	}
+
+	if (!sec->rsec) {
+		ERROR("missing reloc section for .discard.sym_checksum");
+		return -1;
+	}
+
+	if (sec_size(sec) % sizeof(struct sym_checksum)) {
+		ERROR("struct sym_checksum size mismatch");
+		return -1;
+	}
+
+	for (int i = 0; i < sec_size(sec) / sizeof(struct sym_checksum); i++) {
+		struct sym_checksum *sym_checksum;
+		struct reloc *reloc;
+		struct symbol *sym;
+
+		sym_checksum = (struct sym_checksum *)sec->data->d_buf + i;
+		/* The reloc at this entry's offset identifies the symbol it describes. */
+		reloc = find_reloc_by_dest(elf, sec, i * sizeof(*sym_checksum));
+		if (!reloc) {
+			ERROR("can't find reloc for sym_checksum[%d]", i);
+			return -1;
+		}
+
+		sym = reloc->sym;
+		if (is_sec_sym(sym)) {
+			ERROR("not sure how to handle section %s", sym->name);
+			return -1;
+		}
+
+		if (is_func_sym(sym))
+			sym->csum.checksum = sym_checksum->checksum;
+	}
+
+	return 0;
+}
+
+/* Return the first FILE symbol in @elf's symbol list, or NULL if there is none. */
+static struct symbol *first_file_symbol(struct elf *elf)
+{
+	struct symbol *sym;
+
+	for_each_sym(elf, sym) {
+		if (is_file_sym(sym))
+			return sym;
+	}
+	return NULL;
+}
+
+/* Return the next FILE symbol after @sym in @elf's symbol list, or NULL. */
+static struct symbol *next_file_symbol(struct elf *elf, struct symbol *sym)
+{
+	for_each_sym_continue(elf, sym) {
+		if (is_file_sym(sym))
+			return sym;
+	}
+	return NULL;
+}
+
+/*
+ * Static locals that must never be correlated: the patched object's own copy
+ * is used in place instead of a reference to the original.  This covers local
+ * data objects in .data.once or named with one of the prefixes below.
+ */
+static bool is_uncorrelated_static_local(struct symbol *sym)
+{
+	static const char * const prefixes[] = {
+		"__key.",
+		"__warned.",
+		"__already_done.",
+		"__func__.",
+		"_rs.",
+		"descriptor.",
+		"CSWTCH.",
+	};
+	bool match = false;
+
+	if (!is_object_sym(sym) || !is_local_sym(sym))
+		return false;
+
+	if (!strcmp(sym->sec->name, ".data.once"))
+		return true;
+
+	for (int i = 0; !match && i < ARRAY_SIZE(prefixes); i++)
+		match = strstarts(sym->name, prefixes[i]);
+
+	return match;
+}
+
+/*
+ * Clang emits several useless .Ltmp<N> code labels.
+ */
+static bool is_clang_tmp_label(struct symbol *sym)
+{
+	return sym->type == STT_NOTYPE &&
+	       is_text_sec(sym->sec) &&
+	       strstarts(sym->name, ".Ltmp") &&
+	       isdigit((unsigned char)sym->name[5]);
+}
+
+/* True for the non-text sections listed below and for most .discard.* sections. */
+static bool is_special_section(struct section *sec)
+{
+	static const char * const specials[] = {
+		".altinstructions",
+		".smp_locks",
+		"__bug_table",
+		"__ex_table",
+		"__jump_table",
+		"__mcount_loc",
+
+		/*
+		 * Extract .static_call_sites here to inherit non-module
+		 * preferential treatment.  The later static call processing
+		 * during klp module build will be skipped when it sees this
+		 * section already exists.
+		 */
+		".static_call_sites",
+	};
+	static const char * const non_special_discards[] = {
+		".discard.addressable",
+		".discard.sym_checksum",
+	};
+
+	if (is_text_sec(sec))
+		return false;
+
+	for (int i = 0; i < ARRAY_SIZE(specials); i++) {
+		if (!strcmp(sec->name, specials[i]))
+			return true;
+	}
+
+	/* Most .discard data sections are special */
+	for (int i = 0; i < ARRAY_SIZE(non_special_discards); i++) {
+		if (!strcmp(sec->name, non_special_discards[i]))
+			return false;
+	}
+
+	return strstarts(sec->name, ".discard.");
+}
+
+/*
+ * Sections that special sections refer to without being special sections
+ * themselves: .altinstr_replacement and .altinstr_aux.
+ */
+static bool is_special_section_aux(struct section *sec)
+{
+	static const char * const aux_names[] = {
+		".altinstr_replacement",
+		".altinstr_aux",
+	};
+	const char * const *name = aux_names;
+	const char * const *end = aux_names + ARRAY_SIZE(aux_names);
+
+	while (name < end && strcmp(sec->name, *name))
+		name++;
+
+	return name < end;
+}
+
+/*
+ * Return true for symbols that should never be correlated; their local patched
+ * versions are used instead of linking to the originals.
+ */
+static bool dont_correlate(struct symbol *sym)
+{
+	return is_file_sym(sym) ||
+	       is_null_sym(sym) ||
+	       is_sec_sym(sym) ||
+	       is_prefix_func(sym) ||
+	       is_uncorrelated_static_local(sym) ||
+	       is_clang_tmp_label(sym) ||
+	       is_string_sec(sym->sec) ||
+	       is_special_section(sym->sec) ||
+	       is_special_section_aux(sym->sec) ||
+	       strstarts(sym->name, "__initcall__");
+}
+
+/*
+ * For each symbol in the original kernel, find its corresponding "twin" in the
+ * patched kernel.
+ */
+static int correlate_symbols(struct elfs *e)
+{
+	struct symbol *file1_sym, *file2_sym;
+	struct symbol *sym1, *sym2;
+
+	/* Correlate locals */
+	for (file1_sym = first_file_symbol(e->orig),
+	     file2_sym = first_file_symbol(e->patched); ;
+	     file1_sym = next_file_symbol(e->orig, file1_sym),
+	     file2_sym = next_file_symbol(e->patched, file2_sym)) {
+		/* The FILE symbols of both objects must match one-to-one, in order. */
+		if (!file1_sym && file2_sym) {
+			ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name);
+			return -1;
+		}
+
+		if (file1_sym && !file2_sym) {
+			ERROR("FILE symbol mismatch: %s != NULL", file1_sym->name);
+			return -1;
+		}
+
+		if (!file1_sym)
+			break;
+
+		if (strcmp(file1_sym->name, file2_sym->name)) {
+			ERROR("FILE symbol mismatch: %s != %s", file1_sym->name, file2_sym->name);
+			return -1;
+		}
+
+		file1_sym->twin = file2_sym;
+		file2_sym->twin = file1_sym;
+		/* Pair the locals following each FILE symbol by demangled name. */
+		sym1 = file1_sym;
+
+		for_each_sym_continue(e->orig, sym1) {
+			if (is_file_sym(sym1) || !is_local_sym(sym1))
+				break;
+
+			if (dont_correlate(sym1))
+				continue;
+
+			sym2 = file2_sym;
+			for_each_sym_continue(e->patched, sym2) {
+				if (is_file_sym(sym2) || !is_local_sym(sym2))
+					break;
+
+				if (sym2->twin || dont_correlate(sym2))
+					continue;
+
+				if (strcmp(sym1->demangled_name, sym2->demangled_name))
+					continue;
+
+				sym1->twin = sym2;
+				sym2->twin = sym1;
+				break;
+			}
+		}
+	}
+
+	/* Correlate globals */
+	for_each_sym(e->orig, sym1) {
+		if (sym1->bind == STB_LOCAL)
+			continue;
+
+		sym2 = find_global_symbol_by_name(e->patched, sym1->name);
+
+		if (sym2 && !sym2->twin && !strcmp(sym1->name, sym2->name)) {
+			sym1->twin = sym2;
+			sym2->twin = sym1;
+		}
+	}
+
+	for_each_sym(e->orig, sym1) {
+		if (sym1->twin || dont_correlate(sym1))
+			continue;
+		WARN("no correlation: %s", sym1->name);
+	}
+
+	return 0;
+}
+
+/* "sympos" is used by livepatch to disambiguate duplicate symbol names */
+static unsigned long find_sympos(struct elf *elf, struct symbol *sym)
+{
+	bool vmlinux = str_ends_with(objname, "vmlinux.o");
+	unsigned long sympos = 0, nr_matches = 0;
+	bool has_dup = false;
+	struct symbol *s;
+
+	if (sym->bind != STB_LOCAL)
+		return 0;
+
+	if (vmlinux && sym->type == STT_FUNC) {
+		/*
+		 * HACK: Unfortunately, symbol ordering can differ between
+		 * vmlinux.o and vmlinux due to the linker script emitting
+		 * .text.unlikely* before .text*.  Count .text.unlikely* first.
+		 *
+		 * TODO: Disambiguate symbols more reliably (checksums?)
+		 */
+		for_each_sym(elf, s) {
+			if (strstarts(s->sec->name, ".text.unlikely") &&
+			    !strcmp(s->name, sym->name)) {
+				nr_matches++;
+				if (s == sym)
+					sympos = nr_matches;
+				else
+					has_dup = true;
+			}
+		}
+		for_each_sym(elf, s) {
+			if (!strstarts(s->sec->name, ".text.unlikely") &&
+			    !strcmp(s->name, sym->name)) {
+				nr_matches++;
+				if (s == sym)
+					sympos = nr_matches;
+				else
+					has_dup = true;
+			}
+		}
+	} else {
+		for_each_sym(elf, s) {
+			if (!strcmp(s->name, sym->name)) {
+				nr_matches++;
+				if (s == sym)
+					sympos = nr_matches;
+				else
+					has_dup = true;
+			}
+		}
+	}
+	/* sympos is now @sym's 1-based position among the same-named symbols counted above. */
+	if (!sympos) {
+		ERROR("can't find sympos for %s", sym->name);
+		return ULONG_MAX;
+	}
+	/* Report a position only when another symbol shares the name. */
+	return has_dup ? sympos : 0;
+}
+
+static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym);
+
+/* Copy @patched_sym (and its data if @data_too) into @elf; link both symbols via ->clone. */
+static struct symbol *__clone_symbol(struct elf *elf, struct symbol *patched_sym,
+				     bool data_too)
+{
+	struct section *out_sec = NULL;
+	unsigned long offset = 0;
+	struct symbol *out_sym;
+
+	if (data_too && !is_undef_sym(patched_sym)) {
+		struct section *patched_sec = patched_sym->sec;
+
+		out_sec = find_section_by_name(elf, patched_sec->name);
+		if (!out_sec) {
+			out_sec = elf_create_section(elf, patched_sec->name, 0,
+						     patched_sec->sh.sh_entsize,
+						     patched_sec->sh.sh_type,
+						     patched_sec->sh.sh_addralign,
+						     patched_sec->sh.sh_flags);
+			if (!out_sec)
+				return NULL;
+		}
+
+		if (is_string_sec(patched_sym->sec)) {
+			out_sym = elf_create_section_symbol(elf, out_sec);
+			if (!out_sym)
+				return NULL;
+
+			goto sym_created;
+		}
+		/* Non-section symbols are placed at the current end of the output section. */
+		if (!is_sec_sym(patched_sym))
+			offset = sec_size(out_sec);
+
+		if (patched_sym->len || is_sec_sym(patched_sym)) {
+			void *data = NULL;
+			size_t size;
+
+			/* bss doesn't have data */
+			if (patched_sym->sec->data->d_buf)
+				data = patched_sym->sec->data->d_buf + patched_sym->offset;
+
+			if (is_sec_sym(patched_sym))
+				size = sec_size(patched_sym->sec);
+			else
+				size = patched_sym->len;
+
+			if (!elf_add_data(elf, out_sec, data, size))
+				return NULL;
+		}
+	}
+
+	out_sym = elf_create_symbol(elf, patched_sym->name, out_sec,
+				    patched_sym->bind, patched_sym->type,
+				    offset, patched_sym->len);
+	if (!out_sym)
+		return NULL;
+
+sym_created:
+	patched_sym->clone = out_sym;
+	out_sym->clone = patched_sym;
+	return out_sym;
+}
+
+/*
+ * Copy a symbol to the output object, optionally including its data and
+ * relocations.
+ *
+ * Returns the symbol's clone (an already existing one is returned as-is), or
+ * NULL on error.
+ */
+static struct symbol *clone_symbol(struct elfs *e, struct symbol *patched_sym,
+				   bool data_too)
+{
+	struct symbol *pfx;
+
+	if (patched_sym->clone)
+		return patched_sym->clone;
+
+	/* Make sure the prefix gets cloned first */
+	if (is_func_sym(patched_sym) && data_too) {
+		pfx = get_func_prefix(patched_sym);
+		/* Propagate a failed prefix clone rather than dropping it */
+		if (pfx && !clone_symbol(e, pfx, true))
+			return NULL;
+	}
+
+	if (!__clone_symbol(e->out, patched_sym, data_too))
+		return NULL;
+
+	if (data_too && clone_sym_relocs(e, patched_sym))
+		return NULL;
+
+	return patched_sym->clone;
+}
+
+/*
+ * Flag @func for inclusion in the output object, together with its prefix
+ * function (if any) and its .cold parent/child counterpart.
+ */
+static void mark_included_function(struct symbol *func)
+{
+	struct symbol *prefix, *cold, *parent;
+
+	func->included = 1;
+
+	/* Include prefix function */
+	prefix = get_func_prefix(func);
+	if (prefix)
+		prefix->included = 1;
+
+	/* Make sure .cold parent+child always stay together */
+	cold = func->cfunc;
+	parent = func->pfunc;
+
+	if (cold && cold != func)
+		cold->included = 1;
+
+	if (parent && parent != func)
+		parent->included = 1;
+}
+
+/*
+ * Flag every changed function (correlated, but with a differing checksum) and
+ * every added function (no twin in the original object) for inclusion in the
+ * output object, and print them.
+ *
+ * Returns 0 if at least one function was flagged, -1 if nothing changed.
+ */
+static int mark_changed_functions(struct elfs *e)
+{
+	struct symbol *orig, *patched;
+	unsigned int nr_marked = 0;
+
+	/* Find changed functions */
+	for_each_sym(e->orig, orig) {
+		if (!is_func_sym(orig) || is_prefix_func(orig))
+			continue;
+
+		patched = orig->twin;
+		if (!patched || orig->csum.checksum == patched->csum.checksum)
+			continue;
+
+		patched->changed = 1;
+		mark_included_function(patched);
+		nr_marked++;
+	}
+
+	/* Find added functions and print them */
+	for_each_sym(e->patched, patched) {
+		if (!is_func_sym(patched) || is_prefix_func(patched))
+			continue;
+
+		if (patched->twin)
+			continue;
+
+		printf("%s: new function: %s\n", objname, patched->name);
+		mark_included_function(patched);
+		nr_marked++;
+	}
+
+	/* Print changed functions */
+	for_each_sym(e->patched, patched) {
+		if (!patched->changed)
+			continue;
+
+		printf("%s: changed function: %s\n", objname, patched->name);
+	}
+
+	return nr_marked ? 0 : -1;
+}
+
+/*
+ * Clone every patched symbol flagged as included, with its data and relocs.
+ * Returns 0 on success, -1 as soon as one clone fails.
+ */
+static int clone_included_functions(struct elfs *e)
+{
+	struct symbol *sym;
+
+	for_each_sym(e->patched, sym) {
+		if (!sym->included)
+			continue;
+
+		if (!clone_symbol(e, sym, true))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Determine whether a relocation should reference the section rather than the
+ * underlying symbol.
+ */
+static bool section_reference_needed(struct section *sec)
+{
+	/*
+	 * Section name prefixes which are always referenced by section:
+	 *
+	 * .rodata has mostly anonymous data so there's no way to determine the
+	 * length of a needed reference.  Just copy the whole section if needed.
+	 *
+	 * The other two hold UBSAN anonymous data.
+	 */
+	static const char * const prefixes[] = {
+		".rodata",
+		".data..Lubsan",	/* GCC */
+		".data..L__unnamed_",	/* Clang */
+		NULL,
+	};
+	const char * const *prefix;
+
+	/*
+	 * String symbols are zero-length and uncorrelated.  It's easier to
+	 * deal with them as section symbols.
+	 */
+	if (is_string_sec(sec))
+		return true;
+
+	for (prefix = prefixes; *prefix; prefix++) {
+		if (strstarts(sec->name, *prefix))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * A reloc is in its proper format when it references a section symbol if and
+ * only if its target section requires section references.
+ */
+static bool is_reloc_allowed(struct reloc *reloc)
+{
+	bool want_secsym = section_reference_needed(reloc->sym->sec);
+
+	return want_secsym == is_sec_sym(reloc->sym);
+}
+
+/* Look up @sym by name in the exports hash; returns NULL if not exported */
+static struct export *find_export(struct symbol *sym)
+{
+	struct export *entry;
+
+	hash_for_each_possible(exports, entry, hash, str_hash(sym->name)) {
+		if (strcmp(entry->sym, sym->name))
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Determine the module name of the original object.
+ *
+ * A "name=" tag in .modinfo (preceded by a NUL byte) takes precedence.  If
+ * there isn't one, the name is derived from the object's file name: the
+ * directory part is dropped, dashes become underscores, and everything from
+ * the first '.' onward is cut off.
+ *
+ * Returns NULL on error.  Otherwise the result points either into the
+ * .modinfo data or into a strdup()'d buffer which is not freed here.
+ */
+static const char *__find_modname(struct elfs *e)
+{
+	struct section *sec;
+	char *name, *base;
+
+	sec = find_section_by_name(e->orig, ".modinfo");
+	if (!sec) {
+		ERROR("missing .modinfo section");
+		return NULL;
+	}
+
+	name = memmem(sec->data->d_buf, sec_size(sec), "\0name=", 6);
+	if (name)
+		return name + 6;
+
+	name = strdup(e->orig->name);
+	if (!name) {
+		ERROR_GLIBC("strdup");
+		return NULL;
+	}
+
+	/*
+	 * Only mangle the last path component.  Otherwise a '.' in the
+	 * directory part (e.g. "./foo.o" or "build.d/foo.o") would truncate
+	 * the name before the file name is even reached.
+	 */
+	base = strrchr(name, '/');
+	base = base ? base + 1 : name;
+
+	for (char *c = base; *c; c++) {
+		if (*c == '-') {
+			*c = '_';
+		} else if (*c == '.') {
+			*c = '\0';
+			break;
+		}
+	}
+
+	return base;
+}
+
+/*
+ * Get the object's module name as defined by the kernel (and klp_object).
+ * The result is cached in e->modname.
+ */
+static const char *find_modname(struct elfs *e)
+{
+	if (!e->modname)
+		e->modname = __find_modname(e);
+
+	return e->modname;
+}
+
+/*
+ * Copying a function from its native compiled environment to a kernel module
+ * removes its natural access to local functions/variables and unexported
+ * globals.  References to such symbols need to be converted to KLP relocs so
+ * the kernel arch relocation code knows to apply them and where to find the
+ * symbols.  Particularly, duplicate static symbols need to be disambiguated.
+ *
+ * Returns true if @patched_reloc has to become a KLP reloc.
+ */
+static bool klp_reloc_needed(struct reloc *patched_reloc)
+{
+	struct symbol *target = patched_reloc->sym;
+	struct export *export;
+
+	/*
+	 * Uncorrelatable symbols have no external symbol to reference, and
+	 * for included functions a regular reloc will do.
+	 */
+	if (dont_correlate(target) || target->included)
+		return false;
+
+	/*
+	 * If exported by a module, it has to be a klp reloc.  Thanks to the
+	 * clusterfunk that is late module patching, the patch module is
+	 * allowed to be loaded before any modules it depends on.
+	 *
+	 * If exported by vmlinux, a normal reloc will do.
+	 */
+	export = find_export(target);
+	if (export)
+		return strcmp(export->mod, "vmlinux") != 0;
+
+	/*
+	 * No twin: presumably the symbol and its reference were added by the
+	 * patch.  The symbol could be defined in this .o or in another .o in
+	 * the patch module, so no klp reloc is needed.
+	 *
+	 * This check needs to be *after* the export check due to the
+	 * possibility of the patch adding a new UNDEF reference to an
+	 * exported symbol.
+	 *
+	 * With a twin: unexported symbol which lives in the original vmlinux
+	 * or module.
+	 */
+	return target->twin != NULL;
+}
+
+/*
+ * Rewrite @reloc, which references a regular symbol, to reference the
+ * symbol's section symbol instead (creating the section symbol if needed).
+ * The symbol's offset is folded into the addend so the target is unchanged.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int convert_reloc_sym_to_secsym(struct elf *elf, struct reloc *reloc)
+{
+	struct symbol *sym = reloc->sym;
+	struct section *sec = sym->sec;
+
+	if (!sec->sym && !elf_create_section_symbol(elf, sec))
+		return -1;
+
+	reloc->sym = sec->sym;
+	/* The reloc's symbol index has to match its new (section) symbol */
+	set_reloc_sym(elf, reloc, sec->sym->idx);
+	set_reloc_addend(elf, reloc, sym->offset + reloc_addend(reloc));
+	return 0;
+}
+
+/*
+ * Rewrite @reloc, which references a section symbol, to reference the symbol
+ * its addend points into.  The addend is made relative to that symbol.
+ *
+ * Returns 0 on success, -1 if no such symbol exists.
+ */
+static int convert_reloc_secsym_to_sym(struct elf *elf, struct reloc *reloc)
+{
+	struct section *sec = reloc->sym->sec;
+	struct symbol *target;
+
+	/* If the symbol has a dedicated section, it's easy to find */
+	target = find_symbol_by_offset(sec, 0);
+
+	if (!target || target->len != sec_size(sec)) {
+		/* No dedicated section; find the symbol manually */
+		target = find_symbol_containing(sec, arch_adjusted_addend(reloc));
+
+		/*
+		 * A miss can happen for special section references to weak
+		 * code whose symbol has been stripped by the linker.
+		 */
+		if (!target)
+			return -1;
+	}
+
+	reloc->sym = target;
+	set_reloc_sym(elf, reloc, target->idx);
+	set_reloc_addend(elf, reloc, reloc_addend(reloc) - target->offset);
+	return 0;
+}
+
+/*
+ * Convert a relocation symbol reference to the needed format: either a section
+ * symbol or the underlying symbol itself.  A reloc which is already in the
+ * needed format is left alone.
+ */
+static int convert_reloc_sym(struct elf *elf, struct reloc *reloc)
+{
+	bool want_secsym = section_reference_needed(reloc->sym->sec);
+
+	/* Already in the proper format? */
+	if (want_secsym == is_sec_sym(reloc->sym))
+		return 0;
+
+	return want_secsym ? convert_reloc_sym_to_secsym(elf, reloc) :
+			     convert_reloc_secsym_to_sym(elf, reloc);
+}
+
+/*
+ * Convert a regular relocation to a klp relocation (sort of).
+ *
+ * Two things are emitted for the reloc at @offset in output section @sec:
+ *
+ *   - a regular reloc against the clone of the referenced symbol (created
+ *     here as a weak symbol without a section if there is no clone yet), and
+ *
+ *   - a __klp_relocs entry referring to the corresponding
+ *     ".klp.sym.modname.sym_name,sympos" symbol.
+ *
+ * @export is the symbol's export entry, or NULL if it isn't exported.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int clone_reloc_klp(struct elfs *e, struct reloc *patched_reloc,
+			   struct section *sec, unsigned long offset,
+			   struct export *export)
+{
+	struct symbol *patched_sym = patched_reloc->sym;
+	s64 addend = reloc_addend(patched_reloc);
+	const char *sym_modname, *sym_orig_name;
+	static struct section *klp_relocs;
+	struct symbol *sym, *klp_sym;
+	unsigned long klp_reloc_off;
+	char sym_name[SYM_NAME_LEN];
+	struct klp_reloc klp_reloc;
+	unsigned long sympos;
+
+	if (!patched_sym->twin) {
+		ERROR("unexpected klp reloc for new symbol %s", patched_sym->name);
+		return -1;
+	}
+
+	/*
+	 * Keep the original reloc intact for now to avoid breaking objtool run
+	 * which relies on proper relocations for many of its features.  This
+	 * will be disabled later by "objtool klp post-link".
+	 *
+	 * Convert it to UNDEF (and WEAK to avoid modpost warnings).
+	 */
+
+	sym = patched_sym->clone;
+	if (!sym) {
+		/* STB_WEAK: avoid modpost undefined symbol warnings */
+		sym = elf_create_symbol(e->out, patched_sym->name, NULL,
+					STB_WEAK, patched_sym->type, 0, 0);
+		if (!sym)
+			return -1;
+
+		patched_sym->clone = sym;
+		sym->clone = patched_sym;
+	}
+
+	if (!elf_create_reloc(e->out, sec, offset, sym, addend, reloc_type(patched_reloc)))
+		return -1;
+
+	/*
+	 * Create the KLP symbol.
+	 */
+
+	if (export) {
+		/* Exported symbol names are unique, no sympos needed */
+		sym_modname = export->mod;
+		sym_orig_name = export->sym;
+		sympos = 0;
+	} else {
+		sym_modname = find_modname(e);
+		if (!sym_modname)
+			return -1;
+
+		sym_orig_name = patched_sym->twin->name;
+		sympos = find_sympos(e->orig, patched_sym->twin);
+		if (sympos == ULONG_MAX)
+			return -1;
+	}
+
+	/* symbol format: .klp.sym.modname.sym_name,sympos */
+	if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_SYM_PREFIX "%s.%s,%lu",
+		      sym_modname, sym_orig_name, sympos))
+		return -1;
+
+	/* The same KLP symbol is shared by all relocs which reference it */
+	klp_sym = find_symbol_by_name(e->out, sym_name);
+	if (!klp_sym) {
+		/* STB_WEAK: avoid modpost undefined symbol warnings */
+		klp_sym = elf_create_symbol(e->out, sym_name, NULL,
+					    STB_WEAK, patched_sym->type, 0, 0);
+		if (!klp_sym)
+			return -1;
+	}
+
+	/*
+	 * Create the __klp_relocs entry.  This will be converted to an actual
+	 * KLP rela by "objtool klp post-link".
+	 *
+	 * This intermediate step is necessary to prevent corruption by the
+	 * linker, which doesn't know how to properly handle two rela sections
+	 * applying to the same base section.
+	 */
+
+	/* The section is created on first use and kept in a static */
+	if (!klp_relocs) {
+		klp_relocs = elf_create_section(e->out, KLP_RELOCS_SEC, 0,
+						0, SHT_PROGBITS, 8, SHF_ALLOC);
+		if (!klp_relocs)
+			return -1;
+	}
+
+	klp_reloc_off = sec_size(klp_relocs);
+	memset(&klp_reloc, 0, sizeof(klp_reloc));
+
+	/* Only the type is stored as data; offset and sym are relocs */
+	klp_reloc.type = reloc_type(patched_reloc);
+	if (!elf_add_data(e->out, klp_relocs, &klp_reloc, sizeof(klp_reloc)))
+		return -1;
+
+	/* klp_reloc.offset */
+	if (!sec->sym && !elf_create_section_symbol(e->out, sec))
+		return -1;
+
+	if (!elf_create_reloc(e->out, klp_relocs,
+			      klp_reloc_off + offsetof(struct klp_reloc, offset),
+			      sec->sym, offset, R_ABS64))
+		return -1;
+
+	/* klp_reloc.sym */
+	if (!elf_create_reloc(e->out, klp_relocs,
+			      klp_reloc_off + offsetof(struct klp_reloc, sym),
+			      klp_sym, addend, R_ABS64))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Copy a reloc and its symbol to the output object.  The new reloc is placed
+ * at @offset in output section @sec.  Returns 0 on success, -1 on error.
+ */
+static int clone_reloc(struct elfs *e, struct reloc *patched_reloc,
+			struct section *sec, unsigned long offset)
+{
+	struct symbol *patched_sym = patched_reloc->sym;
+	struct export *export = find_export(patched_sym);
+	long addend = reloc_addend(patched_reloc);
+	struct symbol *clone;
+	bool with_data;
+
+	if (!is_reloc_allowed(patched_reloc)) {
+		ERROR_FUNC(patched_reloc->sec->base, reloc_offset(patched_reloc),
+			   "missing symbol for reference to %s+%ld",
+			   patched_sym->name, addend);
+		return -1;
+	}
+
+	if (klp_reloc_needed(patched_reloc))
+		return clone_reloc_klp(e, patched_reloc, sec, offset, export) ? -1 : 0;
+
+	/*
+	 * Why !export means the data gets copied too:
+	 *
+	 * Unexported non-klp symbols need to live in the patch module,
+	 * otherwise there will be unresolved symbols.  Notably, this includes:
+	 *
+	 *   - New functions/data
+	 *   - String sections
+	 *   - Special section entries
+	 *   - Uncorrelated static local variables
+	 *   - UBSAN sections
+	 */
+	with_data = patched_sym->included || !export;
+
+	clone = clone_symbol(e, patched_sym, with_data);
+	if (!clone)
+		return -1;
+
+	/*
+	 * For strings, all references use section symbols, thanks to
+	 * section_reference_needed().  clone_symbol() has cloned an empty
+	 * version of the string section.  Now copy the string itself, and
+	 * point the reloc at wherever it ended up.
+	 */
+	if (is_string_sec(patched_sym->sec)) {
+		const char *str = patched_sym->sec->data->d_buf + addend;
+
+		addend = elf_add_string(e->out, clone->sec, str);
+		if (addend == -1)
+			return -1;
+	}
+
+	return elf_create_reloc(e->out, sec, offset, clone, addend,
+				reloc_type(patched_reloc)) ? 0 : -1;
+}
+
+/*
+ * Copy all relocs needed for a symbol's contents, i.e. those which apply
+ * within the symbol's range (or within the whole section for a section
+ * symbol).  Returns 0 on success, -1 on error.
+ */
+static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym)
+{
+	struct section *patched_rsec = patched_sym->sec->rsec;
+	struct symbol *out_sym = patched_sym->clone;
+	bool whole_sec = is_sec_sym(patched_sym);
+	struct reloc *patched_reloc;
+	unsigned long start, end;
+
+	if (!out_sym) {
+		ERROR("no clone for %s", patched_sym->name);
+		return -1;
+	}
+
+	/* No relocs, no contents, or a string section: nothing to copy */
+	if (!patched_rsec ||
+	    (!whole_sec && !patched_sym->len) ||
+	    is_string_sec(patched_sym->sec))
+		return 0;
+
+	start = whole_sec ? 0 : patched_sym->offset;
+	end   = start + (whole_sec ? sec_size(patched_sym->sec) : patched_sym->len);
+
+	for_each_reloc(patched_rsec, patched_reloc) {
+		unsigned long reloc_off = reloc_offset(patched_reloc);
+		unsigned long out_off;
+
+		/* Only relocs inside [start, end) belong to the symbol */
+		if (reloc_off < start || reloc_off >= end)
+			continue;
+
+		/*
+		 * Skip any reloc referencing .altinstr_aux.  Its code is
+		 * always patched by alternatives.  See ALTERNATIVE_TERNARY().
+		 */
+		if (patched_reloc->sym->sec &&
+		    !strcmp(patched_reloc->sym->sec->name, ".altinstr_aux"))
+			continue;
+
+		if (convert_reloc_sym(e->patched, patched_reloc)) {
+			ERROR_FUNC(patched_rsec->base, reloc_offset(patched_reloc),
+				   "failed to convert reloc sym '%s' to its proper format",
+				   patched_reloc->sym->name);
+			return -1;
+		}
+
+		/* Same distance from the symbol start, in the output section */
+		out_off = out_sym->offset + (reloc_off - patched_sym->offset);
+
+		if (clone_reloc(e, patched_reloc, out_sym->sec, out_off))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Create a local symbol covering @size bytes at @offset in @sec.  The name is
+ * the section name plus a running counter, with dots replaced by underscores.
+ * Returns 0 on success, -1 on error.
+ */
+static int create_fake_symbol(struct elf *elf, struct section *sec,
+			      unsigned long offset, size_t size)
+{
+	char name[SYM_NAME_LEN];
+	struct symbol *sym;
+	unsigned int type;
+	static int ctr;
+	char *dot;
+
+	if (snprintf_check(name, SYM_NAME_LEN, "%s_%d", sec->name, ctr++))
+		return -1;
+
+	for (dot = strchr(name, '.'); dot; dot = strchr(dot + 1, '.'))
+		*dot = '_';
+
+	/*
+	 * STT_NOTYPE: Prevent objtool from validating .altinstr_replacement
+	 *	       while still allowing objdump to disassemble it.
+	 */
+	if (is_text_sec(sec))
+		type = STT_NOTYPE;
+	else
+		type = STT_OBJECT;
+
+	sym = elf_create_symbol(elf, name, sec, STB_LOCAL, type, offset, size);
+
+	return sym ? 0 : -1;
+}
+
+/*
+ * 1) Make symbols for all the ANNOTATE_DATA_SPECIAL entries.
+ *
+ * An entry starts at its annotation and ends at the next annotation for the
+ * same section, or at the end of the section.  Not having any annotations is
+ * not an error.
+ */
+static int create_annotated_fake_symbols(struct elf *elf)
+{
+	struct section *sec;
+	struct reloc *reloc;
+
+	sec = find_section_by_name(elf, ".discard.annotate_data");
+	if (!sec || !sec->rsec)
+		return 0;
+
+	for_each_reloc(sec->rsec, reloc) {
+		unsigned long offset, size;
+		struct reloc *next_reloc;
+
+		if (annotype(elf, sec, reloc) != ANNOTYPE_DATA_SPECIAL)
+			continue;
+
+		offset = reloc_addend(reloc);
+
+		size = 0;
+		next_reloc = reloc;
+		for_each_reloc_continue(sec->rsec, next_reloc) {
+			if (annotype(elf, sec, next_reloc) != ANNOTYPE_DATA_SPECIAL ||
+			    next_reloc->sym->sec != reloc->sym->sec)
+				continue;
+
+			size = reloc_addend(next_reloc) - offset;
+			break;
+		}
+
+		/* Last entry of its section: it extends to the section end */
+		if (!size)
+			size = sec_size(reloc->sym->sec) - offset;
+
+		if (create_fake_symbol(elf, reloc->sym->sec, offset, size))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * 2) Make symbols for sh_entsize, and simple arrays of pointers.
+ *
+ * Special sections which already have a symbol at offset 0 are skipped.
+ */
+static int create_array_fake_symbols(struct elf *elf)
+{
+	struct section *sec;
+
+	for_each_sec(elf, sec) {
+		unsigned int entry_size;
+		unsigned long offset;
+
+		if (!is_special_section(sec) || find_symbol_by_offset(sec, 0))
+			continue;
+
+		if (!sec->rsec) {
+			ERROR("%s: missing special section relocations", sec->name);
+			return -1;
+		}
+
+		entry_size = sec->sh.sh_entsize;
+		if (!entry_size) {
+			/* Only a plain pointer array has one reloc per entry */
+			entry_size = arch_reloc_size(sec->rsec->relocs);
+			if (sec_size(sec) != entry_size * sec_num_entries(sec->rsec)) {
+				ERROR("%s: missing special section entsize or annotations", sec->name);
+				return -1;
+			}
+		}
+
+		for (offset = 0; offset < sec_size(sec); offset += entry_size) {
+			if (create_fake_symbol(elf, sec, offset, entry_size))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Special sections (alternatives, etc) are basically arrays of structs.
+ * For all the special sections, create a symbol for each struct entry.  This
+ * is a bit cumbersome, but it makes the extracting of the individual entries
+ * much more straightforward.
+ *
+ * There are three ways to identify the entry sizes for a special section:
+ *
+ * 1) ELF section header sh_entsize: Ideally this would be used almost
+ *    everywhere.  But unfortunately the toolchains make it difficult.  The
+ *    assembler .[push]section directive syntax only takes entsize when
+ *    combined with SHF_MERGE.  But Clang disallows combining SHF_MERGE with
+ *    SHF_WRITE.  And some special sections do need to be writable.
+ *
+ *    Another place this wouldn't work is .altinstr_replacement, whose entries
+ *    don't have a fixed size.
+ *
+ * 2) ANNOTATE_DATA_SPECIAL: This is a lightweight objtool annotation which
+ *    points to the beginning of each entry.  The size of the entry is then
+ *    inferred by the location of the subsequent annotation (or end of
+ *    section).
+ *
+ * 3) Simple array of pointers: If the special section is just a basic array of
+ *    pointers, the entry size can be inferred by the number of relocations.
+ *    No annotations needed.
+ *
+ * Note I also tried to create per-entry symbols at the time of creation, in
+ * the original [inline] asm.  Unfortunately, creating uniquely named symbols
+ * is trickier than one might think, especially with Clang inline asm.  I
+ * eventually just gave up trying to make that work, in favor of using
+ * ANNOTATE_DATA_SPECIAL and creating the symbols here after the fact.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int create_fake_symbols(struct elf *elf)
+{
+	/*
+	 * The annotated entries go first: the second step skips any section
+	 * which already has a symbol at offset 0.  An object without any
+	 * annotations still needs the second step for its sh_entsize and
+	 * pointer array sections.
+	 */
+	if (create_annotated_fake_symbols(elf))
+		return -1;
+
+	return create_array_fake_symbols(elf);
+}
+
+/*
+ * Keep a special section entry if it references an included function.
+ *
+ * Each of the entry's relocs is converted to its proper symbol format first;
+ * relocs which can't be converted are ignored.
+ */
+static bool should_keep_special_sym(struct elf *elf, struct symbol *sym)
+{
+	struct reloc *reloc;
+
+	if (is_sec_sym(sym) || !sym->sec->rsec)
+		return false;
+
+	sym_for_each_reloc(elf, sym, reloc) {
+		if (!convert_reloc_sym(elf, reloc) &&
+		    is_func_sym(reloc->sym) && reloc->sym->included)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Klp relocations aren't allowed for __jump_table and .static_call_sites if
+ * the referenced symbol lives in a kernel module, because such klp relocs may
+ * be applied after static branch/call init, resulting in code corruption.
+ *
+ * Validate a special section entry to avoid that.  Note that an inert
+ * tracepoint is harmless enough, in that case just skip the entry and print a
+ * warning.  Otherwise, return an error.
+ *
+ * This is only a temporary limitation which will be fixed when livepatch adds
+ * support for submodules: fully self-contained modules which are embedded in
+ * the top-level livepatch module's data and which can be loaded on demand when
+ * their corresponding to-be-patched module gets loaded.  Then klp relocs can
+ * be retired.
+ *
+ * Return:
+ *   -1: error: validation failed
+ *    1: warning: tracepoint skipped
+ *    0: success
+ */
+static int validate_special_section_klp_reloc(struct elfs *e, struct symbol *sym)
+{
+	bool static_branch = !strcmp(sym->sec->name, "__jump_table");
+	bool static_call   = !strcmp(sym->sec->name, ".static_call_sites");
+	/*
+	 * The code location is only known once a STT_FUNC reloc has been
+	 * seen.  Use a placeholder until then, so the diagnostics below never
+	 * dereference a NULL symbol.
+	 */
+	const char *code_name = "<unknown>";
+	struct symbol *code_sym = NULL;
+	unsigned long code_offset = 0;
+	struct reloc *reloc;
+	int ret = 0;
+
+	if (!static_branch && !static_call)
+		return 0;
+
+	sym_for_each_reloc(e->patched, sym, reloc) {
+		const char *sym_modname;
+		struct export *export;
+
+		/* Static branch/call keys are always STT_OBJECT */
+		if (reloc->sym->type != STT_OBJECT) {
+
+			/* Save code location which can be printed below */
+			if (reloc->sym->type == STT_FUNC && !code_sym) {
+				code_sym = reloc->sym;
+				code_name = code_sym->name;
+				code_offset = reloc_addend(reloc);
+			}
+
+			continue;
+		}
+
+		if (!klp_reloc_needed(reloc))
+			continue;
+
+		export = find_export(reloc->sym);
+		if (export) {
+			sym_modname = export->mod;
+		} else {
+			sym_modname = find_modname(e);
+			if (!sym_modname)
+				return -1;
+		}
+
+		/* vmlinux keys are ok */
+		if (!strcmp(sym_modname, "vmlinux"))
+			continue;
+
+		if (static_branch) {
+			if (strstarts(reloc->sym->name, "__tracepoint_")) {
+				/* + 13: skip the "__tracepoint_" prefix */
+				WARN("%s: disabling unsupported tracepoint %s",
+				     code_name, reloc->sym->name + 13);
+				ret = 1;
+				continue;
+			}
+
+			ERROR("%s+0x%lx: unsupported static branch key %s.  Use static_key_enabled() instead",
+			      code_name, code_offset, reloc->sym->name);
+			return -1;
+		}
+
+		/* static call */
+		if (strstarts(reloc->sym->name, "__SCK__tp_func_")) {
+			ret = 1;
+			continue;
+		}
+
+		ERROR("%s()+0x%lx: unsupported static call key %s.  Use KLP_STATIC_CALL() instead",
+		      code_name, code_offset, reloc->sym->name);
+		return -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Extract all special section symbols (and their dependencies) which
+ * reference included functions.  Entries for which validation returns a
+ * warning are left out.  Returns 0 on success, -1 on error.
+ */
+static int clone_special_section(struct elfs *e, struct section *patched_sec)
+{
+	struct symbol *entry;
+
+	sec_for_each_sym(patched_sec, entry) {
+		int ret;
+
+		if (!is_object_sym(entry) ||
+		    !should_keep_special_sym(e->patched, entry))
+			continue;
+
+		ret = validate_special_section_klp_reloc(e, entry);
+		if (ret < 0)
+			return -1;
+
+		/* ret > 0: skip this entry */
+		if (!ret && !clone_symbol(e, entry, true))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Extract only the needed bits from special sections: give every special
+ * section entry a symbol, then clone the entries worth keeping.
+ */
+static int clone_special_sections(struct elfs *e)
+{
+	struct section *sec;
+
+	if (create_fake_symbols(e->patched))
+		return -1;
+
+	for_each_sec(e->patched, sec) {
+		if (!is_special_section(sec))
+			continue;
+
+		if (clone_special_section(e, sec))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Hook up one callback pointer created by KLP_PRE_PATCH_CALLBACK() and
+ * friends.
+ *
+ * The callback pointer is looked up in the output object by name (@prefix
+ * followed by @modname).  If it exists, the target of its relocation is
+ * referenced from the klp_callbacks member at @cb_offset of the klp object in
+ * @obj_sec.  A missing callback pointer is not an error.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int create_klp_callback_reloc(struct elfs *e, struct section *obj_sec,
+				     const char *prefix, const char *modname,
+				     size_t cb_offset)
+{
+	char sym_name[SYM_NAME_LEN];
+	struct reloc *reloc;
+	struct symbol *sym;
+
+	if (snprintf_check(sym_name, SYM_NAME_LEN, "%s%s", prefix, modname))
+		return -1;
+
+	sym = find_symbol_by_name(e->out, sym_name);
+	if (!sym)
+		return 0;
+
+	/* A callback pointer which points nowhere can't be hooked up */
+	reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
+	if (!reloc) {
+		ERROR("%s: missing callback relocation", sym_name);
+		return -1;
+	}
+
+	if (!elf_create_reloc(e->out, obj_sec,
+			      offsetof(struct klp_object_ext, callbacks) + cb_offset,
+			      reloc->sym, reloc_addend(reloc), R_ABS64))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Create __klp_objects and __klp_funcs sections which are intermediate
+ * sections provided as input to the patch module's init code for building the
+ * klp_patch, klp_object and klp_func structs for the livepatch API.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int create_klp_sections(struct elfs *e)
+{
+	size_t obj_size  = sizeof(struct klp_object_ext);
+	size_t func_size = sizeof(struct klp_func_ext);
+	struct section *obj_sec, *funcs_sec, *str_sec;
+	struct symbol *funcs_sym, *str_sym, *sym;
+	unsigned int nr_funcs = 0;
+	const char *modname;
+	void *obj_data;
+	s64 addend;
+
+	obj_sec  = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0);
+	if (!obj_sec)
+		return -1;
+
+	funcs_sec = elf_create_section_pair(e->out, KLP_FUNCS_SEC, func_size, 0, 0);
+	if (!funcs_sec)
+		return -1;
+
+	funcs_sym = elf_create_section_symbol(e->out, funcs_sec);
+	if (!funcs_sym)
+		return -1;
+
+	str_sec = elf_create_section(e->out, KLP_STRINGS_SEC, 0, 0,
+				     SHT_PROGBITS, 1,
+				     SHF_ALLOC | SHF_STRINGS | SHF_MERGE);
+	if (!str_sec)
+		return -1;
+
+	if (elf_add_string(e->out, str_sec, "") == -1)
+		return -1;
+
+	str_sym = elf_create_section_symbol(e->out, str_sec);
+	if (!str_sym)
+		return -1;
+
+	/* allocate klp_object_ext */
+	obj_data = elf_add_data(e->out, obj_sec, NULL, obj_size);
+	if (!obj_data)
+		return -1;
+
+	modname = find_modname(e);
+	if (!modname)
+		return -1;
+
+	/* klp_object_ext.name: only set for modules, not for vmlinux */
+	if (strcmp(modname, "vmlinux")) {
+		addend = elf_add_string(e->out, str_sec, modname);
+		if (addend == -1)
+			return -1;
+
+		if (!elf_create_reloc(e->out, obj_sec,
+				      offsetof(struct klp_object_ext, name),
+				      str_sym, addend, R_ABS64))
+			return -1;
+	}
+
+	/* klp_object_ext.funcs */
+	if (!elf_create_reloc(e->out, obj_sec, offsetof(struct klp_object_ext, funcs),
+			      funcs_sym, 0, R_ABS64))
+		return -1;
+
+	/* One klp_func_ext per changed, non-cold function */
+	for_each_sym(e->out, sym) {
+		unsigned long offset = nr_funcs * func_size;
+		unsigned long sympos;
+		void *func_data;
+
+		if (!is_func_sym(sym) || sym->cold || !sym->clone || !sym->clone->changed)
+			continue;
+
+		/* allocate klp_func_ext */
+		func_data = elf_add_data(e->out, funcs_sec, NULL, func_size);
+		if (!func_data)
+			return -1;
+
+		/* klp_func_ext.old_name */
+		addend = elf_add_string(e->out, str_sec, sym->clone->twin->name);
+		if (addend == -1)
+			return -1;
+
+		if (!elf_create_reloc(e->out, funcs_sec,
+				      offset + offsetof(struct klp_func_ext, old_name),
+				      str_sym, addend, R_ABS64))
+			return -1;
+
+		/* klp_func_ext.new_func */
+		if (!elf_create_reloc(e->out, funcs_sec,
+				      offset + offsetof(struct klp_func_ext, new_func),
+				      sym, 0, R_ABS64))
+			return -1;
+
+		/* klp_func_ext.sympos */
+		BUILD_BUG_ON(sizeof(sympos) != sizeof_field(struct klp_func_ext, sympos));
+		sympos = find_sympos(e->orig, sym->clone->twin);
+		if (sympos == ULONG_MAX)
+			return -1;
+		memcpy(func_data + offsetof(struct klp_func_ext, sympos), &sympos,
+		       sizeof_field(struct klp_func_ext, sympos));
+
+		nr_funcs++;
+	}
+
+	/* klp_object_ext.nr_funcs */
+	BUILD_BUG_ON(sizeof(nr_funcs) != sizeof_field(struct klp_object_ext, nr_funcs));
+	memcpy(obj_data + offsetof(struct klp_object_ext, nr_funcs), &nr_funcs,
+	       sizeof_field(struct klp_object_ext, nr_funcs));
+
+	/*
+	 * Find callback pointers created by KLP_PRE_PATCH_CALLBACK() and
+	 * friends, and add them to the klp object.
+	 */
+
+	if (create_klp_callback_reloc(e, obj_sec, KLP_PRE_PATCH_PREFIX, modname,
+				      offsetof(struct klp_callbacks, pre_patch)))
+		return -1;
+
+	if (create_klp_callback_reloc(e, obj_sec, KLP_POST_PATCH_PREFIX, modname,
+				      offsetof(struct klp_callbacks, post_patch)))
+		return -1;
+
+	if (create_klp_callback_reloc(e, obj_sec, KLP_PRE_UNPATCH_PREFIX, modname,
+				      offsetof(struct klp_callbacks, pre_unpatch)))
+		return -1;
+
+	if (create_klp_callback_reloc(e, obj_sec, KLP_POST_UNPATCH_PREFIX, modname,
+				      offsetof(struct klp_callbacks, post_unpatch)))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Copy all .modinfo import_ns= tags to ensure all namespaced exported symbols
+ * can be accessed via normal relocs.
+ *
+ * The tags are read from the patched object's .modinfo section and appended to
+ * the output object's .modinfo section, which is created on demand with the
+ * same section header attributes.
+ *
+ * Returns 0 on success (including when there is nothing to copy), -1 on error.
+ */
+static int copy_import_ns(struct elfs *e)
+{
+	struct section *patched_sec, *out_sec = NULL;
+	char *import_ns, *data_end;
+
+	patched_sec = find_section_by_name(e->patched, ".modinfo");
+	if (!patched_sec)
+		return 0;
+
+	import_ns = patched_sec->data->d_buf;
+	if (!import_ns)
+		return 0;
+
+	/*
+	 * Each iteration jumps to the next "import_ns=" match, copies the
+	 * NUL-terminated string starting there, then resumes the search right
+	 * after that string.
+	 */
+	for (data_end = import_ns + sec_size(patched_sec);
+	     import_ns < data_end;
+	     import_ns += strlen(import_ns) + 1) {
+
+		/*
+		 * The search is a raw byte match: the tag is found anywhere in
+		 * the remaining data, not only at the start of an entry.
+		 */
+		import_ns = memmem(import_ns, data_end - import_ns, "import_ns=", 10);
+		if (!import_ns)
+			return 0;
+
+		/* Look up (or create) the output section on the first match */
+		if (!out_sec) {
+			out_sec = find_section_by_name(e->out, ".modinfo");
+			if (!out_sec) {
+				out_sec = elf_create_section(e->out, ".modinfo", 0,
+							     patched_sec->sh.sh_entsize,
+							     patched_sec->sh.sh_type,
+							     patched_sec->sh.sh_addralign,
+							     patched_sec->sh.sh_flags);
+				if (!out_sec)
+					return -1;
+			}
+		}
+
+		/* + 1: copy the terminating NUL as well */
+		if (!elf_add_data(e->out, out_sec, import_ns, strlen(import_ns) + 1))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point of the klp diff subcommand.
+ *
+ * After option parsing, three arguments are expected: the original object,
+ * the patched object and the output file.  Returns 0 without creating an
+ * output file if no function was changed or added, and -1 on error.
+ */
+int cmd_klp_diff(int argc, const char **argv)
+{
+	struct elfs e = {0};
+
+	argc = parse_options(argc, argv, klp_diff_options, klp_diff_usage, 0);
+	if (argc != 3)
+		usage_with_options(klp_diff_usage, klp_diff_options);
+
+	objname = argv[0];
+
+	e.orig = elf_open_read(argv[0], O_RDONLY);
+	e.patched = elf_open_read(argv[1], O_RDONLY);
+	e.out = NULL;
+
+	if (!e.orig || !e.patched)
+		return -1;
+
+	/* Gather everything needed to compare the two objects */
+	if (read_exports() ||
+	    read_sym_checksums(e.orig) ||
+	    read_sym_checksums(e.patched) ||
+	    correlate_symbols(&e))
+		return -1;
+
+	/* Nonzero means nothing changed: there's nothing to generate */
+	if (mark_changed_functions(&e))
+		return 0;
+
+	e.out = elf_create_file(&e.orig->ehdr, argv[2]);
+	if (!e.out)
+		return -1;
+
+	/* Populate the output object, then write it out */
+	if (clone_included_functions(&e) ||
+	    clone_special_sections(&e) ||
+	    create_klp_sections(&e) ||
+	    copy_import_ns(&e) ||
+	    elf_write(e.out))
+		return -1;
+
+	return elf_close(e.out);
+}
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 5c8b974ad0f9..c8f611c1320d 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -16,8 +16,6 @@
 #include <objtool/objtool.h>
 #include <objtool/warn.h>
 
-bool help;
-
 static struct objtool_file file;
 
 struct objtool_file *objtool_open_read(const char *filename)
@@ -71,6 +69,39 @@ int objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func)
 	return 0;
 }
 
+char *top_level_dir(const char *file)
+{
+	ssize_t len, self_len, file_len;
+	char self[PATH_MAX], *str;
+	int i;
+
+	len = readlink("/proc/self/exe", self, sizeof(self) - 1);
+	if (len <= 0)
+		return NULL;
+	self[len] = '\0';
+
+	for (i = 0; i < 3; i++) {
+		char *s = strrchr(self, '/');
+		if (!s)
+			return NULL;
+		*s = '\0';
+	}
+
+	self_len = strlen(self);
+	file_len = strlen(file);
+
+	str = malloc(self_len + file_len + 2);
+	if (!str)
+		return NULL;
+
+	memcpy(str, self, self_len);
+	str[self_len] = '/';
+	strcpy(str + self_len + 1, file);
+
+	return str;
+}
+
+
 int main(int argc, const char **argv)
 {
 	static const char *UNUSED = "OBJTOOL_NOT_IMPLEMENTED";
@@ -79,5 +110,11 @@ int main(int argc, const char **argv)
 	exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED);
 	pager_init(UNUSED);
 
+	if (argc > 1 && !strcmp(argv[1], "klp")) {
+		argc--;
+		argv++;
+		return cmd_klp(argc, argv);
+	}
+
 	return objtool_run(argc, argv);
 }
diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh
index 86d64e3ac6f7..e38167ca56a9 100755
--- a/tools/objtool/sync-check.sh
+++ b/tools/objtool/sync-check.sh
@@ -17,6 +17,7 @@ arch/x86/include/asm/emulate_prefix.h
 arch/x86/lib/x86-opcode-map.txt
 arch/x86/tools/gen-insn-attr-x86.awk
 include/linux/interval_tree_generic.h
+include/linux/livepatch_external.h
 include/linux/static_call_types.h
 "
 
diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c
index d83f607733b0..d6562f292259 100644
--- a/tools/objtool/weak.c
+++ b/tools/objtool/weak.c
@@ -8,6 +8,8 @@
 #include <stdbool.h>
 #include <errno.h>
 #include <objtool/objtool.h>
+#include <objtool/arch.h>
+#include <objtool/builtin.h>
 
 #define UNSUPPORTED(name)						\
 ({									\
@@ -24,3 +26,8 @@ int __weak orc_create(struct objtool_file *file)
 {
 	UNSUPPORTED("ORC");
 }
+
+int __weak cmd_klp(int argc, const char **argv)
+{
+	UNSUPPORTED("klp");
+}
-- 
cgit v1.2.3


From b9976fa4649627c04dde26183333c3dcc90a0b76 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 17 Sep 2025 09:04:11 -0700
Subject: livepatch: Introduce source code helpers for livepatch modules

Add some helper macros which can be used by livepatch source .patch
files to register callbacks, convert static calls to regular calls where
needed, and patch syscalls.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/livepatch_helpers.h | 77 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 include/linux/livepatch_helpers.h

(limited to 'include')

diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h
new file mode 100644
index 000000000000..99d68d0773fa
--- /dev/null
+++ b/include/linux/livepatch_helpers.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIVEPATCH_HELPERS_H
+#define _LINUX_LIVEPATCH_HELPERS_H
+
+/*
+ * Interfaces for use by livepatch patches
+ */
+
+#include <linux/syscalls.h>
+#include <linux/livepatch.h>
+
+#ifdef MODULE
+#define KLP_OBJNAME __KBUILD_MODNAME
+#else
+#define KLP_OBJNAME vmlinux
+#endif
+
+/* Livepatch callback registration */
+
+#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs"
+
+#define KLP_PRE_PATCH_CALLBACK(func)						\
+	klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS)			\
+		__PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_POST_PATCH_CALLBACK(func)						\
+	klp_post_patch_t __used __section(KLP_CALLBACK_PTRS)			\
+		__PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_PRE_UNPATCH_CALLBACK(func)						\
+	klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS)			\
+		__PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_POST_UNPATCH_CALLBACK(func)						\
+	klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS)			\
+		__PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func
+
+/*
+ * Replace static_call() usage with this macro when create-diff-object
+ * recommends it due to the original static call key living in a module.
+ *
+ * This converts the static call to a regular indirect call.
+ */
+#define KLP_STATIC_CALL(name) \
+	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+/* Syscall patching */
+
+#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE2(name, ...) KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
+
+#define KLP_SYSCALL_DEFINEx(x, sname, ...)				\
+	__KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+
+#ifdef CONFIG_X86_64
+// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code
+#define __KLP_SYSCALL_DEFINEx(x, name, ...)			\
+	static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
+	static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+	__X64_SYS_STUBx(x, name, __VA_ARGS__)				\
+	__IA32_SYS_STUBx(x, name, __VA_ARGS__)				\
+	static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
+	{								\
+		long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
+		__MAP(x,__SC_TEST,__VA_ARGS__);				\
+		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
+		return ret;						\
+	}								\
+	static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+#endif
+
+#endif /* _LINUX_LIVEPATCH_HELPERS_H */
-- 
cgit v1.2.3


From 10c4b4f60f5d0dbd29fa819be76e888501c7b729 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 13 Oct 2025 22:50:27 +0200
Subject: net: mdio: use macro module_driver to avoid boilerplate code

Use macro module_driver to avoid boilerplate code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/e5c37417-4984-4b57-8154-264deef61e0d@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index c640ba44dd6e..42d6d47e445b 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -689,16 +689,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr);
  * init/exit. Each module may only use this macro once, and calling it
  * replaces module_init() and module_exit().
  */
-#define mdio_module_driver(_mdio_driver)				\
-static int __init mdio_module_init(void)				\
-{									\
-	return mdio_driver_register(&_mdio_driver);			\
-}									\
-module_init(mdio_module_init);						\
-static void __exit mdio_module_exit(void)				\
-{									\
-	mdio_driver_unregister(&_mdio_driver);				\
-}									\
-module_exit(mdio_module_exit)
+#define mdio_module_driver(_mdio_driver) \
+	module_driver(_mdio_driver, mdio_driver_register, mdio_driver_unregister)
 
 #endif /* __LINUX_MDIO_H__ */
-- 
cgit v1.2.3


From 433e294c3c5b5d2020085a0e36c1cb47b694690a Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Wed, 1 Oct 2025 12:56:49 +0200
Subject: regulator: core: forward undervoltage events downstream by default

Forward critical supply events downstream so consumers can react in
time.  An under-voltage event on an upstream rail may otherwise never
reach end devices (e.g. eMMC).

Register a notifier on a regulator's supply when the supply is resolved,
and forward only REGULATOR_EVENT_UNDER_VOLTAGE to the consumer's notifier
chain. Event handling is deferred to process context via a workqueue; the
consumer rdev is lifetime-pinned and the rdev lock is held while calling
the notifier chain. The notifier is unregistered on regulator teardown.

No DT/UAPI changes. Behavior applies to all regulators with a supply.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20251001105650.2391477-1-o.rempel@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 124 +++++++++++++++++++++++++++++++++++++++
 include/linux/regulator/driver.h |   3 +
 2 files changed, 127 insertions(+)

(limited to 'include')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index dd7b10e768c0..84bc38911dba 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -83,6 +83,19 @@ struct regulator_supply_alias {
 	const char *alias_supply;
 };
 
+/*
+ * Work item used to forward regulator events.
+ *
+ * @work: workqueue entry
+ * @rdev: regulator device to notify (consumer receiving the forwarded event)
+ * @event: event code to be forwarded
+ */
+struct regulator_event_work {
+	struct work_struct work;
+	struct regulator_dev *rdev;
+	unsigned long event;
+};
+
 static int _regulator_is_enabled(struct regulator_dev *rdev);
 static int _regulator_disable(struct regulator *regulator);
 static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags);
@@ -1658,6 +1671,104 @@ static int set_machine_constraints(struct regulator_dev *rdev)
 	return 0;
 }
 
+/**
+ * regulator_event_work_fn - process a deferred regulator event
+ * @work: work_struct queued by the notifier
+ *
+ * Calls the regulator's notifier chain in process context while holding
+ * the rdev lock, then releases the device reference.
+ */
+static void regulator_event_work_fn(struct work_struct *work)
+{
+	struct regulator_event_work *rew =
+		container_of(work, struct regulator_event_work, work);
+	struct regulator_dev *rdev = rew->rdev;
+	int ret;
+
+	regulator_lock(rdev);
+	ret = regulator_notifier_call_chain(rdev, rew->event, NULL);
+	regulator_unlock(rdev);
+	if (ret == NOTIFY_BAD)
+		dev_err(rdev_get_dev(rdev), "failed to forward regulator event\n");
+
+	put_device(rdev_get_dev(rdev));
+	kfree(rew);
+}
+
+/**
+ * regulator_event_forward_notifier - notifier callback for supply events
+ * @nb:    notifier block embedded in the regulator
+ * @event: regulator event code
+ * @data:  unused
+ *
+ * Packages the event into a work item and schedules it in process context.
+ * Takes a reference on @rdev->dev to pin the regulator until the work
+ * completes (see put_device() in the worker).
+ *
+ * Return: NOTIFY_OK if queued, NOTIFY_DONE if not forwarded or on allocation failure.
+ */
+static int regulator_event_forward_notifier(struct notifier_block *nb,
+					    unsigned long event,
+					    void __always_unused *data)
+{
+	struct regulator_dev *rdev = container_of(nb, struct regulator_dev,
+						  supply_fwd_nb);
+	struct regulator_event_work *rew;
+
+	switch (event) {
+	case REGULATOR_EVENT_UNDER_VOLTAGE:
+		break;
+	default:
+		/* Only forward allowed events downstream. */
+		return NOTIFY_DONE;
+	}
+
+	rew = kmalloc(sizeof(*rew), GFP_ATOMIC);
+	if (!rew)
+		return NOTIFY_DONE;
+
+	get_device(rdev_get_dev(rdev));
+	rew->rdev = rdev;
+	rew->event = event;
+	INIT_WORK(&rew->work, regulator_event_work_fn);
+
+	queue_work(system_highpri_wq, &rew->work);
+
+	return NOTIFY_OK;
+}
+
+/**
+ * register_regulator_event_forwarding - enable supply event forwarding
+ * @rdev: regulator device
+ *
+ * Registers a notifier on the regulator's supply so that supply events
+ * are forwarded to the consumer regulator via the deferred work handler.
+ *
+ * Return: 0 on success, -EALREADY if already enabled, or a negative error code.
+ */
+static int register_regulator_event_forwarding(struct regulator_dev *rdev)
+{
+	int ret;
+
+	if (!rdev->supply)
+		return 0; /* top-level regulator: nothing to forward */
+
+	if (rdev->supply_fwd_nb.notifier_call)
+		return -EALREADY;
+
+	rdev->supply_fwd_nb.notifier_call = regulator_event_forward_notifier;
+
+	ret = regulator_register_notifier(rdev->supply, &rdev->supply_fwd_nb);
+	if (ret) {
+		dev_err(&rdev->dev, "failed to register supply notifier: %pe\n",
+			ERR_PTR(ret));
+		rdev->supply_fwd_nb.notifier_call = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
  * set_supply - set regulator supply regulator
  * @rdev: regulator (locked)
@@ -2144,6 +2255,16 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
 		goto out;
 	}
 
+	/*
+	 * Automatically register for event forwarding from the new supply.
+	 * This creates the downstream propagation link for events like
+	 * under-voltage.
+	 */
+	ret = register_regulator_event_forwarding(rdev);
+	if (ret < 0)
+		rdev_warn(rdev, "Failed to register event forwarding: %pe\n",
+			  ERR_PTR(ret));
+
 	regulator_unlock_two(rdev, r, &ww_ctx);
 
 	/* rdev->supply was created in set_supply() */
@@ -6031,6 +6152,9 @@ void regulator_unregister(struct regulator_dev *rdev)
 		return;
 
 	if (rdev->supply) {
+		regulator_unregister_notifier(rdev->supply,
+					      &rdev->supply_fwd_nb);
+
 		while (rdev->use_count--)
 			regulator_disable(rdev->supply);
 		regulator_put(rdev->supply);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4a216fdba354..978cf593b662 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -658,6 +658,9 @@ struct regulator_dev {
 	spinlock_t err_lock;
 
 	int pw_requested_mW;
+
+	/* regulator notification forwarding */
+	struct notifier_block supply_fwd_nb;
 };
 
 /*
-- 
cgit v1.2.3


From 48a97ffc6c826640907d13b199e29008f4fe2c15 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 14 Oct 2025 13:14:03 -0700
Subject: bpf: Consistently use bpf_rcu_lock_held() everywhere

We have many places which open-code what is now the bpf_rcu_lock_held()
macro, so replace all those places with a clean and short macro invocation.
For that, move bpf_rcu_lock_held() macro into include/linux/bpf.h.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20251014201403.4104511-1-andrii@kernel.org
---
 include/linux/bpf.h               |  3 +++
 include/linux/bpf_local_storage.h |  3 ---
 kernel/bpf/hashtab.c              | 21 +++++++--------------
 kernel/bpf/helpers.c              | 12 ++++--------
 4 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f87fb203aaae..86afd9ac6848 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2381,6 +2381,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array,
 bool bpf_jit_bypass_spec_v1(void);
 bool bpf_jit_bypass_spec_v4(void);
 
+#define bpf_rcu_lock_held() \
+	(rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held())
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index ab7244d8108f..782f58feea35 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -18,9 +18,6 @@
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
 
-#define bpf_rcu_lock_held()                                                    \
-	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||                 \
-	 rcu_read_lock_bh_held())
 struct bpf_local_storage_map_bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e7a6ba04dc82..f876f09355f0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -657,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 	struct htab_elem *l;
 	u32 hash, key_size;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1086,8 +1085,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1194,8 +1192,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1263,8 +1260,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1326,8 +1322,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1404,8 +1399,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
@@ -1440,8 +1434,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret;
 
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 
 	key_size = map->key_size;
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index dea8443f782c..825280c953be 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -42,8 +42,7 @@
  */
 BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	return (unsigned long) map->ops->map_lookup_elem(map, key);
 }
 
@@ -59,8 +58,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 	   void *, value, u64, flags)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	return map->ops->map_update_elem(map, key, value, flags);
 }
 
@@ -77,8 +75,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
 
 BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	return map->ops->map_delete_elem(map, key);
 }
 
@@ -134,8 +131,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
 
 BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
-		     !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
 }
 
-- 
cgit v1.2.3


From 1c51450f1afff1e7419797720df3fbd9ccbf610c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Oct 2025 14:59:26 +0000
Subject: tcp: better handle TCP_TX_DELAY on established flows

Some applications use the TCP_TX_DELAY socket option after a TCP flow
is established.

Some metrics need to be updated, otherwise TCP might take time to
adapt to the new (emulated) RTT.

This patch adjusts tp->srtt_us, tp->rtt_min, icsk_rto
and sk->sk_pacing_rate.

This is best effort, and for instance icsk_rto is reset
without taking backoff into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251013145926.833198-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h    |  2 ++
 net/ipv4/tcp.c       | 31 +++++++++++++++++++++++++++----
 net/ipv4/tcp_input.c |  4 ++--
 3 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5ca230ed526a..1e547138f4fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -461,6 +461,8 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
 void tcp_enter_loss(struct sock *sk);
 void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
+void tcp_update_pacing_rate(struct sock *sk);
+void tcp_set_rto(struct sock *sk);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8a18aeca7ab0..4d720aa09a4c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3583,9 +3583,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
 DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
 EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
 
-static void tcp_enable_tx_delay(void)
+static void tcp_enable_tx_delay(struct sock *sk, int val)
 {
-	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+	if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
 		static int __tcp_tx_delay_enabled = 0;
 
 		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
@@ -3593,6 +3596,22 @@ static void tcp_enable_tx_delay(void)
 			pr_info("TCP_TX_DELAY enabled\n");
 		}
 	}
+	/* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+	 * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+	 * This is best effort.
+	 */
+	if (delta && sk->sk_state == TCP_ESTABLISHED) {
+		s64 srtt = (s64)tp->srtt_us + delta;
+
+		tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+		/* Note: does not deal with non zero icsk_backoff */
+		tcp_set_rto(sk);
+
+		minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+		tcp_update_pacing_rate(sk);
+	}
 }
 
 /* When set indicates to always queue non-full frames.  Later the user clears
@@ -4119,8 +4138,12 @@ ao_parse:
 			tp->recvmsg_inq = val;
 		break;
 	case TCP_TX_DELAY:
-		if (val)
-			tcp_enable_tx_delay();
+		/* tp->srtt_us is u32, and is shifted by 3 */
+		if (val < 0 || val >= (1U << (31 - 3))) {
+			err = -EINVAL;
+			break;
+		}
+		tcp_enable_tx_delay(sk, val);
 		WRITE_ONCE(tp->tcp_tx_delay, val);
 		break;
 	default:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 31ea5af49f2d..8fc97f4d8a6b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1095,7 +1095,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 	tp->srtt_us = max(1U, srtt);
 }
 
-static void tcp_update_pacing_rate(struct sock *sk)
+void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
@@ -1132,7 +1132,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
-- 
cgit v1.2.3


From 6ddb811a579f87b8506344020002d396f814f7c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Oct 2025 15:22:31 +0000
Subject: net: add SK_WMEM_ALLOC_BIAS constant

sk->sk_wmem_alloc is initialized to 1, and sk_wmem_alloc_get()
takes care of this initial value.

Add SK_WMEM_ALLOC_BIAS define to not spread this magic value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 3 ++-
 net/atm/common.c   | 2 +-
 net/core/sock.c    | 5 ++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 60bcb13f045c..2794bc5c5654 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2303,6 +2303,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
 	return 0;
 }
 
+#define SK_WMEM_ALLOC_BIAS 1
 /**
  * sk_wmem_alloc_get - returns write allocations
  * @sk: socket
@@ -2311,7 +2312,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
  */
 static inline int sk_wmem_alloc_get(const struct sock *sk)
 {
-	return refcount_read(&sk->sk_wmem_alloc) - 1;
+	return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS;
 }
 
 /**
diff --git a/net/atm/common.c b/net/atm/common.c
index 881c7f259dbd..cecc71a8bee1 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -157,7 +157,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family, i
 	memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
 	memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
 	vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
-	refcount_set(&sk->sk_wmem_alloc, 1);
+	refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
 	atomic_set(&sk->sk_rmem_alloc, 0);
 	vcc->push = NULL;
 	vcc->pop = NULL;
diff --git a/net/core/sock.c b/net/core/sock.c
index dc03d4b5909a..542cfa16ee12 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2313,7 +2313,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		}
 
 		sock_net_set(sk, net);
-		refcount_set(&sk->sk_wmem_alloc, 1);
+		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
 
 		mem_cgroup_sk_alloc(sk);
 		cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -2494,8 +2494,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 	atomic_set(&newsk->sk_rmem_alloc, 0);
 
-	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
-	refcount_set(&newsk->sk_wmem_alloc, 1);
+	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
 
 	atomic_set(&newsk->sk_omem_alloc, 0);
 	sk_init_common(newsk);
-- 
cgit v1.2.3


From 2ddef3462b3a5d62e5485e22ce128a5c02276438 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Oct 2025 15:22:33 +0000
Subject: net: add /proc/sys/net/core/txq_reselection_ms control

Add a new sysctl to control how often a queue reselection
can happen even if a flow has a persistent queue of skbs
in a Qdisc or NIC queue.

A value of zero means the feature is disabled.

Default is 1000 (1 second).

This sysctl is used in the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/admin-guide/sysctl/net.rst | 17 +++++++++++++++++
 include/net/netns/core.h                 |  1 +
 net/core/net_namespace.c                 |  1 +
 net/core/sysctl_net_core.c               |  7 +++++++
 4 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 2ef50828aff1..40749b3cd356 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -406,6 +406,23 @@ to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt).
 If set to 1 (default), hash rethink is performed on listening socket.
 If set to 0, hash rethink is not performed.
 
+txq_reselection_ms
+------------------
+
+Controls how often (in ms) a busy connected flow can select another tx queue.
+
+A reselection is desirable when/if the user thread has migrated and XPS
+would select a different queue. Same can occur without XPS
+if the flow hash has changed.
+
+But switching txq can introduce reorders, especially if the
+old queue is under high pressure. Modern TCP stacks deal
+well with reorders if they do not happen too often.
+
+To disable this feature, set the value to 0.
+
+Default : 1000
+
 gro_normal_batch
 ----------------
 
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 9b36f0ff0c20..cb9c3e4cd738 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -13,6 +13,7 @@ struct netns_core {
 	struct ctl_table_header	*sysctl_hdr;
 
 	int	sysctl_somaxconn;
+	int	sysctl_txq_reselection;
 	int	sysctl_optmem_max;
 	u8	sysctl_txrehash;
 	u8	sysctl_tstamp_allow_data;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0e0f22d7b21..adcfef55a66f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -395,6 +395,7 @@ static __net_init void preinit_net_sysctl(struct net *net)
 	net->core.sysctl_optmem_max = 128 * 1024;
 	net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
 	net->core.sysctl_tstamp_allow_data = 1;
+	net->core.sysctl_txq_reselection = msecs_to_jiffies(1000);
 }
 
 /* init code that must occur even if setup_net() is not called. */
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8cf04b57ade1..f79137826d7f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -667,6 +667,13 @@ static struct ctl_table netns_core_table[] = {
 		.extra2		= SYSCTL_ONE,
 		.proc_handler	= proc_dou8vec_minmax,
 	},
+	{
+		.procname	= "txq_reselection_ms",
+		.data		= &init_net.core.sysctl_txq_reselection,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
 	{
 		.procname	= "tstamp_allow_data",
 		.data		= &init_net.core.sysctl_tstamp_allow_data,
-- 
cgit v1.2.3


From 4a7708443dec13b074bc43855f494358fedbd3c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Oct 2025 15:22:34 +0000
Subject: net: allow busy connected flows to switch tx queues

This is a followup of commit 726e9e8b94b9 ("tcp: refine
skb->ooo_okay setting") and of prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()")

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu.

The so-called "strange attractors" have caused many performance
issues (see for instance 9b462d02d6dd ("tcp: TCP Small Queues
and strange attractors")), we need to do better.

We have tried very hard to avoid reorders because TCP was
not dealing with them nicely a decade ago.

Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.

After this patch, we no longer have to make sure threads
are pinned to cpus, they now can be migrated without
adding too much spinlock/qdisc/TX completion pressure anymore.

TX completion part was problematic, because it added false sharing
on various socket fields, but also added false sharing and spinlock
contention in mm layers. Calling skb_orphan() from ndo_start_xmit()
is not an option unfortunately.

Note for later:

1) move sk->sk_tx_queue_mapping closer
to sk_tx_queue_mapping_jiffies for better cache locality.

2) Study if 9b462d02d6dd ("tcp: TCP Small Queues
and strange attractors") could be revised.

Tested:

Used a host with 32 TX queues, shared by groups of 8 cores.
XPS setup :

echo ff >/sys/class/net/eth1/queue/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queue/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queue/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queue/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queue/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queue/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queue/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queue/tx-7/xps_cpus
...

Launched a tcp_stream with 15 threads and 1000 flows, initially affined to core 0-15

taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host

Checked that only queues 0 and 1 are used as instructed by XPS :
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 123489410b 1890p
 backlog 69809026b 1064p
 backlog 52401054b 805p

Then force each thread to run on cpu 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121

C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done

Set txq_reselection_ms to 1000
echo 1000 > /proc/sys/net/core/txq_reselection_ms

Check that the flows have migrated nicely:

tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 130508314b 1916p
 backlog 8584380b 126p
 backlog 8584380b 126p
 backlog 8379990b 123p
 backlog 8584380b 126p
 backlog 8487484b 125p
 backlog 8584380b 126p
 backlog 8448120b 124p
 backlog 8584380b 126p
 backlog 8720640b 128p
 backlog 8856900b 130p
 backlog 8584380b 126p
 backlog 8652510b 127p
 backlog 8448120b 124p
 backlog 8516250b 125p
 backlog 7834950b 115p

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 26 ++++++++++++--------------
 net/core/dev.c     | 29 +++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2794bc5c5654..f0d00928db9e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -313,6 +313,7 @@ struct sk_filter;
   *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
   *	              for timestamping
   *	@sk_tskey: counter to disambiguate concurrent tstamp requests
+  *	@sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
   *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
   *	@sk_socket: Identd and reporting IO signals
   *	@sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
@@ -485,6 +486,7 @@ struct sock {
 	unsigned long		sk_pacing_rate; /* bytes per second */
 	atomic_t		sk_zckey;
 	atomic_t		sk_tskey;
+	unsigned long		sk_tx_queue_mapping_jiffies;
 	__cacheline_group_end(sock_write_tx);
 
 	__cacheline_group_begin(sock_read_tx);
@@ -1992,7 +1994,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 	/* Paired with READ_ONCE() in sk_tx_queue_get() and
 	 * other WRITE_ONCE() because socket lock might be not held.
 	 */
-	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+	if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+		WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+		return;
+	}
+
+	/* Refresh sk_tx_queue_mapping_jiffies if too old. */
+	if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
 }
 
 #define NO_QUEUE_MAPPING	USHRT_MAX
@@ -2005,19 +2015,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
 	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
 }
 
-static inline int sk_tx_queue_get(const struct sock *sk)
-{
-	if (sk) {
-		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
-		 * and sk_tx_queue_set().
-		 */
-		int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
-		if (val != NO_QUEUE_MAPPING)
-			return val;
-	}
-	return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);
 
 static inline void __sk_rx_queue_set(struct sock *sk,
 				     const struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index a64cef2c537e..33e6101dbc45 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4591,6 +4591,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(dev_pick_tx_zero);
 
+int sk_tx_queue_get(const struct sock *sk)
+{
+	int resel, val;
+
+	if (!sk)
+		return -1;
+	/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+	 * and sk_tx_queue_set().
+	 */
+	val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+	if (val == NO_QUEUE_MAPPING)
+		return -1;
+
+	if (!sk_fullsock(sk))
+		return val;
+
+	resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+	if (resel && time_is_before_jiffies(
+			READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+		return -1;
+
+	return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		     struct net_device *sb_dev)
 {
@@ -4606,8 +4632,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, sb_dev, skb);
 
-		if (queue_index != new_index && sk &&
-		    sk_fullsock(sk) &&
+		if (sk && sk_fullsock(sk) &&
 		    rcu_access_pointer(sk->sk_dst_cache))
 			sk_tx_queue_set(sk, new_index);
 
-- 
cgit v1.2.3


From 378e6523ebb1e80b3955b7675cfe40b07028d085 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 14 Oct 2025 08:02:47 +0200
Subject: net: bcmgenet: remove unused platform code

This effectively reverts b0ba512e25d7 ("net: bcmgenet: enable driver to
work without a device tree"). There has never been an in-tree user of
struct bcmgenet_platform_data, all devices use OF or ACPI.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/108b4e64-55d4-4b4e-9a11-3c810c319d66@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                                    |  1 -
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 20 +++----
 drivers/net/ethernet/broadcom/genet/bcmmii.c   | 75 +-------------------------
 include/linux/platform_data/bcmgenet.h         | 19 -------
 4 files changed, 7 insertions(+), 108 deletions(-)
 delete mode 100644 include/linux/platform_data/bcmgenet.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3a27901781c2..4c4b519171f3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5123,7 +5123,6 @@ F:	Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml
 F:	drivers/net/ethernet/broadcom/genet/
 F:	drivers/net/ethernet/broadcom/unimac.h
 F:	drivers/net/mdio/mdio-bcm-unimac.c
-F:	include/linux/platform_data/bcmgenet.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 
 BROADCOM IPROC ARM ARCHITECTURE
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 98971ae4f87d..d99ef92feb82 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -35,7 +35,6 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/phy.h>
-#include <linux/platform_data/bcmgenet.h>
 
 #include <linux/unaligned.h>
 
@@ -3926,7 +3925,6 @@ MODULE_DEVICE_TABLE(of, bcmgenet_match);
 
 static int bcmgenet_probe(struct platform_device *pdev)
 {
-	struct bcmgenet_platform_data *pd = pdev->dev.platform_data;
 	const struct bcmgenet_plat_data *pdata;
 	struct bcmgenet_priv *priv;
 	struct net_device *dev;
@@ -4010,9 +4008,6 @@ static int bcmgenet_probe(struct platform_device *pdev)
 		priv->version = pdata->version;
 		priv->dma_max_burst_length = pdata->dma_max_burst_length;
 		priv->flags = pdata->flags;
-	} else {
-		priv->version = pd->genet_version;
-		priv->dma_max_burst_length = DMA_MAX_BURST_LENGTH;
 	}
 
 	priv->clk = devm_clk_get_optional(&priv->pdev->dev, "enet");
@@ -4062,16 +4057,13 @@ static int bcmgenet_probe(struct platform_device *pdev)
 	if (device_get_phy_mode(&pdev->dev) == PHY_INTERFACE_MODE_INTERNAL)
 		bcmgenet_power_up(priv, GENET_POWER_PASSIVE);
 
-	if (pd && !IS_ERR_OR_NULL(pd->mac_address))
-		eth_hw_addr_set(dev, pd->mac_address);
-	else
-		if (device_get_ethdev_address(&pdev->dev, dev))
-			if (has_acpi_companion(&pdev->dev)) {
-				u8 addr[ETH_ALEN];
+	if (device_get_ethdev_address(&pdev->dev, dev))
+		if (has_acpi_companion(&pdev->dev)) {
+			u8 addr[ETH_ALEN];
 
-				bcmgenet_get_hw_addr(priv, addr);
-				eth_hw_addr_set(dev, addr);
-			}
+			bcmgenet_get_hw_addr(priv, addr);
+			eth_hw_addr_set(dev, addr);
+		}
 
 	if (!is_valid_ether_addr(dev->dev_addr)) {
 		dev_warn(&pdev->dev, "using random Ethernet MAC\n");
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 573e8b279e52..38f854b94a79 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -20,7 +20,6 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
-#include <linux/platform_data/bcmgenet.h>
 #include <linux/platform_data/mdio-bcm-unimac.h>
 
 #include "bcmgenet.h"
@@ -436,23 +435,6 @@ static struct device_node *bcmgenet_mii_of_find_mdio(struct bcmgenet_priv *priv)
 	return priv->mdio_dn;
 }
 
-static void bcmgenet_mii_pdata_init(struct bcmgenet_priv *priv,
-				    struct unimac_mdio_pdata *ppd)
-{
-	struct device *kdev = &priv->pdev->dev;
-	struct bcmgenet_platform_data *pd = kdev->platform_data;
-
-	if (pd->phy_interface != PHY_INTERFACE_MODE_MOCA && pd->mdio_enabled) {
-		/*
-		 * Internal or external PHY with MDIO access
-		 */
-		if (pd->phy_address >= 0 && pd->phy_address < PHY_MAX_ADDR)
-			ppd->phy_mask = 1 << pd->phy_address;
-		else
-			ppd->phy_mask = 0;
-	}
-}
-
 static int bcmgenet_mii_wait(void *wait_func_data)
 {
 	struct bcmgenet_priv *priv = wait_func_data;
@@ -467,7 +449,6 @@ static int bcmgenet_mii_wait(void *wait_func_data)
 static int bcmgenet_mii_register(struct bcmgenet_priv *priv)
 {
 	struct platform_device *pdev = priv->pdev;
-	struct bcmgenet_platform_data *pdata = pdev->dev.platform_data;
 	struct device_node *dn = pdev->dev.of_node;
 	struct unimac_mdio_pdata ppd;
 	struct platform_device *ppdev;
@@ -511,8 +492,6 @@ static int bcmgenet_mii_register(struct bcmgenet_priv *priv)
 	ppdev->dev.parent = &pdev->dev;
 	if (dn)
 		ppdev->dev.of_node = bcmgenet_mii_of_find_mdio(priv);
-	else if (pdata)
-		bcmgenet_mii_pdata_init(priv, &ppd);
 	else
 		ppd.phy_mask = ~0;
 
@@ -594,58 +573,6 @@ static int bcmgenet_mii_of_init(struct bcmgenet_priv *priv)
 	return 0;
 }
 
-static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv)
-{
-	struct device *kdev = &priv->pdev->dev;
-	struct bcmgenet_platform_data *pd = kdev->platform_data;
-	char phy_name[MII_BUS_ID_SIZE + 3];
-	char mdio_bus_id[MII_BUS_ID_SIZE];
-	struct phy_device *phydev;
-
-	snprintf(mdio_bus_id, MII_BUS_ID_SIZE, "%s-%d",
-		 UNIMAC_MDIO_DRV_NAME, priv->pdev->id);
-
-	if (pd->phy_interface != PHY_INTERFACE_MODE_MOCA && pd->mdio_enabled) {
-		snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT,
-			 mdio_bus_id, pd->phy_address);
-
-		/*
-		 * Internal or external PHY with MDIO access
-		 */
-		phydev = phy_attach(priv->dev, phy_name, pd->phy_interface);
-		if (IS_ERR(phydev)) {
-			dev_err(kdev, "failed to register PHY device\n");
-			return PTR_ERR(phydev);
-		}
-	} else {
-		/*
-		 * MoCA port or no MDIO access.
-		 * Use fixed PHY to represent the link layer.
-		 */
-		struct fixed_phy_status fphy_status = {
-			.link = 1,
-			.speed = pd->phy_speed,
-			.duplex = pd->phy_duplex,
-			.pause = 0,
-			.asym_pause = 0,
-		};
-
-		phydev = fixed_phy_register(&fphy_status, NULL);
-		if (IS_ERR(phydev)) {
-			dev_err(kdev, "failed to register fixed PHY device\n");
-			return PTR_ERR(phydev);
-		}
-
-		/* Make sure we initialize MoCA PHYs with a link down */
-		phydev->link = 0;
-
-	}
-
-	priv->phy_interface = pd->phy_interface;
-
-	return 0;
-}
-
 static int bcmgenet_mii_bus_init(struct bcmgenet_priv *priv)
 {
 	struct device *kdev = &priv->pdev->dev;
@@ -656,7 +583,7 @@ static int bcmgenet_mii_bus_init(struct bcmgenet_priv *priv)
 	else if (has_acpi_companion(kdev))
 		return bcmgenet_phy_interface_init(priv);
 	else
-		return bcmgenet_mii_pd_init(priv);
+		return -EINVAL;
 }
 
 int bcmgenet_mii_init(struct net_device *dev)
diff --git a/include/linux/platform_data/bcmgenet.h b/include/linux/platform_data/bcmgenet.h
deleted file mode 100644
index d8f8738629d2..000000000000
--- a/include/linux/platform_data/bcmgenet.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_PLATFORM_DATA_BCMGENET_H__
-#define __LINUX_PLATFORM_DATA_BCMGENET_H__
-
-#include <linux/types.h>
-#include <linux/if_ether.h>
-#include <linux/phy.h>
-
-struct bcmgenet_platform_data {
-	bool		mdio_enabled;
-	phy_interface_t	phy_interface;
-	int		phy_address;
-	int		phy_speed;
-	int		phy_duplex;
-	u8		mac_address[ETH_ALEN];
-	int		genet_version;
-};
-
-#endif
-- 
cgit v1.2.3


From 44f5c8ec5b9ad8ed4ade08d727f803b2bb07f1c3 Mon Sep 17 00:00:00 2001
From: Ryan Newton <newton@meta.com>
Date: Wed, 15 Oct 2025 11:50:35 -0400
Subject: sched_ext: Add lockless peek operation for DSQs

The builtin DSQ queue data structures are meant to be used by a wide
range of different sched_ext schedulers with different demands on these
data structures. They might be per-cpu with low-contention, or
high-contention shared queues. Unfortunately, DSQs have a coarse-grained
lock around the whole data structure. Without going all the way to a
lock-free, more scalable implementation, a small step we can take to
reduce lock contention is to allow a lockless, small-fixed-cost peek at
the head of the queue.

This change allows certain custom SCX schedulers to cheaply peek at
queues, e.g. during load balancing, before locking them. But it
represents a few extra memory operations to update the pointer each
time the DSQ is modified, including a memory barrier on ARM so the write
appears correctly ordered.

This commit adds a first_task pointer field which is updated
atomically when the DSQ is modified, and allows any thread to peek at
the head of the queue without holding the lock.

Signed-off-by: Ryan Newton <newton@meta.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h                |  1 +
 kernel/sched/ext.c                       | 58 ++++++++++++++++++++++++++++++--
 tools/sched_ext/include/scx/common.bpf.h |  1 +
 tools/sched_ext/include/scx/compat.bpf.h | 18 ++++++++++
 4 files changed, 76 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 9848aeab2786..4713f374acc0 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -58,6 +58,7 @@ enum scx_dsq_id_flags {
  */
 struct scx_dispatch_q {
 	raw_spinlock_t		lock;
+	struct task_struct __rcu *first_task; /* lockless peek at head */
 	struct list_head	list;	/* tasks in dispatch order */
 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 430749ce46ab..f9c0888ef279 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -965,8 +965,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 				container_of(rbp, struct task_struct,
 					     scx.dsq_priq);
 			list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+			/* first task unchanged - no update needed */
 		} else {
 			list_add(&p->scx.dsq_list.node, &dsq->list);
+			/* not builtin and new task is at head - use fastpath */
+			rcu_assign_pointer(dsq->first_task, p);
 		}
 	} else {
 		/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
@@ -974,10 +977,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 			scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
 				  dsq->id);
 
-		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+		if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
 			list_add(&p->scx.dsq_list.node, &dsq->list);
-		else
+			/* new task inserted at head - use fastpath */
+			if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+				rcu_assign_pointer(dsq->first_task, p);
+		} else {
+			bool was_empty;
+
+			was_empty = list_empty(&dsq->list);
 			list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+			if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+				rcu_assign_pointer(dsq->first_task, p);
+		}
 	}
 
 	/* seq records the order tasks are queued, used by BPF DSQ iterator */
@@ -1032,6 +1044,13 @@ static void task_unlink_from_dsq(struct task_struct *p,
 		p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
 	}
 
+	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
+		struct task_struct *first_task;
+
+		first_task = nldsq_next_task(dsq, NULL, false);
+		rcu_assign_pointer(dsq->first_task, first_task);
+	}
+
 	list_del_init(&p->scx.dsq_list.node);
 	dsq_mod_nr(dsq, -1);
 }
@@ -6292,6 +6311,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
 	kit->dsq = NULL;
 }
 
+/**
+ * scx_bpf_dsq_peek - Lockless peek at the first element.
+ * @dsq_id: DSQ to examine.
+ *
+ * Read the first element in the DSQ. This is semantically equivalent to using
+ * the DSQ iterator, but is lockfree. Of course, like any lockless operation,
+ * this provides only a point-in-time snapshot, and the contents may change
+ * by the time any subsequent locking operation reads the queue.
+ *
+ * Returns the pointer, or NULL for an empty queue OR an internal error.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+{
+	struct scx_sched *sch;
+	struct scx_dispatch_q *dsq;
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return NULL;
+
+	if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
+		scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
+		return NULL;
+	}
+
+	dsq = find_user_dsq(sch, dsq_id);
+	if (unlikely(!dsq)) {
+		scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
+		return NULL;
+	}
+
+	return rcu_dereference(dsq->first_task);
+}
+
 __bpf_kfunc_end_defs();
 
 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -6851,6 +6904,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index eb3c99445cb3..e65b1eb668ea 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -74,6 +74,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index e487c10b5e07..619a16f0d39a 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -26,6 +26,24 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym
 	(bpf_ksym_exists(bpf_cpumask_populate) ?			\
 	 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
 
+/*
+ * v6.19: Introduce lockless peek API for user DSQs.
+ *
+ * Preserve the following helper until v6.21.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
+{
+	struct task_struct *p = NULL;
+	struct bpf_iter_scx_dsq it;
+
+	if (bpf_ksym_exists(scx_bpf_dsq_peek))
+		return scx_bpf_dsq_peek(dsq_id);
+	if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
+		p = bpf_iter_scx_dsq_next(&it);
+	bpf_iter_scx_dsq_destroy(&it);
+	return p;
+}
+
 /**
  * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
  * in a compatible way. We will preserve this __COMPAT helper until v6.16.
-- 
cgit v1.2.3


From e5b670e5439bda09ea7e3dd3dd32edb2f367c0d3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Oct 2025 14:06:05 +0000
Subject: net: remove obsolete WARN_ON(refcount_read(&sk->sk_refcnt) == 1)

sk->sk_refcnt has been converted to refcount_t in 2017.

__sock_put(sk) being refcount_dec(&sk->sk_refcnt), it will complain
loudly if the current refcnt is 1 (or less) in a non racy way.

We can remove four WARN_ON() in favor of the generic refcount_dec()
check.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Link: https://patch.msgid.link/20251014140605.2982703-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h       | 12 ++++--------
 net/netlink/af_netlink.c |  4 +---
 net/tipc/socket.c        |  4 +---
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index f0d00928db9e..30ac2eb4ef9b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -830,11 +830,9 @@ static inline bool sk_del_node_init(struct sock *sk)
 {
 	bool rc = __sk_del_node_init(sk);
 
-	if (rc) {
-		/* paranoid for a while -acme */
-		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+	if (rc)
 		__sock_put(sk);
-	}
+
 	return rc;
 }
 #define sk_del_node_init_rcu(sk)	sk_del_node_init(sk)
@@ -852,11 +850,9 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
 {
 	bool rc = __sk_nulls_del_node_init_rcu(sk);
 
-	if (rc) {
-		/* paranoid for a while -acme */
-		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+	if (rc)
 		__sock_put(sk);
-	}
+
 	return rc;
 }
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2b46c0cd752a..687a84c48882 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -596,10 +596,8 @@ static void netlink_remove(struct sock *sk)
 
 	table = &nl_table[sk->sk_protocol];
 	if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
-				    netlink_rhashtable_params)) {
-		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+				    netlink_rhashtable_params))
 		__sock_put(sk);
-	}
 
 	netlink_table_grab();
 	if (nlk_sk(sk)->subscriptions) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1574a83384f8..bc614a1f019c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3031,10 +3031,8 @@ static void tipc_sk_remove(struct tipc_sock *tsk)
 	struct sock *sk = &tsk->sk;
 	struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
 
-	if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) {
-		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+	if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params))
 		__sock_put(sk);
-	}
 }
 
 static const struct rhashtable_params tsk_rht_params = {
-- 
cgit v1.2.3


From e9139f765ac7048cadc9981e962acdf8b08eabf3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Oct 2024 13:43:43 +0100
Subject: sched: Employ sched_change guards

As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 include/linux/cleanup.h |   5 ++
 kernel/sched/core.c     | 159 +++++++++++++++++++-----------------------------
 kernel/sched/ext.c      |  39 ++++++------
 kernel/sched/sched.h    |  33 +++++++---
 kernel/sched/syscalls.c |  65 +++++++-------------
 5 files changed, 131 insertions(+), 170 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..ae381675455d 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label:                                                         \
 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name)		\
+	__DEFINE_CLASS_IS_CONDITIONAL(_name, false);	\
+	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{ return (void *)1; }
+
 #define __GUARD_IS_ERR(_ptr)                                       \
 	({                                                         \
 		unsigned long _rc = (__force unsigned long)(_ptr); \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 198d2dd45f59..eca40df4b6d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void)
  */
 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int prio, oldprio, queued, running, queue_flag =
+	int prio, oldprio, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	const struct sched_class *prev_class, *next_class;
 	struct rq_flags rf;
@@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, queue_flag);
-	if (running)
-		put_prev_task(rq, p);
-
-	/*
-	 * Boosting condition are:
-	 * 1. -rt task is running and holds mutex A
-	 *      --> -dl task blocks on mutex A
-	 *
-	 * 2. -dl task is running and holds mutex A
-	 *      --> -dl task blocks on mutex A and could preempt the
-	 *          running task
-	 */
-	if (dl_prio(prio)) {
-		if (!dl_prio(p->normal_prio) ||
-		    (pi_task && dl_prio(pi_task->prio) &&
-		     dl_entity_preempt(&pi_task->dl, &p->dl))) {
-			p->dl.pi_se = pi_task->dl.pi_se;
-			queue_flag |= ENQUEUE_REPLENISH;
+	scoped_guard (sched_change, p, queue_flag) {
+		/*
+		 * Boosting condition are:
+		 * 1. -rt task is running and holds mutex A
+		 *      --> -dl task blocks on mutex A
+		 *
+		 * 2. -dl task is running and holds mutex A
+		 *      --> -dl task blocks on mutex A and could preempt the
+		 *          running task
+		 */
+		if (dl_prio(prio)) {
+			if (!dl_prio(p->normal_prio) ||
+			    (pi_task && dl_prio(pi_task->prio) &&
+			     dl_entity_preempt(&pi_task->dl, &p->dl))) {
+				p->dl.pi_se = pi_task->dl.pi_se;
+				scope->flags |= ENQUEUE_REPLENISH;
+			} else {
+				p->dl.pi_se = &p->dl;
+			}
+		} else if (rt_prio(prio)) {
+			if (dl_prio(oldprio))
+				p->dl.pi_se = &p->dl;
+			if (oldprio < prio)
+				scope->flags |= ENQUEUE_HEAD;
 		} else {
-			p->dl.pi_se = &p->dl;
+			if (dl_prio(oldprio))
+				p->dl.pi_se = &p->dl;
+			if (rt_prio(oldprio))
+				p->rt.timeout = 0;
 		}
-	} else if (rt_prio(prio)) {
-		if (dl_prio(oldprio))
-			p->dl.pi_se = &p->dl;
-		if (oldprio < prio)
-			queue_flag |= ENQUEUE_HEAD;
-	} else {
-		if (dl_prio(oldprio))
-			p->dl.pi_se = &p->dl;
-		if (rt_prio(oldprio))
-			p->rt.timeout = 0;
-	}
 
-	p->sched_class = next_class;
-	p->prio = prio;
+		p->sched_class = next_class;
+		p->prio = prio;
 
-	check_class_changing(rq, p, prev_class);
-
-	if (queued)
-		enqueue_task(rq, p, queue_flag);
-	if (running)
-		set_next_task(rq, p);
+		check_class_changing(rq, p, prev_class);
+	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
-	bool queued, running;
-	struct rq_flags rf;
-	struct rq *rq;
-
-	rq = task_rq_lock(p, &rf);
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-
-	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE);
-	if (running)
-		put_prev_task(rq, p);
-
-	p->numa_preferred_nid = nid;
-
-	if (queued)
-		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-	if (running)
-		set_next_task(rq, p);
-	task_rq_unlock(rq, p, &rf);
+	guard(task_rq_lock)(p);
+	scoped_guard (sched_change, p, DEQUEUE_SAVE)
+		p->numa_preferred_nid = nid;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -9205,8 +9178,9 @@ static void sched_change_group(struct task_struct *tsk)
  */
 void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
-	int queued, running, queue_flags =
+	unsigned int queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	bool resched = false;
 	struct rq *rq;
 
 	CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9214,29 +9188,16 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 
 	update_rq_clock(rq);
 
-	running = task_current_donor(rq, tsk);
-	queued = task_on_rq_queued(tsk);
-
-	if (queued)
-		dequeue_task(rq, tsk, queue_flags);
-	if (running)
-		put_prev_task(rq, tsk);
-
-	sched_change_group(tsk);
-	if (!for_autogroup)
-		scx_cgroup_move_task(tsk);
+	scoped_guard (sched_change, tsk, queue_flags) {
+		sched_change_group(tsk);
+		if (!for_autogroup)
+			scx_cgroup_move_task(tsk);
+		if (scope->running)
+			resched = true;
+	}
 
-	if (queued)
-		enqueue_task(rq, tsk, queue_flags);
-	if (running) {
-		set_next_task(rq, tsk);
-		/*
-		 * After changing group, the running task may have joined a
-		 * throttled one but it's still the running task. Trigger a
-		 * resched to make sure that task can still run.
-		 */
+	if (resched)
 		resched_curr(rq);
-	}
 }
 
 static struct cgroup_subsys_state *
@@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struct *t)
 }
 #endif /* CONFIG_SCHED_MM_CID */
 
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-			    struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
 {
+	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
 	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
 
-	*ctx = (struct sched_enq_and_set_ctx){
+	*ctx = (struct sched_change_ctx){
 		.p = p,
-		.queue_flags = queue_flags,
+		.flags = flags,
 		.queued = task_on_rq_queued(p),
-		.running = task_current(rq, p),
+		.running = task_current_donor(rq, p),
 	};
 
-	update_rq_clock(rq);
 	if (ctx->queued)
-		dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+		dequeue_task(rq, p, flags);
 	if (ctx->running)
 		put_prev_task(rq, p);
+
+	return ctx;
 }
 
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
 {
-	struct rq *rq = task_rq(ctx->p);
+	struct task_struct *p = ctx->p;
+	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
 
 	if (ctx->queued)
-		enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 	if (ctx->running)
-		set_next_task(rq, ctx->p);
+		set_next_task(rq, p);
 }
-#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d07..4566a7c81360 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass)
 		 */
 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
 						 scx.runnable_node) {
-			struct sched_enq_and_set_ctx ctx;
-
 			/* cycling deq/enq is enough, see the function comment */
-			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-			sched_enq_and_set_task(&ctx);
+			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+				/* nothing */ ;
+			}
 		}
 
 		/* resched to restore ticks and idle state */
@@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kthread_work *work)
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		update_rq_clock(task_rq(p));
 
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+			p->sched_class = new_class;
+			check_class_changing(task_rq(p), p, old_class);
+		}
 
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 		scx_exit_task(p);
@@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
 
 		if (!tryget_task_struct(p))
 			continue;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+		update_rq_clock(task_rq(p));
 
-		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+			p->scx.slice = SCX_SLICE_DFL;
+			p->sched_class = new_class;
+			check_class_changing(task_rq(p), p, old_class);
+		}
 
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 		put_task_struct(p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d07067f60..6546849aa075 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3885,23 +3885,38 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
 
-#ifdef CONFIG_SCHED_CLASS_EXT
 /*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(). Ensuring
+ * the task's queueing state is idempotent across the operation.
  */
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
 	struct task_struct	*p;
-	int			queue_flags;
+	int			flags;
 	bool			queued;
 	bool			running;
 };
 
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-			    struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
 
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+	     sched_change_end(_T),
+	     sched_change_begin(p, flags),
+	     struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
 
 #include "ext.h"
 
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f36e84..09ffe91410b1 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p)
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	bool queued, running;
 	struct rq *rq;
 	int old_prio;
 
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p, long nice)
 		return;
 	}
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
-	if (running)
-		put_prev_task(rq, p);
-
-	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p, true);
-	old_prio = p->prio;
-	p->prio = effective_prio(p);
-
-	if (queued)
-		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-	if (running)
-		set_next_task(rq, p);
+	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		set_load_weight(p, true);
+		old_prio = p->prio;
+		p->prio = effective_prio(p);
+	}
 
 	/*
 	 * If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_struct *p,
 			 bool user, bool pi)
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
-	int retval, oldprio, newprio, queued, running;
+	int retval, oldprio, newprio;
 	const struct sched_class *prev_class, *next_class;
 	struct balance_callback *head;
 	struct rq_flags rf;
@@ -698,33 +687,25 @@ change:
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, queue_flags);
-	if (running)
-		put_prev_task(rq, p);
-
-	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
-		__setscheduler_params(p, attr);
-		p->sched_class = next_class;
-		p->prio = newprio;
-	}
-	__setscheduler_uclamp(p, attr);
-	check_class_changing(rq, p, prev_class);
+	scoped_guard (sched_change, p, queue_flags) {
 
-	if (queued) {
-		/*
-		 * We enqueue to tail when the priority of a task is
-		 * increased (user space view).
-		 */
-		if (oldprio < p->prio)
-			queue_flags |= ENQUEUE_HEAD;
+		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+			__setscheduler_params(p, attr);
+			p->sched_class = next_class;
+			p->prio = newprio;
+		}
+		__setscheduler_uclamp(p, attr);
+		check_class_changing(rq, p, prev_class);
 
-		enqueue_task(rq, p, queue_flags);
+		if (scope->queued) {
+			/*
+			 * We enqueue to tail when the priority of a task is
+			 * increased (user space view).
+			 */
+			if (oldprio < p->prio)
+				scope->flags |= ENQUEUE_HEAD;
+		}
 	}
-	if (running)
-		set_next_task(rq, p);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 
-- 
cgit v1.2.3


From b079d93796528053cde322f2ca838c2d21c297e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 10 Sep 2025 10:08:05 +0200
Subject: sched: Rename do_set_cpus_allowed()

Hopefully saner naming.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 include/linux/sched.h  |  4 ++--
 kernel/cgroup/cpuset.c |  2 +-
 kernel/kthread.c       |  4 ++--
 kernel/sched/core.c    | 16 ++++++++--------
 kernel/sched/sched.h   |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..77426c347cff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);
 
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
 
 /**
  * set_cpus_allowed_ptr - set CPU affinity mask of a task
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2c178a..185e820cd1df 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs_mask = task_cs(tsk)->cpus_allowed;
 	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
-		do_set_cpus_allowed(tsk, cs_mask);
+		set_cpus_allowed_force(tsk, cs_mask);
 		changed = true;
 	}
 	rcu_read_unlock();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 832bd2afecc6..99a3808d086f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -599,7 +599,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
 	}
 
 	scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
-		do_set_cpus_allowed(p, mask);
+		set_cpus_allowed_force(p, mask);
 
 	/* It's safe because the task is inactive. */
 	p->flags |= PF_NO_SETAFFINITY;
@@ -880,7 +880,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
 	kthread_fetch_affinity(kthread, affinity);
 
 	scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
-		do_set_cpus_allowed(p, affinity);
+		set_cpus_allowed_force(p, affinity);
 
 	mutex_unlock(&kthreads_hotplug_lock);
 out:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 805e65007e62..638bffd4c1a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2331,7 +2331,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 }
 
 static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
 
 static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
 {
@@ -2348,7 +2348,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
 
 	scoped_guard (task_rq_lock, p) {
 		update_rq_clock(scope.rq);
-		__do_set_cpus_allowed(p, &ac);
+		do_set_cpus_allowed(p, &ac);
 	}
 }
 
@@ -2662,7 +2662,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
 }
 
 static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -2692,7 +2692,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
  * Used for kthread_bind() and select_fallback_rq(), in both cases the user
  * affinity (if any) should be destroyed too.
  */
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct affinity_context ac = {
 		.new_mask  = new_mask,
@@ -2706,7 +2706,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 
 	scoped_guard (__task_rq_lock, p) {
 		update_rq_clock(scope.rq);
-		__do_set_cpus_allowed(p, &ac);
+		do_set_cpus_allowed(p, &ac);
 	}
 
 	/*
@@ -2745,7 +2745,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
 	 * Use pi_lock to protect content of user_cpus_ptr
 	 *
 	 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
-	 * do_set_cpus_allowed().
+	 * set_cpus_allowed_force().
 	 */
 	raw_spin_lock_irqsave(&src->pi_lock, flags);
 	if (src->user_cpus_ptr) {
@@ -3073,7 +3073,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 		goto out;
 	}
 
-	__do_set_cpus_allowed(p, ctx);
+	do_set_cpus_allowed(p, ctx);
 
 	return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
 
@@ -3482,7 +3482,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			}
 			fallthrough;
 		case possible:
-			do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+			set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
 			state = fail;
 			break;
 		case fail:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b23ce9c77611..ea2ea8fd6505 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2617,7 +2617,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
 static inline cpumask_t *alloc_user_cpus_ptr(int node)
 {
 	/*
-	 * See do_set_cpus_allowed() above for the rcu_head usage.
+	 * See set_cpus_allowed_force() above for the rcu_head usage.
 	 */
 	int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
 
-- 
cgit v1.2.3


From 50653216e4ff7a74c95b2ee9ec439916875556ec Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelagnelf@nvidia.com>
Date: Sat, 9 Aug 2025 14:47:50 -0400
Subject: sched: Add support to pick functions to take rf

Some pick functions like the internal pick_next_task_fair() already take
rf but some others don't. We need this for scx's server pick function.
Prepare for this by having pick functions accept it.

[peterz: - added RETRY_TASK handling
         - removed pick_next_task_fair indirection]
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h    |  7 ++-----
 kernel/sched/core.c      | 35 ++++++++++++++++++++++++++---------
 kernel/sched/deadline.c  |  8 ++++----
 kernel/sched/ext.c       |  2 +-
 kernel/sched/fair.c      | 26 ++++++--------------------
 kernel/sched/idle.c      |  2 +-
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     | 10 ++++++----
 kernel/sched/stop_task.c |  2 +-
 9 files changed, 48 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77426c347cff..07576479c0ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -637,8 +637,8 @@ struct sched_rt_entity {
 #endif
 } __randomize_layout;
 
-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);
 
 struct sched_dl_entity {
 	struct rb_node			rb_node;
@@ -730,9 +730,6 @@ struct sched_dl_entity {
 	 * dl_server_update().
 	 *
 	 * @rq the runqueue this server is for
-	 *
-	 * @server_has_tasks() returns true if @server_pick return a
-	 * runnable task.
 	 */
 	struct rq			*rq;
 	dl_server_pick_f		server_pick_task;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9fc990ff6845..a75d45680f9c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5901,7 +5901,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 		/* Assume the next prioritized class is idle_sched_class */
 		if (!p) {
-			p = pick_task_idle(rq);
+			p = pick_task_idle(rq, rf);
 			put_prev_set_next_task(rq, prev, p);
 		}
 
@@ -5913,11 +5913,15 @@ restart:
 
 	for_each_active_class(class) {
 		if (class->pick_next_task) {
-			p = class->pick_next_task(rq, prev);
+			p = class->pick_next_task(rq, prev, rf);
+			if (unlikely(p == RETRY_TASK))
+				goto restart;
 			if (p)
 				return p;
 		} else {
-			p = class->pick_task(rq);
+			p = class->pick_task(rq, rf);
+			if (unlikely(p == RETRY_TASK))
+				goto restart;
 			if (p) {
 				put_prev_set_next_task(rq, prev, p);
 				return p;
@@ -5947,7 +5951,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
 	return a->core_cookie == b->core_cookie;
 }
 
-static inline struct task_struct *pick_task(struct rq *rq)
+/*
+ * Careful; this can return RETRY_TASK, it does not include the retry-loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -5955,7 +5963,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
 	rq->dl_server = NULL;
 
 	for_each_active_class(class) {
-		p = class->pick_task(rq);
+		p = class->pick_task(rq, rf);
 		if (p)
 			return p;
 	}
@@ -5970,7 +5978,7 @@ static void queue_core_balance(struct rq *rq);
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
-	struct task_struct *next, *p, *max = NULL;
+	struct task_struct *next, *p, *max;
 	const struct cpumask *smt_mask;
 	bool fi_before = false;
 	bool core_clock_updated = (rq == rq->core);
@@ -6055,7 +6063,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * and there are no cookied tasks running on siblings.
 	 */
 	if (!need_sync) {
-		next = pick_task(rq);
+restart_single:
+		next = pick_task(rq, rf);
+		if (unlikely(next == RETRY_TASK))
+			goto restart_single;
 		if (!next->core_cookie) {
 			rq->core_pick = NULL;
 			rq->core_dl_server = NULL;
@@ -6075,6 +6086,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 *
 	 * Tie-break prio towards the current CPU
 	 */
+restart_multi:
+	max = NULL;
 	for_each_cpu_wrap(i, smt_mask, cpu) {
 		rq_i = cpu_rq(i);
 
@@ -6086,7 +6099,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
 			update_rq_clock(rq_i);
 
-		rq_i->core_pick = p = pick_task(rq_i);
+		p = pick_task(rq_i, rf);
+		if (unlikely(p == RETRY_TASK))
+			goto restart_multi;
+
+		rq_i->core_pick = p;
 		rq_i->core_dl_server = rq_i->dl_server;
 
 		if (!max || prio_less(max, p, fi_before))
@@ -6108,7 +6125,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			if (cookie)
 				p = sched_core_find(rq_i, cookie);
 			if (!p)
-				p = idle_sched_class.pick_task(rq_i);
+				p = idle_sched_class.pick_task(rq_i, rf);
 		}
 
 		rq_i->core_pick = p;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 83e6175d79f5..48357d4609bf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2352,7 +2352,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
  * __pick_next_task_dl - Helper to pick the next -deadline task to run.
  * @rq: The runqueue to pick the next task from.
  */
-static struct task_struct *__pick_task_dl(struct rq *rq)
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
 {
 	struct sched_dl_entity *dl_se;
 	struct dl_rq *dl_rq = &rq->dl;
@@ -2366,7 +2366,7 @@ again:
 	WARN_ON_ONCE(!dl_se);
 
 	if (dl_server(dl_se)) {
-		p = dl_se->server_pick_task(dl_se);
+		p = dl_se->server_pick_task(dl_se, rf);
 		if (!p) {
 			dl_server_stop(dl_se);
 			goto again;
@@ -2379,9 +2379,9 @@ again:
 	return p;
 }
 
-static struct task_struct *pick_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
 {
-	return __pick_task_dl(rq);
+	return __pick_task_dl(rq, rf);
 }
 
 static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 949c3a6e24d4..dc743cac59cb 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2332,7 +2332,7 @@ static struct task_struct *first_local_task(struct rq *rq)
 					struct task_struct, scx.dsq_list.node);
 }
 
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23ac05cca4a4..2554055c1ba1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8705,15 +8705,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
 	set_task_max_allowed_capacity(p);
 }
 
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
-	if (sched_fair_runnable(rq))
-		return 1;
-
-	return sched_balance_newidle(rq, rf) != 0;
-}
-
 static void set_next_buddy(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
@@ -8822,7 +8813,7 @@ preempt:
 	resched_curr_lazy(rq);
 }
 
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
@@ -8866,7 +8857,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	int new_tasks;
 
 again:
-	p = pick_task_fair(rq);
+	p = pick_task_fair(rq, rf);
 	if (!p)
 		goto idle;
 	se = &p->se;
@@ -8945,14 +8936,10 @@ idle:
 	return NULL;
 }
 
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
-{
-	return pick_next_task_fair(rq, prev, NULL);
-}
-
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
 {
-	return pick_task_fair(dl_se->rq);
+	return pick_task_fair(dl_se->rq, rf);
 }
 
 void fair_server_init(struct rq *rq)
@@ -13644,11 +13631,10 @@ DEFINE_SCHED_CLASS(fair) = {
 	.wakeup_preempt		= check_preempt_wakeup_fair,
 
 	.pick_task		= pick_task_fair,
-	.pick_next_task		= __pick_next_task_fair,
+	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task          = set_next_task_fair,
 
-	.balance		= balance_fair,
 	.select_task_rq		= select_task_rq_fair,
 	.migrate_task_rq	= migrate_task_rq_fair,
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 055b0ddbcd54..7fa0b593bcff 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -466,7 +466,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
 	next->se.exec_start = rq_clock_task(rq);
 }
 
-struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
 {
 	scx_update_idle(rq, true, false);
 	return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9bc828d59121..1fd97f2d7ec6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return rt_task_of(rt_se);
 }
 
-static struct task_struct *pick_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
 {
 	struct task_struct *p;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f4a323007dce..8946294929a4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2470,7 +2470,7 @@ struct sched_class {
 	/*
 	 * schedule/pick_next_task: rq->lock
 	 */
-	struct task_struct *(*pick_task)(struct rq *rq);
+	struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
 	/*
 	 * Optional! When implemented pick_next_task() should be equivalent to:
 	 *
@@ -2480,7 +2480,8 @@ struct sched_class {
 	 *       set_next_task_first(next);
 	 *   }
 	 */
-	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+					      struct rq_flags *rf);
 
 	/*
 	 * sched_change:
@@ -2707,8 +2708,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
 	return rq->cfs.nr_queued > 0;
 }
 
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_task_idle(struct rq *rq);
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+					       struct rq_flags *rf);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
 
 #define SCA_CHECK		0x01
 #define SCA_MIGRATE_DISABLE	0x02
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d98c453c9b4e..4f9192be4b5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
 	stop->se.exec_start = rq_clock_task(rq);
 }
 
-static struct task_struct *pick_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
 {
 	if (!sched_stop_runnable(rq))
 		return NULL;
-- 
cgit v1.2.3


From 25937d399be2ee9852103a41aaca42d91b140d79 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 9 Oct 2025 18:10:39 +0200
Subject: dt-bindings: power: Add power domain IDs for Tegra264

Add the set of power domain IDs available on the Tegra264 SoC so that
they can be used in device tree files.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/power/nvidia,tegra264-bpmp.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 include/dt-bindings/power/nvidia,tegra264-bpmp.h

(limited to 'include')

diff --git a/include/dt-bindings/power/nvidia,tegra264-bpmp.h b/include/dt-bindings/power/nvidia,tegra264-bpmp.h
new file mode 100644
index 000000000000..2eef4a2a02b0
--- /dev/null
+++ b/include/dt-bindings/power/nvidia,tegra264-bpmp.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)  */
+/* Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved. */
+
+#ifndef DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H
+#define DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H
+
+#define TEGRA264_POWER_DOMAIN_DISP	1
+#define TEGRA264_POWER_DOMAIN_AUD	2
+/* reserved 3:9 */
+#define TEGRA264_POWER_DOMAIN_XUSB_SS	10
+#define TEGRA264_POWER_DOMAIN_XUSB_DEV	11
+#define TEGRA264_POWER_DOMAIN_XUSB_HOST	12
+#define TEGRA264_POWER_DOMAIN_MGBE0	13
+#define TEGRA264_POWER_DOMAIN_MGBE1	14
+#define TEGRA264_POWER_DOMAIN_MGBE2	15
+#define TEGRA264_POWER_DOMAIN_MGBE3	16
+#define TEGRA264_POWER_DOMAIN_VI	17
+#define TEGRA264_POWER_DOMAIN_VIC	18
+#define TEGRA264_POWER_DOMAIN_ISP0	19
+#define TEGRA264_POWER_DOMAIN_ISP1	20
+#define TEGRA264_POWER_DOMAIN_PVA0	21
+#define TEGRA264_POWER_DOMAIN_GPU	22
+
+#endif /* DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H */
-- 
cgit v1.2.3


From ae495810cffe29c3c30a757bd48b0bb035fc3098 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 14 Oct 2025 18:53:53 +0300
Subject: gpio: regmap: add the .fixed_direction_output configuration parameter

There are GPIO controllers such as the one present in the LX2160ARDB
QIXIS FPGA which have fixed-direction input and output GPIO lines mixed
together in a single register. This cannot be modeled using the
gpio-regmap as-is since there is no way to present the true direction of
a GPIO line.

In order to make this use case possible, add a new configuration
parameter - fixed_direction_output - into the gpio_regmap_config
structure. This will enable user drivers to provide a bitmap that
represents the fixed direction of the GPIO lines.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Michael Walle <mwalle@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpio-regmap.c  | 26 ++++++++++++++++++++++++--
 include/linux/gpio/regmap.h |  5 +++++
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index ab9e4077fa60..f4267af00027 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -31,6 +31,7 @@ struct gpio_regmap {
 	unsigned int reg_clr_base;
 	unsigned int reg_dir_in_base;
 	unsigned int reg_dir_out_base;
+	unsigned long *fixed_direction_output;
 
 #ifdef CONFIG_REGMAP_IRQ
 	int regmap_irq_line;
@@ -134,6 +135,13 @@ static int gpio_regmap_get_direction(struct gpio_chip *chip,
 	unsigned int base, val, reg, mask;
 	int invert, ret;
 
+	if (gpio->fixed_direction_output) {
+		if (test_bit(offset, gpio->fixed_direction_output))
+			return GPIO_LINE_DIRECTION_OUT;
+		else
+			return GPIO_LINE_DIRECTION_IN;
+	}
+
 	if (gpio->reg_dat_base && !gpio->reg_set_base)
 		return GPIO_LINE_DIRECTION_IN;
 	if (gpio->reg_set_base && !gpio->reg_dat_base)
@@ -284,6 +292,17 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 			goto err_free_gpio;
 	}
 
+	if (config->fixed_direction_output) {
+		gpio->fixed_direction_output = bitmap_alloc(chip->ngpio,
+							    GFP_KERNEL);
+		if (!gpio->fixed_direction_output) {
+			ret = -ENOMEM;
+			goto err_free_gpio;
+		}
+		bitmap_copy(gpio->fixed_direction_output,
+			    config->fixed_direction_output, chip->ngpio);
+	}
+
 	/* if not set, assume there is only one register */
 	gpio->ngpio_per_reg = config->ngpio_per_reg;
 	if (!gpio->ngpio_per_reg)
@@ -300,7 +319,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 
 	ret = gpiochip_add_data(chip, gpio);
 	if (ret < 0)
-		goto err_free_gpio;
+		goto err_free_bitmap;
 
 #ifdef CONFIG_REGMAP_IRQ
 	if (config->regmap_irq_chip) {
@@ -309,7 +328,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 						 config->regmap_irq_line, config->regmap_irq_flags,
 						 0, config->regmap_irq_chip, &gpio->irq_chip_data);
 		if (ret)
-			goto err_free_gpio;
+			goto err_free_bitmap;
 
 		irq_domain = regmap_irq_get_domain(gpio->irq_chip_data);
 	} else
@@ -326,6 +345,8 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 
 err_remove_gpiochip:
 	gpiochip_remove(chip);
+err_free_bitmap:
+	bitmap_free(gpio->fixed_direction_output);
 err_free_gpio:
 	kfree(gpio);
 	return ERR_PTR(ret);
@@ -344,6 +365,7 @@ void gpio_regmap_unregister(struct gpio_regmap *gpio)
 #endif
 
 	gpiochip_remove(&gpio->gpio_chip);
+	bitmap_free(gpio->fixed_direction_output);
 	kfree(gpio);
 }
 EXPORT_SYMBOL_GPL(gpio_regmap_unregister);
diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index 622a2939ebe0..87983a5f3681 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -38,6 +38,10 @@ struct regmap;
  *			offset to a register/bitmask pair. If not
  *			given the default gpio_regmap_simple_xlate()
  *			is used.
+ * @fixed_direction_output:
+ *			(Optional) Bitmap representing the fixed direction of
+ *			the GPIO lines. Useful when there are GPIO lines with a
+ *			fixed direction mixed together in the same register.
  * @drvdata:		(Optional) Pointer to driver specific data which is
  *			not used by gpio-remap but is provided "as is" to the
  *			driver callback(s).
@@ -85,6 +89,7 @@ struct gpio_regmap_config {
 	int reg_stride;
 	int ngpio_per_reg;
 	struct irq_domain *irq_domain;
+	unsigned long *fixed_direction_output;
 
 #ifdef CONFIG_REGMAP_IRQ
 	struct regmap_irq_chip *regmap_irq_chip;
-- 
cgit v1.2.3


From eba11116f39533d2e38cc5898014f2c95f32d23a Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 13 Oct 2025 15:07:15 +0200
Subject: gpiolib: of: Get rid of <linux/gpio/legacy-of-mm-gpiochip.h>

The last user of linux/gpio/legacy-of-mm-gpiochip.h is gone.

Remove linux/gpio/legacy-of-mm-gpiochip.h and
CONFIG_OF_GPIO_MM_GPIOCHIP.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/Kconfig                       |  8 ---
 drivers/gpio/TODO                          | 11 -----
 drivers/gpio/gpiolib-of.c                  | 79 ------------------------------
 include/linux/gpio/legacy-of-mm-gpiochip.h | 36 --------------
 4 files changed, 134 deletions(-)
 delete mode 100644 include/linux/gpio/legacy-of-mm-gpiochip.h

(limited to 'include')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 154d31b75070..ce237398fa00 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -42,14 +42,6 @@ config GPIOLIB_IRQCHIP
 	select IRQ_DOMAIN
 	bool
 
-config OF_GPIO_MM_GPIOCHIP
-	bool
-	help
-	  This adds support for the legacy 'struct of_mm_gpio_chip' interface
-	  from PowerPC. Existing drivers using this interface need to select
-	  this symbol, but new drivers should use the generic gpio-regmap
-	  infrastructure instead.
-
 config DEBUG_GPIO
 	bool "Debug GPIO calls"
 	depends on DEBUG_KERNEL
diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO
index 8ed74e05903a..5acaeab029ec 100644
--- a/drivers/gpio/TODO
+++ b/drivers/gpio/TODO
@@ -86,17 +86,6 @@ Work items:
 
 -------------------------------------------------------------------------------
 
-Get rid of <linux/gpio/legacy-of-mm-gpiochip.h>
-
-Work items:
-
-- Get rid of struct of_mm_gpio_chip altogether: use the generic  MMIO
-  GPIO for all current users (see below). Delete struct of_mm_gpio_chip,
-  to_of_mm_gpio_chip(), of_mm_gpiochip_add_data(), of_mm_gpiochip_remove(),
-  CONFIG_OF_GPIO_MM_GPIOCHIP from the kernel.
-
--------------------------------------------------------------------------------
-
 Collect drivers
 
 Collect GPIO drivers from arch/* and other places that should be placed
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index fad4edf9cc5c..8657379e9165 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -1031,85 +1031,6 @@ static int of_gpio_threecell_xlate(struct gpio_chip *gc,
 	return gpiospec->args[1];
 }
 
-#if IS_ENABLED(CONFIG_OF_GPIO_MM_GPIOCHIP)
-#include <linux/gpio/legacy-of-mm-gpiochip.h>
-/**
- * of_mm_gpiochip_add_data - Add memory mapped GPIO chip (bank)
- * @np:		device node of the GPIO chip
- * @mm_gc:	pointer to the of_mm_gpio_chip allocated structure
- * @data:	driver data to store in the struct gpio_chip
- *
- * To use this function you should allocate and fill mm_gc with:
- *
- * 1) In the gpio_chip structure:
- *    - all the callbacks
- *    - of_gpio_n_cells
- *    - of_xlate callback (optional)
- *
- * 3) In the of_mm_gpio_chip structure:
- *    - save_regs callback (optional)
- *
- * If succeeded, this function will map bank's memory and will
- * do all necessary work for you. Then you'll able to use .regs
- * to manage GPIOs from the callbacks.
- *
- * Returns:
- * 0 on success, or negative errno on failure.
- */
-int of_mm_gpiochip_add_data(struct device_node *np,
-			    struct of_mm_gpio_chip *mm_gc,
-			    void *data)
-{
-	int ret = -ENOMEM;
-	struct gpio_chip *gc = &mm_gc->gc;
-
-	gc->label = kasprintf(GFP_KERNEL, "%pOF", np);
-	if (!gc->label)
-		goto err0;
-
-	mm_gc->regs = of_iomap(np, 0);
-	if (!mm_gc->regs)
-		goto err1;
-
-	gc->base = -1;
-
-	if (mm_gc->save_regs)
-		mm_gc->save_regs(mm_gc);
-
-	fwnode_handle_put(mm_gc->gc.fwnode);
-	mm_gc->gc.fwnode = fwnode_handle_get(of_fwnode_handle(np));
-
-	ret = gpiochip_add_data(gc, data);
-	if (ret)
-		goto err2;
-
-	return 0;
-err2:
-	of_node_put(np);
-	iounmap(mm_gc->regs);
-err1:
-	kfree(gc->label);
-err0:
-	pr_err("%pOF: GPIO chip registration failed with status %d\n", np, ret);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(of_mm_gpiochip_add_data);
-
-/**
- * of_mm_gpiochip_remove - Remove memory mapped GPIO chip (bank)
- * @mm_gc:	pointer to the of_mm_gpio_chip allocated structure
- */
-void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc)
-{
-	struct gpio_chip *gc = &mm_gc->gc;
-
-	gpiochip_remove(gc);
-	iounmap(mm_gc->regs);
-	kfree(gc->label);
-}
-EXPORT_SYMBOL_GPL(of_mm_gpiochip_remove);
-#endif
-
 #ifdef CONFIG_PINCTRL
 static int of_gpiochip_add_pin_range(struct gpio_chip *chip)
 {
diff --git a/include/linux/gpio/legacy-of-mm-gpiochip.h b/include/linux/gpio/legacy-of-mm-gpiochip.h
deleted file mode 100644
index 2e2bd3b19cc3..000000000000
--- a/include/linux/gpio/legacy-of-mm-gpiochip.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * OF helpers for the old of_mm_gpio_chip, used on ppc32 and nios2,
- * do not use in new code.
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- */
-
-#ifndef __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H
-#define __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H
-
-#include <linux/gpio/driver.h>
-#include <linux/of.h>
-
-/*
- * OF GPIO chip for memory mapped banks
- */
-struct of_mm_gpio_chip {
-	struct gpio_chip gc;
-	void (*save_regs)(struct of_mm_gpio_chip *mm_gc);
-	void __iomem *regs;
-};
-
-static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
-{
-	return container_of(gc, struct of_mm_gpio_chip, gc);
-}
-
-extern int of_mm_gpiochip_add_data(struct device_node *np,
-				   struct of_mm_gpio_chip *mm_gc,
-				   void *data);
-extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc);
-
-#endif /* __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H */
-- 
cgit v1.2.3


From 1e3e330c07076a0582385bbea029c9cc918fa30d Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 13 Oct 2025 11:46:11 +0200
Subject: irqchip: Pass platform device to platform drivers

The IRQCHIP_PLATFORM_DRIVER macros can be used to convert OF irqchip
drivers to platform drivers but currently reuse the OF init callback
prototype that only takes OF nodes as arguments. This forces drivers to
do reverse lookups of their struct devices during probe if they need
them for things like dev_printk() and device managed resources.

Half of the drivers doing reverse lookups also currently fail to release
the additional reference taken during the lookup, while other drivers
have had the reference leak plugged in various ways (e.g. using
non-intuitive cleanup constructs which still confuse static checkers).

Switch to using a probe callback that takes a platform device as its
first argument to simplify drivers and plug the remaining (mostly
benign) reference leaks.

Fixes: 32c6c054661a ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller")
Fixes: 70afdab904d2 ("irqchip: Add IMX MU MSI controller driver")
Fixes: a6199bb514d8 ("irqchip: Add Qualcomm MPM controller driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Changhuang Liang <changhuang.liang@starfivetech.com>
---
 drivers/irqchip/irq-bcm2712-mip.c          | 10 +++------
 drivers/irqchip/irq-bcm7038-l1.c           |  5 +++--
 drivers/irqchip/irq-bcm7120-l2.c           | 20 +++++------------
 drivers/irqchip/irq-brcmstb-l2.c           | 21 +++++++++---------
 drivers/irqchip/irq-imx-mu-msi.c           | 24 ++++++++++----------
 drivers/irqchip/irq-mchp-eic.c             |  5 +++--
 drivers/irqchip/irq-meson-gpio.c           |  5 +++--
 drivers/irqchip/irq-qcom-mpm.c             |  6 ++---
 drivers/irqchip/irq-renesas-rzg2l.c        | 35 +++++++++---------------------
 drivers/irqchip/irq-renesas-rzv2h.c        | 32 ++++++++-------------------
 drivers/irqchip/irq-starfive-jh8100-intc.c |  5 +++--
 drivers/irqchip/irqchip.c                  |  6 ++---
 drivers/irqchip/qcom-pdc.c                 |  5 +++--
 include/linux/irqchip.h                    |  8 ++++++-
 14 files changed, 78 insertions(+), 109 deletions(-)

(limited to 'include')

diff --git a/drivers/irqchip/irq-bcm2712-mip.c b/drivers/irqchip/irq-bcm2712-mip.c
index 8466646e5a2d..4761974ad650 100644
--- a/drivers/irqchip/irq-bcm2712-mip.c
+++ b/drivers/irqchip/irq-bcm2712-mip.c
@@ -232,16 +232,12 @@ err_put:
 	return ret;
 }
 
-static int mip_of_msi_init(struct device_node *node, struct device_node *parent)
+static int mip_msi_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	struct platform_device *pdev;
+	struct device_node *node = pdev->dev.of_node;
 	struct mip_priv *mip;
 	int ret;
 
-	pdev = of_find_device_by_node(node);
-	if (!pdev)
-		return -EPROBE_DEFER;
-
 	mip = kzalloc(sizeof(*mip), GFP_KERNEL);
 	if (!mip)
 		return -ENOMEM;
@@ -284,7 +280,7 @@ err_priv:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(mip_msi)
-IRQCHIP_MATCH("brcm,bcm2712-mip", mip_of_msi_init)
+IRQCHIP_MATCH("brcm,bcm2712-mip", mip_msi_probe)
 IRQCHIP_PLATFORM_DRIVER_END(mip_msi)
 MODULE_DESCRIPTION("Broadcom BCM2712 MSI-X interrupt controller");
 MODULE_AUTHOR("Phil Elwell <phil@raspberrypi.com>");
diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c
index eda33bd5d080..821b288587ca 100644
--- a/drivers/irqchip/irq-bcm7038-l1.c
+++ b/drivers/irqchip/irq-bcm7038-l1.c
@@ -394,8 +394,9 @@ static const struct irq_domain_ops bcm7038_l1_domain_ops = {
 	.map			= bcm7038_l1_map,
 };
 
-static int bcm7038_l1_of_init(struct device_node *dn, struct device_node *parent)
+static int bcm7038_l1_probe(struct platform_device *pdev, struct device_node *parent)
 {
+	struct device_node *dn = pdev->dev.of_node;
 	struct bcm7038_l1_chip *intc;
 	int idx, ret;
 
@@ -453,7 +454,7 @@ out_free:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(bcm7038_l1)
-IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_of_init)
+IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_probe)
 IRQCHIP_PLATFORM_DRIVER_END(bcm7038_l1)
 MODULE_DESCRIPTION("Broadcom STB 7038-style L1/L2 interrupt controller");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c
index b6c85560c42e..518c9d4366a5 100644
--- a/drivers/irqchip/irq-bcm7120-l2.c
+++ b/drivers/irqchip/irq-bcm7120-l2.c
@@ -206,14 +206,14 @@ static int bcm7120_l2_intc_iomap_3380(struct device_node *dn, struct bcm7120_l2_
 	return 0;
 }
 
-static int bcm7120_l2_intc_probe(struct device_node *dn, struct device_node *parent,
+static int bcm7120_l2_intc_probe(struct platform_device *pdev, struct device_node *parent,
 				 int (*iomap_regs_fn)(struct device_node *,
 						      struct bcm7120_l2_intc_data *),
 				 const char *intc_name)
 {
 	unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
+	struct device_node *dn = pdev->dev.of_node;
 	struct bcm7120_l2_intc_data *data;
-	struct platform_device *pdev;
 	struct irq_chip_generic *gc;
 	struct irq_chip_type *ct;
 	int ret = 0;
@@ -224,14 +224,7 @@ static int bcm7120_l2_intc_probe(struct device_node *dn, struct device_node *par
 	if (!data)
 		return -ENOMEM;
 
-	pdev = of_find_device_by_node(dn);
-	if (!pdev) {
-		ret = -ENODEV;
-		goto out_free_data;
-	}
-
 	data->num_parent_irqs = platform_irq_count(pdev);
-	put_device(&pdev->dev);
 	if (data->num_parent_irqs <= 0) {
 		pr_err("invalid number of parent interrupts\n");
 		ret = -ENOMEM;
@@ -331,20 +324,19 @@ out_unmap:
 		if (data->map_base[idx])
 			iounmap(data->map_base[idx]);
 	}
-out_free_data:
 	kfree(data);
 	return ret;
 }
 
-static int bcm7120_l2_intc_probe_7120(struct device_node *dn, struct device_node *parent)
+static int bcm7120_l2_intc_probe_7120(struct platform_device *pdev, struct device_node *parent)
 {
-	return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_7120,
+	return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_7120,
 				     "BCM7120 L2");
 }
 
-static int bcm7120_l2_intc_probe_3380(struct device_node *dn, struct device_node *parent)
+static int bcm7120_l2_intc_probe_3380(struct platform_device *pdev, struct device_node *parent)
 {
-	return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_3380,
+	return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_3380,
 				     "BCM3380 L2");
 }
 
diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c
index 53e67c6c01f7..bb7078d6524f 100644
--- a/drivers/irqchip/irq-brcmstb-l2.c
+++ b/drivers/irqchip/irq-brcmstb-l2.c
@@ -138,11 +138,12 @@ static void brcmstb_l2_intc_resume(struct irq_data *d)
 	irq_reg_writel(gc, ~b->saved_mask, ct->regs.enable);
 }
 
-static int brcmstb_l2_intc_of_init(struct device_node *np, struct device_node *parent,
-				   const struct brcmstb_intc_init_params *init_params)
+static int brcmstb_l2_intc_probe(struct platform_device *pdev, struct device_node *parent,
+				 const struct brcmstb_intc_init_params *init_params)
 {
 	unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
 	unsigned int set = 0;
+	struct device_node *np = pdev->dev.of_node;
 	struct brcmstb_l2_intc_data *data;
 	struct irq_chip_type *ct;
 	int ret;
@@ -255,21 +256,21 @@ out_free:
 	return ret;
 }
 
-static int brcmstb_l2_edge_intc_of_init(struct device_node *np, struct device_node *parent)
+static int brcmstb_l2_edge_intc_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init);
+	return brcmstb_l2_intc_probe(pdev, parent, &l2_edge_intc_init);
 }
 
-static int brcmstb_l2_lvl_intc_of_init(struct device_node *np, struct device_node *parent)
+static int brcmstb_l2_lvl_intc_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init);
+	return brcmstb_l2_intc_probe(pdev, parent, &l2_lvl_intc_init);
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(brcmstb_l2)
-IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_of_init)
-IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_of_init)
-IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_of_init)
-IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_of_init)
+IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_probe)
+IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_probe)
+IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_probe)
+IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_probe)
 IRQCHIP_PLATFORM_DRIVER_END(brcmstb_l2)
 MODULE_DESCRIPTION("Broadcom STB generic L2 interrupt controller");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c
index 41df168aa7da..c598f2f52fc6 100644
--- a/drivers/irqchip/irq-imx-mu-msi.c
+++ b/drivers/irqchip/irq-imx-mu-msi.c
@@ -296,10 +296,9 @@ static const struct imx_mu_dcfg imx_mu_cfg_imx8ulp = {
 		  },
 };
 
-static int imx_mu_of_init(struct device_node *dn, struct device_node *parent,
-			  const struct imx_mu_dcfg *cfg)
+static int imx_mu_probe(struct platform_device *pdev, struct device_node *parent,
+			const struct imx_mu_dcfg *cfg)
 {
-	struct platform_device *pdev = of_find_device_by_node(dn);
 	struct device_link *pd_link_a;
 	struct device_link *pd_link_b;
 	struct imx_mu_msi *msi_data;
@@ -415,28 +414,27 @@ static const struct dev_pm_ops imx_mu_pm_ops = {
 			   imx_mu_runtime_resume, NULL)
 };
 
-static int imx_mu_imx7ulp_of_init(struct device_node *dn, struct device_node *parent)
+static int imx_mu_imx7ulp_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx7ulp);
+	return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx7ulp);
 }
 
-static int imx_mu_imx6sx_of_init(struct device_node *dn, struct device_node *parent)
+static int imx_mu_imx6sx_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx6sx);
+	return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx6sx);
 }
 
-static int imx_mu_imx8ulp_of_init(struct device_node *dn, struct device_node *parent)
+static int imx_mu_imx8ulp_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx8ulp);
+	return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx8ulp);
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(imx_mu_msi)
-IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_of_init)
-IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_of_init)
-IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_of_init)
+IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_probe)
+IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_probe)
+IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_probe)
 IRQCHIP_PLATFORM_DRIVER_END(imx_mu_msi, .pm = &imx_mu_pm_ops)
 
-
 MODULE_AUTHOR("Frank Li <Frank.Li@nxp.com>");
 MODULE_DESCRIPTION("Freescale MU MSI controller driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c
index 516a3a0e359c..b513a899c085 100644
--- a/drivers/irqchip/irq-mchp-eic.c
+++ b/drivers/irqchip/irq-mchp-eic.c
@@ -199,8 +199,9 @@ static const struct irq_domain_ops mchp_eic_domain_ops = {
 	.free		= irq_domain_free_irqs_common,
 };
 
-static int mchp_eic_init(struct device_node *node, struct device_node *parent)
+static int mchp_eic_probe(struct platform_device *pdev, struct device_node *parent)
 {
+	struct device_node *node = pdev->dev.of_node;
 	struct irq_domain *parent_domain = NULL;
 	int ret, i;
 
@@ -273,7 +274,7 @@ free:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(mchp_eic)
-IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_init)
+IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_probe)
 IRQCHIP_PLATFORM_DRIVER_END(mchp_eic)
 
 MODULE_DESCRIPTION("Microchip External Interrupt Controller");
diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c
index 7d177626d64b..09ebf1d9c21b 100644
--- a/drivers/irqchip/irq-meson-gpio.c
+++ b/drivers/irqchip/irq-meson-gpio.c
@@ -572,8 +572,9 @@ static int meson_gpio_irq_parse_dt(struct device_node *node, struct meson_gpio_i
 	return 0;
 }
 
-static int meson_gpio_irq_of_init(struct device_node *node, struct device_node *parent)
+static int meson_gpio_irq_probe(struct platform_device *pdev, struct device_node *parent)
 {
+	struct device_node *node = pdev->dev.of_node;
 	struct irq_domain *domain, *parent_domain;
 	struct meson_gpio_irq_controller *ctl;
 	int ret;
@@ -630,7 +631,7 @@ free_ctl:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(meson_gpio_intc)
-IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_of_init)
+IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_probe)
 IRQCHIP_PLATFORM_DRIVER_END(meson_gpio_intc)
 
 MODULE_AUTHOR("Jerome Brunet <jbrunet@baylibre.com>");
diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c
index 8d569f7c5a7a..83f31ea657b7 100644
--- a/drivers/irqchip/irq-qcom-mpm.c
+++ b/drivers/irqchip/irq-qcom-mpm.c
@@ -320,9 +320,9 @@ static bool gic_hwirq_is_mapped(struct mpm_gic_map *maps, int cnt, u32 hwirq)
 	return false;
 }
 
-static int qcom_mpm_init(struct device_node *np, struct device_node *parent)
+static int qcom_mpm_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	struct platform_device *pdev = of_find_device_by_node(np);
+	struct device_node *np = pdev->dev.of_node;
 	struct device *dev = &pdev->dev;
 	struct irq_domain *parent_domain;
 	struct generic_pm_domain *genpd;
@@ -478,7 +478,7 @@ remove_genpd:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_mpm)
-IRQCHIP_MATCH("qcom,mpm", qcom_mpm_init)
+IRQCHIP_MATCH("qcom,mpm", qcom_mpm_probe)
 IRQCHIP_PLATFORM_DRIVER_END(qcom_mpm)
 MODULE_DESCRIPTION("Qualcomm Technologies, Inc. MSM Power Manager");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c
index 12b6eb150301..1bf19deb02c4 100644
--- a/drivers/irqchip/irq-renesas-rzg2l.c
+++ b/drivers/irqchip/irq-renesas-rzg2l.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/bitfield.h>
-#include <linux/cleanup.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
@@ -528,18 +527,15 @@ static int rzg2l_irqc_parse_interrupts(struct rzg2l_irqc_priv *priv,
 	return 0;
 }
 
-static int rzg2l_irqc_common_init(struct device_node *node, struct device_node *parent,
-				  const struct irq_chip *irq_chip)
+static int rzg2l_irqc_common_probe(struct platform_device *pdev, struct device_node *parent,
+				   const struct irq_chip *irq_chip)
 {
-	struct platform_device *pdev = of_find_device_by_node(node);
-	struct device *dev __free(put_device) = pdev ? &pdev->dev : NULL;
 	struct irq_domain *irq_domain, *parent_domain;
+	struct device_node *node = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
 	struct reset_control *resetn;
 	int ret;
 
-	if (!pdev)
-		return -ENODEV;
-
 	parent_domain = irq_find_host(parent);
 	if (!parent_domain)
 		return dev_err_probe(dev, -ENODEV, "cannot find parent domain\n");
@@ -583,33 +579,22 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node *
 
 	register_syscore_ops(&rzg2l_irqc_syscore_ops);
 
-	/*
-	 * Prevent the cleanup function from invoking put_device by assigning
-	 * NULL to dev.
-	 *
-	 * make coccicheck will complain about missing put_device calls, but
-	 * those are false positives, as dev will be automatically "put" via
-	 * __free_put_device on the failing path.
-	 * On the successful path we don't actually want to "put" dev.
-	 */
-	dev = NULL;
-
 	return 0;
 }
 
-static int rzg2l_irqc_init(struct device_node *node, struct device_node *parent)
+static int rzg2l_irqc_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return rzg2l_irqc_common_init(node, parent, &rzg2l_irqc_chip);
+	return rzg2l_irqc_common_probe(pdev, parent, &rzg2l_irqc_chip);
 }
 
-static int rzfive_irqc_init(struct device_node *node, struct device_node *parent)
+static int rzfive_irqc_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return rzg2l_irqc_common_init(node, parent, &rzfive_irqc_chip);
+	return rzg2l_irqc_common_probe(pdev, parent, &rzfive_irqc_chip);
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(rzg2l_irqc)
-IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_init)
-IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_init)
+IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_probe)
+IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_probe)
 IRQCHIP_PLATFORM_DRIVER_END(rzg2l_irqc)
 MODULE_AUTHOR("Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>");
 MODULE_DESCRIPTION("Renesas RZ/G2L IRQC Driver");
diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c
index 9018d9c3911e..899a423b5da8 100644
--- a/drivers/irqchip/irq-renesas-rzv2h.c
+++ b/drivers/irqchip/irq-renesas-rzv2h.c
@@ -490,29 +490,15 @@ static int rzv2h_icu_parse_interrupts(struct rzv2h_icu_priv *priv, struct device
 	return 0;
 }
 
-static void rzv2h_icu_put_device(void *data)
-{
-	put_device(data);
-}
-
-static int rzv2h_icu_init_common(struct device_node *node, struct device_node *parent,
-				 const struct rzv2h_hw_info *hw_info)
+static int rzv2h_icu_probe_common(struct platform_device *pdev, struct device_node *parent,
+				  const struct rzv2h_hw_info *hw_info)
 {
 	struct irq_domain *irq_domain, *parent_domain;
+	struct device_node *node = pdev->dev.of_node;
 	struct rzv2h_icu_priv *rzv2h_icu_data;
-	struct platform_device *pdev;
 	struct reset_control *resetn;
 	int ret;
 
-	pdev = of_find_device_by_node(node);
-	if (!pdev)
-		return -ENODEV;
-
-	ret = devm_add_action_or_reset(&pdev->dev, rzv2h_icu_put_device,
-				       &pdev->dev);
-	if (ret < 0)
-		return ret;
-
 	parent_domain = irq_find_host(parent);
 	if (!parent_domain) {
 		dev_err(&pdev->dev, "cannot find parent domain\n");
@@ -618,19 +604,19 @@ static const struct rzv2h_hw_info rzv2h_hw_params = {
 	.field_width	= 8,
 };
 
-static int rzg3e_icu_init(struct device_node *node, struct device_node *parent)
+static int rzg3e_icu_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return rzv2h_icu_init_common(node, parent, &rzg3e_hw_params);
+	return rzv2h_icu_probe_common(pdev, parent, &rzg3e_hw_params);
 }
 
-static int rzv2h_icu_init(struct device_node *node, struct device_node *parent)
+static int rzv2h_icu_probe(struct platform_device *pdev, struct device_node *parent)
 {
-	return rzv2h_icu_init_common(node, parent, &rzv2h_hw_params);
+	return rzv2h_icu_probe_common(pdev, parent, &rzv2h_hw_params);
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(rzv2h_icu)
-IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_init)
-IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_init)
+IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_probe)
+IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_probe)
 IRQCHIP_PLATFORM_DRIVER_END(rzv2h_icu)
 MODULE_AUTHOR("Fabrizio Castro <fabrizio.castro.jz@renesas.com>");
 MODULE_DESCRIPTION("Renesas RZ/V2H(P) ICU Driver");
diff --git a/drivers/irqchip/irq-starfive-jh8100-intc.c b/drivers/irqchip/irq-starfive-jh8100-intc.c
index 117f2c651ebd..705361b4ebe0 100644
--- a/drivers/irqchip/irq-starfive-jh8100-intc.c
+++ b/drivers/irqchip/irq-starfive-jh8100-intc.c
@@ -114,8 +114,9 @@ static void starfive_intc_irq_handler(struct irq_desc *desc)
 	chained_irq_exit(chip, desc);
 }
 
-static int starfive_intc_init(struct device_node *intc, struct device_node *parent)
+static int starfive_intc_probe(struct platform_device *pdev, struct device_node *parent)
 {
+	struct device_node *intc = pdev->dev.of_node;
 	struct starfive_irq_chip *irqc;
 	struct reset_control *rst;
 	struct clk *clk;
@@ -198,7 +199,7 @@ err_free:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(starfive_intc)
-IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_init)
+IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_probe)
 IRQCHIP_PLATFORM_DRIVER_END(starfive_intc)
 
 MODULE_DESCRIPTION("StarFive JH8100 External Interrupt Controller");
diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c
index 652d20d2b07f..689c8e448901 100644
--- a/drivers/irqchip/irqchip.c
+++ b/drivers/irqchip/irqchip.c
@@ -36,9 +36,9 @@ int platform_irqchip_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct device_node *par_np __free(device_node) = of_irq_find_parent(np);
-	of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev);
+	platform_irq_probe_t irq_probe = of_device_get_match_data(&pdev->dev);
 
-	if (!irq_init_cb)
+	if (!irq_probe)
 		return -EINVAL;
 
 	if (par_np == np)
@@ -55,6 +55,6 @@ int platform_irqchip_probe(struct platform_device *pdev)
 	if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY))
 		return -EPROBE_DEFER;
 
-	return irq_init_cb(np, par_np);
+	return irq_probe(pdev, par_np);
 }
 EXPORT_SYMBOL_GPL(platform_irqchip_probe);
diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c
index 52d77546aacb..518f7f0f3dab 100644
--- a/drivers/irqchip/qcom-pdc.c
+++ b/drivers/irqchip/qcom-pdc.c
@@ -350,9 +350,10 @@ static int pdc_setup_pin_mapping(struct device_node *np)
 
 #define QCOM_PDC_SIZE 0x30000
 
-static int qcom_pdc_init(struct device_node *node, struct device_node *parent)
+static int qcom_pdc_probe(struct platform_device *pdev, struct device_node *parent)
 {
 	struct irq_domain *parent_domain, *pdc_domain;
+	struct device_node *node = pdev->dev.of_node;
 	resource_size_t res_size;
 	struct resource res;
 	int ret;
@@ -428,7 +429,7 @@ fail:
 }
 
 IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_pdc)
-IRQCHIP_MATCH("qcom,pdc", qcom_pdc_init)
+IRQCHIP_MATCH("qcom,pdc", qcom_pdc_probe)
 IRQCHIP_PLATFORM_DRIVER_END(qcom_pdc)
 MODULE_DESCRIPTION("Qualcomm Technologies, Inc. Power Domain Controller");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index d5e6024cb2a8..bc4ddacd6ddc 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -17,12 +17,18 @@
 #include <linux/of_irq.h>
 #include <linux/platform_device.h>
 
+typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *);
+
 /* Undefined on purpose */
 extern of_irq_init_cb_t typecheck_irq_init_cb;
+extern platform_irq_probe_t typecheck_irq_probe;
 
 #define typecheck_irq_init_cb(fn)					\
 	(__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn)
 
+#define typecheck_irq_probe(fn)						\
+	(__typecheck(typecheck_irq_probe, &fn) ? fn : fn)
+
 /*
  * This macro must be used by the different irqchip drivers to declare
  * the association between their DT compatible string and their
@@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev);
 static const struct of_device_id drv_name##_irqchip_match_table[] = {
 
 #define IRQCHIP_MATCH(compat, fn) { .compatible = compat,		\
-				    .data = typecheck_irq_init_cb(fn), },
+				    .data = typecheck_irq_probe(fn), },
 
 
 #define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...)			\
-- 
cgit v1.2.3


From 7c268eaeec6388b7bee36aef3fb5e62c9222ad3b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Tue, 14 Oct 2025 23:54:55 +0000
Subject: net: Allow opt-out from global protocol memory accounting.

Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_proto->memory_allocated.

Sometimes, system processes do not want that limitation.  For a similar
purpose, there is SO_RESERVE_MEM for sockets under memcg.

Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying costs for two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.

Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.

sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.

The following patches will set sk->sk_bypass_prot_mem to true, and
then, the per-protocol memory accounting will be skipped.

Note that this does NOT disable memcg accounting, only the per-protocol one.

Another option, which would avoid using the hole in struct sock_common,
is to create sk_prot variants like tcp_prot_bypass, but this would
complicate SOCKMAP logic, tcp_bpf_prots etc.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
---
 include/net/proto_memory.h |  3 +++
 include/net/sock.h         |  3 +++
 include/net/tcp.h          |  3 +++
 net/core/sock.c            | 32 +++++++++++++++++++++++++-------
 net/ipv4/tcp.c             |  3 ++-
 net/ipv4/tcp_output.c      |  7 ++++++-
 net/mptcp/protocol.c       |  7 ++++---
 net/tls/tls_device.c       |  3 ++-
 8 files changed, 48 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 8e91a8fa31b5..ad6d703ce6fe 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	    mem_cgroup_sk_under_memory_pressure(sk))
 		return true;
 
+	if (sk->sk_bypass_prot_mem)
+		return false;
+
 	return !!READ_ONCE(*sk->sk_prot->memory_pressure);
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 30ac2eb4ef9b..415e7381aa50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_reuseport: %SO_REUSEPORT setting
  *	@skc_ipv6only: socket is IPV6 only
  *	@skc_net_refcnt: socket is using net ref counting
+ *	@skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_bypass_prot_mem:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -381,6 +383,7 @@ struct sock {
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem	__sk_common.skc_bypass_prot_mem
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e547138f4fb..439e327fdbfa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
 	    mem_cgroup_sk_under_memory_pressure(sk))
 		return true;
 
+	if (sk->sk_bypass_prot_mem)
+		return false;
+
 	return READ_ONCE(tcp_memory_pressure);
 }
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 08ae20069b6d..5bf208579c02 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 	if (!charged)
 		return -ENOMEM;
 
+	if (sk->sk_bypass_prot_mem)
+		goto success;
+
 	/* pre-charge to forward_alloc */
 	sk_memory_allocated_add(sk, pages);
 	allocated = sk_memory_allocated(sk);
+
 	/* If the system goes into memory pressure with this
 	 * precharge, give up and return error.
 	 */
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		mem_cgroup_sk_uncharge(sk, pages);
 		return -ENOMEM;
 	}
+
+success:
 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
 
 	WRITE_ONCE(sk->sk_reserved_mem,
@@ -3145,8 +3151,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
 		return true;
 
-	sk_enter_memory_pressure(sk);
+	if (!sk->sk_bypass_prot_mem)
+		sk_enter_memory_pressure(sk);
+
 	sk_stream_moderate_sndbuf(sk);
+
 	return false;
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3263,10 +3272,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	bool memcg_enabled = false, charged = false;
 	struct proto *prot = sk->sk_prot;
-	long allocated;
+	long allocated = 0;
 
-	sk_memory_allocated_add(sk, amt);
-	allocated = sk_memory_allocated(sk);
+	if (!sk->sk_bypass_prot_mem) {
+		sk_memory_allocated_add(sk, amt);
+		allocated = sk_memory_allocated(sk);
+	}
 
 	if (mem_cgroup_sk_enabled(sk)) {
 		memcg_enabled = true;
@@ -3275,6 +3286,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 			goto suppress_allocation;
 	}
 
+	if (!allocated)
+		return 1;
+
 	/* Under limit. */
 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
 		sk_leave_memory_pressure(sk);
@@ -3353,7 +3367,8 @@ suppress_allocation:
 
 	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
 
-	sk_memory_allocated_sub(sk, amt);
+	if (allocated)
+		sk_memory_allocated_sub(sk, amt);
 
 	if (charged)
 		mem_cgroup_sk_uncharge(sk, amt);
@@ -3392,11 +3407,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 {
-	sk_memory_allocated_sub(sk, amount);
-
 	if (mem_cgroup_sk_enabled(sk))
 		mem_cgroup_sk_uncharge(sk, amount);
 
+	if (sk->sk_bypass_prot_mem)
+		return;
+
+	sk_memory_allocated_sub(sk, amount);
+
 	if (sk_under_global_memory_pressure(sk) &&
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
 		sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d720aa09a4c..54def27326f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
 		}
 		__kfree_skb(skb);
 	} else {
-		sk->sk_prot->enter_memory_pressure(sk);
+		if (!sk->sk_bypass_prot_mem)
+			tcp_enter_memory_pressure(sk);
 		sk_stream_moderate_sndbuf(sk);
 	}
 	return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b94efb3050d2..7f5df7a71f62 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3743,12 +3743,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	delta = size - sk->sk_forward_alloc;
 	if (delta <= 0)
 		return;
+
 	amt = sk_mem_pages(delta);
 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sk_enabled(sk))
 		mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+	if (sk->sk_bypass_prot_mem)
+		return;
+
+	sk_memory_allocated_add(sk, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0292162a14ee..94a5f6dcc577 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		if (first)
+		if (first && !ssk->sk_bypass_prot_mem) {
 			tcp_enter_memory_pressure(ssk);
-		sk_stream_moderate_sndbuf(ssk);
+			first = false;
+		}
 
-		first = false;
+		sk_stream_moderate_sndbuf(ssk);
 	}
 	__mptcp_sync_sndbuf(sk);
 }
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a64ae15b1a60..caa2b5d24622 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk,
 	if (!offload_ctx->open_record) {
 		if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
 						   sk->sk_allocation))) {
-			READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+			if (!sk->sk_bypass_prot_mem)
+				READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
 			sk_stream_moderate_sndbuf(sk);
 			return -ENOMEM;
 		}
-- 
cgit v1.2.3


From b46ab63181ff973ddce44ebc9ac24b269d42f481 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Tue, 14 Oct 2025 23:54:56 +0000
Subject: net: Introduce net.core.bypass_prot_mem sysctl.

If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

Let's control the flag by a new sysctl knob.

The flag is written once during socket(2) and is inherited by child
sockets.

Tested with a script that creates local socket pairs and send()s a
bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"
  # ulimit -n 524288

Without net.core.bypass_prot_mem, charged to tcp_mem & memcg:

  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat| grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53188
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:49972
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53868
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures        1                  0.0

With net.core.bypass_prot_mem=1, charged to memcg only:

  # sysctl -q net.core.bypass_prot_mem=1
  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q  Local Address:Port  Peer Address:Port
  ESTAB 111000 0           127.0.0.1:36019    127.0.0.1:49026
  ESTAB 110000 0           127.0.0.1:36019    127.0.0.1:45630
  ESTAB 110000 0           127.0.0.1:36019    127.0.0.1:44870
  ESTAB 111000 0           127.0.0.1:36019    127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com
---
 Documentation/admin-guide/sysctl/net.rst | 8 ++++++++
 include/net/netns/core.h                 | 1 +
 net/core/sock.c                          | 5 +++++
 net/core/sysctl_net_core.c               | 9 +++++++++
 4 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 40749b3cd356..991773dcb9cf 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -212,6 +212,14 @@ mem_pcpu_rsv
 
 Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
 
+bypass_prot_mem
+---------------
+
+Skip charging socket buffers to the global per-protocol memory
+accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
+
+Default: 0 (off)
+
 rmem_default
 ------------
 
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index cb9c3e4cd738..9ef3d70e5e9c 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -17,6 +17,7 @@ struct netns_core {
 	int	sysctl_optmem_max;
 	u8	sysctl_txrehash;
 	u8	sysctl_tstamp_allow_data;
+	u8	sysctl_bypass_prot_mem;
 
 #ifdef CONFIG_PROC_FS
 	struct prot_inuse __percpu *prot_inuse;
diff --git a/net/core/sock.c b/net/core/sock.c
index 5bf208579c02..b78533fb9268 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2306,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		 * why we need sk_prot_creator -acme
 		 */
 		sk->sk_prot = sk->sk_prot_creator = prot;
+
+		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
+			sk->sk_bypass_prot_mem = 1;
+
 		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
+
 		sk->sk_net_refcnt = kern ? 0 : 1;
 		if (likely(sk->sk_net_refcnt)) {
 			get_net_track(net, &sk->ns_tracker, priority);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f79137826d7f..8d4decb2606f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -683,6 +683,15 @@ static struct ctl_table netns_core_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE
 	},
+	{
+		.procname	= "bypass_prot_mem",
+		.data		= &init_net.core.sysctl_bypass_prot_mem,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 	/* sysctl_core_net_init() will set the values after this
 	 * to readonly in network namespaces
 	 */
-- 
cgit v1.2.3


From 38163af068810b388f6723a681dfd8c7b3680d38 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Tue, 14 Oct 2025 23:54:58 +0000
Subject: bpf: Introduce SK_BPF_BYPASS_PROT_MEM.

If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

This is easily controlled by net.core.bypass_prot_mem sysctl, but it
lacks flexibility.

Let's support flagging (and clearing) sk->sk_bypass_prot_mem via
bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook.

  int val = 1;

  bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
                 &val, sizeof(val));

As with net.core.bypass_prot_mem, this is inherited by child sockets,
and BPF always takes precedence over sysctl at socket(2) and accept(2).

SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE
and is not supported on other hooks for the following reasons:

  1. UDP charges memory under sk->sk_receive_queue.lock instead
     of lock_sock()

  2. Modifying the flag after skb is charged to sk requires such
     adjustment during bpf_setsockopt() and complicates the logic
     unnecessarily

We can support other hooks later if a real use case justifies that.

Most changes are inline and hard to trace, but a microbenchmark on
__sk_mem_raise_allocated() during neper/tcp_stream showed that more
samples completed faster with sk->sk_bypass_prot_mem == 1.  This will
be more visible under tcp_mem pressure (but it's not a fair comparison).

  # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; }
    kretprobe:__sk_mem_raise_allocated /@start[tid]/
    { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }'
  # tcp_stream -6 -F 1000 -N -T 256

Without bpf prog:

  [128, 256)          3846 |                                                    |
  [256, 512)       1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@     |
  [1K, 2K)          198207 |@@@@@@                                              |
  [2K, 4K)           31199 |@                                                   |

With bpf prog in the next patch:
  (must be attached before tcp_stream)
  # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test

  [128, 256)          6413 |                                                    |
  [256, 512)       1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                      |
  [1K, 2K)          117031 |@@@@                                                |
  [2K, 4K)           11773 |                                                    |

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com
---
 include/uapi/linux/bpf.h       |  2 ++
 net/core/filter.c              | 37 +++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6829936d33f5..6eb75ad900b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7200,6 +7200,8 @@ enum {
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
 	SK_BPF_CB_FLAGS		= 1009, /* Get or set sock ops flags in socket */
+	SK_BPF_BYPASS_PROT_MEM	= 1010, /* Get or Set sk->sk_bypass_prot_mem */
+
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index ed3f0e536059..16105f52927d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5733,9 +5733,37 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
+					  char *optval, int optlen,
+					  bool getopt)
+{
+	int val;
+
+	if (optlen != sizeof(int))
+		return -EINVAL;
+
+	if (!sk_has_account(sk))
+		return -EOPNOTSUPP;
+
+	if (getopt) {
+		*(int *)optval = sk->sk_bypass_prot_mem;
+		return 0;
+	}
+
+	val = *(int *)optval;
+	if (val < 0 || val > 1)
+		return -EINVAL;
+
+	sk->sk_bypass_prot_mem = val;
+	return 0;
+}
+
 BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
+	if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM)
+		return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
+
 	return __bpf_setsockopt(sk, level, optname, optval, optlen);
 }
 
@@ -5753,6 +5781,15 @@ static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
 BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
+	if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) {
+		int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
+
+		if (err)
+			memset(optval, 0, optlen);
+
+		return err;
+	}
+
 	return __bpf_getsockopt(sk, level, optname, optval, optlen);
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6829936d33f5..9b17d937edf7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7200,6 +7200,7 @@ enum {
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
 	SK_BPF_CB_FLAGS		= 1009, /* Get or set sock ops flags in socket */
+	SK_BPF_BYPASS_PROT_MEM	= 1010, /* Get or Set sk->sk_bypass_prot_mem */
 };
 
 enum {
-- 
cgit v1.2.3


From dce745009349fc391271c9415d5e242781ddadd7 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Mon, 21 Jul 2025 08:36:26 +0200
Subject: PCI/MSI: Delete pci_msi_create_irq_domain()

pci_msi_create_irq_domain() is now unused. Delete it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/msi/irqdomain.c | 90 ---------------------------------------------
 include/linux/msi.h         |  3 --
 2 files changed, 93 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index ce741ed9dc3f..a329060287b5 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -49,96 +49,6 @@ static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *
 		__pci_write_msi_msg(desc, msg);
 }
 
-/**
- * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source
- * @desc:	Pointer to the MSI descriptor
- *
- * The ID number is only used within the irqdomain.
- */
-static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc)
-{
-	struct pci_dev *dev = msi_desc_to_pci_dev(desc);
-
-	return (irq_hw_number_t)desc->msi_index |
-		pci_dev_id(dev) << 11 |
-		((irq_hw_number_t)(pci_domain_nr(dev->bus) & 0xFFFFFFFF)) << 27;
-}
-
-static void pci_msi_domain_set_desc(msi_alloc_info_t *arg,
-				    struct msi_desc *desc)
-{
-	arg->desc = desc;
-	arg->hwirq = pci_msi_domain_calc_hwirq(desc);
-}
-
-static struct msi_domain_ops pci_msi_domain_ops_default = {
-	.set_desc	= pci_msi_domain_set_desc,
-};
-
-static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info)
-{
-	struct msi_domain_ops *ops = info->ops;
-
-	if (ops == NULL) {
-		info->ops = &pci_msi_domain_ops_default;
-	} else {
-		if (ops->set_desc == NULL)
-			ops->set_desc = pci_msi_domain_set_desc;
-	}
-}
-
-static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info)
-{
-	struct irq_chip *chip = info->chip;
-
-	BUG_ON(!chip);
-	if (!chip->irq_write_msi_msg)
-		chip->irq_write_msi_msg = pci_msi_domain_write_msg;
-	if (!chip->irq_mask)
-		chip->irq_mask = pci_msi_mask_irq;
-	if (!chip->irq_unmask)
-		chip->irq_unmask = pci_msi_unmask_irq;
-}
-
-/**
- * pci_msi_create_irq_domain - Create a MSI interrupt domain
- * @fwnode:	Optional fwnode of the interrupt controller
- * @info:	MSI domain info
- * @parent:	Parent irq domain
- *
- * Updates the domain and chip ops and creates a MSI interrupt domain.
- *
- * Returns:
- * A domain pointer or NULL in case of failure.
- */
-struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
-					     struct msi_domain_info *info,
-					     struct irq_domain *parent)
-{
-	if (WARN_ON(info->flags & MSI_FLAG_LEVEL_CAPABLE))
-		info->flags &= ~MSI_FLAG_LEVEL_CAPABLE;
-
-	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
-		pci_msi_domain_update_dom_ops(info);
-	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
-		pci_msi_domain_update_chip_ops(info);
-
-	/* Let the core code free MSI descriptors when freeing interrupts */
-	info->flags |= MSI_FLAG_FREE_MSI_DESCS;
-
-	info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS;
-	if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
-		info->flags |= MSI_FLAG_MUST_REACTIVATE;
-
-	/* PCI-MSI is oneshot-safe */
-	info->chip->flags |= IRQCHIP_ONESHOT_SAFE;
-	/* Let the core update the bus token */
-	info->bus_token = DOMAIN_BUS_PCI_MSI;
-
-	return msi_create_irq_domain(fwnode, info, parent);
-}
-EXPORT_SYMBOL_GPL(pci_msi_create_irq_domain);
-
 /*
  * Per device MSI[-X] domain functionality
  */
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d415dd15a0a9..8003e3218c46 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
-struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
-					     struct msi_domain_info *info,
-					     struct irq_domain *parent);
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
 u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
-- 
cgit v1.2.3


From fe946a751d9b52b7c45ca34899723b314b79b249 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Oct 2025 17:19:04 +0000
Subject: net/sched: act_mirred: add loop detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 0f022d32c3ec ("net/sched: Fix mirred deadlock on device recursion")
added code in the fast path, even when act_mirred is not used.

Prepare its revert by implementing loop detection in act_mirred.

Adds an array of device pointers in struct netdev_xmit.

tcf_mirred_act() can detect if the array
already contains the target device.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice_xmit.h |  9 +++++-
 net/sched/act_mirred.c         | 62 ++++++++++++++++--------------------------
 2 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 813a19122ebb..cc232508e695 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -2,6 +2,12 @@
 #ifndef _LINUX_NETDEVICE_XMIT_H
 #define _LINUX_NETDEVICE_XMIT_H
 
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+#define MIRRED_NEST_LIMIT	4
+#endif
+
+struct net_device;
+
 struct netdev_xmit {
 	u16 recursion;
 	u8  more;
@@ -9,7 +15,8 @@ struct netdev_xmit {
 	u8  skip_txqueue;
 #endif
 #if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
-	u8 sched_mirred_nest;
+	u8			sched_mirred_nest;
+	struct net_device	*sched_mirred_dev[MIRRED_NEST_LIMIT];
 #endif
 #if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
 	u8 nf_dup_skb_recursion;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5f01f567c934..f27b583def78 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,31 +29,6 @@
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
 
-#define MIRRED_NEST_LIMIT    4
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	__this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
-}
-
-#else
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return current->net_xmit.sched_mirred_nest++;
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	current->net_xmit.sched_mirred_nest--;
-}
-#endif
-
 static bool tcf_mirred_is_act_redirect(int action)
 {
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
 {
 	struct tcf_mirred *m = to_mirred(a);
 	int retval = READ_ONCE(m->tcf_action);
-	unsigned int nest_level;
+	struct netdev_xmit *xmit;
 	bool m_mac_header_xmit;
 	struct net_device *dev;
-	int m_eaction;
+	int i, m_eaction;
 	u32 blockid;
 
-	nest_level = tcf_mirred_nest_level_inc_return();
-	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
+#ifdef CONFIG_PREEMPT_RT
+	xmit = &current->net_xmit;
+#else
+	xmit = this_cpu_ptr(&softnet_data.xmit);
+#endif
+	if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
 		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
 				     netdev_name(skb->dev));
-		retval = TC_ACT_SHOT;
-		goto dec_nest_level;
+		return TC_ACT_SHOT;
 	}
 
 	tcf_lastuse_update(&m->tcf_tm);
 	tcf_action_update_bstats(&m->common, skb);
 
 	blockid = READ_ONCE(m->tcfm_blockid);
-	if (blockid) {
-		retval = tcf_blockcast(skb, m, blockid, res, retval);
-		goto dec_nest_level;
-	}
+	if (blockid)
+		return tcf_blockcast(skb, m, blockid, res, retval);
 
 	dev = rcu_dereference_bh(m->tcfm_dev);
 	if (unlikely(!dev)) {
 		pr_notice_once("tc mirred: target device is gone\n");
 		tcf_action_inc_overlimit_qstats(&m->common);
-		goto dec_nest_level;
+		return retval;
 	}
+	for (i = 0; i < xmit->sched_mirred_nest; i++) {
+		if (xmit->sched_mirred_dev[i] != dev)
+			continue;
+		pr_notice_once("tc mirred: loop on device %s\n",
+			       netdev_name(dev));
+		tcf_action_inc_overlimit_qstats(&m->common);
+		return retval;
+	}
+
+	xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
 
 	m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
 	m_eaction = READ_ONCE(m->tcfm_eaction);
 
 	retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
 				   retval);
-
-dec_nest_level:
-	tcf_mirred_nest_level_dec();
+	xmit->sched_mirred_nest--;
 
 	return retval;
 }
-- 
cgit v1.2.3


From 178ca30889a13b555dddab7689fd2cc58c8e5dac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Oct 2025 17:19:05 +0000
Subject: Revert "net/sched: Fix mirred deadlock on device recursion"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commits 0f022d32c3eca477fbf79a205243a6123ed0fe11
and 44180feaccf266d9b0b28cc4ceaac019817deb5c.

The prior patch in this series implemented loop detection
in act_mirred, so we can remove q->owner to save some cycles
in the fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 1 -
 net/core/dev.c            | 6 ------
 net/sched/sch_generic.c   | 2 --
 3 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 738cd5b13c62..32e9961570b4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -117,7 +117,6 @@ struct Qdisc {
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue	qstats;
-	int                     owner;
 	unsigned long		state;
 	unsigned long		state2; /* must be written under qdisc spinlock */
 	struct Qdisc            *next_sched;
diff --git a/net/core/dev.c b/net/core/dev.c
index 35010faf0b78..1d8e7a76d83b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4167,10 +4167,6 @@ no_lock_out:
 		return rc;
 	}
 
-	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
-		return NET_XMIT_DROP;
-	}
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -4210,9 +4206,7 @@ no_lock_out:
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		WRITE_ONCE(q->owner, smp_processor_id());
 		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		WRITE_ONCE(q->owner, -1);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1e008a228ebd..dfa8e8e667d2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -679,7 +679,6 @@ struct Qdisc noop_qdisc = {
 		.qlen = 0,
 		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
 	},
-	.owner = -1,
 };
 EXPORT_SYMBOL(noop_qdisc);
 
@@ -985,7 +984,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
-	sch->owner = -1;
 	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
 	refcount_set(&sch->refcnt, 1);
 
-- 
cgit v1.2.3


From 526f5fb112f7c89c5a9b8b2f9870c8cb76ca4e42 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Oct 2025 17:19:06 +0000
Subject: net: sched: claim one cache line in Qdisc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace state2 field with a boolean.

Move it to a hole between qstats and state so that
we shrink Qdisc by a full cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 32e9961570b4..31561291bc92 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -41,13 +41,6 @@ enum qdisc_state_t {
 	__QDISC_STATE_DRAINING,
 };
 
-enum qdisc_state2_t {
-	/* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
-	 * Use qdisc_run_begin/end() or qdisc_is_running() instead.
-	 */
-	__QDISC_STATE2_RUNNING,
-};
-
 #define QDISC_STATE_MISSED	BIT(__QDISC_STATE_MISSED)
 #define QDISC_STATE_DRAINING	BIT(__QDISC_STATE_DRAINING)
 
@@ -117,8 +110,8 @@ struct Qdisc {
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue	qstats;
+	bool			running; /* must be written under qdisc spinlock */
 	unsigned long		state;
-	unsigned long		state2; /* must be written under qdisc spinlock */
 	struct Qdisc            *next_sched;
 	struct sk_buff_head	skb_bad_txq;
 
@@ -167,7 +160,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
 		return spin_is_locked(&qdisc->seqlock);
-	return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	return READ_ONCE(qdisc->running);
 }
 
 static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
@@ -210,7 +203,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 		 */
 		return spin_trylock(&qdisc->seqlock);
 	}
-	return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	if (READ_ONCE(qdisc->running))
+		return false;
+	WRITE_ONCE(qdisc->running, true);
+	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
@@ -228,7 +224,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 				      &qdisc->state)))
 			__netif_schedule(qdisc);
 	} else {
-		__clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+		WRITE_ONCE(qdisc->running, false);
 	}
 }
 
-- 
cgit v1.2.3


From 100dfa74cad9d4665cdcf0cc8e673b123a3ea910 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Oct 2025 17:19:07 +0000
Subject: net: dev_queue_xmit() llist adoption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove busylock spinlock and use a lockless list (llist)
to reduce spinlock contention to the minimum.

The idea is that only one CPU might spin on the qdisc spinlock,
while the others simply add their skb to the llist.

After this patch, we get a 300 % improvement on heavy TX workloads.
- Sending twice the number of packets per second.
- While consuming 50 % less cycles.

Note that this also makes it possible, in the future, to submit batches
to various qdisc->enqueue() methods.

Tested:

- Dual Intel(R) Xeon(R) 6985P-C  (480 hyper threads).
- 100Gbit NIC, 30 TX queues with FQ packet scheduler.
- echo 64 >/sys/kernel/slab/skbuff_small_head/cpu_partial (avoid contention in mm)
- 240 concurrent "netperf -t UDP_STREAM -- -m 120 -n"

Before:

16 Mpps (41 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
243  0      0 2368988672  51036 1100852    0    0   146     1  242   60  0  9 91  0  0
244  0      0 2368988672  51036 1100852    0    0   536    10 487745 14718  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   512     0 503067 46033  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   512     0 494807 12107  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   702    26 492845 10110  0 52 48  0  0

Lock contention (1 second sample taken on 8 cores)
perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

    442111      6.79 s     162.47 ms     15.35 us     spinlock   dev_hard_start_xmit+0xcd
      5961      9.57 ms      8.12 us      1.60 us     spinlock   __dev_queue_xmit+0x3a0
       244    560.63 us      7.63 us      2.30 us     spinlock   do_softirq+0x5b
        13     25.09 us      3.21 us      1.93 us     spinlock   net_tx_action+0xf8

If netperf threads are pinned, spinlock stress is very high.
perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

    964508      7.10 s     147.25 ms      7.36 us     spinlock   dev_hard_start_xmit+0xcd
       201    268.05 us      4.65 us      1.33 us     spinlock   __dev_queue_xmit+0x3a0
        12     26.05 us      3.84 us      2.17 us     spinlock   do_softirq+0x5b

@__dev_queue_xmit_ns:
[256, 512)            21 |                                                    |
[512, 1K)            631 |                                                    |
[1K, 2K)           27328 |@                                                   |
[2K, 4K)          265392 |@@@@@@@@@@@@@@@@                                    |
[4K, 8K)          417543 |@@@@@@@@@@@@@@@@@@@@@@@@@@                          |
[8K, 16K)         826292 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[16K, 32K)        733822 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
[32K, 64K)         19055 |@                                                   |
[64K, 128K)        17240 |@                                                   |
[128K, 256K)       25633 |@                                                   |
[256K, 512K)           4 |                                                    |

After:

29 Mpps (57 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
78  0      0 2369573632  32896 1350988    0    0    22     0  331  254  0  8 92  0  0
75  0      0 2369573632  32896 1350988    0    0    22    50 425713 280199  0 23 76  0  0
104  0      0 2369573632  32896 1350988    0    0   290     0 430238 298247  0 23 76  0  0
86  0      0 2369573632  32896 1350988    0    0   132     0 428019 291865  0 24 76  0  0
90  0      0 2369573632  32896 1350988    0    0   502     0 422498 278672  0 23 76  0  0

perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

      2524    116.15 ms    486.61 us     46.02 us     spinlock   __dev_queue_xmit+0x55b
      5821    107.18 ms    371.67 us     18.41 us     spinlock   dev_hard_start_xmit+0xcd
      2377      9.73 ms     35.86 us      4.09 us     spinlock   ___slab_alloc+0x4e0
       923      5.74 ms     20.91 us      6.22 us     spinlock   ___slab_alloc+0x5c9
       121      3.42 ms    193.05 us     28.24 us     spinlock   net_tx_action+0xf8
         6    564.33 us    167.60 us     94.05 us     spinlock   do_softirq+0x5b

If netperf threads are pinned (~54 Mpps)
perf lock record -C0-7 sleep 1; perf lock contention
     32907    316.98 ms    195.98 us      9.63 us     spinlock   dev_hard_start_xmit+0xcd
      4507     61.83 ms    212.73 us     13.72 us     spinlock   __dev_queue_xmit+0x554
      2781     23.53 ms     40.03 us      8.46 us     spinlock   ___slab_alloc+0x5c9
      3554     18.94 ms     34.69 us      5.33 us     spinlock   ___slab_alloc+0x4e0
       233      9.09 ms    215.70 us     38.99 us     spinlock   do_softirq+0x5b
       153    930.66 us     48.67 us      6.08 us     spinlock   net_tx_action+0xfd
        84    331.10 us     14.22 us      3.94 us     spinlock   ___slab_alloc+0x5c9
       140    323.71 us      9.94 us      2.31 us     spinlock   ___slab_alloc+0x4e0

@__dev_queue_xmit_ns:
[128, 256)       1539830 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                  |
[256, 512)       2299558 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[512, 1K)         483936 |@@@@@@@@@@                                          |
[1K, 2K)          265345 |@@@@@@                                              |
[2K, 4K)          145463 |@@@                                                 |
[4K, 8K)           54571 |@                                                   |
[8K, 16K)          10270 |                                                    |
[16K, 32K)          9385 |                                                    |
[32K, 64K)          7749 |                                                    |
[64K, 128K)        26799 |                                                    |
[128K, 256K)        2665 |                                                    |
[256K, 512K)         665 |                                                    |

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h |  4 ++-
 net/core/dev.c            | 91 +++++++++++++++++++++++++++++------------------
 net/sched/sch_generic.c   |  5 ---
 3 files changed, 59 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 31561291bc92..94966692ccdf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -115,7 +115,9 @@ struct Qdisc {
 	struct Qdisc            *next_sched;
 	struct sk_buff_head	skb_bad_txq;
 
-	spinlock_t		busylock ____cacheline_aligned_in_smp;
+	atomic_long_t		defer_count ____cacheline_aligned_in_smp;
+	struct llist_head	defer_list;
+
 	spinlock_t		seqlock;
 
 	struct rcu_head		rcu;
diff --git a/net/core/dev.c b/net/core/dev.c
index 1d8e7a76d83b..821e7c718924 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
 {
+	struct sk_buff *next, *to_free = NULL;
 	spinlock_t *root_lock = qdisc_lock(q);
-	struct sk_buff *to_free = NULL;
-	bool contended;
+	struct llist_node *ll_list, *first_n;
+	unsigned long defer_count = 0;
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
@@ -4167,61 +4168,81 @@ no_lock_out:
 		return rc;
 	}
 
-	/*
-	 * Heuristic to force contended enqueues to serialize on a
-	 * separate lock before trying to get qdisc main lock.
-	 * This permits qdisc->running owner to get the lock more
-	 * often and dequeue packets faster.
-	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
-	 * and then other tasks will only enqueue packets. The packets will be
-	 * sent after the qdisc owner is scheduled again. To prevent this
-	 * scenario the task always serialize on the lock.
+	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+	 * In the try_cmpxchg() loop, we want to increment q->defer_count
+	 * at most once to limit the number of skbs in defer_list.
+	 * We perform the defer_count increment only if the list is not empty,
+	 * because some arches have slow atomic_long_inc_return().
+	 */
+	first_n = READ_ONCE(q->defer_list.first);
+	do {
+		if (first_n && !defer_count) {
+			defer_count = atomic_long_inc_return(&q->defer_count);
+			if (unlikely(defer_count > q->limit)) {
+				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+				return NET_XMIT_DROP;
+			}
+		}
+		skb->ll_node.next = first_n;
+	} while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+	/* If defer_list was not empty, we know the cpu which queued
+	 * the first skb will process the whole list for us.
 	 */
-	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
-	if (unlikely(contended))
-		spin_lock(&q->busylock);
+	if (first_n)
+		return NET_XMIT_SUCCESS;
 
 	spin_lock(root_lock);
+
+	ll_list = llist_del_all(&q->defer_list);
+	/* There is a small race because we clear defer_count not atomically
+	 * with the prior llist_del_all(). This means defer_list could grow
+	 * over q->limit.
+	 */
+	atomic_long_set(&q->defer_count, 0);
+
+	ll_list = llist_reverse_order(ll_list);
+
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		__qdisc_drop(skb, &to_free);
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+			__qdisc_drop(skb, &to_free);
 		rc = NET_XMIT_DROP;
-	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-		   qdisc_run_begin(q)) {
+		goto unlock;
+	}
+	if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+	    !llist_next(ll_list) && qdisc_run_begin(q)) {
 		/*
 		 * This is a work-conserving queue; there are no old skbs
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
 
+		DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+							  struct sk_buff,
+							  ll_node));
 		qdisc_bstats_update(q, skb);
-
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
 			__qdisc_run(q);
-		}
-
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		if (qdisc_run_begin(q)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
-			__qdisc_run(q);
-			qdisc_run_end(q);
+		int count = 0;
+
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+			prefetch(next);
+			skb_mark_not_on_list(skb);
+			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+			count++;
 		}
+		qdisc_run(q);
+		if (count != 1)
+			rc = NET_XMIT_SUCCESS;
 	}
+unlock:
 	spin_unlock(root_lock);
 	if (unlikely(to_free))
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
-	if (unlikely(contended))
-		spin_unlock(&q->busylock);
 	return rc;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dfa8e8e667d2..d9a98d02a55f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = {
 	.ops		=	&noop_qdisc_ops,
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
-	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 	.gso_skb = {
 		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
 		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -970,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		}
 	}
 
-	spin_lock_init(&sch->busylock);
-	lockdep_set_class(&sch->busylock,
-			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
 	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
 	spin_lock_init(&sch->seqlock);
 	lockdep_set_class(&sch->seqlock,
-- 
cgit v1.2.3


From f968a24cad3da72fdff12a0ae5ac0b679439cca1 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Fri, 3 Oct 2025 12:16:38 +0900
Subject: can: treewide: remove can_change_mtu()

can_change_mtu() was made obsolete by commit 23049938605b ("can: populate the
minimum and maximum MTU values"). Now that net_device->min_mtu and
net_device->max_mtu are populated, all the checks are already done by
dev_validate_mtu() in net/core/dev.c.

Remove the net_device_ops->ndo_change_mtu() callback of all the physical
interfaces, then remove can_change_mtu(). Only keep the vcan_change_mtu()
and vxcan_change_mtu() because the virtual interfaces use their own
different MTU logic.

The only functional change this patch introduces is that now the user will
be able to change the MTU even if the interface is up. This does not matter
for Classical CAN and CAN FD because their MTU range is composed of only
one value, respectively CAN_MTU and CANFD_MTU. For the upcoming CAN XL, the
MTU will be configurable within the CANXL_MIN_MTU to CANXL_MAX_MTU range at
any time, even if the interface is up. This is consistent with the other
net protocols and does not contradict ISO 11898-1:2024 as having a
modifiable MTU is a kernel extension.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251003-remove-can_change_mtu-v1-1-337f8bc21181@kernel.org
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/at91_can.c                         |  1 -
 drivers/net/can/bxcan.c                            |  1 -
 drivers/net/can/c_can/c_can_main.c                 |  1 -
 drivers/net/can/can327.c                           |  1 -
 drivers/net/can/cc770/cc770.c                      |  1 -
 drivers/net/can/ctucanfd/ctucanfd_base.c           |  1 -
 drivers/net/can/dev/dev.c                          | 38 ----------------------
 drivers/net/can/esd/esd_402_pci-core.c             |  1 -
 drivers/net/can/flexcan/flexcan-core.c             |  1 -
 drivers/net/can/grcan.c                            |  1 -
 drivers/net/can/ifi_canfd/ifi_canfd.c              |  1 -
 drivers/net/can/janz-ican3.c                       |  1 -
 drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c |  1 -
 drivers/net/can/m_can/m_can.c                      |  1 -
 drivers/net/can/mscan/mscan.c                      |  1 -
 drivers/net/can/peak_canfd/peak_canfd.c            |  1 -
 drivers/net/can/rcar/rcar_can.c                    |  1 -
 drivers/net/can/rcar/rcar_canfd.c                  |  1 -
 drivers/net/can/rockchip/rockchip_canfd-core.c     |  1 -
 drivers/net/can/sja1000/sja1000.c                  |  1 -
 drivers/net/can/slcan/slcan-core.c                 |  1 -
 drivers/net/can/softing/softing_main.c             |  1 -
 drivers/net/can/spi/hi311x.c                       |  1 -
 drivers/net/can/spi/mcp251x.c                      |  1 -
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c     |  1 -
 drivers/net/can/sun4i_can.c                        |  1 -
 drivers/net/can/ti_hecc.c                          |  1 -
 drivers/net/can/usb/ems_usb.c                      |  1 -
 drivers/net/can/usb/esd_usb.c                      |  1 -
 drivers/net/can/usb/etas_es58x/es58x_core.c        |  1 -
 drivers/net/can/usb/f81604.c                       |  1 -
 drivers/net/can/usb/gs_usb.c                       |  1 -
 drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c   |  1 -
 drivers/net/can/usb/mcba_usb.c                     |  1 -
 drivers/net/can/usb/nct6694_canfd.c                |  1 -
 drivers/net/can/usb/peak_usb/pcan_usb_core.c       |  1 -
 drivers/net/can/usb/ucan.c                         |  1 -
 drivers/net/can/usb/usb_8dev.c                     |  1 -
 drivers/net/can/xilinx_can.c                       |  1 -
 include/linux/can/dev.h                            |  1 -
 40 files changed, 77 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 191707d7e3da..c2a3a4eef5b2 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -948,7 +948,6 @@ static const struct net_device_ops at91_netdev_ops = {
 	.ndo_open	= at91_open,
 	.ndo_stop	= at91_close,
 	.ndo_start_xmit	= at91_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops at91_ethtool_ops = {
diff --git a/drivers/net/can/bxcan.c b/drivers/net/can/bxcan.c
index bfc60eb33dc3..9c3af7049814 100644
--- a/drivers/net/can/bxcan.c
+++ b/drivers/net/can/bxcan.c
@@ -881,7 +881,6 @@ static const struct net_device_ops bxcan_netdev_ops = {
 	.ndo_open = bxcan_open,
 	.ndo_stop = bxcan_stop,
 	.ndo_start_xmit = bxcan_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops bxcan_ethtool_ops = {
diff --git a/drivers/net/can/c_can/c_can_main.c b/drivers/net/can/c_can/c_can_main.c
index cc371d0c9f3c..3702cac7fbf0 100644
--- a/drivers/net/can/c_can/c_can_main.c
+++ b/drivers/net/can/c_can/c_can_main.c
@@ -1362,7 +1362,6 @@ static const struct net_device_ops c_can_netdev_ops = {
 	.ndo_open = c_can_open,
 	.ndo_stop = c_can_close,
 	.ndo_start_xmit = c_can_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 int register_c_can_dev(struct net_device *dev)
diff --git a/drivers/net/can/can327.c b/drivers/net/can/can327.c
index 24af63961030..b66fc16aedd2 100644
--- a/drivers/net/can/can327.c
+++ b/drivers/net/can/can327.c
@@ -849,7 +849,6 @@ static const struct net_device_ops can327_netdev_ops = {
 	.ndo_open = can327_netdev_open,
 	.ndo_stop = can327_netdev_close,
 	.ndo_start_xmit = can327_netdev_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops can327_ethtool_ops = {
diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c
index 30909f3aab57..8d5abd643c06 100644
--- a/drivers/net/can/cc770/cc770.c
+++ b/drivers/net/can/cc770/cc770.c
@@ -834,7 +834,6 @@ static const struct net_device_ops cc770_netdev_ops = {
 	.ndo_open = cc770_open,
 	.ndo_stop = cc770_close,
 	.ndo_start_xmit = cc770_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops cc770_ethtool_ops = {
diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c
index 8bd3f0fc385c..1e6b9e3dc2fe 100644
--- a/drivers/net/can/ctucanfd/ctucanfd_base.c
+++ b/drivers/net/can/ctucanfd/ctucanfd_base.c
@@ -1301,7 +1301,6 @@ static const struct net_device_ops ctucan_netdev_ops = {
 	.ndo_open	= ctucan_open,
 	.ndo_stop	= ctucan_close,
 	.ndo_start_xmit	= ctucan_start_xmit,
-	.ndo_change_mtu	= can_change_mtu,
 };
 
 static const struct ethtool_ops ctucan_ethtool_ops = {
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 15ccedbb3f8d..0cc3d008adb3 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -359,44 +359,6 @@ void can_set_default_mtu(struct net_device *dev)
 	}
 }
 
-/* changing MTU and control mode for CAN/CANFD devices */
-int can_change_mtu(struct net_device *dev, int new_mtu)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	u32 ctrlmode_static = can_get_static_ctrlmode(priv);
-
-	/* Do not allow changing the MTU while running */
-	if (dev->flags & IFF_UP)
-		return -EBUSY;
-
-	/* allow change of MTU according to the CANFD ability of the device */
-	switch (new_mtu) {
-	case CAN_MTU:
-		/* 'CANFD-only' controllers can not switch to CAN_MTU */
-		if (ctrlmode_static & CAN_CTRLMODE_FD)
-			return -EINVAL;
-
-		priv->ctrlmode &= ~CAN_CTRLMODE_FD;
-		break;
-
-	case CANFD_MTU:
-		/* check for potential CANFD ability */
-		if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD) &&
-		    !(ctrlmode_static & CAN_CTRLMODE_FD))
-			return -EINVAL;
-
-		priv->ctrlmode |= CAN_CTRLMODE_FD;
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	WRITE_ONCE(dev->mtu, new_mtu);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(can_change_mtu);
-
 /* helper to define static CAN controller features at device creation time */
 int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode)
 {
diff --git a/drivers/net/can/esd/esd_402_pci-core.c b/drivers/net/can/esd/esd_402_pci-core.c
index 5d6d2828cd04..05adecae6375 100644
--- a/drivers/net/can/esd/esd_402_pci-core.c
+++ b/drivers/net/can/esd/esd_402_pci-core.c
@@ -86,7 +86,6 @@ static const struct net_device_ops pci402_acc_netdev_ops = {
 	.ndo_open = acc_open,
 	.ndo_stop = acc_close,
 	.ndo_start_xmit = acc_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
 };
 
diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c
index 06d5d35fc1b5..f5d22c61503f 100644
--- a/drivers/net/can/flexcan/flexcan-core.c
+++ b/drivers/net/can/flexcan/flexcan-core.c
@@ -1867,7 +1867,6 @@ static const struct net_device_ops flexcan_netdev_ops = {
 	.ndo_open	= flexcan_open,
 	.ndo_stop	= flexcan_close,
 	.ndo_start_xmit	= flexcan_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static int register_flexcandev(struct net_device *dev)
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index c5784d9779ef..3b1b09943436 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -1561,7 +1561,6 @@ static const struct net_device_ops grcan_netdev_ops = {
 	.ndo_open	= grcan_open,
 	.ndo_stop	= grcan_close,
 	.ndo_start_xmit	= grcan_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops grcan_ethtool_ops = {
diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c
index 2eeee65f606f..0f83335e4d07 100644
--- a/drivers/net/can/ifi_canfd/ifi_canfd.c
+++ b/drivers/net/can/ifi_canfd/ifi_canfd.c
@@ -944,7 +944,6 @@ static const struct net_device_ops ifi_canfd_netdev_ops = {
 	.ndo_open	= ifi_canfd_open,
 	.ndo_stop	= ifi_canfd_close,
 	.ndo_start_xmit	= ifi_canfd_start_xmit,
-	.ndo_change_mtu	= can_change_mtu,
 };
 
 static const struct ethtool_ops ifi_canfd_ethtool_ops = {
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index bfa5cbe88017..1efdd1fd8caa 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -1752,7 +1752,6 @@ static const struct net_device_ops ican3_netdev_ops = {
 	.ndo_open	= ican3_open,
 	.ndo_stop	= ican3_stop,
 	.ndo_start_xmit	= ican3_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops ican3_ethtool_ops = {
diff --git a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
index 0880023611be..705f9bb74cd2 100644
--- a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
+++ b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
@@ -904,7 +904,6 @@ static const struct net_device_ops kvaser_pciefd_netdev_ops = {
 	.ndo_stop = kvaser_pciefd_stop,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
 	.ndo_start_xmit = kvaser_pciefd_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static int kvaser_pciefd_set_phys_id(struct net_device *netdev,
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 48b7a67336b5..873f5991fc5a 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -2148,7 +2148,6 @@ static const struct net_device_ops m_can_netdev_ops = {
 	.ndo_open = m_can_open,
 	.ndo_stop = m_can_close,
 	.ndo_start_xmit = m_can_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static int m_can_get_coalesce(struct net_device *dev,
diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c
index 8c2a7bc64d3d..39c7aa2a0b2f 100644
--- a/drivers/net/can/mscan/mscan.c
+++ b/drivers/net/can/mscan/mscan.c
@@ -607,7 +607,6 @@ static const struct net_device_ops mscan_netdev_ops = {
 	.ndo_open	= mscan_open,
 	.ndo_stop	= mscan_close,
 	.ndo_start_xmit	= mscan_start_xmit,
-	.ndo_change_mtu	= can_change_mtu,
 };
 
 static const struct ethtool_ops mscan_ethtool_ops = {
diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c
index b5bc80ac7876..a53c9d347b7b 100644
--- a/drivers/net/can/peak_canfd/peak_canfd.c
+++ b/drivers/net/can/peak_canfd/peak_canfd.c
@@ -773,7 +773,6 @@ static const struct net_device_ops peak_canfd_netdev_ops = {
 	.ndo_stop = peak_canfd_close,
 	.ndo_eth_ioctl = peak_eth_ioctl,
 	.ndo_start_xmit = peak_canfd_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static int peak_get_ts_info(struct net_device *dev,
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index 5f85f4e27205..fc3df328e877 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -635,7 +635,6 @@ static const struct net_device_ops rcar_can_netdev_ops = {
 	.ndo_open = rcar_can_open,
 	.ndo_stop = rcar_can_close,
 	.ndo_start_xmit = rcar_can_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops rcar_can_ethtool_ops = {
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index 45d36adb51b7..49ab65274b51 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -1818,7 +1818,6 @@ static const struct net_device_ops rcar_canfd_netdev_ops = {
 	.ndo_open = rcar_canfd_open,
 	.ndo_stop = rcar_canfd_close,
 	.ndo_start_xmit = rcar_canfd_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops rcar_canfd_ethtool_ops = {
diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c
index 046f0a0ae4d4..29de0c01e4ed 100644
--- a/drivers/net/can/rockchip/rockchip_canfd-core.c
+++ b/drivers/net/can/rockchip/rockchip_canfd-core.c
@@ -761,7 +761,6 @@ static const struct net_device_ops rkcanfd_netdev_ops = {
 	.ndo_open = rkcanfd_open,
 	.ndo_stop = rkcanfd_stop,
 	.ndo_start_xmit = rkcanfd_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static int __maybe_unused rkcanfd_runtime_suspend(struct device *dev)
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 4d245857ef1c..acfa49db3907 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -697,7 +697,6 @@ static const struct net_device_ops sja1000_netdev_ops = {
 	.ndo_open	= sja1000_open,
 	.ndo_stop	= sja1000_close,
 	.ndo_start_xmit	= sja1000_start_xmit,
-	.ndo_change_mtu	= can_change_mtu,
 };
 
 static const struct ethtool_ops sja1000_ethtool_ops = {
diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c
index 58ff2ec1d975..cd789e178d34 100644
--- a/drivers/net/can/slcan/slcan-core.c
+++ b/drivers/net/can/slcan/slcan-core.c
@@ -774,7 +774,6 @@ static const struct net_device_ops slcan_netdev_ops = {
 	.ndo_open               = slcan_netdev_open,
 	.ndo_stop               = slcan_netdev_close,
 	.ndo_start_xmit         = slcan_netdev_xmit,
-	.ndo_change_mtu         = can_change_mtu,
 };
 
 /******************************************
diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c
index 278ee8722770..79bc64395ac4 100644
--- a/drivers/net/can/softing/softing_main.c
+++ b/drivers/net/can/softing/softing_main.c
@@ -609,7 +609,6 @@ static const struct net_device_ops softing_netdev_ops = {
 	.ndo_open = softing_netdev_open,
 	.ndo_stop = softing_netdev_stop,
 	.ndo_start_xmit	= softing_netdev_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops softing_ethtool_ops = {
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index 6d4b643e135f..e00d3dbc4cf4 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -799,7 +799,6 @@ static const struct net_device_ops hi3110_netdev_ops = {
 	.ndo_open = hi3110_open,
 	.ndo_stop = hi3110_stop,
 	.ndo_start_xmit = hi3110_hard_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops hi3110_ethtool_ops = {
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index b797e08499d7..1e54e1a22702 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -1270,7 +1270,6 @@ static const struct net_device_ops mcp251x_netdev_ops = {
 	.ndo_open = mcp251x_open,
 	.ndo_stop = mcp251x_stop,
 	.ndo_start_xmit = mcp251x_hard_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops mcp251x_ethtool_ops = {
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 7450ea42c1ea..9402530ba3d4 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -1715,7 +1715,6 @@ static const struct net_device_ops mcp251xfd_netdev_ops = {
 	.ndo_stop = mcp251xfd_stop,
 	.ndo_start_xmit	= mcp251xfd_start_xmit,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static void
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index 53bfd873de9b..6fcb301ef611 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -768,7 +768,6 @@ static const struct net_device_ops sun4ican_netdev_ops = {
 	.ndo_open = sun4ican_open,
 	.ndo_stop = sun4ican_close,
 	.ndo_start_xmit = sun4ican_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops sun4ican_ethtool_ops = {
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index e6d6661a908a..1d3dbf28b105 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -829,7 +829,6 @@ static const struct net_device_ops ti_hecc_netdev_ops = {
 	.ndo_open		= ti_hecc_open,
 	.ndo_stop		= ti_hecc_close,
 	.ndo_start_xmit		= ti_hecc_xmit,
-	.ndo_change_mtu		= can_change_mtu,
 };
 
 static const struct ethtool_ops ti_hecc_ethtool_ops = {
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index 5355bac4dccb..de8e212a1366 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -885,7 +885,6 @@ static const struct net_device_ops ems_usb_netdev_ops = {
 	.ndo_open = ems_usb_open,
 	.ndo_stop = ems_usb_close,
 	.ndo_start_xmit = ems_usb_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops ems_usb_ethtool_ops = {
diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c
index 9bc1824d7be6..08da507faef4 100644
--- a/drivers/net/can/usb/esd_usb.c
+++ b/drivers/net/can/usb/esd_usb.c
@@ -1011,7 +1011,6 @@ static const struct net_device_ops esd_usb_netdev_ops = {
 	.ndo_open = esd_usb_open,
 	.ndo_stop = esd_usb_close,
 	.ndo_start_xmit = esd_usb_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops esd_usb_ethtool_ops = {
diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c
index adc91873c083..47d9e03f3044 100644
--- a/drivers/net/can/usb/etas_es58x/es58x_core.c
+++ b/drivers/net/can/usb/etas_es58x/es58x_core.c
@@ -1977,7 +1977,6 @@ static const struct net_device_ops es58x_netdev_ops = {
 	.ndo_stop = es58x_stop,
 	.ndo_start_xmit = es58x_start_xmit,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops es58x_ethtool_ops = {
diff --git a/drivers/net/can/usb/f81604.c b/drivers/net/can/usb/f81604.c
index e0cfa1460b0b..efe61ece79ea 100644
--- a/drivers/net/can/usb/f81604.c
+++ b/drivers/net/can/usb/f81604.c
@@ -1052,7 +1052,6 @@ static const struct net_device_ops f81604_netdev_ops = {
 	.ndo_open = f81604_open,
 	.ndo_stop = f81604_close,
 	.ndo_start_xmit = f81604_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct can_bittiming_const f81604_bittiming_const = {
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 69b8d6da651b..30608901a974 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -1101,7 +1101,6 @@ static const struct net_device_ops gs_usb_netdev_ops = {
 	.ndo_open = gs_can_open,
 	.ndo_stop = gs_can_close,
 	.ndo_start_xmit = gs_can_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 	.ndo_eth_ioctl = gs_can_eth_ioctl,
 };
 
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
index 90e77fa0ff4a..89e22b66f919 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
@@ -786,7 +786,6 @@ static const struct net_device_ops kvaser_usb_netdev_ops = {
 	.ndo_stop = kvaser_usb_close,
 	.ndo_eth_ioctl = can_eth_ioctl_hwts,
 	.ndo_start_xmit = kvaser_usb_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops kvaser_usb_ethtool_ops = {
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index 1f9b915094e6..41c0a1c399bf 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -761,7 +761,6 @@ static const struct net_device_ops mcba_netdev_ops = {
 	.ndo_open = mcba_usb_open,
 	.ndo_stop = mcba_usb_close,
 	.ndo_start_xmit = mcba_usb_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops mcba_ethtool_ops = {
diff --git a/drivers/net/can/usb/nct6694_canfd.c b/drivers/net/can/usb/nct6694_canfd.c
index 8deff16491a1..dd6df2ec3742 100644
--- a/drivers/net/can/usb/nct6694_canfd.c
+++ b/drivers/net/can/usb/nct6694_canfd.c
@@ -690,7 +690,6 @@ static const struct net_device_ops nct6694_canfd_netdev_ops = {
 	.ndo_open = nct6694_canfd_open,
 	.ndo_stop = nct6694_canfd_close,
 	.ndo_start_xmit = nct6694_canfd_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops nct6694_canfd_ethtool_ops = {
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index c74302ca7cee..94b1d7f15d27 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -814,7 +814,6 @@ static const struct net_device_ops peak_usb_netdev_ops = {
 	.ndo_stop = peak_usb_ndo_stop,
 	.ndo_eth_ioctl = peak_eth_ioctl,
 	.ndo_start_xmit = peak_usb_ndo_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 /* CAN-USB devices generally handle 32-bit CAN channel IDs.
diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
index 07406daf7c88..de61d9da99e3 100644
--- a/drivers/net/can/usb/ucan.c
+++ b/drivers/net/can/usb/ucan.c
@@ -1233,7 +1233,6 @@ static const struct net_device_ops ucan_netdev_ops = {
 	.ndo_open = ucan_open,
 	.ndo_stop = ucan_close,
 	.ndo_start_xmit = ucan_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops ucan_ethtool_ops = {
diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index 8a5596ce4e46..7449328f7cd7 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c
@@ -868,7 +868,6 @@ static const struct net_device_ops usb_8dev_netdev_ops = {
 	.ndo_open = usb_8dev_open,
 	.ndo_stop = usb_8dev_close,
 	.ndo_start_xmit = usb_8dev_start_xmit,
-	.ndo_change_mtu = can_change_mtu,
 };
 
 static const struct ethtool_ops usb_8dev_ethtool_ops = {
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index a25a3ca62c12..43d7f22820b8 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -1702,7 +1702,6 @@ static const struct net_device_ops xcan_netdev_ops = {
 	.ndo_open	= xcan_open,
 	.ndo_stop	= xcan_close,
 	.ndo_start_xmit	= xcan_start_xmit,
-	.ndo_change_mtu	= can_change_mtu,
 };
 
 static const struct ethtool_ops xcan_ethtool_ops = {
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index a2229a61ccde..0fe8f80f223e 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -127,7 +127,6 @@ struct can_priv *safe_candev_priv(struct net_device *dev);
 int open_candev(struct net_device *dev);
 void close_candev(struct net_device *dev);
 void can_set_default_mtu(struct net_device *dev);
-int can_change_mtu(struct net_device *dev, int new_mtu);
 int __must_check can_set_static_ctrlmode(struct net_device *dev,
 					 u32 static_mode);
 int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd);
-- 
cgit v1.2.3


From 6c4fed5fee42f5785e881ef2c28359724b18b80e Mon Sep 17 00:00:00 2001
From: Harsh Jain <h.jain@amd.com>
Date: Mon, 15 Sep 2025 19:00:25 +0530
Subject: crypto: drbg - Export CTR DRBG DF functions

Export the drbg_ctr_df() derivation function to the new module df_sp80090a.

Signed-off-by: Harsh Jain <h.jain@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig                 |   8 +-
 crypto/Makefile                |   2 +
 crypto/df_sp80090a.c           | 247 +++++++++++++++++++++++++++++++++++++++++
 crypto/drbg.c                  | 244 +---------------------------------------
 drivers/crypto/Kconfig         |   1 +
 include/crypto/df_sp80090a.h   |  27 +++++
 include/crypto/drbg.h          |  25 +----
 include/crypto/internal/drbg.h |  54 +++++++++
 8 files changed, 343 insertions(+), 265 deletions(-)
 create mode 100644 crypto/df_sp80090a.c
 create mode 100644 include/crypto/df_sp80090a.h
 create mode 100644 include/crypto/internal/drbg.h

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index a04595f9d0ca..b9afd8505b89 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1205,8 +1205,7 @@ config CRYPTO_DRBG_HASH
 
 config CRYPTO_DRBG_CTR
 	bool "CTR_DRBG"
-	select CRYPTO_AES
-	select CRYPTO_CTR
+	select CRYPTO_DF80090A
 	help
 	  CTR_DRBG variant as defined in NIST SP800-90A.
 
@@ -1342,6 +1341,11 @@ config CRYPTO_KDF800108_CTR
 	select CRYPTO_HMAC
 	select CRYPTO_SHA256
 
+config CRYPTO_DF80090A
+	tristate
+	select CRYPTO_AES
+	select CRYPTO_CTR
+
 endmenu
 menu "Userspace interface"
 
diff --git a/crypto/Makefile b/crypto/Makefile
index e430e6e99b6a..c47f2bf5db61 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -209,4 +209,6 @@ obj-$(CONFIG_CRYPTO_SIMD) += crypto_simd.o
 #
 obj-$(CONFIG_CRYPTO_KDF800108_CTR) += kdf_sp800108.o
 
+obj-$(CONFIG_CRYPTO_DF80090A) += df_sp80090a.o
+
 obj-$(CONFIG_CRYPTO_KRB5) += krb5/
diff --git a/crypto/df_sp80090a.c b/crypto/df_sp80090a.c
new file mode 100644
index 000000000000..8309e62abe27
--- /dev/null
+++ b/crypto/df_sp80090a.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * NIST SP800-90A DRBG derivation function
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <crypto/df_sp80090a.h>
+#include <crypto/internal/drbg.h>
+
+static void drbg_kcapi_symsetkey(struct crypto_cipher *tfm,
+				 const unsigned char *key,
+				 u8 keylen);
+static int drbg_kcapi_sym(struct crypto_cipher *tfm, unsigned char *outval,
+			  const struct drbg_string *in, u8 blocklen_bytes);
+
+static void drbg_kcapi_symsetkey(struct crypto_cipher *tfm,
+				 const unsigned char *key, u8 keylen)
+{
+	crypto_cipher_setkey(tfm, key, keylen);
+}
+
+static int drbg_kcapi_sym(struct crypto_cipher *tfm, unsigned char *outval,
+			  const struct drbg_string *in, u8 blocklen_bytes)
+{
+	/* there is only component in *in */
+	BUG_ON(in->len < blocklen_bytes);
+	crypto_cipher_encrypt_one(tfm, outval, in->buf);
+	return 0;
+}
+
+/* BCC function for CTR DRBG as defined in 10.4.3 */
+
+static int drbg_ctr_bcc(struct crypto_cipher *tfm,
+			unsigned char *out, const unsigned char *key,
+			struct list_head *in,
+			u8 blocklen_bytes,
+			u8 keylen)
+{
+	int ret = 0;
+	struct drbg_string *curr = NULL;
+	struct drbg_string data;
+	short cnt = 0;
+
+	drbg_string_fill(&data, out, blocklen_bytes);
+
+	/* 10.4.3 step 2 / 4 */
+	drbg_kcapi_symsetkey(tfm, key, keylen);
+	list_for_each_entry(curr, in, list) {
+		const unsigned char *pos = curr->buf;
+		size_t len = curr->len;
+		/* 10.4.3 step 4.1 */
+		while (len) {
+			/* 10.4.3 step 4.2 */
+			if (blocklen_bytes == cnt) {
+				cnt = 0;
+				ret = drbg_kcapi_sym(tfm, out, &data, blocklen_bytes);
+				if (ret)
+					return ret;
+			}
+			out[cnt] ^= *pos;
+			pos++;
+			cnt++;
+			len--;
+		}
+	}
+	/* 10.4.3 step 4.2 for last block */
+	if (cnt)
+		ret = drbg_kcapi_sym(tfm, out, &data, blocklen_bytes);
+
+	return ret;
+}
+
+/*
+ * scratchpad usage: drbg_ctr_update is interlinked with crypto_drbg_ctr_df
+ * (and drbg_ctr_bcc, but this function does not need any temporary buffers),
+ * the scratchpad is used as follows:
+ * drbg_ctr_update:
+ *	temp
+ *		start: drbg->scratchpad
+ *		length: drbg_statelen(drbg) + drbg_blocklen(drbg)
+ *			note: the cipher writing into this variable works
+ *			blocklen-wise. Now, when the statelen is not a multiple
+ *			of blocklen, the generation loop below "spills over"
+ *			by at most blocklen. Thus, we need to give sufficient
+ *			memory.
+ *	df_data
+ *		start: drbg->scratchpad +
+ *				drbg_statelen(drbg) + drbg_blocklen(drbg)
+ *		length: drbg_statelen(drbg)
+ *
+ * crypto_drbg_ctr_df:
+ *	pad
+ *		start: df_data + drbg_statelen(drbg)
+ *		length: drbg_blocklen(drbg)
+ *	iv
+ *		start: pad + drbg_blocklen(drbg)
+ *		length: drbg_blocklen(drbg)
+ *	temp
+ *		start: iv + drbg_blocklen(drbg)
+ *		length: drbg_statelen(drbg) + drbg_blocklen(drbg)
+ *			note: temp is the buffer that the BCC function operates
+ *			on. BCC operates blockwise. drbg_statelen(drbg)
+ *			is sufficient when the DRBG state length is a multiple
+ *			of the block size. For AES192 (and maybe other ciphers)
+ *			this is not correct and the length for temp is
+ *			insufficient (yes, that also means for such ciphers,
+ *			the final output of all BCC rounds are truncated).
+ *			Therefore, add drbg_blocklen(drbg) to cover all
+ *			possibilities.
+ * refer to crypto_drbg_ctr_df_datalen() to get required length
+ */
+
+/* Derivation Function for CTR DRBG as defined in 10.4.2 */
+int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
+		       unsigned char *df_data, size_t bytes_to_return,
+		       struct list_head *seedlist,
+		       u8 blocklen_bytes,
+		       u8 statelen)
+{
+	int ret = -EFAULT;
+	unsigned char L_N[8];
+	/* S3 is input */
+	struct drbg_string S1, S2, S4, cipherin;
+	LIST_HEAD(bcc_list);
+	unsigned char *pad = df_data + statelen;
+	unsigned char *iv = pad + blocklen_bytes;
+	unsigned char *temp = iv + blocklen_bytes;
+	size_t padlen = 0;
+	unsigned int templen = 0;
+	/* 10.4.2 step 7 */
+	unsigned int i = 0;
+	/* 10.4.2 step 8 */
+	const unsigned char *K = (unsigned char *)
+			   "\x00\x01\x02\x03\x04\x05\x06\x07"
+			   "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+			   "\x10\x11\x12\x13\x14\x15\x16\x17"
+			   "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
+	unsigned char *X;
+	size_t generated_len = 0;
+	size_t inputlen = 0;
+	struct drbg_string *seed = NULL;
+	u8 keylen;
+
+	memset(pad, 0, blocklen_bytes);
+	memset(iv, 0, blocklen_bytes);
+	keylen = statelen - blocklen_bytes;
+	/* 10.4.2 step 1 is implicit as we work byte-wise */
+
+	/* 10.4.2 step 2 */
+	if ((512 / 8) < bytes_to_return)
+		return -EINVAL;
+
+	/* 10.4.2 step 2 -- calculate the entire length of all input data */
+	list_for_each_entry(seed, seedlist, list)
+		inputlen += seed->len;
+	drbg_cpu_to_be32(inputlen, &L_N[0]);
+
+	/* 10.4.2 step 3 */
+	drbg_cpu_to_be32(bytes_to_return, &L_N[4]);
+
+	/* 10.4.2 step 5: length is L_N, input_string, one byte, padding */
+	padlen = (inputlen + sizeof(L_N) + 1) % (blocklen_bytes);
+	/* wrap the padlen appropriately */
+	if (padlen)
+		padlen = blocklen_bytes - padlen;
+	/*
+	 * pad / padlen contains the 0x80 byte and the following zero bytes.
+	 * As the calculated padlen value only covers the number of zero
+	 * bytes, this value has to be incremented by one for the 0x80 byte.
+	 */
+	padlen++;
+	pad[0] = 0x80;
+
+	/* 10.4.2 step 4 -- first fill the linked list and then order it */
+	drbg_string_fill(&S1, iv, blocklen_bytes);
+	list_add_tail(&S1.list, &bcc_list);
+	drbg_string_fill(&S2, L_N, sizeof(L_N));
+	list_add_tail(&S2.list, &bcc_list);
+	list_splice_tail(seedlist, &bcc_list);
+	drbg_string_fill(&S4, pad, padlen);
+	list_add_tail(&S4.list, &bcc_list);
+
+	/* 10.4.2 step 9 */
+	while (templen < (keylen + (blocklen_bytes))) {
+		/*
+		 * 10.4.2 step 9.1 - the padding is implicit as the buffer
+		 * holds zeros after allocation -- even the increment of i
+		 * is irrelevant as the increment remains within length of i
+		 */
+		drbg_cpu_to_be32(i, iv);
+		/* 10.4.2 step 9.2 -- BCC and concatenation with temp */
+		ret = drbg_ctr_bcc(tfm, temp + templen, K, &bcc_list,
+				   blocklen_bytes, keylen);
+		if (ret)
+			goto out;
+		/* 10.4.2 step 9.3 */
+		i++;
+		templen += blocklen_bytes;
+	}
+
+	/* 10.4.2 step 11 */
+	X = temp + (keylen);
+	drbg_string_fill(&cipherin, X, blocklen_bytes);
+
+	/* 10.4.2 step 12: overwriting of outval is implemented in next step */
+
+	/* 10.4.2 step 13 */
+	drbg_kcapi_symsetkey(tfm, temp, keylen);
+	while (generated_len < bytes_to_return) {
+		short blocklen = 0;
+		/*
+		 * 10.4.2 step 13.1: the truncation of the key length is
+		 * implicit as the key is only drbg_blocklen in size based on
+		 * the implementation of the cipher function callback
+		 */
+		ret = drbg_kcapi_sym(tfm, X, &cipherin, blocklen_bytes);
+		if (ret)
+			goto out;
+		blocklen = (blocklen_bytes <
+				(bytes_to_return - generated_len)) ?
+			    blocklen_bytes :
+				(bytes_to_return - generated_len);
+		/* 10.4.2 step 13.2 and 14 */
+		memcpy(df_data + generated_len, X, blocklen);
+		generated_len += blocklen;
+	}
+
+	ret = 0;
+
+out:
+	memset(iv, 0, blocklen_bytes);
+	memset(temp, 0, statelen + blocklen_bytes);
+	memset(pad, 0, blocklen_bytes);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(crypto_drbg_ctr_df);
+
+MODULE_IMPORT_NS("CRYPTO_INTERNAL");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
+MODULE_DESCRIPTION("Derivation Function conformant to SP800-90A");
diff --git a/crypto/drbg.c b/crypto/drbg.c
index dbe4c8bb5ceb..bad005eef03d 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -98,6 +98,7 @@
  */
 
 #include <crypto/drbg.h>
+#include <crypto/df_sp80090a.h>
 #include <crypto/internal/cipher.h>
 #include <linux/kernel.h>
 #include <linux/jiffies.h>
@@ -261,26 +262,6 @@ static int drbg_fips_continuous_test(struct drbg_state *drbg,
 	return 0;
 }
 
-/*
- * Convert an integer into a byte representation of this integer.
- * The byte representation is big-endian
- *
- * @val value to be converted
- * @buf buffer holding the converted integer -- caller must ensure that
- *      buffer size is at least 32 bit
- */
-#if (defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR))
-static inline void drbg_cpu_to_be32(__u32 val, unsigned char *buf)
-{
-	struct s {
-		__be32 conv;
-	};
-	struct s *conversion = (struct s *) buf;
-
-	conversion->conv = cpu_to_be32(val);
-}
-#endif /* defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR) */
-
 /******************************************************************
  * CTR DRBG callback functions
  ******************************************************************/
@@ -294,10 +275,6 @@ MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes192");
 MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes128");
 MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes128");
 
-static void drbg_kcapi_symsetkey(struct drbg_state *drbg,
-				 const unsigned char *key);
-static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval,
-			  const struct drbg_string *in);
 static int drbg_init_sym_kernel(struct drbg_state *drbg);
 static int drbg_fini_sym_kernel(struct drbg_state *drbg);
 static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
@@ -305,202 +282,12 @@ static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
 			      u8 *outbuf, u32 outlen);
 #define DRBG_OUTSCRATCHLEN 256
 
-/* BCC function for CTR DRBG as defined in 10.4.3 */
-static int drbg_ctr_bcc(struct drbg_state *drbg,
-			unsigned char *out, const unsigned char *key,
-			struct list_head *in)
-{
-	int ret = 0;
-	struct drbg_string *curr = NULL;
-	struct drbg_string data;
-	short cnt = 0;
-
-	drbg_string_fill(&data, out, drbg_blocklen(drbg));
-
-	/* 10.4.3 step 2 / 4 */
-	drbg_kcapi_symsetkey(drbg, key);
-	list_for_each_entry(curr, in, list) {
-		const unsigned char *pos = curr->buf;
-		size_t len = curr->len;
-		/* 10.4.3 step 4.1 */
-		while (len) {
-			/* 10.4.3 step 4.2 */
-			if (drbg_blocklen(drbg) == cnt) {
-				cnt = 0;
-				ret = drbg_kcapi_sym(drbg, out, &data);
-				if (ret)
-					return ret;
-			}
-			out[cnt] ^= *pos;
-			pos++;
-			cnt++;
-			len--;
-		}
-	}
-	/* 10.4.3 step 4.2 for last block */
-	if (cnt)
-		ret = drbg_kcapi_sym(drbg, out, &data);
-
-	return ret;
-}
-
-/*
- * scratchpad usage: drbg_ctr_update is interlinked with drbg_ctr_df
- * (and drbg_ctr_bcc, but this function does not need any temporary buffers),
- * the scratchpad is used as follows:
- * drbg_ctr_update:
- *	temp
- *		start: drbg->scratchpad
- *		length: drbg_statelen(drbg) + drbg_blocklen(drbg)
- *			note: the cipher writing into this variable works
- *			blocklen-wise. Now, when the statelen is not a multiple
- *			of blocklen, the generateion loop below "spills over"
- *			by at most blocklen. Thus, we need to give sufficient
- *			memory.
- *	df_data
- *		start: drbg->scratchpad +
- *				drbg_statelen(drbg) + drbg_blocklen(drbg)
- *		length: drbg_statelen(drbg)
- *
- * drbg_ctr_df:
- *	pad
- *		start: df_data + drbg_statelen(drbg)
- *		length: drbg_blocklen(drbg)
- *	iv
- *		start: pad + drbg_blocklen(drbg)
- *		length: drbg_blocklen(drbg)
- *	temp
- *		start: iv + drbg_blocklen(drbg)
- *		length: drbg_satelen(drbg) + drbg_blocklen(drbg)
- *			note: temp is the buffer that the BCC function operates
- *			on. BCC operates blockwise. drbg_statelen(drbg)
- *			is sufficient when the DRBG state length is a multiple
- *			of the block size. For AES192 (and maybe other ciphers)
- *			this is not correct and the length for temp is
- *			insufficient (yes, that also means for such ciphers,
- *			the final output of all BCC rounds are truncated).
- *			Therefore, add drbg_blocklen(drbg) to cover all
- *			possibilities.
- */
-
-/* Derivation Function for CTR DRBG as defined in 10.4.2 */
 static int drbg_ctr_df(struct drbg_state *drbg,
 		       unsigned char *df_data, size_t bytes_to_return,
 		       struct list_head *seedlist)
 {
-	int ret = -EFAULT;
-	unsigned char L_N[8];
-	/* S3 is input */
-	struct drbg_string S1, S2, S4, cipherin;
-	LIST_HEAD(bcc_list);
-	unsigned char *pad = df_data + drbg_statelen(drbg);
-	unsigned char *iv = pad + drbg_blocklen(drbg);
-	unsigned char *temp = iv + drbg_blocklen(drbg);
-	size_t padlen = 0;
-	unsigned int templen = 0;
-	/* 10.4.2 step 7 */
-	unsigned int i = 0;
-	/* 10.4.2 step 8 */
-	const unsigned char *K = (unsigned char *)
-			   "\x00\x01\x02\x03\x04\x05\x06\x07"
-			   "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-			   "\x10\x11\x12\x13\x14\x15\x16\x17"
-			   "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
-	unsigned char *X;
-	size_t generated_len = 0;
-	size_t inputlen = 0;
-	struct drbg_string *seed = NULL;
-
-	memset(pad, 0, drbg_blocklen(drbg));
-	memset(iv, 0, drbg_blocklen(drbg));
-
-	/* 10.4.2 step 1 is implicit as we work byte-wise */
-
-	/* 10.4.2 step 2 */
-	if ((512/8) < bytes_to_return)
-		return -EINVAL;
-
-	/* 10.4.2 step 2 -- calculate the entire length of all input data */
-	list_for_each_entry(seed, seedlist, list)
-		inputlen += seed->len;
-	drbg_cpu_to_be32(inputlen, &L_N[0]);
-
-	/* 10.4.2 step 3 */
-	drbg_cpu_to_be32(bytes_to_return, &L_N[4]);
-
-	/* 10.4.2 step 5: length is L_N, input_string, one byte, padding */
-	padlen = (inputlen + sizeof(L_N) + 1) % (drbg_blocklen(drbg));
-	/* wrap the padlen appropriately */
-	if (padlen)
-		padlen = drbg_blocklen(drbg) - padlen;
-	/*
-	 * pad / padlen contains the 0x80 byte and the following zero bytes.
-	 * As the calculated padlen value only covers the number of zero
-	 * bytes, this value has to be incremented by one for the 0x80 byte.
-	 */
-	padlen++;
-	pad[0] = 0x80;
-
-	/* 10.4.2 step 4 -- first fill the linked list and then order it */
-	drbg_string_fill(&S1, iv, drbg_blocklen(drbg));
-	list_add_tail(&S1.list, &bcc_list);
-	drbg_string_fill(&S2, L_N, sizeof(L_N));
-	list_add_tail(&S2.list, &bcc_list);
-	list_splice_tail(seedlist, &bcc_list);
-	drbg_string_fill(&S4, pad, padlen);
-	list_add_tail(&S4.list, &bcc_list);
-
-	/* 10.4.2 step 9 */
-	while (templen < (drbg_keylen(drbg) + (drbg_blocklen(drbg)))) {
-		/*
-		 * 10.4.2 step 9.1 - the padding is implicit as the buffer
-		 * holds zeros after allocation -- even the increment of i
-		 * is irrelevant as the increment remains within length of i
-		 */
-		drbg_cpu_to_be32(i, iv);
-		/* 10.4.2 step 9.2 -- BCC and concatenation with temp */
-		ret = drbg_ctr_bcc(drbg, temp + templen, K, &bcc_list);
-		if (ret)
-			goto out;
-		/* 10.4.2 step 9.3 */
-		i++;
-		templen += drbg_blocklen(drbg);
-	}
-
-	/* 10.4.2 step 11 */
-	X = temp + (drbg_keylen(drbg));
-	drbg_string_fill(&cipherin, X, drbg_blocklen(drbg));
-
-	/* 10.4.2 step 12: overwriting of outval is implemented in next step */
-
-	/* 10.4.2 step 13 */
-	drbg_kcapi_symsetkey(drbg, temp);
-	while (generated_len < bytes_to_return) {
-		short blocklen = 0;
-		/*
-		 * 10.4.2 step 13.1: the truncation of the key length is
-		 * implicit as the key is only drbg_blocklen in size based on
-		 * the implementation of the cipher function callback
-		 */
-		ret = drbg_kcapi_sym(drbg, X, &cipherin);
-		if (ret)
-			goto out;
-		blocklen = (drbg_blocklen(drbg) <
-				(bytes_to_return - generated_len)) ?
-			    drbg_blocklen(drbg) :
-				(bytes_to_return - generated_len);
-		/* 10.4.2 step 13.2 and 14 */
-		memcpy(df_data + generated_len, X, blocklen);
-		generated_len += blocklen;
-	}
-
-	ret = 0;
-
-out:
-	memset(iv, 0, drbg_blocklen(drbg));
-	memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg));
-	memset(pad, 0, drbg_blocklen(drbg));
-	return ret;
+	return crypto_drbg_ctr_df(drbg->priv_data, df_data, drbg_statelen(drbg),
+				  seedlist, drbg_blocklen(drbg), drbg_statelen(drbg));
 }
 
 /*
@@ -1310,10 +1097,8 @@ static inline int drbg_alloc_state(struct drbg_state *drbg)
 		sb_size = 0;
 	else if (drbg->core->flags & DRBG_CTR)
 		sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg) + /* temp */
-			  drbg_statelen(drbg) +	/* df_data */
-			  drbg_blocklen(drbg) +	/* pad */
-			  drbg_blocklen(drbg) +	/* iv */
-			  drbg_statelen(drbg) + drbg_blocklen(drbg); /* temp */
+			  crypto_drbg_ctr_df_datalen(drbg_statelen(drbg),
+						     drbg_blocklen(drbg));
 	else
 		sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg);
 
@@ -1800,25 +1585,6 @@ static int drbg_init_sym_kernel(struct drbg_state *drbg)
 	return alignmask;
 }
 
-static void drbg_kcapi_symsetkey(struct drbg_state *drbg,
-				 const unsigned char *key)
-{
-	struct crypto_cipher *tfm = drbg->priv_data;
-
-	crypto_cipher_setkey(tfm, key, (drbg_keylen(drbg)));
-}
-
-static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval,
-			  const struct drbg_string *in)
-{
-	struct crypto_cipher *tfm = drbg->priv_data;
-
-	/* there is only component in *in */
-	BUG_ON(in->len < drbg_blocklen(drbg));
-	crypto_cipher_encrypt_one(tfm, outval, in->buf);
-	return 0;
-}
-
 static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
 			      u8 *inbuf, u32 inlen,
 			      u8 *outbuf, u32 outlen)
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index a6688d54984c..8d3b5d2890f8 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -728,6 +728,7 @@ config CRYPTO_DEV_TEGRA
 config CRYPTO_DEV_XILINX_TRNG
 	tristate "Support for Xilinx True Random Generator"
 	depends on ZYNQMP_FIRMWARE || COMPILE_TEST
+	select CRYPTO_DF80090A
 	select CRYPTO_RNG
 	select HW_RANDOM
 	help
diff --git a/include/crypto/df_sp80090a.h b/include/crypto/df_sp80090a.h
new file mode 100644
index 000000000000..182865538662
--- /dev/null
+++ b/include/crypto/df_sp80090a.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright Stephan Mueller <smueller@chronox.de>, 2014
+ */
+
+#ifndef _CRYPTO_DF80090A_H
+#define _CRYPTO_DF80090A_H
+
+#include <crypto/internal/cipher.h>
+
+static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen)
+{
+	return statelen +       /* df_data */
+		blocklen +      /* pad */
+		blocklen +      /* iv */
+		statelen + blocklen;  /* temp */
+}
+
+int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
+		       unsigned char *df_data,
+		       size_t bytes_to_return,
+		       struct list_head *seedlist,
+		       u8 blocklen_bytes,
+		       u8 statelen);
+
+#endif /* _CRYPTO_DF80090A_H */
diff --git a/include/crypto/drbg.h b/include/crypto/drbg.h
index af5ad51d3eef..2d42518cbdce 100644
--- a/include/crypto/drbg.h
+++ b/include/crypto/drbg.h
@@ -47,6 +47,7 @@
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/slab.h>
+#include <crypto/internal/drbg.h>
 #include <crypto/internal/rng.h>
 #include <crypto/rng.h>
 #include <linux/fips.h>
@@ -54,30 +55,6 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 
-/*
- * Concatenation Helper and string operation helper
- *
- * SP800-90A requires the concatenation of different data. To avoid copying
- * buffers around or allocate additional memory, the following data structure
- * is used to point to the original memory with its size. In addition, it
- * is used to build a linked list. The linked list defines the concatenation
- * of individual buffers. The order of memory block referenced in that
- * linked list determines the order of concatenation.
- */
-struct drbg_string {
-	const unsigned char *buf;
-	size_t len;
-	struct list_head list;
-};
-
-static inline void drbg_string_fill(struct drbg_string *string,
-				    const unsigned char *buf, size_t len)
-{
-	string->buf = buf;
-	string->len = len;
-	INIT_LIST_HEAD(&string->list);
-}
-
 struct drbg_state;
 typedef uint32_t drbg_flag_t;
 
diff --git a/include/crypto/internal/drbg.h b/include/crypto/internal/drbg.h
new file mode 100644
index 000000000000..371e52dcee6c
--- /dev/null
+++ b/include/crypto/internal/drbg.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * NIST SP800-90A DRBG derivation function
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ */
+
+#ifndef _INTERNAL_DRBG_H
+#define _INTERNAL_DRBG_H
+
+/*
+ * Convert an integer into a byte representation of this integer.
+ * The byte representation is big-endian
+ *
+ * @val value to be converted
+ * @buf buffer holding the converted integer -- caller must ensure that
+ *      buffer size is at least 32 bit
+ */
+static inline void drbg_cpu_to_be32(__u32 val, unsigned char *buf)
+{
+	struct s {
+		__be32 conv;
+	};
+	struct s *conversion = (struct s *)buf;
+
+	conversion->conv = cpu_to_be32(val);
+}
+
+/*
+ * Concatenation Helper and string operation helper
+ *
+ * SP800-90A requires the concatenation of different data. To avoid copying
+ * buffers around or allocate additional memory, the following data structure
+ * is used to point to the original memory with its size. In addition, it
+ * is used to build a linked list. The linked list defines the concatenation
+ * of individual buffers. The order of memory block referenced in that
+ * linked list determines the order of concatenation.
+ */
+struct drbg_string {
+	const unsigned char *buf;
+	size_t len;
+	struct list_head list;
+};
+
+static inline void drbg_string_fill(struct drbg_string *string,
+				    const unsigned char *buf, size_t len)
+{
+	string->buf = buf;
+	string->len = len;
+	INIT_LIST_HEAD(&string->list);
+}
+
+#endif //_INTERNAL_DRBG_H
-- 
cgit v1.2.3


From ba0570bdf1d9956a63db2ddc50fa6a78d8c93f30 Mon Sep 17 00:00:00 2001
From: Harsh Jain <h.jain@amd.com>
Date: Mon, 15 Sep 2025 19:00:26 +0530
Subject: crypto: drbg - Replace AES cipher calls with library calls

Replace the AES cipher API calls used in the DRBG with AES library calls.

Signed-off-by: Harsh Jain <h.jain@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/df_sp80090a.c         | 29 +++++++++++++++--------------
 crypto/drbg.c                | 21 ++++++++-------------
 include/crypto/df_sp80090a.h |  3 ++-
 3 files changed, 25 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/crypto/df_sp80090a.c b/crypto/df_sp80090a.c
index 8309e62abe27..bad38c267180 100644
--- a/crypto/df_sp80090a.c
+++ b/crypto/df_sp80090a.c
@@ -10,33 +10,34 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <crypto/aes.h>
 #include <crypto/df_sp80090a.h>
 #include <crypto/internal/drbg.h>
 
-static void drbg_kcapi_symsetkey(struct crypto_cipher *tfm,
+static void drbg_kcapi_symsetkey(struct crypto_aes_ctx *aesctx,
 				 const unsigned char *key,
 				 u8 keylen);
-static int drbg_kcapi_sym(struct crypto_cipher *tfm, unsigned char *outval,
+static int drbg_kcapi_sym(struct crypto_aes_ctx *aesctx, unsigned char *outval,
 			  const struct drbg_string *in, u8 blocklen_bytes);
 
-static void drbg_kcapi_symsetkey(struct crypto_cipher *tfm,
+static void drbg_kcapi_symsetkey(struct crypto_aes_ctx *aesctx,
 				 const unsigned char *key, u8 keylen)
 {
-	crypto_cipher_setkey(tfm, key, keylen);
+	aes_expandkey(aesctx, key, keylen);
 }
 
-static int drbg_kcapi_sym(struct crypto_cipher *tfm, unsigned char *outval,
+static int drbg_kcapi_sym(struct crypto_aes_ctx *aesctx, unsigned char *outval,
 			  const struct drbg_string *in, u8 blocklen_bytes)
 {
 	/* there is only component in *in */
 	BUG_ON(in->len < blocklen_bytes);
-	crypto_cipher_encrypt_one(tfm, outval, in->buf);
+	aes_encrypt(aesctx, outval, in->buf);
 	return 0;
 }
 
 /* BCC function for CTR DRBG as defined in 10.4.3 */
 
-static int drbg_ctr_bcc(struct crypto_cipher *tfm,
+static int drbg_ctr_bcc(struct crypto_aes_ctx *aesctx,
 			unsigned char *out, const unsigned char *key,
 			struct list_head *in,
 			u8 blocklen_bytes,
@@ -50,7 +51,7 @@ static int drbg_ctr_bcc(struct crypto_cipher *tfm,
 	drbg_string_fill(&data, out, blocklen_bytes);
 
 	/* 10.4.3 step 2 / 4 */
-	drbg_kcapi_symsetkey(tfm, key, keylen);
+	drbg_kcapi_symsetkey(aesctx, key, keylen);
 	list_for_each_entry(curr, in, list) {
 		const unsigned char *pos = curr->buf;
 		size_t len = curr->len;
@@ -59,7 +60,7 @@ static int drbg_ctr_bcc(struct crypto_cipher *tfm,
 			/* 10.4.3 step 4.2 */
 			if (blocklen_bytes == cnt) {
 				cnt = 0;
-				ret = drbg_kcapi_sym(tfm, out, &data, blocklen_bytes);
+				ret = drbg_kcapi_sym(aesctx, out, &data, blocklen_bytes);
 				if (ret)
 					return ret;
 			}
@@ -71,7 +72,7 @@ static int drbg_ctr_bcc(struct crypto_cipher *tfm,
 	}
 	/* 10.4.3 step 4.2 for last block */
 	if (cnt)
-		ret = drbg_kcapi_sym(tfm, out, &data, blocklen_bytes);
+		ret = drbg_kcapi_sym(aesctx, out, &data, blocklen_bytes);
 
 	return ret;
 }
@@ -117,7 +118,7 @@ static int drbg_ctr_bcc(struct crypto_cipher *tfm,
  */
 
 /* Derivation Function for CTR DRBG as defined in 10.4.2 */
-int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
+int crypto_drbg_ctr_df(struct crypto_aes_ctx *aesctx,
 		       unsigned char *df_data, size_t bytes_to_return,
 		       struct list_head *seedlist,
 		       u8 blocklen_bytes,
@@ -195,7 +196,7 @@ int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
 		 */
 		drbg_cpu_to_be32(i, iv);
 		/* 10.4.2 step 9.2 -- BCC and concatenation with temp */
-		ret = drbg_ctr_bcc(tfm, temp + templen, K, &bcc_list,
+		ret = drbg_ctr_bcc(aesctx, temp + templen, K, &bcc_list,
 				   blocklen_bytes, keylen);
 		if (ret)
 			goto out;
@@ -211,7 +212,7 @@ int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
 	/* 10.4.2 step 12: overwriting of outval is implemented in next step */
 
 	/* 10.4.2 step 13 */
-	drbg_kcapi_symsetkey(tfm, temp, keylen);
+	drbg_kcapi_symsetkey(aesctx, temp, keylen);
 	while (generated_len < bytes_to_return) {
 		short blocklen = 0;
 		/*
@@ -219,7 +220,7 @@ int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
 		 * implicit as the key is only drbg_blocklen in size based on
 		 * the implementation of the cipher function callback
 		 */
-		ret = drbg_kcapi_sym(tfm, X, &cipherin, blocklen_bytes);
+		ret = drbg_kcapi_sym(aesctx, X, &cipherin, blocklen_bytes);
 		if (ret)
 			goto out;
 		blocklen = (blocklen_bytes <
diff --git a/crypto/drbg.c b/crypto/drbg.c
index bad005eef03d..511a27c91813 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1506,10 +1506,9 @@ static int drbg_kcapi_hash(struct drbg_state *drbg, unsigned char *outval,
 #ifdef CONFIG_CRYPTO_DRBG_CTR
 static int drbg_fini_sym_kernel(struct drbg_state *drbg)
 {
-	struct crypto_cipher *tfm =
-		(struct crypto_cipher *)drbg->priv_data;
-	if (tfm)
-		crypto_free_cipher(tfm);
+	struct crypto_aes_ctx *aesctx =	(struct crypto_aes_ctx *)drbg->priv_data;
+
+	kfree(aesctx);
 	drbg->priv_data = NULL;
 
 	if (drbg->ctr_handle)
@@ -1528,20 +1527,16 @@ static int drbg_fini_sym_kernel(struct drbg_state *drbg)
 
 static int drbg_init_sym_kernel(struct drbg_state *drbg)
 {
-	struct crypto_cipher *tfm;
+	struct crypto_aes_ctx *aesctx;
 	struct crypto_skcipher *sk_tfm;
 	struct skcipher_request *req;
 	unsigned int alignmask;
 	char ctr_name[CRYPTO_MAX_ALG_NAME];
 
-	tfm = crypto_alloc_cipher(drbg->core->backend_cra_name, 0, 0);
-	if (IS_ERR(tfm)) {
-		pr_info("DRBG: could not allocate cipher TFM handle: %s\n",
-				drbg->core->backend_cra_name);
-		return PTR_ERR(tfm);
-	}
-	BUG_ON(drbg_blocklen(drbg) != crypto_cipher_blocksize(tfm));
-	drbg->priv_data = tfm;
+	aesctx = kzalloc(sizeof(*aesctx), GFP_KERNEL);
+	if (!aesctx)
+		return -ENOMEM;
+	drbg->priv_data = aesctx;
 
 	if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)",
 	    drbg->core->backend_cra_name) >= CRYPTO_MAX_ALG_NAME) {
diff --git a/include/crypto/df_sp80090a.h b/include/crypto/df_sp80090a.h
index 182865538662..6b25305fe611 100644
--- a/include/crypto/df_sp80090a.h
+++ b/include/crypto/df_sp80090a.h
@@ -8,6 +8,7 @@
 #define _CRYPTO_DF80090A_H
 
 #include <crypto/internal/cipher.h>
+#include <crypto/aes.h>
 
 static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen)
 {
@@ -17,7 +18,7 @@ static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen)
 		statelen + blocklen;  /* temp */
 }
 
-int crypto_drbg_ctr_df(struct crypto_cipher *tfm,
+int crypto_drbg_ctr_df(struct crypto_aes_ctx *aes,
 		       unsigned char *df_data,
 		       size_t bytes_to_return,
 		       struct list_head *seedlist,
-- 
cgit v1.2.3


From 3662b54c16924b03197ec80f9764aabdf2c90231 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 15 Oct 2025 10:53:24 +0300
Subject: media: v4l2-mem2mem: Document that v4l2_m2m_get_vq() never returns
 NULL

The v4l2_m2m_get_vq() function never returns a NULL pointer, as the internal
get_queue_ctx() helper always returns a non-NULL pointer. Many drivers
check the return value against NULL, due to a combination of old code
and cargo-cult programming. Even v4l2-mem2mem.c contains unneeded NULL
checks.

Clarify the API by documenting explicitly that a NULL check is not
needed, and simplify the code by removing the unneeded NULL checks from
v4l2-mem2mem.c.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Stefan Klug <stefan.klug@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-mem2mem.c | 12 +-----------
 include/media/v4l2-mem2mem.h           |  3 +++
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 21acd9bc8607..9fa8833e445f 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -123,13 +123,7 @@ static struct v4l2_m2m_queue_ctx *get_queue_ctx(struct v4l2_m2m_ctx *m2m_ctx,
 struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx,
 				       enum v4l2_buf_type type)
 {
-	struct v4l2_m2m_queue_ctx *q_ctx;
-
-	q_ctx = get_queue_ctx(m2m_ctx, type);
-	if (!q_ctx)
-		return NULL;
-
-	return &q_ctx->q;
+	return &get_queue_ctx(m2m_ctx, type)->q;
 }
 EXPORT_SYMBOL(v4l2_m2m_get_vq);
 
@@ -1285,8 +1279,6 @@ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx,
 	unsigned long flags;
 
 	q_ctx = get_queue_ctx(m2m_ctx, vbuf->vb2_buf.vb2_queue->type);
-	if (!q_ctx)
-		return;
 
 	spin_lock_irqsave(&q_ctx->rdy_spinlock, flags);
 	list_add_tail(&b->list, &q_ctx->rdy_queue);
@@ -1388,8 +1380,6 @@ int v4l2_m2m_ioctl_remove_bufs(struct file *file, void *priv,
 	struct v4l2_fh *fh = file_to_v4l2_fh(file);
 	struct vb2_queue *q = v4l2_m2m_get_vq(fh->m2m_ctx, remove->type);
 
-	if (!q)
-		return -EINVAL;
 	if (q->type != remove->type)
 		return -EINVAL;
 
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 500f81f399df..c82445929c68 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -153,6 +153,9 @@ void *v4l2_m2m_get_curr_priv(struct v4l2_m2m_dev *m2m_dev);
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type
+ *
+ * This function returns the capture queue when @type is a capture type, and the
+ * output queue otherwise. It never returns a NULL pointer.
  */
 struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx,
 				       enum v4l2_buf_type type);
-- 
cgit v1.2.3


From 1fdb55ed40fa5ebe6934bd6b93036c714ebb5ef8 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Wed, 15 Oct 2025 13:01:16 +0300
Subject: media: v4l2-mem2mem: Don't copy frame flags in
 v4l2_m2m_buf_copy_metadata()

The v4l2_m2m_buf_copy_metadata() function takes a boolean
copy_frame_flags argument. When true, it causes the function to copy the
V4L2_BUF_FLAG_KEYFRAME, V4L2_BUF_FLAG_BFRAME and V4L2_BUF_FLAG_PFRAME
flags from the output buffer to the capture buffer.

There are no use cases in any upstream driver for copying the flags.
KEY/P/B frames are properties of the bitstream buffer in some formats.
Once decoded, this is no longer a property of the video frame and should
be discarded.

It was considered useful to know if an uncompressed frame was decoded
from a KEY/P/B compressed frame, and to preserve that information if
that same uncompressed frame was passed through another M2M device (e.g.
a scaler). However, the V4L2 documentation makes it clear that the flags
are meant for compressed frames only.

Drop the copy_frame_flags argument from v4l2_m2m_buf_copy_metadata().
The change to drivers was performed with the following Coccinelle
semantic patch:

@@
expression src;
expression dst;
expression flag;
@@
-       v4l2_m2m_buf_copy_metadata(src, dst, flag);
+       v4l2_m2m_buf_copy_metadata(src, dst);

include/media/v4l2-mem2mem.h and drivers/media/v4l2-core/v4l2-mem2mem.c
have been updated manually.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Reviewed-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/platform/allegro-dvt/allegro-core.c          |  2 +-
 drivers/media/platform/amphion/vdec.c                      |  2 +-
 drivers/media/platform/amphion/venc.c                      |  2 +-
 drivers/media/platform/chips-media/coda/coda-bit.c         |  2 +-
 drivers/media/platform/chips-media/coda/coda-jpeg.c        |  4 ++--
 drivers/media/platform/imagination/e5010-jpeg-enc.c        |  2 +-
 drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c       |  4 ++--
 drivers/media/platform/mediatek/jpeg/mtk_jpeg_dec_hw.c     |  4 ++--
 drivers/media/platform/mediatek/jpeg/mtk_jpeg_enc_hw.c     |  4 ++--
 drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c        |  2 +-
 .../mediatek/vcodec/decoder/vdec/vdec_av1_req_lat_if.c     |  4 ++--
 .../mediatek/vcodec/decoder/vdec/vdec_h264_req_if.c        |  2 +-
 .../mediatek/vcodec/decoder/vdec/vdec_h264_req_multi_if.c  | 14 ++++++++------
 .../mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c  |  5 +++--
 .../mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c         |  2 +-
 .../mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c     |  6 +++---
 drivers/media/platform/nvidia/tegra-vde/h264.c             |  2 +-
 drivers/media/platform/nxp/dw100/dw100.c                   |  2 +-
 drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c             |  2 +-
 drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c         |  2 +-
 drivers/media/platform/rockchip/rga/rga.c                  |  2 +-
 drivers/media/platform/rockchip/rkvdec/rkvdec.c            |  2 +-
 drivers/media/platform/st/stm32/dma2d/dma2d.c              |  2 +-
 drivers/media/platform/sunxi/sun8i-di/sun8i-di.c           |  2 +-
 drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c   |  2 +-
 drivers/media/platform/verisilicon/hantro_drv.c            |  2 +-
 drivers/media/test-drivers/vicodec/vicodec-core.c          |  4 ++--
 drivers/media/test-drivers/vim2m.c                         |  2 +-
 drivers/media/test-drivers/visl/visl-dec.c                 |  2 +-
 drivers/media/v4l2-core/v4l2-mem2mem.c                     |  9 ++-------
 drivers/staging/media/imx/imx-media-csc-scaler.c           |  2 +-
 drivers/staging/media/sunxi/cedrus/cedrus_dec.c            |  2 +-
 include/media/v4l2-mem2mem.h                               | 12 +++---------
 33 files changed, 53 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/drivers/media/platform/allegro-dvt/allegro-core.c b/drivers/media/platform/allegro-dvt/allegro-core.c
index 875e5cbbeb07..f347d56ac108 100644
--- a/drivers/media/platform/allegro-dvt/allegro-core.c
+++ b/drivers/media/platform/allegro-dvt/allegro-core.c
@@ -2124,7 +2124,7 @@ static void allegro_channel_finish_frame(struct allegro_channel *channel,
 
 	state = VB2_BUF_STATE_DONE;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 	if (msg->is_idr)
 		dst_buf->flags |= V4L2_BUF_FLAG_KEYFRAME;
 	else
diff --git a/drivers/media/platform/amphion/vdec.c b/drivers/media/platform/amphion/vdec.c
index 79790fbadc95..c0d2aabb9e0e 100644
--- a/drivers/media/platform/amphion/vdec.c
+++ b/drivers/media/platform/amphion/vdec.c
@@ -821,7 +821,7 @@ static int vdec_frame_decoded(struct vpu_inst *inst, void *arg)
 	vbuf = &vpu_buf->m2m_buf.vb;
 	src_buf = vdec_get_src_buffer(inst, info->consumed_count);
 	if (src_buf) {
-		v4l2_m2m_buf_copy_metadata(src_buf, vbuf, true);
+		v4l2_m2m_buf_copy_metadata(src_buf, vbuf);
 		if (info->consumed_count) {
 			v4l2_m2m_src_buf_remove(inst->fh.m2m_ctx);
 			vpu_set_buffer_state(src_buf, VPU_BUF_STATE_IDLE);
diff --git a/drivers/media/platform/amphion/venc.c b/drivers/media/platform/amphion/venc.c
index 319fbae70571..aced76401b69 100644
--- a/drivers/media/platform/amphion/venc.c
+++ b/drivers/media/platform/amphion/venc.c
@@ -788,7 +788,7 @@ static int venc_get_one_encoded_frame(struct vpu_inst *inst,
 
 	src_buf = vpu_find_buf_by_sequence(inst, inst->out_format.type, frame->info.frame_id);
 	if (src_buf) {
-		v4l2_m2m_buf_copy_metadata(src_buf, vbuf, true);
+		v4l2_m2m_buf_copy_metadata(src_buf, vbuf);
 		vpu_set_buffer_state(src_buf, VPU_BUF_STATE_IDLE);
 		v4l2_m2m_src_buf_remove_by_buf(inst->fh.m2m_ctx, src_buf);
 		v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_DONE);
diff --git a/drivers/media/platform/chips-media/coda/coda-bit.c b/drivers/media/platform/chips-media/coda/coda-bit.c
index 84ded154adfe..fa6b72c3bd93 100644
--- a/drivers/media/platform/chips-media/coda/coda-bit.c
+++ b/drivers/media/platform/chips-media/coda/coda-bit.c
@@ -1685,7 +1685,7 @@ static void coda_finish_encode(struct coda_ctx *ctx)
 		dst_buf->flags |= V4L2_BUF_FLAG_PFRAME;
 	dst_buf->flags |= src_buf->flags & V4L2_BUF_FLAG_LAST;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_DONE);
 
diff --git a/drivers/media/platform/chips-media/coda/coda-jpeg.c b/drivers/media/platform/chips-media/coda/coda-jpeg.c
index 5746892658b1..fb150b87c773 100644
--- a/drivers/media/platform/chips-media/coda/coda-jpeg.c
+++ b/drivers/media/platform/chips-media/coda/coda-jpeg.c
@@ -1245,7 +1245,7 @@ static void coda9_jpeg_finish_encode(struct coda_ctx *ctx)
 	dst_buf->flags |= V4L2_BUF_FLAG_KEYFRAME;
 	dst_buf->flags |= src_buf->flags & V4L2_BUF_FLAG_LAST;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_DONE);
 	coda_m2m_buf_done(ctx, dst_buf, err_mb ? VB2_BUF_STATE_ERROR :
@@ -1472,7 +1472,7 @@ static void coda9_jpeg_finish_decode(struct coda_ctx *ctx)
 	dst_buf->flags |= V4L2_BUF_FLAG_KEYFRAME;
 	dst_buf->flags |= src_buf->flags & V4L2_BUF_FLAG_LAST;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	q_data_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE);
 	vb2_set_plane_payload(&dst_buf->vb2_buf, 0, q_data_dst->sizeimage);
diff --git a/drivers/media/platform/imagination/e5010-jpeg-enc.c b/drivers/media/platform/imagination/e5010-jpeg-enc.c
index 1b5c8b1d6a31..1c6e076033ec 100644
--- a/drivers/media/platform/imagination/e5010-jpeg-enc.c
+++ b/drivers/media/platform/imagination/e5010-jpeg-enc.c
@@ -1354,7 +1354,7 @@ static void e5010_device_run(void *priv)
 	s_vb->sequence = ctx->out_queue.sequence++;
 	d_vb->sequence = ctx->cap_queue.sequence++;
 
-	v4l2_m2m_buf_copy_metadata(s_vb, d_vb, false);
+	v4l2_m2m_buf_copy_metadata(s_vb, d_vb);
 
 	if (ctx != e5010->last_context_run || ctx->update_qp) {
 		dprintk(e5010, 1, "ctx updated: 0x%p -> 0x%p, updating qp tables\n",
diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c
index d78b83f06247..d08fe365cbb2 100644
--- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c
+++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c
@@ -1618,7 +1618,7 @@ retry_select:
 	if (!dst_buf)
 		goto getbuf_fail;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	mtk_jpegenc_set_hw_param(ctx, hw_id, src_buf, dst_buf);
 	ret = pm_runtime_get_sync(comp_jpeg[hw_id]->dev);
@@ -1714,7 +1714,7 @@ retry_select:
 	if (!dst_buf)
 		goto getbuf_fail;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 	jpeg_src_buf = mtk_jpeg_vb2_to_srcbuf(&src_buf->vb2_buf);
 	jpeg_dst_buf = mtk_jpeg_vb2_to_srcbuf(&dst_buf->vb2_buf);
 
diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_dec_hw.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_dec_hw.c
index e78e1d11093c..32372781daf5 100644
--- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_dec_hw.c
+++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_dec_hw.c
@@ -530,7 +530,7 @@ static void mtk_jpegdec_timeout_work(struct work_struct *work)
 
 	src_buf = cjpeg->hw_param.src_buffer;
 	dst_buf = cjpeg->hw_param.dst_buffer;
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	mtk_jpeg_dec_reset(cjpeg->reg_base);
 	clk_disable_unprepare(cjpeg->jdec_clk.clks->clk);
@@ -560,7 +560,7 @@ static irqreturn_t mtk_jpegdec_hw_irq_handler(int irq, void *priv)
 	ctx = jpeg->hw_param.curr_ctx;
 	src_buf = jpeg->hw_param.src_buffer;
 	dst_buf = jpeg->hw_param.dst_buffer;
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	irq_status = mtk_jpeg_dec_get_int_status(jpeg->reg_base);
 	dec_irq_ret = mtk_jpeg_dec_enum_result(irq_status);
diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_enc_hw.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_enc_hw.c
index 9ab27aee302a..b6f5b2249f1f 100644
--- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_enc_hw.c
+++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_enc_hw.c
@@ -261,7 +261,7 @@ static void mtk_jpegenc_timeout_work(struct work_struct *work)
 
 	src_buf = cjpeg->hw_param.src_buffer;
 	dst_buf = cjpeg->hw_param.dst_buffer;
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	mtk_jpeg_enc_reset(cjpeg->reg_base);
 	clk_disable_unprepare(cjpeg->venc_clk.clks->clk);
@@ -289,7 +289,7 @@ static irqreturn_t mtk_jpegenc_hw_irq_handler(int irq, void *priv)
 	ctx = jpeg->hw_param.curr_ctx;
 	src_buf = jpeg->hw_param.src_buffer;
 	dst_buf = jpeg->hw_param.dst_buffer;
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	irq_status = readl(jpeg->reg_base + JPEG_ENC_INT_STS) &
 		JPEG_ENC_INT_STATUS_MASK_ALLIRQ;
diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c
index 9ef956b565a7..44140987cf5f 100644
--- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c
+++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c
@@ -51,7 +51,7 @@ static void mdp_m2m_process_done(void *priv, int vb_state)
 	ctx->curr_param.frame_no = ctx->frame_count[MDP_M2M_SRC];
 	src_vbuf->sequence = ctx->frame_count[MDP_M2M_SRC]++;
 	dst_vbuf->sequence = ctx->frame_count[MDP_M2M_DST]++;
-	v4l2_m2m_buf_copy_metadata(src_vbuf, dst_vbuf, true);
+	v4l2_m2m_buf_copy_metadata(src_vbuf, dst_vbuf);
 
 	v4l2_m2m_buf_done(src_vbuf, vb_state);
 	v4l2_m2m_buf_done(dst_vbuf, vb_state);
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_av1_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_av1_req_lat_if.c
index 08e0f5a70935..7be4b6086920 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_av1_req_lat_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_av1_req_lat_if.c
@@ -1073,7 +1073,7 @@ static int vdec_av1_slice_setup_lat_from_src_buf(struct vdec_av1_slice_instance
 
 	lat_buf->src_buf_req = src->vb2_buf.req_obj.req;
 	dst = &lat_buf->ts_info;
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 	vsi->frame.cur_ts = dst->vb2_buf.timestamp;
 
 	return 0;
@@ -1780,7 +1780,7 @@ static int vdec_av1_slice_setup_core_to_dst_buf(struct vdec_av1_slice_instance *
 	if (!dst)
 		return -EINVAL;
 
-	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, dst, true);
+	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, dst);
 
 	return 0;
 }
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_if.c
index 1e1b32faac77..b9a5ea7887d3 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_if.c
@@ -367,7 +367,7 @@ static int vdec_h264_slice_decode(void *h_vdec, struct mtk_vcodec_mem *bs,
 	inst->vsi_ctx.dec.vdec_fb_va = (u64)(uintptr_t)fb;
 
 	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
-				   &dst_buf_info->m2m_buf.vb, true);
+				   &dst_buf_info->m2m_buf.vb);
 	err = get_vdec_decode_parameters(inst);
 	if (err)
 		goto err_free_fb_out;
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_multi_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_multi_if.c
index 5b25e1679b51..9a9dc2f88d6e 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_multi_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_h264_req_multi_if.c
@@ -570,7 +570,7 @@ static int vdec_h264_slice_setup_core_buffer_ext(struct vdec_h264_slice_inst *in
 	}
 
 	vb2_v4l2 = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
-	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2, true);
+	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2);
 
 	return 0;
 }
@@ -674,7 +674,7 @@ static int vdec_h264_slice_core_decode(struct vdec_lat_buf *lat_buf)
 	}
 
 	vb2_v4l2 = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
-	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2, true);
+	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2);
 
 	vdec_h264_slice_fill_decode_reflist(inst, &inst->vsi_core->h264_slice_params,
 					    share_info);
@@ -768,7 +768,8 @@ static int vdec_h264_slice_lat_decode_ext(void *h_vdec, struct mtk_vcodec_mem *b
 	src_buf_info = container_of(bs, struct mtk_video_dec_buf, bs_buffer);
 
 	lat_buf->src_buf_req = src_buf_info->m2m_buf.vb.vb2_buf.req_obj.req;
-	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb, &lat_buf->ts_info, true);
+	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
+				   &lat_buf->ts_info);
 
 	err = vdec_h264_slice_fill_decode_parameters(inst, share_info,
 						     &inst->vsi_ext->h264_slice_params);
@@ -900,7 +901,8 @@ static int vdec_h264_slice_lat_decode(void *h_vdec, struct mtk_vcodec_mem *bs,
 
 	inst->vsi->dec.nal_info = buf[nal_start_idx];
 	lat_buf->src_buf_req = src_buf_info->m2m_buf.vb.vb2_buf.req_obj.req;
-	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb, &lat_buf->ts_info, true);
+	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
+				   &lat_buf->ts_info);
 
 	err = vdec_h264_slice_fill_decode_parameters(inst, share_info,
 						     &inst->vsi->h264_slice_params);
@@ -1039,7 +1041,7 @@ static int vdec_h264_slice_single_decode_ext(void *h_vdec, struct mtk_vcodec_mem
 	inst->vsi_ctx_ext.dec.vdec_fb_va = (u64)(uintptr_t)fb;
 
 	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
-				   &dst_buf_info->m2m_buf.vb, true);
+				   &dst_buf_info->m2m_buf.vb);
 	err = get_vdec_sig_decode_parameters(inst);
 	if (err)
 		goto err_free_fb_out;
@@ -1135,7 +1137,7 @@ static int vdec_h264_slice_single_decode(void *h_vdec, struct mtk_vcodec_mem *bs
 	inst->vsi_ctx.dec.vdec_fb_va = (u64)(uintptr_t)fb;
 
 	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
-				   &dst_buf_info->m2m_buf.vb, true);
+				   &dst_buf_info->m2m_buf.vb);
 	err = get_vdec_sig_decode_parameters(inst);
 	if (err)
 		goto err_free_fb_out;
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c
index 2725db882e5b..88eca50c2017 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c
@@ -742,7 +742,8 @@ static int vdec_hevc_slice_setup_lat_buffer(struct vdec_hevc_slice_inst *inst,
 
 	src_buf_info = container_of(bs, struct mtk_video_dec_buf, bs_buffer);
 	lat_buf->src_buf_req = src_buf_info->m2m_buf.vb.vb2_buf.req_obj.req;
-	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb, &lat_buf->ts_info, true);
+	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
+				   &lat_buf->ts_info);
 
 	*res_chg = inst->resolution_changed;
 	if (inst->resolution_changed) {
@@ -847,7 +848,7 @@ static int vdec_hevc_slice_setup_core_buffer(struct vdec_hevc_slice_inst *inst,
 	}
 
 	vb2_v4l2 = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
-	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2, true);
+	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, vb2_v4l2);
 
 	return 0;
 }
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c
index 232ef3bd246a..e1d4960553f2 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c
@@ -358,7 +358,7 @@ static int vdec_vp8_slice_decode(void *h_vdec, struct mtk_vcodec_mem *bs,
 		       y_fb_dma, c_fb_dma);
 
 	v4l2_m2m_buf_copy_metadata(&src_buf_info->m2m_buf.vb,
-				   &dst_buf_info->m2m_buf.vb, true);
+				   &dst_buf_info->m2m_buf.vb);
 
 	err = vdec_vp8_slice_get_decode_parameters(inst);
 	if (err)
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c
index 45cd555a5fb5..cd1935014d76 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c
@@ -706,7 +706,7 @@ int vdec_vp9_slice_setup_single_from_src_to_dst(struct vdec_vp9_slice_instance *
 	if (!dst)
 		return -EINVAL;
 
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	return 0;
 }
@@ -724,7 +724,7 @@ static int vdec_vp9_slice_setup_lat_from_src_buf(struct vdec_vp9_slice_instance
 	lat_buf->src_buf_req = src->vb2_buf.req_obj.req;
 
 	dst = &lat_buf->ts_info;
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 	return 0;
 }
 
@@ -1652,7 +1652,7 @@ static int vdec_vp9_slice_setup_core_to_dst_buf(struct vdec_vp9_slice_instance *
 	if (!dst)
 		return -EINVAL;
 
-	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, dst, true);
+	v4l2_m2m_buf_copy_metadata(&lat_buf->ts_info, dst);
 	return 0;
 }
 
diff --git a/drivers/media/platform/nvidia/tegra-vde/h264.c b/drivers/media/platform/nvidia/tegra-vde/h264.c
index 45f8f6904867..2a2211671fd9 100644
--- a/drivers/media/platform/nvidia/tegra-vde/h264.c
+++ b/drivers/media/platform/nvidia/tegra-vde/h264.c
@@ -776,7 +776,7 @@ static int tegra_vde_h264_setup_frames(struct tegra_ctx *ctx,
 	 * If userspace doesn't tell us frame's type, then we will try decode
 	 * as-is.
 	 */
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	if (h->decode_params->flags & V4L2_H264_DECODE_PARAM_FLAG_BFRAME)
 		tb->b_frame = true;
diff --git a/drivers/media/platform/nxp/dw100/dw100.c b/drivers/media/platform/nxp/dw100/dw100.c
index 035081c4223b..4aaf9c3fff53 100644
--- a/drivers/media/platform/nxp/dw100/dw100.c
+++ b/drivers/media/platform/nxp/dw100/dw100.c
@@ -1430,7 +1430,7 @@ static void dw100_start(struct dw100_ctx *ctx, struct vb2_v4l2_buffer *in_vb,
 				V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE),
 		in_vb->sequence, out_vb->sequence);
 
-	v4l2_m2m_buf_copy_metadata(in_vb, out_vb, true);
+	v4l2_m2m_buf_copy_metadata(in_vb, out_vb);
 
 	/* Now, let's deal with hardware ... */
 	dw100_hw_master_bus_disable(dw_dev);
diff --git a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
index d41fa4c3139b..9e4a813489c0 100644
--- a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
+++ b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
@@ -1537,7 +1537,7 @@ static void mxc_jpeg_device_run(void *priv)
 	src_buf->sequence = q_data_out->sequence++;
 	dst_buf->sequence = q_data_cap->sequence++;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	jpeg_src_buf = vb2_to_mxc_buf(&src_buf->vb2_buf);
 	if (q_data_cap->fmt->mem_planes != dst_buf->vb2_buf.num_planes) {
diff --git a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c
index df14f12b3ad2..f425ac786854 100644
--- a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c
+++ b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c
@@ -107,7 +107,7 @@ static void mxc_isi_m2m_frame_write_done(struct mxc_isi_pipe *pipe, u32 status)
 	src_vbuf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
 	dst_vbuf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);
 
-	v4l2_m2m_buf_copy_metadata(src_vbuf, dst_vbuf, false);
+	v4l2_m2m_buf_copy_metadata(src_vbuf, dst_vbuf);
 
 	src_vbuf->sequence = ctx->queues.out.sequence++;
 	dst_vbuf->sequence = ctx->queues.cap.sequence++;
diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
index eb9c556a105c..43f6a8d99381 100644
--- a/drivers/media/platform/rockchip/rga/rga.c
+++ b/drivers/media/platform/rockchip/rga/rga.c
@@ -75,7 +75,7 @@ static irqreturn_t rga_isr(int irq, void *prv)
 		WARN_ON(!src);
 		WARN_ON(!dst);
 
-		v4l2_m2m_buf_copy_metadata(src, dst, true);
+		v4l2_m2m_buf_copy_metadata(src, dst);
 
 		dst->sequence = ctx->csequence++;
 
diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
index 6e606d73ff51..cb02c6988602 100644
--- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
+++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
@@ -783,7 +783,7 @@ void rkvdec_run_preamble(struct rkvdec_ctx *ctx, struct rkvdec_run *run)
 	if (src_req)
 		v4l2_ctrl_request_setup(src_req, &ctx->ctrl_hdl);
 
-	v4l2_m2m_buf_copy_metadata(run->bufs.src, run->bufs.dst, true);
+	v4l2_m2m_buf_copy_metadata(run->bufs.src, run->bufs.dst);
 }
 
 void rkvdec_run_postamble(struct rkvdec_ctx *ctx, struct rkvdec_run *run)
diff --git a/drivers/media/platform/st/stm32/dma2d/dma2d.c b/drivers/media/platform/st/stm32/dma2d/dma2d.c
index 30544ceca42c..72488aa922fc 100644
--- a/drivers/media/platform/st/stm32/dma2d/dma2d.c
+++ b/drivers/media/platform/st/stm32/dma2d/dma2d.c
@@ -485,7 +485,7 @@ static void device_run(void *prv)
 
 	src->sequence = frm_out->sequence++;
 	dst->sequence = frm_cap->sequence++;
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	if (clk_enable(dev->gate))
 		goto end;
diff --git a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c
index eb519afb30ca..7c4dd1ac772d 100644
--- a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c
+++ b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c
@@ -71,7 +71,7 @@ static void deinterlace_device_run(void *priv)
 	src = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx);
 	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
 
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	deinterlace_write(dev, DEINTERLACE_MOD_ENABLE,
 			  DEINTERLACE_MOD_ENABLE_EN);
diff --git a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c
index 89992feaab60..2deab920884a 100644
--- a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c
+++ b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c
@@ -70,7 +70,7 @@ static void rotate_device_run(void *priv)
 	src = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx);
 	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
 
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	val = ROTATE_GLB_CTL_MODE(ROTATE_MODE_COPY_ROTATE);
 	if (ctx->hflip)
diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c
index e0c11fe8b55c..60b95b5d8565 100644
--- a/drivers/media/platform/verisilicon/hantro_drv.c
+++ b/drivers/media/platform/verisilicon/hantro_drv.c
@@ -183,7 +183,7 @@ static void device_run(void *priv)
 	if (ret)
 		goto err_cancel_job;
 
-	v4l2_m2m_buf_copy_metadata(src, dst, true);
+	v4l2_m2m_buf_copy_metadata(src, dst);
 
 	if (ctx->codec_ops->run(ctx))
 		goto err_cancel_job;
diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c
index 9dc32c593427..a7ab668ce70b 100644
--- a/drivers/media/test-drivers/vicodec/vicodec-core.c
+++ b/drivers/media/test-drivers/vicodec/vicodec-core.c
@@ -421,7 +421,7 @@ static void device_run(void *priv)
 	else
 		dst_buf->sequence = q_dst->sequence++;
 	dst_buf->flags &= ~V4L2_BUF_FLAG_LAST;
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, false);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	spin_lock(ctx->lock);
 	if (!ctx->comp_has_next_frame &&
@@ -555,7 +555,7 @@ static void set_last_buffer(struct vb2_v4l2_buffer *dst_buf,
 	vb2_set_plane_payload(&dst_buf->vb2_buf, 0, 0);
 	dst_buf->sequence = q_dst->sequence++;
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, !ctx->is_enc);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 	dst_buf->flags |= V4L2_BUF_FLAG_LAST;
 	v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_DONE);
 }
diff --git a/drivers/media/test-drivers/vim2m.c b/drivers/media/test-drivers/vim2m.c
index 9d921feaf824..c33c18ea5210 100644
--- a/drivers/media/test-drivers/vim2m.c
+++ b/drivers/media/test-drivers/vim2m.c
@@ -477,7 +477,7 @@ static int device_process(struct vim2m_ctx *ctx,
 
 	out_vb->sequence = q_data_out->sequence++;
 	in_vb->sequence = q_data_in->sequence++;
-	v4l2_m2m_buf_copy_metadata(in_vb, out_vb, true);
+	v4l2_m2m_buf_copy_metadata(in_vb, out_vb);
 
 	if (ctx->mode & MEM2MEM_VFLIP) {
 		start = height - 1;
diff --git a/drivers/media/test-drivers/visl/visl-dec.c b/drivers/media/test-drivers/visl/visl-dec.c
index 6a9639bd4d61..d90b79de8384 100644
--- a/drivers/media/test-drivers/visl/visl-dec.c
+++ b/drivers/media/test-drivers/visl/visl-dec.c
@@ -572,7 +572,7 @@ void visl_device_run(void *priv)
 	if (src_req)
 		v4l2_ctrl_request_setup(src_req, &ctx->hdl);
 
-	v4l2_m2m_buf_copy_metadata(run.src, run.dst, true);
+	v4l2_m2m_buf_copy_metadata(run.src, run.dst);
 	run.dst->sequence = ctx->q_data[V4L2_M2M_DST].sequence++;
 	run.src->sequence = ctx->q_data[V4L2_M2M_SRC].sequence++;
 	run.dst->field = ctx->decoded_fmt.fmt.pix.field;
diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 9fa8833e445f..fec93c1a9231 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -1288,14 +1288,9 @@ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx,
 EXPORT_SYMBOL_GPL(v4l2_m2m_buf_queue);
 
 void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb,
-				struct vb2_v4l2_buffer *cap_vb,
-				bool copy_frame_flags)
+				struct vb2_v4l2_buffer *cap_vb)
 {
-	u32 mask = V4L2_BUF_FLAG_TIMECODE | V4L2_BUF_FLAG_TSTAMP_SRC_MASK;
-
-	if (copy_frame_flags)
-		mask |= V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
-			V4L2_BUF_FLAG_BFRAME;
+	const u32 mask = V4L2_BUF_FLAG_TIMECODE | V4L2_BUF_FLAG_TSTAMP_SRC_MASK;
 
 	cap_vb->vb2_buf.timestamp = out_vb->vb2_buf.timestamp;
 
diff --git a/drivers/staging/media/imx/imx-media-csc-scaler.c b/drivers/staging/media/imx/imx-media-csc-scaler.c
index 1869c5792ecb..0a27330f9790 100644
--- a/drivers/staging/media/imx/imx-media-csc-scaler.c
+++ b/drivers/staging/media/imx/imx-media-csc-scaler.c
@@ -99,7 +99,7 @@ static void ipu_ic_pp_complete(struct ipu_image_convert_run *run, void *_ctx)
 	src_buf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx);
 	dst_buf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx);
 
-	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, true);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf);
 
 	src_buf->sequence = ctx->sequence++;
 	dst_buf->sequence = src_buf->sequence;
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index fbbf9e6f0f50..9f8b0555b7dc 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -90,7 +90,7 @@ void cedrus_device_run(void *priv)
 		break;
 	}
 
-	v4l2_m2m_buf_copy_metadata(run.src, run.dst, true);
+	v4l2_m2m_buf_copy_metadata(run.src, run.dst);
 
 	cedrus_dst_format_set(dev, &ctx->dst_fmt);
 
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index c82445929c68..bf6a09a04dcf 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -845,19 +845,13 @@ v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx)
  *
  * @out_vb: the output buffer that is the source of the metadata.
  * @cap_vb: the capture buffer that will receive the metadata.
- * @copy_frame_flags: copy the KEY/B/PFRAME flags as well.
  *
  * This helper function copies the timestamp, timecode (if the TIMECODE
- * buffer flag was set), field and the TIMECODE, KEYFRAME, BFRAME, PFRAME
- * and TSTAMP_SRC_MASK flags from @out_vb to @cap_vb.
- *
- * If @copy_frame_flags is false, then the KEYFRAME, BFRAME and PFRAME
- * flags are not copied. This is typically needed for encoders that
- * set this bits explicitly.
+ * buffer flag was set), field, and the TIMECODE and TSTAMP_SRC_MASK flags from
+ * @out_vb to @cap_vb.
  */
 void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb,
-				struct vb2_v4l2_buffer *cap_vb,
-				bool copy_frame_flags);
+				struct vb2_v4l2_buffer *cap_vb);
 
 /* v4l2 request helper */
 
-- 
cgit v1.2.3


From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001
From: Zhang Lixu <lixu.zhang@intel.com>
Date: Fri, 10 Oct 2025 13:52:54 +0800
Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent
 resume blocking

During suspend/resume tests with S2IDLE, some ISH functional failures were
observed because of a delay in executing the ISH resume handler. Here,
schedule_work() is used from the resume handler to do the actual work.
schedule_work() uses system_wq, which is a per-CPU workqueue. Although
the queuing is not bound to a CPU, it prefers the local CPU of the caller
unless prohibited.

Users of this work queue are not supposed to queue long running work.
But in practice, there are scenarios where long running work items are
queued on other unbound workqueues, occupying the CPU. As a result, the
ISH resume handler may not get a chance to execute in a timely manner.

In one scenario, one of the ish_resume_handler() executions was delayed
nearly 1 second because another work item on an unbound workqueue occupied
the same CPU. This delay causes ISH functionality failures.

A similar issue was previously observed where the ISH HID driver timed out
while getting the HID descriptor during S4 resume in the recovery kernel,
likely caused by the same workqueue contention problem.

Create dedicated unbound workqueues for all ISH operations to allow work
items to execute on any available CPU, eliminating CPU-specific bottlenecks
and improving resume reliability under varying system loads. Also ISH has
three different components, a bus driver which implements ISH protocols, a
PCI interface layer and HID interface. Use one dedicated work queue for all
of them.

Signed-off-by: Zhang Lixu <lixu.zhang@intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/intel-ish-hid/ipc/ipc.c          | 21 ++++++++++++++++++++-
 drivers/hid/intel-ish-hid/ipc/pci-ish.c      |  2 +-
 drivers/hid/intel-ish-hid/ishtp-hid-client.c |  4 ++--
 drivers/hid/intel-ish-hid/ishtp/bus.c        | 18 +++++++++++++++++-
 drivers/hid/intel-ish-hid/ishtp/hbm.c        |  4 ++--
 drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h  |  3 +++
 include/linux/intel-ish-client-if.h          |  2 ++
 7 files changed, 47 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c
index 3ddaa2cd39d5..9958f2968c4f 100644
--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
+++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
@@ -628,7 +628,7 @@ static void	recv_ipc(struct ishtp_device *dev, uint32_t doorbell_val)
 		if (!ishtp_dev) {
 			ishtp_dev = dev;
 		}
-		schedule_work(&fw_reset_work);
+		queue_work(dev->unbound_wq, &fw_reset_work);
 		break;
 
 	case MNG_RESET_NOTIFY_ACK:
@@ -933,6 +933,21 @@ static const struct ishtp_hw_ops ish_hw_ops = {
 	.dma_no_cache_snooping = _dma_no_cache_snooping
 };
 
+static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev)
+{
+	struct workqueue_struct *wq;
+
+	wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id);
+	if (!wq)
+		return NULL;
+
+	if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue,
+				     wq))
+		return NULL;
+
+	return wq;
+}
+
 /**
  * ish_dev_init() -Initialize ISH devoce
  * @pdev: PCI device
@@ -953,6 +968,10 @@ struct ishtp_device *ish_dev_init(struct pci_dev *pdev)
 	if (!dev)
 		return NULL;
 
+	dev->unbound_wq = devm_ishtp_alloc_workqueue(&pdev->dev);
+	if (!dev->unbound_wq)
+		return NULL;
+
 	dev->devc = &pdev->dev;
 	ishtp_device_init(dev);
 
diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
index 9d150ce234f2..b748ac6fbfdc 100644
--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c
+++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
@@ -384,7 +384,7 @@ static int __maybe_unused ish_resume(struct device *device)
 	ish_resume_device = device;
 	dev->resume_flag = 1;
 
-	schedule_work(&resume_work);
+	queue_work(dev->unbound_wq, &resume_work);
 
 	return 0;
 }
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index d8c3c54a8c0f..f61add862b6b 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -860,7 +860,7 @@ static int hid_ishtp_cl_reset(struct ishtp_cl_device *cl_device)
 	hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
 			hid_ishtp_cl);
 
-	schedule_work(&client_data->work);
+	queue_work(ishtp_get_workqueue(cl_device), &client_data->work);
 
 	return 0;
 }
@@ -902,7 +902,7 @@ static int hid_ishtp_cl_resume(struct device *device)
 
 	hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
 			hid_ishtp_cl);
-	schedule_work(&client_data->resume_work);
+	queue_work(ishtp_get_workqueue(cl_device), &client_data->resume_work);
 	return 0;
 }
 
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 93a0432e7058..c6ce37244e49 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -541,7 +541,7 @@ void ishtp_cl_bus_rx_event(struct ishtp_cl_device *device)
 		return;
 
 	if (device->event_cb)
-		schedule_work(&device->event_work);
+		queue_work(device->ishtp_dev->unbound_wq, &device->event_work);
 }
 
 /**
@@ -876,6 +876,22 @@ struct device *ishtp_get_pci_device(struct ishtp_cl_device *device)
 }
 EXPORT_SYMBOL(ishtp_get_pci_device);
 
+/**
+ * ishtp_get_workqueue - Retrieve the workqueue associated with an ISHTP device
+ * @cl_device: Pointer to the ISHTP client device structure
+ *
+ * Returns the workqueue_struct pointer (unbound_wq) associated with the given
+ * ISHTP client device. This workqueue is typically used for scheduling work
+ * related to the device.
+ *
+ * Return: Pointer to struct workqueue_struct.
+ */
+struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device)
+{
+	return cl_device->ishtp_dev->unbound_wq;
+}
+EXPORT_SYMBOL(ishtp_get_workqueue);
+
 /**
  * ishtp_trace_callback() - Return trace callback
  * @cl_device: ISH-TP client device instance
diff --git a/drivers/hid/intel-ish-hid/ishtp/hbm.c b/drivers/hid/intel-ish-hid/ishtp/hbm.c
index 8ee5467127d8..97c4fcd9e3c6 100644
--- a/drivers/hid/intel-ish-hid/ishtp/hbm.c
+++ b/drivers/hid/intel-ish-hid/ishtp/hbm.c
@@ -573,7 +573,7 @@ void ishtp_hbm_dispatch(struct ishtp_device *dev,
 
 		/* Start firmware loading process if it has loader capability */
 		if (version_res->host_version_supported & ISHTP_SUPPORT_CAP_LOADER)
-			schedule_work(&dev->work_fw_loader);
+			queue_work(dev->unbound_wq, &dev->work_fw_loader);
 
 		dev->version.major_version = HBM_MAJOR_VERSION;
 		dev->version.minor_version = HBM_MINOR_VERSION;
@@ -864,7 +864,7 @@ void	recv_hbm(struct ishtp_device *dev, struct ishtp_msg_hdr *ishtp_hdr)
 	dev->rd_msg_fifo_tail = (dev->rd_msg_fifo_tail + IPC_PAYLOAD_SIZE) %
 		(RD_INT_FIFO_SIZE * IPC_PAYLOAD_SIZE);
 	spin_unlock_irqrestore(&dev->rd_msg_spinlock, flags);
-	schedule_work(&dev->bh_hbm_work);
+	queue_work(dev->unbound_wq, &dev->bh_hbm_work);
 eoi:
 	return;
 }
diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
index 23db97ecf21c..4b0596eadf1c 100644
--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
@@ -175,6 +175,9 @@ struct ishtp_device {
 	struct hbm_version version;
 	int transfer_path; /* Choice of transfer path: IPC or DMA */
 
+	/* Alloc a dedicated unbound workqueue for ishtp device */
+	struct workqueue_struct *unbound_wq;
+
 	/* work structure for scheduling firmware loading tasks */
 	struct work_struct work_fw_loader;
 	/* waitq for waiting for command response from the firmware loader */
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index dfbf7d9d7bb5..b235fd84f478 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_device *dev);
 ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device);
 /* Get device pointer of PCI device for DMA acces */
 struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device);
+/* Get the ISHTP workqueue */
+struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device);
 
 struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
 void ishtp_cl_free(struct ishtp_cl *cl);
-- 
cgit v1.2.3


From 011aa2aa2c4c2b3356c32f195f306df6e177ac38 Mon Sep 17 00:00:00 2001
From: Zhang Lixu <lixu.zhang@intel.com>
Date: Fri, 17 Oct 2025 10:22:13 +0800
Subject: HID: intel-ish-hid: Add ishtp_get_connection_state() interface

Add the ishtp_get_connection_state() function for struct ishtp_cl, allowing
ishtp client drivers to retrieve the current connection state.

Signed-off-by: Zhang Lixu <lixu.zhang@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/intel-ish-hid/ishtp/client.c | 6 ++++++
 include/linux/intel-ish-client-if.h      | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c
index 21a2c0773cc2..40f510b1c072 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.c
+++ b/drivers/hid/intel-ish-hid/ishtp/client.c
@@ -1261,6 +1261,12 @@ void ishtp_set_connection_state(struct ishtp_cl *cl, int state)
 }
 EXPORT_SYMBOL(ishtp_set_connection_state);
 
+int ishtp_get_connection_state(struct ishtp_cl *cl)
+{
+	return cl->state;
+}
+EXPORT_SYMBOL(ishtp_get_connection_state);
+
 void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id)
 {
 	cl->fw_client_id = fw_client_id;
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index b235fd84f478..2cd4f65aaa37 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -109,6 +109,7 @@ struct ishtp_device *ishtp_get_ishtp_device(struct ishtp_cl *cl);
 void ishtp_set_tx_ring_size(struct ishtp_cl *cl, int size);
 void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size);
 void ishtp_set_connection_state(struct ishtp_cl *cl, int state);
+int ishtp_get_connection_state(struct ishtp_cl *cl);
 void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id);
 
 void ishtp_put_device(struct ishtp_cl_device *cl_dev);
-- 
cgit v1.2.3


From e4c4f5a1ae18a7828c2bfaf9dfe2473632b92d1b Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Date: Fri, 3 Oct 2025 20:14:38 +0200
Subject: dt-bindings: clock: qcom,x1e80100-gcc: Add missing USB4 clocks/resets

Some of the USB4 muxes, RCGs and resets were not initially described.

Add indices for them to allow extending the driver.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Bryan O'Donoghue <bod@kernel.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251003-topic-hamoa_gcc_usb4-v2-1-61d27a14ee65@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,x1e80100-gcc.yaml          | 62 ++++++++++++++++++++--
 include/dt-bindings/clock/qcom,x1e80100-gcc.h      | 61 +++++++++++++++++++++
 2 files changed, 119 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,x1e80100-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,x1e80100-gcc.yaml
index 68dde0720c71..1b15b5070954 100644
--- a/Documentation/devicetree/bindings/clock/qcom,x1e80100-gcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,x1e80100-gcc.yaml
@@ -32,9 +32,36 @@ properties:
       - description: PCIe 5 pipe clock
       - description: PCIe 6a pipe clock
       - description: PCIe 6b pipe clock
-      - description: USB QMP Phy 0 clock source
-      - description: USB QMP Phy 1 clock source
-      - description: USB QMP Phy 2 clock source
+      - description: USB4_0 QMPPHY clock source
+      - description: USB4_1 QMPPHY clock source
+      - description: USB4_2 QMPPHY clock source
+      - description: USB4_0 PHY DP0 GMUX clock source
+      - description: USB4_0 PHY DP1 GMUX clock source
+      - description: USB4_0 PHY PCIE PIPEGMUX clock source
+      - description: USB4_0 PHY PIPEGMUX clock source
+      - description: USB4_0 PHY SYS PCIE PIPEGMUX clock source
+      - description: USB4_1 PHY DP0 GMUX 2 clock source
+      - description: USB4_1 PHY DP1 GMUX 2 clock source
+      - description: USB4_1 PHY PCIE PIPEGMUX clock source
+      - description: USB4_1 PHY PIPEGMUX clock source
+      - description: USB4_1 PHY SYS PCIE PIPEGMUX clock source
+      - description: USB4_2 PHY DP0 GMUX 2 clock source
+      - description: USB4_2 PHY DP1 GMUX 2 clock source
+      - description: USB4_2 PHY PCIE PIPEGMUX clock source
+      - description: USB4_2 PHY PIPEGMUX clock source
+      - description: USB4_2 PHY SYS PCIE PIPEGMUX clock source
+      - description: USB4_0 PHY RX 0 clock source
+      - description: USB4_0 PHY RX 1 clock source
+      - description: USB4_1 PHY RX 0 clock source
+      - description: USB4_1 PHY RX 1 clock source
+      - description: USB4_2 PHY RX 0 clock source
+      - description: USB4_2 PHY RX 1 clock source
+      - description: USB4_0 PHY PCIE PIPE clock source
+      - description: USB4_0 PHY max PIPE clock source
+      - description: USB4_1 PHY PCIE PIPE clock source
+      - description: USB4_1 PHY max PIPE clock source
+      - description: USB4_2 PHY PCIE PIPE clock source
+      - description: USB4_2 PHY max PIPE clock source
 
   power-domains:
     description:
@@ -67,7 +94,34 @@ examples:
                <&pcie6b_phy>,
                <&usb_1_ss0_qmpphy 0>,
                <&usb_1_ss1_qmpphy 1>,
-               <&usb_1_ss2_qmpphy 2>;
+               <&usb_1_ss2_qmpphy 2>,
+               <&usb4_0_phy_dp0_gmux_clk>,
+               <&usb4_0_phy_dp1_gmux_clk>,
+               <&usb4_0_phy_pcie_pipegmux_clk>,
+               <&usb4_0_phy_pipegmux_clk>,
+               <&usb4_0_phy_sys_pcie_pipegmux_clk>,
+               <&usb4_1_phy_dp0_gmux_2_clk>,
+               <&usb4_1_phy_dp1_gmux_2_clk>,
+               <&usb4_1_phy_pcie_pipegmux_clk>,
+               <&usb4_1_phy_pipegmux_clk>,
+               <&usb4_1_phy_sys_pcie_pipegmux_clk>,
+               <&usb4_2_phy_dp0_gmux_2_clk>,
+               <&usb4_2_phy_dp1_gmux_2_clk>,
+               <&usb4_2_phy_pcie_pipegmux_clk>,
+               <&usb4_2_phy_pipegmux_clk>,
+               <&usb4_2_phy_sys_pcie_pipegmux_clk>,
+               <&usb4_0_phy_rx_0_clk>,
+               <&usb4_0_phy_rx_1_clk>,
+               <&usb4_1_phy_rx_0_clk>,
+               <&usb4_1_phy_rx_1_clk>,
+               <&usb4_2_phy_rx_0_clk>,
+               <&usb4_2_phy_rx_1_clk>,
+               <&usb4_0_phy_pcie_pipe_clk>,
+               <&usb4_0_phy_max_pipe_clk>,
+               <&usb4_1_phy_pcie_pipe_clk>,
+               <&usb4_1_phy_max_pipe_clk>,
+               <&usb4_2_phy_pcie_pipe_clk>,
+               <&usb4_2_phy_max_pipe_clk>;
       power-domains = <&rpmhpd RPMHPD_CX>;
       #clock-cells = <1>;
       #reset-cells = <1>;
diff --git a/include/dt-bindings/clock/qcom,x1e80100-gcc.h b/include/dt-bindings/clock/qcom,x1e80100-gcc.h
index 710c340f24a5..62aa12425592 100644
--- a/include/dt-bindings/clock/qcom,x1e80100-gcc.h
+++ b/include/dt-bindings/clock/qcom,x1e80100-gcc.h
@@ -363,6 +363,30 @@
 #define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC				353
 #define GCC_USB3_SEC_PHY_PIPE_CLK_SRC				354
 #define GCC_USB3_TERT_PHY_PIPE_CLK_SRC				355
+#define GCC_USB34_PRIM_PHY_PIPE_CLK_SRC				356
+#define GCC_USB34_SEC_PHY_PIPE_CLK_SRC				357
+#define GCC_USB34_TERT_PHY_PIPE_CLK_SRC				358
+#define GCC_USB4_0_PHY_DP0_CLK_SRC				359
+#define GCC_USB4_0_PHY_DP1_CLK_SRC				360
+#define GCC_USB4_0_PHY_P2RR2P_PIPE_CLK_SRC			361
+#define GCC_USB4_0_PHY_PCIE_PIPE_MUX_CLK_SRC			362
+#define GCC_USB4_0_PHY_RX0_CLK_SRC				363
+#define GCC_USB4_0_PHY_RX1_CLK_SRC				364
+#define GCC_USB4_0_PHY_SYS_CLK_SRC				365
+#define GCC_USB4_1_PHY_DP0_CLK_SRC				366
+#define GCC_USB4_1_PHY_DP1_CLK_SRC				367
+#define GCC_USB4_1_PHY_P2RR2P_PIPE_CLK_SRC			368
+#define GCC_USB4_1_PHY_PCIE_PIPE_MUX_CLK_SRC			369
+#define GCC_USB4_1_PHY_RX0_CLK_SRC				370
+#define GCC_USB4_1_PHY_RX1_CLK_SRC				371
+#define GCC_USB4_1_PHY_SYS_CLK_SRC				372
+#define GCC_USB4_2_PHY_DP0_CLK_SRC				373
+#define GCC_USB4_2_PHY_DP1_CLK_SRC				374
+#define GCC_USB4_2_PHY_P2RR2P_PIPE_CLK_SRC			375
+#define GCC_USB4_2_PHY_PCIE_PIPE_MUX_CLK_SRC			376
+#define GCC_USB4_2_PHY_RX0_CLK_SRC				377
+#define GCC_USB4_2_PHY_RX1_CLK_SRC				378
+#define GCC_USB4_2_PHY_SYS_CLK_SRC				379
 
 /* GCC power domains */
 #define GCC_PCIE_0_TUNNEL_GDSC					0
@@ -484,4 +508,41 @@
 #define GCC_VIDEO_BCR						87
 #define GCC_VIDEO_AXI0_CLK_ARES					88
 #define GCC_VIDEO_AXI1_CLK_ARES					89
+#define GCC_USB4_0_MISC_USB4_SYS_BCR				90
+#define GCC_USB4_0_MISC_RX_CLK_0_BCR				91
+#define GCC_USB4_0_MISC_RX_CLK_1_BCR				92
+#define GCC_USB4_0_MISC_USB_PIPE_BCR				93
+#define GCC_USB4_0_MISC_PCIE_PIPE_BCR				94
+#define GCC_USB4_0_MISC_TMU_BCR					95
+#define GCC_USB4_0_MISC_SB_IF_BCR				96
+#define GCC_USB4_0_MISC_HIA_MSTR_BCR				97
+#define GCC_USB4_0_MISC_AHB_BCR					98
+#define GCC_USB4_0_MISC_DP0_MAX_PCLK_BCR			99
+#define GCC_USB4_0_MISC_DP1_MAX_PCLK_BCR			100
+#define GCC_USB4_1_MISC_USB4_SYS_BCR				101
+#define GCC_USB4_1_MISC_RX_CLK_0_BCR				102
+#define GCC_USB4_1_MISC_RX_CLK_1_BCR				103
+#define GCC_USB4_1_MISC_USB_PIPE_BCR				104
+#define GCC_USB4_1_MISC_PCIE_PIPE_BCR				105
+#define GCC_USB4_1_MISC_TMU_BCR					106
+#define GCC_USB4_1_MISC_SB_IF_BCR				107
+#define GCC_USB4_1_MISC_HIA_MSTR_BCR				108
+#define GCC_USB4_1_MISC_AHB_BCR					109
+#define GCC_USB4_1_MISC_DP0_MAX_PCLK_BCR			110
+#define GCC_USB4_1_MISC_DP1_MAX_PCLK_BCR			111
+#define GCC_USB4_2_MISC_USB4_SYS_BCR				112
+#define GCC_USB4_2_MISC_RX_CLK_0_BCR				113
+#define GCC_USB4_2_MISC_RX_CLK_1_BCR				114
+#define GCC_USB4_2_MISC_USB_PIPE_BCR				115
+#define GCC_USB4_2_MISC_PCIE_PIPE_BCR				116
+#define GCC_USB4_2_MISC_TMU_BCR					117
+#define GCC_USB4_2_MISC_SB_IF_BCR				118
+#define GCC_USB4_2_MISC_HIA_MSTR_BCR				119
+#define GCC_USB4_2_MISC_AHB_BCR					120
+#define GCC_USB4_2_MISC_DP0_MAX_PCLK_BCR			121
+#define GCC_USB4_2_MISC_DP1_MAX_PCLK_BCR			122
+#define GCC_USB4PHY_PHY_PRIM_BCR				123
+#define GCC_USB4PHY_PHY_SEC_BCR					124
+#define GCC_USB4PHY_PHY_TERT_BCR				125
+
 #endif
-- 
cgit v1.2.3


From 1c17f4373d4db1e1f0ebd3ddcd8e7a642927a826 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Tue, 14 Oct 2025 22:42:07 +0000
Subject: ipv6: Move ipv6_fl_list from ipv6_pinfo to inet_sock.

In {tcp6,udp6,raw6}_sock, struct ipv6_pinfo is always placed at
the beginning of a new cache line because

  1. __alignof__(struct tcp_sock) is 64 due to ____cacheline_aligned
     of __cacheline_group_begin(tcp_sock_write_tx)

  2. __alignof__(struct udp_sock) is 64 due to ____cacheline_aligned
     of struct numa_drop_counters

  3. in raw6_sock, struct numa_drop_counters is placed before
     struct ipv6_pinfo

struct ipv6_pinfo is 136 bytes, but the last cache line is
only used by ipv6_fl_list:

  $ pahole -C ipv6_pinfo vmlinux
  struct ipv6_pinfo {
  ...
  	/* --- cacheline 2 boundary (128 bytes) --- */
  	struct ipv6_fl_socklist *  ipv6_fl_list;         /*   128     8 */

  	/* size: 136, cachelines: 3, members: 23 */

Let's move ipv6_fl_list from struct ipv6_pinfo to struct inet_sock
to save a full cache line for {tcp6,udp6,raw6}_sock.

Now, struct ipv6_pinfo is 128 bytes, and {tcp6,udp6,raw6}_sock have
64 bytes less, while {tcp,udp,raw}_sock retain the same size.

Before:

  # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}'
  RAWv6 	 1408
  UDPv6 	 1472
  TCPv6 	 2560
  RAW 		 1152
  UDP	 	 1280
  TCP 		 2368

After:

  # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}'
  RAWv6 	 1344
  UDPv6 	 1408
  TCPv6 	 2496
  RAW 		 1152
  UDP	 	 1280
  TCP 		 2368

Also, ipv6_fl_list and inet_flags (SNDFLOW bit) are placed in the
same cache line.

  $ pahole -C inet_sock vmlinux
  ...
  	/* --- cacheline 11 boundary (704 bytes) was 56 bytes ago --- */
  	struct ipv6_pinfo *        pinet6;               /*   760     8 */
  	/* --- cacheline 12 boundary (768 bytes) --- */
  	struct ipv6_fl_socklist *  ipv6_fl_list;         /*   768     8 */
  	unsigned long              inet_flags;           /*   776     8 */

Doc churn is due to the insufficient Type column (only 1 space short).

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251014224210.2964778-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/inet_sock.rst        | 79 +++++++++++-----------
 .../chelsio/inline_crypto/chtls/chtls_cm.c         |  4 +-
 include/linux/ipv6.h                               |  1 -
 include/net/inet_sock.h                            |  1 +
 net/ipv6/ip6_flowlabel.c                           | 44 ++++++------
 net/ipv6/tcp_ipv6.c                                | 13 ++--
 net/sctp/ipv6.c                                    |  8 ++-
 7 files changed, 76 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/inet_sock.rst b/Documentation/networking/net_cachelines/inet_sock.rst
index b11bf48fa2b3..4c72a28a7012 100644
--- a/Documentation/networking/net_cachelines/inet_sock.rst
+++ b/Documentation/networking/net_cachelines/inet_sock.rst
@@ -5,42 +5,43 @@
 inet_sock struct fast path usage breakdown
 ==========================================
 
-======================= ===================== =================== =================== ======================================================================================================
-Type                    Name                  fastpath_tx_access  fastpath_rx_access  comment
-======================= ===================== =================== =================== ======================================================================================================
-struct sock             sk                    read_mostly         read_mostly         tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
-struct ipv6_pinfo*      pinet6
-be16                    inet_sport            read_mostly                             __tcp_transmit_skb
-be32                    inet_daddr            read_mostly                             ip_select_ident_segs
-be32                    inet_rcv_saddr
-be16                    inet_dport            read_mostly                             __tcp_transmit_skb
-u16                     inet_num
-be32                    inet_saddr
-s16                     uc_ttl                read_mostly                             __ip_queue_xmit/ip_select_ttl
-u16                     cmsg_flags
-struct ip_options_rcu*  inet_opt              read_mostly                             __ip_queue_xmit
-u16                     inet_id               read_mostly                             ip_select_ident_segs
-u8                      tos                   read_mostly                             ip_queue_xmit
-u8                      min_ttl
-u8                      mc_ttl
-u8                      pmtudisc
-u8:1                    recverr
-u8:1                    is_icsk
-u8:1                    freebind
-u8:1                    hdrincl
-u8:1                    mc_loop
-u8:1                    transparent
-u8:1                    mc_all
-u8:1                    nodefrag
-u8:1                    bind_address_no_port
-u8:1                    recverr_rfc4884
-u8:1                    defer_connect         read_mostly                             tcp_sendmsg_fastopen
-u8                      rcv_tos
-u8                      convert_csum
-int                     uc_index
-int                     mc_index
-be32                    mc_addr
-struct ip_mc_socklist*  mc_list
-struct inet_cork_full   cork                  read_mostly                             __tcp_transmit_skb
-struct                  local_port_range
-======================= ===================== =================== =================== ======================================================================================================
+======================== ===================== =================== =================== ======================================================================================================
+Type                     Name                  fastpath_tx_access  fastpath_rx_access  comment
+======================== ===================== =================== =================== ======================================================================================================
+struct sock              sk                    read_mostly         read_mostly         tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
+struct ipv6_pinfo*       pinet6
+struct ipv6_fl_socklist* ipv6_fl_list          read_mostly                             tcp_v6_connect,__ip6_datagram_connect,udpv6_sendmsg,rawv6_sendmsg
+be16                     inet_sport            read_mostly                             __tcp_transmit_skb
+be32                     inet_daddr            read_mostly                             ip_select_ident_segs
+be32                     inet_rcv_saddr
+be16                     inet_dport            read_mostly                             __tcp_transmit_skb
+u16                      inet_num
+be32                     inet_saddr
+s16                      uc_ttl                read_mostly                             __ip_queue_xmit/ip_select_ttl
+u16                      cmsg_flags
+struct ip_options_rcu*   inet_opt              read_mostly                             __ip_queue_xmit
+u16                      inet_id               read_mostly                             ip_select_ident_segs
+u8                       tos                   read_mostly                             ip_queue_xmit
+u8                       min_ttl
+u8                       mc_ttl
+u8                       pmtudisc
+u8:1                     recverr
+u8:1                     is_icsk
+u8:1                     freebind
+u8:1                     hdrincl
+u8:1                     mc_loop
+u8:1                     transparent
+u8:1                     mc_all
+u8:1                     nodefrag
+u8:1                     bind_address_no_port
+u8:1                     recverr_rfc4884
+u8:1                     defer_connect         read_mostly                             tcp_sendmsg_fastopen
+u8                       rcv_tos
+u8                       convert_csum
+int                      uc_index
+int                      mc_index
+be32                     mc_addr
+struct ip_mc_socklist*   mc_list
+struct inet_cork_full    cork                  read_mostly                             __tcp_transmit_skb
+struct                   local_port_range
+======================== ===================== =================== =================== ======================================================================================================
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
index 4ee970f3bad6..ee0154337a9c 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -1199,12 +1199,12 @@ static struct sock *chtls_recv_sock(struct sock *lsk,
 		struct ipv6_pinfo *newnp = inet6_sk(newsk);
 		struct ipv6_pinfo *np = inet6_sk(lsk);
 
-		inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+		newinet->pinet6 = &newtcp6sk->inet6;
+		newinet->ipv6_fl_list = NULL;
 		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 		newsk->sk_v6_daddr = treq->ir_v6_rmt_addr;
 		newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr;
 		inet6_sk(newsk)->saddr = treq->ir_v6_loc_addr;
-		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions = NULL;
 		newsk->sk_bound_dev_if = treq->ir_iif;
 		newinet->inet_opt = NULL;
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 43b7bb828738..7294e4e89b79 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -271,7 +271,6 @@ struct ipv6_pinfo {
 
 	struct ipv6_mc_socklist	__rcu *ipv6_mc_list;
 	struct ipv6_ac_socklist	*ipv6_ac_list;
-	struct ipv6_fl_socklist __rcu *ipv6_fl_list;
 };
 
 /* We currently use available bits from inet_sk(sk)->inet_flags,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 1086256549fa..b6ec08072533 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -214,6 +214,7 @@ struct inet_sock {
 	struct sock		sk;
 #if IS_ENABLED(CONFIG_IPV6)
 	struct ipv6_pinfo	*pinet6;
+	struct ipv6_fl_socklist __rcu *ipv6_fl_list;
 #endif
 	/* Socket demultiplex comparisons on incoming packets. */
 #define inet_daddr		sk.__sk_common.skc_daddr
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index a3ff575798dd..60d0be47a9f3 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -66,8 +66,8 @@ EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
 	     fl != NULL;					\
 	     fl = rcu_dereference(fl->next))
 
-#define for_each_sk_fl_rcu(np, sfl)				\
-	for (sfl = rcu_dereference(np->ipv6_fl_list);	\
+#define for_each_sk_fl_rcu(sk, sfl)				\
+	for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list);	\
 	     sfl != NULL;					\
 	     sfl = rcu_dereference(sfl->next))
 
@@ -262,12 +262,11 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
 {
 	struct ipv6_fl_socklist *sfl;
-	struct ipv6_pinfo *np = inet6_sk(sk);
 
 	label &= IPV6_FLOWLABEL_MASK;
 
 	rcu_read_lock();
-	for_each_sk_fl_rcu(np, sfl) {
+	for_each_sk_fl_rcu(sk, sfl) {
 		struct ip6_flowlabel *fl = sfl->fl;
 
 		if (fl->label == label && atomic_inc_not_zero(&fl->users)) {
@@ -283,16 +282,16 @@ EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
 
 void fl6_free_socklist(struct sock *sk)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_fl_socklist *sfl;
 
-	if (!rcu_access_pointer(np->ipv6_fl_list))
+	if (!rcu_access_pointer(inet->ipv6_fl_list))
 		return;
 
 	spin_lock_bh(&ip6_sk_fl_lock);
-	while ((sfl = rcu_dereference_protected(np->ipv6_fl_list,
+	while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list,
 						lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
-		np->ipv6_fl_list = sfl->next;
+		inet->ipv6_fl_list = sfl->next;
 		spin_unlock_bh(&ip6_sk_fl_lock);
 
 		fl_release(sfl->fl);
@@ -470,16 +469,15 @@ done:
 
 static int mem_check(struct sock *sk)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct ipv6_fl_socklist *sfl;
 	int room = FL_MAX_SIZE - atomic_read(&fl_size);
+	struct ipv6_fl_socklist *sfl;
 	int count = 0;
 
 	if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
 		return 0;
 
 	rcu_read_lock();
-	for_each_sk_fl_rcu(np, sfl)
+	for_each_sk_fl_rcu(sk, sfl)
 		count++;
 	rcu_read_unlock();
 
@@ -492,13 +490,15 @@ static int mem_check(struct sock *sk)
 	return 0;
 }
 
-static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl,
-		struct ip6_flowlabel *fl)
+static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl,
+			   struct ip6_flowlabel *fl)
 {
+	struct inet_sock *inet = inet_sk(sk);
+
 	spin_lock_bh(&ip6_sk_fl_lock);
 	sfl->fl = fl;
-	sfl->next = np->ipv6_fl_list;
-	rcu_assign_pointer(np->ipv6_fl_list, sfl);
+	sfl->next = inet->ipv6_fl_list;
+	rcu_assign_pointer(inet->ipv6_fl_list, sfl);
 	spin_unlock_bh(&ip6_sk_fl_lock);
 }
 
@@ -520,7 +520,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 
 	rcu_read_lock();
 
-	for_each_sk_fl_rcu(np, sfl) {
+	for_each_sk_fl_rcu(sk, sfl) {
 		if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
 			spin_lock_bh(&ip6_fl_lock);
 			freq->flr_label = sfl->fl->label;
@@ -559,7 +559,7 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
 	}
 
 	spin_lock_bh(&ip6_sk_fl_lock);
-	for (sflp = &np->ipv6_fl_list;
+	for (sflp = &inet_sk(sk)->ipv6_fl_list;
 	     (sfl = socklist_dereference(*sflp)) != NULL;
 	     sflp = &sfl->next) {
 		if (sfl->fl->label == freq->flr_label)
@@ -579,13 +579,12 @@ found:
 
 static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6_fl_socklist *sfl;
 	int err;
 
 	rcu_read_lock();
-	for_each_sk_fl_rcu(np, sfl) {
+	for_each_sk_fl_rcu(sk, sfl) {
 		if (sfl->fl->label == freq->flr_label) {
 			err = fl6_renew(sfl->fl, freq->flr_linger,
 					freq->flr_expires);
@@ -614,7 +613,6 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
 {
 	struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
 	struct ip6_flowlabel *fl, *fl1 = NULL;
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	int err;
 
@@ -645,7 +643,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
 	if (freq->flr_label) {
 		err = -EEXIST;
 		rcu_read_lock();
-		for_each_sk_fl_rcu(np, sfl) {
+		for_each_sk_fl_rcu(sk, sfl) {
 			if (sfl->fl->label == freq->flr_label) {
 				if (freq->flr_flags & IPV6_FL_F_EXCL) {
 					rcu_read_unlock();
@@ -682,7 +680,7 @@ recheck:
 				fl1->linger = fl->linger;
 			if ((long)(fl->expires - fl1->expires) > 0)
 				fl1->expires = fl->expires;
-			fl_link(np, sfl1, fl1);
+			fl_link(sk, sfl1, fl1);
 			fl_free(fl);
 			return 0;
 
@@ -716,7 +714,7 @@ release:
 		}
 	}
 
-	fl_link(np, sfl1, fl);
+	fl_link(sk, sfl1, fl);
 	return 0;
 done:
 	fl_free(fl);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 59c4977a811a..6197dd4e6261 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1386,7 +1386,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		if (!newsk)
 			return NULL;
 
-		inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);
+		newinet = inet_sk(newsk);
+		newinet->pinet6 = tcp_inet6_sk(newsk);
+		newinet->ipv6_fl_list = NULL;
 
 		newnp = tcp_inet6_sk(newsk);
 		newtp = tcp_sk(newsk);
@@ -1405,7 +1407,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 
 		newnp->ipv6_mc_list = NULL;
 		newnp->ipv6_ac_list = NULL;
-		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
 		newnp->mcast_oif   = inet_iif(skb);
@@ -1453,10 +1454,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	newsk->sk_gso_type = SKB_GSO_TCPV6;
 	inet6_sk_rx_dst_set(newsk, skb);
 
-	inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);
+	newinet = inet_sk(newsk);
+	newinet->pinet6 = tcp_inet6_sk(newsk);
+	newinet->ipv6_fl_list = NULL;
+	newinet->inet_opt = NULL;
 
 	newtp = tcp_sk(newsk);
-	newinet = inet_sk(newsk);
 	newnp = tcp_inet6_sk(newsk);
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
@@ -1469,10 +1472,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 
 	   First: no IPv4 options.
 	 */
-	newinet->inet_opt = NULL;
 	newnp->ipv6_mc_list = NULL;
 	newnp->ipv6_ac_list = NULL;
-	newnp->ipv6_fl_list = NULL;
 
 	/* Clone RX bits */
 	newnp->rxopt.all = np->rxopt.all;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 568ff8797c39..d725b2158758 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -782,9 +782,10 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 					     struct sctp_association *asoc,
 					     bool kern)
 {
-	struct sock *newsk;
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct sctp6_sock *newsctp6sk;
+	struct inet_sock *newinet;
+	struct sock *newsk;
 
 	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
 	if (!newsk)
@@ -796,7 +797,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	sock_reset_flag(sk, SOCK_ZAPPED);
 
 	newsctp6sk = (struct sctp6_sock *)newsk;
-	inet_sk(newsk)->pinet6 = &newsctp6sk->inet6;
+	newinet = inet_sk(newsk);
+	newinet->pinet6 = &newsctp6sk->inet6;
+	newinet->ipv6_fl_list = NULL;
 
 	sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped;
 
@@ -805,7 +808,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 	newnp->ipv6_mc_list = NULL;
 	newnp->ipv6_ac_list = NULL;
-	newnp->ipv6_fl_list = NULL;
 
 	sctp_v6_copy_ip_options(sk, newsk);
 
-- 
cgit v1.2.3


From 9c4609225ec1cb551006d6a03c7c4ad8cb5584c0 Mon Sep 17 00:00:00 2001
From: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Date: Wed, 15 Oct 2025 10:02:34 +0800
Subject: rculist: Add hlist_nulls_replace_rcu() and
 hlist_nulls_replace_init_rcu()

Add two functions to atomically replace RCU-protected hlist_nulls entries.

Keep using WRITE_ONCE() to assign values to ->next and ->pprev, as
mentioned in the patches below:
commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for
rculist_nulls")
commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for
hlist_nulls")

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Link: https://patch.msgid.link/20251015020236.431822-2-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rculist_nulls.h | 59 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 89186c499dd4..c26cb83ca071 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 #define hlist_nulls_next_rcu(node) \
 	(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
 
+/**
+ * hlist_nulls_pprev_rcu - returns the dereferenced pprev of @node.
+ * @node: element of the list.
+ */
+#define hlist_nulls_pprev_rcu(node) \
+	(*((struct hlist_nulls_node __rcu __force **)(node)->pprev))
+
 /**
  * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
@@ -152,6 +159,58 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
 	n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
 }
 
+/**
+ * hlist_nulls_replace_rcu - replace an old entry by a new one
+ * @old: the element to be replaced
+ * @new: the new element to insert
+ *
+ * Description:
+ * Replace the old entry with the new one in a RCU-protected hlist_nulls, while
+ * permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary (such as holding
+ * appropriate locks) to avoid racing with another list-mutation primitive, such
+ * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same
+ * list.  However, it is perfectly legal to run concurrently with the _rcu
+ * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu().
+ */
+static inline void hlist_nulls_replace_rcu(struct hlist_nulls_node *old,
+					   struct hlist_nulls_node *new)
+{
+	struct hlist_nulls_node *next = old->next;
+
+	WRITE_ONCE(new->next, next);
+	WRITE_ONCE(new->pprev, old->pprev);
+	rcu_assign_pointer(hlist_nulls_pprev_rcu(new), new);
+	if (!is_a_nulls(next))
+		WRITE_ONCE(next->pprev, &new->next);
+}
+
+/**
+ * hlist_nulls_replace_init_rcu - replace an old entry by a new one and
+ * initialize the old
+ * @old: the element to be replaced
+ * @new: the new element to insert
+ *
+ * Description:
+ * Replace the old entry with the new one in a RCU-protected hlist_nulls, while
+ * permitting racing traversals, and reinitialize the old entry.
+ *
+ * Note: @old must be hashed.
+ *
+ * The caller must take whatever precautions are necessary (such as holding
+ * appropriate locks) to avoid racing with another list-mutation primitive, such
+ * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same
+ * list. However, it is perfectly legal to run concurrently with the _rcu
+ * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu().
+ */
+static inline void hlist_nulls_replace_init_rcu(struct hlist_nulls_node *old,
+						struct hlist_nulls_node *new)
+{
+	hlist_nulls_replace_rcu(old, new);
+	WRITE_ONCE(old->pprev, NULL);
+}
+
 /**
  * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
-- 
cgit v1.2.3


From 1532ed0d0753c83e72595f785f82b48c28bbe5dc Mon Sep 17 00:00:00 2001
From: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Date: Wed, 15 Oct 2025 10:02:35 +0800
Subject: inet: Avoid ehash lookup race in inet_ehash_insert()

Since ehash lookups are lockless, if one CPU performs a lookup while
another concurrently deletes and inserts (removing reqsk and inserting sk),
the lookup may fail to find the socket and an RST may be sent.

The call trace map is drawn as follows:
   CPU 0                           CPU 1
   -----                           -----
				inet_ehash_insert()
                                spin_lock()
                                sk_nulls_del_node_init_rcu(osk)
__inet_lookup_established()
	(lookup failed)
                                __sk_nulls_add_node_rcu(sk, list)
                                spin_unlock()

As both deletion and insertion operate on the same ehash chain, this patch
introduces a new sk_nulls_replace_node_init_rcu() helper function to
implement atomic replacement.

Fixes: 5e0724d027f0 ("tcp/dccp: fix hashdance race for passive sessions")
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251015020236.431822-3-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h         | 13 +++++++++++++
 net/ipv4/inet_hashtables.c |  8 ++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 30ac2eb4ef9b..335d0da82d79 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -856,6 +856,19 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
 	return rc;
 }
 
+static inline bool sk_nulls_replace_node_init_rcu(struct sock *old,
+						  struct sock *new)
+{
+	if (sk_hashed(old)) {
+		hlist_nulls_replace_init_rcu(&old->sk_nulls_node,
+					     &new->sk_nulls_node);
+		__sock_put(old);
+		return true;
+	}
+
+	return false;
+}
+
 static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
 	hlist_add_head(&sk->sk_node, list);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b7024e3d9ac3..f5826ec4bcaa 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -720,8 +720,11 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 	spin_lock(lock);
 	if (osk) {
 		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
-		ret = sk_nulls_del_node_init_rcu(osk);
-	} else if (found_dup_sk) {
+		ret = sk_nulls_replace_node_init_rcu(osk, sk);
+		goto unlock;
+	}
+
+	if (found_dup_sk) {
 		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
 		if (*found_dup_sk)
 			ret = false;
@@ -730,6 +733,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 	if (ret)
 		__sk_nulls_add_node_rcu(sk, list);
 
+unlock:
 	spin_unlock(lock);
 
 	return ret;
-- 
cgit v1.2.3


From 37a183d3b7cdb873e7f5f9daef1ad6d8f7c95fb7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Tue, 14 Oct 2025 14:58:36 -0700
Subject: tcp: Convert tcp-md5 to use MD5 library instead of crypto_ahash

Make tcp-md5 use the MD5 library API (added in 6.18) instead of the
crypto_ahash API.  This is much simpler and also more efficient:

- The library API just operates on struct md5_ctx.  Just allocate this
  struct on the stack instead of using a pool of pre-allocated
  crypto_ahash and ahash_request objects.

- The library API accepts standard pointers and doesn't require
  scatterlists.  So, for hashing the headers just use an on-stack buffer
  instead of a pool of pre-allocated kmalloc'ed scratch buffers.

- The library API never fails.  Therefore, checking for MD5 hashing
  errors is no longer necessary.  Update tcp_v4_md5_hash_skb(),
  tcp_v6_md5_hash_skb(), tcp_v4_md5_hash_hdr(), tcp_v6_md5_hash_hdr(),
  tcp_md5_hash_key(), tcp_sock_af_ops::calc_md5_hash, and
  tcp_request_sock_ops::calc_md5_hash to return void instead of int.

- The library API provides direct access to the MD5 code, eliminating
  unnecessary overhead such as indirect function calls and scatterlist
  management.  Microbenchmarks of tcp_v4_md5_hash_skb() on x86_64 show a
  speedup from 7518 to 7041 cycles (6% fewer) with skb->len == 1440, or
  from 1020 to 678 cycles (33% fewer) with skb->len == 140.

Since tcp_sigpool_hash_skb_data() can no longer be used, add a function
tcp_md5_hash_skb_data() which is specialized to MD5.  Of course, to the
extent that this duplicates any code, it's well worth it.

To preserve the existing behavior of TCP-MD5 support being disabled when
the kernel is booted with "fips=1", make tcp_md5_do_add() check
fips_enabled itself.  Previously it relied on the error from
crypto_alloc_ahash("md5") being bubbled up.  I don't know for sure that
this is actually needed, but this preserves the existing behavior.

Tested with bidirectional TCP-MD5, both IPv4 and IPv6, between a kernel
that includes this commit and a kernel that doesn't include this commit.

(Side note: please don't use TCP-MD5!  It's cryptographically weak.  But
as long as Linux supports it, it might as well be implemented properly.)

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Link: https://patch.msgid.link/20251014215836.115616-1-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h        |  26 +++------
 net/ipv4/Kconfig         |   4 +-
 net/ipv4/tcp.c           |  73 +++++++++++--------------
 net/ipv4/tcp_ipv4.c      | 137 +++++++++++++++--------------------------------
 net/ipv4/tcp_minisocks.c |   2 -
 net/ipv6/tcp_ipv6.c      | 119 +++++++++++++---------------------------
 6 files changed, 121 insertions(+), 240 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e547138f4fb..67fdd2523d92 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1898,13 +1898,6 @@ struct tcp6_pseudohdr {
 	__be32		protocol;	/* including padding */
 };
 
-union tcp_md5sum_block {
-	struct tcp4_pseudohdr ip4;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct tcp6_pseudohdr ip6;
-#endif
-};
-
 /*
  * struct tcp_sigpool - per-CPU pool of ahash_requests
  * @scratch: per-CPU temporary area, that can be used between
@@ -1939,8 +1932,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c);
 void tcp_sigpool_end(struct tcp_sigpool *c);
 size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len);
 /* - functions */
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
-			const struct sock *sk, const struct sk_buff *skb);
+void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+			 const struct sock *sk, const struct sk_buff *skb);
 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 		   int family, u8 prefixlen, int l3index, u8 flags,
 		   const u8 *newkey, u8 newkeylen);
@@ -1999,13 +1992,10 @@ static inline void tcp_md5_destruct_sock(struct sock *sk)
 }
 #endif
 
-int tcp_md5_alloc_sigpool(void);
-void tcp_md5_release_sigpool(void);
-void tcp_md5_add_sigpool(void);
-extern int tcp_md5_sigpool_id;
-
-int tcp_md5_hash_key(struct tcp_sigpool *hp,
-		     const struct tcp_md5sig_key *key);
+struct md5_ctx;
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+			   unsigned int header_len);
+void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key);
 
 /* From tcp_fastopen.c */
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
@@ -2355,7 +2345,7 @@ struct tcp_sock_af_ops {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	*(*md5_lookup) (const struct sock *sk,
 						const struct sock *addr_sk);
-	int		(*calc_md5_hash)(char *location,
+	void		(*calc_md5_hash)(char *location,
 					 const struct tcp_md5sig_key *md5,
 					 const struct sock *sk,
 					 const struct sk_buff *skb);
@@ -2383,7 +2373,7 @@ struct tcp_request_sock_ops {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
 						 const struct sock *addr_sk);
-	int		(*calc_md5_hash) (char *location,
+	void		(*calc_md5_hash) (char *location,
 					  const struct tcp_md5sig_key *md5,
 					  const struct sock *sk,
 					  const struct sk_buff *skb);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 12850a277251..b71c22475c51 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -760,9 +760,7 @@ config TCP_AO
 
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385)"
-	select CRYPTO
-	select CRYPTO_MD5
-	select TCP_SIGPOOL
+	select CRYPTO_LIB_MD5
 	help
 	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
 	  Its main (only?) use is to protect BGP sessions between core routers
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d720aa09a4c..0ccc5405e740 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -243,7 +243,7 @@
 
 #define pr_fmt(fmt) "TCP: " fmt
 
-#include <crypto/hash.h>
+#include <crypto/md5.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -253,7 +253,6 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/skbuff.h>
-#include <linux/scatterlist.h>
 #include <linux/splice.h>
 #include <linux/net.h>
 #include <linux/socket.h>
@@ -425,7 +424,6 @@ void tcp_md5_destruct_sock(struct sock *sk)
 		tcp_clear_md5_list(sk);
 		kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
 		static_branch_slow_dec_deferred(&tcp_md5_needed);
-		tcp_md5_release_sigpool();
 	}
 }
 EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
@@ -4838,52 +4836,45 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 EXPORT_IPV6_MOD(tcp_getsockopt);
 
 #ifdef CONFIG_TCP_MD5SIG
-int tcp_md5_sigpool_id = -1;
-EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id);
-
-int tcp_md5_alloc_sigpool(void)
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+			   unsigned int header_len)
 {
-	size_t scratch_size;
-	int ret;
+	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+					   skb_headlen(skb) - header_len : 0;
+	const struct skb_shared_info *shi = skb_shinfo(skb);
+	struct sk_buff *frag_iter;
+	unsigned int i;
 
-	scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr);
-	ret = tcp_sigpool_alloc_ahash("md5", scratch_size);
-	if (ret >= 0) {
-		/* As long as any md5 sigpool was allocated, the return
-		 * id would stay the same. Re-write the id only for the case
-		 * when previously all MD5 keys were deleted and this call
-		 * allocates the first MD5 key, which may return a different
-		 * sigpool id than was used previously.
-		 */
-		WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */
-		return 0;
-	}
-	return ret;
-}
+	md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);
 
-void tcp_md5_release_sigpool(void)
-{
-	tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id));
-}
+	for (i = 0; i < shi->nr_frags; ++i) {
+		const skb_frag_t *f = &shi->frags[i];
+		u32 p_off, p_len, copied;
+		const void *vaddr;
+		struct page *p;
 
-void tcp_md5_add_sigpool(void)
-{
-	tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id));
+		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+				      p, p_off, p_len, copied) {
+			vaddr = kmap_local_page(p);
+			md5_update(ctx, vaddr + p_off, p_len);
+			kunmap_local(vaddr);
+		}
+	}
+
+	skb_walk_frags(skb, frag_iter)
+		tcp_md5_hash_skb_data(ctx, frag_iter, 0);
 }
+EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
 
-int tcp_md5_hash_key(struct tcp_sigpool *hp,
-		     const struct tcp_md5sig_key *key)
+void tcp_md5_hash_key(struct md5_ctx *ctx,
+		      const struct tcp_md5sig_key *key)
 {
 	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
-	struct scatterlist sg;
-
-	sg_init_one(&sg, key->key, keylen);
-	ahash_request_set_crypt(hp->req, &sg, NULL, keylen);
 
 	/* We use data_race() because tcp_md5_do_add() might change
 	 * key->key under us
 	 */
-	return data_race(crypto_ahash_update(hp->req));
+	data_race(({ md5_update(ctx, key->key, keylen), 0; }));
 }
 EXPORT_IPV6_MOD(tcp_md5_hash_key);
 
@@ -4902,7 +4893,6 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
 	u8 newhash[16];
-	int genhash;
 
 	key = tcp_md5_do_lookup(sk, l3index, saddr, family);
 
@@ -4917,11 +4907,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	 * IPv4-mapped case.
 	 */
 	if (family == AF_INET)
-		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 	else
-		genhash = tp->af_specific->calc_md5_hash(newhash, key,
-							 NULL, skb);
-	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+		tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
+	if (memcmp(hash_location, newhash, 16) != 0) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
 		trace_tcp_hash_md5_mismatch(sk, skb);
 		return SKB_DROP_REASON_TCP_MD5FAILURE;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b1fcf3e4e1ce..40a76da5364a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -53,6 +53,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/cache.h>
+#include <linux/fips.h>
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
@@ -86,14 +87,13 @@
 #include <linux/btf_ids.h>
 #include <linux/skbuff_ref.h>
 
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
 
 #include <trace/events/tcp.h>
 
 #ifdef CONFIG_TCP_MD5SIG
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
-			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
+static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+				__be32 daddr, __be32 saddr, const struct tcphdr *th);
 #endif
 
 struct inet_hashinfo tcp_hashinfo;
@@ -754,7 +754,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 	struct tcp_md5sig_key *key = NULL;
 	unsigned char newhash[16];
 	struct sock *sk1 = NULL;
-	int genhash;
 #endif
 	u64 transmit_time = 0;
 	struct sock *ctl_sk;
@@ -840,11 +839,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 		if (!key)
 			goto out;
 
-
-		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
-		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
+		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+		if (memcmp(md5_hash_location, newhash, 16) != 0)
 			goto out;
-
 	}
 
 	if (key) {
@@ -1425,13 +1422,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
-		if (tcp_md5_alloc_sigpool())
-			return -ENOMEM;
+		if (fips_enabled) {
+			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
+			return -EOPNOTSUPP;
+		}
 
-		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
-			tcp_md5_release_sigpool();
+		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
 			return -ENOMEM;
-		}
 
 		if (!static_branch_inc(&tcp_md5_needed.key)) {
 			struct tcp_md5sig_info *md5sig;
@@ -1439,7 +1436,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
 			rcu_assign_pointer(tp->md5sig_info, NULL);
 			kfree_rcu(md5sig, rcu);
-			tcp_md5_release_sigpool();
 			return -EUSERS;
 		}
 	}
@@ -1456,12 +1452,9 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
-		tcp_md5_add_sigpool();
 
-		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
-			tcp_md5_release_sigpool();
+		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
 			return -ENOMEM;
-		}
 
 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
 			struct tcp_md5sig_info *md5sig;
@@ -1470,7 +1463,6 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
 			rcu_assign_pointer(tp->md5sig_info, NULL);
 			kfree_rcu(md5sig, rcu);
-			tcp_md5_release_sigpool();
 			return -EUSERS;
 		}
 	}
@@ -1578,66 +1570,44 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
 
-static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
-				   __be32 daddr, __be32 saddr,
-				   const struct tcphdr *th, int nbytes)
+static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
+				    __be32 daddr, __be32 saddr,
+				    const struct tcphdr *th, int nbytes)
 {
-	struct tcp4_pseudohdr *bp;
-	struct scatterlist sg;
-	struct tcphdr *_th;
-
-	bp = hp->scratch;
-	bp->saddr = saddr;
-	bp->daddr = daddr;
-	bp->pad = 0;
-	bp->protocol = IPPROTO_TCP;
-	bp->len = cpu_to_be16(nbytes);
-
-	_th = (struct tcphdr *)(bp + 1);
-	memcpy(_th, th, sizeof(*th));
-	_th->check = 0;
+	struct {
+		struct tcp4_pseudohdr ip;
+		struct tcphdr tcp;
+	} h;
 
-	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
-	ahash_request_set_crypt(hp->req, &sg, NULL,
-				sizeof(*bp) + sizeof(*th));
-	return crypto_ahash_update(hp->req);
+	h.ip.saddr = saddr;
+	h.ip.daddr = daddr;
+	h.ip.pad = 0;
+	h.ip.protocol = IPPROTO_TCP;
+	h.ip.len = cpu_to_be16(nbytes);
+	h.tcp = *th;
+	h.tcp.check = 0;
+	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
 }
 
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
-			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
 {
-	struct tcp_sigpool hp;
+	struct md5_ctx ctx;
 
-	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
-		goto clear_hash_nostart;
-
-	if (crypto_ahash_init(hp.req))
-		goto clear_hash;
-	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_key(&hp, key))
-		goto clear_hash;
-	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(hp.req))
-		goto clear_hash;
-
-	tcp_sigpool_end(&hp);
-	return 0;
-
-clear_hash:
-	tcp_sigpool_end(&hp);
-clear_hash_nostart:
-	memset(md5_hash, 0, 16);
-	return 1;
+	md5_init(&ctx);
+	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+	tcp_md5_hash_key(&ctx, key);
+	md5_final(&ctx, md5_hash);
 }
 
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
-			const struct sock *sk,
-			const struct sk_buff *skb)
+noinline_for_stack void
+tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+		    const struct sock *sk, const struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
-	struct tcp_sigpool hp;
 	__be32 saddr, daddr;
+	struct md5_ctx ctx;
 
 	if (sk) { /* valid for establish/request sockets */
 		saddr = sk->sk_rcv_saddr;
@@ -1648,30 +1618,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 		daddr = iph->daddr;
 	}
 
-	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
-		goto clear_hash_nostart;
-
-	if (crypto_ahash_init(hp.req))
-		goto clear_hash;
-
-	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
-		goto clear_hash;
-	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_key(&hp, key))
-		goto clear_hash;
-	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(hp.req))
-		goto clear_hash;
-
-	tcp_sigpool_end(&hp);
-	return 0;
-
-clear_hash:
-	tcp_sigpool_end(&hp);
-clear_hash_nostart:
-	memset(md5_hash, 0, 16);
-	return 1;
+	md5_init(&ctx);
+	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+	tcp_md5_hash_key(&ctx, key);
+	md5_final(&ctx, md5_hash);
 }
 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2ec8c6f1cdcc..ded2cf1f6006 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -312,7 +312,6 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
 			return;
 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
 			goto out_free;
-		tcp_md5_add_sigpool();
 	}
 	return;
 out_free:
@@ -406,7 +405,6 @@ void tcp_twsk_destructor(struct sock *sk)
 		if (twsk->tw_md5_key) {
 			kfree(twsk->tw_md5_key);
 			static_branch_slow_dec_deferred(&tcp_md5_needed);
-			tcp_md5_release_sigpool();
 		}
 	}
 #endif
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6197dd4e6261..06eb90e4078e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -67,8 +67,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
 
 #include <trace/events/tcp.h>
 
@@ -691,69 +690,45 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
 			      cmd.tcpm_key, cmd.tcpm_keylen);
 }
 
-static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp,
-				   const struct in6_addr *daddr,
-				   const struct in6_addr *saddr,
-				   const struct tcphdr *th, int nbytes)
+static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx,
+				    const struct in6_addr *daddr,
+				    const struct in6_addr *saddr,
+				    const struct tcphdr *th, int nbytes)
 {
-	struct tcp6_pseudohdr *bp;
-	struct scatterlist sg;
-	struct tcphdr *_th;
-
-	bp = hp->scratch;
-	/* 1. TCP pseudo-header (RFC2460) */
-	bp->saddr = *saddr;
-	bp->daddr = *daddr;
-	bp->protocol = cpu_to_be32(IPPROTO_TCP);
-	bp->len = cpu_to_be32(nbytes);
-
-	_th = (struct tcphdr *)(bp + 1);
-	memcpy(_th, th, sizeof(*th));
-	_th->check = 0;
-
-	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
-	ahash_request_set_crypt(hp->req, &sg, NULL,
-				sizeof(*bp) + sizeof(*th));
-	return crypto_ahash_update(hp->req);
+	struct {
+		struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */
+		struct tcphdr tcp;
+	} h;
+
+	h.ip.saddr = *saddr;
+	h.ip.daddr = *daddr;
+	h.ip.protocol = cpu_to_be32(IPPROTO_TCP);
+	h.ip.len = cpu_to_be32(nbytes);
+	h.tcp = *th;
+	h.tcp.check = 0;
+	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
 }
 
-static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
-			       const struct in6_addr *daddr, struct in6_addr *saddr,
-			       const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+		    const struct in6_addr *daddr, struct in6_addr *saddr,
+		    const struct tcphdr *th)
 {
-	struct tcp_sigpool hp;
-
-	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
-		goto clear_hash_nostart;
-
-	if (crypto_ahash_init(hp.req))
-		goto clear_hash;
-	if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_key(&hp, key))
-		goto clear_hash;
-	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(hp.req))
-		goto clear_hash;
-
-	tcp_sigpool_end(&hp);
-	return 0;
+	struct md5_ctx ctx;
 
-clear_hash:
-	tcp_sigpool_end(&hp);
-clear_hash_nostart:
-	memset(md5_hash, 0, 16);
-	return 1;
+	md5_init(&ctx);
+	tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+	tcp_md5_hash_key(&ctx, key);
+	md5_final(&ctx, md5_hash);
 }
 
-static int tcp_v6_md5_hash_skb(char *md5_hash,
-			       const struct tcp_md5sig_key *key,
-			       const struct sock *sk,
-			       const struct sk_buff *skb)
+static noinline_for_stack void
+tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+		    const struct sock *sk, const struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct in6_addr *saddr, *daddr;
-	struct tcp_sigpool hp;
+	struct md5_ctx ctx;
 
 	if (sk) { /* valid for establish/request sockets */
 		saddr = &sk->sk_v6_rcv_saddr;
@@ -764,30 +739,11 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 		daddr = &ip6h->daddr;
 	}
 
-	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
-		goto clear_hash_nostart;
-
-	if (crypto_ahash_init(hp.req))
-		goto clear_hash;
-
-	if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
-		goto clear_hash;
-	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
-		goto clear_hash;
-	if (tcp_md5_hash_key(&hp, key))
-		goto clear_hash;
-	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
-	if (crypto_ahash_final(hp.req))
-		goto clear_hash;
-
-	tcp_sigpool_end(&hp);
-	return 0;
-
-clear_hash:
-	tcp_sigpool_end(&hp);
-clear_hash_nostart:
-	memset(md5_hash, 0, 16);
-	return 1;
+	md5_init(&ctx);
+	tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+	tcp_md5_hash_key(&ctx, key);
+	md5_final(&ctx, md5_hash);
 }
 #endif
 
@@ -1032,7 +988,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
 	int oif = 0;
 #ifdef CONFIG_TCP_MD5SIG
 	unsigned char newhash[16];
-	int genhash;
 	struct sock *sk1 = NULL;
 #endif
 
@@ -1091,8 +1046,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
 			goto out;
 		key.type = TCP_KEY_MD5;
 
-		genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
-		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
+		tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
+		if (memcmp(md5_hash_location, newhash, 16) != 0)
 			goto out;
 	}
 #endif
-- 
cgit v1.2.3


From d52bb3daad3f28403676dff31fa0577bdaf8e7c6 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Sat, 18 Oct 2025 12:55:31 +0900
Subject: firewire: core: handle device quirk of TASCAM FW-1884/FW-1804/FW-1082

TASCAM FW-1884/FW-1804/FW-1082 is too lazy to respond to asynchronous
requests at S400. Asynchronous transactions often result in a timeout.
This is a problematic quirk.

This commit adds support for the quirk. When the new quirk flag is
identified, the transaction speed is configured at S200.

Link: https://lore.kernel.org/r/20251018035532.287124-4-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-device.c | 18 +++++++++++++++++-
 include/linux/firewire.h       |  3 +++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 6a5740ed4934..1674de477852 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -571,6 +571,14 @@ static const struct entry_match motu_audio_express_matches[] = {
 	{ 8, 0x17104800 },
 };
 
+static const struct entry_match tascam_fw_series_matches[] = {
+	{ 1, 0x0300022e },
+	{ 3, 0x8d000006 },
+	{ 4, 0xd1000001 },
+	{ 6, 0x1200022e },
+	{ 8, 0xd4000004 },
+};
+
 static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned int length)
 {
 	static const struct {
@@ -583,6 +591,11 @@ static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned i
 			.matches = motu_audio_express_matches,
 			.match_count = ARRAY_SIZE(motu_audio_express_matches),
 		},
+		{
+			.quirk = FW_DEVICE_QUIRK_UNSTABLE_AT_S400,
+			.matches = tascam_fw_series_matches,
+			.match_count = ARRAY_SIZE(tascam_fw_series_matches),
+		},
 	};
 	int quirks = 0;
 	int i;
@@ -761,7 +774,10 @@ static int read_config_rom(struct fw_device *device, int generation)
 	// Just prevent from torn writing/reading.
 	WRITE_ONCE(device->quirks, quirks);
 
-	speed = device->node->max_speed;
+	if (unlikely(quirks & FW_DEVICE_QUIRK_UNSTABLE_AT_S400))
+		speed = SCODE_200;
+	else
+		speed = device->node->max_speed;
 
 	// Determine the speed of
 	//   - devices with link speed less than PHY speed,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index f1d8734c0ec6..6143b7d28eac 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -179,6 +179,9 @@ enum fw_device_quirk {
 
 	// MOTU Audio Express transfers acknowledge packet with 0x10 for pending state.
 	FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2),
+
+	// TASCAM FW-1082/FW-1804/FW-1884 often freezes when receiving S400 packets.
+	FW_DEVICE_QUIRK_UNSTABLE_AT_S400 = BIT(3),
 };
 
 enum fw_device_state {
-- 
cgit v1.2.3


From b57100a3d9ced8c2b78e87d313f514a3338d016e Mon Sep 17 00:00:00 2001
From: Malaya Kumar Rout <mrout@redhat.com>
Date: Tue, 14 Oct 2025 01:00:27 +0530
Subject: PM: console: Fix memory allocation error handling in
 pm_vt_switch_required()

The pm_vt_switch_required() function fails silently when memory
allocation fails, offering no indication to callers that the operation
was unsuccessful. This behavior prevents drivers from handling allocation
errors correctly or implementing retry mechanisms. By ensuring that
failures are reported back to the caller, drivers can make informed
decisions, improve robustness, and avoid unexpected behavior during
critical power management operations.

Change the function signature to return an integer error code and modify
the implementation to return -ENOMEM when kmalloc() fails. Update both
the function declaration and the inline stub in include/linux/pm.h to
maintain consistency across CONFIG_VT_CONSOLE_SLEEP configurations.

The function now returns:
 - 0 on success (including when updating existing entries)
 - -ENOMEM when memory allocation fails

This change improves error reporting without breaking existing callers:
the current callers in drivers/video/fbdev/core/fbmem.c do not use the
return value, so this is a backward-compatible improvement.

Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Malaya Kumar Rout <mrout@redhat.com>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Link: https://patch.msgid.link/20251013193028.89570-1-mrout@redhat.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h     | 5 +++--
 kernel/power/console.c | 8 ++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index cc7b2dc28574..a72e42eec130 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -25,11 +25,12 @@ extern void (*pm_power_off)(void);
 
 struct device; /* we have a circular dep with device.h */
 #ifdef CONFIG_VT_CONSOLE_SLEEP
-extern void pm_vt_switch_required(struct device *dev, bool required);
+extern int pm_vt_switch_required(struct device *dev, bool required);
 extern void pm_vt_switch_unregister(struct device *dev);
 #else
-static inline void pm_vt_switch_required(struct device *dev, bool required)
+static inline int pm_vt_switch_required(struct device *dev, bool required)
 {
+	return 0;
 }
 static inline void pm_vt_switch_unregister(struct device *dev)
 {
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 19c48aa5355d..a906a0ac0f9b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list);
  * no_console_suspend argument has been passed on the command line, VT
  * switches will occur.
  */
-void pm_vt_switch_required(struct device *dev, bool required)
+int pm_vt_switch_required(struct device *dev, bool required)
 {
 	struct pm_vt_switch *entry, *tmp;
+	int ret = 0;
 
 	mutex_lock(&vt_switch_mutex);
 	list_for_each_entry(tmp, &pm_vt_switch_list, head) {
@@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required)
 	}
 
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
+	if (!entry) {
+		ret = -ENOMEM;
 		goto out;
+		}
 
 	entry->required = required;
 	entry->dev = dev;
@@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required)
 	list_add(&entry->head, &pm_vt_switch_list);
 out:
 	mutex_unlock(&vt_switch_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(pm_vt_switch_required);
 
-- 
cgit v1.2.3


From 8b9cd112f1ac8d72244b189654e693012ea8dfe0 Mon Sep 17 00:00:00 2001
From: André Draszik <andre.draszik@linaro.org>
Date: Thu, 9 Oct 2025 10:31:27 +0100
Subject: soc: samsung: gs101-pmu: implement access tables for read and write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Accessing non-existent PMU registers causes an SError, halting the
system.

Implement read and write access tables for the gs101-PMU to specify
which registers are read- and/or writable to avoid that SError.

Reviewed-by: Sam Protsenko <semen.protsenko@linaro.org>
Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://patch.msgid.link/20251009-gs101-pmu-regmap-tables-v2-3-2d64f5261952@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 drivers/soc/samsung/gs101-pmu.c             | 306 ++++++++++++++++++++++++-
 include/linux/soc/samsung/exynos-regs-pmu.h | 343 +++++++++++++++++++++++++++-
 2 files changed, 640 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/samsung/gs101-pmu.c b/drivers/soc/samsung/gs101-pmu.c
index ec345d09f21f..17dadc1b9c6e 100644
--- a/drivers/soc/samsung/gs101-pmu.c
+++ b/drivers/soc/samsung/gs101-pmu.c
@@ -9,6 +9,7 @@
 #include <linux/array_size.h>
 #include <linux/soc/samsung/exynos-pmu.h>
 #include <linux/soc/samsung/exynos-regs-pmu.h>
+#include <linux/regmap.h>
 
 #include "exynos-pmu.h"
 
@@ -20,9 +21,312 @@
 #define TENSOR_PMUREG_WRITE		1
 #define TENSOR_PMUREG_RMW		2
 
+static const struct regmap_range gs101_pmu_registers[] = {
+	regmap_reg_range(GS101_OM_STAT, GS101_SYSTEM_INFO),
+	regmap_reg_range(GS101_IDLE_IP(0), GS101_IDLE_IP_MASK(3)),
+	regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0),
+			 GS101_PPMPURAM_INFORM_SCL_CH(3)),
+	regmap_reg_range(GS101_INFORM0, GS101_SYSIP_DAT(0)),
+	/* skip SYSIP_DAT1 SYSIP_DAT2 */
+	regmap_reg_range(GS101_SYSIP_DAT(3), GS101_PWR_HOLD_SW_TRIP),
+	regmap_reg_range(GS101_GSA_INFORM(0), GS101_GSA_INFORM(1)),
+	regmap_reg_range(GS101_INFORM4, GS101_IROM_INFORM),
+	regmap_reg_range(GS101_IROM_CPU_INFORM(0), GS101_IROM_CPU_INFORM(7)),
+	regmap_reg_range(GS101_PMU_SPARE(0), GS101_PMU_SPARE(3)),
+	/* skip most IROM_xxx registers */
+	regmap_reg_range(GS101_DREX_CALIBRATION(0), GS101_DREX_CALIBRATION(7)),
+
+#define CLUSTER_CPU_RANGE(cl, cpu)					\
+	regmap_reg_range(GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu),	\
+			 GS101_CLUSTER_CPU_OPTION(cl, cpu)),		\
+	regmap_reg_range(GS101_CLUSTER_CPU_OUT(cl, cpu),		\
+			 GS101_CLUSTER_CPU_IN(cl, cpu)),		\
+	regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu),		\
+			 GS101_CLUSTER_CPU_INT_DIR(cl, cpu))
+
+	/* cluster 0..2 and cpu 0..3 or 0..1 */
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1),
+#undef CLUSTER_CPU_RANGE
+
+#define CLUSTER_NONCPU_RANGE(cl)					\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_CONFIGURATION(cl),	\
+			 GS101_CLUSTER_NONCPU_OPTION(cl)),		\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_OUT(cl),			\
+			 GS101_CLUSTER_NONCPU_IN(cl)),			\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl),		\
+			 GS101_CLUSTER_NONCPU_INT_DIR(cl)),		\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl),	\
+			 GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl)),	\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl),	\
+			 GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl))
+
+	CLUSTER_NONCPU_RANGE(0),
+	regmap_reg_range(GS101_CLUSTER0_NONCPU_DSU_PCH,
+			 GS101_CLUSTER0_NONCPU_DSU_PCH),
+	CLUSTER_NONCPU_RANGE(1),
+	CLUSTER_NONCPU_RANGE(2),
+#undef CLUSTER_NONCPU_RANGE
+
+#define SUBBLK_RANGE(blk)						\
+	regmap_reg_range(GS101_SUBBLK_CONFIGURATION(blk),		\
+			 GS101_SUBBLK_CTRL(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_OUT(blk), GS101_SUBBLK_IN(blk)),	\
+	regmap_reg_range(GS101_SUBBLK_INT_IN(blk),			\
+			 GS101_SUBBLK_INT_DIR(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_MEMORY_OUT(blk),			\
+			 GS101_SUBBLK_MEMORY_IN(blk))
+
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D),
+#undef SUBBLK_RANGE
+
+#define SUBBLK_CPU_RANGE(blk)						\
+	regmap_reg_range(GS101_SUBBLK_CPU_CONFIGURATION(blk),		\
+			 GS101_SUBBLK_CPU_OPTION(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_CPU_OUT(blk),			\
+			 GS101_SUBBLK_CPU_IN(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk),			\
+			 GS101_SUBBLK_CPU_INT_DIR(blk))
+
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM),
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE),
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS),
+#undef SUBBLK_CPU_RANGE
+
+	regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CTRL),
+	regmap_reg_range(GS101_MIF_OUT, GS101_MIF_IN),
+	regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_DIR),
+	regmap_reg_range(GS101_TOP_CONFIGURATION, GS101_TOP_OPTION),
+	regmap_reg_range(GS101_TOP_OUT, GS101_TOP_IN),
+	regmap_reg_range(GS101_TOP_INT_IN, GS101_WAKEUP2_STAT),
+	regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_DIR),
+	regmap_reg_range(GS101_SYSTEM_CONFIGURATION, GS101_USER_DEFINED_OUT),
+	regmap_reg_range(GS101_SYSTEM_OUT, GS101_SYSTEM_IN),
+	regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_EINT_WAKEUP_MASK3),
+	regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_SCAN2DRAM_INT_DIR),
+	/* skip HCU_START */
+	regmap_reg_range(GS101_CUSTOM_OUT, GS101_CUSTOM_IN),
+	regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_DIR),
+	regmap_reg_range(GS101_ACK_LAST_CPU, GS101_HCU_R(3)),
+	regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC),
+	/* skip PMU_RAM_CTRL */
+	regmap_reg_range(GS101_APM_HCU_CTRL, GS101_APM_HCU_CTRL),
+	regmap_reg_range(GS101_APM_NMI_ENABLE, GS101_RST_STAT_PMU),
+	regmap_reg_range(GS101_HPM_INT_IN, GS101_BOOT_STAT),
+	regmap_reg_range(GS101_PMLINK_OUT, GS101_PMLINK_AOC_CTRL),
+	regmap_reg_range(GS101_TCXO_BUF_CTRL, GS101_ADD_CTRL),
+	regmap_reg_range(GS101_HCU_TIMEOUT_RESET, GS101_HCU_TIMEOUT_SCAN2DRAM),
+	regmap_reg_range(GS101_TIMER(0), GS101_TIMER(3)),
+	regmap_reg_range(GS101_PPC_MIF(0), GS101_PPC_EH),
+	/* PPC_OFFSET, skip PPC_CPUCL1_0 PPC_CPUCL1_1 */
+	regmap_reg_range(GS101_EXT_REGULATOR_MIF_DURATION, GS101_TCXO_DURATION),
+	regmap_reg_range(GS101_BURNIN_CTRL, GS101_TMU_SUB_TRIP),
+	regmap_reg_range(GS101_MEMORY_CEN, GS101_MEMORY_SMX_FEEDBACK),
+	regmap_reg_range(GS101_SLC_PCH_CHANNEL, GS101_SLC_PCH_CB),
+	regmap_reg_range(GS101_FORCE_NOMC, GS101_FORCE_NOMC),
+	regmap_reg_range(GS101_FORCE_BOOST, GS101_PMLINK_SLC_BUSY),
+	regmap_reg_range(GS101_BOOTSYNC_OUT, GS101_CTRL_SECJTAG_ALIVE),
+	regmap_reg_range(GS101_CTRL_DIV_PLL_ALV_DIVLOW, GS101_CTRL_CLKDIV__CLKRTC),
+	regmap_reg_range(GS101_CTRL_SOC32K, GS101_CTRL_SBU_SW_EN),
+	regmap_reg_range(GS101_PAD_CTRL_CLKOUT0, GS101_PAD_CTRL_WRESETO_n),
+	regmap_reg_range(GS101_PHY_CTRL_USB20, GS101_PHY_CTRL_UFS),
+};
+
+static const struct regmap_range gs101_pmu_ro_registers[] = {
+	regmap_reg_range(GS101_OM_STAT, GS101_VERSION),
+	regmap_reg_range(GS101_OTP_STATUS, GS101_OTP_STATUS),
+
+	regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0),
+			 GS101_PPMPURAM_STATE_SLC_CH(0)),
+	regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(1),
+			 GS101_PPMPURAM_STATE_SLC_CH(1)),
+	regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(2),
+			 GS101_PPMPURAM_STATE_SLC_CH(2)),
+	regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(3),
+			 GS101_PPMPURAM_STATE_SLC_CH(3)),
+
+#define CLUSTER_CPU_RANGE(cl, cpu)					\
+	regmap_reg_range(GS101_CLUSTER_CPU_IN(cl, cpu),			\
+			 GS101_CLUSTER_CPU_IN(cl, cpu)),		\
+	regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu),		\
+			 GS101_CLUSTER_CPU_INT_IN(cl, cpu))
+
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0),
+	CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1),
+#undef CLUSTER_CPU_RANGE
+
+#define CLUSTER_NONCPU_RANGE(cl)					\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_IN(cl),			\
+			 GS101_CLUSTER_NONCPU_IN(cl)),			\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl),		\
+			 GS101_CLUSTER_NONCPU_INT_IN(cl)),		\
+	regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl),	\
+			 GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl))
+
+	CLUSTER_NONCPU_RANGE(0),
+	CLUSTER_NONCPU_RANGE(1),
+	CLUSTER_NONCPU_RANGE(2),
+	regmap_reg_range(GS101_CLUSTER_NONCPU_INT_EN(2),
+			 GS101_CLUSTER_NONCPU_INT_DIR(2)),
+#undef CLUSTER_NONCPU_RANGE
+
+#define SUBBLK_RANGE(blk)						\
+	regmap_reg_range(GS101_SUBBLK_IN(blk), GS101_SUBBLK_IN(blk)),	\
+	regmap_reg_range(GS101_SUBBLK_INT_IN(blk),			\
+			 GS101_SUBBLK_INT_IN(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_MEMORY_IN(blk),			\
+			 GS101_SUBBLK_MEMORY_IN(blk))
+
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1),
+	SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D),
+#undef SUBBLK_RANGE
+
+#define SUBBLK_CPU_RANGE(blk)						\
+	regmap_reg_range(GS101_SUBBLK_CPU_IN(blk),			\
+			 GS101_SUBBLK_CPU_IN(blk)),			\
+	regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk),			\
+			 GS101_SUBBLK_CPU_INT_IN(blk))
+
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM),
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE),
+	SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS),
+#undef SUBBLK_CPU_RANGE
+
+	regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CONFIGURATION),
+	regmap_reg_range(GS101_MIF_IN, GS101_MIF_IN),
+	regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_IN),
+	regmap_reg_range(GS101_TOP_IN, GS101_TOP_IN),
+	regmap_reg_range(GS101_TOP_INT_IN, GS101_TOP_INT_IN),
+	regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_IN),
+	regmap_reg_range(GS101_SYSTEM_IN, GS101_SYSTEM_IN),
+	regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_SYSTEM_INT_IN),
+	regmap_reg_range(GS101_EINT_INT_IN, GS101_EINT_INT_IN),
+	regmap_reg_range(GS101_EINT2_INT_IN, GS101_EINT2_INT_IN),
+	regmap_reg_range(GS101_EINT3_INT_IN, GS101_EINT3_INT_IN),
+	regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_USER_DEFINED_INT_IN),
+	regmap_reg_range(GS101_SCAN2DRAM_INT_IN, GS101_SCAN2DRAM_INT_IN),
+	regmap_reg_range(GS101_CUSTOM_IN, GS101_CUSTOM_IN),
+	regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_IN),
+	regmap_reg_range(GS101_HCU_R(0), GS101_HCU_R(3)),
+	regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC),
+	regmap_reg_range(GS101_NMI_SRC_IN, GS101_NMI_SRC_IN),
+	regmap_reg_range(GS101_HPM_INT_IN, GS101_HPM_INT_IN),
+	regmap_reg_range(GS101_MEMORY_PGEN_FEEDBACK, GS101_MEMORY_PGEN_FEEDBACK),
+	regmap_reg_range(GS101_MEMORY_SMX_FEEDBACK, GS101_MEMORY_SMX_FEEDBACK),
+	regmap_reg_range(GS101_PMLINK_SLC_ACK, GS101_PMLINK_SLC_BUSY),
+	regmap_reg_range(GS101_BOOTSYNC_IN, GS101_BOOTSYNC_IN),
+	regmap_reg_range(GS101_SCAN_READY_IN, GS101_SCAN_READY_IN),
+	regmap_reg_range(GS101_CTRL_PLL_ALV_LOCK, GS101_CTRL_PLL_ALV_LOCK),
+};
+
+static const struct regmap_access_table gs101_pmu_rd_table = {
+	.yes_ranges = gs101_pmu_registers,
+	.n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers),
+};
+
+static const struct regmap_access_table gs101_pmu_wr_table = {
+	.yes_ranges = gs101_pmu_registers,
+	.n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers),
+	.no_ranges = gs101_pmu_ro_registers,
+	.n_no_ranges = ARRAY_SIZE(gs101_pmu_ro_registers),
+};
+
 const struct exynos_pmu_data gs101_pmu_data = {
 	.pmu_secure = true,
 	.pmu_cpuhp = true,
+	.rd_table = &gs101_pmu_rd_table,
+	.wr_table = &gs101_pmu_wr_table,
 };
 
 /*
@@ -124,7 +428,7 @@ static bool tensor_is_atomic(unsigned int reg)
 		return false;
 
 	switch (reg) {
-	case GS101_SYSIP_DAT0:
+	case GS101_SYSIP_DAT(0):
 	case GS101_SYSTEM_CONFIGURATION:
 		return false;
 	default:
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 71e0c09a49eb..532c6c2d1195 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -672,14 +672,341 @@
 
 /* For Tensor GS101 */
 /* PMU ALIVE */
-#define GS101_SYSIP_DAT0					(0x810)
-#define GS101_CPU0_INFORM					(0x860)
-#define GS101_CPU_INFORM(cpu)	\
-			(GS101_CPU0_INFORM + (cpu*4))
-#define GS101_SYSTEM_CONFIGURATION				(0x3A00)
-#define GS101_EINT_WAKEUP_MASK					(0x3A80)
-#define GS101_PHY_CTRL_USB20					(0x3EB0)
-#define GS101_PHY_CTRL_USBDP					(0x3EB4)
+#define GS101_OM_STAT                           0x0000
+#define GS101_VERSION                           0x0004
+#define GS101_PORESET_CHECK                     0x0008
+#define GS101_OTP_STATUS                        0x000c
+#define GS101_SYSTEM_INFO                       0x0010
+#define GS101_IDLE_IP(n)                        (0x03e0 + ((n) & 3) * 4)
+#define GS101_IDLE_IP_MASK(n)                   (0x03f0 + ((n) & 3) * 4)
+#define GS101_SLC_CH_OFFSET(ch)                 (0x0400 + ((ch) & 3) * 0x10)
+#define GS101_DATARAM_STATE_SLC_CH(ch)          (GS101_SLC_CH_OFFSET(ch) + 0x00)
+#define GS101_TAGRAM_STATE_SLC_CH(ch)           (GS101_SLC_CH_OFFSET(ch) + 0x04)
+#define GS101_LRURAM_STATE_SLC_CH(ch)           (GS101_SLC_CH_OFFSET(ch) + 0x08)
+#define GS101_PPMPURAM_STATE_SLC_CH(ch)         (GS101_SLC_CH_OFFSET(ch) + 0x0c)
+#define GS101_DATARAM_INFORM_SCL_CH(ch)         (GS101_SLC_CH_OFFSET(ch) + 0x40)
+#define GS101_TAGRAM_INFORM_SCL_CH(ch)          (GS101_SLC_CH_OFFSET(ch) + 0x44)
+#define GS101_LRURAM_INFORM_SCL_CH(ch)          (GS101_SLC_CH_OFFSET(ch) + 0x48)
+#define GS101_PPMPURAM_INFORM_SCL_CH(ch)        (GS101_SLC_CH_OFFSET(ch) + 0x4c)
+#define GS101_INFORM0                           0x0800
+#define GS101_INFORM1                           0x0804
+#define GS101_INFORM2                           0x0808
+#define GS101_INFORM3                           0x080c
+#define GS101_SYSIP_DAT(n)                      (0x0810 + ((n) & 3) * 4)
+#define GS101_PWR_HOLD_HW_TRIP                  0x0820
+#define GS101_PWR_HOLD_SW_TRIP                  0x0824
+#define GS101_GSA_INFORM(n)                     (0x0830 + ((n) & 1) * 4)
+#define GS101_INFORM4                           0x0840
+#define GS101_INFORM5                           0x0844
+#define GS101_INFORM6                           0x0848
+#define GS101_INFORM7                           0x084c
+#define GS101_INFORM8                           0x0850
+#define GS101_INFORM9                           0x0854
+#define GS101_INFORM10                          0x0858
+#define GS101_INFORM11                          0x085c
+#define GS101_CPU_INFORM(cpu)                   (0x0860 + ((cpu) & 7) * 4)
+#define GS101_IROM_INFORM                       0x0880
+#define GS101_IROM_CPU_INFORM(cpu)              (0x0890 + ((cpu) & 7) * 4)
+#define GS101_PMU_SPARE(n)                      (0x0900 + ((n) & 3) * 4)
+#define GS101_IROM_DATA_REG(n)                  (0x0980 + ((n) & 3) * 4)
+#define GS101_IROM_PWRMODE                      0x0990
+#define GS101_DREX_CALIBRATION(n)               (0x09a0 + ((n) & 7) * 4)
+
+#define GS101_CLUSTER0_OFFSET                   0x1000
+#define GS101_CLUSTER1_OFFSET                   0x1300
+#define GS101_CLUSTER2_OFFSET                   0x1500
+#define GS101_CLUSTER_CPU_OFFSET(cl, cpu)       ((cl) + ((cpu) * 0x80))
+#define GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x00)
+#define GS101_CLUSTER_CPU_STATUS(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x04)
+#define GS101_CLUSTER_CPU_STATES(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x08)
+#define GS101_CLUSTER_CPU_OPTION(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x0c)
+#define GS101_CLUSTER_CPU_OUT(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x20)
+#define GS101_CLUSTER_CPU_IN(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x24)
+#define GS101_CLUSTER_CPU_INT_IN(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x40)
+#define GS101_CLUSTER_CPU_INT_EN(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x44)
+#define GS101_CLUSTER_CPU_INT_TYPE(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x48)
+#define GS101_CLUSTER_CPU_INT_DIR(cl, cpu) \
+			(GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x4c)
+
+#define GS101_CLUSTER_NONCPU_OFFSET(cl)         (0x1200 + ((cl) * 0x200))
+#define GS101_CLUSTER_NONCPU_CONFIGURATION(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x00)
+#define GS101_CLUSTER_NONCPU_STATUS(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x04)
+#define GS101_CLUSTER_NONCPU_STATES(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x08)
+#define GS101_CLUSTER_NONCPU_OPTION(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x0c)
+#define GS101_CLUSTER_NONCPU_OUT(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x20)
+#define GS101_CLUSTER_NONCPU_IN(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x24)
+#define GS101_CLUSTER_NONCPU_INT_IN(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x40)
+#define GS101_CLUSTER_NONCPU_INT_EN(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x44)
+#define GS101_CLUSTER_NONCPU_INT_TYPE(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x48)
+#define GS101_CLUSTER_NONCPU_INT_DIR(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x4c)
+#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x60)
+#define GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x64)
+#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl) \
+			(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x6c)
+#define GS101_CLUSTER0_NONCPU_DSU_PCH \
+			(GS101_CLUSTER_NONCPU_OFFSET(0) + 0x80)
+
+#define GS101_SUBBBLK_OFFSET_ALIVE              0x1800
+#define GS101_SUBBBLK_OFFSET_AOC                0x1880
+#define GS101_SUBBBLK_OFFSET_APM                0x1900
+#define GS101_SUBBBLK_OFFSET_CMU                0x1980
+#define GS101_SUBBBLK_OFFSET_BUS0               0x1a00
+#define GS101_SUBBBLK_OFFSET_BUS1               0x1a80
+#define GS101_SUBBBLK_OFFSET_BUS2               0x1b00
+#define GS101_SUBBBLK_OFFSET_CORE               0x1b80
+#define GS101_SUBBBLK_OFFSET_EH                 0x1c00
+#define GS101_SUBBBLK_OFFSET_CPUCL0             0x1c80
+#define GS101_SUBBBLK_OFFSET_CPUCL1             0x1d00
+#define GS101_SUBBBLK_OFFSET_CPUCL2             0x1d80
+#define GS101_SUBBBLK_OFFSET_G3D                0x1e00
+#define GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0    0x1e80
+#define GS101_SUBBBLK_OFFSET_EMBEDDED_G3D       0x2000
+#define GS101_SUBBBLK_OFFSET_HSI0               0x2080
+#define GS101_SUBBBLK_OFFSET_HSI1               0x2100
+#define GS101_SUBBBLK_OFFSET_HSI2               0x2180
+#define GS101_SUBBBLK_OFFSET_DPU                0x2200
+#define GS101_SUBBBLK_OFFSET_DISP               0x2280
+#define GS101_SUBBBLK_OFFSET_G2D                0x2300
+#define GS101_SUBBBLK_OFFSET_MFC                0x2380
+#define GS101_SUBBBLK_OFFSET_CSIS               0x2400
+#define GS101_SUBBBLK_OFFSET_PDP                0x2480
+#define GS101_SUBBBLK_OFFSET_DNS                0x2500
+#define GS101_SUBBBLK_OFFSET_G3AA               0x2580
+#define GS101_SUBBBLK_OFFSET_IPP                0x2600
+#define GS101_SUBBBLK_OFFSET_ITP                0x2680
+#define GS101_SUBBBLK_OFFSET_MCSC               0x2700
+#define GS101_SUBBBLK_OFFSET_GDC                0x2780
+#define GS101_SUBBBLK_OFFSET_TNR                0x2800
+#define GS101_SUBBBLK_OFFSET_BO                 0x2880
+#define GS101_SUBBBLK_OFFSET_TPU                0x2900
+#define GS101_SUBBBLK_OFFSET_MIF0               0x2980
+#define GS101_SUBBBLK_OFFSET_MIF1               0x2a00
+#define GS101_SUBBBLK_OFFSET_MIF2               0x2a80
+#define GS101_SUBBBLK_OFFSET_MIF3               0x2b00
+#define GS101_SUBBBLK_OFFSET_MISC               0x2b80
+#define GS101_SUBBBLK_OFFSET_PERIC0             0x2c00
+#define GS101_SUBBBLK_OFFSET_PERIC1             0x2c80
+#define GS101_SUBBBLK_OFFSET_S2D                0x2d00
+#define GS101_SUBBLK_CONFIGURATION(blk)         ((blk) + 0x00)
+#define GS101_SUBBLK_STATUS(blk)                ((blk) + 0x04)
+#define GS101_SUBBLK_STATES(blk)                ((blk) + 0x08)
+#define GS101_SUBBLK_OPTION(blk)                ((blk) + 0x0c)
+#define GS101_SUBBLK_CTRL(blk)                  ((blk) + 0x10)
+#define GS101_SUBBLK_OUT(blk)                   ((blk) + 0x20)
+#define GS101_SUBBLK_IN(blk)                    ((blk) + 0x24)
+#define GS101_SUBBLK_INT_IN(blk)                ((blk) + 0x40)
+#define GS101_SUBBLK_INT_EN(blk)                ((blk) + 0x44)
+#define GS101_SUBBLK_INT_TYPE(blk)              ((blk) + 0x48)
+#define GS101_SUBBLK_INT_DIR(blk)               ((blk) + 0x4c)
+#define GS101_SUBBLK_MEMORY_OUT(blk)            ((blk) + 0x60)
+#define GS101_SUBBLK_MEMORY_IN(blk)             ((blk) + 0x64)
+
+#define GS101_SUBBBLK_CPU_OFFSET_APM            0x3000
+#define GS101_SUBBBLK_CPU_OFFSET_DBGCORE        0x3080
+#define GS101_SUBBBLK_CPU_OFFSET_SSS            0x3100
+#define GS101_SUBBLK_CPU_CONFIGURATION(blk)     ((blk) + 0x00)
+#define GS101_SUBBLK_CPU_STATUS(blk)            ((blk) + 0x04)
+#define GS101_SUBBLK_CPU_STATES(blk)            ((blk) + 0x08)
+#define GS101_SUBBLK_CPU_OPTION(blk)            ((blk) + 0x0c)
+#define GS101_SUBBLK_CPU_OUT(blk)               ((blk) + 0x20)
+#define GS101_SUBBLK_CPU_IN(blk)                ((blk) + 0x24)
+#define GS101_SUBBLK_CPU_INT_IN(blk)            ((blk) + 0x40)
+#define GS101_SUBBLK_CPU_INT_EN(blk)            ((blk) + 0x44)
+#define GS101_SUBBLK_CPU_INT_TYPE(blk)          ((blk) + 0x48)
+#define GS101_SUBBLK_CPU_INT_DIR(blk)           ((blk) + 0x4c)
+
+#define GS101_MIF_CONFIGURATION                 0x3800
+#define GS101_MIF_STATUS                        0x3804
+#define GS101_MIF_STATES                        0x3808
+#define GS101_MIF_OPTION                        0x380c
+#define GS101_MIF_CTRL                          0x3810
+#define GS101_MIF_OUT                           0x3820
+#define GS101_MIF_IN                            0x3824
+#define GS101_MIF_INT_IN                        0x3840
+#define GS101_MIF_INT_EN                        0x3844
+#define GS101_MIF_INT_TYPE                      0x3848
+#define GS101_MIF_INT_DIR                       0x384c
+#define GS101_TOP_CONFIGURATION                 0x3900
+#define GS101_TOP_STATUS                        0x3904
+#define GS101_TOP_STATES                        0x3908
+#define GS101_TOP_OPTION                        0x390c
+#define GS101_TOP_OUT                           0x3920
+#define GS101_TOP_IN                            0x3924
+#define GS101_TOP_INT_IN                        0x3940
+#define GS101_TOP_INT_EN                        0x3944
+#define GS101_TOP_INT_TYPE                      0x3948
+#define GS101_TOP_INT_DIR                       0x394c
+#define GS101_WAKEUP_STAT                       0x3950
+#define GS101_WAKEUP2_STAT                      0x3954
+#define GS101_WAKEUP2_INT_IN                    0x3960
+#define GS101_WAKEUP2_INT_EN                    0x3964
+#define GS101_WAKEUP2_INT_TYPE                  0x3968
+#define GS101_WAKEUP2_INT_DIR                   0x396c
+#define GS101_SYSTEM_CONFIGURATION              0x3a00
+#define GS101_SYSTEM_STATUS                     0x3a04
+#define GS101_SYSTEM_STATES                     0x3a08
+#define GS101_SYSTEM_OPTION                     0x3a0c
+#define GS101_SYSTEM_CTRL                       0x3a10
+#define GS101_SPARE_CTRL                        0x3a14
+#define GS101_USER_DEFINED_OUT                  0x3a18
+#define GS101_SYSTEM_OUT                        0x3a20
+#define GS101_SYSTEM_IN                         0x3a24
+#define GS101_SYSTEM_INT_IN                     0x3a40
+#define GS101_SYSTEM_INT_EN                     0x3a44
+#define GS101_SYSTEM_INT_TYPE                   0x3a48
+#define GS101_SYSTEM_INT_DIR                    0x3a4c
+#define GS101_EINT_INT_IN                       0x3a50
+#define GS101_EINT_INT_EN                       0x3a54
+#define GS101_EINT_INT_TYPE                     0x3a58
+#define GS101_EINT_INT_DIR                      0x3a5c
+#define GS101_EINT2_INT_IN                      0x3a60
+#define GS101_EINT2_INT_EN                      0x3a64
+#define GS101_EINT2_INT_TYPE                    0x3a68
+#define GS101_EINT2_INT_DIR                     0x3a6c
+#define GS101_EINT3_INT_IN                      0x3a70
+#define GS101_EINT3_INT_EN                      0x3a74
+#define GS101_EINT3_INT_TYPE                    0x3a78
+#define GS101_EINT3_INT_DIR                     0x3a7c
+#define GS101_EINT_WAKEUP_MASK                  0x3a80
+#define GS101_EINT_WAKEUP_MASK2                 0x3a84
+#define GS101_EINT_WAKEUP_MASK3                 0x3a88
+#define GS101_USER_DEFINED_INT_IN               0x3a90
+#define GS101_USER_DEFINED_INT_EN               0x3a94
+#define GS101_USER_DEFINED_INT_TYPE             0x3a98
+#define GS101_USER_DEFINED_INT_DIR              0x3a9c
+#define GS101_SCAN2DRAM_INT_IN                  0x3aa0
+#define GS101_SCAN2DRAM_INT_EN                  0x3aa4
+#define GS101_SCAN2DRAM_INT_TYPE                0x3aa8
+#define GS101_SCAN2DRAM_INT_DIR                 0x3aac
+#define GS101_HCU_START                         0x3ab0
+#define GS101_CUSTOM_OUT                        0x3ac0
+#define GS101_CUSTOM_IN                         0x3ac4
+#define GS101_CUSTOM_INT_IN                     0x3ad0
+#define GS101_CUSTOM_INT_EN                     0x3ad4
+#define GS101_CUSTOM_INT_TYPE                   0x3ad8
+#define GS101_CUSTOM_INT_DIR                    0x3adc
+#define GS101_ACK_LAST_CPU                      0x3afc
+#define GS101_HCU_R(n)                          (0x3b00 + ((n) & 3) * 4)
+#define GS101_HCU_SP                            0x3b14
+#define GS101_HCU_PC                            0x3b18
+#define GS101_PMU_RAM_CTRL                      0x3b20
+#define GS101_APM_HCU_CTRL                      0x3b24
+#define GS101_APM_NMI_ENABLE                    0x3b30
+#define GS101_DBGCORE_NMI_ENABLE                0x3b34
+#define GS101_HCU_NMI_ENABLE                    0x3b38
+#define GS101_PWR_HOLD_WDT_ENABLE               0x3b3c
+#define GS101_NMI_SRC_IN                        0x3b40
+#define GS101_RST_STAT                          0x3b44
+#define GS101_RST_STAT_PMU                      0x3b48
+#define GS101_HPM_INT_IN                        0x3b60
+#define GS101_HPM_INT_EN                        0x3b64
+#define GS101_HPM_INT_TYPE                      0x3b68
+#define GS101_HPM_INT_DIR                       0x3b6c
+#define GS101_S2D_AUTH                          0x3b70
+#define GS101_BOOT_STAT                         0x3b74
+#define GS101_PMLINK_OUT                        0x3c00
+#define GS101_PMLINK_AOC_OUT                    0x3c04
+#define GS101_PMLINK_AOC_CTRL                   0x3c08
+#define GS101_TCXO_BUF_CTRL                     0x3c10
+#define GS101_ADD_CTRL                          0x3c14
+#define GS101_HCU_TIMEOUT_RESET                 0x3c20
+#define GS101_HCU_TIMEOUT_SCAN2DRAM             0x3c24
+#define GS101_TIMER(n)                          (0x3c80 + ((n) & 3) * 4)
+#define GS101_PPC_MIF(n)                        (0x3c90 + ((n) & 3) * 4)
+#define GS101_PPC_CORE                          0x3ca0
+#define GS101_PPC_EH                            0x3ca4
+#define GS101_PPC_CPUCL1_0                      0x3ca8
+#define GS101_PPC_CPUCL1_1                      0x3cac
+#define GS101_EXT_REGULATOR_MIF_DURATION        0x3cb0
+#define GS101_EXT_REGULATOR_TOP_DURATION        0x3cb4
+#define GS101_EXT_REGULATOR_CPUCL2_DURATION     0x3cb8
+#define GS101_EXT_REGULATOR_CPUCL1_DURATION     0x3cbc
+#define GS101_EXT_REGULATOR_G3D_DURATION        0x3cc0
+#define GS101_EXT_REGULATOR_TPU_DURATION        0x3cc4
+#define GS101_TCXO_DURATION                     0x3cc8
+#define GS101_BURNIN_CTRL                       0x3cd0
+#define GS101_JTAG_DBG_DET                      0x3cd4
+#define GS101_MMC_CONWKUP_CTRL                  0x3cd8
+#define GS101_USBDPPHY0_USBDP_WAKEUP            0x3cdc
+#define GS101_TMU_TOP_TRIP                      0x3ce0
+#define GS101_TMU_SUB_TRIP                      0x3ce4
+#define GS101_MEMORY_CEN                        0x3d00
+#define GS101_MEMORY_PGEN                       0x3d04
+#define GS101_MEMORY_RET                        0x3d08
+#define GS101_MEMORY_PGEN_FEEDBACK              0x3d0c
+#define GS101_MEMORY_SMX                        0x3d10
+#define GS101_MEMORY_SMX_FEEDBACK               0x3d14
+#define GS101_SLC_PCH_CHANNEL                   0x3d20
+#define GS101_SLC_PCH_CB                        0x3d24
+#define GS101_FORCE_NOMC                        0x3d3c
+#define GS101_FORCE_BOOST                       0x3d4c
+#define GS101_PMLINK_SLC_REQ                    0x3d50
+#define GS101_PMLINK_SLC_ACK                    0x3d54
+#define GS101_PMLINK_SLC_BUSY                   0x3d58
+#define GS101_BOOTSYNC_OUT                      0x3d80
+#define GS101_BOOTSYNC_IN                       0x3d84
+#define GS101_SCAN_READY_OUT                    0x3d88
+#define GS101_SCAN_READY_IN                     0x3d8c
+#define GS101_GSA_RESTORE                       0x3d90
+#define GS101_ALIVE_OTP_LATCH                   0x3d94
+#define GS101_DEBUG_OVERRIDE                    0x3d98
+#define GS101_WDT_OPTION                        0x3d9c
+#define GS101_AOC_WDT_CFG                       0x3da0
+#define GS101_CTRL_SECJTAG_ALIVE                0x3da4
+#define GS101_CTRL_DIV_PLL_ALV_DIVLOW           0x3e00
+#define GS101_CTRL_MUX_CLK_APM_REFSRC_AUTORESTORE 0x3e04
+#define GS101_CTRL_MUX_CLK_APM_REFSRC           0x3e08
+#define GS101_CTRL_MUX_CLK_APM_REF              0x3e0c
+#define GS101_CTRL_MUX_PLL_ALV_DIV4             0x3e10
+#define GS101_CTRL_PLL_ALV_DIV4                 0x3e14
+#define GS101_CTRL_OSCCLK_APMGSA                0x3e18
+#define GS101_CTRL_BLK_AOC_CLKS                 0x3e1c
+#define GS101_CTRL_PLL_ALV_LOCK                 0x3e20
+#define GS101_CTRL_CLKDIV__CLKRTC               0x3e24
+#define GS101_CTRL_SOC32K                       0x3e30
+#define GS101_CTRL_STM_PMU                      0x3e34
+#define GS101_CTRL_PMU_DEBUG                    0x3e38
+#define GS101_CTRL_DEBUG_UART                   0x3e3c
+#define GS101_CTRL_TCK                          0x3e40
+#define GS101_CTRL_SBU_SW_EN                    0x3e44
+#define GS101_PAD_CTRL_CLKOUT0                  0x3e80
+#define GS101_PAD_CTRL_CLKOUT1                  0x3e84
+#define GS101_PAD_CTRL_APM_24MOUT_0             0x3e88
+#define GS101_PAD_CTRL_APM_24MOUT_1             0x3e8c
+#define GS101_PAD_CTRL_IO_FORCE_RETENTION       0x3e90
+#define GS101_PAD_CTRL_APACTIVE_n               0x3e94
+#define GS101_PAD_CTRL_TCXO_ON                  0x3e98
+#define GS101_PAD_CTRL_PWR_HOLD                 0x3e9c
+#define GS101_PAD_CTRL_RESETO_n                 0x3ea0
+#define GS101_PAD_CTRL_WRESETO_n                0x3ea4
+#define GS101_PHY_CTRL_USB20                    0x3eb0
+#define GS101_PHY_CTRL_USBDP                    0x3eb4
+#define GS101_PHY_CTRL_MIPI_DCPHY_M4M4          0x3eb8
+#define GS101_PHY_CTRL_MIPI_DCPHY_S4S4S4S4      0x3ebc
+#define GS101_PHY_CTRL_PCIE_GEN4_0              0x3ec0
+#define GS101_PHY_CTRL_PCIE_GEN4_1              0x3ec4
+#define GS101_PHY_CTRL_UFS                      0x3ec8
 
 /* PMU INTR GEN */
 #define GS101_GRP1_INTR_BID_UPEND				(0x0108)
-- 
cgit v1.2.3


From edd548dc64a699d71ea4f537f815044e763d01e1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 17 Oct 2025 12:13:23 -0700
Subject: firmware: qcom: tzmem: fix qcom_tzmem_policy kernel-doc

Use correct kernel-doc syntax and formatting for the enum values to fix
the following kernel-doc warnings:

Warning: include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value
 'QCOM_TZMEM_POLICY_STATIC' not described in enum 'qcom_tzmem_policy'
Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value
 'QCOM_TZMEM_POLICY_MULTIPLIER' not described in enum 'qcom_tzmem_policy'
Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value
 'QCOM_TZMEM_POLICY_ON_DEMAND' not described in enum 'qcom_tzmem_policy'

Fixes: 84f5a7b67b61 ("firmware: qcom: add a dedicated TrustZone buffer allocator")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20251017191323.1820167-1-rdunlap@infradead.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h
index 48ac0e5454c7..23173e0c3ddd 100644
--- a/include/linux/firmware/qcom/qcom_tzmem.h
+++ b/include/linux/firmware/qcom/qcom_tzmem.h
@@ -17,11 +17,20 @@ struct qcom_tzmem_pool;
  * enum qcom_tzmem_policy - Policy for pool growth.
  */
 enum qcom_tzmem_policy {
-	/**< Static pool, never grow above initial size. */
+	/**
+	 * @QCOM_TZMEM_POLICY_STATIC: Static pool,
+	 * never grow above initial size.
+	 */
 	QCOM_TZMEM_POLICY_STATIC = 1,
-	/**< When out of memory, add increment * current size of memory. */
+	/**
+	 * @QCOM_TZMEM_POLICY_MULTIPLIER: When out of memory,
+	 * add increment * current size of memory.
+	 */
 	QCOM_TZMEM_POLICY_MULTIPLIER,
-	/**< When out of memory add as much as is needed until max_size. */
+	/**
+	 * @QCOM_TZMEM_POLICY_ON_DEMAND: When out of memory
+	 * add as much as is needed until max_size.
+	 */
 	QCOM_TZMEM_POLICY_ON_DEMAND,
 };
 
-- 
cgit v1.2.3


From d742ebcfe524dc54023f7c520d2ed2e4b7203c19 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 14 Oct 2025 04:28:04 +0000
Subject: ASoC: soc.h: remove snd_soc_kcontrol_component()

All drivers now use snd_kcontrol_chip() instead of
snd_soc_kcontrol_component() to get the component.

Remove snd_soc_kcontrol_component().

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87bjmam7jf.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index ddc508ff7b9b..1aebf14fcf80 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1305,22 +1305,6 @@ static inline unsigned int snd_soc_enum_item_to_val(const struct soc_enum *e,
 	return e->values[item];
 }
 
-/**
- * snd_soc_kcontrol_component() - Returns the component that registered the
- *  control
- * @kcontrol: The control for which to get the component
- *
- * Note: This function will work correctly if the control has been registered
- * for a component. With snd_soc_add_codec_controls() or via table based
- * setup for either a CODEC or component driver. Otherwise the behavior is
- * undefined.
- */
-static inline struct snd_soc_component *snd_soc_kcontrol_component(
-	struct snd_kcontrol *kcontrol)
-{
-	return snd_kcontrol_chip(kcontrol);
-}
-
 int snd_soc_util_init(void);
 void snd_soc_util_exit(void);
 
-- 
cgit v1.2.3


From a703a4c2a3280835003d4d0eb8845bac0f1a6ef1 Mon Sep 17 00:00:00 2001
From: Meenakshi Aggarwal <meenakshi.aggarwal@nxp.com>
Date: Mon, 6 Oct 2025 09:17:52 +0200
Subject: KEYS: trusted: caam based protected key

- CAAM supports two types of protected keys:
  -- Plain key encrypted with ECB
  -- Plain key encrypted with CCM
  CCM is the default encryption for protected keys, as it is more robust.

- Generate protected key blob and add it to trusted key payload.
  This is done as part of the sealing operation, which is triggered
  when either of the following two operations is requested:
  -- new key generation
  -- load key

Signed-off-by: Pankaj Gupta <pankaj.gupta@nxp.com>
Signed-off-by: Meenakshi Aggarwal <meenakshi.aggarwal@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/blob_gen.c            |  86 +++++++++++++++++++-----
 drivers/crypto/caam/desc.h                |   9 ++-
 include/soc/fsl/caam-blob.h               |  26 +++++++
 security/keys/trusted-keys/trusted_caam.c | 108 ++++++++++++++++++++++++++++++
 4 files changed, 212 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/caam/blob_gen.c b/drivers/crypto/caam/blob_gen.c
index 079a22cc9f02..c18dbac56493 100644
--- a/drivers/crypto/caam/blob_gen.c
+++ b/drivers/crypto/caam/blob_gen.c
@@ -2,13 +2,14 @@
 /*
  * Copyright (C) 2015 Pengutronix, Steffen Trumtrar <kernel@pengutronix.de>
  * Copyright (C) 2021 Pengutronix, Ahmad Fatoum <kernel@pengutronix.de>
- * Copyright 2024 NXP
+ * Copyright 2024-2025 NXP
  */
 
 #define pr_fmt(fmt) "caam blob_gen: " fmt
 
 #include <linux/bitfield.h>
 #include <linux/device.h>
+#include <keys/trusted-type.h>
 #include <soc/fsl/caam-blob.h>
 
 #include "compat.h"
@@ -60,18 +61,27 @@ static void caam_blob_job_done(struct device *dev, u32 *desc, u32 err, void *con
 	complete(&res->completion);
 }
 
+static u32 check_caam_state(struct device *jrdev)
+{
+	const struct caam_drv_private *ctrlpriv;
+
+	ctrlpriv = dev_get_drvdata(jrdev->parent);
+	return FIELD_GET(CSTA_MOO, rd_reg32(&ctrlpriv->jr[0]->perfmon.status));
+}
+
 int caam_process_blob(struct caam_blob_priv *priv,
 		      struct caam_blob_info *info, bool encap)
 {
-	const struct caam_drv_private *ctrlpriv;
 	struct caam_blob_job_result testres;
 	struct device *jrdev = &priv->jrdev;
 	dma_addr_t dma_in, dma_out;
 	int op = OP_PCLID_BLOB;
+	int hwbk_caam_ovhd = 0;
 	size_t output_len;
 	u32 *desc;
 	u32 moo;
 	int ret;
+	int len;
 
 	if (info->key_mod_len > CAAM_BLOB_KEYMOD_LENGTH)
 		return -EINVAL;
@@ -82,14 +92,29 @@ int caam_process_blob(struct caam_blob_priv *priv,
 	} else {
 		op |= OP_TYPE_DECAP_PROTOCOL;
 		output_len = info->input_len - CAAM_BLOB_OVERHEAD;
+		info->output_len = output_len;
+	}
+
+	if (encap && info->pkey_info.is_pkey) {
+		op |= OP_PCL_BLOB_BLACK;
+		if (info->pkey_info.key_enc_algo == CAAM_ENC_ALGO_CCM) {
+			op |= OP_PCL_BLOB_EKT;
+			hwbk_caam_ovhd = CAAM_CCM_OVERHEAD;
+		}
+		if ((info->input_len + hwbk_caam_ovhd) > MAX_KEY_SIZE)
+			return -EINVAL;
+
+		len = info->input_len + hwbk_caam_ovhd;
+	} else {
+		len = info->input_len;
 	}
 
 	desc = kzalloc(CAAM_BLOB_DESC_BYTES_MAX, GFP_KERNEL);
 	if (!desc)
 		return -ENOMEM;
 
-	dma_in = dma_map_single(jrdev, info->input, info->input_len,
-				DMA_TO_DEVICE);
+	dma_in = dma_map_single(jrdev, info->input, len,
+				encap ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, dma_in)) {
 		dev_err(jrdev, "unable to map input DMA buffer\n");
 		ret = -ENOMEM;
@@ -104,8 +129,7 @@ int caam_process_blob(struct caam_blob_priv *priv,
 		goto out_unmap_in;
 	}
 
-	ctrlpriv = dev_get_drvdata(jrdev->parent);
-	moo = FIELD_GET(CSTA_MOO, rd_reg32(&ctrlpriv->jr[0]->perfmon.status));
+	moo = check_caam_state(jrdev);
 	if (moo != CSTA_MOO_SECURE && moo != CSTA_MOO_TRUSTED)
 		dev_warn(jrdev,
 			 "using insecure test key, enable HAB to use unique device key!\n");
@@ -117,18 +141,48 @@ int caam_process_blob(struct caam_blob_priv *priv,
 	 * Class 1 Context DWords 0+1+2+3. The random BK is stored in the
 	 * Class 1 Key Register. Operation Mode is set to AES-CCM.
 	 */
-
 	init_job_desc(desc, 0);
+
+	if (encap && info->pkey_info.is_pkey) {
+		/*!1. key command used to load class 1 key register
+		 *    from input plain key.
+		 */
+		append_key(desc, dma_in, info->input_len,
+				CLASS_1 | KEY_DEST_CLASS_REG);
+		/*!2. Fifostore to store protected key from class 1 key register. */
+		if (info->pkey_info.key_enc_algo == CAAM_ENC_ALGO_CCM) {
+			append_fifo_store(desc, dma_in, info->input_len,
+					  LDST_CLASS_1_CCB |
+					  FIFOST_TYPE_KEY_CCM_JKEK);
+		} else {
+			append_fifo_store(desc, dma_in, info->input_len,
+					  LDST_CLASS_1_CCB |
+					  FIFOST_TYPE_KEY_KEK);
+		}
+		/*
+		 * JUMP_OFFSET specifies the offset of the JUMP target from
+		 * the JUMP command's address in the descriptor buffer.
+		 */
+		append_jump(desc, JUMP_COND_NOP | BIT(0) << JUMP_OFFSET_SHIFT);
+	}
+
+	/*!3. Load class 2 key with key modifier. */
 	append_key_as_imm(desc, info->key_mod, info->key_mod_len,
-			  info->key_mod_len, CLASS_2 | KEY_DEST_CLASS_REG);
-	append_seq_in_ptr_intlen(desc, dma_in, info->input_len, 0);
-	append_seq_out_ptr_intlen(desc, dma_out, output_len, 0);
+			info->key_mod_len, CLASS_2 | KEY_DEST_CLASS_REG);
+
+	/*!4. SEQ IN PTR Command. */
+	append_seq_in_ptr(desc, dma_in, info->input_len, 0);
+
+	/*!5. SEQ OUT PTR Command. */
+	append_seq_out_ptr(desc, dma_out, output_len, 0);
+
+	/*!6. Blob encapsulation/decapsulation PROTOCOL Command. */
 	append_operation(desc, op);
 
-	print_hex_dump_debug("data@"__stringify(__LINE__)": ",
+	print_hex_dump_debug("data@" __stringify(__LINE__)": ",
 			     DUMP_PREFIX_ADDRESS, 16, 1, info->input,
-			     info->input_len, false);
-	print_hex_dump_debug("jobdesc@"__stringify(__LINE__)": ",
+			     len, false);
+	print_hex_dump_debug("jobdesc@" __stringify(__LINE__)": ",
 			     DUMP_PREFIX_ADDRESS, 16, 1, desc,
 			     desc_bytes(desc), false);
 
@@ -139,7 +193,7 @@ int caam_process_blob(struct caam_blob_priv *priv,
 	if (ret == -EINPROGRESS) {
 		wait_for_completion(&testres.completion);
 		ret = testres.err;
-		print_hex_dump_debug("output@"__stringify(__LINE__)": ",
+		print_hex_dump_debug("output@" __stringify(__LINE__)": ",
 				     DUMP_PREFIX_ADDRESS, 16, 1, info->output,
 				     output_len, false);
 	}
@@ -149,10 +203,10 @@ int caam_process_blob(struct caam_blob_priv *priv,
 
 	dma_unmap_single(jrdev, dma_out, output_len, DMA_FROM_DEVICE);
 out_unmap_in:
-	dma_unmap_single(jrdev, dma_in, info->input_len, DMA_TO_DEVICE);
+	dma_unmap_single(jrdev, dma_in, len,
+			 encap ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE);
 out_free:
 	kfree(desc);
-
 	return ret;
 }
 EXPORT_SYMBOL(caam_process_blob);
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index e13470901586..c28e94fcb8c7 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -4,7 +4,7 @@
  * Definitions to support CAAM descriptor instruction generation
  *
  * Copyright 2008-2011 Freescale Semiconductor, Inc.
- * Copyright 2018 NXP
+ * Copyright 2018, 2025 NXP
  */
 
 #ifndef DESC_H
@@ -162,6 +162,7 @@
  * Enhanced Encryption of Key
  */
 #define KEY_EKT			0x00100000
+#define KEY_EKT_OFFSET		20
 
 /*
  * Encrypted with Trusted Key
@@ -403,6 +404,7 @@
 #define FIFOST_TYPE_PKHA_N	 (0x08 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_PKHA_A	 (0x0c << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_PKHA_B	 (0x0d << FIFOST_TYPE_SHIFT)
+#define FIFOST_TYPE_KEY_CCM_JKEK (0x14 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_AF_SBOX_JKEK (0x20 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_AF_SBOX_TKEK (0x21 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_PKHA_E_JKEK	 (0x22 << FIFOST_TYPE_SHIFT)
@@ -1001,6 +1003,11 @@
 #define OP_PCL_TLS12_AES_256_CBC_SHA384		 0xff63
 #define OP_PCL_TLS12_AES_256_CBC_SHA512		 0xff65
 
+/* Blob protocol protinfo bits */
+
+#define OP_PCL_BLOB_BLACK                        0x0004
+#define OP_PCL_BLOB_EKT                          0x0100
+
 /* For DTLS - OP_PCLID_DTLS */
 
 #define OP_PCL_DTLS_AES_128_CBC_SHA		 0x002f
diff --git a/include/soc/fsl/caam-blob.h b/include/soc/fsl/caam-blob.h
index 937cac52f36d..922f7ec3e231 100644
--- a/include/soc/fsl/caam-blob.h
+++ b/include/soc/fsl/caam-blob.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2020 Pengutronix, Ahmad Fatoum <kernel@pengutronix.de>
+ * Copyright 2024-2025 NXP
  */
 
 #ifndef __CAAM_BLOB_GEN
@@ -12,11 +13,34 @@
 #define CAAM_BLOB_KEYMOD_LENGTH		16
 #define CAAM_BLOB_OVERHEAD		(32 + 16)
 #define CAAM_BLOB_MAX_LEN		4096
+#define CAAM_ENC_ALGO_CCM		0x1
+#define CAAM_ENC_ALGO_ECB		0x2
+#define CAAM_NONCE_SIZE			6
+#define CAAM_ICV_SIZE			6
+#define CAAM_CCM_OVERHEAD		(CAAM_NONCE_SIZE + CAAM_ICV_SIZE)
 
 struct caam_blob_priv;
 
+/**
+ * struct caam_pkey_info - information for CAAM protected key
+ * @is_pkey:		flag indicating whether the key is a protected key.
+ * @key_enc_algo:	identifies the key encryption algorithm, CCM or ECB.
+ * @plain_key_sz:	size of plain key.
+ * @key_buf:		contains key data
+ */
+struct caam_pkey_info {
+	u8  is_pkey;
+	u8  key_enc_algo;
+	u16 plain_key_sz;
+	u8 key_buf[];
+} __packed;
+
+/* sizeof struct caam_pkey_info */
+#define CAAM_PKEY_HEADER		4
+
 /**
  * struct caam_blob_info - information for CAAM blobbing
+ * @pkey_info:	 protected key information (embedded struct caam_pkey_info)
  * @input:       pointer to input buffer (must be DMAable)
  * @input_len:   length of @input buffer in bytes.
  * @output:      pointer to output buffer (must be DMAable)
@@ -26,6 +50,8 @@ struct caam_blob_priv;
  *	         May not exceed %CAAM_BLOB_KEYMOD_LENGTH
  */
 struct caam_blob_info {
+	struct caam_pkey_info pkey_info;
+
 	void *input;
 	size_t input_len;
 
diff --git a/security/keys/trusted-keys/trusted_caam.c b/security/keys/trusted-keys/trusted_caam.c
index e3415c520c0a..090099d1b04d 100644
--- a/security/keys/trusted-keys/trusted_caam.c
+++ b/security/keys/trusted-keys/trusted_caam.c
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2021 Pengutronix, Ahmad Fatoum <kernel@pengutronix.de>
+ * Copyright 2025 NXP
  */
 
 #include <keys/trusted_caam.h>
 #include <keys/trusted-type.h>
 #include <linux/build_bug.h>
 #include <linux/key-type.h>
+#include <linux/parser.h>
 #include <soc/fsl/caam-blob.h>
 
 static struct caam_blob_priv *blobifier;
@@ -16,6 +18,77 @@ static struct caam_blob_priv *blobifier;
 static_assert(MAX_KEY_SIZE + CAAM_BLOB_OVERHEAD <= CAAM_BLOB_MAX_LEN);
 static_assert(MAX_BLOB_SIZE <= CAAM_BLOB_MAX_LEN);
 
+enum {
+	opt_err,
+	opt_key_enc_algo,
+};
+
+static const match_table_t key_tokens = {
+	{opt_key_enc_algo, "key_enc_algo=%s"},
+	{opt_err, NULL}
+};
+
+#ifdef CAAM_DEBUG
+static inline void dump_options(struct caam_pkey_info pkey_info)
+{
+	pr_info("key encryption algo %d\n", pkey_info.key_enc_algo);
+}
+#else
+static inline void dump_options(struct caam_pkey_info pkey_info)
+{
+}
+#endif
+
+/* Parse the space/tab separated options in @c; returns 0 or -EINVAL. */
+static int get_pkey_options(char *c, struct caam_pkey_info *pkey_info)
+{
+	substring_t args[MAX_OPT_ARGS];
+	unsigned long token_mask = 0;
+	u8 key_enc_algo;	/* same width as pkey_info->key_enc_algo */
+	char *p;
+	int token;
+	int res;
+
+	if (!c)
+		return 0;
+
+	while ((p = strsep(&c, " \t"))) {
+		if (*p == '\0' || *p == ' ' || *p == '\t')
+			continue;
+		token = match_token(p, key_tokens, args);
+		if (test_and_set_bit(token, &token_mask))
+			return -EINVAL;
+
+		switch (token) {
+		case opt_key_enc_algo:
+			res = kstrtou8(args[0].from, 16, &key_enc_algo);
+			if (res < 0)
+				return -EINVAL;
+			pkey_info->key_enc_algo = key_enc_algo;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static bool is_key_pkey(char **datablob)
+{
+	char *c = NULL;
+
+	do {
+		/* Second argument onwards,
+		 * determine if tied to HW
+		 */
+		c = strsep(datablob, " \t");
+		if (c && (strcmp(c, "pk") == 0))
+			return true;
+	} while (c);
+
+	return false;
+}
+
 static int trusted_caam_seal(struct trusted_key_payload *p, char *datablob)
 {
 	int ret;
@@ -25,11 +98,30 @@ static int trusted_caam_seal(struct trusted_key_payload *p, char *datablob)
 		.key_mod = KEYMOD, .key_mod_len = sizeof(KEYMOD) - 1,
 	};
 
+	/*
+	 * A "pk" option marks the key as protected: parse the options that
+	 * follow it and fail the seal if they are invalid.
+	 */
+	if (is_key_pkey(&datablob)) {
+		info.pkey_info.plain_key_sz = p->key_len;
+		info.pkey_info.is_pkey = 1;
+		ret = get_pkey_options(datablob, &info.pkey_info);
+		if (ret < 0)
+			return ret;
+		dump_options(info.pkey_info);
+	}
+
 	ret = caam_encap_blob(blobifier, &info);
 	if (ret)
 		return ret;
 
 	p->blob_len = info.output_len;
+	if (info.pkey_info.is_pkey) {
+		p->key_len = p->blob_len + sizeof(struct caam_pkey_info);
+		memcpy(p->key, &info.pkey_info, sizeof(struct caam_pkey_info));
+		memcpy(p->key + sizeof(struct caam_pkey_info), p->blob, p->blob_len);
+	}
+
 	return 0;
 }
 
@@ -42,11 +134,27 @@ static int trusted_caam_unseal(struct trusted_key_payload *p, char *datablob)
 		.key_mod = KEYMOD,  .key_mod_len = sizeof(KEYMOD) - 1,
 	};
 
+	if (is_key_pkey(&datablob)) {
+		info.pkey_info.plain_key_sz = p->blob_len - CAAM_BLOB_OVERHEAD;
+		info.pkey_info.is_pkey = 1;
+		ret = get_pkey_options(datablob, &info.pkey_info);
+		if (ret < 0)
+			return ret;	/* invalid options must not report success */
+		dump_options(info.pkey_info);
+
+		p->key_len = p->blob_len + sizeof(struct caam_pkey_info);
+		memcpy(p->key, &info.pkey_info, sizeof(struct caam_pkey_info));
+		memcpy(p->key + sizeof(struct caam_pkey_info), p->blob, p->blob_len);
+
+		return 0;
+	}
+
 	ret = caam_decap_blob(blobifier, &info);
 	if (ret)
 		return ret;
 
 	p->key_len = info.output_len;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From aa653654ee67f9cbbebb7d4c18f360ad4fef3180 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 11 Oct 2025 09:48:55 +0800
Subject: rhashtable: use likely for rhashtable lookup

Sometimes an rhashtable_lookup() is expected to find its entry.
Therefore, we can use likely() for such cases.

The following new functions are introduced, which use likely() during
the lookup:

 rhashtable_lookup_likely
 rhltable_lookup_likely

A micro-benchmark was run for these new functions: looking up an existing
entry 100000000 times in a loop, rhashtable_lookup_likely() gets a ~30%
speedup.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/rhashtable.h | 70 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 05a221ce79a6..08e664b21f5a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl,
 	local_irq_restore(flags);
 }
 
-static inline struct rhash_head *__rht_ptr(
-	struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+enum rht_lookup_freq {
+	RHT_LOOKUP_NORMAL,
+	RHT_LOOKUP_LIKELY,
+};
+
+static __always_inline struct rhash_head *__rht_ptr(
+	struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt,
+	const enum rht_lookup_freq freq)
 {
-	return (struct rhash_head *)
-		((unsigned long)p & ~BIT(0) ?:
-		 (unsigned long)RHT_NULLS_MARKER(bkt));
+	unsigned long p_val = (unsigned long)p & ~BIT(0);
+
+	BUILD_BUG_ON(!__builtin_constant_p(freq));
+
+	if (freq == RHT_LOOKUP_LIKELY)
+		return (struct rhash_head *)
+			(likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt));
+	else
+		return (struct rhash_head *)
+			(p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt));
 }
 
 /*
@@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr(
  *   rht_ptr_exclusive() dereferences in a context where exclusive
  *            access is guaranteed, such as when destroying the table.
  */
+static __always_inline struct rhash_head *__rht_ptr_rcu(
+	struct rhash_lock_head __rcu *const *bkt,
+	const enum rht_lookup_freq freq)
+{
+	return __rht_ptr(rcu_dereference_all(*bkt), bkt, freq);
+}
+
 static inline struct rhash_head *rht_ptr_rcu(
 	struct rhash_lock_head __rcu *const *bkt)
 {
-	return __rht_ptr(rcu_dereference_all(*bkt), bkt);
+	return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL);
 }
 
 static inline struct rhash_head *rht_ptr(
@@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr(
 	struct bucket_table *tbl,
 	unsigned int hash)
 {
-	return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+	return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt,
+			 RHT_LOOKUP_NORMAL);
 }
 
 static inline struct rhash_head *rht_ptr_exclusive(
 	struct rhash_lock_head __rcu *const *bkt)
 {
-	return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt);
+	return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt,
+			 RHT_LOOKUP_NORMAL);
 }
 
 static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
@@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
 /* Internal function, do not use. */
 static __always_inline struct rhash_head *__rhashtable_lookup(
 	struct rhashtable *ht, const void *key,
-	const struct rhashtable_params params)
+	const struct rhashtable_params params,
+	const enum rht_lookup_freq freq)
 {
 	struct rhashtable_compare_arg arg = {
 		.ht = ht,
@@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup(
 	struct rhash_head *he;
 	unsigned int hash;
 
+	BUILD_BUG_ON(!__builtin_constant_p(freq));
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	bkt = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+		rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
@@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup(
 	struct rhashtable *ht, const void *key,
 	const struct rhashtable_params params)
 {
-	struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+	struct rhash_head *he = __rhashtable_lookup(ht, key, params,
+						    RHT_LOOKUP_NORMAL);
 
 	return he ? rht_obj(ht, he) : NULL;
 }
 
+static __always_inline void *rhashtable_lookup_likely(
+	struct rhashtable *ht, const void *key,
+	const struct rhashtable_params params)
+{
+	struct rhash_head *he = __rhashtable_lookup(ht, key, params,
+						    RHT_LOOKUP_LIKELY);
+
+	return likely(he) ? rht_obj(ht, he) : NULL;
+}
+
 /**
  * rhashtable_lookup_fast - search hash table, without RCU read lock
  * @ht:		hash table
@@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup(
 	struct rhltable *hlt, const void *key,
 	const struct rhashtable_params params)
 {
-	struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+	struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params,
+						    RHT_LOOKUP_NORMAL);
 
 	return he ? container_of(he, struct rhlist_head, rhead) : NULL;
 }
 
+static __always_inline struct rhlist_head *rhltable_lookup_likely(
+	struct rhltable *hlt, const void *key,
+	const struct rhashtable_params params)
+{
+	struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params,
+						    RHT_LOOKUP_LIKELY);
+
+	return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
 /* Internal function, please use rhashtable_insert_fast() instead. This
  * function returns the existing element already in hashes if there is a clash,
  * otherwise it returns an error via ERR_PTR().
-- 
cgit v1.2.3


From 83c4e3c39b2b55afe56ed0d14b93b5f219350c81 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Fri, 10 Oct 2025 12:46:31 +0000
Subject: dt-bindings: firmware: google,gs101-acpm-ipc: add ACPM clocks

The firmware exposes clocks that can be controlled via the
Alive Clock and Power Manager (ACPM) interface.

Make the ACPM node a clock provider by adding the mandatory
"#clock-cells" property, which allows devices to reference its
clock outputs.

Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Peter Griffin <peter.griffin@linaro.org>
Tested-by: Peter Griffin <peter.griffin@linaro.org> # on gs101-oriole
Link: https://patch.msgid.link/20251010-acpm-clk-v6-1-321ee8826fd4@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../bindings/firmware/google,gs101-acpm-ipc.yaml   | 11 +++++++++
 include/dt-bindings/clock/google,gs101-acpm.h      | 26 ++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 include/dt-bindings/clock/google,gs101-acpm.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/firmware/google,gs101-acpm-ipc.yaml b/Documentation/devicetree/bindings/firmware/google,gs101-acpm-ipc.yaml
index 9785aac3b5f3..d3bca6088d12 100644
--- a/Documentation/devicetree/bindings/firmware/google,gs101-acpm-ipc.yaml
+++ b/Documentation/devicetree/bindings/firmware/google,gs101-acpm-ipc.yaml
@@ -24,6 +24,15 @@ properties:
   compatible:
     const: google,gs101-acpm-ipc
 
+  "#clock-cells":
+    const: 1
+    description:
+      Clocks that are variable and index based. These clocks don't provide
+      an entire range of values between the limits but only discrete points
+      within the range. The firmware also manages the voltage scaling
+      appropriately with the clock scaling. The argument is the ID of the
+      clock contained by the firmware messages.
+
   mboxes:
     maxItems: 1
 
@@ -45,6 +54,7 @@ properties:
 
 required:
   - compatible
+  - "#clock-cells"
   - mboxes
   - shmem
 
@@ -56,6 +66,7 @@ examples:
 
     power-management {
         compatible = "google,gs101-acpm-ipc";
+        #clock-cells = <1>;
         mboxes = <&ap2apm_mailbox>;
         shmem = <&apm_sram>;
 
diff --git a/include/dt-bindings/clock/google,gs101-acpm.h b/include/dt-bindings/clock/google,gs101-acpm.h
new file mode 100644
index 000000000000..e2ba89e09fa6
--- /dev/null
+++ b/include/dt-bindings/clock/google,gs101-acpm.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright 2025 Linaro Ltd.
+ *
+ * Device Tree binding constants for Google gs101 ACPM clock controller.
+ */
+
+#ifndef _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H
+#define _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H
+
+#define GS101_CLK_ACPM_DVFS_MIF				0
+#define GS101_CLK_ACPM_DVFS_INT				1
+#define GS101_CLK_ACPM_DVFS_CPUCL0			2
+#define GS101_CLK_ACPM_DVFS_CPUCL1			3
+#define GS101_CLK_ACPM_DVFS_CPUCL2			4
+#define GS101_CLK_ACPM_DVFS_G3D				5
+#define GS101_CLK_ACPM_DVFS_G3DL2			6
+#define GS101_CLK_ACPM_DVFS_TPU				7
+#define GS101_CLK_ACPM_DVFS_INTCAM			8
+#define GS101_CLK_ACPM_DVFS_TNR				9
+#define GS101_CLK_ACPM_DVFS_CAM				10
+#define GS101_CLK_ACPM_DVFS_MFC				11
+#define GS101_CLK_ACPM_DVFS_DISP			12
+#define GS101_CLK_ACPM_DVFS_BO				13
+
+#endif /* _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H */
-- 
cgit v1.2.3


From 84a222d1b369ba83f8947948670f775367e653f1 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Fri, 10 Oct 2025 12:46:32 +0000
Subject: firmware: exynos-acpm: add DVFS protocol

Add ACPM DVFS protocol handler. It constructs DVFS messages that
the APM firmware can understand.

Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Reviewed-by: Peter Griffin <peter.griffin@linaro.org>
Tested-by: Peter Griffin <peter.griffin@linaro.org> # on gs101-oriole
Link: https://patch.msgid.link/20251010-acpm-clk-v6-2-321ee8826fd4@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 drivers/firmware/samsung/Makefile                  |  4 +-
 drivers/firmware/samsung/exynos-acpm-dvfs.c        | 80 ++++++++++++++++++++++
 drivers/firmware/samsung/exynos-acpm-dvfs.h        | 21 ++++++
 drivers/firmware/samsung/exynos-acpm.c             |  5 ++
 .../linux/firmware/samsung/exynos-acpm-protocol.h  | 10 +++
 5 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.c
 create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.h

(limited to 'include')

diff --git a/drivers/firmware/samsung/Makefile b/drivers/firmware/samsung/Makefile
index 7b4c9f6f34f5..80d4f89b33a9 100644
--- a/drivers/firmware/samsung/Makefile
+++ b/drivers/firmware/samsung/Makefile
@@ -1,4 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-acpm-protocol-objs			:= exynos-acpm.o exynos-acpm-pmic.o
+acpm-protocol-objs			:= exynos-acpm.o
+acpm-protocol-objs			+= exynos-acpm-pmic.o
+acpm-protocol-objs			+= exynos-acpm-dvfs.o
 obj-$(CONFIG_EXYNOS_ACPM_PROTOCOL)	+= acpm-protocol.o
diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.c b/drivers/firmware/samsung/exynos-acpm-dvfs.c
new file mode 100644
index 000000000000..1c5b2b143bcc
--- /dev/null
+++ b/drivers/firmware/samsung/exynos-acpm-dvfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Samsung Electronics Co., Ltd.
+ * Copyright 2020 Google LLC.
+ * Copyright 2025 Linaro Ltd.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/firmware/samsung/exynos-acpm-protocol.h>
+#include <linux/ktime.h>
+#include <linux/types.h>
+#include <linux/units.h>
+
+#include "exynos-acpm.h"
+#include "exynos-acpm-dvfs.h"
+
+#define ACPM_DVFS_ID			GENMASK(11, 0)
+#define ACPM_DVFS_REQ_TYPE		GENMASK(15, 0)
+
+#define ACPM_DVFS_FREQ_REQ		0
+#define ACPM_DVFS_FREQ_GET		1
+
+static void acpm_dvfs_set_xfer(struct acpm_xfer *xfer, u32 *cmd, size_t cmdlen,
+			       unsigned int acpm_chan_id, bool response)
+{
+	xfer->acpm_chan_id = acpm_chan_id;
+	xfer->txd = cmd;
+	xfer->txlen = cmdlen;
+
+	if (response) {
+		xfer->rxd = cmd;
+		xfer->rxlen = cmdlen;
+	}
+}
+
+static void acpm_dvfs_init_set_rate_cmd(u32 cmd[4], unsigned int clk_id,
+					unsigned long rate)
+{
+	cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id);
+	cmd[1] = rate / HZ_PER_KHZ;
+	cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_REQ);
+	cmd[3] = ktime_to_ms(ktime_get());
+}
+
+int acpm_dvfs_set_rate(const struct acpm_handle *handle,
+		       unsigned int acpm_chan_id, unsigned int clk_id,
+		       unsigned long rate)
+{
+	struct acpm_xfer xfer = {0};
+	u32 cmd[4];
+
+	acpm_dvfs_init_set_rate_cmd(cmd, clk_id, rate);
+	acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, false);
+
+	return acpm_do_xfer(handle, &xfer);
+}
+
+static void acpm_dvfs_init_get_rate_cmd(u32 cmd[4], unsigned int clk_id)
+{
+	cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id);
+	cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_GET);
+	cmd[3] = ktime_to_ms(ktime_get());
+}
+
+unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle,
+				 unsigned int acpm_chan_id, unsigned int clk_id)
+{
+	struct acpm_xfer xfer = {0};	/* as in acpm_dvfs_set_rate() */
+	u32 cmd[4] = {0};
+	int ret;
+
+	acpm_dvfs_init_get_rate_cmd(cmd, clk_id);
+	acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, true);
+
+	ret = acpm_do_xfer(handle, &xfer);
+	if (ret)
+		return 0;	/* transfer failed */
+
+	return xfer.rxd[1] * HZ_PER_KHZ;
+}
diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.h b/drivers/firmware/samsung/exynos-acpm-dvfs.h
new file mode 100644
index 000000000000..9f2778e649c9
--- /dev/null
+++ b/drivers/firmware/samsung/exynos-acpm-dvfs.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2020 Samsung Electronics Co., Ltd.
+ * Copyright 2020 Google LLC.
+ * Copyright 2025 Linaro Ltd.
+ */
+#ifndef __EXYNOS_ACPM_DVFS_H__
+#define __EXYNOS_ACPM_DVFS_H__
+
+#include <linux/types.h>
+
+struct acpm_handle;
+
+int acpm_dvfs_set_rate(const struct acpm_handle *handle,
+		       unsigned int acpm_chan_id, unsigned int id,
+		       unsigned long rate);
+unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle,
+				 unsigned int acpm_chan_id,
+				 unsigned int clk_id);
+
+#endif /* __EXYNOS_ACPM_DVFS_H__ */
diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c
index 3a69fe3234c7..9fa0335ccf5d 100644
--- a/drivers/firmware/samsung/exynos-acpm.c
+++ b/drivers/firmware/samsung/exynos-acpm.c
@@ -29,6 +29,7 @@
 #include <linux/types.h>
 
 #include "exynos-acpm.h"
+#include "exynos-acpm-dvfs.h"
 #include "exynos-acpm-pmic.h"
 
 #define ACPM_PROTOCOL_SEQNUM		GENMASK(21, 16)
@@ -590,8 +591,12 @@ static int acpm_channels_init(struct acpm_info *acpm)
  */
 static void acpm_setup_ops(struct acpm_info *acpm)
 {
+	struct acpm_dvfs_ops *dvfs_ops = &acpm->handle.ops.dvfs_ops;
 	struct acpm_pmic_ops *pmic_ops = &acpm->handle.ops.pmic_ops;
 
+	dvfs_ops->set_rate = acpm_dvfs_set_rate;
+	dvfs_ops->get_rate = acpm_dvfs_get_rate;
+
 	pmic_ops->read_reg = acpm_pmic_read_reg;
 	pmic_ops->bulk_read = acpm_pmic_bulk_read;
 	pmic_ops->write_reg = acpm_pmic_write_reg;
diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h
index f628bf1862c2..b1e95435240f 100644
--- a/include/linux/firmware/samsung/exynos-acpm-protocol.h
+++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h
@@ -13,6 +13,15 @@
 struct acpm_handle;
 struct device_node;
 
+struct acpm_dvfs_ops {
+	int (*set_rate)(const struct acpm_handle *handle,
+			unsigned int acpm_chan_id, unsigned int clk_id,
+			unsigned long rate);
+	unsigned long (*get_rate)(const struct acpm_handle *handle,
+				  unsigned int acpm_chan_id,
+				  unsigned int clk_id);
+};
+
 struct acpm_pmic_ops {
 	int (*read_reg)(const struct acpm_handle *handle,
 			unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan,
@@ -32,6 +41,7 @@ struct acpm_pmic_ops {
 };
 
 struct acpm_ops {
+	struct acpm_dvfs_ops dvfs_ops;
 	struct acpm_pmic_ops pmic_ops;
 };
 
-- 
cgit v1.2.3


From 7f3779a3ac3e474d043f0a2b77dd6e6bb020c577 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Wed, 27 Aug 2025 17:52:43 +0000
Subject: mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio()

Add a mempolicy parameter to filemap_alloc_folio() to enable NUMA-aware
page cache allocations. This will be used by upcoming changes to
support NUMA policies in guest-memfd, where guest memory needs to be
allocated according to the NUMA policy specified by the VMM.

All existing users pass NULL maintaining current behavior.

Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Tested-by: Ashish Kalra <ashish.kalra@amd.com>
Link: https://lore.kernel.org/r/20250827175247.83322-4-shivankg@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 fs/btrfs/compression.c  |  4 ++--
 fs/btrfs/verity.c       |  2 +-
 fs/erofs/zdata.c        |  2 +-
 fs/f2fs/compress.c      |  2 +-
 include/linux/pagemap.h |  8 +++++---
 mm/filemap.c            | 14 +++++++++-----
 mm/readahead.c          |  2 +-
 7 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bacad18357b3..d927ae32e7d0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -491,8 +491,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			continue;
 		}
 
-		folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
-								   ~__GFP_FS), 0);
+		folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS),
+					    0, NULL);
 		if (!folio)
 			break;
 
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 46bd8ca58670..d4523d5debcd 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -742,7 +742,7 @@ again:
 	}
 
 	folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
-				    0);
+				    0, NULL);
 	if (!folio)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index bc80cfe482f7..b7369fb4fbe9 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
 			 * Allocate a managed folio for cached I/O, or it may be
 			 * then filled with a file-backed folio for in-place I/O
 			 */
-			newfolio = filemap_alloc_folio(gfp, 0);
+			newfolio = filemap_alloc_folio(gfp, 0, NULL);
 			if (!newfolio)
 				continue;
 			newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 6ad8d3bc6df7..a65e8cd388bc 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
 		return;
 	}
 
-	cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0);
+	cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL);
 	if (!cfolio)
 		return;
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..f1d0610210f7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page)
 }
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *policy);
 #else
-static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *policy)
 {
 	return folio_alloc_noprof(gfp, order);
 }
@@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o
 
 static inline struct page *__page_cache_alloc(gfp_t gfp)
 {
-	return &filemap_alloc_folio(gfp, 0)->page;
+	return &filemap_alloc_folio(gfp, 0, NULL)->page;
 }
 
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..7b42fd6dcc9a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1002,11 +1002,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *policy)
 {
 	int n;
 	struct folio *folio;
 
+	if (policy)
+		return folio_alloc_mpol_noprof(gfp, order, policy,
+				NO_INTERLEAVE_INDEX, numa_node_id());
+
 	if (cpuset_do_page_mem_spread()) {
 		unsigned int cpuset_mems_cookie;
 		do {
@@ -2009,7 +2014,7 @@ no_page:
 			err = -ENOMEM;
 			if (order > min_order)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
-			folio = filemap_alloc_folio(alloc_gfp, order);
+			folio = filemap_alloc_folio(alloc_gfp, order, NULL);
 			if (!folio)
 				continue;
 
@@ -2551,7 +2556,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
 	if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
 		return -EAGAIN;
 
-	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
+	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
 	if (!folio)
 		return -ENOMEM;
 	if (iocb->ki_flags & IOCB_DONTCACHE)
@@ -3983,8 +3988,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
 repeat:
 	folio = filemap_get_folio(mapping, index);
 	if (IS_ERR(folio)) {
-		folio = filemap_alloc_folio(gfp,
-					    mapping_min_folio_order(mapping));
+		folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
 		if (!folio)
 			return ERR_PTR(-ENOMEM);
 		index = mapping_align_index(mapping, index);
diff --git a/mm/readahead.c b/mm/readahead.c
index 3a4b5d58eeb6..b415c9969176 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
 {
 	struct folio *folio;
 
-	folio = filemap_alloc_folio(gfp_mask, order);
+	folio = filemap_alloc_folio(gfp_mask, order, NULL);
 	if (folio && ractl->dropbehind)
 		__folio_set_dropbehind(folio);
 
-- 
cgit v1.2.3


From 16a542e22339cd5e73e56a956bbd335c7bd7c08c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Wed, 27 Aug 2025 17:52:44 +0000
Subject: mm/filemap: Extend __filemap_get_folio() to support NUMA memory
 policies

Extend __filemap_get_folio() to support NUMA memory policies by
renaming the implementation to __filemap_get_folio_mpol() and adding
a mempolicy parameter. The original function becomes a static inline
wrapper that passes NULL for the mempolicy.

This infrastructure will enable future support for NUMA-aware page cache
allocations in guest_memfd memory backend KVM guests.

Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Tested-by: Ashish Kalra <ashish.kalra@amd.com>
Link: https://lore.kernel.org/r/20250827175247.83322-5-shivankg@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/pagemap.h | 10 ++++++++--
 mm/filemap.c            | 11 ++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index f1d0610210f7..a17fabbc0269 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -755,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size)
 }
 
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
-struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		fgf_t fgp_flags, gfp_t gfp);
+struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
+		pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 		fgf_t fgp_flags, gfp_t gfp);
 
+static inline struct folio *__filemap_get_folio(struct address_space *mapping,
+		pgoff_t index, fgf_t fgf_flags, gfp_t gfp)
+{
+	return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL);
+}
+
 /**
  * write_begin_get_folio - Get folio for write_begin with flags.
  * @iocb: The kiocb passed from write_begin (may be NULL).
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b42fd6dcc9a..91c4537283d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1928,11 +1928,12 @@ out:
 }
 
 /**
- * __filemap_get_folio - Find and get a reference to a folio.
+ * __filemap_get_folio_mpol - Find and get a reference to a folio.
  * @mapping: The address_space to search.
  * @index: The page index.
  * @fgp_flags: %FGP flags modify how the folio is returned.
  * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @policy: NUMA memory allocation policy to follow.
  *
  * Looks up the page cache entry at @mapping & @index.
  *
@@ -1943,8 +1944,8 @@ out:
  *
  * Return: The found folio or an ERR_PTR() otherwise.
  */
-struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		fgf_t fgp_flags, gfp_t gfp)
+struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
+		pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy)
 {
 	struct folio *folio;
 
@@ -2014,7 +2015,7 @@ no_page:
 			err = -ENOMEM;
 			if (order > min_order)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
-			folio = filemap_alloc_folio(alloc_gfp, order, NULL);
+			folio = filemap_alloc_folio(alloc_gfp, order, policy);
 			if (!folio)
 				continue;
 
@@ -2061,7 +2062,7 @@ no_page:
 		folio_clear_dropbehind(folio);
 	return folio;
 }
-EXPORT_SYMBOL(__filemap_get_folio);
+EXPORT_SYMBOL(__filemap_get_folio_mpol);
 
 static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
 		xa_mark_t mark)
-- 
cgit v1.2.3


From a63ca4236e6799cf4343f9aec9d92afdfa582446 Mon Sep 17 00:00:00 2001
From: Ackerley Tng <ackerleytng@google.com>
Date: Thu, 16 Oct 2025 10:28:44 -0700
Subject: KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes

guest_memfd's inode represents memory the guest_memfd is
providing. guest_memfd's file represents a struct kvm's view of that
memory.

Using a custom inode allows customization of the inode teardown
process via callbacks. For example, ->evict_inode() allows
customization of the truncation process on file close, and
->destroy_inode() and ->free_inode() allow customization of the inode
freeing process.

Customizing the truncation process allows flexibility in management of
guest_memfd memory and customization of the inode freeing process
allows proper cleanup of memory metadata stored on the inode.

Memory metadata is more appropriately stored on the inode (as opposed
to the file), since the metadata is for the memory and is not unique
to a specific binding and struct kvm.

Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Tested-by: Ashish Kalra <ashish.kalra@amd.com>
[sean: drop helpers, open code logic in __kvm_gmem_create()]
Link: https://lore.kernel.org/r/20251016172853.52451-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/uapi/linux/magic.h |  1 +
 virt/kvm/guest_memfd.c     | 82 ++++++++++++++++++++++++++++++++++++++--------
 virt/kvm/kvm_main.c        |  7 +++-
 virt/kvm/kvm_mm.h          |  9 ++---
 4 files changed, 80 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index bb575f3ab45e..638ca21b7a90 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -103,5 +103,6 @@
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
+#define GUEST_MEMFD_MAGIC	0x474d454d	/* "GMEM" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5cce20ff418d..ce04fc85e631 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,12 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
+#include <linux/fs.h>
 #include <linux/kvm_host.h>
+#include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
-#include <linux/anon_inodes.h>
 
 #include "kvm_mm.h"
 
+static struct vfsmount *kvm_gmem_mnt;
+
 /*
  * A guest_memfd instance can be associated multiple VMs, each with its own
  * "view" of the underlying physical memory.
@@ -424,11 +428,6 @@ static struct file_operations kvm_gmem_fops = {
 	.fallocate	= kvm_gmem_fallocate,
 };
 
-void kvm_gmem_init(struct module *module)
-{
-	kvm_gmem_fops.owner = module;
-}
-
 static int kvm_gmem_migrate_folio(struct address_space *mapping,
 				  struct folio *dst, struct folio *src,
 				  enum migrate_mode mode)
@@ -500,7 +499,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
 
 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 {
-	const char *anon_name = "[kvm-gmem]";
+	static const char *name = "[kvm-gmem]";
 	struct gmem_file *f;
 	struct inode *inode;
 	struct file *file;
@@ -516,16 +515,17 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 		goto err_fd;
 	}
 
-	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL);
-	if (IS_ERR(file)) {
-		err = PTR_ERR(file);
+	/* __fput() will take care of fops_put(). */
+	if (!fops_get(&kvm_gmem_fops)) {
+		err = -ENOENT;
 		goto err_gmem;
 	}
 
-	file->f_flags |= O_LARGEFILE;
-
-	inode = file->f_inode;
-	WARN_ON(file->f_mapping != inode->i_mapping);
+	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto err_fops;
+	}
 
 	inode->i_private = (void *)(unsigned long)flags;
 	inode->i_op = &kvm_gmem_iops;
@@ -537,6 +537,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	/* Unmovable mappings are supposed to be marked unevictable as well. */
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
+	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_inode;
+	}
+
+	file->f_flags |= O_LARGEFILE;
+	file->private_data = f;
+
 	kvm_get_kvm(kvm);
 	f->kvm = kvm;
 	xa_init(&f->bindings);
@@ -545,6 +554,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	fd_install(fd, file);
 	return fd;
 
+err_inode:
+	iput(inode);
+err_fops:
+	fops_put(&kvm_gmem_fops);
 err_gmem:
 	kfree(f);
 err_fd:
@@ -816,3 +829,44 @@ put_folio_and_exit:
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
 #endif
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
+		return -ENOMEM;
+
+	fc->s_iflags |= SB_I_NOEXEC;
+	fc->s_iflags |= SB_I_NODEV;
+
+	return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+	.name		 = "guest_memfd",
+	.init_fs_context = kvm_gmem_init_fs_context,
+	.kill_sb	 = kill_anon_super,
+};
+
+static int kvm_gmem_init_mount(void)
+{
+	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+
+	if (IS_ERR(kvm_gmem_mnt))
+		return PTR_ERR(kvm_gmem_mnt);
+
+	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+	return 0;
+}
+
+int kvm_gmem_init(struct module *module)
+{
+	kvm_gmem_fops.owner = module;
+
+	return kvm_gmem_init_mount();
+}
+
+void kvm_gmem_exit(void)
+{
+	kern_unmount(kvm_gmem_mnt);
+	kvm_gmem_mnt = NULL;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b7a0ae2a7b20..4845e5739436 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6517,7 +6517,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (WARN_ON_ONCE(r))
 		goto err_vfio;
 
-	kvm_gmem_init(module);
+	r = kvm_gmem_init(module);
+	if (r)
+		goto err_gmem;
 
 	r = kvm_init_virtualization();
 	if (r)
@@ -6538,6 +6540,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 err_register:
 	kvm_uninit_virtualization();
 err_virt:
+	kvm_gmem_exit();
+err_gmem:
 	kvm_vfio_ops_exit();
 err_vfio:
 	kvm_async_pf_deinit();
@@ -6569,6 +6573,7 @@ void kvm_exit(void)
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_gmem_exit();
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
 	kvm_irqfd_exit();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 31defb08ccba..9fcc5d5b7f8d 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 #endif /* HAVE_KVM_PFNCACHE */
 
 #ifdef CONFIG_KVM_GUEST_MEMFD
-void kvm_gmem_init(struct module *module);
+int kvm_gmem_init(struct module *module);
+void kvm_gmem_exit(void);
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
 		  unsigned int fd, loff_t offset);
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
 #else
-static inline void kvm_gmem_init(struct module *module)
+static inline int kvm_gmem_init(struct module *module)
 {
-
+	return 0;
 }
-
+static inline void kvm_gmem_exit(void) {};
 static inline int kvm_gmem_bind(struct kvm *kvm,
 					 struct kvm_memory_slot *slot,
 					 unsigned int fd, loff_t offset)
-- 
cgit v1.2.3


From be180c847a6db6646d7bb4740a1d73f6f67d1030 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 19 Oct 2025 20:43:20 -0700
Subject: RDMA/uverbs: fix some kernel-doc warnings

Fix 49 kernel-doc warnings in ib_verbs.h:

- Add struct short description for rdma_stat_desc, rdma_hw_stats.
- Fix kernel-doc format for struct members (use ':' instead of '-') for
  several structs.
- Don't use "/**" kernel-doc notation for struct members in ib_device_ops
  (most members are not documented and most of the kernel-doc was
  not formatted correctly).
- Spell function parameters correctly in ib_dma_map_sgtable_attrs(),
  ib_device_try_get(), rdma_roce_rescan_device().
- Add kernel-doc for the function parameter in
  rdma_flow_label_to_udp_sport().

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251020034320.3011094-1-rdunlap@infradead.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/rdma/ib_verbs.h | 99 +++++++++++++++++++++++++------------------------
 1 file changed, 50 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6139223e92e4..0a85af610b6b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -586,10 +586,10 @@ enum ib_stat_flag {
 };
 
 /**
- * struct rdma_stat_desc
- * @name - The name of the counter
- * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
- * @priv - Driver private information; Core code should not use
+ * struct rdma_stat_desc - description of one rdma stat/counter
+ * @name: The name of the counter
+ * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
+ * @priv: Driver private information; Core code should not use
  */
 struct rdma_stat_desc {
 	const char *name;
@@ -598,24 +598,24 @@ struct rdma_stat_desc {
 };
 
 /**
- * struct rdma_hw_stats
- * @lock - Mutex to protect parallel write access to lifespan and values
+ * struct rdma_hw_stats - collection of hardware stats and their management
+ * @lock: Mutex to protect parallel write access to lifespan and values
  *    of counters, which are 64bits and not guaranteed to be written
  *    atomicaly on 32bits systems.
- * @timestamp - Used by the core code to track when the last update was
- * @lifespan - Used by the core code to determine how old the counters
+ * @timestamp: Used by the core code to track when the last update was
+ * @lifespan: Used by the core code to determine how old the counters
  *   should be before being updated again.  Stored in jiffies, defaults
  *   to 10 milliseconds, drivers can override the default be specifying
  *   their own value during their allocation routine.
- * @descs - Array of pointers to static descriptors used for the counters
+ * @descs: Array of pointers to static descriptors used for the counters
  *   in directory.
- * @is_disabled - A bitmap to indicate each counter is currently disabled
+ * @is_disabled: A bitmap to indicate each counter is currently disabled
  *   or not.
- * @num_counters - How many hardware counters there are.  If name is
+ * @num_counters: How many hardware counters there are.  If name is
  *   shorter than this number, a kernel oops will result.  Driver authors
  *   are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters)
  *   in their code to prevent this.
- * @value - Array of u64 counters that are accessed by the sysfs code and
+ * @value: Array of u64 counters that are accessed by the sysfs code and
  *   filled in by the drivers get_stats routine
  */
 struct rdma_hw_stats {
@@ -2405,7 +2405,7 @@ struct ib_device_ops {
 	int (*modify_port)(struct ib_device *device, u32 port_num,
 			   int port_modify_mask,
 			   struct ib_port_modify *port_modify);
-	/**
+	/*
 	 * The following mandatory functions are used only at device
 	 * registration.  Keep functions such as these at the end of this
 	 * structure to avoid cache line misses when accessing struct ib_device
@@ -2415,7 +2415,7 @@ struct ib_device_ops {
 				  struct ib_port_immutable *immutable);
 	enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
 					       u32 port_num);
-	/**
+	/*
 	 * When calling get_netdev, the HW vendor's driver should return the
 	 * net device of device @device at port @port_num or NULL if such
 	 * a net device doesn't exist. The vendor driver should call dev_hold
@@ -2425,7 +2425,7 @@ struct ib_device_ops {
 	 */
 	struct net_device *(*get_netdev)(struct ib_device *device,
 					 u32 port_num);
-	/**
+	/*
 	 * rdma netdev operation
 	 *
 	 * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params
@@ -2439,14 +2439,14 @@ struct ib_device_ops {
 	int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num,
 				      enum rdma_netdev_t type,
 				      struct rdma_netdev_alloc_params *params);
-	/**
+	/*
 	 * query_gid should be return GID value for @device, when @port_num
 	 * link layer is either IB or iWarp. It is no-op if @port_num port
 	 * is RoCE link layer.
 	 */
 	int (*query_gid)(struct ib_device *device, u32 port_num, int index,
 			 union ib_gid *gid);
-	/**
+	/*
 	 * When calling add_gid, the HW vendor's driver should add the gid
 	 * of device of port at gid index available at @attr. Meta-info of
 	 * that gid (for example, the network device related to this gid) is
@@ -2460,7 +2460,7 @@ struct ib_device_ops {
 	 * roce_gid_table is used.
 	 */
 	int (*add_gid)(const struct ib_gid_attr *attr, void **context);
-	/**
+	/*
 	 * When calling del_gid, the HW vendor's driver should delete the
 	 * gid of device @device at gid index gid_index of port port_num
 	 * available in @attr.
@@ -2475,7 +2475,7 @@ struct ib_device_ops {
 			      struct ib_udata *udata);
 	void (*dealloc_ucontext)(struct ib_ucontext *context);
 	int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
-	/**
+	/*
 	 * This will be called once refcount of an entry in mmap_xa reaches
 	 * zero. The type of the memory that was mapped may differ between
 	 * entries and is opaque to the rdma_user_mmap interface.
@@ -2516,12 +2516,12 @@ struct ib_device_ops {
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
 	int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
-	/**
+	/*
 	 * pre_destroy_cq - Prevent a cq from generating any new work
 	 * completions, but not free any kernel resources
 	 */
 	int (*pre_destroy_cq)(struct ib_cq *cq);
-	/**
+	/*
 	 * post_destroy_cq - Free all kernel resources
 	 */
 	void (*post_destroy_cq)(struct ib_cq *cq);
@@ -2615,7 +2615,7 @@ struct ib_device_ops {
 			    struct scatterlist *meta_sg, int meta_sg_nents,
 			    unsigned int *meta_sg_offset);
 
-	/**
+	/*
 	 * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and
 	 *   fill in the driver initialized data.  The struct is kfree()'ed by
 	 *   the sysfs core when the device is removed.  A lifespan of -1 in the
@@ -2624,7 +2624,7 @@ struct ib_device_ops {
 	struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device);
 	struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device,
 						     u32 port_num);
-	/**
+	/*
 	 * get_hw_stats - Fill in the counter value(s) in the stats struct.
 	 * @index - The index in the value array we wish to have updated, or
 	 *   num_counters if we want all stats updated
@@ -2639,14 +2639,14 @@ struct ib_device_ops {
 	int (*get_hw_stats)(struct ib_device *device,
 			    struct rdma_hw_stats *stats, u32 port, int index);
 
-	/**
+	/*
 	 * modify_hw_stat - Modify the counter configuration
 	 * @enable: true/false when enable/disable a counter
 	 * Return codes - 0 on success or error code otherwise.
 	 */
 	int (*modify_hw_stat)(struct ib_device *device, u32 port,
 			      unsigned int counter_index, bool enable);
-	/**
+	/*
 	 * Allows rdma drivers to add their own restrack attributes.
 	 */
 	int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
@@ -2682,39 +2682,39 @@ struct ib_device_ops {
 			 u8 pdata_len);
 	int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
 	int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
-	/**
+	/*
 	 * counter_bind_qp - Bind a QP to a counter.
 	 * @counter - The counter to be bound. If counter->id is zero then
 	 *   the driver needs to allocate a new counter and set counter->id
 	 */
 	int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp,
 			       u32 port);
-	/**
+	/*
 	 * counter_unbind_qp - Unbind the qp from the dynamically-allocated
 	 *   counter and bind it onto the default one
 	 */
 	int (*counter_unbind_qp)(struct ib_qp *qp, u32 port);
-	/**
+	/*
 	 * counter_dealloc -De-allocate the hw counter
 	 */
 	int (*counter_dealloc)(struct rdma_counter *counter);
-	/**
+	/*
 	 * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
 	 * the driver initialized data.
 	 */
 	struct rdma_hw_stats *(*counter_alloc_stats)(
 		struct rdma_counter *counter);
-	/**
+	/*
 	 * counter_update_stats - Query the stats value of this counter
 	 */
 	int (*counter_update_stats)(struct rdma_counter *counter);
 
-	/**
+	/*
 	 * counter_init - Initialize the driver specific rdma counter struct.
 	 */
 	void (*counter_init)(struct rdma_counter *counter);
 
-	/**
+	/*
 	 * Allows rdma drivers to add their own restrack attributes
 	 * dumped via 'rdma stat' iproute2 command.
 	 */
@@ -2730,25 +2730,25 @@ struct ib_device_ops {
 	 */
 	int (*get_numa_node)(struct ib_device *dev);
 
-	/**
+	/*
 	 * add_sub_dev - Add a sub IB device
 	 */
 	struct ib_device *(*add_sub_dev)(struct ib_device *parent,
 					 enum rdma_nl_dev_type type,
 					 const char *name);
 
-	/**
+	/*
 	 * del_sub_dev - Delete a sub IB device
 	 */
 	void (*del_sub_dev)(struct ib_device *sub_dev);
 
-	/**
+	/*
 	 * ufile_cleanup - Attempt to cleanup ubojects HW resources inside
 	 * the ufile.
 	 */
 	void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile);
 
-	/**
+	/*
 	 * report_port_event - Drivers need to implement this if they have
 	 * some private stuff to handle when link status changes.
 	 */
@@ -3157,8 +3157,8 @@ static inline u32 rdma_start_port(const struct ib_device *device)
 
 /**
  * rdma_for_each_port - Iterate over all valid port numbers of the IB device
- * @device - The struct ib_device * to iterate over
- * @iter - The unsigned int to store the port number
+ * @device: The struct ib_device * to iterate over
+ * @iter: The unsigned int to store the port number
  */
 #define rdma_for_each_port(device, iter)                                       \
 	for (iter = rdma_start_port(device +				       \
@@ -3524,7 +3524,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device,
 /**
  * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value.
  * @device: Device
- * @port_num: Port number
+ * @port: Port number
  * @mtu: enum value of MTU
  *
  * Return the MTU size supported by the port as an integer value. Will return
@@ -3542,7 +3542,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port,
 /**
  * rdma_mtu_from_attr - Return the mtu of the port from the port attribute.
  * @device: Device
- * @port_num: Port number
+ * @port: Port number
  * @attr: port attribute
  *
  * Return the MTU size supported by the port as an integer value.
@@ -3919,7 +3919,7 @@ static inline int ib_destroy_qp(struct ib_qp *qp)
 
 /**
  * ib_open_qp - Obtain a reference to an existing sharable QP.
- * @xrcd - XRC domain
+ * @xrcd: XRC domain
  * @qp_open_attr: Attributes identifying the QP to open.
  *
  * Returns a reference to a sharable QP.
@@ -4273,9 +4273,9 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
 /**
  * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses
  * @dev: The device for which the DMA addresses are to be created
- * @sg: The sg_table object describing the buffer
+ * @sgt: The sg_table object describing the buffer
  * @direction: The direction of the DMA
- * @attrs: Optional DMA attributes for the map operation
+ * @dma_attrs: Optional DMA attributes for the map operation
  */
 static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev,
 					   struct sg_table *sgt,
@@ -4419,8 +4419,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 /**
  * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
  *   R_Key and L_Key.
- * @mr - struct ib_mr pointer to be updated.
- * @newkey - new key to be used.
+ * @mr: struct ib_mr pointer to be updated.
+ * @newkey: new key to be used.
  */
 static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
 {
@@ -4431,7 +4431,7 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
 /**
  * ib_inc_rkey - increments the key portion of the given rkey. Can be used
  * for calculating a new rkey for type 2 memory windows.
- * @rkey - the rkey to increment.
+ * @rkey: the rkey to increment.
  */
 static inline u32 ib_inc_rkey(u32 rkey)
 {
@@ -4525,7 +4525,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 
 /**
  * ib_device_try_get: Hold a registration lock
- * device: The device to lock
+ * @dev: The device to lock
  *
  * A device under an active registration lock cannot become unregistered. It
  * is only possible to obtain a registration lock on a device that is fully
@@ -4832,7 +4832,7 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector)
  * rdma_roce_rescan_device - Rescan all of the network devices in the system
  * and add their gids, as needed, to the relevant RoCE devices.
  *
- * @device:         the rdma device
+ * @ibdev:         the rdma device
  */
 void rdma_roce_rescan_device(struct ib_device *ibdev);
 void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port);
@@ -4885,7 +4885,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
 
 /**
  * ibdev_to_node - return the NUMA node for a given ib_device
- * @dev:	device to get the NUMA node for.
+ * @ibdev:	device to get the NUMA node for.
  */
 static inline int ibdev_to_node(struct ib_device *ibdev)
 {
@@ -4923,6 +4923,7 @@ static inline struct net *rdma_dev_net(struct ib_device *device)
 /**
  * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based
  *                               on the flow_label
+ * @fl: flow_label value
  *
  * This function will convert the 20 bit flow_label input to a valid RoCE v2
  * UDP src port 14 bit value. All RoCE V2 drivers should use this same
-- 
cgit v1.2.3


From 7be20254a743be4f02414b9d56cc3fe5f84e6500 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 23 Sep 2025 04:25:22 -0600
Subject: io_uring: unify task_work cancelation checks

Rather than do per-tw checking, which needs to dip into the task_struct
for checking flags, do it upfront before running task_work. This places
a 'cancel' member in io_tw_token_t, which is assigned before running
task_work for that given ctx.

This is more efficient, as the check is done upfront rather than for
every task_work, and it means that io_should_terminate_tw() can be made
private in io_uring.c rather than needing to be called by various
task_work callbacks.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 27 ++++++++++++++++++++-------
 io_uring/io_uring.h            | 13 -------------
 io_uring/poll.c                |  2 +-
 io_uring/timeout.c             |  2 +-
 io_uring/uring_cmd.c           |  2 +-
 6 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index c2ea6280901d..25ee982eb435 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -474,6 +474,7 @@ struct io_ring_ctx {
  * ONLY core io_uring.c should instantiate this struct.
  */
 struct io_tw_state {
+	bool cancel;
 };
 /* Alias to use in code that doesn't instantiate struct io_tw_state */
 typedef struct io_tw_state io_tw_token_t;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 820ef0527666..c397118da85e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -265,6 +265,20 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 	complete(&ctx->ref_comp);
 }
 
+/*
+ * Terminate the request if any of these conditions is true:
+ *
+ * 1) It's being executed by the original task, but that task is marked
+ *    with PF_EXITING as it's exiting.
+ * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
+ *    our fallback task_work.
+ * 3) The ring has been closed and is going away.
+ */
+static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
+{
+	return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs);
+}
+
 static __cold void io_fallback_req_func(struct work_struct *work)
 {
 	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -275,8 +289,10 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 
 	percpu_ref_get(&ctx->refs);
 	mutex_lock(&ctx->uring_lock);
-	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
+	llist_for_each_entry_safe(req, tmp, node, io_task_work.node) {
+		ts.cancel = io_should_terminate_tw(req->ctx);
 		req->io_task_work.func(req, ts);
+	}
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
@@ -1147,6 +1163,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 			ctx = req->ctx;
 			mutex_lock(&ctx->uring_lock);
 			percpu_ref_get(&ctx->refs);
+			ts.cancel = io_should_terminate_tw(ctx);
 		}
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
@@ -1205,11 +1222,6 @@ struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
 {
 	struct llist_node *node;
 
-	if (unlikely(current->flags & PF_EXITING)) {
-		io_fallback_tw(tctx, true);
-		return NULL;
-	}
-
 	node = llist_del_all(&tctx->task_list);
 	if (node) {
 		node = llist_reverse_order(node);
@@ -1399,6 +1411,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw,
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 again:
+	tw.cancel = io_should_terminate_tw(ctx);
 	min_events -= ret;
 	ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events);
 	if (ctx->retry_llist.first)
@@ -1458,7 +1471,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_tw_lock(ctx, tw);
-	if (unlikely(io_should_terminate_tw(ctx)))
+	if (unlikely(tw.cancel))
 		io_req_defer_failed(req, -EFAULT);
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 46d9141d772a..78777bf1ea4b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -558,19 +558,6 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
 		      ctx->submitter_task == current);
 }
 
-/*
- * Terminate the request if either of these conditions are true:
- *
- * 1) It's being executed by the original task, but that task is marked
- *    with PF_EXITING as it's exiting.
- * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
- *    our fallback task_work.
- */
-static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
-{
-	return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs);
-}
-
 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
 {
 	io_req_set_res(req, res, 0);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index b9681d0f9f13..c403e751841a 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
 {
 	int v;
 
-	if (unlikely(io_should_terminate_tw(req->ctx)))
+	if (unlikely(tw.cancel))
 		return -ECANCELED;
 
 	do {
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 17e3aab0af36..444142ba9d04 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw)
 	int ret;
 
 	if (prev) {
-		if (!io_should_terminate_tw(req->ctx)) {
+		if (!tw.cancel) {
 			struct io_cancel_data cd = {
 				.ctx		= req->ctx,
 				.data		= prev->cqe.user_data,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index d1e3ba62ee8e..1225f8124e4b 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw)
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
 
-	if (io_should_terminate_tw(req->ctx))
+	if (unlikely(tw.cancel))
 		flags |= IO_URING_F_TASK_DEAD;
 
 	/* task_work executor checks the deffered list completion */
-- 
cgit v1.2.3


From c5ebcc80fcf7d2c6ed917371f024d2da5bce9128 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 17 Oct 2025 00:07:27 -0700
Subject: iio: adc: qcom-vadc-common: fix vadc_scale_fn_type kernel-doc

Fix multiple warnings in enum vadc_scale_fn_type by adding a leading
'@' to the kernel-doc descriptions.

Fixed 14 warnings in this one enum, such as:
Warning: include/linux/iio/adc/qcom-vadc-common.h:123 Enum value
 'SCALE_DEFAULT' not described in enum 'vadc_scale_fn_type'
Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value
 'SCALE_THERM_100K_PULLUP' not described in enum 'vadc_scale_fn_type'
Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value
 'SCALE_PMIC_THERM' not described in enum 'vadc_scale_fn_type'

Also prevent the warning on SCALE_HW_CALIB_INVALID by marking it
"private:" so that kernel-doc notation is not needed for it.

This leaves only one warning here, for an enum value whose appropriate
description I don't know:
qcom-vadc-common.h:125: warning: Enum value
 'SCALE_HW_CALIB_PMIC_THERM_PM7' not described in enum 'vadc_scale_fn_type'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/adc/qcom-vadc-common.h | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h
index aa21b032e861..3bf4c49726a7 100644
--- a/include/linux/iio/adc/qcom-vadc-common.h
+++ b/include/linux/iio/adc/qcom-vadc-common.h
@@ -83,27 +83,27 @@ struct vadc_linear_graph {
 /**
  * enum vadc_scale_fn_type - Scaling function to convert ADC code to
  *				physical scaled units for the channel.
- * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV).
- * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC.
+ * @SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV).
+ * @SCALE_THERM_100K_PULLUP: Returns temperature in millidegC.
  *				 Uses a mapping table with 100K pullup.
- * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade.
- * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC.
- * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp
- * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to
+ * @SCALE_PMIC_THERM: Returns result in milli degree's Centigrade.
+ * @SCALE_XOTHERM: Returns XO thermistor voltage in millidegC.
+ * @SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp
+ * @SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to
  *	voltage (uV) with hardware applied offset/slope values to adc code.
- * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using
+ * @SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using
  *	lookup table. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using
+ * @SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using
  *	100k pullup. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using
+ * @SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using
  *	lookup table for PMIC7. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
+ * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
  *	The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
+ * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
  *	The hardware applies offset/slope to adc code. This is for PMIC7.
- * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5
+ * @SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5
  *	charger temperature.
- * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5
+ * @SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5
  *	SMB1390 temperature.
  */
 enum vadc_scale_fn_type {
@@ -120,6 +120,7 @@ enum vadc_scale_fn_type {
 	SCALE_HW_CALIB_PMIC_THERM_PM7,
 	SCALE_HW_CALIB_PM5_CHG_TEMP,
 	SCALE_HW_CALIB_PM5_SMB_TEMP,
+	/* private: */
 	SCALE_HW_CALIB_INVALID,
 };
 
-- 
cgit v1.2.3


From ca82a7ea2299b4586af1f77daee66ee781202320 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Fri, 19 Sep 2025 14:42:50 -0700
Subject: iomap: simplify iomap_iter_advance()

Most callers of iomap_iter_advance() do not need the remaining length
returned. Get rid of the extra iomap_length() call that
iomap_iter_advance() does.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/dax.c               | 30 ++++++++++++------------------
 fs/iomap/buffered-io.c | 18 +++++++++---------
 fs/iomap/direct-io.c   |  6 +++---
 fs/iomap/iter.c        | 14 +++++---------
 fs/iomap/seek.c        |  8 ++++----
 include/linux/iomap.h  |  6 ++----
 6 files changed, 35 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index 89f071ba7b10..c299fcb5618d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
 
 	/* already zeroed?  we're done. */
 	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
-		return iomap_iter_advance(iter, &length);
+		return iomap_iter_advance(iter, length);
 
 	/*
 	 * invalidate the pages whose sharing state is to be changed
@@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		if (ret < 0)
 			return ret;
 
-		ret = iomap_iter_advance(iter, &length);
+		ret = iomap_iter_advance(iter, length);
 		if (ret)
 			return ret;
-	} while (length > 0);
+	} while ((length = iomap_length(iter)) > 0);
 
 	if (did_zero)
 		*did_zero = true;
@@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 
 		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
 			done = iov_iter_zero(min(length, end - pos), iter);
-			return iomap_iter_advance(iomi, &done);
+			return iomap_iter_advance(iomi, done);
 		}
 	}
 
@@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
 
-		length = xfer;
-		ret = iomap_iter_advance(iomi, &length);
+		ret = iomap_iter_advance(iomi, xfer);
 		if (!ret && xfer == 0)
 			ret = -EFAULT;
 		if (xfer < map_len)
 			break;
+		length = iomap_length(iomi);
 	}
 	dax_read_unlock(id);
 
@@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
 			ret |= VM_FAULT_MAJOR;
 		}
 
-		if (!(ret & VM_FAULT_ERROR)) {
-			u64 length = PAGE_SIZE;
-			iter.status = iomap_iter_advance(&iter, &length);
-		}
+		if (!(ret & VM_FAULT_ERROR))
+			iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
 	}
 
 	if (iomap_errp)
@@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
 			continue; /* actually breaks out of the loop */
 
 		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
-		if (ret != VM_FAULT_FALLBACK) {
-			u64 length = PMD_SIZE;
-			iter.status = iomap_iter_advance(&iter, &length);
-		}
+		if (ret != VM_FAULT_FALLBACK)
+			iter.status = iomap_iter_advance(&iter, PMD_SIZE);
 	}
 
 unlock_entry:
@@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
 	const struct iomap *smap = &it_src->iomap;
 	const struct iomap *dmap = &it_dest->iomap;
 	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
-	u64 dest_len;
 	void *saddr, *daddr;
 	int id, ret;
 
@@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
 	dax_read_unlock(id);
 
 advance:
-	dest_len = len;
-	ret = iomap_iter_advance(it_src, &len);
+	ret = iomap_iter_advance(it_src, len);
 	if (!ret)
-		ret = iomap_iter_advance(it_dest, &dest_len);
+		ret = iomap_iter_advance(it_dest, len);
 	return ret;
 
 out_unlock:
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8b847a1e27f1..6cc2ee44bbca 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -376,7 +376,7 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
 		ret = iomap_read_inline_data(iter, folio);
 		if (ret)
 			return ret;
-		return iomap_iter_advance(iter, &length);
+		return iomap_iter_advance(iter, length);
 	}
 
 	/* zero post-eof blocks as the page may be mapped */
@@ -437,7 +437,7 @@ done:
 	 * iteration.
 	 */
 	length = pos - iter->pos + plen;
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, length);
 }
 
 static int iomap_read_folio_iter(struct iomap_iter *iter,
@@ -1041,7 +1041,7 @@ retry:
 			}
 		} else {
 			total_written += written;
-			iomap_iter_advance(iter, &written);
+			iomap_iter_advance(iter, written);
 		}
 	} while (iov_iter_count(i) && iomap_length(iter));
 
@@ -1310,7 +1310,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
 	int status;
 
 	if (!iomap_want_unshare_iter(iter))
-		return iomap_iter_advance(iter, &bytes);
+		return iomap_iter_advance(iter, bytes);
 
 	do {
 		struct folio *folio;
@@ -1334,10 +1334,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
 
 		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
 
-		status = iomap_iter_advance(iter, &bytes);
+		status = iomap_iter_advance(iter, bytes);
 		if (status)
 			break;
-	} while (bytes > 0);
+	} while ((bytes = iomap_length(iter)) > 0);
 
 	return status;
 }
@@ -1412,10 +1412,10 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		if (WARN_ON_ONCE(!ret))
 			return -EIO;
 
-		status = iomap_iter_advance(iter, &bytes);
+		status = iomap_iter_advance(iter, bytes);
 		if (status)
 			break;
-	} while (bytes > 0);
+	} while ((bytes = iomap_length(iter)) > 0);
 
 	if (did_zero)
 		*did_zero = true;
@@ -1526,7 +1526,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 		folio_mark_dirty(folio);
 	}
 
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, length);
 }
 
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d5d63efbd57..e9e5f0703160 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -496,7 +496,7 @@ out:
 	/* Undo iter limitation to current extent */
 	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
 	if (copied)
-		return iomap_iter_advance(iter, &copied);
+		return iomap_iter_advance(iter, copied);
 	return ret;
 }
 
@@ -507,7 +507,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	dio->size += length;
 	if (!length)
 		return -EFAULT;
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, length);
 }
 
 static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
@@ -542,7 +542,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
 	dio->size += copied;
 	if (!copied)
 		return -EFAULT;
-	return iomap_iter_advance(iomi, &copied);
+	return iomap_iter_advance(iomi, copied);
 }
 
 static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index cef77ca0c20b..91d2024e00da 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -13,17 +13,13 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
 	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
 }
 
-/*
- * Advance the current iterator position and output the length remaining for the
- * current mapping.
- */
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+/* Advance the current iterator position and decrement the remaining length */
+int iomap_iter_advance(struct iomap_iter *iter, u64 count)
 {
-	if (WARN_ON_ONCE(*count > iomap_length(iter)))
+	if (WARN_ON_ONCE(count > iomap_length(iter)))
 		return -EIO;
-	iter->pos += *count;
-	iter->len -= *count;
-	*count = iomap_length(iter);
+	iter->pos += count;
+	iter->len -= count;
 	return 0;
 }
 
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 56db2dd4b10d..6cbc587c93da 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter,
 		*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
 				iter->pos, iter->pos + length, SEEK_HOLE);
 		if (*hole_pos == iter->pos + length)
-			return iomap_iter_advance(iter, &length);
+			return iomap_iter_advance(iter, length);
 		return 0;
 	case IOMAP_HOLE:
 		*hole_pos = iter->pos;
 		return 0;
 	default:
-		return iomap_iter_advance(iter, &length);
+		return iomap_iter_advance(iter, length);
 	}
 }
 
@@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter,
 
 	switch (iter->iomap.type) {
 	case IOMAP_HOLE:
-		return iomap_iter_advance(iter, &length);
+		return iomap_iter_advance(iter, length);
 	case IOMAP_UNWRITTEN:
 		*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
 				iter->pos, iter->pos + length, SEEK_DATA);
 		if (*hole_pos < 0)
-			return iomap_iter_advance(iter, &length);
+			return iomap_iter_advance(iter, length);
 		return 0;
 	default:
 		*hole_pos = iter->pos;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 73dceabc21c8..4469b2318b08 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -245,7 +245,7 @@ struct iomap_iter {
 };
 
 int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
+int iomap_iter_advance(struct iomap_iter *iter, u64 count);
 
 /**
  * iomap_length_trim - trimmed length of the current iomap iteration
@@ -282,9 +282,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter)
  */
 static inline int iomap_iter_advance_full(struct iomap_iter *iter)
 {
-	u64 length = iomap_length(iter);
-
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, iomap_length(iter));
 }
 
 /**
-- 
cgit v1.2.3


From dc816f8d925cac34922ea73abd94ae23a96cacac Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 1 Oct 2025 01:53:14 +0200
Subject: fs: assert ->i_lock held in __iget()

Also remove the now redundant comment.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..ac62b9d10b00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3378,11 +3378,9 @@ static inline bool is_zero_ino(ino_t ino)
 	return (u32)ino == 0;
 }
 
-/*
- * inode->i_lock must be held
- */
 static inline void __iget(struct inode *inode)
 {
+	lockdep_assert_held(&inode->i_lock);
 	atomic_inc(&inode->i_count);
 }
 
-- 
cgit v1.2.3


From 31e332b911fca54df467d264d7e2a2ef9317f3ca Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Mon, 6 Oct 2025 01:15:26 +0200
Subject: fs: add missing fences to I_NEW handling

Suppose there are 2 CPUs racing inode hash lookup func (say ilookup5())
and unlock_new_inode().

In principle the latter can clear the I_NEW flag before prior stores
into the inode were made visible.

The former can in turn observe I_NEW is cleared and proceed to use the
inode, while possibly reading from not-yet-published areas.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/dcache.c               | 4 ++++
 fs/inode.c                | 8 ++++++++
 include/linux/writeback.h | 4 ++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index a067fa0a965a..806d6a665124 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1981,6 +1981,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	spin_lock(&inode->i_lock);
 	__d_instantiate(entry, inode);
 	WARN_ON(!(inode->i_state & I_NEW));
+	/*
+	 * Pairs with smp_rmb in wait_on_inode().
+	 */
+	smp_wmb();
 	inode->i_state &= ~I_NEW & ~I_CREATING;
 	/*
 	 * Pairs with the barrier in prepare_to_wait_event() to make sure
diff --git a/fs/inode.c b/fs/inode.c
index fa82cb810af4..37fc7a72aba5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1181,6 +1181,10 @@ void unlock_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
+	/*
+	 * Pairs with smp_rmb in wait_on_inode().
+	 */
+	smp_wmb();
 	inode->i_state &= ~I_NEW & ~I_CREATING;
 	/*
 	 * Pairs with the barrier in prepare_to_wait_event() to make sure
@@ -1198,6 +1202,10 @@ void discard_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
+	/*
+	 * Pairs with smp_rmb in wait_on_inode().
+	 */
+	smp_wmb();
 	inode->i_state &= ~I_NEW;
 	/*
 	 * Pairs with the barrier in prepare_to_wait_event() to make sure
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 22dd4adc5667..e1e1231a6830 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -194,6 +194,10 @@ static inline void wait_on_inode(struct inode *inode)
 {
 	wait_var_event(inode_state_wait_address(inode, __I_NEW),
 		       !(READ_ONCE(inode->i_state) & I_NEW));
+	/*
+	 * Pairs with routines clearing I_NEW.
+	 */
+	smp_rmb();
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
-- 
cgit v1.2.3


From af6023e2ce0a3d4d948885d464b0ddca4b8b1fdf Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 9 Oct 2025 09:59:15 +0200
Subject: fs: move wait_on_inode() from writeback.h to fs.h

The only consumer outside of fs/inode.c is gfs2 and it already includes
fs.h in the relevant file.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h        | 10 ++++++++++
 include/linux/writeback.h | 11 -----------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ac62b9d10b00..b35014ba681b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -949,6 +949,16 @@ static inline void inode_fake_hash(struct inode *inode)
 	hlist_add_fake(&inode->i_hash);
 }
 
+static inline void wait_on_inode(struct inode *inode)
+{
+	wait_var_event(inode_state_wait_address(inode, __I_NEW),
+		       !(READ_ONCE(inode->i_state) & I_NEW));
+	/*
+	 * Pairs with routines clearing I_NEW.
+	 */
+	smp_rmb();
+}
+
 /*
  * inode->i_rwsem nesting subclasses for the lock validator:
  *
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e1e1231a6830..06195c2a535b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -189,17 +189,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
 void inode_wait_for_writeback(struct inode *inode);
 void inode_io_list_del(struct inode *inode);
 
-/* writeback.h requires fs.h; it, too, is not included from here. */
-static inline void wait_on_inode(struct inode *inode)
-{
-	wait_var_event(inode_state_wait_address(inode, __I_NEW),
-		       !(READ_ONCE(inode->i_state) & I_NEW));
-	/*
-	 * Pairs with routines clearing I_NEW.
-	 */
-	smp_rmb();
-}
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 #include <linux/cgroup.h>
-- 
cgit v1.2.3


From cb5db358ab5769cbd3e8e864f14af321126cccdb Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 9 Oct 2025 09:59:16 +0200
Subject: fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb

The incoming helpers don't ship with _release/_acquire variants, for
the time being anyway.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c           | 5 +++--
 include/linux/backing-dev.h | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2b35e80037fe..9cda19a40ca2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -476,10 +476,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
 	switched = true;
 skip_switch:
 	/*
-	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * Paired with an acquire fence in unlocked_inode_to_wb_begin() and
 	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
 	 */
-	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+	smp_wmb();
+	inode->i_state &= ~I_WB_SWITCH;
 
 	xa_unlock_irq(&mapping->i_pages);
 	spin_unlock(&inode->i_lock);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3e64f14739dd..065cba5dc111 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 	rcu_read_lock();
 
 	/*
-	 * Paired with store_release in inode_switch_wbs_work_fn() and
+	 * Paired with a release fence in inode_do_switch_wbs() and
 	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
 	 */
-	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+	cookie->locked = inode->i_state & I_WB_SWITCH;
+	smp_rmb();
 
 	if (unlikely(cookie->locked))
 		xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);
-- 
cgit v1.2.3


From d8753f788ab4916341d9fab81795be9f2f49c264 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 9 Oct 2025 09:59:17 +0200
Subject: fs: provide accessors for ->i_state

Open-coded accesses prevent asserting they are done correctly. One
obvious aspect is locking, but significantly more can be checked. For
example it can be detected when the code is clearing flags which are
already missing, or is setting flags when it is illegal (e.g., I_FREEING
when ->i_count > 0).

In order to keep things manageable this patchset merely gets the thing
off the ground with only lockdep checks baked in.

Current consumers can be trivially converted.

Suppose flags I_A and I_B are to be handled.

If ->i_lock is held, then:

state = inode->i_state  	=> state = inode_state_read(inode)
inode->i_state |= (I_A | I_B) 	=> inode_state_set(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B) 	=> inode_state_clear(inode, I_A | I_B)
inode->i_state = I_A | I_B	=> inode_state_assign(inode, I_A | I_B)

If ->i_lock is not held or only held conditionally:

state = inode->i_state  	=> state = inode_state_read_once(inode)
inode->i_state |= (I_A | I_B) 	=> inode_state_set_raw(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B) 	=> inode_state_clear_raw(inode, I_A | I_B)
inode->i_state = I_A | I_B	=> inode_state_assign_raw(inode, I_A | I_B)

The "_once" vs "_raw" discrepancy stems from the read variant differing
by READ_ONCE as opposed to just lockdep checks.

Finally, if you want to atomically clear flags and set new ones, the
following:

state = inode->i_state;
state &= ~I_A;
state |= I_B;
inode->i_state = state;

turns into:

inode_state_replace(inode, I_A, I_B);

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 76 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b35014ba681b..909eb1e68637 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -759,7 +759,7 @@ enum inode_state_bits {
 	/* reserved wait address bit 3 */
 };
 
-enum inode_state_flags_t {
+enum inode_state_flags_enum {
 	I_NEW			= (1U << __I_NEW),
 	I_SYNC			= (1U << __I_SYNC),
 	I_LRU_ISOLATING         = (1U << __I_LRU_ISOLATING),
@@ -843,7 +843,7 @@ struct inode {
 #endif
 
 	/* Misc */
-	enum inode_state_flags_t	i_state;
+	enum inode_state_flags_enum i_state;
 	/* 32-bit hole */
 	struct rw_semaphore	i_rwsem;
 
@@ -902,6 +902,80 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;
 
+/*
+ * i_state handling
+ *
+ * We hide all of it behind helpers so that we can validate consumers.
+ */
+static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode)
+{
+	return READ_ONCE(inode->i_state);
+}
+
+static inline enum inode_state_flags_enum inode_state_read(struct inode *inode)
+{
+	lockdep_assert_held(&inode->i_lock);
+	return inode->i_state;
+}
+
+static inline void inode_state_set_raw(struct inode *inode,
+				       enum inode_state_flags_enum flags)
+{
+	WRITE_ONCE(inode->i_state, inode->i_state | flags);
+}
+
+static inline void inode_state_set(struct inode *inode,
+				   enum inode_state_flags_enum flags)
+{
+	lockdep_assert_held(&inode->i_lock);
+	inode_state_set_raw(inode, flags);
+}
+
+static inline void inode_state_clear_raw(struct inode *inode,
+					 enum inode_state_flags_enum flags)
+{
+	WRITE_ONCE(inode->i_state, inode->i_state & ~flags);
+}
+
+static inline void inode_state_clear(struct inode *inode,
+				     enum inode_state_flags_enum flags)
+{
+	lockdep_assert_held(&inode->i_lock);
+	inode_state_clear_raw(inode, flags);
+}
+
+static inline void inode_state_assign_raw(struct inode *inode,
+					  enum inode_state_flags_enum flags)
+{
+	WRITE_ONCE(inode->i_state, flags);
+}
+
+static inline void inode_state_assign(struct inode *inode,
+				      enum inode_state_flags_enum flags)
+{
+	lockdep_assert_held(&inode->i_lock);
+	inode_state_assign_raw(inode, flags);
+}
+
+static inline void inode_state_replace_raw(struct inode *inode,
+					   enum inode_state_flags_enum clearflags,
+					   enum inode_state_flags_enum setflags)
+{
+	enum inode_state_flags_enum flags;
+	flags = inode->i_state;
+	flags &= ~clearflags;
+	flags |= setflags;
+	inode_state_assign_raw(inode, flags);
+}
+
+static inline void inode_state_replace(struct inode *inode,
+				       enum inode_state_flags_enum clearflags,
+				       enum inode_state_flags_enum setflags)
+{
+	lockdep_assert_held(&inode->i_lock);
+	inode_state_replace_raw(inode, clearflags, setflags);
+}
+
 static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
 {
 	VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
-- 
cgit v1.2.3


From f5aa78e2be066f3801785094f1b55a3114fe461a Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 9 Oct 2025 09:59:19 +0200
Subject: Manual conversion to use ->i_state accessors of all places not
 covered by coccinelle

Nothing to look at apart from iput_final().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/porting.rst |  2 +-
 fs/afs/inode.c                        |  2 +-
 fs/ext4/inode.c                       | 10 +++++-----
 fs/ext4/orphan.c                      |  4 ++--
 fs/inode.c                            | 18 ++++++++----------
 include/linux/backing-dev.h           |  2 +-
 include/linux/fs.h                    |  6 +++---
 include/linux/writeback.h             |  2 +-
 include/trace/events/writeback.h      |  8 ++++----
 9 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 7233b04668fc..35f027981b21 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -211,7 +211,7 @@ test and set for you.
 e.g.::
 
 	inode = iget_locked(sb, ino);
-	if (inode->i_state & I_NEW) {
+	if (inode_state_read_once(inode) & I_NEW) {
 		err = read_inode_from_disk(inode);
 		if (err < 0) {
 			iget_failed(inode);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 2fe2ccf59c7a..dde1857fcabb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
 	struct afs_vnode *vnode = vp->vnode;
 	int ret;
 
-	if (vnode->netfs.inode.i_state & I_NEW) {
+	if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) {
 		ret = afs_inode_init_from_status(op, vp, vnode);
 		afs_op_set_error(op, ret);
 		if (ret == 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9e4ac87211e..b864e9645f85 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -425,7 +425,7 @@ void ext4_check_map_extents_env(struct inode *inode)
 	if (!S_ISREG(inode->i_mode) ||
 	    IS_NOQUOTA(inode) || IS_VERITY(inode) ||
 	    is_special_ino(inode->i_sb, inode->i_ino) ||
-	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+	    (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
 	    ext4_verity_in_progress(inode))
 		return;
@@ -3473,7 +3473,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 	/* Any metadata buffers to write? */
 	if (!list_empty(&inode->i_mapping->i_private_list))
 		return true;
-	return inode->i_state & I_DIRTY_DATASYNC;
+	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
 }
 
 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
@@ -4552,7 +4552,7 @@ int ext4_truncate(struct inode *inode)
 	 * or it's a completely new inode. In those cases we might not
 	 * have i_rwsem locked because it's not necessary.
 	 */
-	if (!(inode->i_state & (I_NEW|I_FREEING)))
+	if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
 		WARN_ON(!inode_is_locked(inode));
 	trace_ext4_truncate_enter(inode);
 
@@ -5210,7 +5210,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW)) {
+	if (!(inode_state_read_once(inode) & I_NEW)) {
 		ret = check_igot_inode(inode, flags, function, line);
 		if (ret) {
 			iput(inode);
@@ -5541,7 +5541,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
 	if (inode_is_dirtytime_only(inode)) {
 		struct ext4_inode_info	*ei = EXT4_I(inode);
 
-		inode->i_state &= ~I_DIRTY_TIME;
+		inode_state_clear(inode, I_DIRTY_TIME);
 		spin_unlock(&inode->i_lock);
 
 		spin_lock(&ei->i_raw_lock);
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 33c3a89396b1..c4903d98ff81 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	if (!sbi->s_journal || is_bad_inode(inode))
 		return 0;
 
-	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+	WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
 		     !inode_is_locked(inode));
 	if (ext4_inode_orphan_tracked(inode))
 		return 0;
@@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
 		return 0;
 
-	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+	WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
 		     !inode_is_locked(inode));
 	if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
 		return ext4_orphan_file_del(handle, inode);
diff --git a/fs/inode.c b/fs/inode.c
index f094ed3e6f30..3153d725859c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -829,7 +829,7 @@ static void evict(struct inode *inode)
 	 * This also means we don't need any fences for the call below.
 	 */
 	inode_wake_up_bit(inode, __I_NEW);
-	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR));
 
 	destroy_inode(inode);
 }
@@ -1883,7 +1883,6 @@ static void iput_final(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	const struct super_operations *op = inode->i_sb->s_op;
-	unsigned long state;
 	int drop;
 
 	WARN_ON(inode_state_read(inode) & I_NEW);
@@ -1908,20 +1907,19 @@ static void iput_final(struct inode *inode)
 	 */
 	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
 
-	state = inode_state_read(inode);
-	if (!drop) {
-		WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
+	if (drop) {
+		inode_state_set(inode, I_FREEING);
+	} else {
+		inode_state_set(inode, I_WILL_FREE);
 		spin_unlock(&inode->i_lock);
 
 		write_inode_now(inode, 1);
 
 		spin_lock(&inode->i_lock);
-		state = inode_state_read(inode);
-		WARN_ON(state & I_NEW);
-		state &= ~I_WILL_FREE;
+		WARN_ON(inode_state_read(inode) & I_NEW);
+		inode_state_replace(inode, I_WILL_FREE, I_FREEING);
 	}
 
-	WRITE_ONCE(inode->i_state, state | I_FREEING);
 	if (!list_empty(&inode->i_lru))
 		inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
@@ -2985,7 +2983,7 @@ void dump_inode(struct inode *inode, const char *reason)
 	pr_warn("%s encountered for inode %px\n"
 		"fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
 		reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
-		inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
+		inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count));
 }
 
 EXPORT_SYMBOL(dump_inode);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 065cba5dc111..0c8342747cab 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -280,7 +280,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 	 * Paired with a release fence in inode_do_switch_wbs() and
 	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
 	 */
-	cookie->locked = inode->i_state & I_WB_SWITCH;
+	cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH;
 	smp_rmb();
 
 	if (unlikely(cookie->locked))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 909eb1e68637..77b6486dcae7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1026,7 +1026,7 @@ static inline void inode_fake_hash(struct inode *inode)
 static inline void wait_on_inode(struct inode *inode)
 {
 	wait_var_event(inode_state_wait_address(inode, __I_NEW),
-		       !(READ_ONCE(inode->i_state) & I_NEW));
+		       !(inode_state_read_once(inode) & I_NEW));
 	/*
 	 * Pairs with routines clearing I_NEW.
 	 */
@@ -2719,8 +2719,8 @@ static inline int icount_read(const struct inode *inode)
  */
 static inline bool inode_is_dirtytime_only(struct inode *inode)
 {
-	return (inode->i_state & (I_DIRTY_TIME | I_NEW |
-				  I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+	return (inode_state_read_once(inode) &
+	       (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
 }
 
 extern void inc_nlink(struct inode *inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 06195c2a535b..102071ffedcb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -227,7 +227,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
 static inline void inode_detach_wb(struct inode *inode)
 {
 	if (inode->i_wb) {
-		WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
+		WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR));
 		wb_put(inode->i_wb);
 		inode->i_wb = NULL;
 	}
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c08aff044e80..311a341e6fe4 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 		/* may be called for files on pseudo FSes w/ unregistered bdi */
 		strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
 		__entry->ino		= inode->i_ino;
-		__entry->state		= inode->i_state;
+		__entry->state		= inode_state_read_once(inode);
 		__entry->flags		= flags;
 	),
 
@@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		strscpy_pad(__entry->name,
 			    bdi_dev_name(inode_to_bdi(inode)), 32);
 		__entry->ino		= inode->i_ino;
-		__entry->state		= inode->i_state;
+		__entry->state		= inode_state_read_once(inode);
 		__entry->dirtied_when	= inode->dirtied_when;
 		__entry->cgroup_ino	= __trace_wb_assign_cgroup(inode_to_wb(inode));
 	),
@@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		strscpy_pad(__entry->name,
 			    bdi_dev_name(inode_to_bdi(inode)), 32);
 		__entry->ino		= inode->i_ino;
-		__entry->state		= inode->i_state;
+		__entry->state		= inode_state_read_once(inode);
 		__entry->dirtied_when	= inode->dirtied_when;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->nr_to_write	= nr_to_write;
@@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
 	TP_fast_assign(
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
-		__entry->state	= inode->i_state;
+		__entry->state	= inode_state_read_once(inode);
 		__entry->mode	= inode->i_mode;
 		__entry->dirtied_when = inode->dirtied_when;
 	),
-- 
cgit v1.2.3


From 2ed81b4bef9b74ae0f095ad4667dbe2ae0b86a91 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 9 Oct 2025 09:59:28 +0200
Subject: fs: make plain ->i_state access fail to compile

... to make sure all accesses are properly validated.

Merely renaming the var to __i_state still lets the compiler make the
following suggestion:
error: 'struct inode' has no member named 'i_state'; did you mean '__i_state'?

Unfortunately some people will add the __'s and call it a day.

In order to make it harder to mess up in this way, hide it behind a
struct. The resulting error message should be convincing in terms of
checking what to do:
error: invalid operands to binary & (have 'struct inode_state_flags' and 'int')

Of course people determined to do a plain access can still do it, but
nothing can be done for that case.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 77b6486dcae7..21c73df3ce75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -785,6 +785,13 @@ enum inode_state_flags_enum {
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
 
+/*
+ * Use inode_state_read() & friends to access.
+ */
+struct inode_state_flags {
+	enum inode_state_flags_enum __state;
+};
+
 /*
  * Keep mostly read-only and often accessed (especially for
  * the RCU path lookup and 'stat' data) fields at the beginning
@@ -843,7 +850,7 @@ struct inode {
 #endif
 
 	/* Misc */
-	enum inode_state_flags_enum i_state;
+	struct inode_state_flags i_state;
 	/* 32-bit hole */
 	struct rw_semaphore	i_rwsem;
 
@@ -909,19 +916,19 @@ struct inode {
  */
 static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode)
 {
-	return READ_ONCE(inode->i_state);
+	return READ_ONCE(inode->i_state.__state);
 }
 
 static inline enum inode_state_flags_enum inode_state_read(struct inode *inode)
 {
 	lockdep_assert_held(&inode->i_lock);
-	return inode->i_state;
+	return inode->i_state.__state;
 }
 
 static inline void inode_state_set_raw(struct inode *inode,
 				       enum inode_state_flags_enum flags)
 {
-	WRITE_ONCE(inode->i_state, inode->i_state | flags);
+	WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags);
 }
 
 static inline void inode_state_set(struct inode *inode,
@@ -934,7 +941,7 @@ static inline void inode_state_set(struct inode *inode,
 static inline void inode_state_clear_raw(struct inode *inode,
 					 enum inode_state_flags_enum flags)
 {
-	WRITE_ONCE(inode->i_state, inode->i_state & ~flags);
+	WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags);
 }
 
 static inline void inode_state_clear(struct inode *inode,
@@ -947,7 +954,7 @@ static inline void inode_state_clear(struct inode *inode,
 static inline void inode_state_assign_raw(struct inode *inode,
 					  enum inode_state_flags_enum flags)
 {
-	WRITE_ONCE(inode->i_state, flags);
+	WRITE_ONCE(inode->i_state.__state, flags);
 }
 
 static inline void inode_state_assign(struct inode *inode,
@@ -962,7 +969,7 @@ static inline void inode_state_replace_raw(struct inode *inode,
 					   enum inode_state_flags_enum setflags)
 {
 	enum inode_state_flags_enum flags;
-	flags = inode->i_state;
+	flags = inode->i_state.__state;
 	flags &= ~clearflags;
 	flags |= setflags;
 	inode_state_assign_raw(inode, flags);
-- 
cgit v1.2.3


From 1888635532fbbd6be4a4368621085c3a197279f8 Mon Sep 17 00:00:00 2001
From: Julian Sun <sunjunchao@bytedance.com>
Date: Tue, 30 Sep 2025 16:53:15 +0800
Subject: writeback: Wake up waiting tasks when finishing the writeback of a
 chunk.

Writing back a large number of pages can take a lot of time.
This issue is exacerbated when the underlying device is slow or
subject to block layer rate limiting, which in turn triggers
unexpected hung task warnings.

We can trigger a wake-up once a chunk has been written back and the
waiting time for writeback exceeds half of
sysctl_hung_task_timeout_secs.
This action allows the hung task detector to be aware of the writeback
progress, thereby eliminating these unexpected hung task warnings.

This patch has passed the xfstests 'check -g quick' test based on ext4,
with no additional failures introduced.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c                | 10 +++++++++-
 include/linux/backing-dev-defs.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2b35e80037fe..61a980a06cee 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,6 +14,7 @@
  *		Additions for address_space-based writeback
  */
 
+#include <linux/sched/sysctl.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
@@ -213,7 +214,8 @@ static void wb_queue_work(struct bdi_writeback *wb,
 void wb_wait_for_completion(struct wb_completion *done)
 {
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(*done->waitq, !atomic_read(&done->cnt));
+	wait_event(*done->waitq,
+		   ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); }));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -2014,6 +2016,12 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 */
 		__writeback_single_inode(inode, &wbc);
 
+		/* Report progress to inform the hung task detector of the progress. */
+		if (work->done && work->done->progress_stamp &&
+		   (jiffies - work->done->progress_stamp) > HZ *
+		   sysctl_hung_task_timeout_secs / 2)
+			wake_up_all(work->done->waitq);
+
 		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c5c9d89c73ed..c8aa749790b1 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,6 +63,7 @@ enum wb_reason {
 struct wb_completion {
 	atomic_t		cnt;
 	wait_queue_head_t	*waitq;
+	unsigned long progress_stamp;	/* The jiffies when slow progress is detected */
 };
 
 #define __WB_COMPLETION_INIT(_waitq)	\
-- 
cgit v1.2.3


From d6e6215907640801b1f407dc9e871b19ca5a3805 Mon Sep 17 00:00:00 2001
From: Julian Sun <sunjunchao@bytedance.com>
Date: Tue, 30 Sep 2025 15:18:29 +0800
Subject: writeback: Add logging for slow writeback (exceeds
 sysctl_hung_task_timeout_secs)

When a writeback work lasts for sysctl_hung_task_timeout_secs, we want
to identify that there are tasks waiting for a long time - this helps us
pinpoint potential issues.

Additionally, recording the starting jiffies is useful when debugging a
crashed vmcore.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c                | 17 +++++++++++++++--
 include/linux/backing-dev-defs.h |  1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 61a980a06cee..e76192d140e3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -201,6 +201,19 @@ static void wb_queue_work(struct bdi_writeback *wb,
 	spin_unlock_irq(&wb->work_lock);
 }
 
+static bool wb_wait_for_completion_cb(struct wb_completion *done)
+{
+	unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
+
+	done->progress_stamp = jiffies;
+	if (waited_secs > sysctl_hung_task_timeout_secs)
+		pr_info("INFO: The task %s:%d has been waiting for writeback "
+			"completion for more than %lu seconds.",
+			current->comm, current->pid, waited_secs);
+
+	return !atomic_read(&done->cnt);
+}
+
 /**
  * wb_wait_for_completion - wait for completion of bdi_writeback_works
  * @done: target wb_completion
@@ -213,9 +226,9 @@ static void wb_queue_work(struct bdi_writeback *wb,
  */
 void wb_wait_for_completion(struct wb_completion *done)
 {
+	done->wait_start = jiffies;
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(*done->waitq,
-		   ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); }));
+	wait_event(*done->waitq, wb_wait_for_completion_cb(done));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c8aa749790b1..610ef62b6a32 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -64,6 +64,7 @@ struct wb_completion {
 	atomic_t		cnt;
 	wait_queue_head_t	*waitq;
 	unsigned long progress_stamp;	/* The jiffies when slow progress is detected */
+	unsigned long wait_start;	/* The jiffies when waiting for the writeback work to finish */
 };
 
 #define __WB_COMPLETION_INIT(_waitq)	\
-- 
cgit v1.2.3


From a00f3dea0352a5fb0b67b84c72daeb6563f8e67f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 9 Oct 2025 21:25:56 +0200
Subject: ACPI: PM: s2idle: Drop acpi_get_lps0_constraint()

Drop unused function acpi_get_lps0_constraint().

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
Link: https://patch.msgid.link/5032801.GXAFRqVoOG@rafael.j.wysocki
---
 drivers/acpi/x86/s2idle.c | 24 ------------------------
 include/linux/acpi.h      |  5 -----
 2 files changed, 29 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index dd0b40b9bbe8..ea645853fc20 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -299,30 +299,6 @@ free_acpi_buffer:
 	ACPI_FREE(out_obj);
 }
 
-/**
- * acpi_get_lps0_constraint - Get the LPS0 constraint for a device.
- * @adev: Device to get the constraint for.
- *
- * The LPS0 constraint is the shallowest (minimum) power state in which the
- * device can be so as to allow the platform as a whole to achieve additional
- * energy conservation by utilizing a system-wide low-power state.
- *
- * Returns:
- *  - ACPI power state value of the constraint for @adev on success.
- *  - Otherwise, ACPI_STATE_UNKNOWN.
- */
-int acpi_get_lps0_constraint(struct acpi_device *adev)
-{
-	struct lpi_constraints *entry;
-
-	for_each_lpi_constraint(entry) {
-		if (adev->handle == entry->handle)
-			return entry->min_dstate;
-	}
-
-	return ACPI_STATE_UNKNOWN;
-}
-
 static void lpi_check_constraints(void)
 {
 	struct lpi_constraints *entry;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..252768d007c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1146,12 +1146,7 @@ struct acpi_s2idle_dev_ops {
 #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86)
 int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
 void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
-int acpi_get_lps0_constraint(struct acpi_device *adev);
 #else /* CONFIG_SUSPEND && CONFIG_X86 */
-static inline int acpi_get_lps0_constraint(struct device *dev)
-{
-	return ACPI_STATE_UNKNOWN;
-}
 static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 370157293175a702036203faec3e0495b081f135 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 15 Oct 2025 20:59:17 -0700
Subject: nl802154: fix some kernel-doc warnings

Correct multiple kernel-doc warnings in nl802154.h:

- Fix a typo on one enum name to avoid a kernel-doc warning.
- Drop 2 enum descriptions that are no longer needed.
- Mark 2 internal enums as "private:" so that kernel-doc is not needed
  for them.

Warning: nl802154.h:239 Enum value 'NL802154_CAP_ATTR_MAX_MAXBE' not described in enum 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:369 Enum value '__NL802154_CCA_OPT_ATTR_AFTER_LAST' not described in enum 'nl802154_cca_opts'
Warning: nl802154.h:369 Enum value 'NL802154_CCA_OPT_ATTR_MAX' not described in enum 'nl802154_cca_opts'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251016035917.1148012-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nl802154.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index a994dea74596..442822746e92 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -191,14 +191,12 @@ enum nl802154_iftype {
  * @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr
  * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for
  *	nl802154_wpan_phy_tx_power
- * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level
- * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maximum value for cca_ed_level
  * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags
  * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags
  * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value
  * @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value
  * @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value
- * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value
+ * @NL802154_CAP_ATTR_MAX_MAXBE: maximum of maxbe value
  * @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value
  * @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value
  * @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value
@@ -364,6 +362,7 @@ enum nl802154_cca_opts {
 	NL802154_CCA_OPT_ENERGY_CARRIER_AND,
 	NL802154_CCA_OPT_ENERGY_CARRIER_OR,
 
+	/* private: */
 	/* keep last */
 	__NL802154_CCA_OPT_ATTR_AFTER_LAST,
 	NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1
-- 
cgit v1.2.3


From 813882ae22756bcf9645d405e045c60e5aab0a93 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 16 Oct 2025 15:36:46 +0100
Subject: net: stmmac: remove broken PCS code

Changing the netif_carrier_*() state behind phylink's back has always
been prohibited because it messes up phylink's state tracking, and
means that phylink no longer guarantees to call the mac_link_down()
and mac_link_up() methods at the appropriate times.  This was later
documented in the sfp-phylink network driver conversion guide.

stmmac was converted to phylink in 2019, but nothing was done with the
"PCS" code. Since then, apart from the updates as part of phylink
development, nothing has happened with stmmac to improve its use of
phylink, or even to address this point.

A couple of years ago, a has_integrated_pcs boolean was added by Bart,
which later became the STMMAC_FLAG_HAS_INTEGRATED_PCS flag, to avoid
manipulating the netif_carrier_*() state. This flag is mis-named,
because whenever the stmmac is synthesized for its native SGMII, TBI
or RTBI interfaces, it has an "integrated PCS". This boolean/flag
actually means "ignore the status from the integrated PCS".

Discussing with Bart, the reasons for this are lost to the winds of
time (which is why we should always document the reasons in the commit
message.)

RGMII also has in-band status, and the dwmac cores and stmmac code
support this but with one bug that saves the day.

When dwmac cores are synthesised for RGMII only, they do not contain
an integrated PCS, and so priv->dma_cap.pcs is clear, which prevents
(incorrectly) the "RGMII PCS" being used, meaning we don't read the
in-band status. However, a core synthesised for RGMII and also SGMII,
TBI or RTBI will have this capability bit set, thus making these
code paths reachable.

The Jetson Xavier NX uses RGMII mode to talk to its PHY, and removing
the incorrect check for priv->dma_cap.pcs reveals the theoretical issue
with netif_carrier_*() manipulation is real:

dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0
dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141)
dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found
dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported
dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock
dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode
8021q: adding VLAN 0 to HW filter on device eth0
dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported
Link is Up - 1000/Full
Link is Down
Link is Up - 1000/Full

This looks good until one realises that the phylink "Link" status
messages are missing, even when the RJ45 cable is reconnected. Nothing
one can do results in the interface working. The interrupt handler
(which prints those "Link is" messages) always wins over phylink's
resolve worker, meaning phylink never calls the mac_link_up() nor
mac_link_down() methods.

eth0 also sees no traffic received, and is unable to obtain a DHCP
address:

3: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group defa
ult qlen 1000
    link/ether e6:d3:6a:e6:92:de brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast
    0          0        0       0       0       0
    TX: bytes  packets  errors  dropped carrier collsns
    27686      149      0       0       0       0

With the STMMAC_FLAG_HAS_INTEGRATED_PCS flag set, which disables the
netif_carrier_*() manipulation then stmmac works normally:

dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0
dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141)
dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found
dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported
dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock
dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode
8021q: adding VLAN 0 to HW filter on device eth0
dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported
Link is Up - 1000/Full
dwc-eth-dwmac 2490000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx

and packets can be transferred.

This clearly shows that when priv->hw->pcs is set, but
STMMAC_FLAG_HAS_INTEGRATED_PCS is clear, the driver reliably fails.

Discovering whether a platform falls into this is impossible, as it
would require parsing all the dtsi and dts files to find out which use
the stmmac driver and whether any of them use RGMII or SGMII, and it
also depends on whether an external interface is being used. The kernel
likely doesn't contain all dts files either.

The only driver that sets this flag uses the qcom,sa8775p-ethqos
compatible, and uses SGMII or 2500BASE-X.

but these are saved from this problem by the incorrect check for
priv->dma_cap.pcs.

So, we have to assume that every other platform that uses SGMII
with stmmac is using an external PCS.

Moreover, ethtool output can be incorrect. With the full-duplex link
negotiated, ethtool reports:

        Speed: 1000Mb/s
        Duplex: Half

because with dwmac4, the full-duplex bit is in bit 16 of the status,
priv->xstats.pcs_duplex becomes BIT(16) for full duplex, but the
ethtool ksettings duplex member is u8 - so becomes zero. Moreover,
the supported, advertised and link partner modes are all "not
reported".

Finally, ksettings_set() won't be able to set the advertisement on
a PHY if this PCS code is activated, which is incorrect when SGMII
is used with a PHY.

Thus, remove:
1. the incorrect netif_carrier_*() manipulation.
2. the broken ethtool ksettings code.

Given that all uses of STMMAC_FLAG_HAS_INTEGRATED_PCS are now gone,
remove the flag from stmmac.h and dwmac-qcom-ethqos.c.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P5y-0000000AolC-1QWH@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c    |  4 --
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   | 55 ----------------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  9 ----
 include/linux/stmmac.h                             |  1 -
 4 files changed, 69 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
index d8fd4d8f6ced..f62825220cf7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
@@ -96,7 +96,6 @@ struct ethqos_emac_driver_data {
 	bool rgmii_config_loopback_en;
 	bool has_emac_ge_3;
 	const char *link_clk_name;
-	bool has_integrated_pcs;
 	u32 dma_addr_width;
 	struct dwmac4_addrs dwmac4_addrs;
 	bool needs_sgmii_loopback;
@@ -282,7 +281,6 @@ static const struct ethqos_emac_driver_data emac_v4_0_0_data = {
 	.rgmii_config_loopback_en = false,
 	.has_emac_ge_3 = true,
 	.link_clk_name = "phyaux",
-	.has_integrated_pcs = true,
 	.needs_sgmii_loopback = true,
 	.dma_addr_width = 36,
 	.dwmac4_addrs = {
@@ -856,8 +854,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev)
 		plat_dat->flags |= STMMAC_FLAG_TSO_EN;
 	if (of_device_is_compatible(np, "qcom,qcs404-ethqos"))
 		plat_dat->flags |= STMMAC_FLAG_RX_CLK_RUNS_IN_LPI;
-	if (data->has_integrated_pcs)
-		plat_dat->flags |= STMMAC_FLAG_HAS_INTEGRATED_PCS;
 	if (data->dma_addr_width)
 		plat_dat->host_dma_width = data->dma_addr_width;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 39fa1ec92f82..d89662b48087 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -322,47 +322,6 @@ static int stmmac_ethtool_get_link_ksettings(struct net_device *dev,
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) &&
-	    (priv->hw->pcs & STMMAC_PCS_RGMII ||
-	     priv->hw->pcs & STMMAC_PCS_SGMII)) {
-		u32 supported, advertising, lp_advertising;
-
-		if (!priv->xstats.pcs_link) {
-			cmd->base.speed = SPEED_UNKNOWN;
-			cmd->base.duplex = DUPLEX_UNKNOWN;
-			return 0;
-		}
-		cmd->base.duplex = priv->xstats.pcs_duplex;
-
-		cmd->base.speed = priv->xstats.pcs_speed;
-
-		/* Encoding of PSE bits is defined in 802.3z, 37.2.1.4 */
-
-		ethtool_convert_link_mode_to_legacy_u32(
-			&supported, cmd->link_modes.supported);
-		ethtool_convert_link_mode_to_legacy_u32(
-			&advertising, cmd->link_modes.advertising);
-		ethtool_convert_link_mode_to_legacy_u32(
-			&lp_advertising, cmd->link_modes.lp_advertising);
-
-		/* Reg49[3] always set because ANE is always supported */
-		cmd->base.autoneg = ADVERTISED_Autoneg;
-		supported |= SUPPORTED_Autoneg;
-		advertising |= ADVERTISED_Autoneg;
-		lp_advertising |= ADVERTISED_Autoneg;
-
-		cmd->base.port = PORT_OTHER;
-
-		ethtool_convert_legacy_u32_to_link_mode(
-			cmd->link_modes.supported, supported);
-		ethtool_convert_legacy_u32_to_link_mode(
-			cmd->link_modes.advertising, advertising);
-		ethtool_convert_legacy_u32_to_link_mode(
-			cmd->link_modes.lp_advertising, lp_advertising);
-
-		return 0;
-	}
-
 	return phylink_ethtool_ksettings_get(priv->phylink, cmd);
 }
 
@@ -372,20 +331,6 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev,
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) &&
-	    (priv->hw->pcs & STMMAC_PCS_RGMII ||
-	     priv->hw->pcs & STMMAC_PCS_SGMII)) {
-		/* Only support ANE */
-		if (cmd->base.autoneg != AUTONEG_ENABLE)
-			return -EINVAL;
-
-		mutex_lock(&priv->lock);
-		stmmac_pcs_ctrl_ane(priv, 1, priv->hw->ps, 0);
-		mutex_unlock(&priv->lock);
-
-		return 0;
-	}
-
 	return phylink_ethtool_ksettings_set(priv->phylink, cmd);
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c9fa965c8566..867d0ca3b45e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6001,15 +6001,6 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
 		for (queue = 0; queue < queues_count; queue++)
 			stmmac_host_mtl_irq_status(priv, priv->hw, queue);
 
-		/* PCS link status */
-		if (priv->hw->pcs &&
-		    !(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS)) {
-			if (priv->xstats.pcs_link)
-				netif_carrier_on(priv->dev);
-			else
-				netif_carrier_off(priv->dev);
-		}
-
 		stmmac_timestamp_interrupt(priv, priv);
 	}
 }
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index fa1318bac06c..99022620457a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -171,7 +171,6 @@ struct dwmac4_addrs {
 	u32 mtl_low_cred_offset;
 };
 
-#define STMMAC_FLAG_HAS_INTEGRATED_PCS		BIT(0)
 #define STMMAC_FLAG_SPH_DISABLE			BIT(1)
 #define STMMAC_FLAG_USE_PHY_WOL			BIT(2)
 #define STMMAC_FLAG_HAS_SUN8I			BIT(3)
-- 
cgit v1.2.3


From d19f6451c6feefd6537b97efa5f3859681f243cb Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 16 Oct 2025 11:09:26 +0200
Subject: gpio: export gpiod_hwgpio()

Reading the GPIO hardware number from a descriptor is a valid use-case
outside of the GPIO core. Export the symbol to consumers of GPIO
descriptors.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andrew Jeffery <andrew@codeconstruct.com.au>
Link: https://lore.kernel.org/r/20251016-aspeed-gpiolib-include-v1-2-31201c06d124@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c        | 13 +++++++++++++
 drivers/gpio/gpiolib.h        |  8 --------
 include/linux/gpio/consumer.h |  2 ++
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 5a450dac8f3a..a81981336b36 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -235,6 +235,19 @@ int desc_to_gpio(const struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(desc_to_gpio);
 
+/**
+ * gpiod_hwgpio - Return the GPIO number of the passed descriptor relative to
+ *                its chip.
+ * @desc: GPIO descriptor
+ *
+ * Returns:
+ * Hardware offset of the GPIO represented by the descriptor.
+ */
+int gpiod_hwgpio(const struct gpio_desc *desc)
+{
+	return desc - &desc->gdev->descs[0];
+}
+EXPORT_SYMBOL_GPL(gpiod_hwgpio);
 
 /**
  * gpiod_to_chip - Return the GPIO chip to which a GPIO descriptor belongs
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index 62d4c15b74f5..14e6a9807a89 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -273,14 +273,6 @@ int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev);
 struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
 const char *gpiod_get_label(struct gpio_desc *desc);
 
-/*
- * Return the GPIO number of the passed descriptor relative to its chip
- */
-static inline int gpiod_hwgpio(const struct gpio_desc *desc)
-{
-	return desc - &desc->gdev->descs[0];
-}
-
 /* With descriptor prefix */
 
 #define __gpiod_pr(level, desc, fmt, ...) \
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 00df68c51405..994d46874d56 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -171,6 +171,8 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name);
 struct gpio_desc *gpio_to_desc(unsigned gpio);
 int desc_to_gpio(const struct gpio_desc *desc);
 
+int gpiod_hwgpio(const struct gpio_desc *desc);
+
 struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode,
 					 const char *con_id, int index,
 					 enum gpiod_flags flags,
-- 
cgit v1.2.3


From 44472d1b83127e579c798ff92a07ae86d98b61b9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 6 Oct 2025 13:07:32 +0200
Subject: atomic: Skip alignment check for try_cmpxchg() old arg

The 'old' argument in atomic_try_cmpxchg() and related functions is a
pointer to a normal non-atomic integer number, which is not required
to be naturally aligned, unlike the atomic_t/atomic64_t types themselves.

In order to add an alignment check with CONFIG_DEBUG_ATOMIC into the
normal instrument_atomic_read_write() helper, change this check to use
the non-atomic instrument_read_write(), the same way that was done
earlier for try_cmpxchg() in commit ec570320b09f ("locking/atomic:
Correct (cmp)xchg() instrumentation").

This prevents warnings on m68k calling the 32-bit atomic_try_cmpxchg()
with 16-bit aligned arguments as well as several more architectures
including x86-32 when calling atomic64_try_cmpxchg() with 32-bit
aligned u64 arguments.

Reported-by: Finn Thain <fthain@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/cover.1757810729.git.fthain@linux-m68k.org/
---
 include/linux/atomic/atomic-instrumented.h | 26 +++++++++++++-------------
 scripts/atomic/gen-atomic-instrumented.sh  | 11 +++++++----
 2 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 9409a6ddf3e0..37ab6314a9f7 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new)
 {
 	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_try_cmpxchg(v, old, new);
 }
 
@@ -1298,7 +1298,7 @@ static __always_inline bool
 atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_try_cmpxchg_acquire(v, old, new);
 }
 
@@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
 {
 	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_try_cmpxchg_release(v, old, new);
 }
 
@@ -1343,7 +1343,7 @@ static __always_inline bool
 atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_try_cmpxchg_relaxed(v, old, new);
 }
 
@@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
 {
 	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic64_try_cmpxchg(v, old, new);
 }
 
@@ -2876,7 +2876,7 @@ static __always_inline bool
 atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic64_try_cmpxchg_acquire(v, old, new);
 }
 
@@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
 {
 	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic64_try_cmpxchg_release(v, old, new);
 }
 
@@ -2921,7 +2921,7 @@ static __always_inline bool
 atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic64_try_cmpxchg_relaxed(v, old, new);
 }
 
@@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
 {
 	kcsan_mb();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_long_try_cmpxchg(v, old, new);
 }
 
@@ -4454,7 +4454,7 @@ static __always_inline bool
 atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_long_try_cmpxchg_acquire(v, old, new);
 }
 
@@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
 {
 	kcsan_release();
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_long_try_cmpxchg_release(v, old, new);
 }
 
@@ -4499,7 +4499,7 @@ static __always_inline bool
 atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
 {
 	instrument_atomic_read_write(v, sizeof(*v));
-	instrument_atomic_read_write(old, sizeof(*old));
+	instrument_read_write(old, sizeof(*old));
 	return raw_atomic_long_try_cmpxchg_relaxed(v, old, new);
 }
 
@@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 8829b337928e9508259079d32581775ececd415b
+// f618ac667f868941a84ce0ab2242f1786e049ed4
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 592f3ec89b5f..9c1d53f81eb2 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -12,7 +12,7 @@ gen_param_check()
 	local arg="$1"; shift
 	local type="${arg%%:*}"
 	local name="$(gen_param_name "${arg}")"
-	local rw="write"
+	local rw="atomic_write"
 
 	case "${type#c}" in
 	i) return;;
@@ -20,14 +20,17 @@ gen_param_check()
 
 	if [ ${type#c} != ${type} ]; then
 		# We don't write to constant parameters.
-		rw="read"
+		rw="atomic_read"
+	elif [ "${type}" = "p" ] ; then
+		# The "old" argument in try_cmpxchg() gets accessed non-atomically
+		rw="read_write"
 	elif [ "${meta}" != "s" ]; then
 		# An atomic RMW: if this parameter is not a constant, and this atomic is
 		# not just a 's'tore, this parameter is both read from and written to.
-		rw="read_write"
+		rw="atomic_read_write"
 	fi
 
-	printf "\tinstrument_atomic_${rw}(${name}, sizeof(*${name}));\n"
+	printf "\tinstrument_${rw}(${name}, sizeof(*${name}));\n"
 }
 
 #gen_params_checks(meta, arg...)
-- 
cgit v1.2.3


From cc39f3872c0865bef992b713338df369554fa9e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 9 Oct 2025 22:11:54 +0200
Subject: seqlock: Introduce scoped_seqlock_read()

The read_seqbegin/need_seqretry/done_seqretry API is cumbersome and
error prone. With the new helper the "typical" code like

	int seq, nextseq;
	unsigned long flags;

	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&seqlock, &seq);

		// read-side critical section

		nextseq = 1;
	} while (need_seqretry(&seqlock, seq));
	done_seqretry_irqrestore(&seqlock, seq, flags);

can be rewritten as

	scoped_seqlock_read (&seqlock, ss_lock_irqsave) {
		// read-side critical section
	}

Original idea by Oleg Nesterov; with contributions from Linus.

Originally-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/seqlock.h | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

(limited to 'include')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5ce48eab7a2a..b7bcc4111e90 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1209,4 +1209,115 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
 	if (seq & 1)
 		read_sequnlock_excl_irqrestore(lock, flags);
 }
+
+enum ss_state {
+	ss_done = 0,
+	ss_lock,
+	ss_lock_irqsave,
+	ss_lockless,
+};
+
+struct ss_tmp {
+	enum ss_state	state;
+	unsigned long	data;
+	spinlock_t	*lock;
+	spinlock_t	*lock_irqsave;
+};
+
+static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
+{
+	if (sst->lock)
+		spin_unlock(sst->lock);
+	if (sst->lock_irqsave)
+		spin_unlock_irqrestore(sst->lock_irqsave, sst->data);
+}
+
+extern void __scoped_seqlock_invalid_target(void);
+
+#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000
+/*
+ * For some reason some GCC-8 architectures (nios2, alpha) have trouble
+ * determining that the ss_done state is impossible in __scoped_seqlock_next()
+ * below.
+ */
+static inline void __scoped_seqlock_bug(void) { }
+#else
+/*
+ * Canary for compiler optimization -- if the compiler doesn't realize this is
+ * an impossible state, it very likely generates sub-optimal code here.
+ */
+extern void __scoped_seqlock_bug(void);
+#endif
+
+static inline void
+__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target)
+{
+	switch (sst->state) {
+	case ss_done:
+		__scoped_seqlock_bug();
+		return;
+
+	case ss_lock:
+	case ss_lock_irqsave:
+		sst->state = ss_done;
+		return;
+
+	case ss_lockless:
+		if (!read_seqretry(lock, sst->data)) {
+			sst->state = ss_done;
+			return;
+		}
+		break;
+	}
+
+	switch (target) {
+	case ss_done:
+		__scoped_seqlock_invalid_target();
+		return;
+
+	case ss_lock:
+		sst->lock = &lock->lock;
+		spin_lock(sst->lock);
+		sst->state = ss_lock;
+		return;
+
+	case ss_lock_irqsave:
+		sst->lock_irqsave = &lock->lock;
+		spin_lock_irqsave(sst->lock_irqsave, sst->data);
+		sst->state = ss_lock_irqsave;
+		return;
+
+	case ss_lockless:
+		sst->data = read_seqbegin(lock);
+		return;
+	}
+}
+
+#define __scoped_seqlock_read(_seqlock, _target, _s)			\
+	for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) =	\
+	     { .state = ss_lockless, .data = read_seqbegin(_seqlock) };	\
+	     _s.state != ss_done;					\
+	     __scoped_seqlock_next(&_s, _seqlock, _target))
+
+/**
+ * scoped_seqlock_read (lock, ss_state) - execute the read side critical
+ *                                        section without manual sequence
+ *                                        counter handling or calls to other
+ *                                        helpers
+ * @lock: pointer to seqlock_t protecting the data
+ * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating
+ *            the type of critical read section
+ *
+ * Example:
+ *
+ *     scoped_seqlock_read (&lock, ss_lock) {
+ *         // read-side critical section
+ *     }
+ *
+ * Starts with a lockless pass first. If it fails, restarts the critical
+ * section with the lock held.
+ */
+#define scoped_seqlock_read(_seqlock, _target)				\
+	__scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock))
+
 #endif /* __LINUX_SEQLOCK_H */
-- 
cgit v1.2.3


From 0e85936a9d492acf6ff9519a5f630a7fedb62f7f Mon Sep 17 00:00:00 2001
From: Jishnu Prakash <jishnu.prakash@oss.qualcomm.com>
Date: Wed, 24 Sep 2025 16:17:07 -0700
Subject: dt-bindings: power: qcom,rpmpd: add new RPMH levels

Add constants for voltage levels: LOW_SVS_D2_1, LOW_SVS_D1_1 and
LOW_SVS_L0.

Signed-off-by: Jishnu Prakash <jishnu.prakash@oss.qualcomm.com>
Signed-off-by: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/dt-bindings/power/qcom,rpmhpd.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h
index 73cceb88953f..50e7c886709d 100644
--- a/include/dt-bindings/power/qcom,rpmhpd.h
+++ b/include/dt-bindings/power/qcom,rpmhpd.h
@@ -33,11 +33,14 @@
 #define RPMH_REGULATOR_LEVEL_RETENTION		16
 #define RPMH_REGULATOR_LEVEL_MIN_SVS		48
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D3		50
+#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2_1	51
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D2		52
+#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1_1	54
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D1		56
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D0		60
 #define RPMH_REGULATOR_LEVEL_LOW_SVS		64
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_P1		72
+#define RPMH_REGULATOR_LEVEL_LOW_SVS_L0		76
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_L1		80
 #define RPMH_REGULATOR_LEVEL_LOW_SVS_L2		96
 #define RPMH_REGULATOR_LEVEL_SVS		128
-- 
cgit v1.2.3


From 3ff9bcecce83f12169ab3e42671bd76554ca521a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Oct 2025 13:37:12 +0000
Subject: net: avoid extra access to sk->sk_wmem_alloc in sock_wfree()

UDP TX packets destructor is sock_wfree().

It suffers from a cache line bouncing in sock_def_write_space_wfree().

Instead of reading sk->sk_wmem_alloc after we just did an atomic RMW
on it, use __refcount_sub_and_test() to get the old value for free,
and pass the new value to sock_def_write_space_wfree().

Add __sock_writeable() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251017133712.2842665-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sock.h |  6 +++++-
 net/core/sock.c    | 14 ++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 5c564f114ae9..01ce231603db 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2607,12 +2607,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
 
+static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc)
+{
+	return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1);
+}
 /*
  *	Default write policy as shown to user space via poll/select/SIGIO
  */
 static inline bool sock_writeable(const struct sock *sk)
 {
-	return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
+	return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc));
 }
 
 static inline gfp_t gfp_any(void)
diff --git a/net/core/sock.c b/net/core/sock.c
index b78533fb9268..a99132cc0965 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -155,7 +155,7 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
-static void sock_def_write_space_wfree(struct sock *sk);
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
 static void sock_def_write_space(struct sock *sk);
 
 /**
@@ -2659,16 +2659,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps);
  */
 void sock_wfree(struct sk_buff *skb)
 {
-	struct sock *sk = skb->sk;
 	unsigned int len = skb->truesize;
+	struct sock *sk = skb->sk;
 	bool free;
+	int old;
 
 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
 		if (sock_flag(sk, SOCK_RCU_FREE) &&
 		    sk->sk_write_space == sock_def_write_space) {
 			rcu_read_lock();
-			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
-			sock_def_write_space_wfree(sk);
+			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
+						       &old);
+			sock_def_write_space_wfree(sk, old - len);
 			rcu_read_unlock();
 			if (unlikely(free))
 				__sk_free(sk);
@@ -3612,12 +3614,12 @@ static void sock_def_write_space(struct sock *sk)
  * for SOCK_RCU_FREE sockets under RCU read section and after putting
  * ->sk_wmem_alloc.
  */
-static void sock_def_write_space_wfree(struct sock *sk)
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
 {
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
-	if (sock_writeable(sk)) {
+	if (__sock_writeable(sk, wmem_alloc)) {
 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
 
 		/* rely on refcount_sub from sock_wfree() */
-- 
cgit v1.2.3


From ebaec90ec0b5850ab80ca017e7b63183adcca131 Mon Sep 17 00:00:00 2001
From: Samuel Kayode <samuel.kayode@savoirfairelinux.com>
Date: Wed, 1 Oct 2025 11:42:38 -0400
Subject: mfd: pf1550: Add core driver for the PF1550 PMIC

There are 3 sub-devices for which the drivers will be added in
subsequent patches.

Signed-off-by: Samuel Kayode <samuel.kayode@savoirfairelinux.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Tested-by: Sean Nyekjaer <sean@geanix.com>
Link: https://patch.msgid.link/20251001-pf1550-v12-2-a3302aa41687@savoirfairelinux.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/Kconfig        |  16 ++
 drivers/mfd/Makefile       |   2 +
 drivers/mfd/pf1550.c       | 367 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/pf1550.h | 273 +++++++++++++++++++++++++++++++++
 4 files changed, 658 insertions(+)
 create mode 100644 drivers/mfd/pf1550.c
 create mode 100644 include/linux/mfd/pf1550.h

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 6cec1858947b..219ee6ddf516 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -605,6 +605,22 @@ config MFD_MX25_TSADC
 	  i.MX25 processors. They consist of a conversion queue for general
 	  purpose ADC and a queue for Touchscreens.
 
+config MFD_PF1550
+	tristate "NXP PF1550 PMIC Support"
+	depends on I2C=y && OF
+	select MFD_CORE
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	help
+	  Say yes here to add support for NXP PF1550. This is a companion Power
+	  Management IC with regulators, onkey, and charger control on chip.
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the functionality
+	  of the device.
+
+	  This driver can also be built as a module and if so will be called
+	  pf1550.
+
 config MFD_HI6421_PMIC
 	tristate "HiSilicon Hi6421 PMU/Codec IC"
 	depends on OF
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 865e9f12faff..566952f191b5 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -122,6 +122,8 @@ obj-$(CONFIG_MFD_MC13XXX)	+= mc13xxx-core.o
 obj-$(CONFIG_MFD_MC13XXX_SPI)	+= mc13xxx-spi.o
 obj-$(CONFIG_MFD_MC13XXX_I2C)	+= mc13xxx-i2c.o
 
+obj-$(CONFIG_MFD_PF1550)	+= pf1550.o
+
 obj-$(CONFIG_MFD_NCT6694)	+= nct6694.o
 
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
diff --git a/drivers/mfd/pf1550.c b/drivers/mfd/pf1550.c
new file mode 100644
index 000000000000..c4f567c05564
--- /dev/null
+++ b/drivers/mfd/pf1550.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Core driver for the PF1550
+ *
+ * Copyright (C) 2016 Freescale Semiconductor, Inc.
+ * Robin Gong <yibin.gong@freescale.com>
+ *
+ * Portions Copyright (c) 2025 Savoir-faire Linux Inc.
+ * Samuel Kayode <samuel.kayode@savoirfairelinux.com>
+ */
+
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/pf1550.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+
+static const struct regmap_config pf1550_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = PF1550_PMIC_REG_END,
+};
+
+static const struct regmap_irq pf1550_irqs[] = {
+	REGMAP_IRQ_REG(PF1550_IRQ_CHG, 0, IRQ_CHG),
+	REGMAP_IRQ_REG(PF1550_IRQ_REGULATOR, 0, IRQ_REGULATOR),
+	REGMAP_IRQ_REG(PF1550_IRQ_ONKEY, 0, IRQ_ONKEY),
+};
+
+static const struct regmap_irq_chip pf1550_irq_chip = {
+	.name = "pf1550",
+	.status_base = PF1550_PMIC_REG_INT_CATEGORY,
+	.init_ack_masked = 1,
+	.num_regs = 1,
+	.irqs = pf1550_irqs,
+	.num_irqs = ARRAY_SIZE(pf1550_irqs),
+};
+
+static const struct regmap_irq pf1550_regulator_irqs[] = {
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW1_LS, 0, PMIC_IRQ_SW1_LS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW2_LS, 0, PMIC_IRQ_SW2_LS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW3_LS, 0, PMIC_IRQ_SW3_LS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW1_HS, 3, PMIC_IRQ_SW1_HS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW2_HS, 3, PMIC_IRQ_SW2_HS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW3_HS, 3, PMIC_IRQ_SW3_HS),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO1_FAULT, 16, PMIC_IRQ_LDO1_FAULT),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO2_FAULT, 16, PMIC_IRQ_LDO2_FAULT),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO3_FAULT, 16, PMIC_IRQ_LDO3_FAULT),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_TEMP_110, 24, PMIC_IRQ_TEMP_110),
+	REGMAP_IRQ_REG(PF1550_PMIC_IRQ_TEMP_125, 24, PMIC_IRQ_TEMP_125),
+};
+
+static const struct regmap_irq_chip pf1550_regulator_irq_chip = {
+	.name = "pf1550-regulator",
+	.status_base = PF1550_PMIC_REG_SW_INT_STAT0,
+	.ack_base = PF1550_PMIC_REG_SW_INT_STAT0,
+	.mask_base = PF1550_PMIC_REG_SW_INT_MASK0,
+	.use_ack = 1,
+	.init_ack_masked = 1,
+	.num_regs = 25,
+	.irqs = pf1550_regulator_irqs,
+	.num_irqs = ARRAY_SIZE(pf1550_regulator_irqs),
+};
+
+static const struct resource regulator_resources[] = {
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW1_LS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW2_LS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW3_LS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW1_HS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW2_HS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW3_HS),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO1_FAULT),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO2_FAULT),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO3_FAULT),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_TEMP_110),
+	DEFINE_RES_IRQ(PF1550_PMIC_IRQ_TEMP_125),
+};
+
+static const struct regmap_irq pf1550_onkey_irqs[] = {
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_PUSHI, 0, ONKEY_IRQ_PUSHI),
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_1SI, 0, ONKEY_IRQ_1SI),
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_2SI, 0, ONKEY_IRQ_2SI),
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_3SI, 0, ONKEY_IRQ_3SI),
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_4SI, 0, ONKEY_IRQ_4SI),
+	REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_8SI, 0, ONKEY_IRQ_8SI),
+};
+
+static const struct regmap_irq_chip pf1550_onkey_irq_chip = {
+	.name = "pf1550-onkey",
+	.status_base = PF1550_PMIC_REG_ONKEY_INT_STAT0,
+	.ack_base = PF1550_PMIC_REG_ONKEY_INT_STAT0,
+	.mask_base = PF1550_PMIC_REG_ONKEY_INT_MASK0,
+	.use_ack = 1,
+	.init_ack_masked = 1,
+	.num_regs = 1,
+	.irqs = pf1550_onkey_irqs,
+	.num_irqs = ARRAY_SIZE(pf1550_onkey_irqs),
+};
+
+static const struct resource onkey_resources[] = {
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_PUSHI),
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_1SI),
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_2SI),
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_3SI),
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_4SI),
+	DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_8SI),
+};
+
+static const struct regmap_irq pf1550_charger_irqs[] = {
+	REGMAP_IRQ_REG(PF1550_CHARG_IRQ_BAT2SOCI, 0, CHARG_IRQ_BAT2SOCI),
+	REGMAP_IRQ_REG(PF1550_CHARG_IRQ_BATI, 0, CHARG_IRQ_BATI),
+	REGMAP_IRQ_REG(PF1550_CHARG_IRQ_CHGI, 0, CHARG_IRQ_CHGI),
+	REGMAP_IRQ_REG(PF1550_CHARG_IRQ_VBUSI, 0, CHARG_IRQ_VBUSI),
+	REGMAP_IRQ_REG(PF1550_CHARG_IRQ_THMI, 0, CHARG_IRQ_THMI),
+};
+
+static const struct regmap_irq_chip pf1550_charger_irq_chip = {
+	.name = "pf1550-charger",
+	.status_base = PF1550_CHARG_REG_CHG_INT,
+	.ack_base = PF1550_CHARG_REG_CHG_INT,
+	.mask_base = PF1550_CHARG_REG_CHG_INT_MASK,
+	.use_ack = 1,
+	.init_ack_masked = 1,
+	.num_regs = 1,
+	.irqs = pf1550_charger_irqs,
+	.num_irqs = ARRAY_SIZE(pf1550_charger_irqs),
+};
+
+static const struct resource charger_resources[] = {
+	DEFINE_RES_IRQ(PF1550_CHARG_IRQ_BAT2SOCI),
+	DEFINE_RES_IRQ(PF1550_CHARG_IRQ_BATI),
+	DEFINE_RES_IRQ(PF1550_CHARG_IRQ_CHGI),
+	DEFINE_RES_IRQ(PF1550_CHARG_IRQ_VBUSI),
+	DEFINE_RES_IRQ(PF1550_CHARG_IRQ_THMI),
+};
+
+static const struct mfd_cell pf1550_regulator_cell = {
+	.name = "pf1550-regulator",
+	.num_resources = ARRAY_SIZE(regulator_resources),
+	.resources = regulator_resources,
+};
+
+static const struct mfd_cell pf1550_onkey_cell = {
+	.name = "pf1550-onkey",
+	.num_resources = ARRAY_SIZE(onkey_resources),
+	.resources = onkey_resources,
+};
+
+static const struct mfd_cell pf1550_charger_cell = {
+	.name = "pf1550-charger",
+	.num_resources = ARRAY_SIZE(charger_resources),
+	.resources = charger_resources,
+};
+
+/*
+ * The PF1550 is shipped in variants of A0, A1,...A9. Each variant defines a
+ * configuration of the PMIC in a One-Time Programmable (OTP) memory.
+ * This memory is accessed indirectly by writing valid keys to specific
+ * registers of the PMIC. To read the OTP memory after writing the valid keys,
+ * the OTP register address to be read is written to pf1550 register 0xc4 and
+ * its value read from pf1550 register 0xc5.
+ */
+static int pf1550_read_otp(const struct pf1550_ddata *pf1550, unsigned int index,
+			   unsigned int *val)
+{
+	int ret = 0;
+
+	ret = regmap_write(pf1550->regmap, PF1550_PMIC_REG_KEY, PF1550_OTP_PMIC_KEY);
+	if (ret)
+		goto read_err;
+
+	ret = regmap_write(pf1550->regmap, PF1550_CHARG_REG_CHGR_KEY2, PF1550_OTP_CHGR_KEY);
+	if (ret)
+		goto read_err;
+
+	ret = regmap_write(pf1550->regmap, PF1550_TEST_REG_KEY3, PF1550_OTP_TEST_KEY);
+	if (ret)
+		goto read_err;
+
+	ret = regmap_write(pf1550->regmap, PF1550_TEST_REG_FMRADDR, index);
+	if (ret)
+		goto read_err;
+
+	ret = regmap_read(pf1550->regmap, PF1550_TEST_REG_FMRDATA, val);
+	if (ret)
+		goto read_err;
+
+	return 0;
+
+read_err:
+	return dev_err_probe(pf1550->dev, ret, "OTP reg %x not found!\n", index);
+}
+
+static int pf1550_i2c_probe(struct i2c_client *i2c)
+{
+	const struct mfd_cell *regulator = &pf1550_regulator_cell;
+	const struct mfd_cell *charger = &pf1550_charger_cell;
+	const struct mfd_cell *onkey = &pf1550_onkey_cell;
+	unsigned int reg_data = 0, otp_data = 0;
+	struct pf1550_ddata *pf1550;
+	struct irq_domain *domain;
+	int irq, ret = 0;
+
+	pf1550 = devm_kzalloc(&i2c->dev, sizeof(*pf1550), GFP_KERNEL);
+	if (!pf1550)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, pf1550);
+	pf1550->dev = &i2c->dev;
+	pf1550->irq = i2c->irq;
+
+	pf1550->regmap = devm_regmap_init_i2c(i2c, &pf1550_regmap_config);
+	if (IS_ERR(pf1550->regmap))
+		return dev_err_probe(pf1550->dev, PTR_ERR(pf1550->regmap),
+				     "failed to allocate register map\n");
+
+	ret = regmap_read(pf1550->regmap, PF1550_PMIC_REG_DEVICE_ID, &reg_data);
+	if (ret < 0)
+		return dev_err_probe(pf1550->dev, ret, "cannot read chip ID\n");
+	if (reg_data != PF1550_DEVICE_ID)
+		return dev_err_probe(pf1550->dev, -ENODEV, "invalid device ID: 0x%02x\n", reg_data);
+
+	/* Regulator DVS for SW2 */
+	ret = pf1550_read_otp(pf1550, PF1550_OTP_SW2_SW3, &otp_data);
+	if (ret)
+		return ret;
+
+	/* When clear, DVS should be enabled */
+	if (!(otp_data & OTP_SW2_DVS_ENB))
+		pf1550->dvs2_enable = true;
+
+	/* Regulator DVS for SW1 */
+	ret = pf1550_read_otp(pf1550, PF1550_OTP_SW1_SW2, &otp_data);
+	if (ret)
+		return ret;
+
+	if (!(otp_data & OTP_SW1_DVS_ENB))
+		pf1550->dvs1_enable = true;
+
+	/* Add top level interrupts */
+	ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, pf1550->irq,
+				       IRQF_ONESHOT | IRQF_SHARED |
+				       IRQF_TRIGGER_FALLING,
+				       0, &pf1550_irq_chip,
+				       &pf1550->irq_data);
+	if (ret)
+		return ret;
+
+	/* Add regulator */
+	irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_REGULATOR);
+	if (irq < 0)
+		return dev_err_probe(pf1550->dev, irq,
+				     "Failed to get parent vIRQ(%d) for chip %s\n",
+				     PF1550_IRQ_REGULATOR, pf1550_irq_chip.name);
+
+	ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq,
+				       IRQF_ONESHOT | IRQF_SHARED |
+				       IRQF_TRIGGER_FALLING, 0,
+				       &pf1550_regulator_irq_chip,
+				       &pf1550->irq_data_regulator);
+	if (ret)
+		return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n",
+				     pf1550_regulator_irq_chip.name);
+
+	domain = regmap_irq_get_domain(pf1550->irq_data_regulator);
+
+	ret = devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, regulator, 1, NULL, 0, domain);
+	if (ret)
+		return ret;
+
+	/* Add onkey */
+	irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_ONKEY);
+	if (irq < 0)
+		return dev_err_probe(pf1550->dev, irq,
+				     "Failed to get parent vIRQ(%d) for chip %s\n",
+				     PF1550_IRQ_ONKEY, pf1550_irq_chip.name);
+
+	ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq,
+				       IRQF_ONESHOT | IRQF_SHARED |
+				       IRQF_TRIGGER_FALLING, 0,
+				       &pf1550_onkey_irq_chip,
+				       &pf1550->irq_data_onkey);
+	if (ret)
+		return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n",
+				     pf1550_onkey_irq_chip.name);
+
+	domain = regmap_irq_get_domain(pf1550->irq_data_onkey);
+
+	ret = devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, onkey, 1, NULL, 0, domain);
+	if (ret)
+		return ret;
+
+	/* Add battery charger */
+	irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_CHG);
+	if (irq < 0)
+		return dev_err_probe(pf1550->dev, irq,
+				     "Failed to get parent vIRQ(%d) for chip %s\n",
+				     PF1550_IRQ_CHG, pf1550_irq_chip.name);
+
+	ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq,
+				       IRQF_ONESHOT | IRQF_SHARED |
+				       IRQF_TRIGGER_FALLING, 0,
+				       &pf1550_charger_irq_chip,
+				       &pf1550->irq_data_charger);
+	if (ret)
+		return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n",
+				     pf1550_charger_irq_chip.name);
+
+	domain = regmap_irq_get_domain(pf1550->irq_data_charger);
+
+	return devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, charger, 1, NULL, 0, domain);
+}
+
+static int pf1550_suspend(struct device *dev)
+{
+	struct pf1550_ddata *pf1550 = dev_get_drvdata(dev);
+
+	if (device_may_wakeup(dev)) {
+		enable_irq_wake(pf1550->irq);
+		disable_irq(pf1550->irq);
+	}
+
+	return 0;
+}
+
+static int pf1550_resume(struct device *dev)
+{
+	struct pf1550_ddata *pf1550 = dev_get_drvdata(dev);
+
+	if (device_may_wakeup(dev)) {
+		disable_irq_wake(pf1550->irq);
+		enable_irq(pf1550->irq);
+	}
+
+	return 0;
+}
+static DEFINE_SIMPLE_DEV_PM_OPS(pf1550_pm, pf1550_suspend, pf1550_resume);
+
+static const struct i2c_device_id pf1550_i2c_id[] = {
+	{ "pf1550" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(i2c, pf1550_i2c_id);
+
+static const struct of_device_id pf1550_dt_match[] = {
+	{ .compatible = "nxp,pf1550" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, pf1550_dt_match);
+
+static struct i2c_driver pf1550_i2c_driver = {
+	.driver = {
+		   .name = "pf1550",
+		   .pm = pm_sleep_ptr(&pf1550_pm),
+		   .of_match_table = pf1550_dt_match,
+	},
+	.probe = pf1550_i2c_probe,
+	.id_table = pf1550_i2c_id,
+};
+module_i2c_driver(pf1550_i2c_driver);
+
+MODULE_DESCRIPTION("NXP PF1550 core driver");
+MODULE_AUTHOR("Robin Gong <yibin.gong@freescale.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/pf1550.h b/include/linux/mfd/pf1550.h
new file mode 100644
index 000000000000..7cb2340ff2bd
--- /dev/null
+++ b/include/linux/mfd/pf1550.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Declarations for the PF1550 PMIC
+ *
+ * Copyright (C) 2016 Freescale Semiconductor, Inc.
+ * Robin Gong <yibin.gong@freescale.com>
+ *
+ * Portions Copyright (c) 2025 Savoir-faire Linux Inc.
+ * Samuel Kayode <samuel.kayode@savoirfairelinux.com>
+ */
+
+#ifndef __LINUX_MFD_PF1550_H
+#define __LINUX_MFD_PF1550_H
+
+#include <linux/i2c.h>
+#include <linux/regmap.h>
+
+enum pf1550_pmic_reg {
+	/* PMIC regulator part */
+	PF1550_PMIC_REG_DEVICE_ID		= 0x00,
+	PF1550_PMIC_REG_OTP_FLAVOR		= 0x01,
+	PF1550_PMIC_REG_SILICON_REV		= 0x02,
+
+	PF1550_PMIC_REG_INT_CATEGORY		= 0x06,
+	PF1550_PMIC_REG_SW_INT_STAT0		= 0x08,
+	PF1550_PMIC_REG_SW_INT_MASK0		= 0x09,
+	PF1550_PMIC_REG_SW_INT_SENSE0		= 0x0a,
+	PF1550_PMIC_REG_SW_INT_STAT1		= 0x0b,
+	PF1550_PMIC_REG_SW_INT_MASK1		= 0x0c,
+	PF1550_PMIC_REG_SW_INT_SENSE1		= 0x0d,
+	PF1550_PMIC_REG_SW_INT_STAT2		= 0x0e,
+	PF1550_PMIC_REG_SW_INT_MASK2		= 0x0f,
+	PF1550_PMIC_REG_SW_INT_SENSE2		= 0x10,
+	PF1550_PMIC_REG_LDO_INT_STAT0		= 0x18,
+	PF1550_PMIC_REG_LDO_INT_MASK0		= 0x19,
+	PF1550_PMIC_REG_LDO_INT_SENSE0		= 0x1a,
+	PF1550_PMIC_REG_TEMP_INT_STAT0		= 0x20,
+	PF1550_PMIC_REG_TEMP_INT_MASK0		= 0x21,
+	PF1550_PMIC_REG_TEMP_INT_SENSE0		= 0x22,
+	PF1550_PMIC_REG_ONKEY_INT_STAT0		= 0x24,
+	PF1550_PMIC_REG_ONKEY_INT_MASK0		= 0x25,
+	PF1550_PMIC_REG_ONKEY_INT_SENSE0	= 0x26,
+	PF1550_PMIC_REG_MISC_INT_STAT0		= 0x28,
+	PF1550_PMIC_REG_MISC_INT_MASK0		= 0x29,
+	PF1550_PMIC_REG_MISC_INT_SENSE0		= 0x2a,
+
+	PF1550_PMIC_REG_COINCELL_CONTROL	= 0x30,
+
+	PF1550_PMIC_REG_SW1_VOLT		= 0x32,
+	PF1550_PMIC_REG_SW1_STBY_VOLT		= 0x33,
+	PF1550_PMIC_REG_SW1_SLP_VOLT		= 0x34,
+	PF1550_PMIC_REG_SW1_CTRL		= 0x35,
+	PF1550_PMIC_REG_SW1_CTRL1		= 0x36,
+	PF1550_PMIC_REG_SW2_VOLT		= 0x38,
+	PF1550_PMIC_REG_SW2_STBY_VOLT		= 0x39,
+	PF1550_PMIC_REG_SW2_SLP_VOLT		= 0x3a,
+	PF1550_PMIC_REG_SW2_CTRL		= 0x3b,
+	PF1550_PMIC_REG_SW2_CTRL1		= 0x3c,
+	PF1550_PMIC_REG_SW3_VOLT		= 0x3e,
+	PF1550_PMIC_REG_SW3_STBY_VOLT		= 0x3f,
+	PF1550_PMIC_REG_SW3_SLP_VOLT		= 0x40,
+	PF1550_PMIC_REG_SW3_CTRL		= 0x41,
+	PF1550_PMIC_REG_SW3_CTRL1		= 0x42,
+	PF1550_PMIC_REG_VSNVS_CTRL		= 0x48,
+	PF1550_PMIC_REG_VREFDDR_CTRL		= 0x4a,
+	PF1550_PMIC_REG_LDO1_VOLT		= 0x4c,
+	PF1550_PMIC_REG_LDO1_CTRL		= 0x4d,
+	PF1550_PMIC_REG_LDO2_VOLT		= 0x4f,
+	PF1550_PMIC_REG_LDO2_CTRL		= 0x50,
+	PF1550_PMIC_REG_LDO3_VOLT		= 0x52,
+	PF1550_PMIC_REG_LDO3_CTRL		= 0x53,
+	PF1550_PMIC_REG_PWRCTRL0		= 0x58,
+	PF1550_PMIC_REG_PWRCTRL1		= 0x59,
+	PF1550_PMIC_REG_PWRCTRL2		= 0x5a,
+	PF1550_PMIC_REG_PWRCTRL3		= 0x5b,
+	PF1550_PMIC_REG_SW1_PWRDN_SEQ		= 0x5f,
+	PF1550_PMIC_REG_SW2_PWRDN_SEQ		= 0x60,
+	PF1550_PMIC_REG_SW3_PWRDN_SEQ		= 0x61,
+	PF1550_PMIC_REG_LDO1_PWRDN_SEQ		= 0x62,
+	PF1550_PMIC_REG_LDO2_PWRDN_SEQ		= 0x63,
+	PF1550_PMIC_REG_LDO3_PWRDN_SEQ		= 0x64,
+	PF1550_PMIC_REG_VREFDDR_PWRDN_SEQ	= 0x65,
+
+	PF1550_PMIC_REG_STATE_INFO		= 0x67,
+	PF1550_PMIC_REG_I2C_ADDR		= 0x68,
+	PF1550_PMIC_REG_IO_DRV0			= 0x69,
+	PF1550_PMIC_REG_IO_DRV1			= 0x6a,
+	PF1550_PMIC_REG_RC_16MHZ		= 0x6b,
+	PF1550_PMIC_REG_KEY			= 0x6f,
+
+	/* Charger part */
+	PF1550_CHARG_REG_CHG_INT		= 0x80,
+	PF1550_CHARG_REG_CHG_INT_MASK		= 0x82,
+	PF1550_CHARG_REG_CHG_INT_OK		= 0x84,
+	PF1550_CHARG_REG_VBUS_SNS		= 0x86,
+	PF1550_CHARG_REG_CHG_SNS		= 0x87,
+	PF1550_CHARG_REG_BATT_SNS		= 0x88,
+	PF1550_CHARG_REG_CHG_OPER		= 0x89,
+	PF1550_CHARG_REG_CHG_TMR		= 0x8a,
+	PF1550_CHARG_REG_CHG_EOC_CNFG		= 0x8d,
+	PF1550_CHARG_REG_CHG_CURR_CNFG		= 0x8e,
+	PF1550_CHARG_REG_BATT_REG		= 0x8f,
+	PF1550_CHARG_REG_BATFET_CNFG		= 0x91,
+	PF1550_CHARG_REG_THM_REG_CNFG		= 0x92,
+	PF1550_CHARG_REG_VBUS_INLIM_CNFG	= 0x94,
+	PF1550_CHARG_REG_VBUS_LIN_DPM		= 0x95,
+	PF1550_CHARG_REG_USB_PHY_LDO_CNFG	= 0x96,
+	PF1550_CHARG_REG_DBNC_DELAY_TIME	= 0x98,
+	PF1550_CHARG_REG_CHG_INT_CNFG		= 0x99,
+	PF1550_CHARG_REG_THM_ADJ_SETTING	= 0x9a,
+	PF1550_CHARG_REG_VBUS2SYS_CNFG		= 0x9b,
+	PF1550_CHARG_REG_LED_PWM		= 0x9c,
+	PF1550_CHARG_REG_FAULT_BATFET_CNFG	= 0x9d,
+	PF1550_CHARG_REG_LED_CNFG		= 0x9e,
+	PF1550_CHARG_REG_CHGR_KEY2		= 0x9f,
+
+	PF1550_TEST_REG_FMRADDR			= 0xc4,
+	PF1550_TEST_REG_FMRDATA			= 0xc5,
+	PF1550_TEST_REG_KEY3			= 0xdf,
+
+	PF1550_PMIC_REG_END			= 0xff,
+};
+
+/* One-Time Programmable(OTP) memory */
+enum pf1550_otp_reg {
+	PF1550_OTP_SW1_SW2			= 0x1e,
+	PF1550_OTP_SW2_SW3			= 0x1f,
+};
+
+#define PF1550_DEVICE_ID		0x7c
+
+/* Keys for reading OTP */
+#define PF1550_OTP_PMIC_KEY		0x15
+#define PF1550_OTP_CHGR_KEY		0x50
+#define PF1550_OTP_TEST_KEY		0xab
+
+/* Supported charger modes */
+#define PF1550_CHG_BAT_OFF		1
+#define PF1550_CHG_BAT_ON		2
+
+#define PF1550_CHG_PRECHARGE		0
+#define PF1550_CHG_CONSTANT_CURRENT	1
+#define PF1550_CHG_CONSTANT_VOL		2
+#define PF1550_CHG_EOC			3
+#define PF1550_CHG_DONE			4
+#define PF1550_CHG_TIMER_FAULT		6
+#define PF1550_CHG_SUSPEND		7
+#define PF1550_CHG_OFF_INV		8
+#define PF1550_CHG_BAT_OVER		9
+#define PF1550_CHG_OFF_TEMP		10
+#define PF1550_CHG_LINEAR_ONLY		12
+#define PF1550_CHG_SNS_MASK		0xf
+#define PF1550_CHG_INT_MASK		0x51
+
+#define PF1550_BAT_NO_VBUS		0
+#define PF1550_BAT_LOW_THAN_PRECHARG	1
+#define PF1550_BAT_CHARG_FAIL		2
+#define PF1550_BAT_HIGH_THAN_PRECHARG	4
+#define PF1550_BAT_OVER_VOL		5
+#define PF1550_BAT_NO_DETECT		6
+#define PF1550_BAT_SNS_MASK		0x7
+
+#define PF1550_VBUS_UVLO		BIT(2)
+#define PF1550_VBUS_IN2SYS		BIT(3)
+#define PF1550_VBUS_OVLO		BIT(4)
+#define PF1550_VBUS_VALID		BIT(5)
+
+#define PF1550_CHARG_REG_BATT_REG_CHGCV_MASK		0x3f
+#define PF1550_CHARG_REG_BATT_REG_VMINSYS_SHIFT		6
+#define PF1550_CHARG_REG_BATT_REG_VMINSYS_MASK		GENMASK(7, 6)
+#define PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_SHIFT	2
+#define PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_MASK	GENMASK(3, 2)
+
+#define PF1550_ONKEY_RST_EN		BIT(7)
+
+/* DVS enable masks */
+#define OTP_SW1_DVS_ENB		BIT(1)
+#define OTP_SW2_DVS_ENB		BIT(3)
+
+/* Top level interrupt masks */
+#define IRQ_REGULATOR		(BIT(1) | BIT(2) | BIT(3) | BIT(4) | BIT(6))
+#define IRQ_ONKEY		BIT(5)
+#define IRQ_CHG			BIT(0)
+
+/* Regulator interrupt masks */
+#define PMIC_IRQ_SW1_LS		BIT(0)
+#define PMIC_IRQ_SW2_LS		BIT(1)
+#define PMIC_IRQ_SW3_LS		BIT(2)
+#define PMIC_IRQ_SW1_HS		BIT(0)
+#define PMIC_IRQ_SW2_HS		BIT(1)
+#define PMIC_IRQ_SW3_HS		BIT(2)
+#define PMIC_IRQ_LDO1_FAULT	BIT(0)
+#define PMIC_IRQ_LDO2_FAULT	BIT(1)
+#define PMIC_IRQ_LDO3_FAULT	BIT(2)
+#define PMIC_IRQ_TEMP_110	BIT(0)
+#define PMIC_IRQ_TEMP_125	BIT(1)
+
+/* Onkey interrupt masks */
+#define ONKEY_IRQ_PUSHI		BIT(0)
+#define ONKEY_IRQ_1SI		BIT(1)
+#define ONKEY_IRQ_2SI		BIT(2)
+#define ONKEY_IRQ_3SI		BIT(3)
+#define ONKEY_IRQ_4SI		BIT(4)
+#define ONKEY_IRQ_8SI		BIT(5)
+
+/* Charger interrupt masks */
+#define CHARG_IRQ_BAT2SOCI	BIT(1)
+#define CHARG_IRQ_BATI		BIT(2)
+#define CHARG_IRQ_CHGI		BIT(3)
+#define CHARG_IRQ_VBUSI		BIT(5)
+#define CHARG_IRQ_DPMI		BIT(6)
+#define CHARG_IRQ_THMI		BIT(7)
+
+enum pf1550_irq {
+	PF1550_IRQ_CHG,
+	PF1550_IRQ_REGULATOR,
+	PF1550_IRQ_ONKEY,
+};
+
+enum pf1550_pmic_irq {
+	PF1550_PMIC_IRQ_SW1_LS,
+	PF1550_PMIC_IRQ_SW2_LS,
+	PF1550_PMIC_IRQ_SW3_LS,
+	PF1550_PMIC_IRQ_SW1_HS,
+	PF1550_PMIC_IRQ_SW2_HS,
+	PF1550_PMIC_IRQ_SW3_HS,
+	PF1550_PMIC_IRQ_LDO1_FAULT,
+	PF1550_PMIC_IRQ_LDO2_FAULT,
+	PF1550_PMIC_IRQ_LDO3_FAULT,
+	PF1550_PMIC_IRQ_TEMP_110,
+	PF1550_PMIC_IRQ_TEMP_125,
+};
+
+enum pf1550_onkey_irq {
+	PF1550_ONKEY_IRQ_PUSHI,
+	PF1550_ONKEY_IRQ_1SI,
+	PF1550_ONKEY_IRQ_2SI,
+	PF1550_ONKEY_IRQ_3SI,
+	PF1550_ONKEY_IRQ_4SI,
+	PF1550_ONKEY_IRQ_8SI,
+};
+
+enum pf1550_charg_irq {
+	PF1550_CHARG_IRQ_BAT2SOCI,
+	PF1550_CHARG_IRQ_BATI,
+	PF1550_CHARG_IRQ_CHGI,
+	PF1550_CHARG_IRQ_VBUSI,
+	PF1550_CHARG_IRQ_THMI,
+};
+
+enum pf1550_regulators {
+	PF1550_SW1,
+	PF1550_SW2,
+	PF1550_SW3,
+	PF1550_VREFDDR,
+	PF1550_LDO1,
+	PF1550_LDO2,
+	PF1550_LDO3,
+};
+
+struct pf1550_ddata {
+	struct regmap_irq_chip_data *irq_data_regulator;
+	struct regmap_irq_chip_data *irq_data_charger;
+	struct regmap_irq_chip_data *irq_data_onkey;
+	struct regmap_irq_chip_data *irq_data;
+	struct regmap *regmap;
+	struct device *dev;
+	bool dvs1_enable;
+	bool dvs2_enable;
+	int irq;
+};
+
+#endif /* __LINUX_MFD_PF1550_H */
-- 
cgit v1.2.3


From f7d72d0b3f438b881dba16c7c00493f16e41a821 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Sun, 19 Oct 2025 20:21:30 +0000
Subject: bpf: save the start of functions in bpf_prog_aux

Introduce a new subprog_start field in bpf_prog_aux. This field may
be used by JIT compilers wanting to know the real absolute xlated
offset of the function being jitted. The func_info[func_id] may have
served this purpose, but func_info may be NULL, so JIT compilers
can't rely on it.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-3-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 204f9c759a41..3bda915cd7a8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1623,6 +1623,7 @@ struct bpf_prog_aux {
 	u32 ctx_arg_info_size;
 	u32 max_rdonly_access;
 	u32 max_rdwr_access;
+	u32 subprog_start;
 	struct btf *attach_btf;
 	struct bpf_ctx_arg_aux *ctx_arg_info;
 	void __percpu *priv_stack_ptr;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 80c99ef4cac5..4579082068ca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21607,6 +21607,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->func_idx = i;
 		/* Below members will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->subprog_start = subprog_start;
 		func[i]->aux->func_info = prog->aux->func_info;
 		func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
 		func[i]->aux->poke_tab = prog->aux->poke_tab;
-- 
cgit v1.2.3


From 44481e4925327d833f2e37c8741406e4cabfe054 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Sun, 19 Oct 2025 20:21:31 +0000
Subject: bpf: generalize and export map_get_next_key for arrays

The kernel/bpf/arraymap.c file defines the array_map_get_next_key()
function which finds the next key for array maps. It actually doesn't
use any map fields besides the generic max_entries field. Generalize
it, and export it as bpf_array_get_next_key() so that it can be
re-used by other array-like maps.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  6 ++++++
 kernel/bpf/arraymap.c | 19 +++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3bda915cd7a8..e53cda0aabb6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2107,6 +2107,12 @@ struct bpf_array {
 	};
 };
 
+/*
+ * The bpf_array_get_next_key() function may be used for all array-like
+ * maps, i.e., maps with u32 keys with range [0 ,..., max_entries)
+ */
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key);
+
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0ba790c2d2e5..1eeb31c5b317 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
 }
 
 /* Called from syscall */
-static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
-	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = key ? *(u32 *)key : U32_MAX;
 	u32 *next = (u32 *)next_key;
 
-	if (index >= array->map.max_entries) {
+	if (index >= map->max_entries) {
 		*next = 0;
 		return 0;
 	}
 
-	if (index == array->map.max_entries - 1)
+	if (index == map->max_entries - 1)
 		return -ENOENT;
 
 	*next = index + 1;
@@ -789,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_release_uref = array_map_free_internal_structs,
 	.map_lookup_elem = array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
@@ -815,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_gen_lookup = percpu_array_map_gen_lookup,
 	.map_update_elem = array_map_update_elem,
@@ -1204,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_poke_track = prog_array_map_poke_track,
 	.map_poke_untrack = prog_array_map_poke_untrack,
 	.map_poke_run = prog_array_map_poke_run,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
@@ -1308,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = perf_event_fd_array_map_free,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
@@ -1344,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
@@ -1429,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
-	.map_get_next_key = array_map_get_next_key,
+	.map_get_next_key = bpf_array_get_next_key,
 	.map_lookup_elem = array_of_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
-- 
cgit v1.2.3


From 2f69c5685427308d2f312646779313f3677536bc Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Sun, 19 Oct 2025 20:21:37 +0000
Subject: bpf: make bpf_insn_successors to return a pointer

The bpf_insn_successors() function is used to return the successors
of a BPF instruction. So far, an instruction could have 0, 1 or 2
successors. Prepare the verifier code for the introduction of
instructions with more than 2 successors (namely, indirect jumps).

To do this, introduce a new struct, struct bpf_iarray, containing
an array of BPF instruction indexes, and make bpf_insn_successors()
return a pointer of that type. The storage is allocated once in
env->succ, which holds an array of size 2 and is reused for all
instructions.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-10-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 12 ++++++++-
 kernel/bpf/liveness.c        | 36 +++++++++++++++++---------
 kernel/bpf/verifier.c        | 60 +++++++++++++++++++++++++++++---------------
 3 files changed, 75 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b57222a25a4a..c6eb68b6389c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -509,6 +509,15 @@ struct bpf_map_ptr_state {
 #define BPF_ALU_SANITIZE		(BPF_ALU_SANITIZE_SRC | \
 					 BPF_ALU_SANITIZE_DST)
 
+/*
+ * An array of BPF instructions.
+ * Primary usage: return value of bpf_insn_successors.
+ */
+struct bpf_iarray {
+	int cnt;
+	u32 items[];
+};
+
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
@@ -828,6 +837,7 @@ struct bpf_verifier_env {
 	/* array of pointers to bpf_scc_info indexed by SCC id */
 	struct bpf_scc_info **scc_info;
 	u32 scc_cnt;
+	struct bpf_iarray *succ;
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
@@ -1050,7 +1060,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
 
 struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off);
 int bpf_jmp_offset(struct bpf_insn *insn);
-int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]);
+struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx);
 void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask);
 bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx);
 
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index baa742e6cbb6..bffb495bc933 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -34,7 +34,7 @@
  *   - read and write marks propagation.
  * - The propagation phase is a textbook live variable data flow analysis:
  *
- *     state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ *     state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
  *     state[cc, i].live_before =
  *       (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
  *
@@ -54,7 +54,7 @@
  *   The equation for "must_write_acc" propagation looks as follows:
  *
  *     state[cc, i].must_write_acc =
- *       ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ *       ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
  *       U state[cc, i].must_write
  *
  *   (An intersection of all "must_write_acc" for instruction successors
@@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn)
 __diag_push();
 __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
 
-inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+/*
+ * Returns an array of instructions succ, with succ->items[0], ...,
+ * succ->items[n-1] with successor instructions, where n=succ->cnt
+ */
+inline struct bpf_iarray *
+bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
 {
 	static const struct opcode_info {
 		bool can_jump;
@@ -474,19 +479,25 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
 		_J(BPF_JSET,  {.can_jump = true,  .can_fallthrough = true}),
 	#undef _J
 	};
+	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = &prog->insnsi[idx];
 	const struct opcode_info *opcode_info;
-	int i = 0, insn_sz;
+	struct bpf_iarray *succ;
+	int insn_sz;
+
+	/* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
+	succ = env->succ;
+	succ->cnt = 0;
 
 	opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
 	insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
 	if (opcode_info->can_fallthrough)
-		succ[i++] = idx + insn_sz;
+		succ->items[succ->cnt++] = idx + insn_sz;
 
 	if (opcode_info->can_jump)
-		succ[i++] = idx + bpf_jmp_offset(insn) + 1;
+		succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
 
-	return i;
+	return succ;
 }
 
 __diag_pop();
@@ -548,11 +559,12 @@ static inline bool update_insn(struct bpf_verifier_env *env,
 	struct bpf_insn_aux_data *aux = env->insn_aux_data;
 	u64 new_before, new_after, must_write_acc;
 	struct per_frame_masks *insn, *succ_insn;
-	u32 succ_num, s, succ[2];
+	struct bpf_iarray *succ;
+	u32 s;
 	bool changed;
 
-	succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
-	if (unlikely(succ_num == 0))
+	succ = bpf_insn_successors(env, insn_idx);
+	if (succ->cnt == 0)
 		return false;
 
 	changed = false;
@@ -564,8 +576,8 @@ static inline bool update_insn(struct bpf_verifier_env *env,
 	 * of successors plus all "must_write" slots of instruction itself.
 	 */
 	must_write_acc = U64_MAX;
-	for (s = 0; s < succ_num; ++s) {
-		succ_insn = get_frame_masks(instance, frame, succ[s]);
+	for (s = 0; s < succ->cnt; ++s) {
+		succ_insn = get_frame_masks(instance, frame, succ->items[s]);
 		new_after |= succ_insn->live_before;
 		must_write_acc &= succ_insn->must_write_acc;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4579082068ca..6d175849e57a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17805,6 +17805,22 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env)
 	return 0;
 }
 
+static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+	size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+	struct bpf_iarray *new;
+
+	new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+	if (!new) {
+		/* this is what callers always want, so simplify the call site */
+		kvfree(old);
+		return NULL;
+	}
+
+	new->cnt = n_elem;
+	return new;
+}
+
 /* Visits the instruction at index t and returns one of the following:
  *  < 0 - an error occurred
  *  DONE_EXPLORING - the instruction was fully explored
@@ -18025,8 +18041,9 @@ err_free:
  */
 static int compute_postorder(struct bpf_verifier_env *env)
 {
-	u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2];
+	u32 cur_postorder, i, top, stack_sz, s;
 	int *stack = NULL, *postorder = NULL, *state = NULL;
+	struct bpf_iarray *succ;
 
 	postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
 	state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
@@ -18050,11 +18067,11 @@ static int compute_postorder(struct bpf_verifier_env *env)
 				stack_sz--;
 				continue;
 			}
-			succ_cnt = bpf_insn_successors(env->prog, top, succ);
-			for (s = 0; s < succ_cnt; ++s) {
-				if (!state[succ[s]]) {
-					stack[stack_sz++] = succ[s];
-					state[succ[s]] |= DISCOVERED;
+			succ = bpf_insn_successors(env, top);
+			for (s = 0; s < succ->cnt; ++s) {
+				if (!state[succ->items[s]]) {
+					stack[stack_sz++] = succ->items[s];
+					state[succ->items[s]] |= DISCOVERED;
 				}
 			}
 			state[top] |= EXPLORED;
@@ -24313,14 +24330,13 @@ static int compute_live_registers(struct bpf_verifier_env *env)
 		for (i = 0; i < env->cfg.cur_postorder; ++i) {
 			int insn_idx = env->cfg.insn_postorder[i];
 			struct insn_live_regs *live = &state[insn_idx];
-			int succ_num;
-			u32 succ[2];
+			struct bpf_iarray *succ;
 			u16 new_out = 0;
 			u16 new_in = 0;
 
-			succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
-			for (int s = 0; s < succ_num; ++s)
-				new_out |= state[succ[s]].in;
+			succ = bpf_insn_successors(env, insn_idx);
+			for (int s = 0; s < succ->cnt; ++s)
+				new_out |= state[succ->items[s]].in;
 			new_in = (new_out & ~live->def) | live->use;
 			if (new_out != live->out || new_in != live->in) {
 				live->in = new_in;
@@ -24373,11 +24389,11 @@ static int compute_scc(struct bpf_verifier_env *env)
 	const u32 insn_cnt = env->prog->len;
 	int stack_sz, dfs_sz, err = 0;
 	u32 *stack, *pre, *low, *dfs;
-	u32 succ_cnt, i, j, t, w;
+	u32 i, j, t, w;
 	u32 next_preorder_num;
 	u32 next_scc_id;
 	bool assign_scc;
-	u32 succ[2];
+	struct bpf_iarray *succ;
 
 	next_preorder_num = 1;
 	next_scc_id = 1;
@@ -24484,12 +24500,12 @@ dfs_continue:
 				stack[stack_sz++] = w;
 			}
 			/* Visit 'w' successors */
-			succ_cnt = bpf_insn_successors(env->prog, w, succ);
-			for (j = 0; j < succ_cnt; ++j) {
-				if (pre[succ[j]]) {
-					low[w] = min(low[w], low[succ[j]]);
+			succ = bpf_insn_successors(env, w);
+			for (j = 0; j < succ->cnt; ++j) {
+				if (pre[succ->items[j]]) {
+					low[w] = min(low[w], low[succ->items[j]]);
 				} else {
-					dfs[dfs_sz++] = succ[j];
+					dfs[dfs_sz++] = succ->items[j];
 					goto dfs_continue;
 				}
 			}
@@ -24506,8 +24522,8 @@ dfs_continue:
 			 * or if component has a self reference.
 			 */
 			assign_scc = stack[stack_sz - 1] != w;
-			for (j = 0; j < succ_cnt; ++j) {
-				if (succ[j] == w) {
+			for (j = 0; j < succ->cnt; ++j) {
+				if (succ->items[j] == w) {
 					assign_scc = true;
 					break;
 				}
@@ -24569,6 +24585,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 		goto err_free_env;
 	for (i = 0; i < len; i++)
 		env->insn_aux_data[i].orig_idx = i;
+	env->succ = iarray_realloc(NULL, 2);
+	if (!env->succ)
+		goto err_free_env;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 
@@ -24817,6 +24836,7 @@ err_free_env:
 	bpf_stack_liveness_free(env);
 	kvfree(env->cfg.insn_postorder);
 	kvfree(env->scc_info);
+	kvfree(env->succ);
 	kvfree(env);
 	return ret;
 }
-- 
cgit v1.2.3


From 28098defc79fe7d29e6bfe4eb6312991f6bdc3d3 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 17 Oct 2025 03:41:52 +0000
Subject: net: add a common function to compute features for upper devices

Some high level software drivers need to compute features from lower
devices. But each has its own implementation and may miss some
feature computations. Let's use one common function to compute
features for these kinds of devices.

The new helper uses the current bond implementation as the reference
one, as the latter already handles all the relevant aspects: netdev
features, TSO limits and dst retention.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251017034155.61990-2-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdev_features.h | 18 +++++++++
 include/linux/netdevice.h       |  1 +
 net/core/dev.c                  | 88 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7a01c518e573..93e4da7046a1 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -255,6 +255,24 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 				 NETIF_F_GSO_UDP_TUNNEL |		\
 				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
+/* virtual device features */
+#define MASTER_UPPER_DEV_VLAN_FEATURES	 (NETIF_F_HW_CSUM | NETIF_F_SG | \
+					  NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \
+					  NETIF_F_GSO_ENCAP_ALL | \
+					  NETIF_F_HIGHDMA | NETIF_F_LRO)
+
+#define MASTER_UPPER_DEV_ENC_FEATURES	 (NETIF_F_HW_CSUM | NETIF_F_SG | \
+					  NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \
+					  NETIF_F_GSO_PARTIAL)
+
+#define MASTER_UPPER_DEV_MPLS_FEATURES	 (NETIF_F_HW_CSUM | NETIF_F_SG | \
+					  NETIF_F_GSO_SOFTWARE)
+
+#define MASTER_UPPER_DEV_XFRM_FEATURES	 (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \
+					  NETIF_F_GSO_ESP)
+
+#define MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP)
+
 static inline netdev_features_t netdev_base_features(netdev_features_t features)
 {
 	features &= ~NETIF_F_ONE_FOR_ALL;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a687444b27..7f5aad5cc9a1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5304,6 +5304,7 @@ static inline netdev_features_t netdev_add_tso_features(netdev_features_t featur
 int __netdev_update_features(struct net_device *dev);
 void netdev_update_features(struct net_device *dev);
 void netdev_change_features(struct net_device *dev);
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header);
 
 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 					struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 9482b905c66a..378c2d010faf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -12693,6 +12693,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 }
 EXPORT_SYMBOL(netdev_increment_features);
 
+/**
+ *	netdev_compute_master_upper_features - compute feature from lowers
+ *	@dev: the upper device
+ *	@update_header: whether to update upper device's header_len/headroom/tailroom
+ *
+ *	Recompute the upper device's feature based on all lower devices.
+ */
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
+{
+	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+	netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
+	netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
+	netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
+	netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
+	netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
+	unsigned short max_header_len = ETH_HLEN;
+	unsigned int tso_max_size = TSO_MAX_SIZE;
+	unsigned short max_headroom = 0;
+	unsigned short max_tailroom = 0;
+	u16 tso_max_segs = TSO_MAX_SEGS;
+	struct net_device *lower_dev;
+	struct list_head *iter;
+
+	mpls_features = netdev_base_features(mpls_features);
+	vlan_features = netdev_base_features(vlan_features);
+	enc_features = netdev_base_features(enc_features);
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter) {
+		gso_partial_features = netdev_increment_features(gso_partial_features,
+								 lower_dev->gso_partial_features,
+								 MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
+
+		vlan_features = netdev_increment_features(vlan_features,
+							  lower_dev->vlan_features,
+							  MASTER_UPPER_DEV_VLAN_FEATURES);
+
+		enc_features = netdev_increment_features(enc_features,
+							 lower_dev->hw_enc_features,
+							 MASTER_UPPER_DEV_ENC_FEATURES);
+
+		if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+			xfrm_features = netdev_increment_features(xfrm_features,
+								  lower_dev->hw_enc_features,
+								  MASTER_UPPER_DEV_XFRM_FEATURES);
+
+		mpls_features = netdev_increment_features(mpls_features,
+							  lower_dev->mpls_features,
+							  MASTER_UPPER_DEV_MPLS_FEATURES);
+
+		dst_release_flag &= lower_dev->priv_flags;
+
+		if (update_header) {
+			max_header_len = max(max_header_len, lower_dev->hard_header_len);
+			max_headroom = max(max_headroom, lower_dev->needed_headroom);
+			max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
+		}
+
+		tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
+		tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
+	}
+
+	dev->gso_partial_features = gso_partial_features;
+	dev->vlan_features = vlan_features;
+	dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+			       NETIF_F_HW_VLAN_CTAG_TX |
+			       NETIF_F_HW_VLAN_STAG_TX;
+	if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+		dev->hw_enc_features |= xfrm_features;
+	dev->mpls_features = mpls_features;
+
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+	    dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+		dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+	if (update_header) {
+		dev->hard_header_len = max_header_len;
+		dev->needed_headroom = max_headroom;
+		dev->needed_tailroom = max_tailroom;
+	}
+
+	netif_set_tso_max_segs(dev, tso_max_segs);
+	netif_set_tso_max_size(dev, tso_max_size);
+
+	netdev_change_features(dev);
+}
+EXPORT_SYMBOL(netdev_compute_master_upper_features);
+
 static struct hlist_head * __net_init netdev_create_hash(void)
 {
 	int i;
-- 
cgit v1.2.3


From ce085ecdba23a5d5462877d884ecff3ffceaad22 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 14 Oct 2025 15:04:25 -0700
Subject: scsi: core: Do not declare scsi_cmnd pointers const

This change allows removing multiple casts and hence improves type
checking by the compiler.

Cc: Hannes Reinecke <hare@suse.de>
Suggested-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Link: https://patch.msgid.link/20251014220426.3690007-1-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_logging.c | 21 ++++++++++-----------
 include/scsi/scsi_dbg.h     |  4 ++--
 include/scsi/scsi_device.h  |  4 ++--
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_logging.c b/drivers/scsi/scsi_logging.c
index b02af340c2d3..3cd0d3074085 100644
--- a/drivers/scsi/scsi_logging.c
+++ b/drivers/scsi/scsi_logging.c
@@ -26,9 +26,9 @@ static void scsi_log_release_buffer(char *bufptr)
 	kfree(bufptr);
 }
 
-static inline const char *scmd_name(const struct scsi_cmnd *scmd)
+static inline const char *scmd_name(struct scsi_cmnd *scmd)
 {
-	struct request *rq = scsi_cmd_to_rq((struct scsi_cmnd *)scmd);
+	const struct request *rq = scsi_cmd_to_rq(scmd);
 
 	if (!rq->q || !rq->q->disk)
 		return NULL;
@@ -80,8 +80,8 @@ void sdev_prefix_printk(const char *level, const struct scsi_device *sdev,
 }
 EXPORT_SYMBOL(sdev_prefix_printk);
 
-void scmd_printk(const char *level, const struct scsi_cmnd *scmd,
-		const char *fmt, ...)
+void scmd_printk(const char *level, struct scsi_cmnd *scmd, const char *fmt,
+		 ...)
 {
 	va_list args;
 	char *logbuf;
@@ -94,7 +94,7 @@ void scmd_printk(const char *level, const struct scsi_cmnd *scmd,
 	if (!logbuf)
 		return;
 	off = sdev_format_header(logbuf, logbuf_len, scmd_name(scmd),
-				 scsi_cmd_to_rq((struct scsi_cmnd *)scmd)->tag);
+				 scsi_cmd_to_rq(scmd)->tag);
 	if (off < logbuf_len) {
 		va_start(args, fmt);
 		off += vscnprintf(logbuf + off, logbuf_len - off, fmt, args);
@@ -371,16 +371,15 @@ void __scsi_print_sense(const struct scsi_device *sdev, const char *name,
 EXPORT_SYMBOL(__scsi_print_sense);
 
 /* Normalize and print sense buffer in SCSI command */
-void scsi_print_sense(const struct scsi_cmnd *cmd)
+void scsi_print_sense(struct scsi_cmnd *cmd)
 {
 	scsi_log_print_sense(cmd->device, scmd_name(cmd),
-			     scsi_cmd_to_rq((struct scsi_cmnd *)cmd)->tag,
-			     cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE);
+			     scsi_cmd_to_rq(cmd)->tag, cmd->sense_buffer,
+			     SCSI_SENSE_BUFFERSIZE);
 }
 EXPORT_SYMBOL(scsi_print_sense);
 
-void scsi_print_result(const struct scsi_cmnd *cmd, const char *msg,
-		       int disposition)
+void scsi_print_result(struct scsi_cmnd *cmd, const char *msg, int disposition)
 {
 	char *logbuf;
 	size_t off, logbuf_len;
@@ -393,7 +392,7 @@ void scsi_print_result(const struct scsi_cmnd *cmd, const char *msg,
 		return;
 
 	off = sdev_format_header(logbuf, logbuf_len, scmd_name(cmd),
-				 scsi_cmd_to_rq((struct scsi_cmnd *)cmd)->tag);
+				 scsi_cmd_to_rq(cmd)->tag);
 
 	if (off >= logbuf_len)
 		goto out_printk;
diff --git a/include/scsi/scsi_dbg.h b/include/scsi/scsi_dbg.h
index bd29cdb513a5..efcdc78530d5 100644
--- a/include/scsi/scsi_dbg.h
+++ b/include/scsi/scsi_dbg.h
@@ -11,11 +11,11 @@ extern size_t __scsi_format_command(char *, size_t,
 				   const unsigned char *, size_t);
 extern void scsi_print_sense_hdr(const struct scsi_device *, const char *,
 				 const struct scsi_sense_hdr *);
-extern void scsi_print_sense(const struct scsi_cmnd *);
+extern void scsi_print_sense(struct scsi_cmnd *);
 extern void __scsi_print_sense(const struct scsi_device *, const char *name,
 			       const unsigned char *sense_buffer,
 			       int sense_len);
-extern void scsi_print_result(const struct scsi_cmnd *, const char *, int);
+extern void scsi_print_result(struct scsi_cmnd *, const char *, int);
 
 #ifdef CONFIG_SCSI_CONSTANTS
 extern bool scsi_opcode_sa_name(int, int, const char **, const char **);
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 6d6500148c4b..4c106342c4ae 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -313,8 +313,8 @@ sdev_prefix_printk(const char *, const struct scsi_device *, const char *,
 #define sdev_printk(l, sdev, fmt, a...)				\
 	sdev_prefix_printk(l, sdev, NULL, fmt, ##a)
 
-__printf(3, 4) void
-scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...);
+__printf(3, 4) void scmd_printk(const char *, struct scsi_cmnd *, const char *,
+				...);
 
 #define scmd_dbg(scmd, fmt, a...)					\
 	do {								\
-- 
cgit v1.2.3


From bfe0d22f12559f44bf27ae88b9c4a9f8fdae65d0 Mon Sep 17 00:00:00 2001
From: Peter Wang <peter.wang@mediatek.com>
Date: Thu, 16 Oct 2025 10:32:31 +0800
Subject: scsi: ufs: core: Update CQ Entry to UFS 4.1 format

Update the completion queue (CQ) entry format according to the UFS 4.1
specification. UFS 4.1 introduces new members in reserved record
DW5. Also refine DW4 with detailed members defined in UFS 4.0. Modify
the code to incorporate these changes by updating the overall_status in
the CQ entry structure.

Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251016023507.1000664-2-peter.wang@mediatek.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c |  4 ++--
 include/ufs/ufshci.h      | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8339fec975b9..864b2c490feb 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -856,7 +856,7 @@ static enum utp_ocs ufshcd_get_tr_ocs(struct ufshcd_lrb *lrbp,
 				      struct cq_entry *cqe)
 {
 	if (cqe)
-		return le32_to_cpu(cqe->status) & MASK_OCS;
+		return cqe->overall_status & MASK_OCS;
 
 	return lrbp->utr_descriptor_ptr->header.ocs & MASK_OCS;
 }
@@ -5646,7 +5646,7 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
 		scsi_done(cmd);
 	} else {
 		if (cqe) {
-			ocs = le32_to_cpu(cqe->status) & MASK_OCS;
+			ocs = cqe->overall_status & MASK_OCS;
 			lrbp->utr_descriptor_ptr->header.ocs = ocs;
 		}
 		complete(&hba->dev_cmd.complete);
diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h
index e64b70132101..bfc5401a9a0a 100644
--- a/include/ufs/ufshci.h
+++ b/include/ufs/ufshci.h
@@ -569,10 +569,26 @@ struct cq_entry {
 	__le16  prd_table_offset;
 
 	/* DW 4 */
-	__le32 status;
+	u8 overall_status;
+	u8 extended_error_code;
+	__le16 reserved_1;
 
-	/* DW 5-7 */
-	__le32 reserved[3];
+	/* DW 5 */
+	u8 task_tag;
+	u8 lun;
+#if defined(__BIG_ENDIAN)
+	u8 ext_iid:4;
+	u8 iid:4;
+#elif defined(__LITTLE_ENDIAN)
+	u8 iid:4;
+	u8 ext_iid:4;
+#else
+#error
+#endif
+	u8 reserved_2;
+
+	/* DW 6-7 */
+	__le32 reserved_3[2];
 };
 
 static_assert(sizeof(struct cq_entry) == 32);
-- 
cgit v1.2.3


From f8e82ae65eaf347fb8924a1d9c544da7bcb9f798 Mon Sep 17 00:00:00 2001
From: "Bao D. Nguyen" <quic_nguyenb@quicinc.com>
Date: Mon, 13 Oct 2025 12:38:15 -0700
Subject: scsi: ufs: core: Remove UFS_DEVICE_QUIRK_DELAY_AFTER_LPM quirk

After the UFS device VCC is turned off, all the UFS device manufacturers
require a period of power-off time before the VCC can be turned on
again. This requirement has been confirmed with all the UFS device
manufacturers' datasheets.

Remove the UFS_DEVICE_QUIRK_DELAY_AFTER_LPM quirk in the UFS core driver
and implement a universal delay that is required by all the UFS device
manufacturers. In addition, remove the support for this quirk in the
platform drivers.

Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/25f134d5a42e8b8365be64d512d1bb5fc2bce6ff.1760383740.git.quic_nguyenb@quicinc.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c       |  5 ++---
 drivers/ufs/host/ufs-mediatek.c | 11 ++++-------
 drivers/ufs/host/ufs-qcom.c     |  3 ---
 include/ufs/ufs_quirks.h        |  7 -------
 4 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8339fec975b9..4ca652c02ac2 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -9774,10 +9774,9 @@ static void ufshcd_vreg_set_lpm(struct ufs_hba *hba)
 	}
 
 	/*
-	 * Some UFS devices require delay after VCC power rail is turned-off.
+	 * All UFS devices require delay after VCC power rail is turned-off.
 	 */
-	if (vcc_off && hba->vreg_info.vcc &&
-		hba->dev_quirks & UFS_DEVICE_QUIRK_DELAY_AFTER_LPM)
+	if (vcc_off && hba->vreg_info.vcc && !hba->vreg_info.vcc->always_on)
 		usleep_range(5000, 5100);
 }
 
diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index 758a393a9de1..bbfb2d51be92 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -41,8 +41,7 @@ static void _ufs_mtk_clk_scale(struct ufs_hba *hba, bool scale_up);
 static const struct ufs_dev_quirk ufs_mtk_dev_fixups[] = {
 	{ .wmanufacturerid = UFS_ANY_VENDOR,
 	  .model = UFS_ANY_MODEL,
-	  .quirk = UFS_DEVICE_QUIRK_DELAY_AFTER_LPM |
-		UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM },
+	  .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM },
 	{ .wmanufacturerid = UFS_VENDOR_SKHYNIX,
 	  .model = "H9HQ21AFAMZDAR",
 	  .quirk = UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES },
@@ -1889,15 +1888,13 @@ static void ufs_mtk_fixup_dev_quirks(struct ufs_hba *hba)
 {
 	ufshcd_fixup_dev_quirks(hba, ufs_mtk_dev_fixups);
 
-	if (ufs_mtk_is_broken_vcc(hba) && hba->vreg_info.vcc &&
-	    (hba->dev_quirks & UFS_DEVICE_QUIRK_DELAY_AFTER_LPM)) {
+	if (ufs_mtk_is_broken_vcc(hba) && hba->vreg_info.vcc) {
 		hba->vreg_info.vcc->always_on = true;
 		/*
 		 * VCC will be kept always-on thus we don't
-		 * need any delay during regulator operations
+		 * need any delay before putting device's VCC in LPM mode.
 		 */
-		hba->dev_quirks &= ~(UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM |
-			UFS_DEVICE_QUIRK_DELAY_AFTER_LPM);
+		hba->dev_quirks &= ~UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM;
 	}
 
 	ufs_mtk_vreg_fix_vcc(hba);
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 3e83dc51d538..038064fb6ec2 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -1024,9 +1024,6 @@ static struct ufs_dev_quirk ufs_qcom_dev_fixups[] = {
 	{ .wmanufacturerid = UFS_VENDOR_SKHYNIX,
 	  .model = UFS_ANY_MODEL,
 	  .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM },
-	{ .wmanufacturerid = UFS_VENDOR_TOSHIBA,
-	  .model = UFS_ANY_MODEL,
-	  .quirk = UFS_DEVICE_QUIRK_DELAY_AFTER_LPM },
 	{ .wmanufacturerid = UFS_VENDOR_WDC,
 	  .model = UFS_ANY_MODEL,
 	  .quirk = UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE },
diff --git a/include/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h
index 83563247c36c..e9c59ec1ceae 100644
--- a/include/ufs/ufs_quirks.h
+++ b/include/ufs/ufs_quirks.h
@@ -100,13 +100,6 @@ struct ufs_dev_quirk {
  */
 #define UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES (1 << 10)
 
-/*
- * Some UFS devices require delay after VCC power rail is turned-off.
- * Enable this quirk to introduce 5ms delays after VCC power-off during
- * suspend flow.
- */
-#define UFS_DEVICE_QUIRK_DELAY_AFTER_LPM        (1 << 11)
-
 /*
  * Some ufs devices may need more time to be in hibern8 before exiting.
  * Enable this quirk to give it an additional 100us.
-- 
cgit v1.2.3


From 4760b639b43c107c8bfccd658478bbb3152fa56f Mon Sep 17 00:00:00 2001
From: "Bao D. Nguyen" <quic_nguyenb@quicinc.com>
Date: Mon, 13 Oct 2025 12:38:16 -0700
Subject: scsi: ufs: core: Replace hard coded vcc-off delay with a variable

After the UFS device VCC is powered off, all UFS device manufacturers
require a minimum of 1ms of power-off time before VCC can be powered on
again. This requirement has been verified against the datasheets of all
UFS device manufacturers.

Replace the hard coded 5ms delay with a variable with a default setting
of 2ms to improve the system resume latency. The platform drivers can
override this setting as needed.

Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Link: https://patch.msgid.link/72fa649406a0bf02271575b7d58f22c968aa5d7e.1760383740.git.quic_nguyenb@quicinc.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 10 +++++++++-
 include/ufs/ufshcd.h      |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 4ca652c02ac2..516e9e06d3c3 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -9777,7 +9777,8 @@ static void ufshcd_vreg_set_lpm(struct ufs_hba *hba)
 	 * All UFS devices require delay after VCC power rail is turned-off.
 	 */
 	if (vcc_off && hba->vreg_info.vcc && !hba->vreg_info.vcc->always_on)
-		usleep_range(5000, 5100);
+		usleep_range(hba->vcc_off_delay_us,
+			     hba->vcc_off_delay_us + 100);
 }
 
 #ifdef CONFIG_PM
@@ -10704,6 +10705,13 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 						UFS_SLEEP_PWR_MODE,
 						UIC_LINK_HIBERN8_STATE);
 
+	/*
+	 * Most ufs devices require 1ms delay after vcc is powered off before
+	 * it can be powered on again. Set the default to 2ms. The platform
+	 * drivers can override this setting as needed.
+	 */
+	hba->vcc_off_delay_us = 2000;
+
 	init_completion(&hba->dev_cmd.complete);
 
 	err = ufshcd_hba_init(hba);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9425cfd9d00e..752640a3e25f 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1117,6 +1117,8 @@ struct ufs_hba {
 	int critical_health_count;
 	atomic_t dev_lvl_exception_count;
 	u64 dev_lvl_exception_id;
+
+	u32 vcc_off_delay_us;
 };
 
 /**
-- 
cgit v1.2.3


From ca4709843b7e72f96976cd6b35bca148a4071673 Mon Sep 17 00:00:00 2001
From: David Yang <mmyangfl@gmail.com>
Date: Fri, 17 Oct 2025 14:08:54 +0800
Subject: net: dsa: tag_yt921x: add support for Motorcomm YT921x tags

Add support for Motorcomm YT921x tags, which includes a proper
configurable ethertype field (default to 0x9988).

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251017060859.326450-3-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h             |   2 +
 include/uapi/linux/if_ether.h |   1 +
 net/dsa/Kconfig               |   6 ++
 net/dsa/Makefile              |   1 +
 net/dsa/tag_yt921x.c          | 141 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 151 insertions(+)
 create mode 100644 net/dsa/tag_yt921x.c

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d73ea0880066..67762fdaf3c7 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -55,6 +55,7 @@ struct tc_action;
 #define DSA_TAG_PROTO_LAN937X_VALUE		27
 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE	28
 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE	29
+#define DSA_TAG_PROTO_YT921X_VALUE		30
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -87,6 +88,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_RZN1_A5PSW	= DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
 	DSA_TAG_PROTO_LAN937X		= DSA_TAG_PROTO_LAN937X_VALUE,
 	DSA_TAG_PROTO_VSC73XX_8021Q	= DSA_TAG_PROTO_VSC73XX_8021Q_VALUE,
+	DSA_TAG_PROTO_YT921X		= DSA_TAG_PROTO_YT921X_VALUE,
 };
 
 struct dsa_switch;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 69e0457eb200..cfd200c87e5e 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -114,6 +114,7 @@
 #define ETH_P_QINQ1	0x9100		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_QINQ2	0x9200		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_QINQ3	0x9300		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_YT921X	0x9988		/* Motorcomm YT921x DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA	0xDADA		/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_DSA_8021Q	0xDADB		/* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_DSA_A5PSW	0xE001		/* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 869cbe57162f..6b94028b1fcc 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -190,4 +190,10 @@ config NET_DSA_TAG_XRS700X
 	  Say Y or M if you want to enable support for tagging frames for
 	  Arrow SpeedChips XRS700x switches that use a single byte tag trailer.
 
+config NET_DSA_TAG_YT921X
+	tristate "Tag driver for Motorcomm YT921x switches"
+	help
+	  Say Y or M if you want to enable support for tagging frames for
+	  Motorcomm YT921x switches.
+
 endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 555c07cfeb71..4b011a1d5c87 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
 obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
 obj-$(CONFIG_NET_DSA_TAG_VSC73XX_8021Q) += tag_vsc73xx_8021q.o
 obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
+obj-$(CONFIG_NET_DSA_TAG_YT921X) += tag_yt921x.o
 
 # for tracing framework to find trace.h
 CFLAGS_trace.o := -I$(src)
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
new file mode 100644
index 000000000000..995da44f0a2a
--- /dev/null
+++ b/net/dsa/tag_yt921x.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Motorcomm YT921x Switch Extended CPU Port Tagging
+ *
+ * Copyright (c) 2025 David Yang <mmyangfl@gmail.com>
+ *
+ * +----+----+-------+-----+----+---------
+ * | DA | SA | TagET | Tag | ET | Payload ...
+ * +----+----+-------+-----+----+---------
+ *   6    6      2      6    2       N
+ *
+ * Tag Ethertype: CPU_TAG_TPID_TPID (default: ETH_P_YT921X = 0x9988)
+ *   * Hardcoded for the moment, but still configurable. Discuss it if there
+ *     are conflicts somewhere and/or you want to change it for some reason.
+ * Tag:
+ *   2: VLAN Tag
+ *   2: Rx Port
+ *     15b: Rx Port Valid
+ *     14b-11b: Rx Port
+ *     10b-0b: Cmd?
+ *   2: Tx Port(s)
+ *     15b: Tx Port(s) Valid
+ *     10b-0b: Tx Port(s) Mask
+ */
+
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+#define YT921X_TAG_NAME	"yt921x"
+
+#define YT921X_TAG_LEN	8
+
+#define YT921X_TAG_PORT_EN		BIT(15)
+#define YT921X_TAG_RX_PORT_M		GENMASK(14, 11)
+#define YT921X_TAG_RX_CMD_M		GENMASK(10, 0)
+#define  YT921X_TAG_RX_CMD(x)			FIELD_PREP(YT921X_TAG_RX_CMD_M, (x))
+#define  YT921X_TAG_RX_CMD_FORWARDED		0x80
+#define  YT921X_TAG_RX_CMD_UNK_UCAST		0xb2
+#define  YT921X_TAG_RX_CMD_UNK_MCAST		0xb4
+#define YT921X_TAG_TX_PORTS_M		GENMASK(10, 0)
+#define YT921X_TAG_TX_PORTn(port)	BIT(port)
+
+static struct sk_buff *
+yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_user_to_port(netdev);
+	unsigned int port = dp->index;
+	__be16 *tag;
+	u16 tx;
+
+	skb_push(skb, YT921X_TAG_LEN);
+	dsa_alloc_etype_header(skb, YT921X_TAG_LEN);
+
+	tag = dsa_etype_header_pos_tx(skb);
+
+	tag[0] = htons(ETH_P_YT921X);	/* Tag EtherType */
+	/* VLAN Tag (tag[1]) and Rx Port (tag[2]) words are left zero on TX */
+	tag[1] = 0;
+	tag[2] = 0;
+	tx = YT921X_TAG_PORT_EN | YT921X_TAG_TX_PORTn(port);
+	tag[3] = htons(tx);		/* Tx Port(s): valid bit + this port's mask bit */
+
+	return skb;
+}
+
+static struct sk_buff *
+yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
+{
+	unsigned int port;
+	__be16 *tag;
+	u16 cmd;
+	u16 rx;
+
+	if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
+		return NULL;
+
+	tag = dsa_etype_header_pos_rx(skb);
+
+	if (unlikely(tag[0] != htons(ETH_P_YT921X))) {
+		dev_warn_ratelimited(&netdev->dev,
+				     "Unexpected EtherType 0x%04x\n",
+				     ntohs(tag[0]));
+		return NULL;
+	}
+
+	/* tag[2] is the Rx Port word: valid bit, source port and cmd */
+	rx = ntohs(tag[2]);
+	if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) {
+		dev_warn_ratelimited(&netdev->dev,
+				     "Unexpected rx tag 0x%04x\n", rx);
+		return NULL;
+	}
+
+	port = FIELD_GET(YT921X_TAG_RX_PORT_M, rx);	/* bits 14..11 */
+	skb->dev = dsa_conduit_find_user(netdev, 0, port);
+	if (unlikely(!skb->dev)) {
+		dev_warn_ratelimited(&netdev->dev,
+				     "Couldn't decode source port %u\n", port);
+		return NULL;
+	}
+
+	cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx);	/* bits 10..0 */
+	switch (cmd) {
+	case YT921X_TAG_RX_CMD_FORWARDED:
+		/* Already forwarded by hardware */
+		dsa_default_offload_fwd_mark(skb);
+		break;
+	case YT921X_TAG_RX_CMD_UNK_UCAST:
+	case YT921X_TAG_RX_CMD_UNK_MCAST:
+		/* NOTE: hardware doesn't distinguish between TRAP (copy to CPU
+		 * only) and COPY (forward and copy to CPU). In order to perform
+		 * a soft switch, NEVER use COPY action in the switch driver.
+		 */
+		break;
+	default:	/* unknown cmd: warn, but the frame is still delivered */
+		dev_warn_ratelimited(&netdev->dev,
+				     "Unexpected rx cmd 0x%02x\n", cmd);
+		break;
+	}
+
+	/* Remove YT921x tag and update checksum */
+	skb_pull_rcsum(skb, YT921X_TAG_LEN);
+	dsa_strip_etype_header(skb, YT921X_TAG_LEN);
+
+	return skb;
+}
+
+static const struct dsa_device_ops yt921x_netdev_ops = {
+	.name	= YT921X_TAG_NAME,
+	.proto	= DSA_TAG_PROTO_YT921X,
+	.xmit	= yt921x_tag_xmit,
+	.rcv	= yt921x_tag_rcv,
+	.needed_headroom = YT921X_TAG_LEN,	/* xmit pushes YT921X_TAG_LEN bytes */
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Motorcomm YT921x switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_YT921X, YT921X_TAG_NAME);
+
+module_dsa_tag_driver(yt921x_netdev_ops);
-- 
cgit v1.2.3


From 7162536410768ec6b219524c36d3a871ff97adf8 Mon Sep 17 00:00:00 2001
From: Peter Wang <peter.wang@mediatek.com>
Date: Wed, 24 Sep 2025 17:43:23 +0800
Subject: scsi: ufs: host: mediatek: Correct clock scaling with PM QoS flow

Correct clock scaling with PM QoS during suspend and resume.  Ensure PM
QoS is released during suspend if scaling up and re-applied after
resume. This prevents performance issues and maintains proper power
management.

Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Chun-Hung Wu <chun-hung.wu@mediatek.com>
Link: https://patch.msgid.link/20250924094527.2992256-2-peter.wang@mediatek.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c       |  3 ++-
 drivers/ufs/host/ufs-mediatek.c | 10 ++++++++++
 include/ufs/ufshcd.h            |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8339fec975b9..2dadb749e213 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1076,7 +1076,7 @@ void ufshcd_pm_qos_exit(struct ufs_hba *hba)
  * @hba: per adapter instance
  * @on: If True, vote for perf PM QoS mode otherwise power save mode
  */
-static void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on)
+void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on)
 {
 	guard(mutex)(&hba->pm_qos_mutex);
 
@@ -1085,6 +1085,7 @@ static void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on)
 
 	cpu_latency_qos_update_request(&hba->pm_qos_req, on ? 0 : PM_QOS_DEFAULT_VALUE);
 }
+EXPORT_SYMBOL_GPL(ufshcd_pm_qos_update);
 
 /**
  * ufshcd_set_clk_freq - set UFS controller clock frequencies
diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index 758a393a9de1..009031fee744 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -1744,6 +1744,7 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op,
 {
 	int err;
 	struct arm_smccc_res res;
+	struct ufs_mtk_host *host = ufshcd_get_variant(hba);
 
 	if (status == PRE_CHANGE) {
 		if (ufshcd_is_auto_hibern8_supported(hba))
@@ -1773,6 +1774,10 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op,
 
 	ufs_mtk_sram_pwr_ctrl(false, res);
 
+	/* Release pm_qos if in scale-up mode during suspend */
+	if (ufshcd_is_clkscaling_supported(hba) && (host->clk_scale_up))
+		ufshcd_pm_qos_update(hba, false);
+
 	return 0;
 fail:
 	/*
@@ -1788,6 +1793,7 @@ static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 {
 	int err;
 	struct arm_smccc_res res;
+	struct ufs_mtk_host *host = ufshcd_get_variant(hba);
 
 	if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL)
 		ufs_mtk_dev_vreg_set_lpm(hba, false);
@@ -1798,6 +1804,10 @@ static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 	if (err)
 		goto fail;
 
+	/* Request pm_qos if in scale-up mode after resume */
+	if (ufshcd_is_clkscaling_supported(hba) && (host->clk_scale_up))
+		ufshcd_pm_qos_update(hba, true);
+
 	if (ufshcd_is_link_hibern8(hba)) {
 		err = ufs_mtk_link_set_hpm(hba);
 		if (err)
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9425cfd9d00e..ce7301d63c5c 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1487,5 +1487,6 @@ int ufshcd_write_ee_control(struct ufs_hba *hba);
 int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask,
 			     const u16 *other_mask, u16 set, u16 clr);
 void ufshcd_force_error_recovery(struct ufs_hba *hba);
+void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on);
 
 #endif /* End of Header */
-- 
cgit v1.2.3


From 1fd05367d5b1a5edd3d14c966a5f510e5b8a0c5e Mon Sep 17 00:00:00 2001
From: Peter Wang <peter.wang@mediatek.com>
Date: Wed, 24 Sep 2025 17:43:26 +0800
Subject: scsi: ufs: host: mediatek: Adjust sync length for FASTAUTO mode

Set the sync length for FASTAUTO G1 mode in the UFS Mediatek
driver. This ensures the sync length meets minimum values for high-speed
gears, improving stability during power mode changes.

Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Chun-Hung Wu <chun-hung.wu@mediatek.com>
Link: https://patch.msgid.link/20250924094527.2992256-5-peter.wang@mediatek.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/host/ufs-mediatek.c | 32 ++++++++++++++++++++++++++++++++
 include/ufs/unipro.h            |  7 ++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index 1dcc0c7c9f9b..2a69b4cede22 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -1332,6 +1332,36 @@ static bool ufs_mtk_pmc_via_fastauto(struct ufs_hba *hba,
 	return true;
 }
 
+/* Ensure local and peer PA_TXHSG1..5SYNCLENGTH are at least the minimum value. */
+static void ufs_mtk_adjust_sync_length(struct ufs_hba *hba)
+{
+	static const struct attr_min {
+		u32 attr;
+		u32 min_value;
+	} pa_min_sync_length[] = {
+		{PA_TXHSG1SYNCLENGTH, 0x48},
+		{PA_TXHSG2SYNCLENGTH, 0x48},
+		{PA_TXHSG3SYNCLENGTH, 0x48},
+		{PA_TXHSG4SYNCLENGTH, 0x48},
+		{PA_TXHSG5SYNCLENGTH, 0x48}
+	};
+	const u32 cnt = sizeof(pa_min_sync_length) / sizeof(pa_min_sync_length[0]);
+	u32 i, att, min, value;
+
+	for (i = 0; i < cnt; i++) {
+		att = pa_min_sync_length[i].attr;
+		min = pa_min_sync_length[i].min_value;
+		/* Only trust 'value' when the DME read reported success (0). */
+		if (!ufshcd_dme_get(hba, UIC_ARG_MIB(att), &value) &&
+		    value < min)
+			ufshcd_dme_set(hba, UIC_ARG_MIB(att), min);
+
+		if (!ufshcd_dme_peer_get(hba, UIC_ARG_MIB(att), &value) &&
+		    value < min)
+			ufshcd_dme_peer_set(hba, UIC_ARG_MIB(att), min);
+	}
+}
+
 static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba,
 				const struct ufs_pa_layer_attr *dev_max_params,
 				struct ufs_pa_layer_attr *dev_req_params)
@@ -1355,6 +1385,8 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba,
 	}
 
 	if (ufs_mtk_pmc_via_fastauto(hba, dev_req_params)) {
+		ufs_mtk_adjust_sync_length(hba);
+
 		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXTERMINATION), true);
 		ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXGEAR), UFS_HS_G1);
 
diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h
index 360e1245fb40..498ec9028b3c 100644
--- a/include/ufs/unipro.h
+++ b/include/ufs/unipro.h
@@ -111,6 +111,9 @@
 #define PA_TXLINKSTARTUPHS	0x1544
 #define PA_AVAILRXDATALANES	0x1540
 #define PA_MINRXTRAILINGCLOCKS	0x1543
+#define PA_TXHSG1SYNCLENGTH	0x1552
+#define PA_TXHSG2SYNCLENGTH	0x1554
+#define PA_TXHSG3SYNCLENGTH	0x1556
 #define PA_LOCAL_TX_LCC_ENABLE	0x155E
 #define PA_ACTIVETXDATALANES	0x1560
 #define PA_CONNECTEDTXDATALANES	0x1561
@@ -160,7 +163,9 @@
 #define PA_PACPFRAMECOUNT	0x15C0
 #define PA_PACPERRORCOUNT	0x15C1
 #define PA_PHYTESTCONTROL	0x15C2
-#define PA_TXHSADAPTTYPE       0x15D4
+#define PA_TXHSG4SYNCLENGTH	0x15D0
+#define PA_TXHSADAPTTYPE	0x15D4
+#define PA_TXHSG5SYNCLENGTH	0x15D6
 
 /* Adpat type for PA_TXHSADAPTTYPE attribute */
 #define PA_REFRESH_ADAPT       0x00
-- 
cgit v1.2.3


From 9b2b03b36168bcda298546b121d6ecc530d01d25 Mon Sep 17 00:00:00 2001
From: Peter Wang <peter.wang@mediatek.com>
Date: Wed, 24 Sep 2025 17:43:28 +0800
Subject: scsi: ufs: host: mediatek: Remove duplicate function

Remove the duplicate ufs_mtk_us_to_ahit() function in the UFS Mediatek
driver and export the existing ufshcd_us_to_ahit() function for shared
use. This change reduces redundancy and maintains consistency across the
codebase.

Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Chun-Hung Wu <chun-hung.wu@mediatek.com>
Link: https://patch.msgid.link/20250924094527.2992256-7-peter.wang@mediatek.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufs-sysfs.c    |  3 ++-
 drivers/ufs/host/ufs-mediatek.c | 14 +-------------
 include/ufs/ufshcd.h            |  1 +
 3 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c
index c040afc6668e..af9615587bf3 100644
--- a/drivers/ufs/core/ufs-sysfs.c
+++ b/drivers/ufs/core/ufs-sysfs.c
@@ -235,7 +235,7 @@ static int ufshcd_ahit_to_us(u32 ahit)
 }
 
 /* Convert microseconds to Auto-Hibernate Idle Timer register value */
-static u32 ufshcd_us_to_ahit(unsigned int timer)
+u32 ufshcd_us_to_ahit(unsigned int timer)
 {
 	unsigned int scale;
 
@@ -245,6 +245,7 @@ static u32 ufshcd_us_to_ahit(unsigned int timer)
 	return FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, timer) |
 	       FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, scale);
 }
+EXPORT_SYMBOL_GPL(ufshcd_us_to_ahit);
 
 static int ufshcd_read_hci_reg(struct ufs_hba *hba, u32 *val, unsigned int reg)
 {
diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index c00e62adbbda..3e54154d5547 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -1109,18 +1109,6 @@ static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba)
 	}
 }
 
-/* Convert microseconds to Auto-Hibernate Idle Timer register value */
-static u32 ufs_mtk_us_to_ahit(unsigned int timer)
-{
-	unsigned int scale;
-
-	for (scale = 0; timer > UFSHCI_AHIBERN8_TIMER_MASK; ++scale)
-		timer /= UFSHCI_AHIBERN8_SCALE_FACTOR;
-
-	return FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, timer) |
-	       FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, scale);
-}
-
 static void ufs_mtk_fix_ahit(struct ufs_hba *hba)
 {
 	unsigned int us;
@@ -1143,7 +1131,7 @@ static void ufs_mtk_fix_ahit(struct ufs_hba *hba)
 			break;
 		}
 
-		hba->ahit = ufs_mtk_us_to_ahit(us);
+		hba->ahit = ufshcd_us_to_ahit(us);
 	}
 
 	ufs_mtk_setup_clk_gating(hba);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index ce7301d63c5c..4b3a8daf8e0b 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1488,5 +1488,6 @@ int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask,
 			     const u16 *other_mask, u16 set, u16 clr);
 void ufshcd_force_error_recovery(struct ufs_hba *hba);
 void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on);
+u32 ufshcd_us_to_ahit(unsigned int timer);
 
 #endif /* End of Header */
-- 
cgit v1.2.3


From 6837c006d4e72d6add451411bcf407e0dea4ad25 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Tue, 21 Oct 2025 15:24:47 +0000
Subject: firmware: exynos-acpm: add empty method to allow compile test

Provide an empty stub for devm_acpm_get_by_node() when
CONFIG_EXYNOS_ACPM_PROTOCOL is not enabled. This allows the
CONFIG_EXYNOS_ACPM_CLK code to be test-built.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202510211905.RgfWkgss-lkp@intel.com/
Fixes: 40498a742053 ("clk: samsung: add Exynos ACPM clock driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Link: https://patch.msgid.link/20251021-fix-acpm-clk-build-test-v1-1-236a3d6db7f5@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/linux/firmware/samsung/exynos-acpm-protocol.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h
index b1e95435240f..2091da965a5a 100644
--- a/include/linux/firmware/samsung/exynos-acpm-protocol.h
+++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h
@@ -55,7 +55,16 @@ struct acpm_handle {
 
 struct device;
 
+#if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL)
 const struct acpm_handle *devm_acpm_get_by_node(struct device *dev,
 						struct device_node *np);
+#else
+
+static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev,
+							      struct device_node *np)
+{
+	return NULL;	/* stub used when CONFIG_EXYNOS_ACPM_PROTOCOL is disabled */
+}
+#endif
 
 #endif /* __EXYNOS_ACPM_PROTOCOL_H */
-- 
cgit v1.2.3


From fdd00d79dc0e8a3f90be65d5060c55bb115c0f43 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 15 Oct 2025 20:35:43 -0700
Subject: ipack: fix ipack.h kernel-doc warnings

Fix various kernel-doc warnings in ipack.h:

 - Remove an empty kernel-doc comment.
 - Add 2 missing struct short descriptions.
 - Fix a typo in a description.
 - Add a missing struct field description.
 - Add some missing Return descriptions.
 - Clarify one function short description.

Warning: ../include/linux/ipack.h:73 Cannot find identifier on line:
 */
Warning: ../include/linux/ipack.h:74 Cannot find identifier on line:
struct ipack_region {
Warning: ../include/linux/ipack.h:75 Cannot find identifier on line:
        phys_addr_t start;
Warning: ../include/linux/ipack.h:76 Cannot find identifier on line:
        size_t      size;
Warning: ../include/linux/ipack.h:77 Cannot find identifier on line:
};
Warning: ../include/linux/ipack.h:78 Cannot find identifier on line:

Warning: ../include/linux/ipack.h:79 Cannot find identifier on line:
/**
Warning: ipack.h:80 missing initial short description on line:
 *      struct ipack_device
Warning: ipack.h:163 missing initial short description on line:
 *      struct ipack_bus_device
Warning: ipack.h:130 struct member 'id_table' not described in 'ipack_driver'
Warning: ipack.h:189 No description found for return value of 'ipack_bus_register'
Warning: ipack.h:194 No description found for return value of 'ipack_bus_unregister' ***
Warning: ipack.h:202 No description found for return value of 'ipack_driver_register'
Warning: ipack.h:221 No description found for return value of 'ipack_device_init'
Warning: ipack.h:236 No description found for return value of 'ipack_device_add'
Warning: ipack.h:271 No description found for return value of 'ipack_get_carrier'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Vaibhav Gupta <vaibhavgupta40@gmail.com>
Link: https://patch.msgid.link/20251016033543.1142049-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/ipack.h | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipack.h b/include/linux/ipack.h
index 2c6936b8371f..455f6c2a1903 100644
--- a/include/linux/ipack.h
+++ b/include/linux/ipack.h
@@ -70,15 +70,13 @@ enum ipack_space {
 	IPACK_SPACE_COUNT,
 };
 
-/**
- */
 struct ipack_region {
 	phys_addr_t start;
 	size_t      size;
 };
 
 /**
- *	struct ipack_device
+ *	struct ipack_device - subsystem representation of an IPack device
  *
  *	@slot: Slot where the device is plugged in the carrier board
  *	@bus: ipack_bus_device where the device is plugged to.
@@ -89,7 +87,7 @@ struct ipack_region {
  *
  * Warning: Direct access to mapped memory is possible but the endianness
  * is not the same with PCI carrier or VME carrier. The endianness is managed
- * by the carrier board throught bus->ops.
+ * by the carrier board through bus->ops.
  */
 struct ipack_device {
 	unsigned int slot;
@@ -124,6 +122,7 @@ struct ipack_driver_ops {
  * struct ipack_driver -- Specific data to each ipack device driver
  *
  * @driver: Device driver kernel representation
+ * @id_table: Device ID table for this driver
  * @ops:    Callbacks provided by the IPack device driver
  */
 struct ipack_driver {
@@ -161,7 +160,7 @@ struct ipack_bus_ops {
 };
 
 /**
- *	struct ipack_bus_device
+ *	struct ipack_bus_device - IPack bus representation
  *
  *	@dev: pointer to carrier device
  *	@slots: number of slots available
@@ -185,6 +184,8 @@ struct ipack_bus_device {
  *
  * The carrier board device should call this function to register itself as
  * available bus device in ipack.
+ *
+ * Return: %NULL on error or &struct ipack_bus_device on success
  */
 struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots,
 					    const struct ipack_bus_ops *ops,
@@ -192,6 +193,8 @@ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots,
 
 /**
  *	ipack_bus_unregister -- unregister an ipack bus
+ *
+ *	Return: %0
  */
 int ipack_bus_unregister(struct ipack_bus_device *bus);
 
@@ -200,6 +203,8 @@ int ipack_bus_unregister(struct ipack_bus_device *bus);
  *
  * Called by a ipack driver to register itself as a driver
  * that can manage ipack devices.
+ *
+ * Return: zero on success or error code on failure.
  */
 int ipack_driver_register(struct ipack_driver *edrv, struct module *owner,
 			  const char *name);
@@ -215,7 +220,7 @@ void ipack_driver_unregister(struct ipack_driver *edrv);
  * function.  The rest of the fields will be allocated and populated
  * during initalization.
  *
- * Return zero on success or error code on failure.
+ * Return: zero on success or error code on failure.
  *
  * NOTE: _Never_ directly free @dev after calling this function, even
  * if it returned an error! Always use ipack_put_device() to give up the
@@ -230,7 +235,7 @@ int ipack_device_init(struct ipack_device *dev);
  * Add a new IPack device. The call is done by the carrier driver
  * after calling ipack_device_init().
  *
- * Return zero on success or error code on failure.
+ * Return: zero on success or error code on failure.
  *
  * NOTE: _Never_ directly free @dev after calling this function, even
  * if it returned an error! Always use ipack_put_device() to give up the
@@ -266,9 +271,11 @@ void ipack_put_device(struct ipack_device *dev);
 	 .device = (dev)
 
 /**
- * ipack_get_carrier - it increase the carrier ref. counter of
+ * ipack_get_carrier - try to increase the carrier ref. counter of
  *                     the carrier module
  * @dev: mezzanine device which wants to get the carrier
+ *
+ * Return: true on success.
  */
 static inline int ipack_get_carrier(struct ipack_device *dev)
 {
-- 
cgit v1.2.3


From 9fd2eb9e18a0a0b5a127937586388ed0181d9dac Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Sep 2025 15:42:40 +0200
Subject: cdx: make cdx_bus_type constant

Now that the driver core can properly handle constant struct bus_type,
move the cdx_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Nipun Gupta <nipun.gupta@amd.com>
Cc: Nikhil Agarwal <nikhil.agarwal@amd.com>
Acked-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/2025091439-sustained-acorn-4af4@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/cdx/cdx.c           | 4 ++--
 include/linux/cdx/cdx_bus.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index 3d50f8cd9c0b..b39af2f1937f 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -170,7 +170,7 @@ static int cdx_unregister_device(struct device *dev,
 	return 0;
 }
 
-static void cdx_unregister_devices(struct bus_type *bus)
+static void cdx_unregister_devices(const struct bus_type *bus)
 {
 	/* Reset all the devices attached to cdx bus */
 	bus_for_each_dev(bus, NULL, NULL, cdx_unregister_device);
@@ -651,7 +651,7 @@ static struct attribute *cdx_bus_attrs[] = {
 };
 ATTRIBUTE_GROUPS(cdx_bus);
 
-struct bus_type cdx_bus_type = {
+const struct bus_type cdx_bus_type = {
 	.name		= "cdx",
 	.match		= cdx_bus_match,
 	.probe		= cdx_probe,
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 79bb80e56790..b1ba97f6c9ad 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -234,7 +234,7 @@ int __must_check __cdx_driver_register(struct cdx_driver *cdx_driver,
  */
 void cdx_driver_unregister(struct cdx_driver *cdx_driver);
 
-extern struct bus_type cdx_bus_type;
+extern const struct bus_type cdx_bus_type;
 
 /**
  * cdx_dev_reset - Reset CDX device
-- 
cgit v1.2.3


From 61e606305672342858a647af3629d9dfcc4e4265 Mon Sep 17 00:00:00 2001
From: Adrian Barnaś <abarnas@google.com>
Date: Fri, 19 Sep 2025 06:53:27 +0000
Subject: drivers: eisa: make eisa_bus_type const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because driver core can properly handle constant struct bus_type,
move the eisa_bus_type to be a constant structure as well, placing it into
read-only memory which can not be modified at runtime.

Signed-off-by: Adrian Barnaś <abarnas@google.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20250919065327.672924-1-abarnas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/eisa/eisa-bus.c | 2 +-
 include/linux/eisa.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c
index edceea083b98..bd76d599109c 100644
--- a/drivers/eisa/eisa-bus.c
+++ b/drivers/eisa/eisa-bus.c
@@ -135,7 +135,7 @@ static int eisa_bus_uevent(const struct device *dev, struct kobj_uevent_env *env
 	return 0;
 }
 
-struct bus_type eisa_bus_type = {
+const struct bus_type eisa_bus_type = {
 	.name  = "eisa",
 	.match = eisa_bus_match,
 	.uevent = eisa_bus_uevent,
diff --git a/include/linux/eisa.h b/include/linux/eisa.h
index 21a2ecc1e538..cf55630b595b 100644
--- a/include/linux/eisa.h
+++ b/include/linux/eisa.h
@@ -68,7 +68,7 @@ struct eisa_driver {
 /* These external functions are only available when EISA support is enabled. */
 #ifdef CONFIG_EISA
 
-extern struct bus_type eisa_bus_type;
+extern const struct bus_type eisa_bus_type;
 int eisa_driver_register (struct eisa_driver *edrv);
 void eisa_driver_unregister (struct eisa_driver *edrv);
 
-- 
cgit v1.2.3


From 8ce6b508f24b4ef3a78c2c0d92e67b9e324c4f7a Mon Sep 17 00:00:00 2001
From: Adrian Barnaś <abarnas@google.com>
Date: Fri, 19 Sep 2025 07:32:01 +0000
Subject: drivers: rapidio: make rio_bus_type const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because driver core can properly handle constant struct bus_type,
move the rio_bus_type to be a constant structure as well, placing it into
read-only memory which can not be modified at runtime.

Signed-off-by: Adrian Barnaś <abarnas@google.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20250919073201.751348-1-abarnas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/rapidio/rio-driver.c | 2 +-
 include/linux/rio.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c
index 238250e69005..bcfe0b45b377 100644
--- a/drivers/rapidio/rio-driver.c
+++ b/drivers/rapidio/rio-driver.c
@@ -227,7 +227,7 @@ struct class rio_mport_class = {
 };
 EXPORT_SYMBOL_GPL(rio_mport_class);
 
-struct bus_type rio_bus_type = {
+const struct bus_type rio_bus_type = {
 	.name = "rapidio",
 	.match = rio_match_bus,
 	.dev_groups = rio_dev_groups,
diff --git a/include/linux/rio.h b/include/linux/rio.h
index 3c29f40f3c94..2c29f21ba9e5 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -78,7 +78,7 @@
 #define RIO_CTAG_RESRVD	0xfffe0000 /* Reserved */
 #define RIO_CTAG_UDEVID	0x0001ffff /* Unique device identifier */
 
-extern struct bus_type rio_bus_type;
+extern const struct bus_type rio_bus_type;
 extern struct class rio_mport_class;
 
 struct rio_mport;
-- 
cgit v1.2.3


From cebd22dd3a0ac76e0e1f2f369bba710bc6b1dc66 Mon Sep 17 00:00:00 2001
From: Pei Xiao <xiaopei01@kylinos.cn>
Date: Thu, 18 Sep 2025 10:54:02 +0800
Subject: platform: Use IOMEM_ERR_PTR for ioremap error returns

Replace ERR_PTR() with IOMEM_ERR_PTR() in stubbed ioremap functions to
maintain type consistency. The functions return void __iomem * pointers
and IOMEM_ERR_PTR() provides proper type casting to avoid sparse warnings.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202509060307.JubgnLhc-lkp@intel.com/
Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
Link: https://patch.msgid.link/320f2cc9ada5cb66845daa6bf259000b4cffd8b3.1758163939.git.xiaopei01@kylinos.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_device.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 074754c23d33..1d424fed1435 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -80,7 +80,7 @@ static inline void __iomem *
 devm_platform_get_and_ioremap_resource(struct platform_device *pdev,
 				unsigned int index, struct resource **res)
 {
-	return ERR_PTR(-EINVAL);
+	return IOMEM_ERR_PTR(-EINVAL);
 }
 
 
@@ -88,14 +88,14 @@ static inline void __iomem *
 devm_platform_ioremap_resource(struct platform_device *pdev,
 			       unsigned int index)
 {
-	return ERR_PTR(-EINVAL);
+	return IOMEM_ERR_PTR(-EINVAL);
 }
 
 static inline void __iomem *
 devm_platform_ioremap_resource_byname(struct platform_device *pdev,
 				      const char *name)
 {
-	return ERR_PTR(-EINVAL);
+	return IOMEM_ERR_PTR(-EINVAL);
 }
 
 #endif
-- 
cgit v1.2.3


From 6d0ef68955d30be1e218caf160ec32eec23ebc6e Mon Sep 17 00:00:00 2001
From: Yunhui Cui <cuiyunhui@bytedance.com>
Date: Tue, 23 Sep 2025 09:54:09 +0800
Subject: arch_topology: move parse_acpi_topology() to common code

Currently, RISC-V lacks arch-specific registers for CPU topology
properties and must get them from ACPI. Thus, parse_acpi_topology()
is moved from arm64/ to drivers/ for RISC-V reuse.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://patch.msgid.link/20250923015409.15983-2-cuiyunhui@bytedance.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/include/asm/topology.h |   3 ++
 arch/arm64/kernel/topology.c      | 101 --------------------------------------
 drivers/base/arch_topology.c      |  96 +++++++++++++++++++++++++++++++++++-
 include/linux/arch_topology.h     |   5 ++
 4 files changed, 103 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 341174bf9106..b9eaf4ad7085 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -36,6 +36,9 @@ void update_freq_counters_refs(void);
 #define arch_scale_hw_pressure topology_get_hw_pressure
 #define arch_update_hw_pressure	topology_update_hw_pressure
 
+#undef arch_cpu_is_threaded
+#define arch_cpu_is_threaded() (read_cpuid_mpidr() & MPIDR_MT_BITMASK)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 5d07ee85bdae..5d24dc53799b 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -25,107 +25,6 @@
 #include <asm/cputype.h>
 #include <asm/topology.h>
 
-#ifdef CONFIG_ACPI
-static bool __init acpi_cpu_is_threaded(int cpu)
-{
-	int is_threaded = acpi_pptt_cpu_is_thread(cpu);
-
-	/*
-	 * if the PPTT doesn't have thread information, assume a homogeneous
-	 * machine and return the current CPU's thread state.
-	 */
-	if (is_threaded < 0)
-		is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK;
-
-	return !!is_threaded;
-}
-
-struct cpu_smt_info {
-	unsigned int thread_num;
-	int core_id;
-};
-
-/*
- * Propagate the topology information of the processor_topology_node tree to the
- * cpu_topology array.
- */
-int __init parse_acpi_topology(void)
-{
-	unsigned int max_smt_thread_num = 1;
-	struct cpu_smt_info *entry;
-	struct xarray hetero_cpu;
-	unsigned long hetero_id;
-	int cpu, topology_id;
-
-	if (acpi_disabled)
-		return 0;
-
-	xa_init(&hetero_cpu);
-
-	for_each_possible_cpu(cpu) {
-		topology_id = find_acpi_cpu_topology(cpu, 0);
-		if (topology_id < 0)
-			return topology_id;
-
-		if (acpi_cpu_is_threaded(cpu)) {
-			cpu_topology[cpu].thread_id = topology_id;
-			topology_id = find_acpi_cpu_topology(cpu, 1);
-			cpu_topology[cpu].core_id   = topology_id;
-
-			/*
-			 * In the PPTT, CPUs below a node with the 'identical
-			 * implementation' flag have the same number of threads.
-			 * Count the number of threads for only one CPU (i.e.
-			 * one core_id) among those with the same hetero_id.
-			 * See the comment of find_acpi_cpu_topology_hetero_id()
-			 * for more details.
-			 *
-			 * One entry is created for each node having:
-			 * - the 'identical implementation' flag
-			 * - its parent not having the flag
-			 */
-			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
-			entry = xa_load(&hetero_cpu, hetero_id);
-			if (!entry) {
-				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-				WARN_ON_ONCE(!entry);
-
-				if (entry) {
-					entry->core_id = topology_id;
-					entry->thread_num = 1;
-					xa_store(&hetero_cpu, hetero_id,
-						 entry, GFP_KERNEL);
-				}
-			} else if (entry->core_id == topology_id) {
-				entry->thread_num++;
-			}
-		} else {
-			cpu_topology[cpu].thread_id  = -1;
-			cpu_topology[cpu].core_id    = topology_id;
-		}
-		topology_id = find_acpi_cpu_topology_cluster(cpu);
-		cpu_topology[cpu].cluster_id = topology_id;
-		topology_id = find_acpi_cpu_topology_package(cpu);
-		cpu_topology[cpu].package_id = topology_id;
-	}
-
-	/*
-	 * This is a short loop since the number of XArray elements is the
-	 * number of heterogeneous CPU clusters. On a homogeneous system
-	 * there's only one entry in the XArray.
-	 */
-	xa_for_each(&hetero_cpu, hetero_id, entry) {
-		max_smt_thread_num = max(max_smt_thread_num, entry->thread_num);
-		xa_erase(&hetero_cpu, hetero_id);
-		kfree(entry);
-	}
-
-	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
-	xa_destroy(&hetero_cpu);
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_ARM64_AMU_EXTN
 #define read_corecnt()	read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)
 #define read_constcnt()	read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1037169abb45..1ccb1eda4ce8 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -823,12 +823,106 @@ void remove_cpu_topology(unsigned int cpu)
 	clear_cpu_topology(cpu);
 }
 
+#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
+struct cpu_smt_info {
+	unsigned int thread_num;
+	int core_id;
+};
+
+static bool __init acpi_cpu_is_threaded(int cpu)
+{
+	int is_threaded = acpi_pptt_cpu_is_thread(cpu);
+
+	/*
+	 * if the PPTT doesn't have thread information, check for architecture
+	 * specific fallback if available
+	 */
+	if (is_threaded < 0)
+		is_threaded = arch_cpu_is_threaded();
+
+	return !!is_threaded;
+}
+
+/*
+ * Propagate the topology information of the processor_topology_node tree to the
+ * cpu_topology array.
+ */
 __weak int __init parse_acpi_topology(void)
 {
+	unsigned int max_smt_thread_num = 1;
+	struct cpu_smt_info *entry;
+	struct xarray hetero_cpu;
+	unsigned long hetero_id;
+	int cpu, topology_id;
+
+	if (acpi_disabled)
+		return 0;
+
+	xa_init(&hetero_cpu);
+
+	for_each_possible_cpu(cpu) {
+		topology_id = find_acpi_cpu_topology(cpu, 0);
+		if (topology_id < 0)
+			return topology_id;
+
+		if (acpi_cpu_is_threaded(cpu)) {
+			cpu_topology[cpu].thread_id = topology_id;
+			topology_id = find_acpi_cpu_topology(cpu, 1);
+			cpu_topology[cpu].core_id   = topology_id;
+
+			/*
+			 * In the PPTT, CPUs below a node with the 'identical
+			 * implementation' flag have the same number of threads.
+			 * Count the number of threads for only one CPU (i.e.
+			 * one core_id) among those with the same hetero_id.
+			 * See the comment of find_acpi_cpu_topology_hetero_id()
+			 * for more details.
+			 *
+			 * One entry is created for each node having:
+			 * - the 'identical implementation' flag
+			 * - its parent not having the flag
+			 */
+			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
+			entry = xa_load(&hetero_cpu, hetero_id);
+			if (!entry) {
+				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+				WARN_ON_ONCE(!entry);
+
+				if (entry) {
+					entry->core_id = topology_id;
+					entry->thread_num = 1;
+					xa_store(&hetero_cpu, hetero_id,
+						 entry, GFP_KERNEL);
+				}
+			} else if (entry->core_id == topology_id) {
+				entry->thread_num++;
+			}
+		} else {
+			cpu_topology[cpu].thread_id  = -1;
+			cpu_topology[cpu].core_id    = topology_id;
+		}
+		topology_id = find_acpi_cpu_topology_cluster(cpu);
+		cpu_topology[cpu].cluster_id = topology_id;
+		topology_id = find_acpi_cpu_topology_package(cpu);
+		cpu_topology[cpu].package_id = topology_id;
+	}
+
+	/*
+	 * This is a short loop since the number of XArray elements is the
+	 * number of heterogeneous CPU clusters. On a homogeneous system
+	 * there's only one entry in the XArray.
+	 */
+	xa_for_each(&hetero_cpu, hetero_id, entry) {
+		max_smt_thread_num = max(max_smt_thread_num, entry->thread_num);
+		xa_erase(&hetero_cpu, hetero_id);
+		kfree(entry);
+	}
+
+	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+	xa_destroy(&hetero_cpu);
 	return 0;
 }
 
-#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
 void __init init_cpu_topology(void)
 {
 	int cpu, ret;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d72d6e5aa200..766ed9cf0e54 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -80,6 +80,11 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 #define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
+
+#ifndef arch_cpu_is_threaded
+#define arch_cpu_is_threaded()	(0)
+#endif
+
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
-- 
cgit v1.2.3


From f82890c98f3e3fd61983e9021354c632ecd47427 Mon Sep 17 00:00:00 2001
From: Badhri Jagan Sridharan <badhri@google.com>
Date: Wed, 15 Oct 2025 04:30:13 +0000
Subject: tcpm: Parse and log AVS APDO

The USB PD specification introduced new Adjustable Voltage Supply (AVS)
types for both Standard Power Range (SPR) and Extended Power Range (EPR)
sources.

Add definitions to correctly parse and handle the new AVS APDO. Use
bitfield macros to add inline helper functions to extract voltage,
current, power, and peak current fields to parse and log the details
of the new EPR AVS and SPR AVS APDO.

Signed-off-by: Badhri Jagan Sridharan <badhri@google.com>
Reviewed-by: Amit Sunil Dhamne <amitsd@google.com>
Reviewed-by: Kyle Tso <kyletso@google.com>
Reviewed-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://patch.msgid.link/20251015043017.3382908-1-badhri@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 15 +++++++++-
 include/linux/usb/pd.h        | 69 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index b2a568a5bc9b..c65aa8104950 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -823,10 +823,23 @@ static void tcpm_log_source_caps(struct tcpm_port *port)
 		case PDO_TYPE_APDO:
 			if (pdo_apdo_type(pdo) == APDO_TYPE_PPS)
 				scnprintf(msg, sizeof(msg),
-					  "%u-%u mV, %u mA",
+					  "PPS %u-%u mV, %u mA",
 					  pdo_pps_apdo_min_voltage(pdo),
 					  pdo_pps_apdo_max_voltage(pdo),
 					  pdo_pps_apdo_max_current(pdo));
+			else if (pdo_apdo_type(pdo) == APDO_TYPE_EPR_AVS)
+				scnprintf(msg, sizeof(msg),
+					  "EPR AVS %u-%u mV %u W peak_current: %u",
+					  pdo_epr_avs_apdo_min_voltage_mv(pdo),
+					  pdo_epr_avs_apdo_max_voltage_mv(pdo),
+					  pdo_epr_avs_apdo_pdp_w(pdo),
+					  pdo_epr_avs_apdo_src_peak_current(pdo));
+			else if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS)
+				scnprintf(msg, sizeof(msg),
+					  "SPR AVS 9-15 V: %u mA 15-20 V: %u mA peak_current: %u",
+					  pdo_spr_avs_apdo_9v_to_15v_max_current_ma(pdo),
+					  pdo_spr_avs_apdo_15v_to_20v_max_current_ma(pdo),
+					  pdo_spr_avs_apdo_src_peak_current(pdo));
 			else
 				strcpy(msg, "undefined APDO");
 			break;
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index 3068c3084eb6..6ccd1b2af993 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -6,6 +6,7 @@
 #ifndef __LINUX_USB_PD_H
 #define __LINUX_USB_PD_H
 
+#include <linux/bitfield.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/usb/typec.h>
@@ -271,9 +272,11 @@ enum pd_pdo_type {
 
 enum pd_apdo_type {
 	APDO_TYPE_PPS = 0,
+	APDO_TYPE_EPR_AVS = 1,
+	APDO_TYPE_SPR_AVS = 2,
 };
 
-#define PDO_APDO_TYPE_SHIFT	28	/* Only valid value currently is 0x0 - PPS */
+#define PDO_APDO_TYPE_SHIFT	28
 #define PDO_APDO_TYPE_MASK	0x3
 
 #define PDO_APDO_TYPE(t)	((t) << PDO_APDO_TYPE_SHIFT)
@@ -297,6 +300,35 @@ enum pd_apdo_type {
 	PDO_PPS_APDO_MIN_VOLT(min_mv) | PDO_PPS_APDO_MAX_VOLT(max_mv) |	\
 	PDO_PPS_APDO_MAX_CURR(max_ma))
 
+/*
+ * Applicable only to EPR AVS APDO source cap as per
+ * Table 6.15 EPR Adjustable Voltage Supply APDO – Source
+ */
+#define PDO_EPR_AVS_APDO_PEAK_CURRENT	GENMASK(27, 26)
+
+/*
+ * Applicable to both EPR AVS APDO source and sink cap as per
+ * Table 6.15 EPR Adjustable Voltage Supply APDO – Source
+ * Table 6.22 EPR Adjustable Voltage Supply APDO – Sink
+ */
+#define PDO_EPR_AVS_APDO_MAX_VOLT	GENMASK(25, 17)	/* 100mV unit */
+#define PDO_EPR_AVS_APDO_MIN_VOLT	GENMASK(15, 8)	/* 100mV unit */
+#define PDO_EPR_AVS_APDO_PDP		GENMASK(7, 0) /* 1W unit */
+
+/*
+ * Applicable only to SPR AVS APDO source cap as per
+ * Table 6.14 SPR Adjustable Voltage Supply APDO – Source
+ */
+#define PDO_SPR_AVS_APDO_PEAK_CURRENT		GENMASK(27, 26)
+
+/*
+ * Applicable to both SPR AVS APDO source and sink cap as per
+ * Table 6.14 SPR Adjustable Voltage Supply APDO – Source
+ * Table 6.21 SPR Adjustable Voltage Supply APDO – Sink
+ */
+#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR	GENMASK(19, 10)	/* 10mA unit */
+#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR	GENMASK(9, 0)	/* 10mA unit */
+
 static inline enum pd_pdo_type pdo_type(u32 pdo)
 {
 	return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK;
@@ -350,6 +382,41 @@ static inline unsigned int pdo_pps_apdo_max_current(u32 pdo)
 		PDO_PPS_APDO_CURR_MASK) * 50;
 }
 
+static inline unsigned int pdo_epr_avs_apdo_src_peak_current(u32 pdo)
+{
+	return FIELD_GET(PDO_EPR_AVS_APDO_PEAK_CURRENT, pdo);
+}
+
+static inline unsigned int pdo_epr_avs_apdo_min_voltage_mv(u32 pdo)
+{
+	return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100;
+}
+
+static inline unsigned int pdo_epr_avs_apdo_max_voltage_mv(u32 pdo)
+{
+	return FIELD_GET(PDO_EPR_AVS_APDO_MAX_VOLT, pdo) * 100; /* 100mV units -> mV */
+}
+
+static inline unsigned int pdo_epr_avs_apdo_pdp_w(u32 pdo)
+{
+	return FIELD_GET(PDO_EPR_AVS_APDO_PDP, pdo);
+}
+
+static inline unsigned int pdo_spr_avs_apdo_src_peak_current(u32 pdo)
+{
+	return FIELD_GET(PDO_SPR_AVS_APDO_PEAK_CURRENT, pdo);
+}
+
+static inline unsigned int pdo_spr_avs_apdo_9v_to_15v_max_current_ma(u32 pdo)
+{
+	return FIELD_GET(PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR, pdo) * 10;
+}
+
+static inline unsigned int pdo_spr_avs_apdo_15v_to_20v_max_current_ma(u32 pdo)
+{
+	return FIELD_GET(PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR, pdo) * 10;
+}
+
 /* RDO: Request Data Object */
 #define RDO_OBJ_POS_SHIFT	28
 #define RDO_OBJ_POS_MASK	0x7
-- 
cgit v1.2.3


From 832c8d3fce77cf03cc225fc555c1bffa1c547ba1 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Date: Tue, 14 Oct 2025 18:06:47 +0200
Subject: usb: typec: ps883x: Add USB4 mode and TBT3 altmode support

This chip can do more than the driver currently describes. Add
support for configuring it for various flavors of TBT3/USB4 operation.

Reviewed-by: Jack Pham <jack.pham@oss.qualcomm.com>
Signed-off-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://patch.msgid.link/20251014-topic-ps883x_usb4-v1-3-e6adb1a4296e@oss.qualcomm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/mux/ps883x.c | 29 +++++++++++++++++++++++++++++
 include/linux/usb/typec_tbt.h  |  1 +
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/typec/mux/ps883x.c b/drivers/usb/typec/mux/ps883x.c
index 72f1e737ca4b..7c61629b36d6 100644
--- a/drivers/usb/typec/mux/ps883x.c
+++ b/drivers/usb/typec/mux/ps883x.c
@@ -14,15 +14,18 @@
 #include <linux/mutex.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
+#include <linux/usb/pd.h>
 #include <linux/usb/typec_altmode.h>
 #include <linux/usb/typec_dp.h>
 #include <linux/usb/typec_mux.h>
 #include <linux/usb/typec_retimer.h>
+#include <linux/usb/typec_tbt.h>
 
 #define REG_USB_PORT_CONN_STATUS_0		0x00
 
 #define CONN_STATUS_0_CONNECTION_PRESENT	BIT(0)
 #define CONN_STATUS_0_ORIENTATION_REVERSED	BIT(1)
+#define CONN_STATUS_0_ACTIVE_CABLE		BIT(2)
 #define CONN_STATUS_0_USB_3_1_CONNECTED		BIT(5)
 
 #define REG_USB_PORT_CONN_STATUS_1		0x01
@@ -34,6 +37,10 @@
 
 #define REG_USB_PORT_CONN_STATUS_2		0x02
 
+#define CONN_STATUS_2_TBT_CONNECTED		BIT(0)
+#define CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT	BIT(4)
+#define CONN_STATUS_2_USB4_CONNECTED		BIT(7)
+
 struct ps883x_retimer {
 	struct i2c_client *client;
 	struct gpio_desc *reset_gpio;
@@ -95,6 +102,8 @@ static int ps883x_configure(struct ps883x_retimer *retimer, int cfg0,
 
 static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state *state)
 {
+	struct typec_thunderbolt_data *tb_data;
+	const struct enter_usb_data *eudo_data;
 	int cfg0 = CONN_STATUS_0_CONNECTION_PRESENT;
 	int cfg1 = 0x00;
 	int cfg2 = 0x00;
@@ -120,6 +129,18 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state
 				break;
 			}
 			break;
+		case USB_TYPEC_TBT_SID:
+			tb_data = state->data;
+
+			/* Unconditional */
+			cfg2 |= CONN_STATUS_2_TBT_CONNECTED;
+
+			if (tb_data->cable_mode & TBT_CABLE_ACTIVE_PASSIVE)
+				cfg0 |= CONN_STATUS_0_ACTIVE_CABLE;
+
+			if (tb_data->enter_vdo & TBT_ENTER_MODE_UNI_DIR_LSRX)
+				cfg2 |= CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT;
+			break;
 		default:
 			dev_err(&retimer->client->dev, "Got unsupported SID: 0x%x\n",
 				state->alt->svid);
@@ -135,6 +156,14 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state
 		case TYPEC_MODE_USB3:
 			cfg0 |= CONN_STATUS_0_USB_3_1_CONNECTED;
 			break;
+		case TYPEC_MODE_USB4:
+			eudo_data = state->data;
+
+			cfg2 |= CONN_STATUS_2_USB4_CONNECTED;
+
+			if (FIELD_GET(EUDO_CABLE_TYPE_MASK, eudo_data->eudo) != EUDO_CABLE_TYPE_PASSIVE)
+				cfg0 |= CONN_STATUS_0_ACTIVE_CABLE;
+			break;
 		default:
 			dev_err(&retimer->client->dev, "Got unsupported mode: %lu\n",
 				state->mode);
diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h
index 55dcea12082c..0b570f1b8bc8 100644
--- a/include/linux/usb/typec_tbt.h
+++ b/include/linux/usb/typec_tbt.h
@@ -55,6 +55,7 @@ struct typec_thunderbolt_data {
 
 /* TBT3 Device Enter Mode VDO bits */
 #define TBT_ENTER_MODE_CABLE_SPEED(s)	TBT_SET_CABLE_SPEED(s)
+#define TBT_ENTER_MODE_UNI_DIR_LSRX	BIT(23)
 #define TBT_ENTER_MODE_ACTIVE_CABLE	BIT(24)
 
 #endif /* __USB_TYPEC_TBT_H */
-- 
cgit v1.2.3


From 203dfbda03540f9a99341144a24877ee8b352189 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Thu, 25 Sep 2025 16:31:12 +0200
Subject: dt-bindings: power: Add support for MT8196 power controllers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the power controllers found in the MediaTek MT8196
Chromebook SoC.

This chip has three power controllers, two of which are located in the
SCP subsystems (where one can be directly controlled and the other
can be controlled only through the HW Voter IP), and one located
in the Multimedia HFRP subsystem, controllable only through the HW
Voter IP.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 .../bindings/power/mediatek,power-controller.yaml  |  4 ++
 include/dt-bindings/power/mediatek,mt8196-power.h  | 58 ++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 include/dt-bindings/power/mediatek,mt8196-power.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
index 500d98921581..f8a13928f615 100644
--- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
+++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
@@ -33,6 +33,9 @@ properties:
       - mediatek,mt8188-power-controller
       - mediatek,mt8192-power-controller
       - mediatek,mt8195-power-controller
+      - mediatek,mt8196-hwv-hfrp-power-controller
+      - mediatek,mt8196-hwv-scp-power-controller
+      - mediatek,mt8196-power-controller
       - mediatek,mt8365-power-controller
 
   '#power-domain-cells':
@@ -157,6 +160,7 @@ allOf:
           contains:
             enum:
               - mediatek,mt8183-power-controller
+              - mediatek,mt8196-power-controller
     then:
       properties:
         access-controllers:
diff --git a/include/dt-bindings/power/mediatek,mt8196-power.h b/include/dt-bindings/power/mediatek,mt8196-power.h
new file mode 100644
index 000000000000..0f622a93c807
--- /dev/null
+++ b/include/dt-bindings/power/mediatek,mt8196-power.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Copyright (c) 2025 Collabora Ltd
+ *                    AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef _DT_BINDINGS_POWER_MT8196_POWER_H
+#define _DT_BINDINGS_POWER_MT8196_POWER_H
+
+/* SCPSYS Secure Power Manager - Direct Control */
+#define MT8196_POWER_DOMAIN_MD				0
+#define MT8196_POWER_DOMAIN_CONN			1
+#define MT8196_POWER_DOMAIN_SSUSB_P0			2
+#define MT8196_POWER_DOMAIN_SSUSB_DP_PHY_P0		3
+#define MT8196_POWER_DOMAIN_SSUSB_P1			4
+#define MT8196_POWER_DOMAIN_SSUSB_P23			5
+#define MT8196_POWER_DOMAIN_SSUSB_PHY_P2		6
+#define MT8196_POWER_DOMAIN_PEXTP_MAC0			7
+#define MT8196_POWER_DOMAIN_PEXTP_MAC1			8
+#define MT8196_POWER_DOMAIN_PEXTP_MAC2			9
+#define MT8196_POWER_DOMAIN_PEXTP_PHY0			10
+#define MT8196_POWER_DOMAIN_PEXTP_PHY1			11
+#define MT8196_POWER_DOMAIN_PEXTP_PHY2			12
+#define MT8196_POWER_DOMAIN_AUDIO			13
+#define MT8196_POWER_DOMAIN_ADSP_TOP_DORMANT		14
+#define MT8196_POWER_DOMAIN_ADSP_INFRA			15
+#define MT8196_POWER_DOMAIN_ADSP_AO			16
+
+/* SCPSYS Secure Power Manager - HW Voter */
+#define MT8196_POWER_DOMAIN_MM_PROC_DORMANT		0
+#define MT8196_POWER_DOMAIN_SSR				1
+
+/* HFRPSYS MultiMedia Power Control (MMPC) - HW Voter */
+#define MT8196_POWER_DOMAIN_VDE0			0
+#define MT8196_POWER_DOMAIN_VDE1			1
+#define MT8196_POWER_DOMAIN_VDE_VCORE0			2
+#define MT8196_POWER_DOMAIN_VEN0			3
+#define MT8196_POWER_DOMAIN_VEN1			4
+#define MT8196_POWER_DOMAIN_VEN2			5
+#define MT8196_POWER_DOMAIN_DISP_VCORE			6
+#define MT8196_POWER_DOMAIN_DIS0_DORMANT		7
+#define MT8196_POWER_DOMAIN_DIS1_DORMANT		8
+#define MT8196_POWER_DOMAIN_OVL0_DORMANT		9
+#define MT8196_POWER_DOMAIN_OVL1_DORMANT		10
+#define MT8196_POWER_DOMAIN_DISP_EDPTX_DORMANT		11
+#define MT8196_POWER_DOMAIN_DISP_DPTX_DORMANT		12
+#define MT8196_POWER_DOMAIN_MML0_SHUTDOWN		13
+#define MT8196_POWER_DOMAIN_MML1_SHUTDOWN		14
+#define MT8196_POWER_DOMAIN_MM_INFRA0			15
+#define MT8196_POWER_DOMAIN_MM_INFRA1			16
+#define MT8196_POWER_DOMAIN_MM_INFRA_AO			17
+#define MT8196_POWER_DOMAIN_CSI_BS_RX			18
+#define MT8196_POWER_DOMAIN_CSI_LS_RX			19
+#define MT8196_POWER_DOMAIN_DSI_PHY0			20
+#define MT8196_POWER_DOMAIN_DSI_PHY1			21
+#define MT8196_POWER_DOMAIN_DSI_PHY2			22
+
+#endif /* _DT_BINDINGS_POWER_MT8196_POWER_H */
-- 
cgit v1.2.3


From 295926ef36bb83d997f9c897b67fd1a0671db52e Mon Sep 17 00:00:00 2001
From: Finley Xiao <finley.xiao@rock-chips.com>
Date: Fri, 17 Oct 2025 17:38:33 +0800
Subject: dt-bindings: power: rockchip: Add support for RV1126B

Add power domain IDs for RV1126B SoC.
Add a new compatible because register fields have changed.

Signed-off-by: Finley Xiao <finley.xiao@rock-chips.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 .../bindings/power/rockchip,power-controller.yaml       |  2 ++
 .../power/rockchip,rv1126b-power-controller.h           | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 include/dt-bindings/power/rockchip,rv1126b-power-controller.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml b/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
index a884e49c995f..b41db576f95d 100644
--- a/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
+++ b/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
@@ -46,6 +46,7 @@ properties:
       - rockchip,rk3576-power-controller
       - rockchip,rk3588-power-controller
       - rockchip,rv1126-power-controller
+      - rockchip,rv1126b-power-controller
 
   "#power-domain-cells":
     const: 1
@@ -126,6 +127,7 @@ $defs:
           "include/dt-bindings/power/rk3568-power.h"
           "include/dt-bindings/power/rk3588-power.h"
           "include/dt-bindings/power/rockchip,rv1126-power.h"
+          "include/dt-bindings/power/rockchip,rv1126b-power-controller.h"
 
       clocks:
         minItems: 1
diff --git a/include/dt-bindings/power/rockchip,rv1126b-power-controller.h b/include/dt-bindings/power/rockchip,rv1126b-power-controller.h
new file mode 100644
index 000000000000..48ea87a4423c
--- /dev/null
+++ b/include/dt-bindings/power/rockchip,rv1126b-power-controller.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */
+/*
+ * Copyright (c) 2024 Rockchip Electronics Co., Ltd.
+ * Author: Finley Xiao <finley.xiao@rock-chips.com>
+ */
+
+#ifndef __DT_BINDINGS_POWER_RV1126B_POWER_CONTROLLER_H__
+#define __DT_BINDINGS_POWER_RV1126B_POWER_CONTROLLER_H__
+
+/* VD_NPU */
+#define RV1126B_PD_NPU		0
+
+/* VD_LOGIC */
+#define RV1126B_PD_VDO		1
+#define RV1126B_PD_AIISP	2
+
+#endif
-- 
cgit v1.2.3


From 9025688bf6d427e553aca911308cd92e92634f51 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 20 Oct 2025 10:53:40 -0700
Subject: module: Fix device table module aliases

Commit 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from
__KBUILD_MODNAME") inadvertently broke module alias generation for
modules which rely on MODULE_DEVICE_TABLE().

It removed the "kmod_" prefix from __KBUILD_MODNAME, which caused
MODULE_DEVICE_TABLE() to generate a symbol name which no longer matched
the format expected by handle_moddevtable() in scripts/mod/file2alias.c.

As a result, modpost failed to find the device tables, leading to
missing module aliases.

Fix this by explicitly adding the "kmod_" string within the
MODULE_DEVICE_TABLE() macro itself, restoring the symbol name to the
format expected by file2alias.c.

Fixes: 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME")
Reported-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reported-by: Mark Brown <broonie@kernel.org>
Reported-by: Cosmin Tanislav <demonsingur@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Cosmin Tanislav <demonsingur@gmail.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Mark Brown <broonie@kernel.org>
Tested-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Link: https://patch.msgid.link/e52ee3edf32874da645a9e037a7d77c69893a22a.1760982784.git.jpoimboe@kernel.org
---
 include/linux/module.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index e135cc79acee..d80c3ea57472 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name);
  */
 #define __mod_device_table(type, name)	\
 	__PASTE(__mod_device_table__,	\
+	__PASTE(kmod_,			\
 	__PASTE(__KBUILD_MODNAME,	\
 	__PASTE(__,			\
 	__PASTE(type,			\
-	__PASTE(__, name)))))
+	__PASTE(__, name))))))
 
 /* Creates an alias so file2alias.c can find device table. */
 #define MODULE_DEVICE_TABLE(type, name)					\
-- 
cgit v1.2.3


From 1cba30bf9fdd6c982708f3587f609a30c370d889 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 16 Oct 2025 11:09:38 -0700
Subject: io_uring: add support for IORING_SETUP_SQE_MIXED

Normal rings support 64b SQEs for posting submissions, while certain
features require the ring to be configured with IORING_SETUP_SQE128, as
they need to convey more information per submission. This, in turn,
makes ALL the SQEs be 128b in size. This is somewhat wasteful and
inefficient, particularly when only certain SQEs need to be of the
bigger variant.

This adds support for setting up a ring with mixed SQE sizes, using
IORING_SETUP_SQE_MIXED. When setup in this mode, SQEs posted to the ring
may be either 64b or 128b in size. If a SQE is 128b in size, then opcode
will be set to a variant to indicate that this is the case. Any other
non-128b opcode will assume the SQ's default size.

SQEs on these types of mixed rings may also utilize NOP with skip
success set.  This can happen if the ring is one (small) SQE entry away
from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
set. The kernel will process this as a normal NOP and without posting a
CQE.

Signed-off-by: Keith Busch <kbusch@kernel.org>
[axboe: {} style fix and assign sqe before opcode read]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  8 ++++++++
 io_uring/fdinfo.c             | 34 +++++++++++++++++++++++++++-------
 io_uring/io_uring.c           | 37 +++++++++++++++++++++++++++++++++----
 io_uring/io_uring.h           | 14 ++------------
 io_uring/opdef.c              | 26 ++++++++++++++++++++++++++
 io_uring/opdef.h              |  2 ++
 io_uring/register.c           |  2 +-
 io_uring/uring_cmd.c          | 17 +++++++++++++++--
 8 files changed, 114 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..04797a9b76bc 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_CQE_MIXED		(1U << 18)
 
+/*
+ * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
+ * a 128b opcode.
+ */
+#define IORING_SETUP_SQE_MIXED		(1U << 19)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -295,6 +301,8 @@ enum io_uring_op {
 	IORING_OP_READV_FIXED,
 	IORING_OP_WRITEV_FIXED,
 	IORING_OP_PIPE,
+	IORING_OP_NOP128,
+	IORING_OP_URING_CMD128,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..1a806ad16840 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -14,6 +14,7 @@
 #include "fdinfo.h"
 #include "cancel.h"
 #include "rsrc.h"
+#include "opdef.h"
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
@@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
 	unsigned int sq_shift = 0;
-	unsigned int sq_entries;
 	int sq_pid = -1, sq_cpu = -1;
 	u64 sq_total_time = 0, sq_work_time = 0;
 	unsigned int i;
@@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "CqTail:\t%u\n", cq_tail);
 	seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
 	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
-	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
-	for (i = 0; i < sq_entries; i++) {
-		unsigned int entry = i + sq_head;
+	while (sq_head < sq_tail) {
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
+		bool sqe128 = false;
+		u8 opcode;
 
 		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
 			break;
-		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+		sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
+
 		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
+		opcode = READ_ONCE(sqe->opcode);
+		if (sq_shift) {
+			sqe128 = true;
+		} else if (io_issue_defs[opcode].is_128) {
+			if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
+				seq_printf(m,
+					"%5u: invalid sqe, 128B entry on non-mixed sq\n",
+					sq_idx);
+				break;
+			}
+			if ((++sq_head & sq_mask) == 0) {
+				seq_printf(m,
+					"%5u: corrupted sqe, wrapping 128B entry\n",
+					sq_idx);
+				break;
+			}
+			sqe128 = true;
+		}
 		seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
 			      "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
 			      "user_data:%llu",
-			   sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
+			   sq_idx, io_uring_get_opcode(opcode), sqe->fd,
 			   sqe->flags, (unsigned long long) sqe->off,
 			   (unsigned long long) sqe->addr, sqe->rw_flags,
 			   sqe->buf_index, sqe->user_data);
-		if (sq_shift) {
+		if (sqe128) {
 			u64 *sqeb = (void *) (sqe + 1);
 			int size = sizeof(struct io_uring_sqe) / sizeof(u64);
 			int j;
@@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 			}
 		}
 		seq_printf(m, "\n");
+		sq_head++;
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
 	while (cq_head < cq_tail) {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e4ede0bad36f..be44d636fe1f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2164,7 +2164,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
 }
 
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-		       const struct io_uring_sqe *sqe)
+		       const struct io_uring_sqe *sqe, unsigned int *left)
 	__must_hold(&ctx->uring_lock)
 {
 	const struct io_issue_def *def;
@@ -2190,6 +2190,24 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	opcode = array_index_nospec(opcode, IORING_OP_LAST);
 
 	def = &io_issue_defs[opcode];
+	if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
+		/*
+		 * A 128b op on a non-128b SQ requires mixed SQE support as
+		 * well as 2 contiguous entries.
+		 */
+		if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
+		    !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+			return io_init_fail_req(req, -EINVAL);
+		/*
+		 * A 128b operation on a mixed SQ uses two entries, so we have
+		 * to increment the head and cached refs, and decrement what's
+		 * left.
+		 */
+		current->io_uring->cached_refs++;
+		ctx->cached_sq_head++;
+		(*left)--;
+	}
+
 	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 		/* enforce forwards compatibility on users */
 		if (sqe_flags & ~SQE_VALID_FLAGS)
@@ -2299,13 +2317,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
 }
 
 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			 const struct io_uring_sqe *sqe)
+			 const struct io_uring_sqe *sqe, unsigned int *left)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
 	int ret;
 
-	ret = io_init_req(ctx, req, sqe);
+	ret = io_init_req(ctx, req, sqe, left);
 	if (unlikely(ret))
 		return io_submit_fail_init(sqe, req, ret);
 
@@ -2457,7 +2475,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		 * Continue submitting even for sqe failure if the
 		 * ring was setup with IORING_SETUP_SUBMIT_ALL
 		 */
-		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+		if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
 		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
 			left--;
 			break;
@@ -2802,6 +2820,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 		if (cq_entries < 2)
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_SQE_MIXED) {
+		if (sq_entries < 2)
+			return SIZE_MAX;
+	}
 
 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
@@ -3726,6 +3748,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
 	    (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
 		return -EINVAL;
+	/*
+	 * Nonsensical to ask for SQE128 and mixed SQE support, it's not
+	 * supported to post 64b SQEs on a ring setup with SQE128.
+	 */
+	if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
+	    (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 78777bf1ea4b..44b8091c7fcd 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,7 +54,8 @@
 			IORING_SETUP_REGISTERED_FD_ONLY |\
 			IORING_SETUP_NO_SQARRAY |\
 			IORING_SETUP_HYBRID_IOPOLL |\
-			IORING_SETUP_CQE_MIXED)
+			IORING_SETUP_CQE_MIXED |\
+			IORING_SETUP_SQE_MIXED)
 
 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
 			IORING_ENTER_SQ_WAKEUP |\
@@ -565,17 +566,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
 	io_req_task_work_add(req);
 }
 
-/*
- * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
- * slot.
- */
-static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
-{
-	if (ctx->flags & IORING_SETUP_SQE128)
-		return 2 * sizeof(struct io_uring_sqe);
-	return sizeof(struct io_uring_sqe);
-}
-
 static inline bool io_file_can_poll(struct io_kiocb *req)
 {
 	if (req->flags & REQ_F_CAN_POLL)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 932319633eac..df52d760240e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_pipe_prep,
 		.issue			= io_pipe,
 	},
+	[IORING_OP_NOP128] = {
+		.audit_skip		= 1,
+		.iopoll			= 1,
+		.is_128			= 1,
+		.prep			= io_nop_prep,
+		.issue			= io_nop,
+	},
+	[IORING_OP_URING_CMD128] = {
+		.buffer_select		= 1,
+		.needs_file		= 1,
+		.plug			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.is_128			= 1,
+		.async_size		= sizeof(struct io_async_cmd),
+		.prep			= io_uring_cmd_prep,
+		.issue			= io_uring_cmd,
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_PIPE] = {
 		.name			= "PIPE",
 	},
+	[IORING_OP_NOP128] = {
+		.name			= "NOP128",
+	},
+	[IORING_OP_URING_CMD128] = {
+		.name			= "URING_CMD128",
+		.sqe_copy		= io_uring_cmd_sqe_copy,
+		.cleanup		= io_uring_cmd_cleanup,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c2f0907ed78c..aa37846880ff 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,6 +27,8 @@ struct io_issue_def {
 	unsigned		iopoll_queue : 1;
 	/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
 	unsigned		vectored : 1;
+	/* set to 1 if this opcode uses 128b sqes in a mixed sq */
+	unsigned		is_128 : 1;
 
 	/* size of async data needed, if any */
 	unsigned short		async_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index 43eb02004824..1a3e05be6e7b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -394,7 +394,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
-			 IORING_SETUP_CQE_MIXED)
+			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
 
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 1225f8124e4b..9d67a2a721aa 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+/*
+ * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
+ * slot.
+ */
+static inline size_t uring_sqe_size(struct io_kiocb *req)
+{
+	if (req->ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
+		return 2 * sizeof(struct io_uring_sqe);
+	return sizeof(struct io_uring_sqe);
+}
+
 void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 	/* Should not happen, as REQ_F_SQE_COPIED covers this */
 	if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
 		return;
-	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
 	ioucmd->sqe = ac->sqes;
 }
 
@@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret)
 		return ret;
 
-	if (ctx->flags & IORING_SETUP_SQE128)
+	if (ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
 		issue_flags |= IO_URING_F_SQE128;
 	if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
 		issue_flags |= IO_URING_F_CQE32;
-- 
cgit v1.2.3


From 5c5028ee594ce5f907ca6ad1c32cca6a15098464 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 20 Oct 2025 13:47:15 -0700
Subject: block: rename min_segment_size

Despite its name, the block layer is fine with segments smaller than the
"min_segment_size" limit. The value is an optimization limit indicating
the largest segment that can be used without considering boundary
limits. Smaller segments can take a fast path, so give it a name that
reflects that: max_fast_segment_size.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c      | 2 +-
 block/blk-settings.c   | 4 ++--
 block/blk.h            | 2 +-
 include/linux/blkdev.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 37864c5d287e..c47d18587a0b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -336,7 +336,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 
 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
-		    bv.bv_offset + bv.bv_len <= lim->min_segment_size) {
+		    bv.bv_offset + bv.bv_len <= lim->max_fast_segment_size) {
 			nsegs++;
 			bytes += bv.bv_len;
 		} else {
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 54cffaae4df4..345b6a271cc3 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -457,12 +457,12 @@ int blk_validate_limits(struct queue_limits *lim)
 			return -EINVAL;
 	}
 
-	/* setup min segment size for building new segment in fast path */
+	/* setup max segment size for building new segment in fast path */
 	if (lim->seg_boundary_mask > lim->max_segment_size - 1)
 		seg_size = lim->max_segment_size;
 	else
 		seg_size = lim->seg_boundary_mask + 1;
-	lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE);
+	lim->max_fast_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE);
 
 	/*
 	 * We require drivers to at least do logical block aligned I/O, but
diff --git a/block/blk.h b/block/blk.h
index 170794632135..32a10024efba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -377,7 +377,7 @@ static inline bool bio_may_need_split(struct bio *bio,
 	if (bio->bi_vcnt != 1)
 		return true;
 	return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset >
-		lim->min_segment_size;
+		lim->max_fast_segment_size;
 }
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 70b671a9a7f7..99be263b31ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -378,7 +378,7 @@ struct queue_limits {
 	unsigned int		max_sectors;
 	unsigned int		max_user_sectors;
 	unsigned int		max_segment_size;
-	unsigned int		min_segment_size;
+	unsigned int		max_fast_segment_size;
 	unsigned int		physical_block_size;
 	unsigned int		logical_block_size;
 	unsigned int		alignment_offset;
-- 
cgit v1.2.3


From 159e85110891ebc12500d02d4bf214b1d203e305 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 1 Oct 2025 13:43:18 +0300
Subject: ACPI: property: Make acpi_get_next_subnode() static

acpi_get_next_subnode() is only used in drivers/acpi/property.c. Remove
its prototype from include/linux/acpi.h and make it static.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251001104320.1272752-2-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c |  7 ++++---
 include/linux/acpi.h    | 10 ----------
 2 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 43d5e457814e..dbf86bee62e1 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1329,13 +1329,14 @@ static int stop_on_next(struct acpi_device *adev, void *data)
 	return 0;
 }
 
-/**
+/*
  * acpi_get_next_subnode - Return the next child node handle for a fwnode
  * @fwnode: Firmware node to find the next child node for.
  * @child: Handle to one of the device's child nodes or a null handle.
  */
-struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
-					    struct fwnode_handle *child)
+static struct fwnode_handle *
+acpi_get_next_subnode(const struct fwnode_handle *fwnode,
+		      struct fwnode_handle *child)
 {
 	struct acpi_device *adev = to_acpi_device_node(fwnode);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..703323b9fe0c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1349,9 +1349,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 		       void **valptr);
 
-struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
-					    struct fwnode_handle *child);
-
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
 						 struct acpi_probe_entry *);
@@ -1450,13 +1447,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode,
 	return -ENXIO;
 }
 
-static inline struct fwnode_handle *
-acpi_get_next_subnode(const struct fwnode_handle *fwnode,
-		      struct fwnode_handle *child)
-{
-	return NULL;
-}
-
 static inline struct fwnode_handle *
 acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
 			     struct fwnode_handle *prev)
-- 
cgit v1.2.3


From 0d8627cc936de8ea04f3cc1e6921c63fb72cc199 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:06 +0200
Subject: blktrace: add definitions for blk_user_trace_setup2

Add definitions for a version 2 of the blk_user_trace_setup ioctl. This
new ioctl will enable a different struct layout of the binary data passed
to user-space when using a new version of the blktrace utility requesting
the new struct layout.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++
 include/uapi/linux/fs.h           |  1 +
 kernel/trace/blktrace.c           |  3 +++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 1bfb635e309b..a6958708d477 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -129,6 +129,7 @@ enum {
 };
 
 #define BLKTRACE_BDEV_SIZE	32
+#define BLKTRACE_BDEV_SIZE2	64
 
 /*
  * User setup structure passed with BLKTRACESETUP
@@ -143,4 +144,19 @@ struct blk_user_trace_setup {
 	__u32 pid;
 };
 
+/*
+ * User setup structure passed with BLKTRACESETUP2
+ */
+struct blk_user_trace_setup2 {
+	char name[BLKTRACE_BDEV_SIZE2];		/* output */
+	__u64 act_mask;				/* input */
+	__u32 buf_size;				/* input */
+	__u32 buf_nr;				/* input */
+	__u64 start_lba;
+	__u64 end_lba;
+	__u32 pid;
+	__u32 flags;		/* currently unused */
+	__u64 reserved[11];
+};
+
 #endif /* _UAPIBLKTRACE_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index beb4c2d1e41c..957ce3343a4f 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -300,6 +300,7 @@ struct file_attr {
 #define BLKGETDISKSEQ _IOR(0x12,128,__u64)
 /* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
 /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */
+#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index df90422ae613..c31b8f433116 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1601,6 +1601,9 @@ static int __init init_blk_tracer(void)
 		return 1;
 	}
 
+	BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
+		     __alignof__(long));
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 113cbd62824afdf62d2f3f092809cf37cc7f1dd8 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:07 +0200
Subject: blktrace: pass blk_user_trace2 to setup functions

Pass struct blk_user_trace_setup2 to blk_trace_setup_finalize(). This
prepares for the incoming extension of the blktrace protocol with a 64bit
act_mask.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h |  3 ++-
 kernel/trace/blktrace.c      | 31 ++++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 122c62e561fc..05c8754456aa 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -14,11 +14,12 @@
 #include <linux/sysfs.h>
 
 struct blk_trace {
+	int version;
 	int trace_state;
 	struct rchan *rchan;
 	unsigned long __percpu *sequence;
 	unsigned char __percpu *msg_data;
-	u16 act_mask;
+	u64 act_mask;
 	u64 start_lba;
 	u64 end_lba;
 	u32 pid;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c31b8f433116..d1532df84cc8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -597,11 +597,12 @@ err:
 }
 
 static void blk_trace_setup_finalize(struct request_queue *q,
-				     char *name, struct blk_trace *bt,
-				     struct blk_user_trace_setup *buts)
+				     char *name, int version,
+				     struct blk_trace *bt,
+				     struct blk_user_trace_setup2 *buts)
 
 {
-	strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
+	strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2);
 
 	/*
 	 * some device names have larger paths - convert the slashes
@@ -609,6 +610,7 @@ static void blk_trace_setup_finalize(struct request_queue *q,
 	 */
 	strreplace(buts->name, '/', '_');
 
+	bt->version = version;
 	bt->act_mask = buts->act_mask;
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
@@ -630,6 +632,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		    struct block_device *bdev,
 		    char __user *arg)
 {
+	struct blk_user_trace_setup2 buts2;
 	struct blk_user_trace_setup buts;
 	struct blk_trace *bt;
 	int ret;
@@ -641,6 +644,15 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!buts.buf_size || !buts.buf_nr)
 		return -EINVAL;
 
+	buts2 = (struct blk_user_trace_setup2) {
+		.act_mask = buts.act_mask,
+		.buf_size = buts.buf_size,
+		.buf_nr = buts.buf_nr,
+		.start_lba = buts.start_lba,
+		.end_lba = buts.end_lba,
+		.pid = buts.pid,
+	};
+
 	mutex_lock(&q->debugfs_mutex);
 	bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
 				     bdev);
@@ -648,7 +660,8 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		mutex_unlock(&q->debugfs_mutex);
 		return PTR_ERR(bt);
 	}
-	blk_trace_setup_finalize(q, name, bt, &buts);
+	blk_trace_setup_finalize(q, name, 1, bt, &buts2);
+	strcpy(buts.name, buts2.name);
 	mutex_unlock(&q->debugfs_mutex);
 
 	if (copy_to_user(arg, &buts, sizeof(buts))) {
@@ -664,7 +677,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 				  dev_t dev, struct block_device *bdev,
 				  char __user *arg)
 {
-	struct blk_user_trace_setup buts;
+	struct blk_user_trace_setup2 buts2;
 	struct compat_blk_user_trace_setup cbuts;
 	struct blk_trace *bt;
 
@@ -674,7 +687,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 	if (!cbuts.buf_size || !cbuts.buf_nr)
 		return -EINVAL;
 
-	buts = (struct blk_user_trace_setup) {
+	buts2 = (struct blk_user_trace_setup2) {
 		.act_mask = cbuts.act_mask,
 		.buf_size = cbuts.buf_size,
 		.buf_nr = cbuts.buf_nr,
@@ -684,16 +697,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 	};
 
 	mutex_lock(&q->debugfs_mutex);
-	bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
+	bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
 				     bdev);
 	if (IS_ERR(bt)) {
 		mutex_unlock(&q->debugfs_mutex);
 		return PTR_ERR(bt);
 	}
-	blk_trace_setup_finalize(q, name, bt, &buts);
+	blk_trace_setup_finalize(q, name, 1, bt, &buts2);
 	mutex_unlock(&q->debugfs_mutex);
 
-	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+	if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) {
 		blk_trace_remove(q);
 		return -EFAULT;
 	}
-- 
cgit v1.2.3


From c44347d606260f36a81f6d8415a5af33cb3015fa Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:08 +0200
Subject: blktrace: add definitions for struct blk_io_trace2

Add definitions for the extended version of the blktrace protocol using a
wider action type to be able to record new actions in the kernel.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++
 kernel/trace/blktrace.c           |  1 +
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index a6958708d477..9f9834d76e00 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -94,6 +94,7 @@ enum blktrace_notify {
 
 #define BLK_IO_TRACE_MAGIC	0x65617400
 #define BLK_IO_TRACE_VERSION	0x07
+#define BLK_IO_TRACE2_VERSION	0x08
 
 /*
  * The trace itself
@@ -113,6 +114,21 @@ struct blk_io_trace {
 	/* cgroup id will be stored here if exists */
 };
 
+struct blk_io_trace2 {
+	__u32 magic;		/* MAGIC << 8 | BLK_IO_TRACE2_VERSION */
+	__u32 sequence;		/* event number */
+	__u64 time;		/* in nanoseconds */
+	__u64 sector;		/* disk offset */
+	__u32 bytes;		/* transfer length */
+	__u32 pid;		/* who did it */
+	__u64 action;		/* what happened */
+	__u32 device;		/* device number */
+	__u32 cpu;		/* on what cpu did it happen */
+	__u16 error;		/* completion error */
+	__u16 pdu_len;		/* length of data after this trace */
+	__u8 pad[12];
+	/* cgroup id will be stored here if it exists */
+};
 /*
  * The remap event
  */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d1532df84cc8..185f19c9f772 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1616,6 +1616,7 @@ static int __init init_blk_tracer(void)
 
 	BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
 		     __alignof__(long));
+	BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long));
 
 	return 0;
 }
-- 
cgit v1.2.3


From f9ee38bbf70fb20584625849a253c8652176fa66 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:12 +0200
Subject: blktrace: add block trace commands for zone operations

Add block trace commands for zone operations. These commands can only be
handled with version 2 of the blktrace protocol. For version 1, warn if a
command that does not fit into the 16 bits reserved for the command in
this version is passed in.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h | 13 +++++++++++--
 kernel/trace/blktrace.c           | 29 +++++++++++++++++++++++++----
 2 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 9f9834d76e00..190a3c5ab0a0 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -26,11 +26,20 @@ enum blktrace_cat {
 	BLK_TC_DRV_DATA	= 1 << 14,	/* binary per-driver data */
 	BLK_TC_FUA	= 1 << 15,	/* fua requests */
 
-	BLK_TC_END	= 1 << 15,	/* we've run out of bits! */
+	BLK_TC_END_V1	= 1 << 15,	/* we've run out of bits! */
+
+	BLK_TC_ZONE_APPEND	= 1ull << 16,  	/* zone append */
+	BLK_TC_ZONE_RESET	= 1ull << 17,	/* zone reset */
+	BLK_TC_ZONE_RESET_ALL	= 1ull << 18,	/* zone reset all */
+	BLK_TC_ZONE_FINISH	= 1ull << 19,	/* zone finish */
+	BLK_TC_ZONE_OPEN	= 1ull << 20,	/* zone open */
+	BLK_TC_ZONE_CLOSE	= 1ull << 21,	/* zone close */
+
+	BLK_TC_END_V2		= 1ull << 21,
 };
 
 #define BLK_TC_SHIFT		(16)
-#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+#define BLK_TC_ACT(act)		((u64)(act) << BLK_TC_SHIFT)
 
 /*
  * Basic trace actions
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 49f73cb3cb33..fb5935885abc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -163,8 +163,8 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence,
 					     bytes, what, error, cgid, cgid_len,
 					     pdu_data, pdu_len);
 	return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes,
-				     lower_32_bits(what), error, cgid, cgid_len,
-				     pdu_data, pdu_len);
+				     what, error, cgid, cgid_len, pdu_data,
+				     pdu_len);
 }
 
 /*
@@ -342,10 +342,32 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	case REQ_OP_FLUSH:
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
 		break;
+	case REQ_OP_ZONE_APPEND:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND);
+		break;
+	case REQ_OP_ZONE_RESET:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_RESET);
+		break;
+	case REQ_OP_ZONE_RESET_ALL:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL);
+		break;
+	case REQ_OP_ZONE_FINISH:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH);
+		break;
+	case REQ_OP_ZONE_OPEN:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN);
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
+		break;
 	default:
 		break;
 	}
 
+	if (WARN_ON_ONCE(bt->version == 1 &&
+		     (what >> BLK_TC_SHIFT) > BLK_TC_END_V1))
+		return;
+
 	if (cgid)
 		what |= __BLK_TA_CGROUP;
 
@@ -386,8 +408,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	sequence = per_cpu_ptr(bt->sequence, cpu);
 	(*sequence)++;
 	relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes,
-			     lower_32_bits(what), error, cgid, cgid_len,
-			     pdu_data, pdu_len);
+			     what, error, cgid, cgid_len, pdu_data, pdu_len);
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From 1c164fcc1b08e75f1cad1532718f09cddc0ddebe Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:13 +0200
Subject: blktrace: expose ZONE APPEND completions to blktrace

Expose ZONE APPEND completions as a block trace completion action to
blktrace.

As tracing of zoned block commands needs the upper 32bit of the widened
64bit action, only add traces to blktrace if user-space has requested
version 2 of the blktrace protocol.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h |  3 +++
 kernel/trace/blktrace.c           | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 190a3c5ab0a0..289872e51fc5 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -97,6 +97,9 @@ enum blktrace_notify {
 #define BLK_TA_ABORT		(__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
 #define BLK_TA_DRV_DATA	(__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
 
+#define BLK_TA_ZONE_APPEND	(__BLK_TA_COMPLETE |\
+				 BLK_TC_ACT(BLK_TC_ZONE_APPEND))
+
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb5935885abc..c83577096607 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -978,6 +978,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
 			 blk_trace_request_get_cgid(rq));
 }
 
+/* Trace a zone append completion; only emitted for blktrace protocol v2. */
+static void blk_add_trace_zone_update_request(void *ignore, struct request *rq)
+{
+	struct blk_trace *bt;
+	bool skip;
+
+	rcu_read_lock();
+	bt = rcu_dereference(rq->q->blk_trace);
+	skip = likely(!bt) || bt->version < 2;
+	rcu_read_unlock();
+	if (skip)
+		return;
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND,
+			 blk_trace_request_get_cgid(rq));
+}
+
 /**
  * blk_add_trace_bio - Add a trace for a bio oriented action
  * @q:		queue the io is for
@@ -1208,6 +1224,9 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
 	WARN_ON(ret);
+	ret = register_trace_blk_zone_append_update_request_bio(
+		blk_add_trace_zone_update_request, NULL);
+	WARN_ON(ret);
 	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1227,6 +1246,8 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_split(blk_add_trace_split, NULL);
 	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
 	unregister_trace_block_plug(blk_add_trace_plug, NULL);
+	unregister_trace_blk_zone_append_update_request_bio(
+		blk_add_trace_zone_update_request, NULL);
 	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
 	unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
 	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
-- 
cgit v1.2.3


From 3f6722816a73e2017599d965683dbe71833afd7a Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 22 Oct 2025 13:41:14 +0200
Subject: blktrace: trace zone write plugging operations

Trace zone write plugging operations on block devices.

As tracing of zoned block commands needs the upper 32bit of the widened
64bit action, only add traces to blktrace if user-space has requested
version 2 of the blktrace protocol.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h |  5 +++++
 kernel/trace/blktrace.c           | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 289872e51fc5..30f3d2589365 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -62,6 +62,8 @@ enum blktrace_act {
 	__BLK_TA_REMAP,			/* bio was remapped */
 	__BLK_TA_ABORT,			/* request aborted */
 	__BLK_TA_DRV_DATA,		/* driver-specific binary data */
+	__BLK_TA_ZONE_PLUG,		/* zone write plug was plugged */
+	__BLK_TA_ZONE_UNPLUG,		/* zone write plug was unplugged */
 	__BLK_TA_CGROUP = 1 << 8,	/* from a cgroup*/
 };
 
@@ -99,6 +101,9 @@ enum blktrace_notify {
 
 #define BLK_TA_ZONE_APPEND	(__BLK_TA_COMPLETE |\
 				 BLK_TC_ACT(BLK_TC_ZONE_APPEND))
+#define BLK_TA_ZONE_PLUG	(__BLK_TA_ZONE_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_ZONE_UNPLUG	(__BLK_TA_ZONE_UNPLUG |\
+				 BLK_TC_ACT(BLK_TC_QUEUE))
 
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c83577096607..6bfe1b36a1d3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1084,6 +1084,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 	rcu_read_unlock();
 }
 
+/* Trace a zone write plug being plugged (BLK_TA_ZONE_PLUG). */
+static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q,
+				    unsigned int zno, sector_t sector,
+				    unsigned int sectors)
+{
+	struct blk_trace *bt;
+
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	/* Zone events are only emitted for blktrace protocol version 2. */
+	if (bt && bt->version >= 2)
+		__blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+				BLK_TA_ZONE_PLUG, 0, 0, NULL, 0);
+	rcu_read_unlock();
+}
+
+/* Trace a zone write plug being unplugged; blktrace protocol v2 only. */
+static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q,
+				      unsigned int zno, sector_t sector,
+				      unsigned int sectors)
+{
+	struct blk_trace *bt;
+
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	if (bt && bt->version >= 2)
+		__blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+				BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0);
+	rcu_read_unlock();
+}
+
 static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
 {
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
@@ -1227,6 +1258,12 @@ static void blk_register_tracepoints(void)
 	ret = register_trace_blk_zone_append_update_request_bio(
 		blk_add_trace_zone_update_request, NULL);
 	WARN_ON(ret);
+	ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug,
+						     NULL);
+	WARN_ON(ret);
+	ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug,
+						NULL);
+	WARN_ON(ret);
 	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1246,6 +1283,8 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_split(blk_add_trace_split, NULL);
 	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
 	unregister_trace_block_plug(blk_add_trace_plug, NULL);
+	unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL);
+	unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL);
 	unregister_trace_blk_zone_append_update_request_bio(
 		blk_add_trace_zone_update_request, NULL);
 	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
-- 
cgit v1.2.3


From cbe5aeedecc72314c3a8fd0d41d9b270f576aee1 Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Tue, 21 Oct 2025 07:09:05 +0900
Subject: PM: EM: Assign a unique ID when creating a performance domain

It is necessary to be able to refer to a specific performance domain
from userspace, for example when the energy model of a particular
performance domain is updated.

To this end, assign a unique ID to each performance domain to address it,
and manage them in a global linked list to look up a specific one by
matching ID. IDA is used for ID assignment, and the mutex is used to
protect the global list from concurrent access.

Note that the mutex (em_pd_list_mutex) is not supposed to be held while
holding em_pd_mutex, to avoid an ABBA deadlock.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/20251020220914.320832-2-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h |  4 ++++
 kernel/power/energy_model.c  | 30 +++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 61d50571ad88..43aa6153dc57 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -54,6 +54,8 @@ struct em_perf_table {
 /**
  * struct em_perf_domain - Performance domain
  * @em_table:		Pointer to the runtime modifiable em_perf_table
+ * @node:		node in em_pd_list (in energy_model.c)
+ * @id:			A unique ID number for each performance domain
  * @nr_perf_states:	Number of performance states
  * @min_perf_state:	Minimum allowed Performance State index
  * @max_perf_state:	Maximum allowed Performance State index
@@ -71,6 +73,8 @@ struct em_perf_table {
  */
 struct em_perf_domain {
 	struct em_perf_table __rcu *em_table;
+	struct list_head node;
+	int id;
 	int nr_perf_states;
 	int min_perf_state;
 	int max_perf_state;
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 5f17d2e8e954..2047b546ad11 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -23,6 +23,16 @@
  */
 static DEFINE_MUTEX(em_pd_mutex);
 
+/*
+ * Manage performance domains with IDs. One can iterate the performance domains
+ * through the list and pick one with their associated ID. The mutex serializes
+ * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
+ * taken to avoid potential deadlock.
+ */
+static DEFINE_IDA(em_pd_ida);
+static LIST_HEAD(em_pd_list);
+static DEFINE_MUTEX(em_pd_list_mutex);
+
 static void em_cpufreq_update_efficiencies(struct device *dev,
 					   struct em_perf_state *table);
 static void em_check_capacity_update(void);
@@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states,
 	struct em_perf_table *em_table;
 	struct em_perf_domain *pd;
 	struct device *cpu_dev;
-	int cpu, ret, num_cpus;
+	int cpu, ret, num_cpus, id;
 
 	if (_is_cpu_device(dev)) {
 		num_cpus = cpumask_weight(cpus);
@@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states,
 
 	pd->nr_perf_states = nr_states;
 
+	INIT_LIST_HEAD(&pd->node);
+	id = ida_alloc(&em_pd_ida, GFP_KERNEL);
+	if (id < 0) {
+		kfree(pd);	/* free_pd would also ida_free() this invalid ID */
+		return -ENOMEM;
+	}
+	pd->id = id;
 	em_table = em_table_alloc(pd);
 	if (!em_table)
 		goto free_pd;
@@ -444,6 +461,7 @@ free_pd_table:
 	kfree(em_table);
 free_pd:
 	kfree(pd);
+	ida_free(&em_pd_ida, id);
 	return -EINVAL;
 }
 
@@ -660,6 +678,14 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
 unlock:
 	mutex_unlock(&em_pd_mutex);
 
+	/* Only a successfully created domain may be linked into em_pd_list. */
+	if (ret)
+		return ret;
+
+	mutex_lock(&em_pd_list_mutex);
+	list_add_tail(&dev->em_pd->node, &em_pd_list);
+	mutex_unlock(&em_pd_list_mutex);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
@@ -678,6 +700,10 @@ void em_dev_unregister_perf_domain(struct device *dev)
 	if (_is_cpu_device(dev))
 		return;
 
+	mutex_lock(&em_pd_list_mutex);
+	list_del_init(&dev->em_pd->node);
+	mutex_unlock(&em_pd_list_mutex);
+
 	/*
 	 * The mutex separates all register/unregister requests and protects
 	 * from potential clean-up/setup issues in the debugfs directories.
@@ -689,6 +715,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
 	em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
 						lockdep_is_held(&em_pd_mutex)));
 
+	ida_free(&em_pd_ida, dev->em_pd->id);
+
 	kfree(dev->em_pd);
 	dev->em_pd = NULL;
 	mutex_unlock(&em_pd_mutex);
-- 
cgit v1.2.3


From bd26631ccdfd11701fa29e665a7f041875ba9423 Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Tue, 21 Oct 2025 07:09:07 +0900
Subject: PM: EM: Add em.yaml and autogen files

Add a generic netlink spec in YAML format and autogenerate boilerplate
code using ynl-regen.sh to introduce a generic netlink for the energy
model. It allows a userspace program to read the performance domain and
its energy model. It notifies the userspace program when a performance
domain is created or deleted or its energy model is updated through a
multicast interface.

Specifically, it supports two commands:
  - EM_CMD_GET_PDS: Get the list of information for all performance
    domains.
  - EM_CMD_GET_PD_TABLE: Get the energy model table of a performance
    domain.

Also, it supports three notification events:
  - EM_CMD_PD_CREATED: When a performance domain is created.
  - EM_CMD_PD_DELETED: When a performance domain is deleted.
  - EM_CMD_PD_UPDATED: When the energy model table of a performance domain
    is updated.

Finally, update MAINTAINERS to include new files.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/20251020220914.320832-4-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/netlink/specs/em.yaml | 113 ++++++++++++++++++++++++++++++++++++
 MAINTAINERS                         |   3 +
 include/uapi/linux/energy_model.h   |  62 ++++++++++++++++++++
 kernel/power/em_netlink_autogen.c   |  48 +++++++++++++++
 kernel/power/em_netlink_autogen.h   |  23 ++++++++
 5 files changed, 249 insertions(+)
 create mode 100644 Documentation/netlink/specs/em.yaml
 create mode 100644 include/uapi/linux/energy_model.h
 create mode 100644 kernel/power/em_netlink_autogen.c
 create mode 100644 kernel/power/em_netlink_autogen.h

(limited to 'include')

diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml
new file mode 100644
index 000000000000..9905ca482325
--- /dev/null
+++ b/Documentation/netlink/specs/em.yaml
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+
+name: em
+
+doc: |
+  Energy model netlink interface to notify its changes.
+
+protocol: genetlink
+
+uapi-header: linux/energy_model.h
+
+attribute-sets:
+  -
+    name: pds
+    attributes:
+      -
+        name: pd
+        type: nest
+        nested-attributes: pd
+        multi-attr: true
+  -
+    name: pd
+    attributes:
+      -
+        name: pad
+        type: pad
+      -
+        name: pd-id
+        type: u32
+      -
+        name: flags
+        type: u64
+      -
+        name: cpus
+        type: string
+  -
+    name: pd-table
+    attributes:
+      -
+        name: pd-id
+        type: u32
+      -
+        name: ps
+        type: nest
+        nested-attributes: ps
+        multi-attr: true
+  -
+    name: ps
+    attributes:
+      -
+        name: pad
+        type: pad
+      -
+        name: performance
+        type: u64
+      -
+        name: frequency
+        type: u64
+      -
+        name: power
+        type: u64
+      -
+        name: cost
+        type: u64
+      -
+        name: flags
+        type: u64
+
+operations:
+  list:
+    -
+      name: get-pds
+      attribute-set: pds
+      doc: Get the list of information for all performance domains.
+      do:
+        reply:
+          attributes:
+            - pd
+    -
+      name: get-pd-table
+      attribute-set: pd-table
+      doc: Get the energy model table of a performance domain.
+      do:
+        request:
+          attributes:
+            - pd-id
+        reply:
+          attributes:
+            - pd-id
+            - ps
+    -
+      name: pd-created
+      doc: A performance domain is created.
+      notify: get-pd-table
+      mcgrp: event
+    -
+      name: pd-updated
+      doc: A performance domain is updated.
+      notify: get-pd-table
+      mcgrp: event
+    -
+      name: pd-deleted
+      doc: A performance domain is deleted.
+      attribute-set: pd-table
+      event:
+        attributes:
+          - pd-id
+      mcgrp: event
+
+mcast-groups:
+  list:
+    -
+      name: event
diff --git a/MAINTAINERS b/MAINTAINERS
index 545a4776795e..e6b3bab9dbeb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9181,6 +9181,9 @@ S:	Maintained
 F:	kernel/power/energy_model.c
 F:	include/linux/energy_model.h
 F:	Documentation/power/energy-model.rst
+F:	Documentation/netlink/specs/em.yaml
+F:	include/uapi/linux/energy_model.h
+F:	kernel/power/em_netlink_autogen.*
 
 EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER
 M:	Laurentiu Tudor <laurentiu.tudor@nxp.com>
diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h
new file mode 100644
index 000000000000..4ec4c0eabbbb
--- /dev/null
+++ b/include/uapi/linux/energy_model.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/em.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_ENERGY_MODEL_H
+#define _UAPI_LINUX_ENERGY_MODEL_H
+
+#define EM_FAMILY_NAME		"em"
+#define EM_FAMILY_VERSION	1
+
+enum {
+	EM_A_PDS_PD = 1,
+
+	__EM_A_PDS_MAX,
+	EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1)
+};
+
+enum {
+	EM_A_PD_PAD = 1,
+	EM_A_PD_PD_ID,
+	EM_A_PD_FLAGS,
+	EM_A_PD_CPUS,
+
+	__EM_A_PD_MAX,
+	EM_A_PD_MAX = (__EM_A_PD_MAX - 1)
+};
+
+enum {
+	EM_A_PD_TABLE_PD_ID = 1,
+	EM_A_PD_TABLE_PS,
+
+	__EM_A_PD_TABLE_MAX,
+	EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1)
+};
+
+enum {
+	EM_A_PS_PAD = 1,
+	EM_A_PS_PERFORMANCE,
+	EM_A_PS_FREQUENCY,
+	EM_A_PS_POWER,
+	EM_A_PS_COST,
+	EM_A_PS_FLAGS,
+
+	__EM_A_PS_MAX,
+	EM_A_PS_MAX = (__EM_A_PS_MAX - 1)
+};
+
+enum {
+	EM_CMD_GET_PDS = 1,
+	EM_CMD_GET_PD_TABLE,
+	EM_CMD_PD_CREATED,
+	EM_CMD_PD_UPDATED,
+	EM_CMD_PD_DELETED,
+
+	__EM_CMD_MAX,
+	EM_CMD_MAX = (__EM_CMD_MAX - 1)
+};
+
+#define EM_MCGRP_EVENT	"event"
+
+#endif /* _UAPI_LINUX_ENERGY_MODEL_H */
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
new file mode 100644
index 000000000000..a7a09ab1d1c2
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "em_netlink_autogen.h"
+
+#include <uapi/linux/energy_model.h>
+
+/* EM_CMD_GET_PD_TABLE - do */
+static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = {
+	[EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, },
+};
+
+/* Ops table for em */
+static const struct genl_split_ops em_nl_ops[] = {
+	{
+		.cmd	= EM_CMD_GET_PDS,
+		.doit	= em_nl_get_pds_doit,
+		.flags	= GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= EM_CMD_GET_PD_TABLE,
+		.doit		= em_nl_get_pd_table_doit,
+		.policy		= em_get_pd_table_nl_policy,
+		.maxattr	= EM_A_PD_TABLE_PD_ID,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+};
+
+static const struct genl_multicast_group em_nl_mcgrps[] = {
+	[EM_NLGRP_EVENT] = { "event", },
+};
+
+struct genl_family em_nl_family __ro_after_init = {
+	.name		= EM_FAMILY_NAME,
+	.version	= EM_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= em_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(em_nl_ops),
+	.mcgrps		= em_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(em_nl_mcgrps),
+};
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
new file mode 100644
index 000000000000..78ce609641f1
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_EM_GEN_H
+#define _LINUX_EM_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/energy_model.h>
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+	EM_NLGRP_EVENT,
+};
+
+extern struct genl_family em_nl_family;
+
+#endif /* _LINUX_EM_GEN_H */
-- 
cgit v1.2.3


From e090dc10c65eac35dcdb7c1b9cd6adcf0b590d3a Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Fri, 19 Sep 2025 11:57:23 +0200
Subject: dt-bindings: clock: dispcc-sm6350: Add MDSS_CORE & MDSS_RSCC resets

Add the indexes for two resets inside the dispcc on SM6350 SoC.

Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250919-sm6350-mdss-reset-v1-1-48dcac917c73@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,dispcc-sm6350.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,dispcc-sm6350.h b/include/dt-bindings/clock/qcom,dispcc-sm6350.h
index cb54aae2723e..61426a80e620 100644
--- a/include/dt-bindings/clock/qcom,dispcc-sm6350.h
+++ b/include/dt-bindings/clock/qcom,dispcc-sm6350.h
@@ -42,6 +42,10 @@
 #define DISP_CC_SLEEP_CLK			31
 #define DISP_CC_XO_CLK				32
 
+/* Resets */
+#define DISP_CC_MDSS_CORE_BCR			0
+#define DISP_CC_MDSS_RSCC_BCR			1
+
 /* GDSCs */
 #define MDSS_GDSC				0
 
-- 
cgit v1.2.3


From 2238840342af8e8d37a9355f0a2ad4285c32f854 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@mainlining.org>
Date: Fri, 19 Sep 2025 14:34:30 +0200
Subject: dt-bindings: clock: sm7150-dispcc: Add MDSS_CORE reset

Add the index for a reset inside the dispcc on SM7150 SoC.

Signed-off-by: Jens Reidel <adrian@mainlining.org>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250919-sm7150-dispcc-fixes-v1-1-308ad47c5fce@mainlining.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,sm7150-dispcc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,sm7150-dispcc.h b/include/dt-bindings/clock/qcom,sm7150-dispcc.h
index fc1fefe8fd72..1e4e6432d506 100644
--- a/include/dt-bindings/clock/qcom,sm7150-dispcc.h
+++ b/include/dt-bindings/clock/qcom,sm7150-dispcc.h
@@ -53,6 +53,9 @@
 #define DISPCC_SLEEP_CLK			41
 #define DISPCC_SLEEP_CLK_SRC			42
 
+/* DISPCC resets */
+#define DISPCC_MDSS_CORE_BCR			0
+
 /* DISPCC GDSCR */
 #define MDSS_GDSC				0
 
-- 
cgit v1.2.3


From 2985e76c66e15a6953c77d0b924e3a78d495208e Mon Sep 17 00:00:00 2001
From: Luo Jie <quic_luoj@quicinc.com>
Date: Tue, 14 Oct 2025 22:35:28 +0800
Subject: dt-bindings: interconnect: Add Qualcomm IPQ5424 NSSNOC IDs

Add the NSSNOC master/slave ids for Qualcomm IPQ5424 network subsystem
(NSS) hardware blocks. These will be used by the gcc-ipq5424 driver
that provides the interconnect services by using the icc-clk framework.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Luo Jie <quic_luoj@quicinc.com>
Acked-by: Georgi Djakov <djakov@kernel.org>
Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-3-081f4956be02@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/interconnect/qcom,ipq5424.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h
index afd7e0683a24..c5e0dec0b300 100644
--- a/include/dt-bindings/interconnect/qcom,ipq5424.h
+++ b/include/dt-bindings/interconnect/qcom,ipq5424.h
@@ -20,6 +20,26 @@
 #define SLAVE_CNOC_PCIE3		15
 #define MASTER_CNOC_USB			16
 #define SLAVE_CNOC_USB			17
+#define MASTER_NSSNOC_NSSCC		18
+#define SLAVE_NSSNOC_NSSCC		19
+#define MASTER_NSSNOC_SNOC_0		20
+#define SLAVE_NSSNOC_SNOC_0		21
+#define MASTER_NSSNOC_SNOC_1		22
+#define SLAVE_NSSNOC_SNOC_1		23
+#define MASTER_NSSNOC_PCNOC_1		24
+#define SLAVE_NSSNOC_PCNOC_1		25
+#define MASTER_NSSNOC_QOSGEN_REF	26
+#define SLAVE_NSSNOC_QOSGEN_REF		27
+#define MASTER_NSSNOC_TIMEOUT_REF	28
+#define SLAVE_NSSNOC_TIMEOUT_REF	29
+#define MASTER_NSSNOC_XO_DCD		30
+#define SLAVE_NSSNOC_XO_DCD		31
+#define MASTER_NSSNOC_ATB		32
+#define SLAVE_NSSNOC_ATB		33
+#define MASTER_CNOC_LPASS_CFG		34
+#define SLAVE_CNOC_LPASS_CFG		35
+#define MASTER_SNOC_LPASS		36
+#define SLAVE_SNOC_LPASS		37
 
 #define MASTER_CPU			0
 #define SLAVE_L3			1
-- 
cgit v1.2.3


From 60c8b7569c10c4b2ad5645cd093ff4577487314b Mon Sep 17 00:00:00 2001
From: Luo Jie <quic_luoj@quicinc.com>
Date: Tue, 14 Oct 2025 22:35:30 +0800
Subject: dt-bindings: clock: gcc-ipq5424: Add definition for GPLL0_OUT_AUX

The GCC clock GPLL0_OUT_AUX is one of the source clocks for the IPQ5424
NSS clock controller.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Luo Jie <quic_luoj@quicinc.com>
Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-5-081f4956be02@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,ipq5424-gcc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,ipq5424-gcc.h b/include/dt-bindings/clock/qcom,ipq5424-gcc.h
index c15ad16923bd..3ae33a0fa002 100644
--- a/include/dt-bindings/clock/qcom,ipq5424-gcc.h
+++ b/include/dt-bindings/clock/qcom,ipq5424-gcc.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /*
  * Copyright (c) 2018,2020 The Linux Foundation. All rights reserved.
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
  */
 
 #ifndef _DT_BINDINGS_CLOCK_IPQ_GCC_IPQ5424_H
@@ -152,5 +152,6 @@
 #define GCC_PCIE3_RCHNG_CLK			143
 #define GCC_IM_SLEEP_CLK			144
 #define GCC_XO_CLK				145
+#define GPLL0_OUT_AUX				146
 
 #endif
-- 
cgit v1.2.3


From 06ac2566e73d9d9fa2be62315e182945f7934882 Mon Sep 17 00:00:00 2001
From: Luo Jie <quic_luoj@quicinc.com>
Date: Tue, 14 Oct 2025 22:35:32 +0800
Subject: dt-bindings: clock: qcom: Add NSS clock controller for IPQ5424 SoC

NSS clock controller provides the clocks and resets to the networking
blocks such as PPE (Packet Process Engine) and UNIPHY (PCS) on IPQ5424
devices.

Add support for the compatible string "qcom,ipq5424-nsscc" based on the
existing IPQ9574 NSS clock controller Device Tree binding. Additionally,
update the clock names for PPE and NSS for newer SoC additions like
IPQ5424 to use generic and reusable identifiers "nss" and "ppe" without
the clock rate suffix.

Also add master/slave ids for IPQ5424 networking interfaces, which are
used by the nss-ipq5424 driver for providing interconnect services using
the icc-clk framework.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luo Jie <quic_luoj@quicinc.com>
Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-7-081f4956be02@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,ipq9574-nsscc.yaml         | 62 ++++++++++++++++++---
 include/dt-bindings/clock/qcom,ipq5424-nsscc.h     | 65 ++++++++++++++++++++++
 include/dt-bindings/interconnect/qcom,ipq5424.h    | 13 +++++
 include/dt-bindings/reset/qcom,ipq5424-nsscc.h     | 46 +++++++++++++++
 4 files changed, 178 insertions(+), 8 deletions(-)
 create mode 100644 include/dt-bindings/clock/qcom,ipq5424-nsscc.h
 create mode 100644 include/dt-bindings/reset/qcom,ipq5424-nsscc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq9574-nsscc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq9574-nsscc.yaml
index 5d35925e60d0..7ff4ff3587ca 100644
--- a/Documentation/devicetree/bindings/clock/qcom,ipq9574-nsscc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq9574-nsscc.yaml
@@ -4,7 +4,7 @@
 $id: http://devicetree.org/schemas/clock/qcom,ipq9574-nsscc.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
-title: Qualcomm Networking Sub System Clock & Reset Controller on IPQ9574
+title: Qualcomm Networking Sub System Clock & Reset Controller on IPQ9574 and IPQ5424
 
 maintainers:
   - Bjorn Andersson <andersson@kernel.org>
@@ -12,21 +12,29 @@ maintainers:
 
 description: |
   Qualcomm networking sub system clock control module provides the clocks,
-  resets on IPQ9574
+  resets on IPQ9574 and IPQ5424
 
-  See also::
+  See also:
+    include/dt-bindings/clock/qcom,ipq5424-nsscc.h
     include/dt-bindings/clock/qcom,ipq9574-nsscc.h
+    include/dt-bindings/reset/qcom,ipq5424-nsscc.h
     include/dt-bindings/reset/qcom,ipq9574-nsscc.h
 
 properties:
   compatible:
-    const: qcom,ipq9574-nsscc
+    enum:
+      - qcom,ipq5424-nsscc
+      - qcom,ipq9574-nsscc
 
   clocks:
     items:
       - description: Board XO source
-      - description: CMN_PLL NSS 1200MHz (Bias PLL cc) clock source
-      - description: CMN_PLL PPE 353MHz (Bias PLL ubi nc) clock source
+      - description: CMN_PLL NSS (Bias PLL cc) clock source. This clock rate
+          can vary for different IPQ SoCs. For example, it is 1200 MHz on the
+          IPQ9574 and 300 MHz on the IPQ5424.
+      - description: CMN_PLL PPE (Bias PLL ubi nc) clock source. The clock
+          rate can vary for different IPQ SoCs. For example, it is 353 MHz
+          on the IPQ9574 and 375 MHz on the IPQ5424.
       - description: GCC GPLL0 OUT AUX clock source
       - description: Uniphy0 NSS Rx clock source
       - description: Uniphy0 NSS Tx clock source
@@ -42,8 +50,12 @@ properties:
   clock-names:
     items:
       - const: xo
-      - const: nss_1200
-      - const: ppe_353
+      - enum:
+          - nss_1200
+          - nss
+      - enum:
+          - ppe_353
+          - ppe
       - const: gpll0_out
       - const: uniphy0_rx
       - const: uniphy0_tx
@@ -60,6 +72,40 @@ required:
 
 allOf:
   - $ref: qcom,gcc.yaml#
+  - if:
+      properties:
+        compatible:
+          const: qcom,ipq9574-nsscc
+    then:
+      properties:
+        clock-names:
+          items:
+            - const: xo
+            - const: nss_1200
+            - const: ppe_353
+            - const: gpll0_out
+            - const: uniphy0_rx
+            - const: uniphy0_tx
+            - const: uniphy1_rx
+            - const: uniphy1_tx
+            - const: uniphy2_rx
+            - const: uniphy2_tx
+            - const: bus
+    else:
+      properties:
+        clock-names:
+          items:
+            - const: xo
+            - const: nss
+            - const: ppe
+            - const: gpll0_out
+            - const: uniphy0_rx
+            - const: uniphy0_tx
+            - const: uniphy1_rx
+            - const: uniphy1_tx
+            - const: uniphy2_rx
+            - const: uniphy2_tx
+            - const: bus
 
 unevaluatedProperties: false
 
diff --git a/include/dt-bindings/clock/qcom,ipq5424-nsscc.h b/include/dt-bindings/clock/qcom,ipq5424-nsscc.h
new file mode 100644
index 000000000000..eeae0dc38042
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,ipq5424-nsscc.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef _DT_BINDINGS_CLOCK_QCOM_IPQ5424_NSSCC_H
+#define _DT_BINDINGS_CLOCK_QCOM_IPQ5424_NSSCC_H
+
+/* NSS_CC clocks */
+#define NSS_CC_CE_APB_CLK					0
+#define NSS_CC_CE_AXI_CLK					1
+#define NSS_CC_CE_CLK_SRC					2
+#define NSS_CC_CFG_CLK_SRC					3
+#define NSS_CC_DEBUG_CLK					4
+#define NSS_CC_EIP_BFDCD_CLK_SRC				5
+#define NSS_CC_EIP_CLK						6
+#define NSS_CC_NSS_CSR_CLK					7
+#define NSS_CC_NSSNOC_CE_APB_CLK				8
+#define NSS_CC_NSSNOC_CE_AXI_CLK				9
+#define NSS_CC_NSSNOC_EIP_CLK					10
+#define NSS_CC_NSSNOC_NSS_CSR_CLK				11
+#define NSS_CC_NSSNOC_PPE_CFG_CLK				12
+#define NSS_CC_NSSNOC_PPE_CLK					13
+#define NSS_CC_PORT1_MAC_CLK					14
+#define NSS_CC_PORT1_RX_CLK					15
+#define NSS_CC_PORT1_RX_CLK_SRC					16
+#define NSS_CC_PORT1_RX_DIV_CLK_SRC				17
+#define NSS_CC_PORT1_TX_CLK					18
+#define NSS_CC_PORT1_TX_CLK_SRC					19
+#define NSS_CC_PORT1_TX_DIV_CLK_SRC				20
+#define NSS_CC_PORT2_MAC_CLK					21
+#define NSS_CC_PORT2_RX_CLK					22
+#define NSS_CC_PORT2_RX_CLK_SRC					23
+#define NSS_CC_PORT2_RX_DIV_CLK_SRC				24
+#define NSS_CC_PORT2_TX_CLK					25
+#define NSS_CC_PORT2_TX_CLK_SRC					26
+#define NSS_CC_PORT2_TX_DIV_CLK_SRC				27
+#define NSS_CC_PORT3_MAC_CLK					28
+#define NSS_CC_PORT3_RX_CLK					29
+#define NSS_CC_PORT3_RX_CLK_SRC					30
+#define NSS_CC_PORT3_RX_DIV_CLK_SRC				31
+#define NSS_CC_PORT3_TX_CLK					32
+#define NSS_CC_PORT3_TX_CLK_SRC					33
+#define NSS_CC_PORT3_TX_DIV_CLK_SRC				34
+#define NSS_CC_PPE_CLK_SRC					35
+#define NSS_CC_PPE_EDMA_CFG_CLK					36
+#define NSS_CC_PPE_EDMA_CLK					37
+#define NSS_CC_PPE_SWITCH_BTQ_CLK				38
+#define NSS_CC_PPE_SWITCH_CFG_CLK				39
+#define NSS_CC_PPE_SWITCH_CLK					40
+#define NSS_CC_PPE_SWITCH_IPE_CLK				41
+#define NSS_CC_UNIPHY_PORT1_RX_CLK				42
+#define NSS_CC_UNIPHY_PORT1_TX_CLK				43
+#define NSS_CC_UNIPHY_PORT2_RX_CLK				44
+#define NSS_CC_UNIPHY_PORT2_TX_CLK				45
+#define NSS_CC_UNIPHY_PORT3_RX_CLK				46
+#define NSS_CC_UNIPHY_PORT3_TX_CLK				47
+#define NSS_CC_XGMAC0_PTP_REF_CLK				48
+#define NSS_CC_XGMAC0_PTP_REF_DIV_CLK_SRC			49
+#define NSS_CC_XGMAC1_PTP_REF_CLK				50
+#define NSS_CC_XGMAC1_PTP_REF_DIV_CLK_SRC			51
+#define NSS_CC_XGMAC2_PTP_REF_CLK				52
+#define NSS_CC_XGMAC2_PTP_REF_DIV_CLK_SRC			53
+
+#endif
diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h
index c5e0dec0b300..07b786bee7d6 100644
--- a/include/dt-bindings/interconnect/qcom,ipq5424.h
+++ b/include/dt-bindings/interconnect/qcom,ipq5424.h
@@ -44,4 +44,17 @@
 #define MASTER_CPU			0
 #define SLAVE_L3			1
 
+#define MASTER_NSSNOC_PPE		0
+#define SLAVE_NSSNOC_PPE		1
+#define MASTER_NSSNOC_PPE_CFG		2
+#define SLAVE_NSSNOC_PPE_CFG		3
+#define MASTER_NSSNOC_NSS_CSR		4
+#define SLAVE_NSSNOC_NSS_CSR		5
+#define MASTER_NSSNOC_CE_AXI		6
+#define SLAVE_NSSNOC_CE_AXI		7
+#define MASTER_NSSNOC_CE_APB		8
+#define SLAVE_NSSNOC_CE_APB		9
+#define MASTER_NSSNOC_EIP		10
+#define SLAVE_NSSNOC_EIP		11
+
 #endif /* INTERCONNECT_QCOM_IPQ5424_H */
diff --git a/include/dt-bindings/reset/qcom,ipq5424-nsscc.h b/include/dt-bindings/reset/qcom,ipq5424-nsscc.h
new file mode 100644
index 000000000000..9627e3b0ad30
--- /dev/null
+++ b/include/dt-bindings/reset/qcom,ipq5424-nsscc.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef _DT_BINDINGS_RESET_QCOM_IPQ5424_NSSCC_H
+#define _DT_BINDINGS_RESET_QCOM_IPQ5424_NSSCC_H
+
+#define NSS_CC_CE_APB_CLK_ARES					0
+#define NSS_CC_CE_AXI_CLK_ARES					1
+#define NSS_CC_DEBUG_CLK_ARES					2
+#define NSS_CC_EIP_CLK_ARES					3
+#define NSS_CC_NSS_CSR_CLK_ARES					4
+#define NSS_CC_NSSNOC_CE_APB_CLK_ARES				5
+#define NSS_CC_NSSNOC_CE_AXI_CLK_ARES				6
+#define NSS_CC_NSSNOC_EIP_CLK_ARES				7
+#define NSS_CC_NSSNOC_NSS_CSR_CLK_ARES				8
+#define NSS_CC_NSSNOC_PPE_CLK_ARES				9
+#define NSS_CC_NSSNOC_PPE_CFG_CLK_ARES				10
+#define NSS_CC_PORT1_MAC_CLK_ARES				11
+#define NSS_CC_PORT1_RX_CLK_ARES				12
+#define NSS_CC_PORT1_TX_CLK_ARES				13
+#define NSS_CC_PORT2_MAC_CLK_ARES				14
+#define NSS_CC_PORT2_RX_CLK_ARES				15
+#define NSS_CC_PORT2_TX_CLK_ARES				16
+#define NSS_CC_PORT3_MAC_CLK_ARES				17
+#define NSS_CC_PORT3_RX_CLK_ARES				18
+#define NSS_CC_PORT3_TX_CLK_ARES				19
+#define NSS_CC_PPE_BCR						20
+#define NSS_CC_PPE_EDMA_CLK_ARES				21
+#define NSS_CC_PPE_EDMA_CFG_CLK_ARES				22
+#define NSS_CC_PPE_SWITCH_BTQ_CLK_ARES				23
+#define NSS_CC_PPE_SWITCH_CLK_ARES				24
+#define NSS_CC_PPE_SWITCH_CFG_CLK_ARES				25
+#define NSS_CC_PPE_SWITCH_IPE_CLK_ARES				26
+#define NSS_CC_UNIPHY_PORT1_RX_CLK_ARES				27
+#define NSS_CC_UNIPHY_PORT1_TX_CLK_ARES				28
+#define NSS_CC_UNIPHY_PORT2_RX_CLK_ARES				29
+#define NSS_CC_UNIPHY_PORT2_TX_CLK_ARES				30
+#define NSS_CC_UNIPHY_PORT3_RX_CLK_ARES				31
+#define NSS_CC_UNIPHY_PORT3_TX_CLK_ARES				32
+#define NSS_CC_XGMAC0_PTP_REF_CLK_ARES				33
+#define NSS_CC_XGMAC1_PTP_REF_CLK_ARES				34
+#define NSS_CC_XGMAC2_PTP_REF_CLK_ARES				35
+
+#endif
-- 
cgit v1.2.3


From c88b6ee3ba3c7bf6386ea0e6de8111acc3d832bc Mon Sep 17 00:00:00 2001
From: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Date: Wed, 24 Sep 2025 16:24:55 -0700
Subject: soc: qcom: llcc-qcom: Add support for Kaanapali

Add system cache table and configs for Kaanapali SoC.

Signed-off-by: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250924-knp-llcc-v1-2-ae6a016e5138@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/llcc-qcom.c       | 373 +++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/llcc-qcom.h |   7 +
 2 files changed, 380 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index 857ead56b37d..13e174267294 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -214,6 +214,364 @@ static const struct llcc_slice_config ipq5424_data[] =  {
 	},
 };
 
+static const struct llcc_slice_config kaanapali_data[] = {
+	{
+		.usecase_id = LLCC_CPUSS,
+		.slice_id = 1,
+		.max_cap = 5120,
+		.priority = 1,
+		.bonus_ways = 0xffffffff,
+		.activate_on_init = true,
+		.write_scid_en = true,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_VIDSC0,
+		.slice_id = 2,
+		.max_cap = 512,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_AUDIO,
+		.slice_id = 35,
+		.max_cap = 512,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_MDMHPGRW,
+		.slice_id = 25,
+		.max_cap = 1024,
+		.priority = 5,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CMPT,
+		.slice_id = 34,
+		.max_cap = 4096,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_GPUHTW,
+		.slice_id = 11,
+		.max_cap = 512,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_GPU,
+		.slice_id = 9,
+		.max_cap = 5632,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.write_scid_cacheable_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_MMUHWT,
+		.slice_id = 18,
+		.max_cap = 768,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.activate_on_init = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_DISP,
+		.slice_id = 16,
+		.max_cap = 7168,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.cache_mode = 2,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_MDMHPFX,
+		.slice_id = 24,
+		.max_cap = 1024,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_MDMPNG,
+		.slice_id = 27,
+		.max_cap = 256,
+		.priority = 5,
+		.bonus_ways = 0xfffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CVP,
+		.slice_id = 8,
+		.max_cap = 800,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_MODPE,
+		.slice_id = 29,
+		.max_cap = 256,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xf0000000,
+		.mru_uncap_en = true,
+		.alloc_oneway_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_WRCACHE,
+		.slice_id = 31,
+		.max_cap = 512,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.activate_on_init = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CVPFW,
+		.slice_id = 19,
+		.max_cap = 512,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CPUMTE,
+		.slice_id = 7,
+		.max_cap = 256,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CMPTHCP,
+		.slice_id = 15,
+		.max_cap = 256,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_LCPDARE,
+		.slice_id = 30,
+		.max_cap = 128,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.activate_on_init = true,
+		.mru_uncap_en = true,
+		.alloc_oneway_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_AENPU,
+		.slice_id = 3,
+		.max_cap = 3072,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.cache_mode = 2,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_ISLAND1,
+		.slice_id = 12,
+		.max_cap = 7936,
+		.priority = 7,
+		.fixed_size = true,
+		.bonus_ways = 0x7fffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_DISP_WB,
+		.slice_id = 23,
+		.max_cap = 512,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_VIDVSP,
+		.slice_id = 4,
+		.max_cap = 256,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_VIDDEC,
+		.slice_id = 5,
+		.max_cap = 512,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.cache_mode = 2,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CAMOFE,
+		.slice_id = 33,
+		.max_cap = 6144,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CAMRTIP,
+		.slice_id = 13,
+		.max_cap = 6144,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CAMRTRF,
+		.slice_id = 10,
+		.max_cap = 3584,
+		.priority = 3,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CAMSRTRF,
+		.slice_id = 21,
+		.max_cap = 6144,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_VIDEO_APV,
+		.slice_id = 6,
+		.max_cap = 768,
+		.priority = 4,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_COMPUTE1,
+		.slice_id = 22,
+		.max_cap = 4096,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CPUSS_OPP,
+		.slice_id = 32,
+		.max_cap = 0,
+		.priority = 0,
+		.fixed_size = true,
+		.bonus_ways = 0,
+		.activate_on_init = true,
+		.write_scid_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CPUSSMPAM,
+		.slice_id = 17,
+		.max_cap = 2048,
+		.priority = 1,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.activate_on_init = true,
+		.write_scid_en = true,
+		.stale_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_CAM_IPE_STROV,
+		.slice_id = 14,
+		.max_cap = 400,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CAM_OFE_STROV,
+		.slice_id = 20,
+		.max_cap = 400,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xffffffff,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+		.parent_slice_id = 33,
+	}, {
+		.usecase_id = LLCC_CPUSS_HEU,
+		.slice_id = 28,
+		.max_cap = 0,
+		.priority = 0,
+		.fixed_size = true,
+		.bonus_ways = 0,
+		.mru_uncap_en = true,
+		.ovcap_en = true,
+		.vict_prio = true,
+	}, {
+		.usecase_id = LLCC_MDM_PNG_FIXED,
+		.slice_id = 26,
+		.max_cap = 256,
+		.priority = 5,
+		.fixed_size = true,
+		.bonus_ways = 0xff000000,
+		.activate_on_init = true,
+		.write_scid_en = true,
+		.mru_uncap_en = true,
+		.vict_prio = true,
+	   },
+};
+
 static const struct llcc_slice_config sa8775p_data[] =  {
 	{
 		.usecase_id = LLCC_CPUSS,
@@ -3505,6 +3863,15 @@ static const u32 llcc_v6_reg_offset[] = {
 	[LLCC_TRP_WRS_CACHEABLE_EN]	= 0x00042088,
 };
 
+static const struct qcom_llcc_config kaanapali_cfg[] = {
+	{
+		.sct_data	= kaanapali_data,
+		.size		= ARRAY_SIZE(kaanapali_data),
+		.reg_offset	= llcc_v6_reg_offset,
+		.edac_reg_offset = &llcc_v6_edac_reg_offset,
+	},
+};
+
 static const struct qcom_llcc_config qcs615_cfg[] = {
 	{
 		.sct_data	= qcs615_data,
@@ -3731,6 +4098,11 @@ static const struct qcom_llcc_config x1e80100_cfg[] = {
 	},
 };
 
+static const struct qcom_sct_config kaanapali_cfgs = {
+	.llcc_config	= kaanapali_cfg,
+	.num_config	= ARRAY_SIZE(kaanapali_cfg),
+};
+
 static const struct qcom_sct_config qcs615_cfgs = {
 	.llcc_config	= qcs615_cfg,
 	.num_config	= ARRAY_SIZE(qcs615_cfg),
@@ -4570,6 +4942,7 @@ err:
 
 static const struct of_device_id qcom_llcc_of_match[] = {
 	{ .compatible = "qcom,ipq5424-llcc", .data = &ipq5424_cfgs},
+	{ .compatible = "qcom,kaanapali-llcc", .data = &kaanapali_cfgs},
 	{ .compatible = "qcom,qcs615-llcc", .data = &qcs615_cfgs},
 	{ .compatible = "qcom,qcs8300-llcc", .data = &qcs8300_cfgs},
 	{ .compatible = "qcom,qdu1000-llcc", .data = &qdu1000_cfgs},
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 7a69210a250c..0287f9182c4d 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -74,7 +74,14 @@
 #define LLCC_CAMSRTIP	 73
 #define LLCC_CAMRTRF	 74
 #define LLCC_CAMSRTRF	 75
+#define LLCC_VIDEO_APV	 83
+#define LLCC_COMPUTE1	 87
+#define LLCC_CPUSS_OPP	 88
 #define LLCC_CPUSSMPAM	 89
+#define LLCC_CAM_IPE_STROV	 92
+#define LLCC_CAM_OFE_STROV	 93
+#define LLCC_CPUSS_HEU	 94
+#define LLCC_MDM_PNG_FIXED	 100
 
 /**
  * struct llcc_slice_desc - Cache slice descriptor
-- 
cgit v1.2.3


From 67a4b6a89b99aff0883114e4ecba4b11aedc29a5 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 6 Feb 2025 16:44:10 -0500
Subject: lsm: split the init code out into lsm_init.c

Continue to pull code out of security/security.c to help improve
readability by pulling all of the LSM framework initialization
code out into a new file.

No code changes.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h |   3 +-
 security/Makefile         |   2 +-
 security/lsm.h            |  22 ++
 security/lsm_init.c       | 543 +++++++++++++++++++++++++++++++++++++++++
 security/security.c       | 597 +++-------------------------------------------
 5 files changed, 601 insertions(+), 566 deletions(-)
 create mode 100644 security/lsm.h
 create mode 100644 security/lsm_init.c

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 79ec5a2bdcca..0112926ed923 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -170,11 +170,10 @@ struct lsm_info {
 		__used __section(".early_lsm_info.init")		\
 		__aligned(sizeof(unsigned long))
 
+
 /* DO NOT tamper with these variables outside of the LSM framework */
 extern char *lsm_names;
 extern struct lsm_static_calls_table static_calls_table __ro_after_init;
-extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
-extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
 
 /**
  * lsm_get_xattr_slot - Return the next available slot and increment the index
diff --git a/security/Makefile b/security/Makefile
index 14d87847bce8..4601230ba442 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_SECURITY) 			+= lsm_syscalls.o
 obj-$(CONFIG_MMU)			+= min_addr.o
 
 # Object file lists
-obj-$(CONFIG_SECURITY)			+= security.o lsm_notifier.o
+obj-$(CONFIG_SECURITY)			+= security.o lsm_notifier.o lsm_init.o
 obj-$(CONFIG_SECURITYFS)		+= inode.o
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/
diff --git a/security/lsm.h b/security/lsm.h
new file mode 100644
index 000000000000..0e1731bad4a7
--- /dev/null
+++ b/security/lsm.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * LSM functions
+ */
+
+#ifndef _LSM_H_
+#define _LSM_H_
+
+#include <linux/lsm_hooks.h>
+
+/* LSM blob configuration */
+extern struct lsm_blob_sizes blob_sizes;
+
+/* LSM blob caches */
+extern struct kmem_cache *lsm_file_cache;
+extern struct kmem_cache *lsm_inode_cache;
+
+/* LSM blob allocators */
+int lsm_cred_alloc(struct cred *cred, gfp_t gfp);
+int lsm_task_alloc(struct task_struct *task);
+
+#endif /* _LSM_H_ */
diff --git a/security/lsm_init.c b/security/lsm_init.c
new file mode 100644
index 000000000000..124213b906af
--- /dev/null
+++ b/security/lsm_init.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * LSM initialization functions
+ */
+
+#define pr_fmt(fmt) "LSM: " fmt
+
+#include <linux/init.h>
+#include <linux/lsm_hooks.h>
+
+#include "lsm.h"
+
+char *lsm_names;
+
+/* Pointers to LSM sections defined in include/asm-generic/vmlinux.lds.h */
+extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
+extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
+
+/* Boot-time LSM user choice */
+static __initconst const char *const builtin_lsm_order = CONFIG_LSM;
+static __initdata const char *chosen_lsm_order;
+static __initdata const char *chosen_major_lsm;
+
+/* Ordered list of LSMs to initialize. */
+static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1];
+static __initdata struct lsm_info *exclusive;
+
+static __initdata bool debug;
+#define init_debug(...)							\
+	do {								\
+		if (debug)						\
+			pr_info(__VA_ARGS__);				\
+	} while (0)
+
+static int lsm_append(const char *new, char **result);
+
+/* Save user chosen LSM */
+static int __init choose_major_lsm(char *str)
+{
+	chosen_major_lsm = str;
+	return 1;
+}
+__setup("security=", choose_major_lsm);
+
+/* Explicitly choose LSM initialization order. */
+static int __init choose_lsm_order(char *str)
+{
+	chosen_lsm_order = str;
+	return 1;
+}
+__setup("lsm=", choose_lsm_order);
+
+/* Enable LSM order debugging. */
+static int __init enable_debug(char *str)
+{
+	debug = true;
+	return 1;
+}
+__setup("lsm.debug", enable_debug);
+
+/* Mark an LSM's enabled flag. */
+static int lsm_enabled_true __initdata = 1;
+static int lsm_enabled_false __initdata = 0;
+static void __init set_enabled(struct lsm_info *lsm, bool enabled)
+{
+	/*
+	 * When an LSM hasn't configured an enable variable, we can use
+	 * a hard-coded location for storing the default enabled state.
+	 */
+	if (!lsm->enabled) {
+		if (enabled)
+			lsm->enabled = &lsm_enabled_true;
+		else
+			lsm->enabled = &lsm_enabled_false;
+	} else if (lsm->enabled == &lsm_enabled_true) {
+		if (!enabled)
+			lsm->enabled = &lsm_enabled_false;
+	} else if (lsm->enabled == &lsm_enabled_false) {
+		if (enabled)
+			lsm->enabled = &lsm_enabled_true;
+	} else {
+		*lsm->enabled = enabled;
+	}
+}
+
+static inline bool is_enabled(struct lsm_info *lsm)
+{
+	if (!lsm->enabled)
+		return false;
+
+	return *lsm->enabled;
+}
+
+/* Is an LSM already listed in the ordered LSMs list? */
+static bool __init exists_ordered_lsm(struct lsm_info *lsm)
+{
+	struct lsm_info **check;
+
+	for (check = ordered_lsms; *check; check++)
+		if (*check == lsm)
+			return true;
+
+	return false;
+}
+
+/* Append an LSM to the list of ordered LSMs to initialize. */
+static int last_lsm __initdata;
+static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
+{
+	/* Ignore duplicate selections. */
+	if (exists_ordered_lsm(lsm))
+		return;
+
+	if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from))
+		return;
+
+	/* Enable this LSM, if it is not already set. */
+	if (!lsm->enabled)
+		lsm->enabled = &lsm_enabled_true;
+	ordered_lsms[last_lsm++] = lsm;
+
+	init_debug("%s ordered: %s (%s)\n", from, lsm->name,
+		   is_enabled(lsm) ? "enabled" : "disabled");
+}
+
+/* Is an LSM allowed to be initialized? */
+static bool __init lsm_allowed(struct lsm_info *lsm)
+{
+	/* Skip if the LSM is disabled. */
+	if (!is_enabled(lsm))
+		return false;
+
+	/* Not allowed if another exclusive LSM already initialized. */
+	if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) {
+		init_debug("exclusive disabled: %s\n", lsm->name);
+		return false;
+	}
+
+	return true;
+}
+
+static void __init lsm_set_blob_size(int *need, int *lbs)
+{
+	int offset;
+
+	if (*need <= 0)
+		return;
+
+	offset = ALIGN(*lbs, sizeof(void *));
+	*lbs = offset + *need;
+	*need = offset;
+}
+
+static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
+{
+	if (!needed)
+		return;
+
+	lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
+	lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
+	lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib);
+	/*
+	 * The inode blob gets an rcu_head in addition to
+	 * what the modules might need.
+	 */
+	if (needed->lbs_inode && blob_sizes.lbs_inode == 0)
+		blob_sizes.lbs_inode = sizeof(struct rcu_head);
+	lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
+	lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
+	lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key);
+	lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+	lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event);
+	lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock);
+	lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
+	lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
+	lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev);
+	lsm_set_blob_size(&needed->lbs_xattr_count,
+			  &blob_sizes.lbs_xattr_count);
+	lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev);
+	lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map);
+	lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog);
+	lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token);
+}
+
+/* Prepare LSM for initialization. */
+static void __init prepare_lsm(struct lsm_info *lsm)
+{
+	int enabled = lsm_allowed(lsm);
+
+	/* Record enablement (to handle any following exclusive LSMs). */
+	set_enabled(lsm, enabled);
+
+	/* If enabled, do pre-initialization work. */
+	if (enabled) {
+		if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) {
+			exclusive = lsm;
+			init_debug("exclusive chosen:   %s\n", lsm->name);
+		}
+
+		lsm_set_blob_sizes(lsm->blobs);
+	}
+}
+
+/* Initialize a given LSM, if it is enabled. */
+static void __init initialize_lsm(struct lsm_info *lsm)
+{
+	if (is_enabled(lsm)) {
+		int ret;
+
+		init_debug("initializing %s\n", lsm->name);
+		ret = lsm->init();
+		WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
+	}
+}
+
+/*
+ * Current index to use while initializing the lsm id list.
+ */
+u32 lsm_active_cnt __ro_after_init;
+const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];
+
+/* Populate ordered LSMs list from comma-separated LSM name list. */
+static void __init ordered_lsm_parse(const char *order, const char *origin)
+{
+	struct lsm_info *lsm;
+	char *sep, *name, *next;
+
+	/* LSM_ORDER_FIRST is always first. */
+	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+		if (lsm->order == LSM_ORDER_FIRST)
+			append_ordered_lsm(lsm, "  first");
+	}
+
+	/* Process "security=", if given. */
+	if (chosen_major_lsm) {
+		struct lsm_info *major;
+
+		/*
+		 * To match the original "security=" behavior, this
+		 * explicitly does NOT fall back to another Legacy Major
+		 * if the selected one was separately disabled: disable
+		 * all non-matching Legacy Major LSMs.
+		 */
+		for (major = __start_lsm_info; major < __end_lsm_info;
+		     major++) {
+			if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
+			    strcmp(major->name, chosen_major_lsm) != 0) {
+				set_enabled(major, false);
+				init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
+					   chosen_major_lsm, major->name);
+			}
+		}
+	}
+
+	sep = kstrdup(order, GFP_KERNEL);
+	next = sep;
+	/* Walk the list, looking for matching LSMs. */
+	while ((name = strsep(&next, ",")) != NULL) {
+		bool found = false;
+
+		for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+			if (strcmp(lsm->name, name) == 0) {
+				if (lsm->order == LSM_ORDER_MUTABLE)
+					append_ordered_lsm(lsm, origin);
+				found = true;
+			}
+		}
+
+		if (!found)
+			init_debug("%s ignored: %s (not built into kernel)\n",
+				   origin, name);
+	}
+
+	/* Process "security=", if given. */
+	if (chosen_major_lsm) {
+		for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+			if (exists_ordered_lsm(lsm))
+				continue;
+			if (strcmp(lsm->name, chosen_major_lsm) == 0)
+				append_ordered_lsm(lsm, "security=");
+		}
+	}
+
+	/* LSM_ORDER_LAST is always last. */
+	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+		if (lsm->order == LSM_ORDER_LAST)
+			append_ordered_lsm(lsm, "   last");
+	}
+
+	/* Disable all LSMs not in the ordered list. */
+	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+		if (exists_ordered_lsm(lsm))
+			continue;
+		set_enabled(lsm, false);
+		init_debug("%s skipped: %s (not in requested order)\n",
+			   origin, lsm->name);
+	}
+
+	kfree(sep);
+}
+
+static void __init report_lsm_order(void)
+{
+	struct lsm_info **lsm, *early;
+	int first = 0;
+
+	pr_info("initializing lsm=");
+
+	/* Report each enabled LSM name, comma separated. */
+	for (early = __start_early_lsm_info;
+	     early < __end_early_lsm_info; early++)
+		if (is_enabled(early))
+			pr_cont("%s%s", first++ == 0 ? "" : ",", early->name);
+	for (lsm = ordered_lsms; *lsm; lsm++)
+		if (is_enabled(*lsm))
+			pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name);
+
+	pr_cont("\n");
+}
+
+/**
+ * lsm_early_cred - during initialization allocate a composite cred blob
+ * @cred: the cred that needs a blob
+ *
+ * Allocate the cred blob for all the modules
+ */
+static void __init lsm_early_cred(struct cred *cred)
+{
+	int rc = lsm_cred_alloc(cred, GFP_KERNEL);
+
+	if (rc)
+		panic("%s: Early cred alloc failed.\n", __func__);
+}
+
+/**
+ * lsm_early_task - during initialization allocate a composite task blob
+ * @task: the task that needs a blob
+ *
+ * Allocate the task blob for all the modules
+ */
+static void __init lsm_early_task(struct task_struct *task)
+{
+	int rc = lsm_task_alloc(task);
+
+	if (rc)
+		panic("%s: Early task alloc failed.\n", __func__);
+}
+
+static void __init ordered_lsm_init(void)
+{
+	struct lsm_info **lsm;
+
+	if (chosen_lsm_order) {
+		if (chosen_major_lsm) {
+			pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
+				chosen_major_lsm, chosen_lsm_order);
+			chosen_major_lsm = NULL;
+		}
+		ordered_lsm_parse(chosen_lsm_order, "cmdline");
+	} else
+		ordered_lsm_parse(builtin_lsm_order, "builtin");
+
+	for (lsm = ordered_lsms; *lsm; lsm++)
+		prepare_lsm(*lsm);
+
+	report_lsm_order();
+
+	init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
+	init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
+	init_debug("ib blob size         = %d\n", blob_sizes.lbs_ib);
+	init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
+	init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
+#ifdef CONFIG_KEYS
+	init_debug("key blob size        = %d\n", blob_sizes.lbs_key);
+#endif /* CONFIG_KEYS */
+	init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
+	init_debug("sock blob size       = %d\n", blob_sizes.lbs_sock);
+	init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
+	init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event);
+	init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
+	init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev);
+	init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);
+	init_debug("bdev blob size       = %d\n", blob_sizes.lbs_bdev);
+	init_debug("bpf map blob size    = %d\n", blob_sizes.lbs_bpf_map);
+	init_debug("bpf prog blob size   = %d\n", blob_sizes.lbs_bpf_prog);
+	init_debug("bpf token blob size  = %d\n", blob_sizes.lbs_bpf_token);
+
+	/*
+	 * Create any kmem_caches needed for blobs
+	 */
+	if (blob_sizes.lbs_file)
+		lsm_file_cache = kmem_cache_create("lsm_file_cache",
+						   blob_sizes.lbs_file, 0,
+						   SLAB_PANIC, NULL);
+	if (blob_sizes.lbs_inode)
+		lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
+						    blob_sizes.lbs_inode, 0,
+						    SLAB_PANIC, NULL);
+
+	lsm_early_cred((struct cred *) current->cred);
+	lsm_early_task(current);
+	for (lsm = ordered_lsms; *lsm; lsm++)
+		initialize_lsm(*lsm);
+}
+
+static bool match_last_lsm(const char *list, const char *lsm)
+{
+	const char *last;
+
+	if (WARN_ON(!list || !lsm))
+		return false;
+	last = strrchr(list, ',');
+	if (last)
+		/* Pass the comma, strcmp() will check for '\0' */
+		last++;
+	else
+		last = list;
+	return !strcmp(last, lsm);
+}
+
+static int lsm_append(const char *new, char **result)
+{
+	char *cp;
+
+	if (*result == NULL) {
+		*result = kstrdup(new, GFP_KERNEL);
+		if (*result == NULL)
+			return -ENOMEM;
+	} else {
+		/* Check if it is the last registered name */
+		if (match_last_lsm(*result, new))
+			return 0;
+		cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
+		if (cp == NULL)
+			return -ENOMEM;
+		kfree(*result);
+		*result = cp;
+	}
+	return 0;
+}
+
+static void __init lsm_static_call_init(struct security_hook_list *hl)
+{
+	struct lsm_static_call *scall = hl->scalls;
+	int i;
+
+	for (i = 0; i < MAX_LSM_COUNT; i++) {
+		/* Update the first static call that is not used yet */
+		if (!scall->hl) {
+			__static_call_update(scall->key, scall->trampoline,
+					     hl->hook.lsm_func_addr);
+			scall->hl = hl;
+			static_branch_enable(scall->active);
+			return;
+		}
+		scall++;
+	}
+	panic("%s - Ran out of static slots.\n", __func__);
+}
+
+/**
+ * security_add_hooks - Add a module's hooks to the hook lists.
+ * @hooks: the hooks to add
+ * @count: the number of hooks to add
+ * @lsmid: the identification information for the security module
+ *
+ * Each LSM has to register its hooks with the infrastructure.
+ */
+void __init security_add_hooks(struct security_hook_list *hooks, int count,
+			       const struct lsm_id *lsmid)
+{
+	int i;
+
+	/*
+	 * A security module may call security_add_hooks() more
+	 * than once during initialization, and LSM initialization
+	 * is serialized. Landlock is one such case.
+	 * Look at the previous entry, if there is one, for duplication.
+	 */
+	if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) {
+		if (lsm_active_cnt >= MAX_LSM_COUNT)
+			panic("%s Too many LSMs registered.\n", __func__);
+		lsm_idlist[lsm_active_cnt++] = lsmid;
+	}
+
+	for (i = 0; i < count; i++) {
+		hooks[i].lsmid = lsmid;
+		lsm_static_call_init(&hooks[i]);
+	}
+
+	/*
+	 * Don't try to append during early_security_init(), we'll come back
+	 * and fix this up afterwards.
+	 */
+	if (slab_is_available()) {
+		if (lsm_append(lsmid->name, &lsm_names) < 0)
+			panic("%s - Cannot get early memory.\n", __func__);
+	}
+}
+
+int __init early_security_init(void)
+{
+	struct lsm_info *lsm;
+
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		if (!lsm->enabled)
+			lsm->enabled = &lsm_enabled_true;
+		prepare_lsm(lsm);
+		initialize_lsm(lsm);
+	}
+
+	return 0;
+}
+
+/**
+ * security_init - initializes the security framework
+ *
+ * This should be called early in the kernel initialization sequence.
+ */
+int __init security_init(void)
+{
+	struct lsm_info *lsm;
+
+	init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
+	init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
+	init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");
+
+	/*
+	 * Append the names of the enabled early LSMs now that kmalloc() is
+	 * available; test the flag's value, a non-NULL pointer isn't enough.
+	 */
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		init_debug("  early started: %s (%s)\n", lsm->name,
+			   is_enabled(lsm) ? "enabled" : "disabled");
+		if (is_enabled(lsm))
+			lsm_append(lsm->name, &lsm_names);
+	}
+
+	/* Load LSMs in specified order. */
+	ordered_lsm_init();
+
+	return 0;
+}
diff --git a/security/security.c b/security/security.c
index 667479c2e82f..dc9734f0d45c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -32,24 +32,7 @@
 #include <net/flow.h>
 #include <net/sock.h>
 
-#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX
-
-/*
- * Identifier for the LSM static calls.
- * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h
- * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT
- */
-#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX
-
-/*
- * Call the macro M for each LSM hook MAX_LSM_COUNT times.
- */
-#define LSM_LOOP_UNROLL(M, ...) 		\
-do {						\
-	UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)	\
-} while (0)
-
-#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)
+#include "lsm.h"
 
 /*
  * These are descriptions of the reasons that can be passed to the
@@ -90,21 +73,29 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
 	[LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
 };
 
-static struct kmem_cache *lsm_file_cache;
-static struct kmem_cache *lsm_inode_cache;
+struct lsm_blob_sizes blob_sizes;
 
-char *lsm_names;
-static struct lsm_blob_sizes blob_sizes __ro_after_init;
+struct kmem_cache *lsm_file_cache;
+struct kmem_cache *lsm_inode_cache;
 
-/* Boot-time LSM user choice */
-static __initdata const char *chosen_lsm_order;
-static __initdata const char *chosen_major_lsm;
+#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX
 
-static __initconst const char *const builtin_lsm_order = CONFIG_LSM;
+/*
+ * Identifier for the LSM static calls.
+ * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h
+ * IDX is the index of the static call. 0 <= IDX < MAX_LSM_COUNT
+ */
+#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX
+
+/*
+ * Call the macro M for each LSM hook MAX_LSM_COUNT times.
+ */
+#define LSM_LOOP_UNROLL(M, ...) 		\
+do {						\
+	UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)	\
+} while (0)
 
-/* Ordered list of LSMs to initialize. */
-static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1];
-static __initdata struct lsm_info *exclusive;
+#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)
 
 #ifdef CONFIG_HAVE_STATIC_CALL
 #define LSM_HOOK_TRAMP(NAME, NUM) \
@@ -155,496 +146,25 @@ struct lsm_static_calls_table
 #undef INIT_LSM_STATIC_CALL
 	};
 
-static __initdata bool debug;
-#define init_debug(...)						\
-	do {							\
-		if (debug)					\
-			pr_info(__VA_ARGS__);			\
-	} while (0)
-
-static bool __init is_enabled(struct lsm_info *lsm)
-{
-	if (!lsm->enabled)
-		return false;
-
-	return *lsm->enabled;
-}
-
-/* Mark an LSM's enabled flag. */
-static int lsm_enabled_true __initdata = 1;
-static int lsm_enabled_false __initdata = 0;
-static void __init set_enabled(struct lsm_info *lsm, bool enabled)
-{
-	/*
-	 * When an LSM hasn't configured an enable variable, we can use
-	 * a hard-coded location for storing the default enabled state.
-	 */
-	if (!lsm->enabled) {
-		if (enabled)
-			lsm->enabled = &lsm_enabled_true;
-		else
-			lsm->enabled = &lsm_enabled_false;
-	} else if (lsm->enabled == &lsm_enabled_true) {
-		if (!enabled)
-			lsm->enabled = &lsm_enabled_false;
-	} else if (lsm->enabled == &lsm_enabled_false) {
-		if (enabled)
-			lsm->enabled = &lsm_enabled_true;
-	} else {
-		*lsm->enabled = enabled;
-	}
-}
-
-/* Is an LSM already listed in the ordered LSMs list? */
-static bool __init exists_ordered_lsm(struct lsm_info *lsm)
-{
-	struct lsm_info **check;
-
-	for (check = ordered_lsms; *check; check++)
-		if (*check == lsm)
-			return true;
-
-	return false;
-}
-
-/* Append an LSM to the list of ordered LSMs to initialize. */
-static int last_lsm __initdata;
-static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
-{
-	/* Ignore duplicate selections. */
-	if (exists_ordered_lsm(lsm))
-		return;
-
-	if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from))
-		return;
-
-	/* Enable this LSM, if it is not already set. */
-	if (!lsm->enabled)
-		lsm->enabled = &lsm_enabled_true;
-	ordered_lsms[last_lsm++] = lsm;
-
-	init_debug("%s ordered: %s (%s)\n", from, lsm->name,
-		   is_enabled(lsm) ? "enabled" : "disabled");
-}
-
-/* Is an LSM allowed to be initialized? */
-static bool __init lsm_allowed(struct lsm_info *lsm)
-{
-	/* Skip if the LSM is disabled. */
-	if (!is_enabled(lsm))
-		return false;
-
-	/* Not allowed if another exclusive LSM already initialized. */
-	if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) {
-		init_debug("exclusive disabled: %s\n", lsm->name);
-		return false;
-	}
-
-	return true;
-}
-
-static void __init lsm_set_blob_size(int *need, int *lbs)
-{
-	int offset;
-
-	if (*need <= 0)
-		return;
-
-	offset = ALIGN(*lbs, sizeof(void *));
-	*lbs = offset + *need;
-	*need = offset;
-}
-
-static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
-{
-	if (!needed)
-		return;
-
-	lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
-	lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
-	lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib);
-	/*
-	 * The inode blob gets an rcu_head in addition to
-	 * what the modules might need.
-	 */
-	if (needed->lbs_inode && blob_sizes.lbs_inode == 0)
-		blob_sizes.lbs_inode = sizeof(struct rcu_head);
-	lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
-	lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
-	lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key);
-	lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
-	lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event);
-	lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock);
-	lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
-	lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
-	lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev);
-	lsm_set_blob_size(&needed->lbs_xattr_count,
-			  &blob_sizes.lbs_xattr_count);
-	lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev);
-	lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map);
-	lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog);
-	lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token);
-}
-
-/* Prepare LSM for initialization. */
-static void __init prepare_lsm(struct lsm_info *lsm)
-{
-	int enabled = lsm_allowed(lsm);
-
-	/* Record enablement (to handle any following exclusive LSMs). */
-	set_enabled(lsm, enabled);
-
-	/* If enabled, do pre-initialization work. */
-	if (enabled) {
-		if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) {
-			exclusive = lsm;
-			init_debug("exclusive chosen:   %s\n", lsm->name);
-		}
-
-		lsm_set_blob_sizes(lsm->blobs);
-	}
-}
-
-/* Initialize a given LSM, if it is enabled. */
-static void __init initialize_lsm(struct lsm_info *lsm)
-{
-	if (is_enabled(lsm)) {
-		int ret;
-
-		init_debug("initializing %s\n", lsm->name);
-		ret = lsm->init();
-		WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
-	}
-}
-
-/*
- * Current index to use while initializing the lsm id list.
- */
-u32 lsm_active_cnt __ro_after_init;
-const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];
-
-/* Populate ordered LSMs list from comma-separated LSM name list. */
-static void __init ordered_lsm_parse(const char *order, const char *origin)
-{
-	struct lsm_info *lsm;
-	char *sep, *name, *next;
-
-	/* LSM_ORDER_FIRST is always first. */
-	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-		if (lsm->order == LSM_ORDER_FIRST)
-			append_ordered_lsm(lsm, "  first");
-	}
-
-	/* Process "security=", if given. */
-	if (chosen_major_lsm) {
-		struct lsm_info *major;
-
-		/*
-		 * To match the original "security=" behavior, this
-		 * explicitly does NOT fallback to another Legacy Major
-		 * if the selected one was separately disabled: disable
-		 * all non-matching Legacy Major LSMs.
-		 */
-		for (major = __start_lsm_info; major < __end_lsm_info;
-		     major++) {
-			if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
-			    strcmp(major->name, chosen_major_lsm) != 0) {
-				set_enabled(major, false);
-				init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
-					   chosen_major_lsm, major->name);
-			}
-		}
-	}
-
-	sep = kstrdup(order, GFP_KERNEL);
-	next = sep;
-	/* Walk the list, looking for matching LSMs. */
-	while ((name = strsep(&next, ",")) != NULL) {
-		bool found = false;
-
-		for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-			if (strcmp(lsm->name, name) == 0) {
-				if (lsm->order == LSM_ORDER_MUTABLE)
-					append_ordered_lsm(lsm, origin);
-				found = true;
-			}
-		}
-
-		if (!found)
-			init_debug("%s ignored: %s (not built into kernel)\n",
-				   origin, name);
-	}
-
-	/* Process "security=", if given. */
-	if (chosen_major_lsm) {
-		for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-			if (exists_ordered_lsm(lsm))
-				continue;
-			if (strcmp(lsm->name, chosen_major_lsm) == 0)
-				append_ordered_lsm(lsm, "security=");
-		}
-	}
-
-	/* LSM_ORDER_LAST is always last. */
-	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-		if (lsm->order == LSM_ORDER_LAST)
-			append_ordered_lsm(lsm, "   last");
-	}
-
-	/* Disable all LSMs not in the ordered list. */
-	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
-		if (exists_ordered_lsm(lsm))
-			continue;
-		set_enabled(lsm, false);
-		init_debug("%s skipped: %s (not in requested order)\n",
-			   origin, lsm->name);
-	}
-
-	kfree(sep);
-}
-
-static void __init lsm_static_call_init(struct security_hook_list *hl)
-{
-	struct lsm_static_call *scall = hl->scalls;
-	int i;
-
-	for (i = 0; i < MAX_LSM_COUNT; i++) {
-		/* Update the first static call that is not used yet */
-		if (!scall->hl) {
-			__static_call_update(scall->key, scall->trampoline,
-					     hl->hook.lsm_func_addr);
-			scall->hl = hl;
-			static_branch_enable(scall->active);
-			return;
-		}
-		scall++;
-	}
-	panic("%s - Ran out of static slots.\n", __func__);
-}
-
-static void __init lsm_early_cred(struct cred *cred);
-static void __init lsm_early_task(struct task_struct *task);
-
-static int lsm_append(const char *new, char **result);
-
-static void __init report_lsm_order(void)
-{
-	struct lsm_info **lsm, *early;
-	int first = 0;
-
-	pr_info("initializing lsm=");
-
-	/* Report each enabled LSM name, comma separated. */
-	for (early = __start_early_lsm_info;
-	     early < __end_early_lsm_info; early++)
-		if (is_enabled(early))
-			pr_cont("%s%s", first++ == 0 ? "" : ",", early->name);
-	for (lsm = ordered_lsms; *lsm; lsm++)
-		if (is_enabled(*lsm))
-			pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name);
-
-	pr_cont("\n");
-}
-
-static void __init ordered_lsm_init(void)
-{
-	struct lsm_info **lsm;
-
-	if (chosen_lsm_order) {
-		if (chosen_major_lsm) {
-			pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
-				chosen_major_lsm, chosen_lsm_order);
-			chosen_major_lsm = NULL;
-		}
-		ordered_lsm_parse(chosen_lsm_order, "cmdline");
-	} else
-		ordered_lsm_parse(builtin_lsm_order, "builtin");
-
-	for (lsm = ordered_lsms; *lsm; lsm++)
-		prepare_lsm(*lsm);
-
-	report_lsm_order();
-
-	init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
-	init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
-	init_debug("ib blob size         = %d\n", blob_sizes.lbs_ib);
-	init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
-	init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
-#ifdef CONFIG_KEYS
-	init_debug("key blob size        = %d\n", blob_sizes.lbs_key);
-#endif /* CONFIG_KEYS */
-	init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
-	init_debug("sock blob size       = %d\n", blob_sizes.lbs_sock);
-	init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
-	init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event);
-	init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
-	init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev);
-	init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);
-	init_debug("bdev blob size       = %d\n", blob_sizes.lbs_bdev);
-	init_debug("bpf map blob size    = %d\n", blob_sizes.lbs_bpf_map);
-	init_debug("bpf prog blob size   = %d\n", blob_sizes.lbs_bpf_prog);
-	init_debug("bpf token blob size  = %d\n", blob_sizes.lbs_bpf_token);
-
-	/*
-	 * Create any kmem_caches needed for blobs
-	 */
-	if (blob_sizes.lbs_file)
-		lsm_file_cache = kmem_cache_create("lsm_file_cache",
-						   blob_sizes.lbs_file, 0,
-						   SLAB_PANIC, NULL);
-	if (blob_sizes.lbs_inode)
-		lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
-						    blob_sizes.lbs_inode, 0,
-						    SLAB_PANIC, NULL);
-
-	lsm_early_cred((struct cred *) current->cred);
-	lsm_early_task(current);
-	for (lsm = ordered_lsms; *lsm; lsm++)
-		initialize_lsm(*lsm);
-}
-
-int __init early_security_init(void)
-{
-	struct lsm_info *lsm;
-
-	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
-		if (!lsm->enabled)
-			lsm->enabled = &lsm_enabled_true;
-		prepare_lsm(lsm);
-		initialize_lsm(lsm);
-	}
-
-	return 0;
-}
-
 /**
- * security_init - initializes the security framework
+ * lsm_file_alloc - allocate a composite file blob
+ * @file: the file that needs a blob
  *
- * This should be called early in the kernel initialization sequence.
- */
-int __init security_init(void)
-{
-	struct lsm_info *lsm;
-
-	init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
-	init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
-	init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");
-
-	/*
-	 * Append the names of the early LSM modules now that kmalloc() is
-	 * available
-	 */
-	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
-		init_debug("  early started: %s (%s)\n", lsm->name,
-			   is_enabled(lsm) ? "enabled" : "disabled");
-		if (lsm->enabled)
-			lsm_append(lsm->name, &lsm_names);
-	}
-
-	/* Load LSMs in specified order. */
-	ordered_lsm_init();
-
-	return 0;
-}
-
-/* Save user chosen LSM */
-static int __init choose_major_lsm(char *str)
-{
-	chosen_major_lsm = str;
-	return 1;
-}
-__setup("security=", choose_major_lsm);
-
-/* Explicitly choose LSM initialization order. */
-static int __init choose_lsm_order(char *str)
-{
-	chosen_lsm_order = str;
-	return 1;
-}
-__setup("lsm=", choose_lsm_order);
-
-/* Enable LSM order debugging. */
-static int __init enable_debug(char *str)
-{
-	debug = true;
-	return 1;
-}
-__setup("lsm.debug", enable_debug);
-
-static bool match_last_lsm(const char *list, const char *lsm)
-{
-	const char *last;
-
-	if (WARN_ON(!list || !lsm))
-		return false;
-	last = strrchr(list, ',');
-	if (last)
-		/* Pass the comma, strcmp() will check for '\0' */
-		last++;
-	else
-		last = list;
-	return !strcmp(last, lsm);
-}
-
-static int lsm_append(const char *new, char **result)
-{
-	char *cp;
-
-	if (*result == NULL) {
-		*result = kstrdup(new, GFP_KERNEL);
-		if (*result == NULL)
-			return -ENOMEM;
-	} else {
-		/* Check if it is the last registered name */
-		if (match_last_lsm(*result, new))
-			return 0;
-		cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
-		if (cp == NULL)
-			return -ENOMEM;
-		kfree(*result);
-		*result = cp;
-	}
-	return 0;
-}
-
-/**
- * security_add_hooks - Add a modules hooks to the hook lists.
- * @hooks: the hooks to add
- * @count: the number of hooks to add
- * @lsmid: the identification information for the security module
+ * Allocate the file blob for all the modules
  *
- * Each LSM has to register its hooks with the infrastructure.
+ * Returns 0, or -ENOMEM if memory can't be allocated.
  */
-void __init security_add_hooks(struct security_hook_list *hooks, int count,
-			       const struct lsm_id *lsmid)
+static int lsm_file_alloc(struct file *file)
 {
-	int i;
-
-	/*
-	 * A security module may call security_add_hooks() more
-	 * than once during initialization, and LSM initialization
-	 * is serialized. Landlock is one such case.
-	 * Look at the previous entry, if there is one, for duplication.
-	 */
-	if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) {
-		if (lsm_active_cnt >= MAX_LSM_COUNT)
-			panic("%s Too many LSMs registered.\n", __func__);
-		lsm_idlist[lsm_active_cnt++] = lsmid;
-	}
-
-	for (i = 0; i < count; i++) {
-		hooks[i].lsmid = lsmid;
-		lsm_static_call_init(&hooks[i]);
+	if (!lsm_file_cache) {
+		file->f_security = NULL;
+		return 0;
 	}
 
-	/*
-	 * Don't try to append during early_security_init(), we'll come back
-	 * and fix this up afterwards.
-	 */
-	if (slab_is_available()) {
-		if (lsm_append(lsmid->name, &lsm_names) < 0)
-			panic("%s - Cannot get early memory.\n", __func__);
-	}
+	file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
+	if (file->f_security == NULL)
+		return -ENOMEM;
+	return 0;
 }
 
 /**
@@ -679,46 +199,11 @@ static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp)
  *
  * Returns 0, or -ENOMEM if memory can't be allocated.
  */
-static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
+int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
 {
 	return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp);
 }
 
-/**
- * lsm_early_cred - during initialization allocate a composite cred blob
- * @cred: the cred that needs a blob
- *
- * Allocate the cred blob for all the modules
- */
-static void __init lsm_early_cred(struct cred *cred)
-{
-	int rc = lsm_cred_alloc(cred, GFP_KERNEL);
-
-	if (rc)
-		panic("%s: Early cred alloc failed.\n", __func__);
-}
-
-/**
- * lsm_file_alloc - allocate a composite file blob
- * @file: the file that needs a blob
- *
- * Allocate the file blob for all the modules
- *
- * Returns 0, or -ENOMEM if memory can't be allocated.
- */
-static int lsm_file_alloc(struct file *file)
-{
-	if (!lsm_file_cache) {
-		file->f_security = NULL;
-		return 0;
-	}
-
-	file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
-	if (file->f_security == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
 /**
  * lsm_inode_alloc - allocate a composite inode blob
  * @inode: the inode that needs a blob
@@ -749,7 +234,7 @@ static int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
  *
  * Returns 0, or -ENOMEM if memory can't be allocated.
  */
-static int lsm_task_alloc(struct task_struct *task)
+int lsm_task_alloc(struct task_struct *task)
 {
 	return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL);
 }
@@ -851,20 +336,6 @@ static int lsm_bpf_token_alloc(struct bpf_token *token)
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
-/**
- * lsm_early_task - during initialization allocate a composite task blob
- * @task: the task that needs a blob
- *
- * Allocate the task blob for all the modules
- */
-static void __init lsm_early_task(struct task_struct *task)
-{
-	int rc = lsm_task_alloc(task);
-
-	if (rc)
-		panic("%s: Early task alloc failed.\n", __func__);
-}
-
 /**
  * lsm_superblock_alloc - allocate a composite superblock blob
  * @sb: the superblock that needs a blob
-- 
cgit v1.2.3


From 9f9dc69e06ecbc61e7a50b823b82a78daf130dc0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 12 Feb 2025 14:45:06 -0500
Subject: lsm: replace the name field with a pointer to the lsm_id struct

Reduce the duplication between the lsm_id struct and the DEFINE_LSM()
definition by linking the lsm_id struct directly into the individual
LSM's DEFINE_LSM() instance.

Linking the lsm_id into the LSM definition also allows us to simplify
the security_add_hooks() function by removing the code which populates
the lsm_idlist[] array and moving it into the normal LSM startup code
where the LSM list is parsed and the individual LSMs are enabled,
making for a cleaner implementation with less overhead at boot.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h         |  2 +-
 security/apparmor/lsm.c           |  2 +-
 security/bpf/hooks.c              |  2 +-
 security/commoncap.c              |  2 +-
 security/integrity/evm/evm_main.c |  2 +-
 security/integrity/ima/ima_main.c |  2 +-
 security/ipe/ipe.c                |  2 +-
 security/landlock/setup.c         |  2 +-
 security/loadpin/loadpin.c        |  2 +-
 security/lockdown/lockdown.c      |  2 +-
 security/lsm_init.c               | 45 ++++++++++++++++-----------------------
 security/safesetid/lsm.c          |  2 +-
 security/selinux/hooks.c          |  2 +-
 security/smack/smack_lsm.c        |  2 +-
 security/tomoyo/tomoyo.c          |  2 +-
 security/yama/yama_lsm.c          |  2 +-
 16 files changed, 33 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0112926ed923..7343dd60b1d5 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -152,7 +152,7 @@ enum lsm_order {
 };
 
 struct lsm_info {
-	const char *name;	/* Required. */
+	const struct lsm_id *id;
 	enum lsm_order order;	/* Optional: default is LSM_ORDER_MUTABLE */
 	unsigned long flags;	/* Optional: flags describing LSM */
 	int *enabled;		/* Optional: controlled by CONFIG_LSM */
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b3f7a3258a2c..f6798144234b 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -2555,7 +2555,7 @@ alloc_out:
 }
 
 DEFINE_LSM(apparmor) = {
-	.name = "apparmor",
+	.id = &apparmor_lsmid,
 	.flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
 	.enabled = &apparmor_enabled,
 	.blobs = &apparmor_blob_sizes,
diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
index db759025abe1..40efde233f3a 100644
--- a/security/bpf/hooks.c
+++ b/security/bpf/hooks.c
@@ -33,7 +33,7 @@ struct lsm_blob_sizes bpf_lsm_blob_sizes __ro_after_init = {
 };
 
 DEFINE_LSM(bpf) = {
-	.name = "bpf",
+	.id = &bpf_lsmid,
 	.init = bpf_lsm_init,
 	.blobs = &bpf_lsm_blob_sizes
 };
diff --git a/security/commoncap.c b/security/commoncap.c
index 6bd4adeb4795..b50479bd0286 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1505,7 +1505,7 @@ static int __init capability_init(void)
 }
 
 DEFINE_LSM(capability) = {
-	.name = "capability",
+	.id = &capability_lsmid,
 	.order = LSM_ORDER_FIRST,
 	.init = capability_init,
 };
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 0add782e73ba..db8e324ed4e6 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -1175,7 +1175,7 @@ struct lsm_blob_sizes evm_blob_sizes __ro_after_init = {
 };
 
 DEFINE_LSM(evm) = {
-	.name = "evm",
+	.id = &evm_lsmid,
 	.init = init_evm_lsm,
 	.order = LSM_ORDER_LAST,
 	.blobs = &evm_blob_sizes,
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index cdd225f65a62..eade8e1e3cb1 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -1279,7 +1279,7 @@ struct lsm_blob_sizes ima_blob_sizes __ro_after_init = {
 };
 
 DEFINE_LSM(ima) = {
-	.name = "ima",
+	.id = &ima_lsmid,
 	.init = init_ima_lsm,
 	.order = LSM_ORDER_LAST,
 	.blobs = &ima_blob_sizes,
diff --git a/security/ipe/ipe.c b/security/ipe/ipe.c
index 4317134cb0da..2426441181dc 100644
--- a/security/ipe/ipe.c
+++ b/security/ipe/ipe.c
@@ -92,7 +92,7 @@ static int __init ipe_init(void)
 }
 
 DEFINE_LSM(ipe) = {
-	.name = "ipe",
+	.id = &ipe_lsmid,
 	.init = ipe_init,
 	.blobs = &ipe_blobs,
 };
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index bd53c7a56ab9..47dac1736f10 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -75,7 +75,7 @@ static int __init landlock_init(void)
 }
 
 DEFINE_LSM(LANDLOCK_NAME) = {
-	.name = LANDLOCK_NAME,
+	.id = &landlock_lsmid,
 	.init = landlock_init,
 	.blobs = &landlock_blob_sizes,
 };
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 68252452b66c..b9ddf05c5c16 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -271,7 +271,7 @@ static int __init loadpin_init(void)
 }
 
 DEFINE_LSM(loadpin) = {
-	.name = "loadpin",
+	.id = &loadpin_lsmid,
 	.init = loadpin_init,
 };
 
diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
index cf83afa1d879..4813f168ff93 100644
--- a/security/lockdown/lockdown.c
+++ b/security/lockdown/lockdown.c
@@ -168,6 +168,6 @@ DEFINE_EARLY_LSM(lockdown) = {
 #else
 DEFINE_LSM(lockdown) = {
 #endif
-	.name = "lockdown",
+	.id = &lockdown_lsmid,
 	.init = lockdown_lsm_init,
 };
diff --git a/security/lsm_init.c b/security/lsm_init.c
index 9249d5f37ae9..692d61a2ea10 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -127,9 +127,10 @@ static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
 	/* Enable this LSM, if it is not already set. */
 	if (!lsm->enabled)
 		lsm->enabled = &lsm_enabled_true;
-	ordered_lsms[last_lsm++] = lsm;
+	ordered_lsms[last_lsm] = lsm;
+	lsm_idlist[last_lsm++] = lsm->id;
 
-	init_debug("%s ordered: %s (%s)\n", from, lsm->name,
+	init_debug("%s ordered: %s (%s)\n", from, lsm->id->name,
 		   is_enabled(lsm) ? "enabled" : "disabled");
 }
 
@@ -157,7 +158,7 @@ static void __init lsm_prepare(struct lsm_info *lsm)
 		set_enabled(lsm, false);
 		return;
 	} else if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) {
-		init_debug("exclusive disabled: %s\n", lsm->name);
+		init_debug("exclusive disabled: %s\n", lsm->id->name);
 		set_enabled(lsm, false);
 		return;
 	}
@@ -165,7 +166,7 @@ static void __init lsm_prepare(struct lsm_info *lsm)
 	/* Mark the LSM as enabled. */
 	set_enabled(lsm, true);
 	if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) {
-		init_debug("exclusive chosen:   %s\n", lsm->name);
+		init_debug("exclusive chosen:   %s\n", lsm->id->name);
 		exclusive = lsm;
 	}
 
@@ -200,9 +201,9 @@ static void __init initialize_lsm(struct lsm_info *lsm)
 	if (is_enabled(lsm)) {
 		int ret;
 
-		init_debug("initializing %s\n", lsm->name);
+		init_debug("initializing %s\n", lsm->id->name);
 		ret = lsm->init();
-		WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
+		WARN(ret, "%s failed to initialize: %d\n", lsm->id->name, ret);
 	}
 }
 
@@ -236,10 +237,10 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		 */
 		lsm_for_each_raw(major) {
 			if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
-			    strcmp(major->name, chosen_major_lsm) != 0) {
+			    strcmp(major->id->name, chosen_major_lsm) != 0) {
 				set_enabled(major, false);
 				init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
-					   chosen_major_lsm, major->name);
+					   chosen_major_lsm, major->id->name);
 			}
 		}
 	}
@@ -251,7 +252,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		bool found = false;
 
 		lsm_for_each_raw(lsm) {
-			if (strcmp(lsm->name, name) == 0) {
+			if (strcmp(lsm->id->name, name) == 0) {
 				if (lsm->order == LSM_ORDER_MUTABLE)
 					append_ordered_lsm(lsm, origin);
 				found = true;
@@ -268,7 +269,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		lsm_for_each_raw(lsm) {
 			if (exists_ordered_lsm(lsm))
 				continue;
-			if (strcmp(lsm->name, chosen_major_lsm) == 0)
+			if (strcmp(lsm->id->name, chosen_major_lsm) == 0)
 				append_ordered_lsm(lsm, "security=");
 		}
 	}
@@ -285,7 +286,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 			continue;
 		set_enabled(lsm, false);
 		init_debug("%s skipped: %s (not in requested order)\n",
-			   origin, lsm->name);
+			   origin, lsm->id->name);
 	}
 
 	kfree(sep);
@@ -317,11 +318,13 @@ static void __init lsm_init_ordered(void)
 	pr_info("initializing lsm=");
 	lsm_early_for_each_raw(early) {
 		if (is_enabled(early))
-			pr_cont("%s%s", first++ == 0 ? "" : ",", early->name);
+			pr_cont("%s%s",
+				first++ == 0 ? "" : ",", early->id->name);
 	}
 	lsm_order_for_each(lsm) {
 		if (is_enabled(*lsm))
-			pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name);
+			pr_cont("%s%s",
+				first++ == 0 ? "" : ",", (*lsm)->id->name);
 	}
 	pr_cont("\n");
 
@@ -432,18 +435,6 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 {
 	int i;
 
-	/*
-	 * A security module may call security_add_hooks() more
-	 * than once during initialization, and LSM initialization
-	 * is serialized. Landlock is one such case.
-	 * Look at the previous entry, if there is one, for duplication.
-	 */
-	if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) {
-		if (lsm_active_cnt >= MAX_LSM_COUNT)
-			panic("%s Too many LSMs registered.\n", __func__);
-		lsm_idlist[lsm_active_cnt++] = lsmid;
-	}
-
 	for (i = 0; i < count; i++) {
 		hooks[i].lsmid = lsmid;
 		lsm_static_call_init(&hooks[i]);
@@ -491,10 +482,10 @@ int __init security_init(void)
 	 * available
 	 */
 	lsm_early_for_each_raw(lsm) {
-		init_debug("  early started: %s (%s)\n", lsm->name,
+		init_debug("  early started: %s (%s)\n", lsm->id->name,
 			   is_enabled(lsm) ? "enabled" : "disabled");
 		if (lsm->enabled)
-			lsm_append(lsm->name, &lsm_names);
+			lsm_append(lsm->id->name, &lsm_names);
 	}
 
 	/* Load LSMs in specified order. */
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
index 1ba564f097f5..9a7c68d4e642 100644
--- a/security/safesetid/lsm.c
+++ b/security/safesetid/lsm.c
@@ -287,6 +287,6 @@ static int __init safesetid_security_init(void)
 }
 
 DEFINE_LSM(safesetid_security_init) = {
+	.id = &safesetid_lsmid,
 	.init = safesetid_security_init,
-	.name = "safesetid",
 };
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dfc22da42f30..299b656ac007 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -7639,7 +7639,7 @@ void selinux_complete_init(void)
 /* SELinux requires early initialization in order to label
    all processes and objects when they are created. */
 DEFINE_LSM(selinux) = {
-	.name = "selinux",
+	.id = &selinux_lsmid,
 	.flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
 	.enabled = &selinux_enabled_boot,
 	.blobs = &selinux_blob_sizes,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index af986587841d..392698e41120 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -5280,7 +5280,7 @@ static __init int smack_init(void)
  * all processes and objects when they are created.
  */
 DEFINE_LSM(smack) = {
-	.name = "smack",
+	.id = &smack_lsmid,
 	.flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
 	.blobs = &smack_blob_sizes,
 	.init = smack_init,
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 48fc59d38ab2..cb003c460dc2 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -612,7 +612,7 @@ static int __init tomoyo_init(void)
 }
 
 DEFINE_LSM(tomoyo) = {
-	.name = "tomoyo",
+	.id = &tomoyo_lsmid,
 	.enabled = &tomoyo_enabled,
 	.flags = LSM_FLAG_LEGACY_MAJOR,
 	.blobs = &tomoyo_blob_sizes,
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 3d064dd4e03f..38b21ee0c560 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -476,6 +476,6 @@ static int __init yama_init(void)
 }
 
 DEFINE_LSM(yama) = {
-	.name = "yama",
+	.id = &yama_lsmid,
 	.init = yama_init,
 };
-- 
cgit v1.2.3


From 250898ca335f337bc032a9693dc0a30a1cb85825 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 12 Feb 2025 15:36:51 -0500
Subject: lsm: rework lsm_active_cnt and lsm_idlist[]

Move the LSM active count and lsm_id list declarations out of a header
that is visible across the kernel and into a header that is limited to
the LSM framework.  This not only helps keep the include/linux headers
smaller and cleaner, it helps prevent misuse of these variables.

Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 2 --
 security/lsm.h           | 5 +++++
 security/lsm_init.c      | 6 ------
 security/lsm_syscalls.c  | 2 ++
 security/security.c      | 3 +++
 5 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 92ac3f27b973..556890ea2e83 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -167,8 +167,6 @@ struct lsm_prop {
 };
 
 extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
-extern u32 lsm_active_cnt;
-extern const struct lsm_id *lsm_idlist[];
 
 /* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
diff --git a/security/lsm.h b/security/lsm.h
index 0e1731bad4a7..dbe755c45e57 100644
--- a/security/lsm.h
+++ b/security/lsm.h
@@ -7,6 +7,11 @@
 #define _LSM_H_
 
 #include <linux/lsm_hooks.h>
+#include <linux/lsm_count.h>
+
+/* List of configured LSMs */
+extern unsigned int lsm_active_cnt;
+extern const struct lsm_id *lsm_idlist[];
 
 /* LSM blob configuration */
 extern struct lsm_blob_sizes blob_sizes;
diff --git a/security/lsm_init.c b/security/lsm_init.c
index a0785ca081c7..d40f31e79bd5 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -217,12 +217,6 @@ static void __init initialize_lsm(struct lsm_info *lsm)
 	}
 }
 
-/*
- * Current index to use while initializing the lsm id list.
- */
-u32 lsm_active_cnt __ro_after_init;
-const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];
-
 /* Populate ordered LSMs list from comma-separated LSM name list. */
 static void __init ordered_lsm_parse(const char *order, const char *origin)
 {
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
index 8440948a690c..5648b1f0ce9c 100644
--- a/security/lsm_syscalls.c
+++ b/security/lsm_syscalls.c
@@ -17,6 +17,8 @@
 #include <linux/lsm_hooks.h>
 #include <uapi/linux/lsm.h>
 
+#include "lsm.h"
+
 /**
  * lsm_name_to_attr - map an LSM attribute name to its ID
  * @name: name of the attribute
diff --git a/security/security.c b/security/security.c
index dc9734f0d45c..b4eec4f00730 100644
--- a/security/security.c
+++ b/security/security.c
@@ -73,6 +73,9 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
 	[LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
 };
 
+unsigned int lsm_active_cnt __ro_after_init;
+const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];
+
 struct lsm_blob_sizes blob_sizes;
 
 struct kmem_cache *lsm_file_cache;
-- 
cgit v1.2.3


From 935d508d4d7ab9d19c603bd7eb2937249551d507 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 13 Feb 2025 17:34:12 -0500
Subject: lsm: get rid of the lsm_names list and do some cleanup

The LSM currently has a lot of code to maintain a list of the currently
active LSMs in a human readable string, with the only user being the
"/sys/kernel/security/lsm" code.  Let's drop all of that code and
generate the string on first use and then cache it for subsequent use.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h |  1 -
 security/inode.c          | 43 +++++++++++++++++++++++++++++++++++++++--
 security/lsm_init.c       | 49 -----------------------------------------------
 3 files changed, 41 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7343dd60b1d5..65a8227bece7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -172,7 +172,6 @@ struct lsm_info {
 
 
 /* DO NOT tamper with these variables outside of the LSM framework */
-extern char *lsm_names;
 extern struct lsm_static_calls_table static_calls_table __ro_after_init;
 
 /**
diff --git a/security/inode.c b/security/inode.c
index 43382ef8896e..6620c3e42af2 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -22,6 +22,8 @@
 #include <linux/lsm_hooks.h>
 #include <linux/magic.h>
 
+#include "lsm.h"
+
 static struct vfsmount *mount;
 static int mount_count;
 
@@ -315,12 +317,49 @@ void securityfs_remove(struct dentry *dentry)
 EXPORT_SYMBOL_GPL(securityfs_remove);
 
 #ifdef CONFIG_SECURITY
+#include <linux/spinlock.h>
+
 static struct dentry *lsm_dentry;
+
 static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count,
 			loff_t *ppos)
 {
-	return simple_read_from_buffer(buf, count, ppos, lsm_names,
-		strlen(lsm_names));
+	int i;
+	static char *str;
+	static size_t len;
+	static DEFINE_SPINLOCK(lock);
+
+	/* NOTE: we never free or modify the string once it is set */
+
+	if (unlikely(!str || !len)) {
+		char *str_tmp;
+		size_t len_tmp = 0;
+
+		for (i = 0; i < lsm_active_cnt; i++)
+			/* the '+ 1' accounts for either a comma or a NUL */
+			len_tmp += strlen(lsm_idlist[i]->name) + 1;
+
+		str_tmp = kmalloc(len_tmp, GFP_KERNEL);
+		if (!str_tmp)
+			return -ENOMEM;
+		str_tmp[0] = '\0';
+
+		for (i = 0; i < lsm_active_cnt; i++) {
+			if (i > 0)
+				strcat(str_tmp, ",");
+			strcat(str_tmp, lsm_idlist[i]->name);
+		}
+
+		spin_lock(&lock);
+		if (!str) {
+			str = str_tmp;
+			len = len_tmp - 1;
+		} else
+			kfree(str_tmp);
+		spin_unlock(&lock);
+	}
+
+	return simple_read_from_buffer(buf, count, ppos, str, len);
 }
 
 static const struct file_operations lsm_ops = {
diff --git a/security/lsm_init.c b/security/lsm_init.c
index d40f31e79bd5..574fff354d3f 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -10,8 +10,6 @@
 
 #include "lsm.h"
 
-char *lsm_names;
-
 /* Pointers to LSM sections defined in include/asm-generic/vmlinux.lds.h */
 extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
 extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
@@ -371,42 +369,6 @@ static void __init lsm_init_ordered(void)
 	}
 }
 
-static bool match_last_lsm(const char *list, const char *lsm)
-{
-	const char *last;
-
-	if (WARN_ON(!list || !lsm))
-		return false;
-	last = strrchr(list, ',');
-	if (last)
-		/* Pass the comma, strcmp() will check for '\0' */
-		last++;
-	else
-		last = list;
-	return !strcmp(last, lsm);
-}
-
-static int lsm_append(const char *new, char **result)
-{
-	char *cp;
-
-	if (*result == NULL) {
-		*result = kstrdup(new, GFP_KERNEL);
-		if (*result == NULL)
-			return -ENOMEM;
-	} else {
-		/* Check if it is the last registered name */
-		if (match_last_lsm(*result, new))
-			return 0;
-		cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
-		if (cp == NULL)
-			return -ENOMEM;
-		kfree(*result);
-		*result = cp;
-	}
-	return 0;
-}
-
 static void __init lsm_static_call_init(struct security_hook_list *hl)
 {
 	struct lsm_static_call *scall = hl->scalls;
@@ -443,15 +405,6 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 		hooks[i].lsmid = lsmid;
 		lsm_static_call_init(&hooks[i]);
 	}
-
-	/*
-	 * Don't try to append during early_security_init(), we'll come back
-	 * and fix this up afterwards.
-	 */
-	if (slab_is_available()) {
-		if (lsm_append(lsmid->name, &lsm_names) < 0)
-			panic("%s - Cannot get early memory.\n", __func__);
-	}
 }
 
 int __init early_security_init(void)
@@ -488,8 +441,6 @@ int __init security_init(void)
 	lsm_early_for_each_raw(lsm) {
 		init_debug("  early started: %s (%s)\n", lsm->id->name,
 			   is_enabled(lsm) ? "enabled" : "disabled");
-		if (lsm->enabled)
-			lsm_append(lsm->id->name, &lsm_names);
 	}
 
 	/* Load LSMs in specified order. */
-- 
cgit v1.2.3


From 291271e691740003021cf5b48fa7cf7e3371eaa7 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 11 Feb 2025 17:49:11 -0500
Subject: lsm: cleanup the LSM blob size code

Convert the lsm_blob_size fields to unsigned integers as there is no
current need for them to be negative, change "lsm_set_blob_size()" to
"lsm_blob_size_update()" to better reflect reality, and perform some
other minor cleanups to the associated code.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 34 ++++++++++++++--------------
 security/lsm_init.c       | 57 +++++++++++++++++++++++++++--------------------
 2 files changed, 50 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 65a8227bece7..86e457aa8809 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -102,23 +102,23 @@ struct security_hook_list {
  * Security blob size or offset data.
  */
 struct lsm_blob_sizes {
-	int lbs_cred;
-	int lbs_file;
-	int lbs_ib;
-	int lbs_inode;
-	int lbs_sock;
-	int lbs_superblock;
-	int lbs_ipc;
-	int lbs_key;
-	int lbs_msg_msg;
-	int lbs_perf_event;
-	int lbs_task;
-	int lbs_xattr_count; /* number of xattr slots in new_xattrs array */
-	int lbs_tun_dev;
-	int lbs_bdev;
-	int lbs_bpf_map;
-	int lbs_bpf_prog;
-	int lbs_bpf_token;
+	unsigned int lbs_cred;
+	unsigned int lbs_file;
+	unsigned int lbs_ib;
+	unsigned int lbs_inode;
+	unsigned int lbs_sock;
+	unsigned int lbs_superblock;
+	unsigned int lbs_ipc;
+	unsigned int lbs_key;
+	unsigned int lbs_msg_msg;
+	unsigned int lbs_perf_event;
+	unsigned int lbs_task;
+	unsigned int lbs_xattr_count; /* num xattr slots in new_xattrs array */
+	unsigned int lbs_tun_dev;
+	unsigned int lbs_bdev;
+	unsigned int lbs_bpf_map;
+	unsigned int lbs_bpf_prog;
+	unsigned int lbs_bpf_token;
 };
 
 /*
diff --git a/security/lsm_init.c b/security/lsm_init.c
index f0066857bd1a..6b1f8f18a43c 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -169,16 +169,22 @@ out:
 		   lsm_is_enabled(lsm) ? "enabled" : "disabled");
 }
 
-static void __init lsm_set_blob_size(int *need, int *lbs)
+/**
+ * lsm_blob_size_update - Update the LSM blob size and offset information
+ * @sz_req: the requested additional blob size
+ * @sz_cur: the existing blob size
+ */
+static void __init lsm_blob_size_update(unsigned int *sz_req,
+					unsigned int *sz_cur)
 {
-	int offset;
+	unsigned int offset;
 
-	if (*need <= 0)
+	if (*sz_req == 0)
 		return;
 
-	offset = ALIGN(*lbs, sizeof(void *));
-	*lbs = offset + *need;
-	*need = offset;
+	offset = ALIGN(*sz_cur, sizeof(void *));
+	*sz_cur = offset + *sz_req;
+	*sz_req = offset;
 }
 
 /**
@@ -193,27 +199,30 @@ static void __init lsm_prepare(struct lsm_info *lsm)
 		return;
 
 	/* Register the LSM blob sizes. */
-	lsm_set_blob_size(&blobs->lbs_cred, &blob_sizes.lbs_cred);
-	lsm_set_blob_size(&blobs->lbs_file, &blob_sizes.lbs_file);
-	lsm_set_blob_size(&blobs->lbs_ib, &blob_sizes.lbs_ib);
+	blobs = lsm->blobs;
+	lsm_blob_size_update(&blobs->lbs_cred, &blob_sizes.lbs_cred);
+	lsm_blob_size_update(&blobs->lbs_file, &blob_sizes.lbs_file);
+	lsm_blob_size_update(&blobs->lbs_ib, &blob_sizes.lbs_ib);
 	/* inode blob gets an rcu_head in addition to LSM blobs. */
 	if (blobs->lbs_inode && blob_sizes.lbs_inode == 0)
 		blob_sizes.lbs_inode = sizeof(struct rcu_head);
-	lsm_set_blob_size(&blobs->lbs_inode, &blob_sizes.lbs_inode);
-	lsm_set_blob_size(&blobs->lbs_ipc, &blob_sizes.lbs_ipc);
-	lsm_set_blob_size(&blobs->lbs_key, &blob_sizes.lbs_key);
-	lsm_set_blob_size(&blobs->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
-	lsm_set_blob_size(&blobs->lbs_perf_event, &blob_sizes.lbs_perf_event);
-	lsm_set_blob_size(&blobs->lbs_sock, &blob_sizes.lbs_sock);
-	lsm_set_blob_size(&blobs->lbs_superblock, &blob_sizes.lbs_superblock);
-	lsm_set_blob_size(&blobs->lbs_task, &blob_sizes.lbs_task);
-	lsm_set_blob_size(&blobs->lbs_tun_dev, &blob_sizes.lbs_tun_dev);
-	lsm_set_blob_size(&blobs->lbs_xattr_count,
-			  &blob_sizes.lbs_xattr_count);
-	lsm_set_blob_size(&blobs->lbs_bdev, &blob_sizes.lbs_bdev);
-	lsm_set_blob_size(&blobs->lbs_bpf_map, &blob_sizes.lbs_bpf_map);
-	lsm_set_blob_size(&blobs->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog);
-	lsm_set_blob_size(&blobs->lbs_bpf_token, &blob_sizes.lbs_bpf_token);
+	lsm_blob_size_update(&blobs->lbs_inode, &blob_sizes.lbs_inode);
+	lsm_blob_size_update(&blobs->lbs_ipc, &blob_sizes.lbs_ipc);
+	lsm_blob_size_update(&blobs->lbs_key, &blob_sizes.lbs_key);
+	lsm_blob_size_update(&blobs->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+	lsm_blob_size_update(&blobs->lbs_perf_event,
+			     &blob_sizes.lbs_perf_event);
+	lsm_blob_size_update(&blobs->lbs_sock, &blob_sizes.lbs_sock);
+	lsm_blob_size_update(&blobs->lbs_superblock,
+			     &blob_sizes.lbs_superblock);
+	lsm_blob_size_update(&blobs->lbs_task, &blob_sizes.lbs_task);
+	lsm_blob_size_update(&blobs->lbs_tun_dev, &blob_sizes.lbs_tun_dev);
+	lsm_blob_size_update(&blobs->lbs_xattr_count,
+			     &blob_sizes.lbs_xattr_count);
+	lsm_blob_size_update(&blobs->lbs_bdev, &blob_sizes.lbs_bdev);
+	lsm_blob_size_update(&blobs->lbs_bpf_map, &blob_sizes.lbs_bpf_map);
+	lsm_blob_size_update(&blobs->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog);
+	lsm_blob_size_update(&blobs->lbs_bpf_token, &blob_sizes.lbs_bpf_token);
 }
 
 /* Initialize a given LSM, if it is enabled. */
-- 
cgit v1.2.3


From cdc028812f727907d1575cf454a5f01ddffa7750 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 11 Feb 2025 12:18:35 -0500
Subject: lsm: introduce an initcall mechanism into the LSM framework

Currently the individual LSMs register their own initcalls, and while
this should be harmless, it can be wasteful in the case where a LSM
is disabled at boot as the initcall will still be executed.  This
patch introduces support for managing the initcalls in the LSM
framework, and future patches will convert the existing LSMs over to
this new mechanism.

Only initcall types which are used by the current in-tree LSMs are
supported, additional initcall types can easily be added in the future
if needed.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 33 +++++++++++++++---
 security/lsm_init.c       | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 86e457aa8809..b92008641242 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -151,13 +151,36 @@ enum lsm_order {
 	LSM_ORDER_LAST = 1,	/* This is only for integrity. */
 };
 
+/**
+ * struct lsm_info - Define an individual LSM for the LSM framework.
+ * @id: LSM name/ID info
+ * @order: ordering with respect to other LSMs, optional
+ * @flags: descriptive flags, optional
+ * @blobs: LSM blob sharing, optional
+ * @enabled: controlled by CONFIG_LSM, optional
+ * @init: LSM specific initialization routine
+ * @initcall_pure: LSM callback for pure_initcall() setup, optional
+ * @initcall_early: LSM callback for early_initcall() setup, optional
+ * @initcall_core: LSM callback for core_initcall() setup, optional
+ * @initcall_subsys: LSM callback for subsys_initcall() setup, optional
+ * @initcall_fs: LSM callback for fs_initcall() setup, optional
+ * @initcall_device: LSM callback for device_initcall() setup, optional
+ * @initcall_late: LSM callback for late_initcall() setup, optional
+ */
 struct lsm_info {
 	const struct lsm_id *id;
-	enum lsm_order order;	/* Optional: default is LSM_ORDER_MUTABLE */
-	unsigned long flags;	/* Optional: flags describing LSM */
-	int *enabled;		/* Optional: controlled by CONFIG_LSM */
-	int (*init)(void);	/* Required. */
-	struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
+	enum lsm_order order;
+	unsigned long flags;
+	struct lsm_blob_sizes *blobs;
+	int *enabled;
+	int (*init)(void);
+	int (*initcall_pure)(void);
+	int (*initcall_early)(void);
+	int (*initcall_core)(void);
+	int (*initcall_subsys)(void);
+	int (*initcall_fs)(void);
+	int (*initcall_device)(void);
+	int (*initcall_late)(void);
 };
 
 #define DEFINE_LSM(lsm)							\
diff --git a/security/lsm_init.c b/security/lsm_init.c
index fd69bde9112e..aacdac406ba5 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -39,6 +39,27 @@ static __initdata struct lsm_info *lsm_exclusive;
 	for ((iter) = __start_early_lsm_info;				\
 	     (iter) < __end_early_lsm_info; (iter)++)
 
+#define lsm_initcall(level)						\
+	({								\
+		int _r, _rc = 0;					\
+		struct lsm_info **_lp, *_l;				\
+		lsm_order_for_each(_lp) {				\
+			_l = *_lp;					\
+			if (!_l->initcall_##level)			\
+				continue;				\
+			lsm_pr_dbg("running %s %s initcall",		\
+				   _l->id->name, #level);		\
+			_r = _l->initcall_##level();			\
+			if (_r) {					\
+				pr_warn("failed LSM %s %s initcall with errno %d\n", \
+					_l->id->name, #level, _r);	\
+				if (!_rc)				\
+					_rc = _r;			\
+			}						\
+		}							\
+		_rc;							\
+	})
+
 /**
  * lsm_choose_security - Legacy "major" LSM selection
  * @str: kernel command line parameter
@@ -461,3 +482,71 @@ int __init security_init(void)
 
 	return 0;
 }
+
+/**
+ * security_initcall_pure - Run the LSM pure initcalls
+ */
+static int __init security_initcall_pure(void)
+{
+	return lsm_initcall(pure);
+}
+pure_initcall(security_initcall_pure);
+
+/**
+ * security_initcall_early - Run the LSM early initcalls
+ */
+static int __init security_initcall_early(void)
+{
+	return lsm_initcall(early);
+}
+early_initcall(security_initcall_early);
+
+/**
+ * security_initcall_core - Run the LSM core initcalls
+ */
+static int __init security_initcall_core(void)
+{
+	return lsm_initcall(core);
+}
+core_initcall(security_initcall_core);
+
+/**
+ * security_initcall_subsys - Run the LSM subsys initcalls
+ */
+static int __init security_initcall_subsys(void)
+{
+	return lsm_initcall(subsys);
+}
+subsys_initcall(security_initcall_subsys);
+
+/**
+ * security_initcall_fs - Run the LSM fs initcalls
+ */
+static int __init security_initcall_fs(void)
+{
+	return lsm_initcall(fs);
+}
+fs_initcall(security_initcall_fs);
+
+/**
+ * security_initcall_device - Run the LSM device initcalls
+ */
+static int __init security_initcall_device(void)
+{
+	return lsm_initcall(device);
+}
+device_initcall(security_initcall_device);
+
+/**
+ * security_initcall_late - Run the LSM late initcalls
+ */
+static int __init security_initcall_late(void)
+{
+	int rc;
+
+	rc = lsm_initcall(late);
+	lsm_pr_dbg("all enabled LSMs fully activated\n");
+
+	return rc;
+}
+late_initcall(security_initcall_late);
-- 
cgit v1.2.3


From dfa024bc3f67a97e1a975dd66b83af8b3845eb19 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 21 Feb 2025 11:53:29 -0500
Subject: lsm: add a LSM_STARTED_ALL notification event

Add a new LSM notifier event, LSM_STARTED_ALL, which is fired once at
boot when all of the LSMs have been started.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 1 +
 security/lsm_init.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 556890ea2e83..eb36451ce41f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -85,6 +85,7 @@ struct timezone;
 
 enum lsm_event {
 	LSM_POLICY_CHANGE,
+	LSM_STARTED_ALL,
 };
 
 struct dm_verity_digest {
diff --git a/security/lsm_init.c b/security/lsm_init.c
index 0f668bca98f9..6bb67d41ce52 100644
--- a/security/lsm_init.c
+++ b/security/lsm_init.c
@@ -556,6 +556,7 @@ static int __init security_initcall_late(void)
 
 	rc = lsm_initcall(late);
 	lsm_pr_dbg("all enabled LSMs fully activated\n");
+	call_blocking_lsm_notifier(LSM_STARTED_ALL, NULL);
 
 	return rc;
 }
-- 
cgit v1.2.3


From 094e94d13b606b820e3d1383e3a361f680ff023a Mon Sep 17 00:00:00 2001
From: Thiébaud Weksteen <tweek@google.com>
Date: Thu, 18 Sep 2025 12:04:34 +1000
Subject: memfd,selinux: call security_inode_init_security_anon()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prior to this change, no security hooks were called at the creation of a
memfd file. This means that, with SELinux for example, the file receives
the default type of the filesystem that backs the in-memory inode. In
most cases, that would be tmpfs, but if MFD_HUGETLB is passed, it will
be hugetlbfs. Both can be considered implementation details of memfd.

It also means that it is not possible to differentiate between a file
coming from memfd_create and a file coming from a standard tmpfs mount
point.

Additionally, no permission is validated at creation, which differs from
the similar memfd_secret syscall.

Call security_inode_init_security_anon during creation. This ensures
that the file is set up similarly to other anonymous inodes. On SELinux,
it means that the file will receive the security context of its task.

The ability to limit fexecve on memfd has been of interest to avoid
potential pitfalls where /proc/self/exe or similar would be executed
[1][2]. Reuse the "execute_no_trans" and "entrypoint" access vectors,
similarly to the file class. These access vectors may not make sense for
the existing "anon_inode" class. Therefore, define and assign a new
class "memfd_file" to support such access vectors.

Guard these changes behind a new policy capability named "memfd_class".

[1] https://crbug.com/1305267
[2] https://lore.kernel.org/lkml/20221215001205.51969-1-jeffxu@google.com/

Signed-off-by: Thiébaud Weksteen <tweek@google.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Tested-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
[PM: subj tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/memfd.h                      |  2 ++
 mm/memfd.c                                 | 14 ++++++++++++--
 security/selinux/hooks.c                   | 26 +++++++++++++++++++++-----
 security/selinux/include/classmap.h        |  2 ++
 security/selinux/include/policycap.h       |  1 +
 security/selinux/include/policycap_names.h |  1 +
 security/selinux/include/security.h        |  5 +++++
 7 files changed, 44 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 6f606d9573c3..cc74de3dbcfe 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -4,6 +4,8 @@
 
 #include <linux/file.h>
 
+#define MEMFD_ANON_NAME "[memfd]"
+
 #ifdef CONFIG_MEMFD_CREATE
 extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
 struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
diff --git a/mm/memfd.c b/mm/memfd.c
index 1d109c1acf21..a61acbe5ded3 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -433,6 +433,8 @@ static struct file *alloc_file(const char *name, unsigned int flags)
 {
 	unsigned int *file_seals;
 	struct file *file;
+	struct inode *inode;
+	int err = 0;
 
 	if (flags & MFD_HUGETLB) {
 		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
@@ -444,12 +446,20 @@ static struct file *alloc_file(const char *name, unsigned int flags)
 	}
 	if (IS_ERR(file))
 		return file;
+
+	inode = file_inode(file);
+	err = security_inode_init_security_anon(inode,
+			&QSTR(MEMFD_ANON_NAME), NULL);
+	if (err) {
+		fput(file);
+		file = ERR_PTR(err);
+		return file;
+	}
+
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 	file->f_flags |= O_LARGEFILE;
 
 	if (flags & MFD_NOEXEC_SEAL) {
-		struct inode *inode = file_inode(file);
-
 		inode->i_mode &= ~0111;
 		file_seals = memfd_file_seals_ptr(file);
 		if (file_seals) {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dfc22da42f30..a22b1920242f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -93,6 +93,7 @@
 #include <linux/fanotify.h>
 #include <linux/io_uring/cmd.h>
 #include <uapi/linux/lsm.h>
+#include <linux/memfd.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -2319,6 +2320,10 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
 	new_tsec = selinux_cred(bprm->cred);
 	isec = inode_security(inode);
 
+	if (WARN_ON(isec->sclass != SECCLASS_FILE &&
+		    isec->sclass != SECCLASS_MEMFD_FILE))
+		return -EACCES;
+
 	/* Default to the current task SID. */
 	new_tsec->sid = old_tsec->sid;
 	new_tsec->osid = old_tsec->sid;
@@ -2371,8 +2376,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
 	ad.u.file = bprm->file;
 
 	if (new_tsec->sid == old_tsec->sid) {
-		rc = avc_has_perm(old_tsec->sid, isec->sid,
-				  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
+		rc = avc_has_perm(old_tsec->sid, isec->sid, isec->sclass,
+				  FILE__EXECUTE_NO_TRANS, &ad);
 		if (rc)
 			return rc;
 	} else {
@@ -2382,8 +2387,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
 		if (rc)
 			return rc;
 
-		rc = avc_has_perm(new_tsec->sid, isec->sid,
-				  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
+		rc = avc_has_perm(new_tsec->sid, isec->sid, isec->sclass,
+				  FILE__ENTRYPOINT, &ad);
 		if (rc)
 			return rc;
 
@@ -2978,10 +2983,18 @@ static int selinux_inode_init_security_anon(struct inode *inode,
 	struct common_audit_data ad;
 	struct inode_security_struct *isec;
 	int rc;
+	bool is_memfd = false;
 
 	if (unlikely(!selinux_initialized()))
 		return 0;
 
+	if (name != NULL && name->name != NULL &&
+	    !strcmp(name->name, MEMFD_ANON_NAME)) {
+		if (!selinux_policycap_memfd_class())
+			return 0;
+		is_memfd = true;
+	}
+
 	isec = selinux_inode(inode);
 
 	/*
@@ -3001,7 +3014,10 @@ static int selinux_inode_init_security_anon(struct inode *inode,
 		isec->sclass = context_isec->sclass;
 		isec->sid = context_isec->sid;
 	} else {
-		isec->sclass = SECCLASS_ANON_INODE;
+		if (is_memfd)
+			isec->sclass = SECCLASS_MEMFD_FILE;
+		else
+			isec->sclass = SECCLASS_ANON_INODE;
 		rc = security_transition_sid(
 			sid, sid,
 			isec->sclass, name, &isec->sid);
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 5665aa5e7853..3ec85142771f 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -179,6 +179,8 @@ const struct security_class_mapping secclass_map[] = {
 	{ "anon_inode", { COMMON_FILE_PERMS, NULL } },
 	{ "io_uring", { "override_creds", "sqpoll", "cmd", "allowed", NULL } },
 	{ "user_namespace", { "create", NULL } },
+	{ "memfd_file",
+	  { COMMON_FILE_PERMS, "execute_no_trans", "entrypoint", NULL } },
 	/* last one */ { NULL, {} }
 };
 
diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h
index 135a969f873c..231d02227e59 100644
--- a/security/selinux/include/policycap.h
+++ b/security/selinux/include/policycap.h
@@ -18,6 +18,7 @@ enum {
 	POLICYDB_CAP_NETIF_WILDCARD,
 	POLICYDB_CAP_GENFS_SECLABEL_WILDCARD,
 	POLICYDB_CAP_FUNCTIONFS_SECLABEL,
+	POLICYDB_CAP_MEMFD_CLASS,
 	__POLICYDB_CAP_MAX
 };
 #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1)
diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h
index ff8882887651..454dab37bda3 100644
--- a/security/selinux/include/policycap_names.h
+++ b/security/selinux/include/policycap_names.h
@@ -21,6 +21,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = {
 	"netif_wildcard",
 	"genfs_seclabel_wildcard",
 	"functionfs_seclabel",
+	"memfd_class",
 };
 /* clang-format on */
 
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 0f954a40d3fc..5d1dad8058b1 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -209,6 +209,11 @@ static inline bool selinux_policycap_functionfs_seclabel(void)
 		selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]);
 }
 
+static inline bool selinux_policycap_memfd_class(void)
+{
+	return READ_ONCE(selinux_state.policycap[POLICYDB_CAP_MEMFD_CLASS]);
+}
+
 struct selinux_policy_convert_data;
 
 struct selinux_load_state {
-- 
cgit v1.2.3


From 26ab9830beabda863766be4a79dc590c7645f4d9 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 21 Oct 2025 08:26:49 +0100
Subject: net: stmmac: replace has_xxxx with core_type

Replace the has_gmac, has_gmac4 and has_xgmac ints, of which only one
can be set when matching a core to its driver backend, with an
enumerated type carrying the DWMAC core type.

Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Chen-Yu Tsai <wens@kernel.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://patch.msgid.link/E1vB6ld-0000000BIPy-2Qi4@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  5 ++
 .../ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  |  5 +-
 .../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-loongson.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c    |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c     |  4 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.c         | 73 ++++++++--------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_est.c   |  4 +-
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   | 13 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 34 +++++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c  | 14 ++---
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c   |  4 +-
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  |  9 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   |  4 +-
 include/linux/stmmac.h                             | 11 +++-
 21 files changed, 94 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index fee7021246b1..31254ba525d5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -43,6 +43,11 @@
 #define DWXGMAC_ID		0x76
 #define DWXLGMAC_ID		0x27
 
+static inline bool dwmac_is_xmac(enum dwmac_core_type core_type)
+{
+	return core_type == DWMAC_CORE_GMAC4 || core_type == DWMAC_CORE_XGMAC;
+}
+
 #define STMMAC_CHAN0	0	/* Always supported and default for all chips */
 
 /* TX and RX Descriptor Length, these need to be power of two.
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
index f1c2e35badf7..c7cd6497d42d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
@@ -109,7 +109,7 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev,
 	}
 
 	/* dwc-qos needs GMAC4, AAL, TSO and PMT */
-	plat_dat->has_gmac4 = 1;
+	plat_dat->core_type = DWMAC_CORE_GMAC4;
 	plat_dat->dma_cfg->aal = 1;
 	plat_dat->flags |= STMMAC_FLAG_TSO_EN;
 	plat_dat->pmt = 1;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index e74d00984b88..b2194e414ec1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -565,7 +565,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat)
 {
 	/* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */
 	plat->clk_csr = STMMAC_CSR_20_35M;
-	plat->has_gmac = 1;
+	plat->core_type = DWMAC_CORE_GMAC;
 	plat->force_sf_dma_mode = 1;
 
 	plat->mdio_bus_data->needs_reset = true;
@@ -612,8 +612,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->pdev = pdev;
 	plat->phy_addr = -1;
 	plat->clk_csr = STMMAC_CSR_250_300M;
-	plat->has_gmac = 0;
-	plat->has_gmac4 = 1;
+	plat->core_type = DWMAC_CORE_GMAC4;
 	plat->force_sf_dma_mode = 0;
 	plat->flags |= (STMMAC_FLAG_TSO_EN | STMMAC_FLAG_SPH_DISABLE);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
index ca4035cbb55b..c05f85534f0c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
@@ -473,7 +473,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev)
 			return err;
 	}
 
-	plat_dat->has_gmac = true;
+	plat_dat->core_type = DWMAC_CORE_GMAC;
 	plat_dat->bsp_priv = gmac;
 	plat_dat->set_clk_tx_rate = ipq806x_gmac_set_clk_tx_rate;
 	plat_dat->multicast_filter_bins = 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
index 592aa9d636e5..2a3ac0136cdb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
@@ -92,7 +92,7 @@ static void loongson_default_data(struct pci_dev *pdev,
 
 	/* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */
 	plat->clk_csr = STMMAC_CSR_20_35M;
-	plat->has_gmac = 1;
+	plat->core_type = DWMAC_CORE_GMAC;
 	plat->force_sf_dma_mode = 1;
 
 	/* Set default value for multicast hash bins */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
index 2562a6d036a2..6fffc9dfbae5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
@@ -41,7 +41,7 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev)
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
-	plat_dat->has_gmac = true;
+	plat_dat->core_type = DWMAC_CORE_GMAC;
 
 	reg = syscon_regmap_lookup_by_compatible("nxp,lpc1850-creg");
 	if (IS_ERR(reg)) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
index 32244217d952..d1e48b524d7a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
@@ -846,7 +846,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev)
 	plat_dat->fix_mac_speed = ethqos_fix_mac_speed;
 	plat_dat->dump_debug_regs = rgmii_dump;
 	plat_dat->ptp_clk_freq_config = ethqos_ptp_clk_freq_config;
-	plat_dat->has_gmac4 = 1;
+	plat_dat->core_type = DWMAC_CORE_GMAC4;
 	if (ethqos->has_emac_ge_3)
 		plat_dat->dwmac4_addrs = &data->dwmac4_addrs;
 	plat_dat->pmt = 1;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 51ea0caf16c1..9b92f4d335cc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -1750,8 +1750,8 @@ static int rk_gmac_probe(struct platform_device *pdev)
 	/* If the stmmac is not already selected as gmac4,
 	 * then make sure we fallback to gmac.
 	 */
-	if (!plat_dat->has_gmac4) {
-		plat_dat->has_gmac = true;
+	if (plat_dat->core_type != DWMAC_CORE_GMAC4) {
+		plat_dat->core_type = DWMAC_CORE_GMAC;
 		plat_dat->rx_fifo_size = 4096;
 		plat_dat->tx_fifo_size = 2048;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
index 221539d760bc..ee095ac13203 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
@@ -146,7 +146,7 @@ static int s32_dwmac_probe(struct platform_device *pdev)
 	gmac->ioaddr = res.addr;
 
 	/* S32CC core feature set */
-	plat->has_gmac4 = true;
+	plat->core_type = DWMAC_CORE_GMAC4;
 	plat->pmt = 1;
 	plat->flags |= STMMAC_FLAG_SPH_DISABLE;
 	plat->rx_fifo_size = 20480;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 354f01184e6c..2ff5db6d41ca 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -497,7 +497,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev)
 	plat_dat->pcs_init = socfpga_dwmac_pcs_init;
 	plat_dat->pcs_exit = socfpga_dwmac_pcs_exit;
 	plat_dat->select_pcs = socfpga_dwmac_select_pcs;
-	plat_dat->has_gmac = true;
+	plat_dat->core_type = DWMAC_CORE_GMAC;
 
 	plat_dat->riwt_off = 1;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index 1eadcf5d1ad6..7f560d78209d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -136,7 +136,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev)
 	/* platform data specifying hardware features and callbacks.
 	 * hardware features were copied from Allwinner drivers. */
 	plat_dat->tx_coe = 1;
-	plat_dat->has_gmac = true;
+	plat_dat->core_type = DWMAC_CORE_GMAC;
 	plat_dat->bsp_priv = gmac;
 	plat_dat->init = sun7i_gmac_init;
 	plat_dat->exit = sun7i_gmac_exit;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
index dc903b846b1b..d765acbe3754 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
@@ -308,7 +308,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev)
 		goto disable_clks;
 	}
 
-	plat->has_xgmac = 1;
+	plat->core_type = DWMAC_CORE_XGMAC;
 	plat->flags |= STMMAC_FLAG_TSO_EN;
 	plat->pmt = 1;
 	plat->bsp_priv = mgbe;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 3f7c765dcb79..00083ce52549 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -106,9 +106,7 @@ int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr)
 }
 
 static const struct stmmac_hwif_entry {
-	bool gmac;
-	bool gmac4;
-	bool xgmac;
+	enum dwmac_core_type core_type;
 	u32 min_id;
 	u32 dev_id;
 	const struct stmmac_regs_off regs;
@@ -127,9 +125,7 @@ static const struct stmmac_hwif_entry {
 } stmmac_hw[] = {
 	/* NOTE: New HW versions shall go to the end of this table */
 	{
-		.gmac = false,
-		.gmac4 = false,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_MAC100,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -146,9 +142,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac100_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
-		.gmac = true,
-		.gmac4 = false,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_GMAC,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -165,9 +159,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac1000_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
-		.gmac = false,
-		.gmac4 = true,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_GMAC4,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -187,9 +179,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac4_setup,
 		.quirks = stmmac_dwmac4_quirks,
 	}, {
-		.gmac = false,
-		.gmac4 = true,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_GMAC4,
 		.min_id = DWMAC_CORE_4_00,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -210,9 +200,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
-		.gmac = false,
-		.gmac4 = true,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_GMAC4,
 		.min_id = DWMAC_CORE_4_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -233,9 +221,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
-		.gmac = false,
-		.gmac4 = true,
-		.xgmac = false,
+		.core_type = DWMAC_CORE_GMAC4,
 		.min_id = DWMAC_CORE_5_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -256,9 +242,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
-		.gmac = false,
-		.gmac4 = false,
-		.xgmac = true,
+		.core_type = DWMAC_CORE_XGMAC,
 		.min_id = DWXGMAC_CORE_2_10,
 		.dev_id = DWXGMAC_ID,
 		.regs = {
@@ -280,9 +264,7 @@ static const struct stmmac_hwif_entry {
 		.setup = dwxgmac2_setup,
 		.quirks = NULL,
 	}, {
-		.gmac = false,
-		.gmac4 = false,
-		.xgmac = true,
+		.core_type = DWMAC_CORE_XGMAC,
 		.min_id = DWXLGMAC_CORE_2_00,
 		.dev_id = DWXLGMAC_ID,
 		.regs = {
@@ -308,20 +290,18 @@ static const struct stmmac_hwif_entry {
 
 int stmmac_hwif_init(struct stmmac_priv *priv)
 {
-	bool needs_xgmac = priv->plat->has_xgmac;
-	bool needs_gmac4 = priv->plat->has_gmac4;
-	bool needs_gmac = priv->plat->has_gmac;
+	enum dwmac_core_type core_type = priv->plat->core_type;
 	const struct stmmac_hwif_entry *entry;
 	struct mac_device_info *mac;
 	bool needs_setup = true;
 	u32 id, dev_id = 0;
 	int i, ret;
 
-	if (needs_gmac) {
+	if (core_type == DWMAC_CORE_GMAC) {
 		id = stmmac_get_id(priv, GMAC_VERSION);
-	} else if (needs_gmac4 || needs_xgmac) {
+	} else if (dwmac_is_xmac(core_type)) {
 		id = stmmac_get_id(priv, GMAC4_VERSION);
-		if (needs_xgmac)
+		if (core_type == DWMAC_CORE_XGMAC)
 			dev_id = stmmac_get_dev_id(priv, GMAC4_VERSION);
 	} else {
 		id = 0;
@@ -331,14 +311,16 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 	priv->synopsys_id = id;
 
 	/* Lets assume some safe values first */
-	priv->ptpaddr = priv->ioaddr +
-		(needs_gmac4 ? PTP_GMAC4_OFFSET : PTP_GMAC3_X_OFFSET);
-	priv->mmcaddr = priv->ioaddr +
-		(needs_gmac4 ? MMC_GMAC4_OFFSET : MMC_GMAC3_X_OFFSET);
-	if (needs_gmac4)
+	if (core_type == DWMAC_CORE_GMAC4) {
+		priv->ptpaddr = priv->ioaddr + PTP_GMAC4_OFFSET;
+		priv->mmcaddr = priv->ioaddr + MMC_GMAC4_OFFSET;
 		priv->estaddr = priv->ioaddr + EST_GMAC4_OFFSET;
-	else if (needs_xgmac)
-		priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET;
+	} else {
+		priv->ptpaddr = priv->ioaddr + PTP_GMAC3_X_OFFSET;
+		priv->mmcaddr = priv->ioaddr + MMC_GMAC3_X_OFFSET;
+		if (core_type == DWMAC_CORE_XGMAC)
+			priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET;
+	}
 
 	/* Check for HW specific setup first */
 	if (priv->plat->setup) {
@@ -355,16 +337,12 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 	for (i = ARRAY_SIZE(stmmac_hw) - 1; i >= 0; i--) {
 		entry = &stmmac_hw[i];
 
-		if (needs_gmac ^ entry->gmac)
-			continue;
-		if (needs_gmac4 ^ entry->gmac4)
-			continue;
-		if (needs_xgmac ^ entry->xgmac)
+		if (core_type != entry->core_type)
 			continue;
 		/* Use synopsys_id var because some setups can override this */
 		if (priv->synopsys_id < entry->min_id)
 			continue;
-		if (needs_xgmac && (dev_id ^ entry->dev_id))
+		if (core_type == DWMAC_CORE_XGMAC && (dev_id ^ entry->dev_id))
 			continue;
 
 		/* Only use generic HW helpers if needed */
@@ -400,6 +378,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 	}
 
 	dev_err(priv->device, "Failed to find HW IF (id=0x%x, gmac=%d/%d)\n",
-			id, needs_gmac, needs_gmac4);
+		id, core_type == DWMAC_CORE_GMAC,
+		core_type == DWMAC_CORE_GMAC4);
 	return -EINVAL;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c
index 4b513d27a988..afc516059b89 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c
@@ -53,7 +53,7 @@ static int est_configure(struct stmmac_priv *priv, struct stmmac_est *cfg,
 	}
 
 	ctrl = readl(est_addr + EST_CONTROL);
-	if (priv->plat->has_xgmac) {
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC) {
 		ctrl &= ~EST_XGMAC_PTOV;
 		ctrl |= ((NSEC_PER_SEC / ptp_rate) * EST_XGMAC_PTOV_MUL) <<
 			 EST_XGMAC_PTOV_SHIFT;
@@ -148,7 +148,7 @@ static void est_irq_status(struct stmmac_priv *priv, struct net_device *dev,
 	}
 
 	if (status & EST_BTRE) {
-		if (priv->plat->has_xgmac) {
+		if (priv->plat->core_type == DWMAC_CORE_XGMAC) {
 			btrl = FIELD_GET(EST_XGMAC_BTRL, status);
 			btrl_max = FIELD_MAX(EST_XGMAC_BTRL);
 		} else {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index c60cd948311e..df016c4eb710 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -303,9 +303,10 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev,
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	if (priv->plat->has_gmac || priv->plat->has_gmac4)
+	if (priv->plat->core_type == DWMAC_CORE_GMAC ||
+	    priv->plat->core_type == DWMAC_CORE_GMAC4)
 		strscpy(info->driver, GMAC_ETHTOOL_NAME, sizeof(info->driver));
-	else if (priv->plat->has_xgmac)
+	else if (priv->plat->core_type == DWMAC_CORE_XGMAC)
 		strscpy(info->driver, XGMAC_ETHTOOL_NAME, sizeof(info->driver));
 	else
 		strscpy(info->driver, MAC100_ETHTOOL_NAME,
@@ -351,9 +352,9 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	if (priv->plat->has_xgmac)
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC)
 		return XGMAC_REGSIZE * 4;
-	else if (priv->plat->has_gmac4)
+	else if (priv->plat->core_type == DWMAC_CORE_GMAC4)
 		return GMAC4_REG_SPACE_SIZE;
 	return REG_SPACE_SIZE;
 }
@@ -368,12 +369,12 @@ static void stmmac_ethtool_gregs(struct net_device *dev,
 	stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space);
 
 	/* Copy DMA registers to where ethtool expects them */
-	if (priv->plat->has_gmac4) {
+	if (priv->plat->core_type == DWMAC_CORE_GMAC4) {
 		/* GMAC4 dumps its DMA registers at its DMA_CHAN_BASE_ADDR */
 		memcpy(&reg_space[ETHTOOL_DMA_OFFSET],
 		       &reg_space[GMAC4_DMA_CHAN_BASE_ADDR / 4],
 		       NUM_DWMAC4_DMA_REGS * 4);
-	} else if (!priv->plat->has_xgmac) {
+	} else if (priv->plat->core_type != DWMAC_CORE_XGMAC) {
 		memcpy(&reg_space[ETHTOOL_DMA_OFFSET],
 		       &reg_space[DMA_BUS_MODE / 4],
 		       NUM_DWMAC1000_DMA_REGS * 4);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 5e6aaead5894..9fa3c221a0c3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -446,7 +446,7 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 	if (!priv->hwts_rx_en)
 		return;
 	/* For GMAC4, the valid timestamp is from CTX next desc. */
-	if (priv->plat->has_gmac4 || priv->plat->has_xgmac)
+	if (dwmac_is_xmac(priv->plat->core_type))
 		desc = np;
 
 	/* Check if timestamp is available */
@@ -697,7 +697,7 @@ static int stmmac_hwtstamp_get(struct net_device *dev,
 static int stmmac_init_tstamp_counter(struct stmmac_priv *priv,
 				      u32 systime_flags)
 {
-	bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
+	bool xmac = dwmac_is_xmac(priv->plat->core_type);
 	struct timespec64 now;
 	u32 sec_inc = 0;
 	u64 temp = 0;
@@ -746,7 +746,7 @@ static int stmmac_init_tstamp_counter(struct stmmac_priv *priv,
  */
 static int stmmac_init_timestamping(struct stmmac_priv *priv)
 {
-	bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
+	bool xmac = dwmac_is_xmac(priv->plat->core_type);
 	int ret;
 
 	if (priv->plat->ptp_clk_freq_config)
@@ -2413,7 +2413,7 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
 		txfifosz = priv->dma_cap.tx_fifo_size;
 
 	/* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */
-	if (priv->plat->has_gmac4 || priv->plat->has_xgmac) {
+	if (dwmac_is_xmac(priv->plat->core_type)) {
 		rxfifosz /= rx_channels_count;
 		txfifosz /= tx_channels_count;
 	}
@@ -4520,7 +4520,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (skb_is_gso(skb) && priv->tso) {
 		if (gso & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
 			return stmmac_tso_xmit(skb, dev);
-		if (priv->plat->has_gmac4 && (gso & SKB_GSO_UDP_L4))
+		if (priv->plat->core_type == DWMAC_CORE_GMAC4 &&
+		    (gso & SKB_GSO_UDP_L4))
 			return stmmac_tso_xmit(skb, dev);
 	}
 
@@ -5973,7 +5974,7 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
 	u32 queue;
 	bool xmac;
 
-	xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
+	xmac = dwmac_is_xmac(priv->plat->core_type);
 	queues_count = (rx_cnt > tx_cnt) ? rx_cnt : tx_cnt;
 
 	if (priv->irq_wake)
@@ -5987,7 +5988,7 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
 		stmmac_fpe_irq_status(priv);
 
 	/* To handle GMAC own interrupts */
-	if ((priv->plat->has_gmac) || xmac) {
+	if (priv->plat->core_type == DWMAC_CORE_GMAC || xmac) {
 		int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
 
 		if (unlikely(status)) {
@@ -6348,7 +6349,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v)
 		   (priv->dma_cap.mbps_1000) ? "Y" : "N");
 	seq_printf(seq, "\tHalf duplex: %s\n",
 		   (priv->dma_cap.half_duplex) ? "Y" : "N");
-	if (priv->plat->has_xgmac) {
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC) {
 		seq_printf(seq,
 			   "\tNumber of Additional MAC address registers: %d\n",
 			   priv->dma_cap.multi_addr);
@@ -6372,7 +6373,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v)
 		   (priv->dma_cap.time_stamp) ? "Y" : "N");
 	seq_printf(seq, "\tIEEE 1588-2008 Advanced Time Stamp: %s\n",
 		   (priv->dma_cap.atime_stamp) ? "Y" : "N");
-	if (priv->plat->has_xgmac)
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC)
 		seq_printf(seq, "\tTimestamp System Time Source: %s\n",
 			   dwxgmac_timestamp_source[priv->dma_cap.tssrc]);
 	seq_printf(seq, "\t802.3az - Energy-Efficient Ethernet (EEE): %s\n",
@@ -6381,7 +6382,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "\tChecksum Offload in TX: %s\n",
 		   (priv->dma_cap.tx_coe) ? "Y" : "N");
 	if (priv->synopsys_id >= DWMAC_CORE_4_00 ||
-	    priv->plat->has_xgmac) {
+	    priv->plat->core_type == DWMAC_CORE_XGMAC) {
 		seq_printf(seq, "\tIP Checksum Offload in RX: %s\n",
 			   (priv->dma_cap.rx_coe) ? "Y" : "N");
 	} else {
@@ -7233,8 +7234,9 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 	 * has to be disable and this can be done by passing the
 	 * riwt_off field from the platform.
 	 */
-	if (((priv->synopsys_id >= DWMAC_CORE_3_50) ||
-	    (priv->plat->has_xgmac)) && (!priv->plat->riwt_off)) {
+	if ((priv->synopsys_id >= DWMAC_CORE_3_50 ||
+	     priv->plat->core_type == DWMAC_CORE_XGMAC) &&
+	    !priv->plat->riwt_off) {
 		priv->use_riwt = 1;
 		dev_info(priv->device,
 			 "Enable RX Mitigation via HW Watchdog Timer\n");
@@ -7355,7 +7357,7 @@ static int stmmac_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
 		return -ENODATA;
 
 	/* For GMAC4, the valid timestamp is from CTX next desc. */
-	if (priv->plat->has_gmac4 || priv->plat->has_xgmac)
+	if (dwmac_is_xmac(priv->plat->core_type))
 		desc_contains_ts = ndesc;
 
 	/* Check if timestamp is available */
@@ -7511,7 +7513,7 @@ int stmmac_dvr_probe(struct device *device,
 
 	if ((priv->plat->flags & STMMAC_FLAG_TSO_EN) && (priv->dma_cap.tsoen)) {
 		ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
-		if (priv->plat->has_gmac4)
+		if (priv->plat->core_type == DWMAC_CORE_GMAC4)
 			ndev->hw_features |= NETIF_F_GSO_UDP_L4;
 		priv->tso = true;
 		dev_info(priv->device, "TSO feature enabled\n");
@@ -7564,7 +7566,7 @@ int stmmac_dvr_probe(struct device *device,
 #ifdef STMMAC_VLAN_TAG_USED
 	/* Both mac100 and gmac support receive VLAN tag detection */
 	ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX;
-	if (priv->plat->has_gmac4 || priv->plat->has_xgmac) {
+	if (dwmac_is_xmac(priv->plat->core_type)) {
 		ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
 		priv->hw->hw_vlan_en = true;
 	}
@@ -7595,7 +7597,7 @@ int stmmac_dvr_probe(struct device *device,
 
 	/* MTU range: 46 - hw-specific max */
 	ndev->min_mtu = ETH_ZLEN - ETH_HLEN;
-	if (priv->plat->has_xgmac)
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC)
 		ndev->max_mtu = XGMAC_JUMBO_LEN;
 	else if ((priv->plat->enh_desc) || (priv->synopsys_id >= DWMAC_CORE_4_00))
 		ndev->max_mtu = JUMBO_LEN;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 65db43e9c85e..3f8cc3293964 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -301,7 +301,7 @@ static int stmmac_mdio_read_c22(struct mii_bus *bus, int phyaddr, int phyreg)
 	struct stmmac_priv *priv = netdev_priv(bus->priv);
 	u32 cmd;
 
-	if (priv->plat->has_gmac4)
+	if (priv->plat->core_type == DWMAC_CORE_GMAC4)
 		cmd = MII_GMAC4_READ;
 	else
 		cmd = 0;
@@ -344,7 +344,7 @@ static int stmmac_mdio_write_c22(struct mii_bus *bus, int phyaddr, int phyreg,
 	struct stmmac_priv *priv = netdev_priv(bus->priv);
 	u32 cmd;
 
-	if (priv->plat->has_gmac4)
+	if (priv->plat->core_type == DWMAC_CORE_GMAC4)
 		cmd = MII_GMAC4_WRITE;
 	else
 		cmd = MII_ADDR_GWRITE;
@@ -417,7 +417,7 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	 * on MDC, so perform a dummy mdio read. To be updated for GMAC4
 	 * if needed.
 	 */
-	if (!priv->plat->has_gmac4)
+	if (priv->plat->core_type != DWMAC_CORE_GMAC4)
 		writel(0, priv->ioaddr + mii_address);
 #endif
 	return 0;
@@ -528,7 +528,7 @@ static u32 stmmac_clk_csr_set(struct stmmac_priv *priv)
 			value = 0;
 	}
 
-	if (priv->plat->has_xgmac) {
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC) {
 		if (clk_rate > 400000000)
 			value = 0x5;
 		else if (clk_rate > 350000000)
@@ -601,7 +601,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 
 	new_bus->name = "stmmac";
 
-	if (priv->plat->has_xgmac) {
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC) {
 		new_bus->read = &stmmac_xgmac2_mdio_read_c22;
 		new_bus->write = &stmmac_xgmac2_mdio_write_c22;
 		new_bus->read_c45 = &stmmac_xgmac2_mdio_read_c45;
@@ -622,7 +622,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 	} else {
 		new_bus->read = &stmmac_mdio_read_c22;
 		new_bus->write = &stmmac_mdio_write_c22;
-		if (priv->plat->has_gmac4) {
+		if (priv->plat->core_type == DWMAC_CORE_GMAC4) {
 			new_bus->read_c45 = &stmmac_mdio_read_c45;
 			new_bus->write_c45 = &stmmac_mdio_write_c45;
 		}
@@ -650,7 +650,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 	}
 
 	/* Looks like we need a dummy read for XGMAC only and C45 PHYs */
-	if (priv->plat->has_xgmac)
+	if (priv->plat->core_type == DWMAC_CORE_XGMAC)
 		stmmac_xgmac2_mdio_read_c45(new_bus, 0, 0, 0);
 
 	/* If fixed-link is set, skip PHY scanning */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 4e3aa611fda8..94b3a3b27270 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -23,7 +23,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat)
 {
 	/* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */
 	plat->clk_csr = STMMAC_CSR_20_35M;
-	plat->has_gmac = 1;
+	plat->core_type = DWMAC_CORE_GMAC;
 	plat->force_sf_dma_mode = 1;
 
 	plat->mdio_bus_data->needs_reset = true;
@@ -76,7 +76,7 @@ static int snps_gmac5_default_data(struct pci_dev *pdev,
 	int i;
 
 	plat->clk_csr = STMMAC_CSR_250_300M;
-	plat->has_gmac4 = 1;
+	plat->core_type = DWMAC_CORE_GMAC4;
 	plat->force_sf_dma_mode = 1;
 	plat->flags |= STMMAC_FLAG_TSO_EN;
 	plat->pmt = 1;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 27bcaae07a7f..fbb92cc6ab59 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -552,12 +552,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 				&pdev->dev, plat->unicast_filter_entries);
 		plat->multicast_filter_bins = dwmac1000_validate_mcast_bins(
 				&pdev->dev, plat->multicast_filter_bins);
-		plat->has_gmac = 1;
+		plat->core_type = DWMAC_CORE_GMAC;
 		plat->pmt = 1;
 	}
 
 	if (of_device_is_compatible(np, "snps,dwmac-3.40a")) {
-		plat->has_gmac = 1;
+		plat->core_type = DWMAC_CORE_GMAC;
 		plat->enh_desc = 1;
 		plat->tx_coe = 1;
 		plat->bugged_jumbo = 1;
@@ -565,8 +565,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 	}
 
 	if (of_device_compatible_match(np, stmmac_gmac4_compats)) {
-		plat->has_gmac4 = 1;
-		plat->has_gmac = 0;
+		plat->core_type = DWMAC_CORE_GMAC4;
 		plat->pmt = 1;
 		if (of_property_read_bool(np, "snps,tso"))
 			plat->flags |= STMMAC_FLAG_TSO_EN;
@@ -580,7 +579,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 	}
 
 	if (of_device_is_compatible(np, "snps,dwxgmac")) {
-		plat->has_xgmac = 1;
+		plat->core_type = DWMAC_CORE_XGMAC;
 		plat->pmt = 1;
 		if (of_property_read_bool(np, "snps,tso"))
 			plat->flags |= STMMAC_FLAG_TSO_EN;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 993ff4e87e55..3e30172fa129 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -57,7 +57,7 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	bool xmac, est_rst = false;
 	int ret;
 
-	xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
+	xmac = dwmac_is_xmac(priv->plat->core_type);
 
 	if (delta < 0) {
 		neg_adj = 1;
@@ -344,7 +344,7 @@ void stmmac_ptp_register(struct stmmac_priv *priv)
 
 	/* Calculate the clock domain crossing (CDC) error if necessary */
 	priv->plat->cdc_error_adj = 0;
-	if (priv->plat->has_gmac4)
+	if (priv->plat->core_type == DWMAC_CORE_GMAC4)
 		priv->plat->cdc_error_adj = (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate;
 
 	/* Update the ptp clock parameters based on feature discovery, when
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 99022620457a..151c81c560c8 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -171,6 +171,13 @@ struct dwmac4_addrs {
 	u32 mtl_low_cred_offset;
 };
 
+enum dwmac_core_type {
+	DWMAC_CORE_MAC100,
+	DWMAC_CORE_GMAC,
+	DWMAC_CORE_GMAC4,
+	DWMAC_CORE_XGMAC,
+};
+
 #define STMMAC_FLAG_SPH_DISABLE			BIT(1)
 #define STMMAC_FLAG_USE_PHY_WOL			BIT(2)
 #define STMMAC_FLAG_HAS_SUN8I			BIT(3)
@@ -186,6 +193,7 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY	BIT(13)
 
 struct plat_stmmacenet_data {
+	enum dwmac_core_type core_type;
 	int bus_id;
 	int phy_addr;
 	/* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media
@@ -219,7 +227,6 @@ struct plat_stmmacenet_data {
 	struct stmmac_dma_cfg *dma_cfg;
 	struct stmmac_safety_feature_cfg *safety_feat_cfg;
 	int clk_csr;
-	int has_gmac;
 	int enh_desc;
 	int tx_coe;
 	int rx_coe;
@@ -282,10 +289,8 @@ struct plat_stmmacenet_data {
 	struct reset_control *stmmac_rst;
 	struct reset_control *stmmac_ahb_rst;
 	struct stmmac_axi *axi;
-	int has_gmac4;
 	int rss_en;
 	int mac_port_sel_speed;
-	int has_xgmac;
 	u8 vlan_fail_q;
 	struct pci_dev *pdev;
 	int int_snapshot_num;
-- 
cgit v1.2.3


From 114573962a68a527835f2f1433a89bc2f9feac1b Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 21 Oct 2025 19:46:26 +0800
Subject: net/sched: Remove unused inline helper qdisc_from_priv()

Since commit fb38306ceb9e ("net/sched: Retire ATM qdisc"), this is
not used and can be removed.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://patch.msgid.link/20251021114626.3148894-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 8a75c73fc555..c660ac871083 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -25,11 +25,6 @@ struct qdisc_walker {
 		 const struct Qdisc * : (const void *)&q->privdata,	\
 		 struct Qdisc * : (void *)&q->privdata)
 
-static inline struct Qdisc *qdisc_from_priv(void *priv)
-{
-	return container_of(priv, struct Qdisc, privdata);
-}
-
 /* 
    Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth
    
-- 
cgit v1.2.3


From 7958b4bb806c1af800ca23c8333a98231b3ab0b1 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 22 Oct 2025 15:23:42 +0200
Subject: pinctrl: pinmux: Add missing .function_is_gpio kerneldoc

This callback was undocumented; add the docs.

Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinmux.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index 6db6c3e1ccc2..094bbe2fd6fd 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -35,6 +35,16 @@ struct pinctrl_gpio_range;
  *	name can be used with the generic @pinctrl_ops to retrieve the
  *	actual pins affected. The applicable groups will be returned in
  *	@groups and the number of groups in @num_groups
+ * @function_is_gpio: determine if the indicated function selector passed
+ *	corresponds to the GPIO function which is used by the accelerated GPIO
+ *	functions @gpio_request_enable, @gpio_disable_free and
+ *	@gpio_set_direction. When the pin control core can properly determine
+ *	if a function is a GPIO function, it is easier to use the @strict mode
+ *	on the pin controller. Since a single function is passed, this is
+ *	only useful on pin controllers that use a specific function for GPIO,
+ *	and that usually presupposes that a one-group-per-pin approach is
+ *	used, so that a single function can be set on a single pin to turn
+ *	it to GPIO mode.
  * @set_mux: enable a certain muxing function with a certain pin group. The
  *	driver does not need to figure out whether enabling this function
  *	conflicts some other use of the pins in that group, such collisions
-- 
cgit v1.2.3


From 243ce64b2b371cdf2cbc39c9422cb3047cab6de7 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 13 Oct 2025 12:51:57 +0200
Subject: backlight: Do not include <linux/fb.h> in header file

The backlight interfaces don't require anything from <linux/fb.h>, so
don't include it.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson (RISCstar) <danielt@kernel.org>
Reviewed-by: Simona Vetter <simona.vetter@ffwll.ch>
Link: https://patch.msgid.link/20251013105553.836715-1-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/backlight.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index 10e626db7eee..f29a9ef1052e 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -10,7 +10,6 @@
 #define _LINUX_BACKLIGHT_H
 
 #include <linux/device.h>
-#include <linux/fb.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From c2afdd73e5ba2146c7e8b43b2607da5d4b720d9d Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@sandisk.com>
Date: Thu, 19 Jun 2025 11:56:19 +0300
Subject: mmc: core: Skip to set the default 200mA SD current limit

Let's avoid updating the SD current limit when the maximum current is 200mA
(0.72W) or less, as this is already the default value for the SD card. In
this way we avoid sending an unnecessary command during initialization.

Signed-off-by: Avri Altman <avri.altman@sandisk.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd.c    | 7 ++-----
 include/linux/mmc/card.h | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index b6758911e22c..948948ca9b4a 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -554,7 +554,7 @@ static u32 sd_get_host_max_current(struct mmc_host *host)
 
 static int sd_set_current_limit(struct mmc_card *card, u8 *status)
 {
-	int current_limit = SD_SET_CURRENT_NO_CHANGE;
+	int current_limit = SD_SET_CURRENT_LIMIT_200;
 	int err;
 	u32 max_current;
 
@@ -598,11 +598,8 @@ static int sd_set_current_limit(struct mmc_card *card, u8 *status)
 	else if (max_current >= 400 &&
 		 card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_400)
 		current_limit = SD_SET_CURRENT_LIMIT_400;
-	else if (max_current >= 200 &&
-		 card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_200)
-		current_limit = SD_SET_CURRENT_LIMIT_200;
 
-	if (current_limit != SD_SET_CURRENT_NO_CHANGE) {
+	if (current_limit != SD_SET_CURRENT_LIMIT_200) {
 		err = mmc_sd_switch(card, SD_SWITCH_SET, 3,
 				current_limit, status);
 		if (err)
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index ddcdf23d731c..e9e964c20e53 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -182,7 +182,6 @@ struct sd_switch_caps {
 #define SD_SET_CURRENT_LIMIT_400	1
 #define SD_SET_CURRENT_LIMIT_600	2
 #define SD_SET_CURRENT_LIMIT_800	3
-#define SD_SET_CURRENT_NO_CHANGE	(-1)
 
 #define SD_MAX_CURRENT_200	(1 << SD_SET_CURRENT_LIMIT_200)
 #define SD_MAX_CURRENT_400	(1 << SD_SET_CURRENT_LIMIT_400)
-- 
cgit v1.2.3


From 7b2c4224faa7bc6cdaf1fb6106ec7b46c63a28cb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 14 Oct 2025 13:00:55 -0700
Subject: scsi: ufs: core: Improve documentation in include/ufs/ufshci.h

Make it easier to find the sections in the UFSHCI standard where these
constants come from.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251014200118.3390839-4-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/ufs/ufshci.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h
index e64b70132101..ff96056b2ac3 100644
--- a/include/ufs/ufshci.h
+++ b/include/ufs/ufshci.h
@@ -83,12 +83,14 @@ enum {
 };
 
 enum {
+	/* Submission Queue (SQ) Configuration Registers */
 	REG_SQATTR		= 0x0,
 	REG_SQLBA		= 0x4,
 	REG_SQUBA		= 0x8,
 	REG_SQDAO		= 0xC,
 	REG_SQISAO		= 0x10,
 
+	/* Completion Queue (CQ) Configuration Registers */
 	REG_CQATTR		= 0x20,
 	REG_CQLBA		= 0x24,
 	REG_CQUBA		= 0x28,
@@ -96,6 +98,7 @@ enum {
 	REG_CQISAO		= 0x30,
 };
 
+/* Operation and Runtime Registers - Submission Queues and Completion Queues */
 enum {
 	REG_SQHP		= 0x0,
 	REG_SQTP		= 0x4,
-- 
cgit v1.2.3


From b3b0842bcb0696e25b1977238ce2907a4c02d8c4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 14 Oct 2025 13:00:56 -0700
Subject: scsi: ufs: core: Change the type of uic_command::cmd_active

Since uic_command::cmd_active is used as a boolean variable, change its
type from 'int' into 'bool'. No functionality has been changed.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251014200118.3390839-5-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 6 +++---
 include/ufs/ufshcd.h      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8339fec975b9..fa37162046ca 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2618,7 +2618,7 @@ __ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd)
 
 	init_completion(&uic_cmd->done);
 
-	uic_cmd->cmd_active = 1;
+	uic_cmd->cmd_active = true;
 	ufshcd_dispatch_uic_cmd(hba, uic_cmd);
 
 	return 0;
@@ -5587,13 +5587,13 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status)
 		cmd->argument2 |= ufshcd_get_uic_cmd_result(hba);
 		cmd->argument3 = ufshcd_get_dme_attr_val(hba);
 		if (!hba->uic_async_done)
-			cmd->cmd_active = 0;
+			cmd->cmd_active = false;
 		complete(&cmd->done);
 		retval = IRQ_HANDLED;
 	}
 
 	if (intr_status & UFSHCD_UIC_PWR_MASK && hba->uic_async_done) {
-		cmd->cmd_active = 0;
+		cmd->cmd_active = false;
 		complete(hba->uic_async_done);
 		retval = IRQ_HANDLED;
 	}
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9425cfd9d00e..4d215a18522c 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -78,7 +78,7 @@ struct uic_command {
 	const u32 argument1;
 	u32 argument2;
 	u32 argument3;
-	int cmd_active;
+	bool cmd_active;
 	struct completion done;
 };
 
-- 
cgit v1.2.3


From b30006b5bec1dcba207bc42e7f7cd96a568acc27 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 14 Oct 2025 13:00:58 -0700
Subject: scsi: ufs: core: Move the ufshcd_enable_intr() declaration

ufshcd_enable_intr() is not exported and hence should not be declared in
include/ufs/ufshcd.h.

Fixes: 253757797973 ("scsi: ufs: core: Change MCQ interrupt enable flow")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Link: https://patch.msgid.link/20251014200118.3390839-7-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd-priv.h | 2 ++
 include/ufs/ufshcd.h           | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index d0a2c963a27d..1f0d38aa37f9 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -6,6 +6,8 @@
 #include <linux/pm_runtime.h>
 #include <ufs/ufshcd.h>
 
+void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs);
+
 static inline bool ufshcd_is_user_access_allowed(struct ufs_hba *hba)
 {
 	return !hba->shutting_down;
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 4d215a18522c..edfbc3a216be 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1295,7 +1295,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg)
 
 void ufshcd_enable_irq(struct ufs_hba *hba);
 void ufshcd_disable_irq(struct ufs_hba *hba);
-void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs);
 int ufshcd_alloc_host(struct device *, struct ufs_hba **);
 int ufshcd_hba_enable(struct ufs_hba *hba);
 int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int);
-- 
cgit v1.2.3


From 4da42aaa82d6e3fa2e822e6e771d031c2e20a6c7 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 16 Oct 2025 11:47:54 -0300
Subject: printk: nbcon: Export console_is_usable

The helper will be used on KDB code in the next commits.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-1-866aac60a80e@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/internal.h | 45 ---------------------------------------------
 2 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/include/linux/console.h b/include/linux/console.h
index 8f10d0a85bb4..5c3a718c22fc 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -19,6 +19,7 @@
 #include <linux/irq_work.h>
 #include <linux/rculist.h>
 #include <linux/rcuwait.h>
+#include <linux/smp.h>
 #include <linux/types.h>
 #include <linux/vesa.h>
 
@@ -605,6 +606,48 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
 extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
+
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records. Note that this function does not consider the current context,
+ * which can also play a role in deciding if @con can be used to print
+ * records.
+ */
+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
+{
+	if (!(flags & CON_ENABLED))
+		return false;
+
+	if ((flags & CON_SUSPENDED))
+		return false;
+
+	if (flags & CON_NBCON) {
+		/* The write_atomic() callback is optional. */
+		if (use_atomic && !con->write_atomic)
+			return false;
+
+		/*
+		 * For the !use_atomic case, @printk_kthreads_running is not
+		 * checked because the write_thread() callback is also used
+		 * via the legacy loop when the printer threads are not
+		 * available.
+		 */
+	} else {
+		if (!con->write)
+			return false;
+	}
+
+	/*
+	 * Console drivers may assume that per-cpu resources have been
+	 * allocated. So unless they're explicitly marked as being able to
+	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+	 */
+	if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+		return false;
+
+	return true;
+}
+
 #else
 static inline void nbcon_cpu_emergency_enter(void) { }
 static inline void nbcon_cpu_emergency_exit(void) { }
@@ -612,6 +655,8 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
+static inline bool console_is_usable(struct console *con, short flags,
+				     bool use_atomic) { return false; }
 #endif
 
 extern int console_set_on_cmdline;
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index f72bbfa266d6..7e3128ec9336 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,7 +3,6 @@
  * internal.h - printk internal definitions
  */
 #include <linux/console.h>
-#include <linux/percpu.h>
 #include <linux/types.h>
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
@@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con);
 void nbcon_kthread_stop(struct console *con);
 void nbcon_kthreads_wake(void);
 
-/*
- * Check if the given console is currently capable and allowed to print
- * records. Note that this function does not consider the current context,
- * which can also play a role in deciding if @con can be used to print
- * records.
- */
-static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
-{
-	if (!(flags & CON_ENABLED))
-		return false;
-
-	if ((flags & CON_SUSPENDED))
-		return false;
-
-	if (flags & CON_NBCON) {
-		/* The write_atomic() callback is optional. */
-		if (use_atomic && !con->write_atomic)
-			return false;
-
-		/*
-		 * For the !use_atomic case, @printk_kthreads_running is not
-		 * checked because the write_thread() callback is also used
-		 * via the legacy loop when the printer threads are not
-		 * available.
-		 */
-	} else {
-		if (!con->write)
-			return false;
-	}
-
-	/*
-	 * Console drivers may assume that per-cpu resources have been
-	 * allocated. So unless they're explicitly marked as being able to
-	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
-	 */
-	if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
-		return false;
-
-	return true;
-}
-
 /**
  * nbcon_kthread_wake - Wake up a console printing thread
  * @con:	Console to operate on
@@ -204,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand
 static inline void nbcon_kthread_wake(struct console *con) { }
 static inline void nbcon_kthreads_wake(void) { }
 
-static inline bool console_is_usable(struct console *con, short flags,
-				     bool use_atomic) { return false; }
-
 #endif /* CONFIG_PRINTK */
 
 extern bool have_boot_console;
-- 
cgit v1.2.3


From 49f7d3054e84617395a37a058251c81320a3614a Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 16 Oct 2025 11:47:55 -0300
Subject: printk: nbcon: Introduce KDB helpers

These helpers will be used when calling console->write_atomic on
KDB code in the next patch. It's basically the same implementation
as nbcon_device_try_acquire, but using NBCON_PRIO_EMERGENCY when
acquiring the context.

If the acquire succeeds, the message and message length are assigned to
nbcon_write_context so ->write_atomic can print the message.

After release try to flush the console since there may be a backlog of
messages in the ringbuffer. The kthread console printers do not get a
chance to run while kdb is active.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-2-866aac60a80e@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h |  6 +++++
 kernel/printk/nbcon.c   | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/linux/console.h b/include/linux/console.h
index 5c3a718c22fc..9406342b27db 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -606,6 +606,9 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
 extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
+extern bool nbcon_kdb_try_acquire(struct console *con,
+				  struct nbcon_write_context *wctxt);
+extern void nbcon_kdb_release(struct nbcon_write_context *wctxt);
 
 /*
  * Check if the given console is currently capable and allowed to print
@@ -655,6 +658,9 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
+static inline bool nbcon_kdb_try_acquire(struct console *con,
+					 struct nbcon_write_context *wctxt) { return false; }
+static inline void nbcon_kdb_release(struct console *con) { }
 static inline bool console_is_usable(struct console *con, short flags,
 				     bool use_atomic) { return false; }
 #endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 558ef3177976..e1bf5409cb6b 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -1855,3 +1855,64 @@ void nbcon_device_release(struct console *con)
 	console_srcu_read_unlock(cookie);
 }
 EXPORT_SYMBOL_GPL(nbcon_device_release);
+
+/**
+ * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe
+ *			   section
+ * @con:	The nbcon console to acquire
+ * @wctxt:	The nbcon write context to be used on success
+ *
+ * Context:	Under console_srcu_read_lock() for emitting a single kdb message
+ *		using the given con->write_atomic() callback. Can be called
+ *		only when the console is usable at the moment.
+ *
+ * Return:	True if the console was acquired. False otherwise.
+ *
+ * kdb emits messages on consoles registered for printk() without
+ * storing them into the ring buffer. It has to acquire the console
+ * ownership so that it could call con->write_atomic() callback in a safe way.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_kdb_try_acquire(struct console *con,
+			   struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	memset(ctxt, 0, sizeof(*ctxt));
+	ctxt->console = con;
+	ctxt->prio    = NBCON_PRIO_EMERGENCY;
+
+	if (!nbcon_context_try_acquire(ctxt, false))
+		return false;
+
+	if (!nbcon_context_enter_unsafe(ctxt))
+		return false;
+
+	return true;
+}
+
+/**
+ * nbcon_kdb_release - Exit unsafe section and release the nbcon console
+ *
+ * @wctxt:	The nbcon write context initialized by a successful
+ *		nbcon_kdb_try_acquire()
+ */
+void nbcon_kdb_release(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	if (!nbcon_context_exit_unsafe(ctxt))
+		return;
+
+	nbcon_context_release(ctxt);
+
+	/*
+	 * Flush any new printk() messages added when the console was blocked.
+	 * Only the console used by the given write context was blocked.
+	 * The console was locked only when the write_atomic() callback
+	 * was usable.
+	 */
+	__nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb), false);
+}
-- 
cgit v1.2.3


From 286b113d70007e932d18aa0acfce1a3f5b25d8d1 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 16 Oct 2025 11:47:56 -0300
Subject: printk: nbcon: Allow KDB to acquire the NBCON context

KDB can interrupt any console to execute the "mirrored printing" at any
time, so add an exception to nbcon_context_try_acquire_direct to allow
acquiring the context if the current CPU is the same as kdb_printf_cpu.

This change will be necessary for the next patch, which fixes
kdb_msg_write to work with NBCON consoles by calling ->write_atomic on
such consoles. But to print it first needs to acquire the ownership of
the console, so nbcon_context_try_acquire_direct is fixed here.

Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-3-866aac60a80e@suse.com
[pmladek@suse.com: Fix compilation with !CONFIG_KGDB_KDB.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/kdb.h   | 16 ++++++++++++++++
 kernel/printk/nbcon.c |  6 +++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index ecbf819deeca..741c58e86431 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -14,6 +14,7 @@
  */
 
 #include <linux/list.h>
+#include <linux/smp.h>
 
 /* Shifted versions of the command enable bits are be used if the command
  * has no arguments (see kdb_check_flags). This allows commands, such as
@@ -207,11 +208,26 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos)
 /* Dynamic kdb shell command registration */
 extern int kdb_register(kdbtab_t *cmd);
 extern void kdb_unregister(kdbtab_t *cmd);
+
+/* Return true when KDB has locked for printing a message on this CPU. */
+static inline
+bool kdb_printf_on_this_cpu(void)
+{
+	/*
+	 * We can use raw_smp_processor_id() here because the task could
+	 * not get migrated when KDB has locked for printing on this CPU.
+	 */
+	return unlikely(READ_ONCE(kdb_printf_cpu) == raw_smp_processor_id());
+}
+
 #else /* ! CONFIG_KGDB_KDB */
 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
 static inline void kdb_init(int level) {}
 static inline int kdb_register(kdbtab_t *cmd) { return 0; }
 static inline void kdb_unregister(kdbtab_t *cmd) {}
+
+static inline bool kdb_printf_on_this_cpu(void) { return false; }
+
 #endif	/* CONFIG_KGDB_KDB */
 enum {
 	KDB_NOT_INITIALIZED,
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index e1bf5409cb6b..5be018493909 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -10,6 +10,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/irqflags.h>
+#include <linux/kdb.h>
 #include <linux/kthread.h>
 #include <linux/minmax.h>
 #include <linux/panic.h>
@@ -249,13 +250,16 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
 		 * since all non-panic CPUs are stopped during panic(), it
 		 * is safer to have them avoid gaining console ownership.
 		 *
-		 * If this acquire is a reacquire (and an unsafe takeover
+		 * One exception is when kdb has locked for printing on this CPU.
+		 *
+		 * Second exception is a reacquire (and an unsafe takeover
 		 * has not previously occurred) then it is allowed to attempt
 		 * a direct acquire in panic. This gives console drivers an
 		 * opportunity to perform any necessary cleanup if they were
 		 * interrupted by the panic CPU while printing.
 		 */
 		if (panic_on_other_cpu() &&
+		    !kdb_printf_on_this_cpu() &&
 		    (!is_reacquire || cur->unsafe_takeover)) {
 			return -EPERM;
 		}
-- 
cgit v1.2.3


From 4349cf0df34f37d2470d246bc9be8d9836dfa49e Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 16 Oct 2025 11:47:57 -0300
Subject: printk: nbcon: Export nbcon_write_context_set_buf

This function will be used in the next patch to allow a driver to set
both the message and message length of a nbcon_write_context. This is
necessary because the function also initializes the ->unsafe_takeover
struct member. By using this helper we ensure that the struct is
initialized correctly.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-4-866aac60a80e@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 4 ++++
 kernel/printk/nbcon.c   | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/console.h b/include/linux/console.h
index 9406342b27db..4585eb8e109e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -603,6 +603,8 @@ static inline bool console_is_registered(const struct console *con)
 extern void nbcon_cpu_emergency_enter(void);
 extern void nbcon_cpu_emergency_exit(void);
 extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
+extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+					char *buf, unsigned int len);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
 extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
@@ -655,6 +657,8 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_
 static inline void nbcon_cpu_emergency_enter(void) { }
 static inline void nbcon_cpu_emergency_exit(void) { }
 static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
+static inline void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+					       char *buf, unsigned int len) { }
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 5be018493909..fdd1cbebe77d 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -854,8 +854,8 @@ out:
 	return nbcon_context_can_proceed(ctxt, &cur);
 }
 
-static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
-					char *buf, unsigned int len)
+void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+				 char *buf, unsigned int len)
 {
 	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
 	struct console *con = ctxt->console;
-- 
cgit v1.2.3


From 62627bf0cadf6eae87d92fecf604c42160fe16ef Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 16 Oct 2025 11:47:58 -0300
Subject: kdb: Adapt kdb_msg_write to work with NBCON consoles

Function kdb_msg_write was calling con->write for any found console,
but it won't work on NBCON consoles. In this case we should acquire the
ownership of the console using NBCON_PRIO_EMERGENCY, since printing
kdb messages should only be interrupted by a panic.

At this point, the console is required to use the atomic callback. The
console is skipped if the write_atomic callback is not set or if the
context could not be acquired. The validation of NBCON is done by the
console_is_usable helper. The context is released right after
write_atomic finishes.

The oops_in_progress handling is only needed in the legacy consoles,
so it was moved around the con->write callback.

Suggested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-5-866aac60a80e@suse.com
[pmladek@suse.com: Fixed compilation with !CONFIG_PRINTK.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h   |  2 +-
 kernel/debug/kdb/kdb_io.c | 47 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/console.h b/include/linux/console.h
index 4585eb8e109e..d17f1f525bec 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -664,7 +664,7 @@ static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return
 static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
 static inline bool nbcon_kdb_try_acquire(struct console *con,
 					 struct nbcon_write_context *wctxt) { return false; }
-static inline void nbcon_kdb_release(struct console *con) { }
+static inline void nbcon_kdb_release(struct nbcon_write_context *wctxt) { }
 static inline bool console_is_usable(struct console *con, short flags,
 				     bool use_atomic) { return false; }
 #endif
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index b12b9db75c1d..61c1690058ed 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len)
 	 */
 	cookie = console_srcu_read_lock();
 	for_each_console_srcu(c) {
-		if (!(console_srcu_read_flags(c) & CON_ENABLED))
+		short flags = console_srcu_read_flags(c);
+
+		if (!console_is_usable(c, flags, true))
 			continue;
 		if (c == dbg_io_ops->cons)
 			continue;
-		if (!c->write)
-			continue;
-		/*
-		 * Set oops_in_progress to encourage the console drivers to
-		 * disregard their internal spin locks: in the current calling
-		 * context the risk of deadlock is a bigger problem than risks
-		 * due to re-entering the console driver. We operate directly on
-		 * oops_in_progress rather than using bust_spinlocks() because
-		 * the calls bust_spinlocks() makes on exit are not appropriate
-		 * for this calling context.
-		 */
-		++oops_in_progress;
-		c->write(c, msg, msg_len);
-		--oops_in_progress;
+
+		if (flags & CON_NBCON) {
+			struct nbcon_write_context wctxt = { };
+
+			/*
+			 * Do not continue if the console is NBCON and the context
+			 * can't be acquired.
+			 */
+			if (!nbcon_kdb_try_acquire(c, &wctxt))
+				continue;
+
+			nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len);
+
+			c->write_atomic(c, &wctxt);
+			nbcon_kdb_release(&wctxt);
+		} else {
+			/*
+			 * Set oops_in_progress to encourage the console drivers to
+			 * disregard their internal spin locks: in the current calling
+			 * context the risk of deadlock is a bigger problem than risks
+			 * due to re-entering the console driver. We operate directly on
+			 * oops_in_progress rather than using bust_spinlocks() because
+			 * the calls bust_spinlocks() makes on exit are not appropriate
+			 * for this calling context.
+			 */
+			++oops_in_progress;
+			c->write(c, msg, msg_len);
+			--oops_in_progress;
+		}
 		touch_nmi_watchdog();
 	}
 	console_srcu_read_unlock(cookie);
-- 
cgit v1.2.3


From 245f14f5fe283c782b16143280f283bee29dbb5f Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Tue, 30 Sep 2025 12:30:55 +0800
Subject: interconnect: Optimize kbps_to_icc() macro

The current expansion of kbps_to_icc() introduces unnecessary logic
when compiled from a general expression. Rewriting it allows compilers
to emit shorter and more efficient code across architectures.

For example, with gcc -O2:

arm64:

old:
        tst     x0, 7
        add     w1, w0, 7
        cset    w2, ne
        cmp     w0, 0
        csel    w0, w1, w0, lt
        add     w0, w2, w0, asr 3

new:
        add     w1, w0, 14
        adds    w0, w0, 7
        csel    w0, w1, w0, mi
        asr     w0, w0, 3

x86-64:

old:
        xor     eax, eax
        test    dil, 7
        lea     edx, [rdi+7]
        setne   al
        test    edi, edi
        cmovns  edx, edi
        sar     edx, 3
        add     eax, edx

new:
        lea     eax, [rdi+14]
        add     edi, 7
        cmovns  eax, edi
        sar     eax, 3

In both cases the old form relies on extra test and compare
instructions (tst, test, cmp) combined with conditional moves or sets,
while the new form uses fewer instructions by folding the addition and
flag update together (adds on arm64, add on x86).

This reduces the instruction sequence, prevents multiple evaluations of
x when it is an expression or a function call, and keeps the macro
simpler.

Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Link: https://lore.kernel.org/r/20250930043055.2200322-1-visitorckw@gmail.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 include/linux/interconnect.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h
index e4b8808823ad..4b12821528a6 100644
--- a/include/linux/interconnect.h
+++ b/include/linux/interconnect.h
@@ -16,7 +16,7 @@
 #define MBps_to_icc(x)	((x) * 1000)
 #define GBps_to_icc(x)	((x) * 1000 * 1000)
 #define bps_to_icc(x)	(1)
-#define kbps_to_icc(x)	((x) / 8 + ((x) % 8 ? 1 : 0))
+#define kbps_to_icc(x)	(((x) + 7) / 8)
 #define Mbps_to_icc(x)	((x) * 1000 / 8)
 #define Gbps_to_icc(x)	((x) * 1000 * 1000 / 8)
 
-- 
cgit v1.2.3


From 70e0a80a1f3580ccf5bc1f34dbb433c67d9d8d00 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 24 Oct 2025 19:06:51 +0100
Subject: treewide: Remove in_irq()

This old alias for in_hardirq() has been marked as deprecated since
2020; remove the stragglers.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024180654.1691095-1-willy@infradead.org
---
 drivers/bus/fsl-mc/mc-sys.c | 2 +-
 drivers/md/dm-vdo/logger.c  | 2 +-
 include/linux/lockdep.h     | 2 +-
 include/linux/preempt.h     | 2 --
 kernel/bpf/syscall.c        | 4 ++--
 kernel/time/timer.c         | 2 +-
 lib/locking-selftest.c      | 4 ++--
 7 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/fsl-mc/mc-sys.c b/drivers/bus/fsl-mc/mc-sys.c
index b22c59d57c8f..31037f41893e 100644
--- a/drivers/bus/fsl-mc/mc-sys.c
+++ b/drivers/bus/fsl-mc/mc-sys.c
@@ -248,7 +248,7 @@ int mc_send_command(struct fsl_mc_io *mc_io, struct fsl_mc_command *cmd)
 	enum mc_cmd_status status;
 	unsigned long irq_flags = 0;
 
-	if (in_irq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL))
+	if (in_hardirq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL))
 		return -EINVAL;
 
 	if (mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL)
diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c
index 3f7dc2cb6b98..76a987ccf926 100644
--- a/drivers/md/dm-vdo/logger.c
+++ b/drivers/md/dm-vdo/logger.c
@@ -34,7 +34,7 @@ static const char *get_current_interrupt_type(void)
 	if (in_nmi())
 		return "NMI";
 
-	if (in_irq())
+	if (in_hardirq())
 		return "HI";
 
 	if (in_softirq())
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 67964dc4db95..dd634103b014 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -616,7 +616,7 @@ do {									\
 #define lockdep_assert_in_softirq()					\
 do {									\
 	WARN_ON_ONCE(__lockdep_enabled			&&		\
-		     (!in_softirq() || in_irq() || in_nmi()));		\
+		     (!in_softirq() || in_hardirq() || in_nmi()));	\
 } while (0)
 
 extern void lockdep_assert_in_softirq_func(void);
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 102202185d7a..d964f965c8ff 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void)
 
 /*
  * The following macros are deprecated and should not be used in new code:
- * in_irq()       - Obsolete version of in_hardirq()
  * in_softirq()   - We have BH disabled, or are processing softirqs
  * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
  */
-#define in_irq()		(hardirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a129746bd6c..6cde6a46babf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2330,7 +2330,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
 		return;
 	if (audit_enabled == AUDIT_OFF)
 		return;
-	if (!in_irq() && !irqs_disabled())
+	if (!in_hardirq() && !irqs_disabled())
 		ctx = audit_context();
 	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
 	if (unlikely(!ab))
@@ -2428,7 +2428,7 @@ static void __bpf_prog_put(struct bpf_prog *prog)
 	struct bpf_prog_aux *aux = prog->aux;
 
 	if (atomic64_dec_and_test(&aux->refcnt)) {
-		if (in_irq() || irqs_disabled()) {
+		if (in_hardirq() || irqs_disabled()) {
 			INIT_WORK(&aux->work, bpf_prog_put_deferred);
 			schedule_work(&aux->work);
 		} else {
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 553fa469d7cc..282a8e5c05f8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2472,7 +2472,7 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_sched_clock_irq(user_tick);
 #ifdef CONFIG_IRQ_WORK
-	if (in_irq())
+	if (in_hardirq())
 		irq_work_tick();
 #endif
 	sched_tick();
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index ed99344317f5..d939403331b5 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -202,7 +202,7 @@ static void init_shared_classes(void)
 	local_irq_disable();			\
 	__irq_enter();				\
 	lockdep_hardirq_threaded();		\
-	WARN_ON(!in_irq());
+	WARN_ON(!in_hardirq());
 
 #define HARDIRQ_EXIT()				\
 	__irq_exit();				\
@@ -2512,7 +2512,7 @@ DEFINE_LOCK_GUARD_0(NOTTHREADED_HARDIRQ,
 	do {
 		local_irq_disable();
 		__irq_enter();
-		WARN_ON(!in_irq());
+		WARN_ON(!in_hardirq());
 	} while(0), HARDIRQ_EXIT())
 DEFINE_LOCK_GUARD_0(SOFTIRQ, SOFTIRQ_ENTER(), SOFTIRQ_EXIT())
 
-- 
cgit v1.2.3


From e30f8e61e2518a837837daa26cda3c8cc30f3226 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 21 Oct 2025 20:43:40 -0400
Subject: tracing: Add a tracepoint verification check at build time

If a tracepoint is defined via DECLARE_TRACE() or TRACE_EVENT() but never
called (via the trace_<tracepoint>() function), its metadata is still
around in memory and not discarded.

When created via TRACE_EVENT() the situation is worse because the
TRACE_EVENT() creates metadata that can be around 5k per trace event.
Having unused trace events wastes several thousand bytes of memory.

Add a verifier that injects the name of each tracepoint that is called, as
a string, into the discarded section "__tracepoint_check". Every builtin
tracepoint has its name saved in the in-memory section
"__tracepoints_strings"; if the tracepoint is used, its name will also be
in the "__tracepoint_check" section.

Add a new program that is run on build called tracepoint-update. This is
executed on the vmlinux.o before the __tracepoint_check section is
discarded (the section is discarded before vmlinux is created). This
program will create an array of each string in the __tracepoint_check
section and then sort it. Then it will walk the strings in the
__tracepoints_strings section and do a binary search to check if its name
is in the __tracepoint_check section. If it is not, then it is unused and
a warning is printed.

Note, this currently only handles tracepoints that are builtin and not in
modules.

Enabling this currently with a given config produces:

warning: tracepoint 'sched_move_numa' is unused.
warning: tracepoint 'sched_stick_numa' is unused.
warning: tracepoint 'sched_swap_numa' is unused.
warning: tracepoint 'pelt_hw_tp' is unused.
warning: tracepoint 'pelt_irq_tp' is unused.
warning: tracepoint 'rcu_preempt_task' is unused.
warning: tracepoint 'rcu_unlock_preempted_task' is unused.
warning: tracepoint 'xdp_bulk_tx' is unused.
warning: tracepoint 'xdp_redirect_map' is unused.
warning: tracepoint 'xdp_redirect_map_err' is unused.
warning: tracepoint 'vma_mas_szero' is unused.
warning: tracepoint 'vma_store' is unused.
warning: tracepoint 'hugepage_set_pmd' is unused.
warning: tracepoint 'hugepage_set_pud' is unused.
warning: tracepoint 'hugepage_update_pmd' is unused.
warning: tracepoint 'hugepage_update_pud' is unused.
warning: tracepoint 'block_rq_remap' is unused.
warning: tracepoint 'xhci_dbc_handle_event' is unused.
warning: tracepoint 'xhci_dbc_handle_transfer' is unused.
warning: tracepoint 'xhci_dbc_gadget_ep_queue' is unused.
warning: tracepoint 'xhci_dbc_alloc_request' is unused.
warning: tracepoint 'xhci_dbc_free_request' is unused.
warning: tracepoint 'xhci_dbc_queue_request' is unused.
warning: tracepoint 'xhci_dbc_giveback_request' is unused.
warning: tracepoint 'tcp_ao_wrong_maclen' is unused.
warning: tracepoint 'tcp_ao_mismatch' is unused.
warning: tracepoint 'tcp_ao_key_not_found' is unused.
warning: tracepoint 'tcp_ao_rnext_request' is unused.
warning: tracepoint 'tcp_ao_synack_no_key' is unused.
warning: tracepoint 'tcp_ao_snd_sne_update' is unused.
warning: tracepoint 'tcp_ao_rcv_sne_update' is unused.

Some of the above are totally unused, but others are not used due to their
"trace_" functions being inside configs, in which case, the defined
tracepoints should also be inside those same configs. Others are
architecture specific but defined in generic code, where they should
either be moved to the architecture or be surrounded by #ifdef for the
architectures they are for.

This tool could be updated to process modules in the future.

I'd like to thank Mathieu Desnoyers for suggesting using strings instead
of pointers, as using pointers in vmlinux.o required handling relocations
and it required implementing almost a full feature linker to do so.

To enable this check, run the build with: make UT=1

Note, when all the existing unused tracepoints are removed from the build,
the "UT=1" will be removed and this will always be enabled when
tracepoints are configured to warn on any new tracepoints. The reason this
isn't always enabled now is because it will introduce a lot of warnings
for the current unused tracepoints, and all bisects would end at this
commit for those warnings.

Link: https://lore.kernel.org/all/20250528114549.4d8a5e03@gandalf.local.home/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004452.920728129@kernel.org
Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> # for using strings instead of pointers
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Makefile                          |  21 ++++
 include/asm-generic/vmlinux.lds.h |   1 +
 include/linux/tracepoint.h        |  11 ++
 scripts/Makefile                  |   3 +
 scripts/link-vmlinux.sh           |   7 ++
 scripts/tracepoint-update.c       | 232 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 275 insertions(+)
 create mode 100644 scripts/tracepoint-update.c

(limited to 'include')

diff --git a/Makefile b/Makefile
index d14824792227..9823c20c4278 100644
--- a/Makefile
+++ b/Makefile
@@ -810,6 +810,25 @@ ifdef CONFIG_FUNCTION_TRACER
   CC_FLAGS_FTRACE := -pg
 endif
 
+ifdef CONFIG_TRACEPOINTS
+# To check for unused tracepoints (tracepoints that are defined but never
+# called), run with:
+#
+# make UT=1
+#
+# Each unused tracepoint can take up to 5KB of memory in the running kernel.
+# It is best to remove any that are not used.
+#
+# This command line option will be removed when all current unused
+# tracepoints are removed.
+
+ifeq ("$(origin UT)", "command line")
+  WARN_ON_UNUSED_TRACEPOINTS := $(UT)
+endif
+endif # CONFIG_TRACEPOINTS
+
+export WARN_ON_UNUSED_TRACEPOINTS
+
 include $(srctree)/arch/$(SRCARCH)/Makefile
 
 ifdef need-config
@@ -1772,6 +1791,8 @@ help:
 	@echo  '		c: extra checks in the configuration stage (Kconfig)'
 	@echo  '		e: warnings are being treated as errors'
 	@echo  '		Multiple levels can be combined with W=12 or W=123'
+	@echo  '  make UT=1   [targets] Warn if a tracepoint is defined but not used.'
+	@echo  '          [ This will be removed when all current unused tracepoints are eliminated. ]'
 	@$(if $(dtstree), \
 		echo '  make CHECK_DTBS=1 [targets] Check all generated dtb files against schema'; \
 		echo '         This can be applied both to "dtbs" and to individual "foo.dtb" targets' ; \
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a9a2e732a65..c510fb097a8c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -1048,6 +1048,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 	*(.no_trim_symbol)						\
 	/* ld.bfd warns about .gnu.version* even when not emitted */	\
 	*(.gnu.version*)						\
+	*(__tracepoint_check)						\
 
 #define DISCARDS							\
 	/DISCARD/ : {							\
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 826ce3f8e1f8..1e53d3626c78 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -221,6 +221,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		__do_trace_##name(args);				\
 	}
 
+/*
+ * When a tracepoint is used, its name is added to the __tracepoint_check
+ * section. This section is only used at build time to make sure all
+ * defined tracepoints are used. It is discarded after the build.
+ */
+# define TRACEPOINT_CHECK(name)						\
+	static const char __used __section("__tracepoint_check") __trace_check[] = \
+		#name;
+
 /*
  * Make sure the alignment of the structure in the __tracepoints section will
  * not add unwanted padding between the beginning of the section and the
@@ -270,6 +279,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
 	static inline void __do_trace_##name(proto)			\
 	{								\
+		TRACEPOINT_CHECK(name)					\
 		if (cond) {						\
 			guard(preempt_notrace)();			\
 			__DO_TRACE_CALL(name, TP_ARGS(args));		\
@@ -289,6 +299,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
 	static inline void __do_trace_##name(proto)			\
 	{								\
+		TRACEPOINT_CHECK(name)					\
 		guard(rcu_tasks_trace)();				\
 		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 	}								\
diff --git a/scripts/Makefile b/scripts/Makefile
index f19624b3ed92..0941e5ce7b57 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -11,8 +11,10 @@ hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT)		+= sign-file
 hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE)	+= insert-sys-cert
 hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS)		+= rustdoc_test_builder
 hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS)		+= rustdoc_test_gen
+hostprogs-always-$(CONFIG_TRACEPOINTS)			+= tracepoint-update
 
 sorttable-objs := sorttable.o elf-parse.o
+tracepoint-update-objs := tracepoint-update.o elf-parse.o
 
 ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),)
 always-$(CONFIG_RUST)					+= target.json
@@ -27,6 +29,7 @@ generate_rust_target-rust := y
 rustdoc_test_builder-rust := y
 rustdoc_test_gen-rust := y
 
+HOSTCFLAGS_tracepoint-update.o = -I$(srctree)/tools/include
 HOSTCFLAGS_elf-parse.o = -I$(srctree)/tools/include
 HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include
 HOSTLDLIBS_sorttable = -lpthread
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 433849ff7529..d304029fa6da 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -208,6 +208,13 @@ kallsymso=
 strip_debug=
 generate_map=
 
+# Use "make UT=1" to trigger warnings on unused tracepoints
+case "${WARN_ON_UNUSED_TRACEPOINTS}" in
+*1*)
+	${objtree}/scripts/tracepoint-update vmlinux.o
+	;;
+esac
+
 if is_enabled CONFIG_KALLSYMS; then
 	true > .tmp_vmlinux0.syms
 	kallsyms .tmp_vmlinux0.syms .tmp_vmlinux0.kallsyms
diff --git a/scripts/tracepoint-update.c b/scripts/tracepoint-update.c
new file mode 100644
index 000000000000..6ec30f39d0ad
--- /dev/null
+++ b/scripts/tracepoint-update.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include "elf-parse.h"
+
+static Elf_Shdr *check_data_sec;
+static Elf_Shdr *tracepoint_data_sec;
+
+static inline void *get_index(void *start, int entsize, int index)
+{
+	return start + (entsize * index);
+}
+
+static int compare_strings(const void *a, const void *b)
+{
+	const char *av = *(const char **)a;
+	const char *bv = *(const char **)b;
+
+	return strcmp(av, bv);
+}
+
+struct elf_tracepoint {
+	Elf_Ehdr *ehdr;
+	const char **array;
+	int count;
+};
+
+#define REALLOC_SIZE (1 << 10)
+#define REALLOC_MASK (REALLOC_SIZE - 1)
+
+static int add_string(const char *str, const char ***vals, int *count)
+{
+	const char **array = *vals;
+
+	if (!(*count & REALLOC_MASK)) {
+		int size = (*count) + REALLOC_SIZE;
+
+		array = realloc(array, sizeof(char *) * size);
+		if (!array) {
+			fprintf(stderr, "Failed memory allocation\n");
+			return -1;
+		}
+		*vals = array;
+	}
+
+	array[(*count)++] = str;
+	return 0;
+}
+
+/**
+ * for_each_shdr_str - iterator that reads strings that are in an ELF section.
+ * @len: "int" to hold the length of the current string
+ * @ehdr: A pointer to the ehdr of the ELF file
+ * @sec: The section that has the strings to iterate on
+ *
+ * This is a for loop that iterates over all the nul terminated strings in
+ * a given ELF section. It declares "str" (the current string) and "end";
+ * @len is set to strlen(str). The bounds check runs before strlen() so
+ * strlen() is never called on a pointer at or past the end of the section.
+ */
+#define for_each_shdr_str(len, ehdr, sec)				\
+	for (const char *str = (void *)(ehdr) + shdr_offset(sec),	\
+			*end = str + shdr_size(sec);			\
+	     str < end && ((len) = strlen(str), 1);			\
+	     str += (len) + 1)
+
+
+static void make_trace_array(struct elf_tracepoint *etrace)
+{
+	Elf_Ehdr *ehdr = etrace->ehdr;
+	const char **vals = NULL;
+	int count = 0;
+	int len;
+
+	etrace->array = NULL;
+
+	/*
+	 * The __tracepoint_check section is filled with strings of the
+	 * names of tracepoints (in tracepoint_strings). Create an array
+	 * that points to each string and then sort the array.
+	 */
+	for_each_shdr_str(len, ehdr, check_data_sec) {
+		if (!len)
+			continue;
+		if (add_string(str, &vals, &count) < 0)
+			return;
+	}
+
+	/* If the __tracepoint_check section has no names, there's nothing to do */
+	if (!count)
+		return;
+
+	qsort(vals, count, sizeof(char *), compare_strings);
+
+	etrace->array = vals;
+	etrace->count = count;
+}
+
+static int find_event(const char *str, void *array, size_t size)
+{
+	return bsearch(&str, array, size, sizeof(char *), compare_strings) != NULL;
+}
+
+static void check_tracepoints(struct elf_tracepoint *etrace)
+{
+	Elf_Ehdr *ehdr = etrace->ehdr;
+	int len;
+
+	if (!etrace->array)
+		return;
+
+	/*
+	 * The __tracepoints_strings section holds all the names of the
+	 * defined tracepoints. If any of them are not in the
+	 * __tracepoint_check section it means they are not used.
+	 */
+	for_each_shdr_str(len, ehdr, tracepoint_data_sec) {
+		if (!len)
+			continue;
+		if (!find_event(str, etrace->array, etrace->count)) {
+			fprintf(stderr, "warning: tracepoint '%s' is unused.\n", str);
+		}
+	}
+
+	free(etrace->array);
+}
+
+static void *tracepoint_check(struct elf_tracepoint *etrace)
+{
+	make_trace_array(etrace);
+	check_tracepoints(etrace);
+
+	return NULL;
+}
+
+/*
+ * Find the __tracepoint_check and __tracepoints_strings sections of the ELF
+ * image at @addr and check them. Returns 0, or -1 if a section is missing.
+ */
+static int process_tracepoints(void *addr, char const *const fname)
+{
+	Elf_Ehdr *ehdr = addr;
+	struct elf_tracepoint etrace = { .ehdr = ehdr };
+	Elf_Shdr *shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr));
+	int shentsize = ehdr_shentsize(ehdr);
+	Elf_Shdr *string_sec;
+	const char *secstrings;
+	unsigned int shnum, shstrndx;
+	int done = 2;
+
+	/* The section pointers are file-scope: forget any from a previous file */
+	check_data_sec = tracepoint_data_sec = NULL;
+
+	shstrndx = ehdr_shstrndx(ehdr);
+	if (shstrndx == SHN_XINDEX)
+		shstrndx = shdr_link(shdr_start);
+	string_sec = get_index(shdr_start, shentsize, shstrndx);
+	secstrings = (const char *)ehdr + shdr_offset(string_sec);
+
+	shnum = ehdr_shnum(ehdr);
+	if (shnum == SHN_UNDEF)
+		shnum = shdr_size(shdr_start);
+
+	for (unsigned int i = 0; done && i < shnum; i++) {
+		Elf_Shdr *shdr = get_index(shdr_start, shentsize, i);
+		int idx = shdr_name(shdr);
+
+		/* locate the __tracepoint_check section in vmlinux */
+		if (!strcmp(secstrings + idx, "__tracepoint_check")) {
+			check_data_sec = shdr;
+			done--;
+		}
+
+		/* locate the __tracepoints_strings section in vmlinux */
+		if (!strcmp(secstrings + idx, "__tracepoints_strings")) {
+			tracepoint_data_sec = shdr;
+			done--;
+		}
+	}
+
+	if (!check_data_sec) {
+		fprintf(stderr,	"no __tracepoint_check in file: %s\n", fname);
+		return -1;
+	}
+
+	if (!tracepoint_data_sec) {
+		fprintf(stderr,	"no __tracepoints_strings in file: %s\n", fname);
+		return -1;
+	}
+
+	tracepoint_check(&etrace);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	int n_error = 0;
+	size_t size = 0;
+	void *addr = NULL;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: tracepoint-update vmlinux...\n");
+		return 0;
+	}
+
+	/* Process each file in turn, allowing deep failure. */
+	for (int i = 1; i < argc; i++) {
+		addr = elf_map(argv[i], &size, 1 << ET_REL);
+		if (!addr) {
+			++n_error;
+			continue;
+		}
+
+		if (process_tracepoints(addr, argv[i]))
+			++n_error;
+
+		elf_unmap(addr, size);
+	}
+
+	return !!n_error;
+}
-- 
cgit v1.2.3


From faf938153cad98d97f60ac835ead1db74961507e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 21 Oct 2025 20:43:41 -0400
Subject: tracepoint: Do not warn for unused event that is exported

There are a few generic events that may only be used by modules. They are
defined and then set with EXPORT_TRACEPOINT*(). Mark events that are
exported as being used, even though they still waste memory in the kernel
proper.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004453.089254920@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 1e53d3626c78..8a56f3278b1b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -227,8 +227,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * defined tracepoints are used. It is discarded after the build.
  */
 # define TRACEPOINT_CHECK(name)						\
-	static const char __used __section("__tracepoint_check") __trace_check[] = \
-		#name;
+	static const char __used __section("__tracepoint_check")	\
+	__trace_check_##name[] = #name;
 
 /*
  * Make sure the alignment of the structure in the __tracepoints section will
@@ -382,10 +382,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	__DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args));
 
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
+	TRACEPOINT_CHECK(name)						\
 	EXPORT_SYMBOL_GPL(__tracepoint_##name);				\
 	EXPORT_SYMBOL_GPL(__traceiter_##name);				\
 	EXPORT_STATIC_CALL_GPL(tp_func_##name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)					\
+	TRACEPOINT_CHECK(name)						\
 	EXPORT_SYMBOL(__tracepoint_##name);				\
 	EXPORT_SYMBOL(__traceiter_##name);				\
 	EXPORT_STATIC_CALL(tp_func_##name)
-- 
cgit v1.2.3


From 35d7c70870338aa6a367b9e4ed528914320b0be0 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 22 Oct 2025 05:39:46 +0000
Subject: neighbour: Annotate access to neigh_parms fields.

NEIGH_VAR() is read locklessly in the fast path, and IPv6 ndisc uses
NEIGH_VAR_SET() locklessly.

The next patch will convert neightbl_dump_info() to RCU.

Let's annotate accesses to neigh_parms with READ_ONCE() and WRITE_ONCE().

Note that ndisc_ifinfo_sysctl_change() uses &NEIGH_VAR() and we cannot
use '&' with READ_ONCE(), so NEIGH_VAR_PTR() is introduced.

Note also that NEIGH_VAR_INIT() does not need WRITE_ONCE() as it is before
parms is published.  Also, the only user hippi_neigh_setup_dev() is no
longer called since commit e3804cbebb67 ("net: remove COMPAT_NET_DEV_OPS"),
which looks wrong, but probably no one uses HIPPI and RoadRunner.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h | 15 ++++++++++++---
 net/core/neighbour.c    | 17 ++++++-----------
 net/ipv6/ndisc.c        |  8 ++++----
 3 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 4a30bd458c5a..998ff9eccebb 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -92,15 +92,17 @@ struct neigh_parms {
 static inline void neigh_var_set(struct neigh_parms *p, int index, int val)
 {
 	set_bit(index, p->data_state);
-	p->data[index] = val;
+	WRITE_ONCE(p->data[index], val);
 }
 
-#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])
+#define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])
+#define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr))
+#define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr)))
 
 /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used.
  * In other cases, NEIGH_VAR_SET should be used.
  */
-#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val)
+#define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val)
 #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val)
 
 static inline void neigh_parms_data_state_setall(struct neigh_parms *p)
@@ -378,6 +380,13 @@ struct net *neigh_parms_net(const struct neigh_parms *parms)
 
 unsigned long neigh_rand_reach_time(unsigned long base);
 
+static inline void neigh_set_reach_time(struct neigh_parms *p)
+{
+	unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME);
+
+	WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base));
+}
+
 void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 		    struct sk_buff *skb);
 struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 98428f60731b..5bbebbfcba43 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -995,8 +995,7 @@ static void neigh_periodic_work(struct work_struct *work)
 
 		WRITE_ONCE(tbl->last_rand, jiffies);
 		list_for_each_entry(p, &tbl->parms_list, list)
-			p->reachable_time =
-				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+			neigh_set_reach_time(p);
 	}
 
 	if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
@@ -1749,8 +1748,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 	if (p) {
 		p->tbl		  = tbl;
 		refcount_set(&p->refcnt, 1);
-		p->reachable_time =
-				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+		neigh_set_reach_time(p);
 		p->qlen = 0;
 		netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
 		p->dev = dev;
@@ -1810,8 +1808,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	list_add(&tbl->parms.list, &tbl->parms_list);
 	write_pnet(&tbl->parms.net, &init_net);
 	refcount_set(&tbl->parms.refcnt, 1);
-	tbl->parms.reachable_time =
-			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+	neigh_set_reach_time(&tbl->parms);
 	tbl->parms.qlen = 0;
 
 	tbl->stats = alloc_percpu(struct neigh_statistics);
@@ -2194,7 +2191,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 			NEIGH_VAR(parms, MCAST_PROBES)) ||
 	    nla_put_u32(skb, NDTPA_MCAST_REPROBES,
 			NEIGH_VAR(parms, MCAST_REPROBES)) ||
-	    nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+	    nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time),
 			  NDTPA_PAD) ||
 	    nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
 			  NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
@@ -2475,8 +2472,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
 				 * only be effective after the next time neigh_periodic_work
 				 * decides to recompute it (can be multiple minutes)
 				 */
-				p->reachable_time =
-					neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+				neigh_set_reach_time(p);
 				break;
 			case NDTPA_GC_STALETIME:
 				NEIGH_VAR_SET(p, GC_STALETIME,
@@ -3721,8 +3717,7 @@ static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write
 		 * only be effective after the next time neigh_periodic_work
 		 * decides to recompute it
 		 */
-		p->reachable_time =
-			neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+		neigh_set_reach_time(p);
 	}
 	return ret;
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f427e41e9c49..59d17b6f06bf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1449,7 +1449,7 @@ skip_defrtr:
 					      BASE_REACHABLE_TIME, rtime);
 				NEIGH_VAR_SET(in6_dev->nd_parms,
 					      GC_STALETIME, 3 * rtime);
-				in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
+				neigh_set_reach_time(in6_dev->nd_parms);
 				in6_dev->tstamp = jiffies;
 				send_ifinfo_notify = true;
 			}
@@ -1948,9 +1948,9 @@ int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buf
 		ret = -1;
 
 	if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
-		if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
-			idev->nd_parms->reachable_time =
-					neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
+		if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME))
+			neigh_set_reach_time(idev->nd_parms);
+
 		WRITE_ONCE(idev->tstamp, jiffies);
 		inet6_ifinfo_notify(RTM_NEWLINK, idev);
 		in6_dev_put(idev);
-- 
cgit v1.2.3


From 3064d0fe02af23a3956d2b690461abb44da88cf4 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 22 Oct 2025 05:39:49 +0000
Subject: neighbour: Convert rwlock of struct neigh_table to spinlock.

Only neigh_for_each() and neigh_seq_start/stop() are on the
reader side of neigh_table.lock.

Let's convert rwlock to the plain spinlock.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  2 +-
 net/atm/clip.c          |  4 +--
 net/core/neighbour.c    | 68 +++++++++++++++++++++++++------------------------
 net/ipv4/arp.c          |  4 +--
 4 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 998ff9eccebb..2dfee6d4258a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -238,7 +238,7 @@ struct neigh_table {
 	atomic_t		gc_entries;
 	struct list_head	gc_list;
 	struct list_head	managed_list;
-	rwlock_t		lock;
+	spinlock_t		lock;
 	unsigned long		last_rand;
 	struct neigh_statistics	__percpu *stats;
 	struct neigh_hash_table __rcu *nht;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index f7a5565e794e..8f152e5fa659 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -168,10 +168,10 @@ static int neigh_check_cb(struct neighbour *n)
 
 static void idle_timer_check(struct timer_list *unused)
 {
-	write_lock(&arp_tbl.lock);
+	spin_lock(&arp_tbl.lock);
 	__neigh_for_each_release(&arp_tbl, neigh_check_cb);
 	mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
-	write_unlock(&arp_tbl.lock);
+	spin_unlock(&arp_tbl.lock);
 }
 
 static int clip_arp_rcv(struct sk_buff *skb)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6d2164b4d999..96a3b1a93252 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -81,7 +81,7 @@ static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family
 }
 
 /*
-   Neighbour hash table buckets are protected with rwlock tbl->lock.
+   Neighbour hash table buckets are protected with tbl->lock.
 
    - All the scans/updates to hash buckets MUST be made under this lock.
    - NOTHING clever should be made under this lock: no callbacks
@@ -149,7 +149,7 @@ static void neigh_update_gc_list(struct neighbour *n)
 {
 	bool on_gc_list, exempt_from_gc;
 
-	write_lock_bh(&n->tbl->lock);
+	spin_lock_bh(&n->tbl->lock);
 	write_lock(&n->lock);
 	if (n->dead)
 		goto out;
@@ -172,14 +172,14 @@ static void neigh_update_gc_list(struct neighbour *n)
 	}
 out:
 	write_unlock(&n->lock);
-	write_unlock_bh(&n->tbl->lock);
+	spin_unlock_bh(&n->tbl->lock);
 }
 
 static void neigh_update_managed_list(struct neighbour *n)
 {
 	bool on_managed_list, add_to_managed;
 
-	write_lock_bh(&n->tbl->lock);
+	spin_lock_bh(&n->tbl->lock);
 	write_lock(&n->lock);
 	if (n->dead)
 		goto out;
@@ -193,7 +193,7 @@ static void neigh_update_managed_list(struct neighbour *n)
 		list_add_tail(&n->managed_list, &n->tbl->managed_list);
 out:
 	write_unlock(&n->lock);
-	write_unlock_bh(&n->tbl->lock);
+	spin_unlock_bh(&n->tbl->lock);
 }
 
 static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
@@ -263,7 +263,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 
 	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
 		if (refcount_read(&n->refcnt) == 1) {
@@ -292,7 +292,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 
 	WRITE_ONCE(tbl->last_flush, jiffies);
 unlock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	return shrunk;
 }
@@ -454,23 +454,23 @@ static void neigh_flush_table(struct neigh_table *tbl)
 
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev, false);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 }
 EXPORT_SYMBOL(neigh_changeaddr);
 
 static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
 			  bool skip_perm)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	if (likely(dev)) {
 		neigh_flush_dev(tbl, dev, skip_perm);
 	} else {
 		DEBUG_NET_WARN_ON_ONCE(skip_perm);
 		neigh_flush_table(tbl);
 	}
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	pneigh_ifdown(tbl, dev, skip_perm);
 	pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
@@ -687,7 +687,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
 
 	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
 
@@ -722,13 +722,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
 	hlist_add_head_rcu(&n->dev_list,
 			   neigh_get_dev_table(dev, tbl->family));
 
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	neigh_dbg(2, "neigh %p is created\n", n);
 	rc = n;
 out:
 	return rc;
 out_tbl_unlock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 out_neigh_release:
 	if (!exempt_from_gc)
 		atomic_dec(&tbl->gc_entries);
@@ -982,7 +982,7 @@ static void neigh_periodic_work(struct work_struct *work)
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
 
@@ -1036,9 +1036,9 @@ static void neigh_periodic_work(struct work_struct *work)
 		 * It's fine to release lock here, even if hash table
 		 * grows while we are preempted.
 		 */
-		write_unlock_bh(&tbl->lock);
+		spin_unlock_bh(&tbl->lock);
 		cond_resched();
-		write_lock_bh(&tbl->lock);
+		spin_lock_bh(&tbl->lock);
 		nht = rcu_dereference_protected(tbl->nht,
 						lockdep_is_held(&tbl->lock));
 	}
@@ -1049,7 +1049,7 @@ out:
 	 */
 	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
 			      NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 }
 
 static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1641,12 +1641,12 @@ static void neigh_managed_work(struct work_struct *work)
 					       managed_work.work);
 	struct neighbour *neigh;
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	list_for_each_entry(neigh, &tbl->managed_list, managed_list)
 		neigh_event_send_probe(neigh, NULL, false);
 	queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
 			   NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 }
 
 static void neigh_proxy_process(struct timer_list *t)
@@ -1761,9 +1761,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 			return NULL;
 		}
 
-		write_lock_bh(&tbl->lock);
+		spin_lock_bh(&tbl->lock);
 		list_add_rcu(&p->list, &tbl->parms.list);
-		write_unlock_bh(&tbl->lock);
+		spin_unlock_bh(&tbl->lock);
 
 		neigh_parms_data_state_cleanall(p);
 	}
@@ -1783,10 +1783,12 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 {
 	if (!parms || parms == &tbl->parms)
 		return;
-	write_lock_bh(&tbl->lock);
+
+	spin_lock_bh(&tbl->lock);
 	list_del_rcu(&parms->list);
 	parms->dead = 1;
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
+
 	netdev_put(parms->dev, &parms->dev_tracker);
 	call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 }
@@ -1835,7 +1837,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	else
 		WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
 
-	rwlock_init(&tbl->lock);
+	spin_lock_init(&tbl->lock);
 	mutex_init(&tbl->phash_lock);
 
 	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
@@ -1978,10 +1980,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = __neigh_update(neigh, NULL, NUD_FAILED,
 			     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN,
 			     NETLINK_CB(skb).portid, extack);
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_release(neigh);
 	neigh_remove_one(neigh);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 out:
 	return err;
@@ -2406,7 +2408,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
 	 * We acquire tbl->lock to be nice to the periodic timers and
 	 * make sure they always see a consistent set of values.
 	 */
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	if (tb[NDTA_PARMS]) {
 		struct nlattr *tbp[NDTPA_MAX+1];
@@ -2525,7 +2527,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = 0;
 
 errout_tbl_lock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	rcu_read_unlock();
 errout:
 	return err;
@@ -3125,14 +3127,14 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
 	rcu_read_lock();
 	nht = rcu_dereference(tbl->nht);
 
-	read_lock_bh(&tbl->lock); /* avoid resizes */
+	spin_lock_bh(&tbl->lock); /* avoid resizes */
 	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
 		struct neighbour *n;
 
 		neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
 			cb(n, cookie);
 	}
-	read_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_for_each);
@@ -3402,7 +3404,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 
 	rcu_read_lock();
 	state->nht = rcu_dereference(tbl->nht);
-	read_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
@@ -3442,7 +3444,7 @@ void neigh_seq_stop(struct seq_file *seq, void *v)
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
 
-	read_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 833f2cf97178..f3bfecf8a234 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1217,10 +1217,10 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN, 0);
-		write_lock_bh(&tbl->lock);
+		spin_lock_bh(&tbl->lock);
 		neigh_release(neigh);
 		neigh_remove_one(neigh);
-		write_unlock_bh(&tbl->lock);
+		spin_unlock_bh(&tbl->lock);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 330ce8ffc1848cbfa3e06c2c22750cfffa115579 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 23 Oct 2025 10:16:30 +0100
Subject: net: phy: add phy_can_wakeup()

Add phy_can_wakeup() to report whether the PHY driver has marked the
PHY device as being wake-up capable as far as the driver model is
concerned.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrQs-0000000BLzI-0w3U@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3c7634482356..3eeeaec52832 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1379,6 +1379,18 @@ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode
 	linkmode_clear_bit(link_mode, phydev->advertising_eee);
 }
 
+/**
+ * phy_can_wakeup() - indicate whether PHY has driver model wakeup capabilities
+ * @phydev: The phy_device struct
+ *
+ * Returns: true/false depending on the PHY driver's device_set_wakeup_capable()
+ * setting.
+ */
+static inline bool phy_can_wakeup(struct phy_device *phydev)
+{
+	return device_can_wakeup(&phydev->mdio.dev);
+}
+
 void phy_resolve_aneg_pause(struct phy_device *phydev);
 void phy_resolve_aneg_linkmode(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From b344bfacf1de2dd776a218ce8341b9c672745a01 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 23 Oct 2025 10:16:35 +0100
Subject: net: phy: add phy_may_wakeup()

Add phy_may_wakeup() which uses the driver model's device_may_wakeup()
when the PHY driver has marked the device as wakeup capable in the
driver model, otherwise use phy_drv_wol_enabled().

Replace the sites that used to call phy_drv_wol_enabled() with this
as checking the driver model will be more efficient than checking the
WoL state.

Export phy_may_wakeup() so that phylink can use it.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrQx-0000000BLzO-1RLt@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 14 ++++++++++++--
 include/linux/phy.h          |  9 +++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 7a67c900e79a..b7feaf0cb1df 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -251,6 +251,16 @@ static bool phy_drv_wol_enabled(struct phy_device *phydev)
 	return wol.wolopts != 0;
 }
 
+bool phy_may_wakeup(struct phy_device *phydev)
+{
+	/* If the PHY is using driver-model based wakeup, use that state. */
+	if (phy_can_wakeup(phydev))
+		return device_may_wakeup(&phydev->mdio.dev);
+
+	return phy_drv_wol_enabled(phydev);
+}
+EXPORT_SYMBOL_GPL(phy_may_wakeup);
+
 static void phy_link_change(struct phy_device *phydev, bool up)
 {
 	struct net_device *netdev = phydev->attached_dev;
@@ -302,7 +312,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
 	/* If the PHY on the mido bus is not attached but has WOL enabled
 	 * we cannot suspend the PHY.
 	 */
-	if (!netdev && phy_drv_wol_enabled(phydev))
+	if (!netdev && phy_may_wakeup(phydev))
 		return false;
 
 	/* PHY not attached? May suspend if the PHY has not already been
@@ -1909,7 +1919,7 @@ int phy_suspend(struct phy_device *phydev)
 	if (phydev->suspended || !phydrv)
 		return 0;
 
-	phydev->wol_enabled = phy_drv_wol_enabled(phydev) ||
+	phydev->wol_enabled = phy_may_wakeup(phydev) ||
 			      (netdev && netdev->ethtool->wol_enabled);
 	/* If the device has WOL enabled, we cannot suspend the PHY */
 	if (phydev->wol_enabled && !(phydrv->flags & PHY_ALWAYS_CALL_SUSPEND))
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3eeeaec52832..17a2cdc9f1a0 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1391,6 +1391,15 @@ static inline bool phy_can_wakeup(struct phy_device *phydev)
 	return device_can_wakeup(&phydev->mdio.dev);
 }
 
+/**
+ * phy_may_wakeup() - indicate whether PHY has wakeup enabled
+ * @phydev: The phy_device struct
+ *
+ * Returns: true/false depending on the PHY driver's device_set_wakeup_enabled()
+ * setting if using the driver model, otherwise the legacy determination.
+ */
+bool phy_may_wakeup(struct phy_device *phydev);
+
 void phy_resolve_aneg_pause(struct phy_device *phydev);
 void phy_resolve_aneg_linkmode(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From b79fbd86c84918790c128e6899b420de4667018e Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 23 Oct 2025 10:16:40 +0100
Subject: net: phylink: add phylink managed MAC Wake-on-Lan support

Add core phylink managed Wake-on-Lan support, which is enabled when the
MAC driver fills in the new .mac_wol_set() method that this commit
creates.

When this feature is disabled, phylink acts as it has in the past,
merely passing the ethtool WoL calls to phylib whenever a PHY exists.
No other new functionality provided by this commit is enabled.

When this feature is enabled, a more intelligent approach is used.
Phylink will first pass WoL options to the PHY, read them back, and
attempt to set any options that were not set at the PHY at the MAC.

Since we have PHY drivers that report they support WoL, and accept WoL
configuration even though they aren't wired up to be capable of waking
the system, we need a way to differentiate between PHYs that think
they support WoL and those which actually do. As PHY drivers do not
make use of the driver model's wake-up infrastructure, but could, we
use this to determine whether PHY drivers can participate. This gives
a path forward where, as MAC drivers are converted to this, it
encourages PHY drivers to also be converted.

Phylink will also ignore the mac_wol argument to phylink_suspend() as
it now knows the WoL state at the MAC.

MAC drivers are expected to record/configure the Wake-on-Lan state in
their .mac_wol_set() method, and deal appropriately with it in their
suspend/resume methods. The driver model provides assistance to set the
IRQ wake support which may assist driver authors in achieving the
necessary configuration.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrR2-0000000BLzU-1xYL@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 80 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/phylink.h   | 26 +++++++++++++++
 2 files changed, 102 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 9d7799ea1c17..bec44ebdf80b 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -93,6 +93,9 @@ struct phylink {
 	u8 sfp_port;
 
 	struct eee_config eee_cfg;
+
+	u32 wolopts_mac;
+	u8 wol_sopass[SOPASS_MAX];
 };
 
 #define phylink_printk(level, pl, fmt, ...) \
@@ -2562,6 +2565,17 @@ void phylink_rx_clk_stop_unblock(struct phylink *pl)
 }
 EXPORT_SYMBOL_GPL(phylink_rx_clk_stop_unblock);
 
+static bool phylink_mac_supports_wol(struct phylink *pl)
+{
+	return !!pl->mac_ops->mac_wol_set;
+}
+
+static bool phylink_phy_supports_wol(struct phylink *pl,
+				     struct phy_device *phydev)
+{
+	return phydev && (pl->config->wol_phy_legacy || phy_can_wakeup(phydev));
+}
+
 /**
  * phylink_suspend() - handle a network device suspend event
  * @pl: a pointer to a &struct phylink returned from phylink_create()
@@ -2575,11 +2589,17 @@ EXPORT_SYMBOL_GPL(phylink_rx_clk_stop_unblock);
  *   can also bring down the link between the MAC and PHY.
  * - If Wake-on-Lan is active, but being handled by the MAC, the MAC
  *   still needs to receive packets, so we can not bring the link down.
+ *
+ * Note: when phylink managed Wake-on-Lan is in use, @mac_wol is ignored.
+ * (struct phylink_mac_ops.mac_set_wol populated.)
  */
 void phylink_suspend(struct phylink *pl, bool mac_wol)
 {
 	ASSERT_RTNL();
 
+	if (phylink_mac_supports_wol(pl))
+		mac_wol = !!pl->wolopts_mac;
+
 	if (mac_wol && (!pl->netdev || pl->netdev->ethtool->wol_enabled)) {
 		/* Wake-on-Lan enabled, MAC handling */
 		mutex_lock(&pl->state_mutex);
@@ -2689,8 +2709,24 @@ void phylink_ethtool_get_wol(struct phylink *pl, struct ethtool_wolinfo *wol)
 	wol->supported = 0;
 	wol->wolopts = 0;
 
-	if (pl->phydev)
-		phy_ethtool_get_wol(pl->phydev, wol);
+	if (phylink_mac_supports_wol(pl)) {
+		if (phylink_phy_supports_wol(pl, pl->phydev))
+			phy_ethtool_get_wol(pl->phydev, wol);
+
+		/* Where the MAC augments the WoL support, merge its support and
+		 * current configuration.
+		 */
+		if (~wol->wolopts & pl->wolopts_mac & WAKE_MAGICSECURE)
+			memcpy(wol->sopass, pl->wol_sopass,
+			       sizeof(wol->sopass));
+
+		wol->supported |= pl->config->wol_mac_support;
+		wol->wolopts |= pl->wolopts_mac;
+	} else {
+		/* Legacy */
+		if (pl->phydev)
+			phy_ethtool_get_wol(pl->phydev, wol);
+	}
 }
 EXPORT_SYMBOL_GPL(phylink_ethtool_get_wol);
 
@@ -2707,12 +2743,48 @@ EXPORT_SYMBOL_GPL(phylink_ethtool_get_wol);
  */
 int phylink_ethtool_set_wol(struct phylink *pl, struct ethtool_wolinfo *wol)
 {
+	struct ethtool_wolinfo w = { .cmd = ETHTOOL_GWOL };
 	int ret = -EOPNOTSUPP;
+	bool changed;
+	u32 wolopts;
 
 	ASSERT_RTNL();
 
-	if (pl->phydev)
-		ret = phy_ethtool_set_wol(pl->phydev, wol);
+	if (phylink_mac_supports_wol(pl)) {
+		wolopts = wol->wolopts;
+
+		if (phylink_phy_supports_wol(pl, pl->phydev)) {
+			ret = phy_ethtool_set_wol(pl->phydev, wol);
+			if (ret != 0 && ret != -EOPNOTSUPP)
+				return ret;
+
+			phy_ethtool_get_wol(pl->phydev, &w);
+
+			/* Any Wake-on-Lan modes which the PHY is handling
+			 * should not be passed on to the MAC.
+			 */
+			wolopts &= ~w.wolopts;
+		}
+
+		wolopts &= pl->config->wol_mac_support;
+		changed = pl->wolopts_mac != wolopts;
+		if (wolopts & WAKE_MAGICSECURE)
+			changed |= !!memcmp(wol->sopass, pl->wol_sopass,
+					    sizeof(wol->sopass));
+		memcpy(pl->wol_sopass, wol->sopass, sizeof(pl->wol_sopass));
+
+		if (changed) {
+			ret = pl->mac_ops->mac_wol_set(pl->config, wolopts,
+						       wol->sopass);
+			if (!ret)
+				pl->wolopts_mac = wolopts;
+		} else {
+			ret = 0;
+		}
+	} else {
+		if (pl->phydev)
+			ret = phy_ethtool_set_wol(pl->phydev, wol);
+	}
 
 	return ret;
 }
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 9af0411761d7..59cb58b29d1d 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -156,6 +156,8 @@ enum phylink_op_type {
  * @lpi_capabilities: MAC speeds which can support LPI signalling
  * @lpi_timer_default: Default EEE LPI timer setting.
  * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time
+ * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false
+ * @wol_mac_support: Bitmask of MAC supported %WAKE_* options
  */
 struct phylink_config {
 	struct device *dev;
@@ -173,6 +175,10 @@ struct phylink_config {
 	unsigned long lpi_capabilities;
 	u32 lpi_timer_default;
 	bool eee_enabled_default;
+
+	/* Wake-on-Lan support */
+	bool wol_phy_legacy;
+	u32 wol_mac_support;
 };
 
 void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
@@ -188,6 +194,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
  * @mac_link_up: allow the link to come up.
  * @mac_disable_tx_lpi: disable LPI.
  * @mac_enable_tx_lpi: enable and configure LPI.
+ * @mac_wol_set: configure Wake-on-Lan settings at the MAC.
  *
  * The individual methods are described more fully below.
  */
@@ -211,6 +218,9 @@ struct phylink_mac_ops {
 	void (*mac_disable_tx_lpi)(struct phylink_config *config);
 	int (*mac_enable_tx_lpi)(struct phylink_config *config, u32 timer,
 				 bool tx_clk_stop);
+
+	int (*mac_wol_set)(struct phylink_config *config, u32 wolopts,
+			   const u8 *sopass);
 };
 
 #if 0 /* For kernel-doc purposes only. */
@@ -440,6 +450,22 @@ void mac_disable_tx_lpi(struct phylink_config *config);
  */
 int mac_enable_tx_lpi(struct phylink_config *config, u32 timer,
 		      bool tx_clk_stop);
+
+/**
+ * mac_wol_set() - configure the Wake-on-Lan parameters
+ * @config: a pointer to a &struct phylink_config.
+ * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes.
+ * @sopass: SecureOn(tm) password; meaningful only for %WAKE_MAGICSECURE
+ *
+ * Enable the specified Wake-on-Lan options at the MAC. Options that the
+ * PHY can handle will have been removed from @wolopts.
+ *
+ * The presence of this method enables phylink-managed WoL support.
+ *
+ * Returns: 0 on success.
+ */
+int (*mac_wol_set)(struct phylink_config *config, u32 wolopts,
+		   const u8 *sopass);
 #endif
 
 struct phylink_pcs_ops;
-- 
cgit v1.2.3


From dc1a2a9ce5b2c80e02115ff6fb29b726ad9d7777 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 23 Oct 2025 10:16:45 +0100
Subject: net: phylink: add phylink managed wake-on-lan PHY speed control

Some drivers, e.g. stmmac, use the speed_up()/speed_down() APIs to
gain additional power saving during Wake-on-LAN where the PHY is
managing the state.

Add support to phylink for this, which can be enabled by the MAC
driver. Only change the PHY speed if the PHY is configured for
wake-up, but without any wake-up on the MAC side, as MAC side
means changing the configuration once the negotiation has
completed.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrR7-0000000BLza-2PjK@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 12 ++++++++++++
 include/linux/phylink.h   |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index bec44ebdf80b..6e1243bf68aa 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2576,6 +2576,12 @@ static bool phylink_phy_supports_wol(struct phylink *pl,
 	return phydev && (pl->config->wol_phy_legacy || phy_can_wakeup(phydev));
 }
 
+static bool phylink_phy_pm_speed_ctrl(struct phylink *pl)
+{
+	return pl->config->wol_phy_speed_ctrl && !pl->wolopts_mac &&
+	       pl->phydev && phy_may_wakeup(pl->phydev);
+}
+
 /**
  * phylink_suspend() - handle a network device suspend event
  * @pl: a pointer to a &struct phylink returned from phylink_create()
@@ -2625,6 +2631,9 @@ void phylink_suspend(struct phylink *pl, bool mac_wol)
 	} else {
 		phylink_stop(pl);
 	}
+
+	if (phylink_phy_pm_speed_ctrl(pl))
+		phylink_speed_down(pl, false);
 }
 EXPORT_SYMBOL_GPL(phylink_suspend);
 
@@ -2664,6 +2673,9 @@ void phylink_resume(struct phylink *pl)
 {
 	ASSERT_RTNL();
 
+	if (phylink_phy_pm_speed_ctrl(pl))
+		phylink_speed_up(pl);
+
 	if (test_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state)) {
 		/* Wake-on-Lan enabled, MAC handling */
 
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 59cb58b29d1d..38363e566ac3 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -157,6 +157,7 @@ enum phylink_op_type {
  * @lpi_timer_default: Default EEE LPI timer setting.
  * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time
  * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false
+ * @wol_phy_speed_ctrl: Use phy speed control on suspend/resume
  * @wol_mac_support: Bitmask of MAC supported %WAKE_* options
  */
 struct phylink_config {
@@ -178,6 +179,7 @@ struct phylink_config {
 
 	/* Wake-on-Lan support */
 	bool wol_phy_legacy;
+	bool wol_phy_speed_ctrl;
 	u32 wol_mac_support;
 };
 
-- 
cgit v1.2.3


From eea31f21dce10814e34dc7ef7ed5136269c7bb59 Mon Sep 17 00:00:00 2001
From: Adithya Jayachandran <ajayachandra@nvidia.com>
Date: Wed, 15 Oct 2025 18:40:55 -0700
Subject: {rdma,net}/mlx5: Query vports mac address from device

Before this patch during either switchdev or legacy mode enablement we
cleared the mac address of vports between changes. This change allows us
to preserve the vports mac address between eswitch mode changes.

Vports hold information for VFs/SFs such as the permanent mac address.
VF/SF mac can be set either by iproute vf interface or devlink function
interface. For no obvious reason we reset it to 0 on switchdev/legacy
mode changes; this patch fixes that, to align with other vport
information that is never reset, e.g. GUID, MTU, promisc mode, etc.

Signed-off-by: Adithya Jayachandran <ajayachandra@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Acked-by: Leon Romanovsky <leon@kernel.org> # RDMA
---
 drivers/infiniband/hw/mlx5/main.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 20 +++++++-----------
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  3 +++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 24 +++++++++++-----------
 include/linux/mlx5/vport.h                         |  3 ++-
 5 files changed, 25 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fc1e86f6c409..90daa58126f4 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -842,7 +842,7 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
 		break;
 
 	case MLX5_VPORT_ACCESS_METHOD_NIC:
-		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
+		err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, false, &tmp);
 		break;
 
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index e2ffb87b94cb..25af8bd7f077 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -875,13 +875,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
 				      vport_num, 1,
 				      vport->info.link_state);
 
-	/* Host PF has its own mac/guid. */
-	if (vport_num) {
-		mlx5_modify_nic_vport_mac_address(esw->dev, vport_num,
-						  vport->info.mac);
-		mlx5_modify_nic_vport_node_guid(esw->dev, vport_num,
-						vport->info.node_guid);
-	}
+	mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true,
+					 vport->info.mac);
+	mlx5_query_nic_vport_node_guid(esw->dev, vport_num, true,
+				       &vport->info.node_guid);
 
 	flags = (vport->info.vlan || vport->info.qos) ?
 		SET_VLAN_STRIP | SET_VLAN_INSERT : 0;
@@ -947,12 +944,6 @@ int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
 			goto err_vhca_mapping;
 	}
 
-	/* External controller host PF has factory programmed MAC.
-	 * Read it from the device.
-	 */
-	if (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF)
-		mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, vport->info.mac);
-
 	esw_vport_change_handle_locked(vport);
 
 	esw->enabled_vports++;
@@ -2235,6 +2226,9 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
 	ivi->vf = vport - 1;
 
 	mutex_lock(&esw->state_lock);
+
+	mlx5_query_nic_vport_mac_address(esw->dev, vport, true,
+					 evport->info.mac);
 	ether_addr_copy(ivi->mac, evport->info.mac);
 	ivi->linkstate = evport->info.link_state;
 	ivi->vlan = evport->info.vlan;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4cf995be127d..880e238497b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4303,6 +4303,9 @@ int mlx5_devlink_port_fn_hw_addr_get(struct devlink_port *port,
 	struct mlx5_vport *vport = mlx5_devlink_port_vport_get(port);
 
 	mutex_lock(&esw->state_lock);
+
+	mlx5_query_nic_vport_mac_address(esw->dev, vport->vport, true,
+					 vport->info.mac);
 	ether_addr_copy(hw_addr, vport->info.mac);
 	*hw_addr_len = ETH_ALEN;
 	mutex_unlock(&esw->state_lock);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 2ed2e530b07d..d1483f66cd0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -78,15 +78,14 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 }
 
 static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
-					u32 *out)
+					bool other_vport, u32 *out)
 {
 	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {};
 
 	MLX5_SET(query_nic_vport_context_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
 	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
-	if (vport)
-		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, other_vport);
 
 	return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out);
 }
@@ -97,7 +96,7 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 	u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {};
 	int err;
 
-	err = mlx5_query_nic_vport_context(mdev, vport, out);
+	err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out);
 	if (!err)
 		*min_inline = MLX5_GET(query_nic_vport_context_out, out,
 				       nic_vport_context.min_wqe_inline_mode);
@@ -219,7 +218,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, 0, false, out);
 	if (!err)
 		*mtu = MLX5_GET(query_nic_vport_context_out, out,
 				nic_vport_context.mtu);
@@ -429,7 +428,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, 0, false, out);
 	if (err)
 		goto out;
 
@@ -451,7 +450,7 @@ int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group)
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, 0, false, out);
 	if (err)
 		goto out;
 
@@ -462,7 +461,8 @@ out:
 	return err;
 }
 
-int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev,
+				   u16 vport, bool other_vport, u64 *node_guid)
 {
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
@@ -472,7 +472,7 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, vport, other_vport, out);
 	if (err)
 		goto out;
 
@@ -529,7 +529,7 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, 0, false, out);
 	if (err)
 		goto out;
 
@@ -804,7 +804,7 @@ int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev,
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, vport, out);
+	err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out);
 	if (err)
 		goto out;
 
@@ -908,7 +908,7 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status)
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	err = mlx5_query_nic_vport_context(mdev, 0, false, out);
 	if (err)
 		goto out;
 
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index c87b9507cfa1..f876bfc0669c 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -73,7 +73,8 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
 int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
-int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev,
+				   u16 vport, bool other_vport, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
 int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
-- 
cgit v1.2.3


From a392cde88d19af917740d27e13115447d3b21a06 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Tue, 23 Sep 2025 17:23:22 +0000
Subject: wifi: cfg80211/mac80211: validate radio frequency range for monitor
 mode

In multi-radio devices, it is possible to have an MLD AP and a monitor
interface active at the same time. In such cases, monitor mode may not
be able to specify a fixed channel and could end up capturing frames
from all radios, including those outside the intended frequency bands.

This patch adds frequency validation for monitor mode. Received frames
are now only processed if their frequency falls within the allowed ranges
of the radios specified by the interface's radio_mask.

This prevents monitor mode from capturing frames outside the supported radio.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://patch.msgid.link/700b8284e845d96654eb98431f8eeb5a81503862.1758647858.git.ryder.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 14 ++++++++++++++
 net/mac80211/rx.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/wireless/util.c    |  6 +++---
 3 files changed, 66 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 781624f5913a..3b6f48a783bb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1015,6 +1015,7 @@ const struct cfg80211_chan_def *
 cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
 			    const struct cfg80211_chan_def *chandef2);
 
+
 /**
  * nl80211_chan_width_to_mhz - get the channel width in MHz
  * @chan_width: the channel width from &enum nl80211_chan_width
@@ -6882,6 +6883,19 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
 	return ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5;
 }
 
+/**
+ * ieee80211_radio_freq_range_valid - Check if the radio supports the
+ * specified frequency range
+ *
+ * @radio: wiphy radio
+ * @freq: the frequency (in kHz) to be queried
+ * @width: the bandwidth (in kHz) to be queried
+ *
+ * Return: whether or not the given frequency range is valid for the given radio
+ */
+bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio,
+				      u32 freq, u32 width);
+
 /**
  * cfg80211_radio_chandef_valid - Check if the radio supports the chandef
  *
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6af43dfefdd6..29175a0c9f68 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -763,6 +763,51 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local,
 	return skb;
 }
 
+static bool
+ieee80211_validate_monitor_radio(struct ieee80211_sub_if_data *sdata,
+				 struct ieee80211_local *local,
+				 struct ieee80211_rx_status *status)
+{
+	struct wiphy *wiphy = local->hw.wiphy;
+	int i, freq, bw;
+
+	if (!wiphy->n_radio)
+		return true;
+
+	switch (status->bw) {
+	case RATE_INFO_BW_20:
+		bw = 20000;
+		break;
+	case RATE_INFO_BW_40:
+		bw = 40000;
+		break;
+	case RATE_INFO_BW_80:
+		bw = 80000;
+		break;
+	case RATE_INFO_BW_160:
+		bw = 160000;
+		break;
+	case RATE_INFO_BW_320:
+		bw = 320000;
+		break;
+	default:
+		return false;
+	}
+
+	freq = MHZ_TO_KHZ(status->freq);
+
+	for (i = 0; i < wiphy->n_radio; i++) {
+		if (!(sdata->wdev.radio_mask & BIT(i)))
+			continue;
+
+		if (!ieee80211_radio_freq_range_valid(&wiphy->radio[i], freq, bw))
+			continue;
+
+		return true;
+	}
+	return false;
+}
+
 /*
  * This function copies a received frame to all monitor interfaces and
  * returns a cleaned-up SKB that no longer includes the FCS nor the
@@ -855,6 +900,10 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 		    chandef->chan->center_freq != status->freq)
 			continue;
 
+		if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) &&
+		    !ieee80211_validate_monitor_radio(sdata, local, status))
+			continue;
+
 		if (!prev_sdata) {
 			prev_sdata = sdata;
 			continue;
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 56724b33af04..97f40c6d1e9d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2942,9 +2942,8 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type)
 }
 EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa);
 
-static bool
-ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio,
-				 u32 freq, u32 width)
+bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio,
+				      u32 freq, u32 width)
 {
 	const struct wiphy_radio_freq_range *r;
 	int i;
@@ -2958,6 +2957,7 @@ ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio,
 
 	return false;
 }
+EXPORT_SYMBOL(ieee80211_radio_freq_range_valid);
 
 bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio,
 				  const struct cfg80211_chan_def *chandef)
-- 
cgit v1.2.3


From 7cc986c04a9b07d91684f7e326fa5b960215bc97 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Fri, 24 Oct 2025 10:16:48 +0530
Subject: wifi: cfg80211: Add debugfs support for multi-radio wiphy

In multi-radio wiphy architecture, where a single wiphy can have
multiple radios tied to it, radio specific configuration parameters
and global wiphy parameters are maintained for the entire physical
device and common to all radios. But, each radio in a wiphy can have
different values for each radio configuration parameter, like RTS
threshold. With the current debugfs directory structure, the values
of global wiphy configuration parameters can be viewed, but, values
of individual radio configuration parameters cannot be viewed, as
radio specific configuration parameters are not maintained, separately.

To address this, in addition to maintaining global wiphy configuration
parameters common to all radios, create separate debugfs directories
for each radio in a wiphy to maintain parameters corresponding to that
radio in this directory.

In implementation, maintain a dentry structure in wiphy_radio_cfg, a
structure containing radio configurations of a wiphy. This struct is
maintained to denote per-radio configurations of a wiphy. Create
separate directories representing each radio within phy#X directory in
debugfs during wiphy registration.

Sample directory structure with this change:
ls /sys/kernel/debug/ieee80211/phy0/radio
radio0/ radio1/ radio2/

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20251024044649.483557-2-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++++
 net/wireless/core.c    | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3b6f48a783bb..53490eb04e87 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5684,9 +5684,13 @@ struct wiphy_iftype_akm_suites {
  *
  * @rts_threshold: RTS threshold (dot11RTSThreshold);
  *	-1 (default) = RTS/CTS disabled
+ * @radio_debugfsdir: Pointer to debugfs directory containing the radio-
+ *	specific parameters.
+ *	NULL (default) = Debugfs directory not created
  */
 struct wiphy_radio_cfg {
 	u32 rts_threshold;
+	struct dentry *radio_debugfsdir;
 };
 
 /**
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 797f9f2004a6..f3568eb5e592 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -34,6 +34,9 @@
 /* name for sysfs, %d is appended */
 #define PHY_NAME "phy"
 
+/* maximum length of radio debugfs directory name */
+#define RADIO_DEBUGFSDIR_MAX_LEN	8
+
 MODULE_AUTHOR("Johannes Berg");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("wireless configuration support");
@@ -1042,6 +1045,18 @@ int wiphy_register(struct wiphy *wiphy)
 	/* add to debugfs */
 	rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy),
 						    ieee80211_debugfs_dir);
+	if (wiphy->n_radio > 0) {
+		int idx;
+		char radio_name[RADIO_DEBUGFSDIR_MAX_LEN];
+
+		for (idx = 0; idx < wiphy->n_radio; idx++) {
+			scnprintf(radio_name, sizeof(radio_name), "radio%d",
+				  idx);
+			wiphy->radio_cfg[idx].radio_debugfsdir =
+				debugfs_create_dir(radio_name,
+						   rdev->wiphy.debugfsdir);
+		}
+	}
 
 	cfg80211_debugfs_rdev_add(rdev);
 	nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
-- 
cgit v1.2.3


From f864e4b721e386be132cc973eadefe5d52cdfd94 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Wed, 15 Oct 2025 20:26:07 +0100
Subject: clk: renesas: rzv2h: Add support for DSI clocks

Add support for PLLDSI and its post-dividers in the RZ/V2H CPG driver and
export helper APIs for use by the DSI driver.

Introduce per-PLL-DSI state in the CPG private structure and provide a
set of helper functions that find valid PLL parameter combinations for
a requested frequency. The new helpers are rzv2h_get_pll_pars() and
rzv2h_get_pll_divs_pars(), and they are exported in the "RZV2H_CPG"
namespace for use by other consumers (notably the DSI driver). These
helpers perform iterative searches over PLL parameters (M, K, P, S)
and optional post-dividers and return the best match (or an exact
match when possible).

Move PLL/CLK related limits and parameter types into the shared
include (include/linux/clk/renesas.h) by adding struct rzv2h_pll_limits,
struct rzv2h_pll_pars and struct rzv2h_pll_div_pars plus the
RZV2H_CPG_PLL_DSI_LIMITS() helper macro to define DSI PLL limits.

This change centralises the PLLDSI algorithms so the CPG and DSI
drivers compute PLL parameters consistently and allows the DSI driver
to accurately request rates and program its PLL.

Co-developed-by: Fabrizio Castro <fabrizio.castro.jz@renesas.com>
Signed-off-by: Fabrizio Castro <fabrizio.castro.jz@renesas.com>
Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Tomi Valkeinen <tomi.valkeinen+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251015192611.241920-4-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/clk/renesas/rzv2h-cpg.c | 497 ++++++++++++++++++++++++++++++++++++++++
 drivers/clk/renesas/rzv2h-cpg.h |  19 +-
 include/linux/clk/renesas.h     | 145 ++++++++++++
 3 files changed, 659 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/renesas/rzv2h-cpg.c b/drivers/clk/renesas/rzv2h-cpg.c
index 6abac15d3475..182437800329 100644
--- a/drivers/clk/renesas/rzv2h-cpg.c
+++ b/drivers/clk/renesas/rzv2h-cpg.c
@@ -14,9 +14,14 @@
 #include <linux/bitfield.h>
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
+#include <linux/clk/renesas.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/iopoll.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/minmax.h>
 #include <linux/mod_devicetable.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -26,6 +31,7 @@
 #include <linux/refcount.h>
 #include <linux/reset-controller.h>
 #include <linux/string_choices.h>
+#include <linux/units.h>
 
 #include <dt-bindings/clock/renesas-cpg-mssr.h>
 
@@ -47,7 +53,9 @@
 
 #define CPG_PLL_STBY(x)		((x))
 #define CPG_PLL_STBY_RESETB	BIT(0)
+#define CPG_PLL_STBY_SSC_EN	BIT(2)
 #define CPG_PLL_STBY_RESETB_WEN	BIT(16)
+#define CPG_PLL_STBY_SSC_EN_WEN BIT(18)
 #define CPG_PLL_CLK1(x)		((x) + 0x004)
 #define CPG_PLL_CLK1_KDIV	GENMASK(31, 16)
 #define CPG_PLL_CLK1_MDIV	GENMASK(15, 6)
@@ -65,6 +73,22 @@
 
 #define CPG_CLKSTATUS0		(0x700)
 
+/* On RZ/G3E SoC we have two DSI PLLs */
+#define MAX_CPG_DSI_PLL		2
+
+/**
+ * struct rzv2h_pll_dsi_info - PLL DSI information, holds the limits and parameters
+ *
+ * @pll_dsi_limits: PLL DSI parameters limits
+ * @pll_dsi_parameters: Calculated PLL DSI parameters
+ * @req_pll_dsi_rate: Requested PLL DSI rate
+ */
+struct rzv2h_pll_dsi_info {
+	const struct rzv2h_pll_limits *pll_dsi_limits;
+	struct rzv2h_pll_div_pars pll_dsi_parameters;
+	unsigned long req_pll_dsi_rate;
+};
+
 /**
  * struct rzv2h_cpg_priv - Clock Pulse Generator Private Data
  *
@@ -80,6 +104,7 @@
  * @ff_mod_status_ops: Fixed Factor Module Status Clock operations
  * @mstop_count: Array of mstop values
  * @rcdev: Reset controller entity
+ * @pll_dsi_info: Array of PLL DSI information, holds the limits and parameters
  */
 struct rzv2h_cpg_priv {
 	struct device *dev;
@@ -98,6 +123,8 @@ struct rzv2h_cpg_priv {
 	atomic_t *mstop_count;
 
 	struct reset_controller_dev rcdev;
+
+	struct rzv2h_pll_dsi_info pll_dsi_info[MAX_CPG_DSI_PLL];
 };
 
 #define rcdev_to_priv(x)	container_of(x, struct rzv2h_cpg_priv, rcdev)
@@ -168,6 +195,460 @@ struct rzv2h_ff_mod_status_clk {
 #define to_rzv2h_ff_mod_status_clk(_hw) \
 	container_of(_hw, struct rzv2h_ff_mod_status_clk, fix.hw)
 
+/**
+ * struct rzv2h_plldsi_div_clk - PLL DSI DDIV clock
+ *
+ * @dtable: divider table
+ * @priv: CPG private data
+ * @hw: divider clk
+ * @ddiv: divider configuration
+ */
+struct rzv2h_plldsi_div_clk {
+	const struct clk_div_table *dtable;
+	struct rzv2h_cpg_priv *priv;
+	struct clk_hw hw;
+	struct ddiv ddiv;
+};
+
+#define to_plldsi_div_clk(_hw) \
+	container_of(_hw, struct rzv2h_plldsi_div_clk, hw)
+
+#define RZ_V2H_OSC_CLK_IN_MEGA		(24 * MEGA)
+#define RZV2H_MAX_DIV_TABLES		(16)
+
+/**
+ * rzv2h_get_pll_pars - Finds the best combination of PLL parameters
+ * for a given frequency.
+ *
+ * @limits: Pointer to the structure containing the limits for the PLL parameters
+ * @pars: Pointer to the structure where the best calculated PLL parameters values
+ * will be stored
+ * @freq_millihz: Target output frequency in millihertz
+ *
+ * This function calculates the best set of PLL parameters (M, K, P, S) to achieve
+ * the desired frequency.
+ * There is no direct formula to calculate the PLL parameters, as it's an open
+ * system of equations, therefore this function uses an iterative approach to
+ * determine the best solution. The best solution is one that minimizes the error
+ * (desired frequency - actual frequency).
+ *
+ * Return: true if a valid set of parameters values is found, false otherwise.
+ */
+bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits,
+			struct rzv2h_pll_pars *pars, u64 freq_millihz)
+{
+	u64 fout_min_millihz = mul_u32_u32(limits->fout.min, MILLI);
+	u64 fout_max_millihz = mul_u32_u32(limits->fout.max, MILLI);
+	struct rzv2h_pll_pars p, best;
+
+	if (freq_millihz > fout_max_millihz ||
+	    freq_millihz < fout_min_millihz)
+		return false;
+
+	/* Initialize best error to maximum possible value */
+	best.error_millihz = S64_MAX;
+
+	for (p.p = limits->p.min; p.p <= limits->p.max; p.p++) {
+		u32 fref = RZ_V2H_OSC_CLK_IN_MEGA / p.p;
+		u16 divider;
+
+		for (divider = 1 << limits->s.min, p.s = limits->s.min;
+			p.s <= limits->s.max; p.s++, divider <<= 1) {
+			for (p.m = limits->m.min; p.m <= limits->m.max; p.m++) {
+				u64 output_m, output_k_range;
+				s64 pll_k, output_k;
+				u64 fvco, output;
+
+				/*
+				 * The frequency generated by the PLL + divider
+				 * is calculated as follows:
+				 *
+				 * With:
+				 * Freq = Ffout = Ffvco / 2^(pll_s)
+				 * Ffvco = (pll_m + (pll_k / 65536)) * Ffref
+				 * Ffref = 24MHz / pll_p
+				 *
+				 * Freq can also be rewritten as:
+				 * Freq = Ffvco / 2^(pll_s)
+				 *      = ((pll_m + (pll_k / 65536)) * Ffref) / 2^(pll_s)
+				 *      = (pll_m * Ffref) / 2^(pll_s) + ((pll_k / 65536) * Ffref) / 2^(pll_s)
+				 *      = output_m + output_k
+				 *
+				 * Every parameter has been determined at this
+				 * point, but pll_k.
+				 *
+				 * Considering that:
+				 * limits->k.min <= pll_k <= limits->k.max
+				 * Then:
+				 * -0.5 <= (pll_k / 65536) < 0.5
+				 * Therefore:
+				 * -Ffref / (2 * 2^(pll_s)) <= output_k < Ffref / (2 * 2^(pll_s))
+				 */
+
+				/* Compute output M component (in mHz) */
+				output_m = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(p.m, fref) * MILLI,
+								 divider);
+				/* Compute range for output K (in mHz) */
+				output_k_range = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(fref, MILLI),
+								       2 * divider);
+				/*
+				 * No point in continuing if we can't achieve
+				 * the desired frequency
+				 */
+				if (freq_millihz <  (output_m - output_k_range) ||
+				    freq_millihz >= (output_m + output_k_range)) {
+					continue;
+				}
+
+				/*
+				 * Compute the K component
+				 *
+				 * Since:
+				 * Freq = output_m + output_k
+				 * Then:
+				 * output_k = Freq - output_m
+				 *          = ((pll_k / 65536) * Ffref) / 2^(pll_s)
+				 * Therefore:
+				 * pll_k = (output_k * 65536 * 2^(pll_s)) / Ffref
+				 */
+				output_k = freq_millihz - output_m;
+				pll_k = div_s64(output_k * 65536ULL * divider,
+						fref);
+				pll_k = DIV_S64_ROUND_CLOSEST(pll_k, MILLI);
+
+				/* Validate K value within allowed limits */
+				if (pll_k < limits->k.min ||
+				    pll_k > limits->k.max)
+					continue;
+
+				p.k = pll_k;
+
+				/* Compute (Ffvco * 65536) */
+				fvco = mul_u32_u32(p.m * 65536 + p.k, fref);
+				if (fvco < mul_u32_u32(limits->fvco.min, 65536) ||
+				    fvco > mul_u32_u32(limits->fvco.max, 65536))
+					continue;
+
+				/* PLL_M component of (output * 65536 * PLL_P) */
+				output = mul_u32_u32(p.m * 65536, RZ_V2H_OSC_CLK_IN_MEGA);
+				/* PLL_K component of (output * 65536 * PLL_P) */
+				output += p.k * RZ_V2H_OSC_CLK_IN_MEGA;
+				/* Make it in mHz */
+				output *= MILLI;
+				output = DIV_U64_ROUND_CLOSEST(output, 65536 * p.p * divider);
+
+				/* Check output frequency against limits */
+				if (output < fout_min_millihz ||
+				    output > fout_max_millihz)
+					continue;
+
+				p.error_millihz = freq_millihz - output;
+				p.freq_millihz = output;
+
+				/* If an exact match is found, return immediately */
+				if (p.error_millihz == 0) {
+					*pars = p;
+					return true;
+				}
+
+				/* Update best match if error is smaller */
+				if (abs(best.error_millihz) > abs(p.error_millihz))
+					best = p;
+			}
+		}
+	}
+
+	/* If no valid parameters were found, return false */
+	if (best.error_millihz == S64_MAX)
+		return false;
+
+	*pars = best;
+	return true;
+}
+EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_pars, "RZV2H_CPG");
+
+/*
+ * rzv2h_get_pll_divs_pars - Finds the best combination of PLL parameters
+ * and divider value for a given frequency.
+ *
+ * @limits: Pointer to the structure containing the limits for the PLL parameters
+ * @pars: Pointer to the structure where the best calculated PLL parameters and
+ * divider values will be stored
+ * @table: Pointer to the array of valid divider values
+ * @table_size: Size of the divider values array
+ * @freq_millihz: Target output frequency in millihertz
+ *
+ * This function calculates the best set of PLL parameters (M, K, P, S) and divider
+ * value to achieve the desired frequency. See rzv2h_get_pll_pars() for more details
+ * on how the PLL parameters are calculated.
+ *
+ * freq_millihz is the desired frequency generated by the PLL followed by a
+ * gear.
+ */
+bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits,
+			     struct rzv2h_pll_div_pars *pars,
+			     const u8 *table, u8 table_size, u64 freq_millihz)
+{
+	struct rzv2h_pll_div_pars p, best;
+
+	best.div.error_millihz = S64_MAX;
+	p.div.error_millihz = S64_MAX;
+	for (unsigned int i = 0; i < table_size; i++) {
+		if (!rzv2h_get_pll_pars(limits, &p.pll, freq_millihz * table[i]))
+			continue;
+
+		p.div.divider_value = table[i];
+		p.div.freq_millihz = DIV_U64_ROUND_CLOSEST(p.pll.freq_millihz, table[i]);
+		p.div.error_millihz = freq_millihz - p.div.freq_millihz;
+
+		if (p.div.error_millihz == 0) {
+			*pars = p;
+			return true;
+		}
+
+		if (abs(best.div.error_millihz) > abs(p.div.error_millihz))
+			best = p;
+	}
+
+	if (best.div.error_millihz == S64_MAX)
+		return false;
+
+	*pars = best;
+	return true;
+}
+EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_divs_pars, "RZV2H_CPG");
+
+static unsigned long rzv2h_cpg_plldsi_div_recalc_rate(struct clk_hw *hw,
+						      unsigned long parent_rate)
+{
+	struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw);
+	struct rzv2h_cpg_priv *priv = dsi_div->priv;
+	struct ddiv ddiv = dsi_div->ddiv;
+	u32 div;
+
+	div = readl(priv->base + ddiv.offset);
+	div >>= ddiv.shift;
+	div &= clk_div_mask(ddiv.width);
+	div = dsi_div->dtable[div].div;
+
+	return DIV_ROUND_CLOSEST_ULL(parent_rate, div);
+}
+
+static int rzv2h_cpg_plldsi_div_determine_rate(struct clk_hw *hw,
+					       struct clk_rate_request *req)
+{
+	struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw);
+	struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw));
+	struct rzv2h_cpg_priv *priv = dsi_div->priv;
+	u8 table[RZV2H_MAX_DIV_TABLES] = { 0 };
+	struct rzv2h_pll_div_pars *dsi_params;
+	struct rzv2h_pll_dsi_info *dsi_info;
+	const struct clk_div_table *div;
+	unsigned int i = 0;
+	u64 rate_millihz;
+
+	dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance];
+	dsi_params = &dsi_info->pll_dsi_parameters;
+
+	rate_millihz = mul_u32_u32(req->rate, MILLI);
+	if (rate_millihz == dsi_params->div.error_millihz + dsi_params->div.freq_millihz)
+		goto exit_determine_rate;
+
+	for (div = dsi_div->dtable; div->div; div++) {
+		if (i >= RZV2H_MAX_DIV_TABLES)
+			return -EINVAL;
+		table[i++] = div->div;
+	}
+
+	if (!rzv2h_get_pll_divs_pars(dsi_info->pll_dsi_limits, dsi_params, table, i,
+				     rate_millihz)) {
+		dev_err(priv->dev, "failed to determine rate for req->rate: %lu\n",
+			req->rate);
+		return -EINVAL;
+	}
+
+exit_determine_rate:
+	req->rate = DIV_ROUND_CLOSEST_ULL(dsi_params->div.freq_millihz, MILLI);
+	req->best_parent_rate = req->rate * dsi_params->div.divider_value;
+	dsi_info->req_pll_dsi_rate = req->best_parent_rate;
+
+	return 0;
+}
+
+static int rzv2h_cpg_plldsi_div_set_rate(struct clk_hw *hw,
+					 unsigned long rate,
+					 unsigned long parent_rate)
+{
+	struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw);
+	struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw));
+	struct rzv2h_cpg_priv *priv = dsi_div->priv;
+	struct rzv2h_pll_div_pars *dsi_params;
+	struct rzv2h_pll_dsi_info *dsi_info;
+	struct ddiv ddiv = dsi_div->ddiv;
+	const struct clk_div_table *clkt;
+	bool divider_found = false;
+	u32 val, shift;
+
+	dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance];
+	dsi_params = &dsi_info->pll_dsi_parameters;
+
+	for (clkt = dsi_div->dtable; clkt->div; clkt++) {
+		if (clkt->div == dsi_params->div.divider_value) {
+			divider_found = true;
+			break;
+		}
+	}
+
+	if (!divider_found)
+		return -EINVAL;
+
+	shift = ddiv.shift;
+	val = readl(priv->base + ddiv.offset) | DDIV_DIVCTL_WEN(shift);
+	val &= ~(clk_div_mask(ddiv.width) << shift);
+	val |= clkt->val << shift;
+	writel(val, priv->base + ddiv.offset);
+
+	return 0;
+}
+
+static const struct clk_ops rzv2h_cpg_plldsi_div_ops = {
+	.recalc_rate = rzv2h_cpg_plldsi_div_recalc_rate,
+	.determine_rate = rzv2h_cpg_plldsi_div_determine_rate,
+	.set_rate = rzv2h_cpg_plldsi_div_set_rate,
+};
+
+static struct clk * __init
+rzv2h_cpg_plldsi_div_clk_register(const struct cpg_core_clk *core,
+				  struct rzv2h_cpg_priv *priv)
+{
+	struct rzv2h_plldsi_div_clk *clk_hw_data;
+	struct clk **clks = priv->clks;
+	struct clk_init_data init;
+	const struct clk *parent;
+	const char *parent_name;
+	struct clk_hw *clk_hw;
+	int ret;
+
+	parent = clks[core->parent];
+	if (IS_ERR(parent))
+		return ERR_CAST(parent);
+
+	clk_hw_data = devm_kzalloc(priv->dev, sizeof(*clk_hw_data), GFP_KERNEL);
+	if (!clk_hw_data)
+		return ERR_PTR(-ENOMEM);
+
+	clk_hw_data->priv = priv;
+	clk_hw_data->ddiv = core->cfg.ddiv;
+	clk_hw_data->dtable = core->dtable;
+
+	parent_name = __clk_get_name(parent);
+	init.name = core->name;
+	init.ops = &rzv2h_cpg_plldsi_div_ops;
+	init.flags = core->flag;
+	init.parent_names = &parent_name;
+	init.num_parents = 1;
+
+	clk_hw = &clk_hw_data->hw;
+	clk_hw->init = &init;
+
+	ret = devm_clk_hw_register(priv->dev, clk_hw);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return clk_hw->clk;
+}
+
+static int rzv2h_cpg_plldsi_determine_rate(struct clk_hw *hw,
+					   struct clk_rate_request *req)
+{
+	struct pll_clk *pll_clk = to_pll(hw);
+	struct rzv2h_cpg_priv *priv = pll_clk->priv;
+	struct rzv2h_pll_dsi_info *dsi_info;
+	u64 rate_millihz;
+
+	dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance];
+	/* check if the divider has already invoked the algorithm */
+	if (req->rate == dsi_info->req_pll_dsi_rate)
+		return 0;
+
+	/* If the req->rate doesn't match we do the calculation assuming there is no divider */
+	rate_millihz = mul_u32_u32(req->rate, MILLI);
+	if (!rzv2h_get_pll_pars(dsi_info->pll_dsi_limits,
+				&dsi_info->pll_dsi_parameters.pll, rate_millihz)) {
+		dev_err(priv->dev,
+			"failed to determine rate for req->rate: %lu\n",
+			req->rate);
+		return -EINVAL;
+	}
+
+	req->rate = DIV_ROUND_CLOSEST_ULL(dsi_info->pll_dsi_parameters.pll.freq_millihz, MILLI);
+	dsi_info->req_pll_dsi_rate = req->rate;
+
+	return 0;
+}
+
+static int rzv2h_cpg_pll_set_rate(struct pll_clk *pll_clk,
+				  struct rzv2h_pll_pars *params,
+				  bool ssc_disable)
+{
+	struct rzv2h_cpg_priv *priv = pll_clk->priv;
+	u16 offset = pll_clk->pll.offset;
+	u32 val;
+	int ret;
+
+	/* Put PLL into standby mode */
+	writel(CPG_PLL_STBY_RESETB_WEN, priv->base + CPG_PLL_STBY(offset));
+	ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset),
+					val, !(val & CPG_PLL_MON_LOCK),
+					100, 2000);
+	if (ret) {
+		dev_err(priv->dev, "Failed to put PLLDSI into standby mode");
+		return ret;
+	}
+
+	/* Output clock setting 1 */
+	writel(FIELD_PREP(CPG_PLL_CLK1_KDIV, (u16)params->k) |
+	       FIELD_PREP(CPG_PLL_CLK1_MDIV, params->m) |
+	       FIELD_PREP(CPG_PLL_CLK1_PDIV, params->p),
+	       priv->base + CPG_PLL_CLK1(offset));
+
+	/* Output clock setting 2 */
+	val = readl(priv->base + CPG_PLL_CLK2(offset));
+	writel((val & ~CPG_PLL_CLK2_SDIV) | FIELD_PREP(CPG_PLL_CLK2_SDIV, params->s),
+	       priv->base + CPG_PLL_CLK2(offset));
+
+	/* Put PLL to normal mode */
+	if (ssc_disable)
+		val = CPG_PLL_STBY_SSC_EN_WEN;
+	else
+		val = CPG_PLL_STBY_SSC_EN_WEN | CPG_PLL_STBY_SSC_EN;
+	writel(val | CPG_PLL_STBY_RESETB_WEN | CPG_PLL_STBY_RESETB,
+	       priv->base + CPG_PLL_STBY(offset));
+
+	/* PLL normal mode transition, output clock stability check */
+	ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset),
+					val, (val & CPG_PLL_MON_LOCK),
+					100, 2000);
+	if (ret) {
+		dev_err(priv->dev, "Failed to put PLLDSI into normal mode");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rzv2h_cpg_plldsi_set_rate(struct clk_hw *hw, unsigned long rate,
+				     unsigned long parent_rate)
+{
+	struct pll_clk *pll_clk = to_pll(hw);
+	struct rzv2h_pll_dsi_info *dsi_info;
+	struct rzv2h_cpg_priv *priv = pll_clk->priv;
+
+	dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance];
+
+	return rzv2h_cpg_pll_set_rate(pll_clk, &dsi_info->pll_dsi_parameters.pll, true);
+}
+
 static int rzv2h_cpg_pll_clk_is_enabled(struct clk_hw *hw)
 {
 	struct pll_clk *pll_clk = to_pll(hw);
@@ -238,6 +719,12 @@ static unsigned long rzv2h_cpg_pll_clk_recalc_rate(struct clk_hw *hw,
 	return DIV_ROUND_CLOSEST_ULL(rate, FIELD_GET(CPG_PLL_CLK1_PDIV, clk1));
 }
 
+static const struct clk_ops rzv2h_cpg_plldsi_ops = {
+	.recalc_rate = rzv2h_cpg_pll_clk_recalc_rate,
+	.determine_rate = rzv2h_cpg_plldsi_determine_rate,
+	.set_rate = rzv2h_cpg_plldsi_set_rate,
+};
+
 static const struct clk_ops rzv2h_cpg_pll_ops = {
 	.is_enabled = rzv2h_cpg_pll_clk_is_enabled,
 	.enable = rzv2h_cpg_pll_clk_enable,
@@ -264,6 +751,10 @@ rzv2h_cpg_pll_clk_register(const struct cpg_core_clk *core,
 	if (!pll_clk)
 		return ERR_PTR(-ENOMEM);
 
+	if (core->type == CLK_TYPE_PLLDSI)
+		priv->pll_dsi_info[core->cfg.pll.instance].pll_dsi_limits =
+			core->cfg.pll.limits;
+
 	parent_name = __clk_get_name(parent);
 	init.name = core->name;
 	init.ops = ops;
@@ -588,6 +1079,12 @@ rzv2h_cpg_register_core_clk(const struct cpg_core_clk *core,
 	case CLK_TYPE_SMUX:
 		clk = rzv2h_cpg_mux_clk_register(core, priv);
 		break;
+	case CLK_TYPE_PLLDSI:
+		clk = rzv2h_cpg_pll_clk_register(core, priv, &rzv2h_cpg_plldsi_ops);
+		break;
+	case CLK_TYPE_PLLDSI_DIV:
+		clk = rzv2h_cpg_plldsi_div_clk_register(core, priv);
+		break;
 	default:
 		goto fail;
 	}
diff --git a/drivers/clk/renesas/rzv2h-cpg.h b/drivers/clk/renesas/rzv2h-cpg.h
index e2053049c299..637803bc1e89 100644
--- a/drivers/clk/renesas/rzv2h-cpg.h
+++ b/drivers/clk/renesas/rzv2h-cpg.h
@@ -22,15 +22,20 @@ struct pll {
 	unsigned int offset:9;
 	unsigned int has_clkn:1;
 	unsigned int instance:2;
+	const struct rzv2h_pll_limits *limits;
 };
 
-#define PLL_PACK(_offset, _has_clkn, _instance) \
+#define PLL_PACK_LIMITS(_offset, _has_clkn, _instance, _limits) \
 	((struct pll){ \
 		.offset = _offset, \
 		.has_clkn = _has_clkn, \
-		.instance = _instance \
+		.instance = _instance, \
+		.limits = _limits \
 	})
 
+#define PLL_PACK(_offset, _has_clkn, _instance) \
+	PLL_PACK_LIMITS(_offset, _has_clkn, _instance, NULL)
+
 #define PLLCA55		PLL_PACK(0x60, 1, 0)
 #define PLLGPU		PLL_PACK(0x120, 1, 0)
 
@@ -191,6 +196,8 @@ enum clk_types {
 	CLK_TYPE_PLL,
 	CLK_TYPE_DDIV,		/* Dynamic Switching Divider */
 	CLK_TYPE_SMUX,		/* Static Mux */
+	CLK_TYPE_PLLDSI,	/* PLLDSI */
+	CLK_TYPE_PLLDSI_DIV,	/* PLLDSI divider */
 };
 
 #define DEF_TYPE(_name, _id, _type...) \
@@ -221,6 +228,14 @@ enum clk_types {
 		 .num_parents = ARRAY_SIZE(_parent_names), \
 		 .flag = CLK_SET_RATE_PARENT, \
 		 .mux_flags = CLK_MUX_HIWORD_MASK)
+#define DEF_PLLDSI(_name, _id, _parent, _pll_packed) \
+	DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI, .parent = _parent, .cfg.pll = _pll_packed)
+#define DEF_PLLDSI_DIV(_name, _id, _parent, _ddiv_packed, _dtable) \
+	DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI_DIV, \
+		 .cfg.ddiv = _ddiv_packed, \
+		 .dtable = _dtable, \
+		 .parent = _parent, \
+		 .flag = CLK_SET_RATE_PARENT)
 
 /**
  * struct rzv2h_mod_clk - Module Clocks definitions
diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h
index 0ebbe2f0b45e..69d8159deee3 100644
--- a/include/linux/clk/renesas.h
+++ b/include/linux/clk/renesas.h
@@ -10,7 +10,9 @@
 #ifndef __LINUX_CLK_RENESAS_H_
 #define __LINUX_CLK_RENESAS_H_
 
+#include <linux/clk-provider.h>
 #include <linux/types.h>
+#include <linux/units.h>
 
 struct device;
 struct device_node;
@@ -32,4 +34,147 @@ void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev);
 #define cpg_mssr_attach_dev	NULL
 #define cpg_mssr_detach_dev	NULL
 #endif
+
+/**
+ * struct rzv2h_pll_limits - PLL parameter constraints
+ *
+ * This structure defines the minimum and maximum allowed values for
+ * various parameters used to configure a PLL. These limits ensure
+ * the PLL operates within valid and stable ranges.
+ *
+ * @fout: Output frequency range (in Hz)
+ * @fout.min: Minimum allowed output frequency
+ * @fout.max: Maximum allowed output frequency
+ *
+ * @fvco: PLL oscillation frequency range (in Hz)
+ * @fvco.min: Minimum allowed VCO frequency
+ * @fvco.max: Maximum allowed VCO frequency
+ *
+ * @m: Main-divider range
+ * @m.min: Minimum main-divider value
+ * @m.max: Maximum main-divider value
+ *
+ * @p: Pre-divider range
+ * @p.min: Minimum pre-divider value
+ * @p.max: Maximum pre-divider value
+ *
+ * @s: Divider range
+ * @s.min: Minimum divider value
+ * @s.max: Maximum divider value
+ *
+ * @k: Delta-sigma modulator range (signed)
+ * @k.min: Minimum delta-sigma value
+ * @k.max: Maximum delta-sigma value
+ */
+struct rzv2h_pll_limits {
+	struct {
+		u32 min;
+		u32 max;
+	} fout;
+
+	struct {
+		u32 min;
+		u32 max;
+	} fvco;
+
+	struct {
+		u16 min;
+		u16 max;
+	} m;
+
+	struct {
+		u8 min;
+		u8 max;
+	} p;
+
+	struct {
+		u8 min;
+		u8 max;
+	} s;
+
+	struct {
+		s16 min;
+		s16 max;
+	} k;
+};
+
+/**
+ * struct rzv2h_pll_pars - PLL configuration parameters
+ *
+ * This structure contains the configuration parameters for the
+ * Phase-Locked Loop (PLL), used to achieve a specific output frequency.
+ *
+ * @m: Main divider value
+ * @p: Pre-divider value
+ * @s: Output divider value
+ * @k: Delta-sigma modulation value
+ * @freq_millihz: Calculated PLL output frequency in millihertz
+ * @error_millihz: Frequency error from target in millihertz (signed)
+ */
+struct rzv2h_pll_pars {
+	u16 m;
+	u8 p;
+	u8 s;
+	s16 k;
+	u64 freq_millihz;
+	s64 error_millihz;
+};
+
+/**
+ * struct rzv2h_pll_div_pars - PLL parameters with post-divider
+ *
+ * This structure is used for PLLs that include an additional post-divider
+ * stage after the main PLL block. It contains both the PLL configuration
+ * parameters and the resulting frequency/error values after the divider.
+ *
+ * @pll: Main PLL configuration parameters (see struct rzv2h_pll_pars)
+ *
+ * @div: Post-divider configuration and result
+ * @div.divider_value: Divider applied to the PLL output
+ * @div.freq_millihz: Output frequency after divider in millihertz
+ * @div.error_millihz: Frequency error from target in millihertz (signed)
+ */
+struct rzv2h_pll_div_pars {
+	struct rzv2h_pll_pars pll;
+	struct {
+		u8 divider_value;
+		u64 freq_millihz;
+		s64 error_millihz;
+	} div;
+};
+
+#define RZV2H_CPG_PLL_DSI_LIMITS(name)					\
+	static const struct rzv2h_pll_limits (name) = {			\
+		.fout = { .min = 25 * MEGA, .max = 375 * MEGA },	\
+		.fvco = { .min = 1600 * MEGA, .max = 3200 * MEGA },	\
+		.m = { .min = 64, .max = 533 },				\
+		.p = { .min = 1, .max = 4 },				\
+		.s = { .min = 0, .max = 6 },				\
+		.k = { .min = -32768, .max = 32767 },			\
+	}								\
+
+#ifdef CONFIG_CLK_RZV2H
+bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits,
+			struct rzv2h_pll_pars *pars, u64 freq_millihz);
+
+bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits,
+			     struct rzv2h_pll_div_pars *pars,
+			     const u8 *table, u8 table_size, u64 freq_millihz);
+#else
+static inline bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits,
+				      struct rzv2h_pll_pars *pars,
+				      u64 freq_millihz)
+{
+	return false;
+}
+
+static inline bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits,
+					   struct rzv2h_pll_div_pars *pars,
+					   const u8 *table, u8 table_size,
+					   u64 freq_millihz)
+{
+	return false;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 77a58ba7c64ccca20616aa03599766ccb0d1a330 Mon Sep 17 00:00:00 2001
From: Sean Anderson <sean.anderson@linux.dev>
Date: Tue, 21 Oct 2025 10:47:03 -0400
Subject: spi: spi-mem: Trace exec_op

The spi subsystem has tracing, which is very convenient when debugging
problems. Add tracing for spi-mem too so that accesses that skip the spi
subsystem can still be seen.

The format is roughly based on the existing spi tracing. We don't bother
tracing the op's address because the tracing happens while the memory is
locked, so there can be no confusion about the matching of start and
stop. The conversion of cmd/addr/dummy to an array is directly analogous
to the conversion in the latter half of spi_mem_exec_op.

Signed-off-by: Sean Anderson <sean.anderson@linux.dev>
Link: https://patch.msgid.link/20251021144702.1582397-1-sean.anderson@linux.dev
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 MAINTAINERS                    |   1 +
 drivers/spi/spi-mem.c          |   5 ++
 include/trace/events/spi-mem.h | 106 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)
 create mode 100644 include/trace/events/spi-mem.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..f5095fa88370 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24226,6 +24226,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git
 F:	Documentation/devicetree/bindings/spi/
 F:	Documentation/spi/
 F:	drivers/spi/
+F:	include/trace/events/spi*
 F:	include/linux/spi/
 F:	include/uapi/linux/spi/
 F:	tools/spi/
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index 064b99204d9a..c8b2add2640e 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -12,6 +12,9 @@
 #include <linux/spi/spi-mem.h>
 #include <linux/sched/task_stack.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/spi-mem.h>
+
 #include "internals.h"
 
 #define SPI_MEM_MAX_BUSWIDTH		8
@@ -403,7 +406,9 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 		if (ret)
 			return ret;
 
+		trace_spi_mem_start_op(mem, op);
 		ret = ctlr->mem_ops->exec_op(mem, op);
+		trace_spi_mem_stop_op(mem, op);
 
 		spi_mem_access_end(mem);
 
diff --git a/include/trace/events/spi-mem.h b/include/trace/events/spi-mem.h
new file mode 100644
index 000000000000..d13f0bcff5e7
--- /dev/null
+++ b/include/trace/events/spi-mem.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM spi-mem
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR spi_mem
+
+#if !defined(_TRACE_SPI_MEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SPI_MEM_H
+
+#include <linux/tracepoint.h>
+#include <linux/spi/spi-mem.h>
+
+#define decode_dtr(dtr) \
+	__print_symbolic(dtr, \
+		{ 0, "S" }, \
+		{ 1, "D" })
+
+TRACE_EVENT(spi_mem_start_op,
+	TP_PROTO(struct spi_mem *mem, const struct spi_mem_op *op),
+	TP_ARGS(mem, op),
+
+	TP_STRUCT__entry(
+		__string(name, mem->name)
+		__dynamic_array(u8, op, 1 + op->addr.nbytes + op->dummy.nbytes)
+		__dynamic_array(u8, data, op->data.dir == SPI_MEM_DATA_OUT ?
+					  min(op->data.nbytes, 64) : 0)
+		__field(u32, data_len)
+		__field(u32, max_freq)
+		__field(u8, cmd_buswidth)
+		__field(bool, cmd_dtr)
+		__field(u8, addr_buswidth)
+		__field(bool, addr_dtr)
+		__field(u8, dummy_nbytes)
+		__field(u8, data_buswidth)
+		__field(bool, data_dtr)
+	),
+
+	TP_fast_assign(
+		int i;
+
+		__assign_str(name);
+		__entry->max_freq = op->max_freq ?: mem->spi->max_speed_hz;
+
+		__entry->cmd_buswidth = op->cmd.buswidth;
+		__entry->cmd_dtr = op->cmd.dtr;
+		*((u8 *)__get_dynamic_array(op)) = op->cmd.opcode;
+
+		__entry->addr_buswidth = op->addr.buswidth;
+		__entry->addr_dtr = op->addr.dtr;
+		for (i = 0; i < op->addr.nbytes; i++)
+			((u8 *)__get_dynamic_array(op))[i + 1] =
+				op->addr.val >> (8 * (op->addr.nbytes - i - 1));
+
+		memset(((u8 *)__get_dynamic_array(op)) + op->addr.nbytes + 1,
+		       0xff, op->dummy.nbytes);
+
+		__entry->data_len = op->data.nbytes;
+		__entry->data_buswidth = op->data.buswidth;
+		__entry->data_dtr = op->data.dtr;
+		if (op->data.dir == SPI_MEM_DATA_OUT)
+			memcpy(__get_dynamic_array(data), op->data.buf.out,
+			       __get_dynamic_array_len(data));
+	),
+
+	TP_printk("%s %u%s-%u%s-%u%s @%u Hz op=[%*phD] len=%u tx=[%*phD]",
+		__get_str(name),
+		__entry->cmd_buswidth, decode_dtr(__entry->cmd_dtr),
+		__entry->addr_buswidth, decode_dtr(__entry->addr_dtr),
+		__entry->data_buswidth, decode_dtr(__entry->data_dtr),
+		__entry->max_freq,
+		__get_dynamic_array_len(op), __get_dynamic_array(op),
+		__entry->data_len,
+		__get_dynamic_array_len(data), __get_dynamic_array(data))
+);
+
+TRACE_EVENT(spi_mem_stop_op,
+	TP_PROTO(struct spi_mem *mem, const struct spi_mem_op *op),
+	TP_ARGS(mem, op),
+
+	TP_STRUCT__entry(
+		__string(name, mem->name)
+		__dynamic_array(u8, data, op->data.dir == SPI_MEM_DATA_IN ?
+					  min(op->data.nbytes, 64) : 0)
+		__field(u32, data_len)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->data_len = op->data.nbytes;
+		if (op->data.dir == SPI_MEM_DATA_IN)
+			memcpy(__get_dynamic_array(data), op->data.buf.in,
+			       __get_dynamic_array_len(data));
+	),
+
+	TP_printk("%s len=%u rx=[%*phD]",
+		__get_str(name),
+		__entry->data_len,
+		__get_dynamic_array_len(data), __get_dynamic_array(data))
+);
+
+
+#endif /* _TRACE_SPI_MEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 812df545e3e44051d7fd39c057e53ffb56868451 Mon Sep 17 00:00:00 2001
From: Zhengnan Chen <zhengnan.chen@mediatek.com>
Date: Sat, 18 Oct 2025 21:26:10 +0800
Subject: dt-bindings: mediatek: mt8189: Add bindings for MM & APU & INFRA
 IOMMU

There are three IOMMUs in total, namely MM_IOMMU, APU_IOMMU and INFRA_IOMMU.
Add bindings for them.

Signed-off-by: Zhengnan Chen <zhengnan.chen@mediatek.com>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 .../devicetree/bindings/iommu/mediatek,iommu.yaml  |   8 +
 .../memory/mediatek,mt8189-memory-port.h           | 283 +++++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 include/dt-bindings/memory/mediatek,mt8189-memory-port.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
index f49ed8ac4776..79c573c47b08 100644
--- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
@@ -82,6 +82,9 @@ properties:
           - mediatek,mt8188-iommu-vdo        # generation two
           - mediatek,mt8188-iommu-vpp        # generation two
           - mediatek,mt8188-iommu-infra      # generation two
+          - mediatek,mt8189-iommu-apu        # generation two
+          - mediatek,mt8189-iommu-infra      # generation two
+          - mediatek,mt8189-iommu-mm         # generation two
           - mediatek,mt8192-m4u  # generation two
           - mediatek,mt8195-iommu-vdo        # generation two
           - mediatek,mt8195-iommu-vpp        # generation two
@@ -128,6 +131,7 @@ properties:
       This is the mtk_m4u_id according to the HW. Specifies the mtk_m4u_id as
       defined in
       dt-binding/memory/mediatek,mt8188-memory-port.h for mt8188,
+      dt-binding/memory/mediatek,mt8189-memory-port.h for mt8189,
       dt-binding/memory/mt2701-larb-port.h for mt2701 and mt7623,
       dt-binding/memory/mt2712-larb-port.h for mt2712,
       dt-binding/memory/mt6779-larb-port.h for mt6779,
@@ -164,6 +168,7 @@ allOf:
               - mediatek,mt8186-iommu-mm
               - mediatek,mt8188-iommu-vdo
               - mediatek,mt8188-iommu-vpp
+              - mediatek,mt8189-iommu-mm
               - mediatek,mt8192-m4u
               - mediatek,mt8195-iommu-vdo
               - mediatek,mt8195-iommu-vpp
@@ -180,6 +185,7 @@ allOf:
             - mediatek,mt8186-iommu-mm
             - mediatek,mt8188-iommu-vdo
             - mediatek,mt8188-iommu-vpp
+            - mediatek,mt8189-iommu-mm
             - mediatek,mt8192-m4u
             - mediatek,mt8195-iommu-vdo
             - mediatek,mt8195-iommu-vpp
@@ -208,6 +214,8 @@ allOf:
             contains:
               enum:
                 - mediatek,mt8188-iommu-infra
+                - mediatek,mt8189-iommu-apu
+                - mediatek,mt8189-iommu-infra
                 - mediatek,mt8195-iommu-infra
 
     then:
diff --git a/include/dt-bindings/memory/mediatek,mt8189-memory-port.h b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h
new file mode 100644
index 000000000000..849fead3d0f7
--- /dev/null
+++ b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2025 MediaTek Inc.
+ * Author: Zhengnan chen <zhengnan.chen@mediatek.com>
+ */
+#ifndef _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_
+#define _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_
+
+#include <dt-bindings/memory/mtk-memory-port.h>
+
+#define SMI_L0_ID		(0)
+#define SMI_L1_ID		(1)
+#define SMI_L2_ID		(2)
+#define SMI_L4_ID		(3)
+#define SMI_L7_ID		(4)
+#define SMI_L9_ID		(5)
+#define SMI_L11_ID		(6)
+#define SMI_L13_ID		(7)
+#define SMI_L14_ID		(8)
+#define SMI_L16_ID		(9)
+#define SMI_L17_ID		(10)
+#define SMI_L19_ID		(11)
+#define SMI_L20_ID		(12)
+
+/*
+ * MM IOMMU supports a 16GB DMA address space. It is split into four ranges:
+ * 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G. A master may be placed in any one
+ * of these ranges, BUT:
+ * a) Make sure all the ports inside a larb are in one range.
+ * b) The iova of any master can NOT cross the 4G/8G/12G boundary.
+ *
+ * This is the suggested mapping in this SoC:
+ *
+ * modules		dma-address-region	larbs-ports
+ * disp/mdp		0 ~ 4G			larb0/1/2
+ * vcodec		4G ~ 8G                 larb4/7
+ * imgsys/cam/ipesys	8G ~ 12G                the other larbs.
+ * N/A			12G ~ 16G
+ */
+
+/* Larb0 -- disp */
+#define M4U_L0_P0_DISP_OVL0_4L_HDR		MTK_M4U_ID(SMI_L0_ID, 0)
+#define M4U_L0_P1_DISP_OVL0_4L_RDMA0		MTK_M4U_ID(SMI_L0_ID, 1)
+#define M4U_L0_P2_DISP_OVL1_4L_RDMA1		MTK_M4U_ID(SMI_L0_ID, 2)
+#define M4U_L0_P3_DISP_OVL0_4L_RDMA2		MTK_M4U_ID(SMI_L0_ID, 3)
+#define M4U_L0_P4_DISP_OVL1_4L_RDMA3		MTK_M4U_ID(SMI_L0_ID, 4)
+#define M4U_L0_P5_DISP_RDMA0			MTK_M4U_ID(SMI_L0_ID, 5)
+#define M4U_L0_P6_DISP_WDMA0			MTK_M4U_ID(SMI_L0_ID, 6)
+#define M4U_L0_P7_DISP_FAKE_ENG0		MTK_M4U_ID(SMI_L0_ID, 7)
+
+/* Larb1 -- disp */
+#define M4U_L1_P0_DISP_OVL1_4L_HDR		MTK_M4U_ID(SMI_L1_ID, 0)
+#define M4U_L1_P1_DISP_OVL1_4L_RDMA0		MTK_M4U_ID(SMI_L1_ID, 1)
+#define M4U_L1_P2_DISP_OVL0_4L_RDMA1		MTK_M4U_ID(SMI_L1_ID, 2)
+#define M4U_L1_P3_DISP_OVL1_4L_RDMA2		MTK_M4U_ID(SMI_L1_ID, 3)
+#define M4U_L1_P4_DISP_OVL0_4L_RDMA3		MTK_M4U_ID(SMI_L1_ID, 4)
+#define M4U_L1_P5_DISP_RDMA1			MTK_M4U_ID(SMI_L1_ID, 5)
+#define M4U_L1_P6_DISP_WDMA1			MTK_M4U_ID(SMI_L1_ID, 6)
+#define M4U_L1_P7_DISP_FAKE_ENG1		MTK_M4U_ID(SMI_L1_ID, 7)
+
+/* Larb2 -- mmlsys(mdp) */
+#define M4U_L2_P0_MDP_RDMA0			MTK_M4U_ID(SMI_L2_ID, 0)
+#define M4U_L2_P1_MDP_RDMA1			MTK_M4U_ID(SMI_L2_ID, 1)
+#define M4U_L2_P2_MDP_WROT0			MTK_M4U_ID(SMI_L2_ID, 2)
+#define M4U_L2_P3_MDP_WROT1			MTK_M4U_ID(SMI_L2_ID, 3)
+#define M4U_L2_P4_MDP_DUMMY0			MTK_M4U_ID(SMI_L2_ID, 4)
+#define M4U_L2_P5_MDP_DUMMY1			MTK_M4U_ID(SMI_L2_ID, 5)
+#define M4U_L2_P6_MDP_RDMA2			MTK_M4U_ID(SMI_L2_ID, 6)
+#define M4U_L2_P7_MDP_RDMA3			MTK_M4U_ID(SMI_L2_ID, 7)
+#define M4U_L2_P8_MDP_WROT2			MTK_M4U_ID(SMI_L2_ID, 8)
+#define M4U_L2_P9_MDP_WROT3			MTK_M4U_ID(SMI_L2_ID, 9)
+#define M4U_L2_P10_DISP_FAKE0			MTK_M4U_ID(SMI_L2_ID, 10)
+
+/* Larb3: null */
+
+/* Larb4 -- vdec */
+#define M4U_L4_P0_HW_VDEC_MC_EXT		MTK_M4U_ID(SMI_L4_ID, 0)
+#define M4U_L4_P1_HW_VDEC_UFO_EXT		MTK_M4U_ID(SMI_L4_ID, 1)
+#define M4U_L4_P2_HW_VDEC_PP_EXT		MTK_M4U_ID(SMI_L4_ID, 2)
+#define M4U_L4_P3_HW_VDEC_PRED_RD_EXT		MTK_M4U_ID(SMI_L4_ID, 3)
+#define M4U_L4_P4_HW_VDEC_PRED_WR_EXT		MTK_M4U_ID(SMI_L4_ID, 4)
+#define M4U_L4_P5_HW_VDEC_PPWRAP_EXT		MTK_M4U_ID(SMI_L4_ID, 5)
+#define M4U_L4_P6_HW_VDEC_TILE_EXT		MTK_M4U_ID(SMI_L4_ID, 6)
+#define M4U_L4_P7_HW_VDEC_VLD_EXT		MTK_M4U_ID(SMI_L4_ID, 7)
+#define M4U_L4_P8_HW_VDEC_VLD2_EXT		MTK_M4U_ID(SMI_L4_ID, 8)
+#define M4U_L4_P9_HW_VDEC_AVC_MV_EXT		MTK_M4U_ID(SMI_L4_ID, 9)
+#define M4U_L4_P10_HW_VDEC_RG_CTRL_DMA_EXT	MTK_M4U_ID(SMI_L4_ID, 10)
+#define M4U_L4_P11_HW_VDEC_UFO_ENC_EXT		MTK_M4U_ID(SMI_L4_ID, 11)
+
+/* Larb5: null */
+
+/* Larb6: null */
+
+/* Larb7 -- venc */
+#define M4U_L7_P0_VENC_RCPU			MTK_M4U_ID(SMI_L7_ID, 0)
+#define M4U_L7_P1_VENC_REC			MTK_M4U_ID(SMI_L7_ID, 1)
+#define M4U_L7_P2_VENC_BSDMA			MTK_M4U_ID(SMI_L7_ID, 2)
+#define M4U_L7_P3_VENC_SV_COMV			MTK_M4U_ID(SMI_L7_ID, 3)
+#define M4U_L7_P4_VENC_RD_COMV			MTK_M4U_ID(SMI_L7_ID, 4)
+#define M4U_L7_P5_JPGENC_Y_RDMA			MTK_M4U_ID(SMI_L7_ID, 5)
+#define M4U_L7_P6_JPGENC_C_RDMA			MTK_M4U_ID(SMI_L7_ID, 6)
+#define M4U_L7_P7_JPGENC_Q_RDMA			MTK_M4U_ID(SMI_L7_ID, 7)
+#define M4U_L7_P8_VENC_SUB_W_LUMA		MTK_M4U_ID(SMI_L7_ID, 8)
+#define M4U_L7_P9_JPGENC_BSDMA			MTK_M4U_ID(SMI_L7_ID, 9)
+#define M4U_L7_P10_VENC_CUR_LUMA		MTK_M4U_ID(SMI_L7_ID, 10)
+#define M4U_L7_P11_VENC_CUR_CHROMA		MTK_M4U_ID(SMI_L7_ID, 11)
+#define M4U_L7_P12_VENC_REF_LUMA		MTK_M4U_ID(SMI_L7_ID, 12)
+#define M4U_L7_P13_VENC_REF_CHROMA		MTK_M4U_ID(SMI_L7_ID, 13)
+#define M4U_L7_P14_VENC_SUB_R_LUMA		MTK_M4U_ID(SMI_L7_ID, 14)
+#define M4U_L7_P15_JPGDEC_WDMA			MTK_M4U_ID(SMI_L7_ID, 15)
+#define M4U_L7_P16_JPGDEC_BSDMA			MTK_M4U_ID(SMI_L7_ID, 16)
+#define M4U_L7_P17_JPGDEC_HUFF_OFFSET		MTK_M4U_ID(SMI_L7_ID, 17)
+
+/* Larb8: null */
+
+/* Larb9 -- imgsys */
+#define M4U_L9_P0_IMGI_D1			MTK_M4U_ID(SMI_L9_ID, 0)
+#define M4U_L9_P1_IMGBI_D1			MTK_M4U_ID(SMI_L9_ID, 1)
+#define M4U_L9_P2_DMGI_D1			MTK_M4U_ID(SMI_L9_ID, 2)
+#define M4U_L9_P3_DEPI_D1			MTK_M4U_ID(SMI_L9_ID, 3)
+#define M4U_L9_P4_LCE_D1			MTK_M4U_ID(SMI_L9_ID, 4)
+#define M4U_L9_P5_SMTI_D1			MTK_M4U_ID(SMI_L9_ID, 5)
+#define M4U_L9_P6_SMTO_D2			MTK_M4U_ID(SMI_L9_ID, 6)
+#define M4U_L9_P7_SMTO_D1			MTK_M4U_ID(SMI_L9_ID, 7)
+#define M4U_L9_P8_CRZO_D1			MTK_M4U_ID(SMI_L9_ID, 8)
+#define M4U_L9_P9_IMG3O_D1			MTK_M4U_ID(SMI_L9_ID, 9)
+#define M4U_L9_P10_VIPI_D1			MTK_M4U_ID(SMI_L9_ID, 10)
+#define M4U_L9_P11_SMTI_D5			MTK_M4U_ID(SMI_L9_ID, 11)
+#define M4U_L9_P12_TIMGO_D1			MTK_M4U_ID(SMI_L9_ID, 12)
+#define M4U_L9_P13_UFBC_W0			MTK_M4U_ID(SMI_L9_ID, 13)
+#define M4U_L9_P14_UFBC_R0			MTK_M4U_ID(SMI_L9_ID, 14)
+#define M4U_L9_P15_WPE_RDMA1			MTK_M4U_ID(SMI_L9_ID, 15)
+#define M4U_L9_P16_WPE_RDMA0			MTK_M4U_ID(SMI_L9_ID, 16)
+#define M4U_L9_P17_WPE_WDMA			MTK_M4U_ID(SMI_L9_ID, 17)
+#define M4U_L9_P18_MFB_RDMA0			MTK_M4U_ID(SMI_L9_ID, 18)
+#define M4U_L9_P19_MFB_RDMA1			MTK_M4U_ID(SMI_L9_ID, 19)
+#define M4U_L9_P20_MFB_RDMA2			MTK_M4U_ID(SMI_L9_ID, 20)
+#define M4U_L9_P21_MFB_RDMA3			MTK_M4U_ID(SMI_L9_ID, 21)
+#define M4U_L9_P22_MFB_RDMA4			MTK_M4U_ID(SMI_L9_ID, 22)
+#define M4U_L9_P23_MFB_RDMA5			MTK_M4U_ID(SMI_L9_ID, 23)
+#define M4U_L9_P24_MFB_WDMA0			MTK_M4U_ID(SMI_L9_ID, 24)
+#define M4U_L9_P25_MFB_WDMA1			MTK_M4U_ID(SMI_L9_ID, 25)
+#define M4U_L9_P26_RESERVE6			MTK_M4U_ID(SMI_L9_ID, 26)
+#define M4U_L9_P27_RESERVE7			MTK_M4U_ID(SMI_L9_ID, 27)
+#define M4U_L9_P28_RESERVE8			MTK_M4U_ID(SMI_L9_ID, 28)
+
+/* Larb10: null */
+
+/* Larb11 -- imgsys */
+#define M4U_L11_P0_IMGI_D1			MTK_M4U_ID(SMI_L11_ID, 0)
+#define M4U_L11_P1_IMGBI_D1			MTK_M4U_ID(SMI_L11_ID, 1)
+#define M4U_L11_P2_DMGI_D1			MTK_M4U_ID(SMI_L11_ID, 2)
+#define M4U_L11_P3_DEPI_D1			MTK_M4U_ID(SMI_L11_ID, 3)
+#define M4U_L11_P4_LCE_D1			MTK_M4U_ID(SMI_L11_ID, 4)
+#define M4U_L11_P5_SMTI_D1			MTK_M4U_ID(SMI_L11_ID, 5)
+#define M4U_L11_P6_SMTO_D2			MTK_M4U_ID(SMI_L11_ID, 6)
+#define M4U_L11_P7_SMTO_D1			MTK_M4U_ID(SMI_L11_ID, 7)
+#define M4U_L11_P8_CRZO_D1			MTK_M4U_ID(SMI_L11_ID, 8)
+#define M4U_L11_P9_IMG3O_D1			MTK_M4U_ID(SMI_L11_ID, 9)
+#define M4U_L11_P10_VIPI_D1			MTK_M4U_ID(SMI_L11_ID, 10)
+#define M4U_L11_P11_SMTI_D5			MTK_M4U_ID(SMI_L11_ID, 11)
+#define M4U_L11_P12_TIMGO_D1			MTK_M4U_ID(SMI_L11_ID, 12)
+#define M4U_L11_P13_UFBC_W0			MTK_M4U_ID(SMI_L11_ID, 13)
+#define M4U_L11_P14_UFBC_R0			MTK_M4U_ID(SMI_L11_ID, 14)
+#define M4U_L11_P15_WPE_RDMA1			MTK_M4U_ID(SMI_L11_ID, 15)
+#define M4U_L11_P16_WPE_RDMA0			MTK_M4U_ID(SMI_L11_ID, 16)
+#define M4U_L11_P17_WPE_WDMA			MTK_M4U_ID(SMI_L11_ID, 17)
+#define M4U_L11_P18_MFB_RDMA0			MTK_M4U_ID(SMI_L11_ID, 18)
+#define M4U_L11_P19_MFB_RDMA1			MTK_M4U_ID(SMI_L11_ID, 19)
+#define M4U_L11_P20_MFB_RDMA2			MTK_M4U_ID(SMI_L11_ID, 20)
+#define M4U_L11_P21_MFB_RDMA3			MTK_M4U_ID(SMI_L11_ID, 21)
+#define M4U_L11_P22_MFB_RDMA4			MTK_M4U_ID(SMI_L11_ID, 22)
+#define M4U_L11_P23_MFB_RDMA5			MTK_M4U_ID(SMI_L11_ID, 23)
+#define M4U_L11_P24_MFB_WDMA0			MTK_M4U_ID(SMI_L11_ID, 24)
+#define M4U_L11_P25_MFB_WDMA1			MTK_M4U_ID(SMI_L11_ID, 25)
+#define M4U_L11_P26_RESERVE6			MTK_M4U_ID(SMI_L11_ID, 26)
+#define M4U_L11_P27_RESERVE7			MTK_M4U_ID(SMI_L11_ID, 27)
+#define M4U_L11_P28_RESERVE8			MTK_M4U_ID(SMI_L11_ID, 28)
+
+/* Larb12: null */
+
+/* Larb13 -- cam */
+#define M4U_L13_P0_MRAWI			MTK_M4U_ID(SMI_L13_ID, 0)
+#define M4U_L13_P1_MRAWO_0			MTK_M4U_ID(SMI_L13_ID, 1)
+#define M4U_L13_P2_MRAWO_1			MTK_M4U_ID(SMI_L13_ID, 2)
+#define M4U_L13_P3_CAMSV_1			MTK_M4U_ID(SMI_L13_ID, 3)
+#define M4U_L13_P4_CAMSV_2			MTK_M4U_ID(SMI_L13_ID, 4)
+#define M4U_L13_P5_CAMSV_3			MTK_M4U_ID(SMI_L13_ID, 5)
+#define M4U_L13_P6_CAMSV_4			MTK_M4U_ID(SMI_L13_ID, 6)
+#define M4U_L13_P7_CAMSV_5			MTK_M4U_ID(SMI_L13_ID, 7)
+#define M4U_L13_P8_CAMSV_6			MTK_M4U_ID(SMI_L13_ID, 8)
+#define M4U_L13_P9_CCUI				MTK_M4U_ID(SMI_L13_ID, 9)
+#define M4U_L13_P10_CCUO			MTK_M4U_ID(SMI_L13_ID, 10)
+#define M4U_L13_P11_FAKE			MTK_M4U_ID(SMI_L13_ID, 11)
+#define M4U_L13_P12_PDAI_0			MTK_M4U_ID(SMI_L13_ID, 12)
+#define M4U_L13_P13_PDAI_1			MTK_M4U_ID(SMI_L13_ID, 13)
+#define M4U_L13_P14_PDAO			MTK_M4U_ID(SMI_L13_ID, 14)
+
+/* Larb14 -- cam */
+#define M4U_L14_P0_RESERVE			MTK_M4U_ID(SMI_L14_ID, 0)
+#define M4U_L14_P1_RESERVE			MTK_M4U_ID(SMI_L14_ID, 1)
+#define M4U_L14_P2_RESERVE			MTK_M4U_ID(SMI_L14_ID, 2)
+#define M4U_L14_P3_CAMSV_0			MTK_M4U_ID(SMI_L14_ID, 3)
+#define M4U_L14_P4_CCUI				MTK_M4U_ID(SMI_L14_ID, 4)
+#define M4U_L14_P5_CCUO				MTK_M4U_ID(SMI_L14_ID, 5)
+#define M4U_L14_P6_CAMSV_7			MTK_M4U_ID(SMI_L14_ID, 6)
+#define M4U_L14_P7_CAMSV_8			MTK_M4U_ID(SMI_L14_ID, 7)
+#define M4U_L14_P8_CAMSV_9			MTK_M4U_ID(SMI_L14_ID, 8)
+#define M4U_L14_P9_CAMSV_10			MTK_M4U_ID(SMI_L14_ID, 9)
+
+/* Larb15: null */
+
+/* Larb16 -- cam */
+#define M4U_L16_P0_IMGO_R1_A			MTK_M4U_ID(SMI_L16_ID, 0)
+#define M4U_L16_P1_RRZO_R1_A			MTK_M4U_ID(SMI_L16_ID, 1)
+#define M4U_L16_P2_CQI_R1_A			MTK_M4U_ID(SMI_L16_ID, 2)
+#define M4U_L16_P3_BPCI_R1_A			MTK_M4U_ID(SMI_L16_ID, 3)
+#define M4U_L16_P4_YUVO_R1_A			MTK_M4U_ID(SMI_L16_ID, 4)
+#define M4U_L16_P5_UFDI_R2_A			MTK_M4U_ID(SMI_L16_ID, 5)
+#define M4U_L16_P6_RAWI_R2_A			MTK_M4U_ID(SMI_L16_ID, 6)
+#define M4U_L16_P7_RAWI_R3_A			MTK_M4U_ID(SMI_L16_ID, 7)
+#define M4U_L16_P8_AAO_R1_A			MTK_M4U_ID(SMI_L16_ID, 8)
+#define M4U_L16_P9_AFO_R1_A			MTK_M4U_ID(SMI_L16_ID, 9)
+#define M4U_L16_P10_FLKO_R1_A			MTK_M4U_ID(SMI_L16_ID, 10)
+#define M4U_L16_P11_LCESO_R1_A			MTK_M4U_ID(SMI_L16_ID, 11)
+#define M4U_L16_P12_CRZO_R1_A			MTK_M4U_ID(SMI_L16_ID, 12)
+#define M4U_L16_P13_LTMSO_R1_A			MTK_M4U_ID(SMI_L16_ID, 13)
+#define M4U_L16_P14_RSSO_R1_A			MTK_M4U_ID(SMI_L16_ID, 14)
+#define M4U_L16_P15_AAHO_R1_A			MTK_M4U_ID(SMI_L16_ID, 15)
+#define M4U_L16_P16_LSCI_R1_A			MTK_M4U_ID(SMI_L16_ID, 16)
+
+/* Larb17 -- cam */
+#define M4U_L17_P0_IMGO_R1_B			MTK_M4U_ID(SMI_L17_ID, 0)
+#define M4U_L17_P1_RRZO_R1_B			MTK_M4U_ID(SMI_L17_ID, 1)
+#define M4U_L17_P2_CQI_R1_B			MTK_M4U_ID(SMI_L17_ID, 2)
+#define M4U_L17_P3_BPCI_R1_B			MTK_M4U_ID(SMI_L17_ID, 3)
+#define M4U_L17_P4_YUVO_R1_B			MTK_M4U_ID(SMI_L17_ID, 4)
+#define M4U_L17_P5_UFDI_R2_B			MTK_M4U_ID(SMI_L17_ID, 5)
+#define M4U_L17_P6_RAWI_R2_B			MTK_M4U_ID(SMI_L17_ID, 6)
+#define M4U_L17_P7_RAWI_R3_B			MTK_M4U_ID(SMI_L17_ID, 7)
+#define M4U_L17_P8_AAO_R1_B			MTK_M4U_ID(SMI_L17_ID, 8)
+#define M4U_L17_P9_AFO_R1_B			MTK_M4U_ID(SMI_L17_ID, 9)
+#define M4U_L17_P10_FLKO_R1_B			MTK_M4U_ID(SMI_L17_ID, 10)
+#define M4U_L17_P11_LCESO_R1_B			MTK_M4U_ID(SMI_L17_ID, 11)
+#define M4U_L17_P12_CRZO_R1_B			MTK_M4U_ID(SMI_L17_ID, 12)
+#define M4U_L17_P13_LTMSO_R1_B			MTK_M4U_ID(SMI_L17_ID, 13)
+#define M4U_L17_P14_RSSO_R1_B			MTK_M4U_ID(SMI_L17_ID, 14)
+#define M4U_L17_P15_AAHO_R1_B			MTK_M4U_ID(SMI_L17_ID, 15)
+#define M4U_L17_P16_LSCI_R1_B			MTK_M4U_ID(SMI_L17_ID, 16)
+
+/* Larb19 -- ipesys */
+#define M4U_L19_P0_DVS_RDMA			MTK_M4U_ID(SMI_L19_ID, 0)
+#define M4U_L19_P1_DVS_WDMA			MTK_M4U_ID(SMI_L19_ID, 1)
+#define M4U_L19_P2_DVP_RDMA			MTK_M4U_ID(SMI_L19_ID, 2)
+#define M4U_L19_P3_DVP_WDMA			MTK_M4U_ID(SMI_L19_ID, 3)
+
+/* Larb20 -- ipesys */
+#define M4U_L20_P0_FDVT_RDA_0			MTK_M4U_ID(SMI_L20_ID, 0)
+#define M4U_L20_P1_FDVT_RDB_0			MTK_M4U_ID(SMI_L20_ID, 1)
+#define M4U_L20_P2_FDVT_WRA_0			MTK_M4U_ID(SMI_L20_ID, 2)
+#define M4U_L20_P3_FDVT_WRB_0			MTK_M4U_ID(SMI_L20_ID, 3)
+#define M4U_L20_P4_RSC_RDMA			MTK_M4U_ID(SMI_L20_ID, 4)
+#define M4U_L20_P5_RSC_WDMA			MTK_M4U_ID(SMI_L20_ID, 5)
+
+/* fake larb21 for gce */
+#define M4U_L21_GCE_DM				MTK_M4U_ID(21, 0)
+#define M4U_L21_GCE_MM				MTK_M4U_ID(21, 1)
+
+/* fake larb & port for svp and dual svp and wfd */
+#define M4U_PORT_SVP_HEAP			MTK_M4U_ID(22, 0)
+#define M4U_PORT_DUAL_SVP_HEAP			MTK_M4U_ID(22, 1)
+#define M4U_PORT_WFD_HEAP			MTK_M4U_ID(22, 2)
+
+/* fake larb0 for apu */
+#define M4U_L0_APU_DATA				MTK_M4U_ID(0, 0)
+#define M4U_L0_APU_CODE				MTK_M4U_ID(0, 1)
+#define M4U_L0_APU_SECURE			MTK_M4U_ID(0, 2)
+#define M4U_L0_APU_VLM				MTK_M4U_ID(0, 3)
+
+/* infra/peri */
+#define IFR_IOMMU_PORT_PCIE_0			MTK_IFAIOMMU_PERI_ID(0, 26)
+
+#endif
-- 
cgit v1.2.3


From fd714986e4e46effa6697b13d32918fc59608ccb Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 22 Oct 2025 19:21:09 -0700
Subject: iommu: Pass in old domain to attach_dev callback functions

The IOMMU core attaches each device to a default domain on probe(). After
that, every new "attach" operation fundamentally means two things:
 - detach from its currently attached (old) domain
 - attach to a given new domain

Modern IOMMU drivers following this pattern usually want to clean up the
things related to the old domain, so they call iommu_get_domain_for_dev()
to fetch the old domain.

Pass in the old domain pointer from the core to drivers, aligning with the
set_dev_pasid op that does so already.

Ensure all low-level attach functions in the core can forward the correct
old domain pointer. Thus, rework those functions as well.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/powerpc/kernel/iommu.c                        |  5 +++--
 drivers/iommu/amd/iommu.c                          | 11 +++++-----
 drivers/iommu/apple-dart.c                         |  9 +++++---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    |  5 +++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        | 24 +++++++++++++--------
 drivers/iommu/arm/arm-smmu/arm-smmu.c              |  9 +++++---
 drivers/iommu/arm/arm-smmu/qcom_iommu.c            | 11 +++++-----
 drivers/iommu/exynos-iommu.c                       |  8 ++++---
 drivers/iommu/fsl_pamu_domain.c                    | 12 +++++------
 drivers/iommu/intel/iommu.c                        | 10 ++++++---
 drivers/iommu/intel/nested.c                       |  2 +-
 drivers/iommu/iommu.c                              | 25 +++++++++++++---------
 drivers/iommu/iommufd/selftest.c                   |  2 +-
 drivers/iommu/ipmmu-vmsa.c                         | 10 ++++-----
 drivers/iommu/msm_iommu.c                          | 11 +++++-----
 drivers/iommu/mtk_iommu.c                          |  8 +++----
 drivers/iommu/mtk_iommu_v1.c                       |  7 ++++--
 drivers/iommu/omap-iommu.c                         | 12 +++++------
 drivers/iommu/riscv/iommu.c                        |  9 +++++---
 drivers/iommu/rockchip-iommu.c                     | 20 ++++++++++++-----
 drivers/iommu/s390-iommu.c                         | 13 ++++++-----
 drivers/iommu/sprd-iommu.c                         |  3 ++-
 drivers/iommu/sun50i-iommu.c                       |  8 ++++---
 drivers/iommu/tegra-smmu.c                         | 10 ++++-----
 drivers/iommu/virtio-iommu.c                       |  6 ++++--
 include/linux/iommu.h                              |  3 ++-
 26 files changed, 153 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 244eb4857e7f..b7dcf07b2499 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1156,7 +1156,8 @@ EXPORT_SYMBOL_GPL(iommu_add_device);
  */
 static int
 spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iommu_table_group *table_group;
@@ -1189,7 +1190,7 @@ static struct iommu_domain spapr_tce_platform_domain = {
 
 static int
 spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
-				     struct device *dev)
+				   struct device *dev, struct iommu_domain *old)
 {
 	struct iommu_group *grp = iommu_group_get(dev);
 	struct iommu_table_group *table_group;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 6f4559eb5121..e16ad510c8c8 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -70,8 +70,8 @@ int amd_iommu_max_glx_val = -1;
  */
 DEFINE_IDA(pdom_ids);
 
-static int amd_iommu_attach_device(struct iommu_domain *dom,
-				   struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+				   struct iommu_domain *old);
 
 static void set_dte_entry(struct amd_iommu *iommu,
 			  struct iommu_dev_data *dev_data);
@@ -2635,7 +2635,8 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
 }
 
 static int blocked_domain_attach_device(struct iommu_domain *domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
 
@@ -2685,8 +2686,8 @@ void amd_iommu_init_identity_domain(void)
 	protection_domain_init(&identity_domain);
 }
 
-static int amd_iommu_attach_device(struct iommu_domain *dom,
-				   struct device *dev)
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+				   struct iommu_domain *old)
 {
 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
 	struct protection_domain *domain = to_pdomain(dom);
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 95a4e62b8f63..b5848770ef48 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
 }
 
 static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	int ret, i;
 	struct apple_dart_stream_map *stream_map;
@@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
 }
 
 static int apple_dart_attach_dev_identity(struct iommu_domain *domain,
-					  struct device *dev)
+					  struct device *dev,
+					  struct iommu_domain *old)
 {
 	struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
 	struct apple_dart_stream_map *stream_map;
@@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = {
 };
 
 static int apple_dart_attach_dev_blocked(struct iommu_domain *domain,
-					 struct device *dev)
+					 struct device *dev,
+					 struct iommu_domain *old)
 {
 	struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
 	struct apple_dart_stream_map *stream_map;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 8cd8929bbfdf..313201a61699 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -138,14 +138,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
 }
 
 static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old_domain)
 {
 	struct arm_smmu_nested_domain *nested_domain =
 		to_smmu_nested_domain(domain);
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	struct arm_smmu_attach_state state = {
 		.master = master,
-		.old_domain = iommu_get_domain_for_dev(dev),
+		.old_domain = old_domain,
 		.ssid = IOMMU_NO_PASID,
 	};
 	struct arm_smmu_ste ste;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2125ebfc9a70..a33fbd12a0dd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
 	master->ats_enabled = state->ats_enabled;
 }
 
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+			       struct iommu_domain *old_domain)
 {
 	int ret = 0;
 	struct arm_smmu_ste target;
@@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	struct arm_smmu_device *smmu;
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct arm_smmu_attach_state state = {
-		.old_domain = iommu_get_domain_for_dev(dev),
+		.old_domain = old_domain,
 		.ssid = IOMMU_NO_PASID,
 	};
 	struct arm_smmu_master *master;
@@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
 
 	/*
 	 * When the last user of the CD table goes away downgrade the STE back
-	 * to a non-cd_table one.
+	 * to a non-cd_table one, by re-attaching its sid_domain.
 	 */
 	if (!arm_smmu_ssids_in_use(&master->cd_table)) {
 		struct iommu_domain *sid_domain =
@@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
 
 		if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
 		    sid_domain->type == IOMMU_DOMAIN_BLOCKED)
-			sid_domain->ops->attach_dev(sid_domain, dev);
+			sid_domain->ops->attach_dev(sid_domain, dev,
+						    sid_domain);
 	}
 	return 0;
 }
 
 static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+				    struct iommu_domain *old_domain,
 				    struct device *dev,
 				    struct arm_smmu_ste *ste,
 				    unsigned int s1dss)
@@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	struct arm_smmu_attach_state state = {
 		.master = master,
-		.old_domain = iommu_get_domain_for_dev(dev),
+		.old_domain = old_domain,
 		.ssid = IOMMU_NO_PASID,
 	};
 
@@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
 }
 
 static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old_domain)
 {
 	struct arm_smmu_ste ste;
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 
 	arm_smmu_master_clear_vmaster(master);
 	arm_smmu_make_bypass_ste(master->smmu, &ste);
-	arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS);
+	arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+				STRTAB_STE_1_S1DSS_BYPASS);
 	return 0;
 }
 
@@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = {
 };
 
 static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
-					struct device *dev)
+				       struct device *dev,
+				       struct iommu_domain *old_domain)
 {
 	struct arm_smmu_ste ste;
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 
 	arm_smmu_master_clear_vmaster(master);
 	arm_smmu_make_abort_ste(&ste);
-	arm_smmu_attach_dev_ste(domain, dev, &ste,
+	arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
 				STRTAB_STE_1_S1DSS_TERMINATE);
 	return 0;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4ced4b5bee4d..5e690cf85ec9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg,
 	}
 }
 
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+			       struct iommu_domain *old)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev,
 }
 
 static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS);
 }
@@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = {
 };
 
 static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
-				       struct device *dev)
+				       struct device *dev,
+				       struct iommu_domain *old)
 {
 	return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT);
 }
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index c5be95e56031..9222a4a48bb3 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
 	kfree(qcom_domain);
 }
 
-static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int qcom_iommu_attach_dev(struct iommu_domain *domain,
+				 struct device *dev, struct iommu_domain *old)
 {
 	struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
 	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
@@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
 }
 
 static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct qcom_iommu_domain *qcom_domain;
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
 	unsigned int i;
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	qcom_domain = to_qcom_iommu_domain(domain);
+	qcom_domain = to_qcom_iommu_domain(old);
 	if (WARN_ON(!qcom_domain->iommu))
 		return -EINVAL;
 
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 0857519ca718..e375ced6e2b0 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
 }
 
 static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
 	struct exynos_iommu_domain *domain;
@@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = {
 };
 
 static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
-				   struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
 	struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
@@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
 	unsigned long flags;
 	int err;
 
-	err = exynos_iommu_identity_attach(&exynos_identity_domain, dev);
+	err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old);
 	if (err)
 		return err;
 
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 5f08523f97cb..9664ef9840d2 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
 }
 
 static int fsl_pamu_attach_device(struct iommu_domain *domain,
-				  struct device *dev)
+				  struct device *dev, struct iommu_domain *old)
 {
 	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
 	unsigned long flags;
@@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
  * switches to what looks like BLOCKING.
  */
 static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct fsl_dma_domain *dma_domain;
 	const u32 *prop;
 	int len;
@@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
 	 * Hack to keep things working as they always have, only leaving an
 	 * UNMANAGED domain makes it BLOCKING.
 	 */
-	if (domain == platform_domain || !domain ||
-	    domain->type != IOMMU_DOMAIN_UNMANAGED)
+	if (old == platform_domain || !old ||
+	    old->type != IOMMU_DOMAIN_UNMANAGED)
 		return 0;
 
-	dma_domain = to_fsl_dma_domain(domain);
+	dma_domain = to_fsl_dma_domain(old);
 
 	/*
 	 * Use LIODN of the PCI controller while detaching a
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e236c7ec221f..f0396591cd9b 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3230,7 +3230,8 @@ void device_block_translation(struct device *dev)
 }
 
 static int blocking_domain_attach_dev(struct iommu_domain *domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 
@@ -3537,7 +3538,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
 }
 
 static int intel_iommu_attach_device(struct iommu_domain *domain,
-				     struct device *dev)
+				     struct device *dev,
+				     struct iommu_domain *old)
 {
 	int ret;
 
@@ -4401,7 +4403,9 @@ static int device_setup_pass_through(struct device *dev)
 				      context_setup_pass_through_cb, dev);
 }
 
-static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct intel_iommu *iommu = info->iommu;
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 1b6ad9c900a5..760d7aa2ade8 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -19,7 +19,7 @@
 #include "pasid.h"
 
 static int intel_nested_attach_dev(struct iommu_domain *domain,
-				   struct device *dev)
+				   struct device *dev, struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ce141f095f96..2ca990dfbb88 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data);
 static void iommu_release_device(struct device *dev);
 static int __iommu_attach_device(struct iommu_domain *domain,
-				 struct device *dev);
+				 struct device *dev, struct iommu_domain *old);
 static int __iommu_attach_group(struct iommu_domain *domain,
 				struct iommu_group *group);
 static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
@@ -114,6 +114,7 @@ enum {
 static int __iommu_device_set_domain(struct iommu_group *group,
 				     struct device *dev,
 				     struct iommu_domain *new_domain,
+				     struct iommu_domain *old_domain,
 				     unsigned int flags);
 static int __iommu_group_set_domain_internal(struct iommu_group *group,
 					     struct iommu_domain *new_domain,
@@ -554,7 +555,8 @@ static void iommu_deinit_device(struct device *dev)
 		    release_domain == ops->blocked_domain)
 			release_domain = ops->identity_domain;
 
-		release_domain->ops->attach_dev(release_domain, dev);
+		release_domain->ops->attach_dev(release_domain, dev,
+						group->domain);
 	}
 
 	if (ops->release_device)
@@ -640,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 	if (group->default_domain)
 		iommu_create_device_direct_mappings(group->default_domain, dev);
 	if (group->domain) {
-		ret = __iommu_device_set_domain(group, dev, group->domain, 0);
+		ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
+						0);
 		if (ret)
 			goto err_remove_gdev;
 	} else if (!group->default_domain && !group_list) {
@@ -2127,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
 }
 
 static int __iommu_attach_device(struct iommu_domain *domain,
-				 struct device *dev)
+				 struct device *dev, struct iommu_domain *old)
 {
 	int ret;
 
 	if (unlikely(domain->ops->attach_dev == NULL))
 		return -ENODEV;
 
-	ret = domain->ops->attach_dev(domain, dev);
+	ret = domain->ops->attach_dev(domain, dev, old);
 	if (ret)
 		return ret;
 	dev->iommu->attach_deferred = 0;
@@ -2183,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 {
 	if (dev->iommu && dev->iommu->attach_deferred)
-		return __iommu_attach_device(domain, dev);
+		return __iommu_attach_device(domain, dev, NULL);
 
 	return 0;
 }
@@ -2296,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group);
 static int __iommu_device_set_domain(struct iommu_group *group,
 				     struct device *dev,
 				     struct iommu_domain *new_domain,
+				     struct iommu_domain *old_domain,
 				     unsigned int flags)
 {
 	int ret;
@@ -2321,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group,
 		dev->iommu->attach_deferred = 0;
 	}
 
-	ret = __iommu_attach_device(new_domain, dev);
+	ret = __iommu_attach_device(new_domain, dev, old_domain);
 	if (ret) {
 		/*
 		 * If we have a blocking domain then try to attach that in hopes
@@ -2331,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group,
 		if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
 		    group->blocking_domain &&
 		    group->blocking_domain != new_domain)
-			__iommu_attach_device(group->blocking_domain, dev);
+			__iommu_attach_device(group->blocking_domain, dev,
+					      old_domain);
 		return ret;
 	}
 	return 0;
@@ -2378,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
 	result = 0;
 	for_each_group_device(group, gdev) {
 		ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
-						flags);
+						group->domain, flags);
 		if (ret) {
 			result = ret;
 			/*
@@ -2413,7 +2418,7 @@ err_revert:
 		 */
 		if (group->domain)
 			WARN_ON(__iommu_device_set_domain(
-				group, gdev->dev, group->domain,
+				group, gdev->dev, group->domain, new_domain,
 				IOMMU_SET_DOMAIN_MUST_SUCCEED));
 	}
 	return ret;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index de178827a078..5661d2da2b67 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -216,7 +216,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
 }
 
 static int mock_domain_nop_attach(struct iommu_domain *domain,
-				  struct device *dev)
+				  struct device *dev, struct iommu_domain *old)
 {
 	struct mock_dev *mdev = to_mock_dev(dev);
 	struct mock_viommu *new_viommu = NULL;
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ffa892f65714..6667ecc331f0 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
 }
 
 static int ipmmu_attach_device(struct iommu_domain *io_domain,
-			       struct device *dev)
+			       struct device *dev, struct iommu_domain *old)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
@@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
 }
 
 static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain,
-				       struct device *dev)
+				       struct device *dev,
+				       struct iommu_domain *old)
 {
-	struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev);
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct ipmmu_vmsa_domain *domain;
 	unsigned int i;
 
-	if (io_domain == identity_domain || !io_domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	domain = to_vmsa_domain(io_domain);
+	domain = to_vmsa_domain(old);
 	for (i = 0; i < fwspec->num_ids; ++i)
 		ipmmu_utlb_disable(domain, fwspec->ids[i]);
 
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 43a61ba021a5..819add75a665 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev)
 	return &iommu->iommu;
 }
 
-static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+				struct iommu_domain *old)
 {
 	int ret = 0;
 	unsigned long flags;
@@ -441,19 +442,19 @@ fail:
 }
 
 static int msm_iommu_identity_attach(struct iommu_domain *identity_domain,
-				     struct device *dev)
+				     struct device *dev,
+				     struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct msm_priv *priv;
 	unsigned long flags;
 	struct msm_iommu_dev *iommu;
 	struct msm_iommu_ctx_dev *master;
 	int ret = 0;
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	priv = to_msm_priv(domain);
+	priv = to_msm_priv(old);
 	free_io_pgtable_ops(priv->iop);
 
 	spin_lock_irqsave(&msm_iommu_lock, flags);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 0e0285348d2b..9747ef164413 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -705,7 +705,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
 }
 
 static int mtk_iommu_attach_device(struct iommu_domain *domain,
-				   struct device *dev)
+				   struct device *dev, struct iommu_domain *old)
 {
 	struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata;
 	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
@@ -773,12 +773,12 @@ err_unlock:
 }
 
 static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain,
-				     struct device *dev)
+				     struct device *dev,
+				     struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
 	mtk_iommu_config(data, dev, false, 0);
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 10cc0b1197e8..3b45650263ac 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain)
 	kfree(to_mtk_domain(domain));
 }
 
-static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev)
+static int mtk_iommu_v1_attach_device(struct iommu_domain *domain,
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
 	struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain);
@@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device
 }
 
 static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
 
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 5c6f5943f44b..9f0057ccea57 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
 	odomain->iommus = NULL;
 }
 
-static int
-omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int omap_iommu_attach_dev(struct iommu_domain *domain,
+				 struct device *dev, struct iommu_domain *old)
 {
 	struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
 	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
@@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
 }
 
 static int omap_iommu_identity_attach(struct iommu_domain *identity_domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct omap_iommu_domain *omap_domain;
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	omap_domain = to_omap_domain(domain);
+	omap_domain = to_omap_domain(old);
 	spin_lock(&omap_domain->lock);
 	_omap_iommu_detach_dev(omap_domain, dev);
 	spin_unlock(&omap_domain->lock);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ebb22979075d..d9429097a2b5 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m
 }
 
 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
-					    struct device *dev)
+					    struct device *dev,
+					    struct iommu_domain *old)
 {
 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
 }
 
 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
-					      struct device *dev)
+					      struct device *dev,
+					      struct iommu_domain *old)
 {
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
@@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = {
 };
 
 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
-					      struct device *dev)
+					      struct device *dev,
+					      struct iommu_domain *old)
 {
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 0861dd469bd8..85f3667e797c 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -960,7 +960,8 @@ out_disable_clocks:
 }
 
 static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct rk_iommu *iommu;
 	struct rk_iommu_domain *rk_domain;
@@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
 };
 
 static int rk_iommu_attach_device(struct iommu_domain *domain,
-		struct device *dev)
+				  struct device *dev, struct iommu_domain *old)
 {
 	struct rk_iommu *iommu;
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 	if (iommu->domain == domain)
 		return 0;
 
-	ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+	ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
 	if (ret)
 		return ret;
 
@@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 		return 0;
 
 	ret = rk_iommu_enable(iommu);
-	if (ret)
-		WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+	if (ret) {
+		/*
+		 * Note rk_iommu_identity_attach() might fail before physically
+		 * attaching the dev to iommu->domain, in which case the actual
+		 * old domain for this revert should be rk_identity_domain, not
+		 * iommu->domain. Since rk_iommu_identity_attach() does not care
+		 * about the old domain argument for now, this is not a problem.
+		 */
+		WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+						 iommu->domain));
+	}
 
 	pm_runtime_put(iommu->dev);
 
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index aa576736d60b..fe679850af28 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
 }
 
 static int blocking_domain_attach_device(struct iommu_domain *domain,
-					 struct device *dev)
+					 struct device *dev,
+					 struct iommu_domain *old)
 {
 	struct zpci_dev *zdev = to_zpci_dev(dev);
 	struct s390_domain *s390_domain;
@@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain,
 }
 
 static int s390_iommu_attach_device(struct iommu_domain *domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
 	struct zpci_dev *zdev = to_zpci_dev(dev);
@@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
 		domain->geometry.aperture_end < zdev->start_dma))
 		return -EINVAL;
 
-	blocking_domain_attach_device(&blocking_domain, dev);
+	blocking_domain_attach_device(&blocking_domain, dev, old);
 
 	/* If we fail now DMA remains blocked via blocking domain */
 	cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
@@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void)
 subsys_initcall(s390_iommu_init);
 
 static int s390_attach_dev_identity(struct iommu_domain *domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct zpci_dev *zdev = to_zpci_dev(dev);
 	u8 status;
 	int cc;
 
-	blocking_domain_attach_device(&blocking_domain, dev);
+	blocking_domain_attach_device(&blocking_domain, dev, old);
 
 	/* If we fail now DMA remains blocked via blocking domain */
 	cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index c7ca1d8a0b15..555d4505c747 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
 }
 
 static int sprd_iommu_attach_device(struct iommu_domain *domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
 	struct sprd_iommu_domain *dom = to_sprd_domain(domain);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index de10b569d9a9..d3b190be18b5 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
 }
 
 static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
 	struct sun50i_iommu_domain *sun50i_domain;
@@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
 };
 
 static int sun50i_iommu_attach_device(struct iommu_domain *domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
 	struct sun50i_iommu *iommu;
@@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
 	if (iommu->domain == domain)
 		return 0;
 
-	sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+	sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
 
 	sun50i_iommu_attach_domain(iommu, sun50i_domain);
 
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 36cdd5fbab07..336e0a3ff41f 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
 }
 
 static int tegra_smmu_attach_dev(struct iommu_domain *domain,
-				 struct device *dev)
+				 struct device *dev, struct iommu_domain *old)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -524,9 +524,9 @@ disable:
 }
 
 static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct tegra_smmu_as *as;
 	struct tegra_smmu *smmu;
@@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
 	if (!fwspec)
 		return -ENODEV;
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	as = to_smmu_as(domain);
+	as = to_smmu_as(old);
 	smmu = as->smmu;
 	for (index = 0; index < fwspec->num_ids; index++) {
 		tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index b39d6f134ab2..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
 	return domain;
 }
 
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+			     struct iommu_domain *old)
 {
 	int ret = 0;
 	struct virtio_iommu_req_attach req;
@@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 }
 
 static int viommu_attach_identity_domain(struct iommu_domain *domain,
-					 struct device *dev)
+					 struct device *dev,
+					 struct iommu_domain *old)
 {
 	int ret = 0;
 	struct virtio_iommu_req_attach req;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c30d12e16473..801b2bd9e8d4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -751,7 +751,8 @@ struct iommu_ops {
  * @free: Release the domain after use.
  */
 struct iommu_domain_ops {
-	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
+	int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
+			  struct iommu_domain *old);
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
 			     ioasid_t pasid, struct iommu_domain *old);
 
-- 
cgit v1.2.3


From 1afc05996299b4546e8be9b13c89f78e19912c7d Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:12 +0100
Subject: ASoC: cs35l56: Read silicon ID during initialization and save it

Read the silicon ID from the amp during one-time cs35l56_hw_init()
and store it in struct cs35l56_base, instead of reading it from
registers every time it is needed.

Note that marking it non-volatile without a default in regmap isn't
a suitable alternative because this causes regcache_sync() to always
write the cached value out to the registers. This could trigger a bus
fault interrupt inside the amp, which we want to avoid.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  1 +
 sound/soc/codecs/cs35l56-shared.c | 53 +++++++++++++++++++--------------------
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index ab044ce2aa8b..ec9b1072d6be 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -309,6 +309,7 @@ struct cs35l56_base {
 	struct cs35l56_spi_payload *spi_payload_buf;
 	const struct cs35l56_fw_reg *fw_reg;
 	const struct cirrus_amp_cal_controls *calibration_controls;
+	u64 silicon_uid;
 };
 
 static inline bool cs35l56_is_otp_register(unsigned int reg)
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 9e6b9ca2f354..1ecfc38d8eb4 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -853,7 +853,7 @@ struct cs35l56_pte {
 } __packed;
 static_assert((sizeof(struct cs35l56_pte) % sizeof(u32)) == 0);
 
-static int cs35l56_read_silicon_uid(struct cs35l56_base *cs35l56_base, u64 *uid)
+static int cs35l56_read_silicon_uid(struct cs35l56_base *cs35l56_base)
 {
 	struct cs35l56_pte pte;
 	u64 unique_id;
@@ -870,14 +870,15 @@ static int cs35l56_read_silicon_uid(struct cs35l56_base *cs35l56_base, u64 *uid)
 	unique_id |= (u32)pte.x | ((u32)pte.y << 8) | ((u32)pte.wafer_id << 16) |
 		     ((u32)pte.dvs << 24);
 
-	*uid = unique_id;
+	cs35l56_base->silicon_uid = unique_id;
 
 	return 0;
 }
 
-static int cs35l63_read_silicon_uid(struct cs35l56_base *cs35l56_base, u64 *uid)
+static int cs35l63_read_silicon_uid(struct cs35l56_base *cs35l56_base)
 {
 	u32 tmp[2];
+	u64 unique_id;
 	int ret;
 
 	ret = regmap_bulk_read(cs35l56_base->regmap, CS35L56_DIE_STS1, tmp, ARRAY_SIZE(tmp));
@@ -886,9 +887,11 @@ static int cs35l63_read_silicon_uid(struct cs35l56_base *cs35l56_base, u64 *uid)
 		return ret;
 	}
 
-	*uid = tmp[1];
-	*uid <<= 32;
-	*uid |= tmp[0];
+	unique_id = tmp[1];
+	unique_id <<= 32;
+	unique_id |= tmp[0];
+
+	cs35l56_base->silicon_uid = unique_id;
 
 	return 0;
 }
@@ -915,33 +918,14 @@ static const struct cirrus_amp_cal_controls cs35l63_calibration_controls = {
 
 int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base)
 {
-	u64 silicon_uid = 0;
 	int ret;
 
 	/* Driver can't apply calibration to a secured part, so skip */
 	if (cs35l56_base->secured)
 		return 0;
 
-	switch (cs35l56_base->type) {
-	case 0x54:
-	case 0x56:
-	case 0x57:
-		ret = cs35l56_read_silicon_uid(cs35l56_base, &silicon_uid);
-		break;
-	case 0x63:
-		ret = cs35l63_read_silicon_uid(cs35l56_base, &silicon_uid);
-		break;
-	default:
-		ret = -ENODEV;
-		break;
-	}
-
-	if (ret < 0)
-		return ret;
-
-	dev_dbg(cs35l56_base->dev, "UniqueID = %#llx\n", silicon_uid);
-
-	ret = cs_amp_get_efi_calibration_data(cs35l56_base->dev, silicon_uid,
+	ret = cs_amp_get_efi_calibration_data(cs35l56_base->dev,
+					      cs35l56_base->silicon_uid,
 					      cs35l56_base->cal_index,
 					      &cs35l56_base->cal_data);
 
@@ -1111,6 +1095,21 @@ int cs35l56_hw_init(struct cs35l56_base *cs35l56_base)
 			   CS35L56_TEMP_ERR_EINT1_MASK,
 			   0);
 
+	switch (cs35l56_base->type) {
+	case 0x54:
+	case 0x56:
+	case 0x57:
+		ret = cs35l56_read_silicon_uid(cs35l56_base);
+		break;
+	default:
+		ret = cs35l63_read_silicon_uid(cs35l56_base);
+		break;
+	}
+	if (ret)
+		return ret;
+
+	dev_dbg(cs35l56_base->dev, "SiliconID = %#llx\n", cs35l56_base->silicon_uid);
+
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_hw_init, "SND_SOC_CS35L56_SHARED");
-- 
cgit v1.2.3


From cdd27fa3298ad2f39788804f7d09ab31af2b416c Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:13 +0100
Subject: ASoC: cs-amp-lib: Add helpers for factory calibration

Add helper functions for performing factory calibration.

cs_amp_read_cal_coeffs() reads the results of a calibration into a
struct cirrus_amp_cal_data. The calTime member is also filled in with
the current time (which is defined to be in Windows format).

cs_amp_write_ambient_temp() writes a given temperature value to the
firmware control for ambient temperature.

The cs_amp_cal_target_u64() has been moved into the header file so
that it can be used by the calling code and by KUnit tests.

cs_amp_create_debugfs() creates a debugfs directory to contain
debugfs files related to calibration. This is placed in a directory
in debugfs root, named "cirrus_logic". The purpose of this is to
make it easier for tooling to find the files it needs by keeping
control of the layout under this directory. By contrast the ASoC
debugfs can vary between kernel releases and doesn't have a strictly
stable naming convention. HDA does not have a debugfs directory at all
and enabling the general ALSA debugfs (which is normally disabled) has
other side-effects.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-3-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs-amp-lib.h    |  12 ++++
 sound/soc/codecs/cs-amp-lib.c | 148 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 155 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h
index 43a87a39110c..5b094f8e8a6f 100644
--- a/include/sound/cs-amp-lib.h
+++ b/include/sound/cs-amp-lib.h
@@ -47,9 +47,21 @@ struct cirrus_amp_cal_controls {
 int cs_amp_write_cal_coeffs(struct cs_dsp *dsp,
 			    const struct cirrus_amp_cal_controls *controls,
 			    const struct cirrus_amp_cal_data *data);
+int cs_amp_read_cal_coeffs(struct cs_dsp *dsp,
+			   const struct cirrus_amp_cal_controls *controls,
+			   struct cirrus_amp_cal_data *data);
+int cs_amp_write_ambient_temp(struct cs_dsp *dsp,
+			      const struct cirrus_amp_cal_controls *controls,
+			      u32 temp);
 int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index,
 				    struct cirrus_amp_cal_data *out_data);
 int cs_amp_get_vendor_spkid(struct device *dev);
+struct dentry *cs_amp_create_debugfs(struct device *dev);
+
+static inline u64 cs_amp_cal_target_u64(const struct cirrus_amp_cal_data *data)
+{
+	return ((u64)data->calTarget[1] << 32) | data->calTarget[0];
+}
 
 struct cs_amp_test_hooks {
 	efi_status_t (*get_efi_variable)(efi_char16_t *name,
diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c
index 8434d5196107..f9d5c4adf3f2 100644
--- a/sound/soc/codecs/cs-amp-lib.c
+++ b/sound/soc/codecs/cs-amp-lib.c
@@ -7,12 +7,15 @@
 
 #include <asm/byteorder.h>
 #include <kunit/static_stub.h>
+#include <linux/debugfs.h>
 #include <linux/dev_printk.h>
 #include <linux/efi.h>
 #include <linux/firmware/cirrus/cs_dsp.h>
+#include <linux/math64.h>
 #include <linux/module.h>
 #include <linux/overflow.h>
 #include <linux/slab.h>
+#include <linux/timekeeping.h>
 #include <linux/types.h>
 #include <sound/cs-amp-lib.h>
 
@@ -46,6 +49,16 @@ static const struct cs_amp_lib_cal_efivar {
 	},
 };
 
+/* Offset from Unix time to Windows time (100ns since 1 Jan 1601) */
+#define UNIX_TIME_TO_WINDOWS_TIME_OFFSET	116444736000000000ULL
+
+static u64 cs_amp_time_now_in_windows_time(void)
+{
+	u64 time_in_100ns = div_u64(ktime_get_real_ns(), 100);
+
+	return time_in_100ns + UNIX_TIME_TO_WINDOWS_TIME_OFFSET;
+}
+
 static int cs_amp_write_cal_coeff(struct cs_dsp *dsp,
 				  const struct cirrus_amp_cal_controls *controls,
 				  const char *ctl_name, u32 val)
@@ -73,6 +86,34 @@ static int cs_amp_write_cal_coeff(struct cs_dsp *dsp,
 	return -ENODEV;
 }
 
+static int cs_amp_read_cal_coeff(struct cs_dsp *dsp,
+				 const struct cirrus_amp_cal_controls *controls,
+				 const char *ctl_name, u32 *val)
+{
+	struct cs_dsp_coeff_ctl *cs_ctl;
+	__be32 beval;
+	int ret;
+
+	KUNIT_STATIC_STUB_REDIRECT(cs_amp_read_cal_coeff, dsp, controls, ctl_name, val);
+
+	if (!IS_REACHABLE(CONFIG_FW_CS_DSP))
+		return -ENODEV;
+
+	scoped_guard(mutex, &dsp->pwr_lock) {
+		cs_ctl = cs_dsp_get_ctl(dsp, ctl_name, controls->mem_region, controls->alg_id);
+		ret = cs_dsp_coeff_read_ctrl(cs_ctl, 0, &beval, sizeof(beval));
+	}
+
+	if (ret < 0) {
+		dev_err(dsp->dev, "Failed to read '%s': %d\n", ctl_name, ret);
+		return ret;
+	}
+
+	*val = be32_to_cpu(beval);	/* control value is read as big-endian */
+
+	return 0;
+}
+
 static int _cs_amp_write_cal_coeffs(struct cs_dsp *dsp,
 				    const struct cirrus_amp_cal_controls *controls,
 				    const struct cirrus_amp_cal_data *data)
@@ -106,6 +147,45 @@ static int _cs_amp_write_cal_coeffs(struct cs_dsp *dsp,
 	return 0;
 }
 
+static int _cs_amp_read_cal_coeffs(struct cs_dsp *dsp,
+				    const struct cirrus_amp_cal_controls *controls,
+				    struct cirrus_amp_cal_data *data)
+{
+	u64 time;
+	u32 val;
+	int ret;
+
+	if (list_empty(&dsp->ctl_list)) {
+		dev_info(dsp->dev, "Calibration disabled due to missing firmware controls\n");
+		return -ENOENT;
+	}
+
+	ret = cs_amp_read_cal_coeff(dsp, controls, controls->ambient, &val);
+	if (ret)
+		return ret;
+
+	data->calAmbient = (s8)val;
+
+	ret = cs_amp_read_cal_coeff(dsp, controls, controls->calr, &val);
+	if (ret)
+		return ret;
+
+	data->calR = (u16)val;
+
+	ret = cs_amp_read_cal_coeff(dsp, controls, controls->status, &val);
+	if (ret)
+		return ret;
+
+	data->calStatus = (u8)val;
+
+	/* Fill in timestamp */
+	time = cs_amp_time_now_in_windows_time();
+	data->calTime[0] = (u32)time;
+	data->calTime[1] = (u32)(time >> 32);
+
+	return 0;
+}
+
 /**
  * cs_amp_write_cal_coeffs - Write calibration data to firmware controls.
  * @dsp:	Pointer to struct cs_dsp.
@@ -125,6 +205,44 @@ int cs_amp_write_cal_coeffs(struct cs_dsp *dsp,
 }
 EXPORT_SYMBOL_NS_GPL(cs_amp_write_cal_coeffs, "SND_SOC_CS_AMP_LIB");
 
+/**
+ * cs_amp_read_cal_coeffs - Read calibration data from firmware controls.
+ * @dsp:	Pointer to struct cs_dsp.
+ * @controls:	Pointer to definition of firmware controls to be read.
+ * @data:	Pointer to calibration data where results will be written.
+ *
+ * Returns: 0 on success, else negative error value.
+ */
+int cs_amp_read_cal_coeffs(struct cs_dsp *dsp,
+			   const struct cirrus_amp_cal_controls *controls,
+			   struct cirrus_amp_cal_data *data)
+{
+	if (IS_REACHABLE(CONFIG_FW_CS_DSP) || IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST))
+		return _cs_amp_read_cal_coeffs(dsp, controls, data);
+	else
+		return -ENODEV;
+}
+EXPORT_SYMBOL_NS_GPL(cs_amp_read_cal_coeffs, "SND_SOC_CS_AMP_LIB");
+
+/**
+ * cs_amp_write_ambient_temp - write value to calibration ambient temperature
+ * @dsp:	Pointer to struct cs_dsp.
+ * @controls:	Pointer to definition of firmware controls to be written.
+ * @temp:	Temperature in degrees Celsius.
+ *
+ * Returns: 0 on success, else negative error value.
+ */
+int cs_amp_write_ambient_temp(struct cs_dsp *dsp,
+			      const struct cirrus_amp_cal_controls *controls,
+			      u32 temp)
+{
+	if (IS_REACHABLE(CONFIG_FW_CS_DSP) || IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST))
+		return cs_amp_write_cal_coeff(dsp, controls, controls->ambient, temp);
+	else
+		return -ENODEV;
+}
+EXPORT_SYMBOL_NS_GPL(cs_amp_write_ambient_temp, "SND_SOC_CS_AMP_LIB");
+
 static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name,
 					    efi_guid_t *guid,
 					    unsigned long *size,
@@ -215,11 +333,6 @@ err:
 	return ERR_PTR(ret);
 }
 
-static u64 cs_amp_cal_target_u64(const struct cirrus_amp_cal_data *data)
-{
-	return ((u64)data->calTarget[1] << 32) | data->calTarget[0];
-}
-
 static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index,
 					    struct cirrus_amp_cal_data *out_data)
 {
@@ -400,6 +513,31 @@ int cs_amp_get_vendor_spkid(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(cs_amp_get_vendor_spkid, "SND_SOC_CS_AMP_LIB");
 
+/**
+ * cs_amp_create_debugfs - create a debugfs directory for a device
+ *
+ * @dev: pointer to struct device
+ *
+ * Creates a node under "cirrus_logic" in the root of the debugfs filesystem.
+ * This is for Cirrus-specific debugfs functionality to be grouped in a
+ * defined way, independently of the debugfs provided by ALSA/ASoC.
+ * The general ALSA/ASoC debugfs may not be enabled, and does not necessarily
+ * have a stable layout or naming convention.
+ *
+ * Return: Pointer to the dentry for the created directory, or -ENODEV.
+ */
+struct dentry *cs_amp_create_debugfs(struct device *dev)
+{
+	struct dentry *dir;
+
+	dir = debugfs_lookup("cirrus_logic", NULL);
+	if (!dir)
+		dir = debugfs_create_dir("cirrus_logic", NULL);
+
+	return debugfs_create_dir(dev_name(dev), dir);
+}
+EXPORT_SYMBOL_NS_GPL(cs_amp_create_debugfs, "SND_SOC_CS_AMP_LIB");
+
 static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = {
 	.get_efi_variable = cs_amp_get_efi_variable,
 	.write_cal_coeff = cs_amp_write_cal_coeff,
-- 
cgit v1.2.3


From f7097161e94cd39df7a8848ad0de5f394124ed69 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:14 +0100
Subject: ASoC: cs35l56: Add common code for factory calibration

Add core code to support factory calibration. This can be used by both
the ASoC and HDA drivers.

This code consists of implementations of debugfs handlers for three
debugfs files used to start factory calibration and read the results.

This is not a full implementation of debugfs files. There are some
requirements to synchronize with the rest of the amp driver, and the way
this is done is significantly different between ASoC and HDA. Therefore
cs35l56-shared.c provides the main part of the file handlers, but the
files themselves are defined in the ASoC and HDA drivers with suitable
handling before calling into this shared code.

The cal_data file allows the calibration to be read and also for a
previous calibration to be written (for systems where the storage is not
something directly accessible to drivers, such as on filesystems). Code
outside the kernel should treat the content of cal_data as an opaque blob,
so the struct definition is not exported as a user API.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-4-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  33 +++++
 sound/soc/codecs/Kconfig          |   3 +
 sound/soc/codecs/cs35l56-shared.c | 300 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 331 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index ec9b1072d6be..349b896ee737 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -9,6 +9,7 @@
 #ifndef __CS35L56_H
 #define __CS35L56_H
 
+#include <linux/debugfs.h>
 #include <linux/firmware/cirrus/cs_dsp.h>
 #include <linux/regulator/consumer.h>
 #include <linux/regmap.h>
@@ -62,6 +63,8 @@
 #define CS35L56_IRQ1_MASK_8				0x000E0AC
 #define CS35L56_IRQ1_MASK_18				0x000E0D4
 #define CS35L56_IRQ1_MASK_20				0x000E0DC
+#define CS35L56_MIXER_NGATE_CH1_CFG			0x0010004
+#define CS35L56_MIXER_NGATE_CH2_CFG			0x0010008
 #define CS35L56_DSP_MBOX_1_RAW				0x0011000
 #define CS35L56_DSP_VIRTUAL1_MBOX_1			0x0011020
 #define CS35L56_DSP_VIRTUAL1_MBOX_2			0x0011024
@@ -177,6 +180,9 @@
 /* IRQ1_EINT_8 */
 #define CS35L56_TEMP_ERR_EINT1_MASK			0x80000000
 
+/* MIXER_NGATE_CHn_CFG */
+#define CS35L56_AUX_NGATE_CHn_EN			0x00000001
+
 /* Mixer input sources */
 #define CS35L56_INPUT_SRC_NONE				0x00
 #define CS35L56_INPUT_SRC_ASP1RX1			0x08
@@ -243,6 +249,7 @@
 #define CS35L56_MBOX_CMD_AUDIO_PLAY			0x0B000001
 #define CS35L56_MBOX_CMD_AUDIO_PAUSE			0x0B000002
 #define CS35L56_MBOX_CMD_AUDIO_REINIT			0x0B000003
+#define CS35L56_MBOX_CMD_AUDIO_CALIBRATION		0x0B000006
 #define CS35L56_MBOX_CMD_HIBERNATE_NOW			0x02000001
 #define CS35L56_MBOX_CMD_WAKEUP				0x02000002
 #define CS35L56_MBOX_CMD_PREVENT_AUTO_HIBERNATE		0x02000003
@@ -264,6 +271,9 @@
 #define CS35L56_RESET_PULSE_MIN_US			1100
 #define CS35L56_WAKE_HOLD_TIME_US			1000
 
+#define CS35L56_CALIBRATION_POLL_US			(100 * USEC_PER_MSEC)
+#define CS35L56_CALIBRATION_TIMEOUT_US			(5 * USEC_PER_SEC)
+
 #define CS35L56_SDW1_PLAYBACK_PORT			1
 #define CS35L56_SDW1_CAPTURE_PORT			3
 
@@ -291,9 +301,16 @@ struct cs35l56_fw_reg {
 	unsigned int posture_number;
 };
 
+struct cs35l56_cal_debugfs_fops {
+	const struct debugfs_short_fops calibrate;
+	const struct debugfs_short_fops cal_temperature;
+	const struct debugfs_short_fops cal_data;
+};
+
 struct cs35l56_base {
 	struct device *dev;
 	struct regmap *regmap;
+	struct cs_dsp *dsp;
 	int irq;
 	struct mutex irq_lock;
 	u8 type;
@@ -309,6 +326,7 @@ struct cs35l56_base {
 	struct cs35l56_spi_payload *spi_payload_buf;
 	const struct cs35l56_fw_reg *fw_reg;
 	const struct cirrus_amp_cal_controls *calibration_controls;
+	struct dentry *debugfs;
 	u64 silicon_uid;
 };
 
@@ -359,6 +377,21 @@ int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base);
 int cs35l56_runtime_resume_common(struct cs35l56_base *cs35l56_base, bool is_soundwire);
 void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp);
 int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base);
+ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
+					const char __user *from, size_t count,
+					loff_t *ppos);
+ssize_t cs35l56_cal_ambient_debugfs_write(struct cs35l56_base *cs35l56_base,
+					  const char __user *from, size_t count,
+					  loff_t *ppos);
+ssize_t cs35l56_cal_data_debugfs_read(struct cs35l56_base *cs35l56_base,
+				      char __user *to, size_t count,
+				      loff_t *ppos);
+ssize_t cs35l56_cal_data_debugfs_write(struct cs35l56_base *cs35l56_base,
+				       const char __user *from, size_t count,
+				       loff_t *ppos);
+void cs35l56_create_cal_debugfs(struct cs35l56_base *cs35l56_base,
+				const struct cs35l56_cal_debugfs_fops *fops);
+void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base);
 int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 			     bool *fw_missing, unsigned int *fw_version);
 void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp);
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 160c07699a8b..6bb24325c2d0 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -896,6 +896,9 @@ config SND_SOC_CS35L56_SDW
 	help
 	  Enable support for Cirrus Logic CS35L56 boosted amplifier with SoundWire control
 
+config SND_SOC_CS35L56_CAL_DEBUGFS_COMMON
+	bool
+
 config SND_SOC_CS40L50
 	tristate "Cirrus Logic CS40L50 CODEC"
 	depends on MFD_CS40L50_CORE
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 1ecfc38d8eb4..eeb830e3f743 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -6,11 +6,18 @@
 //                    Cirrus Logic International Semiconductor Ltd.
 
 #include <linux/array_size.h>
+#include <linux/cleanup.h>
+#include <linux/debugfs.h>
 #include <linux/firmware/cirrus/wmfw.h>
+#include <linux/fs.h>
 #include <linux/gpio/consumer.h>
+#include <linux/kstrtox.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
 #include <linux/spi/spi.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/string_choices.h>
 #include <linux/types.h>
 #include <sound/cs-amp-lib.h>
 
@@ -206,6 +213,8 @@ static bool cs35l56_readable_reg(struct device *dev, unsigned int reg)
 	case CS35L56_IRQ1_MASK_8:
 	case CS35L56_IRQ1_MASK_18:
 	case CS35L56_IRQ1_MASK_20:
+	case CS35L56_MIXER_NGATE_CH1_CFG:
+	case CS35L56_MIXER_NGATE_CH2_CFG:
 	case CS35L56_DSP_VIRTUAL1_MBOX_1:
 	case CS35L56_DSP_VIRTUAL1_MBOX_2:
 	case CS35L56_DSP_VIRTUAL1_MBOX_3:
@@ -263,6 +272,8 @@ static bool cs35l56_common_volatile_reg(unsigned int reg)
 	case CS35L56_IRQ1_EINT_1 ... CS35L56_IRQ1_EINT_8:
 	case CS35L56_IRQ1_EINT_18:
 	case CS35L56_IRQ1_EINT_20:
+	case CS35L56_MIXER_NGATE_CH1_CFG:
+	case CS35L56_MIXER_NGATE_CH2_CFG:
 	case CS35L56_DSP_VIRTUAL1_MBOX_1:
 	case CS35L56_DSP_VIRTUAL1_MBOX_2:
 	case CS35L56_DSP_VIRTUAL1_MBOX_3:
@@ -724,15 +735,11 @@ static void cs35l56_issue_wake_event(struct cs35l56_base *cs35l56_base)
 	cs35l56_wait_control_port_ready();
 }
 
-int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base)
+static int cs35l56_wait_for_ps3(struct cs35l56_base *cs35l56_base)
 {
 	unsigned int val;
 	int ret;
 
-	if (!cs35l56_base->init_done)
-		return 0;
-
-	/* Firmware must have entered a power-save state */
 	ret = regmap_read_poll_timeout(cs35l56_base->regmap,
 				       cs35l56_base->fw_reg->transducer_actual_ps,
 				       val, (val >= CS35L56_PS3),
@@ -741,6 +748,17 @@ int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base)
 	if (ret)
 		dev_warn(cs35l56_base->dev, "PS3 wait failed: %d\n", ret);
 
+	return ret;
+}
+
+int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base)
+{
+	if (!cs35l56_base->init_done)
+		return 0;
+
+	/* Firmware must have entered a power-save state */
+	cs35l56_wait_for_ps3(cs35l56_base);
+
 	/* Clear BOOT_DONE so it can be used to detect a reboot */
 	regmap_write(cs35l56_base->regmap, CS35L56_IRQ1_EINT_4, CS35L56_OTP_BOOT_DONE_MASK);
 
@@ -839,6 +857,8 @@ void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_ds
 	cs_dsp->mem = cs35l56_dsp1_regions;
 	cs_dsp->num_mems = ARRAY_SIZE(cs35l56_dsp1_regions);
 	cs_dsp->no_core_startstop = true;
+
+	cs35l56_base->dsp = cs_dsp;
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_init_cs_dsp, "SND_SOC_CS35L56_SHARED");
 
@@ -942,6 +962,276 @@ int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base)
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_get_calibration, "SND_SOC_CS35L56_SHARED");
 
+static int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base,
+				     const struct cirrus_amp_cal_data *data)
+{
+
+	/* Ignore if it is empty */
+	if (!data->calTime[0] && !data->calTime[1])
+		return -ENODATA;
+
+	if (cs_amp_cal_target_u64(data) != cs35l56_base->silicon_uid) {
+		dev_err(cs35l56_base->dev, "cal_data not for this silicon ID\n");
+		return -EINVAL;
+	}
+
+	cs35l56_base->cal_data = *data;
+	cs35l56_base->cal_data_valid = true;
+
+	return 0;
+}
+
+static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
+{
+	const struct cirrus_amp_cal_controls *calibration_controls =
+		cs35l56_base->calibration_controls;
+	struct cs_dsp *dsp = cs35l56_base->dsp;
+	struct cirrus_amp_cal_data cal_data;
+	struct cs_dsp_coeff_ctl *ctl;
+	bool ngate_ch1_was_enabled = false;
+	bool ngate_ch2_was_enabled = false;
+	int cali_norm_en_alg_id, cali_norm_en_mem;
+	int ret;
+	__be32 val;
+
+	if (cs35l56_base->silicon_uid == 0) {
+		dev_err(cs35l56_base->dev, "Cannot calibrate: no silicon UID\n");
+		return -ENXIO;
+	}
+
+	switch (cs35l56_base->type) {
+	case 0x54:
+	case 0x56:
+	case 0x57:
+		if (cs35l56_base->rev < 0xb2) {
+			cali_norm_en_alg_id = 0x9f22f;
+			cali_norm_en_mem = WMFW_ADSP2_YM;
+		} else {
+			cali_norm_en_alg_id = 0x9f210;
+			cali_norm_en_mem = WMFW_ADSP2_XM;
+		}
+		break;
+	default:
+		cali_norm_en_alg_id = 0xbf210;
+		cali_norm_en_mem = WMFW_ADSP2_XM;
+		break;
+	}
+
+	ret = pm_runtime_resume_and_get(cs35l56_base->dev);
+	if (ret)
+		return ret;
+
+	ret = cs35l56_wait_for_ps3(cs35l56_base);
+	if (ret)
+		goto err_pm_put;
+
+	regmap_update_bits_check(cs35l56_base->regmap, CS35L56_MIXER_NGATE_CH1_CFG,
+				 CS35L56_AUX_NGATE_CHn_EN, 0, &ngate_ch1_was_enabled);
+	regmap_update_bits_check(cs35l56_base->regmap, CS35L56_MIXER_NGATE_CH2_CFG,
+				 CS35L56_AUX_NGATE_CHn_EN, 0, &ngate_ch2_was_enabled);
+
+	scoped_guard(mutex, &dsp->pwr_lock) {
+		ctl = cs_dsp_get_ctl(dsp,
+				     calibration_controls->status,
+				     calibration_controls->mem_region,
+				     calibration_controls->alg_id);
+		if (!ctl) {
+			dev_err(cs35l56_base->dev, "Could not get %s control\n",
+				calibration_controls->status);
+			ret = -ENXIO;
+			goto err;
+		}
+
+		val = cpu_to_be32(0);
+		ret = cs_dsp_coeff_write_ctrl(cs_dsp_get_ctl(dsp,
+					      "CALI_NORM_EN",
+					      cali_norm_en_mem,
+					      cali_norm_en_alg_id),
+					      0, &val, sizeof(val));
+		if (ret < 0) {
+			dev_err(cs35l56_base->dev, "Could not write %s: %d\n", "CALI_NORM_EN", ret);
+			goto err;
+		}
+
+		ret = cs35l56_mbox_send(cs35l56_base, CS35L56_MBOX_CMD_AUDIO_CALIBRATION);
+		if (ret)
+			goto err;
+
+		if (read_poll_timeout(cs_dsp_coeff_read_ctrl, ret,
+				      (val == cpu_to_be32(1)),
+				      CS35L56_CALIBRATION_POLL_US,
+				      CS35L56_CALIBRATION_TIMEOUT_US,
+				      true,
+				      ctl, 0, &val, sizeof(val))) {
+			dev_err(cs35l56_base->dev, "Calibration timed out (CAL_STATUS: %u)\n",
+				be32_to_cpu(val));
+			ret = -ETIMEDOUT;
+			goto err;
+		}
+	}
+
+	cs35l56_base->cal_data_valid = false;
+	memset(&cal_data, 0, sizeof(cal_data));
+	ret = cs_amp_read_cal_coeffs(dsp, calibration_controls, &cal_data);
+	if (ret)
+		goto err;
+
+	dev_info(cs35l56_base->dev, "Cal status:%d calR:%d ambient:%d\n",
+		 cal_data.calStatus, cal_data.calR, cal_data.calAmbient);
+
+	cal_data.calTarget[0] = (u32)cs35l56_base->silicon_uid;
+	cal_data.calTarget[1] = (u32)(cs35l56_base->silicon_uid >> 32);
+	cs35l56_base->cal_data = cal_data;
+	cs35l56_base->cal_data_valid = true;
+
+	ret = 0;
+
+err:
+	if (ngate_ch1_was_enabled) {
+		regmap_set_bits(cs35l56_base->regmap, CS35L56_MIXER_NGATE_CH1_CFG,
+				CS35L56_AUX_NGATE_CHn_EN);
+	}
+	if (ngate_ch2_was_enabled) {
+		regmap_set_bits(cs35l56_base->regmap, CS35L56_MIXER_NGATE_CH2_CFG,
+				CS35L56_AUX_NGATE_CHn_EN);
+	}
+err_pm_put:
+	pm_runtime_put(cs35l56_base->dev);
+
+	return ret;
+}
+
+ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
+					const char __user *from, size_t count,
+					loff_t *ppos)
+{
+	static const char * const options[] = { "factory" };
+	char buf[8] = { 0 };
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
+		return -ENXIO;
+
+	if (*ppos)
+		return -EINVAL;
+
+	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, from, count);
+	if (ret < 0)
+		return ret;
+
+	switch (sysfs_match_string(options, buf)) {
+	case 0:
+		ret = cs35l56_perform_calibration(cs35l56_base);
+		if (ret < 0)
+			return ret;
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	return count;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_calibrate_debugfs_write, "SND_SOC_CS35L56_SHARED");
+
+ssize_t cs35l56_cal_ambient_debugfs_write(struct cs35l56_base *cs35l56_base,
+					  const char __user *from, size_t count,
+					  loff_t *ppos)
+{
+	unsigned long val;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
+		return -ENXIO;
+
+	if (*ppos)
+		return -EINVAL;
+
+	ret = pm_runtime_resume_and_get(cs35l56_base->dev);
+	if (ret)
+		return ret;
+
+	ret = kstrtoul_from_user(from, count, 10, &val);
+	if (ret < 0)
+		goto out;
+
+	ret = cs_amp_write_ambient_temp(cs35l56_base->dsp, cs35l56_base->calibration_controls, val);
+out:
+	pm_runtime_put(cs35l56_base->dev);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_cal_ambient_debugfs_write, "SND_SOC_CS35L56_SHARED");
+
+ssize_t cs35l56_cal_data_debugfs_read(struct cs35l56_base *cs35l56_base,
+				      char __user *to, size_t count,
+				      loff_t *ppos)
+{
+	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
+		return -ENXIO;
+
+	if (!cs35l56_base->cal_data_valid)
+		return 0;
+
+	return simple_read_from_buffer(to, count, ppos, &cs35l56_base->cal_data,
+				       sizeof(cs35l56_base->cal_data));
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_cal_data_debugfs_read, "SND_SOC_CS35L56_SHARED");
+
+ssize_t cs35l56_cal_data_debugfs_write(struct cs35l56_base *cs35l56_base,
+				       const char __user *from, size_t count,
+				       loff_t *ppos)
+{
+	struct cirrus_amp_cal_data cal_data;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
+		return -ENXIO;
+
+	/* Only allow a full blob to be written */
+	if (*ppos || (count != sizeof(cal_data)))
+		return -EMSGSIZE;
+
+	ret = simple_write_to_buffer(&cal_data, sizeof(cal_data), ppos, from, count);
+	if (ret < 0)	/* a positive return is the byte count copied, not an error */
+		return ret;
+
+	ret = cs35l56_stash_calibration(cs35l56_base, &cal_data);
+	if (ret)
+		return ret;
+
+	return count;
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_cal_data_debugfs_write, "SND_SOC_CS35L56_SHARED");
+
+void cs35l56_create_cal_debugfs(struct cs35l56_base *cs35l56_base,
+				const struct cs35l56_cal_debugfs_fops *fops)
+{
+	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
+		return;
+
+	cs35l56_base->debugfs = cs_amp_create_debugfs(cs35l56_base->dev);
+
+	debugfs_create_file("calibrate",
+			    0200, cs35l56_base->debugfs, cs35l56_base,
+			    &fops->calibrate);
+	debugfs_create_file("cal_temperature",
+			    0200, cs35l56_base->debugfs, cs35l56_base,
+			    &fops->cal_temperature);
+	debugfs_create_file("cal_data",
+			    0644, cs35l56_base->debugfs, cs35l56_base,
+			    &fops->cal_data);
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_create_cal_debugfs, "SND_SOC_CS35L56_SHARED");
+
+void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base)
+{
+	debugfs_remove_recursive(cs35l56_base->debugfs);
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_remove_cal_debugfs, "SND_SOC_CS35L56_SHARED");
+
 int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 			     bool *fw_missing, unsigned int *fw_version)
 {
-- 
cgit v1.2.3


From cf6290eebe3cc4eb677d11aa061d10cb1df12ab9 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:17 +0100
Subject: ASoC: cs-amp-lib-test: Add cases for factory calibration helpers

Add test cases for the cs_amp_read_cal_coeffs() and
cs_amp_write_ambient_temp() functions.

In both cases the test is simply to confirm that the correct data
value(s) get passed back to the caller.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-7-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs-amp-lib.h         |  5 ++-
 sound/soc/codecs/cs-amp-lib-test.c | 75 +++++++++++++++++++++++++++++++++++++-
 sound/soc/codecs/cs-amp-lib.c      |  1 +
 3 files changed, 79 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h
index 5b094f8e8a6f..efa744133a35 100644
--- a/include/sound/cs-amp-lib.h
+++ b/include/sound/cs-amp-lib.h
@@ -72,8 +72,11 @@ struct cs_amp_test_hooks {
 	int (*write_cal_coeff)(struct cs_dsp *dsp,
 			       const struct cirrus_amp_cal_controls *controls,
 			       const char *ctl_name, u32 val);
-};
 
+	int (*read_cal_coeff)(struct cs_dsp *dsp,
+			      const struct cirrus_amp_cal_controls *controls,
+			      const char *ctl_name, u32 *val);
+};
 extern const struct cs_amp_test_hooks * const cs_amp_test_hooks;
 
 #endif /* CS_AMP_LIB_H */
diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c
index 2fde84309338..6878941a8f57 100644
--- a/sound/soc/codecs/cs-amp-lib-test.c
+++ b/sound/soc/codecs/cs-amp-lib-test.c
@@ -701,6 +701,77 @@ static void cs_amp_lib_test_write_cal_data_test(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, entry->value, data.calStatus);
 }
 
+static int cs_amp_lib_test_read_cal_coeff(struct cs_dsp *dsp,
+					  const struct cirrus_amp_cal_controls *controls,
+					  const char *ctl_name, u32 *val)
+{
+	struct kunit *test = kunit_get_current_test();
+
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctl_name);
+	KUNIT_EXPECT_PTR_EQ(test, controls, &cs_amp_lib_test_calibration_controls);
+
+	if (strcmp(ctl_name, controls->ambient) == 0)
+		*val = 19;
+	else if (strcmp(ctl_name, controls->calr) == 0)
+		*val = 1077;
+	else if (strcmp(ctl_name, controls->status) == 0)
+		*val = 2;
+	else
+		kunit_fail_current_test("Bad control '%s'\n", ctl_name);
+
+	return 0;
+}
+
+static void cs_amp_lib_test_read_cal_data_test(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct cirrus_amp_cal_data data = { 0 };
+	struct cs_dsp *dsp;
+	int ret;
+
+	dsp = kunit_kzalloc(test, sizeof(*dsp), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dsp);
+	dsp->dev = &priv->amp_dev->dev;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->read_cal_coeff,
+				   cs_amp_lib_test_read_cal_coeff);
+
+	ret = cs_amp_read_cal_coeffs(dsp, &cs_amp_lib_test_calibration_controls, &data);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	KUNIT_EXPECT_EQ(test, 19, data.calAmbient);
+	KUNIT_EXPECT_EQ(test, 1077, data.calR);
+	KUNIT_EXPECT_EQ(test, 2, data.calStatus);
+	KUNIT_EXPECT_NE(test, 0, data.calTime[0] | data.calTime[1]);
+}
+
+static void cs_amp_lib_test_write_ambient_test(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct cs_amp_lib_test_ctl_write_entry *entry;
+	struct cs_dsp *dsp;
+	int ret;
+
+	dsp = kunit_kzalloc(test, sizeof(*dsp), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dsp);
+	dsp->dev = &priv->amp_dev->dev;
+
+	/* Redirect calls to write firmware controls */
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->write_cal_coeff,
+				   cs_amp_lib_test_write_cal_coeff);
+
+	ret = cs_amp_write_ambient_temp(dsp, &cs_amp_lib_test_calibration_controls, 18);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	KUNIT_EXPECT_EQ(test, list_count_nodes(&priv->ctl_write_list), 1);
+
+	entry = list_first_entry(&priv->ctl_write_list, typeof(*entry), list);
+	KUNIT_EXPECT_STREQ(test, entry->name, cs_amp_lib_test_calibration_controls.ambient);
+	KUNIT_EXPECT_EQ(test, entry->value, 18);
+}
+
 static void cs_amp_lib_test_spkid_lenovo_not_present(struct kunit *test)
 {
 	struct cs_amp_lib_test_priv *priv = test->priv;
@@ -973,8 +1044,10 @@ static struct kunit_case cs_amp_lib_test_cases[] = {
 			 cs_amp_lib_test_get_cal_gen_params),
 	KUNIT_CASE(cs_amp_lib_test_get_efi_cal_empty_entry_test),
 
-	/* Tests for writing calibration data */
+	/* Tests for writing and reading calibration data */
 	KUNIT_CASE(cs_amp_lib_test_write_cal_data_test),
+	KUNIT_CASE(cs_amp_lib_test_read_cal_data_test),
+	KUNIT_CASE(cs_amp_lib_test_write_ambient_test),
 
 	/* Test cases for speaker ID */
 	KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_not_present),
diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c
index f9d5c4adf3f2..f9f79da3a9ea 100644
--- a/sound/soc/codecs/cs-amp-lib.c
+++ b/sound/soc/codecs/cs-amp-lib.c
@@ -541,6 +541,7 @@ EXPORT_SYMBOL_NS_GPL(cs_amp_create_debugfs, "SND_SOC_CS_AMP_LIB");
 static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = {
 	.get_efi_variable = cs_amp_get_efi_variable,
 	.write_cal_coeff = cs_amp_write_cal_coeff,
+	.read_cal_coeff = cs_amp_read_cal_coeff,
 };
 
 const struct cs_amp_test_hooks * const cs_amp_test_hooks =
-- 
cgit v1.2.3


From 959400caf51eb31f95d1ab754a285b5546ebd3e4 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:18 +0100
Subject: ASoC: cs-amp-lib: Return attributes from cs_amp_get_efi_variable()

Add a pointer argument to cs_amp_get_efi_variable() to optionally
return the EFI variable attributes.

Originally this function internally consumed the attributes from
efi.get_variable(). The calling code did not use the attributes
so this was a small simplification.

However, when writing to a pre-existing variable we would want to
pass the existing attributes to efi.set_variable(). This patch
deals with the change to return the attribute in preparation for
adding code to update the variable.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-8-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs-amp-lib.h         |  1 +
 sound/soc/codecs/cs-amp-lib-test.c | 23 +++++++++++++++++++++++
 sound/soc/codecs/cs-amp-lib.c      | 15 ++++++++++-----
 3 files changed, 34 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h
index efa744133a35..2e5616a5e1f7 100644
--- a/include/sound/cs-amp-lib.h
+++ b/include/sound/cs-amp-lib.h
@@ -66,6 +66,7 @@ static inline u64 cs_amp_cal_target_u64(const struct cirrus_amp_cal_data *data)
 struct cs_amp_test_hooks {
 	efi_status_t (*get_efi_variable)(efi_char16_t *name,
 					 efi_guid_t *guid,
+					 u32 *returned_attr,
 					 unsigned long *size,
 					 void *buf);
 
diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c
index 6878941a8f57..b00ba65badd5 100644
--- a/sound/soc/codecs/cs-amp-lib-test.c
+++ b/sound/soc/codecs/cs-amp-lib-test.c
@@ -89,6 +89,7 @@ static u64 cs_amp_lib_test_get_target_uid(struct kunit *test)
 /* Redirected get_efi_variable to simulate that the file is too short */
 static efi_status_t cs_amp_lib_test_get_efi_variable_nohead(efi_char16_t *name,
 							    efi_guid_t *guid,
+							    u32 *returned_attr,
 							    unsigned long *size,
 							    void *buf)
 {
@@ -121,6 +122,7 @@ static void cs_amp_lib_test_cal_data_too_short_test(struct kunit *test)
 /* Redirected get_efi_variable to simulate that the count is larger than the file */
 static efi_status_t cs_amp_lib_test_get_efi_variable_bad_count(efi_char16_t *name,
 							       efi_guid_t *guid,
+							       u32 *returned_attr,
 							       unsigned long *size,
 							       void *buf)
 {
@@ -164,6 +166,7 @@ static void cs_amp_lib_test_cal_count_too_big_test(struct kunit *test)
 /* Redirected get_efi_variable to simulate that the variable not found */
 static efi_status_t cs_amp_lib_test_get_efi_variable_none(efi_char16_t *name,
 							  efi_guid_t *guid,
+							  u32 *returned_attr,
 							  unsigned long *size,
 							  void *buf)
 {
@@ -191,6 +194,7 @@ static void cs_amp_lib_test_no_cal_data_test(struct kunit *test)
 /* Redirected get_efi_variable to simulate reading a cal data blob */
 static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name,
 						     efi_guid_t *guid,
+						     u32 *returned_attr,
 						     unsigned long *size,
 						     void *buf)
 {
@@ -217,11 +221,18 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name,
 
 	memcpy(buf, priv->cal_blob, priv->cal_blob->size);
 
+	if (returned_attr) {
+		*returned_attr = EFI_VARIABLE_NON_VOLATILE |
+				 EFI_VARIABLE_BOOTSERVICE_ACCESS |
+				 EFI_VARIABLE_RUNTIME_ACCESS;
+	}
+
 	return EFI_SUCCESS;
 }
 
 static efi_status_t cs_amp_lib_test_get_hp_cal_efi_variable(efi_char16_t *name,
 							    efi_guid_t *guid,
+							    u32 *returned_attr,
 							    unsigned long *size,
 							    void *buf)
 {
@@ -248,6 +259,12 @@ static efi_status_t cs_amp_lib_test_get_hp_cal_efi_variable(efi_char16_t *name,
 
 	memcpy(buf, priv->cal_blob, priv->cal_blob->size);
 
+	if (returned_attr) {
+		*returned_attr = EFI_VARIABLE_NON_VOLATILE |
+				 EFI_VARIABLE_BOOTSERVICE_ACCESS |
+				 EFI_VARIABLE_RUNTIME_ACCESS;
+	}
+
 	return EFI_SUCCESS;
 }
 
@@ -786,6 +803,7 @@ static void cs_amp_lib_test_spkid_lenovo_not_present(struct kunit *test)
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d0(efi_char16_t *name,
 							       efi_guid_t *guid,
+							       u32 *returned_attr,
 							       unsigned long *size,
 							       void *buf)
 {
@@ -804,6 +822,7 @@ static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d0(efi_char16_t *nam
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d1(efi_char16_t *name,
 							       efi_guid_t *guid,
+							       u32 *returned_attr,
 							       unsigned long *size,
 							       void *buf)
 {
@@ -822,6 +841,7 @@ static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d1(efi_char16_t *nam
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_00(efi_char16_t *name,
 							       efi_guid_t *guid,
+							       u32 *returned_attr,
 							       unsigned long *size,
 							       void *buf)
 {
@@ -873,6 +893,7 @@ static void cs_amp_lib_test_spkid_lenovo_illegal(struct kunit *test)
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_buf_too_small(efi_char16_t *name,
 								   efi_guid_t *guid,
+								   u32 *returned_attr,
 								   unsigned long *size,
 								   void *buf)
 {
@@ -893,6 +914,7 @@ static void cs_amp_lib_test_spkid_lenovo_oversize(struct kunit *test)
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_hp_30(efi_char16_t *name,
 							   efi_guid_t *guid,
+							   u32 *returned_attr,
 							   unsigned long *size,
 							   void *buf)
 {
@@ -911,6 +933,7 @@ static efi_status_t cs_amp_lib_test_get_efi_variable_hp_30(efi_char16_t *name,
 
 static efi_status_t cs_amp_lib_test_get_efi_variable_hp_31(efi_char16_t *name,
 							   efi_guid_t *guid,
+							   u32 *returned_attr,
 							   unsigned long *size,
 							   void *buf)
 {
diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c
index f9f79da3a9ea..c5791cbeb5b8 100644
--- a/sound/soc/codecs/cs-amp-lib.c
+++ b/sound/soc/codecs/cs-amp-lib.c
@@ -245,15 +245,20 @@ EXPORT_SYMBOL_NS_GPL(cs_amp_write_ambient_temp, "SND_SOC_CS_AMP_LIB");
 
 static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name,
 					    efi_guid_t *guid,
+					    u32 *returned_attr,
 					    unsigned long *size,
 					    void *buf)
 {
 	u32 attr;
 
-	KUNIT_STATIC_STUB_REDIRECT(cs_amp_get_efi_variable, name, guid, size, buf);
+	if (!returned_attr)
+		returned_attr = &attr;
+
+	KUNIT_STATIC_STUB_REDIRECT(cs_amp_get_efi_variable, name, guid,
+				   returned_attr, size, buf);
 
 	if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE))
-		return efi.get_variable(name, guid, &attr, size, buf);
+		return efi.get_variable(name, guid, returned_attr, size, buf);
 
 	return EFI_NOT_FOUND;
 }
@@ -288,7 +293,7 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 	for (i = 0; i < ARRAY_SIZE(cs_amp_lib_cal_efivars); i++) {
 		status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name,
 						 cs_amp_lib_cal_efivars[i].guid,
-						 &data_size, NULL);
+						 NULL, &data_size, NULL);
 		if (status == EFI_BUFFER_TOO_SMALL)
 			break;
 	}
@@ -308,7 +313,7 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 
 	status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name,
 					 cs_amp_lib_cal_efivars[i].guid,
-					 &data_size, data);
+					 NULL, &data_size, data);
 	if (status != EFI_SUCCESS) {
 		ret = -EINVAL;
 		goto err;
@@ -452,7 +457,7 @@ static int cs_amp_get_efi_byte_spkid(struct device *dev, const struct cs_amp_spk
 	int i, ret;
 
 	size = sizeof(spkid);
-	status = cs_amp_get_efi_variable(info->name, info->guid, &size, &spkid);
+	status = cs_amp_get_efi_variable(info->name, info->guid, NULL, &size, &spkid);
 	ret = cs_amp_convert_efi_status(status);
 	if (ret < 0)
 		return ret;
-- 
cgit v1.2.3


From 2b62e66626f05e277c8fdeb50d4c1e0cbab2fe0e Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:19 +0100
Subject: ASoC: cs-amp-lib: Add function to write calibration to UEFI

Add cs_amp_set_efi_calibration_data() to write an amp calibration
blob to UEFI calibration variable.

The UEFI variable will be updated or created as necessary.

- If a Vendor-specific variable exists it will be updated,
  else if the Cirrus variable exists it will be updated
  else the Cirrus variable will be created.

Some collateral changes are required:

- cs_amp_convert_efi_status() now specifically handles
  EFI_WRITE_PROTECTED error.

- cs_amp_get_cal_efi_buffer() can optionally return the name,
  guid and attr of the variable it found.

- cs_amp_get_cal_efi_buffer() will update the 'size' field of
  the returned data blob if it is zero. The BIOS could have
  pre-allocated the UEFI variable as zero-filled space.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-9-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs-amp-lib.h    |   2 +
 sound/soc/codecs/cs-amp-lib.c | 190 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h
index 2e5616a5e1f7..240bc53a9307 100644
--- a/include/sound/cs-amp-lib.h
+++ b/include/sound/cs-amp-lib.h
@@ -55,6 +55,8 @@ int cs_amp_write_ambient_temp(struct cs_dsp *dsp,
 			      u32 temp);
 int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index,
 				    struct cirrus_amp_cal_data *out_data);
+int cs_amp_set_efi_calibration_data(struct device *dev, int amp_index, int num_amps,
+				    const struct cirrus_amp_cal_data *in_data);
 int cs_amp_get_vendor_spkid(struct device *dev);
 struct dentry *cs_amp_create_debugfs(struct device *dev);
 
diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c
index c5791cbeb5b8..7038574e3f4b 100644
--- a/sound/soc/codecs/cs-amp-lib.c
+++ b/sound/soc/codecs/cs-amp-lib.c
@@ -13,6 +13,7 @@
 #include <linux/firmware/cirrus/cs_dsp.h>
 #include <linux/math64.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/overflow.h>
 #include <linux/slab.h>
 #include <linux/timekeeping.h>
@@ -49,9 +50,16 @@ static const struct cs_amp_lib_cal_efivar {
 	},
 };
 
+#define CS_AMP_CAL_DEFAULT_EFI_ATTR			\
+		(EFI_VARIABLE_NON_VOLATILE |		\
+		 EFI_VARIABLE_BOOTSERVICE_ACCESS |	\
+		 EFI_VARIABLE_RUNTIME_ACCESS)
+
 /* Offset from Unix time to Windows time (100ns since 1 Jan 1601) */
 #define UNIX_TIME_TO_WINDOWS_TIME_OFFSET	116444736000000000ULL
 
+static DEFINE_MUTEX(cs_amp_efi_cal_write_lock);
+
 static u64 cs_amp_time_now_in_windows_time(void)
 {
 	u64 time_in_100ns = div_u64(ktime_get_real_ns(), 100);
@@ -263,6 +271,20 @@ static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name,
 	return EFI_NOT_FOUND;
 }
 
+static efi_status_t cs_amp_set_efi_variable(efi_char16_t *name,
+					    efi_guid_t *guid,
+					    u32 attr,
+					    unsigned long size,
+					    void *buf)
+{
+	KUNIT_STATIC_STUB_REDIRECT(cs_amp_set_efi_variable, name, guid, attr, size, buf);
+
+	if (!efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE))
+		return EFI_NOT_FOUND;
+
+	return efi.set_variable(name, guid, attr, size, buf);
+}
+
 static int cs_amp_convert_efi_status(efi_status_t status)
 {
 	switch (status) {
@@ -272,6 +294,7 @@ static int cs_amp_convert_efi_status(efi_status_t status)
 		return -ENOENT;
 	case EFI_BUFFER_TOO_SMALL:
 		return -EFBIG;
+	case EFI_WRITE_PROTECTED:
 	case EFI_UNSUPPORTED:
 	case EFI_ACCESS_DENIED:
 	case EFI_SECURITY_VIOLATION:
@@ -281,7 +304,10 @@ static int cs_amp_convert_efi_status(efi_status_t status)
 	}
 }
 
-static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
+static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev,
+							     efi_char16_t **name,
+							     efi_guid_t **guid,
+							     u32 *attr)
 {
 	struct cirrus_amp_efi_data *efi_data;
 	unsigned long data_size = 0;
@@ -293,7 +319,7 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 	for (i = 0; i < ARRAY_SIZE(cs_amp_lib_cal_efivars); i++) {
 		status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name,
 						 cs_amp_lib_cal_efivars[i].guid,
-						 NULL, &data_size, NULL);
+						 attr, &data_size, NULL);
 		if (status == EFI_BUFFER_TOO_SMALL)
 			break;
 	}
@@ -301,6 +327,12 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 	if (status != EFI_BUFFER_TOO_SMALL)
 		return ERR_PTR(-ENOENT);
 
+	if (name)
+		*name = cs_amp_lib_cal_efivars[i].name;
+
+	if (guid)
+		*guid = cs_amp_lib_cal_efivars[i].guid;
+
 	if (data_size < sizeof(*efi_data)) {
 		dev_err(dev, "EFI cal variable truncated\n");
 		return ERR_PTR(-EOVERFLOW);
@@ -313,7 +345,7 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 
 	status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name,
 					 cs_amp_lib_cal_efivars[i].guid,
-					 NULL, &data_size, data);
+					 attr, &data_size, data);
 	if (status != EFI_SUCCESS) {
 		ret = -EINVAL;
 		goto err;
@@ -329,6 +361,10 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev)
 		goto err;
 	}
 
+	/* This could be zero-filled space pre-allocated by the BIOS */
+	if (efi_data->size == 0)
+		efi_data->size = data_size;
+
 	return efi_data;
 
 err:
@@ -338,6 +374,20 @@ err:
 	return ERR_PTR(ret);
 }
 
+static int cs_amp_set_cal_efi_buffer(struct device *dev,
+				     efi_char16_t *name,
+				     efi_guid_t *guid,
+				     u32 attr,
+				     struct cirrus_amp_efi_data *data)
+{
+	efi_status_t status;
+
+	status = cs_amp_set_efi_variable(name, guid, attr,
+					 struct_size(data, data, data->count), data);
+
+	return cs_amp_convert_efi_status(status);
+}
+
 static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index,
 					    struct cirrus_amp_cal_data *out_data)
 {
@@ -345,7 +395,7 @@ static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid,
 	struct cirrus_amp_cal_data *cal = NULL;
 	int i, ret;
 
-	efi_data = cs_amp_get_cal_efi_buffer(dev);
+	efi_data = cs_amp_get_cal_efi_buffer(dev, NULL, NULL, NULL);
 	if (IS_ERR(efi_data))
 		return PTR_ERR(efi_data);
 
@@ -397,6 +447,98 @@ static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid,
 	return ret;
 }
 
+/* Merge @in_data into the EFI calibration variable, creating or growing it as needed. */
+static int _cs_amp_set_efi_calibration_data(struct device *dev, int amp_index, int num_amps,
+					    const struct cirrus_amp_cal_data *in_data)
+{
+	u64 cal_target = cs_amp_cal_target_u64(in_data);
+	unsigned long num_entries;
+	struct cirrus_amp_efi_data *data __free(kfree) = NULL;
+	efi_char16_t *name = CIRRUS_LOGIC_CALIBRATION_EFI_NAME;
+	efi_guid_t *guid = &CIRRUS_LOGIC_CALIBRATION_EFI_GUID;
+	u32 attr = CS_AMP_CAL_DEFAULT_EFI_ATTR;
+	int i, ret;
+
+	if (cal_target == 0)
+		return -EINVAL;
+
+	data = cs_amp_get_cal_efi_buffer(dev, &name, &guid, &attr);
+	ret = PTR_ERR_OR_ZERO(data);
+	if (ret == -ENOENT) {
+		data = NULL;
+		goto alloc_new;
+	} else if (ret) {
+		return ret;
+	}
+
+	/*
+	 * If the EFI variable is just zero-filled reserved space the count
+	 * must be set. It is the space left after the fixed part of the
+	 * struct, so use sizeof(*data): sizeof(data) is only the pointer size.
+	 */
+	if (data->count == 0)
+		data->count = (data->size - sizeof(*data)) / sizeof(data->data[0]);
+
+	if (amp_index < 0) {
+		/* Is there already a slot for this target? */
+		for (amp_index = 0; amp_index < data->count; amp_index++) {
+			if (cs_amp_cal_target_u64(&data->data[amp_index]) == cal_target)
+				break;
+		}
+
+		/* Else find an empty slot */
+		if (amp_index >= data->count) {
+			for (amp_index = 0; amp_index < data->count; amp_index++) {
+				if ((data->data[amp_index].calTime[0] == 0) &&
+				    (data->data[amp_index].calTime[1] == 0))
+					break;
+			}
+		}
+	} else {
+		/*
+		 * If the index is forced there could be another active
+		 * slot with the same calTarget. So deduplicate.
+		 */
+		for (i = 0; i < data->count; i++) {
+			if (i == amp_index)
+				continue;
+
+			if ((data->data[i].calTime[0] == 0) && (data->data[i].calTime[1] == 0))
+				continue;
+
+			if (cs_amp_cal_target_u64(&data->data[i]) == cal_target)
+				memset(data->data[i].calTime, 0, sizeof(data->data[i].calTime));
+		}
+	}
+
+alloc_new:
+	if (amp_index < 0)
+		amp_index = 0;
+
+	num_entries = max(num_amps, amp_index + 1);
+	if (!data || (data->count < num_entries)) {
+		struct cirrus_amp_efi_data *old_data __free(kfree) = no_free_ptr(data);
+		unsigned int new_data_size = struct_size(data, data, num_entries);
+
+		data = kzalloc(new_data_size, GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+
+		if (old_data)
+			memcpy(data, old_data, struct_size(old_data, data, old_data->count));
+
+		data->count = num_entries;
+		data->size = new_data_size;
+	}
+
+	data->data[amp_index] = *in_data;
+	ret = cs_amp_set_cal_efi_buffer(dev, name, guid, attr, data);
+	if (ret)
+		dev_err(dev, "Failed writing calibration to EFI: %d\n", ret);
+
+	return ret;
+}
+
 /**
  * cs_amp_get_efi_calibration_data - get an entry from calibration data in EFI.
  * @dev:	struct device of the caller.
@@ -443,6 +585,46 @@ int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_
 }
 EXPORT_SYMBOL_NS_GPL(cs_amp_get_efi_calibration_data, "SND_SOC_CS_AMP_LIB");
 
+/**
+ * cs_amp_set_efi_calibration_data - write a calibration data entry to EFI.
+ * @dev:	struct device of the caller.
+ * @amp_index:	Entry index to use, or -1 to use any available slot.
+ * @num_amps:	Maximum number of amps to reserve slots for, or -1 to ignore.
+ * @in_data:	struct cirrus_amp_cal_data entry to be written to EFI.
+ *
+ * If a Vendor-specific variable exists it will be updated, else if the Cirrus
+ * variable exists it will be updated, else the Cirrus variable will be created.
+ *
+ * If amp_index >= 0 the data will be placed in this entry of the calibration
+ * data array, overwriting what was in that entry. Any other entries with the
+ * same calTarget will be marked empty.
+ *
+ * If amp_index < 0 and in_data->calTarget matches any existing entry, that
+ * entry will be overwritten. Else the first available free entry will be used,
+ * extending the size of the EFI variable if there are no free entries.
+ *
+ * If num_amps > 0 the EFI variable will be sized to contain at least this
+ * many calibration entries, with any new entries marked empty.
+ *
+ * Return: 0 if the write was successful, -EINVAL if in_data->calTarget is
+ *	   zero, -ENOMEM if allocation failed, -ENOENT if built without EFI,
+ *	   -EFBIG if space could not be made in the EFI file to add the entry,
+ *	   -EACCES if it was not possible to read or write the EFI variable.
+ */
+int cs_amp_set_efi_calibration_data(struct device *dev, int amp_index, int num_amps,
+				    const struct cirrus_amp_cal_data *in_data)
+{
+	if (IS_ENABLED(CONFIG_EFI) || IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST)) {
+		scoped_guard(mutex, &cs_amp_efi_cal_write_lock) {
+			return _cs_amp_set_efi_calibration_data(dev, amp_index,
+								num_amps, in_data);
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_NS_GPL(cs_amp_set_efi_calibration_data, "SND_SOC_CS_AMP_LIB");
+
 struct cs_amp_spkid_efi {
 	efi_char16_t *name;
 	efi_guid_t *guid;
-- 
cgit v1.2.3


From ef24466ee1912997c2bd526194006bbca424c24f Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:20 +0100
Subject: ASoC: cs35l56: Add calibration command to store into UEFI

Add a new command 'store_uefi' to the calibrate debugfs file.
Writing this command will call cs_amp_set_efi_calibration_data()
to save the new data into a UEFI variable. This is intended to
be used after a successful factory calibration.

On systems without UEFI the write to the debugfs file will
return an error.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-10-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  1 +
 sound/soc/codecs/cs35l56-shared.c | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 349b896ee737..82559be0f249 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -321,6 +321,7 @@ struct cs35l56_base {
 	bool can_hibernate;
 	bool cal_data_valid;
 	s8 cal_index;
+	u8 num_amps;
 	struct cirrus_amp_cal_data cal_data;
 	struct gpio_desc *reset_gpio;
 	struct cs35l56_spi_payload *spi_payload_buf;
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index eeb830e3f743..bbacac6bda81 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -1105,9 +1105,9 @@ ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
 					const char __user *from, size_t count,
 					loff_t *ppos)
 {
-	static const char * const options[] = { "factory" };
-	char buf[8] = { 0 };
-	int ret;
+	static const char * const options[] = { "factory", "store_uefi" };
+	char buf[11] = { 0 };
+	int num_amps, ret;
 
 	if (!IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_DEBUGFS_COMMON))
 		return -ENXIO;
@@ -1125,6 +1125,21 @@ ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
 		if (ret < 0)
 			return ret;
 		break;
+	case 1:
+		if (!cs35l56_base->cal_data_valid)
+			return -ENODATA;
+
+		num_amps = cs35l56_base->num_amps;
+		if (num_amps == 0)
+			num_amps = -1;
+
+		ret = cs_amp_set_efi_calibration_data(cs35l56_base->dev,
+						      cs35l56_base->cal_index,
+						      num_amps,
+						      &cs35l56_base->cal_data);
+		if (ret < 0)
+			return ret;
+		break;
 	default:
 		return -ENXIO;
 	}
-- 
cgit v1.2.3


From 4795375d8aa072e9aacb0b278e6203c6ca41816a Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 21 Oct 2025 11:50:22 +0100
Subject: ASoC: cs-amp-lib-test: Add test cases for
 cs_amp_set_efi_calibration_data()

Add a set of test cases for cs_amp_set_efi_calibration_data().

Broadly there are two types of behavior being tested:

How the EFI is updated:
- Create a new EFI
- Overwrite part of existing content
- Overwrite part of zero-filled preallocated content
- Grow the file to append new content

And how the location within the content is chosen:
- Overwrite a specific array entry
- Overwrite an entry with the same calTarget (silicon ID)
- Overwrite a free entry
- Append after existing data

Plus some cases for error conditions.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251021105022.1013685-12-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs-amp-lib.h         |    5 +
 sound/soc/codecs/cs-amp-lib-test.c | 1399 +++++++++++++++++++++++++++++++++++-
 sound/soc/codecs/cs-amp-lib.c      |    1 +
 3 files changed, 1397 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h
index 240bc53a9307..61e00017c9aa 100644
--- a/include/sound/cs-amp-lib.h
+++ b/include/sound/cs-amp-lib.h
@@ -71,6 +71,11 @@ struct cs_amp_test_hooks {
 					 u32 *returned_attr,
 					 unsigned long *size,
 					 void *buf);
+	efi_status_t (*set_efi_variable)(efi_char16_t *name,
+					 efi_guid_t *guid,
+					 u32 attr,
+					 unsigned long size,
+					 void *buf);
 
 	int (*write_cal_coeff)(struct cs_dsp *dsp,
 			       const struct cirrus_amp_cal_controls *controls,
diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c
index b00ba65badd5..51799a9c86a3 100644
--- a/sound/soc/codecs/cs-amp-lib-test.c
+++ b/sound/soc/codecs/cs-amp-lib-test.c
@@ -19,6 +19,10 @@
 #include <linux/random.h>
 #include <sound/cs-amp-lib.h>
 
+#define CIRRUS_LOGIC_CALIBRATION_EFI_NAME L"CirrusSmartAmpCalibrationData"
+#define CIRRUS_LOGIC_CALIBRATION_EFI_GUID \
+	EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3)
+
 #define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker"
 #define LENOVO_SPEAKER_ID_EFI_GUID \
 	EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82)
@@ -27,6 +31,10 @@
 #define HP_SPEAKER_ID_EFI_GUID \
 	EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e)
 
+#define HP_CALIBRATION_EFI_NAME L"SmartAmpCalibrationData"
+#define HP_CALIBRATION_EFI_GUID \
+	EFI_GUID(0x53559579, 0x8753, 0x4f5c, 0x91, 0x30, 0xe8, 0x2a, 0xcf, 0xb8, 0xd8, 0x93)
+
 KUNIT_DEFINE_ACTION_WRAPPER(faux_device_destroy_wrapper, faux_device_destroy,
 			    struct faux_device *)
 
@@ -35,6 +43,7 @@ struct cs_amp_lib_test_priv {
 
 	struct cirrus_amp_efi_data *cal_blob;
 	struct list_head ctl_write_list;
+	u32 efi_attr;
 };
 
 struct cs_amp_lib_test_ctl_write_entry {
@@ -48,6 +57,20 @@ struct cs_amp_lib_test_param {
 	int amp_index;
 };
 
+static struct cirrus_amp_efi_data *cs_amp_lib_test_cal_blob_dup(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct cirrus_amp_efi_data *temp;
+
+	KUNIT_ASSERT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	temp = kunit_kmalloc(test, priv->cal_blob->size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, temp);
+	memcpy(temp, priv->cal_blob, priv->cal_blob->size);
+
+	return temp;
+}
+
 static void cs_amp_lib_test_init_dummy_cal_blob(struct kunit *test, int num_amps)
 {
 	struct cs_amp_lib_test_priv *priv = test->priv;
@@ -68,9 +91,15 @@ static void cs_amp_lib_test_init_dummy_cal_blob(struct kunit *test, int num_amps
 	for (i = 0; i < num_amps; i++)
 		priv->cal_blob->data[i].calTime[0] |= 1;
 
-	/* Ensure that all UIDs are non-zero and unique. */
-	for (i = 0; i < num_amps; i++)
+	/*
+	 * Ensure that all UIDs are non-zero and unique.
+	 * Make both words non-zero and not equal values, so that
+	 * tests can verify that both words were checked or changed.
+	 */
+	for (i = 0; i < num_amps; i++) {
 		*(u8 *)&priv->cal_blob->data[i].calTarget[0] = i + 1;
+		*(u8 *)&priv->cal_blob->data[i].calTarget[1] = i;
+	}
 }
 
 static u64 cs_amp_lib_test_get_target_uid(struct kunit *test)
@@ -198,9 +227,8 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name,
 						     unsigned long *size,
 						     void *buf)
 {
-	static const efi_char16_t expected_name[] = L"CirrusSmartAmpCalibrationData";
-	static const efi_guid_t expected_guid =
-		EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3);
+	static const efi_char16_t expected_name[] = CIRRUS_LOGIC_CALIBRATION_EFI_NAME;
+	static const efi_guid_t expected_guid = CIRRUS_LOGIC_CALIBRATION_EFI_GUID;
 	struct kunit *test = kunit_get_current_test();
 	struct cs_amp_lib_test_priv *priv = test->priv;
 
@@ -222,9 +250,56 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name,
 	memcpy(buf, priv->cal_blob, priv->cal_blob->size);
 
 	if (returned_attr) {
-		*returned_attr = EFI_VARIABLE_NON_VOLATILE |
-				 EFI_VARIABLE_BOOTSERVICE_ACCESS |
-				 EFI_VARIABLE_RUNTIME_ACCESS;
+		if (priv->efi_attr)
+			*returned_attr = priv->efi_attr;
+		else
+			*returned_attr = EFI_VARIABLE_NON_VOLATILE |
+					 EFI_VARIABLE_BOOTSERVICE_ACCESS |
+					 EFI_VARIABLE_RUNTIME_ACCESS;
+	}
+
+	return EFI_SUCCESS;
+}
+
+#define CS_AMP_LIB_ZERO_FILLED_BLOB_SIZE \
+	struct_size_t(struct cirrus_amp_efi_data, data, 8)
+
+/* Redirected get_efi_variable to simulate reading a prealloced zero-filled blob */
+static efi_status_t cs_amp_lib_test_get_efi_variable_all_zeros(efi_char16_t *name,
+							       efi_guid_t *guid,
+							       u32 *returned_attr,
+							       unsigned long *size,
+							       void *buf)
+{
+	static const efi_char16_t expected_name[] = CIRRUS_LOGIC_CALIBRATION_EFI_NAME;
+	static const efi_guid_t expected_guid = CIRRUS_LOGIC_CALIBRATION_EFI_GUID;
+	struct kunit *test = kunit_get_current_test();
+	struct cs_amp_lib_test_priv *priv = test->priv;
+
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test, name);
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test, guid);
+
+	if (memcmp(name, expected_name, sizeof(expected_name)) ||
+	    efi_guidcmp(*guid, expected_guid))
+		return -EFI_NOT_FOUND;
+
+	if (!buf) {
+		*size = CS_AMP_LIB_ZERO_FILLED_BLOB_SIZE;
+		return EFI_BUFFER_TOO_SMALL;
+	}
+
+	KUNIT_ASSERT_EQ(test, *size, struct_size(priv->cal_blob, data, 8));
+	priv->cal_blob = kunit_kzalloc(test, CS_AMP_LIB_ZERO_FILLED_BLOB_SIZE, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, priv->cal_blob);
+	memset(buf, 0, CS_AMP_LIB_ZERO_FILLED_BLOB_SIZE);
+
+	if (returned_attr) {
+		if (priv->efi_attr)
+			*returned_attr = priv->efi_attr;
+		else
+			*returned_attr = EFI_VARIABLE_NON_VOLATILE |
+					 EFI_VARIABLE_BOOTSERVICE_ACCESS |
+					 EFI_VARIABLE_RUNTIME_ACCESS;
 	}
 
 	return EFI_SUCCESS;
@@ -789,6 +864,1292 @@ static void cs_amp_lib_test_write_ambient_test(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, entry->value, 18);
 }
 
+static efi_status_t cs_amp_lib_test_set_efi_variable(efi_char16_t *name,
+						     efi_guid_t *guid,
+						     u32 attr,
+						     unsigned long size,
+						     void *buf)
+{
+	static const efi_char16_t expected_name[] = CIRRUS_LOGIC_CALIBRATION_EFI_NAME;
+	static const efi_guid_t expected_guid = CIRRUS_LOGIC_CALIBRATION_EFI_GUID;
+	struct kunit *test = kunit_get_current_test();
+	struct cs_amp_lib_test_priv *priv = test->priv;
+
+	KUNIT_ASSERT_NOT_NULL(test, name);
+	KUNIT_ASSERT_NOT_NULL(test, guid);
+
+	if (memcmp(name, expected_name, sizeof(expected_name)) ||
+	    efi_guidcmp(*guid, expected_guid))
+		return EFI_NOT_FOUND; /* EFI status codes are not errnos; do not negate */
+
+	KUNIT_ASSERT_NOT_NULL(test, buf);
+	KUNIT_ASSERT_NE(test, 0, size);
+
+	kunit_kfree(test, priv->cal_blob); /* replace any previous blob with the written data */
+	priv->cal_blob = kunit_kmalloc(test, size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, priv->cal_blob);
+	memcpy(priv->cal_blob, buf, size);
+	priv->efi_attr = attr; /* recorded so tests can check the attributes that were written */
+
+	return EFI_SUCCESS;
+}
+
+static efi_status_t cs_amp_lib_test_set_efi_variable_denied(efi_char16_t *name,
+							    efi_guid_t *guid,
+							    u32 attr,
+							    unsigned long size,
+							    void *buf)
+{
+	return EFI_WRITE_PROTECTED;
+}
+
+#define CS_AMP_CAL_DEFAULT_EFI_ATTR			\
+		(EFI_VARIABLE_NON_VOLATILE |		\
+		 EFI_VARIABLE_BOOTSERVICE_ACCESS |	\
+		 EFI_VARIABLE_RUNTIME_ACCESS)
+
+static void cs_amp_lib_test_create_new_cal_efi(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_none);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* For unspecified number of amps */
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, CS_AMP_CAL_DEFAULT_EFI_ATTR, priv->efi_attr);
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 1);
+	KUNIT_EXPECT_LE(test, priv->cal_blob->count, 8);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	for (i = 1; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* For 2 amps */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 2, &data));
+	KUNIT_EXPECT_EQ(test, CS_AMP_CAL_DEFAULT_EFI_ATTR, priv->efi_attr);
+	KUNIT_EXPECT_EQ(test, 2, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 2), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+
+	/* For 4 amps */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 4, &data));
+	KUNIT_EXPECT_EQ(test, 4, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 4), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+
+	/* For 6 amps */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+}
+
+static void cs_amp_lib_test_create_new_cal_efi_indexed(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_none);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* In slot 0 */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 0, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* In slot 1 */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[0], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* In slot 5 */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 5, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[5], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[0], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+}
+
+static void cs_amp_lib_test_create_new_cal_efi_indexed_no_max(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_none);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* In slot 0 with unspecified number of amps */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 0, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 1);
+	KUNIT_EXPECT_LE(test, priv->cal_blob->count, 8);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	for (i = 1; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* In slot 1 with unspecified number of amps  */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 2);
+	KUNIT_EXPECT_LE(test, priv->cal_blob->count, 8);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[0], sizeof(data)));
+	for (i = 2; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* In slot 5 with unspecified number of amps  */
+	priv->cal_blob = NULL;
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 5, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 6);
+	KUNIT_EXPECT_LE(test, priv->cal_blob->count, 8);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	for (i = 0; (i < 5) && (i < priv->cal_blob->count); i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[5], sizeof(data));
+	for (i = 6; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+}
+
+static void cs_amp_lib_test_grow_append_cal_efi(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* Initially 1 used entry grown to 2 entries */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 2, &data));
+	KUNIT_EXPECT_EQ(test, CS_AMP_CAL_DEFAULT_EFI_ATTR, priv->efi_attr);
+	KUNIT_EXPECT_EQ(test, 2, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 2), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+
+	/* Initially 1 entry grown to 4 entries */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 4, &data));
+	KUNIT_EXPECT_EQ(test, 4, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 4), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+
+	/* Initially 2 entries grown to 4 entries */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 4, &data));
+	KUNIT_EXPECT_EQ(test, 4, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 4), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+
+	/* Initially 1 entry grown to 6 entries */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 4 entries grown to 6 entries */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+}
+
+static void cs_amp_lib_test_grow_append_cal_efi_indexed(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* Initially 1 entry grown to 2 entries using slot 1 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, 2, &data));
+	KUNIT_EXPECT_EQ(test, 2, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 2), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+
+	/* Initially 1 entry grown to 6 entries using slot 1 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 2 entries grown to 6 entries using slot 2 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 2, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 2 entries grown to 6 entries using slot 4 */
+	kunit_kfree(test, original_blob);
+	kunit_kfree(test, priv->cal_blob);
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 4, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+}
+
+static void cs_amp_lib_test_cal_efi_all_zeros_add_first(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	/* Simulate a BIOS reserving EFI space that is entirely zero-filled. */
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_all_zeros);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/*
+	 * Add an entry. The header should be filled in to match the
+	 * original EFI variable size.
+	 */
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	for (i = 1; i < priv->cal_blob->count; i++) {
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[0]);
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[1]);
+	}
+}
+
+static void cs_amp_lib_test_cal_efi_all_zeros_add_first_no_shrink(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	/* Simulate a BIOS reserving EFI space that is entirely zero-filled. */
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_all_zeros);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/*
+	 * Add an entry. The header should be filled in to match the
+	 * original EFI variable size. A number of amps less than the
+	 * available preallocated space does not shrink the EFI variable.
+	 */
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 4, &data));
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	for (i = 1; i < priv->cal_blob->count; i++) {
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[0]);
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[1]);
+	}
+}
+
+static void cs_amp_lib_test_cal_efi_all_zeros_add_first_indexed(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	/* Simulate a BIOS reserving EFI space that is entirely zero-filled. */
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_all_zeros);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/*
+	 * Write entry to slot 2. The header should be filled in to match
+	 * the original EFI variable size.
+	 */
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 2, -1, &data));
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[1]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[1]);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[2], sizeof(data));
+	for (i = 3; i < priv->cal_blob->count; i++) {
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[0]);
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[1]);
+	}
+}
+
+static void cs_amp_lib_test_cal_efi_all_zeros_add_first_indexed_no_shrink(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	/* Simulate a BIOS reserving EFI space that is entirely zero-filled. */
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable_all_zeros);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/*
+	 * Write entry to slot 2. The header should be filled in to match
+	 * the original EFI variable size. A number of amps less than the
+	 * available preallocated space does not shrink the EFI variable.
+	 */
+	get_random_bytes(&data, sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 2, 4, &data));
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[1]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[1]);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[2], sizeof(data));
+	for (i = 3; i < priv->cal_blob->count; i++) {
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[0]);
+		KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[i].calTime[1]);
+	}
+}
+
+static void cs_amp_lib_test_grow_append_cal_efi_indexed_no_max(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* Initially 1 entry adding slot 1 */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 2);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	for (i = 2; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* Initially 1 entry adding slot 3 */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 3, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 4);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	for (i = 4; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* Initially 2 entries adding slot 3 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 3, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 4); /* slot 3 must exist */
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	for (i = 4; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* Initially 4 entries adding slot 4 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 4, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 5); /* slot 4 must exist */
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	for (i = 5; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+
+	/* Initially 4 entries adding slot 6 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 6, -1, &data));
+	KUNIT_EXPECT_GE(test, priv->cal_blob->count, 7); /* slot 6 must exist */
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, priv->cal_blob->count),
+			priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[6], sizeof(data));
+	for (i = 7; i < priv->cal_blob->count; i++)
+		KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[i], sizeof(data)));
+}
+
+static void cs_amp_lib_test_grow_cal_efi_replace_indexed(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* Initially 1 entry, grown to 2 entries, overwriting slot 0 selected by index */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match an existing entry */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 0, 2, &data));
+	KUNIT_EXPECT_EQ(test, 2, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 2), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+
+	/* Initially 2 entries, grown to 4 entries, overwriting slot 1 selected by index */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match an existing entry */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, 4, &data));
+	KUNIT_EXPECT_EQ(test, 4, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 4), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+
+	/* Initially 4 entries, grown to 6 entries, overwriting slot 1 selected by index */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match an existing entry */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 4 entries, grown to 6 entries, overwriting slot 3 selected by index */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match an existing entry */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 3, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 6 entries, grown to 8 entries, overwriting slot 4 selected by index */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa; /* won't match an existing entry */
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 4, 8, &data));
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[6], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[7], sizeof(data)));
+}
+
+static void cs_amp_lib_test_grow_cal_efi_replace_by_uid(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/* Initially 1 entry, grown to 2 entries, overwriting slot 0 found by calTarget */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 1);
+	KUNIT_ASSERT_EQ(test, 1, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[0].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 2, &data));
+	KUNIT_EXPECT_EQ(test, 2, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 2), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[1], sizeof(data)));
+
+	/* Initially 2 entries, grown to 4 entries, overwriting slot 1 found by calTarget */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 2);
+	KUNIT_ASSERT_EQ(test, 2, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[1].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 4, &data));
+	KUNIT_EXPECT_EQ(test, 4, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 4), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[2], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[3], sizeof(data)));
+
+	/* Initially 4 entries, grown to 6 entries, overwriting slot 1 found by calTarget */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[1].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 4 entries, grown to 6 entries, overwriting slot 3 found by calTarget */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[3].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 6, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[4], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[5], sizeof(data)));
+
+	/* Initially 6 entries, grown to 8 entries, overwriting slot 4 found by calTarget */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[4].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, 8, &data));
+	KUNIT_EXPECT_EQ(test, 8, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 8), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[6], sizeof(data)));
+	KUNIT_EXPECT_TRUE(test, mem_is_zero(&priv->cal_blob->data[7], sizeof(data)));
+}
+
+static void cs_amp_lib_test_cal_efi_replace_by_uid(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+
+	/* Replace the entry whose calTarget matches slot 0; no slot index is given */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[0].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry whose calTarget matches slot 4; no slot index is given */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[4].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry whose calTarget matches slot 3; no slot index is given */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[3].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry whose calTarget matches slot 5; no slot index is given */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[5].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[5], sizeof(data));
+}
+
+static void cs_amp_lib_test_cal_efi_replace_by_index(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+
+	/*
+	 * Replace the entry in slot 0, selected by index.
+	 * data.calTarget is deliberately set to differ from the slot's current
+	 * calTarget, to check that the index forces that slot to be used.
+	 */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = ~priv->cal_blob->data[0].calTarget[0];
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 0, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry in slot 4 by index; calTarget deliberately differs */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = ~priv->cal_blob->data[4].calTarget[0];
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 4, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry in slot 3 by index; calTarget deliberately differs */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = ~priv->cal_blob->data[3].calTarget[0];
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 3, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Replace the entry in slot 5 by index; calTarget deliberately differs */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = ~priv->cal_blob->data[5].calTarget[0];
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 5, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[5], sizeof(data));
+}
+
+static void cs_amp_lib_test_cal_efi_deduplicate(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+	int i;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	/*
+	 * Write to slot 0 by index, using the calTarget (UID) copied from slot 1.
+	 * The active entry in slot 1 for the same UID should be marked empty.
+	 * Other entries are unaltered.
+	 */
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[1].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 0, -1, &data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[1]);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+
+	/*
+	 * Write to slot 1 by index, using the calTarget (UID) copied from slot 0.
+	 * The active entry in slot 0 for the same UID should be marked empty.
+	 * Other entries are unaltered.
+	 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[0].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[1]);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+
+	/*
+	 * Write to slot 1 by index, using the calTarget (UID) copied from slot 3.
+	 * The active entry in slot 3 for the same UID should be marked empty.
+	 * Other entries are unaltered.
+	 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	memcpy(data.calTarget, priv->cal_blob->data[3].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 1, -1, &data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[3].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[3].calTime[1]);
+
+	/*
+	 * Worst case: all entries have the same UID. Only slot 2 (written) stays active.
+	 */
+	priv->cal_blob = NULL;
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	for (i = 0; i < priv->cal_blob->count; i++) {
+		priv->cal_blob->data[i].calTarget[0] = 0xe5e5e5e5;
+		priv->cal_blob->data[i].calTarget[1] = 0xa7a7a7a7;
+	}
+	memcpy(data.calTarget, priv->cal_blob->data[2].calTarget, sizeof(data.calTarget));
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 2, -1, &data));
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[0].calTime[1]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[1].calTime[1]);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[3].calTime[0]);
+	KUNIT_EXPECT_EQ(test, 0, priv->cal_blob->data[3].calTime[1]);
+}
+
+static void cs_amp_lib_test_cal_efi_find_free(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+
+	/*
+	 * Slot 0 is made empty by zeroing its calTime, so the new data should go there.
+	 * data.calTarget is set to a value that won't match any existing entry.
+	 */
+	memset(&priv->cal_blob->data[0].calTime, 0, sizeof(priv->cal_blob->data[0].calTime));
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa;
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Slot 4 is made empty, so the new data should go there */
+	memset(&priv->cal_blob->data[4].calTime, 0, sizeof(priv->cal_blob->data[4].calTime));
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa;
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Slot 3 is made empty, so the new data should go there */
+	memset(&priv->cal_blob->data[3].calTime, 0, sizeof(priv->cal_blob->data[3].calTime));
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa;
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+
+	/* Slot 5 is made empty, so the new data should go there */
+	memset(&priv->cal_blob->data[5].calTime, 0, sizeof(priv->cal_blob->data[5].calTime));
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = 0xaaaaaaaa;
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[4], &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[5], sizeof(data));
+}
+
+static void cs_amp_lib_test_cal_efi_bad_cal_target(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+
+	/* Zero calTarget is illegal, whether or not a slot and size are specified */
+	get_random_bytes(&data, sizeof(data));
+	memset(data.calTarget, 0, sizeof(data.calTarget));
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, -1, -1, &data), 0);
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, 0, -1, &data), 0);
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, 0, 2, &data), 0);
+}
+
+static void cs_amp_lib_test_cal_efi_write_denied(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable_denied);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 4);
+	KUNIT_ASSERT_EQ(test, 4, priv->cal_blob->count);
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+
+	/* Unspecified slot: the call must fail and leave the blob unchanged */
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, -1, -1, &data), 0);
+	KUNIT_EXPECT_MEMEQ(test, original_blob, priv->cal_blob, original_blob->size);
+
+	/* Unspecified slot with size: must fail and leave the blob unchanged */
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, -1, 6, &data), 0);
+	KUNIT_EXPECT_MEMEQ(test, original_blob, priv->cal_blob, original_blob->size);
+
+	/* Specified slot: must fail and leave the blob unchanged */
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, 1, -1, &data), 0);
+	KUNIT_EXPECT_MEMEQ(test, original_blob, priv->cal_blob, original_blob->size);
+
+	/* Specified slot with size: must fail and leave the blob unchanged */
+	KUNIT_EXPECT_LT(test, cs_amp_set_efi_calibration_data(dev, 1, 6, &data), 0);
+	KUNIT_EXPECT_MEMEQ(test, original_blob, priv->cal_blob, original_blob->size);
+}
+
+static void cs_amp_lib_test_cal_efi_attr_preserved(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_efi_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+	memset(&priv->cal_blob->data[0], 0, sizeof(priv->cal_blob->data[0]));
+	get_random_bytes(&data, sizeof(data));
+
+	/* Set a non-standard attr to return from get_efi_variable(); the write must keep it */
+	priv->efi_attr = EFI_VARIABLE_HARDWARE_ERROR_RECORD;
+
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, -1, -1, &data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_EQ(test, priv->efi_attr, EFI_VARIABLE_HARDWARE_ERROR_RECORD);
+}
+
+/* Test stub for set_efi_variable: only the HP calibration variable may be written */
+static efi_status_t cs_amp_lib_test_set_hp_efi_cal_variable(efi_char16_t *name,
+							    efi_guid_t *guid, u32 attr,
+							    unsigned long size, void *buf)
+{
+	static const efi_char16_t expected_name[] = HP_CALIBRATION_EFI_NAME;
+	static const efi_guid_t expected_guid = HP_CALIBRATION_EFI_GUID;
+	struct kunit *test = kunit_get_current_test();
+	struct cs_amp_lib_test_priv *priv = test->priv;
+
+	KUNIT_ASSERT_NOT_NULL(test, name);
+	KUNIT_ASSERT_NOT_NULL(test, guid);
+
+	/* EFI status codes are not negative errnos, so return the code un-negated */
+	if (memcmp(name, expected_name, sizeof(expected_name)) ||
+	    efi_guidcmp(*guid, expected_guid))
+		return EFI_ACCESS_DENIED;
+
+	KUNIT_ASSERT_NOT_NULL(test, buf);
+	KUNIT_ASSERT_NE(test, 0, size);
+
+	kunit_kfree(test, priv->cal_blob);
+	priv->cal_blob = kunit_kmalloc(test, size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, priv->cal_blob);
+	memcpy(priv->cal_blob, buf, size);
+	priv->efi_attr = attr;
+
+	return EFI_SUCCESS;
+}
+
+/*
+ * If the HP calibration EFI variable exists, it should be the one that is updated.
+ */
+static void cs_amp_lib_test_cal_efi_update_hp(struct kunit *test)
+{
+	struct cs_amp_lib_test_priv *priv = test->priv;
+	struct device *dev = &priv->amp_dev->dev;
+	const struct cirrus_amp_efi_data *original_blob;
+	struct cirrus_amp_cal_data data;
+
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->get_efi_variable,
+				   cs_amp_lib_test_get_hp_cal_efi_variable);
+	kunit_activate_static_stub(test,
+				   cs_amp_test_hooks->set_efi_variable,
+				   cs_amp_lib_test_set_hp_efi_cal_variable);
+
+	cs_amp_lib_test_init_dummy_cal_blob(test, 6);
+	KUNIT_ASSERT_EQ(test, 6, priv->cal_blob->count);
+
+	/* Replace the entry in slot 4 by index; calTarget deliberately differs */
+	original_blob = cs_amp_lib_test_cal_blob_dup(test);
+	get_random_bytes(&data, sizeof(data));
+	data.calTarget[0] = ~priv->cal_blob->data[4].calTarget[0];
+	KUNIT_EXPECT_EQ(test, 0, cs_amp_set_efi_calibration_data(dev, 4, -1, &data));
+	KUNIT_EXPECT_EQ(test, 6, priv->cal_blob->count);
+	KUNIT_EXPECT_EQ(test, struct_size(priv->cal_blob, data, 6), priv->cal_blob->size);
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[0], &priv->cal_blob->data[0], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[1], &priv->cal_blob->data[1], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[2], &priv->cal_blob->data[2], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[3], &priv->cal_blob->data[3], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &data, &priv->cal_blob->data[4], sizeof(data));
+	KUNIT_EXPECT_MEMEQ(test, &original_blob->data[5], &priv->cal_blob->data[5], sizeof(data));
+}
+
 static void cs_amp_lib_test_spkid_lenovo_not_present(struct kunit *test)
 {
 	struct cs_amp_lib_test_priv *priv = test->priv;
@@ -1072,6 +2433,28 @@ static struct kunit_case cs_amp_lib_test_cases[] = {
 	KUNIT_CASE(cs_amp_lib_test_read_cal_data_test),
 	KUNIT_CASE(cs_amp_lib_test_write_ambient_test),
 
+	/* Test cases for writing cal data to UEFI */
+	KUNIT_CASE(cs_amp_lib_test_create_new_cal_efi),
+	KUNIT_CASE(cs_amp_lib_test_create_new_cal_efi_indexed),
+	KUNIT_CASE(cs_amp_lib_test_create_new_cal_efi_indexed_no_max),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_all_zeros_add_first),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_all_zeros_add_first_no_shrink),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_all_zeros_add_first_indexed),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_all_zeros_add_first_indexed_no_shrink),
+	KUNIT_CASE(cs_amp_lib_test_grow_append_cal_efi),
+	KUNIT_CASE(cs_amp_lib_test_grow_append_cal_efi_indexed),
+	KUNIT_CASE(cs_amp_lib_test_grow_append_cal_efi_indexed_no_max),
+	KUNIT_CASE(cs_amp_lib_test_grow_cal_efi_replace_indexed),
+	KUNIT_CASE(cs_amp_lib_test_grow_cal_efi_replace_by_uid),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_replace_by_uid),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_replace_by_index),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_deduplicate),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_find_free),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_bad_cal_target),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_write_denied),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_attr_preserved),
+	KUNIT_CASE(cs_amp_lib_test_cal_efi_update_hp),
+
 	/* Test cases for speaker ID */
 	KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_not_present),
 	KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_d0),
diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c
index 7038574e3f4b..d8f8b0259cd1 100644
--- a/sound/soc/codecs/cs-amp-lib.c
+++ b/sound/soc/codecs/cs-amp-lib.c
@@ -727,6 +727,7 @@ EXPORT_SYMBOL_NS_GPL(cs_amp_create_debugfs, "SND_SOC_CS_AMP_LIB");
 
 static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = {
 	.get_efi_variable = cs_amp_get_efi_variable,
+	.set_efi_variable = cs_amp_set_efi_variable,
 	.write_cal_coeff = cs_amp_write_cal_coeff,
 	.read_cal_coeff = cs_amp_read_cal_coeff,
 };
-- 
cgit v1.2.3


From 483768846d66c04354898f00bcdaad58a3763be2 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Wed, 15 Oct 2025 11:27:28 -0400
Subject: PCI: endpoint: Rename 'epf_bar::aligned_size' to 'epf_bar::mem_size'

Rename the member 'epf_bar::aligned_size' to 'epf_bar::mem_size' to better
reflect its purpose. 'aligned_size' was misleading, as it actually
represents the backing memory size allocated for the BAR rather than the
aligned size.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-1-9230298b1910@nxp.com
---
 drivers/pci/endpoint/pci-epf-core.c | 12 ++++++------
 include/linux/pci-epf.h             |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index d54e18872aef..214e3f6e6d0d 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -236,13 +236,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
 	}
 
 	dev = epc->dev.parent;
-	dma_free_coherent(dev, epf_bar[bar].aligned_size, addr,
+	dma_free_coherent(dev, epf_bar[bar].mem_size, addr,
 			  epf_bar[bar].phys_addr);
 
 	epf_bar[bar].phys_addr = 0;
 	epf_bar[bar].addr = NULL;
 	epf_bar[bar].size = 0;
-	epf_bar[bar].aligned_size = 0;
+	epf_bar[bar].mem_size = 0;
 	epf_bar[bar].barno = 0;
 	epf_bar[bar].flags = 0;
 }
@@ -265,7 +265,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 			  enum pci_epc_interface_type type)
 {
 	u64 bar_fixed_size = epc_features->bar[bar].fixed_size;
-	size_t aligned_size, align = epc_features->align;
+	size_t mem_size, align = epc_features->align;
 	struct pci_epf_bar *epf_bar;
 	dma_addr_t phys_addr;
 	struct pci_epc *epc;
@@ -297,7 +297,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 	 * it might be different if, for example, the fixed size of a BAR
 	 * is smaller than align.
 	 */
-	aligned_size = align ? ALIGN(size, align) : size;
+	mem_size = align ? ALIGN(size, align) : size;
 
 	if (type == PRIMARY_INTERFACE) {
 		epc = epf->epc;
@@ -308,7 +308,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 	}
 
 	dev = epc->dev.parent;
-	space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL);
+	space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL);
 	if (!space) {
 		dev_err(dev, "failed to allocate mem space\n");
 		return NULL;
@@ -317,7 +317,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 	epf_bar[bar].phys_addr = phys_addr;
 	epf_bar[bar].addr = space;
 	epf_bar[bar].size = size;
-	epf_bar[bar].aligned_size = aligned_size;
+	epf_bar[bar].mem_size = mem_size;
 	epf_bar[bar].barno = bar;
 	if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
 		epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 2e85504ba2ba..4022dd080e20 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -115,8 +115,8 @@ struct pci_epf_driver {
  * @phys_addr: physical address that should be mapped to the BAR
  * @addr: virtual address corresponding to the @phys_addr
  * @size: the size of the address space present in BAR
- * @aligned_size: the size actually allocated to accommodate the iATU alignment
- *                requirement
+ * @mem_size: the size actually allocated to accommodate the iATU alignment
+ *            requirement
  * @barno: BAR number
  * @flags: flags that are set for the BAR
  */
@@ -124,7 +124,7 @@ struct pci_epf_bar {
 	dma_addr_t	phys_addr;
 	void		*addr;
 	size_t		size;
-	size_t		aligned_size;
+	size_t		mem_size;
 	enum pci_barno	barno;
 	int		flags;
 };
-- 
cgit v1.2.3


From 83be4bee57f0374ff751aaff3fef4af0af66ec81 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 17 Oct 2025 13:26:28 +0000
Subject: ACPI: PRM: Add acpi_prm_handler_available()

Add a helper function to check if a PRM handler/module is present.

This can be used during init time by code that depends on a particular
handler. If the handler is not present, then the code does not need to
be loaded.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: "Mario Limonciello (AMD)" <superm1@kernel.org>
Acked-by: "Rafael J. Wysocki (Intel)" <rafael@kernel.org>
Link: https://patch.msgid.link/all/20251017-wip-atl-prm-v2-1-7ab1df4a5fbc@amd.com
---
 drivers/acpi/prmt.c  | 6 ++++++
 include/linux/prmt.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c
index 6792d4385eee..7b8b5d2015ec 100644
--- a/drivers/acpi/prmt.c
+++ b/drivers/acpi/prmt.c
@@ -244,6 +244,12 @@ static struct prm_handler_info *find_prm_handler(const guid_t *guid)
 	return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER);
 }
 
+bool acpi_prm_handler_available(const guid_t *guid)
+{
+	return find_prm_handler(guid) && find_prm_module(guid);
+}
+EXPORT_SYMBOL_GPL(acpi_prm_handler_available);
+
 /* In-coming PRM commands */
 
 #define PRM_CMD_RUN_SERVICE		0
diff --git a/include/linux/prmt.h b/include/linux/prmt.h
index c53ab287e932..8cdc987de963 100644
--- a/include/linux/prmt.h
+++ b/include/linux/prmt.h
@@ -4,9 +4,11 @@
 
 #ifdef CONFIG_ACPI_PRMT
 void init_prmt(void);
+bool acpi_prm_handler_available(const guid_t *handler_guid);
 int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer);
 #else
 static inline void init_prmt(void) { }
+static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; }
 static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 0bfc6758f213a701bd662982de86f0032b51f18c Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Wed, 15 Oct 2025 11:27:30 -0400
Subject: PCI: endpoint: Add pci_epf_assign_bar_space() API

Add pci_epf_assign_bar_space() API to allow setting any MMIO address as
the BAR memory space, such as an MSI message base address.

This API also conforms to the BAR base address and size alignment
restrictions enforced by the PCI spec r6.0, sec 7.5.1.2.1.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
[mani: removed unused epc var, reworded kdoc, comments and description]
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-3-9230298b1910@nxp.com
---
 drivers/pci/endpoint/pci-epf-core.c | 77 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-epf.h             |  6 +++
 2 files changed, 83 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index ec20aabb7f75..9a505c796370 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -346,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 }
 EXPORT_SYMBOL_GPL(pci_epf_alloc_space);
 
+/**
+ * pci_epf_assign_bar_space() - Assign PCI EPF BAR space
+ * @epf: EPF device to assign the BAR memory
+ * @size: Size of the memory that has to be assigned
+ * @bar: BAR number for which the memory is assigned
+ * @epc_features: Features provided by the EPC specific to this EPF
+ * @type: Identifies if the assignment is for primary EPC or secondary EPC
+ * @bar_addr: Address to be assigned for the @bar
+ *
+ * Invoke to assign memory for the PCI EPF BAR.
+ * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR
+ * can only be a 64-bit BAR, or if the requested size is larger than 2 GB.
+ */
+int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size,
+			     enum pci_barno bar,
+			     const struct pci_epc_features *epc_features,
+			     enum pci_epc_interface_type type,
+			     dma_addr_t bar_addr)
+{
+	size_t bar_size, aligned_mem_size;
+	struct pci_epf_bar *epf_bar;
+	dma_addr_t limit;
+	int pos;
+
+	if (!size)
+		return -EINVAL;
+
+	limit = bar_addr + size - 1;
+
+	/*
+	 *  Bits:		15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+	 *  bar_addr:		U  U  U  U  U  U  0 X X X X X X X X X
+	 *  limit:		U  U  U  U  U  U  1 X X X X X X X X X
+	 *
+	 *  bar_addr^limit	0  0  0  0  0  0  1 X X X X X X X X X
+	 *
+	 *  U: unchanged address bits in range [bar_addr, limit]
+	 *  X: bit 0 or 1
+	 *
+	 *  (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB
+	 *  (pos). And value of (2 ^ pos) should be able to cover the BAR range.
+	 */
+	for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--)
+		if ((limit ^ bar_addr) & BIT_ULL(pos))
+			break;
+
+	if (pos == 8 * sizeof(dma_addr_t) - 1)
+		return -EINVAL;
+
+	bar_size = BIT_ULL(pos + 1);
+	if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size,
+					  bar, epc_features, type))
+		return -ENOMEM;
+
+	if (type == PRIMARY_INTERFACE)
+		epf_bar = epf->bar;
+	else
+		epf_bar = epf->sec_epc_bar;
+
+	epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size);
+
+	if (epf_bar[bar].phys_addr + bar_size < limit)
+		return -ENOMEM;
+
+	epf_bar[bar].addr = NULL;
+	epf_bar[bar].size = bar_size;
+	epf_bar[bar].mem_size = aligned_mem_size;
+	epf_bar[bar].barno = bar;
+	if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
+		epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+	else
+		epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space);
+
 static void pci_epf_remove_cfs(struct pci_epf_driver *driver)
 {
 	struct config_group *group, *tmp;
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 4022dd080e20..48f68c4dcfa5 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -242,6 +242,12 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
 			enum pci_epc_interface_type type);
 
+int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size,
+			     enum pci_barno bar,
+			     const struct pci_epc_features *epc_features,
+			     enum pci_epc_interface_type type,
+			     dma_addr_t bar_addr);
+
 int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar,
 			       u64 addr, dma_addr_t *base, size_t *off);
 int pci_epf_bind(struct pci_epf *epf);
-- 
cgit v1.2.3


From 013a3a66f25af3fb614f45df43983657514944c4 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:54:55 +0100
Subject: regmap: sdw-mbq: Don't assume the regmap device is the SoundWire
 slave

Currently, the code assumes that the device that registered the
MBQ register map is the actual SoundWire slave device. This works
fine for all current users, however future SDCA devices will
likely be implemented with the SoundWire slave as a parent device
and separate child drivers with regmaps for each audio Function.
Update the regmap_init_sdw_mbq_cfg macro to allow these two
to be specified separately.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-3-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-sdw-mbq.c | 23 ++++++++++++-----------
 include/linux/regmap.h               | 21 +++++++++++----------
 sound/soc/codecs/rt722-sdca-sdw.c    |  4 +++-
 3 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap-sdw-mbq.c b/drivers/base/regmap/regmap-sdw-mbq.c
index 86644bbd0710..8b7d34a6080d 100644
--- a/drivers/base/regmap/regmap-sdw-mbq.c
+++ b/drivers/base/regmap/regmap-sdw-mbq.c
@@ -15,6 +15,7 @@
 
 struct regmap_mbq_context {
 	struct device *dev;
+	struct sdw_slave *sdw;
 
 	struct regmap_sdw_mbq_cfg cfg;
 
@@ -46,7 +47,7 @@ static bool regmap_sdw_mbq_deferrable(struct regmap_mbq_context *ctx, unsigned i
 static int regmap_sdw_mbq_poll_busy(struct sdw_slave *slave, unsigned int reg,
 				    struct regmap_mbq_context *ctx)
 {
-	struct device *dev = &slave->dev;
+	struct device *dev = ctx->dev;
 	int val, ret = 0;
 
 	dev_dbg(dev, "Deferring transaction for 0x%x\n", reg);
@@ -96,8 +97,7 @@ static int regmap_sdw_mbq_write_impl(struct sdw_slave *slave,
 static int regmap_sdw_mbq_write(void *context, unsigned int reg, unsigned int val)
 {
 	struct regmap_mbq_context *ctx = context;
-	struct device *dev = ctx->dev;
-	struct sdw_slave *slave = dev_to_sdw_dev(dev);
+	struct sdw_slave *slave = ctx->sdw;
 	bool deferrable = regmap_sdw_mbq_deferrable(ctx, reg);
 	int mbq_size = regmap_sdw_mbq_size(ctx, reg);
 	int ret;
@@ -156,8 +156,7 @@ static int regmap_sdw_mbq_read_impl(struct sdw_slave *slave,
 static int regmap_sdw_mbq_read(void *context, unsigned int reg, unsigned int *val)
 {
 	struct regmap_mbq_context *ctx = context;
-	struct device *dev = ctx->dev;
-	struct sdw_slave *slave = dev_to_sdw_dev(dev);
+	struct sdw_slave *slave = ctx->sdw;
 	bool deferrable = regmap_sdw_mbq_deferrable(ctx, reg);
 	int mbq_size = regmap_sdw_mbq_size(ctx, reg);
 	int ret;
@@ -208,6 +207,7 @@ static int regmap_sdw_mbq_config_check(const struct regmap_config *config)
 
 static struct regmap_mbq_context *
 regmap_sdw_mbq_gen_context(struct device *dev,
+			   struct sdw_slave *sdw,
 			   const struct regmap_config *config,
 			   const struct regmap_sdw_mbq_cfg *mbq_config)
 {
@@ -218,6 +218,7 @@ regmap_sdw_mbq_gen_context(struct device *dev,
 		return ERR_PTR(-ENOMEM);
 
 	ctx->dev = dev;
+	ctx->sdw = sdw;
 
 	if (mbq_config)
 		ctx->cfg = *mbq_config;
@@ -228,7 +229,7 @@ regmap_sdw_mbq_gen_context(struct device *dev,
 	return ctx;
 }
 
-struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw,
+struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw,
 				     const struct regmap_config *config,
 				     const struct regmap_sdw_mbq_cfg *mbq_config,
 				     struct lock_class_key *lock_key,
@@ -241,16 +242,16 @@ struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = regmap_sdw_mbq_gen_context(&sdw->dev, config, mbq_config);
+	ctx = regmap_sdw_mbq_gen_context(dev, sdw, config, mbq_config);
 	if (IS_ERR(ctx))
 		return ERR_CAST(ctx);
 
-	return __regmap_init(&sdw->dev, &regmap_sdw_mbq, ctx,
+	return __regmap_init(dev, &regmap_sdw_mbq, ctx,
 			     config, lock_key, lock_name);
 }
 EXPORT_SYMBOL_GPL(__regmap_init_sdw_mbq);
 
-struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw,
+struct regmap *__devm_regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw,
 					  const struct regmap_config *config,
 					  const struct regmap_sdw_mbq_cfg *mbq_config,
 					  struct lock_class_key *lock_key,
@@ -263,11 +264,11 @@ struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = regmap_sdw_mbq_gen_context(&sdw->dev, config, mbq_config);
+	ctx = regmap_sdw_mbq_gen_context(dev, sdw, config, mbq_config);
 	if (IS_ERR(ctx))
 		return ERR_CAST(ctx);
 
-	return __devm_regmap_init(&sdw->dev, &regmap_sdw_mbq, ctx,
+	return __devm_regmap_init(dev, &regmap_sdw_mbq, ctx,
 				  config, lock_key, lock_name);
 }
 EXPORT_SYMBOL_GPL(__devm_regmap_init_sdw_mbq);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4e1ac1fbcec4..70daec535976 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -676,7 +676,7 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
-struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw,
+struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw,
 				     const struct regmap_config *config,
 				     const struct regmap_sdw_mbq_cfg *mbq_config,
 				     struct lock_class_key *lock_key,
@@ -738,7 +738,7 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
-struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw,
+struct regmap *__devm_regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw,
 					  const struct regmap_config *config,
 					  const struct regmap_sdw_mbq_cfg *mbq_config,
 					  struct lock_class_key *lock_key,
@@ -970,7 +970,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  */
 #define regmap_init_sdw_mbq(sdw, config)					\
 	__regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config,		\
-				sdw, config, NULL)
+				&sdw->dev, sdw, config, NULL)
 
 /**
  * regmap_init_sdw_mbq_cfg() - Initialise MBQ SDW register map with config
@@ -983,9 +983,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  * to a struct regmap. The regmap will be automatically freed by the
  * device management code.
  */
-#define regmap_init_sdw_mbq_cfg(sdw, config, mbq_config)		\
+#define regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config)		\
 	__regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config,	\
-				sdw, config, mbq_config)
+				dev, sdw, config, mbq_config)
 
 /**
  * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave
@@ -1198,12 +1198,13 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  */
 #define devm_regmap_init_sdw_mbq(sdw, config)			\
 	__regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config,   \
-				sdw, config, NULL)
+				&sdw->dev, sdw, config, NULL)
 
 /**
  * devm_regmap_init_sdw_mbq_cfg() - Initialise managed MBQ SDW register map with config
  *
- * @sdw: Device that will be interacted with
+ * @dev: Device that will be interacted with
+ * @sdw: SoundWire Device that will be interacted with
  * @config: Configuration for register map
  * @mbq_config: Properties for the MBQ registers
  *
@@ -1211,9 +1212,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  * to a struct regmap. The regmap will be automatically freed by the
  * device management code.
  */
-#define devm_regmap_init_sdw_mbq_cfg(sdw, config, mbq_config)	\
-	__regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq,	\
-				#config, sdw, config, mbq_config)
+#define devm_regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config)	\
+	__regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq,		\
+				#config, dev, sdw, config, mbq_config)
 
 /**
  * devm_regmap_init_slimbus() - Initialise managed register map
diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c
index 5ea40c1b159a..a0f5601a262a 100644
--- a/sound/soc/codecs/rt722-sdca-sdw.c
+++ b/sound/soc/codecs/rt722-sdca-sdw.c
@@ -419,7 +419,9 @@ static int rt722_sdca_sdw_probe(struct sdw_slave *slave,
 	struct regmap *regmap;
 
 	/* Regmap Initialization */
-	regmap = devm_regmap_init_sdw_mbq_cfg(slave, &rt722_sdca_regmap, &rt722_mbq_config);
+	regmap = devm_regmap_init_sdw_mbq_cfg(&slave->dev, slave,
+					      &rt722_sdca_regmap,
+					      &rt722_mbq_config);
 	if (IS_ERR(regmap))
 		return PTR_ERR(regmap);
 
-- 
cgit v1.2.3


From 7159816707dc7040fe3ac4fa3d7ac3d173bd772a Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:54:57 +0100
Subject: ASoC: SDCA: Pass SoundWire slave to HID

The SDCA HID code can't assume that the struct device it is passed is
the SoundWire slave device. HID is represented by a Function in SDCA and
will thus likely be implemented by a child driver. Update the code to
explicitly pass in the SoundWire slave device.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-5-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   |  2 +-
 include/sound/sdca_hid.h        |  8 ++++++--
 sound/soc/sdca/sdca_functions.c | 20 ++++++++++++--------
 sound/soc/sdca/sdca_hid.c       | 12 +++++-------
 4 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index ea68856e4c8c..51e12fcfc53c 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1332,7 +1332,7 @@ static inline u32 sdca_range_search(struct sdca_control_range *range,
 	return 0;
 }
 
-int sdca_parse_function(struct device *dev,
+int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 			struct sdca_function_desc *desc,
 			struct sdca_function_data *function);
 
diff --git a/include/sound/sdca_hid.h b/include/sound/sdca_hid.h
index 8ab3e498884e..3a155835e035 100644
--- a/include/sound/sdca_hid.h
+++ b/include/sound/sdca_hid.h
@@ -12,10 +12,14 @@
 #include <linux/hid.h>
 
 #if IS_ENABLED(CONFIG_SND_SOC_SDCA_HID)
-int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity);
+
+int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
+			struct sdca_entity *entity);
 
 #else
-static inline int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity)
+
+static inline int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
+				      struct sdca_entity *entity)
 {
 	return 0;
 }
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 5f76ff4345ff..2d5d20e23e3c 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -1253,7 +1253,8 @@ bad_list:
 }
 
 static int
-find_sdca_entity_hide(struct device *dev, struct fwnode_handle *function_node,
+find_sdca_entity_hide(struct device *dev, struct sdw_slave *sdw,
+		      struct fwnode_handle *function_node,
 		      struct fwnode_handle *entity_node, struct sdca_entity *entity)
 {
 	struct sdca_entity_hide *hide = &entity->hide;
@@ -1328,7 +1329,7 @@ find_sdca_entity_hide(struct device *dev, struct fwnode_handle *function_node,
 						      report_desc, nval);
 
 			/* add HID device */
-			ret = sdca_add_hid_device(dev, entity);
+			ret = sdca_add_hid_device(dev, sdw, entity);
 			if (ret) {
 				dev_err(dev, "%pfwP: failed to add HID device: %d\n", entity_node, ret);
 				return ret;
@@ -1339,7 +1340,7 @@ find_sdca_entity_hide(struct device *dev, struct fwnode_handle *function_node,
 	return 0;
 }
 
-static int find_sdca_entity(struct device *dev,
+static int find_sdca_entity(struct device *dev, struct sdw_slave *sdw,
 			    struct fwnode_handle *function_node,
 			    struct fwnode_handle *entity_node,
 			    struct sdca_entity *entity)
@@ -1381,7 +1382,8 @@ static int find_sdca_entity(struct device *dev,
 		ret = find_sdca_entity_ge(dev, entity_node, entity);
 		break;
 	case SDCA_ENTITY_TYPE_HIDE:
-		ret = find_sdca_entity_hide(dev, function_node, entity_node, entity);
+		ret = find_sdca_entity_hide(dev, sdw, function_node,
+					    entity_node, entity);
 		break;
 	default:
 		break;
@@ -1396,7 +1398,7 @@ static int find_sdca_entity(struct device *dev,
 	return 0;
 }
 
-static int find_sdca_entities(struct device *dev,
+static int find_sdca_entities(struct device *dev, struct sdw_slave *sdw,
 			      struct fwnode_handle *function_node,
 			      struct sdca_function_data *function)
 {
@@ -1448,7 +1450,8 @@ static int find_sdca_entities(struct device *dev,
 			return -EINVAL;
 		}
 
-		ret = find_sdca_entity(dev, function_node, entity_node, &entities[i]);
+		ret = find_sdca_entity(dev, sdw, function_node,
+				       entity_node, &entities[i]);
 		fwnode_handle_put(entity_node);
 		if (ret)
 			return ret;
@@ -1927,12 +1930,13 @@ static int find_sdca_clusters(struct device *dev,
 /**
  * sdca_parse_function - parse ACPI DisCo for a Function
  * @dev: Pointer to device against which function data will be allocated.
+ * @sdw: SoundWire slave device to be processed.
  * @function_desc: Pointer to the Function short descriptor.
  * @function: Pointer to the Function information, to be populated.
  *
  * Return: Returns 0 for success.
  */
-int sdca_parse_function(struct device *dev,
+int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 			struct sdca_function_desc *function_desc,
 			struct sdca_function_data *function)
 {
@@ -1953,7 +1957,7 @@ int sdca_parse_function(struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = find_sdca_entities(dev, function_desc->node, function);
+	ret = find_sdca_entities(dev, sdw, function_desc->node, function);
 	if (ret)
 		return ret;
 
diff --git a/sound/soc/sdca/sdca_hid.c b/sound/soc/sdca/sdca_hid.c
index 967f7ec6fb79..53dad1a524d4 100644
--- a/sound/soc/sdca/sdca_hid.c
+++ b/sound/soc/sdca/sdca_hid.c
@@ -82,15 +82,13 @@ static const struct hid_ll_driver sdw_hid_driver = {
 	.raw_request = sdwhid_raw_request,
 };
 
-int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity)
+int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
+			struct sdca_entity *entity)
 {
-	struct sdw_bus *bus;
+	struct sdw_bus *bus = sdw->bus;
 	struct hid_device *hid;
-	struct sdw_slave *slave = dev_to_sdw_dev(dev);
 	int ret;
 
-	bus = slave->bus;
-
 	hid = hid_allocate_device();
 	if (IS_ERR(hid))
 		return PTR_ERR(hid);
@@ -103,8 +101,8 @@ int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity)
 
 	snprintf(hid->name, sizeof(hid->name),
 		 "HID sdw:%01x:%01x:%04x:%04x:%02x",
-		 bus->controller_id, bus->link_id, slave->id.mfg_id,
-		 slave->id.part_id, slave->id.class_id);
+		 bus->controller_id, bus->link_id, sdw->id.mfg_id,
+		 sdw->id.part_id, sdw->id.class_id);
 
 	snprintf(hid->phys, sizeof(hid->phys), "%s", dev->bus->name);
 
-- 
cgit v1.2.3


From 390c05f47d0749b24db65586482308c5fd680fe5 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:54:58 +0100
Subject: ASoC: SDCA: Pass device register map from IRQ alloc to handlers

Store a copy of the device register map in the structure for the IRQ
handlers. This will allow the individual IRQ handlers access to the
device level register map if required.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-6-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  | 2 ++
 sound/soc/sdca/sdca_interrupts.c | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
index bbbc3ab27eba..d652c6e94ddc 100644
--- a/include/sound/sdca_interrupts.h
+++ b/include/sound/sdca_interrupts.h
@@ -23,6 +23,7 @@ struct sdca_function_data;
 /**
  * struct sdca_interrupt - contains information about a single SDCA interrupt
  * @name: The name of the interrupt.
+ * @device_regmap: Pointer to the IRQ regmap.
  * @component: Pointer to the ASoC component owns the interrupt.
  * @function: Pointer to the Function that the interrupt is associated with.
  * @entity: Pointer to the Entity that the interrupt is associated with.
@@ -35,6 +36,7 @@ struct sdca_function_data;
 struct sdca_interrupt {
 	const char *name;
 
+	struct regmap *device_regmap;
 	struct snd_soc_component *component;
 	struct sdca_function_data *function;
 	struct sdca_entity *entity;
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 9295d283be91..898069ceffe9 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -437,7 +437,7 @@ struct sdca_interrupt_info *sdca_irq_allocate(struct device *sdev,
 					      struct regmap *regmap, int irq)
 {
 	struct sdca_interrupt_info *info;
-	int ret;
+	int ret, i;
 
 	info = devm_kzalloc(sdev, sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -445,6 +445,9 @@ struct sdca_interrupt_info *sdca_irq_allocate(struct device *sdev,
 
 	info->irq_chip = sdca_irq_chip;
 
+	for (i = 0; i < ARRAY_SIZE(info->irqs); i++)
+		info->irqs[i].device_regmap = regmap;
+
 	ret = devm_mutex_init(sdev, &info->irq_lock);
 	if (ret)
 		return ERR_PTR(ret);
-- 
cgit v1.2.3


From 56bbda23d4bece7ce998666118a068e4f71d59fb Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:54:59 +0100
Subject: ASoC: SDCA: Update externally_requested flag to cover all requests

Currently there is a flag to indicate if an IRQ has been requested by
something outside the SDCA core, such that the core can skip requesting
that IRQ. However, it is simpler and more useful to always store the
allocated IRQ number. This will allow the core to see if the IRQ has
been requested, to perform additional operations on the IRQ, and
request IRQs in multiple phases.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-7-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  | 7 +++----
 sound/soc/sdca/sdca_interrupts.c | 8 ++++----
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
index d652c6e94ddc..e4bf123936bb 100644
--- a/include/sound/sdca_interrupts.h
+++ b/include/sound/sdca_interrupts.h
@@ -29,9 +29,8 @@ struct sdca_function_data;
  * @entity: Pointer to the Entity that the interrupt is associated with.
  * @control: Pointer to the Control that the interrupt is associated with.
  * @priv: Pointer to private data for use by the handler.
- * @externally_requested: Internal flag used to check if a client driver has
- * already requested the interrupt, for custom handling, allowing the core to
- * skip handling this interrupt.
+ * @irq: IRQ number allocated to this interrupt, also used internally to track
+ * the IRQ being assigned.
  */
 struct sdca_interrupt {
 	const char *name;
@@ -44,7 +43,7 @@ struct sdca_interrupt {
 
 	void *priv;
 
-	bool externally_requested;
+	int irq;
 };
 
 /**
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 898069ceffe9..cb7c7a6f356e 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -262,6 +262,8 @@ static int sdca_irq_request_locked(struct device *dev,
 	if (ret)
 		return ret;
 
+	info->irqs[sdca_irq].irq = irq;
+
 	dev_dbg(dev, "requested irq %d for %s\n", irq, name);
 
 	return 0;
@@ -301,8 +303,6 @@ int sdca_irq_request(struct device *dev, struct sdca_interrupt_info *info,
 		return ret;
 	}
 
-	info->irqs[sdca_irq].externally_requested = true;
-
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(sdca_irq_request, "SND_SOC_SDCA");
@@ -379,9 +379,9 @@ int sdca_irq_populate(struct sdca_function_data *function,
 
 			interrupt = &info->irqs[irq];
 
-			if (interrupt->externally_requested) {
+			if (interrupt->irq) {
 				dev_dbg(dev,
-					"skipping irq %d, externally requested\n",
+					"skipping irq %d, already requested\n",
 					irq);
 				continue;
 			}
-- 
cgit v1.2.3


From dfe7c3401ed3d3bd8e61be8d6d452896513eb52e Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:01 +0100
Subject: ASoC: SDCA: Rely less on the ASoC component in IRQ handling

In the future some IRQs (mostly the UMPs used during File
DownLoad) will need to run after the device has enumerated on the
bus but before the soundcard is actually constructed. As such
refactor more of the IRQ handling to use raw device and regmap
pointers, rather than accessing things through the component.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-9-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  |  7 ++++++-
 sound/soc/sdca/sdca_interrupts.c | 37 +++++++++++++++++++++++++------------
 2 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
index e4bf123936bb..3983f515349a 100644
--- a/include/sound/sdca_interrupts.h
+++ b/include/sound/sdca_interrupts.h
@@ -23,7 +23,9 @@ struct sdca_function_data;
 /**
  * struct sdca_interrupt - contains information about a single SDCA interrupt
  * @name: The name of the interrupt.
+ * @dev: Pointer to the Function device.
  * @device_regmap: Pointer to the IRQ regmap.
+ * @function_regmap: Pointer to the SDCA Function regmap.
  * @component: Pointer to the ASoC component owns the interrupt.
  * @function: Pointer to the Function that the interrupt is associated with.
  * @entity: Pointer to the Entity that the interrupt is associated with.
@@ -35,7 +37,9 @@ struct sdca_function_data;
 struct sdca_interrupt {
 	const char *name;
 
+	struct device *dev;
 	struct regmap *device_regmap;
+	struct regmap *function_regmap;
 	struct snd_soc_component *component;
 	struct sdca_function_data *function;
 	struct sdca_entity *entity;
@@ -65,7 +69,8 @@ struct sdca_interrupt_info {
 int sdca_irq_request(struct device *dev, struct sdca_interrupt_info *interrupt_info,
 		     int sdca_irq, const char *name, irq_handler_t handler,
 		     void *data);
-int sdca_irq_data_populate(struct snd_soc_component *component,
+int sdca_irq_data_populate(struct device *dev, struct regmap *function_regmap,
+			   struct snd_soc_component *component,
 			   struct sdca_function_data *function,
 			   struct sdca_entity *entity,
 			   struct sdca_control *control,
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 2b3bb7d0cb44..d0894c8e0552 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -77,7 +77,7 @@ static const struct regmap_irq_chip sdca_irq_chip = {
 static irqreturn_t base_handler(int irq, void *data)
 {
 	struct sdca_interrupt *interrupt = data;
-	struct device *dev = interrupt->component->dev;
+	struct device *dev = interrupt->dev;
 
 	dev_info(dev, "%s irq without full handling\n", interrupt->name);
 
@@ -87,7 +87,7 @@ static irqreturn_t base_handler(int irq, void *data)
 static irqreturn_t function_status_handler(int irq, void *data)
 {
 	struct sdca_interrupt *interrupt = data;
-	struct device *dev = interrupt->component->dev;
+	struct device *dev = interrupt->dev;
 	irqreturn_t irqret = IRQ_NONE;
 	unsigned int reg, val;
 	unsigned long status;
@@ -103,7 +103,7 @@ static irqreturn_t function_status_handler(int irq, void *data)
 	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
 			   interrupt->control->sel, 0);
 
-	ret = regmap_read(interrupt->component->regmap, reg, &val);
+	ret = regmap_read(interrupt->function_regmap, reg, &val);
 	if (ret < 0) {
 		dev_err(dev, "failed to read function status: %d\n", ret);
 		goto error;
@@ -136,7 +136,7 @@ static irqreturn_t function_status_handler(int irq, void *data)
 		}
 	}
 
-	ret = regmap_write(interrupt->component->regmap, reg, val);
+	ret = regmap_write(interrupt->function_regmap, reg, val);
 	if (ret < 0) {
 		dev_err(dev, "failed to clear function status: %d\n", ret);
 		goto error;
@@ -151,8 +151,8 @@ error:
 static irqreturn_t detected_mode_handler(int irq, void *data)
 {
 	struct sdca_interrupt *interrupt = data;
+	struct device *dev = interrupt->dev;
 	struct snd_soc_component *component = interrupt->component;
-	struct device *dev = component->dev;
 	struct snd_soc_card *card = component->card;
 	struct rw_semaphore *rwsem = &card->snd_card->controls_rwsem;
 	struct snd_kcontrol *kctl = interrupt->priv;
@@ -190,7 +190,7 @@ static irqreturn_t detected_mode_handler(int irq, void *data)
 	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
 			   interrupt->control->sel, 0);
 
-	ret = regmap_read(component->regmap, reg, &val);
+	ret = regmap_read(interrupt->function_regmap, reg, &val);
 	if (ret < 0) {
 		dev_err(dev, "failed to read detected mode: %d\n", ret);
 		goto error;
@@ -209,9 +209,9 @@ static irqreturn_t detected_mode_handler(int irq, void *data)
 		 * detected mode is unknown we need to see what the device
 		 * selected as a "safe" option.
 		 */
-		regcache_drop_region(component->regmap, reg, reg);
+		regcache_drop_region(interrupt->function_regmap, reg, reg);
 
-		ret = regmap_read(component->regmap, reg, &val);
+		ret = regmap_read(interrupt->function_regmap, reg, &val);
 		if (ret) {
 			dev_err(dev, "failed to re-check selected mode: %d\n", ret);
 			goto error;
@@ -309,6 +309,8 @@ EXPORT_SYMBOL_NS_GPL(sdca_irq_request, "SND_SOC_SDCA");
 
 /**
  * sdca_irq_data_populate - Populate common interrupt data
+ * @dev: Pointer to the Function device.
+ * @regmap: Pointer to the Function regmap.
  * @component: Pointer to the ASoC component for the Function.
  * @function: Pointer to the SDCA Function.
  * @entity: Pointer to the SDCA Entity.
@@ -317,21 +319,31 @@ EXPORT_SYMBOL_NS_GPL(sdca_irq_request, "SND_SOC_SDCA");
  *
  * Return: Zero on success, and a negative error code on failure.
  */
-int sdca_irq_data_populate(struct snd_soc_component *component,
+int sdca_irq_data_populate(struct device *dev, struct regmap *regmap,
+			   struct snd_soc_component *component,
 			   struct sdca_function_data *function,
 			   struct sdca_entity *entity,
 			   struct sdca_control *control,
 			   struct sdca_interrupt *interrupt)
 {
-	struct device *dev = component->dev;
 	const char *name;
 
+	if (!dev && component)
+		dev = component->dev;
+	if (!dev)
+		return -ENODEV;
+
 	name = devm_kasprintf(dev, GFP_KERNEL, "%s %s %s", function->desc->name,
 			      entity->label, control->label);
 	if (!name)
 		return -ENOMEM;
 
 	interrupt->name = name;
+	interrupt->dev = dev;
+	if (!regmap && component)
+		interrupt->function_regmap = component->regmap;
+	else
+		interrupt->function_regmap = regmap;
 	interrupt->component = component;
 	interrupt->function = function;
 	interrupt->entity = entity;
@@ -394,8 +406,9 @@ int sdca_irq_populate(struct sdca_function_data *function,
 			else if (!interrupt)
 				continue;
 
-			ret = sdca_irq_data_populate(component, function, entity,
-						     control, interrupt);
+			ret = sdca_irq_data_populate(dev, NULL, component,
+						     function, entity, control,
+						     interrupt);
 			if (ret)
 				return ret;
 
-- 
cgit v1.2.3


From c7b6c6b60594fd1efe35c61bc6a2176b25263ccc Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:02 +0100
Subject: ASoC: SDCA: Force some SDCA Controls to be volatile

Whilst SDCA does specify an Access Mode for each Control, there is not a
1-to-1 mapping between that and ASoC's internal representation. Some
registers require being treated as volatile from the host's perspective
even if their Access Mode is Read-Write. Add an explicit list of SDCA
controls that should be forced volatile.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-10-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   |  1 +
 sound/soc/sdca/sdca_functions.c | 58 +++++++++++++++++++++++++++++++++++++++++
 sound/soc/sdca/sdca_regmap.c    |  9 +------
 3 files changed, 60 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 51e12fcfc53c..ab9af84082c9 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -771,6 +771,7 @@ struct sdca_control {
 	u8 layers;
 
 	bool deferrable;
+	bool is_volatile;
 	bool has_default;
 	bool has_fixed;
 };
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 2d5d20e23e3c..3e1df30f5d60 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -779,6 +779,62 @@ find_sdca_control_datatype(const struct sdca_entity *entity,
 	}
 }
 
+static bool find_sdca_control_volatile(const struct sdca_entity *entity,
+				       const struct sdca_control *control)
+{
+	switch (control->mode) {
+	case SDCA_ACCESS_MODE_DC:
+		return false;
+	case SDCA_ACCESS_MODE_RO:
+	case SDCA_ACCESS_MODE_RW1S:
+	case SDCA_ACCESS_MODE_RW1C:
+		return true;
+	default:
+		break;
+	}
+
+	switch (SDCA_CTL_TYPE(entity->type, control->sel)) {
+	case SDCA_CTL_TYPE_S(XU, FDL_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(XU, FDL_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(XU, FDL_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(XU, FDL_STATUS):
+	case SDCA_CTL_TYPE_S(XU, FDL_HOST_REQUEST):
+	case SDCA_CTL_TYPE_S(SPE, AUTHTX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SPE, AUTHTX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SPE, AUTHTX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SPE, AUTHRX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SPE, AUTHRX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SPE, AUTHRX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(MFPU, AE_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(MFPU, AE_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(MFPU, AE_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SMPU, HIST_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SMPU, HIST_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SMPU, HIST_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SMPU, DTODTX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SMPU, DTODTX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SMPU, DTODTX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SMPU, DTODRX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SMPU, DTODRX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SMPU, DTODRX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SAPU, DTODTX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SAPU, DTODTX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SAPU, DTODTX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(SAPU, DTODRX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(SAPU, DTODRX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(SAPU, DTODRX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(HIDE, HIDTX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(HIDE, HIDTX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(HIDE, HIDTX_MESSAGELENGTH):
+	case SDCA_CTL_TYPE_S(HIDE, HIDRX_CURRENTOWNER):
+	case SDCA_CTL_TYPE_S(HIDE, HIDRX_MESSAGEOFFSET):
+	case SDCA_CTL_TYPE_S(HIDE, HIDRX_MESSAGELENGTH):
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int find_sdca_control_range(struct device *dev,
 				   struct fwnode_handle *control_node,
 				   struct sdca_control_range *range)
@@ -930,6 +986,8 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
 		break;
 	}
 
+	control->is_volatile = find_sdca_control_volatile(entity, control);
+
 	ret = find_sdca_control_range(dev, control_node, &control->range);
 	if (ret) {
 		dev_err(dev, "%s: control %#x: range missing: %d\n",
diff --git a/sound/soc/sdca/sdca_regmap.c b/sound/soc/sdca/sdca_regmap.c
index 72f893e00ff5..8fa138fca00f 100644
--- a/sound/soc/sdca/sdca_regmap.c
+++ b/sound/soc/sdca/sdca_regmap.c
@@ -147,14 +147,7 @@ bool sdca_regmap_volatile(struct sdca_function_data *function, unsigned int reg)
 	if (!control)
 		return false;
 
-	switch (control->mode) {
-	case SDCA_ACCESS_MODE_RO:
-	case SDCA_ACCESS_MODE_RW1S:
-	case SDCA_ACCESS_MODE_RW1C:
-		return true;
-	default:
-		return false;
-	}
+	return control->is_volatile;
 }
 EXPORT_SYMBOL_NS(sdca_regmap_volatile, "SND_SOC_SDCA");
 
-- 
cgit v1.2.3


From 0a5e9769d088bd1d8faf01207210911b9341b62c Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:03 +0100
Subject: ASoC: SDCA: Parse XU Entity properties

Parse the DisCo properties for XU Entities.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-11-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 23 +++++++++++++++++++++++
 sound/soc/sdca/sdca_functions.c | 25 +++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index ab9af84082c9..f2ce13162151 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1090,6 +1090,27 @@ struct sdca_entity_hide {
 	struct hid_descriptor hid_desc;
 };
 
+/**
+ * enum sdca_xu_reset_mechanism - SDCA FDL Resets
+ */
+enum sdca_xu_reset_mechanism {
+	SDCA_XU_RESET_FUNCTION				= 0x0,
+	SDCA_XU_RESET_DEVICE				= 0x1,
+	SDCA_XU_RESET_BUS				= 0x2,
+};
+
+/**
+ * struct sdca_entity_xu - information specific to XU Entities
+ * @max_delay: the maximum time in microseconds allowed for the Device
+ * to change the ownership from Device to Host
+ * @reset_mechanism: indicates the type of reset that can be requested
+ * at the end of an FDL.
+ */
+struct sdca_entity_xu {
+	unsigned int max_delay;
+	enum sdca_xu_reset_mechanism reset_mechanism;
+};
+
 /**
  * struct sdca_entity - information for one SDCA Entity
  * @label: String such as "OT 12".
@@ -1106,6 +1127,7 @@ struct sdca_entity_hide {
  * @pde: Power Domain Entity specific Entity properties.
  * @ge: Group Entity specific Entity properties.
  * @hide: HIDE Entity specific Entity properties.
+ * @xu: XU Entity specific Entity properties.
  */
 struct sdca_entity {
 	const char *label;
@@ -1123,6 +1145,7 @@ struct sdca_entity {
 		struct sdca_entity_pde pde;
 		struct sdca_entity_ge ge;
 		struct sdca_entity_hide hide;
+		struct sdca_entity_xu xu;
 	};
 };
 
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 3e1df30f5d60..2e6674846221 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -1398,6 +1398,28 @@ find_sdca_entity_hide(struct device *dev, struct sdw_slave *sdw,
 	return 0;
 }
 
+static int find_sdca_entity_xu(struct device *dev,
+			       struct fwnode_handle *entity_node,
+			       struct sdca_entity *entity)
+{
+	struct sdca_entity_xu *xu = &entity->xu;
+	u32 tmp;
+	int ret;
+
+	ret = fwnode_property_read_u32(entity_node,
+				       "mipi-sdca-RxUMP-ownership-transition-max-delay",
+				       &tmp);
+	if (!ret)
+		xu->max_delay = tmp;
+
+	ret = fwnode_property_read_u32(entity_node, "mipi-sdca-FDL-reset-mechanism",
+				       &tmp);
+	if (!ret)
+		xu->reset_mechanism = tmp;
+
+	return 0;
+}
+
 static int find_sdca_entity(struct device *dev, struct sdw_slave *sdw,
 			    struct fwnode_handle *function_node,
 			    struct fwnode_handle *entity_node,
@@ -1430,6 +1452,9 @@ static int find_sdca_entity(struct device *dev, struct sdw_slave *sdw,
 	case SDCA_ENTITY_TYPE_OT:
 		ret = find_sdca_entity_iot(dev, entity_node, entity);
 		break;
+	case SDCA_ENTITY_TYPE_XU:
+		ret = find_sdca_entity_xu(dev, entity_node, entity);
+		break;
 	case SDCA_ENTITY_TYPE_CS:
 		ret = find_sdca_entity_cs(dev, entity_node, entity);
 		break;
-- 
cgit v1.2.3


From 7b6be935e7eff06025e18cea4c6620194450abe2 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:04 +0100
Subject: ASoC: SDCA: Parse Function Reset max delay

Parse the DisCo property to get the timeout for a Function Reset.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-12-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   |  3 +++
 sound/soc/sdca/sdca_functions.c | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index f2ce13162151..2e988a30481c 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1323,6 +1323,8 @@ enum sdca_cluster_range {
  * @num_clusters: Number of Channel Clusters reported in this Function.
  * @busy_max_delay: Maximum Function busy delay in microseconds, before an
  * error should be reported.
+ * @reset_max_delay: Maximum Function reset delay in microseconds, before an
+ * error should be reported.
  */
 struct sdca_function_data {
 	struct sdca_function_desc *desc;
@@ -1335,6 +1337,7 @@ struct sdca_function_data {
 	int num_clusters;
 
 	unsigned int busy_max_delay;
+	unsigned int reset_max_delay;
 };
 
 static inline u32 sdca_range(struct sdca_control_range *range,
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 2e6674846221..6602727c73f7 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -2033,8 +2033,14 @@ int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 	if (!ret)
 		function->busy_max_delay = tmp;
 
-	dev_info(dev, "%pfwP: name %s delay %dus\n", function->desc->node,
-		 function->desc->name, function->busy_max_delay);
+	ret = fwnode_property_read_u32(function_desc->node,
+				       "mipi-sdca-function-reset-max-delay", &tmp);
+	if (!ret)
+		function->reset_max_delay = tmp;
+
+	dev_info(dev, "%pfwP: name %s busy delay %dus reset delay %dus\n",
+		 function->desc->node, function->desc->name,
+		 function->busy_max_delay, function->reset_max_delay);
 
 	ret = find_sdca_init_table(dev, function_desc->node, function);
 	if (ret)
-- 
cgit v1.2.3


From daab108504be73182c16a72b9cfe47ac3b1928ca Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:05 +0100
Subject: ASoC: SDCA: Add UMP buffer helper functions

Add helper functions for handling Universal Message Passing (UMP)
buffers on SDCA devices. These are generic mechanisms to pass blocks of
binary data between the host and the device, in both directions. They
are used for things like passing HID descriptors and the File Download
process.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-13-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h |  26 +++++
 include/sound/sdca_ump.h      |  45 ++++++++
 sound/soc/sdca/Makefile       |   3 +-
 sound/soc/sdca/sdca_ump.c     | 247 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 include/sound/sdca_ump.h
 create mode 100644 sound/soc/sdca/sdca_ump.c

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 2e988a30481c..6dd44a7a8a35 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -133,6 +133,32 @@ struct sdca_init_write {
 #define SDCA_CTL_TYPE_S(ent, sel) SDCA_CTL_TYPE(SDCA_ENTITY_TYPE_##ent, \
 						SDCA_CTL_##ent##_##sel)
 
+/**
+ * enum sdca_messageoffset_range - Column definitions UMP MessageOffset
+ */
+enum sdca_messageoffset_range {
+	SDCA_MESSAGEOFFSET_BUFFER_START_ADDRESS		= 0,
+	SDCA_MESSAGEOFFSET_BUFFER_LENGTH		= 1,
+	SDCA_MESSAGEOFFSET_UMP_MODE			= 2,
+	SDCA_MESSAGEOFFSET_NCOLS			= 3,
+};
+
+/**
+ * enum sdca_ump_mode - SDCA UMP Mode
+ */
+enum sdca_ump_mode {
+	SDCA_UMP_MODE_DIRECT				= 0x00,
+	SDCA_UMP_MODE_INDIRECT				= 0x01,
+};
+
+/**
+ * enum sdca_ump_owner - SDCA UMP Owner
+ */
+enum sdca_ump_owner {
+	SDCA_UMP_OWNER_HOST				= 0x00,
+	SDCA_UMP_OWNER_DEVICE				= 0x01,
+};
+
 /**
  * enum sdca_it_controls - SDCA Controls for Input Terminal
  *
diff --git a/include/sound/sdca_ump.h b/include/sound/sdca_ump.h
new file mode 100644
index 000000000000..b2363199d19a
--- /dev/null
+++ b/include/sound/sdca_ump.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright (C) 2025 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef __SDCA_UMP_H__
+#define __SDCA_UMP_H__
+
+struct regmap;
+struct sdca_control;
+struct sdca_entity;
+struct sdca_function_data;
+struct snd_soc_component;
+
+int sdca_ump_get_owner_host(struct device *dev,
+			    struct regmap *function_regmap,
+			    struct sdca_function_data *function,
+			    struct sdca_entity *entity,
+			    struct sdca_control *control);
+int sdca_ump_set_owner_device(struct device *dev,
+			      struct regmap *function_regmap,
+			      struct sdca_function_data *function,
+			      struct sdca_entity *entity,
+			      struct sdca_control *control);
+int sdca_ump_read_message(struct device *dev,
+			  struct regmap *device_regmap,
+			  struct regmap *function_regmap,
+			  struct sdca_function_data *function,
+			  struct sdca_entity *entity,
+			  unsigned int offset_sel, unsigned int length_sel,
+			  void **msg);
+int sdca_ump_write_message(struct device *dev,
+			   struct regmap *device_regmap,
+			   struct regmap *function_regmap,
+			   struct sdca_function_data *function,
+			   struct sdca_entity *entity,
+			   unsigned int offset_sel, unsigned int msg_offset,
+			   unsigned int length_sel,
+			   void *msg, int msg_len);
+
+#endif // __SDCA_UMP_H__
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index 5e51760cb651..a1b24c95cd8c 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-snd-soc-sdca-y := sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o
+snd-soc-sdca-y := sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o \
+		  sdca_ump.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_HID) += sdca_hid.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_IRQ) += sdca_interrupts.o
 
diff --git a/sound/soc/sdca/sdca_ump.c b/sound/soc/sdca/sdca_ump.c
new file mode 100644
index 000000000000..5dcad2f7ea05
--- /dev/null
+++ b/sound/soc/sdca/sdca_ump.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/dev_printk.h>
+#include <linux/device.h>
+#include <linux/regmap.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+#include <sound/sdca_ump.h>
+#include <sound/soc-component.h>
+#include <linux/soundwire/sdw_registers.h>
+
+/**
+ * sdca_ump_get_owner_host - check a UMP buffer is owned by the host
+ * @dev: Pointer to the struct device used for error messages.
+ * @function_regmap: Pointer to the regmap for the SDCA Function.
+ * @function: Pointer to the Function information.
+ * @entity: Pointer to the SDCA Entity.
+ * @control: Pointer to the SDCA Control for the UMP Owner.
+ *
+ * Return: Returns zero on success, and a negative error code on failure.
+ */
+int sdca_ump_get_owner_host(struct device *dev,
+			    struct regmap *function_regmap,
+			    struct sdca_function_data *function,
+			    struct sdca_entity *entity,
+			    struct sdca_control *control)
+{
+	unsigned int reg, owner;
+	int ret;
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, control->sel, 0);
+	ret = regmap_read(function_regmap, reg, &owner);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to read UMP owner: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	if (owner != SDCA_UMP_OWNER_HOST) {
+		dev_err(dev, "%s: host is not the UMP owner\n", entity->label);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_get_owner_host, "SND_SOC_SDCA");
+
+/**
+ * sdca_ump_set_owner_device - set a UMP buffer's ownership back to the device
+ * @dev: Pointer to the struct device used for error messages.
+ * @function_regmap: Pointer to the regmap for the SDCA Function.
+ * @function: Pointer to the Function information.
+ * @entity: Pointer to the SDCA Entity.
+ * @control: Pointer to the SDCA Control for the UMP Owner.
+ *
+ * Return: Returns zero on success, and a negative error code on failure.
+ */
+int sdca_ump_set_owner_device(struct device *dev,
+			      struct regmap *function_regmap,
+			      struct sdca_function_data *function,
+			      struct sdca_entity *entity,
+			      struct sdca_control *control)
+{
+	unsigned int reg;
+	int ret;
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, control->sel, 0);
+	ret = regmap_write(function_regmap, reg, SDCA_UMP_OWNER_DEVICE);
+	if (ret < 0)
+		dev_err(dev, "%s: failed to write UMP owner: %d\n",
+			entity->label, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_set_owner_device, "SND_SOC_SDCA");
+
+/**
+ * sdca_ump_read_message - read a UMP message from the device
+ * @dev: Pointer to the struct device used for error messages.
+ * @device_regmap: Pointer to the Device register map.
+ * @function_regmap: Pointer to the regmap for the SDCA Function.
+ * @function: Pointer to the Function information.
+ * @entity: Pointer to the SDCA Entity.
+ * @offset_sel: Control Selector for the UMP Offset Control.
+ * @length_sel: Control Selector for the UMP Length Control.
+ * @msg: Pointer that will be populated with a dynamically allocated
+ * buffer containing the UMP message. Note this needs to be freed by
+ * the caller.
+ *
+ * The caller should first call sdca_ump_get_owner_host() to ensure the host
+ * currently owns the UMP buffer, and then this function can be used to
+ * retrieve a message. It is the caller's responsibility to free the
+ * message once it is finished with it. Finally sdca_ump_set_owner_device()
+ * should be called to return the buffer to the device.
+ *
+ * Return: Returns the message length on success, and a negative error
+ * code on failure.
+ */
+int sdca_ump_read_message(struct device *dev,
+			  struct regmap *device_regmap,
+			  struct regmap *function_regmap,
+			  struct sdca_function_data *function,
+			  struct sdca_entity *entity,
+			  unsigned int offset_sel, unsigned int length_sel,
+			  void **msg)
+{
+	struct sdca_control_range *range;
+	unsigned int msg_offset, msg_len;
+	unsigned int buf_addr, buf_len;
+	unsigned int reg;
+	int ret;
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, offset_sel, 0);
+	ret = regmap_read(function_regmap, reg, &msg_offset);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to read UMP offset: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	range = sdca_selector_find_range(dev, entity, offset_sel,
+					 SDCA_MESSAGEOFFSET_NCOLS, 1);
+	if (!range)
+		return -ENOENT;
+
+	buf_addr = sdca_range(range, SDCA_MESSAGEOFFSET_BUFFER_START_ADDRESS, 0);
+	buf_len = sdca_range(range, SDCA_MESSAGEOFFSET_BUFFER_LENGTH, 0);
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, length_sel, 0);
+	ret = regmap_read(function_regmap, reg, &msg_len);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to read UMP length: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	if (msg_offset > buf_len || msg_len > buf_len - msg_offset) {
+		dev_err(dev, "%s: message too big for UMP buffer: %d\n",
+			entity->label, msg_len);
+		return -EINVAL;
+	}
+
+	*msg = kmalloc(msg_len, GFP_KERNEL);
+	if (!*msg)
+		return -ENOMEM;
+
+	ret = regmap_raw_read(device_regmap, buf_addr + msg_offset, *msg, msg_len);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to read UMP message: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	return msg_len;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_read_message, "SND_SOC_SDCA");
+
+/**
+ * sdca_ump_write_message - write a UMP message to the device
+ * @dev: Pointer to the struct device used for error messages.
+ * @device_regmap: Pointer to the Device register map.
+ * @function_regmap: Pointer to the regmap for the SDCA Function.
+ * @function: Pointer to the Function information.
+ * @entity: Pointer to the SDCA Entity.
+ * @offset_sel: Control Selector for the UMP Offset Control.
+ * @msg_offset: Offset within the UMP buffer at which the message should
+ * be written.
+ * @length_sel: Control Selector for the UMP Length Control.
+ * @msg: Pointer to the data that should be written to the UMP buffer.
+ * @msg_len: Length of the message data in bytes.
+ *
+ * The caller should first call sdca_ump_get_owner_host() to ensure the host
+ * currently owns the UMP buffer, and then this function can be used to
+ * write a message. Finally sdca_ump_set_owner_device() should be called to
+ * return the buffer to the device, allowing the device to access the
+ * message.
+ *
+ * Return: Returns zero on success, and a negative error code on failure.
+ */
+int sdca_ump_write_message(struct device *dev,
+			   struct regmap *device_regmap,
+			   struct regmap *function_regmap,
+			   struct sdca_function_data *function,
+			   struct sdca_entity *entity,
+			   unsigned int offset_sel, unsigned int msg_offset,
+			   unsigned int length_sel,
+			   void *msg, int msg_len)
+{
+	struct sdca_control_range *range;
+	unsigned int buf_addr, buf_len, ump_mode;
+	unsigned int reg;
+	int ret;
+
+	range = sdca_selector_find_range(dev, entity, offset_sel,
+					 SDCA_MESSAGEOFFSET_NCOLS, 1);
+	if (!range)
+		return -ENOENT;
+
+	buf_addr = sdca_range(range, SDCA_MESSAGEOFFSET_BUFFER_START_ADDRESS, 0);
+	buf_len = sdca_range(range, SDCA_MESSAGEOFFSET_BUFFER_LENGTH, 0);
+	ump_mode = sdca_range(range, SDCA_MESSAGEOFFSET_UMP_MODE, 0);
+
+	if (msg_offset > buf_len || msg_len > buf_len - msg_offset) {
+		dev_err(dev, "%s: message too big for UMP buffer: %d\n",
+			entity->label, msg_len);
+		return -EINVAL;
+	}
+
+	if (ump_mode != SDCA_UMP_MODE_DIRECT) {
+		dev_err(dev, "%s: only direct mode currently supported\n",
+			entity->label);
+		return -EINVAL;
+	}
+
+	ret = regmap_raw_write(device_regmap, buf_addr + msg_offset, msg, msg_len);
+	if (ret) {
+		dev_err(dev, "%s: failed to write UMP message: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, offset_sel, 0);
+	ret = regmap_write(function_regmap, reg, msg_offset);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to write UMP offset: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, length_sel, 0);
+	ret = regmap_write(function_regmap, reg, msg_len);
+	if (ret < 0) {
+		dev_err(dev, "%s: failed to write UMP length: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_write_message, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From c4d096c3ca425562192a3626c30e82651d0f2c1c Mon Sep 17 00:00:00 2001
From: Maciej Strozek <mstrozek@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:06 +0100
Subject: ASoC: SDCA: Add SDCA FDL data parsing

Add parsing of ACPI DisCo information specific to FDL (File DownLoad).
DisCo contains a list of File Sets which can be requested by the device
and within each of those a list of individual files to be downloaded to
the device. Optionally the contents of the files may also be present in
a special ACPI table, called SWFT (SoundWire File Table).

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-14-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca.h            |  5 +++
 include/sound/sdca_function.h   | 40 ++++++++++++++++++
 sound/soc/sdca/sdca_device.c    | 20 +++++++++
 sound/soc/sdca/sdca_functions.c | 93 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca.h b/include/sound/sdca.h
index 9c6a351c9d47..d38cdbfeb35f 100644
--- a/include/sound/sdca.h
+++ b/include/sound/sdca.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/kconfig.h>
 
+struct acpi_table_swft;
 struct sdw_slave;
 
 #define SDCA_MAX_FUNCTION_COUNT 8
@@ -37,11 +38,13 @@ struct sdca_function_desc {
  * @num_functions: Total number of supported SDCA functions. Invalid/unsupported
  * functions will be skipped.
  * @function: Array of function descriptors.
+ * @swft: Pointer to the SWFT table, if available.
  */
 struct sdca_device_data {
 	u32 interface_revision;
 	int num_functions;
 	struct sdca_function_desc function[SDCA_MAX_FUNCTION_COUNT];
+	struct acpi_table_swft *swft;
 };
 
 enum sdca_quirk {
@@ -52,12 +55,14 @@ enum sdca_quirk {
 #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA)
 
 void sdca_lookup_functions(struct sdw_slave *slave);
+void sdca_lookup_swft(struct sdw_slave *slave);
 void sdca_lookup_interface_revision(struct sdw_slave *slave);
 bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk);
 
 #else
 
 static inline void sdca_lookup_functions(struct sdw_slave *slave) {}
+static inline void sdca_lookup_swft(struct sdw_slave *slave) {}
 static inline void sdca_lookup_interface_revision(struct sdw_slave *slave) {}
 static inline bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk)
 {
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 6dd44a7a8a35..f557206cec83 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/hid.h>
 
+struct acpi_table_swft;
 struct device;
 struct sdca_entity;
 struct sdca_function_desc;
@@ -1338,6 +1339,42 @@ enum sdca_cluster_range {
 	SDCA_CLUSTER_NCOLS				= 2,
 };
 
+/**
+ * struct sdca_fdl_file - information about a file from a fileset used in FDL
+ * @vendor_id: Vendor ID of the file.
+ * @file_id: File ID of the file.
+ * @fdl_offset: Offset information for FDL.
+ */
+struct sdca_fdl_file {
+	u16 vendor_id;
+	u32 file_id;
+	u32 fdl_offset;
+};
+
+/**
+ * struct sdca_fdl_set - information about a set of files used in FDL
+ * @files: Array of files in this FDL set.
+ * @num_files: Number of files in this FDL set.
+ * @id: ID of the FDL set.
+ */
+struct sdca_fdl_set {
+	struct sdca_fdl_file *files;
+	int num_files;
+	u32 id;
+};
+
+/**
+ * struct sdca_fdl_data - information about a function's FDL data
+ * @swft: Pointer to the SoundWire File Table.
+ * @sets: Array of FDL sets used by this function.
+ * @num_sets: Number of FDL sets used by this function.
+ */
+struct sdca_fdl_data {
+	struct acpi_table_swft *swft;
+	struct sdca_fdl_set *sets;
+	int num_sets;
+};
+
 /**
  * struct sdca_function_data - top-level information for one SDCA function
  * @desc: Pointer to short descriptor from initial parsing.
@@ -1351,6 +1388,7 @@ enum sdca_cluster_range {
  * error should be reported.
  * @reset_max_delay: Maximum Function reset delay in microseconds, before an
  * error should be reported.
+ * @fdl_data: FDL data for this Function, if available.
  */
 struct sdca_function_data {
 	struct sdca_function_desc *desc;
@@ -1364,6 +1402,8 @@ struct sdca_function_data {
 
 	unsigned int busy_max_delay;
 	unsigned int reset_max_delay;
+
+	struct sdca_fdl_data fdl_data;
 };
 
 static inline u32 sdca_range(struct sdca_control_range *range,
diff --git a/sound/soc/sdca/sdca_device.c b/sound/soc/sdca/sdca_device.c
index 4798ce2c8f0b..405e80b979de 100644
--- a/sound/soc/sdca/sdca_device.c
+++ b/sound/soc/sdca/sdca_device.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/device.h>
 #include <linux/dmi.h>
 #include <linux/module.h>
 #include <linux/property.h>
@@ -27,6 +28,25 @@ void sdca_lookup_interface_revision(struct sdw_slave *slave)
 }
 EXPORT_SYMBOL_NS(sdca_lookup_interface_revision, "SND_SOC_SDCA");
 
+static void devm_acpi_table_put(void *ptr)
+{
+	acpi_put_table((struct acpi_table_header *)ptr);
+}
+
+void sdca_lookup_swft(struct sdw_slave *slave)
+{
+	struct acpi_table_header **swft = (void *)&slave->sdca_data.swft;
+
+	/* Register the table itself for release, not the address of the field
+	 * holding it. If registration fails the table was already put: drop it.
+	 */
+	if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_SWFT, 0, swft)))
+		dev_info(&slave->dev, "SWFT not available\n");
+	else if (devm_add_action_or_reset(&slave->dev, devm_acpi_table_put, *swft))
+		*swft = NULL;
+}
+EXPORT_SYMBOL_NS(sdca_lookup_swft, "SND_SOC_SDCA");
+
 static bool sdca_device_quirk_rt712_vb(struct sdw_slave *slave)
 {
 	struct sdw_slave_id *id = &slave->id;
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 6602727c73f7..b2e3fab9bd95 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -2010,6 +2010,95 @@ static int find_sdca_clusters(struct device *dev,
 	return 0;
 }
 
+static int find_sdca_filesets(struct device *dev, struct sdw_slave *sdw,
+			      struct fwnode_handle *function_node,
+			      struct sdca_function_data *function)
+{
+	static const int mult_fileset = 3;
+	char fileset_name[SDCA_PROPERTY_LENGTH];
+	u32 *filesets_list __free(kfree) = NULL;
+	struct sdca_fdl_set *sets;
+	int num_sets;
+	int i, j;
+
+	num_sets = fwnode_property_count_u32(function_node,
+					     "mipi-sdca-file-set-id-list");
+	if (num_sets == 0 || num_sets == -EINVAL) {
+		return 0;
+	} else if (num_sets < 0) {
+		dev_err(dev, "%pfwP: failed to read file set list: %d\n",
+			function_node, num_sets);
+		return num_sets;
+	}
+
+	filesets_list = kcalloc(num_sets, sizeof(u32), GFP_KERNEL);
+	if (!filesets_list)
+		return -ENOMEM;
+
+	fwnode_property_read_u32_array(function_node, "mipi-sdca-file-set-id-list",
+				       filesets_list, num_sets);
+
+	sets = devm_kcalloc(dev, num_sets, sizeof(struct sdca_fdl_set), GFP_KERNEL);
+	if (!sets)
+		return -ENOMEM;
+
+	for (i = 0; i < num_sets; i++) {
+		u32 *fileset_entries __free(kfree) = NULL;
+		struct sdca_fdl_set *set = &sets[i];
+		struct sdca_fdl_file *files;
+		int num_files, num_entries;
+
+		snprintf(fileset_name, sizeof(fileset_name),
+			 "mipi-sdca-file-set-id-0x%X", filesets_list[i]);
+
+		num_entries = fwnode_property_count_u32(function_node, fileset_name);
+		if (num_entries <= 0) {
+			dev_err(dev, "%pfwP: file set %d missing entries: %d\n",
+				function_node, filesets_list[i], num_entries);
+			return -EINVAL;
+		} else if (num_entries % mult_fileset != 0) {
+			dev_err(dev, "%pfwP: file set %d files not multiple of %d\n",
+				function_node, filesets_list[i], mult_fileset);
+			return -EINVAL;
+		}
+
+		dev_info(dev, "fileset: %#x\n", filesets_list[i]);
+
+		files = devm_kcalloc(dev, num_entries / mult_fileset,
+				     sizeof(struct sdca_fdl_file), GFP_KERNEL);
+		if (!files)
+			return -ENOMEM;
+
+		fileset_entries = kcalloc(num_entries, sizeof(u32), GFP_KERNEL);
+		if (!fileset_entries)
+			return -ENOMEM;
+
+		fwnode_property_read_u32_array(function_node, fileset_name,
+					       fileset_entries, num_entries);
+
+		for (j = 0, num_files = 0; j < num_entries; num_files++) {
+			struct sdca_fdl_file *file = &files[num_files];
+
+			file->vendor_id = fileset_entries[j++];
+			file->file_id = fileset_entries[j++];
+			file->fdl_offset = fileset_entries[j++];
+
+			dev_info(dev, "file: %#x, vendor: %#x, offset: %#x\n",
+				 file->file_id, file->vendor_id, file->fdl_offset);
+		}
+
+		set->id = filesets_list[i];
+		set->num_files = num_files;
+		set->files = files;
+	}
+
+	function->fdl_data.swft = sdw->sdca_data.swft;
+	function->fdl_data.num_sets = num_sets;
+	function->fdl_data.sets = sets;
+
+	return 0;
+}
+
 /**
  * sdca_parse_function - parse ACPI DisCo for a Function
  * @dev: Pointer to device against which function data will be allocated.
@@ -2058,6 +2147,10 @@ int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 	if (ret < 0)
 		return ret;
 
+	ret = find_sdca_filesets(dev, sdw, function_desc->node, function);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 EXPORT_SYMBOL_NS(sdca_parse_function, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From 71f7990a34cdb11f82d3cbbcddaca77a55635466 Mon Sep 17 00:00:00 2001
From: Maciej Strozek <mstrozek@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:07 +0100
Subject: ASoC: SDCA: Add FDL library for XU entities

Some instances of the XU Entity have a need for Files to be downloaded
from the Host. In these XUs, there is one instance of a Host to Device
(Consumer) UMP, identified by the FDL_CurrentOwner Control. FDL Library
introduced here implements the FDL flow triggered by FDL_CurrentOwner
irq, which sends a file from SoundWire File Table (SWFT) or from the
firmware directory in specific cases, to the Device FDL UMP.

Currently only Direct method of FDL is implemented.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-15-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_fdl.h      |  58 +++++++
 include/sound/sdca_function.h |  24 +++
 sound/soc/sdca/Kconfig        |   8 +
 sound/soc/sdca/Makefile       |   1 +
 sound/soc/sdca/sdca_fdl.c     | 376 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 467 insertions(+)
 create mode 100644 include/sound/sdca_fdl.h
 create mode 100644 sound/soc/sdca/sdca_fdl.c

(limited to 'include')

diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h
new file mode 100644
index 000000000000..8b025aff4a0c
--- /dev/null
+++ b/include/sound/sdca_fdl.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright (C) 2025 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef __SDCA_FDL_H__
+#define __SDCA_FDL_H__
+
+struct device;
+struct regmap;
+struct sdca_fdl_set;
+struct sdca_function_data;
+struct sdca_interrupt;
+
+/**
+ * struct fdl_state - FDL state structure to keep data between interrupts
+ * @set: Pointer to the FDL set currently being downloaded.
+ * @file_index: Index of the current file being processed.
+ */
+struct fdl_state {
+	struct sdca_fdl_set *set;
+	int file_index;
+};
+
+#define SDCA_CTL_XU_FDLH_COMPLETE	0
+#define SDCA_CTL_XU_FDLH_MORE_FILES	SDCA_CTL_XU_FDLH_SET_IN_PROGRESS
+#define SDCA_CTL_XU_FDLH_FILE_AVAILABLE	(SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \
+					 SDCA_CTL_XU_FDLH_SET_IN_PROGRESS)
+#define SDCA_CTL_XU_FDLH_MASK		(SDCA_CTL_XU_FDLH_TRANSFERRED_CHUNK | \
+					 SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \
+					 SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \
+					 SDCA_CTL_XU_FDLH_RESET_ACK | \
+					 SDCA_CTL_XU_FDLH_REQ_ABORT)
+
+#define SDCA_CTL_XU_FDLD_COMPLETE	0
+#define SDCA_CTL_XU_FDLD_FILE_OK	(SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \
+					 SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \
+					 SDCA_CTL_XU_FDLD_ACK_TRANSFER | \
+					 SDCA_CTL_XU_FDLD_NEEDS_SET)
+#define SDCA_CTL_XU_FDLD_MORE_FILES_OK	(SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \
+					 SDCA_CTL_XU_FDLD_ACK_TRANSFER | \
+					 SDCA_CTL_XU_FDLD_NEEDS_SET)
+#define SDCA_CTL_XU_FDLD_MASK		(SDCA_CTL_XU_FDLD_REQ_RESET | \
+					 SDCA_CTL_XU_FDLD_REQ_ABORT | \
+					 SDCA_CTL_XU_FDLD_ACK_TRANSFER | \
+					 SDCA_CTL_XU_FDLD_NEEDS_SET)
+
+int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt);
+int sdca_fdl_process(struct sdca_interrupt *interrupt);
+
+int sdca_reset_function(struct device *dev, struct sdca_function_data *function,
+			struct regmap *regmap);
+
+#endif // __SDCA_FDL_H__
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index f557206cec83..99cb978f7099 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -285,6 +285,27 @@ enum sdca_xu_controls {
 	SDCA_CTL_XU_FDL_STATUS				= 0x14,
 	SDCA_CTL_XU_FDL_SET_INDEX			= 0x15,
 	SDCA_CTL_XU_FDL_HOST_REQUEST			= 0x16,
+
+	/* FDL Status Host->Device bit definitions */
+	SDCA_CTL_XU_FDLH_TRANSFERRED_CHUNK		= BIT(0),
+	SDCA_CTL_XU_FDLH_TRANSFERRED_FILE		= BIT(1),
+	SDCA_CTL_XU_FDLH_SET_IN_PROGRESS		= BIT(2),
+	SDCA_CTL_XU_FDLH_RESET_ACK			= BIT(4),
+	SDCA_CTL_XU_FDLH_REQ_ABORT			= BIT(5),
+	/* FDL Status Device->Host bit definitions */
+	SDCA_CTL_XU_FDLD_REQ_RESET			= BIT(4),
+	SDCA_CTL_XU_FDLD_REQ_ABORT			= BIT(5),
+	SDCA_CTL_XU_FDLD_ACK_TRANSFER			= BIT(6),
+	SDCA_CTL_XU_FDLD_NEEDS_SET			= BIT(7),
+};
+
+/**
+ * enum sdca_fdl_set_index_range - Column definitions for UMP SetIndex
+ */
+enum sdca_fdl_set_index_range {
+	SDCA_FDL_SET_INDEX_SET_NUMBER			= 0,
+	SDCA_FDL_SET_INDEX_FILE_SET_ID			= 1,
+	SDCA_FDL_SET_INDEX_NCOLS			= 2,
 };
 
 /**
@@ -569,6 +590,9 @@ enum sdca_entity0_controls {
 	SDCA_CTL_ENTITY_0_FUNCTION_NEEDS_INITIALIZATION	= BIT(5),
 	SDCA_CTL_ENTITY_0_FUNCTION_HAS_BEEN_RESET	= BIT(6),
 	SDCA_CTL_ENTITY_0_FUNCTION_BUSY			= BIT(7),
+
+	/* Function Action Bits */
+	SDCA_CTL_ENTITY_0_RESET_FUNCTION_NOW		= BIT(0),
 };
 
 #define SDCA_CTL_MIC_BIAS_NAME				"Mic Bias"
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
index 6a3ba43f26bd..a73920d07073 100644
--- a/sound/soc/sdca/Kconfig
+++ b/sound/soc/sdca/Kconfig
@@ -25,6 +25,14 @@ config SND_SOC_SDCA_IRQ
 	help
 	  This option enables support for SDCA IRQs.
 
+config SND_SOC_SDCA_FDL
+	bool "SDCA FDL (File DownLoad) support"
+	depends on SND_SOC_SDCA
+	default y
+	help
+	  This option enables support for the File Download using UMP,
+	  typically used for downloading firmware to devices.
+
 config SND_SOC_SDCA_OPTIONAL
 	def_tristate SND_SOC_SDCA || !SND_SOC_SDCA
 
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index a1b24c95cd8c..be911c399bbd 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -4,5 +4,6 @@ snd-soc-sdca-y := sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o \
 		  sdca_ump.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_HID) += sdca_hid.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_IRQ) += sdca_interrupts.o
+snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_FDL) += sdca_fdl.o
 
 obj-$(CONFIG_SND_SOC_SDCA) += snd-soc-sdca.o
diff --git a/sound/soc/sdca/sdca_fdl.c b/sound/soc/sdca/sdca_fdl.c
new file mode 100644
index 000000000000..8a15c6300556
--- /dev/null
+++ b/sound/soc/sdca/sdca_fdl.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/dev_printk.h>
+#include <linux/dmi.h>
+#include <linux/firmware.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/regmap.h>
+#include <linux/sprintf.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_registers.h>
+#include <sound/sdca.h>
+#include <sound/sdca_fdl.h>
+#include <sound/sdca_function.h>
+#include <sound/sdca_interrupts.h>
+#include <sound/sdca_ump.h>
+
+/**
+ * sdca_reset_function - send an SDCA function reset
+ * @dev: Device pointer for error messages.
+ * @function: Pointer to the SDCA Function.
+ * @regmap: Pointer to the SDCA Function regmap.
+ *
+ * Return: Zero on success or a negative error code.
+ */
+int sdca_reset_function(struct device *dev, struct sdca_function_data *function,
+			struct regmap *regmap)
+{
+	unsigned int reg = SDW_SDCA_CTL(function->desc->adr,
+					SDCA_ENTITY_TYPE_ENTITY_0,
+					SDCA_CTL_ENTITY_0_FUNCTION_ACTION, 0);
+	unsigned int val, poll_us;
+	int ret;
+
+	ret = regmap_write(regmap, reg, SDCA_CTL_ENTITY_0_RESET_FUNCTION_NOW);
+	if (ret) // Allowed for function reset to not be implemented
+		return 0;
+
+	if (!function->reset_max_delay) {
+		dev_err(dev, "No reset delay specified in DisCo\n");
+		return -EINVAL;
+	}
+
+	poll_us = umin(function->reset_max_delay >> 4, 1000);
+
+	ret = regmap_read_poll_timeout(regmap, reg, val, !val, poll_us,
+				       function->reset_max_delay);
+	if (ret) {
+		dev_err(dev, "Failed waiting for function reset: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_reset_function, "SND_SOC_SDCA");
+
+static char *fdl_get_sku_filename(struct device *dev,
+				  struct sdca_fdl_file *fdl_file)
+{
+	struct device *parent = dev;
+	const char *product_vendor;
+	const char *product_sku;
+
+	/*
+	 * Try to find pci_dev manually because the card may not be ready to be
+	 * used for snd_soc_card_get_pci_ssid yet
+	 */
+	while (parent) {
+		if (dev_is_pci(parent)) {
+			struct pci_dev *pci_dev = to_pci_dev(parent);
+
+			return kasprintf(GFP_KERNEL, "sdca/%x/%x/%x/%x.bin",
+					 fdl_file->vendor_id,
+					 pci_dev->subsystem_vendor,
+					 pci_dev->subsystem_device,
+					 fdl_file->file_id);
+		} else {
+			parent = parent->parent;
+		}
+	}
+
+	product_vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+	if (!product_vendor || !strcmp(product_vendor, "Default string"))
+		product_vendor = dmi_get_system_info(DMI_BOARD_VENDOR);
+	if (!product_vendor || !strcmp(product_vendor, "Default string"))
+		product_vendor = dmi_get_system_info(DMI_CHASSIS_VENDOR);
+	if (!product_vendor)
+		product_vendor = "unknown";
+
+	product_sku = dmi_get_system_info(DMI_PRODUCT_SKU);
+	if (!product_sku || !strcmp(product_sku, "Default string"))
+		product_sku = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (!product_sku)
+		product_sku = "unknown";
+
+	return kasprintf(GFP_KERNEL, "sdca/%x/%s/%s/%x.bin", fdl_file->vendor_id,
+			 product_vendor, product_sku, fdl_file->file_id);
+}
+
+static int fdl_load_file(struct sdca_interrupt *interrupt,
+			 struct sdca_fdl_set *set, int file_index)
+{
+	struct device *dev = interrupt->dev;
+	struct sdca_fdl_data *fdl_data = &interrupt->function->fdl_data;
+	const struct firmware *firmware = NULL;
+	struct acpi_sw_file *swf = NULL, *tmp;
+	struct sdca_fdl_file *fdl_file;
+	char *disk_filename;
+	int i, ret;
+
+	if (!set) {
+		dev_err(dev, "request to load SWF with no set\n");
+		return -EINVAL;
+	}
+
+	fdl_file = &set->files[file_index];
+
+	if (fdl_data->swft) {
+		/* header.length counts the table header, so offsets start after it */
+		tmp = fdl_data->swft->files;
+		for (i = offsetof(struct acpi_table_swft, files);
+		     i + sizeof(*tmp) <= fdl_data->swft->header.length && tmp->file_length;
+		     i += tmp->file_length,
+		     tmp = ACPI_ADD_PTR(struct acpi_sw_file, tmp, tmp->file_length)) {
+			if (tmp->vendor_id == fdl_file->vendor_id &&
+			    tmp->file_id == fdl_file->file_id) {
+				dev_dbg(dev, "located SWF in ACPI: %x-%x-%x\n",
+					tmp->vendor_id, tmp->file_id, tmp->file_version);
+				swf = tmp;
+				break;
+			}
+		}
+	}
+
+	disk_filename = fdl_get_sku_filename(dev, fdl_file);
+	if (!disk_filename)
+		return -ENOMEM;
+
+	dev_dbg(dev, "FDL disk filename: %s\n", disk_filename);
+
+	ret = firmware_request_nowarn(&firmware, disk_filename, dev);
+	kfree(disk_filename);
+	if (ret) {
+		disk_filename = kasprintf(GFP_KERNEL, "sdca/%x/%x.bin",
+					  fdl_file->vendor_id, fdl_file->file_id);
+		if (!disk_filename)
+			return -ENOMEM;
+
+		dev_dbg(dev, "FDL disk filename: %s\n", disk_filename);
+
+		ret = firmware_request_nowarn(&firmware, disk_filename, dev);
+		kfree(disk_filename);
+	}
+
+	if (!ret) {
+		tmp = (struct acpi_sw_file *)&firmware->data[0];
+		if (firmware->size < sizeof(*tmp) || tmp->file_length != firmware->size) {
+			dev_err(dev, "bad disk SWF size\n");
+		} else if (!swf || swf->file_version <= tmp->file_version) {
+			dev_dbg(dev, "using SWF from disk: %x-%x-%x\n",
+				tmp->vendor_id, tmp->file_id, tmp->file_version);
+			swf = tmp;
+		}
+	}
+
+	if (!swf) {
+		dev_err(dev, "failed to locate SWF\n");
+		release_firmware(firmware); /* a rejected disk copy may still be held */
+		return -ENOENT;
+	}
+
+	ret = sdca_ump_write_message(dev, interrupt->device_regmap,
+				     interrupt->function_regmap,
+				     interrupt->function, interrupt->entity,
+				     SDCA_CTL_XU_FDL_MESSAGEOFFSET, fdl_file->fdl_offset,
+				     SDCA_CTL_XU_FDL_MESSAGELENGTH, swf->data,
+				     swf->file_length - offsetof(struct acpi_sw_file, data));
+	release_firmware(firmware);
+	return ret;
+}
+
+static struct sdca_fdl_set *fdl_get_set(struct sdca_interrupt *interrupt)
+{
+	struct device *dev = interrupt->dev;
+	struct sdca_fdl_data *fdl_data = &interrupt->function->fdl_data;
+	struct sdca_entity *xu = interrupt->entity;
+	struct sdca_control_range *range;
+	unsigned int val;
+	int i, ret;
+
+	ret = regmap_read(interrupt->function_regmap,
+			  SDW_SDCA_CTL(interrupt->function->desc->adr, xu->id,
+				       SDCA_CTL_XU_FDL_SET_INDEX, 0),
+			  &val);
+	if (ret < 0) {
+		dev_err(dev, "failed to read FDL set index: %d\n", ret);
+		return NULL;
+	}
+
+	range = sdca_selector_find_range(dev, xu, SDCA_CTL_XU_FDL_SET_INDEX,
+					 SDCA_FDL_SET_INDEX_NCOLS, 0);
+
+	val = sdca_range_search(range, SDCA_FDL_SET_INDEX_SET_NUMBER,
+				val, SDCA_FDL_SET_INDEX_FILE_SET_ID);
+
+	for (i = 0; i < fdl_data->num_sets; i++) {
+		if (fdl_data->sets[i].id == val)
+			return &fdl_data->sets[i];
+	}
+
+	dev_err(dev, "invalid fileset id: %d\n", val);
+	return NULL;
+}
+
+static void fdl_end(struct sdca_interrupt *interrupt)
+{
+	struct fdl_state *fdl_state = interrupt->priv;
+
+	if (!fdl_state->set)
+		return;
+
+	fdl_state->set = NULL;
+
+	dev_dbg(interrupt->dev, "completed FDL process\n");
+}
+
+static int fdl_status_process(struct sdca_interrupt *interrupt, unsigned int status)
+{
+	struct fdl_state *fdl_state = interrupt->priv;
+	int ret;
+
+	switch (status) {
+	case SDCA_CTL_XU_FDLD_NEEDS_SET:
+		dev_dbg(interrupt->dev, "starting FDL process...\n");
+
+		fdl_state->file_index = 0;
+		fdl_state->set = fdl_get_set(interrupt);
+		fallthrough;
+	case SDCA_CTL_XU_FDLD_MORE_FILES_OK:
+		ret = fdl_load_file(interrupt, fdl_state->set, fdl_state->file_index);
+		if (ret) {
+			fdl_end(interrupt);
+			return SDCA_CTL_XU_FDLH_REQ_ABORT;
+		}
+
+		return SDCA_CTL_XU_FDLH_FILE_AVAILABLE;
+	case SDCA_CTL_XU_FDLD_FILE_OK:
+		if (!fdl_state->set) {
+			fdl_end(interrupt);
+			return SDCA_CTL_XU_FDLH_REQ_ABORT;
+		}
+
+		fdl_state->file_index++;
+
+		if (fdl_state->file_index < fdl_state->set->num_files)
+			return SDCA_CTL_XU_FDLH_MORE_FILES;
+		fallthrough;
+	case SDCA_CTL_XU_FDLD_COMPLETE:
+		fdl_end(interrupt);
+		return SDCA_CTL_XU_FDLH_COMPLETE;
+	default:
+		fdl_end(interrupt);
+
+		if (status & SDCA_CTL_XU_FDLD_REQ_RESET)
+			return SDCA_CTL_XU_FDLH_RESET_ACK;
+		else if (status & SDCA_CTL_XU_FDLD_REQ_ABORT)
+			return SDCA_CTL_XU_FDLH_COMPLETE;
+
+		dev_err(interrupt->dev, "invalid FDL status: %x\n", status);
+		return -EINVAL;
+	}
+}
+
+/**
+ * sdca_fdl_process - Process the FDL state machine
+ * @interrupt: SDCA interrupt structure
+ *
+ * Based on section 13.2.5 Flow Diagram for File Download, Host side.
+ *
+ * Return: Zero on success or a negative error code.
+ */
+int sdca_fdl_process(struct sdca_interrupt *interrupt)
+{
+	struct device *dev = interrupt->dev;
+	struct sdca_entity_xu *xu = &interrupt->entity->xu;
+	unsigned int reg, status;
+	int response, ret;
+
+	ret = sdca_ump_get_owner_host(dev, interrupt->function_regmap,
+				      interrupt->function, interrupt->entity,
+				      interrupt->control);
+	if (ret)
+		goto reset_function;
+
+	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
+			   SDCA_CTL_XU_FDL_STATUS, 0);
+	ret = regmap_read(interrupt->function_regmap, reg, &status);
+	if (ret < 0) {
+		dev_err(dev, "failed to read FDL status: %d\n", ret);
+		return ret;
+	}
+
+	dev_dbg(dev, "FDL status: %#x\n", status);
+
+	ret = fdl_status_process(interrupt, status);
+	if (ret < 0)
+		goto reset_function;
+
+	response = ret;
+
+	dev_dbg(dev, "FDL response: %#x\n", response);
+
+	ret = regmap_write(interrupt->function_regmap, reg,
+			   response | (status & ~SDCA_CTL_XU_FDLH_MASK));
+	if (ret < 0) {
+		dev_err(dev, "failed to set FDL status signal: %d\n", ret);
+		return ret;
+	}
+
+	ret = sdca_ump_set_owner_device(dev, interrupt->function_regmap,
+					interrupt->function, interrupt->entity,
+					interrupt->control);
+	if (ret)
+		return ret;
+
+	switch (response) {
+	case SDCA_CTL_XU_FDLH_RESET_ACK:
+		dev_dbg(dev, "FDL request reset\n");
+
+		switch (xu->reset_mechanism) {
+		default:
+			dev_warn(dev, "Requested reset mechanism not implemented\n");
+			fallthrough;
+		case SDCA_XU_RESET_FUNCTION:
+			goto reset_function;
+		}
+	default:
+		return 0;
+	}
+
+reset_function:
+	sdca_reset_function(dev, interrupt->function, interrupt->function_regmap);
+
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_fdl_process, "SND_SOC_SDCA");
+
+/**
+ * sdca_fdl_alloc_state - allocate state for an FDL interrupt
+ * @interrupt: SDCA interrupt structure.
+ *
+ * Return: Zero on success or a negative error code.
+ */
+int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt)
+{
+	struct device *dev = interrupt->dev;
+	struct fdl_state *fdl_state;
+
+	fdl_state = devm_kzalloc(dev, sizeof(struct fdl_state), GFP_KERNEL);
+	if (!fdl_state)
+		return -ENOMEM;
+
+	interrupt->priv = fdl_state;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_fdl_alloc_state, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From 0723affa1bee50c3bd7ca00e00dee07fcef224b8 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:09 +0100
Subject: ASoC: SDCA: Add completion for FDL start and stop

Add some completions and a helper function to allow other parts of the
system to wait for FDL to complete. The sdca_fdl_sync() function will
wait until it completes a full timeout without a new FDL request
happening; this ensures that even parts requiring multiple rounds of FDL
are fully downloaded before the driver boot continues.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-17-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_fdl.h  | 10 +++++++
 sound/soc/sdca/sdca_fdl.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h
index 8b025aff4a0c..4ea000d6acef 100644
--- a/include/sound/sdca_fdl.h
+++ b/include/sound/sdca_fdl.h
@@ -10,18 +10,26 @@
 #ifndef __SDCA_FDL_H__
 #define __SDCA_FDL_H__
 
+#include <linux/completion.h>
+
 struct device;
 struct regmap;
 struct sdca_fdl_set;
 struct sdca_function_data;
 struct sdca_interrupt;
+struct sdca_interrupt_info;
 
 /**
  * struct fdl_state - FDL state structure to keep data between interrupts
+ * @begin: Completion indicating the start of an FDL download cycle.
+ * @done: Completion indicating the end of an FDL download cycle.
  * @set: Pointer to the FDL set currently being downloaded.
  * @file_index: Index of the current file being processed.
  */
 struct fdl_state {
+	struct completion begin;
+	struct completion done;
+
 	struct sdca_fdl_set *set;
 	int file_index;
 };
@@ -51,6 +59,8 @@ struct fdl_state {
 
 int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt);
 int sdca_fdl_process(struct sdca_interrupt *interrupt);
+int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
+		  struct sdca_interrupt_info *info);
 
 int sdca_reset_function(struct device *dev, struct sdca_function_data *function,
 			struct regmap *regmap);
diff --git a/sound/soc/sdca/sdca_fdl.c b/sound/soc/sdca/sdca_fdl.c
index 8a15c6300556..39298314f69c 100644
--- a/sound/soc/sdca/sdca_fdl.c
+++ b/sound/soc/sdca/sdca_fdl.c
@@ -14,6 +14,7 @@
 #include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pm_runtime.h>
 #include <linux/regmap.h>
 #include <linux/sprintf.h>
 #include <linux/soundwire/sdw.h>
@@ -63,6 +64,71 @@ int sdca_reset_function(struct device *dev, struct sdca_function_data *function,
 }
 EXPORT_SYMBOL_NS(sdca_reset_function, "SND_SOC_SDCA");
 
+/**
+ * sdca_fdl_sync - wait for a function to finish FDL
+ * @dev: Device pointer for error messages.
+ * @function: Pointer to the SDCA Function.
+ * @info: Pointer to the SDCA interrupt info for this device.
+ *
+ * Return: Zero on success or a negative error code.
+ */
+int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
+		  struct sdca_interrupt_info *info)
+{
+	static const int fdl_retries = 6;
+	unsigned long begin_timeout = msecs_to_jiffies(100);
+	unsigned long done_timeout = msecs_to_jiffies(4000);
+	int nfdl;
+	int i, j;
+
+	for (i = 0; i < fdl_retries; i++) {
+		nfdl = 0;
+
+		for (j = 0; j < SDCA_MAX_INTERRUPTS; j++) {
+			struct sdca_interrupt *interrupt = &info->irqs[j];
+			struct fdl_state *fdl_state;
+			unsigned long time;
+
+			if (interrupt->function != function ||
+			    !interrupt->entity || !interrupt->control ||
+			    interrupt->entity->type != SDCA_ENTITY_TYPE_XU ||
+			    interrupt->control->sel != SDCA_CTL_XU_FDL_CURRENTOWNER)
+				continue;
+
+			fdl_state = interrupt->priv;
+			nfdl++;
+
+			/*
+			 * Looking for timeout without any new FDL requests
+			 * to imply the device has completed initial
+			 * firmware setup. Alas the specification doesn't
+			 * have any mechanism to detect this.
+			 */
+			time = wait_for_completion_timeout(&fdl_state->begin,
+							   begin_timeout);
+			if (!time) {
+				dev_dbg(dev, "no new FDL starts\n");
+				nfdl--;
+				continue;
+			}
+
+			time = wait_for_completion_timeout(&fdl_state->done,
+							   done_timeout);
+			if (!time) {
+				dev_err(dev, "timed out waiting for FDL to complete\n");
+				return -ETIMEDOUT;
+			}
+		}
+
+		if (!nfdl)
+			return 0;
+	}
+
+	dev_err(dev, "too many FDL requests\n");
+	return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_fdl_sync, "SND_SOC_SDCA");
+
 static char *fdl_get_sku_filename(struct device *dev,
 				  struct sdca_fdl_file *fdl_file)
 {
@@ -230,6 +296,9 @@ static void fdl_end(struct sdca_interrupt *interrupt)
 
 	fdl_state->set = NULL;
 
+	pm_runtime_put(interrupt->dev);
+	complete(&fdl_state->done);
+
 	dev_dbg(interrupt->dev, "completed FDL process\n");
 }
 
@@ -242,6 +311,9 @@ static int fdl_status_process(struct sdca_interrupt *interrupt, unsigned int sta
 	case SDCA_CTL_XU_FDLD_NEEDS_SET:
 		dev_dbg(interrupt->dev, "starting FDL process...\n");
 
+		pm_runtime_get(interrupt->dev);
+		complete(&fdl_state->begin);
+
 		fdl_state->file_index = 0;
 		fdl_state->set = fdl_get_set(interrupt);
 		fallthrough;
@@ -369,6 +441,9 @@ int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt)
 	if (!fdl_state)
 		return -ENOMEM;
 
+	init_completion(&fdl_state->begin);
+	init_completion(&fdl_state->done);
+
 	interrupt->priv = fdl_state;
 
 	return 0;
-- 
cgit v1.2.3


From e92e25f777483b7cc3e170214cc84337d7a415cf Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:10 +0100
Subject: ASoC: SDCA: Add UMP timeout handling for FDL

Several of the UMP transactions in the FDL process should time out if the
device does not respond within a certain time. Add handling to the UMP
helpers and the FDL code for this.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-18-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_fdl.h  |  7 +++++++
 include/sound/sdca_ump.h  |  5 +++++
 sound/soc/sdca/sdca_fdl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++-
 sound/soc/sdca/sdca_ump.c | 15 ++++++++++++++
 4 files changed, 76 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h
index 4ea000d6acef..f4ba809cb203 100644
--- a/include/sound/sdca_fdl.h
+++ b/include/sound/sdca_fdl.h
@@ -11,6 +11,7 @@
 #define __SDCA_FDL_H__
 
 #include <linux/completion.h>
+#include <linux/workqueue.h>
 
 struct device;
 struct regmap;
@@ -23,13 +24,19 @@ struct sdca_interrupt_info;
  * struct fdl_state - FDL state structure to keep data between interrupts
  * @begin: Completion indicating the start of an FDL download cycle.
  * @done: Completion indicating the end of an FDL download cycle.
+ * @timeout: Delayed work used for timing out UMP transactions.
+ * @lock: Mutex to protect between the timeout work and IRQ handlers.
+ * @interrupt: Pointer to the interrupt struct to which this FDL is attached.
  * @set: Pointer to the FDL set currently being downloaded.
  * @file_index: Index of the current file being processed.
  */
 struct fdl_state {
 	struct completion begin;
 	struct completion done;
+	struct delayed_work timeout;
+	struct mutex lock;
 
+	struct sdca_interrupt *interrupt;
 	struct sdca_fdl_set *set;
 	int file_index;
 };
diff --git a/include/sound/sdca_ump.h b/include/sound/sdca_ump.h
index b2363199d19a..f54f9d48c64c 100644
--- a/include/sound/sdca_ump.h
+++ b/include/sound/sdca_ump.h
@@ -15,6 +15,7 @@ struct sdca_control;
 struct sdca_entity;
 struct sdca_function_data;
 struct snd_soc_component;
+struct delayed_work;
 
 int sdca_ump_get_owner_host(struct device *dev,
 			    struct regmap *function_regmap,
@@ -42,4 +43,8 @@ int sdca_ump_write_message(struct device *dev,
 			   unsigned int length_sel,
 			   void *msg, int msg_len);
 
+void sdca_ump_cancel_timeout(struct delayed_work *work);
+void sdca_ump_schedule_timeout(struct delayed_work *work,
+			       unsigned int timeout_us);
+
 #endif // __SDCA_UMP_H__
diff --git a/sound/soc/sdca/sdca_fdl.c b/sound/soc/sdca/sdca_fdl.c
index 39298314f69c..cb79dc3131b8 100644
--- a/sound/soc/sdca/sdca_fdl.c
+++ b/sound/soc/sdca/sdca_fdl.c
@@ -116,7 +116,7 @@ int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
 							   done_timeout);
 			if (!time) {
 				dev_err(dev, "timed out waiting for FDL to complete\n");
-				return -ETIMEDOUT;
+				goto error;
 			}
 		}
 
@@ -125,6 +125,25 @@ int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
 	}
 
 	dev_err(dev, "too many FDL requests\n");
+
+error:
+	for (j = 0; j < SDCA_MAX_INTERRUPTS; j++) {
+		struct sdca_interrupt *interrupt = &info->irqs[j];
+		struct fdl_state *fdl_state;
+
+		if (interrupt->function != function ||
+		    !interrupt->entity || !interrupt->control ||
+		    interrupt->entity->type != SDCA_ENTITY_TYPE_XU ||
+		    interrupt->control->sel != SDCA_CTL_XU_FDL_CURRENTOWNER)
+			continue;
+
+		disable_irq(interrupt->irq);
+
+		fdl_state = interrupt->priv;
+
+		sdca_ump_cancel_timeout(&fdl_state->timeout);
+	}
+
 	return -ETIMEDOUT;
 }
 EXPORT_SYMBOL_NS_GPL(sdca_fdl_sync, "SND_SOC_SDCA");
@@ -302,6 +321,21 @@ static void fdl_end(struct sdca_interrupt *interrupt)
 	dev_dbg(interrupt->dev, "completed FDL process\n");
 }
 
+static void sdca_fdl_timeout_work(struct work_struct *work)
+{
+	struct fdl_state *fdl_state = container_of(work, struct fdl_state,
+						   timeout.work);
+	struct sdca_interrupt *interrupt = fdl_state->interrupt;
+	struct device *dev = interrupt->dev;
+
+	dev_err(dev, "FDL transaction timed out\n");
+
+	guard(mutex)(&fdl_state->lock);
+
+	fdl_end(interrupt);
+	sdca_reset_function(dev, interrupt->function, interrupt->function_regmap);
+}
+
 static int fdl_status_process(struct sdca_interrupt *interrupt, unsigned int status)
 {
 	struct fdl_state *fdl_state = interrupt->priv;
@@ -364,15 +398,20 @@ int sdca_fdl_process(struct sdca_interrupt *interrupt)
 {
 	struct device *dev = interrupt->dev;
 	struct sdca_entity_xu *xu = &interrupt->entity->xu;
+	struct fdl_state *fdl_state = interrupt->priv;
 	unsigned int reg, status;
 	int response, ret;
 
+	guard(mutex)(&fdl_state->lock);
+
 	ret = sdca_ump_get_owner_host(dev, interrupt->function_regmap,
 				      interrupt->function, interrupt->entity,
 				      interrupt->control);
 	if (ret)
 		goto reset_function;
 
+	sdca_ump_cancel_timeout(&fdl_state->timeout);
+
 	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
 			   SDCA_CTL_XU_FDL_STATUS, 0);
 	ret = regmap_read(interrupt->function_regmap, reg, &status);
@@ -415,7 +454,13 @@ int sdca_fdl_process(struct sdca_interrupt *interrupt)
 		case SDCA_XU_RESET_FUNCTION:
 			goto reset_function;
 		}
+	case SDCA_CTL_XU_FDLH_COMPLETE:
+		if (status & SDCA_CTL_XU_FDLD_REQ_ABORT ||
+		    status == SDCA_CTL_XU_FDLD_COMPLETE)
+			return 0;
+		fallthrough;
 	default:
+		sdca_ump_schedule_timeout(&fdl_state->timeout, xu->max_delay);
 		return 0;
 	}
 
@@ -441,8 +486,11 @@ int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt)
 	if (!fdl_state)
 		return -ENOMEM;
 
+	INIT_DELAYED_WORK(&fdl_state->timeout, sdca_fdl_timeout_work);
 	init_completion(&fdl_state->begin);
 	init_completion(&fdl_state->done);
+	mutex_init(&fdl_state->lock);
+	fdl_state->interrupt = interrupt;
 
 	interrupt->priv = fdl_state;
 
diff --git a/sound/soc/sdca/sdca_ump.c b/sound/soc/sdca/sdca_ump.c
index 5dcad2f7ea05..8aba3ff16872 100644
--- a/sound/soc/sdca/sdca_ump.c
+++ b/sound/soc/sdca/sdca_ump.c
@@ -245,3 +245,18 @@ int sdca_ump_write_message(struct device *dev,
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(sdca_ump_write_message, "SND_SOC_SDCA");
+
+void sdca_ump_cancel_timeout(struct delayed_work *work)
+{
+	cancel_delayed_work_sync(work);
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_cancel_timeout, "SND_SOC_SDCA");
+
+void sdca_ump_schedule_timeout(struct delayed_work *work, unsigned int timeout_us)
+{
+	if (!timeout_us)
+		return;
+
+	queue_delayed_work(system_wq, work, usecs_to_jiffies(timeout_us));
+}
+EXPORT_SYMBOL_NS_GPL(sdca_ump_schedule_timeout, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From 12aa3160c10a3179c73c4f99a2d5aec0fd907d0c Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:11 +0100
Subject: ASoC: SDCA: Add early IRQ handling

Some IRQs (FDL) require processing before the primary soundcard is
brought up, as the downloaded files could be firmware required for
operation of the audio functions of the device. Add a new helper
function which registers the required IRQs.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-19-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  |  3 ++
 sound/soc/sdca/sdca_interrupts.c | 71 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
index 3983f515349a..8f13417d129a 100644
--- a/include/sound/sdca_interrupts.h
+++ b/include/sound/sdca_interrupts.h
@@ -75,6 +75,9 @@ int sdca_irq_data_populate(struct device *dev, struct regmap *function_regmap,
 			   struct sdca_entity *entity,
 			   struct sdca_control *control,
 			   struct sdca_interrupt *interrupt);
+int sdca_irq_populate_early(struct device *dev, struct regmap *function_regmap,
+			    struct sdca_function_data *function,
+			    struct sdca_interrupt_info *info);
 int sdca_irq_populate(struct sdca_function_data *function,
 		      struct snd_soc_component *component,
 		      struct sdca_interrupt_info *info);
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 3a3b966b5782..51342b8aacae 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -396,6 +396,77 @@ static struct sdca_interrupt *get_interrupt_data(struct device *dev, int irq,
 	return &info->irqs[irq];
 }
 
+/**
+ * sdca_irq_populate_early - process pre-audio card IRQ registrations
+ * @dev: Device pointer for SDCA Function.
+ * @regmap: Regmap pointer for the SDCA Function.
+ * @function: Pointer to the SDCA Function.
+ * @info: Pointer to the SDCA interrupt info for this device.
+ *
+ * This is intended to be used as part of the Function boot process. It
+ * can be called before the soundcard is registered (ie. doesn't depend
+ * on component) and will register the FDL interrupts.
+ *
+ * Return: Zero on success, and a negative error code on failure.
+ */
+int sdca_irq_populate_early(struct device *dev, struct regmap *regmap,
+			    struct sdca_function_data *function,
+			    struct sdca_interrupt_info *info)
+{
+	int i, j;
+
+	guard(mutex)(&info->irq_lock);
+
+	for (i = 0; i < function->num_entities; i++) {
+		struct sdca_entity *entity = &function->entities[i];
+
+		for (j = 0; j < entity->num_controls; j++) {
+			struct sdca_control *control = &entity->controls[j];
+			int irq = control->interrupt_position;
+			struct sdca_interrupt *interrupt;
+			int ret;
+
+			interrupt = get_interrupt_data(dev, irq, info);
+			if (IS_ERR(interrupt))
+				return PTR_ERR(interrupt);
+			else if (!interrupt)
+				continue;
+
+			switch (entity->type) {
+			case SDCA_ENTITY_TYPE_XU:
+				if (control->sel != SDCA_CTL_XU_FDL_CURRENTOWNER)
+					break;
+
+				ret = sdca_irq_data_populate(dev, regmap, NULL,
+							     function, entity,
+							     control, interrupt);
+				if (ret)
+					return ret;
+
+				ret = sdca_fdl_alloc_state(interrupt);
+				if (ret)
+					return ret;
+
+				ret = sdca_irq_request_locked(dev, info, irq,
+							      interrupt->name,
+							      fdl_owner_handler,
+							      interrupt);
+				if (ret) {
+					dev_err(dev, "failed to request irq %s: %d\n",
+						interrupt->name, ret);
+					return ret;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_irq_populate_early, "SND_SOC_SDCA");
+
 /**
  * sdca_irq_populate - Request all the individual IRQs for an SDCA Function
  * @function: Pointer to the SDCA Function.
-- 
cgit v1.2.3


From ef042df96d0e1089764f39ede61bc8f140a4be00 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 20 Oct 2025 16:55:12 +0100
Subject: ASoC: SDCA: Add HID button IRQ

Now full support for the UMP buffers is available, it is possible to
read the SDCA HID descriptors from the device and pass them to
user-space. Add a helper function to process HID events from an SDCA
device.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20251020155512.353774-20-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_hid.h         | 13 ++++++++++--
 sound/soc/sdca/sdca_hid.c        | 46 ++++++++++++++++++++++++++++++++++++++++
 sound/soc/sdca/sdca_interrupts.c | 28 ++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_hid.h b/include/sound/sdca_hid.h
index 3a155835e035..18bebbe428c9 100644
--- a/include/sound/sdca_hid.h
+++ b/include/sound/sdca_hid.h
@@ -8,13 +8,17 @@
 #ifndef __SDCA_HID_H__
 #define __SDCA_HID_H__
 
-#include <linux/types.h>
-#include <linux/hid.h>
+struct device;
+struct sdw_slave;
+
+struct sdca_entity;
+struct sdca_interrupt;
 
 #if IS_ENABLED(CONFIG_SND_SOC_SDCA_HID)
 
 int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
 			struct sdca_entity *entity);
+int sdca_hid_process_report(struct sdca_interrupt *interrupt);
 
 #else
 
@@ -24,6 +28,11 @@ static inline int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
 	return 0;
 }
 
+static inline int sdca_hid_process_report(struct sdca_interrupt *interrupt)
+{
+	return 0;
+}
+
 #endif
 
 #endif /* __SDCA_HID_H__ */
diff --git a/sound/soc/sdca/sdca_hid.c b/sound/soc/sdca/sdca_hid.c
index 53dad1a524d4..ad53207b0d62 100644
--- a/sound/soc/sdca/sdca_hid.c
+++ b/sound/soc/sdca/sdca_hid.c
@@ -10,6 +10,7 @@
 #include <linux/cleanup.h>
 #include <linux/device.h>
 #include <linux/dev_printk.h>
+#include <linux/hid.h>
 #include <linux/module.h>
 #include <linux/property.h>
 #include <linux/soundwire/sdw.h>
@@ -17,6 +18,8 @@
 #include <sound/sdca.h>
 #include <sound/sdca_function.h>
 #include <sound/sdca_hid.h>
+#include <sound/sdca_interrupts.h>
+#include <sound/sdca_ump.h>
 
 static int sdwhid_parse(struct hid_device *hid)
 {
@@ -121,5 +124,48 @@ int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw,
 }
 EXPORT_SYMBOL_NS(sdca_add_hid_device, "SND_SOC_SDCA");
 
+/**
+ * sdca_hid_process_report - read a HID event from the device and report
+ * @interrupt: Pointer to the SDCA interrupt information structure.
+ *
+ * Return: Zero on success, and a negative error code on failure.
+ */
+int sdca_hid_process_report(struct sdca_interrupt *interrupt)
+{
+	struct device *dev = interrupt->dev;
+	struct hid_device *hid = interrupt->entity->hide.hid;
+	void *val __free(kfree) = NULL;
+	int len, ret;
+
+	ret = sdca_ump_get_owner_host(dev, interrupt->function_regmap,
+				      interrupt->function, interrupt->entity,
+				      interrupt->control);
+	if (ret)
+		return ret;
+
+	len = sdca_ump_read_message(dev, interrupt->device_regmap,
+				    interrupt->function_regmap,
+				    interrupt->function, interrupt->entity,
+				    SDCA_CTL_HIDE_HIDTX_MESSAGEOFFSET,
+				    SDCA_CTL_HIDE_HIDTX_MESSAGELENGTH, &val);
+	if (len < 0)
+		return len;
+
+	ret = sdca_ump_set_owner_device(dev, interrupt->function_regmap,
+					interrupt->function, interrupt->entity,
+					interrupt->control);
+	if (ret)
+		return ret;
+
+	ret = hid_input_report(hid, HID_INPUT_REPORT, val, len, true);
+	if (ret < 0) {
+		dev_err(dev, "failed to report hid event: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_hid_process_report, "SND_SOC_SDCA");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("SDCA HID library");
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 51342b8aacae..5176460416bb 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -20,6 +20,7 @@
 #include <sound/sdca.h>
 #include <sound/sdca_fdl.h>
 #include <sound/sdca_function.h>
+#include <sound/sdca_hid.h>
 #include <sound/sdca_interrupts.h>
 #include <sound/sdca_ump.h>
 #include <sound/soc-component.h>
@@ -247,6 +248,29 @@ error:
 	return irqret;
 }
 
+static irqreturn_t hid_handler(int irq, void *data)
+{
+	struct sdca_interrupt *interrupt = data;
+	struct device *dev = interrupt->dev;
+	irqreturn_t irqret = IRQ_NONE;
+	int ret;
+
+	ret = pm_runtime_get_sync(dev);
+	if (ret < 0) {
+		dev_err(dev, "failed to resume for hid: %d\n", ret);
+		goto error;
+	}
+
+	ret = sdca_hid_process_report(interrupt);
+	if (ret)
+		goto error;
+
+	irqret = IRQ_HANDLED;
+error:
+	pm_runtime_put(dev);
+	return irqret;
+}
+
 static irqreturn_t fdl_owner_handler(int irq, void *data)
 {
 	struct sdca_interrupt *interrupt = data;
@@ -528,6 +552,10 @@ int sdca_irq_populate(struct sdca_function_data *function,
 					handler = fdl_owner_handler;
 				}
 				break;
+			case SDCA_ENTITY_TYPE_HIDE:
+				if (control->sel == SDCA_CTL_HIDE_HIDTX_CURRENTOWNER)
+					handler = hid_handler;
+				break;
 			default:
 				break;
 			}
-- 
cgit v1.2.3


From 8d748955279cfe1996e51ac51a4f746468614a10 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Mon, 27 Oct 2025 08:18:14 +0800
Subject: asm-generic: percpu: Add assembly guard

Currently, asm/percpu.h is directly or indirectly included by
some assembly files on x86. Some of them (e.g., checksum_32.S)
are also used on um. But x86 and um provide different versions
of asm/percpu.h -- um uses asm-generic/percpu.h directly.

When SMP is enabled, asm-generic/percpu.h will introduce C code
that cannot be assembled. Since asm-generic/percpu.h currently
is not designed for use in assembly, and these assembly files
do not actually need asm/percpu.h on um, let's add the assembly
guard in asm-generic/percpu.h to fix this issue.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arch@vger.kernel.org
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251027001815.1666872-8-tiwei.bie@linux.dev
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/asm-generic/percpu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 02aeca21479a..6628670bcb90 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_GENERIC_PERCPU_H_
 #define _ASM_GENERIC_PERCPU_H_
 
+#ifndef __ASSEMBLER__
+
 #include <linux/compiler.h>
 #include <linux/threads.h>
 #include <linux/percpu-defs.h>
@@ -557,4 +559,5 @@ do {									\
 	this_cpu_generic_cmpxchg(pcp, oval, nval)
 #endif
 
+#endif /* __ASSEMBLER__ */
 #endif /* _ASM_GENERIC_PERCPU_H_ */
-- 
cgit v1.2.3


From 87b0031f7f73dac2ebb874fc8f331a66ee3b5cbd Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:18 +0100
Subject: irqdomain: Add firmware info reporting interface

Add an irqdomain callback to report firmware-provided information that is
otherwise not available in a generic way. This is reported using a new data
structure (struct irq_fwspec_info).

This callback is optional and the only information that can be reported
currently is the affinity of an interrupt. However, the containing
structure is designed to be extensible, allowing other potentially relevant
information to be reported in the future.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-2-maz@kernel.org
---
 include/linux/irqdomain.h | 27 +++++++++++++++++++++++++++
 kernel/irq/irqdomain.c    | 32 +++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 4a86e6b915dd..9d6a5e99394f 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -44,6 +44,23 @@ struct irq_fwspec {
 	u32			param[IRQ_DOMAIN_IRQ_SPEC_PARAMS];
 };
 
+/**
+ * struct irq_fwspec_info - firmware provided IRQ information structure
+ *
+ * @flags:		Information validity flags
+ * @affinity:		Affinity mask for this interrupt
+ *
+ * This structure reports firmware-specific information about an
+ * interrupt. The only significant information is the affinity of a
+ * per-CPU interrupt, but this is designed to be extended as required.
+ */
+struct irq_fwspec_info {
+	unsigned long		flags;
+	const struct cpumask	*affinity;
+};
+
+#define IRQ_FWSPEC_INFO_AFFINITY_VALID	BIT(0)
+
 /* Conversion function from of_phandle_args fields to fwspec  */
 void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
 			       unsigned int count, struct irq_fwspec *fwspec);
@@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
  * @translate:	Given @fwspec, decode the hardware irq number (@out_hwirq) and
  *		linux irq type value (@out_type). This is a generalised @xlate
  *		(over struct irq_fwspec) and is preferred if provided.
+ * @get_fwspec_info:
+ *		Given @fwspec, report additional firmware-provided information in
+ *		@info. Optional.
  * @debug_show:	For domains to show specific data for an interrupt in debugfs.
  *
  * Functions below are provided by the driver and called whenever a new mapping
@@ -96,6 +116,7 @@ struct irq_domain_ops {
 	void	(*deactivate)(struct irq_domain *d, struct irq_data *irq_data);
 	int	(*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
 			     unsigned long *out_hwirq, unsigned int *out_type);
+	int	(*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info);
 #endif
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
 	void	(*debug_show)(struct seq_file *m, struct irq_domain *d,
@@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas
 
 int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq);
 
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info);
+
 static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
 	return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY;
@@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain)
 	return false;
 }
 
+static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+	return -EINVAL;
+}
 #endif	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
 #ifdef CONFIG_GENERIC_MSI_IRQ
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dc473faadcc8..2652c4cfd877 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
 }
 EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
 
-unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec)
 {
 	struct irq_domain *domain;
-	struct irq_data *irq_data;
-	irq_hw_number_t hwirq;
-	unsigned int type = IRQ_TYPE_NONE;
-	int virq;
 
 	if (fwspec->fwnode) {
 		domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
@@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 		domain = irq_default_domain;
 	}
 
+	return domain;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+	struct irq_domain *domain = fwspec_to_domain(fwspec);
+
+	memset(info, 0, sizeof(*info));
+
+	if (!domain || !domain->ops->get_fwspec_info)
+		return 0;
+
+	return domain->ops->get_fwspec_info(fwspec, info);
+}
+#endif
+
+unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+{
+	unsigned int type = IRQ_TYPE_NONE;
+	struct irq_domain *domain;
+	struct irq_data *irq_data;
+	irq_hw_number_t hwirq;
+	int virq;
+
+	domain = fwspec_to_domain(fwspec);
 	if (!domain) {
 		pr_warn("no irq domain found for %s !\n",
 			of_node_full_name(to_of_node(fwspec->fwnode)));
-- 
cgit v1.2.3


From 5324fe21ba9b77b299c02191645a97777cdd73ac Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:19 +0100
Subject: ACPI: irq: Add interrupt affinity reporting interface

Plug the irq_populate_fwspec_info() helper into the ACPI layer to offer an
interrupt affinity reporting function. This is currently only supported for
the CONFIG_ACPI_GENERIC_GSI configurations, but could later be extended to
legacy architectures if necessary.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-3-maz@kernel.org
---
 drivers/acpi/irq.c   | 19 +++++++++++++++++++
 include/linux/acpi.h |  7 +++++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index 76a856c32c4d..d1595156c86a 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -300,6 +300,25 @@ int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
 }
 EXPORT_SYMBOL_GPL(acpi_irq_get);
 
+const struct cpumask *acpi_irq_get_affinity(acpi_handle handle,
+					    unsigned int index)
+{
+	struct irq_fwspec_info info;
+	struct irq_fwspec fwspec;
+	unsigned long flags;
+
+	if (acpi_irq_parse_one(handle, index, &fwspec, &flags))
+		return NULL;
+
+	if (irq_populate_fwspec_info(&fwspec, &info))
+		return NULL;
+
+	if (!(info.flags & IRQ_FWSPEC_INFO_AFFINITY_VALID))
+		return NULL;
+
+	return info.affinity;
+}
+
 /**
  * acpi_set_irq_model - Setup the GSI irqdomain information
  * @model: the value assigned to acpi_irq_model
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..607db773b672 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1509,12 +1509,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console)
 
 #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
 int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res);
+const struct cpumask *acpi_irq_get_affinity(acpi_handle handle,
+					    unsigned int index);
 #else
 static inline
 int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
 {
 	return -EINVAL;
 }
+static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle,
+							  unsigned int index)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_ACPI_LPIT
-- 
cgit v1.2.3


From 5404f5c06dd41fd4445a01dec77a629e254a62e8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:20 +0100
Subject: of/irq: Add interrupt affinity reporting interface

Plug the irq_populate_fwspec_info() helper into the OF layer to offer an
interrupt affinity reporting function.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-4-maz@kernel.org
---
 drivers/of/irq.c       | 20 ++++++++++++++++++++
 include/linux/of_irq.h |  7 +++++++
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 65c3c23255b7..168fde921bd2 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -479,6 +479,26 @@ out:
 }
 EXPORT_SYMBOL_GPL(of_irq_get);
 
+const struct cpumask *of_irq_get_affinity(struct device_node *dev, int index)
+{
+	struct of_phandle_args oirq;
+	struct irq_fwspec_info info;
+	struct irq_fwspec fwspec;
+	int rc;
+
+	rc = of_irq_parse_one(dev, index, &oirq);
+	if (rc)
+		return NULL;
+
+	of_phandle_args_to_fwspec(oirq.np, oirq.args, oirq.args_count,
+				  &fwspec);
+
+	if (irq_populate_fwspec_info(&fwspec, &info))
+		return NULL;
+
+	return info.affinity;
+}
+
 /**
  * of_irq_get_byname - Decode a node's IRQ and return it as a Linux IRQ number
  * @dev: pointer to device tree node
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 1db8543dfc8a..1c2bc0281807 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index,
 			  struct of_phandle_args *out_irq);
 extern int of_irq_count(struct device_node *dev);
 extern int of_irq_get(struct device_node *dev, int index);
+extern const struct cpumask *of_irq_get_affinity(struct device_node *dev,
+						      int index);
 extern int of_irq_get_byname(struct device_node *dev, const char *name);
 extern int of_irq_to_resource_table(struct device_node *dev,
 		struct resource *res, int nr_irqs);
@@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name)
 {
 	return 0;
 }
+static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev,
+							int index)
+{
+	return NULL;
+}
 static inline int of_irq_to_resource_table(struct device_node *dev,
 					   struct resource *res, int nr_irqs)
 {
-- 
cgit v1.2.3


From 0d5daa938c94b8b9183e9b257a88dc0929d59409 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:21 +0100
Subject: platform: Add firmware-agnostic irq and affinity retrieval interface

Expand platform_get_irq_optional() to also return an affinity if available,
renaming it to platform_get_irq_affinity() in the process.

platform_get_irq_optional() is preserved with its current semantics by
calling into the new helper with a NULL affinity pointer.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-5-maz@kernel.org
---
 drivers/base/platform.c         | 71 ++++++++++++++++++++++++++++++++---------
 include/linux/platform_device.h |  2 ++
 2 files changed, 58 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 09450349cf32..b45d41b018ca 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -150,25 +150,37 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev,
 EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname);
 #endif /* CONFIG_HAS_IOMEM */
 
+static const struct cpumask *get_irq_affinity(struct platform_device *dev,
+					      unsigned int num)
+{
+	const struct cpumask *mask = NULL;
+#ifndef CONFIG_SPARC
+	struct fwnode_handle *fwnode = dev_fwnode(&dev->dev);
+
+	if (is_of_node(fwnode))
+		mask = of_irq_get_affinity(to_of_node(fwnode), num);
+	else if (is_acpi_device_node(fwnode))
+		mask = acpi_irq_get_affinity(ACPI_HANDLE_FWNODE(fwnode), num);
+#endif
+
+	return mask ?: cpu_possible_mask;
+}
+
 /**
- * platform_get_irq_optional - get an optional IRQ for a device
- * @dev: platform device
- * @num: IRQ number index
+ * platform_get_irq_affinity - get an optional IRQ and its affinity for a device
+ * @dev:	platform device
+ * @num:	interrupt number index
+ * @affinity:	optional cpumask pointer to get the affinity of a per-cpu interrupt
  *
- * Gets an IRQ for a platform device. Device drivers should check the return
- * value for errors so as to not pass a negative integer value to the
- * request_irq() APIs. This is the same as platform_get_irq(), except that it
- * does not print an error message if an IRQ can not be obtained.
- *
- * For example::
- *
- *		int irq = platform_get_irq_optional(pdev, 0);
- *		if (irq < 0)
- *			return irq;
+ * Gets an interrupt for a platform device. Device drivers should check the
+ * return value for errors so as to not pass a negative integer value to
+ * the request_irq() APIs. If @affinity is non-NULL, it is set on success to
+ * the firmware-provided affinity, or to cpu_possible_mask if there is none.
  *
- * Return: non-zero IRQ number on success, negative error number on failure.
+ * Return: non-zero interrupt number on success, negative error number on failure.
  */
-int platform_get_irq_optional(struct platform_device *dev, unsigned int num)
+int platform_get_irq_affinity(struct platform_device *dev, unsigned int num,
+			      const struct cpumask **affinity)
 {
 	int ret;
 #ifdef CONFIG_SPARC
@@ -236,8 +248,37 @@ out_not_found:
 out:
 	if (WARN(!ret, "0 is an invalid IRQ number\n"))
 		return -EINVAL;
+
+	if (ret > 0 && affinity)
+		*affinity = get_irq_affinity(dev, num);
+
 	return ret;
 }
+EXPORT_SYMBOL_GPL(platform_get_irq_affinity);
+
+/**
+ * platform_get_irq_optional - get an optional interrupt for a device
+ * @dev:	platform device
+ * @num:	interrupt number index
+ *
+ * Gets an interrupt for a platform device. Device drivers should check the
+ * return value for errors so as to not pass a negative integer value to
+ * the request_irq() APIs. This is the same as platform_get_irq(), except
+ * that it does not print an error message if an interrupt can not be
+ * obtained.
+ *
+ * For example::
+ *
+ *		int irq = platform_get_irq_optional(pdev, 0);
+ *		if (irq < 0)
+ *			return irq;
+ *
+ * Return: non-zero interrupt number on success, negative error number on failure.
+ */
+int platform_get_irq_optional(struct platform_device *dev, unsigned int num)
+{
+	return platform_get_irq_affinity(dev, num, NULL);
+}
 EXPORT_SYMBOL_GPL(platform_get_irq_optional);
 
 /**
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 074754c23d33..ad66333ce85c 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev,
 
 extern int platform_get_irq(struct platform_device *, unsigned int);
 extern int platform_get_irq_optional(struct platform_device *, unsigned int);
+extern int platform_get_irq_affinity(struct platform_device *, unsigned int,
+				     const struct cpumask **);
 extern int platform_irq_count(struct platform_device *);
 extern int devm_platform_get_irqs_affinity(struct platform_device *dev,
 					   struct irq_affinity *affd,
-- 
cgit v1.2.3


From 5ff78c8de9d83ad6fc0553bf8f2edc816385837d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:28 +0100
Subject: genirq: Kill handle_percpu_devid_fasteoi_nmi()

There is no in-tree user of this flow handler anymore, so simply remove it.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-12-maz@kernel.org
---
 include/linux/irq.h |  1 -
 kernel/irq/chip.c   | 25 -------------------------
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c67e76fbcc07..b728c18f6ded 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc);
 extern void handle_nested_irq(unsigned int irq);
 
 extern void handle_fasteoi_nmi(struct irq_desc *desc);
-extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc);
 
 extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
 extern int irq_chip_pm_get(struct irq_data *data);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3ffa0d80ddd1..633e1f67bb6f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -929,31 +929,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
 		chip->irq_eoi(&desc->irq_data);
 }
 
-/**
- * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
- *				     dev ids
- * @desc:	the interrupt description structure for this irq
- *
- * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
- * as a percpu pointer.
- */
-void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
-{
-	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct irqaction *action = desc->action;
-	unsigned int irq = irq_desc_get_irq(desc);
-	irqreturn_t res;
-
-	__kstat_incr_irqs_this_cpu(desc);
-
-	trace_irq_handler_entry(irq, action);
-	res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
-	trace_irq_handler_exit(irq, action, res);
-
-	if (chip->irq_eoi)
-		chip->irq_eoi(&desc->irq_data);
-}
-
 static void
 __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		     int is_chained, const char *name)
-- 
cgit v1.2.3


From 5c2b2cc472e015e79c4f0170893a1e0883bd3bb4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:29 +0100
Subject: genirq: Merge irqaction::{dev_id,percpu_dev_id}

When irqaction::percpu_dev_id was introduced, it was hoped that it could be
part of an anonymous union with dev_id, as the two fields are mutually
exclusive.

However, toolchains used at the time were often showing terrible support
for anonymous unions, breaking the build on a number of architectures. It
was therefore decided to keep the two fields separate and address this down
the line.

14 years later, the compiler dark age is over, and there is universal
support for anonymous unions. Get a whole pointer back that can immediately
be spent on something else.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-13-maz@kernel.org
---
 include/linux/interrupt.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 51b6484c0493..0ec1a71ab4e8 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -121,8 +121,10 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  */
 struct irqaction {
 	irq_handler_t		handler;
-	void			*dev_id;
-	void __percpu		*percpu_dev_id;
+	union {
+		void		*dev_id;
+		void __percpu	*percpu_dev_id;
+	};
 	struct irqaction	*next;
 	irq_handler_t		thread_fn;
 	struct task_struct	*thread;
-- 
cgit v1.2.3


From 258e7d28a3dcd389239f9688058140c1a418b549 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:31 +0100
Subject: genirq: Add affinity to percpu_devid interrupt requests

Add an affinity field to both the irqaction structure and the interrupt
request primitives. Nothing is making use of it yet, and the only value
used is NULL, which is used as a shorthand for cpu_possible_mask.

This will shortly get used with actual affinities.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-15-maz@kernel.org
---
 include/linux/interrupt.h |  5 +++--
 kernel/irq/manage.c       | 14 ++++++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0ec1a71ab4e8..52147d5f432b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -125,6 +125,7 @@ struct irqaction {
 		void		*dev_id;
 		void __percpu	*percpu_dev_id;
 	};
+	const struct cpumask	*affinity;
 	struct irqaction	*next;
 	irq_handler_t		thread_fn;
 	struct task_struct	*thread;
@@ -181,7 +182,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler,
 extern int __must_check
 __request_percpu_irq(unsigned int irq, irq_handler_t handler,
 		     unsigned long flags, const char *devname,
-		     void __percpu *percpu_dev_id);
+		     const cpumask_t *affinity, void __percpu *percpu_dev_id);
 
 extern int __must_check
 request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags,
@@ -192,7 +193,7 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler,
 		   const char *devname, void __percpu *percpu_dev_id)
 {
 	return __request_percpu_irq(irq, handler, 0,
-				    devname, percpu_dev_id);
+				    devname, NULL, percpu_dev_id);
 }
 
 extern int __must_check
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d9ddc30678b5..5f4c65167743 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2444,10 +2444,14 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
 
 static
 struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags,
-					  const char *devname, void __percpu *dev_id)
+					  const char *devname, const cpumask_t *affinity,
+					  void __percpu *dev_id)
 {
 	struct irqaction *action;
 
+	if (!affinity)
+		affinity = cpu_possible_mask;
+
 	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
 		return NULL;
@@ -2456,6 +2460,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
 	action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
 	action->name = devname;
 	action->percpu_dev_id = dev_id;
+	action->affinity = affinity;
 
 	return action;
 }
@@ -2466,6 +2471,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
  * @handler:	Function to be called when the IRQ occurs.
  * @flags:	Interrupt type flags (IRQF_TIMER only)
  * @devname:	An ascii name for the claiming device
+ * @affinity:	A cpumask describing the target CPUs for this interrupt
  * @dev_id:	A percpu cookie passed back to the handler function
  *
  * This call allocates interrupt resources, but doesn't enable the interrupt
@@ -2478,7 +2484,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f
  */
 int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
 			 unsigned long flags, const char *devname,
-			 void __percpu *dev_id)
+			 const cpumask_t *affinity, void __percpu *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -2495,7 +2501,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	if (flags && flags != IRQF_TIMER)
 		return -EINVAL;
 
-	action = create_percpu_irqaction(handler, flags, devname, dev_id);
+	action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id);
 	if (!action)
 		return -ENOMEM;
 
@@ -2560,7 +2566,7 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
 		return -EINVAL;
 
 	action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING,
-					 name, dev_id);
+					 name, NULL, dev_id);
 	if (!action)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From b9c6aa9efc71dae656f9f913d1250ea08cd6e10f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:32 +0100
Subject: genirq: Update request_percpu_nmi() to take an affinity

Continue spreading the notion of affinity to the per CPU interrupt request
code by updating the call sites that use request_percpu_nmi() (all two of
them) to take an affinity pointer. This pointer is firmly NULL for now.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-16-maz@kernel.org
---
 arch/arm64/kernel/smp.c   |  2 +-
 drivers/perf/arm_pmu.c    |  2 +-
 include/linux/interrupt.h |  4 ++--
 kernel/irq/manage.c       | 12 +++++++-----
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 68cea3a4a35c..6fb838eee2e7 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1094,7 +1094,7 @@ static void ipi_setup_sgi(int ipi)
 	irq = ipi_irq_base + ipi;
 
 	if (ipi_should_be_nmi(ipi)) {
-		err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat);
+		err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat);
 		WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err);
 	} else {
 		err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat);
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 5c310e803dd7..22c601b46c85 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -659,7 +659,7 @@ int armpmu_request_irq(int irq, int cpu)
 			irq_ops = &pmunmi_ops;
 		}
 	} else if (armpmu_count_irq_users(irq) == 0) {
-		err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu);
+		err = request_percpu_nmi(irq, handler, "arm-pmu", NULL, &cpu_armpmu);
 
 		/* If cannot get an NMI, get a normal interrupt */
 		if (err) {
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 52147d5f432b..81506ab759b8 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -197,8 +197,8 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler,
 }
 
 extern int __must_check
-request_percpu_nmi(unsigned int irq, irq_handler_t handler,
-		   const char *devname, void __percpu *dev);
+request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+		   const struct cpumask *affinity, void __percpu *dev_id);
 
 extern const void *free_irq(unsigned int, void *);
 extern void free_percpu_irq(unsigned int, void __percpu *);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f4c65167743..b1a3140e5f3c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2527,6 +2527,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq);
  * @irq:	Interrupt line to allocate
  * @handler:	Function to be called when the IRQ occurs.
  * @name:	An ascii name for the claiming device
+ * @affinity:	A cpumask describing the target CPUs for this interrupt
  * @dev_id:	A percpu cookie passed back to the handler function
  *
  * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
@@ -2543,8 +2544,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq);
  * If the interrupt line cannot be used to deliver NMIs, function
  * will fail returning a negative value.
  */
-int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
-		       const char *name, void __percpu *dev_id)
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+		       const struct cpumask *affinity, void __percpu *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -2561,12 +2562,13 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
 	    !irq_supports_nmi(desc))
 		return -EINVAL;
 
-	/* The line cannot already be NMI */
-	if (irq_is_nmi(desc))
+	/* The line cannot be NMI already if the new request covers all CPUs */
+	if (irq_is_nmi(desc) &&
+	    (!affinity || cpumask_equal(affinity, cpu_possible_mask)))
 		return -EINVAL;
 
 	action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING,
-					 name, NULL, dev_id);
+					 name, affinity, dev_id);
 	if (!action)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From c734af3b2b95f0ac6ed87c50e7602a6beeaf534f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:34 +0100
Subject: genirq: Add request_percpu_irq_affinity() helper

While it would be nice to simply make request_percpu_irq() take an affinity
mask, the churn is likely to be on the irritating side given that most
drivers do not give a damn about affinities.

So take the more innocuous path to provide a helper that parallels
request_percpu_irq(), with an affinity as a bonus argument.

Yes, request_percpu_irq_affinity() is a bit of a mouthful.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-18-maz@kernel.org
---
 include/linux/interrupt.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 81506ab759b8..fa62ab556ee3 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -196,6 +196,15 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler,
 				    devname, NULL, percpu_dev_id);
 }
 
+static inline int __must_check
+request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler,
+			    const char *devname, const cpumask_t *affinity,
+			    void __percpu *percpu_dev_id)
+{
+	return __request_percpu_irq(irq, handler, 0,
+				    devname, affinity, percpu_dev_id);
+}
+
 extern int __must_check
 request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
 		   const struct cpumask *affinity, void __percpu *dev_id);
-- 
cgit v1.2.3


From 54b350fa8e965dc59622698e2a18d6bf73944bf4 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 20 Oct 2025 13:29:35 +0100
Subject: perf: arm_pmu: Request specific affinities for per CPU
 NMIs/interrupts

Let the PMU driver request both NMIs and normal interrupts with an affinity mask
matching the PMU affinity.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-19-maz@kernel.org
---
 drivers/perf/arm_pmu.c          | 44 ++++++++++++++++++++++++-----------------
 drivers/perf/arm_pmu_acpi.c     |  2 +-
 drivers/perf/arm_pmu_platform.c |  4 ++--
 include/linux/perf/arm_pmu.h    |  4 ++--
 4 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 22c601b46c85..959ceb3d1f55 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -26,7 +26,8 @@
 
 #include <asm/irq_regs.h>
 
-static int armpmu_count_irq_users(const int irq);
+static int armpmu_count_irq_users(const struct cpumask *affinity,
+				  const int irq);
 
 struct pmu_irq_ops {
 	void (*enable_pmuirq)(unsigned int irq);
@@ -64,7 +65,9 @@ static void armpmu_enable_percpu_pmuirq(unsigned int irq)
 static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu,
 				   void __percpu *devid)
 {
-	if (armpmu_count_irq_users(irq) == 1)
+	struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu);
+
+	if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1)
 		free_percpu_irq(irq, devid);
 }
 
@@ -89,7 +92,9 @@ static void armpmu_disable_percpu_pmunmi(unsigned int irq)
 static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu,
 				      void __percpu *devid)
 {
-	if (armpmu_count_irq_users(irq) == 1)
+	struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu);
+
+	if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1)
 		free_percpu_nmi(irq, devid);
 }
 
@@ -580,11 +585,11 @@ static const struct attribute_group armpmu_common_attr_group = {
 	.attrs = armpmu_common_attrs,
 };
 
-static int armpmu_count_irq_users(const int irq)
+static int armpmu_count_irq_users(const struct cpumask *affinity, const int irq)
 {
 	int cpu, count = 0;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, affinity) {
 		if (per_cpu(cpu_irq, cpu) == irq)
 			count++;
 	}
@@ -592,12 +597,13 @@ static int armpmu_count_irq_users(const int irq)
 	return count;
 }
 
-static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq)
+static const struct pmu_irq_ops *
+armpmu_find_irq_ops(const struct cpumask *affinity, int irq)
 {
 	const struct pmu_irq_ops *ops = NULL;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, affinity) {
 		if (per_cpu(cpu_irq, cpu) != irq)
 			continue;
 
@@ -609,22 +615,25 @@ static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq)
 	return ops;
 }
 
-void armpmu_free_irq(int irq, int cpu)
+void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu)
 {
 	if (per_cpu(cpu_irq, cpu) == 0)
 		return;
 	if (WARN_ON(irq != per_cpu(cpu_irq, cpu)))
 		return;
 
-	per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu);
+	per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, armpmu);
 
 	per_cpu(cpu_irq, cpu) = 0;
 	per_cpu(cpu_irq_ops, cpu) = NULL;
 }
 
-int armpmu_request_irq(int irq, int cpu)
+int armpmu_request_irq(struct arm_pmu * __percpu *pcpu_armpmu, int irq, int cpu)
 {
 	int err = 0;
+	struct arm_pmu **armpmu = per_cpu_ptr(pcpu_armpmu, cpu);
+	const struct cpumask *affinity = *armpmu ? &(*armpmu)->supported_cpus :
+						   cpu_possible_mask; /* ACPI */
 	const irq_handler_t handler = armpmu_dispatch_irq;
 	const struct pmu_irq_ops *irq_ops;
 
@@ -646,25 +655,24 @@ int armpmu_request_irq(int irq, int cpu)
 			    IRQF_NOBALANCING | IRQF_NO_AUTOEN |
 			    IRQF_NO_THREAD;
 
-		err = request_nmi(irq, handler, irq_flags, "arm-pmu",
-				  per_cpu_ptr(&cpu_armpmu, cpu));
+		err = request_nmi(irq, handler, irq_flags, "arm-pmu", armpmu);
 
 		/* If cannot get an NMI, get a normal interrupt */
 		if (err) {
 			err = request_irq(irq, handler, irq_flags, "arm-pmu",
-					  per_cpu_ptr(&cpu_armpmu, cpu));
+					  armpmu);
 			irq_ops = &pmuirq_ops;
 		} else {
 			has_nmi = true;
 			irq_ops = &pmunmi_ops;
 		}
-	} else if (armpmu_count_irq_users(irq) == 0) {
-		err = request_percpu_nmi(irq, handler, "arm-pmu", NULL, &cpu_armpmu);
+	} else if (armpmu_count_irq_users(affinity, irq) == 0) {
+		err = request_percpu_nmi(irq, handler, "arm-pmu", affinity, pcpu_armpmu);
 
 		/* If cannot get an NMI, get a normal interrupt */
 		if (err) {
-			err = request_percpu_irq(irq, handler, "arm-pmu",
-						 &cpu_armpmu);
+			err = request_percpu_irq_affinity(irq, handler, "arm-pmu",
+							  affinity, pcpu_armpmu);
 			irq_ops = &percpu_pmuirq_ops;
 		} else {
 			has_nmi = true;
@@ -672,7 +680,7 @@ int armpmu_request_irq(int irq, int cpu)
 		}
 	} else {
 		/* Per cpudevid irq was already requested by another CPU */
-		irq_ops = armpmu_find_irq_ops(irq);
+		irq_ops = armpmu_find_irq_ops(affinity, irq);
 
 		if (WARN_ON(!irq_ops))
 			err = -EINVAL;
diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
index 05dda19c5359..e80f76d95e68 100644
--- a/drivers/perf/arm_pmu_acpi.c
+++ b/drivers/perf/arm_pmu_acpi.c
@@ -218,7 +218,7 @@ static int arm_pmu_acpi_parse_irqs(void)
 		 * them with their PMUs.
 		 */
 		per_cpu(pmu_irqs, cpu) = irq;
-		err = armpmu_request_irq(irq, cpu);
+		err = armpmu_request_irq(&probed_pmus, irq, cpu);
 		if (err)
 			goto out_err;
 	}
diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 9c0494d8a867..1c9e50a13201 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -165,7 +165,7 @@ static int armpmu_request_irqs(struct arm_pmu *armpmu)
 		if (!irq)
 			continue;
 
-		err = armpmu_request_irq(irq, cpu);
+		err = armpmu_request_irq(&hw_events->percpu_pmu, irq, cpu);
 		if (err)
 			break;
 	}
@@ -181,7 +181,7 @@ static void armpmu_free_irqs(struct arm_pmu *armpmu)
 	for_each_cpu(cpu, &armpmu->supported_cpus) {
 		int irq = per_cpu(hw_events->irq, cpu);
 
-		armpmu_free_irq(irq, cpu);
+		armpmu_free_irq(&hw_events->percpu_pmu, irq, cpu);
 	}
 }
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 93c9a26492fc..6690bd77aa4e 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -190,8 +190,8 @@ bool arm_pmu_irq_is_nmi(void);
 struct arm_pmu *armpmu_alloc(void);
 void armpmu_free(struct arm_pmu *pmu);
 int armpmu_register(struct arm_pmu *pmu);
-int armpmu_request_irq(int irq, int cpu);
-void armpmu_free_irq(int irq, int cpu);
+int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu);
+void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu);
 
 #define ARMV8_PMU_PDEV_NAME "armv8-pmu"
 
-- 
cgit v1.2.3


From c620438ef2ac80b09269a9ae3c0b4fe5add19bfe Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:40 +0100
Subject: irqchip: Kill irq-partition-percpu

This code is now completely unused, and nobody will ever miss it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-24-maz@kernel.org
---
 drivers/irqchip/Kconfig                      |   3 -
 drivers/irqchip/Makefile                     |   1 -
 drivers/irqchip/irq-partition-percpu.c       | 241 ---------------------------
 include/linux/irqchip/irq-partition-percpu.h |  53 ------
 4 files changed, 298 deletions(-)
 delete mode 100644 drivers/irqchip/irq-partition-percpu.c
 delete mode 100644 include/linux/irqchip/irq-partition-percpu.h

(limited to 'include')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 648b3618fc78..5dddb4c9442a 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -450,9 +450,6 @@ config LS_SCFG_MSI
 	depends on PCI_MSI
 	select IRQ_MSI_LIB
 
-config PARTITION_PERCPU
-	bool
-
 config STM32MP_EXTI
 	tristate "STM32MP extended interrupts and event controller"
 	depends on (ARCH_STM32 && !ARM_SINGLE_ARMV7M) || COMPILE_TEST
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 3de083f5484c..6a229443efe0 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -36,7 +36,6 @@ obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
 obj-$(CONFIG_ARM_GIC_ITS_PARENT)	+= irq-gic-its-msi-parent.o
 obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v4.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
-obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
 obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o \
 					   irq-gic-v5-iwb.o
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)	+= irq-mbigen.o
diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c
deleted file mode 100644
index 4441ffe149ea..000000000000
--- a/drivers/irqchip/irq-partition-percpu.c
+++ /dev/null
@@ -1,241 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2016 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/bitops.h>
-#include <linux/interrupt.h>
-#include <linux/irqchip.h>
-#include <linux/irqchip/chained_irq.h>
-#include <linux/irqchip/irq-partition-percpu.h>
-#include <linux/irqdomain.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-
-struct partition_desc {
-	int				nr_parts;
-	struct partition_affinity	*parts;
-	struct irq_domain		*domain;
-	struct irq_desc			*chained_desc;
-	unsigned long			*bitmap;
-	struct irq_domain_ops		ops;
-};
-
-static bool partition_check_cpu(struct partition_desc *part,
-				unsigned int cpu, unsigned int hwirq)
-{
-	return cpumask_test_cpu(cpu, &part->parts[hwirq].mask);
-}
-
-static void partition_irq_mask(struct irq_data *d)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	if (partition_check_cpu(part, smp_processor_id(), d->hwirq) &&
-	    chip->irq_mask)
-		chip->irq_mask(data);
-}
-
-static void partition_irq_unmask(struct irq_data *d)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	if (partition_check_cpu(part, smp_processor_id(), d->hwirq) &&
-	    chip->irq_unmask)
-		chip->irq_unmask(data);
-}
-
-static int partition_irq_set_irqchip_state(struct irq_data *d,
-					   enum irqchip_irq_state which,
-					   bool val)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	if (partition_check_cpu(part, smp_processor_id(), d->hwirq) &&
-	    chip->irq_set_irqchip_state)
-		return chip->irq_set_irqchip_state(data, which, val);
-
-	return -EINVAL;
-}
-
-static int partition_irq_get_irqchip_state(struct irq_data *d,
-					   enum irqchip_irq_state which,
-					   bool *val)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	if (partition_check_cpu(part, smp_processor_id(), d->hwirq) &&
-	    chip->irq_get_irqchip_state)
-		return chip->irq_get_irqchip_state(data, which, val);
-
-	return -EINVAL;
-}
-
-static int partition_irq_set_type(struct irq_data *d, unsigned int type)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	if (chip->irq_set_type)
-		return chip->irq_set_type(data, type);
-
-	return -EINVAL;
-}
-
-static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p)
-{
-	struct partition_desc *part = irq_data_get_irq_chip_data(d);
-	struct irq_chip *chip = irq_desc_get_chip(part->chained_desc);
-	struct irq_data *data = irq_desc_get_irq_data(part->chained_desc);
-
-	seq_printf(p, "%5s-%lu", chip->name, data->hwirq);
-}
-
-static struct irq_chip partition_irq_chip = {
-	.irq_mask		= partition_irq_mask,
-	.irq_unmask		= partition_irq_unmask,
-	.irq_set_type		= partition_irq_set_type,
-	.irq_get_irqchip_state	= partition_irq_get_irqchip_state,
-	.irq_set_irqchip_state	= partition_irq_set_irqchip_state,
-	.irq_print_chip		= partition_irq_print_chip,
-};
-
-static void partition_handle_irq(struct irq_desc *desc)
-{
-	struct partition_desc *part = irq_desc_get_handler_data(desc);
-	struct irq_chip *chip = irq_desc_get_chip(desc);
-	int cpu = smp_processor_id();
-	int hwirq;
-
-	chained_irq_enter(chip, desc);
-
-	for_each_set_bit(hwirq, part->bitmap, part->nr_parts) {
-		if (partition_check_cpu(part, cpu, hwirq))
-			break;
-	}
-
-	if (unlikely(hwirq == part->nr_parts))
-		handle_bad_irq(desc);
-	else
-		generic_handle_domain_irq(part->domain, hwirq);
-
-	chained_irq_exit(chip, desc);
-}
-
-static int partition_domain_alloc(struct irq_domain *domain, unsigned int virq,
-				  unsigned int nr_irqs, void *arg)
-{
-	int ret;
-	irq_hw_number_t hwirq;
-	unsigned int type;
-	struct irq_fwspec *fwspec = arg;
-	struct partition_desc *part;
-
-	BUG_ON(nr_irqs != 1);
-	ret = domain->ops->translate(domain, fwspec, &hwirq, &type);
-	if (ret)
-		return ret;
-
-	part = domain->host_data;
-
-	set_bit(hwirq, part->bitmap);
-	irq_set_chained_handler_and_data(irq_desc_get_irq(part->chained_desc),
-					 partition_handle_irq, part);
-	irq_set_percpu_devid_partition(virq, &part->parts[hwirq].mask);
-	irq_domain_set_info(domain, virq, hwirq, &partition_irq_chip, part,
-			    handle_percpu_devid_irq, NULL, NULL);
-	irq_set_status_flags(virq, IRQ_NOAUTOEN);
-
-	return 0;
-}
-
-static void partition_domain_free(struct irq_domain *domain, unsigned int virq,
-				  unsigned int nr_irqs)
-{
-	struct irq_data *d;
-
-	BUG_ON(nr_irqs != 1);
-
-	d = irq_domain_get_irq_data(domain, virq);
-	irq_set_handler(virq, NULL);
-	irq_domain_reset_irq_data(d);
-}
-
-int partition_translate_id(struct partition_desc *desc, void *partition_id)
-{
-	struct partition_affinity *part = NULL;
-	int i;
-
-	for (i = 0; i < desc->nr_parts; i++) {
-		if (desc->parts[i].partition_id == partition_id) {
-			part = &desc->parts[i];
-			break;
-		}
-	}
-
-	if (WARN_ON(!part)) {
-		pr_err("Failed to find partition\n");
-		return -EINVAL;
-	}
-
-	return i;
-}
-
-struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode,
-					     struct partition_affinity *parts,
-					     int nr_parts,
-					     int chained_irq,
-					     const struct irq_domain_ops *ops)
-{
-	struct partition_desc *desc;
-	struct irq_domain *d;
-
-	BUG_ON(!ops->select || !ops->translate);
-
-	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
-	if (!desc)
-		return NULL;
-
-	desc->ops = *ops;
-	desc->ops.free = partition_domain_free;
-	desc->ops.alloc = partition_domain_alloc;
-
-	d = irq_domain_create_linear(fwnode, nr_parts, &desc->ops, desc);
-	if (!d)
-		goto out;
-	desc->domain = d;
-
-	desc->bitmap = bitmap_zalloc(nr_parts, GFP_KERNEL);
-	if (WARN_ON(!desc->bitmap))
-		goto out;
-
-	desc->chained_desc = irq_to_desc(chained_irq);
-	desc->nr_parts = nr_parts;
-	desc->parts = parts;
-
-	return desc;
-out:
-	if (d)
-		irq_domain_remove(d);
-	kfree(desc);
-
-	return NULL;
-}
-
-struct irq_domain *partition_get_domain(struct partition_desc *dsc)
-{
-	if (dsc)
-		return dsc->domain;
-
-	return NULL;
-}
diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h
deleted file mode 100644
index b35ee22c278f..000000000000
--- a/include/linux/irqchip/irq-partition-percpu.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2016 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H
-#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H
-
-#include <linux/fwnode.h>
-#include <linux/cpumask_types.h>
-#include <linux/irqdomain.h>
-
-struct partition_affinity {
-	cpumask_t			mask;
-	void				*partition_id;
-};
-
-struct partition_desc;
-
-#ifdef CONFIG_PARTITION_PERCPU
-int partition_translate_id(struct partition_desc *desc, void *partition_id);
-struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode,
-					     struct partition_affinity *parts,
-					     int nr_parts,
-					     int chained_irq,
-					     const struct irq_domain_ops *ops);
-struct irq_domain *partition_get_domain(struct partition_desc *dsc);
-#else
-static inline int partition_translate_id(struct partition_desc *desc,
-					 void *partition_id)
-{
-	return -EINVAL;
-}
-
-static inline
-struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode,
-					     struct partition_affinity *parts,
-					     int nr_parts,
-					     int chained_irq,
-					     const struct irq_domain_ops *ops)
-{
-	return NULL;
-}
-
-static inline
-struct irq_domain *partition_get_domain(struct partition_desc *dsc)
-{
-	return NULL;
-}
-#endif
-
-#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */
-- 
cgit v1.2.3


From ee2d50a9f524ae829d1a8ec296d7a0170e7b8ade Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:41 +0100
Subject: genirq: Kill irq_{g,s}et_percpu_devid_partition()

These two helpers no longer have any users and can be removed,
together with the affinity field kept in the irqdesc structure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-25-maz@kernel.org
---
 include/linux/irq.h     |  4 ----
 include/linux/irqdesc.h |  1 -
 kernel/irq/irqdesc.c    | 24 +-----------------------
 3 files changed, 1 insertion(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b728c18f6ded..4a9f1d7b08c3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -718,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq,
 }
 
 extern int irq_set_percpu_devid(unsigned int irq);
-extern int irq_set_percpu_devid_partition(unsigned int irq,
-					  const struct cpumask *affinity);
-extern int irq_get_percpu_devid_partition(unsigned int irq,
-					  struct cpumask *affinity);
 
 extern void
 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index fd091c35d572..37e0b5b5600a 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -82,7 +82,6 @@ struct irq_desc {
 	int			threads_handled_last;
 	raw_spinlock_t		lock;
 	struct cpumask		*percpu_enabled;
-	const struct cpumask	*percpu_affinity;
 #ifdef CONFIG_SMP
 	const struct cpumask	*affinity_hint;
 	struct irq_affinity_notify *affinity_notify;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index db714d3014b5..6acf268f005b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
 		chip_bus_sync_unlock(desc);
 }
 
-int irq_set_percpu_devid_partition(unsigned int irq,
-				   const struct cpumask *affinity)
+int irq_set_percpu_devid(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
@@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq,
 	if (!desc->percpu_enabled)
 		return -ENOMEM;
 
-	desc->percpu_affinity = affinity ? : cpu_possible_mask;
-
 	irq_set_percpu_devid_flags(irq);
 	return 0;
 }
 
-int irq_set_percpu_devid(unsigned int irq)
-{
-	return irq_set_percpu_devid_partition(irq, NULL);
-}
-
-int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	if (!desc || !desc->percpu_enabled)
-		return -EINVAL;
-
-	if (affinity)
-		cpumask_copy(affinity, desc->percpu_affinity);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
-
 void kstat_incr_irq_this_cpu(unsigned int irq)
 {
 	kstat_incr_irqs_this_cpu(irq_to_desc(irq));
-- 
cgit v1.2.3


From ebac4649fcadc6047030810326875c6e612c7b2f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:42 +0100
Subject: irqdomain: Kill of_node_to_fwnode() helper

There are no in-tree users of this helper since b13b41cc3dc18 ("misc:
ti_fpc202: Switch to of_fwnode_handle()"), and it has been replaced by
of_fwnode_handle().

Get rid of it.

Suggested-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-26-maz@kernel.org
---
 include/linux/irqdomain.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9d6a5e99394f..5907baf6099d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -730,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig
 }
 #endif
 
-/* Deprecated functions. Will be removed in the merge window */
-static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
-{
-	return node ? &node->fwnode : NULL;
-}
-
 static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
 						     const struct irq_domain_ops *ops,
 						     void *host_data)
-- 
cgit v1.2.3


From fa9d2777387346645a40ab37cfb0c37b3ef40cc9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 20 Oct 2025 13:29:43 +0100
Subject: perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer

Having removed the use of the cpu_armpmu per-CPU variable from the
interrupt handling, the only user left is the BRBE scheduler hook.

It is easy to drop the use of this variable by following the pointer to the
generic PMU structure, and get the arm_pmu structure from there.

Perform the conversion and kill cpu_armpmu altogether.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-27-maz@kernel.org
---
 drivers/perf/arm_pmu.c       | 5 -----
 drivers/perf/arm_pmuv3.c     | 2 +-
 include/linux/perf/arm_pmu.h | 2 --
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 959ceb3d1f55..f7abd1333963 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -104,7 +104,6 @@ static const struct pmu_irq_ops percpu_pmunmi_ops = {
 	.free_pmuirq = armpmu_free_percpu_pmunmi
 };
 
-DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
 static DEFINE_PER_CPU(int, cpu_irq);
 static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops);
 
@@ -725,8 +724,6 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node)
 	if (pmu->reset)
 		pmu->reset(pmu);
 
-	per_cpu(cpu_armpmu, cpu) = pmu;
-
 	irq = armpmu_get_cpu_irq(pmu, cpu);
 	if (irq)
 		per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq);
@@ -746,8 +743,6 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node)
 	if (irq)
 		per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq);
 
-	per_cpu(cpu_armpmu, cpu) = NULL;
-
 	return 0;
 }
 
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 69c5cc8f5606..ca8d706d4022 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -1064,7 +1064,7 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
 static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
 				struct task_struct *task, bool sched_in)
 {
-	struct arm_pmu *armpmu = *this_cpu_ptr(&cpu_armpmu);
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
 	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
 
 	if (!hw_events->branch_users)
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 6690bd77aa4e..bab26a7d79f4 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -132,8 +132,6 @@ struct arm_pmu {
 
 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
 
-DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu);
-
 u64 armpmu_event_update(struct perf_event *event);
 
 int armpmu_event_set_period(struct perf_event *event);
-- 
cgit v1.2.3


From 531b87d865eb9e625c2e46ec8f06a65a6157ee45 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sun, 26 Oct 2025 20:38:45 +0000
Subject: bpf: widen dynptr size/offset to 64 bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dynptr currently caps size and offset at 24 bits, which isn’t sufficient
for file-backed use cases; even 32 bits can be limiting. Refactor dynptr
helpers/kfuncs to use 64-bit size and offset, ensuring consistency
across the APIs.

This change does not affect internals of xdp, skb or other dynptrs,
which continue to behave as before. Also it does not break binary
compatibility.

The widening enables large-file access support via dynptr, implemented
in the next patches.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                | 20 +++----
 include/uapi/linux/bpf.h                           |  8 +--
 kernel/bpf/helpers.c                               | 66 +++++++++++-----------
 kernel/trace/bpf_trace.c                           | 46 +++++++--------
 tools/include/uapi/linux/bpf.h                     |  8 +--
 tools/testing/selftests/bpf/bpf_kfuncs.h           | 12 ++--
 tools/testing/selftests/bpf/progs/dynptr_success.c | 12 ++--
 7 files changed, 86 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e53cda0aabb6..907c69295293 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1387,19 +1387,19 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_SKB_META,
 };
 
-int bpf_dynptr_check_size(u32 size);
-u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
-const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
-void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
+int bpf_dynptr_check_size(u64 size);
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len);
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len);
 bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
-int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset,
-		       void *src, u32 len, u64 flags);
-void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
-			    void *buffer__opt, u32 buffer__szk);
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset,
+		       void *src, u64 len, u64 flags);
+void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+			    void *buffer__opt, u64 buffer__szk);
 
-static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len)
 {
-	u32 size = __bpf_dynptr_size(ptr);
+	u64 size = __bpf_dynptr_size(ptr);
 
 	if (len > size || offset > size - len)
 		return -E2BIG;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6829936d33f5..77edd0253989 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5618,7 +5618,7 @@ union bpf_attr {
  *	Return
  *		*sk* if casting is valid, or **NULL** otherwise.
  *
- * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
+ * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
  *	Description
  *		Get a dynptr to local memory *data*.
  *
@@ -5661,7 +5661,7 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
@@ -5671,7 +5671,7 @@ union bpf_attr {
  *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
  *		*flags* is not 0.
  *
- * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
@@ -5692,7 +5692,7 @@ union bpf_attr {
  *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
  *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
- * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
  *	Description
  *		Get a pointer to the underlying dynptr data.
  *
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b9ec6ee21c94..a2ce17ea5edb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1684,19 +1684,19 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
 	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
 }
 
-u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
 }
 
-static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
 {
 	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
 
-	ptr->size = new_size | metadata;
+	ptr->size = (u32)new_size | metadata;
 }
 
-int bpf_dynptr_check_size(u32 size)
+int bpf_dynptr_check_size(u64 size)
 {
 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
 }
@@ -1715,7 +1715,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
 	memset(ptr, 0, sizeof(*ptr));
 }
 
-BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
 {
 	int err;
 
@@ -1750,8 +1750,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
 };
 
-static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
-			     u32 offset, u64 flags)
+static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
+			     u64 offset, u64 flags)
 {
 	enum bpf_dynptr_type type;
 	int err;
@@ -1787,8 +1787,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
 	}
 }
 
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
-	   u32, offset, u64, flags)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
+	   u64, offset, u64, flags)
 {
 	return __bpf_dynptr_read(dst, len, src, offset, flags);
 }
@@ -1804,8 +1804,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
-int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
-		       u32 len, u64 flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
+		       u64 len, u64 flags)
 {
 	enum bpf_dynptr_type type;
 	int err;
@@ -1848,8 +1848,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
 	}
 }
 
-BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
-	   u32, len, u64, flags)
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
+	   u64, len, u64, flags)
 {
 	return __bpf_dynptr_write(dst, offset, src, len, flags);
 }
@@ -1865,7 +1865,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
 {
 	enum bpf_dynptr_type type;
 	int err;
@@ -2680,12 +2680,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
  * provided buffer, with its contents containing the data, if unable to obtain
  * direct pointer)
  */
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
-				   void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
+				   void *buffer__opt, u64 buffer__szk)
 {
 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 	enum bpf_dynptr_type type;
-	u32 len = buffer__szk;
+	u64 len = buffer__szk;
 	int err;
 
 	if (!ptr->data)
@@ -2767,8 +2767,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
  * provided buffer, with its contents containing the data, if unable to obtain
  * direct pointer)
  */
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
-					void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+					void *buffer__opt, u64 buffer__szk)
 {
 	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
@@ -2800,10 +2800,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
 	return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
 }
 
-__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
 {
 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
-	u32 size;
+	u64 size;
 
 	if (!ptr->data || start > end)
 		return -EINVAL;
@@ -2836,7 +2836,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
 	return __bpf_dynptr_is_rdonly(ptr);
 }
 
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
+__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
 {
 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
@@ -2873,14 +2873,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
  * Copies data from source dynptr to destination dynptr.
  * Returns 0 on success; negative error, otherwise.
  */
-__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
-				struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
+				struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
 {
 	struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
 	struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
 	void *src_slice, *dst_slice;
 	char buf[256];
-	u32 off;
+	u64 off;
 
 	src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
 	dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
@@ -2902,7 +2902,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
 
 	off = 0;
 	while (off < size) {
-		u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
+		u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
 		int err;
 
 		err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
@@ -2928,10 +2928,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
  * at @offset with the constant byte @val.
  * Returns 0 on success; negative error, otherwise.
  */
- __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val)
- {
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+{
 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
-	u32 chunk_sz, write_off;
+	u64 chunk_sz, write_off;
 	char buf[256];
 	void* slice;
 	int err;
@@ -2950,11 +2950,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
 		return err;
 
 	/* Non-linear data under the dynptr, write from a local buffer */
-	chunk_sz = min_t(u32, sizeof(buf), size);
+	chunk_sz = min_t(u64, sizeof(buf), size);
 	memset(buf, val, chunk_sz);
 
 	for (write_off = 0; write_off < size; write_off += chunk_sz) {
-		chunk_sz = min_t(u32, sizeof(buf), size - write_off);
+		chunk_sz = min_t(u64, sizeof(buf), size - write_off);
 		err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
 		if (err)
 			return err;
@@ -4469,7 +4469,7 @@ late_initcall(kfunc_init);
 /* Get a pointer to dynptr data up to len bytes for read only access. If
  * the dynptr doesn't have continuous data up to len bytes, return NULL.
  */
-const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
 {
 	const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
 
@@ -4480,7 +4480,7 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
  * the dynptr doesn't have continuous data up to len bytes, or the dynptr
  * is read only, return NULL.
  */
-void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
 {
 	if (__bpf_dynptr_is_rdonly(ptr))
 		return NULL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4f87c16d915a..a795f7afbf3d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
  * direct calls into all the specific callback implementations
  * (copy_user_data_sleepable, copy_user_data_nofault, and so on)
  */
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
 						 const void *unsafe_src,
 						 copy_fn_t str_copy_fn,
 						 struct task_struct *tsk)
 {
 	struct bpf_dynptr_kern *dst;
-	u32 chunk_sz, off;
+	u64 chunk_sz, off;
 	void *dst_slice;
 	int cnt, err;
 	char buf[256];
@@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
 		return -E2BIG;
 
 	for (off = 0; off < size; off += chunk_sz - 1) {
-		chunk_sz = min_t(u32, sizeof(buf), size - off);
+		chunk_sz = min_t(u64, sizeof(buf), size - off);
 		/* Expect str_copy_fn to return count of copied bytes, including
 		 * zero terminator. Next iteration increment off by chunk_sz - 1 to
 		 * overwrite NUL.
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
 	return off;
 }
 
-static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
-					     u32 size, const void *unsafe_src,
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff,
+					     u64 size, const void *unsafe_src,
 					     copy_fn_t copy_fn, struct task_struct *tsk)
 {
 	struct bpf_dynptr_kern *dst;
 	void *dst_slice;
 	char buf[256];
-	u32 off, chunk_sz;
+	u64 off, chunk_sz;
 	int err;
 
 	dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
@@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32
 		return -E2BIG;
 
 	for (off = 0; off < size; off += chunk_sz) {
-		chunk_sz = min_t(u32, sizeof(buf), size - off);
+		chunk_sz = min_t(u64, sizeof(buf), size - off);
 		err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
 		if (err)
 			return err;
@@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
 	return bpf_send_signal_common(sig, type, task, value);
 }
 
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
-					   u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+					   u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
 				 copy_user_data_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
-					     u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+					     u64 size, const void *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
 				 copy_kernel_data_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
-					       u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+					       u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
 				     copy_user_str_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
-						 u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+						 u64 size, const void *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
 				     copy_kernel_str_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
-					  u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+					  u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
 				 copy_user_data_sleepable, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
-					      u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+					      u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
 				     copy_user_str_sleepable, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
-					       u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+					       u64 size, const void __user *unsafe_ptr__ign,
 					       struct task_struct *tsk)
 {
 	return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
 				 copy_user_data_sleepable, tsk);
 }
 
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
-						   u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+						   u64 size, const void __user *unsafe_ptr__ign,
 						   struct task_struct *tsk)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6829936d33f5..77edd0253989 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5618,7 +5618,7 @@ union bpf_attr {
  *	Return
  *		*sk* if casting is valid, or **NULL** otherwise.
  *
- * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
+ * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
  *	Description
  *		Get a dynptr to local memory *data*.
  *
@@ -5661,7 +5661,7 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
@@ -5671,7 +5671,7 @@ union bpf_attr {
  *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
  *		*flags* is not 0.
  *
- * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
@@ -5692,7 +5692,7 @@ union bpf_attr {
  *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
  *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
- * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
  *	Description
  *		Get a pointer to the underlying dynptr data.
  *
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 794d44d19c88..e0189254bb6e 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
  *  Either a direct pointer to the dynptr data or a pointer to the user-provided
  *  buffer if unable to obtain a direct pointer
  */
-extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
-			      void *buffer, __u32 buffer__szk) __ksym __weak;
+extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset,
+			      void *buffer, __u64 buffer__szk) __ksym __weak;
 
 /* Description
  *  Obtain a read-write pointer to the dynptr's data
@@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
  *  Either a direct pointer to the dynptr data or a pointer to the user-provided
  *  buffer if unable to obtain a direct pointer
  */
-extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset,
-			      void *buffer, __u32 buffer__szk) __ksym __weak;
+extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer,
+				   __u64 buffer__szk) __ksym __weak;
 
-extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak;
+extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak;
 extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak;
 extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak;
-extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
+extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
 extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak;
 
 /* Description
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index 127dea342e5a..e0d672d93adf 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -914,8 +914,8 @@ void *user_ptr;
 char expected_str[384];
 __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257};
 
-typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off,
-				    u32 size, const void *unsafe_ptr);
+typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off,
+				    u64 size, const void *unsafe_ptr);
 
 /* Returns the offset just before the end of the maximum sized xdp fragment.
  * Any write larger than 32 bytes will be split between 2 fragments.
@@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx)
 	return 0;
 }
 
-static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off,
-					u32 size, const void *unsafe_ptr)
+static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off,
+					u64 size, const void *unsafe_ptr)
 {
 	struct task_struct *task = bpf_get_current_task_btf();
 
 	return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task);
 }
 
-static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off,
-					    u32 size, const void *unsafe_ptr)
+static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off,
+					    u64 size, const void *unsafe_ptr)
 {
 	struct task_struct *task = bpf_get_current_task_btf();
 
-- 
cgit v1.2.3


From 76e4fed847124690f7344a43d01dbcd7b2925353 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sun, 26 Oct 2025 20:38:46 +0000
Subject: lib: move freader into buildid.h

Move struct freader and the prototypes of the functions operating on it
into buildid.h.

This allows reusing freader outside buildid, e.g. for file dynptr
support added later.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20251026203853.135105-4-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS             |  1 +
 include/linux/buildid.h | 25 +++++++++++++++++++++++++
 lib/buildid.c           | 29 +++++------------------------
 3 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 545a4776795e..7564692f2f3c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4648,6 +4648,7 @@ F:	Documentation/userspace-api/ebpf/
 F:	arch/*/net/*
 F:	include/linux/bpf*
 F:	include/linux/btf*
+F:	include/linux/buildid.h
 F:	include/linux/filter.h
 F:	include/trace/events/xdp.h
 F:	include/uapi/linux/bpf*
diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 014a88c41073..831c1b4b626c 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -18,4 +18,29 @@ void init_vmlinux_build_id(void);
 static inline void init_vmlinux_build_id(void) { }
 #endif
 
+struct freader {
+	void *buf;
+	u32 buf_sz;
+	int err;
+	union {
+		struct {
+			struct file *file;
+			struct folio *folio;
+			void *addr;
+			loff_t folio_off;
+			bool may_fault;
+		};
+		struct {
+			const char *data;
+			u64 data_sz;
+		};
+	};
+};
+
+void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
+			    struct file *file, bool may_fault);
+void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz);
+const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz);
+void freader_cleanup(struct freader *r);
+
 #endif
diff --git a/lib/buildid.c b/lib/buildid.c
index c4b0f376fb34..df06e492810d 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -11,27 +11,8 @@
 
 #define MAX_PHDR_CNT 256
 
-struct freader {
-	void *buf;
-	u32 buf_sz;
-	int err;
-	union {
-		struct {
-			struct file *file;
-			struct folio *folio;
-			void *addr;
-			loff_t folio_off;
-			bool may_fault;
-		};
-		struct {
-			const char *data;
-			u64 data_sz;
-		};
-	};
-};
-
-static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
-				   struct file *file, bool may_fault)
+void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
+			    struct file *file, bool may_fault)
 {
 	memset(r, 0, sizeof(*r));
 	r->buf = buf;
@@ -40,7 +21,7 @@ static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
 	r->may_fault = may_fault;
 }
 
-static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
+void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
 {
 	memset(r, 0, sizeof(*r));
 	r->data = data;
@@ -92,7 +73,7 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
 	return 0;
 }
 
-static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
+const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
 {
 	size_t folio_sz;
 
@@ -147,7 +128,7 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
 	return r->addr + (file_off - r->folio_off);
 }
 
-static void freader_cleanup(struct freader *r)
+void freader_cleanup(struct freader *r)
 {
 	if (!r->buf)
 		return; /* non-file-backed mode */
-- 
cgit v1.2.3


From 8d8771dc03e48300e80b43744dd3c320ccaf746a Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sun, 26 Oct 2025 20:38:49 +0000
Subject: bpf: add plumbing for file-backed dynptr

Add the necessary verifier plumbing for the new file-backed dynptr type.
Introduce two kfuncs for its lifecycle management:
 * bpf_dynptr_from_file() for initialization
 * bpf_dynptr_file_discard() for destruction

Currently there is no mechanism for a kfunc to release a dynptr; this patch
adds one:
 * Dynptr release function sets meta->release_regno
 * Call unmark_stack_slots_dynptr() if meta->release_regno is set and
 dynptr ref_obj_id is set as well.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-7-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  7 ++++++-
 kernel/bpf/helpers.c  | 12 ++++++++++++
 kernel/bpf/log.c      |  2 ++
 kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++------
 4 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 907c69295293..14f800773997 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -792,12 +792,15 @@ enum bpf_type_flag {
 	/* DYNPTR points to skb_metadata_end()-skb_metadata_len() */
 	DYNPTR_TYPE_SKB_META	= BIT(19 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to file */
+	DYNPTR_TYPE_FILE	= BIT(20 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
 #define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
-				 | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)
+				 | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1385,6 +1388,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_XDP,
 	/* Points to skb_metadata_end()-skb_metadata_len() */
 	BPF_DYNPTR_TYPE_SKB_META,
+	/* Underlying data is a file */
+	BPF_DYNPTR_TYPE_FILE,
 };
 
 int bpf_dynptr_check_size(u64 size);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a2ce17ea5edb..bf65b7fb761f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4252,6 +4252,16 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b
 	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
 }
 
+__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+	return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
+{
+	return 0;
+}
+
 __bpf_kfunc_end_defs();
 
 static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
@@ -4429,6 +4439,8 @@ BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index f50533169cc3..70221aafc35c 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -500,6 +500,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
 		return "xdp";
 	case BPF_DYNPTR_TYPE_SKB_META:
 		return "skb_meta";
+	case BPF_DYNPTR_TYPE_FILE:
+		return "file";
 	case BPF_DYNPTR_TYPE_INVALID:
 		return "<invalid>";
 	default:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f60cfab95230..cd48ead852a0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -692,6 +692,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_XDP;
 	case DYNPTR_TYPE_SKB_META:
 		return BPF_DYNPTR_TYPE_SKB_META;
+	case DYNPTR_TYPE_FILE:
+		return BPF_DYNPTR_TYPE_FILE;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -710,6 +712,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
 		return DYNPTR_TYPE_XDP;
 	case BPF_DYNPTR_TYPE_SKB_META:
 		return DYNPTR_TYPE_SKB_META;
+	case BPF_DYNPTR_TYPE_FILE:
+		return DYNPTR_TYPE_FILE;
 	default:
 		return 0;
 	}
@@ -717,7 +721,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
 
 static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 {
-	return type == BPF_DYNPTR_TYPE_RINGBUF;
+	return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
 }
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -12291,6 +12295,8 @@ enum special_kfunc_type {
 	KF_bpf_res_spin_unlock,
 	KF_bpf_res_spin_lock_irqsave,
 	KF_bpf_res_spin_unlock_irqrestore,
+	KF_bpf_dynptr_from_file,
+	KF_bpf_dynptr_file_discard,
 	KF___bpf_trap,
 	KF_bpf_task_work_schedule_signal,
 	KF_bpf_task_work_schedule_resume,
@@ -12363,6 +12369,8 @@ BTF_ID(func, bpf_res_spin_lock)
 BTF_ID(func, bpf_res_spin_unlock)
 BTF_ID(func, bpf_res_spin_lock_irqsave)
 BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_file)
+BTF_ID(func, bpf_dynptr_file_discard)
 BTF_ID(func, __bpf_trap)
 BTF_ID(func, bpf_task_work_schedule_signal)
 BTF_ID(func, bpf_task_work_schedule_resume)
@@ -13326,6 +13334,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				dynptr_arg_type |= DYNPTR_TYPE_XDP;
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
 				dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+				dynptr_arg_type |= DYNPTR_TYPE_FILE;
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
+				dynptr_arg_type |= DYNPTR_TYPE_FILE;
+				meta->release_regno = regno;
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
 				   (dynptr_arg_type & MEM_UNINIT)) {
 				enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
@@ -14006,12 +14019,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
 	if (meta.release_regno) {
-		err = release_reference(env, regs[meta.release_regno].ref_obj_id);
-		if (err) {
-			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, meta.func_id);
-			return err;
+		struct bpf_reg_state *reg = &regs[meta.release_regno];
+
+		if (meta.initialized_dynptr.ref_obj_id) {
+			err = unmark_stack_slots_dynptr(env, reg);
+		} else {
+			err = release_reference(env, reg->ref_obj_id);
+			if (err)
+				verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+					func_name, meta.func_id);
 		}
+		if (err)
+			return err;
 	}
 
 	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
-- 
cgit v1.2.3


From 2c52e8943a437af6093d8b0f0920f1764f0e5f64 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sun, 26 Oct 2025 20:38:52 +0000
Subject: bpf: dispatch to sleepable file dynptr

File dynptr reads may sleep when the requested folios are not in
the page cache. To avoid sleeping in non-sleepable contexts while still
supporting valid sleepable use, given that dynptrs are non-sleepable by
default, enable sleeping only when bpf_dynptr_from_file() is invoked
from a sleepable context.

This change:
  * Introduces a sleepable constructor: bpf_dynptr_from_file_sleepable()
  * Overrides the non-sleepable constructor with the sleepable one if it's
  always called in a sleepable context

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-10-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  3 +++
 kernel/bpf/helpers.c  |  5 +++++
 kernel/bpf/verifier.c | 10 +++++++---
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 14f800773997..a47d67db3be5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -670,6 +670,9 @@ static inline bool bpf_map_has_internal_structs(struct bpf_map *map)
 
 void bpf_map_free_internal_structs(struct bpf_map *map, void *obj);
 
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
+				   struct bpf_dynptr *ptr__uninit);
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* bpf_type_flag contains a set of flags that are applicable to the values of
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 99a7def0b978..930e132f440f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4336,6 +4336,11 @@ __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dy
 	return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
 }
 
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+	return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
 __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
 {
 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 61589be91c65..542e23fb19c7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3124,7 +3124,8 @@ struct bpf_kfunc_btf_tab {
 	u32 nr_descs;
 };
 
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc);
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
+			    int insn_idx);
 
 static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
 {
@@ -21869,7 +21870,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 }
 
 /* replace a generic kfunc with a specialized version if necessary */
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
 {
 	struct bpf_prog *prog = env->prog;
 	bool seen_direct_write;
@@ -21904,6 +21905,9 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc
 	} else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
 		if (bpf_lsm_has_d_inode_locked(prog))
 			addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+	} else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+		if (!env->insn_aux_data[insn_idx].non_sleepable)
+			addr = (unsigned long)bpf_dynptr_from_file_sleepable;
 	}
 
 set_imm:
@@ -21963,7 +21967,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EFAULT;
 	}
 
-	err = specialize_kfunc(env, desc);
+	err = specialize_kfunc(env, desc, insn_idx);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 457129aa3610f46bfa6f97725de731345d4aaef0 Mon Sep 17 00:00:00 2001
From: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Date: Wed, 22 Oct 2025 21:57:36 -0700
Subject: dt-bindings: arm: qcom,ids: Add SoC ID for SM8850

Add the ID for the Qualcomm SM8850 SoC which represents the Kaanapali
platform.

Signed-off-by: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20251022-knp-socid-v2-1-d147eadd09ee@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index cb8ce53146f0..19598ed4679e 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -286,6 +286,7 @@
 #define QCOM_ID_IPQ5424			651
 #define QCOM_ID_QCM6690			657
 #define QCOM_ID_QCS6690			658
+#define QCOM_ID_SM8850			660
 #define QCOM_ID_IPQ5404			671
 #define QCOM_ID_QCS9100			667
 #define QCOM_ID_QCS8300			674
-- 
cgit v1.2.3


From 82cb5be6ad64198a3a028aeb49dcc7f6224d558a Mon Sep 17 00:00:00 2001
From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
Date: Wed, 22 Oct 2025 10:19:36 +1000
Subject: net/tls: support setting the maximum payload size

During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. This means that outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.

Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.

Currently, there is no way to inform the kernel of such a limit. This patch
adds support for a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.

[1] https://www.rfc-editor.org/rfc/rfc8449

Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20251022001937.20155-1-wilfred.opensource@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/tls.rst | 20 +++++++++++++
 include/net/tls.h                |  3 ++
 include/uapi/linux/tls.h         |  2 ++
 net/tls/tls_device.c             |  2 +-
 net/tls/tls_main.c               | 64 ++++++++++++++++++++++++++++++++++++++++
 net/tls/tls_sw.c                 |  2 +-
 6 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..980c442d7161 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,26 @@ If the record decrypted turns out to had been padded or is not a data
 record it will be decrypted again into a kernel buffer without zero copy.
 Such events are counted in the ``TlsDecryptRetry`` statistic.
 
+TLS_TX_MAX_PAYLOAD_LEN
+~~~~~~~~~~~~~~~~~~~~~~
+
+Specifies the maximum size of the plaintext payload for transmitted TLS records.
+
+When this option is set, the kernel enforces the specified limit on all outgoing
+TLS records. No plaintext fragment will exceed this size. This option can be used
+to implement the TLS Record Size Limit extension [1].
+
+* For TLS 1.2, the value corresponds directly to the record size limit.
+* For TLS 1.3, the value should be set to record_size_limit - 1, since
+  the record size limit includes one additional byte for the ContentType
+  field.
+
+The valid range for this option is 64 to 16384 bytes for TLS 1.2, and 63 to
+16384 bytes for TLS 1.3. The lower minimum for TLS 1.3 accounts for the
+extra byte used by the ContentType field.
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
 Statistics
 ==========
 
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..f2af113728aa 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
 
 /* Maximum data size carried in a TLS record */
 #define TLS_MAX_PAYLOAD_SIZE		((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM		((size_t)1 << 6)
 
 #define TLS_HEADER_SIZE			5
 #define TLS_NONCE_OFFSET		TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
 	u8 rx_conf:3;
 	u8 zerocopy_sendfile:1;
 	u8 rx_no_pad:1;
+	u16 tx_max_payload_len;
 
 	int (*push_pending_record)(struct sock *sk, int flags);
 	void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..b8b9c42f848c 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
 #define TLS_RX			2	/* Set receive parameters */
 #define TLS_TX_ZEROCOPY_RO	3	/* TX zerocopy (only sendfile now) */
 #define TLS_RX_EXPECT_NO_PAD	4	/* Attempt opportunistic zero-copy */
+#define TLS_TX_MAX_PAYLOAD_LEN	5	/* Maximum plaintext size */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
 	TLS_INFO_RXCONF,
 	TLS_INFO_ZC_RO_TX,
 	TLS_INFO_RX_NO_PAD,
+	TLS_INFO_TX_MAX_PAYLOAD_LEN,
 	__TLS_INFO_MAX,
 };
 #define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index caa2b5d24622..4d29b390aed9 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -462,7 +462,7 @@ static int tls_push_data(struct sock *sk,
 	/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
 	 * we need to leave room for an authentication tag.
 	 */
-	max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+	max_open_record_len = tls_ctx->tx_max_payload_len +
 			      prot->prepend_size;
 	do {
 		rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 39a2ab47fe72..56ce0bc8317b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -541,6 +541,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval,
+					    int __user *optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	u16 payload_len = ctx->tx_max_payload_len;
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < sizeof(payload_len))
+		return -EINVAL;
+
+	if (put_user(sizeof(payload_len), optlen))
+		return -EFAULT;
+
+	if (copy_to_user(optval, &payload_len, sizeof(payload_len)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int do_tls_getsockopt(struct sock *sk, int optname,
 			     char __user *optval, int __user *optlen)
 {
@@ -560,6 +582,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
 	case TLS_RX_EXPECT_NO_PAD:
 		rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
 		break;
+	case TLS_TX_MAX_PAYLOAD_LEN:
+		rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen);
+		break;
 	default:
 		rc = -ENOPROTOOPT;
 		break;
@@ -809,6 +834,32 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
 	return rc;
 }
 
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
+					    unsigned int optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+	u16 value;
+	bool tls_13 = ctx->prot_info.version == TLS_1_3_VERSION;
+
+	if (sw_ctx && sw_ctx->open_rec)
+		return -EBUSY;
+
+	if (sockptr_is_null(optval) || optlen != sizeof(value))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&value, optval, sizeof(value)))
+		return -EFAULT;
+
+	if (value < TLS_MIN_RECORD_SIZE_LIM - (tls_13 ? 1 : 0) ||
+	    value > TLS_MAX_PAYLOAD_SIZE)
+		return -EINVAL;
+
+	ctx->tx_max_payload_len = value;
+
+	return 0;
+}
+
 static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 			     unsigned int optlen)
 {
@@ -830,6 +881,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 	case TLS_RX_EXPECT_NO_PAD:
 		rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
 		break;
+	case TLS_TX_MAX_PAYLOAD_LEN:
+		lock_sock(sk);
+		rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen);
+		release_sock(sk);
+		break;
 	default:
 		rc = -ENOPROTOOPT;
 		break;
@@ -1019,6 +1075,7 @@ static int tls_init(struct sock *sk)
 
 	ctx->tx_conf = TLS_BASE;
 	ctx->rx_conf = TLS_BASE;
+	ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
 	update_sk_prot(sk, ctx);
 out:
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -1108,6 +1165,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
 			goto nla_failure;
 	}
 
+	err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN,
+			  ctx->tx_max_payload_len);
+
+	if (err)
+		goto nla_failure;
+
 	rcu_read_unlock();
 	nla_nest_end(skb, start);
 	return 0;
@@ -1129,6 +1192,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
 		nla_total_size(sizeof(u16)) +	/* TLS_INFO_TXCONF */
 		nla_total_size(0) +		/* TLS_INFO_ZC_RO_TX */
 		nla_total_size(0) +		/* TLS_INFO_RX_NO_PAD */
+		nla_total_size(sizeof(u16)) +   /* TLS_INFO_TX_MAX_PAYLOAD_LEN */
 		0;
 
 	return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d17135369980..9937d4c810f2 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
 		orig_size = msg_pl->sg.size;
 		full_record = false;
 		try_to_copy = msg_data_left(msg);
-		record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+		record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size;
 		if (try_to_copy >= record_room) {
 			try_to_copy = record_room;
 			full_record = true;
-- 
cgit v1.2.3


From 151b98d10ef7c3174465e409b99d8762e7e8de60 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 23 Oct 2025 23:16:53 +0000
Subject: net: Add sk_clone().

sctp_accept() will use sk_clone_lock(), but it will be called
with the parent socket locked, and sctp_migrate() acquires the
child lock later.

Let's add a no-lock version of sk_clone_lock().

Note that lockdep complains if we simply use bh_lock_sock_nested().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h |  7 ++++++-
 net/core/sock.c    | 24 ++++++++++++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 01ce231603db..c7e58b8e8a90 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1822,7 +1822,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 void sk_free(struct sock *sk);
 void sk_net_refcnt_upgrade(struct sock *sk);
 void sk_destruct(struct sock *sk);
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock);
+
+static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+{
+	return sk_clone(sk, priority, true);
+}
 
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 			     gfp_t priority);
diff --git a/net/core/sock.c b/net/core/sock.c
index a99132cc0965..7a9bbc2afcf0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2462,13 +2462,16 @@ static void sk_init_common(struct sock *sk)
 }
 
 /**
- *	sk_clone_lock - clone a socket, and lock its clone
- *	@sk: the socket to clone
- *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * sk_clone - clone a socket
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @lock: if true, lock the cloned sk
  *
- *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ * If @lock is true, the clone is locked by bh_lock_sock(), and
+ * caller must unlock socket even in error path by bh_unlock_sock().
  */
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
+		      bool lock)
 {
 	struct proto *prot = READ_ONCE(sk->sk_prot);
 	struct sk_filter *filter;
@@ -2497,9 +2500,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
 				      false, priority);
 	}
+
 	sk_node_init(&newsk->sk_node);
 	sock_lock_init(newsk);
-	bh_lock_sock(newsk);
+
+	if (lock)
+		bh_lock_sock(newsk);
+
 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
 	newsk->sk_backlog.len = 0;
 
@@ -2590,12 +2597,13 @@ free:
 	 * destructor and make plain sk_free()
 	 */
 	newsk->sk_destruct = NULL;
-	bh_unlock_sock(newsk);
+	if (lock)
+		bh_unlock_sock(newsk);
 	sk_free(newsk);
 	newsk = NULL;
 	goto out;
 }
-EXPORT_SYMBOL_GPL(sk_clone_lock);
+EXPORT_SYMBOL_GPL(sk_clone);
 
 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
 {
-- 
cgit v1.2.3


From c49ed521f1772ca9203d22a1e5950f337fd5f930 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 23 Oct 2025 23:16:55 +0000
Subject: sctp: Remove sctp_pf.create_accept_sk().

sctp_v[46]_create_accept_sk() are no longer used.

Let's remove sctp_pf.create_accept_sk().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/structs.h |  3 ---
 net/sctp/ipv6.c            | 45 ---------------------------------------------
 net/sctp/protocol.c        | 27 ---------------------------
 3 files changed, 75 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2ae390219efd..3dd304e411d0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -497,9 +497,6 @@ struct sctp_pf {
 	int  (*bind_verify) (struct sctp_sock *, union sctp_addr *);
 	int  (*send_verify) (struct sctp_sock *, union sctp_addr *);
 	int  (*supported_addrs)(const struct sctp_sock *, __be16 *);
-	struct sock *(*create_accept_sk) (struct sock *sk,
-					  struct sctp_association *asoc,
-					  bool kern);
 	int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr);
 	void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
 	void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c0762424a854..069b7e45d8bd 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -777,50 +777,6 @@ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr)
 	return retval;
 }
 
-/* Create and initialize a new sk for the socket to be returned by accept(). */
-static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
-					     struct sctp_association *asoc,
-					     bool kern)
-{
-	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
-	struct sctp6_sock *newsctp6sk;
-	struct inet_sock *newinet;
-	struct sock *newsk;
-
-	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
-	if (!newsk)
-		return NULL;
-
-	sock_init_data(NULL, newsk);
-
-	sctp_copy_sock(newsk, sk, asoc);
-	sock_reset_flag(sk, SOCK_ZAPPED);
-
-	newsctp6sk = (struct sctp6_sock *)newsk;
-	newinet = inet_sk(newsk);
-	newinet->pinet6 = &newsctp6sk->inet6;
-	newinet->ipv6_fl_list = NULL;
-
-	sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped;
-
-	newnp = inet6_sk(newsk);
-
-	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-	newnp->ipv6_mc_list = NULL;
-	newnp->ipv6_ac_list = NULL;
-
-	sctp_v6_copy_ip_options(sk, newsk);
-
-	/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
-	 * and getpeername().
-	 */
-	sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk);
-
-	newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
-
-	return newsk;
-}
-
 /* Format a sockaddr for return to user space. This makes sure the return is
  * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option.
  */
@@ -1167,7 +1123,6 @@ static struct sctp_pf sctp_pf_inet6 = {
 	.bind_verify   = sctp_inet6_bind_verify,
 	.send_verify   = sctp_inet6_send_verify,
 	.supported_addrs = sctp_inet6_supported_addrs,
-	.create_accept_sk = sctp_v6_create_accept_sk,
 	.addr_to_user  = sctp_v6_addr_to_user,
 	.to_sk_saddr   = sctp_v6_to_sk_saddr,
 	.to_sk_daddr   = sctp_v6_to_sk_daddr,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index ad2722d1ec15..2c3398f75d76 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -580,32 +580,6 @@ static int sctp_v4_is_ce(const struct sk_buff *skb)
 	return INET_ECN_is_ce(ip_hdr(skb)->tos);
 }
 
-/* Create and initialize a new sk for the socket returned by accept(). */
-static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
-					     struct sctp_association *asoc,
-					     bool kern)
-{
-	struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
-			sk->sk_prot, kern);
-	struct inet_sock *newinet;
-
-	if (!newsk)
-		return NULL;
-
-	sock_init_data(NULL, newsk);
-
-	sctp_copy_sock(newsk, sk, asoc);
-	sock_reset_flag(newsk, SOCK_ZAPPED);
-
-	sctp_v4_copy_ip_options(sk, newsk);
-
-	newinet = inet_sk(newsk);
-
-	newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
-
-	return newsk;
-}
-
 static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
 {
 	/* No address mapping for V4 sockets */
@@ -1113,7 +1087,6 @@ static struct sctp_pf sctp_pf_inet = {
 	.bind_verify   = sctp_inet_bind_verify,
 	.send_verify   = sctp_inet_send_verify,
 	.supported_addrs = sctp_inet_supported_addrs,
-	.create_accept_sk = sctp_v4_create_accept_sk,
 	.addr_to_user  = sctp_v4_addr_to_user,
 	.to_sk_saddr   = sctp_v4_to_sk_saddr,
 	.to_sk_daddr   = sctp_v4_to_sk_daddr,
-- 
cgit v1.2.3


From 71068e2e1b6bd78f5599e5bc89e125a75149884b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 23 Oct 2025 23:16:57 +0000
Subject: sctp: Remove sctp_copy_sock() and sctp_copy_descendant().

Now, sctp_accept() and sctp_do_peeloff() use sk_clone(), and
we no longer need sctp_copy_sock() and sctp_copy_descendant().

Let's remove them.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-9-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_sock.h |  8 ------
 include/net/sctp/sctp.h |  3 +--
 net/sctp/socket.c       | 71 -------------------------------------------------
 3 files changed, 1 insertion(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b6ec08072533..ac1c75975908 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -355,14 +355,6 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
 
 #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)
 
-static inline void __inet_sk_copy_descendant(struct sock *sk_to,
-					     const struct sock *sk_from,
-					     const int ancestor_size)
-{
-	memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
-	       sk_from->sk_prot->obj_size - ancestor_size);
-}
-
 int inet_sk_rebuild_header(struct sock *sk);
 
 /**
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e96d1bd087f6..bb4b80c12541 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -94,8 +94,7 @@ void sctp_data_ready(struct sock *sk);
 __poll_t sctp_poll(struct file *file, struct socket *sock,
 		poll_table *wait);
 void sctp_sock_rfree(struct sk_buff *skb);
-void sctp_copy_sock(struct sock *newsk, struct sock *sk,
-		    struct sctp_association *asoc);
+
 extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 60d3e340dfed..ac737e60829b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9491,72 +9491,6 @@ done:
 	sctp_skb_set_owner_r(skb, sk);
 }
 
-void sctp_copy_sock(struct sock *newsk, struct sock *sk,
-		    struct sctp_association *asoc)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct inet_sock *newinet;
-	struct sctp_sock *sp = sctp_sk(sk);
-
-	newsk->sk_type = sk->sk_type;
-	newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
-	newsk->sk_flags = sk->sk_flags;
-	newsk->sk_tsflags = sk->sk_tsflags;
-	newsk->sk_no_check_tx = sk->sk_no_check_tx;
-	newsk->sk_no_check_rx = sk->sk_no_check_rx;
-	newsk->sk_reuse = sk->sk_reuse;
-	sctp_sk(newsk)->reuse = sp->reuse;
-
-	newsk->sk_shutdown = sk->sk_shutdown;
-	newsk->sk_destruct = sk->sk_destruct;
-	newsk->sk_family = sk->sk_family;
-	newsk->sk_protocol = IPPROTO_SCTP;
-	newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
-	newsk->sk_sndbuf = sk->sk_sndbuf;
-	newsk->sk_rcvbuf = sk->sk_rcvbuf;
-	newsk->sk_lingertime = sk->sk_lingertime;
-	newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo);
-	newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo);
-	newsk->sk_rxhash = sk->sk_rxhash;
-	newsk->sk_gso_type = sk->sk_gso_type;
-
-	newinet = inet_sk(newsk);
-
-	/* Initialize sk's sport, dport, rcv_saddr and daddr for
-	 * getsockname() and getpeername()
-	 */
-	newinet->inet_sport = inet->inet_sport;
-	newinet->inet_saddr = inet->inet_saddr;
-	newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
-	newinet->inet_dport = htons(asoc->peer.port);
-	newinet->pmtudisc = inet->pmtudisc;
-	atomic_set(&newinet->inet_id, get_random_u16());
-
-	newinet->uc_ttl = inet->uc_ttl;
-	inet_set_bit(MC_LOOP, newsk);
-	newinet->mc_ttl = 1;
-	newinet->mc_index = 0;
-	newinet->mc_list = NULL;
-
-	if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
-		net_enable_timestamp();
-
-	/* Set newsk security attributes from original sk and connection
-	 * security attribute from asoc.
-	 */
-	security_sctp_sk_clone(asoc, sk, newsk);
-}
-
-static inline void sctp_copy_descendant(struct sock *sk_to,
-					const struct sock *sk_from)
-{
-	size_t ancestor_size = sizeof(struct inet_sock);
-
-	ancestor_size += sk_from->sk_prot->obj_size;
-	ancestor_size -= offsetof(struct sctp_sock, pd_lobby);
-	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
-}
-
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -9573,11 +9507,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sctp_bind_hashbucket *head;
 	int err;
 
-	/* Migrate all the socket level options to the new socket.
-	 * Brute force copy old sctp opt.
-	 */
-	sctp_copy_descendant(newsk, oldsk);
-
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
 	 */
-- 
cgit v1.2.3


From 6f147c8328e045de3a35155ca7c883d88da9e916 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 24 Oct 2025 10:51:45 +0800
Subject: net/sched: Remove unused typedef psched_tdiff_t

Since commit 051d44209842 ("net/sched: Retire CBQ qdisc")
this is not used anymore.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://patch.msgid.link/20251024025145.4069583-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index c660ac871083..4678db45832a 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -43,7 +43,6 @@ struct qdisc_walker {
  */
 
 typedef u64	psched_time_t;
-typedef long	psched_tdiff_t;
 
 /* Avoid doing 64 bit divide */
 #define PSCHED_SHIFT			6
-- 
cgit v1.2.3


From feeaf1346f80ffb181b6f9b739628103aa73b067 Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Sat, 18 Oct 2025 11:57:36 +0800
Subject: bpf: Add overwrite mode for BPF ring buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the BPF ring buffer is full, a new event cannot be recorded until one
or more old events are consumed to make enough space for it. In cases such
as fault diagnostics, where recent events are more useful than older ones,
this mechanism may lead to critical events being lost.

So add overwrite mode for BPF ring buffer to address it. In this mode, the
new event overwrites the oldest event when the buffer is full.

The basic idea is as follows:

1. producer_pos tracks the next position at which a new event is recorded.
   When there is enough free space, producer_pos is simply advanced by the
   producer to make space for the new event.

2. To avoid waiting for consumer when the buffer is full, a new variable,
   overwrite_pos, is introduced for producer. It points to the oldest event
   committed in the buffer. It is advanced by producer to discard one or more
   oldest events to make space for the new event when the buffer is full.

3. pending_pos tracks the oldest event that has not yet been committed.
   producer_pos is never allowed to wrap around and pass pending_pos, so
   multiple producers never write to the same position at the same time.

The following example diagrams show how it works in a 4096-byte ring buffer.

1. At first, {producer,overwrite,pending,consumer}_pos are all set to 0.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                                                                       |
   |                                                                       |
   |                                                                       |
   +-----------------------------------------------------------------------+
   ^
   |
   |
producer_pos = 0
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

2. Now reserve a 512-byte event A.

   There is enough free space, so A is allocated at offset 0. And producer_pos
   is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is
   set.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                                                              |
   |   A    |                                                              |
   | [BUSY] |                                                              |
   +-----------------------------------------------------------------------+
   ^        ^
   |        |
   |        |
   |    producer_pos = 512
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

3. Reserve event B, size 1024.

   B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced
   to the end of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                            |
   |   A    |        B        |                                            |
   | [BUSY] |      [BUSY]     |                                            |
   +-----------------------------------------------------------------------+
   ^                          ^
   |                          |
   |                          |
   |                   producer_pos = 1536
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

4. Reserve event C, size 2048.

   C is allocated at offset 1536, and producer_pos is advanced to 3584.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   | [BUSY] |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                                                              ^
   |                                                              |
   |                                                              |
   |                                                    producer_pos = 3584
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

5. Submit event A.

   The BUSY bit of A is cleared. B becomes the oldest event to be committed, so
   pending_pos is advanced to 512, the start of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^        ^                                                     ^
   |        |                                                     |
   |        |                                                     |
   |   pending_pos = 512                                  producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

6. Submit event B.

   The BUSY bit of B is cleared, and pending_pos is advanced to the start of C,
   which is now the oldest event to be committed.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |                 |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                          ^                                   ^
   |                          |                                   |
   |                          |                                   |
   |                     pending_pos = 1536               producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

7. Reserve event D, size 1536 (3 * 512).

   There are 2048 bytes not being written between producer_pos (currently 3584)
   and pending_pos, so D is allocated at offset 3584, and producer_pos is advanced
   by 1536 (from 3584 to 5120).

   Since event D will overwrite all bytes of event A and the first 512 bytes of
   event B, overwrite_pos is advanced to the start of event C, the oldest event
   that is not overwritten.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |      [BUSY]     |        |               [BUSY]              | [BUSY] |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   pending_pos = 1536
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |
consumer_pos = 0

8. Reserve event E, size 1024.

   Although there are 512 bytes not being written between producer_pos and
   pending_pos, E cannot be reserved, as it would overwrite the first 512
   bytes of event C, which is still being written.

9. Submit events C and D.

   pending_pos is advanced to the end of D.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |                 |        |                                   |        |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |             pending_pos=5120
   |
consumer_pos = 0

The performance data for overwrite mode will be provided in a follow-up
patch that adds overwrite-mode benchmarks.

A sample of performance data for non-overwrite mode, collected on an x86_64
CPU and an arm64 CPU, before and after this patch, is shown below. As we can
see, no obvious performance regression occurs.

- x86_64 (AMD EPYC 9654)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  8.155 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s)

- arm64 (HiSilicon Kunpeng 920)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com
---
 include/uapi/linux/bpf.h       |   4 ++
 kernel/bpf/ringbuf.c           | 114 ++++++++++++++++++++++++++++++++++-------
 tools/include/uapi/linux/bpf.h |   4 ++
 3 files changed, 103 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77edd0253989..1d73f165394d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1430,6 +1430,9 @@ enum {
 
 /* Do not translate kernel bpf_arena pointers to user pointers */
 	BPF_F_NO_USER_CONV	= (1U << 18),
+
+/* Enable BPF ringbuf overwrite mode */
+	BPF_F_RB_OVERWRITE	= (1U << 19),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -6231,6 +6234,7 @@ enum {
 	BPF_RB_RING_SIZE = 1,
 	BPF_RB_CONS_POS = 2,
 	BPF_RB_PROD_POS = 3,
+	BPF_RB_OVERWRITE_POS = 4,
 };
 
 /* BPF ring buffer constants */
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 719d73299397..cbfa109e907e 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -13,7 +13,7 @@
 #include <linux/btf_ids.h>
 #include <asm/rqspinlock.h>
 
-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
 
 /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
 #define RINGBUF_PGOFF \
@@ -30,6 +30,7 @@ struct bpf_ringbuf {
 	u64 mask;
 	struct page **pages;
 	int nr_pages;
+	bool overwrite_mode;
 	rqspinlock_t spinlock ____cacheline_aligned_in_smp;
 	/* For user-space producer ring buffers, an atomic_t busy bit is used
 	 * to synchronize access to the ring buffers in the kernel, rather than
@@ -73,6 +74,7 @@ struct bpf_ringbuf {
 	unsigned long consumer_pos __aligned(PAGE_SIZE);
 	unsigned long producer_pos __aligned(PAGE_SIZE);
 	unsigned long pending_pos;
+	unsigned long overwrite_pos; /* position after the last overwritten record */
 	char data[] __aligned(PAGE_SIZE);
 };
 
@@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work)
  * considering that the maximum value of data_sz is (4GB - 1), there
  * will be no overflow, so just note the size limit in the comments.
  */
-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
 {
 	struct bpf_ringbuf *rb;
 
@@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
 	rb->consumer_pos = 0;
 	rb->producer_pos = 0;
 	rb->pending_pos = 0;
+	rb->overwrite_mode = overwrite_mode;
 
 	return rb;
 }
 
 static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
 {
+	bool overwrite_mode = false;
 	struct bpf_ringbuf_map *rb_map;
 
 	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->map_flags & BPF_F_RB_OVERWRITE) {
+		if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
+			return ERR_PTR(-EINVAL);
+		overwrite_mode = true;
+	}
+
 	if (attr->key_size || attr->value_size ||
 	    !is_power_of_2(attr->max_entries) ||
 	    !PAGE_ALIGNED(attr->max_entries))
@@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
 
 	bpf_map_init_from_attr(&rb_map->map, attr);
 
-	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
 	if (!rb_map->rb) {
 		bpf_map_area_free(rb_map);
 		return ERR_PTR(-ENOMEM);
@@ -293,13 +303,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
 	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
 }
 
+/*
+ * Return an estimate of the available data in the ring buffer.
+ * Note: the returned value can exceed the actual ring buffer size because the
+ * function is not synchronized with the producer. The producer acquires the
+ * ring buffer's spinlock, but this function does not.
+ */
 static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
 {
-	unsigned long cons_pos, prod_pos;
+	unsigned long cons_pos, prod_pos, over_pos;
 
 	cons_pos = smp_load_acquire(&rb->consumer_pos);
-	prod_pos = smp_load_acquire(&rb->producer_pos);
-	return prod_pos - cons_pos;
+
+	if (unlikely(rb->overwrite_mode)) {
+		over_pos = smp_load_acquire(&rb->overwrite_pos);
+		prod_pos = smp_load_acquire(&rb->producer_pos);
+		return prod_pos - max(cons_pos, over_pos);
+	} else {
+		prod_pos = smp_load_acquire(&rb->producer_pos);
+		return prod_pos - cons_pos;
+	}
 }
 
 static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
@@ -402,11 +425,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
 	return (void*)((addr & PAGE_MASK) - off);
 }
 
+static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
+				  unsigned long new_prod_pos,
+				  unsigned long cons_pos,
+				  unsigned long pend_pos)
+{
+	/*
+	 * No space if the span from the oldest not-yet-committed record to
+	 * the newest record exceeds (ringbuf_size - 1).
+	 */
+	if (new_prod_pos - pend_pos > rb->mask)
+		return false;
+
+	/* Ok, we have space in overwrite mode */
+	if (unlikely(rb->overwrite_mode))
+		return true;
+
+	/*
+	 * No space if producer position advances more than (ringbuf_size - 1)
+	 * ahead of consumer position when not in overwrite mode.
+	 */
+	if (new_prod_pos - cons_pos > rb->mask)
+		return false;
+
+	return true;
+}
+
+static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
+{
+	hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
+	return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
+}
+
 static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
 {
-	unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
+	unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
 	struct bpf_ringbuf_hdr *hdr;
-	u32 len, pg_off, tmp_size, hdr_len;
+	u32 len, pg_off, hdr_len;
 
 	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
 		return NULL;
@@ -429,24 +484,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
 		hdr_len = READ_ONCE(hdr->len);
 		if (hdr_len & BPF_RINGBUF_BUSY_BIT)
 			break;
-		tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
-		tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
-		pend_pos += tmp_size;
+		pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
 	}
 	rb->pending_pos = pend_pos;
 
-	/* check for out of ringbuf space:
-	 * - by ensuring producer position doesn't advance more than
-	 *   (ringbuf_size - 1) ahead
-	 * - by ensuring oldest not yet committed record until newest
-	 *   record does not span more than (ringbuf_size - 1)
-	 */
-	if (new_prod_pos - cons_pos > rb->mask ||
-	    new_prod_pos - pend_pos > rb->mask) {
+	if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
 		raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
 		return NULL;
 	}
 
+	/*
+	 * In overwrite mode, advance overwrite_pos when the ring buffer is full.
+	 * The key points are to stay on record boundaries and consume enough records
+	 * to fit the new one.
+	 */
+	if (unlikely(rb->overwrite_mode)) {
+		over_pos = rb->overwrite_pos;
+		while (new_prod_pos - over_pos > rb->mask) {
+			hdr = (void *)rb->data + (over_pos & rb->mask);
+			hdr_len = READ_ONCE(hdr->len);
+			/*
+			 * The bpf_ringbuf_has_space() check above ensures we won't
+			 * step over a record currently being worked on by another
+			 * producer.
+			 */
+			over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+		}
+		/*
+		 * smp_store_release(&rb->producer_pos, new_prod_pos) at
+		 * the end of the function ensures that when consumer sees
+		 * the updated rb->producer_pos, it always sees the updated
+		 * rb->overwrite_pos, so when consumer reads overwrite_pos
+		 * after smp_load_acquire(&rb->producer_pos), the overwrite_pos
+		 * will always be valid.
+		 */
+		WRITE_ONCE(rb->overwrite_pos, over_pos);
+	}
+
 	hdr = (void *)rb->data + (prod_pos & rb->mask);
 	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
 	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
@@ -576,6 +650,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
 		return smp_load_acquire(&rb->consumer_pos);
 	case BPF_RB_PROD_POS:
 		return smp_load_acquire(&rb->producer_pos);
+	case BPF_RB_OVERWRITE_POS:
+		return smp_load_acquire(&rb->overwrite_pos);
 	default:
 		return 0;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77edd0253989..1d73f165394d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1430,6 +1430,9 @@ enum {
 
 /* Do not translate kernel bpf_arena pointers to user pointers */
 	BPF_F_NO_USER_CONV	= (1U << 18),
+
+/* Enable BPF ringbuf overwrite mode */
+	BPF_F_RB_OVERWRITE	= (1U << 19),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -6231,6 +6234,7 @@ enum {
 	BPF_RB_RING_SIZE = 1,
 	BPF_RB_CONS_POS = 2,
 	BPF_RB_PROD_POS = 3,
+	BPF_RB_OVERWRITE_POS = 4,
 };
 
 /* BPF ring buffer constants */
-- 
cgit v1.2.3


From b94d45b6bbb42571ec225d3be0e7457c8765a5b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 Oct 2025 09:56:38 +0100
Subject: seqlock: Allow KASAN to fail optimizing

Some KASAN builds are failing to properly optimize this code --
luckily we don't care about code quality for KASAN builds, so just
exclude it.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202510251641.idrNXhv5-lkp@intel.com/
---
 include/linux/seqlock.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index b7bcc4111e90..a8a8661839b6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1234,11 +1234,14 @@ static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
 
 extern void __scoped_seqlock_invalid_target(void);
 
-#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000
+#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN)
 /*
  * For some reason some GCC-8 architectures (nios2, alpha) have trouble
  * determining that the ss_done state is impossible in __scoped_seqlock_next()
  * below.
+ *
+ * Similarly KASAN is known to confuse compilers enough to break this. But we
+ * don't care about code quality for KASAN builds anyway.
  */
 static inline void __scoped_seqlock_bug(void) { }
 #else
-- 
cgit v1.2.3


From 90a18c512884adb49ddc2fb30e94594169aae808 Mon Sep 17 00:00:00 2001
From: Antonio Borneo <antonio.borneo@foss.st.com>
Date: Thu, 23 Oct 2025 15:26:50 +0200
Subject: pinctrl: pinconf-generic: Handle string values for generic properties

Allow a generic pinconf property to specify its argument as one of
the strings in a match list.
Convert the matching string to an integer value using the index in
the list, then keep using this value in the generic pinconf code.

Signed-off-by: Antonio Borneo <antonio.borneo@foss.st.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 57 ++++++++++++++++++++++++---------
 include/linux/pinctrl/pinconf-generic.h | 11 +++++--
 2 files changed, 50 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index e3d10bbcdaeb..72906d71ae1a 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -65,11 +65,12 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev,
 	int i;
 
 	for (i = 0; i < nitems; i++) {
+		const struct pin_config_item *item = &items[i];
 		unsigned long config;
 		int ret;
 
 		/* We want to check out this parameter */
-		config = pinconf_to_config_packed(items[i].param, 0);
+		config = pinconf_to_config_packed(item->param, 0);
 		if (gname)
 			ret = pin_config_group_get(dev_name(pctldev->dev),
 						   gname, &config);
@@ -86,15 +87,22 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev,
 		if (*print_sep)
 			seq_puts(s, ", ");
 		*print_sep = 1;
-		seq_puts(s, items[i].display);
+		seq_puts(s, item->display);
 		/* Print unit if available */
-		if (items[i].has_arg) {
+		if (item->has_arg) {
 			u32 val = pinconf_to_config_argument(config);
 
-			if (items[i].format)
-				seq_printf(s, " (%u %s)", val, items[i].format);
+			if (item->format)
+				seq_printf(s, " (%u %s)", val, item->format);
 			else
 				seq_printf(s, " (0x%x)", val);
+
+			if (item->values && item->num_values) {
+				if (val < item->num_values)
+					seq_printf(s, " \"%s\"", item->values[val]);
+				else
+					seq_puts(s, " \"(unknown)\"");
+			}
 		}
 	}
 }
@@ -205,10 +213,10 @@ static const struct pinconf_generic_params dt_params[] = {
  * @ncfg. @ncfg is updated to reflect the number of entries after parsing. @cfg
  * needs to have enough memory allocated to hold all possible entries.
  */
-static void parse_dt_cfg(struct device_node *np,
-			 const struct pinconf_generic_params *params,
-			 unsigned int count, unsigned long *cfg,
-			 unsigned int *ncfg)
+static int parse_dt_cfg(struct device_node *np,
+			const struct pinconf_generic_params *params,
+			unsigned int count, unsigned long *cfg,
+			unsigned int *ncfg)
 {
 	int i;
 
@@ -217,7 +225,19 @@ static void parse_dt_cfg(struct device_node *np,
 		int ret;
 		const struct pinconf_generic_params *par = &params[i];
 
-		ret = of_property_read_u32(np, par->property, &val);
+		if (par->values && par->num_values) {
+			ret = fwnode_property_match_property_string(of_fwnode_handle(np),
+								    par->property,
+								    par->values, par->num_values);
+			if (ret == -ENOENT)
+				return ret;
+			if (ret >= 0) {
+				val = ret;
+				ret = 0;
+			}
+		} else {
+			ret = of_property_read_u32(np, par->property, &val);
+		}
 
 		/* property not found */
 		if (ret == -EINVAL)
@@ -231,6 +251,8 @@ static void parse_dt_cfg(struct device_node *np,
 		cfg[*ncfg] = pinconf_to_config_packed(par->param, val);
 		(*ncfg)++;
 	}
+
+	return 0;
 }
 
 /**
@@ -323,13 +345,16 @@ int pinconf_generic_parse_dt_config(struct device_node *np,
 	if (!cfg)
 		return -ENOMEM;
 
-	parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg);
+	ret = parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg);
+	if (ret)
+		return ret;
 	if (pctldev && pctldev->desc->num_custom_params &&
-		pctldev->desc->custom_params)
-		parse_dt_cfg(np, pctldev->desc->custom_params,
-			     pctldev->desc->num_custom_params, cfg, &ncfg);
-
-	ret = 0;
+		pctldev->desc->custom_params) {
+		ret = parse_dt_cfg(np, pctldev->desc->custom_params,
+				   pctldev->desc->num_custom_params, cfg, &ncfg);
+		if (ret)
+			return ret;
+	}
 
 	/* no configs found at all */
 	if (ncfg == 0) {
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index d9245ecec71d..f82add5d3302 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -181,21 +181,28 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param
 	return PIN_CONF_PACKED(param, argument);
 }
 
-#define PCONFDUMP(a, b, c, d) {					\
-	.param = a, .display = b, .format = c, .has_arg = d	\
+#define PCONFDUMP_WITH_VALUES(a, b, c, d, e, f) {		\
+	.param = a, .display = b, .format = c, .has_arg = d,	\
+	.values = e, .num_values = f				\
 	}
 
+#define PCONFDUMP(a, b, c, d)	PCONFDUMP_WITH_VALUES(a, b, c, d, NULL, 0)
+
 struct pin_config_item {
 	const enum pin_config_param param;
 	const char * const display;
 	const char * const format;
 	bool has_arg;
+	const char * const *values;
+	size_t num_values;
 };
 
 struct pinconf_generic_params {
 	const char * const property;
 	enum pin_config_param param;
 	u32 default_value;
+	const char * const *values;
+	size_t num_values;
 };
 
 int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev,
-- 
cgit v1.2.3


From 55c7f5ef904fc2dcc7ef5945c5efb0cd60b46d32 Mon Sep 17 00:00:00 2001
From: Antonio Borneo <antonio.borneo@foss.st.com>
Date: Thu, 23 Oct 2025 15:26:51 +0200
Subject: pinctrl: pinconf-generic: Add properties 'skew-delay-{in,out}put-ps'

Add the properties 'skew-delay-input-ps' and 'skew-delay-output-ps'
to the generic parameters used for parsing DT files. This allows
specifying an independent skew delay value for each of the two directions.
This enables drivers that use the generic pin configuration to get
the value passed through these new properties.

Signed-off-by: Antonio Borneo <antonio.borneo@foss.st.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 4 ++++
 include/linux/pinctrl/pinconf-generic.h | 8 ++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 72906d71ae1a..366775841c63 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -54,6 +54,8 @@ static const struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false),
 	PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL, true),
 	PCONFDUMP(PIN_CONFIG_SKEW_DELAY, "skew delay", NULL, true),
+	PCONFDUMP(PIN_CONFIG_SKEW_DELAY_INPUT_PS, "input skew delay", "ps", true),
+	PCONFDUMP(PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, "output skew delay", "ps", true),
 };
 
 static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev,
@@ -198,6 +200,8 @@ static const struct pinconf_generic_params dt_params[] = {
 	{ "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 },
 	{ "slew-rate", PIN_CONFIG_SLEW_RATE, 0 },
 	{ "skew-delay", PIN_CONFIG_SKEW_DELAY, 0 },
+	{ "skew-delay-input-ps", PIN_CONFIG_SKEW_DELAY_INPUT_PS, 0 },
+	{ "skew-delay-output-ps", PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, 0 },
 };
 
 /**
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index f82add5d3302..1be4032071c2 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -112,6 +112,12 @@ struct pinctrl_map;
  *	or latch delay (on outputs) this parameter (in a custom format)
  *	specifies the clock skew or latch delay. It typically controls how
  *	many double inverters are put in front of the line.
+ * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the
+ *	programmable skew rate (on inputs) and latch delay (on outputs), then
+ *	this parameter specifies the clock skew only. The argument is in ps.
+ * @PIN_CONFIG_SKEW_DELAY_OUTPUT_PS: if the pin has independent values for the
+ *	programmable skew rate (on inputs) and latch delay (on outputs), then
+ *	this parameter specifies the latch delay only. The argument is in ps.
  * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state.
  * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to
  *	this parameter (on a custom format) tells the driver which alternative
@@ -147,6 +153,8 @@ enum pin_config_param {
 	PIN_CONFIG_PERSIST_STATE,
 	PIN_CONFIG_POWER_SOURCE,
 	PIN_CONFIG_SKEW_DELAY,
+	PIN_CONFIG_SKEW_DELAY_INPUT_PS,
+	PIN_CONFIG_SKEW_DELAY_OUTPUT_PS,
 	PIN_CONFIG_SLEEP_HARDWARE_STATE,
 	PIN_CONFIG_SLEW_RATE,
 	PIN_CONFIG_END = 0x7F,
-- 
cgit v1.2.3


From 7718f2a8b87af7363d60819ac0ac0da8b2f8ff00 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Thu, 23 Oct 2025 12:16:57 +0300
Subject: net/mlx5: Add software system image GUID infrastructure

Replace direct hardware system image GUID usage with a new software
system image GUID function that supports variable-length identifiers.

Key changes:
- Add mlx5_query_nic_sw_system_image_guid() function with length parameter.
- Update all callsites to use the new function and buffer/length approach.
- Modify mapping contexts to use byte arrays instead of u64 keys.
- Update devcom matching to support variable-length keys.
- Change mlx5_same_hw_devs() to use buffer comparison instead of u64.

This refactoring prepares the infrastructure for balance ID support,
which requires extending the system image GUID with additional data.
The change maintains backward compatibility while enabling future
enhancements.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      | 12 ++++++----
 .../net/ethernet/mellanox/mlx5/core/en/devlink.c   |  7 ++----
 .../net/ethernet/mellanox/mlx5/core/en/mapping.c   | 13 +++++++----
 .../net/ethernet/mellanox/mlx5/core/en/mapping.h   |  3 ++-
 .../ethernet/mellanox/mlx5/core/en/tc/int_port.c   |  8 ++++---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 11 +++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 26 +++++++++++++---------
 .../ethernet/mellanox/mlx5/core/esw/devlink_port.c |  6 +----
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  8 ++++---
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c  |  4 +++-
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.h   |  2 ++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 15 +++++++++++++
 include/linux/mlx5/driver.h                        |  3 +++
 14 files changed, 80 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 891bbbbfbbf1..64c04f52990f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -564,10 +564,14 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev)
 
 bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev)
 {
-	u64 fsystem_guid, psystem_guid;
+	u8 fsystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES];
+	u8 psystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES];
+	u8 flen;
+	u8 plen;
 
-	fsystem_guid = mlx5_query_nic_system_image_guid(dev);
-	psystem_guid = mlx5_query_nic_system_image_guid(peer_dev);
+	mlx5_query_nic_sw_system_image_guid(dev, fsystem_guid, &flen);
+	mlx5_query_nic_sw_system_image_guid(peer_dev, psystem_guid, &plen);
 
-	return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid);
+	return plen && flen && flen == plen &&
+		!memcmp(fsystem_guid, psystem_guid, flen);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
index 0b1ac6e5c890..8818f65d1fbc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
@@ -40,11 +40,8 @@ void mlx5e_destroy_devlink(struct mlx5e_dev *mlx5e_dev)
 static void
 mlx5e_devlink_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid)
 {
-	u64 parent_id;
-
-	parent_id = mlx5_query_nic_system_image_guid(dev);
-	ppid->id_len = sizeof(parent_id);
-	memcpy(ppid->id, &parent_id, sizeof(parent_id));
+	BUILD_BUG_ON(MLX5_SW_IMAGE_GUID_MAX_BYTES > MAX_PHYS_ITEM_ID_LEN);
+	mlx5_query_nic_sw_system_image_guid(dev, ppid->id, &ppid->id_len);
 }
 
 int mlx5e_devlink_port_register(struct mlx5e_dev *mlx5e_dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c
index 4e72ca8070e2..1de18c7e96ec 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c
@@ -6,6 +6,7 @@
 #include <linux/xarray.h>
 #include <linux/hashtable.h>
 #include <linux/refcount.h>
+#include <linux/mlx5/driver.h>
 
 #include "mapping.h"
 
@@ -24,7 +25,8 @@ struct mapping_ctx {
 	struct delayed_work dwork;
 	struct list_head pending_list;
 	spinlock_t pending_list_lock; /* Guards pending list */
-	u64 id;
+	u8 id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
+	u8 id_len;
 	u8 type;
 	struct list_head list;
 	refcount_t refcount;
@@ -220,13 +222,15 @@ mapping_create(size_t data_size, u32 max_id, bool delayed_removal)
 }
 
 struct mapping_ctx *
-mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal)
+mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id,
+		      bool delayed_removal)
 {
 	struct mapping_ctx *ctx;
 
 	mutex_lock(&shared_ctx_lock);
 	list_for_each_entry(ctx, &shared_ctx_list, list) {
-		if (ctx->id == id && ctx->type == type) {
+		if (ctx->type == type && ctx->id_len == id_len &&
+		    !memcmp(id, ctx->id, id_len)) {
 			if (refcount_inc_not_zero(&ctx->refcount))
 				goto unlock;
 			break;
@@ -237,7 +241,8 @@ mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delaye
 	if (IS_ERR(ctx))
 		goto unlock;
 
-	ctx->id = id;
+	memcpy(ctx->id, id, id_len);
+	ctx->id_len = id_len;
 	ctx->type = type;
 	list_add(&ctx->list, &shared_ctx_list);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h
index 4e2119f0f4c1..e86a103d58b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h
@@ -27,6 +27,7 @@ void mapping_destroy(struct mapping_ctx *ctx);
 /* adds mapping with an id or get an existing mapping with the same id
  */
 struct mapping_ctx *
-mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal);
+mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id,
+		      bool delayed_removal);
 
 #endif /* __MLX5_MAPPING_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
index 896f718483c3..991f47050643 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
@@ -307,7 +307,8 @@ mlx5e_tc_int_port_init(struct mlx5e_priv *priv)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5e_tc_int_port_priv *int_port_priv;
-	u64 mapping_id;
+	u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
+	u8 id_len;
 
 	if (!mlx5e_tc_int_port_supported(esw))
 		return NULL;
@@ -316,9 +317,10 @@ mlx5e_tc_int_port_init(struct mlx5e_priv *priv)
 	if (!int_port_priv)
 		return NULL;
 
-	mapping_id = mlx5_query_nic_system_image_guid(priv->mdev);
+	mlx5_query_nic_sw_system_image_guid(priv->mdev, mapping_id, &id_len);
 
-	int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_INT_PORT,
+	int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, id_len,
+								MAPPING_TYPE_INT_PORT,
 								sizeof(u32) * 2,
 								(1 << ESW_VPORT_BITS) - 1, true);
 	if (IS_ERR(int_port_priv->metadata_mapping)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
index 870d12364f99..fc0e57403d25 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
@@ -2287,9 +2287,10 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains,
 		enum mlx5_flow_namespace_type ns_type,
 		struct mlx5e_post_act *post_act)
 {
+	u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
 	struct mlx5_tc_ct_priv *ct_priv;
 	struct mlx5_core_dev *dev;
-	u64 mapping_id;
+	u8 id_len;
 	int err;
 
 	dev = priv->mdev;
@@ -2301,16 +2302,18 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains,
 	if (!ct_priv)
 		goto err_alloc;
 
-	mapping_id = mlx5_query_nic_system_image_guid(dev);
+	mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len);
 
-	ct_priv->zone_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_ZONE,
+	ct_priv->zone_mapping = mapping_create_for_id(mapping_id, id_len,
+						      MAPPING_TYPE_ZONE,
 						      sizeof(u16), 0, true);
 	if (IS_ERR(ct_priv->zone_mapping)) {
 		err = PTR_ERR(ct_priv->zone_mapping);
 		goto err_mapping_zone;
 	}
 
-	ct_priv->labels_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_LABELS,
+	ct_priv->labels_mapping = mapping_create_for_id(mapping_id, id_len,
+							MAPPING_TYPE_LABELS,
 							sizeof(u32) * 4, 0, true);
 	if (IS_ERR(ct_priv->labels_mapping)) {
 		err = PTR_ERR(ct_priv->labels_mapping);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 54ccfb4e6c4e..a8773b2342c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5233,10 +5233,11 @@ static void mlx5e_tc_nic_destroy_miss_table(struct mlx5e_priv *priv)
 int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tc_table *tc = mlx5e_fs_get_tc(priv->fs);
+	u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mapping_ctx *chains_mapping;
 	struct mlx5_chains_attr attr = {};
-	u64 mapping_id;
+	u8 id_len;
 	int err;
 
 	mlx5e_mod_hdr_tbl_init(&tc->mod_hdr);
@@ -5252,11 +5253,13 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
 	lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key);
 	lockdep_init_map(&tc->ht.run_work.lockdep_map, "tc_ht_wq_key", &tc_ht_wq_key, 0);
 
-	mapping_id = mlx5_query_nic_system_image_guid(dev);
+	mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len);
 
-	chains_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN,
+	chains_mapping = mapping_create_for_id(mapping_id, id_len,
+					       MAPPING_TYPE_CHAIN,
 					       sizeof(struct mlx5_mapped_obj),
-					       MLX5E_TC_TABLE_CHAIN_TAG_MASK, true);
+					       MLX5E_TC_TABLE_CHAIN_TAG_MASK,
+					       true);
 
 	if (IS_ERR(chains_mapping)) {
 		err = PTR_ERR(chains_mapping);
@@ -5387,14 +5390,15 @@ void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht)
 int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 {
 	const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts);
+	u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
 	struct mlx5_devcom_match_attr attr = {};
 	struct netdev_phys_item_id ppid;
 	struct mlx5e_rep_priv *rpriv;
 	struct mapping_ctx *mapping;
 	struct mlx5_eswitch *esw;
 	struct mlx5e_priv *priv;
-	u64 mapping_id;
 	int err = 0;
+	u8 id_len;
 
 	rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv);
 	priv = netdev_priv(rpriv->netdev);
@@ -5412,9 +5416,9 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 
 	uplink_priv->tc_psample = mlx5e_tc_sample_init(esw, uplink_priv->post_act);
 
-	mapping_id = mlx5_query_nic_system_image_guid(esw->dev);
+	mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len);
 
-	mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL,
+	mapping = mapping_create_for_id(mapping_id, id_len, MAPPING_TYPE_TUNNEL,
 					sizeof(struct tunnel_match_key),
 					TUNNEL_INFO_BITS_MASK, true);
 
@@ -5427,8 +5431,10 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 	/* Two last values are reserved for stack devices slow path table mark
 	 * and bridge ingress push mark.
 	 */
-	mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL_ENC_OPTS,
-					sz_enc_opts, ENC_OPTS_BITS_MASK - 2, true);
+	mapping = mapping_create_for_id(mapping_id, id_len,
+					MAPPING_TYPE_TUNNEL_ENC_OPTS,
+					sz_enc_opts, ENC_OPTS_BITS_MASK - 2,
+					true);
 	if (IS_ERR(mapping)) {
 		err = PTR_ERR(mapping);
 		goto err_enc_opts_mapping;
@@ -5449,7 +5455,7 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 
 	err = netif_get_port_parent_id(priv->netdev, &ppid, false);
 	if (!err) {
-		memcpy(&attr.key.val, &ppid.id, sizeof(attr.key.val));
+		memcpy(&attr.key.buf, &ppid.id, ppid.id_len);
 		attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS;
 		attr.net = mlx5_core_net(esw->dev);
 		mlx5_esw_offloads_devcom_init(esw, &attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
index cf88a106d80d..89a58dee50b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
@@ -7,11 +7,7 @@
 static void
 mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid)
 {
-	u64 parent_id;
-
-	parent_id = mlx5_query_nic_system_image_guid(dev);
-	ppid->id_len = sizeof(parent_id);
-	memcpy(ppid->id, &parent_id, sizeof(parent_id));
+	mlx5_query_nic_sw_system_image_guid(dev, ppid->id, &ppid->id_len);
 }
 
 static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 34749814f19b..9735a75732cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3557,10 +3557,11 @@ bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 cont
 
 int esw_offloads_enable(struct mlx5_eswitch *esw)
 {
+	u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES];
 	struct mapping_ctx *reg_c0_obj_pool;
 	struct mlx5_vport *vport;
 	unsigned long i;
-	u64 mapping_id;
+	u8 id_len;
 	int err;
 
 	mutex_init(&esw->offloads.termtbl_mutex);
@@ -3582,9 +3583,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	if (err)
 		goto err_vport_metadata;
 
-	mapping_id = mlx5_query_nic_system_image_guid(esw->dev);
+	mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len);
 
-	reg_c0_obj_pool = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN,
+	reg_c0_obj_pool = mapping_create_for_id(mapping_id, id_len,
+						MAPPING_TYPE_CHAIN,
 						sizeof(struct mlx5_mapped_obj),
 						ESW_REG_C0_USER_DATA_METADATA_MASK,
 						true);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 3db0387bf6dc..1ac933cd8f02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1418,10 +1418,12 @@ static void mlx5_lag_unregister_hca_devcom_comp(struct mlx5_core_dev *dev)
 static int mlx5_lag_register_hca_devcom_comp(struct mlx5_core_dev *dev)
 {
 	struct mlx5_devcom_match_attr attr = {
-		.key.val = mlx5_query_nic_system_image_guid(dev),
 		.flags = MLX5_DEVCOM_MATCH_FLAGS_NS,
 		.net = mlx5_core_net(dev),
 	};
+	u8 len __always_unused;
+
+	mlx5_query_nic_sw_system_image_guid(dev, attr.key.buf, &len);
 
 	/* This component is use to sync adding core_dev to lag_dev and to sync
 	 * changes of mlx5_adev_devices between LAG layer and other layers.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index 609c85f47917..91e5ae529d5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -10,8 +10,10 @@ enum mlx5_devom_match_flags {
 	MLX5_DEVCOM_MATCH_FLAGS_NS = BIT(0),
 };
 
+#define MLX5_DEVCOM_MATCH_KEY_MAX 32
 union mlx5_devcom_match_key {
 	u64 val;
+	u8 buf[MLX5_DEVCOM_MATCH_KEY_MAX];
 };
 
 struct mlx5_devcom_match_attr {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 082259b56816..acef7d0ffa09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -444,6 +444,8 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev);
 void mlx5_uninit_one_light(struct mlx5_core_dev *dev);
 void mlx5_unload_one_light(struct mlx5_core_dev *dev);
 
+void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf,
+					 u8 *len);
 int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport,
 				  u16 opmod);
 #define mlx5_vport_get_other_func_general_cap(dev, vport, out)		\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 2ed2e530b07d..4224e2750865 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1190,6 +1190,21 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
 
+void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf,
+					 u8 *len)
+{
+	u64 fw_system_image_guid;
+
+	*len = 0;
+
+	fw_system_image_guid = mlx5_query_nic_system_image_guid(mdev);
+	if (!fw_system_image_guid)
+		return;
+
+	memcpy(buf, &fw_system_image_guid, sizeof(fw_system_image_guid));
+	*len += sizeof(fw_system_image_guid);
+}
+
 static bool mlx5_vport_use_vhca_id_as_func_id(struct mlx5_core_dev *dev,
 					      u16 vport_num, u16 *vhca_id)
 {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5405ca1038f9..dcf262aa9ea6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1379,4 +1379,7 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
 {
 	return devlink_net(priv_to_devlink(dev));
 }
+
+#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8
+
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From 20d78ead947783b039b02ca4b8c551b4d1894759 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Thu, 23 Oct 2025 12:17:00 +0300
Subject: net/mlx5: Add balance ID support for LAG multiplane groups

Implement balance ID support for multiplane LAG configurations. This
feature enables per-multiplane group load balancing by extending the
software system image GUID with a balance ID component.

Key implementations:
- Enable lag_per_mp_group capability when supported by hardware.
- Append load_balance_id to software system image GUID when conditions
  are met.
- Increase MLX5_SW_IMAGE_GUID_MAX_BYTES from 8 to 9 to accommodate the
  extra byte.

The balance ID is appended to the system image GUID only when both
load_balance_id and lag_per_mp_group capabilities are available, ensuring
backward compatibility while enabling enhanced LAG functionality.

This enhancement allows for more granular load balancing control in complex
multi-plane LAG deployments, improving network performance and flexibility.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c  | 5 +++++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 4 ++++
 include/linux/mlx5/driver.h                     | 2 +-
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 563267acf386..c904696cbc3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -575,6 +575,11 @@ static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx)
 		do_set = true;
 	}
 
+	if (MLX5_CAP_GEN_2_MAX(dev, lag_per_mp_group)) {
+		MLX5_SET(cmd_hca_cap_2, set_hca_cap, lag_per_mp_group, 1);
+		do_set = true;
+	}
+
 	/* some FW versions that support querying MLX5_CAP_GENERAL_2
 	 * capabilities but don't support setting them.
 	 * Skip unnecessary update to hca_cap_2 when no changes were introduced
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 4224e2750865..992873536c1b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1203,6 +1203,10 @@ void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf,
 
 	memcpy(buf, &fw_system_image_guid, sizeof(fw_system_image_guid));
 	*len += sizeof(fw_system_image_guid);
+
+	if (MLX5_CAP_GEN_2(mdev, load_balance_id) &&
+	    MLX5_CAP_GEN_2(mdev, lag_per_mp_group))
+		buf[(*len)++] = MLX5_CAP_GEN_2(mdev, load_balance_id);
 }
 
 static bool mlx5_vport_use_vhca_id_as_func_id(struct mlx5_core_dev *dev,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index dcf262aa9ea6..046396269ccf 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1380,6 +1380,6 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
 	return devlink_net(priv_to_devlink(dev));
 }
 
-#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8
+#define MLX5_SW_IMAGE_GUID_MAX_BYTES 9
 
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From baeb66fbd4201d1c4325074e78b1f557dff89b5b Mon Sep 17 00:00:00 2001
From: Jimmy Hu <hhhuuu@google.com>
Date: Thu, 23 Oct 2025 05:49:45 +0000
Subject: usb: gadget: udc: fix use-after-free in usb_gadget_state_work

A race condition during gadget teardown can lead to a use-after-free
in usb_gadget_state_work(), as reported by KASAN:

  BUG: KASAN: invalid-access in sysfs_notify+0x2c/0xd0
  Workqueue: events usb_gadget_state_work

The fundamental race occurs because a concurrent event (e.g., an
interrupt) can call usb_gadget_set_state() and schedule gadget->work
at any time during the cleanup process in usb_del_gadget().

Commit 399a45e5237c ("usb: gadget: core: flush gadget workqueue after
device removal") attempted to fix this by moving flush_work() to after
device_del(). However, this does not fully solve the race, as a new
work item can still be scheduled *after* flush_work() completes but
before the gadget's memory is freed, leading to the same use-after-free.

This patch fixes the race condition robustly by introducing a 'teardown'
flag and a 'state_lock' spinlock to the usb_gadget struct. The flag is
set during cleanup in usb_del_gadget() *before* calling flush_work() to
prevent any new work from being scheduled once cleanup has commenced.
The scheduling site, usb_gadget_set_state(), now checks this flag under
the lock before queueing the work, thus safely closing the race window.

Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue")
Cc: stable <stable@kernel.org>
Signed-off-by: Jimmy Hu <hhhuuu@google.com>
Link: https://patch.msgid.link/20251023054945.233861-1-hhhuuu@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/udc/core.c | 17 ++++++++++++++++-
 include/linux/usb/gadget.h    |  5 +++++
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 694653761c44..8dbe79bdc0f9 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work)
 void usb_gadget_set_state(struct usb_gadget *gadget,
 		enum usb_device_state state)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&gadget->state_lock, flags);
 	gadget->state = state;
-	schedule_work(&gadget->work);
+	if (!gadget->teardown)
+		schedule_work(&gadget->work);
+	spin_unlock_irqrestore(&gadget->state_lock, flags);
 	trace_usb_gadget_set_state(gadget, 0);
 }
 EXPORT_SYMBOL_GPL(usb_gadget_set_state);
@@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev)
 void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget,
 		void (*release)(struct device *dev))
 {
+	spin_lock_init(&gadget->state_lock);
+	gadget->teardown = false;
 	INIT_WORK(&gadget->work, usb_gadget_state_work);
 	gadget->dev.parent = parent;
 
@@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc);
 void usb_del_gadget(struct usb_gadget *gadget)
 {
 	struct usb_udc *udc = gadget->udc;
+	unsigned long flags;
 
 	if (!udc)
 		return;
@@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget)
 	kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE);
 	sysfs_remove_link(&udc->dev.kobj, "gadget");
 	device_del(&gadget->dev);
+	/*
+	 * Set the teardown flag before flushing the work to prevent new work
+	 * from being scheduled while we are cleaning up.
+	 */
+	spin_lock_irqsave(&gadget->state_lock, flags);
+	gadget->teardown = true;
+	spin_unlock_irqrestore(&gadget->state_lock, flags);
 	flush_work(&gadget->work);
 	ida_free(&gadget_id_numbers, gadget->id_number);
 	cancel_work_sync(&udc->vbus_work);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3aaf19e77558..8285b19a25e0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -376,6 +376,9 @@ struct usb_gadget_ops {
  *	can handle. The UDC must support this and all slower speeds and lower
  *	number of lanes.
  * @state: the state we are now (attached, suspended, configured, etc)
+ * @state_lock: Spinlock protecting the `state` and `teardown` members.
+ * @teardown: True if the device is undergoing teardown, used to prevent
+ *	new work from being scheduled during cleanup.
  * @name: Identifies the controller hardware type.  Used in diagnostics
  *	and sometimes configuration.
  * @dev: Driver model state for this abstract device.
@@ -451,6 +454,8 @@ struct usb_gadget {
 	enum usb_ssp_rate		max_ssp_rate;
 
 	enum usb_device_state		state;
+	spinlock_t			state_lock;
+	bool				teardown;
 	const char			*name;
 	struct device			dev;
 	unsigned			isoch_delay;
-- 
cgit v1.2.3


From d1e6d2773898c7a1c19e12619d303920d32a9cd0 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Fri, 10 Oct 2025 17:38:13 +0200
Subject: rcu: Add a small-width RCU watching counter debug option

A later commit will reduce the size of the RCU watching counter to free up
some bits for another purpose. Paul suggested adding a config option to
test the extreme case where the counter is reduced to its minimum usable
width for rcutorture to poke at, so do that.

Make it only configurable under RCU_EXPERT. While at it, add a comment to
explain the layout of context_tracking->state.

Link: http://lore.kernel.org/r/4c2cb573-168f-4806-b1d9-164e8276e66a@paulmck-laptop
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/context_tracking_state.h | 44 ++++++++++++++++++++++++++++------
 kernel/rcu/Kconfig.debug               | 15 ++++++++++++
 2 files changed, 52 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 7b8433d5a8ef..0b81248aa03e 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -18,12 +18,6 @@ enum ctx_state {
 	CT_STATE_MAX		= 4,
 };
 
-/* Odd value for watching, else even. */
-#define CT_RCU_WATCHING CT_STATE_MAX
-
-#define CT_STATE_MASK (CT_STATE_MAX - 1)
-#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK)
-
 struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 	/*
@@ -44,9 +38,45 @@ struct context_tracking {
 #endif
 };
 
+/*
+ * We cram two different things within the same atomic variable:
+ *
+ *                     CT_RCU_WATCHING_START  CT_STATE_START
+ *                                |                |
+ *                                v                v
+ *     MSB [ RCU watching counter ][ context_state ] LSB
+ *         ^                       ^
+ *         |                       |
+ * CT_RCU_WATCHING_END        CT_STATE_END
+ *
+ * Bits are used from the LSB upwards, so unused bits (if any) will always be in
+ * upper bits of the variable.
+ */
 #ifdef CONFIG_CONTEXT_TRACKING
+#define CT_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE)
+
+#define CT_STATE_WIDTH bits_per(CT_STATE_MAX - 1)
+#define CT_STATE_START 0
+#define CT_STATE_END   (CT_STATE_START + CT_STATE_WIDTH - 1)
+
+#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH)
+#define CT_RCU_WATCHING_WIDTH     (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 2 : CT_RCU_WATCHING_MAX_WIDTH)
+#define CT_RCU_WATCHING_START     (CT_STATE_END + 1)
+#define CT_RCU_WATCHING_END       (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1)
+#define CT_RCU_WATCHING           BIT(CT_RCU_WATCHING_START)
+
+#define CT_STATE_MASK        GENMASK(CT_STATE_END,        CT_STATE_START)
+#define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START)
+
+#define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH)
+
+static_assert(CT_STATE_WIDTH        +
+	      CT_RCU_WATCHING_WIDTH +
+	      CT_UNUSED_WIDTH       ==
+	      CT_SIZE);
+
 DECLARE_PER_CPU(struct context_tracking, context_tracking);
-#endif
+#endif	/* CONFIG_CONTEXT_TRACKING */
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 static __always_inline int __ct_state(void)
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 12e4c64ebae1..625d75392647 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD
 	  when looking for certain types of RCU usage bugs, for example,
 	  too-short RCU read-side critical sections.
 
+
+config RCU_DYNTICKS_TORTURE
+	bool "Minimize RCU dynticks counter size"
+	depends on RCU_EXPERT && !COMPILE_TEST
+	default n
+	help
+	  This option sets the width of the dynticks counter to its
+	  minimum usable value.  This minimum width greatly increases
+	  the probability of flushing out bugs involving counter wrap,
+	  but it also increases the probability of extending grace period
+	  durations.  This Kconfig option should therefore be avoided in
+	  production due to the consequent increased probability of OOMs.
+
+	  This has no value for production and is only for testing.
+
 endmenu # "RCU Debugging"
-- 
cgit v1.2.3


From bcce8c74f1ce1e2731ac0261287897e3768767d8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 24 Oct 2025 15:46:21 -0700
Subject: PCI: Enable host bridge emulation for PCI_DOMAINS_GENERIC platforms

The ability to emulate a host bridge is useful not only for hardware PCI
controllers like CONFIG_VMD, or virtual PCI controllers like
CONFIG_PCI_HYPERV, but also for test and development scenarios like
CONFIG_SAMPLES_DEVSEC [1].

One stumbling block for defining CONFIG_SAMPLES_DEVSEC, a sample
implementation of a platform TSM for PCI Device Security, is the need to
accommodate PCI_DOMAINS_GENERIC architectures alongside x86 [2].

In support of supplementing the existing CONFIG_PCI_BRIDGE_EMUL
infrastructure for host bridges:

* Introduce pci_bus_find_emul_domain_nr() as a common way to find a free
  PCI domain number whether that is to reuse the existing dynamic
  allocation code in the !ACPI case, or to assign an unused domain above
  the last ACPI segment.

* Convert pci-hyperv to the new allocator so that the PCI core can
  unconditionally assume that bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET
  is the dynamically allocated case.

A follow-on patch can also convert vmd to the new scheme. Currently vmd is
limited to CONFIG_PCI_DOMAINS_GENERIC=n (x86) so, unlike pci-hyperv, it
does not immediately conflict with this new pci_bus_find_emul_domain_nr()
mechanism.

Link: http://lore.kernel.org/174107249038.1288555.12362100502109498455.stgit@dwillia2-xfh.jf.intel.com [1]
Reported-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Closes: http://lore.kernel.org/20250311144601.145736-3-suzuki.poulose@arm.com [2]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Cc: Lorenzo Pieralisi <lpieralisi@kernel.org>
Cc: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Cc: Rob Herring <robh@kernel.org>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
Link: https://patch.msgid.link/20251024224622.1470555-2-dan.j.williams@intel.com
---
 drivers/pci/controller/pci-hyperv.c | 62 ++++++-------------------------------
 drivers/pci/pci.c                   | 24 +++++++++++++-
 drivers/pci/probe.c                 |  8 ++++-
 include/linux/pci.h                 |  7 +++++
 4 files changed, 46 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 146b43981b27..1e237d3538f9 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev)
 	return 0;
 }
 
-#define HVPCI_DOM_MAP_SIZE (64 * 1024)
-static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
-
-/*
- * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
- * as invalid for passthrough PCI devices of this driver.
- */
-#define HVPCI_DOM_INVALID 0
-
-/**
- * hv_get_dom_num() - Get a valid PCI domain number
- * Check if the PCI domain number is in use, and return another number if
- * it is in use.
- *
- * @dom: Requested domain number
- *
- * return: domain number on success, HVPCI_DOM_INVALID on failure
- */
-static u16 hv_get_dom_num(u16 dom)
-{
-	unsigned int i;
-
-	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
-		return dom;
-
-	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
-		if (test_and_set_bit(i, hvpci_dom_map) == 0)
-			return i;
-	}
-
-	return HVPCI_DOM_INVALID;
-}
-
-/**
- * hv_put_dom_num() - Mark the PCI domain number as free
- * @dom: Domain number to be freed
- */
-static void hv_put_dom_num(u16 dom)
-{
-	clear_bit(dom, hvpci_dom_map);
-}
-
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev,
 {
 	struct pci_host_bridge *bridge;
 	struct hv_pcibus_device *hbus;
-	u16 dom_req, dom;
+	int ret, dom;
+	u16 dom_req;
 	char *name;
-	int ret;
 
 	bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
 	if (!bridge)
@@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev,
 	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
 	 * (2) There will be no overlap between domains (after fixing possible
 	 * collisions) in the same VM.
+	 *
+	 * Because Gen1 VMs use domain 0, don't allow picking domain 0 here,
+	 * even if bytes 4 and 5 of the instance GUID are both zero. For wider
+	 * userspace compatibility, limit the domain ID to a 16-bit value.
 	 */
 	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
-	dom = hv_get_dom_num(dom_req);
-
-	if (dom == HVPCI_DOM_INVALID) {
+	dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX);
+	if (dom < 0) {
 		dev_err(&hdev->device,
 			"Unable to use dom# 0x%x or other numbers", dom_req);
 		ret = -EINVAL;
@@ -3917,7 +3878,7 @@ close:
 destroy_wq:
 	destroy_workqueue(hbus->wq);
 free_dom:
-	hv_put_dom_num(hbus->bridge->domain_nr);
+	pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr);
 free_bus:
 	kfree(hbus);
 	return ret;
@@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev)
 	irq_domain_remove(hbus->irq_domain);
 	irq_domain_free_fwnode(hbus->fwnode);
 
-	hv_put_dom_num(hbus->bridge->domain_nr);
-
 	kfree(hbus);
 }
 
@@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void)
 	if (ret)
 		return ret;
 
-	/* Set the invalid domain number's bit, so it will not be used */
-	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
-
 	/* Initialize PCI block r/w interface */
 	hvpci_block_ops.read_block = hv_read_config_block;
 	hvpci_block_ops.write_block = hv_write_config_block;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..f4ccb948e59d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6656,9 +6656,31 @@ static void pci_no_domains(void)
 #endif
 }
 
+#ifdef CONFIG_PCI_DOMAINS
+static DEFINE_IDA(pci_domain_nr_dynamic_ida);
+
+/**
+ * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints
+ * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable
+ * @min: minimum allowable domain
+ * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned
+ */
+int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max)
+{
+	return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max,
+			       GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr);
+
+void pci_bus_release_emul_domain_nr(int domain_nr)
+{
+	ida_free(&pci_domain_nr_dynamic_ida, domain_nr);
+}
+EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr);
+#endif
+
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
 static DEFINE_IDA(pci_domain_nr_static_ida);
-static DEFINE_IDA(pci_domain_nr_dynamic_ida);
 
 static void of_pci_reserve_static_domain_nr(void)
 {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index c83e75a0ec12..c3f6e2714440 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -657,6 +657,11 @@ static void pci_release_host_bridge_dev(struct device *dev)
 
 	pci_free_resource_list(&bridge->windows);
 	pci_free_resource_list(&bridge->dma_ranges);
+
+	/* Host bridges only have domain_nr set in the emulation case */
+	if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
+		pci_bus_release_emul_domain_nr(bridge->domain_nr);
+
 	kfree(bridge);
 }
 
@@ -1137,7 +1142,8 @@ unregister:
 	device_del(&bridge->dev);
 free:
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
-	pci_bus_release_domain_nr(parent, bus->domain_nr);
+	if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
+		pci_bus_release_domain_nr(parent, bus->domain_nr);
 #endif
 	if (bus_registered)
 		put_device(&bus->dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..1ef1535802b0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1956,10 +1956,17 @@ DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
  */
 #ifdef CONFIG_PCI_DOMAINS
 extern int pci_domains_supported;
+int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max);
+void pci_bus_release_emul_domain_nr(int domain_nr);
 #else
 enum { pci_domains_supported = 0 };
 static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
 static inline int pci_proc_domain(struct pci_bus *bus) { return 0; }
+static inline int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max)
+{
+	return 0;
+}
+static inline void pci_bus_release_emul_domain_nr(int domain_nr) { }
 #endif /* CONFIG_PCI_DOMAINS */
 
 /*
-- 
cgit v1.2.3


From a544d9a66bdf20eb25cc40f99e1d09c825b71b26 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 Oct 2025 19:11:16 -0400
Subject: tracing: Have syscall trace events read user space string

As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()")
system call trace events allow faulting in user space memory. Have some of
the system call trace events take advantage of this.

Use the trace_user_fault_read() logic to read the buffer from user space
and, instead of just saving the pointer to the buffer in the system call
event, also save the string that is passed in.

The syscall event has its nb_args shortened from an int to a short (where
even u8 is plenty big enough) and the freed two bytes are used for
"user_mask".  The new "user_mask" field is a bitmask whose set bit gives
the index of the "args" array entry that holds the address to read from
user space. This value is set to 0 if the system call event does not need
to read user space for a field. This mask can be used to know if the event
may fault or not. Only one bit set in user_mask is supported at this time.

This allows the output to look like this:

 sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4)
 sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.261867956@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/syscall.h       |   4 +-
 kernel/trace/trace_syscalls.c | 436 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 420 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 8e193f3a33b3..85f21ca15a41 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -16,6 +16,7 @@
  * @name: name of the syscall
  * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
+ * @user_mask: mask of @args that will read user space
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
  * @enter_fields: list of fields for syscall_enter trace event
@@ -25,7 +26,8 @@
 struct syscall_metadata {
 	const char	*name;
 	int		syscall_nr;
-	int		nb_args;
+	short		nb_args;
+	short		user_mask;
 	const char	**types;
 	const char	**args;
 	struct list_head enter_fields;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0f932b22f9ec..528ac90eda5d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <trace/syscall.h>
 #include <trace/events/syscalls.h>
+#include <linux/kernel_stat.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -123,6 +124,9 @@ const char *get_syscall_name(int syscall)
 	return entry->name;
 }
 
+/* Added to user strings when max limit is reached */
+#define EXTRA "..."
+
 static enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags,
 		    struct trace_event *event)
@@ -132,7 +136,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 	struct trace_entry *ent = iter->ent;
 	struct syscall_trace_enter *trace;
 	struct syscall_metadata *entry;
-	int i, syscall;
+	int i, syscall, val;
+	unsigned char *ptr;
+	int len;
 
 	trace = (typeof(trace))ent;
 	syscall = trace->nr;
@@ -167,6 +173,19 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		else
 			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
 					 trace->args[i]);
+
+		if (!(BIT(i) & entry->user_mask))
+			continue;
+
+		/* This arg points to a user space string */
+		ptr = (void *)trace->args + sizeof(long) * entry->nb_args;
+		val = *(int *)ptr;
+
+		/* The value is a dynamic string (len << 16 | offset) */
+		ptr = (void *)ent + (val & 0xffff);
+		len = val >> 16;
+
+		trace_seq_printf(s, " \"%.*s\"", len, ptr);
 	}
 
 	trace_seq_putc(s, ')');
@@ -223,15 +242,27 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 	for (i = 0; i < entry->nb_args; i++) {
-		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
-				entry->args[i], sizeof(unsigned long),
-				i == entry->nb_args - 1 ? "" : ", ");
+		if (i)
+			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
+				entry->args[i], sizeof(unsigned long));
+
+		if (!(BIT(i) & entry->user_mask))
+			continue;
+
+		/* Add the format for the user space string */
+		pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
 	}
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 
 	for (i = 0; i < entry->nb_args; i++) {
 		pos += snprintf(buf + pos, LEN_OR_ZERO,
 				", ((unsigned long)(REC->%s))", entry->args[i]);
+		if (!(BIT(i) & entry->user_mask))
+			continue;
+		/* The user space string for arg has name __<arg>_val */
+		pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
+				entry->args[i]);
 	}
 
 #undef LEN_OR_ZERO
@@ -277,8 +308,12 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_enter trace;
 	struct syscall_metadata *meta = call->data;
+	unsigned long mask;
+	char *arg;
 	int offset = offsetof(typeof(trace), args);
+	int idx;
 	int ret = 0;
+	int len;
 	int i;
 
 	for (i = 0; i < meta->nb_args; i++) {
@@ -291,9 +326,148 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
 		offset += sizeof(unsigned long);
 	}
 
+	if (ret || !meta->user_mask)
+		return ret;
+
+	mask = meta->user_mask;
+	idx = ffs(mask) - 1;
+
+	/*
+	 * User space strings are faulted into a temporary buffer and then
+	 * added as a dynamic string to the end of the event.
+	 * The user space string name for the arg pointer is "__<arg>_val".
+	 */
+	len = strlen(meta->args[idx]) + sizeof("___val");
+	arg = kmalloc(len, GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg)) {
+		meta->user_mask = 0;
+		return -ENOMEM;
+	}
+
+	snprintf(arg, len, "__%s_val", meta->args[idx]);
+
+	ret = trace_define_field(call, "__data_loc char[]",
+				 arg, offset, sizeof(int), 0,
+				 FILTER_OTHER);
+	if (ret)
+		kfree(arg);
 	return ret;
 }
 
+#define SYSCALL_FAULT_BUF_SZ 512
+
+/* Use the tracing per CPU buffer infrastructure to copy from user space */
+struct syscall_user_buffer {
+	struct trace_user_buf_info	buf;
+	struct rcu_head			rcu;
+};
+
+static struct syscall_user_buffer *syscall_buffer;
+
+static int syscall_fault_buffer_enable(void)
+{
+	struct syscall_user_buffer *sbuf;
+	int ret;
+
+	lockdep_assert_held(&syscall_trace_lock);
+
+	if (syscall_buffer) {
+		trace_user_fault_get(&syscall_buffer->buf);
+		return 0;
+	}
+
+	sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
+	if (!sbuf)
+		return -ENOMEM;
+
+	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
+	if (ret < 0) {
+		kfree(sbuf);
+		return ret;
+	}
+
+	WRITE_ONCE(syscall_buffer, sbuf);
+
+	return 0;
+}
+
+static void rcu_free_syscall_buffer(struct rcu_head *rcu)
+{
+	struct syscall_user_buffer *sbuf =
+		container_of(rcu, struct syscall_user_buffer, rcu);
+
+	trace_user_fault_destroy(&sbuf->buf);
+	kfree(sbuf);
+}
+
+
+static void syscall_fault_buffer_disable(void)
+{
+	struct syscall_user_buffer *sbuf = syscall_buffer;
+
+	lockdep_assert_held(&syscall_trace_lock);
+
+	if (trace_user_fault_put(&sbuf->buf))
+		return;
+
+	WRITE_ONCE(syscall_buffer, NULL);
+	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
+}
+
+static int syscall_copy_user(char *buf, const char __user *ptr,
+			     size_t size, void *data)
+{
+	unsigned long *ret_size = data;
+	int ret;
+
+	ret = strncpy_from_user(buf, ptr, size);
+	if (ret < 0)
+		return 1;
+	*ret_size = ret;
+	return 0;
+}
+
+static char *sys_fault_user(struct syscall_metadata *sys_data,
+			    struct syscall_user_buffer *sbuf,
+			    unsigned long *args, unsigned int *data_size)
+{
+	unsigned long size = SYSCALL_FAULT_BUF_SZ - 1;
+	unsigned long mask = sys_data->user_mask;
+	int idx = ffs(mask) - 1;
+	char *ptr;
+	char *buf;
+
+	/* Get the pointer to user space memory to read */
+	ptr = (char *)args[idx];
+	*data_size = 0;
+
+	buf = trace_user_fault_read(&sbuf->buf, ptr, size,
+				    syscall_copy_user, &size);
+	if (!buf)
+		return NULL;
+
+	/* Replace any non-printable characters with '.' */
+	for (int i = 0; i < size; i++) {
+		if (!isprint(buf[i]))
+			buf[i] = '.';
+	}
+
+	/*
+	 * If the text was truncated due to our max limit, add "..." to
+	 * the string.
+	 */
+	if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) {
+		strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA),
+			EXTRA, sizeof(EXTRA));
+		size = SYSCALL_FAULT_BUF_SZ;
+	} else {
+		buf[size++] = '\0';
+	}
+
+	*data_size = size;
+	return buf;
+}
+
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
@@ -302,15 +476,17 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct trace_event_buffer fbuffer;
 	unsigned long args[6];
+	char *user_ptr;
+	int user_size = 0;
 	int syscall_nr;
-	int size;
+	int size = 0;
+	bool mayfault;
 
 	/*
 	 * Syscall probe called with preemption enabled, but the ring
 	 * buffer and per-cpu data require preemption to be disabled.
 	 */
 	might_fault();
-	guard(preempt_notrace)();
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -327,7 +503,32 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
-	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+	/* Check if this syscall event faults in user space memory */
+	mayfault = sys_data->user_mask != 0;
+
+	guard(preempt_notrace)();
+
+	syscall_get_arguments(current, regs, args);
+
+	if (mayfault) {
+		struct syscall_user_buffer *sbuf;
+
+		/* If the syscall_buffer is NULL, tracing is being shutdown */
+		sbuf = READ_ONCE(syscall_buffer);
+		if (!sbuf)
+			return;
+
+		user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size);
+		/*
+		 * user_size is the amount of data to append.
+		 * Need to add 4 for the meta field that points to
+		 * the user memory at the end of the event and also
+		 * stores its size.
+		 */
+		size = 4 + user_size;
+	}
+
+	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
 	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
 	if (!entry)
@@ -335,9 +536,36 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	entry = ring_buffer_event_data(fbuffer.event);
 	entry->nr = syscall_nr;
-	syscall_get_arguments(current, regs, args);
+
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
+	if (mayfault) {
+		void *ptr;
+		int val;
+
+		/*
+		 * Set the pointer to point to the meta data of the event
+		 * that has information about the stored user space memory.
+		 */
+		ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+		/*
+		 * The meta data will store the offset of the user data from
+		 * the beginning of the event.
+		 */
+		val  = (ptr - (void *)entry) + 4;
+
+		/* Store the offset and the size into the meta data */
+		*(int *)ptr = val | (user_size << 16);
+
+		/* Nothing to do if the user space was empty or faulted */
+		if (user_size) {
+			/* Now store the user space data into the event */
+			ptr += 4;
+			memcpy(ptr, user_ptr, user_size);
+		}
+	}
+
 	trace_event_buffer_commit(&fbuffer);
 }
 
@@ -386,39 +614,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 static int reg_event_syscall_enter(struct trace_event_file *file,
 				   struct trace_event_call *call)
 {
+	struct syscall_metadata *sys_data = call->data;
 	struct trace_array *tr = file->tr;
 	int ret = 0;
 	int num;
 
-	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	num = sys_data->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
-	mutex_lock(&syscall_trace_lock);
-	if (!tr->sys_refcount_enter)
+	guard(mutex)(&syscall_trace_lock);
+	if (sys_data->user_mask) {
+		ret = syscall_fault_buffer_enable();
+		if (ret < 0)
+			return ret;
+	}
+	if (!tr->sys_refcount_enter) {
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
-	if (!ret) {
-		WRITE_ONCE(tr->enter_syscall_files[num], file);
-		tr->sys_refcount_enter++;
+		if (ret < 0) {
+			if (sys_data->user_mask)
+				syscall_fault_buffer_disable();
+			return ret;
+		}
 	}
-	mutex_unlock(&syscall_trace_lock);
-	return ret;
+	WRITE_ONCE(tr->enter_syscall_files[num], file);
+	tr->sys_refcount_enter++;
+	return 0;
 }
 
 static void unreg_event_syscall_enter(struct trace_event_file *file,
 				      struct trace_event_call *call)
 {
+	struct syscall_metadata *sys_data = call->data;
 	struct trace_array *tr = file->tr;
 	int num;
 
-	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	num = sys_data->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
-	mutex_lock(&syscall_trace_lock);
+	guard(mutex)(&syscall_trace_lock);
 	tr->sys_refcount_enter--;
 	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
-	mutex_unlock(&syscall_trace_lock);
+	if (sys_data->user_mask)
+		syscall_fault_buffer_disable();
 }
 
 static int reg_event_syscall_exit(struct trace_event_file *file,
@@ -459,6 +698,163 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
 	mutex_unlock(&syscall_trace_lock);
 }
 
+/*
+ * For system calls that reference user space memory that can
+ * be recorded into the event, set the system call meta data's user_mask
+ * to the "args" index that points to the user space memory to retrieve.
+ */
+static void check_faultable_syscall(struct trace_event_call *call, int nr)
+{
+	struct syscall_metadata *sys_data = call->data;
+
+	/* Only work on entry */
+	if (sys_data->enter_event != call)
+		return;
+
+	switch (nr) {
+	/* user arg at position 0 */
+#ifdef __NR_access
+	case __NR_access:
+#endif
+	case __NR_acct:
+	case __NR_add_key: /* Just _type. TODO add _description */
+	case __NR_chdir:
+#ifdef  __NR_chown
+	case __NR_chown:
+#endif
+#ifdef  __NR_chmod
+	case __NR_chmod:
+#endif
+	case __NR_chroot:
+#ifdef __NR_creat
+	case __NR_creat:
+#endif
+	case __NR_delete_module:
+	case __NR_execve:
+	case __NR_fsopen:
+	case __NR_getxattr: /* Just pathname, TODO add name */
+#ifdef __NR_lchown
+	case __NR_lchown:
+#endif
+	case __NR_lgetxattr: /* Just pathname, TODO add name */
+	case __NR_lremovexattr: /* Just pathname, TODO add name */
+#ifdef __NR_link
+	case __NR_link: /* Just oldname. TODO add newname */
+#endif
+	case __NR_listxattr: /* Just pathname, TODO add list */
+	case __NR_llistxattr: /* Just pathname, TODO add list */
+	case __NR_lsetxattr: /* Just pathname, TODO add list */
+#ifdef __NR_open
+	case __NR_open:
+#endif
+	case __NR_memfd_create:
+	case __NR_mount: /* Just dev_name, TODO add dir_name and type */
+#ifdef __NR_mkdir
+	case __NR_mkdir:
+#endif
+#ifdef __NR_mknod
+	case __NR_mknod:
+#endif
+	case __NR_mq_open:
+	case __NR_mq_unlink:
+	case __NR_pivot_root: /* Just new_root, TODO add old_root */
+#ifdef __NR_readlink
+	case __NR_readlink:
+#endif
+	case __NR_removexattr: /* Just pathname, TODO add name */
+#ifdef __NR_rename
+	case __NR_rename: /* Just oldname. TODO add newname */
+#endif
+	case __NR_request_key: /* Just _type. TODO add _description */
+#ifdef  __NR_rmdir
+	case __NR_rmdir:
+#endif
+	case __NR_setxattr: /* Just pathname, TODO add list */
+	case __NR_shmdt:
+#ifdef __NR_statfs
+	case __NR_statfs:
+#endif
+	case __NR_swapon:
+	case __NR_swapoff:
+#ifdef __NR_symlink
+	case __NR_symlink: /* Just oldname. TODO add newname */
+#endif
+#ifdef __NR_truncate
+	case __NR_truncate:
+#endif
+#ifdef __NR_unlink
+	case __NR_unlink:
+#endif
+	case __NR_umount2:
+#ifdef __NR_utime
+	case __NR_utime:
+#endif
+#ifdef __NR_utimes
+	case __NR_utimes:
+#endif
+		sys_data->user_mask = BIT(0);
+		break;
+	/* user arg at position 1 */
+	case __NR_execveat:
+	case __NR_faccessat:
+	case __NR_faccessat2:
+	case __NR_finit_module:
+	case __NR_fchmodat:
+	case __NR_fchmodat2:
+	case __NR_fchownat:
+	case __NR_fgetxattr:
+	case __NR_flistxattr:
+	case __NR_fsetxattr:
+	case __NR_fspick:
+	case __NR_fremovexattr:
+#ifdef __NR_futimesat
+	case __NR_futimesat:
+#endif
+	case __NR_getxattrat: /* Just pathname, TODO add name */
+	case __NR_inotify_add_watch:
+	case __NR_linkat: /* Just oldname. TODO add newname */
+	case __NR_listxattrat: /* Just pathname, TODO add list */
+	case __NR_mkdirat:
+	case __NR_mknodat:
+	case __NR_mount_setattr:
+	case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */
+	case __NR_name_to_handle_at:
+#ifdef __NR_newfstatat
+	case __NR_newfstatat:
+#endif
+	case __NR_openat:
+	case __NR_openat2:
+	case __NR_open_tree:
+	case __NR_open_tree_attr:
+	case __NR_readlinkat:
+#ifdef __NR_renameat
+	case __NR_renameat: /* Just oldname. TODO add newname */
+#endif
+	case __NR_renameat2: /* Just oldname. TODO add newname */
+	case __NR_removexattrat: /* Just pathname, TODO add name */
+	case __NR_quotactl:
+	case __NR_setxattrat: /* Just pathname, TODO add list */
+	case __NR_syslog:
+	case __NR_symlinkat: /* Just oldname. TODO add newname */
+	case __NR_statx:
+	case __NR_unlinkat:
+	case __NR_utimensat:
+		sys_data->user_mask = BIT(1);
+		break;
+	/* user arg at position 2 */
+	case __NR_init_module:
+	case __NR_fsconfig:
+		sys_data->user_mask = BIT(2);
+		break;
+	/* user arg at position 4 */
+	case __NR_fanotify_mark:
+		sys_data->user_mask = BIT(4);
+		break;
+	default:
+		sys_data->user_mask = 0;
+	}
+}
+
 static int __init init_syscall_trace(struct trace_event_call *call)
 {
 	int id;
@@ -471,6 +867,8 @@ static int __init init_syscall_trace(struct trace_event_call *call)
 		return -ENOSYS;
 	}
 
+	check_faultable_syscall(call, num);
+
 	if (set_syscall_print_fmt(call) < 0)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From b4f7624cfc9422209b844793521c60edb289fb69 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 Oct 2025 19:11:19 -0400
Subject: tracing: Have system call events record user array data

For system call events that have a length field, add a "user_arg_size"
parameter to the system call meta data. It holds the index into the args
array of the argument that gives the size of the user space data pointed
to by the arg that has its bit set in the user_mask field.

The "user_mask" has a bit set that denotes the arg that points to an array
in the user space address space and if a system call event has the
user_mask field set and the user_arg_size set, it will then record the
content of that address into the trace event, up to the size defined by
SYSCALL_FAULT_BUF_SZ - 1.

This allows the output to look like:

  sys_write(fd: 0xa, buf: 0x5646978d13c0 (01:00:05:00:00:00:00:00:01:87:55:89:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00), count: 0x20)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.763528474@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/syscall.h       |   4 +-
 kernel/trace/trace_syscalls.c | 121 ++++++++++++++++++++++++++++++------------
 2 files changed, 90 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 85f21ca15a41..9413c139da66 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -16,6 +16,7 @@
  * @name: name of the syscall
  * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
+ * @user_arg_size: holds @arg that has size of the user space to read
  * @user_mask: mask of @args that will read user space
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
@@ -26,7 +27,8 @@
 struct syscall_metadata {
 	const char	*name;
 	int		syscall_nr;
-	short		nb_args;
+	u8		nb_args;
+	s8		user_arg_size;
 	short		user_mask;
 	const char	**types;
 	const char	**args;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ed9332f8bdf8..3f3cdfc9958e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -124,7 +124,7 @@ const char *get_syscall_name(int syscall)
 	return entry->name;
 }
 
-/* Added to user strings when max limit is reached */
+/* Added to user strings or arrays when max limit is reached */
 #define EXTRA "..."
 
 static enum print_line_t
@@ -136,9 +136,8 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 	struct trace_entry *ent = iter->ent;
 	struct syscall_trace_enter *trace;
 	struct syscall_metadata *entry;
-	int i, syscall, val;
+	int i, syscall, val, len;
 	unsigned char *ptr;
-	int len;
 
 	trace = (typeof(trace))ent;
 	syscall = trace->nr;
@@ -185,7 +184,23 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		ptr = (void *)ent + (val & 0xffff);
 		len = val >> 16;
 
-		trace_seq_printf(s, " \"%.*s\"", len, ptr);
+		if (entry->user_arg_size < 0) {
+			trace_seq_printf(s, " \"%.*s\"", len, ptr);
+			continue;
+		}
+
+		val = trace->args[entry->user_arg_size];
+
+		trace_seq_puts(s, " (");
+		for (int x = 0; x < len; x++, ptr++) {
+			if (x)
+				trace_seq_putc(s, ':');
+			trace_seq_printf(s, "%02x", *ptr);
+		}
+		if (len < val)
+			trace_seq_printf(s, ", %s", EXTRA);
+
+		trace_seq_putc(s, ')');
 	}
 
 	trace_seq_putc(s, ')');
@@ -250,8 +265,11 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 		if (!(BIT(i) & entry->user_mask))
 			continue;
 
-		/* Add the format for the user space string */
-		pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
+		/* Add the format for the user space string or array */
+		if (entry->user_arg_size < 0)
+			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
+		else
+			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
 	}
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 
@@ -260,9 +278,14 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 				", ((unsigned long)(REC->%s))", entry->args[i]);
 		if (!(BIT(i) & entry->user_mask))
 			continue;
-		/* The user space string for arg has name __<arg>_val */
-		pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
-				entry->args[i]);
+		/* The user space data for arg has name __<arg>_val */
+		if (entry->user_arg_size < 0) {
+			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
+					entry->args[i]);
+		} else {
+			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
+					entry->args[i]);
+		}
 	}
 
 #undef LEN_OR_ZERO
@@ -333,9 +356,9 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
 	idx = ffs(mask) - 1;
 
 	/*
-	 * User space strings are faulted into a temporary buffer and then
-	 * added as a dynamic string to the end of the event.
-	 * The user space string name for the arg pointer is "__<arg>_val".
+	 * User space data is faulted into a temporary buffer and then
+	 * added as a dynamic string or array to the end of the event.
+	 * The user space data name for the arg pointer is "__<arg>_val".
 	 */
 	len = strlen(meta->args[idx]) + sizeof("___val");
 	arg = kmalloc(len, GFP_KERNEL);
@@ -431,9 +454,11 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
 			    struct syscall_user_buffer *sbuf,
 			    unsigned long *args, unsigned int *data_size)
 {
+	trace_user_buf_copy syscall_copy = syscall_copy_user;
 	unsigned long size = SYSCALL_FAULT_BUF_SZ - 1;
 	unsigned long mask = sys_data->user_mask;
 	int idx = ffs(mask) - 1;
+	bool array = false;
 	char *ptr;
 	char *buf;
 
@@ -441,27 +466,43 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
 	ptr = (char *)args[idx];
 	*data_size = 0;
 
+	/*
+	 * If this system call event has a size argument, use
+	 * it to define how much of user space memory to read,
+	 * and read it as an array and not a string.
+	 */
+	if (sys_data->user_arg_size >= 0) {
+		array = true;
+		size = args[sys_data->user_arg_size];
+		if (size > SYSCALL_FAULT_BUF_SZ - 1)
+			size = SYSCALL_FAULT_BUF_SZ - 1;
+		/* use normal copy_from_user() */
+		syscall_copy = NULL;
+	}
+
 	buf = trace_user_fault_read(&sbuf->buf, ptr, size,
-				    syscall_copy_user, &size);
+				    syscall_copy, &size);
 	if (!buf)
 		return NULL;
 
-	/* Replace any non-printable characters with '.' */
-	for (int i = 0; i < size; i++) {
-		if (!isprint(buf[i]))
-			buf[i] = '.';
-	}
+	/* For strings, replace any non-printable characters with '.' */
+	if (!array) {
+		for (int i = 0; i < size; i++) {
+			if (!isprint(buf[i]))
+				buf[i] = '.';
+		}
 
-	/*
-	 * If the text was truncated due to our max limit, add "..." to
-	 * the string.
-	 */
-	if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) {
-		strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA),
-			EXTRA, sizeof(EXTRA));
-		size = SYSCALL_FAULT_BUF_SZ;
-	} else {
-		buf[size++] = '\0';
+		/*
+		 * If the text was truncated due to our max limit, add "..." to
+		 * the string.
+		 */
+		if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) {
+			strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA),
+				EXTRA, sizeof(EXTRA));
+			size = SYSCALL_FAULT_BUF_SZ;
+		} else {
+			buf[size++] = '\0';
+		}
 	}
 
 	*data_size = size;
@@ -492,7 +533,7 @@ syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
 
 static void syscall_put_data(struct syscall_metadata *sys_data,
 			     struct syscall_trace_enter *entry,
-			     char *buffer, int size)
+			     char *buffer, int size, int user_size)
 {
 	void *ptr;
 	int val;
@@ -510,13 +551,16 @@ static void syscall_put_data(struct syscall_metadata *sys_data,
 	val  = (ptr - (void *)entry) + 4;
 
 	/* Store the offset and the size into the meta data */
-	*(int *)ptr = val | (size << 16);
+	*(int *)ptr = val | (user_size << 16);
+
+	if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size))
+		user_size = 0;
 
 	/* Nothing to do if the user space was empty or faulted */
-	if (size) {
+	if (user_size) {
 		/* Now store the user space data into the event */
 		ptr += 4;
-		memcpy(ptr, buffer, size);
+		memcpy(ptr, buffer, user_size);
 	}
 }
 
@@ -580,7 +624,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
 	if (mayfault)
-		syscall_put_data(sys_data, entry, user_ptr, user_size);
+		syscall_put_data(sys_data, entry, user_ptr, size, user_size);
 
 	trace_event_buffer_commit(&fbuffer);
 }
@@ -727,7 +771,16 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
 	if (sys_data->enter_event != call)
 		return;
 
+	sys_data->user_arg_size = -1;
+
 	switch (nr) {
+	/* user arg 1 with size arg at 2 */
+	case __NR_write:
+	case __NR_mq_timedsend:
+	case __NR_pwrite64:
+		sys_data->user_mask = BIT(1);
+		sys_data->user_arg_size = 2;
+		break;
 	/* user arg at position 0 */
 #ifdef __NR_access
 	case __NR_access:
@@ -1065,7 +1118,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
 	if (mayfault)
-		syscall_put_data(sys_data, rec, user_ptr, user_size);
+		syscall_put_data(sys_data, rec, user_ptr, size, user_size);
 
 	if ((valid_prog_array &&
 	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
-- 
cgit v1.2.3


From 011ea0501daaba36c06910fd383cf7428ea45844 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 Oct 2025 19:11:20 -0400
Subject: tracing: Display some syscall arrays as strings

Some of the system calls that read a fixed length of memory from the user
space address are not arrays but strings. Take a bit away from the nb_args
field in the syscall meta data to use as a flag to denote that the system
call's user_arg_size is being used as a string. The nb_args should never
be more than 6, so 7 bits is plenty to hold that number. The new
user_arg_is_str flag, when set, will display the data array from the
user space address as a string and not an array.

This will allow the output to look like this:

  sys_sethostname(name: 0x5584310eb2a0 "debian", len: 6)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.930550359@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/syscall.h       |  4 +++-
 kernel/trace/trace_syscalls.c | 22 +++++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 9413c139da66..0dd7f2b33431 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -16,6 +16,7 @@
  * @name: name of the syscall
  * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
+ * @user_arg_is_str: set if the arg for @user_arg_size is a string
  * @user_arg_size: holds @arg that has size of the user space to read
  * @user_mask: mask of @args that will read user space
  * @types: list of types as strings
@@ -27,7 +28,8 @@
 struct syscall_metadata {
 	const char	*name;
 	int		syscall_nr;
-	u8		nb_args;
+	u8		nb_args:7;
+	u8		user_arg_is_str:1;
 	s8		user_arg_size;
 	short		user_mask;
 	const char	**types;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 3f3cdfc9958e..b8e9774a8abd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -184,7 +184,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		ptr = (void *)ent + (val & 0xffff);
 		len = val >> 16;
 
-		if (entry->user_arg_size < 0) {
+		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
 			trace_seq_printf(s, " \"%.*s\"", len, ptr);
 			continue;
 		}
@@ -249,6 +249,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
+	bool is_string = entry->user_arg_is_str;
 	int i;
 	int pos = 0;
 
@@ -266,7 +267,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 			continue;
 
 		/* Add the format for the user space string or array */
-		if (entry->user_arg_size < 0)
+		if (entry->user_arg_size < 0 || is_string)
 			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
 		else
 			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
@@ -279,7 +280,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 		if (!(BIT(i) & entry->user_mask))
 			continue;
 		/* The user space data for arg has name __<arg>_val */
-		if (entry->user_arg_size < 0) {
+		if (entry->user_arg_size < 0 || is_string) {
 			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
 					entry->args[i]);
 		} else {
@@ -781,6 +782,21 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
 		sys_data->user_mask = BIT(1);
 		sys_data->user_arg_size = 2;
 		break;
+	/* user arg 0 with size arg at 1 as string */
+	case __NR_setdomainname:
+	case __NR_sethostname:
+		sys_data->user_mask = BIT(0);
+		sys_data->user_arg_size = 1;
+		sys_data->user_arg_is_str = 1;
+		break;
+#ifdef __NR_kexec_file_load
+	/* user arg 4 with size arg at 3 as string */
+	case __NR_kexec_file_load:
+		sys_data->user_mask = BIT(4);
+		sys_data->user_arg_size = 3;
+		sys_data->user_arg_is_str = 1;
+		break;
+#endif
 	/* user arg at position 0 */
 #ifdef __NR_access
 	case __NR_access:
-- 
cgit v1.2.3


From 32e0f607ac6a2bb5d144540897535fd01be77586 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 Oct 2025 19:11:24 -0400
Subject: tracing: Add trace_seq_pop() and seq_buf_pop()

In order to allow an interface to remove an added character from the
trace_seq and seq_buf descriptors, add helper functions trace_seq_pop()
and seq_buf_pop().

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.594898736@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h   | 17 +++++++++++++++++
 include/linux/trace_seq.h | 13 +++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 52791e070506..9f2839e73f8a 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num)
 	}
 }
 
+/**
+ * seq_buf_pop - pop off the last written character
+ * @s: the seq_buf handle
+ *
+ * Removes the last written character to the seq_buf @s.
+ *
+ * Returns the last character or -1 if it is empty.
+ */
+static inline int seq_buf_pop(struct seq_buf *s)
+{
+	if (!s->len)
+		return -1;
+
+	s->len--;
+	return (unsigned int)s->buffer[s->len];
+}
+
 extern __printf(2, 3)
 int seq_buf_printf(struct seq_buf *s, const char *fmt, ...);
 extern __printf(2, 0)
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 557780fe1c77..4a0b8c172d27 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s)
 	return s->full || seq_buf_has_overflowed(&s->seq);
 }
 
+/**
+ * trace_seq_pop - pop off the last written character
+ * @s: trace sequence descriptor
+ *
+ * Removes the last written character to the trace_seq @s.
+ *
+ * Returns the last character or -1 if it is empty.
+ */
+static inline int trace_seq_pop(struct trace_seq *s)
+{
+	return seq_buf_pop(&s->seq);
+}
+
 /*
  * Currently only defined when tracing is enabled.
  */
-- 
cgit v1.2.3


From c72568c21b97dbc48d02b769f4eec6667ad13d5a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 Oct 2025 09:12:40 +0000
Subject: net: rps: softnet_data reorg to make enqueue_to_backlog() fast

enqueue_to_backlog() is showing up in kernel profiles on hosts
with many cores, when RFS/RPS is used.

The following softnet_data fields need to be updated:

- input_queue_tail
- input_pkt_queue (next, prev, qlen, lock)
- backlog.state (if input_pkt_queue was empty)

Unfortunately they are currently using two cache lines:

	/* --- cacheline 3 boundary (192 bytes) --- */
	call_single_data_t         csd __attribute__((__aligned__(64))); /*  0xc0  0x20 */
	struct softnet_data *      rps_ipi_next;         /*  0xe0   0x8 */
	unsigned int               cpu;                  /*  0xe8   0x4 */
	unsigned int               input_queue_tail;     /*  0xec   0x4 */
	struct sk_buff_head        input_pkt_queue;      /*  0xf0  0x18 */

	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */

	struct napi_struct         backlog __attribute__((__aligned__(8))); /* 0x108 0x1f0 */

Add one ____cacheline_aligned_in_smp to make sure they now are using
a single cache line.

Also, because napi_struct has written fields, make @state its first field.

We want to make sure that cpus adding packets to sd->input_pkt_queue
are not slowing down cpus processing their backlog because of
false sharing.

After this patch new layout is:

	/* --- cacheline 5 boundary (320 bytes) --- */
	long int                   pad[3] __attribute__((__aligned__(64))); /* 0x140  0x18 */
	unsigned int               input_queue_tail;     /* 0x158   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct sk_buff_head        input_pkt_queue;      /* 0x160  0x18 */
	struct napi_struct         backlog __attribute__((__aligned__(8))); /* 0x178 0x1f0 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251024091240.3292546-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f5aad5cc9a1..9c1e5042c5e7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -377,6 +377,8 @@ struct napi_config {
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
 struct napi_struct {
+	/* This field should be first or softnet_data.backlog needs tweaks. */
+	unsigned long		state;
 	/* The poll_list must only be managed by the entity which
 	 * changes the state of the NAPI_STATE_SCHED bit.  This means
 	 * whoever atomically sets that bit can add this napi_struct
@@ -385,7 +387,6 @@ struct napi_struct {
 	 */
 	struct list_head	poll_list;
 
-	unsigned long		state;
 	int			weight;
 	u32			defer_hard_irqs_count;
 	int			(*poll)(struct napi_struct *, int);
@@ -3529,9 +3530,17 @@ struct softnet_data {
 	call_single_data_t	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
+
+	/* We force a cacheline alignment from here, to hold together
+	 * input_queue_tail, input_pkt_queue and backlog.state.
+	 * We add holes so that backlog.state is the last field
+	 * of this cache line.
+	 */
+	long			pad[3] ____cacheline_aligned_in_smp;
 	unsigned int		input_queue_tail;
 #endif
 	struct sk_buff_head	input_pkt_queue;
+
 	struct napi_struct	backlog;
 
 	struct numa_drop_counters drop_counters;
-- 
cgit v1.2.3


From 8443c3160858b860bfc2db6a8397c72c9f6b513e Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Fri, 24 Oct 2025 11:02:56 -0700
Subject: net: netmem: remove NET_IOV_MAX from net_iov_type enum

Remove the NET_IOV_MAX workaround from the net_iov_type enum. This entry
was previously added to force the enum size to unsigned long to satisfy
the NET_IOV_ASSERT_OFFSET static assertions.

After commit f3d85c9ee510 ("netmem: introduce struct netmem_desc
mirroring struct page") this approach became unnecessary by placing the
net_iov_type after the netmem_desc. Placing the net_iov_type after
netmem_desc results in the net_iov_type size having no effect on the
position or layout of the fields that mirror the struct page.

The layout before this patch:

struct net_iov {
	union {
		struct netmem_desc desc;                 /*     0    48 */
		struct {
			long unsigned int _flags;        /*     0     8 */
			long unsigned int pp_magic;      /*     8     8 */
			struct page_pool * pp;           /*    16     8 */
			long unsigned int _pp_mapping_pad; /*    24     8 */
			long unsigned int dma_addr;      /*    32     8 */
			atomic_long_t pp_ref_count;      /*    40     8 */
		};                                       /*     0    48 */
	};                                               /*     0    48 */
	struct net_iov_area *      owner;                /*    48     8 */
	enum net_iov_type          type;                 /*    56     8 */

	/* size: 64, cachelines: 1, members: 3 */
};

The layout after this patch:

struct net_iov {
	union {
		struct netmem_desc desc;                 /*     0    48 */
		struct {
			long unsigned int _flags;        /*     0     8 */
			long unsigned int pp_magic;      /*     8     8 */
			struct page_pool * pp;           /*    16     8 */
			long unsigned int _pp_mapping_pad; /*    24     8 */
			long unsigned int dma_addr;      /*    32     8 */
			atomic_long_t pp_ref_count;      /*    40     8 */
		};                                       /*     0    48 */
	};                                               /*     0    48 */
	struct net_iov_area *      owner;                /*    48     8 */
	enum net_iov_type          type;                 /*    56     4 */

	/* size: 64, cachelines: 1, members: 3 */
	/* padding: 4 */
};

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20251024-b4-devmem-remove-niov-max-v1-1-ba72c68bc869@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 651e2c62d1dd..9e10f4ac50c3 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -68,10 +68,6 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
 enum net_iov_type {
 	NET_IOV_DMABUF,
 	NET_IOV_IOURING,
-
-	/* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
-	 */
-	NET_IOV_MAX = ULONG_MAX
 };
 
 /* A memory descriptor representing abstract networking I/O vectors,
-- 
cgit v1.2.3


From 294bfe0343da3b59db040c3a4dac05b4c91ce013 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 25 Oct 2025 09:40:59 +0200
Subject: sctp: Constify struct sctp_sched_ops

'struct sctp_sched_ops' is not modified in these drivers.

Constifying this structure moves some data to a read-only section, so
increases overall security, especially when the structure holds some
function pointers.

On an x86_64, with allmodconfig, as an example:
Before:
======
   text	   data	    bss	    dec	    hex	filename
   8019	    568	      0	   8587	   218b	net/sctp/stream_sched_fc.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
   8275	    312	      0	   8587	   218b	net/sctp/stream_sched_fc.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://patch.msgid.link/dce03527eb7b7cc8a3c26d5cdac12bafe3350135.1761377890.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/stream_sched.h |  4 ++--
 include/net/sctp/structs.h      |  2 +-
 net/sctp/stream.c               |  8 ++++----
 net/sctp/stream_sched.c         | 16 ++++++++--------
 net/sctp/stream_sched_fc.c      |  4 ++--
 net/sctp/stream_sched_prio.c    |  2 +-
 net/sctp/stream_sched_rr.c      |  2 +-
 7 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index 8034bf5febbe..77806ef1cb70 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -52,10 +52,10 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch);
 
 void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
 int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
-struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
+const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
 
 void sctp_sched_ops_register(enum sctp_sched_type sched,
-			     struct sctp_sched_ops *sched_ops);
+			     const struct sctp_sched_ops *sched_ops);
 void sctp_sched_ops_prio_init(void);
 void sctp_sched_ops_rr_init(void);
 void sctp_sched_ops_fc_init(void);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 3dd304e411d0..5900196d65fd 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1073,7 +1073,7 @@ struct sctp_outq {
 	struct list_head out_chunk_list;
 
 	/* Stream scheduler being used */
-	struct sctp_sched_ops *sched;
+	const struct sctp_sched_ops *sched;
 
 	unsigned int out_qlen;	/* Total length of queued data chunks. */
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f205556c5b24..0615e4426341 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -54,7 +54,7 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt)
 
 static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid)
 {
-	struct sctp_sched_ops *sched;
+	const struct sctp_sched_ops *sched;
 
 	if (!SCTP_SO(stream, sid)->ext)
 		return;
@@ -130,7 +130,7 @@ out:
 int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 		     gfp_t gfp)
 {
-	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 	int i, ret = 0;
 
 	gfp |= __GFP_NOWARN;
@@ -182,7 +182,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
 
 void sctp_stream_free(struct sctp_stream *stream)
 {
-	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 	int i;
 
 	sched->unsched_all(stream);
@@ -207,7 +207,7 @@ void sctp_stream_clear(struct sctp_stream *stream)
 
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 {
-	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 
 	sched->unsched_all(stream);
 	sctp_stream_outq_migrate(stream, new, new->outcnt);
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 54afbe4fb087..50f8b5240359 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -91,7 +91,7 @@ static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream)
 {
 }
 
-static struct sctp_sched_ops sctp_sched_fcfs = {
+static const struct sctp_sched_ops sctp_sched_fcfs = {
 	.set = sctp_sched_fcfs_set,
 	.get = sctp_sched_fcfs_get,
 	.init = sctp_sched_fcfs_init,
@@ -111,10 +111,10 @@ static void sctp_sched_ops_fcfs_init(void)
 
 /* API to other parts of the stack */
 
-static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1];
+static const struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1];
 
 void sctp_sched_ops_register(enum sctp_sched_type sched,
-			     struct sctp_sched_ops *sched_ops)
+			     const struct sctp_sched_ops *sched_ops)
 {
 	sctp_sched_ops[sched] = sched_ops;
 }
@@ -130,7 +130,7 @@ void sctp_sched_ops_init(void)
 
 static void sctp_sched_free_sched(struct sctp_stream *stream)
 {
-	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 	struct sctp_stream_out_ext *soute;
 	int i;
 
@@ -148,9 +148,9 @@ static void sctp_sched_free_sched(struct sctp_stream *stream)
 int sctp_sched_set_sched(struct sctp_association *asoc,
 			 enum sctp_sched_type sched)
 {
-	struct sctp_sched_ops *old = asoc->outqueue.sched;
+	const struct sctp_sched_ops *old = asoc->outqueue.sched;
 	struct sctp_datamsg *msg = NULL;
-	struct sctp_sched_ops *n;
+	const struct sctp_sched_ops *n;
 	struct sctp_chunk *ch;
 	int i, ret = 0;
 
@@ -263,14 +263,14 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
 
 int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
 {
-	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 	struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext;
 
 	INIT_LIST_HEAD(&ext->outq);
 	return sched->init_sid(stream, sid, gfp);
 }
 
-struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream)
+const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream)
 {
 	struct sctp_association *asoc;
 
diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c
index 4bd18a497a6d..776c6de46c22 100644
--- a/net/sctp/stream_sched_fc.c
+++ b/net/sctp/stream_sched_fc.c
@@ -188,7 +188,7 @@ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream)
 		list_del_init(&soute->fc_list);
 }
 
-static struct sctp_sched_ops sctp_sched_fc = {
+static const struct sctp_sched_ops sctp_sched_fc = {
 	.set = sctp_sched_fc_set,
 	.get = sctp_sched_fc_get,
 	.init = sctp_sched_fc_init,
@@ -206,7 +206,7 @@ void sctp_sched_ops_fc_init(void)
 	sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc);
 }
 
-static struct sctp_sched_ops sctp_sched_wfq = {
+static const struct sctp_sched_ops sctp_sched_wfq = {
 	.set = sctp_sched_wfq_set,
 	.get = sctp_sched_wfq_get,
 	.init = sctp_sched_fc_init,
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 4d4d9da331f4..fb6c55e5615d 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -300,7 +300,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
 			sctp_sched_prio_unsched(soute);
 }
 
-static struct sctp_sched_ops sctp_sched_prio = {
+static const struct sctp_sched_ops sctp_sched_prio = {
 	.set = sctp_sched_prio_set,
 	.get = sctp_sched_prio_get,
 	.init = sctp_sched_prio_init,
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index 1f235e7f643a..9157b653f196 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -171,7 +171,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
 		sctp_sched_rr_unsched(stream, soute);
 }
 
-static struct sctp_sched_ops sctp_sched_rr = {
+static const struct sctp_sched_ops sctp_sched_rr = {
 	.set = sctp_sched_rr_set,
 	.get = sctp_sched_rr_get,
 	.init = sctp_sched_rr_init,
-- 
cgit v1.2.3


From f74ee32963f1b74865fe679e2475450434fea51c Mon Sep 17 00:00:00 2001
From: Qinxin Xia <xiaqinxin@huawei.com>
Date: Tue, 28 Oct 2025 20:09:00 +0800
Subject: tools/dma: move dma_map_benchmark from selftests to tools/dma

dma_map_benchmark is a standalone developer tool rather than an
automated selftest. It has no pass/fail criteria, expects manual
invocation, and is built as a normal userspace binary. Move it to
tools/dma/ and add a minimal Makefile.

Suggested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Suggested-by: Barry Song <baohua@kernel.org>
Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
Acked-by: Barry Song <baohua@kernel.org>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251028120900.2265511-3-xiaqinxin@huawei.com
---
 include/linux/map_benchmark.h                   |  32 ------
 include/uapi/linux/map_benchmark.h              |  35 +++++++
 kernel/dma/map_benchmark.c                      |   2 +-
 tools/Makefile                                  |  13 +--
 tools/dma/.gitignore                            |   3 +
 tools/dma/Makefile                              |  55 ++++++++++
 tools/dma/config                                |   1 +
 tools/dma/dma_map_benchmark.c                   | 127 +++++++++++++++++++++++
 tools/testing/selftests/dma/Makefile            |   7 --
 tools/testing/selftests/dma/config              |   1 -
 tools/testing/selftests/dma/dma_map_benchmark.c | 128 ------------------------
 11 files changed, 229 insertions(+), 175 deletions(-)
 delete mode 100644 include/linux/map_benchmark.h
 create mode 100644 include/uapi/linux/map_benchmark.h
 create mode 100644 tools/dma/.gitignore
 create mode 100644 tools/dma/Makefile
 create mode 100644 tools/dma/config
 create mode 100644 tools/dma/dma_map_benchmark.c
 delete mode 100644 tools/testing/selftests/dma/Makefile
 delete mode 100644 tools/testing/selftests/dma/config
 delete mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c

(limited to 'include')

diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h
deleted file mode 100644
index 48e2ff95332f..000000000000
--- a/include/linux/map_benchmark.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2022 HiSilicon Limited.
- */
-
-#ifndef _KERNEL_DMA_BENCHMARK_H
-#define _KERNEL_DMA_BENCHMARK_H
-
-#define DMA_MAP_BENCHMARK       _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS     1024
-#define DMA_MAP_MAX_SECONDS     300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL   0
-#define DMA_MAP_TO_DEVICE       1
-#define DMA_MAP_FROM_DEVICE     2
-
-struct map_benchmark {
-	__u64 avg_map_100ns; /* average map latency in 100ns */
-	__u64 map_stddev; /* standard deviation of map latency */
-	__u64 avg_unmap_100ns; /* as above */
-	__u64 unmap_stddev;
-	__u32 threads; /* how many threads will do map/unmap in parallel */
-	__u32 seconds; /* how long the test will last */
-	__s32 node; /* which numa node this benchmark will run on */
-	__u32 dma_bits; /* DMA addressing capability */
-	__u32 dma_dir; /* DMA data direction */
-	__u32 dma_trans_ns; /* time for DMA transmission in ns */
-	__u32 granule;  /* how many PAGE_SIZE will do map/unmap once a time */
-	__u8 expansion[76]; /* For future use */
-};
-#endif /* _KERNEL_DMA_BENCHMARK_H */
diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h
new file mode 100644
index 000000000000..c2d91088a40d
--- /dev/null
+++ b/include/uapi/linux/map_benchmark.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2022-2025 HiSilicon Limited.
+ */
+
+#ifndef _UAPI_DMA_BENCHMARK_H
+#define _UAPI_DMA_BENCHMARK_H
+
+#include <linux/types.h>
+
+#define DMA_MAP_BENCHMARK       _IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS     1024
+#define DMA_MAP_MAX_SECONDS     300
+#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
+
+#define DMA_MAP_BIDIRECTIONAL   0
+#define DMA_MAP_TO_DEVICE       1
+#define DMA_MAP_FROM_DEVICE     2
+
+struct map_benchmark {
+	__u64 avg_map_100ns; /* average map latency in 100ns */
+	__u64 map_stddev; /* standard deviation of map latency */
+	__u64 avg_unmap_100ns; /* as above */
+	__u64 unmap_stddev;
+	__u32 threads; /* how many threads will do map/unmap in parallel */
+	__u32 seconds; /* how long the test will last */
+	__s32 node; /* which numa node this benchmark will run on */
+	__u32 dma_bits; /* DMA addressing capability */
+	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* time for DMA transmission in ns */
+	__u32 granule;  /* how many PAGE_SIZE will do map/unmap once a time */
+	__u8 expansion[76]; /* For future use */
+};
+
+#endif /* _UAPI_DMA_BENCHMARK_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index cc19a3efea89..794041a39e65 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,13 +11,13 @@
 #include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
-#include <linux/map_benchmark.h>
 #include <linux/math64.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/timekeeping.h>
+#include <uapi/linux/map_benchmark.h>
 
 struct map_benchmark_data {
 	struct map_benchmark bparam;
diff --git a/tools/Makefile b/tools/Makefile
index c31cbbd12c45..cb40961a740f 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -14,6 +14,7 @@ help:
 	@echo '  counter                - counter tools'
 	@echo '  cpupower               - a tool for all things x86 CPU power'
 	@echo '  debugging              - tools for debugging'
+	@echo '  dma                    - tools for DMA mapping'
 	@echo '  firewire               - the userspace part of nosy, an IEEE-1394 traffic sniffer'
 	@echo '  firmware               - Firmware tools'
 	@echo '  freefall               - laptop accelerometer program for disk protection'
@@ -69,7 +70,7 @@ acpi: FORCE
 cpupower: FORCE
 	$(call descend,power/$@)
 
-counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE
+counter dma firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE
 	$(call descend,$@)
 
 bpf/%: FORCE
@@ -122,7 +123,7 @@ kvm_stat: FORCE
 ynl: FORCE
 	$(call descend,net/ynl)
 
-all: acpi counter cpupower gpio hv firewire \
+all: acpi counter cpupower dma gpio hv firewire \
 		perf selftests bootconfig spi turbostat usb \
 		virtio mm bpf x86_energy_perf_policy \
 		tmon freefall iio objtool kvm_stat wmi \
@@ -134,7 +135,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install:
+counter_install dma_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install:
 	$(call descend,$(@:_install=),install)
 
 selftests_install:
@@ -164,7 +165,7 @@ kvm_stat_install:
 ynl_install:
 	$(call descend,net/$(@:_install=),install)
 
-install: acpi_install counter_install cpupower_install gpio_install \
+install: acpi_install counter_install cpupower_install dma_install gpio_install \
 		hv_install firewire_install iio_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install mm_install bpf_install x86_energy_perf_policy_install \
@@ -178,7 +179,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean:
+counter_clean dma_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean:
 	$(call descend,$(@:_clean=),clean)
 
 libapi_clean:
@@ -224,7 +225,7 @@ build_clean:
 ynl_clean:
 	$(call descend,net/$(@:_clean=),clean)
 
-clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \
+clean: acpi_clean counter_clean cpupower_clean dma_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \
 		mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean \
diff --git a/tools/dma/.gitignore b/tools/dma/.gitignore
new file mode 100644
index 000000000000..94b68cf4147b
--- /dev/null
+++ b/tools/dma/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+dma_map_benchmark
+include/linux/map_benchmark.h
diff --git a/tools/dma/Makefile b/tools/dma/Makefile
new file mode 100644
index 000000000000..e4abf37bf020
--- /dev/null
+++ b/tools/dma/Makefile
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../scripts/Makefile.include
+
+bindir ?= /usr/bin
+
+# This will work when dma is built in tools env. where srctree
+# isn't set and when invoked from selftests build, where srctree
+# is set to ".". building_out_of_srctree is undefined for in srctree
+# builds
+ifndef building_out_of_srctree
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+endif
+
+# Do not use make's built-in rules
+# (this improves performance and avoids hard-to-debug behaviour);
+MAKEFLAGS += -r
+
+override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include
+
+ALL_TARGETS := dma_map_benchmark
+ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
+
+all: $(ALL_PROGRAMS)
+
+export srctree OUTPUT CC LD CFLAGS
+include $(srctree)/tools/build/Makefile.include
+
+#
+# We need the following to be outside of kernel tree
+#
+$(OUTPUT)include/linux/map_benchmark.h: ../../include/uapi/linux/map_benchmark.h
+	mkdir -p $(OUTPUT)include/linux 2>&1 || true
+	ln -sf $(CURDIR)/../../include/uapi/linux/map_benchmark.h $@
+
+prepare: $(OUTPUT)include/linux/map_benchmark.h
+
+FORCE:
+
+DMA_MAP_BENCHMARK = dma_map_benchmark
+$(DMA_MAP_BENCHMARK): prepare FORCE
+	$(CC) $(CFLAGS) $(DMA_MAP_BENCHMARK).c -o $(DMA_MAP_BENCHMARK)
+
+clean:
+	rm -f $(ALL_PROGRAMS)
+	rm -rf $(OUTPUT)include
+	find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete
+
+install: $(ALL_PROGRAMS)
+	install -d -m 755 $(DESTDIR)$(bindir);		\
+	for program in $(ALL_PROGRAMS); do		\
+		install $$program $(DESTDIR)$(bindir);	\
+	done
+
+.PHONY: all install clean prepare FORCE
diff --git a/tools/dma/config b/tools/dma/config
new file mode 100644
index 000000000000..6102ee3c43cd
--- /dev/null
+++ b/tools/dma/config
@@ -0,0 +1 @@
+CONFIG_DMA_MAP_BENCHMARK=y
diff --git a/tools/dma/dma_map_benchmark.c b/tools/dma/dma_map_benchmark.c
new file mode 100644
index 000000000000..5474a450863c
--- /dev/null
+++ b/tools/dma/dma_map_benchmark.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 HiSilicon Limited.
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/map_benchmark.h>
+
+#define NSEC_PER_MSEC	1000000L
+
+static char *directions[] = {
+	"BIDIRECTIONAL",
+	"TO_DEVICE",
+	"FROM_DEVICE",
+};
+
+int main(int argc, char **argv)
+{
+	struct map_benchmark map;
+	int fd, opt;
+	/* default single thread, run 20 seconds on NUMA_NO_NODE */
+	int threads = 1, seconds = 20, node = -1;
+	/* default dma mask 32bit, bidirectional DMA */
+	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
+	/* default granule 1 PAGESIZE */
+	int granule = 1;
+
+	int cmd = DMA_MAP_BENCHMARK;
+
+	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) {
+		switch (opt) {
+		case 't':
+			threads = atoi(optarg);
+			break;
+		case 's':
+			seconds = atoi(optarg);
+			break;
+		case 'n':
+			node = atoi(optarg);
+			break;
+		case 'b':
+			bits = atoi(optarg);
+			break;
+		case 'd':
+			dir = atoi(optarg);
+			break;
+		case 'x':
+			xdelay = atoi(optarg);
+			break;
+		case 'g':
+			granule = atoi(optarg);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
+		fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
+			DMA_MAP_MAX_THREADS);
+		exit(1);
+	}
+
+	if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
+		fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
+			DMA_MAP_MAX_SECONDS);
+		exit(1);
+	}
+
+	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
+		fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n",
+			DMA_MAP_MAX_TRANS_DELAY);
+		exit(1);
+	}
+
+	/* suppose the mininum DMA zone is 1MB in the world */
+	if (bits < 20 || bits > 64) {
+		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
+		exit(1);
+	}
+
+	if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE &&
+			dir != DMA_MAP_FROM_DEVICE) {
+		fprintf(stderr, "invalid dma direction\n");
+		exit(1);
+	}
+
+	if (granule < 1 || granule > 1024) {
+		fprintf(stderr, "invalid granule size\n");
+		exit(1);
+	}
+
+	fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
+	if (fd == -1) {
+		perror("open");
+		exit(1);
+	}
+
+	memset(&map, 0, sizeof(map));
+	map.seconds = seconds;
+	map.threads = threads;
+	map.node = node;
+	map.dma_bits = bits;
+	map.dma_dir = dir;
+	map.dma_trans_ns = xdelay;
+	map.granule = granule;
+
+	if (ioctl(fd, cmd, &map)) {
+		perror("ioctl");
+		exit(1);
+	}
+
+	printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
+			threads, seconds, node, dir[directions], granule);
+	printf("average map latency(us):%.1f standard deviation:%.1f\n",
+			map.avg_map_100ns/10.0, map.map_stddev/10.0);
+	printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
+			map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile
deleted file mode 100644
index cd8c5ece1cba..000000000000
--- a/tools/testing/selftests/dma/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -I../../../../usr/include/
-CFLAGS += -I../../../../include/
-
-TEST_GEN_PROGS := dma_map_benchmark
-
-include ../lib.mk
diff --git a/tools/testing/selftests/dma/config b/tools/testing/selftests/dma/config
deleted file mode 100644
index 6102ee3c43cd..000000000000
--- a/tools/testing/selftests/dma/config
+++ /dev/null
@@ -1 +0,0 @@
-CONFIG_DMA_MAP_BENCHMARK=y
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
deleted file mode 100644
index b12f1f9babf8..000000000000
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2020 HiSilicon Limited.
- */
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <linux/types.h>
-#include <linux/map_benchmark.h>
-
-#define NSEC_PER_MSEC	1000000L
-
-static char *directions[] = {
-	"BIDIRECTIONAL",
-	"TO_DEVICE",
-	"FROM_DEVICE",
-};
-
-int main(int argc, char **argv)
-{
-	struct map_benchmark map;
-	int fd, opt;
-	/* default single thread, run 20 seconds on NUMA_NO_NODE */
-	int threads = 1, seconds = 20, node = -1;
-	/* default dma mask 32bit, bidirectional DMA */
-	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
-	/* default granule 1 PAGESIZE */
-	int granule = 1;
-
-	int cmd = DMA_MAP_BENCHMARK;
-
-	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) {
-		switch (opt) {
-		case 't':
-			threads = atoi(optarg);
-			break;
-		case 's':
-			seconds = atoi(optarg);
-			break;
-		case 'n':
-			node = atoi(optarg);
-			break;
-		case 'b':
-			bits = atoi(optarg);
-			break;
-		case 'd':
-			dir = atoi(optarg);
-			break;
-		case 'x':
-			xdelay = atoi(optarg);
-			break;
-		case 'g':
-			granule = atoi(optarg);
-			break;
-		default:
-			return -1;
-		}
-	}
-
-	if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
-		fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
-			DMA_MAP_MAX_THREADS);
-		exit(1);
-	}
-
-	if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
-		fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
-			DMA_MAP_MAX_SECONDS);
-		exit(1);
-	}
-
-	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
-		fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n",
-			DMA_MAP_MAX_TRANS_DELAY);
-		exit(1);
-	}
-
-	/* suppose the mininum DMA zone is 1MB in the world */
-	if (bits < 20 || bits > 64) {
-		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
-		exit(1);
-	}
-
-	if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE &&
-			dir != DMA_MAP_FROM_DEVICE) {
-		fprintf(stderr, "invalid dma direction\n");
-		exit(1);
-	}
-
-	if (granule < 1 || granule > 1024) {
-		fprintf(stderr, "invalid granule size\n");
-		exit(1);
-	}
-
-	fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
-	if (fd == -1) {
-		perror("open");
-		exit(1);
-	}
-
-	memset(&map, 0, sizeof(map));
-	map.seconds = seconds;
-	map.threads = threads;
-	map.node = node;
-	map.dma_bits = bits;
-	map.dma_dir = dir;
-	map.dma_trans_ns = xdelay;
-	map.granule = granule;
-
-	if (ioctl(fd, cmd, &map)) {
-		perror("ioctl");
-		exit(1);
-	}
-
-	printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
-			threads, seconds, node, dir[directions], granule);
-	printf("average map latency(us):%.1f standard deviation:%.1f\n",
-			map.avg_map_100ns/10.0, map.map_stddev/10.0);
-	printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
-			map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0);
-
-	return 0;
-}
-- 
cgit v1.2.3


From ed7fc3cbb38ffdca7a189e15982ce96acab4684c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 15 Oct 2025 12:12:47 +0300
Subject: dma-mapping: prepare dma_map_ops to conversion to physical address

Add new .map_phys() and .unmap_phys() callbacks to dma_map_ops as a
preparation to replace .map_page() and .unmap_page() respectively.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-1-3bbfe3a25cdf@kernel.org
---
 include/linux/dma-map-ops.h |  7 +++++++
 kernel/dma/mapping.c        |  4 ++++
 kernel/dma/ops_helpers.c    | 12 ++++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 10882d00cb17..79d2a74d4b49 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -37,6 +37,13 @@ struct dma_map_ops {
 	void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
 			size_t size, enum dma_data_direction dir,
 			unsigned long attrs);
+
+	dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys,
+			size_t size, enum dma_data_direction dir,
+			unsigned long attrs);
+	void (*unmap_phys)(struct device *dev, dma_addr_t dma_handle,
+			size_t size, enum dma_data_direction dir,
+			unsigned long attrs);
 	/*
 	 * map_sg should return a negative error code on error. See
 	 * dma_map_sgtable() for a list of appropriate error codes
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index fe7472f13b10..4080aebe5deb 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -169,6 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 		addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
 	else if (use_dma_iommu(dev))
 		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
+	else if (ops->map_phys)
+		addr = ops->map_phys(dev, phys, size, dir, attrs);
 	else if (is_mmio) {
 		if (!ops->map_resource)
 			return DMA_MAPPING_ERROR;
@@ -223,6 +225,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
 		dma_direct_unmap_phys(dev, addr, size, dir, attrs);
 	else if (use_dma_iommu(dev))
 		iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
+	else if (ops->unmap_phys)
+		ops->unmap_phys(dev, addr, size, dir, attrs);
 	else if (is_mmio) {
 		if (ops->unmap_resource)
 			ops->unmap_resource(dev, addr, size, dir, attrs);
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 6f9d604d9d40..1eccbdbc99c1 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct page *page;
+	phys_addr_t phys;
 
 	page = dma_alloc_contiguous(dev, size, gfp);
 	if (!page)
@@ -71,9 +72,13 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
+	phys = page_to_phys(page);
 	if (use_dma_iommu(dev))
-		*dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size,
-						 dir, DMA_ATTR_SKIP_CPU_SYNC);
+		*dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
+						 DMA_ATTR_SKIP_CPU_SYNC);
+	else if (ops->map_phys)
+		*dma_handle = ops->map_phys(dev, phys, size, dir,
+					    DMA_ATTR_SKIP_CPU_SYNC);
 	else
 		*dma_handle = ops->map_page(dev, page, 0, size, dir,
 					    DMA_ATTR_SKIP_CPU_SYNC);
@@ -94,6 +99,9 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
 	if (use_dma_iommu(dev))
 		iommu_dma_unmap_phys(dev, dma_handle, size, dir,
 				     DMA_ATTR_SKIP_CPU_SYNC);
+	else if (ops->unmap_phys)
+		ops->unmap_phys(dev, dma_handle, size, dir,
+				DMA_ATTR_SKIP_CPU_SYNC);
 	else if (ops->unmap_page)
 		ops->unmap_page(dev, dma_handle, size, dir,
 				DMA_ATTR_SKIP_CPU_SYNC);
-- 
cgit v1.2.3


From 14cb413af00c5d3950d1a339dd2b6f01ce313fce Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 15 Oct 2025 12:12:52 +0300
Subject: dma-mapping: remove unused mapping resource callbacks

After ARM and XEN conversions to use physical addresses for the mapping,
there are no in-kernel users for map_resource/unmap_resource callbacks,
so remove them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-6-3bbfe3a25cdf@kernel.org
---
 include/linux/dma-map-ops.h |  6 ------
 kernel/dma/mapping.c        | 16 ++++------------
 2 files changed, 4 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 79d2a74d4b49..2e98ecc313a3 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -53,12 +53,6 @@ struct dma_map_ops {
 			enum dma_data_direction dir, unsigned long attrs);
 	void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents,
 			enum dma_data_direction dir, unsigned long attrs);
-	dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr,
-			size_t size, enum dma_data_direction dir,
-			unsigned long attrs);
-	void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle,
-			size_t size, enum dma_data_direction dir,
-			unsigned long attrs);
 	void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle,
 			size_t size, enum dma_data_direction dir);
 	void (*sync_single_for_device)(struct device *dev,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 4080aebe5deb..32a85bfdf873 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	bool is_mmio = attrs & DMA_ATTR_MMIO;
-	dma_addr_t addr;
+	dma_addr_t addr = DMA_MAPPING_ERROR;
 
 	BUG_ON(!valid_dma_direction(dir));
 
@@ -171,18 +171,13 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
 	else if (ops->map_phys)
 		addr = ops->map_phys(dev, phys, size, dir, attrs);
-	else if (is_mmio) {
-		if (!ops->map_resource)
-			return DMA_MAPPING_ERROR;
-
-		addr = ops->map_resource(dev, phys, size, dir, attrs);
-	} else {
+	else if (!is_mmio && ops->map_page) {
 		struct page *page = phys_to_page(phys);
 		size_t offset = offset_in_page(phys);
 
 		/*
 		 * The dma_ops API contract for ops->map_page() requires
-		 * kmappable memory, while ops->map_resource() does not.
+		 * kmappable memory.
 		 */
 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	}
@@ -227,10 +222,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
 		iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
 	else if (ops->unmap_phys)
 		ops->unmap_phys(dev, addr, size, dir, attrs);
-	else if (is_mmio) {
-		if (ops->unmap_resource)
-			ops->unmap_resource(dev, addr, size, dir, attrs);
-	} else
+	else
 		ops->unmap_page(dev, addr, size, dir, attrs);
 	trace_dma_unmap_phys(dev, addr, size, dir, attrs);
 	debug_dma_unmap_phys(dev, addr, size, dir);
-- 
cgit v1.2.3


From 131971f67e258170c678fe572fda95f8cef88e66 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 15 Oct 2025 12:13:00 +0300
Subject: dma-mapping: remove unused map_page callback

After conversion of arch code to use physical address mapping,
there are no users of .map_page() and .unmap_page() callbacks,
so let's remove them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-14-3bbfe3a25cdf@kernel.org
---
 include/linux/dma-map-ops.h |  7 -------
 kernel/dma/mapping.c        | 12 ------------
 kernel/dma/ops_helpers.c    |  8 +-------
 3 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 2e98ecc313a3..4809204c674c 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -31,13 +31,6 @@ struct dma_map_ops {
 			void *cpu_addr, dma_addr_t dma_addr, size_t size,
 			unsigned long attrs);
 
-	dma_addr_t (*map_page)(struct device *dev, struct page *page,
-			unsigned long offset, size_t size,
-			enum dma_data_direction dir, unsigned long attrs);
-	void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
-			size_t size, enum dma_data_direction dir,
-			unsigned long attrs);
-
 	dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys,
 			size_t size, enum dma_data_direction dir,
 			unsigned long attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 32a85bfdf873..37163eb49f9f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -171,16 +171,6 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
 	else if (ops->map_phys)
 		addr = ops->map_phys(dev, phys, size, dir, attrs);
-	else if (!is_mmio && ops->map_page) {
-		struct page *page = phys_to_page(phys);
-		size_t offset = offset_in_page(phys);
-
-		/*
-		 * The dma_ops API contract for ops->map_page() requires
-		 * kmappable memory.
-		 */
-		addr = ops->map_page(dev, page, offset, size, dir, attrs);
-	}
 
 	if (!is_mmio)
 		kmsan_handle_dma(phys, size, dir);
@@ -222,8 +212,6 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
 		iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
 	else if (ops->unmap_phys)
 		ops->unmap_phys(dev, addr, size, dir, attrs);
-	else
-		ops->unmap_page(dev, addr, size, dir, attrs);
 	trace_dma_unmap_phys(dev, addr, size, dir, attrs);
 	debug_dma_unmap_phys(dev, addr, size, dir);
 }
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 1eccbdbc99c1..20caf9cabf69 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -76,11 +76,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
 	if (use_dma_iommu(dev))
 		*dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
 						 DMA_ATTR_SKIP_CPU_SYNC);
-	else if (ops->map_phys)
-		*dma_handle = ops->map_phys(dev, phys, size, dir,
-					    DMA_ATTR_SKIP_CPU_SYNC);
 	else
-		*dma_handle = ops->map_page(dev, page, 0, size, dir,
+		*dma_handle = ops->map_phys(dev, phys, size, dir,
 					    DMA_ATTR_SKIP_CPU_SYNC);
 	if (*dma_handle == DMA_MAPPING_ERROR) {
 		dma_free_contiguous(dev, page, size);
@@ -102,8 +99,5 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
 	else if (ops->unmap_phys)
 		ops->unmap_phys(dev, dma_handle, size, dir,
 				DMA_ATTR_SKIP_CPU_SYNC);
-	else if (ops->unmap_page)
-		ops->unmap_page(dev, dma_handle, size, dir,
-				DMA_ATTR_SKIP_CPU_SYNC);
 	dma_free_contiguous(dev, page, size);
 }
-- 
cgit v1.2.3


From c31b9d2f589463a7cb286467a618b3b598654890 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Sep 2025 15:44:10 +0200
Subject: unwind: Shorten lines

There are some exceptionally long lines that cause ugly wrapping.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.545274393@infradead.org
---
 include/linux/unwind_deferred.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 26122d00708a..25f4dffebd1b 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -8,7 +8,9 @@
 
 struct unwind_work;
 
-typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+				  struct unwind_stacktrace *trace,
+				  u64 cookie);
 
 struct unwind_work {
 	struct list_head		list;
@@ -68,9 +70,17 @@ static __always_inline void unwind_reset_info(void)
 static inline void unwind_task_init(struct task_struct *task) {}
 static inline void unwind_task_free(struct task_struct *task) {}
 
-static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
-static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
-static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline int unwind_user_faultable(struct unwind_stacktrace *trace)
+{ return -ENOSYS; }
+
+static inline int
+unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{ return -ENOSYS; }
+
+static inline int
+unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
+{ return -ENOSYS; }
+
 static inline void unwind_deferred_cancel(struct unwind_work *work) {}
 
 static inline void unwind_deferred_task_exit(struct task_struct *task) {}
-- 
cgit v1.2.3


From b1164c7d118defb01a885b53f56e3336db784df7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Sep 2025 15:44:59 +0200
Subject: unwind: Add required include files

To be self sufficient, the file needs to include linux/types.h. This
provides things like u32/u64 and struct callback_head.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.665787071@infradead.org
---
 include/linux/unwind_deferred_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 33b62ac25c86..29452ff49859 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
+#include <linux/types.h>
+
 struct unwind_cache {
 	unsigned long		unwind_completed;
 	unsigned int		nr_entries;
-- 
cgit v1.2.3


From 52a1ec718b3eb6da29a76d05a662365a997139cc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Sep 2025 15:46:00 +0200
Subject: unwind: Simplify unwind_reset_info()

Invert the condition of the first if and make it an early exit to
reduce an indent level for the rest of the function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.777916262@infradead.org
---
 include/linux/unwind_deferred.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 25f4dffebd1b..196e12c1449e 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -46,22 +46,22 @@ void unwind_deferred_task_exit(struct task_struct *task);
 static __always_inline void unwind_reset_info(void)
 {
 	struct unwind_task_info *info = &current->unwind_info;
-	unsigned long bits;
+	unsigned long bits = info->unwind_mask;
 
 	/* Was there any unwinding? */
-	if (unlikely(info->unwind_mask)) {
-		bits = info->unwind_mask;
-		do {
-			/* Is a task_work going to run again before going back */
-			if (bits & UNWIND_PENDING)
-				return;
-		} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
-		current->unwind_info.id.id = 0;
-
-		if (unlikely(info->cache)) {
-			info->cache->nr_entries = 0;
-			info->cache->unwind_completed = 0;
-		}
+	if (likely(!bits))
+		return;
+
+	do {
+		/* Is a task_work going to run again before going back */
+		if (bits & UNWIND_PENDING)
+			return;
+	} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
+	current->unwind_info.id.id = 0;
+
+	if (unlikely(info->cache)) {
+		info->cache->nr_entries = 0;
+		info->cache->unwind_completed = 0;
 	}
 }
 
-- 
cgit v1.2.3


From 639214f65b1db87c6992eadf93079ff0d8768c2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Sep 2025 16:09:17 +0200
Subject: unwind: Make unwind_task_info::unwind_mask consistent

The unwind_task_info::unwind_mask was manipulated using a mixture of:

  regular store
  WRITE_ONCE()
  try_cmpxchg()
  set_bit()
  atomic_long_*()

Clean up and make it consistently atomic_long_t.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250924080119.384384486@infradead.org
---
 include/linux/unwind_deferred.h       |  4 ++--
 include/linux/unwind_deferred_types.h |  3 ++-
 kernel/unwind/deferred.c              | 17 +++++++++--------
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 196e12c1449e..f4743c8cff4c 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -46,7 +46,7 @@ void unwind_deferred_task_exit(struct task_struct *task);
 static __always_inline void unwind_reset_info(void)
 {
 	struct unwind_task_info *info = &current->unwind_info;
-	unsigned long bits = info->unwind_mask;
+	unsigned long bits = atomic_long_read(&info->unwind_mask);
 
 	/* Was there any unwinding? */
 	if (likely(!bits))
@@ -56,7 +56,7 @@ static __always_inline void unwind_reset_info(void)
 		/* Is a task_work going to run again before going back */
 		if (bits & UNWIND_PENDING)
 			return;
-	} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
+	} while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL));
 	current->unwind_info.id.id = 0;
 
 	if (unlikely(info->cache)) {
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 29452ff49859..0a4c8ddbbc57 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -3,6 +3,7 @@
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
 #include <linux/types.h>
+#include <linux/atomic.h>
 
 struct unwind_cache {
 	unsigned long		unwind_completed;
@@ -32,7 +33,7 @@ union unwind_task_id {
 };
 
 struct unwind_task_info {
-	unsigned long		unwind_mask;
+	atomic_long_t		unwind_mask;
 	struct unwind_cache	*cache;
 	struct callback_head	work;
 	union unwind_task_id	id;
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 09617d8ae24b..a88fb481c4a3 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu);
 
 static inline bool unwind_pending(struct unwind_task_info *info)
 {
-	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+	return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
 }
 
 /*
@@ -141,7 +141,7 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	cache->nr_entries = trace->nr;
 
 	/* Clear nr_entries on way back to user space */
-	set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+	atomic_long_or(UNWIND_USED, &info->unwind_mask);
 
 	return 0;
 }
@@ -159,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task)
 
 	/* Clear pending bit but make sure to have the current bits */
 	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
-				  (atomic_long_t *)&info->unwind_mask);
+					&info->unwind_mask);
 	/*
 	 * From here on out, the callback must always be called, even if it's
 	 * just an empty trace.
@@ -264,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 
 	*cookie = get_cookie(info);
 
-	old = READ_ONCE(info->unwind_mask);
+	old = atomic_long_read(&info->unwind_mask);
 
 	/* Is this already queued or executed */
 	if (old & bit)
@@ -277,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 	 * to have a callback.
 	 */
 	bits = UNWIND_PENDING | bit;
-	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+	old = atomic_long_fetch_or(bits, &info->unwind_mask);
 	if (old & bits) {
 		/*
 		 * If the work's bit was set, whatever set it had better
@@ -291,7 +291,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 	ret = task_work_add(current, &info->work, twa_mode);
 
 	if (WARN_ON_ONCE(ret))
-		WRITE_ONCE(info->unwind_mask, 0);
+		atomic_long_set(&info->unwind_mask, 0);
 
 	return ret;
 }
@@ -323,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work)
 	guard(rcu)();
 	/* Clear this bit from all threads */
 	for_each_process_thread(g, t) {
-		clear_bit(bit, &t->unwind_info.unwind_mask);
+		atomic_long_andnot(BIT(bit),
+				   &t->unwind_info.unwind_mask);
 		if (t->unwind_info.cache)
 			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
 	}
@@ -353,7 +354,7 @@ void unwind_task_init(struct task_struct *task)
 
 	memset(info, 0, sizeof(*info));
 	init_task_work(&info->work, unwind_deferred_task_work);
-	info->unwind_mask = 0;
+	atomic_long_set(&info->unwind_mask, 0);
 }
 
 void unwind_task_free(struct task_struct *task)
-- 
cgit v1.2.3


From c79dd946e370af3537edb854f210cba3a94b4516 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 23 Sep 2025 13:27:34 +0200
Subject: unwind: Implement compat fp unwind

It is important to be able to unwind compat tasks too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250924080119.613695709@infradead.org
---
 include/linux/unwind_user_types.h |  1 +
 kernel/unwind/user.c              | 40 ++++++++++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index a449f15be890..938f7e623332 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -36,6 +36,7 @@ struct unwind_user_state {
 	unsigned long				ip;
 	unsigned long				sp;
 	unsigned long				fp;
+	unsigned int				ws;
 	enum unwind_user_type			current_type;
 	unsigned int				available_types;
 	bool					done;
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 9dcde797b5d9..642871527a13 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,19 +8,32 @@
 #include <linux/unwind_user.h>
 #include <linux/uaccess.h>
 
-static const struct unwind_user_frame fp_frame = {
-	ARCH_INIT_USER_FP_FRAME
-};
-
 #define for_each_user_frame(state) \
 	for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
 
+static inline int
+get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
+{
+	unsigned long __user *addr = (void __user *)base + off;
+#ifdef CONFIG_COMPAT
+	if (ws == sizeof(int)) {
+		unsigned int data;
+		int ret = get_user(data, (unsigned int __user *)addr);
+		*word = data;
+		return ret;
+	}
+#endif
+	return get_user(*word, addr);
+}
+
 static int unwind_user_next_fp(struct unwind_user_state *state)
 {
-	const struct unwind_user_frame *frame = &fp_frame;
+	const struct unwind_user_frame frame = {
+		ARCH_INIT_USER_FP_FRAME(state->ws)
+	};
 	unsigned long cfa, fp, ra;
 
-	if (frame->use_fp) {
+	if (frame.use_fp) {
 		if (state->fp < state->sp)
 			return -EINVAL;
 		cfa = state->fp;
@@ -29,26 +42,26 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
 	}
 
 	/* Get the Canonical Frame Address (CFA) */
-	cfa += frame->cfa_off;
+	cfa += frame.cfa_off;
 
 	/* stack going in wrong direction? */
 	if (cfa <= state->sp)
 		return -EINVAL;
 
 	/* Make sure that the address is word aligned */
-	if (cfa & (sizeof(long) - 1))
+	if (cfa & (state->ws - 1))
 		return -EINVAL;
 
 	/* Find the Return Address (RA) */
-	if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+	if (get_user_word(&ra, cfa, frame.ra_off, state->ws))
 		return -EINVAL;
 
-	if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+	if (frame.fp_off && get_user_word(&fp, cfa, frame.fp_off, state->ws))
 		return -EINVAL;
 
 	state->ip = ra;
 	state->sp = cfa;
-	if (frame->fp_off)
+	if (frame.fp_off)
 		state->fp = fp;
 	return 0;
 }
@@ -100,6 +113,11 @@ static int unwind_user_start(struct unwind_user_state *state)
 	state->ip = instruction_pointer(regs);
 	state->sp = user_stack_pointer(regs);
 	state->fp = frame_pointer(regs);
+	state->ws = unwind_user_word_size(regs);
+	if (!state->ws) {
+		state->done = true;
+		return -EINVAL;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From ae25884ad749e7f6e0c3565513bdc8aa2554a425 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 24 Oct 2025 12:31:10 +0200
Subject: unwind_user/x86: Teach FP unwind about start of function

When userspace is interrupted at the start of a function, before we
get a chance to complete the frame, unwind will miss one caller.

x86 has a uprobe-specific fixup for this; add bits to the generic
unwinder to support it.

Suggested-by: Jens Remus <jremus@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251024145156.GM4068168@noisy.programming.kicks-ass.net
---
 arch/x86/events/core.c             | 40 --------------------------------------
 arch/x86/include/asm/unwind_user.h | 12 ++++++++++++
 arch/x86/include/asm/uprobes.h     |  9 +++++++++
 arch/x86/kernel/uprobes.c          | 32 ++++++++++++++++++++++++++++++
 include/linux/unwind_user_types.h  |  1 +
 kernel/unwind/user.c               | 39 ++++++++++++++++++++++++++++---------
 6 files changed, 84 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 745caa6c15a3..0cf68ad9dcd0 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2845,46 +2845,6 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc);
 }
 
-#ifdef CONFIG_UPROBES
-/*
- * Heuristic-based check if uprobe is installed at the function entry.
- *
- * Under assumption of user code being compiled with frame pointers,
- * `push %rbp/%ebp` is a good indicator that we indeed are.
- *
- * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
- * If we get this wrong, captured stack trace might have one extra bogus
- * entry, but the rest of stack trace will still be meaningful.
- */
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
-	struct arch_uprobe *auprobe;
-
-	if (!current->utask)
-		return false;
-
-	auprobe = current->utask->auprobe;
-	if (!auprobe)
-		return false;
-
-	/* push %rbp/%ebp */
-	if (auprobe->insn[0] == 0x55)
-		return true;
-
-	/* endbr64 (64-bit only) */
-	if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
-		return true;
-
-	return false;
-}
-
-#else
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
-	return false;
-}
-#endif /* CONFIG_UPROBES */
-
 #ifdef CONFIG_IA32_EMULATION
 
 #include <linux/compat.h>
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index b166e102d444..c4f1ff8874d6 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_UNWIND_USER_H
 
 #include <asm/ptrace.h>
+#include <asm/uprobes.h>
 
 #define ARCH_INIT_USER_FP_FRAME(ws)			\
 	.cfa_off	=  2*(ws),			\
@@ -10,6 +11,12 @@
 	.fp_off		= -2*(ws),			\
 	.use_fp		= true,
 
+#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws)		\
+	.cfa_off	=  1*(ws),			\
+	.ra_off		= -1*(ws),			\
+	.fp_off		= 0,				\
+	.use_fp		= false,
+
 static inline int unwind_user_word_size(struct pt_regs *regs)
 {
 	/* We can't unwind VM86 stacks */
@@ -22,4 +29,9 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
 	return sizeof(long);
 }
 
+static inline bool unwind_user_at_function_start(struct pt_regs *regs)
+{
+	return is_uprobe_at_func_entry(regs);
+}
+
 #endif /* _ASM_X86_UNWIND_USER_H */
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 1ee2e5115955..362210c79998 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -62,4 +62,13 @@ struct arch_uprobe_task {
 	unsigned int			saved_tf;
 };
 
+#ifdef CONFIG_UPROBES
+extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+	return false;
+}
+#endif /* CONFIG_UPROBES */
+
 #endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a563e90832d7..7be8e361ca55 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1791,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
 	else
 		return regs->sp <= ret->stack;
 }
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+	struct arch_uprobe *auprobe;
+
+	if (!current->utask)
+		return false;
+
+	auprobe = current->utask->auprobe;
+	if (!auprobe)
+		return false;
+
+	/* push %rbp/%ebp */
+	if (auprobe->insn[0] == 0x55)
+		return true;
+
+	/* endbr64 (64-bit only) */
+	if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+		return true;
+
+	return false;
+}
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 938f7e623332..412729a269bc 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -39,6 +39,7 @@ struct unwind_user_state {
 	unsigned int				ws;
 	enum unwind_user_type			current_type;
 	unsigned int				available_types;
+	bool					topmost;
 	bool					done;
 };
 
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 642871527a13..39e270789444 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -26,14 +26,12 @@ get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
 	return get_user(*word, addr);
 }
 
-static int unwind_user_next_fp(struct unwind_user_state *state)
+static int unwind_user_next_common(struct unwind_user_state *state,
+				   const struct unwind_user_frame *frame)
 {
-	const struct unwind_user_frame frame = {
-		ARCH_INIT_USER_FP_FRAME(state->ws)
-	};
 	unsigned long cfa, fp, ra;
 
-	if (frame.use_fp) {
+	if (frame->use_fp) {
 		if (state->fp < state->sp)
 			return -EINVAL;
 		cfa = state->fp;
@@ -42,7 +40,7 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
 	}
 
 	/* Get the Canonical Frame Address (CFA) */
-	cfa += frame.cfa_off;
+	cfa += frame->cfa_off;
 
 	/* stack going in wrong direction? */
 	if (cfa <= state->sp)
@@ -53,19 +51,41 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
 		return -EINVAL;
 
 	/* Find the Return Address (RA) */
-	if (get_user_word(&ra, cfa, frame.ra_off, state->ws))
+	if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
 		return -EINVAL;
 
-	if (frame.fp_off && get_user_word(&fp, cfa, frame.fp_off, state->ws))
+	if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
 		return -EINVAL;
 
 	state->ip = ra;
 	state->sp = cfa;
-	if (frame.fp_off)
+	if (frame->fp_off)
 		state->fp = fp;
+	state->topmost = false;
 	return 0;
 }
 
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+	struct pt_regs *regs = task_pt_regs(current);
+
+	if (state->topmost && unwind_user_at_function_start(regs)) {
+		const struct unwind_user_frame fp_entry_frame = {
+			ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
+		};
+		return unwind_user_next_common(state, &fp_entry_frame);
+	}
+
+	const struct unwind_user_frame fp_frame = {
+		ARCH_INIT_USER_FP_FRAME(state->ws)
+	};
+	return unwind_user_next_common(state, &fp_frame);
+#else
+	return -EINVAL;
+#endif
+}
+
 static int unwind_user_next(struct unwind_user_state *state)
 {
 	unsigned long iter_mask = state->available_types;
@@ -118,6 +138,7 @@ static int unwind_user_start(struct unwind_user_state *state)
 		state->done = true;
 		return -EINVAL;
 	}
+	state->topmost = true;
 
 	return 0;
 }
-- 
cgit v1.2.3


From c69993ecdd4dfde2b7da08b022052a33b203da07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Oct 2025 15:17:05 +0200
Subject: perf: Support deferred user unwind

Add support for deferred userspace unwind to perf.

Perf currently relies on in-place stack unwinding, from NMI context
and all that. This moves the userspace part of the unwind to right
before the return-to-userspace.

This has two distinct benefits: the biggest is that it moves the
unwind to a faultable context. It becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. And secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

It is followed by a single entry denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above-mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h            |  2 +-
 include/linux/unwind_deferred.h       | 12 ------
 include/linux/unwind_deferred_types.h | 13 ++++++
 include/uapi/linux/perf_event.h       | 21 +++++++++-
 kernel/bpf/stackmap.c                 |  4 +-
 kernel/events/callchain.c             | 14 ++++++-
 kernel/events/core.c                  | 78 ++++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h | 21 +++++++++-
 8 files changed, 145 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fd1d91017b99..9870d768db4c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index f4743c8cff4c..bc7ae7d21900 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -6,18 +6,6 @@
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
-struct unwind_work;
-
-typedef void (*unwind_callback_t)(struct unwind_work *work,
-				  struct unwind_stacktrace *trace,
-				  u64 cookie);
-
-struct unwind_work {
-	struct list_head		list;
-	unwind_callback_t		func;
-	int				bit;
-};
-
 #ifdef CONFIG_UNWIND_USER
 
 enum {
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 0a4c8ddbbc57..18fa3932f61c 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -39,4 +39,17 @@ struct unwind_task_info {
 	union unwind_task_id	id;
 };
 
+struct unwind_work;
+struct unwind_stacktrace;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+				  struct unwind_stacktrace *trace,
+				  u64 cookie);
+
+struct unwind_work {
+	struct list_head		list;
+	unwind_callback_t		func;
+	int				bit;
+};
+
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..d292f96bc06f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -463,7 +463,9 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+				defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 24;
 
 	union {
 		__u32		wakeup_events;	  /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space.  Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				cookie;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV				= (__u64)-32,
 	PERF_CONTEXT_KERNEL			= (__u64)-128,
 	PERF_CONTEXT_USER			= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED		= (__u64)-640,
 
 	PERF_CONTEXT_GUEST			= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL		= (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..8f1dacaf01fe 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 		max_depth = sysctl_perf_event_max_stack;
 
 	trace = get_perf_callchain(regs, kernel, user, max_depth,
-				   false, false);
+				   false, false, 0);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, kernel, user, max_depth,
-					   crosstask, false);
+					   crosstask, false, 0);
 
 	if (unlikely(!trace) || trace->nr < skip) {
 		if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 			regs = task_pt_regs(current);
 		}
 
+		if (defer_cookie) {
+			/*
+			 * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+			 * which can be stitched to this one, and add
+			 * the cookie after it (it will be cut off when the
+			 * user stack is copied to the callchain).
+			 */
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+			perf_callchain_store_context(&ctx, defer_cookie);
+			goto exit_put;
+		}
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7541f6f85fcb..f6a08c73f783 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
 #include <linux/buildid.h>
 #include <linux/task_work.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
 
 #include "internal.h"
 
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+static struct unwind_work perf_unwind_work;
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 		!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
+	bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+			  event->attr.defer_callchain;
 	const u32 max_stack = event->attr.sample_max_stack;
 	struct perf_callchain_entry *callchain;
+	u64 defer_cookie;
 
 	if (!current->mm)
 		user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return &__empty_callchain;
 
-	callchain = get_perf_callchain(regs, kernel, user,
-				       max_stack, crosstask, true);
+	if (!(user && defer_user && !crosstask &&
+	      unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+		defer_cookie = 0;
+
+	callchain = get_perf_callchain(regs, kernel, user, max_stack,
+				       crosstask, true, defer_cookie);
+
 	return callchain ?: &__empty_callchain;
 }
 
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
 	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_callchain_deferred_event {
+	struct unwind_stacktrace *trace;
+	struct {
+		struct perf_event_header	header;
+		u64				cookie;
+		u64				nr;
+		u64				ips[];
+	} event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+	struct perf_callchain_deferred_event *deferred_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret, size = deferred_event->event.header.size;
+
+	if (!event->attr.defer_output)
+		return;
+
+	/* XXX do we really need sample_id_all for this ??? */
+	perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+	ret = perf_output_begin(&handle, &sample, event,
+				deferred_event->event.header.size);
+	if (ret)
+		goto out;
+
+	perf_output_put(&handle, deferred_event->event);
+	for (int i = 0; i < deferred_event->trace->nr; i++) {
+		u64 entry = deferred_event->trace->entries[i];
+		perf_output_put(&handle, entry);
+	}
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+out:
+	deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+					 struct unwind_stacktrace *trace, u64 cookie)
+{
+	struct perf_callchain_deferred_event deferred_event = {
+		.trace = trace,
+		.event = {
+			.header = {
+				.type = PERF_RECORD_CALLCHAIN_DEFERRED,
+				.misc = PERF_RECORD_MISC_USER,
+				.size = sizeof(deferred_event.event) +
+					(trace->nr * sizeof(u64)),
+			},
+			.cookie = cookie,
+			.nr = trace->nr,
+		},
+	};
+
+	perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
 struct perf_text_poke_event {
 	const void		*old_bytes;
 	const void		*new_bytes;
@@ -14799,6 +14870,9 @@ void __init perf_event_init(void)
 
 	idr_init(&pmu_idr);
 
+	unwind_deferred_init(&perf_unwind_work,
+			     perf_unwind_deferred_callback);
+
 	perf_event_init_all_cpus();
 	init_srcu_struct(&pmus_srcu);
 	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 78a362b80027..d292f96bc06f 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -463,7 +463,9 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+				defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 24;
 
 	union {
 		__u32		wakeup_events;	  /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space.  Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				cookie;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV				= (__u64)-32,
 	PERF_CONTEXT_KERNEL			= (__u64)-128,
 	PERF_CONTEXT_USER			= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED		= (__u64)-640,
 
 	PERF_CONTEXT_GUEST			= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL		= (__u64)-2176,
-- 
cgit v1.2.3


From 9c7f7262bc1affb9b9acd2ec2fb1f6314d5d474c Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Wed, 29 Oct 2025 09:12:47 +0100
Subject: regmap: add flat cache with sparse validity

The flat regcache will always assume the data in the cache is valid.
Since the cache is preferred over hardware access, this may shadow the
actual state of the device.

Add a new containing cache structure with the flat data table and a
bitmap indicating cache validity. REGCACHE_FLAT will still behave as
before, as the validity is ignored.

Define new cache type REGCACHE_FLAT_S: a flat cache with sparse
validity. The sparse validity is used to determine if a hardware access
should occur to initialize the cache on the fly, vs. at regmap init for
REGCACHE_FLAT. Contrary to REGCACHE_FLAT, this allows us to implement
regcache_ops.drop.

Signed-off-by: Sander Vanheule <sander@svanheule.net>
Link: https://patch.msgid.link/20251029081248.52607-2-sander@svanheule.net
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h      |   1 +
 drivers/base/regmap/regcache-flat.c | 102 ++++++++++++++++++++++++++++++++----
 drivers/base/regmap/regcache.c      |   1 +
 drivers/base/regmap/regmap-kunit.c  |  22 ++++++++
 include/linux/regmap.h              |  17 +++---
 5 files changed, 126 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 6f31240ee4a9..8d19a1414d5b 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -288,6 +288,7 @@ enum regmap_endian regmap_get_val_endian(struct device *dev,
 					 const struct regmap_bus *bus,
 					 const struct regmap_config *config);
 
+extern struct regcache_ops regcache_flat_sparse_ops;
 extern struct regcache_ops regcache_rbtree_ops;
 extern struct regcache_ops regcache_maple_ops;
 extern struct regcache_ops regcache_flat_ops;
diff --git a/drivers/base/regmap/regcache-flat.c b/drivers/base/regmap/regcache-flat.c
index f36d3618b67c..86f7679175b1 100644
--- a/drivers/base/regmap/regcache-flat.c
+++ b/drivers/base/regmap/regcache-flat.c
@@ -6,7 +6,11 @@
 //
 // Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
 
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
 #include <linux/device.h>
+#include <linux/limits.h>
+#include <linux/overflow.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 
@@ -18,34 +22,62 @@ static inline unsigned int regcache_flat_get_index(const struct regmap *map,
 	return regcache_get_index_by_order(map, reg);
 }
 
+struct regcache_flat_data {
+	unsigned long *valid;
+	unsigned int data[];
+};
+
 static int regcache_flat_init(struct regmap *map)
 {
 	int i;
-	unsigned int *cache;
+	size_t cache_data_size;
+	unsigned int cache_size;
+	struct regcache_flat_data *cache;
 
 	if (!map || map->reg_stride_order < 0 || !map->max_register_is_set)
 		return -EINVAL;
 
-	map->cache = kcalloc(regcache_flat_get_index(map, map->max_register)
-			     + 1, sizeof(unsigned int), map->alloc_flags);
-	if (!map->cache)
+	cache_size = regcache_flat_get_index(map, map->max_register) + 1;
+	cache_data_size = struct_size(cache, data, cache_size);
+
+	if (cache_data_size == SIZE_MAX) {
+		dev_err(map->dev, "cannot allocate regmap cache\n");
 		return -ENOMEM;
+	}
 
-	cache = map->cache;
+	cache = kzalloc(cache_data_size, map->alloc_flags);
+	if (!cache)
+		return -ENOMEM;
+
+	cache->valid = bitmap_zalloc(cache_size, map->alloc_flags);
+	if (!cache->valid)
+		goto err_free;
+
+	map->cache = cache;
 
 	for (i = 0; i < map->num_reg_defaults; i++) {
 		unsigned int reg = map->reg_defaults[i].reg;
 		unsigned int index = regcache_flat_get_index(map, reg);
 
-		cache[index] = map->reg_defaults[i].def;
+		cache->data[index] = map->reg_defaults[i].def;
+		__set_bit(index, cache->valid);
 	}
 
 	return 0;
+
+err_free:
+	kfree(cache);
+	return -ENOMEM;
 }
 
 static int regcache_flat_exit(struct regmap *map)
 {
-	kfree(map->cache);
+	struct regcache_flat_data *cache = map->cache;
+
+	if (cache)
+		bitmap_free(cache->valid);
+
+	kfree(cache);
 	map->cache = NULL;
 
 	return 0;
@@ -54,10 +86,24 @@ static int regcache_flat_exit(struct regmap *map)
 static int regcache_flat_read(struct regmap *map,
 			      unsigned int reg, unsigned int *value)
 {
-	unsigned int *cache = map->cache;
+	struct regcache_flat_data *cache = map->cache;
 	unsigned int index = regcache_flat_get_index(map, reg);
 
-	*value = cache[index];
+	*value = cache->data[index];
+
+	return 0;
+}
+
+static int regcache_flat_sparse_read(struct regmap *map,
+				     unsigned int reg, unsigned int *value)
+{
+	struct regcache_flat_data *cache = map->cache;
+	unsigned int index = regcache_flat_get_index(map, reg);
+
+	if (unlikely(!test_bit(index, cache->valid)))
+		return -ENOENT;
+
+	*value = cache->data[index];
 
 	return 0;
 }
@@ -65,10 +111,34 @@ static int regcache_flat_read(struct regmap *map,
 static int regcache_flat_write(struct regmap *map, unsigned int reg,
 			       unsigned int value)
 {
-	unsigned int *cache = map->cache;
+	struct regcache_flat_data *cache = map->cache;
 	unsigned int index = regcache_flat_get_index(map, reg);
 
-	cache[index] = value;
+	cache->data[index] = value;
+
+	return 0;
+}
+
+static int regcache_flat_sparse_write(struct regmap *map, unsigned int reg,
+				      unsigned int value)
+{
+	struct regcache_flat_data *cache = map->cache;
+	unsigned int index = regcache_flat_get_index(map, reg);
+
+	cache->data[index] = value;
+	__set_bit(index, cache->valid);
+
+	return 0;
+}
+
+static int regcache_flat_drop(struct regmap *map, unsigned int min,
+			      unsigned int max)
+{
+	struct regcache_flat_data *cache = map->cache;
+	unsigned int bitmap_min = regcache_flat_get_index(map, min);
+	unsigned int bitmap_max = regcache_flat_get_index(map, max);
+
+	bitmap_clear(cache->valid, bitmap_min, bitmap_max + 1 - bitmap_min);
 
 	return 0;
 }
@@ -81,3 +151,13 @@ struct regcache_ops regcache_flat_ops = {
 	.read = regcache_flat_read,
 	.write = regcache_flat_write,
 };
+
+struct regcache_ops regcache_flat_sparse_ops = {
+	.type = REGCACHE_FLAT_S,
+	.name = "flat-sparse",
+	.init = regcache_flat_init,
+	.exit = regcache_flat_exit,
+	.read = regcache_flat_sparse_read,
+	.write = regcache_flat_sparse_write,
+	.drop = regcache_flat_drop,
+};
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index c7650fa434ad..0392f5525cf3 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -16,6 +16,7 @@
 #include "internal.h"
 
 static const struct regcache_ops *cache_types[] = {
+	&regcache_flat_sparse_ops,
 	&regcache_rbtree_ops,
 	&regcache_maple_ops,
 	&regcache_flat_ops,
diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c
index 95c5bf2a78ee..f6fc5ed016da 100644
--- a/drivers/base/regmap/regmap-kunit.c
+++ b/drivers/base/regmap/regmap-kunit.c
@@ -54,6 +54,8 @@ static const char *regcache_type_name(enum regcache_type type)
 		return "none";
 	case REGCACHE_FLAT:
 		return "flat";
+	case REGCACHE_FLAT_S:
+		return "flat-sparse";
 	case REGCACHE_RBTREE:
 		return "rbtree";
 	case REGCACHE_MAPLE:
@@ -93,6 +95,8 @@ static const struct regmap_test_param regcache_types_list[] = {
 	{ .cache = REGCACHE_NONE, .fast_io = true },
 	{ .cache = REGCACHE_FLAT },
 	{ .cache = REGCACHE_FLAT, .fast_io = true },
+	{ .cache = REGCACHE_FLAT_S },
+	{ .cache = REGCACHE_FLAT_S, .fast_io = true },
 	{ .cache = REGCACHE_RBTREE },
 	{ .cache = REGCACHE_RBTREE, .fast_io = true },
 	{ .cache = REGCACHE_MAPLE },
@@ -104,6 +108,8 @@ KUNIT_ARRAY_PARAM(regcache_types, regcache_types_list, param_to_desc);
 static const struct regmap_test_param real_cache_types_only_list[] = {
 	{ .cache = REGCACHE_FLAT },
 	{ .cache = REGCACHE_FLAT, .fast_io = true },
+	{ .cache = REGCACHE_FLAT_S },
+	{ .cache = REGCACHE_FLAT_S, .fast_io = true },
 	{ .cache = REGCACHE_RBTREE },
 	{ .cache = REGCACHE_RBTREE, .fast_io = true },
 	{ .cache = REGCACHE_MAPLE },
@@ -119,6 +125,12 @@ static const struct regmap_test_param real_cache_types_list[] = {
 	{ .cache = REGCACHE_FLAT,   .from_reg = 0x2002 },
 	{ .cache = REGCACHE_FLAT,   .from_reg = 0x2003 },
 	{ .cache = REGCACHE_FLAT,   .from_reg = 0x2004 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0, .fast_io = true },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2001 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2002 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2003 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2004 },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0 },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0, .fast_io = true },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0x2001 },
@@ -136,6 +148,12 @@ static const struct regmap_test_param real_cache_types_list[] = {
 KUNIT_ARRAY_PARAM(real_cache_types, real_cache_types_list, param_to_desc);
 
 static const struct regmap_test_param sparse_cache_types_list[] = {
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0, .fast_io = true },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2001 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2002 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2003 },
+	{ .cache = REGCACHE_FLAT_S, .from_reg = 0x2004 },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0 },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0, .fast_io = true },
 	{ .cache = REGCACHE_RBTREE, .from_reg = 0x2001 },
@@ -1597,6 +1615,8 @@ static const struct regmap_test_param raw_types_list[] = {
 	{ .cache = REGCACHE_NONE,   .val_endian = REGMAP_ENDIAN_BIG },
 	{ .cache = REGCACHE_FLAT,   .val_endian = REGMAP_ENDIAN_LITTLE },
 	{ .cache = REGCACHE_FLAT,   .val_endian = REGMAP_ENDIAN_BIG },
+	{ .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_LITTLE },
+	{ .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_BIG },
 	{ .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_LITTLE },
 	{ .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_BIG },
 	{ .cache = REGCACHE_MAPLE,  .val_endian = REGMAP_ENDIAN_LITTLE },
@@ -1608,6 +1628,8 @@ KUNIT_ARRAY_PARAM(raw_test_types, raw_types_list, param_to_desc);
 static const struct regmap_test_param raw_cache_types_list[] = {
 	{ .cache = REGCACHE_FLAT,   .val_endian = REGMAP_ENDIAN_LITTLE },
 	{ .cache = REGCACHE_FLAT,   .val_endian = REGMAP_ENDIAN_BIG },
+	{ .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_LITTLE },
+	{ .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_BIG },
 	{ .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_LITTLE },
 	{ .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_BIG },
 	{ .cache = REGCACHE_MAPLE,  .val_endian = REGMAP_ENDIAN_LITTLE },
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4e1ac1fbcec4..17bed25dc4e3 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -55,18 +55,23 @@ struct sdw_slave;
 #define REGMAP_DOWNSHIFT(s)	(s)
 
 /*
- * The supported cache types, the default is no cache.  Any new caches
- * should usually use the maple tree cache unless they specifically
- * require that there are never any allocations at runtime and can't
- * provide defaults in which case they should use the flat cache.  The
- * rbtree cache *may* have some performance advantage for very low end
- * systems that make heavy use of cache syncs but is mainly legacy.
+ * The supported cache types, the default is no cache.  Any new caches should
+ * usually use the maple tree cache unless they specifically require that there
+ * are never any allocations at runtime in which case they should use the sparse
+ * flat cache.  The rbtree cache *may* have some performance advantage for very
+ * low end systems that make heavy use of cache syncs but is mainly legacy.
+ * These caches are sparse and entries will be initialized from hardware if no
+ * default has been provided.
+ * The non-sparse flat cache is provided for compatibility with existing users
+ * and will zero-initialize cache entries for which no defaults are provided.
+ * New users should use the sparse flat cache.
  */
 enum regcache_type {
 	REGCACHE_NONE,
 	REGCACHE_RBTREE,
 	REGCACHE_FLAT,
 	REGCACHE_MAPLE,
+	REGCACHE_FLAT_S,
 };
 
 /**
-- 
cgit v1.2.3


From 7fabcb7fbabbcddd9dc42dbe4c92d18ce3e54283 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2025 10:04:17 +0200
Subject: mm,btrfs: add a filemap_flush_nr helper

Abstract out the btrfs-specific behavior of kicking off I/O on a number
of pages on an address_space into a well-defined helper.

Note: there is no kerneldoc comment for the new function because it is
not part of the public API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-7-hch@lst.de
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/btrfs/inode.c        | 13 ++-----------
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 22 ++++++++++++++++++++++
 3 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b97d6c1f7772..d12b8116adde 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8752,19 +8752,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
 			btrfs_queue_work(root->fs_info->flush_workers,
 					 &work->work);
 		} else {
-			struct writeback_control wbc = {
-				.nr_to_write = *nr_to_write,
-				.sync_mode = WB_SYNC_NONE,
-				.range_start = 0,
-				.range_end = LLONG_MAX,
-			};
-
-			ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping,
-					&wbc);
+			ret = filemap_flush_nr(tmp_inode->i_mapping,
+					nr_to_write);
 			btrfs_add_delayed_iput(inode);
 
-			if (*nr_to_write != LONG_MAX)
-				*nr_to_write = wbc.nr_to_write;
 			if (ret || *nr_to_write <= 0)
 				goto out;
 		}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..cebdf160d3dd 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping,
 int write_inode_now(struct inode *, int sync);
 int filemap_fdatawrite(struct address_space *);
 int filemap_flush(struct address_space *);
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write);
 int filemap_fdatawait_keep_errors(struct address_space *mapping);
 int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
 int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index 99d6919af60d..e344b79a012d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -474,6 +474,28 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
+/*
+ * Start writeback on @nr_to_write pages from @mapping.  No one but the existing
+ * btrfs caller should be using this.  Talk to linux-mm if you think adding a
+ * new caller is a good idea.
+ */
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
+{
+	struct writeback_control wbc = {
+		.nr_to_write = *nr_to_write,
+		.sync_mode = WB_SYNC_NONE,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+	int ret;
+
+	ret = filemap_fdatawrite_wbc(mapping, &wbc);
+	if (!ret)
+		*nr_to_write = wbc.nr_to_write;
+	return ret;
+}
+EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
+
 /**
  * filemap_range_has_page - check if a page exists in range.
  * @mapping:           address space within which to check
-- 
cgit v1.2.3


From 1bcb413d0cd80efb386751910036a93147fd8dbc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2025 10:04:19 +0200
Subject: mm: remove filemap_fdatawrite_wbc

Replace filemap_fdatawrite_wbc, which exposes a writeback_control to the
callers, with a filemap_writeback helper that takes all the possible
arguments and declares the writeback_control itself.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c       |  6 +++---
 include/linux/pagemap.h |  2 --
 mm/filemap.c            | 54 +++++++++++++++++--------------------------------
 3 files changed, 21 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e76192d140e3..4448de35ec8b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -822,9 +822,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
  * @wbc: writeback_control of interest
  * @inode: target inode
  *
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
+ * This function is to be used by filemap_writeback(), which is an alternative
+ * entry point into writeback code, and first ensures @inode is associated with
+ * a bdi_writeback and attaches it to @wbc.
  */
 void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
 		struct inode *inode)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cebdf160d3dd..678d8ae23d01 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -60,8 +60,6 @@ int filemap_fdatawrite_range(struct address_space *mapping,
 		loff_t start, loff_t end);
 int filemap_check_errors(struct address_space *mapping);
 void __filemap_set_wb_err(struct address_space *mapping, int err);
-int filemap_fdatawrite_wbc(struct address_space *mapping,
-			   struct writeback_control *wbc);
 int kiocb_write_and_wait(struct kiocb *iocb, size_t count);
 
 static inline int filemap_write_and_wait(struct address_space *mapping)
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4c4a96c586..7126d0587c94 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -366,31 +366,30 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
 	return 0;
 }
 
-/**
- * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
- * @mapping:	address space structure to write
- * @wbc:	the writeback_control controlling the writeout
- *
- * Call writepages on the mapping using the provided wbc to control the
- * writeout.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int filemap_fdatawrite_wbc(struct address_space *mapping,
-			   struct writeback_control *wbc)
+static int filemap_writeback(struct address_space *mapping, loff_t start,
+		loff_t end, enum writeback_sync_modes sync_mode,
+		long *nr_to_write)
 {
+	struct writeback_control wbc = {
+		.sync_mode	= sync_mode,
+		.nr_to_write	= nr_to_write ? *nr_to_write : LONG_MAX,
+		.range_start	= start,
+		.range_end	= end,
+	};
 	int ret;
 
 	if (!mapping_can_writeback(mapping) ||
 	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	wbc_attach_fdatawrite_inode(wbc, mapping->host);
-	ret = do_writepages(mapping, wbc);
-	wbc_detach_inode(wbc);
+	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+	ret = do_writepages(mapping, &wbc);
+	wbc_detach_inode(&wbc);
+
+	if (!ret && nr_to_write)
+		*nr_to_write = wbc.nr_to_write;
 	return ret;
 }
-EXPORT_SYMBOL(filemap_fdatawrite_wbc);
 
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
@@ -412,14 +411,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_wbc);
 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 				loff_t end, int sync_mode)
 {
-	struct writeback_control wbc = {
-		.sync_mode = sync_mode,
-		.nr_to_write = LONG_MAX,
-		.range_start = start,
-		.range_end = end,
-	};
-
-	return filemap_fdatawrite_wbc(mapping, &wbc);
+	return filemap_writeback(mapping, start, end, sync_mode, NULL);
 }
 
 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
@@ -475,18 +467,8 @@ EXPORT_SYMBOL(filemap_flush);
  */
 int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
 {
-	struct writeback_control wbc = {
-		.nr_to_write = *nr_to_write,
-		.sync_mode = WB_SYNC_NONE,
-		.range_start = 0,
-		.range_end = LLONG_MAX,
-	};
-	int ret;
-
-	ret = filemap_fdatawrite_wbc(mapping, &wbc);
-	if (!ret)
-		*nr_to_write = wbc.nr_to_write;
-	return ret;
+	return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
+			nr_to_write);
 }
 EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
 
-- 
cgit v1.2.3


From 45cbce5b8877f339b72548f60aa97634044c255c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2025 10:04:20 +0200
Subject: mm: remove __filemap_fdatawrite_range

Use filemap_fdatawrite_range and filemap_fdatawrite_range_kick instead
of the low-level __filemap_fdatawrite_range, which requires the caller
to know the internals of the writeback_control structure.  Then remove
__filemap_fdatawrite_range, as it is now trivial and only two callers
would be left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-10-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/sync.c               | 11 +++++------
 include/linux/pagemap.h |  2 --
 mm/fadvise.c            |  3 +--
 mm/filemap.c            | 25 +++++++------------------
 4 files changed, 13 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/fs/sync.c b/fs/sync.c
index 2955cd4c77a3..6d8b04e04c3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -280,14 +280,13 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
-		int sync_mode = WB_SYNC_NONE;
-
 		if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
 			     SYNC_FILE_RANGE_WRITE_AND_WAIT)
-			sync_mode = WB_SYNC_ALL;
-
-		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						 sync_mode);
+			ret = filemap_fdatawrite_range(mapping, offset,
+					endbyte);
+		else
+			ret = filemap_fdatawrite_range_kick(mapping, offset,
+					endbyte);
 		if (ret < 0)
 			goto out;
 	}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 678d8ae23d01..d0a7dd43c835 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -54,8 +54,6 @@ static inline int filemap_fdatawait(struct address_space *mapping)
 bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
 int filemap_write_and_wait_range(struct address_space *mapping,
 		loff_t lstart, loff_t lend);
-int __filemap_fdatawrite_range(struct address_space *mapping,
-		loff_t start, loff_t end, int sync_mode);
 int filemap_fdatawrite_range(struct address_space *mapping,
 		loff_t start, loff_t end);
 int filemap_check_errors(struct address_space *mapping);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 588fe76c5a14..f1be619f0e58 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_DONTNEED:
-		__filemap_fdatawrite_range(mapping, offset, endbyte,
-					   WB_SYNC_NONE);
+		filemap_fdatawrite_range_kick(mapping, offset, endbyte);
 
 		/*
 		 * First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index 7126d0587c94..f90f5bb2b825 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -392,32 +392,23 @@ static int filemap_writeback(struct address_space *mapping, loff_t start,
 }
 
 /**
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
  * @start:	offset in bytes where the range starts
  * @end:	offset in bytes where the range ends (inclusive)
- * @sync_mode:	enable synchronous operation
  *
  * Start writeback against all of a mapping's dirty pages that lie
  * within the byte offsets <start, end> inclusive.
  *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback.  The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
+ * This is a data integrity operation that waits upon dirty or in writeback
+ * pages.
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-				loff_t end, int sync_mode)
-{
-	return filemap_writeback(mapping, start, end, sync_mode, NULL);
-}
-
 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 		loff_t end)
 {
-	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+	return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
 }
 EXPORT_SYMBOL(filemap_fdatawrite_range);
 
@@ -441,7 +432,7 @@ EXPORT_SYMBOL(filemap_fdatawrite);
 int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
 				  loff_t end)
 {
-	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+	return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
 }
 EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
 
@@ -689,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 		return 0;
 
 	if (mapping_needs_writeback(mapping)) {
-		err = __filemap_fdatawrite_range(mapping, lstart, lend,
-						 WB_SYNC_ALL);
+		err = filemap_fdatawrite_range(mapping, lstart, lend);
 		/*
 		 * Even if the above returned error, the pages may be
 		 * written partially (e.g. -ENOSPC), so we wait for it.
@@ -792,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
 		return 0;
 
 	if (mapping_needs_writeback(mapping)) {
-		err = __filemap_fdatawrite_range(mapping, lstart, lend,
-						 WB_SYNC_ALL);
+		err = filemap_fdatawrite_range(mapping, lstart, lend);
 		/* See comment of filemap_write_and_wait() */
 		if (err != -EIO)
 			__filemap_fdatawait_range(mapping, lstart, lend);
-- 
cgit v1.2.3


From c28d67b33cbf6da2043ee7517f1aa4cbf92dbbba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2025 10:04:21 +0200
Subject: mm: rename filemap_fdatawrite_range_kick to filemap_flush_range

Rename filemap_fdatawrite_range_kick to filemap_flush_range because it
is the ranged version of filemap_flush.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-11-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/sync.c          | 3 +--
 include/linux/fs.h | 6 +++---
 mm/fadvise.c       | 2 +-
 mm/filemap.c       | 8 ++++----
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/sync.c b/fs/sync.c
index 6d8b04e04c3c..1759f6ba36cd 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -285,8 +285,7 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
 			ret = filemap_fdatawrite_range(mapping, offset,
 					endbyte);
 		else
-			ret = filemap_fdatawrite_range_kick(mapping, offset,
-					endbyte);
+			ret = filemap_flush_range(mapping, offset, endbyte);
 		if (ret < 0)
 			goto out;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..a5dbfa20f8d7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3014,7 +3014,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
 extern int __must_check file_check_and_advance_wb_err(struct file *file);
 extern int __must_check file_write_and_wait_range(struct file *file,
 						loff_t start, loff_t end);
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
 		loff_t end);
 
 static inline int file_write_and_wait(struct file *file)
@@ -3051,8 +3051,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 	} else if (iocb->ki_flags & IOCB_DONTCACHE) {
 		struct address_space *mapping = iocb->ki_filp->f_mapping;
 
-		filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
-					      iocb->ki_pos - 1);
+		filemap_flush_range(mapping, iocb->ki_pos - count,
+				iocb->ki_pos - 1);
 	}
 
 	return count;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index f1be619f0e58..67028e30aa91 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -111,7 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_DONTNEED:
-		filemap_fdatawrite_range_kick(mapping, offset, endbyte);
+		filemap_flush_range(mapping, offset, endbyte);
 
 		/*
 		 * First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index f90f5bb2b825..fa770768ea3a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -419,7 +419,7 @@ int filemap_fdatawrite(struct address_space *mapping)
 EXPORT_SYMBOL(filemap_fdatawrite);
 
 /**
- * filemap_fdatawrite_range_kick - start writeback on a range
+ * filemap_flush_range - start writeback on a range
  * @mapping:	target address_space
  * @start:	index to start writeback on
  * @end:	last (inclusive) index for writeback
@@ -429,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite);
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
 				  loff_t end)
 {
 	return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
 }
-EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+EXPORT_SYMBOL_GPL(filemap_flush_range);
 
 /**
  * filemap_flush - mostly a non-blocking flush
@@ -447,7 +447,7 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
  */
 int filemap_flush(struct address_space *mapping)
 {
-	return filemap_fdatawrite_range_kick(mapping, 0, LLONG_MAX);
+	return filemap_flush_range(mapping, 0, LLONG_MAX);
 }
 EXPORT_SYMBOL(filemap_flush);
 
-- 
cgit v1.2.3


From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Oct 2025 05:45:48 +0200
Subject: writeback: allow the file system to override MIN_WRITEBACK_PAGES

The relatively low minimal writeback size of 4MiB means that writeback
switches between inodes a lot on rotational media.  Besides introducing
additional seeks, this can also lead to extreme file fragmentation on
zoned devices when a lot of files are cached relative to the available
writeback bandwidth.

Add a superblock field that allows the file system to override the
default size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c         | 14 +++++---------
 fs/super.c                |  1 +
 include/linux/fs.h        |  1 +
 include/linux/writeback.h |  5 +++++
 4 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 30de37865fa1..52763fa499d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -32,11 +32,6 @@
 #include <linux/memcontrol.h>
 #include "internal.h"
 
-/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -1889,8 +1884,8 @@ out:
 	return ret;
 }
 
-static long writeback_chunk_size(struct bdi_writeback *wb,
-				 struct wb_writeback_work *work)
+static long writeback_chunk_size(struct super_block *sb,
+		struct bdi_writeback *wb, struct wb_writeback_work *work)
 {
 	long pages;
 
@@ -1913,7 +1908,8 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
 	pages = min(wb->avg_write_bandwidth / 2,
 		    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 	pages = min(pages, work->nr_pages);
-	return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
+	return round_down(pages + sb->s_min_writeback_pages,
+			sb->s_min_writeback_pages);
 }
 
 /*
@@ -2015,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		inode->i_state |= I_SYNC;
 		wbc_attach_and_unlock_inode(&wbc, inode);
 
-		write_chunk = writeback_chunk_size(wb, work);
+		write_chunk = writeback_chunk_size(inode->i_sb, wb, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..599c1d2641fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 		goto fail;
+	s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
 	return s;
 
 fail:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a5dbfa20f8d7..6bf369095d2e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1583,6 +1583,7 @@ struct super_block {
 
 	spinlock_t		s_inode_wblist_lock;
 	struct list_head	s_inodes_wb;	/* writeback inodes */
+	long			s_min_writeback_pages;
 } __randomize_layout;
 
 static inline struct user_namespace *i_user_ns(const struct inode *inode)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 22dd4adc5667..49e1dd96f43e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *);
 void sb_mark_inode_writeback(struct inode *inode);
 void sb_clear_inode_writeback(struct inode *inode);
 
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
+
 #endif		/* WRITEBACK_H */
-- 
cgit v1.2.3


From 4952f35f0545f3b53dab8d5fd727c4827c2a2778 Mon Sep 17 00:00:00 2001
From: Julian Sun <sunjunchao@bytedance.com>
Date: Mon, 29 Sep 2025 19:13:49 +0800
Subject: fs: Make wbc_to_tag() inline and use it in fs.

The logic in wbc_to_tag() is widely used in file systems, so modify this
function to be inline and use it in file systems.

This patch has only passed compilation tests, but it should be fine.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/btrfs/extent_io.c      | 5 +----
 fs/ceph/addr.c            | 6 +-----
 fs/ext4/inode.c           | 5 +----
 fs/f2fs/data.c            | 5 +----
 fs/gfs2/aops.c            | 5 +----
 include/linux/writeback.h | 7 +++++++
 mm/page-writeback.c       | 6 ------
 7 files changed, 12 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c123a3ef154a..170dd7e80d11 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2460,10 +2460,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
 			       &BTRFS_I(inode)->runtime_flags))
 		wbc->tagged_writepages = 1;
 
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag = PAGECACHE_TAG_TOWRITE;
-	else
-		tag = PAGECACHE_TAG_DIRTY;
+	tag = wbc_to_tag(wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 322ed268f14a..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping,
 	ceph_wbc->index = ceph_wbc->start_index;
 	ceph_wbc->end = -1;
 
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
-		ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
-	} else {
-		ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
-	}
+	ceph_wbc->tag = wbc_to_tag(wbc);
 
 	ceph_wbc->op_idx = -1;
 	ceph_wbc->num_ops = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9e4ac87211e..58d6194045e2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2619,10 +2619,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	handle_t *handle = NULL;
 	int bpp = ext4_journal_blocks_per_folio(mpd->inode);
 
-	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
-		tag = PAGECACHE_TAG_TOWRITE;
-	else
-		tag = PAGECACHE_TAG_DIRTY;
+	tag = wbc_to_tag(mpd->wbc);
 
 	mpd->map.m_len = 0;
 	mpd->next_pos = mpd->start_pos;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ef38e62cda8f..826bcfb8230c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag = PAGECACHE_TAG_TOWRITE;
-	else
-		tag = PAGECACHE_TAG_DIRTY;
+	tag = wbc_to_tag(wbc);
 retry:
 	retry = 0;
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 47d74afd63ac..12394fc5dd29 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag = PAGECACHE_TAG_TOWRITE;
-	else
-		tag = PAGECACHE_TAG_DIRTY;
+	tag = wbc_to_tag(wbc);
 
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 49e1dd96f43e..2a81816f7507 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -196,6 +196,13 @@ static inline void wait_on_inode(struct inode *inode)
 		       !(READ_ONCE(inode->i_state) & I_NEW));
 }
 
+static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
+{
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		return PAGECACHE_TAG_TOWRITE;
+	return PAGECACHE_TAG_DIRTY;
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 #include <linux/cgroup.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 757bc4d3b5b5..a124ab6a205d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping,
 	return true;
 }
 
-static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
-{
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		return PAGECACHE_TAG_TOWRITE;
-	return PAGECACHE_TAG_DIRTY;
-}
 
 static pgoff_t wbc_end(struct writeback_control *wbc)
 {
-- 
cgit v1.2.3


From f0e7036fc9cb08bdfb27d64eee7fc003ba0bc2e5 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Mon, 27 Oct 2025 10:22:30 +0200
Subject: ipv4: icmp: Add RFC 5837 support

Add the ability to append the incoming IP interface information to
ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb as the caller of __icmp_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  17 +++
 include/linux/icmp.h                   |  32 ++++++
 include/net/netns/ipv4.h               |   1 +
 net/ipv4/icmp.c                        | 191 ++++++++++++++++++++++++++++++++-
 net/ipv4/sysctl_net_ipv4.c             |  11 ++
 5 files changed, 251 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a06cb99d66dc..ece1187ba0f1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1796,6 +1796,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
 
 	Default: 0 (disabled)
 
+icmp_errors_extension_mask - UNSIGNED INTEGER
+	Bitmask of ICMP extensions to append to ICMPv4 error messages
+	("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
+	The original datagram is trimmed / padded to 128 bytes in order to be
+	compatible with applications that do not comply with RFC 4884.
+
+	Possible extensions are:
+
+	==== ==============================================================
+	0x01 Incoming IP interface information according to RFC 5837.
+	     Extension will include the index, IPv4 address (if present),
+	     name and MTU of the IP interface that received the datagram
+	     which elicited the ICMP error.
+	==== ==============================================================
+
+	Default: 0x00 (no extensions)
+
 igmp_max_memberships - INTEGER
 	Change the maximum number of multicast groups we can subscribe to.
 	Default: 20
diff --git a/include/linux/icmp.h b/include/linux/icmp.h
index 0af4d210ee31..043ec5d9c882 100644
--- a/include/linux/icmp.h
+++ b/include/linux/icmp.h
@@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb,
 			   struct sock_ee_data_rfc4884 *out,
 			   int thlen, int off);
 
+/* RFC 4884 */
+#define ICMP_EXT_ORIG_DGRAM_MIN_LEN	128
+#define ICMP_EXT_VERSION_2		2
+
+/* ICMP Extension Object Classes */
+#define ICMP_EXT_OBJ_CLASS_IIO		2	/* RFC 5837 */
+
+/* Interface Information Object - RFC 5837 */
+enum {
+	ICMP_EXT_CTYPE_IIO_ROLE_IIF,
+};
+
+#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE)	((ROLE) << 6)
+#define ICMP_EXT_CTYPE_IIO_MTU		BIT(0)
+#define ICMP_EXT_CTYPE_IIO_NAME		BIT(1)
+#define ICMP_EXT_CTYPE_IIO_IPADDR	BIT(2)
+#define ICMP_EXT_CTYPE_IIO_IFINDEX	BIT(3)
+
+struct icmp_ext_iio_name_subobj {
+	u8 len;
+	char name[IFNAMSIZ];
+};
+
+enum {
+	/* RFC 5837 - Incoming IP Interface Role */
+	ICMP_ERR_EXT_IIO_IIF,
+	/* Add new constants above. Used by "icmp_errors_extension_mask"
+	 * sysctl.
+	 */
+	ICMP_ERR_EXT_COUNT,
+};
+
 #endif	/* _LINUX_ICMP_H */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 34eb3aecb3f2..0e96c90e56c6 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,7 @@ struct netns_ipv4 {
 	u8 sysctl_icmp_echo_ignore_broadcasts;
 	u8 sysctl_icmp_ignore_bogus_error_responses;
 	u8 sysctl_icmp_errors_use_inbound_ifaddr;
+	u8 sysctl_icmp_errors_extension_mask;
 	int sysctl_icmp_ratelimit;
 	int sysctl_icmp_ratemask;
 	int sysctl_icmp_msgs_per_sec;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1b7fb5d935ed..4abbec2f47ef 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,6 +582,185 @@ relookup_failed:
 	return ERR_PTR(err);
 }
 
+struct icmp_ext_iio_addr4_subobj {
+	__be16 afi;
+	__be16 reserved;
+	__be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+	return sizeof(struct icmp_extobj_hdr) +
+		/* ifIndex */
+		sizeof(__be32) +
+		/* Interface Address Sub-Object */
+		sizeof(struct icmp_ext_iio_addr4_subobj) +
+		/* Interface Name Sub-Object. Length must be a multiple of 4
+		 * bytes.
+		 */
+		ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+		/* MTU */
+		sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+	unsigned int ext_max_len;
+
+	ext_max_len = sizeof(struct icmp_ext_hdr);
+
+	if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+		ext_max_len += icmp_ext_iio_len();
+
+	return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		return 0;
+
+	/* It is unclear from RFC 5837 which IP address should be chosen, but
+	 * it makes sense to choose a global unicast address.
+	 */
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+			continue;
+		if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+		    ipv4_is_multicast(ifa->ifa_address))
+			continue;
+		return ifa->ifa_address;
+	}
+
+	return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+				    int iif)
+{
+	struct icmp_ext_iio_name_subobj *name_subobj;
+	struct icmp_extobj_hdr *objh;
+	struct net_device *dev;
+	__be32 data;
+
+	if (!iif)
+		return;
+
+	/* Add the fields in the order specified by RFC 5837. */
+	objh = skb_put(skb, sizeof(*objh));
+	objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+	objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+	data = htonl(iif);
+	skb_put_data(skb, &data, sizeof(__be32));
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, iif);
+	if (!dev)
+		goto out;
+
+	data = icmp_ext_iio_addr4_find(dev);
+	if (data) {
+		struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+		addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+		addr4_subobj->afi = htons(ICMP_AFI_IP);
+		addr4_subobj->addr4 = data;
+		objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+	}
+
+	name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+	name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+	netdev_copy_name(dev, name_subobj->name);
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+	data = htonl(READ_ONCE(dev->mtu));
+	skb_put_data(skb, &data, sizeof(__be32));
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+	rcu_read_unlock();
+	objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+				 u8 ext_objs, int iif)
+{
+	if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+		icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+		unsigned int room, int iif)
+{
+	unsigned int payload_len, ext_max_len, ext_len;
+	struct icmp_ext_hdr *ext_hdr;
+	struct sk_buff *skb;
+	u8 ext_objs;
+	int nhoff;
+
+	switch (icmph->type) {
+	case ICMP_DEST_UNREACH:
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		break;
+	default:
+		return NULL;
+	}
+
+	ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+	if (!ext_objs)
+		return NULL;
+
+	ext_max_len = icmp_ext_max_len(ext_objs);
+	if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+		return NULL;
+
+	skb = skb_clone(skb_in, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	nhoff = skb_network_offset(skb);
+	payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+	if (!pskb_network_may_pull(skb, payload_len))
+		goto free_skb;
+
+	if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+	    __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+		goto free_skb;
+
+	if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+		goto free_skb;
+
+	ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+	ext_hdr->version = ICMP_EXT_VERSION_2;
+
+	icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+	/* Do not send an empty extension structure. */
+	ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+	if (ext_len == sizeof(*ext_hdr))
+		goto free_skb;
+
+	ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+	/* The length of the original datagram in 32-bit words (RFC 4884). */
+	icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+	return skb;
+
+free_skb:
+	consume_skb(skb);
+	return NULL;
+}
+
 /*
  *	Send an ICMP message in response to a situation
  *
@@ -601,6 +780,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	bool apply_ratelimit = false;
+	struct sk_buff *ext_skb;
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	__be32 saddr;
@@ -770,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	if (room <= (int)sizeof(struct iphdr))
 		goto ende;
 
-	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+				  parm->iif);
+	if (ext_skb)
+		icmp_param.skb = ext_skb;
+
+	icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
 	if (icmp_param.data_len > room)
 		icmp_param.data_len = room;
 	icmp_param.head_len = sizeof(struct icmphdr);
@@ -785,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	trace_icmp_send(skb_in, type, code);
 
 	icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+	if (ext_skb)
+		consume_skb(ext_skb);
 ende:
 	ip_rt_put(rt);
 out_unlock:
@@ -1502,6 +1690,7 @@ static int __net_init icmp_sk_init(struct net *net)
 	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
 	net->ipv4.sysctl_icmp_ratemask = 0x1818;
 	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+	net->ipv4.sysctl_icmp_errors_extension_mask = 0;
 	net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
 	net->ipv4.sysctl_icmp_msgs_burst = 50;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 24dbc603cc44..0c7c8f9041cb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31;
 static int tcp_plb_max_cong_thresh = 256;
 static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
 static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+	GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -674,6 +676,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE
 	},
+	{
+		.procname	= "icmp_errors_extension_mask",
+		.data		= &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &icmp_errors_extension_mask_all,
+	},
 	{
 		.procname	= "icmp_ratelimit",
 		.data		= &init_net.ipv4.sysctl_icmp_ratelimit,
-- 
cgit v1.2.3


From d12d04d221f8d928a27a66236228e7501cd4cad5 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Mon, 27 Oct 2025 10:22:31 +0200
Subject: ipv6: icmp: Add RFC 5837 support

Add the ability to append the incoming IP interface information to
ICMPv6 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv6.icmp.errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb as the caller of icmp6_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Since commit 20e1954fe238 ("ipv6: RFC 4884 partial support for SIT/GRE
tunnels") it is possible for icmp6_send() to be called with an skb that
already contains ICMP extensions. This can happen when we receive an
ICMPv4 message with extensions from a tunnel and translate it to an
ICMPv6 message towards an IPv6 host in the overlay network. I could not
find an RFC that supports this behavior, but it makes sense to not
overwrite the original extensions that were appended to the packet.
Therefore, avoid appending extensions if the length field in the
provided ICMPv6 header is already filled.

Export netdev_copy_name() using EXPORT_IPV6_MOD_GPL() to make it
available to IPv6 when it is built as a module.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-3-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  17 +++
 include/net/netns/ipv6.h               |   1 +
 net/core/dev.c                         |   1 +
 net/ipv6/af_inet6.c                    |   1 +
 net/ipv6/icmp.c                        | 214 ++++++++++++++++++++++++++++++++-
 5 files changed, 232 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index ece1187ba0f1..7cd35bfd39e6 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -3279,6 +3279,23 @@ error_anycast_as_unicast - BOOLEAN
 
 	Default: 0 (disabled)
 
+errors_extension_mask - UNSIGNED INTEGER
+	Bitmask of ICMP extensions to append to ICMPv6 error messages
+	("Destination Unreachable" and "Time Exceeded"). The original datagram
+	is trimmed / padded to 128 bytes in order to be compatible with
+	applications that do not comply with RFC 4884.
+
+	Possible extensions are:
+
+	==== ==============================================================
+	0x01 Incoming IP interface information according to RFC 5837.
+	     Extension will include the index, IPv6 address (if present),
+	     name and MTU of the IP interface that received the datagram
+	     which elicited the ICMP error.
+	==== ==============================================================
+
+	Default: 0x00 (no extensions)
+
 xfrm6_gc_thresh - INTEGER
 	(Obsolete since linux-4.14)
 	The threshold at which we will start garbage collecting for IPv6
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 47dc70d8100a..08d2ecc96e2b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -56,6 +56,7 @@ struct netns_sysctl_ipv6 {
 	u8 skip_notify_on_dev_down;
 	u8 fib_notify_on_flag_change;
 	u8 icmpv6_error_anycast_as_unicast;
+	u8 icmpv6_errors_extension_mask;
 };
 
 struct netns_ipv6 {
diff --git a/net/core/dev.c b/net/core/dev.c
index d32f0b0c03bb..dccc1176f3c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1163,6 +1163,7 @@ void netdev_copy_name(struct net_device *dev, char *name)
 		strscpy(name, dev->name, IFNAMSIZ);
 	} while (read_seqretry(&netdev_rename_lock, seq));
 }
+EXPORT_IPV6_MOD_GPL(netdev_copy_name);
 
 /**
  *	netdev_get_name - get a netdevice name, knowing its ifindex.
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1b0314644e0c..44d7de1eec4f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -960,6 +960,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
 	net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
 	net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
+	net->ipv6.sysctl.icmpv6_errors_extension_mask = 0;
 
 	/* By default, rate limit error messages.
 	 * Except for pmtu discovery, it would break it.
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 56c974cf75d1..5d2f90babaa5 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -444,6 +444,193 @@ static int icmp6_iif(const struct sk_buff *skb)
 	return icmp6_dev(skb)->ifindex;
 }
 
+struct icmp6_ext_iio_addr6_subobj {
+	__be16 afi;
+	__be16 reserved;
+	struct in6_addr addr6;
+};
+
+static unsigned int icmp6_ext_iio_len(void)
+{
+	return sizeof(struct icmp_extobj_hdr) +
+		/* ifIndex */
+		sizeof(__be32) +
+		/* Interface Address Sub-Object */
+		sizeof(struct icmp6_ext_iio_addr6_subobj) +
+		/* Interface Name Sub-Object. Length must be a multiple of 4
+		 * bytes.
+		 */
+		ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+		/* MTU */
+		sizeof(__be32);
+}
+
+static unsigned int icmp6_ext_max_len(u8 ext_objs)
+{
+	unsigned int ext_max_len;
+
+	ext_max_len = sizeof(struct icmp_ext_hdr);
+
+	if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+		ext_max_len += icmp6_ext_iio_len();
+
+	return ext_max_len;
+}
+
+static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev)
+{
+	struct inet6_dev *in6_dev;
+	struct inet6_ifaddr *ifa;
+
+	in6_dev = __in6_dev_get(dev);
+	if (!in6_dev)
+		return NULL;
+
+	/* It is unclear from RFC 5837 which IP address should be chosen, but
+	 * it makes sense to choose a global unicast address.
+	 */
+	list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) {
+		if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED))
+			continue;
+		if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST ||
+		    ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL)
+			continue;
+		return &ifa->addr;
+	}
+
+	return NULL;
+}
+
+static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+				     int iif)
+{
+	struct icmp_ext_iio_name_subobj *name_subobj;
+	struct icmp_extobj_hdr *objh;
+	struct net_device *dev;
+	struct in6_addr *addr6;
+	__be32 data;
+
+	if (!iif)
+		return;
+
+	/* Add the fields in the order specified by RFC 5837. */
+	objh = skb_put(skb, sizeof(*objh));
+	objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+	objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+	data = htonl(iif);
+	skb_put_data(skb, &data, sizeof(__be32));
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, iif);
+	if (!dev)
+		goto out;
+
+	addr6 = icmp6_ext_iio_addr6_find(dev);
+	if (addr6) {
+		struct icmp6_ext_iio_addr6_subobj *addr6_subobj;
+
+		addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj));
+		addr6_subobj->afi = htons(ICMP_AFI_IP6);
+		addr6_subobj->addr6 = *addr6;
+		objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+	}
+
+	name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+	name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+	netdev_copy_name(dev, name_subobj->name);
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+	data = htonl(READ_ONCE(dev->mtu));
+	skb_put_data(skb, &data, sizeof(__be32));
+	objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+	rcu_read_unlock();
+	objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb,
+				  u8 ext_objs, int iif)
+{
+	if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+		icmp6_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp6_ext_append(struct net *net, struct sk_buff *skb_in,
+		 struct icmp6hdr *icmp6h, unsigned int room, int iif)
+{
+	unsigned int payload_len, ext_max_len, ext_len;
+	struct icmp_ext_hdr *ext_hdr;
+	struct sk_buff *skb;
+	u8 ext_objs;
+	int nhoff;
+
+	switch (icmp6h->icmp6_type) {
+	case ICMPV6_DEST_UNREACH:
+	case ICMPV6_TIME_EXCEED:
+		break;
+	default:
+		return NULL;
+	}
+
+	/* Do not overwrite existing extensions. This can happen when we
+	 * receive an ICMPv4 message with extensions from a tunnel and
+	 * translate it to an ICMPv6 message towards an IPv6 host in the
+	 * overlay network.
+	 */
+	if (icmp6h->icmp6_datagram_len)
+		return NULL;
+
+	ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask);
+	if (!ext_objs)
+		return NULL;
+
+	ext_max_len = icmp6_ext_max_len(ext_objs);
+	if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+		return NULL;
+
+	skb = skb_clone(skb_in, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	nhoff = skb_network_offset(skb);
+	payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+	if (!pskb_network_may_pull(skb, payload_len))
+		goto free_skb;
+
+	if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+	    __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+		goto free_skb;
+
+	if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+		goto free_skb;
+
+	ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+	ext_hdr->version = ICMP_EXT_VERSION_2;
+
+	icmp6_ext_objs_append(net, skb, ext_objs, iif);
+
+	/* Do not send an empty extension structure. */
+	ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+	if (ext_len == sizeof(*ext_hdr))
+		goto free_skb;
+
+	ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+	/* The length of the original datagram in 64-bit words (RFC 4884). */
+	icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64);
+
+	return skb;
+
+free_skb:
+	consume_skb(skb);
+	return NULL;
+}
+
 /*
  *	Send an ICMP message in response to a packet in error
  */
@@ -458,7 +645,9 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	struct ipv6_pinfo *np;
 	const struct in6_addr *saddr = NULL;
 	bool apply_ratelimit = false;
+	struct sk_buff *ext_skb;
 	struct dst_entry *dst;
+	unsigned int room;
 	struct icmp6hdr tmp_hdr;
 	struct flowi6 fl6;
 	struct icmpv6_msg msg;
@@ -612,8 +801,13 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	msg.offset = skb_network_offset(skb);
 	msg.type = type;
 
-	len = skb->len - msg.offset;
-	len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
+	room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr);
+	ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif);
+	if (ext_skb)
+		msg.skb = ext_skb;
+
+	len = msg.skb->len - msg.offset;
+	len = min_t(unsigned int, len, room);
 	if (len < 0) {
 		net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
 				    &hdr->saddr, &hdr->daddr);
@@ -635,6 +829,8 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	}
 
 out_dst_release:
+	if (ext_skb)
+		consume_skb(ext_skb);
 	dst_release(dst);
 out_unlock:
 	icmpv6_xmit_unlock(sk);
@@ -1171,6 +1367,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
 EXPORT_SYMBOL(icmpv6_err_convert);
 
 #ifdef CONFIG_SYSCTL
+
+static u32 icmpv6_errors_extension_mask_all =
+	GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
+
 static struct ctl_table ipv6_icmp_table_template[] = {
 	{
 		.procname	= "ratelimit",
@@ -1216,6 +1416,15 @@ static struct ctl_table ipv6_icmp_table_template[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "errors_extension_mask",
+		.data		= &init_net.ipv6.sysctl.icmpv6_errors_extension_mask,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &icmpv6_errors_extension_mask_all,
+	},
 };
 
 struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
@@ -1233,6 +1442,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
 		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
 		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
 		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
+		table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask;
 	}
 	return table;
 }
-- 
cgit v1.2.3


From 26888de97b2ffe0267c12dd4e9fcd552545903f1 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 25 Oct 2025 20:49:50 +0200
Subject: net: phy: add iterator mdiobus_for_each_phy

Add an iterator for all PHYs on an MII bus, and phy_find_next()
as a prerequisite.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/cd112f15-401a-43d9-8525-9ff0965a68cd@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 16 +++++++++-------
 include/linux/phy.h          | 11 ++++++++++-
 2 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index b7feaf0cb1df..737747cf1906 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1224,22 +1224,24 @@ int phy_get_c45_ids(struct phy_device *phydev)
 EXPORT_SYMBOL(phy_get_c45_ids);
 
 /**
- * phy_find_first - finds the first PHY device on the bus
+ * phy_find_next - finds the next PHY device on the bus
  * @bus: the target MII bus
+ * @pos: cursor
+ *
+ * Return: next phy_device on the bus, or NULL
  */
-struct phy_device *phy_find_first(struct mii_bus *bus)
+struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos)
 {
-	struct phy_device *phydev;
-	int addr;
+	for (int addr = pos ? pos->mdio.addr + 1 : 0;
+	     addr < PHY_MAX_ADDR; addr++) {
+		struct phy_device *phydev = mdiobus_get_phy(bus, addr);
 
-	for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
-		phydev = mdiobus_get_phy(bus, addr);
 		if (phydev)
 			return phydev;
 	}
 	return NULL;
 }
-EXPORT_SYMBOL(phy_find_first);
+EXPORT_SYMBOL_GPL(phy_find_next);
 
 /**
  * phy_prepare_link - prepares the PHY layer to monitor link status
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 17a2cdc9f1a0..358dd6f0ff96 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1869,7 +1869,7 @@ int phy_sfp_probe(struct phy_device *phydev,
 	          const struct sfp_upstream_ops *ops);
 struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 			      phy_interface_t interface);
-struct phy_device *phy_find_first(struct mii_bus *bus);
+struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos);
 int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 		      u32 flags, phy_interface_t interface);
 int phy_connect_direct(struct net_device *dev, struct phy_device *phydev,
@@ -1896,6 +1896,15 @@ bool phy_check_valid(int speed, int duplex, unsigned long *features);
 int phy_restart_aneg(struct phy_device *phydev);
 int phy_reset_after_clk_enable(struct phy_device *phydev);
 
+static inline struct phy_device *phy_find_first(struct mii_bus *bus)
+{
+	return phy_find_next(bus, NULL);
+}
+
+#define mdiobus_for_each_phy(_bus, _phydev)		\
+	for (_phydev = phy_find_first(_bus); _phydev;	\
+	     _phydev = phy_find_next(_bus, _phydev))
+
 #if IS_ENABLED(CONFIG_PHYLIB)
 int phy_start_cable_test(struct phy_device *phydev,
 			 struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 00b3e8480be7a49203594bd1fdb4fd46f3b69d59 Mon Sep 17 00:00:00 2001
From: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Date: Tue, 21 Oct 2025 17:00:01 +0530
Subject: scsi: firmware: xilinx: Add support for secure read/write ioctl
 interface

Add support for a generic ioctl read/write interface using which users
can request firmware to perform read/write operations on a protected and
secure address space.

The functionality is introduced through the means of two new IOCTL IDs
which extend the existing PM_IOCTL EEMI API:

 - IOCTL_READ_REG
 - IOCTL_MASK_WRITE_REG

The caller only passes the node id of the given device and an offset.
The base address is not exposed to the caller and internally retrieved
by the firmware. Firmware will enforce an access policy on the incoming
read/write request.

Signed-off-by: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Reviewed-by: Tanmay Shah <tanmay.shah@amd.com>
Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@amd.com>
Signed-off-by: Ajay Neeli <ajay.neeli@amd.com>
Acked-by: Senthil Nathan Thangaraj <senthilnathan.thangaraj@amd.com>
Acked-by: Michal Simek <michal.simek@amd.com>
Acked-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251021113003.13650-3-ajay.neeli@amd.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/firmware/xilinx/zynqmp.c     | 46 ++++++++++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp.h | 15 ++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 02da3e48bc8f..b7cd0eca9eaa 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -1616,6 +1616,52 @@ int zynqmp_pm_get_feature_config(enum pm_feature_config_id id,
 	return zynqmp_pm_invoke_fn(PM_IOCTL, payload, 3, 0, IOCTL_GET_FEATURE_CONFIG, id);
 }
 
+/**
+ * zynqmp_pm_sec_read_reg - PM call to securely read from given offset
+ *		of the node
+ * @node_id:	Node Id of the device
+ * @offset:	Offset to be used (20-bit)
+ * @ret_value:	Output data read from the given offset after
+ *		firmware access policy is successfully enforced
+ *
+ * Return:	0 on success (@ret_value updated), error value on failure
+ */
+int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value)
+{
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	u32 count = 1;
+	int ret;
+
+	if (!ret_value)
+		return -EINVAL;
+
+	ret = zynqmp_pm_invoke_fn(PM_IOCTL, ret_payload, 4, node_id, IOCTL_READ_REG,
+				  offset, count);
+	if (!ret)	/* do not hand out uninitialised payload on failure */
+		*ret_value = ret_payload[1];
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_sec_read_reg);
+
+/**
+ * zynqmp_pm_sec_mask_write_reg - PM call to securely write to given offset
+ *		of the node
+ * @node_id:	Node Id of the device
+ * @offset:	Offset to be used (20-bit)
+ * @mask:	Mask to be used
+ * @value:	Value to be written
+ *
+ * Return:	Returns 0 on success or error value on failure
+ */
+int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, u32 mask,
+				 u32 value)
+{
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 5, node_id, IOCTL_MASK_WRITE_REG,
+				   offset, mask, value);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_sec_mask_write_reg);
+
 /**
  * zynqmp_pm_set_sd_config - PM call to set value of SD config registers
  * @node:	SD node ID
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index ae48d619c4e0..b161f37de5cc 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -241,6 +241,7 @@ enum pm_ioctl_id {
 	IOCTL_GET_FEATURE_CONFIG = 27,
 	/* IOCTL for Secure Read/Write Interface */
 	IOCTL_READ_REG = 28,
+	IOCTL_MASK_WRITE_REG = 29,
 	/* Dynamic SD/GEM configuration */
 	IOCTL_SET_SD_CONFIG = 30,
 	IOCTL_SET_GEM_CONFIG = 31,
@@ -619,6 +620,9 @@ int zynqmp_pm_feature(const u32 api_id);
 int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id);
 int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value);
 int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload);
+int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value);
+int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset,
+				 u32 mask, u32 value);
 int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset);
 int zynqmp_pm_force_pwrdwn(const u32 target,
 			   const enum zynqmp_pm_request_ack ack);
@@ -916,6 +920,17 @@ static inline int zynqmp_pm_request_wake(const u32 node,
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset,
+					       u32 mask, u32 value)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 0e4d26f79a74bc633846a27a9a20d52217c108dc Mon Sep 17 00:00:00 2001
From: Ajay Neeli <ajay.neeli@amd.com>
Date: Tue, 21 Oct 2025 17:00:02 +0530
Subject: scsi: firmware: xilinx: Add APIs for UFS PHY initialization

 - Add APIs for UFS PHY initialization.

 - Verify M-PHY TX-RX configuration readiness.

 - Confirm SRAM initialization and Set SRAM bypass.

 - Retrieve UFS calibration values.

Signed-off-by: Ajay Neeli <ajay.neeli@amd.com>
Acked-by: Senthil Nathan Thangaraj <senthilnathan.thangaraj@amd.com>
Acked-by: Michal Simek <michal.simek@amd.com>
Acked-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251021113003.13650-4-ajay.neeli@amd.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/firmware/xilinx/Makefile         |   2 +-
 drivers/firmware/xilinx/zynqmp-ufs.c     | 118 +++++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp-ufs.h |  38 ++++++++++
 include/linux/firmware/xlnx-zynqmp.h     |   1 +
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/xilinx/zynqmp-ufs.c
 create mode 100644 include/linux/firmware/xlnx-zynqmp-ufs.h

(limited to 'include')

diff --git a/drivers/firmware/xilinx/Makefile b/drivers/firmware/xilinx/Makefile
index 875a53703c82..70f8f02f14a3 100644
--- a/drivers/firmware/xilinx/Makefile
+++ b/drivers/firmware/xilinx/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Xilinx firmwares
 
-obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o
+obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o zynqmp-ufs.o
 obj-$(CONFIG_ZYNQMP_FIRMWARE_DEBUG) += zynqmp-debug.o
diff --git a/drivers/firmware/xilinx/zynqmp-ufs.c b/drivers/firmware/xilinx/zynqmp-ufs.c
new file mode 100644
index 000000000000..85da8a822f3a
--- /dev/null
+++ b/drivers/firmware/xilinx/zynqmp-ufs.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Firmware Layer for UFS APIs
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/firmware/xlnx-zynqmp.h>
+#include <linux/module.h>
+
+/* Register Node IDs */
+#define PM_REGNODE_PMC_IOU_SLCR		0x30000002 /* PMC IOU SLCR */
+#define PM_REGNODE_EFUSE_CACHE		0x30000003 /* EFUSE Cache */
+
+/* Register Offsets for PMC IOU SLCR */
+#define SRAM_CSR_OFFSET			0x104C /* SRAM Control and Status */
+#define TXRX_CFGRDY_OFFSET		0x1054 /* M-PHY TX-RX Config ready */
+
+/* Masks for SRAM Control and Status Register */
+#define SRAM_CSR_INIT_DONE_MASK		BIT(0) /* SRAM initialization done */
+#define SRAM_CSR_EXT_LD_DONE_MASK	BIT(1) /* SRAM External load done */
+#define SRAM_CSR_BYPASS_MASK		BIT(2) /* Bypass SRAM interface */
+
+/* Mask to check M-PHY TX-RX configuration readiness */
+#define TX_RX_CFG_RDY_MASK		GENMASK(3, 0)
+
+/* Register Offsets for EFUSE Cache */
+#define UFS_CAL_1_OFFSET		0xBE8 /* UFS Calibration Value */
+
+/**
+ * zynqmp_pm_is_mphy_tx_rx_config_ready - check M-PHY TX-RX config readiness
+ * @is_ready:	Store output status (true/false)
+ *
+ * Return:	Returns 0 on success or error value on failure.
+ */
+int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready)
+{
+	u32 regval;
+	int ret;
+
+	if (!is_ready)
+		return -EINVAL;
+
+	ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, TXRX_CFGRDY_OFFSET, &regval);
+	if (ret)
+		return ret;
+
+	regval &= TX_RX_CFG_RDY_MASK;
+	if (regval)
+		*is_ready = true;
+	else
+		*is_ready = false;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_is_mphy_tx_rx_config_ready);
+
+/**
+ * zynqmp_pm_is_sram_init_done - check SRAM initialization
+ * @is_done:	Store output status (true/false)
+ *
+ * Return:	Returns 0 on success or error value on failure.
+ */
+int zynqmp_pm_is_sram_init_done(bool *is_done)
+{
+	u32 regval;
+	int ret;
+
+	if (!is_done)
+		return -EINVAL;
+
+	ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, &regval);
+	if (ret)
+		return ret;
+
+	regval &= SRAM_CSR_INIT_DONE_MASK;
+	if (regval)
+		*is_done = true;
+	else
+		*is_done = false;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_is_sram_init_done);
+
+/**
+ * zynqmp_pm_set_sram_bypass - Set SRAM bypass Control
+ *
+ * Return:	Returns 0 on success or error value on failure.
+ */
+int zynqmp_pm_set_sram_bypass(void)
+{
+	u32 sram_csr;
+	int ret;
+
+	ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, &sram_csr);
+	if (ret)
+		return ret;
+
+	sram_csr &= ~SRAM_CSR_EXT_LD_DONE_MASK;
+	sram_csr |= SRAM_CSR_BYPASS_MASK;
+
+	return zynqmp_pm_sec_mask_write_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET,
+					    GENMASK(2, 1), sram_csr);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_set_sram_bypass);
+
+/**
+ * zynqmp_pm_get_ufs_calibration_values - Read UFS calibration values
+ * @val:	Store the calibration value
+ *
+ * Return:	Returns 0 on success or error value on failure.
+ */
+int zynqmp_pm_get_ufs_calibration_values(u32 *val)
+{
+	return zynqmp_pm_sec_read_reg(PM_REGNODE_EFUSE_CACHE, UFS_CAL_1_OFFSET, val);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_get_ufs_calibration_values);
diff --git a/include/linux/firmware/xlnx-zynqmp-ufs.h b/include/linux/firmware/xlnx-zynqmp-ufs.h
new file mode 100644
index 000000000000..d3538dd5822a
--- /dev/null
+++ b/include/linux/firmware/xlnx-zynqmp-ufs.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Firmware layer for UFS APIs.
+ *
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef __FIRMWARE_XLNX_ZYNQMP_UFS_H__
+#define __FIRMWARE_XLNX_ZYNQMP_UFS_H__
+
+#if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
+int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready);
+int zynqmp_pm_is_sram_init_done(bool *is_done);
+int zynqmp_pm_set_sram_bypass(void);
+int zynqmp_pm_get_ufs_calibration_values(u32 *val);
+#else
+static inline int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_is_sram_init_done(bool *is_done)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_set_sram_bypass(void)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_get_ufs_calibration_values(u32 *val)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* __FIRMWARE_XLNX_ZYNQMP_UFS_H__ */
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index b161f37de5cc..784d5920b4cd 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 
 #include <linux/err.h>
+#include <linux/firmware/xlnx-zynqmp-ufs.h>
 
 #define ZYNQMP_PM_VERSION_MAJOR	1
 #define ZYNQMP_PM_VERSION_MINOR	0
-- 
cgit v1.2.3


From 769b8b2ffded4cd880669edd83e2952efeeb27f7 Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Date: Tue, 21 Oct 2025 17:00:03 +0530
Subject: scsi: ufs: amd-versal2: Add UFS support for AMD Versal Gen 2 SoC

Add support for the UFS host controller on the AMD Versal Gen 2 SoC,
built on the Synopsys DWC UFS architecture, using the UFSHCD DWC and
UFSHCD platform driver. This controller requires specific configurations
like M-PHY/RMMI/UniPro and vendor specific registers programming before
doing the UIC_LINKSTARTUP.

Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Signed-off-by: Ajay Neeli <ajay.neeli@amd.com>
Acked-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251021113003.13650-5-ajay.neeli@amd.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 MAINTAINERS                        |   7 +
 drivers/ufs/host/Kconfig           |  13 +
 drivers/ufs/host/Makefile          |   1 +
 drivers/ufs/host/ufs-amd-versal2.c | 564 +++++++++++++++++++++++++++++++++++++
 drivers/ufs/host/ufshcd-dwc.h      |  46 +++
 include/ufs/unipro.h               |   1 +
 6 files changed, 632 insertions(+)
 create mode 100644 drivers/ufs/host/ufs-amd-versal2.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..07db55181a29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26339,6 +26339,13 @@ F:	Documentation/devicetree/bindings/ufs/
 F:	Documentation/scsi/ufs.rst
 F:	drivers/ufs/core/
 
+UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER AMD VERSAL2
+M:	Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
+M:	Ajay Neeli <ajay.neeli@amd.com>
+S:	Maintained
+F:	Documentation/devicetree/bindings/ufs/amd,versal2-ufs.yaml
+F:	drivers/ufs/host/ufs-amd-versal2.c
+
 UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER DWC HOOKS
 M:	Pedro Sousa <pedrom.sousa@synopsys.com>
 L:	linux-scsi@vger.kernel.org
diff --git a/drivers/ufs/host/Kconfig b/drivers/ufs/host/Kconfig
index 191fbd799ec5..7d5117b2dab4 100644
--- a/drivers/ufs/host/Kconfig
+++ b/drivers/ufs/host/Kconfig
@@ -154,3 +154,16 @@ config SCSI_UFS_ROCKCHIP
 
 	  Select this if you have UFS controller on Rockchip chipset.
 	  If unsure, say N.
+
+config SCSI_UFS_AMD_VERSAL2
+	tristate "AMD Versal Gen 2 UFS controller platform driver"
+	depends on SCSI_UFSHCD_PLATFORM && (ARCH_ZYNQMP || COMPILE_TEST)
+	help
+	  This selects the AMD Versal Gen 2 specific additions on top of
+	  the UFSHCD DWC and UFSHCD platform driver. UFS host on AMD
+	  Versal Gen 2 needs some vendor specific configurations like PHY
+	  and vendor specific register accesses before accessing the
+	  hardware.
+
+	  Select this if you have UFS controller on AMD Versal Gen 2 SoC.
+	  If unsure, say N.
diff --git a/drivers/ufs/host/Makefile b/drivers/ufs/host/Makefile
index 2f97feb5db3f..65d8bb23ab7b 100644
--- a/drivers/ufs/host/Makefile
+++ b/drivers/ufs/host/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_SCSI_UFS_RENESAS) += ufs-renesas.o
 obj-$(CONFIG_SCSI_UFS_ROCKCHIP) += ufs-rockchip.o
 obj-$(CONFIG_SCSI_UFS_SPRD) += ufs-sprd.o
 obj-$(CONFIG_SCSI_UFS_TI_J721E) += ti-j721e-ufs.o
+obj-$(CONFIG_SCSI_UFS_AMD_VERSAL2) += ufs-amd-versal2.o ufshcd-dwc.o
diff --git a/drivers/ufs/host/ufs-amd-versal2.c b/drivers/ufs/host/ufs-amd-versal2.c
new file mode 100644
index 000000000000..40543db621a1
--- /dev/null
+++ b/drivers/ufs/host/ufs-amd-versal2.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Authors: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/firmware/xlnx-zynqmp.h>
+#include <linux/irqreturn.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <ufs/unipro.h>
+
+#include "ufshcd-dwc.h"
+#include "ufshcd-pltfrm.h"
+#include "ufshci-dwc.h"
+
+/* PHY modes */
+#define UFSHCD_DWC_PHY_MODE_ROM         0
+
+#define MPHY_FAST_RX_AFE_CAL		BIT(2)
+#define MPHY_FW_CALIB_CFG_VAL		BIT(8)
+
+#define MPHY_RX_OVRD_EN			BIT(3)
+#define MPHY_RX_OVRD_VAL		BIT(2)
+#define MPHY_RX_ACK_MASK		BIT(0)
+
+#define TIMEOUT_MICROSEC	1000000
+
+struct ufs_versal2_host {
+	struct ufs_hba *hba;
+	struct reset_control *rstc;
+	struct reset_control *rstphy;
+	u32 phy_mode;
+	unsigned long host_clk;
+	u8 attcompval0;
+	u8 attcompval1;
+	u8 ctlecompval0;
+	u8 ctlecompval1;
+};
+
+static int ufs_versal2_phy_reg_write(struct ufs_hba *hba, u32 addr, u32 val)
+{
+	static struct ufshcd_dme_attr_val phy_write_attrs[] = {
+		{ UIC_ARG_MIB(CBCREGADDRLSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGADDRMSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGWRLSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGWRMSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGRDWRSEL), 1, DME_LOCAL },
+		{ UIC_ARG_MIB(VS_MPHYCFGUPDT), 1, DME_LOCAL }
+	};
+
+	phy_write_attrs[0].mib_val = (u8)addr;
+	phy_write_attrs[1].mib_val = (u8)(addr >> 8);
+	phy_write_attrs[2].mib_val = (u8)val;
+	phy_write_attrs[3].mib_val = (u8)(val >> 8);
+
+	return ufshcd_dwc_dme_set_attrs(hba, phy_write_attrs, ARRAY_SIZE(phy_write_attrs));
+}
+
+static int ufs_versal2_phy_reg_read(struct ufs_hba *hba, u32 addr, u32 *val)
+{
+	u32 mib_val;
+	int ret;
+	static struct ufshcd_dme_attr_val phy_read_attrs[] = {
+		{ UIC_ARG_MIB(CBCREGADDRLSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGADDRMSB), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCREGRDWRSEL), 0, DME_LOCAL },
+		{ UIC_ARG_MIB(VS_MPHYCFGUPDT), 1, DME_LOCAL }
+	};
+
+	phy_read_attrs[0].mib_val = (u8)addr;
+	phy_read_attrs[1].mib_val = (u8)(addr >> 8);
+
+	ret = ufshcd_dwc_dme_set_attrs(hba, phy_read_attrs, ARRAY_SIZE(phy_read_attrs));
+	if (ret)
+		return ret;
+
+	ret = ufshcd_dme_get(hba, UIC_ARG_MIB(CBCREGRDLSB), &mib_val);
+	if (ret)
+		return ret;
+
+	*val = mib_val;
+	ret = ufshcd_dme_get(hba, UIC_ARG_MIB(CBCREGRDMSB), &mib_val);
+	if (ret)
+		return ret;
+
+	*val |= (mib_val << 8);
+
+	return 0;
+}
+
+static int ufs_versal2_enable_phy(struct ufs_hba *hba)
+{
+	u32 offset, reg;
+	int ret;
+
+	ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYDISABLE), 0);
+	if (ret)
+		return ret;
+
+	ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYCFGUPDT), 1);
+	if (ret)
+		return ret;
+
+	/* Check Tx/Rx FSM states */
+	for (offset = 0; offset < 2; offset++) {
+		u32 time_left, mibsel;
+
+		time_left = TIMEOUT_MICROSEC;
+		mibsel = UIC_ARG_MIB_SEL(MTX_FSM_STATE, UIC_ARG_MPHY_TX_GEN_SEL_INDEX(offset));
+		do {
+			ret = ufshcd_dme_get(hba, mibsel, &reg);
+			if (ret)
+				return ret;
+
+			if (reg == TX_STATE_HIBERN8 || reg == TX_STATE_SLEEP ||
+			    reg == TX_STATE_LSBURST)
+				break;
+
+			time_left--;
+			usleep_range(1, 5);
+		} while (time_left);
+
+		if (!time_left) {
+			dev_err(hba->dev, "Invalid Tx FSM state.\n");
+			return -ETIMEDOUT;
+		}
+
+		time_left = TIMEOUT_MICROSEC;
+		mibsel = UIC_ARG_MIB_SEL(MRX_FSM_STATE, UIC_ARG_MPHY_RX_GEN_SEL_INDEX(offset));
+		do {
+			ret = ufshcd_dme_get(hba, mibsel, &reg);
+			if (ret)
+				return ret;
+
+			if (reg == RX_STATE_HIBERN8 || reg == RX_STATE_SLEEP ||
+			    reg == RX_STATE_LSBURST)
+				break;
+
+			time_left--;
+			usleep_range(1, 5);
+		} while (time_left);
+
+		if (!time_left) {
+			dev_err(hba->dev, "Invalid Rx FSM state.\n");
+			return -ETIMEDOUT;
+		}
+	}
+
+	return 0;
+}
+
+static int ufs_versal2_setup_phy(struct ufs_hba *hba)
+{
+	struct ufs_versal2_host *host = ufshcd_get_variant(hba);
+	int ret;
+	u32 reg;
+
+	/* Bypass RX-AFE offset calibrations (ATT/CTLE) */
+	ret = ufs_versal2_phy_reg_read(hba, FAST_FLAGS(0), &reg);
+	if (ret)
+		return ret;
+
+	reg |= MPHY_FAST_RX_AFE_CAL;
+	ret = ufs_versal2_phy_reg_write(hba, FAST_FLAGS(0), reg);
+	if (ret)
+		return ret;
+
+	ret = ufs_versal2_phy_reg_read(hba, FAST_FLAGS(1), &reg);
+	if (ret)
+		return ret;
+
+	reg |= MPHY_FAST_RX_AFE_CAL;
+	ret = ufs_versal2_phy_reg_write(hba, FAST_FLAGS(1), reg);
+	if (ret)
+		return ret;
+
+	/* Program ATT and CTLE compensation values */
+	if (host->attcompval0) {
+		ret = ufs_versal2_phy_reg_write(hba, RX_AFE_ATT_IDAC(0), host->attcompval0);
+		if (ret)
+			return ret;
+	}
+
+	if (host->attcompval1) {
+		ret = ufs_versal2_phy_reg_write(hba, RX_AFE_ATT_IDAC(1), host->attcompval1);
+		if (ret)
+			return ret;
+	}
+
+	if (host->ctlecompval0) {
+		ret = ufs_versal2_phy_reg_write(hba, RX_AFE_CTLE_IDAC(0), host->ctlecompval0);
+		if (ret)
+			return ret;
+	}
+
+	if (host->ctlecompval1) {
+		ret = ufs_versal2_phy_reg_write(hba, RX_AFE_CTLE_IDAC(1), host->ctlecompval1);
+		if (ret)
+			return ret;
+	}
+
+	ret = ufs_versal2_phy_reg_read(hba, FW_CALIB_CCFG(0), &reg);
+	if (ret)
+		return ret;
+
+	reg |= MPHY_FW_CALIB_CFG_VAL;
+	ret = ufs_versal2_phy_reg_write(hba, FW_CALIB_CCFG(0), reg);
+	if (ret)
+		return ret;
+
+	ret = ufs_versal2_phy_reg_read(hba, FW_CALIB_CCFG(1), &reg);
+	if (ret)
+		return ret;
+
+	reg |= MPHY_FW_CALIB_CFG_VAL;
+	return ufs_versal2_phy_reg_write(hba, FW_CALIB_CCFG(1), reg);
+}
+
+static int ufs_versal2_phy_init(struct ufs_hba *hba)
+{
+	struct ufs_versal2_host *host = ufshcd_get_variant(hba);
+	u32 time_left;
+	bool is_ready;
+	int ret;
+	static const struct ufshcd_dme_attr_val rmmi_attrs[] = {
+		{ UIC_ARG_MIB(CBREFCLKCTRL2), CBREFREFCLK_GATE_OVR_EN, DME_LOCAL },
+		{ UIC_ARG_MIB(CBCRCTRL), 1, DME_LOCAL },
+		{ UIC_ARG_MIB(CBC10DIRECTCONF2), 1, DME_LOCAL },
+		{ UIC_ARG_MIB(VS_MPHYCFGUPDT), 1, DME_LOCAL }
+	};
+
+	/* Wait for Tx/Rx config_rdy */
+	time_left = TIMEOUT_MICROSEC;
+	do {
+		ret = zynqmp_pm_is_mphy_tx_rx_config_ready(&is_ready);
+		if (ret)
+			return ret;
+
+		if (!is_ready)
+			break;
+
+		time_left--;	/* only failed polls consume the budget */
+		usleep_range(1, 5);
+	} while (time_left);
+
+	if (!time_left) {
+		dev_err(hba->dev, "Tx/Rx configuration signal busy.\n");
+		return -ETIMEDOUT;
+	}
+
+	ret = ufshcd_dwc_dme_set_attrs(hba, rmmi_attrs, ARRAY_SIZE(rmmi_attrs));
+	if (ret)
+		return ret;
+
+	ret = reset_control_deassert(host->rstphy);
+	if (ret) {
+		dev_err(hba->dev, "ufsphy reset deassert failed, err = %d\n", ret);
+		return ret;
+	}
+
+	/* Wait for SRAM init done */
+	time_left = TIMEOUT_MICROSEC;
+	do {
+		ret = zynqmp_pm_is_sram_init_done(&is_ready);
+		if (ret)
+			return ret;
+
+		if (is_ready)
+			break;
+
+		time_left--;	/* only failed polls consume the budget */
+		usleep_range(1, 5);
+	} while (time_left);
+
+	if (!time_left) {
+		dev_err(hba->dev, "SRAM initialization failed.\n");
+		return -ETIMEDOUT;
+	}
+
+	ret = ufs_versal2_setup_phy(hba);
+	if (ret)
+		return ret;
+
+	return ufs_versal2_enable_phy(hba);
+}
+
+static int ufs_versal2_init(struct ufs_hba *hba)
+{
+	struct ufs_versal2_host *host;
+	struct device *dev = hba->dev;
+	struct ufs_clk_info *clki;
+	int ret;
+	u32 cal;
+
+	host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL);
+	if (!host)
+		return -ENOMEM;
+
+	host->hba = hba;
+	ufshcd_set_variant(hba, host);
+	host->phy_mode = UFSHCD_DWC_PHY_MODE_ROM;
+
+	list_for_each_entry(clki, &hba->clk_list_head, list) {
+		if (!strcmp(clki->name, "core"))
+			host->host_clk = clk_get_rate(clki->clk);
+	}
+
+	host->rstc = devm_reset_control_get_exclusive(dev, "host");
+	if (IS_ERR(host->rstc)) {
+		dev_err(dev, "failed to get reset ctrl: host\n");
+		return PTR_ERR(host->rstc);
+	}
+
+	host->rstphy = devm_reset_control_get_exclusive(dev, "phy");
+	if (IS_ERR(host->rstphy)) {
+		dev_err(dev, "failed to get reset ctrl: phy\n");
+		return PTR_ERR(host->rstphy);
+	}
+
+	ret = reset_control_assert(host->rstc);
+	if (ret) {
+		dev_err(hba->dev, "host reset assert failed, err = %d\n", ret);
+		return ret;
+	}
+
+	ret = reset_control_assert(host->rstphy);
+	if (ret) {
+		dev_err(hba->dev, "phy reset assert failed, err = %d\n", ret);
+		return ret;
+	}
+
+	ret = zynqmp_pm_set_sram_bypass();
+	if (ret) {
+		dev_err(dev, "Bypass SRAM interface failed, err = %d\n", ret);
+		return ret;
+	}
+
+	ret = reset_control_deassert(host->rstc);
+	if (ret) {
+		dev_err(hba->dev, "host reset deassert failed, err = %d\n", ret);
+		return ret;
+	}
+
+	ret = zynqmp_pm_get_ufs_calibration_values(&cal);
+	if (ret) {
+		dev_err(dev, "failed to read calibration values\n");
+		return ret;
+	}
+
+	host->attcompval0 = (u8)cal;
+	host->attcompval1 = (u8)(cal >> 8);
+	host->ctlecompval0 = (u8)(cal >> 16);
+	host->ctlecompval1 = (u8)(cal >> 24);
+	hba->quirks |= UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING;
+
+	return 0;
+}
+
+static int ufs_versal2_hce_enable_notify(struct ufs_hba *hba,
+					 enum ufs_notify_change_status status)
+{
+	int ret = 0;
+
+	if (status == PRE_CHANGE) {
+		ret = ufs_versal2_phy_init(hba);
+		if (ret)
+			dev_err(hba->dev, "Phy init failed (%d)\n", ret);
+	}
+
+	return ret;
+}
+
+static int ufs_versal2_link_startup_notify(struct ufs_hba *hba,
+					   enum ufs_notify_change_status status)
+{
+	struct ufs_versal2_host *host = ufshcd_get_variant(hba);
+	int ret = 0;
+
+	switch (status) {
+	case PRE_CHANGE:
+		if (host->host_clk)
+			ufshcd_writel(hba, host->host_clk / 1000000, DWC_UFS_REG_HCLKDIV);
+
+		break;
+	case POST_CHANGE:
+		ret = ufshcd_dwc_link_startup_notify(hba, status);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int ufs_versal2_phy_ratesel(struct ufs_hba *hba, u32 activelanes, u32 rx_req)
+{
+	u32 time_left, reg, lane;
+	int ret;
+
+	for (lane = 0; lane < activelanes; lane++) {
+		time_left = TIMEOUT_MICROSEC;
+		ret = ufs_versal2_phy_reg_read(hba, RX_OVRD_IN_1(lane), &reg);
+		if (ret)
+			return ret;
+
+		reg |= MPHY_RX_OVRD_EN;
+		if (rx_req)
+			reg |= MPHY_RX_OVRD_VAL;
+		else
+			reg &= ~MPHY_RX_OVRD_VAL;
+
+		ret = ufs_versal2_phy_reg_write(hba, RX_OVRD_IN_1(lane), reg);
+		if (ret)
+			return ret;
+
+		do {
+			ret = ufs_versal2_phy_reg_read(hba, RX_PCS_OUT(lane), &reg);
+			if (ret)
+				return ret;
+
+			reg &= MPHY_RX_ACK_MASK;
+			if (reg == rx_req)
+				break;
+
+			time_left--;
+			usleep_range(1, 5);
+		} while (time_left);
+
+		if (!time_left) {
+			dev_err(hba->dev, "Invalid Rx Ack value.\n");
+			return -ETIMEDOUT;
+		}
+	}
+
+	return 0;
+}
+
+static int ufs_versal2_pwr_change_notify(struct ufs_hba *hba, enum ufs_notify_change_status status,
+					 const struct ufs_pa_layer_attr *dev_max_params,
+					 struct ufs_pa_layer_attr *dev_req_params)
+{
+	struct ufs_versal2_host *host = ufshcd_get_variant(hba);
+	u32 lane, reg, rate = 0;
+	int ret = 0;
+
+	if (status == PRE_CHANGE) {
+		memcpy(dev_req_params, dev_max_params, sizeof(struct ufs_pa_layer_attr));
+
+		/* If it is not a calibrated part, switch PWRMODE to SLOW_MODE */
+		if (!host->attcompval0 && !host->attcompval1 && !host->ctlecompval0 &&
+		    !host->ctlecompval1) {
+			dev_req_params->pwr_rx = SLOW_MODE;
+			dev_req_params->pwr_tx = SLOW_MODE;
+			return 0;
+		}
+
+		if (dev_req_params->pwr_rx == SLOW_MODE || dev_req_params->pwr_rx == SLOWAUTO_MODE)
+			return 0;
+
+		if (dev_req_params->hs_rate == PA_HS_MODE_B)
+			rate = 1;
+
+		 /* Select the rate */
+		ret = ufshcd_dme_set(hba, UIC_ARG_MIB(CBRATESEL), rate);
+		if (ret)
+			return ret;
+
+		ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_MPHYCFGUPDT), 1);
+		if (ret)
+			return ret;
+
+		ret = ufs_versal2_phy_ratesel(hba, dev_req_params->lane_tx, 1);
+		if (ret)
+			return ret;
+
+		ret = ufs_versal2_phy_ratesel(hba, dev_req_params->lane_tx, 0);
+		if (ret)
+			return ret;
+
+		/* Remove rx_req override */
+		for (lane = 0; lane < dev_req_params->lane_tx; lane++) {
+			ret = ufs_versal2_phy_reg_read(hba, RX_OVRD_IN_1(lane), &reg);
+			if (ret)
+				return ret;
+
+			reg &= ~MPHY_RX_OVRD_EN;
+			ret = ufs_versal2_phy_reg_write(hba, RX_OVRD_IN_1(lane), reg);
+			if (ret)
+				return ret;
+		}
+
+		if (dev_req_params->lane_tx == UFS_LANE_2 && dev_req_params->lane_rx == UFS_LANE_2)
+			ret = ufshcd_dme_configure_adapt(hba, dev_req_params->gear_tx,
+							 PA_INITIAL_ADAPT);
+	}
+
+	return ret;
+}
+
+static struct ufs_hba_variant_ops ufs_versal2_hba_vops = {
+	.name			= "ufs-versal2-pltfm",
+	.init			= ufs_versal2_init,
+	.link_startup_notify	= ufs_versal2_link_startup_notify,
+	.hce_enable_notify	= ufs_versal2_hce_enable_notify,
+	.pwr_change_notify	= ufs_versal2_pwr_change_notify,
+};
+
+static const struct of_device_id ufs_versal2_pltfm_match[] = {
+	{
+		.compatible = "amd,versal2-ufs",
+		.data = &ufs_versal2_hba_vops,
+	},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, ufs_versal2_pltfm_match);
+
+static int ufs_versal2_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	/* Perform generic probe */
+	ret = ufshcd_pltfrm_init(pdev, &ufs_versal2_hba_vops);
+	if (ret)
+		dev_err(dev, "ufshcd_pltfrm_init() failed %d\n", ret);
+
+	return ret;
+}
+
+static void ufs_versal2_remove(struct platform_device *pdev)
+{
+	/*
+	 * Common helper; it also balances the runtime PM get done for removal.
+	 */
+	ufshcd_pltfrm_remove(pdev);
+}
+
+static const struct dev_pm_ops ufs_versal2_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(ufshcd_system_suspend, ufshcd_system_resume)
+	SET_RUNTIME_PM_OPS(ufshcd_runtime_suspend, ufshcd_runtime_resume, NULL)
+};
+
+static struct platform_driver ufs_versal2_pltfm = {
+	.probe		= ufs_versal2_probe,
+	.remove		= ufs_versal2_remove,
+	.driver		= {
+		.name	= "ufshcd-versal2",
+		.pm	= &ufs_versal2_pm_ops,
+		.of_match_table	= of_match_ptr(ufs_versal2_pltfm_match),
+	},
+};
+
+module_platform_driver(ufs_versal2_pltfm);
+
+MODULE_AUTHOR("Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>");
+MODULE_DESCRIPTION("AMD Versal Gen 2 UFS Host Controller driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ufs/host/ufshcd-dwc.h b/drivers/ufs/host/ufshcd-dwc.h
index ad91ea56662c..c618bb914904 100644
--- a/drivers/ufs/host/ufshcd-dwc.h
+++ b/drivers/ufs/host/ufshcd-dwc.h
@@ -12,6 +12,52 @@
 
 #include <ufs/ufshcd.h>
 
+/* RMMI Attributes */
+#define CBREFCLKCTRL2		0x8132
+#define CBCRCTRL		0x811F
+#define CBC10DIRECTCONF2	0x810E
+#define CBRATESEL		0x8114
+#define CBCREGADDRLSB		0x8116
+#define CBCREGADDRMSB		0x8117
+#define CBCREGWRLSB		0x8118
+#define CBCREGWRMSB		0x8119
+#define CBCREGRDLSB		0x811A
+#define CBCREGRDMSB		0x811B
+#define CBCREGRDWRSEL		0x811C
+
+#define CBREFREFCLK_GATE_OVR_EN		BIT(7)
+
+/* M-PHY Attributes */
+#define MTX_FSM_STATE		0x41
+#define MRX_FSM_STATE		0xC1
+
+/* M-PHY registers */
+#define RX_OVRD_IN_1(n)		(0x3006 + ((n) * 0x100))
+#define RX_PCS_OUT(n)		(0x300F + ((n) * 0x100))
+#define FAST_FLAGS(n)		(0x401C + ((n) * 0x100))
+#define RX_AFE_ATT_IDAC(n)	(0x4000 + ((n) * 0x100))
+#define RX_AFE_CTLE_IDAC(n)	(0x4001 + ((n) * 0x100))
+#define FW_CALIB_CCFG(n)	(0x404D + ((n) * 0x100))
+
+/* Tx/Rx FSM state */
+enum rx_fsm_state {
+	RX_STATE_DISABLED = 0,
+	RX_STATE_HIBERN8 = 1,
+	RX_STATE_SLEEP = 2,
+	RX_STATE_STALL = 3,
+	RX_STATE_LSBURST = 4,
+	RX_STATE_HSBURST = 5,
+};
+
+enum tx_fsm_state {
+	TX_STATE_DISABLED = 0,
+	TX_STATE_HIBERN8 = 1,
+	TX_STATE_SLEEP = 2,
+	TX_STATE_STALL = 3,
+	TX_STATE_LSBURST = 4,
+	TX_STATE_HSBURST = 5,
+};
+
 struct ufshcd_dme_attr_val {
 	u32 attr_sel;
 	u32 mib_val;
diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h
index 360e1245fb40..faf1c471ad30 100644
--- a/include/ufs/unipro.h
+++ b/include/ufs/unipro.h
@@ -174,6 +174,7 @@
 #define VS_POWERSTATE		0xD083
 #define VS_MPHYCFGUPDT		0xD085
 #define VS_DEBUGOMC		0xD09E
+#define VS_MPHYDISABLE		0xD0C1
 
 #define PA_GRANULARITY_MIN_VAL	1
 #define PA_GRANULARITY_MAX_VAL	6
-- 
cgit v1.2.3


From 50b8e36994a042103ea92b6d9f6d7de725f9ac5f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:30:57 -0700
Subject: lib/crypto: blake2s: Adjust parameter order of blake2s()

Reorder the parameters of blake2s() from (out, in, key, outlen, inlen,
keylen) to (key, keylen, in, inlen, out, outlen).

This aligns BLAKE2s with the common conventions of pairing buffers and
their lengths, and having outputs follow inputs.  This is widely used
elsewhere in lib/crypto/ and crypto/, and even elsewhere in the BLAKE2s
code itself such as blake2s_init_key() and blake2s_final().  So
blake2s() was a bit of an exception.

Notably, this results in the same order as hmac_*_usingrawkey().

Note that since the type signature changed, it's not possible for a
blake2s() call site to be silently missed.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 drivers/char/random.c            |  4 ++--
 drivers/net/wireguard/cookie.c   |  4 ++--
 drivers/net/wireguard/noise.c    |  4 ++--
 include/crypto/blake2s.h         |  6 +++---
 lib/crypto/tests/blake2s_kunit.c | 16 ++++++++--------
 5 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index b8b24b6ed3fe..422c5c76571b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -701,7 +701,7 @@ static void extract_entropy(void *buf, size_t len)
 
 	/* next_key = HASHPRF(seed, RDSEED || 0) */
 	block.counter = 0;
-	blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
+	blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), next_key, sizeof(next_key));
 	blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));
 
 	spin_unlock_irqrestore(&input_pool.lock, flags);
@@ -711,7 +711,7 @@ static void extract_entropy(void *buf, size_t len)
 		i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
 		/* output = HASHPRF(seed, RDSEED || ++counter) */
 		++block.counter;
-		blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
+		blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), buf, i);
 		len -= i;
 		buf += i;
 	}
diff --git a/drivers/net/wireguard/cookie.c b/drivers/net/wireguard/cookie.c
index 94d0a7206084..be1b83aae03b 100644
--- a/drivers/net/wireguard/cookie.c
+++ b/drivers/net/wireguard/cookie.c
@@ -77,7 +77,7 @@ static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len,
 {
 	len = len - sizeof(struct message_macs) +
 	      offsetof(struct message_macs, mac1);
-	blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN);
+	blake2s(key, NOISE_SYMMETRIC_KEY_LEN, message, len, mac1, COOKIE_LEN);
 }
 
 static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len,
@@ -85,7 +85,7 @@ static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len,
 {
 	len = len - sizeof(struct message_macs) +
 	      offsetof(struct message_macs, mac2);
-	blake2s(mac2, message, cookie, COOKIE_LEN, len, COOKIE_LEN);
+	blake2s(cookie, COOKIE_LEN, message, len, mac2, COOKIE_LEN);
 }
 
 static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb,
diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index 7eb9a23a3d4d..306abb876c80 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -35,8 +35,8 @@ void __init wg_noise_init(void)
 {
 	struct blake2s_state blake;
 
-	blake2s(handshake_init_chaining_key, handshake_name, NULL,
-		NOISE_HASH_LEN, sizeof(handshake_name), 0);
+	blake2s(NULL, 0, handshake_name, sizeof(handshake_name),
+		handshake_init_chaining_key, NOISE_HASH_LEN);
 	blake2s_init(&blake, NOISE_HASH_LEN);
 	blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN);
 	blake2s_update(&blake, identifier_name, sizeof(identifier_name));
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index f9ffd39194eb..a7dd678725b2 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -86,9 +86,9 @@ static inline void blake2s_init_key(struct blake2s_state *state,
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
 void blake2s_final(struct blake2s_state *state, u8 *out);
 
-static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
-			   const size_t outlen, const size_t inlen,
-			   const size_t keylen)
+static inline void blake2s(const u8 *key, const size_t keylen,
+			   const u8 *in, const size_t inlen,
+			   u8 *out, const size_t outlen)
 {
 	struct blake2s_state state;
 
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
index 057c40132246..247bbdf7dc86 100644
--- a/lib/crypto/tests/blake2s_kunit.c
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -14,7 +14,7 @@
 static void blake2s_default(const u8 *data, size_t len,
 			    u8 out[BLAKE2S_HASH_SIZE])
 {
-	blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0);
+	blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
 }
 
 static void blake2s_init_default(struct blake2s_state *state)
@@ -52,7 +52,7 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
 	for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
 		rand_bytes_seeded_from_len(key, key_len);
 		for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
-			blake2s(hash, data, key, out_len, data_len, key_len);
+			blake2s(key, key_len, data, data_len, hash, out_len);
 			blake2s_update(&main_state, hash, out_len);
 		}
 	}
@@ -80,10 +80,10 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test)
 		rand_bytes(key, key_len);
 		memcpy(guarded_key, key, key_len);
 
-		blake2s(hash1, test_buf, key,
-			BLAKE2S_HASH_SIZE, data_len, key_len);
-		blake2s(hash2, test_buf, guarded_key,
-			BLAKE2S_HASH_SIZE, data_len, key_len);
+		blake2s(key, key_len, test_buf, data_len,
+			hash1, BLAKE2S_HASH_SIZE);
+		blake2s(guarded_key, key_len, test_buf, data_len,
+			hash2, BLAKE2S_HASH_SIZE);
 		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
 
 		blake2s_init_key(&state, BLAKE2S_HASH_SIZE,
@@ -107,8 +107,8 @@ static void test_blake2s_with_guarded_out_buf(struct kunit *test)
 		u8 hash[BLAKE2S_HASH_SIZE];
 		u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
 
-		blake2s(hash, test_buf, NULL, out_len, data_len, 0);
-		blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0);
+		blake2s(NULL, 0, test_buf, data_len, hash, out_len);
+		blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
 		KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
 	}
 }
-- 
cgit v1.2.3


From 5e0ec8e46d4d6488242bb39a4ce5c0276afa5f32 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:30:58 -0700
Subject: lib/crypto: blake2s: Rename blake2s_state to blake2s_ctx

For consistency with the SHA-1, SHA-2, SHA-3 (in development), and MD5
library APIs, rename blake2s_state to blake2s_ctx.

As a refresher, the ctx name:

- Is a bit shorter.
- Avoids confusion with the compression function state, which is also
  often called the state (but is just part of the full context).
- Is consistent with OpenSSL.

Not a big deal, of course.  But consistency is nice.  With a BLAKE2b
library API about to be added, this is a convenient time to update this.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 drivers/char/random.c            |  2 +-
 drivers/net/wireguard/cookie.c   | 14 +++++-----
 drivers/net/wireguard/noise.c    | 28 +++++++++----------
 include/crypto/blake2s.h         | 59 ++++++++++++++++++++--------------------
 lib/crypto/arm/blake2s-core.S    | 10 +++----
 lib/crypto/arm/blake2s.h         |  4 +--
 lib/crypto/blake2s.c             | 58 +++++++++++++++++++--------------------
 lib/crypto/tests/blake2s_kunit.c | 23 ++++++++--------
 lib/crypto/x86/blake2s.h         | 12 ++++----
 9 files changed, 104 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 422c5c76571b..7e0486d8c51d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -636,7 +636,7 @@ enum {
 };
 
 static struct {
-	struct blake2s_state hash;
+	struct blake2s_ctx hash;
 	spinlock_t lock;
 	unsigned int init_bits;
 } input_pool = {
diff --git a/drivers/net/wireguard/cookie.c b/drivers/net/wireguard/cookie.c
index be1b83aae03b..08731b3fa32b 100644
--- a/drivers/net/wireguard/cookie.c
+++ b/drivers/net/wireguard/cookie.c
@@ -33,7 +33,7 @@ static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN],
 			   const u8 pubkey[NOISE_PUBLIC_KEY_LEN],
 			   const u8 label[COOKIE_KEY_LABEL_LEN])
 {
-	struct blake2s_state blake;
+	struct blake2s_ctx blake;
 
 	blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN);
 	blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN);
@@ -91,7 +91,7 @@ static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len,
 static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb,
 			struct cookie_checker *checker)
 {
-	struct blake2s_state state;
+	struct blake2s_ctx blake;
 
 	if (wg_birthdate_has_expired(checker->secret_birthdate,
 				     COOKIE_SECRET_MAX_AGE)) {
@@ -103,15 +103,15 @@ static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb,
 
 	down_read(&checker->secret_lock);
 
-	blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN);
+	blake2s_init_key(&blake, COOKIE_LEN, checker->secret, NOISE_HASH_LEN);
 	if (skb->protocol == htons(ETH_P_IP))
-		blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr,
+		blake2s_update(&blake, (u8 *)&ip_hdr(skb)->saddr,
 			       sizeof(struct in_addr));
 	else if (skb->protocol == htons(ETH_P_IPV6))
-		blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr,
+		blake2s_update(&blake, (u8 *)&ipv6_hdr(skb)->saddr,
 			       sizeof(struct in6_addr));
-	blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16));
-	blake2s_final(&state, cookie);
+	blake2s_update(&blake, (u8 *)&udp_hdr(skb)->source, sizeof(__be16));
+	blake2s_final(&blake, cookie);
 
 	up_read(&checker->secret_lock);
 }
diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index 306abb876c80..1fe8468f0bef 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -33,7 +33,7 @@ static atomic64_t keypair_counter = ATOMIC64_INIT(0);
 
 void __init wg_noise_init(void)
 {
-	struct blake2s_state blake;
+	struct blake2s_ctx blake;
 
 	blake2s(NULL, 0, handshake_name, sizeof(handshake_name),
 		handshake_init_chaining_key, NOISE_HASH_LEN);
@@ -304,33 +304,33 @@ void wg_noise_set_static_identity_private_key(
 
 static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen)
 {
-	struct blake2s_state state;
+	struct blake2s_ctx blake;
 	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
 	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
 	int i;
 
 	if (keylen > BLAKE2S_BLOCK_SIZE) {
-		blake2s_init(&state, BLAKE2S_HASH_SIZE);
-		blake2s_update(&state, key, keylen);
-		blake2s_final(&state, x_key);
+		blake2s_init(&blake, BLAKE2S_HASH_SIZE);
+		blake2s_update(&blake, key, keylen);
+		blake2s_final(&blake, x_key);
 	} else
 		memcpy(x_key, key, keylen);
 
 	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
 		x_key[i] ^= 0x36;
 
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, in, inlen);
-	blake2s_final(&state, i_hash);
+	blake2s_init(&blake, BLAKE2S_HASH_SIZE);
+	blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&blake, in, inlen);
+	blake2s_final(&blake, i_hash);
 
 	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
 		x_key[i] ^= 0x5c ^ 0x36;
 
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
-	blake2s_final(&state, i_hash);
+	blake2s_init(&blake, BLAKE2S_HASH_SIZE);
+	blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&blake, i_hash, BLAKE2S_HASH_SIZE);
+	blake2s_final(&blake, i_hash);
 
 	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
 	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
@@ -431,7 +431,7 @@ static bool __must_check mix_precomputed_dh(u8 chaining_key[NOISE_HASH_LEN],
 
 static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len)
 {
-	struct blake2s_state blake;
+	struct blake2s_ctx blake;
 
 	blake2s_init(&blake, NOISE_HASH_LEN);
 	blake2s_update(&blake, hash, NOISE_HASH_LEN);
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index a7dd678725b2..4c8d532ee97b 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -22,7 +22,7 @@ enum blake2s_lengths {
 	BLAKE2S_256_HASH_SIZE = 32,
 };
 
-struct blake2s_state {
+struct blake2s_ctx {
 	/* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
 	u32 h[8];
 	u32 t[2];
@@ -43,62 +43,61 @@ enum blake2s_iv {
 	BLAKE2S_IV7 = 0x5BE0CD19UL,
 };
 
-static inline void __blake2s_init(struct blake2s_state *state, size_t outlen,
+static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
 				  const void *key, size_t keylen)
 {
-	state->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
-	state->h[1] = BLAKE2S_IV1;
-	state->h[2] = BLAKE2S_IV2;
-	state->h[3] = BLAKE2S_IV3;
-	state->h[4] = BLAKE2S_IV4;
-	state->h[5] = BLAKE2S_IV5;
-	state->h[6] = BLAKE2S_IV6;
-	state->h[7] = BLAKE2S_IV7;
-	state->t[0] = 0;
-	state->t[1] = 0;
-	state->f[0] = 0;
-	state->f[1] = 0;
-	state->buflen = 0;
-	state->outlen = outlen;
+	ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+	ctx->h[1] = BLAKE2S_IV1;
+	ctx->h[2] = BLAKE2S_IV2;
+	ctx->h[3] = BLAKE2S_IV3;
+	ctx->h[4] = BLAKE2S_IV4;
+	ctx->h[5] = BLAKE2S_IV5;
+	ctx->h[6] = BLAKE2S_IV6;
+	ctx->h[7] = BLAKE2S_IV7;
+	ctx->t[0] = 0;
+	ctx->t[1] = 0;
+	ctx->f[0] = 0;
+	ctx->f[1] = 0;
+	ctx->buflen = 0;
+	ctx->outlen = outlen;
 	if (keylen) {
-		memcpy(state->buf, key, keylen);
-		memset(&state->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
-		state->buflen = BLAKE2S_BLOCK_SIZE;
+		memcpy(ctx->buf, key, keylen);
+		memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
+		ctx->buflen = BLAKE2S_BLOCK_SIZE;
 	}
 }
 
-static inline void blake2s_init(struct blake2s_state *state,
-				const size_t outlen)
+static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen)
 {
-	__blake2s_init(state, outlen, NULL, 0);
+	__blake2s_init(ctx, outlen, NULL, 0);
 }
 
-static inline void blake2s_init_key(struct blake2s_state *state,
+static inline void blake2s_init_key(struct blake2s_ctx *ctx,
 				    const size_t outlen, const void *key,
 				    const size_t keylen)
 {
 	WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
 		!key || !keylen || keylen > BLAKE2S_KEY_SIZE));
 
-	__blake2s_init(state, outlen, key, keylen);
+	__blake2s_init(ctx, outlen, key, keylen);
 }
 
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, u8 *out);
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
 
 static inline void blake2s(const u8 *key, const size_t keylen,
 			   const u8 *in, const size_t inlen,
 			   u8 *out, const size_t outlen)
 {
-	struct blake2s_state state;
+	struct blake2s_ctx ctx;
 
 	WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
 		outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
 		(!key && keylen)));
 
-	__blake2s_init(&state, outlen, key, keylen);
-	blake2s_update(&state, in, inlen);
-	blake2s_final(&state, out);
+	__blake2s_init(&ctx, outlen, key, keylen);
+	blake2s_update(&ctx, in, inlen);
+	blake2s_final(&ctx, out);
 }
 
 #endif /* _CRYPTO_BLAKE2S_H */
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
index 293f44fa8f31..78e758a7cb3e 100644
--- a/lib/crypto/arm/blake2s-core.S
+++ b/lib/crypto/arm/blake2s-core.S
@@ -170,10 +170,10 @@
 .endm
 
 //
-// void blake2s_compress(struct blake2s_state *state,
+// void blake2s_compress(struct blake2s_ctx *ctx,
 //			 const u8 *block, size_t nblocks, u32 inc);
 //
-// Only the first three fields of struct blake2s_state are used:
+// Only the first three fields of struct blake2s_ctx are used:
 //	u32 h[8];	(inout)
 //	u32 t[2];	(inout)
 //	u32 f[2];	(in)
@@ -183,7 +183,7 @@ ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
-	// r0 is 'state'
+	// r0 is 'ctx'
 	// r1 is 'block'
 	// r3 is 'inc'
 
@@ -211,7 +211,7 @@ ENTRY(blake2s_compress)
 
 	// Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
 	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
-	mov		r14, r0			// r14 = state
+	mov		r14, r0			// r14 = ctx
 	adr		r12, .Lblake2s_IV
 	ldmia		r12!, {r8-r9}		// load IV[0..1]
 	__ldrd		r0, r1, r14, 40		// load f[0..1]
@@ -275,7 +275,7 @@ ENTRY(blake2s_compress)
 	// Advance to the next block, if there is one.  Note that if there are
 	// multiple blocks, then 'inc' (the counter increment amount) must be
 	// 64.  So we can simply set it to 64 without re-loading it.
-	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
+	ldm		sp, {r0, r1, r2}	// load (ctx, block, nblocks)
 	mov		r3, #64			// set 'inc'
 	subs		r2, r2, #1		// nblocks--
 	str		r2, [sp, #8]
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
index aa7a97139ea7..ce009cd98de9 100644
--- a/lib/crypto/arm/blake2s.h
+++ b/lib/crypto/arm/blake2s.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 
 /* defined in blake2s-core.S */
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
-		      size_t nblocks, u32 inc);
+void blake2s_compress(struct blake2s_ctx *ctx,
+		      const u8 *block, size_t nblocks, u32 inc);
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 5638ed9d882d..1ad36cb29835 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -29,15 +29,15 @@ static const u8 blake2s_sigma[10][16] = {
 	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
 };
 
-static inline void blake2s_increment_counter(struct blake2s_state *state,
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx,
 					     const u32 inc)
 {
-	state->t[0] += inc;
-	state->t[1] += (state->t[0] < inc);
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
 }
 
 static void __maybe_unused
-blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
+blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block,
 			 size_t nblocks, const u32 inc)
 {
 	u32 m[16];
@@ -48,18 +48,18 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
 
 	while (nblocks > 0) {
-		blake2s_increment_counter(state, inc);
+		blake2s_increment_counter(ctx, inc);
 		memcpy(m, block, BLAKE2S_BLOCK_SIZE);
 		le32_to_cpu_array(m, ARRAY_SIZE(m));
-		memcpy(v, state->h, 32);
+		memcpy(v, ctx->h, 32);
 		v[ 8] = BLAKE2S_IV0;
 		v[ 9] = BLAKE2S_IV1;
 		v[10] = BLAKE2S_IV2;
 		v[11] = BLAKE2S_IV3;
-		v[12] = BLAKE2S_IV4 ^ state->t[0];
-		v[13] = BLAKE2S_IV5 ^ state->t[1];
-		v[14] = BLAKE2S_IV6 ^ state->f[0];
-		v[15] = BLAKE2S_IV7 ^ state->f[1];
+		v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2S_IV7 ^ ctx->f[1];
 
 #define G(r, i, a, b, c, d) do { \
 	a += b + m[blake2s_sigma[r][2 * i + 0]]; \
@@ -97,7 +97,7 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 #undef ROUND
 
 		for (i = 0; i < 8; ++i)
-			state->h[i] ^= v[i] ^ v[i + 8];
+			ctx->h[i] ^= v[i] ^ v[i + 8];
 
 		block += BLAKE2S_BLOCK_SIZE;
 		--nblocks;
@@ -110,45 +110,45 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 #define blake2s_compress blake2s_compress_generic
 #endif
 
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
 {
-	state->f[0] = -1;
+	ctx->f[0] = -1;
 }
 
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
 {
-	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+	const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
 
 	if (unlikely(!inlen))
 		return;
 	if (inlen > fill) {
-		memcpy(state->buf + state->buflen, in, fill);
-		blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
-		state->buflen = 0;
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+		ctx->buflen = 0;
 		in += fill;
 		inlen -= fill;
 	}
 	if (inlen > BLAKE2S_BLOCK_SIZE) {
 		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
-		blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+		blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
 		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
 		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
 	}
-	memcpy(state->buf + state->buflen, in, inlen);
-	state->buflen += inlen;
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
 }
 EXPORT_SYMBOL(blake2s_update);
 
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
 {
 	WARN_ON(IS_ENABLED(DEBUG) && !out);
-	blake2s_set_lastblock(state);
-	memset(state->buf + state->buflen, 0,
-	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
-	blake2s_compress(state, state->buf, 1, state->buflen);
-	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
-	memcpy(out, state->h, state->outlen);
-	memzero_explicit(state, sizeof(*state));
+	blake2s_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 EXPORT_SYMBOL(blake2s_final);
 
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
index 247bbdf7dc86..6832d9aa7b82 100644
--- a/lib/crypto/tests/blake2s_kunit.c
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -17,9 +17,9 @@ static void blake2s_default(const u8 *data, size_t len,
 	blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
 }
 
-static void blake2s_init_default(struct blake2s_state *state)
+static void blake2s_init_default(struct blake2s_ctx *ctx)
 {
-	blake2s_init(state, BLAKE2S_HASH_SIZE);
+	blake2s_init(ctx, BLAKE2S_HASH_SIZE);
 }
 
 /*
@@ -27,7 +27,7 @@ static void blake2s_init_default(struct blake2s_state *state)
  * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
  */
 #define HASH blake2s_default
-#define HASH_CTX blake2s_state
+#define HASH_CTX blake2s_ctx
 #define HASH_SIZE BLAKE2S_HASH_SIZE
 #define HASH_INIT blake2s_init_default
 #define HASH_UPDATE blake2s_update
@@ -44,19 +44,19 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
 	u8 *data = &test_buf[0];
 	u8 *key = data + data_len;
 	u8 *hash = key + BLAKE2S_KEY_SIZE;
-	struct blake2s_state main_state;
+	struct blake2s_ctx main_ctx;
 	u8 main_hash[BLAKE2S_HASH_SIZE];
 
 	rand_bytes_seeded_from_len(data, data_len);
-	blake2s_init(&main_state, BLAKE2S_HASH_SIZE);
+	blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
 	for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
 		rand_bytes_seeded_from_len(key, key_len);
 		for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
 			blake2s(key, key_len, data, data_len, hash, out_len);
-			blake2s_update(&main_state, hash, out_len);
+			blake2s_update(&main_ctx, hash, out_len);
 		}
 	}
-	blake2s_final(&main_state, main_hash);
+	blake2s_final(&main_ctx, main_hash);
 	KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
 			   BLAKE2S_HASH_SIZE);
 }
@@ -75,7 +75,7 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test)
 		u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
 		u8 hash1[BLAKE2S_HASH_SIZE];
 		u8 hash2[BLAKE2S_HASH_SIZE];
-		struct blake2s_state state;
+		struct blake2s_ctx ctx;
 
 		rand_bytes(key, key_len);
 		memcpy(guarded_key, key, key_len);
@@ -86,10 +86,9 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test)
 			hash2, BLAKE2S_HASH_SIZE);
 		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
 
-		blake2s_init_key(&state, BLAKE2S_HASH_SIZE,
-				 guarded_key, key_len);
-		blake2s_update(&state, test_buf, data_len);
-		blake2s_final(&state, hash2);
+		blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
+		blake2s_update(&ctx, test_buf, data_len);
+		blake2s_final(&ctx, hash2);
 		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
 	}
 }
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
index b6d30d2fa045..de360935b820 100644
--- a/lib/crypto/x86/blake2s.h
+++ b/lib/crypto/x86/blake2s.h
@@ -11,24 +11,24 @@
 #include <linux/kernel.h>
 #include <linux/sizes.h>
 
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
+asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
 				       const u8 *block, const size_t nblocks,
 				       const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
+asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
 					const u8 *block, const size_t nblocks,
 					const u32 inc);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-static void blake2s_compress(struct blake2s_state *state, const u8 *block,
+static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block,
 			     size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
 
 	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
-		blake2s_compress_generic(state, block, nblocks, inc);
+		blake2s_compress_generic(ctx, block, nblocks, inc);
 		return;
 	}
 
@@ -38,9 +38,9 @@ static void blake2s_compress(struct blake2s_state *state, const u8 *block,
 
 		kernel_fpu_begin();
 		if (static_branch_likely(&blake2s_use_avx512))
-			blake2s_compress_avx512(state, block, blocks, inc);
+			blake2s_compress_avx512(ctx, block, blocks, inc);
 		else
-			blake2s_compress_ssse3(state, block, blocks, inc);
+			blake2s_compress_ssse3(ctx, block, blocks, inc);
 		kernel_fpu_end();
 
 		nblocks -= blocks;
-- 
cgit v1.2.3


From 5385bcbffe5a76a74d6bb135af1c88fb235f8134 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:30:59 -0700
Subject: lib/crypto: blake2s: Drop excessive const & rename block => data

A couple more small cleanups to the BLAKE2s code before these things get
propagated into the BLAKE2b code:

- Drop 'const' from some non-pointer function parameters.  It was a bit
  excessive and not conventional.

- Rename 'block' argument of blake2s_compress*() to 'data'.  This is for
  consistency with the SHA-* code, and also to avoid the implication
  that it points to a singular "block".

No functional changes.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/blake2s.h      | 13 ++++++-------
 lib/crypto/arm/blake2s-core.S |  6 +++---
 lib/crypto/arm/blake2s.h      |  2 +-
 lib/crypto/blake2s.c          | 12 ++++++------
 lib/crypto/x86/blake2s.h      | 18 ++++++++----------
 5 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index 4c8d532ee97b..33893057eb41 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -67,14 +67,13 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
 	}
 }
 
-static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen)
+static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen)
 {
 	__blake2s_init(ctx, outlen, NULL, 0);
 }
 
-static inline void blake2s_init_key(struct blake2s_ctx *ctx,
-				    const size_t outlen, const void *key,
-				    const size_t keylen)
+static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
+				    const void *key, size_t keylen)
 {
 	WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
 		!key || !keylen || keylen > BLAKE2S_KEY_SIZE));
@@ -85,9 +84,9 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx,
 void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
 void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
 
-static inline void blake2s(const u8 *key, const size_t keylen,
-			   const u8 *in, const size_t inlen,
-			   u8 *out, const size_t outlen)
+static inline void blake2s(const u8 *key, size_t keylen,
+			   const u8 *in, size_t inlen,
+			   u8 *out, size_t outlen)
 {
 	struct blake2s_ctx ctx;
 
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
index 78e758a7cb3e..14eb7c18a836 100644
--- a/lib/crypto/arm/blake2s-core.S
+++ b/lib/crypto/arm/blake2s-core.S
@@ -171,7 +171,7 @@
 
 //
 // void blake2s_compress(struct blake2s_ctx *ctx,
-//			 const u8 *block, size_t nblocks, u32 inc);
+//			 const u8 *data, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_ctx are used:
 //	u32 h[8];	(inout)
@@ -184,7 +184,7 @@ ENTRY(blake2s_compress)
 
 .Lnext_block:
 	// r0 is 'ctx'
-	// r1 is 'block'
+	// r1 is 'data'
 	// r3 is 'inc'
 
 	// Load and increment the counter t[0..1].
@@ -275,7 +275,7 @@ ENTRY(blake2s_compress)
 	// Advance to the next block, if there is one.  Note that if there are
 	// multiple blocks, then 'inc' (the counter increment amount) must be
 	// 64.  So we can simply set it to 64 without re-loading it.
-	ldm		sp, {r0, r1, r2}	// load (ctx, block, nblocks)
+	ldm		sp, {r0, r1, r2}	// load (ctx, data, nblocks)
 	mov		r3, #64			// set 'inc'
 	subs		r2, r2, #1		// nblocks--
 	str		r2, [sp, #8]
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
index ce009cd98de9..42c04440c191 100644
--- a/lib/crypto/arm/blake2s.h
+++ b/lib/crypto/arm/blake2s.h
@@ -2,4 +2,4 @@
 
 /* defined in blake2s-core.S */
 void blake2s_compress(struct blake2s_ctx *ctx,
-		      const u8 *block, size_t nblocks, u32 inc);
+		      const u8 *data, size_t nblocks, u32 inc);
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 1ad36cb29835..6182c21ed943 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -29,16 +29,15 @@ static const u8 blake2s_sigma[10][16] = {
 	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
 };
 
-static inline void blake2s_increment_counter(struct blake2s_ctx *ctx,
-					     const u32 inc)
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
 {
 	ctx->t[0] += inc;
 	ctx->t[1] += (ctx->t[0] < inc);
 }
 
 static void __maybe_unused
-blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block,
-			 size_t nblocks, const u32 inc)
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+			 const u8 *data, size_t nblocks, u32 inc)
 {
 	u32 m[16];
 	u32 v[16];
@@ -49,7 +48,7 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block,
 
 	while (nblocks > 0) {
 		blake2s_increment_counter(ctx, inc);
-		memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+		memcpy(m, data, BLAKE2S_BLOCK_SIZE);
 		le32_to_cpu_array(m, ARRAY_SIZE(m));
 		memcpy(v, ctx->h, 32);
 		v[ 8] = BLAKE2S_IV0;
@@ -99,7 +98,7 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block,
 		for (i = 0; i < 8; ++i)
 			ctx->h[i] ^= v[i] ^ v[i + 8];
 
-		block += BLAKE2S_BLOCK_SIZE;
+		data += BLAKE2S_BLOCK_SIZE;
 		--nblocks;
 	}
 }
@@ -130,6 +129,7 @@ void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
 	}
 	if (inlen > BLAKE2S_BLOCK_SIZE) {
 		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+
 		blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
 		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
 		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
index de360935b820..f8eed6cb042e 100644
--- a/lib/crypto/x86/blake2s.h
+++ b/lib/crypto/x86/blake2s.h
@@ -12,23 +12,21 @@
 #include <linux/sizes.h>
 
 asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
-				       const u8 *block, const size_t nblocks,
-				       const u32 inc);
+				       const u8 *data, size_t nblocks, u32 inc);
 asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
-					const u8 *block, const size_t nblocks,
-					const u32 inc);
+					const u8 *data, size_t nblocks, u32 inc);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block,
-			     size_t nblocks, const u32 inc)
+static void blake2s_compress(struct blake2s_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
 
 	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
-		blake2s_compress_generic(ctx, block, nblocks, inc);
+		blake2s_compress_generic(ctx, data, nblocks, inc);
 		return;
 	}
 
@@ -38,13 +36,13 @@ static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block,
 
 		kernel_fpu_begin();
 		if (static_branch_likely(&blake2s_use_avx512))
-			blake2s_compress_avx512(ctx, block, blocks, inc);
+			blake2s_compress_avx512(ctx, data, blocks, inc);
 		else
-			blake2s_compress_ssse3(ctx, block, blocks, inc);
+			blake2s_compress_ssse3(ctx, data, blocks, inc);
 		kernel_fpu_end();
 
+		data += blocks * BLAKE2S_BLOCK_SIZE;
 		nblocks -= blocks;
-		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
 
-- 
cgit v1.2.3


From b95d4471cb5830b59667ead8d1d59dc3d661a1df Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:31:00 -0700
Subject: lib/crypto: blake2s: Document the BLAKE2s library API

Add kerneldoc for the BLAKE2s library API.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/blake2s.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include')

diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index 33893057eb41..648cb7824358 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -22,6 +22,15 @@ enum blake2s_lengths {
 	BLAKE2S_256_HASH_SIZE = 32,
 };
 
+/**
+ * struct blake2s_ctx - Context for hashing a message with BLAKE2s
+ * @h: compression function state
+ * @t: block counter
+ * @f: finalization indicator
+ * @buf: partial block buffer; the first @buflen bytes are valid
+ * @buflen: number of bytes buffered in @buf
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ */
 struct blake2s_ctx {
 	/* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
 	u32 h[8];
@@ -67,11 +76,27 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
 	}
 }
 
+/**
+ * blake2s_init() - Initialize a BLAKE2s context for a new message (unkeyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ *
+ * Context: Any context.
+ */
 static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen)
 {
 	__blake2s_init(ctx, outlen, NULL, 0);
 }
 
+/**
+ * blake2s_init_key() - Initialize a BLAKE2s context for a new message (keyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ * @key: the key
+ * @keylen: the key length in bytes, at most BLAKE2S_KEY_SIZE
+ *
+ * Context: Any context.
+ */
 static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
 				    const void *key, size_t keylen)
 {
@@ -81,9 +106,42 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
 	__blake2s_init(ctx, outlen, key, keylen);
 }
 
+/**
+ * blake2s_update() - Update a BLAKE2s context with message data
+ * @ctx: the context to update; must have been initialized
+ * @in: the message data
+ * @inlen: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
 void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
+
+/**
+ * blake2s_final() - Finish computing a BLAKE2s hash
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting BLAKE2s hash.  Its length will be equal to the
+ *	 @outlen that was passed to blake2s_init() or blake2s_init_key().
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
 void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
 
+/**
+ * blake2s() - Compute BLAKE2s hash in one shot
+ * @key: the key, or NULL for an unkeyed hash
+ * @keylen: the key length in bytes (at most BLAKE2S_KEY_SIZE), or 0 for an
+ *	    unkeyed hash
+ * @in: the message data
+ * @inlen: the data length in bytes
+ * @out: (output) the resulting BLAKE2s hash, with length @outlen
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ *
+ * Context: Any context.
+ */
 static inline void blake2s(const u8 *key, size_t keylen,
 			   const u8 *in, size_t inlen,
 			   u8 *out, size_t outlen)
-- 
cgit v1.2.3


From c99d30706043481a1d631bbd9c7a4b70fe002a2b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:31:01 -0700
Subject: byteorder: Add le64_to_cpu_array() and cpu_to_le64_array()

Add le64_to_cpu_array() and cpu_to_le64_array().  These mirror the
corresponding 32-bit functions.

These will be used by the BLAKE2b code.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/linux/byteorder/generic.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index b3705e8bbe2b..55a44199de87 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
 	}
 }
 
+static inline void le64_to_cpu_array(u64 *buf, unsigned int words)
+{
+	while (words--) {
+		__le64_to_cpus(buf);
+		buf++;
+	}
+}
+
+static inline void cpu_to_le64_array(u64 *buf, unsigned int words)
+{
+	while (words--) {
+		__cpu_to_le64s(buf);
+		buf++;
+	}
+}
+
 static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words)
 {
 	size_t i;
-- 
cgit v1.2.3


From 23a16c9533ed92cc639c8f5bd9eb104809fe2919 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:31:02 -0700
Subject: lib/crypto: blake2b: Add BLAKE2b library functions

Add a library API for BLAKE2b, closely modeled after the BLAKE2s API.

This will allow in-kernel users such as btrfs to use BLAKE2b without
going through the generic crypto layer.  In addition, as usual the
BLAKE2b crypto_shash algorithms will be reimplemented on top of this.

Note: to create lib/crypto/blake2b.c I made a copy of
lib/crypto/blake2s.c and made the updates from BLAKE2s => BLAKE2b.  This
way, the BLAKE2s and BLAKE2b code is kept consistent.  Therefore, it
borrows the SPDX-License-Identifier and Copyright from
lib/crypto/blake2s.c rather than crypto/blake2b_generic.c.

The library API uses 'struct blake2b_ctx', consistent with other
lib/crypto/ APIs.  The existing 'struct blake2b_state' will be removed
once the blake2b crypto_shash algorithms are updated to stop using it.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/blake2b.h          | 133 ++++++++++++++++++++++++++---
 include/crypto/internal/blake2b.h |  17 +++-
 lib/crypto/Kconfig                |  10 +++
 lib/crypto/Makefile               |   9 ++
 lib/crypto/blake2b.c              | 174 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+), 13 deletions(-)
 create mode 100644 lib/crypto/blake2b.c

(limited to 'include')

diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h
index dd7694477e50..4879e2ec2686 100644
--- a/include/crypto/blake2b.h
+++ b/include/crypto/blake2b.h
@@ -28,6 +28,25 @@ enum blake2b_lengths {
 	BLAKE2B_512_HASH_SIZE = 64,
 };
 
+/**
+ * struct blake2b_ctx - Context for hashing a message with BLAKE2b
+ * @h: compression function state
+ * @t: block counter
+ * @f: finalization indicator
+ * @buf: partial block buffer; the first @buflen bytes are valid
+ * @buflen: number of bytes buffered in @buf
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ */
+struct blake2b_ctx {
+	/* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
+	u64 h[8];
+	u64 t[2];
+	u64 f[2];
+	u8 buf[BLAKE2B_BLOCK_SIZE];
+	unsigned int buflen;
+	unsigned int outlen;
+};
+
 enum blake2b_iv {
 	BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL,
 	BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL,
@@ -39,19 +58,109 @@ enum blake2b_iv {
 	BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL,
 };
 
-static inline void __blake2b_init(struct blake2b_state *state, size_t outlen,
-				  size_t keylen)
+static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen,
+				  const void *key, size_t keylen)
+{
+	ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+	ctx->h[1] = BLAKE2B_IV1;
+	ctx->h[2] = BLAKE2B_IV2;
+	ctx->h[3] = BLAKE2B_IV3;
+	ctx->h[4] = BLAKE2B_IV4;
+	ctx->h[5] = BLAKE2B_IV5;
+	ctx->h[6] = BLAKE2B_IV6;
+	ctx->h[7] = BLAKE2B_IV7;
+	ctx->t[0] = 0;
+	ctx->t[1] = 0;
+	ctx->f[0] = 0;
+	ctx->f[1] = 0;
+	ctx->buflen = 0;
+	ctx->outlen = outlen;
+	if (keylen) {
+		memcpy(ctx->buf, key, keylen);
+		memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen);
+		ctx->buflen = BLAKE2B_BLOCK_SIZE;
+	}
+}
+
+/**
+ * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen)
+{
+	__blake2b_init(ctx, outlen, NULL, 0);
+}
+
+/**
+ * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ * @key: the key
+ * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen,
+				    const void *key, size_t keylen)
+{
+	WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE ||
+		!key || !keylen || keylen > BLAKE2B_KEY_SIZE));
+
+	__blake2b_init(ctx, outlen, key, keylen);
+}
+
+/**
+ * blake2b_update() - Update a BLAKE2b context with message data
+ * @ctx: the context to update; must have been initialized
+ * @in: the message data
+ * @inlen: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen);
+
+/**
+ * blake2b_final() - Finish computing a BLAKE2b hash
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting BLAKE2b hash.  Its length will be equal to the
+ *	 @outlen that was passed to blake2b_init() or blake2b_init_key().
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out);
+
+/**
+ * blake2b() - Compute BLAKE2b hash in one shot
+ * @key: the key, or NULL for an unkeyed hash
+ * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an
+ *	    unkeyed hash
+ * @in: the message data
+ * @inlen: the data length in bytes
+ * @out: (output) the resulting BLAKE2b hash, with length @outlen
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b(const u8 *key, size_t keylen,
+			   const u8 *in, size_t inlen,
+			   u8 *out, size_t outlen)
 {
-	state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
-	state->h[1] = BLAKE2B_IV1;
-	state->h[2] = BLAKE2B_IV2;
-	state->h[3] = BLAKE2B_IV3;
-	state->h[4] = BLAKE2B_IV4;
-	state->h[5] = BLAKE2B_IV5;
-	state->h[6] = BLAKE2B_IV6;
-	state->h[7] = BLAKE2B_IV7;
-	state->t[0] = 0;
-	state->t[1] = 0;
+	struct blake2b_ctx ctx;
+
+	WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
+		outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE ||
+		(!key && keylen)));
+
+	__blake2b_init(&ctx, outlen, key, keylen);
+	blake2b_update(&ctx, in, inlen);
+	blake2b_final(&ctx, out);
 }
 
 #endif /* _CRYPTO_BLAKE2B_H */
diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h
index 3e09e2485306..3712df69def1 100644
--- a/include/crypto/internal/blake2b.h
+++ b/include/crypto/internal/blake2b.h
@@ -57,13 +57,28 @@ static inline int crypto_blake2b_setkey(struct crypto_shash *tfm,
 	return 0;
 }
 
+static inline void __crypto_blake2b_init(struct blake2b_state *state,
+					 size_t outlen, size_t keylen)
+{
+	state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+	state->h[1] = BLAKE2B_IV1;
+	state->h[2] = BLAKE2B_IV2;
+	state->h[3] = BLAKE2B_IV3;
+	state->h[4] = BLAKE2B_IV4;
+	state->h[5] = BLAKE2B_IV5;
+	state->h[6] = BLAKE2B_IV6;
+	state->h[7] = BLAKE2B_IV7;
+	state->t[0] = 0;
+	state->t[1] = 0;
+}
+
 static inline int crypto_blake2b_init(struct shash_desc *desc)
 {
 	const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct blake2b_state *state = shash_desc_ctx(desc);
 	unsigned int outlen = crypto_shash_digestsize(desc->tfm);
 
-	__blake2b_init(state, outlen, tctx->keylen);
+	__crypto_blake2b_init(state, outlen, tctx->keylen);
 	return tctx->keylen ?
 	       crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0;
 }
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 8886055e938f..918378b7e833 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -28,6 +28,16 @@ config CRYPTO_LIB_ARC4
 config CRYPTO_LIB_GF128MUL
 	tristate
 
+config CRYPTO_LIB_BLAKE2B
+	tristate
+	help
+	  The BLAKE2b library functions.  Select this if your module uses any of
+	  the functions from <crypto/blake2b.h>.
+
+config CRYPTO_LIB_BLAKE2B_ARCH
+	bool
+	depends on CRYPTO_LIB_BLAKE2B && !UML
+
 # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
 
 config CRYPTO_LIB_BLAKE2S_ARCH
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index bded351aeace..f863417b1681 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -31,6 +31,15 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL)		+= gf128mul.o
 
 ################################################################################
 
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
+libblake2b-y := blake2b.o
+CFLAGS_blake2b.o := -Wframe-larger-than=4096 #  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
+CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+
+################################################################################
+
 # blake2s is used by the /dev/random driver which is always builtin
 obj-y += blake2s.o
 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c
new file mode 100644
index 000000000000..09c6d65d8a6e
--- /dev/null
+++ b/lib/crypto/blake2b.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2025 Google LLC
+ *
+ * This is an implementation of the BLAKE2b hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include <crypto/blake2b.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+static const u8 blake2b_sigma[12][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
+{
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2b_compress_generic(struct blake2b_ctx *ctx,
+			 const u8 *data, size_t nblocks, u32 inc)
+{
+	u64 m[16];
+	u64 v[16];
+	int i;
+
+	WARN_ON(IS_ENABLED(DEBUG) &&
+		(nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
+
+	while (nblocks > 0) {
+		blake2b_increment_counter(ctx, inc);
+		memcpy(m, data, BLAKE2B_BLOCK_SIZE);
+		le64_to_cpu_array(m, ARRAY_SIZE(m));
+		memcpy(v, ctx->h, 64);
+		v[ 8] = BLAKE2B_IV0;
+		v[ 9] = BLAKE2B_IV1;
+		v[10] = BLAKE2B_IV2;
+		v[11] = BLAKE2B_IV3;
+		v[12] = BLAKE2B_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2B_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2B_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2B_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+	a += b + m[blake2b_sigma[r][2 * i + 0]]; \
+	d = ror64(d ^ a, 32); \
+	c += d; \
+	b = ror64(b ^ c, 24); \
+	a += b + m[blake2b_sigma[r][2 * i + 1]]; \
+	d = ror64(d ^ a, 16); \
+	c += d; \
+	b = ror64(b ^ c, 63); \
+} while (0)
+
+#define ROUND(r) do { \
+	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+	G(r, 2, v[2], v[ 6], v[10], v[14]); \
+	G(r, 3, v[3], v[ 7], v[11], v[15]); \
+	G(r, 4, v[0], v[ 5], v[10], v[15]); \
+	G(r, 5, v[1], v[ 6], v[11], v[12]); \
+	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+		ROUND(0);
+		ROUND(1);
+		ROUND(2);
+		ROUND(3);
+		ROUND(4);
+		ROUND(5);
+		ROUND(6);
+		ROUND(7);
+		ROUND(8);
+		ROUND(9);
+		ROUND(10);
+		ROUND(11);
+
+#undef G
+#undef ROUND
+
+		for (i = 0; i < 8; ++i)
+			ctx->h[i] ^= v[i] ^ v[i + 8];
+
+		data += BLAKE2B_BLOCK_SIZE;
+		--nblocks;
+	}
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
+#else
+#define blake2b_compress blake2b_compress_generic
+#endif
+
+static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
+{
+	ctx->f[0] = -1;
+}
+
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
+{
+	const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
+
+	if (unlikely(!inlen))
+		return;
+	if (inlen > fill) {
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
+		ctx->buflen = 0;
+		in += fill;
+		inlen -= fill;
+	}
+	if (inlen > BLAKE2B_BLOCK_SIZE) {
+		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
+
+		blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
+		in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+		inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+	}
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2b_update);
+
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
+{
+	WARN_ON(IS_ENABLED(DEBUG) && !out);
+	blake2b_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(blake2b_final);
+
+#ifdef blake2b_mod_init_arch
+static int __init blake2b_mod_init(void)
+{
+	blake2b_mod_init_arch();
+	return 0;
+}
+subsys_initcall(blake2b_mod_init);
+
+static void __exit blake2b_mod_exit(void)
+{
+}
+module_exit(blake2b_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("BLAKE2b hash function");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From fa3ca9bfe3f001ed306cb3ce9761dacffbe143f8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 17 Oct 2025 21:31:05 -0700
Subject: crypto: blake2b - Reimplement using library API

Replace blake2b_generic.c with a new file blake2b.c which implements the
BLAKE2b crypto_shash algorithms on top of the BLAKE2b library API.

Change the driver name suffix from "-generic" to "-lib" to reflect that
these algorithms now just use the (possibly arch-optimized) library.

This closely mirrors crypto/{md5,sha1,sha256,sha512}.c.

Remove include/crypto/internal/blake2b.h since it is no longer used.
Likewise, remove struct blake2b_state from include/crypto/blake2b.h.

Omit support for import_core and export_core, since there are no legacy
drivers that need these for these algorithms.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-10-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 crypto/Kconfig                    |   1 +
 crypto/Makefile                   |   3 +-
 crypto/blake2b.c                  | 111 ++++++++++++++++++++++
 crypto/blake2b_generic.c          | 192 --------------------------------------
 crypto/testmgr.c                  |   4 +
 include/crypto/blake2b.h          |  10 --
 include/crypto/internal/blake2b.h | 116 -----------------------
 7 files changed, 117 insertions(+), 320 deletions(-)
 create mode 100644 crypto/blake2b.c
 delete mode 100644 crypto/blake2b_generic.c
 delete mode 100644 include/crypto/internal/blake2b.h

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index a04595f9d0ca..0a7e74ac870b 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -881,6 +881,7 @@ menu "Hashes, digests, and MACs"
 config CRYPTO_BLAKE2B
 	tristate "BLAKE2b"
 	select CRYPTO_HASH
+	select CRYPTO_LIB_BLAKE2B
 	help
 	  BLAKE2b cryptographic hash function (RFC 7693)
 
diff --git a/crypto/Makefile b/crypto/Makefile
index e430e6e99b6a..5b02ca2cb04e 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -83,8 +83,7 @@ obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o
 obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
-obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o
-CFLAGS_blake2b_generic.o := -Wframe-larger-than=4096 #  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
 obj-$(CONFIG_CRYPTO_CBC) += cbc.o
 obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o
diff --git a/crypto/blake2b.c b/crypto/blake2b.c
new file mode 100644
index 000000000000..67a6dae43a54
--- /dev/null
+++ b/crypto/blake2b.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Crypto API support for BLAKE2b
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2b.h>
+#include <crypto/internal/hash.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+struct blake2b_tfm_ctx {
+	unsigned int keylen;
+	u8 key[BLAKE2B_KEY_SIZE];
+};
+
+static int crypto_blake2b_setkey(struct crypto_shash *tfm,
+				 const u8 *key, unsigned int keylen)
+{
+	struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+	if (keylen > BLAKE2B_KEY_SIZE)
+		return -EINVAL;
+	memcpy(tctx->key, key, keylen);
+	tctx->keylen = keylen;
+	return 0;
+}
+
+#define BLAKE2B_CTX(desc) ((struct blake2b_ctx *)shash_desc_ctx(desc))
+
+static int crypto_blake2b_init(struct shash_desc *desc)
+{
+	const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	unsigned int digestsize = crypto_shash_digestsize(desc->tfm);
+
+	/* Unkeyed use has keylen == 0, which blake2b_init_key() WARNs on. */
+	__blake2b_init(BLAKE2B_CTX(desc), digestsize, tctx->key, tctx->keylen);
+	return 0;
+}
+
+static int crypto_blake2b_update(struct shash_desc *desc,
+				 const u8 *data, unsigned int len)
+{
+	blake2b_update(BLAKE2B_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_blake2b_final(struct shash_desc *desc, u8 *out)
+{
+	blake2b_final(BLAKE2B_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_blake2b_digest(struct shash_desc *desc,
+				 const u8 *data, unsigned int len, u8 *out)
+{
+	const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	unsigned int digestsize = crypto_shash_digestsize(desc->tfm);
+
+	blake2b(tctx->key, tctx->keylen, data, len, out, digestsize);
+	return 0;
+}
+
+#define BLAKE2B_ALG(name, digest_size)					\
+	{								\
+		.base.cra_name		= name,				\
+		.base.cra_driver_name	= name "-lib",			\
+		.base.cra_priority	= 300,				\
+		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
+		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE,		\
+		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
+		.base.cra_module	= THIS_MODULE,			\
+		.digestsize		= digest_size,			\
+		.setkey			= crypto_blake2b_setkey,	\
+		.init			= crypto_blake2b_init,		\
+		.update			= crypto_blake2b_update,	\
+		.final			= crypto_blake2b_final,		\
+		.digest			= crypto_blake2b_digest,	\
+		.descsize		= sizeof(struct blake2b_ctx),	\
+	}
+
+static struct shash_alg algs[] = {
+	BLAKE2B_ALG("blake2b-160", BLAKE2B_160_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-256", BLAKE2B_256_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-384", BLAKE2B_384_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-512", BLAKE2B_512_HASH_SIZE),
+};
+
+static int __init crypto_blake2b_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+module_init(crypto_blake2b_mod_init);
+
+static void __exit crypto_blake2b_mod_exit(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+module_exit(crypto_blake2b_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Crypto API support for BLAKE2b");
+
+MODULE_ALIAS_CRYPTO("blake2b-160");
+MODULE_ALIAS_CRYPTO("blake2b-160-lib");
+MODULE_ALIAS_CRYPTO("blake2b-256");
+MODULE_ALIAS_CRYPTO("blake2b-256-lib");
+MODULE_ALIAS_CRYPTO("blake2b-384");
+MODULE_ALIAS_CRYPTO("blake2b-384-lib");
+MODULE_ALIAS_CRYPTO("blake2b-512");
+MODULE_ALIAS_CRYPTO("blake2b-512-lib");
diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c
deleted file mode 100644
index 60f056217510..000000000000
--- a/crypto/blake2b_generic.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0)
-/*
- * Generic implementation of the BLAKE2b digest algorithm.  Based on the BLAKE2b
- * reference implementation, but it has been heavily modified for use in the
- * kernel.  The reference implementation was:
- *
- *	Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under
- *	the terms of the CC0, the OpenSSL Licence, or the Apache Public License
- *	2.0, at your option.  The terms of these licenses can be found at:
- *
- *	- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- *	- OpenSSL license   : https://www.openssl.org/source/license.html
- *	- Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
- *
- * More information about BLAKE2 can be found at https://blake2.net.
- */
-
-#include <crypto/internal/blake2b.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-static const u8 blake2b_sigma[12][16] = {
-	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
-	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
-	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
-	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
-	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
-	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
-	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
-	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
-	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
-	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
-};
-
-static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc)
-{
-	S->t[0] += inc;
-	S->t[1] += (S->t[0] < inc);
-}
-
-#define G(r,i,a,b,c,d)                                  \
-	do {                                            \
-		a = a + b + m[blake2b_sigma[r][2*i+0]]; \
-		d = ror64(d ^ a, 32);                   \
-		c = c + d;                              \
-		b = ror64(b ^ c, 24);                   \
-		a = a + b + m[blake2b_sigma[r][2*i+1]]; \
-		d = ror64(d ^ a, 16);                   \
-		c = c + d;                              \
-		b = ror64(b ^ c, 63);                   \
-	} while (0)
-
-#define ROUND(r)                                \
-	do {                                    \
-		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-		G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-		G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-		G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-		G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-	} while (0)
-
-static void blake2b_compress_one_generic(struct blake2b_state *S,
-					 const u8 block[BLAKE2B_BLOCK_SIZE])
-{
-	u64 m[16];
-	u64 v[16];
-	size_t i;
-
-	for (i = 0; i < 16; ++i)
-		m[i] = get_unaligned_le64(block + i * sizeof(m[i]));
-
-	for (i = 0; i < 8; ++i)
-		v[i] = S->h[i];
-
-	v[ 8] = BLAKE2B_IV0;
-	v[ 9] = BLAKE2B_IV1;
-	v[10] = BLAKE2B_IV2;
-	v[11] = BLAKE2B_IV3;
-	v[12] = BLAKE2B_IV4 ^ S->t[0];
-	v[13] = BLAKE2B_IV5 ^ S->t[1];
-	v[14] = BLAKE2B_IV6 ^ S->f[0];
-	v[15] = BLAKE2B_IV7 ^ S->f[1];
-
-	ROUND(0);
-	ROUND(1);
-	ROUND(2);
-	ROUND(3);
-	ROUND(4);
-	ROUND(5);
-	ROUND(6);
-	ROUND(7);
-	ROUND(8);
-	ROUND(9);
-	ROUND(10);
-	ROUND(11);
-#ifdef CONFIG_CC_IS_CLANG
-#pragma nounroll /* https://llvm.org/pr45803 */
-#endif
-	for (i = 0; i < 8; ++i)
-		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
-}
-
-#undef G
-#undef ROUND
-
-static void blake2b_compress_generic(struct blake2b_state *state,
-				     const u8 *block, size_t nblocks, u32 inc)
-{
-	do {
-		blake2b_increment_counter(state, inc);
-		blake2b_compress_one_generic(state, block);
-		block += BLAKE2B_BLOCK_SIZE;
-	} while (--nblocks);
-}
-
-static int crypto_blake2b_update_generic(struct shash_desc *desc,
-					 const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2b_update_bo(desc, in, inlen,
-					blake2b_compress_generic);
-}
-
-static int crypto_blake2b_finup_generic(struct shash_desc *desc, const u8 *in,
-					unsigned int inlen, u8 *out)
-{
-	return crypto_blake2b_finup(desc, in, inlen, out,
-				    blake2b_compress_generic);
-}
-
-#define BLAKE2B_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 100,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY |	\
-					  CRYPTO_AHASH_ALG_BLOCK_ONLY |	\
-					  CRYPTO_AHASH_ALG_FINAL_NONZERO, \
-		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2b_setkey,	\
-		.init			= crypto_blake2b_init,		\
-		.update			= crypto_blake2b_update_generic, \
-		.finup			= crypto_blake2b_finup_generic,	\
-		.descsize		= BLAKE2B_DESC_SIZE,		\
-		.statesize		= BLAKE2B_STATE_SIZE,		\
-	}
-
-static struct shash_alg blake2b_algs[] = {
-	BLAKE2B_ALG("blake2b-160", "blake2b-160-generic",
-		    BLAKE2B_160_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-256", "blake2b-256-generic",
-		    BLAKE2B_256_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-384", "blake2b-384-generic",
-		    BLAKE2B_384_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-512", "blake2b-512-generic",
-		    BLAKE2B_512_HASH_SIZE),
-};
-
-static int __init blake2b_mod_init(void)
-{
-	return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs));
-}
-
-static void __exit blake2b_mod_fini(void)
-{
-	crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs));
-}
-
-module_init(blake2b_mod_init);
-module_exit(blake2b_mod_fini);
-
-MODULE_AUTHOR("David Sterba <kdave@kernel.org>");
-MODULE_DESCRIPTION("BLAKE2b generic implementation");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("blake2b-160");
-MODULE_ALIAS_CRYPTO("blake2b-160-generic");
-MODULE_ALIAS_CRYPTO("blake2b-256");
-MODULE_ALIAS_CRYPTO("blake2b-256-generic");
-MODULE_ALIAS_CRYPTO("blake2b-384");
-MODULE_ALIAS_CRYPTO("blake2b-384-generic");
-MODULE_ALIAS_CRYPTO("blake2b-512");
-MODULE_ALIAS_CRYPTO("blake2b-512-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 6a490aaa71b9..3ab7adc1cdce 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4332,6 +4332,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 	}, {
 		.alg = "blake2b-160",
+		.generic_driver = "blake2b-160-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 0,
 		.suite = {
@@ -4339,6 +4340,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "blake2b-256",
+		.generic_driver = "blake2b-256-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 0,
 		.suite = {
@@ -4346,6 +4348,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "blake2b-384",
+		.generic_driver = "blake2b-384-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 0,
 		.suite = {
@@ -4353,6 +4356,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "blake2b-512",
+		.generic_driver = "blake2b-512-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 0,
 		.suite = {
diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h
index 4879e2ec2686..3bc37fd103a7 100644
--- a/include/crypto/blake2b.h
+++ b/include/crypto/blake2b.h
@@ -7,20 +7,10 @@
 #include <linux/types.h>
 #include <linux/string.h>
 
-struct blake2b_state {
-	/* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
-	u64 h[8];
-	u64 t[2];
-	/* The true state ends here.  The rest is temporary storage. */
-	u64 f[2];
-};
-
 enum blake2b_lengths {
 	BLAKE2B_BLOCK_SIZE = 128,
 	BLAKE2B_HASH_SIZE = 64,
 	BLAKE2B_KEY_SIZE = 64,
-	BLAKE2B_STATE_SIZE = offsetof(struct blake2b_state, f),
-	BLAKE2B_DESC_SIZE = sizeof(struct blake2b_state),
 
 	BLAKE2B_160_HASH_SIZE = 20,
 	BLAKE2B_256_HASH_SIZE = 32,
diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h
deleted file mode 100644
index 3712df69def1..000000000000
--- a/include/crypto/internal/blake2b.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Helper functions for BLAKE2b implementations.
- * Keep this in sync with the corresponding BLAKE2s header.
- */
-
-#ifndef _CRYPTO_INTERNAL_BLAKE2B_H
-#define _CRYPTO_INTERNAL_BLAKE2B_H
-
-#include <asm/byteorder.h>
-#include <crypto/blake2b.h>
-#include <crypto/internal/hash.h>
-#include <linux/array_size.h>
-#include <linux/compiler.h>
-#include <linux/build_bug.h>
-#include <linux/errno.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-static inline void blake2b_set_lastblock(struct blake2b_state *state)
-{
-	state->f[0] = -1;
-	state->f[1] = 0;
-}
-
-static inline void blake2b_set_nonlast(struct blake2b_state *state)
-{
-	state->f[0] = 0;
-	state->f[1] = 0;
-}
-
-typedef void (*blake2b_compress_t)(struct blake2b_state *state,
-				   const u8 *block, size_t nblocks, u32 inc);
-
-/* Helper functions for shash implementations of BLAKE2b */
-
-struct blake2b_tfm_ctx {
-	u8 key[BLAKE2B_BLOCK_SIZE];
-	unsigned int keylen;
-};
-
-static inline int crypto_blake2b_setkey(struct crypto_shash *tfm,
-					const u8 *key, unsigned int keylen)
-{
-	struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-
-	if (keylen > BLAKE2B_KEY_SIZE)
-		return -EINVAL;
-
-	BUILD_BUG_ON(BLAKE2B_KEY_SIZE > BLAKE2B_BLOCK_SIZE);
-
-	memcpy(tctx->key, key, keylen);
-	memset(tctx->key + keylen, 0, BLAKE2B_BLOCK_SIZE - keylen);
-	tctx->keylen = keylen;
-
-	return 0;
-}
-
-static inline void __crypto_blake2b_init(struct blake2b_state *state,
-					 size_t outlen, size_t keylen)
-{
-	state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
-	state->h[1] = BLAKE2B_IV1;
-	state->h[2] = BLAKE2B_IV2;
-	state->h[3] = BLAKE2B_IV3;
-	state->h[4] = BLAKE2B_IV4;
-	state->h[5] = BLAKE2B_IV5;
-	state->h[6] = BLAKE2B_IV6;
-	state->h[7] = BLAKE2B_IV7;
-	state->t[0] = 0;
-	state->t[1] = 0;
-}
-
-static inline int crypto_blake2b_init(struct shash_desc *desc)
-{
-	const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
-	struct blake2b_state *state = shash_desc_ctx(desc);
-	unsigned int outlen = crypto_shash_digestsize(desc->tfm);
-
-	__crypto_blake2b_init(state, outlen, tctx->keylen);
-	return tctx->keylen ?
-	       crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0;
-}
-
-static inline int crypto_blake2b_update_bo(struct shash_desc *desc,
-					   const u8 *in, unsigned int inlen,
-					   blake2b_compress_t compress)
-{
-	struct blake2b_state *state = shash_desc_ctx(desc);
-
-	blake2b_set_nonlast(state);
-	compress(state, in, inlen / BLAKE2B_BLOCK_SIZE, BLAKE2B_BLOCK_SIZE);
-	return inlen - round_down(inlen, BLAKE2B_BLOCK_SIZE);
-}
-
-static inline int crypto_blake2b_finup(struct shash_desc *desc, const u8 *in,
-				       unsigned int inlen, u8 *out,
-				       blake2b_compress_t compress)
-{
-	struct blake2b_state *state = shash_desc_ctx(desc);
-	u8 buf[BLAKE2B_BLOCK_SIZE];
-	int i;
-
-	memcpy(buf, in, inlen);
-	memset(buf + inlen, 0, BLAKE2B_BLOCK_SIZE - inlen);
-	blake2b_set_lastblock(state);
-	compress(state, buf, 1, inlen);
-	for (i = 0; i < ARRAY_SIZE(state->h); i++)
-		__cpu_to_le64s(&state->h[i]);
-	memcpy(out, state->h, crypto_shash_digestsize(desc->tfm));
-	memzero_explicit(buf, sizeof(buf));
-	return 0;
-}
-
-#endif /* _CRYPTO_INTERNAL_BLAKE2B_H */
-- 
cgit v1.2.3


From db82ddeaf42b93799a52df347284062893ea2ad6 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Mon, 27 Oct 2025 14:22:14 +0200
Subject: wifi: mac80211: add RX flag to report radiotap VHT information

mac80211 already reports some basic information in the radiotap header
with the known fields declared by the driver. However, drivers may want
to report more accurate information and in that case the full VHT
radiotap structure needs to be provided.

Add a new RX_FLAG_RADIOTAP_VHT which is set when the VHT information
should be pulled from the skb. Update the code that fills in the VHT
fields so that it only does so when requested by the driver or if the
information has not yet been set. This way the driver can fully control
the information if it so chooses.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251027142118.0bad1c307a21.I2cf285c20a822698039603f2af00ed9c548f2ee0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h |  20 +++++++-
 include/net/mac80211.h           |   2 +
 net/mac80211/rx.c                | 104 +++++++++++++++++++++++++--------------
 3 files changed, 89 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 813e163ce27c..c60867e7e43c 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2017		Intel Deutschland GmbH
- * Copyright (c) 2018-2019, 2021-2022 Intel Corporation
+ * Copyright (c) 2018-2019, 2021-2022, 2025 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -202,6 +202,24 @@ enum ieee80211_radiotap_vht_coding {
 	IEEE80211_RADIOTAP_CODING_LDPC_USER3 = 0x08,
 };
 
+enum ieee80211_radiotap_vht_bandwidth {
+	/* Note: more values are defined but can't really be used */
+	IEEE80211_RADIOTAP_VHT_BW_20		= 0,
+	IEEE80211_RADIOTAP_VHT_BW_40		= 1,
+	IEEE80211_RADIOTAP_VHT_BW_80		= 4,
+	IEEE80211_RADIOTAP_VHT_BW_160		= 11,
+};
+
+struct ieee80211_radiotap_vht {
+	__le16 known;
+	u8 flags;
+	u8 bandwidth;
+	u8 mcs_nss[4];
+	u8 coding;
+	u8 group_id;
+	__le16 partial_aid;
+} __packed;
+
 /* for IEEE80211_RADIOTAP_TIMESTAMP */
 enum ieee80211_radiotap_timestamp_unit_spos {
 	IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK = 0x000F,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a55085cf4ec4..c326243e1f01 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1529,6 +1529,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  *	known the frame shouldn't be reported.
  * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by
  *	hardware or driver)
+ * @RX_FLAG_RADIOTAP_VHT: VHT radiotap data is present
  */
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
@@ -1564,6 +1565,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_RADIOTAP_LSIG		= BIT(28),
 	RX_FLAG_NO_PSDU			= BIT(29),
 	RX_FLAG_8023			= BIT(30),
+	RX_FLAG_RADIOTAP_VHT		= BIT(31),
 };
 
 /**
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4641a2a80856..b59aeed340b3 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -59,7 +59,8 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb,
 	status->flag &= ~(RX_FLAG_RADIOTAP_TLV_AT_END |
 			  RX_FLAG_RADIOTAP_LSIG |
 			  RX_FLAG_RADIOTAP_HE_MU |
-			  RX_FLAG_RADIOTAP_HE);
+			  RX_FLAG_RADIOTAP_HE |
+			  RX_FLAG_RADIOTAP_VHT);
 
 	hdr = (void *)skb->data;
 	fc = hdr->frame_control;
@@ -151,8 +152,10 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 	}
 
 	if (status->encoding == RX_ENC_VHT) {
+		/* Included even if RX_FLAG_RADIOTAP_VHT is not set */
 		len = ALIGN(len, 2);
 		len += 12;
+		BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_vht) != 12);
 	}
 
 	if (local->hw.radiotap_timestamp.units_pos >= 0) {
@@ -195,6 +198,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 		 * The position to look at depends on the existence (or non-
 		 * existence) of other elements, so take that into account...
 		 */
+		if (status->flag & RX_FLAG_RADIOTAP_VHT)
+			tlv_offset +=
+				sizeof(struct ieee80211_radiotap_vht);
 		if (status->flag & RX_FLAG_RADIOTAP_HE)
 			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_he);
@@ -319,10 +325,17 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	u32 tlvs_len = 0;
 	int mpdulen, chain;
 	unsigned long chains = status->chains;
+	struct ieee80211_radiotap_vht vht = {};
 	struct ieee80211_radiotap_he he = {};
 	struct ieee80211_radiotap_he_mu he_mu = {};
 	struct ieee80211_radiotap_lsig lsig = {};
 
+	if (status->flag & RX_FLAG_RADIOTAP_VHT) {
+		vht = *(struct ieee80211_radiotap_vht *)skb->data;
+		skb_pull(skb, sizeof(vht));
+		WARN_ON_ONCE(status->encoding != RX_ENC_VHT);
+	}
+
 	if (status->flag & RX_FLAG_RADIOTAP_HE) {
 		he = *(struct ieee80211_radiotap_he *)skb->data;
 		skb_pull(skb, sizeof(he));
@@ -530,45 +543,61 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	}
 
 	if (status->encoding == RX_ENC_VHT) {
-		u16 known = local->hw.radiotap_vht_details;
+		u16 fill = local->hw.radiotap_vht_details;
 
-		rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT));
-		put_unaligned_le16(known, pos);
-		pos += 2;
-		/* flags */
-		if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
-			*pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
+		/* Leave driver filled fields alone */
+		fill &= ~le16_to_cpu(vht.known);
+		vht.known |= cpu_to_le16(fill);
+
+		if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_GI &&
+		    status->enc_flags & RX_ENC_FLAG_SHORT_GI)
+			vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
 		/* in VHT, STBC is binary */
-		if (status->enc_flags & RX_ENC_FLAG_STBC_MASK)
-			*pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC;
-		if (status->enc_flags & RX_ENC_FLAG_BF)
+		if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_STBC &&
+		    status->enc_flags & RX_ENC_FLAG_STBC_MASK)
+			vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_STBC;
+		if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BEAMFORMED &&
+		    status->enc_flags & RX_ENC_FLAG_BF)
-			*pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED;
+			vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED;
-		pos++;
-		/* bandwidth */
-		switch (status->bw) {
-		case RATE_INFO_BW_80:
-			*pos++ = 4;
-			break;
-		case RATE_INFO_BW_160:
-			*pos++ = 11;
-			break;
-		case RATE_INFO_BW_40:
-			*pos++ = 1;
-			break;
-		default:
-			*pos++ = 0;
+
+		if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) {
+			switch (status->bw) {
+			case RATE_INFO_BW_40:
+				vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_40;
+				break;
+			case RATE_INFO_BW_80:
+				vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_80;
+				break;
+			case RATE_INFO_BW_160:
+				vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_160;
+				break;
+			default:
+				vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_20;
+				break;
+			}
 		}
-		/* MCS/NSS */
-		*pos = (status->rate_idx << 4) | status->nss;
-		pos += 4;
-		/* coding field */
-		if (status->enc_flags & RX_ENC_FLAG_LDPC)
-			*pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0;
-		pos++;
-		/* group ID */
-		pos++;
-		/* partial_aid */
-		pos += 2;
+
+		/*
+		 * If the driver filled in mcs_nss[0], then do not touch it.
+		 *
+		 * Otherwise, put some information about MCS/NSS into the
+		 * user 0 field. Note that this is not technically correct for
+		 * an MU frame as we might have decoded a different user.
+		 */
+		if (!vht.mcs_nss[0]) {
+			vht.mcs_nss[0] = (status->rate_idx << 4) | status->nss;
+
+			/* coding field */
+			if (status->enc_flags & RX_ENC_FLAG_LDPC)
+				vht.coding |= IEEE80211_RADIOTAP_CODING_LDPC_USER0;
+		}
+
+		/* ensure 2 byte alignment */
+		while ((pos - (u8 *)rthdr) & 1)
+			pos++;
+		rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT));
+		memcpy(pos, &vht, sizeof(vht));
+		pos += sizeof(vht);
 	}
 
 	if (local->hw.radiotap_timestamp.units_pos >= 0) {
@@ -834,6 +863,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 		return NULL;
 	}
 
+	if (status->flag & RX_FLAG_RADIOTAP_VHT)
+		rtap_space += sizeof(struct ieee80211_radiotap_vht);
+
 	if (status->flag & RX_FLAG_RADIOTAP_HE)
 		rtap_space += sizeof(struct ieee80211_radiotap_he);
 
-- 
cgit v1.2.3


From 61fafbee6cfed283c02a320896089f658fa67e56 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Tue, 28 Oct 2025 04:22:48 +0200
Subject: xfrm: Determine inner GSO type from packet inner protocol

The GSO segmentation functions for ESP tunnel mode
(xfrm4_tunnel_gso_segment and xfrm6_tunnel_gso_segment) were
determining the inner packet's L2 protocol type by checking the static
x->inner_mode.family field from the xfrm state.

This is unreliable. In tunnel mode, the state's actual inner family
could be defined by x->inner_mode.family or by
x->inner_mode_iaf.family. Checking only the former can lead to a
mismatch with the actual packet being processed, causing GSO to create
segments with the wrong L2 header type.

This patch fixes the bug by deriving the inner mode directly from the
packet's inner protocol stored in XFRM_MODE_SKB_CB(skb)->protocol.

Instead of replicating the code, this patch modifies the
xfrm_ip2inner_mode helper function. It now correctly returns
&x->inner_mode if the selector family (x->sel.family) is already
specified, thereby handling both specific and AF_UNSPEC cases
appropriately.

With this change, ESP GSO can use xfrm_ip2inner_mode to get the
correct inner mode. It doesn't affect existing callers, as the updated
logic now mirrors the checks they were already performing externally.

Fixes: 26dbd66eab80 ("esp: choose the correct inner protocol for GSO on inter address family tunnels")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      | 3 ++-
 net/ipv4/esp4_offload.c | 6 ++++--
 net/ipv6/esp6_offload.c | 6 ++++--
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f3014e4f54fc..0a14daaa5dd4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family)
 
 static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
 {
-	if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
+	if ((x->sel.family != AF_UNSPEC) ||
+	    (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
 	    (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
 		return &x->inner_mode;
 	else
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e0d94270da28..05828d4cb6cd 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
 						struct sk_buff *skb,
 						netdev_features_t features)
 {
-	__be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6)
-						       : htons(ETH_P_IP);
+	const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+					XFRM_MODE_SKB_CB(skb)->protocol);
+	__be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6)
+						     : htons(ETH_P_IP);
 
 	return skb_eth_gso_segment(skb, features, type);
 }
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 7b41fb4f00b5..22410243ebe8 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
 						struct sk_buff *skb,
 						netdev_features_t features)
 {
-	__be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP)
-						      : htons(ETH_P_IPV6);
+	const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+					XFRM_MODE_SKB_CB(skb)->protocol);
+	__be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP)
+						    : htons(ETH_P_IPV6);
 
 	return skb_eth_gso_segment(skb, features, type);
 }
-- 
cgit v1.2.3


From 57347d58a4011551e7d0e030f2f12e4d1a28feb6 Mon Sep 17 00:00:00 2001
From: "caivive (Weibiao Tu)" <cavivie@gmail.com>
Date: Thu, 28 Nov 2024 20:52:04 +0800
Subject: netfilter: fix typo in nf_conntrack_l4proto.h comment

In the comment for nf_conntrack_l4proto.h, the word "nfnetink" was
incorrectly spelled. It has been corrected to "nfnetlink".

Fixes a typo to enhance readability and ensure consistency.

Signed-off-by: caivive (Weibiao Tu) <cavivie@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 6929f8daf1ed..cd5020835a6d 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -30,7 +30,7 @@ struct nf_conntrack_l4proto {
 	/* called by gc worker if table is full */
 	bool (*can_early_drop)(const struct nf_conn *ct);
 
-	/* convert protoinfo to nfnetink attributes */
+	/* convert protoinfo to nfnetlink attributes */
 	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
 			 struct nf_conn *ct, bool destroy);
 
-- 
cgit v1.2.3


From 74a7b4f18396f07e87c7fda5c19d1fcfb8c1dd44 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 17 Oct 2025 00:08:02 -0700
Subject: sysctl: fix kernel-doc format warning

Describe the "type" struct member using '@type' and move it together
with the rest of the doc for ctl_table_header to avoid a kernel-doc
warning:

Warning: include/linux/sysctl.h:178 Incorrect use of kernel-doc format:
 * enum type - Enumeration to differentiate between ctl target types

Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 92e9146b1104..28c4a997fd21 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -156,6 +156,10 @@ struct ctl_node {
  * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered.
  * @rcu: Delays the freeing of the inode. Introduced with "unfuck proc_sysctl ->d_compare()"
  *
+ * @type: Enumeration to differentiate between ctl target types
+ * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
+ * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir
+ *                                            target to serve as a mount point
  */
 struct ctl_table_header {
 	union {
@@ -175,13 +179,6 @@ struct ctl_table_header {
 	struct ctl_dir *parent;
 	struct ctl_node *node;
 	struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
-	/**
-	 * enum type - Enumeration to differentiate between ctl target types
-	 * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
-	 * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently
-	 *                                       empty directory target to serve
-	 *                                       as mount point.
-	 */
 	enum {
 		SYSCTL_TABLE_TYPE_DEFAULT,
 		SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY,
-- 
cgit v1.2.3


From aef3cdb47bbbef9fea9512ed6c02d64394449d53 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic@linux.ibm.com>
Date: Mon, 27 Oct 2025 23:48:55 +0100
Subject: net/smc: make wr buffer count configurable

Think SMC_WR_BUF_CNT_SEND := SMC_WR_BUF_CNT used in send context and
SMC_WR_BUF_CNT_RECV := 3 * SMC_WR_BUF_CNT used in recv context. Those
get replaced with lgr->max_send_wr and lgr->max_recv_wr respectively.

Please note that with the default sysctl values
qp_attr.cap.max_send_wr == qp_attr.cap.max_recv_wr is maintained, but
this can no longer be assumed to be generally true. I see no downside to
that, but my confidence level is rather modest.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Link: https://patch.msgid.link/20251027224856.2970019-2-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/smc-sysctl.rst | 36 +++++++++++++++++++++++++++++++++
 include/net/netns/smc.h                 |  2 ++
 net/smc/smc_core.h                      |  6 ++++++
 net/smc/smc_ib.c                        | 10 ++++-----
 net/smc/smc_llc.c                       |  2 ++
 net/smc/smc_sysctl.c                    | 22 ++++++++++++++++++++
 net/smc/smc_sysctl.h                    |  2 ++
 net/smc/smc_wr.c                        | 31 ++++++++++++++--------------
 net/smc/smc_wr.h                        |  2 --
 9 files changed, 91 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
index a874d007f2db..337ac2be167e 100644
--- a/Documentation/networking/smc-sysctl.rst
+++ b/Documentation/networking/smc-sysctl.rst
@@ -71,3 +71,39 @@ smcr_max_conns_per_lgr - INTEGER
 	acceptable value ranges from 16 to 255. Only for SMC-R v2.1 and later.
 
 	Default: 255
+
+smcr_max_send_wr - INTEGER
+	So-called work request buffers are SMCR link (and RDMA queue pair) level
+	resources necessary for performing RDMA operations. Since up to 255
+	connections can share a link group and thus also a link and the number
+	of the work request buffers is decided when the link is allocated,
+	depending on the workload it can be a bottleneck in a sense that threads
+	have to wait for work request buffers to become available. Before the
+	introduction of this control the maximal number of work request buffers
+	available on the send path used to be hard coded to 16. With this control
+	it becomes configurable. The acceptable range is between 2 and 2048.
+
+	Please be aware that all the buffers need to be allocated as a physically
+	contiguous array in which each element is a single buffer and has the size
+	of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much
+	like before having this control.
+
+	Default: 16
+
+smcr_max_recv_wr - INTEGER
+	So-called work request buffers are SMCR link (and RDMA queue pair) level
+	resources necessary for performing RDMA operations. Since up to 255
+	connections can share a link group and thus also a link and the number
+	of the work request buffers is decided when the link is allocated,
+	depending on the workload it can be a bottleneck in a sense that threads
+	have to wait for work request buffers to become available. Before the
+	introduction of this control the maximal number of work request buffers
+	available on the receive path used to be hard coded to 48. With this control
+	it becomes configurable. The acceptable range is between 2 and 2048.
+
+	Please be aware that all the buffers need to be allocated as a physically
+	contiguous array in which each element is a single buffer and has the size
+	of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much
+	like before having this control.
+
+	Default: 48
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index fc752a50f91b..6ceb12baec24 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -24,5 +24,7 @@ struct netns_smc {
 	int				sysctl_rmem;
 	int				sysctl_max_links_per_lgr;
 	int				sysctl_max_conns_per_lgr;
+	unsigned int			sysctl_smcr_max_send_wr;
+	unsigned int			sysctl_smcr_max_recv_wr;
 };
 #endif
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index a5a78cbff341..8d06c8bb14e9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -34,6 +34,8 @@
 					 * distributions may modify it to a value between
 					 * 16-255 as needed.
 					 */
+#define SMCR_MAX_SEND_WR_DEF	16	/* Default number of work requests per send queue */
+#define SMCR_MAX_RECV_WR_DEF	48	/* Default number of work requests per recv queue */
 
 struct smc_lgr_list {			/* list of link group definition */
 	struct list_head	list;
@@ -366,6 +368,10 @@ struct smc_link_group {
 						/* max conn can be assigned to lgr */
 			u8			max_links;
 						/* max links can be added in lgr */
+			u16			max_send_wr;
+						/* number of WR buffers on send */
+			u16			max_recv_wr;
+						/* number of WR buffers on recv */
 		};
 		struct { /* SMC-D */
 			struct smcd_gid		peer_gid;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 0052f02756eb..1154907c5c05 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -669,11 +669,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 		.recv_cq = lnk->smcibdev->roce_cq_recv,
 		.srq = NULL,
 		.cap = {
-				/* include unsolicited rdma_writes as well,
-				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
-				 */
-			.max_send_wr = SMC_WR_BUF_CNT * 3,
-			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
 			.max_recv_sge = lnk->wr_rx_sge_cnt,
 			.max_inline_data = 0,
@@ -683,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 	};
 	int rc;
 
+	/* include unsolicited rdma_writes as well,
+	 * there are max. 2 RDMA_WRITE per 1 WR_SEND
+	 */
+	qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr;
+	qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr;
 	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
 	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
 	if (IS_ERR(lnk->roce_qp))
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index f865c58c3aa7..f5d5eb617526 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -2157,6 +2157,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
 	init_waitqueue_head(&lgr->llc_msg_waiter);
 	init_rwsem(&lgr->llc_conf_mutex);
 	lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time);
+	lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr));
+	lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr));
 }
 
 /* called after lgr was removed from lgr_list */
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index 2fab6456f765..7b2471904d04 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -29,6 +29,8 @@ static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN;
 static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX;
 static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN;
 static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
+static unsigned int smcr_max_wr_min = 2;
+static unsigned int smcr_max_wr_max = 2048;
 
 static struct ctl_table smc_table[] = {
 	{
@@ -99,6 +101,24 @@ static struct ctl_table smc_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "smcr_max_send_wr",
+		.data		= &init_net.smc.sysctl_smcr_max_send_wr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &smcr_max_wr_min,
+		.extra2		= &smcr_max_wr_max,
+	},
+	{
+		.procname	= "smcr_max_recv_wr",
+		.data		= &init_net.smc.sysctl_smcr_max_recv_wr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &smcr_max_wr_min,
+		.extra2		= &smcr_max_wr_max,
+	},
 };
 
 int __net_init smc_sysctl_net_init(struct net *net)
@@ -130,6 +150,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
 	WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
 	net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER;
 	net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER;
+	net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF;
+	net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF;
 	/* disable handshake limitation by default */
 	net->smc.limit_smc_hs = 0;
 
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h
index eb2465ae1e15..8538915af7af 100644
--- a/net/smc/smc_sysctl.h
+++ b/net/smc/smc_sysctl.h
@@ -25,6 +25,8 @@ static inline int smc_sysctl_net_init(struct net *net)
 	net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
 	net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER;
 	net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER;
+	net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF;
+	net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF;
 	return 0;
 }
 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index b04a21b8c511..883fb0f1ce43 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -547,9 +547,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
 		    IB_QP_DEST_QPN,
 		    &init_attr);
 
-	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+	lnk->wr_tx_cnt = min_t(size_t, lnk->lgr->max_send_wr,
 			       lnk->qp_attr.cap.max_send_wr);
-	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+	lnk->wr_rx_cnt = min_t(size_t, lnk->lgr->max_recv_wr,
 			       lnk->qp_attr.cap.max_recv_wr);
 }
 
@@ -741,50 +741,51 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
 int smc_wr_alloc_link_mem(struct smc_link *link)
 {
 	/* allocate link related memory */
-	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+	link->wr_tx_bufs = kcalloc(link->lgr->max_send_wr,
+				   SMC_WR_BUF_SIZE, GFP_KERNEL);
 	if (!link->wr_tx_bufs)
 		goto no_mem;
-	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen,
+	link->wr_rx_bufs = kcalloc(link->lgr->max_recv_wr, link->wr_rx_buflen,
 				   GFP_KERNEL);
 	if (!link->wr_rx_bufs)
 		goto no_mem_wr_tx_bufs;
-	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
-				  GFP_KERNEL);
+	link->wr_tx_ibs = kcalloc(link->lgr->max_send_wr,
+				  sizeof(link->wr_tx_ibs[0]), GFP_KERNEL);
 	if (!link->wr_tx_ibs)
 		goto no_mem_wr_rx_bufs;
-	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+	link->wr_rx_ibs = kcalloc(link->lgr->max_recv_wr,
 				  sizeof(link->wr_rx_ibs[0]),
 				  GFP_KERNEL);
 	if (!link->wr_rx_ibs)
 		goto no_mem_wr_tx_ibs;
-	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
+	link->wr_tx_rdmas = kcalloc(link->lgr->max_send_wr,
 				    sizeof(link->wr_tx_rdmas[0]),
 				    GFP_KERNEL);
 	if (!link->wr_tx_rdmas)
 		goto no_mem_wr_rx_ibs;
-	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
+	link->wr_tx_rdma_sges = kcalloc(link->lgr->max_send_wr,
 					sizeof(link->wr_tx_rdma_sges[0]),
 					GFP_KERNEL);
 	if (!link->wr_tx_rdma_sges)
 		goto no_mem_wr_tx_rdmas;
-	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+	link->wr_tx_sges = kcalloc(link->lgr->max_send_wr, sizeof(link->wr_tx_sges[0]),
 				   GFP_KERNEL);
 	if (!link->wr_tx_sges)
 		goto no_mem_wr_tx_rdma_sges;
-	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+	link->wr_rx_sges = kcalloc(link->lgr->max_recv_wr,
 				   sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt,
 				   GFP_KERNEL);
 	if (!link->wr_rx_sges)
 		goto no_mem_wr_tx_sges;
-	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
+	link->wr_tx_mask = bitmap_zalloc(link->lgr->max_send_wr, GFP_KERNEL);
 	if (!link->wr_tx_mask)
 		goto no_mem_wr_rx_sges;
-	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+	link->wr_tx_pends = kcalloc(link->lgr->max_send_wr,
 				    sizeof(link->wr_tx_pends[0]),
 				    GFP_KERNEL);
 	if (!link->wr_tx_pends)
 		goto no_mem_wr_tx_mask;
-	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
+	link->wr_tx_compl = kcalloc(link->lgr->max_send_wr,
 				    sizeof(link->wr_tx_compl[0]),
 				    GFP_KERNEL);
 	if (!link->wr_tx_compl)
@@ -905,7 +906,7 @@ int smc_wr_create_link(struct smc_link *lnk)
 		goto dma_unmap;
 	}
 	smc_wr_init_sge(lnk);
-	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
+	bitmap_zero(lnk->wr_tx_mask, lnk->lgr->max_send_wr);
 	init_waitqueue_head(&lnk->wr_tx_wait);
 	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
 	if (rc)
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index f3008dda222a..aa4533af9122 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -19,8 +19,6 @@
 #include "smc.h"
 #include "smc_core.h"
 
-#define SMC_WR_BUF_CNT 16	/* # of ctrl buffers per link */
-
 #define SMC_WR_TX_WAIT_FREE_SLOT_TIME	(10 * HZ)
 
 #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
-- 
cgit v1.2.3


From 4061c43a99772c66c378cfacaa71550ab3b35909 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 28 Oct 2025 09:45:48 +0100
Subject: pidfs: add missing PIDFD_INFO_SIZE_VER1

We grew struct pidfd_info not too long ago.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-3-ca449b7b7aa0@kernel.org
Fixes: 1d8db6fd698d ("pidfs, coredump: add PIDFD_INFO_COREDUMP")
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/pidfd.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 957db425d459..6ccbabd9a68d 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -28,6 +28,7 @@
 #define PIDFD_INFO_COREDUMP		(1UL << 4) /* Only returned if requested. */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
+#define PIDFD_INFO_SIZE_VER1		72 /* sizeof second published struct */
 
 /*
  * Values for @coredump_mask in pidfd_info.
-- 
cgit v1.2.3


From dfd78546c95330db2252e0d7e937a15ab5eddb4e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 28 Oct 2025 09:45:50 +0100
Subject: pidfd: add a new supported_mask field

Some of the future fields in struct pidfd_info can be optional. If the
kernel has nothing to emit in that field, then it doesn't set the flag
in the reply. This presents a problem: There is currently no way to know
what mask flags the kernel supports since one can't always count on them
being in the reply.

Add a new PIDFD_INFO_SUPPORTED_MASK flag and field that the kernel can
set in the reply. Userspace can use this to determine if the fields it
requires from the kernel are supported. This also gives us a way to
deprecate fields in the future, if that should become necessary.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-5-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                 | 17 ++++++++++++++++-
 include/uapi/linux/pidfd.h |  3 +++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 7e4d90cc74ff..204ebd32791a 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -293,6 +293,14 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 	return 0;
 }
 
+/* This must be updated whenever a new flag is added */
+#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
+			      PIDFD_INFO_CREDS | \
+			      PIDFD_INFO_CGROUPID | \
+			      PIDFD_INFO_EXIT | \
+			      PIDFD_INFO_COREDUMP | \
+			      PIDFD_INFO_SUPPORTED_MASK)
+
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
@@ -306,7 +314,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	const struct cred *c;
 	__u64 mask;
 
-	BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER1);
+	BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
 
 	if (!uinfo)
 		return -EINVAL;
@@ -412,6 +420,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 		return -ESRCH;
 
 copy_out:
+	if (mask & PIDFD_INFO_SUPPORTED_MASK) {
+		kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
+		kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
+	}
+
+	/* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
+	WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
 	/*
 	 * If userspace and the kernel have the same struct size it can just
 	 * be copied. If userspace provides an older struct, only the bits that
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 6ccbabd9a68d..e05caa0e00fe 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -26,9 +26,11 @@
 #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
 #define PIDFD_INFO_EXIT			(1UL << 3) /* Only returned if requested. */
 #define PIDFD_INFO_COREDUMP		(1UL << 4) /* Only returned if requested. */
+#define PIDFD_INFO_SUPPORTED_MASK	(1UL << 5) /* Want/got supported mask flags */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
 #define PIDFD_INFO_SIZE_VER1		72 /* sizeof second published struct */
+#define PIDFD_INFO_SIZE_VER2		80 /* sizeof third published struct */
 
 /*
  * Values for @coredump_mask in pidfd_info.
@@ -94,6 +96,7 @@ struct pidfd_info {
 	__s32 exit_code;
 	__u32 coredump_mask;
 	__u32 __spare1;
+	__u64 supported_mask;	/* Mask flags that this kernel supports */
 };
 
 #define PIDFS_IOCTL_MAGIC 0xFF
-- 
cgit v1.2.3


From 036375522be8425874e9e0f907c7127e315c7a52 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 28 Oct 2025 09:45:53 +0100
Subject: pidfs: expose coredump signal

Userspace needs access to the signal that caused the coredump before the
coredumping process has been reaped. Expose it as part of the coredump
information in struct pidfd_info. After the process has been reaped that
info is also available as part of PIDFD_INFO_EXIT's exit_code field.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-8-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                 | 30 +++++++++++++++++++-----------
 include/uapi/linux/pidfd.h |  7 +++++--
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index a3b80be3b98b..354ceb2126e7 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -41,6 +41,7 @@ void pidfs_get_root(struct path *path)
 
 enum pidfs_attr_mask_bits {
 	PIDFS_ATTR_BIT_EXIT	= 0,
+	PIDFS_ATTR_BIT_COREDUMP	= 1,
 };
 
 struct pidfs_attr {
@@ -51,6 +52,7 @@ struct pidfs_attr {
 		__s32 exit_code;
 	};
 	__u32 coredump_mask;
+	__u32 coredump_signal;
 };
 
 static struct rb_root pidfs_ino_tree = RB_ROOT;
@@ -297,7 +299,8 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 			      PIDFD_INFO_CGROUPID | \
 			      PIDFD_INFO_EXIT | \
 			      PIDFD_INFO_COREDUMP | \
-			      PIDFD_INFO_SUPPORTED_MASK)
+			      PIDFD_INFO_SUPPORTED_MASK | \
+			      PIDFD_INFO_COREDUMP_SIGNAL)
 
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -342,9 +345,12 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 
 	if (mask & PIDFD_INFO_COREDUMP) {
-		kinfo.coredump_mask = READ_ONCE(attr->coredump_mask);
-		if (kinfo.coredump_mask)
-			kinfo.mask |= PIDFD_INFO_COREDUMP;
+		if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
+			smp_rmb();
+			kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+			kinfo.coredump_mask = attr->coredump_mask;
+			kinfo.coredump_signal = attr->coredump_signal;
+		}
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
@@ -370,6 +376,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 
 			kinfo.coredump_mask = pidfs_coredump_mask(flags);
 			kinfo.mask |= PIDFD_INFO_COREDUMP;
+			/* No coredump actually took place, so no coredump signal. */
 		}
 	}
 
@@ -666,20 +673,21 @@ void pidfs_coredump(const struct coredump_params *cprm)
 {
 	struct pid *pid = cprm->pid;
 	struct pidfs_attr *attr;
-	__u32 coredump_mask = 0;
 
 	attr = READ_ONCE(pid->attr);
 
 	VFS_WARN_ON_ONCE(!attr);
 	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
 
-	/* Note how we were coredumped. */
-	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
-	/* Note that we actually did coredump. */
-	coredump_mask |= PIDFD_COREDUMPED;
+	/* Note how we were coredumped and that we coredumped. */
+	attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
+			      PIDFD_COREDUMPED;
 	/* If coredumping is set to skip we should never end up here. */
-	VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
-	smp_store_release(&attr->coredump_mask, coredump_mask);
+	VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
+	/* Expose the signal number that caused the coredump. */
+	attr->coredump_signal = cprm->siginfo->si_signo;
+	smp_wmb();
+	set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
 }
 #endif
 
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index e05caa0e00fe..ea9a6811fc76 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -27,6 +27,7 @@
 #define PIDFD_INFO_EXIT			(1UL << 3) /* Only returned if requested. */
 #define PIDFD_INFO_COREDUMP		(1UL << 4) /* Only returned if requested. */
 #define PIDFD_INFO_SUPPORTED_MASK	(1UL << 5) /* Want/got supported mask flags */
+#define PIDFD_INFO_COREDUMP_SIGNAL	(1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
 #define PIDFD_INFO_SIZE_VER1		72 /* sizeof second published struct */
@@ -94,8 +95,10 @@ struct pidfd_info {
 	__u32 fsuid;
 	__u32 fsgid;
 	__s32 exit_code;
-	__u32 coredump_mask;
-	__u32 __spare1;
+	struct /* coredump info */ {
+		__u32 coredump_mask;
+		__u32 coredump_signal;
+	};
 	__u64 supported_mask;	/* Mask flags that this kernel supports */
 };
 
-- 
cgit v1.2.3


From 3c0c81de525d2a2718e23754a5795483167904ac Mon Sep 17 00:00:00 2001
From: Markus Theil <theil.markus@gmail.com>
Date: Tue, 11 Feb 2025 07:33:32 +0100
Subject: prandom: remove next_pseudo_random32

next_pseudo_random32 implements a LCG with known bad statistical
properties and was only used in two pieces of testing code.

With no remaining users now, remove it.

Signed-off-by: Markus Theil <theil.markus@gmail.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/prandom.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index f2ed5b72b3d6..ff7dcc3fa105 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -47,10 +47,4 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed)
 	state->s4 = __seed(i, 128U);
 }
 
-/* Pseudo random number generator from numerical recipes. */
-static inline u32 next_pseudo_random32(u32 seed)
-{
-	return seed * 1664525 + 1013904223;
-}
-
 #endif
-- 
cgit v1.2.3


From 8e4ec90701efec7f2814c89b398d6d4272636814 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 20 Oct 2025 07:55:55 -1000
Subject: freezer: Clarify that only cgroup1 freezer uses PM freezer

cgroup1 freezer piggybacks on the PM freezer, which inadvertently allowed
userspace to produce uninterruptible tasks at will. To avoid the issue,
cgroup2 freezer switched to a separate job control based mechanism. While
this happened a long time ago, the code and comment haven't been updated
making it confusing to people who aren't familiar with the history.

Rename cgroup_freezing() to cgroup1_freezing() and update comments on top of
freezing() and frozen() to clarify that cgroup2 freezer isn't covered by the
PM freezer mechanism.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Qu Wenruo <wqu@suse.com>
Link: https://patch.msgid.link/aPZ3q6Hm865NicBC@slm.duckdns.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h        | 12 ++++++++----
 kernel/cgroup/legacy_freezer.c |  2 +-
 kernel/freezer.c               |  2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 32884c9721e5..0a8c6c4d1a82 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -22,14 +22,18 @@ extern bool pm_nosig_freezing;		/* PM nosig freezing in effect */
 extern unsigned int freeze_timeout_msecs;
 
 /*
- * Check if a process has been frozen
+ * Check if a process has been frozen for PM or cgroup1 freezer. Note that
+ * cgroup2 freezer uses the job control mechanism and does not interact with
+ * the PM freezer.
  */
 extern bool frozen(struct task_struct *p);
 
 extern bool freezing_slow_path(struct task_struct *p);
 
 /*
- * Check if there is a request to freeze a process
+ * Check if there is a request to freeze a task from PM or cgroup1 freezer.
+ * Note that cgroup2 freezer uses the job control mechanism and does not
+ * interact with the PM freezer.
  */
 static inline bool freezing(struct task_struct *p)
 {
@@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p);
 extern bool set_freezable(void);
 
 #ifdef CONFIG_CGROUP_FREEZER
-extern bool cgroup_freezing(struct task_struct *task);
+extern bool cgroup1_freezing(struct task_struct *task);
 #else /* !CONFIG_CGROUP_FREEZER */
-static inline bool cgroup_freezing(struct task_struct *task)
+static inline bool cgroup1_freezing(struct task_struct *task)
 {
 	return false;
 }
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index dd9417425d92..915b02f65980 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer)
 	return css_freezer(freezer->css.parent);
 }
 
-bool cgroup_freezing(struct task_struct *task)
+bool cgroup1_freezing(struct task_struct *task)
 {
 	bool ret;
 
diff --git a/kernel/freezer.c b/kernel/freezer.c
index ddc11a8bd2ea..a76bf957fb32 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p)
 	if (tsk_is_oom_victim(p))
 		return false;
 
-	if (pm_nosig_freezing || cgroup_freezing(p))
+	if (pm_nosig_freezing || cgroup1_freezing(p))
 		return true;
 
 	if (pm_freezing && !(p->flags & PF_KTHREAD))
-- 
cgit v1.2.3


From 11e15a6f3287711e637e208df7089c710cef82b5 Mon Sep 17 00:00:00 2001
From: Raviteja Laggyshetty <quic_rlaggysh@quicinc.com>
Date: Fri, 26 Sep 2025 12:12:10 +0530
Subject: dt-bindings: interconnect: qcom: Drop QPIC_CORE IDs

As on other SDX targets, the SDX75 QPIC BCM resource is modeled as an
RPMh clock in the clk-rpmh driver. However, for SDX75, this resource was
also mistakenly described as an interconnect node.

Hence, drop the QPIC interconnect IDs and let the clients use clk-rpmh
driver to vote for this resource.

Even though this change is an ABI break, it is necessary to avoid
describing the same resource provider in two different drivers, as it may
lead to votes from clients overriding each other.

Fixes: 956329ec7c5e ("dt-bindings: interconnect: Add compatibles for SDX75")
Signed-off-by: Raviteja Laggyshetty <quic_rlaggysh@quicinc.com>
[mani: kept the QUP defines value unchanged]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@oss.qualcomm.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250926-sdx75-icc-v2-2-20d6820e455c@oss.qualcomm.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 include/dt-bindings/interconnect/qcom,sdx75.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/interconnect/qcom,sdx75.h b/include/dt-bindings/interconnect/qcom,sdx75.h
index e903f5f3dd8f..0e19ee8f1687 100644
--- a/include/dt-bindings/interconnect/qcom,sdx75.h
+++ b/include/dt-bindings/interconnect/qcom,sdx75.h
@@ -6,9 +6,7 @@
 #ifndef __DT_BINDINGS_INTERCONNECT_QCOM_SDX75_H
 #define __DT_BINDINGS_INTERCONNECT_QCOM_SDX75_H
 
-#define MASTER_QPIC_CORE		0
 #define MASTER_QUP_CORE_0		1
-#define SLAVE_QPIC_CORE			2
 #define SLAVE_QUP_CORE_0		3
 
 #define MASTER_LLCC			0
-- 
cgit v1.2.3


From c9822fad8038870bb690543539c8e9ad5213b12f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:14 +0100
Subject: libfs: allow to specify s_d_flags

Make it possible for pseudo filesystems to specify default dentry flags.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-1-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c                | 1 +
 include/linux/pseudo_fs.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..4bb4d8a313e7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_export_op = ctx->eops;
 	s->s_xattr = ctx->xattr;
 	s->s_time_gran = 1;
+	s->s_d_flags |= ctx->s_d_flags;
 	root = new_inode(s);
 	if (!root)
 		return -ENOMEM;
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 2503f7625d65..a651e60d9410 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -9,6 +9,7 @@ struct pseudo_fs_context {
 	const struct xattr_handler * const *xattr;
 	const struct dentry_operations *dops;
 	unsigned long magic;
+	unsigned int s_d_flags;
 };
 
 struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
-- 
cgit v1.2.3


From 85e1a7ec61d9829af5897da421eb135c6cc73e07 Mon Sep 17 00:00:00 2001
From: T Pratham <t-pratham@ti.com>
Date: Wed, 22 Oct 2025 22:48:42 +0530
Subject: crypto: aead - Add support for on-stack AEAD req allocation

This patch introduces infrastructure for allocating req objects on the
stack for AEADs. The additions mirror the existing sync skcipher APIs.
This can be used in cases where simple sync AEAD operations are being
done. Allocating the request on the stack avoids possible out-of-memory
errors.

The struct crypto_sync_aead is a wrapper around crypto_aead and should
be used in its place when sync only requests will be done on the stack.
Correspondingly, the request should be allocated with
SYNC_AEAD_REQUEST_ON_STACK().

Similar to sync_skcipher APIs, the new sync_aead APIs are wrappers
around the regular aead APIs to facilitate sync only operations. The
following crypto APIs are added:
 - struct crypto_sync_aead
 - crypto_alloc_sync_aead()
 - crypto_free_sync_aead()
 - crypto_sync_aead_tfm()
 - crypto_sync_aead_setkey()
 - crypto_sync_aead_setauthsize()
 - crypto_sync_aead_authsize()
 - crypto_sync_aead_maxauthsize()
 - crypto_sync_aead_ivsize()
 - crypto_sync_aead_blocksize()
 - crypto_sync_aead_get_flags()
 - crypto_sync_aead_set_flags()
 - crypto_sync_aead_clear_flags()
 - crypto_sync_aead_reqtfm()
 - aead_request_set_sync_tfm()
 - SYNC_AEAD_REQUEST_ON_STACK()

Signed-off-by: T Pratham <t-pratham@ti.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/aead.c         | 19 +++++++++++
 include/crypto/aead.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

(limited to 'include')

diff --git a/crypto/aead.c b/crypto/aead.c
index 51ab3af691af..08d44c5e5c33 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -205,6 +205,25 @@ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_aead);
 
+struct crypto_sync_aead *crypto_alloc_sync_aead(const char *alg_name, u32 type, u32 mask)
+{
+	struct crypto_aead *tfm;
+
+	/* Only sync algorithms are allowed. */
+	mask |= CRYPTO_ALG_ASYNC;
+	type &= ~(CRYPTO_ALG_ASYNC);
+
+	tfm = crypto_alloc_tfm(alg_name, &crypto_aead_type, type, mask);
+
+	if (!IS_ERR(tfm) && WARN_ON(crypto_aead_reqsize(tfm) > MAX_SYNC_AEAD_REQSIZE)) {
+		crypto_free_aead(tfm);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return (struct crypto_sync_aead *)tfm;
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_sync_aead);
+
 int crypto_has_aead(const char *alg_name, u32 type, u32 mask)
 {
 	return crypto_type_has_alg(alg_name, &crypto_aead_type, type, mask);
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 0e8a41638678..8e66a1fa9c78 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -159,6 +159,21 @@ struct crypto_aead {
 	struct crypto_tfm base;
 };
 
+struct crypto_sync_aead {
+	struct crypto_aead base;
+};
+
+#define MAX_SYNC_AEAD_REQSIZE		384
+
+#define SYNC_AEAD_REQUEST_ON_STACK(name, _tfm)		\
+	char __##name##_desc[sizeof(struct aead_request) +	\
+			     MAX_SYNC_AEAD_REQSIZE		\
+			    ] CRYPTO_MINALIGN_ATTR;		\
+	struct aead_request *name =				\
+		(((struct aead_request *)__##name##_desc)->base.tfm = \
+			crypto_sync_aead_tfm((_tfm)),		\
+		 (void *)__##name##_desc)
+
 static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
 {
 	return container_of(tfm, struct crypto_aead, base);
@@ -180,11 +195,18 @@ static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
  */
 struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask);
 
+struct crypto_sync_aead *crypto_alloc_sync_aead(const char *alg_name, u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm)
 {
 	return &tfm->base;
 }
 
+static inline struct crypto_tfm *crypto_sync_aead_tfm(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_tfm(&tfm->base);
+}
+
 /**
  * crypto_free_aead() - zeroize and free aead handle
  * @tfm: cipher handle to be freed
@@ -196,6 +218,11 @@ static inline void crypto_free_aead(struct crypto_aead *tfm)
 	crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm));
 }
 
+static inline void crypto_free_sync_aead(struct crypto_sync_aead *tfm)
+{
+	crypto_free_aead(&tfm->base);
+}
+
 /**
  * crypto_has_aead() - Search for the availability of an aead.
  * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
@@ -238,6 +265,11 @@ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm)
 	return crypto_aead_alg_ivsize(crypto_aead_alg(tfm));
 }
 
+static inline unsigned int crypto_sync_aead_ivsize(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_ivsize(&tfm->base);
+}
+
 /**
  * crypto_aead_authsize() - obtain maximum authentication data size
  * @tfm: cipher handle
@@ -255,6 +287,11 @@ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm)
 	return tfm->authsize;
 }
 
+static inline unsigned int crypto_sync_aead_authsize(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_authsize(&tfm->base);
+}
+
 static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg)
 {
 	return alg->maxauthsize;
@@ -265,6 +302,11 @@ static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead)
 	return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead));
 }
 
+static inline unsigned int crypto_sync_aead_maxauthsize(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_maxauthsize(&tfm->base);
+}
+
 /**
  * crypto_aead_blocksize() - obtain block size of cipher
  * @tfm: cipher handle
@@ -280,6 +322,11 @@ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm)
 	return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm));
 }
 
+static inline unsigned int crypto_sync_aead_blocksize(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_blocksize(&tfm->base);
+}
+
 static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm)
 {
 	return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm));
@@ -300,6 +347,21 @@ static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
 	crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags);
 }
 
+static inline u32 crypto_sync_aead_get_flags(struct crypto_sync_aead *tfm)
+{
+	return crypto_aead_get_flags(&tfm->base);
+}
+
+static inline void crypto_sync_aead_set_flags(struct crypto_sync_aead *tfm, u32 flags)
+{
+	crypto_aead_set_flags(&tfm->base, flags);
+}
+
+static inline void crypto_sync_aead_clear_flags(struct crypto_sync_aead *tfm, u32 flags)
+{
+	crypto_aead_clear_flags(&tfm->base, flags);
+}
+
 /**
  * crypto_aead_setkey() - set key for cipher
  * @tfm: cipher handle
@@ -319,6 +381,12 @@ static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
 int crypto_aead_setkey(struct crypto_aead *tfm,
 		       const u8 *key, unsigned int keylen);
 
+static inline int crypto_sync_aead_setkey(struct crypto_sync_aead *tfm,
+					 const u8 *key, unsigned int keylen)
+{
+	return crypto_aead_setkey(&tfm->base, key, keylen);
+}
+
 /**
  * crypto_aead_setauthsize() - set authentication data size
  * @tfm: cipher handle
@@ -331,11 +399,24 @@ int crypto_aead_setkey(struct crypto_aead *tfm,
  */
 int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize);
 
+static inline int crypto_sync_aead_setauthsize(struct crypto_sync_aead *tfm,
+					       unsigned int authsize)
+{
+	return crypto_aead_setauthsize(&tfm->base, authsize);
+}
+
 static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
 {
 	return __crypto_aead_cast(req->base.tfm);
 }
 
+static inline struct crypto_sync_aead *crypto_sync_aead_reqtfm(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+
+	return container_of(tfm, struct crypto_sync_aead, base);
+}
+
 /**
  * crypto_aead_encrypt() - encrypt plaintext
  * @req: reference to the aead_request handle that holds all information
@@ -417,6 +498,12 @@ static inline void aead_request_set_tfm(struct aead_request *req,
 	req->base.tfm = crypto_aead_tfm(tfm);
 }
 
+static inline void aead_request_set_sync_tfm(struct aead_request *req,
+					     struct crypto_sync_aead *tfm)
+{
+	aead_request_set_tfm(req, &tfm->base);
+}
+
 /**
  * aead_request_alloc() - allocate request data structure
  * @tfm: cipher handle to be registered with the request
-- 
cgit v1.2.3


From 12ad5b2346f905a3962b4aee701191b7a8d1905a Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Thu, 23 Oct 2025 19:48:11 +0200
Subject: keys: Annotate struct asymmetric_key_id with __counted_by

Add the __counted_by() compiler attribute to the flexible array member
'data' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/keys/asymmetric-type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h
index 69a13e1e5b2e..1b91c8f98688 100644
--- a/include/keys/asymmetric-type.h
+++ b/include/keys/asymmetric-type.h
@@ -49,7 +49,7 @@ enum asymmetric_payload_bits {
  */
 struct asymmetric_key_id {
 	unsigned short	len;
-	unsigned char	data[];
+	unsigned char	data[] __counted_by(len);
 };
 
 struct asymmetric_key_ids {
-- 
cgit v1.2.3


From 6568f14cb5ae68cd6c612604ca0c89301cf3a0d0 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Thu, 30 Oct 2025 18:01:54 -0700
Subject: vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN

An ftrace warning was reported in ftrace_init_ool_stub():

   WARNING: arch/powerpc/kernel/trace/ftrace.c:234 at ftrace_init_ool_stub+0x188/0x3f4, CPU#0: swapper/0

The problem is that the linker script is placing .text.startup in .text
rather than in .init.text, due to an inadvertent match of the TEXT_MAIN
'.text.[0-9a-zA-Z_]*' pattern.

This bug existed for some configurations before, but is only now coming
to light due to the TEXT_MAIN macro unification in commit 1ba9f8979426
("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros").

The .text.startup section consists of constructors which are used by
KASAN, KCSAN, and GCOV.  The constructors are only called during boot,
so .text.startup is supposed to match the INIT_TEXT pattern so it can be
placed in .init.text and freed after init.  But since INIT_TEXT comes
*after* TEXT_MAIN in the linker script, TEXT_MAIN needs to manually
exclude .text.startup.

Update TEXT_MAIN to exclude .text.startup (and its .text.startup.*
variant from -ffunction-sections), along with .text.exit and
.text.exit.* which should match EXIT_TEXT.

Specifically, use a series of more specific glob patterns to match
generic .text.* sections (for -ffunction-sections) while explicitly
excluding .text.startup[.*] and .text.exit[.*].

Also update INIT_TEXT and EXIT_TEXT to explicitly match their
-ffunction-sections variants (.text.startup.* and .text.exit.*).

Fixes: 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros")
Closes: https://lore.kernel.org/72469502-ca37-4287-90b9-a751cecc498c@linux.ibm.com
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Debugged-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Link: https://patch.msgid.link/07f74b4e5c43872572b7def30f2eac45f28675d9.1761872421.git.jpoimboe@kernel.org
---
 include/asm-generic/vmlinux.lds.h | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5facbc994634..9de1d900fa15 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -88,13 +88,29 @@
 
 /*
  * Support -ffunction-sections by matching .text and .text.*,
- * but exclude '.text..*'.
+ * but exclude '.text..*', .text.startup[.*], and .text.exit[.*].
  *
- * Special .text.* sections that are typically grouped separately, such as
+ * .text.startup and .text.startup.* are matched later by INIT_TEXT.
+ * .text.exit and .text.exit.* are matched later by EXIT_TEXT.
+ *
+ * Other .text.* sections that are typically grouped separately, such as
  * .text.unlikely or .text.hot, must be matched explicitly before using
  * TEXT_MAIN.
  */
-#define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
+#define TEXT_MAIN							\
+	.text								\
+	.text.[_0-9A-Za-df-rt-z]*					\
+	.text.s[_0-9A-Za-su-z]*						\
+	.text.st[_0-9A-Zb-z]*						\
+	.text.sta[_0-9A-Za-qs-z]*					\
+	.text.star[_0-9A-Za-su-z]*					\
+	.text.start[_0-9A-Za-tv-z]*					\
+	.text.startu[_0-9A-Za-oq-z]*					\
+	.text.startup[_0-9A-Za-z]*					\
+	.text.e[_0-9A-Za-wy-z]*						\
+	.text.ex[_0-9A-Za-hj-z]*					\
+	.text.exi[_0-9A-Za-su-z]*					\
+	.text.exit[_0-9A-Za-z]*
 
 /*
  * Support -fdata-sections by matching .data, .data.*, and others,
@@ -713,16 +729,16 @@
 
 #define INIT_TEXT							\
 	*(.init.text .init.text.*)					\
-	*(.text.startup)
+	*(.text.startup .text.startup.*)
 
 #define EXIT_DATA							\
 	*(.exit.data .exit.data.*)					\
 	*(.fini_array .fini_array.*)					\
-	*(.dtors .dtors.*)						\
+	*(.dtors .dtors.*)
 
 #define EXIT_TEXT							\
 	*(.exit.text)							\
-	*(.text.exit)							\
+	*(.text.exit .text.exit.*)
 
 #define EXIT_CALL							\
 	*(.exitcall.exit)
-- 
cgit v1.2.3


From 4511fd86db6f8f94f8aff01044f5c69aa38f81f4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 24 Oct 2025 18:08:09 +0100
Subject: filemap: Add folio_next_pos()

Replace the open-coded implementation in ocfs2 (which loses the top
32 bits on 32-bit architectures) with a helper in pagemap.h.

Fixes: 35edec1d52c0 (ocfs2: update truncate handling of partial clusters)
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-2-willy@infradead.org
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: ocfs2-devel@lists.linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ocfs2/alloc.c        |  2 +-
 include/linux/pagemap.h | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 162711cc5b20..b267ec580da9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
 		ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
 				&phys);
 
-		start = folio_next_index(folio) << PAGE_SHIFT;
+		start = folio_next_pos(folio);
 	}
 out:
 	if (folios)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..e16576e3763a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -941,6 +941,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio)
 	return folio->index + folio_nr_pages(folio);
 }
 
+/**
+ * folio_next_pos - Get the file position of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The position of the folio which follows this folio in the file.
+ */
+static inline loff_t folio_next_pos(const struct folio *folio)
+{
+	return (loff_t)folio_next_index(folio) << PAGE_SHIFT;
+}
+
 /**
  * folio_file_page - The page for a particular index.
  * @folio: The folio which contains this index.
-- 
cgit v1.2.3


From 4f6b0435c613fdb76d85bb4aae009309a8ce8784 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 29 Oct 2025 23:16:18 +0000
Subject: can: convert generic HW timestamp ioctl to ndo_hwtstamp callbacks

CAN has a generic implementation of ndo_eth_ioctl which implements only
the HW timestamping commands. Implement generic ndo_hwtstamp callbacks
and use them in drivers instead of the generic ioctl interface.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251029231620.1135640-2-vadim.fedorenko@linux.dev
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c                          | 45 +++++++++++-----------
 drivers/net/can/esd/esd_402_pci-core.c             |  3 +-
 drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c |  3 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c     |  3 +-
 drivers/net/can/usb/etas_es58x/es58x_core.c        |  3 +-
 drivers/net/can/usb/gs_usb.c                       | 20 ++++++++--
 drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c   |  3 +-
 include/linux/can/dev.h                            |  6 ++-
 8 files changed, 54 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 0cc3d008adb3..80e1ab18de87 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -379,34 +379,33 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode)
 }
 EXPORT_SYMBOL_GPL(can_set_static_ctrlmode);
 
-/* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices
+/* generic implementation of netdev_ops::ndo_hwtstamp_get for CAN devices
  * supporting hardware timestamps
  */
-int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd)
+int can_hwtstamp_get(struct net_device *netdev,
+		     struct kernel_hwtstamp_config *cfg)
 {
-	struct hwtstamp_config hwts_cfg = { 0 };
-
-	switch (cmd) {
-	case SIOCSHWTSTAMP: /* set */
-		if (copy_from_user(&hwts_cfg, ifr->ifr_data, sizeof(hwts_cfg)))
-			return -EFAULT;
-		if (hwts_cfg.tx_type == HWTSTAMP_TX_ON &&
-		    hwts_cfg.rx_filter == HWTSTAMP_FILTER_ALL)
-			return 0;
-		return -ERANGE;
-
-	case SIOCGHWTSTAMP: /* get */
-		hwts_cfg.tx_type = HWTSTAMP_TX_ON;
-		hwts_cfg.rx_filter = HWTSTAMP_FILTER_ALL;
-		if (copy_to_user(ifr->ifr_data, &hwts_cfg, sizeof(hwts_cfg)))
-			return -EFAULT;
-		return 0;
+	cfg->tx_type = HWTSTAMP_TX_ON;
+	cfg->rx_filter = HWTSTAMP_FILTER_ALL;
 
-	default:
-		return -EOPNOTSUPP;
-	}
+	return 0;
+}
+EXPORT_SYMBOL(can_hwtstamp_get);
+
+/* generic implementation of netdev_ops::ndo_hwtstamp_set for CAN devices
+ * supporting hardware timestamps
+ */
+int can_hwtstamp_set(struct net_device *netdev,
+		     struct kernel_hwtstamp_config *cfg,
+		     struct netlink_ext_ack *extack)
+{
+	if (cfg->tx_type == HWTSTAMP_TX_ON &&
+	    cfg->rx_filter == HWTSTAMP_FILTER_ALL)
+		return 0;
+	NL_SET_ERR_MSG_MOD(extack, "Only TX on and RX all packets filter supported");
+	return -ERANGE;
 }
-EXPORT_SYMBOL(can_eth_ioctl_hwts);
+EXPORT_SYMBOL(can_hwtstamp_set);
 
 /* generic implementation of ethtool_ops::get_ts_info for CAN devices
  * supporting hardware timestamps
diff --git a/drivers/net/can/esd/esd_402_pci-core.c b/drivers/net/can/esd/esd_402_pci-core.c
index 05adecae6375..c826f00c551b 100644
--- a/drivers/net/can/esd/esd_402_pci-core.c
+++ b/drivers/net/can/esd/esd_402_pci-core.c
@@ -86,7 +86,8 @@ static const struct net_device_ops pci402_acc_netdev_ops = {
 	.ndo_open = acc_open,
 	.ndo_stop = acc_close,
 	.ndo_start_xmit = acc_start_xmit,
-	.ndo_eth_ioctl = can_eth_ioctl_hwts,
+	.ndo_hwtstamp_get = can_hwtstamp_get,
+	.ndo_hwtstamp_set = can_hwtstamp_set,
 };
 
 static const struct ethtool_ops pci402_acc_ethtool_ops = {
diff --git a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
index 705f9bb74cd2..d8c9bfb20230 100644
--- a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
+++ b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c
@@ -902,8 +902,9 @@ static void kvaser_pciefd_bec_poll_timer(struct timer_list *data)
 static const struct net_device_ops kvaser_pciefd_netdev_ops = {
 	.ndo_open = kvaser_pciefd_open,
 	.ndo_stop = kvaser_pciefd_stop,
-	.ndo_eth_ioctl = can_eth_ioctl_hwts,
 	.ndo_start_xmit = kvaser_pciefd_start_xmit,
+	.ndo_hwtstamp_get = can_hwtstamp_get,
+	.ndo_hwtstamp_set = can_hwtstamp_set,
 };
 
 static int kvaser_pciefd_set_phys_id(struct net_device *netdev,
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 9402530ba3d4..c0f9d9fed02e 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -1714,7 +1714,8 @@ static const struct net_device_ops mcp251xfd_netdev_ops = {
 	.ndo_open = mcp251xfd_open,
 	.ndo_stop = mcp251xfd_stop,
 	.ndo_start_xmit	= mcp251xfd_start_xmit,
-	.ndo_eth_ioctl = can_eth_ioctl_hwts,
+	.ndo_hwtstamp_get = can_hwtstamp_get,
+	.ndo_hwtstamp_set = can_hwtstamp_set,
 };
 
 static void
diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c
index 47d9e03f3044..f799233c2b72 100644
--- a/drivers/net/can/usb/etas_es58x/es58x_core.c
+++ b/drivers/net/can/usb/etas_es58x/es58x_core.c
@@ -1976,7 +1976,8 @@ static const struct net_device_ops es58x_netdev_ops = {
 	.ndo_open = es58x_open,
 	.ndo_stop = es58x_stop,
 	.ndo_start_xmit = es58x_start_xmit,
-	.ndo_eth_ioctl = can_eth_ioctl_hwts,
+	.ndo_hwtstamp_get = can_hwtstamp_get,
+	.ndo_hwtstamp_set = can_hwtstamp_set,
 };
 
 static const struct ethtool_ops es58x_ethtool_ops = {
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 30608901a974..1321eb5e89ae 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -1087,12 +1087,25 @@ static int gs_can_close(struct net_device *netdev)
 	return 0;
 }
 
-static int gs_can_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+static int gs_can_hwtstamp_get(struct net_device *netdev,
+			       struct kernel_hwtstamp_config *cfg)
 {
 	const struct gs_can *dev = netdev_priv(netdev);
 
 	if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
-		return can_eth_ioctl_hwts(netdev, ifr, cmd);
+		return can_hwtstamp_get(netdev, cfg);
+
+	return -EOPNOTSUPP;
+}
+
+static int gs_can_hwtstamp_set(struct net_device *netdev,
+			       struct kernel_hwtstamp_config *cfg,
+			       struct netlink_ext_ack *extack)
+{
+	const struct gs_can *dev = netdev_priv(netdev);
+
+	if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
+		return can_hwtstamp_set(netdev, cfg, extack);
 
 	return -EOPNOTSUPP;
 }
@@ -1101,7 +1114,8 @@ static const struct net_device_ops gs_usb_netdev_ops = {
 	.ndo_open = gs_can_open,
 	.ndo_stop = gs_can_close,
 	.ndo_start_xmit = gs_can_start_xmit,
-	.ndo_eth_ioctl = gs_can_eth_ioctl,
+	.ndo_hwtstamp_get = gs_can_hwtstamp_get,
+	.ndo_hwtstamp_set = gs_can_hwtstamp_set,
 };
 
 static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
index 89e22b66f919..62701ec34272 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
@@ -784,8 +784,9 @@ static int kvaser_usb_set_phys_id(struct net_device *netdev,
 static const struct net_device_ops kvaser_usb_netdev_ops = {
 	.ndo_open = kvaser_usb_open,
 	.ndo_stop = kvaser_usb_close,
-	.ndo_eth_ioctl = can_eth_ioctl_hwts,
 	.ndo_start_xmit = kvaser_usb_start_xmit,
+	.ndo_hwtstamp_get = can_hwtstamp_get,
+	.ndo_hwtstamp_set = can_hwtstamp_set,
 };
 
 static const struct ethtool_ops kvaser_usb_ethtool_ops = {
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 0fe8f80f223e..bd7410b5d8a6 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -129,7 +129,11 @@ void close_candev(struct net_device *dev);
 void can_set_default_mtu(struct net_device *dev);
 int __must_check can_set_static_ctrlmode(struct net_device *dev,
 					 u32 static_mode);
-int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd);
+int can_hwtstamp_get(struct net_device *netdev,
+		     struct kernel_hwtstamp_config *cfg);
+int can_hwtstamp_set(struct net_device *netdev,
+		     struct kernel_hwtstamp_config *cfg,
+		     struct netlink_ext_ack *extack);
 int can_ethtool_op_get_ts_info_hwts(struct net_device *dev,
 				    struct kernel_ethtool_ts_info *info);
 
-- 
cgit v1.2.3


From 7463f5ad36d8073a0e740433faf97f030d226398 Mon Sep 17 00:00:00 2001
From: Raviteja Laggyshetty <raviteja.laggyshetty@oss.qualcomm.com>
Date: Fri, 31 Oct 2025 03:38:47 +0000
Subject: dt-bindings: interconnect: document the RPMh Network-On-Chip
 interconnect in Kaanapali SoC

Document the RPMh Network-On-Chip Interconnect of the Kaanapali platform.

Co-developed-by: Odelu Kukatla <odelu.kukatla@oss.qualcomm.com>
Signed-off-by: Odelu Kukatla <odelu.kukatla@oss.qualcomm.com>
Signed-off-by: Raviteja Laggyshetty <raviteja.laggyshetty@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20251031-knp-interconnect-v4-1-568bba2cb3e5@oss.qualcomm.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 .../bindings/interconnect/qcom,kaanapali-rpmh.yaml | 124 +++++++++++++++++
 .../dt-bindings/interconnect/qcom,kaanapali-rpmh.h | 149 +++++++++++++++++++++
 2 files changed, 273 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/interconnect/qcom,kaanapali-rpmh.yaml
 create mode 100644 include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/interconnect/qcom,kaanapali-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,kaanapali-rpmh.yaml
new file mode 100644
index 000000000000..2c3b2fd81a74
--- /dev/null
+++ b/Documentation/devicetree/bindings/interconnect/qcom,kaanapali-rpmh.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interconnect/qcom,kaanapali-rpmh.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm RPMh Network-On-Chip Interconnect on Kaanapali
+
+maintainers:
+  - Raviteja Laggyshetty <raviteja.laggyshetty@oss.qualcomm.com>
+
+description: |
+  RPMh interconnect providers support system bandwidth requirements through
+  RPMh hardware accelerators known as Bus Clock Manager (BCM). The provider is
+  able to communicate with the BCM through the Resource State Coordinator (RSC)
+  associated with each execution environment. Provider nodes must point to at
+  least one RPMh device child node pertaining to their RSC and each provider
+  can map to multiple RPMh resources.
+
+  See also: include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,kaanapali-aggre-noc
+      - qcom,kaanapali-clk-virt
+      - qcom,kaanapali-cnoc-main
+      - qcom,kaanapali-cnoc-cfg
+      - qcom,kaanapali-gem-noc
+      - qcom,kaanapali-lpass-ag-noc
+      - qcom,kaanapali-lpass-lpiaon-noc
+      - qcom,kaanapali-lpass-lpicx-noc
+      - qcom,kaanapali-mc-virt
+      - qcom,kaanapali-mmss-noc
+      - qcom,kaanapali-nsp-noc
+      - qcom,kaanapali-pcie-anoc
+      - qcom,kaanapali-system-noc
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    minItems: 2
+    maxItems: 3
+
+required:
+  - compatible
+
+allOf:
+  - $ref: qcom,rpmh-common.yaml#
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,kaanapali-clk-virt
+              - qcom,kaanapali-mc-virt
+    then:
+      properties:
+        reg: false
+    else:
+      required:
+        - reg
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,kaanapali-pcie-anoc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: aggre-NOC PCIe AXI clock
+            - description: cfg-NOC PCIe a-NOC AHB clock
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,kaanapali-aggre-noc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: aggre UFS PHY AXI clock
+            - description: aggre USB3 PRIM AXI clock
+            - description: RPMH CC IPA clock
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,kaanapali-aggre-noc
+              - qcom,kaanapali-pcie-anoc
+    then:
+      required:
+        - clocks
+    else:
+      properties:
+        clocks: false
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    clk_virt: interconnect-0 {
+      compatible = "qcom,kaanapali-clk-virt";
+      #interconnect-cells = <2>;
+      qcom,bcm-voters = <&apps_bcm_voter>;
+    };
+
+    aggre_noc: interconnect@16e0000 {
+      compatible = "qcom,kaanapali-aggre-noc";
+      reg = <0x016e0000 0x42400>;
+      #interconnect-cells = <2>;
+      clocks = <&gcc_aggre_ufs_phy_axi_clk>,
+               <&gcc_aggre_usb3_prim_axi_clk>,
+               <&rpmhcc_ipa_clk>;
+      qcom,bcm-voters = <&apps_bcm_voter>;
+    };
diff --git a/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h b/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h
new file mode 100644
index 000000000000..dde3f9abd677
--- /dev/null
+++ b/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_KAANAPALI_H
+#define __DT_BINDINGS_INTERCONNECT_QCOM_KAANAPALI_H
+
+#define MASTER_QSPI_0				0
+#define MASTER_CRYPTO				1
+#define MASTER_QUP_1				2
+#define MASTER_SDCC_4				3
+#define MASTER_UFS_MEM				4
+#define MASTER_USB3				5
+#define MASTER_QUP_2				6
+#define MASTER_QUP_3				7
+#define MASTER_QUP_4				8
+#define MASTER_IPA				9
+#define MASTER_SOCCP_PROC			10
+#define MASTER_SP				11
+#define MASTER_QDSS_ETR				12
+#define MASTER_QDSS_ETR_1			13
+#define MASTER_SDCC_2				14
+#define SLAVE_A1NOC_SNOC			15
+#define SLAVE_A2NOC_SNOC			16
+
+#define MASTER_QUP_CORE_0			0
+#define MASTER_QUP_CORE_1			1
+#define MASTER_QUP_CORE_2			2
+#define MASTER_QUP_CORE_3			3
+#define MASTER_QUP_CORE_4			4
+#define SLAVE_QUP_CORE_0			5
+#define SLAVE_QUP_CORE_1			6
+#define SLAVE_QUP_CORE_2			7
+#define SLAVE_QUP_CORE_3			8
+#define SLAVE_QUP_CORE_4			9
+
+#define MASTER_CNOC_CFG				0
+#define SLAVE_AHB2PHY_SOUTH			1
+#define SLAVE_AHB2PHY_NORTH			2
+#define SLAVE_CAMERA_CFG			3
+#define SLAVE_CLK_CTL				4
+#define SLAVE_CRYPTO_0_CFG			5
+#define SLAVE_DISPLAY_CFG			6
+#define SLAVE_EVA_CFG				7
+#define SLAVE_GFX3D_CFG				8
+#define SLAVE_I2C				9
+#define SLAVE_I3C_IBI0_CFG			10
+#define SLAVE_I3C_IBI1_CFG			11
+#define SLAVE_IMEM_CFG				12
+#define SLAVE_IPC_ROUTER_CFG			13
+#define SLAVE_CNOC_MSS				14
+#define SLAVE_PCIE_CFG				15
+#define SLAVE_PRNG				16
+#define SLAVE_QDSS_CFG				17
+#define SLAVE_QSPI_0				18
+#define SLAVE_QUP_1				19
+#define SLAVE_QUP_2				20
+#define SLAVE_QUP_3				21
+#define SLAVE_QUP_4				22
+#define SLAVE_SDCC_2				23
+#define SLAVE_SDCC_4				24
+#define SLAVE_SPSS_CFG				25
+#define SLAVE_TCSR				26
+#define SLAVE_TLMM				27
+#define SLAVE_UFS_MEM_CFG			28
+#define SLAVE_USB3				29
+#define SLAVE_VENUS_CFG				30
+#define SLAVE_VSENSE_CTRL_CFG			31
+#define SLAVE_CNOC_MNOC_CFG			32
+#define SLAVE_PCIE_ANOC_CFG			33
+#define SLAVE_QDSS_STM				34
+#define SLAVE_TCU				35
+
+#define MASTER_GEM_NOC_CNOC			0
+#define MASTER_GEM_NOC_PCIE_SNOC		1
+#define SLAVE_AOSS				2
+#define SLAVE_IPA_CFG				3
+#define SLAVE_IPC_ROUTER_FENCE			4
+#define SLAVE_SOCCP				5
+#define SLAVE_TME_CFG				6
+#define SLAVE_APPSS				7
+#define SLAVE_CNOC_CFG				8
+#define SLAVE_DDRSS_CFG				9
+#define SLAVE_BOOT_IMEM				10
+#define SLAVE_IMEM				11
+#define SLAVE_PCIE_0				12
+
+#define MASTER_GPU_TCU				0
+#define MASTER_SYS_TCU				1
+#define MASTER_APPSS_PROC			2
+#define MASTER_GFX3D				3
+#define MASTER_LPASS_GEM_NOC			4
+#define MASTER_MSS_PROC				5
+#define MASTER_MNOC_HF_MEM_NOC			6
+#define MASTER_MNOC_SF_MEM_NOC			7
+#define MASTER_COMPUTE_NOC			8
+#define MASTER_ANOC_PCIE_GEM_NOC		9
+#define MASTER_QPACE				10
+#define MASTER_SNOC_SF_MEM_NOC			11
+#define MASTER_WLAN_Q6				12
+#define MASTER_GIC				13
+#define SLAVE_GEM_NOC_CNOC			14
+#define SLAVE_LLCC				15
+#define SLAVE_MEM_NOC_PCIE_SNOC			16
+
+#define MASTER_LPIAON_NOC			0
+#define SLAVE_LPASS_GEM_NOC			1
+
+#define MASTER_LPASS_LPINOC			0
+#define SLAVE_LPIAON_NOC_LPASS_AG_NOC		1
+
+#define MASTER_LPASS_PROC			0
+#define SLAVE_LPICX_NOC_LPIAON_NOC		1
+
+#define MASTER_LLCC				0
+#define SLAVE_EBI1				1
+
+#define MASTER_CAMNOC_HF			0
+#define MASTER_CAMNOC_NRT_ICP_SF		1
+#define MASTER_CAMNOC_RT_CDM_SF			2
+#define MASTER_CAMNOC_SF			3
+#define MASTER_MDP				4
+#define MASTER_MDSS_DCP				5
+#define MASTER_CDSP_HCP				6
+#define MASTER_VIDEO_CV_PROC			7
+#define MASTER_VIDEO_EVA			8
+#define MASTER_VIDEO_MVP			9
+#define MASTER_VIDEO_V_PROC			10
+#define MASTER_CNOC_MNOC_CFG			11
+#define SLAVE_MNOC_HF_MEM_NOC			12
+#define SLAVE_MNOC_SF_MEM_NOC			13
+#define SLAVE_SERVICE_MNOC			14
+
+#define MASTER_CDSP_PROC			0
+#define SLAVE_CDSP_MEM_NOC			1
+
+#define MASTER_PCIE_ANOC_CFG			0
+#define MASTER_PCIE_0				1
+#define SLAVE_ANOC_PCIE_GEM_NOC			2
+#define SLAVE_SERVICE_PCIE_ANOC			3
+
+#define MASTER_A1NOC_SNOC			0
+#define MASTER_A2NOC_SNOC			1
+#define MASTER_APSS_NOC				2
+#define MASTER_CNOC_SNOC			3
+#define SLAVE_SNOC_GEM_NOC_SF			4
+
+#endif
-- 
cgit v1.2.3


From 0de4c70d04a46a3c266547dd4275ce25f623796a Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Thu, 25 Sep 2025 09:56:47 +0900
Subject: tracing: fprobe: use rhltable for fprobe_ip_table

Currently, all the kernel functions that are hooked by fprobe are added
to the hash table "fprobe_ip_table". Its key is the function address,
and its value is a "struct fprobe_hlist_node".

The hash table has FPROBE_IP_TABLE_SIZE buckets, which is 256. This
means the overhead of a hash table lookup grows linearly once the number
of functions hooked by fprobe exceeds 256. When we try to hook all the
kernel functions, the overhead will be huge.

Therefore, replace the hash table with rhltable to reduce the overhead.

Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/fprobe.h |   3 +-
 kernel/trace/fprobe.c  | 157 ++++++++++++++++++++++++++++---------------------
 2 files changed, 93 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 7964db96e41a..0a3bcd1718f3 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/rhashtable.h>
 #include <linux/slab.h>
 
 struct fprobe;
@@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
  * @fp: The fprobe which owns this.
  */
 struct fprobe_hlist_node {
-	struct hlist_node	hlist;
+	struct rhlist_head	hlist;
 	unsigned long		addr;
 	struct fprobe		*fp;
 };
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 5a807d62e76d..e063e22e1134 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -10,6 +10,7 @@
 #include <linux/kprobes.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/rhashtable.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
 
@@ -41,60 +42,67 @@
  *  - RCU hlist traversal under disabling preempt
  */
 static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
-static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static struct rhltable fprobe_ip_table;
 static DEFINE_MUTEX(fprobe_mutex);
 
-/*
- * Find first fprobe in the hlist. It will be iterated twice in the entry
- * probe, once for correcting the total required size, the second time is
- * calling back the user handlers.
- * Thus the hlist in the fprobe_table must be sorted and new probe needs to
- * be added *before* the first fprobe.
- */
-static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed)
 {
-	struct fprobe_hlist_node *node;
-	struct hlist_head *head;
+	return hash_ptr(*(unsigned long **)data, 32);
+}
 
-	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
-	hlist_for_each_entry_rcu(node, head, hlist,
-				 lockdep_is_held(&fprobe_mutex)) {
-		if (node->addr == ip)
-			return node;
-	}
-	return NULL;
+static int fprobe_node_cmp(struct rhashtable_compare_arg *arg,
+			   const void *ptr)
+{
+	unsigned long key = *(unsigned long *)arg->key;
+	const struct fprobe_hlist_node *n = ptr;
+
+	return n->addr != key;
 }
-NOKPROBE_SYMBOL(find_first_fprobe_node);
 
-/* Node insertion and deletion requires the fprobe_mutex */
-static void insert_fprobe_node(struct fprobe_hlist_node *node)
+static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed)
 {
-	unsigned long ip = node->addr;
-	struct fprobe_hlist_node *next;
-	struct hlist_head *head;
+	const struct fprobe_hlist_node *n = data;
+
+	return hash_ptr((void *)n->addr, 32);
+}
 
+static const struct rhashtable_params fprobe_rht_params = {
+	.head_offset		= offsetof(struct fprobe_hlist_node, hlist),
+	.key_offset		= offsetof(struct fprobe_hlist_node, addr),
+	.key_len		= sizeof_field(struct fprobe_hlist_node, addr),
+	.hashfn			= fprobe_node_hashfn,
+	.obj_hashfn		= fprobe_node_obj_hashfn,
+	.obj_cmpfn		= fprobe_node_cmp,
+	.automatic_shrinking	= true,
+};
+
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node)
+{
 	lockdep_assert_held(&fprobe_mutex);
 
-	next = find_first_fprobe_node(ip);
-	if (next) {
-		hlist_add_before_rcu(&node->hlist, &next->hlist);
-		return;
-	}
-	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
-	hlist_add_head_rcu(&node->hlist, head);
+	return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
 }
 
 /* Return true if there are synonims */
 static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
 	lockdep_assert_held(&fprobe_mutex);
+	bool ret;
 
 	/* Avoid double deleting */
 	if (READ_ONCE(node->fp) != NULL) {
 		WRITE_ONCE(node->fp, NULL);
-		hlist_del_rcu(&node->hlist);
+		rhltable_remove(&fprobe_ip_table, &node->hlist,
+				fprobe_rht_params);
 	}
-	return !!find_first_fprobe_node(node->addr);
+
+	rcu_read_lock();
+	ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr,
+				fprobe_rht_params);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 /* Check existence of the fprobe */
@@ -249,9 +257,10 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
 static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 			struct ftrace_regs *fregs)
 {
-	struct fprobe_hlist_node *node, *first;
 	unsigned long *fgraph_data = NULL;
 	unsigned long func = trace->func;
+	struct fprobe_hlist_node *node;
+	struct rhlist_head *head, *pos;
 	unsigned long ret_ip;
 	int reserved_words;
 	struct fprobe *fp;
@@ -260,14 +269,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 	if (WARN_ON_ONCE(!fregs))
 		return 0;
 
-	first = node = find_first_fprobe_node(func);
-	if (unlikely(!first))
-		return 0;
-
+	head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params);
 	reserved_words = 0;
-	hlist_for_each_entry_from_rcu(node, hlist) {
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
 		if (node->addr != func)
-			break;
+			continue;
 		fp = READ_ONCE(node->fp);
 		if (!fp || !fp->exit_handler)
 			continue;
@@ -278,13 +284,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 		reserved_words +=
 			FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
 	}
-	node = first;
 	if (reserved_words) {
 		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
 		if (unlikely(!fgraph_data)) {
-			hlist_for_each_entry_from_rcu(node, hlist) {
+			rhl_for_each_entry_rcu(node, pos, head, hlist) {
 				if (node->addr != func)
-					break;
+					continue;
 				fp = READ_ONCE(node->fp);
 				if (fp && !fprobe_disabled(fp))
 					fp->nmissed++;
@@ -299,12 +304,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 	 */
 	ret_ip = ftrace_regs_get_return_address(fregs);
 	used = 0;
-	hlist_for_each_entry_from_rcu(node, hlist) {
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
 		int data_size;
 		void *data;
 
 		if (node->addr != func)
-			break;
+			continue;
 		fp = READ_ONCE(node->fp);
 		if (!fp || fprobe_disabled(fp))
 			continue;
@@ -449,25 +454,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
 	return 0;
 }
 
-static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
-					struct fprobe_addr_list *alist)
+static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+					 struct fprobe_addr_list *alist)
 {
-	struct fprobe_hlist_node *node;
 	int ret = 0;
 
-	hlist_for_each_entry_rcu(node, head, hlist,
-				 lockdep_is_held(&fprobe_mutex)) {
-		if (!within_module(node->addr, mod))
-			continue;
-		if (delete_fprobe_node(node))
-			continue;
-		/*
-		 * If failed to update alist, just continue to update hlist.
-		 * Therefore, at list user handler will not hit anymore.
-		 */
-		if (!ret)
-			ret = fprobe_addr_list_add(alist, node->addr);
-	}
+	if (!within_module(node->addr, mod))
+		return;
+	if (delete_fprobe_node(node))
+		return;
+	/*
+	 * If failed to update alist, just continue to update hlist.
+	 * Therefore, at list user handler will not hit anymore.
+	 */
+	if (!ret)
+		ret = fprobe_addr_list_add(alist, node->addr);
 }
 
 /* Handle module unloading to manage fprobe_ip_table. */
@@ -475,8 +476,9 @@ static int fprobe_module_callback(struct notifier_block *nb,
 				  unsigned long val, void *data)
 {
 	struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+	struct fprobe_hlist_node *node;
+	struct rhashtable_iter iter;
 	struct module *mod = data;
-	int i;
 
 	if (val != MODULE_STATE_GOING)
 		return NOTIFY_DONE;
@@ -487,8 +489,16 @@ static int fprobe_module_callback(struct notifier_block *nb,
 		return NOTIFY_DONE;
 
 	mutex_lock(&fprobe_mutex);
-	for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
-		fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
+	rhltable_walk_enter(&fprobe_ip_table, &iter);
+	do {
+		rhashtable_walk_start(&iter);
+
+		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
+			fprobe_remove_node_in_module(mod, node, &alist);
+
+		rhashtable_walk_stop(&iter);
+	} while (node == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&iter);
 
 	if (alist.index > 0)
 		ftrace_set_filter_ips(&fprobe_graph_ops.ops,
@@ -728,8 +738,16 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	ret = fprobe_graph_add_ips(addrs, num);
 	if (!ret) {
 		add_fprobe_hash(fp);
-		for (i = 0; i < hlist_array->size; i++)
-			insert_fprobe_node(&hlist_array->array[i]);
+		for (i = 0; i < hlist_array->size; i++) {
+			ret = insert_fprobe_node(&hlist_array->array[i]);
+			if (ret)
+				break;
+		}
+		/* fallback on insert error */
+		if (ret) {
+			for (i--; i >= 0; i--)
+				delete_fprobe_node(&hlist_array->array[i]);
+		}
 	}
 	mutex_unlock(&fprobe_mutex);
 
@@ -825,3 +843,10 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
+
+static int __init fprobe_initcall(void)
+{
+	rhltable_init(&fprobe_ip_table, &fprobe_rht_params);
+	return 0;
+}
+late_initcall(fprobe_initcall);
-- 
cgit v1.2.3


From 68c4c159a0db4409a5d6b5f4703d71b89a96f06a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Oct 2025 14:30:32 +0000
Subject: genirq: Fix percpu_devid irq affinity documentation

Stephen points out that some of the percpu_devid irq affinity
documentation is either missing or not matching the data structures.

Address all the issues in one go.

Fixes: 87b0031f7f73 ("irqdomain: Add firmware info reporting interface")
Fixes: 258e7d28a3dc ("genirq: Add affinity to percpu_devid interrupt requests")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251030143032.2035987-1-maz@kernel.org
---
 include/linux/interrupt.h | 1 +
 include/linux/irqdomain.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index fa62ab556ee3..266f2b39213a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  * @name:	name of the device
  * @dev_id:	cookie to identify the device
  * @percpu_dev_id:	cookie to identify the device
+ * @affinity:	CPUs this irqaction is allowed to run on
  * @next:	pointer to the next irqaction for shared interrupts
  * @irq:	interrupt number
  * @flags:	flags (see IRQF_* above)
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 5907baf6099d..952d3c8dd6b7 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -48,7 +48,7 @@ struct irq_fwspec {
  * struct irq_fwspec_info - firmware provided IRQ information structure
  *
  * @flags:		Information validity flags
- * @cpumask:		Affinity mask for this interrupt
+ * @affinity:		Affinity mask for this interrupt
  *
  * This structure reports firmware-specific information about an
  * interrupt. The only significant information is the affinity of a
-- 
cgit v1.2.3


From 54133f9b4b53ffa2204eb27cfc9d50072c9a52d2 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 29 Oct 2025 13:43:10 -0700
Subject: net: mana: Support HW link state events

Handle the NIC hardware link state events received from the HW
channel, then set the proper link state accordingly.

Also add a feature bit, GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE,
to inform the NIC hardware that this handler exists.

The MANA NIC only sends link state down/up messages when the VM
needs to rerun its DHCP client and change its IP address. So, call
netif_carrier_on() in probe() so that the NIC shows the right
initial state in /sys/class/net/ethX/operstate.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1761770601-16920-1-git-send-email-haiyangz@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c  |  1 +
 drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 ++++++
 drivers/net/ethernet/microsoft/mana/mana_en.c    | 54 +++++++++++++++++++++---
 include/net/mana/gdma.h                          |  4 +-
 include/net/mana/hw_channel.h                    |  2 +
 include/net/mana/mana.h                          |  4 ++
 6 files changed, 71 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 43f034e180c4..effe0a2f207a 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -528,6 +528,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 	case GDMA_EQE_HWC_INIT_DONE:
 	case GDMA_EQE_HWC_SOC_SERVICE:
 	case GDMA_EQE_RNIC_QP_FATAL:
+	case GDMA_EQE_HWC_SOC_RECONFIG_DATA:
 		if (!eq->eq.callback)
 			break;
 
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index ada6c78a2bef..aa4e2731e2ba 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -118,6 +118,7 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
 	struct gdma_dev *gd = hwc->gdma_dev;
 	union hwc_init_type_data type_data;
 	union hwc_init_eq_id_db eq_db;
+	struct mana_context *ac;
 	u32 type, val;
 	int ret;
 
@@ -196,6 +197,17 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
 			hwc->hwc_timeout = val;
 			break;
 
+		case HWC_DATA_HW_LINK_CONNECT:
+		case HWC_DATA_HW_LINK_DISCONNECT:
+			ac = gd->gdma_context->mana.driver_data;
+			if (!ac)
+				break;
+
+			WRITE_ONCE(ac->link_event, type);
+			schedule_work(&ac->link_change_work);
+
+			break;
+
 		default:
 			dev_warn(hwc->dev, "Received unknown reconfig type %u\n", type);
 			break;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 0142fd98392c..739087081dfd 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -20,6 +20,7 @@
 
 #include <net/mana/mana.h>
 #include <net/mana/mana_auxiliary.h>
+#include <net/mana/hw_channel.h>
 
 static DEFINE_IDA(mana_adev_ida);
 
@@ -84,7 +85,6 @@ static int mana_open(struct net_device *ndev)
 	/* Ensure port state updated before txq state */
 	smp_wmb();
 
-	netif_carrier_on(ndev);
 	netif_tx_wake_all_queues(ndev);
 	netdev_dbg(ndev, "%s successful\n", __func__);
 	return 0;
@@ -100,6 +100,46 @@ static int mana_close(struct net_device *ndev)
 	return mana_detach(ndev, true);
 }
 
+static void mana_link_state_handle(struct work_struct *w)
+{
+	struct mana_context *ac;
+	struct net_device *ndev;
+	u32 link_event;
+	bool link_up;
+	int i;
+
+	ac = container_of(w, struct mana_context, link_change_work);
+
+	rtnl_lock();
+
+	link_event = READ_ONCE(ac->link_event);
+
+	if (link_event == HWC_DATA_HW_LINK_CONNECT)
+		link_up = true;
+	else if (link_event == HWC_DATA_HW_LINK_DISCONNECT)
+		link_up = false;
+	else
+		goto out;
+
+	/* Process all ports */
+	for (i = 0; i < ac->num_ports; i++) {
+		ndev = ac->ports[i];
+		if (!ndev)
+			continue;
+
+		if (link_up) {
+			netif_carrier_on(ndev);
+
+			__netdev_notify_peers(ndev);
+		} else {
+			netif_carrier_off(ndev);
+		}
+	}
+
+out:
+	rtnl_unlock();
+}
+
 static bool mana_can_tx(struct gdma_queue *wq)
 {
 	return mana_gd_wq_avail_space(wq) >= MAX_TX_WQE_SIZE;
@@ -3059,9 +3099,6 @@ int mana_attach(struct net_device *ndev)
 	/* Ensure port state updated before txq state */
 	smp_wmb();
 
-	if (apc->port_is_up)
-		netif_carrier_on(ndev);
-
 	netif_device_attach(ndev);
 
 	return 0;
@@ -3154,7 +3191,6 @@ int mana_detach(struct net_device *ndev, bool from_close)
 	smp_wmb();
 
 	netif_tx_disable(ndev);
-	netif_carrier_off(ndev);
 
 	if (apc->port_st_save) {
 		err = mana_dealloc_queues(ndev);
@@ -3243,6 +3279,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 		goto free_indir;
 	}
 
+	netif_carrier_on(ndev);
+
 	debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs, &apc->speed);
 
 	return 0;
@@ -3431,6 +3469,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	if (!resuming) {
 		ac->num_ports = num_ports;
+
+		INIT_WORK(&ac->link_change_work, mana_link_state_handle);
 	} else {
 		if (ac->num_ports != num_ports) {
 			dev_err(dev, "The number of vPorts changed: %d->%d\n",
@@ -3438,6 +3478,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 			err = -EPROTO;
 			goto out;
 		}
+
+		enable_work(&ac->link_change_work);
 	}
 
 	if (ac->num_ports == 0)
@@ -3500,6 +3542,8 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	int err;
 	int i;
 
+	disable_work_sync(&ac->link_change_work);
+
 	/* adev currently doesn't support suspending, always remove it */
 	if (gd->adev)
 		remove_adev(gd);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 57df78cfbf82..637f42485dba 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -590,6 +590,7 @@ enum {
 
 /* Driver can self reset on FPGA Reconfig EQE notification */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
+#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
 
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
@@ -599,7 +600,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
 	 GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
 	 GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
-	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
+	 GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h
index 83cf93338eb3..16feb39616c1 100644
--- a/include/net/mana/hw_channel.h
+++ b/include/net/mana/hw_channel.h
@@ -24,6 +24,8 @@
 #define HWC_INIT_DATA_PF_DEST_CQ_ID	11
 
 #define HWC_DATA_CFG_HWC_TIMEOUT 1
+#define HWC_DATA_HW_LINK_CONNECT 2
+#define HWC_DATA_HW_LINK_DISCONNECT 3
 
 #define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 0921485565c0..8906901535f5 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -477,6 +477,10 @@ struct mana_context {
 	struct dentry *mana_eqs_debugfs;
 
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
+
+	/* Link state change work */
+	struct work_struct link_change_work;
+	u32 link_event;
 };
 
 struct mana_port_context {
-- 
cgit v1.2.3


From 30176bf7c871681df506f3165ffe76ec462db991 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Wed, 29 Oct 2025 16:32:06 +0100
Subject: dpll: add phase-adjust-gran pin attribute

Phase-adjust values are currently limited by a min-max range. Some
hardware requires, for certain pin types, that values be multiples of
a specific granularity, as in the zl3073x driver.

Add a `phase-adjust-gran` pin attribute and an appropriate field in
dpll_pin_properties. If set by the driver, use its value to validate
user-provided phase-adjust values.

Reviewed-by: Michal Schmidt <mschmidt@redhat.com>
Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20251029153207.178448-2-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/driver-api/dpll.rst     | 36 +++++++++++++++++++----------------
 Documentation/netlink/specs/dpll.yaml |  7 +++++++
 drivers/dpll/dpll_netlink.c           | 12 +++++++++++-
 include/linux/dpll.h                  |  1 +
 include/uapi/linux/dpll.h             |  1 +
 5 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst
index be1fc643b645..83118c728ed9 100644
--- a/Documentation/driver-api/dpll.rst
+++ b/Documentation/driver-api/dpll.rst
@@ -198,26 +198,28 @@ be requested with the same attribute with ``DPLL_CMD_DEVICE_SET`` command.
   ================================== ======================================
 
 Device may also provide ability to adjust a signal phase on a pin.
-If pin phase adjustment is supported, minimal and maximal values that pin
-handle shall be provide to the user on ``DPLL_CMD_PIN_GET`` respond
-with ``DPLL_A_PIN_PHASE_ADJUST_MIN`` and ``DPLL_A_PIN_PHASE_ADJUST_MAX``
+If pin phase adjustment is supported, minimal and maximal values and
+granularity that pin handle shall be provided to the user on
+``DPLL_CMD_PIN_GET`` respond with ``DPLL_A_PIN_PHASE_ADJUST_MIN``,
+``DPLL_A_PIN_PHASE_ADJUST_MAX`` and ``DPLL_A_PIN_PHASE_ADJUST_GRAN``
 attributes. Configured phase adjust value is provided with
 ``DPLL_A_PIN_PHASE_ADJUST`` attribute of a pin, and value change can be
 requested with the same attribute with ``DPLL_CMD_PIN_SET`` command.
 
-  =============================== ======================================
-  ``DPLL_A_PIN_ID``               configured pin id
-  ``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment
-  ``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase adjustment
-  ``DPLL_A_PIN_PHASE_ADJUST``     attr configured value of phase
-                                  adjustment on parent dpll device
-  ``DPLL_A_PIN_PARENT_DEVICE``    nested attribute for requesting
-                                  configuration on given parent dpll
-                                  device
-    ``DPLL_A_PIN_PARENT_ID``      parent dpll device id
-    ``DPLL_A_PIN_PHASE_OFFSET``   attr measured phase difference
-                                  between a pin and parent dpll device
-  =============================== ======================================
+  ================================ ==========================================
+  ``DPLL_A_PIN_ID``                configured pin id
+  ``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attr granularity of phase adjustment value
+  ``DPLL_A_PIN_PHASE_ADJUST_MIN``  attr minimum value of phase adjustment
+  ``DPLL_A_PIN_PHASE_ADJUST_MAX``  attr maximum value of phase adjustment
+  ``DPLL_A_PIN_PHASE_ADJUST``      attr configured value of phase
+                                   adjustment on parent dpll device
+  ``DPLL_A_PIN_PARENT_DEVICE``     nested attribute for requesting
+                                   configuration on given parent dpll
+                                   device
+    ``DPLL_A_PIN_PARENT_ID``       parent dpll device id
+    ``DPLL_A_PIN_PHASE_OFFSET``    attr measured phase difference
+                                   between a pin and parent dpll device
+  ================================ ==========================================
 
 All phase related values are provided in pico seconds, which represents
 time difference between signals phase. The negative value means that
@@ -384,6 +386,8 @@ according to attribute purpose.
                                        frequencies
       ``DPLL_A_PIN_ANY_FREQUENCY_MIN`` attr minimum value of frequency
       ``DPLL_A_PIN_ANY_FREQUENCY_MAX`` attr maximum value of frequency
+    ``DPLL_A_PIN_PHASE_ADJUST_GRAN``   attr granularity of phase
+                                       adjustment value
     ``DPLL_A_PIN_PHASE_ADJUST_MIN``    attr minimum value of phase
                                        adjustment
     ``DPLL_A_PIN_PHASE_ADJUST_MAX``    attr maximum value of phase
diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index 80728f6f9bc8..78d0724d7e12 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -440,6 +440,12 @@ attribute-sets:
         doc: |
           Capable pin provides list of pins that can be bound to create a
           reference-sync pin pair.
+      -
+        name: phase-adjust-gran
+        type: u32
+        doc: |
+          Granularity of phase adjustment, in picoseconds. The value of
+          phase adjustment must be a multiple of this granularity.
 
   -
     name: pin-parent-device
@@ -616,6 +622,7 @@ operations:
             - capabilities
             - parent-device
             - parent-pin
+            - phase-adjust-gran
             - phase-adjust-min
             - phase-adjust-max
             - phase-adjust
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index a4153bcb6dcf..64944f601ee5 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -637,6 +637,10 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
 	ret = dpll_msg_add_pin_freq(msg, pin, ref, extack);
 	if (ret)
 		return ret;
+	if (prop->phase_gran &&
+	    nla_put_u32(msg, DPLL_A_PIN_PHASE_ADJUST_GRAN,
+			prop->phase_gran))
+		return -EMSGSIZE;
 	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MIN,
 			prop->phase_range.min))
 		return -EMSGSIZE;
@@ -1261,7 +1265,13 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr,
 	if (phase_adj > pin->prop.phase_range.max ||
 	    phase_adj < pin->prop.phase_range.min) {
 		NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
-				    "phase adjust value not supported");
+				    "phase adjust value of out range");
+		return -EINVAL;
+	}
+	if (pin->prop.phase_gran && phase_adj % (s32)pin->prop.phase_gran) {
+		NL_SET_ERR_MSG_ATTR_FMT(extack, phase_adj_attr,
+					"phase adjust value not multiple of %u",
+					pin->prop.phase_gran);
 		return -EINVAL;
 	}
 
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 25be745bf41f..562f520b23c2 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -163,6 +163,7 @@ struct dpll_pin_properties {
 	u32 freq_supported_num;
 	struct dpll_pin_frequency *freq_supported;
 	struct dpll_pin_phase_adjust_range phase_range;
+	u32 phase_gran;
 };
 
 #if IS_ENABLED(CONFIG_DPLL)
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index ab1725a954d7..69d35570ac4f 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -251,6 +251,7 @@ enum dpll_a_pin {
 	DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED,
 	DPLL_A_PIN_ESYNC_PULSE,
 	DPLL_A_PIN_REFERENCE_SYNC,
+	DPLL_A_PIN_PHASE_ADJUST_GRAN,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit v1.2.3


From 652a86b24c5ac444afaf7625c9340d55aab7f105 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 31 Oct 2025 14:08:32 +0100
Subject: err.h: add INIT_ERR_PTR() macro

Add an INIT_ERR_PTR() macro to initialize static variables with error
pointers. This is useful for the specific case where a static variable
is initialized to an error condition and later set to the real handle
once probe completes.

This is to handle compilation problems like:

error: initializer element is not constant

where ERR_PTR() can't be used.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20251031130835.7953-2-ansuelsmth@gmail.com
[bjorn: Added () suffix on macro references]
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/err.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/err.h b/include/linux/err.h
index 1d60aa86db53..8c37be0620ab 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -41,6 +41,14 @@ static inline void * __must_check ERR_PTR(long error)
 	return (void *) error;
 }
 
+/**
+ * INIT_ERR_PTR - Init a const error pointer.
+ * @error: A negative error code.
+ *
+ * Like ERR_PTR(), but usable to initialize static variables.
+ */
+#define INIT_ERR_PTR(error) ((void *)(error))
+
 /* Return the pointer in the percpu address space. */
 #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))
 
-- 
cgit v1.2.3


From 51d0656959bcdb743232f9b530b4cca569e74e7f Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 27 Oct 2025 13:59:31 +0100
Subject: genirq/manage: Reduce priority of forced secondary interrupt handler

Crystal reports that the PCIe Advanced Error Reporting driver gets stuck
in an infinite loop on PREEMPT_RT:

Both the primary interrupt handler aer_irq() as well as the secondary
handler aer_isr() are forced into threads with identical priority.
Crystal writes that on the ARM system in question, the primary handler
has to clear an error in the Root Error Status register...

   "before the next error happens, or else the hardware will set the
    Multiple ERR_COR Received bit.  If that bit is set, then aer_isr()
    can't rely on the Error Source Identification register, so it scans
    through all devices looking for errors -- and for some reason, on
    this system, accessing the AER registers (or any Config Space above
    0x400, even though there are capabilities located there) generates
    an Unsupported Request Error (but returns valid data).  Since this
    happens more than once, without aer_irq() preempting, it causes
    another multi error and we get stuck in a loop."

The issue does not show on non-PREEMPT_RT because the primary handler
runs in hardirq context and thus can preempt the threaded secondary
handler, clear the Root Error Status register and prevent the secondary
handler from getting stuck.

Emulate the same behavior on PREEMPT_RT by assigning a lower default
priority to the secondary handler if the primary handler is forced into
a thread.

Reported-by: Crystal Wood <crwood@redhat.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Crystal Wood <crwood@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/f6dcdb41be2694886b8dbf4fe7b3ab89e9d5114c.1761569303.git.lukas@wunner.de
Closes: https://lore.kernel.org/r/20250902224441.368483-1-crwood@redhat.com/
---
 include/linux/sched.h   |  1 +
 kernel/irq/manage.c     |  5 ++++-
 kernel/sched/syscalls.c | 13 +++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..cd6be74d87b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1901,6 +1901,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern void sched_set_fifo(struct task_struct *p);
 extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_fifo_secondary(struct task_struct *p);
 extern void sched_set_normal(struct task_struct *p, int nice);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7a09d96f4d2d..c812b6f48f2b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1239,7 +1239,10 @@ static int irq_thread(void *data)
 
 	irq_thread_set_ready(desc, action);
 
-	sched_set_fifo(current);
+	if (action->handler == irq_forced_secondary_handler)
+		sched_set_fifo_secondary(current);
+	else
+		sched_set_fifo(current);
 
 	if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
 					   &action->thread_flags))
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f36e84..48347950ac48 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -856,6 +856,19 @@ void sched_set_fifo_low(struct task_struct *p)
 }
 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
 
+/*
+ * Used when the primary interrupt handler is forced into a thread, in addition
+ * to the (always threaded) secondary handler.  The secondary handler gets a
+ * slightly lower priority so that the primary handler can preempt it, thereby
+ * emulating the behavior of a non-PREEMPT_RT system where the primary handler
+ * runs in hard interrupt context.
+ */
+void sched_set_fifo_secondary(struct task_struct *p)
+{
+	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
+	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+
 void sched_set_normal(struct task_struct *p, int nice)
 {
 	struct sched_attr attr = {
-- 
cgit v1.2.3


From 933ecf591275e850a46b28c6016d2688b92e23f6 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 1 Nov 2025 21:19:41 -0700
Subject: random: remove unused get_random_var_wait functions

None of these functions are used, so remove them.

This renders the two bugs moot:

- get_random_u64_wait() used the wrong pointer type, making it provide
  only 32 bits.

- The '#undef' directive used the wrong identifier, leaving the helper
  macro defined.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/random.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/random.h b/include/linux/random.h
index 333cecfca93f..8a8064dc3970 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -130,21 +130,6 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes)
 	return ret;
 }
 
-#define declare_get_random_var_wait(name, ret_type) \
-	static inline int get_random_ ## name ## _wait(ret_type *out) { \
-		int ret = wait_for_random_bytes(); \
-		if (unlikely(ret)) \
-			return ret; \
-		*out = get_random_ ## name(); \
-		return 0; \
-	}
-declare_get_random_var_wait(u8, u8)
-declare_get_random_var_wait(u16, u16)
-declare_get_random_var_wait(u32, u32)
-declare_get_random_var_wait(u64, u32)
-declare_get_random_var_wait(long, unsigned long)
-#undef declare_get_random_var
-
 #ifdef CONFIG_SMP
 int random_prepare_cpu(unsigned int cpu);
 int random_online_cpu(unsigned int cpu);
-- 
cgit v1.2.3


From 8ed6b8842c44a4a716dfd536e7f13aff77039a02 Mon Sep 17 00:00:00 2001
From: Dzmitry Sankouski <dsankouski@gmail.com>
Date: Thu, 25 Sep 2025 22:09:56 +0300
Subject: power: supply: max77705_charger: implement aicl feature

Adaptive input current allows the charger to reduce its current
consumption when the source is not able to provide enough power.

Signed-off-by: Dzmitry Sankouski <dsankouski@gmail.com>
Link: https://patch.msgid.link/20250925-max77705_77976_charger_improvement-v6-1-972c716c17d1@gmail.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/max77705_charger.c | 42 +++++++++++++++++++++++++++++++++
 include/linux/power/max77705_charger.h  |  2 ++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c
index b1a227bf72e2..35cdb10a0e89 100644
--- a/drivers/power/supply/max77705_charger.c
+++ b/drivers/power/supply/max77705_charger.c
@@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = {
 	POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
 };
 
+static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data)
+{
+	struct max77705_charger_data *chg = irq_drv_data;
+	unsigned int regval, irq_status;
+	int err;
+
+	err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+	if (err < 0)
+		return IRQ_HANDLED;
+
+	// irq is fired at the end of the current decrease sequence too;
+	// check the AICL_I bit early to guard against that excess irq call
+	while (!(irq_status & BIT(MAX77705_AICL_I))) {
+		err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], &regval);
+		if (err < 0)
+			return IRQ_HANDLED;
+
+		regval--;
+
+		err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval);
+		if (err < 0)
+			return IRQ_HANDLED;
+
+		msleep(AICL_WORK_DELAY_MS);
+
+		err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+		if (err < 0)
+			return IRQ_HANDLED;
+	}
+
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data)
 {
 	struct max77705_charger_data *chg = irq_drv_data;
@@ -632,6 +665,15 @@ static int max77705_charger_probe(struct i2c_client *i2c)
 		goto destroy_wq;
 	}
 
+	ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I),
+					NULL, max77705_aicl_irq,
+					IRQF_TRIGGER_NONE,
+					"aicl-irq", chg);
+	if (ret) {
+		dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n");
+		goto destroy_wq;
+	}
+
 	ret = max77705_charger_enable(chg);
 	if (ret) {
 		dev_err_probe(dev, ret, "failed to enable charge\n");
diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h
index 6653abfdf747..b3950ce0625e 100644
--- a/include/linux/power/max77705_charger.h
+++ b/include/linux/power/max77705_charger.h
@@ -123,6 +123,8 @@
 #define MAX77705_DISABLE_SKIP		1
 #define MAX77705_AUTO_SKIP		0
 
+#define AICL_WORK_DELAY_MS		100
+
 /* uA */
 #define MAX77705_CURRENT_CHGIN_STEP	25000
 #define MAX77705_CURRENT_CHG_STEP	50000
-- 
cgit v1.2.3


From 3434be392051a2fdb295df3cfe07bf75235250a0 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Oct 2025 10:38:14 +0000
Subject: scsi: target: Rename target_configure_unmap_from_queue()

Rename target_configure_unmap_from_queue() to
target_configure_unmap_from_bdev() since it now takes a bdev.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Link: https://patch.msgid.link/20251020103820.2917593-2-john.g.garry@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c  | 6 +++---
 drivers/target/target_core_file.c    | 4 ++--
 drivers/target/target_core_iblock.c  | 4 ++--
 include/target/target_core_backend.h | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 7bb711b24c0d..83fe3d9a9681 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -844,8 +844,8 @@ free_device:
  * Check if the underlying struct block_device supports discard and if yes
  * configure the UNMAP parameters.
  */
-bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-				       struct block_device *bdev)
+bool target_configure_unmap_from_bdev(struct se_dev_attrib *attrib,
+				      struct block_device *bdev)
 {
 	int block_size = bdev_logical_block_size(bdev);
 
@@ -863,7 +863,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
 		bdev_discard_alignment(bdev) / block_size;
 	return true;
 }
-EXPORT_SYMBOL(target_configure_unmap_from_queue);
+EXPORT_SYMBOL(target_configure_unmap_from_bdev);
 
 /*
  * Convert from blocksize advertised to the initiator to the 512 byte
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 2d78ef74633c..b2610073e8cc 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -92,8 +92,8 @@ static bool fd_configure_unmap(struct se_device *dev)
 	struct inode *inode = file->f_mapping->host;
 
 	if (S_ISBLK(inode->i_mode))
-		return target_configure_unmap_from_queue(&dev->dev_attrib,
-							 I_BDEV(inode));
+		return target_configure_unmap_from_bdev(&dev->dev_attrib,
+							I_BDEV(inode));
 
 	/* Limit UNMAP emulation to 8k Number of LBAs (NoLB) */
 	dev->dev_attrib.max_unmap_lba_count = 0x2000;
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 66c292b7d74b..281612b9830f 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -84,8 +84,8 @@ static bool iblock_configure_unmap(struct se_device *dev)
 {
 	struct iblock_dev *ib_dev = IBLOCK_DEV(dev);
 
-	return target_configure_unmap_from_queue(&dev->dev_attrib,
-						 ib_dev->ibd_bd);
+	return target_configure_unmap_from_bdev(&dev->dev_attrib,
+						ib_dev->ibd_bd);
 }
 
 static int iblock_configure_device(struct se_device *dev)
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index 4063a701081b..d394306f8f49 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -121,8 +121,8 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
 
 bool target_sense_desc_format(struct se_device *dev);
 sector_t target_to_linux_sector(struct se_device *dev, sector_t lb);
-bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-				       struct block_device *bdev);
+bool target_configure_unmap_from_bdev(struct se_dev_attrib *attrib,
+				      struct block_device *bdev);
 
 static inline bool target_dev_configured(struct se_device *se_dev)
 {
-- 
cgit v1.2.3


From d505447b8d78f4d81a67d492ac72b8d3a1805e72 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Oct 2025 10:38:15 +0000
Subject: scsi: target: Add atomic se_device fields

Add atomic fields to the se_device and export them in configfs.

Initially only target_core_iblock will be supported and we will inherit
all the settings from the block layer.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
jpg: Stop being allowed to configure atomic write alignment
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://patch.msgid.link/20251020103820.2917593-3-john.g.garry@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_configfs.c | 15 +++++++++++++++
 include/target/target_core_base.h     |  5 +++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c
index b19acd662726..5470c1258445 100644
--- a/drivers/target/target_core_configfs.c
+++ b/drivers/target/target_core_configfs.c
@@ -578,6 +578,11 @@ DEF_CONFIGFS_ATTRIB_SHOW(unmap_zeroes_data);
 DEF_CONFIGFS_ATTRIB_SHOW(max_write_same_len);
 DEF_CONFIGFS_ATTRIB_SHOW(emulate_rsoc);
 DEF_CONFIGFS_ATTRIB_SHOW(submit_type);
+DEF_CONFIGFS_ATTRIB_SHOW(atomic_max_len);
+DEF_CONFIGFS_ATTRIB_SHOW(atomic_alignment);
+DEF_CONFIGFS_ATTRIB_SHOW(atomic_granularity);
+DEF_CONFIGFS_ATTRIB_SHOW(atomic_max_with_boundary);
+DEF_CONFIGFS_ATTRIB_SHOW(atomic_max_boundary);
 
 #define DEF_CONFIGFS_ATTRIB_STORE_U32(_name)				\
 static ssize_t _name##_store(struct config_item *item, const char *page,\
@@ -1300,6 +1305,11 @@ CONFIGFS_ATTR(, max_write_same_len);
 CONFIGFS_ATTR(, alua_support);
 CONFIGFS_ATTR(, pgr_support);
 CONFIGFS_ATTR(, submit_type);
+CONFIGFS_ATTR_RO(, atomic_max_len);
+CONFIGFS_ATTR_RO(, atomic_alignment);
+CONFIGFS_ATTR_RO(, atomic_granularity);
+CONFIGFS_ATTR_RO(, atomic_max_with_boundary);
+CONFIGFS_ATTR_RO(, atomic_max_boundary);
 
 /*
  * dev_attrib attributes for devices using the target core SBC/SPC
@@ -1343,6 +1353,11 @@ struct configfs_attribute *sbc_attrib_attrs[] = {
 	&attr_pgr_support,
 	&attr_emulate_rsoc,
 	&attr_submit_type,
+	&attr_atomic_alignment,
+	&attr_atomic_max_len,
+	&attr_atomic_granularity,
+	&attr_atomic_max_with_boundary,
+	&attr_atomic_max_boundary,
 	NULL,
 };
 EXPORT_SYMBOL(sbc_attrib_attrs);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..70ece58d3078 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -731,6 +731,11 @@ struct se_dev_attrib {
 	u32		unmap_granularity;
 	u32		unmap_granularity_alignment;
 	u32		max_write_same_len;
+	u32		atomic_max_len;
+	u32		atomic_alignment;
+	u32		atomic_granularity;
+	u32		atomic_max_with_boundary;
+	u32		atomic_max_boundary;
 	u8		submit_type;
 	struct se_device *da_dev;
 	struct config_group da_group;
-- 
cgit v1.2.3


From c486634fe2b10301bd8f0319c70a919433bfdf17 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Oct 2025 10:38:16 +0000
Subject: scsi: target: Add helper to set up atomic values from block_device

Add a helper function that sets up the atomic value based on a
block_device similar to what we do for unmap.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
jpg: Set atomic alignment, drop atomic_supported reference
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://patch.msgid.link/20251020103820.2917593-4-john.g.garry@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c  | 17 +++++++++++++++++
 include/target/target_core_backend.h |  2 ++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 83fe3d9a9681..39a2d9c3eb9e 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -840,6 +840,23 @@ free_device:
 	return NULL;
 }
 
+void target_configure_write_atomic_from_bdev(struct se_dev_attrib *attrib,
+					     struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	int block_size = bdev_logical_block_size(bdev);
+
+	if (!bdev_can_atomic_write(bdev))
+		return;
+
+	attrib->atomic_max_len = queue_atomic_write_max_bytes(q) / block_size;
+	attrib->atomic_granularity = attrib->atomic_alignment =
+		queue_atomic_write_unit_min_bytes(q) / block_size;
+	attrib->atomic_max_with_boundary = 0;
+	attrib->atomic_max_boundary = 0;
+}
+EXPORT_SYMBOL_GPL(target_configure_write_atomic_from_bdev);
+
 /*
  * Check if the underlying struct block_device supports discard and if yes
  * configure the UNMAP parameters.
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index d394306f8f49..e32de80854b6 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -123,6 +123,8 @@ bool target_sense_desc_format(struct se_device *dev);
 sector_t target_to_linux_sector(struct se_device *dev, sector_t lb);
 bool target_configure_unmap_from_bdev(struct se_dev_attrib *attrib,
 				      struct block_device *bdev);
+void target_configure_write_atomic_from_bdev(struct se_dev_attrib *attrib,
+					     struct block_device *bdev);
 
 static inline bool target_dev_configured(struct se_device *se_dev)
 {
-- 
cgit v1.2.3


From 526145725106b490b0c2d9f200b705b17a3da6b6 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Mon, 20 Oct 2025 10:38:17 +0000
Subject: scsi: target: Add WRITE_ATOMIC_16 handler

Add the core LIO code to process the WRITE_ATOMIC_16 command.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
jpg: fix return code from sbc_check_atomic, reformat
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://patch.msgid.link/20251020103820.2917593-5-john.g.garry@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_sbc.c  | 51 +++++++++++++++++++++++++++++++++++++++
 include/target/target_core_base.h |  1 +
 2 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index fe8beb7dbab1..abe91dc8722e 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -764,6 +764,49 @@ sbc_check_dpofua(struct se_device *dev, struct se_cmd *cmd, unsigned char *cdb)
 	return 0;
 }
 
+static sense_reason_t
+sbc_check_atomic(struct se_device *dev, struct se_cmd *cmd, unsigned char *cdb)
+{
+	struct se_dev_attrib *attrib = &dev->dev_attrib;
+	u16 boundary, transfer_len;
+	u64 lba;
+
+	lba = transport_lba_64(cdb);
+	boundary = get_unaligned_be16(&cdb[10]);
+	transfer_len = get_unaligned_be16(&cdb[12]);
+
+	if (!attrib->atomic_max_len)
+		return TCM_UNSUPPORTED_SCSI_OPCODE;
+
+	if (boundary) {
+		if (transfer_len > attrib->atomic_max_with_boundary)
+			return TCM_INVALID_CDB_FIELD;
+
+		if (boundary > attrib->atomic_max_boundary)
+			return TCM_INVALID_CDB_FIELD;
+	} else {
+		if (transfer_len > attrib->atomic_max_len)
+			return TCM_INVALID_CDB_FIELD;
+	}
+
+	if (attrib->atomic_granularity) {
+		if (transfer_len % attrib->atomic_granularity)
+			return TCM_INVALID_CDB_FIELD;
+
+		if (boundary && boundary % attrib->atomic_granularity)
+			return TCM_INVALID_CDB_FIELD;
+	}
+
+	if (dev->dev_attrib.atomic_alignment) {
+		u64 _lba = lba;
+
+		if (do_div(_lba, dev->dev_attrib.atomic_alignment))
+			return TCM_INVALID_CDB_FIELD;
+	}
+
+	return 0;
+}
+
 sense_reason_t
 sbc_parse_cdb(struct se_cmd *cmd, struct exec_cmd_ops *ops)
 {
@@ -861,6 +904,7 @@ sbc_parse_cdb(struct se_cmd *cmd, struct exec_cmd_ops *ops)
 		break;
 	case WRITE_16:
 	case WRITE_VERIFY_16:
+	case WRITE_ATOMIC_16:
 		sectors = transport_get_sectors_16(cdb);
 		cmd->t_task_lba = transport_lba_64(cdb);
 
@@ -872,6 +916,13 @@ sbc_parse_cdb(struct se_cmd *cmd, struct exec_cmd_ops *ops)
 			return ret;
 
 		cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB;
+		if (cdb[0] == WRITE_ATOMIC_16) {
+			cmd->se_cmd_flags |= SCF_ATOMIC;
+
+			ret = sbc_check_atomic(dev, cmd, cdb);
+			if (ret)
+				return ret;
+		}
 		cmd->execute_cmd = sbc_execute_rw;
 		break;
 	case VARIABLE_LENGTH_CMD:
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 70ece58d3078..56333b5726c8 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -158,6 +158,7 @@ enum se_cmd_flags_table {
 	SCF_TASK_ATTR_SET			= (1 << 17),
 	SCF_TREAT_READ_AS_NORMAL		= (1 << 18),
 	SCF_TASK_ORDERED_SYNC			= (1 << 19),
+	SCF_ATOMIC				= (1 << 20),
 };
 
 /*
-- 
cgit v1.2.3


From 95aa2041c654161d1b5c1eca5379d67d91ef1cf2 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Wed, 17 Sep 2025 17:12:53 -0500
Subject: scsi: target: Fix LUN/device R/W and total command stats

In commit 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU")
I saw we sometimes use %u and also misread the spec. As a result I
thought all the stats were supposed to be 32-bit only. However, for the
majority of cases we support currently, the spec specifies 64-bit
stats. This patch converts the stats changed in the commit above to u64.

Fixes: 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://patch.msgid.link/20250917221338.14813-2-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_stat.c | 24 ++++++++++++------------
 include/target/target_core_base.h | 12 ++++++------
 2 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 6bdf2d8bd694..4fdc307ea38b 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -282,7 +282,7 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
 	struct se_device *dev = to_stat_lu_dev(item);
 	struct se_dev_io_stats *stats;
 	unsigned int cpu;
-	u32 cmds = 0;
+	u64 cmds = 0;
 
 	for_each_possible_cpu(cpu) {
 		stats = per_cpu_ptr(dev->stats, cpu);
@@ -290,7 +290,7 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
 	}
 
 	/* scsiLuNumCommands */
-	return snprintf(page, PAGE_SIZE, "%u\n", cmds);
+	return snprintf(page, PAGE_SIZE, "%llu\n", cmds);
 }
 
 static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
@@ -299,7 +299,7 @@ static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
 	struct se_device *dev = to_stat_lu_dev(item);
 	struct se_dev_io_stats *stats;
 	unsigned int cpu;
-	u32 bytes = 0;
+	u64 bytes = 0;
 
 	for_each_possible_cpu(cpu) {
 		stats = per_cpu_ptr(dev->stats, cpu);
@@ -307,7 +307,7 @@ static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
 	}
 
 	/* scsiLuReadMegaBytes */
-	return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+	return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
@@ -316,7 +316,7 @@ static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
 	struct se_device *dev = to_stat_lu_dev(item);
 	struct se_dev_io_stats *stats;
 	unsigned int cpu;
-	u32 bytes = 0;
+	u64 bytes = 0;
 
 	for_each_possible_cpu(cpu) {
 		stats = per_cpu_ptr(dev->stats, cpu);
@@ -324,7 +324,7 @@ static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
 	}
 
 	/* scsiLuWrittenMegaBytes */
-	return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+	return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
@@ -1044,7 +1044,7 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
 	struct se_dev_entry *deve;
 	unsigned int cpu;
 	ssize_t ret;
-	u32 cmds = 0;
+	u64 cmds = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1059,7 +1059,7 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
 	}
 
 	/* scsiAuthIntrOutCommands */
-	ret = snprintf(page, PAGE_SIZE, "%u\n", cmds);
+	ret = snprintf(page, PAGE_SIZE, "%llu\n", cmds);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1073,7 +1073,7 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
 	struct se_dev_entry *deve;
 	unsigned int cpu;
 	ssize_t ret;
-	u32 bytes = 0;
+	u64 bytes = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1088,7 +1088,7 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
 	}
 
 	/* scsiAuthIntrReadMegaBytes */
-	ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+	ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1102,7 +1102,7 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
 	struct se_dev_entry *deve;
 	unsigned int cpu;
 	ssize_t ret;
-	u32 bytes = 0;
+	u64 bytes = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1117,7 +1117,7 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
 	}
 
 	/* scsiAuthIntrWrittenMegaBytes */
-	ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+	ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..27e1f9d5f0c6 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -671,9 +671,9 @@ struct se_lun_acl {
 };
 
 struct se_dev_entry_io_stats {
-	u32			total_cmds;
-	u32			read_bytes;
-	u32			write_bytes;
+	u64			total_cmds;
+	u64			read_bytes;
+	u64			write_bytes;
 };
 
 struct se_dev_entry {
@@ -806,9 +806,9 @@ struct se_device_queue {
 };
 
 struct se_dev_io_stats {
-	u32			total_cmds;
-	u32			read_bytes;
-	u32			write_bytes;
+	u64			total_cmds;
+	u64			read_bytes;
+	u64			write_bytes;
 };
 
 struct se_device {
-- 
cgit v1.2.3


From bbb490053173b737604a87af03f2113fb1c279a0 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Wed, 17 Sep 2025 17:12:55 -0500
Subject: scsi: target: Move LUN stats to per-CPU

The atomic use in the main I/O path is causing perf issues when using
higher performance backend devices and multiple queues (more than
10 when using vhost-scsi) like with this fio workload:

[global]
bs=4K
iodepth=128
direct=1
ioengine=libaio
group_reporting
time_based
runtime=120
name=standard-iops
rw=randread
numjobs=16
cpus_allowed=0-15

To fix this issue, move the LUN stats to per CPU.

Note: I forgot to include this patch with the delayed/ordered per CPU
tracking and per device/device entry per CPU stats. With this patch you
get the full 33% improvements when using fast backends, multiple queues
and multiple IO submitters.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://patch.msgid.link/20250917221338.14813-4-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_device.c          |  1 +
 drivers/target/target_core_fabric_configfs.c |  2 +-
 drivers/target/target_core_internal.h        |  1 +
 drivers/target/target_core_stat.c            | 67 +++++++++-------------------
 drivers/target/target_core_tpg.c             | 23 +++++++++-
 drivers/target/target_core_transport.c       | 22 ++++++---
 include/target/target_core_base.h            |  8 ++--
 7 files changed, 65 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 7bb711b24c0d..2d4a7c0c69ce 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 	dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
 	dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;
 
+	/* Skip allocating lun_stats since we can't export them. */
 	xcopy_lun = &dev->xcopy_lun;
 	rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
 	init_completion(&xcopy_lun->lun_shutdown_comp);
diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
 	struct se_lun *lun = container_of(to_config_group(item),
 					  struct se_lun, lun_group);
 
-	kfree_rcu(lun, rcu_head);
+	call_rcu(&lun->rcu_head, target_tpg_free_lun);
 }
 
 static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 20aab1f50565..763e6d26e187 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void	core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
 				  struct se_lun *);
 void	core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
 struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void	target_tpg_free_lun(struct rcu_head *head);
 int	core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
 		bool, struct se_device *);
 void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index e29d43dacaf7..083205052be2 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
 	return ret;
 }
 
-static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
-		char *page)
-{
-	struct se_lun *lun = to_stat_tgt_port(item);
-	struct se_device *dev;
-	ssize_t ret = -ENODEV;
-
-	rcu_read_lock();
-	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%lu\n",
-			       atomic_long_read(&lun->lun_stats.cmd_pdus));
-	rcu_read_unlock();
-	return ret;
-}
-
-static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
-		char *page)
-{
-	struct se_lun *lun = to_stat_tgt_port(item);
-	struct se_device *dev;
-	ssize_t ret = -ENODEV;
-
-	rcu_read_lock();
-	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-			(u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
-	rcu_read_unlock();
-	return ret;
+#define tgt_port_show_per_cpu_stat(prefix, field, shift)		\
+per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift);		\
+static ssize_t								\
+target_stat_##prefix##_show(struct config_item *item, char *page)	\
+{									\
+	struct se_lun *lun = to_stat_tgt_port(item);			\
+	struct se_device *dev;						\
+	int ret;							\
+									\
+	rcu_read_lock();						\
+	dev = rcu_dereference(lun->lun_se_dev);				\
+	if (!dev) {							\
+		rcu_read_unlock();					\
+		return -ENODEV;						\
+	}								\
+									\
+	ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page);	\
+	rcu_read_unlock();						\
+	return ret;							\
 }
 
-static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
-		char *page)
-{
-	struct se_lun *lun = to_stat_tgt_port(item);
-	struct se_device *dev;
-	ssize_t ret = -ENODEV;
-
-	rcu_read_lock();
-	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-				(u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
-	rcu_read_unlock();
-	return ret;
-}
+tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
+tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
+tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);
 
 static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
 		char *page)
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..8b5ad50baa43 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -548,7 +548,7 @@ int core_tpg_register(
 		ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
 				true, g_lun0_dev);
 		if (ret < 0) {
-			kfree(se_tpg->tpg_virt_lun0);
+			target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
 			return ret;
 		}
 	}
@@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
 
 	if (se_tpg->proto_id >= 0) {
 		core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
-		kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
+		call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
 	}
 
 	target_tpg_deregister_rtpi(se_tpg);
@@ -615,6 +615,13 @@ struct se_lun *core_tpg_alloc_lun(
 		pr_err("Unable to allocate se_lun memory\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+	if (!lun->lun_stats) {
+		pr_err("Unable to allocate se_lun stats memory\n");
+		goto free_lun;
+	}
+
 	lun->unpacked_lun = unpacked_lun;
 	atomic_set(&lun->lun_acl_count, 0);
 	init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +635,18 @@ struct se_lun *core_tpg_alloc_lun(
 	lun->lun_tpg = tpg;
 
 	return lun;
+
+free_lun:
+	kfree(lun);
+	return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+	struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+	free_percpu(lun->lun_stats);
+	kfree(lun);
 }
 
 int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..fca9b44288bc 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,12 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
 		return ret;
 
 	cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
-	atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+	/*
+	 * If this is the xcopy_lun then we won't have lun_stats since we
+	 * can't export them.
+	 */
+	if (cmd->se_lun->lun_stats)
+		this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
 	return 0;
 }
 EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2602,9 @@ queue_rsp:
 		    !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
 			goto queue_status;
 
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.tx_data_octets);
+		if (cmd->se_lun->lun_stats)
+			this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+				     cmd->data_length);
 		/*
 		 * Perform READ_STRIP of PI using software emulation when
 		 * backend had PI enabled, if the transport will not be
@@ -2621,14 +2627,16 @@ queue_rsp:
 			goto queue_full;
 		break;
 	case DMA_TO_DEVICE:
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.rx_data_octets);
+		if (cmd->se_lun->lun_stats)
+			this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+				     cmd->data_length);
 		/*
 		 * Check if we need to send READ payload for BIDI-COMMAND
 		 */
 		if (cmd->se_cmd_flags & SCF_BIDI) {
-			atomic_long_add(cmd->data_length,
-					&cmd->se_lun->lun_stats.tx_data_octets);
+			if (cmd->se_lun->lun_stats)
+				this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+					     cmd->data_length);
 			ret = cmd->se_tfo->queue_data_in(cmd);
 			if (ret)
 				goto queue_full;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 27e1f9d5f0c6..372da2eadf54 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
 };
 
 struct scsi_port_stats {
-	atomic_long_t	cmd_pdus;
-	atomic_long_t	tx_data_octets;
-	atomic_long_t	rx_data_octets;
+	u64			cmd_pdus;
+	u64			tx_data_octets;
+	u64			rx_data_octets;
 };
 
 struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
 	spinlock_t		lun_tg_pt_gp_lock;
 
 	struct se_portal_group	*lun_tpg;
-	struct scsi_port_stats	lun_stats;
+	struct scsi_port_stats	__percpu *lun_stats;
 	struct config_group	lun_group;
 	struct se_port_stat_grps port_stat_grps;
 	struct completion	lun_shutdown_comp;
-- 
cgit v1.2.3


From e21d451a82f39e91b7635c4fc3ff5ac082873ec3 Mon Sep 17 00:00:00 2001
From: Pierre Barre <pierre@barre.sh>
Date: Thu, 16 Oct 2025 15:58:36 +0200
Subject: 9p: Use kvmalloc for message buffers on supported transports

While developing a 9P server (https://github.com/Barre/ZeroFS) and
testing it under high load, I was running into allocation failures.
The failures occur even with plenty of free memory available because
kmalloc requires contiguous physical memory.

This results in errors like:
ls: page allocation failure: order:7, mode:0x40c40(GFP_NOFS|__GFP_COMP)

This patch introduces a transport capability flag (supports_vmalloc)
that indicates whether a transport can work with vmalloc'd buffers
(non-physically contiguous memory). Transports requiring DMA should
leave this flag as false.

The fd-based transports (tcp, unix, fd) set this flag to true, and
p9_fcall_init will use kvmalloc instead of kmalloc for these
transports. This allows the allocator to fall back to vmalloc when
contiguous physical memory is not available.

Additionally, if kmem_cache_alloc fails, the code falls back to
kvmalloc for transports that support it.

Signed-off-by: Pierre Barre <pierre@barre.sh>
Reviewed-by: Christian Schoenebeck <linux_oss@crudebyte.com>
Message-ID: <d2017c29-11fb-44a5-bd0f-4204329bbefb@app.fastmail.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/transport.h |  4 ++++
 net/9p/client.c            | 11 +++++++++--
 net/9p/trans_fd.c          |  3 +++
 net/9p/trans_rdma.c        |  1 +
 net/9p/trans_usbg.c        |  1 +
 net/9p/trans_virtio.c      |  1 +
 net/9p/trans_xen.c         |  1 +
 7 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index 766ec07c9599..f0981515148d 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -24,6 +24,9 @@
  *                   we're less flexible when choosing the response message
  *                   size in this case
  * @def: set if this transport should be considered the default
+ * @supports_vmalloc: set if this transport can work with vmalloc'd buffers
+ *                    (non-physically contiguous memory). Transports requiring
+ *                    DMA should leave this as false.
  * @create: member function to create a new connection on this transport
  * @close: member function to discard a connection on this transport
  * @request: member function to issue a request to the transport
@@ -44,6 +47,7 @@ struct p9_trans_module {
 	int maxsize;		/* max message size of transport */
 	bool pooled_rbuffers;
 	int def;		/* this transport should be default */
+	bool supports_vmalloc;	/* can work with vmalloc'd buffers */
 	struct module *owner;
 	int (*create)(struct p9_client *client,
 		      const char *devname, char *args);
diff --git a/net/9p/client.c b/net/9p/client.c
index 5c1ca57ccd28..2a4884c880c1 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -229,8 +229,15 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
 	if (likely(c->fcall_cache) && alloc_msize == c->msize) {
 		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
 		fc->cache = c->fcall_cache;
+		if (!fc->sdata && c->trans_mod->supports_vmalloc) {
+			fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+			fc->cache = NULL;
+		}
 	} else {
-		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
+		if (c->trans_mod->supports_vmalloc)
+			fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+		else
+			fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
 		fc->cache = NULL;
 	}
 	if (!fc->sdata)
@@ -252,7 +259,7 @@ void p9_fcall_fini(struct p9_fcall *fc)
 	if (fc->cache)
 		kmem_cache_free(fc->cache, fc->sdata);
 	else
-		kfree(fc->sdata);
+		kvfree(fc->sdata);
 }
 EXPORT_SYMBOL(p9_fcall_fini);
 
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index a516745f732f..e7334033eba5 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -1101,6 +1101,7 @@ static struct p9_trans_module p9_tcp_trans = {
 	.maxsize = MAX_SOCK_BUF,
 	.pooled_rbuffers = false,
 	.def = 0,
+	.supports_vmalloc = true,
 	.create = p9_fd_create_tcp,
 	.close = p9_fd_close,
 	.request = p9_fd_request,
@@ -1115,6 +1116,7 @@ static struct p9_trans_module p9_unix_trans = {
 	.name = "unix",
 	.maxsize = MAX_SOCK_BUF,
 	.def = 0,
+	.supports_vmalloc = true,
 	.create = p9_fd_create_unix,
 	.close = p9_fd_close,
 	.request = p9_fd_request,
@@ -1129,6 +1131,7 @@ static struct p9_trans_module p9_fd_trans = {
 	.name = "fd",
 	.maxsize = MAX_SOCK_BUF,
 	.def = 0,
+	.supports_vmalloc = true,
 	.create = p9_fd_create,
 	.close = p9_fd_close,
 	.request = p9_fd_request,
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index b84748baf9cb..6c5ad232c194 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -749,6 +749,7 @@ static struct p9_trans_module p9_rdma_trans = {
 	.maxsize = P9_RDMA_MAXSIZE,
 	.pooled_rbuffers = true,
 	.def = 0,
+	.supports_vmalloc = false,
 	.owner = THIS_MODULE,
 	.create = rdma_create_trans,
 	.close = rdma_close,
diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c
index 468f7e8f0277..2542ef099233 100644
--- a/net/9p/trans_usbg.c
+++ b/net/9p/trans_usbg.c
@@ -514,6 +514,7 @@ static struct p9_trans_module p9_usbg_trans = {
 	.close = p9_usbg_close,
 	.request = p9_usbg_request,
 	.cancel = p9_usbg_cancel,
+	.supports_vmalloc = false,
 	.owner = THIS_MODULE,
 };
 
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 0b8086f58ad5..12806207f4f0 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -803,6 +803,7 @@ static struct p9_trans_module p9_virtio_trans = {
 	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
 	.pooled_rbuffers = false,
 	.def = 1,
+	.supports_vmalloc = false,
 	.owner = THIS_MODULE,
 };
 
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index b9ff69c7522a..4b1cec0ab829 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -258,6 +258,7 @@ static struct p9_trans_module p9_xen_trans = {
 	.maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
 	.pooled_rbuffers = false,
 	.def = 1,
+	.supports_vmalloc = false,
 	.create = p9_xen_create,
 	.close = p9_xen_close,
 	.request = p9_xen_request,
-- 
cgit v1.2.3


From eeaf38a798aff6384983e5a0ac464d146de7ff55 Mon Sep 17 00:00:00 2001
From: Dominique Martinet <asmadeus@codewreck.org>
Date: Fri, 31 Oct 2025 16:40:07 +0900
Subject: net/9p: cleanup: change p9_trans_module->def to bool

'->def' is only ever used as a true/false flag

Reported-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Message-ID: <20251103-v9fs_trans_def_bool-v1-1-f33dc7ed9e81@codewreck.org>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/transport.h | 2 +-
 net/9p/trans_fd.c          | 6 +++---
 net/9p/trans_rdma.c        | 2 +-
 net/9p/trans_virtio.c      | 2 +-
 net/9p/trans_xen.c         | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index f0981515148d..0aedabc9b7eb 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -46,7 +46,7 @@ struct p9_trans_module {
 	char *name;		/* name of transport */
 	int maxsize;		/* max message size of transport */
 	bool pooled_rbuffers;
-	int def;		/* this transport should be default */
+	bool def;		/* this transport should be default */
 	bool supports_vmalloc;	/* can work with vmalloc'd buffers */
 	struct module *owner;
 	int (*create)(struct p9_client *client,
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index e7334033eba5..bd4903d64827 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -1100,7 +1100,7 @@ static struct p9_trans_module p9_tcp_trans = {
 	.name = "tcp",
 	.maxsize = MAX_SOCK_BUF,
 	.pooled_rbuffers = false,
-	.def = 0,
+	.def = false,
 	.supports_vmalloc = true,
 	.create = p9_fd_create_tcp,
 	.close = p9_fd_close,
@@ -1115,7 +1115,7 @@ MODULE_ALIAS_9P("tcp");
 static struct p9_trans_module p9_unix_trans = {
 	.name = "unix",
 	.maxsize = MAX_SOCK_BUF,
-	.def = 0,
+	.def = false,
 	.supports_vmalloc = true,
 	.create = p9_fd_create_unix,
 	.close = p9_fd_close,
@@ -1130,7 +1130,7 @@ MODULE_ALIAS_9P("unix");
 static struct p9_trans_module p9_fd_trans = {
 	.name = "fd",
 	.maxsize = MAX_SOCK_BUF,
-	.def = 0,
+	.def = false,
 	.supports_vmalloc = true,
 	.create = p9_fd_create,
 	.close = p9_fd_close,
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 6c5ad232c194..a0bc766199da 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -748,7 +748,7 @@ static struct p9_trans_module p9_rdma_trans = {
 	.name = "rdma",
 	.maxsize = P9_RDMA_MAXSIZE,
 	.pooled_rbuffers = true,
-	.def = 0,
+	.def = false,
 	.supports_vmalloc = false,
 	.owner = THIS_MODULE,
 	.create = rdma_create_trans,
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 12806207f4f0..b58f50b00c72 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -802,7 +802,7 @@ static struct p9_trans_module p9_virtio_trans = {
 	 */
 	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
 	.pooled_rbuffers = false,
-	.def = 1,
+	.def = true,
 	.supports_vmalloc = false,
 	.owner = THIS_MODULE,
 };
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 4b1cec0ab829..9389c1247001 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -257,7 +257,7 @@ static struct p9_trans_module p9_xen_trans = {
 	.name = "xen",
 	.maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
 	.pooled_rbuffers = false,
-	.def = 1,
+	.def = true,
 	.supports_vmalloc = false,
 	.create = p9_xen_create,
 	.close = p9_xen_close,
-- 
cgit v1.2.3


From 695f2ca1b4247724576d57eae7b74b90dc69ba3c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 10 Oct 2025 16:36:16 -0500
Subject: fs/fs_parse: add back fsparam_u32hex

296b67059 removed fsparam_u32hex because there were no callers
(yet) and it didn't build due to using the nonexistent symbol
fs_param_is_u32_hex.

fs/9p will need this parser, so add it back with the appropriate
fix (use fs_param_is_u32).

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Message-ID: <20251010214222.1347785-2-sandeen@redhat.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/linux/fs_parser.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 5a0e897cae80..5e8a3b546033 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -120,6 +120,8 @@ static inline bool fs_validate_description(const char *name,
 #define fsparam_u32(NAME, OPT)	__fsparam(fs_param_is_u32, NAME, OPT, 0, NULL)
 #define fsparam_u32oct(NAME, OPT) \
 			__fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8)
+#define fsparam_u32hex(NAME, OPT) \
+			__fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)16)
 #define fsparam_s32(NAME, OPT)	__fsparam(fs_param_is_s32, NAME, OPT, 0, NULL)
 #define fsparam_u64(NAME, OPT)	__fsparam(fs_param_is_u64, NAME, OPT, 0, NULL)
 #define fsparam_enum(NAME, OPT, array)	__fsparam(fs_param_is_enum, NAME, OPT, 0, array)
-- 
cgit v1.2.3


From c44393d84149d6fc91d94fa39321c9657e91b388 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 10 Oct 2025 16:36:17 -0500
Subject: net/9p: move structures and macros to header files

With the new mount API all option parsing will need to happen
in fs/9p/v9fs.c, so move some existing data structures and macros
to header files to facilitate this. Rename some to reflect
the transport they are used for (rdma, fd, etc), for clarity.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Message-ID: <20251010214222.1347785-3-sandeen@redhat.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/client.h    |  6 ++++++
 include/net/9p/transport.h | 39 +++++++++++++++++++++++++++++++++++++++
 net/9p/client.c            |  6 ------
 net/9p/trans_fd.c          | 20 ++------------------
 net/9p/trans_rdma.c        | 25 ++-----------------------
 5 files changed, 49 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 4f785098c67a..2d46f8017bd5 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -16,6 +16,12 @@
 /* Number of requests per row */
 #define P9_ROW_MAXTAG 255
 
+/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ +
+ * room for write (16 extra) or read (11 extra) operands.
+ */
+
+#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ)
+
 /** enum p9_proto_versions - 9P protocol versions
  * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u
  * @p9_proto_2000u: 9P2000.u extension
diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index 0aedabc9b7eb..db6ad369a171 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -14,6 +14,45 @@
 #define P9_DEF_MIN_RESVPORT	(665U)
 #define P9_DEF_MAX_RESVPORT	(1023U)
 
+#define P9_FD_PORT 564
+
+#define P9_RDMA_PORT		5640
+#define P9_RDMA_SQ_DEPTH	32
+#define P9_RDMA_RQ_DEPTH	32
+#define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
+
+/**
+ * struct p9_fd_opts - per-transport options for fd transport
+ * @rfd: file descriptor for reading (trans=fd)
+ * @wfd: file descriptor for writing (trans=fd)
+ * @port: port to connect to (trans=tcp)
+ * @privport: port is privileged
+ */
+
+struct p9_fd_opts {
+	int rfd;
+	int wfd;
+	u16 port;
+	bool privport;
+};
+
+/**
+ * struct p9_rdma_opts - Collection of mount options for rdma transport
+ * @port: port of connection
+ * @privport: Whether a privileged port may be used
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+	short port;
+	bool privport;
+	int sq_depth;
+	int rq_depth;
+	long timeout;
+};
+
 /**
  * struct p9_trans_module - transport module interface
  * @list: used to maintain a list of currently available transports
diff --git a/net/9p/client.c b/net/9p/client.c
index 2a4884c880c1..802f548332a5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -29,12 +29,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/9p.h>
 
-/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ +
- * room for write (16 extra) or read (11 extra) operands.
- */
-
-#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ)
-
 /* Client Option Parsing (code inspired by NFS code)
  *  - a little lazy - parse all client options
  */
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index bd4903d64827..b7e5933c4617 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -31,28 +31,12 @@
 
 #include <linux/syscalls.h> /* killme */
 
-#define P9_PORT 564
 #define MAX_SOCK_BUF (1024*1024)
 #define MAXPOLLWADDR	2
 
 static struct p9_trans_module p9_tcp_trans;
 static struct p9_trans_module p9_fd_trans;
 
-/**
- * struct p9_fd_opts - per-transport options
- * @rfd: file descriptor for reading (trans=fd)
- * @wfd: file descriptor for writing (trans=fd)
- * @port: port to connect to (trans=tcp)
- * @privport: port is privileged
- */
-
-struct p9_fd_opts {
-	int rfd;
-	int wfd;
-	u16 port;
-	bool privport;
-};
-
 /*
   * Option Parsing (code inspired by NFS code)
   *  - a little lazy - parse all fd-transport options
@@ -742,7 +726,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
 static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
 {
 	if (clnt->trans_mod == &p9_tcp_trans) {
-		if (clnt->trans_opts.tcp.port != P9_PORT)
+		if (clnt->trans_opts.tcp.port != P9_FD_PORT)
 			seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port);
 	} else if (clnt->trans_mod == &p9_fd_trans) {
 		if (clnt->trans_opts.fd.rfd != ~0)
@@ -768,7 +752,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
 	int option;
 	char *options, *tmp_options;
 
-	opts->port = P9_PORT;
+	opts->port = P9_FD_PORT;
 	opts->rfd = ~0;
 	opts->wfd = ~0;
 	opts->privport = false;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index a0bc766199da..87246463a954 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -32,14 +32,10 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 
-#define P9_PORT			5640
-#define P9_RDMA_SQ_DEPTH	32
-#define P9_RDMA_RQ_DEPTH	32
 #define P9_RDMA_SEND_SGE	4
 #define P9_RDMA_RECV_SGE	4
 #define P9_RDMA_IRD		0
 #define P9_RDMA_ORD		0
-#define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
 #define P9_RDMA_MAXSIZE		(1024*1024)	/* 1MB */
 
 /**
@@ -110,23 +106,6 @@ struct p9_rdma_context {
 	};
 };
 
-/**
- * struct p9_rdma_opts - Collection of mount options
- * @port: port of connection
- * @privport: Whether a privileged port may be used
- * @sq_depth: The requested depth of the SQ. This really doesn't need
- * to be any deeper than the number of threads used in the client
- * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
- * @timeout: Time to wait in msecs for CM events
- */
-struct p9_rdma_opts {
-	short port;
-	bool privport;
-	int sq_depth;
-	int rq_depth;
-	long timeout;
-};
-
 /*
  * Option Parsing (code inspired by NFS code)
  */
@@ -151,7 +130,7 @@ static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
 {
 	struct p9_trans_rdma *rdma = clnt->trans;
 
-	if (rdma->port != P9_PORT)
+	if (rdma->port != P9_RDMA_PORT)
 		seq_printf(m, ",port=%u", rdma->port);
 	if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
 		seq_printf(m, ",sq=%u", rdma->sq_depth);
@@ -178,7 +157,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 	int option;
 	char *options, *tmp_options;
 
-	opts->port = P9_PORT;
+	opts->port = P9_RDMA_PORT;
 	opts->sq_depth = P9_RDMA_SQ_DEPTH;
 	opts->rq_depth = P9_RDMA_RQ_DEPTH;
 	opts->timeout = P9_RDMA_TIMEOUT;
-- 
cgit v1.2.3


From 075e8bd4127f007910fc302ad5c3c471d0be4799 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 10 Oct 2025 16:36:18 -0500
Subject: 9p: create a v9fs_context structure to hold parsed options

This patch creates a new v9fs_context structure which includes
new p9_session_opts and p9_client_opts structures, as well as
re-using the existing p9_fd_opts and p9_rdma_opts to store options
during parsing. The new structure will be used in the next
commit to pass all parsed options to the appropriate transports.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Message-ID: <20251010214222.1347785-4-sandeen@redhat.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 include/net/9p/client.h    | 90 ++++++++++++++++++++++++++++++++++++++++++++++
 include/net/9p/transport.h | 32 -----------------
 2 files changed, 90 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 2d46f8017bd5..cc18443f7d51 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -132,6 +132,96 @@ struct p9_client {
 	char name[__NEW_UTS_LEN + 1];
 };
 
+/**
+ * struct p9_client_opts - holds client options during parsing
+ * @msize: maximum data size negotiated by protocol
+ * @proto_version: 9P protocol version to use
+ * @trans_mod: module API instantiated with this client
+ *
+ * These parsed options get transferred into client in
+ * apply_client_options()
+ */
+struct p9_client_opts {
+	unsigned int msize;
+	unsigned char proto_version;
+	struct p9_trans_module *trans_mod;
+};
+
+/**
+ * struct p9_fd_opts - per-transport options for fd transport
+ * @rfd: file descriptor for reading (trans=fd)
+ * @wfd: file descriptor for writing (trans=fd)
+ * @port: port to connect to (trans=tcp)
+ * @privport: port is privileged
+ */
+struct p9_fd_opts {
+	int rfd;
+	int wfd;
+	u16 port;
+	bool privport;
+};
+
+/**
+ * struct p9_rdma_opts - Collection of mount options for rdma transport
+ * @port: port of connection
+ * @privport: Whether a privileged port may be used
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+	short port;
+	bool privport;
+	int sq_depth;
+	int rq_depth;
+	long timeout;
+};
+
+/**
+ * struct p9_session_opts - holds parsed options for v9fs_session_info
+ * @flags: session options of type &p9_session_flags
+ * @nodev: set to 1 to disable device mapping
+ * @debug: debug level
+ * @afid: authentication handle
+ * @cache: cache mode of type &p9_cache_bits
+ * @cachetag: the tag of the cache associated with this session
+ * @uname: string user name to mount hierarchy as
+ * @aname: mount specifier for remote hierarchy
+ * @dfltuid: default numeric userid to mount hierarchy as
+ * @dfltgid: default numeric groupid to mount hierarchy as
+ * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
+ * @session_lock_timeout: retry interval for blocking locks
+ *
+ * This structure holds options which are parsed and will be transferred
+ * to the v9fs_session_info structure when mounted, and therefore largely
+ * duplicates struct v9fs_session_info.
+ */
+struct p9_session_opts {
+	unsigned int flags;
+	unsigned char nodev;
+	unsigned short debug;
+	unsigned int afid;
+	unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+	char *cachetag;
+#endif
+	char *uname;
+	char *aname;
+	kuid_t dfltuid;
+	kgid_t dfltgid;
+	kuid_t uid;
+	long session_lock_timeout;
+};
+
+/* Used by mount API to store parsed mount options */
+struct v9fs_context {
+	struct p9_client_opts	client_opts;
+	struct p9_fd_opts	fd_opts;
+	struct p9_rdma_opts	rdma_opts;
+	struct p9_session_opts	session_opts;
+};
+
 /**
  * struct p9_fid - file system entity handle
  * @clnt: back pointer to instantiating &p9_client
diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index db6ad369a171..898a432a8063 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -21,38 +21,6 @@
 #define P9_RDMA_RQ_DEPTH	32
 #define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
 
-/**
- * struct p9_fd_opts - per-transport options for fd transport
- * @rfd: file descriptor for reading (trans=fd)
- * @wfd: file descriptor for writing (trans=fd)
- * @port: port to connect to (trans=tcp)
- * @privport: port is privileged
- */
-
-struct p9_fd_opts {
-	int rfd;
-	int wfd;
-	u16 port;
-	bool privport;
-};
-
-/**
- * struct p9_rdma_opts - Collection of mount options for rdma transport
- * @port: port of connection
- * @privport: Whether a privileged port may be used
- * @sq_depth: The requested depth of the SQ. This really doesn't need
- * to be any deeper than the number of threads used in the client
- * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
- * @timeout: Time to wait in msecs for CM events
- */
-struct p9_rdma_opts {
-	short port;
-	bool privport;
-	int sq_depth;
-	int rq_depth;
-	long timeout;
-};
-
 /**
  * struct p9_trans_module - transport module interface
  * @list: used to maintain a list of currently available transports
-- 
cgit v1.2.3


From 1f3e4142c0eb178089ea0cbc97506a061470ad27 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 10 Oct 2025 16:36:19 -0500
Subject: 9p: convert to the new mount API

Convert 9p to the new mount API. This patch consolidates all parsing
into fs/9p/v9fs.c, which stores all results into a filesystem context
which can be passed to the various transports as needed.

Some of the parsing helper functions such as get_cache_mode() have been
eliminated in favor of using the new mount API's enum param type,
for simplicity.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Message-ID: <20251010214222.1347785-5-sandeen@redhat.com>
[ Dominique: handled source explicitly as per follow-up discussion ]
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 fs/9p/v9fs.c               | 552 +++++++++++++++++++++++----------------------
 fs/9p/v9fs.h               |   7 +-
 fs/9p/vfs_super.c          | 130 +++++++----
 include/net/9p/client.h    |   2 +-
 include/net/9p/transport.h |   2 +-
 net/9p/client.c            | 148 +-----------
 net/9p/mod.c               |   2 +-
 net/9p/trans_fd.c          | 109 +--------
 net/9p/trans_rdma.c        | 108 +--------
 net/9p/trans_usbg.c        |   4 +-
 net/9p/trans_virtio.c      |   8 +-
 net/9p/trans_xen.c         |   4 +-
 12 files changed, 424 insertions(+), 652 deletions(-)

(limited to 'include')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a020a8f00a1a..05fc2ba3c5d4 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -13,7 +13,8 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/cred.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <net/9p/9p.h>
@@ -33,6 +34,10 @@ struct kmem_cache *v9fs_inode_cache;
  */
 
 enum {
+	/* Mount-point source, we need to handle this explicitly because
+	 * the code below accepts unknown args and the vfs layer only handles
+	 * source if we rejected it with -ENOPARAM */
+	Opt_source,
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
@@ -43,55 +48,81 @@ enum {
 	Opt_access, Opt_posixacl,
 	/* Lock timeout option */
 	Opt_locktimeout,
-	/* Error token */
-	Opt_err
+
+	/* Client options */
+	Opt_msize, Opt_trans, Opt_legacy, Opt_version,
+
+	/* fd transport options */
+	/* Options that take integer arguments */
+	Opt_rfdno, Opt_wfdno,
+	/* Options that take no arguments */
+
+	/* rdma transport options */
+	/* Options that take integer arguments */
+	Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+
+	/* Options for both fd and rdma transports */
+	Opt_port, Opt_privport,
 };
 
-static const match_table_t tokens = {
-	{Opt_debug, "debug=%x"},
-	{Opt_dfltuid, "dfltuid=%u"},
-	{Opt_dfltgid, "dfltgid=%u"},
-	{Opt_afid, "afid=%u"},
-	{Opt_uname, "uname=%s"},
-	{Opt_remotename, "aname=%s"},
-	{Opt_nodevmap, "nodevmap"},
-	{Opt_noxattr, "noxattr"},
-	{Opt_directio, "directio"},
-	{Opt_ignoreqv, "ignoreqv"},
-	{Opt_cache, "cache=%s"},
-	{Opt_cachetag, "cachetag=%s"},
-	{Opt_access, "access=%s"},
-	{Opt_posixacl, "posixacl"},
-	{Opt_locktimeout, "locktimeout=%u"},
-	{Opt_err, NULL}
+static const struct constant_table p9_versions[] = {
+	{ "9p2000",	p9_proto_legacy },
+	{ "9p2000.u",	p9_proto_2000u },
+	{ "9p2000.L",	p9_proto_2000L },
+	{}
 };
 
-/* Interpret mount options for cache mode */
-static int get_cache_mode(char *s)
-{
-	int version = -EINVAL;
-
-	if (!strcmp(s, "loose")) {
-		version = CACHE_SC_LOOSE;
-		p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
-	} else if (!strcmp(s, "fscache")) {
-		version = CACHE_SC_FSCACHE;
-		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
-	} else if (!strcmp(s, "mmap")) {
-		version = CACHE_SC_MMAP;
-		p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
-	} else if (!strcmp(s, "readahead")) {
-		version = CACHE_SC_READAHEAD;
-		p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n");
-	} else if (!strcmp(s, "none")) {
-		version = CACHE_SC_NONE;
-		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
-	} else if (kstrtoint(s, 0, &version) != 0) {
-		version = -EINVAL;
-		pr_info("Unknown Cache mode or invalid value %s\n", s);
-	}
-	return version;
-}
+static const struct constant_table p9_cache_mode[] = {
+	{ "loose",	CACHE_SC_LOOSE },
+	{ "fscache",	CACHE_SC_FSCACHE },
+	{ "mmap",	CACHE_SC_MMAP },
+	{ "readahead",	CACHE_SC_READAHEAD },
+	{ "none",	CACHE_SC_NONE },
+	{}
+};
+
+/*
+ * This structure contains all parameters used for the core code,
+ * the client, and all the transports.
+ */
+const struct fs_parameter_spec v9fs_param_spec[] = {
+	fsparam_string	("source",	Opt_source),
+	fsparam_u32hex	("debug",	Opt_debug),
+	fsparam_uid	("dfltuid",	Opt_dfltuid),
+	fsparam_gid	("dfltgid",	Opt_dfltgid),
+	fsparam_u32	("afid",	Opt_afid),
+	fsparam_string	("uname",	Opt_uname),
+	fsparam_string	("aname",	Opt_remotename),
+	fsparam_flag	("nodevmap",	Opt_nodevmap),
+	fsparam_flag	("noxattr",	Opt_noxattr),
+	fsparam_flag	("directio",	Opt_directio),
+	fsparam_flag	("ignoreqv",	Opt_ignoreqv),
+	fsparam_enum	("cache",	Opt_cache, p9_cache_mode),
+	fsparam_string	("cachetag",	Opt_cachetag),
+	fsparam_string	("access",	Opt_access),
+	fsparam_flag	("posixacl",	Opt_posixacl),
+	fsparam_u32	("locktimeout",	Opt_locktimeout),
+
+	/* client options */
+	fsparam_u32	("msize",	Opt_msize),
+	fsparam_flag	("noextend",	Opt_legacy),
+	fsparam_string	("trans",	Opt_trans),
+	fsparam_enum	("version",	Opt_version, p9_versions),
+
+	/* fd transport options */
+	fsparam_u32	("rfdno",	Opt_rfdno),
+	fsparam_u32	("wfdno",	Opt_wfdno),
+
+	/* rdma transport options */
+	fsparam_u32	("sq",		Opt_sq_depth),
+	fsparam_u32	("rq",		Opt_rq_depth),
+	fsparam_u32	("timeout",	Opt_timeout),
+
+	/* fd and rdma transport options */
+	fsparam_u32	("port",	Opt_port),
+	fsparam_flag	("privport",	Opt_privport),
+	{}
+};
 
 /*
  * Display the mount options in /proc/mounts.
@@ -153,267 +184,252 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 }
 
 /**
- * v9fs_parse_options - parse mount options into session structure
- * @v9ses: existing v9fs session information
- * @opts: The mount option string
+ * v9fs_parse_param - parse a mount option into the filesystem context
+ * @fc: the filesystem context
+ * @param: the parameter to parse
  *
  * Return 0 upon success, -ERRNO upon failure.
  */
-
-static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
+int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *options, *tmp_options;
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-	int option = 0;
+	struct v9fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
 	char *s;
-	int ret = 0;
-
-	/* setup defaults */
-	v9ses->afid = ~0;
-	v9ses->debug = 0;
-	v9ses->cache = CACHE_NONE;
-#ifdef CONFIG_9P_FSCACHE
-	v9ses->cachetag = NULL;
-#endif
-	v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
-
-	if (!opts)
-		return 0;
+	int r;
+	int opt;
+	struct p9_client_opts	*clnt = &ctx->client_opts;
+	struct p9_fd_opts	*fd_opts = &ctx->fd_opts;
+	struct p9_rdma_opts	*rdma_opts = &ctx->rdma_opts;
+	struct p9_session_opts	*session_opts = &ctx->session_opts;
+
+	opt = fs_parse(fc, v9fs_param_spec, param, &result);
+	if (opt < 0) {
+		/*
+		 * We might like to report bad mount options here, but
+		 * traditionally 9p has ignored unknown mount options
+		 */
+		if (opt == -ENOPARAM)
+			return 0;
 
-	tmp_options = kstrdup(opts, GFP_KERNEL);
-	if (!tmp_options) {
-		ret = -ENOMEM;
-		goto fail_option_alloc;
+		return opt;
 	}
-	options = tmp_options;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token, r;
-
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_debug:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-			} else {
-				v9ses->debug = option;
+
+	switch (opt) {
+	case Opt_source:
+		if (fc->source) {
+			pr_info("p9: multiple sources not supported\n");
+			return -EINVAL;
+		}
+		fc->source = param->string;
+		param->string = NULL;
+		break;
+	case Opt_debug:
+		session_opts->debug = result.uint_32;
 #ifdef CONFIG_NET_9P_DEBUG
-				p9_debug_level = option;
+		p9_debug_level = result.uint_32;
 #endif
-			}
-			break;
-
-		case Opt_dfltuid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			v9ses->dfltuid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(v9ses->dfltuid)) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "uid field, but not a uid?\n");
-				ret = -EINVAL;
-			}
-			break;
-		case Opt_dfltgid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			v9ses->dfltgid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(v9ses->dfltgid)) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "gid field, but not a gid?\n");
-				ret = -EINVAL;
-			}
-			break;
-		case Opt_afid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-			} else {
-				v9ses->afid = option;
-			}
-			break;
-		case Opt_uname:
-			kfree(v9ses->uname);
-			v9ses->uname = match_strdup(&args[0]);
-			if (!v9ses->uname) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
-			break;
-		case Opt_remotename:
-			kfree(v9ses->aname);
-			v9ses->aname = match_strdup(&args[0]);
-			if (!v9ses->aname) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
-			break;
-		case Opt_nodevmap:
-			v9ses->nodev = 1;
-			break;
-		case Opt_noxattr:
-			v9ses->flags |= V9FS_NO_XATTR;
-			break;
-		case Opt_directio:
-			v9ses->flags |= V9FS_DIRECT_IO;
-			break;
-		case Opt_ignoreqv:
-			v9ses->flags |= V9FS_IGNORE_QV;
-			break;
-		case Opt_cachetag:
+		break;
+
+	case Opt_dfltuid:
+		session_opts->dfltuid = result.uid;
+		break;
+	case Opt_dfltgid:
+		session_opts->dfltgid = result.gid;
+		break;
+	case Opt_afid:
+		session_opts->afid = result.uint_32;
+		break;
+	case Opt_uname:
+		kfree(session_opts->uname);
+		session_opts->uname = param->string;
+		param->string = NULL;
+		break;
+	case Opt_remotename:
+		kfree(session_opts->aname);
+		session_opts->aname = param->string;
+		param->string = NULL;
+		break;
+	case Opt_nodevmap:
+		session_opts->nodev = 1;
+		break;
+	case Opt_noxattr:
+		session_opts->flags |= V9FS_NO_XATTR;
+		break;
+	case Opt_directio:
+		session_opts->flags |= V9FS_DIRECT_IO;
+		break;
+	case Opt_ignoreqv:
+		session_opts->flags |= V9FS_IGNORE_QV;
+		break;
+	case Opt_cachetag:
 #ifdef CONFIG_9P_FSCACHE
-			kfree(v9ses->cachetag);
-			v9ses->cachetag = match_strdup(&args[0]);
-			if (!v9ses->cachetag) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
+		kfree(session_opts->cachetag);
+		session_opts->cachetag = param->string;
+		param->string = NULL;
 #endif
-			break;
-		case Opt_cache:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of cache arg\n");
-				goto free_and_return;
-			}
-			r = get_cache_mode(s);
-			if (r < 0)
-				ret = r;
-			else
-				v9ses->cache = r;
-
-			kfree(s);
-			break;
-
-		case Opt_access:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of access arg\n");
-				goto free_and_return;
+		break;
+	case Opt_cache:
+		session_opts->cache = result.uint_32;
+		p9_debug(P9_DEBUG_9P, "Cache mode: %s\n", param->string);
+		break;
+	case Opt_access:
+		s = param->string;
+		session_opts->flags &= ~V9FS_ACCESS_MASK;
+		if (strcmp(s, "user") == 0) {
+			session_opts->flags |= V9FS_ACCESS_USER;
+		} else if (strcmp(s, "any") == 0) {
+			session_opts->flags |= V9FS_ACCESS_ANY;
+		} else if (strcmp(s, "client") == 0) {
+			session_opts->flags |= V9FS_ACCESS_CLIENT;
+		} else {
+			uid_t uid;
+
+			session_opts->flags |= V9FS_ACCESS_SINGLE;
+			r = kstrtouint(s, 10, &uid);
+			if (r) {
+				pr_info("Unknown access argument %s: %d\n",
+					param->string, r);
+				return r;
 			}
-
-			v9ses->flags &= ~V9FS_ACCESS_MASK;
-			if (strcmp(s, "user") == 0)
-				v9ses->flags |= V9FS_ACCESS_USER;
-			else if (strcmp(s, "any") == 0)
-				v9ses->flags |= V9FS_ACCESS_ANY;
-			else if (strcmp(s, "client") == 0) {
-				v9ses->flags |= V9FS_ACCESS_CLIENT;
-			} else {
-				uid_t uid;
-
-				v9ses->flags |= V9FS_ACCESS_SINGLE;
-				r = kstrtouint(s, 10, &uid);
-				if (r) {
-					ret = r;
-					pr_info("Unknown access argument %s: %d\n",
-						s, r);
-					kfree(s);
-					continue;
-				}
-				v9ses->uid = make_kuid(current_user_ns(), uid);
-				if (!uid_valid(v9ses->uid)) {
-					ret = -EINVAL;
-					pr_info("Unknown uid %s\n", s);
-				}
+			session_opts->uid = make_kuid(current_user_ns(), uid);
+			if (!uid_valid(session_opts->uid)) {
+				pr_info("Unknown uid %s\n", s);
+				return -EINVAL;
 			}
+		}
+		break;
 
-			kfree(s);
-			break;
-
-		case Opt_posixacl:
+	case Opt_posixacl:
 #ifdef CONFIG_9P_FS_POSIX_ACL
-			v9ses->flags |= V9FS_POSIX_ACL;
+		session_opts->flags |= V9FS_POSIX_ACL;
 #else
-			p9_debug(P9_DEBUG_ERROR,
-				 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
+		p9_debug(P9_DEBUG_ERROR,
+			 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
 #endif
-			break;
-
-		case Opt_locktimeout:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			if (option < 1) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "locktimeout must be a greater than zero integer.\n");
-				ret = -EINVAL;
-				continue;
-			}
-			v9ses->session_lock_timeout = (long)option * HZ;
-			break;
+		break;
+
+	case Opt_locktimeout:
+		if (result.uint_32 < 1) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "locktimeout must be a greater than zero integer.\n");
+			return -EINVAL;
+		}
+		session_opts->session_lock_timeout = (long)result.uint_32 * HZ;
+		break;
 
-		default:
-			continue;
+	/* Options for client */
+	case Opt_msize:
+		if (result.uint_32 < 4096) {
+			p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n");
+			return -EINVAL;
+		}
+		if (result.uint_32 > INT_MAX) {
+			p9_debug(P9_DEBUG_ERROR, "msize too big\n");
+			return -EINVAL;
 		}
+		clnt->msize = result.uint_32;
+		break;
+	case Opt_trans:
+		v9fs_put_trans(clnt->trans_mod);
+		clnt->trans_mod = v9fs_get_trans_by_name(param->string);
+		if (!clnt->trans_mod) {
+			pr_info("Could not find request transport: %s\n",
+				param->string);
+			return -EINVAL;
+		}
+		break;
+	case Opt_legacy:
+		clnt->proto_version = p9_proto_legacy;
+		break;
+	case Opt_version:
+		clnt->proto_version = result.uint_32;
+		p9_debug(P9_DEBUG_9P, "Protocol version: %s\n", param->string);
+		break;
+	/* Options for fd transport */
+	case Opt_rfdno:
+		fd_opts->rfd = result.uint_32;
+		break;
+	case Opt_wfdno:
+		fd_opts->wfd = result.uint_32;
+		break;
+	/* Options for rdma transport */
+	case Opt_sq_depth:
+		rdma_opts->sq_depth = result.uint_32;
+		break;
+	case Opt_rq_depth:
+		rdma_opts->rq_depth = result.uint_32;
+		break;
+	case Opt_timeout:
+		rdma_opts->timeout = result.uint_32;
+		break;
+	/* Options for both fd and rdma transports */
+	case Opt_port:
+		fd_opts->port = result.uint_32;
+		rdma_opts->port = result.uint_32;
+		break;
+	case Opt_privport:
+		fd_opts->privport = true;
+		rdma_opts->privport = true;	/* a flag; must not clobber ->port */
+		break;
 	}
 
-free_and_return:
-	kfree(tmp_options);
-fail_option_alloc:
-	return ret;
+	return 0;
+}
+
+static void v9fs_apply_options(struct v9fs_session_info *v9ses,
+		  struct fs_context *fc)
+{
+	struct v9fs_context	*ctx = fc->fs_private;
+
+	v9ses->debug = ctx->session_opts.debug;
+	v9ses->dfltuid = ctx->session_opts.dfltuid;
+	v9ses->dfltgid = ctx->session_opts.dfltgid;
+	v9ses->afid = ctx->session_opts.afid;
+	v9ses->uname = ctx->session_opts.uname;
+	ctx->session_opts.uname = NULL;
+	v9ses->aname = ctx->session_opts.aname;
+	ctx->session_opts.aname = NULL;
+	v9ses->nodev = ctx->session_opts.nodev;
+	/*
+	 * Note that we must |= flags here as session_init already
+	 * set basic flags. This adds in flags from parsed options.
+	 */
+	v9ses->flags |= ctx->session_opts.flags;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = ctx->session_opts.cachetag;
+	ctx->session_opts.cachetag = NULL;
+#endif
+	v9ses->cache = ctx->session_opts.cache;
+	v9ses->uid = ctx->session_opts.uid;
+	v9ses->session_lock_timeout = ctx->session_opts.session_lock_timeout;
 }
 
 /**
  * v9fs_session_init - initialize session
  * @v9ses: session information structure
- * @dev_name: device being mounted
- * @data: options
+ * @fc: the filesystem mount context
  *
  */
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
-		  const char *dev_name, char *data)
+		  struct fs_context *fc)
 {
 	struct p9_fid *fid;
 	int rc = -ENOMEM;
 
-	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
-	if (!v9ses->uname)
-		goto err_names;
-
-	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
-	if (!v9ses->aname)
-		goto err_names;
 	init_rwsem(&v9ses->rename_sem);
 
-	v9ses->uid = INVALID_UID;
-	v9ses->dfltuid = V9FS_DEFUID;
-	v9ses->dfltgid = V9FS_DEFGID;
-
-	v9ses->clnt = p9_client_create(dev_name, data);
+	v9ses->clnt = p9_client_create(fc);
 	if (IS_ERR(v9ses->clnt)) {
 		rc = PTR_ERR(v9ses->clnt);
 		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
 		goto err_names;
 	}
 
+	/*
+	 * Initialize flags on the real v9ses. v9fs_apply_options below
+	 * will |= the additional flags from parsed options.
+	 */
 	v9ses->flags = V9FS_ACCESS_USER;
 
 	if (p9_is_proto_dotl(v9ses->clnt)) {
@@ -423,9 +439,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		v9ses->flags |= V9FS_PROTO_2000U;
 	}
 
-	rc = v9fs_parse_options(v9ses, data);
-	if (rc < 0)
-		goto err_clnt;
+	v9fs_apply_options(v9ses, fc);
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -471,7 +485,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 #ifdef CONFIG_9P_FSCACHE
 	/* register the session for caching */
 	if (v9ses->cache & CACHE_FSCACHE) {
-		rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
+		rc = v9fs_cache_session_get_cookie(v9ses, fc->source);
 		if (rc < 0)
 			goto err_clnt;
 	}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index f28bc763847a..6a12445d3858 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -10,6 +10,9 @@
 
 #include <linux/backing-dev.h>
 #include <linux/netfs.h>
+#include <linux/fs_parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
@@ -163,11 +166,13 @@ static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info
 #endif
 }
 
+extern const struct fs_parameter_spec v9fs_param_spec[];
 
+extern int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param);
 extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
-				 const char *dev_name, char *data);
+				 struct fs_context *fc);
 extern void v9fs_session_close(struct v9fs_session_info *v9ses);
 extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1581ebac5bb4..315336de6f02 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -19,6 +19,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/fscache.h>
+#include <linux/fs_context.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -30,32 +31,10 @@
 
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
-/**
- * v9fs_set_super - set the superblock
- * @s: super block
- * @data: file system specific data
- *
- */
-
-static int v9fs_set_super(struct super_block *s, void *data)
-{
-	s->s_fs_info = data;
-	return set_anon_super(s, data);
-}
-
-/**
- * v9fs_fill_super - populate superblock with info
- * @sb: superblock
- * @v9ses: session information
- * @flags: flags propagated from v9fs_mount()
- *
- */
-
-static int
-v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+static int v9fs_fill_super(struct super_block *sb)
 {
 	int ret;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -95,16 +74,12 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 }
 
 /**
- * v9fs_mount - mount a superblock
- * @fs_type: file system type
- * @flags: mount flags
- * @dev_name: device name that was mounted
- * @data: mount options
+ * v9fs_get_tree - create the mountable root and superblock
+ * @fc: the filesystem context
  *
  */
 
-static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data)
+static int v9fs_get_tree(struct fs_context *fc)
 {
 	struct super_block *sb = NULL;
 	struct inode *inode = NULL;
@@ -117,20 +92,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	fid = v9fs_session_init(v9ses, dev_name, data);
+	fid = v9fs_session_init(v9ses, fc);
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
 		goto free_session;
 	}
 
-	sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
+	fc->s_fs_info = v9ses;
+	sb = sget_fc(fc, NULL, set_anon_super_fc);
 	if (IS_ERR(sb)) {
 		retval = PTR_ERR(sb);
 		goto clunk_fid;
 	}
-	retval = v9fs_fill_super(sb, v9ses, flags);
+	retval = v9fs_fill_super(sb);
 	if (retval)
 		goto release_sb;
 
@@ -159,14 +135,15 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	v9fs_fid_add(root, &fid);
 
 	p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
-	return dget(sb->s_root);
+	fc->root = dget(sb->s_root);
+	return 0;
 
 clunk_fid:
 	p9_fid_put(fid);
 	v9fs_session_close(v9ses);
 free_session:
 	kfree(v9ses);
-	return ERR_PTR(retval);
+	return retval;
 
 release_sb:
 	/*
@@ -177,7 +154,7 @@ release_sb:
 	 */
 	p9_fid_put(fid);
 	deactivate_locked_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
@@ -303,11 +280,86 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.write_inode = v9fs_write_inode_dotl,
 };
 
+static void v9fs_free_fc(struct fs_context *fc)
+{
+	struct v9fs_context *ctx = fc->fs_private;
+
+	if (!ctx)
+		return;
+
+	/* These should be NULL by now but guard against leaks */
+	kfree(ctx->session_opts.uname);
+	kfree(ctx->session_opts.aname);
+#ifdef CONFIG_9P_FSCACHE
+	kfree(ctx->session_opts.cachetag);
+#endif
+	if (ctx->client_opts.trans_mod)
+		v9fs_put_trans(ctx->client_opts.trans_mod);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations v9fs_context_ops = {
+	.parse_param	= v9fs_parse_param,
+	.get_tree	= v9fs_get_tree,
+	.free		= v9fs_free_fc,
+};
+
+/* Allocate the 9p mount context and preset every option to its default. */
+static int v9fs_init_fs_context(struct fs_context *fc)
+{
+	struct v9fs_context	*ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	/* initialize core options */
+	ctx->session_opts.afid = ~0;
+	ctx->session_opts.cache = CACHE_NONE;
+	ctx->session_opts.session_lock_timeout = P9_LOCK_TIMEOUT;
+	ctx->session_opts.uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
+	ctx->session_opts.aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
+	if (!ctx->session_opts.uname || !ctx->session_opts.aname)
+		goto error;
+
+	ctx->session_opts.uid = INVALID_UID;
+	ctx->session_opts.dfltuid = V9FS_DEFUID;
+	ctx->session_opts.dfltgid = V9FS_DEFGID;
+
+	/* initialize client options */
+	ctx->client_opts.proto_version = p9_proto_2000L;
+	ctx->client_opts.msize = DEFAULT_MSIZE;
+
+	/* initialize fd transport options */
+	ctx->fd_opts.port = P9_FD_PORT;
+	ctx->fd_opts.rfd = ~0;
+	ctx->fd_opts.wfd = ~0;
+	ctx->fd_opts.privport = false;
+
+	/* initialize rdma transport options */
+	ctx->rdma_opts.port = P9_RDMA_PORT;
+	ctx->rdma_opts.sq_depth = P9_RDMA_SQ_DEPTH;
+	ctx->rdma_opts.rq_depth = P9_RDMA_RQ_DEPTH;
+	ctx->rdma_opts.timeout = P9_RDMA_TIMEOUT;
+	ctx->rdma_opts.privport = false;
+
+	fc->ops = &v9fs_context_ops;
+	fc->fs_private = ctx;
+	return 0;
+error:
+	/* ctx is not attached to fc yet, so it has to be released here */
+	kfree(ctx->session_opts.uname);
+	kfree(ctx->session_opts.aname);
+	kfree(ctx);
+	return -ENOMEM;
+}
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
-	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
 	.fs_flags = FS_RENAME_DOES_D_MOVE,
+	.init_fs_context = v9fs_init_fs_context,
+	.parameters = v9fs_param_spec,
 };
 MODULE_ALIAS_FS("9p");
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index cc18443f7d51..838a94218b59 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -279,7 +279,7 @@ int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid,
 		     const char *name);
 int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
 		       struct p9_fid *newdirfid, const char *new_name);
-struct p9_client *p9_client_create(const char *dev_name, char *options);
+struct p9_client *p9_client_create(struct fs_context *fc);
 void p9_client_destroy(struct p9_client *clnt);
 void p9_client_disconnect(struct p9_client *clnt);
 void p9_client_begin_disconnect(struct p9_client *clnt);
diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index 898a432a8063..a912bbaa862f 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -57,7 +57,7 @@ struct p9_trans_module {
 	bool supports_vmalloc;	/* can work with vmalloc'd buffers */
 	struct module *owner;
 	int (*create)(struct p9_client *client,
-		      const char *devname, char *args);
+		      struct fs_context *fc);
 	void (*close)(struct p9_client *client);
 	int (*request)(struct p9_client *client, struct p9_req_t *req);
 	int (*cancel)(struct p9_client *client, struct p9_req_t *req);
diff --git a/net/9p/client.c b/net/9p/client.c
index 802f548332a5..f60d1d041adb 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -20,8 +20,8 @@
 #include <linux/uio.h>
 #include <linux/netfs.h>
 #include <net/9p/9p.h>
-#include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/fs_context.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
 #include "protocol.h"
@@ -33,22 +33,6 @@
  *  - a little lazy - parse all client options
  */
 
-enum {
-	Opt_msize,
-	Opt_trans,
-	Opt_legacy,
-	Opt_version,
-	Opt_err,
-};
-
-static const match_table_t tokens = {
-	{Opt_msize, "msize=%u"},
-	{Opt_legacy, "noextend"},
-	{Opt_trans, "trans=%s"},
-	{Opt_version, "version=%s"},
-	{Opt_err, NULL},
-};
-
 inline int p9_is_proto_dotl(struct p9_client *clnt)
 {
 	return clnt->proto_version == p9_proto_2000L;
@@ -97,124 +81,16 @@ static int safe_errno(int err)
 	return err;
 }
 
-/* Interpret mount option for protocol version */
-static int get_protocol_version(char *s)
+static int apply_client_options(struct p9_client *clnt, struct fs_context *fc)
 {
-	int version = -EINVAL;
-
-	if (!strcmp(s, "9p2000")) {
-		version = p9_proto_legacy;
-		p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n");
-	} else if (!strcmp(s, "9p2000.u")) {
-		version = p9_proto_2000u;
-		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
-	} else if (!strcmp(s, "9p2000.L")) {
-		version = p9_proto_2000L;
-		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
-	} else {
-		pr_info("Unknown protocol version %s\n", s);
-	}
+	struct v9fs_context *ctx = fc->fs_private;
 
-	return version;
-}
-
-/**
- * parse_opts - parse mount options into client structure
- * @opts: options string passed from mount
- * @clnt: existing v9fs client information
- *
- * Return 0 upon success, -ERRNO upon failure
- */
-
-static int parse_opts(char *opts, struct p9_client *clnt)
-{
-	char *options, *tmp_options;
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	char *s;
-	int ret = 0;
-
-	clnt->proto_version = p9_proto_2000L;
-	clnt->msize = DEFAULT_MSIZE;
-
-	if (!opts)
-		return 0;
+	clnt->msize = ctx->client_opts.msize;
+	clnt->trans_mod = ctx->client_opts.trans_mod;
+	ctx->client_opts.trans_mod = NULL;
+	clnt->proto_version = ctx->client_opts.proto_version;
 
-	tmp_options = kstrdup(opts, GFP_KERNEL);
-	if (!tmp_options)
-		return -ENOMEM;
-	options = tmp_options;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token, r;
-
-		if (!*p)
-			continue;
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_msize:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			if (option < 4096) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "msize should be at least 4k\n");
-				ret = -EINVAL;
-				continue;
-			}
-			clnt->msize = option;
-			break;
-		case Opt_trans:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of trans arg\n");
-				goto free_and_return;
-			}
-
-			v9fs_put_trans(clnt->trans_mod);
-			clnt->trans_mod = v9fs_get_trans_by_name(s);
-			if (!clnt->trans_mod) {
-				pr_info("Could not find request transport: %s\n",
-					s);
-				ret = -EINVAL;
-			}
-			kfree(s);
-			break;
-		case Opt_legacy:
-			clnt->proto_version = p9_proto_legacy;
-			break;
-		case Opt_version:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of version arg\n");
-				goto free_and_return;
-			}
-			r = get_protocol_version(s);
-			if (r < 0)
-				ret = r;
-			else
-				clnt->proto_version = r;
-			kfree(s);
-			break;
-		default:
-			continue;
-		}
-	}
-
-free_and_return:
-	if (ret)
-		v9fs_put_trans(clnt->trans_mod);
-	kfree(tmp_options);
-	return ret;
+	return 0;
 }
 
 static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
@@ -975,7 +851,7 @@ error:
 	return err;
 }
 
-struct p9_client *p9_client_create(const char *dev_name, char *options)
+struct p9_client *p9_client_create(struct fs_context *fc)
 {
 	int err;
 	static atomic_t seqno = ATOMIC_INIT(0);
@@ -998,8 +874,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
 	idr_init(&clnt->fids);
 	idr_init(&clnt->reqs);
 
-	err = parse_opts(options, clnt);
-	if (err < 0)
+	err = apply_client_options(clnt, fc);
+	if (err)
 		goto free_client;
 
 	if (!clnt->trans_mod)
@@ -1015,7 +891,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
 	p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
 		 clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
 
-	err = clnt->trans_mod->create(clnt, dev_name, options);
+	err = clnt->trans_mod->create(clnt, fc);
 	if (err)
 		goto put_trans;
 
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 55576c1866fa..85160b52da55 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -16,7 +16,6 @@
 #include <linux/moduleparam.h>
 #include <net/9p/9p.h>
 #include <linux/fs.h>
-#include <linux/parser.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
 #include <linux/list.h>
@@ -171,6 +170,7 @@ void v9fs_put_trans(struct p9_trans_module *m)
 	if (m)
 		module_put(m->owner);
 }
+EXPORT_SYMBOL(v9fs_put_trans);
 
 /**
  * init_p9 - Initialize module
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index b7e5933c4617..4cec4bba222d 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -22,7 +22,7 @@
 #include <linux/uaccess.h>
 #include <linux/inet.h>
 #include <linux/file.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <net/9p/9p.h>
@@ -37,26 +37,6 @@
 static struct p9_trans_module p9_tcp_trans;
 static struct p9_trans_module p9_fd_trans;
 
-/*
-  * Option Parsing (code inspired by NFS code)
-  *  - a little lazy - parse all fd-transport options
-  */
-
-enum {
-	/* Options that take integer arguments */
-	Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
-	/* Options that take no arguments */
-	Opt_privport,
-};
-
-static const match_table_t tokens = {
-	{Opt_port, "port=%u"},
-	{Opt_rfdno, "rfdno=%u"},
-	{Opt_wfdno, "wfdno=%u"},
-	{Opt_privport, "privport"},
-	{Opt_err, NULL},
-};
-
 enum {
 	Rworksched = 1,		/* read work scheduled or running */
 	Rpending = 2,		/* can read */
@@ -737,73 +717,6 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
 	return 0;
 }
 
-/**
- * parse_opts - parse mount options into p9_fd_opts structure
- * @params: options string passed from mount
- * @opts: fd transport-specific structure to parse options into
- *
- * Returns 0 upon success, -ERRNO upon failure
- */
-
-static int parse_opts(char *params, struct p9_fd_opts *opts)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	char *options, *tmp_options;
-
-	opts->port = P9_FD_PORT;
-	opts->rfd = ~0;
-	opts->wfd = ~0;
-	opts->privport = false;
-
-	if (!params)
-		return 0;
-
-	tmp_options = kstrdup(params, GFP_KERNEL);
-	if (!tmp_options) {
-		p9_debug(P9_DEBUG_ERROR,
-			 "failed to allocate copy of option string\n");
-		return -ENOMEM;
-	}
-	options = tmp_options;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		int r;
-		if (!*p)
-			continue;
-		token = match_token(p, tokens, args);
-		if ((token != Opt_err) && (token != Opt_privport)) {
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				continue;
-			}
-		}
-		switch (token) {
-		case Opt_port:
-			opts->port = option;
-			break;
-		case Opt_rfdno:
-			opts->rfd = option;
-			break;
-		case Opt_wfdno:
-			opts->wfd = option;
-			break;
-		case Opt_privport:
-			opts->privport = true;
-			break;
-		default:
-			continue;
-		}
-	}
-
-	kfree(tmp_options);
-	return 0;
-}
-
 static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
 {
 	struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
@@ -958,17 +871,18 @@ static int p9_bind_privport(struct socket *sock)
 }
 
 static int
-p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
+p9_fd_create_tcp(struct p9_client *client, struct fs_context *fc)
 {
+	const char *addr = fc->source;
+	struct v9fs_context *ctx = fc->fs_private;
 	int err;
 	char port_str[6];
 	struct socket *csocket;
 	struct sockaddr_storage stor = { 0 };
 	struct p9_fd_opts opts;
 
-	err = parse_opts(args, &opts);
-	if (err < 0)
-		return err;
+	/* opts are already parsed in context */
+	opts = ctx->fd_opts;
 
 	if (!addr)
 		return -EINVAL;
@@ -1015,8 +929,9 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
 }
 
 static int
-p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
+p9_fd_create_unix(struct p9_client *client, struct fs_context *fc)
 {
+	const char *addr = fc->source;
 	int err;
 	struct socket *csocket;
 	struct sockaddr_un sun_server;
@@ -1055,14 +970,12 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
 }
 
 static int
-p9_fd_create(struct p9_client *client, const char *addr, char *args)
+p9_fd_create(struct p9_client *client, struct fs_context *fc)
 {
+	struct v9fs_context *ctx = fc->fs_private;
+	struct p9_fd_opts opts = ctx->fd_opts;
 	int err;
-	struct p9_fd_opts opts;
 
-	err = parse_opts(args, &opts);
-	if (err < 0)
-		return err;
 	client->trans_opts.fd.rfd = opts.rfd;
 	client->trans_opts.fd.wfd = opts.wfd;
 
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 87246463a954..4d406479f83b 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -22,7 +22,7 @@
 #include <linux/uaccess.h>
 #include <linux/inet.h>
 #include <linux/file.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
 #include <linux/semaphore.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -106,26 +106,6 @@ struct p9_rdma_context {
 	};
 };
 
-/*
- * Option Parsing (code inspired by NFS code)
- */
-enum {
-	/* Options that take integer arguments */
-	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
-	/* Options that take no argument */
-	Opt_privport,
-	Opt_err,
-};
-
-static match_table_t tokens = {
-	{Opt_port, "port=%u"},
-	{Opt_sq_depth, "sq=%u"},
-	{Opt_rq_depth, "rq=%u"},
-	{Opt_timeout, "timeout=%u"},
-	{Opt_privport, "privport"},
-	{Opt_err, NULL},
-};
-
 static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
 {
 	struct p9_trans_rdma *rdma = clnt->trans;
@@ -143,77 +123,6 @@ static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
 	return 0;
 }
 
-/**
- * parse_opts - parse mount options into rdma options structure
- * @params: options string passed from mount
- * @opts: rdma transport-specific structure to parse options into
- *
- * Returns 0 upon success, -ERRNO upon failure
- */
-static int parse_opts(char *params, struct p9_rdma_opts *opts)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	char *options, *tmp_options;
-
-	opts->port = P9_RDMA_PORT;
-	opts->sq_depth = P9_RDMA_SQ_DEPTH;
-	opts->rq_depth = P9_RDMA_RQ_DEPTH;
-	opts->timeout = P9_RDMA_TIMEOUT;
-	opts->privport = false;
-
-	if (!params)
-		return 0;
-
-	tmp_options = kstrdup(params, GFP_KERNEL);
-	if (!tmp_options) {
-		p9_debug(P9_DEBUG_ERROR,
-			 "failed to allocate copy of option string\n");
-		return -ENOMEM;
-	}
-	options = tmp_options;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		int r;
-		if (!*p)
-			continue;
-		token = match_token(p, tokens, args);
-		if ((token != Opt_err) && (token != Opt_privport)) {
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				continue;
-			}
-		}
-		switch (token) {
-		case Opt_port:
-			opts->port = option;
-			break;
-		case Opt_sq_depth:
-			opts->sq_depth = option;
-			break;
-		case Opt_rq_depth:
-			opts->rq_depth = option;
-			break;
-		case Opt_timeout:
-			opts->timeout = option;
-			break;
-		case Opt_privport:
-			opts->privport = true;
-			break;
-		default:
-			continue;
-		}
-	}
-	/* RQ must be at least as large as the SQ */
-	opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
-	kfree(tmp_options);
-	return 0;
-}
-
 static int
 p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -607,14 +516,15 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
 /**
  * rdma_create_trans - Transport method for creating a transport instance
  * @client: client instance
- * @addr: IP address string
- * @args: Mount options string
+ * @fc: The filesystem context
  */
 static int
-rdma_create_trans(struct p9_client *client, const char *addr, char *args)
+rdma_create_trans(struct p9_client *client, struct fs_context *fc)
 {
+	const char *addr = fc->source;
+	struct v9fs_context *ctx = fc->fs_private;
+	struct p9_rdma_opts opts = ctx->rdma_opts;
 	int err;
-	struct p9_rdma_opts opts;
 	struct p9_trans_rdma *rdma;
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
@@ -622,10 +532,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	if (addr == NULL)
 		return -EINVAL;
 
-	/* Parse the transport specific mount options */
-	err = parse_opts(args, &opts);
-	if (err < 0)
-		return err;
+	/* opts is pre-parsed; the RQ must be at least as large as the SQ */
+	opts.rq_depth = max(opts.rq_depth, opts.sq_depth);
 
 	/* Create and initialize the RDMA transport structure */
 	rdma = alloc_rdma(&opts);
diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c
index 2542ef099233..93547637deae 100644
--- a/net/9p/trans_usbg.c
+++ b/net/9p/trans_usbg.c
@@ -27,6 +27,7 @@
 #include <linux/cleanup.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/fs_context.h>
 #include <linux/usb/composite.h>
 #include <linux/usb/func_utils.h>
 
@@ -376,8 +377,9 @@ out:
 	return ret;
 }
 
-static int p9_usbg_create(struct p9_client *client, const char *devname, char *args)
+static int p9_usbg_create(struct p9_client *client, struct fs_context *fc)
 {
+	const char *devname = fc->source;
 	struct f_usb9pfs_dev *dev;
 	struct f_usb9pfs *usb9pfs;
 	int ret = -ENOENT;
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index b58f50b00c72..10c2dd486438 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -26,7 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <net/9p/9p.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
 #include <linux/scatterlist.h>
@@ -679,8 +679,7 @@ fail:
 /**
  * p9_virtio_create - allocate a new virtio channel
  * @client: client instance invoking this transport
- * @devname: string identifying the channel to connect to (unused)
- * @args: args passed from sys_mount() for per-transport options (unused)
+ * @fc: the filesystem context
  *
  * This sets up a transport channel for 9p communication.  Right now
  * we only match the first available channel, but eventually we could look up
@@ -691,8 +690,9 @@ fail:
  */
 
 static int
-p9_virtio_create(struct p9_client *client, const char *devname, char *args)
+p9_virtio_create(struct p9_client *client, struct fs_context *fc)
 {
+	const char *devname = fc->source;
 	struct virtio_chan *chan;
 	int ret = -ENOENT;
 	int found = 0;
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 9389c1247001..12f752a92332 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -15,6 +15,7 @@
 
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/fs_context.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -66,8 +67,9 @@ static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req)
 	return 1;
 }
 
-static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
+static int p9_xen_create(struct p9_client *client, struct fs_context *fc)
 {
+	const char *addr = fc->source;
 	struct xen_9pfs_front_priv *priv;
 
 	if (addr == NULL)
-- 
cgit v1.2.3


From c95de73da12bf4586b7bcd6b23a6968c21991cc7 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 30 Oct 2025 22:41:18 -0700
Subject: mtd: spear_smi: fix kernel-doc warnings <linux/mtd/spear_smi.h>

Correct most kernel-doc warnings in include/linux/mtd/spear_smi.h
by adding a leading '@' to the description of struct members.
Add a new description for the missing @np member.

Warning: spear_smi.h:48 struct member 'name' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:48 struct member 'mem_base' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:48 struct member 'size' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:48 struct member 'partitions' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:48 struct member 'nr_partitions' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:48 struct member 'fast_mode' not described
 in 'spear_smi_flash_info'
Warning: spear_smi.h:62 struct member 'clk_rate' not described
 in 'spear_smi_plat_data'
Warning: spear_smi.h:62 struct member 'num_flashes' not described
 in 'spear_smi_plat_data'
Warning: spear_smi.h:62 struct member 'board_flash_info' not described
 in 'spear_smi_plat_data'
Warning: spear_smi.h:62 struct member 'np' not described
 in 'spear_smi_plat_data'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/spear_smi.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mtd/spear_smi.h b/include/linux/mtd/spear_smi.h
index 581603ac1277..871634862627 100644
--- a/include/linux/mtd/spear_smi.h
+++ b/include/linux/mtd/spear_smi.h
@@ -31,12 +31,12 @@
  * struct spear_smi_flash_info - platform structure for passing flash
  * information
  *
- * name: name of the serial nor flash for identification
- * mem_base: the memory base on which the flash is mapped
- * size: size of the flash in bytes
- * partitions: parition details
- * nr_partitions: number of partitions
- * fast_mode: whether flash supports fast mode
+ * @name: name of the serial nor flash for identification
+ * @mem_base: the memory base on which the flash is mapped
+ * @size: size of the flash in bytes
+ * @partitions: partition details
+ * @nr_partitions: number of partitions
+ * @fast_mode: whether flash supports fast mode
  */
 
 struct spear_smi_flash_info {
@@ -51,9 +51,10 @@ struct spear_smi_flash_info {
 /**
  * struct spear_smi_plat_data - platform structure for configuring smi
  *
- * clk_rate: clk rate at which SMI must operate
- * num_flashes: number of flashes present on board
- * board_flash_info: specific details of each flash present on board
+ * @clk_rate: clk rate at which SMI must operate
+ * @num_flashes: number of flashes present on board
+ * @board_flash_info: specific details of each flash present on board
+ * @np: array of DT node pointers for all possible flash chip devices
  */
 struct spear_smi_plat_data {
 	unsigned long clk_rate;
-- 
cgit v1.2.3


From c3d78c34ad009a7cce57ae5b5c93e1bd03bb31a3 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Mon, 22 Sep 2025 11:30:10 +0800
Subject: perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores

CPU_CYCLES is expected to count the logical CPU (PE) clock. Currently it's
preferred to use PMCCNTR_EL0 for counting CPU_CYCLES, but it'll count
processor clock rather than the PE clock (ARM DDI0487 L.b D13.1.3) if
one of the SMT siblings is not idle on a multi-threaded implementation.
So don't use it on SMT cores.

Introduce topology_core_has_smt() to detect the SMT implementation and
cache the result in arm_pmu::has_smt during PMU registration.

When counting cycles on SMT CPU 2-3 and CPU 3 is idle, without this
patch we'll get:
[root@client1 tmp]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1
--taskset 2 --timeout 1
[...]
 Performance counter stats for 'CPU(s) 2-3':

CPU2           2880457316      cycles
CPU3           2880459810      cycles
       1.254688470 seconds time elapsed

With this patch the idle state of CPU3 is observed as expected:
[root@client1 ~]#  perf stat -e cycles -A -C 2-3 -- stress-ng -c 1
--taskset 2 --timeout 1
[...]
 Performance counter stats for 'CPU(s) 2-3':

CPU2           2558580492      cycles
CPU3               305749      cycles
       1.113626410 seconds time elapsed

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmu.c        |  6 ++++++
 drivers/perf/arm_pmuv3.c      | 10 ++++++++++
 include/linux/arch_topology.h | 11 +++++++++++
 include/linux/perf/arm_pmu.h  |  1 +
 4 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 5c310e803dd7..ae437791b5f8 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -925,6 +925,12 @@ int armpmu_register(struct arm_pmu *pmu)
 	if (ret)
 		return ret;
 
+	/*
+	 * By this stage we know our supported CPUs on either DT/ACPI platforms,
+	 * detect the SMT implementation.
+	 */
+	pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus));
+
 	if (!pmu->set_event_filter)
 		pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE;
 
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 69c5cc8f5606..d1d6000517b2 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -981,6 +981,7 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
 static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc,
 				     struct perf_event *event)
 {
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
 
@@ -1001,6 +1002,15 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc,
 	if (has_branch_stack(event))
 		return false;
 
+	/*
+	 * The PMCCNTR_EL0 increments from the processor clock rather than
+	 * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue
+	 * counting on a WFI PE if one of its SMT siblings is not idle on a
+	 * multi-threaded implementation. So don't use it on SMT cores.
+	 */
+	if (cpu_pmu->has_smt)
+		return false;
+
 	return true;
 }
 
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d72d6e5aa200..daa1af2e8204 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -89,6 +89,17 @@ void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
 int parse_acpi_topology(void);
 void freq_inv_set_max_ratio(int cpu, u64 max_rate);
+
+/*
+ * Architectures like ARM64 don't have a reliable architectural way to get SMT
+ * information and depend on the firmware (ACPI/OF) report. Non-SMT cores won't
+ * initialize thread_id so we can use this to detect the SMT implementation.
+ */
+static inline bool topology_core_has_smt(int cpu)
+{
+	return cpu_topology[cpu].thread_id != -1;
+}
+
 #endif
 
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 93c9a26492fc..2d39322c40c4 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -119,6 +119,7 @@ struct arm_pmu {
 
 	/* PMUv3 only */
 	int		pmuver;
+	bool		has_smt;
 	u64		reg_pmmir;
 	u64		reg_brbidr;
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
-- 
cgit v1.2.3


From 56d9df41ef1847ed0523f57ec6117649d581401d Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Sat, 1 Nov 2025 01:45:04 +0100
Subject: rtc: ds1685: stop setting max_user_freq

max_user_freq has not been related to the hardware RTC since commit
6610e0893b8b ("RTC: Rework RTC code to use timerqueue for events"). Stop
setting it from individual drivers to avoid confusing new contributors.

Acked-by: Joshua Kinard <linux@kumba.dev>
Link: https://patch.msgid.link/20251101-max_user_freq-v1-2-c9a274fd6883@bootlin.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/rtc/rtc-ds1685.c   | 3 ---
 include/linux/rtc/ds1685.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c
index 97423f1d0361..5fc8e36b1abf 100644
--- a/drivers/rtc/rtc-ds1685.c
+++ b/drivers/rtc/rtc-ds1685.c
@@ -1268,9 +1268,6 @@ ds1685_rtc_probe(struct platform_device *pdev)
 	rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000;
 	rtc_dev->range_max = RTC_TIMESTAMP_END_2099;
 
-	/* Maximum periodic rate is 8192Hz (0.122070ms). */
-	rtc_dev->max_user_freq = RTC_MAX_USER_FREQ;
-
 	/* See if the platform doesn't support UIE. */
 	if (pdata->uie_unsupported)
 		clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc_dev->features);
diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h
index 01da4582db6d..8ec0ebfaef04 100644
--- a/include/linux/rtc/ds1685.h
+++ b/include/linux/rtc/ds1685.h
@@ -324,7 +324,6 @@ struct ds1685_rtc_platform_data {
 #define RTC_SQW_2HZ		0x0f	/*  0    1   1   1   1  */
 #define RTC_SQW_0HZ		0x00	/*  0    0   0   0   0  */
 #define RTC_SQW_32768HZ		32768	/*  1    -   -   -   -  */
-#define RTC_MAX_USER_FREQ	8192
 
 
 /*
-- 
cgit v1.2.3


From 3eb6660f26d13acdbcb9241ac3e95d44419f2284 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 29 Oct 2025 10:40:52 +0100
Subject: uaccess: Provide ASM GOTO safe wrappers for unsafe_*_user()

ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

 e80:	e8 00 00 00 00       	call   e85 <foo+0x5>
 e85:	65 48 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%rax
 e8d:	83 80 04 14 00 00 01 	addl   $0x1,0x1404(%rax)   // pf_disable++
 e94:	89 37                	mov    %esi,(%rdi)
 e96:	83 a8 04 14 00 00 01 	subl   $0x1,0x1404(%rax)   // pf_disable--
 e9d:	b8 01 00 00 00       	mov    $0x1,%eax           // success
 ea2:	e9 00 00 00 00       	jmp    ea7 <foo+0x27>      // ret
 ea7:	31 c0                	xor    %eax,%eax           // fail
 ea9:	e9 00 00 00 00       	jmp    eae <foo+0x2e>      // ret

which is broken as it leaks the pagefault disable counter on failure.

Clang at least fails the build.

Linus suggested to add a local label into the macro scope and let that
jump to the actual caller supplied error label.

       	__label__ local_label;                                  \
        arch_unsafe_get_user(x, ptr, local_label);              \
	if (0) {                                                \
	local_label:                                            \
		goto label;                                     \

That works for both GCC and clang.

clang:

 c80:	0f 1f 44 00 00       	   nopl   0x0(%rax,%rax,1)
 c85:	65 48 8b 0c 25 00 00 00 00 mov    %gs:0x0,%rcx
 c8e:	ff 81 04 14 00 00    	   incl   0x1404(%rcx)	   // pf_disable++
 c94:	31 c0                	   xor    %eax,%eax        // set retval to false
 c96:	89 37                      mov    %esi,(%rdi)      // write
 c98:	b0 01                	   mov    $0x1,%al         // set retval to true
 c9a:	ff 89 04 14 00 00    	   decl   0x1404(%rcx)     // pf_disable--
 ca0:	2e e9 00 00 00 00    	   cs jmp ca6 <foo+0x26>   // ret

The exception table entry points correctly to c9a

GCC:

 f70:   e8 00 00 00 00          call   f75 <baz+0x5>
 f75:   65 48 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%rax
 f7d:   83 80 04 14 00 00 01    addl   $0x1,0x1404(%rax)  // pf_disable++
 f84:   8b 17                   mov    (%rdi),%edx
 f86:   89 16                   mov    %edx,(%rsi)
 f88:   83 a8 04 14 00 00 01    subl   $0x1,0x1404(%rax) // pf_disable--
 f8f:   b8 01 00 00 00          mov    $0x1,%eax         // success
 f94:   e9 00 00 00 00          jmp    f99 <baz+0x29>    // ret
 f99:   83 a8 04 14 00 00 01    subl   $0x1,0x1404(%rax) // pf_disable--
 fa0:   31 c0                   xor    %eax,%eax         // fail
 fa2:   e9 00 00 00 00          jmp    fa7 <baz+0x37>    // ret

The exception table entry points correctly to f99

So both compilers optimize out the extra goto and emit correct and
efficient code.

Provide a generic wrapper to do that to avoid modifying all the affected
architecture specific implementation with that workaround.

The only change required for architectures is to rename unsafe_*_user() to
arch_unsafe_*_user(). That's done in subsequent changes.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/877bweujtn.ffs@tglx
---
 include/linux/uaccess.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 1beb5b395d81..8aa82b1d6013 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -518,7 +518,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
 		long count);
 long strnlen_user_nofault(const void __user *unsafe_addr, long count);
 
-#ifndef __get_kernel_nofault
+#ifdef arch_get_kernel_nofault
+/*
+ * Wrap the architecture implementation so that @label can be outside of a
+ * cleanup() scope. A regular C goto works correctly, but ASM goto does
+ * not. Clang rejects such an attempt, but GCC silently emits buggy code.
+ */
+#define __get_kernel_nofault(dst, src, type, label)		\
+do {								\
+	__label__ local_label;					\
+	arch_get_kernel_nofault(dst, src, type, local_label);	\
+	if (0) {						\
+	local_label:						\
+		goto label;					\
+	}							\
+} while (0)
+
+#define __put_kernel_nofault(dst, src, type, label)		\
+do {								\
+	__label__ local_label;					\
+	arch_put_kernel_nofault(dst, src, type, local_label);	\
+	if (0) {						\
+	local_label:						\
+		goto label;					\
+	}							\
+} while (0)
+
+#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */
+
 #define __get_kernel_nofault(dst, src, type, label)	\
 do {							\
 	type __user *p = (type __force __user *)(src);	\
@@ -535,7 +562,8 @@ do {							\
 	if (__put_user(data, p))			\
 		goto label;				\
 } while (0)
-#endif
+
+#endif  /* !__get_kernel_nofault */
 
 /**
  * get_kernel_nofault(): safely attempt to read from a location
@@ -549,7 +577,42 @@ do {							\
 	copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
 })
 
-#ifndef user_access_begin
+#ifdef user_access_begin
+
+#ifdef arch_unsafe_get_user
+/*
+ * Wrap the architecture implementation so that @label can be outside of a
+ * cleanup() scope. A regular C goto works correctly, but ASM goto does
+ * not. Clang rejects such an attempt, but GCC silently emits buggy code.
+ *
+ * Some architectures use internal local labels already, but this extra
+ * indirection here is harmless because the compiler optimizes it out
+ * completely in any case. This construct just ensures that the ASM GOTO
+ * target is always in the local scope. The C goto 'label' works correctly
+ * when leaving a cleanup() scope.
+ */
+#define unsafe_get_user(x, ptr, label)			\
+do {							\
+	__label__ local_label;				\
+	arch_unsafe_get_user(x, ptr, local_label);	\
+	if (0) {					\
+	local_label:					\
+		goto label;				\
+	}						\
+} while (0)
+
+#define unsafe_put_user(x, ptr, label)			\
+do {							\
+	__label__ local_label;				\
+	arch_unsafe_put_user(x, ptr, local_label);	\
+	if (0) {					\
+	local_label:					\
+		goto label;				\
+	}						\
+} while (0)
+#endif /* arch_unsafe_get_user */
+
+#else /* user_access_begin */
 #define user_access_begin(ptr,len) access_ok(ptr, len)
 #define user_access_end() do { } while (0)
 #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
@@ -559,7 +622,8 @@ do {							\
 #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
 static inline unsigned long user_access_save(void) { return 0UL; }
 static inline void user_access_restore(unsigned long flags) { }
-#endif
+#endif /* !user_access_begin */
+
 #ifndef user_write_access_begin
 #define user_write_access_begin user_access_begin
 #define user_write_access_end user_access_end
-- 
cgit v1.2.3


From f17d28968b7ba8722aa218d2e1362e8b5e010bc6 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 2 Oct 2025 13:32:53 +0300
Subject: media: v4l2-subdev: Make media_entity_to_v4l2_subdev() const-aware

Retain the constness of the object in media_entity_to_v4l2_subdev(), by
switching to container_of_const().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/media/v4l2-subdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index e0bb58cb6d04..a37d9a847196 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1103,7 +1103,7 @@ struct v4l2_subdev {
 	typeof(ent) __me_sd_ent = (ent);				\
 									\
 	__me_sd_ent ?							\
-		container_of(__me_sd_ent, struct v4l2_subdev, entity) :	\
+		container_of_const(__me_sd_ent, struct v4l2_subdev, entity) : \
 		NULL;							\
 })
 
-- 
cgit v1.2.3


From 68871116f961532910ccb97b6f437acf7e00548c Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 2 Oct 2025 13:32:54 +0300
Subject: media: v4l2-dev: Make macros to obtain containers const-aware

Retain the constness of the object in media_entity_to_video_device() and
to_video_device(), by switching to container_of_const().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/media/v4l2-dev.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h
index a213c3398dcf..2e0f6d2e6a78 100644
--- a/include/media/v4l2-dev.h
+++ b/include/media/v4l2-dev.h
@@ -320,8 +320,8 @@ struct video_device {
 	typeof(__entity) __me_vdev_ent = __entity;			\
 									\
 	__me_vdev_ent ?							\
-		container_of(__me_vdev_ent,  struct video_device, entity) : \
-		NULL;							\
+		container_of_const(__me_vdev_ent,  struct video_device, \
+				   entity) : NULL;			\
 })
 
 /**
@@ -330,7 +330,7 @@ struct video_device {
  *
  * @cd: pointer to &struct device
  */
-#define to_video_device(cd) container_of(cd, struct video_device, dev)
+#define to_video_device(cd) container_of_const(cd, struct video_device, dev)
 
 /**
  * __video_register_device - register video4linux devices
-- 
cgit v1.2.3


From 35f29b44ac0958cb4f4cb042b877d2546f3f6d27 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 2 Oct 2025 13:32:55 +0300
Subject: media: mc: Make macros to obtain containers const-aware

Retain the constness of the graph objects and interfaces in macros to
obtain their containers, by switching to container_of_const().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/media/media-entity.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/media/media-entity.h b/include/media/media-entity.h
index 64cf590b1134..b91ff6f8c3bb 100644
--- a/include/media/media-entity.h
+++ b/include/media/media-entity.h
@@ -627,7 +627,7 @@ static inline bool media_entity_enum_intersects(
  * @gobj: Pointer to the struct &media_gobj graph object
  */
 #define gobj_to_entity(gobj) \
-		container_of(gobj, struct media_entity, graph_obj)
+		container_of_const(gobj, struct media_entity, graph_obj)
 
 /**
  * gobj_to_pad - returns the struct &media_pad pointer from the
@@ -636,7 +636,7 @@ static inline bool media_entity_enum_intersects(
  * @gobj: Pointer to the struct &media_gobj graph object
  */
 #define gobj_to_pad(gobj) \
-		container_of(gobj, struct media_pad, graph_obj)
+		container_of_const(gobj, struct media_pad, graph_obj)
 
 /**
  * gobj_to_link - returns the struct &media_link pointer from the
@@ -645,7 +645,7 @@ static inline bool media_entity_enum_intersects(
  * @gobj: Pointer to the struct &media_gobj graph object
  */
 #define gobj_to_link(gobj) \
-		container_of(gobj, struct media_link, graph_obj)
+		container_of_const(gobj, struct media_link, graph_obj)
 
 /**
  * gobj_to_intf - returns the struct &media_interface pointer from the
@@ -654,7 +654,7 @@ static inline bool media_entity_enum_intersects(
  * @gobj: Pointer to the struct &media_gobj graph object
  */
 #define gobj_to_intf(gobj) \
-		container_of(gobj, struct media_interface, graph_obj)
+		container_of_const(gobj, struct media_interface, graph_obj)
 
 /**
  * intf_to_devnode - returns the struct media_intf_devnode pointer from the
@@ -663,7 +663,7 @@ static inline bool media_entity_enum_intersects(
  * @intf: Pointer to struct &media_intf_devnode
  */
 #define intf_to_devnode(intf) \
-		container_of(intf, struct media_intf_devnode, intf)
+		container_of_const(intf, struct media_intf_devnode, intf)
 
 /**
  *  media_gobj_create - Initialize a graph object
-- 
cgit v1.2.3


From ba92a96b1e95a67cb736d095dceb788207b90a7b Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Sun, 26 Oct 2025 20:08:29 +0200
Subject: media: saa7146: Replace saa7146_ext_vv.vbi_fops with write function

The vbi_fops stored in struct saa7146_ext_vv is a full
v4l2_file_operations, but only its .write field is used. Replace it with
a single vbi_write function pointer to save memory.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/common/saa7146/saa7146_fops.c | 4 ++--
 drivers/staging/media/av7110/av7110_v4l.c   | 4 ++--
 include/media/drv-intf/saa7146_vv.h         | 3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index 9d0362a75ecd..a9e3bad76d54 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -186,11 +186,11 @@ static ssize_t fops_write(struct file *file, const char __user *data, size_t cou
 	struct saa7146_dev *dev = video_drvdata(file);
 	int ret;
 
-	if (vdev->vfl_type != VFL_TYPE_VBI || !dev->ext_vv_data->vbi_fops.write)
+	if (vdev->vfl_type != VFL_TYPE_VBI || !dev->ext_vv_data->vbi_write)
 		return -EINVAL;
 	if (mutex_lock_interruptible(vdev->lock))
 		return -ERESTARTSYS;
-	ret = dev->ext_vv_data->vbi_fops.write(file, data, count, ppos);
+	ret = dev->ext_vv_data->vbi_write(file, data, count, ppos);
 	mutex_unlock(vdev->lock);
 	return ret;
 }
diff --git a/drivers/staging/media/av7110/av7110_v4l.c b/drivers/staging/media/av7110/av7110_v4l.c
index 04e659243f02..200a7a29ea31 100644
--- a/drivers/staging/media/av7110/av7110_v4l.c
+++ b/drivers/staging/media/av7110/av7110_v4l.c
@@ -940,7 +940,7 @@ static struct saa7146_ext_vv av7110_vv_data_st = {
 	.num_stds	= ARRAY_SIZE(standard),
 	.std_callback	= &std_callback,
 
-	.vbi_fops.write	= av7110_vbi_write,
+	.vbi_write	= av7110_vbi_write,
 };
 
 static struct saa7146_ext_vv av7110_vv_data_c = {
@@ -953,6 +953,6 @@ static struct saa7146_ext_vv av7110_vv_data_c = {
 	.num_stds	= ARRAY_SIZE(standard),
 	.std_callback	= &std_callback,
 
-	.vbi_fops.write	= av7110_vbi_write,
+	.vbi_write	= av7110_vbi_write,
 };
 
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index 55c7d70b9feb..f66f4dfccf14 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -130,7 +130,8 @@ struct saa7146_ext_vv
 	/* pointer to the saa7146 core ops */
 	const struct v4l2_ioctl_ops *core_ops;
 
-	struct v4l2_file_operations vbi_fops;
+	ssize_t (*vbi_write)(struct file *file, const char __user *data,
+			     size_t count, loff_t *ppos);
 };
 
 struct saa7146_use_ops  {
-- 
cgit v1.2.3


From bc49af56eea866c34d21bf582f65b02fc8c06ec3 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Date: Tue, 28 Oct 2025 20:34:23 -0700
Subject: blktrace: add support for REQ_OP_WRITE_ZEROES tracing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, REQ_OP_WRITE_ZEROES operations are not handled in the
blktrace infrastructure, resulting in incorrect or missing operation
labels in ftrace blktrace output. This manifests as write-zeroes
operations appearing with incorrect labels like "N" instead of a
proper "WZ" designation.

This patch adds complete support for REQ_OP_WRITE_ZEROES across the
blktrace infrastructure:

- Add BLK_TC_WRITE_ZEROES trace category in blktrace_api.h and update
  BLK_TC_END_V2 marker accordingly
- Map REQ_OP_WRITE_ZEROES to BLK_TC_WRITE_ZEROES in __blk_add_trace()
  to ensure proper trace event categorization
- Update fill_rwbs() to generate "WZ" label for write-zeroes operations
  in ftrace output, making them easily identifiable
- Add "write-zeroes" string mapping in act_to_str array for debugfs
  filter interface
- Update blk_fill_rwbs() to handle REQ_OP_WRITE_ZEROES for block layer
  event tracing

With this fix, write-zeroes operations are now correctly traced and
displayed.

===========================================================
BEFORE THIS PATCH
===========================================================
blkdiscard -z -o 0 -l 40960 /dev/nvme0n1
   blkdiscard-3809 [030] .....  1212.253701: block_bio_queue: 259,0 NS 0 + 80 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253703: block_getrq: 259,0 NS 0 + 80 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253704: block_io_start: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253704: block_plug: [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253706: block_unplug: [blkdiscard] 1
   blkdiscard-3809 [030] .....  1212.253706: block_rq_insert: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard]
kworker/30:1H-566  [030] .....  1212.253726: block_rq_issue: 259,0 NS 40960 () 0 + 80 be,0,4 [kworker/30:1H]
       <idle>-0    [030] d.h1.  1212.253957: block_rq_complete: 259,0 NS () 0 + 80 be,0,4 [0]
       <idle>-0    [030] dNh1.  1212.253960: block_io_done: 259,0 NS 0 () 0 + 0 none,0,0 [swapper/30]

Trace Event Breakdown:
 Event             | Device | Op  | Sector | Sectors | Byte Size | Calculation

 block_bio_queue   | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_getrq       | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_start    | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_insert   | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_issue    | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_complete | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_done     | 259,0  | NS  | 0      | 0       | 0         | Completion (no data)

  Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes

===========================================================
AFTER THIS PATCH
===========================================================
blkdiscard -z -o 0 -l 40960 /dev/nvme0n1

   blkdiscard-2477 [020] .....   960.989131: block_bio_queue: 259,0 WZS 0 + 80 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989134: block_getrq: 259,0 WZS 0 + 80 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989135: block_io_start: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989138: block_plug: [blkdiscard]
   blkdiscard-2477 [020] .....   960.989140: block_unplug: [blkdiscard] 1
   blkdiscard-2477 [020] .....   960.989141: block_rq_insert: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard]
kworker/20:1H-736  [020] .....   960.989166: block_rq_issue: 259,0 WZS 40960 () 0 + 80 be,0,4 [kworker/20:1H]
       <idle>-0    [020] d.h1.   960.989476: block_rq_complete: 259,0 WZS () 0 + 80 be,0,4 [0]
       <idle>-0    [020] dNh1.   960.989482: block_io_done: 259,0 WZS 0 () 0 + 0 none,0,0 [swapper/20]

Trace Event Breakdown:
 Event             | Device | Op  | Sector | Sectors | Byte Size | Calculation

 block_bio_queue   | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_getrq       | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_start    | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_insert   | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_issue    | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_complete | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_done     | 259,0  | WZS | 0      | 0       | 0         | Completion (no data)

  Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes

Tested with ftrace blktrace on NVMe devices using blkdiscard with
the -z (write-zeroes) flag.

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h |  4 +++-
 kernel/trace/blktrace.c           | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 30f3d2589365..7c092d9f3aa4 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -35,7 +35,9 @@ enum blktrace_cat {
 	BLK_TC_ZONE_OPEN	= 1ull << 20,	/* zone open */
 	BLK_TC_ZONE_CLOSE	= 1ull << 21,	/* zone close */
 
-	BLK_TC_END_V2		= 1ull << 21,
+	BLK_TC_WRITE_ZEROES	= 1ull << 22,	/* write-zeroes */
+
+	BLK_TC_END_V2		= 1ull << 22,
 };
 
 #define BLK_TC_SHIFT		(16)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e4f26ddb7ee2..af8cbc8e1a7c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -360,6 +360,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	case REQ_OP_ZONE_CLOSE:
 		what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES);
+		break;
 	default:
 		break;
 	}
@@ -1408,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t)
 
 	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
-	else if (tc & BLK_TC_WRITE)
+	else if (tc & BLK_TC_WRITE_ZEROES) {
+		rwbs[i++] = 'W';
+		rwbs[i++] = 'Z';
+	} else if (tc & BLK_TC_WRITE)
 		rwbs[i++] = 'W';
 	else if (t->bytes)
 		rwbs[i++] = 'R';
@@ -1951,6 +1957,7 @@ static const struct {
 	{ BLK_TC_DISCARD,	"discard"	},
 	{ BLK_TC_DRV_DATA,	"drv_data"	},
 	{ BLK_TC_FUA,		"fua"		},
+	{ BLK_TC_WRITE_ZEROES,	"write-zeroes"	},
 };
 
 static int blk_trace_str2mask(const char *str)
@@ -2164,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
 		rwbs[i++] = 'Z';
 		rwbs[i++] = 'C';
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		rwbs[i++] = 'W';
+		rwbs[i++] = 'Z';
+		break;
 	default:
 		rwbs[i++] = 'N';
 	}
-- 
cgit v1.2.3


From c33e779aba6804778c1440192a8033a145ba588d Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Fri, 31 Oct 2025 14:34:29 -0600
Subject: io_uring: add wrapper type for io_req_tw_func_t arg

In preparation for uring_cmd implementations to implement functions
with the io_req_tw_func_t signature, introduce a wrapper struct
io_tw_req to hide the struct io_kiocb * argument. The intention is for
only the io_uring core to access the inner struct io_kiocb *. uring_cmd
implementations should instead call a helper from io_uring/cmd.h to
convert struct io_tw_req to struct io_uring_cmd *.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 +++++-
 io_uring/futex.c               | 16 +++++++++-------
 io_uring/io_uring.c            | 21 ++++++++++++---------
 io_uring/io_uring.h            |  4 ++--
 io_uring/msg_ring.c            |  3 ++-
 io_uring/notif.c               |  5 +++--
 io_uring/poll.c                | 11 ++++++-----
 io_uring/poll.h                |  2 +-
 io_uring/rw.c                  |  5 +++--
 io_uring/rw.h                  |  2 +-
 io_uring/timeout.c             | 18 +++++++++++-------
 io_uring/uring_cmd.c           |  3 ++-
 io_uring/waitid.c              |  7 ++++---
 13 files changed, 61 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 25ee982eb435..f064a438ce43 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -615,7 +615,11 @@ enum {
 	REQ_F_SQE_COPIED	= IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
 };
 
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
+struct io_tw_req {
+	struct io_kiocb *req;
+};
+
+typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw);
 
 struct io_task_work {
 	struct llist_node		node;
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 64f3bd51c84c..4e022c76236d 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -41,24 +41,26 @@ void io_futex_cache_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->futex_cache, kfree);
 }
 
-static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
+static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
-	hlist_del_init(&req->hash_node);
-	io_req_task_complete(req, tw);
+	hlist_del_init(&tw_req.req->hash_node);
+	io_req_task_complete(tw_req, tw);
 }
 
-static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
+static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_tw_lock(ctx, tw);
 	io_cache_free(&ctx->futex_cache, req->async_data);
 	io_req_async_data_clear(req, 0);
-	__io_futex_complete(req, tw);
+	__io_futex_complete(tw_req, tw);
 }
 
-static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw)
+static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
 	struct futex_vector *futexv = req->async_data;
 
@@ -73,7 +75,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw)
 	}
 
 	io_req_async_data_free(req);
-	__io_futex_complete(req, tw);
+	__io_futex_complete(tw_req, tw);
 }
 
 static bool io_futexv_claim(struct io_futex *iof)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4e6676ac4662..01631b6ff442 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -291,7 +291,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 	mutex_lock(&ctx->uring_lock);
 	ts.cancel = io_should_terminate_tw(ctx);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
-		req->io_task_work.func(req, ts);
+		req->io_task_work.func((struct io_tw_req){req}, ts);
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
@@ -539,9 +539,9 @@ static void io_queue_iowq(struct io_kiocb *req)
 	io_wq_enqueue(tctx->io_wq, &req->work);
 }
 
-static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw)
+static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw)
 {
-	io_queue_iowq(req);
+	io_queue_iowq(tw_req.req);
 }
 
 void io_req_queue_iowq(struct io_kiocb *req)
@@ -1166,7 +1166,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 		}
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
-				req, ts);
+				(struct io_tw_req){req}, ts);
 		node = next;
 		(*count)++;
 		if (unlikely(need_resched())) {
@@ -1389,7 +1389,7 @@ static int __io_run_local_work_loop(struct llist_node **node,
 						    io_task_work.node);
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
-				req, tw);
+				(struct io_tw_req){req}, tw);
 		*node = next;
 		if (++ret >= events)
 			break;
@@ -1459,14 +1459,17 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events,
 	return ret;
 }
 
-static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw)
+static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
+
 	io_tw_lock(req->ctx, tw);
 	io_req_defer_failed(req, req->cqe.res);
 }
 
-void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
+void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_tw_lock(ctx, tw);
@@ -1702,9 +1705,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
 	return 0;
 }
 
-void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw)
+void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
-	io_req_complete_defer(req);
+	io_req_complete_defer(tw_req.req);
 }
 
 /*
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 44b8091c7fcd..f97356ce29d0 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -149,9 +149,9 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
 void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags);
 void io_req_task_queue(struct io_kiocb *req);
-void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw);
+void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
-void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw);
+void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw);
 struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
 void tctx_task_work(struct callback_head *cb);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 5e5b94236d72..7063ea7964e7 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -70,8 +70,9 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
 	return target_ctx->task_complete;
 }
 
-static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
+static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index d8ba1165c949..9960bb2a32d5 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -11,8 +11,9 @@
 
 static const struct ubuf_info_ops io_ubuf_ops;
 
-static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw)
+static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *notif = tw_req.req;
 	struct io_notif_data *nd = io_notif_to_data(notif);
 	struct io_ring_ctx *ctx = notif->ctx;
 
@@ -34,7 +35,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw)
 		}
 
 		nd = nd->next;
-		io_req_task_complete(notif, tw);
+		io_req_task_complete((struct io_tw_req){notif}, tw);
 	} while (nd);
 }
 
diff --git a/io_uring/poll.c b/io_uring/poll.c
index c403e751841a..8aa4e3a31e73 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -310,8 +310,9 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
 	return IOU_POLL_NO_ACTION;
 }
 
-void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
+void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	int ret;
 
 	ret = io_poll_check_events(req, tw);
@@ -332,7 +333,7 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
 			poll = io_kiocb_to_cmd(req, struct io_poll);
 			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
 		} else if (ret == IOU_POLL_REISSUE) {
-			io_req_task_submit(req, tw);
+			io_req_task_submit(tw_req, tw);
 			return;
 		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
 			req->cqe.res = ret;
@@ -340,14 +341,14 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
 		}
 
 		io_req_set_res(req, req->cqe.res, 0);
-		io_req_task_complete(req, tw);
+		io_req_task_complete(tw_req, tw);
 	} else {
 		io_tw_lock(req->ctx, tw);
 
 		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
-			io_req_task_complete(req, tw);
+			io_req_task_complete(tw_req, tw);
 		else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
-			io_req_task_submit(req, tw);
+			io_req_task_submit(tw_req, tw);
 		else
 			io_req_defer_failed(req, ret);
 	}
diff --git a/io_uring/poll.h b/io_uring/poll.h
index c8438286dfa0..5647c5138932 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -46,4 +46,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
 			bool cancel_all);
 
-void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw);
+void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 5b2241a5813c..828ac4f902b4 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -564,8 +564,9 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
 	return res;
 }
 
-void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
+void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct kiocb *kiocb = &rw->kiocb;
 
@@ -581,7 +582,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
 		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
 
 	io_req_rw_cleanup(req, 0);
-	io_req_task_complete(req, tw);
+	io_req_task_complete(tw_req, tw);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 129a53fe5482..9bd7fbf70ea9 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -46,7 +46,7 @@ int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags);
 int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags);
 void io_readv_writev_cleanup(struct io_kiocb *req);
 void io_rw_fail(struct io_kiocb *req);
-void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw);
+void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw);
 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
 void io_rw_cache_free(const void *entry);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 444142ba9d04..d8fbbaf31cf3 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -68,8 +68,9 @@ static inline bool io_timeout_finish(struct io_timeout *timeout,
 
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer);
 
-static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw)
+static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_timeout_data *data = req->async_data;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -85,7 +86,7 @@ static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw)
 		}
 	}
 
-	io_req_task_complete(req, tw);
+	io_req_task_complete(tw_req, tw);
 }
 
 static __cold bool io_flush_killed_timeouts(struct list_head *list, int err)
@@ -157,8 +158,10 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
 	io_flush_killed_timeouts(&list, 0);
 }
 
-static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw)
+static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *link = tw_req.req;
+
 	io_tw_lock(link->ctx, tw);
 	while (link) {
 		struct io_kiocb *nxt = link->link;
@@ -168,7 +171,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw)
 			res = link->cqe.res;
 		link->link = NULL;
 		io_req_set_res(link, res, 0);
-		io_req_task_complete(link, tw);
+		io_req_task_complete((struct io_tw_req){link}, tw);
 		link = nxt;
 	}
 }
@@ -317,8 +320,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 	return 0;
 }
 
-static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw)
+static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
 	int ret;
@@ -335,11 +339,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw)
 			ret = -ECANCELED;
 		}
 		io_req_set_res(req, ret ?: -ETIME, 0);
-		io_req_task_complete(req, tw);
+		io_req_task_complete(tw_req, tw);
 		io_put_req(prev);
 	} else {
 		io_req_set_res(req, -ETIME, 0);
-		io_req_task_complete(req, tw);
+		io_req_task_complete(tw_req, tw);
 	}
 }
 
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 9d67a2a721aa..c09b99e91c86 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -113,8 +113,9 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 
-static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw)
+static void io_uring_cmd_work(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
 
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index c5e0d979903a..62f7f1f004a5 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -16,7 +16,7 @@
 #include "waitid.h"
 #include "../kernel/exit.h"
 
-static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw);
+static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw);
 
 #define IO_WAITID_CANCEL_FLAG	BIT(31)
 #define IO_WAITID_REF_MASK	GENMASK(30, 0)
@@ -194,8 +194,9 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
 	return true;
 }
 
-static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw)
+static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_kiocb *req = tw_req.req;
 	struct io_waitid_async *iwa = req->async_data;
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -229,7 +230,7 @@ static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw)
 	}
 
 	io_waitid_complete(req, ret);
-	io_req_task_complete(req, tw);
+	io_req_task_complete(tw_req, tw);
 }
 
 static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
-- 
cgit v1.2.3


From 20fb3d05a34b55c8ec28ec3d3555e70c5bc0c72d Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Fri, 31 Oct 2025 14:34:30 -0600
Subject: io_uring/uring_cmd: avoid double indirect call in task work dispatch

io_uring task work dispatch makes an indirect call to struct io_kiocb's
io_task_work.func field to allow running arbitrary task work functions.
In the uring_cmd case, this calls io_uring_cmd_work(), which immediately
makes another indirect call to struct io_uring_cmd's task_work_cb field.
Change the uring_cmd task work callbacks to functions whose signatures
match io_req_tw_func_t. Add a function io_uring_cmd_from_tw() to convert
from the task work's struct io_tw_req argument to struct io_uring_cmd *.
Define a constant IO_URING_CMD_TASK_WORK_ISSUE_FLAGS to avoid
manufacturing issue_flags in the uring_cmd task work callbacks. Now
uring_cmd task work dispatch makes a single indirect call to the
uring_cmd implementation's callback. This also allows removing the
task_work_cb field from struct io_uring_cmd, freeing up 8 bytes for
future storage.
Since fuse_uring_send_in_task() now has access to the io_tw_token_t,
check its cancel field directly instead of relying on the
IO_URING_F_TASK_DEAD issue flag.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioctl.c                  |  6 ++++--
 drivers/block/ublk_drv.c       | 22 +++++++++++-----------
 drivers/nvme/host/ioctl.c      |  7 ++++---
 fs/btrfs/ioctl.c               |  5 +++--
 fs/fuse/dev_uring.c            |  7 ++++---
 include/linux/io_uring/cmd.h   | 22 +++++++++++++---------
 include/linux/io_uring_types.h |  1 -
 io_uring/uring_cmd.c           | 18 ++----------------
 8 files changed, 41 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/block/ioctl.c b/block/ioctl.c
index d7489a56b33c..4ed17c5a4acc 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -769,14 +769,16 @@ struct blk_iou_cmd {
 	bool nowait;
 };
 
-static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
 
 	if (bic->res == -EAGAIN && bic->nowait)
 		io_uring_cmd_issue_blocking(cmd);
 	else
-		io_uring_cmd_done(cmd, bic->res, issue_flags);
+		io_uring_cmd_done(cmd, bic->res,
+				  IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
 }
 
 static void bio_cmd_bio_end_io(struct bio *bio)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c74a41a6753..e0c601128efa 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1302,10 +1302,9 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
 	return true;
 }
 
-static void ublk_dispatch_req(struct ublk_queue *ubq,
-			      struct request *req,
-			      unsigned int issue_flags)
+static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
 	int tag = req->tag;
 	struct ublk_io *io = &ubq->ios[tag];
 
@@ -1348,13 +1347,13 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
 }
 
-static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
-			   unsigned int issue_flags)
+static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct ublk_queue *ubq = pdu->ubq;
 
-	ublk_dispatch_req(ubq, pdu->req, issue_flags);
+	ublk_dispatch_req(ubq, pdu->req);
 }
 
 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
@@ -1366,9 +1365,9 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
 }
 
-static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
+static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct request *rq = pdu->req_list;
 	struct request *next;
@@ -1376,7 +1375,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
 	do {
 		next = rq->rq_next;
 		rq->rq_next = NULL;
-		ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
+		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
 		rq = next;
 	} while (rq);
 }
@@ -2523,9 +2522,10 @@ fail_put:
 	return NULL;
 }
 
-static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
+static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
 
 	if (ret != -EIOCBQUEUED)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index c212fa952c0f..4fa8400a5627 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -398,14 +398,15 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
 	return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu);
 }
 
-static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
-			       unsigned issue_flags)
+static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req);
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 
 	if (pdu->bio)
 		blk_rq_unmap_user(pdu->bio);
-	io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags);
+	io_uring_cmd_done32(ioucmd, pdu->status, pdu->result,
+			    IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8cb7d5a462ef..3171d9df0246 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4649,8 +4649,9 @@ struct io_btrfs_cmd {
 	struct btrfs_uring_priv *priv;
 };
 
-static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
 	struct btrfs_uring_priv *priv = bc->priv;
 	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
@@ -4695,7 +4696,7 @@ out:
 	btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 
-	io_uring_cmd_done(cmd, ret, issue_flags);
+	io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
 	add_rchar(current, ret);
 
 	for (index = 0; index < priv->nr_pages; index++)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index f6b12aebb8bb..f8c93dc45768 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1209,14 +1209,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
  * User buffers are not mapped yet - the application does not have permission
  * to write to it - this has to be executed in ring task context.
  */
-static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
-				    unsigned int issue_flags)
+static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
 	struct fuse_ring_queue *queue = ent->queue;
 	int err;
 
-	if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+	if (!tw.cancel) {
 		err = fuse_uring_prepare_send(ent, ent->fuse_req);
 		if (err) {
 			fuse_uring_next_fuse_req(ent, queue, issue_flags);
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 7509025b4071..375fd048c4cb 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -11,17 +11,13 @@
 /* io_uring_cmd is being issued again */
 #define IORING_URING_CMD_REISSUE	(1U << 31)
 
-typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd,
-				  unsigned issue_flags);
-
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
-	/* callback to defer completions to task context */
-	io_uring_cmd_tw_t task_work_cb;
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
+	u8		unused[8];
 };
 
 static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
@@ -60,7 +56,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2,
 			 unsigned issue_flags, bool is_cqe32);
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			    io_uring_cmd_tw_t task_work_cb,
+			    io_req_tw_func_t task_work_cb,
 			    unsigned flags);
 
 /*
@@ -109,7 +105,7 @@ static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
 {
 }
 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			    io_uring_cmd_tw_t task_work_cb, unsigned flags)
+			    io_req_tw_func_t task_work_cb, unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -132,15 +128,23 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
 }
 #endif
 
+static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
+{
+	return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd);
+}
+
+/* task_work executor checks the deferred list completion */
+#define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER
+
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			io_uring_cmd_tw_t task_work_cb)
+			io_req_tw_func_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			io_uring_cmd_tw_t task_work_cb)
+			io_req_tw_func_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f064a438ce43..92780764d5fa 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -39,7 +39,6 @@ enum io_uring_cmd_flags {
 	/* set when uring wants to cancel a previously issued command */
 	IO_URING_F_CANCEL		= (1 << 11),
 	IO_URING_F_COMPAT		= (1 << 12),
-	IO_URING_F_TASK_DEAD		= (1 << 13),
 };
 
 struct io_wq_work_node {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index c09b99e91c86..197474911f04 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -113,21 +113,8 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 
-static void io_uring_cmd_work(struct io_tw_req tw_req, io_tw_token_t tw)
-{
-	struct io_kiocb *req = tw_req.req;
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	unsigned int flags = IO_URING_F_COMPLETE_DEFER;
-
-	if (unlikely(tw.cancel))
-		flags |= IO_URING_F_TASK_DEAD;
-
-	/* task_work executor checks the deffered list completion */
-	ioucmd->task_work_cb(ioucmd, flags);
-}
-
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			io_uring_cmd_tw_t task_work_cb,
+			io_req_tw_func_t task_work_cb,
 			unsigned flags)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
@@ -135,8 +122,7 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 	if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
 		return;
 
-	ioucmd->task_work_cb = task_work_cb;
-	req->io_task_work.func = io_uring_cmd_work;
+	req->io_task_work.func = task_work_cb;
 	__io_req_task_work_add(req, flags);
 }
 EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
-- 
cgit v1.2.3


From 8627bc8c7d815d929ad59407e13458b564870acf Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 17:34:18 +0100
Subject: ns: add missing authorship

I authored the files a short while ago.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/nstree.h | 1 +
 kernel/nscommon.c      | 1 +
 kernel/nstree.c        | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 8b8636690473..43aa262c0ea1 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 #ifndef _LINUX_NSTREE_H
 #define _LINUX_NSTREE_H
 
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c1fb2bad6d72..238402b189f7 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 
 #include <linux/ns_common.h>
 #include <linux/proc_ns.h>
diff --git a/kernel/nstree.c b/kernel/nstree.c
index 369fd1675c6a..4eabab5fceaf 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 
 #include <linux/nstree.h>
 #include <linux/proc_ns.h>
-- 
cgit v1.2.3


From d915fe20e5cba4bd50e41e792a32dcddc7490e25 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 16:10:10 +0100
Subject: ns: add NS_COMMON_INIT()

Add an initializer that can be used for the ns common initialization of
static namespaces, such as most init namespaces.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/87ecqhy2y5.ffs@tglx
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f5b68b8abb54..3a72c3f81eca 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -119,6 +119,16 @@ void __ns_common_free(struct ns_common *ns);
 		struct user_namespace *:   CLONE_NEWUSER,   \
 		struct uts_namespace *:    CLONE_NEWUTS)
 
+#define NS_COMMON_INIT(nsname, refs)							\
+{											\
+	.ns_type		= ns_common_type(&nsname),				\
+	.ns_id			= 0,							\
+	.inum			= ns_init_inum(&nsname),				\
+	.ops			= to_ns_operations(&nsname),				\
+	.stashed		= NULL,							\
+	.__ns_ref		= REFCOUNT_INIT(refs),					\
+}
+
 #define ns_common_init(__ns)                     \
 	__ns_common_init(to_ns_common(__ns),     \
 			 ns_common_type(__ns),   \
-- 
cgit v1.2.3


From 3dd50c58664e2684bd610a57bf3ab713cbb0ea91 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:21 +0100
Subject: ns: initialize ns_list_node for initial namespaces

Make sure that the list is always initialized for initial namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-8-2e6f823ebdc0@kernel.org
Fixes: 885fc8ac0a4d ("nstree: make iterator generic")
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 3a72c3f81eca..71a5e28344d1 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -127,6 +127,7 @@ void __ns_common_free(struct ns_common *ns);
 	.ops			= to_ns_operations(&nsname),				\
 	.stashed		= NULL,							\
 	.__ns_ref		= REFCOUNT_INIT(refs),					\
+	.ns_list_node		= LIST_HEAD_INIT(nsname.ns.ns_list_node),		\
 }
 
 #define ns_common_init(__ns)                     \
-- 
cgit v1.2.3


From 6b053576edb12c7739ea9c7c9900031361922631 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:22 +0100
Subject: ns: add __ns_ref_read()

Implement ns_ref_read() the same way as ns_ref_{get,put}().
No point in making that any more special or different from the other
helpers.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-9-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 71a5e28344d1..5e09facafd93 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -154,7 +154,12 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
 	return refcount_inc_not_zero(&ns->__ns_ref);
 }
 
-#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+	return refcount_read(&ns->__ns_ref);
+}
+
+#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
 #define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
 #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
 #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-- 
cgit v1.2.3


From 4b06b70c8244b442d58ae0fb59870cf31fdb422e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:23 +0100
Subject: ns: rename to exit_nsproxy_namespaces()

The current naming is very misleading as this really isn't exiting all
of the task's namespaces. It is only exiting the namespaces that hang
off of nsproxy. Reflect that in the name.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-10-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/nsproxy.h | 2 +-
 kernel/cgroup/cgroup.c  | 6 +++---
 kernel/exit.c           | 2 +-
 kernel/fork.c           | 2 +-
 kernel/nsproxy.c        | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index bd118a187dec..538ba8dba184 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,7 +93,7 @@ static inline struct cred *nsset_cred(struct nsset *set)
  */
 
 int copy_namespaces(u64 flags, struct task_struct *tsk);
-void exit_task_namespaces(struct task_struct *tsk);
+void exit_nsproxy_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 int exec_task_namespaces(void);
 void free_nsproxy(struct nsproxy *ns);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df8bfd26d502..b758a9dd7526 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1519,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 	} else {
 		/*
 		 * NOTE: This function may be called from bpf_cgroup_from_id()
-		 * on a task which has already passed exit_task_namespaces() and
-		 * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
-		 * cgroups visible for lookups.
+		 * on a task which has already passed exit_nsproxy_namespaces()
+		 * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+		 * make all cgroups visible for lookups.
 		 */
 		return &cgrp_dfl_root.cgrp;
 	}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..825998103520 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -962,7 +962,7 @@ void __noreturn do_exit(long code)
 	exit_fs(tsk);
 	if (group_dead)
 		disassociate_ctty(1);
-	exit_task_namespaces(tsk);
+	exit_nsproxy_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..0926bfe4b8df 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2453,7 +2453,7 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
-	exit_task_namespaces(p);
+	exit_nsproxy_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
 		mm_clear_owner(p->mm, p);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 19aa64ab08c8..6ce76a0278ab 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -241,7 +241,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 		put_nsproxy(ns);
 }
 
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
 {
 	switch_task_namespaces(p, NULL);
 }
-- 
cgit v1.2.3


From 3a18f809184bc5a1cfad7cde5b8b026e2ff61587 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:24 +0100
Subject: ns: add active reference count

The namespace tree is, among other things, currently used to support
file handles for namespaces. When a namespace is created it is placed on
the namespace trees and when it is destroyed it is removed from the
namespace trees.

While a namespace is on the namespace trees with a valid reference count
it is possible to reopen it through a namespace file handle. This is all
fine but has some issues that should be addressed.

On current kernels a namespace is visible to userspace in the
following cases:

(1) The namespace is in use by a task.
(2) The namespace is persisted through a VFS object (namespace file
    descriptor or bind-mount).
    Note that (2) only cares about direct persistence of the namespace
    itself not indirectly via e.g., file->f_cred file references or
    similar.
(3) The namespace is a hierarchical namespace type and is the parent of
    a single or multiple child namespaces.

Case (3) is interesting because it is possible that a parent namespace
might not fulfill any of (1) or (2), i.e., is invisible to userspace but
it may still be resurrected through the NS_GET_PARENT ioctl().

Currently namespace file handles allow much broader access to namespaces
than what is currently possible via (1)-(3). The reason is that
namespaces may remain pinned for completely internal reasons yet are
inaccessible to userspace.

For example, a user namespace may remain pinned by get_cred() calls to
stash the opener's credentials into file->f_cred. As it stands file
handles allow to resurrect such a user namespace even though this
should not be possible via (1)-(3). This is a fundamental uapi change
that we shouldn't do if we don't have to.

Consider the following insane case: Various architectures support the
CONFIG_MMU_LAZY_TLB_REFCOUNT option which uses lazy TLB destruction.
When this option is set a userspace task's struct mm_struct may be used
for kernel threads such as the idle task and will only be destroyed once
the cpu's runqueue switches back to another task. But because of ptrace()
permission checks struct mm_struct stashes the user namespace of the
task that struct mm_struct originally belonged to. The kernel thread
will take a reference on the struct mm_struct and thus pin it.

So on an idle system user namespaces can be persisted for arbitrary
amounts of time which also means that they can be resurrected using
namespace file handles. That makes no sense whatsoever. The problem is
of course exacerbated on large systems with a huge number of cpus.

To handle this nicely we introduce an active reference count which
tracks (1)-(3). This is easy to do as all of these things are already
managed centrally. Only (1)-(3) will count towards the active reference
count and only namespaces which are active may be opened via namespace
file handles.

The problem is that namespaces may be resurrected. Which means that they
can become temporarily inactive and will be reactivated some time later.
Currently the only example of this is the SIOCGSKNS socket ioctl. The
SIOCGSKNS ioctl allows to open a network namespace file descriptor based
on a socket file descriptor.

If a socket is tied to a network namespace that subsequently becomes
inactive but that socket is persisted by another process in another
network namespace (e.g., via SCM_RIGHTS or pidfd_getfd()) then the
SIOCGSKNS ioctl will resurrect this network namespace.

So calls to open_related_ns() and open_namespace() will end up
resurrecting the corresponding namespace tree.

Note that the active reference count does not regulate the lifetime of
the namespace itself. This is still done by the normal reference count.
The active reference count can only be elevated if the regular reference
count is elevated.

The active reference count also doesn't regulate the presence of a
namespace on the namespace trees. It only regulates its visibility to
namespace file handles (and in later patches to listns()).

A namespace remains on the namespace trees from creation until its
actual destruction. This will allow the kernel to always reach any
namespace trivially and it will also enable subsystems like bpf to walk
the namespace lists on the system for tracing or general introspection
purposes.

Note that different namespaces have different visibility lifetimes on
current kernels. While most namespaces are immediately released when the
last task using them exits, the user- and pid namespace are persisted
and thus both remain accessible via /proc/<pid>/ns/<ns_type>.

The user namespace lifetime is aligned with struct cred and is only
released through exit_creds(). However, it becomes inaccessible to
userspace once the last task using it is reaped, i.e., when
release_task() is called and all proc entries are flushed. Similarly,
the pid namespace is also visible until the last task using it has been
reaped and the associated pid numbers are freed.

The active reference counts of the user- and pid namespace are
decremented once the task is reaped.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-11-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                 |  48 ++++++++++-
 include/linux/ns_common.h | 141 +++++++++++++++++++++++++++++-
 include/linux/nsfs.h      |   3 +
 include/linux/nsproxy.h   |   3 +
 kernel/cred.c             |   6 ++
 kernel/exit.c             |   1 +
 kernel/fork.c             |   1 +
 kernel/nscommon.c         | 214 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/nsproxy.c          |  23 +++++
 kernel/nstree.c           |   8 ++
 kernel/pid.c              |   5 ++
 11 files changed, 449 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8b53fd361177..0c35e4e54711 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = {
 static void nsfs_evict(struct inode *inode)
 {
 	struct ns_common *ns = inode->i_private;
+
+	__ns_ref_active_put(ns);
 	clear_inode(inode);
 	ns->ops->put(ns);
 }
@@ -419,6 +421,16 @@ static int nsfs_init_inode(struct inode *inode, void *data)
 	inode->i_mode |= S_IRUGO;
 	inode->i_fop = &ns_file_operations;
 	inode->i_ino = ns->inum;
+
+	/*
+	 * Bring the namespace subtree back to life if we have to. This
+	 * can happen when e.g., all processes using a network namespace
+	 * and all namespace files or namespace file bind-mounts have
+	 * died but there are still sockets pinning it. The SIOCGSKNS
+	 * ioctl on such a socket will resurrect the relevant namespace
+	 * subtree.
+	 */
+	__ns_ref_active_resurrect(ns);
 	return 0;
 }
 
@@ -495,7 +507,17 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 		if (ns->inum != fid->ns_inum)
 			return NULL;
 
-		if (!__ns_ref_get(ns))
+		/*
+		 * This is racy because we're not actually taking an
+		 * active reference. IOW, it could happen that the
+		 * namespace becomes inactive after this check.
+		 * We don't care because nsfs_init_inode() will just
+		 * resurrect the relevant namespace tree for us. If it
+		 * has been active here we just allow its resurrection.
+		 * We could try to take an active reference here and
+		 * then drop it again. But really, why bother.
+		 */
+		if (!ns_get_unless_inactive(ns))
 			return NULL;
 	}
 
@@ -615,3 +637,27 @@ void __init nsfs_init(void)
 	nsfs_root_path.mnt = nsfs_mnt;
 	nsfs_root_path.dentry = nsfs_mnt->mnt_root;
 }
+
+void nsproxy_ns_active_get(struct nsproxy *ns)
+{
+	ns_ref_active_get(ns->mnt_ns);
+	ns_ref_active_get(ns->uts_ns);
+	ns_ref_active_get(ns->ipc_ns);
+	ns_ref_active_get(ns->pid_ns_for_children);
+	ns_ref_active_get(ns->cgroup_ns);
+	ns_ref_active_get(ns->net_ns);
+	ns_ref_active_get(ns->time_ns);
+	ns_ref_active_get(ns->time_ns_for_children);
+}
+
+void nsproxy_ns_active_put(struct nsproxy *ns)
+{
+	ns_ref_active_put(ns->mnt_ns);
+	ns_ref_active_put(ns->uts_ns);
+	ns_ref_active_put(ns->ipc_ns);
+	ns_ref_active_put(ns->pid_ns_for_children);
+	ns_ref_active_put(ns->cgroup_ns);
+	ns_ref_active_put(ns->net_ns);
+	ns_ref_active_put(ns->time_ns);
+	ns_ref_active_put(ns->time_ns_for_children);
+}
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 5e09facafd93..bdd0df15ad9c 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -4,7 +4,9 @@
 
 #include <linux/refcount.h>
 #include <linux/rbtree.h>
+#include <linux/vfsdebug.h>
 #include <uapi/linux/sched.h>
+#include <uapi/linux/nsfs.h>
 
 struct proc_ns_operations;
 
@@ -37,6 +39,67 @@ extern const struct proc_ns_operations cgroupns_operations;
 extern const struct proc_ns_operations timens_operations;
 extern const struct proc_ns_operations timens_for_children_operations;
 
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ *     lifetime. Controls when the namespace structure itself is freed.
+ *     It also pins the namespace on the namespace trees whereas (2)
+ *     only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ *     Controls visibility of the namespace in the namespace trees.
+ *     Any live task that uses the namespace (via nsproxy or cred) holds
+ *     an active reference. Any open file descriptor or bind-mount of
+ *     the namespace holds an active reference. Once all tasks have
+ *     exited their namespaces and all file descriptors and
+ *     bind-mounts have been released the active reference count drops
+ *     to zero and the namespace becomes inactive. IOW, the namespace
+ *     cannot be listed or opened via file handles anymore.
+ *
+ *     Note that it is valid to transition from active to inactive and
+ *     back from inactive to active e.g., when resurrecting an inactive
+ *     namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ *   Namespace is actively used and visible to userspace. The namespace
+ *   can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ *   handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ *   No tasks are actively using the namespace and it isn't pinned by
+ *   any bind-mounts or open file descriptors anymore. But the namespace
+ *   is still kept alive by internal references. For example, the user
+ *   namespace could be pinned by an open file through file->f_cred
+ *   references when one of the now defunct tasks had opened a file and
+ *   handed the file descriptor off to another process via a UNIX
+ *   socket. Such references keep the namespace structure alive through
+ *   __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ *   No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ *   When the last task using the namespace exits it drops its active
+ *   references to all namespaces. However, user and pid namespaces
+ *   remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ *   An inactive namespace tree might be resurrected due to e.g., the
+ *   SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ *   When __ns_ref drops to zero the namespace is removed from the
+ *   namespace trees and the memory is freed (after RCU grace period).
+ *
+ * Initial namespaces:
+ *   Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ *   __ns_ref_active = 1 and remain active forever.
+ */
 struct ns_common {
 	u32 ns_type;
 	struct dentry *stashed;
@@ -48,6 +111,7 @@ struct ns_common {
 			u64 ns_id;
 			struct rb_node ns_tree_node;
 			struct list_head ns_list_node;
+			atomic_t __ns_ref_active; /* do not use directly */
 		};
 		struct rcu_head ns_rcu;
 	};
@@ -56,6 +120,13 @@ struct ns_common {
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
 void __ns_common_free(struct ns_common *ns);
 
+static __always_inline bool is_initial_namespace(struct ns_common *ns)
+{
+	VFS_WARN_ON_ONCE(ns->inum == 0);
+	return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
+				 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
+}
+
 #define to_ns_common(__ns)                                    \
 	_Generic((__ns),                                      \
 		struct cgroup_namespace *:       &(__ns)->ns, \
@@ -127,6 +198,7 @@ void __ns_common_free(struct ns_common *ns);
 	.ops			= to_ns_operations(&nsname),				\
 	.stashed		= NULL,							\
 	.__ns_ref		= REFCOUNT_INIT(refs),					\
+	.__ns_ref_active	= ATOMIC_INIT(1),					\
 	.ns_list_node		= LIST_HEAD_INIT(nsname.ns.ns_list_node),		\
 }
 
@@ -144,14 +216,26 @@ void __ns_common_free(struct ns_common *ns);
 
 #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
 
+static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
+{
+	return atomic_read(&ns->__ns_ref_active);
+}
+
 static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 {
-	return refcount_dec_and_test(&ns->__ns_ref);
+	if (refcount_dec_and_test(&ns->__ns_ref)) {
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+		return true;
+	}
+	return false;
 }
 
 static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
 {
-	return refcount_inc_not_zero(&ns->__ns_ref);
+	if (refcount_inc_not_zero(&ns->__ns_ref))
+		return true;
+	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+	return false;
 }
 
 static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
@@ -166,4 +250,57 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns
 #define ns_ref_put_and_lock(__ns, __lock) \
 	refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
 
+#define ns_ref_active_read(__ns) \
+	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
+
+void __ns_ref_active_get_owner(struct ns_common *ns);
+
+static __always_inline void __ns_ref_active_get(struct ns_common *ns)
+{
+	WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
+	VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0);
+}
+#define ns_ref_active_get(__ns) \
+	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
+
+static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
+{
+	if (atomic_inc_not_zero(&ns->__ns_ref_active)) {
+		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+		return true;
+	}
+	return false;
+}
+
+#define ns_ref_active_get_owner(__ns) \
+	do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0)
+
+void __ns_ref_active_put_owner(struct ns_common *ns);
+
+static __always_inline void __ns_ref_active_put(struct ns_common *ns)
+{
+	if (atomic_dec_and_test(&ns->__ns_ref_active)) {
+		VFS_WARN_ON_ONCE(is_initial_namespace(ns));
+		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+		__ns_ref_active_put_owner(ns);
+	}
+}
+#define ns_ref_active_put(__ns) \
+	do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
+
+static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
+{
+	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));
+	if (!__ns_ref_active_read(ns))
+		return NULL;
+	if (!__ns_ref_get(ns))
+		return NULL;
+	return ns;
+}
+
+void __ns_ref_active_resurrect(struct ns_common *ns);
+
+#define ns_ref_active_resurrect(__ns) \
+	do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0)
+
 #endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
index e5a5fa83d36b..731b67fc2fec 100644
--- a/include/linux/nsfs.h
+++ b/include/linux/nsfs.h
@@ -37,4 +37,7 @@ void nsfs_init(void);
 
 #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
 
+void nsproxy_ns_active_get(struct nsproxy *ns);
+void nsproxy_ns_active_put(struct nsproxy *ns);
+
 #endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 538ba8dba184..ac825eddec59 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,7 +93,10 @@ static inline struct cred *nsset_cred(struct nsset *set)
  */
 
 int copy_namespaces(u64 flags, struct task_struct *tsk);
+void switch_cred_namespaces(const struct cred *old, const struct cred *new);
 void exit_nsproxy_namespaces(struct task_struct *tsk);
+void get_cred_namespaces(struct task_struct *tsk);
+void exit_cred_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 int exec_task_namespaces(void);
 void free_nsproxy(struct nsproxy *ns);
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..a6e7f580df14 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -306,6 +306,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
 		kdebug("share_creds(%p{%ld})",
 		       p->cred, atomic_long_read(&p->cred->usage));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+		get_cred_namespaces(p);
 		return 0;
 	}
 
@@ -343,6 +344,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
 
 	p->cred = p->real_cred = get_cred(new);
 	inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+	get_cred_namespaces(p);
+
 	return 0;
 
 error_put:
@@ -435,10 +438,13 @@ int commit_creds(struct cred *new)
 	 */
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+	if (new->user_ns != old->user_ns)
+		switch_cred_namespaces(old, new);
 
 	/* send notifications */
 	if (!uid_eq(new->uid,   old->uid)  ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 825998103520..988e16efd66b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -291,6 +291,7 @@ repeat:
 	write_unlock_irq(&tasklist_lock);
 	/* @thread_pid can't go away until free_pids() below */
 	proc_flush_pid(thread_pid);
+	exit_cred_namespaces(p);
 	add_device_randomness(&p->se.sum_exec_runtime,
 			      sizeof(p->se.sum_exec_runtime));
 	free_pids(post.pids);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0926bfe4b8df..f1857672426e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct:
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
 	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+	exit_cred_namespaces(p);
 	exit_creds(p);
 bad_fork_free:
 	WRITE_ONCE(p->__state, TASK_DEAD);
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 238402b189f7..abd1ac1a2d02 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -3,6 +3,7 @@
 
 #include <linux/ns_common.h>
 #include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
 #include <linux/vfsdebug.h>
 
 #ifdef CONFIG_DEBUG_VFS
@@ -53,6 +54,8 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
 
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
 {
+	int ret;
+
 	refcount_set(&ns->__ns_ref, 1);
 	ns->stashed = NULL;
 	ns->ops = ops;
@@ -69,10 +72,219 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 		ns->inum = inum;
 		return 0;
 	}
-	return proc_alloc_inum(&ns->inum);
+	ret = proc_alloc_inum(&ns->inum);
+	if (ret)
+		return ret;
+	/*
+	 * Tree ref starts at 0. It's incremented when namespace enters
+	 * active use (installed in nsproxy) and decremented when all
+	 * active uses are gone. Initial namespaces are always active.
+	 */
+	if (is_initial_namespace(ns))
+		atomic_set(&ns->__ns_ref_active, 1);
+	else
+		atomic_set(&ns->__ns_ref_active, 0);
+	return 0;
 }
 
 void __ns_common_free(struct ns_common *ns)
 {
 	proc_free_inum(ns->inum);
 }
+
+static struct ns_common *ns_owner(struct ns_common *ns)
+{
+	struct user_namespace *owner;
+
+	if (unlikely(!ns->ops))
+		return NULL;
+	VFS_WARN_ON_ONCE(!ns->ops->owner);
+	owner = ns->ops->owner(ns);
+	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+	if (!owner)
+		return NULL;
+	/* Skip init_user_ns as it's always active */
+	if (owner == &init_user_ns)
+		return NULL;
+	return to_ns_common(owner);
+}
+
+void __ns_ref_active_get_owner(struct ns_common *ns)
+{
+	ns = ns_owner(ns);
+	if (ns)
+		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      +
+ *                        user_ns1 (2)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   +   +
+ *                        user_ns2 (3)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   -   +
+ *                        user_ns2 (2)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put_owner(struct ns_common *ns)
+{
+	for (;;) {
+		ns = ns_owner(ns);
+		if (!ns)
+			return;
+		if (!atomic_dec_and_test(&ns->__ns_ref_active))
+			return;
+	}
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      +
+ *                        user_ns1 (2)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   +   +
+ *                        user_ns2 (3)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   -   +
+ *                        user_ns2 (2)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        -   -   -
+ *                        user_ns2 (0)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        -   +   -
+ *                        user_ns2 (0)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_resurrect(struct ns_common *ns)
+{
+	/* If we didn't resurrect the namespace we're done. */
+	if (atomic_fetch_add(1, &ns->__ns_ref_active))
+		return;
+
+	/*
+	 * We did resurrect it. Walk the ownership hierarchy upwards
+	 * until we find an owning user namespace that is active.
+	 */
+	for (;;) {
+		ns = ns_owner(ns);
+		if (!ns)
+			return;
+
+		if (atomic_fetch_add(1, &ns->__ns_ref_active))
+			return;
+	}
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6ce76a0278ab..94c2cfe0afa1 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
+#include <linux/nstree.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -179,12 +180,15 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
 	if ((flags & CLONE_VM) == 0)
 		timens_on_fork(new_ns, tsk);
 
+	nsproxy_ns_active_get(new_ns);
 	tsk->nsproxy = new_ns;
 	return 0;
 }
 
 void free_nsproxy(struct nsproxy *ns)
 {
+	nsproxy_ns_active_put(ns);
+
 	put_mnt_ns(ns->mnt_ns);
 	put_uts_ns(ns->uts_ns);
 	put_ipc_ns(ns->ipc_ns);
@@ -232,6 +236,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 
 	might_sleep();
 
+	if (new)
+		nsproxy_ns_active_get(new);
+
 	task_lock(p);
 	ns = p->nsproxy;
 	p->nsproxy = new;
@@ -246,6 +253,22 @@ void exit_nsproxy_namespaces(struct task_struct *p)
 	switch_task_namespaces(p, NULL);
 }
 
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+	ns_ref_active_get(new->user_ns);
+	ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+	ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+	ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
 int exec_task_namespaces(void)
 {
 	struct task_struct *tsk = current;
diff --git a/kernel/nstree.c b/kernel/nstree.c
index 4eabab5fceaf..e2a537785128 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -123,6 +123,14 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	write_sequnlock(&ns_tree->ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
+
+	/*
+	 * Take an active reference on the owner namespace. This ensures
+	 * that the owner remains visible while any of its child namespaces
+	 * are active. For init namespaces this is a no-op as ns_owner()
+	 * returns NULL for namespaces owned by init_user_ns.
+	 */
+	__ns_ref_active_get_owner(ns);
 }
 
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
diff --git a/kernel/pid.c b/kernel/pid.c
index 19d4599c136c..a5a63dc0a491 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -112,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp)
 void free_pid(struct pid *pid)
 {
 	int i;
+	struct pid_namespace *active_ns;
 
 	lockdep_assert_not_held(&tasklist_lock);
 
+	active_ns = pid->numbers[pid->level].ns;
+	ns_ref_active_put(active_ns);
+
 	spin_lock(&pidmap_lock);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
@@ -278,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	}
 	spin_unlock(&pidmap_lock);
 	idr_preload_end();
+	ns_ref_active_get(ns);
 
 	return pid;
 
-- 
cgit v1.2.3


From 8895d2a3dbf49f23622ab8da9fb3909826edd6dc Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:25 +0100
Subject: ns: use anonymous struct to group list member

Make it easier to spot that they belong together conceptually.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-12-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index bdd0df15ad9c..32463203c824 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -109,8 +109,10 @@ struct ns_common {
 	union {
 		struct {
 			u64 ns_id;
-			struct rb_node ns_tree_node;
-			struct list_head ns_list_node;
+			struct /* per type rbtree and list */ {
+				struct rb_node ns_tree_node;
+				struct list_head ns_list_node;
+			};
 			atomic_t __ns_ref_active; /* do not use directly */
 		};
 		struct rcu_head ns_rcu;
-- 
cgit v1.2.3


From 2ccaebc686e9ef7e94b3a8d89706daed6e696667 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:26 +0100
Subject: nstree: introduce a unified tree

This will allow userspace to look up and stat a namespace simply by its
identifier without having to know what type of namespace it is.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-13-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h |  3 ++
 kernel/nscommon.c         |  1 +
 kernel/nstree.c           | 96 +++++++++++++++++++++++++++++++++++------------
 3 files changed, 76 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 32463203c824..7a3c71b3a76f 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -109,6 +109,9 @@ struct ns_common {
 	union {
 		struct {
 			u64 ns_id;
+			struct /* global namespace rbtree and list */ {
+				struct rb_node ns_unified_tree_node;
+			};
 			struct /* per type rbtree and list */ {
 				struct rb_node ns_tree_node;
 				struct list_head ns_list_node;
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index abd1ac1a2d02..17a6ea44f054 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -62,6 +62,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	ns->ns_id = 0;
 	ns->ns_type = ns_type;
 	RB_CLEAR_NODE(&ns->ns_tree_node);
+	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
 	INIT_LIST_HEAD(&ns->ns_list_node);
 
 #ifdef CONFIG_DEBUG_VFS
diff --git a/kernel/nstree.c b/kernel/nstree.c
index e2a537785128..bbb34b46b01b 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -5,31 +5,30 @@
 #include <linux/proc_ns.h>
 #include <linux/vfsdebug.h>
 
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */
+
 /**
  * struct ns_tree - Namespace tree
  * @ns_tree: Rbtree of namespaces of a particular type
  * @ns_list: Sequentially walkable list of all namespaces of this type
- * @ns_tree_lock: Seqlock to protect the tree and list
  * @type: type of namespaces in this tree
  */
 struct ns_tree {
-       struct rb_root ns_tree;
-       struct list_head ns_list;
-       seqlock_t ns_tree_lock;
-       int type;
+	struct rb_root ns_tree;
+	struct list_head ns_list;
+	int type;
 };
 
 struct ns_tree mnt_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWNS,
 };
 
 struct ns_tree net_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWNET,
 };
 EXPORT_SYMBOL_GPL(net_ns_tree);
@@ -37,42 +36,36 @@ EXPORT_SYMBOL_GPL(net_ns_tree);
 struct ns_tree uts_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWUTS,
 };
 
 struct ns_tree user_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWUSER,
 };
 
 struct ns_tree ipc_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWIPC,
 };
 
 struct ns_tree pid_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWPID,
 };
 
 struct ns_tree cgroup_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWCGROUP,
 };
 
 struct ns_tree time_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWTIME,
 };
 
@@ -85,6 +78,13 @@ static inline struct ns_common *node_to_ns(const struct rb_node *node)
 	return rb_entry(node, struct ns_common, ns_tree_node);
 }
 
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
+{
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct ns_common, ns_unified_tree_node);
+}
+
 static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct ns_common *ns_a = node_to_ns(a);
@@ -99,15 +99,27 @@ static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
 	return 0;
 }
 
+static inline int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+	struct ns_common *ns_a = node_to_ns_unified(a);
+	struct ns_common *ns_b = node_to_ns_unified(b);
+	u64 ns_id_a = ns_a->ns_id;
+	u64 ns_id_b = ns_b->ns_id;
+
+	if (ns_id_a < ns_id_b)
+		return -1;
+	if (ns_id_a > ns_id_b)
+		return 1;
+	return 0;
+}
+
 void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 {
 	struct rb_node *node, *prev;
 
 	VFS_WARN_ON_ONCE(!ns->ns_id);
 
-	write_seqlock(&ns_tree->ns_tree_lock);
-
-	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+	write_seqlock(&ns_tree_lock);
 
 	node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
 	/*
@@ -120,7 +132,8 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	else
 		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
 
-	write_sequnlock(&ns_tree->ns_tree_lock);
+	rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified);
+	write_sequnlock(&ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
 
@@ -139,11 +152,12 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
 	VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
 	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
 
-	write_seqlock(&ns_tree->ns_tree_lock);
+	write_seqlock(&ns_tree_lock);
 	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
 	list_bidir_del_rcu(&ns->ns_list_node);
 	RB_CLEAR_NODE(&ns->ns_tree_node);
-	write_sequnlock(&ns_tree->ns_tree_lock);
+	write_sequnlock(&ns_tree_lock);
 }
 EXPORT_SYMBOL_GPL(__ns_tree_remove);
 
@@ -159,6 +173,17 @@ static int ns_find(const void *key, const struct rb_node *node)
 	return 0;
 }
 
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+	const u64 ns_id = *(u64 *)key;
+	const struct ns_common *ns = node_to_ns_unified(node);
+
+	if (ns_id < ns->ns_id)
+		return -1;
+	if (ns_id > ns->ns_id)
+		return 1;
+	return 0;
+}
 
 static struct ns_tree *ns_tree_from_type(int ns_type)
 {
@@ -184,28 +209,51 @@ static struct ns_tree *ns_tree_from_type(int ns_type)
 	return NULL;
 }
 
-struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
 {
-	struct ns_tree *ns_tree;
 	struct rb_node *node;
 	unsigned int seq;
 
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+	do {
+		seq = read_seqbegin(&ns_tree_lock);
+		node = rb_find_rcu(&ns_id, &ns_unified_tree, ns_find_unified);
+		if (node)
+			break;
+	} while (read_seqretry(&ns_tree_lock, seq));
+
+	return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+	struct ns_tree *ns_tree;
+	struct rb_node *node;
+	unsigned int seq;
 
 	ns_tree = ns_tree_from_type(ns_type);
 	if (!ns_tree)
 		return NULL;
 
 	do {
-		seq = read_seqbegin(&ns_tree->ns_tree_lock);
+		seq = read_seqbegin(&ns_tree_lock);
 		node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
 		if (node)
 			break;
-	} while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+	} while (read_seqretry(&ns_tree_lock, seq));
 
 	return node_to_ns(node);
 }
 
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+	if (ns_type)
+		return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+	return __ns_unified_tree_lookup_rcu(ns_id);
+}
+
 /**
  * ns_tree_adjoined_rcu - find the next/previous namespace in the same
  * tree
-- 
cgit v1.2.3


From 3760342fd6312416491d536144e39297fa5b1950 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:28 +0100
Subject: nstree: assign fixed ids to the initial namespaces

The initial set of namespaces comes with fixed inode numbers, making it
easy for userspace to identify them solely based on that information.
This has long preceded anything here.

Similarly, let's assign fixed namespace ids for the initial namespaces.

Kill the cookie and use a sequentially increasing number. This has the
nice side-effect that the owning user namespace will always have a
namespace id that is smaller than any of its descendant namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-15-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c            |  2 +-
 include/linux/ns_common.h | 13 ++++++++++++-
 include/linux/nstree.h    | 15 +++++++++++----
 include/uapi/linux/nsfs.h | 14 ++++++++++++++
 kernel/nstree.c           | 13 ++++++++-----
 net/core/net_namespace.c  |  2 +-
 6 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b78dd48b6c3..eded33eeb647 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4094,7 +4094,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 		return ERR_PTR(ret);
 	}
 	if (!anon)
-		ns_tree_gen_id(&new_ns->ns);
+		ns_tree_gen_id(new_ns);
 	refcount_set(&new_ns->passive, 1);
 	new_ns->mounts = RB_ROOT;
 	init_waitqueue_head(&new_ns->poll);
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 7a3c71b3a76f..009a6dea724f 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -173,6 +173,17 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 		struct user_namespace *:   &init_user_ns,   \
 		struct uts_namespace *:    &init_uts_ns)
 
+#define ns_init_id(__ns)						\
+	_Generic((__ns),						\
+		struct cgroup_namespace *:	CGROUP_NS_INIT_ID,	\
+		struct ipc_namespace *:		IPC_NS_INIT_ID,		\
+		struct mnt_namespace *:		MNT_NS_INIT_ID,		\
+		struct net *:			NET_NS_INIT_ID,		\
+		struct pid_namespace *:		PID_NS_INIT_ID,		\
+		struct time_namespace *:	TIME_NS_INIT_ID,	\
+		struct user_namespace *:	USER_NS_INIT_ID,	\
+		struct uts_namespace *:		UTS_NS_INIT_ID)
+
 #define to_ns_operations(__ns)                                                                         \
 	_Generic((__ns),                                                                               \
 		struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
@@ -198,7 +209,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 #define NS_COMMON_INIT(nsname, refs)							\
 {											\
 	.ns_type		= ns_common_type(&nsname),				\
-	.ns_id			= 0,							\
+	.ns_id			= ns_init_id(&nsname),					\
 	.inum			= ns_init_inum(&nsname),				\
 	.ops			= to_ns_operations(&nsname),				\
 	.stashed		= NULL,							\
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 43aa262c0ea1..38674c6fa4f7 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -9,6 +9,7 @@
 #include <linux/seqlock.h>
 #include <linux/rculist.h>
 #include <linux/cookie.h>
+#include <uapi/linux/nsfs.h>
 
 extern struct ns_tree cgroup_ns_tree;
 extern struct ns_tree ipc_ns_tree;
@@ -30,7 +31,11 @@ extern struct ns_tree uts_ns_tree;
 		struct user_namespace *:   &(user_ns_tree),	\
 		struct uts_namespace *:    &(uts_ns_tree))
 
-u64 ns_tree_gen_id(struct ns_common *ns);
+#define ns_tree_gen_id(__ns)                 \
+	__ns_tree_gen_id(to_ns_common(__ns), \
+			 (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
+
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
 void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
 struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
@@ -38,9 +43,9 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
 					 struct ns_tree *ns_tree,
 					 bool previous);
 
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id)
 {
-	ns_tree_gen_id(ns);
+	__ns_tree_gen_id(ns, id);
 	__ns_tree_add_raw(ns, ns_tree);
 }
 
@@ -60,7 +65,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
  * This function assigns a new id to the namespace and adds it to the
  * appropriate namespace tree and list.
  */
-#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+#define ns_tree_add(__ns)                                   \
+	__ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \
+		      (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
 
 /**
  * ns_tree_remove - Remove a namespace from a namespace tree
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index e098759ec917..f8bc2aad74d6 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -67,4 +67,18 @@ struct nsfs_file_handle {
 #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
 #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
 
+enum init_ns_id {
+	IPC_NS_INIT_ID		= 1ULL,
+	UTS_NS_INIT_ID		= 2ULL,
+	USER_NS_INIT_ID		= 3ULL,
+	PID_NS_INIT_ID		= 4ULL,
+	CGROUP_NS_INIT_ID	= 5ULL,
+	TIME_NS_INIT_ID		= 6ULL,
+	NET_NS_INIT_ID		= 7ULL,
+	MNT_NS_INIT_ID		= 8ULL,
+#ifdef __KERNEL__
+	NS_LAST_INIT_ID		= MNT_NS_INIT_ID,
+#endif
+};
+
 #endif /* __LINUX_NSFS_H */
diff --git a/kernel/nstree.c b/kernel/nstree.c
index bbb34b46b01b..cf102c5bb849 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -69,8 +69,6 @@ struct ns_tree time_ns_tree = {
 	.type = CLONE_NEWTIME,
 };
 
-DEFINE_COOKIE(namespace_cookie);
-
 static inline struct ns_common *node_to_ns(const struct rb_node *node)
 {
 	if (!node)
@@ -285,15 +283,20 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
 /**
  * ns_tree_gen_id - generate a new namespace id
  * @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
  *
  * Generates a new namespace id and assigns it to the namespace. All
  * namespaces types share the same id space and thus can be compared
  * directly. IOW, when two ids of two namespace are equal, they are
  * identical.
  */
-u64 ns_tree_gen_id(struct ns_common *ns)
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
 {
-	guard(preempt)();
-	ns->ns_id = gen_cookie_next(&namespace_cookie);
+	static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+	if (id)
+		ns->ns_id = id;
+	else
+		ns->ns_id = atomic64_inc_return(&namespace_cookie);
 	return ns->ns_id;
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0e0f22d7b21..83cbec4afcb3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -439,7 +439,7 @@ static __net_init int setup_net(struct net *net)
 	LIST_HEAD(net_exit_list);
 	int error = 0;
 
-	net->net_cookie = ns_tree_gen_id(&net->ns);
+	net->net_cookie = ns_tree_gen_id(net);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
-- 
cgit v1.2.3


From 3c1a52f2a6c865464babe7a85c2796aa31cc9744 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:29 +0100
Subject: nstree: maintain list of owned namespaces

The namespace tree doesn't express the ownership concept of namespaces
appropriately. Maintain a list of directly owned namespaces per user
namespace. This will allow userspace and the kernel to use the listns()
system call to walk the namespace tree by owning user namespace. The
rbtree is used to find the relevant namespace entry point, which allows
iteration to continue, and the owner list can be used to walk the tree
completely lock free.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-16-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h |  8 ++++++
 kernel/nscommon.c         |  4 +++
 kernel/nstree.c           | 68 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 009a6dea724f..698aa2f7f486 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -116,6 +116,12 @@ struct ns_common {
 				struct rb_node ns_tree_node;
 				struct list_head ns_list_node;
 			};
+			struct /* namespace ownership rbtree and list */ {
+				struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */
+				struct list_head ns_owner; /* list of namespaces owned by this namespace */
+				struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */
+				struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */
+			};
 			atomic_t __ns_ref_active; /* do not use directly */
 		};
 		struct rcu_head ns_rcu;
@@ -216,6 +222,8 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 	.__ns_ref		= REFCOUNT_INIT(refs),					\
 	.__ns_ref_active	= ATOMIC_INIT(1),					\
 	.ns_list_node		= LIST_HEAD_INIT(nsname.ns.ns_list_node),		\
+	.ns_owner_entry		= LIST_HEAD_INIT(nsname.ns.ns_owner_entry),		\
+	.ns_owner		= LIST_HEAD_INIT(nsname.ns.ns_owner),			\
 }
 
 #define ns_common_init(__ns)                     \
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 17a6ea44f054..f0b7971392d2 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -63,7 +63,11 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	ns->ns_type = ns_type;
 	RB_CLEAR_NODE(&ns->ns_tree_node);
 	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
+	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
 	INIT_LIST_HEAD(&ns->ns_list_node);
+	ns->ns_owner_tree = RB_ROOT;
+	INIT_LIST_HEAD(&ns->ns_owner);
+	INIT_LIST_HEAD(&ns->ns_owner_entry);
 
 #ifdef CONFIG_DEBUG_VFS
 	ns_debug(ns, ops);
diff --git a/kernel/nstree.c b/kernel/nstree.c
index cf102c5bb849..1f54f914e30c 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -3,7 +3,9 @@
 
 #include <linux/nstree.h>
 #include <linux/proc_ns.h>
+#include <linux/rculist.h>
 #include <linux/vfsdebug.h>
+#include <linux/user_namespace.h>
 
 static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
 static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */
@@ -83,6 +85,13 @@ static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
 	return rb_entry(node, struct ns_common, ns_unified_tree_node);
 }
 
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct ns_common, ns_owner_tree_node);
+}
+
 static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct ns_common *ns_a = node_to_ns(a);
@@ -111,11 +120,27 @@ static inline int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
 	return 0;
 }
 
+static inline int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
+{
+	struct ns_common *ns_a = node_to_ns_owner(a);
+	struct ns_common *ns_b = node_to_ns_owner(b);
+	u64 ns_id_a = ns_a->ns_id;
+	u64 ns_id_b = ns_b->ns_id;
+
+	if (ns_id_a < ns_id_b)
+		return -1;
+	if (ns_id_a > ns_id_b)
+		return 1;
+	return 0;
+}
+
 void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 {
 	struct rb_node *node, *prev;
+	const struct proc_ns_operations *ops = ns->ops;
 
 	VFS_WARN_ON_ONCE(!ns->ns_id);
+	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
 
 	write_seqlock(&ns_tree_lock);
 
@@ -131,6 +156,30 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
 
 	rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified);
+
+	if (ops) {
+		struct user_namespace *user_ns;
+
+		VFS_WARN_ON_ONCE(!ops->owner);
+		user_ns = ops->owner(ns);
+		if (user_ns) {
+			struct ns_common *owner = &user_ns->ns;
+			VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+			/* Insert into owner's rbtree */
+			rb_find_add_rcu(&ns->ns_owner_tree_node, &owner->ns_owner_tree, ns_cmp_owner);
+
+			/* Insert into owner's list in sorted order */
+			prev = rb_prev(&ns->ns_owner_tree_node);
+			if (!prev)
+				list_add_rcu(&ns->ns_owner_entry, &owner->ns_owner);
+			else
+				list_add_rcu(&ns->ns_owner_entry, &node_to_ns_owner(prev)->ns_owner_entry);
+		} else {
+			/* Only the initial user namespace doesn't have an owner. */
+			VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+		}
+	}
 	write_sequnlock(&ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
@@ -146,6 +195,9 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
 {
+	const struct proc_ns_operations *ops = ns->ops;
+	struct user_namespace *user_ns;
+
 	VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
 	VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
 	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
@@ -153,8 +205,22 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
 	write_seqlock(&ns_tree_lock);
 	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
 	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
-	list_bidir_del_rcu(&ns->ns_list_node);
 	RB_CLEAR_NODE(&ns->ns_tree_node);
+
+	list_bidir_del_rcu(&ns->ns_list_node);
+
+	/* Remove from owner's rbtree if this namespace has an owner */
+	if (ops) {
+		user_ns = ops->owner(ns);
+		if (user_ns) {
+			struct ns_common *owner = &user_ns->ns;
+			rb_erase(&ns->ns_owner_tree_node, &owner->ns_owner_tree);
+			RB_CLEAR_NODE(&ns->ns_owner_tree_node);
+		}
+
+		list_bidir_del_rcu(&ns->ns_owner_entry);
+	}
+
 	write_sequnlock(&ns_tree_lock);
 }
 EXPORT_SYMBOL_GPL(__ns_tree_remove);
-- 
cgit v1.2.3


From 560e25e70fa40ec69f97f14207bde9bc18bec9b8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:31 +0100
Subject: nstree: add unified namespace list

Allow walking the unified namespace list completely locklessly.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-18-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h |  2 ++
 kernel/nscommon.c         |  1 +
 kernel/nstree.c           | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 698aa2f7f486..3f05dd7d40c7 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -111,6 +111,7 @@ struct ns_common {
 			u64 ns_id;
 			struct /* global namespace rbtree and list */ {
 				struct rb_node ns_unified_tree_node;
+				struct list_head ns_unified_list_node;
 			};
 			struct /* per type rbtree and list */ {
 				struct rb_node ns_tree_node;
@@ -224,6 +225,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 	.ns_list_node		= LIST_HEAD_INIT(nsname.ns.ns_list_node),		\
 	.ns_owner_entry		= LIST_HEAD_INIT(nsname.ns.ns_owner_entry),		\
 	.ns_owner		= LIST_HEAD_INIT(nsname.ns.ns_owner),			\
+	.ns_unified_list_node	= LIST_HEAD_INIT(nsname.ns.ns_unified_list_node),	\
 }
 
 #define ns_common_init(__ns)                     \
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index f0b7971392d2..4cbe1ecc8df0 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -65,6 +65,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
 	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
 	INIT_LIST_HEAD(&ns->ns_list_node);
+	INIT_LIST_HEAD(&ns->ns_unified_list_node);
 	ns->ns_owner_tree = RB_ROOT;
 	INIT_LIST_HEAD(&ns->ns_owner);
 	INIT_LIST_HEAD(&ns->ns_owner_entry);
diff --git a/kernel/nstree.c b/kernel/nstree.c
index 419d500d09df..dcad6a308547 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -9,6 +9,7 @@
 
 static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
 static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */
+static LIST_HEAD(ns_unified_list); /* protected by ns_tree_lock */
 
 /**
  * struct ns_tree - Namespace tree
@@ -137,7 +138,13 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	else
 		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
 
+	/* Add to unified tree and list */
 	rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified);
+	prev = rb_prev(&ns->ns_unified_tree_node);
+	if (!prev)
+		list_add_rcu(&ns->ns_unified_list_node, &ns_unified_list);
+	else
+		list_add_rcu(&ns->ns_unified_list_node, &node_to_ns_unified(prev)->ns_unified_list_node);
 
 	if (ops) {
 		struct user_namespace *user_ns;
@@ -186,11 +193,15 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
 
 	write_seqlock(&ns_tree_lock);
 	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
-	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
 	RB_CLEAR_NODE(&ns->ns_tree_node);
 
 	list_bidir_del_rcu(&ns->ns_list_node);
 
+	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
+	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
+
+	list_bidir_del_rcu(&ns->ns_unified_list_node);
+
 	/* Remove from owner's rbtree if this namespace has an owner */
 	if (ops) {
 		user_ns = ops->owner(ns);
-- 
cgit v1.2.3


From 76b6f5dfb3fda76fce1f9990d6fa58adc711122b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:32 +0100
Subject: nstree: add listns()

Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing existing namespace APIs.

Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:

1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
   running process but are kept alive by file descriptors, bind mounts,
   or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. No ordering or ownership.
5. No filtering per namespace type: Must always iterate and check all
   namespaces.

The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.

/*
 * @req: Pointer to struct ns_id_req specifying search parameters
 * @ns_ids: User buffer to receive namespace IDs
 * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
 * @flags: Reserved for future use (must be 0)
 */
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
               size_t nr_ns_ids, unsigned int flags);

Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code

/*
 * @size: Structure size
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 */
struct ns_id_req {
        __u32 size;         /* sizeof(struct ns_id_req) */
        __u32 spare;        /* Reserved, must be 0 */
        __u64 ns_id;        /* Last seen namespace ID (for pagination) */
        __u32 ns_type;      /* Filter by namespace type(s) */
        __u32 spare2;       /* Reserved, must be 0 */
        __u64 user_ns_id;   /* Filter by owning user namespace */
};

Example 1: List all namespaces

void list_all_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,          /* Start from beginning */
        .ns_type = 0,        /* All types */
        .user_ns_id = 0,     /* All user namespaces */
    };
    uint64_t ids[100];
    ssize_t ret;

    printf("All namespaces in the system:\n");
    do {
        ret = listns(&req, ids, 100, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }

        for (ssize_t i = 0; i < ret; i++)
            printf("  Namespace ID: %llu\n", (unsigned long long)ids[i]);

        /* Continue from last seen ID */
        if (ret > 0)
            req.ns_id = ids[ret - 1];
    } while (ret == 100);  /* Buffer was full, more may exist */
}

Example 2: List network namespaces only

void list_network_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS,   /* Only network namespaces */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Network namespaces: %zd found\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  netns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 3: List namespaces owned by current user namespace

void list_owned_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,                      /* All types */
        .user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Namespaces owned by my user namespace: %zd\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  ns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 4: List multiple namespace types

void list_network_and_mount_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS | MNT_NS,  /* Network and mount */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    printf("Network and mount namespaces: %zd found\n", ret);
}

Example 5: Pagination through large namespace sets

void list_all_with_pagination(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,
        .user_ns_id = 0,
    };
    uint64_t ids[50];
    size_t total = 0;
    ssize_t ret;

    printf("Enumerating all namespaces with pagination:\n");

    while (1) {
        ret = listns(&req, ids, 50, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }
        if (ret == 0)
            break;  /* No more namespaces */

        total += ret;
        printf("  Batch: %zd namespaces\n", ret);

        /* Last ID in this batch becomes start of next batch */
        req.ns_id = ids[ret - 1];

        if (ret < 50)
            break;  /* Partial batch = end of results */
    }

    printf("Total: %zu namespaces\n", total);
}

Permission Model

listns() respects namespace isolation and capabilities:

(1) Global listing (user_ns_id = 0):
    - Requires CAP_SYS_ADMIN in the namespace's owning user namespace
    - OR the namespace must be in the caller's namespace context (e.g.,
      a namespace the caller is currently using)
    - User namespaces additionally allow listing if the caller has
      CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
    - Requires CAP_SYS_ADMIN in the specified owner user namespace
    - OR the namespace must be in the caller's namespace context
    - This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
    - Only "active" namespaces are listed
    - A namespace is active if it has a non-zero __ns_ref_active count
    - This includes namespaces used by running processes, held by open
      file descriptors, or kept active by bind mounts
    - Inactive namespaces (kept alive only by internal kernel
      references) are not visible via listns()

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                      |  39 ++++
 include/linux/ns_common.h      |   2 +
 include/linux/syscalls.h       |   4 +
 include/linux/user_namespace.h |   4 +-
 include/uapi/linux/nsfs.h      |  44 +++++
 kernel/nscommon.c              |   2 +-
 kernel/nstree.c                | 397 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 489 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 4a95a0a38f86..ba6c8975c82e 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -471,6 +471,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 	return FILEID_NSFS;
 }
 
+bool is_current_namespace(struct ns_common *ns)
+{
+	switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+	case CLONE_NEWCGROUP:
+		return current_in_namespace(to_cg_ns(ns));
+#endif
+#ifdef CONFIG_IPC_NS
+	case CLONE_NEWIPC:
+		return current_in_namespace(to_ipc_ns(ns));
+#endif
+	case CLONE_NEWNS:
+		return current_in_namespace(to_mnt_ns(ns));
+#ifdef CONFIG_NET_NS
+	case CLONE_NEWNET:
+		return current_in_namespace(to_net_ns(ns));
+#endif
+#ifdef CONFIG_PID_NS
+	case CLONE_NEWPID:
+		return current_in_namespace(to_pid_ns(ns));
+#endif
+#ifdef CONFIG_TIME_NS
+	case CLONE_NEWTIME:
+		return current_in_namespace(to_time_ns(ns));
+#endif
+#ifdef CONFIG_USER_NS
+	case CLONE_NEWUSER:
+		return current_in_namespace(to_user_ns(ns));
+#endif
+#ifdef CONFIG_UTS_NS
+	case CLONE_NEWUTS:
+		return current_in_namespace(to_uts_ns(ns));
+#endif
+	default:
+		VFS_WARN_ON_ONCE(true);
+		return false;
+	}
+}
+
 static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 					int fh_len, int fh_type)
 {
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 3f05dd7d40c7..bd4492ef6ffc 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -129,8 +129,10 @@ struct ns_common {
 	};
 };
 
+bool is_current_namespace(struct ns_common *ns);
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
 void __ns_common_free(struct ns_common *ns);
+struct ns_common *__must_check ns_owner(struct ns_common *ns);
 
 static __always_inline bool is_initial_namespace(struct ns_common *ns)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19..cf84d98964b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
 struct cachestat;
 struct statmount;
 struct mnt_id_req;
+struct ns_id_req;
 struct xattr_args;
 struct file_attr;
 
@@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
 asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
 			      u64 __user *mnt_ids, size_t nr_mnt_ids,
 			      unsigned int flags);
+asmlinkage long sys_listns(const struct ns_id_req __user *req,
+			   u64 __user *ns_ids, size_t nr_ns_ids,
+			   unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
 #if BITS_PER_LONG == 32
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9a9aebbf96b9..9c3be157397e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 	ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX;
 }
 
-#ifdef CONFIG_USER_NS
-
 static inline struct user_namespace *to_user_ns(struct ns_common *ns)
 {
 	return container_of(ns, struct user_namespace, ns);
 }
 
+#ifdef CONFIG_USER_NS
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index f8bc2aad74d6..a25e38d1c874 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -81,4 +81,48 @@ enum init_ns_id {
 #endif
 };
 
+enum ns_type {
+	TIME_NS    = (1ULL << 7),  /* CLONE_NEWTIME */
+	MNT_NS     = (1ULL << 17), /* CLONE_NEWNS */
+	CGROUP_NS  = (1ULL << 25), /* CLONE_NEWCGROUP */
+	UTS_NS     = (1ULL << 26), /* CLONE_NEWUTS */
+	IPC_NS     = (1ULL << 27), /* CLONE_NEWIPC */
+	USER_NS    = (1ULL << 28), /* CLONE_NEWUSER */
+	PID_NS     = (1ULL << 29), /* CLONE_NEWPID */
+	NET_NS     = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @ns_type: mask of namespace types to list (zero lists all types)
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @ns_id represents the last listed namespace id (or zero).
+ */
+struct ns_id_req {
+	__u32 size;
+	__u32 spare;
+	__u64 ns_id;
+	struct /* listns */ {
+		__u32 ns_type;
+		__u32 spare2;
+		__u64 user_ns_id;
+	};
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
 #endif /* __LINUX_NSFS_H */
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 4cbe1ecc8df0..6fe1c747fa46 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -98,7 +98,7 @@ void __ns_common_free(struct ns_common *ns)
 	proc_free_inum(ns->inum);
 }
 
-static struct ns_common *ns_owner(struct ns_common *ns)
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
 {
 	struct user_namespace *owner;
 
diff --git a/kernel/nstree.c b/kernel/nstree.c
index dcad6a308547..4a8838683b6b 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -5,6 +5,7 @@
 #include <linux/proc_ns.h>
 #include <linux/rculist.h>
 #include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
 #include <linux/user_namespace.h>
 
 static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
@@ -359,3 +360,399 @@ u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
 		ns->ns_id = atomic64_inc_return(&namespace_cookie);
 	return ns->ns_id;
 }
+
+struct klistns {
+	u64 __user *uns_ids;
+	u32 nr_ns_ids;
+	u64 last_ns_id;
+	u64 user_ns_id;
+	u32 ns_type;
+	struct user_namespace *user_ns;
+	bool userns_capable;
+	struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+	if (kls->user_ns_id != LISTNS_CURRENT_USER)
+		put_user_ns(kls->user_ns);
+	if (kls->first_ns && kls->first_ns->ops)
+		kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+			  struct ns_id_req *kreq)
+{
+	int ret;
+	size_t usize;
+
+	BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+	ret = get_user(usize, &req->size);
+	if (ret)
+		return -EFAULT;
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+		return -EINVAL;
+	memset(kreq, 0, sizeof(*kreq));
+	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+	if (ret)
+		return ret;
+	if (kreq->spare != 0)
+		return -EINVAL;
+	if (kreq->ns_type & ~NS_ALL)
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+				  u64 __user *ns_ids, size_t nr_ns_ids)
+{
+	kls->last_ns_id = kreq->ns_id;
+	kls->user_ns_id = kreq->user_ns_id;
+	kls->nr_ns_ids	= nr_ns_ids;
+	kls->ns_type	= kreq->ns_type;
+	kls->uns_ids	= ns_ids;
+	return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+	struct ns_common *ret = NULL;
+	struct rb_node *node;
+
+	VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+	read_seqlock_excl(&ns_tree_lock);
+	node = owner->ns_owner_tree.rb_node;
+
+	while (node) {
+		struct ns_common *ns;
+
+		ns = node_to_ns_owner(node);
+		if (ns_id <= ns->ns_id) {
+			ret = ns;
+			if (ns_id == ns->ns_id)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+
+	if (ret)
+		ret = ns_get_unless_inactive(ret);
+	read_sequnlock_excl(&ns_tree_lock);
+	return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type)
+{
+	struct ns_common *ns;
+
+	guard(rcu)();
+	ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type);
+	if (!ns)
+		return NULL;
+
+	if (!ns_get_unless_inactive(ns))
+		return NULL;
+
+	return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+					     const struct ns_common *ns)
+{
+	return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+					    struct ns_common *ns)
+{
+	if (kls->user_ns) {
+		if (kls->userns_capable)
+			return true;
+	} else {
+		struct ns_common *owner;
+		struct user_namespace *user_ns;
+
+		owner = ns_owner(ns);
+		if (owner)
+			user_ns = to_user_ns(owner);
+		else
+			user_ns = &init_user_ns;
+		if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+			return true;
+	}
+
+	if (is_current_namespace(ns))
+		return true;
+
+	if (ns->ns_type != CLONE_NEWUSER)
+		return false;
+
+	if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+		return true;
+
+	return false;
+}
+
+static void __ns_put(struct ns_common *ns)
+{
+	if (ns->ops)
+		ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+							   struct ns_common *candidate)
+{
+	struct ns_common *ns __free(ns_put) = NULL;
+
+	if (!ns_requested(kls, candidate))
+		return NULL;
+
+	ns = ns_get_unless_inactive(candidate);
+	if (!ns)
+		return NULL;
+
+	if (!may_list_ns(kls, ns))
+		return NULL;
+
+	return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+	u64 __user *ns_ids = kls->uns_ids;
+	size_t nr_ns_ids = kls->nr_ns_ids;
+	struct ns_common *ns = NULL, *first_ns = NULL;
+	const struct list_head *head;
+	ssize_t ret;
+
+	VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+	if (kls->user_ns_id == LISTNS_CURRENT_USER)
+		ns = to_ns_common(current_user_ns());
+	else if (kls->user_ns_id)
+		ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+	if (!ns)
+		return -EINVAL;
+	kls->user_ns = to_user_ns(ns);
+
+	/*
+	 * Use the rbtree to find the first namespace we care about and
+	 * then use its list entry to iterate from there.
+	 */
+	if (kls->last_ns_id) {
+		kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+		if (!kls->first_ns)
+			return -ENOENT;
+		first_ns = kls->first_ns;
+	}
+
+	ret = 0;
+	head = &to_ns_common(kls->user_ns)->ns_owner;
+	kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+	rcu_read_lock();
+
+	if (!first_ns)
+		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry);
+	for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids;
+	     ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) {
+		struct ns_common *valid __free(ns_put);
+
+		valid = legitimize_ns(kls, ns);
+		if (!valid)
+			continue;
+
+		rcu_read_unlock();
+
+		if (put_user(valid->ns_id, ns_ids + ret))
+			return -EINVAL;
+		nr_ns_ids--;
+		ret++;
+
+		rcu_read_lock();
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+	struct ns_common *ret = NULL;
+	struct ns_tree *ns_tree = NULL;
+	struct rb_node *node;
+
+	if (ns_type) {
+		ns_tree = ns_tree_from_type(ns_type);
+		if (!ns_tree)
+			return NULL;
+	}
+
+	read_seqlock_excl(&ns_tree_lock);
+	if (ns_tree)
+		node = ns_tree->ns_tree.rb_node;
+	else
+		node = ns_unified_tree.rb_node;
+
+	while (node) {
+		struct ns_common *ns;
+
+		if (ns_type)
+			ns = node_to_ns(node);
+		else
+			ns = node_to_ns_unified(node);
+
+		if (ns_id <= ns->ns_id) {
+			if (ns_type)
+				ret = node_to_ns(node);
+			else
+				ret = node_to_ns_unified(node);
+			if (ns_id == ns->ns_id)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+
+	if (ret)
+		ret = ns_get_unless_inactive(ret);
+	read_sequnlock_excl(&ns_tree_lock);
+	return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+						struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return list_entry_rcu(head->next, struct ns_common, ns_list_node);
+	return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+					       struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node);
+	return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+				     const struct list_head *head,
+				     struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return &ns->ns_list_node == head;
+	return &ns->ns_unified_list_node == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+	u64 __user *ns_ids = kls->uns_ids;
+	size_t nr_ns_ids = kls->nr_ns_ids;
+	struct ns_common *ns, *first_ns = NULL;
+	struct ns_tree *ns_tree = NULL;
+	const struct list_head *head;
+	u32 ns_type;
+	ssize_t ret;
+
+	if (hweight32(kls->ns_type) == 1)
+		ns_type = kls->ns_type;
+	else
+		ns_type = 0;
+
+	if (ns_type) {
+		ns_tree = ns_tree_from_type(ns_type);
+		if (!ns_tree)
+			return -EINVAL;
+	}
+
+	if (kls->last_ns_id) {
+		kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+		if (!kls->first_ns)
+			return -ENOENT;
+		first_ns = kls->first_ns;
+	}
+
+	ret = 0;
+	if (ns_tree)
+		head = &ns_tree->ns_list;
+	else
+		head = &ns_unified_list;
+
+	rcu_read_lock();
+
+	if (!first_ns)
+		first_ns = first_ns_common(head, ns_tree);
+
+	for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+	     ns = next_ns_common(ns, ns_tree)) {
+		struct ns_common *valid __free(ns_put);
+
+		valid = legitimize_ns(kls, ns);
+		if (!valid)
+			continue;
+
+		rcu_read_unlock();
+
+		if (put_user(valid->ns_id, ns_ids + ret))
+			return -EINVAL;
+
+		nr_ns_ids--;
+		ret++;
+
+		rcu_read_lock();
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+		u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+	struct klistns klns __free(klistns_free) = {};
+	const size_t maxcount = 1000000;
+	struct ns_id_req kreq;
+	ssize_t ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (unlikely(nr_ns_ids > maxcount))
+		return -EOVERFLOW;
+
+	if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+		return -EFAULT;
+
+	ret = copy_ns_id_req(req, &kreq);
+	if (ret)
+		return ret;
+
+	ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+	if (ret)
+		return ret;
+
+	if (kreq.user_ns_id)
+		return do_listns_userns(&klns);
+
+	return do_listns(&klns);
+}
-- 
cgit v1.2.3


From b36d4b6aa88ef039647228b98c59a875e92f8c8e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 29 Oct 2025 13:20:33 +0100
Subject: arch: hookup listns() system call

Add the listns() system call to all architectures.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-20-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/arm64/tools/syscall_32.tbl             | 1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/uapi/asm-generic/unistd.h           | 4 +++-
 scripts/syscall.tbl                         | 1 +
 18 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 16dca28ebf17..3fed97478058 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -509,3 +509,4 @@
 577	common	open_tree_attr			sys_open_tree_attr
 578	common	file_getattr			sys_file_getattr
 579	common	file_setattr			sys_file_setattr
+580	common	listns				sys_listns
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..fd09afae72a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 8d9088bc577d..8cdfe5d4dac9 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -481,3 +481,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f41d38dfbf13..871a5d67bf41 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 580af574fe73..022fc85d94b3 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index d824ffe9a014..8cedc83c3266 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -408,3 +408,4 @@
 467	n32	open_tree_attr			sys_open_tree_attr
 468	n32	file_getattr			sys_file_getattr
 469	n32	file_setattr			sys_file_setattr
+470	n32	listns				sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 7a7049c2c307..9b92bddf06b5 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -384,3 +384,4 @@
 467	n64	open_tree_attr			sys_open_tree_attr
 468	n64	file_getattr			sys_file_getattr
 469	n64	file_setattr			sys_file_setattr
+470	n64	listns				sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index d330274f0601..f810b8a55716 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -457,3 +457,4 @@
 467	o32	open_tree_attr			sys_open_tree_attr
 468	o32	file_getattr			sys_file_getattr
 469	o32	file_setattr			sys_file_setattr
+470	o32	listns				sys_listns
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 88a788a7b18d..39bdacaa530b 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -468,3 +468,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index b453e80dfc00..ec4458cdb97b 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -560,3 +560,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 8a6744d658db..5863787ab036 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -472,3 +472,4 @@
 467  common	open_tree_attr		sys_open_tree_attr		sys_open_tree_attr
 468  common	file_getattr		sys_file_getattr		sys_file_getattr
 469  common	file_setattr		sys_file_setattr		sys_file_setattr
+470  common	listns			sys_listns			sys_listns
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 5e9c9eff5539..969c11325ade 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index ebb7d06d1044..39aa26b6a50b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4877e16da69a..e979a3eac7a3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -475,3 +475,4 @@
 467	i386	open_tree_attr		sys_open_tree_attr
 468	i386	file_getattr		sys_file_getattr
 469	i386	file_setattr		sys_file_setattr
+470	i386	listns			sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ced2a1deecd7..8a4ac4841be6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -394,6 +394,7 @@
 467	common	open_tree_attr		sys_open_tree_attr
 468	common	file_getattr		sys_file_getattr
 469	common	file_setattr		sys_file_setattr
+470	common	listns			sys_listns
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 374e4cb788d8..438a3b170402 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..942370b3f5d2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
 __SYSCALL(__NR_file_getattr, sys_file_getattr)
 #define __NR_file_setattr 469
 __SYSCALL(__NR_file_setattr, sys_file_setattr)
+#define __NR_listns 470
+__SYSCALL(__NR_listns, sys_listns)
 
 #undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
 
 /*
  * 32 bit systems traditionally used different
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index d1ae5e92c615..e74868be513c 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -410,3 +410,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
-- 
cgit v1.2.3


From 16dad7801aad73138a2dff5ea950130646914d1f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Oct 2025 20:19:15 -1000
Subject: cgroup: Rename cgroup lifecycle hooks to cgroup_task_*()

The current names cgroup_exit(), cgroup_release(), and cgroup_free() are
confusing because they look like they're operating on cgroups themselves when
they're actually task lifecycle hooks. For example, cgroup_init() initializes
the cgroup subsystem while cgroup_exit() is a task exit notification to
cgroup. Rename them to cgroup_task_exit(), cgroup_task_release(), and
cgroup_task_free() to make it clear that these operate on tasks.

Cc: Dan Schatzberg <dschatzberg@meta.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h   | 12 ++++++------
 kernel/cgroup/cgroup.c   | 11 ++++++-----
 kernel/exit.c            |  4 ++--
 kernel/fork.c            |  2 +-
 kernel/sched/autogroup.c |  4 ++--
 5 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6ed477338b16..4068035176c4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -137,9 +137,9 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 			       struct kernel_clone_args *kargs);
 extern void cgroup_post_fork(struct task_struct *p,
 			     struct kernel_clone_args *kargs);
-void cgroup_exit(struct task_struct *p);
-void cgroup_release(struct task_struct *p);
-void cgroup_free(struct task_struct *p);
+void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_release(struct task_struct *p);
+void cgroup_task_free(struct task_struct *p);
 
 int cgroup_init_early(void);
 int cgroup_init(void);
@@ -680,9 +680,9 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 				      struct kernel_clone_args *kargs) {}
 static inline void cgroup_post_fork(struct task_struct *p,
 				    struct kernel_clone_args *kargs) {}
-static inline void cgroup_exit(struct task_struct *p) {}
-static inline void cgroup_release(struct task_struct *p) {}
-static inline void cgroup_free(struct task_struct *p) {}
+static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_release(struct task_struct *p) {}
+static inline void cgroup_task_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6ae5f48cf64e..826b7fd2f85d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,8 @@ static void css_set_move_task(struct task_struct *task,
 		/*
 		 * We are synchronized through cgroup_threadgroup_rwsem
 		 * against PF_EXITING setting such that we can't race
-		 * against cgroup_exit()/cgroup_free() dropping the css_set.
+		 * against cgroup_task_exit()/cgroup_task_free() dropping
+		 * the css_set.
 		 */
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
@@ -6972,13 +6973,13 @@ void cgroup_post_fork(struct task_struct *child,
 }
 
 /**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
  *
  * Description: Detach cgroup from @tsk.
  *
  */
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
@@ -7010,7 +7011,7 @@ void cgroup_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_release(struct task_struct *task)
+void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -7027,7 +7028,7 @@ void cgroup_release(struct task_struct *task)
 	}
 }
 
-void cgroup_free(struct task_struct *task)
+void cgroup_task_free(struct task_struct *task)
 {
 	struct css_set *cset = task_css_set(task);
 	put_css_set(cset);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..46173461e8de 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -257,7 +257,7 @@ repeat:
 	rcu_read_unlock();
 
 	pidfs_exit(p);
-	cgroup_release(p);
+	cgroup_task_release(p);
 
 	/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
 	thread_pid = task_pid(p);
@@ -967,7 +967,7 @@ void __noreturn do_exit(long code)
 	exit_thread(tsk);
 
 	sched_autogroup_exit_task(tsk);
-	cgroup_exit(tsk);
+	cgroup_task_exit(tsk);
 
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..960c39c9c264 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -738,7 +738,7 @@ void __put_task_struct(struct task_struct *tsk)
 	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
-	cgroup_free(tsk);
+	cgroup_task_free(tsk);
 	task_numa_free(tsk, true);
 	security_task_free(tsk);
 	exit_creds(tsk);
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index cdea931aae30..954137775f38 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	 * this process can already run with task_group() == prev->tg or we can
 	 * race with cgroup code which can read autogroup = prev under rq->lock.
 	 * In the latter case for_each_thread() can not miss a migrating thread,
-	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
-	 * can't be removed from thread list, we hold ->siglock.
+	 * cpu_cgroup_attach() must not be possible after cgroup_task_exit()
+	 * and it can't be removed from thread list, we hold ->siglock.
 	 *
 	 * If an exiting thread was already removed from thread list we rely on
 	 * sched_autogroup_exit_task().
-- 
cgit v1.2.3


From d245698d727ab8f5420b3e28d1243f96a5234851 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Oct 2025 20:19:17 -1000
Subject: cgroup: Defer task cgroup unlink until after the task is done
 switching out

When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task
from its cgroup. From the cgroup's perspective, the task is now gone. If this
makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks
that notify controllers the cgroup is going offline resource-wise.

However, the exiting task can still run, perform memory operations, and schedule
until the final context switch in finish_task_switch(). This creates a confusing
situation where controllers are told a cgroup is offline while resource
activities are still happening in it. While this hasn't broken existing
controllers, it has caused direct confusion for sched_ext schedulers.

Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls
the subsystem exit callbacks and continues to be called from do_exit(). The
css_set cleanup is moved to the new cgroup_task_dead() which is called from
finish_task_switch() after the final context switch, so that the cgroup only
appears empty after the task is truly done running.

This also reorders operations so that subsys->exit() is now called before
unlinking from the cgroup, which shouldn't break anything.

Cc: Dan Schatzberg <dschatzberg@meta.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  2 ++
 kernel/cgroup/cgroup.c | 23 ++++++++++++++---------
 kernel/sched/core.c    |  2 ++
 3 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4068035176c4..bc892e3b37ee 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
 			     struct kernel_clone_args *kargs);
 void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_dead(struct task_struct *p);
 void cgroup_task_release(struct task_struct *p);
 void cgroup_task_free(struct task_struct *p);
 
@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
 				    struct kernel_clone_args *kargs) {}
 static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_dead(struct task_struct *p) {}
 static inline void cgroup_task_release(struct task_struct *p) {}
 static inline void cgroup_task_free(struct task_struct *p) {}
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b3c27900c5d2..aae180d56c8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task,
 		/*
 		 * We are synchronized through cgroup_threadgroup_rwsem
 		 * against PF_EXITING setting such that we can't race
-		 * against cgroup_task_exit()/cgroup_task_free() dropping
+		 * against cgroup_task_dead()/cgroup_task_free() dropping
 		 * the css_set.
 		 */
 		WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child,
 void cgroup_task_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
-	struct css_set *cset;
 	int i;
 
-	spin_lock_irq(&css_set_lock);
+	/* see cgroup_post_fork() for details */
+	do_each_subsys_mask(ss, i, have_exit_callback) {
+		ss->exit(tsk);
+	} while_each_subsys_mask();
+}
+
+void cgroup_task_dead(struct task_struct *tsk)
+{
+	struct css_set *cset;
+	unsigned long flags;
+
+	spin_lock_irqsave(&css_set_lock, flags);
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
@@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 
-	spin_unlock_irq(&css_set_lock);
-
-	/* see cgroup_post_fork() for details */
-	do_each_subsys_mask(ss, i, have_exit_callback) {
-		ss->exit(tsk);
-	} while_each_subsys_mask();
+	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
 void cgroup_task_release(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ebf67b48e2..40f12e37f60f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
+		cgroup_task_dead(prev);
+
 		/* Task is done with its stack. */
 		put_task_stack(prev);
 
-- 
cgit v1.2.3


From 7900aa699c34401cf5d0c701d9ef72880ddc1a83 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Nov 2025 10:25:13 -1000
Subject: sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to
 finish_task_switch()

sched_ext_free() was called from __put_task_struct() when the last reference
to the task is dropped, which could be long after the task has finished
running. This causes cgroup-related problems:

- ops.init_task() can be called on a cgroup which didn't get ops.cgroup_init()'d
  during scheduler load, because the cgroup might be destroyed/unlinked
  while the zombie or dead task is still lingering on the scx_tasks list.

- ops.cgroup_exit() could be called before ops.exit_task() is called on all
  member tasks, leading to incorrect exit ordering.

Fix by moving it to finish_task_switch() to be called right after the final
context switch away from the dying task, matching when sched_class->task_dead()
is called. Rename it to sched_ext_dead() to match the new calling context.

By calling sched_ext_dead() before cgroup_task_dead(), we ensure that:

- Tasks visible on scx_tasks list have valid cgroups during scheduler load,
  as cgroup_mutex prevents cgroup destruction while the task is still linked.

- All member tasks have ops.exit_task() called and are removed from scx_tasks
  before the cgroup can be destroyed and trigger ops.cgroup_exit().

This fix is made possible by the cgroup_task_dead() split in the previous patch.

This also makes more sense resource-wise as there's no point in keeping
scheduler side resources around for dead tasks.

Reported-by: Dan Schatzberg <dschatzberg@meta.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 4 ++--
 kernel/fork.c             | 1 -
 kernel/sched/core.c       | 6 ++++++
 kernel/sched/ext.c        | 2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 4713f374acc0..eb776b094d36 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -208,14 +208,14 @@ struct sched_ext_entity {
 	struct list_head	tasks_node;
 };
 
-void sched_ext_free(struct task_struct *p);
+void sched_ext_dead(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
 bool scx_rcu_cpu_stall(void);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
-static inline void sched_ext_free(struct task_struct *p) {}
+static inline void sched_ext_dead(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
 static inline bool scx_rcu_cpu_stall(void) { return false; }
diff --git a/kernel/fork.c b/kernel/fork.c
index 960c39c9c264..5ae37909a813 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	unwind_task_free(tsk);
-	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_task_free(tsk);
 	task_numa_free(tsk, true);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0324457622d7..3f4653106216 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5151,6 +5151,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
+		/*
+		 * sched_ext_dead() must come before cgroup_task_dead() to
+		 * prevent cgroups from being removed while its member tasks are
+		 * visible to SCX schedulers.
+		 */
+		sched_ext_dead(prev);
 		cgroup_task_dead(prev);
 
 		/* Task is done with its stack. */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d1ef5bda95ae..2811e4f42a37 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2966,7 +2966,7 @@ void scx_cancel_fork(struct task_struct *p)
 	percpu_up_read(&scx_fork_rwsem);
 }
 
-void sched_ext_free(struct task_struct *p)
+void sched_ext_dead(struct task_struct *p)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 98c92de40f6ab05452f8919cc2ff800ade5dd9a3 Mon Sep 17 00:00:00 2001
From: Komal Bajaj <komal.bajaj@oss.qualcomm.com>
Date: Mon, 3 Nov 2025 16:53:10 +0530
Subject: dt-bindings: arm: qcom,ids: Add SoC ID for QCS6490

Add unique ID for Qualcomm QCS6490 SoC.

Signed-off-by: Komal Bajaj <komal.bajaj@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251103-qcs6490_soc_id-v1-1-c139dd1e32c8@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index 19598ed4679e..8776844e0eeb 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -240,6 +240,7 @@
 #define QCOM_ID_SC7280			487
 #define QCOM_ID_SC7180P			495
 #define QCOM_ID_QCM6490			497
+#define QCOM_ID_QCS6490			498
 #define QCOM_ID_SM7325P			499
 #define QCOM_ID_IPQ5000			503
 #define QCOM_ID_IPQ0509			504
-- 
cgit v1.2.3


From 342d2a607450f256105781d29aa6300921c6152e Mon Sep 17 00:00:00 2001
From: Taniya Das <taniya.das@oss.qualcomm.com>
Date: Thu, 30 Oct 2025 16:39:06 +0530
Subject: dt-bindings: clock: qcom: Add Kaanapali Global clock controller

Add device tree bindings for the global clock controller on Qualcomm
Kaanapali platform.

Signed-off-by: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Taniya Das <taniya.das@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251030-gcc_kaanapali-v2-v2-3-a774a587af6f@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../devicetree/bindings/clock/qcom,sm8750-gcc.yaml |   8 +-
 include/dt-bindings/clock/qcom,kaanapali-gcc.h     | 241 +++++++++++++++++++++
 2 files changed, 247 insertions(+), 2 deletions(-)
 create mode 100644 include/dt-bindings/clock/qcom,kaanapali-gcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8750-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8750-gcc.yaml
index aab7039fd28d..0114d347b26f 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sm8750-gcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8750-gcc.yaml
@@ -13,11 +13,15 @@ description: |
   Qualcomm global clock control module provides the clocks, resets and power
   domains on SM8750
 
-  See also: include/dt-bindings/clock/qcom,sm8750-gcc.h
+  See also:
+    include/dt-bindings/clock/qcom,kaanapali-gcc.h
+    include/dt-bindings/clock/qcom,sm8750-gcc.h
 
 properties:
   compatible:
-    const: qcom,sm8750-gcc
+    enum:
+      - qcom,kaanapali-gcc
+      - qcom,sm8750-gcc
 
   clocks:
     items:
diff --git a/include/dt-bindings/clock/qcom,kaanapali-gcc.h b/include/dt-bindings/clock/qcom,kaanapali-gcc.h
new file mode 100644
index 000000000000..890e48709f09
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,kaanapali-gcc.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GCC_KAANAPALI_H
+#define _DT_BINDINGS_CLK_QCOM_GCC_KAANAPALI_H
+
+/* GCC clocks */
+#define GCC_AGGRE_NOC_PCIE_AXI_CLK				0
+#define GCC_AGGRE_UFS_PHY_AXI_CLK				1
+#define GCC_AGGRE_USB3_PRIM_AXI_CLK				2
+#define GCC_BOOT_ROM_AHB_CLK					3
+#define GCC_CAM_BIST_MCLK_AHB_CLK				4
+#define GCC_CAMERA_AHB_CLK					5
+#define GCC_CAMERA_HF_AXI_CLK					6
+#define GCC_CAMERA_SF_AXI_CLK					7
+#define GCC_CAMERA_XO_CLK					8
+#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK				9
+#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK				10
+#define GCC_CNOC_PCIE_SF_AXI_CLK				11
+#define GCC_DDRSS_PCIE_SF_QTB_CLK				12
+#define GCC_QMIP_CAMERA_CMD_AHB_CLK				13
+#define GCC_DISP_HF_AXI_CLK					14
+#define GCC_DISP_SF_AXI_CLK					15
+#define GCC_EVA_AHB_CLK						16
+#define GCC_EVA_AXI0_CLK					17
+#define GCC_EVA_AXI0C_CLK					18
+#define GCC_EVA_XO_CLK						19
+#define GCC_GP1_CLK						20
+#define GCC_GP1_CLK_SRC						21
+#define GCC_GP2_CLK						22
+#define GCC_GP2_CLK_SRC						23
+#define GCC_GP3_CLK						24
+#define GCC_GP3_CLK_SRC						25
+#define GCC_GPLL0						26
+#define GCC_GPLL0_OUT_EVEN					27
+#define GCC_GPLL1						28
+#define GCC_GPLL4						29
+#define GCC_GPLL7						30
+#define GCC_GPLL9						31
+#define GCC_GPU_CFG_AHB_CLK					32
+#define GCC_GPU_GEMNOC_GFX_CLK					33
+#define GCC_GPU_GPLL0_CLK_SRC					34
+#define GCC_GPU_GPLL0_DIV_CLK_SRC				35
+#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK				36
+#define GCC_QMIP_GPU_AHB_CLK					37
+#define GCC_PCIE_0_AUX_CLK					38
+#define GCC_PCIE_0_AUX_CLK_SRC					39
+#define GCC_PCIE_0_CFG_AHB_CLK					40
+#define GCC_PCIE_0_MSTR_AXI_CLK					41
+#define GCC_PCIE_0_PHY_AUX_CLK					42
+#define GCC_PCIE_0_PHY_AUX_CLK_SRC				43
+#define GCC_PCIE_0_PHY_RCHNG_CLK				44
+#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC				45
+#define GCC_PCIE_0_PIPE_CLK					46
+#define GCC_PCIE_0_PIPE_CLK_SRC					47
+#define GCC_PCIE_0_SLV_AXI_CLK					48
+#define GCC_PCIE_0_SLV_Q2A_AXI_CLK				49
+#define GCC_PCIE_RSCC_CFG_AHB_CLK				50
+#define GCC_PCIE_RSCC_XO_CLK					51
+#define GCC_PDM2_CLK						52
+#define GCC_PDM2_CLK_SRC					53
+#define GCC_PDM_AHB_CLK						54
+#define GCC_PDM_XO4_CLK						55
+#define GCC_QUPV3_I2C_CORE_CLK					56
+#define GCC_QUPV3_I2C_S0_CLK					57
+#define GCC_QUPV3_I2C_S0_CLK_SRC				58
+#define GCC_QUPV3_I2C_S1_CLK					59
+#define GCC_QUPV3_I2C_S1_CLK_SRC				60
+#define GCC_QUPV3_I2C_S2_CLK					61
+#define GCC_QUPV3_I2C_S2_CLK_SRC				62
+#define GCC_QUPV3_I2C_S3_CLK					63
+#define GCC_QUPV3_I2C_S3_CLK_SRC				64
+#define GCC_QUPV3_I2C_S4_CLK					65
+#define GCC_QUPV3_I2C_S4_CLK_SRC				66
+#define GCC_QUPV3_I2C_S_AHB_CLK					67
+#define GCC_QUPV3_WRAP1_CORE_2X_CLK				68
+#define GCC_QUPV3_WRAP1_CORE_CLK				69
+#define GCC_QUPV3_WRAP1_QSPI_REF_CLK				70
+#define GCC_QUPV3_WRAP1_QSPI_REF_CLK_SRC			71
+#define GCC_QUPV3_WRAP1_S0_CLK					72
+#define GCC_QUPV3_WRAP1_S0_CLK_SRC				73
+#define GCC_QUPV3_WRAP1_S1_CLK					74
+#define GCC_QUPV3_WRAP1_S1_CLK_SRC				75
+#define GCC_QUPV3_WRAP1_S2_CLK					76
+#define GCC_QUPV3_WRAP1_S2_CLK_SRC				77
+#define GCC_QUPV3_WRAP1_S3_CLK					78
+#define GCC_QUPV3_WRAP1_S3_CLK_SRC				79
+#define GCC_QUPV3_WRAP1_S4_CLK					80
+#define GCC_QUPV3_WRAP1_S4_CLK_SRC				81
+#define GCC_QUPV3_WRAP1_S5_CLK					82
+#define GCC_QUPV3_WRAP1_S5_CLK_SRC				83
+#define GCC_QUPV3_WRAP1_S6_CLK					84
+#define GCC_QUPV3_WRAP1_S6_CLK_SRC				85
+#define GCC_QUPV3_WRAP1_S7_CLK					86
+#define GCC_QUPV3_WRAP1_S7_CLK_SRC				87
+#define GCC_QUPV3_WRAP2_CORE_2X_CLK				88
+#define GCC_QUPV3_WRAP2_CORE_CLK				89
+#define GCC_QUPV3_WRAP2_S0_CLK					90
+#define GCC_QUPV3_WRAP2_S0_CLK_SRC				91
+#define GCC_QUPV3_WRAP2_S1_CLK					92
+#define GCC_QUPV3_WRAP2_S1_CLK_SRC				93
+#define GCC_QUPV3_WRAP2_S2_CLK					94
+#define GCC_QUPV3_WRAP2_S2_CLK_SRC				95
+#define GCC_QUPV3_WRAP2_S3_CLK					96
+#define GCC_QUPV3_WRAP2_S3_CLK_SRC				97
+#define GCC_QUPV3_WRAP2_S4_CLK					98
+#define GCC_QUPV3_WRAP2_S4_CLK_SRC				99
+#define GCC_QUPV3_WRAP3_CORE_2X_CLK				100
+#define GCC_QUPV3_WRAP3_CORE_CLK				101
+#define GCC_QUPV3_WRAP3_IBI_CTRL_0_CLK_SRC			102
+#define GCC_QUPV3_WRAP3_IBI_CTRL_1_CLK				103
+#define GCC_QUPV3_WRAP3_IBI_CTRL_2_CLK				104
+#define GCC_QUPV3_WRAP3_S0_CLK					105
+#define GCC_QUPV3_WRAP3_S0_CLK_SRC				106
+#define GCC_QUPV3_WRAP3_S1_CLK					107
+#define GCC_QUPV3_WRAP3_S1_CLK_SRC				108
+#define GCC_QUPV3_WRAP3_S2_CLK					109
+#define GCC_QUPV3_WRAP3_S2_CLK_SRC				110
+#define GCC_QUPV3_WRAP3_S3_CLK					111
+#define GCC_QUPV3_WRAP3_S3_CLK_SRC				112
+#define GCC_QUPV3_WRAP3_S4_CLK					113
+#define GCC_QUPV3_WRAP3_S4_CLK_SRC				114
+#define GCC_QUPV3_WRAP3_S5_CLK					115
+#define GCC_QUPV3_WRAP3_S5_CLK_SRC				116
+#define GCC_QUPV3_WRAP4_CORE_2X_CLK				117
+#define GCC_QUPV3_WRAP4_CORE_CLK				118
+#define GCC_QUPV3_WRAP4_S0_CLK					119
+#define GCC_QUPV3_WRAP4_S0_CLK_SRC				120
+#define GCC_QUPV3_WRAP4_S1_CLK					121
+#define GCC_QUPV3_WRAP4_S1_CLK_SRC				122
+#define GCC_QUPV3_WRAP4_S2_CLK					123
+#define GCC_QUPV3_WRAP4_S2_CLK_SRC				124
+#define GCC_QUPV3_WRAP4_S3_CLK					125
+#define GCC_QUPV3_WRAP4_S3_CLK_SRC				126
+#define GCC_QUPV3_WRAP4_S4_CLK					127
+#define GCC_QUPV3_WRAP4_S4_CLK_SRC				128
+#define GCC_QUPV3_WRAP_1_M_AXI_CLK				129
+#define GCC_QUPV3_WRAP_1_S_AHB_CLK				130
+#define GCC_QUPV3_WRAP_2_M_AHB_CLK				131
+#define GCC_QUPV3_WRAP_2_S_AHB_CLK				132
+#define GCC_QUPV3_WRAP_3_IBI_1_AHB_CLK				133
+#define GCC_QUPV3_WRAP_3_IBI_2_AHB_CLK				134
+#define GCC_QUPV3_WRAP_3_M_AHB_CLK				135
+#define GCC_QUPV3_WRAP_3_S_AHB_CLK				136
+#define GCC_QUPV3_WRAP_4_M_AHB_CLK				137
+#define GCC_QUPV3_WRAP_4_S_AHB_CLK				138
+#define GCC_SDCC2_AHB_CLK					139
+#define GCC_SDCC2_APPS_CLK					140
+#define GCC_SDCC2_APPS_CLK_SRC					141
+#define GCC_SDCC4_AHB_CLK					142
+#define GCC_SDCC4_APPS_CLK					143
+#define GCC_SDCC4_APPS_CLK_SRC					144
+#define GCC_UFS_PHY_AHB_CLK					145
+#define GCC_UFS_PHY_AXI_CLK					146
+#define GCC_UFS_PHY_AXI_CLK_SRC					147
+#define GCC_UFS_PHY_ICE_CORE_CLK				148
+#define GCC_UFS_PHY_ICE_CORE_CLK_SRC				149
+#define GCC_UFS_PHY_PHY_AUX_CLK					150
+#define GCC_UFS_PHY_PHY_AUX_CLK_SRC				151
+#define GCC_UFS_PHY_RX_SYMBOL_0_CLK				152
+#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC				153
+#define GCC_UFS_PHY_RX_SYMBOL_1_CLK				154
+#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC				155
+#define GCC_UFS_PHY_TX_SYMBOL_0_CLK				156
+#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC				157
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK				158
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC				159
+#define GCC_USB30_PRIM_MASTER_CLK				160
+#define GCC_USB30_PRIM_MASTER_CLK_SRC				161
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK				162
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC			163
+#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC		164
+#define GCC_USB30_PRIM_SLEEP_CLK				165
+#define GCC_USB3_PRIM_PHY_AUX_CLK				166
+#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC				167
+#define GCC_USB3_PRIM_PHY_COM_AUX_CLK				168
+#define GCC_USB3_PRIM_PHY_PIPE_CLK				169
+#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC				170
+#define GCC_VIDEO_AHB_CLK					171
+#define GCC_VIDEO_AXI0_CLK					172
+#define GCC_VIDEO_AXI1_CLK					173
+#define GCC_VIDEO_XO_CLK					174
+#define GCC_QMIP_CAMERA_NRT_AHB_CLK				175
+#define GCC_QMIP_CAMERA_RT_AHB_CLK				176
+#define GCC_QMIP_DISP_DCP_SF_AHB_CLK				177
+#define GCC_QMIP_PCIE_AHB_CLK					178
+#define GCC_QMIP_VIDEO_CV_CPU_AHB_CLK				179
+#define GCC_QMIP_VIDEO_CVP_AHB_CLK				180
+#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK				181
+#define GCC_DISP_AHB_CLK					182
+
+/* GCC power domains */
+#define GCC_PCIE_0_GDSC						0
+#define GCC_PCIE_0_PHY_GDSC					1
+#define GCC_UFS_MEM_PHY_GDSC					2
+#define GCC_UFS_PHY_GDSC					3
+#define GCC_USB30_PRIM_GDSC					4
+#define GCC_USB3_PHY_GDSC					5
+
+/* GCC resets */
+#define GCC_CAMERA_BCR						0
+#define GCC_DISPLAY_BCR						1
+#define GCC_EVA_AXI0_CLK_ARES					2
+#define GCC_EVA_AXI0C_CLK_ARES					3
+#define GCC_EVA_BCR						4
+#define GCC_GPU_BCR						5
+#define GCC_PCIE_0_BCR						6
+#define GCC_PCIE_0_LINK_DOWN_BCR				7
+#define GCC_PCIE_0_NOCSR_COM_PHY_BCR				8
+#define GCC_PCIE_0_PHY_BCR					9
+#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR			10
+#define GCC_PCIE_PHY_BCR					11
+#define GCC_PCIE_PHY_CFG_AHB_BCR				12
+#define GCC_PCIE_PHY_COM_BCR					13
+#define GCC_PCIE_RSCC_BCR					14
+#define GCC_PDM_BCR						15
+#define GCC_QUPV3_WRAPPER_1_BCR					16
+#define GCC_QUPV3_WRAPPER_2_BCR					17
+#define GCC_QUPV3_WRAPPER_3_BCR					18
+#define GCC_QUPV3_WRAPPER_4_BCR					19
+#define GCC_QUPV3_WRAPPER_I2C_BCR				20
+#define GCC_QUSB2PHY_PRIM_BCR					21
+#define GCC_QUSB2PHY_SEC_BCR					22
+#define GCC_SDCC2_BCR						23
+#define GCC_SDCC4_BCR						24
+#define GCC_UFS_PHY_BCR						25
+#define GCC_USB30_PRIM_BCR					26
+#define GCC_USB3_DP_PHY_PRIM_BCR				27
+#define GCC_USB3_DP_PHY_SEC_BCR					28
+#define GCC_USB3_PHY_PRIM_BCR					29
+#define GCC_USB3_PHY_SEC_BCR					30
+#define GCC_USB3PHY_PHY_PRIM_BCR				31
+#define GCC_USB3PHY_PHY_SEC_BCR					32
+#define GCC_VIDEO_AXI0_CLK_ARES					33
+#define GCC_VIDEO_AXI1_CLK_ARES					34
+#define GCC_VIDEO_BCR						35
+#define GCC_VIDEO_XO_CLK_ARES					36
+
+#endif
-- 
cgit v1.2.3


From d8f9581e1b7f1fe2e1ac985f4ea508d044c90733 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 29 Oct 2025 17:32:56 +0000
Subject: ipv6: Add in6_dev_rcu().

rcu_dereference_rtnl() does not clearly tell whether the caller
is under RCU or RTNL.

Let's add in6_dev_rcu() to make it easy to remove __in6_dev_get()
in the future.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/addrconf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 9e5e95988b9e..78e8b877fb25 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -347,6 +347,11 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
 	return rcu_dereference_rtnl(dev->ip6_ptr);
 }
 
+static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev)
+{
+	return rcu_dereference(dev->ip6_ptr);
+}
+
 static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev)
 {
 	return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr);
-- 
cgit v1.2.3


From e833eb25161aae6cd0caf14782f405d0ed5765ed Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 29 Oct 2025 17:33:04 +0000
Subject: mpls: Protect net->mpls.platform_label with a per-netns mutex.

MPLS (re)uses RTNL to protect net->mpls.platform_label,
but the lock does not need to be RTNL at all.

Let's protect net->mpls.platform_label with a dedicated
per-netns mutex.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-13-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/mpls.h |  1 +
 net/mpls/af_mpls.c       | 55 +++++++++++++++++++++++++++++++-----------------
 net/mpls/internal.h      |  7 +++++-
 3 files changed, 43 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index 19ad2574b267..6682e51513ef 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -16,6 +16,7 @@ struct netns_mpls {
 	int default_ttl;
 	size_t platform_labels;
 	struct mpls_route __rcu * __rcu *platform_label;
+	struct mutex platform_mutex;
 
 	struct ctl_table_header *ctl;
 };
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 49fd15232dbe..d0d047dd2245 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -79,8 +79,8 @@ static struct mpls_route *mpls_route_input(struct net *net, unsigned int index)
 {
 	struct mpls_route __rcu **platform_label;
 
-	platform_label = rtnl_dereference(net->mpls.platform_label);
-	return rtnl_dereference(platform_label[index]);
+	platform_label = mpls_dereference(net, net->mpls.platform_label);
+	return mpls_dereference(net, platform_label[index]);
 }
 
 static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index)
@@ -578,10 +578,8 @@ static void mpls_route_update(struct net *net, unsigned index,
 	struct mpls_route __rcu **platform_label;
 	struct mpls_route *rt;
 
-	ASSERT_RTNL();
-
-	platform_label = rtnl_dereference(net->mpls.platform_label);
-	rt = rtnl_dereference(platform_label[index]);
+	platform_label = mpls_dereference(net, net->mpls.platform_label);
+	rt = mpls_dereference(net, platform_label[index]);
 	rcu_assign_pointer(platform_label[index], new);
 
 	mpls_notify_route(net, index, rt, new, info);
@@ -1472,8 +1470,6 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 	int err = -ENOMEM;
 	int i;
 
-	ASSERT_RTNL();
-
 	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
 	if (!mdev)
 		return ERR_PTR(err);
@@ -1633,6 +1629,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 	unsigned int flags;
 	int err;
 
+	mutex_lock(&net->mpls.platform_mutex);
+
 	if (event == NETDEV_REGISTER) {
 		mdev = mpls_add_dev(dev);
 		if (IS_ERR(mdev)) {
@@ -1695,9 +1693,11 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 	}
 
 out:
+	mutex_unlock(&net->mpls.platform_mutex);
 	return NOTIFY_OK;
 
 err:
+	mutex_unlock(&net->mpls.platform_mutex);
 	return notifier_from_errno(err);
 }
 
@@ -1973,6 +1973,7 @@ errout:
 static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
+	struct net *net = sock_net(skb->sk);
 	struct mpls_route_config *cfg;
 	int err;
 
@@ -1984,7 +1985,9 @@ static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto out;
 
+	mutex_lock(&net->mpls.platform_mutex);
 	err = mpls_route_del(cfg, extack);
+	mutex_unlock(&net->mpls.platform_mutex);
 out:
 	kfree(cfg);
 
@@ -1995,6 +1998,7 @@ out:
 static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
+	struct net *net = sock_net(skb->sk);
 	struct mpls_route_config *cfg;
 	int err;
 
@@ -2006,7 +2010,9 @@ static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto out;
 
+	mutex_lock(&net->mpls.platform_mutex);
 	err = mpls_route_add(cfg, extack);
+	mutex_unlock(&net->mpls.platform_mutex);
 out:
 	kfree(cfg);
 
@@ -2407,6 +2413,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 	u8 n_labels;
 	int err;
 
+	mutex_lock(&net->mpls.platform_mutex);
+
 	err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack);
 	if (err < 0)
 		goto errout;
@@ -2450,7 +2458,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 			goto errout_free;
 		}
 
-		return rtnl_unicast(skb, net, portid);
+		err = rtnl_unicast(skb, net, portid);
+		goto errout;
 	}
 
 	if (tb[RTA_NEWDST]) {
@@ -2542,12 +2551,14 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 
 	err = rtnl_unicast(skb, net, portid);
 errout:
+	mutex_unlock(&net->mpls.platform_mutex);
 	return err;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
 	err = -EMSGSIZE;
 errout_free:
+	mutex_unlock(&net->mpls.platform_mutex);
 	kfree_skb(skb);
 	return err;
 }
@@ -2603,9 +2614,10 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 		       lo->addr_len);
 	}
 
-	rtnl_lock();
+	mutex_lock(&net->mpls.platform_mutex);
+
 	/* Remember the original table */
-	old = rtnl_dereference(net->mpls.platform_label);
+	old = mpls_dereference(net, net->mpls.platform_label);
 	old_limit = net->mpls.platform_labels;
 
 	/* Free any labels beyond the new table */
@@ -2636,7 +2648,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	net->mpls.platform_labels = limit;
 	rcu_assign_pointer(net->mpls.platform_label, labels);
 
-	rtnl_unlock();
+	mutex_unlock(&net->mpls.platform_mutex);
 
 	mpls_rt_free(rt2);
 	mpls_rt_free(rt0);
@@ -2709,12 +2721,13 @@ static const struct ctl_table mpls_table[] = {
 	},
 };
 
-static int mpls_net_init(struct net *net)
+static __net_init int mpls_net_init(struct net *net)
 {
 	size_t table_size = ARRAY_SIZE(mpls_table);
 	struct ctl_table *table;
 	int i;
 
+	mutex_init(&net->mpls.platform_mutex);
 	net->mpls.platform_labels = 0;
 	net->mpls.platform_label = NULL;
 	net->mpls.ip_ttl_propagate = 1;
@@ -2740,7 +2753,7 @@ static int mpls_net_init(struct net *net)
 	return 0;
 }
 
-static void mpls_net_exit(struct net *net)
+static __net_exit void mpls_net_exit(struct net *net)
 {
 	struct mpls_route __rcu **platform_label;
 	size_t platform_labels;
@@ -2760,16 +2773,20 @@ static void mpls_net_exit(struct net *net)
 	 * As such no additional rcu synchronization is necessary when
 	 * freeing the platform_label table.
 	 */
-	rtnl_lock();
-	platform_label = rtnl_dereference(net->mpls.platform_label);
+	mutex_lock(&net->mpls.platform_mutex);
+
+	platform_label = mpls_dereference(net, net->mpls.platform_label);
 	platform_labels = net->mpls.platform_labels;
+
 	for (index = 0; index < platform_labels; index++) {
-		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
-		RCU_INIT_POINTER(platform_label[index], NULL);
+		struct mpls_route *rt;
+
+		rt = mpls_dereference(net, platform_label[index]);
 		mpls_notify_route(net, index, rt, NULL, NULL);
 		mpls_rt_free(rt);
 	}
-	rtnl_unlock();
+
+	mutex_unlock(&net->mpls.platform_mutex);
 
 	kvfree(platform_label);
 }
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 0df01a5395ee..80cb5bbcd946 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -185,6 +185,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
 	return result;
 }
 
+#define mpls_dereference(net, p)					\
+	rcu_dereference_protected(					\
+		(p),							\
+		lockdep_is_held(&(net)->mpls.platform_mutex))
+
 static inline struct mpls_dev *mpls_dev_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->mpls_ptr);
@@ -193,7 +198,7 @@ static inline struct mpls_dev *mpls_dev_rcu(const struct net_device *dev)
 static inline struct mpls_dev *mpls_dev_get(const struct net *net,
 					    const struct net_device *dev)
 {
-	return rcu_dereference_rtnl(dev->mpls_ptr);
+	return mpls_dereference(net, dev->mpls_ptr);
 }
 
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
-- 
cgit v1.2.3


From c18d4b190a46651726c9a952667c74d2deb33c28 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Tue, 28 Oct 2025 20:30:05 +0000
Subject: net: Extend NAPI threaded polling to allow kthread based busy polling

Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, NAPI_STATE_IN_BUSY_POLL is
unset as well, and napi_complete_done() also clears the
NAPI_STATE_SCHED_THREADED bit, which in turn makes the kthread go to sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml |  5 +--
 Documentation/networking/napi.rst       | 50 +++++++++++++++++++++++++++-
 include/linux/netdevice.h               |  4 ++-
 include/uapi/linux/netdev.h             |  1 +
 net/core/dev.c                          | 58 +++++++++++++++++++++++++++------
 net/core/dev.h                          |  3 ++
 net/core/netdev-genl-gen.c              |  2 +-
 tools/include/uapi/linux/netdev.h       |  1 +
 8 files changed, 109 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index e00d3fa1c152..10c412b7433f 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -88,7 +88,7 @@ definitions:
   -
     name: napi-threaded
     type: enum
-    entries: [disabled, enabled]
+    entries: [disabled, enabled, busy-poll]
 
 attribute-sets:
   -
@@ -291,7 +291,8 @@ attribute-sets:
         name: threaded
         doc: Whether the NAPI is configured to operate in threaded polling
              mode. If this is set to enabled then the NAPI context operates
-             in threaded polling mode.
+             in threaded polling mode. If this is set to busy-poll, then the
+             threaded polling mode also busy polls.
         type: u32
         enum: napi-threaded
   -
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index 7dd60366f4ff..4e008efebb35 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -263,7 +263,9 @@ are not well known).
 Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
 selected sockets or using the global ``net.core.busy_poll`` and
 ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
-also exists.
+also exists. Threaded polling of NAPI also has a mode to busy poll for
+packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI
+processing kthread.
 
 epoll-based busy polling
 ------------------------
@@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
 the recommended usage, because otherwise setting ``irq-suspend-timeout``
 might not have any discernible effect.
 
+.. _threaded_busy_poll:
+
+Threaded NAPI busy polling
+--------------------------
+
+Threaded NAPI busy polling extends threaded NAPI and adds support to do
+continuous busy polling of the NAPI. This can be useful for forwarding or
+AF_XDP applications.
+
+Threaded NAPI busy polling can be enabled on a per-NIC-queue basis using Netlink.
+
+For example, using the following script:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-set \
+            --json='{"id": 66, "threaded": "busy-poll"}'
+
+The kernel will create a kthread that busy polls on this NAPI.
+
+The user may elect to set the CPU affinity of this kthread to an unused CPU
+core to improve how often the NAPI is polled at the expense of wasted CPU
+cycles. Note that this will keep the CPU core busy with 100% usage.
+
+Once threaded busy polling is enabled for a NAPI, the PID of the kthread can be
+retrieved using Netlink so the affinity of the kthread can be set up.
+
+For example, the following script can be used to fetch the PID:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-get --json='{"id": 66}'
+
+This will output something like the following, where `258` is the PID of the
+kthread that is polling this NAPI.
+
+.. code-block:: bash
+
+  $ {'defer-hard-irqs': 0,
+     'gro-flush-timeout': 0,
+     'id': 66,
+     'ifindex': 2,
+     'irq-suspend-timeout': 0,
+     'pid': 258,
+     'threaded': 'busy-poll'}
+
 .. _threaded:
 
 Threaded NAPI
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c1e5042c5e7..e808071dbb7d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -423,11 +423,12 @@ enum {
 	NAPI_STATE_NPSVC,		/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_LISTED,		/* NAPI added to system lists */
 	NAPI_STATE_NO_BUSY_POLL,	/* Do not add in napi_hash, no busy polling */
-	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_IN_BUSY_POLL,	/* Do not rearm NAPI interrupt */
 	NAPI_STATE_PREFER_BUSY_POLL,	/* prefer busy-polling over softirq processing*/
 	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/
 	NAPI_STATE_SCHED_THREADED,	/* Napi is currently scheduled in threaded mode */
 	NAPI_STATE_HAS_NOTIFIER,	/* Napi has an IRQ notifier */
+	NAPI_STATE_THREADED_BUSY_POLL,	/* The threaded NAPI poller will busy poll */
 };
 
 enum {
@@ -442,6 +443,7 @@ enum {
 	NAPIF_STATE_THREADED		= BIT(NAPI_STATE_THREADED),
 	NAPIF_STATE_SCHED_THREADED	= BIT(NAPI_STATE_SCHED_THREADED),
 	NAPIF_STATE_HAS_NOTIFIER	= BIT(NAPI_STATE_HAS_NOTIFIER),
+	NAPIF_STATE_THREADED_BUSY_POLL	= BIT(NAPI_STATE_THREADED_BUSY_POLL),
 };
 
 enum gro_result {
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 48eb49aa03d4..048c8de1a130 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
 	NETDEV_NAPI_THREADED_DISABLED,
 	NETDEV_NAPI_THREADED_ENABLED,
+	NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index dccc1176f3c6..2c1de5fb97d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
 		 */
 		if ((val & NAPIF_STATE_SCHED_THREADED) ||
 		    !(val & NAPIF_STATE_SCHED)) {
-			new = val & (~NAPIF_STATE_THREADED);
+			new = val & (~(NAPIF_STATE_THREADED |
+				       NAPIF_STATE_THREADED_BUSY_POLL));
 		} else {
 			msleep(20);
 			continue;
@@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	napi->thread = NULL;
 }
 
+static void napi_set_threaded_state(struct napi_struct *napi,
+				    enum netdev_napi_threaded threaded_mode)
+{
+	bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+	bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+	assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
 int napi_set_threaded(struct napi_struct *napi,
 		      enum netdev_napi_threaded threaded)
 {
@@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi,
 	} else {
 		/* Make sure kthread is created before THREADED bit is set. */
 		smp_mb__before_atomic();
-		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+		napi_set_threaded_state(napi, threaded);
 	}
 
 	return 0;
@@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n)
 		}
 
 		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
-		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+		new &= ~(NAPIF_STATE_THREADED |
+			 NAPIF_STATE_THREADED_BUSY_POLL |
+			 NAPIF_STATE_PREFER_BUSY_POLL);
 	} while (!try_cmpxchg(&n->state, &val, new));
 
 	hrtimer_cancel(&n->timer);
@@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi)
 	return -1;
 }
 
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
 {
 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	struct softnet_data *sd;
@@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
 		}
 		skb_defer_free_flush();
 		bpf_net_ctx_clear(bpf_net_ctx);
+
+		/* When busy poll is enabled, the old packets are not flushed in
+		 * napi_complete_done. So flush them here.
+		 */
+		if (busy_poll)
+			gro_flush_normal(&napi->gro, HZ >= 1000);
 		local_bh_enable();
 
+		/* Call cond_resched here to avoid watchdog warnings. */
+		if (repoll || busy_poll) {
+			rcu_softirq_qs_periodic(last_qs);
+			cond_resched();
+		}
+
 		if (!repoll)
 			break;
-
-		rcu_softirq_qs_periodic(last_qs);
-		cond_resched();
 	}
 }
 
 static int napi_threaded_poll(void *data)
 {
 	struct napi_struct *napi = data;
+	bool want_busy_poll;
+	bool in_busy_poll;
+	unsigned long val;
+
+	while (!napi_thread_wait(napi)) {
+		val = READ_ONCE(napi->state);
+
+		want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+		in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
 
-	while (!napi_thread_wait(napi))
-		napi_threaded_poll_loop(napi);
+		if (unlikely(val & NAPIF_STATE_DISABLE))
+			want_busy_poll = false;
+
+		if (want_busy_poll != in_busy_poll)
+			assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+				   want_busy_poll);
+
+		napi_threaded_poll_loop(napi, want_busy_poll);
+	}
 
 	return 0;
 }
@@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu)
 {
 	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 
-	napi_threaded_poll_loop(&sd->backlog);
+	napi_threaded_poll_loop(&sd->backlog, false);
 }
 
 static void backlog_napi_setup(unsigned int cpu)
diff --git a/net/core/dev.h b/net/core/dev.h
index 900880e8b5b4..4d872a79bafb 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
 
 static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+		return NETDEV_NAPI_THREADED_BUSY_POLL;
+
 	if (test_bit(NAPI_STATE_THREADED, &n->state))
 		return NETDEV_NAPI_THREADED_ENABLED;
 
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index e9a2a6f26cb7..ff20435c45d2 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
 	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
-	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
 };
 
 /* NETDEV_CMD_BIND_TX - do */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 48eb49aa03d4..048c8de1a130 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
 	NETDEV_NAPI_THREADED_DISABLED,
 	NETDEV_NAPI_THREADED_ENABLED,
+	NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {
-- 
cgit v1.2.3


From abcf6eef90c6e47efed62a7c233ffc1a6a90797e Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 27 Oct 2025 13:27:58 +0100
Subject: net: phy: introduce internal API for PHY MSE diagnostics

Add the base infrastructure for Mean Square Error (MSE) diagnostics,
as proposed by the OPEN Alliance "Advanced diagnostic features for
100BASE-T1 automotive Ethernet PHYs" [1] specification.

The OPEN Alliance spec defines only average MSE and average peak MSE
over a fixed number of symbols. However, other PHYs, such as the
KSZ9131, additionally expose a worst-peak MSE value latched since the
last channel capture. This API accounts for such vendor extensions by
adding a distinct capability bit and snapshot field.

Channel-to-pair mapping is normally straightforward, but in some cases
(e.g. 100BASE-TX with MDI-X resolution unknown) the mapping is ambiguous.
If hardware does not expose MDI-X status, the exact pair cannot be
determined. To avoid returning misleading per-channel data in this case,
a LINK selector is defined for aggregate MSE measurements.

All investigated devices differ in MSE capabilities, such
as sample rate, number of analyzed symbols, and scaling factors.
For example, the KSZ9131 uses different scaling for MSE and pMSE.
To make this visible to callers, scale limits and timing information
are returned via get_mse_capability().

Some PHYs sample very few symbols at high frequency (e.g. 2 us update
rate). To cover such cases and allow for future high-speed PHYs with
even shorter intervals, the refresh rate is reported as u64 in
picoseconds.

This patch introduces the internal PHY API for Mean Square Error
diagnostics. It defines new kernel-side data types and driver hooks:

  - struct phy_mse_capability: describes supported metrics, scale
    limits, update interval, and sampling length.
  - struct phy_mse_snapshot: holds one correlated measurement set.
  - New phy_driver ops: `get_mse_capability()` and `get_mse_snapshot()`.

These definitions form the core kernel API. No user-visible interfaces
are added in this commit.

Standardization notes:
OPEN Alliance defines presence and interpretation of some metrics but does
not fix numeric scales or sampling internals:

- SQI (3-bit, 0..7) is mandatory; correlation to SNR/BER is informative
  (OA 100BASE-T1 TC1 v1.0 6.1.2; OA 1000BASE-T1 TC12 v2.2 6.1.2).
- MSE is optional; OA recommends 2^16 symbols and scaling to 0..511,
  with a worst-case latch since last read (OA 100BASE-T1 TC1 v1.0 6.1.1; OA
  1000BASE-T1 TC12 v2.2 6.1.1). Refresh is recommended (~0.8-2.0 ms for
  100BASE-T1; ~80-200 us for 1000BASE-T1). Exact scaling/time windows
  are vendor-specific.
- Peak MSE (pMSE) is defined only for 100BASE-T1 as optional, e.g.
  128-symbol sliding window with 8-bit range and worst-case latch (OA
  100BASE-T1 TC1 v1.0 6.1.3).

Therefore this API exposes which measures and selectors a PHY supports,
and documents where behavior is standard-referenced vs vendor-specific.

[1] <https://opensig.org/wp-content/uploads/2024/01/
     Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf>

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251027122801.982364-2-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 358dd6f0ff96..e3474f03cbc1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -903,6 +903,165 @@ struct phy_led {
 
 #define to_phy_led(d) container_of(d, struct phy_led, led_cdev)
 
+/*
+ * PHY_MSE_CAP_* - Bitmask flags for Mean Square Error (MSE) capabilities
+ *
+ * These flags describe which MSE metrics and selectors are implemented
+ * by the PHY for the current link mode. They are used in
+ * struct phy_mse_capability.supported_caps.
+ *
+ * Standardization:
+ *   The OPEN Alliance (OA) defines the presence of MSE/SQI/pMSE but not their
+ *   numeric scaling, update intervals, or aggregation windows.  See:
+ *     OA 100BASE-T1 TC1 v1.0, sections 6.1.1-6.1.3
+ *     OA 1000BASE-T1 TC12 v2.2, sections 6.1.1-6.1.2
+ *
+ * Description of flags:
+ *
+ *   PHY_MSE_CAP_CHANNEL_A
+ *     Per-pair diagnostics for Channel A are supported.  Mapping to the
+ *     physical wire pair may depend on MDI/MDI-X polarity.
+ *
+ *   PHY_MSE_CAP_CHANNEL_B, _C, _D
+ *     Same as above for channels B-D.
+ *
+ *   PHY_MSE_CAP_WORST_CHANNEL
+ *     The PHY or driver can identify and report the single worst-performing
+ *     channel without querying each one individually.
+ *
+ *   PHY_MSE_CAP_LINK
+ *     The PHY provides only a link-wide aggregate measurement or cannot map
+ *     results to a specific pair (for example 100BASE-TX with unknown
+ *     MDI/MDI-X).
+ *
+ *   PHY_MSE_CAP_AVG
+ *     Average MSE (mean DCQ metric) is supported.  For 100/1000BASE-T1 the OA
+ *     recommends 2^16 symbols, scaled 0..511, but the exact scaling is
+ *     vendor-specific.
+ *
+ *   PHY_MSE_CAP_PEAK
+ *     Peak MSE (current peak within the measurement window) is supported.
+ *     Defined as pMSE for 100BASE-T1; vendor-specific for others.
+ *
+ *   PHY_MSE_CAP_WORST_PEAK
+ *     Latched worst-case peak MSE since the last read (read-to-clear if
+ *     implemented).  Optional in OA 100BASE-T1 TC1 6.1.3.
+ */
+#define PHY_MSE_CAP_CHANNEL_A BIT(0)
+#define PHY_MSE_CAP_CHANNEL_B BIT(1)
+#define PHY_MSE_CAP_CHANNEL_C BIT(2)
+#define PHY_MSE_CAP_CHANNEL_D BIT(3)
+#define PHY_MSE_CAP_WORST_CHANNEL BIT(4)
+#define PHY_MSE_CAP_LINK BIT(5)
+#define PHY_MSE_CAP_AVG BIT(6)
+#define PHY_MSE_CAP_PEAK BIT(7)
+#define PHY_MSE_CAP_WORST_PEAK BIT(8)
+
+/*
+ * enum phy_mse_channel - Identifiers for selecting MSE measurement channels
+ *
+ * PHY_MSE_CHANNEL_A - PHY_MSE_CHANNEL_D
+ *   Select per-pair measurement for the corresponding channel.
+ *
+ * PHY_MSE_CHANNEL_WORST
+ *   Select the single worst-performing channel reported by hardware.
+ *
+ * PHY_MSE_CHANNEL_LINK
+ *   Select link-wide aggregate data (used when per-pair results are
+ *   unavailable).
+ */
+enum phy_mse_channel {
+	PHY_MSE_CHANNEL_A,
+	PHY_MSE_CHANNEL_B,
+	PHY_MSE_CHANNEL_C,
+	PHY_MSE_CHANNEL_D,
+	PHY_MSE_CHANNEL_WORST,
+	PHY_MSE_CHANNEL_LINK,
+};
+
+/**
+ * struct phy_mse_capability - Capabilities of Mean Square Error (MSE)
+ *                             measurement interface
+ *
+ * Standardization notes:
+ *
+ * - Presence of MSE/SQI/pMSE is defined by OPEN Alliance specs, but numeric
+ *   scaling, refresh/update rate and aggregation windows are not fixed and
+ *   are vendor-/product-specific. (OA 100BASE-T1 TC1 v1.0 6.1.*;
+ *   OA 1000BASE-T1 TC12 v2.2 6.1.*)
+ *
+ * - Typical recommendations: 2^16 symbols and 0..511 scaling for MSE; pMSE only
+ *   defined for 100BASE-T1 (sliding window example), others are vendor
+ *   extensions. Drivers must report actual scale/limits here.
+ *
+ * Describes the MSE measurement capabilities for the current link mode. These
+ * properties are dynamic and may change when link settings are modified.
+ * Callers should re-query this capability after any link state change to
+ * ensure they have the most up-to-date information.
+ *
+ * Callers should only request measurements for channels and types that are
+ * indicated as supported by the @supported_caps bitmask. If @supported_caps
+ * is 0, the device provides no MSE diagnostics, and driver operations should
+ * typically return -EOPNOTSUPP.
+ *
+ * Snapshot values for average and peak MSE can be normalized to a 0..1 ratio
+ * by dividing the raw snapshot by the corresponding @max_average_mse or
+ * @max_peak_mse value.
+ *
+ * @max_average_mse: The maximum value for an average MSE snapshot. This
+ *   defines the scale for the measurement. If the PHY_MSE_CAP_AVG capability is
+ *   supported, this value MUST be greater than 0. (vendor-specific units).
+ * @max_peak_mse: The maximum value for a peak MSE snapshot. If either
+ *   PHY_MSE_CAP_PEAK or PHY_MSE_CAP_WORST_PEAK is supported, this value MUST
+ *   be greater than 0. (vendor-specific units).
+ * @refresh_rate_ps: The typical interval, in picoseconds, between hardware
+ *   updates of the MSE values. This is an estimate, and callers should not
+ *   assume synchronous sampling. The value is vendor-specific.
+ * @num_symbols: The number of symbols aggregated per hardware sample to
+ *   calculate the MSE. The count is vendor-specific.
+ * @supported_caps: A bitmask of PHY_MSE_CAP_* values indicating which
+ *   measurement types (e.g., average, peak) and channels
+ *   (e.g., per-pair or link-wide) are supported.
+ */
+struct phy_mse_capability {
+	u64 max_average_mse;
+	u64 max_peak_mse;
+	u64 refresh_rate_ps;
+	u64 num_symbols;
+	u32 supported_caps;
+};
+
+/**
+ * struct phy_mse_snapshot - A snapshot of Mean Square Error (MSE) diagnostics
+ *
+ * Holds a set of MSE diagnostic values that were all captured from a single
+ * measurement window.
+ *
+ * Values are raw, device-scaled and not normalized. Use struct
+ * phy_mse_capability to interpret the scale and sampling window.
+ *
+ * @average_mse: The average MSE value over the measurement window.
+ *   OPEN Alliance references MSE as a DCQ metric; recommends 2^16 symbols and
+ *   0..511 scaling. Exact scale and refresh are vendor-specific.
+ *   (100BASE-T1 TC1 v1.0 6.1.1; 1000BASE-T1 TC12 v2.2 6.1.1).
+ *
+ * @peak_mse: The peak MSE value observed within the measurement window.
+ *   For 100BASE-T1, "pMSE" is optional and may be implemented via a sliding
+ *   128-symbol window with periodic capture; not standardized for 1000BASE-T1.
+ *   (100BASE-T1 TC1 v1.0 6.1.3, Table "DCQ.peakMSE").
+ *
+ * @worst_peak_mse: A latched high-water mark of the peak MSE since last read
+ *   (read-to-clear if implemented). OPEN Alliance shows a latched "worst case
+ *   peak MSE" for 100BASE-T1 pMSE; availability/semantics outside that are
+ *   vendor-specific. (100BASE-T1 TC1 v1.0 6.1.3, DCQ.peakMSE high byte;
+ *   1000BASE-T1 TC12 v2.2 treats DCQ details as vendor-specific.)
+ */
+struct phy_mse_snapshot {
+	u64 average_mse;
+	u64 peak_mse;
+	u64 worst_peak_mse;
+};
+
 /**
  * struct phy_driver - Driver structure for a particular PHY type
  *
@@ -1184,6 +1343,53 @@ struct phy_driver {
 	/** @get_sqi_max: Get the maximum signal quality indication */
 	int (*get_sqi_max)(struct phy_device *dev);
 
+	/**
+	 * @get_mse_capability: Get capabilities and scale of MSE measurement
+	 * @dev:    PHY device
+	 * @cap: Output (filled on success)
+	 *
+	 * Fill @cap with the PHY's MSE capability for the current
+	 * link mode: scale limits (max_average_mse, max_peak_mse), update
+	 * interval (refresh_rate_ps), sample length (num_symbols) and the
+	 * capability bitmask (supported_caps).
+	 *
+	 * Implementations may defer capability report until hardware has
+	 * converged; in that case they should return -EAGAIN and allow the
+	 * caller to retry later.
+	 *
+	 * Return: 0 on success. On failure, returns a negative errno code, such
+	 * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in
+	 * the current link mode, or -EAGAIN if the capability information is
+	 * not yet available.
+	 */
+	int (*get_mse_capability)(struct phy_device *dev,
+				  struct phy_mse_capability *cap);
+
+	/**
+	 * @get_mse_snapshot: Retrieve a snapshot of MSE diagnostic values
+	 * @dev:      PHY device
+	 * @channel:  Channel identifier (PHY_MSE_CHANNEL_*)
+	 * @snapshot: Output (filled on success)
+	 *
+	 * Fill @snapshot with a correlated set of MSE values from the most
+	 * recent measurement window.
+	 *
+	 * Callers must validate @channel against supported_caps returned by
+	 * get_mse_capability(). Drivers must not coerce @channel; if the
+	 * requested selector is not implemented by the device or current link
+	 * mode, the operation must fail.
+	 *
+	 * worst_peak_mse is latched and must be treated as read-to-clear.
+	 *
+	 * Return: 0 on success. On failure, returns a negative errno code, such
+	 * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in
+	 * the current link mode, or -EAGAIN if measurements are not yet
+	 * available.
+	 */
+	int (*get_mse_snapshot)(struct phy_device *dev,
+				enum phy_mse_channel channel,
+				struct phy_mse_snapshot *snapshot);
+
 	/* PLCA RS interface */
 	/** @get_plca_cfg: Return the current PLCA configuration */
 	int (*get_plca_cfg)(struct phy_device *dev,
-- 
cgit v1.2.3


From e6e93fb01302e9b7a15d17f3b8a00eff8a601654 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 27 Oct 2025 13:27:59 +0100
Subject: ethtool: netlink: add ETHTOOL_MSG_MSE_GET and wire up PHY MSE access

Introduce the userspace entry point for PHY MSE diagnostics via
ethtool netlink. This exposes the core API added previously and
returns both capability information and one or more snapshots.

Userspace sends ETHTOOL_MSG_MSE_GET. The reply carries:
- ETHTOOL_A_MSE_CAPABILITIES: scale limits and timing information
- ETHTOOL_A_MSE_CHANNEL_* nests: one or more snapshots (per-channel
  if available, otherwise WORST, otherwise LINK)

Link down returns -ENETDOWN.

Changes:
  - YAML: add attribute sets (mse, mse-capabilities, mse-snapshot)
    and the mse-get operation
  - UAPI (generated): add ETHTOOL_A_MSE_* enums and message IDs,
    ETHTOOL_MSG_MSE_GET/REPLY
  - ethtool core: add net/ethtool/mse.c implementing the request,
    register genl op, and hook into ethnl dispatch
  - docs: document MSE_GET in ethtool-netlink.rst

The include/uapi/linux/ethtool_netlink_generated.h is generated
from Documentation/netlink/specs/ethtool.yaml.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20251027122801.982364-3-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  86 +++++++
 Documentation/networking/ethtool-netlink.rst   |  64 +++++
 include/uapi/linux/ethtool_netlink_generated.h |  35 +++
 net/ethtool/Makefile                           |   2 +-
 net/ethtool/mse.c                              | 329 +++++++++++++++++++++++++
 net/ethtool/netlink.c                          |  10 +
 net/ethtool/netlink.h                          |   2 +
 7 files changed, 527 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/mse.c

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 6a0fb1974513..05d2b6508b59 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -1823,6 +1823,73 @@ attribute-sets:
         type: uint
         enum: pse-event
         doc: List of events reported by the PSE controller
+  -
+    name: mse-capabilities
+    doc: MSE capabilities attribute set
+    attr-cnt-name: --ethtool-a-mse-capabilities-cnt
+    attributes:
+      -
+        name: max-average-mse
+        type: uint
+      -
+        name: max-peak-mse
+        type: uint
+      -
+        name: refresh-rate-ps
+        type: uint
+      -
+        name: num-symbols
+        type: uint
+  -
+    name: mse-snapshot
+    doc: MSE snapshot attribute set
+    attr-cnt-name: --ethtool-a-mse-snapshot-cnt
+    attributes:
+      -
+        name: average-mse
+        type: uint
+      -
+        name: peak-mse
+        type: uint
+      -
+        name: worst-peak-mse
+        type: uint
+  -
+    name: mse
+    attr-cnt-name: --ethtool-a-mse-cnt
+    attributes:
+      -
+        name: header
+        type: nest
+        nested-attributes: header
+      -
+        name: capabilities
+        type: nest
+        nested-attributes: mse-capabilities
+      -
+        name: channel-a
+        type: nest
+        nested-attributes: mse-snapshot
+      -
+        name: channel-b
+        type: nest
+        nested-attributes: mse-snapshot
+      -
+        name: channel-c
+        type: nest
+        nested-attributes: mse-snapshot
+      -
+        name: channel-d
+        type: nest
+        nested-attributes: mse-snapshot
+      -
+        name: worst-channel
+        type: nest
+        nested-attributes: mse-snapshot
+      -
+        name: link
+        type: nest
+        nested-attributes: mse-snapshot
 
 operations:
   enum-model: directional
@@ -2756,6 +2823,25 @@ operations:
         attributes:
           - header
           - context
+    -
+      name: mse-get
+      doc: Get PHY MSE measurement data and capabilities.
+      attribute-set: mse
+      do: &mse-get-op
+        request:
+          attributes:
+            - header
+        reply:
+          attributes:
+            - header
+            - capabilities
+            - channel-a
+            - channel-b
+            - channel-c
+            - channel-d
+            - worst-channel
+            - link
+      dump: *mse-get-op
 
 mcast-groups:
   list:
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index b270886c5f5d..af56c304cef4 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -242,6 +242,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_RSS_SET``               set RSS settings
   ``ETHTOOL_MSG_RSS_CREATE_ACT``        create an additional RSS context
   ``ETHTOOL_MSG_RSS_DELETE_ACT``        delete an additional RSS context
+  ``ETHTOOL_MSG_MSE_GET``               get MSE diagnostic data
   ===================================== =================================
 
 Kernel to userspace:
@@ -299,6 +300,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY``     create an additional RSS context
   ``ETHTOOL_MSG_RSS_CREATE_NTF``           additional RSS context created
   ``ETHTOOL_MSG_RSS_DELETE_NTF``           additional RSS context deleted
+  ``ETHTOOL_MSG_MSE_GET_REPLY``            MSE diagnostic data
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -2458,6 +2460,68 @@ Kernel response contents:
 
 For a description of each attribute, see ``TSCONFIG_GET``.
 
+MSE_GET
+=======
+
+Retrieves detailed Mean Square Error (MSE) diagnostic information from the PHY.
+
+Request Contents:
+
+  ====================================  ======  ============================
+  ``ETHTOOL_A_MSE_HEADER``              nested  request header
+  ====================================  ======  ============================
+
+Kernel Response Contents:
+
+  ====================================  ======  ================================
+  ``ETHTOOL_A_MSE_HEADER``              nested  reply header
+  ``ETHTOOL_A_MSE_CAPABILITIES``        nested  capability/scale info for MSE
+                                                measurements
+  ``ETHTOOL_A_MSE_CHANNEL_A``           nested  snapshot for Channel A
+  ``ETHTOOL_A_MSE_CHANNEL_B``           nested  snapshot for Channel B
+  ``ETHTOOL_A_MSE_CHANNEL_C``           nested  snapshot for Channel C
+  ``ETHTOOL_A_MSE_CHANNEL_D``           nested  snapshot for Channel D
+  ``ETHTOOL_A_MSE_WORST_CHANNEL``       nested  snapshot for worst channel
+  ``ETHTOOL_A_MSE_LINK``                nested  snapshot for link-wide aggregate
+  ====================================  ======  ================================
+
+MSE Capabilities
+----------------
+
+This nested attribute reports the capability / scaling properties used to
+interpret snapshot values.
+
+  ============================================== ======  =========================
+  ``ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE`` uint    max avg_mse scale
+  ``ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE``    uint    max peak_mse scale
+  ``ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS`` uint    sample rate (picoseconds)
+  ``ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS``     uint    symbols per HW sample
+  ============================================== ======  =========================
+
+The max-average/peak fields are included only if the corresponding metric
+is supported by the PHY. Their absence indicates that the metric is not
+available.
+
+See ``struct phy_mse_capability`` kernel documentation in
+``include/linux/phy.h``.
+
+MSE Snapshot
+------------
+
+Each per-channel nest contains an atomic snapshot of MSE values for that
+selector (channel A/B/C/D, worst channel, or link).
+
+  ==========================================  ======  ===================
+  ``ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE``      uint    average MSE value
+  ``ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE``         uint    current peak MSE
+  ``ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE``   uint    worst-case peak MSE
+  ==========================================  ======  ===================
+
+Within each channel nest, only the metrics supported by the PHY will be present.
+
+See ``struct phy_mse_snapshot`` kernel documentation in
+``include/linux/phy.h``.
+
 Request translation
 ===================
 
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 0e8ac0d974e2..b71b175df46d 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -803,6 +803,39 @@ enum {
 	ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE = 1,
+	ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE,
+	ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS,
+	ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS,
+
+	__ETHTOOL_A_MSE_CAPABILITIES_CNT,
+	ETHTOOL_A_MSE_CAPABILITIES_MAX = (__ETHTOOL_A_MSE_CAPABILITIES_CNT - 1)
+};
+
+enum {
+	ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE = 1,
+	ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE,
+	ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE,
+
+	__ETHTOOL_A_MSE_SNAPSHOT_CNT,
+	ETHTOOL_A_MSE_SNAPSHOT_MAX = (__ETHTOOL_A_MSE_SNAPSHOT_CNT - 1)
+};
+
+enum {
+	ETHTOOL_A_MSE_HEADER = 1,
+	ETHTOOL_A_MSE_CAPABILITIES,
+	ETHTOOL_A_MSE_CHANNEL_A,
+	ETHTOOL_A_MSE_CHANNEL_B,
+	ETHTOOL_A_MSE_CHANNEL_C,
+	ETHTOOL_A_MSE_CHANNEL_D,
+	ETHTOOL_A_MSE_WORST_CHANNEL,
+	ETHTOOL_A_MSE_LINK,
+
+	__ETHTOOL_A_MSE_CNT,
+	ETHTOOL_A_MSE_MAX = (__ETHTOOL_A_MSE_CNT - 1)
+};
+
 enum {
 	ETHTOOL_MSG_USER_NONE = 0,
 	ETHTOOL_MSG_STRSET_GET = 1,
@@ -855,6 +888,7 @@ enum {
 	ETHTOOL_MSG_RSS_SET,
 	ETHTOOL_MSG_RSS_CREATE_ACT,
 	ETHTOOL_MSG_RSS_DELETE_ACT,
+	ETHTOOL_MSG_MSE_GET,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -915,6 +949,7 @@ enum {
 	ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
 	ETHTOOL_MSG_RSS_CREATE_NTF,
 	ETHTOOL_MSG_RSS_DELETE_NTF,
+	ETHTOOL_MSG_MSE_GET_REPLY,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 1e493553b977..629c10916670 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -9,4 +9,4 @@ ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
 		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
 		   module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \
-		   phy.o tsconfig.o
+		   phy.o tsconfig.o mse.o
diff --git a/net/ethtool/mse.c b/net/ethtool/mse.c
new file mode 100644
index 000000000000..6aac004c3ffc
--- /dev/null
+++ b/net/ethtool/mse.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/slab.h>
+
+#include "netlink.h"
+#include "common.h"
+
+/* Channels A-D only; WORST and LINK are exclusive alternatives */
+#define PHY_MSE_CHANNEL_COUNT 4
+
+struct mse_req_info {
+	struct ethnl_req_info base;
+};
+
+struct mse_snapshot_entry {
+	struct phy_mse_snapshot snapshot;
+	int channel;
+};
+
+struct mse_reply_data {
+	struct ethnl_reply_data base;
+	struct phy_mse_capability capability;
+	struct mse_snapshot_entry *snapshots;
+	unsigned int num_snapshots;
+};
+
+static struct mse_reply_data *
+mse_repdata(const struct ethnl_reply_data *reply_base)
+{
+	return container_of(reply_base, struct mse_reply_data, base);
+}
+
+const struct nla_policy ethnl_mse_get_policy[] = {
+	[ETHTOOL_A_MSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int get_snapshot_if_supported(struct phy_device *phydev,
+				     struct mse_reply_data *data,
+				     unsigned int *idx, u32 cap_bit,
+				     enum phy_mse_channel channel)
+{
+	int ret;
+
+	if (data->capability.supported_caps & cap_bit) {
+		ret = phydev->drv->get_mse_snapshot(phydev, channel,
+					&data->snapshots[*idx].snapshot);
+		if (ret)
+			return ret;
+		data->snapshots[*idx].channel = channel;
+		(*idx)++;
+	}
+
+	return 0;
+}
+
+static int mse_get_channels(struct phy_device *phydev,
+			    struct mse_reply_data *data)
+{
+	unsigned int i = 0;
+	int ret;
+
+	if (!data->capability.supported_caps)
+		return 0;
+
+	data->snapshots = kcalloc(PHY_MSE_CHANNEL_COUNT,
+				  sizeof(*data->snapshots), GFP_KERNEL);
+	if (!data->snapshots)
+		return -ENOMEM;
+
+	/* Priority 1: Individual channels */
+	ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_A,
+					PHY_MSE_CHANNEL_A);
+	if (ret)
+		return ret;
+	ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_B,
+					PHY_MSE_CHANNEL_B);
+	if (ret)
+		return ret;
+	ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_C,
+					PHY_MSE_CHANNEL_C);
+	if (ret)
+		return ret;
+	ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_D,
+					PHY_MSE_CHANNEL_D);
+	if (ret)
+		return ret;
+
+	/* If any individual channels were found, we are done. */
+	if (i > 0) {
+		data->num_snapshots = i;
+		return 0;
+	}
+
+	/* Priority 2: Worst channel, if no individual channels supported. */
+	ret = get_snapshot_if_supported(phydev, data, &i,
+					PHY_MSE_CAP_WORST_CHANNEL,
+					PHY_MSE_CHANNEL_WORST);
+	if (ret)
+		return ret;
+
+	/* If worst channel was found, we are done. */
+	if (i > 0) {
+		data->num_snapshots = i;
+		return 0;
+	}
+
+	/* Priority 3: Link-wide, if nothing else is supported. */
+	ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_LINK,
+					PHY_MSE_CHANNEL_LINK);
+	if (ret)
+		return ret;
+
+	data->num_snapshots = i;
+	return 0;
+}
+
+static int mse_prepare_data(const struct ethnl_req_info *req_base,
+			    struct ethnl_reply_data *reply_base,
+			    const struct genl_info *info)
+{
+	struct mse_reply_data *data = mse_repdata(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct phy_device *phydev;
+	int ret;
+
+	phydev = ethnl_req_get_phydev(req_base, info->attrs,
+				      ETHTOOL_A_MSE_HEADER, info->extack);
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+	if (!phydev)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret)
+		return ret;
+
+	mutex_lock(&phydev->lock);
+
+	if (!phydev->drv || !phydev->drv->get_mse_capability ||
+	    !phydev->drv->get_mse_snapshot) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
+	if (!phydev->link) {
+		ret = -ENETDOWN;
+		goto out_unlock;
+	}
+
+	ret = phydev->drv->get_mse_capability(phydev, &data->capability);
+	if (ret)
+		goto out_unlock;
+
+	ret = mse_get_channels(phydev, data);
+
+out_unlock:
+	mutex_unlock(&phydev->lock);
+	ethnl_ops_complete(dev);
+	if (ret)
+		kfree(data->snapshots);
+	return ret;
+}
+
+static void mse_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	struct mse_reply_data *data = mse_repdata(reply_base);
+
+	kfree(data->snapshots);
+}
+
+static int mse_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct mse_reply_data *data = mse_repdata(reply_base);
+	size_t len = 0;
+	unsigned int i;
+
+	/* ETHTOOL_A_MSE_CAPABILITIES */
+	len += nla_total_size(0);
+	if (data->capability.supported_caps & PHY_MSE_CAP_AVG)
+		/* ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE */
+		len += nla_total_size(sizeof(u64));
+	if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK |
+					       PHY_MSE_CAP_WORST_PEAK))
+		/* ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE */
+		len += nla_total_size(sizeof(u64));
+	/* ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS */
+	len += nla_total_size(sizeof(u64));
+	/* ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS */
+	len += nla_total_size(sizeof(u64));
+
+	for (i = 0; i < data->num_snapshots; i++) {
+		size_t snapshot_len = 0;
+
+		/* Per-channel nest (e.g., ETHTOOL_A_MSE_CHANNEL_A / _B / _C /
+		 * _D / _WORST_CHANNEL / _LINK)
+		 */
+		snapshot_len += nla_total_size(0);
+
+		if (data->capability.supported_caps & PHY_MSE_CAP_AVG)
+			snapshot_len += nla_total_size(sizeof(u64));
+		if (data->capability.supported_caps & PHY_MSE_CAP_PEAK)
+			snapshot_len += nla_total_size(sizeof(u64));
+		if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK)
+			snapshot_len += nla_total_size(sizeof(u64));
+
+		len += snapshot_len;
+	}
+
+	return len;
+}
+
+static int mse_channel_to_attr(int ch)
+{
+	switch (ch) {
+	case PHY_MSE_CHANNEL_A:
+		return ETHTOOL_A_MSE_CHANNEL_A;
+	case PHY_MSE_CHANNEL_B:
+		return ETHTOOL_A_MSE_CHANNEL_B;
+	case PHY_MSE_CHANNEL_C:
+		return ETHTOOL_A_MSE_CHANNEL_C;
+	case PHY_MSE_CHANNEL_D:
+		return ETHTOOL_A_MSE_CHANNEL_D;
+	case PHY_MSE_CHANNEL_WORST:
+		return ETHTOOL_A_MSE_WORST_CHANNEL;
+	case PHY_MSE_CHANNEL_LINK:
+		return ETHTOOL_A_MSE_LINK;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int mse_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct mse_reply_data *data = mse_repdata(reply_base);
+	struct nlattr *nest;
+	unsigned int i;
+	int ret;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_MSE_CAPABILITIES);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (data->capability.supported_caps & PHY_MSE_CAP_AVG) {
+		ret = nla_put_uint(skb,
+				   ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE,
+				   data->capability.max_average_mse);
+		if (ret < 0)
+			goto nla_put_nest_failure;
+	}
+
+	if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK |
+					       PHY_MSE_CAP_WORST_PEAK)) {
+		ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE,
+				   data->capability.max_peak_mse);
+		if (ret < 0)
+			goto nla_put_nest_failure;
+	}
+
+	ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS,
+			   data->capability.refresh_rate_ps);
+	if (ret < 0)
+		goto nla_put_nest_failure;
+
+	ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS,
+			   data->capability.num_symbols);
+	if (ret < 0)
+		goto nla_put_nest_failure;
+
+	nla_nest_end(skb, nest);
+
+	for (i = 0; i < data->num_snapshots; i++) {
+		const struct mse_snapshot_entry *s = &data->snapshots[i];
+		int chan_attr;
+
+		chan_attr = mse_channel_to_attr(s->channel);
+		if (chan_attr < 0)
+			return chan_attr;
+
+		nest = nla_nest_start(skb, chan_attr);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (data->capability.supported_caps & PHY_MSE_CAP_AVG) {
+			ret = nla_put_uint(skb,
+					   ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE,
+					   s->snapshot.average_mse);
+			if (ret)
+				goto nla_put_nest_failure;
+		}
+		if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) {
+			ret = nla_put_uint(skb, ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE,
+					   s->snapshot.peak_mse);
+			if (ret)
+				goto nla_put_nest_failure;
+		}
+		if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) {
+			ret = nla_put_uint(skb,
+					   ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE,
+					   s->snapshot.worst_peak_mse);
+			if (ret)
+				goto nla_put_nest_failure;
+		}
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+nla_put_nest_failure:
+	nla_nest_cancel(skb, nest);
+	return ret;
+}
+
+const struct ethnl_request_ops ethnl_mse_request_ops = {
+	.request_cmd = ETHTOOL_MSG_MSE_GET,
+	.reply_cmd = ETHTOOL_MSG_MSE_GET_REPLY,
+	.hdr_attr = ETHTOOL_A_MSE_HEADER,
+	.req_info_size = sizeof(struct mse_req_info),
+	.reply_data_size = sizeof(struct mse_reply_data),
+
+	.prepare_data = mse_prepare_data,
+	.cleanup_data = mse_cleanup_data,
+	.reply_size = mse_reply_size,
+	.fill_reply = mse_fill_reply,
+};
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 2f813f25f07e..6e5f0f4f815a 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -420,6 +420,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_TSCONFIG_GET]	= &ethnl_tsconfig_request_ops,
 	[ETHTOOL_MSG_TSCONFIG_SET]	= &ethnl_tsconfig_request_ops,
 	[ETHTOOL_MSG_PHY_GET]		= &ethnl_phy_request_ops,
+	[ETHTOOL_MSG_MSE_GET]		= &ethnl_mse_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -1534,6 +1535,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy	= ethnl_rss_delete_policy,
 		.maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_MSE_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_perphy_start,
+		.dumpit	= ethnl_perphy_dumpit,
+		.done	= ethnl_perphy_done,
+		.policy = ethnl_mse_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_mse_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 1d4f9ecb3d26..89010eaa67df 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -442,6 +442,7 @@ extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
 extern const struct ethnl_request_ops ethnl_mm_request_ops;
 extern const struct ethnl_request_ops ethnl_phy_request_ops;
 extern const struct ethnl_request_ops ethnl_tsconfig_request_ops;
+extern const struct ethnl_request_ops ethnl_mse_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -497,6 +498,7 @@ extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE
 extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1];
 extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1];
 extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1];
+extern const struct nla_policy ethnl_mse_get_policy[ETHTOOL_A_MSE_HEADER + 1];
 
 int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
 int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
-- 
cgit v1.2.3


From 85d55d8cc3ef7f77b249c97e9fac6a0fc5f5daa7 Mon Sep 17 00:00:00 2001
From: Akhil P Oommen <akhilpo@oss.qualcomm.com>
Date: Tue, 30 Sep 2025 11:18:06 +0530
Subject: soc: qcom: ubwc: Add config for Kaanapali

Add the UBWC configuration for the Kaanapali chipset. This chipset
brings support for UBWC version 6. The rest of the configuration
remains as usual.

Signed-off-by: Akhil P Oommen <akhilpo@oss.qualcomm.com>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250930-kaana-gpu-support-v1-1-73530b0700ed@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/ubwc_config.c | 11 +++++++++++
 include/linux/soc/qcom/ubwc.h  |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/qcom/ubwc_config.c b/drivers/soc/qcom/ubwc_config.c
index 942fe6c17612..1c09796163b0 100644
--- a/drivers/soc/qcom/ubwc_config.c
+++ b/drivers/soc/qcom/ubwc_config.c
@@ -16,6 +16,16 @@ static const struct qcom_ubwc_cfg_data no_ubwc_data = {
 	/* no UBWC, no HBB */
 };
 
+static const struct qcom_ubwc_cfg_data kaanapali_data = {
+	.ubwc_enc_version = UBWC_6_0,
+	.ubwc_dec_version = UBWC_6_0,
+	.ubwc_swizzle = UBWC_SWIZZLE_ENABLE_LVL2 |
+			UBWC_SWIZZLE_ENABLE_LVL3,
+	.ubwc_bank_spread = true,
+	.highest_bank_bit = 16,
+	.macrotile_mode = true,
+};
+
 static const struct qcom_ubwc_cfg_data msm8937_data = {
 	.ubwc_enc_version = UBWC_1_0,
 	.ubwc_dec_version = UBWC_1_0,
@@ -234,6 +244,7 @@ static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = {
 	{ .compatible = "qcom,apq8026", .data = &no_ubwc_data },
 	{ .compatible = "qcom,apq8074", .data = &no_ubwc_data },
 	{ .compatible = "qcom,apq8096", .data = &msm8998_data },
+	{ .compatible = "qcom,kaanapali", .data = &kaanapali_data, },
 	{ .compatible = "qcom,glymur", .data = &glymur_data},
 	{ .compatible = "qcom,msm8226", .data = &no_ubwc_data },
 	{ .compatible = "qcom,msm8916", .data = &no_ubwc_data },
diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h
index 1ed8b1b16bc9..0a4edfe3d96d 100644
--- a/include/linux/soc/qcom/ubwc.h
+++ b/include/linux/soc/qcom/ubwc.h
@@ -52,6 +52,7 @@ struct qcom_ubwc_cfg_data {
 #define UBWC_4_0 0x40000000
 #define UBWC_4_3 0x40030000
 #define UBWC_5_0 0x50000000
+#define UBWC_6_0 0x60000000
 
 #if IS_ENABLED(CONFIG_QCOM_UBWC_CONFIG)
 const struct qcom_ubwc_cfg_data *qcom_ubwc_config_get_data(void);
-- 
cgit v1.2.3


From 603c646f001008eaf8b5a7a888043e5cc8c494a2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:53 -0700
Subject: coco/tsm: Introduce a core device for TEE Security Managers

A "TSM" is a platform component that provides an API for securely
provisioning resources for a confidential guest (TVM) to consume. The
name originates from the PCI specification's term for the platform agent
that carries out operations for PCIe TDISP (TEE Device Interface
Security Protocol).

Instances of this core device are parented by a device representing the
platform security function like CONFIG_CRYPTO_DEV_CCP or
CONFIG_INTEL_TDX_HOST.

This device interface is a frontend to the aspects of a TSM and TEE I/O
that are common across architectures. This includes mechanisms like
enumerating available platform TEE I/O capabilities and provisioning
connections between the platform TSM and device DSMs (Device Security
Manager (TDISP)).

For now this is just the scaffolding for registering a TSM device sysfs
interface.

Cc: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Co-developed-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251031212902.2256310-2-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-class-tsm |  9 +++
 MAINTAINERS                               |  2 +-
 drivers/virt/coco/Kconfig                 |  3 +
 drivers/virt/coco/Makefile                |  1 +
 drivers/virt/coco/tsm-core.c              | 93 +++++++++++++++++++++++++++++++
 include/linux/tsm.h                       | 11 ++++
 6 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-tsm
 create mode 100644 drivers/virt/coco/tsm-core.c

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm
new file mode 100644
index 000000000000..2949468deaf7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-tsm
@@ -0,0 +1,9 @@
+What:		/sys/class/tsm/tsmN
+Contact:	linux-coco@lists.linux.dev
+Description:
+		"tsmN" is a device that represents the generic attributes of a
+		platform TEE Security Manager.  It is typically a child of a
+		platform enumerated TSM device. /sys/class/tsm/tsmN/uevent
+		signals when the PCI layer is able to support establishment of
+		link encryption and other device-security features coordinated
+		through a platform TSM.
diff --git a/MAINTAINERS b/MAINTAINERS
index 46bd8e033042..b8c9929532ed 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26112,7 +26112,7 @@ M:	David Lechner <dlechner@baylibre.com>
 S:	Maintained
 F:	Documentation/devicetree/bindings/trigger-source/*
 
-TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE
+TRUSTED EXECUTION ENVIRONMENT SECURITY MANAGER (TSM)
 M:	Dan Williams <dan.j.williams@intel.com>
 L:	linux-coco@lists.linux.dev
 S:	Maintained
diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig
index 819a97e8ba99..bb0c6d6ddcc8 100644
--- a/drivers/virt/coco/Kconfig
+++ b/drivers/virt/coco/Kconfig
@@ -14,3 +14,6 @@ source "drivers/virt/coco/tdx-guest/Kconfig"
 source "drivers/virt/coco/arm-cca-guest/Kconfig"
 
 source "drivers/virt/coco/guest/Kconfig"
+
+config TSM
+	bool
diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile
index f918bbb61737..cb52021912b3 100644
--- a/drivers/virt/coco/Makefile
+++ b/drivers/virt/coco/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_ARM_PKVM_GUEST)	+= pkvm-guest/
 obj-$(CONFIG_SEV_GUEST)		+= sev-guest/
 obj-$(CONFIG_INTEL_TDX_GUEST)	+= tdx-guest/
 obj-$(CONFIG_ARM_CCA_GUEST)	+= arm-cca-guest/
+obj-$(CONFIG_TSM) 		+= tsm-core.o
 obj-$(CONFIG_TSM_GUEST)		+= guest/
diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c
new file mode 100644
index 000000000000..347507cc5e3f
--- /dev/null
+++ b/drivers/virt/coco/tsm-core.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/tsm.h>
+#include <linux/rwsem.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/cleanup.h>
+
+static struct class *tsm_class;
+static DECLARE_RWSEM(tsm_rwsem);
+static DEFINE_IDA(tsm_ida);
+
+static struct tsm_dev *alloc_tsm_dev(struct device *parent)
+{
+	struct device *dev;
+	int id;
+
+	struct tsm_dev *tsm_dev __free(kfree) =
+		kzalloc(sizeof(*tsm_dev), GFP_KERNEL);
+	if (!tsm_dev)
+		return ERR_PTR(-ENOMEM);
+
+	id = ida_alloc(&tsm_ida, GFP_KERNEL);
+	if (id < 0)
+		return ERR_PTR(id);
+
+	tsm_dev->id = id;
+	dev = &tsm_dev->dev;
+	dev->parent = parent;
+	dev->class = tsm_class;
+	device_initialize(dev);
+
+	return no_free_ptr(tsm_dev);
+}
+
+struct tsm_dev *tsm_register(struct device *parent)
+{
+	struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent);
+	struct device *dev;
+	int rc;
+
+	if (IS_ERR(tsm_dev))
+		return tsm_dev;
+
+	dev = &tsm_dev->dev;
+	rc = dev_set_name(dev, "tsm%d", tsm_dev->id);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = device_add(dev);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return no_free_ptr(tsm_dev);
+}
+EXPORT_SYMBOL_GPL(tsm_register);
+
+void tsm_unregister(struct tsm_dev *tsm_dev)
+{
+	device_unregister(&tsm_dev->dev);
+}
+EXPORT_SYMBOL_GPL(tsm_unregister);
+
+static void tsm_release(struct device *dev)
+{
+	struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev);
+
+	ida_free(&tsm_ida, tsm_dev->id);
+	kfree(tsm_dev);
+}
+
+static int __init tsm_init(void)
+{
+	tsm_class = class_create("tsm");
+	if (IS_ERR(tsm_class))
+		return PTR_ERR(tsm_class);
+
+	tsm_class->dev_release = tsm_release;
+	return 0;
+}
+module_init(tsm_init)
+
+static void __exit tsm_exit(void)
+{
+	class_destroy(tsm_class);
+}
+module_exit(tsm_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TEE Security Manager Class Device");
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
index 431054810dca..cd97c63ffa32 100644
--- a/include/linux/tsm.h
+++ b/include/linux/tsm.h
@@ -5,6 +5,7 @@
 #include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
+#include <linux/device.h>
 
 #define TSM_REPORT_INBLOB_MAX 64
 #define TSM_REPORT_OUTBLOB_MAX SZ_32K
@@ -107,6 +108,16 @@ struct tsm_report_ops {
 	bool (*report_bin_attr_visible)(int n);
 };
 
+struct tsm_dev {
+	struct device dev;
+	int id;
+};
+
+DEFINE_FREE(put_tsm_dev, struct tsm_dev *,
+	    if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev))
+
 int tsm_report_register(const struct tsm_report_ops *ops, void *priv);
 int tsm_report_unregister(const struct tsm_report_ops *ops);
+struct tsm_dev *tsm_register(struct device *parent);
+void tsm_unregister(struct tsm_dev *tsm_dev);
 #endif /* __TSM_H */
-- 
cgit v1.2.3


From f16469ee733ac52b2373216803699cbb05e82786 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:54 -0700
Subject: PCI/IDE: Enumerate Selective Stream IDE capabilities

Link encryption is a new PCIe feature enumerated by "PCIe r7.0 section
7.9.26 IDE Extended Capability".

It is both a standalone port + endpoint capability, and a building block
for the security protocol defined by "PCIe r7.0 section 11 TEE Device
Interface Security Protocol (TDISP)". That protocol coordinates device
security setup between a platform TSM (TEE Security Manager) and a
device DSM (Device Security Manager). While the platform TSM can
allocate resources like Stream ID and manage keys, it still requires
system software to manage the IDE capability register block.

Add register definitions and basic enumeration in preparation for
Selective IDE Stream establishment. A follow-on change selects the new
CONFIG_PCI_IDE symbol. Note that while the IDE specification defines
both a point-to-point "Link Stream" and a Root Port to endpoint
"Selective Stream", only "Selective Stream" is considered for Linux as
that is the predominant mode expected by Trusted Execution Environment
Security Managers (TSMs), and it is the security model that limits the
number of PCI components within the TCB in a PCIe topology with
switches.

Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Link: https://patch.msgid.link/20251031212902.2256310-3-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/Kconfig           |  3 ++
 drivers/pci/Makefile          |  1 +
 drivers/pci/ide.c             | 88 +++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h             |  6 +++
 drivers/pci/probe.c           |  1 +
 include/linux/pci.h           |  7 ++++
 include/uapi/linux/pci_regs.h | 81 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 187 insertions(+)
 create mode 100644 drivers/pci/ide.c

(limited to 'include')

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index f94f5d384362..b28423e2057f 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -122,6 +122,9 @@ config XEN_PCIDEV_FRONTEND
 config PCI_ATS
 	bool
 
+config PCI_IDE
+	bool
+
 config PCI_DOE
 	bool "Enable PCI Data Object Exchange (DOE) support"
 	help
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..6612256fd37d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_PCI_P2PDMA)	+= p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
 obj-$(CONFIG_PCI_DOE)		+= doe.o
+obj-$(CONFIG_PCI_IDE)		+= ide.o
 obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
 obj-$(CONFIG_PCI_NPEM)		+= npem.o
 obj-$(CONFIG_PCIE_TPH)		+= tph.o
diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
new file mode 100644
index 000000000000..26866edf91b4
--- /dev/null
+++ b/drivers/pci/ide.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */
+
+/* PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) */
+
+#define dev_fmt(fmt) "PCI/IDE: " fmt
+#include <linux/bitfield.h>
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+
+#include "pci.h"
+
+static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index,
+			    u8 nr_ide_mem)
+{
+	u32 offset = ide_cap + PCI_IDE_LINK_STREAM_0 +
+		     nr_link_ide * PCI_IDE_LINK_BLOCK_SIZE;
+
+	/*
+	 * Assume a constant number of address association resources per stream
+	 * index
+	 */
+	return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem);
+}
+
+void pci_ide_init(struct pci_dev *pdev)
+{
+	u16 nr_link_ide, nr_ide_mem, nr_streams;
+	u16 ide_cap;
+	u32 val;
+
+	if (!pci_is_pcie(pdev))
+		return;
+
+	ide_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_IDE);
+	if (!ide_cap)
+		return;
+
+	pci_read_config_dword(pdev, ide_cap + PCI_IDE_CAP, &val);
+	if ((val & PCI_IDE_CAP_SELECTIVE) == 0)
+		return;
+
+	/*
+	 * Require endpoint IDE capability to be paired with Root Port IDE
+	 * capability.
+	 */
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) {
+		struct pci_dev *rp = pcie_find_root_port(pdev);
+
+		if (!rp->ide_cap)
+			return;
+	}
+
+	pdev->ide_cfg = FIELD_GET(PCI_IDE_CAP_SEL_CFG, val);
+	pdev->ide_tee_limit = FIELD_GET(PCI_IDE_CAP_TEE_LIMITED, val);
+
+	if (val & PCI_IDE_CAP_LINK)
+		nr_link_ide = 1 + FIELD_GET(PCI_IDE_CAP_LINK_TC_NUM, val);
+	else
+		nr_link_ide = 0;
+
+	nr_ide_mem = 0;
+	nr_streams = 1 + FIELD_GET(PCI_IDE_CAP_SEL_NUM, val);
+	for (u16 i = 0; i < nr_streams; i++) {
+		int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem);
+		int nr_assoc;
+		u32 val;
+
+		pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val);
+
+		/*
+		 * Let's not entertain streams that do not have a constant
+		 * number of address association blocks
+		 */
+		nr_assoc = FIELD_GET(PCI_IDE_SEL_CAP_ASSOC_NUM, val);
+		if (i && (nr_assoc != nr_ide_mem)) {
+			pci_info(pdev, "Unsupported Selective Stream %d capability, SKIP the rest\n", i);
+			nr_streams = i;
+			break;
+		}
+
+		nr_ide_mem = nr_assoc;
+	}
+
+	pdev->ide_cap = ide_cap;
+	pdev->nr_link_ide = nr_link_ide;
+	pdev->nr_ide_mem = nr_ide_mem;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4492b809094b..86ef13e7cece 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -613,6 +613,12 @@ static inline void pci_doe_sysfs_init(struct pci_dev *pdev) { }
 static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { }
 #endif
 
+#ifdef CONFIG_PCI_IDE
+void pci_ide_init(struct pci_dev *dev);
+#else
+static inline void pci_ide_init(struct pci_dev *dev) { }
+#endif
+
 /**
  * pci_dev_set_io_state - Set the new error state if possible.
  *
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0ce98e18b5a8..4c55020f3ddf 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2667,6 +2667,7 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_doe_init(dev);		/* Data Object Exchange */
 	pci_tph_init(dev);		/* TLP Processing Hints */
 	pci_rebar_init(dev);		/* Resizable BAR */
+	pci_ide_init(dev);		/* Link Integrity and Data Encryption */
 
 	pcie_report_downtraining(dev);
 	pci_init_reset_methods(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..4402ca931124 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -539,6 +539,13 @@ struct pci_dev {
 #endif
 #ifdef CONFIG_PCI_NPEM
 	struct npem	*npem;		/* Native PCIe Enclosure Management */
+#endif
+#ifdef CONFIG_PCI_IDE
+	u16		ide_cap;	/* Link Integrity & Data Encryption */
+	u8		nr_ide_mem;	/* Address association resources for streams */
+	u8		nr_link_ide;	/* Link Stream count (Selective Stream offset) */
+	unsigned int	ide_cfg:1;	/* Config cycles over IDE */
+	unsigned int	ide_tee_limit:1; /* Disallow T=0 traffic over IDE */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
 	u8		supported_speeds; /* Supported Link Speeds Vector */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 07e06aafec50..05bd22d9e352 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -754,6 +754,7 @@
 #define PCI_EXT_CAP_ID_NPEM	0x29	/* Native PCIe Enclosure Management */
 #define PCI_EXT_CAP_ID_PL_32GT  0x2A    /* Physical Layer 32.0 GT/s */
 #define PCI_EXT_CAP_ID_DOE	0x2E	/* Data Object Exchange */
+#define PCI_EXT_CAP_ID_IDE	0x30    /* Integrity and Data Encryption */
 #define PCI_EXT_CAP_ID_PL_64GT	0x31	/* Physical Layer 64.0 GT/s */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_64GT
 
@@ -1249,4 +1250,84 @@
 #define PCI_DVSEC_CXL_PORT_CTL				0x0c
 #define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
 
+/* Integrity and Data Encryption Extended Capability */
+#define PCI_IDE_CAP			0x04
+#define  PCI_IDE_CAP_LINK		0x1  /* Link IDE Stream Supported */
+#define  PCI_IDE_CAP_SELECTIVE		0x2  /* Selective IDE Streams Supported */
+#define  PCI_IDE_CAP_FLOWTHROUGH	0x4  /* Flow-Through IDE Stream Supported */
+#define  PCI_IDE_CAP_PARTIAL_HEADER_ENC 0x8  /* Partial Header Encryption Supported */
+#define  PCI_IDE_CAP_AGGREGATION	0x10 /* Aggregation Supported */
+#define  PCI_IDE_CAP_PCRC		0x20 /* PCRC Supported */
+#define  PCI_IDE_CAP_IDE_KM		0x40 /* IDE_KM Protocol Supported */
+#define  PCI_IDE_CAP_SEL_CFG		0x80 /* Selective IDE for Config Request Support */
+#define  PCI_IDE_CAP_ALG		__GENMASK(12, 8) /* Supported Algorithms */
+#define   PCI_IDE_CAP_ALG_AES_GCM_256	0    /* AES-GCM 256 key size, 96b MAC */
+#define  PCI_IDE_CAP_LINK_TC_NUM	__GENMASK(15, 13) /* Link IDE TCs */
+#define  PCI_IDE_CAP_SEL_NUM		__GENMASK(23, 16) /* Supported Selective IDE Streams */
+#define  PCI_IDE_CAP_TEE_LIMITED	0x1000000 /* TEE-Limited Stream Supported */
+#define PCI_IDE_CTL			0x08
+#define  PCI_IDE_CTL_FLOWTHROUGH_IDE	0x4  /* Flow-Through IDE Stream Enabled */
+
+#define PCI_IDE_LINK_STREAM_0		0xc  /* First Link Stream Register Block */
+#define  PCI_IDE_LINK_BLOCK_SIZE	8
+/* Link IDE Stream block, up to PCI_IDE_CAP_LINK_TC_NUM */
+#define PCI_IDE_LINK_CTL_0		0x00		  /* First Link Control Register Offset in block */
+#define  PCI_IDE_LINK_CTL_EN		0x1		  /* Link IDE Stream Enable */
+#define  PCI_IDE_LINK_CTL_TX_AGGR_NPR	__GENMASK(3, 2)	  /* Tx Aggregation Mode NPR */
+#define  PCI_IDE_LINK_CTL_TX_AGGR_PR	__GENMASK(5, 4)	  /* Tx Aggregation Mode PR */
+#define  PCI_IDE_LINK_CTL_TX_AGGR_CPL	__GENMASK(7, 6)	  /* Tx Aggregation Mode CPL */
+#define  PCI_IDE_LINK_CTL_PCRC_EN	0x100		  /* PCRC Enable */
+#define  PCI_IDE_LINK_CTL_PART_ENC	__GENMASK(13, 10) /* Partial Header Encryption Mode */
+#define  PCI_IDE_LINK_CTL_ALG		__GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */
+#define  PCI_IDE_LINK_CTL_TC		__GENMASK(21, 19) /* Traffic Class */
+#define  PCI_IDE_LINK_CTL_ID		__GENMASK(31, 24) /* Stream ID */
+#define PCI_IDE_LINK_STS_0		0x4               /* First Link Status Register Offset in block */
+#define  PCI_IDE_LINK_STS_STATE		__GENMASK(3, 0)   /* Link IDE Stream State */
+#define  PCI_IDE_LINK_STS_IDE_FAIL	0x80000000	  /* IDE fail message received */
+
+/* Selective IDE Stream block, up to PCI_IDE_CAP_SEL_NUM */
+/* Selective IDE Stream Capability Register */
+#define  PCI_IDE_SEL_CAP		0x00
+#define   PCI_IDE_SEL_CAP_ASSOC_NUM	__GENMASK(3, 0)
+/* Selective IDE Stream Control Register */
+#define  PCI_IDE_SEL_CTL		0x04
+#define   PCI_IDE_SEL_CTL_EN		0x1		  /* Selective IDE Stream Enable */
+#define   PCI_IDE_SEL_CTL_TX_AGGR_NPR	__GENMASK(3, 2)	  /* Tx Aggregation Mode NPR */
+#define   PCI_IDE_SEL_CTL_TX_AGGR_PR	__GENMASK(5, 4)   /* Tx Aggregation Mode PR */
+#define   PCI_IDE_SEL_CTL_TX_AGGR_CPL	__GENMASK(7, 6)	  /* Tx Aggregation Mode CPL */
+#define   PCI_IDE_SEL_CTL_PCRC_EN	0x100		  /* PCRC Enable */
+#define   PCI_IDE_SEL_CTL_CFG_EN	0x200		  /* Selective IDE for Configuration Requests */
+#define   PCI_IDE_SEL_CTL_PART_ENC	__GENMASK(13, 10) /* Partial Header Encryption Mode */
+#define   PCI_IDE_SEL_CTL_ALG		__GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */
+#define   PCI_IDE_SEL_CTL_TC		__GENMASK(21, 19) /* Traffic Class */
+#define   PCI_IDE_SEL_CTL_DEFAULT	0x400000	  /* Default Stream */
+#define   PCI_IDE_SEL_CTL_TEE_LIMITED	0x800000	  /* TEE-Limited Stream */
+#define   PCI_IDE_SEL_CTL_ID		__GENMASK(31, 24) /* Stream ID */
+#define   PCI_IDE_SEL_CTL_ID_MAX	255
+/* Selective IDE Stream Status Register */
+#define  PCI_IDE_SEL_STS		 0x08
+#define   PCI_IDE_SEL_STS_STATE		 __GENMASK(3, 0) /* Selective IDE Stream State */
+#define   PCI_IDE_SEL_STS_STATE_INSECURE 0
+#define   PCI_IDE_SEL_STS_STATE_SECURE	 2
+#define   PCI_IDE_SEL_STS_IDE_FAIL	 0x80000000	 /* IDE fail message received */
+/* IDE RID Association Register 1 */
+#define  PCI_IDE_SEL_RID_1		 0x0c
+#define   PCI_IDE_SEL_RID_1_LIMIT	 __GENMASK(23, 8)
+/* IDE RID Association Register 2 */
+#define  PCI_IDE_SEL_RID_2		0x10
+#define   PCI_IDE_SEL_RID_2_VALID	0x1
+#define   PCI_IDE_SEL_RID_2_BASE	__GENMASK(23, 8)
+#define   PCI_IDE_SEL_RID_2_SEG		__GENMASK(31, 24)
+/* Selective IDE Address Association Register Block, up to PCI_IDE_SEL_CAP_ASSOC_NUM */
+#define PCI_IDE_SEL_ADDR_BLOCK_SIZE	12
+#define  PCI_IDE_SEL_ADDR_1(x)		(20 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE)
+#define   PCI_IDE_SEL_ADDR_1_VALID	0x1
+#define   PCI_IDE_SEL_ADDR_1_BASE_LOW	__GENMASK(19, 8)
+#define   PCI_IDE_SEL_ADDR_1_LIMIT_LOW	__GENMASK(31, 20)
+/* IDE Address Association Register 2 is "Memory Limit Upper" */
+#define  PCI_IDE_SEL_ADDR_2(x)		(24 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE)
+/* IDE Address Association Register 3 is "Memory Base Upper" */
+#define  PCI_IDE_SEL_ADDR_3(x)		(28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE)
+#define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc)  (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc))
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 215afa89d249bb095126cf00f8be719e421c75e9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:55 -0700
Subject: PCI: Introduce pci_walk_bus_reverse(), for_each_pci_dev_reverse()

PCI/TSM, the PCI core functionality for the PCIe TEE Device Interface
Security Protocol (TDISP), has a need to walk all subordinate functions of
a Device Security Manager (DSM) to set up a device security context. A DSM
is physical function 0 of a multi-function or SR-IOV device endpoint, or
it is an upstream switch port.

In error scenarios, or when a TEE Security Manager (TSM) device is removed,
PCI/TSM needs to unwind all established DSM contexts.

Introduce reverse versions of PCI device iteration helpers to mirror the
setup path and ensure that dependent children are handled before parents.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-4-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/base/bus.c         | 38 ++++++++++++++++++++++++++++
 drivers/pci/bus.c          | 39 +++++++++++++++++++++++++++++
 drivers/pci/search.c       | 62 ++++++++++++++++++++++++++++++++++++++++------
 include/linux/device/bus.h |  3 +++
 include/linux/pci.h        | 11 ++++++++
 5 files changed, 145 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 5e75e1bce551..d19dae8f9d1b 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -334,6 +334,19 @@ static struct device *next_device(struct klist_iter *i)
 	return dev;
 }
 
+static struct device *prev_device(struct klist_iter *i)
+{
+	struct klist_node *n = klist_prev(i);
+	struct device *dev = NULL;
+	struct device_private *dev_prv;
+
+	if (n) {
+		dev_prv = to_device_private_bus(n);
+		dev = dev_prv->device;
+	}
+	return dev;
+}
+
 /**
  * bus_for_each_dev - device iterator.
  * @bus: bus type.
@@ -414,6 +427,31 @@ struct device *bus_find_device(const struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_find_device);
 
+struct device *bus_find_device_reverse(const struct bus_type *bus,
+				       struct device *start, const void *data,
+				       device_match_t match)
+{
+	struct subsys_private *sp = bus_to_subsys(bus);
+	struct klist_iter i;
+	struct device *dev;
+
+	if (!sp)
+		return NULL;
+
+	klist_iter_init_node(&sp->klist_devices, &i,
+			     (start ? &start->p->knode_bus : NULL));
+	while ((dev = prev_device(&i))) {
+		if (match(dev, data)) {
+			get_device(dev);
+			break;
+		}
+	}
+	klist_iter_exit(&i);
+	subsys_put(sp);
+	return dev;
+}
+EXPORT_SYMBOL_GPL(bus_find_device_reverse);
+
 static struct device_driver *next_driver(struct klist_iter *i)
 {
 	struct klist_node *n = klist_next(i);
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index f26aec6ff588..b8b17f825bc0 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -8,6 +8,7 @@
  */
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/cleanup.h>
 #include <linux/pci.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
@@ -432,6 +433,27 @@ static int __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void
 	return ret;
 }
 
+static int __pci_walk_bus_reverse(struct pci_bus *top,
+				  int (*cb)(struct pci_dev *, void *),
+				  void *userdata)
+{
+	struct pci_dev *dev;
+	int ret = 0;
+
+	list_for_each_entry_reverse(dev, &top->devices, bus_list) {
+		if (dev->subordinate) {
+			ret = __pci_walk_bus_reverse(dev->subordinate, cb,
+						     userdata);
+			if (ret)
+				break;
+		}
+		ret = cb(dev, userdata);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 /**
  *  pci_walk_bus - walk devices on/under bus, calling callback.
  *  @top: bus whose devices should be walked
@@ -453,6 +475,23 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void
 }
 EXPORT_SYMBOL_GPL(pci_walk_bus);
 
+/**
+ * pci_walk_bus_reverse - walk devices on/under bus, calling callback.
+ * @top: bus whose devices should be walked
+ * @cb: callback to be called for each device found
+ * @userdata: arbitrary pointer to be passed to callback
+ *
+ * Same semantics as pci_walk_bus(), but walks the bus in reverse order.
+ */
+void pci_walk_bus_reverse(struct pci_bus *top,
+			  int (*cb)(struct pci_dev *, void *), void *userdata)
+{
+	down_read(&pci_bus_sem);
+	__pci_walk_bus_reverse(top, cb, userdata);
+	up_read(&pci_bus_sem);
+}
+EXPORT_SYMBOL_GPL(pci_walk_bus_reverse);
+
 void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata)
 {
 	lockdep_assert_held(&pci_bus_sem);
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 53840634fbfc..e6e84dc62e82 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -282,6 +282,45 @@ static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id,
 	return pdev;
 }
 
+static struct pci_dev *pci_get_dev_by_id_reverse(const struct pci_device_id *id,
+						 struct pci_dev *from)
+{
+	struct device *dev;
+	struct device *dev_start = NULL;
+	struct pci_dev *pdev = NULL;
+
+	if (from)
+		dev_start = &from->dev;
+	dev = bus_find_device_reverse(&pci_bus_type, dev_start, (void *)id,
+				      match_pci_dev_by_id);
+	if (dev)
+		pdev = to_pci_dev(dev);
+	pci_dev_put(from);
+	return pdev;
+}
+
+enum pci_search_direction {
+	PCI_SEARCH_FORWARD,
+	PCI_SEARCH_REVERSE,
+};
+
+static struct pci_dev *__pci_get_subsys(unsigned int vendor, unsigned int device,
+				 unsigned int ss_vendor, unsigned int ss_device,
+				 struct pci_dev *from, enum pci_search_direction dir)
+{
+	struct pci_device_id id = {
+		.vendor = vendor,
+		.device = device,
+		.subvendor = ss_vendor,
+		.subdevice = ss_device,
+	};
+
+	if (dir == PCI_SEARCH_FORWARD)
+		return pci_get_dev_by_id(&id, from);
+	else
+		return pci_get_dev_by_id_reverse(&id, from);
+}
+
 /**
  * pci_get_subsys - begin or continue searching for a PCI device by vendor/subvendor/device/subdevice id
  * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids
@@ -302,14 +341,8 @@ struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device,
 			       unsigned int ss_vendor, unsigned int ss_device,
 			       struct pci_dev *from)
 {
-	struct pci_device_id id = {
-		.vendor = vendor,
-		.device = device,
-		.subvendor = ss_vendor,
-		.subdevice = ss_device,
-	};
-
-	return pci_get_dev_by_id(&id, from);
+	return __pci_get_subsys(vendor, device, ss_vendor, ss_device, from,
+				PCI_SEARCH_FORWARD);
 }
 EXPORT_SYMBOL(pci_get_subsys);
 
@@ -334,6 +367,19 @@ struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device,
 }
 EXPORT_SYMBOL(pci_get_device);
 
+/*
+ * Same semantics as pci_get_device(), except walks the PCI device list
+ * in reverse discovery order.
+ */
+struct pci_dev *pci_get_device_reverse(unsigned int vendor,
+				       unsigned int device,
+				       struct pci_dev *from)
+{
+	return __pci_get_subsys(vendor, device, PCI_ANY_ID, PCI_ANY_ID, from,
+				PCI_SEARCH_REVERSE);
+}
+EXPORT_SYMBOL(pci_get_device_reverse);
+
 /**
  * pci_get_class - begin or continue searching for a PCI device by class
  * @class: search for a PCI device with this class designation
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index f5a56efd2bd6..99b1002b3e31 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -150,6 +150,9 @@ int bus_for_each_dev(const struct bus_type *bus, struct device *start,
 		     void *data, device_iter_t fn);
 struct device *bus_find_device(const struct bus_type *bus, struct device *start,
 			       const void *data, device_match_t match);
+struct device *bus_find_device_reverse(const struct bus_type *bus,
+				       struct device *start, const void *data,
+				       device_match_t match);
 /**
  * bus_find_device_by_name - device iterator for locating a particular device
  * of a specific name.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4402ca931124..b6a12a82be12 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -582,6 +582,8 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus);
 
 #define	to_pci_dev(n) container_of(n, struct pci_dev, dev)
 #define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
+#define for_each_pci_dev_reverse(d) \
+	while ((d = pci_get_device_reverse(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
 
 static inline int pci_channel_offline(struct pci_dev *pdev)
 {
@@ -1242,6 +1244,8 @@ u64 pci_get_dsn(struct pci_dev *dev);
 
 struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device,
 			       struct pci_dev *from);
+struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device,
+				       struct pci_dev *from);
 struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device,
 			       unsigned int ss_vendor, unsigned int ss_device,
 			       struct pci_dev *from);
@@ -1661,6 +1665,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 
 void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata);
+void pci_walk_bus_reverse(struct pci_bus *top,
+			  int (*cb)(struct pci_dev *, void *), void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 resource_size_t pcibios_window_alignment(struct pci_bus *bus,
@@ -2049,6 +2055,11 @@ static inline struct pci_dev *pci_get_device(unsigned int vendor,
 					     struct pci_dev *from)
 { return NULL; }
 
+static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor,
+						     unsigned int device,
+						     struct pci_dev *from)
+{ return NULL; }
+
 static inline struct pci_dev *pci_get_subsys(unsigned int vendor,
 					     unsigned int device,
 					     unsigned int ss_vendor,
-- 
cgit v1.2.3


From 3225f52cde56f46789a4972d3c54df8a4d75f022 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:56 -0700
Subject: PCI/TSM: Establish Secure Sessions and Link Encryption

The PCIe 7.0 specification, section 11, defines the Trusted Execution
Environment (TEE) Device Interface Security Protocol (TDISP).  This
protocol definition builds upon Component Measurement and Authentication
(CMA), and link Integrity and Data Encryption (IDE). It adds support for
assigning devices (PCI physical or virtual function) to a confidential VM
such that the assigned device is enabled to access guest private memory
protected by technologies like Intel TDX, AMD SEV-SNP, RISCV COVE, or ARM
CCA.

The "TSM" (TEE Security Manager) is a concept in the TDISP specification
of an agent that mediates between a "DSM" (Device Security Manager) and
system software in both a VMM and a confidential VM. A VMM uses TSM ABIs
to set up link security and assign devices. A confidential VM uses TSM
ABIs to transition an assigned device into the TDISP "RUN" state and
validate its configuration. From a Linux perspective the TSM abstracts
many of the details of TDISP, IDE, and CMA. Some of those details leak
through at times, but for the most part TDISP is an internal
implementation detail of the TSM.

CONFIG_PCI_TSM adds an "authenticated" attribute and "tsm/" subdirectory
to pci-sysfs. Consider that the TSM driver may itself be a PCI driver.
Userspace can watch for the arrival of a "TSM" device,
/sys/class/tsm/tsm0/uevent KOBJ_CHANGE, to know when the PCI core has
initialized TSM services.

The operations that can be executed against a PCI device are split into
two mutually exclusive operation sets, "Link" and "Security" (struct
pci_tsm_{link,security}_ops). The "Link" operations manage physical link
security properties and communication with the device's Device Security
Manager firmware. These are the host side operations in TDISP. The
"Security" operations coordinate the security state of the assigned
virtual device (TDI). These are the guest side operations in TDISP.

Only "link" (Secure Session and physical Link Encryption) operations are
defined at this stage. There are placeholders for the device security
(Trusted Computing Base entry / exit) operations.

The locking allows for multiple devices to be executing commands
simultaneously, with one outstanding command per device, and an rwsem
synchronizes the implementation relative to TSM registration/unregistration
events.

Thanks to Wu Hao for his work on an early draft of this support.

Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Link: https://patch.msgid.link/20251031212902.2256310-5-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-pci |  51 +++
 Documentation/driver-api/pci/index.rst  |   1 +
 Documentation/driver-api/pci/tsm.rst    |  21 ++
 MAINTAINERS                             |   4 +-
 drivers/pci/Kconfig                     |  15 +
 drivers/pci/Makefile                    |   1 +
 drivers/pci/doe.c                       |   2 -
 drivers/pci/pci-sysfs.c                 |   4 +
 drivers/pci/pci.h                       |  10 +
 drivers/pci/probe.c                     |   3 +
 drivers/pci/remove.c                    |   6 +
 drivers/pci/tsm.c                       | 643 ++++++++++++++++++++++++++++++++
 drivers/virt/coco/tsm-core.c            |  46 ++-
 include/linux/pci-doe.h                 |   4 +
 include/linux/pci-tsm.h                 | 157 ++++++++
 include/linux/pci.h                     |   3 +
 include/linux/tsm.h                     |   5 +-
 include/uapi/linux/pci_regs.h           |   1 +
 18 files changed, 971 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/driver-api/pci/tsm.rst
 create mode 100644 drivers/pci/tsm.c
 create mode 100644 include/linux/pci-tsm.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 92debe879ffb..6ffe02f854d6 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -621,3 +621,54 @@ Description:
 		number extended capability. The file is read only and due to
 		the possible sensitivity of accessible serial numbers, admin
 		only.
+
+What:		/sys/bus/pci/devices/.../tsm/
+Contact:	linux-coco@lists.linux.dev
+Description:
+		This directory only appears if a physical device function
+		supports authentication (PCIe CMA-SPDM), interface security
+		(PCIe TDISP), and is accepted for secure operation by the
+		platform TSM driver. This attribute directory appears
+		dynamically after the platform TSM driver loads. So, only after
+		the /sys/class/tsm/tsm0 device arrives can tools assume that
+		devices without a tsm/ attribute directory will never have one;
+		before that, the security capabilities of the device relative to
+		the platform TSM are unknown. See
+		Documentation/ABI/testing/sysfs-class-tsm.
+
+What:		/sys/bus/pci/devices/.../tsm/connect
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RW) Write the name of a TSM (TEE Security Manager) device from
+		/sys/class/tsm to this file to establish a connection with the
+		device.  This typically includes an SPDM (DMTF Security
+		Protocols and Data Models) session over PCIe DOE (Data Object
+		Exchange) and may also include PCIe IDE (Integrity and Data
+		Encryption) establishment. Reads from this attribute return the
+		name of the connected TSM or the empty string if not
+		connected. A TSM device signals its readiness to accept PCI
+		connection via a KOBJ_CHANGE event.
+
+What:		/sys/bus/pci/devices/.../tsm/disconnect
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(WO) Write the name of the TSM device that was specified
+		to 'connect' to tear down the connection.
+
+What:		/sys/bus/pci/devices/.../authenticated
+Contact:	linux-pci@vger.kernel.org
+Description:
+		When the device's tsm/ directory is present, device
+		authentication (PCIe CMA-SPDM) and link encryption (PCIe IDE)
+		are handled by the platform TSM (TEE Security Manager). When the
+		tsm/ directory is not present this attribute reflects only the
+		native CMA-SPDM authentication state with the kernel's
+		certificate store.
+
+		If the attribute is not present, it indicates that
+		authentication is unsupported by the device, or the TSM has no
+		available authentication methods for the device.
+
+		When present and the tsm/ attribute directory is present, the
+		authenticated attribute is an alias for the device 'connect'
+		state. See the 'tsm/connect' attribute for more details.
diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst
index a38e475cdbe3..9e1b801d0f74 100644
--- a/Documentation/driver-api/pci/index.rst
+++ b/Documentation/driver-api/pci/index.rst
@@ -10,6 +10,7 @@ The Linux PCI driver implementer's API guide
 
    pci
    p2pdma
+   tsm
 
 .. only::  subproject and html
 
diff --git a/Documentation/driver-api/pci/tsm.rst b/Documentation/driver-api/pci/tsm.rst
new file mode 100644
index 000000000000..232b92bec93f
--- /dev/null
+++ b/Documentation/driver-api/pci/tsm.rst
@@ -0,0 +1,21 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+========================================================
+PCI Trusted Execution Environment Security Manager (TSM)
+========================================================
+
+Subsystem Interfaces
+====================
+
+.. kernel-doc:: include/linux/pci-ide.h
+   :internal:
+
+.. kernel-doc:: drivers/pci/ide.c
+   :export:
+
+.. kernel-doc:: include/linux/pci-tsm.h
+   :internal:
+
+.. kernel-doc:: drivers/pci/tsm.c
+   :export:
diff --git a/MAINTAINERS b/MAINTAINERS
index b8c9929532ed..f1c8793bf03e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26118,8 +26118,10 @@ L:	linux-coco@lists.linux.dev
 S:	Maintained
 F:	Documentation/ABI/testing/configfs-tsm-report
 F:	Documentation/driver-api/coco/
+F:	Documentation/driver-api/pci/tsm.rst
+F:	drivers/pci/tsm.c
 F:	drivers/virt/coco/guest/
-F:	include/linux/tsm*.h
+F:	include/linux/*tsm*.h
 F:	samples/tsm-mr/
 
 TRUSTED SERVICES TEE DRIVER
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index b28423e2057f..00b0210e1f1d 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -125,6 +125,21 @@ config PCI_ATS
 config PCI_IDE
 	bool
 
+config PCI_TSM
+	bool "PCI TSM: Device security protocol support"
+	select PCI_IDE
+	select PCI_DOE
+	select TSM
+	help
+	  The TEE (Trusted Execution Environment) Device Interface
+	  Security Protocol (TDISP) defines a "TSM" as a platform agent
+	  that manages device authentication, link encryption, link
+	  integrity protection, and assignment of PCI device functions
+	  (virtual or physical) to confidential computing VMs that can
+	  access (DMA) guest private memory.
+
+	  Enable a platform TSM driver to use this capability.
+
 config PCI_DOE
 	bool "Enable PCI Data Object Exchange (DOE) support"
 	help
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 6612256fd37d..2c545f877062 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
 obj-$(CONFIG_PCI_DOE)		+= doe.o
 obj-$(CONFIG_PCI_IDE)		+= ide.o
+obj-$(CONFIG_PCI_TSM)		+= tsm.o
 obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
 obj-$(CONFIG_PCI_NPEM)		+= npem.o
 obj-$(CONFIG_PCIE_TPH)		+= tph.o
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index aae9a8a00406..62be9c8dbc52 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -24,8 +24,6 @@
 
 #include "pci.h"
 
-#define PCI_DOE_FEATURE_DISCOVERY 0
-
 /* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
 #define PCI_DOE_TIMEOUT HZ
 #define PCI_DOE_POLL_INTERVAL	(PCI_DOE_TIMEOUT / 128)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 9d6f74bd95f8..7f9237a926c2 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1868,6 +1868,10 @@ const struct attribute_group *pci_dev_attr_groups[] = {
 #endif
 #ifdef CONFIG_PCI_DOE
 	&pci_doe_sysfs_group,
+#endif
+#ifdef CONFIG_PCI_TSM
+	&pci_tsm_auth_attr_group,
+	&pci_tsm_attr_group,
 #endif
 	NULL,
 };
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 86ef13e7cece..6e4cc1c9aa58 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -619,6 +619,16 @@ void pci_ide_init(struct pci_dev *dev);
 static inline void pci_ide_init(struct pci_dev *dev) { }
 #endif
 
+#ifdef CONFIG_PCI_TSM
+void pci_tsm_init(struct pci_dev *pdev);
+void pci_tsm_destroy(struct pci_dev *pdev);
+extern const struct attribute_group pci_tsm_attr_group;
+extern const struct attribute_group pci_tsm_auth_attr_group;
+#else
+static inline void pci_tsm_init(struct pci_dev *pdev) { }
+static inline void pci_tsm_destroy(struct pci_dev *pdev) { }
+#endif
+
 /**
  * pci_dev_set_io_state - Set the new error state if possible.
  *
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4c55020f3ddf..d1467348c169 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2763,6 +2763,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	ret = device_add(&dev->dev);
 	WARN_ON(ret < 0);
 
+	/* Establish pdev->tsm for newly added devices (e.g. new SR-IOV VFs) */
+	pci_tsm_init(dev);
+
 	pci_npem_create(dev);
 
 	pci_doe_sysfs_init(dev);
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index ce5c25adef55..803391892c4a 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -57,6 +57,12 @@ static void pci_destroy_dev(struct pci_dev *dev)
 	pci_doe_sysfs_teardown(dev);
 	pci_npem_remove(dev);
 
+	/*
+	 * While device is in D0 drop the device from TSM link operations
+	 * including unbind and disconnect (IDE + SPDM teardown).
+	 */
+	pci_tsm_destroy(dev);
+
 	device_del(&dev->dev);
 
 	down_write(&pci_bus_sem);
diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c
new file mode 100644
index 000000000000..6a2849f77adc
--- /dev/null
+++ b/drivers/pci/tsm.c
@@ -0,0 +1,643 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Interface with platform TEE Security Manager (TSM) objects as defined by
+ * PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP)
+ *
+ * Copyright(c) 2024-2025 Intel Corporation. All rights reserved.
+ */
+
+#define dev_fmt(fmt) "PCI/TSM: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <linux/pci-tsm.h>
+#include <linux/sysfs.h>
+#include <linux/tsm.h>
+#include <linux/xarray.h>
+#include "pci.h"
+
+/*
+ * Provide a read/write lock against the init / exit of pdev tsm
+ * capabilities and arrival/departure of a TSM instance
+ */
+static DECLARE_RWSEM(pci_tsm_rwsem);
+
+/*
+ * Count of TSMs registered that support physical link operations vs device
+ * security state management.
+ */
+static int pci_tsm_link_count;
+static int pci_tsm_devsec_count;
+
+static const struct pci_tsm_ops *to_pci_tsm_ops(struct pci_tsm *tsm)
+{
+	return tsm->tsm_dev->pci_ops;
+}
+
+static inline bool is_dsm(struct pci_dev *pdev)
+{
+	return pdev->tsm && pdev->tsm->dsm_dev == pdev;
+}
+
+static inline bool has_tee(struct pci_dev *pdev)
+{
+	return pdev->devcap & PCI_EXP_DEVCAP_TEE;
+}
+
+/* Resolve any link 'struct pci_tsm' to the 'struct pci_tsm_pf0' of its DSM */
+static struct pci_tsm_pf0 *to_pci_tsm_pf0(struct pci_tsm *tsm)
+{
+	/*
+	 * Every "link" TSM context records the device hosting the DSM
+	 * interface in ->dsm_dev. The DSM's own ->tsm context is embedded in a
+	 * 'struct pci_tsm_pf0', so hop to the DSM and recover the wrapper.
+	 */
+	struct pci_dev *dsm = tsm->dsm_dev;
+
+	if (is_pci_tsm_pf0(dsm) && is_dsm(dsm))
+		return container_of(dsm->tsm, struct pci_tsm_pf0, base_tsm);
+
+	/* Only reached when @tsm does not lead back to a valid DSM */
+	pci_WARN_ONCE(tsm->pdev, 1, "invalid context object\n");
+	return NULL;
+}
+
+static void tsm_remove(struct pci_tsm *tsm)
+{
+	if (tsm) {
+		/* ->remove() frees @tsm, so latch the owner first */
+		struct pci_dev *owner = tsm->pdev;
+		const struct pci_tsm_ops *ops = to_pci_tsm_ops(tsm);
+
+		ops->remove(tsm);
+		owner->tsm = NULL;
+	}
+}
+DEFINE_FREE(tsm_remove, struct pci_tsm *, if (_T) tsm_remove(_T))
+
+static void pci_tsm_walk_fns(struct pci_dev *pdev,
+			     int (*cb)(struct pci_dev *pdev, void *data),
+			     void *data)
+{
+	/* Walk subordinate physical functions */
+	for (int i = 0; i < 8; i++) {
+		struct pci_dev *pf __free(pci_dev_put) = pci_get_slot(
+			pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i));
+
+		if (!pf)
+			continue;
+
+		/* on entry function 0 has already run @cb */
+		if (i > 0)
+			cb(pf, data);
+
+		/* walk virtual functions of each pf */
+		for (int j = 0; j < pci_num_vf(pf); j++) {
+			struct pci_dev *vf __free(pci_dev_put) =
+				pci_get_domain_bus_and_slot(
+					pci_domain_nr(pf->bus),
+					pci_iov_virtfn_bus(pf, j),
+					pci_iov_virtfn_devfn(pf, j));
+
+			if (!vf)
+				continue;
+
+			cb(vf, data);
+		}
+	}
+
+	/*
+	 * Walk downstream devices, assumes that an upstream DSM is
+	 * limited to downstream physical functions
+	 */
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev))
+		pci_walk_bus(pdev->subordinate, cb, data);
+}
+
+static void pci_tsm_walk_fns_reverse(struct pci_dev *pdev,
+				     int (*cb)(struct pci_dev *pdev,
+					       void *data),
+				     void *data)
+{
+	/* Reverse walk downstream devices */
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev))
+		pci_walk_bus_reverse(pdev->subordinate, cb, data);
+
+	/* Reverse walk subordinate physical functions */
+	for (int i = 7; i >= 0; i--) {
+		struct pci_dev *pf __free(pci_dev_put) = pci_get_slot(
+			pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i));
+
+		if (!pf)
+			continue;
+
+		/* reverse walk virtual functions */
+		for (int j = pci_num_vf(pf) - 1; j >= 0; j--) {
+			struct pci_dev *vf __free(pci_dev_put) =
+				pci_get_domain_bus_and_slot(
+					pci_domain_nr(pf->bus),
+					pci_iov_virtfn_bus(pf, j),
+					pci_iov_virtfn_devfn(pf, j));
+
+			if (!vf)
+				continue;
+			cb(vf, data);
+		}
+
+		/* on exit, caller will run @cb on function 0 */
+		if (i > 0)
+			cb(pf, data);
+	}
+}
+
+static int probe_fn(struct pci_dev *pdev, void *dsm)
+{
+	struct pci_dev *dsm_dev = dsm;
+	const struct pci_tsm_ops *ops = to_pci_tsm_ops(dsm_dev->tsm);
+
+	pdev->tsm = ops->probe(dsm_dev->tsm->tsm_dev, pdev);
+	pci_dbg(pdev, "setup TSM context: DSM: %s status: %s\n",
+		pci_name(dsm_dev), pdev->tsm ? "success" : "failed");
+	return 0;
+}
+
+static int pci_tsm_connect(struct pci_dev *pdev, struct tsm_dev *tsm_dev)
+{
+	int rc;
+	struct pci_tsm_pf0 *tsm_pf0;
+	const struct pci_tsm_ops *ops = tsm_dev->pci_ops;
+	struct pci_tsm *pci_tsm __free(tsm_remove) = ops->probe(tsm_dev, pdev);
+
+	/* connect() mutually exclusive with subfunction pci_tsm_init() */
+	lockdep_assert_held_write(&pci_tsm_rwsem);
+
+	if (!pci_tsm)
+		return -ENXIO;
+
+	pdev->tsm = pci_tsm;
+	tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
+
+	/* mutex_intr assumes connect() is always sysfs/user driven */
+	ACQUIRE(mutex_intr, lock)(&tsm_pf0->lock);
+	if ((rc = ACQUIRE_ERR(mutex_intr, &lock)))
+		return rc;
+
+	rc = ops->connect(pdev);
+	if (rc)
+		return rc;
+
+	pdev->tsm = no_free_ptr(pci_tsm);
+
+	/*
+	 * Now that the DSM is established, probe() all the potential
+	 * dependent functions. Failure to probe a function is not fatal
+	 * to connect(), it just disables subsequent security operations
+	 * for that function.
+	 *
+	 * Note this is done unconditionally, without regard to finding
+	 * PCI_EXP_DEVCAP_TEE on the dependent function, for robustness. The DSM
+	 * is the ultimate arbiter of security state relative to a given
+	 * interface id, and if it says it can manage TDISP state of a function,
+	 * let it.
+	 */
+	if (has_tee(pdev))
+		pci_tsm_walk_fns(pdev, probe_fn, pdev);
+	return 0;
+}
+
+static ssize_t connect_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int err;
+
+	/* Hold off connect/disconnect and TSM unregistration while reading */
+	ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem);
+	err = ACQUIRE_ERR(rwsem_read_intr, &lock);
+	if (err)
+		return err;
+
+	/* An empty line means no TSM context is attached to this device */
+	if (pdev->tsm)
+		return sysfs_emit(buf, "%s\n", dev_name(&pdev->tsm->tsm_dev->dev));
+	return sysfs_emit(buf, "\n");
+}
+
+/* Is @tsm_dev managing physical link / session properties... */
+static bool is_link_tsm(struct tsm_dev *tsm_dev)
+{
+	return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->link_ops.probe;
+}
+
+/* ...or is @tsm_dev managing device security state ? */
+static bool is_devsec_tsm(struct tsm_dev *tsm_dev)
+{
+	return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->devsec_ops.lock;
+}
+
+static ssize_t connect_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t len)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int rc, id;
+
+	rc = sscanf(buf, "tsm%d\n", &id);
+	if (rc != 1)
+		return -EINVAL;
+
+	ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock)))
+		return rc;
+
+	if (pdev->tsm)
+		return -EBUSY;
+
+	struct tsm_dev *tsm_dev __free(put_tsm_dev) = find_tsm_dev(id);
+	if (!is_link_tsm(tsm_dev))
+		return -ENXIO;
+
+	rc = pci_tsm_connect(pdev, tsm_dev);
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(connect);
+
+static int remove_fn(struct pci_dev *pdev, void *data)
+{
+	tsm_remove(pdev->tsm);
+	return 0;
+}
+
+static void __pci_tsm_disconnect(struct pci_dev *pdev)
+{
+	struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
+	const struct pci_tsm_ops *ops = to_pci_tsm_ops(pdev->tsm);
+
+	/* disconnect() mutually exclusive with subfunction pci_tsm_init() */
+	lockdep_assert_held_write(&pci_tsm_rwsem);
+
+	/*
+	 * disconnect() is uninterruptible as it may be called for device
+	 * teardown
+	 */
+	guard(mutex)(&tsm_pf0->lock);
+	pci_tsm_walk_fns_reverse(pdev, remove_fn, NULL);
+	ops->disconnect(pdev);
+}
+
+static void pci_tsm_disconnect(struct pci_dev *pdev)
+{
+	__pci_tsm_disconnect(pdev);
+	tsm_remove(pdev->tsm);
+}
+
+static ssize_t disconnect_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t len)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct tsm_dev *tsm_dev;
+	int rc;
+
+	ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock)))
+		return rc;
+
+	if (!pdev->tsm)
+		return -ENXIO;
+
+	tsm_dev = pdev->tsm->tsm_dev;
+	if (!sysfs_streq(buf, dev_name(&tsm_dev->dev)))
+		return -EINVAL;
+
+	pci_tsm_disconnect(pdev);
+	return len;
+}
+static DEVICE_ATTR_WO(disconnect);
+
+/* The 'authenticated' attribute is exclusive to the presence of a 'link' TSM */
+static bool pci_tsm_link_group_visible(struct kobject *kobj)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+
+	return pci_tsm_link_count && is_pci_tsm_pf0(pdev);
+}
+DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(pci_tsm_link);
+
+/*
+ * The 'tsm/' sysfs group is shared by 'link' and 'devsec' TSMs, so each TSM
+ * type specific attribute is filtered individually here.
+ */
+static umode_t pci_tsm_attr_visible(struct kobject *kobj,
+				    struct attribute *attr, int n)
+{
+	bool link_attr = attr == &dev_attr_connect.attr ||
+			 attr == &dev_attr_disconnect.attr;
+
+	/* connect / disconnect: PF0 only, and only while a 'link' TSM exists */
+	if (link_attr && pci_tsm_link_group_visible(kobj))
+		return attr->mode;
+	return 0;
+}
+
+static bool pci_tsm_group_visible(struct kobject *kobj)
+{
+	return pci_tsm_link_group_visible(kobj);
+}
+DEFINE_SYSFS_GROUP_VISIBLE(pci_tsm);
+
+static struct attribute *pci_tsm_attrs[] = {
+	&dev_attr_connect.attr,
+	&dev_attr_disconnect.attr,
+	NULL
+};
+
+const struct attribute_group pci_tsm_attr_group = {
+	.name = "tsm",
+	.attrs = pci_tsm_attrs,
+	.is_visible = SYSFS_GROUP_VISIBLE(pci_tsm),
+};
+
+static ssize_t authenticated_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	/*
+	 * When the SPDM session is established via a TSM, the 'authenticated'
+	 * state of the device is identical to the connect state.
+	 */
+	return connect_show(dev, attr, buf);
+}
+static DEVICE_ATTR_RO(authenticated);
+
+static struct attribute *pci_tsm_auth_attrs[] = {
+	&dev_attr_authenticated.attr,
+	NULL
+};
+
+const struct attribute_group pci_tsm_auth_attr_group = {
+	.attrs = pci_tsm_auth_attrs,
+	.is_visible = SYSFS_GROUP_VISIBLE(pci_tsm_link),
+};
+
+/*
+ * Look up physical function 0 of @pdev's device, TEE capable or not. The
+ * result carries a reference that the caller drops with pci_dev_put().
+ */
+static struct pci_dev *pf0_dev_get(struct pci_dev *pdev)
+{
+	struct pci_dev *physfn = pci_physfn(pdev);
+	unsigned int fn = PCI_FUNC(physfn->devfn);
+
+	if (fn)
+		return pci_get_slot(physfn->bus, physfn->devfn - fn);
+	return pci_dev_get(physfn);
+}
+
+/*
+ * Find the PCI Device instance that serves as the Device Security Manager (DSM)
+ * for @pdev. Note that no additional reference is held for the resulting device
+ * because that resulting object always has a registered lifetime
+ * greater-than-or-equal to that of the @pdev argument. This is by virtue of
+ * @pdev being a descendant of, or identical to, the returned DSM device.
+ */
+static struct pci_dev *find_dsm_dev(struct pci_dev *pdev)
+{
+	struct device *grandparent;
+	struct pci_dev *uport;
+
+	if (is_pci_tsm_pf0(pdev))
+		return pdev;
+
+	struct pci_dev *pf0 __free(pci_dev_put) = pf0_dev_get(pdev);
+	if (!pf0)
+		return NULL;
+
+	if (is_dsm(pf0))
+		return pf0;
+
+	/*
+	 * For cases where a switch may be hosting TDISP services on behalf of
+	 * downstream devices, check the first upstream port relative to this
+	 * endpoint.
+	 */
+	if (!pdev->dev.parent)
+		return NULL;
+	grandparent = pdev->dev.parent->parent;
+	if (!grandparent)
+		return NULL;
+	if (!dev_is_pci(grandparent))
+		return NULL;
+	uport = to_pci_dev(grandparent);
+	if (!pci_is_pcie(uport) ||
+	    pci_pcie_type(uport) != PCI_EXP_TYPE_UPSTREAM)
+		return NULL;
+
+	if (is_dsm(uport))
+		return uport;
+	return NULL;
+}
+
+/**
+ * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs
+ * @pdev: The PCI device
+ * @tsm: context to initialize
+ * @tsm_dev: Platform TEE Security Manager, initiator of security operations
+ */
+int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm,
+			     struct tsm_dev *tsm_dev)
+{
+	if (!is_link_tsm(tsm_dev))
+		return -EINVAL;
+
+	tsm->dsm_dev = find_dsm_dev(pdev);
+	if (!tsm->dsm_dev) {
+		pci_warn(pdev, "failed to find Device Security Manager\n");
+		return -ENXIO;
+	}
+	tsm->pdev = pdev;
+	tsm->tsm_dev = tsm_dev;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_tsm_link_constructor);
+
+/**
+ * pci_tsm_pf0_constructor() - common 'struct pci_tsm_pf0' (DSM) initialization
+ * @pdev: Physical Function 0 PCI device (as indicated by is_pci_tsm_pf0())
+ * @tsm: context to initialize
+ * @tsm_dev: Platform TEE Security Manager, initiator of security operations
+ */
+int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm,
+			    struct tsm_dev *tsm_dev)
+{
+	mutex_init(&tsm->lock);
+	tsm->doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_PCI_SIG,
+					   PCI_DOE_FEATURE_CMA);
+	if (!tsm->doe_mb) {
+		pci_warn(pdev, "TSM init failure, no CMA mailbox\n");
+		return -ENODEV;
+	}
+
+	return pci_tsm_link_constructor(pdev, &tsm->base_tsm, tsm_dev);
+}
+EXPORT_SYMBOL_GPL(pci_tsm_pf0_constructor);
+
+void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *pf0_tsm)
+{
+	mutex_destroy(&pf0_tsm->lock);
+}
+EXPORT_SYMBOL_GPL(pci_tsm_pf0_destructor);
+
+static void pf0_sysfs_enable(struct pci_dev *pdev)
+{
+	bool tee = has_tee(pdev);
+
+	pci_dbg(pdev, "Device Security Manager detected (%s%s%s)\n",
+		pdev->ide_cap ? "IDE" : "", pdev->ide_cap && tee ? " " : "",
+		tee ? "TEE" : "");
+
+	sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group);
+	sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group);
+}
+
+int pci_tsm_register(struct tsm_dev *tsm_dev)
+{
+	struct pci_dev *pdev = NULL;
+	bool link, devsec;
+
+	if (!tsm_dev)
+		return -EINVAL;
+
+	/* The TSM device must implement exactly one of link_ops or devsec_ops */
+	link = is_link_tsm(tsm_dev);
+	devsec = is_devsec_tsm(tsm_dev);
+	if (link == devsec)
+		return -EINVAL;
+
+	guard(rwsem_write)(&pci_tsm_rwsem);
+
+	if (devsec) {
+		pci_tsm_devsec_count++;
+	} else if (pci_tsm_link_count++ == 0) {
+		/* First 'link' TSM: update the sysfs groups of every PF0 */
+		for_each_pci_dev(pdev)
+			if (is_pci_tsm_pf0(pdev))
+				pf0_sysfs_enable(pdev);
+	}
+
+	return 0;
+}
+
+static void pci_tsm_fn_exit(struct pci_dev *pdev)
+{
+	/* TODO: unbind the fn */
+	tsm_remove(pdev->tsm);
+}
+
+/**
+ * __pci_tsm_destroy() - destroy the TSM context for @pdev
+ * @pdev: device to cleanup
+ * @tsm_dev: the TSM device being removed, or NULL if @pdev is being removed.
+ *
+ * At device removal or TSM unregistration all established context
+ * with the TSM is torn down. Additionally, if there are no more TSMs
+ * registered, the PCI tsm/ sysfs attributes are hidden.
+ */
+static void __pci_tsm_destroy(struct pci_dev *pdev, struct tsm_dev *tsm_dev)
+{
+	struct pci_tsm *tsm = pdev->tsm;
+
+	lockdep_assert_held_write(&pci_tsm_rwsem);
+
+	/*
+	 * First, handle the TSM removal case to shutdown @pdev sysfs, this is
+	 * skipped if the device itself is being removed since sysfs goes away
+	 * naturally at that point
+	 */
+	if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev) && !pci_tsm_link_count) {
+		sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group);
+		sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group);
+	}
+
+	/* Nothing else to do if this device never attached to the departing TSM */
+	if (!tsm)
+		return;
+
+	/* Now lookup the tsm_dev to destroy TSM context */
+	if (!tsm_dev)
+		tsm_dev = tsm->tsm_dev;
+	else if (tsm_dev != tsm->tsm_dev)
+		return;
+
+	if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev))
+		pci_tsm_disconnect(pdev);
+	else
+		pci_tsm_fn_exit(pdev);
+}
+
+void pci_tsm_destroy(struct pci_dev *pdev)
+{
+	guard(rwsem_write)(&pci_tsm_rwsem);
+	__pci_tsm_destroy(pdev, NULL);
+}
+
+void pci_tsm_init(struct pci_dev *pdev)
+{
+	struct pci_dev *dsm;
+
+	guard(rwsem_read)(&pci_tsm_rwsem);
+
+	/*
+	 * Subfunctions are probed either synchronously with connect(), or
+	 * later when the SR-IOV configuration changes or, unlikely, when
+	 * connect() raced initial bus scanning.
+	 */
+	if (pdev->tsm)
+		return;
+
+	/* Without a registered link TSM there is nothing to probe with */
+	if (!pci_tsm_link_count)
+		return;
+
+	/*
+	 * The only path to init a Device Security Manager capable device is
+	 * via connect(), so an unconnected DSM (no ->tsm) ends the setup.
+	 */
+	dsm = find_dsm_dev(pdev);
+	if (!dsm || !dsm->tsm)
+		return;
+
+	probe_fn(pdev, dsm);
+}
+
+void pci_tsm_unregister(struct tsm_dev *tsm_dev)
+{
+	struct pci_dev *pdev = NULL;
+
+	guard(rwsem_write)(&pci_tsm_rwsem);
+	/* Drop the counts first: __pci_tsm_destroy() keys sysfs off them */
+	pci_tsm_link_count -= is_link_tsm(tsm_dev);
+	pci_tsm_devsec_count -= is_devsec_tsm(tsm_dev);
+
+	for_each_pci_dev_reverse(pdev)
+		__pci_tsm_destroy(pdev, tsm_dev);
+}
+
+int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
+			 size_t req_sz, void *resp, size_t resp_sz)
+{
+	struct pci_tsm_pf0 *tsm;
+
+	if (!pdev->tsm || !is_pci_tsm_pf0(pdev))
+		return -ENXIO;
+	/* to_pci_tsm_pf0() warns and returns NULL for an invalid context */
+	tsm = to_pci_tsm_pf0(pdev->tsm);
+	if (!tsm || !tsm->doe_mb)
+		return -ENXIO;
+
+	return pci_doe(tsm->doe_mb, PCI_VENDOR_ID_PCI_SIG, type, req, req_sz,
+		       resp, resp_sz);
+}
+EXPORT_SYMBOL_GPL(pci_tsm_doe_transfer);
diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c
index 347507cc5e3f..0e705f3067a1 100644
--- a/drivers/virt/coco/tsm-core.c
+++ b/drivers/virt/coco/tsm-core.c
@@ -8,11 +8,29 @@
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/cleanup.h>
+#include <linux/pci-tsm.h>
 
 static struct class *tsm_class;
 static DECLARE_RWSEM(tsm_rwsem);
 static DEFINE_IDA(tsm_ida);
 
+static int match_id(struct device *dev, const void *data)
+{
+	const int *wanted = data;
+	struct tsm_dev *candidate = container_of(dev, struct tsm_dev, dev);
+
+	return candidate->id == *wanted;
+}
+
+struct tsm_dev *find_tsm_dev(int id)
+{
+	struct device *hit;
+
+	/* Callers own the reference taken on a match (see put_tsm_dev) */
+	hit = class_find_device(tsm_class, NULL, &id, match_id);
+	return hit ? container_of(hit, struct tsm_dev, dev) : NULL;
+}
+
 static struct tsm_dev *alloc_tsm_dev(struct device *parent)
 {
 	struct device *dev;
@@ -36,7 +54,29 @@ static struct tsm_dev *alloc_tsm_dev(struct device *parent)
 	return no_free_ptr(tsm_dev);
 }
 
-struct tsm_dev *tsm_register(struct device *parent)
+static struct tsm_dev *tsm_register_pci_or_reset(struct tsm_dev *tsm_dev,
+						 struct pci_tsm_ops *pci_ops)
+{
+	int rc;
+
+	/* No PCI operations supplied: skip PCI/TSM registration */
+	if (!pci_ops)
+		return tsm_dev;
+
+	tsm_dev->pci_ops = pci_ops;
+	rc = pci_tsm_register(tsm_dev);
+	if (!rc) {
+		/* Tell TSM userspace that PCI/TSM operations are possible */
+		kobject_uevent(&tsm_dev->dev.kobj, KOBJ_CHANGE);
+		return tsm_dev;
+	}
+
+	dev_err(tsm_dev->dev.parent, "PCI/TSM registration failure: %d\n", rc);
+	device_unregister(&tsm_dev->dev);
+	return ERR_PTR(rc);
+}
+
+struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *pci_ops)
 {
 	struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent);
 	struct device *dev;
@@ -54,12 +94,14 @@ struct tsm_dev *tsm_register(struct device *parent)
 	if (rc)
 		return ERR_PTR(rc);
 
-	return no_free_ptr(tsm_dev);
+	return tsm_register_pci_or_reset(no_free_ptr(tsm_dev), pci_ops);
 }
 EXPORT_SYMBOL_GPL(tsm_register);
 
 void tsm_unregister(struct tsm_dev *tsm_dev)
 {
+	if (tsm_dev->pci_ops)
+		pci_tsm_unregister(tsm_dev);
 	device_unregister(&tsm_dev->dev);
 }
 EXPORT_SYMBOL_GPL(tsm_unregister);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index 1f14aed4354b..bd4346a7c4e7 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -15,6 +15,10 @@
 
 struct pci_doe_mb;
 
+#define PCI_DOE_FEATURE_DISCOVERY 0
+#define PCI_DOE_FEATURE_CMA 1
+#define PCI_DOE_FEATURE_SSESSION 2
+
 struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
 					u8 type);
 
diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h
new file mode 100644
index 000000000000..e921d30f9b6c
--- /dev/null
+++ b/include/linux/pci-tsm.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PCI_TSM_H
+#define __PCI_TSM_H
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+struct pci_tsm;
+struct tsm_dev;
+
+/*
+ * struct pci_tsm_ops - manage confidential links and security state
+ * @link_ops: Coordinate PCIe SPDM and IDE establishment via a platform TSM.
+ *	      Provide a secure session transport for TDISP state management
+ *	      (typically bare metal physical function operations).
+ * @devsec_ops: Lock, unlock, and interrogate the security state of the
+ *		function via the platform TSM (typically virtual function
+ *		operations).
+ *
+ * These operations are mutually exclusive: either a tsm_dev instance
+ * manages physical link properties or it manages function security
+ * states like TDISP lock/unlock.
+ */
+struct pci_tsm_ops {
+	/*
+	 * struct pci_tsm_link_ops - Manage physical link and the TSM/DSM session
+	 * @probe: establish context with the TSM (allocate / wrap 'struct
+	 *	   pci_tsm') for follow-on link operations
+	 * @remove: destroy link operations context
+	 * @connect: establish / validate a secure connection (e.g. IDE)
+	 *	     with the device
+	 * @disconnect: teardown the secure link
+	 *
+	 * Context: @probe, @remove, @connect, and @disconnect run under
+	 * pci_tsm_rwsem held for write to sync with TSM unregistration and
+	 * mutual exclusion of @connect and @disconnect. @connect and
+	 * @disconnect additionally run under the DSM lock (struct
+	 * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions.
+	 */
+	struct_group_tagged(pci_tsm_link_ops, link_ops,
+		struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev,
+					 struct pci_dev *pdev);
+		void (*remove)(struct pci_tsm *tsm);
+		int (*connect)(struct pci_dev *pdev);
+		void (*disconnect)(struct pci_dev *pdev);
+	);
+
+	/*
+	 * struct pci_tsm_devsec_ops - Manage the security state of the function
+	 * @lock: establish context with the TSM (allocate / wrap 'struct
+	 *	  pci_tsm') for follow-on security state transitions from the
+	 *	  LOCKED state
+	 * @unlock: destroy TSM context and return device to UNLOCKED state
+	 *
+	 * Context: @lock and @unlock run under pci_tsm_rwsem held for write to
+	 * sync with TSM unregistration and each other
+	 */
+	struct_group_tagged(pci_tsm_devsec_ops, devsec_ops,
+		struct pci_tsm *(*lock)(struct tsm_dev *tsm_dev,
+					struct pci_dev *pdev);
+		void (*unlock)(struct pci_tsm *tsm);
+	);
+};
+
+/**
+ * struct pci_tsm - Core TSM context for a given PCIe endpoint
+ * @pdev: Back ref to device function, distinguishes type of pci_tsm context
+ * @dsm_dev: PCI Device Security Manager for link operations on @pdev
+ * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device
+ *	     Function Security operations
+ *
+ * This structure is wrapped by low level TSM driver data and returned by
+ * probe()/lock(), it is freed by the corresponding remove()/unlock().
+ *
+ * For link operations it serves to cache the association between a Device
+ * Security Manager (DSM) and the functions that manager can assign to a TVM.
+ * That can be "self", for assigning function0 of a TEE I/O device, a
+ * sub-function (SR-IOV virtual function, or non-function0
+ * multifunction-device), or a downstream endpoint (PCIe upstream switch-port as
+ * DSM).
+ */
+struct pci_tsm {
+	struct pci_dev *pdev;
+	struct pci_dev *dsm_dev;
+	struct tsm_dev *tsm_dev;
+};
+
+/**
+ * struct pci_tsm_pf0 - Physical Function 0 TDISP link context
+ * @base_tsm: generic core "tsm" context
+ * @lock: mutual exclusion for pci_tsm_ops invocation
+ * @doe_mb: PCIe Data Object Exchange mailbox
+ */
+struct pci_tsm_pf0 {
+	struct pci_tsm base_tsm;
+	struct mutex lock;
+	struct pci_doe_mb *doe_mb;
+};
+
+/* physical function0 and capable of 'connect' */
+static inline bool is_pci_tsm_pf0(struct pci_dev *pdev)
+{
+	if (!pdev)
+		return false;
+
+	/* Conventional PCI devices and SR-IOV virtual functions never qualify */
+	if (!pci_is_pcie(pdev) || pdev->is_virtfn)
+		return false;
+
+	/*
+	 * A Device Security Manager (DSM) on function0 of an Endpoint may
+	 * coordinate TDISP requests for the other functions (physical or
+	 * virtual) of that device, and an Upstream Port DSM may accept TDISP
+	 * requests for the Endpoints downstream of the switch. No other
+	 * port type is considered.
+	 */
+	switch (pci_pcie_type(pdev)) {
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_RC_END:
+		break;
+	default:
+		return false;
+	}
+
+	/* 'connect' needs IDE and / or TEE I/O (TDISP) support */
+	if (!pdev->ide_cap && !(pdev->devcap & PCI_EXP_DEVCAP_TEE))
+		return false;
+
+	return PCI_FUNC(pdev->devfn) == 0;
+}
+
+#ifdef CONFIG_PCI_TSM
+int pci_tsm_register(struct tsm_dev *tsm_dev);
+void pci_tsm_unregister(struct tsm_dev *tsm_dev);
+int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm,
+			     struct tsm_dev *tsm_dev);
+int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm,
+			    struct tsm_dev *tsm_dev);
+void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm);
+int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
+			 size_t req_sz, void *resp, size_t resp_sz);
+#else
+static inline int pci_tsm_register(struct tsm_dev *tsm_dev)
+{
+	return 0;
+}
+static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev)
+{
+}
+static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type,
+				       const void *req, size_t req_sz,
+				       void *resp, size_t resp_sz)
+{
+	return -ENXIO;
+}
+#endif
+#endif /*__PCI_TSM_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b6a12a82be12..2f9c0cb6a50a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -546,6 +546,9 @@ struct pci_dev {
 	u8		nr_link_ide;	/* Link Stream count (Selective Stream offset) */
 	unsigned int	ide_cfg:1;	/* Config cycles over IDE */
 	unsigned int	ide_tee_limit:1; /* Disallow T=0 traffic over IDE */
+#endif
+#ifdef CONFIG_PCI_TSM
+	struct pci_tsm *tsm;		/* TSM operation state */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
 	u8		supported_speeds; /* Supported Link Speeds Vector */
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
index cd97c63ffa32..22e05b2aac69 100644
--- a/include/linux/tsm.h
+++ b/include/linux/tsm.h
@@ -108,9 +108,11 @@ struct tsm_report_ops {
 	bool (*report_bin_attr_visible)(int n);
 };
 
+struct pci_tsm_ops;
 struct tsm_dev {
 	struct device dev;
 	int id;
+	const struct pci_tsm_ops *pci_ops;
 };
 
 DEFINE_FREE(put_tsm_dev, struct tsm_dev *,
@@ -118,6 +120,7 @@ DEFINE_FREE(put_tsm_dev, struct tsm_dev *,
 
 int tsm_report_register(const struct tsm_report_ops *ops, void *priv);
 int tsm_report_unregister(const struct tsm_report_ops *ops);
-struct tsm_dev *tsm_register(struct device *parent);
+struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops);
 void tsm_unregister(struct tsm_dev *tsm_dev);
+struct tsm_dev *find_tsm_dev(int id);
 #endif /* __TSM_H */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 05bd22d9e352..f2759c1097bc 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -503,6 +503,7 @@
 #define  PCI_EXP_DEVCAP_PWR_VAL	0x03fc0000 /* Slot Power Limit Value */
 #define  PCI_EXP_DEVCAP_PWR_SCL	0x0c000000 /* Slot Power Limit Scale */
 #define  PCI_EXP_DEVCAP_FLR     0x10000000 /* Function Level Reset */
+#define  PCI_EXP_DEVCAP_TEE     0x40000000 /* TEE I/O (TDISP) Support */
 #define PCI_EXP_DEVCTL		0x08	/* Device Control */
 #define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
 #define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
-- 
cgit v1.2.3


From c0c1262fbfbafe943dbccd5f97b500b72dbd2205 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:57 -0700
Subject: PCI: Add PCIe Device 3 Extended Capability enumeration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe r7.0 Section 7.7.9 Device 3 Extended Capability Structure, defines the
canonical location for determining the Flit Mode of a device. This status
is a dependency for PCIe IDE enabling. Add a new fm_enabled flag to 'struct
pci_dev'.

Cc: Lukas Wunner <lukas@wunner.de>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-6-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/probe.c           | 12 ++++++++++++
 include/linux/pci.h           |  1 +
 include/uapi/linux/pci_regs.h |  7 +++++++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index d1467348c169..3b54f1720be5 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2283,6 +2283,17 @@ int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
 	return 0;
 }
 
+static void pci_dev3_init(struct pci_dev *pdev)
+{
+	u16 cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DEV3);
+	u32 val = 0;
+
+	/* No Device 3 capability, or its status register could not be read */
+	if (!cap || pci_read_config_dword(pdev, cap + PCI_DEV3_STA, &val))
+		return;
+	pdev->fm_enabled = !!(val & PCI_DEV3_STA_SEGMENT);
+}
+
 /**
  * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable
  * @dev: PCI device to query
@@ -2667,6 +2678,7 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_doe_init(dev);		/* Data Object Exchange */
 	pci_tph_init(dev);		/* TLP Processing Hints */
 	pci_rebar_init(dev);		/* Resizable BAR */
+	pci_dev3_init(dev);		/* Device 3 capabilities */
 	pci_ide_init(dev);		/* Link Integrity and Data Encryption */
 
 	pcie_report_downtraining(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2f9c0cb6a50a..ea94799c81b0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -450,6 +450,7 @@ struct pci_dev {
 	unsigned int	pasid_enabled:1;	/* Process Address Space ID */
 	unsigned int	pri_enabled:1;		/* Page Request Interface */
 	unsigned int	tph_enabled:1;		/* TLP Processing Hints */
+	unsigned int	fm_enabled:1;		/* Flit Mode (segment captured) */
 	unsigned int	is_managed:1;		/* Managed via devres */
 	unsigned int	is_msi_managed:1;	/* MSI release via devres installed */
 	unsigned int	needs_freset:1;		/* Requires fundamental reset */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index f2759c1097bc..3add74ae2594 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -755,6 +755,7 @@
 #define PCI_EXT_CAP_ID_NPEM	0x29	/* Native PCIe Enclosure Management */
 #define PCI_EXT_CAP_ID_PL_32GT  0x2A    /* Physical Layer 32.0 GT/s */
 #define PCI_EXT_CAP_ID_DOE	0x2E	/* Data Object Exchange */
+#define PCI_EXT_CAP_ID_DEV3	0x2F	/* Device 3 Capability/Control/Status */
 #define PCI_EXT_CAP_ID_IDE	0x30    /* Integrity and Data Encryption */
 #define PCI_EXT_CAP_ID_PL_64GT	0x31	/* Physical Layer 64.0 GT/s */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_64GT
@@ -1246,6 +1247,12 @@
 /* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE
 
+/* Device 3 Extended Capability */
+#define PCI_DEV3_CAP		0x04	/* Device 3 Capabilities Register */
+#define PCI_DEV3_CTL		0x08	/* Device 3 Control Register */
+#define PCI_DEV3_STA		0x0c	/* Device 3 Status Register */
+#define  PCI_DEV3_STA_SEGMENT	0x8	/* Segment Captured (end-to-end flit-mode detected) */
+
 /* Compute Express Link (CXL r3.1, sec 8.1.5) */
 #define PCI_DVSEC_CXL_PORT				3
 #define PCI_DVSEC_CXL_PORT_CTL				0x0c
-- 
cgit v1.2.3


From 1e4d2ff3ae450dab37b5b5726c3f7df3e60d6e89 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:28:59 -0700
Subject: PCI/IDE: Add IDE establishment helpers

There are two components to establishing an encrypted link, provisioning
the stream in Partner Port config-space, and programming the keys into
the link layer via IDE_KM (IDE Key Management). This new library,
drivers/pci/ide.c, enables the former. IDE_KM, via a TSM low-level
driver, is saved for later.

With the platform TSM implementations of SEV-TIO and TDX Connect in mind
this library abstracts small differences in those implementations. For
example, TDX Connect handles Root Port register setup while SEV-TIO
expects System Software to update the Root Port registers. This is the
rationale for fine-grained 'setup' + 'enable' verbs.

The other design detail for TSM-coordinated IDE establishment is that
the TSM may manage allocation of Stream IDs, this is why the Stream ID
value is passed in to pci_ide_stream_setup().

The flow is:

pci_ide_stream_alloc():
    Allocate a Selective IDE Stream Register Block in each Partner Port
    (Endpoint + Root Port), and reserve a host bridge / platform stream
    slot. Gather Partner Port specific stream settings like Requester ID.

pci_ide_stream_register():
    Publish the stream in sysfs after allocating a Stream ID. In the TSM
    case the TSM allocates the Stream ID for the Partner Port pair.

pci_ide_stream_setup():
    Program the stream settings to a Partner Port. Caller is responsible
    for optionally calling this for the Root Port as well if the TSM
    implementation requires it.

pci_ide_stream_enable():
    Enable the stream after IDE_KM.

In support of system administrators auditing where platform, Root Port,
and Endpoint IDE stream resources are being spent, the allocated stream
is reflected as a symlink from the host bridge to the endpoint with the
name:

    stream%d.%d.%d

Where the tuple of integers reflects the allocated platform, Root Port,
and Endpoint stream index (Selective IDE Stream Register Block) values.

Thanks to Wu Hao for a draft implementation of this infrastructure.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-8-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .../ABI/testing/sysfs-devices-pci-host-bridge      |  14 +
 drivers/pci/ide.c                                  | 428 +++++++++++++++++++++
 drivers/pci/pci.h                                  |   2 +
 drivers/pci/probe.c                                |   1 +
 include/linux/pci-ide.h                            |  78 ++++
 include/linux/pci.h                                |   6 +
 6 files changed, 529 insertions(+)
 create mode 100644 include/linux/pci-ide.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
index 8c3a652799f1..2c66e5bb2bf8 100644
--- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
+++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
@@ -17,3 +17,17 @@ Description:
 		PNP0A08 (/sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A08:00). See
 		/sys/devices/pciDDDD:BB entry for details about the DDDD:BB
 		format.
+
+What:		pciDDDD:BB/streamH.R.E
+Contact:	linux-pci@vger.kernel.org
+Description:
+		(RO) When a platform has established a secure connection, PCIe
+		IDE, between two Partner Ports, this symlink appears. A stream
+		consumes a Stream ID slot in each of the Host bridge (H), Root
+		Port (R) and Endpoint (E).  The link points to the Endpoint PCI
+		device in the Selective IDE Stream pairing. Specifically, "R"
+		and "E" represent the assigned Selective IDE Stream Register
+		Block in the Root Port and Endpoint, and "H" represents a
+		platform specific pool of stream resources shared by the Root
+		Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for
+		details about the DDDD:BB format.
diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
index 26866edf91b4..7643840738fe 100644
--- a/drivers/pci/ide.c
+++ b/drivers/pci/ide.c
@@ -5,8 +5,12 @@
 
 #define dev_fmt(fmt) "PCI/IDE: " fmt
 #include <linux/bitfield.h>
+#include <linux/bitops.h>
 #include <linux/pci.h>
+#include <linux/pci-ide.h>
 #include <linux/pci_regs.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
 
 #include "pci.h"
 
@@ -23,12 +27,25 @@ static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index,
 	return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem);
 }
 
+static int sel_ide_offset(struct pci_dev *pdev,
+			  struct pci_ide_partner *settings)
+{
+	return __sel_ide_offset(pdev->ide_cap, pdev->nr_link_ide,
+				settings->stream_index, pdev->nr_ide_mem);
+}
+
 void pci_ide_init(struct pci_dev *pdev)
 {
 	u16 nr_link_ide, nr_ide_mem, nr_streams;
 	u16 ide_cap;
 	u32 val;
 
+	/*
+	 * Unconditionally init so that ida idle state is consistent with
+	 * pdev->ide_cap.
+	 */
+	ida_init(&pdev->ide_stream_ida);
+
 	if (!pci_is_pcie(pdev))
 		return;
 
@@ -84,5 +101,416 @@ void pci_ide_init(struct pci_dev *pdev)
 
 	pdev->ide_cap = ide_cap;
 	pdev->nr_link_ide = nr_link_ide;
+	pdev->nr_sel_ide = nr_streams;
 	pdev->nr_ide_mem = nr_ide_mem;
 }
+
+struct stream_index {
+	struct ida *ida;
+	u8 stream_index;
+};
+
+static void free_stream_index(struct stream_index *stream)
+{
+	ida_free(stream->ida, stream->stream_index);
+}
+
+DEFINE_FREE(free_stream, struct stream_index *, if (_T) free_stream_index(_T))
+static struct stream_index *alloc_stream_index(struct ida *ida, u16 max,
+					       struct stream_index *stream)
+{
+	int id;
+
+	if (!max)
+		return NULL;
+
+	id = ida_alloc_max(ida, max - 1, GFP_KERNEL);
+	if (id < 0)
+		return NULL;
+
+	*stream = (struct stream_index) {
+		.ida = ida,
+		.stream_index = id,
+	};
+	return stream;
+}
+
+/**
+ * pci_ide_stream_alloc() - Reserve stream indices and probe for settings
+ * @pdev: IDE capable PCIe Endpoint Physical Function
+ *
+ * Retrieve the Requester ID range of @pdev for programming its Root
+ * Port IDE RID Association registers, and conversely retrieve the
+ * Requester ID of the Root Port for programming @pdev's IDE RID
+ * Association registers.
+ *
+ * Allocate a Selective IDE Stream Register Block instance per port, and
+ * a platform stream resource from the associated host bridge.
+ *
+ * Return: new descriptor to be released with pci_ide_stream_free(), or
+ * NULL if @pdev is not an IDE capable Endpoint or an allocation fails.
+ */
+struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev)
+{
+	/* EP, RP, + HB Stream allocation */
+	struct stream_index __stream[PCI_IDE_HB + 1];
+	struct pci_host_bridge *hb;
+	struct pci_dev *rp;
+	int num_vf, rid_end;
+
+	if (!pci_is_pcie(pdev))
+		return NULL;
+
+	if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT)
+		return NULL;
+
+	if (!pdev->ide_cap)
+		return NULL;
+
+	struct pci_ide *ide __free(kfree) = kzalloc(sizeof(*ide), GFP_KERNEL);
+	if (!ide)
+		return NULL;
+
+	hb = pci_find_host_bridge(pdev->bus);
+	struct stream_index *hb_stream __free(free_stream) = alloc_stream_index(
+		&hb->ide_stream_ida, hb->nr_ide_streams, &__stream[PCI_IDE_HB]);
+	if (!hb_stream)
+		return NULL;
+
+	rp = pcie_find_root_port(pdev);
+	struct stream_index *rp_stream __free(free_stream) = alloc_stream_index(
+		&rp->ide_stream_ida, rp->nr_sel_ide, &__stream[PCI_IDE_RP]);
+	if (!rp_stream)
+		return NULL;
+
+	struct stream_index *ep_stream __free(free_stream) = alloc_stream_index(
+		&pdev->ide_stream_ida, pdev->nr_sel_ide, &__stream[PCI_IDE_EP]);
+	if (!ep_stream)
+		return NULL;
+
+	/* SR-IOV: cover all VFs; VF ids are 0-based, so the last is num_vf - 1 */
+	num_vf = pci_num_vf(pdev);
+	if (num_vf)
+		rid_end = PCI_DEVID(pci_iov_virtfn_bus(pdev, num_vf - 1),
+				    pci_iov_virtfn_devfn(pdev, num_vf - 1));
+	else
+		rid_end = pci_dev_id(pdev);
+
+	*ide = (struct pci_ide) {
+		.pdev = pdev,
+		.partner = {
+			[PCI_IDE_EP] = {
+				.rid_start = pci_dev_id(rp),
+				.rid_end = pci_dev_id(rp),
+				.stream_index = no_free_ptr(ep_stream)->stream_index,
+			},
+			[PCI_IDE_RP] = {
+				.rid_start = pci_dev_id(pdev),
+				.rid_end = rid_end,
+				.stream_index = no_free_ptr(rp_stream)->stream_index,
+			},
+		},
+		.host_bridge_stream = no_free_ptr(hb_stream)->stream_index,
+		.stream_id = -1,
+	};
+
+	return_ptr(ide);
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_alloc);
+
+/**
+ * pci_ide_stream_free() - unwind pci_ide_stream_alloc()
+ * @ide: idle IDE settings descriptor
+ *
+ * Free all of the stream index (register block) allocations acquired by
+ * pci_ide_stream_alloc(). The stream represented by @ide is assumed to
+ * be unregistered and not instantiated in any device.
+ */
+void pci_ide_stream_free(struct pci_ide *ide)
+{
+	struct pci_dev *pdev = ide->pdev;
+	struct pci_dev *rp = pcie_find_root_port(pdev);
+	struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus);
+
+	ida_free(&pdev->ide_stream_ida, ide->partner[PCI_IDE_EP].stream_index);
+	ida_free(&rp->ide_stream_ida, ide->partner[PCI_IDE_RP].stream_index);
+	ida_free(&hb->ide_stream_ida, ide->host_bridge_stream);
+	kfree(ide);
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_free);
+
+/**
+ * pci_ide_stream_release() - unwind and release an @ide context
+ * @ide: partially or fully registered IDE settings descriptor
+ *
+ * In support of automatic cleanup of IDE setup routines perform IDE
+ * teardown in expected reverse order of setup and with respect to which
+ * aspects of IDE setup have successfully completed.
+ *
+ * Be careful that setup order mirrors this shutdown order. Otherwise,
+ * open code releasing the IDE context.
+ */
+void pci_ide_stream_release(struct pci_ide *ide)
+{
+	struct pci_dev *pdev = ide->pdev;
+	struct pci_dev *rp = pcie_find_root_port(pdev);
+
+	if (ide->partner[PCI_IDE_RP].enable)
+		pci_ide_stream_disable(rp, ide);
+
+	if (ide->partner[PCI_IDE_EP].enable)
+		pci_ide_stream_disable(pdev, ide);
+
+	if (ide->partner[PCI_IDE_RP].setup)
+		pci_ide_stream_teardown(rp, ide);
+
+	if (ide->partner[PCI_IDE_EP].setup)
+		pci_ide_stream_teardown(pdev, ide);
+
+	if (ide->name)
+		pci_ide_stream_unregister(ide);
+
+	pci_ide_stream_free(ide);
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_release);
+
+/**
+ * pci_ide_stream_register() - Prepare to activate an IDE Stream
+ * @ide: IDE settings descriptor
+ *
+ * After a Stream ID has been acquired for @ide, record the presence of
+ * the stream in sysfs. The expectation is that @ide is immutable while
+ * registered.
+ */
+int pci_ide_stream_register(struct pci_ide *ide)
+{
+	struct pci_dev *pdev = ide->pdev;
+	struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus);
+	u8 ep_stream, rp_stream;
+	int rc;
+
+	if (ide->stream_id < 0 || ide->stream_id > U8_MAX) {
+		pci_err(pdev, "Setup fail: Invalid Stream ID: %d\n", ide->stream_id);
+		return -ENXIO;
+	}
+
+	ep_stream = ide->partner[PCI_IDE_EP].stream_index;
+	rp_stream = ide->partner[PCI_IDE_RP].stream_index;
+	const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d",
+						   ide->host_bridge_stream,
+						   rp_stream, ep_stream);
+	if (!name)
+		return -ENOMEM;
+
+	rc = sysfs_create_link(&hb->dev.kobj, &pdev->dev.kobj, name);
+	if (rc)
+		return rc;
+
+	ide->name = no_free_ptr(name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_register);
+
+/**
+ * pci_ide_stream_unregister() - unwind pci_ide_stream_register()
+ * @ide: idle IDE settings descriptor
+ *
+ * In preparation for freeing @ide, remove sysfs enumeration for the
+ * stream.
+ */
+void pci_ide_stream_unregister(struct pci_ide *ide)
+{
+	struct pci_dev *pdev = ide->pdev;
+	struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus);
+
+	sysfs_remove_link(&hb->dev.kobj, ide->name);
+	kfree(ide->name);
+	ide->name = NULL;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_unregister);
+
+static int pci_ide_domain(struct pci_dev *pdev)
+{
+	if (pdev->fm_enabled)
+		return pci_domain_nr(pdev->bus);
+	return 0;
+}
+
+struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide)
+{
+	if (!pci_is_pcie(pdev)) {
+		pci_warn_once(pdev, "not a PCIe device\n");
+		return NULL;
+	}
+
+	switch (pci_pcie_type(pdev)) {
+	case PCI_EXP_TYPE_ENDPOINT:
+		if (pdev != ide->pdev) {
+			pci_warn_once(pdev, "setup expected Endpoint: %s\n", pci_name(ide->pdev));
+			return NULL;
+		}
+		return &ide->partner[PCI_IDE_EP];
+	case PCI_EXP_TYPE_ROOT_PORT: {
+		struct pci_dev *rp = pcie_find_root_port(ide->pdev);
+
+		if (pdev != rp) {
+			pci_warn_once(pdev, "setup expected Root Port: %s\n",
+				      pci_name(rp));
+			return NULL;
+		}
+		return &ide->partner[PCI_IDE_RP];
+	}
+	default:
+		pci_warn_once(pdev, "invalid device type\n");
+		return NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(pci_ide_to_settings);
+
+static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide,
+			    struct pci_ide_partner *settings, int pos,
+			    bool enable)
+{
+	u32 val = FIELD_PREP(PCI_IDE_SEL_CTL_ID, ide->stream_id) |
+		  FIELD_PREP(PCI_IDE_SEL_CTL_DEFAULT, settings->default_stream) |
+		  FIELD_PREP(PCI_IDE_SEL_CTL_CFG_EN, pdev->ide_cfg) |
+		  FIELD_PREP(PCI_IDE_SEL_CTL_TEE_LIMITED, pdev->ide_tee_limit) |
+		  FIELD_PREP(PCI_IDE_SEL_CTL_EN, enable);
+
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val);
+}
+
+/**
+ * pci_ide_stream_setup() - program settings to Selective IDE Stream registers
+ * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
+ * @ide: registered IDE settings descriptor
+ *
+ * When @pdev is a PCI_EXP_TYPE_ENDPOINT then the PCI_IDE_EP partner
+ * settings are written to @pdev's Selective IDE Stream register block,
+ * and when @pdev is a PCI_EXP_TYPE_ROOT_PORT, the PCI_IDE_RP settings
+ * are selected.
+ */
+void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide)
+{
+	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	int pos;
+	u32 val;
+
+	if (!settings)
+		return;
+
+	pos = sel_ide_offset(pdev, settings);
+
+	val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end);
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val);
+
+	val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) |
+	      FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) |
+	      FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev));
+
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val);
+
+	/*
+	 * Setup control register early for devices that expect
+	 * stream_id is set during key programming.
+	 */
+	set_ide_sel_ctl(pdev, ide, settings, pos, false);
+	settings->setup = 1;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_setup);
+
+/**
+ * pci_ide_stream_teardown() - disable the stream and clear all settings
+ * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
+ * @ide: registered IDE settings descriptor
+ *
+ * For stream destruction, zero all registers that may have been written
+ * by pci_ide_stream_setup(). Consider pci_ide_stream_disable() to leave
+ * settings in place while temporarily disabling the stream.
+ */
+void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide)
+{
+	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	int pos;
+
+	if (!settings)
+		return;
+
+	pos = sel_ide_offset(pdev, settings);
+
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0);
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0);
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0);
+	settings->setup = 0;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_teardown);
+
+/**
+ * pci_ide_stream_enable() - enable a Selective IDE Stream
+ * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
+ * @ide: registered and setup IDE settings descriptor
+ *
+ * Activate the stream by writing to the Selective IDE Stream Control
+ * Register.
+ *
+ * Return: 0 if the stream successfully entered the "secure" state, and -EINVAL
+ * if @ide is invalid, and -ENXIO if the stream fails to enter the secure state.
+ *
+ * Note that the state may go "insecure" at any point after returning 0, but
+ * those events are equivalent to a "link down" event and handled via
+ * asynchronous error reporting.
+ *
+ * Caller is responsible to clear the enable bit in the -ENXIO case.
+ */
+int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide)
+{
+	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	int pos;
+	u32 val;
+
+	if (!settings)
+		return -EINVAL;
+
+	pos = sel_ide_offset(pdev, settings);
+
+	set_ide_sel_ctl(pdev, ide, settings, pos, true);
+	settings->enable = 1;
+
+	pci_read_config_dword(pdev, pos + PCI_IDE_SEL_STS, &val);
+	if (FIELD_GET(PCI_IDE_SEL_STS_STATE, val) !=
+	    PCI_IDE_SEL_STS_STATE_SECURE)
+		return -ENXIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_enable);
+
+/**
+ * pci_ide_stream_disable() - disable a Selective IDE Stream
+ * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
+ * @ide: registered and setup IDE settings descriptor
+ *
+ * Clear the Selective IDE Stream Control Register, but leave all other
+ * registers untouched.
+ */
+void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide)
+{
+	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	int pos;
+
+	if (!settings)
+		return;
+
+	pos = sel_ide_offset(pdev, settings);
+
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0);
+	settings->enable = 0;
+}
+EXPORT_SYMBOL_GPL(pci_ide_stream_disable);
+
+void pci_ide_init_host_bridge(struct pci_host_bridge *hb)
+{
+	hb->nr_ide_streams = 256;
+	ida_init(&hb->ide_stream_ida);
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6e4cc1c9aa58..d3f16be40102 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -615,8 +615,10 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { }
 
 #ifdef CONFIG_PCI_IDE
 void pci_ide_init(struct pci_dev *dev);
+void pci_ide_init_host_bridge(struct pci_host_bridge *hb);
 #else
 static inline void pci_ide_init(struct pci_dev *dev) { }
+static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { }
 #endif
 
 #ifdef CONFIG_PCI_TSM
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3b54f1720be5..93fa7ba8dfa6 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -672,6 +672,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge)
 	bridge->native_dpc = 1;
 	bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET;
 	bridge->native_cxl_error = 1;
+	pci_ide_init_host_bridge(bridge);
 
 	device_initialize(&bridge->dev);
 }
diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h
new file mode 100644
index 000000000000..e638f9429bf9
--- /dev/null
+++ b/include/linux/pci-ide.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common helpers for drivers (e.g. low-level PCI/TSM drivers) implementing the
+ * IDE key management protocol (IDE_KM) as defined by:
+ * PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE)
+ *
+ * Copyright(c) 2024-2025 Intel Corporation. All rights reserved.
+ */
+
+#ifndef __PCI_IDE_H__
+#define __PCI_IDE_H__
+
+enum pci_ide_partner_select {
+	PCI_IDE_EP,
+	PCI_IDE_RP,
+	PCI_IDE_PARTNER_MAX,
+	/*
+	 * In addition to the resources in each partner port, the
+	 * platform / host-bridge has a pool of stream resources that it
+	 * shares across root ports. PCI_IDE_HB lets pci_ide_stream_alloc()
+	 * index that allocation the same way as endpoints and root ports.
+	 */
+	PCI_IDE_HB = PCI_IDE_PARTNER_MAX,
+};
+
+/**
+ * struct pci_ide_partner - Per port pair Selective IDE Stream settings
+ * @rid_start: Partner Port Requester ID range start
+ * @rid_end: Partner Port Requester ID range end
+ * @stream_index: Selective IDE Stream Register Block selection
+ * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of
+ *		    address and RID association registers
+ * @setup: flag to track whether to run pci_ide_stream_teardown() for this
+ *	   partner slot
+ * @enable: flag whether to run pci_ide_stream_disable() for this partner slot
+ */
+struct pci_ide_partner {
+	u16 rid_start;
+	u16 rid_end;
+	u8 stream_index;
+	unsigned int default_stream:1;
+	unsigned int setup:1;
+	unsigned int enable:1;
+};
+
+/**
+ * struct pci_ide - PCIe Selective IDE Stream descriptor
+ * @pdev: PCIe Endpoint in the pci_ide_partner pair
+ * @partner: per-partner settings
+ * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool
+ * @stream_id: unique Stream ID (within Partner Port pairing)
+ * @name: name of the established Selective IDE Stream in sysfs
+ *
+ * Negative @stream_id values indicate "uninitialized" on the
+ * expectation that with TSM established IDE the TSM owns the stream_id
+ * allocation.
+ */
+struct pci_ide {
+	struct pci_dev *pdev;
+	struct pci_ide_partner partner[PCI_IDE_PARTNER_MAX];
+	u8 host_bridge_stream;
+	int stream_id;
+	const char *name;
+};
+
+struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev,
+					    struct pci_ide *ide);
+struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev);
+void pci_ide_stream_free(struct pci_ide *ide);
+int  pci_ide_stream_register(struct pci_ide *ide);
+void pci_ide_stream_unregister(struct pci_ide *ide);
+void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide);
+void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide);
+int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide);
+void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide);
+void pci_ide_stream_release(struct pci_ide *ide);
+DEFINE_FREE(pci_ide_stream_release, struct pci_ide *, if (_T) pci_ide_stream_release(_T))
+#endif /* __PCI_IDE_H__ */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ea94799c81b0..2c8dbae4916c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -545,6 +545,8 @@ struct pci_dev {
 	u16		ide_cap;	/* Link Integrity & Data Encryption */
 	u8		nr_ide_mem;	/* Address association resources for streams */
 	u8		nr_link_ide;	/* Link Stream count (Selective Stream offset) */
+	u16		nr_sel_ide;	/* Selective Stream count (register block allocator) */
+	struct ida	ide_stream_ida;
 	unsigned int	ide_cfg:1;	/* Config cycles over IDE */
 	unsigned int	ide_tee_limit:1; /* Disallow T=0 traffic over IDE */
 #endif
@@ -614,6 +616,10 @@ struct pci_host_bridge {
 	int		domain_nr;
 	struct list_head windows;	/* resource_entry */
 	struct list_head dma_ranges;	/* dma ranges resource list */
+#ifdef CONFIG_PCI_IDE
+	u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */
+	struct ida ide_stream_ida;
+#endif
 	u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */
 	int (*map_irq)(const struct pci_dev *, u8, u8);
 	void (*release_fn)(struct pci_host_bridge *);
-- 
cgit v1.2.3


From 9ddaf9c3ed007cd03c1335fb40920ad76f72a3d5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:29:00 -0700
Subject: PCI/IDE: Report available IDE streams

The limited number of link-encryption (IDE) streams that a given set of
host bridges supports is a platform specific detail. Provide
pci_ide_set_nr_streams() as a generic facility for either platform TSM
drivers, or PCI core native IDE, to report the number of available streams.
After invoking pci_ide_set_nr_streams() an "available_secure_streams"
attribute appears in PCI host bridge sysfs to convey that count.

Introduce a device-type, @pci_host_bridge_type, now that both a release
method and sysfs attribute groups are being specified for all 'struct
pci_host_bridge' instances.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-9-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .../ABI/testing/sysfs-devices-pci-host-bridge      | 12 ++++
 drivers/pci/ide.c                                  | 67 ++++++++++++++++++++++
 drivers/pci/pci.h                                  |  1 +
 drivers/pci/probe.c                                | 14 ++++-
 include/linux/pci-ide.h                            |  1 +
 5 files changed, 94 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
index 2c66e5bb2bf8..b91ec3450811 100644
--- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
+++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge
@@ -31,3 +31,15 @@ Description:
 		platform specific pool of stream resources shared by the Root
 		Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for
 		details about the DDDD:BB format.
+
+What:		pciDDDD:BB/available_secure_streams
+Contact:	linux-pci@vger.kernel.org
+Description:
+		(RO) When a host bridge has Root Ports that support PCIe IDE
+		(link encryption and integrity protection) there may be a
+		limited number of Selective IDE Streams that can be used for
+		establishing new end-to-end secure links. This attribute
+		decrements upon secure link setup, and increments upon secure
+		link teardown. The in-use stream count is determined by counting
+		stream symlinks. See /sys/devices/pciDDDD:BB entry for details
+		about the DDDD:BB format.
diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
index 7643840738fe..4ae3872589fc 100644
--- a/drivers/pci/ide.c
+++ b/drivers/pci/ide.c
@@ -514,3 +514,70 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb)
 	hb->nr_ide_streams = 256;
 	ida_init(&hb->ide_stream_ida);
 }
+
+static ssize_t available_secure_streams_show(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	struct pci_host_bridge *hb = to_pci_host_bridge(dev);
+	int nr = READ_ONCE(hb->nr_ide_streams);
+	int avail = nr;
+
+	if (!nr)
+		return -ENXIO;
+
+	/*
+	 * Yes, this is inefficient and racy, but it is only for occasional
+	 * platform resource surveys. Worst case is bounded to 256 streams.
+	 */
+	for (int i = 0; i < nr; i++)
+		if (ida_exists(&hb->ide_stream_ida, i))
+			avail--;
+	return sysfs_emit(buf, "%d\n", avail);
+}
+static DEVICE_ATTR_RO(available_secure_streams);
+
+static struct attribute *pci_ide_attrs[] = {
+	&dev_attr_available_secure_streams.attr,
+	NULL
+};
+
+static umode_t pci_ide_attr_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_host_bridge *hb = to_pci_host_bridge(dev);
+
+	if (a == &dev_attr_available_secure_streams.attr)
+		if (!hb->nr_ide_streams)
+			return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group pci_ide_attr_group = {
+	.attrs = pci_ide_attrs,
+	.is_visible = pci_ide_attr_visible,
+};
+
+/**
+ * pci_ide_set_nr_streams() - sets size of the pool of IDE Stream resources
+ * @hb: host bridge boundary for the stream pool
+ * @nr: number of streams
+ *
+ * Platform PCI init and/or expert test module use only. Limit IDE
+ * Stream establishment by setting the number of stream resources
+ * available at the host bridge. Platform init code must set this before
+ * the first pci_ide_stream_alloc() call if the platform has less than the
+ * default of 256 streams per host-bridge.
+ *
+ * The "PCI_IDE" symbol namespace is required because this is typically
+ * a detail that is settled in early PCI init. I.e. this export is not
+ * for endpoint drivers.
+ */
+void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr)
+{
+	hb->nr_ide_streams = min(nr, 256);
+	WARN_ON_ONCE(!ida_is_empty(&hb->ide_stream_ida));
+	sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group);
+}
+EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE");
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d3f16be40102..f6ffe5ee4717 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -616,6 +616,7 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { }
 #ifdef CONFIG_PCI_IDE
 void pci_ide_init(struct pci_dev *dev);
 void pci_ide_init_host_bridge(struct pci_host_bridge *hb);
+extern const struct attribute_group pci_ide_attr_group;
 #else
 static inline void pci_ide_init(struct pci_dev *dev) { }
 static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { }
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 93fa7ba8dfa6..cfacf5bcd073 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -653,6 +653,18 @@ static void pci_release_host_bridge_dev(struct device *dev)
 	kfree(bridge);
 }
 
+static const struct attribute_group *pci_host_bridge_groups[] = {
+#ifdef CONFIG_PCI_IDE
+	&pci_ide_attr_group,
+#endif
+	NULL
+};
+
+static const struct device_type pci_host_bridge_type = {
+	.groups = pci_host_bridge_groups,
+	.release = pci_release_host_bridge_dev,
+};
+
 static void pci_init_host_bridge(struct pci_host_bridge *bridge)
 {
 	INIT_LIST_HEAD(&bridge->windows);
@@ -672,6 +684,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge)
 	bridge->native_dpc = 1;
 	bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET;
 	bridge->native_cxl_error = 1;
+	bridge->dev.type = &pci_host_bridge_type;
 	pci_ide_init_host_bridge(bridge);
 
 	device_initialize(&bridge->dev);
@@ -686,7 +699,6 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv)
 		return NULL;
 
 	pci_init_host_bridge(bridge);
-	bridge->dev.release = pci_release_host_bridge_dev;
 
 	return bridge;
 }
diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h
index e638f9429bf9..85645b0a8620 100644
--- a/include/linux/pci-ide.h
+++ b/include/linux/pci-ide.h
@@ -63,6 +63,7 @@ struct pci_ide {
 	const char *name;
 };
 
+void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr);
 struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev,
 					    struct pci_ide *ide);
 struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev);
-- 
cgit v1.2.3


From a4438f06b1db15ce3d831ce82b8767665638aa2a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Oct 2025 14:29:01 -0700
Subject: PCI/TSM: Report active IDE streams

Given that the platform TSM owns IDE Stream ID allocation, report the
active streams via the TSM class device. Establish a symlink from the
class device to the PCI endpoint device consuming the stream, named by
the Stream ID.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251031212902.2256310-10-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-class-tsm | 10 ++++++++++
 drivers/pci/ide.c                         |  4 ++++
 drivers/virt/coco/tsm-core.c              | 28 ++++++++++++++++++++++++++++
 include/linux/pci-ide.h                   |  2 ++
 include/linux/tsm.h                       |  3 +++
 5 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm
index 2949468deaf7..6fc1a5ac6da1 100644
--- a/Documentation/ABI/testing/sysfs-class-tsm
+++ b/Documentation/ABI/testing/sysfs-class-tsm
@@ -7,3 +7,13 @@ Description:
 		signals when the PCI layer is able to support establishment of
 		link encryption and other device-security features coordinated
 		through a platform tsm.
+
+What:		/sys/class/tsm/tsmN/streamH.R.E
+Contact:	linux-pci@vger.kernel.org
+Description:
+		(RO) When a host bridge has established a secure connection via
+		the platform TSM, symlink appears. The primary function of this
+		is to have a system global review of TSM resource consumption
+		across host bridges. The link points to the endpoint PCI device
+		and matches the same link published by the host bridge. See
+		Documentation/ABI/testing/sysfs-devices-pci-host-bridge.
diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
index 4ae3872589fc..da5b1acccbb4 100644
--- a/drivers/pci/ide.c
+++ b/drivers/pci/ide.c
@@ -11,6 +11,7 @@
 #include <linux/pci_regs.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
+#include <linux/tsm.h>
 
 #include "pci.h"
 
@@ -261,6 +262,9 @@ void pci_ide_stream_release(struct pci_ide *ide)
 	if (ide->partner[PCI_IDE_EP].enable)
 		pci_ide_stream_disable(pdev, ide);
 
+	if (ide->tsm_dev)
+		tsm_ide_stream_unregister(ide);
+
 	if (ide->partner[PCI_IDE_RP].setup)
 		pci_ide_stream_teardown(rp, ide);
 
diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c
index 0e705f3067a1..f027876a2f19 100644
--- a/drivers/virt/coco/tsm-core.c
+++ b/drivers/virt/coco/tsm-core.c
@@ -4,11 +4,13 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/tsm.h>
+#include <linux/pci.h>
 #include <linux/rwsem.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/cleanup.h>
 #include <linux/pci-tsm.h>
+#include <linux/pci-ide.h>
 
 static struct class *tsm_class;
 static DECLARE_RWSEM(tsm_rwsem);
@@ -106,6 +108,32 @@ void tsm_unregister(struct tsm_dev *tsm_dev)
 }
 EXPORT_SYMBOL_GPL(tsm_unregister);
 
+/* must be invoked between tsm_register / tsm_unregister */
+int tsm_ide_stream_register(struct pci_ide *ide)
+{
+	struct pci_dev *pdev = ide->pdev;
+	struct pci_tsm *tsm = pdev->tsm;
+	struct tsm_dev *tsm_dev = tsm->tsm_dev;
+	int rc;
+
+	rc = sysfs_create_link(&tsm_dev->dev.kobj, &pdev->dev.kobj, ide->name);
+	if (rc)
+		return rc;
+
+	ide->tsm_dev = tsm_dev;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tsm_ide_stream_register);
+
+void tsm_ide_stream_unregister(struct pci_ide *ide)
+{
+	struct tsm_dev *tsm_dev = ide->tsm_dev;
+
+	ide->tsm_dev = NULL;
+	sysfs_remove_link(&tsm_dev->dev.kobj, ide->name);
+}
+EXPORT_SYMBOL_GPL(tsm_ide_stream_unregister);
+
 static void tsm_release(struct device *dev)
 {
 	struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev);
diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h
index 85645b0a8620..d0f10f3c89fc 100644
--- a/include/linux/pci-ide.h
+++ b/include/linux/pci-ide.h
@@ -50,6 +50,7 @@ struct pci_ide_partner {
  * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool
  * @stream_id: unique Stream ID (within Partner Port pairing)
  * @name: name of the established Selective IDE Stream in sysfs
+ * @tsm_dev: For TSM established IDE, the TSM device context
  *
  * Negative @stream_id values indicate "uninitialized" on the
  * expectation that with TSM established IDE the TSM owns the stream_id
@@ -61,6 +62,7 @@ struct pci_ide {
 	u8 host_bridge_stream;
 	int stream_id;
 	const char *name;
+	struct tsm_dev *tsm_dev;
 };
 
 void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr);
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
index 22e05b2aac69..a3b7ab668eff 100644
--- a/include/linux/tsm.h
+++ b/include/linux/tsm.h
@@ -123,4 +123,7 @@ int tsm_report_unregister(const struct tsm_report_ops *ops);
 struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops);
 void tsm_unregister(struct tsm_dev *tsm_dev);
 struct tsm_dev *find_tsm_dev(int id);
+struct pci_ide;
+int tsm_ide_stream_register(struct pci_ide *ide);
+void tsm_ide_stream_unregister(struct pci_ide *ide);
 #endif /* __TSM_H */
-- 
cgit v1.2.3


From e497310b4ffb559e1149ee89470d5c518d234ddf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:43:55 +0100
Subject: uaccess: Provide scoped user access regions

User space access regions are tedious and require similar code patterns all
over the place:

     	if (!user_read_access_begin(from, sizeof(*from)))
		return -EFAULT;
	unsafe_get_user(val, from, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;

This got worse with the recent addition of masked user access, which
optimizes the speculation prevention:

	if (can_do_masked_user_access())
		from = masked_user_read_access_begin((from));
	else if (!user_read_access_begin(from, sizeof(*from)))
		return -EFAULT;
	unsafe_get_user(val, from, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;

There have been issues with using the wrong user_*_access_end() variant in
the error path and other typical Copy&Pasta problems, e.g. using the wrong
fault label in the user accessor which ends up using the wrong access end
variant.

These patterns beg for scopes with automatic cleanup. The resulting outcome
is:
    	scoped_user_read_access(from, Efault)
		unsafe_get_user(val, from, Efault);
	return 0;
  Efault:
	return -EFAULT;

The scope guarantees the proper cleanup for the access mode is invoked both
in the success and the failure (fault) path.

The scoped_user_$MODE_access() macros are implemented as self terminating
nested for() loops. Thanks to Andrew Cooper for pointing me at them. The
scope can therefore be left with 'break', 'goto' and 'return'.  Even
'continue' "works" due to the self termination mechanism. Both GCC and
clang optimize all the convoluted macro maze out and the above results with
clang in:

 b80:	f3 0f 1e fa          	       endbr64
 b84:	48 b8 ef cd ab 89 67 45 23 01  movabs $0x123456789abcdef,%rax
 b8e:	48 39 c7    	               cmp    %rax,%rdi
 b91:	48 0f 47 f8          	       cmova  %rax,%rdi
 b95:	90                   	       nop
 b96:	90                   	       nop
 b97:	90                   	       nop
 b98:	31 c9                	       xor    %ecx,%ecx
 b9a:	8b 07                	       mov    (%rdi),%eax
 b9c:	89 06                	       mov    %eax,(%rsi)
 b9e:	85 c9                	       test   %ecx,%ecx
 ba0:	0f 94 c0             	       sete   %al
 ba3:	90                   	       nop
 ba4:	90                   	       nop
 ba5:	90                   	       nop
 ba6:	c3                   	       ret

Which looks as compact as it gets. The NOPs are placeholders for STAC/CLAC.
GCC emits the fault path separately:

 bf0:	f3 0f 1e fa          	       endbr64
 bf4:	48 b8 ef cd ab 89 67 45 23 01  movabs $0x123456789abcdef,%rax
 bfe:	48 39 c7             	       cmp    %rax,%rdi
 c01:	48 0f 47 f8          	       cmova  %rax,%rdi
 c05:	90                   	       nop
 c06:	90                   	       nop
 c07:	90                   	       nop
 c08:	31 d2                	       xor    %edx,%edx
 c0a:	8b 07                	       mov    (%rdi),%eax
 c0c:	89 06                	       mov    %eax,(%rsi)
 c0e:	85 d2                	       test   %edx,%edx
 c10:	75 09                	       jne    c1b <afoo+0x2b>
 c12:	90                   	       nop
 c13:	90                   	       nop
 c14:	90                   	       nop
 c15:	b8 01 00 00 00       	       mov    $0x1,%eax
 c1a:	c3                   	       ret
 c1b:	90                   	       nop
 c1c:	90                   	       nop
 c1d:	90                   	       nop
 c1e:	31 c0                	       xor    %eax,%eax
 c20:	c3                   	       ret

The fault labels for the scoped*() macros and the fault labels for the
actual user space accessors can be shared and must be placed outside of the
scope.

If masked user access is enabled on an architecture, then the pointer
handed in to scoped_user_$MODE_access() can be modified to point to a
guaranteed faulting user address. This modification is only scope local as
the pointer is aliased inside the scope. When the scope is left the alias
is no longer in effect. IOW the original pointer value is preserved so it
can be used e.g. for fixup or diagnostic purposes in the fault path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.546420421@linutronix.de
---
 include/linux/uaccess.h | 192 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 8aa82b1d6013..5f142c05b0dc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_UACCESS_H__
 #define __LINUX_UACCESS_H__
 
+#include <linux/cleanup.h>
 #include <linux/fault-inject-usercopy.h>
 #include <linux/instrumented.h>
 #include <linux/minmax.h>
@@ -35,9 +36,17 @@
 
 #ifdef masked_user_access_begin
  #define can_do_masked_user_access() 1
+# ifndef masked_user_write_access_begin
+#  define masked_user_write_access_begin masked_user_access_begin
+# endif
+# ifndef masked_user_read_access_begin
+#  define masked_user_read_access_begin masked_user_access_begin
+#endif
 #else
  #define can_do_masked_user_access() 0
  #define masked_user_access_begin(src) NULL
+ #define masked_user_read_access_begin(src) NULL
+ #define masked_user_write_access_begin(src) NULL
  #define mask_user_address(src) (src)
 #endif
 
@@ -633,6 +642,189 @@ static inline void user_access_restore(unsigned long flags) { }
 #define user_read_access_end user_access_end
 #endif
 
+/* Define RW variant so the below _mode macro expansion works */
+#define masked_user_rw_access_begin(u)	masked_user_access_begin(u)
+#define user_rw_access_begin(u, s)	user_access_begin(u, s)
+#define user_rw_access_end()		user_access_end()
+
+/* Scoped user access */
+#define USER_ACCESS_GUARD(_mode)				\
+static __always_inline void __user *				\
+class_user_##_mode##_begin(void __user *ptr)			\
+{								\
+	return ptr;						\
+}								\
+								\
+static __always_inline void					\
+class_user_##_mode##_end(void __user *ptr)			\
+{								\
+	user_##_mode##_access_end();				\
+}								\
+								\
+DEFINE_CLASS(user_ ##_mode## _access, void __user *,		\
+	     class_user_##_mode##_end(_T),			\
+	     class_user_##_mode##_begin(ptr), void __user *ptr)	\
+								\
+static __always_inline class_user_##_mode##_access_t		\
+class_user_##_mode##_access_ptr(void __user *scope)		\
+{								\
+	return scope;						\
+}
+
+USER_ACCESS_GUARD(read)
+USER_ACCESS_GUARD(write)
+USER_ACCESS_GUARD(rw)
+#undef USER_ACCESS_GUARD
+
+/**
+ * __scoped_user_access_begin - Start a scoped user access
+ * @mode:	The mode of the access class (read, write, rw)
+ * @uptr:	The pointer to access user space memory
+ * @size:	Size of the access
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * Internal helper for __scoped_user_access(). Don't use directly.
+ */
+#define __scoped_user_access_begin(mode, uptr, size, elbl)		\
+({									\
+	typeof(uptr) __retptr;						\
+									\
+	if (can_do_masked_user_access()) {				\
+		__retptr = masked_user_##mode##_access_begin(uptr);	\
+	} else {							\
+		__retptr = uptr;					\
+		if (!user_##mode##_access_begin(uptr, size))		\
+			goto elbl;					\
+	}								\
+	__retptr;							\
+})
+
+/**
+ * __scoped_user_access - Open a scope for user access
+ * @mode:	The mode of the access class (read, write, rw)
+ * @uptr:	The pointer to access user space memory
+ * @size:	Size of the access
+ * @elbl:	Error label to goto when the access region is rejected. It
+ *		must be placed outside the scope
+ *
+ * If the user access function inside the scope requires a fault label, it
+ * can use @elbl or a different label outside the scope, which requires
+ * that user access which is implemented with ASM GOTO has been properly
+ * wrapped. See unsafe_get_user() for reference.
+ *
+ *	scoped_user_rw_access(ptr, efault) {
+ *		unsafe_get_user(rval, &ptr->rval, efault);
+ *		unsafe_put_user(wval, &ptr->wval, efault);
+ *	}
+ *	return 0;
+ *  efault:
+ *	return -EFAULT;
+ *
+ * The scope is internally implemented as an autoterminating nested for()
+ * loop, which can be left with 'return', 'break' and 'goto' at any
+ * point.
+ *
+ * When the scope is left user_##@mode##_access_end() is automatically
+ * invoked.
+ *
+ * When the architecture supports masked user access and the access region
+ * which is determined by @uptr and @size is not a valid user space
+ * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user
+ * space address and does not terminate early. This optimizes for the good
+ * case and lets the performance uncritical bad case go through the fault.
+ *
+ * The eventual modification of the pointer is limited to the scope.
+ * Outside of the scope the original pointer value is unmodified, so that
+ * the original pointer value is available for diagnostic purposes in an
+ * out of scope fault path.
+ *
+ * Nesting scoped user access into a user access scope is invalid and fails
+ * the build. Nesting into other guards, e.g. pagefault is safe.
+ *
+ * The masked variant does not check the size of the access and relies on a
+ * mapping hole (e.g. guard page) to catch an out of range pointer, the
+ * first access to user memory inside the scope has to be within
+ * @uptr ... @uptr + PAGE_SIZE - 1
+ *
+ * Don't use directly. Use scoped_user_$MODE_access() instead.
+ */
+#define __scoped_user_access(mode, uptr, size, elbl)					\
+for (bool done = false; !done; done = true)						\
+	for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \
+	     !done; done = true)							\
+		for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true)	\
+			/* Force modified pointer usage within the scope */		\
+			for (const typeof(uptr) uptr = _tmpptr; !done; done = true)
+
+/**
+ * scoped_user_read_access_size - Start a scoped user read access with given size
+ * @usrc:	Pointer to the user space address to read from
+ * @size:	Size of the access starting from @usrc
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_read_access_size(usrc, size, elbl)		\
+	__scoped_user_access(read, usrc, size, elbl)
+
+/**
+ * scoped_user_read_access - Start a scoped user read access
+ * @usrc:	Pointer to the user space address to read from
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @usrc is determined via sizeof(*@usrc).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_read_access(usrc, elbl)				\
+	scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl)
+
+/**
+ * scoped_user_write_access_size - Start a scoped user write access with given size
+ * @udst:	Pointer to the user space address to write to
+ * @size:	Size of the access starting from @udst
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_write_access_size(udst, size, elbl)			\
+	__scoped_user_access(write, udst, size, elbl)
+
+/**
+ * scoped_user_write_access - Start a scoped user write access
+ * @udst:	Pointer to the user space address to write to
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @udst is determined via sizeof(*@udst).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_write_access(udst, elbl)				\
+	scoped_user_write_access_size(udst, sizeof(*(udst)), elbl)
+
+/**
+ * scoped_user_rw_access_size - Start a scoped user read/write access with given size
+ * @uptr:	Pointer to the user space address to read from and write to
+ * @size:	Size of the access starting from @uptr
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_rw_access_size(uptr, size, elbl)			\
+	__scoped_user_access(rw, uptr, size, elbl)
+
+/**
+ * scoped_user_rw_access - Start a scoped user read/write access
+ * @uptr	Pointer to the user space address to read from and write to
+ * @elbl:	Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @uptr is determined via sizeof(*@uptr).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_rw_access(uptr, elbl)				\
+	scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl)
+
 #ifdef CONFIG_HARDENED_USERCOPY
 void __noreturn usercopy_abort(const char *name, const char *detail,
 			       bool to_user, unsigned long offset,
-- 
cgit v1.2.3


From b2cfc0cd68b830dde80fce2406580e258a1e976d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:43:56 +0100
Subject: uaccess: Provide put/get_user_inline()

Provide convenience wrappers around scoped user access similar to
put/get_user(), which reduce the usage sites to:

       if (get_user_inline(val, ptr))
       		return -EFAULT;

Should only be used if there is a demonstrable performance benefit.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.609031602@linutronix.de
---
 include/linux/uaccess.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5f142c05b0dc..be395f5f7ee3 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -825,6 +825,56 @@ for (bool done = false; !done; done = true)						\
 #define scoped_user_rw_access(uptr, elbl)				\
 	scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl)
 
+/**
+ * get_user_inline - Read user data inlined
+ * @val:	The variable to store the value read from user memory
+ * @usrc:	Pointer to the user space memory to read from
+ *
+ * Return: 0 if successful, -EFAULT when faulted
+ *
+ * Inlined variant of get_user(). Only use when there is a demonstrable
+ * performance reason.
+ */
+#define get_user_inline(val, usrc)				\
+({								\
+	__label__ efault;					\
+	typeof(usrc) _tmpsrc = usrc;				\
+	int _ret = 0;						\
+								\
+	scoped_user_read_access(_tmpsrc, efault)		\
+		unsafe_get_user(val, _tmpsrc, efault);		\
+	if (0) {						\
+	efault:							\
+		_ret = -EFAULT;					\
+	}							\
+	_ret;							\
+})
+
+/**
+ * put_user_inline - Write to user memory inlined
+ * @val:	The value to write
+ * @udst:	Pointer to the user space memory to write to
+ *
+ * Return: 0 if successful, -EFAULT when faulted
+ *
+ * Inlined variant of put_user(). Only use when there is a demonstrable
+ * performance reason.
+ */
+#define put_user_inline(val, udst)				\
+({								\
+	__label__ efault;					\
+	typeof(udst) _tmpdst = udst;				\
+	int _ret = 0;						\
+								\
+	scoped_user_write_access(_tmpdst, efault)		\
+		unsafe_put_user(val, _tmpdst, efault);		\
+	if (0) {						\
+	efault:							\
+		_ret = -EFAULT;					\
+	}							\
+	_ret;							\
+})
+
 #ifdef CONFIG_HARDENED_USERCOPY
 void __noreturn usercopy_abort(const char *name, const char *detail,
 			       bool to_user, unsigned long offset,
-- 
cgit v1.2.3


From 3ca59da7aa5c7f569b04a511dc8670861d58b509 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:16 +0100
Subject: rseq: Avoid pointless evaluation in __rseq_notify_resume()

The RSEQ critical section mechanism only clears the event mask when a
critical section is registered, otherwise it is stale and collects
bits.

That means once a critical section is installed the first invocation of
that code when TIF_NOTIFY_RESUME is set will abort the critical section,
even when the TIF bit was not raised by the rseq preempt/migrate/signal
helpers.

This also has a performance implication because TIF_NOTIFY_RESUME is a
multiplexing TIF bit, which is utilized by quite some infrastructure. That
means every invocation of __rseq_notify_resume() goes unconditionally
through the heavy lifting of user space access and consistency checks even
if there is no reason to do so.

Keeping the stale event mask around when exiting to user space also
prevents it from being utilized by the upcoming time slice extension
mechanism.

Avoid this by reading and clearing the event mask before doing the user
space critical section access with interrupts or preemption disabled, which
ensures that the read and clear operation is CPU local atomic versus
scheduling and the membarrier IPI.

This is correct as after re-enabling interrupts/preemption any relevant
event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the
user space exit code take another round of TIF bit clearing.

If the event mask was non-zero, invoke the slow path. On debug kernels the
slow path is invoked unconditionally and the result of the event mask
evaluation is handed in.

Add an exit path check after the TIF bit loop, which validates on debug
kernels that the event mask is zero before exiting to user space.

While at it reword the convoluted comment why the pt_regs pointer can be
NULL under certain circumstances.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de
---
 include/linux/irq-entry-common.h |  7 +++--
 include/linux/rseq.h             | 10 +++++-
 kernel/rseq.c                    | 66 ++++++++++++++++++++++++++--------------
 3 files changed, 58 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d643c7c87822..e5941df13901 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -2,11 +2,12 @@
 #ifndef __LINUX_IRQENTRYCOMMON_H
 #define __LINUX_IRQENTRYCOMMON_H
 
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq.h>
 #include <linux/static_call_types.h>
 #include <linux/syscalls.h>
-#include <linux/context_tracking.h>
 #include <linux/tick.h>
-#include <linux/kmsan.h>
 #include <linux/unwind_deferred.h>
 
 #include <asm/entry-common.h>
@@ -226,6 +227,8 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
+	rseq_exit_to_user_mode();
+
 	/* Ensure that kernel state is sane for a return to userspace */
 	kmap_assert_nomap();
 	lockdep_assert_irqs_disabled();
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 69553e7c14c1..7622b733a508 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -66,6 +66,14 @@ static inline void rseq_migrate(struct task_struct *t)
 	rseq_set_notify_resume(t);
 }
 
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
+		if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
+			current->rseq_event_mask = 0;
+	}
+}
+
 /*
  * If parent process has a registered restartable sequences area, the
  * child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -118,7 +126,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 static inline void rseq_execve(struct task_struct *t)
 {
 }
-
+static inline void rseq_exit_to_user_mode(void) { }
 #endif
 
 #ifdef CONFIG_DEBUG_RSEQ
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2452b7366b00..246319d7cb0c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *str, u32 flags)
 	return true;
 }
 
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
 {
-	u32 flags, event_mask;
+	u32 flags;
 	int ret;
 
 	if (rseq_warn_flags("rseq_cs", cs_flags))
@@ -339,17 +339,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
 
 	if (rseq_warn_flags("rseq", flags))
 		return -EINVAL;
-
-	/*
-	 * Load and clear event mask atomically with respect to
-	 * scheduler preemption and membarrier IPIs.
-	 */
-	scoped_guard(RSEQ_EVENT_GUARD) {
-		event_mask = t->rseq_event_mask;
-		t->rseq_event_mask = 0;
-	}
-
-	return !!event_mask;
+	return 0;
 }
 
 static int clear_rseq_cs(struct rseq __user *rseq)
@@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
 	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
 }
 
-static int rseq_ip_fixup(struct pt_regs *regs)
+static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 {
 	unsigned long ip = instruction_pointer(regs);
 	struct task_struct *t = current;
@@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
 	 */
 	if (!in_rseq_cs(ip, &rseq_cs))
 		return clear_rseq_cs(t->rseq);
-	ret = rseq_need_restart(t, rseq_cs.flags);
-	if (ret <= 0)
+	ret = rseq_check_flags(t, rseq_cs.flags);
+	if (ret < 0)
 		return ret;
+	if (!abort)
+		return 0;
 	ret = clear_rseq_cs(t->rseq);
 	if (ret)
 		return ret;
@@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 		return;
 
 	/*
-	 * regs is NULL if and only if the caller is in a syscall path.  Skip
-	 * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
-	 * kill a misbehaving userspace on debug kernels.
+	 * If invoked from hypervisors or IO-URING, then @regs is a NULL
+	 * pointer, so fixup cannot be done. If the syscall which led to
+	 * this invocation was invoked inside a critical section, then it
+	 * will either end up in this code again or a possible violation of
+	 * a syscall inside a critical region can only be detected by the
+	 * debug code in rseq_syscall() in a debug enabled kernel.
 	 */
 	if (regs) {
-		ret = rseq_ip_fixup(regs);
-		if (unlikely(ret < 0))
-			goto error;
+		/*
+		 * Read and clear the event mask first. If the task was not
+		 * preempted or migrated or a signal is on the way, there
+		 * is no point in doing any of the heavy lifting here on
+		 * production kernels. In that case TIF_NOTIFY_RESUME was
+		 * raised by some other functionality.
+		 *
+		 * This is correct because the read/clear operation is
+		 * guarded against scheduler preemption, which makes it CPU
+		 * local atomic. If the task is preempted right after
+		 * re-enabling preemption then TIF_NOTIFY_RESUME is set
+		 * again and this function is invoked another time _before_
+		 * the task is able to return to user mode.
+		 *
+		 * On a debug kernel, invoke the fixup code unconditionally
+		 * with the result handed in to allow the detection of
+		 * inconsistencies.
+		 */
+		u32 event_mask;
+
+		scoped_guard(RSEQ_EVENT_GUARD) {
+			event_mask = t->rseq_event_mask;
+			t->rseq_event_mask = 0;
+		}
+
+		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
+			ret = rseq_ip_fixup(regs, !!event_mask);
+			if (unlikely(ret < 0))
+				goto error;
+		}
 	}
 	if (unlikely(rseq_update_cpu_node_id(t)))
 		goto error;
-- 
cgit v1.2.3


From fdc0f39d289ebcf46ef44f43460207ef24c94ed7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:18 +0100
Subject: rseq: Condense the inline stubs

Scrolling over tons of pointless

	{
	}

lines to find the actual code is annoying at best.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.085971048@linutronix.de
---
 include/linux/rseq.h | 47 ++++++++++++-----------------------------------
 1 file changed, 12 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 7622b733a508..21f875af0e96 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -101,44 +101,21 @@ static inline void rseq_execve(struct task_struct *t)
 	t->rseq_event_mask = 0;
 }
 
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
+#else /* CONFIG_RSEQ */
+static inline void rseq_set_notify_resume(struct task_struct *t) { }
+static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_preempt(struct task_struct *t) { }
+static inline void rseq_migrate(struct task_struct *t) { }
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
+static inline void rseq_execve(struct task_struct *t) { }
 static inline void rseq_exit_to_user_mode(void) { }
-#endif
+#endif  /* !CONFIG_RSEQ */
 
 #ifdef CONFIG_DEBUG_RSEQ
-
 void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
+#else /* CONFIG_DEBUG_RSEQ */
+static inline void rseq_syscall(struct pt_regs *regs) { }
+#endif /* !CONFIG_DEBUG_RSEQ */
 
 #endif /* _LINUX_RSEQ_H */
-- 
cgit v1.2.3


From 41b43a6ba3848be8ceec77b8b2a56ddeca6167ed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:22 +0100
Subject: rseq: Remove the ksig argument from rseq_handle_notify_resume()

There is no point for this being visible in the resume_to_user_mode()
handling.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.211520245@linutronix.de
---
 include/linux/resume_user_mode.h |  2 +-
 include/linux/rseq.h             | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index e0135e0adae0..dd3bf7da90a8 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	mem_cgroup_handle_over_high(GFP_KERNEL);
 	blkcg_maybe_throttle_current();
 
-	rseq_handle_notify_resume(NULL, regs);
+	rseq_handle_notify_resume(regs);
 }
 
 #endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 21f875af0e96..d72ddf7ce903 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -37,19 +37,20 @@ static inline void rseq_set_notify_resume(struct task_struct *t)
 
 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
 {
 	if (current->rseq)
-		__rseq_handle_notify_resume(ksig, regs);
+		__rseq_handle_notify_resume(NULL, regs);
 }
 
 static inline void rseq_signal_deliver(struct ksignal *ksig,
 				       struct pt_regs *regs)
 {
-	scoped_guard(RSEQ_EVENT_GUARD)
-		__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
-	rseq_handle_notify_resume(ksig, regs);
+	if (current->rseq) {
+		scoped_guard(RSEQ_EVENT_GUARD)
+			__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+		__rseq_handle_notify_resume(ksig, regs);
+	}
 }
 
 /* rseq_preempt() requires preemption to be disabled. */
@@ -103,7 +104,7 @@ static inline void rseq_execve(struct task_struct *t)
 
 #else /* CONFIG_RSEQ */
 static inline void rseq_set_notify_resume(struct task_struct *t) { }
-static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_preempt(struct task_struct *t) { }
 static inline void rseq_migrate(struct task_struct *t) { }
-- 
cgit v1.2.3


From d923739e2e356424cc566143a3323c62cd6ed067 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:26 +0100
Subject: rseq: Simplify the event notification

Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
flags") the bits in task::rseq_event_mask are meaningless and just extra
work in terms of setting them individually.

Aside from that, the only relevant point where an event has to be raised is
a context switch. Neither the CPU nor the MM CID can change without going
through a context switch.

Collapse them all into a single boolean, which simplifies the code a lot,
and remove the pointless invocations which have been sprinkled all over the
place for no value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de
---
 fs/exec.c                 |  2 +-
 include/linux/rseq.h      | 66 ++++++++++-------------------------------------
 include/linux/sched.h     | 10 +++----
 include/uapi/linux/rseq.h | 21 +++++----------
 kernel/rseq.c             | 28 ++++++++++++--------
 kernel/sched/core.c       |  5 +---
 kernel/sched/membarrier.c |  8 +++---
 7 files changed, 48 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 4298e7e08d5d..e45b29890269 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1775,7 +1775,7 @@ out:
 		force_fatal_sig(SIGSEGV);
 
 	sched_mm_cid_after_execve(current);
-	rseq_set_notify_resume(current);
+	rseq_sched_switch_event(current);
 	current->in_execve = 0;
 
 	return retval;
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d72ddf7ce903..241067bf20db 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,38 +3,8 @@
 #define _LINUX_RSEQ_H
 
 #ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD	irq
-#else
-# define RSEQ_EVENT_GUARD	preempt
-#endif
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
-	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
-	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
-	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
-	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-	if (t->rseq)
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
@@ -43,35 +13,27 @@ static inline void rseq_handle_notify_resume(struct pt_regs *regs)
 		__rseq_handle_notify_resume(NULL, regs);
 }
 
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
 {
 	if (current->rseq) {
-		scoped_guard(RSEQ_EVENT_GUARD)
-			__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+		current->rseq_event_pending = true;
 		__rseq_handle_notify_resume(ksig, regs);
 	}
 }
 
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+static inline void rseq_sched_switch_event(struct task_struct *t)
 {
-	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
+	if (t->rseq) {
+		t->rseq_event_pending = true;
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	}
 }
 
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
-		if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
-			current->rseq_event_mask = 0;
+		if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
+			current->rseq_event_pending = false;
 	}
 }
 
@@ -85,12 +47,12 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 		t->rseq = NULL;
 		t->rseq_len = 0;
 		t->rseq_sig = 0;
-		t->rseq_event_mask = 0;
+		t->rseq_event_pending = false;
 	} else {
 		t->rseq = current->rseq;
 		t->rseq_len = current->rseq_len;
 		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_mask = current->rseq_event_mask;
+		t->rseq_event_pending = current->rseq_event_pending;
 	}
 }
 
@@ -99,15 +61,13 @@ static inline void rseq_execve(struct task_struct *t)
 	t->rseq = NULL;
 	t->rseq_len = 0;
 	t->rseq_sig = 0;
-	t->rseq_event_mask = 0;
+	t->rseq_event_pending = false;
 }
 
 #else /* CONFIG_RSEQ */
-static inline void rseq_set_notify_resume(struct task_struct *t) { }
 static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
-static inline void rseq_preempt(struct task_struct *t) { }
-static inline void rseq_migrate(struct task_struct *t) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
 static inline void rseq_execve(struct task_struct *t) { }
 static inline void rseq_exit_to_user_mode(void) { }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b469878de25c..6627c527c2c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,14 +1407,14 @@ struct task_struct {
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_RSEQ
-	struct rseq __user *rseq;
-	u32 rseq_len;
-	u32 rseq_sig;
+	struct rseq __user		*rseq;
+	u32				rseq_len;
+	u32				rseq_sig;
 	/*
-	 * RmW on rseq_event_mask must be performed atomically
+	 * RmW on rseq_event_pending must be performed atomically
 	 * with respect to preemption.
 	 */
-	unsigned long rseq_event_mask;
+	bool				rseq_event_pending;
 # ifdef CONFIG_DEBUG_RSEQ
 	/*
 	 * This is a place holder to save a copy of the rseq fields for
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..1b76d508400c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -114,20 +114,13 @@ struct rseq {
 	/*
 	 * Restartable sequences flags field.
 	 *
-	 * This field should only be updated by the thread which
-	 * registered this data structure. Read by the kernel.
-	 * Mainly used for single-stepping through rseq critical sections
-	 * with debuggers.
-	 *
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
-	 *     Inhibit instruction sequence block restart on preemption
-	 *     for this thread.
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
-	 *     Inhibit instruction sequence block restart on signal
-	 *     delivery for this thread.
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
-	 *     Inhibit instruction sequence block restart on migration for
-	 *     this thread.
+	 * This field was initially intended to allow event masking for
+	 * single-stepping through rseq critical sections with debuggers.
+	 * The kernel does not support this anymore and the relevant bits
+	 * are checked for being always false:
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
 	 */
 	__u32 flags;
 
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 80af48a972f0..59adc1a7183b 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -78,6 +78,12 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/rseq.h>
 
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD	irq
+#else
+# define RSEQ_EVENT_GUARD	preempt
+#endif
+
 /* The original rseq structure size (including padding) is 32 bytes. */
 #define ORIG_RSEQ_SIZE		32
 
@@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	 */
 	if (regs) {
 		/*
-		 * Read and clear the event mask first. If the task was not
-		 * preempted or migrated or a signal is on the way, there
-		 * is no point in doing any of the heavy lifting here on
-		 * production kernels. In that case TIF_NOTIFY_RESUME was
-		 * raised by some other functionality.
+		 * Read and clear the event pending bit first. If the task
+		 * was not preempted or migrated or a signal is on the way,
+		 * there is no point in doing any of the heavy lifting here
+		 * on production kernels. In that case TIF_NOTIFY_RESUME
+		 * was raised by some other functionality.
 		 *
 		 * This is correct because the read/clear operation is
 		 * guarded against scheduler preemption, which makes it CPU
@@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 		 * with the result handed in to allow the detection of
 		 * inconsistencies.
 		 */
-		u32 event_mask;
+		bool event;
 
 		scoped_guard(RSEQ_EVENT_GUARD) {
-			event_mask = t->rseq_event_mask;
-			t->rseq_event_mask = 0;
+			event = t->rseq_event_pending;
+			t->rseq_event_pending = false;
 		}
 
-		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
-			ret = rseq_ip_fixup(regs, !!event_mask);
+		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
+			ret = rseq_ip_fixup(regs, event);
 			if (unlikely(ret < 0))
 				goto error;
 		}
@@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	 * registered, ensure the cpu_id_start and cpu_id fields
 	 * are updated before returning to user-space.
 	 */
-	rseq_set_notify_resume(current);
+	rseq_sched_switch_event(current);
 
 	return 0;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ebf67b48e2..b75e8e1eca4a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3329,7 +3329,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		rseq_migrate(p);
 		sched_mm_cid_migrate_from(p);
 		perf_event_task_migrate(p);
 	}
@@ -4763,7 +4762,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 		p->sched_task_group = tg;
 	}
 #endif
-	rseq_migrate(p);
 	/*
 	 * We're setting the CPU for the first time, we don't migrate,
 	 * so use __set_task_cpu().
@@ -4827,7 +4825,6 @@ void wake_up_new_task(struct task_struct *p)
 	 * as we're not fully set-up yet.
 	 */
 	p->recent_used_cpu = task_cpu(p);
-	rseq_migrate(p);
 	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
@@ -5121,7 +5118,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	kcov_prepare_switch(prev);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
-	rseq_preempt(prev);
+	rseq_sched_switch_event(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 62fba83b7bb1..623445603725 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
 	 * is negligible.
 	 */
 	smp_mb();
-	rseq_preempt(current);
+	rseq_sched_switch_event(current);
 }
 
 static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 		 * membarrier, we will end up with some thread in the mm
 		 * running without a core sync.
 		 *
-		 * For RSEQ, don't rseq_preempt() the caller.  User code
-		 * is not supposed to issue syscalls at all from inside an
-		 * rseq critical section.
+		 * For RSEQ, don't invoke rseq_sched_switch_event() on the
+		 * caller.  User code is not supposed to issue syscalls at
+		 * all from inside an rseq critical section.
 		 */
 		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
 			preempt_disable();
-- 
cgit v1.2.3


From 83409986f49f17b14a675f9c598ad50d4c60191b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:28 +0100
Subject: rseq, virt: Retrigger RSEQ after vcpu_run()

Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that a possibly pending rseq event is not lost on the
way to user space.

This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.

It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().

This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.

Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
---
 drivers/hv/mshv_root_main.c |  3 ++
 include/linux/rseq.h        | 17 ++++++++++
 kernel/rseq.c               | 78 ++++++++++++++++++++++++---------------------
 virt/kvm/kvm_main.c         |  7 ++++
 4 files changed, 68 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e3b2bd417c46..a21a0eb0f5be 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -29,6 +29,7 @@
 #include <linux/crash_dump.h>
 #include <linux/panic_notifier.h>
 #include <linux/vmalloc.h>
+#include <linux/rseq.h>
 
 #include "mshv_eventfd.h"
 #include "mshv.h"
@@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
 		}
 	} while (!vp->run.flags.intercept_suspend);
 
+	rseq_virt_userspace_exit();
+
 	return ret;
 }
 
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 241067bf20db..c6267f70c746 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void)
 	}
 }
 
+/*
+ * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
+ * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
+ * that case just to do it eventually again before returning to user space,
+ * the entry resume_user_mode_work() invocation is ignored as the register
+ * argument is NULL.
+ *
+ * After returning from guest mode, they have to invoke this function to
+ * re-raise TIF_NOTIFY_RESUME if necessary.
+ */
+static inline void rseq_virt_userspace_exit(void)
+{
+	if (current->rseq_event_pending)
+		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+}
+
 /*
  * If parent process has a registered restartable sequences area, the
  * child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t)
 static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
 static inline void rseq_execve(struct task_struct *t) { }
 static inline void rseq_exit_to_user_mode(void) { }
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 59adc1a7183b..01e711383e05 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 {
 	struct task_struct *t = current;
 	int ret, sig;
+	bool event;
+
+	/*
+	 * If invoked from hypervisors before entering the guest via
+	 * resume_user_mode_work(), then @regs is a NULL pointer.
+	 *
+	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+	 * it before returning from the ioctl() to user space when
+	 * rseq_event.sched_switch is set.
+	 *
+	 * So it's safe to ignore here instead of pointlessly updating it
+	 * in the vcpu_run() loop.
+	 */
+	if (!regs)
+		return;
 
 	if (unlikely(t->flags & PF_EXITING))
 		return;
 
 	/*
-	 * If invoked from hypervisors or IO-URING, then @regs is a NULL
-	 * pointer, so fixup cannot be done. If the syscall which led to
-	 * this invocation was invoked inside a critical section, then it
-	 * will either end up in this code again or a possible violation of
-	 * a syscall inside a critical region can only be detected by the
-	 * debug code in rseq_syscall() in a debug enabled kernel.
+	 * Read and clear the event pending bit first. If the task
+	 * was not preempted or migrated or a signal is on the way,
+	 * there is no point in doing any of the heavy lifting here
+	 * on production kernels. In that case TIF_NOTIFY_RESUME
+	 * was raised by some other functionality.
+	 *
+	 * This is correct because the read/clear operation is
+	 * guarded against scheduler preemption, which makes it CPU
+	 * local atomic. If the task is preempted right after
+	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
+	 * again and this function is invoked another time _before_
+	 * the task is able to return to user mode.
+	 *
+	 * On a debug kernel, invoke the fixup code unconditionally
+	 * with the result handed in to allow the detection of
+	 * inconsistencies.
 	 */
-	if (regs) {
-		/*
-		 * Read and clear the event pending bit first. If the task
-		 * was not preempted or migrated or a signal is on the way,
-		 * there is no point in doing any of the heavy lifting here
-		 * on production kernels. In that case TIF_NOTIFY_RESUME
-		 * was raised by some other functionality.
-		 *
-		 * This is correct because the read/clear operation is
-		 * guarded against scheduler preemption, which makes it CPU
-		 * local atomic. If the task is preempted right after
-		 * re-enabling preemption then TIF_NOTIFY_RESUME is set
-		 * again and this function is invoked another time _before_
-		 * the task is able to return to user mode.
-		 *
-		 * On a debug kernel, invoke the fixup code unconditionally
-		 * with the result handed in to allow the detection of
-		 * inconsistencies.
-		 */
-		bool event;
-
-		scoped_guard(RSEQ_EVENT_GUARD) {
-			event = t->rseq_event_pending;
-			t->rseq_event_pending = false;
-		}
-
-		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
-			ret = rseq_ip_fixup(regs, event);
-			if (unlikely(ret < 0))
-				goto error;
-		}
+	scoped_guard(RSEQ_EVENT_GUARD) {
+		event = t->rseq_event_pending;
+		t->rseq_event_pending = false;
 	}
+
+	if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
+		ret = rseq_ip_fixup(regs, event);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
 	if (unlikely(rseq_update_cpu_node_id(t)))
 		goto error;
 	return;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b7a0ae2a7b20..4255fcf9c6e5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -49,6 +49,7 @@
 #include <linux/lockdep.h>
 #include <linux/kthread.h>
 #include <linux/suspend.h>
+#include <linux/rseq.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = kvm_arch_vcpu_ioctl_run(vcpu);
 		vcpu->wants_to_run = false;
 
+		/*
+		 * FIXME: Remove this hack once all KVM architectures
+		 * support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
+		 */
+		rseq_virt_userspace_exit();
+
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}
-- 
cgit v1.2.3


From faba9d250eaec7afa248bba71531a08ccc497aab Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:33 +0100
Subject: rseq: Introduce struct rseq_data

In preparation for a major rewrite of this code, provide a data structure
for rseq management.

Put all the rseq-related data into it (except for the debug part), which
allows fork/execve to be simplified by using memset() and memcpy() instead
of adding new fields to initialize over and over.

Create a storage struct for event management as well and put the
sched_switch event and an indicator for RSEQ on a task into it as a
start. That uses a union, which allows masking and clearing the whole lot
efficiently.

The indicators are explicitly not a bit field. Bit fields generate abysmal
code.

The boolean members are defined as u8 as that actually guarantees that it
fits. There seem to be strange architecture ABIs which need more than 8
bits for a boolean.

The has_rseq member is redundant vs. task::rseq, but it turns out that
boolean operations and quick checks on the union generate better code than
fiddling with separate entities and data types.

This struct will be extended over time to carry more information.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.527086690@linutronix.de
---
 include/linux/rseq.h       | 48 ++++++++++++++++-------------------
 include/linux/rseq_types.h | 51 +++++++++++++++++++++++++++++++++++++
 include/linux/sched.h      | 14 +++--------
 kernel/ptrace.c            |  6 ++---
 kernel/rseq.c              | 63 +++++++++++++++++++++++-----------------------
 5 files changed, 110 insertions(+), 72 deletions(-)
 create mode 100644 include/linux/rseq_types.h

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index c6267f70c746..ab91b1e6bb4a 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
 {
-	if (current->rseq)
+	if (current->rseq.event.has_rseq)
 		__rseq_handle_notify_resume(NULL, regs);
 }
 
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
 {
-	if (current->rseq) {
-		current->rseq_event_pending = true;
+	if (current->rseq.event.has_rseq) {
+		current->rseq.event.sched_switch = true;
 		__rseq_handle_notify_resume(ksig, regs);
 	}
 }
 
 static inline void rseq_sched_switch_event(struct task_struct *t)
 {
-	if (t->rseq) {
-		t->rseq_event_pending = true;
+	if (t->rseq.event.has_rseq) {
+		t->rseq.event.sched_switch = true;
 		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
 	}
 }
@@ -32,8 +32,9 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
-		if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
-			current->rseq_event_pending = false;
+		if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
+				 current->rseq.event.events))
+			current->rseq.event.events = 0;
 	}
 }
 
@@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to_user_mode(void)
  */
 static inline void rseq_virt_userspace_exit(void)
 {
-	if (current->rseq_event_pending)
+	if (current->rseq.event.sched_switch)
 		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
 }
 
+static inline void rseq_reset(struct task_struct *t)
+{
+	memset(&t->rseq, 0, sizeof(t->rseq));
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	rseq_reset(t);
+}
+
 /*
  * If parent process has a registered restartable sequences area, the
  * child inherits. Unregister rseq for a clone with CLONE_VM set.
  */
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 {
-	if (clone_flags & CLONE_VM) {
-		t->rseq = NULL;
-		t->rseq_len = 0;
-		t->rseq_sig = 0;
-		t->rseq_event_pending = false;
-	} else {
+	if (clone_flags & CLONE_VM)
+		rseq_reset(t);
+	else
 		t->rseq = current->rseq;
-		t->rseq_len = current->rseq_len;
-		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_pending = current->rseq_event_pending;
-	}
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
-	t->rseq = NULL;
-	t->rseq_len = 0;
-	t->rseq_sig = 0;
-	t->rseq_event_pending = false;
 }
 
 #else /* CONFIG_RSEQ */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
new file mode 100644
index 000000000000..f7a60c8eddc9
--- /dev/null
+++ b/include/linux/rseq_types.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_TYPES_H
+#define _LINUX_RSEQ_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_RSEQ
+struct rseq;
+
+/**
+ * struct rseq_event - Storage for rseq related event management
+ * @all:		Compound to initialize and clear the data efficiently
+ * @events:		Compound to access events with a single load/store
+ * @sched_switch:	True if the task was scheduled out
+ * @has_rseq:		True if the task has a rseq pointer installed
+ */
+struct rseq_event {
+	union {
+		u32				all;
+		struct {
+			union {
+				u16		events;
+				struct {
+					u8	sched_switch;
+				};
+			};
+
+			u8			has_rseq;
+		};
+	};
+};
+
+/**
+ * struct rseq_data - Storage for all rseq related data
+ * @usrptr:	Pointer to the registered user space RSEQ memory
+ * @len:	Length of the RSEQ region
+ * @sig:	Signature of critial section abort IPs
+ * @event:	Storage for event management
+ */
+struct rseq_data {
+	struct rseq __user		*usrptr;
+	u32				len;
+	u32				sig;
+	struct rseq_event		event;
+};
+
+#else /* CONFIG_RSEQ */
+struct rseq_data { };
+#endif /* !CONFIG_RSEQ */
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6627c527c2c7..15627769409d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,6 +41,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
+#include <linux/rseq_types.h>
 #include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
@@ -1406,16 +1407,8 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
-#ifdef CONFIG_RSEQ
-	struct rseq __user		*rseq;
-	u32				rseq_len;
-	u32				rseq_sig;
-	/*
-	 * RmW on rseq_event_pending must be performed atomically
-	 * with respect to preemption.
-	 */
-	bool				rseq_event_pending;
-# ifdef CONFIG_DEBUG_RSEQ
+	struct rseq_data		rseq;
+#ifdef CONFIG_DEBUG_RSEQ
 	/*
 	 * This is a place holder to save a copy of the rseq fields for
 	 * validation of read-only fields. The struct rseq has a
@@ -1423,7 +1416,6 @@ struct task_struct {
 	 * directly. Reserve a size large enough for the known fields.
 	 */
 	char				rseq_fields[sizeof(struct rseq)];
-# endif
 #endif
 
 #ifdef CONFIG_SCHED_MM_CID
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 75a84efad40f..392ec2f75f01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
 					  unsigned long size, void __user *data)
 {
 	struct ptrace_rseq_configuration conf = {
-		.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
-		.rseq_abi_size = task->rseq_len,
-		.signature = task->rseq_sig,
+		.rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+		.rseq_abi_size = task->rseq.len,
+		.signature = task->rseq.sig,
 		.flags = 0,
 	};
 
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 81dddafa2f2e..aae62661e6bb 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -103,13 +103,13 @@ static int rseq_validate_ro_fields(struct task_struct *t)
 				      DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 	u32 cpu_id_start, cpu_id, node_id, mm_cid;
-	struct rseq __user *rseq = t->rseq;
+	struct rseq __user *rseq = t->rseq.usrptr;
 
 	/*
 	 * Validate fields which are required to be read-only by
 	 * user-space.
 	 */
-	if (!user_read_access_begin(rseq, t->rseq_len))
+	if (!user_read_access_begin(rseq, t->rseq.len))
 		goto efault;
 	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
 	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
@@ -147,10 +147,10 @@ efault:
  * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
  * state.
  */
-#define rseq_unsafe_put_user(t, value, field, error_label)		\
-	do {								\
-		unsafe_put_user(value, &t->rseq->field, error_label);	\
-		rseq_kernel_fields(t)->field = value;			\
+#define rseq_unsafe_put_user(t, value, field, error_label)			\
+	do {									\
+		unsafe_put_user(value, &t->rseq.usrptr->field, error_label);	\
+		rseq_kernel_fields(t)->field = value;				\
 	} while (0)
 
 #else
@@ -160,12 +160,12 @@ static int rseq_validate_ro_fields(struct task_struct *t)
 }
 
 #define rseq_unsafe_put_user(t, value, field, error_label)		\
-	unsafe_put_user(value, &t->rseq->field, error_label)
+	unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
 #endif
 
 static int rseq_update_cpu_node_id(struct task_struct *t)
 {
-	struct rseq __user *rseq = t->rseq;
+	struct rseq __user *rseq = t->rseq.usrptr;
 	u32 cpu_id = raw_smp_processor_id();
 	u32 node_id = cpu_to_node(cpu_id);
 	u32 mm_cid = task_mm_cid(t);
@@ -176,7 +176,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
 	if (rseq_validate_ro_fields(t))
 		goto efault;
 	WARN_ON_ONCE((int) mm_cid < 0);
-	if (!user_write_access_begin(rseq, t->rseq_len))
+	if (!user_write_access_begin(rseq, t->rseq.len))
 		goto efault;
 
 	rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
@@ -201,7 +201,7 @@ efault:
 
 static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
 {
-	struct rseq __user *rseq = t->rseq;
+	struct rseq __user *rseq = t->rseq.usrptr;
 	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
 	    mm_cid = 0;
 
@@ -211,7 +211,7 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
 	if (rseq_validate_ro_fields(t))
 		goto efault;
 
-	if (!user_write_access_begin(rseq, t->rseq_len))
+	if (!user_write_access_begin(rseq, t->rseq.len))
 		goto efault;
 
 	/*
@@ -272,7 +272,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	u32 sig;
 	int ret;
 
-	ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
+	ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr);
 	if (ret)
 		return ret;
 
@@ -305,10 +305,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	if (ret)
 		return ret;
 
-	if (current->rseq_sig != sig) {
+	if (current->rseq.sig != sig) {
 		printk_ratelimited(KERN_WARNING
 			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
-			sig, current->rseq_sig, current->pid, usig);
+			sig, current->rseq.sig, current->pid, usig);
 		return -EINVAL;
 	}
 	return 0;
@@ -338,7 +338,7 @@ static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
 		return -EINVAL;
 
 	/* Get thread flags. */
-	ret = get_user(flags, &t->rseq->flags);
+	ret = get_user(flags, &t->rseq.usrptr->flags);
 	if (ret)
 		return ret;
 
@@ -392,13 +392,13 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 	 * Clear the rseq_cs pointer and return.
 	 */
 	if (!in_rseq_cs(ip, &rseq_cs))
-		return clear_rseq_cs(t->rseq);
+		return clear_rseq_cs(t->rseq.usrptr);
 	ret = rseq_check_flags(t, rseq_cs.flags);
 	if (ret < 0)
 		return ret;
 	if (!abort)
 		return 0;
-	ret = clear_rseq_cs(t->rseq);
+	ret = clear_rseq_cs(t->rseq.usrptr);
 	if (ret)
 		return ret;
 	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
@@ -460,8 +460,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	 * inconsistencies.
 	 */
 	scoped_guard(RSEQ_EVENT_GUARD) {
-		event = t->rseq_event_pending;
-		t->rseq_event_pending = false;
+		event = t->rseq.event.sched_switch;
+		t->rseq.event.sched_switch = false;
 	}
 
 	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
@@ -492,7 +492,7 @@ void rseq_syscall(struct pt_regs *regs)
 	struct task_struct *t = current;
 	struct rseq_cs rseq_cs;
 
-	if (!t->rseq)
+	if (!t->rseq.usrptr)
 		return;
 	if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
 		force_sig(SIGSEGV);
@@ -511,33 +511,31 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 		if (flags & ~RSEQ_FLAG_UNREGISTER)
 			return -EINVAL;
 		/* Unregister rseq for current thread. */
-		if (current->rseq != rseq || !current->rseq)
+		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
 			return -EINVAL;
-		if (rseq_len != current->rseq_len)
+		if (rseq_len != current->rseq.len)
 			return -EINVAL;
-		if (current->rseq_sig != sig)
+		if (current->rseq.sig != sig)
 			return -EPERM;
 		ret = rseq_reset_rseq_cpu_node_id(current);
 		if (ret)
 			return ret;
-		current->rseq = NULL;
-		current->rseq_sig = 0;
-		current->rseq_len = 0;
+		rseq_reset(current);
 		return 0;
 	}
 
 	if (unlikely(flags))
 		return -EINVAL;
 
-	if (current->rseq) {
+	if (current->rseq.usrptr) {
 		/*
 		 * If rseq is already registered, check whether
 		 * the provided address differs from the prior
 		 * one.
 		 */
-		if (current->rseq != rseq || rseq_len != current->rseq_len)
+		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
 			return -EINVAL;
-		if (current->rseq_sig != sig)
+		if (current->rseq.sig != sig)
 			return -EPERM;
 		/* Already registered. */
 		return -EBUSY;
@@ -586,15 +584,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	 * Activate the registration by setting the rseq area address, length
 	 * and signature in the task struct.
 	 */
-	current->rseq = rseq;
-	current->rseq_len = rseq_len;
-	current->rseq_sig = sig;
+	current->rseq.usrptr = rseq;
+	current->rseq.len = rseq_len;
+	current->rseq.sig = sig;
 
 	/*
 	 * If rseq was previously inactive, and has just been
 	 * registered, ensure the cpu_id_start and cpu_id fields
 	 * are updated before returning to user-space.
 	 */
+	current->rseq.event.has_rseq = true;
 	rseq_sched_switch_event(current);
 
 	return 0;
-- 
cgit v1.2.3


From 5204be16790f305febbf331d0ec2cead7978b3c3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:36 +0100
Subject: entry: Clean up header

Clean up the include ordering, kernel-doc and other trivialities before
making further changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.590338411@linutronix.de
---
 include/linux/entry-common.h     | 8 ++++----
 include/linux/irq-entry-common.h | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 7177436f0f9e..c585221ff16b 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -3,11 +3,11 @@
 #define __LINUX_ENTRYCOMMON_H
 
 #include <linux/irq-entry-common.h>
+#include <linux/livepatch.h>
 #include <linux/ptrace.h>
+#include <linux/resume_user_mode.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
-#include <linux/livepatch.h>
-#include <linux/resume_user_mode.h>
 
 #include <asm/entry-common.h>
 #include <asm/syscall.h>
@@ -37,6 +37,7 @@
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
 				 ARCH_SYSCALL_WORK_ENTER)
+
 #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
@@ -61,8 +62,7 @@
  */
 void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
 
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
-			 unsigned long work);
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
 
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index e5941df13901..9b1f386ffeb1 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -68,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; }
 
 /**
  * enter_from_user_mode - Establish state when coming from user mode
+ * @regs:	Pointer to current's pt_regs
  *
  * Syscall/interrupt entry disables interrupts, but user mode is traced as
  * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
@@ -357,6 +358,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
  * Conditional reschedule with additional sanity checks.
  */
 void raw_irqentry_exit_cond_resched(void);
+
 #ifdef CONFIG_PREEMPT_DYNAMIC
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
-- 
cgit v1.2.3


From 54a5ab56242f96555999aaa41228f77b4a76e386 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:38 +0100
Subject: entry: Remove syscall_enter_from_user_mode_prepare()

Open code the only user in the x86 syscall code and reduce the zoo of
functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.652839989@linutronix.de
---
 arch/x86/entry/syscall_32.c   |  3 ++-
 include/linux/entry-common.h  | 26 +++++---------------------
 kernel/entry/syscall-common.c |  8 --------
 3 files changed, 7 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 2b15ea17bb7c..a67a644d0cfe 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 	 * fetch EBP before invoking any of the syscall entry work
 	 * functions.
 	 */
-	syscall_enter_from_user_mode_prepare(regs);
+	enter_from_user_mode(regs);
 
 	instrumentation_begin();
+	local_irq_enable();
 	/* Fetch EBP from where the vDSO stashed it. */
 	if (IS_ENABLED(CONFIG_X86_64)) {
 		/*
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c585221ff16b..75b194c34e18 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -45,23 +45,6 @@
 				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
-/**
- * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
- * @regs:	Pointer to currents pt_regs
- *
- * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct, interrupts are enabled and the
- * subsequent functions can be instrumented.
- *
- * This handles lockdep, RCU (context tracking) and tracing state, i.e.
- * the functionality provided by enter_from_user_mode().
- *
- * This is invoked when there is extra architecture specific functionality
- * to be done between establishing state and handling user mode entry work.
- */
-void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
-
 long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
 
 /**
@@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)
  * @syscall:	The syscall number
  *
  * Invoked from architecture specific syscall entry code with interrupts
- * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
- * architecture specific work.
+ * enabled after invoking enter_from_user_mode(), enabling interrupts and
+ * extra architecture specific work.
  *
  * Returns: The original or a modified syscall number
  *
@@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
  * function returns all state is correct, interrupts are enabled and the
  * subsequent functions can be instrumented.
  *
- * This is combination of syscall_enter_from_user_mode_prepare() and
- * syscall_enter_from_user_mode_work().
+ * This is the combination of enter_from_user_mode() and
+ * syscall_enter_from_user_mode_work() to be used when there is no
+ * architecture specific work to be done between the two.
  *
  * Returns: The original or a modified syscall number. See
  * syscall_enter_from_user_mode_work() for further explanation.
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 66e6ba7fa80c..940a597ded40 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
 	return ret ? : syscall;
 }
 
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
-	enter_from_user_mode(regs);
-	instrumentation_begin();
-	local_irq_enable();
-	instrumentation_end();
-}
-
 /*
  * If SYSCALL_EMU is set, then the only reason to report is when
  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
-- 
cgit v1.2.3


From 7702a9c2856794b6bf961b408eba3bacb753bd5b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:40 +0100
Subject: entry: Inline irqentry_enter/exit_from/to_user_mode()

There is no point in having this as a function which just inlines
enter_from_user_mode(). The function call overhead is larger than the
function itself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.715309918@linutronix.de
---
 include/linux/irq-entry-common.h | 13 +++++++++++--
 kernel/entry/common.c            | 13 -------------
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 9b1f386ffeb1..83c9d841d9e1 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -278,7 +278,10 @@ static __always_inline void exit_to_user_mode(void)
  *
  * The function establishes state (lockdep, RCU (context tracking), tracing)
  */
-void irqentry_enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+	enter_from_user_mode(regs);
+}
 
 /**
  * irqentry_exit_to_user_mode - Interrupt exit work
@@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs);
  * Interrupt exit is not invoking #1 which is the syscall specific one time
  * work.
  */
-void irqentry_exit_to_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+	instrumentation_begin();
+	exit_to_user_mode_prepare(regs);
+	instrumentation_end();
+	exit_to_user_mode();
+}
 
 #ifndef irqentry_state
 /**
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f62e1d1b2063..70a16db4cc0a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -62,19 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	return ti_work;
 }
 
-noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
-{
-	enter_from_user_mode(regs);
-}
-
-noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-{
-	instrumentation_begin();
-	exit_to_user_mode_prepare(regs);
-	instrumentation_end();
-	exit_to_user_mode();
-}
-
 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 {
 	irqentry_state_t ret = {
-- 
cgit v1.2.3


From 4fc9225d19ad6289c03340a520d35e3a6d1aebed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:42 +0100
Subject: sched: Move MM CID related functions to sched.h

There is nothing mm specific in that and including mm.h can cause header
recursion hell.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.778457951@linutronix.de
---
 include/linux/mm.h    | 25 -------------------------
 include/linux/sched.h | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..17cfbba9914c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2401,31 +2401,6 @@ struct zap_details {
 /* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
 #define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
 
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
-	return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
-	/*
-	 * Use the processor id as a fall-back when the mm cid feature is
-	 * disabled. This provides functional per-cpu data structure accesses
-	 * in user-space, althrough it won't provide the memory usage benefits.
-	 */
-	return raw_smp_processor_id();
-}
-#endif
-
 #ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 15627769409d..24a9da7ca3e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2310,6 +2310,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
 #define alloc_tag_restore(_tag, _old)		do {} while (0)
 #endif
 
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+	return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+	/*
+	 * Use the processor id as a fall-back when the mm cid feature is
+	 * disabled. This provides functional per-cpu data structure accesses
+	 * in user-space, although it won't provide the memory usage benefits.
+	 */
+	return task_cpu(t);
+}
+#endif
+
 #ifndef MODULE
 #ifndef COMPILE_OFFSETS
 
-- 
cgit v1.2.3


From 4b7de6df20d43dd651031aef8d818fa5da981dbf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:45 +0100
Subject: rseq: Cache CPU ID and MM CID values

In preparation for rewriting RSEQ exit to user space handling provide
storage to cache the CPU ID and MM CID values which were written to user
space. That prepares for a quick check, which avoids the update when
nothing changed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.841964081@linutronix.de
---
 include/linux/rseq.h        |  7 +++++--
 include/linux/rseq_types.h  | 21 +++++++++++++++++++++
 include/trace/events/rseq.h |  4 ++--
 kernel/rseq.c               |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index ab91b1e6bb4a..d315a92afb36 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_exit(void)
 static inline void rseq_reset(struct task_struct *t)
 {
 	memset(&t->rseq, 0, sizeof(t->rseq));
+	t->rseq.ids.cpu_cid = ~0ULL;
 }
 
 static inline void rseq_execve(struct task_struct *t)
@@ -70,10 +71,12 @@ static inline void rseq_execve(struct task_struct *t)
  */
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 {
-	if (clone_flags & CLONE_VM)
+	if (clone_flags & CLONE_VM) {
 		rseq_reset(t);
-	else
+	} else {
 		t->rseq = current->rseq;
+		t->rseq.ids.cpu_cid = ~0ULL;
+	}
 }
 
 #else /* CONFIG_RSEQ */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index f7a60c8eddc9..40901b033b92 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -30,18 +30,39 @@ struct rseq_event {
 	};
 };
 
+/**
+ * struct rseq_ids - Cache for ids, which need to be updated
+ * @cpu_cid:	Compound of @cpu_id and @mm_cid to make the
+ *		compiler emit a single compare on 64-bit
+ * @cpu_id:	The CPU ID which was written last to user space
+ * @mm_cid:	The MM CID which was written last to user space
+ *
+ * @cpu_id and @mm_cid are updated when the data is written to user space.
+ */
+struct rseq_ids {
+	union {
+		u64		cpu_cid;
+		struct {
+			u32	cpu_id;
+			u32	mm_cid;
+		};
+	};
+};
+
 /**
  * struct rseq_data - Storage for all rseq related data
  * @usrptr:	Pointer to the registered user space RSEQ memory
  * @len:	Length of the RSEQ region
  * @sig:	Signature of critial section abort IPs
  * @event:	Storage for event management
+ * @ids:	Storage for cached CPU ID and MM CID
  */
 struct rseq_data {
 	struct rseq __user		*usrptr;
 	u32				len;
 	u32				sig;
 	struct rseq_event		event;
+	struct rseq_ids			ids;
 };
 
 #else /* CONFIG_RSEQ */
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index 823b47d1ba1e..ce85d650bf4b 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update,
 	),
 
 	TP_fast_assign(
-		__entry->cpu_id = raw_smp_processor_id();
+		__entry->cpu_id = t->rseq.ids.cpu_id;
 		__entry->node_id = cpu_to_node(__entry->cpu_id);
-		__entry->mm_cid = task_mm_cid(t);
+		__entry->mm_cid = t->rseq.ids.mm_cid;
 	),
 
 	TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
diff --git a/kernel/rseq.c b/kernel/rseq.c
index aae62661e6bb..ad1e7cecd527 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -184,6 +184,10 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
 	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
 	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
 
+	/* Cache the user space values */
+	t->rseq.ids.cpu_id = cpu_id;
+	t->rseq.ids.mm_cid = mm_cid;
+
 	/*
 	 * Additional feature fields added after ORIG_RSEQ_SIZE
 	 * need to be conditionally updated only if
-- 
cgit v1.2.3


From 2fc0e4b4126caadfa5772ba69276b350609584dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:48 +0100
Subject: rseq: Record interrupt from user space

For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.

If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.

This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de
---
 include/linux/irq-entry-common.h |  3 ++-
 include/linux/rseq.h             | 16 +++++++++++-----
 include/linux/rseq_entry.h       | 18 ++++++++++++++++++
 include/linux/rseq_types.h       |  2 ++
 4 files changed, 33 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/rseq_entry.h

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 83c9d841d9e1..cb31fb84d7b4 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
 #include <linux/static_call_types.h>
 #include <linux/syscalls.h>
 #include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void)
 static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
+	rseq_note_user_irq_entry();
 }
 
 /**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d315a92afb36..a200836a6fe3 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
 
 static __always_inline void rseq_exit_to_user_mode(void)
 {
-	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
-		if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
-				 current->rseq.event.events))
-			current->rseq.event.events = 0;
-	}
+	struct rseq_event *ev = &current->rseq.event;
+
+	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+		WARN_ON_ONCE(ev->sched_switch);
+
+	/*
+	 * Ensure that event (especially user_irq) is cleared when the
+	 * interrupt did not result in a schedule and therefore the
+	 * rseq processing did not clear it.
+	 */
+	ev->events = 0;
 }
 
 /*
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 000000000000..ce30e87ce1f5
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+		current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 40901b033b92..80f6c398ef0f 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
  * @all:		Compound to initialize and clear the data efficiently
  * @events:		Compound to access events with a single load/store
  * @sched_switch:	True if the task was scheduled out
+ * @user_irq:		True on interrupt entry from user mode
  * @has_rseq:		True if the task has a rseq pointer installed
  */
 struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
 				u16		events;
 				struct {
 					u8	sched_switch;
+					u8	user_irq;
 				};
 			};
 
-- 
cgit v1.2.3


From dab344753e021fe84c24f9d8b0b63cb5bcf463d7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:50 +0100
Subject: rseq: Provide tracepoint wrappers for inline code

Provide tracepoint wrappers for the upcoming RSEQ exit to user space inline
fast path, so that the header can be safely included by code which defines
actual trace points.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.967114316@linutronix.de
---
 include/linux/rseq_entry.h | 28 ++++++++++++++++++++++++++++
 kernel/rseq.c              | 19 ++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index ce30e87ce1f5..5be507a127eb 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -5,6 +5,34 @@
 #ifdef CONFIG_RSEQ
 #include <linux/rseq.h>
 
+#include <linux/tracepoint-defs.h>
+
+#ifdef CONFIG_TRACEPOINTS
+DECLARE_TRACEPOINT(rseq_update);
+DECLARE_TRACEPOINT(rseq_ip_fixup);
+void __rseq_trace_update(struct task_struct *t);
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+			   unsigned long offset, unsigned long abort_ip);
+
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
+{
+	if (tracepoint_enabled(rseq_update) && ids)
+		__rseq_trace_update(t);
+}
+
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+				       unsigned long offset, unsigned long abort_ip)
+{
+	if (tracepoint_enabled(rseq_ip_fixup))
+		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+
+#else /* CONFIG_TRACEPOINTS */
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+				       unsigned long offset, unsigned long abort_ip) { }
+#endif /* !CONFIG_TRACEPOINTS */
+
 static __always_inline void rseq_note_user_irq_entry(void)
 {
 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
diff --git a/kernel/rseq.c b/kernel/rseq.c
index ad1e7cecd527..f49d3118c3e0 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -70,7 +70,7 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
 #include <linux/types.h>
 #include <linux/ratelimit.h>
 #include <asm/ptrace.h>
@@ -91,6 +91,23 @@
 				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
 				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
 
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Out of line, so the actual update functions can be in a header to be
+ * inlined into the exit to user code.
+ */
+void __rseq_trace_update(struct task_struct *t)
+{
+	trace_rseq_update(t);
+}
+
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+			   unsigned long offset, unsigned long abort_ip)
+{
+	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+#endif /* CONFIG_TRACEPOINTS */
+
 #ifdef CONFIG_DEBUG_RSEQ
 static struct rseq *rseq_kernel_fields(struct task_struct *t)
 {
-- 
cgit v1.2.3


From 5412910487d0839111e4f2f3a6f33f6c9af9b007 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:52 +0100
Subject: rseq: Expose lightweight statistics in debugfs

Analyzing the call frequency without actually using tracing is helpful for
analysis of this infrastructure. The overhead is minimal as it just
increments a per CPU counter associated to each operation.

The debugfs readout provides a racy sum of all counters.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de
---
 include/linux/rseq.h       | 16 ----------
 include/linux/rseq_entry.h | 49 ++++++++++++++++++++++++++++
 init/Kconfig               | 12 +++++++
 kernel/rseq.c              | 79 ++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 133 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index a200836a6fe3..7f347c3a4af8 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
 	}
 }
 
-static __always_inline void rseq_exit_to_user_mode(void)
-{
-	struct rseq_event *ev = &current->rseq.event;
-
-	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
-		WARN_ON_ONCE(ev->sched_switch);
-
-	/*
-	 * Ensure that event (especially user_irq) is cleared when the
-	 * interrupt did not result in a schedule and therefore the
-	 * rseq processing did not clear it.
-	 */
-	ev->events = 0;
-}
-
 /*
  * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
  * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
@@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
 static inline void rseq_execve(struct task_struct *t) { }
-static inline void rseq_exit_to_user_mode(void) { }
 #endif  /* !CONFIG_RSEQ */
 
 #ifdef CONFIG_DEBUG_RSEQ
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 5be507a127eb..ff9080b89be3 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -2,6 +2,37 @@
 #ifndef _LINUX_RSEQ_ENTRY_H
 #define _LINUX_RSEQ_ENTRY_H
 
+/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
+#ifdef CONFIG_RSEQ_STATS
+#include <linux/percpu.h>
+
+struct rseq_stats {
+	unsigned long	exit;
+	unsigned long	signal;
+	unsigned long	slowpath;
+	unsigned long	ids;
+	unsigned long	cs;
+	unsigned long	clear;
+	unsigned long	fixup;
+};
+
+DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
+
+/*
+ * Slow path has interrupts and preemption enabled, but the fast path
+ * runs with interrupts disabled so there is no point in having the
+ * preemption checks implied in __this_cpu_inc() for every operation.
+ */
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_stat_inc(which)	this_cpu_inc((which))
+#else
+#define rseq_stat_inc(which)	raw_cpu_inc((which))
+#endif
+
+#else /* CONFIG_RSEQ_STATS */
+#define rseq_stat_inc(x)	do { } while (0)
+#endif /* !CONFIG_RSEQ_STATS */
+
 #ifdef CONFIG_RSEQ
 #include <linux/rseq.h>
 
@@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void)
 		current->rseq.event.user_irq = true;
 }
 
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+	struct rseq_event *ev = &current->rseq.event;
+
+	rseq_stat_inc(rseq_stats.exit);
+
+	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+		WARN_ON_ONCE(ev->sched_switch);
+
+	/*
+	 * Ensure that event (especially user_irq) is cleared when the
+	 * interrupt did not result in a schedule and therefore the
+	 * rseq processing did not clear it.
+	 */
+	ev->events = 0;
+}
+
 #else /* CONFIG_RSEQ */
 static inline void rseq_note_user_irq_entry(void) { }
+static inline void rseq_exit_to_user_mode(void) { }
 #endif /* !CONFIG_RSEQ */
 
 #endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/init/Kconfig b/init/Kconfig
index cab3ad28ca49..f39fdfb28797 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1913,6 +1913,18 @@ config RSEQ
 
 	  If unsure, say Y.
 
+config RSEQ_STATS
+	default n
+	bool "Enable lightweight statistics of restartable sequences" if EXPERT
+	depends on RSEQ && DEBUG_FS
+	help
+	  Enable lightweight counters which expose information about the
+	  frequency of RSEQ operations via debugfs. Mostly interesting for
+	  kernel debugging or performance analysis. While lightweight it's
+	  still adding code into the user/kernel mode transitions.
+
+	  If unsure, say N.
+
 config DEBUG_RSEQ
 	default n
 	bool "Enable debugging of rseq() system call" if EXPERT
diff --git a/kernel/rseq.c b/kernel/rseq.c
index f49d3118c3e0..c0dbe2ed26a8 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -67,12 +67,16 @@
  *   F1. <failure>
  */
 
+/* Required to select the proper per_cpu ops for rseq_stats_inc() */
+#define RSEQ_BUILD_SLOW_PATH
+
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
 #include <linux/sched.h>
-#include <linux/uaccess.h>
 #include <linux/syscalls.h>
-#include <linux/rseq_entry.h>
+#include <linux/uaccess.h>
 #include <linux/types.h>
-#include <linux/ratelimit.h>
 #include <asm/ptrace.h>
 
 #define CREATE_TRACE_POINTS
@@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
 }
 #endif /* CONFIG_TRACEPOINTS */
 
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+	struct rseq_stats stats = { };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
+		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
+		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
+		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
+		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
+		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
+		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
+	}
+
+	seq_printf(m, "exit:   %16lu\n", stats.exit);
+	seq_printf(m, "signal: %16lu\n", stats.signal);
+	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
+	seq_printf(m, "ids:    %16lu\n", stats.ids);
+	seq_printf(m, "cs:     %16lu\n", stats.cs);
+	seq_printf(m, "clear:  %16lu\n", stats.clear);
+	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
+	return 0;
+}
+
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rseq_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+	.open		= rseq_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
+
+	debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
+	return 0;
+}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_RSEQ_STATS */
+
 #ifdef CONFIG_DEBUG_RSEQ
 static struct rseq *rseq_kernel_fields(struct task_struct *t)
 {
@@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
 	u32 node_id = cpu_to_node(cpu_id);
 	u32 mm_cid = task_mm_cid(t);
 
-	/*
-	 * Validate read-only rseq fields.
-	 */
+	rseq_stat_inc(rseq_stats.ids);
+
+	/* Validate read-only rseq fields on debug kernels */
 	if (rseq_validate_ro_fields(t))
 		goto efault;
 	WARN_ON_ONCE((int) mm_cid < 0);
+
 	if (!user_write_access_begin(rseq, t->rseq.len))
 		goto efault;
 
@@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 	struct rseq_cs rseq_cs;
 	int ret;
 
+	rseq_stat_inc(rseq_stats.cs);
+
 	ret = rseq_get_rseq_cs(t, &rseq_cs);
 	if (ret)
 		return ret;
@@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 	 * If not nested over a rseq critical section, restart is useless.
 	 * Clear the rseq_cs pointer and return.
 	 */
-	if (!in_rseq_cs(ip, &rseq_cs))
+	if (!in_rseq_cs(ip, &rseq_cs)) {
+		rseq_stat_inc(rseq_stats.clear);
 		return clear_rseq_cs(t->rseq.usrptr);
+	}
 	ret = rseq_check_flags(t, rseq_cs.flags);
 	if (ret < 0)
 		return ret;
@@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 	ret = clear_rseq_cs(t->rseq.usrptr);
 	if (ret)
 		return ret;
+	rseq_stat_inc(rseq_stats.fixup);
 	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
 			    rseq_cs.abort_ip);
 	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
@@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	if (unlikely(t->flags & PF_EXITING))
 		return;
 
+	if (ksig)
+		rseq_stat_inc(rseq_stats.signal);
+	else
+		rseq_stat_inc(rseq_stats.slowpath);
+
 	/*
 	 * Read and clear the event pending bit first. If the task
 	 * was not preempted or migrated or a signal is on the way,
-- 
cgit v1.2.3


From 9c37cb6e80b8fcdddc1236ba42ffd438f511192b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:55 +0100
Subject: rseq: Provide static branch for runtime debugging

Config based debug is rarely turned on and is not available easily when
things go wrong.

Provide a static branch to allow permanent integration of debug mechanisms
along with the usual toggles in Kconfig, command line and debugfs.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.089270547@linutronix.de
---
 Documentation/admin-guide/kernel-parameters.txt |  4 ++
 include/linux/rseq_entry.h                      |  3 +
 init/Kconfig                                    | 14 +++++
 kernel/rseq.c                                   | 73 +++++++++++++++++++++++--
 4 files changed, 90 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6c42061ca20e..e63827475792 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6500,6 +6500,10 @@
 			Memory area to be used by remote processor image,
 			managed by CMA.
 
+	rseq_debug=	[KNL] Enable or disable restartable sequence
+			debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE.
+			Format: <bool>
+
 	rt_group_sched=	[KNL] Enable or disable SCHED_RR/FIFO group scheduling
 			when CONFIG_RT_GROUP_SCHED=y. Defaults to
 			!CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index ff9080b89be3..ed8e5f89499b 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -34,6 +34,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
 #endif /* !CONFIG_RSEQ_STATS */
 
 #ifdef CONFIG_RSEQ
+#include <linux/jump_label.h>
 #include <linux/rseq.h>
 
 #include <linux/tracepoint-defs.h>
@@ -64,6 +65,8 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
 				       unsigned long offset, unsigned long abort_ip) { }
 #endif /* !CONFIG_TRACEPOINT */
 
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
 static __always_inline void rseq_note_user_irq_entry(void)
 {
 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
diff --git a/init/Kconfig b/init/Kconfig
index f39fdfb28797..bde40ab664e2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1925,10 +1925,24 @@ config RSEQ_STATS
 
 	  If unsure, say N.
 
+config RSEQ_DEBUG_DEFAULT_ENABLE
+	default n
+	bool "Enable restartable sequences debug mode by default" if EXPERT
+	depends on RSEQ
+	help
+	  This enables the static branch for debug mode of restartable
+	  sequences.
+
+	  This also can be controlled on the kernel command line via the
+	  command line parameter "rseq_debug=0/1" and through debugfs.
+
+	  If unsure, say N.
+
 config DEBUG_RSEQ
 	default n
 	bool "Enable debugging of rseq() system call" if EXPERT
 	depends on RSEQ && DEBUG_KERNEL
+	select RSEQ_DEBUG_DEFAULT_ENABLE
 	help
 	  Enable extra debugging checks for the rseq system call.
 
diff --git a/kernel/rseq.c b/kernel/rseq.c
index c0dbe2ed26a8..679ab8ebdfd3 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -95,6 +95,27 @@
 				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
 				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
 
+DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
+static inline void rseq_control_debug(bool on)
+{
+	if (on)
+		static_branch_enable(&rseq_debug_enabled);
+	else
+		static_branch_disable(&rseq_debug_enabled);
+}
+
+static int __init rseq_setup_debug(char *str)
+{
+	bool on;
+
+	if (kstrtobool(str, &on))
+		return -EINVAL;
+	rseq_control_debug(on);
+	return 1;
+}
+__setup("rseq_debug=", rseq_setup_debug);
+
 #ifdef CONFIG_TRACEPOINTS
 /*
  * Out of line, so the actual update functions can be in a header to be
@@ -112,10 +133,11 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
 }
 #endif /* CONFIG_TRACEPOINTS */
 
+#ifdef CONFIG_DEBUG_FS
 #ifdef CONFIG_RSEQ_STATS
 DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
 
-static int rseq_debug_show(struct seq_file *m, void *p)
+static int rseq_stats_show(struct seq_file *m, void *p)
 {
 	struct rseq_stats stats = { };
 	unsigned int cpu;
@@ -140,14 +162,56 @@ static int rseq_debug_show(struct seq_file *m, void *p)
 	return 0;
 }
 
+static int rseq_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rseq_stats_show, inode->i_private);
+}
+
+static const struct file_operations stat_ops = {
+	.open		= rseq_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init rseq_stats_init(struct dentry *root_dir)
+{
+	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
+	return 0;
+}
+#else
+static inline void rseq_stats_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_STATS */
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+	bool on = static_branch_unlikely(&rseq_debug_enabled);
+
+	seq_printf(m, "%d\n", on);
+	return 0;
+}
+
+static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
+			    size_t count, loff_t *ppos)
+{
+	bool on;
+
+	if (kstrtobool_from_user(ubuf, count, &on))
+		return -EINVAL;
+
+	rseq_control_debug(on);
+	return count;
+}
+
 static int rseq_debug_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, rseq_debug_show, inode->i_private);
 }
 
-static const struct file_operations dfs_ops = {
+static const struct file_operations debug_ops = {
 	.open		= rseq_debug_open,
 	.read		= seq_read,
+	.write		= rseq_debug_write,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
@@ -156,11 +220,12 @@ static int __init rseq_debugfs_init(void)
 {
 	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
 
-	debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
+	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
+	rseq_stats_init(root_dir);
 	return 0;
 }
 __initcall(rseq_debugfs_init);
-#endif /* CONFIG_RSEQ_STATS */
+#endif /* CONFIG_DEBUG_FS */
 
 #ifdef CONFIG_DEBUG_RSEQ
 static struct rseq *rseq_kernel_fields(struct task_struct *t)
-- 
cgit v1.2.3


From abc850e7616c91ebaa3f5ba3617ab0a104d45039 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:44:57 +0100
Subject: rseq: Provide and use rseq_update_user_cs()

Provide a straightforward implementation to check for and eventually
clear/fixup critical sections in user space.

The non-debug version does only the minimal sanity checks and aims for
efficiency.

There are two attack vectors, which are checked for:

  1) An abort IP which is in the kernel address space. That would cause at
     least x86 to return to kernel space via IRET.

  2) A rogue critical section descriptor with an abort IP pointing to some
     arbitrary address, which is not preceded by the RSEQ signature.

If the section descriptors are invalid then the resulting misbehaviour of
the user space application is not the kernel's problem.

The kernel provides a run-time switchable debug slow path, which implements
the full zoo of checks including termination of the task when one of the
gazillion conditions is not met.

Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME
handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will
be replaced and removed in a subsequent step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de
---
 include/linux/rseq_entry.h | 206 +++++++++++++++++++++++++++++++++++++
 include/linux/rseq_types.h |  11 +-
 kernel/rseq.c              | 246 ++++++++++++++-------------------------------
 3 files changed, 291 insertions(+), 172 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index ed8e5f89499b..f9510ce72211 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
 #ifdef CONFIG_RSEQ
 #include <linux/jump_label.h>
 #include <linux/rseq.h>
+#include <linux/uaccess.h>
 
 #include <linux/tracepoint-defs.h>
 
@@ -67,12 +68,217 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
 
 DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_inline
+#else
+#define rseq_inline __always_inline
+#endif
+
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+
 static __always_inline void rseq_note_user_irq_entry(void)
 {
 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
 		current->rseq.event.user_irq = true;
 }
 
+/*
+ * Check whether there is a valid critical section and whether the
+ * instruction pointer in @regs is inside the critical section.
+ *
+ *  - If the critical section is invalid, terminate the task.
+ *
+ *  - If valid and the instruction pointer is inside, set it to the abort IP.
+ *
+ *  - If valid and the instruction pointer is outside, clear the critical
+ *    section address.
+ *
+ * Returns true, if the section was valid and either fixup or clear was
+ * done, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when a invalid
+ * section was found. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+/*
+ * The debug version is put out of line, but kept here so the code stays
+ * together.
+ *
+ * @csaddr has already been checked by the caller to be in user space
+ */
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
+			       unsigned long csaddr)
+{
+	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
+	unsigned long ip = instruction_pointer(regs);
+	u64 __user *uc_head = (u64 __user *) ucs;
+	u32 usig, __user *uc_sig;
+
+	scoped_user_rw_access(ucs, efault) {
+		/*
+		 * Evaluate the user pile and exit if one of the conditions
+		 * is not fulfilled.
+		 */
+		unsafe_get_user(start_ip, &ucs->start_ip, efault);
+		if (unlikely(start_ip >= tasksize))
+			goto die;
+		/* If outside, just clear the critical section. */
+		if (ip < start_ip)
+			goto clear;
+
+		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+		cs_end = start_ip + offset;
+		/* Check for overflow and wraparound */
+		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
+			goto die;
+
+		/* If not inside, clear it. */
+		if (ip >= cs_end)
+			goto clear;
+
+		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+		/* Ensure it's "valid" */
+		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+			goto die;
+		/* Validate that the abort IP is not in the critical section */
+		if (unlikely(abort_ip - start_ip < offset))
+			goto die;
+
+		/*
+		 * Check version and flags for 0. No point in emitting
+		 * deprecated warnings before dying. That could be done in
+		 * the slow path eventually, but *shrug*.
+		 */
+		unsafe_get_user(head, uc_head, efault);
+		if (unlikely(head))
+			goto die;
+
+		/* abort_ip - 4 is >= 0. See abort_ip check above */
+		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+		unsafe_get_user(usig, uc_sig, efault);
+		if (unlikely(usig != t->rseq.sig))
+			goto die;
+
+		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
+		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+			/* If not in interrupt from user context, let it die */
+			if (unlikely(!t->rseq.event.user_irq))
+				goto die;
+		}
+		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+		instruction_pointer_set(regs, (unsigned long)abort_ip);
+		rseq_stat_inc(rseq_stats.fixup);
+		break;
+	clear:
+		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+		rseq_stat_inc(rseq_stats.clear);
+		abort_ip = 0ULL;
+	}
+
+	if (unlikely(abort_ip))
+		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+	return true;
+die:
+	t->rseq.event.fatal = true;
+efault:
+	return false;
+}
+
+#endif /* RSEQ_BUILD_SLOW_PATH */
+
+/*
+ * This only ensures that abort_ip is in the user address space and
+ * validates that it is preceded by the signature.
+ *
+ * No other sanity checks are done here, that's what the debug code is for.
+ */
+static rseq_inline bool
+rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+	unsigned long ip = instruction_pointer(regs);
+	u64 start_ip, abort_ip, offset;
+	u32 usig, __user *uc_sig;
+
+	rseq_stat_inc(rseq_stats.cs);
+
+	if (unlikely(csaddr >= TASK_SIZE)) {
+		t->rseq.event.fatal = true;
+		return false;
+	}
+
+	if (static_branch_unlikely(&rseq_debug_enabled))
+		return rseq_debug_update_user_cs(t, regs, csaddr);
+
+	scoped_user_rw_access(ucs, efault) {
+		unsafe_get_user(start_ip, &ucs->start_ip, efault);
+		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+
+		/*
+		 * No sanity checks. If user space screwed it up, it can
+		 * keep the pieces. That's what debug code is for.
+		 *
+		 * If outside, just clear the critical section.
+		 */
+		if (ip - start_ip >= offset)
+			goto clear;
+
+		/*
+		 * Two requirements for @abort_ip:
+		 *   - Must be in user space as x86 IRET would happily return to
+		 *     the kernel.
+		 *   - The four bytes preceding the instruction at @abort_ip must
+		 *     contain the signature.
+		 *
+		 * The latter protects against the following attack vector:
+		 *
+		 * An attacker with limited abilities to write, creates a critical
+		 * section descriptor, sets the abort IP to a library function or
+		 * some other ROP gadget and stores the address of the descriptor
+		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
+		 * protection.
+		 */
+		if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig))
+			goto die;
+
+		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
+		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+		unsafe_get_user(usig, uc_sig, efault);
+		if (unlikely(usig != t->rseq.sig))
+			goto die;
+
+		/* Invalidate the critical section */
+		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+		/* Update the instruction pointer */
+		instruction_pointer_set(regs, (unsigned long)abort_ip);
+		rseq_stat_inc(rseq_stats.fixup);
+		break;
+	clear:
+		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+		rseq_stat_inc(rseq_stats.clear);
+		abort_ip = 0ULL;
+	}
+
+	if (unlikely(abort_ip))
+		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+	return true;
+die:
+	t->rseq.event.fatal = true;
+efault:
+	return false;
+}
+
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 80f6c398ef0f..7c123947bb98 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -14,10 +14,12 @@ struct rseq;
  * @sched_switch:	True if the task was scheduled out
  * @user_irq:		True on interrupt entry from user mode
  * @has_rseq:		True if the task has a rseq pointer installed
+ * @error:		Compound error code for the slow path to analyze
+ * @fatal:		User space data corrupted or invalid
  */
 struct rseq_event {
 	union {
-		u32				all;
+		u64				all;
 		struct {
 			union {
 				u16		events;
@@ -28,6 +30,13 @@ struct rseq_event {
 			};
 
 			u8			has_rseq;
+			u8			__pad;
+			union {
+				u16		error;
+				struct {
+					u8	fatal;
+				};
+			};
 		};
 	};
 };
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 679ab8ebdfd3..12a9b6ab2cef 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -382,175 +382,18 @@ efault:
 	return -EFAULT;
 }
 
-/*
- * Get the user-space pointer value stored in the 'rseq_cs' field.
- */
-static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
-{
-	if (!rseq_cs)
-		return -EFAULT;
-
-#ifdef CONFIG_64BIT
-	if (get_user(*rseq_cs, &rseq->rseq_cs))
-		return -EFAULT;
-#else
-	if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
-		return -EFAULT;
-#endif
-
-	return 0;
-}
-
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
-{
-	struct rseq_cs __user *urseq_cs;
-	u64 ptr;
-	u32 __user *usig;
-	u32 sig;
-	int ret;
-
-	ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr);
-	if (ret)
-		return ret;
-
-	/* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
-	if (!ptr) {
-		memset(rseq_cs, 0, sizeof(*rseq_cs));
-		return 0;
-	}
-	/* Check that the pointer value fits in the user-space process space. */
-	if (ptr >= TASK_SIZE)
-		return -EINVAL;
-	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
-	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
-		return -EFAULT;
-
-	if (rseq_cs->start_ip >= TASK_SIZE ||
-	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
-	    rseq_cs->abort_ip >= TASK_SIZE ||
-	    rseq_cs->version > 0)
-		return -EINVAL;
-	/* Check for overflow. */
-	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
-		return -EINVAL;
-	/* Ensure that abort_ip is not in the critical section. */
-	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
-		return -EINVAL;
-
-	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
-	ret = get_user(sig, usig);
-	if (ret)
-		return ret;
-
-	if (current->rseq.sig != sig) {
-		printk_ratelimited(KERN_WARNING
-			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
-			sig, current->rseq.sig, current->pid, usig);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static bool rseq_warn_flags(const char *str, u32 flags)
-{
-	u32 test_flags;
-
-	if (!flags)
-		return false;
-	test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
-	if (test_flags)
-		pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
-	test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
-	if (test_flags)
-		pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
-	return true;
-}
-
-static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
-{
-	u32 flags;
-	int ret;
-
-	if (rseq_warn_flags("rseq_cs", cs_flags))
-		return -EINVAL;
-
-	/* Get thread flags. */
-	ret = get_user(flags, &t->rseq.usrptr->flags);
-	if (ret)
-		return ret;
-
-	if (rseq_warn_flags("rseq", flags))
-		return -EINVAL;
-	return 0;
-}
-
-static int clear_rseq_cs(struct rseq __user *rseq)
-{
-	/*
-	 * The rseq_cs field is set to NULL on preemption or signal
-	 * delivery on top of rseq assembly block, as well as on top
-	 * of code outside of the rseq assembly block. This performs
-	 * a lazy clear of the rseq_cs field.
-	 *
-	 * Set rseq_cs to NULL.
-	 */
-#ifdef CONFIG_64BIT
-	return put_user(0UL, &rseq->rseq_cs);
-#else
-	if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
-		return -EFAULT;
-	return 0;
-#endif
-}
-
-/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
-{
-	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
-}
-
-static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
 {
-	unsigned long ip = instruction_pointer(regs);
-	struct task_struct *t = current;
-	struct rseq_cs rseq_cs;
-	int ret;
-
-	rseq_stat_inc(rseq_stats.cs);
-
-	ret = rseq_get_rseq_cs(t, &rseq_cs);
-	if (ret)
-		return ret;
-
-	/*
-	 * Handle potentially not being within a critical section.
-	 * If not nested over a rseq critical section, restart is useless.
-	 * Clear the rseq_cs pointer and return.
-	 */
-	if (!in_rseq_cs(ip, &rseq_cs)) {
-		rseq_stat_inc(rseq_stats.clear);
-		return clear_rseq_cs(t->rseq.usrptr);
-	}
-	ret = rseq_check_flags(t, rseq_cs.flags);
-	if (ret < 0)
-		return ret;
-	if (!abort)
-		return 0;
-	ret = clear_rseq_cs(t->rseq.usrptr);
-	if (ret)
-		return ret;
-	rseq_stat_inc(rseq_stats.fixup);
-	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
-			    rseq_cs.abort_ip);
-	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
-	return 0;
+	struct rseq __user *urseq = t->rseq.usrptr;
+	u64 csaddr;
+
+	scoped_user_read_access(urseq, efault)
+		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
+	if (likely(!csaddr))
+		return true;
+	return rseq_update_user_cs(t, regs, csaddr);
+efault:
+	return false;
 }
 
 /*
@@ -567,8 +410,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 {
 	struct task_struct *t = current;
-	int ret, sig;
 	bool event;
+	int sig;
 
 	/*
 	 * If invoked from hypervisors before entering the guest via
@@ -618,8 +461,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
 		return;
 
-	ret = rseq_ip_fixup(regs, event);
-	if (unlikely(ret < 0))
+	if (!rseq_handle_cs(t, regs))
 		goto error;
 
 	if (unlikely(rseq_update_cpu_node_id(t)))
@@ -632,6 +474,68 @@ error:
 }
 
 #ifdef CONFIG_DEBUG_RSEQ
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+	struct rseq __user *urseq = t->rseq.usrptr;
+	struct rseq_cs __user *urseq_cs;
+	u32 __user *usig;
+	u64 ptr;
+	u32 sig;
+	int ret;
+
+	if (get_user(ptr, &rseq->rseq_cs))
+		return -EFAULT;
+
+	/* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
+	if (!ptr) {
+		memset(rseq_cs, 0, sizeof(*rseq_cs));
+		return 0;
+	}
+	/* Check that the pointer value fits in the user-space process space. */
+	if (ptr >= TASK_SIZE)
+		return -EINVAL;
+	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
+	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+		return -EFAULT;
+
+	if (rseq_cs->start_ip >= TASK_SIZE ||
+	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+	    rseq_cs->abort_ip >= TASK_SIZE ||
+	    rseq_cs->version > 0)
+		return -EINVAL;
+	/* Check for overflow. */
+	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+		return -EINVAL;
+	/* Ensure that abort_ip is not in the critical section. */
+	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+		return -EINVAL;
+
+	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
+	ret = get_user(sig, usig);
+	if (ret)
+		return ret;
+
+	if (current->rseq.sig != sig) {
+		printk_ratelimited(KERN_WARNING
+			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+			sig, current->rseq.sig, current->pid, usig);
+		return -EINVAL;
+	}
+	return 0;
+}
 
 /*
  * Terminate the process if a syscall is issued within a restartable
-- 
cgit v1.2.3


From c1cbad8f99b5c73c6af6e96acbfa64eaaaeb085f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:02 +0100
Subject: rseq: Make exit debugging static branch based

Disconnect it from the config switch and use the static debug branch. This
is a temporary measure for validating the rework. At the end this check
needs to be hidden behind lockdep as it has nothing to do with the other
debug infrastructure, which mainly aids user space debugging by enabling a
zoo of checks which terminate misbehaving tasks instead of letting them
keep the hard to diagnose pieces.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.272660745@linutronix.de
---
 include/linux/rseq_entry.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index f9510ce72211..5bdcf5b5f595 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -285,7 +285,7 @@ static __always_inline void rseq_exit_to_user_mode(void)
 
 	rseq_stat_inc(rseq_stats.exit);
 
-	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+	if (static_branch_unlikely(&rseq_debug_enabled))
 		WARN_ON_ONCE(ev->sched_switch);
 
 	/*
-- 
cgit v1.2.3


From eaa9088d568c84afd72fa32dbe01833aef861d0d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:05 +0100
Subject: rseq: Use static branch for syscall exit debug when
 GENERIC_IRQ_ENTRY=y

Make the syscall exit debug mechanism available via the static branch on
architectures which utilize the generic entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.333440475@linutronix.de
---
 include/linux/entry-common.h |  2 +-
 include/linux/rseq_entry.h   |  9 +++++++++
 kernel/rseq.c                | 10 ++++++++--
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 75b194c34e18..d967184ae08f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -146,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 			local_irq_enable();
 	}
 
-	rseq_syscall(regs);
+	rseq_debug_syscall_return(regs);
 
 	/*
 	 * Do one-time syscall specific work. If these work items are
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 5bdcf5b5f595..fb53a6ff05d7 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -296,9 +296,18 @@ static __always_inline void rseq_exit_to_user_mode(void)
 	ev->events = 0;
 }
 
+void __rseq_debug_syscall_return(struct pt_regs *regs);
+
+static inline void rseq_debug_syscall_return(struct pt_regs *regs)
+{
+	if (static_branch_unlikely(&rseq_debug_enabled))
+		__rseq_debug_syscall_return(regs);
+}
+
 #else /* CONFIG_RSEQ */
 static inline void rseq_note_user_irq_entry(void) { }
 static inline void rseq_exit_to_user_mode(void) { }
+static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
 #endif /* !CONFIG_RSEQ */
 
 #endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index abd6bfadbcc6..97631554ae96 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -473,12 +473,11 @@ error:
 	force_sigsegv(sig);
 }
 
-#ifdef CONFIG_DEBUG_RSEQ
 /*
  * Terminate the process if a syscall is issued within a restartable
  * sequence.
  */
-void rseq_syscall(struct pt_regs *regs)
+void __rseq_debug_syscall_return(struct pt_regs *regs)
 {
 	struct task_struct *t = current;
 	u64 csaddr;
@@ -496,6 +495,13 @@ void rseq_syscall(struct pt_regs *regs)
 fail:
 	force_sig(SIGSEGV);
 }
+
+#ifdef CONFIG_DEBUG_RSEQ
+/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
+void rseq_syscall(struct pt_regs *regs)
+{
+	__rseq_debug_syscall_return(regs);
+}
 #endif
 
 /*
-- 
cgit v1.2.3


From 0f085b41880e3140efa6941ff2b8fd43bac4d659 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:08 +0100
Subject: rseq: Provide and use rseq_set_ids()

Provide a new and straightforward implementation to set the IDs (CPU ID,
Node ID and MM CID), which can later be inlined into the fast path.

It does all operations in one scoped_user_rw_access() section and also
retrieves the critical section member (rseq::rseq_cs) from user space to
avoid another user..begin/end() pair. This is in preparation for optimizing
the fast path to avoid extra work when not required.

On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and
node and MM CID to zero. That's the same as the kernel internal reset
values. That makes the debug validation in the exit code work correctly on
the first exit to user space.

Use it to replace the whole related zoo in rseq.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de
---
 fs/binfmt_elf.c            |   2 +-
 include/linux/rseq.h       |  16 ++-
 include/linux/rseq_entry.h |  89 +++++++++++++++++
 include/linux/sched.h      |  10 --
 kernel/rseq.c              | 236 ++++++++++-----------------------------------
 5 files changed, 151 insertions(+), 202 deletions(-)

(limited to 'include')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e4653bb99946..3eb734c192e9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,7 @@
 #include <linux/cred.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 7f347c3a4af8..92f9cd49489b 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -5,6 +5,8 @@
 #ifdef CONFIG_RSEQ
 #include <linux/sched.h>
 
+#include <uapi/linux/rseq.h>
+
 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
@@ -48,7 +50,7 @@ static inline void rseq_virt_userspace_exit(void)
 static inline void rseq_reset(struct task_struct *t)
 {
 	memset(&t->rseq, 0, sizeof(t->rseq));
-	t->rseq.ids.cpu_cid = ~0ULL;
+	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
 }
 
 static inline void rseq_execve(struct task_struct *t)
@@ -59,15 +61,19 @@ static inline void rseq_execve(struct task_struct *t)
 /*
  * If parent process has a registered restartable sequences area, the
  * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ *
+ * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
+ * on the COW page on exit to user space, when the child stays on the same
+ * CPU as the parent. That's obviously not guaranteed, but in overcommit
+ * scenarios it is more likely and optimizes for the fork/exec case without
+ * taking the fault.
  */
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 {
-	if (clone_flags & CLONE_VM) {
+	if (clone_flags & CLONE_VM)
 		rseq_reset(t);
-	} else {
+	else
 		t->rseq = current->rseq;
-		t->rseq.ids.cpu_cid = ~0ULL;
-	}
 }
 
 #else /* CONFIG_RSEQ */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index fb53a6ff05d7..37444e80fd45 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -75,6 +75,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 #endif
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
 
 static __always_inline void rseq_note_user_irq_entry(void)
 {
@@ -194,6 +195,43 @@ efault:
 	return false;
 }
 
+/*
+ * On debug kernels validate that user space did not mess with it if the
+ * debug branch is enabled.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+	struct rseq __user *rseq = t->rseq.usrptr;
+	u32 cpu_id, uval, node_id;
+
+	/*
+	 * On the first exit after registering the rseq region CPU ID is
+	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
+	 */
+	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
+		  cpu_to_node(t->rseq.ids.cpu_id) : 0;
+
+	scoped_user_read_access(rseq, efault) {
+		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+		if (cpu_id != t->rseq.ids.cpu_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->cpu_id, efault);
+		if (uval != cpu_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->node_id, efault);
+		if (uval != node_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->mm_cid, efault);
+		if (uval != t->rseq.ids.mm_cid)
+			goto die;
+	}
+	return true;
+die:
+	t->rseq.event.fatal = true;
+efault:
+	return false;
+}
+
 #endif /* RSEQ_BUILD_SLOW_PATH */
 
 /*
@@ -279,6 +317,57 @@ efault:
 	return false;
 }
 
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address, when @csaddr != NULL. This allows to put the ID update and the
+ * read under the same uaccess region to spare a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize the read out. Spares a duplicate implementation.
+ *
+ * Returns true, if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+			     u32 node_id, u64 *csaddr)
+{
+	struct rseq __user *rseq = t->rseq.usrptr;
+
+	if (static_branch_unlikely(&rseq_debug_enabled)) {
+		if (!rseq_debug_validate_ids(t))
+			return false;
+	}
+
+	scoped_user_rw_access(rseq, efault) {
+		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+		unsafe_put_user(node_id, &rseq->node_id, efault);
+		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+		if (csaddr)
+			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+	}
+
+	/* Cache the new values */
+	t->rseq.ids.cpu_cid = ids->cpu_cid;
+	rseq_stat_inc(rseq_stats.ids);
+	rseq_trace_update(t, ids);
+	return true;
+efault:
+	return false;
+}
+
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 24a9da7ca3e7..e47abc8685d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -42,7 +42,6 @@
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
 #include <linux/rseq_types.h>
-#include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
@@ -1408,15 +1407,6 @@ struct task_struct {
 #endif /* CONFIG_NUMA_BALANCING */
 
 	struct rseq_data		rseq;
-#ifdef CONFIG_DEBUG_RSEQ
-	/*
-	 * This is a place holder to save a copy of the rseq fields for
-	 * validation of read-only fields. The struct rseq has a
-	 * variable-length array at the end, so it cannot be used
-	 * directly. Reserve a size large enough for the known fields.
-	 */
-	char				rseq_fields[sizeof(struct rseq)];
-#endif
 
 #ifdef CONFIG_SCHED_MM_CID
 	int				mm_cid;		/* Current cid in mm */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97631554ae96..1e4f1d2cdfe5 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -88,13 +88,6 @@
 # define RSEQ_EVENT_GUARD	preempt
 #endif
 
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE		32
-
-#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
-				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
-				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 
 static inline void rseq_control_debug(bool on)
@@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void)
 __initcall(rseq_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
-	return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
-	static DEFINE_RATELIMIT_STATE(_rs,
-				      DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
-	u32 cpu_id_start, cpu_id, node_id, mm_cid;
-	struct rseq __user *rseq = t->rseq.usrptr;
-
-	/*
-	 * Validate fields which are required to be read-only by
-	 * user-space.
-	 */
-	if (!user_read_access_begin(rseq, t->rseq.len))
-		goto efault;
-	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
-	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
-	unsafe_get_user(node_id, &rseq->node_id, efault_end);
-	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
-	user_read_access_end();
-
-	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
-	    cpu_id != rseq_kernel_fields(t)->cpu_id ||
-	    node_id != rseq_kernel_fields(t)->node_id ||
-	    mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
-		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
-			"\tcpu_id_start: %u ?= %u\n"
-			"\tcpu_id:       %u ?= %u\n"
-			"\tnode_id:      %u ?= %u\n"
-			"\tmm_cid:       %u ?= %u\n",
-			t->pid, t->comm,
-			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
-			cpu_id, rseq_kernel_fields(t)->cpu_id,
-			node_id, rseq_kernel_fields(t)->node_id,
-			mm_cid, rseq_kernel_fields(t)->mm_cid);
-	}
-
-	/* For now, only print a console warning on mismatch. */
-	return 0;
-
-efault_end:
-	user_read_access_end();
-efault:
-	return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label)			\
-	do {									\
-		unsafe_put_user(value, &t->rseq.usrptr->field, error_label);	\
-		rseq_kernel_fields(t)->field = value;				\
-	} while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
-	return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label)		\
-	unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
-#endif
-
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
-	struct rseq __user *rseq = t->rseq.usrptr;
-	u32 cpu_id = raw_smp_processor_id();
-	u32 node_id = cpu_to_node(cpu_id);
-	u32 mm_cid = task_mm_cid(t);
-
-	rseq_stat_inc(rseq_stats.ids);
-
-	/* Validate read-only rseq fields on debug kernels */
-	if (rseq_validate_ro_fields(t))
-		goto efault;
-	WARN_ON_ONCE((int) mm_cid < 0);
-
-	if (!user_write_access_begin(rseq, t->rseq.len))
-		goto efault;
-
-	rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
-	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
-	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
-	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
-	/* Cache the user space values */
-	t->rseq.ids.cpu_id = cpu_id;
-	t->rseq.ids.mm_cid = mm_cid;
-
-	/*
-	 * Additional feature fields added after ORIG_RSEQ_SIZE
-	 * need to be conditionally updated only if
-	 * t->rseq_len != ORIG_RSEQ_SIZE.
-	 */
-	user_write_access_end();
-	trace_rseq_update(t);
-	return 0;
-
-efault_end:
-	user_write_access_end();
-efault:
-	return -EFAULT;
-}
-
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
 {
-	struct rseq __user *rseq = t->rseq.usrptr;
-	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
-	    mm_cid = 0;
-
-	/*
-	 * Validate read-only rseq fields.
-	 */
-	if (rseq_validate_ro_fields(t))
-		goto efault;
-
-	if (!user_write_access_begin(rseq, t->rseq.len))
-		goto efault;
-
-	/*
-	 * Reset all fields to their initial state.
-	 *
-	 * All fields have an initial state of 0 except cpu_id which is set to
-	 * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
-	 * unregistration can figure out that rseq needs to be registered
-	 * again.
-	 */
-	rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
-	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
-	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
-	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
-	/*
-	 * Additional feature fields added after ORIG_RSEQ_SIZE
-	 * need to be conditionally reset only if
-	 * t->rseq_len != ORIG_RSEQ_SIZE.
-	 */
-	user_write_access_end();
-	return 0;
-
-efault_end:
-	user_write_access_end();
-efault:
-	return -EFAULT;
+	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
 }
 
 static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
@@ -410,6 +253,8 @@ efault:
 void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 {
 	struct task_struct *t = current;
+	struct rseq_ids ids;
+	u32 node_id;
 	bool event;
 	int sig;
 
@@ -456,6 +301,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	scoped_guard(RSEQ_EVENT_GUARD) {
 		event = t->rseq.event.sched_switch;
 		t->rseq.event.sched_switch = false;
+		ids.cpu_id = task_cpu(t);
+		ids.mm_cid = task_mm_cid(t);
 	}
 
 	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
@@ -464,7 +311,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	if (!rseq_handle_cs(t, regs))
 		goto error;
 
-	if (unlikely(rseq_update_cpu_node_id(t)))
+	node_id = cpu_to_node(ids.cpu_id);
+	if (!rseq_set_ids(t, &ids, node_id))
 		goto error;
 	return;
 
@@ -504,13 +352,33 @@ void rseq_syscall(struct pt_regs *regs)
 }
 #endif
 
+static bool rseq_reset_ids(void)
+{
+	struct rseq_ids ids = {
+		.cpu_id		= RSEQ_CPU_ID_UNINITIALIZED,
+		.mm_cid		= 0,
+	};
+
+	/*
+	 * If this fails, terminate it because this leaves the kernel in
+	 * stupid state as exit to user space will try to fixup the ids
+	 * again.
+	 */
+	if (rseq_set_ids(current, &ids, 0))
+		return true;
+
+	force_sig(SIGSEGV);
+	return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE		32
+
 /*
  * sys_rseq - setup restartable sequences for caller thread.
  */
 SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
 {
-	int ret;
-
 	if (flags & RSEQ_FLAG_UNREGISTER) {
 		if (flags & ~RSEQ_FLAG_UNREGISTER)
 			return -EINVAL;
@@ -521,9 +389,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 			return -EINVAL;
 		if (current->rseq.sig != sig)
 			return -EPERM;
-		ret = rseq_reset_rseq_cpu_node_id(current);
-		if (ret)
-			return ret;
+		if (!rseq_reset_ids())
+			return -EFAULT;
 		rseq_reset(current);
 		return 0;
 	}
@@ -563,27 +430,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	if (!access_ok(rseq, rseq_len))
 		return -EFAULT;
 
-	/*
-	 * If the rseq_cs pointer is non-NULL on registration, clear it to
-	 * avoid a potential segfault on return to user-space. The proper thing
-	 * to do would have been to fail the registration but this would break
-	 * older libcs that reuse the rseq area for new threads without
-	 * clearing the fields. Don't bother reading it, just reset it.
-	 */
-	if (put_user(0UL, &rseq->rseq_cs))
-		return -EFAULT;
+	scoped_user_write_access(rseq, efault) {
+		/*
+		 * If the rseq_cs pointer is non-NULL on registration, clear it to
+		 * avoid a potential segfault on return to user-space. The proper thing
+		 * to do would have been to fail the registration but this would break
+		 * older libcs that reuse the rseq area for new threads without
+		 * clearing the fields. Don't bother reading it, just reset it.
+		 */
+		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+		/* Initialize IDs in user space */
+		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+		unsafe_put_user(0U, &rseq->node_id, efault);
+		unsafe_put_user(0U, &rseq->mm_cid, efault);
+	}
 
-#ifdef CONFIG_DEBUG_RSEQ
-	/*
-	 * Initialize the in-kernel rseq fields copy for validation of
-	 * read-only fields.
-	 */
-	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
-	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
-	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
-	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
-		return -EFAULT;
-#endif
 	/*
 	 * Activate the registration by setting the rseq area address, length
 	 * and signature in the task struct.
@@ -599,6 +461,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	 */
 	current->rseq.event.has_rseq = true;
 	rseq_sched_switch_event(current);
-
 	return 0;
+
+efault:
+	return -EFAULT;
 }
-- 
cgit v1.2.3


From 9f6ffd4cebda86841700775de3213f22bb0ea22d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:10 +0100
Subject: rseq: Separate the signal delivery path

Completely separate the signal delivery path from the notify handler as
they have different semantics versus the event handling.

The signal delivery only needs to ensure that the interrupted user context
was not in a critical section or the section is aborted before it switches
to the signal frame context. The signal frame context does not have the
original instruction pointer anymore, so that can't be handled on exit to
user space.

No point in updating the CPU/CID ids as they might change again before the
task returns to user space for real.

The fast path optimization, which checks for the 'entry from user via
interrupt' condition, is only available for architectures which use the
generic entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.455429038@linutronix.de
---
 include/linux/rseq.h | 21 ++++++++++++++++-----
 kernel/rseq.c        | 30 ++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 92f9cd49489b..f5a43188023f 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,22 +7,33 @@
 
 #include <uapi/linux/rseq.h>
 
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_notify_resume(struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
 {
 	if (current->rseq.event.has_rseq)
-		__rseq_handle_notify_resume(NULL, regs);
+		__rseq_handle_notify_resume(regs);
 }
 
+void __rseq_signal_deliver(int sig, struct pt_regs *regs);
+
+/*
+ * Invoked from signal delivery to fixup based on the register context before
+ * switching to the signal delivery context.
+ */
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
 {
-	if (current->rseq.event.has_rseq) {
-		current->rseq.event.sched_switch = true;
-		__rseq_handle_notify_resume(ksig, regs);
+	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+		/* '&' is intentional to spare one conditional branch */
+		if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
+			__rseq_signal_deliver(ksig->sig, regs);
+	} else {
+		if (current->rseq.event.has_rseq)
+			__rseq_signal_deliver(ksig->sig, regs);
 	}
 }
 
+/* Raised from context switch and exevce to force evaluation on exit to user */
 static inline void rseq_sched_switch_event(struct task_struct *t)
 {
 	if (t->rseq.event.has_rseq) {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 1e4f1d2cdfe5..13faadc737ad 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -250,13 +250,12 @@ efault:
  * respect to other threads scheduled on the same CPU, and with respect
  * to signal handlers.
  */
-void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
+void __rseq_handle_notify_resume(struct pt_regs *regs)
 {
 	struct task_struct *t = current;
 	struct rseq_ids ids;
 	u32 node_id;
 	bool event;
-	int sig;
 
 	/*
 	 * If invoked from hypervisors before entering the guest via
@@ -275,10 +274,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	if (unlikely(t->flags & PF_EXITING))
 		return;
 
-	if (ksig)
-		rseq_stat_inc(rseq_stats.signal);
-	else
-		rseq_stat_inc(rseq_stats.slowpath);
+	rseq_stat_inc(rseq_stats.slowpath);
 
 	/*
 	 * Read and clear the event pending bit first. If the task
@@ -317,8 +313,26 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	return;
 
 error:
-	sig = ksig ? ksig->sig : 0;
-	force_sigsegv(sig);
+	force_sig(SIGSEGV);
+}
+
+void __rseq_signal_deliver(int sig, struct pt_regs *regs)
+{
+	rseq_stat_inc(rseq_stats.signal);
+	/*
+	 * Don't update IDs, they are handled on exit to user if
+	 * necessary. The important thing is to abort a critical section of
+	 * the interrupted context as after this point the instruction
+	 * pointer in @regs points to the signal handler.
+	 */
+	if (unlikely(!rseq_handle_cs(current, regs))) {
+		/*
+		 * Clear the errors just in case this might survive
+		 * magically, but leave the rest intact.
+		 */
+		current->rseq.event.error = 0;
+		force_sigsegv(sig);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From e2d4f42271155045a49b89530f2c06ad8e9f1a1e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:12 +0100
Subject: rseq: Rework the TIF_NOTIFY handler

Replace the whole logic with a new implementation, which is shared with
signal delivery and the upcoming exit fast path.

Contrary to the original implementation, this ignores invocations from
KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument
set to NULL.

The original implementation updated the CPU/Node/MM CID fields, but that
was just a side effect, which was addressing the problem that this
invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update
on return to user space to be lost.

This problem has been addressed differently, so that it's no longer
required to do that update before entering the guest.

That might be considered a user visible change, when the host thread's TLS
memory is mapped into the guest, but as this was never intentionally
supported, this abuse of kernel internal implementation details is not
considered an ABI break.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de
---
 include/linux/rseq_entry.h | 29 ++++++++++++++++++
 kernel/rseq.c              | 76 ++++++++++++++++++++--------------------------
 2 files changed, 62 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 37444e80fd45..aa1c0464a16c 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -368,6 +368,35 @@ efault:
 	return false;
 }
 
+/*
+ * Update user space with new IDs and conditionally check whether the task
+ * is in a critical section.
+ */
+static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
+					struct rseq_ids *ids, u32 node_id)
+{
+	u64 csaddr;
+
+	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+		return false;
+
+	/*
+	 * On architectures which utilize the generic entry code this
+	 * allows to skip the critical section when the entry was not from
+	 * a user space interrupt, unless debug mode is enabled.
+	 */
+	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+		if (!static_branch_unlikely(&rseq_debug_enabled)) {
+			if (likely(!t->rseq.event.user_irq))
+				return true;
+		}
+	}
+	if (likely(!csaddr))
+		return true;
+	/* Sigh, this really needs to do work */
+	return rseq_update_user_cs(t, regs, csaddr);
+}
+
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 13faadc737ad..148fb2103023 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -82,12 +82,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/rseq.h>
 
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD	irq
-#else
-# define RSEQ_EVENT_GUARD	preempt
-#endif
-
 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 
 static inline void rseq_control_debug(bool on)
@@ -239,38 +233,15 @@ efault:
 	return false;
 }
 
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
 {
+	/* Preserve rseq state and user_irq state for exit to user */
+	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
 	struct task_struct *t = current;
 	struct rseq_ids ids;
 	u32 node_id;
 	bool event;
 
-	/*
-	 * If invoked from hypervisors before entering the guest via
-	 * resume_user_mode_work(), then @regs is a NULL pointer.
-	 *
-	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
-	 * it before returning from the ioctl() to user space when
-	 * rseq_event.sched_switch is set.
-	 *
-	 * So it's safe to ignore here instead of pointlessly updating it
-	 * in the vcpu_run() loop.
-	 */
-	if (!regs)
-		return;
-
 	if (unlikely(t->flags & PF_EXITING))
 		return;
 
@@ -294,26 +265,45 @@ void __rseq_handle_notify_resume(struct pt_regs *regs)
 	 * with the result handed in to allow the detection of
 	 * inconsistencies.
 	 */
-	scoped_guard(RSEQ_EVENT_GUARD) {
+	scoped_guard(irq) {
 		event = t->rseq.event.sched_switch;
-		t->rseq.event.sched_switch = false;
+		t->rseq.event.all &= evt_mask.all;
 		ids.cpu_id = task_cpu(t);
 		ids.mm_cid = task_mm_cid(t);
 	}
 
-	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
+	if (!event)
 		return;
 
-	if (!rseq_handle_cs(t, regs))
-		goto error;
-
 	node_id = cpu_to_node(ids.cpu_id);
-	if (!rseq_set_ids(t, &ids, node_id))
-		goto error;
-	return;
 
-error:
-	force_sig(SIGSEGV);
+	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+		/*
+		 * Clear the errors just in case this might survive magically, but
+		 * leave the rest intact.
+		 */
+		t->rseq.event.error = 0;
+		force_sig(SIGSEGV);
+	}
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	/*
+	 * If invoked from hypervisors before entering the guest via
+	 * resume_user_mode_work(), then @regs is a NULL pointer.
+	 *
+	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+	 * it before returning from the ioctl() to user space when
+	 * rseq_event.sched_switch is set.
+	 *
+	 * So it's safe to ignore here instead of pointlessly updating it
+	 * in the vcpu_run() loop.
+	 */
+	if (!regs)
+		return;
+
+	rseq_slowpath_update_usr(regs);
 }
 
 void __rseq_signal_deliver(int sig, struct pt_regs *regs)
-- 
cgit v1.2.3


From 39a167560a61f913560ba803a96dbe6c15239f5c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:14 +0100
Subject: rseq: Optimize event setting

After removing the various condition bits earlier it turns out that one
extra piece of information is needed to avoid setting event::sched_switch
and TIF_NOTIFY_RESUME unconditionally on every context switch.

The update of the RSEQ user space memory is only required, when either

  the task was interrupted in user space and schedules

or

  the CPU or MM CID changes in schedule() independent of the entry mode

Right now only the interrupt from user information is available.

Add an event flag, which is set when the CPU or MM CID or both change.

Evaluate this event in the scheduler to decide whether the sched_switch
event and the TIF bit need to be set.

It's an extra conditional in context_switch(), but the downside of
unconditionally handling RSEQ after a context switch to user is way more
significant. The utilized boolean logic minimizes this to a single
conditional branch.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de
---
 fs/exec.c                  |  2 +-
 include/linux/rseq.h       | 81 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/rseq_types.h | 11 +++++--
 kernel/rseq.c              |  2 +-
 kernel/sched/core.c        |  7 +++-
 kernel/sched/sched.h       |  5 ++-
 6 files changed, 95 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index e45b29890269..90e47eb156ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1775,7 +1775,7 @@ out:
 		force_fatal_sig(SIGSEGV);
 
 	sched_mm_cid_after_execve(current);
-	rseq_sched_switch_event(current);
+	rseq_force_update();
 	current->in_execve = 0;
 
 	return retval;
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index f5a43188023f..abfbeb42d1a2 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -11,7 +11,8 @@ void __rseq_handle_notify_resume(struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
 {
-	if (current->rseq.event.has_rseq)
+	/* '&' is intentional to spare one conditional branch */
+	if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
 		__rseq_handle_notify_resume(regs);
 }
 
@@ -33,12 +34,75 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg
 	}
 }
 
-/* Raised from context switch and exevce to force evaluation on exit to user */
-static inline void rseq_sched_switch_event(struct task_struct *t)
+static inline void rseq_raise_notify_resume(struct task_struct *t)
 {
-	if (t->rseq.event.has_rseq) {
-		t->rseq.event.sched_switch = true;
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+/* Invoked from context switch to force evaluation on exit to user */
+static __always_inline void rseq_sched_switch_event(struct task_struct *t)
+{
+	struct rseq_event *ev = &t->rseq.event;
+
+	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+		/*
+		 * Avoid a boat load of conditionals by using simple logic
+		 * to determine whether NOTIFY_RESUME needs to be raised.
+		 *
+		 * It's required when the CPU or MM CID has changed or
+		 * the entry was from user space.
+		 */
+		bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
+
+		if (raise) {
+			ev->sched_switch = true;
+			rseq_raise_notify_resume(t);
+		}
+	} else {
+		if (ev->has_rseq) {
+			t->rseq.event.sched_switch = true;
+			rseq_raise_notify_resume(t);
+		}
+	}
+}
+
+/*
+ * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
+ * update.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+{
+	t->rseq.event.ids_changed = true;
+}
+
+/*
+ * Invoked from switch_mm_cid() in context switch when the task gets a MM
+ * CID assigned.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
+{
+	/*
+	 * Requires a comparison as the switch_mm_cid() code does not
+	 * provide a conditional for it readily. So avoid excessive updates
+	 * when nothing changes.
+	 */
+	if (t->rseq.ids.mm_cid != cid)
+		t->rseq.event.ids_changed = true;
+}
+
+/* Enforce a full update after RSEQ registration and when execve() failed */
+static inline void rseq_force_update(void)
+{
+	if (current->rseq.event.has_rseq) {
+		current->rseq.event.ids_changed = true;
+		current->rseq.event.sched_switch = true;
+		rseq_raise_notify_resume(current);
 	}
 }
 
@@ -55,7 +119,7 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
 static inline void rseq_virt_userspace_exit(void)
 {
 	if (current->rseq.event.sched_switch)
-		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+		rseq_raise_notify_resume(current);
 }
 
 static inline void rseq_reset(struct task_struct *t)
@@ -91,6 +155,9 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
+static inline void rseq_force_update(void) { }
 static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
 static inline void rseq_execve(struct task_struct *t) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 7c123947bb98..a1389fff4fca 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -11,20 +11,27 @@ struct rseq;
  * struct rseq_event - Storage for rseq related event management
  * @all:		Compound to initialize and clear the data efficiently
  * @events:		Compound to access events with a single load/store
- * @sched_switch:	True if the task was scheduled out
+ * @sched_switch:	True if the task was scheduled and needs update on
+ *			exit to user
+ * @ids_changed:	Indicator that IDs need to be updated
  * @user_irq:		True on interrupt entry from user mode
  * @has_rseq:		True if the task has a rseq pointer installed
  * @error:		Compound error code for the slow path to analyze
  * @fatal:		User space data corrupted or invalid
+ *
+ * @sched_switch and @ids_changed must be adjacent and the combo must be
+ * 16bit aligned to allow a single store, when both are set at the same
+ * time in the scheduler.
  */
 struct rseq_event {
 	union {
 		u64				all;
 		struct {
 			union {
-				u16		events;
+				u32		events;
 				struct {
 					u8	sched_switch;
+					u8	ids_changed;
 					u8	user_irq;
 				};
 			};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 148fb2103023..183dde756808 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -464,7 +464,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	 * are updated before returning to user-space.
 	 */
 	current->rseq.event.has_rseq = true;
-	rseq_sched_switch_event(current);
+	rseq_force_update();
 	return 0;
 
 efault:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b75e8e1eca4a..579a8e93578f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5118,7 +5118,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	kcov_prepare_switch(prev);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
-	rseq_sched_switch_event(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
@@ -5316,6 +5315,12 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	/* switch_mm_cid() requires the memory barriers above. */
 	switch_mm_cid(rq, prev, next);
 
+	/*
+	 * Tell rseq that the task was scheduled in. Must be after
+	 * switch_mm_cid() to get the TIF flag set.
+	 */
+	rseq_sched_switch_event(next);
+
 	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..4838dda75b10 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2209,6 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	smp_wmb();
 	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
 	p->wake_cpu = cpu;
+	rseq_sched_set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
 }
 
@@ -3807,8 +3808,10 @@ static inline void switch_mm_cid(struct rq *rq,
 		mm_cid_put_lazy(prev);
 		prev->mm_cid = -1;
 	}
-	if (next->mm_cid_active)
+	if (next->mm_cid_active) {
 		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+		rseq_sched_set_task_mm_cid(next, next->mm_cid);
+	}
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
-- 
cgit v1.2.3


From 05b44aef709cae5e4274590f050cf35049dcc24e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:17 +0100
Subject: rseq: Implement fast path for exit to user

Implement the actual logic for handling RSEQ updates in a fast path after
handling the TIF work and at the point where the task is actually returning
to user space.

This is the right point to do that because at this point the CPU and the MM
CID are stable and can no longer change due to yet another reschedule.
That happens when the task is handling it via TIF_NOTIFY_RESUME in
resume_user_mode_work(), which is invoked from the exit to user mode work
loop.

The function is invoked after the TIF work is handled and runs with
interrupts disabled, which means it cannot resolve page faults. It
therefore disables page faults and in case the access to the user space
memory faults, it:

  - notes the failure in the event struct
  - raises TIF_NOTIFY_RESUME
  - returns false to the caller

The caller has to go back to the TIF work, which runs with interrupts
enabled and therefore can resolve the page faults. This happens mostly on
fork() when the memory is marked COW.

If the user memory inspection finds invalid data, the function returns
false as well and sets the fatal flag in the event struct along with
TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag
and terminate the task with SIGSEGV as documented.

The initial decision to invoke any of this is based on one flag in the
event struct: @sched_switch. The decision is in pseudo ASM:

      load	tsk::event::sched_switch
      jnz	inspect_user_space
      mov	$0, tsk::event::events
      ...
      leave

So for the common case where the task was not scheduled out, this really
boils down to three instructions before going out if the compiler is not
completely stupid (and yes, some of them are).

If the condition is true, then it checks, whether CPU ID or MM CID have
changed. If so, then the CPU/MM IDs have to be updated and are thereby
cached for the next round. The update unconditionally retrieves the user
space critical section address to spare another user*begin/end() pair.  If
that's not zero and tsk::event::user_irq is set, then the critical section
is analyzed and acted upon. If either zero or the entry came via syscall
the critical section analysis is skipped.

If the comparison is false then the critical section has to be analyzed
because the event flag is then only true when entry from user was by
interrupt.

This is provided without the actual hookup to let reviewers focus on the
implementation details. The hookup happens in the next step.

Note: As with quite some other optimizations this depends on the generic
entry infrastructure and is not enabled to be sucked into random
architecture implementations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de
---
 include/linux/rseq_entry.h | 133 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/rseq_types.h |   3 +
 kernel/rseq.c              |   2 +
 3 files changed, 135 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index aa1c0464a16c..3f13be7301fa 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -10,6 +10,7 @@ struct rseq_stats {
 	unsigned long	exit;
 	unsigned long	signal;
 	unsigned long	slowpath;
+	unsigned long	fastpath;
 	unsigned long	ids;
 	unsigned long	cs;
 	unsigned long	clear;
@@ -245,12 +246,13 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c
 {
 	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
 	unsigned long ip = instruction_pointer(regs);
+	unsigned long tasksize = TASK_SIZE;
 	u64 start_ip, abort_ip, offset;
 	u32 usig, __user *uc_sig;
 
 	rseq_stat_inc(rseq_stats.cs);
 
-	if (unlikely(csaddr >= TASK_SIZE)) {
+	if (unlikely(csaddr >= tasksize)) {
 		t->rseq.event.fatal = true;
 		return false;
 	}
@@ -287,7 +289,7 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c
 		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
 		 * protection.
 		 */
-		if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig))
+		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
 			goto die;
 
 		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
@@ -397,6 +399,128 @@ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *r
 	return rseq_update_user_cs(t, regs, csaddr);
 }
 
+/*
+ * If you want to use this then convert your architecture to the generic
+ * entry code. I'm tired of building workarounds for people who can't be
+ * bothered to make the maintenance of generic infrastructure less
+ * burdensome. Just sucking everything into the architecture code and
+ * thereby making others chase the horrible hacks and keep them working is
+ * neither acceptable nor sustainable.
+ */
+#ifdef CONFIG_GENERIC_ENTRY
+
+/*
+ * This is inlined into the exit path because:
+ *
+ * 1) It's a one time comparison in the fast path when there is no event to
+ *    handle
+ *
+ * 2) The access to the user space rseq memory (TLS) is unlikely to fault
+ *    so the straight inline operation is:
+ *
+ *	- Four 32-bit stores only if CPU ID/ MM CID need to be updated
+ *	- One 64-bit load to retrieve the critical section address
+ *
+ * 3) In the unlikely case that the critical section address is != NULL:
+ *
+ *     - One 64-bit load to retrieve the start IP
+ *     - One 64-bit load to retrieve the offset for calculating the end
+ *     - One 64-bit load to retrieve the abort IP
+ *     - One 64-bit load to retrieve the signature
+ *     - One store to clear the critical section address
+ *
+ * The non-debug case implements only the minimal required checking. It
+ * provides protection against a rogue abort IP in kernel space, which
+ * would be exploitable at least on x86, and also against a rogue CS
+ * descriptor by checking the signature at the abort IP. Any fallout from
+ * invalid critical section descriptors is a user space problem. The debug
+ * case provides the full set of checks and terminates the task if a
+ * condition is not met.
+ *
+ * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
+ * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
+ * slow path there will handle the failure.
+ */
+static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
+{
+	/*
+	 * Page faults need to be disabled as this is called with
+	 * interrupts disabled
+	 */
+	guard(pagefault)();
+	if (likely(!t->rseq.event.ids_changed)) {
+		struct rseq __user *rseq = t->rseq.usrptr;
+		/*
+		 * If IDs have not changed rseq_event::user_irq must be true
+		 * See rseq_sched_switch_event().
+		 */
+		u64 csaddr;
+
+		if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
+			return false;
+
+		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
+			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
+				return false;
+		}
+		return true;
+	}
+
+	struct rseq_ids ids = {
+		.cpu_id = task_cpu(t),
+		.mm_cid = task_mm_cid(t),
+	};
+	u32 node_id = cpu_to_node(ids.cpu_id);
+
+	return rseq_update_usr(t, regs, &ids, node_id);
+}
+
+static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+	struct task_struct *t = current;
+
+	/*
+	 * If the task did not go through schedule or got the flag enforced
+	 * by the rseq syscall or execve, then nothing to do here.
+	 *
+	 * CPU ID and MM CID can only change when going through a context
+	 * switch.
+	 *
+	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
+	 * only when rseq_event::has_rseq is true. That conditional is
+	 * required to avoid setting the TIF bit if RSEQ is not registered
+	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
+	 * unregistered by a task so it's sufficient to check for the
+	 * sched_switch bit alone.
+	 *
+	 * A sane compiler requires three instructions for the nothing to do
+	 * case including clearing the events, but your mileage might vary.
+	 */
+	if (unlikely((t->rseq.event.sched_switch))) {
+		rseq_stat_inc(rseq_stats.fastpath);
+
+		if (unlikely(!rseq_exit_user_update(regs, t)))
+			return true;
+	}
+	/* Clear state so next entry starts from a clean slate */
+	t->rseq.event.events = 0;
+	return false;
+}
+
+static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+		current->rseq.event.slowpath = true;
+		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+		return true;
+	}
+	return false;
+}
+
+#else /* CONFIG_GENERIC_ENTRY */
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; }
+#endif /* !CONFIG_GENERIC_ENTRY */
+
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
@@ -421,9 +545,12 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs)
 	if (static_branch_unlikely(&rseq_debug_enabled))
 		__rseq_debug_syscall_return(regs);
 }
-
 #else /* CONFIG_RSEQ */
 static inline void rseq_note_user_irq_entry(void) { }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+	return false;
+}
 static inline void rseq_exit_to_user_mode(void) { }
 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
 #endif /* !CONFIG_RSEQ */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index a1389fff4fca..9c7a34154de8 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -18,6 +18,8 @@ struct rseq;
  * @has_rseq:		True if the task has a rseq pointer installed
  * @error:		Compound error code for the slow path to analyze
  * @fatal:		User space data corrupted or invalid
+ * @slowpath:		Indicator that slow path processing via TIF_NOTIFY_RESUME
+ *			is required
  *
  * @sched_switch and @ids_changed must be adjacent and the combo must be
  * 16bit aligned to allow a single store, when both are set at the same
@@ -42,6 +44,7 @@ struct rseq_event {
 				u16		error;
 				struct {
 					u8	fatal;
+					u8	slowpath;
 				};
 			};
 		};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 183dde756808..c5d6336c6956 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -133,6 +133,7 @@ static int rseq_stats_show(struct seq_file *m, void *p)
 		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
 		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
 		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
+		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
 		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
 		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
 		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
@@ -142,6 +143,7 @@ static int rseq_stats_show(struct seq_file *m, void *p)
 	seq_printf(m, "exit:   %16lu\n", stats.exit);
 	seq_printf(m, "signal: %16lu\n", stats.signal);
 	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
+	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
 	seq_printf(m, "ids:    %16lu\n", stats.ids);
 	seq_printf(m, "cs:     %16lu\n", stats.cs);
 	seq_printf(m, "clear:  %16lu\n", stats.clear);
-- 
cgit v1.2.3


From 3db6b38dfe640207da706b286d4181237391f5bd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:19 +0100
Subject: rseq: Switch to fast path processing on exit to user

Now that all bits and pieces are in place, hook the RSEQ handling fast path
function into exit_to_user_mode_prepare() after the TIF work bits have been
handled. In case of fast path failure, TIF_NOTIFY_RESUME has been raised
and the caller needs to take another turn through the TIF handling slow
path.

This only works for architectures which use the generic entry code.
Architectures which still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

  Kernel build	       Before		  After		      Reduction

  exit to user         80692981		  80514451
  signal checks:          32581		       121	       99%
  slowpath runs:        1201408   1.49%	       198 0.00%      100%
  fastpath runs:			    675941 0.84%       N/A
  id updates:           1233989   1.53%	     50541 0.06%       96%
  cs checks:            1125366   1.39%	         0 0.00%      100%
    cs cleared:         1125366      100%	 0            100%
    cs fixup:                 0        0%	 0

  RSEQ selftests      Before		  After		      Reduction

  exit to user:       386281778		  387373750
  signal checks:       35661203		          0           100%
  slowpath runs:      140542396 36.38%	        100  0.00%    100%
  fastpath runs:			    9509789  2.51%     N/A
  id updates:         176203599 45.62%	    9087994  2.35%     95%
  cs checks:          175587856 45.46%	    4728394  1.22%     98%
    cs cleared:       172359544   98.16%    1319307   27.90%   99%
    cs fixup:           3228312    1.84%    3409087   72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations, they are relative to the actual 'cs check' invocations.

While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ
notify handler is invoked more than once before going out to user
space. Doing this once when everything has stabilized is the only solution
to avoid this.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads, which do a lot of quick and short system
calls. Even if the fast path decision is only 4 instructions (including a
conditional branch), this adds up quickly and becomes measurable when the
rate for actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de
---
 include/linux/irq-entry-common.h |  7 ++-----
 include/linux/resume_user_mode.h |  2 +-
 include/linux/rseq.h             | 18 ++++++++++++------
 init/Kconfig                     |  2 +-
 kernel/entry/common.c            | 26 +++++++++++++++++++-------
 kernel/rseq.c                    |  8 ++++++--
 6 files changed, 41 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index cb31fb84d7b4..8f5ceeaaaea5 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -197,11 +197,8 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  */
 void arch_do_signal_or_restart(struct pt_regs *regs);
 
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- */
-unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-				     unsigned long ti_work);
+/* Handle pending TIF work */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
 
 /**
  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index dd3bf7da90a8..bf92227c78d0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	mem_cgroup_handle_over_high(GFP_KERNEL);
 	blkcg_maybe_throttle_current();
 
-	rseq_handle_notify_resume(regs);
+	rseq_handle_slowpath(regs);
 }
 
 #endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index abfbeb42d1a2..ded4baa34586 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,13 +7,19 @@
 
 #include <uapi/linux/rseq.h>
 
-void __rseq_handle_notify_resume(struct pt_regs *regs);
+void __rseq_handle_slowpath(struct pt_regs *regs);
 
-static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+/* Invoked from resume_user_mode_work() */
+static inline void rseq_handle_slowpath(struct pt_regs *regs)
 {
-	/* '&' is intentional to spare one conditional branch */
-	if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
-		__rseq_handle_notify_resume(regs);
+	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+		if (current->rseq.event.slowpath)
+			__rseq_handle_slowpath(regs);
+	} else {
+		/* '&' is intentional to spare one conditional branch */
+		if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
+			__rseq_handle_slowpath(regs);
+	}
 }
 
 void __rseq_signal_deliver(int sig, struct pt_regs *regs);
@@ -152,7 +158,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 }
 
 #else /* CONFIG_RSEQ */
-static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
+static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
diff --git a/init/Kconfig b/init/Kconfig
index bde40ab664e2..d1c606ec632e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1941,7 +1941,7 @@ config RSEQ_DEBUG_DEFAULT_ENABLE
 config DEBUG_RSEQ
 	default n
 	bool "Enable debugging of rseq() system call" if EXPERT
-	depends on RSEQ && DEBUG_KERNEL
+	depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY
 	select RSEQ_DEBUG_DEFAULT_ENABLE
 	help
 	  Enable extra debugging checks for the rseq system call.
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 70a16db4cc0a..523a3e758af4 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,13 +11,8 @@
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- * @regs:	Pointer to pt_regs on entry stack
- * @ti_work:	TIF work flags as read by the caller
- */
-__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-						     unsigned long ti_work)
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+							      unsigned long ti_work)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -62,6 +57,23 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	return ti_work;
 }
 
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs:	Pointer to pt_regs on entry stack
+ * @ti_work:	TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+						     unsigned long ti_work)
+{
+	for (;;) {
+		ti_work = __exit_to_user_mode_loop(regs, ti_work);
+
+		if (likely(!rseq_exit_to_user_mode_restart(regs)))
+			return ti_work;
+		ti_work = read_thread_flags();
+	}
+}
+
 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 {
 	irqentry_state_t ret = {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index c5d6336c6956..395d8b002350 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -237,7 +237,11 @@ efault:
 
 static void rseq_slowpath_update_usr(struct pt_regs *regs)
 {
-	/* Preserve rseq state and user_irq state for exit to user */
+	/*
+	 * Preserve rseq state and user_irq state. The generic entry code
+	 * clears user_irq on the way out, the non-generic entry
+	 * architectures are not having user_irq.
+	 */
 	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
 	struct task_struct *t = current;
 	struct rseq_ids ids;
@@ -289,7 +293,7 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
 	}
 }
 
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+void __rseq_handle_slowpath(struct pt_regs *regs)
 {
 	/*
 	 * If invoked from hypervisors before entering the guest via
-- 
cgit v1.2.3


From 70fe25a3bc53a891f0e6184c12bd55cc524cb13b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:21 +0100
Subject: entry: Split up exit_to_user_mode_prepare()

exit_to_user_mode_prepare() is used for both interrupts and syscalls, but
there is extra rseq work, which is only required in the interrupt exit
case.

Split up the function and provide wrappers for syscalls and interrupts,
which allows to separate the rseq exit work in the next step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.782234789@linutronix.de
---
 arch/arm64/kernel/entry-common.c |  2 +-
 include/linux/entry-common.h     |  2 +-
 include/linux/irq-entry-common.h | 49 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index a9c81715ce59..0a97e2621f60 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs)
 static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs)
 {
 	local_irq_disable();
-	exit_to_user_mode_prepare(regs);
+	exit_to_user_mode_prepare_legacy(regs);
 	local_daif_mask();
 	mte_check_tfsr_exit();
 	exit_to_user_mode();
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d967184ae08f..87efb38b7081 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -156,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 	if (unlikely(work & SYSCALL_WORK_EXIT))
 		syscall_exit_work(regs, work);
 	local_irq_disable_exit_to_user();
-	exit_to_user_mode_prepare(regs);
+	syscall_exit_to_user_mode_prepare(regs);
 }
 
 /**
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 8f5ceeaaaea5..5ea61722bb70 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -201,7 +201,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
 unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
 
 /**
- * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
  * @regs:	Pointer to pt_regs on entry stack
  *
  * 1) check that interrupts are disabled
@@ -209,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work
  * 3) call exit_to_user_mode_loop() if any flags from
  *    EXIT_TO_USER_MODE_WORK are set
  * 4) check that interrupts are still disabled
+ *
+ * Don't invoke directly, use the syscall/irqentry_ prefixed variants below
  */
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
 {
 	unsigned long ti_work;
 
@@ -224,15 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
+}
 
-	rseq_exit_to_user_mode();
-
+static __always_inline void __exit_to_user_mode_validate(void)
+{
 	/* Ensure that kernel state is sane for a return to userspace */
 	kmap_assert_nomap();
 	lockdep_assert_irqs_disabled();
 	lockdep_sys_exit();
 }
 
+/* Temporary workaround to keep ARM64 alive */
+static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
+{
+	__exit_to_user_mode_prepare(regs);
+	rseq_exit_to_user_mode();
+	__exit_to_user_mode_validate();
+}
+
+/**
+ * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	__exit_to_user_mode_prepare(regs);
+	rseq_exit_to_user_mode();
+	__exit_to_user_mode_validate();
+}
+
+/**
+ * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	__exit_to_user_mode_prepare(regs);
+	rseq_exit_to_user_mode();
+	__exit_to_user_mode_validate();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -297,7 +336,7 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
 static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
-	exit_to_user_mode_prepare(regs);
+	irqentry_exit_to_user_mode_prepare(regs);
 	instrumentation_end();
 	exit_to_user_mode();
 }
-- 
cgit v1.2.3


From 7a5201ea1907534efe3a6e9c001ef4c0257cb3f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:24 +0100
Subject: rseq: Split up rseq_exit_to_user_mode()

Separate the interrupt and syscall exit handling. Syscall exit does not
need to clear the user_irq bit as it can't be set. On interrupt exit it
can be set when the interrupt did not result in a scheduling event and
therefore the return path did not invoke the TIF work handling, which would
have cleared it.

The debug check for the event state is also not really required even when
debug mode is enabled via the static key. Debug mode is largely aiding user
space by enabling a larger amount of validation checks, which cause a
segfault when a malformed critical section is detected. In production mode
the critical section handling takes the content mostly as is and lets user
space keep the pieces when it screwed up.

On kernel changes in that area the state check is useful, but that can be
done when lockdep is enabled, which is anyway a required test scenario for
fundamental changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.842785700@linutronix.de
---
 include/linux/irq-entry-common.h |  6 +++---
 include/linux/rseq_entry.h       | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 5ea61722bb70..bc5d178e0b91 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -240,7 +240,7 @@ static __always_inline void __exit_to_user_mode_validate(void)
 static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
 {
 	__exit_to_user_mode_prepare(regs);
-	rseq_exit_to_user_mode();
+	rseq_exit_to_user_mode_legacy();
 	__exit_to_user_mode_validate();
 }
 
@@ -254,7 +254,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg
 static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
 {
 	__exit_to_user_mode_prepare(regs);
-	rseq_exit_to_user_mode();
+	rseq_syscall_exit_to_user_mode();
 	__exit_to_user_mode_validate();
 }
 
@@ -268,7 +268,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re
 static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
 {
 	__exit_to_user_mode_prepare(regs);
-	rseq_exit_to_user_mode();
+	rseq_irqentry_exit_to_user_mode();
 	__exit_to_user_mode_validate();
 }
 
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 3f13be7301fa..958a63eeb2d3 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -521,7 +521,37 @@ static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
 static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; }
 #endif /* !CONFIG_GENERIC_ENTRY */
 
-static __always_inline void rseq_exit_to_user_mode(void)
+static __always_inline void rseq_syscall_exit_to_user_mode(void)
+{
+	struct rseq_event *ev = &current->rseq.event;
+
+	rseq_stat_inc(rseq_stats.exit);
+
+	/* Needed to remove the store for the !lockdep case */
+	if (IS_ENABLED(CONFIG_LOCKDEP)) {
+		WARN_ON_ONCE(ev->sched_switch);
+		ev->events = 0;
+	}
+}
+
+static __always_inline void rseq_irqentry_exit_to_user_mode(void)
+{
+	struct rseq_event *ev = &current->rseq.event;
+
+	rseq_stat_inc(rseq_stats.exit);
+
+	lockdep_assert_once(!ev->sched_switch);
+
+	/*
+	 * Ensure that event (especially user_irq) is cleared when the
+	 * interrupt did not result in a schedule and therefore the
+	 * rseq processing could not clear it.
+	 */
+	ev->events = 0;
+}
+
+/* Required to keep ARM64 working */
+static __always_inline void rseq_exit_to_user_mode_legacy(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
 
@@ -551,7 +581,9 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
 {
 	return false;
 }
-static inline void rseq_exit_to_user_mode(void) { }
+static inline void rseq_syscall_exit_to_user_mode(void) { }
+static inline void rseq_irqentry_exit_to_user_mode(void) { }
+static inline void rseq_exit_to_user_mode_legacy(void) { }
 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
 #endif /* !CONFIG_RSEQ */
 
-- 
cgit v1.2.3


From 32034df66b5f49626aa450ceaf1849a08d87906e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Oct 2025 09:45:26 +0100
Subject: rseq: Switch to TIF_RSEQ if supported

TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It makes the therefore required
re-evaluation at the end of vcpu_run() a NOOP on architectures which
utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but this only happens within exit_to_user_mode_loop(), so
arguably the hypervisor ioctl() code is long done when this happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de
---
 include/asm-generic/thread_info_tif.h |  3 +++
 include/linux/irq-entry-common.h      |  2 +-
 include/linux/rseq.h                  | 22 +++++++++++++++-------
 include/linux/rseq_entry.h            | 32 +++++++++++++++++++++++++++++---
 include/linux/thread_info.h           |  5 +++++
 kernel/entry/common.c                 | 10 ++++++++--
 6 files changed, 61 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h
index ee3793e9b1a4..da1610a78f92 100644
--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -45,4 +45,7 @@
 # define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
 #endif
 
+#define TIF_RSEQ		11	// Run RSEQ fast path
+#define _TIF_RSEQ		BIT(TIF_RSEQ)
+
 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index bc5d178e0b91..72e3f7a59469 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -30,7 +30,7 @@
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
 	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
-	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ |		\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index ded4baa34586..b5e4803c4ebe 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -42,7 +42,7 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg
 
 static inline void rseq_raise_notify_resume(struct task_struct *t)
 {
-	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	set_tsk_thread_flag(t, TIF_RSEQ);
 }
 
 /* Invoked from context switch to force evaluation on exit to user */
@@ -114,17 +114,25 @@ static inline void rseq_force_update(void)
 
 /*
  * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
- * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
- * that case just to do it eventually again before returning to user space,
- * the entry resume_user_mode_work() invocation is ignored as the register
- * argument is NULL.
+ * which clears TIF_NOTIFY_RESUME on architectures that don't use the
+ * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
  *
- * After returning from guest mode, they have to invoke this function to
- * re-raise TIF_NOTIFY_RESUME if necessary.
+ * That invocation does not update user space RSEQ, which would only have
+ * to be done again before returning to user space, because
+ * __rseq_handle_slowpath() does nothing when invoked with NULL register state.
+ *
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
  */
 static inline void rseq_virt_userspace_exit(void)
 {
 	if (current->rseq.event.sched_switch)
+	/*
+	 * The generic optimization for deferring RSEQ updates until the next
+	 * exit relies on having a dedicated TIF_RSEQ.
+	 */
+	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
+	    current->rseq.event.sched_switch)
 		rseq_raise_notify_resume(current);
 }
 
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 958a63eeb2d3..c92167ff8a7f 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -507,18 +507,44 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg
 	return false;
 }
 
-static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+static __always_inline bool test_tif_rseq(unsigned long ti_work)
 {
+	return ti_work & _TIF_RSEQ;
+}
+
+static __always_inline void clear_tif_rseq(void)
+{
+	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
+	clear_thread_flag(TIF_RSEQ);
+}
+#else
+static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
+static __always_inline void clear_tif_rseq(void) { }
+#endif
+
+static __always_inline bool
+rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+	if (likely(!test_tif_rseq(ti_work)))
+		return false;
+
 	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
 		current->rseq.event.slowpath = true;
 		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
 		return true;
 	}
+
+	clear_tif_rseq();
 	return false;
 }
 
 #else /* CONFIG_GENERIC_ENTRY */
-static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+	return false;
+}
 #endif /* !CONFIG_GENERIC_ENTRY */
 
 static __always_inline void rseq_syscall_exit_to_user_mode(void)
@@ -577,7 +603,7 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs)
 }
 #else /* CONFIG_RSEQ */
 static inline void rseq_note_user_irq_entry(void) { }
-static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
 {
 	return false;
 }
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index dd925d84fa46..b40de9bab4b7 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -67,6 +67,11 @@ enum syscall_work_bit {
 #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
 #endif
 
+#ifndef TIF_RSEQ
+# define TIF_RSEQ	TIF_NOTIFY_RESUME
+# define _TIF_RSEQ	_TIF_NOTIFY_RESUME
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 523a3e758af4..5c792b30c58a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,6 +11,12 @@
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK)
+#endif
+
 static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
 							      unsigned long ti_work)
 {
@@ -18,7 +24,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
 	 * Before returning to user space ensure that all pending work
 	 * items have been completed.
 	 */
-	while (ti_work & EXIT_TO_USER_MODE_WORK) {
+	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
 
 		local_irq_enable_exit_to_user(ti_work);
 
@@ -68,7 +74,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	for (;;) {
 		ti_work = __exit_to_user_mode_loop(regs, ti_work);
 
-		if (likely(!rseq_exit_to_user_mode_restart(regs)))
+		if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
 			return ti_work;
 		ti_work = read_thread_flags();
 	}
-- 
cgit v1.2.3


From 323d93f0432edb5415c79bd35e15e5754a76e486 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Oct 2025 12:02:12 +0100
Subject: cleanup: Always inline everything

KASAN bloat caused cleanup helper functions to not get inlined:

  vmlinux.o: error: objtool: irqentry_exit+0x323: call to class_user_rw_access_destructor() with UACCESS enabled

Force inline all the cleanup helpers like they already are on normal
builds.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net
---
 include/linux/cleanup.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..d1806ac5342c 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -208,7 +208,7 @@
  */
 
 #define DEFINE_FREE(_name, _type, _free) \
-	static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
+	static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
 
 #define __free(_name)	__cleanup(__free_##_name)
 
@@ -220,7 +220,7 @@
 		__val;                      \
 	})
 
-static inline __must_check
+static __always_inline __must_check
 const volatile void * __must_check_fn(const volatile void *val)
 { return val; }
 
@@ -274,16 +274,16 @@ const volatile void * __must_check_fn(const volatile void *val)
 
 #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...)		\
 typedef _type class_##_name##_t;					\
-static inline void class_##_name##_destructor(_type *p)			\
+static __always_inline void class_##_name##_destructor(_type *p)	\
 { _type _T = *p; _exit; }						\
-static inline _type class_##_name##_constructor(_init_args)		\
+static __always_inline _type class_##_name##_constructor(_init_args)	\
 { _type t = _init; return t; }
 
 #define EXTEND_CLASS(_name, ext, _init, _init_args...)			\
 typedef class_##_name##_t class_##_name##ext##_t;			\
-static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\
+static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \
 { class_##_name##_destructor(p); }					\
-static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
+static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 { class_##_name##_t t = _init; return t; }
 
 #define CLASS(_name, var)						\
@@ -347,7 +347,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 	})
 
 #define __DEFINE_GUARD_LOCK_PTR(_name, _exp)                                \
-	static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
 	{                                                                   \
 		void *_ptr = (void *)(__force unsigned long)*(_exp);        \
 		if (IS_ERR(_ptr)) {                                         \
@@ -355,7 +355,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 		}                                                           \
 		return _ptr;                                                \
 	}                                                                   \
-	static inline int class_##_name##_lock_err(class_##_name##_t *_T)   \
+	static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \
 	{                                                                   \
 		long _rc = (__force unsigned long)*(_exp);                  \
 		if (!_rc) {                                                 \
@@ -384,9 +384,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 	EXTEND_CLASS(_name, _ext, \
 		     ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
 		     class_##_name##_t _T) \
-	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
 	{ return class_##_name##_lock_ptr(_T); } \
-	static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+	static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
 	{ return class_##_name##_lock_err(_T); }
 
 /*
@@ -466,7 +466,7 @@ typedef struct {							\
 	__VA_ARGS__;							\
 } class_##_name##_t;							\
 									\
-static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
+static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \
 {									\
 	if (!__GUARD_IS_ERR(_T->lock)) { _unlock; }			\
 }									\
@@ -474,7 +474,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
 __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
 
 #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock)			\
-static inline class_##_name##_t class_##_name##_constructor(_type *l)	\
+static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \
 {									\
 	class_##_name##_t _t = { .lock = l }, *_T = &_t;		\
 	_lock;								\
@@ -482,7 +482,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l)	\
 }
 
 #define __DEFINE_LOCK_GUARD_0(_name, _lock)				\
-static inline class_##_name##_t class_##_name##_constructor(void)	\
+static __always_inline class_##_name##_t class_##_name##_constructor(void) \
 {									\
 	class_##_name##_t _t = { .lock = (void*)1 },			\
 			 *_T __maybe_unused = &_t;			\
@@ -508,9 +508,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock)
 		        if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
 			_t; }),						\
 		     typeof_member(class_##_name##_t, lock) l)		\
-	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
 	{ return class_##_name##_lock_ptr(_T); } \
-	static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+	static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
 	{ return class_##_name##_lock_err(_T); }
 
 #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \
-- 
cgit v1.2.3


From 27cb3de7f43ac0263474d87a2c84d96f904d73e2 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <tonghao@bamaicloud.com>
Date: Tue, 28 Oct 2025 12:32:44 +0800
Subject: net: add net cookie for net device trace events

In a multi-network card or container environment, this is needed in order
to differentiate between trace events relating to net devices that exist
in different network namespaces and share the same name.

Example output for xmit_timeout trace events:
[002] ..s1.  1838.311662: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3
[007] ..s1.  1839.335650: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=4100
[007] ..s1.  1844.455659: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3
[002] ..s1.  1850.087647: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3

Cc: Eran Ben Elisha <eranbe@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Suggested-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Tonghao Zhang <tonghao@bamaicloud.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20251028043244.82288-1-tonghao@bamaicloud.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/net.h | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index d55162c12f90..fdd9ad474ce3 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -35,6 +35,7 @@ TRACE_EVENT(net_dev_start_xmit,
 		__field(	u16,			gso_size	)
 		__field(	u16,			gso_segs	)
 		__field(	u16,			gso_type	)
+		__field(	u64,			net_cookie	)
 	),
 
 	TP_fast_assign(
@@ -57,16 +58,18 @@ TRACE_EVENT(net_dev_start_xmit,
 		__entry->gso_size = skb_shinfo(skb)->gso_size;
 		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
 		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->net_cookie = dev_net(dev)->net_cookie;
 	),
 
-	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x net_cookie=%llu",
 		  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
 		  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
 		  __entry->protocol, __entry->ip_summed, __entry->len,
 		  __entry->data_len,
 		  __entry->network_offset, __entry->transport_offset_valid,
 		  __entry->transport_offset, __entry->tx_flags,
-		  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
+		  __entry->gso_size, __entry->gso_segs,
+		  __entry->gso_type, __entry->net_cookie)
 );
 
 TRACE_EVENT(net_dev_xmit,
@@ -83,17 +86,21 @@ TRACE_EVENT(net_dev_xmit,
 		__field(	unsigned int,	len		)
 		__field(	int,		rc		)
 		__string(	name,		dev->name	)
+		__field(	u64,		net_cookie	)
 	),
 
 	TP_fast_assign(
 		__entry->skbaddr = skb;
 		__entry->len = skb_len;
 		__entry->rc = rc;
+		__entry->net_cookie = dev_net(dev)->net_cookie;
 		__assign_str(name);
 	),
 
-	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
-		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+	TP_printk("dev=%s skbaddr=%p len=%u rc=%d net_cookie=%llu",
+		__get_str(name), __entry->skbaddr,
+		__entry->len, __entry->rc,
+		__entry->net_cookie)
 );
 
 TRACE_EVENT(net_dev_xmit_timeout,
@@ -107,16 +114,19 @@ TRACE_EVENT(net_dev_xmit_timeout,
 		__string(	name,		dev->name	)
 		__string(	driver,		netdev_drivername(dev))
 		__field(	int,		queue_index	)
+		__field(	u64,		net_cookie	)
 	),
 
 	TP_fast_assign(
 		__assign_str(name);
 		__assign_str(driver);
 		__entry->queue_index = queue_index;
+		__entry->net_cookie = dev_net(dev)->net_cookie;
 	),
 
-	TP_printk("dev=%s driver=%s queue=%d",
-		__get_str(name), __get_str(driver), __entry->queue_index)
+	TP_printk("dev=%s driver=%s queue=%d net_cookie=%llu",
+		__get_str(name), __get_str(driver),
+		__entry->queue_index, __entry->net_cookie)
 );
 
 DECLARE_EVENT_CLASS(net_dev_template,
@@ -129,16 +139,20 @@ DECLARE_EVENT_CLASS(net_dev_template,
 		__field(	void *,		skbaddr		)
 		__field(	unsigned int,	len		)
 		__string(	name,		skb->dev->name	)
+		__field(	u64,		net_cookie	)
 	),
 
 	TP_fast_assign(
 		__entry->skbaddr = skb;
 		__entry->len = skb->len;
+		__entry->net_cookie = dev_net(skb->dev)->net_cookie;
 		__assign_str(name);
 	),
 
-	TP_printk("dev=%s skbaddr=%p len=%u",
-		__get_str(name), __entry->skbaddr, __entry->len)
+	TP_printk("dev=%s skbaddr=%p len=%u net_cookie=%llu",
+		__get_str(name), __entry->skbaddr,
+		__entry->len,
+		__entry->net_cookie)
 )
 
 DEFINE_EVENT(net_dev_template, net_dev_queue,
@@ -188,6 +202,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 		__field(	unsigned char,		nr_frags	)
 		__field(	u16,			gso_size	)
 		__field(	u16,			gso_type	)
+		__field(	u64,			net_cookie	)
 	),
 
 	TP_fast_assign(
@@ -214,16 +229,18 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
 		__entry->gso_size = skb_shinfo(skb)->gso_size;
 		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->net_cookie = dev_net(skb->dev)->net_cookie;
 	),
 
-	TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+	TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x net_cookie=%llu",
 		  __get_str(name), __entry->napi_id, __entry->queue_mapping,
 		  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
 		  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
 		  __entry->hash, __entry->l4_hash, __entry->len,
 		  __entry->data_len, __entry->truesize,
 		  __entry->mac_header_valid, __entry->mac_header,
-		  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
+		  __entry->nr_frags, __entry->gso_size,
+		  __entry->gso_type, __entry->net_cookie)
 );
 
 DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,
-- 
cgit v1.2.3


From 4e97bae1b412cd6ed8053b3d8a242122952985cc Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 00:12:40 +0100
Subject: cleanup: fix scoped_class()

This is a class, not a guard so why on earth is it checking for guard
pointers or conditional lock acquisition? None of it makes any sense at
all.

I'm not sure what happened back then. Maybe I had a brief psychedelic
period that I completely forgot about and spaced out into a zone where
that initial macro implementation made any sense at all.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-1-cb3ec8711a6a@kernel.org
Fixes: 5c21c5f22d07 ("cleanup: add a scoped version of CLASS()")
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cleanup.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..19c7e475d3a4 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -290,15 +290,16 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 	class_##_name##_t var __cleanup(class_##_name##_destructor) =	\
 		class_##_name##_constructor
 
-#define scoped_class(_name, var, args)                          \
-	for (CLASS(_name, var)(args);                           \
-	     __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \
-	     ({ goto _label; }))                                \
-		if (0) {                                        \
-_label:                                                         \
-			break;                                  \
+#define __scoped_class(_name, var, _label, args...)        \
+	for (CLASS(_name, var)(args); ; ({ goto _label; })) \
+		if (0) {                                   \
+_label:                                                    \
+			break;                             \
 		} else
 
+#define scoped_class(_name, var, args...) \
+	__scoped_class(_name, var, __UNIQUE_ID(label), args)
+
 /*
  * DEFINE_GUARD(name, type, lock, unlock):
  *	trivial wrapper around DEFINE_CLASS() above specifically
-- 
cgit v1.2.3


From 4c7ceeb62d3330b6fb2b549ae833a92c0f481f3e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 00:12:41 +0100
Subject: cred: add kernel_cred() helper

Access kernel creds based off of init_task. This will let us avoid any
direct access to init_cred.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-2-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 89ae50ad2ace..8ab3718184ad 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -20,6 +20,8 @@
 struct cred;
 struct inode;
 
+extern struct task_struct init_task;
+
 /*
  * COW Supplementary groups list
  */
@@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void);
 extern int commit_creds(struct cred *);
 extern void abort_creds(struct cred *);
 extern struct cred *prepare_kernel_cred(struct task_struct *);
+static inline const struct cred *kernel_cred(void)
+{
+	/* shut up sparse */
+	return rcu_dereference_raw(init_task.cred);
+}
 extern int set_security_override(struct cred *, u32);
 extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
-- 
cgit v1.2.3


From 40314c2818b700da695c9686348be7aef9e156a2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 00:12:42 +0100
Subject: cred: make init_cred static

There's zero need to expose struct init_cred. The very few places that
need access can just go through init_task which is already exported.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-3-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/init_task.h    |  1 -
 init/init_task.c             | 27 +++++++++++++++++++++++++++
 kernel/cred.c                | 27 ---------------------------
 security/keys/process_keys.c |  2 +-
 4 files changed, 28 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bccb3f1f6262..a6cb241ea00c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,7 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 extern struct nsproxy init_nsproxy;
-extern struct cred init_cred;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #define INIT_PREV_CPUTIME(x)	.prev_cputime = {			\
diff --git a/init/init_task.c b/init/init_task.c
index a55e2189206f..d970a847b657 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -62,6 +62,33 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
 };
 #endif
 
+/* init to 2 - one for init_task, one to ensure it is never freed */
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
+
+/*
+ * The initial credentials for the initial task
+ */
+static struct cred init_cred = {
+	.usage			= ATOMIC_INIT(4),
+	.uid			= GLOBAL_ROOT_UID,
+	.gid			= GLOBAL_ROOT_GID,
+	.suid			= GLOBAL_ROOT_UID,
+	.sgid			= GLOBAL_ROOT_GID,
+	.euid			= GLOBAL_ROOT_UID,
+	.egid			= GLOBAL_ROOT_GID,
+	.fsuid			= GLOBAL_ROOT_UID,
+	.fsgid			= GLOBAL_ROOT_GID,
+	.securebits		= SECUREBITS_DEFAULT,
+	.cap_inheritable	= CAP_EMPTY_SET,
+	.cap_permitted		= CAP_FULL_SET,
+	.cap_effective		= CAP_FULL_SET,
+	.cap_bset		= CAP_FULL_SET,
+	.user			= INIT_USER,
+	.user_ns		= &init_user_ns,
+	.group_info		= &init_groups,
+	.ucounts		= &init_ucounts,
+};
+
 /*
  * Set up the first task table, touch at your own risk!. Base=0,
  * limit=0x1fffff (=2MB)
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..ac87ed9d43b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,33 +35,6 @@ do {									\
 
 static struct kmem_cache *cred_jar;
 
-/* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
-
-/*
- * The initial credentials for the initial task
- */
-struct cred init_cred = {
-	.usage			= ATOMIC_INIT(4),
-	.uid			= GLOBAL_ROOT_UID,
-	.gid			= GLOBAL_ROOT_GID,
-	.suid			= GLOBAL_ROOT_UID,
-	.sgid			= GLOBAL_ROOT_GID,
-	.euid			= GLOBAL_ROOT_UID,
-	.egid			= GLOBAL_ROOT_GID,
-	.fsuid			= GLOBAL_ROOT_UID,
-	.fsgid			= GLOBAL_ROOT_GID,
-	.securebits		= SECUREBITS_DEFAULT,
-	.cap_inheritable	= CAP_EMPTY_SET,
-	.cap_permitted		= CAP_FULL_SET,
-	.cap_effective		= CAP_FULL_SET,
-	.cap_bset		= CAP_FULL_SET,
-	.user			= INIT_USER,
-	.user_ns		= &init_user_ns,
-	.group_info		= &init_groups,
-	.ucounts		= &init_ucounts,
-};
-
 /*
  * The RCU callback to actually dispose of a set of credentials
  */
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index b5d5333ab330..a63c46bb2d14 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -51,7 +51,7 @@ static struct key *get_user_register(struct user_namespace *user_ns)
 	if (!reg_keyring) {
 		reg_keyring = keyring_alloc(".user_reg",
 					    user_ns->owner, INVALID_GID,
-					    &init_cred,
+					    kernel_cred(),
 					    KEY_POS_WRITE | KEY_POS_SEARCH |
 					    KEY_USR_VIEW | KEY_USR_READ,
 					    0,
-- 
cgit v1.2.3


From ae40e6c65791f47c76cc14d0cce2707fe6053f72 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 00:12:43 +0100
Subject: cred: add scoped_with_kernel_creds()

Add a new cleanup class for override creds. We can make use of this in a
bunch of places going forward.

Based on this, add scoped_with_kernel_creds(), which can be used to
temporarily assume kernel credentials for specific tasks such as firmware
loading, or coredump socket connections. At no point will the caller
interact with the kernel credentials directly.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-4-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8ab3718184ad..be2cd07b174c 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -187,6 +187,14 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred)
 	return rcu_replace_pointer(current->cred, revert_cred, 1);
 }
 
+DEFINE_CLASS(override_creds,
+	     const struct cred *,
+	     revert_creds(_T),
+	     override_creds(override_cred), const struct cred *override_cred)
+
+#define scoped_with_kernel_creds() \
+	scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred())
+
 /**
  * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
-- 
cgit v1.2.3


From 019e52e8d324d568e71730946beb11e7b275ff08 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 12:26:49 +0100
Subject: cred: add scoped_with_creds() guards

and implement scoped_with_kernel_creds() on top of it.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-1-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index be2cd07b174c..6ea2d81a740b 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -192,8 +192,10 @@ DEFINE_CLASS(override_creds,
 	     revert_creds(_T),
 	     override_creds(override_cred), const struct cred *override_cred)
 
-#define scoped_with_kernel_creds() \
-	scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred())
+#define scoped_with_creds(cred) \
+	scoped_class(override_creds, __UNIQUE_ID(label), cred)
+
+#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred())
 
 /**
  * get_cred_many - Get references on a set of credentials
-- 
cgit v1.2.3


From c8ad3098e1272444b6c75910d6196a36f5c8bc17 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 15:57:27 +0100
Subject: cred: add prepare credential guard

A lot of code uses the following pattern:

* prepare new credentials
* modify them for their use-case
* drop them

Make that pattern easier to support with the new guard infrastructure.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-1-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 6ea2d81a740b..343a140a6ba2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -280,6 +280,11 @@ static inline void put_cred(const struct cred *cred)
 	put_cred_many(cred, 1);
 }
 
+DEFINE_CLASS(prepare_creds,
+	      struct cred *,
+	      if (_T) put_cred(_T),
+	      prepare_creds(), void)
+
 DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T))
 
 /**
-- 
cgit v1.2.3


From ecaba8b7990d8c6d8ba097cd4499b3b92d9df6ea Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Tue, 4 Nov 2025 12:13:12 +0800
Subject: ASoC: tas2781: Add tas5822 support

TAS5822 has on-chip DSP without current/voltage feedback.

Signed-off-by: Baojun Xu <baojun.xu@ti.com>
Link: https://patch.msgid.link/20251104041314.792-1-baojun.xu@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781.h        | 1 +
 sound/soc/codecs/tas2781-i2c.c | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index 0fbcdb15c74b..c3b4c43dd2bf 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -122,6 +122,7 @@ enum audio_device {
 	TAS2781,
 	TAS5802,
 	TAS5815,
+	TAS5822,
 	TAS5825,
 	TAS5827,
 	TAS5828,
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index 0f41951e7578..8f853310649c 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -110,6 +110,7 @@ static const struct i2c_device_id tasdevice_id[] = {
 	{ "tas2781", TAS2781 },
 	{ "tas5802", TAS5802 },
 	{ "tas5815", TAS5815 },
+	{ "tas5822", TAS5822 },
 	{ "tas5825", TAS5825 },
 	{ "tas5827", TAS5827 },
 	{ "tas5828", TAS5828 },
@@ -129,6 +130,7 @@ static const struct of_device_id tasdevice_of_match[] = {
 	{ .compatible = "ti,tas2781" },
 	{ .compatible = "ti,tas5802" },
 	{ .compatible = "ti,tas5815" },
+	{ .compatible = "ti,tas5822" },
 	{ .compatible = "ti,tas5825" },
 	{ .compatible = "ti,tas5827" },
 	{ .compatible = "ti,tas5828" },
@@ -1669,7 +1671,7 @@ static void tasdevice_fw_ready(const struct firmware *fmw,
 	tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK;
 
 	/* There is no calibration required for
-	 * TAS5802/TAS5815/TAS5825/TAS5827/TAS5828.
+	 * TAS5802/TAS5815/TAS5822/TAS5825/TAS5827/TAS5828.
 	 */
 	if (tas_priv->chip_id < TAS5802) {
 		ret = tasdevice_create_cali_ctrls(tas_priv);
@@ -1727,6 +1729,7 @@ out:
 		case TAS2781:
 		case TAS5802:
 		case TAS5815:
+		case TAS5822:
 		case TAS5825:
 		case TAS5827:
 		case TAS5828:
@@ -1892,6 +1895,7 @@ static int tasdevice_codec_probe(struct snd_soc_component *codec)
 		break;
 	case TAS5802:
 	case TAS5815:
+	case TAS5822:
 	case TAS5825:
 	case TAS5827:
 	case TAS5828:
@@ -2068,6 +2072,7 @@ static const struct acpi_device_id tasdevice_acpi_match[] = {
 	{ "TXNW2781", TAS2781 },
 	{ "TXNW5802", TAS5802 },
 	{ "TXNW5815", TAS5815 },
+	{ "TXNW5822", TAS5822 },
 	{ "TXNW5825", TAS5825 },
 	{ "TXNW5827", TAS5827 },
 	{ "TXNW5828", TAS5828 },
-- 
cgit v1.2.3


From 30ed05adca4a05c50594384cff18910858dd1d35 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 30 Oct 2025 08:06:46 +0800
Subject: xsk: use a smaller new lock for shared pool case

- Split cq_lock into two smaller locks: cq_prod_lock and
  cq_cached_prod_lock
- Avoid disabling/enabling interrupts in the hot xmit path

In either xsk_cq_cancel_locked() or xsk_cq_reserve_locked() function,
the race condition is only between multiple xsks sharing the same
pool. They are all in the process context rather than interrupt context,
so now the small lock named cq_cached_prod_lock can be used without
handling interrupts.

While cq_cached_prod_lock ensures the exclusive modification of
@cached_prod, cq_prod_lock in xsk_cq_submit_addr_locked() only cares
about @producer and corresponding @desc. Neither of them necessarily has
to be consistent with @cached_prod protected by cq_cached_prod_lock.
That's the reason why the previous big lock can be split into two
smaller ones. Please note that SPSC rule is all about the global state
of producer and consumer that can affect both layers instead of local
or cached ones.

Frequently disabling and enabling interrupts is very time consuming
in some cases, especially at a per-descriptor granularity, which now
can be avoided after this optimization, even when the pool is shared by
multiple xsks.

With this patch, the performance number[1] could go from 1,872,565 pps
to 1,961,009 pps. It's a minor rise of around 5%.

[1]: taskset -c 1 ./xdpsock -i enp2s0f1 -q 0 -t -S -s 64

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20251030000646.18859-3-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/xsk_buff_pool.h | 13 +++++++++----
 net/xdp/xsk.c               | 15 ++++++---------
 net/xdp/xsk_buff_pool.c     |  3 ++-
 3 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index cac56e6b0869..92a2358c6ce3 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -85,11 +85,16 @@ struct xsk_buff_pool {
 	bool unaligned;
 	bool tx_sw_csum;
 	void *addrs;
-	/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
-	 * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
-	 * sockets share a single cq when the same netdev and queue id is shared.
+	/* Mutual exclusion of the completion ring in the SKB mode.
+	 * Protect: NAPI TX thread and sendmsg error paths in the SKB
+	 * destructor callback.
 	 */
-	spinlock_t cq_lock;
+	spinlock_t cq_prod_lock;
+	/* Mutual exclusion of the completion ring in the SKB mode.
+	 * Protect: when sockets share a single cq when the same netdev
+	 * and queue id is shared.
+	 */
+	spinlock_t cq_cached_prod_lock;
 	struct xdp_buff_xsk *free_heads[];
 };
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..2f26c918d448 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -548,12 +548,11 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 
 static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
-	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&pool->cq_lock, flags);
+	spin_lock(&pool->cq_cached_prod_lock);
 	ret = xskq_prod_reserve(pool->cq);
-	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	spin_unlock(&pool->cq_cached_prod_lock);
 
 	return ret;
 }
@@ -566,7 +565,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 	unsigned long flags;
 	u32 idx;
 
-	spin_lock_irqsave(&pool->cq_lock, flags);
+	spin_lock_irqsave(&pool->cq_prod_lock, flags);
 	idx = xskq_get_prod(pool->cq);
 
 	xskq_prod_write_addr(pool->cq, idx,
@@ -583,16 +582,14 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 		}
 	}
 	xskq_prod_submit_n(pool->cq, descs_processed);
-	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
 }
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&pool->cq_lock, flags);
+	spin_lock(&pool->cq_cached_prod_lock);
 	xskq_prod_cancel_n(pool->cq, n);
-	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	spin_unlock(&pool->cq_cached_prod_lock);
 }
 
 static void xsk_inc_num_desc(struct sk_buff *skb)
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 309075050b2a..00a4eddaa0cd 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -90,7 +90,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	INIT_LIST_HEAD(&pool->xskb_list);
 	INIT_LIST_HEAD(&pool->xsk_tx_list);
 	spin_lock_init(&pool->xsk_tx_list_lock);
-	spin_lock_init(&pool->cq_lock);
+	spin_lock_init(&pool->cq_prod_lock);
+	spin_lock_init(&pool->cq_cached_prod_lock);
 	refcount_set(&pool->users, 1);
 
 	pool->fq = xs->fq_tmp;
-- 
cgit v1.2.3


From ec7f31b2a2d3bf6b9e4d4b8cd156587f1d0607d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Nov 2025 05:16:45 -0500
Subject: block: make bio auto-integrity deadlock safe

The current block layer automatic integrity protection allocates the
actual integrity buffer, which has three problems:

 - because it happens at the bottom of the I/O stack and doesn't use a
   mempool it can deadlock under load
 - because the data size in a bio is almost unbounded when using large
   folios it can relatively easily exceed the maximum kmalloc size
 - even when it does not exceed the maximum kmalloc size, it could
   exceed the maximum segment size of the device

Fix this by limiting the I/O size so that we can allocate at least a
2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity
intervals, and create a mempool as a last resort for this maximum size,
mirroring the scheme used for bvecs.  As a nice upside none of this
can fail now, so we remove the error handling and open code the
trivial addition of the bip vec.

The new allocation helpers sit outside of bio-integrity-auto.c because
I plan to reuse them for file system based PI in the near future.

Fixes: 7ba1ba12eeef ("block: Block layer data integrity support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity-auto.c    | 22 +++-----------------
 block/bio-integrity.c         | 48 +++++++++++++++++++++++++++++++++++++++++++
 block/blk-settings.c          | 21 +++++++++++++++++++
 include/linux/bio-integrity.h |  6 ++++++
 include/linux/blk-integrity.h |  5 +++++
 5 files changed, 83 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index 2f4a244749ac..9850c338548d 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -29,7 +29,7 @@ static void bio_integrity_finish(struct bio_integrity_data *bid)
 {
 	bid->bio->bi_integrity = NULL;
 	bid->bio->bi_opf &= ~REQ_INTEGRITY;
-	kfree(bvec_virt(bid->bip.bip_vec));
+	bio_integrity_free_buf(&bid->bip);
 	mempool_free(bid, &bid_pool);
 }
 
@@ -110,8 +110,6 @@ bool bio_integrity_prep(struct bio *bio)
 	struct bio_integrity_data *bid;
 	bool set_flags = true;
 	gfp_t gfp = GFP_NOIO;
-	unsigned int len;
-	void *buf;
 
 	if (!bi)
 		return true;
@@ -152,17 +150,12 @@ bool bio_integrity_prep(struct bio *bio)
 	if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
 		return true;
 
-	/* Allocate kernel buffer for protection data */
-	len = bio_integrity_bytes(bi, bio_sectors(bio));
-	buf = kmalloc(len, gfp);
-	if (!buf)
-		goto err_end_io;
 	bid = mempool_alloc(&bid_pool, GFP_NOIO);
 	bio_integrity_init(bio, &bid->bip, &bid->bvec, 1);
-
 	bid->bio = bio;
-
 	bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
+	bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO);
+
 	bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
 
 	if (set_flags) {
@@ -174,21 +167,12 @@ bool bio_integrity_prep(struct bio *bio)
 			bid->bip.bip_flags |= BIP_CHECK_REFTAG;
 	}
 
-	if (bio_integrity_add_page(bio, virt_to_page(buf), len,
-			offset_in_page(buf)) < len)
-		goto err_end_io;
-
 	/* Auto-generate integrity metadata if this is a write */
 	if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
 		blk_integrity_generate(bio);
 	else
 		bid->saved_bio_iter = bio->bi_iter;
 	return true;
-
-err_end_io:
-	bio->bi_status = BLK_STS_RESOURCE;
-	bio_endio(bio);
-	return false;
 }
 EXPORT_SYMBOL(bio_integrity_prep);
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index bed26f1ec869..09eeaf6e74b8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -14,6 +14,45 @@ struct bio_integrity_alloc {
 	struct bio_vec			bvecs[];
 };
 
+static mempool_t integrity_buf_pool;
+
+void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer)
+{
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	unsigned int len = bio_integrity_bytes(bi, bio_sectors(bio));
+	gfp_t gfp = GFP_NOIO | (zero_buffer ? __GFP_ZERO : 0);
+	void *buf;
+
+	buf = kmalloc(len, (gfp & ~__GFP_DIRECT_RECLAIM) |
+			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN);
+	if (unlikely(!buf)) {
+		struct page *page;
+
+		page = mempool_alloc(&integrity_buf_pool, GFP_NOFS);
+		if (zero_buffer)
+			memset(page_address(page), 0, len);
+		bvec_set_page(&bip->bip_vec[0], page, len, 0);
+		bip->bip_flags |= BIP_MEMPOOL;
+	} else {
+		bvec_set_page(&bip->bip_vec[0], virt_to_page(buf), len,
+				offset_in_page(buf));
+	}
+
+	bip->bip_vcnt = 1;
+	bip->bip_iter.bi_size = len;
+}
+
+void bio_integrity_free_buf(struct bio_integrity_payload *bip)
+{
+	struct bio_vec *bv = &bip->bip_vec[0];
+
+	if (bip->bip_flags & BIP_MEMPOOL)
+		mempool_free(bv->bv_page, &integrity_buf_pool);
+	else
+		kfree(bvec_virt(bv));
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -438,3 +477,12 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 
 	return 0;
 }
+
+static int __init bio_integrity_initfn(void)
+{
+	if (mempool_init_page_pool(&integrity_buf_pool, BIO_POOL_SIZE,
+			get_order(BLK_INTEGRITY_MAX_SIZE)))
+		panic("bio: can't create integrity buf pool\n");
+	return 0;
+}
+subsys_initcall(bio_integrity_initfn);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 345b6a271cc3..e0d0b035f39d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -123,6 +123,19 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
 	return 0;
 }
 
+/*
+ * Maximum size of I/O that needs a block layer integrity buffer.  Limited
+ * by the number of intervals for which we can fit the integrity buffer into
+ * the buffer size.  Because the buffer is a single segment it is also limited
+ * by the maximum segment size.
+ */
+static inline unsigned int max_integrity_io_size(struct queue_limits *lim)
+{
+	return min_t(unsigned int, lim->max_segment_size,
+		(BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) <<
+			lim->integrity.interval_exp);
+}
+
 static int blk_validate_integrity_limits(struct queue_limits *lim)
 {
 	struct blk_integrity *bi = &lim->integrity;
@@ -184,6 +197,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
 	if (!bi->interval_exp)
 		bi->interval_exp = ilog2(lim->logical_block_size);
 
+	/*
+	 * The block layer automatically adds integrity data for bios that don't
+	 * already have it.  Limit the I/O size so that a single maximum size
+	 * metadata segment can cover the integrity data for the entire I/O.
+	 */
+	lim->max_sectors = min(lim->max_sectors,
+		max_integrity_io_size(lim) >> SECTOR_SHIFT);
+
 	return 0;
 }
 
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index 851254f36eb3..3d05296a5afe 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -14,6 +14,8 @@ enum bip_flags {
 	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
 	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
 	BIP_P2P_DMA		= 1 << 8, /* using P2P address */
+
+	BIP_MEMPOOL		= 1 << 15, /* buffer backed by mempool */
 };
 
 struct bio_integrity_payload {
@@ -140,4 +142,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer);
+void bio_integrity_free_buf(struct bio_integrity_payload *bip);
+
 #endif /* _LINUX_BIO_INTEGRITY_H */
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index b659373788f6..c2030fd8ba0a 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -8,6 +8,11 @@
 
 struct request;
 
+/*
+ * Maximum contiguous integrity buffer allocation.
+ */
+#define BLK_INTEGRITY_MAX_SIZE		SZ_2M
+
 enum blk_integrity_flags {
 	BLK_INTEGRITY_NOVERIFY		= 1 << 0,
 	BLK_INTEGRITY_NOGENERATE	= 1 << 1,
-- 
cgit v1.2.3


From 1b6aa81c85621d6b55099906585ff09a477203b8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 3 Nov 2025 11:50:15 +0000
Subject: net: stmmac: add support for configuring the phy_intf_sel inputs

When dwmac is synthesised with support for multiple PHY interfaces, the
core provides phy_intf_sel inputs, sampled on reset, to configure the
PHY facing interface. Use stmmac_get_phy_intf_sel() in core code to
determine the dwmac phy_intf_sel input value, and provide a new
platform method called with this value just before we issue a soft
reset to the dwmac core.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4h-0000000Chos-3wxX@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 34 +++++++++++++++++++++++
 include/linux/stmmac.h                            |  1 +
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 6d4323d04573..ccf383b355e7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3102,6 +3102,36 @@ int stmmac_get_phy_intf_sel(phy_interface_t interface)
 }
 EXPORT_SYMBOL_GPL(stmmac_get_phy_intf_sel);
 
+static int stmmac_prereset_configure(struct stmmac_priv *priv)
+{
+	struct plat_stmmacenet_data *plat_dat = priv->plat;
+	phy_interface_t interface;
+	int phy_intf_sel, ret;
+
+	if (!plat_dat->set_phy_intf_sel)
+		return 0;
+
+	interface = plat_dat->phy_interface;
+	phy_intf_sel = stmmac_get_phy_intf_sel(interface);
+	if (phy_intf_sel < 0) {
+		netdev_err(priv->dev,
+			   "failed to get phy_intf_sel for %s: %pe\n",
+			   phy_modes(interface), ERR_PTR(phy_intf_sel));
+		return phy_intf_sel;
+	}
+
+	ret = plat_dat->set_phy_intf_sel(plat_dat->bsp_priv, phy_intf_sel);
+	if (ret == -EINVAL)
+		netdev_err(priv->dev, "platform does not support %s\n",
+			   phy_modes(interface));
+	else if (ret < 0)
+		netdev_err(priv->dev,
+			   "platform failed to set interface %s: %pe\n",
+			   phy_modes(interface), ERR_PTR(ret));
+
+	return ret;
+}
+
 /**
  * stmmac_init_dma_engine - DMA init.
  * @priv: driver private structure
@@ -3128,6 +3158,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
 		priv->plat->dma_cfg->atds = 1;
 
+	ret = stmmac_prereset_configure(priv);
+	if (ret)
+		return ret;
+
 	ret = stmmac_reset(priv, priv->ioaddr);
 	if (ret) {
 		netdev_err(priv->dev, "Failed to reset the dma\n");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 151c81c560c8..48e9f1d4e17e 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -250,6 +250,7 @@ struct plat_stmmacenet_data {
 	struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES];
 	void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv,
 			       unsigned long *interfaces);
+	int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel);
 	int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i,
 			       phy_interface_t interface, int speed);
 	void (*fix_mac_speed)(void *priv, int speed, unsigned int mode);
-- 
cgit v1.2.3


From f88191c7f3618405f1fc5c331a94ebfe601c5b08 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Sat, 1 Nov 2025 18:56:51 +0100
Subject: mptcp: pm: in-kernel: record fullmesh endp nb

Instead of iterating over all endpoints, under RCU read lock, just to
check if one of them has the fullmesh flag, we can keep a counter of
fullmesh endpoints, similar to what is done with the other flags.

This counter is now checked, before iterating over all endpoints.

Similar to the other counters, this new one is also exposed. A userspace
app can then know when it is being used in a fullmesh mode, with
potentially (too) many subflows.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-1-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/mptcp.h |  3 ++-
 net/mptcp/pm_kernel.c      | 38 +++++++++++++++++++++++++++++++++++---
 net/mptcp/protocol.h       |  1 +
 net/mptcp/sockopt.c        |  2 ++
 4 files changed, 40 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 87cfab874e24..04eea6d1d0a9 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -70,7 +70,8 @@ struct mptcp_info {
 	__u64	mptcpi_bytes_acked;
 	__u8	mptcpi_subflows_total;
 	__u8	mptcpi_endp_laminar_max;
-	__u8	reserved[2];
+	__u8	mptcpi_endp_fullmesh_max;
+	__u8	reserved;
 	__u32	mptcpi_last_data_sent;
 	__u32	mptcpi_last_data_recv;
 	__u32	mptcpi_last_ack_recv;
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index 2ae95476dba3..e2918c68ff02 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -22,6 +22,7 @@ struct pm_nl_pernet {
 	u8			endp_signal_max;
 	u8			endp_subflow_max;
 	u8			endp_laminar_max;
+	u8			endp_fullmesh_max;
 	u8			limit_add_addr_accepted;
 	u8			limit_extra_subflows;
 	u8			next_id;
@@ -70,6 +71,14 @@ u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk)
 }
 EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max);
 
+u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk)
+{
+	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+	return READ_ONCE(pernet->endp_fullmesh_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_fullmesh_max);
+
 u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
 {
 	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
@@ -603,9 +612,12 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
 	int i;
 
 	/* If there is at least one MPTCP endpoint with a fullmesh flag */
-	i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case);
-	if (i)
-		return i;
+	if (mptcp_pm_get_endp_fullmesh_max(msk)) {
+		i = fill_local_addresses_vec_fullmesh(msk, remote, locals,
+						      c_flag_case);
+		if (i)
+			return i;
+	}
 
 	/* If there is at least one MPTCP endpoint with a laminar flag */
 	if (mptcp_pm_get_endp_laminar_max(msk))
@@ -790,6 +802,10 @@ find_next:
 		addr_max = pernet->endp_laminar_max;
 		WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1);
 	}
+	if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+		addr_max = pernet->endp_fullmesh_max;
+		WRITE_ONCE(pernet->endp_fullmesh_max, addr_max + 1);
+	}
 
 	pernet->endpoints++;
 	if (!entry->addr.port)
@@ -1187,6 +1203,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
 		addr_max = pernet->endp_laminar_max;
 		WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1);
 	}
+	if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+		addr_max = pernet->endp_fullmesh_max;
+		WRITE_ONCE(pernet->endp_fullmesh_max, addr_max - 1);
+	}
 
 	pernet->endpoints--;
 	list_del_rcu(&entry->list);
@@ -1502,6 +1522,18 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local,
 	changed = (local->flags ^ entry->flags) & mask;
 	entry->flags = (entry->flags & ~mask) | (local->flags & mask);
 	*local = *entry;
+
+	if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+		u8 addr_max = pernet->endp_fullmesh_max;
+
+		if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)
+			addr_max++;
+		else
+			addr_max--;
+
+		WRITE_ONCE(pernet->endp_fullmesh_max, addr_max);
+	}
+
 	spin_unlock_bh(&pernet->lock);
 
 	mptcp_pm_nl_set_flags_all(net, local, changed);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 379a88e14e8d..9a3429175758 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1183,6 +1183,7 @@ void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
 u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index a28a48385885..de90a2897d2d 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -982,6 +982,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 			mptcp_pm_get_endp_subflow_max(msk);
 		info->mptcpi_endp_laminar_max =
 			mptcp_pm_get_endp_laminar_max(msk);
+		info->mptcpi_endp_fullmesh_max =
+			mptcp_pm_get_endp_fullmesh_max(msk);
 	}
 
 	if (__mptcp_check_fallback(msk))
-- 
cgit v1.2.3


From 617a0dd24ef2b4e6240df48b1fbac1c3ebfa9282 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 3 Nov 2025 23:26:49 +0100
Subject: net: phy: make phy_device members pause and asym_pause bitfield bits

We can reduce the size of struct phy_device a little by switching
the type of members pause and asym_pause from int to a single bit.
As C99 is supported now, we can use type bool for the bitfield members,
which provides us with the benefit of the usual implicit bool conversions.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/764e9a31-b40b-4dc9-b808-118192a16d87@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-c45.c    | 20 ++++++++++----------
 drivers/net/phy/phy_device.c | 16 ++++++++--------
 include/linux/phy.h          |  4 ++--
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 61670be0f095..1a7b32be4625 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -485,8 +485,8 @@ static int genphy_c45_baset1_read_lpa(struct phy_device *phydev)
 		mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, 0);
 		mii_t1_adv_m_mod_linkmode_t(phydev->lp_advertising, 0);
 
-		phydev->pause = 0;
-		phydev->asym_pause = 0;
+		phydev->pause = false;
+		phydev->asym_pause = false;
 
 		return 0;
 	}
@@ -498,8 +498,8 @@ static int genphy_c45_baset1_read_lpa(struct phy_device *phydev)
 		return val;
 
 	mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, val);
-	phydev->pause = val & MDIO_AN_T1_ADV_L_PAUSE_CAP ? 1 : 0;
-	phydev->asym_pause = val & MDIO_AN_T1_ADV_L_PAUSE_ASYM ? 1 : 0;
+	phydev->pause = val & MDIO_AN_T1_ADV_L_PAUSE_CAP;
+	phydev->asym_pause = val & MDIO_AN_T1_ADV_L_PAUSE_ASYM;
 
 	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_LP_M);
 	if (val < 0)
@@ -536,8 +536,8 @@ int genphy_c45_read_lpa(struct phy_device *phydev)
 				   phydev->lp_advertising);
 		mii_10gbt_stat_mod_linkmode_lpa_t(phydev->lp_advertising, 0);
 		mii_adv_mod_linkmode_adv_t(phydev->lp_advertising, 0);
-		phydev->pause = 0;
-		phydev->asym_pause = 0;
+		phydev->pause = false;
+		phydev->asym_pause = false;
 
 		return 0;
 	}
@@ -551,8 +551,8 @@ int genphy_c45_read_lpa(struct phy_device *phydev)
 		return val;
 
 	mii_adv_mod_linkmode_adv_t(phydev->lp_advertising, val);
-	phydev->pause = val & LPA_PAUSE_CAP ? 1 : 0;
-	phydev->asym_pause = val & LPA_PAUSE_ASYM ? 1 : 0;
+	phydev->pause = val & LPA_PAUSE_CAP;
+	phydev->asym_pause = val & LPA_PAUSE_ASYM;
 
 	/* Read the link partner's 10G advertisement */
 	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_10GBT_STAT);
@@ -1171,8 +1171,8 @@ int genphy_c45_read_status(struct phy_device *phydev)
 
 	phydev->speed = SPEED_UNKNOWN;
 	phydev->duplex = DUPLEX_UNKNOWN;
-	phydev->pause = 0;
-	phydev->asym_pause = 0;
+	phydev->pause = false;
+	phydev->asym_pause = false;
 
 	if (phydev->autoneg == AUTONEG_ENABLE) {
 		ret = genphy_c45_read_lpa(phydev);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 737747cf1906..81984d4ebb7c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -825,8 +825,8 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 
 	dev->speed = SPEED_UNKNOWN;
 	dev->duplex = DUPLEX_UNKNOWN;
-	dev->pause = 0;
-	dev->asym_pause = 0;
+	dev->pause = false;
+	dev->asym_pause = false;
 	dev->link = 0;
 	dev->port = PORT_TP;
 	dev->interface = PHY_INTERFACE_MODE_GMII;
@@ -2092,8 +2092,8 @@ int genphy_setup_forced(struct phy_device *phydev)
 {
 	u16 ctl;
 
-	phydev->pause = 0;
-	phydev->asym_pause = 0;
+	phydev->pause = false;
+	phydev->asym_pause = false;
 
 	ctl = mii_bmcr_encode_fixed(phydev->speed, phydev->duplex);
 
@@ -2500,8 +2500,8 @@ int genphy_read_status(struct phy_device *phydev)
 	phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED;
 	phydev->speed = SPEED_UNKNOWN;
 	phydev->duplex = DUPLEX_UNKNOWN;
-	phydev->pause = 0;
-	phydev->asym_pause = 0;
+	phydev->pause = false;
+	phydev->asym_pause = false;
 
 	if (phydev->is_gigabit_capable) {
 		err = genphy_read_master_slave(phydev);
@@ -2554,8 +2554,8 @@ int genphy_c37_read_status(struct phy_device *phydev, bool *changed)
 	/* Signal link has changed */
 	*changed = true;
 	phydev->duplex = DUPLEX_UNKNOWN;
-	phydev->pause = 0;
-	phydev->asym_pause = 0;
+	phydev->pause = false;
+	phydev->asym_pause = false;
 
 	if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) {
 		lpa = phy_read(phydev, MII_LPA);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e3474f03cbc1..d145a200ea21 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -666,6 +666,8 @@ struct phy_device {
 	/* The most recently read link state */
 	unsigned link:1;
 	unsigned autoneg_complete:1;
+	bool pause:1;
+	bool asym_pause:1;
 
 	/* Interrupts are enabled */
 	unsigned interrupts:1;
@@ -690,8 +692,6 @@ struct phy_device {
 	int speed;
 	int duplex;
 	int port;
-	int pause;
-	int asym_pause;
 	u8 master_slave_get;
 	u8 master_slave_set;
 	u8 master_slave_state;
-- 
cgit v1.2.3


From c9445e3c087656e01d0160a48f90389856baf368 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 Oct 2025 22:41:19 +0100
Subject: net: phy: fixed_phy: add helper fixed_phy_register_100fd

In a few places a 100FD fixed PHY is used. Create a helper so that users
don't have to define the struct fixed_phy_status.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/bf564b19-e9bc-4896-aeae-9f721cc4fecd@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/fixed_phy.c | 12 ++++++++++++
 include/linux/phy_fixed.h   |  6 ++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 0e1b28f06f18..bdc3a4bffede 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -227,6 +227,18 @@ struct phy_device *fixed_phy_register(const struct fixed_phy_status *status,
 }
 EXPORT_SYMBOL_GPL(fixed_phy_register);
 
+struct phy_device *fixed_phy_register_100fd(void)
+{
+	static const struct fixed_phy_status status = {
+		.link	= 1,
+		.speed	= SPEED_100,
+		.duplex	= DUPLEX_FULL,
+	};
+
+	return fixed_phy_register(&status, NULL);
+}
+EXPORT_SYMBOL_GPL(fixed_phy_register_100fd);
+
 void fixed_phy_unregister(struct phy_device *phy)
 {
 	phy_device_remove(phy);
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index d17ff750c708..08275ef64147 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -20,6 +20,7 @@ extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
 void fixed_phy_add(const struct fixed_phy_status *status);
 struct phy_device *fixed_phy_register(const struct fixed_phy_status *status,
 				      struct device_node *np);
+struct phy_device *fixed_phy_register_100fd(void);
 
 extern void fixed_phy_unregister(struct phy_device *phydev);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
@@ -34,6 +35,11 @@ fixed_phy_register(const struct fixed_phy_status *status,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct phy_device *fixed_phy_register_100fd(void)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void fixed_phy_unregister(struct phy_device *phydev)
 {
 }
-- 
cgit v1.2.3


From 5de9ea1c50f061892625388880e83fdc50a4ef66 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 Oct 2025 22:46:32 +0100
Subject: net: phy: fixed_phy: remove fixed_phy_add

fixed_phy_add() has a number of problems/disadvantages:
- It uses phy address 0 w/o checking whether a fixed phy with this
  address exists already.
- A subsequent call to fixed_phy_register() would also use phy address 0,
  because fixed_phy_add() doesn't mark it as used.
- fixed_phy_add() is used from platform code, therefore requires that
  fixed_phy code is built-in.

Now that for the only two users (coldfire/5272 and bcm47xx) fixed_phy
creation has been moved to the respective ethernet driver (fec, b44),
we can remove fixed_phy_add().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/bee046a1-1e77-4057-8b04-fdb2a1bbbd08@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/fixed_phy.c | 6 ------
 include/linux/phy_fixed.h   | 2 --
 2 files changed, 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index bdc3a4bffede..d498d8a9bba6 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -131,12 +131,6 @@ static int __fixed_phy_add(int phy_addr,
 	return 0;
 }
 
-void fixed_phy_add(const struct fixed_phy_status *status)
-{
-	__fixed_phy_add(0, status);
-}
-EXPORT_SYMBOL_GPL(fixed_phy_add);
-
 static DEFINE_IDA(phy_fixed_ida);
 
 static void fixed_phy_del(int phy_addr)
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 08275ef64147..8bade999831c 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -17,7 +17,6 @@ struct net_device;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
-void fixed_phy_add(const struct fixed_phy_status *status);
 struct phy_device *fixed_phy_register(const struct fixed_phy_status *status,
 				      struct device_node *np);
 struct phy_device *fixed_phy_register_100fd(void);
@@ -27,7 +26,6 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
 					   struct fixed_phy_status *));
 #else
-static inline void fixed_phy_add(const struct fixed_phy_status *status) {}
 static inline struct phy_device *
 fixed_phy_register(const struct fixed_phy_status *status,
 		   struct device_node *np)
-- 
cgit v1.2.3


From bf33247a90d3e85d53a9b55bb276b725456ff0bf Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:09 -0800
Subject: net: Add struct sockaddr_unsized for sockaddr of unknown length

Add flexible sockaddr structure to support addresses longer than the
traditional 14-byte struct sockaddr::sa_data limitation without
requiring the full 128-byte sa_data of struct sockaddr_storage. This
allows the network APIs to pass around a pointer to an object that
isn't lying to the compiler about how big it is, but must be accompanied
by its actual size as an additional parameter.

It's possible we may want to migrate to including the size with the
struct in the future, e.g.:

struct sockaddr_unsized {
	u16 sa_data_len;
	u16 sa_family;
	u8  sa_data[] __counted_by(sa_data_len);
};

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-1-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3b262487ec06..7b1a01be29da 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -40,6 +40,23 @@ struct sockaddr {
 	};
 };
 
+/**
+ * struct sockaddr_unsized - Unspecified size sockaddr for callbacks
+ * @sa_family: Address family (AF_UNIX, AF_INET, AF_INET6, etc.)
+ * @sa_data: Flexible array for address data
+ *
+ * This structure is designed for callback interfaces where the
+ * total size is known via the sockaddr_len parameter. Unlike struct
+ * sockaddr which has a fixed 14-byte sa_data limit or struct
+ * sockaddr_storage which has a fixed 128-byte sa_data limit, this
+ * structure can accommodate addresses of any size, but must be used
+ * carefully.
+ */
+struct sockaddr_unsized {
+	__kernel_sa_family_t	sa_family;	/* address family, AF_xxx */
+	char			sa_data[];	/* flexible address data */
+};
+
 struct linger {
 	int		l_onoff;	/* Linger active		*/
 	int		l_linger;	/* How long to linger for	*/
-- 
cgit v1.2.3


From 0e50474fa514822e9d990874e554bf8043a201d7 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:10 -0800
Subject: net: Convert proto_ops bind() callbacks to use sockaddr_unsized

Update all struct proto_ops bind() callback function prototypes from
"struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the
compiler about object sizes. Calls into struct proto handlers gain casts
that will be removed in the struct proto conversion patch.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-2-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 crypto/af_alg.c                                      |  2 +-
 drivers/block/drbd/drbd_receiver.c                   |  4 ++--
 drivers/infiniband/hw/erdma/erdma_cm.c               |  4 ++--
 drivers/infiniband/sw/siw/siw_cm.c                   |  6 +++---
 drivers/isdn/mISDN/l1oip_core.c                      |  2 +-
 drivers/isdn/mISDN/socket.c                          |  4 ++--
 drivers/net/ppp/pptp.c                               |  4 ++--
 drivers/nvme/host/tcp.c                              |  2 +-
 drivers/nvme/target/tcp.c                            |  2 +-
 drivers/target/iscsi/iscsi_target_login.c            |  2 +-
 drivers/xen/pvcalls-back.c                           |  2 +-
 fs/afs/rxrpc.c                                       |  6 +++---
 fs/dlm/lowcomms.c                                    |  6 +++---
 fs/ocfs2/cluster/tcp.c                               |  4 ++--
 fs/smb/client/connect.c                              |  2 +-
 fs/smb/server/transport_tcp.c                        |  4 ++--
 include/linux/net.h                                  |  4 ++--
 include/net/inet_common.h                            |  2 +-
 include/net/ipv6.h                                   |  2 +-
 include/net/sock.h                                   |  2 +-
 net/9p/trans_fd.c                                    |  2 +-
 net/appletalk/ddp.c                                  |  2 +-
 net/atm/pvc.c                                        |  4 ++--
 net/atm/svc.c                                        |  2 +-
 net/ax25/af_ax25.c                                   |  2 +-
 net/bluetooth/hci_sock.c                             |  2 +-
 net/bluetooth/iso.c                                  |  4 ++--
 net/bluetooth/l2cap_sock.c                           |  2 +-
 net/bluetooth/rfcomm/core.c                          |  4 ++--
 net/bluetooth/rfcomm/sock.c                          |  2 +-
 net/bluetooth/sco.c                                  |  2 +-
 net/can/isotp.c                                      |  2 +-
 net/can/j1939/socket.c                               |  2 +-
 net/can/raw.c                                        |  2 +-
 net/core/sock.c                                      |  2 +-
 net/ieee802154/socket.c                              |  4 ++--
 net/ipv4/af_inet.c                                   |  4 ++--
 net/ipv4/udp_tunnel_core.c                           |  2 +-
 net/ipv6/af_inet6.c                                  |  4 ++--
 net/ipv6/ip6_udp_tunnel.c                            |  2 +-
 net/iucv/af_iucv.c                                   |  2 +-
 net/l2tp/l2tp_core.c                                 |  4 ++--
 net/llc/af_llc.c                                     |  2 +-
 net/mctp/af_mctp.c                                   |  2 +-
 net/mctp/test/route-test.c                           |  2 +-
 net/mptcp/protocol.c                                 |  6 +++---
 net/mptcp/subflow.c                                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c                      |  4 ++--
 net/netlink/af_netlink.c                             |  2 +-
 net/netrom/af_netrom.c                               |  2 +-
 net/nfc/llcp_sock.c                                  |  4 ++--
 net/packet/af_packet.c                               | 11 ++++++-----
 net/phonet/socket.c                                  |  8 ++++----
 net/qrtr/af_qrtr.c                                   |  2 +-
 net/qrtr/ns.c                                        |  2 +-
 net/rds/bind.c                                       |  2 +-
 net/rds/rds.h                                        |  2 +-
 net/rds/tcp_connect.c                                |  2 +-
 net/rds/tcp_listen.c                                 |  2 +-
 net/rose/af_rose.c                                   |  2 +-
 net/rxrpc/af_rxrpc.c                                 |  2 +-
 net/rxrpc/rxperf.c                                   |  2 +-
 net/smc/af_smc.c                                     |  2 +-
 net/smc/smc.h                                        |  2 +-
 net/socket.c                                         |  6 +++---
 net/sunrpc/clnt.c                                    |  4 ++--
 net/sunrpc/svcsock.c                                 |  2 +-
 net/sunrpc/xprtsock.c                                |  4 ++--
 net/tipc/socket.c                                    |  4 ++--
 net/unix/af_unix.c                                   |  4 ++--
 net/vmw_vsock/af_vsock.c                             |  4 ++--
 net/x25/af_x25.c                                     |  2 +-
 net/xdp/xsk.c                                        |  2 +-
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c |  2 +-
 74 files changed, 113 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index ca6fdcc6c54a..5e760ab62618 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -145,7 +145,7 @@ void af_alg_release_parent(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(af_alg_release_parent);
 
-static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int alg_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY;
 	struct sock *sk = sock->sk;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index caaf2781136d..d9296f74f902 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -450,7 +450,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
 	*  a free one dynamically.
 	*/
 	what = "bind before connect";
-	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+	err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len);
 	if (err < 0)
 		goto out;
 
@@ -537,7 +537,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce
 	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 
 	what = "bind before listen";
-	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+	err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len);
 	if (err < 0)
 		goto out;
 
diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c
index e0acc185e719..ef66a6359eb9 100644
--- a/drivers/infiniband/hw/erdma/erdma_cm.c
+++ b/drivers/infiniband/hw/erdma/erdma_cm.c
@@ -993,7 +993,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
 	int ret;
 
 	sock_set_reuseaddr(s->sk);
-	ret = s->ops->bind(s, laddr, laddrlen);
+	ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, laddrlen);
 	if (ret)
 		return ret;
 	ret = s->ops->connect(s, raddr, raddrlen, flags);
@@ -1315,7 +1315,7 @@ int erdma_create_listen(struct iw_cm_id *id, int backlog)
 	if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
 		s->sk->sk_bound_dev_if = dev->netdev->ifindex;
 
-	ret = s->ops->bind(s, (struct sockaddr *)laddr,
+	ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
 			   sizeof(struct sockaddr_in));
 	if (ret)
 		goto error;
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 708b13993fdf..7fe118cacb3f 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -1340,7 +1340,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
 			return rv;
 	}
 
-	rv = s->ops->bind(s, laddr, size);
+	rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size);
 	if (rv < 0)
 		return rv;
 
@@ -1789,7 +1789,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
 				goto error;
 			}
 		}
-		rv = s->ops->bind(s, (struct sockaddr *)laddr,
+		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
 				  sizeof(struct sockaddr_in));
 	} else {
 		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
@@ -1813,7 +1813,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
 				goto error;
 			}
 		}
-		rv = s->ops->bind(s, (struct sockaddr *)laddr,
+		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
 				  sizeof(struct sockaddr_in6));
 	}
 	if (rv) {
diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c
index f732f6614d37..6ab036e4a35f 100644
--- a/drivers/isdn/mISDN/l1oip_core.c
+++ b/drivers/isdn/mISDN/l1oip_core.c
@@ -676,7 +676,7 @@ l1oip_socket_thread(void *data)
 	hc->sin_remote.sin_port = htons((unsigned short)hc->remoteport);
 
 	/* bind to incoming port */
-	if (socket->ops->bind(socket, (struct sockaddr *)&hc->sin_local,
+	if (socket->ops->bind(socket, (struct sockaddr_unsized *)&hc->sin_local,
 			      sizeof(hc->sin_local))) {
 		printk(KERN_ERR "%s: Failed to bind socket to port %d.\n",
 		       __func__, hc->localport);
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index b215b28cad7b..77b900db1cac 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -462,7 +462,7 @@ static int data_sock_getsockopt(struct socket *sock, int level, int optname,
 }
 
 static int
-data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+data_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr;
 	struct sock *sk = sock->sk;
@@ -696,7 +696,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 }
 
 static int
-base_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+base_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr;
 	struct sock *sk = sock->sk;
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 90737cb71892..d07e87a0974c 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -382,8 +382,8 @@ drop:
 	return NET_RX_DROP;
 }
 
-static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
-	int sockaddr_len)
+static int pptp_bind(struct socket *sock, struct sockaddr_unsized *uservaddr,
+		     int sockaddr_len)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9a96df1a511c..35d0bd91f6fd 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1834,7 +1834,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	sk_set_memalloc(queue->sock->sk);
 
 	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
-		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
+		ret = kernel_bind(queue->sock, (struct sockaddr_unsized *)&ctrl->src_addr,
 			sizeof(ctrl->src_addr));
 		if (ret) {
 			dev_err(nctrl->device,
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 470bf37e5a63..d543da09ef8e 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -2055,7 +2055,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 	if (so_priority > 0)
 		sock_set_priority(port->sock->sk, so_priority);
 
-	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
+	ret = kernel_bind(port->sock, (struct sockaddr_unsized *)&port->addr,
 			sizeof(port->addr));
 	if (ret) {
 		pr_err("failed to bind port socket %d\n", ret);
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index c2ac9a99ebbb..53aca059dc16 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -822,7 +822,7 @@ int iscsit_setup_np(
 	sock_set_reuseaddr(sock->sk);
 	ip_sock_set_freebind(sock->sk);
 
-	ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len);
+	ret = kernel_bind(sock, (struct sockaddr_unsized *)&np->np_sockaddr, len);
 	if (ret < 0) {
 		pr_err("kernel_bind() failed: %d\n", ret);
 		goto fail;
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index fd7ed65e0197..da1b516b9cfd 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -650,7 +650,7 @@ static int pvcalls_back_bind(struct xenbus_device *dev,
 	if (ret < 0)
 		goto out;
 
-	ret = inet_bind(map->sock, (struct sockaddr *)&req->u.bind.addr,
+	ret = inet_bind(map->sock, (struct sockaddr_unsized *)&req->u.bind.addr,
 			req->u.bind.len);
 	if (ret < 0)
 		goto out;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index c1cadf8fb346..bf0e4ea0aafd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -82,16 +82,16 @@ int afs_open_socket(struct afs_net *net)
 	if (ret < 0)
 		pr_err("Couldn't create RxGK CM key: %d\n", ret);
 
-	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	if (ret == -EADDRINUSE) {
 		srx.transport.sin6.sin6_port = 0;
-		ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+		ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	}
 	if (ret < 0)
 		goto error_2;
 
 	srx.srx_service = YFS_CM_SERVICE;
-	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	if (ret < 0)
 		goto error_2;
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9a0b6c2b6b01..0500421b6e3b 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1134,7 +1134,7 @@ static int sctp_bind_addrs(struct socket *sock, __be16 port)
 		make_sockaddr(&localaddr, port, &addr_len);
 
 		if (!i)
-			result = kernel_bind(sock, addr, addr_len);
+			result = kernel_bind(sock, (struct sockaddr_unsized *)addr, addr_len);
 		else
 			result = sock_bind_add(sock->sk, addr, addr_len);
 
@@ -1813,7 +1813,7 @@ static int dlm_tcp_bind(struct socket *sock)
 	memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
 	make_sockaddr(&src_addr, 0, &addr_len);
 
-	result = kernel_bind(sock, (struct sockaddr *)&src_addr,
+	result = kernel_bind(sock, (struct sockaddr_unsized *)&src_addr,
 			     addr_len);
 	if (result < 0) {
 		/* This *may* not indicate a critical error */
@@ -1852,7 +1852,7 @@ static int dlm_tcp_listen_bind(struct socket *sock)
 
 	/* Bind to our port */
 	make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
-	return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
+	return kernel_bind(sock, (struct sockaddr_unsized *)&dlm_local_addr[0],
 			   addr_len);
 }
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b05d4e9d13b2..c7734193d8d7 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1615,7 +1615,7 @@ static void o2net_start_connect(struct work_struct *work)
 	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
 	myaddr.sin_port = htons(0); /* any port */
 
-	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+	ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr,
 			      sizeof(myaddr));
 	if (ret) {
 		mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
@@ -2002,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
 	INIT_WORK(&o2net_listen_work, o2net_accept_many);
 
 	sock->sk->sk_reuse = SK_CAN_REUSE;
-	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
 	if (ret < 0) {
 		printk(KERN_ERR "o2net: Error %d while binding socket at "
 		       "%pI4:%u\n", ret, &addr, ntohs(port)); 
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dd12f3eb61dc..96d972263020 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -3112,7 +3112,7 @@ bind_socket(struct TCP_Server_Info *server)
 		struct socket *socket = server->ssocket;
 
 		rc = kernel_bind(socket,
-				 (struct sockaddr *) &server->srcaddr,
+				 (struct sockaddr_unsized *) &server->srcaddr,
 				 sizeof(server->srcaddr));
 		if (rc < 0) {
 			struct sockaddr_in *saddr4;
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 7a1e3dcc2cde..bf694bc78c65 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -519,10 +519,10 @@ static int create_socket(struct interface *iface)
 	}
 
 	if (ipv4)
-		ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin,
+		ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin,
 				  sizeof(sin));
 	else
-		ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin6,
+		ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin6,
 				  sizeof(sin6));
 	if (ret) {
 		pr_err("Failed to bind socket: %d\n", ret);
diff --git a/include/linux/net.h b/include/linux/net.h
index ec09620f40f7..0e316f063113 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -163,7 +163,7 @@ struct proto_ops {
 	struct module	*owner;
 	int		(*release)   (struct socket *sock);
 	int		(*bind)	     (struct socket *sock,
-				      struct sockaddr *myaddr,
+				      struct sockaddr_unsized *myaddr,
 				      int sockaddr_len);
 	int		(*connect)   (struct socket *sock,
 				      struct sockaddr *vaddr,
@@ -345,7 +345,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
 		   size_t num, size_t len, int flags);
 
-int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen);
+int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen);
 int kernel_listen(struct socket *sock, int backlog);
 int kernel_accept(struct socket *sock, struct socket **newsock, int flags);
 int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index c17a6585d0b0..1666cf6f539e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -42,7 +42,7 @@ int inet_shutdown(struct socket *sock, int how);
 int inet_listen(struct socket *sock, int backlog);
 int __inet_listen_sk(struct sock *sk, int backlog);
 void inet_sock_destruct(struct sock *sk);
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
 int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 /* Don't allocate port at this moment, defer to connect. */
 #define BIND_FORCE_ADDRESS_NO_PORT	(1 << 0)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2ccdf85f34f1..2188bad9a687 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1208,7 +1208,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
 void inet6_cleanup_sock(struct sock *sk);
 void inet6_sock_destruct(struct sock *sk);
 int inet6_release(struct socket *sock);
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
 int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer);
diff --git a/include/net/sock.h b/include/net/sock.h
index c7e58b8e8a90..acbb78c96d69 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1920,7 +1920,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
  * Functions to fill in entries in struct proto_ops when a protocol
  * does not implement a particular function.
  */
-int sock_no_bind(struct socket *, struct sockaddr *, int);
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len);
 int sock_no_connect(struct socket *, struct sockaddr *, int, int);
 int sock_no_socketpair(struct socket *, struct socket *);
 int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index a516745f732f..ef517bb307e2 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -966,7 +966,7 @@ static int p9_bind_privport(struct socket *sock)
 			((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port);
 		else
 			((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port);
-		err = kernel_bind(sock, (struct sockaddr *)&stor, sizeof(stor));
+		err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor));
 		if (err != -EADDRINUSE)
 			break;
 	}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 30242fe10341..45db43cde67f 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1149,7 +1149,7 @@ out:
 }
 
 /* Set the address 'our end' of the connection */
-static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sockaddr_at *addr = (struct sockaddr_at *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 66d9a9bd5896..62fdf07c53de 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -24,7 +24,7 @@ static int pvc_shutdown(struct socket *sock, int how)
 	return 0;
 }
 
-static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr,
+static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
 		    int sockaddr_len)
 {
 	struct sock *sk = sock->sk;
@@ -59,7 +59,7 @@ out:
 static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
 		       int sockaddr_len, int flags)
 {
-	return pvc_bind(sock, sockaddr, sockaddr_len);
+	return pvc_bind(sock, (struct sockaddr_unsized *)sockaddr, sockaddr_len);
 }
 
 static int pvc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index f8137ae693b0..1906a493c8aa 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -97,7 +97,7 @@ static int svc_release(struct socket *sock)
 	return 0;
 }
 
-static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
+static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
 		    int sockaddr_len)
 {
 	DEFINE_WAIT(wait);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 6ef8b2a57a9b..23c558ff9682 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1094,7 +1094,7 @@ static int ax25_release(struct socket *sock)
  *	that we've implemented support for SO_BINDTODEVICE. It is however small
  *	and trivially backward compatible.
  */
-static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
 	struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index fc866759910d..ba9f48771e11 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1185,7 +1185,7 @@ static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
 }
 #endif
 
-static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len)
 {
 	struct sockaddr_hci haddr;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 3d98cb6291da..6a7e1b4a8701 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -944,7 +944,7 @@ static int iso_sock_create(struct net *net, struct socket *sock, int protocol,
 	return 0;
 }
 
-static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr,
+static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr,
 			    int addr_len)
 {
 	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
@@ -1022,7 +1022,7 @@ done:
 	return err;
 }
 
-static int iso_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len)
 {
 	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 814fb8610ac4..ca7394d8fa4e 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -80,7 +80,7 @@ static int l2cap_validate_le_psm(u16 psm)
 	return 0;
 }
 
-static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen)
 {
 	struct sock *sk = sock->sk;
 	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 96250807b32b..d62fd6c57617 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -781,7 +781,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
 	addr.l2_psm    = 0;
 	addr.l2_cid    = 0;
 	addr.l2_bdaddr_type = BDADDR_BREDR;
-	*err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+	*err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
 	if (*err < 0)
 		goto failed;
 
@@ -2068,7 +2068,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
 	addr.l2_psm    = cpu_to_le16(L2CAP_PSM_RFCOMM);
 	addr.l2_cid    = 0;
 	addr.l2_bdaddr_type = BDADDR_BREDR;
-	err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+	err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
 	if (err < 0) {
 		BT_ERR("Bind failed %d", err);
 		goto failed;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 913402806fa0..8c8762bbc6de 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
 	return 0;
 }
 
-static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	struct sockaddr_rc sa;
 	struct sock *sk = sock->sk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ab0cf442d57b..01d878205e58 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -605,7 +605,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
 	return 0;
 }
 
-static int sco_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len)
 {
 	struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
diff --git a/net/can/isotp.c b/net/can/isotp.c
index 74ee1e52249b..ce588b85665a 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -1246,7 +1246,7 @@ static int isotp_release(struct socket *sock)
 	return 0;
 }
 
-static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 88e7160d4248..a2abedc757d0 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -440,7 +440,7 @@ static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len)
 	return 0;
 }
 
-static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
 	struct j1939_sock *jsk = j1939_sk(sock->sk);
diff --git a/net/can/raw.c b/net/can/raw.c
index a53853f5e9af..f36a83d3447c 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -449,7 +449,7 @@ static int raw_release(struct socket *sock)
 	return 0;
 }
 
-static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/core/sock.c b/net/core/sock.c
index 7a9bbc2afcf0..1e1ce18bba16 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3462,7 +3462,7 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off);
  * function, some default processing is provided.
  */
 
-int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 18d267921bb5..99ddfad9bb88 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -96,13 +96,13 @@ static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 	return sk->sk_prot->sendmsg(sk, msg, len);
 }
 
-static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr,
+static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
 				int addr_len)
 {
 	struct sock *sk = sock->sk;
 
 	if (sk->sk_prot->bind)
-		return sk->sk_prot->bind(sk, uaddr, addr_len);
+		return sk->sk_prot->bind(sk, (struct sockaddr *)uaddr, addr_len);
 
 	return sock_no_bind(sock, uaddr, addr_len);
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0784e2a873a1..aa43d16e48ff 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -464,9 +464,9 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	return __inet_bind(sk, uaddr, addr_len, flags);
 }
 
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
-	return inet_bind_sk(sock->sk, uaddr, addr_len);
+	return inet_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_bind);
 
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 54386e06a813..11e5a88c923d 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -29,7 +29,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 	udp_addr.sin_family = AF_INET;
 	udp_addr.sin_addr = cfg->local_ip;
 	udp_addr.sin_port = cfg->local_udp_port;
-	err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+	err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr,
 			  sizeof(udp_addr));
 	if (err < 0)
 		goto error;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 44d7de1eec4f..c92d27e35fbc 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -465,9 +465,9 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 }
 
 /* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
-	return inet6_bind_sk(sock->sk, uaddr, addr_len);
+	return inet6_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet6_bind);
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 0ff547a4bff7..b0d9286b33c8 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -40,7 +40,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 	memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
 	       sizeof(udp6_addr.sin6_addr));
 	udp6_addr.sin6_port = cfg->local_udp_port;
-	err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
+	err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr,
 			  sizeof(udp6_addr));
 	if (err < 0)
 		goto error;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 4ddfc633d30c..3941e32cda69 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -563,7 +563,7 @@ static void __iucv_auto_name(struct iucv_sock *iucv)
 }
 
 /* Bind an unbound socket */
-static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int iucv_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			  int addr_len)
 {
 	DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 369a2f2e459c..4b5e372a5cd4 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1503,7 +1503,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 			memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
 			       sizeof(ip6_addr.l2tp_addr));
 			ip6_addr.l2tp_conn_id = tunnel_id;
-			err = kernel_bind(sock, (struct sockaddr *)&ip6_addr,
+			err = kernel_bind(sock, (struct sockaddr_unsized *)&ip6_addr,
 					  sizeof(ip6_addr));
 			if (err < 0)
 				goto out;
@@ -1530,7 +1530,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 			ip_addr.l2tp_family = AF_INET;
 			ip_addr.l2tp_addr = cfg->local_ip;
 			ip_addr.l2tp_conn_id = tunnel_id;
-			err = kernel_bind(sock, (struct sockaddr *)&ip_addr,
+			err = kernel_bind(sock, (struct sockaddr_unsized *)&ip_addr,
 					  sizeof(ip_addr));
 			if (err < 0)
 				goto out;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5958a80fe14c..e5bb0c0d708c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -337,7 +337,7 @@ out:
  *	otherwise all hell will break loose.
  *	Returns: 0 upon success, negative otherwise.
  */
-static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
+static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen)
 {
 	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index b99ba14f39d2..5b1ef50637b7 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -49,7 +49,7 @@ static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr)
 	       !addr->__smctp_pad0[2];
 }
 
-static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
+static int mctp_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen)
 {
 	struct sock *sk = sock->sk;
 	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index 69a3ccfc6310..be9149ac79dd 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -205,7 +205,7 @@ static void __mctp_route_test_init(struct kunit *test,
 	addr.smctp_network = netid;
 	addr.smctp_addr.s_addr = 8;
 	addr.smctp_type = 0;
-	rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr));
+	rc = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
 	*devp = dev;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d568575cdcb5..53e2b095dfb1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3856,7 +3856,7 @@ static struct proto mptcp_prot = {
 	.no_autobind	= true,
 };
 
-static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
 	struct sock *ssk, *sk = sock->sk;
@@ -3870,10 +3870,10 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	}
 
 	if (sk->sk_family == AF_INET)
-		err = inet_bind_sk(ssk, uaddr, addr_len);
+		err = inet_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 	else if (sk->sk_family == AF_INET6)
-		err = inet6_bind_sk(ssk, uaddr, addr_len);
+		err = inet6_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len);
 #endif
 	if (!err)
 		mptcp_copy_inaddrs(sk, ssk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e8325890a322..d90237bf433c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1660,7 +1660,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
 		addrlen = sizeof(struct sockaddr_in6);
 #endif
 	ssk->sk_bound_dev_if = local->ifindex;
-	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+	err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen);
 	if (err) {
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
 		pr_debug("msk=%p local=%d remote=%d bind error: %d\n",
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3402675bf521..d8c089ef387c 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1435,7 +1435,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
 	sin.sin_addr.s_addr  = addr;
 	sin.sin_port         = 0;
 
-	return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
 }
 
 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
@@ -1542,7 +1542,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id,
 
 	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
 	sock->sk->sk_bound_dev_if = dev->ifindex;
-	result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen);
+	result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen);
 	if (result < 0) {
 		pr_err("Error binding to the multicast addr\n");
 		goto error;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 687a84c48882..18490a56edd0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -966,7 +966,7 @@ static void netlink_undo_bind(int group, long unsigned int groups,
 			nlk->netlink_unbind(sock_net(sk), undo + 1);
 }
 
-static int netlink_bind(struct socket *sock, struct sockaddr *addr,
+static int netlink_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			int addr_len)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 3331669d8e33..33468124d53d 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -561,7 +561,7 @@ static int nr_release(struct socket *sock)
 	return 0;
 }
 
-static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
 	struct nr_sock *nr = nr_sk(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 57a2f97004e1..26e6ceb48a82 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -56,7 +56,7 @@ static struct proto llcp_sock_proto = {
 	.obj_size = sizeof(struct nfc_llcp_sock),
 };
 
-static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+static int llcp_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen)
 {
 	struct sock *sk = sock->sk;
 	struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
@@ -146,7 +146,7 @@ error:
 	return ret;
 }
 
-static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			      int alen)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 173e6edda08f..fccad2a529cc 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3279,11 +3279,12 @@ out_unlock:
  *	Bind a packet socket to a device
  */
 
-static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
+static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr,
 			    int addr_len)
 {
 	struct sock *sk = sock->sk;
-	char name[sizeof(uaddr->sa_data_min) + 1];
+	struct sockaddr *sa = (struct sockaddr *)uaddr;
+	char name[sizeof(sa->sa_data_min) + 1];
 
 	/*
 	 *	Check legality
@@ -3294,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
 	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
 	 * zero-terminated.
 	 */
-	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
-	name[sizeof(uaddr->sa_data_min)] = 0;
+	memcpy(name, sa->sa_data, sizeof(sa->sa_data_min));
+	name[sizeof(sa->sa_data_min)] = 0;
 
 	return packet_do_bind(sk, name, 0, 0);
 }
 
-static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index db2d552e9b32..478b02647733 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL(pn_sock_unhash);
 
 static DEFINE_MUTEX(port_mutex);
 
-static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
+static int pn_socket_bind(struct socket *sock, struct sockaddr_unsized *addr, int len)
 {
 	struct sock *sk = sock->sk;
 	struct pn_sock *pn = pn_sk(sk);
@@ -163,7 +163,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 	u8 saddr;
 
 	if (sk->sk_prot->bind)
-		return sk->sk_prot->bind(sk, addr, len);
+		return sk->sk_prot->bind(sk, (struct sockaddr *)addr, len);
 
 	if (len < sizeof(struct sockaddr_pn))
 		return -EINVAL;
@@ -206,8 +206,8 @@ static int pn_socket_autobind(struct socket *sock)
 
 	memset(&sa, 0, sizeof(sa));
 	sa.spn_family = AF_PHONET;
-	err = pn_socket_bind(sock, (struct sockaddr *)&sa,
-				sizeof(struct sockaddr_pn));
+	err = pn_socket_bind(sock, (struct sockaddr_unsized *)&sa,
+			     sizeof(struct sockaddr_pn));
 	if (err != -EINVAL)
 		return err;
 	BUG_ON(!pn_port(pn_sk(sock->sk)->sobject));
diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c
index 00c51cf693f3..00bd3dd9f0f9 100644
--- a/net/qrtr/af_qrtr.c
+++ b/net/qrtr/af_qrtr.c
@@ -824,7 +824,7 @@ static int qrtr_autobind(struct socket *sock)
 }
 
 /* Bind socket to specified sockaddr. */
-static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len)
+static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
 {
 	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
 	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
index 3de9350cbf30..bfcc1a453f23 100644
--- a/net/qrtr/ns.c
+++ b/net/qrtr/ns.c
@@ -714,7 +714,7 @@ int qrtr_ns_init(void)
 	sq.sq_port = QRTR_PORT_CTRL;
 	qrtr_ns.local_node = sq.sq_node;
 
-	ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq));
+	ret = kernel_bind(qrtr_ns.sock, (struct sockaddr_unsized *)&sq, sizeof(sq));
 	if (ret < 0) {
 		pr_err("failed to bind to socket\n");
 		goto err_wq;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 97a29172a8ee..f800d920d969 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -160,7 +160,7 @@ void rds_remove_bound(struct rds_sock *rs)
 	rs->rs_bound_addr = in6addr_any;
 }
 
-int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
 	struct rds_sock *rs = rds_sk_to_rs(sk);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 5b1c072e2e7f..a029e5fcdea7 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -735,7 +735,7 @@ extern wait_queue_head_t rds_poll_waitq;
 
 
 /* bind.c */
-int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
 void rds_remove_bound(struct rds_sock *rs);
 struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
 				__u32 scope_id);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index a0046e99d6df..1eff3b03ab77 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -145,7 +145,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 		addrlen = sizeof(sin);
 	}
 
-	ret = kernel_bind(sock, addr, addrlen);
+	ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
 	if (ret) {
 		rdsdebug("bind failed with %d at address %pI6c\n",
 			 ret, &conn->c_laddr);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 91e34af3fe5d..820d3e20de19 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -290,7 +290,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
 		addr_len = sizeof(*sin);
 	}
 
-	ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len);
+	ret = kernel_bind(sock, (struct sockaddr_unsized *)&ss, addr_len);
 	if (ret < 0) {
 		rdsdebug("could not bind %s listener socket: %d\n",
 			 isv6 ? "IPv6" : "IPv4", ret);
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 543f9e8ebb69..47369eab5aec 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -693,7 +693,7 @@ static int rose_release(struct socket *sock)
 	return 0;
 }
 
-static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int rose_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
 	struct rose_sock *rose = rose_sk(sk);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 36df0274d7b7..245f37a74394 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -127,7 +127,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
 /*
  * bind a local address to an RxRPC socket
  */
-static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
+static int rxrpc_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
 {
 	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
 	struct rxrpc_local *local;
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 2ea71e3831f7..98ea76fae70f 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -211,7 +211,7 @@ static int rxperf_open_socket(void)
 
 	ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring);
 
-	ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx));
+	ret = kernel_bind(socket, (struct sockaddr_unsized *)&srx, sizeof(srx));
 	if (ret < 0)
 		goto error_2;
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e9d0e62e0b1b..be18ab08f15d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -421,7 +421,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 	return sk;
 }
 
-int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
 	     int addr_len)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 2c9084963739..a008dbe6d6f6 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -42,7 +42,7 @@ void smc_unhash_sk(struct sock *sk);
 void smc_release_cb(struct sock *sk);
 
 int smc_release(struct socket *sock);
-int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
 	     int addr_len);
 int smc_connect(struct socket *sock, struct sockaddr *addr,
 		int alen, int flags);
diff --git a/net/socket.c b/net/socket.c
index e8892b218708..aaefb2e519a7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1872,7 +1872,7 @@ int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
 				   addrlen);
 	if (!err)
 		err = READ_ONCE(sock->ops)->bind(sock,
-						 (struct sockaddr *)address,
+						 (struct sockaddr_unsized *)address,
 						 addrlen);
 	return err;
 }
@@ -3583,13 +3583,13 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd,
  *	Returns 0 or an error.
  */
 
-int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
+int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen)
 {
 	struct sockaddr_storage address;
 
 	memcpy(&address, addr, addrlen);
 
-	return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address,
+	return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address,
 					  addrlen);
 }
 EXPORT_SYMBOL(kernel_bind);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8ca354ecfd02..318ee24ad900 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1457,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
 	switch (sap->sa_family) {
 	case AF_INET:
 		err = kernel_bind(sock,
-				(struct sockaddr *)&rpc_inaddr_loopback,
+				(struct sockaddr_unsized *)&rpc_inaddr_loopback,
 				sizeof(rpc_inaddr_loopback));
 		break;
 	case AF_INET6:
 		err = kernel_bind(sock,
-				(struct sockaddr *)&rpc_in6addr_loopback,
+				(struct sockaddr_unsized *)&rpc_in6addr_loopback,
 				sizeof(rpc_in6addr_loopback));
 		break;
 	default:
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7b90abc5cf0e..16ff6c100821 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1557,7 +1557,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 		ip6_sock_set_v6only(sock->sk);
 	if (type == SOCK_STREAM)
 		sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
-	error = kernel_bind(sock, sin, len);
+	error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len);
 	if (error < 0)
 		goto bummer;
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3aa987e7f072..95732a45b059 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1845,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 	memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
 	do {
 		rpc_set_port((struct sockaddr *)&myaddr, port);
-		err = kernel_bind(sock, (struct sockaddr *)&myaddr,
-				transport->xprt.addrlen);
+		err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr,
+				  transport->xprt.addrlen);
 		if (err == 0) {
 			if (transport->xprt.reuseport)
 				transport->srcport = port;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index bc614a1f019c..3903a97ada7d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -710,7 +710,7 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
 	return res;
 }
 
-static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
+static int tipc_bind(struct socket *sock, struct sockaddr_unsized *skaddr, int alen)
 {
 	struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr;
 	u32 atype = ua->addrtype;
@@ -726,7 +726,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
 			return -EACCES;
 		}
 	}
-	return tipc_sk_bind(sock, skaddr, alen);
+	return tipc_sk_bind(sock, (struct sockaddr *)skaddr, alen);
 }
 
 /**
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 54177caa9c12..788775f0eea7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -843,7 +843,7 @@ out:
 }
 
 static int unix_release(struct socket *);
-static int unix_bind(struct socket *, struct sockaddr *, int);
+static int unix_bind(struct socket *, struct sockaddr_unsized *, int);
 static int unix_stream_connect(struct socket *, struct sockaddr *,
 			       int addr_len, int flags);
 static int unix_socketpair(struct socket *, struct socket *);
@@ -1466,7 +1466,7 @@ out:
 	return err;
 }
 
-static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	struct sock *sk = sock->sk;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 76763247a377..0e5609e7284b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -987,7 +987,7 @@ static int vsock_release(struct socket *sock)
 }
 
 static int
-vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	int err;
 	struct sock *sk;
@@ -995,7 +995,7 @@ vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 	sk = sock->sk;
 
-	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
+	if (vsock_addr_cast((struct sockaddr *)addr, addr_len, &vm_addr) != 0)
 		return -EINVAL;
 
 	lock_sock(sk);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 655d1e0ae25f..ca8006d8f792 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -670,7 +670,7 @@ out:
 	return 0;
 }
 
-static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2f26c918d448..ed8b612ec29d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1238,7 +1238,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs)
 	return xs->fq_tmp && xs->cq_tmp;
 }
 
-static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
 	struct sock *sk = sock->sk;
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 8074bc5f6f20..0497b5dea25c 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -923,7 +923,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args)
 		goto out;
 	}
 
-	err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen);
+	err = kernel_bind(sock, (struct sockaddr_unsized *)&args->addr, args->addrlen);
 out:
 	mutex_unlock(&sock_lock);
 
-- 
cgit v1.2.3


From 85cb0757d7e1f9370a8b52a8b8144c37941cba0a Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:11 -0800
Subject: net: Convert proto_ops connect() callbacks to use sockaddr_unsized

Update all struct proto_ops connect() callback function prototypes from
"struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the
compiler about object sizes. Calls into struct proto handlers gain casts
that will be removed in the struct proto conversion patch.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-3-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/block/drbd/drbd_receiver.c                   |  2 +-
 drivers/infiniband/hw/erdma/erdma_cm.c               |  2 +-
 drivers/infiniband/sw/siw/siw_cm.c                   |  2 +-
 drivers/net/ppp/pppoe.c                              |  4 ++--
 drivers/net/ppp/pptp.c                               |  4 ++--
 drivers/net/wireless/ath/ath10k/qmi.c                |  2 +-
 drivers/net/wireless/ath/ath11k/qmi.c                |  2 +-
 drivers/net/wireless/ath/ath12k/qmi.c                |  2 +-
 drivers/nvme/host/tcp.c                              |  2 +-
 drivers/slimbus/qcom-ngd-ctrl.c                      |  2 +-
 drivers/xen/pvcalls-back.c                           |  2 +-
 fs/coredump.c                                        |  2 +-
 fs/dlm/lowcomms.c                                    |  2 +-
 fs/ocfs2/cluster/tcp.c                               |  2 +-
 fs/smb/client/connect.c                              |  2 +-
 include/linux/bpf-cgroup.h                           |  6 +++---
 include/linux/net.h                                  |  4 ++--
 include/net/inet_common.h                            |  6 +++---
 include/net/sctp/sctp.h                              |  2 +-
 include/net/sock.h                                   |  2 +-
 include/net/vsock_addr.h                             |  2 +-
 net/9p/trans_fd.c                                    |  6 +++---
 net/appletalk/ddp.c                                  |  2 +-
 net/atm/pvc.c                                        |  4 ++--
 net/atm/svc.c                                        |  2 +-
 net/ax25/af_ax25.c                                   |  2 +-
 net/bluetooth/iso.c                                  |  2 +-
 net/bluetooth/l2cap_sock.c                           |  2 +-
 net/bluetooth/rfcomm/core.c                          |  2 +-
 net/bluetooth/rfcomm/sock.c                          |  3 ++-
 net/bluetooth/sco.c                                  |  2 +-
 net/caif/caif_socket.c                               |  2 +-
 net/can/bcm.c                                        |  2 +-
 net/can/j1939/socket.c                               |  2 +-
 net/ceph/messenger.c                                 |  2 +-
 net/core/sock.c                                      |  2 +-
 net/ieee802154/socket.c                              |  4 ++--
 net/ipv4/af_inet.c                                   | 14 +++++++-------
 net/ipv4/tcp.c                                       |  2 +-
 net/ipv4/udp_tunnel_core.c                           |  2 +-
 net/ipv6/ip6_udp_tunnel.c                            |  2 +-
 net/iucv/af_iucv.c                                   |  4 ++--
 net/l2tp/l2tp_core.c                                 |  4 ++--
 net/l2tp/l2tp_ppp.c                                  |  2 +-
 net/llc/af_llc.c                                     |  2 +-
 net/mctp/af_mctp.c                                   |  2 +-
 net/mctp/test/utils.c                                |  5 +++--
 net/mptcp/subflow.c                                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c                      |  2 +-
 net/netlink/af_netlink.c                             |  2 +-
 net/netrom/af_netrom.c                               |  4 ++--
 net/nfc/llcp_sock.c                                  |  2 +-
 net/nfc/rawsock.c                                    |  2 +-
 net/phonet/socket.c                                  |  6 +++---
 net/qrtr/af_qrtr.c                                   |  2 +-
 net/rds/af_rds.c                                     |  2 +-
 net/rds/tcp_connect.c                                |  2 +-
 net/rose/af_rose.c                                   |  3 ++-
 net/rxrpc/af_rxrpc.c                                 |  2 +-
 net/sctp/socket.c                                    |  4 ++--
 net/smc/af_smc.c                                     |  4 ++--
 net/smc/smc.h                                        |  2 +-
 net/socket.c                                         |  8 ++++----
 net/sunrpc/clnt.c                                    |  2 +-
 net/sunrpc/xprtsock.c                                |  5 +++--
 net/tipc/socket.c                                    |  2 +-
 net/unix/af_unix.c                                   |  8 ++++----
 net/vmw_vsock/af_vsock.c                             |  6 +++---
 net/vmw_vsock/vsock_addr.c                           |  2 +-
 net/x25/af_x25.c                                     |  2 +-
 samples/qmi/qmi_sample_client.c                      |  2 +-
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c |  2 +-
 72 files changed, 110 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d9296f74f902..33bc91665fe8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -458,7 +458,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
 	 * stay C_WF_CONNECTION, don't go Disconnecting! */
 	disconnect_on_error = 0;
 	what = "connect";
-	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+	err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0);
 
 out:
 	if (err < 0) {
diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c
index ef66a6359eb9..ed21ba0037a4 100644
--- a/drivers/infiniband/hw/erdma/erdma_cm.c
+++ b/drivers/infiniband/hw/erdma/erdma_cm.c
@@ -996,7 +996,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
 	ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, laddrlen);
 	if (ret)
 		return ret;
-	ret = s->ops->connect(s, raddr, raddrlen, flags);
+	ret = s->ops->connect(s, (struct sockaddr_unsized *)raddr, raddrlen, flags);
 	return ret < 0 ? ret : 0;
 }
 
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 7fe118cacb3f..eb0bd4f79a85 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -1344,7 +1344,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
 	if (rv < 0)
 		return rv;
 
-	rv = s->ops->connect(s, raddr, size, flags);
+	rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags);
 
 	return rv < 0 ? rv : 0;
 }
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 4ac6afce267b..4275b393a454 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -608,8 +608,8 @@ static int pppoe_release(struct socket *sock)
 	return 0;
 }
 
-static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
-		  int sockaddr_len, int flags)
+static int pppoe_connect(struct socket *sock, struct sockaddr_unsized *uservaddr,
+			 int sockaddr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr;
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index d07e87a0974c..b18acd810561 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -415,8 +415,8 @@ out:
 	return error;
 }
 
-static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
-	int sockaddr_len, int flags)
+static int pptp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr,
+			int sockaddr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr;
diff --git a/drivers/net/wireless/ath/ath10k/qmi.c b/drivers/net/wireless/ath/ath10k/qmi.c
index f1f33af0170a..8275345631a0 100644
--- a/drivers/net/wireless/ath/ath10k/qmi.c
+++ b/drivers/net/wireless/ath/ath10k/qmi.c
@@ -986,7 +986,7 @@ static int ath10k_qmi_new_server(struct qmi_handle *qmi_hdl,
 
 	ath10k_dbg(ar, ATH10K_DBG_QMI, "wifi fw qmi service found\n");
 
-	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)&qmi->sq,
+	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)&qmi->sq,
 			     sizeof(qmi->sq), 0);
 	if (ret) {
 		ath10k_err(ar, "failed to connect to a remote QMI service port\n");
diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c
index aea56c38bf8f..ff6a97e328b8 100644
--- a/drivers/net/wireless/ath/ath11k/qmi.c
+++ b/drivers/net/wireless/ath/ath11k/qmi.c
@@ -3177,7 +3177,7 @@ static int ath11k_qmi_ops_new_server(struct qmi_handle *qmi_hdl,
 	sq->sq_node = service->node;
 	sq->sq_port = service->port;
 
-	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)sq,
+	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)sq,
 			     sizeof(*sq), 0);
 	if (ret) {
 		ath11k_warn(ab, "failed to connect to qmi remote service: %d\n", ret);
diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c
index 36325e62aa24..cf9c25df3ffd 100644
--- a/drivers/net/wireless/ath/ath12k/qmi.c
+++ b/drivers/net/wireless/ath/ath12k/qmi.c
@@ -3740,7 +3740,7 @@ static int ath12k_qmi_ops_new_server(struct qmi_handle *qmi_hdl,
 	sq->sq_node = service->node;
 	sq->sq_port = service->port;
 
-	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)sq,
+	ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)sq,
 			     sizeof(*sq), 0);
 	if (ret) {
 		ath12k_warn(ab, "qmi failed to connect to remote service %d\n", ret);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 35d0bd91f6fd..6795b8286c35 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1872,7 +1872,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	dev_dbg(nctrl->device, "connecting queue %d\n",
 			nvme_tcp_queue_id(queue));
 
-	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
+	ret = kernel_connect(queue->sock, (struct sockaddr_unsized *)&ctrl->addr,
 		sizeof(ctrl->addr), 0);
 	if (ret) {
 		dev_err(nctrl->device,
diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c
index 4fb66986cc22..fdb94dc4a730 100644
--- a/drivers/slimbus/qcom-ngd-ctrl.c
+++ b/drivers/slimbus/qcom-ngd-ctrl.c
@@ -463,7 +463,7 @@ static int qcom_slim_qmi_init(struct qcom_slim_ngd_ctrl *ctrl,
 	}
 
 	rc = kernel_connect(handle->sock,
-				(struct sockaddr *)&ctrl->qmi.svc_info,
+				(struct sockaddr_unsized *)&ctrl->qmi.svc_info,
 				sizeof(ctrl->qmi.svc_info), 0);
 	if (rc < 0) {
 		dev_err(ctrl->dev, "Remote Service connect failed: %d\n", rc);
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index da1b516b9cfd..c5b6f6fa11eb 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -409,7 +409,7 @@ static int pvcalls_back_connect(struct xenbus_device *dev,
 	ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock);
 	if (ret < 0)
 		goto out;
-	ret = inet_stream_connect(sock, sa, req->u.connect.len, 0);
+	ret = inet_stream_connect(sock, (struct sockaddr_unsized *)sa, req->u.connect.len, 0);
 	if (ret < 0) {
 		sock_release(sock);
 		goto out;
diff --git a/fs/coredump.c b/fs/coredump.c
index 5c1c381ee380..14837d9e2abb 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -708,7 +708,7 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *
 	 */
 	pidfs_coredump(cprm);
 
-	retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len,
+	retval = kernel_connect(socket, (struct sockaddr_unsized *)(&addr), addr_len,
 				O_NONBLOCK | SOCK_COREDUMP);
 
 	if (retval) {
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0500421b6e3b..f832dafdaca8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1599,7 +1599,7 @@ static int dlm_connect(struct connection *con)
 
 	log_print_ratelimited("connecting to %d", con->nodeid);
 	make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
-	result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0);
+	result = kernel_connect(sock, (struct sockaddr_unsized *)&addr, addr_len, 0);
 	switch (result) {
 	case -EINPROGRESS:
 		/* not an error */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index c7734193d8d7..79b281e32f4c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1638,7 +1638,7 @@ static void o2net_start_connect(struct work_struct *work)
 	remoteaddr.sin_port = node->nd_ipv4_port;
 
 	ret = sc->sc_sock->ops->connect(sc->sc_sock,
-					(struct sockaddr *)&remoteaddr,
+					(struct sockaddr_unsized *)&remoteaddr,
 					sizeof(remoteaddr),
 					O_NONBLOCK);
 	if (ret == -EINPROGRESS)
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 96d972263020..73120988661a 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -3411,7 +3411,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
-	rc = kernel_connect(socket, saddr, slen,
+	rc = kernel_connect(socket, (struct sockaddr_unsized *)saddr, slen,
 			    server->noblockcnt ? O_NONBLOCK : 0);
 	/*
 	 * When mounting SMB root file systems, we do not want to block in
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index aedf573bdb42..a7fb4f46974f 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -238,7 +238,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
 							  atype, NULL, NULL);  \
 	__ret;								       \
 })
@@ -248,7 +248,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
 							  atype, t_ctx, NULL); \
 		release_sock(sk);					       \
 	}								       \
@@ -266,7 +266,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
 							  atype, NULL, &__flags); \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
diff --git a/include/linux/net.h b/include/linux/net.h
index 0e316f063113..db6bc997ca5b 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -166,7 +166,7 @@ struct proto_ops {
 				      struct sockaddr_unsized *myaddr,
 				      int sockaddr_len);
 	int		(*connect)   (struct socket *sock,
-				      struct sockaddr *vaddr,
+				      struct sockaddr_unsized *vaddr,
 				      int sockaddr_len, int flags);
 	int		(*socketpair)(struct socket *sock1,
 				      struct socket *sock2);
@@ -348,7 +348,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
 int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen);
 int kernel_listen(struct socket *sock, int backlog);
 int kernel_accept(struct socket *sock, struct socket **newsock, int flags);
-int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
+int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen,
 		   int flags);
 int kernel_getsockname(struct socket *sock, struct sockaddr *addr);
 int kernel_getpeername(struct socket *sock, struct sockaddr *addr);
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 1666cf6f539e..ebafd96912bb 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -23,11 +23,11 @@ struct sockaddr;
 struct socket;
 
 int inet_release(struct socket *sock);
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			int addr_len, int flags);
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			  int addr_len, int flags, int is_sendmsg);
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		       int addr_len, int flags);
 int inet_accept(struct socket *sock, struct socket *newsock,
 		struct proto_accept_arg *arg);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index bb4b80c12541..58242b37b47a 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net);
 /*
  * sctp/socket.c
  */
-int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
+int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		      int addr_len, int flags);
 int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
diff --git a/include/net/sock.h b/include/net/sock.h
index acbb78c96d69..589fbce77217 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1921,7 +1921,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
  * does not implement a particular function.
  */
 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len);
-int sock_no_connect(struct socket *, struct sockaddr *, int, int);
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags);
 int sock_no_socketpair(struct socket *, struct socket *);
 int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
 int sock_no_getname(struct socket *, struct sockaddr *, int);
diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h
index cf8cc140d68d..c3f4cc206198 100644
--- a/include/net/vsock_addr.h
+++ b/include/net/vsock_addr.h
@@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr);
 void vsock_addr_unbind(struct sockaddr_vm *addr);
 bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
 			    const struct sockaddr_vm *other);
-int vsock_addr_cast(const struct sockaddr *addr, size_t len,
+int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len,
 		    struct sockaddr_vm **out_addr);
 
 #endif
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index ef517bb307e2..49d674f5e73a 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -1018,7 +1018,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
 	}
 
 	err = READ_ONCE(csocket->ops)->connect(csocket,
-					       (struct sockaddr *)&stor,
+					       (struct sockaddr_unsized *)&stor,
 					       sizeof(stor), 0);
 	if (err < 0) {
 		pr_err("%s (%d): problem connecting socket to %s\n",
@@ -1058,8 +1058,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
 
 		return err;
 	}
-	err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server,
-			sizeof(struct sockaddr_un) - 1, 0);
+	err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server,
+					       sizeof(struct sockaddr_un) - 1, 0);
 	if (err < 0) {
 		pr_err("%s (%d): problem connecting socket: %s: %d\n",
 		       __func__, task_pid_nr(current), addr, err);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 45db43cde67f..2a01fff46c9d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1204,7 +1204,7 @@ out:
 }
 
 /* Set the address we talk to */
-static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
+static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			 int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 62fdf07c53de..8f5e76f5dd9e 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -56,10 +56,10 @@ out:
 	return error;
 }
 
-static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
+static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
 		       int sockaddr_len, int flags)
 {
-	return pvc_bind(sock, (struct sockaddr_unsized *)sockaddr, sockaddr_len);
+	return pvc_bind(sock, sockaddr, sockaddr_len);
 }
 
 static int pvc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 1906a493c8aa..005964250ecd 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -153,7 +153,7 @@ out:
 	return error;
 }
 
-static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
+static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
 		       int sockaddr_len, int flags)
 {
 	DEFINE_WAIT(wait);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 23c558ff9682..7ebbff2f0020 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1175,7 +1175,7 @@ out:
  *	FIXME: nonblock behaviour looks like it may have a bug.
  */
 static int __must_check ax25_connect(struct socket *sock,
-	struct sockaddr *uaddr, int addr_len, int flags)
+	struct sockaddr_unsized *uaddr, int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	ax25_cb *ax25 = sk_to_ax25(sk), *ax25t;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 6a7e1b4a8701..243505b89733 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1080,7 +1080,7 @@ done:
 	return err;
 }
 
-static int iso_sock_connect(struct socket *sock, struct sockaddr *addr,
+static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			    int alen, int flags)
 {
 	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index ca7394d8fa4e..9ee189c815d4 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -178,7 +178,7 @@ done:
 	return err;
 }
 
-static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
+static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			      int alen, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index d62fd6c57617..57b1dca8141f 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -808,7 +808,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
 	addr.l2_psm    = cpu_to_le16(L2CAP_PSM_RFCOMM);
 	addr.l2_cid    = 0;
 	addr.l2_bdaddr_type = BDADDR_BREDR;
-	*err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
+	*err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK);
 	if (*err == 0 || *err == -EINPROGRESS)
 		return s;
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 8c8762bbc6de..be6639cd6f59 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -371,7 +371,8 @@ done:
 	return err;
 }
 
-static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+			       int alen, int flags)
 {
 	struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
 	struct sock *sk = sock->sk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 01d878205e58..7afe65e7ff37 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -639,7 +639,7 @@ done:
 	return err;
 }
 
-static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags)
 {
 	struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
 	struct sock *sk = sock->sk;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 039dfbd367c9..af218742af5a 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -734,7 +734,7 @@ bad_sol:
  *  o sock->state: holds the SS_* socket state and is updated by connect and
  *	disconnect.
  */
-static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
+static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 5e690a2377e4..7eba8ae01a5b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1657,7 +1657,7 @@ static int bcm_release(struct socket *sock)
 	return 0;
 }
 
-static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
+static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len,
 		       int flags)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index a2abedc757d0..6272326dd614 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -535,7 +535,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, in
 	return ret;
 }
 
-static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr,
+static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			    int len, int flags)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f8181acaf870..70b25f4ecba6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -460,7 +460,7 @@ int ceph_tcp_connect(struct ceph_connection *con)
 	set_sock_callbacks(sock, con);
 
 	con_sock_state_connecting(con);
-	ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss),
+	ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss),
 			     O_NONBLOCK);
 	if (ret == -EINPROGRESS) {
 		dout("connect %s EINPROGRESS sk_state = %u\n",
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e1ce18bba16..f97a0e958991 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3468,7 +3468,7 @@ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
 }
 EXPORT_SYMBOL(sock_no_bind);
 
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
 		    int len, int flags)
 {
 	return -EOPNOTSUPP;
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 99ddfad9bb88..b93fd85f248a 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -107,7 +107,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *ua
 	return sock_no_bind(sock, uaddr, addr_len);
 }
 
-static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr,
+static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 				   int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -118,7 +118,7 @@ static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
-	return sk->sk_prot->connect(sk, uaddr, addr_len);
+	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 }
 
 static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index aa43d16e48ff..0844de9ac6a4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -567,7 +567,7 @@ out:
 	return err;
 }
 
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		       int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -584,14 +584,14 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		return prot->disconnect(sk, flags);
 
 	if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
-		err = prot->pre_connect(sk, uaddr, addr_len);
+		err = prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len);
 		if (err)
 			return err;
 	}
 
 	if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
 		return -EAGAIN;
-	return prot->connect(sk, uaddr, addr_len);
+	return prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
@@ -623,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
  *	Connect to a remote host. There is regrettably still a little
  *	TCP 'magic' in here.
  */
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			  int addr_len, int flags, int is_sendmsg)
 {
 	struct sock *sk = sock->sk;
@@ -671,12 +671,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			goto out;
 
 		if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
-			err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+			err = sk->sk_prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len);
 			if (err)
 				goto out;
 		}
 
-		err = sk->sk_prot->connect(sk, uaddr, addr_len);
+		err = sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 		if (err < 0)
 			goto out;
 
@@ -741,7 +741,7 @@ sock_error:
 }
 EXPORT_SYMBOL(__inet_stream_connect);
 
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			int addr_len, int flags)
 {
 	int err;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a9345aa5a2e5..dee578aad690 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1061,7 +1061,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 		}
 	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
-	err = __inet_stream_connect(sk->sk_socket, uaddr,
+	err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr,
 				    msg->msg_namelen, flags, 1);
 	/* fastopen_req could already be freed in __inet_stream_connect
 	 * if the connection times out or gets rst
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 11e5a88c923d..b1f667c52cb2 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -38,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 		udp_addr.sin_family = AF_INET;
 		udp_addr.sin_addr = cfg->peer_ip;
 		udp_addr.sin_port = cfg->peer_udp_port;
-		err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+		err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr,
 				     sizeof(udp_addr), 0);
 		if (err < 0)
 			goto error;
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b0d9286b33c8..cef3e0210744 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -52,7 +52,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 		       sizeof(udp6_addr.sin6_addr));
 		udp6_addr.sin6_port = cfg->peer_udp_port;
 		err = kernel_connect(sock,
-				     (struct sockaddr *)&udp6_addr,
+				     (struct sockaddr_unsized *)&udp6_addr,
 				     sizeof(udp6_addr), 0);
 	}
 	if (err < 0)
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 3941e32cda69..a4f1df92417d 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -668,7 +668,7 @@ static int iucv_sock_autobind(struct sock *sk)
 	return err;
 }
 
-static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr)
+static int afiucv_path_connect(struct socket *sock, struct sockaddr_unsized *addr)
 {
 	DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
 	struct sock *sk = sock->sk;
@@ -714,7 +714,7 @@ done:
 }
 
 /* Connect an unconnected socket */
-static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr,
+static int iucv_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			     int alen, int flags)
 {
 	DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 4b5e372a5cd4..c4f4a57cd67c 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1513,7 +1513,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 			       sizeof(ip6_addr.l2tp_addr));
 			ip6_addr.l2tp_conn_id = peer_tunnel_id;
 			err = kernel_connect(sock,
-					     (struct sockaddr *)&ip6_addr,
+					     (struct sockaddr_unsized *)&ip6_addr,
 					     sizeof(ip6_addr), 0);
 			if (err < 0)
 				goto out;
@@ -1538,7 +1538,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 			ip_addr.l2tp_family = AF_INET;
 			ip_addr.l2tp_addr = cfg->peer_ip;
 			ip_addr.l2tp_conn_id = peer_tunnel_id;
-			err = kernel_connect(sock, (struct sockaddr *)&ip_addr,
+			err = kernel_connect(sock, (struct sockaddr_unsized *)&ip_addr,
 					     sizeof(ip_addr), 0);
 			if (err < 0)
 				goto out;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 5e12e7ce17d8..ae4543d5597b 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -684,7 +684,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net,
 
 /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
  */
-static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
+static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr,
 			    int sockaddr_len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index e5bb0c0d708c..59d593bb5d18 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -477,7 +477,7 @@ out:
  *	This function will autobind if user did not previously call bind.
  *	Returns: 0 upon success, negative otherwise.
  */
-static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
+static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			  int addrlen, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 5b1ef50637b7..209a963112e3 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -128,7 +128,7 @@ out_release:
 /* Used to set a specific peer prior to bind. Not used for outbound
  * connections (Tag Owner set) since MCTP is a datagram protocol.
  */
-static int mctp_connect(struct socket *sock, struct sockaddr *addr,
+static int mctp_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			int addrlen, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c
index 953d41902771..35f6be814567 100644
--- a/net/mctp/test/utils.c
+++ b/net/mctp/test/utils.c
@@ -279,7 +279,7 @@ void mctp_test_bind_run(struct kunit *test,
 		addr.smctp_addr.s_addr = setup->peer_addr;
 		/* connect() type must match bind() type */
 		addr.smctp_type = setup->bind_type;
-		rc = kernel_connect(*sock, (struct sockaddr *)&addr,
+		rc = kernel_connect(*sock, (struct sockaddr_unsized *)&addr,
 				    sizeof(addr), 0);
 		KUNIT_EXPECT_EQ(test, rc, 0);
 	}
@@ -292,5 +292,6 @@ void mctp_test_bind_run(struct kunit *test,
 	addr.smctp_type = setup->bind_type;
 
 	*ret_bind_errno =
-		kernel_bind(*sock, (struct sockaddr *)&addr, sizeof(addr));
+		kernel_bind(*sock, (struct sockaddr_unsized *)&addr,
+			    sizeof(addr));
 }
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index d90237bf433c..30961b3d1702 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1680,7 +1680,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
 
 	sock_hold(ssk);
 	list_add_tail(&subflow->node, &msk->conn_list);
-	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+	err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK);
 	if (err && err != -EINPROGRESS) {
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
 		pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index d8c089ef387c..5a0c6f42bd8f 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1501,7 +1501,7 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id,
 	}
 
 	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
-	result = kernel_connect(sock, (struct sockaddr *)&mcast_addr,
+	result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr,
 				salen, 0);
 	if (result < 0) {
 		pr_err("Error connecting to the multicast addr\n");
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 18490a56edd0..8e5151f0c6e4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1054,7 +1054,7 @@ unlock:
 	return err;
 }
 
-static int netlink_connect(struct socket *sock, struct sockaddr *addr,
+static int netlink_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			   int alen, int flags)
 {
 	int err = 0;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 33468124d53d..5ed1a71ceec1 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -632,8 +632,8 @@ static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr
 	return 0;
 }
 
-static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
-	int addr_len, int flags)
+static int nr_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+		      int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	struct nr_sock *nr = nr_sk(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 26e6ceb48a82..f1be1e84f665 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -648,7 +648,7 @@ out:
 	return err;
 }
 
-static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr,
+static int llcp_sock_connect(struct socket *sock, struct sockaddr_unsized *_addr,
 			     int len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 5125392bb68e..b049022399ae 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -73,7 +73,7 @@ static int rawsock_release(struct socket *sock)
 	return 0;
 }
 
-static int rawsock_connect(struct socket *sock, struct sockaddr *_addr,
+static int rawsock_connect(struct socket *sock, struct sockaddr_unsized *_addr,
 			   int len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 478b02647733..9391378083a4 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -214,8 +214,8 @@ static int pn_socket_autobind(struct socket *sock)
 	return 0; /* socket was already bound */
 }
 
-static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
-		int len, int flags)
+static int pn_socket_connect(struct socket *sock, struct sockaddr_unsized *addr,
+			     int len, int flags)
 {
 	struct sock *sk = sock->sk;
 	struct pn_sock *pn = pn_sk(sk);
@@ -252,7 +252,7 @@ static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
 	pn->resource = pn_sockaddr_get_resource(spn);
 	sock->state = SS_CONNECTING;
 
-	err = sk->sk_prot->connect(sk, addr, len);
+	err = sk->sk_prot->connect(sk, (struct sockaddr *)addr, len);
 	if (err) {
 		sock->state = SS_UNCONNECTED;
 		pn->dobject = 0;
diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c
index 00bd3dd9f0f9..dab839f61ee9 100644
--- a/net/qrtr/af_qrtr.c
+++ b/net/qrtr/af_qrtr.c
@@ -1084,7 +1084,7 @@ out:
 	return rc;
 }
 
-static int qrtr_connect(struct socket *sock, struct sockaddr *saddr,
+static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr,
 			int len, int flags)
 {
 	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 4a7217fbeab6..b396c673dfaf 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -533,7 +533,7 @@ out:
 
 }
 
-static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		       int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 1eff3b03ab77..92891b0d224d 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -173,7 +173,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 	 * own the socket
 	 */
 	rds_tcp_set_callbacks(sock, cp);
-	ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK);
+	ret = kernel_connect(sock, (struct sockaddr_unsized *)addr, addrlen, O_NONBLOCK);
 
 	rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
 	if (ret == -EINPROGRESS)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 47369eab5aec..fd67494f2815 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -765,7 +765,8 @@ out_release:
 	return err;
 }
 
-static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
+static int rose_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len,
+			int flags)
 {
 	struct sock *sk = sock->sk;
 	struct rose_sock *rose = rose_sk(sk);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 245f37a74394..0c2c68c4b07e 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -481,7 +481,7 @@ EXPORT_SYMBOL(rxrpc_kernel_set_notifications);
  * - this just targets it at a specific destination; no actual connection
  *   negotiation takes place
  */
-static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
+static int rxrpc_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len, int flags)
 {
 	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ac737e60829b..940abbced191 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4820,7 +4820,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr,
 	return err;
 }
 
-int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
+int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		      int addr_len, int flags)
 {
 	if (addr_len < sizeof(uaddr->sa_family))
@@ -4829,7 +4829,7 @@ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (uaddr->sa_family == AF_UNSPEC)
 		return -EOPNOTSUPP;
 
-	return sctp_connect(sock->sk, uaddr, addr_len, flags);
+	return sctp_connect(sock->sk, (struct sockaddr *)uaddr, addr_len, flags);
 }
 
 /* Only called when shutdown a listening SCTP socket. */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index be18ab08f15d..0ef3e16a8517 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1642,7 +1642,7 @@ out:
 	release_sock(&smc->sk);
 }
 
-int smc_connect(struct socket *sock, struct sockaddr *addr,
+int smc_connect(struct socket *sock, struct sockaddr_unsized *addr,
 		int alen, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -1694,7 +1694,7 @@ int smc_connect(struct socket *sock, struct sockaddr *addr,
 		rc = -EALREADY;
 		goto out;
 	}
-	rc = kernel_connect(smc->clcsock, addr, alen, flags);
+	rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags);
 	if (rc && rc != -EINPROGRESS)
 		goto out;
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a008dbe6d6f6..9e6af72784ba 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -44,7 +44,7 @@ void smc_release_cb(struct sock *sk);
 int smc_release(struct socket *sock);
 int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
 	     int addr_len);
-int smc_connect(struct socket *sock, struct sockaddr *addr,
+int smc_connect(struct socket *sock, struct sockaddr_unsized *addr,
 		int alen, int flags);
 int smc_accept(struct socket *sock, struct socket *new_sock,
 	       struct proto_accept_arg *arg);
diff --git a/net/socket.c b/net/socket.c
index aaefb2e519a7..101a7ed574e7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2099,8 +2099,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
 	if (err)
 		goto out;
 
-	err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address,
-				addrlen, sock->file->f_flags | file_flags);
+	err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address,
+					    addrlen, sock->file->f_flags | file_flags);
 out:
 	return err;
 }
@@ -3662,14 +3662,14 @@ EXPORT_SYMBOL(kernel_accept);
  *	Returns 0 or an error code.
  */
 
-int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
+int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen,
 		   int flags)
 {
 	struct sockaddr_storage address;
 
 	memcpy(&address, addr, addrlen);
 
-	return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address,
+	return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address,
 					     addrlen, flags);
 }
 EXPORT_SYMBOL(kernel_connect);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 318ee24ad900..58442ae1c2da 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1474,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
 		goto out_release;
 	}
 
-	err = kernel_connect(sock, sap, salen, 0);
+	err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0);
 	if (err < 0) {
 		dprintk("RPC:       can't connect UDP socket (%d)\n", err);
 		goto out_release;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 95732a45b059..2e1fe6013361 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2005,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 
 	xs_stream_start_connect(transport);
 
-	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
+	return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0);
 }
 
 /**
@@ -2405,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 	/* Tell the socket layer to start connecting... */
 	set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
-	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+	return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt),
+			      xprt->addrlen, O_NONBLOCK);
 }
 
 /**
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3903a97ada7d..817b07d95a91 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2565,7 +2565,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr)
  *
  * Return: 0 on success, errno otherwise
  */
-static int tipc_connect(struct socket *sock, struct sockaddr *dest,
+static int tipc_connect(struct socket *sock, struct sockaddr_unsized *dest,
 			int destlen, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 788775f0eea7..3b44cadaed96 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -844,7 +844,7 @@ out:
 
 static int unix_release(struct socket *);
 static int unix_bind(struct socket *, struct sockaddr_unsized *, int);
-static int unix_stream_connect(struct socket *, struct sockaddr *,
+static int unix_stream_connect(struct socket *, struct sockaddr_unsized *,
 			       int addr_len, int flags);
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
@@ -866,7 +866,7 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
-static int unix_dgram_connect(struct socket *, struct sockaddr *,
+static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *,
 			      int, int);
 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
@@ -1512,7 +1512,7 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 	unix_state_unlock(sk2);
 }
 
-static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
+static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			      int alen, int flags)
 {
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
@@ -1631,7 +1631,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
 	return timeo;
 }
 
-static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			       int addr_len, int flags)
 {
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 0e5609e7284b..72bb6b7ed386 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -995,7 +995,7 @@ vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 
 	sk = sock->sk;
 
-	if (vsock_addr_cast((struct sockaddr *)addr, addr_len, &vm_addr) != 0)
+	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
 		return -EINVAL;
 
 	lock_sock(sk);
@@ -1328,7 +1328,7 @@ out:
 }
 
 static int vsock_dgram_connect(struct socket *sock,
-			       struct sockaddr *addr, int addr_len, int flags)
+			       struct sockaddr_unsized *addr, int addr_len, int flags)
 {
 	int err;
 	struct sock *sk;
@@ -1528,7 +1528,7 @@ static void vsock_connect_timeout(struct work_struct *work)
 	sock_put(sk);
 }
 
-static int vsock_connect(struct socket *sock, struct sockaddr *addr,
+static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len, int flags)
 {
 	int err;
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
index 223b9660a759..a986aa6fff9b 100644
--- a/net/vmw_vsock/vsock_addr.c
+++ b/net/vmw_vsock/vsock_addr.c
@@ -57,7 +57,7 @@ bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
 }
 EXPORT_SYMBOL_GPL(vsock_addr_equals_addr);
 
-int vsock_addr_cast(const struct sockaddr *addr,
+int vsock_addr_cast(const struct sockaddr_unsized *addr,
 		    size_t len, struct sockaddr_vm **out_addr)
 {
 	if (len < sizeof(**out_addr))
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index ca8006d8f792..af8762b24039 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -743,7 +743,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk)
 	return rc;
 }
 
-static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
+static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		       int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c
index b27d861f354f..d1814582319b 100644
--- a/samples/qmi/qmi_sample_client.c
+++ b/samples/qmi/qmi_sample_client.c
@@ -468,7 +468,7 @@ static int qmi_sample_probe(struct platform_device *pdev)
 		return ret;
 
 	sq = dev_get_platdata(&pdev->dev);
-	ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq,
+	ret = kernel_connect(sample->qmi.sock, (struct sockaddr_unsized *)sq,
 			     sizeof(*sq), 0);
 	if (ret < 0) {
 		pr_err("failed to connect to remote service port\n");
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 0497b5dea25c..8eeebaa951f0 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -900,7 +900,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args)
 		goto out;
 	}
 
-	err = kernel_connect(sock, (struct sockaddr *)&args->addr,
+	err = kernel_connect(sock, (struct sockaddr_unsized *)&args->addr,
 			     args->addrlen, 0);
 out:
 	mutex_unlock(&sock_lock);
-- 
cgit v1.2.3


From 3d39d34146f2b38127eadf36a0513e130eaa7eec Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:12 -0800
Subject: net: Remove struct sockaddr from net.h

Now that struct sockaddr is no longer used by net.h, remove its forward
declaration.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-4-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/net.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index db6bc997ca5b..f58b38ab37f8 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -148,7 +148,6 @@ typedef struct {
 
 struct vm_area_struct;
 struct page;
-struct sockaddr;
 struct msghdr;
 struct module;
 struct sk_buff;
-- 
cgit v1.2.3


From 449f68f8fffa2c41fc265730bd05a3c4947916c1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:13 -0800
Subject: net: Convert proto callbacks from sockaddr to sockaddr_unsized

Convert struct proto pre_connect(), connect(), bind(), and bind_add()
callback function prototypes from struct sockaddr to struct sockaddr_unsized.
This does not change per-implementation uses of sockaddr for passing around
an arbitrarily sized sockaddr struct. Those will be addressed in future
patches.

Additionally, remove the no longer referenced struct sockaddr forward
declaration from include/net/inet_common.h.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-5-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 fs/dlm/lowcomms.c         |  4 ++--
 include/net/inet_common.h |  5 ++---
 include/net/ip.h          |  4 ++--
 include/net/ipv6.h        |  8 ++++----
 include/net/ipv6_stubs.h  |  2 +-
 include/net/ping.h        |  2 +-
 include/net/sock.h        | 10 +++++-----
 include/net/tcp.h         |  2 +-
 include/net/udp.h         |  2 +-
 net/core/filter.c         |  5 +++--
 net/core/sock.c           |  2 +-
 net/ieee802154/socket.c   | 12 ++++++------
 net/ipv4/af_inet.c        | 14 +++++++-------
 net/ipv4/datagram.c       |  4 ++--
 net/ipv4/ping.c           |  8 ++++----
 net/ipv4/raw.c            |  3 ++-
 net/ipv4/tcp_ipv4.c       |  4 ++--
 net/ipv4/udp.c            |  6 ++++--
 net/ipv6/af_inet6.c       |  6 +++---
 net/ipv6/datagram.c       |  8 ++++----
 net/ipv6/ping.c           |  2 +-
 net/ipv6/raw.c            |  3 ++-
 net/ipv6/tcp_ipv6.c       |  6 +++---
 net/ipv6/udp.c            |  5 +++--
 net/l2tp/l2tp_ip.c        |  6 ++++--
 net/l2tp/l2tp_ip6.c       |  5 +++--
 net/mptcp/pm_kernel.c     |  4 ++--
 net/mptcp/protocol.c      |  7 ++++---
 net/phonet/pep.c          |  3 ++-
 net/phonet/socket.c       |  4 ++--
 net/sctp/socket.c         |  9 +++++----
 31 files changed, 88 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index f832dafdaca8..b3958008ba3f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1126,7 +1126,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
 static int sctp_bind_addrs(struct socket *sock, __be16 port)
 {
 	struct sockaddr_storage localaddr;
-	struct sockaddr *addr = (struct sockaddr *)&localaddr;
+	struct sockaddr_unsized *addr = (struct sockaddr_unsized *)&localaddr;
 	int i, addr_len, result = 0;
 
 	for (i = 0; i < dlm_local_count; i++) {
@@ -1134,7 +1134,7 @@ static int sctp_bind_addrs(struct socket *sock, __be16 port)
 		make_sockaddr(&localaddr, port, &addr_len);
 
 		if (!i)
-			result = kernel_bind(sock, (struct sockaddr_unsized *)addr, addr_len);
+			result = kernel_bind(sock, addr, addr_len);
 		else
 			result = sock_bind_add(sock->sk, addr, addr_len);
 
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index ebafd96912bb..5dd2bf24449e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -19,7 +19,6 @@ struct msghdr;
 struct net;
 struct page;
 struct sock;
-struct sockaddr;
 struct socket;
 
 int inet_release(struct socket *sock);
@@ -43,7 +42,7 @@ int inet_listen(struct socket *sock, int backlog);
 int __inet_listen_sk(struct sock *sk, int backlog);
 void inet_sock_destruct(struct sock *sk);
 int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
-int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 /* Don't allocate port at this moment, defer to connect. */
 #define BIND_FORCE_ADDRESS_NO_PORT	(1 << 0)
 /* Grab and release socket lock. */
@@ -52,7 +51,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 #define BIND_FROM_BPF			(1 << 2)
 /* Skip CAP_NET_BIND_SERVICE check. */
 #define BIND_NO_CAP_NET_BIND_SERVICE	(1 << 3)
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
 		u32 flags);
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		 int peer);
diff --git a/include/net/ip.h b/include/net/ip.h
index 380afb691c41..69d5cef46004 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -261,8 +261,8 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
 }
 
 /* datagram.c */
-int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 
 void ip4_datagram_release_cb(struct sock *sk);
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2188bad9a687..74fbf1ad8065 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1188,10 +1188,10 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 int ipv6_getsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int __user *optlen);
 
-int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr,
 			   int addr_len);
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
-int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
+int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len);
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr,
 				 int addr_len);
 int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
 void ip6_datagram_release_cb(struct sock *sk);
@@ -1209,7 +1209,7 @@ void inet6_cleanup_sock(struct sock *sk);
 void inet6_sock_destruct(struct sock *sk);
 int inet6_release(struct socket *sock);
 int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
-int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer);
 int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 8a3465c8c2c5..d3013e721b14 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -80,7 +80,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly;
 
 /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
 struct ipv6_bpf_stub {
-	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+	int (*inet6_bind)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
 			  u32 flags);
 	struct sock *(*udp6_lib_lookup)(const struct net *net,
 				     const struct in6_addr *saddr, __be16 sport,
diff --git a/include/net/ping.h b/include/net/ping.h
index 9634b8800814..05bfd594a64c 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -58,7 +58,7 @@ void ping_unhash(struct sock *sk);
 
 int  ping_init_sock(struct sock *sk);
 void ping_close(struct sock *sk, long timeout);
-int  ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int  ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 void ping_err(struct sk_buff *skb, int offset, u32 info);
 int  ping_getfrag(void *from, char *to, int offset, int fraglen, int odd,
 		  struct sk_buff *);
diff --git a/include/net/sock.h b/include/net/sock.h
index 589fbce77217..a5f36ea9d46f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1274,10 +1274,10 @@ struct proto {
 	void			(*close)(struct sock *sk,
 					long timeout);
 	int			(*pre_connect)(struct sock *sk,
-					struct sockaddr *uaddr,
+					struct sockaddr_unsized *uaddr,
 					int addr_len);
 	int			(*connect)(struct sock *sk,
-					struct sockaddr *uaddr,
+					struct sockaddr_unsized *uaddr,
 					int addr_len);
 	int			(*disconnect)(struct sock *sk, int flags);
 
@@ -1306,9 +1306,9 @@ struct proto {
 					   size_t len, int flags, int *addr_len);
 	void			(*splice_eof)(struct socket *sock);
 	int			(*bind)(struct sock *sk,
-					struct sockaddr *addr, int addr_len);
+					struct sockaddr_unsized *addr, int addr_len);
 	int			(*bind_add)(struct sock *sk,
-					struct sockaddr *addr, int addr_len);
+					struct sockaddr_unsized *addr, int addr_len);
 
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
@@ -3105,7 +3105,7 @@ void sock_set_reuseaddr(struct sock *sk);
 void sock_set_reuseport(struct sock *sk);
 void sock_set_sndtimeo(struct sock *sk, s64 secs);
 
-int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len);
 
 int sock_get_timeout(long timeo, void *optval, bool old_timeval);
 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4fd6d8d1230d..0aa1f07d036a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -535,7 +535,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req_unhash,
 				  bool *own_req);
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 int tcp_connect(struct sock *sk);
 enum tcp_synack_type {
 	TCP_SYNACK_NORMAL,
diff --git a/include/net/udp.h b/include/net/udp.h
index cffedb3e40f2..a061d1b22ddc 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -424,7 +424,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, int *karg);
 int udp_init_sock(struct sock *sk);
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
 __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
diff --git a/net/core/filter.c b/net/core/filter.c
index 16105f52927d..90273da74807 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5978,7 +5978,7 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
 			return err;
 		if (((struct sockaddr_in *)addr)->sin_port == htons(0))
 			flags |= BIND_FORCE_ADDRESS_NO_PORT;
-		return __inet_bind(sk, addr, addr_len, flags);
+		return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (addr->sa_family == AF_INET6) {
 		if (addr_len < SIN6_LEN_RFC2133)
@@ -5988,7 +5988,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
 		/* ipv6_bpf_stub cannot be NULL, since it's called from
 		 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
 		 */
-		return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
+		return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr,
+						 addr_len, flags);
 #endif /* CONFIG_IPV6 */
 	}
 #endif /* CONFIG_INET */
diff --git a/net/core/sock.c b/net/core/sock.c
index f97a0e958991..3b74fc71f51c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4395,7 +4395,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
 EXPORT_SYMBOL(sk_busy_loop_end);
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
-int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
 {
 	if (!sk->sk_prot->bind_add)
 		return -EOPNOTSUPP;
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index b93fd85f248a..e542fbe113e7 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -102,7 +102,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *ua
 	struct sock *sk = sock->sk;
 
 	if (sk->sk_prot->bind)
-		return sk->sk_prot->bind(sk, (struct sockaddr *)uaddr, addr_len);
+		return sk->sk_prot->bind(sk, uaddr, addr_len);
 
 	return sock_no_bind(sock, uaddr, addr_len);
 }
@@ -118,7 +118,7 @@ static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
-	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+	return sk->sk_prot->connect(sk, uaddr, addr_len);
 }
 
 static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
@@ -193,7 +193,7 @@ static void raw_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 }
 
-static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *_uaddr, int len)
 {
 	struct ieee802154_addr addr;
 	struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr;
@@ -227,7 +227,7 @@ out:
 	return err;
 }
 
-static int raw_connect(struct sock *sk, struct sockaddr *uaddr,
+static int raw_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 		       int addr_len)
 {
 	return -ENOTSUPP;
@@ -485,7 +485,7 @@ static void dgram_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 }
 
-static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
+static int dgram_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int len)
 {
 	struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
 	struct ieee802154_addr haddr;
@@ -563,7 +563,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, int *karg)
 }
 
 /* FIXME: autobind */
-static int dgram_connect(struct sock *sk, struct sockaddr *uaddr,
+static int dgram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			 int len)
 {
 	struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0844de9ac6a4..d5ac089356eb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -441,7 +441,7 @@ int inet_release(struct socket *sock)
 }
 EXPORT_SYMBOL(inet_release);
 
-int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	u32 flags = BIND_WITH_LOCK;
 	int err;
@@ -466,11 +466,11 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
-	return inet_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len);
+	return inet_bind_sk(sock->sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_bind);
 
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
 		u32 flags)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
@@ -584,14 +584,14 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 		return prot->disconnect(sk, flags);
 
 	if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
-		err = prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len);
+		err = prot->pre_connect(sk, uaddr, addr_len);
 		if (err)
 			return err;
 	}
 
 	if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
 		return -EAGAIN;
-	return prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+	return prot->connect(sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
@@ -671,12 +671,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
 			goto out;
 
 		if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
-			err = sk->sk_prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len);
+			err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
 			if (err)
 				goto out;
 		}
 
-		err = sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+		err = sk->sk_prot->connect(sk, uaddr, addr_len);
 		if (err < 0)
 			goto out;
 
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index c2b2cda1a7e5..1614593b6d72 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,7 +16,7 @@
 #include <net/tcp_states.h>
 #include <net/sock_reuseport.h>
 
-int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -84,7 +84,7 @@ out:
 }
 EXPORT_SYMBOL(__ip4_datagram_connect);
 
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	int res;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 5321c5801c64..ad56588107cc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -286,7 +286,7 @@ void ping_close(struct sock *sk, long timeout)
 }
 EXPORT_IPV6_MOD_GPL(ping_close);
 
-static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			    int addr_len)
 {
 	/* This check is replicated from __ip4_datagram_connect() and
@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
 static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
-				struct sockaddr *uaddr, int addr_len)
+				struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct net *net = sock_net(sk);
 	if (sk->sk_family == AF_INET) {
@@ -387,7 +387,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 	return 0;
 }
 
-static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr)
 {
 	if (saddr->sa_family == AF_INET) {
 		struct inet_sock *isk = inet_sk(sk);
@@ -407,7 +407,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
  * Moreover, we don't allow binding to multi- and broadcast addresses.
  */
 
-int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct inet_sock *isk = inet_sk(sk);
 	unsigned short snum;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d54ebb7df966..5998c4cc6f47 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -697,7 +697,8 @@ static void raw_destroy(struct sock *sk)
 }
 
 /* This gets rid of all the nasties in af_inet. -DaveM */
-static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+		    int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 40a76da5364a..b7526a7888cb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -205,7 +205,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 }
 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
 
-static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			      int addr_len)
 {
 	/* This check is replicated from tcp_v4_connect() and intended to
@@ -221,7 +221,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 }
 
 /* This will initiate an outgoing connection. */
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 	struct inet_timewait_death_row *tcp_death_row;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 30dfbf73729d..ffe074cb5865 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2159,7 +2159,8 @@ csum_copy_err:
 	goto try_again;
 }
 
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+		    int addr_len)
 {
 	/* This check is replicated from __ip4_datagram_connect() and
 	 * intended to prevent BPF program called below from accessing bytes
@@ -2172,7 +2173,8 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 }
 EXPORT_IPV6_MOD(udp_pre_connect);
 
-static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+		       int addr_len)
 {
 	int res;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c92d27e35fbc..b705751eb73c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -277,7 +277,7 @@ out_sk_release:
 	goto out;
 }
 
-static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
 			u32 flags)
 {
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
@@ -438,7 +438,7 @@ out_unlock:
 	goto out;
 }
 
-int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	u32 flags = BIND_WITH_LOCK;
 	const struct proto *prot;
@@ -467,7 +467,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 /* bind for INET6 API */
 int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
-	return inet6_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len);
+	return inet6_bind_sk(sock->sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet6_bind);
 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 33ebe93d80e3..83e03176819c 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -138,7 +138,7 @@ void ip6_datagram_release_cb(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
 
-int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			   int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
@@ -194,7 +194,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 
 		err = __ip4_datagram_connect(sk,
-					     (struct sockaddr *) &sin,
+					     (struct sockaddr_unsized *)&sin,
 					     sizeof(sin));
 
 ipv4_connected:
@@ -271,7 +271,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
 
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	int res;
 
@@ -282,7 +282,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_connect);
 
-int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr,
 				 int addr_len)
 {
 	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index d7a2cdaa2631..e4afc651731a 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -45,7 +45,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 	return 0;
 }
 
-static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			       int addr_len)
 {
 	/* This check is replicated from __ip6_datagram_connect() and
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e369f54844dd..b4cd05dba9b6 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -214,7 +214,8 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
 }
 
 /* This cleans up af_inet6 a bit. -DaveM */
-static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+		      int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 06eb90e4078e..7df21c1cba21 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -118,7 +118,7 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
 				   ipv6_hdr(skb)->saddr.s6_addr32);
 }
 
-static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			      int addr_len)
 {
 	/* This check is replicated from tcp_v6_connect() and intended to
@@ -133,7 +133,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
 }
 
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			  int addr_len)
 {
 	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
@@ -238,7 +238,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
-		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+		err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin));
 
 		if (err) {
 			icsk->icsk_ext_hdr_len = exthdrlen;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 813a2ba75824..794c13674e8a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1282,7 +1282,7 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
 	}
 }
 
-static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			     int addr_len)
 {
 	if (addr_len < offsetofend(struct sockaddr, sa_family))
@@ -1303,7 +1303,8 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
-static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+			 int addr_len)
 {
 	int res;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 29795d2839e8..cac1ff59cb83 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -267,7 +267,8 @@ static void l2tp_ip_destroy_sock(struct sock *sk)
 	}
 }
 
-static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+			int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr;
@@ -328,7 +329,8 @@ out:
 	return ret;
 }
 
-static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+			   int addr_len)
 {
 	struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr;
 	struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk));
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index ea232f338dcb..05a396ba6a3e 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -280,7 +280,8 @@ static void l2tp_ip6_destroy_sock(struct sock *sk)
 	}
 }
 
-static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+			 int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -383,7 +384,7 @@ out_unlock:
 	return err;
 }
 
-static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
+static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 			    int addr_len)
 {
 	struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr;
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index e50721c670d0..598f01a573c1 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -867,10 +867,10 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
 		addrlen = sizeof(struct sockaddr_in6);
 #endif
 	if (ssk->sk_family == AF_INET)
-		err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+		err = inet_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 	else if (ssk->sk_family == AF_INET6)
-		err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+		err = inet6_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen);
 #endif
 	if (err)
 		return err;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 53e2b095dfb1..4cd5df01446e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3746,7 +3746,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
 	return 0;
 }
 
-static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+			 int addr_len)
 {
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -3870,10 +3871,10 @@ static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int a
 	}
 
 	if (sk->sk_family == AF_INET)
-		err = inet_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len);
+		err = inet_bind_sk(ssk, uaddr, addr_len);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 	else if (sk->sk_family == AF_INET6)
-		err = inet6_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len);
+		err = inet6_bind_sk(ssk, uaddr, addr_len);
 #endif
 	if (!err)
 		mptcp_copy_inaddrs(sk, ssk);
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 4db564d9d522..120e711ea78c 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -882,7 +882,8 @@ drop:
 	return newsk;
 }
 
-static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
+static int pep_sock_connect(struct sock *sk, struct sockaddr_unsized *addr,
+			    int len)
 {
 	struct pep_sock *pn = pep_sk(sk);
 	int err;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 9391378083a4..4423d483c630 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -163,7 +163,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr_unsized *addr, in
 	u8 saddr;
 
 	if (sk->sk_prot->bind)
-		return sk->sk_prot->bind(sk, (struct sockaddr *)addr, len);
+		return sk->sk_prot->bind(sk, addr, len);
 
 	if (len < sizeof(struct sockaddr_pn))
 		return -EINVAL;
@@ -252,7 +252,7 @@ static int pn_socket_connect(struct socket *sock, struct sockaddr_unsized *addr,
 	pn->resource = pn_sockaddr_get_resource(spn);
 	sock->state = SS_CONNECTING;
 
-	err = sk->sk_prot->connect(sk, (struct sockaddr *)addr, len);
+	err = sk->sk_prot->connect(sk, addr, len);
 	if (err) {
 		sock->state = SS_UNCONNECTED;
 		pn->dobject = 0;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 940abbced191..38d2932acebf 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -306,7 +306,8 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
  *             sockaddr_in6 [RFC 2553]),
  *   addr_len - the size of the address structure.
  */
-static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
+static int sctp_bind(struct sock *sk, struct sockaddr_unsized *addr,
+		     int addr_len)
 {
 	int retval = 0;
 
@@ -1053,13 +1054,13 @@ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs,
 	}
 }
 
-static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs,
-		int addrlen)
+static int sctp_bind_add(struct sock *sk, struct sockaddr_unsized *addrs,
+			 int addrlen)
 {
 	int err;
 
 	lock_sock(sk);
-	err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR);
+	err = sctp_setsockopt_bindx(sk, (struct sockaddr *)addrs, addrlen, SCTP_BINDX_ADD_ADDR);
 	release_sock(sk);
 	return err;
 }
-- 
cgit v1.2.3


From 8116d803e7f8f20bf00ce23ff8bd0baab41e1635 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:14 -0800
Subject: bpf: Convert cgroup sockaddr filters to use sockaddr_unsized
 consistently

Update BPF cgroup sockaddr filtering infrastructure to use sockaddr_unsized
consistently throughout the call chain, removing redundant explicit casts
from callers.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-6-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/bpf-cgroup.h | 17 ++++++++++-------
 kernel/bpf/cgroup.c        |  4 ++--
 net/ipv4/af_inet.c         |  4 ++--
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a7fb4f46974f..d1eb5c7729cb 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -120,7 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
-				      struct sockaddr *uaddr,
+				      struct sockaddr_unsized *uaddr,
 				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
@@ -238,8 +238,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
-							  atype, NULL, NULL);  \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk,		       \
+				(struct sockaddr_unsized *)uaddr, uaddrlen,     \
+				atype, NULL, NULL);			       \
 	__ret;								       \
 })
 
@@ -248,8 +249,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
-							  atype, t_ctx, NULL); \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk,		       \
+				(struct sockaddr_unsized *)uaddr, uaddrlen,     \
+				atype, t_ctx, NULL);			       \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -266,8 +268,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \
-							  atype, NULL, &__flags); \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk,		       \
+				(struct sockaddr_unsized *)uaddr, uaddrlen,     \
+				atype, NULL, &__flags);			       \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
 			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 248f517d66d0..497aedc9afa1 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * returned value != 1 during execution. In all other cases, 0 is returned.
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
-				      struct sockaddr *uaddr,
+				      struct sockaddr_unsized *uaddr,
 				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
@@ -1673,7 +1673,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
-		.uaddr = uaddr,
+		.uaddr = (struct sockaddr *)uaddr,
 		.t_ctx = t_ctx,
 	};
 	struct sockaddr_storage unspec;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d5ac089356eb..a31b94ce8968 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -834,7 +834,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+		BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
 				       CGROUP_INET4_GETPEERNAME);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
@@ -842,7 +842,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 			addr = inet->inet_saddr;
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+		BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
 				       CGROUP_INET4_GETSOCKNAME);
 	}
 	release_sock(sk);
-- 
cgit v1.2.3


From c1a799eef62b8c3298a4d82753fe0f2a448e5e4f Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:15 -0800
Subject: bpf: Convert bpf_sock_addr_kern "uaddr" to sockaddr_unsized

Change struct bpf_sock_addr_kern to use sockaddr_unsized for the "uaddr"
field instead of sockaddr. This improves type safety in the BPF cgroup
socket address filtering code.

The casting in __cgroup_bpf_run_filter_sock_addr() is updated to match the
new type, removing an unnecessary cast in the initialization and updating
the conditional assignment to use the appropriate sockaddr_unsized cast.

Additionally rename the "unspec" variable to "storage" to better align
with its usage.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-7-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 2 +-
 kernel/bpf/cgroup.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5c859b8131a..e116de7edc58 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1515,7 +1515,7 @@ static inline int bpf_tell_extensions(void)
 
 struct bpf_sock_addr_kern {
 	struct sock *sk;
-	struct sockaddr *uaddr;
+	struct sockaddr_unsized *uaddr;
 	/* Temporary "register" to make indirect stores to nested structures
 	 * defined above. We need three registers to make such a store, but
 	 * only two (src and dst) are available at convert_ctx_access time
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 497aedc9afa1..69988af44b37 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1673,10 +1673,10 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
-		.uaddr = (struct sockaddr *)uaddr,
+		.uaddr = uaddr,
 		.t_ctx = t_ctx,
 	};
-	struct sockaddr_storage unspec;
+	struct sockaddr_storage storage;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 		return 0;
 
 	if (!ctx.uaddr) {
-		memset(&unspec, 0, sizeof(unspec));
-		ctx.uaddr = (struct sockaddr *)&unspec;
+		memset(&storage, 0, sizeof(storage));
+		ctx.uaddr = (struct sockaddr_unsized *)&storage;
 		ctx.uaddrlen = 0;
 	} else {
 		ctx.uaddrlen = *uaddrlen;
-- 
cgit v1.2.3


From 2b5e9f9b7e414c5eeb20dd7a7b80816ff55cf57b Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 3 Nov 2025 16:26:16 -0800
Subject: net: Convert struct sockaddr to fixed-size "sa_data[14]"

Revert struct sockaddr from flexible array to fixed 14-byte "sa_data",
to solve over 36,000 -Wflex-array-member-not-at-end warnings, since
struct sockaddr is embedded within many network structs.

With socket/proto sockaddr-based internal APIs switched to use struct
sockaddr_unsized, there should be no more uses of struct sockaddr that
depend on reading beyond the end of struct sockaddr::sa_data that might
trigger bounds checking.

Comparing an x86_64 "allyesconfig" vmlinux build before and after this
patch showed no new "ud1" instructions from CONFIG_UBSAN_BOUNDS nor any
new "field-spanning" memcpy CONFIG_FORTIFY_SOURCE instrumentations.

Cc: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-8-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h                         |  6 ++----
 net/core/dev.c                                 |  2 +-
 net/core/dev_ioctl.c                           |  2 +-
 net/ipv4/arp.c                                 |  2 +-
 net/packet/af_packet.c                         | 10 +++++-----
 tools/perf/trace/beauty/include/linux/socket.h |  5 +----
 6 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 7b1a01be29da..944027f9765e 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -32,12 +32,10 @@ typedef __kernel_sa_family_t	sa_family_t;
  *	1003.1g requires sa_family_t and that sa_data is char.
  */
 
+/* Deprecated for in-kernel use. Use struct sockaddr_unsized instead. */
 struct sockaddr {
 	sa_family_t	sa_family;	/* address family, AF_xxx	*/
-	union {
-		char sa_data_min[14];		/* Minimum 14 bytes of protocol address	*/
-		DECLARE_FLEX_ARRAY(char, sa_data);
-	};
+	char		sa_data[14];	/* 14 bytes of protocol address	*/
 };
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index ba39146bbd25..537aa43edff0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9973,7 +9973,7 @@ DECLARE_RWSEM(dev_addr_sem);
 /* "sa" is a true struct sockaddr with limited "sa_data" member. */
 int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 {
-	size_t size = sizeof(sa->sa_data_min);
+	size_t size = sizeof(sa->sa_data);
 	struct net_device *dev;
 	int ret = 0;
 
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index ad54b12d4b4c..b3ce0fb24a69 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -596,7 +596,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		if (ifr->ifr_hwaddr.sa_family != dev->type)
 			return -EINVAL;
 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
-		       min(sizeof(ifr->ifr_hwaddr.sa_data_min),
+		       min(sizeof(ifr->ifr_hwaddr.sa_data),
 			   (size_t)dev->addr_len));
 		netdev_lock_ops(dev);
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f3bfecf8a234..7f3863daaa40 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1189,7 +1189,7 @@ static int arp_req_get(struct net *net, struct arpreq *r)
 
 	read_lock_bh(&neigh->lock);
 	memcpy(r->arp_ha.sa_data, neigh->ha,
-	       min(dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
+	       min(dev->addr_len, sizeof(r->arp_ha.sa_data)));
 	r->arp_flags = arp_state_to_flags(neigh);
 	read_unlock_bh(&neigh->lock);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fccad2a529cc..494d628d10a5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3284,7 +3284,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr,
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr *sa = (struct sockaddr *)uaddr;
-	char name[sizeof(sa->sa_data_min) + 1];
+	char name[sizeof(sa->sa_data) + 1];
 
 	/*
 	 *	Check legality
@@ -3295,8 +3295,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr,
 	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
 	 * zero-terminated.
 	 */
-	memcpy(name, sa->sa_data, sizeof(sa->sa_data_min));
-	name[sizeof(sa->sa_data_min)] = 0;
+	memcpy(name, sa->sa_data, sizeof(sa->sa_data));
+	name[sizeof(sa->sa_data)] = 0;
 
 	return packet_do_bind(sk, name, 0, 0);
 }
@@ -3581,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 		return -EOPNOTSUPP;
 
 	uaddr->sa_family = AF_PACKET;
-	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
+	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
 	if (dev)
-		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
+		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
 	rcu_read_unlock();
 
 	return sizeof(*uaddr);
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index 3b262487ec06..77d7c59f5d8b 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -34,10 +34,7 @@ typedef __kernel_sa_family_t	sa_family_t;
 
 struct sockaddr {
 	sa_family_t	sa_family;	/* address family, AF_xxx	*/
-	union {
-		char sa_data_min[14];		/* Minimum 14 bytes of protocol address	*/
-		DECLARE_FLEX_ARRAY(char, sa_data);
-	};
+	char		sa_data[14];	/* 14 bytes of protocol address	*/
 };
 
 struct linger {
-- 
cgit v1.2.3


From 7c5b184db7145fd417785377337bd15c4fe1d0f4 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:29:59 -0400
Subject: genpt: Generic Page Table base API

The generic API is intended to be separated from the implementation of
page table algorithms. It contains only accessors for walking and
manipulating the table and helpers that are useful for building an
implementation. Memory management is not in the generic API, but part of
the implementation.

Using a multi-compilation approach the implementation module would include
headers in this order:

  common.h
  defs_FMT.h
  pt_defs.h
  FMT.h
  pt_common.h
  IMPLEMENTATION.h

Where each compilation unit would have a combination of FMT and
IMPLEMENTATION to produce a per-format per-implementation module.

The API is designed so that the format headers have minimal logic, and
default implementations are provided if the format doesn't include one.

Generally formats provide their code via an inline function using the
pattern:

  static inline FMTpt_XX(..) {}
  #define pt_XX FMTpt_XX

The common code then enforces a function signature so that there is no
drift in function arguments, or accidental polymorphic functions (as has
been slightly troublesome in mm). Use of function-like #defines is
avoided in the format even though many of the functions are small enough.

Provide kdocs for the API surface.

This is enough to implement the 8 initial format variations with all of
their features:
 * Entries comprised of contiguous blocks of IO PTEs for larger page
   sizes (AMDv1, ARMv8)
 * Multi-level tables, up to 6 levels. Runtime selected top level
 * The size of the top table level can be selected at runtime (ARM's
   concatenated tables)
 * The number of levels in the table can optionally increase dynamically
   during map (AMDv1)
 * Optional leaf entries at any level
 * 32 bit/64 bit virtual and output addresses, using every bit
 * Sign extended addressing (x86)
 * Dirty tracking

A basic simple format takes about 200 lines to declare the required inline
functions.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 .clang-format                              |   1 +
 drivers/iommu/Kconfig                      |   2 +
 drivers/iommu/generic_pt/Kconfig           |  20 +
 drivers/iommu/generic_pt/pt_common.h       | 358 ++++++++++++++++
 drivers/iommu/generic_pt/pt_defs.h         | 329 +++++++++++++++
 drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 +++++++++++
 drivers/iommu/generic_pt/pt_iter.h         | 636 +++++++++++++++++++++++++++++
 drivers/iommu/generic_pt/pt_log2.h         | 122 ++++++
 include/linux/generic_pt/common.h          | 135 ++++++
 9 files changed, 1836 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/Kconfig
 create mode 100644 drivers/iommu/generic_pt/pt_common.h
 create mode 100644 drivers/iommu/generic_pt/pt_defs.h
 create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
 create mode 100644 drivers/iommu/generic_pt/pt_iter.h
 create mode 100644 drivers/iommu/generic_pt/pt_log2.h
 create mode 100644 include/linux/generic_pt/common.h

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index f371a13b4d19..9e6a9177f8fb 100644
--- a/.clang-format
+++ b/.clang-format
@@ -415,6 +415,7 @@ ForEachMacros:
   - 'for_each_prop_dlc_cpus'
   - 'for_each_prop_dlc_platforms'
   - 'for_each_property_of_node'
+  - 'for_each_pt_level_entry'
   - 'for_each_rdt_resource'
   - 'for_each_reg'
   - 'for_each_reg_filtered'
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..c9ae3221cd6f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -384,3 +384,5 @@ config SPRD_IOMMU
 	  Say Y here if you want to use the multimedia devices listed above.
 
 endif # IOMMU_SUPPORT
+
+source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
new file mode 100644
index 000000000000..fb0f431ddba0
--- /dev/null
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig GENERIC_PT
+	bool "Generic Radix Page Table"
+	help
+	  Generic library for building radix tree page tables.
+
+	  Generic PT provides a set of HW page table formats and a common
+	  set of APIs to work with them.
+
+if GENERIC_PT
+config DEBUG_GENERIC_PT
+	bool "Extra debugging checks for GENERIC_PT"
+	help
+	  Enable extra run time debugging checks for GENERIC_PT code. This
+	  incurs a runtime cost and should not be enabled for production
+	  kernels.
+
+	  The kunit tests require this to be enabled to get full coverage.
+endif
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
new file mode 100644
index 000000000000..f64f800725db
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -0,0 +1,358 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included after the format. It contains definitions
+ * that build on the format definitions to create the basic format API.
+ *
+ * The format API is listed here, with kdocs. The functions without bodies are
+ * implemented in the format using the pattern:
+ *     static inline FMTpt_XXX(..) {..}
+ *     #define pt_XXX FMTpt_XXX
+ *
+ * If the format doesn't implement a function then pt_fmt_defaults.h can provide
+ * a generic version.
+ *
+ * The routines marked "@pts: Entry to query" operate on the entire contiguous
+ * entry and can be called with a pts->index pointing to any sub item that makes
+ * up that entry.
+ *
+ * The header order is:
+ *  pt_defs.h
+ *  FMT.h
+ *  pt_common.h
+ */
+#ifndef __GENERIC_PT_PT_COMMON_H
+#define __GENERIC_PT_PT_COMMON_H
+
+#include "pt_defs.h"
+#include "pt_fmt_defaults.h"
+
+/**
+ * pt_attr_from_entry() - Convert the permission bits back to attrs
+ * @pts: Entry to convert from
+ * @attrs: Resulting attrs
+ *
+ * Fill in the attrs with the permission bits encoded in the current leaf entry.
+ * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
+ * same entry.
+ */
+static inline void pt_attr_from_entry(const struct pt_state *pts,
+				      struct pt_write_attrs *attrs);
+
+/**
+ * pt_can_have_leaf() - True if the current level can have an OA entry
+ * @pts: The current level
+ *
+ * True if the current level can support pt_install_leaf_entry(). A leaf
+ * entry produces an OA.
+ */
+static inline bool pt_can_have_leaf(const struct pt_state *pts);
+
+/**
+ * pt_can_have_table() - True if the current level can have a lower table
+ * @pts: The current level
+ *
+ * Every level except 0 is allowed to have a lower table.
+ */
+static inline bool pt_can_have_table(const struct pt_state *pts)
+{
+	/* No further tables at level 0 */
+	return pts->level > 0;
+}
+
+/**
+ * pt_clear_entries() - Make entries empty (non-present)
+ * @pts: Starting table index
+ * @num_contig_lg2: Number of contiguous items to clear
+ *
+ * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
+ * and does not have any effect on table walking. The starting index must be
+ * aligned to num_contig_lg2.
+ */
+static inline void pt_clear_entries(struct pt_state *pts,
+				    unsigned int num_contig_lg2);
+
+/**
+ * pt_entry_make_write_dirty() - Make an entry dirty
+ * @pts: Table entry to change
+ *
+ * Make pt_entry_is_write_dirty() return true for this entry. This can be called
+ * asynchronously with any other table manipulation under a RCU lock and must
+ * not corrupt the table.
+ */
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts);
+
+/**
+ * pt_entry_make_write_clean() - Make the entry write clean
+ * @pts: Table entry to change
+ *
+ * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
+ * eventually be notified of this change via a TLB flush, which is the point
+ * that the HW must become synchronized. Any "write dirty" prior to the TLB
+ * flush can be lost, but once the TLB flush completes all writes must make
+ * their entries write dirty.
+ *
+ * The format should alter the entry in a way that is compatible with any
+ * concurrent update from HW. The entire contiguous entry is changed.
+ */
+static inline void pt_entry_make_write_clean(struct pt_state *pts);
+
+/**
+ * pt_entry_is_write_dirty() - True if the entry has been written to
+ * @pts: Entry to query
+ *
+ * "write dirty" means that the HW has written to the OA translated
+ * by this entry. If the entry is contiguous then the consolidated
+ * "write dirty" for all the items must be returned.
+ */
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);
+
+/**
+ * pt_dirty_supported() - True if the page table supports dirty tracking
+ * @common: Page table to query
+ */
+static inline bool pt_dirty_supported(struct pt_common *common);
+
+/**
+ * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the number of contiguous items this leaf entry spans. If the entry
+ * is single item it returns ilog2(1).
+ */
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa() - Output Address for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the output address for the start of the entry. If the entry
+ * is contiguous this returns the same value for each sub-item. I.e.::
+ *
+ *    log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
+ *
+ * See pt_item_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa_lg2sz() - Return the size of an OA entry
+ * @pts: Entry to query
+ *
+ * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
+ * it returns the total VA/OA size of the entire contiguous entry.
+ */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
+{
+	return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
+
+/**
+ * pt_entry_oa_exact() - Return the complete OA for an entry
+ * @pts: Entry to query
+ *
+ * During iteration the first entry could have a VA with an offset from the
+ * natural start of the entry. Return the exact OA including the pts's VA
+ * offset.
+ */
+static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
+{
+	return _pt_entry_oa_fast(pts) |
+	       log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
+}
+
+/**
+ * pt_full_va_prefix() - The top bits of the VA
+ * @common: Page table to query
+ *
+ * This is usually 0, but some formats have their VA space going downward from
+ * PT_VADDR_MAX, and will return that instead. This value must always be
+ * adjusted by struct pt_common max_vasz_lg2.
+ */
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
+
+/**
+ * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
+ * @common: Page table to query
+ *
+ * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
+ * This is useful to create optimized paths for common cases of PAGE_SIZE
+ * mappings.
+ */
+static inline bool pt_has_system_page_size(const struct pt_common *common);
+
+/**
+ * pt_install_leaf_entry() - Write a leaf entry to the table
+ * @pts: Table index to change
+ * @oa: Output Address for this leaf
+ * @oasz_lg2: Size in VA/OA for this leaf
+ * @attrs: Attributes to modify the entry
+ *
+ * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
+ * the VA indicated by pts to the given OA.
+ *
+ * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
+ * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
+ *
+ * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
+ * not indicated by pt_possible_sizes() must not be specified.
+ */
+static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+					 unsigned int oasz_lg2,
+					 const struct pt_write_attrs *attrs);
+
+/**
+ * pt_install_table() - Write a table entry to the table
+ * @pts: Table index to change
+ * @table_pa: CPU physical address of the lower table's memory
+ * @attrs: Attributes to modify the table index
+ *
+ * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
+ * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
+ * current entry loaded. The pts is updated with the installed entry.
+ *
+ * This must not be called if pt_can_have_table() == false.
+ *
+ * Returns: true if the table was installed successfully.
+ */
+static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
+				    const struct pt_write_attrs *attrs);
+
+/**
+ * pt_item_oa() - Output Address for this leaf item
+ * @pts: Item to query
+ *
+ * Return the output address for this item. If the item is part of a contiguous
+ * entry it returns the value of the OA for this individual sub item.
+ *
+ * See pt_entry_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
+
+/**
+ * pt_load_entry_raw() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Return the type of entry that was loaded. pts->entry will be filled in with
+ * the entry's content. See pt_load_entry()
+ */
+static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
+
+/**
+ * pt_max_oa_lg2() - Return the maximum OA the table format can hold
+ * @common: Page table to query
+ *
+ * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
+ * OA. This is the absolute maximum address the table can hold. struct pt_common
+ * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
+ */
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common);
+
+/**
+ * pt_num_items_lg2() - Return the number of items in this table level
+ * @pts: The current level
+ *
+ * The number of items in a table level defines the number of bits this level
+ * decodes from the VA. This function is not called for the top level,
+ * so it does not need to compute a special value for the top case. The
+ * result for the top is based on pt_common max_vasz_lg2.
+ *
+ * The value is used as part of determining the table indexes via the
+ * equation::
+ *
+ *   log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
+ */
+static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
+
+/**
+ * pt_pgsz_lg2_to_level() - Return the level that maps the page size
+ * @common: Page table to query
+ * @pgsize_lg2: Log2 page size
+ *
+ * Returns the table level that will map the given page size. The page
+ * size must be part of the pt_possible_sizes() for some level.
+ */
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+						unsigned int pgsize_lg2);
+
+/**
+ * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
+ * @pts: The current level
+ *
+ * Each level has a list of possible output sizes that can be installed as
+ * leaf entries. If pt_can_have_leaf() is false returns zero.
+ *
+ * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
+ * that a non-contiguous single item leaf entry is supported. The following
+ * pt_num_items_lg2() number of bits can be set indicating contiguous entries
+ * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
+ * set, contiguous entries cannot span the entire table.
+ *
+ * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
+ * supported sizes in the entire table.
+ */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
+
+/**
+ * pt_table_item_lg2sz() - Size of a single item entry in this table level
+ * @pts: The current level
+ *
+ * The size of the item specifies how much VA and OA a single item occupies.
+ *
+ * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
+ * entries.
+ */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+/**
+ * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
+ * @pts: The current level
+ *
+ * Return the size of VA decoded by the entire table level.
+ */
+static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
+{
+	if (pts->range->top_level == pts->level)
+		return pts->range->max_vasz_lg2;
+	return min_t(unsigned int, pts->range->common->max_vasz_lg2,
+		     pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
+}
+
+/**
+ * pt_table_pa() - Return the CPU physical address of the table entry
+ * @pts: Entry to query
+ *
+ * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
+ * value passed to pt_install_table().
+ */
+static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
+
+/**
+ * pt_table_ptr() - Return a CPU pointer for a table item
+ * @pts: Entry to query
+ *
+ * Same as pt_table_pa() but returns a CPU pointer.
+ */
+static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
+{
+	return __va(pt_table_pa(pts));
+}
+
+/**
+ * pt_load_entry() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Set the type of entry that was loaded. pts->entry and pts->table_lower
+ * will be filled in with the entry's content.
+ */
+static inline void pt_load_entry(struct pt_state *pts)
+{
+	pts->type = pt_load_entry_raw(pts);
+	if (pts->type == PT_ENTRY_TABLE)
+		pts->table_lower = pt_table_ptr(pts);
+}
+#endif
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
new file mode 100644
index 000000000000..819057de50d8
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included before the format. It contains definitions
+ * that are required to compile the format. The header order is:
+ *  pt_defs.h
+ *  fmt_XX.h
+ *  pt_common.h
+ */
+#ifndef __GENERIC_PT_DEFS_H
+#define __GENERIC_PT_DEFS_H
+
+#include <linux/generic_pt/common.h>
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/bug.h>
+#include <linux/kconfig.h>
+#include "pt_log2.h"
+
+/* Header self-compile default defines */
+#ifndef pt_write_attrs
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+#endif
+
+struct pt_table_p;
+
+enum {
+	PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
+	PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
+	PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
+	PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
+};
+
+/*
+ * The format instantiation can have features wired off or on to optimize the
+ * code gen. Supported features are just a reflection of what the current set of
+ * kernel users want to use.
+ */
+#ifndef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES 0
+#endif
+
+/*
+ * When in debug mode we compile all formats with all features. This allows the
+ * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
+ * FULL_VA.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+enum {
+	PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
+	PT_DEBUG_SUPPORTED_FEATURES =
+		UINT_MAX &
+		~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
+			  BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
+			  BIT(PT_FEAT_SIGN_EXTEND)),
+};
+#undef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
+#endif
+
+#ifndef PT_FORCE_ENABLED_FEATURES
+#define PT_FORCE_ENABLED_FEATURES 0
+#endif
+
+/**
+ * DOC: Generic Page Table Language
+ *
+ * Language used in Generic Page Table
+ *  VA
+ *     The input address to the page table, often the virtual address.
+ *  OA
+ *     The output address from the page table, often the physical address.
+ *  leaf
+ *     An entry that results in an output address.
+ *  start/end
+ *     A half-open range, e.g. [0,0) refers to no VA.
+ *  start/last
+ *     An inclusive closed range, e.g. [0,0] refers to the VA 0
+ *  common
+ *     The generic page table container struct pt_common
+ *  level
+ *     Level 0 is always a table of only leaves with no further table pointers.
+ *     Increasing levels increase the size of the table items. The least
+ *     significant VA bits used to index page tables are used to index the Level
+ *     0 table. The various labels for table levels used by HW descriptions are
+ *     not used.
+ *  top_level
+ *     The inclusive highest level of the table. A two-level table
+ *     has a top level of 1.
+ *  table
+ *     A linear array of translation items for that level.
+ *  index
+ *     The position in a table of an element: item = table[index]
+ *  item
+ *     A single index in a table
+ *  entry
+ *     A single logical element in a table. If contiguous pages are not
+ *     supported then item and entry are the same thing, otherwise entry refers
+ *     to all the items that comprise a single contiguous translation.
+ *  item/entry_size
+ *     The number of bytes of VA the table index translates for.
+ *     If the item is a table entry then the next table covers
+ *     this size. If the entry translates to an output address then the
+ *     full OA is: OA | (VA % entry_size)
+ *  contig_count
+ *     The number of consecutive items fused into a single entry.
+ *     item_size * contig_count is the size of that entry's translation.
+ *  lg2
+ *     Indicates the value is encoded as log2, i.e. 1<<x is the actual value.
+ *     Normally the compiler is fine to optimize divide and mod with log2 values
+ *     automatically when inlining, however if the values are not constant
+ *     expressions it can't. So we do it by hand; we want to avoid 64-bit
+ *     divmod.
+ */
+
+/* Returned by pt_load_entry() and for_each_pt_level_entry() */
+enum pt_entry_type {
+	PT_ENTRY_EMPTY,
+	/* Entry is valid and points to a lower table level */
+	PT_ENTRY_TABLE,
+	/* Entry is valid and returns an output address */
+	PT_ENTRY_OA,
+};
+
+struct pt_range {
+	struct pt_common *common;
+	struct pt_table_p *top_table;
+	pt_vaddr_t va;
+	pt_vaddr_t last_va;
+	u8 top_level;
+	u8 max_vasz_lg2;
+};
+
+/*
+ * Similar to xa_state, this records information about an in-progress parse at a
+ * single level.
+ */
+struct pt_state {
+	struct pt_range *range;
+	struct pt_table_p *table;
+	struct pt_table_p *table_lower;
+	u64 entry;
+	enum pt_entry_type type;
+	unsigned short index;
+	unsigned short end_index;
+	u8 level;
+};
+
+#define pt_cur_table(pts, type) ((type *)((pts)->table))
+
+/*
+ * Try to install a new table pointer. The locking methodology requires this to
+ * be atomic (multiple threads can race to install a pointer). The losing
+ * threads will fail the atomic and return false. They should free any memory
+ * and reparse the table level again.
+ */
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
+{
+	u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+	u64 old_entry = pts->entry;
+	bool ret;
+
+	/*
+	 * Ensure the zero'd table content itself is visible before its PTE can
+	 * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+	 */
+	if (!IS_ENABLED(CONFIG_SMP))
+		dma_wmb();
+	ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
+	if (ret)
+		pts->entry = table_entry;
+	return ret;
+}
+#endif
+
+static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
+{
+	u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+	u32 old_entry = pts->entry;
+	bool ret;
+
+	/*
+	 * Ensure the zero'd table content itself is visible before its PTE can
+	 * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+	 */
+	if (!IS_ENABLED(CONFIG_SMP))
+		dma_wmb();
+	ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
+	if (ret)
+		pts->entry = table_entry;
+	return ret;
+}
+
+#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))
+
+static inline bool pt_feature(const struct pt_common *common,
+			      unsigned int feature_nr)
+{
+	if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
+		return true;
+	if (!PT_SUPPORTED_FEATURE(feature_nr))
+		return false;
+	return common->features & BIT(feature_nr);
+}
+
+static inline bool pts_feature(const struct pt_state *pts,
+			       unsigned int feature_nr)
+{
+	return pt_feature(pts->range->common, feature_nr);
+}
+
+/*
+ * PT_WARN_ON is used for invariants that the kunit should be checking can't
+ * happen.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+#define PT_WARN_ON WARN_ON
+#else
+static inline bool PT_WARN_ON(bool condition)
+{
+	return false;
+}
+#endif
+
+/* These all work on the VA type */
+#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
+#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
+#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
+#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
+#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
+#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
+#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
+#define vaffs(a) ffs_t(pt_vaddr_t, a)
+#define vafls(a) fls_t(pt_vaddr_t, a)
+#define vaffz(a) ffz_t(pt_vaddr_t, a)
+
+/*
+ * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
+ * generate a useful defined result. The non-fva versions will malfunction at
+ * this extreme.
+ */
+static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
+{
+	if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+		return 0;
+	return log2_div_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
+{
+	if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+		return a;
+	return log2_mod_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
+				  unsigned int c_lg2)
+{
+	if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
+		return true;
+	return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
+					 unsigned int b_lg2)
+{
+	if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+		return val;
+	return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
+{
+	if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+		return PT_VADDR_MAX;
+	return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
+}
+
+/* These all work on the OA type */
+#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
+#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
+#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
+#define oaffs(a) ffs_t(pt_oaddr_t, a)
+#define oafls(a) fls_t(pt_oaddr_t, a)
+#define oaffz(a) ffz_t(pt_oaddr_t, a)
+
+static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
+				    unsigned int top_level)
+{
+	return top_level | (uintptr_t)table_mem;
+}
+
+static inline void pt_top_set(struct pt_common *common,
+			      struct pt_table_p *table_mem,
+			      unsigned int top_level)
+{
+	WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
+}
+
+static inline void pt_top_set_level(struct pt_common *common,
+				    unsigned int top_level)
+{
+	pt_top_set(common, NULL, top_level);
+}
+
+static inline unsigned int pt_top_get_level(const struct pt_common *common)
+{
+	return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
+}
+
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+					      pt_oaddr_t oa,
+					      unsigned int oasz_lg2);
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
new file mode 100644
index 000000000000..60d594bbb106
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Default definitions for formats that don't define these functions.
+ */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
+#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+
+#include "pt_defs.h"
+#include <linux/log2.h>
+
+/* Header self-compile default defines */
+#ifndef pt_load_entry_raw
+#include "fmt/amdv1.h"
+#endif
+
+/*
+ * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
+ * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
+ */
+#ifndef pt_table_item_lg2sz
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
+{
+	return PT_GRANULE_LG2SZ +
+	       (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#endif
+
+#ifndef pt_pgsz_lg2_to_level
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+						unsigned int pgsize_lg2)
+{
+	return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
+	       (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
+}
+#endif
+
+/*
+ * If not supplied by the format then contiguous pages are not supported.
+ *
+ * If contiguous pages are supported then the format must also provide
+ * pt_contig_count_lg2() if it supports a single contiguous size per level,
+ * or pt_possible_sizes() if it supports multiple sizes per level.
+ */
+#ifndef pt_entry_num_contig_lg2
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+	return ilog2(1);
+}
+
+/*
+ * Return the number of contiguous OA items forming an entry at this table level
+ */
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
+{
+	return ilog2(1);
+}
+#endif
+
+/* If not supplied by the format then dirty tracking is not supported */
+#ifndef pt_entry_is_write_dirty
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+	return false;
+}
+
+static inline void pt_entry_make_write_clean(struct pt_state *pts)
+{
+}
+
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+	return false;
+}
+#else
+/* If not supplied then dirty tracking is always enabled */
+#ifndef pt_dirty_supported
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+	return true;
+}
+#endif
+#endif
+
+#ifndef pt_entry_make_write_dirty
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
+{
+	return false;
+}
+#endif
+
+/*
+ * Format supplies either:
+ *   pt_entry_oa - OA is at the start of a contiguous entry
+ * or
+ *   pt_item_oa  - OA is adjusted for every item in a contiguous entry
+ *
+ * Build the missing one
+ *
+ * The internal helper _pt_entry_oa_fast() allows generating
+ * an efficient pt_entry_oa_exact(), it doesn't care which
+ * option is selected.
+ */
+#ifdef pt_entry_oa
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
+{
+	return pt_entry_oa(pts) |
+	       log2_mul(pts->index, pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_entry_oa
+#endif
+
+#ifdef pt_item_oa
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
+{
+	return log2_set_mod(pt_item_oa(pts), 0,
+			    pt_entry_num_contig_lg2(pts) +
+				    pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_item_oa
+#endif
+
+/*
+ * If not supplied by the format then use the constant
+ * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */
+#ifndef pt_max_oa_lg2
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common)
+{
+	return PT_MAX_OUTPUT_ADDRESS_LG2;
+}
+#endif
+
+#ifndef pt_has_system_page_size
+static inline bool pt_has_system_page_size(const struct pt_common *common)
+{
+	return PT_GRANULE_LG2SZ == PAGE_SHIFT;
+}
+#endif
+
+/*
+ * If not supplied by the format then assume only one contiguous size determined
+ * by pt_contig_count_lg2()
+ */
+#ifndef pt_possible_sizes
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);
+
+/* Return a bitmap of possible leaf page sizes at this level */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+	if (!pt_can_have_leaf(pts))
+		return 0;
+	return log2_to_int(isz_lg2) |
+	       log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
+}
+#endif
+
+/* If not supplied by the format then use 0. */
+#ifndef pt_full_va_prefix
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
+{
+	return 0;
+}
+#endif
+
+/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
+#ifndef pt_clear_entries
+static inline void pt_clear_entries64(struct pt_state *pts,
+				      unsigned int num_contig_lg2)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+	PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+	for (; tablep != end; tablep++)
+		WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries32(struct pt_state *pts,
+				      unsigned int num_contig_lg2)
+{
+	u32 *tablep = pt_cur_table(pts, u32) + pts->index;
+	u32 *end = tablep + log2_to_int(num_contig_lg2);
+
+	PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+	for (; tablep != end; tablep++)
+		WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries(struct pt_state *pts,
+				    unsigned int num_contig_lg2)
+{
+	if (PT_ITEM_WORD_SIZE == sizeof(u32))
+		pt_clear_entries32(pts, num_contig_lg2);
+	else
+		pt_clear_entries64(pts, num_contig_lg2);
+}
+#define pt_clear_entries pt_clear_entries
+#endif
+
+/*
+ * Format can call this in its pt_install_leaf_entry() to check the arguments
+ * are all aligned correctly.
+ */
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+					      pt_oaddr_t oa,
+					      unsigned int oasz_lg2)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+	if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
+		return false;
+
+#ifdef pt_possible_sizes
+	if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
+		       oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
+		return false;
+#else
+	if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
+		       oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
+		return false;
+#endif
+
+	if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
+		return false;
+	return true;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
new file mode 100644
index 000000000000..87f4a26c1a41
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Iterators for Generic Page Table
+ */
+#ifndef __GENERIC_PT_PT_ITER_H
+#define __GENERIC_PT_PT_ITER_H
+
+#include "pt_common.h"
+
+#include <linux/errno.h>
+
+/*
+ * Use to mangle symbols so that backtraces and the symbol table are
+ * understandable. Any non-inlined function should get mangled like this.
+ */
+#define NS(fn) CONCATENATE(PTPFX, fn)
+
+/**
+ * pt_check_range() - Validate the range can be iterated
+ * @range: Range to validate
+ *
+ * Check that VA and last_va fall within the permitted range of VAs. If the
+ * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension
+ * is correct.
+ */
+static inline int pt_check_range(struct pt_range *range)
+{
+	pt_vaddr_t prefix;
+
+	PT_WARN_ON(!range->max_vasz_lg2);
+
+	if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
+		PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
+		prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
+				 PT_VADDR_MAX :
+				 0;
+	} else {
+		prefix = pt_full_va_prefix(range->common);
+	}
+
+	if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
+	    !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
+		return -ERANGE;
+	return 0;
+}
+
+/**
+ * pt_index_to_va() - Update range->va to the current pts->index
+ * @pts: Iteration State
+ *
+ * Adjust range->va to match the current index. This is done in a lazy manner
+ * since computing the VA takes several instructions and is rarely required.
+ */
+static inline void pt_index_to_va(struct pt_state *pts)
+{
+	pt_vaddr_t lower_va;
+
+	lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
+	pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
+					 pt_table_oa_lg2sz(pts));
+}
+
+/*
+ * Add index_count_lg2 number of entries to pts's VA and index. The VA will be
+ * adjusted to the end of the contiguous block if it is currently in the middle.
+ */
+static inline void _pt_advance(struct pt_state *pts,
+			       unsigned int index_count_lg2)
+{
+	pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+				  index_count_lg2);
+}
+
+/**
+ * pt_entry_fully_covered() - Check if the item or entry is entirely contained
+ *                            within pts->range
+ * @pts: Iteration State
+ * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
+ *            pt_entry_oa_lg2sz()
+ *
+ * Returns: true if the item is fully enclosed by the pts->range.
+ */
+static inline bool pt_entry_fully_covered(const struct pt_state *pts,
+					  unsigned int oasz_lg2)
+{
+	struct pt_range *range = pts->range;
+
+	/* Range begins at the start of the entry */
+	if (log2_mod(pts->range->va, oasz_lg2))
+		return false;
+
+	/* Range ends past the end of the entry */
+	if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
+		return true;
+
+	/* Range ends at the end of the entry */
+	return log2_mod_eq_max(range->last_va, oasz_lg2);
+}
+
+/**
+ * pt_range_to_index() - Starting index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the starting index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_index(const struct pt_state *pts)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+	PT_WARN_ON(pts->level > pts->range->top_level);
+	if (pts->range->top_level == pts->level)
+		return log2_div(fvalog2_mod(pts->range->va,
+					    pts->range->max_vasz_lg2),
+				isz_lg2);
+	return log2_mod(log2_div(pts->range->va, isz_lg2),
+			pt_num_items_lg2(pts));
+}
+
+/**
+ * pt_range_to_end_index() - Ending index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: one past the last index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+	struct pt_range *range = pts->range;
+	unsigned int num_entries_lg2;
+
+	if (range->va == range->last_va)
+		return pts->index + 1;
+
+	if (pts->range->top_level == pts->level)
+		return log2_div(fvalog2_mod(pts->range->last_va,
+					    pts->range->max_vasz_lg2),
+				isz_lg2) +
+		       1;
+
+	num_entries_lg2 = pt_num_items_lg2(pts);
+
+	/* last_va falls within this table */
+	if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
+		return log2_mod(log2_div(pts->range->last_va, isz_lg2),
+				num_entries_lg2) +
+		       1;
+
+	return log2_to_int(num_entries_lg2);
+}
+
+static inline void _pt_iter_first(struct pt_state *pts)
+{
+	pts->index = pt_range_to_index(pts);
+	pts->end_index = pt_range_to_end_index(pts);
+	PT_WARN_ON(pts->index > pts->end_index);
+}
+
+static inline bool _pt_iter_load(struct pt_state *pts)
+{
+	if (pts->index >= pts->end_index)
+		return false;
+	pt_load_entry(pts);
+	return true;
+}
+
+/**
+ * pt_next_entry() - Advance pts to the next entry
+ * @pts: Iteration State
+ *
+ * Update pts to go to the next index at this level. If pts is pointing at a
+ * contiguous entry then the index may advance by more than one.
+ */
+static inline void pt_next_entry(struct pt_state *pts)
+{
+	if (pts->type == PT_ENTRY_OA &&
+	    !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
+		_pt_advance(pts, pt_entry_num_contig_lg2(pts));
+	else
+		pts->index++;
+	pt_index_to_va(pts);
+}
+
+/**
+ * for_each_pt_level_entry() - For loop wrapper over entries in the range
+ * @pts: Iteration State
+ *
+ * This is the basic iteration primitive. It iterates over all the entries in
+ * pts->range that fall within the pts's current table level. Each step does
+ * pt_load_entry(pts).
+ */
+#define for_each_pt_level_entry(pts) \
+	for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
+
+/**
+ * pt_load_single_entry() - Version of pt_load_entry() usable within a walker
+ * @pts: Iteration State
+ *
+ * Alternative to for_each_pt_level_entry() if the walker function uses only a
+ * single entry.
+ */
+static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
+{
+	pts->index = pt_range_to_index(pts);
+	pt_load_entry(pts);
+	return pts->type;
+}
+
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
+						     uintptr_t top_of_table)
+{
+	struct pt_range range = {
+		.common = common,
+		.top_table =
+			(struct pt_table_p *)(top_of_table &
+					      ~(uintptr_t)PT_TOP_LEVEL_MASK),
+		.top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
+	};
+	struct pt_state pts = { .range = &range, .level = range.top_level };
+	unsigned int max_vasz_lg2;
+
+	max_vasz_lg2 = common->max_vasz_lg2;
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+	    pts.level != PT_MAX_TOP_LEVEL)
+		max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
+				     pt_num_items_lg2(&pts) +
+					     pt_table_item_lg2sz(&pts));
+
+	/*
+	 * The top range will default to the lower region only with sign extend.
+	 */
+	range.max_vasz_lg2 = max_vasz_lg2;
+	if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
+		max_vasz_lg2--;
+
+	range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
+	range.last_va =
+		fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
+	return range;
+}
+
+/**
+ * pt_top_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static __always_inline struct pt_range pt_top_range(struct pt_common *common)
+{
+	/*
+	 * The top pointer can change without locking. We capture the value and
+	 * its level here and are safe to walk it so long as both values are
+	 * captured without tearing.
+	 */
+	return _pt_top_range(common, READ_ONCE(common->top_of_table));
+}
+
+/**
+ * pt_all_range() - Return a range that spans the entire page table
+ * @common: Table
+ *
+ * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND
+ * is supported range->va and range->last_va will be incorrect during the
+ * iteration and must not be accessed.
+ */
+static inline struct pt_range pt_all_range(struct pt_common *common)
+{
+	struct pt_range range = pt_top_range(common);
+
+	if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+		return range;
+
+	/*
+	 * Pretend the table is linear from 0 without a sign extension. This
+	 * generates the correct indexes for iteration.
+	 */
+	range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
+	return range;
+}
+
+/**
+ * pt_upper_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static inline struct pt_range pt_upper_range(struct pt_common *common)
+{
+	struct pt_range range = pt_top_range(common);
+
+	if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+		return range;
+
+	range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
+	range.last_va = PT_VADDR_MAX;
+	return range;
+}
+
+/**
+ * pt_make_range() - Return a range that spans part of the table
+ * @common: Table
+ * @va: Start address
+ * @last_va: Last address
+ *
+ * The caller must validate the range with pt_check_range() before using it.
+ */
+static __always_inline struct pt_range
+pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
+{
+	struct pt_range range =
+		_pt_top_range(common, READ_ONCE(common->top_of_table));
+
+	range.va = va;
+	range.last_va = last_va;
+
+	return range;
+}
+
+/*
+ * Span a slice of the table starting at a lower table level from an active
+ * walk.
+ */
+static __always_inline struct pt_range
+pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
+		    pt_vaddr_t last_va)
+{
+	struct pt_range range = *parent;
+
+	range.va = va;
+	range.last_va = last_va;
+
+	PT_WARN_ON(last_va < va);
+	PT_WARN_ON(pt_check_range(&range));
+
+	return range;
+}
+
+/**
+ * pt_init() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ * @level: Table level for the state
+ * @table: Pointer to the table memory at level
+ *
+ * Helper to initialize the on-stack pt_state from walker arguments.
+ */
+static __always_inline struct pt_state
+pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = {
+		.range = range,
+		.table = table,
+		.level = level,
+	};
+	return pts;
+}
+
+/**
+ * pt_init_top() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ *
+ * The pt_state points to the top most level.
+ */
+static __always_inline struct pt_state pt_init_top(struct pt_range *range)
+{
+	return pt_init(range, range->top_level, range->top_table);
+}
+
+typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
+			     unsigned int level, struct pt_table_p *table);
+
+/**
+ * pt_descend() - Recursively invoke the walker for the lower level
+ * @pts: Iteration State
+ * @arg: Value to pass to the function
+ * @fn: Walker function to call
+ *
+ * pts must point to a table item. Invoke fn as a walker on the table
+ * pts points to.
+ */
+static __always_inline int pt_descend(struct pt_state *pts, void *arg,
+				      pt_level_fn_t fn)
+{
+	int ret;
+
+	if (PT_WARN_ON(!pts->table_lower))
+		return -EINVAL;
+
+	ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
+	return ret;
+}
+
+/**
+ * pt_walk_range() - Walk over a VA range
+ * @range: Range pointer
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * Walk over a VA range. The caller should have done a validity check, at
+ * least calling pt_check_range(), when building range. The walk will
+ * start at the top most table.
+ */
+static __always_inline int pt_walk_range(struct pt_range *range,
+					 pt_level_fn_t fn, void *arg)
+{
+	return fn(range, arg, range->top_level, range->top_table);
+}
+
+/*
+ * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
+ *                     level
+ * @pts: Iteration State
+ * @va: Start address
+ * @last_va: Last address
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and walk over a slice of
+ * the lower table. The caller must ensure that va/last_va are within the table
+ * item. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int pt_walk_descend(const struct pt_state *pts,
+					   pt_vaddr_t va, pt_vaddr_t last_va,
+					   pt_level_fn_t fn, void *arg)
+{
+	struct pt_range range = pt_make_child_range(pts->range, va, last_va);
+
+	if (PT_WARN_ON(!pt_can_have_table(pts)) ||
+	    PT_WARN_ON(!pts->table_lower))
+		return -EINVAL;
+
+	return fn(&range, arg, pts->level - 1, pts->table_lower);
+}
+
+/*
+ * pt_walk_descend_all() - Recursively invoke the walker for a table item
+ * @parent_pts: Iteration State
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and walk over the entire
+ * lower table. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int
+pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
+		    void *arg)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
+
+	return pt_walk_descend(parent_pts,
+			       log2_set_mod(parent_pts->range->va, 0, isz_lg2),
+			       log2_set_mod_max(parent_pts->range->va, isz_lg2),
+			       fn, arg);
+}
+
+/**
+ * pt_range_slice() - Return a range that spans indexes
+ * @pts: Iteration State
+ * @start_index: Starting index within pts
+ * @end_index: Ending index within pts
+ *
+ * Create a range that spans an index range of the current table level
+ * pt_state points at.
+ */
+static inline struct pt_range pt_range_slice(const struct pt_state *pts,
+					     unsigned int start_index,
+					     unsigned int end_index)
+{
+	unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+	pt_vaddr_t last_va;
+	pt_vaddr_t va;
+
+	va = fvalog2_set_mod(pts->range->va,
+			     log2_mul(start_index, pt_table_item_lg2sz(pts)),
+			     table_lg2sz);
+	last_va = fvalog2_set_mod(
+		pts->range->va,
+		log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
+	return pt_make_child_range(pts->range, va, last_va);
+}
+
+/**
+ * pt_top_memsize_lg2() - Log2 of the top table's allocation size
+ * @common: Table
+ * @top_of_table: Top of table value from _pt_top_set()
+ *
+ * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this
+ * will compute the top size assuming the table will grow.
+ */
+static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
+					      uintptr_t top_of_table)
+{
+	struct pt_range range = _pt_top_range(common, top_of_table);
+	struct pt_state pts = pt_init_top(&range);
+	unsigned int num_items_lg2;
+
+	num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
+	if (range.top_level != PT_MAX_TOP_LEVEL &&
+	    pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+		num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
+
+	/* Round up the allocation size to the minimum alignment */
+	return max(ffs_t(u64, PT_TOP_PHYS_MASK),
+		   num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
+}
+
+/**
+ * pt_compute_best_pgsize() - Determine the best page size for leaf entries
+ * @pgsz_bitmap: Permitted page sizes
+ * @va: Starting virtual address for the leaf entry
+ * @last_va: Last virtual address for the leaf entry, sets the max page size
+ * @oa: Starting output address for the leaf entry
+ *
+ * Compute the largest page size for va, last_va, and oa together and return it
+ * in lg2. The largest page size depends on the format's supported page sizes at
+ * this level, and the relative alignment of the VA and OA addresses. 0 means
+ * the OA cannot be stored with the provided pgsz_bitmap.
+ */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
+						  pt_vaddr_t va,
+						  pt_vaddr_t last_va,
+						  pt_oaddr_t oa)
+{
+	unsigned int best_pgsz_lg2;
+	unsigned int pgsz_lg2;
+	pt_vaddr_t len = last_va - va + 1;
+	pt_vaddr_t mask;
+
+	if (PT_WARN_ON(va >= last_va))
+		return 0;
+
+	/*
+	 * Given a VA/OA pair the best page size is the largest page size
+	 * where:
+	 *
+	 * 1) VA and OA start at the page. Bitwise this is the count of least
+	 *    significant 0 bits.
+	 *    This also implies that last_va/oa has the same prefix as va/oa.
+	 */
+	mask = va | oa;
+
+	/*
+	 * 2) The page size is not larger than the last_va (length). Since page
+	 *    sizes are always power of two this can't be larger than the
+	 *    largest power of two factor of the length.
+	 */
+	mask |= log2_to_int(vafls(len) - 1);
+
+	best_pgsz_lg2 = vaffs(mask);
+
+	/* Choose the highest bit <= best_pgsz_lg2 */
+	if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
+		pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
+
+	pgsz_lg2 = vafls(pgsz_bitmap);
+	if (!pgsz_lg2)
+		return 0;
+
+	pgsz_lg2--;
+
+	PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
+	PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
+	PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
+	PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+	PT_WARN_ON(
+		!oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+	return pgsz_lg2;
+}
+
+#define _PT_MAKE_CALL_LEVEL(fn)                                          \
+	static __always_inline int fn(struct pt_range *range, void *arg, \
+				      unsigned int level,                \
+				      struct pt_table_p *table)          \
+	{                                                                \
+		static_assert(PT_MAX_TOP_LEVEL <= 5);                    \
+		if (level == 0)                                          \
+			return CONCATENATE(fn, 0)(range, arg, 0, table); \
+		if (level == 1 || PT_MAX_TOP_LEVEL == 1)                 \
+			return CONCATENATE(fn, 1)(range, arg, 1, table); \
+		if (level == 2 || PT_MAX_TOP_LEVEL == 2)                 \
+			return CONCATENATE(fn, 2)(range, arg, 2, table); \
+		if (level == 3 || PT_MAX_TOP_LEVEL == 3)                 \
+			return CONCATENATE(fn, 3)(range, arg, 3, table); \
+		if (level == 4 || PT_MAX_TOP_LEVEL == 4)                 \
+			return CONCATENATE(fn, 4)(range, arg, 4, table); \
+		return CONCATENATE(fn, 5)(range, arg, 5, table);         \
+	}
+
+static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
+					 unsigned int unused_level,
+					 struct pt_table_p *table)
+{
+	static_assert(PT_MAX_TOP_LEVEL <= 5);
+	return -EPROTOTYPE;
+}
+
+#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn)            \
+	static inline int fn(struct pt_range *range, void *arg,     \
+			     unsigned int unused_level,             \
+			     struct pt_table_p *table)              \
+	{                                                           \
+		return do_fn(range, arg, level, table, descend_fn); \
+	}
+
+/**
+ * PT_MAKE_LEVELS() - Build an unwound walker
+ * @fn: Name of the walker function
+ * @do_fn: Function to call at each level
+ *
+ * This builds a function call tree that can be fully inlined.
+ * The caller must provide a function body in an __always_inline function::
+ *
+ *  static __always_inline int do(struct pt_range *range, void *arg,
+ *         unsigned int level, struct pt_table_p *table,
+ *         pt_level_fn_t descend_fn)
+ *
+ * An inline function will be created for each table level that calls do_fn with
+ * a compile time constant for level and a pointer to the next lower function.
+ * This generates an optimally inlined walk where each of the functions sees a
+ * constant level and can codegen the exact constants/etc for that level.
+ *
+ * Note this can produce a lot of code!
+ */
+#define PT_MAKE_LEVELS(fn, do_fn)                                             \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err,     \
+			   do_fn);                                            \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
+	__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
+	_PT_MAKE_CALL_LEVEL(fn)
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h
new file mode 100644
index 000000000000..6dbbed119238
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_log2.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Helper macros for working with log2 values
+ *
+ */
+#ifndef __GENERIC_PT_LOG2_H
+#define __GENERIC_PT_LOG2_H
+#include <linux/bitops.h>
+#include <linux/limits.h>
+
+/* Compute a */
+#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
+static_assert(log2_to_int_t(unsigned int, 0) == 1);
+
+/* Compute a - 1 (aka all low bits set) */
+#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+
+/* Compute a / b; a is parenthesized so a compound expression is cast whole */
+#define log2_div_t(type, a, b_lg2) ((type)(((type)(a)) >> (b_lg2)))
+static_assert(log2_div_t(unsigned int, 4, 2) == 1);
+
+/*
+ * Compute:
+ *   a / c == b / c
+ * aka the high bits are equal
+ */
+#define log2_div_eq_t(type, a, b, c_lg2) \
+	(log2_div_t(type, (a) ^ (b), c_lg2) == 0)
+static_assert(log2_div_eq_t(unsigned int, 1, 1, 2));
+
+/* Compute a % b; a is parenthesized so a compound expression is cast whole */
+#define log2_mod_t(type, a, b_lg2) \
+	((type)(((type)(a)) & log2_to_max_int_t(type, b_lg2)))
+static_assert(log2_mod_t(unsigned int, 1, 2) == 1);
+
+/*
+ * Compute:
+ *   a % b == b - 1
+ * aka the low bits are all 1s
+ */
+#define log2_mod_eq_max_t(type, a, b_lg2) \
+	(log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2))
+static_assert(log2_mod_eq_max_t(unsigned int, 3, 2));
+
+/*
+ * Return a value such that:
+ *    a / b == ret / b
+ *    ret % b == val
+ * aka set the low bits to val. val must be < b
+ */
+#define log2_set_mod_t(type, a, val, b_lg2) \
+	((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val)))
+static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1);
+
+/* Return a value such that:
+ *    a / b == ret / b
+ *    ret % b == b - 1
+ * aka set the low bits to all 1s
+ */
+#define log2_set_mod_max_t(type, a, b_lg2) \
+	(((type)(a)) | log2_to_max_int_t(type, b_lg2))
+static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3);
+
+/* Compute a * b; a is parenthesized so a compound expression is cast whole */
+#define log2_mul_t(type, a, b_lg2) ((type)(((type)(a)) << (b_lg2)))
+static_assert(log2_mul_t(unsigned int, 2, 2) == 8);
+
+#define _dispatch_sz(type, fn, a) /* picks fn##32/fn##64 by sizeof(type) */ \
+	(sizeof(type) == 4 ? fn##32((u32)(a)) : fn##64(a))
+
+/*
+ * Return the highest value such that:
+ *    fls_t(u32, 0) == 0
+ *    fls_t(u32, 1) == 1
+ *    a >= log2_to_int(ret - 1)
+ * aka find last set bit
+ */
+static inline unsigned int fls32(u32 a)
+{
+	return fls(a);
+}
+#define fls_t(type, a) _dispatch_sz(type, fls, a)
+
+/*
+ * Return the highest value such that:
+ *    ffs_t(u32, 0) == UNDEFINED
+ *    ffs_t(u32, 1) == 0
+ *    log2_mod(a, ret) == 0
+ * aka find first set bit
+ */
+static inline unsigned int __ffs32(u32 a)
+{
+	return __ffs(a);
+}
+#define ffs_t(type, a) _dispatch_sz(type, __ffs, a)
+
+/*
+ * Return the highest value such that:
+ *    ffz_t(u32, U32_MAX) == UNDEFINED
+ *    ffz_t(u32, 0) == 0
+ *    ffz_t(u32, 1) == 1
+ *    log2_mod(a, ret) == log2_to_max_int(ret)
+ * aka find first zero bit
+ */
+static inline unsigned int ffz32(u32 a)
+{
+	return ffz(a);
+}
+static inline unsigned int ffz64(u64 a)
+{
+	if (sizeof(u64) == sizeof(unsigned long))
+		return ffz(a);
+
+	if ((u32)a == U32_MAX)
+		return ffz32(a >> 32) + 32;
+	return ffz32(a);
+}
+#define ffz_t(type, a) _dispatch_sz(type, ffz, a)
+
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
new file mode 100644
index 000000000000..e69a75511313
--- /dev/null
+++ b/include/linux/generic_pt/common.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_COMMON_H
+#define __GENERIC_PT_COMMON_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+#include <linux/bits.h>
+
+/**
+ * DOC: Generic Radix Page Table
+ *
+ * Generic Radix Page Table is a set of functions and helpers to efficiently
+ * parse radix style page tables typically seen in HW implementations. The
+ * interface is built to deliver similar code generation as the mm's pte/pmd/etc
+ * system by fully inlining the exact code required to handle each table level.
+ *
+ * Like the mm subsystem each format contributes its parsing implementation
+ * under common names and the common code implements the required algorithms.
+ *
+ * The system is divided into three logical levels:
+ *
+ *  - The page table format and its manipulation functions
+ *  - Generic helpers to give a consistent API regardless of underlying format
+ *  - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM)
+ *
+ * Multiple implementations are supported. The intention is to have the generic
+ * format code be re-usable for whatever specialized implementation is required.
+ * The generic code is solely about the format of the radix tree; it does not
+ * include memory allocation or higher level decisions that are left for the
+ * implementation.
+ *
+ * The generic framework supports a superset of functions across many HW
+ * implementations:
+ *
+ *  - Entries comprised of contiguous blocks of IO PTEs for larger page sizes
+ *  - Multi-level tables, up to 6 levels. Runtime selected top level
+ *  - Runtime variable table level size (ARM's concatenated tables)
+ *  - Expandable top level allowing dynamic sizing of table levels
+ *  - Optional leaf entries at any level
+ *  - 32-bit/64-bit virtual and output addresses, using every address bit
+ *  - Dirty tracking
+ *  - Sign extended addressing
+ */
+
+/**
+ * struct pt_common - struct for all page table implementations
+ */
+struct pt_common {
+	/**
+	 * @top_of_table: Encodes the table top pointer and the top level in a
+	 * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
+	 * bits of the aligned table pointer are used for the level.
+	 */
+	uintptr_t top_of_table;
+	/**
+	 * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
+	 * must be zero. This may be less than what the page table format
+	 * supports, but must not be more.
+	 */
+	u8 max_oasz_lg2;
+	/**
+	 * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
+	 * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
+	 * what the page table format supports, but must not be more. When
+	 * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability.
+	 */
+	u8 max_vasz_lg2;
+	/**
+	 * @features: Bitmap of `enum pt_features`
+	 */
+	unsigned int features;
+};
+
+/* Encoding parameters for top_of_table */
+enum {
+	PT_TOP_LEVEL_BITS = 3,
+	PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
+};
+
+/**
+ * enum pt_features - Features turned on in the table. Each symbol is a bit
+ * position.
+ */
+enum pt_features {
+	/**
+	 * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
+	 * PT_VADDR_MAX.
+	 */
+	PT_FEAT_FULL_VA,
+	/**
+	 * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
+	 * dynamically during map. This requires HW support for atomically
+	 * setting both the table top pointer and the starting table level.
+	 */
+	PT_FEAT_DYNAMIC_TOP,
+	/**
+	 * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
+	 * extends up to the full pt_vaddr_t. This divides the page table into
+	 * three VA ranges::
+	 *
+	 *   0         -> 2^N - 1             Lower
+	 *   2^N       -> (MAX - 2^N - 1)     Non-Canonical
+	 *   MAX - 2^N -> MAX                 Upper
+	 *
+	 * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
+	 * upper bits that don't fall within the translation are just validated.
+	 *
+	 * If not set there is no sign extension and valid VA goes from 0 to 2^N
+	 * - 1.
+	 */
+	PT_FEAT_SIGN_EXTEND,
+	/**
+	 * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
+	 * ranges which will clean out any walk cache or any IOPTE fully
+	 * contained by the range. The optimization objective is to minimize the
+	 * number of flushes even if ranges include IOVA gaps that do not need
+	 * to be flushed.
+	 */
+	PT_FEAT_FLUSH_RANGE,
+	/**
+	 * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
+	 * the optimization objective is to only flush IOVA that has been
+	 * changed. This mode is suitable for cases like hypervisor shadowing
+	 * where flushing unchanged ranges may cause the hypervisor to reparse
+	 * significant amount of page table.
+	 */
+	PT_FEAT_FLUSH_RANGE_NO_GAPS,
+	/* private: */
+	PT_FEAT_FMT_START,
+};
+
+#endif
-- 
cgit v1.2.3


From cdb39d9185795b744dab4d4d782f2fe3f5eca10c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:01 -0400
Subject: iommupt: Add the basic structure of the iommu implementation

The existing IOMMU page table implementations duplicate all of the working
algorithms for each format. By using the generic page table API a single C
version of the IOMMU algorithms can be created and re-used for all of the
different formats used in the drivers. The implementation will provide a
single C version of the iommu domain operations: iova_to_phys, map, unmap,
and read_and_clear_dirty.

Further, adding new algorithms and techniques becomes easy to do across
the entire fleet of drivers and formats.

The C functions are drop in compatible with the existing iommu_domain_ops
using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation
compilation unit will produce exported symbols following the pattern
pt_iommu_FMT_map_pages() which the macro directly maps to the
iommu_domain_ops members. This avoids the additional function pointer
indirection like io-pgtable has.

The top level struct used by the drivers is pt_iommu_table_FMT. It
contains the other structs to allow container_of() to move between the
driver, iommu page table, generic page table, and generic format layers.

   struct pt_iommu_table_amdv1 {
       struct pt_iommu {
	      struct iommu_domain domain;
       } iommu;
       struct pt_amdv1 {
	      struct pt_common common;
       } amdpt;
   };

The driver is expected to union the pt_iommu_table_FMT with its own
existing domain struct:

   struct driver_domain {
       union {
	       struct iommu_domain domain;
	       struct pt_iommu_table_amdv1 amdv1;
       };
   };
   PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1.iommu, domain);

To create an alias to avoid renaming 'domain' in a lot of driver code.

This allows all the layers to access all the necessary functions to
implement their different roles with no change to any of the existing
iommu core code.

Implement the basic starting point: pt_iommu_init(), get_info() and
deinit().

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/Kconfig              |  13 ++
 drivers/iommu/generic_pt/fmt/iommu_template.h |  39 ++++
 drivers/iommu/generic_pt/iommu_pt.h           | 259 ++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h              | 150 +++++++++++++++
 4 files changed, 461 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
 create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
 create mode 100644 include/linux/generic_pt/iommu.h

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index fb0f431ddba0..a81dfdd72ca0 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -17,4 +17,17 @@ config DEBUG_GENERIC_PT
 	  kernels.
 
 	  The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+	tristate "IOMMU Page Tables"
+	select IOMMU_API
+	depends on IOMMU_SUPPORT
+	depends on GENERIC_PT
+	help
+	  Generic library for building IOMMU page tables
+
+	  IOMMU_PT provides an implementation of the page table operations
+	  related to struct iommu_domain using GENERIC_PT. It provides a single
+	  implementation of the page table operations that can be shared by
+	  multiple drivers.
 endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 000000000000..5b631bc07cbc
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ *  #define PT_FMT <name>
+ *  #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ *  #define PT_FORCE_ENABLED_FEATURES ..
+ *  #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+	CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#include "../iommu_pt.h"
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 000000000000..564f2d3a6e11
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, over all the page table formats to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+struct pt_iommu_collect_args {
+	struct iommu_pages_list free_list;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+			    unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_collect_args *collect = arg;
+	int ret;
+
+	if (!pt_can_have_table(&pts))
+		return 0;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			iommu_pages_list_add(&collect->free_list, pts.table_lower);
+			ret = pt_descend(&pts, arg, __collect_tables);
+			if (ret)
+				return ret;
+			continue;
+		}
+	}
+	return 0;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+						 uintptr_t top_of_table,
+						 gfp_t gfp)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(common);
+
+	/*
+	 * Top doesn't need the free list or otherwise, so it technically
+	 * doesn't need to use iommu pages. Use the API anyhow as the top is
+	 * usually not smaller than PAGE_SIZE to keep things simple.
+	 */
+	return iommu_alloc_pages_node_sz(
+		iommu_table->nid, gfp,
+		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
+}
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+			 struct pt_iommu_info *info)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range range = pt_top_range(common);
+	struct pt_state pts = pt_init_top(&range);
+	pt_vaddr_t pgsize_bitmap = 0;
+
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+		for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+		     pts.level++) {
+			if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+				break;
+			pgsize_bitmap |= pt_possible_sizes(&pts);
+		}
+	} else {
+		for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+			pgsize_bitmap |= pt_possible_sizes(&pts);
+	}
+
+	/* Hide page sizes larger than the maximum OA */
+	info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range range = pt_all_range(common);
+	struct pt_iommu_collect_args collect = {
+		.free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+	};
+
+	iommu_pages_list_add(&collect.free_list, range.top_table);
+	pt_walk_range(&range, __collect_tables, &collect);
+
+	/*
+	 * The driver has to already have fenced the HW access to the page table
+	 * and invalidated any caching referring to this memory.
+	 */
+	iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+	.get_info = NS(get_info),
+	.deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+	struct pt_range top_range = pt_top_range(common);
+
+	if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+		return -EINVAL;
+
+	if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+	    common->max_vasz_lg2 == top_range.max_vasz_lg2)
+		common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+	if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+		common->features |= BIT(PT_FEAT_FULL_VA);
+
+	/* Requested features must match features compiled into this format */
+	if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+	    (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+	     (common->features & PT_FORCE_ENABLED_FEATURES) !=
+		     PT_FORCE_ENABLED_FEATURES))
+		return -EOPNOTSUPP;
+
+	if (common->max_oasz_lg2 == 0)
+		common->max_oasz_lg2 = pt_max_oa_lg2(common);
+	else
+		common->max_oasz_lg2 = min(common->max_oasz_lg2,
+					   pt_max_oa_lg2(common));
+	return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+				struct iommu_domain *domain)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_iommu_info info;
+	struct pt_range range;
+
+	NS(get_info)(iommu_table, &info);
+
+	domain->type = __IOMMU_DOMAIN_PAGING;
+	domain->pgsize_bitmap = info.pgsize_bitmap;
+
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+		range = _pt_top_range(common,
+				      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+	else
+		range = pt_top_range(common);
+
+	/* A 64-bit high address space table on a 32-bit system cannot work. */
+	domain->geometry.aperture_start = (unsigned long)range.va;
+	if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
+		return -EOVERFLOW;
+
+	/*
+	 * The aperture is limited to what the API can do after considering all
+	 * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+	 * to store a VA. Set the aperture to something that is valid for all
+	 * cases. Saturate instead of truncate the end if the types are smaller
+	 * than the top range. aperture_end should be called aperture_last.
+	 */
+	domain->geometry.aperture_end = (unsigned long)range.last_va;
+	if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+		domain->geometry.aperture_end = ULONG_MAX;
+		domain->pgsize_bitmap &= ULONG_MAX;
+	}
+	domain->geometry.force_aperture = true;
+
+	return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_iommu cfg = *iommu_table;
+
+	static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+	memset_after(fmt_table, 0, iommu.domain);
+
+	/* The caller can initialize some of these values */
+	iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+		  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_table_p *table_mem;
+	int ret;
+
+	if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+	    !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+		return -EINVAL;
+
+	pt_iommu_zero(fmt_table);
+	common->features = cfg->common.features;
+	common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+	common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+	ret = pt_iommu_fmt_init(fmt_table, cfg);
+	if (ret)
+		return ret;
+
+	if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
+		return -EINVAL;
+
+	ret = pt_init_common(common);
+	if (ret)
+		return ret;
+
+	if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+	    (pt_feature(common, PT_FEAT_FULL_VA) ||
+	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+		return -EINVAL;
+
+	ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+	if (ret)
+		return ret;
+
+	table_mem = table_alloc_top(common, common->top_of_table, gfp);
+	if (IS_ERR_OR_NULL(table_mem)) /* assumes NULL == alloc failure */
+		return table_mem ? PTR_ERR(table_mem) : -ENOMEM;
+	pt_top_set(common, table_mem, pt_top_get_level(common));
+
+	/* Must be last, see pt_iommu_deinit() */
+	iommu_table->ops = &NS(ops);
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+		      struct pt_iommu_table_hw_info *info)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range top_range = pt_top_range(common);
+
+	pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+
+#endif  /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
new file mode 100644
index 000000000000..defa96abc497
--- /dev/null
+++ b/include/linux/generic_pt/iommu.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_IOMMU_H
+#define __GENERIC_PT_IOMMU_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/iommu.h>
+#include <linux/mm_types.h>
+
+struct pt_iommu_ops;
+
+/**
+ * DOC: IOMMU Radix Page Table
+ *
+ * The IOMMU implementation of the Generic Page Table provides an ops struct
+ * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
+ * the generic map/unmap interface.
+ *
+ * This interface uses a caller provided locking approach. The caller must have
+ * a VA range lock concept that prevents concurrent threads from calling ops on
+ * the same VA. Generally the range lock must be at least as large as a single
+ * map call.
+ */
+
+/**
+ * struct pt_iommu - Base structure for IOMMU page tables
+ *
+ * The format-specific struct will include this as the first member.
+ */
+struct pt_iommu {
+	/**
+	 * @domain: The core IOMMU domain. The driver should use a union to
+	 * overlay this memory with its previously existing domain struct to
+	 * create an alias.
+	 */
+	struct iommu_domain domain;
+
+	/**
+	 * @ops: Function pointers to access the API
+	 */
+	const struct pt_iommu_ops *ops;
+
+	/**
+	 * @nid: Node ID to use for table memory allocations. The IOMMU driver
+	 * may want to set the NID to the device's NID, if there are multiple
+	 * table walkers.
+	 */
+	int nid;
+};
+
+/**
+ * struct pt_iommu_info - Details about the IOMMU page table
+ *
+ * Returned from pt_iommu_ops->get_info()
+ */
+struct pt_iommu_info {
+	/**
+	 * @pgsize_bitmap: A bitmask where each set bit indicates
+	 * a page size that can be natively stored in the page table.
+	 */
+	u64 pgsize_bitmap;
+};
+
+struct pt_iommu_ops {
+	/**
+	 * @get_info: Return the pt_iommu_info structure
+	 * @iommu_table: Table to query
+	 *
+	 * Return some basic static information about the page table.
+	 */
+	void (*get_info)(struct pt_iommu *iommu_table,
+			 struct pt_iommu_info *info);
+
+	/**
+	 * @deinit: Undo a format specific init operation
+	 * @iommu_table: Table to destroy
+	 *
+	 * Release all of the memory. The caller must have already removed the
+	 * table from all HW access and all caches.
+	 */
+	void (*deinit)(struct pt_iommu *iommu_table);
+};
+
+static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
+{
+	/*
+	 * It is safe to call pt_iommu_deinit() before an init, or if init
+	 * fails. The ops pointer will only become non-NULL if deinit needs to be
+	 * run.
+	 */
+	if (iommu_table->ops)
+		iommu_table->ops->deinit(iommu_table);
+}
+
+/**
+ * struct pt_iommu_cfg - Common configuration values for all formats
+ */
+struct pt_iommu_cfg {
+	/**
+	 * @features: Features required. Only these features will be turned on.
+	 * The feature list should reflect what the IOMMU HW is capable of.
+	 */
+	unsigned int features;
+	/**
+	 * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will
+	 * imply the top level of the table.
+	 */
+	u8 hw_max_vasz_lg2;
+	/**
+	 * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format
+	 * might select a lower maximum OA.
+	 */
+	u8 hw_max_oasz_lg2;
+};
+
+/* Generate the exported function signatures from iommu_pt.h */
+#define IOMMU_PROTOTYPES(fmt)                                             \
+	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,           \
+				  const struct pt_iommu_##fmt##_cfg *cfg, \
+				  gfp_t gfp);                             \
+	void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table,       \
+				      struct pt_iommu_##fmt##_hw_info *info)
+#define IOMMU_FORMAT(fmt, member)       \
+	struct pt_iommu_##fmt {         \
+		struct pt_iommu iommu;  \
+		struct pt_##fmt member; \
+	};                              \
+	IOMMU_PROTOTYPES(fmt)
+
+/*
+ * The driver should setup its domain struct like
+ *	union {
+ *		struct iommu_domain domain;
+ *		struct pt_iommu_xxx xx;
+ *	};
+ * PT_IOMMU_CHECK_DOMAIN(struct driver_domain, xx.iommu, domain);
+ *
+ * Which creates an alias between driver_domain.domain and
+ * driver_domain.xx.iommu.domain. This is to avoid a mass rename of existing
+ * driver_domain.domain users.
+ */
+#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \
+	static_assert(offsetof(s, pt_iommu_memb.domain) ==   \
+		      offsetof(s, domain_memb))
+
+#undef IOMMU_PROTOTYPES
+#undef IOMMU_FORMAT
+#endif
-- 
cgit v1.2.3


From 879ced2bab1ba95e98fac56c9503791183bc7cbb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:02 -0400
Subject: iommupt: Add the AMD IOMMU v1 page table format

AMD IOMMU v1 is unique in supporting contiguous pages with a variable size
and it can decode the full 64 bit VA space. Unlike other x86 page tables
this explicitly does not do sign extension as part of allowing the entire
64 bit VA space to be supported.

The general design is quite similar to the x86 PAE format, except with a
6th level and quite different PTE encoding.

This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in
the existing code as the existing AMDv1 code starts out with a 3 level
table and adds levels on the fly if more IOVA is needed.

Comparing the performance of several operations to the existing version:

iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     65,64    ,      62,61      ,  -1.01
     2^13,     70,66    ,      67,62      ,  -8.08
     2^14,     73,69    ,      71,65      ,  -9.09
     2^15,     78,75    ,      75,71      ,  -5.05
     2^16,     89,89    ,      86,84      ,  -2.02
     2^17,    128,121   ,     124,112     , -10.10
     2^18,    175,175   ,     170,163     ,  -4.04
     2^19,    264,306   ,     261,279     ,   6.06
     2^20,    444,525   ,     438,489     ,  10.10
     2^21,     60,62    ,      58,59      ,   1.01
 256*2^12,    381,1833  ,     367,1795    ,  79.79
 256*2^21,    375,1623  ,     356,1555    ,  77.77
 256*2^30,    356,1338  ,     349,1277    ,  72.72

iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     76,89    ,      71,86      ,  17.17
     2^13,     79,89    ,      75,86      ,  12.12
     2^14,     78,90    ,      74,86      ,  13.13
     2^15,     82,89    ,      74,86      ,  13.13
     2^16,     79,89    ,      74,86      ,  13.13
     2^17,     81,89    ,      77,87      ,  11.11
     2^18,     90,92    ,      87,89      ,   2.02
     2^19,     91,93    ,      88,90      ,   2.02
     2^20,     96,95    ,      91,92      ,   1.01
     2^21,     72,88    ,      68,85      ,  20.20
 256*2^12,    372,6583  ,     364,6251    ,  94.94
 256*2^21,    398,6032  ,     392,5758    ,  93.93
 256*2^30,    396,5665  ,     389,5258    ,  92.92

The ~5-17x speedup when working with multi-PTE map/unmaps is because the
AMD implementation rewalks the entire table on every new PTE while this
version retains its position. The same speedup will be seen with dirtys as
well.

The old implementation triggers a compiler optimization that ends up
generating a "rep stos" memset for contiguous PTEs. Since AMD can have
contiguous PTEs that span 2Kbytes of table this is a huge win compared to
a normal movq loop. It is why the unmap side has a fairly flat runtime as
the contiguous PTE size increases. This version makes it explicit with a
memset64() call.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/Makefile                     |   1 +
 drivers/iommu/generic_pt/Kconfig           |  12 +
 drivers/iommu/generic_pt/fmt/Makefile      |  11 +
 drivers/iommu/generic_pt/fmt/amdv1.h       | 387 +++++++++++++++++++++++++++++
 drivers/iommu/generic_pt/fmt/defs_amdv1.h  |  21 ++
 drivers/iommu/generic_pt/fmt/iommu_amdv1.c |  15 ++
 include/linux/generic_pt/common.h          |  19 ++
 include/linux/generic_pt/iommu.h           |  12 +
 8 files changed, 478 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
 create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c

(limited to 'include')

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033..b17ef9818759 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
 obj-$(CONFIG_AMD_IOMMU) += amd/
 obj-$(CONFIG_INTEL_IOMMU) += intel/
 obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
 obj-$(CONFIG_IOMMU_API) += iommu-traces.o
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index a81dfdd72ca0..cbdad222923b 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -30,4 +30,16 @@ config IOMMU_PT
 	  related to struct iommu_domain using GENERIC_PT. It provides a single
 	  implementation of the page table operations that can be shared by
 	  multiple drivers.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+	tristate "IOMMU page table for 64-bit AMD IOMMU v1"
+	depends on !GENERIC_ATOMIC64 # for cmpxchg64
+	help
+	  iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+	  "host" page table. It supports granular page sizes of almost every
+	  power of 2 and decodes a full 64-bit IOVA space.
+
+	  Selected automatically by an IOMMU driver that uses this format.
+endif
 endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 000000000000..a4d83b7e0cf6
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+
+define create_format
+obj-$(2) += iommu_$(1).o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 000000000000..7423ed71417d
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note the level numbering here matches the core code, so level 0 is the same
+ * as mode 1.
+ *
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+	PT_MAX_VA_ADDRESS_LG2 = 64,
+	PT_ITEM_WORD_SIZE = sizeof(u64),
+	PT_MAX_TOP_LEVEL = 5,
+	PT_GRANULE_LG2SZ = 12,
+	PT_TABLEMEM_LG2SZ = 12,
+
+	/* The DTE only has these bits for the top physical address */
+	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+	AMDV1PT_FMT_PR = BIT(0),
+	AMDV1PT_FMT_D = BIT(6),
+	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+	AMDV1PT_FMT_FC = BIT_ULL(60),
+	AMDV1PT_FMT_IR = BIT_ULL(61),
+	AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
+ * these defines to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+	u64 entry = pts->entry;
+
+	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+		entry = __sme_clr(entry);
+	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+	u64 entry = pts->entry;
+	pt_oaddr_t oa;
+
+	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+		entry = __sme_clr(entry);
+	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
+
+	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
+		unsigned int sz_bits = oaffz(oa);
+
+		oa = oalog2_set_mod(oa, 0, sz_bits);
+	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
+			      AMDV1PT_FMT_NL_DEFAULT))
+		return 0;
+	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+	/*
+	 * Table 15: Page Table Level Parameters
+	 * The top most level cannot have translation entries
+	 */
+	return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
+/* Body in pt_fmt_defaults.h */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+	u32 code;
+
+	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+	    AMDV1PT_FMT_NL_DEFAULT)
+		return ilog2(1);
+
+	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+		   AMDV1PT_FMT_NL_SIZE);
+
+	/*
+	 * The contiguous size is encoded in the length of a string of 1's in
+	 * the low bits of the OA. Reverse the equation:
+	 *  code = log2_to_int(num_contig_lg2 + item_lg2sz -
+	 *              PT_GRANULE_LG2SZ - 1) - 1
+	 * Which can be expressed as:
+	 *  num_contig_lg2 = oalog2_ffz(code) + 1 -
+	 *              (item_lg2sz - PT_GRANULE_LG2SZ)
+	 *
+	 * Assume the bit layout is correct and remove the masking. Reorganize
+	 * the equation to move all the arithmetic before the ffz.
+	 */
+	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+	return ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+	/*
+	 * Top entry covers bits [63:57] only, this is handled through
+	 * max_vasz_lg2.
+	 */
+	if (PT_WARN_ON(pts->level == 5))
+		return 7;
+	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+	if (!amdv1pt_can_have_leaf(pts))
+		return 0;
+
+	/*
+	 * Table 14: Example Page Size Encodings
+	 * Address bits 51:32 can be used to encode page sizes greater than 4
+	 * Gbytes. Address bits 63:52 are zero-extended.
+	 *
+	 * 512GB Pages are not supported due to a hardware bug.
+	 * Otherwise every power of two size is supported.
+	 */
+	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+			   isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	unsigned int next_level;
+	u64 entry;
+
+	pts->entry = entry = READ_ONCE(*tablep);
+	if (!(entry & AMDV1PT_FMT_PR))
+		return PT_ENTRY_EMPTY;
+
+	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+	    next_level == AMDV1PT_FMT_NL_SIZE)
+		return PT_ENTRY_OA;
+	return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+			   unsigned int oasz_lg2,
+			   const struct pt_write_attrs *attrs)
+{
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	u64 entry;
+
+	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+		return;
+
+	entry = AMDV1PT_FMT_PR |
+		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+		attrs->descriptor_bits;
+
+	if (oasz_lg2 == isz_lg2) {
+		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+				    AMDV1PT_FMT_NL_DEFAULT);
+		WRITE_ONCE(*tablep, entry);
+	} else {
+		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+		u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+				    AMDV1PT_FMT_NL_SIZE) |
+			 FIELD_PREP(AMDV1PT_FMT_OA,
+				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+						  1) -
+					    1);
+
+		/* See amdv1pt_clear_entries() */
+		if (num_contig_lg2 <= ilog2(32)) {
+			for (; tablep != end; tablep++)
+				WRITE_ONCE(*tablep, entry);
+		} else {
+			memset64(tablep, entry, log2_to_int(num_contig_lg2));
+		}
+	}
+	pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+					 pt_oaddr_t table_pa,
+					 const struct pt_write_attrs *attrs)
+{
+	u64 entry;
+
+	/*
+	 * IR and IW are ANDed from the table levels along with the PTE. We
+	 * always control permissions from the PTE, so always set IR and IW for
+	 * tables.
+	 */
+	entry = AMDV1PT_FMT_PR |
+		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+		FIELD_PREP(AMDV1PT_FMT_OA,
+			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+		entry = __sme_set(entry);
+	return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+					   struct pt_write_attrs *attrs)
+{
+	attrs->descriptor_bits =
+		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entries(struct pt_state *pts,
+					 unsigned int num_contig_lg2)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+	/*
+	 * gcc generates rep stos for the io-pgtable code, and this difference
+	 * can show in microbenchmarks with larger contiguous page sizes.
+	 * rep is slower for small cases.
+	 */
+	if (num_contig_lg2 <= ilog2(32)) {
+		for (; tablep != end; tablep++)
+			WRITE_ONCE(*tablep, 0);
+	} else {
+		memset64(tablep, 0, log2_to_int(num_contig_lg2));
+	}
+}
+#define pt_clear_entries amdv1pt_clear_entries
+
+static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+	u64 *tablep = pt_cur_table(pts, u64) +
+		      log2_set_mod(pts->index, 0, num_contig_lg2);
+	u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+	for (; tablep != end; tablep++)
+		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+			return true;
+	return false;
+}
+#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
+
+static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
+{
+	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+	u64 *tablep = pt_cur_table(pts, u64) +
+		      log2_set_mod(pts->index, 0, num_contig_lg2);
+	u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+	for (; tablep != end; tablep++)
+		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	u64 new = pts->entry | AMDV1PT_FMT_D;
+
+	return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+			->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+					 struct pt_write_attrs *attrs,
+					 unsigned int iommu_prot)
+{
+	u64 pte = 0;
+
+	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+		pte |= AMDV1PT_FMT_FC;
+	if (iommu_prot & IOMMU_READ)
+		pte |= AMDV1PT_FMT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		pte |= AMDV1PT_FMT_IW;
+
+	/*
+	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+	 * control this. For now if the tables use sme_set then so do the ptes.
+	 */
+	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+		pte = __sme_set(pte);
+
+	attrs->descriptor_bits = pte;
+	return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+					 const struct pt_iommu_amdv1_cfg *cfg)
+{
+	struct pt_amdv1 *table = &iommu_table->amdpt;
+	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+		return -EINVAL;
+
+	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+	    cfg->starting_level != PT_MAX_TOP_LEVEL)
+		max_vasz_lg2 = PT_GRANULE_LG2SZ +
+			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+				       (cfg->starting_level + 1);
+
+	table->common.max_vasz_lg2 =
+		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+	table->common.max_oasz_lg2 =
+		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+	pt_top_set_level(&table->common, cfg->starting_level);
+	return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+			  const struct pt_range *top_range,
+			  struct pt_iommu_amdv1_hw_info *info)
+{
+	info->host_pt_root = virt_to_phys(top_range->top_table);
+	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
+	info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
new file mode 100644
index 000000000000..0b9614ca6d10
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+	u64 descriptor_bits;
+	gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
new file mode 100644
index 000000000000..72a2337d0c55
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES                                          \
+	(BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) |             \
+	 BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+	 BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |                           \
+	 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES                                       \
+	(BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+	 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
+#include "iommu_template.h"
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index e69a75511313..21e33489cbf2 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -132,4 +132,23 @@ enum pt_features {
 	PT_FEAT_FMT_START,
 };
 
+struct pt_amdv1 {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The memory backing the tables is encrypted. Use __sme_set() to adjust
+	 * the page table pointers in the tree. This only works with
+	 * CONFIG_AMD_MEM_ENCRYPT.
+	 */
+	PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+	/*
+	 * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+	 * snoop. This is set either at creation time or before the first map
+	 * operation.
+	 */
+	PT_FEAT_AMDV1_FORCE_COHERENCE,
+};
+
 #endif
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index defa96abc497..dc731fe003d1 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -145,6 +145,18 @@ struct pt_iommu_cfg {
 	static_assert(offsetof(s, pt_iommu_memb.domain) ==   \
 		      offsetof(s, domain_memb))
 
+struct pt_iommu_amdv1_cfg {
+	struct pt_iommu_cfg common;
+	unsigned int starting_level;
+};
+
+struct pt_iommu_amdv1_hw_info {
+	u64 host_pt_root;
+	u8 mode;
+};
+
+IOMMU_FORMAT(amdv1, amdpt);
+
 #undef IOMMU_PROTOTYPES
 #undef IOMMU_FORMAT
 #endif
-- 
cgit v1.2.3


From 9d4c274cd7d5e1b6b9e116e155f16bcd208237d8 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:03 -0400
Subject: iommupt: Add iova_to_phys op

iova_to_phys is a performance path for the DMA API and iommufd, implement
it using an unrolled get_user_pages() like function waterfall scheme.

The implementation itself is fairly trivial.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 105 ++++++++++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |  19 +++++--
 2 files changed, 119 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 564f2d3a6e11..5ff1b887928a 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -17,6 +17,111 @@
 
 #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
 
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+			 unsigned long iova, unsigned long len)
+{
+	unsigned long last;
+
+	if (unlikely(len == 0))
+		return -EINVAL;
+
+	if (check_add_overflow(iova, len - 1, &last))
+		return -EOVERFLOW;
+
+	*range = pt_make_range(common, iova, last);
+	if (sizeof(iova) > sizeof(range->va)) {
+		if (unlikely(range->va != iova || range->last_va != last))
+			return -EOVERFLOW;
+	}
+	return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+					 struct pt_range *range, u64 iova,
+					 u64 len)
+{
+	if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+		return -EOVERFLOW;
+	return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while others use dma_addr_t as the type.
+ * Dispatch to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len)                   \
+	({                                                              \
+		int ret;                                                \
+		if (sizeof(iova) > sizeof(unsigned long) ||             \
+		    sizeof(len) > sizeof(unsigned long))                \
+			ret = make_range_u64(common, range, iova, len); \
+		else                                                    \
+			ret = make_range_ul(common, range, iova, len);  \
+		ret;                                                    \
+	})
+
+#define make_range(common, range, iova, len)                             \
+	({                                                               \
+		int ret = make_range_no_check(common, range, iova, len); \
+		if (!ret)                                                \
+			ret = pt_check_range(range);                     \
+		ret;                                                     \
+	})
+
+static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
+					     unsigned int level,
+					     struct pt_table_p *table,
+					     pt_level_fn_t descend_fn)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	pt_oaddr_t *res = arg;
+
+	switch (pt_load_single_entry(&pts)) {
+	case PT_ENTRY_EMPTY:
+		return -ENOENT;
+	case PT_ENTRY_TABLE:
+		return pt_descend(&pts, arg, descend_fn);
+	case PT_ENTRY_OA:
+		*res = pt_entry_oa_exact(&pts);
+		return 0;
+	}
+	return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);
+
+/**
+ * iova_to_phys() - Return the output address for the given IOVA
+ * @domain: Domain whose table is queried
+ * @iova: IO virtual address to query
+ *
+ * Determine the output address from the given IOVA. @iova may have any
+ * alignment, the returned physical will be adjusted with any sub page offset.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: 0 if @iova fails range validation or has no translation.
+ */
+phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
+				    dma_addr_t iova)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	struct pt_range range;
+	pt_oaddr_t res;
+	int ret;
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+	if (ret)
+		return 0;
+
+	ret = pt_walk_range(&range, __iova_to_phys, &res);
+	/* PHYS_ADDR_MAX would be a better error code */
+	if (ret)
+		return 0;
+	return res;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
+
 struct pt_iommu_collect_args {
 	struct iommu_pages_list free_list;
 };
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index dc731fe003d1..5622856e1998 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -116,11 +116,13 @@ struct pt_iommu_cfg {
 };
 
 /* Generate the exported function signatures from iommu_pt.h */
-#define IOMMU_PROTOTYPES(fmt)                                             \
-	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,           \
-				  const struct pt_iommu_##fmt##_cfg *cfg, \
-				  gfp_t gfp);                             \
-	void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table,       \
+#define IOMMU_PROTOTYPES(fmt)                                                  \
+	phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
+						  dma_addr_t iova);            \
+	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
+				  const struct pt_iommu_##fmt##_cfg *cfg,      \
+				  gfp_t gfp);                                  \
+	void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table,            \
 				      struct pt_iommu_##fmt##_hw_info *info)
 #define IOMMU_FORMAT(fmt, member)       \
 	struct pt_iommu_##fmt {         \
@@ -129,6 +131,13 @@ struct pt_iommu_cfg {
 	};                              \
 	IOMMU_PROTOTYPES(fmt)
 
+/*
+ * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
+ * iommu_pt
+ */
+#define IOMMU_PT_DOMAIN_OPS(fmt) \
+	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,
+
 /*
  * The driver should setup its domain struct like
  *	union {
-- 
cgit v1.2.3


From 7c53f4238aa8bfb476e177263133ead2eeb8d55d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:04 -0400
Subject: iommupt: Add unmap_pages op

unmap_pages removes mappings and any fully contained interior tables from
the given range. This follows the now-standard iommu_domain API definition
where it does not split up larger page sizes into smaller. The caller must
perform unmap only on ranges created by map or it must have somehow
otherwise determined safe cut points (eg iommufd/vfio use iova_to_phys to
scan for them)

A future work will provide 'cut' which explicitly does the page size split
if the HW can support it.

unmap is implemented with a recursive descent of the tree. If the caller
provides a VA range that spans an entire table item then the table memory
can be freed as well.

If an entire table item can be freed then this version will also check the
leaf-only level of the tree to ensure that all entries are present to
generate -EINVAL. Many of the existing drivers don't do this extra check.

This version sits under the iommu_domain_ops as unmap_pages() but does not
require the external page size calculation. The implementation is actually
unmap_range() and can do arbitrary ranges, internally handling all the
validation and supporting any arrangement of page sizes. A future series
can optimize __iommu_unmap() to take advantage of this.

Freed page table memory is batched up in the gather and will be freed in
the driver's iotlb_sync() callback after the IOTLB flush completes.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |  10 ++-
 2 files changed, 164 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 5ff1b887928a..e3d1b272723d 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -14,6 +14,29 @@
 #include <linux/export.h>
 #include <linux/iommu.h>
 #include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+			       struct pt_iommu *iommu_table, pt_vaddr_t iova,
+			       pt_vaddr_t len,
+			       struct iommu_pages_list *free_list)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+
+	if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+	    iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+		iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+		/*
+		 * Note that the sync frees the gather's free list, so we must
+		 * not have any pages on that list that are covered by iova/len
+		 */
+	} else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+		iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+	}
+
+	iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
 
 #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
 
@@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
 		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
 }
 
+struct pt_unmap_args {
+	struct iommu_pages_list free_list;
+	pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+					unsigned int level,
+					struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_unmap_args *unmap = arg;
+	unsigned int num_oas = 0;
+	unsigned int start_index;
+	int ret = 0;
+
+	_pt_iter_first(&pts);
+	start_index = pts.index;
+	pts.type = pt_load_entry_raw(&pts);
+	/*
+	 * A starting index is in the middle of a contiguous entry
+	 *
+	 * The IOMMU API does not require drivers to support unmapping parts of
+	 * large pages. Long ago VFIO would try to split maps but the current
+	 * version never does.
+	 *
+	 * Instead when unmap reaches a partial unmap of the start of a large
+	 * IOPTE it should remove the entire IOPTE and return that size to the
+	 * caller.
+	 */
+	if (pts.type == PT_ENTRY_OA) {
+		if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+			return -EINVAL;
+		/* Micro optimization */
+		goto start_oa;
+	}
+
+	do {
+		if (pts.type != PT_ENTRY_OA) {
+			bool fully_covered;
+
+			if (pts.type != PT_ENTRY_TABLE) {
+				ret = -EINVAL;
+				break;
+			}
+
+			if (pts.index != start_index)
+				pt_index_to_va(&pts);
+			pts.table_lower = pt_table_ptr(&pts);
+
+			fully_covered = pt_entry_fully_covered(
+				&pts, pt_table_item_lg2sz(&pts));
+
+			ret = pt_descend(&pts, arg, __unmap_range);
+			if (ret)
+				break;
+
+			/*
+			 * If the unmapping range fully covers the table then we
+			 * can free it as well. The clear is delayed until we
+			 * succeed in clearing the lower table levels.
+			 */
+			if (fully_covered) {
+				iommu_pages_list_add(&unmap->free_list,
+						     pts.table_lower);
+				pt_clear_entries(&pts, ilog2(1));
+			}
+			pts.index++;
+		} else {
+			unsigned int num_contig_lg2;
+start_oa:
+			/*
+			 * If the caller requested a last that falls within a
+			 * single entry then the entire entry is unmapped and
+			 * the length returned will be larger than requested.
+			 */
+			num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+			pt_clear_entries(&pts, num_contig_lg2);
+			num_oas += log2_to_int(num_contig_lg2);
+			pts.index += log2_to_int(num_contig_lg2);
+		}
+		if (pts.index >= pts.end_index)
+			break;
+		pts.type = pt_load_entry_raw(&pts);
+	} while (true);
+
+	unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+	return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
+size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+			      size_t pgsize, size_t pgcount,
+			      struct iommu_iotlb_gather *iotlb_gather)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
+					       unmap.free_list) };
+	pt_vaddr_t len = pgsize * pgcount;
+	struct pt_range range;
+	int ret;
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+	if (ret)
+		return 0;
+
+	pt_walk_range(&range, __unmap_range, &unmap);
+
+	gather_range_pages(iotlb_gather, iommu_table, iova, len,
+			   &unmap.free_list);
+
+	return unmap.unmapped;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
+
 static void NS(get_info)(struct pt_iommu *iommu_table,
 			 struct pt_iommu_info *info)
 {
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 5622856e1998..ceb6bc9cea37 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -9,6 +9,7 @@
 #include <linux/iommu.h>
 #include <linux/mm_types.h>
 
+struct iommu_iotlb_gather;
 struct pt_iommu_ops;
 
 /**
@@ -119,6 +120,10 @@ struct pt_iommu_cfg {
 #define IOMMU_PROTOTYPES(fmt)                                                  \
 	phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
 						  dma_addr_t iova);            \
+	size_t pt_iommu_##fmt##_unmap_pages(                                   \
+		struct iommu_domain *domain, unsigned long iova,               \
+		size_t pgsize, size_t pgcount,                                 \
+		struct iommu_iotlb_gather *iotlb_gather);                      \
 	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
 				  const struct pt_iommu_##fmt##_cfg *cfg,      \
 				  gfp_t gfp);                                  \
@@ -135,8 +140,9 @@ struct pt_iommu_cfg {
  * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
  * iommu_pt
  */
-#define IOMMU_PT_DOMAIN_OPS(fmt) \
-	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,
+#define IOMMU_PT_DOMAIN_OPS(fmt)                        \
+	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
+	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
 
 /*
  * The driver should setup its domain struct like
-- 
cgit v1.2.3


From dcd6a011a8d523a114af2360a8753de5bd60c139 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:05 -0400
Subject: iommupt: Add map_pages op

map is slightly complicated because it has to handle a number of special
edge cases:
 - Overmapping a previously shared, but now empty, table level with an OA.
   Requires validating and freeing the possibly empty tables
 - Doing the above across an entire to-be-created contiguous entry
 - Installing a new shared table level concurrently with another thread
 - Expanding the table by adding more top levels

Table expansion is a unique feature of AMDv1, this version is quite
similar except we handle racing concurrent lockless map. The table top
pointer and starting level are encoded in a single uintptr_t which ensures
we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use
that fixed point as its starting point. Concurrent expansion is handled
with a table global spinlock.

When inserting a new table entry map checks that the entire portion of the
table is empty. This includes freeing any empty lower tables that will be
overwritten by an OA. A separate free list is used while checking and
collecting all the empty lower tables so that writing the new entry is
uninterrupted, either the new entry fully writes or nothing changes.

A special fast path for PAGE_SIZE is implemented that does a direct walk
to the leaf level and installs a single entry. This gives ~15% improvement
for iommu_map() when mapping lists of single pages.

This version sits under the iommu_domain_ops as map_pages() but does not
require the external page size calculation. The implementation is actually
map_range() and can do arbitrary ranges, internally handling all the
validation and supporting any arrangement of page sizes. A future series
can optimize iommu_map() to take advantage of this.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 501 +++++++++++++++++++++++++++++++++++-
 drivers/iommu/generic_pt/pt_iter.h  |   2 +-
 include/linux/generic_pt/iommu.h    |  59 +++++
 3 files changed, 560 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index e3d1b272723d..f32e81509f4f 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -91,6 +91,23 @@ static __maybe_unused int make_range_u64(struct pt_common *common,
 		ret;                                                     \
 	})
 
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+					       pt_oaddr_t oa)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+	if (!pt_can_have_leaf(pts))
+		return 0;
+
+	/*
+	 * The page size is limited by the domain's bitmap. This allows the core
+	 * code to reduce the supported page sizes by changing the bitmap.
+	 */
+	return pt_compute_best_pgsize(pt_possible_sizes(pts) &
+					      iommu_table->domain.pgsize_bitmap,
+				      pts->range->va, pts->range->last_va, oa);
+}
+
 static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
 					     unsigned int level,
 					     struct pt_table_p *table,
@@ -147,6 +164,8 @@ EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
 
 struct pt_iommu_collect_args {
 	struct iommu_pages_list free_list;
+	/* Fail if any OAs are within the range */
+	u8 check_mapped : 1;
 };
 
 static int __collect_tables(struct pt_range *range, void *arg,
@@ -156,7 +175,7 @@ static int __collect_tables(struct pt_range *range, void *arg,
 	struct pt_iommu_collect_args *collect = arg;
 	int ret;
 
-	if (!pt_can_have_table(&pts))
+	if (!collect->check_mapped && !pt_can_have_table(&pts))
 		return 0;
 
 	for_each_pt_level_entry(&pts) {
@@ -167,6 +186,8 @@ static int __collect_tables(struct pt_range *range, void *arg,
 				return ret;
 			continue;
 		}
+		if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+			return -EADDRINUSE;
 	}
 	return 0;
 }
@@ -187,6 +208,477 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
 		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
 }
 
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+					     gfp_t gfp)
+{
+	struct pt_iommu *iommu_table =
+		iommu_from_common(parent_pts->range->common);
+	struct pt_state child_pts =
+		pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+	return iommu_alloc_pages_node_sz(
+		iommu_table->nid, gfp,
+		log2_to_int(pt_num_items_lg2(&child_pts) +
+			    ilog2(PT_ITEM_WORD_SIZE)));
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+				     struct pt_write_attrs *attrs)
+{
+	struct pt_table_p *table_mem;
+	phys_addr_t phys;
+
+	/* Given PA/VA/length can't be represented */
+	if (PT_WARN_ON(!pt_can_have_table(pts)))
+		return -ENXIO;
+
+	table_mem = table_alloc(pts, attrs->gfp);
+	if (IS_ERR(table_mem))
+		return PTR_ERR(table_mem);
+
+	phys = virt_to_phys(table_mem);
+	if (!pt_install_table(pts, phys, attrs)) {
+		iommu_free_pages(table_mem);
+		return -EAGAIN;
+	}
+
+	if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+		/*
+		 * The underlying table can't store the physical table address.
+		 * This happens when kunit testing tables outside their normal
+		 * environment where a CPU might be limited.
+		 */
+		pt_load_single_entry(pts);
+		if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+			pt_clear_entries(pts, ilog2(1));
+			iommu_free_pages(table_mem);
+			return -EINVAL;
+		}
+	}
+
+	pts->table_lower = table_mem;
+	return 0;
+}
+
+struct pt_iommu_map_args {
+	struct iommu_iotlb_gather *iotlb_gather;
+	struct pt_write_attrs attrs;
+	pt_oaddr_t oa;
+	unsigned int leaf_pgsize_lg2;
+	unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
+ */
+static int clear_contig(const struct pt_state *start_pts,
+			struct iommu_iotlb_gather *iotlb_gather,
+			unsigned int step, unsigned int pgsize_lg2)
+{
+	struct pt_iommu *iommu_table =
+		iommu_from_common(start_pts->range->common);
+	struct pt_range range = *start_pts->range;
+	struct pt_state pts =
+		pt_init(&range, start_pts->level, start_pts->table);
+	struct pt_iommu_collect_args collect = { .check_mapped = true };
+	int ret;
+
+	pts.index = start_pts->index;
+	pts.end_index = start_pts->index + step;
+	for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			collect.free_list =
+				IOMMU_PAGES_LIST_INIT(collect.free_list);
+			ret = pt_walk_descend_all(&pts, __collect_tables,
+						  &collect);
+			if (ret)
+				return ret;
+
+			/*
+			 * The table item must be cleared before we can update
+			 * the gather
+			 */
+			pt_clear_entries(&pts, ilog2(1));
+
+			iommu_pages_list_add(&collect.free_list,
+					     pt_table_ptr(&pts));
+			gather_range_pages(
+				iotlb_gather, iommu_table, range.va,
+				log2_to_int(pt_table_item_lg2sz(&pts)),
+				&collect.free_list);
+		} else if (pts.type != PT_ENTRY_EMPTY) {
+			return -EADDRINUSE;
+		}
+	}
+	return 0;
+}
+
+static int __map_range_leaf(struct pt_range *range, void *arg,
+			    unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_map_args *map = arg;
+	unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
+	unsigned int start_index;
+	pt_oaddr_t oa = map->oa;
+	unsigned int step;
+	bool need_contig;
+	int ret = 0;
+
+	PT_WARN_ON(map->leaf_level != level);
+	PT_WARN_ON(!pt_can_have_leaf(&pts));
+
+	step = log2_to_int_t(unsigned int,
+			     leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
+	need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
+
+	_pt_iter_first(&pts);
+	start_index = pts.index;
+	do {
+		pts.type = pt_load_entry_raw(&pts);
+		if (pts.type != PT_ENTRY_EMPTY || need_contig) {
+			if (pts.index != start_index)
+				pt_index_to_va(&pts);
+			ret = clear_contig(&pts, map->iotlb_gather, step,
+					   leaf_pgsize_lg2);
+			if (ret)
+				break;
+		}
+
+		if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+			pt_index_to_va(&pts);
+			PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
+				   leaf_pgsize_lg2);
+		}
+		pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
+
+		oa += log2_to_int(leaf_pgsize_lg2);
+		pts.index += step;
+	} while (pts.index < pts.end_index);
+
+	map->oa = oa;
+	return ret;
+}
+
+static int __map_range(struct pt_range *range, void *arg, unsigned int level,
+		       struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_map_args *map = arg;
+	int ret;
+
+	PT_WARN_ON(map->leaf_level == level);
+	PT_WARN_ON(!pt_can_have_table(&pts));
+
+	_pt_iter_first(&pts);
+
+	/* Descend to a child table */
+	do {
+		pts.type = pt_load_entry_raw(&pts);
+
+		if (pts.type != PT_ENTRY_TABLE) {
+			if (pts.type != PT_ENTRY_EMPTY)
+				return -EADDRINUSE;
+			ret = pt_iommu_new_table(&pts, &map->attrs);
+			if (ret) {
+				/*
+				 * Racing with another thread installing a table
+				 */
+				if (ret == -EAGAIN)
+					continue;
+				return ret;
+			}
+		} else {
+			pts.table_lower = pt_table_ptr(&pts);
+		}
+
+		/*
+		 * The already present table can possibly be shared with another
+		 * concurrent map.
+		 */
+		if (map->leaf_level == level - 1)
+			ret = pt_descend(&pts, arg, __map_range_leaf);
+		else
+			ret = pt_descend(&pts, arg, __map_range);
+		if (ret)
+			return ret;
+
+		pts.index++;
+		pt_index_to_va(&pts);
+		if (pts.index >= pts.end_index)
+			break;
+	} while (true);
+	return 0;
+}
+
+/*
+ * Fast path for the easy case of mapping a 4k page to an already allocated
+ * table. This is a common workload. If it returns EAGAIN run the full algorithm
+ * instead.
+ */
+static __always_inline int __do_map_single_page(struct pt_range *range,
+						void *arg, unsigned int level,
+						struct pt_table_p *table,
+						pt_level_fn_t descend_fn)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_map_args *map = arg;
+
+	pts.type = pt_load_single_entry(&pts);
+	if (level == 0) {
+		if (pts.type != PT_ENTRY_EMPTY)
+			return -EADDRINUSE;
+		pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+				      &map->attrs);
+		map->oa += PAGE_SIZE;
+		return 0;
+	}
+	if (pts.type == PT_ENTRY_TABLE)
+		return pt_descend(&pts, arg, descend_fn);
+	/* Something else, use the slow path */
+	return -EAGAIN;
+}
+PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass range.
+ */
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+			struct pt_iommu_map_args *map)
+{
+	struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
+	struct pt_common *common = common_from_iommu(iommu_table);
+	uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+	uintptr_t new_top_of_table = top_of_table;
+	struct pt_table_p *table_mem;
+	unsigned int new_level;
+	spinlock_t *domain_lock;
+	unsigned long flags;
+	int ret;
+
+	while (true) {
+		struct pt_range top_range =
+			_pt_top_range(common, new_top_of_table);
+		struct pt_state pts = pt_init_top(&top_range);
+
+		top_range.va = range->va;
+		top_range.last_va = range->last_va;
+
+		if (!pt_check_range(&top_range) && map->leaf_level <= pts.level)
+			break;
+
+		pts.level++;
+		if (pts.level > PT_MAX_TOP_LEVEL ||
+		    pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+			ret = -ERANGE;
+			goto err_free;
+		}
+
+		new_level = pts.level;
+		table_mem = table_alloc_top(
+			common, _pt_top_set(NULL, pts.level), map->attrs.gfp);
+		if (IS_ERR(table_mem))
+			return PTR_ERR(table_mem);
+		iommu_pages_list_add(&free_list, table_mem);
+
+		/* The new table links to the lower table always at index 0 */
+		top_range.va = 0;
+		top_range.top_level = new_level;
+		pts.table_lower = pts.table;
+		pts.table = table_mem;
+		pt_load_single_entry(&pts);
+		PT_WARN_ON(pts.index != 0);
+		pt_install_table(&pts, virt_to_phys(pts.table_lower),
+				 &map->attrs);
+		new_top_of_table = _pt_top_set(pts.table, pts.level);
+	}
+
+	/*
+	 * top_of_table is write locked by the spinlock, but readers can use
+	 * READ_ONCE() to get the value. Since we encode both the level and the
+	 * pointer in one quantum the lockless reader will always see something
+	 * valid. The HW must be updated to the new level under the spinlock
+	 * before top_of_table is updated so that concurrent readers don't map
+	 * into the new level until it is fully functional. If another thread
+	 * already updated it while we were working then throw everything away
+	 * and try again.
+	 */
+	domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
+	spin_lock_irqsave(domain_lock, flags);
+	if (common->top_of_table != top_of_table) {
+		spin_unlock_irqrestore(domain_lock, flags);
+		ret = -EAGAIN;
+		goto err_free;
+	}
+
+	/*
+	 * We do not issue any flushes for change_top on the expectation that
+	 * any walk cache will not become a problem by adding another layer to
+	 * the tree. Misses will rewalk from the updated top pointer, hits
+	 * continue to be correct. Negative caching is fine too since all the
+	 * new IOVA added by the new top is non-present.
+	 */
+	iommu_table->driver_ops->change_top(
+		iommu_table, virt_to_phys(table_mem), new_level);
+	WRITE_ONCE(common->top_of_table, new_top_of_table);
+	spin_unlock_irqrestore(domain_lock, flags);
+	return 0;
+
+err_free:
+	iommu_put_pages_list(&free_list);
+	return ret;
+}
+
+static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
+			   struct pt_iommu_map_args *map)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	int ret;
+
+	do {
+		ret = pt_check_range(range);
+		if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+			return ret;
+
+		if (!ret && map->leaf_level <= range->top_level)
+			break;
+
+		ret = increase_top(iommu_table, range, map);
+		if (ret && ret != -EAGAIN)
+			return ret;
+
+		/* Reload the new top */
+		*range = pt_make_range(common, range->va, range->last_va);
+	} while (ret);
+	PT_WARN_ON(pt_check_range(range));
+	return 0;
+}
+
+static int do_map(struct pt_range *range, bool single_page,
+		  struct pt_iommu_map_args *map)
+{
+	if (single_page) {
+		int ret;
+
+		ret = pt_walk_range(range, __map_single_page, map);
+		if (ret != -EAGAIN)
+			return ret;
+		/* EAGAIN falls through to the full path */
+	}
+
+	if (map->leaf_level == range->top_level)
+		return pt_walk_range(range, __map_range_leaf, map);
+	return pt_walk_range(range, __map_range, map);
+}
+
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into compatible
+ * blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from iova
+ * up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
+ * mapped are added to @mapped, @mapped is not zeroed first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			 int prot, gfp_t gfp, size_t *mapped)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct iommu_iotlb_gather iotlb_gather;
+	pt_vaddr_t len = pgsize * pgcount;
+	struct pt_iommu_map_args map = {
+		.iotlb_gather = &iotlb_gather,
+		.oa = paddr,
+		.leaf_pgsize_lg2 = vaffs(pgsize),
+	};
+	bool single_page = false;
+	struct pt_range range;
+	int ret;
+
+	iommu_iotlb_gather_init(&iotlb_gather);
+
+	if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+		return -EINVAL;
+
+	/* Check the paddr doesn't exceed what the table can store */
+	if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+	     (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+	    (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+	     oalog2_div(paddr, common->max_oasz_lg2)))
+		return -ERANGE;
+
+	ret = pt_iommu_set_prot(common, &map.attrs, prot);
+	if (ret)
+		return ret;
+	map.attrs.gfp = gfp;
+
+	ret = make_range_no_check(common, &range, iova, len);
+	if (ret)
+		return ret;
+
+	/* Calculate target page size and level for the leaves */
+	if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+	    pgcount == 1) {
+		PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+		if (log2_mod(iova | paddr, PAGE_SHIFT))
+			return -ENXIO;
+		map.leaf_pgsize_lg2 = PAGE_SHIFT;
+		map.leaf_level = 0;
+		single_page = true;
+	} else {
+		map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+			pgsize_bitmap, range.va, range.last_va, paddr);
+		if (!map.leaf_pgsize_lg2)
+			return -ENXIO;
+		map.leaf_level =
+			pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+	}
+
+	ret = check_map_range(iommu_table, &range, &map);
+	if (ret)
+		return ret;
+
+	PT_WARN_ON(map.leaf_level > range.top_level);
+
+	ret = do_map(&range, single_page, &map);
+
+	/*
+	 * Table levels were freed and replaced with large items, flush any walk
+	 * cache that may refer to the freed levels.
+	 */
+	if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+		iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+	/* Bytes successfully mapped */
+	PT_WARN_ON(!ret && map.oa - paddr != len);
+	*mapped += map.oa - paddr;
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
+
 struct pt_unmap_args {
 	struct iommu_pages_list free_list;
 	pt_vaddr_t unmapped;
@@ -445,6 +937,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
 	memset_after(fmt_table, 0, iommu.domain);
 
 	/* The caller can initialize some of these values */
+	iommu_table->driver_ops = cfg.driver_ops;
 	iommu_table->nid = cfg.nid;
 }
 
@@ -478,6 +971,12 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table,
 	if (ret)
 		return ret;
 
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+	    WARN_ON(!iommu_table->driver_ops ||
+		    !iommu_table->driver_ops->change_top ||
+		    !iommu_table->driver_ops->get_top_lock))
+		return -EINVAL;
+
 	if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
 	    (pt_feature(common, PT_FEAT_FULL_VA) ||
 	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
index 87f4a26c1a41..c0d8617cce29 100644
--- a/drivers/iommu/generic_pt/pt_iter.h
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -612,7 +612,7 @@ static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
  * This builds a function call tree that can be fully inlined.
  * The caller must provide a function body in an __always_inline function::
  *
- *  static __always_inline int do(struct pt_range *range, void *arg,
+ *  static __always_inline int do_fn(struct pt_range *range, void *arg,
  *         unsigned int level, struct pt_table_p *table,
  *         pt_level_fn_t descend_fn)
  *
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index ceb6bc9cea37..0d59423024d5 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -11,6 +11,7 @@
 
 struct iommu_iotlb_gather;
 struct pt_iommu_ops;
+struct pt_iommu_driver_ops;
 
 /**
  * DOC: IOMMU Radix Page Table
@@ -43,6 +44,12 @@ struct pt_iommu {
 	 */
 	const struct pt_iommu_ops *ops;
 
+	/**
+	 * @driver_ops: Function pointers provided by the HW driver to help
+	 * manage HW details like caches.
+	 */
+	const struct pt_iommu_driver_ops *driver_ops;
+
 	/**
 	 * @nid: Node ID to use for table memory allocations. The IOMMU driver
 	 * may want to set the NID to the device's NID, if there are multiple
@@ -84,6 +91,53 @@ struct pt_iommu_ops {
 	void (*deinit)(struct pt_iommu *iommu_table);
 };
 
+/**
+ * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations
+ *
+ * The IOMMU driver should implement these using container_of(iommu_table) to
+ * get to its iommu_domain derived structure. All ops can be called in atomic
+ * contexts as they are buried under DMA API calls.
+ */
+struct pt_iommu_driver_ops {
+	/**
+	 * @change_top: Update the top of table pointer
+	 * @iommu_table: Table to operate on
+	 * @top_paddr: New CPU physical address of the top pointer
+	 * @top_level: IOMMU PT level of the new top
+	 *
+	 * Called under the get_top_lock() spinlock. The driver must update all
+	 * HW references to this domain with a new top address and
+	 * configuration. On return mappings placed in the new top must be
+	 * reachable by the HW.
+	 *
+	 * top_level encodes the level in IOMMU PT format, level 0 is the
+	 * smallest page size increasing from there. This has to be translated
+	 * to any HW specific format. During this call the new top will not be
+	 * visible to any other API.
+	 *
+	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+	 * enabled.
+	 */
+	void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr,
+			   unsigned int top_level);
+
+	/**
+	 * @get_top_lock: lock to hold when changing the table top
+	 * @iommu_table: Table to operate on
+	 *
+	 * Return a lock to hold when changing the table top page table from
+	 * being stored in HW. The lock will be held prior to calling
+	 * change_top() and released once the top is fully visible.
+	 *
+	 * Typically this would be a lock that protects the iommu_domain's
+	 * attachment list.
+	 *
+	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+	 * enabled.
+	 */
+	spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table);
+};
+
 static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
 {
 	/*
@@ -120,6 +174,10 @@ struct pt_iommu_cfg {
 #define IOMMU_PROTOTYPES(fmt)                                                  \
 	phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
 						  dma_addr_t iova);            \
+	int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain,            \
+				       unsigned long iova, phys_addr_t paddr,  \
+				       size_t pgsize, size_t pgcount,          \
+				       int prot, gfp_t gfp, size_t *mapped);   \
 	size_t pt_iommu_##fmt##_unmap_pages(                                   \
 		struct iommu_domain *domain, unsigned long iova,               \
 		size_t pgsize, size_t pgcount,                                 \
@@ -142,6 +200,7 @@ struct pt_iommu_cfg {
  */
 #define IOMMU_PT_DOMAIN_OPS(fmt)                        \
 	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
+	.map_pages = &pt_iommu_##fmt##_map_pages,       \
 	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
 
 /*
-- 
cgit v1.2.3


From 4a00f943489103b4b9edff9f39bd484efbfb15fa Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:06 -0400
Subject: iommupt: Add read_and_clear_dirty op

IOMMU HW now supports updating a dirty bit in an entry when a DMA writes
to the entry's VA range. iommufd has a uAPI to read and clear the dirty
bits from the tables.

This is a trivial recursive descent algorithm to read and optionally clear
the dirty bits. The format needs a function to tell if a contiguous entry
is dirty, and a function to clear a contiguous entry back to clean.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 104 ++++++++++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |   6 +++
 2 files changed, 110 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index f32e81509f4f..448c5796d4a8 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -162,6 +162,108 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
 
+struct pt_iommu_dirty_args {
+	struct iommu_dirty_bitmap *dirty;
+	unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+			 struct pt_iommu_dirty_args *dirty,
+			 unsigned int num_contig_lg2)
+{
+	pt_vaddr_t dirty_len;
+
+	if (num_contig_lg2 != ilog2(1)) {
+		unsigned int index = pts->index;
+		unsigned int end_index = log2_set_mod_max_t(
+			unsigned int, pts->index, num_contig_lg2);
+
+		/* Adjust for being contained inside a contiguous page */
+		end_index = min(end_index, pts->end_index);
+		dirty_len = (end_index - index) *
+				log2_to_int(pt_table_item_lg2sz(pts));
+	} else {
+		dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+	}
+
+	if (dirty->dirty->bitmap)
+		iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+				dirty_len);
+
+	if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+		pt_entry_make_write_clean(pts);
+		iommu_iotlb_gather_add_range(dirty->dirty->gather,
+					     pts->range->va, dirty_len);
+	}
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+					 unsigned int level,
+					 struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_dirty_args *dirty = arg;
+	int ret;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+			if (ret)
+				return ret;
+			continue;
+		}
+		if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+			record_dirty(&pts, dirty,
+				     pt_entry_num_contig_lg2(&pts));
+	}
+	return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified then
+ * the entries will be left dirty, otherwise they are returned to being not
+ * write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	struct pt_iommu_dirty_args dirty_args = {
+		.dirty = dirty,
+		.flags = flags,
+	};
+	struct pt_range range;
+	int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+	return -EOPNOTSUPP;
+#endif
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+	if (ret)
+		return ret;
+
+	ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+	PT_WARN_ON(ret);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
+
 struct pt_iommu_collect_args {
 	struct iommu_pages_list free_list;
 	/* Fail if any OAs are within the range */
@@ -1015,5 +1117,7 @@ EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
 MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
 
 #endif  /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 0d59423024d5..03a906fbe12a 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -12,6 +12,7 @@
 struct iommu_iotlb_gather;
 struct pt_iommu_ops;
 struct pt_iommu_driver_ops;
+struct iommu_dirty_bitmap;
 
 /**
  * DOC: IOMMU Radix Page Table
@@ -182,6 +183,9 @@ struct pt_iommu_cfg {
 		struct iommu_domain *domain, unsigned long iova,               \
 		size_t pgsize, size_t pgcount,                                 \
 		struct iommu_iotlb_gather *iotlb_gather);                      \
+	int pt_iommu_##fmt##_read_and_clear_dirty(                             \
+		struct iommu_domain *domain, unsigned long iova, size_t size,  \
+		unsigned long flags, struct iommu_dirty_bitmap *dirty);        \
 	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
 				  const struct pt_iommu_##fmt##_cfg *cfg,      \
 				  gfp_t gfp);                                  \
@@ -202,6 +206,8 @@ struct pt_iommu_cfg {
 	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
 	.map_pages = &pt_iommu_##fmt##_map_pages,       \
 	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
+#define IOMMU_PT_DIRTY_OPS(fmt) \
+	.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
 
 /*
  * The driver should setup its domain struct like
-- 
cgit v1.2.3


From e5359dcc617a2174d834bab4083340196615d8bd Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:08 -0400
Subject: iommupt: Add a mock pagetable format for iommufd selftest to use

The iommufd self test uses an xarray to store the pfns and their orders to
emulate a page table. Slightly modify the amdv1 page table to create a
real page table that has similar properties:

 - 2k base granule to simulate something like a 4k page table on a 64K
   PAGE_SIZE ARM system
 - Contiguous page support for every PFN order
 - Dirty tracking

AMDv1 is the closest format, as it is the only one that already supports
every page size. Tweak it to have only 5 levels and an 11 bit base granule
and compile it separately as a format variant.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/fmt/Makefile     |  1 +
 drivers/iommu/generic_pt/fmt/amdv1.h      | 18 ++++++++++++++++--
 drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 ++++++++++
 include/linux/generic_pt/iommu.h          |  6 ++++++
 4 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index 32f3956c7509..f0c22cf5f7be 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
 
 IOMMU_PT_KUNIT_TEST :=
 define create_format
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
index aaf76bfd21da..aa8e1a8ec95f 100644
--- a/drivers/iommu/generic_pt/fmt/amdv1.h
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -26,11 +26,23 @@
 #include <linux/string.h>
 
 enum {
-	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
-	PT_MAX_VA_ADDRESS_LG2 = 64,
 	PT_ITEM_WORD_SIZE = sizeof(u64),
+	/*
+	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
+	 * uses a 2k page size to test cases where the CPU page size is not the
+	 * same.
+	 */
+#ifdef AMDV1_IOMMUFD_SELFTEST
+	PT_MAX_VA_ADDRESS_LG2 = 56,
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
+	PT_MAX_TOP_LEVEL = 4,
+	PT_GRANULE_LG2SZ = 11,
+#else
+	PT_MAX_VA_ADDRESS_LG2 = 64,
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
 	PT_MAX_TOP_LEVEL = 5,
 	PT_GRANULE_LG2SZ = 12,
+#endif
 	PT_TABLEMEM_LG2SZ = 12,
 
 	/* The DTE only has these bits for the top phyiscal address */
@@ -374,6 +386,7 @@ static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
 }
 #define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
 
+#ifndef PT_FMT_VARIANT
 static inline void
 amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
 			  const struct pt_range *top_range,
@@ -384,6 +397,7 @@ amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
 	info->mode = top_range->top_level + 1;
 }
 #define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
 
 #if defined(GENERIC_PT_KUNIT)
 static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c
new file mode 100644
index 000000000000..74e597cba9d9
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define AMDV1_IOMMUFD_SELFTEST 1
+#define PT_FMT amdv1
+#define PT_FMT_VARIANT mock
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 03a906fbe12a..848a5fb76272 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -237,6 +237,12 @@ struct pt_iommu_amdv1_hw_info {
 
 IOMMU_FORMAT(amdv1, amdpt);
 
+/* amdv1_mock is used by the iommufd selftest */
+#define pt_iommu_amdv1_mock pt_iommu_amdv1
+#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg
+struct pt_iommu_amdv1_mock_hw_info;
+IOMMU_PROTOTYPES(amdv1_mock);
+
 #undef IOMMU_PROTOTYPES
 #undef IOMMU_FORMAT
 #endif
-- 
cgit v1.2.3


From e93d5945ed5bb086431e83eed7ab98b6c058cc0b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:09 -0400
Subject: iommufd: Change the selftest to use iommupt instead of xarray

The iommufd self test uses an xarray to store the pfns and their orders to
emulate a page table. Make it act more like a real iommu driver by
replacing the xarray with an iommupt based page table. The new AMDv1 mock
format behaves similarly to the xarray.

Add set_dirty() as an iommu_pt operation to allow the test suite to
simulate HW dirty.

Userspace can select between several formats including the normal AMDv1
format and a special MOCK_IOMMUPT_HUGE variation for testing huge page
dirty tracking. To make the dirty tracking test work, the page table must
only store exactly MOCK_HUGE_PAGE_SIZE (1M) huge pages, otherwise the logic
the test uses fails. They cannot be broken up or combined.

Aside from aligning the selftest with a real page table implementation,
this helps test the iommupt code itself.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h           |  39 +++
 drivers/iommu/iommufd/Kconfig                 |   1 +
 drivers/iommu/iommufd/iommufd_test.h          |  11 +-
 drivers/iommu/iommufd/selftest.c              | 424 +++++++++++---------------
 include/linux/generic_pt/iommu.h              |  12 +
 tools/testing/selftests/iommu/iommufd.c       |  60 ++--
 tools/testing/selftests/iommu/iommufd_utils.h |  12 +
 7 files changed, 282 insertions(+), 277 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 448c5796d4a8..142001f5aa83 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -264,6 +264,41 @@ int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
 
+static inline int __set_dirty(struct pt_range *range, void *arg,
+			      unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+
+	switch (pt_load_single_entry(&pts)) {
+	case PT_ENTRY_EMPTY:
+		return -ENOENT;
+	case PT_ENTRY_TABLE:
+		return pt_descend(&pts, arg, __set_dirty);
+	case PT_ENTRY_OA:
+		if (!pt_entry_make_write_dirty(&pts))
+			return -EAGAIN;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+					dma_addr_t iova)
+{
+	struct pt_range range;
+	int ret;
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Note: There is no locking here yet, if the test suite races this it
+	 * can crash. It should use RCU locking eventually.
+	 */
+	return pt_walk_range(&range, __set_dirty, NULL);
+}
+
 struct pt_iommu_collect_args {
 	struct iommu_pages_list free_list;
 	/* Fail if any OAs are within the range */
@@ -957,6 +992,10 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
 }
 
 static const struct pt_iommu_ops NS(ops) = {
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
+	IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+	.set_dirty = NS(set_dirty),
+#endif
 	.get_info = NS(get_info),
 	.deinit = NS(deinit),
 };
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 2beeb4f60ee5..eae3f03629b0 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -41,6 +41,7 @@ config IOMMUFD_TEST
 	depends on DEBUG_KERNEL
 	depends on FAULT_INJECTION
 	depends on RUNTIME_TESTING_MENU
+	depends on IOMMU_PT_AMDV1
 	select IOMMUFD_DRIVER
 	default n
 	help
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 8fc618b2bcf9..781a75c79eea 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -31,9 +31,18 @@ enum {
 	IOMMU_TEST_OP_PASID_CHECK_HWPT,
 };
 
+enum {
+	MOCK_IOMMUPT_DEFAULT = 0,
+	MOCK_IOMMUPT_HUGE,
+	MOCK_IOMMUPT_AMDV1,
+};
+
+/* These values are true for MOCK_IOMMUPT_DEFAULT */
 enum {
 	MOCK_APERTURE_START = 1UL << 24,
 	MOCK_APERTURE_LAST = (1UL << 31) - 1,
+	MOCK_PAGE_SIZE = 2048,
+	MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE,
 };
 
 enum {
@@ -52,7 +61,6 @@ enum {
 
 enum {
 	MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
-	MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1,
 	MOCK_FLAGS_DEVICE_PASID = 1 << 2,
 };
 
@@ -205,6 +213,7 @@ struct iommu_test_hw_info {
  */
 struct iommu_hwpt_selftest {
 	__u32 iotlb;
+	__u32 pagetable_type;
 };
 
 /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 5661d2da2b67..f6379f387d3b 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -12,6 +12,8 @@
 #include <linux/slab.h>
 #include <linux/xarray.h>
 #include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
+#include "../iommu-pages.h"
 
 #include "../iommu-priv.h"
 #include "io_pagetable.h"
@@ -41,21 +43,6 @@ static DEFINE_IDA(mock_dev_ida);
 
 enum {
 	MOCK_DIRTY_TRACK = 1,
-	MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
-	MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE,
-
-	/*
-	 * Like a real page table alignment requires the low bits of the address
-	 * to be zero. xarray also requires the high bit to be zero, so we store
-	 * the pfns shifted. The upper bits are used for metadata.
-	 */
-	MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
-
-	_MOCK_PFN_START = MOCK_PFN_MASK + 1,
-	MOCK_PFN_START_IOVA = _MOCK_PFN_START,
-	MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
-	MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
-	MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2,
 };
 
 static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain);
@@ -124,10 +111,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
 }
 
 struct mock_iommu_domain {
+	union {
+		struct iommu_domain domain;
+		struct pt_iommu iommu;
+		struct pt_iommu_amdv1 amdv1;
+	};
 	unsigned long flags;
-	struct iommu_domain domain;
-	struct xarray pfns;
 };
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain);
 
 static inline struct mock_iommu_domain *
 to_mock_domain(struct iommu_domain *domain)
@@ -344,74 +336,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
 	return 0;
 }
 
-static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
-				      unsigned long iova, size_t page_size,
-				      unsigned long flags)
-{
-	unsigned long cur, end = iova + page_size - 1;
-	bool dirty = false;
-	void *ent, *old;
-
-	for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
-		ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
-		if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
-			continue;
-
-		dirty = true;
-		/* Clear dirty */
-		if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
-			unsigned long val;
-
-			val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
-			old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
-				       xa_mk_value(val), GFP_KERNEL);
-			WARN_ON_ONCE(ent != old);
-		}
-	}
-
-	return dirty;
-}
-
-static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
-					    unsigned long iova, size_t size,
-					    unsigned long flags,
-					    struct iommu_dirty_bitmap *dirty)
-{
-	struct mock_iommu_domain *mock = to_mock_domain(domain);
-	unsigned long end = iova + size;
-	void *ent;
-
-	if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
-		return -EINVAL;
-
-	do {
-		unsigned long pgsize = MOCK_IO_PAGE_SIZE;
-		unsigned long head;
-
-		ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
-		if (!ent) {
-			iova += pgsize;
-			continue;
-		}
-
-		if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
-			pgsize = MOCK_HUGE_PAGE_SIZE;
-		head = iova & ~(pgsize - 1);
-
-		/* Clear dirty */
-		if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
-			iommu_dirty_bitmap_record(dirty, iova, pgsize);
-		iova += pgsize;
-	} while (iova < end);
-
-	return 0;
-}
-
-static const struct iommu_dirty_ops dirty_ops = {
-	.set_dirty_tracking = mock_domain_set_dirty_tracking,
-	.read_and_clear_dirty = mock_domain_read_and_clear_dirty,
-};
-
 static struct mock_iommu_domain_nested *
 __mock_domain_alloc_nested(const struct iommu_user_data *user_data)
 {
@@ -446,7 +370,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
 
 	if (flags & ~IOMMU_HWPT_ALLOC_PASID)
 		return ERR_PTR(-EOPNOTSUPP);
-	if (!parent || parent->ops != mock_ops.default_domain_ops)
+	if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING))
 		return ERR_PTR(-EINVAL);
 
 	mock_parent = to_mock_domain(parent);
@@ -459,159 +383,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
 	return &mock_nested->domain;
 }
 
-static struct iommu_domain *
-mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
-			       const struct iommu_user_data *user_data)
-{
-	bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
-	const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
-				 IOMMU_HWPT_ALLOC_NEST_PARENT |
-				 IOMMU_HWPT_ALLOC_PASID;
-	struct mock_dev *mdev = to_mock_dev(dev);
-	bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
-	struct mock_iommu_domain *mock;
-
-	if (user_data)
-		return ERR_PTR(-EOPNOTSUPP);
-	if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	mock = kzalloc(sizeof(*mock), GFP_KERNEL);
-	if (!mock)
-		return ERR_PTR(-ENOMEM);
-	mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
-	mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
-	mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
-	if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
-		mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
-	mock->domain.ops = mock_ops.default_domain_ops;
-	mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
-	xa_init(&mock->pfns);
-
-	if (has_dirty_flag)
-		mock->domain.dirty_ops = &dirty_ops;
-	return &mock->domain;
-}
-
 static void mock_domain_free(struct iommu_domain *domain)
 {
 	struct mock_iommu_domain *mock = to_mock_domain(domain);
 
-	WARN_ON(!xa_empty(&mock->pfns));
+	pt_iommu_deinit(&mock->iommu);
 	kfree(mock);
 }
 
-static int mock_domain_map_pages(struct iommu_domain *domain,
-				 unsigned long iova, phys_addr_t paddr,
-				 size_t pgsize, size_t pgcount, int prot,
-				 gfp_t gfp, size_t *mapped)
+static void mock_iotlb_sync(struct iommu_domain *domain,
+				struct iommu_iotlb_gather *gather)
 {
-	struct mock_iommu_domain *mock = to_mock_domain(domain);
-	unsigned long flags = MOCK_PFN_START_IOVA;
-	unsigned long start_iova = iova;
+	iommu_put_pages_list(&gather->freelist);
+}
 
-	/*
-	 * xarray does not reliably work with fault injection because it does a
-	 * retry allocation, so put our own failure point.
-	 */
-	if (iommufd_should_fail())
-		return -ENOENT;
+static const struct iommu_domain_ops amdv1_mock_ops = {
+	IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+	.free = mock_domain_free,
+	.attach_dev = mock_domain_nop_attach,
+	.set_dev_pasid = mock_domain_set_dev_pasid_nop,
+	.iotlb_sync = &mock_iotlb_sync,
+};
 
-	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
-	WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
-	for (; pgcount; pgcount--) {
-		size_t cur;
+static const struct iommu_domain_ops amdv1_mock_huge_ops = {
+	IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+	.free = mock_domain_free,
+	.attach_dev = mock_domain_nop_attach,
+	.set_dev_pasid = mock_domain_set_dev_pasid_nop,
+	.iotlb_sync = &mock_iotlb_sync,
+};
+#undef pt_iommu_amdv1_mock_map_pages
 
-		for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
-			void *old;
+static const struct iommu_dirty_ops amdv1_mock_dirty_ops = {
+	IOMMU_PT_DIRTY_OPS(amdv1_mock),
+	.set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
 
-			if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
-				flags = MOCK_PFN_LAST_IOVA;
-			if (pgsize != MOCK_IO_PAGE_SIZE) {
-				flags |= MOCK_PFN_HUGE_IOVA;
-			}
-			old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
-				       xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
-						   flags),
-				       gfp);
-			if (xa_is_err(old)) {
-				for (; start_iova != iova;
-				     start_iova += MOCK_IO_PAGE_SIZE)
-					xa_erase(&mock->pfns,
-						 start_iova /
-							 MOCK_IO_PAGE_SIZE);
-				return xa_err(old);
-			}
-			WARN_ON(old);
-			iova += MOCK_IO_PAGE_SIZE;
-			paddr += MOCK_IO_PAGE_SIZE;
-			*mapped += MOCK_IO_PAGE_SIZE;
-			flags = 0;
-		}
-	}
-	return 0;
-}
+static const struct iommu_domain_ops amdv1_ops = {
+	IOMMU_PT_DOMAIN_OPS(amdv1),
+	.free = mock_domain_free,
+	.attach_dev = mock_domain_nop_attach,
+	.set_dev_pasid = mock_domain_set_dev_pasid_nop,
+	.iotlb_sync = &mock_iotlb_sync,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+	IOMMU_PT_DIRTY_OPS(amdv1),
+	.set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
 
-static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
-				      unsigned long iova, size_t pgsize,
-				      size_t pgcount,
-				      struct iommu_iotlb_gather *iotlb_gather)
+static struct mock_iommu_domain *
+mock_domain_alloc_pgtable(struct device *dev,
+			  const struct iommu_hwpt_selftest *user_cfg, u32 flags)
 {
-	struct mock_iommu_domain *mock = to_mock_domain(domain);
-	bool first = true;
-	size_t ret = 0;
-	void *ent;
+	struct mock_iommu_domain *mock;
+	int rc;
 
-	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
-	WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+	mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+	if (!mock)
+		return ERR_PTR(-ENOMEM);
+	mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
 
-	for (; pgcount; pgcount--) {
-		size_t cur;
+	mock->amdv1.iommu.nid = NUMA_NO_NODE;
+
+	switch (user_cfg->pagetable_type) {
+	case MOCK_IOMMUPT_DEFAULT:
+	case MOCK_IOMMUPT_HUGE: {
+		struct pt_iommu_amdv1_cfg cfg = {};
+
+		/* The mock version has a 2k page size */
+		cfg.common.hw_max_vasz_lg2 = 56;
+		cfg.common.hw_max_oasz_lg2 = 51;
+		cfg.starting_level = 2;
+		if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+			mock->domain.ops = &amdv1_mock_huge_ops;
+		else
+			mock->domain.ops = &amdv1_mock_ops;
+		rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL);
+		if (rc)
+			goto err_free;
+
+		/*
+		 * In huge mode userspace should only provide huge pages, we
+		 * have to include PAGE_SIZE for the domain to be accepted by
+		 * iommufd.
+		 */
+		if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+			mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE |
+						     PAGE_SIZE;
+		if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+			mock->domain.dirty_ops = &amdv1_mock_dirty_ops;
+		break;
+	}
 
-		for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
-			ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+	case MOCK_IOMMUPT_AMDV1: {
+		struct pt_iommu_amdv1_cfg cfg = {};
+
+		cfg.common.hw_max_vasz_lg2 = 64;
+		cfg.common.hw_max_oasz_lg2 = 52;
+		cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+				      BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+				      BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+		cfg.starting_level = 2;
+		mock->domain.ops = &amdv1_ops;
+		rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL);
+		if (rc)
+			goto err_free;
+		if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+			mock->domain.dirty_ops = &amdv1_dirty_ops;
+		break;
+	}
+	default:
+		rc = -EOPNOTSUPP;
+		goto err_free;
+	}
 
-			/*
-			 * iommufd generates unmaps that must be a strict
-			 * superset of the map's performend So every
-			 * starting/ending IOVA should have been an iova passed
-			 * to map.
-			 *
-			 * This simple logic doesn't work when the HUGE_PAGE is
-			 * turned on since the core code will automatically
-			 * switch between the two page sizes creating a break in
-			 * the unmap calls. The break can land in the middle of
-			 * contiguous IOVA.
-			 */
-			if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
-				if (first) {
-					WARN_ON(ent && !(xa_to_value(ent) &
-							 MOCK_PFN_START_IOVA));
-					first = false;
-				}
-				if (pgcount == 1 &&
-				    cur + MOCK_IO_PAGE_SIZE == pgsize)
-					WARN_ON(ent && !(xa_to_value(ent) &
-							 MOCK_PFN_LAST_IOVA));
-			}
+	/*
+	 * Override the real aperture to the MOCK aperture for test purposes.
+	 */
+	if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) {
+		WARN_ON(mock->domain.geometry.aperture_start != 0);
+		WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST);
 
-			iova += MOCK_IO_PAGE_SIZE;
-			ret += MOCK_IO_PAGE_SIZE;
-		}
+		mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+		mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
 	}
-	return ret;
+
+	return mock;
+err_free:
+	kfree(mock);
+	return ERR_PTR(rc);
 }
 
-static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
-					    dma_addr_t iova)
+static struct iommu_domain *
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+			       const struct iommu_user_data *user_data)
 {
-	struct mock_iommu_domain *mock = to_mock_domain(domain);
-	void *ent;
+	bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+	const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+				 IOMMU_HWPT_ALLOC_NEST_PARENT |
+				 IOMMU_HWPT_ALLOC_PASID;
+	struct mock_dev *mdev = to_mock_dev(dev);
+	bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+	struct iommu_hwpt_selftest user_cfg = {};
+	struct mock_iommu_domain *mock;
+	int rc;
+
+	if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST &&
+			  user_data->type != IOMMU_HWPT_DATA_NONE))
+		return ERR_PTR(-EOPNOTSUPP);
 
-	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
-	ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
-	WARN_ON(!ent);
-	return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+	if (user_data) {
+		rc = iommu_copy_struct_from_user(
+			&user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb);
+		if (rc)
+			return ERR_PTR(rc);
+	}
+
+	mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags);
+	if (IS_ERR(mock))
+		return ERR_CAST(mock);
+	return &mock->domain;
 }
 
 static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
@@ -955,15 +890,6 @@ static const struct iommu_ops mock_ops = {
 	.user_pasid_table = true,
 	.get_viommu_size = mock_get_viommu_size,
 	.viommu_init = mock_viommu_init,
-	.default_domain_ops =
-		&(struct iommu_domain_ops){
-			.free = mock_domain_free,
-			.attach_dev = mock_domain_nop_attach,
-			.map_pages = mock_domain_map_pages,
-			.unmap_pages = mock_domain_unmap_pages,
-			.iova_to_phys = mock_domain_iova_to_phys,
-			.set_dev_pasid = mock_domain_set_dev_pasid_nop,
-		},
 };
 
 static void mock_domain_free_nested(struct iommu_domain *domain)
@@ -1047,7 +973,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
 	if (IS_ERR(hwpt))
 		return hwpt;
 	if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
-	    hwpt->domain->ops != mock_ops.default_domain_ops) {
+	    hwpt->domain->owner != &mock_ops) {
 		iommufd_put_object(ucmd->ictx, &hwpt->obj);
 		return ERR_PTR(-EINVAL);
 	}
@@ -1088,7 +1014,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
 		{},
 	};
 	const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY |
-				MOCK_FLAGS_DEVICE_HUGE_IOVA |
 				MOCK_FLAGS_DEVICE_PASID;
 	struct mock_dev *mdev;
 	int rc, i;
@@ -1277,23 +1202,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
 {
 	struct iommufd_hw_pagetable *hwpt;
 	struct mock_iommu_domain *mock;
+	unsigned int page_size;
 	uintptr_t end;
 	int rc;
 
-	if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
-	    (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
-	    check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
-		return -EINVAL;
-
 	hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
 	if (IS_ERR(hwpt))
 		return PTR_ERR(hwpt);
 
-	for (; length; length -= MOCK_IO_PAGE_SIZE) {
+	page_size = 1 << __ffs(mock->domain.pgsize_bitmap);
+	if (iova % page_size || length % page_size ||
+	    (uintptr_t)uptr % page_size ||
+	    check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+		return -EINVAL;
+
+	for (; length; length -= page_size) {
 		struct page *pages[1];
+		phys_addr_t io_phys;
 		unsigned long pfn;
 		long npages;
-		void *ent;
 
 		npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
 					     pages);
@@ -1308,15 +1235,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
 		pfn = page_to_pfn(pages[0]);
 		put_page(pages[0]);
 
-		ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
-		if (!ent ||
-		    (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
-			    pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+		io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+		if (io_phys !=
+		    pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
 			rc = -EINVAL;
 			goto out_put;
 		}
-		iova += MOCK_IO_PAGE_SIZE;
-		uptr += MOCK_IO_PAGE_SIZE;
+		iova += page_size;
+		uptr += page_size;
 	}
 	rc = 0;
 
@@ -1795,7 +1721,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
 	if (IS_ERR(hwpt))
 		return PTR_ERR(hwpt);
 
-	if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+	if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) {
 		rc = -EINVAL;
 		goto out_put;
 	}
@@ -1814,22 +1740,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
 	}
 
 	for (i = 0; i < max; i++) {
-		unsigned long cur = iova + i * page_size;
-		void *ent, *old;
-
 		if (!test_bit(i, (unsigned long *)tmp))
 			continue;
-
-		ent = xa_load(&mock->pfns, cur / page_size);
-		if (ent) {
-			unsigned long val;
-
-			val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
-			old = xa_store(&mock->pfns, cur / page_size,
-				       xa_mk_value(val), GFP_KERNEL);
-			WARN_ON_ONCE(ent != old);
-			count++;
-		}
+		mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size);
+		count++;
 	}
 
 	cmd->dirty.out_nr_dirty = count;
@@ -2202,3 +2116,5 @@ void iommufd_test_exit(void)
 	platform_device_unregister(selftest_iommu_dev);
 	debugfs_remove_recursive(dbgfs_root);
 }
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 848a5fb76272..f2a763aba088 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -73,6 +73,18 @@ struct pt_iommu_info {
 };
 
 struct pt_iommu_ops {
+	/**
+	 * @set_dirty: Make the iova write dirty
+	 * @iommu_table: Table to manipulate
+	 * @iova: IO virtual address to start
+	 *
+	 * This is only used by iommufd testing. It makes the iova dirty so that
+	 * read_and_clear_dirty() will see it as dirty. Unlike all the other ops
+	 * this one is safe to call without holding any locking. It may return
+	 * -EAGAIN if there is a race.
+	 */
+	int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova);
+
 	/**
 	 * @get_info: Return the pt_iommu_info structure
 	 * @iommu_table: Table to query
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 3eebf5e3b974..595b0a3ead64 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -13,9 +13,6 @@
 
 static unsigned long HUGEPAGE_SIZE;
 
-#define MOCK_PAGE_SIZE (PAGE_SIZE / 2)
-#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE)
-
 static unsigned long get_huge_page_size(void)
 {
 	char buf[80];
@@ -2058,6 +2055,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking)
 
 FIXTURE_SETUP(iommufd_dirty_tracking)
 {
+	struct iommu_option cmd = {
+		.size = sizeof(cmd),
+		.option_id = IOMMU_OPTION_HUGE_PAGES,
+		.op = IOMMU_OPTION_OP_SET,
+		.val64 = 0,
+	};
 	size_t mmap_buffer_size;
 	unsigned long size;
 	int mmap_flags;
@@ -2066,7 +2069,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
 
 	if (variant->buffer_size < MOCK_PAGE_SIZE) {
 		SKIP(return,
-		     "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu",
+		     "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u",
 		     variant->buffer_size, MOCK_PAGE_SIZE);
 	}
 
@@ -2114,16 +2117,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
 	assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
 
 	test_ioctl_ioas_alloc(&self->ioas_id);
-	/* Enable 1M mock IOMMU hugepages */
-	if (variant->hugepages) {
-		test_cmd_mock_domain_flags(self->ioas_id,
-					   MOCK_FLAGS_DEVICE_HUGE_IOVA,
-					   &self->stdev_id, &self->hwpt_id,
-					   &self->idev_id);
-	} else {
-		test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
-				     &self->hwpt_id, &self->idev_id);
-	}
+
+	/*
+	 * For dirty testing it is important that the page size fed into
+	 * the iommu page tables matches the size the dirty logic
+	 * expects, or set_dirty can touch too much stuff.
+	 */
+	cmd.object_id = self->ioas_id;
+	if (!variant->hugepages)
+		ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+	test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id,
+			     &self->idev_id);
 }
 
 FIXTURE_TEARDOWN(iommufd_dirty_tracking)
@@ -2248,18 +2253,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability)
 TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
 {
 	uint32_t page_size = MOCK_PAGE_SIZE;
+	uint32_t ioas_id = self->ioas_id;
 	uint32_t hwpt_id;
-	uint32_t ioas_id;
 
 	if (variant->hugepages)
 		page_size = MOCK_HUGE_PAGE_SIZE;
 
-	test_ioctl_ioas_alloc(&ioas_id);
 	test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
 				     variant->buffer_size, MOCK_APERTURE_START);
 
-	test_cmd_hwpt_alloc(self->idev_id, ioas_id,
-			    IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+	if (variant->hugepages)
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_HUGE, &hwpt_id);
+	else
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_DEFAULT, &hwpt_id);
 
 	test_cmd_set_dirty_tracking(hwpt_id, true);
 
@@ -2285,18 +2295,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
 TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
 {
 	uint32_t page_size = MOCK_PAGE_SIZE;
+	uint32_t ioas_id = self->ioas_id;
 	uint32_t hwpt_id;
-	uint32_t ioas_id;
 
 	if (variant->hugepages)
 		page_size = MOCK_HUGE_PAGE_SIZE;
 
-	test_ioctl_ioas_alloc(&ioas_id);
 	test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
 				     variant->buffer_size, MOCK_APERTURE_START);
 
-	test_cmd_hwpt_alloc(self->idev_id, ioas_id,
-			    IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+
+	if (variant->hugepages)
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_HUGE, &hwpt_id);
+	else
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_DEFAULT, &hwpt_id);
 
 	test_cmd_set_dirty_tracking(hwpt_id, true);
 
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 772ca1db6e59..08e529fde1cc 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i
 	ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags,   \
 					  hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \
 					  0))
+#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \
+				    hwpt_id)                               \
+	({                                                                 \
+		struct iommu_hwpt_selftest user_cfg = {                    \
+			.pagetable_type = iommupt_type                     \
+		};                                                         \
+                                                                           \
+		ASSERT_EQ(0, _test_cmd_hwpt_alloc(                         \
+				     self->fd, device_id, pt_id, 0, flags, \
+				     hwpt_id, IOMMU_HWPT_DATA_SELFTEST,    \
+				     &user_cfg, sizeof(user_cfg)));        \
+	})
 #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id)   \
 	EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc(                      \
 				     self->fd, device_id, pt_id, 0, flags, \
-- 
cgit v1.2.3


From aef5de756ea871ab44e3a1a87be6c944e6587c51 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:10 -0400
Subject: iommupt: Add the x86 64 bit page table format

This is used by x86 CPUs and can be used in AMD/VT-d x86 IOMMUs. When an
x86 IOMMU is running SVA the MM will be using this format.

This implementation follows the AMD v2 io-pgtable version.

There is nothing remarkable here, the format can have 4 or 5 levels and
limited support for different page sizes. No contiguous pages support.

x86 uses a sign extension mechanism where the top bits of the VA must
match the sign bit. The core code supports this through
PT_FEAT_SIGN_EXTEND which creates an upper and lower VA range. All the
new operations will work correctly in both spaces, however currently there
is no way to report the upper space to other layers. Future patches can
improve that.

In principle this can support 3 page table levels matching the 32 bit PAE
table format, but no iommu driver needs this. The focus is on the modern
64 bit 4 and 5 level formats.

Comparing the performance of several operations to the existing version:

iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     71,61    ,      66,58      , -13.13
     2^21,     66,60    ,      61,55      , -10.10
     2^30,     59,56    ,      56,54      ,  -3.03
 256*2^12,    392,1360  ,     345,1289    ,  73.73
 256*2^21,    383,1159  ,     335,1145    ,  70.70
 256*2^30,    378,965   ,     331,892     ,  62.62

iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     77,71    ,      73,68      ,  -7.07
     2^21,     76,70    ,      70,66      ,  -6.06
     2^30,     69,66    ,      66,63      ,  -4.04
 256*2^12,    225,899   ,     210,870     ,  75.75
 256*2^21,    262,722   ,     248,710     ,  65.65
 256*2^30,    251,643   ,     244,634     ,  61.61

The small -ve values in the iommu_unmap() are due to the core code calling
iommu_pgsize() before invoking the domain op. This is unnecessary with this
implementation. Future work optimizes this and gets to 2%, 4%, 3%.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/.kunitconfig       |   1 +
 drivers/iommu/generic_pt/Kconfig            |  11 ++
 drivers/iommu/generic_pt/fmt/Makefile       |   2 +
 drivers/iommu/generic_pt/fmt/defs_x86_64.h  |  21 +++
 drivers/iommu/generic_pt/fmt/iommu_x86_64.c |  11 ++
 drivers/iommu/generic_pt/fmt/x86_64.h       | 255 ++++++++++++++++++++++++++++
 include/linux/generic_pt/common.h           |  13 ++
 include/linux/generic_pt/iommu.h            |  11 ++
 8 files changed, 325 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
 create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
index 936c327f0661..2016c5e5ac0f 100644
--- a/drivers/iommu/generic_pt/.kunitconfig
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y
 CONFIG_DEBUG_GENERIC_PT=y
 CONFIG_IOMMU_PT=y
 CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_X86_64=y
 CONFIG_IOMMU_PT_KUNIT_TEST=y
 
 CONFIG_IOMMUFD=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 81652cd9c69f..6dcb771b3c58 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -42,10 +42,21 @@ config IOMMU_PT_AMDV1
 
 	  Selected automatically by an IOMMU driver that uses this format.
 
+config IOMMU_PT_X86_64
+	tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+	depends on !GENERIC_ATOMIC64 # for cmpxchg64
+	help
+	  iommu_domain implementation for the x86 64-bit 4/5 level page table.
+	  It supports 4K/2M/1G page sizes and can decode a sign-extended
+	  portion of the 64-bit IOVA space.
+
+	  Selected automatically by an IOMMU driver that uses this format.
+
 config IOMMU_PT_KUNIT_TEST
 	tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
 	depends on KUNIT
 	depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
+	depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
 	default KUNIT_ALL_TESTS
 	help
 	  Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index f0c22cf5f7be..5a3379107999 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -3,6 +3,8 @@
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
 
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
+
 IOMMU_PT_KUNIT_TEST :=
 define create_format
 obj-$(2) += iommu_$(1).o
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
new file mode 100644
index 000000000000..6f589e1f55d3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
+#define __GENERIC_PT_FMT_DEFS_X86_64_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86_64_pt_write_attrs {
+	u64 descriptor_bits;
+	gfp_t gfp;
+};
+#define pt_write_attrs x86_64_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
new file mode 100644
index 000000000000..5c5960d871a3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86_64
+#define PT_SUPPORTED_FEATURES                                  \
+	(BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
+	 BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) |                    \
+	 BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..18d736d14b2d
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ *   Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ *   Developer's Manual Volume 3
+ *
+ *   Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ *   Technology for Directed I/O Architecture Specification"
+ *
+ *   Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ *   Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and almost implemented here. The
+ * reserved/ignored layout is different and there are functional bit
+ * differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have an upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
+ *
+ * The named levels in the spec map to the pts->level as:
+ *   Table/PTE - 0
+ *   Directory/PDE - 1
+ *   Directory Ptr/PDPTE - 2
+ *   PML4/PML4E - 3
+ *   PML5/PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_X86_64_H
+#define __GENERIC_PT_FMT_X86_64_H
+
+#include "defs_x86_64.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+#include <linux/mem_encrypt.h>
+
+enum {
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+	PT_MAX_VA_ADDRESS_LG2 = 57,
+	PT_ITEM_WORD_SIZE = sizeof(u64),
+	PT_MAX_TOP_LEVEL = 4,
+	PT_GRANULE_LG2SZ = 12,
+	PT_TABLEMEM_LG2SZ = 12,
+
+	/*
+	 * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k
+	 * aligned and is limited by the architected HAW
+	 */
+	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+	X86_64_FMT_P = BIT(0),
+	X86_64_FMT_RW = BIT(1),
+	X86_64_FMT_U = BIT(2),
+	X86_64_FMT_A = BIT(5),
+	X86_64_FMT_D = BIT(6),
+	X86_64_FMT_OA = GENMASK_ULL(51, 12),
+	X86_64_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+	X86_64_FMT_PS = BIT(7),
+};
+
+static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
+{
+	u64 entry = pts->entry;
+
+	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+		entry = __sme_clr(entry);
+	return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+			  PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86_64_pt_table_pa
+
+static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
+{
+	u64 entry = pts->entry;
+
+	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+		entry = __sme_clr(entry);
+	return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+			  PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa x86_64_pt_entry_oa
+
+static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
+{
+	return pts->level <= 2;
+}
+#define pt_can_have_leaf x86_64_pt_can_have_leaf
+
+static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
+{
+	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86_64_pt_num_items_lg2
+
+static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
+{
+	const u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	pts->entry = entry = READ_ONCE(tablep[pts->index]);
+	if (!(entry & X86_64_FMT_P))
+		return PT_ENTRY_EMPTY;
+	if (pts->level == 0 ||
+	    (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
+		return PT_ENTRY_OA;
+	return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86_64_pt_load_entry_raw
+
+static inline void
+x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+			     unsigned int oasz_lg2,
+			     const struct pt_write_attrs *attrs)
+{
+	u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+		return;
+
+	entry = X86_64_FMT_P |
+		FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+		attrs->descriptor_bits;
+	if (pts->level != 0)
+		entry |= X86_64_FMT_PS;
+
+	WRITE_ONCE(tablep[pts->index], entry);
+	pts->entry = entry;
+}
+#define pt_install_leaf_entry x86_64_pt_install_leaf_entry
+
+static inline bool x86_64_pt_install_table(struct pt_state *pts,
+					   pt_oaddr_t table_pa,
+					   const struct pt_write_attrs *attrs)
+{
+	u64 entry;
+
+	entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+		FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+		entry = __sme_set(entry);
+	return pt_table_install64(pts, entry);
+}
+#define pt_install_table x86_64_pt_install_table
+
+static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
+					     struct pt_write_attrs *attrs)
+{
+	attrs->descriptor_bits = pts->entry &
+				 (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+				  X86_64_FMT_D | X86_64_FMT_XD);
+}
+#define pt_attr_from_entry x86_64_pt_attr_from_entry
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86_64
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+	return &container_of(iommu_table, struct pt_iommu_table, iommu)
+			->x86_64_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+	return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
+			->iommu;
+}
+
+static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
+					   struct pt_write_attrs *attrs,
+					   unsigned int iommu_prot)
+{
+	u64 pte;
+
+	pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D;
+	if (iommu_prot & IOMMU_WRITE)
+		pte |= X86_64_FMT_RW;
+
+	/*
+	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+	 * control this. For now if the tables use sme_set then so do the ptes.
+	 */
+	if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+		pte = __sme_set(pte);
+
+	attrs->descriptor_bits = pte;
+	return 0;
+}
+#define pt_iommu_set_prot x86_64_pt_iommu_set_prot
+
+static inline int
+x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
+			 const struct pt_iommu_x86_64_cfg *cfg)
+{
+	struct pt_x86_64 *table = &iommu_table->x86_64_pt;
+
+	if (cfg->common.hw_max_vasz_lg2 < 31 ||
+	    cfg->common.hw_max_vasz_lg2 > 57)
+		return -EINVAL;
+
+	/* Top of 2, 3, 4 */
+	pt_top_set_level(&table->common,
+			 (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2);
+
+	table->common.max_oasz_lg2 =
+		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+	return 0;
+}
+#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init
+
+static inline void
+x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
+			    const struct pt_range *top_range,
+			    struct pt_iommu_x86_64_hw_info *info)
+{
+	info->gcr3_pt = virt_to_phys(top_range->top_table);
+	PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
+	info->levels = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
+	[0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+		.common.hw_max_vasz_lg2 = 48 },
+	[1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+		.common.hw_max_vasz_lg2 = 57 },
+	/* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
+	[2] = { .common.hw_max_vasz_lg2 = 47 },
+	[3] = { .common.hw_max_vasz_lg2 = 56 },
+};
+#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES =  BIT(PT_FEAT_SIGN_EXTEND)};
+#endif
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 21e33489cbf2..96f8a6a7d60e 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -151,4 +151,17 @@ enum {
 	PT_FEAT_AMDV1_FORCE_COHERENCE,
 };
 
+struct pt_x86_64 {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The memory backing the tables is encrypted. Use __sme_set() to adjust
+	 * the page table pointers in the tree. This only works with
+	 * CONFIG_AMD_MEM_ENCRYPT.
+	 */
+	PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+};
+
 #endif
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index f2a763aba088..fde7ccf007c5 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -255,6 +255,17 @@ IOMMU_FORMAT(amdv1, amdpt);
 struct pt_iommu_amdv1_mock_hw_info;
 IOMMU_PROTOTYPES(amdv1_mock);
 
+struct pt_iommu_x86_64_cfg {
+	struct pt_iommu_cfg common;
+};
+
+struct pt_iommu_x86_64_hw_info {
+	u64 gcr3_pt;
+	u8 levels;
+};
+
+IOMMU_FORMAT(x86_64, x86_64_pt);
+
 #undef IOMMU_PROTOTYPES
 #undef IOMMU_FORMAT
 #endif
-- 
cgit v1.2.3


From 2fdf6db436e3071a8e4c9c3e67674448a13860d4 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:12 -0400
Subject: iommu/amd: Remove AMD io_pgtable support

None of this is used anymore, delete it.

Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/amd/Makefile          |   2 +-
 drivers/iommu/amd/amd_iommu_types.h |  98 ------
 drivers/iommu/amd/io_pgtable.c      | 575 ------------------------------------
 drivers/iommu/amd/io_pgtable_v2.c   | 370 -----------------------
 drivers/iommu/io-pgtable.c          |   4 -
 include/linux/io-pgtable.h          |   2 -
 6 files changed, 1 insertion(+), 1050 deletions(-)
 delete mode 100644 drivers/iommu/amd/io_pgtable.c
 delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c

(limited to 'include')

diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 59c04a67f398..5412a563c697 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o ppr.o pasid.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index d90a285b44eb..4b4a37fad70e 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -18,7 +18,6 @@
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/irqreturn.h>
-#include <linux/io-pgtable.h>
 #include <linux/generic_pt/iommu.h>
 
 /*
@@ -338,76 +337,7 @@
 #define GUEST_PGTABLE_4_LEVEL	0x00
 #define GUEST_PGTABLE_5_LEVEL	0x01
 
-#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \
-				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
-				   (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \
-				 IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k		0
 #define PM_ADDR_MASK		0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \
-				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
-		((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
-		(1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
-		((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address and a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize)		\
-		(((address) | ((pagesize) - 1)) &	\
-		 (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
-	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-/*
- * Takes a page-table level and returns the default page-size for this level
- */
-#define PTE_LEVEL_PAGE_SIZE(level)			\
-	(1ULL << (12 + (9 * (level))))
-
-/*
- * The IOPTE dirty bit
- */
-#define IOMMU_PTE_HD_BIT (6)
-
-/*
- * Bit value definition for I/O PTE fields
- */
-#define IOMMU_PTE_PR	BIT_ULL(0)
-#define IOMMU_PTE_HD	BIT_ULL(IOMMU_PTE_HD_BIT)
-#define IOMMU_PTE_U	BIT_ULL(59)
-#define IOMMU_PTE_FC	BIT_ULL(60)
-#define IOMMU_PTE_IR	BIT_ULL(61)
-#define IOMMU_PTE_IW	BIT_ULL(62)
 
 /*
  * Bit value definition for DTE fields
@@ -437,12 +367,6 @@
 /* DTE[128:179] | DTE[184:191] */
 #define DTE_DATA2_INTR_MASK	~GENMASK_ULL(55, 52)
 
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
-#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
-#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
 #define IOMMU_PROT_MASK 0x03
 #define IOMMU_PROT_IR 0x01
 #define IOMMU_PROT_IW 0x02
@@ -535,19 +459,6 @@ struct amd_irte_ops;
 
 #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED      (1 << 0)
 
-#define io_pgtable_to_data(x) \
-	container_of((x), struct amd_io_pgtable, pgtbl)
-
-#define io_pgtable_ops_to_data(x) \
-	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
-#define io_pgtable_ops_to_domain(x) \
-	container_of(io_pgtable_ops_to_data(x), \
-		     struct protection_domain, iop)
-
-#define io_pgtable_cfg_to_data(x) \
-	container_of((x), struct amd_io_pgtable, pgtbl.cfg)
-
 struct gcr3_tbl_info {
 	u64	*gcr3_tbl;	/* Guest CR3 table */
 	int	glx;		/* Number of levels for GCR3 table */
@@ -555,14 +466,6 @@ struct gcr3_tbl_info {
 	u16	domid;		/* Per device domain ID */
 };
 
-struct amd_io_pgtable {
-	seqcount_t		seqcount;	/* Protects root/mode update */
-	struct io_pgtable	pgtbl;
-	int			mode;
-	u64			*root;
-	u64			*pgd;		/* v2 pgtable pgd pointer */
-};
-
 enum protection_domain_mode {
 	PD_MODE_NONE,
 	PD_MODE_V1,
@@ -597,7 +500,6 @@ struct protection_domain {
 		struct pt_iommu_x86_64 amdv2;
 	};
 	struct list_head dev_list; /* List of all devices in this domain */
-	struct amd_io_pgtable iop;
 	spinlock_t lock;	/* mostly used to lock the page table*/
 	u16 id;			/* the domain id written to the device table */
 	enum protection_domain_mode pd_mode; /* Track page table type */
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
deleted file mode 100644
index f64244938c9a..000000000000
--- a/drivers/iommu/amd/io_pgtable.c
+++ /dev/null
@@ -1,575 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table allocator.
- *
- * Copyright (C) 2020 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#define pr_fmt(fmt)     "AMD-Vi: " fmt
-#define dev_fmt(fmt)    pr_fmt(fmt)
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-#include <linux/seqlock.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-/*
- * Helper function to get the first pte of a large mapping
- */
-static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
-			 unsigned long *count)
-{
-	unsigned long pte_mask, pg_size, cnt;
-	u64 *fpte;
-
-	pg_size  = PTE_PAGE_SIZE(*pte);
-	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
-	pte_mask = ~((cnt << 3) - 1);
-	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);
-
-	if (page_size)
-		*page_size = pg_size;
-
-	if (count)
-		*count = cnt;
-
-	return fpte;
-}
-
-static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
-{
-	u64 *p;
-	int i;
-
-	for (i = 0; i < 512; ++i) {
-		/* PTE present? */
-		if (!IOMMU_PTE_PRESENT(pt[i]))
-			continue;
-
-		/* Large PTE? */
-		if (PM_PTE_LEVEL(pt[i]) == 0 ||
-		    PM_PTE_LEVEL(pt[i]) == 7)
-			continue;
-
-		/*
-		 * Free the next level. No need to look at l1 tables here since
-		 * they can only contain leaf PTEs; just free them directly.
-		 */
-		p = IOMMU_PTE_PAGE(pt[i]);
-		if (lvl > 2)
-			free_pt_lvl(p, freelist, lvl - 1);
-		else
-			iommu_pages_list_add(freelist, p);
-	}
-
-	iommu_pages_list_add(freelist, pt);
-}
-
-static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
-{
-	switch (mode) {
-	case PAGE_MODE_NONE:
-	case PAGE_MODE_7_LEVEL:
-		break;
-	case PAGE_MODE_1_LEVEL:
-		iommu_pages_list_add(freelist, root);
-		break;
-	case PAGE_MODE_2_LEVEL:
-	case PAGE_MODE_3_LEVEL:
-	case PAGE_MODE_4_LEVEL:
-	case PAGE_MODE_5_LEVEL:
-	case PAGE_MODE_6_LEVEL:
-		free_pt_lvl(root, freelist, mode);
-		break;
-	default:
-		BUG();
-	}
-}
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct amd_io_pgtable *pgtable,
-				   unsigned long address,
-				   unsigned int page_size_level,
-				   gfp_t gfp)
-{
-	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-	struct protection_domain *domain =
-		container_of(pgtable, struct protection_domain, iop);
-	unsigned long flags;
-	bool ret = true;
-	u64 *pte;
-
-	pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
-	if (!pte)
-		return false;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
-	    pgtable->mode - 1 >= page_size_level)
-		goto out;
-
-	ret = false;
-	if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
-		goto out;
-
-	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
-
-	write_seqcount_begin(&pgtable->seqcount);
-	pgtable->root  = pte;
-	pgtable->mode += 1;
-	write_seqcount_end(&pgtable->seqcount);
-
-	pte = NULL;
-	ret = true;
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-	iommu_free_pages(pte);
-
-	return ret;
-}
-
-static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
-		      unsigned long address,
-		      unsigned long page_size,
-		      u64 **pte_page,
-		      gfp_t gfp,
-		      bool *updated)
-{
-	unsigned long last_addr = address + (page_size - 1);
-	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-	unsigned int seqcount;
-	int level, end_lvl;
-	u64 *pte, *page;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
-	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
-		/*
-		 * Return an error if there is no memory to update the
-		 * page-table.
-		 */
-		if (!increase_address_space(pgtable, last_addr,
-					    PAGE_SIZE_LEVEL(page_size), gfp))
-			return NULL;
-	}
-
-
-	do {
-		seqcount = read_seqcount_begin(&pgtable->seqcount);
-
-		level   = pgtable->mode - 1;
-		pte     = &pgtable->root[PM_LEVEL_INDEX(level, address)];
-	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-
-	address = PAGE_SIZE_ALIGN(address, page_size);
-	end_lvl = PAGE_SIZE_LEVEL(page_size);
-
-	while (level > end_lvl) {
-		u64 __pte, __npte;
-		int pte_level;
-
-		__pte     = *pte;
-		pte_level = PM_PTE_LEVEL(__pte);
-
-		/*
-		 * If we replace a series of large PTEs, we need
-		 * to tear down all of them.
-		 */
-		if (IOMMU_PTE_PRESENT(__pte) &&
-		    pte_level == PAGE_MODE_7_LEVEL) {
-			unsigned long count, i;
-			u64 *lpte;
-
-			lpte = first_pte_l7(pte, NULL, &count);
-
-			/*
-			 * Unmap the replicated PTEs that still match the
-			 * original large mapping
-			 */
-			for (i = 0; i < count; ++i)
-				cmpxchg64(&lpte[i], __pte, 0ULL);
-
-			*updated = true;
-			continue;
-		}
-
-		if (!IOMMU_PTE_PRESENT(__pte) ||
-		    pte_level == PAGE_MODE_NONE) {
-			page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
-							 SZ_4K);
-
-			if (!page)
-				return NULL;
-
-			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
-			/* pte could have been changed somewhere. */
-			if (!try_cmpxchg64(pte, &__pte, __npte))
-				iommu_free_pages(page);
-			else if (IOMMU_PTE_PRESENT(__pte))
-				*updated = true;
-
-			continue;
-		}
-
-		/* No level skipping support yet */
-		if (pte_level != level)
-			return NULL;
-
-		level -= 1;
-
-		pte = IOMMU_PTE_PAGE(__pte);
-
-		if (pte_page && level == end_lvl)
-			*pte_page = pte;
-
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
-		      unsigned long address,
-		      unsigned long *page_size)
-{
-	int level;
-	unsigned int seqcount;
-	u64 *pte;
-
-	*page_size = 0;
-
-	if (address > PM_LEVEL_SIZE(pgtable->mode))
-		return NULL;
-
-	do {
-		seqcount = read_seqcount_begin(&pgtable->seqcount);
-		level	   =  pgtable->mode - 1;
-		pte	   = &pgtable->root[PM_LEVEL_INDEX(level, address)];
-	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
-
-	while (level > 0) {
-
-		/* Not Present */
-		if (!IOMMU_PTE_PRESENT(*pte))
-			return NULL;
-
-		/* Large PTE */
-		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
-		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
-			break;
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		/* Walk to the next level */
-		pte	   = IOMMU_PTE_PAGE(*pte);
-		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
-		*page_size = PTE_LEVEL_PAGE_SIZE(level);
-	}
-
-	/*
-	 * If we have a series of large PTEs, make
-	 * sure to return a pointer to the first one.
-	 */
-	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
-		pte = first_pte_l7(pte, page_size, NULL);
-
-	return pte;
-}
-
-static void free_clear_pte(u64 *pte, u64 pteval,
-			   struct iommu_pages_list *freelist)
-{
-	u64 *pt;
-	int mode;
-
-	while (!try_cmpxchg64(pte, &pteval, 0))
-		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
-	if (!IOMMU_PTE_PRESENT(pteval))
-		return;
-
-	pt   = IOMMU_PTE_PAGE(pteval);
-	mode = IOMMU_PTE_MODE(pteval);
-
-	free_sub_pt(pt, mode, freelist);
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
-			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
-			      int prot, gfp_t gfp, size_t *mapped)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-	bool updated = false;
-	u64 __pte, *pte;
-	int ret, i, count;
-	size_t size = pgcount << __ffs(pgsize);
-	unsigned long o_iova = iova;
-
-	BUG_ON(!IS_ALIGNED(iova, pgsize));
-	BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
-	ret = -EINVAL;
-	if (!(prot & IOMMU_PROT_MASK))
-		goto out;
-
-	while (pgcount > 0) {
-		count = PAGE_SIZE_PTE_COUNT(pgsize);
-		pte   = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);
-
-		ret = -ENOMEM;
-		if (!pte)
-			goto out;
-
-		for (i = 0; i < count; ++i)
-			free_clear_pte(&pte[i], pte[i], &freelist);
-
-		if (!iommu_pages_list_empty(&freelist))
-			updated = true;
-
-		if (count > 1) {
-			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
-			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-		} else
-			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
-		if (prot & IOMMU_PROT_IR)
-			__pte |= IOMMU_PTE_IR;
-		if (prot & IOMMU_PROT_IW)
-			__pte |= IOMMU_PTE_IW;
-
-		for (i = 0; i < count; ++i)
-			pte[i] = __pte;
-
-		iova  += pgsize;
-		paddr += pgsize;
-		pgcount--;
-		if (mapped)
-			*mapped += pgsize;
-	}
-
-	ret = 0;
-
-out:
-	if (updated) {
-		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
-		unsigned long flags;
-
-		spin_lock_irqsave(&dom->lock, flags);
-		/*
-		 * Flush domain TLB(s) and wait for completion. Any Device-Table
-		 * Updates and flushing already happened in
-		 * increase_address_space().
-		 */
-		amd_iommu_domain_flush_pages(dom, o_iova, size);
-		spin_unlock_irqrestore(&dom->lock, flags);
-	}
-
-	/* Everything flushed out, free pages now */
-	iommu_put_pages_list(&freelist);
-
-	return ret;
-}
-
-static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
-					  unsigned long iova,
-					  size_t pgsize, size_t pgcount,
-					  struct iommu_iotlb_gather *gather)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	unsigned long long unmapped;
-	unsigned long unmap_size;
-	u64 *pte;
-	size_t size = pgcount << __ffs(pgsize);
-
-	BUG_ON(!is_power_of_2(pgsize));
-
-	unmapped = 0;
-
-	while (unmapped < size) {
-		pte = fetch_pte(pgtable, iova, &unmap_size);
-		if (pte) {
-			int i, count;
-
-			count = PAGE_SIZE_PTE_COUNT(unmap_size);
-			for (i = 0; i < count; i++)
-				pte[i] = 0ULL;
-		} else {
-			return unmapped;
-		}
-
-		iova = (iova & ~(unmap_size - 1)) + unmap_size;
-		unmapped += unmap_size;
-	}
-
-	return unmapped;
-}
-
-static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	unsigned long offset_mask, pte_pgsize;
-	u64 *pte, __pte;
-
-	pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
-	if (!pte || !IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	offset_mask = pte_pgsize - 1;
-	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);
-
-	return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
-				     unsigned long flags)
-{
-	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
-	bool dirty = false;
-	int i, count;
-
-	/*
-	 * 2.2.3.2 Host Dirty Support
-	 * When a non-default page size is used , software must OR the
-	 * Dirty bits in all of the replicated host PTEs used to map
-	 * the page. The IOMMU does not guarantee the Dirty bits are
-	 * set in all of the replicated PTEs. Any portion of the page
-	 * may have been written even if the Dirty bit is set in only
-	 * one of the replicated PTEs.
-	 */
-	count = PAGE_SIZE_PTE_COUNT(size);
-	for (i = 0; i < count && test_only; i++) {
-		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
-			dirty = true;
-			break;
-		}
-	}
-
-	for (i = 0; i < count && !test_only; i++) {
-		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
-				       (unsigned long *)&ptep[i])) {
-			dirty = true;
-		}
-	}
-
-	return dirty;
-}
-
-static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
-					 unsigned long iova, size_t size,
-					 unsigned long flags,
-					 struct iommu_dirty_bitmap *dirty)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	unsigned long end = iova + size - 1;
-
-	do {
-		unsigned long pgsize = 0;
-		u64 *ptep, pte;
-
-		ptep = fetch_pte(pgtable, iova, &pgsize);
-		if (ptep)
-			pte = READ_ONCE(*ptep);
-		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
-			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
-			iova += pgsize;
-			continue;
-		}
-
-		/*
-		 * Mark the whole IOVA range as dirty even if only one of
-		 * the replicated PTEs were marked dirty.
-		 */
-		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
-			iommu_dirty_bitmap_record(dirty, iova, pgsize);
-		iova += pgsize;
-	} while (iova < end);
-
-	return 0;
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v1_free_pgtable(struct io_pgtable *iop)
-{
-	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
-	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-
-	if (pgtable->mode == PAGE_MODE_NONE)
-		return;
-
-	/* Page-table is not visible to IOMMU anymore, so free it */
-	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
-	       pgtable->mode > amd_iommu_hpt_level);
-
-	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
-	iommu_put_pages_list(&freelist);
-}
-
-static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-
-	pgtable->root =
-		iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
-	if (!pgtable->root)
-		return NULL;
-	pgtable->mode = PAGE_MODE_3_LEVEL;
-	seqcount_init(&pgtable->seqcount);
-
-	cfg->pgsize_bitmap  = amd_iommu_pgsize_bitmap;
-	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
-	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;
-
-	pgtable->pgtbl.ops.map_pages    = iommu_v1_map_pages;
-	pgtable->pgtbl.ops.unmap_pages  = iommu_v1_unmap_pages;
-	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
-	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
-
-	return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
-	.alloc	= v1_alloc_pgtable,
-	.free	= v1_free_pgtable,
-};
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
deleted file mode 100644
index b47941353ccb..000000000000
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ /dev/null
@@ -1,370 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table v2 allocator.
- *
- * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- * Author: Vasant Hegde <vasant.hegde@amd.com>
- */
-
-#define pr_fmt(fmt)	"AMD-Vi: " fmt
-#define dev_fmt(fmt)	pr_fmt(fmt)
-
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-#define IOMMU_PAGE_PRESENT	BIT_ULL(0)	/* Is present */
-#define IOMMU_PAGE_RW		BIT_ULL(1)	/* Writeable */
-#define IOMMU_PAGE_USER		BIT_ULL(2)	/* Userspace addressable */
-#define IOMMU_PAGE_PWT		BIT_ULL(3)	/* Page write through */
-#define IOMMU_PAGE_PCD		BIT_ULL(4)	/* Page cache disabled */
-#define IOMMU_PAGE_ACCESS	BIT_ULL(5)	/* Was accessed (updated by IOMMU) */
-#define IOMMU_PAGE_DIRTY	BIT_ULL(6)	/* Was written to (updated by IOMMU) */
-#define IOMMU_PAGE_PSE		BIT_ULL(7)	/* Page Size Extensions */
-#define IOMMU_PAGE_NX		BIT_ULL(63)	/* No execute */
-
-#define MAX_PTRS_PER_PAGE	512
-
-#define IOMMU_PAGE_SIZE_2M	BIT_ULL(21)
-#define IOMMU_PAGE_SIZE_1G	BIT_ULL(30)
-
-
-static inline int get_pgtable_level(void)
-{
-	return amd_iommu_gpt_level;
-}
-
-static inline bool is_large_pte(u64 pte)
-{
-	return (pte & IOMMU_PAGE_PSE);
-}
-
-static inline u64 set_pgtable_attr(u64 *page)
-{
-	u64 prot;
-
-	prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
-	prot |= IOMMU_PAGE_ACCESS;
-
-	return (iommu_virt_to_phys(page) | prot);
-}
-
-static inline void *get_pgtable_pte(u64 pte)
-{
-	return iommu_phys_to_virt(pte & PM_ADDR_MASK);
-}
-
-static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
-{
-	u64 pte;
-
-	pte = __sme_set(paddr & PM_ADDR_MASK);
-	pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
-	pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
-	if (prot & IOMMU_PROT_IW)
-		pte |= IOMMU_PAGE_RW;
-
-	/* Large page */
-	if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
-		pte |= IOMMU_PAGE_PSE;
-
-	return pte;
-}
-
-static inline u64 get_alloc_page_size(u64 size)
-{
-	if (size >= IOMMU_PAGE_SIZE_1G)
-		return IOMMU_PAGE_SIZE_1G;
-
-	if (size >= IOMMU_PAGE_SIZE_2M)
-		return IOMMU_PAGE_SIZE_2M;
-
-	return PAGE_SIZE;
-}
-
-static inline int page_size_to_level(u64 pg_size)
-{
-	if (pg_size == IOMMU_PAGE_SIZE_1G)
-		return PAGE_MODE_3_LEVEL;
-	if (pg_size == IOMMU_PAGE_SIZE_2M)
-		return PAGE_MODE_2_LEVEL;
-
-	return PAGE_MODE_1_LEVEL;
-}
-
-static void free_pgtable(u64 *pt, int level)
-{
-	u64 *p;
-	int i;
-
-	for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
-		/* PTE present? */
-		if (!IOMMU_PTE_PRESENT(pt[i]))
-			continue;
-
-		if (is_large_pte(pt[i]))
-			continue;
-
-		/*
-		 * Free the next level. No need to look at l1 tables here since
-		 * they can only contain leaf PTEs; just free them directly.
-		 */
-		p = get_pgtable_pte(pt[i]);
-		if (level > 2)
-			free_pgtable(p, level - 1);
-		else
-			iommu_free_pages(p);
-	}
-
-	iommu_free_pages(pt);
-}
-
-/* Allocate page table */
-static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova,
-			 unsigned long pg_size, gfp_t gfp, bool *updated)
-{
-	u64 *pte, *page;
-	int level, end_level;
-
-	level = get_pgtable_level() - 1;
-	end_level = page_size_to_level(pg_size);
-	pte = &pgd[PM_LEVEL_INDEX(level, iova)];
-	iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
-
-	while (level >= end_level) {
-		u64 __pte, __npte;
-
-		__pte = *pte;
-
-		if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
-			/* Unmap large pte */
-			cmpxchg64(pte, *pte, 0ULL);
-			*updated = true;
-			continue;
-		}
-
-		if (!IOMMU_PTE_PRESENT(__pte)) {
-			page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K);
-			if (!page)
-				return NULL;
-
-			__npte = set_pgtable_attr(page);
-			/* pte could have been changed somewhere. */
-			if (!try_cmpxchg64(pte, &__pte, __npte))
-				iommu_free_pages(page);
-			else if (IOMMU_PTE_PRESENT(__pte))
-				*updated = true;
-
-			continue;
-		}
-
-		level -= 1;
-		pte = get_pgtable_pte(__pte);
-		pte = &pte[PM_LEVEL_INDEX(level, iova)];
-	}
-
-	/* Tear down existing pte entries */
-	if (IOMMU_PTE_PRESENT(*pte)) {
-		u64 *__pte;
-
-		*updated = true;
-		__pte = get_pgtable_pte(*pte);
-		cmpxchg64(pte, *pte, 0ULL);
-		if (pg_size == IOMMU_PAGE_SIZE_1G)
-			free_pgtable(__pte, end_level - 1);
-		else if (pg_size == IOMMU_PAGE_SIZE_2M)
-			iommu_free_pages(__pte);
-	}
-
-	return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address.
- * If there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
-		      unsigned long iova, unsigned long *page_size)
-{
-	u64 *pte;
-	int level;
-
-	level = get_pgtable_level() - 1;
-	pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
-	/* Default page size is 4K */
-	*page_size = PAGE_SIZE;
-
-	while (level) {
-		/* Not present */
-		if (!IOMMU_PTE_PRESENT(*pte))
-			return NULL;
-
-		/* Walk to the next level */
-		pte = get_pgtable_pte(*pte);
-		pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
-
-		/* Large page */
-		if (is_large_pte(*pte)) {
-			if (level == PAGE_MODE_3_LEVEL)
-				*page_size = IOMMU_PAGE_SIZE_1G;
-			else if (level == PAGE_MODE_2_LEVEL)
-				*page_size = IOMMU_PAGE_SIZE_2M;
-			else
-				return NULL;	/* Wrongly set PSE bit in PTE */
-
-			break;
-		}
-
-		level -= 1;
-	}
-
-	return pte;
-}
-
-static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
-			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
-			      int prot, gfp_t gfp, size_t *mapped)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-	u64 *pte;
-	unsigned long map_size;
-	unsigned long mapped_size = 0;
-	unsigned long o_iova = iova;
-	size_t size = pgcount << __ffs(pgsize);
-	int ret = 0;
-	bool updated = false;
-
-	if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
-		return -EINVAL;
-
-	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
-
-	while (mapped_size < size) {
-		map_size = get_alloc_page_size(pgsize);
-		pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd,
-				   iova, map_size, gfp, &updated);
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		*pte = set_pte_attr(paddr, map_size, prot);
-
-		iova += map_size;
-		paddr += map_size;
-		mapped_size += map_size;
-	}
-
-out:
-	if (updated) {
-		struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
-		unsigned long flags;
-
-		spin_lock_irqsave(&pdom->lock, flags);
-		amd_iommu_domain_flush_pages(pdom, o_iova, size);
-		spin_unlock_irqrestore(&pdom->lock, flags);
-	}
-
-	if (mapped)
-		*mapped += mapped_size;
-
-	return ret;
-}
-
-static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
-					  unsigned long iova,
-					  size_t pgsize, size_t pgcount,
-					  struct iommu_iotlb_gather *gather)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-	unsigned long unmap_size;
-	unsigned long unmapped = 0;
-	size_t size = pgcount << __ffs(pgsize);
-	u64 *pte;
-
-	if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
-		return 0;
-
-	while (unmapped < size) {
-		pte = fetch_pte(pgtable, iova, &unmap_size);
-		if (!pte)
-			return unmapped;
-
-		*pte = 0ULL;
-
-		iova = (iova & ~(unmap_size - 1)) + unmap_size;
-		unmapped += unmap_size;
-	}
-
-	return unmapped;
-}
-
-static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-	unsigned long offset_mask, pte_pgsize;
-	u64 *pte, __pte;
-
-	pte = fetch_pte(pgtable, iova, &pte_pgsize);
-	if (!pte || !IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	offset_mask = pte_pgsize - 1;
-	__pte = __sme_clr(*pte & PM_ADDR_MASK);
-
-	return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v2_free_pgtable(struct io_pgtable *iop)
-{
-	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
-
-	if (!pgtable || !pgtable->pgd)
-		return;
-
-	/* Free page table */
-	free_pgtable(pgtable->pgd, get_pgtable_level());
-	pgtable->pgd = NULL;
-}
-
-static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
-	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-	int ias = IOMMU_IN_ADDR_BIT_SIZE;
-
-	pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
-	if (!pgtable->pgd)
-		return NULL;
-
-	if (get_pgtable_level() == PAGE_MODE_5_LEVEL)
-		ias = 57;
-
-	pgtable->pgtbl.ops.map_pages    = iommu_v2_map_pages;
-	pgtable->pgtbl.ops.unmap_pages  = iommu_v2_unmap_pages;
-	pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys;
-
-	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
-	cfg->ias           = ias;
-	cfg->oas           = IOMMU_OUT_ADDR_BIT_SIZE;
-
-	return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
-	.alloc	= v2_alloc_pgtable,
-	.free	= v2_free_pgtable,
-};
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 8841c1487f00..843fec8e8a51 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
 	[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
 #endif
-#ifdef CONFIG_AMD_IOMMU
-	[AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
-	[AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
-#endif
 };
 
 static int check_custom_allocator(enum io_pgtable_fmt fmt,
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 8a823c6f2b4a..7a1516011ccf 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -15,8 +15,6 @@ enum io_pgtable_fmt {
 	ARM_64_LPAE_S2,
 	ARM_V7S,
 	ARM_MALI_LPAE,
-	AMD_IOMMU_V1,
-	AMD_IOMMU_V2,
 	APPLE_DART,
 	APPLE_DART2,
 	IO_PGTABLE_NUM_FMTS,
-- 
cgit v1.2.3


From bc5233c0904eb116a4bd94e10cd3666733216063 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 4 Nov 2025 14:30:13 -0400
Subject: iommupt: Add a kunit test for the IOMMU implementation

This intends to have high coverage of the page table format functions and
the IOMMU implementation itself, exercising the various corner cases.

The kunit tests can be run in the kunit framework, using commands like:

tools/testing/kunit/kunit.py run --build_dir build_kunit_arm64 --arch arm64 --make_options LLVM=-19 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig
tools/testing/kunit/kunit.py run --build_dir build_kunit_uml --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig
tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig
tools/testing/kunit/kunit.py run --build_dir build_kunit_i386 --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig
tools/testing/kunit/kunit.py run --build_dir build_kunit_i386pae --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig --kconfig_add CONFIG_X86_PAE=y

There are several interesting corner cases on the 32 bit platforms that
need checking.

Like the generic tests, these are run on the format's configuration list
using kunit "params". This also checks the core iommu parts of the page
table code as it enters the logic through a mock iommu_domain.

The following are checked:
 - PT_FEAT_DYNAMIC_TOP properly adds levels one by one
 - Every page size can be iommu_map()'d, and mapping creates that size
 - iommu_iova_to_phys() works with every page size
 - Test converting OA -> non present -> OA when the two OAs overlap and
   free table levels
 - Test that unmap stops at holes, unmap doesn't split, and unmap returns
   the right values for partial unmap requests
 - Randomly map/unmap. Checks map with random sizes, that map fails without
   changing anything when it hits a collision, unmap/map with random
   intersections, and full unmap of random sizes. Also checks
   iommu_iova_to_phys() with random sizes
 - Check for memory leaks by monitoring NR_SECONDARY_PAGETABLE

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/fmt/iommu_template.h |   1 +
 drivers/iommu/generic_pt/kunit_iommu.h        |   2 +
 drivers/iommu/generic_pt/kunit_iommu_pt.h     | 487 ++++++++++++++++++++++++++
 include/linux/irqchip/riscv-imsic.h           |   3 +-
 4 files changed, 491 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
index 11e85106ae30..d28e86abdf2e 100644
--- a/drivers/iommu/generic_pt/fmt/iommu_template.h
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -44,4 +44,5 @@
  * which means we are building the kunit modle.
  */
 #include "../kunit_generic_pt.h"
+#include "../kunit_iommu_pt.h"
 #endif
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
index 28ec313f151e..d541235632aa 100644
--- a/drivers/iommu/generic_pt/kunit_iommu.h
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -71,6 +71,8 @@ struct kunit_iommu_priv {
 	unsigned int largest_pgsz_lg2;
 	pt_oaddr_t test_oa;
 	pt_vaddr_t safe_pgsize_bitmap;
+	unsigned long orig_nr_secondary_pagetable;
+
 };
 PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
 
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
new file mode 100644
index 000000000000..e8a63c8ea850
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+		   pt_vaddr_t len);
+
+struct count_valids {
+	u64 per_size[PT_VADDR_MAX_LG2];
+};
+
+static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
+			  struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct count_valids *valids = arg;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			pt_descend(&pts, arg, __count_valids);
+			continue;
+		}
+		if (pts.type == PT_ENTRY_OA) {
+			valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
+			continue;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Number of valid table entries. This counts contiguous entries as a single
+ * valid.
+ */
+static unsigned int count_valids(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range range = pt_top_range(priv->common);
+	struct count_valids valids = {};
+	u64 total = 0;
+	unsigned int i;
+
+	KUNIT_ASSERT_NO_ERRNO(test,
+			      pt_walk_range(&range, __count_valids, &valids));
+
+	for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
+		total += valids.per_size[i];
+	return total;
+}
+
+/* Only a single page size is present, count the number of valid entries */
+static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range range = pt_top_range(priv->common);
+	struct count_valids valids = {};
+	u64 total = 0;
+	unsigned int i;
+
+	KUNIT_ASSERT_NO_ERRNO(test,
+			      pt_walk_range(&range, __count_valids, &valids));
+
+	for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
+		if ((1ULL << i) == pgsz)
+			total = valids.per_size[i];
+		else
+			KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
+	}
+	return total;
+}
+
+static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	size_t ret;
+
+	ret = iommu_unmap(&priv->domain, va, len);
+	KUNIT_ASSERT_EQ(test, ret, len);
+}
+
+static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+		       pt_vaddr_t len)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
+	pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);
+
+	for (; pfn != end_pfn; pfn++) {
+		phys_addr_t res = iommu_iova_to_phys(&priv->domain,
+						     pfn * priv->smallest_pgsz);
+
+		KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
+		if (res != pa)
+			break;
+		pa += priv->smallest_pgsz;
+	}
+}
+
+static void test_increase_level(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_common *common = priv->common;
+
+	if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+		kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");
+
+	if (IS_32BIT)
+		kunit_skip(test, "Unable to test on 32bit");
+
+	KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
+			pt_top_range(common).max_vasz_lg2);
+
+	/* Add every possible level to the max */
+	while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
+		struct pt_range top_range = pt_top_range(common);
+
+		if (top_range.va == 0)
+			do_map(test, top_range.last_va + 1, 0,
+			       priv->smallest_pgsz);
+		else
+			do_map(test, top_range.va - priv->smallest_pgsz, 0,
+			       priv->smallest_pgsz);
+
+		KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
+				top_range.top_level + 1);
+		KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
+				pt_top_range(common).max_vasz_lg2);
+	}
+}
+
+static void test_map_simple(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range range = pt_top_range(priv->common);
+	struct count_valids valids = {};
+	pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+	unsigned int pgsz_lg2;
+	pt_vaddr_t cur_va;
+
+	/* Map every reported page size */
+	cur_va = range.va + priv->smallest_pgsz * 256;
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+		u64 len = log2_to_int(pgsz_lg2);
+
+		if (!(pgsize_bitmap & len))
+			continue;
+
+		cur_va = ALIGN(cur_va, len);
+		do_map(test, cur_va, paddr, len);
+		if (len <= SZ_2G)
+			check_iova(test, cur_va, paddr, len);
+		cur_va += len;
+	}
+
+	/* The read interface reports that every page size was created */
+	range = pt_top_range(priv->common);
+	KUNIT_ASSERT_NO_ERRNO(test,
+			      pt_walk_range(&range, __count_valids, &valids));
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		if (pgsize_bitmap & (1ULL << pgsz_lg2))
+			KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
+		else
+			KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
+	}
+
+	/* Unmap works */
+	range = pt_top_range(priv->common);
+	cur_va = range.va + priv->smallest_pgsz * 256;
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		u64 len = log2_to_int(pgsz_lg2);
+
+		if (!(pgsize_bitmap & len))
+			continue;
+		cur_va = ALIGN(cur_va, len);
+		do_unmap(test, cur_va, len);
+		cur_va += len;
+	}
+	KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+}
+
+/*
+ * Test to convert a table pointer into an OA by mapping something small,
+ * unmapping it so as to leave behind a table pointer, then mapping something
+ * larger that will convert the table into an OA.
+ */
+static void test_map_table_to_oa(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	pt_vaddr_t limited_pgbitmap =
+		priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
+	struct pt_range range = pt_top_range(priv->common);
+	unsigned int pgsz_lg2;
+	pt_vaddr_t max_pgsize;
+	pt_vaddr_t cur_va;
+
+	max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
+	KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);
+
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+		u64 len = log2_to_int(pgsz_lg2);
+		pt_vaddr_t offset;
+
+		if (!(priv->info.pgsize_bitmap & len))
+			continue;
+		if (len > max_pgsize)
+			break;
+
+		cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
+			       max_pgsize);
+		for (offset = 0; offset != max_pgsize; offset += len)
+			do_map(test, cur_va + offset, paddr + offset, len);
+		check_iova(test, cur_va, paddr, max_pgsize);
+		KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
+				log2_div(max_pgsize, pgsz_lg2));
+
+		if (len == max_pgsize) {
+			do_unmap(test, cur_va, max_pgsize);
+		} else {
+			do_unmap(test, cur_va, max_pgsize / 2);
+			for (offset = max_pgsize / 2; offset != max_pgsize;
+			     offset += len)
+				do_unmap(test, cur_va + offset, len);
+		}
+
+		KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+	}
+}
+
+/*
+ * Test unmapping a small page at the start of a large page. This always unmaps
+ * the large page.
+ */
+static void test_unmap_split(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range top_range = pt_top_range(priv->common);
+	pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+	unsigned int pgsz_lg2;
+	unsigned int count = 0;
+
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+		unsigned int next_pgsz_lg2;
+
+		if (!(pgsize_bitmap & base_len))
+			continue;
+
+		for (next_pgsz_lg2 = pgsz_lg2 + 1;
+		     next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+			pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+			pt_vaddr_t vaddr = top_range.va;
+			pt_oaddr_t paddr = 0;
+			size_t gnmapped;
+
+			if (!(pgsize_bitmap & next_len))
+				continue;
+
+			do_map(test, vaddr, paddr, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+			/* Make sure unmap doesn't keep going */
+			do_map(test, vaddr, paddr, next_len);
+			do_map(test, vaddr + next_len, paddr, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+					       next_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+			count++;
+		}
+	}
+
+	if (count == 0)
+		kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+			     pt_vaddr_t start, pt_vaddr_t last)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	MA_STATE(mas, mt, start, last);
+	void *entry;
+
+	mtree_lock(mt);
+	mas_for_each(&mas, entry, last) {
+		pt_vaddr_t mas_start = mas.index;
+		pt_vaddr_t len = (mas.last - mas_start) + 1;
+		pt_oaddr_t paddr;
+
+		mas_erase(&mas);
+		mas_pause(&mas);
+		mtree_unlock(mt);
+
+		paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+		check_iova(test, mas_start, paddr, len);
+		do_unmap(test, mas_start, len);
+		mtree_lock(mt);
+	}
+	mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+
+	if (range->last_va - range->va > SZ_1G)
+		range->last_va = range->va + SZ_1G;
+	KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+	if (range->va <= MAPLE_RESERVED_RANGE)
+		range->va =
+			ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can use large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
+ */
+static void test_random_map(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range upper_range = pt_upper_range(priv->common);
+	struct pt_range top_range = pt_top_range(priv->common);
+	struct maple_tree mt;
+	unsigned int iter;
+
+	mt_init(&mt);
+
+	/*
+	 * Shrink the range so randomization is more likely to have
+	 * intersections
+	 */
+	clamp_range(test, &top_range);
+	clamp_range(test, &upper_range);
+
+	for (iter = 0; iter != 1000; iter++) {
+		struct pt_range *range = &top_range;
+		pt_oaddr_t paddr;
+		pt_vaddr_t start;
+		pt_vaddr_t end;
+		int ret;
+
+		if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
+		    ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
+			range = &upper_range;
+
+		start = get_random_u32_below(
+			min(U32_MAX, range->last_va - range->va));
+		end = get_random_u32_below(
+			min(U32_MAX, range->last_va - start));
+
+		start = ALIGN_DOWN(start, priv->smallest_pgsz);
+		end = ALIGN(end, priv->smallest_pgsz);
+		start += range->va;
+		end += start;
+		if (start < range->va || end > range->last_va + 1 ||
+		    start >= end)
+			continue;
+
+		/* Try overmapping to test the failure handling */
+		paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
+		ret = iommu_map(&priv->domain, start, paddr, end - start,
+				IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+		if (ret) {
+			KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
+			unmap_collisions(test, &mt, start, end - 1);
+			do_map(test, start, paddr, end - start);
+		}
+
+		KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
+					 mtree_insert_range(&mt, start, end - 1,
+							    XA_ZERO_ENTRY,
+							    GFP_KERNEL));
+
+		check_iova(test, start, paddr, end - start);
+		if (iter % 100)
+			cond_resched();
+	}
+
+	unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
+	KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+
+	mtree_destroy(&mt);
+}
+
+/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
+static void test_pgsize_boundary(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range top_range = pt_top_range(priv->common);
+
+	if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
+	    priv->smallest_pgsz != SZ_4K)
+		kunit_skip(test, "Format does not have the required range");
+
+	do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
+}
+
+/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
+static void test_mixed(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range top_range = pt_top_range(priv->common);
+	u64 start = 0x3fe400ULL << 12;
+	u64 end = 0x4c0600ULL << 12;
+	pt_vaddr_t len = end - start;
+	pt_oaddr_t oa = start;
+
+	if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+		kunit_skip(test, "range is too small");
+	if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+		kunit_skip(test, "incompatible psize");
+
+	do_map(test, start, oa, len);
+	/* 14 2M, 3 1G, 3 2M */
+	KUNIT_ASSERT_EQ(test, count_valids(test), 20);
+	check_iova(test, start, oa, len);
+}
+
+static struct kunit_case iommu_test_cases[] = {
+	KUNIT_CASE_FMT(test_increase_level),
+	KUNIT_CASE_FMT(test_map_simple),
+	KUNIT_CASE_FMT(test_map_table_to_oa),
+	KUNIT_CASE_FMT(test_unmap_split),
+	KUNIT_CASE_FMT(test_random_map),
+	KUNIT_CASE_FMT(test_pgsize_boundary),
+	KUNIT_CASE_FMT(test_mixed),
+	{},
+};
+
+static int pt_kunit_iommu_init(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv;
+	int ret;
+
+	priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->orig_nr_secondary_pagetable =
+		global_node_page_state(NR_SECONDARY_PAGETABLE);
+	ret = pt_kunit_priv_init(test, priv);
+	if (ret) {
+		kunit_kfree(test, priv);
+		return ret;
+	}
+	test->priv = priv;
+	return 0;
+}
+
+static void pt_kunit_iommu_exit(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+
+	if (!test->priv)
+		return;
+
+	pt_iommu_deinit(priv->iommu);
+	/*
+	 * Look for memory leaks, assumes kunit is running isolated and nothing
+	 * else is using secondary page tables.
+	 */
+	KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
+			global_node_page_state(NR_SECONDARY_PAGETABLE));
+	kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(iommu_suite) = {
+	.name = __stringify(NS(iommu_test)),
+	.init = pt_kunit_iommu_init,
+	.exit = pt_kunit_iommu_exit,
+	.test_cases = iommu_test_cases,
+};
+kunit_test_suites(&NS(iommu_suite));
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kunit for generic page table");
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h
index 7494952c5518..7f3ff5c5ea53 100644
--- a/include/linux/irqchip/riscv-imsic.h
+++ b/include/linux/irqchip/riscv-imsic.h
@@ -10,7 +10,6 @@
 #include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/fwnode.h>
-#include <asm/csr.h>
 
 #define IMSIC_MMIO_PAGE_SHIFT		12
 #define IMSIC_MMIO_PAGE_SZ		BIT(IMSIC_MMIO_PAGE_SHIFT)
@@ -86,7 +85,7 @@ static inline const struct imsic_global_config *imsic_get_global_config(void)
 
 #endif
 
-#ifdef CONFIG_ACPI
+#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC)
 int imsic_platform_acpi_probe(struct fwnode_handle *fwnode);
 struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev);
 #else
-- 
cgit v1.2.3


From aefd967dab6469f5b827b59e50016a760dcc1fbc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 23 Oct 2025 15:22:31 -0300
Subject: iommupt: Use the incoherent start/stop functions for
 PT_FEAT_DMA_INCOHERENT

This is the first step towards supporting an incoherent walker: start and stop
the incoherence around the allocation and freeing of the page table memory.

The iommu_pages API maps this to dma_map/unmap_single(), or arch cache
flushing calls.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/iommu_pt.h    | 89 ++++++++++++++++++++++++++--------
 drivers/iommu/generic_pt/kunit_iommu.h |  1 +
 drivers/iommu/generic_pt/pt_defs.h     |  5 +-
 include/linux/generic_pt/common.h      |  6 +++
 include/linux/generic_pt/iommu.h       |  7 +++
 5 files changed, 88 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 142001f5aa83..2cad07da995a 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -24,6 +24,10 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
 {
 	struct pt_common *common = common_from_iommu(iommu_table);
 
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_stop_incoherent_list(free_list,
+						 iommu_table->iommu_device);
+
 	if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
 	    iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
 		iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
@@ -329,35 +333,55 @@ static int __collect_tables(struct pt_range *range, void *arg,
 	return 0;
 }
 
-static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
-						 uintptr_t top_of_table,
-						 gfp_t gfp)
+enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH};
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+					      size_t lg2sz, gfp_t gfp,
+					      enum alloc_mode mode)
 {
 	struct pt_iommu *iommu_table = iommu_from_common(common);
+	struct pt_table_p *table_mem;
+
+	table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+					      log2_to_int(lg2sz));
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+	    mode == ALLOC_NORMAL) {
+		int ret = iommu_pages_start_incoherent(
+			table_mem, iommu_table->iommu_device);
+		if (ret) {
+			iommu_free_pages(table_mem);
+			return ERR_PTR(ret);
+		}
+	}
+	return table_mem;
+}
 
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+						 uintptr_t top_of_table,
+						 gfp_t gfp,
+						 enum alloc_mode mode)
+{
 	/*
 	 * Top doesn't need the free list or otherwise, so it technically
 	 * doesn't need to use iommu pages. Use the API anyhow as the top is
 	 * usually not smaller than PAGE_SIZE to keep things simple.
 	 */
-	return iommu_alloc_pages_node_sz(
-		iommu_table->nid, gfp,
-		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
+	return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+			    gfp, mode);
 }
 
 /* Allocate an interior table */
 static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
-					     gfp_t gfp)
+					     gfp_t gfp, enum alloc_mode mode)
 {
-	struct pt_iommu *iommu_table =
-		iommu_from_common(parent_pts->range->common);
 	struct pt_state child_pts =
 		pt_init(parent_pts->range, parent_pts->level - 1, NULL);
 
-	return iommu_alloc_pages_node_sz(
-		iommu_table->nid, gfp,
-		log2_to_int(pt_num_items_lg2(&child_pts) +
-			    ilog2(PT_ITEM_WORD_SIZE)));
+	return _table_alloc(parent_pts->range->common,
+			    pt_num_items_lg2(&child_pts) +
+				    ilog2(PT_ITEM_WORD_SIZE),
+			    gfp, mode);
 }
 
 static inline int pt_iommu_new_table(struct pt_state *pts,
@@ -370,13 +394,15 @@ static inline int pt_iommu_new_table(struct pt_state *pts,
 	if (PT_WARN_ON(!pt_can_have_table(pts)))
 		return -ENXIO;
 
-	table_mem = table_alloc(pts, attrs->gfp);
+	table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
 	if (IS_ERR(table_mem))
 		return PTR_ERR(table_mem);
 
 	phys = virt_to_phys(table_mem);
 	if (!pt_install_table(pts, phys, attrs)) {
-		iommu_free_pages(table_mem);
+		iommu_pages_free_incoherent(
+			table_mem,
+			iommu_from_common(pts->range->common)->iommu_device);
 		return -EAGAIN;
 	}
 
@@ -389,7 +415,9 @@ static inline int pt_iommu_new_table(struct pt_state *pts,
 		pt_load_single_entry(pts);
 		if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
 			pt_clear_entries(pts, ilog2(1));
-			iommu_free_pages(table_mem);
+			iommu_pages_free_incoherent(
+				table_mem, iommu_from_common(pts->range->common)
+						   ->iommu_device);
 			return -EINVAL;
 		}
 	}
@@ -615,8 +643,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
 		}
 
 		new_level = pts.level;
-		table_mem = table_alloc_top(
-			common, _pt_top_set(NULL, pts.level), map->attrs.gfp);
+		table_mem =
+			table_alloc_top(common, _pt_top_set(NULL, pts.level),
+					map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH);
 		if (IS_ERR(table_mem))
 			return PTR_ERR(table_mem);
 		iommu_pages_list_add(&free_list, table_mem);
@@ -633,6 +662,16 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
 		new_top_of_table = _pt_top_set(pts.table, pts.level);
 	}
 
+	/*
+	 * Avoid double flushing; flush once after all pt_install_table() calls
+	 */
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+		ret = iommu_pages_start_incoherent_list(
+			&free_list, iommu_table->iommu_device);
+		if (ret)
+			goto err_free;
+	}
+
 	/*
 	 * top_of_table is write locked by the spinlock, but readers can use
 	 * READ_ONCE() to get the value. Since we encode both the level and the
@@ -665,6 +704,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
 	return 0;
 
 err_free:
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_stop_incoherent_list(&free_list,
+						 iommu_table->iommu_device);
 	iommu_put_pages_list(&free_list);
 	return ret;
 }
@@ -988,6 +1030,9 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
 	 * The driver has to already have fenced the HW access to the page table
 	 * and invalidated any caching referring to this memory.
 	 */
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_stop_incoherent_list(&collect.free_list,
+						 iommu_table->iommu_device);
 	iommu_put_pages_list(&collect.free_list);
 }
 
@@ -1078,6 +1123,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
 	memset_after(fmt_table, 0, iommu.domain);
 
 	/* The caller can initialize some of these values */
+	iommu_table->iommu_device = cfg.iommu_device;
 	iommu_table->driver_ops = cfg.driver_ops;
 	iommu_table->nid = cfg.nid;
 }
@@ -1123,11 +1169,16 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table,
 	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
 		return -EINVAL;
 
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+	    WARN_ON(!iommu_table->iommu_device))
+		return -EINVAL;
+
 	ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
 	if (ret)
 		return ret;
 
-	table_mem = table_alloc_top(common, common->top_of_table, gfp);
+	table_mem = table_alloc_top(common, common->top_of_table, gfp,
+				    ALLOC_NORMAL);
 	if (IS_ERR(table_mem))
 		return PTR_ERR(table_mem);
 	pt_top_set(common, table_mem, pt_top_get_level(common));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
index d541235632aa..5d4f269627d5 100644
--- a/drivers/iommu/generic_pt/kunit_iommu.h
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -139,6 +139,7 @@ static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
 
 	priv->fmt_table.iommu.nid = NUMA_NO_NODE;
 	priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+	priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
 	priv->domain.ops = &kunit_pt_ops;
 	ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
 	if (ret) {
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
index 819057de50d8..c25544d72f97 100644
--- a/drivers/iommu/generic_pt/pt_defs.h
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -48,13 +48,16 @@ enum {
 /*
  * When in debug mode we compile all formats with all features. This allows the
  * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
- * FULL_VA.
+ * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have.
  */
 #if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
 enum {
 	PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
 	PT_DEBUG_SUPPORTED_FEATURES =
 		UINT_MAX &
+		~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+			   0 :
+			   BIT(PT_FEAT_DMA_INCOHERENT))) &
 		~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
 			  BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
 			  BIT(PT_FEAT_SIGN_EXTEND)),
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 96f8a6a7d60e..883069e32952 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -85,6 +85,12 @@ enum {
  * position.
  */
 enum pt_features {
+	/**
+	 * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before
+	 * assuming the HW can read it. Otherwise an SMP release is sufficient
+	 * for HW to read it.
+	 */
+	PT_FEAT_DMA_INCOHERENT,
 	/**
 	 * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
 	 * PT_VADDR_MAX.
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index fde7ccf007c5..21132e342a79 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -57,6 +57,13 @@ struct pt_iommu {
 	 * table walkers.
 	 */
 	int nid;
+
+	/**
+	 * @iommu_device: Device pointer used for any DMA cache flushing when
+	 * PT_FEAT_DMA_INCOHERENT is set. This is the iommu device that created
+	 * the page table; it must have dma ops that perform cache flushing.
+	 */
+	struct device *iommu_device;
 };
 
 /**
-- 
cgit v1.2.3


From 5448c1558f60d4051c90938f2878c6fb20e2982a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 23 Oct 2025 15:22:33 -0300
Subject: iommupt: Add the Intel VT-d second stage page table format

The VT-d second stage format is almost the same as the x86 PAE format,
except the bit encodings in the PTE are different and a few new PTE
features, like force coherency, are present.

Among all the formats it is unique in not having a designated present bit.

Comparing the performance of several operations to the existing version:

iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     53,66    ,      50,64      ,  21.21
     2^21,     59,70    ,      56,67      ,  16.16
     2^30,     54,66    ,      52,63      ,  17.17
 256*2^12,    384,524   ,     337,516     ,  34.34
 256*2^21,    387,632   ,     336,626     ,  46.46
 256*2^30,    376,629   ,     323,623     ,  48.48

iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     67,86    ,      63,84      ,  25.25
     2^21,     64,84    ,      59,80      ,  26.26
     2^30,     59,78    ,      56,74      ,  24.24
 256*2^12,    216,335   ,     198,317     ,  37.37
 256*2^21,    245,350   ,     232,344     ,  32.32
 256*2^30,    248,345   ,     226,339     ,  33.33

Cc: Tina Zhang <tina.zhang@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/.kunitconfig      |   1 +
 drivers/iommu/generic_pt/Kconfig           |  11 ++
 drivers/iommu/generic_pt/fmt/Makefile      |   2 +
 drivers/iommu/generic_pt/fmt/defs_vtdss.h  |  21 +++
 drivers/iommu/generic_pt/fmt/iommu_vtdss.c |  10 +
 drivers/iommu/generic_pt/fmt/vtdss.h       | 292 +++++++++++++++++++++++++++++
 include/linux/generic_pt/common.h          |  18 ++
 include/linux/generic_pt/iommu.h           |  11 ++
 8 files changed, 366 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_vtdss.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_vtdss.c
 create mode 100644 drivers/iommu/generic_pt/fmt/vtdss.h

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
index 2016c5e5ac0f..52ac9e661ffd 100644
--- a/drivers/iommu/generic_pt/.kunitconfig
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y
 CONFIG_DEBUG_GENERIC_PT=y
 CONFIG_IOMMU_PT=y
 CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
 CONFIG_IOMMU_PT_X86_64=y
 CONFIG_IOMMU_PT_KUNIT_TEST=y
 
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 6dcb771b3c58..79f65268f312 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1
 
 	  Selected automatically by an IOMMU driver that uses this format.
 
+config IOMMU_PT_VTDSS
+	tristate "IOMMU page table for Intel VT-d Second Stage"
+	depends on !GENERIC_ATOMIC64 # for cmpxchg64
+	help
+	  iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5
+	  level Second Stage page table. It is similar to the X86_64 format with
+	  4K/2M/1G page sizes.
+
+	  Selected automatically by an IOMMU driver that uses this format.
+
 config IOMMU_PT_X86_64
 	tristate "IOMMU page table for x86 64-bit, 4/5 levels"
 	depends on !GENERIC_ATOMIC64 # for cmpxchg64
@@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST
 	depends on KUNIT
 	depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
 	depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+	depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
 	default KUNIT_ALL_TESTS
 	help
 	  Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index 5a3379107999..976b49ec97dc 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -3,6 +3,8 @@
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
 
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
 iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
 
 IOMMU_PT_KUNIT_TEST :=
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+	u64 descriptor_bits;
+	gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES                                            \
+	(BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+	 BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..d9774848eb6f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stage 3/4/5 level page table
+ *
+ * This is described in
+ *   Section "3.7 Second-Stage Translation"
+ *   Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ *   Table/SS-PTE - 0
+ *   Directory/SS-PDE - 1
+ *   Directory Ptr/SS-PDPTE - 2
+ *   PML4/SS-PML4E - 3
+ *   PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+	PT_MAX_VA_ADDRESS_LG2 = 57,
+	PT_ITEM_WORD_SIZE = sizeof(u64),
+	PT_MAX_TOP_LEVEL = 4,
+	PT_GRANULE_LG2SZ = 12,
+	PT_TABLEMEM_LG2SZ = 12,
+
+	/* SSPTPTR is 4k aligned and limited by HAW */
+	PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+	VTDSS_FMT_R = BIT(0),
+	VTDSS_FMT_W = BIT(1),
+	VTDSS_FMT_A = BIT(8),
+	VTDSS_FMT_D = BIT(9),
+	VTDSS_FMT_SNP = BIT(11),
+	VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+	VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+	container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+	return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+			  PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+	return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+			  PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+	return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+	const u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	pts->entry = entry = READ_ONCE(tablep[pts->index]);
+	if (!entry)
+		return PT_ENTRY_EMPTY;
+	if (pts->level == 0 ||
+	    (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+		return PT_ENTRY_OA;
+	return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+			    unsigned int oasz_lg2,
+			    const struct pt_write_attrs *attrs)
+{
+	u64 *tablep = pt_cur_table(pts, u64);
+	u64 entry;
+
+	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+		return;
+
+	entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+		attrs->descriptor_bits;
+	if (pts->level != 0)
+		entry |= VTDSS_FMT_PS;
+
+	WRITE_ONCE(tablep[pts->index], entry);
+	pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+					  pt_oaddr_t table_pa,
+					  const struct pt_write_attrs *attrs)
+{
+	u64 entry;
+
+	entry = VTDSS_FMT_R | VTDSS_FMT_W |
+		FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+	return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+					    struct pt_write_attrs *attrs)
+{
+	attrs->descriptor_bits = pts->entry &
+				 (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+	return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+	WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+	u64 new = pts->entry | VTDSS_FMT_D;
+
+	return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+	return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+	/* Bits marked Ignored in the specification */
+	switch (bitnr) {
+	case 0:
+		return BIT(10);
+	case 1 ... 9:
+		return BIT_ULL((bitnr - 1) + 52);
+	case 10:
+		return BIT_ULL(63);
+	/* Some bits in 9-3 are available in some entries */
+	default:
+		if (__builtin_constant_p(bitnr))
+			BUILD_BUG();
+		else
+			PT_WARN_ON(true);
+		return 0;
+	}
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+	return &container_of(iommu_table, struct pt_iommu_table, iommu)
+			->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+	return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+			->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+					  struct pt_write_attrs *attrs,
+					  unsigned int iommu_prot)
+{
+	u64 pte = 0;
+
+	/*
+	 * VTDSS does not have a present bit, so we tell if any entry is present
+	 * by checking for R or W.
+	 */
+	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+		return -EINVAL;
+
+	if (iommu_prot & IOMMU_READ)
+		pte |= VTDSS_FMT_R;
+	if (iommu_prot & IOMMU_WRITE)
+		pte |= VTDSS_FMT_W;
+	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+		pte |= VTDSS_FMT_SNP;
+
+	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+	    !(iommu_prot & IOMMU_WRITE)) {
+		pr_err_ratelimited(
+			"Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+		return -EINVAL;
+	}
+
+	attrs->descriptor_bits = pte;
+	return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+					  const struct pt_iommu_vtdss_cfg *cfg)
+{
+	struct pt_vtdss *table = &iommu_table->vtdss_pt;
+	unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+
+	if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
+		return -EOPNOTSUPP;
+	else if (vasz_lg2 > 48)
+		pt_top_set_level(&table->common, 4);
+	else if (vasz_lg2 > 39)
+		pt_top_set_level(&table->common, 3);
+	else if (vasz_lg2 > 30)
+		pt_top_set_level(&table->common, 2);
+	else
+		return -EOPNOTSUPP;
+	return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+			   const struct pt_range *top_range,
+			   struct pt_iommu_vtdss_hw_info *info)
+{
+	info->ssptptr = virt_to_phys(top_range->top_table);
+	PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+	/*
+	 * top_level = 2 = 3 level table aw=1
+	 * top_level = 3 = 4 level table aw=2
+	 * top_level = 4 = 5 level table aw=3
+	 */
+	info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+	[0] = { .common.hw_max_vasz_lg2 = 39 },
+	[1] = { .common.hw_max_vasz_lg2 = 48 },
+	[2] = { .common.hw_max_vasz_lg2 = 57 },
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 883069e32952..6a9a1acb5aad 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -157,6 +157,24 @@ enum {
 	PT_FEAT_AMDV1_FORCE_COHERENCE,
 };
 
+struct pt_vtdss {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+	 * snoop. This is set either at creation time or before the first map
+	 * operation.
+	 */
+	PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
+	/*
+	 * Prevent creating read-only PTEs. Used to work around HW errata
+	 * ERRATA_772415_SPR17.
+	 */
+	PT_FEAT_VTDSS_FORCE_WRITEABLE,
+};
+
 struct pt_x86_64 {
 	struct pt_common common;
 };
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 21132e342a79..cfe05a77f86b 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt);
 struct pt_iommu_amdv1_mock_hw_info;
 IOMMU_PROTOTYPES(amdv1_mock);
 
+struct pt_iommu_vtdss_cfg {
+	struct pt_iommu_cfg common;
+};
+
+struct pt_iommu_vtdss_hw_info {
+	u64 ssptptr;
+	u8 aw;
+};
+
+IOMMU_FORMAT(vtdss, vtdss_pt);
+
 struct pt_iommu_x86_64_cfg {
 	struct pt_iommu_cfg common;
 };
-- 
cgit v1.2.3


From 0485a18d9141775d54489997b284fe2557b5898e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 4 Nov 2025 15:46:32 +0100
Subject: fs: rename fs_types.h to fs_dirent.h

We will split out a bunch of types into a separate header.
So free up the appropriate name for it.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-1-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/Makefile               |   2 +-
 fs/fs_dirent.c            | 105 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fs_types.c             | 105 ----------------------------------------------
 include/linux/fs.h        |   2 +-
 include/linux/fs_dirent.h |  78 ++++++++++++++++++++++++++++++++++
 include/linux/fs_types.h  |  75 ---------------------------------
 6 files changed, 185 insertions(+), 182 deletions(-)
 create mode 100644 fs/fs_dirent.c
 delete mode 100644 fs/fs_types.c
 create mode 100644 include/linux/fs_dirent.h
 delete mode 100644 include/linux/fs_types.h

(limited to 'include')

diff --git a/fs/Makefile b/fs/Makefile
index e3523ab2e587..a04274a3c854 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
-		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
 		file_attr.o
 
diff --git a/fs/fs_dirent.c b/fs/fs_dirent.c
new file mode 100644
index 000000000000..e5e08f213816
--- /dev/null
+++ b/fs/fs_dirent.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs_dirent.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+	[FT_UNKNOWN]	= DT_UNKNOWN,
+	[FT_REG_FILE]	= DT_REG,
+	[FT_DIR]	= DT_DIR,
+	[FT_CHRDEV]	= DT_CHR,
+	[FT_BLKDEV]	= DT_BLK,
+	[FT_FIFO]	= DT_FIFO,
+	[FT_SOCK]	= DT_SOCK,
+	[FT_SYMLINK]	= DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN		- Unknown type
+ * * DT_FIFO		- FIFO
+ * * DT_CHR		- Character device
+ * * DT_DIR		- Directory
+ * * DT_BLK		- Block device
+ * * DT_REG		- Regular file
+ * * DT_LNK		- Symbolic link
+ * * DT_SOCK		- Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+	if (filetype >= FT_MAX)
+		return DT_UNKNOWN;
+
+	return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+	[DT_REG]	= FT_REG_FILE,
+	[DT_DIR]	= FT_DIR,
+	[DT_LNK]	= FT_SYMLINK,
+	[DT_CHR]	= FT_CHRDEV,
+	[DT_BLK]	= FT_BLKDEV,
+	[DT_FIFO]	= FT_FIFO,
+	[DT_SOCK]	= FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN		- Unknown type
+ * * FT_REG_FILE	- Regular file
+ * * FT_DIR		- Directory
+ * * FT_CHRDEV		- Character device
+ * * FT_BLKDEV		- Block device
+ * * FT_FIFO		- FIFO
+ * * FT_SOCK		- Local-domain socket
+ * * FT_SYMLINK		- Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+	return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN		- Unknown type
+ * * DT_FIFO		- FIFO
+ * * DT_CHR		- Character device
+ * * DT_DIR		- Directory
+ * * DT_BLK		- Block device
+ * * DT_REG		- Regular file
+ * * DT_LNK		- Symbolic link
+ * * DT_SOCK		- Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+	return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
diff --git a/fs/fs_types.c b/fs/fs_types.c
deleted file mode 100644
index 78365e5dc08c..000000000000
--- a/fs/fs_types.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/fs.h>
-#include <linux/export.h>
-
-/*
- * fs on-disk file type to dirent file type conversion
- */
-static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
-	[FT_UNKNOWN]	= DT_UNKNOWN,
-	[FT_REG_FILE]	= DT_REG,
-	[FT_DIR]	= DT_DIR,
-	[FT_CHRDEV]	= DT_CHR,
-	[FT_BLKDEV]	= DT_BLK,
-	[FT_FIFO]	= DT_FIFO,
-	[FT_SOCK]	= DT_SOCK,
-	[FT_SYMLINK]	= DT_LNK
-};
-
-/**
- * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
- * @filetype: The on-disk file type to convert.
- *
- * This function converts the on-disk file type value (FT_*) to the directory
- * entry type (DT_*).
- *
- * Context: Any context.
- * Return:
- * * DT_UNKNOWN		- Unknown type
- * * DT_FIFO		- FIFO
- * * DT_CHR		- Character device
- * * DT_DIR		- Directory
- * * DT_BLK		- Block device
- * * DT_REG		- Regular file
- * * DT_LNK		- Symbolic link
- * * DT_SOCK		- Local-domain socket
- */
-unsigned char fs_ftype_to_dtype(unsigned int filetype)
-{
-	if (filetype >= FT_MAX)
-		return DT_UNKNOWN;
-
-	return fs_dtype_by_ftype[filetype];
-}
-EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
-
-/*
- * dirent file type to fs on-disk file type conversion
- * Values not initialized explicitly are FT_UNKNOWN (0).
- */
-static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
-	[DT_REG]	= FT_REG_FILE,
-	[DT_DIR]	= FT_DIR,
-	[DT_LNK]	= FT_SYMLINK,
-	[DT_CHR]	= FT_CHRDEV,
-	[DT_BLK]	= FT_BLKDEV,
-	[DT_FIFO]	= FT_FIFO,
-	[DT_SOCK]	= FT_SOCK,
-};
-
-/**
- * fs_umode_to_ftype() - file mode to on-disk file type.
- * @mode: The file mode to convert.
- *
- * This function converts the file mode value to the on-disk file type (FT_*).
- *
- * Context: Any context.
- * Return:
- * * FT_UNKNOWN		- Unknown type
- * * FT_REG_FILE	- Regular file
- * * FT_DIR		- Directory
- * * FT_CHRDEV		- Character device
- * * FT_BLKDEV		- Block device
- * * FT_FIFO		- FIFO
- * * FT_SOCK		- Local-domain socket
- * * FT_SYMLINK		- Symbolic link
- */
-unsigned char fs_umode_to_ftype(umode_t mode)
-{
-	return fs_ftype_by_dtype[S_DT(mode)];
-}
-EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
-
-/**
- * fs_umode_to_dtype() - file mode to dirent file type.
- * @mode: The file mode to convert.
- *
- * This function converts the file mode value to the directory
- * entry type (DT_*).
- *
- * Context: Any context.
- * Return:
- * * DT_UNKNOWN		- Unknown type
- * * DT_FIFO		- FIFO
- * * DT_CHR		- Character device
- * * DT_DIR		- Directory
- * * DT_BLK		- Block device
- * * DT_REG		- Regular file
- * * DT_LNK		- Symbolic link
- * * DT_SOCK		- Local-domain socket
- */
-unsigned char fs_umode_to_dtype(umode_t mode)
-{
-	return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
-}
-EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..3c971ddace41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -37,7 +37,7 @@
 #include <linux/uuid.h>
 #include <linux/errseq.h>
 #include <linux/ioprio.h>
-#include <linux/fs_types.h>
+#include <linux/fs_dirent.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
 #include <linux/mount.h>
diff --git a/include/linux/fs_dirent.h b/include/linux/fs_dirent.h
new file mode 100644
index 000000000000..92f75c5bac19
--- /dev/null
+++ b/include/linux/fs_dirent.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_DIRENT_H
+#define _LINUX_FS_DIRENT_H
+
+#include <linux/stat.h>
+#include <linux/types.h>
+
+/*
+ * This is a header for the common implementation of dirent
+ * to fs on-disk file type conversion.  Although the fs on-disk
+ * bits are specific to every file system, in practice, many
+ * file systems use the exact same on-disk format to describe
+ * the lower 3 file type bits that represent the 7 POSIX file
+ * types.
+ *
+ * It is important to note that the definitions in this
+ * header MUST NOT change. This would break both the
+ * userspace ABI and the on-disk format of filesystems
+ * using this code.
+ *
+ * All those file systems can use this generic code for the
+ * conversions.
+ */
+
+/*
+ * struct dirent file types
+ * exposed to user via getdents(2), readdir(3)
+ *
+ * These match bits 12..15 of stat.st_mode
+ * (ie "(i_mode >> 12) & 15").
+ */
+#define S_DT_SHIFT	12
+#define S_DT(mode)	(((mode) & S_IFMT) >> S_DT_SHIFT)
+#define S_DT_MASK	(S_IFMT >> S_DT_SHIFT)
+
+/* these are defined by POSIX and also present in glibc's dirent.h */
+#define DT_UNKNOWN	0
+#define DT_FIFO		1
+#define DT_CHR		2
+#define DT_DIR		4
+#define DT_BLK		6
+#define DT_REG		8
+#define DT_LNK		10
+#define DT_SOCK		12
+#define DT_WHT		14
+
+#define DT_MAX		(S_DT_MASK + 1) /* 16 */
+
+/*
+ * fs on-disk file types.
+ * Only the low 3 bits are used for the POSIX file types.
+ * Other bits are reserved for fs private use.
+ * These definitions are shared and used by multiple filesystems,
+ * and MUST NOT change under any circumstances.
+ *
+ * Note that no fs currently stores the whiteout type on-disk,
+ * so whiteout dirents are exposed to user as DT_CHR.
+ */
+#define FT_UNKNOWN	0
+#define FT_REG_FILE	1
+#define FT_DIR		2
+#define FT_CHRDEV	3
+#define FT_BLKDEV	4
+#define FT_FIFO		5
+#define FT_SOCK		6
+#define FT_SYMLINK	7
+
+#define FT_MAX		8
+
+/*
+ * declarations for helper functions, accompanying implementation
+ * is in fs/fs_dirent.c
+ */
+extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
+extern unsigned char fs_umode_to_ftype(umode_t mode);
+extern unsigned char fs_umode_to_dtype(umode_t mode);
+
+#endif /* _LINUX_FS_DIRENT_H */
diff --git a/include/linux/fs_types.h b/include/linux/fs_types.h
deleted file mode 100644
index 54816791196f..000000000000
--- a/include/linux/fs_types.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FS_TYPES_H
-#define _LINUX_FS_TYPES_H
-
-/*
- * This is a header for the common implementation of dirent
- * to fs on-disk file type conversion.  Although the fs on-disk
- * bits are specific to every file system, in practice, many
- * file systems use the exact same on-disk format to describe
- * the lower 3 file type bits that represent the 7 POSIX file
- * types.
- *
- * It is important to note that the definitions in this
- * header MUST NOT change. This would break both the
- * userspace ABI and the on-disk format of filesystems
- * using this code.
- *
- * All those file systems can use this generic code for the
- * conversions.
- */
-
-/*
- * struct dirent file types
- * exposed to user via getdents(2), readdir(3)
- *
- * These match bits 12..15 of stat.st_mode
- * (ie "(i_mode >> 12) & 15").
- */
-#define S_DT_SHIFT	12
-#define S_DT(mode)	(((mode) & S_IFMT) >> S_DT_SHIFT)
-#define S_DT_MASK	(S_IFMT >> S_DT_SHIFT)
-
-/* these are defined by POSIX and also present in glibc's dirent.h */
-#define DT_UNKNOWN	0
-#define DT_FIFO		1
-#define DT_CHR		2
-#define DT_DIR		4
-#define DT_BLK		6
-#define DT_REG		8
-#define DT_LNK		10
-#define DT_SOCK		12
-#define DT_WHT		14
-
-#define DT_MAX		(S_DT_MASK + 1) /* 16 */
-
-/*
- * fs on-disk file types.
- * Only the low 3 bits are used for the POSIX file types.
- * Other bits are reserved for fs private use.
- * These definitions are shared and used by multiple filesystems,
- * and MUST NOT change under any circumstances.
- *
- * Note that no fs currently stores the whiteout type on-disk,
- * so whiteout dirents are exposed to user as DT_CHR.
- */
-#define FT_UNKNOWN	0
-#define FT_REG_FILE	1
-#define FT_DIR		2
-#define FT_CHRDEV	3
-#define FT_BLKDEV	4
-#define FT_FIFO		5
-#define FT_SOCK		6
-#define FT_SYMLINK	7
-
-#define FT_MAX		8
-
-/*
- * declarations for helper functions, accompanying implementation
- * is in fs/fs_types.c
- */
-extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
-extern unsigned char fs_umode_to_ftype(umode_t mode);
-extern unsigned char fs_umode_to_dtype(umode_t mode);
-
-#endif
-- 
cgit v1.2.3


From b2f35ac4146d32d4424aaa941bbc681f12c1b9e6 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 25 Sep 2025 17:26:04 -0700
Subject: iomap: add caller-provided callbacks for read and readahead

Add caller-provided callbacks for read and readahead so that iomap can be
used generically, especially by filesystems that are not block-based.

In particular, this:
* Modifies the read and readahead interface to take in a
  struct iomap_read_folio_ctx that is publicly defined as:

  struct iomap_read_folio_ctx {
	const struct iomap_read_ops *ops;
	struct folio *cur_folio;
	struct readahead_control *rac;
	void *read_ctx;
  };

  where struct iomap_read_ops is defined as:

  struct iomap_read_ops {
      int (*read_folio_range)(const struct iomap_iter *iter,
                             struct iomap_read_folio_ctx *ctx,
                             size_t len);
      void (*submit_read)(struct iomap_read_folio_ctx *ctx);
  };

  read_folio_range() reads in the folio range and must be provided by
  the caller. submit_read() is optional and is used for submitting any
  pending read requests.

* Modifies existing filesystems that use iomap for read and readahead to
  use the new API, through the new statically inlined helpers
  iomap_bio_read_folio() and iomap_bio_readahead(). There is no change
  in functionality for those filesystems.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst | 44 ++++++++++++++++++
 block/fops.c                                   |  5 +-
 fs/erofs/data.c                                |  5 +-
 fs/gfs2/aops.c                                 |  6 +--
 fs/iomap/buffered-io.c                         | 55 +++++++++++-----------
 fs/xfs/xfs_aops.c                              |  5 +-
 fs/zonefs/file.c                               |  5 +-
 include/linux/iomap.h                          | 63 +++++++++++++++++++++++++-
 8 files changed, 149 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 387fd9cc72ca..c88205132039 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -135,6 +135,28 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
 
  * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
 
+``struct iomap_read_ops``
+--------------------------
+
+.. code-block:: c
+
+ struct iomap_read_ops {
+     int (*read_folio_range)(const struct iomap_iter *iter,
+                             struct iomap_read_folio_ctx *ctx, size_t len);
+     void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+ };
+
+iomap calls these functions:
+
+  - ``read_folio_range``: Called to read in the range. This must be provided
+    by the caller. The caller is responsible for calling
+    iomap_finish_folio_read() after reading in the folio range. This should be
+    done even if an error is encountered during the read. This returns 0 on
+    success or a negative error on failure.
+
+  - ``submit_read``: Submit any pending read requests. This function is
+    optional.
+
 Internal per-Folio State
 ------------------------
 
@@ -182,6 +204,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
 The pagecache takes whatever locks it needs before calling the
 filesystem.
 
+Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
+iomap_read_folio_ctx``:
+
+.. code-block:: c
+
+ struct iomap_read_folio_ctx {
+    const struct iomap_read_ops *ops;
+    struct folio *cur_folio;
+    struct readahead_control *rac;
+    void *read_ctx;
+ };
+
+``iomap_readahead`` must set:
+ * ``ops->read_folio_range()`` and ``rac``
+
+``iomap_read_folio`` must set:
+ * ``ops->read_folio_range()`` and ``cur_folio``
+
+``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
+pass in any custom data the caller needs accessible in the ops callbacks for
+fulfilling reads.
+
 Buffered Writes
 ---------------
 
diff --git a/block/fops.c b/block/fops.c
index 5e3db9fead77..4dad9c2d5796 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = {
 #else /* CONFIG_BUFFER_HEAD */
 static int blkdev_read_folio(struct file *file, struct folio *folio)
 {
-	return iomap_read_folio(folio, &blkdev_iomap_ops);
+	iomap_bio_read_folio(folio, &blkdev_iomap_ops);
+	return 0;
 }
 
 static void blkdev_readahead(struct readahead_control *rac)
 {
-	iomap_readahead(rac, &blkdev_iomap_ops);
+	iomap_bio_readahead(rac, &blkdev_iomap_ops);
 }
 
 static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8ca29962a3dd..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
 {
 	trace_erofs_read_folio(folio, true);
 
-	return iomap_read_folio(folio, &erofs_iomap_ops);
+	iomap_bio_read_folio(folio, &erofs_iomap_ops);
+	return 0;
 }
 
 static void erofs_readahead(struct readahead_control *rac)
@@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac)
 	trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
 					readahead_count(rac), true);
 
-	return iomap_readahead(rac, &erofs_iomap_ops);
+	iomap_bio_readahead(rac, &erofs_iomap_ops);
 }
 
 static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 47d74afd63ac..38d4f343187a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -424,11 +424,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
 	struct inode *inode = folio->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
+	int error = 0;
 
 	if (!gfs2_is_jdata(ip) ||
 	    (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
-		error = iomap_read_folio(folio, &gfs2_iomap_ops);
+		iomap_bio_read_folio(folio, &gfs2_iomap_ops);
 	} else if (gfs2_is_stuffed(ip)) {
 		error = stuffed_read_folio(ip, folio);
 	} else {
@@ -503,7 +503,7 @@ static void gfs2_readahead(struct readahead_control *rac)
 	else if (gfs2_is_jdata(ip))
 		mpage_readahead(rac, gfs2_block_map);
 	else
-		iomap_readahead(rac, &gfs2_iomap_ops);
+		iomap_bio_readahead(rac, &gfs2_iomap_ops);
 }
 
 /**
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 12b23ff97000..d7100a5f953a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -328,8 +328,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
 }
 
 #ifdef CONFIG_BLOCK
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
-		size_t len, int error)
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+		int error)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	bool uptodate = !error;
@@ -349,6 +349,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off,
 	if (finished)
 		folio_end_read(folio, uptodate);
 }
+EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
 
 static void iomap_read_end_io(struct bio *bio)
 {
@@ -360,12 +361,6 @@ static void iomap_read_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
-struct iomap_read_folio_ctx {
-	struct folio		*cur_folio;
-	void			*read_ctx;
-	struct readahead_control *rac;
-};
-
 static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
 {
 	struct bio *bio = ctx->read_ctx;
@@ -374,7 +369,7 @@ static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
 		submit_bio(bio);
 }
 
-static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
+static int iomap_bio_read_folio_range(const struct iomap_iter *iter,
 		struct iomap_read_folio_ctx *ctx, size_t plen)
 {
 	struct folio *folio = ctx->cur_folio;
@@ -412,8 +407,15 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
 		bio_add_folio_nofail(bio, folio, plen, poff);
 		ctx->read_ctx = bio;
 	}
+	return 0;
 }
 
+const struct iomap_read_ops iomap_bio_read_ops = {
+	.read_folio_range	= iomap_bio_read_folio_range,
+	.submit_read		= iomap_bio_submit_read,
+};
+EXPORT_SYMBOL_GPL(iomap_bio_read_ops);
+
 static void iomap_read_init(struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
@@ -544,7 +546,9 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 			if (!*bytes_pending)
 				iomap_read_init(folio);
 			*bytes_pending += plen;
-			iomap_bio_read_folio_range(iter, ctx, plen);
+			ret = ctx->ops->read_folio_range(iter, ctx, plen);
+			if (ret)
+				return ret;
 		}
 
 		ret = iomap_iter_advance(iter, plen);
@@ -556,26 +560,25 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 	return 0;
 }
 
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
+int iomap_read_folio(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx)
 {
+	struct folio *folio = ctx->cur_folio;
 	struct iomap_iter iter = {
 		.inode		= folio->mapping->host,
 		.pos		= folio_pos(folio),
 		.len		= folio_size(folio),
 	};
-	struct iomap_read_folio_ctx ctx = {
-		.cur_folio	= folio,
-	};
 	size_t bytes_pending = 0;
 	int ret;
 
 	trace_iomap_readpage(iter.inode, 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_read_folio_iter(&iter, &ctx,
-				&bytes_pending);
+		iter.status = iomap_read_folio_iter(&iter, ctx, &bytes_pending);
 
-	iomap_bio_submit_read(&ctx);
+	if (ctx->ops->submit_read)
+		ctx->ops->submit_read(ctx);
 
 	iomap_read_end(folio, bytes_pending);
 
@@ -615,8 +618,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
 
 /**
  * iomap_readahead - Attempt to read pages from a file.
- * @rac: Describes the pages to be read.
  * @ops: The operations vector for the filesystem.
+ * @ctx: The ctx used for issuing readahead.
  *
  * This function is for filesystems to call to implement their readahead
  * address_space operation.
@@ -628,28 +631,28 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
  * function is called with memalloc_nofs set, so allocations will not cause
  * the filesystem to be reentered.
  */
-void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
+void iomap_readahead(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx)
 {
+	struct readahead_control *rac = ctx->rac;
 	struct iomap_iter iter = {
 		.inode	= rac->mapping->host,
 		.pos	= readahead_pos(rac),
 		.len	= readahead_length(rac),
 	};
-	struct iomap_read_folio_ctx ctx = {
-		.rac	= rac,
-	};
 	size_t cur_bytes_pending;
 
 	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
 
 	while (iomap_iter(&iter, ops) > 0)
-		iter.status = iomap_readahead_iter(&iter, &ctx,
+		iter.status = iomap_readahead_iter(&iter, ctx,
 					&cur_bytes_pending);
 
-	iomap_bio_submit_read(&ctx);
+	if (ctx->ops->submit_read)
+		ctx->ops->submit_read(ctx);
 
-	if (ctx.cur_folio)
-		iomap_read_end(ctx.cur_folio, cur_bytes_pending);
+	if (ctx->cur_folio)
+		iomap_read_end(ctx->cur_folio, cur_bytes_pending);
 }
 EXPORT_SYMBOL_GPL(iomap_readahead);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a26f79815533..0c2ed00733f2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -742,14 +742,15 @@ xfs_vm_read_folio(
 	struct file		*unused,
 	struct folio		*folio)
 {
-	return iomap_read_folio(folio, &xfs_read_iomap_ops);
+	iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
+	return 0;
 }
 
 STATIC void
 xfs_vm_readahead(
 	struct readahead_control	*rac)
 {
-	iomap_readahead(rac, &xfs_read_iomap_ops);
+	iomap_bio_readahead(rac, &xfs_read_iomap_ops);
 }
 
 static int
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 90e2ad8ee5f4..c1e5e30e90a0 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = {
 
 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 {
-	return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+	iomap_bio_read_folio(folio, &zonefs_read_iomap_ops);
+	return 0;
 }
 
 static void zonefs_readahead(struct readahead_control *rac)
 {
-	iomap_readahead(rac, &zonefs_read_iomap_ops);
+	iomap_bio_readahead(rac, &zonefs_read_iomap_ops);
 }
 
 /*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4469b2318b08..37435b912755 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -16,6 +16,7 @@ struct inode;
 struct iomap_iter;
 struct iomap_dio;
 struct iomap_writepage_ctx;
+struct iomap_read_folio_ctx;
 struct iov_iter;
 struct kiocb;
 struct page;
@@ -337,8 +338,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
-void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+int iomap_read_folio(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx);
+void iomap_readahead(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
@@ -465,6 +468,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		loff_t pos, loff_t end_pos, unsigned int dirty_len);
 int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
 
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+		int error);
 void iomap_start_folio_write(struct inode *inode, struct folio *folio,
 		size_t len);
 void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
@@ -473,6 +478,34 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio);
 int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
+struct iomap_read_folio_ctx {
+	const struct iomap_read_ops *ops;
+	struct folio		*cur_folio;
+	struct readahead_control *rac;
+	void			*read_ctx;
+};
+
+struct iomap_read_ops {
+	/*
+	 * Read in a folio range.
+	 *
+	 * The caller is responsible for calling iomap_finish_folio_read() after
+	 * reading in the folio range. This should be done even if an error is
+	 * encountered during the read.
+	 *
+	 * Returns 0 on success or a negative error on failure.
+	 */
+	int (*read_folio_range)(const struct iomap_iter *iter,
+			struct iomap_read_folio_ctx *ctx, size_t len);
+
+	/*
+	 * Submit any pending read requests.
+	 *
+	 * This is optional.
+	 */
+	void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+};
+
 /*
  * Flags for direct I/O ->end_io:
  */
@@ -538,4 +571,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
 
 extern struct bio_set iomap_ioend_bioset;
 
+#ifdef CONFIG_BLOCK
+extern const struct iomap_read_ops iomap_bio_read_ops;
+
+static inline void iomap_bio_read_folio(struct folio *folio,
+		const struct iomap_ops *ops)
+{
+	struct iomap_read_folio_ctx ctx = {
+		.ops		= &iomap_bio_read_ops,
+		.cur_folio	= folio,
+	};
+
+	iomap_read_folio(ops, &ctx);
+}
+
+static inline void iomap_bio_readahead(struct readahead_control *rac,
+		const struct iomap_ops *ops)
+{
+	struct iomap_read_folio_ctx ctx = {
+		.ops		= &iomap_bio_read_ops,
+		.rac		= rac,
+	};
+
+	iomap_readahead(ops, &ctx);
+}
+#endif /* CONFIG_BLOCK */
+
 #endif /* LINUX_IOMAP_H */
-- 
cgit v1.2.3


From d4e88bb08e5f7e6eb4e9c3685894b9b57bfdfb08 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 25 Sep 2025 17:26:06 -0700
Subject: iomap: make iomap_read_folio() a void return

No errors are propagated in iomap_read_folio(). Change
iomap_read_folio() to a void return to make this clearer to callers.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/iomap/buffered-io.c | 9 +--------
 include/linux/iomap.h  | 2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0d88a4f3c791..1dbcac17fefd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -495,7 +495,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 	return 0;
 }
 
-int iomap_read_folio(const struct iomap_ops *ops,
+void iomap_read_folio(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx)
 {
 	struct folio *folio = ctx->cur_folio;
@@ -516,13 +516,6 @@ int iomap_read_folio(const struct iomap_ops *ops,
 		ctx->ops->submit_read(ctx);
 
 	iomap_read_end(folio, bytes_pending);
-
-	/*
-	 * Just like mpage_readahead and block_read_full_folio, we always
-	 * return 0 and just set the folio error flag on errors.  This
-	 * should be cleaned up throughout the stack eventually.
-	 */
-	return 0;
 }
 EXPORT_SYMBOL_GPL(iomap_read_folio);
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 37435b912755..6d864b446b6e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -338,7 +338,7 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(const struct iomap_ops *ops,
+void iomap_read_folio(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);
 void iomap_readahead(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);
-- 
cgit v1.2.3


From f8d98072feee32722086ddae4f288b6c45ae4330 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 3 Oct 2025 09:46:35 -0400
Subject: filemap: add helper to look up dirty folios in a range

Add a new filemap_get_folios_dirty() helper to look up existing dirty
folios in a range and add them to a folio_batch. This is to support
optimization of certain iomap operations that only care about dirty
folios in a target range. For example, zero range only zeroes the subset
of dirty pages over unwritten mappings, seek hole/data may use similar
logic in the future, etc.

Note that the helper is intended for use under internal fs locks.
Therefore it trylocks folios in order to filter out clean folios.
This loosely follows the logic from filemap_range_has_writeback().

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..7274a86b4871 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
+unsigned filemap_get_folios_dirty(struct address_space *mapping,
+		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
 
 struct folio *read_cache_folio(struct address_space *, pgoff_t index,
 		filler_t *filler, struct file *file);
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..da1be27de10d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2366,6 +2366,64 @@ out:
 }
 EXPORT_SYMBOL(filemap_get_folios_tag);
 
+/**
+ * filemap_get_folios_dirty - Get a batch of dirty folios
+ * @mapping:	The address_space to search
+ * @start:	The starting folio index
+ * @end:	The final folio index (inclusive)
+ * @fbatch:	The batch to fill
+ *
+ * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
+ * the returned folios are presumed to be dirty or undergoing writeback. Dirty
+ * state is presumed because we don't block on folio lock nor want to miss
+ * folios. Callers that need to can recheck state upon locking the folio.
+ *
+ * This may not return all dirty folios if the batch gets filled up.
+ *
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
+ */
+unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
+			pgoff_t end, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+		if (xa_is_value(folio))
+			continue;
+		if (folio_trylock(folio)) {
+			bool clean = !folio_test_dirty(folio) &&
+				     !folio_test_writeback(folio);
+			folio_unlock(folio);
+			if (clean) {
+				folio_put(folio);
+				continue;
+			}
+		}
+		if (!folio_batch_add(fbatch, folio)) {
+			unsigned long nr = folio_nr_pages(folio);
+			*start = folio->index + nr;
+			goto out;
+		}
+	}
+	/*
+	 * We come here when there is no folio beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is a folio at index -1 but that is
+	 * already broke anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*start = (pgoff_t)-1;
+	else
+		*start = end + 1;
+out:
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
-- 
cgit v1.2.3


From 395ed1ef0012e1bb1e4050e84ba0173b3623112a Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 3 Oct 2025 09:46:37 -0400
Subject: iomap: optional zero range dirty folio processing

The only way zero range can currently process unwritten mappings
with dirty pagecache is to check whether the range is dirty before
mapping lookup and then flush when at least one underlying mapping
is unwritten. This ordering is required to prevent iomap lookup from
racing with folio writeback and reclaim.

Since zero range can skip ranges of unwritten mappings that are
clean in cache, this operation can be improved by allowing the
filesystem to provide a set of dirty folios that require zeroing. In
turn, rather than flush or iterate file offsets, zero range can
iterate on folios in the batch and advance over clean or uncached
ranges in between.

Add a folio_batch in struct iomap and provide a helper for
filesystems to populate the batch at lookup time. Update the folio
lookup path to return the next folio in the batch, if provided, and
advance the iter if the folio starts beyond the current offset.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/iomap/buffered-io.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/iomap/iter.c        |  6 ++++
 include/linux/iomap.h  |  4 +++
 3 files changed, 95 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index b5e85cd24360..1cabd9b0249e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -772,6 +772,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
 	if (!mapping_large_folio_support(iter->inode->i_mapping))
 		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
 
+	if (iter->fbatch) {
+		struct folio *folio = folio_batch_next(iter->fbatch);
+
+		if (!folio)
+			return NULL;
+
+		/*
+		 * The folio mapping generally shouldn't have changed based on
+		 * fs locks, but be consistent with filemap lookup and retry
+		 * the iter if it does.
+		 */
+		folio_lock(folio);
+		if (unlikely(folio->mapping != iter->inode->i_mapping)) {
+			iter->iomap.flags |= IOMAP_F_STALE;
+			folio_unlock(folio);
+			return NULL;
+		}
+
+		folio_get(folio);
+		return folio;
+	}
+
 	if (write_ops && write_ops->get_folio)
 		return write_ops->get_folio(iter, pos, len);
 	return iomap_get_folio(iter, pos, len);
@@ -832,6 +854,8 @@ static int iomap_write_begin(struct iomap_iter *iter,
 	int status = 0;
 
 	len = min_not_zero(len, *plen);
+	*foliop = NULL;
+	*plen = 0;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -840,6 +864,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
+	/*
+	 * No folio means we're done with a batch. We still have range to
+	 * process so return and let the caller iterate and refill the batch.
+	 */
+	if (!folio) {
+		WARN_ON_ONCE(!iter->fbatch);
+		return 0;
+	}
+
 	/*
 	 * Now we have a locked folio, before we do anything with it we need to
 	 * check that the iomap we have cached is not stale. The inode extent
@@ -860,6 +893,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
 		}
 	}
 
+	/*
+	 * The folios in a batch may not be contiguous. If we've skipped
+	 * forward, advance the iter to the pos of the current folio. If the
+	 * folio starts beyond the end of the mapping, it may have been trimmed
+	 * since the lookup for whatever reason. Return a NULL folio to
+	 * terminate the op.
+	 */
+	if (folio_pos(folio) > iter->pos) {
+		len = min_t(u64, folio_pos(folio) - iter->pos,
+				 iomap_length(iter));
+		status = iomap_iter_advance(iter, len);
+		len = iomap_length(iter);
+		if (status || !len)
+			goto out_unlock;
+	}
+
 	pos = iomap_trim_folio_range(iter, folio, poffset, &len);
 
 	if (srcmap->type == IOMAP_INLINE)
@@ -1406,6 +1455,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		if (iter->iomap.flags & IOMAP_F_STALE)
 			break;
 
+		/* a NULL folio means we're done with a folio batch */
+		if (!folio) {
+			status = iomap_iter_advance_full(iter);
+			break;
+		}
+
 		/* warn about zeroing folios beyond eof that won't write back */
 		WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
 
@@ -1430,6 +1485,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 	return status;
 }
 
+loff_t
+iomap_fill_dirty_folios(
+	struct iomap_iter	*iter,
+	loff_t			offset,
+	loff_t			length)
+{
+	struct address_space	*mapping = iter->inode->i_mapping;
+	pgoff_t			start = offset >> PAGE_SHIFT;
+	pgoff_t			end = (offset + length - 1) >> PAGE_SHIFT;
+
+	iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
+	if (!iter->fbatch)
+		return offset + length;
+	folio_batch_init(iter->fbatch);
+
+	filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
+	return (start << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
+
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 		const struct iomap_ops *ops,
@@ -1459,7 +1534,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 	 * flushing on partial eof zeroing, special case it to zero the
 	 * unaligned start portion if already dirty in pagecache.
 	 */
-	if (off &&
+	if (!iter.fbatch && off &&
 	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
 		iter.len = plen;
 		while ((ret = iomap_iter(&iter, ops)) > 0)
@@ -1476,13 +1551,18 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 	 * if dirty and the fs returns a mapping that might convert on
 	 * writeback.
 	 */
-	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
-					iter.pos, iter.pos + iter.len - 1);
+	range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
+					iter.pos + iter.len - 1);
 	while ((ret = iomap_iter(&iter, ops)) > 0) {
 		const struct iomap *srcmap = iomap_iter_srcmap(&iter);
 
-		if (srcmap->type == IOMAP_HOLE ||
-		    srcmap->type == IOMAP_UNWRITTEN) {
+		if (WARN_ON_ONCE(iter.fbatch &&
+				 srcmap->type != IOMAP_UNWRITTEN))
+			return -EIO;
+
+		if (!iter.fbatch &&
+		    (srcmap->type == IOMAP_HOLE ||
+		     srcmap->type == IOMAP_UNWRITTEN)) {
 			s64 status;
 
 			if (range_dirty) {
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 91d2024e00da..8692e5e41c6d 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -8,6 +8,12 @@
 
 static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
 {
+	if (iter->fbatch) {
+		folio_batch_release(iter->fbatch);
+		kfree(iter->fbatch);
+		iter->fbatch = NULL;
+	}
+
 	iter->status = 0;
 	memset(&iter->iomap, 0, sizeof(iter->iomap));
 	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d864b446b6e..65d123114883 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/mm_types.h>
 #include <linux/blkdev.h>
+#include <linux/pagevec.h>
 
 struct address_space;
 struct fiemap_extent_info;
@@ -242,6 +243,7 @@ struct iomap_iter {
 	unsigned flags;
 	struct iomap iomap;
 	struct iomap srcmap;
+	struct folio_batch *fbatch;
 	void *private;
 };
 
@@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops);
+loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
+		loff_t length);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
 		bool *did_zero, const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops, void *private);
-- 
cgit v1.2.3


From 001397f5ef4908ea46a63059439e8c3bf3552d9f Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 31 Oct 2025 14:10:26 +0100
Subject: iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag

Btrfs requires all of its bios to be fs block aligned. Normally that is
fine, but with the incoming support for block sizes larger than the page
size (bs > ps), the requirement is no longer met for direct I/O.

This is because iomap_dio_bio_iter() calls bio_iov_iter_get_pages() and
only requires alignment to bdev_logical_block_size().

In the real world that value is either 512 or 4K, on 4K page sized
systems it means bio_iov_iter_get_pages() can break the bio at any page
boundary, breaking btrfs' requirement for bs > ps cases.

To address this problem, introduce a new public iomap dio flag,
IOMAP_DIO_FSBLOCK_ALIGNED.

When calling __iomap_dio_rw() with that new flag, iomap_dio::flags will
inherit that new flag, and iomap_dio_bio_iter() will take fs block size
into the calculation of the alignment, and pass the alignment to
bio_iov_iter_get_pages(), respecting the fs block size requirement.

The initial user of this flag will be btrfs, which needs to calculate the
checksum for direct read and thus requires the biovec to be fs block
aligned for the incoming bs > ps support.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
[hch: also align pos/len, incorporate the trace flags from Darrick]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/iomap/direct-io.c  | 17 +++++++++++++++--
 fs/iomap/trace.h      |  7 ++++---
 include/linux/iomap.h |  8 ++++++++
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e9e5f0703160..8b2f9fb89eb3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -336,8 +336,18 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	int nr_pages, ret = 0;
 	u64 copied = 0;
 	size_t orig_count;
+	unsigned int alignment;
 
-	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1))
+	/*
+	 * File systems that write out of place and always allocate new blocks
+	 * need each bio to be block aligned as that's the unit of allocation.
+	 */
+	if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+		alignment = fs_block_size;
+	else
+		alignment = bdev_logical_block_size(iomap->bdev);
+
+	if ((pos | length) & (alignment - 1))
 		return -EINVAL;
 
 	if (dio->flags & IOMAP_DIO_WRITE) {
@@ -434,7 +444,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 		bio->bi_end_io = iomap_dio_bio_end_io;
 
 		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
-				bdev_logical_block_size(iomap->bdev) - 1);
+					     alignment - 1);
 		if (unlikely(ret)) {
 			/*
 			 * We have to stop part way through an IO. We must fall
@@ -639,6 +649,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
+	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index a61c1dae4742..532787277b16 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter);
 
 
 #define IOMAP_DIO_STRINGS \
-	{IOMAP_DIO_FORCE_WAIT,	"DIO_FORCE_WAIT" }, \
-	{IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
-	{IOMAP_DIO_PARTIAL,	"DIO_PARTIAL" }
+	{IOMAP_DIO_FORCE_WAIT,		"DIO_FORCE_WAIT" }, \
+	{IOMAP_DIO_OVERWRITE_ONLY,	"DIO_OVERWRITE_ONLY" }, \
+	{IOMAP_DIO_PARTIAL,		"DIO_PARTIAL" }, \
+	{IOMAP_DIO_FSBLOCK_ALIGNED,	"DIO_FSBLOCK_ALIGNED" }
 
 DECLARE_EVENT_CLASS(iomap_class,
 	TP_PROTO(struct inode *inode, struct iomap *iomap),
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 65d123114883..8b1ac08c7474 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -553,6 +553,14 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_PARTIAL		(1 << 2)
 
+/*
+ * Ensure each bio is aligned to fs block size.
+ *
+ * For filesystems which need to calculate/verify the checksum of each fs
+ * block. Otherwise they may not be able to handle unaligned bios.
+ */
+#define IOMAP_DIO_FSBLOCK_ALIGNED	(1 << 3)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
-- 
cgit v1.2.3


From cf76553aaa363620f58a6b6409bf544f4bcfa8de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 5 Nov 2025 11:00:14 +0100
Subject: entry,unwind/deferred: Fix unwind_reset_info() placement

Stephen reported that on KASAN builds he's seeing:

vmlinux.o: warning: objtool: user_exc_vmm_communication+0x15a: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: exc_debug_user+0x182: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: exc_int3+0x123: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: noist_exc_machine_check+0x17a: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: fred_exc_machine_check+0x17e: call to __kasan_check_read() leaves .noinstr.text section

This turns out to be atomic ops from unwind_reset_info() that have
explicit instrumentation. Place unwind_reset_info() in the preceding
instrumentation_begin() section.

Fixes: c6439bfaabf2 ("Merge tag 'trace-deferred-unwind-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251105100014.GY4068168@noisy.programming.kicks-ass.net
---
 include/linux/irq-entry-common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d643c7c87822..ba1ed42f8a1c 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -253,11 +253,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
 static __always_inline void exit_to_user_mode(void)
 {
 	instrumentation_begin();
+	unwind_reset_info();
 	trace_hardirqs_on_prepare();
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
-	unwind_reset_info();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
-- 
cgit v1.2.3


From 8637fa89e678422995301ddb20b74190dffcccee Mon Sep 17 00:00:00 2001
From: Yongpeng Yang <yangyongpeng@xiaomi.com>
Date: Tue, 4 Nov 2025 20:50:10 +0800
Subject: block: add __must_check attribute to sb_min_blocksize()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When sb_min_blocksize() returns 0 and the return value is not checked,
it may lead to a situation where sb->s_blocksize is 0 when
accessing the filesystem super block. After commit a64e5a596067bd
("bdev: add back PAGE_SIZE block size validation for
sb_set_blocksize()"), this becomes more likely to happen when the
block device’s logical_block_size is larger than PAGE_SIZE and the
filesystem is unformatted. Add the __must_check attribute to ensure
callers always check the return value.

Cc: stable@vger.kernel.org # v6.15
Suggested-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
Link: https://patch.msgid.link/20251104125009.2111925-6-yangyongpeng.storage@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c       | 2 +-
 include/linux/fs.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 810707cca970..638f0cd458ae 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -231,7 +231,7 @@ int sb_set_blocksize(struct super_block *sb, int size)
 
 EXPORT_SYMBOL(sb_set_blocksize);
 
-int sb_min_blocksize(struct super_block *sb, int size)
+int __must_check sb_min_blocksize(struct super_block *sb, int size)
 {
 	int minsize = bdev_logical_block_size(sb->s_bdev);
 	if (size < minsize)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..3ea98c6cce81 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3423,8 +3423,8 @@ static inline void remove_inode_hash(struct inode *inode)
 extern void inode_sb_list_add(struct inode *inode);
 extern void inode_add_lru(struct inode *inode);
 
-extern int sb_set_blocksize(struct super_block *, int);
-extern int sb_min_blocksize(struct super_block *, int);
+int sb_set_blocksize(struct super_block *sb, int size);
+int __must_check sb_min_blocksize(struct super_block *sb, int size);
 
 int generic_file_mmap(struct file *, struct vm_area_struct *);
 int generic_file_mmap_prepare(struct vm_area_desc *desc);
-- 
cgit v1.2.3


From ae83f3b72621bd3187eb7956c7c2993a97d4b187 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Thu, 9 Oct 2025 20:06:09 -0700
Subject: module: Add compile-time check for embedded NUL characters

Long ago, the kernel module license checks were bypassed by embedding a
NUL character in the MODULE_LICENSE() string[1]. By using a string like
"GPL\0proprietary text", the kernel would only read "GPL" due to C string
termination at the NUL byte, allowing proprietary modules to avoid kernel
tainting and access GPL-only symbols.

The MODULE_INFO() macro stores these strings in the .modinfo ELF
section, and get_next_modinfo() uses strcmp()-family functions
which stop at the first NUL. This split the embedded string into two
separate .modinfo entries, with only the first part being processed by
license_is_gpl_compatible().

Add a compile-time check using static_assert that compares the full
string length (sizeof - 1) against __builtin_strlen(), which stops at
the first NUL. If they differ, compilation fails with a clear error
message.

While this check can still be circumvented by modifying the ELF binary
post-compilation, it prevents accidental embedded NULs and forces
intentional abuse to require deliberate binary manipulation rather than
simple source-level tricks.

Build tested with test modules containing both valid and invalid license
strings. The check correctly rejects:

    MODULE_LICENSE("GPL\0proprietary")

while accepting normal declarations:

    MODULE_LICENSE("GPL")

Link: https://lwn.net/Articles/82305/ [1]
Suggested-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Aaron Tomlin <atomlin@atomlin.com>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/moduleparam.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6907aedc4f74..915f32f7d888 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -26,6 +26,9 @@
 
 /* Generic info of form tag = "info" */
 #define MODULE_INFO(tag, info)					  \
+	static_assert(						  \
+		sizeof(info) - 1 == __builtin_strlen(info),	  \
+		"MODULE_INFO(" #tag ", ...) contains embedded NUL byte"); \
 	static const char __UNIQUE_ID(modinfo)[]			  \
 		__used __section(".modinfo") __aligned(1)		  \
 		= __MODULE_INFO_PREFIX __stringify(tag) "=" info
-- 
cgit v1.2.3


From 3c36965df80801344850388592e95033eceea05b Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Mon, 27 Oct 2025 12:05:22 +0100
Subject: regulator: Add support for MediaTek MT6363 SPMI PMIC Regulators

Add a driver for the regulators found on the MT6363 PMIC, fully
controlled by SPMI interface.
This PMIC regulates voltage with an input range of 2.6-5.0V, and
features 10 buck converters and 26 LDOs.

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://patch.msgid.link/20251027110527.21002-5-angelogioacchino.delregno@collabora.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig                  |  10 +
 drivers/regulator/Makefile                 |   1 +
 drivers/regulator/mt6363-regulator.c       | 938 +++++++++++++++++++++++++++++
 include/linux/regulator/mt6363-regulator.h | 330 ++++++++++
 4 files changed, 1279 insertions(+)
 create mode 100644 drivers/regulator/mt6363-regulator.c
 create mode 100644 include/linux/regulator/mt6363-regulator.h

(limited to 'include')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 961b1a946346..99f2fdc62eee 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -954,6 +954,16 @@ config REGULATOR_MT6360
 	  2-channel buck with Thermal Shutdown and Overload Protection
 	  6-channel High PSRR and Low Dropout LDO.
 
+config REGULATOR_MT6363
+	tristate "MT6363 SPMI PMIC regulator driver"
+	depends on SPMI
+	select REGMAP_SPMI
+	help
+	  Say Y here to enable support for regulators found in the MediaTek
+	  MT6363 SPMI PMIC.
+	  This driver supports the control of different power rails of device
+	  through regulator interface.
+
 config REGULATOR_MT6370
 	tristate "MT6370 SubPMIC Regulator"
 	depends on MFD_MT6370
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index d5c3da9ecdb4..8f2750d3df33 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_REGULATOR_MT6357)	+= mt6357-regulator.o
 obj-$(CONFIG_REGULATOR_MT6358)	+= mt6358-regulator.o
 obj-$(CONFIG_REGULATOR_MT6359)	+= mt6359-regulator.o
 obj-$(CONFIG_REGULATOR_MT6360) += mt6360-regulator.o
+obj-$(CONFIG_REGULATOR_MT6363) += mt6363-regulator.o
 obj-$(CONFIG_REGULATOR_MT6370) += mt6370-regulator.o
 obj-$(CONFIG_REGULATOR_MT6380)	+= mt6380-regulator.o
 obj-$(CONFIG_REGULATOR_MT6397)	+= mt6397-regulator.o
diff --git a/drivers/regulator/mt6363-regulator.c b/drivers/regulator/mt6363-regulator.c
new file mode 100644
index 000000000000..94ac955afa45
--- /dev/null
+++ b/drivers/regulator/mt6363-regulator.c
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2024 MediaTek Inc.
+// Copyright (c) 2025 Collabora Ltd
+//                    AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/devm-helpers.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/spmi.h>
+
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/mt6363-regulator.h>
+#include <linux/regulator/of_regulator.h>
+
+#define MT6363_REGULATOR_MODE_NORMAL	0
+#define MT6363_REGULATOR_MODE_FCCM	1
+#define MT6363_REGULATOR_MODE_LP	2
+#define MT6363_REGULATOR_MODE_ULP	3
+
+#define EN_SET_OFFSET			0x1
+#define EN_CLR_OFFSET			0x2
+#define OP_CFG_OFFSET			0x5
+
+#define NORMAL_OP_CFG			0x10
+#define NORMAL_OP_EN			0x800000
+
+#define OC_IRQ_ENABLE_DELAY_MS		10
+
+/* Unlock keys for TMA and BUCK_TOP */
+#define MT6363_TMA_UNLOCK_VALUE		0x9c9c
+#define MT6363_BUCK_TOP_UNLOCK_VALUE	0x5543
+
+enum {
+	MT6363_ID_VBUCK1,
+	MT6363_ID_VBUCK2,
+	MT6363_ID_VBUCK3,
+	MT6363_ID_VBUCK4,
+	MT6363_ID_VBUCK5,
+	MT6363_ID_VBUCK6,
+	MT6363_ID_VBUCK7,
+	MT6363_ID_VS1,
+	MT6363_ID_VS2,
+	MT6363_ID_VS3,
+	MT6363_ID_VA12_1,
+	MT6363_ID_VA12_2,
+	MT6363_ID_VA15,
+	MT6363_ID_VAUX18,
+	MT6363_ID_VCN13,
+	MT6363_ID_VCN15,
+	MT6363_ID_VEMC,
+	MT6363_ID_VIO075,
+	MT6363_ID_VIO18,
+	MT6363_ID_VM18,
+	MT6363_ID_VSRAM_APU,
+	MT6363_ID_VSRAM_CPUB,
+	MT6363_ID_VSRAM_CPUM,
+	MT6363_ID_VSRAM_CPUL,
+	MT6363_ID_VSRAM_DIGRF,
+	MT6363_ID_VSRAM_MDFE,
+	MT6363_ID_VSRAM_MODEM,
+	MT6363_ID_VRF09,
+	MT6363_ID_VRF12,
+	MT6363_ID_VRF13,
+	MT6363_ID_VRF18,
+	MT6363_ID_VRFIO18,
+	MT6363_ID_VTREF18,
+	MT6363_ID_VUFS12,
+	MT6363_ID_VUFS18,
+};
+
+/**
+ * struct mt6363_regulator_info - MT6363 regulators information
+ * @desc: Regulator description structure
+ * @lp_mode_reg: Low Power mode register (normal/idle)
+ * @lp_mode_mask: Low Power mode regulator mask
+ * @hw_lp_mode_reg: Hardware voted Low Power mode register (normal/idle)
+ * @hw_lp_mode_mask: Hardware voted Low Power mode regulator mask
+ * @modeset_reg: AUTO/PWM mode register
+ * @modeset_mask: AUTO/PWM regulator mask
+ * @lp_imax_uA: Maximum load current (microamps), for Low Power mode only
+ * @op_en_reg: Operation mode enablement register
+ * @orig_op_en: Backup of a regulator's operation mode enablement register
+ * @orig_op_cfg: Backup of a regulator's operation mode configuration register
+ * @oc_work: Delayed work for enabling overcurrent IRQ
+ * @hwirq: PMIC-Internal HW Interrupt for overcurrent event
+ * @virq: Mapped Interrupt for overcurrent event
+ */
+struct mt6363_regulator_info {
+	struct regulator_desc desc;
+	u16 lp_mode_reg;
+	u16 lp_mode_mask;
+	u16 hw_lp_mode_reg;
+	u16 hw_lp_mode_mask;
+	u16 modeset_reg;
+	u16 modeset_mask;
+	int lp_imax_uA;
+	u16 op_en_reg;
+	u32 orig_op_en;
+	u8 orig_op_cfg;
+	struct delayed_work oc_work;
+	u8 hwirq;
+	int virq;
+};
+
+#define MT6363_BUCK(match, vreg, min, max, step, en_reg, lp_reg,	\
+		    mset_reg, ocp_intn)					\
+[MT6363_ID_##vreg] = {							\
+	.desc = {							\
+		.name = match,						\
+		.supply_name = "vsys-"match,				\
+		.of_match = of_match_ptr(match),			\
+		.ops = &mt6363_vreg_setclr_ops,				\
+		.type = REGULATOR_VOLTAGE,				\
+		.id = MT6363_ID_##vreg,					\
+		.owner = THIS_MODULE,					\
+		.n_voltages = (max - min) / step + 1,			\
+		.min_uV = min,						\
+		.uV_step = step,					\
+		.enable_reg = en_reg,					\
+		.enable_mask = BIT(MT6363_RG_BUCK_##vreg##_EN_BIT),	\
+		.vsel_reg = MT6363_RG_BUCK_##vreg##_VOSEL_ADDR,		\
+		.vsel_mask = MT6363_RG_BUCK_##vreg##_VOSEL_MASK,	\
+		.of_map_mode = mt6363_map_mode,				\
+	},								\
+	.lp_mode_reg = lp_reg,						\
+	.lp_mode_mask = BIT(MT6363_RG_BUCK_##vreg##_LP_BIT),		\
+	.hw_lp_mode_reg = MT6363_BUCK_##vreg##_HW_LP_MODE,		\
+	.hw_lp_mode_mask = 0xc,						\
+	.modeset_reg = mset_reg,					\
+	.modeset_mask = BIT(MT6363_RG_##vreg##_FCCM_BIT),		\
+	.lp_imax_uA = 100000,						\
+	.op_en_reg = MT6363_BUCK_##vreg##_OP_EN_0,			\
+	.hwirq = ocp_intn,						\
+}
+
+#define MT6363_LDO_LINEAR_OPS(match, vreg, in_sup, vops, min, max,	\
+			      step, buck_reg, ocp_intn)			\
+[MT6363_ID_##vreg] = {							\
+	.desc = {							\
+		.name = match,						\
+		.supply_name = in_sup,					\
+		.of_match = of_match_ptr(match),			\
+		.ops = &vops,						\
+		.type = REGULATOR_VOLTAGE,				\
+		.id = MT6363_ID_##vreg,					\
+		.owner = THIS_MODULE,					\
+		.n_voltages = (max - min) / step + 1,			\
+		.min_uV = min,						\
+		.uV_step = step,					\
+		.enable_reg = MT6363_RG_##buck_reg##_EN_ADDR,		\
+		.enable_mask = BIT(MT6363_RG_LDO_##vreg##_EN_BIT),	\
+		.vsel_reg = MT6363_RG_LDO_##vreg##_VOSEL_ADDR,		\
+		.vsel_mask = MT6363_RG_LDO_##vreg##_VOSEL_MASK,		\
+		.of_map_mode = mt6363_map_mode,				\
+	},								\
+	.lp_mode_reg = MT6363_RG_##buck_reg##_LP_ADDR,			\
+	.lp_mode_mask = BIT(MT6363_RG_LDO_##vreg##_LP_BIT),		\
+	.hw_lp_mode_reg = MT6363_LDO_##vreg##_HW_LP_MODE,		\
+	.hw_lp_mode_mask = 0x4,						\
+	.hwirq = ocp_intn,						\
+}
+
+#define MT6363_LDO_L_SC(match, vreg, inp, min, max, step, buck_reg,	\
+			ocp_intn)					\
+	MT6363_LDO_LINEAR_OPS(match, vreg, inp, mt6363_vreg_setclr_ops,	\
+			      min, max, step, buck_reg, ocp_intn)
+
+#define MT6363_LDO_L(match, vreg, inp, min, max, step, buck_reg,	\
+		     ocp_intn)						\
+	MT6363_LDO_LINEAR_OPS(match, vreg, inp, mt6363_ldo_linear_ops,	\
+			      min, max, step, buck_reg, ocp_intn)
+
+#define MT6363_LDO_LINEAR_CAL_OPS(match, vreg, in_sup, vops, vrnum,	\
+				  ocp_intn)				\
+[MT6363_ID_##vreg] = {							\
+	.desc = {							\
+		.name = match,						\
+		.supply_name = in_sup,					\
+		.of_match = of_match_ptr(match),			\
+		.ops = &vops,						\
+		.type = REGULATOR_VOLTAGE,				\
+		.id = MT6363_ID_##vreg,					\
+		.owner = THIS_MODULE,					\
+		.n_voltages = ARRAY_SIZE(ldo_volt_ranges##vrnum) * 11,	\
+		.linear_ranges = ldo_volt_ranges##vrnum,		\
+		.n_linear_ranges = ARRAY_SIZE(ldo_volt_ranges##vrnum),	\
+		.linear_range_selectors_bitfield = ldos_cal_selectors,	\
+		.enable_reg = MT6363_RG_LDO_##vreg##_ADDR,		\
+		.enable_mask = BIT(MT6363_RG_LDO_##vreg##_EN_BIT),	\
+		.vsel_reg = MT6363_RG_##vreg##_VOCAL_ADDR,		\
+		.vsel_mask = MT6363_RG_##vreg##_VOCAL_MASK,		\
+		.vsel_range_reg = MT6363_RG_##vreg##_VOSEL_ADDR,	\
+		.vsel_range_mask = MT6363_RG_##vreg##_VOSEL_MASK,	\
+		.of_map_mode = mt6363_map_mode,				\
+	},								\
+	.lp_mode_reg = MT6363_RG_LDO_##vreg##_ADDR,			\
+	.lp_mode_mask = BIT(MT6363_RG_LDO_##vreg##_LP_BIT),		\
+	.hw_lp_mode_reg = MT6363_LDO_##vreg##_HW_LP_MODE,		\
+	.hw_lp_mode_mask = 0x4,						\
+	.lp_imax_uA = 10000,						\
+	.op_en_reg = MT6363_LDO_##vreg##_OP_EN0,			\
+	.hwirq = ocp_intn,						\
+}
+
+#define MT6363_LDO_VT(match, vreg, inp, vranges_num, ocp_intn)		\
+	MT6363_LDO_LINEAR_CAL_OPS(match, vreg, inp, mt6363_ldo_vtable_ops,\
+				  vranges_num, ocp_intn)
+
+static const unsigned int ldos_cal_selectors[] = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static const struct linear_range ldo_volt_ranges0[] = {
+	REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1500000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2500000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2600000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3400000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3500000, 0, 10, 10000)
+};
+
+static const struct linear_range ldo_volt_ranges1[] = {
+	REGULATOR_LINEAR_RANGE(900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1810000, 0, 10, 10000)
+};
+
+static const struct linear_range ldo_volt_ranges2[] = {
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2400000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2500000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2600000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(3300000, 0, 10, 10000)
+};
+
+static const struct linear_range ldo_volt_ranges3[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1400000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1500000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1600000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(2100000, 0, 10, 10000)
+};
+
+static const struct linear_range ldo_volt_ranges4[] = {
+	REGULATOR_LINEAR_RANGE(550000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(600000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(650000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(700000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(750000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(800000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(900000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(950000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1000000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1050000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1100000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1150000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1700000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1750000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(1850000, 0, 10, 5000)
+};
+
+static const struct linear_range ldo_volt_ranges5[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(650000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(700000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(750000, 0, 10, 5000),
+	REGULATOR_LINEAR_RANGE(800000, 0, 10, 5000)
+};
+
+static int mt6363_vreg_enable_setclr(struct regulator_dev *rdev)
+{
+	return regmap_write(rdev->regmap, rdev->desc->enable_reg + EN_SET_OFFSET,
+			    rdev->desc->enable_mask);
+}
+
+static int mt6363_vreg_disable_setclr(struct regulator_dev *rdev)
+{
+	return regmap_write(rdev->regmap, rdev->desc->enable_reg + EN_CLR_OFFSET,
+			    rdev->desc->enable_mask);
+}
+
+static inline unsigned int mt6363_map_mode(unsigned int mode)
+{
+	switch (mode) {
+	case MT6363_REGULATOR_MODE_NORMAL:
+		return REGULATOR_MODE_NORMAL;
+	case MT6363_REGULATOR_MODE_FCCM:
+		return REGULATOR_MODE_FAST;
+	case MT6363_REGULATOR_MODE_LP:
+		return REGULATOR_MODE_IDLE;
+	case MT6363_REGULATOR_MODE_ULP:
+		return REGULATOR_MODE_STANDBY;
+	default:
+		return REGULATOR_MODE_INVALID;
+	}
+}
+
+static unsigned int mt6363_regulator_get_mode(struct regulator_dev *rdev)
+{
+	struct mt6363_regulator_info *info = rdev_get_drvdata(rdev);
+	unsigned int val;
+	int ret;
+
+	/* Regulators without a modeset register cannot be in FAST mode */
+	if (info->modeset_reg) {
+		ret = regmap_read(rdev->regmap, info->modeset_reg, &val);
+		if (ret) {
+			dev_err(&rdev->dev, "Failed to get mt6363 mode: %d\n", ret);
+			return ret;
+		}
+
+		if (val & info->modeset_mask)
+			return REGULATOR_MODE_FAST;
+	}
+
+	/*
+	 * Not FAST: the hardware voted low power status tells IDLE from NORMAL.
+	 * Only look at val once the read is known to have succeeded.
+	 */
+	ret = regmap_read(rdev->regmap, info->hw_lp_mode_reg, &val);
+	if (ret) {
+		dev_err(&rdev->dev, "Failed to get lp mode: %d\n", ret);
+		return ret;
+	}
+
+	if (val & info->hw_lp_mode_mask)
+		return REGULATOR_MODE_IDLE;
+	return REGULATOR_MODE_NORMAL;
+}
+
+static int mt6363_buck_unlock(struct regmap *map, bool unlock)
+{
+	u16 buf = unlock ? MT6363_BUCK_TOP_UNLOCK_VALUE : 0;
+
+	return regmap_bulk_write(map, MT6363_BUCK_TOP_KEY_PROT_LO, &buf, sizeof(buf));
+}
+
+static int mt6363_regulator_set_mode(struct regulator_dev *rdev,
+				     unsigned int mode)
+{
+	struct mt6363_regulator_info *info = rdev_get_drvdata(rdev);
+	struct regmap *regmap = rdev->regmap;
+	int cur_mode, ret;
+
+	if (!info->modeset_reg && mode == REGULATOR_MODE_FAST)
+		return -EOPNOTSUPP;
+
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		ret = mt6363_buck_unlock(regmap, true);
+		if (ret)
+			break;
+
+		ret = regmap_set_bits(regmap, info->modeset_reg, info->modeset_mask);
+
+		mt6363_buck_unlock(regmap, false);
+		break;
+	case REGULATOR_MODE_NORMAL:
+		cur_mode = mt6363_regulator_get_mode(rdev);
+		if (cur_mode < 0) {
+			ret = cur_mode;
+			break;
+		}
+
+		if (cur_mode == REGULATOR_MODE_FAST) {
+			ret = mt6363_buck_unlock(regmap, true);
+			if (ret)
+				break;
+
+			ret = regmap_clear_bits(regmap, info->modeset_reg, info->modeset_mask);
+
+			mt6363_buck_unlock(regmap, false);
+			break;
+		} else if (cur_mode == REGULATOR_MODE_IDLE) {
+			ret = regmap_clear_bits(regmap, info->lp_mode_reg, info->lp_mode_mask);
+			if (ret == 0)
+				usleep_range(100, 200);
+		} else {
+			ret = 0;
+		}
+		break;
+	case REGULATOR_MODE_IDLE:
+		ret = regmap_set_bits(regmap, info->lp_mode_reg, info->lp_mode_mask);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		dev_err(&rdev->dev, "Failed to set mode %u: %d\n", mode, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int mt6363_regulator_set_load(struct regulator_dev *rdev, int load_uA)
+{
+	struct mt6363_regulator_info *info = rdev_get_drvdata(rdev);
+	unsigned int opmode_cfg, opmode_en;
+	int i, ret;
+
+	if (!info->lp_imax_uA)
+		return -EINVAL;
+
+	if (load_uA >= info->lp_imax_uA) {
+		ret = mt6363_regulator_set_mode(rdev, REGULATOR_MODE_NORMAL);
+		if (ret)
+			return ret;
+
+		opmode_cfg = NORMAL_OP_CFG;
+		opmode_en = NORMAL_OP_EN;
+	} else {
+		opmode_cfg = info->orig_op_cfg;
+		opmode_en = info->orig_op_en;
+	}
+
+	ret = regmap_write(rdev->regmap, info->op_en_reg + OP_CFG_OFFSET, opmode_cfg);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 3; i++) {
+		ret = regmap_write(rdev->regmap, info->op_en_reg + i,
+				   (opmode_en >> (i * 8)) & GENMASK(7, 0));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* Set the VEMC selector: locate the range holding sel, then program range and vsel */
+static int mt6363_vemc_set_voltage_sel(struct regulator_dev *rdev, unsigned int sel)
+{
+	const u16 tma_unlock_key = MT6363_TMA_UNLOCK_VALUE;
+	const struct regulator_desc *rdesc = rdev->desc;
+	struct regmap *regmap = rdev->regmap;
+	unsigned int range, val;
+	int i, ret;
+	u16 mask;
+
+	for (i = 0; i < rdesc->n_linear_ranges; i++) {
+		const struct linear_range *r = &rdesc->linear_ranges[i];
+		unsigned int voltages_in_range = linear_range_values_in_range(r);
+
+		if (sel < voltages_in_range)
+			break;
+		sel -= voltages_in_range;
+	}
+
+	if (i == rdesc->n_linear_ranges)
+		return -EINVAL;
+
+	ret = regmap_read(rdev->regmap, MT6363_TOP_TRAP, &val);
+	if (ret)
+		return ret;
+
+	if (val > 1)
+		return -EINVAL;
+
+	/* Unlock TMA for writing */
+	ret = regmap_bulk_write(rdev->regmap, MT6363_TOP_TMA_KEY_L,
+				&tma_unlock_key, sizeof(tma_unlock_key));
+	if (ret)
+		return ret;
+
+	/* If HW trapping value is 1, use VEMC_VOSEL_1 instead of VEMC_VOSEL_0 */
+	if (val == 1) {
+		mask = MT6363_RG_VEMC_VOSEL_1_MASK;
+		sel = FIELD_PREP(MT6363_RG_VEMC_VOSEL_1_MASK, sel);
+	} else {
+		mask = rdesc->vsel_mask;
+	}
+
+	sel <<= ffs(rdesc->vsel_mask) - 1; /* NOTE(review): runs for trap == 1 too - confirm */
+	sel += rdesc->linear_ranges[i].min_sel;
+
+	range = rdesc->linear_range_selectors_bitfield[i];
+	range <<= ffs(rdesc->vsel_range_mask) - 1;
+
+	/* Write to the vreg calibration register for voltage finetuning */
+	ret = regmap_update_bits(regmap, rdesc->vsel_range_reg,
+				 rdesc->vsel_range_mask, range);
+	if (ret)
+		goto lock_tma;
+
+	ret = regmap_update_bits(regmap, rdesc->vsel_reg, mask, sel);
+
+lock_tma:
+	/* Unconditionally re-lock TMA; ret keeps the result of the writes above */
+	val = 0;
+	regmap_bulk_write(rdev->regmap, MT6363_TOP_TMA_KEY_L, &val, 2);
+
+	return ret;
+}
+
+/* Return the VEMC selector: vsel_range_reg picks the range, TOP_TRAP the VOSEL field */
+static int mt6363_vemc_get_voltage_sel(struct regulator_dev *rdev)
+{
+	const struct regulator_desc *rdesc = rdev->desc;
+	unsigned int vosel, trap, calsel;
+	int vcal, vsel, range, ret;
+
+	ret = regmap_read(rdev->regmap, rdesc->vsel_reg, &vosel);
+	if (ret)
+		return ret;
+
+	ret = regmap_read(rdev->regmap, rdesc->vsel_range_reg, &calsel);
+	if (ret)
+		return ret;
+
+	/* Stop at the range whose selector matches the masked range field */
+	calsel &= rdesc->vsel_range_mask;
+	for (range = 0; range < rdesc->n_linear_ranges; range++)
+		if (rdesc->linear_range_selectors_bitfield[range] == calsel)
+			break;
+	if (range == rdesc->n_linear_ranges)
+		return -EINVAL;
+
+	ret = regmap_read(rdev->regmap, MT6363_TOP_TRAP, &trap);
+	if (ret)
+		return ret;
+
+	/* If HW trapping value is 1, use VEMC_VOSEL_1 instead of VEMC_VOSEL_0 */
+	if (trap > 1)
+		return -EINVAL;
+	else if (trap == 1)
+		vsel = FIELD_GET(MT6363_RG_VEMC_VOSEL_1_MASK, vosel);
+	else
+		vsel = vosel & rdesc->vsel_mask;
+
+	vcal = linear_range_values_in_range_array(rdesc->linear_ranges, range);
+	return vsel + vcal;
+}
+
+/* Set VA15 via the pickable helper, then write sel to BUCK_EFUSE_RSV1 as well */
+static int mt6363_va15_set_voltage_sel(struct regulator_dev *rdev, unsigned int sel)
+{
+	struct regmap *regmap = rdev->regmap;
+	int ret;
+
+	ret = mt6363_buck_unlock(regmap, true);
+	if (ret)
+		return ret;
+
+	ret = regulator_set_voltage_sel_pickable_regmap(rdev, sel);
+	if (ret)
+		goto va15_unlock;
+
+	/* NOTE(review): sel is passed unshifted against the 7:4 mask - confirm */
+	ret = regmap_update_bits(regmap, MT6363_RG_BUCK_EFUSE_RSV1,
+				 MT6363_RG_BUCK_EFUSE_RSV1_MASK, sel);
+
+va15_unlock:
+	mt6363_buck_unlock(regmap, false);
+	return ret;
+}
+
+/* Delayed work: re-enable the over-current IRQ that mt6363_oc_isr() disabled */
+static void mt6363_oc_irq_enable_work(struct work_struct *work)
+{
+	struct mt6363_regulator_info *info;
+
+	info = container_of(to_delayed_work(work), struct mt6363_regulator_info, oc_work);
+	enable_irq(info->virq);
+}
+
+/* OCP IRQ thread: disable the IRQ, notify if enabled, schedule its delayed re-enable */
+static irqreturn_t mt6363_oc_isr(int irq, void *data)
+{
+	struct regulator_dev *rdev = data;
+	struct mt6363_regulator_info *info = rdev_get_drvdata(rdev);
+
+	disable_irq_nosync(info->virq);
+	if (regulator_is_enabled_regmap(rdev))
+		regulator_notifier_call_chain(rdev, REGULATOR_EVENT_OVER_CURRENT, NULL);
+
+	schedule_delayed_work(&info->oc_work, msecs_to_jiffies(OC_IRQ_ENABLE_DELAY_MS));
+
+	return IRQ_HANDLED;
+}
+
+/* Enable over-current protection by requesting the regulator's OCP interrupt */
+static int mt6363_set_ocp(struct regulator_dev *rdev, int lim, int severity, bool enable)
+{
+	struct mt6363_regulator_info *info = rdev_get_drvdata(rdev);
+	bool prot_only = !lim && severity == REGULATOR_SEVERITY_PROT && enable;
+
+	/*
+	 * Only enabling protection without a limit is supported, and only with an OCP IRQ
+	 */
+	if (!prot_only || info->virq <= 0)
+		return -EOPNOTSUPP;
+
+	return devm_request_threaded_irq(&rdev->dev, info->virq, NULL,
+					 mt6363_oc_isr, IRQF_ONESHOT,
+					 info->desc.name, rdev);
+}
+
+static const struct regulator_ops mt6363_vreg_setclr_ops = {
+	.list_voltage = regulator_list_voltage_linear,
+	.map_voltage = regulator_map_voltage_linear,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = mt6363_vreg_enable_setclr,
+	.disable = mt6363_vreg_disable_setclr,
+	.is_enabled = regulator_is_enabled_regmap,
+	.set_mode = mt6363_regulator_set_mode,
+	.get_mode = mt6363_regulator_get_mode,
+	.set_load = mt6363_regulator_set_load,
+	.set_over_current_protection = mt6363_set_ocp,
+};
+
+static const struct regulator_ops mt6363_ldo_linear_ops = {
+	.list_voltage = regulator_list_voltage_linear,
+	.map_voltage = regulator_map_voltage_linear,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.set_mode = mt6363_regulator_set_mode,
+	.get_mode = mt6363_regulator_get_mode,
+	.set_over_current_protection = mt6363_set_ocp,
+};
+
+static const struct regulator_ops mt6363_ldo_vtable_ops = {
+	.list_voltage = regulator_list_voltage_pickable_linear_range,
+	.map_voltage = regulator_map_voltage_pickable_linear_range,
+	.set_voltage_sel = regulator_set_voltage_sel_pickable_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_pickable_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.set_mode = mt6363_regulator_set_mode,
+	.get_mode = mt6363_regulator_get_mode,
+	.set_load = mt6363_regulator_set_load,
+	.set_over_current_protection = mt6363_set_ocp,
+};
+
+static const struct regulator_ops mt6363_ldo_vemc_ops = {
+	.list_voltage = regulator_list_voltage_pickable_linear_range,
+	.map_voltage = regulator_map_voltage_pickable_linear_range,
+	.set_voltage_sel = mt6363_vemc_set_voltage_sel,
+	.get_voltage_sel = mt6363_vemc_get_voltage_sel,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.set_mode = mt6363_regulator_set_mode,
+	.get_mode = mt6363_regulator_get_mode,
+	.set_load = mt6363_regulator_set_load,
+	.set_over_current_protection = mt6363_set_ocp,
+};
+
+static const struct regulator_ops mt6363_ldo_va15_ops = {
+	.list_voltage = regulator_list_voltage_pickable_linear_range,
+	.map_voltage = regulator_map_voltage_pickable_linear_range,
+	.set_voltage_sel = mt6363_va15_set_voltage_sel,
+	.get_voltage_sel = regulator_get_voltage_sel_pickable_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.set_mode = mt6363_regulator_set_mode,
+	.get_mode = mt6363_regulator_get_mode,
+	.set_load = mt6363_regulator_set_load,
+	.set_over_current_protection = mt6363_set_ocp,
+};
+
+/* The array is indexed by id(MT6363_ID_XXX) */
+static struct mt6363_regulator_info mt6363_regulators[] = {
+	MT6363_BUCK("vbuck1", VBUCK1, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 1),
+	MT6363_BUCK("vbuck2", VBUCK2, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 2),
+	MT6363_BUCK("vbuck3", VBUCK3, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 3),
+	MT6363_BUCK("vbuck4", VBUCK4, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 4),
+	MT6363_BUCK("vbuck5", VBUCK5, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 5),
+	MT6363_BUCK("vbuck6", VBUCK6, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 6),
+	MT6363_BUCK("vbuck7", VBUCK7, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 7),
+	MT6363_BUCK("vs1", VS1, 0, 2200000, 12500, MT6363_RG_BUCK1_EN_ADDR,
+		    MT6363_RG_BUCK1_LP_ADDR, MT6363_RG_VS1_FCCM_ADDR, 8),
+	MT6363_BUCK("vs2", VS2, 0, 1600000, 12500, MT6363_RG_BUCK0_EN_ADDR,
+		    MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 0),
+	MT6363_BUCK("vs3", VS3, 0, 1193750, 6250, MT6363_RG_BUCK1_EN_ADDR,
+		    MT6363_RG_BUCK1_LP_ADDR, MT6363_RG_VS3_FCCM_ADDR, 9),
+	MT6363_LDO_VT("va12-1", VA12_1, "vs2-ldo2", 3, 37),
+	MT6363_LDO_VT("va12-2", VA12_2, "vs2-ldo2", 3, 38),
+	MT6363_LDO_LINEAR_CAL_OPS("va15", VA15, "vs1-ldo1", mt6363_ldo_va15_ops, 3, 39),
+	MT6363_LDO_VT("vaux18", VAUX18, "vsys-ldo1", 2, 31),
+	MT6363_LDO_VT("vcn13", VCN13, "vs2-ldo2", 1, 17),
+	MT6363_LDO_VT("vcn15", VCN15, "vs1-ldo2", 3, 16),
+	MT6363_LDO_LINEAR_CAL_OPS("vemc", VEMC, "vsys-ldo1", mt6363_ldo_vemc_ops, 0, 32),
+	MT6363_LDO_VT("vio0p75", VIO075, "vs1-ldo1", 5, 36),
+	MT6363_LDO_VT("vio18", VIO18, "vs1-ldo2", 3, 35),
+	MT6363_LDO_VT("vm18", VM18, "vs1-ldo1", 4, 40),
+	MT6363_LDO_L("vsram-apu", VSRAM_APU, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 30),
+	MT6363_LDO_L("vsram-cpub", VSRAM_CPUB, "vs2-ldo1", 400000, 1193750, 6250, BUCK1, 27),
+	MT6363_LDO_L("vsram-cpum", VSRAM_CPUM, "vs2-ldo1", 400000, 1193750, 6250, BUCK1, 28),
+	MT6363_LDO_L("vsram-cpul", VSRAM_CPUL, "vs2-ldo2", 400000, 1193750, 6250, BUCK1, 29),
+	MT6363_LDO_L_SC("vsram-digrf", VSRAM_DIGRF, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 23),
+	MT6363_LDO_L_SC("vsram-mdfe", VSRAM_MDFE, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 24),
+	MT6363_LDO_L_SC("vsram-modem", VSRAM_MODEM, "vs3-ldo2", 400000, 1193750, 6250, BUCK1, 25),
+	MT6363_LDO_VT("vrf0p9", VRF09, "vs3-ldo2", 1, 18),
+	MT6363_LDO_VT("vrf12", VRF12, "vs2-ldo1", 3, 19),
+	MT6363_LDO_VT("vrf13", VRF13, "vs2-ldo1", 1, 20),
+	MT6363_LDO_VT("vrf18", VRF18, "vs1-ldo1", 3, 21),
+	MT6363_LDO_VT("vrf-io18", VRFIO18, "vs1-ldo1", 3, 22),
+	MT6363_LDO_VT("vtref18", VTREF18, "vsys-ldo1", 2, 26),
+	MT6363_LDO_VT("vufs12", VUFS12, "vs2-ldo1", 4, 33),
+	MT6363_LDO_VT("vufs18", VUFS18, "vs1-ldo2", 3, 34),
+};
+
+/* Save OP_CFG and the three OP_EN registers; orig_op_en is rebuilt, never OR-ed into */
+static int mt6363_backup_op_setting(struct regmap *map, struct mt6363_regulator_info *info)
+{
+	unsigned int i, val, op_en = 0;
+	int ret;
+
+	ret = regmap_read(map, info->op_en_reg + OP_CFG_OFFSET, &val);
+	if (ret)
+		return ret;
+
+	info->orig_op_cfg = val;
+	for (i = 0; i < 3; i++) {
+		ret = regmap_read(map, info->op_en_reg + i, &val);
+		if (ret)
+			return ret;
+		op_en |= val << (i * 8);
+	}
+	info->orig_op_en = op_en;
+
+	return 0;
+}
+
+/* devm action: dispose of the IRQ mapping whose virq is stored at @data */
+static void mt6363_irq_remove(void *data)
+{
+	int *virq = data;
+
+	irq_dispose_mapping(*virq);
+}
+
+/* devm action: remove the SPMI device passed as @data */
+static void mt6363_spmi_remove(void *data)
+{
+	spmi_device_remove(data);
+}
+
+/* Add an SPMI device mirroring the parent's ctrl/USID and return its regmap */
+static struct regmap *mt6363_spmi_register_regmap(struct device *dev)
+{
+	struct regmap_config mt6363_regmap_config = {
+		.reg_bits = 16,
+		.val_bits = 16,
+		.max_register = 0x1f90,
+		.fast_io = true,
+	};
+	struct spmi_device *sdev, *sparent;
+	u32 base;
+	int ret;
+
+	if (!dev->parent)
+		return ERR_PTR(-ENODEV);
+
+	ret = device_property_read_u32(dev, "reg", &base);
+	if (ret)
+		return ERR_PTR(ret);
+
+	sparent = to_spmi_device(dev->parent);
+	if (!sparent)
+		return ERR_PTR(-ENODEV);
+
+	sdev = spmi_device_alloc(sparent->ctrl);
+	if (!sdev)
+		return ERR_PTR(-ENOMEM);
+
+	sdev->usid = sparent->usid;
+	dev_set_name(&sdev->dev, "%d-%02x-regulator", sdev->ctrl->nr, sdev->usid);
+	ret = device_add(&sdev->dev);
+	if (ret) {
+		put_device(&sdev->dev);
+		return ERR_PTR(ret);
+	}
+
+	ret = devm_add_action_or_reset(dev, mt6363_spmi_remove, sdev);
+	if (ret)
+		return ERR_PTR(ret);
+
+	mt6363_regmap_config.reg_base = base;
+	return devm_regmap_init_spmi_ext(sdev, &mt6363_regmap_config);
+}
+
+/* Probe: create the regmap, then map the OCP IRQ of and register every regulator */
+static int mt6363_regulator_probe(struct platform_device *pdev)
+{
+	struct device_node *interrupt_parent;
+	struct regulator_config config = {};
+	struct mt6363_regulator_info *info;
+	struct device *dev = &pdev->dev;
+	struct regulator_dev *rdev;
+	struct irq_domain *domain;
+	struct irq_fwspec fwspec;
+	struct spmi_device *sdev;
+	int i, ret;
+
+	config.regmap = mt6363_spmi_register_regmap(dev);
+	if (IS_ERR(config.regmap))
+		return dev_err_probe(dev, PTR_ERR(config.regmap), "Cannot get regmap\n");
+	config.dev = dev;
+	sdev = to_spmi_device(dev->parent);
+
+	interrupt_parent = of_irq_find_parent(dev->of_node);
+	if (!interrupt_parent)
+		return dev_err_probe(dev, -EINVAL, "Cannot find IRQ parent\n");
+
+	domain = irq_find_host(interrupt_parent);
+	of_node_put(interrupt_parent);
+	if (!domain)
+		return dev_err_probe(dev, -EPROBE_DEFER, "Cannot find IRQ domain\n");
+
+	fwspec.fwnode = domain->fwnode;
+	fwspec.param_count = 3;
+	fwspec.param[0] = sdev->usid;
+	fwspec.param[2] = IRQ_TYPE_LEVEL_HIGH;
+
+	for (i = 0; i < ARRAY_SIZE(mt6363_regulators); i++) {
+		info = &mt6363_regulators[i];
+		fwspec.param[1] = info->hwirq;
+		info->virq = irq_create_fwspec_mapping(&fwspec);
+		if (!info->virq)
+			return dev_err_probe(dev, -EINVAL,
+					     "Failed to map IRQ%d\n", info->hwirq);
+
+		/* On failure the action has already run and disposed of the mapping */
+		ret = devm_add_action_or_reset(dev, mt6363_irq_remove, &info->virq);
+		if (ret)
+			return ret;
+
+		config.driver_data = info;
+		INIT_DELAYED_WORK(&info->oc_work, mt6363_oc_irq_enable_work);
+
+		rdev = devm_regulator_register(dev, &info->desc, &config);
+		if (IS_ERR(rdev))
+			return dev_err_probe(dev, PTR_ERR(rdev),
+					     "failed to register %s\n", info->desc.name);
+
+		if (info->lp_imax_uA) {
+			ret = mt6363_backup_op_setting(config.regmap, info);
+			if (ret) {
+				dev_warn(dev, "Failed to backup op_setting for %s\n",
+					 info->desc.name);
+				info->lp_imax_uA = 0;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static const struct of_device_id mt6363_regulator_match[] = {
+	{ .compatible = "mediatek,mt6363-regulator" },
+	{ /* sentinel */ }
+};
+
+static struct platform_driver mt6363_regulator_driver = {
+	.driver = {
+		.name = "mt6363-regulator",
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+		.of_match_table = mt6363_regulator_match,
+	},
+	.probe = mt6363_regulator_probe,
+};
+module_platform_driver(mt6363_regulator_driver);
+
+MODULE_AUTHOR("AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>");
+MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6363 PMIC");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/regulator/mt6363-regulator.h b/include/linux/regulator/mt6363-regulator.h
new file mode 100644
index 000000000000..60761f01d3ad
--- /dev/null
+++ b/include/linux/regulator/mt6363-regulator.h
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 MediaTek Inc.
+ * Copyright (c) 2025 Collabora Ltd
+ */
+
+#include <linux/bits.h>
+
+#ifndef __LINUX_REGULATOR_MT6363_H
+#define __LINUX_REGULATOR_MT6363_H
+
+/* Register */
+#define MT6363_TOP_TRAP				0x6
+#define MT6363_TOP_TMA_KEY_L			0x36e
+#define MT6363_RG_BUCK0_EN_ADDR			0x210
+#define MT6363_RG_BUCK_VS2_EN_BIT		0
+#define MT6363_RG_BUCK_VBUCK1_EN_BIT		1
+#define MT6363_RG_BUCK_VBUCK2_EN_BIT		2
+#define MT6363_RG_BUCK_VBUCK3_EN_BIT		3
+#define MT6363_RG_BUCK_VBUCK4_EN_BIT		4
+#define MT6363_RG_BUCK_VBUCK5_EN_BIT		5
+#define MT6363_RG_BUCK_VBUCK6_EN_BIT		6
+#define MT6363_RG_BUCK_VBUCK7_EN_BIT		7
+#define MT6363_RG_BUCK1_EN_ADDR			0x213
+#define MT6363_RG_BUCK_VS1_EN_BIT		0
+#define MT6363_RG_BUCK_VS3_EN_BIT		1
+#define MT6363_RG_LDO_VSRAM_DIGRF_EN_BIT	4
+#define MT6363_RG_LDO_VSRAM_MDFE_EN_BIT		5
+#define MT6363_RG_LDO_VSRAM_MODEM_EN_BIT	6
+#define MT6363_RG_BUCK0_LP_ADDR			0x216
+#define MT6363_RG_BUCK_VS2_LP_BIT		0
+#define MT6363_RG_BUCK_VBUCK1_LP_BIT		1
+#define MT6363_RG_BUCK_VBUCK2_LP_BIT		2
+#define MT6363_RG_BUCK_VBUCK3_LP_BIT		3
+#define MT6363_RG_BUCK_VBUCK4_LP_BIT		4
+#define MT6363_RG_BUCK_VBUCK5_LP_BIT		5
+#define MT6363_RG_BUCK_VBUCK6_LP_BIT		6
+#define MT6363_RG_BUCK_VBUCK7_LP_BIT		7
+#define MT6363_RG_BUCK1_LP_ADDR			0x219
+#define MT6363_RG_BUCK_VS1_LP_BIT		0
+#define MT6363_RG_BUCK_VS3_LP_BIT		1
+#define MT6363_RG_LDO_VSRAM_DIGRF_LP_BIT	4
+#define MT6363_RG_LDO_VSRAM_MDFE_LP_BIT		5
+#define MT6363_RG_LDO_VSRAM_MODEM_LP_BIT	6
+#define MT6363_RG_BUCK_VS2_VOSEL_ADDR		0x21c
+#define MT6363_RG_BUCK_VS2_VOSEL_MASK		GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK1_VOSEL_ADDR	0x21d
+#define MT6363_RG_BUCK_VBUCK1_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK2_VOSEL_ADDR	0x21e
+#define MT6363_RG_BUCK_VBUCK2_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK3_VOSEL_ADDR	0x21f
+#define MT6363_RG_BUCK_VBUCK3_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK4_VOSEL_ADDR	0x220
+#define MT6363_RG_BUCK_VBUCK4_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK5_VOSEL_ADDR	0x221
+#define MT6363_RG_BUCK_VBUCK5_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK6_VOSEL_ADDR	0x222
+#define MT6363_RG_BUCK_VBUCK6_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VBUCK7_VOSEL_ADDR	0x223
+#define MT6363_RG_BUCK_VBUCK7_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_RG_BUCK_VS1_VOSEL_ADDR		0x224
+#define MT6363_RG_BUCK_VS1_VOSEL_MASK		GENMASK(7, 0)
+#define MT6363_RG_BUCK_VS3_VOSEL_ADDR		0x225
+#define MT6363_RG_BUCK_VS3_VOSEL_MASK		GENMASK(7, 0)
+#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_ADDR	0x228
+#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_ADDR	0x229
+#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_ADDR	0x22a
+#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_BUCK_TOP_KEY_PROT_LO		0x13fa
+#define MT6363_BUCK_VS2_WDTDBG_VOSEL_ADDR	0x13fc
+#define MT6363_BUCK_VBUCK1_WDTDBG_VOSEL_ADDR	0x13fd
+#define MT6363_BUCK_VBUCK2_WDTDBG_VOSEL_ADDR	0x13fe
+#define MT6363_BUCK_VBUCK3_WDTDBG_VOSEL_ADDR	0x13ff
+#define MT6363_BUCK_VBUCK4_WDTDBG_VOSEL_ADDR	0x1400
+#define MT6363_BUCK_VBUCK5_WDTDBG_VOSEL_ADDR	0x1401
+#define MT6363_BUCK_VBUCK6_WDTDBG_VOSEL_ADDR	0x1402
+#define MT6363_BUCK_VBUCK7_WDTDBG_VOSEL_ADDR	0x1403
+#define MT6363_BUCK_VS1_WDTDBG_VOSEL_ADDR	0x1404
+#define MT6363_BUCK_VS3_WDTDBG_VOSEL_ADDR	0x1405
+#define MT6363_RG_BUCK_EFUSE_RSV1		0x1417
+#define MT6363_RG_BUCK_EFUSE_RSV1_MASK		GENMASK(7, 4)
+#define MT6363_BUCK_VS2_OP_EN_0			0x145d
+#define MT6363_BUCK_VS2_HW_LP_MODE		0x1468
+#define MT6363_BUCK_VBUCK1_OP_EN_0		0x14dd
+#define MT6363_BUCK_VBUCK1_HW_LP_MODE		0x14e8
+#define MT6363_RG_BUCK_VBUCK1_SSHUB_EN_ADDR	0x14ea
+#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_ADDR	0x14eb
+#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_BUCK_VBUCK2_OP_EN_0		0x155d
+#define MT6363_BUCK_VBUCK2_HW_LP_MODE		0x1568
+#define MT6363_RG_BUCK_VBUCK2_SSHUB_EN_ADDR	0x156a
+#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_ADDR	0x156b
+#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_BUCK_VBUCK3_OP_EN_0		0x15dd
+#define MT6363_BUCK_VBUCK3_HW_LP_MODE		0x15e8
+#define MT6363_BUCK_VBUCK4_OP_EN_0		0x165d
+#define MT6363_BUCK_VBUCK4_HW_LP_MODE		0x1668
+#define MT6363_RG_BUCK_VBUCK4_SSHUB_EN_ADDR	0x166a
+#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_ADDR	0x166b
+#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_MASK	GENMASK(7, 0)
+#define MT6363_BUCK_VBUCK5_OP_EN_0		0x16dd
+#define MT6363_BUCK_VBUCK5_HW_LP_MODE		0x16e8
+#define MT6363_BUCK_VBUCK6_OP_EN_0		0x175d
+#define MT6363_BUCK_VBUCK6_HW_LP_MODE		0x1768
+#define MT6363_BUCK_VBUCK7_OP_EN_0		0x17dd
+#define MT6363_BUCK_VBUCK7_HW_LP_MODE		0x17e8
+#define MT6363_BUCK_VS1_OP_EN_0			0x185d
+#define MT6363_BUCK_VS1_HW_LP_MODE		0x1868
+#define MT6363_BUCK_VS3_OP_EN_0			0x18dd
+#define MT6363_BUCK_VS3_HW_LP_MODE		0x18e8
+#define MT6363_RG_VS1_FCCM_ADDR			0x1964
+#define MT6363_RG_VS1_FCCM_BIT			0
+#define MT6363_RG_VS3_FCCM_ADDR			0x1973
+#define MT6363_RG_VS3_FCCM_BIT			0
+#define MT6363_RG_BUCK0_FCCM_ADDR		0x1a02
+#define MT6363_RG_VBUCK1_FCCM_BIT		0
+#define MT6363_RG_VBUCK2_FCCM_BIT		1
+#define MT6363_RG_VBUCK3_FCCM_BIT		2
+#define MT6363_RG_VS2_FCCM_BIT			3
+#define MT6363_RG_BUCK0_1_FCCM_ADDR		0x1a82
+#define MT6363_RG_VBUCK4_FCCM_BIT		0
+#define MT6363_RG_VBUCK5_FCCM_BIT		1
+#define MT6363_RG_VBUCK6_FCCM_BIT		2
+#define MT6363_RG_VBUCK7_FCCM_BIT		3
+#define MT6363_RG_VCN13_VOSEL_ADDR		0x1b0f
+#define MT6363_RG_VCN13_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VEMC_VOSEL_ADDR		0x1b10
+#define MT6363_RG_VEMC_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VEMC_VOSEL_1_MASK		GENMASK(7, 4)
+#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_ADDR	0x1b14
+#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_ADDR	0x1b15
+#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_ADDR	0x1b16
+#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_LDO_VSRAM_APU_VOSEL_ADDR	0x1b17
+#define MT6363_RG_LDO_VSRAM_APU_VOSEL_MASK	GENMASK(6, 0)
+#define MT6363_RG_VEMC_VOCAL_ADDR		0x1b1b
+#define MT6363_RG_VEMC_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_LDO_VCN15_ADDR		0x1b57
+#define MT6363_RG_LDO_VCN15_EN_BIT		0
+#define MT6363_RG_LDO_VCN15_LP_BIT		1
+#define MT6363_LDO_VCN15_HW_LP_MODE		0x1b5b
+#define MT6363_LDO_VCN15_OP_EN0			0x1b5c
+#define MT6363_RG_LDO_VRF09_ADDR		0x1b65
+#define MT6363_RG_LDO_VRF09_EN_BIT		0
+#define MT6363_RG_LDO_VRF09_LP_BIT		1
+#define MT6363_LDO_VRF09_HW_LP_MODE		0x1b69
+#define MT6363_LDO_VRF09_OP_EN0			0x1b6a
+#define MT6363_RG_LDO_VRF12_ADDR		0x1b73
+#define MT6363_RG_LDO_VRF12_EN_BIT		0
+#define MT6363_RG_LDO_VRF12_LP_BIT		1
+#define MT6363_LDO_VRF12_HW_LP_MODE		0x1b77
+#define MT6363_LDO_VRF12_OP_EN0			0x1b78
+#define MT6363_RG_LDO_VRF13_ADDR		0x1b81
+#define MT6363_RG_LDO_VRF13_EN_BIT		0
+#define MT6363_RG_LDO_VRF13_LP_BIT		1
+#define MT6363_LDO_VRF13_HW_LP_MODE		0x1b85
+#define MT6363_LDO_VRF13_OP_EN0			0x1b86
+#define MT6363_RG_LDO_VRF18_ADDR		0x1b8f
+#define MT6363_RG_LDO_VRF18_EN_BIT		0
+#define MT6363_RG_LDO_VRF18_LP_BIT		1
+#define MT6363_LDO_VRF18_HW_LP_MODE		0x1b93
+#define MT6363_LDO_VRF18_OP_EN0			0x1b94
+#define MT6363_RG_LDO_VRFIO18_ADDR		0x1b9d
+#define MT6363_RG_LDO_VRFIO18_EN_BIT		0
+#define MT6363_RG_LDO_VRFIO18_LP_BIT		1
+#define MT6363_LDO_VRFIO18_HW_LP_MODE		0x1ba1
+#define MT6363_LDO_VRFIO18_OP_EN0		0x1ba2
+#define MT6363_RG_LDO_VTREF18_ADDR		0x1bd7
+#define MT6363_RG_LDO_VTREF18_EN_BIT		0
+#define MT6363_RG_LDO_VTREF18_LP_BIT		1
+#define MT6363_LDO_VTREF18_HW_LP_MODE		0x1bdb
+#define MT6363_LDO_VTREF18_OP_EN0		0x1bdc
+#define MT6363_RG_LDO_VAUX18_ADDR		0x1be5
+#define MT6363_RG_LDO_VAUX18_EN_BIT		0
+#define MT6363_RG_LDO_VAUX18_LP_BIT		1
+#define MT6363_LDO_VAUX18_HW_LP_MODE		0x1be9
+#define MT6363_LDO_VAUX18_OP_EN0		0x1bea
+#define MT6363_RG_LDO_VEMC_ADDR			0x1bf3
+#define MT6363_RG_LDO_VEMC_EN_BIT		0
+#define MT6363_RG_LDO_VEMC_LP_BIT		1
+#define MT6363_LDO_VEMC_HW_LP_MODE		0x1bf7
+#define MT6363_LDO_VEMC_OP_EN0			0x1bf8
+#define MT6363_RG_LDO_VUFS12_ADDR		0x1c01
+#define MT6363_RG_LDO_VUFS12_EN_BIT		0
+#define MT6363_RG_LDO_VUFS12_LP_BIT		1
+#define MT6363_LDO_VUFS12_HW_LP_MODE		0x1c05
+#define MT6363_LDO_VUFS12_OP_EN0		0x1c06
+#define MT6363_RG_LDO_VUFS18_ADDR		0x1c0f
+#define MT6363_RG_LDO_VUFS18_EN_BIT		0
+#define MT6363_RG_LDO_VUFS18_LP_BIT		1
+#define MT6363_LDO_VUFS18_HW_LP_MODE		0x1c13
+#define MT6363_LDO_VUFS18_OP_EN0		0x1c14
+#define MT6363_RG_LDO_VIO18_ADDR		0x1c1d
+#define MT6363_RG_LDO_VIO18_EN_BIT		0
+#define MT6363_RG_LDO_VIO18_LP_BIT		1
+#define MT6363_LDO_VIO18_HW_LP_MODE		0x1c21
+#define MT6363_LDO_VIO18_OP_EN0			0x1c22
+#define MT6363_RG_LDO_VIO075_ADDR		0x1c57
+#define MT6363_RG_LDO_VIO075_EN_BIT		0
+#define MT6363_RG_LDO_VIO075_LP_BIT		1
+#define MT6363_LDO_VIO075_HW_LP_MODE		0x1c5b
+#define MT6363_LDO_VIO075_OP_EN0		0x1c5c
+#define MT6363_RG_LDO_VA12_1_ADDR		0x1c65
+#define MT6363_RG_LDO_VA12_1_EN_BIT		0
+#define MT6363_RG_LDO_VA12_1_LP_BIT		1
+#define MT6363_LDO_VA12_1_HW_LP_MODE		0x1c69
+#define MT6363_LDO_VA12_1_OP_EN0		0x1c6a
+#define MT6363_RG_LDO_VA12_2_ADDR		0x1c73
+#define MT6363_RG_LDO_VA12_2_EN_BIT		0
+#define MT6363_RG_LDO_VA12_2_LP_BIT		1
+#define MT6363_LDO_VA12_2_HW_LP_MODE		0x1c77
+#define MT6363_LDO_VA12_2_OP_EN0		0x1c78
+#define MT6363_RG_LDO_VA15_ADDR			0x1c81
+#define MT6363_RG_LDO_VA15_EN_BIT		0
+#define MT6363_RG_LDO_VA15_LP_BIT		1
+#define MT6363_LDO_VA15_HW_LP_MODE		0x1c85
+#define MT6363_LDO_VA15_OP_EN0			0x1c86
+#define MT6363_RG_LDO_VM18_ADDR			0x1c8f
+#define MT6363_RG_LDO_VM18_EN_BIT		0
+#define MT6363_RG_LDO_VM18_LP_BIT		1
+#define MT6363_LDO_VM18_HW_LP_MODE		0x1c93
+#define MT6363_LDO_VM18_OP_EN0			0x1c94
+#define MT6363_RG_LDO_VCN13_ADDR		0x1cd7
+#define MT6363_RG_LDO_VCN13_EN_BIT		0
+#define MT6363_RG_LDO_VCN13_LP_BIT		1
+#define MT6363_LDO_VCN13_HW_LP_MODE		0x1cdb
+#define MT6363_LDO_VCN13_OP_EN0			0x1ce4
+#define MT6363_LDO_VSRAM_DIGRF_HW_LP_MODE	0x1cf1
+#define MT6363_LDO_VSRAM_DIGRF_OP_EN0		0x1cfa
+#define MT6363_LDO_VSRAM_MDFE_HW_LP_MODE	0x1d5b
+#define MT6363_LDO_VSRAM_MDFE_OP_EN0		0x1d64
+#define MT6363_LDO_VSRAM_MODEM_HW_LP_MODE	0x1d76
+#define MT6363_LDO_VSRAM_MODEM_OP_EN0		0x1d7f
+#define MT6363_RG_LDO_VSRAM_CPUB_ADDR		0x1dd7
+#define MT6363_RG_LDO_VSRAM_CPUB_EN_BIT		0
+#define MT6363_RG_LDO_VSRAM_CPUB_LP_BIT		1
+#define MT6363_LDO_VSRAM_CPUB_HW_LP_MODE	0x1ddb
+#define MT6363_LDO_VSRAM_CPUB_OP_EN0		0x1de4
+#define MT6363_RG_LDO_VSRAM_CPUM_ADDR		0x1ded
+#define MT6363_RG_LDO_VSRAM_CPUM_EN_BIT		0
+#define MT6363_RG_LDO_VSRAM_CPUM_LP_BIT		1
+#define MT6363_LDO_VSRAM_CPUM_HW_LP_MODE	0x1df1
+#define MT6363_LDO_VSRAM_CPUM_OP_EN0		0x1dfa
+#define MT6363_RG_LDO_VSRAM_CPUL_ADDR		0x1e57
+#define MT6363_RG_LDO_VSRAM_CPUL_EN_BIT		0
+#define MT6363_RG_LDO_VSRAM_CPUL_LP_BIT		1
+#define MT6363_LDO_VSRAM_CPUL_HW_LP_MODE	0x1e5b
+#define MT6363_LDO_VSRAM_CPUL_OP_EN0		0x1e64
+#define MT6363_RG_LDO_VSRAM_APU_ADDR		0x1e6d
+#define MT6363_RG_LDO_VSRAM_APU_EN_BIT		0
+#define MT6363_RG_LDO_VSRAM_APU_LP_BIT		1
+#define MT6363_LDO_VSRAM_APU_HW_LP_MODE		0x1e71
+#define MT6363_LDO_VSRAM_APU_OP_EN0		0x1e7a
+#define MT6363_RG_VTREF18_VOCAL_ADDR		0x1ed8
+#define MT6363_RG_VTREF18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VTREF18_VOSEL_ADDR		0x1ed9
+#define MT6363_RG_VTREF18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VAUX18_VOCAL_ADDR		0x1edc
+#define MT6363_RG_VAUX18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VAUX18_VOSEL_ADDR		0x1edd
+#define MT6363_RG_VAUX18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VCN15_VOCAL_ADDR		0x1ee3
+#define MT6363_RG_VCN15_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VCN15_VOSEL_ADDR		0x1ee4
+#define MT6363_RG_VCN15_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VUFS18_VOCAL_ADDR		0x1ee7
+#define MT6363_RG_VUFS18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VUFS18_VOSEL_ADDR		0x1ee8
+#define MT6363_RG_VUFS18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VIO18_VOCAL_ADDR		0x1eeb
+#define MT6363_RG_VIO18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VIO18_VOSEL_ADDR		0x1eec
+#define MT6363_RG_VIO18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VM18_VOCAL_ADDR		0x1eef
+#define MT6363_RG_VM18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VM18_VOSEL_ADDR		0x1ef0
+#define MT6363_RG_VM18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA15_VOCAL_ADDR		0x1ef3
+#define MT6363_RG_VA15_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA15_VOSEL_ADDR		0x1ef4
+#define MT6363_RG_VA15_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF18_VOCAL_ADDR		0x1ef7
+#define MT6363_RG_VRF18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF18_VOSEL_ADDR		0x1ef8
+#define MT6363_RG_VRF18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRFIO18_VOCAL_ADDR		0x1efb
+#define MT6363_RG_VRFIO18_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRFIO18_VOSEL_ADDR		0x1efc
+#define MT6363_RG_VRFIO18_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VIO075_VOCFG_ADDR		0x1f01
+#define MT6363_RG_VIO075_VOCAL_ADDR		MT6363_RG_VIO075_VOCFG_ADDR
+#define MT6363_RG_VIO075_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VIO075_VOSEL_ADDR		MT6363_RG_VIO075_VOCFG_ADDR
+#define MT6363_RG_VIO075_VOSEL_MASK		GENMASK(6, 4)
+#define MT6363_RG_VCN13_VOCAL_ADDR		0x1f58
+#define MT6363_RG_VCN13_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VUFS12_VOCAL_ADDR		0x1f61
+#define MT6363_RG_VUFS12_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VUFS12_VOSEL_ADDR		0x1f62
+#define MT6363_RG_VUFS12_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA12_1_VOCAL_ADDR		0x1f65
+#define MT6363_RG_VA12_1_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA12_1_VOSEL_ADDR		0x1f66
+#define MT6363_RG_VA12_1_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA12_2_VOCAL_ADDR		0x1f69
+#define MT6363_RG_VA12_2_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VA12_2_VOSEL_ADDR		0x1f6a
+#define MT6363_RG_VA12_2_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF12_VOCAL_ADDR		0x1f6d
+#define MT6363_RG_VRF12_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF12_VOSEL_ADDR		0x1f6e
+#define MT6363_RG_VRF12_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF13_VOCAL_ADDR		0x1f71
+#define MT6363_RG_VRF13_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF13_VOSEL_ADDR		0x1f72
+#define MT6363_RG_VRF13_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF09_VOCAL_ADDR		0x1f78
+#define MT6363_RG_VRF09_VOCAL_MASK		GENMASK(3, 0)
+#define MT6363_RG_VRF09_VOSEL_ADDR		0x1f79
+#define MT6363_RG_VRF09_VOSEL_MASK		GENMASK(3, 0)
+#define MT6363_ISINK_EN_CTRL0			0x21db
+#define MT6363_ISINK_CTRL0_MASK			GENMASK(7, 0)
+#define MT6363_ISINK_EN_CTRL1			0x21dc
+#define MT6363_ISINK_CTRL1_MASK			GENMASK(7, 4)
+
+#endif /* __LINUX_REGULATOR_MT6363_H */
-- 
cgit v1.2.3


From fdb9aed869f34d776298b3a8197909eb820e4d0d Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:38 +0900
Subject: block: introduce disk_report_zone()

Commit b76b840fd933 ("dm: Fix dm-zoned-reclaim zone write pointer
alignment") introduced an indirect call for the callback function of a
report zones executed with blkdev_report_zones(). This is necessary so
that the function disk_zone_wplug_sync_wp_offset() can be called to
refresh a zone write plug zone write pointer offset after a write error.
However, this indirection makes the path taken by zone information
harder to follow.

Clean this up by introducing the new blk_report_zones_args structure to
define a zone report callback and its private data and introduce the
helper function disk_report_zone() which calls both
disk_zone_wplug_sync_wp_offset() and the zone report user callback
function for all zones of a zone report. This helper function must be
called by all block device drivers that implement the report zones
block operation in order to correctly report zone information.

All block device drivers supporting the report_zones block operation are
updated to use this new scheme.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c                 | 79 +++++++++++++++++++++------------------
 drivers/block/null_blk/null_blk.h |  3 +-
 drivers/block/null_blk/zoned.c    |  4 +-
 drivers/block/ublk_drv.c          |  4 +-
 drivers/block/virtio_blk.c        | 11 +++---
 drivers/block/zloop.c             |  4 +-
 drivers/md/dm-zone.c              | 54 ++++++++++++++------------
 drivers/md/dm.h                   |  3 +-
 drivers/nvme/host/core.c          |  5 +--
 drivers/nvme/host/multipath.c     |  4 +-
 drivers/nvme/host/nvme.h          |  2 +-
 drivers/nvme/host/zns.c           | 10 ++---
 drivers/scsi/sd.h                 |  2 +-
 drivers/scsi/sd_zbc.c             | 20 ++++------
 include/linux/blkdev.h            |  7 +++-
 include/linux/device-mapper.h     | 10 ++++-
 16 files changed, 120 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 345a99c0b031..de3524c17f67 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -114,30 +114,16 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 }
 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 
-struct disk_report_zones_cb_args {
-	struct gendisk	*disk;
-	report_zones_cb	user_cb;
-	void		*user_data;
+/*
+ * Zone report arguments for block device drivers report_zones operation.
+ * @cb: report_zones_cb callback for each reported zone.
+ * @data: Private data passed to report_zones_cb.
+ */
+struct blk_report_zones_args {
+	report_zones_cb cb;
+	void		*data;
 };
 
-static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
-					   struct blk_zone *zone);
-
-static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
-				void *data)
-{
-	struct disk_report_zones_cb_args *args = data;
-	struct gendisk *disk = args->disk;
-
-	if (disk->zone_wplugs_hash)
-		disk_zone_wplug_sync_wp_offset(disk, zone);
-
-	if (!args->user_cb)
-		return 0;
-
-	return args->user_cb(zone, idx, args->user_data);
-}
-
 /**
  * blkdev_report_zones - Get zones information
  * @bdev:	Target block device
@@ -161,10 +147,9 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	struct disk_report_zones_cb_args args = {
-		.disk = disk,
-		.user_cb = cb,
-		.user_data = data,
+	struct blk_report_zones_args args = {
+		.cb = cb,
+		.data = data,
 	};
 
 	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
@@ -173,8 +158,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 	if (!nr_zones || sector >= get_capacity(disk))
 		return 0;
 
-	return disk->fops->report_zones(disk, sector, nr_zones,
-					disk_report_zones_cb, &args);
+	return disk->fops->report_zones(disk, sector, nr_zones, &args);
 }
 EXPORT_SYMBOL_GPL(blkdev_report_zones);
 
@@ -692,15 +676,32 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
 	disk_put_zone_wplug(zwplug);
 }
 
-static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
+/**
+ * disk_report_zone - Report one zone
+ * @disk:	Target disk
+ * @zone:	The zone to report
+ * @idx:	The index of the zone in the overall zone report
+ * @args:	report zones callback and data
+ *
+ * Description:
+ *    Helper function for block device drivers to report one zone of a zone
+ *    report initiated with blkdev_report_zones(). The zone being reported is
+ *    specified by @zone and used to update, if necessary, the zone write plug
+ *    information for the zone. If @args specifies a user callback function,
+ *    this callback is executed.
+ */
+int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
+		     unsigned int idx, struct blk_report_zones_args *args)
 {
-	struct disk_report_zones_cb_args args = {
-		.disk = disk,
-	};
+	if (disk->zone_wplugs_hash)
+		disk_zone_wplug_sync_wp_offset(disk, zone);
+
+	if (args && args->cb)
+		return args->cb(zone, idx, args->data);
 
-	return disk->fops->report_zones(disk, sector, 1,
-					disk_report_zones_cb, &args);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(disk_report_zone);
 
 static void blk_zone_reset_bio_endio(struct bio *bio)
 {
@@ -1786,6 +1787,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	sector_t capacity = get_capacity(disk);
 	struct blk_revalidate_zone_args args = { };
 	unsigned int memflags, noio_flag;
+	struct blk_report_zones_args rep_args = {
+		.cb = blk_revalidate_zone_cb,
+		.data = &args,
+	};
 	int ret = -ENOMEM;
 
 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
@@ -1817,8 +1822,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		return ret;
 	}
 
-	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
-				       blk_revalidate_zone_cb, &args);
+	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
 	if (!ret) {
 		pr_warn("%s: No zones reported\n", disk->disk_name);
 		ret = -ENODEV;
@@ -1863,6 +1867,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
 			   sector_t nr_sects, gfp_t gfp_mask)
 {
+	struct gendisk *disk = bdev->bd_disk;
 	int ret;
 
 	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
@@ -1878,7 +1883,7 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
 	 * pointer. Undo this using a report zone to update the zone write
 	 * pointer to the correct current value.
 	 */
-	ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
+	ret = disk->fops->report_zones(disk, sector, 1, NULL);
 	if (ret != 1)
 		return ret < 0 ? ret : -EIO;
 
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 7bb6128dbaaf..6c4c4bbe7dad 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
 int null_register_zoned_dev(struct nullb *nullb);
 void null_free_zoned_dev(struct nullb_device *dev);
 int null_report_zones(struct gendisk *disk, sector_t sector,
-		      unsigned int nr_zones, report_zones_cb cb, void *data);
+		      unsigned int nr_zones,
+		      struct blk_report_zones_args *args);
 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
 				    sector_t sector, sector_t nr_sectors);
 size_t null_zone_valid_read_len(struct nullb *nullb,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f45989..6a93b12a06ff 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev)
 }
 
 int null_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;
@@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
 		blkz.capacity = zone->capacity;
 		null_unlock_zone(dev, zone);
 
-		error = cb(&blkz, i, data);
+		error = disk_report_zone(disk, &blkz, i, args);
 		if (error)
 			return error;
 	}
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 96e07763cd28..97cc4bc0a6ce 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -367,7 +367,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
 }
 
 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
-		      unsigned int nr_zones, report_zones_cb cb, void *data)
+		      unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct ublk_device *ub = disk->private_data;
 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
@@ -430,7 +430,7 @@ free_req:
 			if (!zone->len)
 				break;
 
-			ret = cb(zone, i, data);
+			ret = disk_report_zone(disk, zone, i, args);
 			if (ret)
 				goto out;
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f061420dfb10..a5e97f03dbf0 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -584,7 +584,8 @@ out:
 
 static int virtblk_parse_zone(struct virtio_blk *vblk,
 			       struct virtio_blk_zone_descriptor *entry,
-			       unsigned int idx, report_zones_cb cb, void *data)
+			       unsigned int idx,
+			       struct blk_report_zones_args *args)
 {
 	struct blk_zone zone = { };
 
@@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk,
 	 * The callback below checks the validity of the reported
 	 * entry data, no need to further validate it here.
 	 */
-	return cb(&zone, idx, data);
+	return disk_report_zone(vblk->disk, &zone, idx, args);
 }
 
 static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
-				 unsigned int nr_zones, report_zones_cb cb,
-				 void *data)
+				 unsigned int nr_zones,
+				 struct blk_report_zones_args *args)
 {
 	struct virtio_blk *vblk = disk->private_data;
 	struct virtio_blk_zone_report *report;
@@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 
 		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
 			ret = virtblk_parse_zone(vblk, &report->zones[i],
-						 zone_idx, cb, data);
+						 zone_idx, args);
 			if (ret)
 				goto fail_report;
 
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a423228e201b..92be9f0af00a 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -647,7 +647,7 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode)
 }
 
 static int zloop_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct zloop_device *zlo = disk->private_data;
 	struct blk_zone blkz = {};
@@ -687,7 +687,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 
 		mutex_unlock(&zone->lock);
 
-		ret = cb(&blkz, i, data);
+		ret = disk_report_zone(disk, &blkz, i, args);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 78e17dd4d01b..984fb621b0e9 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -17,33 +17,26 @@
  * For internal zone reports bypassing the top BIO submission path.
  */
 static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
-				  sector_t sector, unsigned int nr_zones,
-				  report_zones_cb cb, void *data)
+				  unsigned int nr_zones,
+				  struct dm_report_zones_args *args)
 {
-	struct gendisk *disk = md->disk;
-	int ret;
-	struct dm_report_zones_args args = {
-		.next_sector = sector,
-		.orig_data = data,
-		.orig_cb = cb,
-	};
-
 	do {
 		struct dm_target *tgt;
+		int ret;
 
-		tgt = dm_table_find_target(t, args.next_sector);
+		tgt = dm_table_find_target(t, args->next_sector);
 		if (WARN_ON_ONCE(!tgt->type->report_zones))
 			return -EIO;
 
-		args.tgt = tgt;
-		ret = tgt->type->report_zones(tgt, &args,
-					      nr_zones - args.zone_idx);
+		args->tgt = tgt;
+		ret = tgt->type->report_zones(tgt, args,
+					      nr_zones - args->zone_idx);
 		if (ret < 0)
 			return ret;
-	} while (args.zone_idx < nr_zones &&
-		 args.next_sector < get_capacity(disk));
+	} while (args->zone_idx < nr_zones &&
+		 args->next_sector < get_capacity(md->disk));
 
-	return args.zone_idx;
+	return args->zone_idx;
 }
 
 /*
@@ -52,7 +45,8 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
  * generally implemented by targets using dm_report_zones().
  */
 int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data)
+			unsigned int nr_zones,
+			struct blk_report_zones_args *args)
 {
 	struct mapped_device *md = disk->private_data;
 	struct dm_table *map;
@@ -76,9 +70,14 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 		map = zone_revalidate_map;
 	}
 
-	if (map)
-		ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
-					     data);
+	if (map) {
+		struct dm_report_zones_args dm_args = {
+			.disk = md->disk,
+			.next_sector = sector,
+			.rep_args = args,
+		};
+		ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args);
+	}
 
 	if (put_table)
 		dm_put_live_table(md, srcu_idx);
@@ -113,7 +112,9 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
 	}
 
 	args->next_sector = zone->start + zone->len;
-	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+
+	return disk_report_zone(args->disk, zone, args->zone_idx++,
+				args->rep_args);
 }
 
 /*
@@ -492,10 +493,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
 			     sector_t sector, unsigned int nr_zones,
 			     unsigned long *need_reset)
 {
+	struct dm_report_zones_args args = {
+		.disk = md->disk,
+		.next_sector = sector,
+		.cb = dm_zone_need_reset_cb,
+		.data = need_reset,
+	};
 	int ret;
 
-	ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
-				     dm_zone_need_reset_cb, need_reset);
+	ret = dm_blk_do_report_zones(md, t, nr_zones, &args);
 	if (ret != nr_zones) {
 		DMERR("Get %s zone reset bitmap failed\n",
 		      md->disk->disk_name);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 245f52b59215..7a795979ec72 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -109,7 +109,8 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim);
 void dm_zone_endio(struct dm_io *io, struct bio *clone);
 #ifdef CONFIG_BLK_DEV_ZONED
 int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data);
+			unsigned int nr_zones,
+			struct blk_report_zones_args *args);
 bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
 int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
 			     sector_t sector, unsigned int nr_zones,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa4181d7de73..c0fe50fb7b08 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2599,10 +2599,9 @@ static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
-	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
-			data);
+	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args);
 }
 #else
 #define nvme_report_zones	NULL
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 543e17aead12..0b7ac0735bd0 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -576,7 +576,7 @@ static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct nvme_ns_head *head = disk->private_data;
 	struct nvme_ns *ns;
@@ -585,7 +585,7 @@ static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
 	if (ns)
-		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
+		ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 102fae6a231c..928c748ccbd1 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1108,7 +1108,7 @@ struct nvme_zone_info {
 };
 
 int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data);
+		unsigned int nr_zones, struct blk_report_zones_args *args);
 int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
 		struct nvme_zone_info *zi);
 void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cce4c5b55aa9..deea2dbef5b8 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -148,8 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
 
 static int nvme_zone_parse_entry(struct nvme_ns *ns,
 				 struct nvme_zone_descriptor *entry,
-				 unsigned int idx, report_zones_cb cb,
-				 void *data)
+				 unsigned int idx,
+				 struct blk_report_zones_args *args)
 {
 	struct nvme_ns_head *head = ns->head;
 	struct blk_zone zone = { };
@@ -169,11 +169,11 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
 	else
 		zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
 
-	return cb(&zone, idx, data);
+	return disk_report_zone(ns->disk, &zone, idx, args);
 }
 
 int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct nvme_zone_report *report;
 	struct nvme_command c = { };
@@ -213,7 +213,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 
 		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
 			ret = nvme_zone_parse_entry(ns, &report->entries[i],
-						    zone_idx, cb, data);
+						    zone_idx, args);
 			if (ret)
 				goto out_free;
 			zone_idx++;
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 36382eca941c..574af8243016 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -240,7 +240,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
 unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 			     struct scsi_sense_hdr *sshdr);
 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data);
+		unsigned int nr_zones, struct blk_report_zones_args *args);
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a8db66428f80..56e455fb5add 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -35,8 +35,7 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
  * @buf: SCSI zone descriptor.
  * @idx: Index of the zone relative to the first zone reported by the current
  *	sd_zbc_report_zones() call.
- * @cb: Callback function pointer.
- * @data: Second argument passed to @cb.
+ * @args: report zones arguments (callback, etc)
  *
  * Return: Value returned by @cb.
  *
@@ -44,12 +43,11 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
  * call @cb(blk_zone, @data).
  */
 static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
-			       unsigned int idx, report_zones_cb cb, void *data)
+			unsigned int idx, struct blk_report_zones_args *args)
 {
 	struct scsi_device *sdp = sdkp->device;
 	struct blk_zone zone = { 0 };
 	sector_t start_lba, gran;
-	int ret;
 
 	if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf)))
 		return -EINVAL;
@@ -87,11 +85,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
 	else
 		zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
 
-	ret = cb(&zone, idx, data);
-	if (ret)
-		return ret;
-
-	return 0;
+	return disk_report_zone(sdkp->disk, &zone, idx, args);
 }
 
 /**
@@ -217,14 +211,14 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
  * @disk: Disk to report zones for.
  * @sector: Start sector.
  * @nr_zones: Maximum number of zones to report.
- * @cb: Callback function called to report zone information.
- * @data: Second argument passed to @cb.
+ * @args: Callback arguments.
  *
  * Called by the block layer to iterate over zone information. See also the
  * disk->fops->report_zones() calls in block/blk-zoned.c.
  */
 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data)
+			unsigned int nr_zones,
+			struct blk_report_zones_args *args)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	sector_t lba = sectors_to_logical(sdkp->device, sector);
@@ -283,7 +277,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 			}
 
 			ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
-						  cb, data);
+						  args);
 			if (ret)
 				goto out;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99be263b31ab..2f75fb15f55f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -38,6 +38,7 @@ struct blk_flush_queue;
 struct kiocb;
 struct pr_ops;
 struct rq_qos;
+struct blk_report_zones_args;
 struct blk_queue_stats;
 struct blk_stat_callback;
 struct blk_crypto_profile;
@@ -432,6 +433,9 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
+int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
+		     unsigned int idx, struct blk_report_zones_args *args);
+
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data);
@@ -1662,7 +1666,8 @@ struct block_device_operations {
 	/* this callback is with swap_lock and sometimes page table lock held */
 	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	int (*report_zones)(struct gendisk *, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data);
+			    unsigned int nr_zones,
+			    struct blk_report_zones_args *args);
 	char *(*devnode)(struct gendisk *disk, umode_t *mode);
 	/* returns the length of the identifier or a negative errno: */
 	int (*get_unique_id)(struct gendisk *disk, u8 id[16],
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 84fdc3a6a19a..38f625af6ab4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone);
 #ifdef CONFIG_BLK_DEV_ZONED
 struct dm_report_zones_args {
 	struct dm_target *tgt;
+	struct gendisk *disk;
 	sector_t next_sector;
 
-	void *orig_data;
-	report_zones_cb orig_cb;
 	unsigned int zone_idx;
 
+	/* for block layer ->report_zones */
+	struct blk_report_zones_args *rep_args;
+
+	/* for internal users */
+	report_zones_cb cb;
+	void *data;
+
 	/* must be filled by ->report_zones before calling dm_report_zones_cb */
 	sector_t start;
 };
-- 
cgit v1.2.3


From 6e945ffb6555705cf20b1fcdc21a139911562995 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:40 +0900
Subject: block: use zone condition to determine conventional zones

The conv_zones_bitmap field of struct gendisk is used to define a bitmap
to identify the conventional zones of a zoned block device. The bit for
a zone is set in this bitmap if the zone is a conventional one, that is,
if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such a zone, this
always corresponds to the zone condition BLK_ZONE_COND_NOT_WP.
In other words, conv_zones_bitmap tracks a single condition of the
zones of a zoned block device.

In preparation for tracking more zone conditions, change
conv_zones_bitmap into an array of zone conditions, using 1 byte per
zone. This increases the memory usage from 1 bit per zone to 1 byte per
zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256
MiB zones. This is a trade-off to allow fast cached report zones later
on top of this change.

Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add
a blk_revalidate_zone_cond() function to initialize the zones_cond array
of a disk during device scan and to update it on device revalidation.
Move the allocation of the zones_cond array to
disk_revalidate_zone_resources(), making sure that this array is always
allocated, even for devices that do not need zone write plugs (zone
resources), to ensure that bdev_zone_is_seq() can be re-implemented to
use the zone condition array in place of the conv zones bitmap.

Finally, the function bdev_zone_is_seq() is rewritten to use a test on
the condition of the target zone.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 153 ++++++++++++++++++++++++++++++++++---------------
 include/linux/blkdev.h |  37 +++---------
 2 files changed, 117 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index d4fc87b0be6b..f62862274f9a 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -114,6 +114,33 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 }
 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev:       block device to check
+ * @sector:     sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	unsigned int zno = disk_zone_no(disk, sector);
+	bool is_seq = false;
+	u8 *zones_cond;
+
+	if (!bdev_is_zoned(bdev))
+		return false;
+
+	rcu_read_lock();
+	zones_cond = rcu_dereference(disk->zones_cond);
+	if (zones_cond && zno < disk->nr_zones)
+		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
+	rcu_read_unlock();
+
+	return is_seq;
+}
+EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
+
 /*
  * Zone report arguments for block device drivers report_zones operation.
  * @cb: report_zones_cb callback for each reported zone.
@@ -1458,22 +1485,16 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
 	disk->zone_wplugs_hash_bits = 0;
 }
 
-static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
-					       unsigned long *bitmap)
+static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
 {
-	unsigned int nr_conv_zones = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
-	if (bitmap)
-		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
-	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
-				     lockdep_is_held(&disk->zone_wplugs_lock));
+	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
+				lockdep_is_held(&disk->zone_wplugs_lock));
 	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 
-	kfree_rcu_mightsleep(bitmap);
-
-	return nr_conv_zones;
+	kfree_rcu_mightsleep(zones_cond);
 }
 
 void disk_free_zone_resources(struct gendisk *disk)
@@ -1497,7 +1518,7 @@ void disk_free_zone_resources(struct gendisk *disk)
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
 
-	disk_set_conv_zones_bitmap(disk, NULL);
+	disk_set_zones_cond_array(disk, NULL);
 	disk->zone_capacity = 0;
 	disk->last_zone_capacity = 0;
 	disk->nr_zones = 0;
@@ -1516,12 +1537,31 @@ static inline bool disk_need_zone_resources(struct gendisk *disk)
 		queue_emulates_zone_append(disk->queue);
 }
 
+struct blk_revalidate_zone_args {
+	struct gendisk	*disk;
+	u8		*zones_cond;
+	unsigned int	nr_zones;
+	unsigned int	nr_conv_zones;
+	unsigned int	zone_capacity;
+	unsigned int	last_zone_capacity;
+	sector_t	sector;
+};
+
 static int disk_revalidate_zone_resources(struct gendisk *disk,
-					  unsigned int nr_zones)
+				struct blk_revalidate_zone_args *args)
 {
 	struct queue_limits *lim = &disk->queue->limits;
 	unsigned int pool_size;
 
+	args->disk = disk;
+	args->nr_zones =
+		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
+
+	/* Cached zone conditions: 1 byte per zone */
+	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
+	if (!args->zones_cond)
+		return -ENOMEM;
+
 	if (!disk_need_zone_resources(disk))
 		return 0;
 
@@ -1531,7 +1571,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 	 */
 	pool_size = max(lim->max_open_zones, lim->max_active_zones);
 	if (!pool_size)
-		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+		pool_size =
+			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
 
 	if (!disk->zone_wplugs_hash)
 		return disk_alloc_zone_resources(disk, pool_size);
@@ -1539,15 +1580,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 	return 0;
 }
 
-struct blk_revalidate_zone_args {
-	struct gendisk	*disk;
-	unsigned long	*conv_zones_bitmap;
-	unsigned int	nr_zones;
-	unsigned int	zone_capacity;
-	unsigned int	last_zone_capacity;
-	sector_t	sector;
-};
-
 /*
  * Update the disk zone resources information and device queue limits.
  * The disk queue is frozen when this is executed.
@@ -1556,7 +1588,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
 				      struct blk_revalidate_zone_args *args)
 {
 	struct request_queue *q = disk->queue;
-	unsigned int nr_seq_zones, nr_conv_zones;
+	unsigned int nr_seq_zones;
 	unsigned int pool_size, memflags;
 	struct queue_limits lim;
 	int ret = 0;
@@ -1566,24 +1598,24 @@ static int disk_update_zone_resources(struct gendisk *disk,
 	memflags = blk_mq_freeze_queue(q);
 
 	disk->nr_zones = args->nr_zones;
-	disk->zone_capacity = args->zone_capacity;
-	disk->last_zone_capacity = args->last_zone_capacity;
-	nr_conv_zones =
-		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
-	if (nr_conv_zones >= disk->nr_zones) {
+	if (args->nr_conv_zones >= disk->nr_zones) {
 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
-			disk->disk_name, nr_conv_zones, disk->nr_zones);
+			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
 		ret = -ENODEV;
 		goto unfreeze;
 	}
 
+	disk->zone_capacity = args->zone_capacity;
+	disk->last_zone_capacity = args->last_zone_capacity;
+	disk_set_zones_cond_array(disk, args->zones_cond);
+
 	/*
-	 * Some devices can advertize zone resource limits that are larger than
+	 * Some devices can advertise zone resource limits that are larger than
 	 * the number of sequential zones of the zoned block device, e.g. a
 	 * small ZNS namespace. For such case, assume that the zoned device has
 	 * no zone resource limits.
 	 */
-	nr_seq_zones = disk->nr_zones - nr_conv_zones;
+	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
 	if (lim.max_open_zones >= nr_seq_zones)
 		lim.max_open_zones = 0;
 	if (lim.max_active_zones >= nr_seq_zones)
@@ -1624,6 +1656,44 @@ unfreeze:
 	return ret;
 }
 
+static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
+				    struct blk_revalidate_zone_args *args)
+{
+	enum blk_zone_cond cond = zone->cond;
+
+	/* Check that the zone condition is consistent with the zone type. */
+	switch (cond) {
+	case BLK_ZONE_COND_NOT_WP:
+		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
+			goto invalid_condition;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_FULL:
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+			goto invalid_condition;
+		break;
+	default:
+		pr_warn("%s: Invalid zone condition 0x%X\n",
+			args->disk->disk_name, cond);
+		return -ENODEV;
+	}
+
+	args->zones_cond[idx] = cond;
+
+	return 0;
+
+invalid_condition:
+	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
+		args->disk->disk_name, cond, zone->type);
+
+	return -ENODEV;
+}
+
 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
 				    struct blk_revalidate_zone_args *args)
 {
@@ -1638,17 +1708,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
 	if (disk_zone_is_last(disk, zone))
 		args->last_zone_capacity = zone->capacity;
 
-	if (!disk_need_zone_resources(disk))
-		return 0;
-
-	if (!args->conv_zones_bitmap) {
-		args->conv_zones_bitmap =
-			bitmap_zalloc(args->nr_zones, GFP_NOIO);
-		if (!args->conv_zones_bitmap)
-			return -ENOMEM;
-	}
-
-	set_bit(idx, args->conv_zones_bitmap);
+	args->nr_conv_zones++;
 
 	return 0;
 }
@@ -1746,6 +1806,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 		return -ENODEV;
 	}
 
+	/* Check zone condition */
+	ret = blk_revalidate_zone_cond(zone, idx, args);
+	if (ret)
+		return ret;
+
 	/* Check zone type */
 	switch (zone->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
@@ -1813,10 +1878,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	 * Ensure that all memory allocations in this context are done as if
 	 * GFP_NOIO was specified.
 	 */
-	args.disk = disk;
-	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
 	noio_flag = memalloc_noio_save();
-	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+	ret = disk_revalidate_zone_resources(disk, &args);
 	if (ret) {
 		memalloc_noio_restore(noio_flag);
 		return ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f75fb15f55f..53bcfbc2f68f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,7 +196,7 @@ struct gendisk {
 	unsigned int		nr_zones;
 	unsigned int		zone_capacity;
 	unsigned int		last_zone_capacity;
-	unsigned long __rcu	*conv_zones_bitmap;
+	u8 __rcu		*zones_cond;
 	unsigned int		zone_wplugs_hash_bits;
 	atomic_t		nr_zone_wplugs;
 	spinlock_t		zone_wplugs_lock;
@@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev,
 {
 	return disk_zone_capacity(bdev->bd_disk, pos);
 }
+
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector);
+
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
 
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	return false;
+}
+
 static inline bool bio_needs_zone_write_plugging(struct bio *bio)
 {
 	return false;
@@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev,
 	return bdev_is_zone_start(bdev, sector);
 }
 
-/**
- * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
- * @bdev:	block device to check
- * @sector:	sector number
- *
- * Check if @sector on @bdev is contained in a sequential write required zone.
- */
-static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
-{
-	bool is_seq = false;
-
-#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
-	if (bdev_is_zoned(bdev)) {
-		struct gendisk *disk = bdev->bd_disk;
-		unsigned long *bitmap;
-
-		rcu_read_lock();
-		bitmap = rcu_dereference(disk->conv_zones_bitmap);
-		is_seq = !bitmap ||
-			!test_bit(disk_zone_no(disk, sector), bitmap);
-		rcu_read_unlock();
-	}
-#endif
-
-	return is_seq;
-}
-
 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
 			   sector_t nr_sects, gfp_t gfp_mask);
 
-- 
cgit v1.2.3


From 0bf0e2e4666822b62d7ad6473dc37fd6b377b5f1 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:41 +0900
Subject: block: track zone conditions

The function blk_revalidate_zone_cond() already caches the condition of
all zones of a zoned block device in the zones_cond array of a gendisk.
However, the zone conditions are updated only when the device is scanned
or revalidated.

Implement tracking of the runtime changes to zone conditions using
the new cond field in struct blk_zone_wplug. The size of this structure
remains 112 Bytes as the new field replaces the 4 Bytes padding at the
end of the structure.

Because zones that do not have a zone write plug can be in the empty,
implicit open, explicit open or full condition, the zones_cond array of
a disk is used to track the conditions of zones that do not have a zone
write plug. The condition of such a zone is updated in the disk zones_cond
array when a zone reset, reset all or finish operation is executed, and
also when a zone write plug is removed from the disk hash table when the
zone becomes full.

Since a device may automatically close an implicitly open zone when
writing to an empty or closed zone, if the total number of open zones
has reached the device limit, the BLK_ZONE_COND_IMP_OPEN and
BLK_ZONE_COND_CLOSED zone conditions cannot be precisely tracked. To
overcome this, the zone condition BLK_ZONE_COND_ACTIVE is introduced to
represent a zone that has the condition BLK_ZONE_COND_IMP_OPEN,
BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED.  This follows the
definition of an active zone as defined in the NVMe Zoned Namespace
specifications. As such, for a zoned device that has a limit on the
maximum number of open zones, we will never have more zones in the
BLK_ZONE_COND_ACTIVE condition than the device limit. This is compatible
with the SCSI ZBC and ATA ZAC specifications for SMR HDDs as these
devices do not have a limit on the number of active zones.

The function disk_zone_wplug_set_wp_offset() is modified to use the new
helper disk_zone_wplug_update_cond() to update a zone write plug
condition whenever a zone write plug write offset is updated on
submission or merging of write BIOs to a zone.

The functions blk_zone_reset_bio_endio(), blk_zone_reset_all_bio_endio()
and blk_zone_finish_bio_endio() are modified to update the condition of
the zones targeted by reset, reset_all and finish operations, either
using disk_zone_wplug_set_wp_offset() for zones that have a
zone write plug, or using the disk_zone_set_cond() helper to update the
zones_cond array of the disk for zones that do not have a zone write
plug.

When a zone write plug is removed from the disk hash table (when the
zone becomes empty or full), the condition of struct blk_zone_wplug is
used to update the disk zones_cond array. Conversely, when a zone write
plug is added to the disk hash table, the zones_cond array is used to
initialize the zone write plug condition.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c             | 120 +++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/blkzoned.h |  11 ++++
 2 files changed, 125 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index f62862274f9a..c5fa303093a9 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -33,6 +33,7 @@ static const char *const zone_cond_name[] = {
 	ZONE_COND_NAME(READONLY),
 	ZONE_COND_NAME(FULL),
 	ZONE_COND_NAME(OFFLINE),
+	ZONE_COND_NAME(ACTIVE),
 };
 #undef ZONE_COND_NAME
 
@@ -57,6 +58,7 @@ static const char *const zone_cond_name[] = {
  * @zone_no: The number of the zone the plug is managing.
  * @wp_offset: The zone write pointer location relative to the start of the zone
  *             as a number of 512B sectors.
+ * @cond: Condition of the zone
  */
 struct blk_zone_wplug {
 	struct hlist_node	node;
@@ -69,6 +71,7 @@ struct blk_zone_wplug {
 	unsigned int		flags;
 	unsigned int		zone_no;
 	unsigned int		wp_offset;
+	enum blk_zone_cond	cond;
 };
 
 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
@@ -114,6 +117,57 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 }
 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 
+static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
+			      enum blk_zone_cond cond)
+{
+	if (!zones_cond)
+		return;
+
+	switch (cond) {
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
+		return;
+	case BLK_ZONE_COND_NOT_WP:
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_FULL:
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+	default:
+		zones_cond[zno] = cond;
+		return;
+	}
+}
+
+static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
+			       enum blk_zone_cond cond)
+{
+	u8 *zones_cond;
+
+	rcu_read_lock();
+	zones_cond = rcu_dereference(disk->zones_cond);
+	if (zones_cond) {
+		unsigned int zno = disk_zone_no(disk, sector);
+
+		/*
+		 * The condition of conventional, readonly and offline zones
+		 * never changes, so do nothing if the target zone is in one of
+		 * these conditions.
+		 */
+		switch (zones_cond[zno]) {
+		case BLK_ZONE_COND_NOT_WP:
+		case BLK_ZONE_COND_READONLY:
+		case BLK_ZONE_COND_OFFLINE:
+			break;
+		default:
+			blk_zone_set_cond(zones_cond, zno, cond);
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+
 /**
  * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
  * @bdev:       block device to check
@@ -416,6 +470,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
 {
 	struct blk_zone_wplug *zwplg;
 	unsigned long flags;
+	u8 *zones_cond;
 	unsigned int idx =
 		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
 
@@ -431,6 +486,20 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
 			return false;
 		}
 	}
+
+	/*
+	 * Set the zone condition: if we do not yet have a zones_cond array
+	 * attached to the disk, then this is a zone write plug insert from the
+	 * first call to blk_revalidate_disk_zones(), in which case the zone is
+	 * necessarily in the active condition.
+	 */
+	zones_cond = rcu_dereference_check(disk->zones_cond,
+				lockdep_is_held(&disk->zone_wplugs_lock));
+	if (zones_cond)
+		zwplug->cond = zones_cond[zwplug->zone_no];
+	else
+		zwplug->cond = BLK_ZONE_COND_ACTIVE;
+
 	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
 	atomic_inc(&disk->nr_zone_wplugs);
 	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
@@ -530,10 +599,15 @@ static void disk_remove_zone_wplug(struct gendisk *disk,
 
 	/*
 	 * Mark the zone write plug as unhashed and drop the extra reference we
-	 * took when the plug was inserted in the hash table.
+	 * took when the plug was inserted in the hash table. Also update the
+	 * disk zone condition array with the current condition of the zone
+	 * write plug.
 	 */
 	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
 	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
+				lockdep_is_held(&disk->zone_wplugs_lock)),
+			  zwplug->zone_no, zwplug->cond);
 	hlist_del_init_rcu(&zwplug->node);
 	atomic_dec(&disk->nr_zone_wplugs);
 	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
@@ -635,6 +709,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
 		blk_zone_wplug_bio_io_error(zwplug, bio);
 }
 
+/*
+ * Update a zone write plug condition based on the write pointer offset.
+ */
+static void disk_zone_wplug_update_cond(struct gendisk *disk,
+					struct blk_zone_wplug *zwplug)
+{
+	lockdep_assert_held(&zwplug->lock);
+
+	if (disk_zone_wplug_is_full(disk, zwplug))
+		zwplug->cond = BLK_ZONE_COND_FULL;
+	else if (!zwplug->wp_offset)
+		zwplug->cond = BLK_ZONE_COND_EMPTY;
+	else
+		zwplug->cond = BLK_ZONE_COND_ACTIVE;
+}
+
 /*
  * Set a zone write plug write pointer offset to the specified value.
  * This aborts all plugged BIOs, which is fine as this function is called for
@@ -650,6 +740,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
 	/* Update the zone write pointer and abort all plugged BIOs. */
 	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
 	zwplug->wp_offset = wp_offset;
+	disk_zone_wplug_update_cond(disk, zwplug);
+
 	disk_zone_wplug_abort(zwplug);
 
 	/*
@@ -733,6 +825,7 @@ EXPORT_SYMBOL_GPL(disk_report_zone);
 static void blk_zone_reset_bio_endio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	sector_t sector = bio->bi_iter.bi_sector;
 	struct blk_zone_wplug *zwplug;
 
 	/*
@@ -741,7 +834,7 @@ static void blk_zone_reset_bio_endio(struct bio *bio)
 	 * resetting zones while writes are still in-flight will result in the
 	 * writes failing anyway.
 	 */
-	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+	zwplug = disk_get_zone_wplug(disk, sector);
 	if (zwplug) {
 		unsigned long flags;
 
@@ -749,14 +842,18 @@ static void blk_zone_reset_bio_endio(struct bio *bio)
 		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
 		spin_unlock_irqrestore(&zwplug->lock, flags);
 		disk_put_zone_wplug(zwplug);
+	} else {
+		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
 	}
 }
 
 static void blk_zone_reset_all_bio_endio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	sector_t capacity = get_capacity(disk);
 	struct blk_zone_wplug *zwplug;
 	unsigned long flags;
+	sector_t sector;
 	unsigned int i;
 
 	/* Update the condition of all zone write plugs. */
@@ -770,12 +867,18 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio)
 		}
 	}
 	rcu_read_unlock();
+
+	/* Update the cached zone conditions. */
+	for (sector = 0; sector < capacity;
+	     sector += bdev_zone_sectors(bio->bi_bdev))
+		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
 }
 
 static void blk_zone_finish_bio_endio(struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 	struct gendisk *disk = bdev->bd_disk;
+	sector_t sector = bio->bi_iter.bi_sector;
 	struct blk_zone_wplug *zwplug;
 
 	/*
@@ -784,7 +887,7 @@ static void blk_zone_finish_bio_endio(struct bio *bio)
 	 * is fine as resetting zones while writes are still in-flight will
 	 * result in the writes failing anyway.
 	 */
-	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+	zwplug = disk_get_zone_wplug(disk, sector);
 	if (zwplug) {
 		unsigned long flags;
 
@@ -793,6 +896,8 @@ static void blk_zone_finish_bio_endio(struct bio *bio)
 					      bdev_zone_sectors(bdev));
 		spin_unlock_irqrestore(&zwplug->lock, flags);
 		disk_put_zone_wplug(zwplug);
+	} else {
+		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
 	}
 }
 
@@ -888,6 +993,7 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
  */
 void blk_zone_write_plug_bio_merged(struct bio *bio)
 {
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
 	struct blk_zone_wplug *zwplug;
 	unsigned long flags;
 
@@ -909,13 +1015,13 @@ void blk_zone_write_plug_bio_merged(struct bio *bio)
 	 * have at least one request and one BIO referencing the zone write
 	 * plug. So this should not fail.
 	 */
-	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
-				     bio->bi_iter.bi_sector);
+	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
 	if (WARN_ON_ONCE(!zwplug))
 		return;
 
 	spin_lock_irqsave(&zwplug->lock, flags);
 	zwplug->wp_offset += bio_sectors(bio);
+	disk_zone_wplug_update_cond(disk, zwplug);
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 }
 
@@ -974,6 +1080,7 @@ void blk_zone_write_plug_init_request(struct request *req)
 		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
 		blk_queue_exit(q);
 		zwplug->wp_offset += bio_sectors(bio);
+		disk_zone_wplug_update_cond(disk, zwplug);
 
 		req_back_sector += bio_sectors(bio);
 	}
@@ -1037,6 +1144,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
 
 	/* Advance the zone write pointer offset. */
 	zwplug->wp_offset += bio_sectors(bio);
+	disk_zone_wplug_update_cond(disk, zwplug);
 
 	return true;
 }
@@ -1683,7 +1791,7 @@ static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
 		return -ENODEV;
 	}
 
-	args->zones_cond[idx] = cond;
+	blk_zone_set_cond(args->zones_cond, idx, cond);
 
 	return 0;
 
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index f85743ef6e7d..5c7662971414 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -48,6 +48,8 @@ enum blk_zone_type {
  *                      FINISH ZONE command.
  * @BLK_ZONE_COND_READONLY: The zone is read-only.
  * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written).
+ * @BLK_ZONE_COND_ACTIVE: The zone is either implicitly open, explicitly open,
+ *			  or closed.
  *
  * The Zone Condition state machine in the ZBC/ZAC standards maps the above
  * deinitions as:
@@ -61,6 +63,13 @@ enum blk_zone_type {
  *
  * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should
  * be considered invalid.
+ *
+ * The condition BLK_ZONE_COND_ACTIVE is used only with cached zone reports.
+ * It is used to report any of the BLK_ZONE_COND_IMP_OPEN,
+ * BLK_ZONE_COND_EXP_OPEN and BLK_ZONE_COND_CLOSED conditions. Conversely, a
+ * regular zone report will never report a zone condition using
+ * BLK_ZONE_COND_ACTIVE and instead use the conditions BLK_ZONE_COND_IMP_OPEN,
+ * BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED as reported by the device.
  */
 enum blk_zone_cond {
 	BLK_ZONE_COND_NOT_WP	= 0x0,
@@ -71,6 +80,8 @@ enum blk_zone_cond {
 	BLK_ZONE_COND_READONLY	= 0xD,
 	BLK_ZONE_COND_FULL	= 0xE,
 	BLK_ZONE_COND_OFFLINE	= 0xF,
+
+	BLK_ZONE_COND_ACTIVE	= 0xFF,
 };
 
 /**
-- 
cgit v1.2.3


From f2284eec5053df271c78e687672247922bcee881 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:43 +0900
Subject: block: introduce blkdev_get_zone_info()

Introduce the function blkdev_get_zone_info() to obtain the information
of a single zone from cached zone data, that is, either from the zone
write plug for the target zone if it exists, or from the disk zones_cond
array otherwise.

Since sequential zones that do not have a zone write plug are either
full, empty or in a bad state (read-only or offline), the zone write
pointer can be inferred from the zone condition cached in the disk
zones_cond array. For sequential zones that have a zone write plug, the
zone condition and zone write pointer are obtained from the condition
and write pointer offset managed with the zone write plug. This allows
obtaining the information for a zone much more quickly than having to
execute a report zones command on the device.

blkdev_get_zone_info() falls back to using a regular zone report if the
target zone is flagged as needing an update with the
BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not
use zone write plugs (i.e. a device mapper device). In this case, the
new function blkdev_report_zone_fallback() is used and the zone
condition is reported consistently with the cached report, that is, the
BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open,
explicit open and closed conditions. This is achieved by adding the
.report_active field to struct blk_report_zones_args and by having
disk_report_zone() set the correct zone condition if .report_active is
true.

In preparation for using blkdev_get_zone_info() in upcoming file systems
changes, also export this function as a GPL symbol.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 141 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   3 ++
 2 files changed, 144 insertions(+)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 9ce7570c91e1..d98babfe49df 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -203,6 +203,7 @@ EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
 struct blk_report_zones_args {
 	report_zones_cb cb;
 	void		*data;
+	bool		report_active;
 };
 
 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
@@ -820,6 +821,23 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 		     unsigned int idx, struct blk_report_zones_args *args)
 {
+	if (args->report_active) {
+		/*
+		 * If we come here, then this is a report zones as a fallback
+		 * for a cached report. So collapse the implicit open, explicit
+		 * open and closed conditions into the active zone condition.
+		 */
+		switch (zone->cond) {
+		case BLK_ZONE_COND_IMP_OPEN:
+		case BLK_ZONE_COND_EXP_OPEN:
+		case BLK_ZONE_COND_CLOSED:
+			zone->cond = BLK_ZONE_COND_ACTIVE;
+			break;
+		default:
+			break;
+		}
+	}
+
 	if (disk->zone_wplugs_hash)
 		disk_zone_wplug_sync_wp_offset(disk, zone);
 
@@ -830,6 +848,129 @@ int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 }
 EXPORT_SYMBOL_GPL(disk_report_zone);
 
+static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
+				 void *data)
+{
+	memcpy(data, zone, sizeof(struct blk_zone));
+	return 0;
+}
+
+static int blkdev_report_zone_fallback(struct block_device *bdev,
+				       sector_t sector, struct blk_zone *zone)
+{
+	struct blk_report_zones_args args = {
+		.cb = blkdev_report_zone_cb,
+		.data = zone,
+		.report_active = true,
+	};
+
+	return blkdev_do_report_zones(bdev, sector, 1, &args);
+}
+
+/**
+ * blkdev_get_zone_info - Get a single zone information from cached data
+ * @bdev:   Target block device
+ * @sector: Sector contained by the target zone
+ * @zone:   zone structure to return the zone information
+ *
+ * Description:
+ *    Get the zone information for the zone containing @sector using the zone
+ *    write plug of the target zone, if one exists, or the disk zone condition
+ *    array otherwise. The zone condition may be reported as being
+ *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
+ *    open, explicit open or closed condition.
+ *
+ *    Returns 0 on success and a negative error code on failure.
+ */
+int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
+			 struct blk_zone *zone)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	sector_t zone_sectors = bdev_zone_sectors(bdev);
+	struct blk_zone_wplug *zwplug;
+	unsigned long flags;
+	u8 *zones_cond;
+
+	if (!bdev_is_zoned(bdev))
+		return -EOPNOTSUPP;
+
+	if (sector >= get_capacity(disk))
+		return -EINVAL;
+
+	memset(zone, 0, sizeof(*zone));
+	sector = ALIGN_DOWN(sector, zone_sectors);
+
+	rcu_read_lock();
+	zones_cond = rcu_dereference(disk->zones_cond);
+	if (!disk->zone_wplugs_hash || !zones_cond) {
+		rcu_read_unlock();
+		return blkdev_report_zone_fallback(bdev, sector, zone);
+	}
+	zone->cond = zones_cond[disk_zone_no(disk, sector)];
+	rcu_read_unlock();
+
+	zone->start = sector;
+	zone->len = zone_sectors;
+
+	/*
+	 * If this is a conventional zone, we do not have a zone write plug and
+	 * can report the zone immediately.
+	 */
+	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
+		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+		zone->capacity = zone_sectors;
+		zone->wp = ULLONG_MAX;
+		return 0;
+	}
+
+	/*
+	 * This is a sequential write required zone. If the zone is read-only or
+	 * offline, only set the zone write pointer to an invalid value and
+	 * report the zone.
+	 */
+	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+	if (disk_zone_is_last(disk, zone))
+		zone->capacity = disk->last_zone_capacity;
+	else
+		zone->capacity = disk->zone_capacity;
+
+	if (zone->cond == BLK_ZONE_COND_READONLY ||
+	    zone->cond == BLK_ZONE_COND_OFFLINE) {
+		zone->wp = ULLONG_MAX;
+		return 0;
+	}
+
+	/*
+	 * If the zone does not have a zone write plug, it is either full or
+	 * empty, as we otherwise would have a zone write plug for it. In this
+	 * case, set the write pointer accordingly and report the zone.
+	 * Otherwise, if we have a zone write plug, use it.
+	 */
+	zwplug = disk_get_zone_wplug(disk, sector);
+	if (!zwplug) {
+		if (zone->cond == BLK_ZONE_COND_FULL)
+			zone->wp = ULLONG_MAX;
+		else
+			zone->wp = sector;
+		return 0;
+	}
+
+	spin_lock_irqsave(&zwplug->lock, flags);
+	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		disk_put_zone_wplug(zwplug);
+		return blkdev_report_zone_fallback(bdev, sector, zone);
+	}
+	zone->cond = zwplug->cond;
+	zone->wp = sector + zwplug->wp_offset;
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+
+	disk_put_zone_wplug(zwplug);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
+
 static void blk_zone_reset_bio_endio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 53bcfbc2f68f..03a594b4dfbc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -436,6 +436,9 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 		     unsigned int idx, struct blk_report_zones_args *args);
 
+int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
+			 struct blk_zone *zone);
+
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data);
-- 
cgit v1.2.3


From 31f0656a4ab712edf2888eabcc0664197a4a938e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:44 +0900
Subject: block: introduce blkdev_report_zones_cached()

Introduce the function blkdev_report_zones_cached() to provide a fast
report zone built using the blkdev_get_zone_info() function, which gets
zone information from a disk zones_cond array or zone write plugs.
For a large capacity SMR drive, such fast report zone can be completed
in a few milliseconds compared to several seconds completion times
when the report zone is obtained from the device.

The zone report is built in the same manner as with the regular
blkdev_report_zones() function, that is, the first zone reported is the
one containing the specified start sector and the report is limited to
the specified number of zones (nr_zones argument). The information for
each zone in the report is obtained using blkdev_get_zone_info().

For zoned devices that do not use zone write plug resources,
using blkdev_get_zone_info() is inefficient as the zone report would
be very slow, generated one zone at a time. To avoid this,
blkdev_report_zones_cached() falls back to calling
blkdev_do_report_zones() to execute a regular zone report. In this case,
the .report_active field of struct blk_report_zones_args is set to true
to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in
place of the implicit open, explicit open and closed conditions.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 88 ++++++++++++++++++++++++++++++++++++++++++--------
 include/linux/blkdev.h |  2 ++
 2 files changed, 77 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index d98babfe49df..bbd105b11843 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -74,6 +74,19 @@ struct blk_zone_wplug {
 	enum blk_zone_cond	cond;
 };
 
+static inline bool disk_need_zone_resources(struct gendisk *disk)
+{
+	/*
+	 * All request-based zoned devices need zone resources so that the
+	 * block layer can automatically handle write BIO plugging. BIO-based
+	 * device drivers (e.g. DM devices) are normally responsible for
+	 * handling zone write ordering and do not need zone resources, unless
+	 * the driver requires zone append emulation.
+	 */
+	return queue_is_mq(disk->queue) ||
+		queue_emulates_zone_append(disk->queue);
+}
+
 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
 {
 	return 1U << disk->zone_wplugs_hash_bits;
@@ -971,6 +984,68 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
 
+/**
+ * blkdev_report_zones_cached - Get cached zones information
+ * @bdev:     Target block device
+ * @sector:   Sector from which to report zones
+ * @nr_zones: Maximum number of zones to report
+ * @cb:       Callback function called for each reported zone
+ * @data:     Private data for the callback function
+ *
+ * Description:
+ *    Similar to blkdev_report_zones() but instead of calling into the low level
+ *    device driver to get the zone report from the device, use
+ *    blkdev_get_zone_info() to generate the report from the disk zone write
+ *    plugs and zones condition array. Since calling this function without a
+ *    callback does not make sense, @cb must be specified.
+ */
+int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
+			unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	sector_t capacity = get_capacity(disk);
+	sector_t zone_sectors = bdev_zone_sectors(bdev);
+	unsigned int idx = 0;
+	struct blk_zone zone;
+	int ret;
+
+	if (!cb || !bdev_is_zoned(bdev) ||
+	    WARN_ON_ONCE(!disk->fops->report_zones))
+		return -EOPNOTSUPP;
+
+	if (!nr_zones || sector >= capacity)
+		return 0;
+
+	/*
+	 * If we do not have any zone write plug resources, fallback to using
+	 * the regular zone report.
+	 */
+	if (!disk_need_zone_resources(disk)) {
+		struct blk_report_zones_args args = {
+			.cb = cb,
+			.data = data,
+			.report_active = true,
+		};
+
+		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
+	}
+
+	for (sector = ALIGN_DOWN(sector, zone_sectors);
+	     sector < capacity && idx < nr_zones;
+	     sector += zone_sectors, idx++) {
+		ret = blkdev_get_zone_info(bdev, sector, &zone);
+		if (ret)
+			return ret;
+
+		ret = cb(&zone, idx, data);
+		if (ret)
+			return ret;
+	}
+
+	return idx;
+}
+EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
+
 static void blk_zone_reset_bio_endio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
@@ -1781,19 +1856,6 @@ void disk_free_zone_resources(struct gendisk *disk)
 	disk->nr_zones = 0;
 }
 
-static inline bool disk_need_zone_resources(struct gendisk *disk)
-{
-	/*
-	 * All mq zoned devices need zone resources so that the block layer
-	 * can automatically handle write BIO plugging. BIO-based device drivers
-	 * (e.g. DM devices) are normally responsible for handling zone write
-	 * ordering and do not need zone resources, unless the driver requires
-	 * zone append emulation.
-	 */
-	return queue_is_mq(disk->queue) ||
-		queue_emulates_zone_append(disk->queue);
-}
-
 struct blk_revalidate_zone_args {
 	struct gendisk	*disk;
 	u8		*zones_cond;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 03a594b4dfbc..f0ab02e0a673 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -442,6 +442,8 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data);
+int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
+		unsigned int nr_zones, report_zones_cb cb, void *data);
 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		sector_t sectors, sector_t nr_sectors);
 int blk_revalidate_disk_zones(struct gendisk *disk);
-- 
cgit v1.2.3


From b30ffcdc0c15a88f8866529d3532454e02571221 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 5 Nov 2025 06:22:45 +0900
Subject: block: introduce BLKREPORTZONESV2 ioctl

Introduce the new BLKREPORTZONEV2 ioctl command to allow user
applications access to the fast zone report implemented by
blkdev_report_zones_cached(). This new ioctl is defined as number 142
and is documented in include/uapi/linux/fs.h.

Unlike the existing BLKREPORTZONE ioctl, this new ioctl uses the flags
field of struct blk_zone_report also as an input. If the user sets the
BLK_ZONE_REP_CACHED flag as an input, then blkdev_report_zones_cached()
is used to generate the zone report using cached zone information. If
this flag is not set, then BLKREPORTZONEV2 behaves in the same manner
as BLKREPORTZONE and the zone report is generated by accessing the
zoned device.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c             | 25 ++++++++++++++++++++++---
 block/ioctl.c                 |  1 +
 include/uapi/linux/blkzoned.h | 35 ++++++++++++++++++++++++++++++-----
 include/uapi/linux/fs.h       |  2 +-
 4 files changed, 54 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index bbd105b11843..7a7b0704f095 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -357,7 +357,12 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
 }
 
 /*
- * BLKREPORTZONE ioctl processing.
+ * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
+ */
+#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
+
+/*
+ * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
  * Called from blkdev_ioctl.
  */
 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
@@ -381,8 +386,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
 		return -EINVAL;
 
 	args.zones = argp + sizeof(struct blk_zone_report);
-	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
-				  blkdev_copy_zone_to_user, &args);
+
+	switch (cmd) {
+	case BLKREPORTZONE:
+		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
+					  blkdev_copy_zone_to_user, &args);
+		break;
+	case BLKREPORTZONEV2:
+		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
+			return -EINVAL;
+		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
+					 blkdev_copy_zone_to_user, &args);
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if (ret < 0)
 		return ret;
 
diff --git a/block/ioctl.c b/block/ioctl.c
index 3927ca4707d0..698629e4c619 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -581,6 +581,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 	case BLKGETDISKSEQ:
 		return put_u64(argp, bdev->bd_disk->diskseq);
 	case BLKREPORTZONE:
+	case BLKREPORTZONEV2:
 		return blkdev_report_zones_ioctl(bdev, cmd, arg);
 	case BLKRESETZONE:
 	case BLKOPENZONE:
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 5c7662971414..e33f02703350 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -87,10 +87,20 @@ enum blk_zone_cond {
 /**
  * enum blk_zone_report_flags - Feature flags of reported zone descriptors.
  *
- * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field.
+ * @BLK_ZONE_REP_CAPACITY: Output only. Indicates that zone descriptors in a
+ *			   zone report have a valid capacity field.
+ * @BLK_ZONE_REP_CACHED: Input only. Indicates that the zone report should be
+ *			 generated using cached zone information. In this case,
+ *			 the implicit open, explicit open and closed zone
+ *			 conditions are all reported with the
+ *			 BLK_ZONE_COND_ACTIVE condition.
  */
 enum blk_zone_report_flags {
-	BLK_ZONE_REP_CAPACITY	= (1 << 0),
+	/* Output flags */
+	BLK_ZONE_REP_CAPACITY	= (1U << 0),
+
+	/* Input flags */
+	BLK_ZONE_REP_CACHED	= (1U << 31),
 };
 
 /**
@@ -133,6 +143,10 @@ struct blk_zone {
  * @sector: starting sector of report
  * @nr_zones: IN maximum / OUT actual
  * @flags: one or more flags as defined by enum blk_zone_report_flags.
+ * @flags: one or more flags as defined by enum blk_zone_report_flags.
+ *	   With BLKREPORTZONE, this field is ignored as an input and is valid
+ *	   only as an output. Using BLKREPORTZONEV2, this field is used as both
+ *	   input and output.
  * @zones: Space to hold @nr_zones @zones entries on reply.
  *
  * The array of at most @nr_zones must follow this structure in memory.
@@ -159,9 +173,19 @@ struct blk_zone_range {
 /**
  * Zoned block device ioctl's:
  *
- * @BLKREPORTZONE: Get zone information. Takes a zone report as argument.
- *                 The zone report will start from the zone containing the
- *                 sector specified in the report request structure.
+ * @BLKREPORTZONE: Get zone information from a zoned device. Takes a zone report
+ *		   as argument. The zone report will start from the zone
+ *		   containing the sector specified in struct blk_zone_report.
+ *		   The flags field of struct blk_zone_report is used as an
+ *		   output only and ignored as an input.
+ *		   DEPRECATED, use BLKREPORTZONEV2 instead.
+ * @BLKREPORTZONEV2: Same as @BLKREPORTZONE but uses the flags field of
+ *		     struct blk_zone_report as an input, allowing a zone report
+ *		     to be generated from cached zone information if the flag
+ *		     BLK_ZONE_REP_CACHED is set. In that case, the zone report
+ *		     may include zones with the condition @BLK_ZONE_COND_ACTIVE
+ *		     (cf. the description of this condition above for more
+ *		     details).
  * @BLKRESETZONE: Reset the write pointer of the zones in the specified
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
@@ -180,5 +204,6 @@ struct blk_zone_range {
 #define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
 #define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
 #define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
+#define BLKREPORTZONEV2	_IOWR(0x12, 142, struct blk_zone_report)
 
 #endif /* _UAPI_BLKZONED_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 957ce3343a4f..66ca526cf786 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -298,7 +298,7 @@ struct file_attr {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 #define BLKGETDISKSEQ _IOR(0x12,128,__u64)
-/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
+/* 130-136 and 142 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
 /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */
 #define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2)
 
-- 
cgit v1.2.3


From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 30 Oct 2025 13:09:24 -0700
Subject: KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory

Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead
of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in
anticipation of using the API on x86.  Once x86 uses the API, providing a
stub for one architecture and having all other architectures opt in
requires more code than simply implementing the API in the lone holdout.

Eliminating the Kconfig will also reduce churn if the API is renamed in
the future (spoiler alert).

No functional change intended.

Acked-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
Tested-by: Yan Zhao <yan.y.zhao@intel.com>
Tested-by: Kai Huang <kai.huang@intel.com>
Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/arm.c       |  6 ++++++
 arch/loongarch/kvm/Kconfig |  1 -
 arch/mips/kvm/Kconfig      |  1 -
 arch/powerpc/kvm/Kconfig   |  1 -
 arch/riscv/kvm/Kconfig     |  1 -
 arch/s390/kvm/Kconfig      |  1 -
 arch/x86/kvm/x86.c         |  6 ++++++
 include/linux/kvm_host.h   | 10 ----------
 virt/kvm/Kconfig           |  3 ---
 9 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..ef5bf57f79b7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
+long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
+			       unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
 
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index ae64bbdf83a7..ed4f724db774 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -25,7 +25,6 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_MSI
 	select HAVE_KVM_READONLY_MEM
-	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_COMMON
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index ab57221fa4dd..cc13cc35f208 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
 	select EXPORT_UASM
 	select KVM_COMMON
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_MMIO
 	select KVM_GENERIC_MMU_NOTIFIER
 	select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2f2702c867f7..c9a2d50ff1b0 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
 config KVM
 	bool
 	select KVM_COMMON
-	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_VFIO
 	select HAVE_KVM_IRQ_BYPASS
 
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index c50328212917..77379f77840a 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_MSI
-	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select HAVE_KVM_READONLY_MEM
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
 	select KVM_COMMON
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index cae908d64550..96d16028e8b7 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
 	def_tristate y
 	prompt "Kernel-based Virtual Machine (KVM) support"
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
-	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_ASYNC_PF
 	select KVM_ASYNC_PF_SYNC
 	select KVM_COMMON
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..ca5ba2caf314 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7240,6 +7240,12 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
 	return 0;
 }
 
+long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
+			       unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5bd76cf394fa..7186b2ae4b57 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2437,18 +2437,8 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_NO_POLL */
 
-#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL
 long kvm_arch_vcpu_async_ioctl(struct file *filp,
 			       unsigned int ioctl, unsigned long arg);
-#else
-static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
-					     unsigned int ioctl,
-					     unsigned long arg)
-{
-	return -ENOIOCTLCMD;
-}
-#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
-
 void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
 
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5f0015c5dd95..267c7369c765 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS
        tristate
        select IRQ_BYPASS_MANAGER
 
-config HAVE_KVM_VCPU_ASYNC_IOCTL
-       bool
-
 config HAVE_KVM_VCPU_RUN_PID_CHANGE
        bool
 
-- 
cgit v1.2.3


From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 30 Oct 2025 13:09:25 -0700
Subject: KVM: Rename kvm_arch_vcpu_async_ioctl() to
 kvm_arch_vcpu_unlocked_ioctl()

Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's
TDX code doesn't result in a massive misnomer.  To avoid having to retry
SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and
acquiring all of those locks after/inside the current vCPU's mutex is a
non-starter.  However, TDX also needs to acquire the vCPU's mutex and load
the vCPU, i.e. the handling is very much not async to the vCPU.

No functional change intended.

Acked-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
Tested-by: Yan Zhao <yan.y.zhao@intel.com>
Tested-by: Kai Huang <kai.huang@intel.com>
Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/arm.c       | 4 ++--
 arch/loongarch/kvm/vcpu.c  | 4 ++--
 arch/mips/kvm/mips.c       | 4 ++--
 arch/powerpc/kvm/powerpc.c | 4 ++--
 arch/riscv/kvm/vcpu.c      | 4 ++--
 arch/s390/kvm/kvm-s390.c   | 4 ++--
 arch/x86/kvm/x86.c         | 4 ++--
 include/linux/kvm_host.h   | 4 ++--
 virt/kvm/kvm_main.c        | 6 +++---
 9 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index ef5bf57f79b7..cf23f6b07ec7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1835,8 +1835,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
-			       unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	return -ENOIOCTLCMD;
 }
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 30e3b089a596..9a5844e85fd3 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -1471,8 +1471,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	return 0;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
-			       unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	struct kvm_vcpu *vcpu = filp->private_data;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a75587018f44..b0fb92fda4d4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
-			       unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2ba057171ebe..9a89a6d98f97 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return -EINVAL;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
-			       unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index bccb919ca615..a4bd6077eecc 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
-			       unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 16ba04062854..8c4caa5f2fcd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
-			       unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca5ba2caf314..b85cb213a336 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7240,8 +7240,8 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
 	return 0;
 }
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
-			       unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
 {
 	return -ENOIOCTLCMD;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7186b2ae4b57..d93f75b05ae2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp,
+				  unsigned int ioctl, unsigned long arg);
 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
@@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_NO_POLL */
 
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
-			       unsigned int ioctl, unsigned long arg);
 void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
 
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b7a0ae2a7b20..b7db1d5f71a8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		return r;
 
 	/*
-	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
-	 * execution; mutex_lock() would break them.
+	 * Let arch code handle select vCPU ioctls without holding vcpu->mutex,
+	 * e.g. to support ioctls that can run asynchronous to vCPU execution.
 	 */
-	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+	r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
 	if (r != -ENOIOCTLCMD)
 		return r;
 
-- 
cgit v1.2.3


From e0b62a4dee24e9176f2c4be52a1b47fe1d97c560 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 4 Nov 2025 15:46:33 +0100
Subject: fs: add fs/super_types.h header

Split out super block associated structures into a separate header.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-2-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h             | 308 +------------------------------------
 include/linux/fs/super_types.h | 335 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 336 insertions(+), 307 deletions(-)
 create mode 100644 include/linux/fs/super_types.h

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3c971ddace41..ae71c359077a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
+#include <linux/fs/super_types.h>
 #include <linux/vfsdebug.h>
 #include <linux/linkage.h>
 #include <linux/wait_bit.h>
@@ -11,7 +12,6 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/list.h>
-#include <linux/list_lru.h>
 #include <linux/llist.h>
 #include <linux/radix-tree.h>
 #include <linux/xarray.h>
@@ -37,7 +37,6 @@
 #include <linux/uuid.h>
 #include <linux/errseq.h>
 #include <linux/ioprio.h>
-#include <linux/fs_dirent.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
 #include <linux/mount.h>
@@ -52,11 +51,9 @@
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
 
-struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
 struct io_comp_batch;
-struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
 struct iovec;
@@ -70,12 +67,8 @@ struct vfsmount;
 struct cred;
 struct swap_info_struct;
 struct seq_file;
-struct workqueue_struct;
 struct iov_iter;
-struct fscrypt_operations;
-struct fsverity_operations;
 struct fsnotify_mark_connector;
-struct fsnotify_sb_info;
 struct fs_context;
 struct fs_parameter_spec;
 struct file_kattr;
@@ -298,11 +291,6 @@ struct iattr {
 	struct file	*ia_file;
 };
 
-/*
- * Includes for diskquotas.
- */
-#include <linux/quota.h>
-
 /*
  * Maximum number of layers of fs stack.  Needs to be limited to
  * prevent kernel stack overflow
@@ -1347,49 +1335,6 @@ extern void f_delown(struct file *filp);
 extern pid_t f_getown(struct file *filp);
 extern int send_sigurg(struct file *file);
 
-/*
- * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
- * represented in both.
- */
-#define SB_RDONLY       BIT(0)	/* Mount read-only */
-#define SB_NOSUID       BIT(1)	/* Ignore suid and sgid bits */
-#define SB_NODEV        BIT(2)	/* Disallow access to device special files */
-#define SB_NOEXEC       BIT(3)	/* Disallow program execution */
-#define SB_SYNCHRONOUS  BIT(4)	/* Writes are synced at once */
-#define SB_MANDLOCK     BIT(6)	/* Allow mandatory locks on an FS */
-#define SB_DIRSYNC      BIT(7)	/* Directory modifications are synchronous */
-#define SB_NOATIME      BIT(10)	/* Do not update access times. */
-#define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
-#define SB_SILENT       BIT(15)
-#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
-#define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
-#define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
-#define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
-#define SB_LAZYTIME     BIT(25)	/* Update the on-disk [acm]times lazily */
-
-/* These sb flags are internal to the kernel */
-#define SB_DEAD         BIT(21)
-#define SB_DYING        BIT(24)
-#define SB_FORCE        BIT(27)
-#define SB_NOSEC        BIT(28)
-#define SB_BORN         BIT(29)
-#define SB_ACTIVE       BIT(30)
-#define SB_NOUSER       BIT(31)
-
-/* These flags relate to encoding and casefolding */
-#define SB_ENC_STRICT_MODE_FL		(1 << 0)
-#define SB_ENC_NO_COMPAT_FALLBACK_FL	(1 << 1)
-
-#define sb_has_strict_encoding(sb) \
-	(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
-
-#if IS_ENABLED(CONFIG_UNICODE)
-#define sb_no_casefold_compat_fallback(sb) \
-	(sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
-#else
-#define sb_no_casefold_compat_fallback(sb) (1)
-#endif
-
 /*
  *	Umount options
  */
@@ -1400,191 +1345,6 @@ extern int send_sigurg(struct file *file);
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
-/* sb->s_iflags */
-#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
-#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
-#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
-#define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */
-
-/* sb->s_iflags to limit user namespace mounts */
-#define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
-#define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
-#define SB_I_UNTRUSTED_MOUNTER		0x00000040
-#define SB_I_EVM_HMAC_UNSUPPORTED	0x00000080
-
-#define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
-#define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
-#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
-#define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
-#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
-#define SB_I_NOIDMAP	0x00002000	/* No idmapped mounts on this superblock */
-#define SB_I_ALLOW_HSM	0x00004000	/* Allow HSM events on this superblock */
-
-/* Possible states of 'frozen' field */
-enum {
-	SB_UNFROZEN = 0,		/* FS is unfrozen */
-	SB_FREEZE_WRITE	= 1,		/* Writes, dir ops, ioctls frozen */
-	SB_FREEZE_PAGEFAULT = 2,	/* Page faults stopped as well */
-	SB_FREEZE_FS = 3,		/* For internal FS use (e.g. to stop
-					 * internal threads if needed) */
-	SB_FREEZE_COMPLETE = 4,		/* ->freeze_fs finished successfully */
-};
-
-#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
-
-struct sb_writers {
-	unsigned short			frozen;		/* Is sb frozen? */
-	int				freeze_kcount;	/* How many kernel freeze requests? */
-	int				freeze_ucount;	/* How many userspace freeze requests? */
-	const void			*freeze_owner;	/* Owner of the freeze */
-	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
-};
-
-struct mount;
-
-struct super_block {
-	struct list_head	s_list;		/* Keep this first */
-	dev_t			s_dev;		/* search index; _not_ kdev_t */
-	unsigned char		s_blocksize_bits;
-	unsigned long		s_blocksize;
-	loff_t			s_maxbytes;	/* Max file size */
-	struct file_system_type	*s_type;
-	const struct super_operations	*s_op;
-	const struct dquot_operations	*dq_op;
-	const struct quotactl_ops	*s_qcop;
-	const struct export_operations *s_export_op;
-	unsigned long		s_flags;
-	unsigned long		s_iflags;	/* internal SB_I_* flags */
-	unsigned long		s_magic;
-	struct dentry		*s_root;
-	struct rw_semaphore	s_umount;
-	int			s_count;
-	atomic_t		s_active;
-#ifdef CONFIG_SECURITY
-	void                    *s_security;
-#endif
-	const struct xattr_handler * const *s_xattr;
-#ifdef CONFIG_FS_ENCRYPTION
-	const struct fscrypt_operations	*s_cop;
-	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
-#endif
-#ifdef CONFIG_FS_VERITY
-	const struct fsverity_operations *s_vop;
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
-	struct unicode_map *s_encoding;
-	__u16 s_encoding_flags;
-#endif
-	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
-	struct mount		*s_mounts;	/* list of mounts; _not_ for fs use */
-	struct block_device	*s_bdev;	/* can go away once we use an accessor for @s_bdev_file */
-	struct file		*s_bdev_file;
-	struct backing_dev_info *s_bdi;
-	struct mtd_info		*s_mtd;
-	struct hlist_node	s_instances;
-	unsigned int		s_quota_types;	/* Bitmask of supported quota types */
-	struct quota_info	s_dquot;	/* Diskquota specific options */
-
-	struct sb_writers	s_writers;
-
-	/*
-	 * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
-	 * s_fsnotify_info together for cache efficiency. They are frequently
-	 * accessed and rarely modified.
-	 */
-	void			*s_fs_info;	/* Filesystem private info */
-
-	/* Granularity of c/m/atime in ns (cannot be worse than a second) */
-	u32			s_time_gran;
-	/* Time limits for c/m/atime in seconds */
-	time64_t		   s_time_min;
-	time64_t		   s_time_max;
-#ifdef CONFIG_FSNOTIFY
-	u32			s_fsnotify_mask;
-	struct fsnotify_sb_info	*s_fsnotify_info;
-#endif
-
-	/*
-	 * q: why are s_id and s_sysfs_name not the same? both are human
-	 * readable strings that identify the filesystem
-	 * a: s_id is allowed to change at runtime; it's used in log messages,
-	 * and we want to when a device starts out as single device (s_id is dev
-	 * name) but then a device is hot added and we have to switch to
-	 * identifying it by UUID
-	 * but s_sysfs_name is a handle for programmatic access, and can't
-	 * change at runtime
-	 */
-	char			s_id[32];	/* Informational name */
-	uuid_t			s_uuid;		/* UUID */
-	u8			s_uuid_len;	/* Default 16, possibly smaller for weird filesystems */
-
-	/* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
-	char			s_sysfs_name[UUID_STRING_LEN + 1];
-
-	unsigned int		s_max_links;
-	unsigned int		s_d_flags;	/* default d_flags for dentries */
-
-	/*
-	 * The next field is for VFS *only*. No filesystems have any business
-	 * even looking at it. You had been warned.
-	 */
-	struct mutex s_vfs_rename_mutex;	/* Kludge */
-
-	/*
-	 * Filesystem subtype.  If non-empty the filesystem type field
-	 * in /proc/mounts will be "type.subtype"
-	 */
-	const char *s_subtype;
-
-	const struct dentry_operations *__s_d_op; /* default d_op for dentries */
-
-	struct shrinker *s_shrink;	/* per-sb shrinker handle */
-
-	/* Number of inodes with nlink == 0 but still referenced */
-	atomic_long_t s_remove_count;
-
-	/* Read-only state of the superblock is being changed */
-	int s_readonly_remount;
-
-	/* per-sb errseq_t for reporting writeback errors via syncfs */
-	errseq_t s_wb_err;
-
-	/* AIO completions deferred from interrupt context */
-	struct workqueue_struct *s_dio_done_wq;
-	struct hlist_head s_pins;
-
-	/*
-	 * Owning user namespace and default context in which to
-	 * interpret filesystem uids, gids, quotas, device nodes,
-	 * xattrs and security labels.
-	 */
-	struct user_namespace *s_user_ns;
-
-	/*
-	 * The list_lru structure is essentially just a pointer to a table
-	 * of per-node lru lists, each of which has its own spinlock.
-	 * There is no need to put them into separate cachelines.
-	 */
-	struct list_lru		s_dentry_lru;
-	struct list_lru		s_inode_lru;
-	struct rcu_head		rcu;
-	struct work_struct	destroy_work;
-
-	struct mutex		s_sync_lock;	/* sync serialisation lock */
-
-	/*
-	 * Indicates how deep in a filesystem stack this SB is
-	 */
-	int s_stack_depth;
-
-	/* s_inode_list_lock protects s_inodes */
-	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inodes;	/* all inodes */
-
-	spinlock_t		s_inode_wblist_lock;
-	struct list_head	s_inodes_wb;	/* writeback inodes */
-} __randomize_layout;
-
 static inline struct user_namespace *i_user_ns(const struct inode *inode)
 {
 	return inode->i_sb->s_user_ns;
@@ -2431,72 +2191,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 					struct file *dst_file, loff_t dst_pos,
 					loff_t len, unsigned int remap_flags);
 
-/**
- * enum freeze_holder - holder of the freeze
- * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
- * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
- * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
- * @FREEZE_EXCL: a freeze that can only be undone by the owner
- *
- * Indicate who the owner of the freeze or thaw request is and whether
- * the freeze needs to be exclusive or can nest.
- * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
- * same holder aren't allowed. It is however allowed to hold a single
- * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
- * the same time. This is relied upon by some filesystems during online
- * repair or similar.
- */
-enum freeze_holder {
-	FREEZE_HOLDER_KERNEL	= (1U << 0),
-	FREEZE_HOLDER_USERSPACE	= (1U << 1),
-	FREEZE_MAY_NEST		= (1U << 2),
-	FREEZE_EXCL		= (1U << 3),
-};
-
-struct super_operations {
-   	struct inode *(*alloc_inode)(struct super_block *sb);
-	void (*destroy_inode)(struct inode *);
-	void (*free_inode)(struct inode *);
-
-   	void (*dirty_inode) (struct inode *, int flags);
-	int (*write_inode) (struct inode *, struct writeback_control *wbc);
-	int (*drop_inode) (struct inode *);
-	void (*evict_inode) (struct inode *);
-	void (*put_super) (struct super_block *);
-	int (*sync_fs)(struct super_block *sb, int wait);
-	int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner);
-	int (*freeze_fs) (struct super_block *);
-	int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner);
-	int (*unfreeze_fs) (struct super_block *);
-	int (*statfs) (struct dentry *, struct kstatfs *);
-	int (*remount_fs) (struct super_block *, int *, char *);
-	void (*umount_begin) (struct super_block *);
-
-	int (*show_options)(struct seq_file *, struct dentry *);
-	int (*show_devname)(struct seq_file *, struct dentry *);
-	int (*show_path)(struct seq_file *, struct dentry *);
-	int (*show_stats)(struct seq_file *, struct dentry *);
-#ifdef CONFIG_QUOTA
-	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
-	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
-	struct dquot __rcu **(*get_dquots)(struct inode *);
-#endif
-	long (*nr_cached_objects)(struct super_block *,
-				  struct shrink_control *);
-	long (*free_cached_objects)(struct super_block *,
-				    struct shrink_control *);
-	/*
-	 * If a filesystem can support graceful removal of a device and
-	 * continue read-write operations, implement this callback.
-	 *
-	 * Return 0 if the filesystem can continue read-write.
-	 * Non-zero return value or no such callback means the fs will be shutdown
-	 * as usual.
-	 */
-	int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
-	void (*shutdown)(struct super_block *sb);
-};
-
 /*
  * Inode flags - they have no relation to superblock flags now
  */
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
new file mode 100644
index 000000000000..45cfd45b9fe0
--- /dev/null
+++ b/include/linux/fs/super_types.h
@@ -0,0 +1,335 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_TYPES_H
+#define _LINUX_FS_SUPER_TYPES_H
+
+#include <linux/fs_dirent.h>
+#include <linux/errseq.h>
+#include <linux/list_lru.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/llist.h>
+#include <linux/uidgid.h>
+#include <linux/uuid.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/workqueue_types.h>
+#include <linux/quota.h>
+
+struct backing_dev_info;
+struct block_device;
+struct dentry;
+struct dentry_operations;
+struct dquot_operations;
+struct export_operations;
+struct file;
+struct file_system_type;
+struct fscrypt_operations;
+struct fsnotify_sb_info;
+struct fsverity_operations;
+struct kstatfs;
+struct mount;
+struct mtd_info;
+struct quotactl_ops;
+struct shrinker;
+struct unicode_map;
+struct user_namespace;
+struct workqueue_struct;
+struct writeback_control;
+struct xattr_handler;
+
+extern struct super_block *blockdev_superblock;
+
+/* Possible states of 'frozen' field */
+enum {
+	SB_UNFROZEN		= 0,	/* FS is unfrozen */
+	SB_FREEZE_WRITE		= 1,	/* Writes, dir ops, ioctls frozen */
+	SB_FREEZE_PAGEFAULT	= 2,	/* Page faults stopped as well */
+	SB_FREEZE_FS		= 3,	/* For internal FS use (e.g. to stop internal threads if needed) */
+	SB_FREEZE_COMPLETE	= 4,	/* ->freeze_fs finished successfully */
+};
+
+#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
+
+struct sb_writers {
+	unsigned short			frozen;		/* Is sb frozen? */
+	int				freeze_kcount;	/* How many kernel freeze requests? */
+	int				freeze_ucount;	/* How many userspace freeze requests? */
+	const void			*freeze_owner;	/* Owner of the freeze */
+	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
+};
+
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ * @FREEZE_EXCL: a freeze that can only be undone by the owner
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ */
+enum freeze_holder {
+	FREEZE_HOLDER_KERNEL	= (1U << 0),
+	FREEZE_HOLDER_USERSPACE	= (1U << 1),
+	FREEZE_MAY_NEST		= (1U << 2),
+	FREEZE_EXCL		= (1U << 3),
+};
+
+struct super_operations {
+	struct inode *(*alloc_inode)(struct super_block *sb);
+	void (*destroy_inode)(struct inode *inode);
+	void (*free_inode)(struct inode *inode);
+	void (*dirty_inode)(struct inode *inode, int flags);
+	int (*write_inode)(struct inode *inode, struct writeback_control *wbc);
+	int (*drop_inode)(struct inode *inode);
+	void (*evict_inode)(struct inode *inode);
+	void (*put_super)(struct super_block *sb);
+	int (*sync_fs)(struct super_block *sb, int wait);
+	int (*freeze_super)(struct super_block *sb, enum freeze_holder who,
+			    const void *owner);
+	int (*freeze_fs)(struct super_block *sb);
+	int (*thaw_super)(struct super_block *sb, enum freeze_holder who,
+			  const void *owner);
+	int (*unfreeze_fs)(struct super_block *sb);
+	int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs);
+	int (*remount_fs) (struct super_block *, int *, char *);
+	void (*umount_begin)(struct super_block *sb);
+
+	int (*show_options)(struct seq_file *seq, struct dentry *dentry);
+	int (*show_devname)(struct seq_file *seq, struct dentry *dentry);
+	int (*show_path)(struct seq_file *seq, struct dentry *dentry);
+	int (*show_stats)(struct seq_file *seq, struct dentry *dentry);
+#ifdef CONFIG_QUOTA
+	ssize_t (*quota_read)(struct super_block *sb, int type, char *data,
+			      size_t len, loff_t off);
+	ssize_t (*quota_write)(struct super_block *sb, int type,
+			       const char *data, size_t len, loff_t off);
+	struct dquot __rcu **(*get_dquots)(struct inode *inode);
+#endif
+	long (*nr_cached_objects)(struct super_block *sb,
+				  struct shrink_control *sc);
+	long (*free_cached_objects)(struct super_block *sb,
+				    struct shrink_control *sc);
+	/*
+	 * If a filesystem can support graceful removal of a device and
+	 * continue read-write operations, implement this callback.
+	 *
+	 * Return 0 if the filesystem can continue read-write.
+	 * Non-zero return value or no such callback means the fs will be shut down
+	 * as usual.
+	 */
+	int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
+	void (*shutdown)(struct super_block *sb);
+};
+
+struct super_block {
+	struct list_head			s_list;		/* Keep this first */
+	dev_t					s_dev;		/* search index; _not_ kdev_t */
+	unsigned char				s_blocksize_bits;
+	unsigned long				s_blocksize;
+	loff_t					s_maxbytes;	/* Max file size */
+	struct file_system_type			*s_type;
+	const struct super_operations		*s_op;
+	const struct dquot_operations		*dq_op;
+	const struct quotactl_ops		*s_qcop;
+	const struct export_operations		*s_export_op;
+	unsigned long				s_flags;
+	unsigned long				s_iflags;	/* internal SB_I_* flags */
+	unsigned long				s_magic;
+	struct dentry				*s_root;
+	struct rw_semaphore			s_umount;
+	int					s_count;
+	atomic_t				s_active;
+#ifdef CONFIG_SECURITY
+	void					*s_security;
+#endif
+	const struct xattr_handler		*const *s_xattr;
+#ifdef CONFIG_FS_ENCRYPTION
+	const struct fscrypt_operations		*s_cop;
+	struct fscrypt_keyring			*s_master_keys; /* master crypto keys in use */
+#endif
+#ifdef CONFIG_FS_VERITY
+	const struct fsverity_operations	*s_vop;
+#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+	struct unicode_map			*s_encoding;
+	__u16					s_encoding_flags;
+#endif
+	struct hlist_bl_head			s_roots;	/* alternate root dentries for NFS */
+	struct mount				*s_mounts;	/* list of mounts; _not_ for fs use */
+	struct block_device			*s_bdev;	/* can go away once we use an accessor for @s_bdev_file */
+	struct file				*s_bdev_file;
+	struct backing_dev_info 		*s_bdi;
+	struct mtd_info				*s_mtd;
+	struct hlist_node			s_instances;
+	unsigned int				s_quota_types;	/* Bitmask of supported quota types */
+	struct quota_info			s_dquot;	/* Diskquota specific options */
+
+	struct sb_writers			s_writers;
+
+	/*
+	 * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
+	 * s_fsnotify_info together for cache efficiency. They are frequently
+	 * accessed and rarely modified.
+	 */
+	void					*s_fs_info;	/* Filesystem private info */
+
+	/* Granularity of c/m/atime in ns (cannot be worse than a second) */
+	u32					s_time_gran;
+	/* Time limits for c/m/atime in seconds */
+	time64_t				s_time_min;
+	time64_t		   		s_time_max;
+#ifdef CONFIG_FSNOTIFY
+	u32					s_fsnotify_mask;
+	struct fsnotify_sb_info			*s_fsnotify_info;
+#endif
+
+	/*
+	 * q: why are s_id and s_sysfs_name not the same? both are human
+	 * readable strings that identify the filesystem
+	 * a: s_id is allowed to change at runtime; it's used in log messages,
+	 * and we want it to change when a filesystem starts out as a single
+	 * device (s_id is the dev name) but then a device is hot added and we
+	 * have to switch to identifying it by UUID
+	 * but s_sysfs_name is a handle for programmatic access, and can't
+	 * change at runtime
+	 */
+	char					s_id[32];	/* Informational name */
+	uuid_t					s_uuid;		/* UUID */
+	u8					s_uuid_len;	/* Default 16, possibly smaller for weird filesystems */
+
+	/* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
+	char					s_sysfs_name[UUID_STRING_LEN + 1];
+
+	unsigned int				s_max_links;
+	unsigned int				s_d_flags;	/* default d_flags for dentries */
+
+	/*
+	 * The next field is for VFS *only*. No filesystems have any business
+	 * even looking at it. You have been warned.
+	 */
+	struct mutex				s_vfs_rename_mutex;	/* Kludge */
+
+	/*
+	 * Filesystem subtype.  If non-empty the filesystem type field
+	 * in /proc/mounts will be "type.subtype"
+	 */
+	const char				*s_subtype;
+
+	const struct dentry_operations		*__s_d_op; /* default d_op for dentries */
+
+	struct shrinker				*s_shrink;	/* per-sb shrinker handle */
+
+	/* Number of inodes with nlink == 0 but still referenced */
+	atomic_long_t				s_remove_count;
+
+	/* Read-only state of the superblock is being changed */
+	int					s_readonly_remount;
+
+	/* per-sb errseq_t for reporting writeback errors via syncfs */
+	errseq_t s_wb_err;
+
+	/* AIO completions deferred from interrupt context */
+	struct workqueue_struct			*s_dio_done_wq;
+	struct hlist_head			s_pins;
+
+	/*
+	 * Owning user namespace and default context in which to
+	 * interpret filesystem uids, gids, quotas, device nodes,
+	 * xattrs and security labels.
+	 */
+	struct user_namespace			*s_user_ns;
+
+	/*
+	 * The list_lru structure is essentially just a pointer to a table
+	 * of per-node lru lists, each of which has its own spinlock.
+	 * There is no need to put them into separate cachelines.
+	 */
+	struct list_lru				s_dentry_lru;
+	struct list_lru				s_inode_lru;
+	struct rcu_head				rcu;
+	struct work_struct			destroy_work;
+
+	struct mutex				s_sync_lock;	/* sync serialisation lock */
+
+	/*
+	 * Indicates how deep in a filesystem stack this SB is
+	 */
+	int s_stack_depth;
+
+	/* s_inode_list_lock protects s_inodes */
+	spinlock_t				s_inode_list_lock ____cacheline_aligned_in_smp;
+	struct list_head			s_inodes;	/* all inodes */
+
+	spinlock_t				s_inode_wblist_lock;
+	struct list_head			s_inodes_wb;	/* writeback inodes */
+} __randomize_layout;
+
+/*
+ * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
+ * represented in both.
+ */
+#define SB_RDONLY       BIT(0)	/* Mount read-only */
+#define SB_NOSUID       BIT(1)	/* Ignore suid and sgid bits */
+#define SB_NODEV        BIT(2)	/* Disallow access to device special files */
+#define SB_NOEXEC       BIT(3)	/* Disallow program execution */
+#define SB_SYNCHRONOUS  BIT(4)	/* Writes are synced at once */
+#define SB_MANDLOCK     BIT(6)	/* Allow mandatory locks on an FS */
+#define SB_DIRSYNC      BIT(7)	/* Directory modifications are synchronous */
+#define SB_NOATIME      BIT(10)	/* Do not update access times. */
+#define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
+#define SB_SILENT       BIT(15)
+#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
+#define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
+#define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
+#define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
+#define SB_LAZYTIME     BIT(25)	/* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define SB_DEAD         BIT(21)
+#define SB_DYING        BIT(24)
+#define SB_FORCE        BIT(27)
+#define SB_NOSEC        BIT(28)
+#define SB_BORN         BIT(29)
+#define SB_ACTIVE       BIT(30)
+#define SB_NOUSER       BIT(31)
+
+/* These flags relate to encoding and casefolding */
+#define SB_ENC_STRICT_MODE_FL		(1 << 0)
+#define SB_ENC_NO_COMPAT_FALLBACK_FL	(1 << 1)
+
+#define sb_has_strict_encoding(sb) \
+	(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
+
+#if IS_ENABLED(CONFIG_UNICODE)
+#define sb_no_casefold_compat_fallback(sb) \
+	(sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
+#else
+#define sb_no_casefold_compat_fallback(sb) (1)
+#endif
+
+/* sb->s_iflags */
+#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
+#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
+#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
+#define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */
+
+/* sb->s_iflags to limit user namespace mounts */
+#define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
+#define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
+#define SB_I_UNTRUSTED_MOUNTER		0x00000040
+#define SB_I_EVM_HMAC_UNSUPPORTED	0x00000080
+
+#define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
+#define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
+#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
+#define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
+#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
+#define SB_I_NOIDMAP	0x00002000	/* No idmapped mounts on this superblock */
+#define SB_I_ALLOW_HSM	0x00004000	/* Allow HSM events on this superblock */
+
+#endif /* _LINUX_FS_SUPER_TYPES_H */
-- 
cgit v1.2.3


From f7b3d14165222a3ad9c4d0d31dfa81e396751801 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 4 Nov 2025 15:46:34 +0100
Subject: fs: add fs/super.h header

Split out super block associated functions into a separate header.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-3-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h       | 220 +-------------------------------------------
 include/linux/fs/super.h | 233 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 219 deletions(-)
 create mode 100644 include/linux/fs/super.h

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae71c359077a..64af28318fbf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
-#include <linux/fs/super_types.h>
+#include <linux/fs/super.h>
 #include <linux/vfsdebug.h>
 #include <linux/linkage.h>
 #include <linux/wait_bit.h>
@@ -1662,66 +1662,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode);
  * Snapshotting support.
  */
 
-/*
- * These are internal functions, please use sb_start_{write,pagefault,intwrite}
- * instead.
- */
-static inline void __sb_end_write(struct super_block *sb, int level)
-{
-	percpu_up_read(sb->s_writers.rw_sem + level-1);
-}
-
-static inline void __sb_start_write(struct super_block *sb, int level)
-{
-	percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
-}
-
-static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
-{
-	return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
-}
-
-#define __sb_writers_acquired(sb, lev)	\
-	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
-#define __sb_writers_release(sb, lev)	\
-	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
-
-/**
- * __sb_write_started - check if sb freeze level is held
- * @sb: the super we write to
- * @level: the freeze level
- *
- * * > 0 - sb freeze level is held
- * *   0 - sb freeze level is not held
- * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
- */
-static inline int __sb_write_started(const struct super_block *sb, int level)
-{
-	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
-}
-
-/**
- * sb_write_started - check if SB_FREEZE_WRITE is held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_started(const struct super_block *sb)
-{
-	return __sb_write_started(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_write_not_started - check if SB_FREEZE_WRITE is not held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_not_started(const struct super_block *sb)
-{
-	return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
-}
-
 /**
  * file_write_started - check if SB_FREEZE_WRITE is held
  * @file: the file we write to
@@ -1752,118 +1692,6 @@ static inline bool file_write_not_started(const struct file *file)
 	return sb_write_not_started(file_inode(file)->i_sb);
 }
 
-/**
- * sb_end_write - drop write access to a superblock
- * @sb: the super we wrote to
- *
- * Decrement number of writers to the filesystem. Wake up possible waiters
- * wanting to freeze the filesystem.
- */
-static inline void sb_end_write(struct super_block *sb)
-{
-	__sb_end_write(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_end_pagefault - drop write access to a superblock from a page fault
- * @sb: the super we wrote to
- *
- * Decrement number of processes handling write page fault to the filesystem.
- * Wake up possible waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_pagefault(struct super_block *sb)
-{
-	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_end_intwrite - drop write access to a superblock for internal fs purposes
- * @sb: the super we wrote to
- *
- * Decrement fs-internal number of writers to the filesystem.  Wake up possible
- * waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_intwrite(struct super_block *sb)
-{
-	__sb_end_write(sb, SB_FREEZE_FS);
-}
-
-/**
- * sb_start_write - get write access to a superblock
- * @sb: the super we write to
- *
- * When a process wants to write data or metadata to a file system (i.e. dirty
- * a page or an inode), it should embed the operation in a sb_start_write() -
- * sb_end_write() pair to get exclusion against file system freezing. This
- * function increments number of writers preventing freezing. If the file
- * system is already frozen, the function waits until the file system is
- * thawed.
- *
- * Since freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. Generally,
- * freeze protection should be the outermost lock. In particular, we have:
- *
- * sb_start_write
- *   -> i_rwsem			(write path, truncate, directory ops, ...)
- *   -> s_umount		(freeze_super, thaw_super)
- */
-static inline void sb_start_write(struct super_block *sb)
-{
-	__sb_start_write(sb, SB_FREEZE_WRITE);
-}
-
-static inline bool sb_start_write_trylock(struct super_block *sb)
-{
-	return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_start_pagefault - get write access to a superblock from a page fault
- * @sb: the super we write to
- *
- * When a process starts handling write page fault, it should embed the
- * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
- * exclusion against file system freezing. This is needed since the page fault
- * is going to dirty a page. This function increments number of running page
- * faults preventing freezing. If the file system is already frozen, the
- * function waits until the file system is thawed.
- *
- * Since page fault freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. It is advised to
- * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
- * handling code implies lock dependency:
- *
- * mmap_lock
- *   -> sb_start_pagefault
- */
-static inline void sb_start_pagefault(struct super_block *sb)
-{
-	__sb_start_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_start_intwrite - get write access to a superblock for internal fs purposes
- * @sb: the super we write to
- *
- * This is the third level of protection against filesystem freezing. It is
- * free for use by a filesystem. The only requirement is that it must rank
- * below sb_start_pagefault.
- *
- * For example filesystem can call sb_start_intwrite() when starting a
- * transaction which somewhat eases handling of freezing for internal sources
- * of filesystem changes (internal fs threads, discarding preallocation on file
- * close, etc.).
- */
-static inline void sb_start_intwrite(struct super_block *sb)
-{
-	__sb_start_write(sb, SB_FREEZE_FS);
-}
-
-static inline bool sb_start_intwrite_trylock(struct super_block *sb)
-{
-	return __sb_start_write_trylock(sb, SB_FREEZE_FS);
-}
-
 bool inode_owner_or_capable(struct mnt_idmap *idmap,
 			    const struct inode *inode);
 
@@ -2233,7 +2061,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
  */
 #define __IS_FLG(inode, flg)	((inode)->i_sb->s_flags & (flg))
 
-static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; }
 #define IS_RDONLY(inode)	sb_rdonly((inode)->i_sb)
 #define IS_SYNC(inode)		(__IS_FLG(inode, SB_SYNCHRONOUS) || \
 					((inode)->i_flags & S_SYNC))
@@ -2467,10 +2294,6 @@ extern int unregister_filesystem(struct file_system_type *);
 extern int vfs_statfs(const struct path *, struct kstatfs *);
 extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
-int freeze_super(struct super_block *super, enum freeze_holder who,
-		 const void *freeze_owner);
-int thaw_super(struct super_block *super, enum freeze_holder who,
-	       const void *freeze_owner);
 extern __printf(2, 3)
 int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
 extern int super_setup_bdi(struct super_block *sb);
@@ -2657,12 +2480,6 @@ extern struct kmem_cache *names_cachep;
 #define __getname()		kmem_cache_alloc(names_cachep, GFP_KERNEL)
 #define __putname(name)		kmem_cache_free(names_cachep, (void *)(name))
 
-extern struct super_block *blockdev_superblock;
-static inline bool sb_is_blkdev_sb(struct super_block *sb)
-{
-	return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
-}
-
 void emergency_thaw_all(void);
 extern int sync_filesystem(struct super_block *);
 extern const struct file_operations def_blk_fops;
@@ -3117,9 +2934,6 @@ static inline void remove_inode_hash(struct inode *inode)
 extern void inode_sb_list_add(struct inode *inode);
 extern void inode_add_lru(struct inode *inode);
 
-extern int sb_set_blocksize(struct super_block *, int);
-extern int sb_min_blocksize(struct super_block *, int);
-
 int generic_file_mmap(struct file *, struct vm_area_struct *);
 int generic_file_mmap_prepare(struct vm_area_desc *desc);
 int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
@@ -3439,38 +3253,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir,
 }
 #endif
 
-static inline struct unicode_map *sb_encoding(const struct super_block *sb)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
-	return sb->s_encoding;
-#else
-	return NULL;
-#endif
-}
-
-static inline bool sb_has_encoding(const struct super_block *sb)
-{
-	return !!sb_encoding(sb);
-}
-
-/*
- * Compare if two super blocks have the same encoding and flags
- */
-static inline bool sb_same_encoding(const struct super_block *sb1,
-				    const struct super_block *sb2)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
-	if (sb1->s_encoding == sb2->s_encoding)
-		return true;
-
-	return (sb1->s_encoding && sb2->s_encoding &&
-	       (sb1->s_encoding->version == sb2->s_encoding->version) &&
-	       (sb1->s_encoding_flags == sb2->s_encoding_flags));
-#else
-	return true;
-#endif
-}
-
 int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 		unsigned int ia_valid);
 int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
new file mode 100644
index 000000000000..c0d22b12c1c9
--- /dev/null
+++ b/include/linux/fs/super.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_H
+#define _LINUX_FS_SUPER_H
+
+#include <linux/fs/super_types.h>
+#include <linux/unicode.h>
+
+/*
+ * These are internal functions, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+static inline void __sb_end_write(struct super_block *sb, int level)
+{
+	percpu_up_read(sb->s_writers.rw_sem + level - 1);
+}
+
+static inline void __sb_start_write(struct super_block *sb, int level)
+{
+	percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
+}
+
+static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
+{
+	return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
+}
+
+#define __sb_writers_acquired(sb, lev) \
+	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev) \
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_)
+
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * * > 0 - sb freeze level is held
+ * *   0 - sb freeze level is not held
+ * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_started(const struct super_block *sb)
+{
+	return __sb_write_started(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+	return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
+/**
+ * sb_end_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement number of writers to the filesystem. Wake up possible waiters
+ * wanting to freeze the filesystem.
+ */
+static inline void sb_end_write(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_end_pagefault - drop write access to a superblock from a page fault
+ * @sb: the super we wrote to
+ *
+ * Decrement number of processes handling write page fault to the filesystem.
+ * Wake up possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_pagefault(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_end_intwrite - drop write access to a superblock for internal fs purposes
+ * @sb: the super we wrote to
+ *
+ * Decrement fs-internal number of writers to the filesystem.  Wake up possible
+ * waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_intwrite(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_FS);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data or metadata to a file system (i.e. dirty
+ * a page or an inode), it should embed the operation in a sb_start_write() -
+ * sb_end_write() pair to get exclusion against file system freezing. This
+ * function increments number of writers preventing freezing. If the file
+ * system is already frozen, the function waits until the file system is
+ * thawed.
+ *
+ * Since freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. Generally,
+ * freeze protection should be the outermost lock. In particular, we have:
+ *
+ * sb_start_write
+ *   -> i_rwsem			(write path, truncate, directory ops, ...)
+ *   -> s_umount		(freeze_super, thaw_super)
+ */
+static inline void sb_start_write(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_WRITE);
+}
+
+static inline bool sb_start_write_trylock(struct super_block *sb)
+{
+	return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_start_pagefault - get write access to a superblock from a page fault
+ * @sb: the super we write to
+ *
+ * When a process starts handling write page fault, it should embed the
+ * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
+ * exclusion against file system freezing. This is needed since the page fault
+ * is going to dirty a page. This function increments number of running page
+ * faults preventing freezing. If the file system is already frozen, the
+ * function waits until the file system is thawed.
+ *
+ * Since page fault freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. It is advised to
+ * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
+ * handling code implies lock dependency:
+ *
+ * mmap_lock
+ *   -> sb_start_pagefault
+ */
+static inline void sb_start_pagefault(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_start_intwrite - get write access to a superblock for internal fs purposes
+ * @sb: the super we write to
+ *
+ * This is the third level of protection against filesystem freezing. It is
+ * free for use by a filesystem. The only requirement is that it must rank
+ * below sb_start_pagefault.
+ *
+ * For example filesystem can call sb_start_intwrite() when starting a
+ * transaction which somewhat eases handling of freezing for internal sources
+ * of filesystem changes (internal fs threads, discarding preallocation on file
+ * close, etc.).
+ */
+static inline void sb_start_intwrite(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_start_intwrite_trylock(struct super_block *sb)
+{
+	return __sb_start_write_trylock(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_rdonly(const struct super_block *sb)
+{
+	return sb->s_flags & SB_RDONLY;
+}
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+	return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+	return sb->s_encoding;
+}
+
+/* Compare if two super blocks have the same encoding and flags */
+static inline bool sb_same_encoding(const struct super_block *sb1,
+				    const struct super_block *sb2)
+{
+	if (sb1->s_encoding == sb2->s_encoding)
+		return true;
+
+	return (sb1->s_encoding && sb2->s_encoding &&
+		(sb1->s_encoding->version == sb2->s_encoding->version) &&
+		(sb1->s_encoding_flags == sb2->s_encoding_flags));
+}
+#else
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+	return NULL;
+}
+
+static inline bool sb_same_encoding(const struct super_block *sb1,
+				    const struct super_block *sb2)
+{
+	return true;
+}
+#endif
+
+static inline bool sb_has_encoding(const struct super_block *sb)
+{
+	return !!sb_encoding(sb);
+}
+
+int sb_set_blocksize(struct super_block *sb, int size);
+int sb_min_blocksize(struct super_block *sb, int size);
+
+int freeze_super(struct super_block *super, enum freeze_holder who,
+		 const void *freeze_owner);
+int thaw_super(struct super_block *super, enum freeze_holder who,
+	       const void *freeze_owner);
+
+#endif /* _LINUX_FS_SUPER_H */
-- 
cgit v1.2.3


From 5b8ed52866e3d19e02860c7cf1d6bbbd70b619e9 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 4 Nov 2025 18:04:48 +0100
Subject: fs: inline current_umask() and move it to fs_struct.h

There is no good reason to have this as a func call, other than avoiding
the churn of adding fs_struct.h as needed.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251104170448.630414-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/9p/acl.c               | 1 +
 fs/btrfs/inode.c          | 1 +
 fs/f2fs/acl.c             | 1 +
 fs/fat/inode.c            | 1 +
 fs/fs_struct.c            | 6 ------
 fs/hfsplus/options.c      | 1 +
 fs/hpfs/super.c           | 1 +
 fs/nilfs2/nilfs.h         | 1 +
 fs/ntfs3/super.c          | 1 +
 fs/ocfs2/acl.c            | 1 +
 fs/omfs/inode.c           | 1 +
 fs/smb/client/file.c      | 1 +
 fs/smb/client/inode.c     | 1 +
 fs/smb/client/smb1ops.c   | 1 +
 include/linux/fs.h        | 2 --
 include/linux/fs_struct.h | 6 ++++++
 include/linux/namei.h     | 1 +
 17 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eed551d8555f..633da5e37299 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -6,6 +6,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <linux/slab.h>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b1b3a0553ee..dee166426511 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9,6 +9,7 @@
 #include <linux/blk-cgroup.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index d4d7f329d23f..fa8d81a30fb9 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -9,6 +9,7 @@
  *
  * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
  */
+#include <linux/fs_struct.h>
 #include <linux/f2fs_fs.h>
 #include "f2fs.h"
 #include "xattr.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9648ed097816..309e560038dd 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -22,6 +22,7 @@
 #include <linux/unaligned.h>
 #include <linux/random.h>
 #include <linux/iversion.h>
+#include <linux/fs_struct.h>
 #include "fat.h"
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 28be762ac1c6..b8c46c5a38a0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -146,12 +146,6 @@ int unshare_fs_struct(void)
 }
 EXPORT_SYMBOL_GPL(unshare_fs_struct);
 
-int current_umask(void)
-{
-	return current->fs->umask;
-}
-EXPORT_SYMBOL(current_umask);
-
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index a66a09a56bf7..9b377481f397 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/fs_struct.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include <linux/nls.h>
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 8ab85e7ac91e..371aa6de8075 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,6 +9,7 @@
 
 #include "hpfs_fn.h"
 #include <linux/module.h>
+#include <linux/fs_struct.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include <linux/init.h>
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f466daa39440..b7e3d91b6243 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -14,6 +14,7 @@
 #include <linux/buffer_head.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
+#include <linux/fs_struct.h>
 #include <linux/nilfs2_api.h>
 #include <linux/nilfs2_ondisk.h>
 #include "the_nilfs.h"
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index ddff94c091b8..8d09dfec970a 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -51,6 +51,7 @@
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include <linux/log2.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 62464d194da3..af1e2cedb217 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/fs_struct.h>
 
 #include <cluster/masklog.h>
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 135c49c5d848..89dc093f2752 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -14,6 +14,7 @@
 #include <linux/writeback.h>
 #include <linux/seq_file.h>
 #include <linux/crc-itu-t.h>
+#include <linux/fs_struct.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include "omfs.h"
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 474dadeb1593..9dc0a968ec89 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -9,6 +9,7 @@
  *
  */
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <linux/filelock.h>
 #include <linux/backing-dev.h>
 #include <linux/stat.h>
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index cac355364e43..28a73717851c 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -6,6 +6,7 @@
  *
  */
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index ca8f3dd7ff63..78650527d4bb 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -7,6 +7,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/vfs.h>
+#include <linux/fs_struct.h>
 #include <uapi/linux/magic.h>
 #include "cifsglob.h"
 #include "cifsproto.h"
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 64af28318fbf..c0c0095b2b60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2336,8 +2336,6 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
 	va_end(args);
 }
 
-extern int current_umask(void);
-
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 int inode_update_timestamps(struct inode *inode, int flags);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index baf200ab5c77..0070764b790a 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_FS_STRUCT_H
 #define _LINUX_FS_STRUCT_H
 
+#include <linux/sched.h>
 #include <linux/path.h>
 #include <linux/spinlock.h>
 #include <linux/seqlock.h>
@@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 
 extern bool current_chrooted(void);
 
+static inline int current_umask(void)
+{
+	return current->fs->umask;
+}
+
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index fed86221c69c..b0679c7420a8 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -7,6 +7,7 @@
 #include <linux/path.h>
 #include <linux/fcntl.h>
 #include <linux/errno.h>
+#include <linux/fs_struct.h>
 
 enum { MAX_NESTED_LINKS = 8 };
 
-- 
cgit v1.2.3


From 8e4d576ed3ff917eda65b989ba56b02d9a3894f9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 4 Nov 2025 13:12:30 +0100
Subject: fs: add super_write_guard

Link: https://patch.msgid.link/20251104-work-guards-v1-1-5108ac78a171@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs/super.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
index c0d22b12c1c9..b874105743b3 100644
--- a/include/linux/fs/super.h
+++ b/include/linux/fs/super.h
@@ -125,6 +125,11 @@ static inline void sb_start_write(struct super_block *sb)
 	__sb_start_write(sb, SB_FREEZE_WRITE);
 }
 
+DEFINE_GUARD(super_write,
+	     struct super_block *,
+	     sb_start_write(_T),
+	     sb_end_write(_T))
+
 static inline bool sb_start_write_trylock(struct super_block *sb)
 {
 	return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
-- 
cgit v1.2.3


From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Thu, 30 Oct 2025 09:57:43 +0800
Subject: crypto: hisilicon - qm updates BAR configuration

On new platforms greater than QM_HW_V3, the configuration region for the
live migration function of the accelerator device is no longer
placed in the VF, but is instead placed in the PF.

Therefore, the configuration region of the live migration function
needs to be opened when the QM driver is loaded. When the QM driver
is uninstalled, the driver needs to clear this configuration.

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Reviewed-by: Shameer Kolothum <shameerkolothum@gmail.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/crypto/hisilicon/qm.c | 27 +++++++++++++++++++++++++++
 include/linux/hisi_acc_qm.h   |  3 +++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index a5b96adf2d1e..e88085c2cbb3 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3019,11 +3019,36 @@ static void qm_put_pci_res(struct hisi_qm *qm)
 	pci_release_mem_regions(pdev);
 }
 
+static void hisi_mig_region_clear(struct hisi_qm *qm)
+{
+	u32 val;
+
+	/* Clear migration region set of PF */
+	if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+		val = readl(qm->io_base + QM_MIG_REGION_SEL);
+		val &= ~QM_MIG_REGION_EN;
+		writel(val, qm->io_base + QM_MIG_REGION_SEL);
+	}
+}
+
+static void hisi_mig_region_enable(struct hisi_qm *qm)
+{
+	u32 val;
+
+	/* Select migration region of PF */
+	if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+		val = readl(qm->io_base + QM_MIG_REGION_SEL);
+		val |= QM_MIG_REGION_EN;
+		writel(val, qm->io_base + QM_MIG_REGION_SEL);
+	}
+}
+
 static void hisi_qm_pci_uninit(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
 
 	pci_free_irq_vectors(pdev);
+	hisi_mig_region_clear(qm);
 	qm_put_pci_res(qm);
 	pci_disable_device(pdev);
 }
@@ -5725,6 +5750,7 @@ int hisi_qm_init(struct hisi_qm *qm)
 		goto err_free_qm_memory;
 
 	qm_cmd_init(qm);
+	hisi_mig_region_enable(qm);
 
 	return 0;
 
@@ -5863,6 +5889,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm)
 	}
 
 	qm_cmd_init(qm);
+	hisi_mig_region_enable(qm);
 	hisi_qm_dev_err_init(qm);
 	/* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */
 	writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index c4690e365ade..ca1ec437a3ca 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -99,6 +99,9 @@
 
 #define QM_DEV_ALG_MAX_LEN		256
 
+#define QM_MIG_REGION_SEL		0x100198
+#define QM_MIG_REGION_EN		BIT(0)
+
 /* uacce mode of the driver */
 #define UACCE_MODE_NOUACCE		0 /* don't use uacce */
 #define UACCE_MODE_SVA			1 /* use uacce sva mode */
-- 
cgit v1.2.3


From 313a335057f0894e6e59290d4e7fb8b35ec250e6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 3 Nov 2025 15:57:33 +0100
Subject: coredump: mark struct mm_struct as const

We don't actually modify it.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-7-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c                  | 2 +-
 include/linux/sched/coredump.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/coredump.c b/fs/coredump.c
index 590360ba0a28..8253b28bc728 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1092,7 +1092,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo)
 	size_t *argv __free(kfree) = NULL;
 	struct core_state core_state;
 	struct core_name cn;
-	struct mm_struct *mm = current->mm;
+	const struct mm_struct *mm = current->mm;
 	const struct linux_binfmt *binfmt = mm->binfmt;
 	const struct cred *old_cred;
 	int argc = 0;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index b7fafe999073..624fda17a785 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,7 +8,7 @@
 #define SUID_DUMP_USER		1	/* Dump as user of process */
 #define SUID_DUMP_ROOT		2	/* Dump as root */
 
-static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
 {
 	/*
 	 * By convention, dumpable bits are contained in first 32 bits of the
-- 
cgit v1.2.3


From 34dc27f02cb3799d56a99002261e4d091da0cea4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:02 -0800
Subject: srcu: Create an srcu_expedite_current() function

This commit creates an srcu_expedite_current() function that expedites
the current (and possibly the next) SRCU grace period for the specified
srcu_struct structure.  This functionality will be inherited by RCU
Tasks Trace courtesy of its mapping to SRCU fast.

If the current SRCU grace period is already waiting, that wait will
complete before the expediting takes effect.  If there is no SRCU grace
period in flight, this function might well create one.

[ paulmck: Apply Zqiang feedback for PREEMPT_RT use. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcutiny.h |  1 +
 include/linux/srcutree.h |  8 +++++++
 kernel/rcu/srcutree.c    | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 51ce25f07930..3bfbd44cb1b3 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -103,6 +103,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp)
 	synchronize_srcu(ssp);
 }
 
+static inline void srcu_expedite_current(struct srcu_struct *ssp) { }
 #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
 #define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0)
 
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 42098e0fa0b7..93ad18acd6d0 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -42,6 +42,8 @@ struct srcu_data {
 	struct timer_list delay_work;		/* Delay for CB invoking */
 	struct work_struct work;		/* Context for CB invoking. */
 	struct rcu_head srcu_barrier_head;	/* For srcu_barrier() use. */
+	struct rcu_head srcu_ec_head;		/* For srcu_expedite_current() use. */
+	int srcu_ec_state;			/*  State for srcu_expedite_current(). */
 	struct srcu_node *mynode;		/* Leaf srcu_node. */
 	unsigned long grpmask;			/* Mask for leaf srcu_node */
 						/*  ->srcu_data_have_cbs[]. */
@@ -135,6 +137,11 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
+/* Values for srcu_expedite_current() state (->srcu_ec_state). */
+#define SRCU_EC_IDLE		0
+#define SRCU_EC_PENDING		1
+#define SRCU_EC_REPOST		2
+
 /*
  * Values for initializing gp sequence fields. Higher values allow wrap arounds to
  * occur earlier.
@@ -210,6 +217,7 @@ struct srcu_struct {
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
 void synchronize_srcu_expedited(struct srcu_struct *ssp);
 void srcu_barrier(struct srcu_struct *ssp);
+void srcu_expedite_current(struct srcu_struct *ssp);
 void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);
 
 // Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 1ff94b76d91f..38b440b0b0c8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1688,6 +1688,64 @@ void srcu_barrier(struct srcu_struct *ssp)
 }
 EXPORT_SYMBOL_GPL(srcu_barrier);
 
+/* Callback for srcu_expedite_current() usage. */
+static void srcu_expedite_current_cb(struct rcu_head *rhp)
+{
+	unsigned long flags;
+	bool needcb = false;
+	struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head);
+
+	spin_lock_irqsave_sdp_contention(sdp, &flags);
+	if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+		WARN_ON_ONCE(1);
+	} else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+		sdp->srcu_ec_state = SRCU_EC_IDLE;
+	} else {
+		WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+		sdp->srcu_ec_state = SRCU_EC_PENDING;
+		needcb = true;
+	}
+	spin_unlock_irqrestore_rcu_node(sdp, flags);
+	// If needed, requeue ourselves as an expedited SRCU callback.
+	if (needcb)
+		__call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+}
+
+/**
+ * srcu_expedite_current - Expedite the current SRCU grace period
+ * @ssp: srcu_struct to expedite.
+ *
+ * Cause the current SRCU grace period to become expedited.  The grace
+ * period following the current one might also be expedited.  If there is
+ * no current grace period, one might be created.  If the current grace
+ * period is currently sleeping, that sleep will complete before expediting
+ * will take effect.
+ */
+void srcu_expedite_current(struct srcu_struct *ssp)
+{
+	unsigned long flags;
+	bool needcb = false;
+	struct srcu_data *sdp;
+
+	migrate_disable();
+	sdp = this_cpu_ptr(ssp->sda);
+	spin_lock_irqsave_sdp_contention(sdp, &flags);
+	if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+		sdp->srcu_ec_state = SRCU_EC_PENDING;
+		needcb = true;
+	} else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+		sdp->srcu_ec_state = SRCU_EC_REPOST;
+	} else {
+		WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+	}
+	spin_unlock_irqrestore_rcu_node(sdp, flags);
+	// If needed, queue an expedited SRCU callback.
+	if (needcb)
+		__call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+	migrate_enable();
+}
+EXPORT_SYMBOL_GPL(srcu_expedite_current);
+
 /**
  * srcu_batches_completed - return batches completed.
  * @ssp: srcu_struct on which to report batch completion.
-- 
cgit v1.2.3


From ee90848499b169070dbf85a4276a45ccbb7ff7d3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:04 -0800
Subject: srcu: Create a DEFINE_SRCU_FAST()

This commit creates DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST()
macros that are similar to DEFINE_SRCU() and DEFINE_STATIC_SRCU(),
but which create srcu_struct structures that are usable only by readers
initiated by srcu_read_lock_fast() and friends.

This commit does make DEFINE_SRCU_FAST() available to modules, in which
case the per-CPU srcu_data structures are not created at compile time, but
rather at module-load time.  This means that the ->srcu_reader_flavor field
of the srcu_data structure is not available.  Therefore,
this commit instead creates an ->srcu_reader_flavor field in the
srcu_struct structure, adds arguments to the DEFINE_SRCU()-related
macros to initialize this new field, and extends the checks in the
__srcu_check_read_flavor() function to include this new field.

This commit also allows dynamically allocated srcu_struct structure
to be marked for SRCU-fast readers.  It does so by defining a new
init_srcu_struct_fast() function that marks the specified srcu_struct
structure for use by srcu_read_lock_fast() and friends.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/notifier.h |  2 +-
 include/linux/srcu.h     | 16 ++++++++++++++--
 include/linux/srcutiny.h | 13 ++++++++++---
 include/linux/srcutree.h | 30 +++++++++++++++++++-----------
 kernel/rcu/srcutree.c    | 36 ++++++++++++++++++++++++++++++++++--
 5 files changed, 78 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b42e64734968..01b6c9d9956f 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -109,7 +109,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 		.mutex = __MUTEX_INITIALIZER(name.mutex),	\
 		.head = NULL,					\
 		.srcuu = __SRCU_USAGE_INIT(name.srcuu),		\
-		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
+		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu, 0), \
 	}
 
 #define ATOMIC_NOTIFIER_HEAD(name)				\
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index ada65b58bc4c..26de47820c58 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -25,8 +25,10 @@ struct srcu_struct;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
-		       struct lock_class_key *key);
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key);
+#ifndef CONFIG_TINY_SRCU
+int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key);
+#endif // #ifndef CONFIG_TINY_SRCU
 
 #define init_srcu_struct(ssp) \
 ({ \
@@ -35,10 +37,20 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
 	__init_srcu_struct((ssp), #ssp, &__srcu_key); \
 })
 
+#define init_srcu_struct_fast(ssp) \
+({ \
+	static struct lock_class_key __srcu_key; \
+	\
+	__init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \
+})
+
 #define __SRCU_DEP_MAP_INIT(srcu_name)	.dep_map = { .name = #srcu_name },
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 int init_srcu_struct(struct srcu_struct *ssp);
+#ifndef CONFIG_TINY_SRCU
+int init_srcu_struct_fast(struct srcu_struct *ssp);
+#endif // #ifndef CONFIG_TINY_SRCU
 
 #define __SRCU_DEP_MAP_INIT(srcu_name)
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 3bfbd44cb1b3..92e6ab53398f 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -31,7 +31,7 @@ struct srcu_struct {
 
 void srcu_drive_gp(struct work_struct *wp);
 
-#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored)			\
+#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored)	\
 {									\
 	.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq),	\
 	.srcu_cb_tail = &name.srcu_cb_head,				\
@@ -44,13 +44,20 @@ void srcu_drive_gp(struct work_struct *wp);
  * Tree SRCU, which needs some per-CPU data.
  */
 #define DEFINE_SRCU(name) \
-	struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name)
+	struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name)
 #define DEFINE_STATIC_SRCU(name) \
-	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name)
+	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name)
+#define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name)
+#define DEFINE_STATIC_SRCU_FAST(name) \
+	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name)
 
 // Dummy structure for srcu_notifier_head.
 struct srcu_usage { };
 #define __SRCU_USAGE_INIT(name) { }
+#define __init_srcu_struct_fast __init_srcu_struct
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define init_srcu_struct_fast init_srcu_struct
+#endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC
 
 void synchronize_srcu(struct srcu_struct *ssp);
 
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 93ad18acd6d0..7ff4a11bc5a3 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -104,6 +104,7 @@ struct srcu_usage {
 struct srcu_struct {
 	struct srcu_ctr __percpu *srcu_ctrp;
 	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
+	u8 srcu_reader_flavor;
 	struct lockdep_map dep_map;
 	struct srcu_usage *srcu_sup;		/* Update-side data. */
 };
@@ -162,20 +163,21 @@ struct srcu_struct {
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 }
 
-#define __SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
+#define __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast)					\
 	.srcu_sup = &usage_name,								\
+	.srcu_reader_flavor = fast,								\
 	__SRCU_DEP_MAP_INIT(name)
 
-#define __SRCU_STRUCT_INIT_MODULE(name, usage_name)						\
+#define __SRCU_STRUCT_INIT_MODULE(name, usage_name, fast)					\
 {												\
-	__SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
+	__SRCU_STRUCT_INIT_COMMON(name, usage_name, fast)					\
 }
 
-#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name)						\
+#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name, fast)					\
 {												\
 	.sda = &pcpu_name,									\
 	.srcu_ctrp = &pcpu_name.srcu_ctrs[0],							\
-	__SRCU_STRUCT_INIT_COMMON(name, usage_name)						\
+	__SRCU_STRUCT_INIT_COMMON(name, usage_name, fast)						\
 }
 
 /*
@@ -196,23 +198,29 @@ struct srcu_struct {
  *	init_srcu_struct(&my_srcu);
  *
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
+ *
+ * DEFINE_SRCU_FAST() creates an srcu_struct and associated structures
+ * whose readers must be of the SRCU-fast variety.
  */
 #ifdef MODULE
-# define __DEFINE_SRCU(name, is_static)								\
+# define __DEFINE_SRCU(name, fast, is_static)							\
 	static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);	\
-	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage);	\
+	is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage,	\
+								      fast);			\
 	extern struct srcu_struct * const __srcu_struct_##name;					\
 	struct srcu_struct * const __srcu_struct_##name						\
 		__section("___srcu_struct_ptrs") = &name
 #else
-# define __DEFINE_SRCU(name, is_static)								\
+# define __DEFINE_SRCU(name, fast, is_static)							\
 	static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);				\
 	static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);	\
 	is_static struct srcu_struct name =							\
-		__SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data)
+		__SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data, fast)
 #endif
-#define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
-#define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
+#define DEFINE_SRCU(name)		__DEFINE_SRCU(name, 0, /* not static */)
+#define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, 0, static)
+#define DEFINE_SRCU_FAST(name)		__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */)
+#define DEFINE_STATIC_SRCU_FAST(name)	__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static)
 
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
 void synchronize_srcu_expedited(struct srcu_struct *ssp);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 38b440b0b0c8..9869a13b8763 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -286,16 +286,29 @@ err_free_sup:
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
-		       struct lock_class_key *key)
+static int
+__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
 {
 	/* Don't re-initialize a lock while it is held. */
 	debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
 	lockdep_init_map(&ssp->dep_map, name, key, 0);
 	return init_srcu_struct_fields(ssp, false);
 }
+
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+	ssp->srcu_reader_flavor = 0;
+	return __init_srcu_struct_common(ssp, name, key);
+}
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
 
+int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+	ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+	return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast);
+
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /**
@@ -308,10 +321,26 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
  */
 int init_srcu_struct(struct srcu_struct *ssp)
 {
+	ssp->srcu_reader_flavor = 0;
 	return init_srcu_struct_fields(ssp, false);
 }
 EXPORT_SYMBOL_GPL(init_srcu_struct);
 
+/**
+ * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function.  Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct_fast(struct srcu_struct *ssp)
+{
+	ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+	return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast);
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
@@ -734,6 +763,9 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 
 	sdp = raw_cpu_ptr(ssp->sda);
 	old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+	WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
+	WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
+		     old_read_flavor != ssp->srcu_reader_flavor);
 	if (!old_read_flavor) {
 		old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
 		if (!old_read_flavor)
-- 
cgit v1.2.3


From 8235bcfd39e865763e764b4c968012bdfb808af1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:07 -0800
Subject: srcu: Require special srcu_struct define/init for SRCU-fast readers

This commit adds CONFIG_PROVE_RCU=y checking to enforce the new rule that
srcu_struct structures passed to srcu_read_lock_fast() and other SRCU-fast
read-side markers be either initialized with init_srcu_struct_fast()
on the one hand or defined using either DEFINE_SRCU_FAST() or
DEFINE_STATIC_SRCU_FAST() on the other.  This will enable removal of
the non-debug read-side checks from srcu_read_lock_fast() and friends,
which on my laptop provides a 25% speedup (which admittedly amounts to
about half a nanosecond, but when tracing fastpaths...)

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h  | 34 ++++++++++++++++++++++------------
 kernel/rcu/srcutree.c |  1 +
 2 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 26de47820c58..2982b5a6930f 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -271,17 +271,26 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
  * @ssp: srcu_struct in which to register the new reader.
  *
  * Enter an SRCU read-side critical section, but for a light-weight
- * smp_mb()-free reader.  See srcu_read_lock() for more information.
- *
- * If srcu_read_lock_fast() is ever used on an srcu_struct structure,
- * then none of the other flavors may be used, whether before, during,
- * or after.  Note that grace-period auto-expediting is disabled for _fast
- * srcu_struct structures because auto-expedited grace periods invoke
- * synchronize_rcu_expedited(), IPIs and all.
- *
- * Note that srcu_read_lock_fast() can be invoked only from those contexts
- * where RCU is watching, that is, from contexts where it would be legal
- * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
+ * smp_mb()-free reader.  See srcu_read_lock() for more information.  This
+ * function is NMI-safe, in a manner similar to srcu_read_lock_nmisafe().
+ *
+ * For srcu_read_lock_fast() to be used on an srcu_struct structure,
+ * that structure must have been defined using either DEFINE_SRCU_FAST()
+ * or DEFINE_STATIC_SRCU_FAST() on the one hand or initialized with
+ * init_srcu_struct_fast() on the other.  Such an srcu_struct structure
+ * cannot be passed to any non-fast variant of srcu_read_{,un}lock() or
+ * srcu_{down,up}_read().  In kernels built with CONFIG_PROVE_RCU=y,
+ * __srcu_check_read_flavor() will complain bitterly if you ignore this
+ * restriction.
+ *
+ * Grace-period auto-expediting is disabled for SRCU-fast srcu_struct
+ * structures because SRCU-fast expedited grace periods invoke
+ * synchronize_rcu_expedited(), IPIs and all.  If you need expedited
+ * SRCU-fast grace periods, use synchronize_srcu_expedited().
+ *
+ * The srcu_read_lock_fast() function can be invoked only from those
+ * contexts where RCU is watching, that is, from contexts where it would
+ * be legal to invoke rcu_read_lock().  Otherwise, lockdep will complain.
  */
 static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp)
 {
@@ -317,7 +326,8 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_
  * srcu_down_read() for more information.
  *
  * The same srcu_struct may be used concurrently by srcu_down_read_fast()
- * and srcu_read_lock_fast().
+ * and srcu_read_lock_fast().  However, the same definition/initialization
+ * requirements called out for srcu_read_lock_fast() apply.
  */
 static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp)
 {
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c29203b23d1a..2f8aa280911e 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -766,6 +766,7 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 	WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
 	WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
 		     old_read_flavor != ssp->srcu_reader_flavor);
+	WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor);
 	if (!old_read_flavor) {
 		old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
 		if (!old_read_flavor)
-- 
cgit v1.2.3


From ac51c40c2c148a75f3191ff401c9889a7fc12cb1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:08 -0800
Subject: srcu: Make SRCU-fast readers enforce use of SRCU-fast definition/init

This commit makes CONFIG_PROVE_RCU=y kernels enforce the new rule
that srcu_struct structures that are passed to srcu_read_lock_fast()
and other SRCU-fast read-side markers be either initialized with
init_srcu_struct_fast() on the one hand or defined with DEFINE_SRCU_FAST()
or DEFINE_STATIC_SRCU_FAST() on the other.

This eliminates the read-side test that was formerly included in
srcu_read_lock_fast() and friends, speeding these primitives up by
about 25% (admittedly only about half of a nanosecond, but when tracing
on fastpaths...)

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     |  6 +++---
 include/linux/srcutiny.h |  1 -
 include/linux/srcutree.h | 16 +---------------
 3 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 2982b5a6930f..41e27c1d917d 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -297,7 +297,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *
 	struct srcu_ctr __percpu *retval;
 
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast().");
-	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
 	retval = __srcu_read_lock_fast(ssp);
 	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
@@ -312,7 +312,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_
 {
 	struct srcu_ctr __percpu *retval;
 
-	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
 	retval = __srcu_read_lock_fast(ssp);
 	return retval;
 }
@@ -333,7 +333,7 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *
 {
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast().");
-	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
 	return __srcu_read_lock_fast(ssp);
 }
 
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 92e6ab53398f..1ecc3393fb26 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -112,7 +112,6 @@ static inline void srcu_barrier(struct srcu_struct *ssp)
 
 static inline void srcu_expedite_current(struct srcu_struct *ssp) { }
 #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
-#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0)
 
 /* Defined here to avoid size increase for non-torture kernels. */
 static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 7ff4a11bc5a3..6080a9094618 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -307,21 +307,7 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
 
 void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 
-// Record reader usage even for CONFIG_PROVE_RCU=n kernels.  This is
-// needed only for flavors that require grace-period smp_mb() calls to be
-// promoted to synchronize_rcu().
-static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor)
-{
-	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
-
-	if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor))
-		return;
-
-	// Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered.
-	__srcu_check_read_flavor(ssp, read_flavor);
-}
-
-// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels.
+// Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels.
 static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
 {
 	if (IS_ENABLED(CONFIG_PROVE_RCU))
-- 
cgit v1.2.3


From 88b6a93af4345e901206d0576bdb4e88ea3eaeb8 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Tue, 4 Nov 2025 00:49:24 +0100
Subject: dt-bindings: clock: rk3568: Add SCMI clock ids

The Trusted Firmware on RK3568 exposes 3 clocks via the SCMI clock
interface. Add descriptive IDs for them.

The clock ids are used in both the older vendor-binary TF-A, as well
as the recently merged upstream SCMI clock implementation.

Link: https://review.trustedfirmware.org/c/TF-A/trusted-firmware-a/+/31265
Reviewed-by: Diederik de Haas <diederik@cknow-tech.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251103234926.416137-2-heiko@sntech.de
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 include/dt-bindings/clock/rk3568-cru.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3568-cru.h b/include/dt-bindings/clock/rk3568-cru.h
index 5263085c5b23..18bb8d41d741 100644
--- a/include/dt-bindings/clock/rk3568-cru.h
+++ b/include/dt-bindings/clock/rk3568-cru.h
@@ -485,6 +485,12 @@
 
 #define CLK_NR_CLKS		(PCLK_CORE_PVTM + 1)
 
+/* scmi-clocks indices */
+
+#define SCMI_CLK_CPU		0
+#define SCMI_CLK_GPU		1
+#define SCMI_CLK_NPU		2
+
 /* pmu soft-reset indices */
 /* pmucru_softrst_con0 */
 #define SRST_P_PDPMU_NIU	0
-- 
cgit v1.2.3


From 34e82569d59391bf7d808a558ff631c4428b026d Mon Sep 17 00:00:00 2001
From: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Date: Wed, 5 Nov 2025 12:19:57 -0800
Subject: rcu: use WRITE_ONCE() for ->next and ->pprev of hlist_nulls

In rculist_nulls.h we can still see ordinary assignments to ->pprev and
->next of hlist_nulls.

As noted in the two patches below:
commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for
rculist_nulls")
commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for
hlist_nulls")

We should use WRITE_ONCE().

Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rculist_nulls.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 89186c499dd4..d5a656cc4c6a 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -138,7 +138,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
 
 	if (last) {
 		WRITE_ONCE(n->next, last->next);
-		n->pprev = &last->next;
+		WRITE_ONCE(n->pprev, &last->next);
 		rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
 	} else {
 		hlist_nulls_add_head_rcu(n, h);
@@ -148,8 +148,8 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
 /* after that hlist_nulls_del will work */
 static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
 {
-	n->pprev = &n->next;
-	n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
+	WRITE_ONCE(n->pprev, &n->next);
+	WRITE_ONCE(n->next, (struct hlist_nulls_node *)NULLS_MARKER(NULL));
 }
 
 /**
-- 
cgit v1.2.3


From ca38f0f65eefd79889b409c89c6932d7e2fe0993 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Tue, 4 Nov 2025 00:40:32 +0100
Subject: dt-bindings: clock: rk3568: Drop CLK_NR_CLKS define

CLK_NR_CLKS has only ever been used on the driver side to calculate array
sizes and should never have been part of the clock-binding.

Let's drop it, since the kernel code no longer uses it either and nothing
else has ever used it.

Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patch.msgid.link/20251103234032.413563-3-heiko@sntech.de
---
 include/dt-bindings/clock/rk3568-cru.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3568-cru.h b/include/dt-bindings/clock/rk3568-cru.h
index 18bb8d41d741..1e0aef8a645d 100644
--- a/include/dt-bindings/clock/rk3568-cru.h
+++ b/include/dt-bindings/clock/rk3568-cru.h
@@ -483,8 +483,6 @@
 
 #define PCLK_CORE_PVTM		450
 
-#define CLK_NR_CLKS		(PCLK_CORE_PVTM + 1)
-
 /* scmi-clocks indices */
 
 #define SCMI_CLK_CPU		0
-- 
cgit v1.2.3


From b4ce5923e780d6896d4aaf19de5a27652b8bf1ea Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Wed, 5 Nov 2025 09:03:59 +0000
Subject: bpf, x86: add new map type: instructions array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are
translated by the verifier into "xlated" BPF programs. During this
process the original instructions offsets might be adjusted and/or
individual instructions might be replaced by new sets of instructions,
or deleted.

Add a new BPF map type which is aimed to keep track of how, for a
given program, the original instructions were relocated during the
verification. Also, besides keeping track of the original -> xlated
mapping, make the x86 JIT build the xlated -> jitted mapping for every
instruction listed in an instruction array. This is required for every
future application of instruction arrays: static keys, indirect jumps
and indirect calls.

A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with u32
keys and values of size 8. The values have different semantics for
userspace and for BPF space. For userspace a value consists of two
u32 values – xlated and jitted offsets. For BPF side the value is
a real pointer to a jitted instruction.

On map creation/initialization, before loading the program, each
element of the map should be initialized to point to an instruction
offset within the program. Before the program load such maps should
be made frozen. After the program verification xlated and jitted
offsets can be read via the bpf(2) syscall.

If a tracked instruction is removed by the verifier, then the xlated
offset is set to (u32)-1 which is considered to be too big for a valid
BPF program offset.

One such map can, obviously, be used to track one and only one BPF
program.  If the verification process was unsuccessful, then the same
map can be re-used to verify the program with a different log level.
However, if the program was loaded fine, then such a map, being
frozen in any case, can't be reused by other programs even after the
program release.

Example. Consider the following original and xlated programs:

    Original prog:                      Xlated prog:

     0:  r1 = 0x0                        0: r1 = 0
     1:  *(u32 *)(r10 - 0x4) = r1        1: *(u32 *)(r10 -4) = r1
     2:  r2 = r10                        2: r2 = r10
     3:  r2 += -0x4                      3: r2 += -4
     4:  r1 = 0x0 ll                     4: r1 = map[id:88]
     6:  call 0x1                        6: r1 += 272
                                         7: r0 = *(u32 *)(r2 +0)
                                         8: if r0 >= 0x1 goto pc+3
                                         9: r0 <<= 3
                                        10: r0 += r1
                                        11: goto pc+1
                                        12: r0 = 0
     7:  r6 = r0                        13: r6 = r0
     8:  if r6 == 0x0 goto +0x2         14: if r6 == 0x0 goto pc+4
     9:  call 0x76                      15: r0 = 0xffffffff8d2079c0
                                        17: r0 = *(u64 *)(r0 +0)
    10:  *(u64 *)(r6 + 0x0) = r0        18: *(u64 *)(r6 +0) = r0
    11:  r0 = 0x0                       19: r0 = 0x0
    12:  exit                           20: exit

An instruction array map, containing, e.g., instructions [0,4,7,12]
will be translated by the verifier to [0,4,13,20]. A map containing
index 5 (the middle of a 16-byte instruction) or indexes greater than 12
(outside the program boundaries) would be rejected.

The functionality provided by this patch will be extended in subsequent
patches to implement BPF Static Keys, indirect jumps, and indirect calls.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c    |   9 ++
 include/linux/bpf.h            |  15 +++
 include/linux/bpf_types.h      |   1 +
 include/linux/bpf_verifier.h   |   2 +
 include/uapi/linux/bpf.h       |  21 +++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/bpf_insn_array.c    | 286 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  22 ++++
 kernel/bpf/verifier.c          |  45 +++++++
 tools/include/uapi/linux/bpf.h |  21 +++
 10 files changed, 423 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/bpf_insn_array.c

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index de5083cb1d37..91f92d65ae83 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3827,6 +3827,15 @@ out_image:
 			jit_data->header = header;
 			jit_data->rw_header = rw_header;
 		}
+
+		/*
+		 * The bpf_prog_update_insn_ptrs function expects addrs to
+		 * point to the first byte of the jitted instruction (unlike
+		 * the bpf_prog_fill_jited_linfo below, which, for historical
+		 * reasons, expects to point to the next instruction)
+		 */
+		bpf_prog_update_insn_ptrs(prog, addrs, image);
+
 		/*
 		 * ctx.prog_offset is used when CFI preambles put code *before*
 		 * the function. See emit_cfi(). For FineIBT specifically this code
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a47d67db3be5..9d41a6affcef 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3797,4 +3797,19 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
 			   const char **linep, int *nump);
 struct bpf_prog *bpf_prog_find_from_stack(void);
 
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog);
+int bpf_insn_array_ready(struct bpf_map *map);
+void bpf_insn_array_release(struct bpf_map *map);
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len);
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len);
+
+#ifdef CONFIG_BPF_SYSCALL
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image);
+#else
+static inline void
+bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+}
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa78f49d4a9a..b13de31e163f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c6eb68b6389c..6b820d8d77af 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -754,8 +754,10 @@ struct bpf_verifier_env {
 	struct list_head free_list;	/* list of struct bpf_verifier_state_list */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
+	struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
+	u32 insn_array_map_cnt;		/* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
 	int exception_callback_subprog;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d73f165394d..f5713f59ac10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1026,6 +1026,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
 	BPF_MAP_TYPE_ARENA,
+	BPF_MAP_TYPE_INSN_ARRAY,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags {
 	BPF_F_PAD_ZEROS = (1ULL << 0),
 };
 
+/*
+ * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
+ *
+ * Before the map is used the orig_off field should point to an
+ * instruction inside the program being loaded. The other fields
+ * must be set to 0.
+ *
+ * After the program is loaded, the xlated_off will be adjusted
+ * by the verifier to point to the index of the original instruction
+ * in the xlated program. If the instruction is deleted, it will
+ * be set to (u32)-1. The jitted_off will be set to the corresponding
+ * offset in the jitted image of the program.
+ */
+struct bpf_insn_array_value {
+	__u32 orig_off;
+	__u32 xlated_off;
+	__u32 jitted_off;
+	__u32 :32;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7fd0badfacb1..232cbc97434d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
new file mode 100644
index 000000000000..2053fda377bb
--- /dev/null
+++ b/kernel/bpf/bpf_insn_array.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+
+struct bpf_insn_array {
+	struct bpf_map map;
+	atomic_t used;
+	long *ips;
+	DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
+};
+
+#define cast_insn_array(MAP_PTR) \
+	container_of((MAP_PTR), struct bpf_insn_array, map)
+
+#define INSN_DELETED ((u32)-1)
+
+static inline u64 insn_array_alloc_size(u32 max_entries)
+{
+	const u64 base_size = sizeof(struct bpf_insn_array);
+	const u64 entry_size = sizeof(struct bpf_insn_array_value);
+
+	return base_size + max_entries * (entry_size + sizeof(long));
+}
+
+static int insn_array_alloc_check(union bpf_attr *attr)
+{
+	u32 value_size = sizeof(struct bpf_insn_array_value);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != value_size || attr->map_flags != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void insn_array_free(struct bpf_map *map)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+	bpf_map_area_free(insn_array);
+}
+
+static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
+{
+	u64 size = insn_array_alloc_size(attr->max_entries);
+	struct bpf_insn_array *insn_array;
+
+	insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
+	if (!insn_array)
+		return ERR_PTR(-ENOMEM);
+
+	/* ips are allocated right after the insn_array->values[] array */
+	insn_array->ips = (void *)&insn_array->values[attr->max_entries];
+
+	bpf_map_init_from_attr(&insn_array->map, attr);
+
+	return &insn_array->map;
+}
+
+static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= insn_array->map.max_entries))
+		return NULL;
+
+	return &insn_array->values[index];
+}
+
+static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	u32 index = *(u32 *)key;
+	struct bpf_insn_array_value val = {};
+
+	if (unlikely(index >= insn_array->map.max_entries))
+		return -E2BIG;
+
+	if (unlikely(map_flags & BPF_NOEXIST))
+		return -EEXIST;
+
+	copy_map_value(map, &val, value);
+	if (val.jitted_off || val.xlated_off)
+		return -EINVAL;
+
+	insn_array->values[index].orig_off = val.orig_off;
+
+	return 0;
+}
+
+static long insn_array_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+static int insn_array_check_btf(const struct bpf_map *map,
+			      const struct btf *btf,
+			      const struct btf_type *key_type,
+			      const struct btf_type *value_type)
+{
+	if (!btf_type_is_i32(key_type))
+		return -EINVAL;
+
+	if (!btf_type_is_i64(value_type))
+		return -EINVAL;
+
+	return 0;
+}
+
+static u64 insn_array_mem_usage(const struct bpf_map *map)
+{
+	return insn_array_alloc_size(map->max_entries);
+}
+
+BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
+
+const struct bpf_map_ops insn_array_map_ops = {
+	.map_alloc_check = insn_array_alloc_check,
+	.map_alloc = insn_array_alloc,
+	.map_free = insn_array_free,
+	.map_get_next_key = bpf_array_get_next_key,
+	.map_lookup_elem = insn_array_lookup_elem,
+	.map_update_elem = insn_array_update_elem,
+	.map_delete_elem = insn_array_delete_elem,
+	.map_check_btf = insn_array_check_btf,
+	.map_mem_usage = insn_array_mem_usage,
+	.map_btf_id = &insn_array_btf_ids[0],
+};
+
+static inline bool is_frozen(struct bpf_map *map)
+{
+	guard(mutex)(&map->freeze_mutex);
+
+	return map->frozen;
+}
+
+static bool is_insn_array(const struct bpf_map *map)
+{
+	return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
+}
+
+static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
+				 const struct bpf_prog *prog)
+{
+	u32 off;
+	int i;
+
+	for (i = 0; i < insn_array->map.max_entries; i++) {
+		off = insn_array->values[i].orig_off;
+
+		if (off >= prog->len)
+			return false;
+
+		if (off > 0) {
+			if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	struct bpf_insn_array_value *values = insn_array->values;
+	int i;
+
+	if (!is_frozen(map))
+		return -EINVAL;
+
+	if (!valid_offsets(insn_array, prog))
+		return -EINVAL;
+
+	/*
+	 * There can be only one program using the map
+	 */
+	if (atomic_xchg(&insn_array->used, 1))
+		return -EBUSY;
+
+	/*
+	 * Reset all the map indexes to the original values.  This is needed,
+	 * e.g., when a replay of verification with different log level should
+	 * be performed.
+	 */
+	for (i = 0; i < map->max_entries; i++)
+		values[i].xlated_off = values[i].orig_off;
+
+	return 0;
+}
+
+int bpf_insn_array_ready(struct bpf_map *map)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	int i;
+
+	for (i = 0; i < map->max_entries; i++) {
+		if (insn_array->values[i].xlated_off == INSN_DELETED)
+			continue;
+		if (!insn_array->ips[i])
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+void bpf_insn_array_release(struct bpf_map *map)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+	atomic_set(&insn_array->used, 0);
+}
+
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	int i;
+
+	if (len <= 1)
+		return;
+
+	for (i = 0; i < map->max_entries; i++) {
+		if (insn_array->values[i].xlated_off <= off)
+			continue;
+		if (insn_array->values[i].xlated_off == INSN_DELETED)
+			continue;
+		insn_array->values[i].xlated_off += len - 1;
+	}
+}
+
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+	int i;
+
+	for (i = 0; i < map->max_entries; i++) {
+		if (insn_array->values[i].xlated_off < off)
+			continue;
+		if (insn_array->values[i].xlated_off == INSN_DELETED)
+			continue;
+		if (insn_array->values[i].xlated_off < off + len)
+			insn_array->values[i].xlated_off = INSN_DELETED;
+		else
+			insn_array->values[i].xlated_off -= len;
+	}
+}
+
+/*
+ * This function is called by JITs. The image is the real program
+ * image, the offsets array set up the xlated -> jitted mapping.
+ * The offsets[xlated] offset should point to the beginning of
+ * the jitted instruction.
+ */
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+	struct bpf_insn_array *insn_array;
+	struct bpf_map *map;
+	u32 xlated_off;
+	int i, j;
+
+	if (!offsets || !image)
+		return;
+
+	for (i = 0; i < prog->aux->used_map_cnt; i++) {
+		map = prog->aux->used_maps[i];
+		if (!is_insn_array(map))
+			continue;
+
+		insn_array = cast_insn_array(map);
+		for (j = 0; j < map->max_entries; j++) {
+			xlated_off = insn_array->values[j].xlated_off;
+			if (xlated_off == INSN_DELETED)
+				continue;
+			if (xlated_off < prog->aux->subprog_start)
+				continue;
+			xlated_off -= prog->aux->subprog_start;
+			if (xlated_off >= prog->len)
+				continue;
+
+			insn_array->values[j].jitted_off = offsets[xlated_off];
+			insn_array->ips[j] = (long)(image + offsets[xlated_off]);
+		}
+	}
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a129746bd6c..f62d61b6730a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1493,6 +1493,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
 	case BPF_MAP_TYPE_ARENA:
+	case BPF_MAP_TYPE_INSN_ARRAY:
 		if (!bpf_token_capable(token, CAP_BPF))
 			goto put_token;
 		break;
@@ -2853,6 +2854,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
 	return err;
 }
 
+static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < prog->aux->used_map_cnt; i++) {
+		if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
+			continue;
+
+		err = bpf_insn_array_ready(prog->aux->used_maps[i]);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_PROG_LOAD_LAST_FIELD keyring_id
 
@@ -3082,6 +3100,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (err < 0)
 		goto free_used_maps;
 
+	err = bpf_prog_mark_insn_arrays_ready(prog);
+	if (err < 0)
+		goto free_used_maps;
+
 	err = bpf_prog_alloc_id(prog);
 	if (err)
 		goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e4928846e763..dfe5741812b9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10086,6 +10086,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_map_push_elem)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_INSN_ARRAY:
+		goto error;
 	default:
 		break;
 	}
@@ -20582,6 +20584,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
 
 	env->used_maps[env->used_map_cnt++] = map;
 
+	if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+		err = bpf_insn_array_init(map, env->prog);
+		if (err) {
+			verbose(env, "Failed to properly initialize insn array\n");
+			return err;
+		}
+		env->insn_array_maps[env->insn_array_map_cnt++] = map;
+	}
+
 	return env->used_map_cnt - 1;
 }
 
@@ -20828,6 +20839,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 	}
 }
 
+static void release_insn_arrays(struct bpf_verifier_env *env)
+{
+	int i;
+
+	for (i = 0; i < env->insn_array_map_cnt; i++)
+		bpf_insn_array_release(env->insn_array_maps[i]);
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+	int i;
+
+	if (len == 1)
+		return;
+
+	for (i = 0; i < env->insn_array_map_cnt; i++)
+		bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+	int i;
+
+	for (i = 0; i < env->insn_array_map_cnt; i++)
+		bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
 static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
 {
 	struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
@@ -20869,6 +20907,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	}
 	adjust_insn_aux_data(env, new_prog, off, len);
 	adjust_subprog_starts(env, off, len);
+	adjust_insn_arrays(env, off, len);
 	adjust_poke_descs(new_prog, off, len);
 	return new_prog;
 }
@@ -21052,6 +21091,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
 	if (err)
 		return err;
 
+	adjust_insn_arrays_after_remove(env, off, cnt);
+
 	memmove(aux_data + off,	aux_data + off + cnt,
 		sizeof(*aux_data) * (orig_prog_len - off - cnt));
 
@@ -21695,6 +21736,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
 		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
 		func[i]->aux->arena = prog->aux->arena;
+		func[i]->aux->used_maps = env->used_maps;
+		func[i]->aux->used_map_cnt = env->used_map_cnt;
 		num_exentries = 0;
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
@@ -24871,6 +24914,8 @@ skip_full_check:
 	adjust_btf_func(env);
 
 err_release_maps:
+	if (ret)
+		release_insn_arrays(env);
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_used_maps() will release them.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1d73f165394d..f5713f59ac10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1026,6 +1026,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
 	BPF_MAP_TYPE_ARENA,
+	BPF_MAP_TYPE_INSN_ARRAY,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags {
 	BPF_F_PAD_ZEROS = (1ULL << 0),
 };
 
+/*
+ * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
+ *
+ * Before the map is used, the orig_off field should point to an
+ * instruction inside the program being loaded. The other fields
+ * must be set to 0.
+ *
+ * After the program is loaded, the xlated_off will be adjusted
+ * by the verifier to point to the index of the original instruction
+ * in the xlated program. If the instruction is deleted, it will
+ * be set to (u32)-1. The jitted_off will be set to the corresponding
+ * offset in the jitted image of the program.
+ */
+struct bpf_insn_array_value {
+	__u32 orig_off;		/* in: instruction inside the program being loaded */
+	__u32 xlated_off;	/* out: index in the xlated program, (u32)-1 if deleted */
+	__u32 jitted_off;	/* out: offset in the jitted image */
+	__u32 :32;		/* unnamed 32-bit padding */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 493d9e0d608339a32f568504d5fd411a261bb0af Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Wed, 5 Nov 2025 09:04:06 +0000
Subject: bpf, x86: add support for indirect jumps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for a new instruction

    BPF_JMP|BPF_X|BPF_JA, SRC=0, DST=Rx, off=0, imm=0

which does an indirect jump to a location stored in Rx.  The register
Rx should have type PTR_TO_INSN. This new type assures that the Rx
register contains a value (or a range of values) loaded from a
correct jump table – map of type instruction array.

For example, for a C switch LLVM will generate the following code:

    0:   r3 = r1                    # "switch (r3)"
    1:   if r3 > 0x13 goto +0x666   # check r3 boundaries
    2:   r3 <<= 0x3                 # adjust to an index in array of addresses
    3:   r1 = 0xbeef ll             # r1 is PTR_TO_MAP_VALUE, r1->map_ptr=M
    5:   r1 += r3                   # r1 inherits boundaries from r3
    6:   r1 = *(u64 *)(r1 + 0x0)    # r1 now has type PTR_TO_INSN
    7:   gotox r1                   # jit will generate proper code

Here the gotox instruction corresponds to one particular map. It is,
however, possible to have a gotox instruction whose target can be loaded
from different maps, e.g.

    0:   r1 &= 0x1
    1:   r2 <<= 0x3
    2:   r3 = 0x0 ll                # load from map M_1
    4:   r3 += r2
    5:   if r1 == 0x0 goto +0x4
    6:   r1 <<= 0x3
    7:   r3 = 0x0 ll                # load from map M_2
    9:   r3 += r1
    A:   r1 = *(u64 *)(r3 + 0x0)
    B:   gotox r1                   # jump to target loaded from M_1 or M_2

During the check_cfg stage the verifier will collect all the maps which
point inside the subprog being verified. When building the CFG,
the high 16 bits of the insn_state are used, so this patch
(theoretically) supports jump tables of up to 2^16 slots.

During the later stage, in check_indirect_jump, it is checked that
the register Rx was loaded from a particular instruction array.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-9-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c  |   3 +
 include/linux/bpf.h          |   1 +
 include/linux/bpf_verifier.h |   9 ++
 kernel/bpf/bpf_insn_array.c  |  15 ++
 kernel/bpf/core.c            |   1 +
 kernel/bpf/liveness.c        |   3 +
 kernel/bpf/log.c             |   1 +
 kernel/bpf/verifier.c        | 373 ++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 400 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bbd2b03d2b74..36a0d4db9f68 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2628,6 +2628,9 @@ emit_cond_jmp:		/* Convert BPF opcode to x86 */
 
 			break;
 
+		case BPF_JMP | BPF_JA | BPF_X:
+			emit_indirect_jump(&prog, insn->dst_reg, image + addrs[i - 1]);
+			break;
 		case BPF_JMP | BPF_JA:
 		case BPF_JMP32 | BPF_JA:
 			if (BPF_CLASS(insn->code) == BPF_JMP) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9d41a6affcef..09d5dc541d1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1001,6 +1001,7 @@ enum bpf_reg_type {
 	PTR_TO_ARENA,
 	PTR_TO_BUF,		 /* reg points to a read/write buffer */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
+	PTR_TO_INSN,		 /* reg points to a bpf program instruction */
 	CONST_PTR_TO_DYNPTR,	 /* reg points to a const struct bpf_dynptr */
 	__BPF_REG_TYPE_MAX,
 
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6b820d8d77af..5441341f1ab9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -527,6 +527,7 @@ struct bpf_insn_aux_data {
 		struct {
 			u32 map_index;		/* index into used_maps[] */
 			u32 map_off;		/* offset from value base address */
+			struct bpf_iarray *jt;	/* jump table for gotox instruction */
 		};
 		struct {
 			enum bpf_reg_type reg_type;	/* type of pseudo_btf_id */
@@ -840,6 +841,7 @@ struct bpf_verifier_env {
 	struct bpf_scc_info **scc_info;
 	u32 scc_cnt;
 	struct bpf_iarray *succ;
+	struct bpf_iarray *gotox_tmp_buf;
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
@@ -1050,6 +1052,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_
 	return !(off % BPF_REG_SIZE);
 }
 
+static inline bool insn_is_gotox(struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_JMP &&
+	       BPF_OP(insn->code) == BPF_JA &&
+	       BPF_SRC(insn->code) == BPF_X;	/* BPF_JMP|BPF_JA|BPF_X is the indirect jump */
+}
+
 const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
 const char *dynptr_type_str(enum bpf_dynptr_type type);
 const char *iter_type_str(const struct btf *btf, u32 btf_id);
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index 2053fda377bb..61ce52882632 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -114,6 +114,20 @@ static u64 insn_array_mem_usage(const struct bpf_map *map)
 	return insn_array_alloc_size(map->max_entries);
 }
 
+static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+	struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+	if ((off % sizeof(long)) != 0 ||	/* off must be a multiple of the slot size */
+	    (off / sizeof(long)) >= map->max_entries)	/* and address an existing slot */
+		return -EINVAL;
+
+	/* from BPF's point of view, this map is a jump table: expose the address within ips */
+	*imm = (unsigned long)insn_array->ips + off;
+
+	return 0;
+}
+
 BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
 
 const struct bpf_map_ops insn_array_map_ops = {
@@ -126,6 +140,7 @@ const struct bpf_map_ops insn_array_map_ops = {
 	.map_delete_elem = insn_array_delete_elem,
 	.map_check_btf = insn_array_check_btf,
 	.map_mem_usage = insn_array_mem_usage,
+	.map_direct_value_addr = insn_array_map_direct_value_addr,
 	.map_btf_id = &insn_array_btf_ids[0],
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4b62a03d6df5..ef4448f18aad 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1708,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code)
 		[BPF_LD | BPF_IND | BPF_B] = true,
 		[BPF_LD | BPF_IND | BPF_H] = true,
 		[BPF_LD | BPF_IND | BPF_W] = true,
+		[BPF_JMP | BPF_JA | BPF_X] = true,
 		[BPF_JMP | BPF_JCOND] = true,
 	};
 #undef BPF_INSN_3_TBL
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index bffb495bc933..a7240013fd9d 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -485,6 +485,9 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
 	struct bpf_iarray *succ;
 	int insn_sz;
 
+	if (unlikely(insn_is_gotox(insn)))
+		return env->insn_aux_data[idx].jt;
+
 	/* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
 	succ = env->succ;
 	succ->cnt = 0;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 70221aafc35c..a0c3b35de2ce 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
 		[PTR_TO_ARENA]		= "arena",
 		[PTR_TO_BUF]		= "buf",
 		[PTR_TO_FUNC]		= "func",
+		[PTR_TO_INSN]		= "insn",
 		[PTR_TO_MAP_KEY]	= "map_key",
 		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",
 	};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 781669f649f2..1268fa075d4c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6006,6 +6006,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
+/*
+ * Return the size of the memory region accessible from a pointer to map value.
+ * For INSN_ARRAY maps the whole bpf_insn_array->ips array is accessible.
+ */
+static u32 map_mem_size(const struct bpf_map *map)
+{
+	if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+		return map->max_entries * sizeof(long);	/* one long-sized slot per entry */
+
+	return map->value_size;
+}
+
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			    int off, int size, bool zero_size_allowed,
@@ -6015,11 +6027,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *reg = &state->regs[regno];
 	struct bpf_map *map = reg->map_ptr;
+	u32 mem_size = map_mem_size(map);
 	struct btf_record *rec;
 	int err, i;
 
-	err = check_mem_region_access(env, regno, off, size, map->value_size,
-				      zero_size_allowed);
+	err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
 	if (err)
 		return err;
 
@@ -7481,6 +7493,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
+	bool insn_array = reg->type == PTR_TO_MAP_VALUE &&
+			  reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -7488,7 +7502,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		return size;
 
 	/* alignment checks will add in reg->off themselves */
-	err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
+	err = check_ptr_alignment(env, reg, off, size, strict_alignment_once || insn_array);
 	if (err)
 		return err;
 
@@ -7515,6 +7529,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			verbose(env, "R%d leaks addr into map\n", value_regno);
 			return -EACCES;
 		}
+		if (t == BPF_WRITE && insn_array) {
+			verbose(env, "writes into insn_array not allowed\n");
+			return -EACCES;
+		}
+
 		err = check_map_access_type(env, regno, off, size, t);
 		if (err)
 			return err;
@@ -7543,6 +7562,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 				regs[value_regno].type = SCALAR_VALUE;
 				__mark_reg_known(&regs[value_regno], val);
+			} else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+				if (bpf_size != BPF_DW) {
+					verbose(env, "Invalid read of %d bytes from insn_array\n",
+						     size);
+					return -EACCES;
+				}
+				copy_register_state(&regs[value_regno], reg);
+				regs[value_regno].type = PTR_TO_INSN;
 			} else {
 				mark_reg_unknown(env, regs, value_regno);
 			}
@@ -17096,7 +17123,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		}
 		dst_reg->type = PTR_TO_MAP_VALUE;
 		dst_reg->off = aux->map_off;
-		WARN_ON_ONCE(map->max_entries != 1);
+		WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
+			     map->max_entries != 1);
 		/* We want reg->id to be same (0) as map_value is not distinct */
 	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
 		   insn->src_reg == BPF_PSEUDO_MAP_IDX) {
@@ -17864,6 +17892,206 @@ static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
 	return new;
 }
 
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+	struct bpf_insn_array_value *value;
+	u32 i;
+
+	for (i = start; i <= end; i++) {	/* both start and end are inclusive */
+		value = map->ops->map_lookup_elem(map, &i);
+		if (!value)
+			return -EINVAL;
+		items[i - start] = value->xlated_off;	/* items[] is filled from index 0 */
+	}
+	return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+	return (*(u32 *)a > *(u32 *)b) - (*(u32 *)a < *(u32 *)b); /* a - b wraps past INT_MAX */
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+	int unique = 1;	/* items[0] is always kept; NOTE(review): returns 1 even for cnt == 0 */
+	int i;
+
+	sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+	for (i = 1; i < cnt; i++)
+		if (items[i] != items[unique - 1])	/* differs from the last value kept */
+			items[unique++] = items[i];	/* compact unique values to the front */
+
+	return unique;	/* number of distinct values, now at items[0..unique-1] */
+}
+
+/*
+ * off = sort_unique({map[start], ..., map[end]}); returns the unique count or an error
+ */
+static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+	u32 n = end - start + 1;	/* [start, end] is inclusive */
+	int err;
+
+	err = copy_insn_array(map, start, end, off);
+	if (err)
+		return err;
+
+	return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map into a new bpf_iarray; returns ERR_PTR() on failure
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+	struct bpf_iarray *jt;
+	int err;
+	int n;
+
+	jt = iarray_realloc(NULL, map->max_entries);
+	if (!jt)
+		return ERR_PTR(-ENOMEM);
+
+	n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+	if (n < 0) {
+		err = n;
+		goto err_free;
+	}
+	if (n == 0) {	/* NOTE(review): seems unreachable, confirm */
+		err = -EINVAL;
+		goto err_free;
+	}
+	jt->cnt = n;	/* only the first n items are valid after deduplication */
+	return jt;
+
+err_free:
+	kvfree(jt);
+	return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined, sorted jump table in jt->items (free with kvfree()), or an ERR_PTR()
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+					  int subprog_start, int subprog_end)
+{
+	struct bpf_iarray *jt = NULL;
+	struct bpf_map *map;
+	struct bpf_iarray *jt_cur;
+	int i;
+
+	for (i = 0; i < env->insn_array_map_cnt; i++) {
+		/*
+		 * TODO (when needed): collect only jump tables, not static keys
+		 * or maps for indirect calls
+		 */
+		map = env->insn_array_maps[i];
+
+		jt_cur = jt_from_map(map);
+		if (IS_ERR(jt_cur)) {
+			kvfree(jt);
+			return jt_cur;
+		}
+
+		/*
+		 * It is enough to check one element here. The full table is
+		 * checked to fit inside the subprog later in create_jt()
+		 */
+		if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+			u32 old_cnt = jt ? jt->cnt : 0;	/* entries gathered from earlier maps */
+			jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
+			if (!jt) {	/* NOTE(review): old jt freed by iarray_realloc()? confirm */
+				kvfree(jt_cur);
+				return ERR_PTR(-ENOMEM);
+			}
+			memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+		}
+
+		kvfree(jt_cur);
+	}
+
+	if (!jt) {
+		verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+		return ERR_PTR(-EINVAL);
+	}
+
+	jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);	/* sort and drop duplicates */
+	return jt;
+}
+
+/* Build the jump table for the gotox at insn t; returns ERR_PTR() on failure */
+static struct bpf_iarray *create_jt(int t, struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprog;	/* per-call: a static would be shared by all callers */
+	int subprog_start, subprog_end;
+	struct bpf_iarray *jt;
+	int i;
+
+	subprog = bpf_find_containing_subprog(env, t);
+	subprog_start = subprog->start;
+	subprog_end = (subprog + 1)->start;	/* the next subprog's start bounds this one */
+	jt = jt_from_subprog(env, subprog_start, subprog_end);
+	if (IS_ERR(jt))
+		return jt;
+
+	/* Check that every element of the jump table fits within the given subprogram */
+	for (i = 0; i < jt->cnt; i++) {
+		if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+			verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+					t, subprog_start, subprog_end);
+			kvfree(jt);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	return jt;
+}
+
+/* CFG walk: gotox at insn t is a "conditional jump with N edges", one per jump table item */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+	int *insn_stack = env->cfg.insn_stack;
+	int *insn_state = env->cfg.insn_state;
+	bool keep_exploring = false;
+	struct bpf_iarray *jt;
+	int i, w;
+
+	jt = env->insn_aux_data[t].jt;
+	if (!jt) {	/* no table cached for this insn yet: build one and store it */
+		jt = create_jt(t, env);
+		if (IS_ERR(jt))
+			return PTR_ERR(jt);
+
+		env->insn_aux_data[t].jt = jt;
+	}
+
+	mark_prune_point(env, t);
+	for (i = 0; i < jt->cnt; i++) {
+		w = jt->items[i];	/* w: candidate jump target */
+		if (w < 0 || w >= env->prog->len) {
+			verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+			return -EINVAL;
+		}
+
+		mark_jmp_point(env, w);
+
+		/* EXPLORED || DISCOVERED */
+		if (insn_state[w])
+			continue;
+
+		if (env->cfg.cur_stack >= env->prog->len)
+			return -E2BIG;
+
+		insn_stack[env->cfg.cur_stack++] = w;	/* push w onto the exploration stack */
+		insn_state[w] |= DISCOVERED;
+		keep_exploring = true;	/* at least one new target was pushed */
+	}
+
+	return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
 /* Visits the instruction at index t and returns one of the following:
  *  < 0 - an error occurred
  *  DONE_EXPLORING - the instruction was fully explored
@@ -17956,8 +18184,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
 	case BPF_JA:
-		if (BPF_SRC(insn->code) != BPF_K)
-			return -EINVAL;
+		if (BPF_SRC(insn->code) == BPF_X)
+			return visit_gotox_insn(t, env);
 
 		if (BPF_CLASS(insn->code) == BPF_JMP)
 			off = insn->off;
@@ -18886,6 +19114,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
 	case PTR_TO_ARENA:
 		return true;
+	case PTR_TO_INSN:
+		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+			rold->off == rcur->off && range_within(rold, rcur) &&
+			tnum_in(rold->var_off, rcur->var_off);
 	default:
 		return regs_exact(rold, rcur, idmap);
 	}
@@ -19895,6 +20127,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
 	return PROCESS_BPF_EXIT;
 }
 
+static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
+				       int regno,
+				       struct bpf_map *map,
+				       u32 *pmin_index, u32 *pmax_index)
+{
+	struct bpf_reg_state *reg = reg_state(env, regno);
+	u64 min_index, max_index;
+	const u32 size = 8;	/* divisor turning byte offsets into table indices */
+
+	if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
+		(min_index > (u64) U32_MAX * size)) {
+		verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
+			     regno, reg->umin_value, reg->off);
+		return -ERANGE;
+	}
+	if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
+		(max_index > (u64) U32_MAX * size)) {
+		verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
+			     regno, reg->umax_value, reg->off);
+		return -ERANGE;
+	}
+
+	min_index /= size;	/* byte offset -> index */
+	max_index /= size;
+
+	if (max_index >= map->max_entries) {
+		verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n",
+			     regno, min_index, max_index, map->max_entries);
+		return -EINVAL;
+	}
+
+	*pmin_index = min_index;	/* both fit in u32 after the bound checks above */
+	*pmax_index = max_index;
+	return 0;
+}
+
+/* gotox *dst_reg: continue verification at every distinct target dst_reg may select */
+static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+	struct bpf_verifier_state *other_branch;
+	struct bpf_reg_state *dst_reg;
+	struct bpf_map *map;
+	u32 min_index, max_index;
+	int err = 0;
+	int n;
+	int i;
+
+	dst_reg = reg_state(env, insn->dst_reg);
+	if (dst_reg->type != PTR_TO_INSN) {
+		verbose(env, "R%d has type %s, expected PTR_TO_INSN\n",
+			     insn->dst_reg, reg_type_str(env, dst_reg->type));
+		return -EINVAL;
+	}
+
+	map = dst_reg->map_ptr;
+	if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg))
+		return -EFAULT;
+
+	if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env,
+			    "R%d has incorrect map type %d", insn->dst_reg, map->map_type))
+		return -EFAULT;
+
+	err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index);
+	if (err)
+		return err;
+
+	/* Ensure that the buffer can hold max_index - min_index + 1 items */
+	if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
+		env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
+						    max_index - min_index + 1);
+		if (!env->gotox_tmp_buf)
+			return -ENOMEM;
+	}
+
+	n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+	if (n < 0)
+		return n;
+	if (n == 0) {
+		verbose(env, "register R%d doesn't point to any offset in map id=%d\n",
+			     insn->dst_reg, map->id);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < n - 1; i++) {	/* push_stack() every target except the last one */
+		other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
+					  env->insn_idx, env->cur_state->speculative);
+		if (IS_ERR(other_branch))
+			return PTR_ERR(other_branch);
+	}
+	env->insn_idx = env->gotox_tmp_buf->items[n-1];	/* the last target is followed directly */
+	return 0;
+}
+
 static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
 {
 	int err;
@@ -19997,6 +20322,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
 
 			mark_reg_scratched(env, BPF_REG_0);
 		} else if (opcode == BPF_JA) {
+			if (BPF_SRC(insn->code) == BPF_X) {
+				if (insn->src_reg != BPF_REG_0 ||
+				    insn->imm != 0 || insn->off != 0) {
+					verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+					return -EINVAL;
+				}
+				return check_indirect_jump(env, insn);
+			}
+
 			if (BPF_SRC(insn->code) != BPF_K ||
 			    insn->src_reg != BPF_REG_0 ||
 			    insn->dst_reg != BPF_REG_0 ||
@@ -20513,6 +20847,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		case BPF_MAP_TYPE_QUEUE:
 		case BPF_MAP_TYPE_STACK:
 		case BPF_MAP_TYPE_ARENA:
+		case BPF_MAP_TYPE_INSN_ARRAY:
 			break;
 		default:
 			verbose(env,
@@ -21070,6 +21405,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
 	return 0;
 }
 
+/*
+ * Free dynamically allocated fields of aux data for instructions [start, start + len)
+ */
+static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int end = start + len;
+	int i;
+
+	for (i = start; i < end; i++) {
+		if (insn_is_gotox(&insns[i])) {
+			kvfree(aux_data[i].jt);
+			aux_data[i].jt = NULL;	/* a repeated call must not free it again */
+		}
+
+		if (bpf_is_ldimm64(&insns[i]))
+			i++;	/* skip the slot after an ldimm64, presumably its second half */
+	}
+}
+
 static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
 {
 	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
@@ -21079,6 +21435,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
 	if (bpf_prog_is_offloaded(env->prog->aux))
 		bpf_prog_offload_remove_insns(env, off, cnt);
 
+	/* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+	clear_insn_aux_data(env, off, cnt);
+
 	err = bpf_remove_insns(env->prog, off, cnt);
 	if (err)
 		return err;
@@ -24945,12 +25304,14 @@ err_release_maps:
 err_unlock:
 	if (!is_priv)
 		mutex_unlock(&bpf_verifier_lock);
+	clear_insn_aux_data(env, 0, env->prog->len);
 	vfree(env->insn_aux_data);
 err_free_env:
 	bpf_stack_liveness_free(env);
 	kvfree(env->cfg.insn_postorder);
 	kvfree(env->scc_info);
 	kvfree(env->succ);
+	kvfree(env->gotox_tmp_buf);
 	kvfree(env);
 	return ret;
 }
-- 
cgit v1.2.3


From 0593447248044ab609b43b947d0e198c887ac281 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 25 Oct 2025 22:50:20 -0700
Subject: lib/crypto: sha3: Add SHA-3 support

Add SHA-3 support to lib/crypto/.  All six algorithms in the SHA-3
family are supported: four digests (SHA3-224, SHA3-256, SHA3-384, and
SHA3-512) and two extendable-output functions (SHAKE128 and SHAKE256).

The SHAKE algorithms will be required for ML-DSA.

[EB: simplified the API to use fewer types and functions, fixed bug that
     sometimes caused incorrect SHAKE output, cleaned up the
     documentation, dropped an ad-hoc test that was inconsistent with
     the rest of lib/crypto/, and many other cleanups]

Signed-off-by: David Howells <dhowells@redhat.com>
Co-developed-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Harald Freudenberger <freude@linux.ibm.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251026055032.1413733-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 Documentation/crypto/index.rst |   1 +
 Documentation/crypto/sha3.rst  | 119 ++++++++++++++
 include/crypto/sha3.h          | 322 +++++++++++++++++++++++++++++++++++-
 lib/crypto/Kconfig             |   7 +
 lib/crypto/Makefile            |   5 +
 lib/crypto/sha3.c              | 359 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 810 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/crypto/sha3.rst
 create mode 100644 lib/crypto/sha3.c

(limited to 'include')

diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst
index 100b47d049c0..4ee667c446f9 100644
--- a/Documentation/crypto/index.rst
+++ b/Documentation/crypto/index.rst
@@ -27,3 +27,4 @@ for cryptographic use cases, as well as programming examples.
    descore-readme
    device_drivers/index
    krb5
+   sha3
diff --git a/Documentation/crypto/sha3.rst b/Documentation/crypto/sha3.rst
new file mode 100644
index 000000000000..b705e70691d7
--- /dev/null
+++ b/Documentation/crypto/sha3.rst
@@ -0,0 +1,119 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+==========================
+SHA-3 Algorithm Collection
+==========================
+
+.. contents::
+
+Overview
+========
+
+The SHA-3 family of algorithms, as specified in NIST FIPS-202 [1]_, contains six
+algorithms based on the Keccak sponge function.  The differences between them
+are: the "rate" (how much of the state buffer gets updated with new data between
+invocations of the Keccak function and analogous to the "block size"), what
+domain separation suffix gets appended to the input data, and how much output
+data is extracted at the end.  The Keccak sponge function is designed such that
+arbitrary amounts of output can be obtained for certain algorithms.
+
+Four digest algorithms are provided:
+
+ - SHA3-224
+ - SHA3-256
+ - SHA3-384
+ - SHA3-512
+
+Additionally, two Extendable-Output Functions (XOFs) are provided:
+
+ - SHAKE128
+ - SHAKE256
+
+The SHA-3 library API supports all six of these algorithms.  The four digest
+algorithms are also supported by the crypto_shash and crypto_ahash APIs.
+
+This document describes the SHA-3 library API.
+
+
+Digests
+=======
+
+The following functions compute SHA-3 digests::
+
+	void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]);
+	void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]);
+	void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]);
+	void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]);
+
+For users that need to pass in data incrementally, an incremental API is also
+provided.  The incremental API uses the following struct::
+
+	struct sha3_ctx { ... };
+
+Initialization is done with one of::
+
+	void sha3_224_init(struct sha3_ctx *ctx);
+	void sha3_256_init(struct sha3_ctx *ctx);
+	void sha3_384_init(struct sha3_ctx *ctx);
+	void sha3_512_init(struct sha3_ctx *ctx);
+
+Input data is then added with any number of calls to::
+
+	void sha3_update(struct sha3_ctx *ctx, const u8 *in, size_t in_len);
+
+Finally, the digest is generated using::
+
+	void sha3_final(struct sha3_ctx *ctx, u8 *out);
+
+which also zeroizes the context.  The length of the digest is determined by the
+initialization function that was called.
+
+
+Extendable-Output Functions
+===========================
+
+The following functions compute the SHA-3 extendable-output functions (XOFs)::
+
+	void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+	void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+
+For users that need to provide the input data incrementally and/or receive the
+output data incrementally, an incremental API is also provided.  The incremental
+API uses the following struct::
+
+	struct shake_ctx { ... };
+
+Initialization is done with one of::
+
+	void shake128_init(struct shake_ctx *ctx);
+	void shake256_init(struct shake_ctx *ctx);
+
+Input data is then added with any number of calls to::
+
+	void shake_update(struct shake_ctx *ctx, const u8 *in, size_t in_len);
+
+Finally, the output data is extracted with any number of calls to::
+
+	void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len);
+
+and telling it how much data should be extracted.  Note that performing multiple
+squeezes, with the output laid consecutively in a buffer, gets exactly the same
+output as doing a single squeeze for the combined amount over the same buffer.
+
+More input data cannot be added after squeezing has started.
+
+Once all the desired output has been extracted, zeroize the context::
+
+	void shake_zeroize_ctx(struct shake_ctx *ctx);
+
+
+References
+==========
+
+.. [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+
+
+API Function Reference
+======================
+
+.. kernel-doc:: include/crypto/sha3.h
diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h
index 41e1b83a6d91..c0c468ee099e 100644
--- a/include/crypto/sha3.h
+++ b/include/crypto/sha3.h
@@ -1,11 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Common values for SHA-3 algorithms
+ *
+ * See also Documentation/crypto/sha3.rst
  */
 #ifndef __CRYPTO_SHA3_H__
 #define __CRYPTO_SHA3_H__
 
 #include <linux/types.h>
+#include <linux/string.h>
 
 #define SHA3_224_DIGEST_SIZE	(224 / 8)
 #define SHA3_224_BLOCK_SIZE	(200 - 2 * SHA3_224_DIGEST_SIZE)
@@ -23,14 +26,327 @@
 #define SHA3_512_BLOCK_SIZE	(200 - 2 * SHA3_512_DIGEST_SIZE)
 #define SHA3_512_EXPORT_SIZE	SHA3_STATE_SIZE + SHA3_512_BLOCK_SIZE + 1
 
+/*
+ * SHAKE128 and SHAKE256 actually have variable output size, but these default
+ * sizes are used to calculate the block size (rate) analogously to the above.
+ */
+#define SHAKE128_DEFAULT_SIZE	(128 / 8)
+#define SHAKE128_BLOCK_SIZE	(200 - 2 * SHAKE128_DEFAULT_SIZE)
+#define SHAKE256_DEFAULT_SIZE	(256 / 8)
+#define SHAKE256_BLOCK_SIZE	(200 - 2 * SHAKE256_DEFAULT_SIZE)
+
 #define SHA3_STATE_SIZE		200
 
 struct shash_desc;
 
+int crypto_sha3_init(struct shash_desc *desc);
+
+/*
+ * State for the Keccak-f[1600] permutation: 25 64-bit words.
+ *
+ * We usually keep the state words as little-endian, to make absorbing and
+ * squeezing easier.  (It means that absorbing and squeezing can just treat the
+ * state as a byte array.)  The state words are converted to native-endian only
+ * temporarily by implementations of the permutation that need native-endian
+ * words.  Of course, that conversion is a no-op on little-endian machines.
+ */
 struct sha3_state {
-	u64		st[SHA3_STATE_SIZE / 8];
+	union {
+		u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */
+
+		__le64 words[SHA3_STATE_SIZE / 8];
+		u8 bytes[SHA3_STATE_SIZE];
+
+		u64 native_words[SHA3_STATE_SIZE / 8]; /* see comment above */
+	};
 };
 
-int crypto_sha3_init(struct shash_desc *desc);
+/* Internal context, shared by the digests (SHA3-*) and the XOFs (SHAKE*) */
+struct __sha3_ctx {
+	struct sha3_state state;
+	u8 digest_size;		/* Digests only: the digest size in bytes */
+	u8 block_size;		/* Block size in bytes */
+	u8 absorb_offset;	/* Index of next state byte to absorb into */
+	u8 squeeze_offset;	/* XOFs only: index of next state byte to extract */
+};
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len);
+
+/**
+ * struct sha3_ctx - Context for SHA3-224, SHA3-256, SHA3-384, or SHA3-512
+ * @ctx: private
+ */
+struct sha3_ctx {
+	struct __sha3_ctx ctx;
+};
+
+/**
+ * sha3_zeroize_ctx() - Zeroize a SHA-3 context
+ * @ctx: The context to zeroize
+ *
+ * This is already called by sha3_final().  Call this explicitly when abandoning
+ * a context without calling sha3_final().
+ */
+static inline void sha3_zeroize_ctx(struct sha3_ctx *ctx)
+{
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+/**
+ * struct shake_ctx - Context for SHAKE128 or SHAKE256
+ * @ctx: private
+ */
+struct shake_ctx {
+	struct __sha3_ctx ctx;
+};
+
+/**
+ * shake_zeroize_ctx() - Zeroize a SHAKE context
+ * @ctx: The context to zeroize
+ *
+ * Call this after the last squeeze.
+ */
+static inline void shake_zeroize_ctx(struct shake_ctx *ctx)
+{
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+/**
+ * sha3_224_init() - Initialize a context for SHA3-224
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-224 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_224_init(struct sha3_ctx *ctx)
+{
+	*ctx = (struct sha3_ctx){
+		.ctx.digest_size = SHA3_224_DIGEST_SIZE,
+		.ctx.block_size = SHA3_224_BLOCK_SIZE,
+	};
+}
+
+/**
+ * sha3_256_init() - Initialize a context for SHA3-256
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-256 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_256_init(struct sha3_ctx *ctx)
+{
+	*ctx = (struct sha3_ctx){
+		.ctx.digest_size = SHA3_256_DIGEST_SIZE,
+		.ctx.block_size = SHA3_256_BLOCK_SIZE,
+	};
+}
+
+/**
+ * sha3_384_init() - Initialize a context for SHA3-384
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-384 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_384_init(struct sha3_ctx *ctx)
+{
+	*ctx = (struct sha3_ctx){
+		.ctx.digest_size = SHA3_384_DIGEST_SIZE,
+		.ctx.block_size = SHA3_384_BLOCK_SIZE,
+	};
+}
+
+/**
+ * sha3_512_init() - Initialize a context for SHA3-512
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-512 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_512_init(struct sha3_ctx *ctx)
+{
+	*ctx = (struct sha3_ctx){
+		.ctx.digest_size = SHA3_512_DIGEST_SIZE,
+		.ctx.block_size = SHA3_512_BLOCK_SIZE,
+	};
+}
+
+/**
+ * sha3_update() - Update a SHA-3 digest context with input data
+ * @ctx: The context to update; must have been initialized
+ * @in: The input data
+ * @in_len: Length of the input data in bytes
+ *
+ * This can be called any number of times to add data to a SHA3-224, SHA3-256,
+ * SHA3-384, or SHA3-512 digest (depending on which init function was called).
+ *
+ * Context: Any context.
+ */
+static inline void sha3_update(struct sha3_ctx *ctx,
+			       const u8 *in, size_t in_len)
+{
+	__sha3_update(&ctx->ctx, in, in_len);
+}
+
+/**
+ * sha3_final() - Finish computing a SHA-3 message digest
+ * @ctx: The context to finalize; must have been initialized
+ * @out: (output) The resulting SHA3-224, SHA3-256, SHA3-384, or SHA3-512
+ *	 message digest, matching the init function that was called.  Note that
+ *	 the size differs for each one; see SHA3_*_DIGEST_SIZE.
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void sha3_final(struct sha3_ctx *ctx, u8 *out);
+
+/**
+ * shake128_init() - Initialize a context for SHAKE128
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHAKE128 extendable-output function (XOF) computation.
+ *
+ * Context: Any context.
+ */
+static inline void shake128_init(struct shake_ctx *ctx)
+{
+	*ctx = (struct shake_ctx){
+		.ctx.block_size = SHAKE128_BLOCK_SIZE,
+	};
+}
+
+/**
+ * shake256_init() - Initialize a context for SHAKE256
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHAKE256 extendable-output function (XOF) computation.
+ *
+ * Context: Any context.
+ */
+static inline void shake256_init(struct shake_ctx *ctx)
+{
+	*ctx = (struct shake_ctx){
+		.ctx.block_size = SHAKE256_BLOCK_SIZE,
+	};
+}
+
+/**
+ * shake_update() - Update a SHAKE context with input data
+ * @ctx: The context to update; must have been initialized
+ * @in: The input data
+ * @in_len: Length of the input data in bytes
+ *
+ * This can be called any number of times to add more input data to SHAKE128 or
+ * SHAKE256.  This cannot be called after squeezing has begun.
+ *
+ * Context: Any context.
+ */
+static inline void shake_update(struct shake_ctx *ctx,
+				const u8 *in, size_t in_len)
+{
+	__sha3_update(&ctx->ctx, in, in_len);
+}
+
+/**
+ * shake_squeeze() - Generate output from SHAKE128 or SHAKE256
+ * @ctx: The context to squeeze; must have been initialized
+ * @out: Where to write the resulting output data
+ * @out_len: The amount of data to extract to @out in bytes
+ *
+ * This may be called multiple times.  A number of consecutive squeezes laid
+ * end-to-end will yield the same output as one big squeeze generating the same
+ * total amount of output.  More input cannot be provided after squeezing has
+ * begun.  After the last squeeze, call shake_zeroize_ctx().
+ *
+ * Context: Any context.
+ */
+void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len);
+
+/**
+ * sha3_224() - Compute SHA3-224 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-224 digest.  Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]);
+
+/**
+ * sha3_256() - Compute SHA3-256 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-256 digest.  Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]);
+
+/**
+ * sha3_384() - Compute SHA3-384 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-384 digest.  Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]);
+
+/**
+ * sha3_512() - Compute SHA3-512 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-512 digest.  Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]);
+
+/**
+ * shake128() - Compute SHAKE128 in one shot
+ * @in: The input data to be used
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the output will be stored
+ * @out_len: Length of the output to produce in bytes
+ *
+ * Convenience function that computes SHAKE128 in one shot.  Use this instead of
+ * the incremental API if you're able to provide all the input at once as well
+ * as receive all the output at once.  All output lengths are supported.
+ *
+ * Context: Any context.
+ */
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+
+/**
+ * shake256() - Compute SHAKE256 in one shot
+ * @in: The input data to be used
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the output will be stored
+ * @out_len: Length of the output to produce in bytes
+ *
+ * Convenience function that computes SHAKE256 in one shot.  Use this instead of
+ * the incremental API if you're able to provide all the input at once as well
+ * as receive all the output at once.  All output lengths are supported.
+ *
+ * Context: Any context.
+ */
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len);
 
-#endif
+#endif /* __CRYPTO_SHA3_H__ */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 280b888153bf..a05f5a349cd8 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -195,6 +195,13 @@ config CRYPTO_LIB_SHA512_ARCH
 	default y if SPARC64
 	default y if X86_64
 
+config CRYPTO_LIB_SHA3
+	tristate
+	select CRYPTO_LIB_UTILS
+	help
+	  The SHA-3 library functions.  Select this if your module uses any of
+	  the functions from <crypto/sha3.h>.
+
 config CRYPTO_LIB_SM3
 	tristate
 
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index bc26777d08e9..0cfdb511f32b 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -278,6 +278,11 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
 
 ################################################################################
 
+obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
+libsha3-y := sha3.o
+
+################################################################################
+
 obj-$(CONFIG_MPILIB) += mpi/
 
 obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)		+= simd.o
diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c
new file mode 100644
index 000000000000..56d8353f9c5b
--- /dev/null
+++ b/lib/crypto/sha3.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-3, as specified in
+ * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ *               Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *               David Howells <dhowells@redhat.com>
+ *
+ * See also Documentation/crypto/sha3.rst
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/sha3.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+/*
+ * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
+ * stack if the round calculation gets inlined into the loop in
+ * sha3_keccakf_generic().  On the other hand, on 64-bit architectures with
+ * plenty of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance.  So let's use 64-bitness as a heuristic to decide whether
+ * to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
+#define SHA3_KECCAK_ROUNDS 24
+
+static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/*
+ * Perform a single round of Keccak mixing, except for the Iota step.
+ */
+static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25])
+{
+	u64 t[5], tt, bc[5];
+
+	/* Theta */
+	bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+	bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+	bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+	bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+	bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+	t[0] = bc[4] ^ rol64(bc[1], 1);
+	t[1] = bc[0] ^ rol64(bc[2], 1);
+	t[2] = bc[1] ^ rol64(bc[3], 1);
+	t[3] = bc[2] ^ rol64(bc[4], 1);
+	t[4] = bc[3] ^ rol64(bc[0], 1);
+
+	st[0] ^= t[0];
+
+	/* Rho Pi */
+	tt = st[1];
+	st[ 1] = rol64(st[ 6] ^ t[1], 44);
+	st[ 6] = rol64(st[ 9] ^ t[4], 20);
+	st[ 9] = rol64(st[22] ^ t[2], 61);
+	st[22] = rol64(st[14] ^ t[4], 39);
+	st[14] = rol64(st[20] ^ t[0], 18);
+	st[20] = rol64(st[ 2] ^ t[2], 62);
+	st[ 2] = rol64(st[12] ^ t[2], 43);
+	st[12] = rol64(st[13] ^ t[3], 25);
+	st[13] = rol64(st[19] ^ t[4],  8);
+	st[19] = rol64(st[23] ^ t[3], 56);
+	st[23] = rol64(st[15] ^ t[0], 41);
+	st[15] = rol64(st[ 4] ^ t[4], 27);
+	st[ 4] = rol64(st[24] ^ t[4], 14);
+	st[24] = rol64(st[21] ^ t[1],  2);
+	st[21] = rol64(st[ 8] ^ t[3], 55);
+	st[ 8] = rol64(st[16] ^ t[1], 45);
+	st[16] = rol64(st[ 5] ^ t[0], 36);
+	st[ 5] = rol64(st[ 3] ^ t[3], 28);
+	st[ 3] = rol64(st[18] ^ t[3], 21);
+	st[18] = rol64(st[17] ^ t[2], 15);
+	st[17] = rol64(st[11] ^ t[1], 10);
+	st[11] = rol64(st[ 7] ^ t[2],  6);
+	st[ 7] = rol64(st[10] ^ t[0],  3);
+	st[10] = rol64(    tt ^ t[1],  1);
+
+	/* Chi */
+	bc[ 0] = ~st[ 1] & st[ 2];
+	bc[ 1] = ~st[ 2] & st[ 3];
+	bc[ 2] = ~st[ 3] & st[ 4];
+	bc[ 3] = ~st[ 4] & st[ 0];
+	bc[ 4] = ~st[ 0] & st[ 1];
+	st[ 0] ^= bc[ 0];
+	st[ 1] ^= bc[ 1];
+	st[ 2] ^= bc[ 2];
+	st[ 3] ^= bc[ 3];
+	st[ 4] ^= bc[ 4];
+
+	bc[ 0] = ~st[ 6] & st[ 7];
+	bc[ 1] = ~st[ 7] & st[ 8];
+	bc[ 2] = ~st[ 8] & st[ 9];
+	bc[ 3] = ~st[ 9] & st[ 5];
+	bc[ 4] = ~st[ 5] & st[ 6];
+	st[ 5] ^= bc[ 0];
+	st[ 6] ^= bc[ 1];
+	st[ 7] ^= bc[ 2];
+	st[ 8] ^= bc[ 3];
+	st[ 9] ^= bc[ 4];
+
+	bc[ 0] = ~st[11] & st[12];
+	bc[ 1] = ~st[12] & st[13];
+	bc[ 2] = ~st[13] & st[14];
+	bc[ 3] = ~st[14] & st[10];
+	bc[ 4] = ~st[10] & st[11];
+	st[10] ^= bc[ 0];
+	st[11] ^= bc[ 1];
+	st[12] ^= bc[ 2];
+	st[13] ^= bc[ 3];
+	st[14] ^= bc[ 4];
+
+	bc[ 0] = ~st[16] & st[17];
+	bc[ 1] = ~st[17] & st[18];
+	bc[ 2] = ~st[18] & st[19];
+	bc[ 3] = ~st[19] & st[15];
+	bc[ 4] = ~st[15] & st[16];
+	st[15] ^= bc[ 0];
+	st[16] ^= bc[ 1];
+	st[17] ^= bc[ 2];
+	st[18] ^= bc[ 3];
+	st[19] ^= bc[ 4];
+
+	bc[ 0] = ~st[21] & st[22];
+	bc[ 1] = ~st[22] & st[23];
+	bc[ 2] = ~st[23] & st[24];
+	bc[ 3] = ~st[24] & st[20];
+	bc[ 4] = ~st[20] & st[21];
+	st[20] ^= bc[ 0];
+	st[21] ^= bc[ 1];
+	st[22] ^= bc[ 2];
+	st[23] ^= bc[ 3];
+	st[24] ^= bc[ 4];
+}
+
+/* Generic implementation of the Keccak-f[1600] permutation */
+static void sha3_keccakf_generic(struct sha3_state *state)
+{
+	/*
+	 * Temporarily convert the state words from little-endian to native-
+	 * endian so that they can be operated on.  Note that on little-endian
+	 * machines this conversion is a no-op and is optimized out.
+	 */
+
+	for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+		state->native_words[i] = le64_to_cpu(state->words[i]);
+
+	for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++) {
+		sha3_keccakf_one_round_generic(state->native_words);
+		/* Iota */
+		state->native_words[0] ^= sha3_keccakf_rndc[round];
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+		state->words[i] = cpu_to_le64(state->native_words[i]);
+}
+
+/*
+ * Generic implementation of absorbing the given nonzero number of full blocks
+ * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
+ */
+static void __maybe_unused
+sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
+			   size_t nblocks, size_t block_size)
+{
+	do {
+		for (size_t i = 0; i < block_size; i += 8)
+			state->words[i / 8] ^= get_unaligned((const __le64 *)&data[i]);
+		sha3_keccakf_generic(state);
+		data += block_size;
+	} while (--nblocks);	/* nblocks >= 1 on entry */
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
+#include "sha3.h" /* $(SRCARCH)/sha3.h */
+#else
+#define sha3_keccakf		sha3_keccakf_generic
+#define sha3_absorb_blocks	sha3_absorb_blocks_generic
+#endif
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
+{
+	const size_t block_size = ctx->block_size;
+	size_t absorb_offset = ctx->absorb_offset;
+
+	/* Warn if squeezing has begun (absorb_offset was set to block_size). */
+	WARN_ON_ONCE(absorb_offset >= block_size);
+	/* If the input completes a partially filled block, fill and permute it. */
+	if (absorb_offset && absorb_offset + in_len >= block_size) {
+		crypto_xor(&ctx->state.bytes[absorb_offset], in,
+			   block_size - absorb_offset);
+		in += block_size - absorb_offset;
+		in_len -= block_size - absorb_offset;
+		sha3_keccakf(&ctx->state);
+		absorb_offset = 0;
+	}
+	/* Absorb as many whole blocks as possible directly from the input. */
+	if (in_len >= block_size) {
+		size_t nblocks = in_len / block_size;
+
+		sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
+		in += nblocks * block_size;
+		in_len -= nblocks * block_size;
+	}
+	/* XOR any remaining tail into the state; no permutation happens here. */
+	if (in_len) {
+		crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
+		absorb_offset += in_len;
+	}
+	ctx->absorb_offset = absorb_offset;
+}
+EXPORT_SYMBOL_GPL(__sha3_update);
+
+void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
+{
+	struct __sha3_ctx *ctx = &sha3_ctx->ctx;
+
+	ctx->state.bytes[ctx->absorb_offset] ^= 0x06;	/* domain suffix + first pad bit */
+	ctx->state.bytes[ctx->block_size - 1] ^= 0x80;	/* last pad bit */
+	sha3_keccakf(&ctx->state);
+	memcpy(out, ctx->state.bytes, ctx->digest_size);
+	sha3_zeroize_ctx(sha3_ctx);	/* wipe the whole context */
+}
+EXPORT_SYMBOL_GPL(sha3_final);
+
+void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
+{
+	struct __sha3_ctx *ctx = &shake_ctx->ctx;
+	const size_t block_size = ctx->block_size;
+	size_t squeeze_offset = ctx->squeeze_offset;
+
+	if (ctx->absorb_offset < block_size) {
+		/* First squeeze: */
+
+		/* Add the domain separation suffix and padding. */
+		ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
+		ctx->state.bytes[block_size - 1] ^= 0x80;
+
+		/* Indicate that squeezing has begun. */
+		ctx->absorb_offset = block_size;
+
+		/*
+		 * Indicate that no output is pending yet, i.e. sha3_keccakf()
+		 * will need to be called before the first copy.
+		 */
+		squeeze_offset = block_size;
+	}
+	while (out_len) {
+		if (squeeze_offset == block_size) {
+			sha3_keccakf(&ctx->state);
+			squeeze_offset = 0;
+		}
+		size_t copy = min(out_len, block_size - squeeze_offset);
+
+		memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
+		out += copy;
+		out_len -= copy;
+		squeeze_offset += copy;
+	}
+	ctx->squeeze_offset = squeeze_offset;
+}
+EXPORT_SYMBOL_GPL(shake_squeeze);
+
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	sha3_224_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_224);
+
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	sha3_256_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_256);
+
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	sha3_384_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_384);
+
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	sha3_512_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_512);
+
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+	struct shake_ctx ctx;
+
+	shake128_init(&ctx);
+	shake_update(&ctx, in, in_len);
+	shake_squeeze(&ctx, out, out_len);
+	shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake128);
+
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+	struct shake_ctx ctx;
+
+	shake256_init(&ctx);
+	shake_update(&ctx, in, in_len);
+	shake_squeeze(&ctx, out, out_len);
+	shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake256);
+
+#ifdef sha3_mod_init_arch
+static int __init sha3_mod_init(void)
+{
+	sha3_mod_init_arch();
+	return 0;
+}
+subsys_initcall(sha3_mod_init);
+
+static void __exit sha3_mod_exit(void)
+{
+}
+module_exit(sha3_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-3 library functions");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From f1799d17285ca99243328cd92133a9f84ee3a593 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 25 Oct 2025 22:50:31 -0700
Subject: crypto: sha3 - Reimplement using library API

Replace sha3_generic.c with a new file sha3.c which implements the SHA-3
crypto_shash algorithms on top of the SHA-3 library API.

Change the driver name suffix from "-generic" to "-lib" to reflect that
these algorithms now just use the (possibly arch-optimized) library.

This closely mirrors crypto/{md5,sha1,sha256,sha512,blake2b}.c.

Implement export_core and import_core, since crypto/hmac.c expects these
to be present.  (Note that there is no security purpose in wrapping
SHA-3 with HMAC.  HMAC was designed for older algorithms that don't
resist length extension attacks.  But since someone could be using
"hmac(sha3-*)" via crypto_shash anyway, keep supporting it for now.)

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Harald Freudenberger <freude@linux.ibm.com>
Link: https://lore.kernel.org/r/20251026055032.1413733-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 crypto/Kconfig        |   1 +
 crypto/Makefile       |   2 +-
 crypto/sha3.c         | 166 +++++++++++++++++++++++++++++
 crypto/sha3_generic.c | 290 --------------------------------------------------
 crypto/testmgr.c      |   8 ++
 include/crypto/sha3.h |   6 --
 6 files changed, 176 insertions(+), 297 deletions(-)
 create mode 100644 crypto/sha3.c
 delete mode 100644 crypto/sha3_generic.c

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 0a7e74ac870b..57b85e903cf0 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1006,6 +1006,7 @@ config CRYPTO_SHA512
 config CRYPTO_SHA3
 	tristate "SHA-3"
 	select CRYPTO_HASH
+	select CRYPTO_LIB_SHA3
 	help
 	  SHA-3 secure hash algorithms (FIPS 202, ISO/IEC 10118-3)
 
diff --git a/crypto/Makefile b/crypto/Makefile
index 5b02ca2cb04e..0388ff8d219d 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -78,7 +78,7 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o
 obj-$(CONFIG_CRYPTO_SHA1) += sha1.o
 obj-$(CONFIG_CRYPTO_SHA256) += sha256.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512.o
-obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
+obj-$(CONFIG_CRYPTO_SHA3) += sha3.o
 obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o
 obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
diff --git a/crypto/sha3.c b/crypto/sha3.c
new file mode 100644
index 000000000000..8f364979ec89
--- /dev/null
+++ b/crypto/sha3.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Crypto API support for SHA-3
+ * (https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf)
+ */
+#include <crypto/internal/hash.h>
+#include <crypto/sha3.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#define SHA3_CTX(desc) ((struct sha3_ctx *)shash_desc_ctx(desc))
+
+static int crypto_sha3_224_init(struct shash_desc *desc)
+{
+	sha3_224_init(SHA3_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha3_256_init(struct shash_desc *desc)
+{
+	sha3_256_init(SHA3_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha3_384_init(struct shash_desc *desc)
+{
+	sha3_384_init(SHA3_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha3_512_init(struct shash_desc *desc)
+{
+	sha3_512_init(SHA3_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha3_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	sha3_update(SHA3_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_sha3_final(struct shash_desc *desc, u8 *out)
+{
+	sha3_final(SHA3_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_sha3_224_digest(struct shash_desc *desc,
+				  const u8 *data, unsigned int len, u8 *out)
+{
+	sha3_224(data, len, out);
+	return 0;
+}
+
+static int crypto_sha3_256_digest(struct shash_desc *desc,
+				  const u8 *data, unsigned int len, u8 *out)
+{
+	sha3_256(data, len, out);
+	return 0;
+}
+
+static int crypto_sha3_384_digest(struct shash_desc *desc,
+				  const u8 *data, unsigned int len, u8 *out)
+{
+	sha3_384(data, len, out);
+	return 0;
+}
+
+static int crypto_sha3_512_digest(struct shash_desc *desc,
+				  const u8 *data, unsigned int len, u8 *out)
+{
+	sha3_512(data, len, out);
+	return 0;
+}
+
+static int crypto_sha3_export_core(struct shash_desc *desc, void *out)
+{
+	memcpy(out, SHA3_CTX(desc), sizeof(struct sha3_ctx));
+	return 0;
+}
+
+static int crypto_sha3_import_core(struct shash_desc *desc, const void *in)
+{
+	memcpy(SHA3_CTX(desc), in, sizeof(struct sha3_ctx));
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize		= SHA3_224_DIGEST_SIZE,
+	.init			= crypto_sha3_224_init,
+	.update			= crypto_sha3_update,
+	.final			= crypto_sha3_final,
+	.digest			= crypto_sha3_224_digest,
+	.export_core		= crypto_sha3_export_core,
+	.import_core		= crypto_sha3_import_core,
+	.descsize		= sizeof(struct sha3_ctx),
+	.base.cra_name		= "sha3-224",
+	.base.cra_driver_name	= "sha3-224-lib",
+	.base.cra_blocksize	= SHA3_224_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_256_DIGEST_SIZE,
+	.init			= crypto_sha3_256_init,
+	.update			= crypto_sha3_update,
+	.final			= crypto_sha3_final,
+	.digest			= crypto_sha3_256_digest,
+	.export_core		= crypto_sha3_export_core,
+	.import_core		= crypto_sha3_import_core,
+	.descsize		= sizeof(struct sha3_ctx),
+	.base.cra_name		= "sha3-256",
+	.base.cra_driver_name	= "sha3-256-lib",
+	.base.cra_blocksize	= SHA3_256_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_384_DIGEST_SIZE,
+	.init			= crypto_sha3_384_init,
+	.update			= crypto_sha3_update,
+	.final			= crypto_sha3_final,
+	.digest			= crypto_sha3_384_digest,
+	.export_core		= crypto_sha3_export_core,
+	.import_core		= crypto_sha3_import_core,
+	.descsize		= sizeof(struct sha3_ctx),
+	.base.cra_name		= "sha3-384",
+	.base.cra_driver_name	= "sha3-384-lib",
+	.base.cra_blocksize	= SHA3_384_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_512_DIGEST_SIZE,
+	.init			= crypto_sha3_512_init,
+	.update			= crypto_sha3_update,
+	.final			= crypto_sha3_final,
+	.digest			= crypto_sha3_512_digest,
+	.export_core		= crypto_sha3_export_core,
+	.import_core		= crypto_sha3_import_core,
+	.descsize		= sizeof(struct sha3_ctx),
+	.base.cra_name		= "sha3-512",
+	.base.cra_driver_name	= "sha3-512-lib",
+	.base.cra_blocksize	= SHA3_512_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crypto_sha3_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+module_init(crypto_sha3_mod_init);
+
+static void __exit crypto_sha3_mod_exit(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+module_exit(crypto_sha3_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Crypto API support for SHA-3");
+
+MODULE_ALIAS_CRYPTO("sha3-224");
+MODULE_ALIAS_CRYPTO("sha3-224-lib");
+MODULE_ALIAS_CRYPTO("sha3-256");
+MODULE_ALIAS_CRYPTO("sha3-256-lib");
+MODULE_ALIAS_CRYPTO("sha3-384");
+MODULE_ALIAS_CRYPTO("sha3-384-lib");
+MODULE_ALIAS_CRYPTO("sha3-512");
+MODULE_ALIAS_CRYPTO("sha3-512-lib");
diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
deleted file mode 100644
index 41d1e506e6de..000000000000
--- a/crypto/sha3_generic.c
+++ /dev/null
@@ -1,290 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * SHA-3, as specified in
- * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
- *
- * SHA-3 code by Jeff Garzik <jeff@garzik.org>
- *               Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-#include <crypto/internal/hash.h>
-#include <crypto/sha3.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-/*
- * On some 32-bit architectures (h8300), GCC ends up using
- * over 1 KB of stack if we inline the round calculation into the loop
- * in keccakf(). On the other hand, on 64-bit architectures with plenty
- * of [64-bit wide] general purpose registers, not inlining it severely
- * hurts performance. So let's use 64-bitness as a heuristic to decide
- * whether to inline or not.
- */
-#ifdef CONFIG_64BIT
-#define SHA3_INLINE	inline
-#else
-#define SHA3_INLINE	noinline
-#endif
-
-#define KECCAK_ROUNDS 24
-
-static const u64 keccakf_rndc[24] = {
-	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
-	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
-	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
-	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
-	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
-	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
-	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
-	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
-};
-
-/* update the state with given number of rounds */
-
-static SHA3_INLINE void keccakf_round(u64 st[25])
-{
-	u64 t[5], tt, bc[5];
-
-	/* Theta */
-	bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
-	bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
-	bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
-	bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
-	bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
-	t[0] = bc[4] ^ rol64(bc[1], 1);
-	t[1] = bc[0] ^ rol64(bc[2], 1);
-	t[2] = bc[1] ^ rol64(bc[3], 1);
-	t[3] = bc[2] ^ rol64(bc[4], 1);
-	t[4] = bc[3] ^ rol64(bc[0], 1);
-
-	st[0] ^= t[0];
-
-	/* Rho Pi */
-	tt = st[1];
-	st[ 1] = rol64(st[ 6] ^ t[1], 44);
-	st[ 6] = rol64(st[ 9] ^ t[4], 20);
-	st[ 9] = rol64(st[22] ^ t[2], 61);
-	st[22] = rol64(st[14] ^ t[4], 39);
-	st[14] = rol64(st[20] ^ t[0], 18);
-	st[20] = rol64(st[ 2] ^ t[2], 62);
-	st[ 2] = rol64(st[12] ^ t[2], 43);
-	st[12] = rol64(st[13] ^ t[3], 25);
-	st[13] = rol64(st[19] ^ t[4],  8);
-	st[19] = rol64(st[23] ^ t[3], 56);
-	st[23] = rol64(st[15] ^ t[0], 41);
-	st[15] = rol64(st[ 4] ^ t[4], 27);
-	st[ 4] = rol64(st[24] ^ t[4], 14);
-	st[24] = rol64(st[21] ^ t[1],  2);
-	st[21] = rol64(st[ 8] ^ t[3], 55);
-	st[ 8] = rol64(st[16] ^ t[1], 45);
-	st[16] = rol64(st[ 5] ^ t[0], 36);
-	st[ 5] = rol64(st[ 3] ^ t[3], 28);
-	st[ 3] = rol64(st[18] ^ t[3], 21);
-	st[18] = rol64(st[17] ^ t[2], 15);
-	st[17] = rol64(st[11] ^ t[1], 10);
-	st[11] = rol64(st[ 7] ^ t[2],  6);
-	st[ 7] = rol64(st[10] ^ t[0],  3);
-	st[10] = rol64(    tt ^ t[1],  1);
-
-	/* Chi */
-	bc[ 0] = ~st[ 1] & st[ 2];
-	bc[ 1] = ~st[ 2] & st[ 3];
-	bc[ 2] = ~st[ 3] & st[ 4];
-	bc[ 3] = ~st[ 4] & st[ 0];
-	bc[ 4] = ~st[ 0] & st[ 1];
-	st[ 0] ^= bc[ 0];
-	st[ 1] ^= bc[ 1];
-	st[ 2] ^= bc[ 2];
-	st[ 3] ^= bc[ 3];
-	st[ 4] ^= bc[ 4];
-
-	bc[ 0] = ~st[ 6] & st[ 7];
-	bc[ 1] = ~st[ 7] & st[ 8];
-	bc[ 2] = ~st[ 8] & st[ 9];
-	bc[ 3] = ~st[ 9] & st[ 5];
-	bc[ 4] = ~st[ 5] & st[ 6];
-	st[ 5] ^= bc[ 0];
-	st[ 6] ^= bc[ 1];
-	st[ 7] ^= bc[ 2];
-	st[ 8] ^= bc[ 3];
-	st[ 9] ^= bc[ 4];
-
-	bc[ 0] = ~st[11] & st[12];
-	bc[ 1] = ~st[12] & st[13];
-	bc[ 2] = ~st[13] & st[14];
-	bc[ 3] = ~st[14] & st[10];
-	bc[ 4] = ~st[10] & st[11];
-	st[10] ^= bc[ 0];
-	st[11] ^= bc[ 1];
-	st[12] ^= bc[ 2];
-	st[13] ^= bc[ 3];
-	st[14] ^= bc[ 4];
-
-	bc[ 0] = ~st[16] & st[17];
-	bc[ 1] = ~st[17] & st[18];
-	bc[ 2] = ~st[18] & st[19];
-	bc[ 3] = ~st[19] & st[15];
-	bc[ 4] = ~st[15] & st[16];
-	st[15] ^= bc[ 0];
-	st[16] ^= bc[ 1];
-	st[17] ^= bc[ 2];
-	st[18] ^= bc[ 3];
-	st[19] ^= bc[ 4];
-
-	bc[ 0] = ~st[21] & st[22];
-	bc[ 1] = ~st[22] & st[23];
-	bc[ 2] = ~st[23] & st[24];
-	bc[ 3] = ~st[24] & st[20];
-	bc[ 4] = ~st[20] & st[21];
-	st[20] ^= bc[ 0];
-	st[21] ^= bc[ 1];
-	st[22] ^= bc[ 2];
-	st[23] ^= bc[ 3];
-	st[24] ^= bc[ 4];
-}
-
-static void keccakf(u64 st[25])
-{
-	int round;
-
-	for (round = 0; round < KECCAK_ROUNDS; round++) {
-		keccakf_round(st);
-		/* Iota */
-		st[0] ^= keccakf_rndc[round];
-	}
-}
-
-int crypto_sha3_init(struct shash_desc *desc)
-{
-	struct sha3_state *sctx = shash_desc_ctx(desc);
-
-	memset(sctx->st, 0, sizeof(sctx->st));
-	return 0;
-}
-EXPORT_SYMBOL(crypto_sha3_init);
-
-static int crypto_sha3_update(struct shash_desc *desc, const u8 *data,
-			      unsigned int len)
-{
-	unsigned int rsiz = crypto_shash_blocksize(desc->tfm);
-	struct sha3_state *sctx = shash_desc_ctx(desc);
-	unsigned int rsizw = rsiz / 8;
-
-	do {
-		int i;
-
-		for (i = 0; i < rsizw; i++)
-			sctx->st[i] ^= get_unaligned_le64(data + 8 * i);
-		keccakf(sctx->st);
-
-		data += rsiz;
-		len -= rsiz;
-	} while (len >= rsiz);
-	return len;
-}
-
-static int crypto_sha3_finup(struct shash_desc *desc, const u8 *src,
-			     unsigned int len, u8 *out)
-{
-	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
-	unsigned int rsiz = crypto_shash_blocksize(desc->tfm);
-	struct sha3_state *sctx = shash_desc_ctx(desc);
-	__le64 block[SHA3_224_BLOCK_SIZE / 8] = {};
-	__le64 *digest = (__le64 *)out;
-	unsigned int rsizw = rsiz / 8;
-	u8 *p;
-	int i;
-
-	p = memcpy(block, src, len);
-	p[len++] = 0x06;
-	p[rsiz - 1] |= 0x80;
-
-	for (i = 0; i < rsizw; i++)
-		sctx->st[i] ^= le64_to_cpu(block[i]);
-	memzero_explicit(block, sizeof(block));
-
-	keccakf(sctx->st);
-
-	for (i = 0; i < digest_size / 8; i++)
-		put_unaligned_le64(sctx->st[i], digest++);
-
-	if (digest_size & 4)
-		put_unaligned_le32(sctx->st[i], (__le32 *)digest);
-
-	return 0;
-}
-
-static struct shash_alg algs[] = { {
-	.digestsize		= SHA3_224_DIGEST_SIZE,
-	.init			= crypto_sha3_init,
-	.update			= crypto_sha3_update,
-	.finup			= crypto_sha3_finup,
-	.descsize		= SHA3_STATE_SIZE,
-	.base.cra_name		= "sha3-224",
-	.base.cra_driver_name	= "sha3-224-generic",
-	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-	.base.cra_blocksize	= SHA3_224_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.digestsize		= SHA3_256_DIGEST_SIZE,
-	.init			= crypto_sha3_init,
-	.update			= crypto_sha3_update,
-	.finup			= crypto_sha3_finup,
-	.descsize		= SHA3_STATE_SIZE,
-	.base.cra_name		= "sha3-256",
-	.base.cra_driver_name	= "sha3-256-generic",
-	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-	.base.cra_blocksize	= SHA3_256_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.digestsize		= SHA3_384_DIGEST_SIZE,
-	.init			= crypto_sha3_init,
-	.update			= crypto_sha3_update,
-	.finup			= crypto_sha3_finup,
-	.descsize		= SHA3_STATE_SIZE,
-	.base.cra_name		= "sha3-384",
-	.base.cra_driver_name	= "sha3-384-generic",
-	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-	.base.cra_blocksize	= SHA3_384_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.digestsize		= SHA3_512_DIGEST_SIZE,
-	.init			= crypto_sha3_init,
-	.update			= crypto_sha3_update,
-	.finup			= crypto_sha3_finup,
-	.descsize		= SHA3_STATE_SIZE,
-	.base.cra_name		= "sha3-512",
-	.base.cra_driver_name	= "sha3-512-generic",
-	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-	.base.cra_blocksize	= SHA3_512_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-} };
-
-static int __init sha3_generic_mod_init(void)
-{
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha3_generic_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha3_generic_mod_init);
-module_exit(sha3_generic_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-3 Secure Hash Algorithm");
-
-MODULE_ALIAS_CRYPTO("sha3-224");
-MODULE_ALIAS_CRYPTO("sha3-224-generic");
-MODULE_ALIAS_CRYPTO("sha3-256");
-MODULE_ALIAS_CRYPTO("sha3-256-generic");
-MODULE_ALIAS_CRYPTO("sha3-384");
-MODULE_ALIAS_CRYPTO("sha3-384-generic");
-MODULE_ALIAS_CRYPTO("sha3-512");
-MODULE_ALIAS_CRYPTO("sha3-512-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 3ab7adc1cdce..90d06c3ec967 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -5104,6 +5104,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha3-224)",
+		.generic_driver = "hmac(sha3-224-lib)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5111,6 +5112,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha3-256)",
+		.generic_driver = "hmac(sha3-256-lib)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5118,6 +5120,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha3-384)",
+		.generic_driver = "hmac(sha3-384-lib)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5125,6 +5128,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha3-512)",
+		.generic_driver = "hmac(sha3-512-lib)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5478,6 +5482,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha3-224",
+		.generic_driver = "sha3-224-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5485,6 +5490,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha3-256",
+		.generic_driver = "sha3-256-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5492,6 +5498,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha3-384",
+		.generic_driver = "sha3-384-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5499,6 +5506,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha3-512",
+		.generic_driver = "sha3-512-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h
index c0c468ee099e..c9e4182ff74f 100644
--- a/include/crypto/sha3.h
+++ b/include/crypto/sha3.h
@@ -37,10 +37,6 @@
 
 #define SHA3_STATE_SIZE		200
 
-struct shash_desc;
-
-int crypto_sha3_init(struct shash_desc *desc);
-
 /*
  * State for the Keccak-f[1600] permutation: 25 64-bit words.
  *
@@ -52,8 +48,6 @@ int crypto_sha3_init(struct shash_desc *desc);
  */
 struct sha3_state {
 	union {
-		u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */
-
 		__le64 words[SHA3_STATE_SIZE / 8];
 		u8 bytes[SHA3_STATE_SIZE];
 
-- 
cgit v1.2.3


From 512c83265796d613f21255c766839eaed1c1cc79 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 4 Nov 2025 20:51:27 -0800
Subject: IB/rdmavt: rdmavt_qp.h: clean up kernel-doc comments

Correct the kernel-doc comments format to avoid around 35 kernel-doc
warnings:

- use struct keyword to introduce struct kernel-doc comments
- use correct variable name for some struct members
- use correct function name in comments for some functions
- fix spelling in a few comments
- use a ':' instead of '-' to separate struct members from their
  descriptions
- add a function name heading for rvt_div_mtu()

This leaves one struct member that is not described:
rdmavt_qp.h:206: warning: Function parameter or struct member 'wq'
 not described in 'rvt_krwq'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251105045127.106822-1-rdunlap@infradead.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/rdma/rdmavt_qp.h | 70 +++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index d67892944193..71140ea0aeb2 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -144,7 +144,7 @@
 #define RVT_SEND_COMPLETION_ONLY	(IB_SEND_RESERVED_START << 1)
 
 /**
- * rvt_ud_wr - IB UD work plus AH cache
+ * struct rvt_ud_wr - IB UD work plus AH cache
  * @wr: valid IB work request
  * @attr: pointer to an allocated AH attribute
  *
@@ -184,10 +184,10 @@ struct rvt_swqe {
  * struct rvt_krwq - kernel struct receive work request
  * @p_lock: lock to protect producer of the kernel buffer
  * @head: index of next entry to fill
- * @c_lock:lock to protect consumer of the kernel buffer
+ * @c_lock: lock to protect consumer of the kernel buffer
  * @tail: index of next entry to pull
- * @count: count is aproximate of total receive enteries posted
- * @rvt_rwqe: struct of receive work request queue entry
+ * @count: count is approximate of total receive entries posted
+ * @curr_wq: struct of receive work request queue entry
  *
  * This structure is used to contain the head pointer,
  * tail pointer and receive work queue entries for kernel
@@ -309,10 +309,10 @@ struct rvt_ack_entry {
 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
 
 /**
- * rvt_operation_params - op table entry
- * @length - the length to copy into the swqe entry
- * @qpt_support - a bit mask indicating QP type support
- * @flags - RVT_OPERATION flags (see above)
+ * struct rvt_operation_params - op table entry
+ * @length: the length to copy into the swqe entry
+ * @qpt_support: a bit mask indicating QP type support
+ * @flags: RVT_OPERATION flags (see above)
  *
  * This supports table driven post send so that
  * the driver can have differing an potentially
@@ -552,7 +552,7 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
 
 /**
  * rvt_is_user_qp - return if this is user mode QP
- * @qp - the target QP
+ * @qp: the target QP
  */
 static inline bool rvt_is_user_qp(struct rvt_qp *qp)
 {
@@ -561,7 +561,7 @@ static inline bool rvt_is_user_qp(struct rvt_qp *qp)
 
 /**
  * rvt_get_qp - get a QP reference
- * @qp - the QP to hold
+ * @qp: the QP to hold
  */
 static inline void rvt_get_qp(struct rvt_qp *qp)
 {
@@ -570,7 +570,7 @@ static inline void rvt_get_qp(struct rvt_qp *qp)
 
 /**
  * rvt_put_qp - release a QP reference
- * @qp - the QP to release
+ * @qp: the QP to release
  */
 static inline void rvt_put_qp(struct rvt_qp *qp)
 {
@@ -580,7 +580,7 @@ static inline void rvt_put_qp(struct rvt_qp *qp)
 
 /**
  * rvt_put_swqe - drop mr refs held by swqe
- * @wqe - the send wqe
+ * @wqe: the send wqe
  *
  * This drops any mr references held by the swqe
  */
@@ -597,8 +597,8 @@ static inline void rvt_put_swqe(struct rvt_swqe *wqe)
 
 /**
  * rvt_qp_wqe_reserve - reserve operation
- * @qp - the rvt qp
- * @wqe - the send wqe
+ * @qp: the rvt qp
+ * @wqe: the send wqe
  *
  * This routine used in post send to record
  * a wqe relative reserved operation use.
@@ -612,8 +612,8 @@ static inline void rvt_qp_wqe_reserve(
 
 /**
  * rvt_qp_wqe_unreserve - clean reserved operation
- * @qp - the rvt qp
- * @flags - send wqe flags
+ * @qp: the rvt qp
+ * @flags: send wqe flags
  *
  * This decrements the reserve use count.
  *
@@ -653,8 +653,8 @@ u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
 
 /**
  * rvt_div_round_up_mtu - round up divide
- * @qp - the qp pair
- * @len - the length
+ * @qp: the qp pair
+ * @len: the length
  *
  * Perform a shift based mtu round up divide
  */
@@ -664,8 +664,9 @@ static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len)
 }
 
 /**
- * @qp - the qp pair
- * @len - the length
+ * rvt_div_mtu - shift-based divide
+ * @qp: the qp pair
+ * @len: the length
  *
  * Perform a shift based mtu divide
  */
@@ -676,7 +677,7 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len)
 
 /**
  * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies
- * @timeout - timeout input(0 - 31).
+ * @timeout: timeout input(0 - 31).
  *
  * Return a timeout value in jiffies.
  */
@@ -690,7 +691,8 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout)
 
 /**
  * rvt_lookup_qpn - return the QP with the given QPN
- * @ibp: the ibport
+ * @rdi: rvt device info structure
+ * @rvp: the ibport
  * @qpn: the QP number to look up
  *
  * The caller must hold the rcu_read_lock(), and keep the lock until
@@ -716,9 +718,9 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
 }
 
 /**
- * rvt_mod_retry_timer - mod a retry timer
- * @qp - the QP
- * @shift - timeout shift to wait for multiple packets
+ * rvt_mod_retry_timer_ext - mod a retry timer
+ * @qp: the QP
+ * @shift: timeout shift to wait for multiple packets
  * Modify a potentially already running retry timer
  */
 static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
@@ -753,7 +755,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
 }
 
 /**
- * rvt_qp_sqwe_incr - increment ring index
+ * rvt_qp_swqe_incr - increment ring index
  * @qp: the qp
  * @val: the starting value
  *
@@ -811,10 +813,10 @@ static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc,
 
 /**
  * rvt_qp_complete_swqe - insert send completion
- * @qp - the qp
- * @wqe - the send wqe
- * @opcode - wc operation (driver dependent)
- * @status - completion status
+ * @qp: the qp
+ * @wqe: the send wqe
+ * @opcode: wc operation (driver dependent)
+ * @status: completion status
  *
  * Update the s_last information, and then insert a send
  * completion into the completion
@@ -891,7 +893,7 @@ void rvt_ruc_loopback(struct rvt_qp *qp);
 
 /**
  * struct rvt_qp_iter - the iterator for QPs
- * @qp - the current QP
+ * @qp: the current QP
  *
  * This structure defines the current iterator
  * state for sequenced access to all QPs relative
@@ -913,7 +915,7 @@ struct rvt_qp_iter {
 
 /**
  * ib_cq_tail - Return tail index of cq buffer
- * @send_cq - The cq for send
+ * @send_cq: The cq for send
  *
  * This is called in qp_iter_print to get tail
  * of cq buffer.
@@ -929,7 +931,7 @@ static inline u32 ib_cq_tail(struct ib_cq *send_cq)
 
 /**
  * ib_cq_head - Return head index of cq buffer
- * @send_cq - The cq for send
+ * @send_cq: The cq for send
  *
  * This is called in qp_iter_print to get head
  * of cq buffer.
@@ -945,7 +947,7 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq)
 
 /**
  * rvt_free_rq - free memory allocated for rvt_rq struct
- * @rvt_rq: request queue data structure
+ * @rq: request queue data structure
  *
  * This function should only be called if the rvt_mmap_info()
  * has not succeeded.
-- 
cgit v1.2.3


From 5f20bc206beb902e32b77216cb7935b46ca00b0a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 23 Oct 2025 12:46:14 -0700
Subject: platform/x86: ISST: isst_if.h: fix all kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix all kernel-doc warnings in <uapi/linux/isst_if.h>:

- don't use "[]" in the variable name in kernel-doc
- add a few missing entries
- change "power_domain" to "power_domain_id" in kernel-doc to match
  the struct member name
- add a leading '@' on a few existing kernel-doc lines
- use '_' instead of '-' in struct member names

Examples (but not all 27 warnings):

Warning: include/uapi/linux/isst_if.h:63 struct member 'cpu_map'
 not described in 'isst_if_cpu_maps'
Warning: ../include/uapi/linux/isst_if.h:95 struct member 'req_count'
 not described in 'isst_if_io_regs'
Warning: include/uapi/linux/isst_if.h:132 struct member 'mbox_cmd'
 not described in 'isst_if_mbox_cmds'
Warning: ../include/uapi/linux/isst_if.h:183 struct member 'supported'
 not described in 'isst_core_power'
Warning: ../include/uapi/linux/isst_if.h:206 struct member
 'power_domain_id' not described in 'isst_clos_param'
Warning: ../include/uapi/linux/isst_if.h:239 struct member 'assoc_info'
 not described in 'isst_if_clos_assoc_cmds'
Warning: ../include/uapi/linux/isst_if.h:286 struct member 'sst_tf_support'
 not described in 'isst_perf_level_info'
Warning: ../include/uapi/linux/isst_if.h:375 struct member 'trl_freq_mhz'
 not described in 'isst_perf_level_data_info'
Warning: ../include/uapi/linux/isst_if.h:475 struct member 'max_buckets'
 not described in 'isst_turbo_freq_info'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251023194615.180824-1-rdunlap@infradead.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/uapi/linux/isst_if.h | 50 ++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index 8197a4800604..40aa545101a3 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -52,7 +52,7 @@ struct isst_if_cpu_map {
 /**
  * struct isst_if_cpu_maps - structure for CPU map IOCTL
  * @cmd_count:	Number of CPU mapping command in cpu_map[]
- * @cpu_map[]:	Holds one or more CPU map data structure
+ * @cpu_map:	Holds one or more CPU map data structure
  *
  * This structure used with ioctl ISST_IF_GET_PHY_ID to send
  * one or more CPU mapping commands. Here IOCTL return value indicates
@@ -82,8 +82,8 @@ struct isst_if_io_reg {
 
 /**
  * struct isst_if_io_regs - structure for IO register commands
- * @cmd_count:	Number of io reg commands in io_reg[]
- * @io_reg[]:	Holds one or more io_reg command structure
+ * @req_count:	Number of io reg commands in io_reg[]
+ * @io_reg:	Holds one or more io_reg command structure
  *
  * This structure used with ioctl ISST_IF_IO_CMD to send
  * one or more read/write commands to PUNIT. Here IOCTL return value
@@ -120,7 +120,7 @@ struct isst_if_mbox_cmd {
 /**
  * struct isst_if_mbox_cmds - structure for mailbox commands
  * @cmd_count:	Number of mailbox commands in mbox_cmd[]
- * @mbox_cmd[]:	Holds one or more mbox commands
+ * @mbox_cmd:	Holds one or more mbox commands
  *
  * This structure used with ioctl ISST_IF_MBOX_COMMAND to send
  * one or more mailbox commands to PUNIT. Here IOCTL return value
@@ -152,7 +152,7 @@ struct isst_if_msr_cmd {
 /**
  * struct isst_if_msr_cmds - structure for msr commands
  * @cmd_count:	Number of mailbox commands in msr_cmd[]
- * @msr_cmd[]:	Holds one or more msr commands
+ * @msr_cmd:	Holds one or more msr commands
  *
  * This structure used with ioctl ISST_IF_MSR_COMMAND to send
  * one or more MSR commands. IOCTL return value indicates number of
@@ -167,8 +167,9 @@ struct isst_if_msr_cmds {
  * struct isst_core_power - Structure to get/set core_power feature
  * @get_set:	0: Get, 1: Set
  * @socket_id:	Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
  * @enable:	Feature enable status
+ * @supported:	Power domain supports SST_CP interface
  * @priority_type: Priority type for the feature (ordered/proportional)
  *
  * Structure to get/set core_power feature state using IOCTL
@@ -187,11 +188,11 @@ struct isst_core_power {
  * struct isst_clos_param - Structure to get/set clos praram
  * @get_set:	0: Get, 1: Set
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
- * clos:	Clos ID for the parameters
- * min_freq_mhz: Minimum frequency in MHz
- * max_freq_mhz: Maximum frequency in MHz
- * prop_prio:	Proportional priority from 0-15
+ * @power_domain_id:	Power Domain id
+ * @clos:	Clos ID for the parameters
+ * @min_freq_mhz: Minimum frequency in MHz
+ * @max_freq_mhz: Maximum frequency in MHz
+ * @prop_prio:	Proportional priority from 0-15
  *
  * Structure to get/set per clos property using IOCTL
  * ISST_IF_CLOS_PARAM.
@@ -209,7 +210,7 @@ struct isst_clos_param {
 /**
  * struct isst_if_clos_assoc - Structure to assign clos to a CPU
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @logical_cpu: CPU number
  * @clos:	Clos ID to assign to the logical CPU
  *
@@ -228,6 +229,7 @@ struct isst_if_clos_assoc {
  * @get_set:	Request is for get or set
  * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
  *		   Linux CPU number
+ * @assoc_info: CLOS data for this CPU
  *
  * Structure used to get/set associate CPUs to clos using IOCTL
  * ISST_IF_CLOS_ASSOC.
@@ -257,7 +259,7 @@ struct isst_tpmi_instance_count {
 /**
  * struct isst_perf_level_info - Structure to get information on SST-PP levels
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @logical_cpu: CPU number
  * @clos:	Clos ID to assign to the logical CPU
  * @max_level: Maximum performance level supported by the platform
@@ -267,8 +269,8 @@ struct isst_tpmi_instance_count {
  * @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level
  * @locked: SST-PP performance level change is locked/unlocked
  * @enabled: SST-PP feature is enabled or not
- * @sst-tf_support: SST-TF support status at this level
- * @sst-bf_support: SST-BF support status at this level
+ * @sst_tf_support: SST-TF support status at this level
+ * @sst_bf_support: SST-BF support status at this level
  *
  * Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS.
  */
@@ -289,7 +291,7 @@ struct isst_perf_level_info {
 /**
  * struct isst_perf_level_control - Structure to set SST-PP level
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @level:	level to set
  *
  * Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL.
@@ -303,7 +305,7 @@ struct isst_perf_level_control {
 /**
  * struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @feature:	bit 0 = SST-BF state, bit 1 = SST-TF state
  *
  * Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE.
@@ -320,7 +322,7 @@ struct isst_perf_feature_control {
 /**
  * struct isst_perf_level_data_info - Structure to get SST-PP level details
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @level:	SST-PP level for which caller wants to get information
  * @tdp_ratio: TDP Ratio
  * @base_freq_mhz: Base frequency in MHz
@@ -341,8 +343,8 @@ struct isst_perf_feature_control {
  * @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency
  * @max_buckets: Maximum trl buckets
  * @max_trl_levels: Maximum trl levels
- * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket
- * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency
+ * @bucket_core_counts: Number of cores per bucket
+ * @trl_freq_mhz: maximum frequency
  * for a bucket and trl level
  *
  * Structure used to get information on frequencies and TDP for a SST-PP
@@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info {
 /**
  * struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @level:	SST-PP level for which caller wants to get information
  * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
  *		   Linux CPU number. If 0 CPU buffer is copied to user space
@@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask {
 /**
  * struct isst_base_freq_info - Structure to get SST-BF frequencies
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @level:	SST-PP level for which caller wants to get information
  * @high_base_freq_mhz: High priority CPU base frequency
  * @low_base_freq_mhz: Low priority CPU base frequency
@@ -453,9 +455,11 @@ struct isst_base_freq_info {
 /**
  * struct isst_turbo_freq_info - Structure to get SST-TF frequencies
  * @socket_id:	Socket/package id
- * @power_domain:	Power Domain id
+ * @power_domain_id:	Power Domain id
  * @level:	SST-PP level for which caller wants to get information
  * @max_clip_freqs: Maximum number of low priority core clipping frequencies
+ * @max_buckets: Maximum trl buckets
+ * @max_trl_levels: Maximum trl levels
  * @lp_clip_freq_mhz: Clip frequencies per trl level
  * @bucket_core_counts: Maximum number of cores for a bucket
  * @trl_freq_mhz: Frequencies per trl level for each bucket
-- 
cgit v1.2.3


From 6b47af35a6dded074ff583361f6d6668dd7a401d Mon Sep 17 00:00:00 2001
From: Raju Rangoju <Raju.Rangoju@amd.com>
Date: Fri, 31 Oct 2025 16:48:11 +0530
Subject: net: selftests: export packet creation helpers for driver use

Export the network selftest packet creation infrastructure to allow
network drivers to reuse the existing selftest framework instead of
duplicating packet creation code.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251031111811.775434-1-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/selftests.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/selftests.c    | 48 +++++++-----------------------------------------
 2 files changed, 52 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/selftests.h b/include/net/selftests.h
index e65e8d230d33..c36e07406ad4 100644
--- a/include/net/selftests.h
+++ b/include/net/selftests.h
@@ -3,9 +3,48 @@
 #define _NET_SELFTESTS
 
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+struct net_packet_attrs {
+	const unsigned char *src;
+	const unsigned char *dst;
+	u32 ip_src;
+	u32 ip_dst;
+	bool tcp;
+	u16 sport;
+	u16 dport;
+	int timeout;
+	int size;
+	int max_size;
+	u8 id;
+	u16 queue_mapping;
+	bool bad_csum;
+};
+
+struct net_test_priv {
+	struct net_packet_attrs *packet;
+	struct packet_type pt;
+	struct completion comp;
+	int double_vlan;
+	int vlan_id;
+	int ok;
+};
+
+struct netsfhdr {
+	__be32 version;
+	__be64 magic;
+	u8 id;
+} __packed;
+
+#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
+			   sizeof(struct netsfhdr))
+#define NET_TEST_PKT_MAGIC	0xdeadcafecafedeadULL
+#define NET_LB_TIMEOUT		msecs_to_jiffies(200)
 
 #if IS_ENABLED(CONFIG_NET_SELFTESTS)
 
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+				 struct net_packet_attrs *attr);
 void net_selftest(struct net_device *ndev, struct ethtool_test *etest,
 		  u64 *buf);
 int net_selftest_get_count(void);
@@ -13,6 +52,12 @@ void net_selftest_get_strings(u8 *data);
 
 #else
 
+static inline struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+					       struct net_packet_attrs *attr)
+{
+	return NULL;
+}
+
 static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest,
 				u64 *buf)
 {
diff --git a/net/core/selftests.c b/net/core/selftests.c
index 3d79133a91a6..8b81feb82c4a 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -14,46 +14,10 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 
-struct net_packet_attrs {
-	const unsigned char *src;
-	const unsigned char *dst;
-	u32 ip_src;
-	u32 ip_dst;
-	bool tcp;
-	u16 sport;
-	u16 dport;
-	int timeout;
-	int size;
-	int max_size;
-	u8 id;
-	u16 queue_mapping;
-	bool bad_csum;
-};
-
-struct net_test_priv {
-	struct net_packet_attrs *packet;
-	struct packet_type pt;
-	struct completion comp;
-	int double_vlan;
-	int vlan_id;
-	int ok;
-};
-
-struct netsfhdr {
-	__be32 version;
-	__be64 magic;
-	u8 id;
-} __packed;
-
 static u8 net_test_next_id;
 
-#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
-			   sizeof(struct netsfhdr))
-#define NET_TEST_PKT_MAGIC	0xdeadcafecafedeadULL
-#define NET_LB_TIMEOUT		msecs_to_jiffies(200)
-
-static struct sk_buff *net_test_get_skb(struct net_device *ndev,
-					struct net_packet_attrs *attr)
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+				 struct net_packet_attrs *attr)
 {
 	struct sk_buff *skb = NULL;
 	struct udphdr *uhdr = NULL;
@@ -142,8 +106,8 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
 	shdr = skb_put(skb, sizeof(*shdr));
 	shdr->version = 0;
 	shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
-	attr->id = net_test_next_id;
-	shdr->id = net_test_next_id++;
+	attr->id = id;
+	shdr->id = id;
 
 	if (attr->size) {
 		void *payload = skb_put(skb, attr->size);
@@ -190,6 +154,7 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
 
 	return skb;
 }
+EXPORT_SYMBOL_GPL(net_test_get_skb);
 
 static int net_test_loopback_validate(struct sk_buff *skb,
 				      struct net_device *ndev,
@@ -286,12 +251,13 @@ static int __net_test_loopback(struct net_device *ndev,
 	tpriv->packet = attr;
 	dev_add_pack(&tpriv->pt);
 
-	skb = net_test_get_skb(ndev, attr);
+	skb = net_test_get_skb(ndev, net_test_next_id, attr);
 	if (!skb) {
 		ret = -ENOMEM;
 		goto cleanup;
 	}
 
+	net_test_next_id++;
 	ret = dev_direct_xmit(skb, attr->queue_mapping);
 	if (ret < 0) {
 		goto cleanup;
-- 
cgit v1.2.3


From 1b0f3f9ee41ee2bdd206667f85ea2aa36dfe6e69 Mon Sep 17 00:00:00 2001
From: Shuming Fan <shumingf@realtek.com>
Date: Thu, 6 Nov 2025 17:33:35 +0800
Subject: ASoC: SDCA: support Q7.8 volume format

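The SDCA specification uses the Q7.8 fixed-point format for volumes.
Add an sdca_q78 flag to struct soc_mixer_control that marks a control as
an SDCA volume control, and make the volsw get/put helpers convert
between Q7.8 register values and control values when the flag is set.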

Signed-off-by: Shuming Fan <shumingf@realtek.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251106093335.1363237-1-shumingf@realtek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h        |  1 +
 sound/soc/sdca/sdca_asoc.c | 34 +++++++------------------
 sound/soc/soc-ops.c        | 62 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 61 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 1aebf14fcf80..53b4129ee97a 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1225,6 +1225,7 @@ struct soc_mixer_control {
 	unsigned int sign_bit;
 	unsigned int invert:1;
 	unsigned int autodisable:1;
+	unsigned int sdca_q78:1;
 #ifdef CONFIG_SND_SOC_TOPOLOGY
 	struct snd_soc_dobj dobj;
 #endif
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index c493ec530cc5..892b7c028fae 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -795,7 +795,6 @@ static int control_limit_kctl(struct device *dev,
 	struct sdca_control_range *range;
 	int min, max, step;
 	unsigned int *tlv;
-	int shift;
 
 	if (control->type != SDCA_CTL_DATATYPE_Q7P8DB)
 		return 0;
@@ -814,37 +813,22 @@ static int control_limit_kctl(struct device *dev,
 	min = sign_extend32(min, control->nbits - 1);
 	max = sign_extend32(max, control->nbits - 1);
 
-	/*
-	 * FIXME: Only support power of 2 step sizes as this can be supported
-	 * by a simple shift.
-	 */
-	if (hweight32(step) != 1) {
-		dev_err(dev, "%s: %s: currently unsupported step size\n",
-			entity->label, control->label);
-		return -EINVAL;
-	}
-
-	/*
-	 * The SDCA volumes are in steps of 1/256th of a dB, a step down of
-	 * 64 (shift of 6) gives 1/4dB. 1/4dB is the smallest unit that is also
-	 * representable in the ALSA TLVs which are in 1/100ths of a dB.
-	 */
-	shift = max(ffs(step) - 1, 6);
-
 	tlv = devm_kcalloc(dev, 4, sizeof(*tlv), GFP_KERNEL);
 	if (!tlv)
 		return -ENOMEM;
 
-	tlv[0] = SNDRV_CTL_TLVT_DB_SCALE;
+	tlv[0] = SNDRV_CTL_TLVT_DB_MINMAX;
 	tlv[1] = 2 * sizeof(*tlv);
 	tlv[2] = (min * 100) >> 8;
-	tlv[3] = ((1 << shift) * 100) >> 8;
+	tlv[3] = (max * 100) >> 8;
+
+	step = (step * 100) >> 8;
 
-	mc->min = min >> shift;
-	mc->max = max >> shift;
-	mc->shift = shift;
-	mc->rshift = shift;
-	mc->sign_bit = 15 - shift;
+	mc->min = ((int)tlv[2] / step);
+	mc->max = ((int)tlv[3] / step);
+	mc->shift = step;
+	mc->sign_bit = 15;
+	mc->sdca_q78 = 1;
 
 	kctl->tlv.p = tlv;
 	kctl->access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ;
diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index d2b6fb8e0b6c..ce86978c158d 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -110,6 +110,36 @@ int snd_soc_put_enum_double(struct snd_kcontrol *kcontrol,
 }
 EXPORT_SYMBOL_GPL(snd_soc_put_enum_double);
 
+static int sdca_soc_q78_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_val,
+				unsigned int mask, unsigned int shift, int max)
+{
+	int val = reg_val;
+
+	if (WARN_ON(!mc->shift))
+		return -EINVAL;
+
+	val = sign_extend32(val, mc->sign_bit);
+	val = (((val * 100) >> 8) / (int)mc->shift);
+	val -= mc->min;
+
+	return val & mask;
+}
+
+static unsigned int sdca_soc_q78_ctl_to_reg(struct soc_mixer_control *mc, int val,
+					 unsigned int mask, unsigned int shift, int max)
+{
+	unsigned int ret_val;
+	int reg_val;
+
+	if (WARN_ON(!mc->shift))
+		return -EINVAL;
+
+	reg_val = val + mc->min;
+	ret_val = (int)((reg_val * mc->shift) << 8) / 100;
+
+	return ret_val & mask;
+}
+
 static int soc_mixer_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_val,
 				unsigned int mask, unsigned int shift, int max)
 {
@@ -197,19 +227,27 @@ static int soc_put_volsw(struct snd_kcontrol *kcontrol,
 			 struct snd_ctl_elem_value *ucontrol,
 			 struct soc_mixer_control *mc, int mask, int max)
 {
+	unsigned int (*ctl_to_reg)(struct soc_mixer_control *, int, unsigned int, unsigned int, int);
 	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
 	unsigned int val1, val_mask;
 	unsigned int val2 = 0;
 	bool double_r = false;
 	int ret;
 
+	if (mc->sdca_q78) {
+		ctl_to_reg = sdca_soc_q78_ctl_to_reg;
+		val_mask = mask;
+	} else {
+		ctl_to_reg = soc_mixer_ctl_to_reg;
+		val_mask = mask << mc->shift;
+	}
+
 	ret = soc_mixer_valid_ctl(mc, ucontrol->value.integer.value[0], max);
 	if (ret)
 		return ret;
 
-	val1 = soc_mixer_ctl_to_reg(mc, ucontrol->value.integer.value[0],
+	val1 = ctl_to_reg(mc, ucontrol->value.integer.value[0],
 				    mask, mc->shift, max);
-	val_mask = mask << mc->shift;
 
 	if (snd_soc_volsw_is_stereo(mc)) {
 		ret = soc_mixer_valid_ctl(mc, ucontrol->value.integer.value[1], max);
@@ -217,14 +255,10 @@ static int soc_put_volsw(struct snd_kcontrol *kcontrol,
 			return ret;
 
 		if (mc->reg == mc->rreg) {
-			val1 |= soc_mixer_ctl_to_reg(mc,
-						     ucontrol->value.integer.value[1],
-						     mask, mc->rshift, max);
+			val1 |= ctl_to_reg(mc, ucontrol->value.integer.value[1], mask, mc->rshift, max);
 			val_mask |= mask << mc->rshift;
 		} else {
-			val2 = soc_mixer_ctl_to_reg(mc,
-						    ucontrol->value.integer.value[1],
-						    mask, mc->shift, max);
+			val2 = ctl_to_reg(mc, ucontrol->value.integer.value[1], mask, mc->shift, max);
 			double_r = true;
 		}
 	}
@@ -248,21 +282,27 @@ static int soc_get_volsw(struct snd_kcontrol *kcontrol,
 			 struct snd_ctl_elem_value *ucontrol,
 			 struct soc_mixer_control *mc, int mask, int max)
 {
+	int (*reg_to_ctl)(struct soc_mixer_control *, unsigned int, unsigned int, unsigned int, int);
 	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
 	unsigned int reg_val;
 	int val;
 
+	if (mc->sdca_q78)
+		reg_to_ctl = sdca_soc_q78_reg_to_ctl;
+	else
+		reg_to_ctl = soc_mixer_reg_to_ctl;
+
 	reg_val = snd_soc_component_read(component, mc->reg);
-	val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->shift, max);
+	val = reg_to_ctl(mc, reg_val, mask, mc->shift, max);
 
 	ucontrol->value.integer.value[0] = val;
 
 	if (snd_soc_volsw_is_stereo(mc)) {
 		if (mc->reg == mc->rreg) {
-			val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->rshift, max);
+			val = reg_to_ctl(mc, reg_val, mask, mc->rshift, max);
 		} else {
 			reg_val = snd_soc_component_read(component, mc->rreg);
-			val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->shift, max);
+			val = reg_to_ctl(mc, reg_val, mask, mc->shift, max);
 		}
 
 		ucontrol->value.integer.value[1] = val;
-- 
cgit v1.2.3


From b340412a3b22b60b5e19cce8726940c7b5b14439 Mon Sep 17 00:00:00 2001
From: James Calligeros <jcalligeros99@gmail.com>
Date: Sat, 25 Oct 2025 10:24:36 +1000
Subject: mfd: macsmc: Add new __SMC_KEY macro

When using the _SMC_KEY macro in switch/case statements, GCC 15.2.1 errors
out with 'case label does not reduce to an integer constant'. Introduce
a new __SMC_KEY macro that can be used instead.

Signed-off-by: James Calligeros <jcalligeros99@gmail.com>
Link: https://patch.msgid.link/20251025-macsmc-subdevs-v4-5-374d5c9eba0e@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/macsmc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h
index 6b13f01a8592..f6f80c33b5cf 100644
--- a/include/linux/mfd/macsmc.h
+++ b/include/linux/mfd/macsmc.h
@@ -41,6 +41,7 @@ typedef u32 smc_key;
  */
 #define SMC_KEY(s) (smc_key)(_SMC_KEY(#s))
 #define _SMC_KEY(s) (((s)[0] << 24) | ((s)[1] << 16) | ((s)[2] << 8) | (s)[3])
+#define __SMC_KEY(a, b, c, d) (((u32)(a) << 24) | ((u32)(b) << 16) | ((u32)(c) << 8) | ((u32)(d)))
 
 #define APPLE_SMC_READABLE BIT(7)
 #define APPLE_SMC_WRITABLE BIT(6)
-- 
cgit v1.2.3


From d306cbbc34cc9aa6ed2235472110fe797f887db7 Mon Sep 17 00:00:00 2001
From: Atharva Tiwari <atharvatiwarilinuxdev@gmail.com>
Date: Tue, 7 Oct 2025 18:35:10 +0530
Subject: mfd: macsmc: Make SMC write buffers const

Mark the write buffer arguments in apple_smc_write(), apple_smc_rw(),
and apple_smc_write_atomic() as const. These functions do not modify
the data provided by the caller, so the parameters should be const
qualified.

Signed-off-by: Atharva Tiwari <atharvatiwarilinuxdev@gmail.com>
Reviewed-by: Sven Peter <sven@kernel.org>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/macsmc.c       | 6 +++---
 include/linux/mfd/macsmc.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c
index e6cdae221f1d..e3893e255ce5 100644
--- a/drivers/mfd/macsmc.c
+++ b/drivers/mfd/macsmc.c
@@ -173,7 +173,7 @@ int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size)
 }
 EXPORT_SYMBOL(apple_smc_read);
 
-int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size)
+int apple_smc_write(struct apple_smc *smc, smc_key key, const void *buf, size_t size)
 {
 	guard(mutex)(&smc->mutex);
 
@@ -181,7 +181,7 @@ int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size)
 }
 EXPORT_SYMBOL(apple_smc_write);
 
-int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize,
+int apple_smc_rw(struct apple_smc *smc, smc_key key, const void *wbuf, size_t wsize,
 		 void *rbuf, size_t rsize)
 {
 	guard(mutex)(&smc->mutex);
@@ -239,7 +239,7 @@ int apple_smc_enter_atomic(struct apple_smc *smc)
 }
 EXPORT_SYMBOL(apple_smc_enter_atomic);
 
-int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size)
+int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, const void *buf, size_t size)
 {
 	guard(spinlock_irqsave)(&smc->lock);
 	u8 result;
diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h
index f6f80c33b5cf..cc09ecce0df7 100644
--- a/include/linux/mfd/macsmc.h
+++ b/include/linux/mfd/macsmc.h
@@ -150,7 +150,7 @@ int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size);
  *
  * Return: Zero on success, negative errno on error
  */
-int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size);
+int apple_smc_write(struct apple_smc *smc, smc_key key, const void *buf, size_t size);
 
 /**
  * apple_smc_enter_atomic - Enter atomic mode to be able to use apple_smc_write_atomic
@@ -177,7 +177,7 @@ int apple_smc_enter_atomic(struct apple_smc *smc);
  *
  * Return: Zero on success, negative errno on error
  */
-int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size);
+int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, const void *buf, size_t size);
 
 /**
  * apple_smc_rw - Write and then read using the given SMC key
@@ -190,7 +190,7 @@ int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t
  *
  * Return: Zero on success, negative errno on error
  */
-int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize,
+int apple_smc_rw(struct apple_smc *smc, smc_key key, const void *wbuf, size_t wsize,
 		 void *rbuf, size_t rsize);
 
 /**
-- 
cgit v1.2.3


From 617347e716178d3a317a129ece05116967f06d53 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 25 Jun 2025 14:32:58 +0100
Subject: mfd: wl1273-core: Remove the header

The wl1273 FM radio is on Arnd's unused driver list:

  https://lore.kernel.org/lkml/a15bb180-401d-49ad-a212-0c81d613fbc8@app.fastmail.com/

Other patches have removed the core, the ASoC code and the Radio code.
With all those in, remove the header.

Also remove the now-stale radio-wl1273 entry from the radio card list
in the admin guide.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 Documentation/admin-guide/media/radio-cardlist.rst |   1 -
 include/linux/mfd/wl1273-core.h                    | 277 ---------------------
 2 files changed, 278 deletions(-)
 delete mode 100644 include/linux/mfd/wl1273-core.h

(limited to 'include')

diff --git a/Documentation/admin-guide/media/radio-cardlist.rst b/Documentation/admin-guide/media/radio-cardlist.rst
index a82a146bf912..cec724256812 100644
--- a/Documentation/admin-guide/media/radio-cardlist.rst
+++ b/Documentation/admin-guide/media/radio-cardlist.rst
@@ -30,7 +30,6 @@ radio-terratec         TerraTec ActiveRadio ISA Standalone
 radio-timb             Enable the Timberdale radio driver
 radio-trust            Trust FM radio card
 radio-typhoon          Typhoon Radio (a.k.a. EcoRadio)
-radio-wl1273           Texas Instruments WL1273 I2C FM Radio
 fm_drv                 ISA radio devices
 fm_drv                 ISA radio devices
 radio-zoltrix          Zoltrix Radio
diff --git a/include/linux/mfd/wl1273-core.h b/include/linux/mfd/wl1273-core.h
deleted file mode 100644
index c28cf76d5c31..000000000000
--- a/include/linux/mfd/wl1273-core.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * include/linux/mfd/wl1273-core.h
- *
- * Some definitions for the wl1273 radio receiver/transmitter chip.
- *
- * Copyright (C) 2010 Nokia Corporation
- * Author: Matti J. Aaltonen <matti.j.aaltonen@nokia.com>
- */
-
-#ifndef WL1273_CORE_H
-#define WL1273_CORE_H
-
-#include <linux/i2c.h>
-#include <linux/mfd/core.h>
-
-#define WL1273_FM_DRIVER_NAME	"wl1273-fm"
-#define RX71_FM_I2C_ADDR	0x22
-
-#define WL1273_STEREO_GET		0
-#define WL1273_RSSI_LVL_GET		1
-#define WL1273_IF_COUNT_GET		2
-#define WL1273_FLAG_GET			3
-#define WL1273_RDS_SYNC_GET		4
-#define WL1273_RDS_DATA_GET		5
-#define WL1273_FREQ_SET			10
-#define WL1273_AF_FREQ_SET		11
-#define WL1273_MOST_MODE_SET		12
-#define WL1273_MOST_BLEND_SET		13
-#define WL1273_DEMPH_MODE_SET		14
-#define WL1273_SEARCH_LVL_SET		15
-#define WL1273_BAND_SET			16
-#define WL1273_MUTE_STATUS_SET		17
-#define WL1273_RDS_PAUSE_LVL_SET	18
-#define WL1273_RDS_PAUSE_DUR_SET	19
-#define WL1273_RDS_MEM_SET		20
-#define WL1273_RDS_BLK_B_SET		21
-#define WL1273_RDS_MSK_B_SET		22
-#define WL1273_RDS_PI_MASK_SET		23
-#define WL1273_RDS_PI_SET		24
-#define WL1273_RDS_SYSTEM_SET		25
-#define WL1273_INT_MASK_SET		26
-#define WL1273_SEARCH_DIR_SET		27
-#define WL1273_VOLUME_SET		28
-#define WL1273_AUDIO_ENABLE		29
-#define WL1273_PCM_MODE_SET		30
-#define WL1273_I2S_MODE_CONFIG_SET	31
-#define WL1273_POWER_SET		32
-#define WL1273_INTX_CONFIG_SET		33
-#define WL1273_PULL_EN_SET		34
-#define WL1273_HILO_SET			35
-#define WL1273_SWITCH2FREF		36
-#define WL1273_FREQ_DRIFT_REPORT	37
-
-#define WL1273_PCE_GET			40
-#define WL1273_FIRM_VER_GET		41
-#define WL1273_ASIC_VER_GET		42
-#define WL1273_ASIC_ID_GET		43
-#define WL1273_MAN_ID_GET		44
-#define WL1273_TUNER_MODE_SET		45
-#define WL1273_STOP_SEARCH		46
-#define WL1273_RDS_CNTRL_SET		47
-
-#define WL1273_WRITE_HARDWARE_REG	100
-#define WL1273_CODE_DOWNLOAD		101
-#define WL1273_RESET			102
-
-#define WL1273_FM_POWER_MODE		254
-#define WL1273_FM_INTERRUPT		255
-
-/* Transmitter API */
-
-#define WL1273_CHANL_SET			55
-#define WL1273_SCAN_SPACING_SET			56
-#define WL1273_REF_SET				57
-#define WL1273_POWER_ENB_SET			90
-#define WL1273_POWER_ATT_SET			58
-#define WL1273_POWER_LEV_SET			59
-#define WL1273_AUDIO_DEV_SET			60
-#define WL1273_PILOT_DEV_SET			61
-#define WL1273_RDS_DEV_SET			62
-#define WL1273_PUPD_SET				91
-#define WL1273_AUDIO_IO_SET			63
-#define WL1273_PREMPH_SET			64
-#define WL1273_MONO_SET				66
-#define WL1273_MUTE				92
-#define WL1273_MPX_LMT_ENABLE			67
-#define WL1273_PI_SET				93
-#define WL1273_ECC_SET				69
-#define WL1273_PTY				70
-#define WL1273_AF				71
-#define WL1273_DISPLAY_MODE			74
-#define WL1273_RDS_REP_SET			77
-#define WL1273_RDS_CONFIG_DATA_SET		98
-#define WL1273_RDS_DATA_SET			99
-#define WL1273_RDS_DATA_ENB			94
-#define WL1273_TA_SET				78
-#define WL1273_TP_SET				79
-#define WL1273_DI_SET				80
-#define WL1273_MS_SET				81
-#define WL1273_PS_SCROLL_SPEED			82
-#define WL1273_TX_AUDIO_LEVEL_TEST		96
-#define WL1273_TX_AUDIO_LEVEL_TEST_THRESHOLD	73
-#define WL1273_TX_AUDIO_INPUT_LEVEL_RANGE_SET	54
-#define WL1273_RX_ANTENNA_SELECT		87
-#define WL1273_I2C_DEV_ADDR_SET			86
-#define WL1273_REF_ERR_CALIB_PARAM_SET		88
-#define WL1273_REF_ERR_CALIB_PERIODICITY_SET	89
-#define WL1273_SOC_INT_TRIGGER			52
-#define WL1273_SOC_AUDIO_PATH_SET		83
-#define WL1273_SOC_PCMI_OVERRIDE		84
-#define WL1273_SOC_I2S_OVERRIDE			85
-#define WL1273_RSSI_BLOCK_SCAN_FREQ_SET		95
-#define WL1273_RSSI_BLOCK_SCAN_START		97
-#define WL1273_RSSI_BLOCK_SCAN_DATA_GET		5
-#define WL1273_READ_FMANT_TUNE_VALUE		104
-
-#define WL1273_RDS_OFF		0
-#define WL1273_RDS_ON		1
-#define WL1273_RDS_RESET	2
-
-#define WL1273_AUDIO_DIGITAL	0
-#define WL1273_AUDIO_ANALOG	1
-
-#define WL1273_MODE_RX		BIT(0)
-#define WL1273_MODE_TX		BIT(1)
-#define WL1273_MODE_OFF		BIT(2)
-#define WL1273_MODE_SUSPENDED	BIT(3)
-
-#define WL1273_RADIO_CHILD	BIT(0)
-#define WL1273_CODEC_CHILD	BIT(1)
-
-#define WL1273_RX_MONO		1
-#define WL1273_RX_STEREO	0
-#define WL1273_TX_MONO		0
-#define WL1273_TX_STEREO	1
-
-#define WL1273_MAX_VOLUME	0xffff
-#define WL1273_DEFAULT_VOLUME	0x78b8
-
-/* I2S protocol, left channel first, data width 16 bits */
-#define WL1273_PCM_DEF_MODE		0x00
-
-/* Rx */
-#define WL1273_AUDIO_ENABLE_I2S		BIT(0)
-#define WL1273_AUDIO_ENABLE_ANALOG	BIT(1)
-
-/* Tx */
-#define WL1273_AUDIO_IO_SET_ANALOG	0
-#define WL1273_AUDIO_IO_SET_I2S		1
-
-#define WL1273_PUPD_SET_OFF		0x00
-#define WL1273_PUPD_SET_ON		0x01
-#define WL1273_PUPD_SET_RETENTION	0x10
-
-/* I2S mode */
-#define WL1273_IS2_WIDTH_32	0x0
-#define WL1273_IS2_WIDTH_40	0x1
-#define WL1273_IS2_WIDTH_22_23	0x2
-#define WL1273_IS2_WIDTH_23_22	0x3
-#define WL1273_IS2_WIDTH_48	0x4
-#define WL1273_IS2_WIDTH_50	0x5
-#define WL1273_IS2_WIDTH_60	0x6
-#define WL1273_IS2_WIDTH_64	0x7
-#define WL1273_IS2_WIDTH_80	0x8
-#define WL1273_IS2_WIDTH_96	0x9
-#define WL1273_IS2_WIDTH_128	0xa
-#define WL1273_IS2_WIDTH	0xf
-
-#define WL1273_IS2_FORMAT_STD	(0x0 << 4)
-#define WL1273_IS2_FORMAT_LEFT	(0x1 << 4)
-#define WL1273_IS2_FORMAT_RIGHT	(0x2 << 4)
-#define WL1273_IS2_FORMAT_USER	(0x3 << 4)
-
-#define WL1273_IS2_MASTER	(0x0 << 6)
-#define WL1273_IS2_SLAVEW	(0x1 << 6)
-
-#define WL1273_IS2_TRI_AFTER_SENDING	(0x0 << 7)
-#define WL1273_IS2_TRI_ALWAYS_ACTIVE	(0x1 << 7)
-
-#define WL1273_IS2_SDOWS_RR	(0x0 << 8)
-#define WL1273_IS2_SDOWS_RF	(0x1 << 8)
-#define WL1273_IS2_SDOWS_FR	(0x2 << 8)
-#define WL1273_IS2_SDOWS_FF	(0x3 << 8)
-
-#define WL1273_IS2_TRI_OPT	(0x0 << 10)
-#define WL1273_IS2_TRI_ALWAYS	(0x1 << 10)
-
-#define WL1273_IS2_RATE_48K	(0x0 << 12)
-#define WL1273_IS2_RATE_44_1K	(0x1 << 12)
-#define WL1273_IS2_RATE_32K	(0x2 << 12)
-#define WL1273_IS2_RATE_22_05K	(0x4 << 12)
-#define WL1273_IS2_RATE_16K	(0x5 << 12)
-#define WL1273_IS2_RATE_12K	(0x8 << 12)
-#define WL1273_IS2_RATE_11_025	(0x9 << 12)
-#define WL1273_IS2_RATE_8K	(0xa << 12)
-#define WL1273_IS2_RATE		(0xf << 12)
-
-#define WL1273_I2S_DEF_MODE	(WL1273_IS2_WIDTH_32 | \
-				 WL1273_IS2_FORMAT_STD | \
-				 WL1273_IS2_MASTER | \
-				 WL1273_IS2_TRI_AFTER_SENDING | \
-				 WL1273_IS2_SDOWS_RR | \
-				 WL1273_IS2_TRI_OPT | \
-				 WL1273_IS2_RATE_48K)
-
-#define SCHAR_MIN (-128)
-#define SCHAR_MAX 127
-
-#define WL1273_FR_EVENT			BIT(0)
-#define WL1273_BL_EVENT			BIT(1)
-#define WL1273_RDS_EVENT		BIT(2)
-#define WL1273_BBLK_EVENT		BIT(3)
-#define WL1273_LSYNC_EVENT		BIT(4)
-#define WL1273_LEV_EVENT		BIT(5)
-#define WL1273_IFFR_EVENT		BIT(6)
-#define WL1273_PI_EVENT			BIT(7)
-#define WL1273_PD_EVENT			BIT(8)
-#define WL1273_STIC_EVENT		BIT(9)
-#define WL1273_MAL_EVENT		BIT(10)
-#define WL1273_POW_ENB_EVENT		BIT(11)
-#define WL1273_SCAN_OVER_EVENT		BIT(12)
-#define WL1273_ERROR_EVENT		BIT(13)
-
-#define TUNER_MODE_STOP_SEARCH		0
-#define TUNER_MODE_PRESET		1
-#define TUNER_MODE_AUTO_SEEK		2
-#define TUNER_MODE_AF			3
-#define TUNER_MODE_AUTO_SEEK_PI		4
-#define TUNER_MODE_AUTO_SEEK_BULK	5
-
-#define RDS_BLOCK_SIZE	3
-
-struct wl1273_fm_platform_data {
-	int (*request_resources) (struct i2c_client *client);
-	void (*free_resources) (void);
-	void (*enable) (void);
-	void (*disable) (void);
-
-	u8 forbidden_modes;
-	unsigned int children;
-};
-
-#define WL1273_FM_CORE_CELLS	2
-
-#define WL1273_BAND_OTHER	0
-#define WL1273_BAND_JAPAN	1
-
-#define WL1273_BAND_JAPAN_LOW	76000
-#define WL1273_BAND_JAPAN_HIGH	90000
-#define WL1273_BAND_OTHER_LOW	87500
-#define WL1273_BAND_OTHER_HIGH	108000
-
-#define WL1273_BAND_TX_LOW	76000
-#define WL1273_BAND_TX_HIGH	108000
-
-struct wl1273_core {
-	struct mfd_cell cells[WL1273_FM_CORE_CELLS];
-	struct wl1273_fm_platform_data *pdata;
-
-	unsigned int mode;
-	unsigned int i2s_mode;
-	unsigned int volume;
-	unsigned int audio_mode;
-	unsigned int channel_number;
-	struct mutex lock; /* for serializing fm radio operations */
-
-	struct i2c_client *client;
-
-	int (*read)(struct wl1273_core *core, u8, u16 *);
-	int (*write)(struct wl1273_core *core, u8, u16);
-	int (*write_data)(struct wl1273_core *core, u8 *, u16);
-	int (*set_audio)(struct wl1273_core *core, unsigned int);
-	int (*set_volume)(struct wl1273_core *core, unsigned int);
-};
-
-#endif	/* ifndef WL1273_CORE_H */
-- 
cgit v1.2.3


From dd064d5101ea473d39c39ffaa8beeb8f47bbeb09 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Mon, 13 Oct 2025 09:51:18 +0800
Subject: ext4: introduce seq counter for the extent status entry

In iomap_write_iter(), the iomap buffered write framework does not hold
any locks between querying the inode's extent mapping and performing the
page cache writes. As a result, the extent mapping can change due to
concurrent I/O in flight. The write-back path in iomap_writepage_map()
faces the same problem: concurrent changes can invalidate the extent
mapping before the I/O is submitted.

Both paths therefore have to recheck the mapping info after acquiring
the folio lock. To make that possible, introduce an extent sequence
number that serves as a validity cookie for the extent, similar to what
XFS does. Since commit 24b7a2331fcd ("ext4: clairfy the rules for
modifying extents"), extent information is always processed through the
extent status tree, and the extent status tree is always up to date
under i_rwsem, invalidate_lock or the folio lock, so it is safe to
introduce this sequence number. The sequence number is incremented
whenever the extent status tree changes, in preparation for the
buffered write iomap conversion.

This mechanism also applies to moving extents: move_extent_per_page()
likewise needs to reacquire data_sem and recheck the mapping info under
the folio lock.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-3-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h              |  2 ++
 fs/ext4/extents_status.c    | 25 +++++++++++++++++++++----
 fs/ext4/super.c             |  1 +
 include/trace/events/ext4.h | 23 +++++++++++++++--------
 4 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 57087da6c7be..eff97b3a1093 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1138,6 +1138,8 @@ struct ext4_inode_info {
 	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
 					   extents to shrink. Protected by
 					   i_es_lock  */
+	u64 i_es_seq;			/* Change counter for extents.
+					   Protected by i_es_lock */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 31dc0496f8d0..c3daa57ecd35 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
 	return es->es_lblk + es->es_len - 1;
 }
 
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
+}
+
 /*
  * search through the tree for an delayed extent with a given offset.  If
  * it can't be found, try to find next extent.
@@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	newes.es_lblk = lblk;
 	newes.es_len = len;
 	ext4_es_store_pblock_status(&newes, pblk, status);
-	trace_ext4_es_insert_extent(inode, &newes);
 
 	ext4_es_insert_extent_check(inode, &newes);
 
@@ -955,6 +961,11 @@ retry:
 		}
 		pending = err3;
 	}
+	/*
+	 * TODO: For cache on-disk extents, there is no need to increment
+	 * the sequence counter, this requires future optimization.
+	 */
+	ext4_es_inc_seq(inode);
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	/*
@@ -981,6 +992,7 @@ error:
 	if (err1 || err2 || err3 < 0)
 		goto retry;
 
+	trace_ext4_es_insert_extent(inode, &newes);
 	ext4_es_print_tree(inode);
 	return;
 }
@@ -1550,7 +1562,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 		return;
 
-	trace_ext4_es_remove_extent(inode, lblk, len);
 	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
 		 lblk, len, inode->i_ino);
 
@@ -1570,16 +1581,21 @@ retry:
 	 */
 	write_lock(&EXT4_I(inode)->i_es_lock);
 	err = __es_remove_extent(inode, lblk, end, &reserved, es);
+	if (err)
+		goto error;
 	/* Free preallocated extent if it didn't get used. */
 	if (es) {
 		if (!es->es_len)
 			__es_free_extent(es);
 		es = NULL;
 	}
+	ext4_es_inc_seq(inode);
+error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	if (err)
 		goto retry;
 
+	trace_ext4_es_remove_extent(inode, lblk, len);
 	ext4_es_print_tree(inode);
 	ext4_da_release_space(inode, reserved);
 }
@@ -2140,8 +2156,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
 	newes.es_lblk = lblk;
 	newes.es_len = len;
 	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
-	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
-					    end_allocated);
 
 	ext4_es_insert_extent_check(inode, &newes);
 
@@ -2196,11 +2210,14 @@ retry:
 			pr2 = NULL;
 		}
 	}
+	ext4_es_inc_seq(inode);
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	if (err1 || err2 || err3 < 0)
 		goto retry;
 
+	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+					    end_allocated);
 	ext4_es_print_tree(inode);
 	ext4_print_pending_tree(inode);
 	return;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 33e7c08c9529..760c9d7588be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1406,6 +1406,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_es_all_nr = 0;
 	ei->i_es_shk_nr = 0;
 	ei->i_es_shrink_lblk = 0;
+	ei->i_es_seq = 0;
 	ei->i_reserved_data_blocks = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
 	ext4_init_pending_tree(&ei->i_pending_tree);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index a374e7ea7e57..6a0754d38acf 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2210,7 +2210,8 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
 		__field(	ext4_lblk_t,	lblk		)
 		__field(	ext4_lblk_t,	len		)
 		__field(	ext4_fsblk_t,	pblk		)
-		__field(	char, status	)
+		__field(	char,		status		)
+		__field(	u64,		seq		)
 	),
 
 	TP_fast_assign(
@@ -2220,13 +2221,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_show_pblock(es);
 		__entry->status	= ext4_es_status(es);
+		__entry->seq	= EXT4_I(inode)->i_es_seq;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, show_extent_status(__entry->status))
+		  __entry->pblk, show_extent_status(__entry->status),
+		  __entry->seq)
 );
 
 DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
@@ -2251,6 +2254,7 @@ TRACE_EVENT(ext4_es_remove_extent,
 		__field(	ino_t,	ino			)
 		__field(	loff_t,	lblk			)
 		__field(	loff_t,	len			)
+		__field(	u64,	seq			)
 	),
 
 	TP_fast_assign(
@@ -2258,12 +2262,13 @@ TRACE_EVENT(ext4_es_remove_extent,
 		__entry->ino	= inode->i_ino;
 		__entry->lblk	= lblk;
 		__entry->len	= len;
+		__entry->seq	= EXT4_I(inode)->i_es_seq;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
+	TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->len)
+		  __entry->lblk, __entry->len, __entry->seq)
 );
 
 TRACE_EVENT(ext4_es_find_extent_range_enter,
@@ -2523,6 +2528,7 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
 		__field(	char,		status		)
 		__field(	bool,		lclu_allocated	)
 		__field(	bool,		end_allocated	)
+		__field(	u64,		seq		)
 	),
 
 	TP_fast_assign(
@@ -2534,15 +2540,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
 		__entry->status		= ext4_es_status(es);
 		__entry->lclu_allocated	= lclu_allocated;
 		__entry->end_allocated	= end_allocated;
+		__entry->seq		= EXT4_I(inode)->i_es_seq;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
-		  "allocated %d %d",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
 		  __entry->pblk, show_extent_status(__entry->status),
-		  __entry->lclu_allocated, __entry->end_allocated)
+		  __entry->lclu_allocated, __entry->end_allocated,
+		  __entry->seq)
 );
 
 /* fsmap traces */
-- 
cgit v1.2.3


From 9dbf945320b11c5865d2f550f8e972566d04d181 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Mon, 13 Oct 2025 09:51:28 +0800
Subject: ext4: add two trace points for moving extents

To make it easier to track the length, type, and outcome of each extent
move, add trace points at the entry and exit of mext_move_extent().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-13-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/move_extent.c       | 14 +++++++--
 include/trace/events/ext4.h | 74 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f04755c2165a..0550fd30fd10 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -13,6 +13,8 @@
 #include "ext4.h"
 #include "ext4_extents.h"
 
+#include <trace/events/ext4.h>
+
 struct mext_data {
 	struct inode *orig_inode;	/* Origin file inode */
 	struct inode *donor_inode;	/* Donor file inode */
@@ -311,10 +313,14 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len)
 	int ret, ret2;
 
 	*m_len = 0;
+	trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode,
+				     mext->donor_lblk);
 	credits = ext4_chunk_trans_extent(orig_inode, 0) * 2;
 	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
 
 	ret = mext_move_begin(mext, folio, &move_type);
 	if (ret)
@@ -379,6 +385,10 @@ unlock:
 	mext_folio_double_unlock(folio);
 stop_handle:
 	ext4_journal_stop(handle);
+out:
+	trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode,
+				    mext->donor_lblk, orig_map->m_len, *m_len,
+				    move_type, ret);
 	return ret;
 
 repair_branches:
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6a0754d38acf..a05bdd48e16e 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -3016,6 +3016,80 @@ TRACE_EVENT(ext4_update_sb,
 		  __entry->fsblk, __entry->flags)
 );
 
+TRACE_EVENT(ext4_move_extent_enter,
+	TP_PROTO(struct inode *orig_inode, struct ext4_map_blocks *orig_map,
+		 struct inode *donor_inode, ext4_lblk_t donor_lblk),
+
+	TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(ino_t, orig_ino)
+		__field(ext4_lblk_t, orig_lblk)
+		__field(unsigned int, orig_flags)
+		__field(ino_t, donor_ino)
+		__field(ext4_lblk_t, donor_lblk)
+		__field(unsigned int, len)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= orig_inode->i_sb->s_dev;
+		__entry->orig_ino	= orig_inode->i_ino;
+		__entry->orig_lblk	= orig_map->m_lblk;
+		__entry->orig_flags	= orig_map->m_flags;
+		__entry->donor_ino	= donor_inode->i_ino;
+		__entry->donor_lblk	= donor_lblk;
+		__entry->len		= orig_map->m_len;
+	),
+
+	TP_printk("dev %d,%d origin ino %lu lblk %u flags %s donor ino %lu lblk %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->orig_ino,  __entry->orig_lblk,
+		  show_mflags(__entry->orig_flags),
+		  (unsigned long) __entry->donor_ino,  __entry->donor_lblk,
+		  __entry->len)
+);
+
+TRACE_EVENT(ext4_move_extent_exit,
+	TP_PROTO(struct inode *orig_inode, ext4_lblk_t orig_lblk,
+		 struct inode *donor_inode, ext4_lblk_t donor_lblk,
+		 unsigned int m_len, u64 move_len, int move_type, int ret),
+
+	TP_ARGS(orig_inode, orig_lblk, donor_inode, donor_lblk, m_len,
+		move_len, move_type, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(ino_t, orig_ino)
+		__field(ext4_lblk_t, orig_lblk)
+		__field(ino_t, donor_ino)
+		__field(ext4_lblk_t, donor_lblk)
+		__field(unsigned int, m_len)
+		__field(u64, move_len)
+		__field(int, move_type)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= orig_inode->i_sb->s_dev;
+		__entry->orig_ino	= orig_inode->i_ino;
+		__entry->orig_lblk	= orig_lblk;
+		__entry->donor_ino	= donor_inode->i_ino;
+		__entry->donor_lblk	= donor_lblk;
+		__entry->m_len		= m_len;
+		__entry->move_len	= move_len;
+		__entry->move_type	= move_type;
+		__entry->ret		= ret;
+	),
+
+	TP_printk("dev %d,%d origin ino %lu lblk %u donor ino %lu lblk %u m_len %u, move_len %llu type %d ret %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->orig_ino,  __entry->orig_lblk,
+		  (unsigned long) __entry->donor_ino,  __entry->donor_lblk,
+		  __entry->m_len, __entry->move_len, __entry->move_type,
+		  __entry->ret)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 6a571d762cda6c25517c5533b8bd06d56028cdcb Mon Sep 17 00:00:00 2001
From: Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>
Date: Tue, 4 Nov 2025 18:39:05 +0530
Subject: soc: qcom: socinfo: Add support for new fields in revision 20

Add support for socinfo version 20. Version 20 adds a new field
package id and its zeroth bit contains information that can be
used to tune temperature thresholds on devices which might
be able to withstand higher temperatures. Zeroth bit value 1 means
that its heat dissipation is better and more relaxed thermal
scheme can be put in place and 0 means a more aggressive scheme
may be needed.

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Signed-off-by: Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251104130906.167666-1-mukesh.ojha@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/socinfo.c       | 6 ++++++
 include/linux/soc/qcom/socinfo.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c
index af0188bd3880..37567f5492fa 100644
--- a/drivers/soc/qcom/socinfo.c
+++ b/drivers/soc/qcom/socinfo.c
@@ -213,6 +213,7 @@ struct socinfo_params {
 	u32 num_func_clusters;
 	u32 boot_cluster;
 	u32 boot_core;
+	u32 raw_package_type;
 };
 
 struct smem_image_version {
@@ -675,6 +676,11 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo,
 			   &qcom_socinfo->info.fmt);
 
 	switch (qcom_socinfo->info.fmt) {
+	case SOCINFO_VERSION(0, 20):
+		qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type);
+		debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root,
+				   &qcom_socinfo->info.raw_package_type);
+		fallthrough;
 	case SOCINFO_VERSION(0, 19):
 		qcom_socinfo->info.num_func_clusters = __le32_to_cpu(info->num_func_clusters);
 		qcom_socinfo->info.boot_cluster = __le32_to_cpu(info->boot_cluster);
diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h
index 608950443eee..c4dae173cc30 100644
--- a/include/linux/soc/qcom/socinfo.h
+++ b/include/linux/soc/qcom/socinfo.h
@@ -82,6 +82,8 @@ struct socinfo {
 	__le32 num_func_clusters;
 	__le32 boot_cluster;
 	__le32 boot_core;
+	/* Version 20 */
+	__le32 raw_package_type;
 };
 
 /* Internal feature codes */
-- 
cgit v1.2.3


From 6918667af5a7315eff3c56d871be4c5439f7f9d2 Mon Sep 17 00:00:00 2001
From: Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>
Date: Tue, 4 Nov 2025 18:39:06 +0530
Subject: soc: qcom: socinfo: Add reserve field to support future extension

Some of the new fields added to the socinfo structure in versions 21, 22
and 23 are only used by boot firmware and are of no use to Linux. Add a
reserve field to socinfo so that the structure remains up to date and
prepared in case we get any new field in the future which could be used
by Linux. While at it, also update the switch case for backward
compatibility in case the SoC runs with boot firmware which has these
new versions added.

Signed-off-by: Mukesh Ojha <mukesh.ojha@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251104130906.167666-2-mukesh.ojha@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/socinfo.c       | 3 +++
 include/linux/soc/qcom/socinfo.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c
index 37567f5492fa..003a2304d535 100644
--- a/drivers/soc/qcom/socinfo.c
+++ b/drivers/soc/qcom/socinfo.c
@@ -676,6 +676,9 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo,
 			   &qcom_socinfo->info.fmt);
 
 	switch (qcom_socinfo->info.fmt) {
+	case SOCINFO_VERSION(0, 23):
+	case SOCINFO_VERSION(0, 22):
+	case SOCINFO_VERSION(0, 21):
 	case SOCINFO_VERSION(0, 20):
 		qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type);
 		debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root,
diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h
index c4dae173cc30..ba823a0013c5 100644
--- a/include/linux/soc/qcom/socinfo.h
+++ b/include/linux/soc/qcom/socinfo.h
@@ -84,6 +84,8 @@ struct socinfo {
 	__le32 boot_core;
 	/* Version 20 */
 	__le32 raw_package_type;
+	/* Version 21, 22, 23 */
+	__le32 reserve1[4];
 };
 
 /* Internal feature codes */
-- 
cgit v1.2.3


From 9352d40c8bcd2ef29366d2c38b163c0b115039ed Mon Sep 17 00:00:00 2001
From: Mohammad Heib <mheib@redhat.com>
Date: Sat, 25 Oct 2025 16:08:58 +0300
Subject: devlink: Add new "max_mac_per_vf" generic device param

Add a new generic device parameter to control the maximum
number of MAC filters allowed per VF.

For example, to limit a VF to 3 MAC addresses:
 $ devlink dev param set pci/0000:3b:00.0 name max_mac_per_vf \
        value 3 \
        cmode runtime

Signed-off-by: Mohammad Heib <mheib@redhat.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 Documentation/networking/devlink/devlink-params.rst | 4 ++++
 include/net/devlink.h                               | 4 ++++
 net/devlink/param.c                                 | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 0a9c20d70122..c0597d456641 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -151,3 +151,7 @@ own name.
    * - ``num_doorbells``
      - u32
      - Controls the number of doorbells used by the device.
+   * - ``max_mac_per_vf``
+     - u32
+     - Controls the maximum number of MAC address filters that can be assigned
+       to a Virtual Function (VF).
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 9e824f61e40f..d01046ef0577 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -532,6 +532,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
 	DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
 	DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS,
+	DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -602,6 +603,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells"
 #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32
 
+#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf"
+#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 70e69523412c..6b233b13b69a 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -112,6 +112,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME,
 		.type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF,
+		.name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME,
+		.type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From c6230446b1a6f3c91effafd99f604de455da52e5 Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Mon, 3 Nov 2025 12:20:20 +0000
Subject: net: dsa: add tagging driver for MaxLinear GSW1xx switch family

Add support for a new DSA tagging protocol driver for the MaxLinear
GSW1xx switch family. The GSW1xx switches use a proprietary 8-byte
special tag inserted between the source MAC address and the EtherType
field to indicate the source and destination ports for frames
traversing the CPU port.

Implement the tag handling logic to insert the special tag on transmit
and parse it on receive.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Link: https://patch.msgid.link/0e973ebfd9433c30c96f50670da9e9449a0d98f2.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                   |   3 +-
 include/net/dsa.h             |   2 +
 include/uapi/linux/if_ether.h |   1 +
 net/dsa/Kconfig               |   8 +++
 net/dsa/Makefile              |   1 +
 net/dsa/tag_mxl-gsw1xx.c      | 116 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 net/dsa/tag_mxl-gsw1xx.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 12cd8a5ab274..0dc4aa37d903 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14053,7 +14053,7 @@ F:	tools/testing/selftests/landlock/
 K:	landlock
 K:	LANDLOCK
 
-LANTIQ / INTEL Ethernet drivers
+LANTIQ / MAXLINEAR / INTEL Ethernet DSA drivers
 M:	Hauke Mehrtens <hauke@hauke-m.de>
 L:	netdev@vger.kernel.org
 S:	Maintained
@@ -14061,6 +14061,7 @@ F:	Documentation/devicetree/bindings/net/dsa/lantiq,gswip.yaml
 F:	drivers/net/dsa/lantiq/*
 F:	drivers/net/ethernet/lantiq_xrx200.c
 F:	net/dsa/tag_gswip.c
+F:	net/dsa/tag_mxl-gsw1xx.c
 
 LANTIQ MIPS ARCHITECTURE
 M:	John Crispin <john@phrozen.org>
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 67762fdaf3c7..2df2e2ead9a8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -56,6 +56,7 @@ struct tc_action;
 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE	28
 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE	29
 #define DSA_TAG_PROTO_YT921X_VALUE		30
+#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE		31
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -89,6 +90,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_LAN937X		= DSA_TAG_PROTO_LAN937X_VALUE,
 	DSA_TAG_PROTO_VSC73XX_8021Q	= DSA_TAG_PROTO_VSC73XX_8021Q_VALUE,
 	DSA_TAG_PROTO_YT921X		= DSA_TAG_PROTO_YT921X_VALUE,
+	DSA_TAG_PROTO_MXL_GSW1XX	= DSA_TAG_PROTO_MXL_GSW1XX_VALUE,
 };
 
 struct dsa_switch;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index cfd200c87e5e..2c93b7b731c8 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -92,6 +92,7 @@
 #define ETH_P_ETHERCAT	0x88A4		/* EtherCAT			*/
 #define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
+#define ETH_P_MXLGSW	0x88C3		/* MaxLinear GSW DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_PREAUTH	0x88C7		/* 802.11 Preauthentication */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
 #define ETH_P_LLDP	0x88CC		/* Link Layer Discovery Protocol */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 6b94028b1fcc..f86b30742122 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -104,6 +104,14 @@ config NET_DSA_TAG_MTK
 	  Say Y or M if you want to enable support for tagging frames for
 	  Mediatek switches.
 
+config NET_DSA_TAG_MXL_GSW1XX
+	tristate "Tag driver for MaxLinear GSW1xx switches"
+	help
+	  The GSW1xx family of switches supports an 8-byte special tag which
+	  can be used on the CPU port of the switch.
+	  Say Y or M if you want to enable support for tagging frames for
+	  MaxLinear GSW1xx switches.
+
 config NET_DSA_TAG_KSZ
 	tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches"
 	help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 4b011a1d5c87..42d173f5a701 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
 obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
 obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
 obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o
 obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
 obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
 obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c
new file mode 100644
index 000000000000..701a079955f2
--- /dev/null
+++ b/net/dsa/tag_mxl-gsw1xx.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * DSA driver Special Tag support for MaxLinear GSW1xx switch chips
+ *
+ * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
+ * Copyright (C) 2023 - 2024 MaxLinear Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a special
+ * tag is used by the GSW1xx.
+ *
+ *       Dest MAC       Src MAC    special TAG        EtherType
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ *                                |<--------------->|
+ */
+
+#define GSW1XX_TAG_NAME		"gsw1xx"
+
+/* special tag header length (RX and TX) */
+#define GSW1XX_HEADER_LEN		8
+
+/* Word 0 = Ethertype -> 0x88C3 */
+
+/* Word 1 */
+#define GSW1XX_TX_PORT_MAP		GENMASK(7, 0)
+#define GSW1XX_TX_PORT_MAP_EN		BIT(15)
+#define GSW1XX_TX_CLASS_EN		BIT(14)
+#define GSW1XX_TX_TIME_STAMP_EN		BIT(13)
+#define GSW1XX_TX_LRN_DIS		BIT(12)
+#define GSW1XX_TX_CLASS			GENMASK(11, 8)
+
+/* special tag in RX path header */
+/* Word 2 */
+#define GSW1XX_RX_PORT_MAP		GENMASK(15, 8)
+
+static struct sk_buff *gsw1xx_tag_xmit(struct sk_buff *skb,
+				       struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	__be16 *gsw1xx_tag;
+
+	/* provide additional space 'GSW1XX_HEADER_LEN' bytes */
+	skb_push(skb, GSW1XX_HEADER_LEN);
+
+	/* add space between MAC address and Ethertype */
+	dsa_alloc_etype_header(skb, GSW1XX_HEADER_LEN);
+
+	/* special tag ingress */
+	gsw1xx_tag = dsa_etype_header_pos_tx(skb);
+	gsw1xx_tag[0] = htons(ETH_P_MXLGSW);
+	gsw1xx_tag[1] = htons(GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS |
+			FIELD_PREP(GSW1XX_TX_PORT_MAP, BIT(dp->index)));
+
+	gsw1xx_tag[2] = 0;
+	gsw1xx_tag[3] = 0;
+
+	return skb;
+}
+
+static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	int port;
+	__be16 *gsw1xx_tag;
+
+	if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n");
+		return NULL;
+	}
+
+	gsw1xx_tag = dsa_etype_header_pos_rx(skb);
+
+	if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n");
+		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		return NULL;
+	}
+
+	/* Get source port information */
+	port = FIELD_GET(GSW1XX_RX_PORT_MAP, ntohs(gsw1xx_tag[1]));
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
+	if (!skb->dev) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		return NULL;
+	}
+
+	/* remove the GSW1xx special tag between MAC addresses and the current
+	 * ethertype field.
+	 */
+	skb_pull_rcsum(skb, GSW1XX_HEADER_LEN);
+	dsa_strip_etype_header(skb, GSW1XX_HEADER_LEN);
+
+	return skb;
+}
+
+static const struct dsa_device_ops gsw1xx_netdev_ops = {
+	.name			= GSW1XX_TAG_NAME,
+	.proto			= DSA_TAG_PROTO_MXL_GSW1XX,
+	.xmit			= gsw1xx_tag_xmit,
+	.rcv			= gsw1xx_tag_rcv,
+	.needed_headroom	= GSW1XX_HEADER_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for MaxLinear GSW1xx 8 byte protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL_GSW1XX, GSW1XX_TAG_NAME);
+
+module_dsa_tag_driver(gsw1xx_netdev_ops);
-- 
cgit v1.2.3


From 38724a474c0fc37b6604e8b20c75d87446fc2fd1 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Thu, 30 Oct 2025 14:59:46 +0100
Subject: ice: add virtchnl definitions and static data for GTP RSS

Add virtchnl protocol header and field definitions for advanced RSS
configuration including GTPC, GTPU, L2TPv2, ECPRI, PPP, GRE, and IP
fragment headers.

- Define new virtchnl protocol header types
- Add RSS field selectors for tunnel protocols
- Extend static mapping arrays for protocol field matching
- Add L2TPv2 session ID and length+session ID field support

This provides the foundational definitions needed for VF RSS
configuration of tunnel protocols.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Co-developed-by: Jie Wang <jie1x.wang@intel.com>
Signed-off-by: Jie Wang <jie1x.wang@intel.com>
Co-developed-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Co-developed-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_vf_lib.h |  48 ++++++
 drivers/net/ethernet/intel/ice/virt/rss.c   | 219 +++++++++++++++++++++++++++-
 include/linux/avf/virtchnl.h                |  50 +++++++
 3 files changed, 316 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h
index b00708907176..7a9c75d1d07c 100644
--- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h
@@ -53,6 +53,46 @@ struct ice_mdd_vf_events {
 	u16 last_printed;
 };
 
+enum ice_hash_ip_ctx_type {
+	ICE_HASH_IP_CTX_IP = 0,
+	ICE_HASH_IP_CTX_IP_ESP,
+	ICE_HASH_IP_CTX_IP_UDP_ESP,
+	ICE_HASH_IP_CTX_IP_AH,
+	ICE_HASH_IP_CTX_IP_PFCP,
+	ICE_HASH_IP_CTX_IP_UDP,
+	ICE_HASH_IP_CTX_IP_TCP,
+	ICE_HASH_IP_CTX_IP_SCTP,
+	ICE_HASH_IP_CTX_MAX,
+};
+
+struct ice_vf_hash_ip_ctx {
+	struct ice_rss_hash_cfg ctx[ICE_HASH_IP_CTX_MAX];
+};
+
+enum ice_hash_gtpu_ctx_type {
+	ICE_HASH_GTPU_CTX_EH_IP = 0,
+	ICE_HASH_GTPU_CTX_EH_IP_UDP,
+	ICE_HASH_GTPU_CTX_EH_IP_TCP,
+	ICE_HASH_GTPU_CTX_UP_IP,
+	ICE_HASH_GTPU_CTX_UP_IP_UDP,
+	ICE_HASH_GTPU_CTX_UP_IP_TCP,
+	ICE_HASH_GTPU_CTX_DW_IP,
+	ICE_HASH_GTPU_CTX_DW_IP_UDP,
+	ICE_HASH_GTPU_CTX_DW_IP_TCP,
+	ICE_HASH_GTPU_CTX_MAX,
+};
+
+struct ice_vf_hash_gtpu_ctx {
+	struct ice_rss_hash_cfg ctx[ICE_HASH_GTPU_CTX_MAX];
+};
+
+struct ice_vf_hash_ctx {
+	struct ice_vf_hash_ip_ctx v4;
+	struct ice_vf_hash_ip_ctx v6;
+	struct ice_vf_hash_gtpu_ctx ipv4;
+	struct ice_vf_hash_gtpu_ctx ipv6;
+};
+
 /* Structure to store fdir fv entry */
 struct ice_fdir_prof_info {
 	struct ice_parser_profile prof;
@@ -66,6 +106,12 @@ struct ice_vf_qs_bw {
 	u8 tc;
 };
 
+/* Structure to store RSS field vector entry */
+struct ice_rss_prof_info {
+	struct ice_parser_profile prof;
+	bool symm;
+};
+
 /* VF operations */
 struct ice_vf_ops {
 	enum ice_disq_rst_src reset_type;
@@ -106,6 +152,8 @@ struct ice_vf {
 	u16 ctrl_vsi_idx;
 	struct ice_vf_fdir fdir;
 	struct ice_fdir_prof_info fdir_prof_info[ICE_MAX_PTGS];
+	struct ice_rss_prof_info rss_prof_info[ICE_MAX_PTGS];
+	struct ice_vf_hash_ctx hash_ctx;
 	u64 rss_hashcfg;		/* RSS hash configuration */
 	struct ice_sw *vf_sw_id;	/* switch ID the VF VSIs connect to */
 	struct virtchnl_version_info vf_ver;
diff --git a/drivers/net/ethernet/intel/ice/virt/rss.c b/drivers/net/ethernet/intel/ice/virt/rss.c
index cbdbb32d512b..ee0d1ec32d56 100644
--- a/drivers/net/ethernet/intel/ice/virt/rss.c
+++ b/drivers/net/ethernet/intel/ice/virt/rss.c
@@ -36,6 +36,11 @@ static const struct ice_vc_hdr_match_type ice_vc_hdr_list[] = {
 	{VIRTCHNL_PROTO_HDR_ESP,	ICE_FLOW_SEG_HDR_ESP},
 	{VIRTCHNL_PROTO_HDR_AH,		ICE_FLOW_SEG_HDR_AH},
 	{VIRTCHNL_PROTO_HDR_PFCP,	ICE_FLOW_SEG_HDR_PFCP_SESSION},
+	{VIRTCHNL_PROTO_HDR_GTPC,	ICE_FLOW_SEG_HDR_GTPC},
+	{VIRTCHNL_PROTO_HDR_L2TPV2,	ICE_FLOW_SEG_HDR_L2TPV2},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,	ICE_FLOW_SEG_HDR_IPV_FRAG},
+	{VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG,	ICE_FLOW_SEG_HDR_IPV_FRAG},
+	{VIRTCHNL_PROTO_HDR_GRE,        ICE_FLOW_SEG_HDR_GRE},
 };
 
 struct ice_vc_hash_field_match_type {
@@ -87,8 +92,125 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
 		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
-	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+	{VIRTCHNL_PROTO_HDR_IPV4,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_ID)},
+	{VIRTCHNL_PROTO_HDR_IPV4,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		ICE_FLOW_HASH_IPV4},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_ID)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)},
 	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)},
 	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST),
@@ -110,6 +232,35 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
 	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_ID)},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST),
+		ICE_FLOW_HASH_IPV6_PRE64},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		ICE_FLOW_HASH_IPV6_PRE64 |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
 	{VIRTCHNL_PROTO_HDR_TCP,
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)},
@@ -120,6 +271,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) |
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT),
 		ICE_FLOW_HASH_TCP_PORT},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM),
+		ICE_FLOW_HASH_TCP_PORT |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)},
 	{VIRTCHNL_PROTO_HDR_UDP,
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)},
@@ -130,6 +300,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) |
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT),
 		ICE_FLOW_HASH_UDP_PORT},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM),
+		ICE_FLOW_HASH_UDP_PORT |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)},
 	{VIRTCHNL_PROTO_HDR_SCTP,
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)},
@@ -140,6 +329,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) |
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT),
 		ICE_FLOW_HASH_SCTP_PORT},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM),
+		ICE_FLOW_HASH_SCTP_PORT |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)},
 	{VIRTCHNL_PROTO_HDR_PPPOE,
 		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)},
@@ -155,6 +363,15 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = {
 		BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)},
 	{VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID),
 		BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)},
+	{VIRTCHNL_PROTO_HDR_GTPC,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPC_TEID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)},
+	{VIRTCHNL_PROTO_HDR_L2TPV2,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV2_SESS_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV2_SESS_ID)},
+	{VIRTCHNL_PROTO_HDR_L2TPV2,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV2_LEN_SESS_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV2_LEN_SESS_ID)},
 };
 
 /**
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 5be1881abbb6..11bdab5522fd 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -1253,6 +1253,17 @@ enum virtchnl_proto_hdr_type {
 	VIRTCHNL_PROTO_HDR_ESP,
 	VIRTCHNL_PROTO_HDR_AH,
 	VIRTCHNL_PROTO_HDR_PFCP,
+	VIRTCHNL_PROTO_HDR_GTPC,
+	VIRTCHNL_PROTO_HDR_ECPRI,
+	VIRTCHNL_PROTO_HDR_L2TPV2,
+	VIRTCHNL_PROTO_HDR_PPP,
+	/* IPv4 and IPv6 Fragment header types are only associated to
+	 * VIRTCHNL_PROTO_HDR_IPV4 and VIRTCHNL_PROTO_HDR_IPV6 respectively,
+	 * cannot be used independently.
+	 */
+	VIRTCHNL_PROTO_HDR_IPV4_FRAG,
+	VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG,
+	VIRTCHNL_PROTO_HDR_GRE,
 };
 
 /* Protocol header field within a protocol header. */
@@ -1275,6 +1286,7 @@ enum virtchnl_proto_hdr_field {
 	VIRTCHNL_PROTO_HDR_IPV4_DSCP,
 	VIRTCHNL_PROTO_HDR_IPV4_TTL,
 	VIRTCHNL_PROTO_HDR_IPV4_PROT,
+	VIRTCHNL_PROTO_HDR_IPV4_CHKSUM,
 	/* IPV6 */
 	VIRTCHNL_PROTO_HDR_IPV6_SRC =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6),
@@ -1282,18 +1294,34 @@ enum virtchnl_proto_hdr_field {
 	VIRTCHNL_PROTO_HDR_IPV6_TC,
 	VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT,
 	VIRTCHNL_PROTO_HDR_IPV6_PROT,
+	/* IPV6 Prefix */
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_SRC,
+	VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_DST,
 	/* TCP */
 	VIRTCHNL_PROTO_HDR_TCP_SRC_PORT =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP),
 	VIRTCHNL_PROTO_HDR_TCP_DST_PORT,
+	VIRTCHNL_PROTO_HDR_TCP_CHKSUM,
 	/* UDP */
 	VIRTCHNL_PROTO_HDR_UDP_SRC_PORT =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP),
 	VIRTCHNL_PROTO_HDR_UDP_DST_PORT,
+	VIRTCHNL_PROTO_HDR_UDP_CHKSUM,
 	/* SCTP */
 	VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP),
 	VIRTCHNL_PROTO_HDR_SCTP_DST_PORT,
+	VIRTCHNL_PROTO_HDR_SCTP_CHKSUM,
 	/* GTPU_IP */
 	VIRTCHNL_PROTO_HDR_GTPU_IP_TEID =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP),
@@ -1317,6 +1345,28 @@ enum virtchnl_proto_hdr_field {
 	VIRTCHNL_PROTO_HDR_PFCP_S_FIELD =
 		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP),
 	VIRTCHNL_PROTO_HDR_PFCP_SEID,
+	/* GTPC */
+	VIRTCHNL_PROTO_HDR_GTPC_TEID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPC),
+	/* ECPRI */
+	VIRTCHNL_PROTO_HDR_ECPRI_MSG_TYPE =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ECPRI),
+	VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID,
+	/* IPv4 Dummy Fragment */
+	VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4_FRAG),
+	/* IPv6 Extension Fragment */
+	VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG),
+	/* GTPU_DWN/UP */
+	VIRTCHNL_PROTO_HDR_GTPU_DWN_QFI =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN),
+	VIRTCHNL_PROTO_HDR_GTPU_UP_QFI =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP),
+	/* L2TPv2 */
+	VIRTCHNL_PROTO_HDR_L2TPV2_SESS_ID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV2),
+	VIRTCHNL_PROTO_HDR_L2TPV2_LEN_SESS_ID,
 };
 
 struct virtchnl_proto_hdr {
-- 
cgit v1.2.3


From 9311e6c29b348b005e79228ef6facd38ebcc73f9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Nov 2025 08:12:36 -1000
Subject: cgroup: Fix sleeping from invalid context warning on PREEMPT_RT

cgroup_task_dead() is called from finish_task_switch() which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock which is a regular spinlock that can
sleep on RT kernels, leading to "sleeping function called from invalid
context" warnings.

css_set_lock is too large in scope to convert to a raw_spinlock. However,
the unlinking operations don't need to run synchronously - they just need
to complete after the task is done running.

On PREEMPT_RT, defer the work through irq_work. While the work doesn't need
to happen immediately, it can't be delayed indefinitely either as the dead
task pins the cgroup and task_struct can be pinned indefinitely. Use the
lazy version of irq_work to allow batching and lower impact while ensuring
timely completion.

v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation
    for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior).

Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h  |  5 ++++-
 kernel/cgroup/cgroup.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..5e80d48488ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu		*cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head		cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node		cg_dead_lnode;
+#endif	/* CONFIG_PREEMPT_RT */
+#endif	/* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
 	u32				rmid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index aae180d56c8c..48019a661c08 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this leads to the sleeping in invalid context warning. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else	/* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif	/* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
-- 
cgit v1.2.3


From 15638d52cbcf6e969f4a5e2757b118355db583f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2025 14:52:15 -0500
Subject: block: fix cached zone reporting after zone append was used

No zone plugs are allocated when a zone is opened by calling Zone Append
on it.  This makes the cached zone reporting report incorrectly empty
zones if the file system is unmounted and report zones is called after
that, e.g. by xfstests test cases using the scratch device.

Fix this by recording if zone append was used on a device, and disable
cached reporting for the device until a ZONE_RESET_ALL happens that
guarantees all zones are empty.

We could probably do even better using a per-zone flag, but the practical
use cases for zone reporting after the initial mount are rather limited,
so let's keep things simple for now.

Fixes: 31f0656a4ab7 ("block: introduce blkdev_report_zones_cached()")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 26 +++++++++++++++++++++-----
 include/linux/blkdev.h |  1 +
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index a0ce17e2143f..c5226bcaaa94 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -899,6 +899,19 @@ static int blkdev_report_zone_fallback(struct block_device *bdev,
 	return blkdev_do_report_zones(bdev, sector, 1, &args);
 }
 
+/*
+ * For devices that natively support zone append operations, we do not use zone
+ * write plugging for zone append writes, which makes the zone condition
+ * tracking invalid once zone append was used.  In that case fall back to a
+ * regular report zones to get correct information.
+ */
+static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
+{
+	return disk_need_zone_resources(bdev->bd_disk) &&
+		(bdev_emulates_zone_append(bdev) ||
+		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
+}
+
 /**
  * blkdev_get_zone_info - Get a single zone information from cached data
  * @bdev:   Target block device
@@ -932,6 +945,9 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 	memset(zone, 0, sizeof(*zone));
 	sector = ALIGN_DOWN(sector, zone_sectors);
 
+	if (!blkdev_has_cached_report_zones(bdev))
+		return blkdev_report_zone_fallback(bdev, sector, zone);
+
 	rcu_read_lock();
 	zones_cond = rcu_dereference(disk->zones_cond);
 	if (!disk->zone_wplugs_hash || !zones_cond) {
@@ -1035,11 +1051,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
 	if (!nr_zones || sector >= capacity)
 		return 0;
 
-	/*
-	 * If we do not have any zone write plug resources, fallback to using
-	 * the regular zone report.
-	 */
-	if (!disk_need_zone_resources(disk)) {
+	if (!blkdev_has_cached_report_zones(bdev)) {
 		struct blk_report_zones_args args = {
 			.cb = cb,
 			.data = data,
@@ -1115,6 +1127,7 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio)
 	for (sector = 0; sector < capacity;
 	     sector += bdev_zone_sectors(bio->bi_bdev))
 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
+	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
 }
 
 static void blk_zone_finish_bio_endio(struct bio *bio)
@@ -1474,6 +1487,9 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
 	struct blk_zone_wplug *zwplug;
 	unsigned long flags;
 
+	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
+		set_bit(GD_ZONE_APPEND_USED, &disk->state);
+
 	/*
 	 * We have native support for zone append operations, so we are not
 	 * going to handle @bio through plugging. However, we may already have a
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f0ab02e0a673..6a498aa7f7e7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -173,6 +173,7 @@ struct gendisk {
 #define GD_ADDED			4
 #define GD_SUPPRESS_PART_SCAN		5
 #define GD_OWNS_QUEUE			6
+#define GD_ZONE_APPEND_USED		7
 
 	struct mutex open_mutex;	/* open/close mutex */
 	unsigned open_partitions;	/* number of open partitions */
-- 
cgit v1.2.3


From 24ab8efb9aea77764dd99d2bad41fd8991223013 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 31 Oct 2025 22:20:55 +0100
Subject: xsk: Move NETDEV_XDP_ACT_ZC into generic header

Move NETDEV_XDP_ACT_ZC into xdp_sock_drv.h header such that external code
can reuse it, and rename it into more generic NETDEV_XDP_ACT_XSK.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20251031212103.310683-7-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/xdp_sock_drv.h | 4 ++++
 net/xdp/xsk_buff_pool.c    | 6 +-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 4f2d3268a676..242e34f771cc 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -12,6 +12,10 @@
 #define XDP_UMEM_MIN_CHUNK_SHIFT 11
 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
 
+#define NETDEV_XDP_ACT_XSK	(NETDEV_XDP_ACT_BASIC |		\
+				 NETDEV_XDP_ACT_REDIRECT |	\
+				 NETDEV_XDP_ACT_XSK_ZEROCOPY)
+
 struct xsk_cb_desc {
 	void *src;
 	u8 off;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 00a4eddaa0cd..51526034c42a 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -155,10 +155,6 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
 	}
 }
 
-#define NETDEV_XDP_ACT_ZC	(NETDEV_XDP_ACT_BASIC |		\
-				 NETDEV_XDP_ACT_REDIRECT |	\
-				 NETDEV_XDP_ACT_XSK_ZEROCOPY)
-
 int xp_assign_dev(struct xsk_buff_pool *pool,
 		  struct net_device *netdev, u16 queue_id, u16 flags)
 {
@@ -200,7 +196,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
 		/* For copy-mode, we are done. */
 		return 0;
 
-	if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) {
+	if ((netdev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) {
 		err = -EOPNOTSUPP;
 		goto err_unreg_pool;
 	}
-- 
cgit v1.2.3


From 2f6b2565d43cdb5087cac23d530cca84aa3d897e Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Tue, 14 Oct 2025 08:04:55 -0700
Subject: block: accumulate memory segment gaps per bio

The blk-mq dma iterator has an optimization for requests that align to
the device's iommu merge boundary. This boundary may be larger than the
device's virtual boundary, but the code had been depending on that queue
limit to know ahead of time if the request is guaranteed to align to
that optimization.

Rather than rely on that queue limit, which many devices may not report,
save the lowest set bit of any boundary gap between each segment in the
bio while checking the segments. The request stores the value for
merging and quickly checking per io if the request can use iova
optimizations.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               |  1 +
 block/blk-map.c           |  3 +++
 block/blk-merge.c         | 39 ++++++++++++++++++++++++++++++++++++---
 block/blk-mq-dma.c        |  3 +--
 block/blk-mq.c            |  6 ++++++
 include/linux/bio.h       |  2 ++
 include/linux/blk-mq.h    | 16 ++++++++++++++++
 include/linux/blk_types.h | 12 ++++++++++++
 8 files changed, 77 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index b3a79285c278..7b13bdf72de0 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	bio->bi_write_hint = 0;
 	bio->bi_write_stream = 0;
 	bio->bi_status = 0;
+	bio->bi_bvec_gap_bit = 0;
 	bio->bi_iter.bi_sector = 0;
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_idx = 0;
diff --git a/block/blk-map.c b/block/blk-map.c
index 60faf036fb6e..17a1dc288678 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -459,6 +459,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
 	if (rq->bio) {
 		if (!ll_back_merge_fn(rq, bio, nr_segs))
 			return -EINVAL;
+		rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio,
+					       rq->phys_gap_bit);
 		rq->biotail->bi_next = bio;
 		rq->biotail = bio;
 		rq->__data_len += bio->bi_iter.bi_size;
@@ -469,6 +471,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
 	rq->nr_phys_segments = nr_segs;
 	rq->bio = rq->biotail = bio;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->phys_gap_bit = bio->bi_bvec_gap_bit;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_append_bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c47d18587a0b..3ca6fbf8b787 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -302,6 +302,12 @@ static unsigned int bio_split_alignment(struct bio *bio,
 	return lim->logical_block_size;
 }
 
+static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv,
+					struct bio_vec *bv)
+{
+	return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len);
+}
+
 /**
  * bio_split_io_at - check if and where to split a bio
  * @bio:  [in] bio to be split
@@ -319,8 +325,8 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
+	unsigned nsegs = 0, bytes = 0, gaps = 0;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, bytes = 0;
 
 	bio_for_each_bvec(bv, bio, iter) {
 		if (bv.bv_offset & lim->dma_alignment ||
@@ -331,8 +337,11 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
-			goto split;
+		if (bvprvp) {
+			if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
+				goto split;
+			gaps |= bvec_seg_gap(bvprvp, &bv);
+		}
 
 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
@@ -350,6 +359,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 	}
 
 	*segs = nsegs;
+	bio->bi_bvec_gap_bit = ffs(gaps);
 	return 0;
 split:
 	if (bio->bi_opf & REQ_ATOMIC)
@@ -385,6 +395,7 @@ split:
 	 * big IO can be trival, disable iopoll when split needed.
 	 */
 	bio_clear_polled(bio);
+	bio->bi_bvec_gap_bit = ffs(gaps);
 	return bytes >> SECTOR_SHIFT;
 }
 EXPORT_SYMBOL_GPL(bio_split_io_at);
@@ -721,6 +732,21 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
 	return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
 }
 
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+	       u8 gaps_bit)
+{
+	struct bio_vec pb, nb;
+
+	gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit);
+	gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit);
+
+	bio_get_last_bvec(prev, &pb);
+	bio_get_first_bvec(next, &nb);
+	if (!biovec_phys_mergeable(q, &pb, &nb))
+		gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb)));
+	return gaps_bit;
+}
+
 /*
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
@@ -785,6 +811,9 @@ static struct request *attempt_merge(struct request_queue *q,
 	if (next->start_time_ns < req->start_time_ns)
 		req->start_time_ns = next->start_time_ns;
 
+	req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio,
+					min_not_zero(next->phys_gap_bit,
+						     req->phys_gap_bit));
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
@@ -908,6 +937,8 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
 	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
 		blk_zone_write_plug_bio_merged(bio);
 
+	req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio,
+					req->phys_gap_bit);
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
@@ -942,6 +973,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 
 	blk_update_mixed_merge(req, bio, true);
 
+	req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio,
+					req->phys_gap_bit);
 	bio->bi_next = req->bio;
 	req->bio = bio;
 
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 449950029872..94d3461b5bc8 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
 static inline bool blk_can_dma_map_iova(struct request *req,
 		struct device *dma_dev)
 {
-	return !((queue_virt_boundary(req->q) + 1) &
-		dma_get_merge_boundary(dma_dev));
+	return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
 }
 
 static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d626d32f6e57..b2fdeaac0efb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->queuelist);
 	rq->q = q;
 	rq->__sector = (sector_t) -1;
+	rq->phys_gap_bit = 0;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = BLK_MQ_NO_TAG;
@@ -668,6 +669,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 			goto out_queue_exit;
 	}
 	rq->__data_len = 0;
+	rq->phys_gap_bit = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
@@ -748,6 +750,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	rq->__data_len = 0;
+	rq->phys_gap_bit = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
@@ -2674,6 +2677,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 	rq->bio = rq->biotail = bio;
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
 	rq->nr_phys_segments = nr_segs;
 	if (bio_integrity(bio))
 		rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3380,6 +3385,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	}
 	rq->nr_phys_segments = rq_src->nr_phys_segments;
 	rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+	rq->phys_gap_bit = rq_src->phys_gap_bit;
 
 	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
 		goto free_and_out;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 16c1c85613b7..ad2d57908c1c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors,
 			     gfp_t gfp, struct bio_set *bs);
 int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, unsigned max_bytes, unsigned len_align);
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+		u8 gaps_bit);
 
 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b25d12545f46..b54506b3b76d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -152,6 +152,14 @@ struct request {
 	unsigned short nr_phys_segments;
 	unsigned short nr_integrity_segments;
 
+	/*
+	 * The lowest set bit for address gaps between physical segments. This
+	 * provides information necessary for dma optimization opportunities,
+	 * like for testing if the segments can be coalesced against the
+	 * device's iommu granule.
+	 */
+	unsigned char phys_gap_bit;
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct bio_crypt_ctx *crypt_ctx;
 	struct blk_crypto_keyslot *crypt_keyslot;
@@ -208,6 +216,14 @@ struct request {
 	void *end_io_data;
 };
 
+/*
+ * Returns a mask with all bits starting at req->phys_gap_bit set to 1.
+ */
+static inline unsigned long req_phys_gap_mask(const struct request *req)
+{
+	return ~(((1 << req->phys_gap_bit) >> 1) - 1);
+}
+
 static inline enum req_op req_op(const struct request *req)
 {
 	return req->cmd_flags & REQ_OP_MASK;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8e8d1cc8b06c..53501ebb0623 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -218,6 +218,18 @@ struct bio {
 	enum rw_hint		bi_write_hint;
 	u8			bi_write_stream;
 	blk_status_t		bi_status;
+
+	/*
+	 * The bvec gap bit indicates the lowest set bit in any address offset
+	 * between all bi_io_vecs. This field is initialized only after the bio
+	 * is split to the hardware limits (see bio_split_io_at()). The value
+	 * may be used to consider DMA optimization when performing that
+	 * mapping. The value is compared to a power of two mask where the
+	 * result depends on any bit set within the mask, so saving the lowest
+	 * bit is sufficient to know if any segment gap collides with the mask.
+	 */
+	u8			bi_bvec_gap_bit;
+
 	atomic_t		__bi_remaining;
 
 	struct bvec_iter	bi_iter;
-- 
cgit v1.2.3


From 4c0a17e28340e458627d672564200406e220d6a3 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 5 Nov 2025 10:05:33 +0100
Subject: slab: prevent recursive kmalloc() in alloc_empty_sheaf()

We want to expand usage of sheaves to all non-boot caches, including
kmalloc caches. Since sheaves themselves are also allocated by
kmalloc(), we need to prevent excessive or infinite recursion -
depending on sheaf size, the sheaf can be allocated from smaller, same
or larger kmalloc size bucket, there's no particular constraint.

This is similar to allocating the objext arrays so let's just reuse the
existing mechanisms for those. __GFP_NO_OBJ_EXT in alloc_empty_sheaf()
will prevent a nested kmalloc() from allocating a sheaf itself - it will
either have sheaves already, or fallback to a non-sheaf-cached
allocation (so bootstrap of sheaves in a kmalloc cache that allocates
sheaves from its own size bucket is possible). Additionally, reuse
OBJCGS_CLEAR_MASK to clear unwanted gfp flags from the nested
allocation.

Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-5-b8218e1ac7ef@suse.cz
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp_types.h |  6 ------
 mm/slub.c                 | 36 ++++++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 65db9349f905..3de43b12209e 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -55,9 +55,7 @@ enum {
 #ifdef CONFIG_LOCKDEP
 	___GFP_NOLOCKDEP_BIT,
 #endif
-#ifdef CONFIG_SLAB_OBJ_EXT
 	___GFP_NO_OBJ_EXT_BIT,
-#endif
 	___GFP_LAST_BIT
 };
 
@@ -98,11 +96,7 @@ enum {
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
-#ifdef CONFIG_SLAB_OBJ_EXT
 #define ___GFP_NO_OBJ_EXT       BIT(___GFP_NO_OBJ_EXT_BIT)
-#else
-#define ___GFP_NO_OBJ_EXT       0
-#endif
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
diff --git a/mm/slub.c b/mm/slub.c
index a7c6d79154f8..f729c208965b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2031,6 +2031,14 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 }
 #endif /* CONFIG_SLUB_DEBUG */
 
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
+				__GFP_ACCOUNT | __GFP_NOFAIL)
+
 #ifdef CONFIG_SLAB_OBJ_EXT
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
@@ -2081,14 +2089,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
-/*
- * The allocated objcg pointers array is not accounted directly.
- * Moreover, it should not come from DMA buffer and is not readily
- * reclaimable. So those GFP bits should be masked off.
- */
-#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
-				__GFP_ACCOUNT | __GFP_NOFAIL)
-
 static inline void init_slab_obj_exts(struct slab *slab)
 {
 	slab->obj_exts = 0;
@@ -2596,8 +2596,24 @@ static void *setup_object(struct kmem_cache *s, void *object)
 
 static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 {
-	struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
-					s->sheaf_capacity), gfp);
+	struct slab_sheaf *sheaf;
+	size_t sheaf_size;
+
+	if (gfp & __GFP_NO_OBJ_EXT)
+		return NULL;
+
+	gfp &= ~OBJCGS_CLEAR_MASK;
+
+	/*
+	 * Prevent recursion to the same cache, or a deep stack of kmallocs of
+	 * varying sizes (sheaf capacity might differ for each kmalloc size
+	 * bucket)
+	 */
+	if (s->flags & SLAB_KMALLOC)
+		gfp |= __GFP_NO_OBJ_EXT;
+
+	sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity);
+	sheaf = kzalloc(sheaf_size, gfp);
 
 	if (unlikely(!sheaf))
 		return NULL;
-- 
cgit v1.2.3


From ce284f882022ebcb953984c7eccf4fc4eb531978 Mon Sep 17 00:00:00 2001
From: Michal Wilczynski <m.wilczynski@samsung.com>
Date: Thu, 16 Oct 2025 15:38:01 +0200
Subject: pwm: Export `pwmchip_release` for external use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The upcoming Rust abstraction layer for the PWM subsystem uses a custom
`dev->release` handler to safely manage the lifetime of its driver
data.

To prevent leaking the memory of the `struct pwm_chip` (allocated by
`pwmchip_alloc`), this custom handler must also call the original
`pwmchip_release` function to complete the cleanup.

Make `pwmchip_release` a global, exported function so that it can be
called from the Rust FFI bridge. This involves removing the `static`
keyword, adding a prototype to the public header, and exporting the
symbol.

Reviewed-by: Elle Rhumsaa <elle@weathered-steel.dev>
Signed-off-by: Michal Wilczynski <m.wilczynski@samsung.com>
Link: https://patch.msgid.link/20251016-rust-next-pwm-working-fan-for-sending-v16-1-a5df2405d2bd@samsung.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 3 ++-
 include/linux/pwm.h | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index ea2ccf42e814..47c9333baaf6 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -1608,12 +1608,13 @@ void pwmchip_put(struct pwm_chip *chip)
 }
 EXPORT_SYMBOL_GPL(pwmchip_put);
 
-static void pwmchip_release(struct device *pwmchip_dev)
+void pwmchip_release(struct device *pwmchip_dev)
 {
 	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
 
 	kfree(chip);
 }
+EXPORT_SYMBOL_GPL(pwmchip_release);
 
 struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 549ac4aaad59..148f056f336b 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -488,6 +488,12 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner);
 #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE)
 void pwmchip_remove(struct pwm_chip *chip);
 
+/*
+ * For FFI wrapper use only:
+ * The Rust PWM abstraction needs this to properly free the pwm_chip.
+ */
+void pwmchip_release(struct device *dev);
+
 int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner);
 #define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE)
 
-- 
cgit v1.2.3


From 37827223f86aa71b267769d5f51ca16b44b45ae5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:12 -0800
Subject: srcu: Add SRCU_READ_FLAVOR_FAST_UPDOWN CPP macro

This commit adds the SRCU_READ_FLAVOR_FAST_UPDOWN=0x8 macro
and adjusts rcutorture to make use of it.  In this commit, both
SRCU_READ_FLAVOR_FAST=0x4 and the new SRCU_READ_FLAVOR_FAST_UPDOWN
test SRCU-fast.  When the SRCU-fast-updown is added, the new
SRCU_READ_FLAVOR_FAST_UPDOWN macro will test it when passed to the
rcutorture.reader_flavor module parameter.

The old SRCU_READ_FLAVOR_FAST macro's value changed from 0x8 to 0x4.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h    | 16 +++++++++-------
 kernel/rcu/rcutorture.c | 24 ++++++++++++++++++------
 2 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 41e27c1d917d..1dd6812aabe7 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -56,13 +56,15 @@ int init_srcu_struct_fast(struct srcu_struct *ssp);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */
-#define SRCU_READ_FLAVOR_NORMAL	0x1		// srcu_read_lock().
-#define SRCU_READ_FLAVOR_NMI	0x2		// srcu_read_lock_nmisafe().
-//				0x4		// SRCU-lite is no longer with us.
-#define SRCU_READ_FLAVOR_FAST	0x8		// srcu_read_lock_fast().
-#define SRCU_READ_FLAVOR_ALL   (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
-				SRCU_READ_FLAVOR_FAST) // All of the above.
-#define SRCU_READ_FLAVOR_SLOWGP	SRCU_READ_FLAVOR_FAST
+#define SRCU_READ_FLAVOR_NORMAL		0x1		// srcu_read_lock().
+#define SRCU_READ_FLAVOR_NMI		0x2		// srcu_read_lock_nmisafe().
+//					0x4		// Formerly SRCU-lite; value now reused below.
+#define SRCU_READ_FLAVOR_FAST		0x4		// srcu_read_lock_fast().
+#define SRCU_READ_FLAVOR_FAST_UPDOWN	0x8		// srcu_read_lock_fast().
+#define SRCU_READ_FLAVOR_ALL		(SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
+					 SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN)
+						// All of the above.
+#define SRCU_READ_FLAVOR_SLOWGP		(SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN)
 						// Flavors requiring synchronize_rcu()
 						// instead of smp_mb().
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ec9d474d60cb..8973cae0a3ef 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -702,6 +702,8 @@ static void srcu_torture_init(void)
 	rcu_sync_torture_init();
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
 		srcu_ctlp = &srcu_ctlf;
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+		srcu_ctlp = &srcu_ctlf;
 }
 
 static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
@@ -728,6 +730,12 @@ static int srcu_torture_read_lock(void)
 		ret += idx << 1;
 	}
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+		scp = srcu_read_lock_fast(srcu_ctlp);
+		idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+		WARN_ON_ONCE(idx & ~0x1);
+		ret += idx << 2;
+	}
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
 		scp = srcu_read_lock_fast(srcu_ctlp);
 		idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
 		WARN_ON_ONCE(idx & ~0x1);
@@ -758,8 +766,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
 static void srcu_torture_read_unlock(int idx)
 {
 	WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
-	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
 		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST) // Index is in bit 0x4 (idx << 2 at lock time).
+		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2));
 	if (reader_flavor & SRCU_READ_FLAVOR_NMI)
 		srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
 	if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -793,7 +803,7 @@ static int srcu_torture_down_read(void)
 		WARN_ON_ONCE(idx & ~0x1);
 		return idx;
 	}
-	if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
 		scp = srcu_down_read_fast(srcu_ctlp);
 		idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
 		WARN_ON_ONCE(idx & ~0x1);
@@ -806,7 +816,7 @@ static int srcu_torture_down_read(void)
 static void srcu_torture_up_read(int idx)
 {
 	WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
-	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
 		srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
 	else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
 		 !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -901,14 +911,16 @@ static struct rcu_torture_ops srcu_ops = {
 	.no_pi_lock	= IS_ENABLED(CONFIG_TINY_SRCU),
 	.debug_objects	= 1,
 	.have_up_down	= IS_ENABLED(CONFIG_TINY_SRCU)
-				? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
+				? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
 	.name		= "srcu"
 };
 
 static void srcud_torture_init(void)
 {
 	rcu_sync_torture_init();
-	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+		WARN_ON(init_srcu_struct_fast(&srcu_ctld));
+	else if (reader_flavor & SRCU_READ_FLAVOR_FAST)
 		WARN_ON(init_srcu_struct_fast(&srcu_ctld));
 	else
 		WARN_ON(init_srcu_struct(&srcu_ctld));
@@ -953,7 +965,7 @@ static struct rcu_torture_ops srcud_ops = {
 	.no_pi_lock	= IS_ENABLED(CONFIG_TINY_SRCU),
 	.debug_objects	= 1,
 	.have_up_down	= IS_ENABLED(CONFIG_TINY_SRCU)
-				? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
+				? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
 	.name		= "srcud"
 };
 
-- 
cgit v1.2.3


From 187de7c212e5fa87779e1026bf949337bca0cdaa Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 27 Oct 2025 17:18:03 +0106
Subject: printk: nbcon: Allow unsafe write_atomic() for panic

There may be console drivers that have not yet figured out a way
to implement safe atomic printing (->write_atomic() callback).
These drivers could choose to only implement threaded printing
(->write_thread() callback), but then it is guaranteed that _no_
output will be printed during panic. Not even attempted.

As a result, developers may be tempted to implement unsafe
->write_atomic() callbacks and/or implement some sort of custom
deferred printing trickery to try to make it work. This goes
against the principal intention of the nbcon API as well as
endangers other nbcon drivers that are doing things correctly
(safely).

As a compromise, allow nbcon drivers to implement unsafe
->write_atomic() callbacks by providing a new console flag
CON_NBCON_ATOMIC_UNSAFE. When specified, the ->write_atomic()
callback for that console will _only_ be called during the
final "hope and pray" flush attempt at the end of a panic:
nbcon_atomic_flush_unsafe().

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/lkml/b2qps3uywhmjaym4mht2wpxul4yqtuuayeoq4iv4k3zf5wdgh3@tocu6c7mj4lt
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/all/swdpckuwwlv3uiessmtnf2jwlx3jusw6u7fpk5iggqo4t2vdws@7rpjso4gr7qp/ [1]
Link: https://lore.kernel.org/all/20251103-fix_netpoll_aa-v4-1-4cfecdf6da7c@debian.org/ [2]
Link: https://patch.msgid.link/20251027161212.334219-2-john.ogness@linutronix.de
[pmladek@suse.com: Fix build with rework/nbcon-in-kdb branch.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 19 ++++++++++++++++---
 kernel/printk/nbcon.c   | 47 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 48 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/console.h b/include/linux/console.h
index d17f1f525bec..5f17321ed962 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -186,6 +186,8 @@ static inline void con_debug_leave(void) { }
  *			printing callbacks must not be called.
  * @CON_NBCON:		Console can operate outside of the legacy style console_lock
  *			constraints.
+ * @CON_NBCON_ATOMIC_UNSAFE: The write_atomic() callback is not safe and is
+ *			therefore only used by nbcon_atomic_flush_unsafe().
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -197,6 +199,7 @@ enum cons_flags {
 	CON_EXTENDED		= BIT(6),
 	CON_SUSPENDED		= BIT(7),
 	CON_NBCON		= BIT(8),
+	CON_NBCON_ATOMIC_UNSAFE	= BIT(9),
 };
 
 /**
@@ -608,6 +611,7 @@ extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
 extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
+extern bool nbcon_allow_unsafe_takeover(void);
 extern bool nbcon_kdb_try_acquire(struct console *con,
 				  struct nbcon_write_context *wctxt);
 extern void nbcon_kdb_release(struct nbcon_write_context *wctxt);
@@ -627,9 +631,18 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_
 		return false;
 
 	if (flags & CON_NBCON) {
-		/* The write_atomic() callback is optional. */
-		if (use_atomic && !con->write_atomic)
-			return false;
+		if (use_atomic) {
+			/* The write_atomic() callback is optional. */
+			if (!con->write_atomic)
+				return false;
+
+			/*
+			 * An unsafe write_atomic() callback is only usable
+			 * when unsafe takeovers are allowed.
+			 */
+			if ((flags & CON_NBCON_ATOMIC_UNSAFE) && !nbcon_allow_unsafe_takeover())
+				return false;
+		}
 
 		/*
 		 * For the !use_atomic case, @printk_kthreads_running is not
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fdd1cbebe77d..90412c2b2961 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -1408,6 +1408,26 @@ enum nbcon_prio nbcon_get_default_prio(void)
 	return NBCON_PRIO_NORMAL;
 }
 
+/*
+ * Track if it is allowed to perform unsafe hostile takeovers of console
+ * ownership. When true, console drivers might perform unsafe actions while
+ * printing. It is externally available via nbcon_allow_unsafe_takeover().
+ */
+static bool panic_nbcon_allow_unsafe_takeover;
+
+/**
+ * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed
+ *
+ * Return:	True, when it is permitted to perform unsafe console printing
+ *
+ * This is also used by console_is_usable() to determine if it is allowed to
+ * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE).
+ */
+bool nbcon_allow_unsafe_takeover(void)
+{
+	return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover;
+}
+
 /**
  * nbcon_legacy_emit_next_record - Print one record for an nbcon console
  *					in legacy contexts
@@ -1478,7 +1498,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
  *					write_atomic() callback
  * @con:			The nbcon console to flush
  * @stop_seq:			Flush up until this record
- * @allow_unsafe_takeover:	True, to allow unsafe hostile takeovers
  *
  * Return:	0 if @con was flushed up to @stop_seq Otherwise, error code on
  *		failure.
@@ -1497,8 +1516,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
  * returned, it cannot be expected that the unfinalized record will become
  * available.
  */
-static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
-					    bool allow_unsafe_takeover)
+static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
 {
 	struct nbcon_write_context wctxt = { };
 	struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
@@ -1507,7 +1525,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
 	ctxt->console			= con;
 	ctxt->spinwait_max_us		= 2000;
 	ctxt->prio			= nbcon_get_default_prio();
-	ctxt->allow_unsafe_takeover	= allow_unsafe_takeover;
+	ctxt->allow_unsafe_takeover	= nbcon_allow_unsafe_takeover();
 
 	if (!nbcon_context_try_acquire(ctxt, false))
 		return -EPERM;
@@ -1538,15 +1556,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
  *					write_atomic() callback
  * @con:			The nbcon console to flush
  * @stop_seq:			Flush up until this record
- * @allow_unsafe_takeover:	True, to allow unsafe hostile takeovers
  *
  * This will stop flushing before @stop_seq if another context has ownership.
  * That context is then responsible for the flushing. Likewise, if new records
  * are added while this context was flushing and there is no other context
  * to handle the printing, this context must also flush those records.
  */
-static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
-					   bool allow_unsafe_takeover)
+static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
 {
 	struct console_flush_type ft;
 	unsigned long flags;
@@ -1561,7 +1577,7 @@ again:
 	 */
 	local_irq_save(flags);
 
-	err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+	err = __nbcon_atomic_flush_pending_con(con, stop_seq);
 
 	local_irq_restore(flags);
 
@@ -1593,9 +1609,8 @@ again:
  * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
  *					write_atomic() callback
  * @stop_seq:			Flush up until this record
- * @allow_unsafe_takeover:	True, to allow unsafe hostile takeovers
  */
-static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
+static void __nbcon_atomic_flush_pending(u64 stop_seq)
 {
 	struct console *con;
 	int cookie;
@@ -1613,7 +1628,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove
 		if (nbcon_seq_read(con) >= stop_seq)
 			continue;
 
-		nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+		nbcon_atomic_flush_pending_con(con, stop_seq);
 	}
 	console_srcu_read_unlock(cookie);
 }
@@ -1629,7 +1644,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove
  */
 void nbcon_atomic_flush_pending(void)
 {
-	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
+	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
 }
 
 /**
@@ -1641,7 +1656,9 @@ void nbcon_atomic_flush_pending(void)
  */
 void nbcon_atomic_flush_unsafe(void)
 {
-	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
+	panic_nbcon_allow_unsafe_takeover = true;
+	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
+	panic_nbcon_allow_unsafe_takeover = false;
 }
 
 /**
@@ -1848,7 +1865,7 @@ void nbcon_device_release(struct console *con)
 		 * using the legacy loop.
 		 */
 		if (ft.nbcon_atomic) {
-			__nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
+			__nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb));
 		} else if (ft.legacy_direct) {
 			if (console_trylock())
 				console_unlock();
@@ -1918,5 +1935,5 @@ void nbcon_kdb_release(struct nbcon_write_context *wctxt)
 	 * The console was locked only when the write_atomic() callback
 	 * was usable.
 	 */
-	__nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb), false);
+	__nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb));
 }
-- 
cgit v1.2.3


From 7ab06ea41af53aa1713186ceaa154179e4b0d4c9 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyccccc@gmail.com>
Date: Wed, 5 Nov 2025 18:38:49 +0800
Subject: arch_topology: Provide a stub topology_core_has_smt() for
 !CONFIG_GENERIC_ARCH_TOPOLOGY

The arm_pmu driver is using topology_core_has_smt() for retrieving
the SMT implementation which depends on CONFIG_GENERIC_ARCH_TOPOLOGY.
The config is optional on arm platforms so provide a
!CONFIG_GENERIC_ARCH_TOPOLOGY stub for topology_core_has_smt().

Fixes: c3d78c34ad00 ("perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511041757.vuCGOmFc-lkp@intel.com/
Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Yicong Yang <yangyccccc@gmail.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arch_topology.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index daa1af2e8204..0c2a8b846c20 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -100,6 +100,10 @@ static inline bool topology_core_has_smt(int cpu)
 	return cpu_topology[cpu].thread_id != -1;
 }
 
-#endif
+#else
+
+static inline bool topology_core_has_smt(int cpu) { return false; }
+
+#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */
 
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
-- 
cgit v1.2.3


From 25976c314f6596254c9b1e2291d94393b7d5ae81 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 7 Nov 2025 15:38:44 +0900
Subject: block: introduce bdev_zone_start()

Introduce the function bdev_zone_start() as a more explicit (and clear)
replacement for ALIGN_DOWN() to get the start sector of a zone
containing a particular sector of a zoned block device.

Use this new helper in blkdev_get_zone_info() and
blkdev_report_zones_cached().

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 4 ++--
 include/linux/blkdev.h | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index b580d59ce210..3791755bc6ad 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -950,7 +950,7 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	memset(zone, 0, sizeof(*zone));
-	sector = ALIGN_DOWN(sector, zone_sectors);
+	sector = bdev_zone_start(bdev, sector);
 
 	if (!blkdev_has_cached_report_zones(bdev))
 		return blkdev_report_zone_fallback(bdev, sector, zone);
@@ -1068,7 +1068,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
 		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
 	}
 
-	for (sector = ALIGN_DOWN(sector, zone_sectors);
+	for (sector = bdev_zone_start(bdev, sector);
 	     sector < capacity && idx < nr_zones;
 	     sector += zone_sectors, idx++) {
 		ret = blkdev_get_zone_info(bdev, sector, &zone);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6a498aa7f7e7..2fff8a80dbd2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1522,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 	return q->limits.chunk_sectors;
 }
 
+static inline sector_t bdev_zone_start(struct block_device *bdev,
+				       sector_t sector)
+{
+	return sector & ~(bdev_zone_sectors(bdev) - 1);
+}
+
 static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
 						   sector_t sector)
 {
-- 
cgit v1.2.3


From be88c549e9d78828a2e06126ed7e17fc2e030f1f Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 6 Nov 2025 00:32:40 +0000
Subject: tcp: Call tcp_syn_ack_timeout() directly.

Since DCCP has been removed, we do not need to use
request_sock_ops.syn_ack_timeout().

Let's call tcp_syn_ack_timeout() directly.

Now all remaining function pointers of request_sock_ops are
protocol-dependent.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/request_sock.h      | 1 -
 net/ipv4/inet_connection_sock.c | 4 +++-
 net/ipv4/tcp_ipv4.c             | 1 -
 net/ipv4/tcp_timer.c            | 3 +--
 net/ipv6/tcp_ipv6.c             | 1 -
 5 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index cd4d4cf71d0d..9b9e04f6bb89 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -36,7 +36,6 @@ struct request_sock_ops {
 				      struct sk_buff *skb,
 				      enum sk_rst_reason reason);
 	void		(*destructor)(struct request_sock *req);
-	void		(*syn_ack_timeout)(const struct request_sock *req);
 };
 
 struct saved_syn {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3b83b66b2284..6a86c1ac3011 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1096,9 +1096,11 @@ static void reqsk_timer_handler(struct timer_list *t)
 			young <<= 1;
 		}
 	}
+
 	syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
 		       &expire, &resend);
-	req->rsk_ops->syn_ack_timeout(req);
+	tcp_syn_ack_timeout(req);
+
 	if (!expire &&
 	    (!resend ||
 	     !tcp_rtx_synack(sk_listener, req) ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b7526a7888cb..0cfebac33a91 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1660,7 +1660,6 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.send_ack	=	tcp_v4_reqsk_send_ack,
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
-	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2dd73a4e8e51..0672c3d8f4f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -458,7 +458,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_retries;
 
-	req->rsk_ops->syn_ack_timeout(req);
+	tcp_syn_ack_timeout(req);
 
 	/* Add one more retry for fastopen.
 	 * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
@@ -752,7 +752,6 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
 
 	__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
 }
-EXPORT_IPV6_MOD(tcp_syn_ack_timeout);
 
 void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
 {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7df21c1cba21..08113f430124 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -796,7 +796,6 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.send_ack	=	tcp_v6_reqsk_send_ack,
 	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset,
-	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
 const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
-- 
cgit v1.2.3


From 3ce5dd8161ecdf12ffe0af99ff8980f1432f64a5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 6 Nov 2025 00:32:41 +0000
Subject: tcp: Remove timeout arg from reqsk_queue_hash_req().

inet_csk_reqsk_queue_hash_add() is no longer shared by DCCP.

We do not need to pass req->timeout down to reqsk_queue_hash_req().

Let's move tcp_timeout_init() from tcp_conn_request() to
reqsk_queue_hash_req().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h |  3 +--
 net/ipv4/inet_connection_sock.c    | 11 +++++------
 net/ipv4/tcp_input.c               | 14 +++++---------
 3 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b4b886647607..90a99a2fc804 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -267,8 +267,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 				      struct request_sock *req,
 				      struct sock *child);
-bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
-				   unsigned long timeout);
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req,
 					 bool own_req);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6a86c1ac3011..d9c674403eb0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1144,8 +1144,7 @@ drop:
 	reqsk_put(oreq);
 }
 
-static bool reqsk_queue_hash_req(struct request_sock *req,
-				 unsigned long timeout)
+static bool reqsk_queue_hash_req(struct request_sock *req)
 {
 	bool found_dup_sk = false;
 
@@ -1153,8 +1152,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req,
 		return false;
 
 	/* The timer needs to be setup after a successful insertion. */
+	req->timeout = tcp_timeout_init((struct sock *)req);
 	timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
-	mod_timer(&req->rsk_timer, jiffies + timeout);
+	mod_timer(&req->rsk_timer, jiffies + req->timeout);
 
 	/* before letting lookups find us, make sure all req fields
 	 * are committed to memory and refcnt initialized.
@@ -1164,10 +1164,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req,
 	return true;
 }
 
-bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
-				   unsigned long timeout)
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req)
 {
-	if (!reqsk_queue_hash_req(req, timeout))
+	if (!reqsk_queue_hash_req(req))
 		return false;
 
 	inet_csk_reqsk_queue_added(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6db1d4c36a88..804ec56bdd24 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7531,15 +7531,11 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		sock_put(fastopen_sk);
 	} else {
 		tcp_rsk(req)->tfo_listener = false;
-		if (!want_cookie) {
-			req->timeout = tcp_timeout_init((struct sock *)req);
-			if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
-								    req->timeout))) {
-				reqsk_free(req);
-				dst_release(dst);
-				return 0;
-			}
-
+		if (!want_cookie &&
+		    unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
+			reqsk_free(req);
+			dst_release(dst);
+			return 0;
 		}
 		af_ops->send_synack(sk, dst, &fl, req, &foc,
 				    !want_cookie ? TCP_SYNACK_NORMAL :
-- 
cgit v1.2.3


From 207ce0f6bc131812c96cf4f6db328af5396cebac Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 6 Nov 2025 00:32:43 +0000
Subject: tcp: Remove timeout arg from reqsk_timeout().

reqsk_timeout() is always called with @timeout being TCP_RTO_MAX.

Let's remove the arg.

As a prep for the next patch, reqsk_timeout() is moved to tcp.h
and renamed to tcp_reqsk_timeout().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h | 8 --------
 include/net/tcp.h                  | 7 +++++++
 net/ipv4/inet_connection_sock.c    | 2 +-
 net/ipv4/tcp_minisocks.c           | 5 +++--
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 90a99a2fc804..fd40af2221b9 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -290,14 +290,6 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
 void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);
 
-static inline unsigned long
-reqsk_timeout(struct request_sock *req, unsigned long max_timeout)
-{
-	u64 timeout = (u64)req->timeout << req->num_timeout;
-
-	return (unsigned long)min_t(u64, timeout, max_timeout);
-}
-
 void inet_csk_destroy_sock(struct sock *sk);
 void inet_csk_prepare_for_destroy_sock(struct sock *sk);
 void inet_csk_prepare_forced_close(struct sock *sk);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0aa1f07d036a..0c7274ac7ed5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -841,6 +841,13 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
 }
 
+static inline unsigned long tcp_reqsk_timeout(struct request_sock *req)
+{
+	u64 timeout = (u64)req->timeout << req->num_timeout;
+
+	return (unsigned long)min_t(u64, timeout, TCP_RTO_MAX);
+}
+
 u32 tcp_delack_max(const struct sock *sk);
 
 /* Compute the actual rto_min value */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2bfe7af51bbb..b4eae731c9ba 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1105,7 +1105,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 	     inet_rsk(req)->acked)) {
 		if (req->num_timeout++ == 0)
 			atomic_dec(&queue->young);
-		mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
+		mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
 
 		if (!nreq)
 			return;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ded2cf1f6006..d8f4d813e8dd 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -714,7 +714,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			 * it can be estimated (approximately)
 			 * from another data.
 			 */
-			tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
+			tmp_opt.ts_recent_stamp = ktime_get_seconds() -
+				tcp_reqsk_timeout(req) / HZ;
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
@@ -753,7 +754,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		    !tcp_rtx_synack(sk, req)) {
 			unsigned long expires = jiffies;
 
-			expires += reqsk_timeout(req, TCP_RTO_MAX);
+			expires += tcp_reqsk_timeout(req);
 			if (!fastopen)
 				mod_timer_pending(&req->rsk_timer, expires);
 			else
-- 
cgit v1.2.3


From 1e9d3005e02cba82047d49f859982fc73b9a100b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 6 Nov 2025 00:32:44 +0000
Subject: tcp: Apply max RTO to non-TFO SYN+ACK.

Since commit 54a378f43425 ("tcp: add the ability to control
max RTO"), TFO SYN+ACK RTO is capped by the TFO full sk's
inet_csk(sk)->icsk_rto_max.

The value is inherited from the parent listener.

Let's apply the same cap to non-TFO SYN+ACK.

Note that req->rsk_listener is always non-NULL when we call
tcp_reqsk_timeout() in reqsk_timer_handler() or tcp_check_req().

It could be NULL for SYN cookie req, but we do not use
req->timeout then.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0c7274ac7ed5..4833ec7903ec 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -845,7 +845,8 @@ static inline unsigned long tcp_reqsk_timeout(struct request_sock *req)
 {
 	u64 timeout = (u64)req->timeout << req->num_timeout;
 
-	return (unsigned long)min_t(u64, timeout, TCP_RTO_MAX);
+	return (unsigned long)min_t(u64, timeout,
+				    tcp_rto_max(req->rsk_listener));
 }
 
 u32 tcp_delack_max(const struct sock *sk);
-- 
cgit v1.2.3


From 416dd649f3aa3774907c668167a29c668dbc634b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Nov 2025 11:52:36 +0000
Subject: tcp: add net.ipv4.tcp_comp_sack_rtt_percent

TCP SACK compression was added in 2018 in commit
5d9f4262b7ea ("tcp: add SACK compression").

It works well for WAN flows (with large RTT).
Wifi in particular gets a significant boost _when_ ACKs are suppressed.

Add a new sysctl so that we can tune the very conservative 5 % value
that has been used so far in this formula, so that small RTT flows
can benefit from this feature.

delay = min ( 5 % of RTT, 1 ms)

This patch adds new tcp_comp_sack_rtt_percent sysctl
to ease experiments and tuning.

Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl),
set the default value to 33 %.

Quoting Neal Cardwell ( https://lore.kernel.org/netdev/CADVnQymZ1tFnEA1Q=vtECs0=Db7zHQ8=+WCQtnhHFVbEOzjVnQ@mail.gmail.com/ )

The rationale for 33% is basically to try to facilitate pipelining,
where there are always at least 3 ACKs and 3 GSO/TSO skbs per SRTT, so
that the path can maintain a budget for 3 full-sized GSO/TSO skbs "in
flight" at all times:

+ 1 skb in the qdisc waiting to be sent by the NIC next
+ 1 skb being sent by the NIC (being serialized by the NIC out onto the wire)
+ 1 skb being received and aggregated by the receiver machine's
aggregation mechanism (some combination of LRO, GRO, and sack
compression)

Note that this is basically the same magic number (3) and the same
rationales as:

(a) tcp_tso_should_defer() ensuring that we defer sending data for no
longer than cwnd/tcp_tso_win_divisor (where tcp_tso_win_divisor = 3),
and
(b) bbr_quantization_budget() ensuring that cwnd is at least 3 GSO/TSO
skbs to maintain pipelining and full throughput at low RTTs

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20251106115236.3450026-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst | 13 +++++++++++--
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 net/ipv4/tcp_input.c                   | 26 ++++++++++++++++++--------
 net/ipv4/tcp_ipv4.c                    |  1 +
 5 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 7cd35bfd39e6..2bae61be1859 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -854,9 +854,18 @@ tcp_sack - BOOLEAN
 
 	Default: 1 (enabled)
 
+tcp_comp_sack_rtt_percent - INTEGER
+	Percentage of SRTT used for the compressed SACK feature.
+	See tcp_comp_sack_nr, tcp_comp_sack_delay_ns, tcp_comp_sack_slack_ns.
+
+	Possible values : 1 - 1000
+
+	Default : 33 %
+
 tcp_comp_sack_delay_ns - LONG INTEGER
-	TCP tries to reduce number of SACK sent, using a timer
-	based on 5% of SRTT, capped by this sysctl, in nano seconds.
+	TCP tries to reduce number of SACK sent, using a timer based
+	on tcp_comp_sack_rtt_percent of SRTT, capped by this sysctl
+	in nano seconds.
 	The default is 1ms, based on TSO autosizing period.
 
 	Default : 1,000,000 ns (1 ms)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 0e96c90e56c6..de9d36acc8e2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -221,6 +221,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_pacing_ss_ratio;
 	int sysctl_tcp_pacing_ca_ratio;
 	unsigned int sysctl_tcp_child_ehash_entries;
+	int sysctl_tcp_comp_sack_rtt_percent;
 	unsigned long sysctl_tcp_comp_sack_delay_ns;
 	unsigned long sysctl_tcp_comp_sack_slack_ns;
 	int sysctl_max_syn_backlog;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0c7c8f9041cb..35367f8e2da3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1451,6 +1451,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "tcp_comp_sack_rtt_percent",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE_THOUSAND,
+	},
 	{
 		.procname	= "tcp_comp_sack_slack_ns",
 		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 804ec56bdd24..9df5d7515605 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5893,7 +5893,9 @@ static inline void tcp_data_snd_check(struct sock *sk)
 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned long rtt, delay;
+	struct net *net = sock_net(sk);
+	unsigned long rtt;
+	u64 delay;
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5912,7 +5914,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 		 * Defer the ack until tcp_release_cb().
 		 */
 		if (sock_owned_by_user_nocheck(sk) &&
-		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+		    READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
 			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
 			return;
 		}
@@ -5927,7 +5929,7 @@ send_now:
 	}
 
 	if (!tcp_is_sack(tp) ||
-	    tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
+	    tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
 		goto send_now;
 
 	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
@@ -5942,18 +5944,26 @@ send_now:
 	if (hrtimer_is_queued(&tp->compressed_ack_timer))
 		return;
 
-	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+	/* compress ack timer : comp_sack_rtt_percent of rtt,
+	 * but no more than tcp_comp_sack_delay_ns.
+	 */
 
 	rtt = tp->rcv_rtt_est.rtt_us;
 	if (tp->srtt_us && tp->srtt_us < rtt)
 		rtt = tp->srtt_us;
 
-	delay = min_t(unsigned long,
-		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
-		      rtt * (NSEC_PER_USEC >> 3)/20);
+	/* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
+	 * ->
+	 * delay = rtt * 1.25 * comp_sack_rtt_percent
+	 */
+	delay = (u64)(rtt + (rtt >> 2)) *
+		READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
+
+	delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
+
 	sock_hold(sk);
 	hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
+			       READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
 			       HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0cfebac33a91..a7d9fec2950b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3595,6 +3595,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
-- 
cgit v1.2.3


From b87ee13e34931779ac1dcd3264beba50b54966fd Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Date: Wed, 5 Nov 2025 10:42:12 +0530
Subject: net: phy: phy-c45: add OATC14 10BASE-T1S PHY cable diagnostic support

Add support for Open Alliance TC14 (OATC14) 10BASE-T1S PHYs cable
diagnostic feature.

This patch implements:
- genphy_c45_oatc14_cable_test_start() to initiate a cable test
- genphy_c45_oatc14_cable_test_get_status() to retrieve test results
- Helper function to map PHY cable test status to ethtool result codes
- Function declarations and exports for use by PHY drivers

This enables ethtool to report ok, open, short, and undetectable cable
conditions on OATC14 10Base-T1S PHYs.

Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features
Specification ref:
https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Link: https://patch.msgid.link/20251105051213.50443-2-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mdio-open-alliance.h |  36 +++++++++++
 drivers/net/phy/phy-c45.c            | 122 +++++++++++++++++++++++++++++++++++
 include/linux/phy.h                  |   3 +
 3 files changed, 161 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/mdio-open-alliance.h b/drivers/net/phy/mdio-open-alliance.h
index 931e14660d75..6850a3f0b31e 100644
--- a/drivers/net/phy/mdio-open-alliance.h
+++ b/drivers/net/phy/mdio-open-alliance.h
@@ -43,4 +43,40 @@
 /* Version Identifiers */
 #define OATC14_IDM		0x0a00
 
+/*
+ * Open Alliance TC14 (10BASE-T1S) - Advanced Diagnostic Features Registers
+ *
+ * Refer to the OPEN Alliance documentation:
+ *   https://opensig.org/automotive-ethernet-specifications/
+ *
+ * Specification:
+ *   "10BASE-T1S Advanced Diagnostic PHY Features"
+ *   https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf
+ */
+/* Advanced Diagnostic Features Capability Register */
+#define MDIO_OATC14_ADFCAP		0xcc00
+#define OATC14_ADFCAP_HDD_CAPABILITY	GENMASK(10, 8)
+
+/* Harness Defect Detection Register */
+#define MDIO_OATC14_HDD			0xcc01
+#define OATC14_HDD_CONTROL		BIT(15)
+#define OATC14_HDD_READY		BIT(14)
+#define OATC14_HDD_START_CONTROL	BIT(13)
+#define OATC14_HDD_VALID		BIT(2)
+#define OATC14_HDD_SHORT_OPEN_STATUS	GENMASK(1, 0)
+
+/* Bus Short/Open Status:
+ * 0 0 - no fault; everything is ok. (Default)
+ * 0 1 - detected as an open or missing termination(s)
+ * 1 0 - detected as a short or extra termination(s)
+ * 1 1 - fault but fault type not detectable. More details can be available by
+ *       vendor-specific register if supported.
+ */
+enum oatc14_hdd_status {
+	OATC14_HDD_STATUS_CABLE_OK = 0,
+	OATC14_HDD_STATUS_OPEN,
+	OATC14_HDD_STATUS_SHORT,
+	OATC14_HDD_STATUS_NOT_DETECTABLE,
+};
+
 #endif /* __MDIO_OPEN_ALLIANCE__ */
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 1a7b32be4625..e8e5be4684ab 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -7,6 +7,7 @@
 #include <linux/mdio.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <linux/ethtool_netlink.h>
 
 #include "mdio-open-alliance.h"
 #include "phylib-internal.h"
@@ -1573,3 +1574,124 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 	return ret;
 }
 EXPORT_SYMBOL(genphy_c45_ethtool_set_eee);
+
+/**
+ * oatc14_cable_test_get_result_code - Convert hardware cable test status to
+ *                                     ethtool result code.
+ * @status: The hardware-reported cable test status
+ *
+ * This helper function maps the OATC14 HDD cable test status to the
+ * corresponding ethtool cable test result code. It provides a translation
+ * between the device-specific status values and the standardized ethtool
+ * result codes.
+ *
+ * Return:
+ * * ETHTOOL_A_CABLE_RESULT_CODE_OK          - Cable is OK
+ * * ETHTOOL_A_CABLE_RESULT_CODE_OPEN        - Open circuit detected
+ * * ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT  - Short circuit detected
+ * * ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC      - Status not detectable or invalid
+ */
+static int oatc14_cable_test_get_result_code(enum oatc14_hdd_status status)
+{
+	switch (status) {
+	case OATC14_HDD_STATUS_CABLE_OK:
+		return ETHTOOL_A_CABLE_RESULT_CODE_OK;
+	case OATC14_HDD_STATUS_OPEN:
+		return ETHTOOL_A_CABLE_RESULT_CODE_OPEN;
+	case OATC14_HDD_STATUS_SHORT:
+		return ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT;
+	case OATC14_HDD_STATUS_NOT_DETECTABLE:
+	default:
+		return ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC;
+	}
+}
+
+/**
+ * genphy_c45_oatc14_cable_test_get_status - Get status of OATC14 10Base-T1S
+ *                                           PHY cable test.
+ * @phydev:   pointer to the PHY device structure
+ * @finished: pointer to a boolean set true if the test is complete
+ *
+ * Retrieves the current status of the OATC14 10Base-T1S PHY cable test.
+ * This function reads the OATC14 HDD register to determine whether the test
+ * results are valid and whether the test has finished.
+ *
+ * If the test is complete, the function reports the cable test result via
+ * the ethtool cable test interface using ethnl_cable_test_result(), and then
+ * clears the test control bit in the PHY register to reset the test state.
+ *
+ * Return: 0 on success, or a negative error code on failure (e.g. register
+ *         read/write error).
+ */
+int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev,
+					    bool *finished)
+{
+	int ret;
+	u8 sts;
+
+	*finished = false;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD);
+	if (ret < 0)
+		return ret;
+
+	if (!(ret & OATC14_HDD_VALID))
+		return 0;
+
+	*finished = true;
+
+	sts = FIELD_GET(OATC14_HDD_SHORT_OPEN_STATUS, ret);
+
+	ret = ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
+				      oatc14_cable_test_get_result_code(sts));
+	if (ret)
+		return ret;
+
+	return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2,
+				  MDIO_OATC14_HDD, OATC14_HDD_CONTROL);
+}
+EXPORT_SYMBOL(genphy_c45_oatc14_cable_test_get_status);
+
+/**
+ * genphy_c45_oatc14_cable_test_start - Start a cable test on an OATC14
+ *                                      10Base-T1S PHY.
+ * @phydev: Pointer to the PHY device structure
+ *
+ * This function initiates a cable diagnostic test on a Clause 45 OATC14
+ * 10Base-T1S capable PHY device. It first reads the PHY's advanced diagnostic
+ * capability register to check if Harness Defect Detection (HDD) is
+ * supported. If the PHY does not report HDD capability, cable testing is not
+ * supported and the function returns -EOPNOTSUPP.
+ *
+ * For PHYs that support HDD, the function sets the appropriate control bits in
+ * the OATC14_HDD register to enable and start the cable diagnostic test.
+ *
+ * Return:
+ * * 0 on success
+ * * -EOPNOTSUPP if the PHY does not support HDD capability
+ * * A negative error code on I/O or register access failures
+ */
+int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_ADFCAP);
+	if (ret < 0)
+		return ret;
+
+	if (!(ret & OATC14_ADFCAP_HDD_CAPABILITY))
+		return -EOPNOTSUPP;
+
+	ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD,
+			       OATC14_HDD_CONTROL);
+	if (ret)
+		return ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD);
+	if (ret < 0)
+		return ret;
+
+	return phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD,
+				OATC14_HDD_START_CONTROL);
+}
+EXPORT_SYMBOL(genphy_c45_oatc14_cable_test_start);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d145a200ea21..bf5457341ca8 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2251,6 +2251,9 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
 int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 			       struct ethtool_keee *data);
 int genphy_c45_an_config_eee_aneg(struct phy_device *phydev);
+int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev);
+int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev,
+					    bool *finished);
 
 /* The gen10g_* functions are the old Clause 45 stub */
 int gen10g_config_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3


From f73e0f46bbfab29b111ff52d047f15aa13623972 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 5 Nov 2025 23:09:17 +0100
Subject: net: phy: fixed_phy: shrink size of struct fixed_phy_status

All three members are effectively of type bool, so make this explicit
and shrink size of struct fixed_phy_status.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/9eca3d7e-fa64-4724-8fdc-f2c1a8f2ae8f@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/fixed_phy.c | 2 +-
 include/linux/phy_fixed.h   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index d498d8a9bba6..9bd6937411e4 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(fixed_phy_register);
 struct phy_device *fixed_phy_register_100fd(void)
 {
 	static const struct fixed_phy_status status = {
-		.link	= 1,
+		.link	= true,
 		.speed	= SPEED_100,
 		.duplex	= DUPLEX_FULL,
 	};
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 8bade999831c..436bff20f324 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -5,11 +5,11 @@
 #include <linux/types.h>
 
 struct fixed_phy_status {
-	int link;
 	int speed;
 	int duplex;
-	int pause;
-	int asym_pause;
+	bool link:1;
+	bool pause:1;
+	bool asym_pause:1;
 };
 
 struct device_node;
-- 
cgit v1.2.3


From dae4a92399fa8d68aa917db6bb3245f83021e762 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 5 Nov 2025 16:26:02 -0800
Subject: psp: report basic stats from the core

Track and report stats common to all psp devices from the core. A
'stale-event' is when the core marks the rx state of an active
psp_assoc as incapable of authenticating psp encapsulated data.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-2-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/psp.yaml | 40 +++++++++++++++++++
 include/net/psp/types.h              |  9 +++++
 include/uapi/linux/psp.h             | 10 +++++
 net/psp/psp-nl-gen.c                 | 19 +++++++++
 net/psp/psp-nl-gen.h                 |  2 +
 net/psp/psp_nl.c                     | 74 ++++++++++++++++++++++++++++++++++++
 net/psp/psp_sock.c                   |  4 +-
 7 files changed, 157 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
index 944429e5c9a8..914148221384 100644
--- a/Documentation/netlink/specs/psp.yaml
+++ b/Documentation/netlink/specs/psp.yaml
@@ -76,6 +76,28 @@ attribute-sets:
         name: spi
         doc: Security Parameters Index (SPI) of the association.
         type: u32
+  -
+    name: stats
+    attributes:
+      -
+        name: dev-id
+        doc: PSP device ID.
+        type: u32
+        checks:
+          min: 1
+      -
+        name: key-rotations
+        type: uint
+        doc: |
+          Number of key rotations during the lifetime of the device.
+          Kernel statistic.
+      -
+        name: stale-events
+        type: uint
+        doc: |
+          Number of times a socket's Rx got shut down due to using
+          a key which went stale (fully rotated out).
+          Kernel statistic.
 
 operations:
   list:
@@ -177,6 +199,24 @@ operations:
         pre: psp-assoc-device-get-locked
         post: psp-device-unlock
 
+    -
+      name: get-stats
+      doc: Get device statistics.
+      attribute-set: stats
+      do:
+        request:
+          attributes:
+            - dev-id
+        reply: &stats-all
+          attributes:
+            - dev-id
+            - key-rotations
+            - stale-events
+        pre: psp-device-get-locked
+        post: psp-device-unlock
+      dump:
+        reply: *stats-all
+
 mcast-groups:
   list:
     -
diff --git a/include/net/psp/types.h b/include/net/psp/types.h
index 31cee64b7c86..5b0ccaac3882 100644
--- a/include/net/psp/types.h
+++ b/include/net/psp/types.h
@@ -59,6 +59,10 @@ struct psp_dev_config {
  *			device key
  * @stale_assocs:	associations which use a rotated out key
  *
+ * @stats:	statistics maintained by the core
+ * @stats.rotations:	See stats attr key-rotations
+ * @stats.stales:	See stats attr stale-events
+ *
  * @rcu:	RCU head for freeing the structure
  */
 struct psp_dev {
@@ -81,6 +85,11 @@ struct psp_dev {
 	struct list_head prev_assocs;
 	struct list_head stale_assocs;
 
+	struct {
+		unsigned long rotations;
+		unsigned long stales;
+	} stats;
+
 	struct rcu_head rcu;
 };
 
diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h
index 607c42c39ba5..31592760ad79 100644
--- a/include/uapi/linux/psp.h
+++ b/include/uapi/linux/psp.h
@@ -45,6 +45,15 @@ enum {
 	PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1)
 };
 
+enum {
+	PSP_A_STATS_DEV_ID = 1,
+	PSP_A_STATS_KEY_ROTATIONS,
+	PSP_A_STATS_STALE_EVENTS,
+
+	__PSP_A_STATS_MAX,
+	PSP_A_STATS_MAX = (__PSP_A_STATS_MAX - 1)
+};
+
 enum {
 	PSP_CMD_DEV_GET = 1,
 	PSP_CMD_DEV_ADD_NTF,
@@ -55,6 +64,7 @@ enum {
 	PSP_CMD_KEY_ROTATE_NTF,
 	PSP_CMD_RX_ASSOC,
 	PSP_CMD_TX_ASSOC,
+	PSP_CMD_GET_STATS,
 
 	__PSP_CMD_MAX,
 	PSP_CMD_MAX = (__PSP_CMD_MAX - 1)
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
index 9fdd6f831803..73f8b06d66f0 100644
--- a/net/psp/psp-nl-gen.c
+++ b/net/psp/psp-nl-gen.c
@@ -47,6 +47,11 @@ static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] =
 	[PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, },
 };
 
+/* PSP_CMD_GET_STATS - do */
+static const struct nla_policy psp_get_stats_nl_policy[PSP_A_STATS_DEV_ID + 1] = {
+	[PSP_A_STATS_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
 /* Ops table for psp */
 static const struct genl_split_ops psp_nl_ops[] = {
 	{
@@ -99,6 +104,20 @@ static const struct genl_split_ops psp_nl_ops[] = {
 		.maxattr	= PSP_A_ASSOC_SOCK_FD,
 		.flags		= GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= PSP_CMD_GET_STATS,
+		.pre_doit	= psp_device_get_locked,
+		.doit		= psp_nl_get_stats_doit,
+		.post_doit	= psp_device_unlock,
+		.policy		= psp_get_stats_nl_policy,
+		.maxattr	= PSP_A_STATS_DEV_ID,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= PSP_CMD_GET_STATS,
+		.dumpit	= psp_nl_get_stats_dumpit,
+		.flags	= GENL_CMD_CAP_DUMP,
+	},
 };
 
 static const struct genl_multicast_group psp_nl_mcgrps[] = {
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
index 25268ed11fb5..5bc3b5d5a53e 100644
--- a/net/psp/psp-nl-gen.h
+++ b/net/psp/psp-nl-gen.h
@@ -28,6 +28,8 @@ int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info);
 int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info);
 int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
 int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_get_stats_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 
 enum {
 	PSP_NLGRP_MGMT,
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
index 8aaca62744c3..f990cccbe99c 100644
--- a/net/psp/psp_nl.c
+++ b/net/psp/psp_nl.c
@@ -262,6 +262,7 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info)
 		     psd->generation & ~PSP_GEN_VALID_MASK);
 
 	psp_assocs_key_rotated(psd);
+	psd->stats.rotations++;
 
 	nlmsg_end(ntf, (struct nlmsghdr *)ntf->data);
 	genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf,
@@ -503,3 +504,76 @@ err_free_msg:
 	nlmsg_free(rsp);
 	return err;
 }
+
+static int
+psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp,
+		  const struct genl_info *info)
+{
+	void *hdr;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(rsp, PSP_A_STATS_DEV_ID, psd->id) ||
+	    nla_put_uint(rsp, PSP_A_STATS_KEY_ROTATIONS,
+			 psd->stats.rotations) ||
+	    nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales))
+		goto err_cancel_msg;
+
+	genlmsg_end(rsp, hdr);
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(rsp, hdr);
+	return -EMSGSIZE;
+}
+
+int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct psp_dev *psd = info->user_ptr[0];
+	struct sk_buff *rsp;
+	int err;
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	err = psp_nl_stats_fill(psd, rsp, info);
+	if (err)
+		goto err_free_msg;
+
+	return genlmsg_reply(rsp, info);
+
+err_free_msg:
+	nlmsg_free(rsp);
+	return err;
+}
+
+static int
+psp_nl_stats_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb,
+			    struct psp_dev *psd)
+{
+	if (psp_dev_check_access(psd, sock_net(rsp->sk)))
+		return 0;
+
+	return psp_nl_stats_fill(psd, rsp, genl_info_dump(cb));
+}
+
+int psp_nl_get_stats_dumpit(struct sk_buff *rsp, struct netlink_callback *cb)
+{
+	struct psp_dev *psd;
+	int err = 0;
+
+	mutex_lock(&psp_devs_lock);
+	xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) {
+		mutex_lock(&psd->lock);
+		err = psp_nl_stats_get_dumpit_one(rsp, cb, psd);
+		mutex_unlock(&psd->lock);
+		if (err)
+			break;
+	}
+	mutex_unlock(&psp_devs_lock);
+
+	return err;
+}
diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c
index a931d825d1cc..f785672b7df6 100644
--- a/net/psp/psp_sock.c
+++ b/net/psp/psp_sock.c
@@ -253,8 +253,10 @@ void psp_assocs_key_rotated(struct psp_dev *psd)
 	/* Mark the stale associations as invalid, they will no longer
 	 * be able to Rx any traffic.
 	 */
-	list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list)
+	list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) {
 		pas->generation |= ~PSP_GEN_VALID_MASK;
+		psd->stats.stales++;
+	}
 	list_splice_init(&psd->prev_assocs, &psd->stale_assocs);
 	list_splice_init(&psd->active_assocs, &psd->prev_assocs);
 
-- 
cgit v1.2.3


From f05d26198cf2c71f25f6bbe62ca4481c15543922 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 5 Nov 2025 16:26:04 -0800
Subject: psp: add stats from psp spec to driver facing api

Provide a driver api for reporting device statistics required by the
"Implementation Requirements" section of the PSP Architecture
Specification. Use a warning to ensure drivers report stats required
by the spec.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-4-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/psp.yaml | 55 ++++++++++++++++++++++++++++++++++++
 include/net/psp/types.h              | 23 +++++++++++++++
 include/uapi/linux/psp.h             |  8 ++++++
 net/psp/psp_main.c                   |  3 +-
 net/psp/psp_nl.c                     | 21 +++++++++++++-
 5 files changed, 108 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
index 914148221384..f3a57782d2cf 100644
--- a/Documentation/netlink/specs/psp.yaml
+++ b/Documentation/netlink/specs/psp.yaml
@@ -98,6 +98,61 @@ attribute-sets:
           Number of times a socket's Rx got shut down due to using
           a key which went stale (fully rotated out).
           Kernel statistic.
+      -
+        name: rx-packets
+        type: uint
+        doc: |
+          Number of successfully processed and authenticated PSP packets.
+          Device statistic (from the PSP spec).
+      -
+        name: rx-bytes
+        type: uint
+        doc: |
+          Number of successfully authenticated PSP bytes received, counting from
+          the first byte after the IV through the last byte of payload.
+          The fixed initial portion of the PSP header (16 bytes)
+          and the PSP trailer/ICV (16 bytes) are not included in this count.
+          Device statistic (from the PSP spec).
+      -
+        name: rx-auth-fail
+        type: uint
+        doc: |
+          Number of received PSP packets with unsuccessful authentication.
+          Device statistic (from the PSP spec).
+      -
+        name: rx-error
+        type: uint
+        doc: |
+          Number of received PSP packets with length/framing errors.
+          Device statistic (from the PSP spec).
+      -
+        name: rx-bad
+        type: uint
+        doc: |
+          Number of received PSP packets with miscellaneous errors
+          (invalid master key indicated by SPI, unsupported version, etc.)
+          Device statistic (from the PSP spec).
+      -
+        name: tx-packets
+        type: uint
+        doc: |
+          Number of successfully processed PSP packets for transmission.
+          Device statistic (from the PSP spec).
+      -
+        name: tx-bytes
+        type: uint
+        doc: |
+          Number of successfully processed PSP bytes for transmit, counting from
+          the first byte after the IV through the last byte of payload.
+          The fixed initial portion of the PSP header (16 bytes)
+          and the PSP trailer/ICV (16 bytes) are not included in this count.
+          Device statistic (from the PSP spec).
+      -
+        name: tx-error
+        type: uint
+        doc: |
+          Number of PSP packets for transmission with errors.
+          Device statistic (from the PSP spec).
 
 operations:
   list:
diff --git a/include/net/psp/types.h b/include/net/psp/types.h
index 5b0ccaac3882..25a9096d4e7d 100644
--- a/include/net/psp/types.h
+++ b/include/net/psp/types.h
@@ -150,6 +150,22 @@ struct psp_assoc {
 	u8 drv_data[] __aligned(8);
 };
 
+struct psp_dev_stats {
+	union {
+		struct {
+			u64 rx_packets;
+			u64 rx_bytes;
+			u64 rx_auth_fail;
+			u64 rx_error;
+			u64 rx_bad;
+			u64 tx_packets;
+			u64 tx_bytes;
+			u64 tx_error;
+		};
+		DECLARE_FLEX_ARRAY(u64, required);
+	};
+};
+
 /**
  * struct psp_dev_ops - netdev driver facing PSP callbacks
  */
@@ -188,6 +204,13 @@ struct psp_dev_ops {
 	 * Remove an association from the device.
 	 */
 	void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas);
+
+	/**
+	 * @get_stats: get statistics from the device
+	 * Stats required by the spec must be maintained and filled in.
+	 * Stats must be filled in member-by-member, never memset the struct.
+	 */
+	void (*get_stats)(struct psp_dev *psd, struct psp_dev_stats *stats);
 };
 
 #endif /* __NET_PSP_H */
diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h
index 31592760ad79..d8449c043ba1 100644
--- a/include/uapi/linux/psp.h
+++ b/include/uapi/linux/psp.h
@@ -49,6 +49,14 @@ enum {
 	PSP_A_STATS_DEV_ID = 1,
 	PSP_A_STATS_KEY_ROTATIONS,
 	PSP_A_STATS_STALE_EVENTS,
+	PSP_A_STATS_RX_PACKETS,
+	PSP_A_STATS_RX_BYTES,
+	PSP_A_STATS_RX_AUTH_FAIL,
+	PSP_A_STATS_RX_ERROR,
+	PSP_A_STATS_RX_BAD,
+	PSP_A_STATS_TX_PACKETS,
+	PSP_A_STATS_TX_BYTES,
+	PSP_A_STATS_TX_ERROR,
 
 	__PSP_A_STATS_MAX,
 	PSP_A_STATS_MAX = (__PSP_A_STATS_MAX - 1)
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 481aaf0fc9fc..a8534124f626 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -60,7 +60,8 @@ psp_dev_create(struct net_device *netdev,
 		    !psd_ops->key_rotate ||
 		    !psd_ops->rx_spi_alloc ||
 		    !psd_ops->tx_key_add ||
-		    !psd_ops->tx_key_del))
+		    !psd_ops->tx_key_del ||
+		    !psd_ops->get_stats))
 		return ERR_PTR(-EINVAL);
 
 	psd = kzalloc(sizeof(*psd), GFP_KERNEL);
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
index f990cccbe99c..6afd7707ec12 100644
--- a/net/psp/psp_nl.c
+++ b/net/psp/psp_nl.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
+#include <linux/ethtool.h>
 #include <linux/skbuff.h>
 #include <linux/xarray.h>
 #include <net/genetlink.h>
@@ -509,7 +510,17 @@ static int
 psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp,
 		  const struct genl_info *info)
 {
+	unsigned int required_cnt = sizeof(struct psp_dev_stats) / sizeof(u64);
+	struct psp_dev_stats stats;
 	void *hdr;
+	int i;
+
+	memset(&stats, 0xff, sizeof(stats));
+	psd->ops->get_stats(psd, &stats);
+
+	for (i = 0; i < required_cnt; i++)
+		if (WARN_ON_ONCE(stats.required[i] == ETHTOOL_STAT_NOT_SET))
+			return -EOPNOTSUPP;
 
 	hdr = genlmsg_iput(rsp, info);
 	if (!hdr)
@@ -518,7 +529,15 @@ psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp,
 	if (nla_put_u32(rsp, PSP_A_STATS_DEV_ID, psd->id) ||
 	    nla_put_uint(rsp, PSP_A_STATS_KEY_ROTATIONS,
 			 psd->stats.rotations) ||
-	    nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales))
+	    nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales) ||
+	    nla_put_uint(rsp, PSP_A_STATS_RX_PACKETS, stats.rx_packets) ||
+	    nla_put_uint(rsp, PSP_A_STATS_RX_BYTES, stats.rx_bytes) ||
+	    nla_put_uint(rsp, PSP_A_STATS_RX_AUTH_FAIL, stats.rx_auth_fail) ||
+	    nla_put_uint(rsp, PSP_A_STATS_RX_ERROR, stats.rx_error) ||
+	    nla_put_uint(rsp, PSP_A_STATS_RX_BAD, stats.rx_bad) ||
+	    nla_put_uint(rsp, PSP_A_STATS_TX_PACKETS, stats.tx_packets) ||
+	    nla_put_uint(rsp, PSP_A_STATS_TX_BYTES, stats.tx_bytes) ||
+	    nla_put_uint(rsp, PSP_A_STATS_TX_ERROR, stats.tx_error))
 		goto err_cancel_msg;
 
 	genlmsg_end(rsp, hdr);
-- 
cgit v1.2.3


From 8fdfdb1488162c195f3f0af10b7bc2b8b42928c5 Mon Sep 17 00:00:00 2001
From: Markus Probst <markus.probst@posteo.de>
Date: Tue, 4 Nov 2025 14:24:32 +0000
Subject: scsi: sd: Add manage_restart device attribute to scsi_disk

In addition to the already existing manage_shutdown,
manage_system_start_stop and manage_runtime_start_stop scsi_disk device
attributes, add manage_restart, which, if set to 1, allows the
high-level device driver (sd) to manage the device power state for
SYSTEM_RESTART.

This attribute is necessary for the following commit "ata: stop disk on
restart if ACPI power resources are found" to avoid a potential disk
power failure in the case the SATA power connector does not retain the
power state after a restart.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Markus Probst <markus.probst@posteo.de>
Link: https://patch.msgid.link/20251104142413.322347-2-markus.probst@posteo.de
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sd.c          | 34 +++++++++++++++++++++++++++++++++-
 include/scsi/scsi_device.h |  6 ++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0252d3f6bed1..f2c0744b4480 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -318,6 +318,35 @@ static ssize_t manage_shutdown_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(manage_shutdown);
 
+static ssize_t manage_restart_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+	struct scsi_device *sdp = sdkp->device;
+
+	return sysfs_emit(buf, "%u\n", sdp->manage_restart);
+}
+
+static ssize_t manage_restart_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+	struct scsi_device *sdp = sdkp->device;
+	bool v;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (kstrtobool(buf, &v))
+		return -EINVAL;
+
+	sdp->manage_restart = v;
+
+	return count;
+}
+static DEVICE_ATTR_RW(manage_restart);
+
 static ssize_t
 allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
@@ -654,6 +683,7 @@ static struct attribute *sd_disk_attrs[] = {
 	&dev_attr_manage_system_start_stop.attr,
 	&dev_attr_manage_runtime_start_stop.attr,
 	&dev_attr_manage_shutdown.attr,
+	&dev_attr_manage_restart.attr,
 	&dev_attr_protection_type.attr,
 	&dev_attr_protection_mode.attr,
 	&dev_attr_app_tag_own.attr,
@@ -4177,7 +4207,9 @@ static void sd_shutdown(struct device *dev)
 	    (system_state == SYSTEM_POWER_OFF &&
 	     sdkp->device->manage_shutdown) ||
 	    (system_state == SYSTEM_RUNNING &&
-	     sdkp->device->manage_runtime_start_stop)) {
+	     sdkp->device->manage_runtime_start_stop) ||
+	    (system_state == SYSTEM_RESTART &&
+	     sdkp->device->manage_restart)) {
 		sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
 		sd_start_stop_device(sdkp, 0);
 	}
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 6d6500148c4b..c7e657ac8b6d 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -178,6 +178,12 @@ struct scsi_device {
 	 */
 	unsigned manage_shutdown:1;
 
+	/*
+	 * If true, let the high-level device driver (sd) manage the device
+	 * power state for system restart (reboot) operations.
+	 */
+	unsigned manage_restart:1;
+
 	/*
 	 * If set and if the device is runtime suspended, ask the high-level
 	 * device driver (sd) to force a runtime resume of the device.
-- 
cgit v1.2.3


From bf9f0b00bb7fd0470c1255bcc8e76c81d122a609 Mon Sep 17 00:00:00 2001
From: Jai Luthra <jai.luthra@ideasonboard.com>
Date: Wed, 29 Oct 2025 16:00:08 +0530
Subject: include: linux: Destage VCHIQ interface headers

Move the VCHIQ headers from drivers/staging/vc04_services/include and
drivers/staging/vc04_services/interface/vchiq_arm to
include/linux/raspberrypi.

This is done so that they can be shared between the VCHIQ interface
(which is going to be de-staged in a subsequent commit) and the VCHIQ
drivers that remain in staging/vc04_services (namely bcm2835-audio and
bcm2835-camera).

The include/linux/raspberrypi/ directory provides a central location
that serves both of these areas.

Co-developed-by: Umang Jain <umang.jain@ideasonboard.com>
Signed-off-by: Umang Jain <umang.jain@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Jai Luthra <jai.luthra@ideasonboard.com>
Link: https://patch.msgid.link/20251029-vchiq-destage-v3-4-da8d6c83c2c5@ideasonboard.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                                        |   1 +
 .../vc04_services/bcm2835-audio/bcm2835-vchiq.c    |   5 +-
 .../staging/vc04_services/bcm2835-audio/bcm2835.c  |   3 +-
 .../staging/vc04_services/bcm2835-audio/bcm2835.h  |   3 +-
 .../include/linux/raspberrypi/vchiq.h              | 112 ----
 .../vc04_services/interface/vchiq_arm/vchiq_arm.c  |   9 +-
 .../vc04_services/interface/vchiq_arm/vchiq_arm.h  | 164 ------
 .../vc04_services/interface/vchiq_arm/vchiq_bus.c  |   4 +-
 .../vc04_services/interface/vchiq_arm/vchiq_bus.h  |  60 --
 .../vc04_services/interface/vchiq_arm/vchiq_cfg.h  |  41 --
 .../vc04_services/interface/vchiq_arm/vchiq_core.c |   4 +-
 .../vc04_services/interface/vchiq_arm/vchiq_core.h | 646 ---------------------
 .../interface/vchiq_arm/vchiq_debugfs.c            |   6 +-
 .../interface/vchiq_arm/vchiq_debugfs.h            |  22 -
 .../vc04_services/interface/vchiq_arm/vchiq_dev.c  |   7 +-
 .../interface/vchiq_arm/vchiq_ioctl.h              |   3 +-
 .../staging/vc04_services/vchiq-mmal/mmal-vchiq.c  |   5 +-
 include/linux/raspberrypi/vchiq.h                  | 112 ++++
 include/linux/raspberrypi/vchiq_arm.h              | 164 ++++++
 include/linux/raspberrypi/vchiq_bus.h              |  60 ++
 include/linux/raspberrypi/vchiq_cfg.h              |  41 ++
 include/linux/raspberrypi/vchiq_core.h             | 646 +++++++++++++++++++++
 include/linux/raspberrypi/vchiq_debugfs.h          |  22 +
 23 files changed, 1072 insertions(+), 1068 deletions(-)
 delete mode 100644 drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h
 delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h
 delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
 delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h
 delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h
 delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h
 create mode 100644 include/linux/raspberrypi/vchiq.h
 create mode 100644 include/linux/raspberrypi/vchiq_arm.h
 create mode 100644 include/linux/raspberrypi/vchiq_bus.h
 create mode 100644 include/linux/raspberrypi/vchiq_cfg.h
 create mode 100644 include/linux/raspberrypi/vchiq_core.h
 create mode 100644 include/linux/raspberrypi/vchiq_debugfs.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3da2c26a796b..cd223e119d48 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4829,6 +4829,7 @@ T:	git https://github.com/broadcom/stblinux.git
 F:	Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml
 F:	drivers/pci/controller/pcie-brcmstb.c
 F:	drivers/staging/vc04_services
+F:	include/linux/raspberrypi/vchiq*
 N:	bcm2711
 N:	bcm2712
 N:	bcm283*
diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c
index 0dbe76ee5570..7368b384497f 100644
--- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c
+++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c
@@ -4,11 +4,12 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/completion.h>
+
+#include <linux/raspberrypi/vchiq_arm.h>
+
 #include "bcm2835.h"
 #include "vc_vchi_audioserv_defs.h"
 
-#include "../interface/vchiq_arm/vchiq_arm.h"
-
 struct bcm2835_audio_instance {
 	struct device *dev;
 	unsigned int service_handle;
diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c
index b74cb104e9de..f292a6618166 100644
--- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c
+++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c
@@ -6,7 +6,8 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 
-#include "../interface/vchiq_arm/vchiq_bus.h"
+#include <linux/raspberrypi/vchiq_bus.h>
+
 #include "bcm2835.h"
 
 static bool enable_hdmi;
diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h
index 49ec5b496edb..5a1348747ff4 100644
--- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h
+++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h
@@ -5,13 +5,12 @@
 #define __SOUND_ARM_BCM2835_H
 
 #include <linux/device.h>
+#include <linux/raspberrypi/vchiq.h>
 #include <linux/wait.h>
 #include <sound/core.h>
 #include <sound/pcm.h>
 #include <sound/pcm-indirect.h>
 
-#include "../include/linux/raspberrypi/vchiq.h"
-
 #define MAX_SUBSTREAMS   (8)
 #define AVAIL_SUBSTREAMS_MASK  (0xff)
 
diff --git a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h b/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h
deleted file mode 100644
index ee4469f4fc51..000000000000
--- a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */
-
-#ifndef VCHIQ_H
-#define VCHIQ_H
-
-#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \
-			(((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3))
-
-enum vchiq_reason {
-	VCHIQ_SERVICE_OPENED,         /* service, -, -             */
-	VCHIQ_SERVICE_CLOSED,         /* service, -, -             */
-	VCHIQ_MESSAGE_AVAILABLE,      /* service, header, -        */
-	VCHIQ_BULK_TRANSMIT_DONE,     /* service, -, bulk_userdata */
-	VCHIQ_BULK_RECEIVE_DONE,      /* service, -, bulk_userdata */
-	VCHIQ_BULK_TRANSMIT_ABORTED,  /* service, -, bulk_userdata */
-	VCHIQ_BULK_RECEIVE_ABORTED    /* service, -, bulk_userdata */
-};
-
-enum vchiq_bulk_mode {
-	VCHIQ_BULK_MODE_CALLBACK,
-	VCHIQ_BULK_MODE_BLOCKING,
-	VCHIQ_BULK_MODE_NOCALLBACK,
-	VCHIQ_BULK_MODE_WAITING		/* Reserved for internal use */
-};
-
-enum vchiq_service_option {
-	VCHIQ_SERVICE_OPTION_AUTOCLOSE,
-	VCHIQ_SERVICE_OPTION_SLOT_QUOTA,
-	VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA,
-	VCHIQ_SERVICE_OPTION_SYNCHRONOUS,
-	VCHIQ_SERVICE_OPTION_TRACE
-};
-
-struct vchiq_header {
-	/* The message identifier - opaque to applications. */
-	int msgid;
-
-	/* Size of message data. */
-	unsigned int size;
-
-	char data[];           /* message */
-};
-
-struct vchiq_element {
-	const void __user *data;
-	unsigned int size;
-};
-
-struct vchiq_instance;
-struct vchiq_state;
-
-struct vchiq_service_base {
-	int fourcc;
-	int (*callback)(struct vchiq_instance *instance,
-			enum vchiq_reason reason,
-			struct vchiq_header *header,
-			unsigned int handle,
-			void *cb_data, void __user *cb_userdata);
-	void *userdata;
-};
-
-struct vchiq_completion_data_kernel {
-	enum vchiq_reason reason;
-	struct vchiq_header *header;
-	void *service_userdata;
-	void *cb_data;
-	void  __user *cb_userdata;
-};
-
-struct vchiq_service_params_kernel {
-	int fourcc;
-	int (*callback)(struct vchiq_instance *instance,
-			enum vchiq_reason reason,
-			struct vchiq_header *header,
-			unsigned int handle,
-			void *cb_data, void __user *cb_userdata);
-	void *userdata;
-	short version;       /* Increment for non-trivial changes */
-	short version_min;   /* Update for incompatible changes */
-};
-
-extern int vchiq_initialise(struct vchiq_state *state,
-			    struct vchiq_instance **pinstance);
-extern int vchiq_shutdown(struct vchiq_instance *instance);
-extern int vchiq_connect(struct vchiq_instance *instance);
-extern int vchiq_open_service(struct vchiq_instance *instance,
-			      const struct vchiq_service_params_kernel *params,
-			      unsigned int *pservice);
-extern int vchiq_close_service(struct vchiq_instance *instance,
-			       unsigned int service);
-extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service);
-extern int vchiq_release_service(struct vchiq_instance *instance,
-				 unsigned int service);
-extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle,
-				 struct vchiq_header *header);
-extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service,
-				  struct vchiq_header *header);
-extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle,
-				      void *data, unsigned int size);
-extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service,
-			       const void *data, unsigned int size, void *userdata,
-			       enum vchiq_bulk_mode mode);
-extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service,
-			      void *data, unsigned int size, void *userdata,
-			      enum vchiq_bulk_mode mode);
-extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service);
-extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle,
-				  short *peer_version);
-extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle);
-
-#endif /* VCHIQ_H */
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index a2074069e79e..6a7b96d3dae6 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -30,11 +30,12 @@
 #include <linux/uaccess.h>
 #include <soc/bcm2835/raspberrypi-firmware.h>
 
-#include "vchiq_core.h"
+#include <linux/raspberrypi/vchiq_core.h>
+#include <linux/raspberrypi/vchiq_arm.h>
+#include <linux/raspberrypi/vchiq_bus.h>
+#include <linux/raspberrypi/vchiq_debugfs.h>
+
 #include "vchiq_ioctl.h"
-#include "vchiq_arm.h"
-#include "vchiq_bus.h"
-#include "vchiq_debugfs.h"
 
 #define DEVICE_NAME "vchiq"
 
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h
deleted file mode 100644
index e32b02f99024..000000000000
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/*
- * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved.
- * Copyright (c) 2010-2012 Broadcom. All rights reserved.
- */
-
-#ifndef VCHIQ_ARM_H
-#define VCHIQ_ARM_H
-
-#include <linux/mutex.h>
-#include <linux/platform_device.h>
-#include <linux/semaphore.h>
-#include <linux/atomic.h>
-#include "vchiq_core.h"
-#include "vchiq_debugfs.h"
-
-/* Some per-instance constants */
-#define MAX_COMPLETIONS 128
-#define MAX_SERVICES 64
-#define MAX_ELEMENTS 8
-#define MSG_QUEUE_SIZE 128
-
-#define VCHIQ_DRV_MAX_CALLBACKS 10
-
-struct rpi_firmware;
-struct vchiq_device;
-
-enum USE_TYPE_E {
-	USE_TYPE_SERVICE,
-	USE_TYPE_VCHIQ
-};
-
-struct vchiq_platform_info {
-	unsigned int cache_line_size;
-};
-
-struct vchiq_drv_mgmt {
-	struct rpi_firmware *fw;
-	const struct vchiq_platform_info *info;
-
-	bool connected;
-	int num_deferred_callbacks;
-	/* Protects connected and num_deferred_callbacks */
-	struct mutex connected_mutex;
-
-	void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void);
-
-	struct semaphore free_fragments_sema;
-	struct semaphore free_fragments_mutex;
-	char *fragments_base;
-	char *free_fragments;
-	unsigned int fragments_size;
-
-	void __iomem *regs;
-
-	struct vchiq_state state;
-};
-
-struct user_service {
-	struct vchiq_service *service;
-	void __user *userdata;
-	struct vchiq_instance *instance;
-	char is_vchi;
-	char dequeue_pending;
-	char close_pending;
-	int message_available_pos;
-	int msg_insert;
-	int msg_remove;
-	struct completion insert_event;
-	struct completion remove_event;
-	struct completion close_event;
-	struct vchiq_header *msg_queue[MSG_QUEUE_SIZE];
-};
-
-struct bulk_waiter_node {
-	struct bulk_waiter bulk_waiter;
-	int pid;
-	struct list_head list;
-};
-
-struct vchiq_instance {
-	struct vchiq_state *state;
-	struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS];
-	int completion_insert;
-	int completion_remove;
-	struct completion insert_event;
-	struct completion remove_event;
-	struct mutex completion_mutex;
-
-	int connected;
-	int closing;
-	int pid;
-	int mark;
-	int use_close_delivered;
-	int trace;
-
-	struct list_head bulk_waiter_list;
-	struct mutex bulk_waiter_list_mutex;
-
-	struct vchiq_debugfs_node debugfs_node;
-};
-
-int
-vchiq_use_service(struct vchiq_instance *instance, unsigned int handle);
-
-extern int
-vchiq_release_service(struct vchiq_instance *instance, unsigned int handle);
-
-extern int
-vchiq_check_service(struct vchiq_service *service);
-
-extern void
-vchiq_dump_service_use_state(struct vchiq_state *state);
-
-extern int
-vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service,
-		   enum USE_TYPE_E use_type);
-extern int
-vchiq_release_internal(struct vchiq_state *state,
-		       struct vchiq_service *service);
-
-extern struct vchiq_debugfs_node *
-vchiq_instance_get_debugfs_node(struct vchiq_instance *instance);
-
-extern int
-vchiq_instance_get_use_count(struct vchiq_instance *instance);
-
-extern int
-vchiq_instance_get_pid(struct vchiq_instance *instance);
-
-extern int
-vchiq_instance_get_trace(struct vchiq_instance *instance);
-
-extern void
-vchiq_instance_set_trace(struct vchiq_instance *instance, int trace);
-
-extern void
-vchiq_add_connected_callback(struct vchiq_device *device,
-			     void (*callback)(void));
-
-#if IS_ENABLED(CONFIG_VCHIQ_CDEV)
-
-extern void
-vchiq_deregister_chrdev(void);
-
-extern int
-vchiq_register_chrdev(struct device *parent);
-
-#else
-
-static inline void vchiq_deregister_chrdev(void) { }
-static inline int vchiq_register_chrdev(struct device *parent) { return 0; }
-
-#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */
-
-extern int
-service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason,
-		 struct vchiq_header *header, unsigned int handle,
-		 void *cb_data, void __user *cb_userdata);
-
-extern void
-free_bulk_waiter(struct vchiq_instance *instance);
-
-#endif /* VCHIQ_ARM_H */
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
index 41ece91ab88a..f50e637d505c 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
@@ -11,8 +11,8 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 
-#include "vchiq_arm.h"
-#include "vchiq_bus.h"
+#include <linux/raspberrypi/vchiq_arm.h>
+#include <linux/raspberrypi/vchiq_bus.h>
 
 static int vchiq_bus_type_match(struct device *dev, const struct device_driver *drv)
 {
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
deleted file mode 100644
index 9de179b39f85..000000000000
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2023 Ideas On Board Oy
- */
-
-#ifndef _VCHIQ_DEVICE_H
-#define _VCHIQ_DEVICE_H
-
-#include <linux/device.h>
-#include <linux/mod_devicetable.h>
-
-struct vchiq_drv_mgmt;
-
-struct vchiq_device {
-	struct device dev;
-	struct vchiq_drv_mgmt *drv_mgmt;
-};
-
-struct vchiq_driver {
-	int		(*probe)(struct vchiq_device *device);
-	void		(*remove)(struct vchiq_device *device);
-	int		(*resume)(struct vchiq_device *device);
-	int		(*suspend)(struct vchiq_device *device,
-				   pm_message_t state);
-
-	const struct vchiq_device_id *id_table;
-	struct device_driver driver;
-};
-
-static inline struct vchiq_device *to_vchiq_device(struct device *d)
-{
-	return container_of(d, struct vchiq_device, dev);
-}
-
-static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d)
-{
-	return container_of(d, struct vchiq_driver, driver);
-}
-
-extern const struct bus_type vchiq_bus_type;
-
-struct vchiq_device *
-vchiq_device_register(struct device *parent, const char *name);
-void vchiq_device_unregister(struct vchiq_device *dev);
-
-int vchiq_driver_register(struct vchiq_driver *vchiq_drv);
-void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv);
-
-/**
- * module_vchiq_driver() - Helper macro for registering a vchiq driver
- * @__vchiq_driver: vchiq driver struct
- *
- * Helper macro for vchiq drivers which do not do anything special in
- * module init/exit. This eliminates a lot of boilerplate. Each module may only
- * use this macro once, and calling it replaces module_init() and module_exit()
- */
-#define module_vchiq_driver(__vchiq_driver) \
-	module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister)
-
-#endif /* _VCHIQ_DEVICE_H */
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h
deleted file mode 100644
index a16d0299996c..000000000000
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/* Copyright (c) 2010-2014 Broadcom. All rights reserved. */
-
-#ifndef VCHIQ_CFG_H
-#define VCHIQ_CFG_H
-
-#define VCHIQ_MAGIC              VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I')
-/* The version of VCHIQ - change with any non-trivial change */
-#define VCHIQ_VERSION            8
-/*
- * The minimum compatible version - update to match VCHIQ_VERSION with any
- * incompatible change
- */
-#define VCHIQ_VERSION_MIN        3
-
-/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */
-#define VCHIQ_VERSION_LIB_VERSION 7
-
-/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */
-#define VCHIQ_VERSION_CLOSE_DELIVERED 7
-
-/* The version that made it safe to use SYNCHRONOUS mode */
-#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8
-
-#define VCHIQ_MAX_STATES         1
-#define VCHIQ_MAX_SERVICES       4096
-#define VCHIQ_MAX_SLOTS          128
-#define VCHIQ_MAX_SLOTS_PER_SIDE 64
-
-#define VCHIQ_NUM_CURRENT_BULKS        32
-#define VCHIQ_NUM_SERVICE_BULKS        4
-
-#ifndef VCHIQ_ENABLE_DEBUG
-#define VCHIQ_ENABLE_DEBUG             1
-#endif
-
-#ifndef VCHIQ_ENABLE_STATS
-#define VCHIQ_ENABLE_STATS             1
-#endif
-
-#endif /* VCHIQ_CFG_H */
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
index 130be2f58342..83de27cfd469 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
@@ -15,8 +15,8 @@
 #include <linux/rcupdate.h>
 #include <linux/sched/signal.h>
 
-#include "vchiq_arm.h"
-#include "vchiq_core.h"
+#include <linux/raspberrypi/vchiq_arm.h>
+#include <linux/raspberrypi/vchiq_core.h>
 
 #define VCHIQ_SLOT_HANDLER_STACK 8192
 
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h
deleted file mode 100644
index e3ed50d26c37..000000000000
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h
+++ /dev/null
@@ -1,646 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */
-
-#ifndef VCHIQ_CORE_H
-#define VCHIQ_CORE_H
-
-#include <linux/mutex.h>
-#include <linux/completion.h>
-#include <linux/dma-mapping.h>
-#include <linux/dev_printk.h>
-#include <linux/kthread.h>
-#include <linux/kref.h>
-#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
-#include <linux/spinlock_types.h>
-#include <linux/wait.h>
-
-#include "../../include/linux/raspberrypi/vchiq.h"
-#include "vchiq_cfg.h"
-
-/* Do this so that we can test-build the code on non-rpi systems */
-#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE)
-
-#else
-
-#ifndef dsb
-#define dsb(a)
-#endif
-
-#endif	/* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */
-
-#define VCHIQ_SERVICE_HANDLE_INVALID 0
-
-#define VCHIQ_SLOT_SIZE     4096
-#define VCHIQ_MAX_MSG_SIZE  (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header))
-
-#define VCHIQ_SLOT_MASK        (VCHIQ_SLOT_SIZE - 1)
-#define VCHIQ_SLOT_QUEUE_MASK  (VCHIQ_MAX_SLOTS_PER_SIDE - 1)
-#define VCHIQ_SLOT_ZERO_SLOTS  DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \
-					    VCHIQ_SLOT_SIZE)
-
-#define BITSET_SIZE(b)        ((b + 31) >> 5)
-#define BITSET_WORD(b)        (b >> 5)
-#define BITSET_BIT(b)         (1 << (b & 31))
-#define BITSET_IS_SET(bs, b)  (bs[BITSET_WORD(b)] & BITSET_BIT(b))
-#define BITSET_SET(bs, b)     (bs[BITSET_WORD(b)] |= BITSET_BIT(b))
-
-enum {
-	DEBUG_ENTRIES,
-#if VCHIQ_ENABLE_DEBUG
-	DEBUG_SLOT_HANDLER_COUNT,
-	DEBUG_SLOT_HANDLER_LINE,
-	DEBUG_PARSE_LINE,
-	DEBUG_PARSE_HEADER,
-	DEBUG_PARSE_MSGID,
-	DEBUG_AWAIT_COMPLETION_LINE,
-	DEBUG_DEQUEUE_MESSAGE_LINE,
-	DEBUG_SERVICE_CALLBACK_LINE,
-	DEBUG_MSG_QUEUE_FULL_COUNT,
-	DEBUG_COMPLETION_QUEUE_FULL_COUNT,
-#endif
-	DEBUG_MAX
-};
-
-#if VCHIQ_ENABLE_DEBUG
-
-#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug
-#define DEBUG_TRACE(d) \
-	do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0)
-#define DEBUG_VALUE(d, v) \
-	do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0)
-#define DEBUG_COUNT(d) \
-	do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0)
-
-#else /* VCHIQ_ENABLE_DEBUG */
-
-#define DEBUG_INITIALISE(local)
-#define DEBUG_TRACE(d)
-#define DEBUG_VALUE(d, v)
-#define DEBUG_COUNT(d)
-
-#endif /* VCHIQ_ENABLE_DEBUG */
-
-enum vchiq_connstate {
-	VCHIQ_CONNSTATE_DISCONNECTED,
-	VCHIQ_CONNSTATE_CONNECTING,
-	VCHIQ_CONNSTATE_CONNECTED,
-	VCHIQ_CONNSTATE_PAUSING,
-	VCHIQ_CONNSTATE_PAUSE_SENT,
-	VCHIQ_CONNSTATE_PAUSED,
-	VCHIQ_CONNSTATE_RESUMING,
-	VCHIQ_CONNSTATE_PAUSE_TIMEOUT,
-	VCHIQ_CONNSTATE_RESUME_TIMEOUT
-};
-
-enum {
-	VCHIQ_SRVSTATE_FREE,
-	VCHIQ_SRVSTATE_HIDDEN,
-	VCHIQ_SRVSTATE_LISTENING,
-	VCHIQ_SRVSTATE_OPENING,
-	VCHIQ_SRVSTATE_OPEN,
-	VCHIQ_SRVSTATE_OPENSYNC,
-	VCHIQ_SRVSTATE_CLOSESENT,
-	VCHIQ_SRVSTATE_CLOSERECVD,
-	VCHIQ_SRVSTATE_CLOSEWAIT,
-	VCHIQ_SRVSTATE_CLOSED
-};
-
-enum vchiq_bulk_dir {
-	VCHIQ_BULK_TRANSMIT,
-	VCHIQ_BULK_RECEIVE
-};
-
-struct vchiq_bulk {
-	short mode;
-	short dir;
-	void *cb_data;
-	void __user *cb_userdata;
-	struct bulk_waiter *waiter;
-	dma_addr_t dma_addr;
-	int size;
-	void *remote_data;
-	int remote_size;
-	int actual;
-	void *offset;
-	void __user *uoffset;
-};
-
-struct vchiq_bulk_queue {
-	int local_insert;  /* Where to insert the next local bulk */
-	int remote_insert; /* Where to insert the next remote bulk (master) */
-	int process;       /* Bulk to transfer next */
-	int remote_notify; /* Bulk to notify the remote client of next (mstr) */
-	int remove;        /* Bulk to notify the local client of, and remove, next */
-	struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS];
-};
-
-/*
- * Remote events provide a way of presenting several virtual doorbells to a
- * peer (ARM host to VPU) using only one physical doorbell. They can be thought
- * of as a way for the peer to signal a semaphore, in this case implemented as
- * a workqueue.
- *
- * Remote events remain signalled until acknowledged by the receiver, and they
- * are non-counting. They are designed in such a way as to minimise the number
- * of interrupts and avoid unnecessary waiting.
- *
- * A remote_event is as small data structures that live in shared memory. It
- * comprises two booleans - armed and fired:
- *
- * The sender sets fired when they signal the receiver.
- * If fired is set, the receiver has been signalled and need not wait.
- * The receiver sets the armed field before they begin to wait.
- * If armed is set, the receiver is waiting and wishes to be woken by interrupt.
- */
-struct remote_event {
-	int armed;
-	int fired;
-	u32 __unused;
-};
-
-struct opaque_platform_state;
-
-struct vchiq_slot {
-	char data[VCHIQ_SLOT_SIZE];
-};
-
-struct vchiq_slot_info {
-	/* Use two counters rather than one to avoid the need for a mutex. */
-	short use_count;
-	short release_count;
-};
-
-/*
- * VCHIQ is a reliable connection-oriented datagram protocol.
- *
- * A VCHIQ service is equivalent to a TCP connection, except:
- * + FOURCCs are used for the rendezvous, and port numbers are assigned at the
- *   time the connection is established.
- * + There is less of a distinction between server and client sockets, the only
- *   difference being which end makes the first move.
- * + For a multi-client server, the server creates new "listening" services as
- *   the existing one becomes connected - there is no need to specify the
- *   maximum number of clients up front.
- * + Data transfer is reliable but packetized (messages have defined ends).
- * + Messages can be either short (capable of fitting in a slot) and in-band,
- *   or copied between external buffers (bulk transfers).
- */
-struct vchiq_service {
-	struct vchiq_service_base base;
-	unsigned int handle;
-	struct kref ref_count;
-	struct rcu_head rcu;
-	int srvstate;
-	void (*userdata_term)(void *userdata);
-	unsigned int localport;
-	unsigned int remoteport;
-	int public_fourcc;
-	int client_id;
-	char auto_close;
-	char sync;
-	char closing;
-	char trace;
-	atomic_t poll_flags;
-	short version;
-	short version_min;
-	short peer_version;
-
-	struct vchiq_state *state;
-	struct vchiq_instance *instance;
-
-	int service_use_count;
-
-	struct vchiq_bulk_queue bulk_tx;
-	struct vchiq_bulk_queue bulk_rx;
-
-	struct completion remove_event;
-	struct completion bulk_remove_event;
-	struct mutex bulk_mutex;
-
-	struct service_stats_struct {
-		int quota_stalls;
-		int slot_stalls;
-		int bulk_stalls;
-		int error_count;
-		int ctrl_tx_count;
-		int ctrl_rx_count;
-		int bulk_tx_count;
-		int bulk_rx_count;
-		int bulk_aborted_count;
-		u64 ctrl_tx_bytes;
-		u64 ctrl_rx_bytes;
-		u64 bulk_tx_bytes;
-		u64 bulk_rx_bytes;
-	} stats;
-
-	int msg_queue_read;
-	int msg_queue_write;
-	struct completion msg_queue_pop;
-	struct completion msg_queue_push;
-	struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS];
-};
-
-/*
- * The quota information is outside struct vchiq_service so that it can
- * be statically allocated, since for accounting reasons a service's slot
- * usage is carried over between users of the same port number.
- */
-struct vchiq_service_quota {
-	unsigned short slot_quota;
-	unsigned short slot_use_count;
-	unsigned short message_quota;
-	unsigned short message_use_count;
-	struct completion quota_event;
-	int previous_tx_index;
-};
-
-struct vchiq_shared_state {
-	/* A non-zero value here indicates that the content is valid. */
-	int initialised;
-
-	/* The first and last (inclusive) slots allocated to the owner. */
-	int slot_first;
-	int slot_last;
-
-	/* The slot allocated to synchronous messages from the owner. */
-	int slot_sync;
-
-	/*
-	 * Signalling this event indicates that owner's slot handler thread
-	 * should run.
-	 */
-	struct remote_event trigger;
-
-	/*
-	 * Indicates the byte position within the stream where the next message
-	 * will be written. The least significant bits are an index into the
-	 * slot. The next bits are the index of the slot in slot_queue.
-	 */
-	int tx_pos;
-
-	/* This event should be signalled when a slot is recycled. */
-	struct remote_event recycle;
-
-	/* The slot_queue index where the next recycled slot will be written. */
-	int slot_queue_recycle;
-
-	/* This event should be signalled when a synchronous message is sent. */
-	struct remote_event sync_trigger;
-
-	/*
-	 * This event should be signalled when a synchronous message has been
-	 * released.
-	 */
-	struct remote_event sync_release;
-
-	/* A circular buffer of slot indexes. */
-	int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE];
-
-	/* Debugging state */
-	int debug[DEBUG_MAX];
-};
-
-/*
- * vchiq_slot_zero describes the memory shared between the ARM host and the
- * VideoCore VPU. The "master" and "slave" states are owned by the respective
- * sides but visible to the other; the slots are shared, and the remaining
- * fields are read-only.
- *
- * In the configuration used by this implementation, the memory is allocated
- * by the host, the VPU is the master (the side which controls the DMA for bulk
- * transfers), and the host is the slave.
- *
- * The ownership of slots changes with use:
- * + When empty they are owned by the sender.
- * + When partially filled they are shared with the receiver.
- * + When completely full they are owned by the receiver.
- * + When the receiver has finished processing the contents, they are recycled
- *   back to the sender.
- */
-struct vchiq_slot_zero {
-	int magic;
-	short version;
-	short version_min;
-	int slot_zero_size;
-	int slot_size;
-	int max_slots;
-	int max_slots_per_side;
-	int platform_data[2];
-	struct vchiq_shared_state master;
-	struct vchiq_shared_state slave;
-	struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS];
-};
-
-/*
- * This is the private runtime state used by each side. The same structure was
- * originally used by both sides, but implementations have since diverged.
- */
-struct vchiq_state {
-	struct device *dev;
-	int id;
-	int initialised;
-	enum vchiq_connstate conn_state;
-	short version_common;
-
-	struct vchiq_shared_state *local;
-	struct vchiq_shared_state *remote;
-	struct vchiq_slot *slot_data;
-
-	unsigned short default_slot_quota;
-	unsigned short default_message_quota;
-
-	/* Event indicating connect message received */
-	struct completion connect;
-
-	/* Mutex protecting services */
-	struct mutex mutex;
-	struct vchiq_instance **instance;
-
-	/* Processes all incoming messages which aren't synchronous */
-	struct task_struct *slot_handler_thread;
-
-	/*
-	 * Slots which have been fully processed and released by the (peer)
-	 * receiver are added to the receiver queue, which is asynchronously
-	 * processed by the recycle thread.
-	 */
-	struct task_struct *recycle_thread;
-
-	/*
-	 * Processes incoming synchronous messages
-	 *
-	 * The synchronous message channel is shared between all synchronous
-	 * services, and provides a way for urgent messages to bypass
-	 * potentially long queues of asynchronous messages in the normal slots.
-	 *
-	 * There can be only one outstanding synchronous message in
-	 * each direction, and as a precious shared resource synchronous
-	 * services should be used sparingly.
-	 */
-	struct task_struct *sync_thread;
-
-	/* Local implementation of the trigger remote event */
-	wait_queue_head_t trigger_event;
-
-	/* Local implementation of the recycle remote event */
-	wait_queue_head_t recycle_event;
-
-	/* Local implementation of the sync trigger remote event */
-	wait_queue_head_t sync_trigger_event;
-
-	/* Local implementation of the sync release remote event */
-	wait_queue_head_t sync_release_event;
-
-	char *tx_data;
-	char *rx_data;
-	struct vchiq_slot_info *rx_info;
-
-	struct mutex slot_mutex;
-
-	struct mutex recycle_mutex;
-
-	struct mutex sync_mutex;
-
-	spinlock_t msg_queue_spinlock;
-
-	spinlock_t bulk_waiter_spinlock;
-
-	spinlock_t quota_spinlock;
-
-	/*
-	 * Indicates the byte position within the stream from where the next
-	 * message will be read. The least significant bits are an index into
-	 * the slot.The next bits are the index of the slot in
-	 * remote->slot_queue.
-	 */
-	int rx_pos;
-
-	/*
-	 * A cached copy of local->tx_pos. Only write to local->tx_pos, and read
-	 * from remote->tx_pos.
-	 */
-	int local_tx_pos;
-
-	/* The slot_queue index of the slot to become available next. */
-	int slot_queue_available;
-
-	/* A flag to indicate if any poll has been requested */
-	int poll_needed;
-
-	/* Ths index of the previous slot used for data messages. */
-	int previous_data_index;
-
-	/* The number of slots occupied by data messages. */
-	unsigned short data_use_count;
-
-	/* The maximum number of slots to be occupied by data messages. */
-	unsigned short data_quota;
-
-	/* An array of bit sets indicating which services must be polled. */
-	atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)];
-
-	/* The number of the first unused service */
-	int unused_service;
-
-	/* Signalled when a free slot becomes available. */
-	struct completion slot_available_event;
-
-	/* Signalled when a free data slot becomes available. */
-	struct completion data_quota_event;
-
-	struct state_stats_struct {
-		int slot_stalls;
-		int data_stalls;
-		int ctrl_tx_count;
-		int ctrl_rx_count;
-		int error_count;
-	} stats;
-
-	struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES];
-	struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES];
-	struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS];
-
-	struct opaque_platform_state *platform_state;
-};
-
-struct pagelist {
-	u32 length;
-	u16 type;
-	u16 offset;
-	u32 addrs[1];	/* N.B. 12 LSBs hold the number
-			 * of following pages at consecutive
-			 * addresses.
-			 */
-};
-
-struct vchiq_pagelist_info {
-	struct pagelist *pagelist;
-	size_t pagelist_buffer_size;
-	dma_addr_t dma_addr;
-	enum dma_data_direction dma_dir;
-	unsigned int num_pages;
-	unsigned int pages_need_release;
-	struct page **pages;
-	struct scatterlist *scatterlist;
-	unsigned int scatterlist_mapped;
-};
-
-static inline bool vchiq_remote_initialised(const struct vchiq_state *state)
-{
-	return state->remote && state->remote->initialised;
-}
-
-struct bulk_waiter {
-	struct vchiq_bulk *bulk;
-	struct completion event;
-	int actual;
-};
-
-struct vchiq_config {
-	unsigned int max_msg_size;
-	unsigned int bulk_threshold;	/* The message size above which it
-					 * is better to use a bulk transfer
-					 * (<= max_msg_size)
-					 */
-	unsigned int max_outstanding_bulks;
-	unsigned int max_services;
-	short version;      /* The version of VCHIQ */
-	short version_min;  /* The minimum compatible version of VCHIQ */
-};
-
-extern spinlock_t bulk_waiter_spinlock;
-
-extern const char *
-get_conn_state_name(enum vchiq_connstate conn_state);
-
-extern struct vchiq_slot_zero *
-vchiq_init_slots(struct device *dev, void *mem_base, int mem_size);
-
-extern int
-vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev);
-
-extern int
-vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance);
-
-struct vchiq_service *
-vchiq_add_service_internal(struct vchiq_state *state,
-			   const struct vchiq_service_params_kernel *params,
-			   int srvstate, struct vchiq_instance *instance,
-			   void (*userdata_term)(void *userdata));
-
-extern int
-vchiq_open_service_internal(struct vchiq_service *service, int client_id);
-
-extern int
-vchiq_close_service_internal(struct vchiq_service *service, int close_recvd);
-
-extern void
-vchiq_terminate_service_internal(struct vchiq_service *service);
-
-extern void
-vchiq_free_service_internal(struct vchiq_service *service);
-
-extern void
-vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance);
-
-extern void
-remote_event_pollall(struct vchiq_state *state);
-
-extern int
-vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle,
-			struct bulk_waiter *userdata);
-
-extern int
-vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle,
-			 struct vchiq_bulk *bulk);
-
-extern int
-vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle,
-			 struct vchiq_bulk *bulk);
-
-extern void
-vchiq_dump_state(struct seq_file *f, struct vchiq_state *state);
-
-extern void
-request_poll(struct vchiq_state *state, struct vchiq_service *service,
-	     int poll_type);
-
-struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle);
-
-extern struct vchiq_service *
-find_service_by_handle(struct vchiq_instance *instance, unsigned int handle);
-
-extern struct vchiq_service *
-find_service_by_port(struct vchiq_state *state, unsigned int localport);
-
-extern struct vchiq_service *
-find_service_for_instance(struct vchiq_instance *instance, unsigned int handle);
-
-extern struct vchiq_service *
-find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle);
-
-extern struct vchiq_service *
-__next_service_by_instance(struct vchiq_state *state,
-			   struct vchiq_instance *instance,
-			   int *pidx);
-
-extern struct vchiq_service *
-next_service_by_instance(struct vchiq_state *state,
-			 struct vchiq_instance *instance,
-			 int *pidx);
-
-extern void
-vchiq_service_get(struct vchiq_service *service);
-
-extern void
-vchiq_service_put(struct vchiq_service *service);
-
-extern int
-vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle,
-		    ssize_t (*copy_callback)(void *context, void *dest,
-					     size_t offset, size_t maxsize),
-		    void *context,
-		    size_t size);
-
-void vchiq_dump_platform_state(struct seq_file *f);
-
-void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f);
-
-void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service);
-
-int vchiq_use_service_internal(struct vchiq_service *service);
-
-int vchiq_release_service_internal(struct vchiq_service *service);
-
-void vchiq_on_remote_use(struct vchiq_state *state);
-
-void vchiq_on_remote_release(struct vchiq_state *state);
-
-int vchiq_platform_init_state(struct vchiq_state *state);
-
-int vchiq_check_service(struct vchiq_service *service);
-
-int vchiq_send_remote_use(struct vchiq_state *state);
-
-int vchiq_send_remote_use_active(struct vchiq_state *state);
-
-void vchiq_platform_conn_state_changed(struct vchiq_state *state,
-				       enum vchiq_connstate oldstate,
-				  enum vchiq_connstate newstate);
-
-void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate);
-
-void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr,
-			const void *void_mem, size_t num_bytes);
-
-int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service);
-
-int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service);
-
-void vchiq_get_config(struct vchiq_config *config);
-
-int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service,
-			     enum vchiq_service_option option, int value);
-
-#endif
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c
index d5f7f61c5626..c82326a9b6d9 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c
@@ -5,9 +5,9 @@
  */
 
 #include <linux/debugfs.h>
-#include "vchiq_core.h"
-#include "vchiq_arm.h"
-#include "vchiq_debugfs.h"
+#include <linux/raspberrypi/vchiq_core.h>
+#include <linux/raspberrypi/vchiq_arm.h>
+#include <linux/raspberrypi/vchiq_debugfs.h>
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h
deleted file mode 100644
index b29e6693c949..000000000000
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
-/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. */
-
-#ifndef VCHIQ_DEBUGFS_H
-#define VCHIQ_DEBUGFS_H
-
-struct vchiq_state;
-struct vchiq_instance;
-
-struct vchiq_debugfs_node {
-	struct dentry *dentry;
-};
-
-void vchiq_debugfs_init(struct vchiq_state *state);
-
-void vchiq_debugfs_deinit(void);
-
-void vchiq_debugfs_add_instance(struct vchiq_instance *instance);
-
-void vchiq_debugfs_remove_instance(struct vchiq_instance *instance);
-
-#endif /* VCHIQ_DEBUGFS_H */
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c
index 3b20ba5c7362..0f3dde2657d6 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c
@@ -11,10 +11,11 @@
 #include <linux/compat.h>
 #include <linux/miscdevice.h>
 
-#include "vchiq_core.h"
+#include <linux/raspberrypi/vchiq_core.h>
+#include <linux/raspberrypi/vchiq_arm.h>
+#include <linux/raspberrypi/vchiq_debugfs.h>
+
 #include "vchiq_ioctl.h"
-#include "vchiq_arm.h"
-#include "vchiq_debugfs.h"
 
 static const char *const ioctl_names[] = {
 	"CONNECT",
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h
index afb71a83cfe7..d0c759f6d8ea 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h
@@ -5,8 +5,7 @@
 #define VCHIQ_IOCTLS_H
 
 #include <linux/ioctl.h>
-
-#include "../../include/linux/raspberrypi/vchiq.h"
+#include <linux/raspberrypi/vchiq.h>
 
 #define VCHIQ_IOC_MAGIC 0xc4
 #define VCHIQ_INVALID_HANDLE (~0)
diff --git a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c
index c2b5a37915f2..cd073ed3ea2d 100644
--- a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c
+++ b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c
@@ -22,11 +22,12 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/completion.h>
+#include <linux/raspberrypi/vchiq.h>
 #include <linux/vmalloc.h>
 #include <media/videobuf2-vmalloc.h>
 
-#include "../include/linux/raspberrypi/vchiq.h"
-#include "../interface/vchiq_arm/vchiq_arm.h"
+#include <linux/raspberrypi/vchiq_arm.h>
+
 #include "mmal-common.h"
 #include "mmal-vchiq.h"
 #include "mmal-msg.h"
diff --git a/include/linux/raspberrypi/vchiq.h b/include/linux/raspberrypi/vchiq.h
new file mode 100644
index 000000000000..ee4469f4fc51
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */
+
+#ifndef VCHIQ_H
+#define VCHIQ_H
+
+#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \
+			(((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3))
+
+enum vchiq_reason {
+	VCHIQ_SERVICE_OPENED,         /* service, -, -             */
+	VCHIQ_SERVICE_CLOSED,         /* service, -, -             */
+	VCHIQ_MESSAGE_AVAILABLE,      /* service, header, -        */
+	VCHIQ_BULK_TRANSMIT_DONE,     /* service, -, bulk_userdata */
+	VCHIQ_BULK_RECEIVE_DONE,      /* service, -, bulk_userdata */
+	VCHIQ_BULK_TRANSMIT_ABORTED,  /* service, -, bulk_userdata */
+	VCHIQ_BULK_RECEIVE_ABORTED    /* service, -, bulk_userdata */
+};
+
+enum vchiq_bulk_mode {
+	VCHIQ_BULK_MODE_CALLBACK,
+	VCHIQ_BULK_MODE_BLOCKING,
+	VCHIQ_BULK_MODE_NOCALLBACK,
+	VCHIQ_BULK_MODE_WAITING		/* Reserved for internal use */
+};
+
+enum vchiq_service_option {
+	VCHIQ_SERVICE_OPTION_AUTOCLOSE,
+	VCHIQ_SERVICE_OPTION_SLOT_QUOTA,
+	VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA,
+	VCHIQ_SERVICE_OPTION_SYNCHRONOUS,
+	VCHIQ_SERVICE_OPTION_TRACE
+};
+
+struct vchiq_header {
+	/* The message identifier - opaque to applications. */
+	int msgid;
+
+	/* Size of message data. */
+	unsigned int size;
+
+	char data[];           /* message */
+};
+
+struct vchiq_element {
+	const void __user *data;
+	unsigned int size;
+};
+
+struct vchiq_instance;
+struct vchiq_state;
+
+struct vchiq_service_base {
+	int fourcc;
+	int (*callback)(struct vchiq_instance *instance,
+			enum vchiq_reason reason,
+			struct vchiq_header *header,
+			unsigned int handle,
+			void *cb_data, void __user *cb_userdata);
+	void *userdata;
+};
+
+struct vchiq_completion_data_kernel {
+	enum vchiq_reason reason;
+	struct vchiq_header *header;
+	void *service_userdata;
+	void *cb_data;
+	void  __user *cb_userdata;
+};
+
+struct vchiq_service_params_kernel {
+	int fourcc;
+	int (*callback)(struct vchiq_instance *instance,
+			enum vchiq_reason reason,
+			struct vchiq_header *header,
+			unsigned int handle,
+			void *cb_data, void __user *cb_userdata);
+	void *userdata;
+	short version;       /* Increment for non-trivial changes */
+	short version_min;   /* Update for incompatible changes */
+};
+
+extern int vchiq_initialise(struct vchiq_state *state,
+			    struct vchiq_instance **pinstance);
+extern int vchiq_shutdown(struct vchiq_instance *instance);
+extern int vchiq_connect(struct vchiq_instance *instance);
+extern int vchiq_open_service(struct vchiq_instance *instance,
+			      const struct vchiq_service_params_kernel *params,
+			      unsigned int *pservice);
+extern int vchiq_close_service(struct vchiq_instance *instance,
+			       unsigned int service);
+extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service);
+extern int vchiq_release_service(struct vchiq_instance *instance,
+				 unsigned int service);
+extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle,
+				 struct vchiq_header *header);
+extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service,
+				  struct vchiq_header *header);
+extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle,
+				      void *data, unsigned int size);
+extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service,
+			       const void *data, unsigned int size, void *userdata,
+			       enum vchiq_bulk_mode mode);
+extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service,
+			      void *data, unsigned int size, void *userdata,
+			      enum vchiq_bulk_mode mode);
+extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service);
+extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle,
+				  short *peer_version);
+extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle);
+
+#endif /* VCHIQ_H */
diff --git a/include/linux/raspberrypi/vchiq_arm.h b/include/linux/raspberrypi/vchiq_arm.h
new file mode 100644
index 000000000000..e32b02f99024
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq_arm.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved.
+ * Copyright (c) 2010-2012 Broadcom. All rights reserved.
+ */
+
+#ifndef VCHIQ_ARM_H
+#define VCHIQ_ARM_H
+
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/semaphore.h>
+#include <linux/atomic.h>
+#include "vchiq_core.h"
+#include "vchiq_debugfs.h"
+
+/* Some per-instance constants */
+#define MAX_COMPLETIONS 128
+#define MAX_SERVICES 64
+#define MAX_ELEMENTS 8
+#define MSG_QUEUE_SIZE 128
+
+#define VCHIQ_DRV_MAX_CALLBACKS 10
+
+struct rpi_firmware;
+struct vchiq_device;
+
+enum USE_TYPE_E {
+	USE_TYPE_SERVICE,
+	USE_TYPE_VCHIQ
+};
+
+struct vchiq_platform_info {
+	unsigned int cache_line_size;
+};
+
+struct vchiq_drv_mgmt {
+	struct rpi_firmware *fw;
+	const struct vchiq_platform_info *info;
+
+	bool connected;
+	int num_deferred_callbacks;
+	/* Protects connected and num_deferred_callbacks */
+	struct mutex connected_mutex;
+
+	void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void);
+
+	struct semaphore free_fragments_sema;
+	struct semaphore free_fragments_mutex;
+	char *fragments_base;
+	char *free_fragments;
+	unsigned int fragments_size;
+
+	void __iomem *regs;
+
+	struct vchiq_state state;
+};
+
+struct user_service {
+	struct vchiq_service *service;
+	void __user *userdata;
+	struct vchiq_instance *instance;
+	char is_vchi;
+	char dequeue_pending;
+	char close_pending;
+	int message_available_pos;
+	int msg_insert;
+	int msg_remove;
+	struct completion insert_event;
+	struct completion remove_event;
+	struct completion close_event;
+	struct vchiq_header *msg_queue[MSG_QUEUE_SIZE];
+};
+
+struct bulk_waiter_node {
+	struct bulk_waiter bulk_waiter;
+	int pid;
+	struct list_head list;
+};
+
+struct vchiq_instance {
+	struct vchiq_state *state;
+	struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS];
+	int completion_insert;
+	int completion_remove;
+	struct completion insert_event;
+	struct completion remove_event;
+	struct mutex completion_mutex;
+
+	int connected;
+	int closing;
+	int pid;
+	int mark;
+	int use_close_delivered;
+	int trace;
+
+	struct list_head bulk_waiter_list;
+	struct mutex bulk_waiter_list_mutex;
+
+	struct vchiq_debugfs_node debugfs_node;
+};
+
+int
+vchiq_use_service(struct vchiq_instance *instance, unsigned int handle);
+
+extern int
+vchiq_release_service(struct vchiq_instance *instance, unsigned int handle);
+
+extern int
+vchiq_check_service(struct vchiq_service *service);
+
+extern void
+vchiq_dump_service_use_state(struct vchiq_state *state);
+
+extern int
+vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service,
+		   enum USE_TYPE_E use_type);
+extern int
+vchiq_release_internal(struct vchiq_state *state,
+		       struct vchiq_service *service);
+
+extern struct vchiq_debugfs_node *
+vchiq_instance_get_debugfs_node(struct vchiq_instance *instance);
+
+extern int
+vchiq_instance_get_use_count(struct vchiq_instance *instance);
+
+extern int
+vchiq_instance_get_pid(struct vchiq_instance *instance);
+
+extern int
+vchiq_instance_get_trace(struct vchiq_instance *instance);
+
+extern void
+vchiq_instance_set_trace(struct vchiq_instance *instance, int trace);
+
+extern void
+vchiq_add_connected_callback(struct vchiq_device *device,
+			     void (*callback)(void));
+
+#if IS_ENABLED(CONFIG_VCHIQ_CDEV)
+
+extern void
+vchiq_deregister_chrdev(void);
+
+extern int
+vchiq_register_chrdev(struct device *parent);
+
+#else
+
+static inline void vchiq_deregister_chrdev(void) { }
+static inline int vchiq_register_chrdev(struct device *parent) { return 0; }
+
+#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */
+
+extern int
+service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason,
+		 struct vchiq_header *header, unsigned int handle,
+		 void *cb_data, void __user *cb_userdata);
+
+extern void
+free_bulk_waiter(struct vchiq_instance *instance);
+
+#endif /* VCHIQ_ARM_H */
diff --git a/include/linux/raspberrypi/vchiq_bus.h b/include/linux/raspberrypi/vchiq_bus.h
new file mode 100644
index 000000000000..9de179b39f85
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq_bus.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Ideas On Board Oy
+ */
+
+#ifndef _VCHIQ_DEVICE_H
+#define _VCHIQ_DEVICE_H
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+
+struct vchiq_drv_mgmt;
+
+struct vchiq_device {
+	struct device dev;
+	struct vchiq_drv_mgmt *drv_mgmt;
+};
+
+struct vchiq_driver {
+	int		(*probe)(struct vchiq_device *device);
+	void		(*remove)(struct vchiq_device *device);
+	int		(*resume)(struct vchiq_device *device);
+	int		(*suspend)(struct vchiq_device *device,
+				   pm_message_t state);
+
+	const struct vchiq_device_id *id_table;
+	struct device_driver driver;
+};
+
+static inline struct vchiq_device *to_vchiq_device(struct device *d)
+{
+	return container_of(d, struct vchiq_device, dev);
+}
+
+static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d)
+{
+	return container_of(d, struct vchiq_driver, driver);
+}
+
+extern const struct bus_type vchiq_bus_type;
+
+struct vchiq_device *
+vchiq_device_register(struct device *parent, const char *name);
+void vchiq_device_unregister(struct vchiq_device *dev);
+
+int vchiq_driver_register(struct vchiq_driver *vchiq_drv);
+void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv);
+
+/**
+ * module_vchiq_driver() - Helper macro for registering a vchiq driver
+ * @__vchiq_driver: vchiq driver struct
+ *
+ * Helper macro for vchiq drivers which do not do anything special in
+ * module init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_vchiq_driver(__vchiq_driver) \
+	module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister)
+
+#endif /* _VCHIQ_DEVICE_H */
diff --git a/include/linux/raspberrypi/vchiq_cfg.h b/include/linux/raspberrypi/vchiq_cfg.h
new file mode 100644
index 000000000000..a16d0299996c
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq_cfg.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright (c) 2010-2014 Broadcom. All rights reserved. */
+
+#ifndef VCHIQ_CFG_H
+#define VCHIQ_CFG_H
+
+#define VCHIQ_MAGIC              VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I')
+/* The version of VCHIQ - change with any non-trivial change */
+#define VCHIQ_VERSION            8
+/*
+ * The minimum compatible version - update to match VCHIQ_VERSION with any
+ * incompatible change
+ */
+#define VCHIQ_VERSION_MIN        3
+
+/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */
+#define VCHIQ_VERSION_LIB_VERSION 7
+
+/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */
+#define VCHIQ_VERSION_CLOSE_DELIVERED 7
+
+/* The version that made it safe to use SYNCHRONOUS mode */
+#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8
+
+#define VCHIQ_MAX_STATES         1
+#define VCHIQ_MAX_SERVICES       4096
+#define VCHIQ_MAX_SLOTS          128
+#define VCHIQ_MAX_SLOTS_PER_SIDE 64
+
+#define VCHIQ_NUM_CURRENT_BULKS        32
+#define VCHIQ_NUM_SERVICE_BULKS        4
+
+#ifndef VCHIQ_ENABLE_DEBUG
+#define VCHIQ_ENABLE_DEBUG             1
+#endif
+
+#ifndef VCHIQ_ENABLE_STATS
+#define VCHIQ_ENABLE_STATS             1
+#endif
+
+#endif /* VCHIQ_CFG_H */
diff --git a/include/linux/raspberrypi/vchiq_core.h b/include/linux/raspberrypi/vchiq_core.h
new file mode 100644
index 000000000000..e7bf7a114985
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq_core.h
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */
+
+#ifndef VCHIQ_CORE_H
+#define VCHIQ_CORE_H
+
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/dev_printk.h>
+#include <linux/kthread.h>
+#include <linux/kref.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
+
+#include "vchiq.h"
+#include "vchiq_cfg.h"
+
+/* Do this so that we can test-build the code on non-rpi systems */
+#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE)
+
+#else
+
+#ifndef dsb
+#define dsb(a)
+#endif
+
+#endif	/* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */
+
+#define VCHIQ_SERVICE_HANDLE_INVALID 0
+
+#define VCHIQ_SLOT_SIZE     4096
+#define VCHIQ_MAX_MSG_SIZE  (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header))
+
+#define VCHIQ_SLOT_MASK        (VCHIQ_SLOT_SIZE - 1)
+#define VCHIQ_SLOT_QUEUE_MASK  (VCHIQ_MAX_SLOTS_PER_SIDE - 1)
+#define VCHIQ_SLOT_ZERO_SLOTS  DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \
+					    VCHIQ_SLOT_SIZE)
+
+#define BITSET_SIZE(b)        ((b + 31) >> 5)
+#define BITSET_WORD(b)        (b >> 5)
+#define BITSET_BIT(b)         (1 << (b & 31))
+#define BITSET_IS_SET(bs, b)  (bs[BITSET_WORD(b)] & BITSET_BIT(b))
+#define BITSET_SET(bs, b)     (bs[BITSET_WORD(b)] |= BITSET_BIT(b))
+
+enum {
+	DEBUG_ENTRIES,
+#if VCHIQ_ENABLE_DEBUG
+	DEBUG_SLOT_HANDLER_COUNT,
+	DEBUG_SLOT_HANDLER_LINE,
+	DEBUG_PARSE_LINE,
+	DEBUG_PARSE_HEADER,
+	DEBUG_PARSE_MSGID,
+	DEBUG_AWAIT_COMPLETION_LINE,
+	DEBUG_DEQUEUE_MESSAGE_LINE,
+	DEBUG_SERVICE_CALLBACK_LINE,
+	DEBUG_MSG_QUEUE_FULL_COUNT,
+	DEBUG_COMPLETION_QUEUE_FULL_COUNT,
+#endif
+	DEBUG_MAX
+};
+
+#if VCHIQ_ENABLE_DEBUG
+
+#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug
+#define DEBUG_TRACE(d) \
+	do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0)
+#define DEBUG_VALUE(d, v) \
+	do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0)
+#define DEBUG_COUNT(d) \
+	do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0)
+
+#else /* VCHIQ_ENABLE_DEBUG */
+
+#define DEBUG_INITIALISE(local)
+#define DEBUG_TRACE(d)
+#define DEBUG_VALUE(d, v)
+#define DEBUG_COUNT(d)
+
+#endif /* VCHIQ_ENABLE_DEBUG */
+
+enum vchiq_connstate {
+	VCHIQ_CONNSTATE_DISCONNECTED,
+	VCHIQ_CONNSTATE_CONNECTING,
+	VCHIQ_CONNSTATE_CONNECTED,
+	VCHIQ_CONNSTATE_PAUSING,
+	VCHIQ_CONNSTATE_PAUSE_SENT,
+	VCHIQ_CONNSTATE_PAUSED,
+	VCHIQ_CONNSTATE_RESUMING,
+	VCHIQ_CONNSTATE_PAUSE_TIMEOUT,
+	VCHIQ_CONNSTATE_RESUME_TIMEOUT
+};
+
+enum {
+	VCHIQ_SRVSTATE_FREE,
+	VCHIQ_SRVSTATE_HIDDEN,
+	VCHIQ_SRVSTATE_LISTENING,
+	VCHIQ_SRVSTATE_OPENING,
+	VCHIQ_SRVSTATE_OPEN,
+	VCHIQ_SRVSTATE_OPENSYNC,
+	VCHIQ_SRVSTATE_CLOSESENT,
+	VCHIQ_SRVSTATE_CLOSERECVD,
+	VCHIQ_SRVSTATE_CLOSEWAIT,
+	VCHIQ_SRVSTATE_CLOSED
+};
+
+enum vchiq_bulk_dir {
+	VCHIQ_BULK_TRANSMIT,
+	VCHIQ_BULK_RECEIVE
+};
+
+struct vchiq_bulk {
+	short mode;
+	short dir;
+	void *cb_data;
+	void __user *cb_userdata;
+	struct bulk_waiter *waiter;
+	dma_addr_t dma_addr;
+	int size;
+	void *remote_data;
+	int remote_size;
+	int actual;
+	void *offset;
+	void __user *uoffset;
+};
+
+struct vchiq_bulk_queue {
+	int local_insert;  /* Where to insert the next local bulk */
+	int remote_insert; /* Where to insert the next remote bulk (master) */
+	int process;       /* Bulk to transfer next */
+	int remote_notify; /* Bulk to notify the remote client of next (mstr) */
+	int remove;        /* Bulk to notify the local client of, and remove, next */
+	struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS];
+};
+
+/*
+ * Remote events provide a way of presenting several virtual doorbells to a
+ * peer (ARM host to VPU) using only one physical doorbell. They can be thought
+ * of as a way for the peer to signal a semaphore, in this case implemented as
+ * a wait queue.
+ *
+ * Remote events remain signalled until acknowledged by the receiver, and they
+ * are non-counting. They are designed in such a way as to minimise the number
+ * of interrupts and avoid unnecessary waiting.
+ *
+ * A remote_event is a small data structure that lives in shared memory. It
+ * comprises two booleans - armed and fired:
+ *
+ * The sender sets fired when they signal the receiver.
+ * If fired is set, the receiver has been signalled and need not wait.
+ * The receiver sets the armed field before they begin to wait.
+ * If armed is set, the receiver is waiting and wishes to be woken by interrupt.
+ */
+struct remote_event {
+	int armed;
+	int fired;
+	u32 __unused;
+};
+
+struct opaque_platform_state;
+
+struct vchiq_slot {
+	char data[VCHIQ_SLOT_SIZE];
+};
+
+struct vchiq_slot_info {
+	/* Use two counters rather than one to avoid the need for a mutex. */
+	short use_count;
+	short release_count;
+};
+
+/*
+ * VCHIQ is a reliable connection-oriented datagram protocol.
+ *
+ * A VCHIQ service is equivalent to a TCP connection, except:
+ * + FOURCCs are used for the rendezvous, and port numbers are assigned at the
+ *   time the connection is established.
+ * + There is less of a distinction between server and client sockets, the only
+ *   difference being which end makes the first move.
+ * + For a multi-client server, the server creates new "listening" services as
+ *   the existing one becomes connected - there is no need to specify the
+ *   maximum number of clients up front.
+ * + Data transfer is reliable but packetized (messages have defined ends).
+ * + Messages can be either short (capable of fitting in a slot) and in-band,
+ *   or copied between external buffers (bulk transfers).
+ */
+struct vchiq_service {
+	struct vchiq_service_base base;
+	unsigned int handle;
+	struct kref ref_count;
+	struct rcu_head rcu;
+	int srvstate;
+	void (*userdata_term)(void *userdata);
+	unsigned int localport;
+	unsigned int remoteport;
+	int public_fourcc;
+	int client_id;
+	char auto_close;
+	char sync;
+	char closing;
+	char trace;
+	atomic_t poll_flags;
+	short version;
+	short version_min;
+	short peer_version;
+
+	struct vchiq_state *state;
+	struct vchiq_instance *instance;
+
+	int service_use_count;
+
+	struct vchiq_bulk_queue bulk_tx;
+	struct vchiq_bulk_queue bulk_rx;
+
+	struct completion remove_event;
+	struct completion bulk_remove_event;
+	struct mutex bulk_mutex;
+
+	struct service_stats_struct {
+		int quota_stalls;
+		int slot_stalls;
+		int bulk_stalls;
+		int error_count;
+		int ctrl_tx_count;
+		int ctrl_rx_count;
+		int bulk_tx_count;
+		int bulk_rx_count;
+		int bulk_aborted_count;
+		u64 ctrl_tx_bytes;
+		u64 ctrl_rx_bytes;
+		u64 bulk_tx_bytes;
+		u64 bulk_rx_bytes;
+	} stats;
+
+	int msg_queue_read;
+	int msg_queue_write;
+	struct completion msg_queue_pop;
+	struct completion msg_queue_push;
+	struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS];
+};
+
+/*
+ * The quota information is outside struct vchiq_service so that it can
+ * be statically allocated, since for accounting reasons a service's slot
+ * usage is carried over between users of the same port number.
+ */
+struct vchiq_service_quota {
+	unsigned short slot_quota;
+	unsigned short slot_use_count;
+	unsigned short message_quota;
+	unsigned short message_use_count;
+	struct completion quota_event;
+	int previous_tx_index;
+};
+
+struct vchiq_shared_state {
+	/* A non-zero value here indicates that the content is valid. */
+	int initialised;
+
+	/* The first and last (inclusive) slots allocated to the owner. */
+	int slot_first;
+	int slot_last;
+
+	/* The slot allocated to synchronous messages from the owner. */
+	int slot_sync;
+
+	/*
+	 * Signalling this event indicates that owner's slot handler thread
+	 * should run.
+	 */
+	struct remote_event trigger;
+
+	/*
+	 * Indicates the byte position within the stream where the next message
+	 * will be written. The least significant bits are an index into the
+	 * slot. The next bits are the index of the slot in slot_queue.
+	 */
+	int tx_pos;
+
+	/* This event should be signalled when a slot is recycled. */
+	struct remote_event recycle;
+
+	/* The slot_queue index where the next recycled slot will be written. */
+	int slot_queue_recycle;
+
+	/* This event should be signalled when a synchronous message is sent. */
+	struct remote_event sync_trigger;
+
+	/*
+	 * This event should be signalled when a synchronous message has been
+	 * released.
+	 */
+	struct remote_event sync_release;
+
+	/* A circular buffer of slot indexes. */
+	int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE];
+
+	/* Debugging state */
+	int debug[DEBUG_MAX];
+};
+
+/*
+ * vchiq_slot_zero describes the memory shared between the ARM host and the
+ * VideoCore VPU. The "master" and "slave" states are owned by the respective
+ * sides but visible to the other; the slots are shared, and the remaining
+ * fields are read-only.
+ *
+ * In the configuration used by this implementation, the memory is allocated
+ * by the host, the VPU is the master (the side which controls the DMA for bulk
+ * transfers), and the host is the slave.
+ *
+ * The ownership of slots changes with use:
+ * + When empty they are owned by the sender.
+ * + When partially filled they are shared with the receiver.
+ * + When completely full they are owned by the receiver.
+ * + When the receiver has finished processing the contents, they are recycled
+ *   back to the sender.
+ */
+struct vchiq_slot_zero {
+	int magic;
+	short version;
+	short version_min;
+	int slot_zero_size;
+	int slot_size;
+	int max_slots;
+	int max_slots_per_side;
+	int platform_data[2];
+	struct vchiq_shared_state master;
+	struct vchiq_shared_state slave;
+	struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS];
+};
+
+/*
+ * This is the private runtime state used by each side. The same structure was
+ * originally used by both sides, but implementations have since diverged.
+ */
+struct vchiq_state {
+	struct device *dev;
+	int id;
+	int initialised;
+	enum vchiq_connstate conn_state;
+	short version_common;
+
+	struct vchiq_shared_state *local;
+	struct vchiq_shared_state *remote;
+	struct vchiq_slot *slot_data;
+
+	unsigned short default_slot_quota;
+	unsigned short default_message_quota;
+
+	/* Event indicating connect message received */
+	struct completion connect;
+
+	/* Mutex protecting services */
+	struct mutex mutex;
+	struct vchiq_instance **instance;
+
+	/* Processes all incoming messages which aren't synchronous */
+	struct task_struct *slot_handler_thread;
+
+	/*
+	 * Slots which have been fully processed and released by the (peer)
+	 * receiver are added to the receiver queue, which is asynchronously
+	 * processed by the recycle thread.
+	 */
+	struct task_struct *recycle_thread;
+
+	/*
+	 * Processes incoming synchronous messages
+	 *
+	 * The synchronous message channel is shared between all synchronous
+	 * services, and provides a way for urgent messages to bypass
+	 * potentially long queues of asynchronous messages in the normal slots.
+	 *
+	 * There can be only one outstanding synchronous message in
+	 * each direction, and as a precious shared resource synchronous
+	 * services should be used sparingly.
+	 */
+	struct task_struct *sync_thread;
+
+	/* Local implementation of the trigger remote event */
+	wait_queue_head_t trigger_event;
+
+	/* Local implementation of the recycle remote event */
+	wait_queue_head_t recycle_event;
+
+	/* Local implementation of the sync trigger remote event */
+	wait_queue_head_t sync_trigger_event;
+
+	/* Local implementation of the sync release remote event */
+	wait_queue_head_t sync_release_event;
+
+	char *tx_data;
+	char *rx_data;
+	struct vchiq_slot_info *rx_info;
+
+	struct mutex slot_mutex;
+
+	struct mutex recycle_mutex;
+
+	struct mutex sync_mutex;
+
+	spinlock_t msg_queue_spinlock;
+
+	spinlock_t bulk_waiter_spinlock;
+
+	spinlock_t quota_spinlock;
+
+	/*
+	 * Indicates the byte position within the stream from where the next
+	 * message will be read. The least significant bits are an index into
+	 * the slot. The next bits are the index of the slot in
+	 * remote->slot_queue.
+	 */
+	int rx_pos;
+
+	/*
+	 * A cached copy of local->tx_pos. Only write to local->tx_pos, and read
+	 * from remote->tx_pos.
+	 */
+	int local_tx_pos;
+
+	/* The slot_queue index of the slot to become available next. */
+	int slot_queue_available;
+
+	/* A flag to indicate if any poll has been requested */
+	int poll_needed;
+
+	/* The index of the previous slot used for data messages. */
+	int previous_data_index;
+
+	/* The number of slots occupied by data messages. */
+	unsigned short data_use_count;
+
+	/* The maximum number of slots to be occupied by data messages. */
+	unsigned short data_quota;
+
+	/* An array of bit sets indicating which services must be polled. */
+	atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)];
+
+	/* The number of the first unused service */
+	int unused_service;
+
+	/* Signalled when a free slot becomes available. */
+	struct completion slot_available_event;
+
+	/* Signalled when a free data slot becomes available. */
+	struct completion data_quota_event;
+
+	struct state_stats_struct {
+		int slot_stalls;
+		int data_stalls;
+		int ctrl_tx_count;
+		int ctrl_rx_count;
+		int error_count;
+	} stats;
+
+	struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES];
+	struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES];
+	struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS];
+
+	struct opaque_platform_state *platform_state;
+};
+
+struct pagelist {
+	u32 length;
+	u16 type;
+	u16 offset;
+	u32 addrs[1];	/* N.B. 12 LSBs hold the number
+			 * of following pages at consecutive
+			 * addresses.
+			 */
+};
+
+struct vchiq_pagelist_info {
+	struct pagelist *pagelist;
+	size_t pagelist_buffer_size;
+	dma_addr_t dma_addr;
+	enum dma_data_direction dma_dir;
+	unsigned int num_pages;
+	unsigned int pages_need_release;
+	struct page **pages;
+	struct scatterlist *scatterlist;
+	unsigned int scatterlist_mapped;
+};
+
+static inline bool vchiq_remote_initialised(const struct vchiq_state *state)
+{
+	return state->remote && state->remote->initialised;
+}
+
+struct bulk_waiter {
+	struct vchiq_bulk *bulk;
+	struct completion event;
+	int actual;
+};
+
+struct vchiq_config {
+	unsigned int max_msg_size;
+	unsigned int bulk_threshold;	/* The message size above which it
+					 * is better to use a bulk transfer
+					 * (<= max_msg_size)
+					 */
+	unsigned int max_outstanding_bulks;
+	unsigned int max_services;
+	short version;      /* The version of VCHIQ */
+	short version_min;  /* The minimum compatible version of VCHIQ */
+};
+
+extern spinlock_t bulk_waiter_spinlock;
+
+extern const char *
+get_conn_state_name(enum vchiq_connstate conn_state);
+
+extern struct vchiq_slot_zero *
+vchiq_init_slots(struct device *dev, void *mem_base, int mem_size);
+
+extern int
+vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev);
+
+extern int
+vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance);
+
+struct vchiq_service *
+vchiq_add_service_internal(struct vchiq_state *state,
+			   const struct vchiq_service_params_kernel *params,
+			   int srvstate, struct vchiq_instance *instance,
+			   void (*userdata_term)(void *userdata));
+
+extern int
+vchiq_open_service_internal(struct vchiq_service *service, int client_id);
+
+extern int
+vchiq_close_service_internal(struct vchiq_service *service, int close_recvd);
+
+extern void
+vchiq_terminate_service_internal(struct vchiq_service *service);
+
+extern void
+vchiq_free_service_internal(struct vchiq_service *service);
+
+extern void
+vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance);
+
+extern void
+remote_event_pollall(struct vchiq_state *state);
+
+extern int
+vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle,
+			struct bulk_waiter *userdata);
+
+extern int
+vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle,
+			 struct vchiq_bulk *bulk);
+
+extern int
+vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle,
+			 struct vchiq_bulk *bulk);
+
+extern void
+vchiq_dump_state(struct seq_file *f, struct vchiq_state *state);
+
+extern void
+request_poll(struct vchiq_state *state, struct vchiq_service *service,
+	     int poll_type);
+
+struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle);
+
+extern struct vchiq_service *
+find_service_by_handle(struct vchiq_instance *instance, unsigned int handle);
+
+extern struct vchiq_service *
+find_service_by_port(struct vchiq_state *state, unsigned int localport);
+
+extern struct vchiq_service *
+find_service_for_instance(struct vchiq_instance *instance, unsigned int handle);
+
+extern struct vchiq_service *
+find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle);
+
+extern struct vchiq_service *
+__next_service_by_instance(struct vchiq_state *state,
+			   struct vchiq_instance *instance,
+			   int *pidx);
+
+extern struct vchiq_service *
+next_service_by_instance(struct vchiq_state *state,
+			 struct vchiq_instance *instance,
+			 int *pidx);
+
+extern void
+vchiq_service_get(struct vchiq_service *service);
+
+extern void
+vchiq_service_put(struct vchiq_service *service);
+
+extern int
+vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle,
+		    ssize_t (*copy_callback)(void *context, void *dest,
+					     size_t offset, size_t maxsize),
+		    void *context,
+		    size_t size);
+
+void vchiq_dump_platform_state(struct seq_file *f);
+
+void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f);
+
+void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service);
+
+int vchiq_use_service_internal(struct vchiq_service *service);
+
+int vchiq_release_service_internal(struct vchiq_service *service);
+
+void vchiq_on_remote_use(struct vchiq_state *state);
+
+void vchiq_on_remote_release(struct vchiq_state *state);
+
+int vchiq_platform_init_state(struct vchiq_state *state);
+
+int vchiq_check_service(struct vchiq_service *service);
+
+int vchiq_send_remote_use(struct vchiq_state *state);
+
+int vchiq_send_remote_use_active(struct vchiq_state *state);
+
+void vchiq_platform_conn_state_changed(struct vchiq_state *state,
+				       enum vchiq_connstate oldstate,
+				  enum vchiq_connstate newstate);
+
+void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate);
+
+void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr,
+			const void *void_mem, size_t num_bytes);
+
+int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service);
+
+int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service);
+
+void vchiq_get_config(struct vchiq_config *config);
+
+int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service,
+			     enum vchiq_service_option option, int value);
+
+#endif
diff --git a/include/linux/raspberrypi/vchiq_debugfs.h b/include/linux/raspberrypi/vchiq_debugfs.h
new file mode 100644
index 000000000000..b29e6693c949
--- /dev/null
+++ b/include/linux/raspberrypi/vchiq_debugfs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. */
+
+#ifndef VCHIQ_DEBUGFS_H
+#define VCHIQ_DEBUGFS_H
+
+struct vchiq_state;
+struct vchiq_instance;
+
+struct vchiq_debugfs_node {
+	struct dentry *dentry;
+};
+
+void vchiq_debugfs_init(struct vchiq_state *state);
+
+void vchiq_debugfs_deinit(void);
+
+void vchiq_debugfs_add_instance(struct vchiq_instance *instance);
+
+void vchiq_debugfs_remove_instance(struct vchiq_instance *instance);
+
+#endif /* VCHIQ_DEBUGFS_H */
-- 
cgit v1.2.3


From 7b8a8ec20cfce2298f6737089f5d17407ea346b4 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Mon, 27 Oct 2025 11:34:01 +0200
Subject: PCI/TPH: Expose pcie_tph_get_st_table_loc()

Expose pcie_tph_get_st_table_loc() to be used by drivers as will be done
in the next patch from the series.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251027-st-direct-mode-v1-1-e0ad953866b6@nvidia.com
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/pci/tph.c       | 16 +++++++++++++---
 include/linux/pci-tph.h |  1 +
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index cc64f93709a4..ca4f97be7538 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -155,7 +155,16 @@ static u8 get_st_modes(struct pci_dev *pdev)
 	return reg;
 }
 
-static u32 get_st_table_loc(struct pci_dev *pdev)
+/**
+ * pcie_tph_get_st_table_loc - Return the device's ST table location
+ * @pdev: PCI device to query
+ *
+ * Return:
+ *  PCI_TPH_LOC_NONE - Not present
+ *  PCI_TPH_LOC_CAP  - Located in the TPH Requester Extended Capability
+ *  PCI_TPH_LOC_MSIX - Located in the MSI-X Table
+ */
+u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev)
 {
 	u32 reg;
 
@@ -163,6 +172,7 @@ static u32 get_st_table_loc(struct pci_dev *pdev)
 
 	return FIELD_GET(PCI_TPH_CAP_LOC_MASK, reg);
 }
+EXPORT_SYMBOL(pcie_tph_get_st_table_loc);
 
 /*
  * Return the size of ST table. If ST table is not in TPH Requester Extended
@@ -174,7 +184,7 @@ u16 pcie_tph_get_st_table_size(struct pci_dev *pdev)
 	u32 loc;
 
 	/* Check ST table location first */
-	loc = get_st_table_loc(pdev);
+	loc = pcie_tph_get_st_table_loc(pdev);
 
 	/* Convert loc to match with PCI_TPH_LOC_* defined in pci_regs.h */
 	loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc);
@@ -299,7 +309,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag)
 	 */
 	set_ctrl_reg_req_en(pdev, PCI_TPH_REQ_DISABLE);
 
-	loc = get_st_table_loc(pdev);
+	loc = pcie_tph_get_st_table_loc(pdev);
 	/* Convert loc to match with PCI_TPH_LOC_* */
 	loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc);
 
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index 9e4e331b1603..ba28140ce670 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -29,6 +29,7 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev,
 void pcie_disable_tph(struct pci_dev *pdev);
 int pcie_enable_tph(struct pci_dev *pdev, int mode);
 u16 pcie_tph_get_st_table_size(struct pci_dev *pdev);
+u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev);
 #else
 static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
 					unsigned int index, u16 tag)
-- 
cgit v1.2.3


From 6948417b3f1fafbeab85c051f8dba5e305a8f9c4 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 29 Oct 2025 17:42:53 +0200
Subject: net/mlx5: Add OTHER_ESWITCH HW capabilities

Add OTHER_ESWITCH capabilities which include other_eswitch and
eswitch_owner_vhca_id to all steering objects.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-1-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 47 ++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 07614cd95bed..9b8f88987d2f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5251,13 +5251,15 @@ struct mlx5_ifc_set_fte_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -8809,13 +8811,15 @@ struct mlx5_ifc_destroy_flow_table_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -8840,13 +8844,15 @@ struct mlx5_ifc_destroy_flow_group_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -8985,13 +8991,15 @@ struct mlx5_ifc_delete_fte_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -9535,13 +9543,15 @@ struct mlx5_ifc_create_flow_table_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x20];
 
@@ -9580,7 +9590,8 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x20];
@@ -9588,7 +9599,7 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x4];
 	u8         group_type[0x4];
-	u8         reserved_at_90[0x10];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
@@ -11876,10 +11887,12 @@ struct mlx5_ifc_set_flow_table_root_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
-	u8         reserved_at_60[0x20];
+	u8         reserved_at_60[0x10];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x7];
@@ -11919,14 +11932,16 @@ struct mlx5_ifc_modify_flow_table_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_at_41[0xf];
+	u8         other_eswitch[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x10];
 	u8         modify_field_select[0x10];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x8];
+	u8         eswitch_owner_vhca_id[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
-- 
cgit v1.2.3


From 3b848dec7e821bace785b9e405bf1884c077635a Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 29 Oct 2025 17:42:54 +0200
Subject: net/mlx5: fs, Add other_eswitch support for steering tables

Add other_eswitch support which allows flow tables creation above vports
that reside on different esw managers.

The new flag MLX5_FLOW_TABLE_OTHER_ESWITCH indicates if the
esw_owner_vhca_id attribute is supported.

Note that this is only supported if the Advanced-RDMA capability
rdma_transport_manager_other_eswitch is set.
It is the caller's responsibility to check that.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-2-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  | 31 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 18 ++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  1 +
 include/linux/mlx5/fs.h                           |  2 ++
 4 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 1af76da8b132..ced747bef641 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -239,6 +239,10 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport);
 	MLX5_SET(set_flow_table_root_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(set_flow_table_root_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(set_flow_table_root_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 
 	err = mlx5_cmd_exec_in(dev, set_flow_table_root, in);
 	if (!err &&
@@ -302,6 +306,10 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(create_flow_table_in, in, vport_number, ft->vport);
 	MLX5_SET(create_flow_table_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(create_flow_table_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(create_flow_table_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 
 	MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en,
 		 en_decap);
@@ -360,6 +368,10 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport);
 	MLX5_SET(destroy_flow_table_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(destroy_flow_table_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(destroy_flow_table_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 
 	err = mlx5_cmd_exec_in(dev, destroy_flow_table, in);
 	if (!err)
@@ -394,6 +406,10 @@ static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns,
 		MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport);
 		MLX5_SET(modify_flow_table_in, in, other_vport,
 			 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+		MLX5_SET(modify_flow_table_in, in, eswitch_owner_vhca_id,
+			 ft->esw_owner_vhca_id);
+		MLX5_SET(modify_flow_table_in, in, other_eswitch,
+			 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 		MLX5_SET(modify_flow_table_in, in, modify_field_select,
 			 MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID);
 		if (next_ft) {
@@ -429,6 +445,10 @@ static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(create_flow_group_in, in, vport_number, ft->vport);
 	MLX5_SET(create_flow_group_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(create_flow_group_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(create_flow_group_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 	err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out);
 	if (!err)
 		fg->id = MLX5_GET(create_flow_group_out, out,
@@ -451,6 +471,10 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport);
 	MLX5_SET(destroy_flow_group_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(destroy_flow_group_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(destroy_flow_group_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 	return mlx5_cmd_exec_in(dev, destroy_flow_group, in);
 }
 
@@ -559,6 +583,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(set_fte_in, in, vport_number, ft->vport);
 	MLX5_SET(set_fte_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(set_fte_in, in, eswitch_owner_vhca_id, ft->esw_owner_vhca_id);
+	MLX5_SET(set_fte_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 
 	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
@@ -788,6 +815,10 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(delete_fte_in, in, vport_number, ft->vport);
 	MLX5_SET(delete_fte_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
+	MLX5_SET(delete_fte_in, in, eswitch_owner_vhca_id,
+		 ft->esw_owner_vhca_id);
+	MLX5_SET(delete_fte_in, in, other_eswitch,
+		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH));
 
 	return mlx5_cmd_exec_in(dev, delete_fte, in);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 2db3ffb0a2b2..87e381c82ed3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -939,10 +939,10 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f
 	return fg;
 }
 
-static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport,
-						enum fs_flow_table_type table_type,
-						enum fs_flow_table_op_mod op_mod,
-						u32 flags)
+static struct mlx5_flow_table *
+alloc_flow_table(struct mlx5_flow_table_attr *ft_attr, u16 vport,
+		 enum fs_flow_table_type table_type,
+		 enum fs_flow_table_op_mod op_mod)
 {
 	struct mlx5_flow_table *ft;
 	int ret;
@@ -957,12 +957,13 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport,
 		return ERR_PTR(ret);
 	}
 
-	ft->level = level;
+	ft->level = ft_attr->level;
 	ft->node.type = FS_TYPE_FLOW_TABLE;
 	ft->op_mod = op_mod;
 	ft->type = table_type;
 	ft->vport = vport;
-	ft->flags = flags;
+	ft->esw_owner_vhca_id = ft_attr->esw_owner_vhca_id;
+	ft->flags = ft_attr->flags;
 	INIT_LIST_HEAD(&ft->fwd_rules);
 	mutex_init(&ft->lock);
 
@@ -1370,10 +1371,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 	/* The level is related to the
 	 * priority level range.
 	 */
-	ft = alloc_flow_table(ft_attr->level,
-			      vport,
-			      root->table_type,
-			      op_mod, ft_attr->flags);
+	ft = alloc_flow_table(ft_attr, vport, root->table_type, op_mod);
 	if (IS_ERR(ft)) {
 		err = PTR_ERR(ft);
 		goto unlock_root;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 8458ce203dac..0a9a5ef34c21 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -205,6 +205,7 @@ struct mlx5_flow_table {
 	};
 	u32				id;
 	u16				vport;
+	u16				esw_owner_vhca_id;
 	unsigned int			max_fte;
 	unsigned int			level;
 	enum fs_flow_table_type		type;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6ac76a0c3827..6325a7fa0df2 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -71,6 +71,7 @@ enum {
 	MLX5_FLOW_TABLE_UNMANAGED = BIT(3),
 	MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4),
 	MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5),
+	MLX5_FLOW_TABLE_OTHER_ESWITCH = BIT(6),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
@@ -208,6 +209,7 @@ struct mlx5_flow_table_attr {
 	u32 flags;
 	u16 uid;
 	u16 vport;
+	u16 esw_owner_vhca_id;
 	struct mlx5_flow_table *next_ft;
 
 	struct {
-- 
cgit v1.2.3


From 583b4fe1c19d978bb787e0adf9ce469cb7f68455 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 29 Oct 2025 17:42:55 +0200
Subject: net/mlx5: fs, set non default device per namespace

Add mlx5_fs_set_root_dev() function which swaps the root namespace
core device with another one for a given table_type.

It is intended for usage only by RDMA_TRANSPORT tables in case of LAG
configuration, to allow the creation of tables during LAG always
through the LAG master device, which is valid since during LAG the
master is allowed to manage the RDMA_TRANSPORT tables of its slaves.

In addition move the table_type enum to global include to allow its use
in a downstream patch in the RDMA driver.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-3-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 56 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 18 --------
 include/linux/mlx5/fs.h                           | 22 +++++++++
 3 files changed, 78 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 87e381c82ed3..5b210c54a592 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3308,6 +3308,62 @@ err:
 	return ret;
 }
 
+static bool mlx5_fs_ns_is_empty(struct mlx5_flow_namespace *ns)
+{
+	struct fs_prio *iter_prio;
+
+	fs_for_each_prio(iter_prio, ns) {
+		if (iter_prio->num_ft)
+			return false;
+	}
+
+	return true;
+}
+
+int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev,
+			 struct mlx5_core_dev *new_dev,
+			 enum fs_flow_table_type table_type)
+{
+	struct mlx5_flow_root_namespace	**root;
+	int total_vports;
+	int i;
+
+	switch (table_type) {
+	case FS_FT_RDMA_TRANSPORT_TX:
+		root = dev->priv.steering->rdma_transport_tx_root_ns;
+		total_vports = dev->priv.steering->rdma_transport_tx_vports;
+		break;
+	case FS_FT_RDMA_TRANSPORT_RX:
+		root = dev->priv.steering->rdma_transport_rx_root_ns;
+		total_vports = dev->priv.steering->rdma_transport_rx_vports;
+		break;
+	default:
+		WARN_ON_ONCE(true);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < total_vports; i++) {
+		mutex_lock(&root[i]->chain_lock);
+		if (!mlx5_fs_ns_is_empty(&root[i]->ns)) {
+			mutex_unlock(&root[i]->chain_lock);
+			goto err;
+		}
+		root[i]->dev = new_dev;
+		mutex_unlock(&root[i]->chain_lock);
+	}
+	return 0;
+err:
+	while (i--) {
+		mutex_lock(&root[i]->chain_lock);
+		root[i]->dev = dev;
+		mutex_unlock(&root[i]->chain_lock);
+	}
+	/* If you hit this error try destroying all flow tables and try again */
+	mlx5_core_err(dev, "Failed to set root device for RDMA TRANSPORT\n");
+	return -EINVAL;
+}
+EXPORT_SYMBOL(mlx5_fs_set_root_dev);
+
 static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering)
 {
 	struct mlx5_core_dev *dev = steering->dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 0a9a5ef34c21..1c6591425260 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -103,24 +103,6 @@ enum fs_node_type {
 	FS_TYPE_FLOW_DEST
 };
 
-enum fs_flow_table_type {
-	FS_FT_NIC_RX          = 0x0,
-	FS_FT_NIC_TX          = 0x1,
-	FS_FT_ESW_EGRESS_ACL  = 0x2,
-	FS_FT_ESW_INGRESS_ACL = 0x3,
-	FS_FT_FDB             = 0X4,
-	FS_FT_SNIFFER_RX	= 0X5,
-	FS_FT_SNIFFER_TX	= 0X6,
-	FS_FT_RDMA_RX		= 0X7,
-	FS_FT_RDMA_TX		= 0X8,
-	FS_FT_PORT_SEL		= 0X9,
-	FS_FT_FDB_RX		= 0xa,
-	FS_FT_FDB_TX		= 0xb,
-	FS_FT_RDMA_TRANSPORT_RX	= 0xd,
-	FS_FT_RDMA_TRANSPORT_TX	= 0xe,
-	FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX,
-};
-
 enum fs_flow_table_op_mod {
 	FS_FT_OP_MOD_NORMAL,
 	FS_FT_OP_MOD_LAG_DEMUX,
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6325a7fa0df2..fe721557bd1d 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -128,6 +128,24 @@ enum {
 	FDB_PER_VPORT,
 };
 
+enum fs_flow_table_type {
+	FS_FT_NIC_RX          = 0x0,
+	FS_FT_NIC_TX          = 0x1,
+	FS_FT_ESW_EGRESS_ACL  = 0x2,
+	FS_FT_ESW_INGRESS_ACL = 0x3,
+	FS_FT_FDB             = 0X4,
+	FS_FT_SNIFFER_RX	= 0X5,
+	FS_FT_SNIFFER_TX	= 0X6,
+	FS_FT_RDMA_RX		= 0X7,
+	FS_FT_RDMA_TX		= 0X8,
+	FS_FT_PORT_SEL		= 0X9,
+	FS_FT_FDB_RX		= 0xa,
+	FS_FT_FDB_TX		= 0xb,
+	FS_FT_RDMA_TRANSPORT_RX	= 0xd,
+	FS_FT_RDMA_TRANSPORT_TX	= 0xe,
+	FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX,
+};
+
 struct mlx5_pkt_reformat;
 struct mlx5_modify_hdr;
 struct mlx5_flow_definer;
@@ -355,4 +373,8 @@ u32 mlx5_flow_table_id(struct mlx5_flow_table *ft);
 
 struct mlx5_flow_root_namespace *
 mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type);
+
+int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev,
+			 struct mlx5_core_dev *new_dev,
+			 enum fs_flow_table_type table_type);
 #endif
-- 
cgit v1.2.3


From 1d165919c8261b927f8dc8cfe61eb04342bedb7e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 25 Oct 2025 19:47:59 -0700
Subject: iio: imu: adis: fix all kernel-doc warnings in header file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Correct and add to adis.h to resolve all kernel-doc warnings:

- add a missing struct member description
- change one non-kernel-doc comment to use /* instead of /**
- correct function parameter @value to @val (7 locations)
- add function return value comments (13 locations)

Warning: include/linux/iio/imu/adis.h:97 struct member 'has_fifo'
 not described in 'adis_data'
Warning: include/linux/iio/imu/adis.h:139 Incorrect use of kernel-doc
 format: * The state_lock is meant to be used during operations that
 require
Warning: include/linux/iio/imu/adis.h:158 struct member '"__adis_"'
 not described in 'adis'
Warning: include/linux/iio/imu/adis.h:264 function parameter 'val'
 not described in 'adis_write_reg'
Warning: include/linux/iio/imu/adis.h:371 No description found for
 return value of 'adis_update_bits_base'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/imu/adis.h | 45 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h
index aa160511e265..bfb6df68e6c9 100644
--- a/include/linux/iio/imu/adis.h
+++ b/include/linux/iio/imu/adis.h
@@ -57,6 +57,7 @@ struct adis_timeout {
  * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable
  * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin
  * @has_paging: True if ADIS device has paged registers
+ * @has_fifo: True if ADIS device has a hardware FIFO
  * @burst_reg_cmd:	Register command that triggers burst
  * @burst_len:		Burst size in the SPI RX buffer. If @burst_max_len is defined,
  *			this should be the minimum size supported by the device.
@@ -136,7 +137,7 @@ struct adis {
 	const struct adis_data	*data;
 	unsigned int		burst_extra_len;
 	const struct adis_ops	*ops;
-	/**
+	/*
 	 * The state_lock is meant to be used during operations that require
 	 * a sequence of SPI R/W in order to protect the SPI transfer
 	 * information (fields 'xfer', 'msg' & 'current_page') between
@@ -166,7 +167,7 @@ int __adis_reset(struct adis *adis);
  * adis_reset() - Reset the device
  * @adis: The adis device
  *
- * Returns 0 on success, a negative error code otherwise
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_reset(struct adis *adis)
 {
@@ -183,7 +184,9 @@ int __adis_read_reg(struct adis *adis, unsigned int reg,
  * __adis_write_reg_8() - Write single byte to a register (unlocked)
  * @adis: The adis device
  * @reg: The address of the register to be written
- * @value: The value to write
+ * @val: The value to write
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg,
 				     u8 val)
@@ -195,7 +198,9 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg,
  * __adis_write_reg_16() - Write 2 bytes to a pair of registers (unlocked)
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
- * @value: Value to be written
+ * @val: Value to be written
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg,
 				      u16 val)
@@ -207,7 +212,9 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg,
  * __adis_write_reg_32() - write 4 bytes to four registers (unlocked)
  * @adis: The adis device
  * @reg: The address of the lower of the four register
- * @value: Value to be written
+ * @val: Value to be written
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg,
 				      u32 val)
@@ -220,6 +227,8 @@ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg,
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
  * @val: The value read back from the device
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg,
 				     u16 *val)
@@ -239,6 +248,8 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg,
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
  * @val: The value read back from the device
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg,
 				     u32 *val)
@@ -257,8 +268,10 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg,
  * adis_write_reg() - write N bytes to register
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
- * @value: The value to write to device (up to 4 bytes)
+ * @val: The value to write to device (up to 4 bytes)
  * @size: The size of the @value (in bytes)
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_write_reg(struct adis *adis, unsigned int reg,
 				 unsigned int val, unsigned int size)
@@ -273,6 +286,8 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg,
  * @reg: The address of the lower of the two registers
  * @val: The value read back from the device
  * @size: The size of the @val buffer
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static int adis_read_reg(struct adis *adis, unsigned int reg,
 			 unsigned int *val, unsigned int size)
@@ -285,7 +300,9 @@ static int adis_read_reg(struct adis *adis, unsigned int reg,
  * adis_write_reg_8() - Write single byte to a register
  * @adis: The adis device
  * @reg: The address of the register to be written
- * @value: The value to write
+ * @val: The value to write
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_write_reg_8(struct adis *adis, unsigned int reg,
 				   u8 val)
@@ -297,7 +314,9 @@ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg,
  * adis_write_reg_16() - Write 2 bytes to a pair of registers
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
- * @value: Value to be written
+ * @val: Value to be written
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_write_reg_16(struct adis *adis, unsigned int reg,
 				    u16 val)
@@ -309,7 +328,9 @@ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg,
  * adis_write_reg_32() - write 4 bytes to four registers
  * @adis: The adis device
  * @reg: The address of the lower of the four register
- * @value: Value to be written
+ * @val: Value to be written
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_write_reg_32(struct adis *adis, unsigned int reg,
 				    u32 val)
@@ -322,6 +343,8 @@ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg,
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
  * @val: The value read back from the device
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_read_reg_16(struct adis *adis, unsigned int reg,
 				   u16 *val)
@@ -341,6 +364,8 @@ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg,
  * @adis: The adis device
  * @reg: The address of the lower of the two registers
  * @val: The value read back from the device
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_read_reg_32(struct adis *adis, unsigned int reg,
 				   u32 *val)
@@ -366,6 +391,8 @@ int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask,
  * @size: Size of the register to update
  *
  * Updates the desired bits of @reg in accordance with @mask and @val.
+ *
+ * Returns: %0 on success, a negative error code otherwise
  */
 static inline int adis_update_bits_base(struct adis *adis, unsigned int reg,
 					const u32 mask, const u32 val, u8 size)
-- 
cgit v1.2.3


From aaf46c6a6df6052881c2e75cba65aeb6f1cfa88a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 23 Oct 2025 22:21:02 -0700
Subject: tee: <uapi/linux/tee.h>: fix all kernel-doc issues

Fix kernel-doc warnings so that there are no other kernel-doc issues
in <uapi/linux/tee.h>:

- add ending ':' to some struct members as needed for kernel-doc
- change struct name in kernel-doc to match the actual struct name (2x)
- add a @params: kernel-doc entry multiple times

Warning: tee.h:265 struct member 'ret_origin' not described
 in 'tee_ioctl_open_session_arg'
Warning: tee.h:265 struct member 'num_params' not described
 in 'tee_ioctl_open_session_arg'
Warning: tee.h:265 struct member 'params' not described
 in 'tee_ioctl_open_session_arg'
Warning: tee.h:351 struct member 'num_params' not described
 in 'tee_iocl_supp_recv_arg'
Warning: tee.h:351 struct member 'params' not described
 in 'tee_iocl_supp_recv_arg'
Warning: tee.h:372 struct member 'num_params' not described
 in 'tee_iocl_supp_send_arg'
Warning: tee.h:372 struct member 'params' not described
 in 'tee_iocl_supp_send_arg'
Warning: tee.h:298: expecting prototype for struct
 tee_ioctl_invoke_func_arg. Prototype was for
 struct tee_ioctl_invoke_arg instead
Warning: tee.h:473: expecting prototype for struct
 tee_ioctl_invoke_func_arg. Prototype was for struct
 tee_ioctl_object_invoke_arg instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Sumit Garg <sumit.garg@oss.qualcomm.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/uapi/linux/tee.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 386ad36f1a0a..cab5cadca8ef 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -249,8 +249,9 @@ struct tee_ioctl_param {
  * @cancel_id:	[in] Cancellation id, a unique value to identify this request
  * @session:	[out] Session id
  * @ret:	[out] return value
- * @ret_origin	[out] origin of the return value
- * @num_params	[in] number of parameters following this struct
+ * @ret_origin:	[out] origin of the return value
+ * @num_params:	[in] number of &struct tee_ioctl_param entries in @params
+ * @params:	array of ioctl parameters
  */
 struct tee_ioctl_open_session_arg {
 	__u8 uuid[TEE_IOCTL_UUID_LEN];
@@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg {
 				     struct tee_ioctl_buf_data)
 
 /**
- * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted
- * Application
+ * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application
  * @func:	[in] Trusted Application function, specific to the TA
  * @session:	[in] Session id
  * @cancel_id:	[in] Cancellation id, a unique value to identify this request
  * @ret:	[out] return value
- * @ret_origin	[out] origin of the return value
- * @num_params	[in] number of parameters following this struct
+ * @ret_origin:	[out] origin of the return value
+ * @num_params:	[in] number of parameters following this struct
+ * @params:	array of ioctl parameters
  */
 struct tee_ioctl_invoke_arg {
 	__u32 func;
@@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg {
 /**
  * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function
  * @func:	[in] supplicant function
- * @num_params	[in/out] number of parameters following this struct
+ * @num_params:	[in/out] number of &struct tee_ioctl_param entries in @params
+ * @params:	array of ioctl parameters
  *
  * @num_params is the number of params that tee-supplicant has room to
  * receive when input, @num_params is the number of actual params
@@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg {
 /**
  * struct tee_iocl_supp_send_arg - Send a response to a received request
  * @ret:	[out] return value
- * @num_params	[in] number of parameters following this struct
+ * @num_params:	[in] number of &struct tee_ioctl_param entries in @params
+ * @params:	array of ioctl parameters
  */
 struct tee_iocl_supp_send_arg {
 	__u32 ret;
@@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data {
  */
 
 /**
- * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application
+ * struct tee_ioctl_object_invoke_arg - Invokes an object in a
+ *   Trusted Application
  * @id:		[in] Object id
  * @op:		[in] Object operation, specific to the object
  * @ret:	[out] return value
  * @num_params:	[in] number of parameters following this struct
+ * @params:	array of ioctl parameters
  */
 struct tee_ioctl_object_invoke_arg {
 	__u64 id;
-- 
cgit v1.2.3


From 7cd3d204412b0584df38fd7be20002137f34721a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 9 Nov 2025 22:11:23 +0100
Subject: ns: don't increment or decrement initial namespaces

There's no need to bump the active reference counts of initial
namespaces as they're always active and can simply remain at 1.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-2-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 23 ++++++++++++++++++++---
 kernel/nscommon.c         |  6 ++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index bd4492ef6ffc..791b18dc77d0 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 				 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
 }
 
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+	VFS_WARN_ON_ONCE(ns->ns_id == 0);
+	return ns->ns_id <= NS_LAST_INIT_ID;
+}
+
 #define to_ns_common(__ns)                                    \
 	_Generic((__ns),                                      \
 		struct cgroup_namespace *:       &(__ns)->ns, \
@@ -285,14 +291,19 @@ void __ns_ref_active_get_owner(struct ns_common *ns);
 
 static __always_inline void __ns_ref_active_get(struct ns_common *ns)
 {
-	WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
-	VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0);
+	/* Initial namespaces are always active. */
+	if (!is_ns_init_id(ns))
+		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
 }
 #define ns_ref_active_get(__ns) \
 	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
 
 static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
 {
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return true;
+
 	if (atomic_inc_not_zero(&ns->__ns_ref_active)) {
 		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
 		return true;
@@ -307,6 +318,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns);
 
 static __always_inline void __ns_ref_active_put(struct ns_common *ns)
 {
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return;
+
 	if (atomic_dec_and_test(&ns->__ns_ref_active)) {
 		VFS_WARN_ON_ONCE(is_initial_namespace(ns));
 		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
@@ -319,8 +334,10 @@ static __always_inline void __ns_ref_active_put(struct ns_common *ns)
 static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
 {
 	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));
-	if (!__ns_ref_active_read(ns))
+	if (!__ns_ref_active_read(ns)) {
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 		return NULL;
+	}
 	if (!__ns_ref_get(ns))
 		return NULL;
 	return ns;
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index d67ae7ad7759..70cb66232e4c 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -177,6 +177,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns)
 		ns = ns_owner(ns);
 		if (!ns)
 			return;
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 		if (!atomic_dec_and_test(&ns->__ns_ref_active))
 			return;
 	}
@@ -276,6 +277,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns)
  */
 void __ns_ref_active_resurrect(struct ns_common *ns)
 {
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return;
+
 	/* If we didn't resurrect the namespace we're done. */
 	if (atomic_fetch_add(1, &ns->__ns_ref_active))
 		return;
@@ -289,6 +294,7 @@ void __ns_ref_active_resurrect(struct ns_common *ns)
 		if (!ns)
 			return;
 
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 		if (atomic_fetch_add(1, &ns->__ns_ref_active))
 			return;
 	}
-- 
cgit v1.2.3


From f8d5a8970d2f49411824fb1fdd34bbb3eea22756 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 9 Nov 2025 22:11:26 +0100
Subject: ns: handle setns(pidfd, ...) cleanly

The setns() system call supports:

(1) namespace file descriptors (nsfd)
(2) process file descriptors (pidfd)

When using nsfds the namespaces will remain active because they are
pinned by the vfs. However, when pidfds are used things are more
complicated.

When the target task exits and passes through exit_nsproxy_namespaces()
or is reaped and thus also passes through exit_cred_namespaces() after
the setns()'ing task has called prepare_nsset() but before it has called
commit_nsset(), the active reference count of the set of namespaces it
wants to setns() to might have been dropped already:

  P1                                                              P2

  pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                  pidfd = pidfd_open(pid_p1)
                                                                  setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                  prepare_nsset()

  exit(0)
  // ns->__ns_active_ref        == 1
  // parent_ns->__ns_active_ref == 1
  -> exit_nsproxy_namespaces()
  -> exit_cred_namespaces()

  // ns_active_ref_put() will also put
  // the reference on the owner of the
  // namespace. If the only reason the
  // owning namespace was alive was
  // because it was a parent of @ns
  // its active reference count now goes
  // to zero... --------------------------------
  //                                           |
  // ns->__ns_active_ref        == 0           |
  // parent_ns->__ns_active_ref == 0           |
                                               |                  commit_nsset()
                                               -----------------> // If setns()
                                                                  // now manages to install the namespaces
                                                                  // it will call ns_active_ref_get()
                                                                  // on them thus bumping the active reference
                                                                  // count from zero again but without also
                                                                  // taking the required reference on the owner.
                                                                  // Thus we get:
                                                                  //
                                                                  // ns->__ns_active_ref        == 1
                                                                  // parent_ns->__ns_active_ref == 0

  When later someone does ns_active_ref_put() on @ns it will underflow
  parent_ns->__ns_active_ref leading to a splat from our asserts
  thinking there are still active references when in fact the counter
  just underflowed.

So resurrect the ownership chain if necessary as well. If the caller
succeeded in grabbing passive references to the set of namespaces the
setns() should simply succeed even if the target task exits or gets
reaped in the meantime and thus has dropped all active references to its
namespaces.

The race is rare and can only be triggered when using pidfds to setns()
to namespaces. Also note that active references on initial namespaces
are nops.

Since we now always handle parent references directly we can drop
ns_ref_active_get_owner() when adding a namespace to a namespace tree.
This is now all handled uniformly in the places where the new namespaces
actually become active.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-5-ae8a4ad5a3b3@kernel.org
Fixes: 3c9820d5c64a ("ns: add active reference count")
Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                 |  2 +-
 include/linux/ns_common.h | 47 ++++-------------------------------------------
 kernel/nscommon.c         | 21 ++++++++++++---------
 kernel/nstree.c           |  8 --------
 4 files changed, 17 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index ba6c8975c82e..a80f8d2a4122 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data)
 	 * ioctl on such a socket will resurrect the relevant namespace
 	 * subtree.
 	 */
-	__ns_ref_active_resurrect(ns);
+	__ns_ref_active_get(ns);
 	return 0;
 }
 
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 791b18dc77d0..3aaba2ca31d7 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -287,47 +287,8 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns
 #define ns_ref_active_read(__ns) \
 	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
 
-void __ns_ref_active_get_owner(struct ns_common *ns);
+void __ns_ref_active_put(struct ns_common *ns);
 
-static __always_inline void __ns_ref_active_get(struct ns_common *ns)
-{
-	/* Initial namespaces are always active. */
-	if (!is_ns_init_id(ns))
-		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
-}
-#define ns_ref_active_get(__ns) \
-	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
-
-static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
-{
-	/* Initial namespaces are always active. */
-	if (is_ns_init_id(ns))
-		return true;
-
-	if (atomic_inc_not_zero(&ns->__ns_ref_active)) {
-		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
-		return true;
-	}
-	return false;
-}
-
-#define ns_ref_active_get_owner(__ns) \
-	do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0)
-
-void __ns_ref_active_put_owner(struct ns_common *ns);
-
-static __always_inline void __ns_ref_active_put(struct ns_common *ns)
-{
-	/* Initial namespaces are always active. */
-	if (is_ns_init_id(ns))
-		return;
-
-	if (atomic_dec_and_test(&ns->__ns_ref_active)) {
-		VFS_WARN_ON_ONCE(is_initial_namespace(ns));
-		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
-		__ns_ref_active_put_owner(ns);
-	}
-}
 #define ns_ref_active_put(__ns) \
 	do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
 
@@ -343,9 +304,9 @@ static __always_inline struct ns_common *__must_check ns_get_unless_inactive(str
 	return ns;
 }
 
-void __ns_ref_active_resurrect(struct ns_common *ns);
+void __ns_ref_active_get(struct ns_common *ns);
 
-#define ns_ref_active_resurrect(__ns) \
-	do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0)
+#define ns_ref_active_get(__ns) \
+	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
 
 #endif
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 70cb66232e4c..bfd2d6805776 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -114,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns)
 	return to_ns_common(owner);
 }
 
-void __ns_ref_active_get_owner(struct ns_common *ns)
-{
-	ns = ns_owner(ns);
-	if (ns)
-		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
-}
-
 /*
  * The active reference count works by having each namespace that gets
  * created take a single active reference on its owning user namespace.
@@ -171,8 +164,18 @@ void __ns_ref_active_get_owner(struct ns_common *ns)
  * The iteration stops once we reach a namespace that still has active
  * references.
  */
-void __ns_ref_active_put_owner(struct ns_common *ns)
+void __ns_ref_active_put(struct ns_common *ns)
 {
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return;
+
+	if (!atomic_dec_and_test(&ns->__ns_ref_active))
+		return;
+
+	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
 	for (;;) {
 		ns = ns_owner(ns);
 		if (!ns)
@@ -275,7 +278,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns)
  * it also needs to take another reference on its owning user namespace
  * and so on.
  */
-void __ns_ref_active_resurrect(struct ns_common *ns)
+void __ns_ref_active_get(struct ns_common *ns)
 {
 	/* Initial namespaces are always active. */
 	if (is_ns_init_id(ns))
diff --git a/kernel/nstree.c b/kernel/nstree.c
index f27f772a6762..97404fb90749 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	write_sequnlock(&ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
-
-	/*
-	 * Take an active reference on the owner namespace. This ensures
-	 * that the owner remains visible while any of its child namespaces
-	 * are active. For init namespaces this is a no-op as ns_owner()
-	 * returns NULL for namespaces owned by init_user_ns.
-	 */
-	__ns_ref_active_get_owner(ns);
 }
 
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
-- 
cgit v1.2.3


From 57b39aabb99ea69b9046df2915404a931d9d6695 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 9 Nov 2025 22:11:27 +0100
Subject: ns: add asserts for active refcount underflow

Add a few more asserts to detect active reference count underflows.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-6-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h |  1 -
 kernel/nscommon.c         | 18 ++++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 3aaba2ca31d7..66ea09b48377 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -294,7 +294,6 @@ void __ns_ref_active_put(struct ns_common *ns);
 
 static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
 {
-	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));
 	if (!__ns_ref_active_read(ns)) {
 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 		return NULL;
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index bfd2d6805776..c910b979e433 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -170,8 +170,10 @@ void __ns_ref_active_put(struct ns_common *ns)
 	if (is_ns_init_id(ns))
 		return;
 
-	if (!atomic_dec_and_test(&ns->__ns_ref_active))
+	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
 		return;
+	}
 
 	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
@@ -181,8 +183,10 @@ void __ns_ref_active_put(struct ns_common *ns)
 		if (!ns)
 			return;
 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
-		if (!atomic_dec_and_test(&ns->__ns_ref_active))
+		if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+			VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
 			return;
+		}
 	}
 }
 
@@ -280,12 +284,16 @@ void __ns_ref_active_put(struct ns_common *ns)
  */
 void __ns_ref_active_get(struct ns_common *ns)
 {
+	int prev;
+
 	/* Initial namespaces are always active. */
 	if (is_ns_init_id(ns))
 		return;
 
 	/* If we didn't resurrect the namespace we're done. */
-	if (atomic_fetch_add(1, &ns->__ns_ref_active))
+	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+	VFS_WARN_ON_ONCE(prev < 0);
+	if (likely(prev))
 		return;
 
 	/*
@@ -298,7 +306,9 @@ void __ns_ref_active_get(struct ns_common *ns)
 			return;
 
 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
-		if (atomic_fetch_add(1, &ns->__ns_ref_active))
+		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+		VFS_WARN_ON_ONCE(prev < 0);
+		if (likely(prev))
 			return;
 	}
 }
-- 
cgit v1.2.3


From 69674282fc97fffd98a85ab5b4837edbc5898145 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:49 +0100
Subject: wifi: ieee80211: split mesh definitions out

The ieee80211.h file has gotten very long, start splitting it
by putting mesh definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.489713ca8b34.I3befb4bf6ace0315758a1794224ddd18c4652e32@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-mesh.h | 230 +++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h      | 211 +------------------------------------
 2 files changed, 232 insertions(+), 209 deletions(-)
 create mode 100644 include/linux/ieee80211-mesh.h

(limited to 'include')

diff --git a/include/linux/ieee80211-mesh.h b/include/linux/ieee80211-mesh.h
new file mode 100644
index 000000000000..4b829bcb38b6
--- /dev/null
+++ b/include/linux/ieee80211-mesh.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 mesh definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_MESH_H
+#define LINUX_IEEE80211_MESH_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#define IEEE80211_MAX_MESH_ID_LEN	32
+
+struct ieee80211s_hdr {
+	u8 flags;
+	u8 ttl;
+	__le32 seqnum;
+	u8 eaddr1[ETH_ALEN];
+	u8 eaddr2[ETH_ALEN];
+} __packed __aligned(2);
+
+/* Mesh flags */
+#define MESH_FLAGS_AE_A4 	0x1
+#define MESH_FLAGS_AE_A5_A6	0x2
+#define MESH_FLAGS_AE		0x3
+#define MESH_FLAGS_PS_DEEP	0x4
+
+/**
+ * enum ieee80211_preq_flags - mesh PREQ element flags
+ *
+ * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield
+ */
+enum ieee80211_preq_flags {
+	IEEE80211_PREQ_PROACTIVE_PREP_FLAG	= 1<<2,
+};
+
+/**
+ * enum ieee80211_preq_target_flags - mesh PREQ element per target flags
+ *
+ * @IEEE80211_PREQ_TO_FLAG: target only subfield
+ * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield
+ */
+enum ieee80211_preq_target_flags {
+	IEEE80211_PREQ_TO_FLAG	= 1<<0,
+	IEEE80211_PREQ_USN_FLAG	= 1<<2,
+};
+
+/**
+ * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE
+ * @mesh_ttl: Time To Live
+ * @mesh_flags: Flags
+ * @mesh_reason: Reason Code
+ * @mesh_pre_value: Precedence Value
+ *
+ * This structure represents the payload of the "Mesh Channel Switch
+ * Parameters element" as described in IEEE Std 802.11-2020 section
+ * 9.4.2.102.
+ */
+struct ieee80211_mesh_chansw_params_ie {
+	u8 mesh_ttl;
+	u8 mesh_flags;
+	__le16 mesh_reason;
+	__le16 mesh_pre_value;
+} __packed;
+
+/**
+ * struct ieee80211_meshconf_ie - Mesh Configuration element
+ * @meshconf_psel: Active Path Selection Protocol Identifier
+ * @meshconf_pmetric: Active Path Selection Metric Identifier
+ * @meshconf_congest: Congestion Control Mode Identifier
+ * @meshconf_synch: Synchronization Method Identifier
+ * @meshconf_auth: Authentication Protocol Identifier
+ * @meshconf_form: Mesh Formation Info
+ * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags)
+ *
+ * This structure represents the payload of the "Mesh Configuration
+ * element" as described in IEEE Std 802.11-2020 section 9.4.2.97.
+ */
+struct ieee80211_meshconf_ie {
+	u8 meshconf_psel;
+	u8 meshconf_pmetric;
+	u8 meshconf_congest;
+	u8 meshconf_synch;
+	u8 meshconf_auth;
+	u8 meshconf_form;
+	u8 meshconf_cap;
+} __packed;
+
+/**
+ * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags
+ *
+ * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish
+ *	additional mesh peerings with other mesh STAs
+ * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs
+ * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure
+ *	is ongoing
+ * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has
+ *	neighbors in deep sleep mode
+ *
+ * Enumerates the "Mesh Capability" as described in IEEE Std
+ * 802.11-2020 section 9.4.2.97.7.
+ */
+enum mesh_config_capab_flags {
+	IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS		= 0x01,
+	IEEE80211_MESHCONF_CAPAB_FORWARDING		= 0x08,
+	IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING		= 0x20,
+	IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL	= 0x40,
+};
+
+#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1
+
+/*
+ * mesh channel switch parameters element's flag indicator
+ *
+ */
+#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0)
+#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1)
+#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2)
+
+/**
+ * struct ieee80211_rann_ie - RANN (root announcement) element
+ * @rann_flags: Flags
+ * @rann_hopcount: Hop Count
+ * @rann_ttl: Element TTL
+ * @rann_addr: Root Mesh STA Address
+ * @rann_seq: HWMP Sequence Number
+ * @rann_interval: Interval
+ * @rann_metric: Metric
+ *
+ * This structure represents the payload of the "RANN element" as
+ * described in IEEE Std 802.11-2020 section 9.4.2.111.
+ */
+struct ieee80211_rann_ie {
+	u8 rann_flags;
+	u8 rann_hopcount;
+	u8 rann_ttl;
+	u8 rann_addr[ETH_ALEN];
+	__le32 rann_seq;
+	__le32 rann_interval;
+	__le32 rann_metric;
+} __packed;
+
+enum ieee80211_rann_flags {
+	RANN_FLAG_IS_GATE = 1 << 0,
+};
+
+/* Mesh action codes */
+enum ieee80211_mesh_actioncode {
+	WLAN_MESH_ACTION_LINK_METRIC_REPORT,
+	WLAN_MESH_ACTION_HWMP_PATH_SELECTION,
+	WLAN_MESH_ACTION_GATE_ANNOUNCEMENT,
+	WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION,
+	WLAN_MESH_ACTION_MCCA_SETUP_REQUEST,
+	WLAN_MESH_ACTION_MCCA_SETUP_REPLY,
+	WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST,
+	WLAN_MESH_ACTION_MCCA_ADVERTISEMENT,
+	WLAN_MESH_ACTION_MCCA_TEARDOWN,
+	WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST,
+	WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE,
+};
+
+/**
+ * enum ieee80211_mesh_sync_method - mesh synchronization method identifier
+ *
+ * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method
+ * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method
+ *	that will be specified in a vendor specific information element
+ */
+enum ieee80211_mesh_sync_method {
+	IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1,
+	IEEE80211_SYNC_METHOD_VENDOR = 255,
+};
+
+/**
+ * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier
+ *
+ * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol
+ * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will
+ *	be specified in a vendor specific information element
+ */
+enum ieee80211_mesh_path_protocol {
+	IEEE80211_PATH_PROTOCOL_HWMP = 1,
+	IEEE80211_PATH_PROTOCOL_VENDOR = 255,
+};
+
+/**
+ * enum ieee80211_mesh_path_metric - mesh path selection metric identifier
+ *
+ * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric
+ * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be
+ *	specified in a vendor specific information element
+ */
+enum ieee80211_mesh_path_metric {
+	IEEE80211_PATH_METRIC_AIRTIME = 1,
+	IEEE80211_PATH_METRIC_VENDOR = 255,
+};
+
+/**
+ * enum ieee80211_root_mode_identifier - root mesh STA mode identifier
+ *
+ * These attribute are used by dot11MeshHWMPRootMode to set root mesh STA mode
+ *
+ * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default)
+ * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than
+ *	this value
+ * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports
+ *	the proactive PREQ with proactive PREP subfield set to 0
+ * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA
+ *	supports the proactive PREQ with proactive PREP subfield set to 1
+ * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports
+ *	the proactive RANN
+ */
+enum ieee80211_root_mode_identifier {
+	IEEE80211_ROOTMODE_NO_ROOT = 0,
+	IEEE80211_ROOTMODE_ROOT = 1,
+	IEEE80211_PROACTIVE_PREQ_NO_PREP = 2,
+	IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3,
+	IEEE80211_PROACTIVE_RANN = 4,
+};
+
+#endif /* LINUX_IEEE80211_MESH_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index ddff9102f633..fe78b150ab45 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -252,8 +252,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 
 #define IEEE80211_MAX_SSID_LEN		32
 
-#define IEEE80211_MAX_MESH_ID_LEN	32
-
 #define IEEE80211_FIRST_TSPEC_TSID	8
 #define IEEE80211_NUM_TIDS		16
 
@@ -881,40 +879,6 @@ static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr)
 	return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ);
 }
 
-struct ieee80211s_hdr {
-	u8 flags;
-	u8 ttl;
-	__le32 seqnum;
-	u8 eaddr1[ETH_ALEN];
-	u8 eaddr2[ETH_ALEN];
-} __packed __aligned(2);
-
-/* Mesh flags */
-#define MESH_FLAGS_AE_A4 	0x1
-#define MESH_FLAGS_AE_A5_A6	0x2
-#define MESH_FLAGS_AE		0x3
-#define MESH_FLAGS_PS_DEEP	0x4
-
-/**
- * enum ieee80211_preq_flags - mesh PREQ element flags
- *
- * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield
- */
-enum ieee80211_preq_flags {
-	IEEE80211_PREQ_PROACTIVE_PREP_FLAG	= 1<<2,
-};
-
-/**
- * enum ieee80211_preq_target_flags - mesh PREQ element per target flags
- *
- * @IEEE80211_PREQ_TO_FLAG: target only subfield
- * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield
- */
-enum ieee80211_preq_target_flags {
-	IEEE80211_PREQ_TO_FLAG	= 1<<0,
-	IEEE80211_PREQ_USN_FLAG	= 1<<2,
-};
-
 /**
  * struct ieee80211_quiet_ie - Quiet element
  * @count: Quiet Count
@@ -993,24 +957,6 @@ struct ieee80211_sec_chan_offs_ie {
 	u8 sec_chan_offs;
 } __packed;
 
-/**
- * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE
- * @mesh_ttl: Time To Live
- * @mesh_flags: Flags
- * @mesh_reason: Reason Code
- * @mesh_pre_value: Precedence Value
- *
- * This structure represents the payload of the "Mesh Channel Switch
- * Parameters element" as described in IEEE Std 802.11-2020 section
- * 9.4.2.102.
- */
-struct ieee80211_mesh_chansw_params_ie {
-	u8 mesh_ttl;
-	u8 mesh_flags;
-	__le16 mesh_reason;
-	__le16 mesh_pre_value;
-} __packed;
-
 /**
  * struct ieee80211_wide_bw_chansw_ie - wide bandwidth channel switch IE
  * @new_channel_width: New Channel Width
@@ -1051,87 +997,6 @@ struct ieee80211_tim_ie {
 	};
 } __packed;
 
-/**
- * struct ieee80211_meshconf_ie - Mesh Configuration element
- * @meshconf_psel: Active Path Selection Protocol Identifier
- * @meshconf_pmetric: Active Path Selection Metric Identifier
- * @meshconf_congest: Congestion Control Mode Identifier
- * @meshconf_synch: Synchronization Method Identifier
- * @meshconf_auth: Authentication Protocol Identifier
- * @meshconf_form: Mesh Formation Info
- * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags)
- *
- * This structure represents the payload of the "Mesh Configuration
- * element" as described in IEEE Std 802.11-2020 section 9.4.2.97.
- */
-struct ieee80211_meshconf_ie {
-	u8 meshconf_psel;
-	u8 meshconf_pmetric;
-	u8 meshconf_congest;
-	u8 meshconf_synch;
-	u8 meshconf_auth;
-	u8 meshconf_form;
-	u8 meshconf_cap;
-} __packed;
-
-/**
- * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags
- *
- * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish
- *	additional mesh peerings with other mesh STAs
- * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs
- * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure
- *	is ongoing
- * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has
- *	neighbors in deep sleep mode
- *
- * Enumerates the "Mesh Capability" as described in IEEE Std
- * 802.11-2020 section 9.4.2.97.7.
- */
-enum mesh_config_capab_flags {
-	IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS		= 0x01,
-	IEEE80211_MESHCONF_CAPAB_FORWARDING		= 0x08,
-	IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING		= 0x20,
-	IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL	= 0x40,
-};
-
-#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1
-
-/*
- * mesh channel switch parameters element's flag indicator
- *
- */
-#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0)
-#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1)
-#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2)
-
-/**
- * struct ieee80211_rann_ie - RANN (root announcement) element
- * @rann_flags: Flags
- * @rann_hopcount: Hop Count
- * @rann_ttl: Element TTL
- * @rann_addr: Root Mesh STA Address
- * @rann_seq: HWMP Sequence Number
- * @rann_interval: Interval
- * @rann_metric: Metric
- *
- * This structure represents the payload of the "RANN element" as
- * described in IEEE Std 802.11-2020 section 9.4.2.111.
- */
-struct ieee80211_rann_ie {
-	u8 rann_flags;
-	u8 rann_hopcount;
-	u8 rann_ttl;
-	u8 rann_addr[ETH_ALEN];
-	__le32 rann_seq;
-	__le32 rann_interval;
-	__le32 rann_metric;
-} __packed;
-
-enum ieee80211_rann_flags {
-	RANN_FLAG_IS_GATE = 1 << 0,
-};
-
 enum ieee80211_ht_chanwidth_values {
 	IEEE80211_HT_CHANWIDTH_20MHZ = 0,
 	IEEE80211_HT_CHANWIDTH_ANY = 1,
@@ -3971,21 +3836,6 @@ enum ieee80211_self_protected_actioncode {
 	WLAN_SP_MGK_ACK = 5,
 };
 
-/* Mesh action codes */
-enum ieee80211_mesh_actioncode {
-	WLAN_MESH_ACTION_LINK_METRIC_REPORT,
-	WLAN_MESH_ACTION_HWMP_PATH_SELECTION,
-	WLAN_MESH_ACTION_GATE_ANNOUNCEMENT,
-	WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION,
-	WLAN_MESH_ACTION_MCCA_SETUP_REQUEST,
-	WLAN_MESH_ACTION_MCCA_SETUP_REPLY,
-	WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST,
-	WLAN_MESH_ACTION_MCCA_ADVERTISEMENT,
-	WLAN_MESH_ACTION_MCCA_TEARDOWN,
-	WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST,
-	WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE,
-};
-
 /* Unprotected WNM action codes */
 enum ieee80211_unprotected_wnm_actioncode {
 	WLAN_UNPROTECTED_WNM_ACTION_TIM = 0,
@@ -4198,65 +4048,6 @@ enum ieee80211_tdls_actioncode {
 /* BSS Coex IE information field bits */
 #define WLAN_BSS_COEX_INFORMATION_REQUEST	BIT(0)
 
-/**
- * enum ieee80211_mesh_sync_method - mesh synchronization method identifier
- *
- * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method
- * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method
- *	that will be specified in a vendor specific information element
- */
-enum ieee80211_mesh_sync_method {
-	IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1,
-	IEEE80211_SYNC_METHOD_VENDOR = 255,
-};
-
-/**
- * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier
- *
- * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol
- * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will
- *	be specified in a vendor specific information element
- */
-enum ieee80211_mesh_path_protocol {
-	IEEE80211_PATH_PROTOCOL_HWMP = 1,
-	IEEE80211_PATH_PROTOCOL_VENDOR = 255,
-};
-
-/**
- * enum ieee80211_mesh_path_metric - mesh path selection metric identifier
- *
- * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric
- * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be
- *	specified in a vendor specific information element
- */
-enum ieee80211_mesh_path_metric {
-	IEEE80211_PATH_METRIC_AIRTIME = 1,
-	IEEE80211_PATH_METRIC_VENDOR = 255,
-};
-
-/**
- * enum ieee80211_root_mode_identifier - root mesh STA mode identifier
- *
- * These attribute are used by dot11MeshHWMPRootMode to set root mesh STA mode
- *
- * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default)
- * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than
- *	this value
- * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports
- *	the proactive PREQ with proactive PREP subfield set to 0
- * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA
- *	supports the proactive PREQ with proactive PREP subfield set to 1
- * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports
- *	the proactive RANN
- */
-enum ieee80211_root_mode_identifier {
-	IEEE80211_ROOTMODE_NO_ROOT = 0,
-	IEEE80211_ROOTMODE_ROOT = 1,
-	IEEE80211_PROACTIVE_PREQ_NO_PREP = 2,
-	IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3,
-	IEEE80211_PROACTIVE_RANN = 4,
-};
-
 /*
  * IEEE 802.11-2007 7.3.2.9 Country information element
  *
@@ -6098,4 +5889,6 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
 #define NAN_DEV_CAPA_NDPE_SUPPORTED		0x08
 #define NAN_DEV_CAPA_S3_SUPPORTED		0x10
 
+#include "ieee80211-mesh.h"
+
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From fdc1c141f3ef4dc94e3880e973061681843f62c0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:50 +0100
Subject: wifi: ieee80211: split HT definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting HT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.7532471178d0.Id956a5433ad8658e4e5c0272dbcbb59587206142@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-ht.h | 292 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h    | 272 +---------------------------------------
 2 files changed, 293 insertions(+), 271 deletions(-)
 create mode 100644 include/linux/ieee80211-ht.h

(limited to 'include')

diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h
new file mode 100644
index 000000000000..21bbf470540f
--- /dev/null
+++ b/include/linux/ieee80211-ht.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 HT definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_HT_H
+#define LINUX_IEEE80211_HT_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* Maximal size of an A-MSDU that can be transported in a HT BA session */
+#define IEEE80211_MAX_MPDU_LEN_HT_BA		4095
+
+/* Maximal size of an A-MSDU */
+#define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
+#define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
+
+#define IEEE80211_HT_CTL_LEN		4
+
+enum ieee80211_ht_chanwidth_values {
+	IEEE80211_HT_CHANWIDTH_20MHZ = 0,
+	IEEE80211_HT_CHANWIDTH_ANY = 1,
+};
+
+/**
+ * struct ieee80211_bar - Block Ack Request frame format
+ * @frame_control: Frame Control
+ * @duration: Duration
+ * @ra: RA
+ * @ta: TA
+ * @control: BAR Control
+ * @start_seq_num: Starting Sequence Number (see Figure 9-37)
+ *
+ * This structure represents the "BlockAckReq frame format"
+ * as described in IEEE Std 802.11-2020 section 9.3.1.7.
+*/
+struct ieee80211_bar {
+	__le16 frame_control;
+	__le16 duration;
+	__u8 ra[ETH_ALEN];
+	__u8 ta[ETH_ALEN];
+	__le16 control;
+	__le16 start_seq_num;
+} __packed;
+
+/* 802.11 BAR control masks */
+#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL	0x0000
+#define IEEE80211_BAR_CTRL_MULTI_TID		0x0002
+#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA	0x0004
+#define IEEE80211_BAR_CTRL_TID_INFO_MASK	0xf000
+#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT	12
+
+#define IEEE80211_HT_MCS_MASK_LEN		10
+
+/**
+ * struct ieee80211_mcs_info - Supported MCS Set field
+ * @rx_mask: RX mask
+ * @rx_highest: highest supported RX rate. If set represents
+ *	the highest supported RX data rate in units of 1 Mbps.
+ *	If this field is 0 this value should not be used to
+ *	consider the highest RX data rate supported.
+ * @tx_params: TX parameters
+ * @reserved: Reserved bits
+ *
+ * This structure represents the "Supported MCS Set field" as
+ * described in IEEE Std 802.11-2020 section 9.4.2.55.4.
+ */
+struct ieee80211_mcs_info {
+	u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN];
+	__le16 rx_highest;
+	u8 tx_params;
+	u8 reserved[3];
+} __packed;
+
+/* 802.11n HT capability MSC set */
+#define IEEE80211_HT_MCS_RX_HIGHEST_MASK	0x3ff
+#define IEEE80211_HT_MCS_TX_DEFINED		0x01
+#define IEEE80211_HT_MCS_TX_RX_DIFF		0x02
+/* value 0 == 1 stream etc */
+#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK	0x0C
+#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT	2
+#define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
+#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
+
+#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))
+
+/*
+ * 802.11n D5.0 20.3.5 / 20.6 says:
+ * - indices 0 to 7 and 32 are single spatial stream
+ * - 8 to 31 are multiple spatial streams using equal modulation
+ *   [8..15 for two streams, 16..23 for three and 24..31 for four]
+ * - remainder are multiple spatial streams using unequal modulation
+ */
+#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33
+#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \
+	(IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8)
+
+/**
+ * struct ieee80211_ht_cap - HT capabilities element
+ * @cap_info: HT Capability Information
+ * @ampdu_params_info: A-MPDU Parameters
+ * @mcs: Supported MCS Set
+ * @extended_ht_cap_info: HT Extended Capabilities
+ * @tx_BF_cap_info: Transmit Beamforming Capabilities
+ * @antenna_selection_info: ASEL Capability
+ *
+ * This structure represents the payload of the "HT Capabilities
+ * element" as described in IEEE Std 802.11-2020 section 9.4.2.55.
+ */
+struct ieee80211_ht_cap {
+	__le16 cap_info;
+	u8 ampdu_params_info;
+
+	/* 16 bytes MCS information */
+	struct ieee80211_mcs_info mcs;
+
+	__le16 extended_ht_cap_info;
+	__le32 tx_BF_cap_info;
+	u8 antenna_selection_info;
+} __packed;
+
+/* 802.11n HT capabilities masks (for cap_info) */
+#define IEEE80211_HT_CAP_LDPC_CODING		0x0001
+#define IEEE80211_HT_CAP_SUP_WIDTH_20_40	0x0002
+#define IEEE80211_HT_CAP_SM_PS			0x000C
+#define		IEEE80211_HT_CAP_SM_PS_SHIFT	2
+#define IEEE80211_HT_CAP_GRN_FLD		0x0010
+#define IEEE80211_HT_CAP_SGI_20			0x0020
+#define IEEE80211_HT_CAP_SGI_40			0x0040
+#define IEEE80211_HT_CAP_TX_STBC		0x0080
+#define IEEE80211_HT_CAP_RX_STBC		0x0300
+#define		IEEE80211_HT_CAP_RX_STBC_SHIFT	8
+#define IEEE80211_HT_CAP_DELAY_BA		0x0400
+#define IEEE80211_HT_CAP_MAX_AMSDU		0x0800
+#define IEEE80211_HT_CAP_DSSSCCK40		0x1000
+#define IEEE80211_HT_CAP_RESERVED		0x2000
+#define IEEE80211_HT_CAP_40MHZ_INTOLERANT	0x4000
+#define IEEE80211_HT_CAP_LSIG_TXOP_PROT		0x8000
+
+/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */
+#define IEEE80211_HT_EXT_CAP_PCO		0x0001
+#define IEEE80211_HT_EXT_CAP_PCO_TIME		0x0006
+#define		IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT	1
+#define IEEE80211_HT_EXT_CAP_MCS_FB		0x0300
+#define		IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT	8
+#define IEEE80211_HT_EXT_CAP_HTC_SUP		0x0400
+#define IEEE80211_HT_EXT_CAP_RD_RESPONDER	0x0800
+
+/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */
+#define IEEE80211_HT_AMPDU_PARM_FACTOR		0x03
+#define IEEE80211_HT_AMPDU_PARM_DENSITY		0x1C
+#define		IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT	2
+
+/*
+ * Maximum length of AMPDU that the STA can receive in high-throughput (HT).
+ * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
+ */
+enum ieee80211_max_ampdu_length_exp {
+	IEEE80211_HT_MAX_AMPDU_8K = 0,
+	IEEE80211_HT_MAX_AMPDU_16K = 1,
+	IEEE80211_HT_MAX_AMPDU_32K = 2,
+	IEEE80211_HT_MAX_AMPDU_64K = 3
+};
+
+#define IEEE80211_HT_MAX_AMPDU_FACTOR 13
+
+/* Minimum MPDU start spacing */
+enum ieee80211_min_mpdu_spacing {
+	IEEE80211_HT_MPDU_DENSITY_NONE = 0,	/* No restriction */
+	IEEE80211_HT_MPDU_DENSITY_0_25 = 1,	/* 1/4 usec */
+	IEEE80211_HT_MPDU_DENSITY_0_5 = 2,	/* 1/2 usec */
+	IEEE80211_HT_MPDU_DENSITY_1 = 3,	/* 1 usec */
+	IEEE80211_HT_MPDU_DENSITY_2 = 4,	/* 2 usec */
+	IEEE80211_HT_MPDU_DENSITY_4 = 5,	/* 4 usec */
+	IEEE80211_HT_MPDU_DENSITY_8 = 6,	/* 8 usec */
+	IEEE80211_HT_MPDU_DENSITY_16 = 7	/* 16 usec */
+};
+
+/**
+ * struct ieee80211_ht_operation - HT operation IE
+ * @primary_chan: Primary Channel
+ * @ht_param: HT Operation Information parameters
+ * @operation_mode: HT Operation Information operation mode
+ * @stbc_param: HT Operation Information STBC params
+ * @basic_set: Basic HT-MCS Set
+ *
+ * This structure represents the payload of the "HT Operation
+ * element" as described in IEEE Std 802.11-2020 section 9.4.2.56.
+ */
+struct ieee80211_ht_operation {
+	u8 primary_chan;
+	u8 ht_param;
+	__le16 operation_mode;
+	__le16 stbc_param;
+	u8 basic_set[16];
+} __packed;
+
+/* for ht_param */
+#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET		0x03
+#define		IEEE80211_HT_PARAM_CHA_SEC_NONE		0x00
+#define		IEEE80211_HT_PARAM_CHA_SEC_ABOVE	0x01
+#define		IEEE80211_HT_PARAM_CHA_SEC_BELOW	0x03
+#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY		0x04
+#define IEEE80211_HT_PARAM_RIFS_MODE			0x08
+
+/* for operation_mode */
+#define IEEE80211_HT_OP_MODE_PROTECTION			0x0003
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONE		0
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER	1
+#define		IEEE80211_HT_OP_MODE_PROTECTION_20MHZ		2
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED	3
+#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT		0x0004
+#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT		0x0010
+#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT		5
+#define IEEE80211_HT_OP_MODE_CCFS2_MASK			0x1fe0
+
+/* for stbc_param */
+#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON		0x0040
+#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT		0x0080
+#define IEEE80211_HT_STBC_PARAM_STBC_BEACON		0x0100
+#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT	0x0200
+#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE		0x0400
+#define IEEE80211_HT_STBC_PARAM_PCO_PHASE		0x0800
+
+
+/* block-ack parameters */
+#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001
+#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
+#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C
+#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0
+#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000
+#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800
+
+/*
+ * A-MPDU buffer sizes
+ * According to HT size varies from 8 to 64 frames
+ * HE adds the ability to have up to 256 frames.
+ * EHT adds the ability to have up to 1K frames.
+ */
+#define IEEE80211_MIN_AMPDU_BUF		0x8
+#define IEEE80211_MAX_AMPDU_BUF_HT	0x40
+#define IEEE80211_MAX_AMPDU_BUF_HE	0x100
+#define IEEE80211_MAX_AMPDU_BUF_EHT	0x400
+
+
+/* Spatial Multiplexing Power Save Modes (for capability) */
+#define WLAN_HT_CAP_SM_PS_STATIC	0
+#define WLAN_HT_CAP_SM_PS_DYNAMIC	1
+#define WLAN_HT_CAP_SM_PS_INVALID	2
+#define WLAN_HT_CAP_SM_PS_DISABLED	3
+
+/* for SM power control field lower two bits */
+#define WLAN_HT_SMPS_CONTROL_DISABLED	0
+#define WLAN_HT_SMPS_CONTROL_STATIC	1
+#define WLAN_HT_SMPS_CONTROL_DYNAMIC	3
+
+/* HT action codes */
+enum ieee80211_ht_actioncode {
+	WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0,
+	WLAN_HT_ACTION_SMPS = 1,
+	WLAN_HT_ACTION_PSMP = 2,
+	WLAN_HT_ACTION_PCO_PHASE = 3,
+	WLAN_HT_ACTION_CSI = 4,
+	WLAN_HT_ACTION_NONCOMPRESSED_BF = 5,
+	WLAN_HT_ACTION_COMPRESSED_BF = 6,
+	WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7,
+};
+
+/* BACK action code */
+enum ieee80211_back_actioncode {
+	WLAN_ACTION_ADDBA_REQ = 0,
+	WLAN_ACTION_ADDBA_RESP = 1,
+	WLAN_ACTION_DELBA = 2,
+};
+
+/* BACK (block-ack) parties */
+enum ieee80211_back_parties {
+	WLAN_BACK_RECIPIENT = 0,
+	WLAN_BACK_INITIATOR = 1,
+};
+
+#endif /* LINUX_IEEE80211_HT_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index fe78b150ab45..0a9b4a8025cd 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -239,13 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
-/* Maximal size of an A-MSDU that can be transported in a HT BA session */
-#define IEEE80211_MAX_MPDU_LEN_HT_BA		4095
-
-/* Maximal size of an A-MSDU */
-#define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
-#define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
-
 #define IEEE80211_MAX_MPDU_LEN_VHT_3895		3895
 #define IEEE80211_MAX_MPDU_LEN_VHT_7991		7991
 #define IEEE80211_MAX_MPDU_LEN_VHT_11454	11454
@@ -302,8 +295,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK	0x03
 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT	5
 
-#define IEEE80211_HT_CTL_LEN		4
-
 /* trigger type within common_info of trigger frame */
 #define IEEE80211_TRIGGER_TYPE_MASK		0xf
 #define IEEE80211_TRIGGER_TYPE_BASIC		0x0
@@ -997,11 +988,6 @@ struct ieee80211_tim_ie {
 	};
 } __packed;
 
-enum ieee80211_ht_chanwidth_values {
-	IEEE80211_HT_CHANWIDTH_20MHZ = 0,
-	IEEE80211_HT_CHANWIDTH_ANY = 1,
-};
-
 /**
  * enum ieee80211_vht_opmode_bits - VHT operating mode field bits
  * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask
@@ -1677,146 +1663,6 @@ struct ieee80211_p2p_noa_attr {
 #define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
 #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
 
-/**
- * struct ieee80211_bar - Block Ack Request frame format
- * @frame_control: Frame Control
- * @duration: Duration
- * @ra: RA
- * @ta: TA
- * @control: BAR Control
- * @start_seq_num: Starting Sequence Number (see Figure 9-37)
- *
- * This structure represents the "BlockAckReq frame format"
- * as described in IEEE Std 802.11-2020 section 9.3.1.7.
-*/
-struct ieee80211_bar {
-	__le16 frame_control;
-	__le16 duration;
-	__u8 ra[ETH_ALEN];
-	__u8 ta[ETH_ALEN];
-	__le16 control;
-	__le16 start_seq_num;
-} __packed;
-
-/* 802.11 BAR control masks */
-#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL	0x0000
-#define IEEE80211_BAR_CTRL_MULTI_TID		0x0002
-#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA	0x0004
-#define IEEE80211_BAR_CTRL_TID_INFO_MASK	0xf000
-#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT	12
-
-#define IEEE80211_HT_MCS_MASK_LEN		10
-
-/**
- * struct ieee80211_mcs_info - Supported MCS Set field
- * @rx_mask: RX mask
- * @rx_highest: highest supported RX rate. If set represents
- *	the highest supported RX data rate in units of 1 Mbps.
- *	If this field is 0 this value should not be used to
- *	consider the highest RX data rate supported.
- * @tx_params: TX parameters
- * @reserved: Reserved bits
- *
- * This structure represents the "Supported MCS Set field" as
- * described in IEEE Std 802.11-2020 section 9.4.2.55.4.
- */
-struct ieee80211_mcs_info {
-	u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN];
-	__le16 rx_highest;
-	u8 tx_params;
-	u8 reserved[3];
-} __packed;
-
-/* 802.11n HT capability MSC set */
-#define IEEE80211_HT_MCS_RX_HIGHEST_MASK	0x3ff
-#define IEEE80211_HT_MCS_TX_DEFINED		0x01
-#define IEEE80211_HT_MCS_TX_RX_DIFF		0x02
-/* value 0 == 1 stream etc */
-#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK	0x0C
-#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT	2
-#define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
-#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
-
-#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))
-
-/*
- * 802.11n D5.0 20.3.5 / 20.6 says:
- * - indices 0 to 7 and 32 are single spatial stream
- * - 8 to 31 are multiple spatial streams using equal modulation
- *   [8..15 for two streams, 16..23 for three and 24..31 for four]
- * - remainder are multiple spatial streams using unequal modulation
- */
-#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33
-#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \
-	(IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8)
-
-/**
- * struct ieee80211_ht_cap - HT capabilities element
- * @cap_info: HT Capability Information
- * @ampdu_params_info: A-MPDU Parameters
- * @mcs: Supported MCS Set
- * @extended_ht_cap_info: HT Extended Capabilities
- * @tx_BF_cap_info: Transmit Beamforming Capabilities
- * @antenna_selection_info: ASEL Capability
- *
- * This structure represents the payload of the "HT Capabilities
- * element" as described in IEEE Std 802.11-2020 section 9.4.2.55.
- */
-struct ieee80211_ht_cap {
-	__le16 cap_info;
-	u8 ampdu_params_info;
-
-	/* 16 bytes MCS information */
-	struct ieee80211_mcs_info mcs;
-
-	__le16 extended_ht_cap_info;
-	__le32 tx_BF_cap_info;
-	u8 antenna_selection_info;
-} __packed;
-
-/* 802.11n HT capabilities masks (for cap_info) */
-#define IEEE80211_HT_CAP_LDPC_CODING		0x0001
-#define IEEE80211_HT_CAP_SUP_WIDTH_20_40	0x0002
-#define IEEE80211_HT_CAP_SM_PS			0x000C
-#define		IEEE80211_HT_CAP_SM_PS_SHIFT	2
-#define IEEE80211_HT_CAP_GRN_FLD		0x0010
-#define IEEE80211_HT_CAP_SGI_20			0x0020
-#define IEEE80211_HT_CAP_SGI_40			0x0040
-#define IEEE80211_HT_CAP_TX_STBC		0x0080
-#define IEEE80211_HT_CAP_RX_STBC		0x0300
-#define		IEEE80211_HT_CAP_RX_STBC_SHIFT	8
-#define IEEE80211_HT_CAP_DELAY_BA		0x0400
-#define IEEE80211_HT_CAP_MAX_AMSDU		0x0800
-#define IEEE80211_HT_CAP_DSSSCCK40		0x1000
-#define IEEE80211_HT_CAP_RESERVED		0x2000
-#define IEEE80211_HT_CAP_40MHZ_INTOLERANT	0x4000
-#define IEEE80211_HT_CAP_LSIG_TXOP_PROT		0x8000
-
-/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */
-#define IEEE80211_HT_EXT_CAP_PCO		0x0001
-#define IEEE80211_HT_EXT_CAP_PCO_TIME		0x0006
-#define		IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT	1
-#define IEEE80211_HT_EXT_CAP_MCS_FB		0x0300
-#define		IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT	8
-#define IEEE80211_HT_EXT_CAP_HTC_SUP		0x0400
-#define IEEE80211_HT_EXT_CAP_RD_RESPONDER	0x0800
-
-/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */
-#define IEEE80211_HT_AMPDU_PARM_FACTOR		0x03
-#define IEEE80211_HT_AMPDU_PARM_DENSITY		0x1C
-#define		IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT	2
-
-/*
- * Maximum length of AMPDU that the STA can receive in high-throughput (HT).
- * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
- */
-enum ieee80211_max_ampdu_length_exp {
-	IEEE80211_HT_MAX_AMPDU_8K = 0,
-	IEEE80211_HT_MAX_AMPDU_16K = 1,
-	IEEE80211_HT_MAX_AMPDU_32K = 2,
-	IEEE80211_HT_MAX_AMPDU_64K = 3
-};
-
 /*
  * Maximum length of AMPDU that the STA can receive in VHT.
  * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
@@ -1832,98 +1678,6 @@ enum ieee80211_vht_max_ampdu_length_exp {
 	IEEE80211_VHT_MAX_AMPDU_1024K = 7
 };
 
-#define IEEE80211_HT_MAX_AMPDU_FACTOR 13
-
-/* Minimum MPDU start spacing */
-enum ieee80211_min_mpdu_spacing {
-	IEEE80211_HT_MPDU_DENSITY_NONE = 0,	/* No restriction */
-	IEEE80211_HT_MPDU_DENSITY_0_25 = 1,	/* 1/4 usec */
-	IEEE80211_HT_MPDU_DENSITY_0_5 = 2,	/* 1/2 usec */
-	IEEE80211_HT_MPDU_DENSITY_1 = 3,	/* 1 usec */
-	IEEE80211_HT_MPDU_DENSITY_2 = 4,	/* 2 usec */
-	IEEE80211_HT_MPDU_DENSITY_4 = 5,	/* 4 usec */
-	IEEE80211_HT_MPDU_DENSITY_8 = 6,	/* 8 usec */
-	IEEE80211_HT_MPDU_DENSITY_16 = 7	/* 16 usec */
-};
-
-/**
- * struct ieee80211_ht_operation - HT operation IE
- * @primary_chan: Primary Channel
- * @ht_param: HT Operation Information parameters
- * @operation_mode: HT Operation Information operation mode
- * @stbc_param: HT Operation Information STBC params
- * @basic_set: Basic HT-MCS Set
- *
- * This structure represents the payload of the "HT Operation
- * element" as described in IEEE Std 802.11-2020 section 9.4.2.56.
- */
-struct ieee80211_ht_operation {
-	u8 primary_chan;
-	u8 ht_param;
-	__le16 operation_mode;
-	__le16 stbc_param;
-	u8 basic_set[16];
-} __packed;
-
-/* for ht_param */
-#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET		0x03
-#define		IEEE80211_HT_PARAM_CHA_SEC_NONE		0x00
-#define		IEEE80211_HT_PARAM_CHA_SEC_ABOVE	0x01
-#define		IEEE80211_HT_PARAM_CHA_SEC_BELOW	0x03
-#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY		0x04
-#define IEEE80211_HT_PARAM_RIFS_MODE			0x08
-
-/* for operation_mode */
-#define IEEE80211_HT_OP_MODE_PROTECTION			0x0003
-#define		IEEE80211_HT_OP_MODE_PROTECTION_NONE		0
-#define		IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER	1
-#define		IEEE80211_HT_OP_MODE_PROTECTION_20MHZ		2
-#define		IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED	3
-#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT		0x0004
-#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT		0x0010
-#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT		5
-#define IEEE80211_HT_OP_MODE_CCFS2_MASK			0x1fe0
-
-/* for stbc_param */
-#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON		0x0040
-#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT		0x0080
-#define IEEE80211_HT_STBC_PARAM_STBC_BEACON		0x0100
-#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT	0x0200
-#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE		0x0400
-#define IEEE80211_HT_STBC_PARAM_PCO_PHASE		0x0800
-
-
-/* block-ack parameters */
-#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001
-#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
-#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C
-#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0
-#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000
-#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800
-
-/*
- * A-MPDU buffer sizes
- * According to HT size varies from 8 to 64 frames
- * HE adds the ability to have up to 256 frames.
- * EHT adds the ability to have up to 1K frames.
- */
-#define IEEE80211_MIN_AMPDU_BUF		0x8
-#define IEEE80211_MAX_AMPDU_BUF_HT	0x40
-#define IEEE80211_MAX_AMPDU_BUF_HE	0x100
-#define IEEE80211_MAX_AMPDU_BUF_EHT	0x400
-
-
-/* Spatial Multiplexing Power Save Modes (for capability) */
-#define WLAN_HT_CAP_SM_PS_STATIC	0
-#define WLAN_HT_CAP_SM_PS_DYNAMIC	1
-#define WLAN_HT_CAP_SM_PS_INVALID	2
-#define WLAN_HT_CAP_SM_PS_DISABLED	3
-
-/* for SM power control field lower two bits */
-#define WLAN_HT_SMPS_CONTROL_DISABLED	0
-#define WLAN_HT_SMPS_CONTROL_STATIC	1
-#define WLAN_HT_SMPS_CONTROL_DYNAMIC	3
-
 /**
  * struct ieee80211_vht_mcs_info - VHT MCS information
  * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams
@@ -3807,18 +3561,6 @@ enum ieee80211_spectrum_mgmt_actioncode {
 	WLAN_ACTION_SPCT_CHL_SWITCH = 4,
 };
 
-/* HT action codes */
-enum ieee80211_ht_actioncode {
-	WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0,
-	WLAN_HT_ACTION_SMPS = 1,
-	WLAN_HT_ACTION_PSMP = 2,
-	WLAN_HT_ACTION_PCO_PHASE = 3,
-	WLAN_HT_ACTION_CSI = 4,
-	WLAN_HT_ACTION_NONCOMPRESSED_BF = 5,
-	WLAN_HT_ACTION_COMPRESSED_BF = 6,
-	WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7,
-};
-
 /* VHT action codes */
 enum ieee80211_vht_actioncode {
 	WLAN_VHT_ACTION_COMPRESSED_BF = 0,
@@ -4155,19 +3897,6 @@ struct ieee80211_bss_max_idle_period_ie {
 	u8 idle_options;
 } __packed;
 
-/* BACK action code */
-enum ieee80211_back_actioncode {
-	WLAN_ACTION_ADDBA_REQ = 0,
-	WLAN_ACTION_ADDBA_RESP = 1,
-	WLAN_ACTION_DELBA = 2,
-};
-
-/* BACK (block-ack) parties */
-enum ieee80211_back_parties {
-	WLAN_BACK_RECIPIENT = 0,
-	WLAN_BACK_INITIATOR = 1,
-};
-
 /* SA Query action */
 enum ieee80211_sa_query_action {
 	WLAN_ACTION_SA_QUERY_REQUEST = 0,
@@ -5889,6 +5618,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
 #define NAN_DEV_CAPA_NDPE_SUPPORTED		0x08
 #define NAN_DEV_CAPA_S3_SUPPORTED		0x10
 
+#include "ieee80211-ht.h"
 #include "ieee80211-mesh.h"
 
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From 7cb14da1d7bbfa4a6417ed7f1bc07dd77bcd9c83 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:51 +0100
Subject: wifi: ieee80211: split VHT definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting VHT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.c31cb771a250.I787a13064db7d80440101de3445be17881daf1b6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-vht.h | 236 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h     | 216 +-------------------------------------
 2 files changed, 237 insertions(+), 215 deletions(-)
 create mode 100644 include/linux/ieee80211-vht.h

(limited to 'include')

diff --git a/include/linux/ieee80211-vht.h b/include/linux/ieee80211-vht.h
new file mode 100644
index 000000000000..898dfb561fef
--- /dev/null
+++ b/include/linux/ieee80211-vht.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 VHT definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_VHT_H
+#define LINUX_IEEE80211_VHT_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#define IEEE80211_MAX_MPDU_LEN_VHT_3895		3895
+#define IEEE80211_MAX_MPDU_LEN_VHT_7991		7991
+#define IEEE80211_MAX_MPDU_LEN_VHT_11454	11454
+
+/**
+ * enum ieee80211_vht_opmode_bits - VHT operating mode field bits
+ * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask
+ * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width
+ * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width
+ * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width
+ * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width
+ * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag
+ * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask
+ *	(the NSS value is the value of this field + 1)
+ * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift
+ * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU
+ *	using a beamforming steering matrix
+ */
+enum ieee80211_vht_opmode_bits {
+	IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK	= 0x03,
+	IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ	= 0,
+	IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ	= 1,
+	IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ	= 2,
+	IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ	= 3,
+	IEEE80211_OPMODE_NOTIF_BW_160_80P80	= 0x04,
+	IEEE80211_OPMODE_NOTIF_RX_NSS_MASK	= 0x70,
+	IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT	= 4,
+	IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF	= 0x80,
+};
+
+/*
+ * Maximum length of AMPDU that the STA can receive in VHT.
+ * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
+ */
+enum ieee80211_vht_max_ampdu_length_exp {
+	IEEE80211_VHT_MAX_AMPDU_8K = 0,
+	IEEE80211_VHT_MAX_AMPDU_16K = 1,
+	IEEE80211_VHT_MAX_AMPDU_32K = 2,
+	IEEE80211_VHT_MAX_AMPDU_64K = 3,
+	IEEE80211_VHT_MAX_AMPDU_128K = 4,
+	IEEE80211_VHT_MAX_AMPDU_256K = 5,
+	IEEE80211_VHT_MAX_AMPDU_512K = 6,
+	IEEE80211_VHT_MAX_AMPDU_1024K = 7
+};
+
+/**
+ * struct ieee80211_vht_mcs_info - VHT MCS information
+ * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams
+ * @rx_highest: Indicates highest long GI VHT PPDU data rate
+ *	STA can receive. Rate expressed in units of 1 Mbps.
+ *	If this field is 0 this value should not be used to
+ *	consider the highest RX data rate supported.
+ *	The top 3 bits of this field indicate the Maximum NSTS,total
+ *	(a beamformee capability.)
+ * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams
+ * @tx_highest: Indicates highest long GI VHT PPDU data rate
+ *	STA can transmit. Rate expressed in units of 1 Mbps.
+ *	If this field is 0 this value should not be used to
+ *	consider the highest TX data rate supported.
+ *	The top 2 bits of this field are reserved, the
+ *	3rd bit from the top indicates VHT Extended NSS BW
+ *	Capability.
+ */
+struct ieee80211_vht_mcs_info {
+	__le16 rx_mcs_map;
+	__le16 rx_highest;
+	__le16 tx_mcs_map;
+	__le16 tx_highest;
+} __packed;
+
+/* for rx_highest */
+#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT	13
+#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK	(7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT)
+
+/* for tx_highest */
+#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE	(1 << 13)
+
+/**
+ * enum ieee80211_vht_mcs_support - VHT MCS support definitions
+ * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
+ *	number of streams
+ * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported
+ * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported
+ * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported
+ *
+ * These definitions are used in each 2-bit subfield of the @rx_mcs_map
+ * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are
+ * both split into 8 subfields by number of streams. These values indicate
+ * which MCSes are supported for the number of streams the value appears
+ * for.
+ */
+enum ieee80211_vht_mcs_support {
+	IEEE80211_VHT_MCS_SUPPORT_0_7	= 0,
+	IEEE80211_VHT_MCS_SUPPORT_0_8	= 1,
+	IEEE80211_VHT_MCS_SUPPORT_0_9	= 2,
+	IEEE80211_VHT_MCS_NOT_SUPPORTED	= 3,
+};
+
+/**
+ * struct ieee80211_vht_cap - VHT capabilities
+ *
+ * This structure is the "VHT capabilities element" as
+ * described in 802.11ac D3.0 8.4.2.160
+ * @vht_cap_info: VHT capability info
+ * @supp_mcs: VHT MCS supported rates
+ */
+struct ieee80211_vht_cap {
+	__le32 vht_cap_info;
+	struct ieee80211_vht_mcs_info supp_mcs;
+} __packed;
+
+/**
+ * enum ieee80211_vht_chanwidth - VHT channel width
+ * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to
+ *	determine the channel width (20 or 40 MHz)
+ * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth
+ * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth
+ * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth
+ */
+enum ieee80211_vht_chanwidth {
+	IEEE80211_VHT_CHANWIDTH_USE_HT		= 0,
+	IEEE80211_VHT_CHANWIDTH_80MHZ		= 1,
+	IEEE80211_VHT_CHANWIDTH_160MHZ		= 2,
+	IEEE80211_VHT_CHANWIDTH_80P80MHZ	= 3,
+};
+
+/**
+ * struct ieee80211_vht_operation - VHT operation IE
+ *
+ * This structure is the "VHT operation element" as
+ * described in 802.11ac D3.0 8.4.2.161
+ * @chan_width: Operating channel width
+ * @center_freq_seg0_idx: center freq segment 0 index
+ * @center_freq_seg1_idx: center freq segment 1 index
+ * @basic_mcs_set: VHT Basic MCS rate set
+ */
+struct ieee80211_vht_operation {
+	u8 chan_width;
+	u8 center_freq_seg0_idx;
+	u8 center_freq_seg1_idx;
+	__le16 basic_mcs_set;
+} __packed;
+
+/* 802.11ac VHT Capabilities */
+#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
+#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991			0x00000001
+#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454			0x00000002
+#define IEEE80211_VHT_CAP_MAX_MPDU_MASK				0x00000003
+#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ		0x00000004
+#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ	0x00000008
+#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK			0x0000000C
+#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT			2
+#define IEEE80211_VHT_CAP_RXLDPC				0x00000010
+#define IEEE80211_VHT_CAP_SHORT_GI_80				0x00000020
+#define IEEE80211_VHT_CAP_SHORT_GI_160				0x00000040
+#define IEEE80211_VHT_CAP_TXSTBC				0x00000080
+#define IEEE80211_VHT_CAP_RXSTBC_1				0x00000100
+#define IEEE80211_VHT_CAP_RXSTBC_2				0x00000200
+#define IEEE80211_VHT_CAP_RXSTBC_3				0x00000300
+#define IEEE80211_VHT_CAP_RXSTBC_4				0x00000400
+#define IEEE80211_VHT_CAP_RXSTBC_MASK				0x00000700
+#define IEEE80211_VHT_CAP_RXSTBC_SHIFT				8
+#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE			0x00000800
+#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE			0x00001000
+#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT                  13
+#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK			\
+		(7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT)
+#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT		16
+#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK		\
+		(7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT)
+#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE			0x00080000
+#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE			0x00100000
+#define IEEE80211_VHT_CAP_VHT_TXOP_PS				0x00200000
+#define IEEE80211_VHT_CAP_HTC_VHT				0x00400000
+#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT	23
+#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK	\
+		(7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT)
+#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB	0x08000000
+#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB	0x0c000000
+#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN			0x10000000
+#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN			0x20000000
+#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT			30
+#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK			0xc0000000
+
+/**
+ * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS
+ * @cap: VHT capabilities of the peer
+ * @bw: bandwidth to use
+ * @mcs: MCS index to use
+ * @ext_nss_bw_capable: indicates whether or not the local transmitter
+ *	(rate scaling algorithm) can deal with the new logic
+ *	(dot11VHTExtendedNSSBWCapable)
+ * @max_vht_nss: current maximum NSS as advertised by the STA in
+ *	operating mode notification, can be 0 in which case the
+ *	capability data will be used to derive this (from MCS support)
+ * Return: The maximum NSS that can be used for the given bandwidth/MCS
+ *	combination
+ *
+ * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can
+ * vary for a given BW/MCS. This function parses the data.
+ *
+ * Note: This function is exported by cfg80211.
+ */
+int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
+			      enum ieee80211_vht_chanwidth bw,
+			      int mcs, bool ext_nss_bw_capable,
+			      unsigned int max_vht_nss);
+
+/* VHT action codes */
+enum ieee80211_vht_actioncode {
+	WLAN_VHT_ACTION_COMPRESSED_BF = 0,
+	WLAN_VHT_ACTION_GROUPID_MGMT = 1,
+	WLAN_VHT_ACTION_OPMODE_NOTIF = 2,
+};
+
+#endif /* LINUX_IEEE80211_VHT_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0a9b4a8025cd..0b247b28c661 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -239,10 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
-#define IEEE80211_MAX_MPDU_LEN_VHT_3895		3895
-#define IEEE80211_MAX_MPDU_LEN_VHT_7991		7991
-#define IEEE80211_MAX_MPDU_LEN_VHT_11454	11454
-
 #define IEEE80211_MAX_SSID_LEN		32
 
 #define IEEE80211_FIRST_TSPEC_TSID	8
@@ -988,32 +984,6 @@ struct ieee80211_tim_ie {
 	};
 } __packed;
 
-/**
- * enum ieee80211_vht_opmode_bits - VHT operating mode field bits
- * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask
- * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width
- * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width
- * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width
- * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width
- * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag
- * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask
- *	(the NSS value is the value of this field + 1)
- * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift
- * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU
- *	using a beamforming steering matrix
- */
-enum ieee80211_vht_opmode_bits {
-	IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK	= 0x03,
-	IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ	= 0,
-	IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ	= 1,
-	IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ	= 2,
-	IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ	= 3,
-	IEEE80211_OPMODE_NOTIF_BW_160_80P80	= 0x04,
-	IEEE80211_OPMODE_NOTIF_RX_NSS_MASK	= 0x70,
-	IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT	= 4,
-	IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF	= 0x80,
-};
-
 /**
  * enum ieee80211_s1g_chanwidth - S1G channel widths
  * These are defined in IEEE802.11-2016ah Table 10-20
@@ -1663,119 +1633,6 @@ struct ieee80211_p2p_noa_attr {
 #define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
 #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
 
-/*
- * Maximum length of AMPDU that the STA can receive in VHT.
- * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
- */
-enum ieee80211_vht_max_ampdu_length_exp {
-	IEEE80211_VHT_MAX_AMPDU_8K = 0,
-	IEEE80211_VHT_MAX_AMPDU_16K = 1,
-	IEEE80211_VHT_MAX_AMPDU_32K = 2,
-	IEEE80211_VHT_MAX_AMPDU_64K = 3,
-	IEEE80211_VHT_MAX_AMPDU_128K = 4,
-	IEEE80211_VHT_MAX_AMPDU_256K = 5,
-	IEEE80211_VHT_MAX_AMPDU_512K = 6,
-	IEEE80211_VHT_MAX_AMPDU_1024K = 7
-};
-
-/**
- * struct ieee80211_vht_mcs_info - VHT MCS information
- * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams
- * @rx_highest: Indicates highest long GI VHT PPDU data rate
- *	STA can receive. Rate expressed in units of 1 Mbps.
- *	If this field is 0 this value should not be used to
- *	consider the highest RX data rate supported.
- *	The top 3 bits of this field indicate the Maximum NSTS,total
- *	(a beamformee capability.)
- * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams
- * @tx_highest: Indicates highest long GI VHT PPDU data rate
- *	STA can transmit. Rate expressed in units of 1 Mbps.
- *	If this field is 0 this value should not be used to
- *	consider the highest TX data rate supported.
- *	The top 2 bits of this field are reserved, the
- *	3rd bit from the top indiciates VHT Extended NSS BW
- *	Capability.
- */
-struct ieee80211_vht_mcs_info {
-	__le16 rx_mcs_map;
-	__le16 rx_highest;
-	__le16 tx_mcs_map;
-	__le16 tx_highest;
-} __packed;
-
-/* for rx_highest */
-#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT	13
-#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK	(7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT)
-
-/* for tx_highest */
-#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE	(1 << 13)
-
-/**
- * enum ieee80211_vht_mcs_support - VHT MCS support definitions
- * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
- *	number of streams
- * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported
- * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported
- * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported
- *
- * These definitions are used in each 2-bit subfield of the @rx_mcs_map
- * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are
- * both split into 8 subfields by number of streams. These values indicate
- * which MCSes are supported for the number of streams the value appears
- * for.
- */
-enum ieee80211_vht_mcs_support {
-	IEEE80211_VHT_MCS_SUPPORT_0_7	= 0,
-	IEEE80211_VHT_MCS_SUPPORT_0_8	= 1,
-	IEEE80211_VHT_MCS_SUPPORT_0_9	= 2,
-	IEEE80211_VHT_MCS_NOT_SUPPORTED	= 3,
-};
-
-/**
- * struct ieee80211_vht_cap - VHT capabilities
- *
- * This structure is the "VHT capabilities element" as
- * described in 802.11ac D3.0 8.4.2.160
- * @vht_cap_info: VHT capability info
- * @supp_mcs: VHT MCS supported rates
- */
-struct ieee80211_vht_cap {
-	__le32 vht_cap_info;
-	struct ieee80211_vht_mcs_info supp_mcs;
-} __packed;
-
-/**
- * enum ieee80211_vht_chanwidth - VHT channel width
- * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to
- *	determine the channel width (20 or 40 MHz)
- * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth
- * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth
- * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth
- */
-enum ieee80211_vht_chanwidth {
-	IEEE80211_VHT_CHANWIDTH_USE_HT		= 0,
-	IEEE80211_VHT_CHANWIDTH_80MHZ		= 1,
-	IEEE80211_VHT_CHANWIDTH_160MHZ		= 2,
-	IEEE80211_VHT_CHANWIDTH_80P80MHZ	= 3,
-};
-
-/**
- * struct ieee80211_vht_operation - VHT operation IE
- *
- * This structure is the "VHT operation element" as
- * described in 802.11ac D3.0 8.4.2.161
- * @chan_width: Operating channel width
- * @center_freq_seg0_idx: center freq segment 0 index
- * @center_freq_seg1_idx: center freq segment 1 index
- * @basic_mcs_set: VHT Basic MCS rate set
- */
-struct ieee80211_vht_operation {
-	u8 chan_width;
-	u8 center_freq_seg0_idx;
-	u8 center_freq_seg1_idx;
-	__le16 basic_mcs_set;
-} __packed;
-
 /**
  * struct ieee80211_he_cap_elem - HE capabilities element
  * @mac_cap_info: HE MAC Capabilities Information
@@ -2045,71 +1902,6 @@ struct ieee80211_eht_operation_info {
 	u8 optional[];
 } __packed;
 
-/* 802.11ac VHT Capabilities */
-#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
-#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991			0x00000001
-#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454			0x00000002
-#define IEEE80211_VHT_CAP_MAX_MPDU_MASK				0x00000003
-#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ		0x00000004
-#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ	0x00000008
-#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK			0x0000000C
-#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT			2
-#define IEEE80211_VHT_CAP_RXLDPC				0x00000010
-#define IEEE80211_VHT_CAP_SHORT_GI_80				0x00000020
-#define IEEE80211_VHT_CAP_SHORT_GI_160				0x00000040
-#define IEEE80211_VHT_CAP_TXSTBC				0x00000080
-#define IEEE80211_VHT_CAP_RXSTBC_1				0x00000100
-#define IEEE80211_VHT_CAP_RXSTBC_2				0x00000200
-#define IEEE80211_VHT_CAP_RXSTBC_3				0x00000300
-#define IEEE80211_VHT_CAP_RXSTBC_4				0x00000400
-#define IEEE80211_VHT_CAP_RXSTBC_MASK				0x00000700
-#define IEEE80211_VHT_CAP_RXSTBC_SHIFT				8
-#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE			0x00000800
-#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE			0x00001000
-#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT                  13
-#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK			\
-		(7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT)
-#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT		16
-#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK		\
-		(7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT)
-#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE			0x00080000
-#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE			0x00100000
-#define IEEE80211_VHT_CAP_VHT_TXOP_PS				0x00200000
-#define IEEE80211_VHT_CAP_HTC_VHT				0x00400000
-#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT	23
-#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK	\
-		(7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT)
-#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB	0x08000000
-#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB	0x0c000000
-#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN			0x10000000
-#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN			0x20000000
-#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT			30
-#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK			0xc0000000
-
-/**
- * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS
- * @cap: VHT capabilities of the peer
- * @bw: bandwidth to use
- * @mcs: MCS index to use
- * @ext_nss_bw_capable: indicates whether or not the local transmitter
- *	(rate scaling algorithm) can deal with the new logic
- *	(dot11VHTExtendedNSSBWCapable)
- * @max_vht_nss: current maximum NSS as advertised by the STA in
- *	operating mode notification, can be 0 in which case the
- *	capability data will be used to derive this (from MCS support)
- * Return: The maximum NSS that can be used for the given bandwidth/MCS
- *	combination
- *
- * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can
- * vary for a given BW/MCS. This function parses the data.
- *
- * Note: This function is exported by cfg80211.
- */
-int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
-			      enum ieee80211_vht_chanwidth bw,
-			      int mcs, bool ext_nss_bw_capable,
-			      unsigned int max_vht_nss);
-
 /* 802.11ax HE MAC capabilities */
 #define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
 #define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
@@ -3561,13 +3353,6 @@ enum ieee80211_spectrum_mgmt_actioncode {
 	WLAN_ACTION_SPCT_CHL_SWITCH = 4,
 };
 
-/* VHT action codes */
-enum ieee80211_vht_actioncode {
-	WLAN_VHT_ACTION_COMPRESSED_BF = 0,
-	WLAN_VHT_ACTION_GROUPID_MGMT = 1,
-	WLAN_VHT_ACTION_OPMODE_NOTIF = 2,
-};
-
 /* Self Protected Action codes */
 enum ieee80211_self_protected_actioncode {
 	WLAN_SP_RESERVED = 0,
@@ -5619,6 +5404,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
 #define NAN_DEV_CAPA_S3_SUPPORTED		0x10
 
 #include "ieee80211-ht.h"
+#include "ieee80211-vht.h"
 #include "ieee80211-mesh.h"
 
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From 02a2cf302557eb59794bba0b05d6755f44928d78 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:52 +0100
Subject: wifi: ieee80211: split HE definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting HE definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.6998c0802104.I3dd7cfea6abbd118b999ecdedd48437d39cb0533@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-he.h | 824 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h    | 806 +-----------------------------------------
 2 files changed, 827 insertions(+), 803 deletions(-)
 create mode 100644 include/linux/ieee80211-he.h

(limited to 'include')

diff --git a/include/linux/ieee80211-he.h b/include/linux/ieee80211-he.h
new file mode 100644
index 000000000000..904d50db5bb8
--- /dev/null
+++ b/include/linux/ieee80211-he.h
@@ -0,0 +1,824 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 HE definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_HE_H
+#define LINUX_IEEE80211_HE_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#define IEEE80211_TWT_CONTROL_NDP			BIT(0)
+#define IEEE80211_TWT_CONTROL_RESP_MODE			BIT(1)
+#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST	BIT(3)
+#define IEEE80211_TWT_CONTROL_RX_DISABLED		BIT(4)
+#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT		BIT(5)
+
+#define IEEE80211_TWT_REQTYPE_REQUEST			BIT(0)
+#define IEEE80211_TWT_REQTYPE_SETUP_CMD			GENMASK(3, 1)
+#define IEEE80211_TWT_REQTYPE_TRIGGER			BIT(4)
+#define IEEE80211_TWT_REQTYPE_IMPLICIT			BIT(5)
+#define IEEE80211_TWT_REQTYPE_FLOWTYPE			BIT(6)
+#define IEEE80211_TWT_REQTYPE_FLOWID			GENMASK(9, 7)
+#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP		GENMASK(14, 10)
+#define IEEE80211_TWT_REQTYPE_PROTECTION		BIT(15)
+
+enum ieee80211_twt_setup_cmd {
+	TWT_SETUP_CMD_REQUEST,
+	TWT_SETUP_CMD_SUGGEST,
+	TWT_SETUP_CMD_DEMAND,
+	TWT_SETUP_CMD_GROUPING,
+	TWT_SETUP_CMD_ACCEPT,
+	TWT_SETUP_CMD_ALTERNATE,
+	TWT_SETUP_CMD_DICTATE,
+	TWT_SETUP_CMD_REJECT,
+};
+
+struct ieee80211_twt_params {
+	__le16 req_type;
+	__le64 twt;
+	u8 min_twt_dur;
+	__le16 mantissa;
+	u8 channel;
+} __packed;
+
+struct ieee80211_twt_setup {
+	u8 dialog_token;
+	u8 element_id;
+	u8 length;
+	u8 control;
+	u8 params[];
+} __packed;
+
+/**
+ * struct ieee80211_he_cap_elem - HE capabilities element
+ * @mac_cap_info: HE MAC Capabilities Information
+ * @phy_cap_info: HE PHY Capabilities Information
+ *
+ * This structure represents the fixed fields of the payload of the
+ * "HE capabilities element" as described in IEEE Std 802.11ax-2021
+ * sections 9.4.2.248.2 and 9.4.2.248.3.
+ */
+struct ieee80211_he_cap_elem {
+	u8 mac_cap_info[6];
+	u8 phy_cap_info[11];
+} __packed;
+
+#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN	5
+
+/**
+ * enum ieee80211_he_mcs_support - HE MCS support definitions
+ * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
+ *	number of streams
+ * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
+ * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
+ * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
+ *
+ * These definitions are used in each 2-bit subfield of the rx_mcs_*
+ * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are
+ * both split into 8 subfields by number of streams. These values indicate
+ * which MCSes are supported for the number of streams the value appears
+ * for.
+ */
+enum ieee80211_he_mcs_support {
+	IEEE80211_HE_MCS_SUPPORT_0_7	= 0,
+	IEEE80211_HE_MCS_SUPPORT_0_9	= 1,
+	IEEE80211_HE_MCS_SUPPORT_0_11	= 2,
+	IEEE80211_HE_MCS_NOT_SUPPORTED	= 3,
+};
+
+/**
+ * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
+ *
+ * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
+ * described in P802.11ax_D2.0 section 9.4.2.237.4
+ *
+ * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than or equal to 80MHz.
+ * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than or equal to 80MHz.
+ * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ */
+struct ieee80211_he_mcs_nss_supp {
+	__le16 rx_mcs_80;
+	__le16 tx_mcs_80;
+	__le16 rx_mcs_160;
+	__le16 tx_mcs_160;
+	__le16 rx_mcs_80p80;
+	__le16 tx_mcs_80p80;
+} __packed;
+
+/**
+ * struct ieee80211_he_operation - HE Operation element
+ * @he_oper_params: HE Operation Parameters + BSS Color Information
+ * @he_mcs_nss_set: Basic HE-MCS And NSS Set
+ * @optional: Optional fields VHT Operation Information, Max Co-Hosted
+ *            BSSID Indicator, and 6 GHz Operation Information
+ *
+ * This structure represents the payload of the "HE Operation
+ * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249.
+ */
+struct ieee80211_he_operation {
+	__le32 he_oper_params;
+	__le16 he_mcs_nss_set;
+	u8 optional[];
+} __packed;
+
+/**
+ * struct ieee80211_he_spr - Spatial Reuse Parameter Set element
+ * @he_sr_control: SR Control
+ * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD
+ *            Min Offset, SRG OBSS PD Max Offset, SRG BSS Color
+ *            Bitmap, and SRG Partial BSSID Bitmap
+ *
+ * This structure represents the payload of the "Spatial Reuse
+ * Parameter Set element" as described in IEEE Std 802.11ax-2021
+ * section 9.4.2.252.
+ */
+struct ieee80211_he_spr {
+	u8 he_sr_control;
+	u8 optional[];
+} __packed;
+
+/**
+ * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
+ * @aifsn: ACI/AIFSN
+ * @ecw_min_max: ECWmin/ECWmax
+ * @mu_edca_timer: MU EDCA Timer
+ *
+ * This structure represents the "MU AC Parameter Record" as described
+ * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p.
+ */
+struct ieee80211_he_mu_edca_param_ac_rec {
+	u8 aifsn;
+	u8 ecw_min_max;
+	u8 mu_edca_timer;
+} __packed;
+
+/**
+ * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
+ * @mu_qos_info: QoS Info
+ * @ac_be: MU AC_BE Parameter Record
+ * @ac_bk: MU AC_BK Parameter Record
+ * @ac_vi: MU AC_VI Parameter Record
+ * @ac_vo: MU AC_VO Parameter Record
+ *
+ * This structure represents the payload of the "MU EDCA Parameter Set
+ * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251.
+ */
+struct ieee80211_mu_edca_param_set {
+	u8 mu_qos_info;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_be;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
+} __packed;
+
+/* 802.11ax HE MAC capabilities */
+#define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
+#define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
+#define IEEE80211_HE_MAC_CAP0_TWT_RES				0x04
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP		0x00
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1		0x08
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2		0x10
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3		0x18
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK			0x18
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1		0x00
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2		0x20
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4		0x40
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8		0x60
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16		0x80
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32		0xa0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64		0xc0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED	0xe0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK		0xe0
+
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED		0x00
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128			0x01
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256			0x02
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512			0x03
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK		0x03
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US		0x00
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US		0x04
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US		0x08
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK		0x0c
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1		0x00
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2		0x10
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3		0x20
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4		0x30
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5		0x40
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6		0x50
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7		0x60
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8		0x70
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK		0x70
+
+/* Link adaptation is split between bytes HE_MAC_CAP1 and
+ * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
+ * is set, in which case the following values apply:
+ * 0 = No feedback.
+ * 1 = reserved.
+ * 2 = Unsolicited feedback.
+ * 3 = both
+ */
+#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION			0x80
+
+#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION			0x01
+#define IEEE80211_HE_MAC_CAP2_ALL_ACK				0x02
+#define IEEE80211_HE_MAC_CAP2_TRS				0x04
+#define IEEE80211_HE_MAC_CAP2_BSR				0x08
+#define IEEE80211_HE_MAC_CAP2_BCAST_TWT				0x10
+#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP			0x20
+#define IEEE80211_HE_MAC_CAP2_MU_CASCADING			0x40
+#define IEEE80211_HE_MAC_CAP2_ACK_EN				0x80
+
+#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL			0x02
+#define IEEE80211_HE_MAC_CAP3_OFDMA_RA				0x04
+
+/* The maximum length of an A-MPDU is defined by the combination of the Maximum
+ * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the
+ * same field in the HE capabilities.
+ */
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0		0x00
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1		0x08
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2		0x10
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3		0x18
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK		0x18
+#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG			0x20
+#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
+#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
+
+#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
+#define IEEE80211_HE_MAC_CAP4_QTP				0x02
+#define IEEE80211_HE_MAC_CAP4_BQR				0x04
+#define IEEE80211_HE_MAC_CAP4_PSR_RESP				0x08
+#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
+#define IEEE80211_HE_MAC_CAP4_OPS				0x20
+#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU			0x40
+/* Multi TID agg TX is split between byte #4 and #5
+ * The value is a combination of B39,B40,B41
+ */
+#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39		0x80
+
+#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40		0x01
+#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41		0x02
+#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION	0x04
+#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU			0x08
+#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX		0x10
+#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS			0x20
+#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING		0x40
+#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX		0x80
+
+#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR	20
+#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR	16
+#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR	13
+
+/* 802.11ax HE PHY capabilities */
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G	0x04
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G		0x08
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G	0x10
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL		0x1e
+
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G	0x20
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G	0x40
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK			0xfe
+
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ	0x01
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ	0x02
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ	0x04
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ	0x08
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK			0x0f
+#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A				0x10
+#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD			0x20
+#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US		0x40
+/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */
+#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS			0x80
+
+#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS			0x01
+#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US			0x02
+#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ			0x04
+#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ			0x08
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX				0x10
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX				0x20
+
+/* Note that the meaning of UL MU below is different between an AP and a non-AP
+ * sta, where in the AP case it indicates support for Rx and in the non-AP sta
+ * case it indicates support for Tx.
+ */
+#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO			0x40
+#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO			0x80
+
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK			0x01
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK			0x02
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2				0x04
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK			0x08
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK			0x10
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2				0x20
+#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU		0x40
+#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER				0x80
+
+#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE				0x01
+#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER				0x02
+
+/* Minimal allowed value of Max STS under 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4		0x0c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5		0x10
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6		0x14
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7		0x18
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8		0x1c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK	0x1c
+
+/* Minimal allowed value of Max STS above 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4		0x60
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5		0x80
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6		0xa0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7		0xc0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8		0xe0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK	0xe0
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2	0x01
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3	0x02
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4	0x03
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5	0x04
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6	0x05
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7	0x06
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8	0x07
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK	0x07
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2	0x08
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3	0x10
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4	0x18
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5	0x20
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6	0x28
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7	0x30
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8	0x38
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK	0x38
+
+#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK				0x40
+#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK				0x80
+
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU			0x01
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU			0x02
+#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB			0x04
+#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB		0x08
+#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB				0x10
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE			0x20
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO		0x40
+#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT			0x80
+
+#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR				0x01
+#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP			0x02
+#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI		0x04
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_1					0x08
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_2					0x10
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_3					0x18
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_4					0x20
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_5					0x28
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_6					0x30
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_7					0x38
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK				0x38
+#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ			0x40
+#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ			0x80
+
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI		0x01
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU			0x04
+#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU			0x08
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI		0x10
+#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF		0x20
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242				0x00
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484				0x40
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996				0x80
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996				0xc0
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK				0xc0
+
+#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM		0x01
+#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK		0x02
+#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU		0x04
+#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU		0x08
+#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB	0x10
+#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB	0x20
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US			0x0
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US			0x1
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US			0x2
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED		0x3
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS			6
+#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK			0xc0
+
+#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF			0x01
+
+/* 802.11ax HE TX/RX MCS NSS Support  */
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS			(6)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS			(11)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK			0x07c0
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK			0xf800
+
+/* TX/RX HE MCS Support field Highest MCS subfield encoding */
+enum ieee80211_he_highest_mcs_supported_subfield_enc {
+	HIGHEST_MCS_SUPPORTED_MCS7 = 0,
+	HIGHEST_MCS_SUPPORTED_MCS8,
+	HIGHEST_MCS_SUPPORTED_MCS9,
+	HIGHEST_MCS_SUPPORTED_MCS10,
+	HIGHEST_MCS_SUPPORTED_MCS11,
+};
+
+/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */
+static inline u8
+ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
+{
+	u8 count = 4; /* rx_mcs_80 + tx_mcs_80 are always counted */
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
+		count += 4; /* rx_mcs_160 + tx_mcs_160 */
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
+		count += 4; /* rx_mcs_80p80 + tx_mcs_80p80 */
+
+	return count;
+}
+
+/* 802.11ax HE PPE Thresholds */
+#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS			(1)
+#define IEEE80211_PPE_THRES_NSS_POS				(0)
+#define IEEE80211_PPE_THRES_NSS_MASK				(7)
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU	\
+	(BIT(5) | BIT(6))
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK		0x78
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS		(3)
+#define IEEE80211_PPE_THRES_INFO_PPET_SIZE			(3)
+#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE			(7)
+
+/*
+ * Calculate 802.11ax HE capabilities IE PPE field size in bytes (0 if absent)
+ * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8*
+ */
+static inline u8
+ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
+{
+	u8 n;
+
+	if ((phy_cap_info[6] &
+	     IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
+		return 0; /* PHY capabilities say no PPE thresholds follow */
+
+	n = hweight8(ppe_thres_hdr &
+		     IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); /* RU bits set */
+	n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
+		   IEEE80211_PPE_THRES_NSS_POS)); /* times (NSS field + 1) */
+
+	/*
+	 * n now counts pairs; each pair is 6 bits (two PPET_SIZE-bit values),
+	 * and the 7 "header" bits are added before rounding up to whole bytes.
+	 */
+	n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
+	n = DIV_ROUND_UP(n, 8);
+
+	return n;
+}
+
+static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data;
+	u8 needed = sizeof(*he_cap_ie_elem); /* fixed MAC + PHY capability bytes */
+
+	if (len < needed)
+		return false;
+
+	needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); /* MCS/NSS maps */
+	if (len < needed)
+		return false;
+
+	if (he_cap_ie_elem->phy_cap_info[6] &
+			IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) {
+		if (len < needed + 1) /* PPE header byte must be readable */
+			return false;
+		needed += ieee80211_he_ppe_size(data[needed],
+						he_cap_ie_elem->phy_cap_info);
+	}
+
+	return len >= needed; /* longer-than-needed input is accepted */
+}
+
+/* HE Operation defines */
+#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK		0x00000007
+#define IEEE80211_HE_OPERATION_TWT_REQUIRED			0x00000008
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK		0x00003ff0
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET		4
+#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x00004000
+#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS			0x00008000
+#define IEEE80211_HE_OPERATION_ER_SU_DISABLE			0x00010000
+#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO			0x00020000
+#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK			0x3f000000
+#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET			24
+#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
+#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x80000000
+
+#define IEEE80211_6GHZ_CTRL_REG_LPI_AP			0
+#define IEEE80211_6GHZ_CTRL_REG_SP_AP			1
+#define IEEE80211_6GHZ_CTRL_REG_VLP_AP			2
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP		3
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD	4
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP		8
+
+/**
+ * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
+ * @primary: primary channel
+ * @control: control flags
+ * @ccfs0: channel center frequency segment 0
+ * @ccfs1: channel center frequency segment 1
+ * @minrate: minimum rate (in 1 Mbps units)
+ */
+struct ieee80211_he_6ghz_oper {
+	u8 primary;
+#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH	0x3
+#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ	0
+#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ	1
+#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ	2
+#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ	3
+#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON	0x4
+#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO	0x78
+	u8 control;
+	u8 ccfs0;
+	u8 ccfs1;
+	u8 minrate;
+} __packed;
+
+/**
+ * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits.
+ *
+ * This enumeration defines bit flags used to represent regulatory connectivity
+ * field bits.
+ *
+ * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid.
+ * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit.
+ * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid.
+ * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit.
+ */
+enum ieee80211_reg_conn_bits {
+	IEEE80211_REG_CONN_LPI_VALID = BIT(0),
+	IEEE80211_REG_CONN_LPI_VALUE = BIT(1),
+	IEEE80211_REG_CONN_SP_VALID = BIT(2),
+	IEEE80211_REG_CONN_SP_VALUE = BIT(3),
+};
+
+/* transmit power interpretation type of transmit power envelope element */
+enum ieee80211_tx_power_intrpt_type {
+	IEEE80211_TPE_LOCAL_EIRP,
+	IEEE80211_TPE_LOCAL_EIRP_PSD,
+	IEEE80211_TPE_REG_CLIENT_EIRP,
+	IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
+};
+
+/* category type of transmit power envelope element */
+enum ieee80211_tx_power_category_6ghz {
+	IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0,
+	IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1,
+};
+
+/*
+ * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP,
+ * setting to 63.5 dBm means no constraint.
+ */
+#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT	127
+
+/*
+ * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
+ * setting to 127 indicates no PSD limit for the 20 MHz channel.
+ */
+#define IEEE80211_TPE_PSD_NO_LIMIT		127
+
+/**
+ * struct ieee80211_tx_pwr_env - Transmit Power Envelope
+ * @info: Transmit Power Information field
+ * @variable: Maximum Transmit Power field
+ *
+ * This structure represents the payload of the "Transmit Power
+ * Envelope element" as described in IEEE Std 802.11ax-2021 section
+ * 9.4.2.161
+ */
+struct ieee80211_tx_pwr_env {
+	u8 info;
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7
+#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38
+#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0
+
+#define IEEE80211_TX_PWR_ENV_EXT_COUNT	0xF
+
+static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len)
+{
+	const struct ieee80211_tx_pwr_env *env = (const void *)data;
+	u8 count, interpret, category;
+	u8 needed = sizeof(*env); /* the fixed 'info' byte */
+	u8 N; /* also called N in the spec */
+
+	if (len < needed)
+		return false;
+
+	count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT);
+	interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET);
+	category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY);
+
+	switch (category) {
+	case IEEE80211_TPE_CAT_6GHZ_DEFAULT:
+	case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE:
+		break;
+	default:
+		return false; /* any other category value is rejected */
+	}
+
+	switch (interpret) {
+	case IEEE80211_TPE_LOCAL_EIRP:
+	case IEEE80211_TPE_REG_CLIENT_EIRP:
+		if (count > 3)
+			return false;
+
+		/* count == 0 encodes 1 value for 20 MHz, etc. */
+		needed += count + 1;
+
+		if (len < needed)
+			return false;
+
+		/* there can be extension fields not accounted for in 'count' */
+
+		return true;
+	case IEEE80211_TPE_LOCAL_EIRP_PSD:
+	case IEEE80211_TPE_REG_CLIENT_EIRP_PSD:
+		if (count > 4)
+			return false;
+
+		N = count ? 1 << (count - 1) : 1; /* 1, 1, 2, 4, 8 for count 0..4 */
+		needed += N;
+
+		if (len < needed)
+			return false;
+
+		if (len > needed) { /* more data: byte after the N values holds K */
+			u8 K = u8_get_bits(env->variable[N],
+					   IEEE80211_TX_PWR_ENV_EXT_COUNT);
+
+			needed += 1 + K; /* the count byte itself plus K bytes */
+			if (len < needed)
+				return false;
+		}
+
+		return true;
+	}
+
+	return false; /* interpretation value not handled by the switch above */
+}
+
+/*
+ * ieee80211_he_oper_size - calculate 802.11ax HE Operation IE size
+ * @he_oper_ie: byte data of the HE Operation IE, starting from the byte
+ *	after the ext ID byte. It is assumed that he_oper_ie has at least
+ *	sizeof(struct ieee80211_he_operation) bytes, the caller must have
+ *	validated this.
+ * @return IE data size (with ext ID byte, without header), or 0 if NULL input
+ */
+static inline u8
+ieee80211_he_oper_size(const u8 *he_oper_ie)
+{
+	const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie;
+	u8 oper_len = sizeof(struct ieee80211_he_operation);
+	u32 he_oper_params;
+
+	/* Make sure the input is not NULL */
+	if (!he_oper_ie)
+		return 0;
+
+	/* Calc required length */
+	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
+	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
+		oper_len += 3; /* VHT Operation Information */
+	if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
+		oper_len++; /* Max Co-Hosted BSSID Indicator */
+	if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)
+		oper_len += sizeof(struct ieee80211_he_6ghz_oper);
+
+	/* Add the first byte (extension ID) to the total length */
+	oper_len++;
+
+	return oper_len;
+}
+
+/**
+ * ieee80211_he_6ghz_oper - obtain 6 GHz operation field
+ * @he_oper: HE operation element (must be pre-validated for size)
+ *	but may be %NULL
+ *
+ * Return: a pointer to the 6 GHz operation field, or %NULL
+ */
+static inline const struct ieee80211_he_6ghz_oper *
+ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
+{
+	const u8 *ret;
+	u32 he_oper_params;
+
+	if (!he_oper)
+		return NULL;
+
+	ret = (const void *)&he_oper->optional; /* start of optional fields */
+
+	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
+
+	if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
+		return NULL; /* element carries no 6 GHz operation field */
+	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
+		ret += 3; /* skip VHT Operation Information */
+	if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
+		ret++; /* skip Max Co-Hosted BSSID Indicator */
+
+	return (const void *)ret;
+}
+
+/* HE Spatial Reuse defines */
+#define IEEE80211_HE_SPR_PSR_DISALLOWED				BIT(0)
+#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED		BIT(1)
+#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT			BIT(2)
+#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT		BIT(3)
+#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED		BIT(4)
+
+/*
+ * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size
+ * @he_spr_ie: byte data of the HE Spatial Reuse IE, starting from the byte
+ *	after the ext ID byte. It is assumed that he_spr_ie has at least
+ *	sizeof(struct ieee80211_he_spr) bytes, the caller must have validated
+ *	this
+ * @return IE data size (with ext ID byte, without header), or 0 if NULL input
+ */
+static inline u8
+ieee80211_he_spr_size(const u8 *he_spr_ie)
+{
+	const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie;
+	u8 spr_len = sizeof(struct ieee80211_he_spr);
+	u8 he_spr_params;
+
+	/* Make sure the input is not NULL */
+	if (!he_spr_ie)
+		return 0;
+
+	/* Calc required length */
+	he_spr_params = he_spr->he_sr_control;
+	if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT)
+		spr_len++; /* Non-SRG OBSS PD Max Offset */
+	if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT)
+		spr_len += 18; /* SRG OBSS PD Min/Max Offset and both SRG bitmaps */
+
+	/* Add the first byte (extension ID) to the total length */
+	spr_len++;
+
+	return spr_len;
+}
+
+struct ieee80211_he_6ghz_capa {
+	/* uses IEEE80211_HE_6GHZ_CAP_* below */
+	__le16 capa;
+} __packed;
+
+/* HE 6 GHz band capabilities */
+/* uses enum ieee80211_min_mpdu_spacing values */
+#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START	0x0007
+/* uses enum ieee80211_vht_max_ampdu_length_exp values */
+#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP	0x0038
+/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */
+#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN	0x00c0
+/* WLAN_HT_CAP_SM_PS_* values */
+#define IEEE80211_HE_6GHZ_CAP_SM_PS		0x0600
+#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER	0x0800
+#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS	0x1000
+#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS	0x2000
+
+#endif /* LINUX_IEEE80211_HE_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0b247b28c661..a3dbbcee00ee 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1141,48 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc)
 	return len;
 }
 
-#define IEEE80211_TWT_CONTROL_NDP			BIT(0)
-#define IEEE80211_TWT_CONTROL_RESP_MODE			BIT(1)
-#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST	BIT(3)
-#define IEEE80211_TWT_CONTROL_RX_DISABLED		BIT(4)
-#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT		BIT(5)
-
-#define IEEE80211_TWT_REQTYPE_REQUEST			BIT(0)
-#define IEEE80211_TWT_REQTYPE_SETUP_CMD			GENMASK(3, 1)
-#define IEEE80211_TWT_REQTYPE_TRIGGER			BIT(4)
-#define IEEE80211_TWT_REQTYPE_IMPLICIT			BIT(5)
-#define IEEE80211_TWT_REQTYPE_FLOWTYPE			BIT(6)
-#define IEEE80211_TWT_REQTYPE_FLOWID			GENMASK(9, 7)
-#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP		GENMASK(14, 10)
-#define IEEE80211_TWT_REQTYPE_PROTECTION		BIT(15)
-
-enum ieee80211_twt_setup_cmd {
-	TWT_SETUP_CMD_REQUEST,
-	TWT_SETUP_CMD_SUGGEST,
-	TWT_SETUP_CMD_DEMAND,
-	TWT_SETUP_CMD_GROUPING,
-	TWT_SETUP_CMD_ACCEPT,
-	TWT_SETUP_CMD_ALTERNATE,
-	TWT_SETUP_CMD_DICTATE,
-	TWT_SETUP_CMD_REJECT,
-};
-
-struct ieee80211_twt_params {
-	__le16 req_type;
-	__le64 twt;
-	u8 min_twt_dur;
-	__le16 mantissa;
-	u8 channel;
-} __packed;
-
-struct ieee80211_twt_setup {
-	u8 dialog_token;
-	u8 element_id;
-	u8 length;
-	u8 control;
-	u8 params[];
-} __packed;
-
 #define IEEE80211_TTLM_MAX_CNT				2
 #define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
 #define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
@@ -1633,137 +1591,6 @@ struct ieee80211_p2p_noa_attr {
 #define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
 #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
 
-/**
- * struct ieee80211_he_cap_elem - HE capabilities element
- * @mac_cap_info: HE MAC Capabilities Information
- * @phy_cap_info: HE PHY Capabilities Information
- *
- * This structure represents the fixed fields of the payload of the
- * "HE capabilities element" as described in IEEE Std 802.11ax-2021
- * sections 9.4.2.248.2 and 9.4.2.248.3.
- */
-struct ieee80211_he_cap_elem {
-	u8 mac_cap_info[6];
-	u8 phy_cap_info[11];
-} __packed;
-
-#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN	5
-
-/**
- * enum ieee80211_he_mcs_support - HE MCS support definitions
- * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
- *	number of streams
- * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
- * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
- * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
- *
- * These definitions are used in each 2-bit subfield of the rx_mcs_*
- * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are
- * both split into 8 subfields by number of streams. These values indicate
- * which MCSes are supported for the number of streams the value appears
- * for.
- */
-enum ieee80211_he_mcs_support {
-	IEEE80211_HE_MCS_SUPPORT_0_7	= 0,
-	IEEE80211_HE_MCS_SUPPORT_0_9	= 1,
-	IEEE80211_HE_MCS_SUPPORT_0_11	= 2,
-	IEEE80211_HE_MCS_NOT_SUPPORTED	= 3,
-};
-
-/**
- * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
- *
- * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
- * described in P802.11ax_D2.0 section 9.4.2.237.4
- *
- * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
- *     widths less than 80MHz.
- * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
- *     widths less than 80MHz.
- * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
- *     width 160MHz.
- * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
- *     width 160MHz.
- * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
- *     channel width 80p80MHz.
- * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
- *     channel width 80p80MHz.
- */
-struct ieee80211_he_mcs_nss_supp {
-	__le16 rx_mcs_80;
-	__le16 tx_mcs_80;
-	__le16 rx_mcs_160;
-	__le16 tx_mcs_160;
-	__le16 rx_mcs_80p80;
-	__le16 tx_mcs_80p80;
-} __packed;
-
-/**
- * struct ieee80211_he_operation - HE Operation element
- * @he_oper_params: HE Operation Parameters + BSS Color Information
- * @he_mcs_nss_set: Basic HE-MCS And NSS Set
- * @optional: Optional fields VHT Operation Information, Max Co-Hosted
- *            BSSID Indicator, and 6 GHz Operation Information
- *
- * This structure represents the payload of the "HE Operation
- * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249.
- */
-struct ieee80211_he_operation {
-	__le32 he_oper_params;
-	__le16 he_mcs_nss_set;
-	u8 optional[];
-} __packed;
-
-/**
- * struct ieee80211_he_spr - Spatial Reuse Parameter Set element
- * @he_sr_control: SR Control
- * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD
- *            Min Offset, SRG OBSS PD Max Offset, SRG BSS Color
- *            Bitmap, and SRG Partial BSSID Bitmap
- *
- * This structure represents the payload of the "Spatial Reuse
- * Parameter Set element" as described in IEEE Std 802.11ax-2021
- * section 9.4.2.252.
- */
-struct ieee80211_he_spr {
-	u8 he_sr_control;
-	u8 optional[];
-} __packed;
-
-/**
- * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
- * @aifsn: ACI/AIFSN
- * @ecw_min_max: ECWmin/ECWmax
- * @mu_edca_timer: MU EDCA Timer
- *
- * This structure represents the "MU AC Parameter Record" as described
- * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p.
- */
-struct ieee80211_he_mu_edca_param_ac_rec {
-	u8 aifsn;
-	u8 ecw_min_max;
-	u8 mu_edca_timer;
-} __packed;
-
-/**
- * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
- * @mu_qos_info: QoS Info
- * @ac_be: MU AC_BE Parameter Record
- * @ac_bk: MU AC_BK Parameter Record
- * @ac_vi: MU AC_VI Parameter Record
- * @ac_vo: MU AC_VO Parameter Record
- *
- * This structure represents the payload of the "MU EDCA Parameter Set
- * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251.
- */
-struct ieee80211_mu_edca_param_set {
-	u8 mu_qos_info;
-	struct ieee80211_he_mu_edca_param_ac_rec ac_be;
-	struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
-	struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
-	struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
-} __packed;
-
 #define IEEE80211_EHT_MCS_NSS_RX 0x0f
 #define IEEE80211_EHT_MCS_NSS_TX 0xf0
 
@@ -1902,618 +1729,6 @@ struct ieee80211_eht_operation_info {
 	u8 optional[];
 } __packed;
 
-/* 802.11ax HE MAC capabilities */
-#define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
-#define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
-#define IEEE80211_HE_MAC_CAP0_TWT_RES				0x04
-#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP		0x00
-#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1		0x08
-#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2		0x10
-#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3		0x18
-#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK			0x18
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1		0x00
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2		0x20
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4		0x40
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8		0x60
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16		0x80
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32		0xa0
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64		0xc0
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED	0xe0
-#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK		0xe0
-
-#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED		0x00
-#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128			0x01
-#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256			0x02
-#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512			0x03
-#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK		0x03
-#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US		0x00
-#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US		0x04
-#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US		0x08
-#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK		0x0c
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1		0x00
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2		0x10
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3		0x20
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4		0x30
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5		0x40
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6		0x50
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7		0x60
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8		0x70
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK		0x70
-
-/* Link adaptation is split between byte HE_MAC_CAP1 and
- * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
- * in which case the following values apply:
- * 0 = No feedback.
- * 1 = reserved.
- * 2 = Unsolicited feedback.
- * 3 = both
- */
-#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION			0x80
-
-#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION			0x01
-#define IEEE80211_HE_MAC_CAP2_ALL_ACK				0x02
-#define IEEE80211_HE_MAC_CAP2_TRS				0x04
-#define IEEE80211_HE_MAC_CAP2_BSR				0x08
-#define IEEE80211_HE_MAC_CAP2_BCAST_TWT				0x10
-#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP			0x20
-#define IEEE80211_HE_MAC_CAP2_MU_CASCADING			0x40
-#define IEEE80211_HE_MAC_CAP2_ACK_EN				0x80
-
-#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL			0x02
-#define IEEE80211_HE_MAC_CAP3_OFDMA_RA				0x04
-
-/* The maximum length of an A-MDPU is defined by the combination of the Maximum
- * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the
- * same field in the HE capabilities.
- */
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0		0x00
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1		0x08
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2		0x10
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3		0x18
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK		0x18
-#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG			0x20
-#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
-#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
-
-#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
-#define IEEE80211_HE_MAC_CAP4_QTP				0x02
-#define IEEE80211_HE_MAC_CAP4_BQR				0x04
-#define IEEE80211_HE_MAC_CAP4_PSR_RESP				0x08
-#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
-#define IEEE80211_HE_MAC_CAP4_OPS				0x20
-#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU			0x40
-/* Multi TID agg TX is split between byte #4 and #5
- * The value is a combination of B39,B40,B41
- */
-#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39		0x80
-
-#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40		0x01
-#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41		0x02
-#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION	0x04
-#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU			0x08
-#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX		0x10
-#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS			0x20
-#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING		0x40
-#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX		0x80
-
-#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR	20
-#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR	16
-#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR	13
-
-/* 802.11ax HE PHY capabilities */
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G	0x04
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G		0x08
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G	0x10
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL		0x1e
-
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G	0x20
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G	0x40
-#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK			0xfe
-
-#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ	0x01
-#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ	0x02
-#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ	0x04
-#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ	0x08
-#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK			0x0f
-#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A				0x10
-#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD			0x20
-#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US		0x40
-/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */
-#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS			0x80
-
-#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS			0x01
-#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US			0x02
-#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ			0x04
-#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ			0x08
-#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX				0x10
-#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX				0x20
-
-/* Note that the meaning of UL MU below is different between an AP and a non-AP
- * sta, where in the AP case it indicates support for Rx and in the non-AP sta
- * case it indicates support for Tx.
- */
-#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO			0x40
-#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO			0x80
-
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM			0x00
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK			0x01
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK			0x02
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM			0x03
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK			0x03
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1				0x00
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2				0x04
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM			0x00
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK			0x08
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK			0x10
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM			0x18
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK			0x18
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1				0x00
-#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2				0x20
-#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU		0x40
-#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER				0x80
-
-#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE				0x01
-#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER				0x02
-
-/* Minimal allowed value of Max STS under 80MHz is 3 */
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4		0x0c
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5		0x10
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6		0x14
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7		0x18
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8		0x1c
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK	0x1c
-
-/* Minimal allowed value of Max STS above 80MHz is 3 */
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4		0x60
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5		0x80
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6		0xa0
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7		0xc0
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8		0xe0
-#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK	0xe0
-
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1	0x00
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2	0x01
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3	0x02
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4	0x03
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5	0x04
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6	0x05
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7	0x06
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8	0x07
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK	0x07
-
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1	0x00
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2	0x08
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3	0x10
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4	0x18
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5	0x20
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6	0x28
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7	0x30
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8	0x38
-#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK	0x38
-
-#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK				0x40
-#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK				0x80
-
-#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU			0x01
-#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU			0x02
-#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB			0x04
-#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB		0x08
-#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB				0x10
-#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE			0x20
-#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO		0x40
-#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT			0x80
-
-#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR				0x01
-#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP			0x02
-#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI		0x04
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_1					0x08
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_2					0x10
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_3					0x18
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_4					0x20
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_5					0x28
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_6					0x30
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_7					0x38
-#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK				0x38
-#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ			0x40
-#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ			0x80
-
-#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI		0x01
-#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G		0x02
-#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU			0x04
-#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU			0x08
-#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI		0x10
-#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF		0x20
-#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242				0x00
-#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484				0x40
-#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996				0x80
-#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996				0xc0
-#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK				0xc0
-
-#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM		0x01
-#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK		0x02
-#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU		0x04
-#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU		0x08
-#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB	0x10
-#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB	0x20
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US			0x0
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US			0x1
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US			0x2
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED		0x3
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS			6
-#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK			0xc0
-
-#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF			0x01
-
-/* 802.11ax HE TX/RX MCS NSS Support  */
-#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
-#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS			(6)
-#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS			(11)
-#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK			0x07c0
-#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK			0xf800
-
-/* TX/RX HE MCS Support field Highest MCS subfield encoding */
-enum ieee80211_he_highest_mcs_supported_subfield_enc {
-	HIGHEST_MCS_SUPPORTED_MCS7 = 0,
-	HIGHEST_MCS_SUPPORTED_MCS8,
-	HIGHEST_MCS_SUPPORTED_MCS9,
-	HIGHEST_MCS_SUPPORTED_MCS10,
-	HIGHEST_MCS_SUPPORTED_MCS11,
-};
-
-/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */
-static inline u8
-ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
-{
-	u8 count = 4;
-
-	if (he_cap->phy_cap_info[0] &
-	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
-		count += 4;
-
-	if (he_cap->phy_cap_info[0] &
-	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
-		count += 4;
-
-	return count;
-}
-
-/* 802.11ax HE PPE Thresholds */
-#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS			(1)
-#define IEEE80211_PPE_THRES_NSS_POS				(0)
-#define IEEE80211_PPE_THRES_NSS_MASK				(7)
-#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU	\
-	(BIT(5) | BIT(6))
-#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK		0x78
-#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS		(3)
-#define IEEE80211_PPE_THRES_INFO_PPET_SIZE			(3)
-#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE			(7)
-
-/*
- * Calculate 802.11ax HE capabilities IE PPE field size
- * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8*
- */
-static inline u8
-ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
-{
-	u8 n;
-
-	if ((phy_cap_info[6] &
-	     IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
-		return 0;
-
-	n = hweight8(ppe_thres_hdr &
-		     IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
-	n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
-		   IEEE80211_PPE_THRES_NSS_POS));
-
-	/*
-	 * Each pair is 6 bits, and we need to add the 7 "header" bits to the
-	 * total size.
-	 */
-	n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
-	n = DIV_ROUND_UP(n, 8);
-
-	return n;
-}
-
-static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
-{
-	const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data;
-	u8 needed = sizeof(*he_cap_ie_elem);
-
-	if (len < needed)
-		return false;
-
-	needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem);
-	if (len < needed)
-		return false;
-
-	if (he_cap_ie_elem->phy_cap_info[6] &
-			IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) {
-		if (len < needed + 1)
-			return false;
-		needed += ieee80211_he_ppe_size(data[needed],
-						he_cap_ie_elem->phy_cap_info);
-	}
-
-	return len >= needed;
-}
-
-/* HE Operation defines */
-#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK		0x00000007
-#define IEEE80211_HE_OPERATION_TWT_REQUIRED			0x00000008
-#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK		0x00003ff0
-#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET		4
-#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x00004000
-#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS			0x00008000
-#define IEEE80211_HE_OPERATION_ER_SU_DISABLE			0x00010000
-#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO			0x00020000
-#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK			0x3f000000
-#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET			24
-#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
-#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x80000000
-
-#define IEEE80211_6GHZ_CTRL_REG_LPI_AP			0
-#define IEEE80211_6GHZ_CTRL_REG_SP_AP			1
-#define IEEE80211_6GHZ_CTRL_REG_VLP_AP			2
-#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP		3
-#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD	4
-#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP		8
-
-/**
- * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
- * @primary: primary channel
- * @control: control flags
- * @ccfs0: channel center frequency segment 0
- * @ccfs1: channel center frequency segment 1
- * @minrate: minimum rate (in 1 Mbps units)
- */
-struct ieee80211_he_6ghz_oper {
-	u8 primary;
-#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH	0x3
-#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ	0
-#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ	1
-#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ	2
-#define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ	3
-#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON	0x4
-#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO	0x78
-	u8 control;
-	u8 ccfs0;
-	u8 ccfs1;
-	u8 minrate;
-} __packed;
-
-/**
- * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits.
- *
- * This enumeration defines bit flags used to represent regulatory connectivity
- * field bits.
- *
- * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid.
- * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit.
- * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid.
- * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit.
- */
-enum ieee80211_reg_conn_bits {
-	IEEE80211_REG_CONN_LPI_VALID = BIT(0),
-	IEEE80211_REG_CONN_LPI_VALUE = BIT(1),
-	IEEE80211_REG_CONN_SP_VALID = BIT(2),
-	IEEE80211_REG_CONN_SP_VALUE = BIT(3),
-};
-
-/* transmit power interpretation type of transmit power envelope element */
-enum ieee80211_tx_power_intrpt_type {
-	IEEE80211_TPE_LOCAL_EIRP,
-	IEEE80211_TPE_LOCAL_EIRP_PSD,
-	IEEE80211_TPE_REG_CLIENT_EIRP,
-	IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
-};
-
-/* category type of transmit power envelope element */
-enum ieee80211_tx_power_category_6ghz {
-	IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0,
-	IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1,
-};
-
-/*
- * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP,
- * setting to 63.5 dBm means no constraint.
- */
-#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT	127
-
-/*
- * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
- * setting to 127 indicates no PSD limit for the 20 MHz channel.
- */
-#define IEEE80211_TPE_PSD_NO_LIMIT		127
-
-/**
- * struct ieee80211_tx_pwr_env - Transmit Power Envelope
- * @info: Transmit Power Information field
- * @variable: Maximum Transmit Power field
- *
- * This structure represents the payload of the "Transmit Power
- * Envelope element" as described in IEEE Std 802.11ax-2021 section
- * 9.4.2.161
- */
-struct ieee80211_tx_pwr_env {
-	u8 info;
-	u8 variable[];
-} __packed;
-
-#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7
-#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38
-#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0
-
-#define IEEE80211_TX_PWR_ENV_EXT_COUNT	0xF
-
-static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len)
-{
-	const struct ieee80211_tx_pwr_env *env = (const void *)data;
-	u8 count, interpret, category;
-	u8 needed = sizeof(*env);
-	u8 N; /* also called N in the spec */
-
-	if (len < needed)
-		return false;
-
-	count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT);
-	interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET);
-	category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY);
-
-	switch (category) {
-	case IEEE80211_TPE_CAT_6GHZ_DEFAULT:
-	case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE:
-		break;
-	default:
-		return false;
-	}
-
-	switch (interpret) {
-	case IEEE80211_TPE_LOCAL_EIRP:
-	case IEEE80211_TPE_REG_CLIENT_EIRP:
-		if (count > 3)
-			return false;
-
-		/* count == 0 encodes 1 value for 20 MHz, etc. */
-		needed += count + 1;
-
-		if (len < needed)
-			return false;
-
-		/* there can be extension fields not accounted for in 'count' */
-
-		return true;
-	case IEEE80211_TPE_LOCAL_EIRP_PSD:
-	case IEEE80211_TPE_REG_CLIENT_EIRP_PSD:
-		if (count > 4)
-			return false;
-
-		N = count ? 1 << (count - 1) : 1;
-		needed += N;
-
-		if (len < needed)
-			return false;
-
-		if (len > needed) {
-			u8 K = u8_get_bits(env->variable[N],
-					   IEEE80211_TX_PWR_ENV_EXT_COUNT);
-
-			needed += 1 + K;
-			if (len < needed)
-				return false;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
- * @he_oper_ie: byte data of the He Operations IE, stating from the byte
- *	after the ext ID byte. It is assumed that he_oper_ie has at least
- *	sizeof(struct ieee80211_he_operation) bytes, the caller must have
- *	validated this.
- * @return the actual size of the IE data (not including header), or 0 on error
- */
-static inline u8
-ieee80211_he_oper_size(const u8 *he_oper_ie)
-{
-	const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie;
-	u8 oper_len = sizeof(struct ieee80211_he_operation);
-	u32 he_oper_params;
-
-	/* Make sure the input is not NULL */
-	if (!he_oper_ie)
-		return 0;
-
-	/* Calc required length */
-	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
-	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
-		oper_len += 3;
-	if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
-		oper_len++;
-	if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)
-		oper_len += sizeof(struct ieee80211_he_6ghz_oper);
-
-	/* Add the first byte (extension ID) to the total length */
-	oper_len++;
-
-	return oper_len;
-}
-
-/**
- * ieee80211_he_6ghz_oper - obtain 6 GHz operation field
- * @he_oper: HE operation element (must be pre-validated for size)
- *	but may be %NULL
- *
- * Return: a pointer to the 6 GHz operation field, or %NULL
- */
-static inline const struct ieee80211_he_6ghz_oper *
-ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
-{
-	const u8 *ret;
-	u32 he_oper_params;
-
-	if (!he_oper)
-		return NULL;
-
-	ret = (const void *)&he_oper->optional;
-
-	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
-
-	if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
-		return NULL;
-	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
-		ret += 3;
-	if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
-		ret++;
-
-	return (const void *)ret;
-}
-
-/* HE Spatial Reuse defines */
-#define IEEE80211_HE_SPR_PSR_DISALLOWED				BIT(0)
-#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED		BIT(1)
-#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT			BIT(2)
-#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT		BIT(3)
-#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED		BIT(4)
-
-/*
- * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size
- * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte
- *	after the ext ID byte. It is assumed that he_spr_ie has at least
- *	sizeof(struct ieee80211_he_spr) bytes, the caller must have validated
- *	this
- * @return the actual size of the IE data (not including header), or 0 on error
- */
-static inline u8
-ieee80211_he_spr_size(const u8 *he_spr_ie)
-{
-	const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie;
-	u8 spr_len = sizeof(struct ieee80211_he_spr);
-	u8 he_spr_params;
-
-	/* Make sure the input is not NULL */
-	if (!he_spr_ie)
-		return 0;
-
-	/* Calc required length */
-	he_spr_params = he_spr->he_sr_control;
-	if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT)
-		spr_len++;
-	if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT)
-		spr_len += 18;
-
-	/* Add the first byte (extension ID) to the total length */
-	spr_len++;
-
-	return spr_len;
-}
-
 /* S1G Capabilities Information field */
 #define IEEE80211_S1G_CAPABILITY_LEN	15
 
@@ -2697,6 +1912,9 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 #define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ	3
 #define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ	4
 
+/* need HE definitions for EHT functions */
+#include "ieee80211-he.h"
+
 /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */
 static inline u8
 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap,
@@ -3815,24 +3033,6 @@ struct ieee80211_tspec_ie {
 	__le16 medium_time;
 } __packed;
 
-struct ieee80211_he_6ghz_capa {
-	/* uses IEEE80211_HE_6GHZ_CAP_* below */
-	__le16 capa;
-} __packed;
-
-/* HE 6 GHz band capabilities */
-/* uses enum ieee80211_min_mpdu_spacing values */
-#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START	0x0007
-/* uses enum ieee80211_vht_max_ampdu_length_exp values */
-#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP	0x0038
-/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */
-#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN	0x00c0
-/* WLAN_HT_CAP_SM_PS_* values */
-#define IEEE80211_HE_6GHZ_CAP_SM_PS		0x0600
-#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER	0x0800
-#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS	0x1000
-#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS	0x2000
-
 /**
  * ieee80211_get_qos_ctl - get pointer to qos control bytes
  * @hdr: the frame
-- 
cgit v1.2.3


From 86bc0c662322b4749cd666678d2fdce7015bcae3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:53 +0100
Subject: wifi: ieee80211: split EHT definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting EHT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.bf77fe169140.I691267e0edd914c604a5bfd447d33be00044c9b4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-eht.h | 1182 +++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h     | 1164 +---------------------------------------
 2 files changed, 1184 insertions(+), 1162 deletions(-)
 create mode 100644 include/linux/ieee80211-eht.h

(limited to 'include')

diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h
new file mode 100644
index 000000000000..f9782e46c5e5
--- /dev/null
+++ b/include/linux/ieee80211-eht.h
@@ -0,0 +1,1182 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 EHT definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_EHT_H
+#define LINUX_IEEE80211_EHT_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+/* need HE definitions for the inlines here */
+#include <linux/ieee80211-he.h>
+
+#define IEEE80211_TTLM_MAX_CNT				2
+#define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
+#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
+#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT	0x08
+#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT	0x10
+#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE		0x20
+
+#define IEEE80211_TTLM_DIRECTION_DOWN		0
+#define IEEE80211_TTLM_DIRECTION_UP		1
+#define IEEE80211_TTLM_DIRECTION_BOTH		2
+
+/**
+ * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
+ *
+ * Defined in section 9.4.2.314 in P802.11be_D4
+ *
+ * @control: the first part of control field
+ * @optional: the second part of control field
+ */
+struct ieee80211_ttlm_elem {
+	u8 control;
+	u8 optional[];
+} __packed;
+
+#define IEEE80211_EHT_MCS_NSS_RX 0x0f
+#define IEEE80211_EHT_MCS_NSS_TX 0xf0
+
+/**
+ * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max
+ * supported NSS per MCS.
+ *
+ * For each field below, bits 0 - 3 indicate the maximal number of spatial
+ * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
+ * for Tx.
+ *
+ * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 0 - 7.
+ * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 8 - 9.
+ * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 10 - 11.
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 12 - 13.
+ * @rx_tx_max_nss: array of the previous fields for easier loop access
+ */
+struct ieee80211_eht_mcs_nss_supp_20mhz_only {
+	union {
+		struct {
+			u8 rx_tx_mcs7_max_nss;
+			u8 rx_tx_mcs9_max_nss;
+			u8 rx_tx_mcs11_max_nss;
+			u8 rx_tx_mcs13_max_nss;
+		};
+		u8 rx_tx_max_nss[4];
+	};
+};
+
+/**
+ * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except
+ * 20MHz only stations).
+ *
+ * For each field below, bits 0 - 3 indicate the maximal number of spatial
+ * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
+ * for Tx.
+ *
+ * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 0 - 9.
+ * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 10 - 11.
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
+ *     supported for reception and the maximum number of spatial streams
+ *     supported for transmission for MCS 12 - 13.
+ * @rx_tx_max_nss: array of the previous fields for easier loop access
+ */
+struct ieee80211_eht_mcs_nss_supp_bw {
+	union {
+		struct {
+			u8 rx_tx_mcs9_max_nss;
+			u8 rx_tx_mcs11_max_nss;
+			u8 rx_tx_mcs13_max_nss;
+		};
+		u8 rx_tx_max_nss[3];
+	};
+};
+
+/**
+ * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data
+ *
+ * This structure is the "EHT Capabilities element" fixed fields as
+ * described in P802.11be_D2.0 section 9.4.2.313.
+ *
+ * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP*
+ * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP*
+ */
+struct ieee80211_eht_cap_elem_fixed {
+	u8 mac_cap_info[2];
+	u8 phy_cap_info[9];
+} __packed;
+
+/**
+ * struct ieee80211_eht_cap_elem - EHT capabilities element
+ * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed
+ * @optional: optional parts
+ */
+struct ieee80211_eht_cap_elem {
+	struct ieee80211_eht_cap_elem_fixed fixed;
+
+	/*
+	 * Followed by:
+	 * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets.
+	 * EHT PPE Thresholds field: variable length.
+	 */
+	u8 optional[];
+} __packed;
+
+#define IEEE80211_EHT_OPER_INFO_PRESENT	                        0x01
+#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x02
+#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION	                0x04
+#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT         0x08
+#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK      0x30
+#define IEEE80211_EHT_OPER_MCS15_DISABLE                        0x40
+
+/**
+ * struct ieee80211_eht_operation - eht operation element
+ *
+ * This structure is the "EHT Operation Element" fields as
+ * described in P802.11be_D2.0 section 9.4.2.311
+ *
+ * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_*
+ * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in
+ *     EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and
+ *     receive.
+ * @optional: optional parts
+ */
+struct ieee80211_eht_operation {
+	u8 params;
+	struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss;
+	u8 optional[];
+} __packed;
+
+/**
+ * struct ieee80211_eht_operation_info - eht operation information
+ *
+ * @control: EHT operation information control.
+ * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz
+ *     EHT BSS.
+ * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS.
+ * @optional: optional parts
+ */
+struct ieee80211_eht_operation_info {
+	u8 control;
+	u8 ccfs0;
+	u8 ccfs1;
+	u8 optional[];
+} __packed;
+
+/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */
+#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS			0x01
+#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL			0x02
+#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1		0x04
+#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2		0x08
+#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT			0x10
+#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC			0x20
+#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK		0xc0
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895	        0
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991	        1
+#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454	        2
+
+#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK		0x01
+#define IEEE80211_EHT_MAC_CAP1_EHT_TRS				0x02
+#define IEEE80211_EHT_MAC_CAP1_TXOP_RET				0x04
+#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS				0x08
+#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK		0x30
+#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS		0x40
+
+/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */
+#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ			0x02
+#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ		0x04
+#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI		0x08
+#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO		0x10
+#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER			0x20
+#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE			0x40
+
+/* EHT beamformee number of spatial streams <= 80MHz is split */
+#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK		0x80
+#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK		0x03
+
+#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK	0x1c
+#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK	0xe0
+
+#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK		0x07
+#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK		0x38
+
+/* EHT number of sounding dimensions for 320MHz is split */
+#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK		0xc0
+#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK		0x01
+#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK		0x02
+#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK		0x04
+#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK		0x08
+#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK		0x10
+#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK			0x20
+#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK		0x40
+#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK			0x80
+
+#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO		0x01
+#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP			0x02
+#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP		0x04
+#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI	0x08
+#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK			0xf0
+
+#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK		0x01
+#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP		0x02
+#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP		0x04
+#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT		0x08
+#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK	0x30
+#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US	0
+#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US	1
+#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US	2
+#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US	3
+
+/* Maximum number of supported EHT LTF is split */
+#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK	0xc0
+#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF		0x40
+#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK	0x07
+
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ			0x08
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ		0x30
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ		0x40
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK			0x78
+#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP		0x80
+
+#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW	0x01
+#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ	0x02
+#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ	0x04
+#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ	0x08
+#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ		0x10
+#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ		0x20
+#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ		0x40
+#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT	0x80
+
+#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA	0x01
+#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA	0x02
+
+/*
+ * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311
+ */
+#define IEEE80211_EHT_OPER_CHAN_WIDTH		0x7
+#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ	0
+#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ	1
+#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ	2
+#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ	3
+#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ	4
+
+/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */
+static inline u8
+ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap,
+			   const struct ieee80211_eht_cap_elem_fixed *eht_cap,
+			   bool from_ap)
+{
+	u8 count = 0;
+
+	/* on 2.4 GHz, if it supports 40 MHz, the result is 3 */
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
+		return 3;
+
+	/* on 2.4 GHz, these three bits are reserved, so should be 0 */
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)
+		count += 3;
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
+		count += 3;
+
+	if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)
+		count += 3;
+
+	if (count)
+		return count;
+
+	return from_ap ? 3 : 4;
+}
+
+/* 802.11be EHT PPE Thresholds */
+#define IEEE80211_EHT_PPE_THRES_NSS_POS			0
+#define IEEE80211_EHT_PPE_THRES_NSS_MASK		0xf
+#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK	0x1f0
+#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE		3
+#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE	9
+
+/*
+ * Calculate 802.11be EHT capabilities IE PPE Thresholds field size in octets
+ */
+static inline u8
+ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info)
+{
+	u32 n;
+
+	if (!(phy_cap_info[5] &
+	      IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT))
+		return 0;
+
+	n = hweight16(ppe_thres_hdr &
+		      IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK);
+	n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK);
+
+	/*
+	 * Each pair is 6 bits, and we need to add the 9 "header" bits to the
+	 * total size.
+	 */
+	n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 +
+	    IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE;
+	return DIV_ROUND_UP(n, 8);
+}
+
+static inline bool
+ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len,
+			   bool from_ap)
+{
+	const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data;
+	u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed);
+
+	if (len < needed || !he_capa)
+		return false;
+
+	needed += ieee80211_eht_mcs_nss_size((const void *)he_capa,
+					     (const void *)data,
+					     from_ap);
+	if (len < needed)
+		return false;
+
+	if (elem->phy_cap_info[5] &
+			IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) {
+		u16 ppe_thres_hdr;
+
+		if (len < needed + sizeof(ppe_thres_hdr))
+			return false;
+
+		ppe_thres_hdr = get_unaligned_le16(data + needed);
+		needed += ieee80211_eht_ppe_size(ppe_thres_hdr,
+						 elem->phy_cap_info);
+	}
+
+	return len >= needed;
+}
+
+static inline bool
+ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_eht_operation *elem = (const void *)data;
+	u8 needed = sizeof(*elem);
+
+	if (len < needed)
+		return false;
+
+	if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) {
+		needed += 3;
+
+		if (elem->params &
+		    IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)
+			needed += 2;
+	}
+
+	return len >= needed;
+}
+
+/* must validate ieee80211_eht_oper_size_ok() first */
+static inline u16
+ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper)
+{
+	const struct ieee80211_eht_operation_info *info =
+		(const void *)eht_oper->optional;
+
+	if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT))
+		return 0;
+
+	if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT))
+		return 0;
+
+	return get_unaligned_le16(info->optional);
+}
+
+#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
+
+struct ieee80211_bandwidth_indication {
+	u8 params;
+	struct ieee80211_eht_operation_info info;
+} __packed;
+
+static inline bool
+ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_bandwidth_indication *bwi = (const void *)data;
+
+	if (len < sizeof(*bwi))
+		return false;
+
+	if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT &&
+	    len < sizeof(*bwi) + 2)
+		return false;
+
+	return true;
+}
+
+/* Protected EHT action codes */
+enum ieee80211_protected_eht_actioncode {
+	WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0,
+	WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1,
+	WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2,
+	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3,
+	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4,
+	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5,
+	WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6,
+	WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7,
+	WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8,
+	WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9,
+	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10,
+	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11,
+	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12,
+};
+
+/* multi-link device */
+#define IEEE80211_MLD_MAX_NUM_LINKS	15
+
+#define IEEE80211_ML_CONTROL_TYPE			0x0007
+#define IEEE80211_ML_CONTROL_TYPE_BASIC			0
+#define IEEE80211_ML_CONTROL_TYPE_PREQ			1
+#define IEEE80211_ML_CONTROL_TYPE_RECONF		2
+#define IEEE80211_ML_CONTROL_TYPE_TDLS			3
+#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS		4
+#define IEEE80211_ML_CONTROL_PRESENCE_MASK		0xfff0
+
+struct ieee80211_multi_link_elem {
+	__le16 control;
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_BASIC_PRES_LINK_ID		0x0010
+#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT	0x0020
+#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY		0x0040
+#define IEEE80211_MLC_BASIC_PRES_EML_CAPA		0x0080
+#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP		0x0100
+#define IEEE80211_MLC_BASIC_PRES_MLD_ID			0x0200
+#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP	0x0400
+
+#define IEEE80211_MED_SYNC_DELAY_DURATION		0x00ff
+#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH	0x0f00
+#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS	0xf000
+
+/*
+ * Described in P802.11be_D3.0
+ * dot11MSDTimerDuration should default to 5484 (i.e. 171.375)
+ * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0)
+ * dot11MSDTXOPMAX defaults to 1
+ */
+#define IEEE80211_MED_SYNC_DELAY_DEFAULT		0x10ac
+
+#define IEEE80211_EML_CAP_EMLSR_SUPP			0x0001
+#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY		0x000e
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US		0
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US		1
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US		2
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US		3
+#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US		4
+#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY	0x0070
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US		0
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US		1
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US		2
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US		3
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US		4
+#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US		5
+#define IEEE80211_EML_CAP_EMLMR_SUPPORT			0x0080
+#define IEEE80211_EML_CAP_EMLMR_DELAY			0x0700
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_0US			0
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_32US			1
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_64US			2
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_128US			3
+#define  IEEE80211_EML_CAP_EMLMR_DELAY_256US			4
+#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT		0x7800
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0			0
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US		1
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US		2
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US		3
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU		4
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU		5
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU		6
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU		7
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU		8
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU		9
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU		10
+#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU		11
+
+#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS		0x000f
+#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT		0x0010
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP	0x0060
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP	0
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME	1
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED	2
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF	3
+#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND		0x0f80
+#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT		0x1000
+#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT	0x2000
+#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT	0x4000
+
+struct ieee80211_mle_basic_common_info {
+	u8 len;
+	u8 mld_mac_addr[ETH_ALEN];
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_PREQ_PRES_MLD_ID			0x0010
+
+struct ieee80211_mle_preq_common_info {
+	u8 len;
+	u8 variable[];
+} __packed;
+
+#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR		0x0010
+#define IEEE80211_MLC_RECONF_PRES_EML_CAPA		0x0020
+#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP		0x0040
+#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP	0x0080
+
+/* no fixed fields in RECONF */
+
+struct ieee80211_mle_tdls_common_info {
+	u8 len;
+	u8 ap_mld_mac_addr[ETH_ALEN];
+} __packed;
+
+#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR	0x0010
+
+/* no fixed fields in PRIO_ACCESS */
+
+/**
+ * ieee80211_mle_common_size - get multi-link element common size
+ * @data: multi-link element, must already be checked for size using
+ *	ieee80211_mle_size_ok()
+ * Return: size of the control field plus common info, or 0 for an unknown type
+ */
+static inline u8 ieee80211_mle_common_size(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+
+	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
+	case IEEE80211_ML_CONTROL_TYPE_BASIC:
+	case IEEE80211_ML_CONTROL_TYPE_PREQ:
+	case IEEE80211_ML_CONTROL_TYPE_TDLS:
+	case IEEE80211_ML_CONTROL_TYPE_RECONF:
+	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
+		/*
+		 * The length is the first octet pointed by mle->variable so no
+		 * need to add anything
+		 */
+		break;
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+
+	return sizeof(*mle) + mle->variable[0];
+}
+
+/**
+ * ieee80211_mle_get_link_id - returns the link ID
+ * @data: the basic multi link element
+ * Return: the link ID, or -1 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline int ieee80211_mle_get_link_id(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/* common points now at the beginning of ieee80211_mle_basic_common_info */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID))
+		return -1;
+
+	return *common;
+}
+
+/**
+ * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count
+ * @data: pointer to the basic multi link element
+ * Return: the BSS Parameter Change Count field value, or -1 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline int
+ieee80211_mle_get_bss_param_ch_cnt(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/* common points now at the beginning of ieee80211_mle_basic_common_info */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))
+		return -1;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+
+	return *common;
+}
+
+/**
+ * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay
+ * @data: pointer to the multi-link element
+ * Return: the medium synchronization delay field value from the multi-link
+ *	element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT)
+ *	if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/* common points now at the beginning of ieee80211_mle_basic_common_info */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY))
+		return IEEE80211_MED_SYNC_DELAY_DEFAULT;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+
+	return get_unaligned_le16(common);
+}
+
+/**
+ * ieee80211_mle_get_eml_cap - returns the EML capability
+ * @data: pointer to the multi-link element
+ * Return: the EML capability field value from the multi-link element,
+ *	or 0 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline u16 ieee80211_mle_get_eml_cap(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/* common points now at the beginning of ieee80211_mle_basic_common_info */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+
+	return get_unaligned_le16(common);
+}
+
+/**
+ * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations.
+ * @data: pointer to the multi-link element
+ * Return: the MLD capabilities and operations field value from the multi-link
+ *	element, or 0 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/*
+	 * common points now at the beginning of
+	 * ieee80211_mle_basic_common_info
+	 */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+		common += 2;
+
+	return get_unaligned_le16(common);
+}
+
+/* Defined in Figure 9-1074t in P802.11be_D7.0 */
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE           0x0001
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK    0x001e
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE               0x0020
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK     0x0040
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP     0x0080
+
+/**
+ * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities
+ *	and operations.
+ * @data: pointer to the multi-link element
+ * Return: the extended MLD capabilities and operations field value from
+ *	the multi-link element, or 0 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/*
+	 * common points now at the beginning of
+	 * ieee80211_mle_basic_common_info
+	 */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID)
+		common += 1;
+
+	return get_unaligned_le16(common);
+}
+
+/**
+ * ieee80211_mle_get_mld_id - returns the MLD ID
+ * @data: pointer to the multi-link element
+ * Return: The MLD ID in the given multi-link element, or 0 if not present
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ */
+static inline u8 ieee80211_mle_get_mld_id(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/*
+	 * common points now at the beginning of
+	 * ieee80211_mle_basic_common_info
+	 */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
+		common += 2;
+
+	return *common;
+}
+
+/**
+ * ieee80211_mle_size_ok - validate multi-link element size
+ * @data: pointer to the element data
+ * @len: length of the containing element
+ * Return: whether or not the multi-link element size is OK
+ */
+static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u8 fixed = sizeof(*mle);
+	u8 common = 0;
+	bool check_common_len = false;
+	u16 control;
+
+	if (!data || len < fixed)
+		return false;
+
+	control = le16_to_cpu(mle->control);
+
+	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
+	case IEEE80211_ML_CONTROL_TYPE_BASIC:
+		common += sizeof(struct ieee80211_mle_basic_common_info);
+		check_common_len = true;
+		if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+			common += 1;
+		if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+			common += 1;
+		if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
+			common += 2;
+		if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID)
+			common += 1;
+		if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)
+			common += 2;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PREQ:
+		common += sizeof(struct ieee80211_mle_preq_common_info);
+		if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID)
+			common += 1;
+		check_common_len = true;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_RECONF:
+		if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
+			common += ETH_ALEN;
+		if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA)
+			common += 2;
+		if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP)
+			common += 2;
+		if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP)
+			common += 2;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_TDLS:
+		common += sizeof(struct ieee80211_mle_tdls_common_info);
+		check_common_len = true;
+		break;
+	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
+		common = ETH_ALEN + 1;
+		break;
+	default:
+		/* we don't know this type */
+		return true;
+	}
+
+	if (len < fixed + common)
+		return false;
+
+	if (!check_common_len)
+		return true;
+
+	/* if present, common length is the first octet there */
+	return mle->variable[0] >= common;
+}
+
+/**
+ * ieee80211_mle_type_ok - validate multi-link element type and size
+ * @data: pointer to the multi-link element data (starts at the control field)
+ * @type: expected IEEE80211_ML_CONTROL_TYPE field value of the element
+ * @len: length of the containing element
+ * Return: %true if ieee80211_mle_size_ok() accepts it and the type matches
+ */
+static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control;
+
+	if (!ieee80211_mle_size_ok(data, len))
+		return false;
+
+	control = le16_to_cpu(mle->control);
+
+	if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type)
+		return true;
+
+	return false;
+}
+
+enum ieee80211_mle_subelems {
+	IEEE80211_MLE_SUBELEM_PER_STA_PROFILE		= 0,
+	IEEE80211_MLE_SUBELEM_FRAGMENT		        = 254,
+};
+
+#define IEEE80211_MLE_STA_CONTROL_LINK_ID			0x000f
+#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE		0x0010
+#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT		0x0020
+#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT		0x0040
+#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT		0x0080
+#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT		0x0100
+#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT	0x0200
+#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE		0x0400
+#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT	0x0800
+
+struct ieee80211_mle_per_sta_profile {
+	__le16 control;		/* STA control bits, stored little-endian */
+	u8 sta_info_len;	/* STA info length; size checks count this octet */
+	u8 variable[];		/* optional STA info fields selected by control */
+} __packed;
+
+/**
+ * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta
+ *	profile size
+ * @data: pointer to the sub element data, read as a per-STA profile
+ * @len: length of the containing sub element
+ * Return: %true if sta_info_len covers the flagged fields and fits in @len
+ */
+static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data,
+							size_t len)
+{
+	const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
+	u16 control;
+	u8 fixed = sizeof(*prof);	/* control plus the sta_info_len octet */
+	u8 info_len = 1;		/* sta_info_len counts its own octet */
+
+	if (len < fixed)
+		return false;
+
+	control = le16_to_cpu(prof->control);
+
+	if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
+		info_len += 6;		/* STA MAC address */
+	if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
+		info_len += 2;		/* beacon interval */
+	if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
+		info_len += 8;		/* TSF offset */
+	if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
+		info_len += 2;		/* DTIM info */
+	if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
+	    control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
+		if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
+			info_len += 2;	/* NSTR bitmap, larger size */
+		else
+			info_len += 1;	/* NSTR bitmap, smaller size */
+	}
+	if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)
+		info_len += 1;		/* BSS parameter change count */
+
+	return prof->sta_info_len >= info_len &&
+	       fixed + prof->sta_info_len - 1 <= len;
+}
+
+/**
+ * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS
+ *	parameter change count
+ * @prof: the per-STA profile, having been checked with
+ *	ieee80211_mle_basic_sta_prof_size_ok() for the correct length
+ *
+ * Return: The BSS parameter change count value if present, 0 otherwise.
+ */
+static inline u8
+ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof)
+{
+	u16 control = le16_to_cpu(prof->control);
+	const u8 *pos = prof->variable;	/* first optional STA info field */
+
+	if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT))
+		return 0;
+
+	if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
+		pos += 6;		/* skip STA MAC address */
+	if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
+		pos += 2;		/* skip beacon interval */
+	if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
+		pos += 8;		/* skip TSF offset */
+	if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
+		pos += 2;		/* skip DTIM info */
+	if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
+	    control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
+		if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
+			pos += 2;	/* skip NSTR bitmap, larger size */
+		else
+			pos += 1;	/* skip NSTR bitmap, smaller size */
+	}
+
+	return *pos;			/* single-octet change count */
+}
+
+#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID			0x000f
+#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE		0x0010
+#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT		0x0020
+#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT		0x0040
+#define	IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE                 0x0780
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM          0
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK        2
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK        3
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS     4
+#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT       0x0800
+
+/**
+ * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link
+ *	element sta profile size
+ * @data: pointer to the sub element data, read as a per-STA profile
+ * @len: length of the containing sub element
+ * Return: %true if sta_info_len covers the flagged fields and fits in @len
+ */
+static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
+							 size_t len)
+{
+	const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
+	u16 control;
+	u8 fixed = sizeof(*prof);	/* control plus the sta_info_len octet */
+	u8 info_len = 1;		/* sta_info_len counts its own octet */
+
+	if (len < fixed)
+		return false;
+
+	control = le16_to_cpu(prof->control);
+
+	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT)
+		info_len += ETH_ALEN;	/* STA MAC address */
+	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT)
+		info_len += 2;		/* AP removal timer */
+	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT)
+		info_len += 2;		/* operation parameters */
+
+	return prof->sta_info_len >= info_len &&
+	       fixed + prof->sta_info_len - 1 <= len;
+}
+
+#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID			0x000f
+#define IEEE80211_EPCS_ENA_RESP_BODY_LEN                        3
+
+static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_ttlm_elem *t2l = (const void *)data;
+	u8 control, fixed = sizeof(*t2l), elem_len = 0;
+
+	if (len < fixed)
+		return false;
+
+	control = t2l->control;
+
+	if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)
+		elem_len += 2;		/* switch time */
+	if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)
+		elem_len += 3;		/* expected duration */
+
+	if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) {
+		u8 bm_size;		/* octets per link mapping bitmap */
+
+		elem_len += 1;	/* optional[0], read by hweight8() below */
+		if (len < fixed + elem_len)	/* guards optional[0] read */
+			return false;
+
+		if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
+			bm_size = 1;
+		else
+			bm_size = 2;
+
+		elem_len += hweight8(t2l->optional[0]) * bm_size;
+	}
+
+	return len >= fixed + elem_len;
+}
+
+/**
+ * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay
+ *	in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the EMLSR Padding delay (in microseconds) encoded in the
+ *	EML Capabilities field, or 0 for a zero or out-of-range encoding
+ */
+
+static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR
+	 * Padding Delay subfield.
+	 */
+	u32 pad_delay = u16_get_bits(eml_cap,
+				     IEEE80211_EML_CAP_EMLSR_PADDING_DELAY);
+
+	if (!pad_delay ||
+	    pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US)
+		return 0;
+
+	return 32 * (1 << (pad_delay - 1));	/* 32 us, doubled per step */
+}
+
+/**
+ * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition
+ *	delay in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the EMLSR Transition delay (in microseconds) encoded in the
+ *	EML Capabilities field, or 0 for a zero or out-of-range encoding
+ */
+
+static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR
+	 * Transition Delay subfield.
+	 */
+	u32 trans_delay =
+		u16_get_bits(eml_cap,
+			     IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY);
+
+	/* zero means no delay; values past the 256 us encoding also yield 0 */
+	if (!trans_delay ||
+	    trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US)
+		return 0;
+
+	return 16 * (1 << (trans_delay - 1));	/* 16 us, doubled per step */
+}
+
+/**
+ * ieee80211_eml_trans_timeout_in_us - Fetch the EML Transition
+ *	Timeout value in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the Transition Timeout (in microseconds) encoded in the
+ *	EML Capabilities field, or 0 for a zero or out-of-range encoding
+ */
+
+static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the
+	 * Transition Timeout subfield.
+	 */
+	u8 timeout = u16_get_bits(eml_cap,
+				  IEEE80211_EML_CAP_TRANSITION_TIMEOUT);
+
+	/* zero means no timeout; values past the 128 TU encoding yield 0 */
+	if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU)
+		return 0;
+
+	return 128 * (1 << (timeout - 1));	/* 128 us, doubled per step */
+}
+
+#define for_each_mle_subelement(_elem, _data, _len)			\
+	if (ieee80211_mle_size_ok(_data, _len))	/* else no iteration */	\
+		for_each_element(_elem,					\
+				 _data + ieee80211_mle_common_size(_data),\
+				 _len - ieee80211_mle_common_size(_data))
+
+#endif /* LINUX_IEEE80211_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a3dbbcee00ee..63a9775b059d 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1141,30 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc)
 	return len;
 }
 
-#define IEEE80211_TTLM_MAX_CNT				2
-#define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
-#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
-#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT	0x08
-#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT	0x10
-#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE		0x20
-
-#define IEEE80211_TTLM_DIRECTION_DOWN		0
-#define IEEE80211_TTLM_DIRECTION_UP		1
-#define IEEE80211_TTLM_DIRECTION_BOTH		2
-
-/**
- * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
- *
- * Defined in section 9.4.2.314 in P802.11be_D4
- *
- * @control: the first part of control field
- * @optional: the second part of control field
- */
-struct ieee80211_ttlm_elem {
-	u8 control;
-	u8 optional[];
-} __packed;
-
 /**
  * struct ieee80211_bss_load_elem - BSS Load elemen
  *
@@ -1591,144 +1567,6 @@ struct ieee80211_p2p_noa_attr {
 #define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
 #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
 
-#define IEEE80211_EHT_MCS_NSS_RX 0x0f
-#define IEEE80211_EHT_MCS_NSS_TX 0xf0
-
-/**
- * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max
- * supported NSS for per MCS.
- *
- * For each field below, bits 0 - 3 indicate the maximal number of spatial
- * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
- * for Tx.
- *
- * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 0 - 7.
- * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 8 - 9.
- * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 10 - 11.
- * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 12 - 13.
- * @rx_tx_max_nss: array of the previous fields for easier loop access
- */
-struct ieee80211_eht_mcs_nss_supp_20mhz_only {
-	union {
-		struct {
-			u8 rx_tx_mcs7_max_nss;
-			u8 rx_tx_mcs9_max_nss;
-			u8 rx_tx_mcs11_max_nss;
-			u8 rx_tx_mcs13_max_nss;
-		};
-		u8 rx_tx_max_nss[4];
-	};
-};
-
-/**
- * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except
- * 20MHz only stations).
- *
- * For each field below, bits 0 - 3 indicate the maximal number of spatial
- * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
- * for Tx.
- *
- * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 0 - 9.
- * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 10 - 11.
- * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
- *     supported for reception and the maximum number of spatial streams
- *     supported for transmission for MCS 12 - 13.
- * @rx_tx_max_nss: array of the previous fields for easier loop access
- */
-struct ieee80211_eht_mcs_nss_supp_bw {
-	union {
-		struct {
-			u8 rx_tx_mcs9_max_nss;
-			u8 rx_tx_mcs11_max_nss;
-			u8 rx_tx_mcs13_max_nss;
-		};
-		u8 rx_tx_max_nss[3];
-	};
-};
-
-/**
- * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data
- *
- * This structure is the "EHT Capabilities element" fixed fields as
- * described in P802.11be_D2.0 section 9.4.2.313.
- *
- * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP*
- * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP*
- */
-struct ieee80211_eht_cap_elem_fixed {
-	u8 mac_cap_info[2];
-	u8 phy_cap_info[9];
-} __packed;
-
-/**
- * struct ieee80211_eht_cap_elem - EHT capabilities element
- * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed
- * @optional: optional parts
- */
-struct ieee80211_eht_cap_elem {
-	struct ieee80211_eht_cap_elem_fixed fixed;
-
-	/*
-	 * Followed by:
-	 * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets.
-	 * EHT PPE Thresholds field: variable length.
-	 */
-	u8 optional[];
-} __packed;
-
-#define IEEE80211_EHT_OPER_INFO_PRESENT	                        0x01
-#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT	0x02
-#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION	                0x04
-#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT         0x08
-#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK      0x30
-#define IEEE80211_EHT_OPER_MCS15_DISABLE                        0x40
-
-/**
- * struct ieee80211_eht_operation - eht operation element
- *
- * This structure is the "EHT Operation Element" fields as
- * described in P802.11be_D2.0 section 9.4.2.311
- *
- * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_*
- * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in
- *     EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and
- *     receive.
- * @optional: optional parts
- */
-struct ieee80211_eht_operation {
-	u8 params;
-	struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss;
-	u8 optional[];
-} __packed;
-
-/**
- * struct ieee80211_eht_operation_info - eht operation information
- *
- * @control: EHT operation information control.
- * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz
- *     EHT BSS.
- * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS.
- * @optional: optional parts
- */
-struct ieee80211_eht_operation_info {
-	u8 control;
-	u8 ccfs0;
-	u8 ccfs1;
-	u8 optional[];
-} __packed;
-
 /* S1G Capabilities Information field */
 #define IEEE80211_S1G_CAPABILITY_LEN	15
 
@@ -1815,258 +1653,6 @@ struct ieee80211_eht_operation_info {
 #define S1G_2M_PRIMARY_LOCATION_LOWER	0
 #define S1G_2M_PRIMARY_LOCATION_UPPER	1
 
-/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */
-#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS			0x01
-#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL			0x02
-#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1		0x04
-#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2		0x08
-#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT			0x10
-#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC			0x20
-#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK		0xc0
-#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895	        0
-#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991	        1
-#define	IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454	        2
-
-#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK		0x01
-#define IEEE80211_EHT_MAC_CAP1_EHT_TRS				0x02
-#define IEEE80211_EHT_MAC_CAP1_TXOP_RET				0x04
-#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS				0x08
-#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK		0x30
-#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS		0x40
-
-/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */
-#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ			0x02
-#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ		0x04
-#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI		0x08
-#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO		0x10
-#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER			0x20
-#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE			0x40
-
-/* EHT beamformee number of spatial streams <= 80MHz is split */
-#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK		0x80
-#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK		0x03
-
-#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK	0x1c
-#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK	0xe0
-
-#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK		0x07
-#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK		0x38
-
-/* EHT number of sounding dimensions for 320MHz is split */
-#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK		0xc0
-#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK		0x01
-#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK		0x02
-#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK		0x04
-#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK		0x08
-#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK		0x10
-#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK			0x20
-#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK		0x40
-#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK			0x80
-
-#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO		0x01
-#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP			0x02
-#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP		0x04
-#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI	0x08
-#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK			0xf0
-
-#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK		0x01
-#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP		0x02
-#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP		0x04
-#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT		0x08
-#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK	0x30
-#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US	0
-#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US	1
-#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US	2
-#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US	3
-
-/* Maximum number of supported EHT LTF is split */
-#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK	0xc0
-#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF		0x40
-#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK	0x07
-
-#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ			0x08
-#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ		0x30
-#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ		0x40
-#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK			0x78
-#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP		0x80
-
-#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW	0x01
-#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ	0x02
-#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ	0x04
-#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ	0x08
-#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ		0x10
-#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ		0x20
-#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ		0x40
-#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT	0x80
-
-#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA	0x01
-#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA	0x02
-
-/*
- * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311
- */
-#define IEEE80211_EHT_OPER_CHAN_WIDTH		0x7
-#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ	0
-#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ	1
-#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ	2
-#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ	3
-#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ	4
-
-/* need HE definitions for EHT functions */
-#include "ieee80211-he.h"
-
-/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */
-static inline u8
-ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap,
-			   const struct ieee80211_eht_cap_elem_fixed *eht_cap,
-			   bool from_ap)
-{
-	u8 count = 0;
-
-	/* on 2.4 GHz, if it supports 40 MHz, the result is 3 */
-	if (he_cap->phy_cap_info[0] &
-	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
-		return 3;
-
-	/* on 2.4 GHz, these three bits are reserved, so should be 0 */
-	if (he_cap->phy_cap_info[0] &
-	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)
-		count += 3;
-
-	if (he_cap->phy_cap_info[0] &
-	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
-		count += 3;
-
-	if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)
-		count += 3;
-
-	if (count)
-		return count;
-
-	return from_ap ? 3 : 4;
-}
-
-/* 802.11be EHT PPE Thresholds */
-#define IEEE80211_EHT_PPE_THRES_NSS_POS			0
-#define IEEE80211_EHT_PPE_THRES_NSS_MASK		0xf
-#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK	0x1f0
-#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE		3
-#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE	9
-
-/*
- * Calculate 802.11be EHT capabilities IE EHT field size
- */
-static inline u8
-ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info)
-{
-	u32 n;
-
-	if (!(phy_cap_info[5] &
-	      IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT))
-		return 0;
-
-	n = hweight16(ppe_thres_hdr &
-		      IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK);
-	n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK);
-
-	/*
-	 * Each pair is 6 bits, and we need to add the 9 "header" bits to the
-	 * total size.
-	 */
-	n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 +
-	    IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE;
-	return DIV_ROUND_UP(n, 8);
-}
-
-static inline bool
-ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len,
-			   bool from_ap)
-{
-	const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data;
-	u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed);
-
-	if (len < needed || !he_capa)
-		return false;
-
-	needed += ieee80211_eht_mcs_nss_size((const void *)he_capa,
-					     (const void *)data,
-					     from_ap);
-	if (len < needed)
-		return false;
-
-	if (elem->phy_cap_info[5] &
-			IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) {
-		u16 ppe_thres_hdr;
-
-		if (len < needed + sizeof(ppe_thres_hdr))
-			return false;
-
-		ppe_thres_hdr = get_unaligned_le16(data + needed);
-		needed += ieee80211_eht_ppe_size(ppe_thres_hdr,
-						 elem->phy_cap_info);
-	}
-
-	return len >= needed;
-}
-
-static inline bool
-ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
-{
-	const struct ieee80211_eht_operation *elem = (const void *)data;
-	u8 needed = sizeof(*elem);
-
-	if (len < needed)
-		return false;
-
-	if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) {
-		needed += 3;
-
-		if (elem->params &
-		    IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)
-			needed += 2;
-	}
-
-	return len >= needed;
-}
-
-/* must validate ieee80211_eht_oper_size_ok() first */
-static inline u16
-ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper)
-{
-	const struct ieee80211_eht_operation_info *info =
-		(const void *)eht_oper->optional;
-
-	if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT))
-		return 0;
-
-	if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT))
-		return 0;
-
-	return get_unaligned_le16(info->optional);
-}
-
-#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
-
-struct ieee80211_bandwidth_indication {
-	u8 params;
-	struct ieee80211_eht_operation_info info;
-} __packed;
-
-static inline bool
-ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
-{
-	const struct ieee80211_bandwidth_indication *bwi = (const void *)data;
-
-	if (len < sizeof(*bwi))
-		return false;
-
-	if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT &&
-	    len < sizeof(*bwi) + 2)
-		return false;
-
-	return true;
-}
-
 #define LISTEN_INT_USF	GENMASK(15, 14)
 #define LISTEN_INT_UI	GENMASK(13, 0)
 
@@ -2587,23 +2173,6 @@ enum ieee80211_unprotected_wnm_actioncode {
 	WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
 };
 
-/* Protected EHT action codes */
-enum ieee80211_protected_eht_actioncode {
-	WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0,
-	WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1,
-	WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2,
-	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3,
-	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4,
-	WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5,
-	WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6,
-	WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7,
-	WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8,
-	WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9,
-	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10,
-	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11,
-	WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12,
-};
-
 /* Security key length */
 enum ieee80211_key_len {
 	WLAN_KEY_LEN_WEP40 = 5,
@@ -3855,737 +3424,6 @@ struct ieee80211_tbtt_info_ge_11 {
 	struct ieee80211_rnr_mld_params mld_params;
 } __packed;
 
-/* multi-link device */
-#define IEEE80211_MLD_MAX_NUM_LINKS	15
-
-#define IEEE80211_ML_CONTROL_TYPE			0x0007
-#define IEEE80211_ML_CONTROL_TYPE_BASIC			0
-#define IEEE80211_ML_CONTROL_TYPE_PREQ			1
-#define IEEE80211_ML_CONTROL_TYPE_RECONF		2
-#define IEEE80211_ML_CONTROL_TYPE_TDLS			3
-#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS		4
-#define IEEE80211_ML_CONTROL_PRESENCE_MASK		0xfff0
-
-struct ieee80211_multi_link_elem {
-	__le16 control;
-	u8 variable[];
-} __packed;
-
-#define IEEE80211_MLC_BASIC_PRES_LINK_ID		0x0010
-#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT	0x0020
-#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY		0x0040
-#define IEEE80211_MLC_BASIC_PRES_EML_CAPA		0x0080
-#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP		0x0100
-#define IEEE80211_MLC_BASIC_PRES_MLD_ID			0x0200
-#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP	0x0400
-
-#define IEEE80211_MED_SYNC_DELAY_DURATION		0x00ff
-#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH	0x0f00
-#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS	0xf000
-
-/*
- * Described in P802.11be_D3.0
- * dot11MSDTimerDuration should default to 5484 (i.e. 171.375)
- * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0)
- * dot11MSDTXOPMAX defaults to 1
- */
-#define IEEE80211_MED_SYNC_DELAY_DEFAULT		0x10ac
-
-#define IEEE80211_EML_CAP_EMLSR_SUPP			0x0001
-#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY		0x000e
-#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US		0
-#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US		1
-#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US		2
-#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US		3
-#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US		4
-#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY	0x0070
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US		0
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US		1
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US		2
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US		3
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US		4
-#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US		5
-#define IEEE80211_EML_CAP_EMLMR_SUPPORT			0x0080
-#define IEEE80211_EML_CAP_EMLMR_DELAY			0x0700
-#define  IEEE80211_EML_CAP_EMLMR_DELAY_0US			0
-#define  IEEE80211_EML_CAP_EMLMR_DELAY_32US			1
-#define  IEEE80211_EML_CAP_EMLMR_DELAY_64US			2
-#define  IEEE80211_EML_CAP_EMLMR_DELAY_128US			3
-#define  IEEE80211_EML_CAP_EMLMR_DELAY_256US			4
-#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT		0x7800
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0			0
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US		1
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US		2
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US		3
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU		4
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU		5
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU		6
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU		7
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU		8
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU		9
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU		10
-#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU		11
-
-#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS		0x000f
-#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT		0x0010
-#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP	0x0060
-#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP	0
-#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME	1
-#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED	2
-#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF	3
-#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND		0x0f80
-#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT		0x1000
-#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT	0x2000
-#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT	0x4000
-
-struct ieee80211_mle_basic_common_info {
-	u8 len;
-	u8 mld_mac_addr[ETH_ALEN];
-	u8 variable[];
-} __packed;
-
-#define IEEE80211_MLC_PREQ_PRES_MLD_ID			0x0010
-
-struct ieee80211_mle_preq_common_info {
-	u8 len;
-	u8 variable[];
-} __packed;
-
-#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR		0x0010
-#define IEEE80211_MLC_RECONF_PRES_EML_CAPA		0x0020
-#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP		0x0040
-#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP	0x0080
-
-/* no fixed fields in RECONF */
-
-struct ieee80211_mle_tdls_common_info {
-	u8 len;
-	u8 ap_mld_mac_addr[ETH_ALEN];
-} __packed;
-
-#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR	0x0010
-
-/* no fixed fields in PRIO_ACCESS */
-
-/**
- * ieee80211_mle_common_size - check multi-link element common size
- * @data: multi-link element, must already be checked for size using
- *	ieee80211_mle_size_ok()
- * Return: the size of the multi-link element's "common" subfield 
- */
-static inline u8 ieee80211_mle_common_size(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-
-	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
-	case IEEE80211_ML_CONTROL_TYPE_BASIC:
-	case IEEE80211_ML_CONTROL_TYPE_PREQ:
-	case IEEE80211_ML_CONTROL_TYPE_TDLS:
-	case IEEE80211_ML_CONTROL_TYPE_RECONF:
-	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
-		/*
-		 * The length is the first octet pointed by mle->variable so no
-		 * need to add anything
-		 */
-		break;
-	default:
-		WARN_ON(1);
-		return 0;
-	}
-
-	return sizeof(*mle) + mle->variable[0];
-}
-
-/**
- * ieee80211_mle_get_link_id - returns the link ID
- * @data: the basic multi link element
- * Return: the link ID, or -1 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline int ieee80211_mle_get_link_id(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/* common points now at the beginning of ieee80211_mle_basic_common_info */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID))
-		return -1;
-
-	return *common;
-}
-
-/**
- * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count
- * @data: pointer to the basic multi link element
- * Return: the BSS Parameter Change Count field value, or -1 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline int
-ieee80211_mle_get_bss_param_ch_cnt(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/* common points now at the beginning of ieee80211_mle_basic_common_info */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))
-		return -1;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-
-	return *common;
-}
-
-/**
- * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay
- * @data: pointer to the multi-link element
- * Return: the medium synchronization delay field value from the multi-link
- *	element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT)
- *	if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/* common points now at the beginning of ieee80211_mle_basic_common_info */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY))
-		return IEEE80211_MED_SYNC_DELAY_DEFAULT;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-		common += 1;
-
-	return get_unaligned_le16(common);
-}
-
-/**
- * ieee80211_mle_get_eml_cap - returns the EML capability
- * @data: pointer to the multi-link element
- * Return: the EML capability field value from the multi-link element,
- *	or 0 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline u16 ieee80211_mle_get_eml_cap(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/* common points now at the beginning of ieee80211_mle_basic_common_info */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA))
-		return 0;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
-		common += 2;
-
-	return get_unaligned_le16(common);
-}
-
-/**
- * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations.
- * @data: pointer to the multi-link element
- * Return: the MLD capabilities and operations field value from the multi-link
- *	element, or 0 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/*
-	 * common points now at the beginning of
-	 * ieee80211_mle_basic_common_info
-	 */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
-		return 0;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
-		common += 2;
-
-	return get_unaligned_le16(common);
-}
-
-/* Defined in Figure 9-1074t in P802.11be_D7.0 */
-#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE           0x0001
-#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK    0x001e
-#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE               0x0020
-#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK     0x0040
-#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP     0x0080
-
-/**
- * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities
- *	and operations.
- * @data: pointer to the multi-link element
- * Return: the extended MLD capabilities and operations field value from
- *	the multi-link element, or 0 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/*
-	 * common points now at the beginning of
-	 * ieee80211_mle_basic_common_info
-	 */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP))
-		return 0;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID)
-		common += 1;
-
-	return get_unaligned_le16(common);
-}
-
-/**
- * ieee80211_mle_get_mld_id - returns the MLD ID
- * @data: pointer to the multi-link element
- * Return: The MLD ID in the given multi-link element, or 0 if not present
- *
- * The element is assumed to be of the correct type (BASIC) and big enough,
- * this must be checked using ieee80211_mle_type_ok().
- */
-static inline u8 ieee80211_mle_get_mld_id(const u8 *data)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control = le16_to_cpu(mle->control);
-	const u8 *common = mle->variable;
-
-	/*
-	 * common points now at the beginning of
-	 * ieee80211_mle_basic_common_info
-	 */
-	common += sizeof(struct ieee80211_mle_basic_common_info);
-
-	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID))
-		return 0;
-
-	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-		common += 1;
-	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
-		common += 2;
-	if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
-		common += 2;
-
-	return *common;
-}
-
-/**
- * ieee80211_mle_size_ok - validate multi-link element size
- * @data: pointer to the element data
- * @len: length of the containing element
- * Return: whether or not the multi-link element size is OK
- */
-static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u8 fixed = sizeof(*mle);
-	u8 common = 0;
-	bool check_common_len = false;
-	u16 control;
-
-	if (!data || len < fixed)
-		return false;
-
-	control = le16_to_cpu(mle->control);
-
-	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
-	case IEEE80211_ML_CONTROL_TYPE_BASIC:
-		common += sizeof(struct ieee80211_mle_basic_common_info);
-		check_common_len = true;
-		if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
-			common += 1;
-		if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
-			common += 1;
-		if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
-			common += 2;
-		if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
-			common += 2;
-		if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
-			common += 2;
-		if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID)
-			common += 1;
-		if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)
-			common += 2;
-		break;
-	case IEEE80211_ML_CONTROL_TYPE_PREQ:
-		common += sizeof(struct ieee80211_mle_preq_common_info);
-		if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID)
-			common += 1;
-		check_common_len = true;
-		break;
-	case IEEE80211_ML_CONTROL_TYPE_RECONF:
-		if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
-			common += ETH_ALEN;
-		if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA)
-			common += 2;
-		if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP)
-			common += 2;
-		if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP)
-			common += 2;
-		break;
-	case IEEE80211_ML_CONTROL_TYPE_TDLS:
-		common += sizeof(struct ieee80211_mle_tdls_common_info);
-		check_common_len = true;
-		break;
-	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
-		common = ETH_ALEN + 1;
-		break;
-	default:
-		/* we don't know this type */
-		return true;
-	}
-
-	if (len < fixed + common)
-		return false;
-
-	if (!check_common_len)
-		return true;
-
-	/* if present, common length is the first octet there */
-	return mle->variable[0] >= common;
-}
-
-/**
- * ieee80211_mle_type_ok - validate multi-link element type and size
- * @data: pointer to the element data
- * @type: expected type of the element
- * @len: length of the containing element
- * Return: whether or not the multi-link element type matches and size is OK
- */
-static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len)
-{
-	const struct ieee80211_multi_link_elem *mle = (const void *)data;
-	u16 control;
-
-	if (!ieee80211_mle_size_ok(data, len))
-		return false;
-
-	control = le16_to_cpu(mle->control);
-
-	if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type)
-		return true;
-
-	return false;
-}
-
-enum ieee80211_mle_subelems {
-	IEEE80211_MLE_SUBELEM_PER_STA_PROFILE		= 0,
-	IEEE80211_MLE_SUBELEM_FRAGMENT		        = 254,
-};
-
-#define IEEE80211_MLE_STA_CONTROL_LINK_ID			0x000f
-#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE		0x0010
-#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT		0x0020
-#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT		0x0040
-#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT		0x0080
-#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT		0x0100
-#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT	0x0200
-#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE		0x0400
-#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT	0x0800
-
-struct ieee80211_mle_per_sta_profile {
-	__le16 control;
-	u8 sta_info_len;
-	u8 variable[];
-} __packed;
-
-/**
- * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta
- *	profile size
- * @data: pointer to the sub element data
- * @len: length of the containing sub element
- * Return: %true if the STA profile is large enough, %false otherwise
- */
-static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data,
-							size_t len)
-{
-	const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
-	u16 control;
-	u8 fixed = sizeof(*prof);
-	u8 info_len = 1;
-
-	if (len < fixed)
-		return false;
-
-	control = le16_to_cpu(prof->control);
-
-	if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
-		info_len += 6;
-	if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
-		info_len += 2;
-	if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
-		info_len += 8;
-	if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
-		info_len += 2;
-	if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
-	    control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
-		if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
-			info_len += 2;
-		else
-			info_len += 1;
-	}
-	if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)
-		info_len += 1;
-
-	return prof->sta_info_len >= info_len &&
-	       fixed + prof->sta_info_len - 1 <= len;
-}
-
-/**
- * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS
- *	parameter change count
- * @prof: the per-STA profile, having been checked with
- *	ieee80211_mle_basic_sta_prof_size_ok() for the correct length
- *
- * Return: The BSS parameter change count value if present, 0 otherwise.
- */
-static inline u8
-ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof)
-{
-	u16 control = le16_to_cpu(prof->control);
-	const u8 *pos = prof->variable;
-
-	if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT))
-		return 0;
-
-	if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
-		pos += 6;
-	if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
-		pos += 2;
-	if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
-		pos += 8;
-	if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
-		pos += 2;
-	if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
-	    control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
-		if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
-			pos += 2;
-		else
-			pos += 1;
-	}
-
-	return *pos;
-}
-
-#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID			0x000f
-#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE		0x0010
-#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT		0x0020
-#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT		0x0040
-#define	IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE                 0x0780
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM          0
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK        2
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK        3
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS     4
-#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT       0x0800
-
-/**
- * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link
- *	element sta profile size.
- * @data: pointer to the sub element data
- * @len: length of the containing sub element
- * Return: %true if the STA profile is large enough, %false otherwise
- */
-static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
-							 size_t len)
-{
-	const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
-	u16 control;
-	u8 fixed = sizeof(*prof);
-	u8 info_len = 1;
-
-	if (len < fixed)
-		return false;
-
-	control = le16_to_cpu(prof->control);
-
-	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT)
-		info_len += ETH_ALEN;
-	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT)
-		info_len += 2;
-	if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT)
-		info_len += 2;
-
-	return prof->sta_info_len >= info_len &&
-	       fixed + prof->sta_info_len - 1 <= len;
-}
-
-#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID			0x000f
-#define IEEE80211_EPCS_ENA_RESP_BODY_LEN                        3
-
-static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
-{
-	const struct ieee80211_ttlm_elem *t2l = (const void *)data;
-	u8 control, fixed = sizeof(*t2l), elem_len = 0;
-
-	if (len < fixed)
-		return false;
-
-	control = t2l->control;
-
-	if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)
-		elem_len += 2;
-	if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)
-		elem_len += 3;
-
-	if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) {
-		u8 bm_size;
-
-		elem_len += 1;
-		if (len < fixed + elem_len)
-			return false;
-
-		if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
-			bm_size = 1;
-		else
-			bm_size = 2;
-
-		elem_len += hweight8(t2l->optional[0]) * bm_size;
-	}
-
-	return len >= fixed + elem_len;
-}
-
-/**
- * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay
- *	in microseconds
- * @eml_cap: EML capabilities field value from common info field of
- *	the Multi-link element
- * Return: the EMLSR Padding delay (in microseconds) encoded in the
- *	EML Capabilities field
- */
-
-static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap)
-{
-	/* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR
-	 * Padding Delay subfield.
-	 */
-	u32 pad_delay = u16_get_bits(eml_cap,
-				     IEEE80211_EML_CAP_EMLSR_PADDING_DELAY);
-
-	if (!pad_delay ||
-	    pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US)
-		return 0;
-
-	return 32 * (1 << (pad_delay - 1));
-}
-
-/**
- * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition
- *	delay in microseconds
- * @eml_cap: EML capabilities field value from common info field of
- *	the Multi-link element
- * Return: the EMLSR Transition delay (in microseconds) encoded in the
- *	EML Capabilities field
- */
-
-static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap)
-{
-	/* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR
-	 * Transition Delay subfield.
-	 */
-	u32 trans_delay =
-		u16_get_bits(eml_cap,
-			     IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY);
-
-	/* invalid values also just use 0 */
-	if (!trans_delay ||
-	    trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US)
-		return 0;
-
-	return 16 * (1 << (trans_delay - 1));
-}
-
-/**
- * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition
- *	timeout value in microseconds
- * @eml_cap: EML capabilities field value from common info field of
- *	the Multi-link element
- * Return: the EMLSR Transition timeout (in microseconds) encoded in
- *	the EML Capabilities field
- */
-
-static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
-{
-	/* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the
-	 * Transition Timeout subfield.
-	 */
-	u8 timeout = u16_get_bits(eml_cap,
-				  IEEE80211_EML_CAP_TRANSITION_TIMEOUT);
-
-	/* invalid values also just use 0 */
-	if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU)
-		return 0;
-
-	return 128 * (1 << (timeout - 1));
-}
-
-#define for_each_mle_subelement(_elem, _data, _len)			\
-	if (ieee80211_mle_size_ok(_data, _len))				\
-		for_each_element(_elem,					\
-				 _data + ieee80211_mle_common_size(_data),\
-				 _len - ieee80211_mle_common_size(_data))
-
 /* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */
 #define NAN_OP_MODE_PHY_MODE_VHT	0x01
 #define NAN_OP_MODE_PHY_MODE_HE		0x10
@@ -4605,6 +3443,8 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
 
 #include "ieee80211-ht.h"
 #include "ieee80211-vht.h"
+#include "ieee80211-he.h"
+#include "ieee80211-eht.h"
 #include "ieee80211-mesh.h"
 
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From 00105d7600bfb171037783da5f26e2565c7d2106 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:54 +0100
Subject: wifi: ieee80211: split S1G definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting S1G definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.82c0bddee6e3.Ic6646615286dad240b42e31e9d428c7e4ea40ce0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-s1g.h | 575 +++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h     | 585 ++----------------------------------------
 2 files changed, 591 insertions(+), 569 deletions(-)
 create mode 100644 include/linux/ieee80211-s1g.h

(limited to 'include')

diff --git a/include/linux/ieee80211-s1g.h b/include/linux/ieee80211-s1g.h
new file mode 100644
index 000000000000..5b9ed2dcc00e
--- /dev/null
+++ b/include/linux/ieee80211-s1g.h
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IEEE 802.11 S1G definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_S1G_H
+#define LINUX_IEEE80211_S1G_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* bits unique to S1G beacon frame control */
+#define IEEE80211_S1G_BCN_NEXT_TBTT	0x100
+#define IEEE80211_S1G_BCN_CSSID		0x200
+#define IEEE80211_S1G_BCN_ANO		0x400
+
+/* see 802.11ah-2016 9.9 NDP CMAC frames */
+#define IEEE80211_S1G_1MHZ_NDP_BITS	25
+#define IEEE80211_S1G_1MHZ_NDP_BYTES	4
+#define IEEE80211_S1G_2MHZ_NDP_BITS	37
+#define IEEE80211_S1G_2MHZ_NDP_BYTES	5
+
+/**
+ * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT &&
+ * IEEE80211_STYPE_S1G_BEACON
+ * @fc: frame control bytes in little-endian byteorder
+ * Return: whether or not the frame is an S1G beacon
+ */
+static inline bool ieee80211_is_s1g_beacon(__le16 fc)
+{
+	return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
+				 IEEE80211_FCTL_STYPE)) ==
+	       cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON);
+}
+
+/**
+ * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT
+ * @fc: frame control bytes in little-endian byteorder
+ * Return: whether or not the frame contains the variable-length
+ *	next TBTT field
+ */
+static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc)
+{
+	return ieee80211_is_s1g_beacon(fc) &&
+		(fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT));
+}
+
+/**
+ * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO
+ * @fc: frame control bytes in little-endian byteorder
+ * Return: whether or not the frame contains the variable-length
+ *	ANO field
+ */
+static inline bool ieee80211_s1g_has_ano(__le16 fc)
+{
+	return ieee80211_is_s1g_beacon(fc) &&
+		(fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO));
+}
+
+/**
+ * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID
+ * @fc: frame control bytes in little-endian byteorder
+ * Return: whether or not the frame contains the variable-length
+ *	compressed SSID field
+ */
+static inline bool ieee80211_s1g_has_cssid(__le16 fc)
+{
+	return ieee80211_is_s1g_beacon(fc) &&
+		(fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID));
+}
+
+/**
+ * enum ieee80211_s1g_chanwidth - S1G channel widths
+ * These are defined in IEEE 802.11ah-2016 Table 10-20
+ * as BSS Channel Width
+ *
+ * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel
+ * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel
+ * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel
+ * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel
+ * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel
+ */
+enum ieee80211_s1g_chanwidth {
+	IEEE80211_S1G_CHANWIDTH_1MHZ = 0,
+	IEEE80211_S1G_CHANWIDTH_2MHZ = 1,
+	IEEE80211_S1G_CHANWIDTH_4MHZ = 3,
+	IEEE80211_S1G_CHANWIDTH_8MHZ = 7,
+	IEEE80211_S1G_CHANWIDTH_16MHZ = 15,
+};
+
+/**
+ * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths
+ *	described in IEEE80211-2024 Table 10-39.
+ *
+ * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel
+ * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel
+ */
+enum ieee80211_s1g_pri_chanwidth {
+	IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0,
+	IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1,
+};
+
+/**
+ * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element
+ * @compat_info: Compatibility Information
+ * @beacon_int: Beacon Interval
+ * @tsf_completion: TSF Completion
+ *
+ * This structure represents the payload of the "S1G Beacon
+ * Compatibility element" as described in IEEE Std 802.11-2020 section
+ * 9.4.2.196.
+ */
+struct ieee80211_s1g_bcn_compat_ie {
+	__le16 compat_info;
+	__le16 beacon_int;
+	__le32 tsf_completion;
+} __packed;
+
+/**
+ * struct ieee80211_s1g_oper_ie - S1G Operation element
+ * @ch_width: S1G Operation Information Channel Width
+ * @oper_class: S1G Operation Information Operating Class
+ * @primary_ch: S1G Operation Information Primary Channel Number
+ * @oper_ch: S1G Operation Information Channel Center Frequency
+ * @basic_mcs_nss: Basic S1G-MCS and NSS Set
+ *
+ * This structure represents the payload of the "S1G Operation
+ * element" as described in IEEE Std 802.11-2020 section 9.4.2.212.
+ */
+struct ieee80211_s1g_oper_ie {
+	u8 ch_width;
+	u8 oper_class;
+	u8 primary_ch;
+	u8 oper_ch;
+	__le16 basic_mcs_nss;
+} __packed;
+
+/**
+ * struct ieee80211_aid_response_ie - AID Response element
+ * @aid: AID/Group AID
+ * @switch_count: AID Switch Count
+ * @response_int: AID Response Interval
+ *
+ * This structure represents the payload of the "AID Response element"
+ * as described in IEEE Std 802.11-2020 section 9.4.2.194.
+ */
+struct ieee80211_aid_response_ie {
+	__le16 aid;
+	u8 switch_count;
+	__le16 response_int;
+} __packed;
+
+struct ieee80211_s1g_cap {
+	u8 capab_info[10];
+	u8 supp_mcs_nss[5];
+} __packed;
+
+/**
+ * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields
+ * @fc: frame control bytes in little-endian byteorder
+ * Return: total length in bytes of the optional fixed-length fields
+ *
+ * S1G beacons may contain up to three optional fixed-length fields that
+ * precede the variable-length elements. Whether these fields are present
+ * is indicated by flags in the frame control field.
+ *
+ * From IEEE 802.11-2024 section 9.3.4.3:
+ *  - Next TBTT field may be 0 or 3 bytes
+ *  - Short SSID field may be 0 or 4 bytes
+ *  - Access Network Options (ANO) field may be 0 or 1 byte
+ */
+static inline size_t
+ieee80211_s1g_optional_len(__le16 fc)
+{
+	size_t len = 0;
+
+	if (ieee80211_s1g_has_next_tbtt(fc))
+		len += 3;
+
+	if (ieee80211_s1g_has_cssid(fc))
+		len += 4;
+
+	if (ieee80211_s1g_has_ano(fc))
+		len += 1;
+
+	return len;
+}
+
+/* S1G Capabilities Information field */
+#define IEEE80211_S1G_CAPABILITY_LEN	15
+
+#define S1G_CAP0_S1G_LONG	BIT(0)
+#define S1G_CAP0_SGI_1MHZ	BIT(1)
+#define S1G_CAP0_SGI_2MHZ	BIT(2)
+#define S1G_CAP0_SGI_4MHZ	BIT(3)
+#define S1G_CAP0_SGI_8MHZ	BIT(4)
+#define S1G_CAP0_SGI_16MHZ	BIT(5)
+#define S1G_CAP0_SUPP_CH_WIDTH	GENMASK(7, 6)
+
+#define S1G_SUPP_CH_WIDTH_2	0
+#define S1G_SUPP_CH_WIDTH_4	1
+#define S1G_SUPP_CH_WIDTH_8	2
+#define S1G_SUPP_CH_WIDTH_16	3
+#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \
+						    cap[0])) << 1)
+
+#define S1G_CAP1_RX_LDPC	BIT(0)
+#define S1G_CAP1_TX_STBC	BIT(1)
+#define S1G_CAP1_RX_STBC	BIT(2)
+#define S1G_CAP1_SU_BFER	BIT(3)
+#define S1G_CAP1_SU_BFEE	BIT(4)
+#define S1G_CAP1_BFEE_STS	GENMASK(7, 5)
+
+#define S1G_CAP2_SOUNDING_DIMENSIONS	GENMASK(2, 0)
+#define S1G_CAP2_MU_BFER		BIT(3)
+#define S1G_CAP2_MU_BFEE		BIT(4)
+#define S1G_CAP2_PLUS_HTC_VHT		BIT(5)
+#define S1G_CAP2_TRAVELING_PILOT	GENMASK(7, 6)
+
+#define S1G_CAP3_RD_RESPONDER		BIT(0)
+#define S1G_CAP3_HT_DELAYED_BA		BIT(1)
+#define S1G_CAP3_MAX_MPDU_LEN		BIT(2)
+#define S1G_CAP3_MAX_AMPDU_LEN_EXP	GENMASK(4, 3)
+#define S1G_CAP3_MIN_MPDU_START		GENMASK(7, 5)
+
+#define S1G_CAP4_UPLINK_SYNC	BIT(0)
+#define S1G_CAP4_DYNAMIC_AID	BIT(1)
+#define S1G_CAP4_BAT		BIT(2)
+#define S1G_CAP4_TIME_ADE	BIT(3)
+#define S1G_CAP4_NON_TIM	BIT(4)
+#define S1G_CAP4_GROUP_AID	BIT(5)
+#define S1G_CAP4_STA_TYPE	GENMASK(7, 6)
+
+#define S1G_CAP5_CENT_AUTH_CONTROL	BIT(0)
+#define S1G_CAP5_DIST_AUTH_CONTROL	BIT(1)
+#define S1G_CAP5_AMSDU			BIT(2)
+#define S1G_CAP5_AMPDU			BIT(3)
+#define S1G_CAP5_ASYMMETRIC_BA		BIT(4)
+#define S1G_CAP5_FLOW_CONTROL		BIT(5)
+#define S1G_CAP5_SECTORIZED_BEAM	GENMASK(7, 6)
+
+#define S1G_CAP6_OBSS_MITIGATION	BIT(0)
+#define S1G_CAP6_FRAGMENT_BA		BIT(1)
+#define S1G_CAP6_NDP_PS_POLL		BIT(2)
+#define S1G_CAP6_RAW_OPERATION		BIT(3)
+#define S1G_CAP6_PAGE_SLICING		BIT(4)
+#define S1G_CAP6_TXOP_SHARING_IMP_ACK	BIT(5)
+#define S1G_CAP6_VHT_LINK_ADAPT		GENMASK(7, 6)
+
+#define S1G_CAP7_TACK_AS_PS_POLL		BIT(0)
+#define S1G_CAP7_DUP_1MHZ			BIT(1)
+#define S1G_CAP7_MCS_NEGOTIATION		BIT(2)
+#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE	BIT(3)
+#define S1G_CAP7_NDP_BFING_REPORT_POLL		BIT(4)
+#define S1G_CAP7_UNSOLICITED_DYN_AID		BIT(5)
+#define S1G_CAP7_SECTOR_TRAINING_OPERATION	BIT(6)
+#define S1G_CAP7_TEMP_PS_MODE_SWITCH		BIT(7)
+
+#define S1G_CAP8_TWT_GROUPING	BIT(0)
+#define S1G_CAP8_BDT		BIT(1)
+#define S1G_CAP8_COLOR		GENMASK(4, 2)
+#define S1G_CAP8_TWT_REQUEST	BIT(5)
+#define S1G_CAP8_TWT_RESPOND	BIT(6)
+#define S1G_CAP8_PV1_FRAME	BIT(7)
+
+#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0)
+
+#define S1G_OPER_CH_WIDTH_PRIMARY	BIT(0)
+#define S1G_OPER_CH_WIDTH_OPER		GENMASK(4, 1)
+#define S1G_OPER_CH_PRIMARY_LOCATION	BIT(5)
+
+#define S1G_2M_PRIMARY_LOCATION_LOWER	0
+#define S1G_2M_PRIMARY_LOCATION_UPPER	1
+
+#define LISTEN_INT_USF	GENMASK(15, 14)
+#define LISTEN_INT_UI	GENMASK(13, 0)
+
+#define IEEE80211_MAX_USF	FIELD_MAX(LISTEN_INT_USF)
+#define IEEE80211_MAX_UI	FIELD_MAX(LISTEN_INT_UI)
+
+/* S1G encoding types */
+#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK	0
+#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE	1
+#define IEEE80211_S1G_TIM_ENC_MODE_OLB		2
+
+enum ieee80211_s1g_actioncode {
+	WLAN_S1G_AID_SWITCH_REQUEST,
+	WLAN_S1G_AID_SWITCH_RESPONSE,
+	WLAN_S1G_SYNC_CONTROL,
+	WLAN_S1G_STA_INFO_ANNOUNCE,
+	WLAN_S1G_EDCA_PARAM_SET,
+	WLAN_S1G_EL_OPERATION,
+	WLAN_S1G_TWT_SETUP,
+	WLAN_S1G_TWT_TEARDOWN,
+	WLAN_S1G_SECT_GROUP_ID_LIST,
+	WLAN_S1G_SECT_ID_FEEDBACK,
+	WLAN_S1G_TWT_INFORMATION = 11,
+};
+
+/**
+ * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon
+ * @fc: frame control bytes in little-endian byteorder
+ * @variable: pointer to the beacon frame elements
+ * @variable_len: length of the frame elements
+ * Return: whether or not the frame is an S1G short beacon. As per
+ *	IEEE80211-2024 11.1.3.10.1, the S1G beacon compatibility element shall
+ *	always be present as the first element in beacon frames generated at a
+ *	TBTT (Target Beacon Transmission Time), so any frame not containing
+ *	this element must have been generated at a TSBTT (Target Short Beacon
+ *	Transmission Time) that is not a TBTT. Additionally, short beacons are
+ *	prohibited from containing the S1G beacon compatibility element as per
+ *	IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with
+ *	either no elements or the first element is not the beacon compatibility
+ *	element, we have a short beacon.
+ */
+static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable,
+						 size_t variable_len)
+{
+	if (!ieee80211_is_s1g_beacon(fc))
+		return false;
+
+	/*
+	 * If the frame does not contain at least 1 element (this is perfectly
+	 * valid in a short beacon) and is an S1G beacon, we have a short
+	 * beacon.
+	 */
+	if (variable_len < 2)
+		return true;
+
+	return variable[0] != WLAN_EID_S1G_BCN_COMPAT;
+}
+
+struct s1g_tim_aid {
+	u16 aid;
+	u8 target_blk; /* Target block index */
+	u8 target_subblk; /* Target subblock index */
+	u8 target_subblk_bit; /* Target subblock bit */
+};
+
+struct s1g_tim_enc_block {
+	u8 enc_mode;
+	bool inverse;
+	const u8 *ptr;
+	u8 len;
+
+	/*
+	 * For an OLB encoded block that spans multiple blocks, this
+	 * is the offset into the span described by that encoded block.
+	 */
+	u8 olb_blk_offset;
+};
+
+/*
+ * Helper routines to quickly extract the length of an encoded block. Validation
+ * is also performed to ensure the length extracted lies within the TIM.
+ */
+
+static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end)
+{
+	u8 blkmap;
+	u8 n_subblks;
+
+	if (ptr >= end)
+		return -EINVAL;
+
+	blkmap = *ptr;
+	n_subblks = hweight8(blkmap);
+
+	if (ptr + 1 + n_subblks > end)
+		return -EINVAL;
+
+	return 1 + n_subblks;
+}
+
+static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end)
+{
+	return (ptr + 1 > end) ? -EINVAL : 1;
+}
+
+static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end)
+{
+	if (ptr >= end)
+		return -EINVAL;
+
+	return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr;
+}
+
+/*
+ * Enumerate all encoded blocks until we find the encoded block that describes
+ * our target AID. OLB is a special case, as a single OLB encoded block can
+ * describe (span) multiple blocks.
+ */
+static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc,
+						  const struct s1g_tim_aid *aid,
+						  const u8 *ptr, const u8 *end)
+{
+	/* need at least block-control octet */
+	while (ptr + 1 <= end) {
+		u8 ctrl = *ptr++;
+		u8 mode = ctrl & 0x03;
+		bool contains, inverse = ctrl & BIT(2);
+		u8 span, blk_off = ctrl >> 3;
+		int len;
+
+		switch (mode) {
+		case IEEE80211_S1G_TIM_ENC_MODE_BLOCK:
+			len = ieee80211_s1g_len_bitmap(ptr, end);
+			contains = blk_off == aid->target_blk;
+			break;
+		case IEEE80211_S1G_TIM_ENC_MODE_SINGLE:
+			len = ieee80211_s1g_len_single(ptr, end);
+			contains = blk_off == aid->target_blk;
+			break;
+		case IEEE80211_S1G_TIM_ENC_MODE_OLB:
+			len = ieee80211_s1g_len_olb(ptr, end);
+			/*
+			 * An OLB encoded block can describe more than one
+			 * block, meaning an encoded OLB block can span more
+			 * than a single block.
+			 */
+			if (len > 0) {
+				/* Minus one for the length octet */
+				span = DIV_ROUND_UP(len - 1, 8);
+				/*
+				 * Check if our target block lies within the
+				 * block span described by this encoded block.
+				 */
+				contains = (aid->target_blk >= blk_off) &&
+					   (aid->target_blk < blk_off + span);
+			}
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+
+		if (len < 0)
+			return len;
+
+		if (contains) {
+			enc->enc_mode = mode;
+			enc->inverse = inverse;
+			enc->ptr = ptr;
+			enc->len = (u8)len;
+			enc->olb_blk_offset = blk_off;
+			return 0;
+		}
+
+		ptr += len;
+	}
+
+	return -ENOENT;
+}
+
+static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc,
+					      struct s1g_tim_aid *aid)
+{
+	const u8 *ptr = enc->ptr;
+	u8 blkmap = *ptr++;
+
+	/*
+	 * If our block bitmap does not contain a set bit that corresponds
+	 * to our AID, it could mean a variety of things depending on if
+	 * the encoding mode is inverted or not.
+	 *
+	 * 1. If inverted, it means the entire subblock is present and hence
+	 *    our AID has been set.
+	 * 2. If not inverted, it means our subblock is not present and hence
+	 *    it is all zero meaning our AID is not set.
+	 */
+	if (!(blkmap & BIT(aid->target_subblk)))
+		return enc->inverse;
+
+	/*
+	 * Increment ptr by the number of set subblocks that appear before our
+	 * target subblock. If our target subblock is 0, do nothing as ptr
+	 * already points to our target subblock.
+	 */
+	if (aid->target_subblk)
+		ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0));
+
+	return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse;
+}
+
+static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc,
+					      struct s1g_tim_aid *aid)
+{
+	/*
+	 * Single AID mode describes, as the name suggests, a single AID
+	 * within the block described by the encoded block. The octet
+	 * contains the 6 LSBs of the AID described in the block. The other
+	 * 2 bits are reserved. When inverted, every single AID described
+	 * by the current block has buffered traffic except for the AID
+	 * described in the single AID octet.
+	 */
+	return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse;
+}
+
+static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc,
+					   struct s1g_tim_aid *aid)
+{
+	const u8 *ptr = enc->ptr;
+	u8 blk_len = *ptr++;
+	/*
+	 * Given an OLB encoded block that describes multiple blocks,
+	 * calculate the offset into the span. Then calculate the
+	 * subblock location normally.
+	 */
+	u16 span_offset = aid->target_blk - enc->olb_blk_offset;
+	u16 subblk_idx = span_offset * 8 + aid->target_subblk;
+
+	if (subblk_idx >= blk_len)
+		return enc->inverse;
+
+	return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse;
+}
+
+/*
+ * An S1G PVB has 3 non-optional encoding types, each of which can be inverted.
+ * An S1G PVB is constructed with zero or more encoded block subfields. Each
+ * encoded block represents a single "block" of AIDs (64), and each encoded
+ * block can contain one of the 3 encoding types alongside a single bit for
+ * whether the bits should be inverted.
+ *
+ * As the standard makes no guarantee about the ordering of encoded blocks,
+ * we must parse every encoded block in the worst case scenario given an
+ * AID that lies within the last block.
+ */
+static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim,
+					   u8 tim_len, u16 aid)
+{
+	int err;
+	struct s1g_tim_aid target_aid;
+	struct s1g_tim_enc_block enc_blk;
+
+	if (tim_len < 3)
+		return false;
+
+	target_aid.aid = aid;
+	target_aid.target_blk = (aid >> 6) & 0x1f;
+	target_aid.target_subblk = (aid >> 3) & 0x7;
+	target_aid.target_subblk_bit = aid & 0x7;
+
+	/*
+	 * Find our AID's target encoded block and fill &enc_blk with the
+	 * encoded block's information. If no entry is found or an error
+	 * occurs, return false.
+	 */
+	err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid,
+					      tim->virtual_map,
+					      (const u8 *)tim + tim_len + 2);
+	if (err)
+		return false;
+
+	switch (enc_blk.enc_mode) {
+	case IEEE80211_S1G_TIM_ENC_MODE_BLOCK:
+		return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid);
+	case IEEE80211_S1G_TIM_ENC_MODE_SINGLE:
+		return ieee80211_s1g_parse_single(&enc_blk, &target_aid);
+	case IEEE80211_S1G_TIM_ENC_MODE_OLB:
+		return ieee80211_s1g_parse_olb(&enc_blk, &target_aid);
+	default:
+		return false;
+	}
+}
+
+#endif /* LINUX_IEEE80211_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 63a9775b059d..1b27bbac145b 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -109,17 +109,6 @@
 #define IEEE80211_STYPE_DMG_BEACON		0x0000
 #define IEEE80211_STYPE_S1G_BEACON		0x0010
 
-/* bits unique to S1G beacon */
-#define IEEE80211_S1G_BCN_NEXT_TBTT	0x100
-#define IEEE80211_S1G_BCN_CSSID		0x200
-#define IEEE80211_S1G_BCN_ANO		0x400
-
-/* see 802.11ah-2016 9.9 NDP CMAC frames */
-#define IEEE80211_S1G_1MHZ_NDP_BITS	25
-#define IEEE80211_S1G_1MHZ_NDP_BYTES	4
-#define IEEE80211_S1G_2MHZ_NDP_BITS	37
-#define IEEE80211_S1G_2MHZ_NDP_BYTES	5
-
 #define IEEE80211_NDP_FTYPE_CTS			0
 #define IEEE80211_NDP_FTYPE_CF_END		0
 #define IEEE80211_NDP_FTYPE_PS_POLL		1
@@ -221,11 +210,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 #define IEEE80211_MAX_TIM_LEN		251
 #define IEEE80211_MAX_MESH_PEERINGS	63
 
-/* S1G encoding types */
-#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK	0
-#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE	1
-#define IEEE80211_S1G_TIM_ENC_MODE_OLB		2
-
 /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section
    6.2.1.1.2.
 
@@ -604,55 +588,6 @@ static inline bool ieee80211_is_beacon(__le16 fc)
 	       cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON);
 }
 
-/**
- * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT &&
- * IEEE80211_STYPE_S1G_BEACON
- * @fc: frame control bytes in little-endian byteorder
- * Return: whether or not the frame is an S1G beacon
- */
-static inline bool ieee80211_is_s1g_beacon(__le16 fc)
-{
-	return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
-				 IEEE80211_FCTL_STYPE)) ==
-	       cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON);
-}
-
-/**
- * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT
- * @fc: frame control bytes in little-endian byteorder
- * Return: whether or not the frame contains the variable-length
- *	next TBTT field
- */
-static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc)
-{
-	return ieee80211_is_s1g_beacon(fc) &&
-		(fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT));
-}
-
-/**
- * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO
- * @fc: frame control bytes in little-endian byteorder
- * Return: whether or not the frame contains the variable-length
- *	ANO field
- */
-static inline bool ieee80211_s1g_has_ano(__le16 fc)
-{
-	return ieee80211_is_s1g_beacon(fc) &&
-		(fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO));
-}
-
-/**
- * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID
- * @fc: frame control bytes in little-endian byteorder
- * Return: whether or not the frame contains the variable-length
- *	compressed SSID field
- */
-static inline bool ieee80211_s1g_has_cssid(__le16 fc)
-{
-	return ieee80211_is_s1g_beacon(fc) &&
-		(fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID));
-}
-
 /**
  * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM
  * @fc: frame control bytes in little-endian byteorder
@@ -984,37 +919,6 @@ struct ieee80211_tim_ie {
 	};
 } __packed;
 
-/**
- * enum ieee80211_s1g_chanwidth - S1G channel widths
- * These are defined in IEEE802.11-2016ah Table 10-20
- * as BSS Channel Width
- *
- * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel
- * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel
- * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel
- * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel
- * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel
- */
-enum ieee80211_s1g_chanwidth {
-	IEEE80211_S1G_CHANWIDTH_1MHZ = 0,
-	IEEE80211_S1G_CHANWIDTH_2MHZ = 1,
-	IEEE80211_S1G_CHANWIDTH_4MHZ = 3,
-	IEEE80211_S1G_CHANWIDTH_8MHZ = 7,
-	IEEE80211_S1G_CHANWIDTH_16MHZ = 15,
-};
-
-/**
- * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths
- *	described in IEEE80211-2024 Table 10-39.
- *
- * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel
- * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel
- */
-enum ieee80211_s1g_pri_chanwidth {
-	IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0,
-	IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1,
-};
-
 #define WLAN_SA_QUERY_TR_ID_LEN 2
 #define WLAN_MEMBERSHIP_LEN 8
 #define WLAN_USER_POSITION_LEN 16
@@ -1042,61 +946,6 @@ struct ieee80211_addba_ext_ie {
 	u8 data;
 } __packed;
 
-/**
- * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element
- * @compat_info: Compatibility Information
- * @beacon_int: Beacon Interval
- * @tsf_completion: TSF Completion
- *
- * This structure represents the payload of the "S1G Beacon
- * Compatibility element" as described in IEEE Std 802.11-2020 section
- * 9.4.2.196.
- */
-struct ieee80211_s1g_bcn_compat_ie {
-	__le16 compat_info;
-	__le16 beacon_int;
-	__le32 tsf_completion;
-} __packed;
-
-/**
- * struct ieee80211_s1g_oper_ie - S1G Operation element
- * @ch_width: S1G Operation Information Channel Width
- * @oper_class: S1G Operation Information Operating Class
- * @primary_ch: S1G Operation Information Primary Channel Number
- * @oper_ch: S1G Operation Information  Channel Center Frequency
- * @basic_mcs_nss: Basic S1G-MCS and NSS Set
- *
- * This structure represents the payload of the "S1G Operation
- * element" as described in IEEE Std 802.11-2020 section 9.4.2.212.
- */
-struct ieee80211_s1g_oper_ie {
-	u8 ch_width;
-	u8 oper_class;
-	u8 primary_ch;
-	u8 oper_ch;
-	__le16 basic_mcs_nss;
-} __packed;
-
-/**
- * struct ieee80211_aid_response_ie - AID Response element
- * @aid: AID/Group AID
- * @switch_count: AID Switch Count
- * @response_int: AID Response Interval
- *
- * This structure represents the payload of the "AID Response element"
- * as described in IEEE Std 802.11-2020 section 9.4.2.194.
- */
-struct ieee80211_aid_response_ie {
-	__le16 aid;
-	u8 switch_count;
-	__le16 response_int;
-} __packed;
-
-struct ieee80211_s1g_cap {
-	u8 capab_info[10];
-	u8 supp_mcs_nss[5];
-} __packed;
-
 struct ieee80211_ext {
 	__le16 frame_control;
 	__le16 duration;
@@ -1110,37 +959,6 @@ struct ieee80211_ext {
 	} u;
 } __packed __aligned(2);
 
-/**
- * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields
- * @fc: frame control bytes in little-endian byteorder
- * Return: total length in bytes of the optional fixed-length fields
- *
- * S1G beacons may contain up to three optional fixed-length fields that
- * precede the variable-length elements. Whether these fields are present
- * is indicated by flags in the frame control field.
- *
- * From IEEE 802.11-2024 section 9.3.4.3:
- *  - Next TBTT field may be 0 or 3 bytes
- *  - Short SSID field may be 0 or 4 bytes
- *  - Access Network Options (ANO) field may be 0 or 1 byte
- */
-static inline size_t
-ieee80211_s1g_optional_len(__le16 fc)
-{
-	size_t len = 0;
-
-	if (ieee80211_s1g_has_next_tbtt(fc))
-		len += 3;
-
-	if (ieee80211_s1g_has_cssid(fc))
-		len += 4;
-
-	if (ieee80211_s1g_has_ano(fc))
-		len += 1;
-
-	return len;
-}
-
 /**
  * struct ieee80211_bss_load_elem - BSS Load elemen
  *
@@ -1567,98 +1385,6 @@ struct ieee80211_p2p_noa_attr {
 #define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
 #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
 
-/* S1G Capabilities Information field */
-#define IEEE80211_S1G_CAPABILITY_LEN	15
-
-#define S1G_CAP0_S1G_LONG	BIT(0)
-#define S1G_CAP0_SGI_1MHZ	BIT(1)
-#define S1G_CAP0_SGI_2MHZ	BIT(2)
-#define S1G_CAP0_SGI_4MHZ	BIT(3)
-#define S1G_CAP0_SGI_8MHZ	BIT(4)
-#define S1G_CAP0_SGI_16MHZ	BIT(5)
-#define S1G_CAP0_SUPP_CH_WIDTH	GENMASK(7, 6)
-
-#define S1G_SUPP_CH_WIDTH_2	0
-#define S1G_SUPP_CH_WIDTH_4	1
-#define S1G_SUPP_CH_WIDTH_8	2
-#define S1G_SUPP_CH_WIDTH_16	3
-#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \
-						    cap[0])) << 1)
-
-#define S1G_CAP1_RX_LDPC	BIT(0)
-#define S1G_CAP1_TX_STBC	BIT(1)
-#define S1G_CAP1_RX_STBC	BIT(2)
-#define S1G_CAP1_SU_BFER	BIT(3)
-#define S1G_CAP1_SU_BFEE	BIT(4)
-#define S1G_CAP1_BFEE_STS	GENMASK(7, 5)
-
-#define S1G_CAP2_SOUNDING_DIMENSIONS	GENMASK(2, 0)
-#define S1G_CAP2_MU_BFER		BIT(3)
-#define S1G_CAP2_MU_BFEE		BIT(4)
-#define S1G_CAP2_PLUS_HTC_VHT		BIT(5)
-#define S1G_CAP2_TRAVELING_PILOT	GENMASK(7, 6)
-
-#define S1G_CAP3_RD_RESPONDER		BIT(0)
-#define S1G_CAP3_HT_DELAYED_BA		BIT(1)
-#define S1G_CAP3_MAX_MPDU_LEN		BIT(2)
-#define S1G_CAP3_MAX_AMPDU_LEN_EXP	GENMASK(4, 3)
-#define S1G_CAP3_MIN_MPDU_START		GENMASK(7, 5)
-
-#define S1G_CAP4_UPLINK_SYNC	BIT(0)
-#define S1G_CAP4_DYNAMIC_AID	BIT(1)
-#define S1G_CAP4_BAT		BIT(2)
-#define S1G_CAP4_TIME_ADE	BIT(3)
-#define S1G_CAP4_NON_TIM	BIT(4)
-#define S1G_CAP4_GROUP_AID	BIT(5)
-#define S1G_CAP4_STA_TYPE	GENMASK(7, 6)
-
-#define S1G_CAP5_CENT_AUTH_CONTROL	BIT(0)
-#define S1G_CAP5_DIST_AUTH_CONTROL	BIT(1)
-#define S1G_CAP5_AMSDU			BIT(2)
-#define S1G_CAP5_AMPDU			BIT(3)
-#define S1G_CAP5_ASYMMETRIC_BA		BIT(4)
-#define S1G_CAP5_FLOW_CONTROL		BIT(5)
-#define S1G_CAP5_SECTORIZED_BEAM	GENMASK(7, 6)
-
-#define S1G_CAP6_OBSS_MITIGATION	BIT(0)
-#define S1G_CAP6_FRAGMENT_BA		BIT(1)
-#define S1G_CAP6_NDP_PS_POLL		BIT(2)
-#define S1G_CAP6_RAW_OPERATION		BIT(3)
-#define S1G_CAP6_PAGE_SLICING		BIT(4)
-#define S1G_CAP6_TXOP_SHARING_IMP_ACK	BIT(5)
-#define S1G_CAP6_VHT_LINK_ADAPT		GENMASK(7, 6)
-
-#define S1G_CAP7_TACK_AS_PS_POLL		BIT(0)
-#define S1G_CAP7_DUP_1MHZ			BIT(1)
-#define S1G_CAP7_MCS_NEGOTIATION		BIT(2)
-#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE	BIT(3)
-#define S1G_CAP7_NDP_BFING_REPORT_POLL		BIT(4)
-#define S1G_CAP7_UNSOLICITED_DYN_AID		BIT(5)
-#define S1G_CAP7_SECTOR_TRAINING_OPERATION	BIT(6)
-#define S1G_CAP7_TEMP_PS_MODE_SWITCH		BIT(7)
-
-#define S1G_CAP8_TWT_GROUPING	BIT(0)
-#define S1G_CAP8_BDT		BIT(1)
-#define S1G_CAP8_COLOR		GENMASK(4, 2)
-#define S1G_CAP8_TWT_REQUEST	BIT(5)
-#define S1G_CAP8_TWT_RESPOND	BIT(6)
-#define S1G_CAP8_PV1_FRAME	BIT(7)
-
-#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0)
-
-#define S1G_OPER_CH_WIDTH_PRIMARY	BIT(0)
-#define S1G_OPER_CH_WIDTH_OPER		GENMASK(4, 1)
-#define S1G_OPER_CH_PRIMARY_LOCATION	BIT(5)
-
-#define S1G_2M_PRIMARY_LOCATION_LOWER	0
-#define S1G_2M_PRIMARY_LOCATION_UPPER	1
-
-#define LISTEN_INT_USF	GENMASK(15, 14)
-#define LISTEN_INT_UI	GENMASK(13, 0)
-
-#define IEEE80211_MAX_USF	FIELD_MAX(LISTEN_INT_USF)
-#define IEEE80211_MAX_UI	FIELD_MAX(LISTEN_INT_UI)
-
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
@@ -2189,20 +1915,6 @@ enum ieee80211_key_len {
 	WLAN_KEY_LEN_BIP_GMAC_256 = 32,
 };
 
-enum ieee80211_s1g_actioncode {
-	WLAN_S1G_AID_SWITCH_REQUEST,
-	WLAN_S1G_AID_SWITCH_RESPONSE,
-	WLAN_S1G_SYNC_CONTROL,
-	WLAN_S1G_STA_INFO_ANNOUNCE,
-	WLAN_S1G_EDCA_PARAM_SET,
-	WLAN_S1G_EL_OPERATION,
-	WLAN_S1G_TWT_SETUP,
-	WLAN_S1G_TWT_TEARDOWN,
-	WLAN_S1G_SECT_GROUP_ID_LIST,
-	WLAN_S1G_SECT_ID_FEEDBACK,
-	WLAN_S1G_TWT_INFORMATION = 11,
-};
-
 /* Radio measurement action codes as defined in IEEE 802.11-2024 - Table 9-470 */
 enum ieee80211_radio_measurement_actioncode {
 	WLAN_RM_ACTION_RADIO_MEASUREMENT_REQUEST = 0,
@@ -2877,254 +2589,6 @@ static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 	return !!(tim->virtual_map[index] & mask);
 }
 
-struct s1g_tim_aid {
-	u16 aid;
-	u8 target_blk; /* Target block index */
-	u8 target_subblk; /* Target subblock index */
-	u8 target_subblk_bit; /* Target subblock bit */
-};
-
-struct s1g_tim_enc_block {
-	u8 enc_mode;
-	bool inverse;
-	const u8 *ptr;
-	u8 len;
-
-	/*
-	 * For an OLB encoded block that spans multiple blocks, this
-	 * is the offset into the span described by that encoded block.
-	 */
-	u8 olb_blk_offset;
-};
-
-/*
- * Helper routines to quickly extract the length of an encoded block. Validation
- * is also performed to ensure the length extracted lies within the TIM.
- */
-
-static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end)
-{
-	u8 blkmap;
-	u8 n_subblks;
-
-	if (ptr >= end)
-		return -EINVAL;
-
-	blkmap = *ptr;
-	n_subblks = hweight8(blkmap);
-
-	if (ptr + 1 + n_subblks > end)
-		return -EINVAL;
-
-	return 1 + n_subblks;
-}
-
-static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end)
-{
-	return (ptr + 1 > end) ? -EINVAL : 1;
-}
-
-static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end)
-{
-	if (ptr >= end)
-		return -EINVAL;
-
-	return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr;
-}
-
-/*
- * Enumerate all encoded blocks until we find the encoded block that describes
- * our target AID. OLB is a special case as a single encoded block can describe
- * multiple blocks as a single encoded block.
- */
-static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc,
-						  const struct s1g_tim_aid *aid,
-						  const u8 *ptr, const u8 *end)
-{
-	/* need at least block-control octet */
-	while (ptr + 1 <= end) {
-		u8 ctrl = *ptr++;
-		u8 mode = ctrl & 0x03;
-		bool contains, inverse = ctrl & BIT(2);
-		u8 span, blk_off = ctrl >> 3;
-		int len;
-
-		switch (mode) {
-		case IEEE80211_S1G_TIM_ENC_MODE_BLOCK:
-			len = ieee80211_s1g_len_bitmap(ptr, end);
-			contains = blk_off == aid->target_blk;
-			break;
-		case IEEE80211_S1G_TIM_ENC_MODE_SINGLE:
-			len = ieee80211_s1g_len_single(ptr, end);
-			contains = blk_off == aid->target_blk;
-			break;
-		case IEEE80211_S1G_TIM_ENC_MODE_OLB:
-			len = ieee80211_s1g_len_olb(ptr, end);
-			/*
-			 * An OLB encoded block can describe more then one
-			 * block, meaning an encoded OLB block can span more
-			 * then a single block.
-			 */
-			if (len > 0) {
-				/* Minus one for the length octet */
-				span = DIV_ROUND_UP(len - 1, 8);
-				/*
-				 * Check if our target block lies within the
-				 * block span described by this encoded block.
-				 */
-				contains = (aid->target_blk >= blk_off) &&
-					   (aid->target_blk < blk_off + span);
-			}
-			break;
-		default:
-			return -EOPNOTSUPP;
-		}
-
-		if (len < 0)
-			return len;
-
-		if (contains) {
-			enc->enc_mode = mode;
-			enc->inverse = inverse;
-			enc->ptr = ptr;
-			enc->len = (u8)len;
-			enc->olb_blk_offset = blk_off;
-			return 0;
-		}
-
-		ptr += len;
-	}
-
-	return -ENOENT;
-}
-
-static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc,
-					      struct s1g_tim_aid *aid)
-{
-	const u8 *ptr = enc->ptr;
-	u8 blkmap = *ptr++;
-
-	/*
-	 * If our block bitmap does not contain a set bit that corresponds
-	 * to our AID, it could mean a variety of things depending on if
-	 * the encoding mode is inverted or not.
-	 *
-	 * 1. If inverted, it means the entire subblock is present and hence
-	 *    our AID has been set.
-	 * 2. If not inverted, it means our subblock is not present and hence
-	 *    it is all zero meaning our AID is not set.
-	 */
-	if (!(blkmap & BIT(aid->target_subblk)))
-		return enc->inverse;
-
-	/*
-	 * Increment ptr by the number of set subblocks that appear before our
-	 * target subblock. If our target subblock is 0, do nothing as ptr
-	 * already points to our target subblock.
-	 */
-	if (aid->target_subblk)
-		ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0));
-
-	return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse;
-}
-
-static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc,
-					      struct s1g_tim_aid *aid)
-{
-	/*
-	 * Single AID mode describes, as the name suggests, a single AID
-	 * within the block described by the encoded block. The octet
-	 * contains the 6 LSBs of the AID described in the block. The other
-	 * 2 bits are reserved. When inversed, every single AID described
-	 * by the current block have buffered traffic except for the AID
-	 * described in the single AID octet.
-	 */
-	return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse;
-}
-
-static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc,
-					   struct s1g_tim_aid *aid)
-{
-	const u8 *ptr = enc->ptr;
-	u8 blk_len = *ptr++;
-	/*
-	 * Given an OLB encoded block that describes multiple blocks,
-	 * calculate the offset into the span. Then calculate the
-	 * subblock location normally.
-	 */
-	u16 span_offset = aid->target_blk - enc->olb_blk_offset;
-	u16 subblk_idx = span_offset * 8 + aid->target_subblk;
-
-	if (subblk_idx >= blk_len)
-		return enc->inverse;
-
-	return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse;
-}
-
-/*
- * An S1G PVB has 3 non optional encoding types, each that can be inverted.
- * An S1G PVB is constructed with zero or more encoded block subfields. Each
- * encoded block represents a single "block" of AIDs (64), and each encoded
- * block can contain one of the 3 encoding types alongside a single bit for
- * whether the bits should be inverted.
- *
- * As the standard makes no guarantee about the ordering of encoded blocks,
- * we must parse every encoded block in the worst case scenario given an
- * AID that lies within the last block.
- */
-static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim,
-					   u8 tim_len, u16 aid)
-{
-	int err;
-	struct s1g_tim_aid target_aid;
-	struct s1g_tim_enc_block enc_blk;
-
-	if (tim_len < 3)
-		return false;
-
-	target_aid.aid = aid;
-	target_aid.target_blk = (aid >> 6) & 0x1f;
-	target_aid.target_subblk = (aid >> 3) & 0x7;
-	target_aid.target_subblk_bit = aid & 0x7;
-
-	/*
-	 * Find our AIDs target encoded block and fill &enc_blk with the
-	 * encoded blocks information. If no entry is found or an error
-	 * occurs return false.
-	 */
-	err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid,
-					      tim->virtual_map,
-					      (const u8 *)tim + tim_len + 2);
-	if (err)
-		return false;
-
-	switch (enc_blk.enc_mode) {
-	case IEEE80211_S1G_TIM_ENC_MODE_BLOCK:
-		return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid);
-	case IEEE80211_S1G_TIM_ENC_MODE_SINGLE:
-		return ieee80211_s1g_parse_single(&enc_blk, &target_aid);
-	case IEEE80211_S1G_TIM_ENC_MODE_OLB:
-		return ieee80211_s1g_parse_olb(&enc_blk, &target_aid);
-	default:
-		return false;
-	}
-}
-
-/**
- * ieee80211_check_tim - check if AID bit is set in TIM
- * @tim: the TIM IE
- * @tim_len: length of the TIM IE
- * @aid: the AID to look for
- * @s1g: whether the TIM is from an S1G PPDU
- * Return: whether or not traffic is indicated in the TIM for the given AID
- */
-static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
-				       u8 tim_len, u16 aid, bool s1g)
-{
-	return s1g ? ieee80211_s1g_check_tim(tim, tim_len, aid) :
-		     __ieee80211_check_tim(tim, tim_len, aid);
-}
-
 /**
  * ieee80211_get_tdls_action - get TDLS action code
  * @skb: the skb containing the frame, length will not be checked
@@ -3258,39 +2722,6 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb)
 	return false;
 }
 
-/**
- * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon
- * @fc: frame control bytes in little-endian byteorder
- * @variable: pointer to the beacon frame elements
- * @variable_len: length of the frame elements
- * Return: whether or not the frame is an S1G short beacon. As per
- *	IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall
- *	always be present as the first element in beacon frames generated at a
- *	TBTT (Target Beacon Transmission Time), so any frame not containing
- *	this element must have been generated at a TSBTT (Target Short Beacon
- *	Transmission Time) that is not a TBTT. Additionally, short beacons are
- *	prohibited from containing the S1G beacon compatibility element as per
- *	IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with
- *	either no elements or the first element is not the beacon compatibility
- *	element, we have a short beacon.
- */
-static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable,
-						 size_t variable_len)
-{
-	if (!ieee80211_is_s1g_beacon(fc))
-		return false;
-
-	/*
-	 * If the frame does not contain at least 1 element (this is perfectly
-	 * valid in a short beacon) and is an S1G beacon, we have a short
-	 * beacon.
-	 */
-	if (variable_len < 2)
-		return true;
-
-	return variable[0] != WLAN_EID_S1G_BCN_COMPAT;
-}
-
 struct element {
 	u8 id;
 	u8 datalen;
@@ -3446,5 +2877,21 @@ struct ieee80211_tbtt_info_ge_11 {
 #include "ieee80211-he.h"
 #include "ieee80211-eht.h"
 #include "ieee80211-mesh.h"
+#include "ieee80211-s1g.h"
+
+/**
+ * ieee80211_check_tim - check if AID bit is set in TIM
+ * @tim: the TIM IE
+ * @tim_len: length of the TIM IE
+ * @aid: the AID to look for
+ * @s1g: whether the TIM is from an S1G PPDU
+ * Return: whether or not traffic is indicated in the TIM for the given AID
+ */
+static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
+				       u8 tim_len, u16 aid, bool s1g)
+{
+	return s1g ? ieee80211_s1g_check_tim(tim, tim_len, aid) :
+		     __ieee80211_check_tim(tim, tim_len, aid);
+}
 
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From fcd42b909ba06737dfcda47f3a0a9718bd3ebf03 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:55 +0100
Subject: wifi: ieee80211: split P2P definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting P2P definitions into a separate file. Note that
P2P isn't really even IEEE 802.11 but WFA.

Link: https://patch.msgid.link/20251105153843.e47b2614e9d2.Id242f61da720e365f6b5d7a4a545fbbc2f1e92b4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-p2p.h | 71 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h     | 53 +-------------------------------
 2 files changed, 72 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/ieee80211-p2p.h

(limited to 'include')

diff --git a/include/linux/ieee80211-p2p.h b/include/linux/ieee80211-p2p.h
new file mode 100644
index 000000000000..180891c11f08
--- /dev/null
+++ b/include/linux/ieee80211-p2p.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * WFA P2P definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_P2P_H
+#define LINUX_IEEE80211_P2P_H
+
+#include <linux/types.h>
+/*
+ * Peer-to-Peer IE attribute related definitions.
+ */
+/*
+ * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute.
+ */
+enum ieee80211_p2p_attr_id {
+	IEEE80211_P2P_ATTR_STATUS = 0,
+	IEEE80211_P2P_ATTR_MINOR_REASON,
+	IEEE80211_P2P_ATTR_CAPABILITY,
+	IEEE80211_P2P_ATTR_DEVICE_ID,
+	IEEE80211_P2P_ATTR_GO_INTENT,
+	IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT,
+	IEEE80211_P2P_ATTR_LISTEN_CHANNEL,
+	IEEE80211_P2P_ATTR_GROUP_BSSID,
+	IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING,
+	IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR,
+	IEEE80211_P2P_ATTR_MANAGABILITY,
+	IEEE80211_P2P_ATTR_CHANNEL_LIST,
+	IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
+	IEEE80211_P2P_ATTR_DEVICE_INFO,
+	IEEE80211_P2P_ATTR_GROUP_INFO,
+	IEEE80211_P2P_ATTR_GROUP_ID,
+	IEEE80211_P2P_ATTR_INTERFACE,
+	IEEE80211_P2P_ATTR_OPER_CHANNEL,
+	IEEE80211_P2P_ATTR_INVITE_FLAGS,
+	/* 19 - 220: Reserved */
+	IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221,
+
+	IEEE80211_P2P_ATTR_MAX
+};
+
+/* Notice of Absence attribute - described in P2P spec 4.1.14 */
+/* Typical max value used here */
+#define IEEE80211_P2P_NOA_DESC_MAX	4
+
+struct ieee80211_p2p_noa_desc {
+	u8 count;
+	__le32 duration;
+	__le32 interval;
+	__le32 start_time;
+} __packed;
+
+struct ieee80211_p2p_noa_attr {
+	u8 index;
+	u8 oppps_ctwindow;
+	struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX];
+} __packed;
+
+#define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
+#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
+
+#endif /* LINUX_IEEE80211_P2P_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 1b27bbac145b..fa0f7f917ce7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1333,58 +1333,6 @@ struct ieee80211_tdls_data {
 	} u;
 } __packed;
 
-/*
- * Peer-to-Peer IE attribute related definitions.
- */
-/*
- * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute.
- */
-enum ieee80211_p2p_attr_id {
-	IEEE80211_P2P_ATTR_STATUS = 0,
-	IEEE80211_P2P_ATTR_MINOR_REASON,
-	IEEE80211_P2P_ATTR_CAPABILITY,
-	IEEE80211_P2P_ATTR_DEVICE_ID,
-	IEEE80211_P2P_ATTR_GO_INTENT,
-	IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT,
-	IEEE80211_P2P_ATTR_LISTEN_CHANNEL,
-	IEEE80211_P2P_ATTR_GROUP_BSSID,
-	IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING,
-	IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR,
-	IEEE80211_P2P_ATTR_MANAGABILITY,
-	IEEE80211_P2P_ATTR_CHANNEL_LIST,
-	IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
-	IEEE80211_P2P_ATTR_DEVICE_INFO,
-	IEEE80211_P2P_ATTR_GROUP_INFO,
-	IEEE80211_P2P_ATTR_GROUP_ID,
-	IEEE80211_P2P_ATTR_INTERFACE,
-	IEEE80211_P2P_ATTR_OPER_CHANNEL,
-	IEEE80211_P2P_ATTR_INVITE_FLAGS,
-	/* 19 - 220: Reserved */
-	IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221,
-
-	IEEE80211_P2P_ATTR_MAX
-};
-
-/* Notice of Absence attribute - described in P2P spec 4.1.14 */
-/* Typical max value used here */
-#define IEEE80211_P2P_NOA_DESC_MAX	4
-
-struct ieee80211_p2p_noa_desc {
-	u8 count;
-	__le32 duration;
-	__le32 interval;
-	__le32 start_time;
-} __packed;
-
-struct ieee80211_p2p_noa_attr {
-	u8 index;
-	u8 oppps_ctwindow;
-	struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX];
-} __packed;
-
-#define IEEE80211_P2P_OPPPS_ENABLE_BIT		BIT(7)
-#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK	0x7F
-
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
@@ -2878,6 +2826,7 @@ struct ieee80211_tbtt_info_ge_11 {
 #include "ieee80211-eht.h"
 #include "ieee80211-mesh.h"
 #include "ieee80211-s1g.h"
+#include "ieee80211-p2p.h"
 
 /**
  * ieee80211_check_tim - check if AID bit is set in TIM
-- 
cgit v1.2.3


From 60a3734192fa6909c48e33b0d212990ebaff54c4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:36:56 +0100
Subject: wifi: ieee80211: split NAN definitions out

The ieee80211.h file has gotten very long, continue splitting
it by putting NAN definitions into a separate file. Note that
NAN isn't really even IEEE 802.11 but WFA.

Link: https://patch.msgid.link/20251105153843.8da0e796dda2.I7b2ce11220b70e8794019501eabbf8afbaf431a6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-nan.h | 35 +++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h     | 18 +-----------------
 2 files changed, 36 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/ieee80211-nan.h

(limited to 'include')

diff --git a/include/linux/ieee80211-nan.h b/include/linux/ieee80211-nan.h
new file mode 100644
index 000000000000..d07959bf8a90
--- /dev/null
+++ b/include/linux/ieee80211-nan.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * WFA NAN definitions
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_NAN_H
+#define LINUX_IEEE80211_NAN_H
+
+/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */
+#define NAN_OP_MODE_PHY_MODE_VHT	0x01
+#define NAN_OP_MODE_PHY_MODE_HE		0x10
+#define NAN_OP_MODE_PHY_MODE_MASK	0x11
+#define NAN_OP_MODE_80P80MHZ		0x02
+#define NAN_OP_MODE_160MHZ		0x04
+#define NAN_OP_MODE_PNDL_SUPPRTED	0x08
+
+/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification
+ * Table 79
+ */
+#define NAN_DEV_CAPA_DFS_OWNER			0x01
+#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED	0x02
+#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED	0x04
+#define NAN_DEV_CAPA_NDPE_SUPPORTED		0x08
+#define NAN_DEV_CAPA_S3_SUPPORTED		0x10
+
+#endif /* LINUX_IEEE80211_NAN_H */
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index fa0f7f917ce7..48ce05e1d203 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2803,23 +2803,6 @@ struct ieee80211_tbtt_info_ge_11 {
 	struct ieee80211_rnr_mld_params mld_params;
 } __packed;
 
-/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */
-#define NAN_OP_MODE_PHY_MODE_VHT	0x01
-#define NAN_OP_MODE_PHY_MODE_HE		0x10
-#define NAN_OP_MODE_PHY_MODE_MASK	0x11
-#define NAN_OP_MODE_80P80MHZ		0x02
-#define NAN_OP_MODE_160MHZ		0x04
-#define NAN_OP_MODE_PNDL_SUPPRTED	0x08
-
-/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification
- * Table 79
- */
-#define NAN_DEV_CAPA_DFS_OWNER			0x01
-#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED	0x02
-#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED	0x04
-#define NAN_DEV_CAPA_NDPE_SUPPORTED		0x08
-#define NAN_DEV_CAPA_S3_SUPPORTED		0x10
-
 #include "ieee80211-ht.h"
 #include "ieee80211-vht.h"
 #include "ieee80211-he.h"
@@ -2827,6 +2810,7 @@ struct ieee80211_tbtt_info_ge_11 {
 #include "ieee80211-mesh.h"
 #include "ieee80211-s1g.h"
 #include "ieee80211-p2p.h"
+#include "ieee80211-nan.h"
 
 /**
  * ieee80211_check_tim - check if AID bit is set in TIM
-- 
cgit v1.2.3


From 30b6089aad35500e683025dddc029ac28705385d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:39:57 +0100
Subject: wifi: cfg80211: fix EHT typo

This is clearly EHT, not ETH, fix the typo.

Link: https://patch.msgid.link/20251105153958.e9d4af3b768e.I5f3378326837e3f62928a2f1fd3403f29cea069b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f2e8963cfaac..84be0cdd1da0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -685,7 +685,7 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
 }
 
 /**
- * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype
+ * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype
  * @sband: the sband to search for the iftype on
  * @iftype: enum nl80211_iftype
  *
-- 
cgit v1.2.3


From 1a1cad924e8a60252132446fbba1284035010b4f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 15:39:58 +0100
Subject: wifi: mac80211: fix EHT typo

This is clearly EHT, not ETH, fix the typo.

Link: https://patch.msgid.link/20251105153958.12a04517f7ec.Idcf800817fa30605b1002c3d2287cad016e7aea7@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c326243e1f01..c2e49542626c 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7223,7 +7223,7 @@ ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband,
 }
 
 /**
- * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif
+ * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif
  * @sband: the sband to search for the iftype on
  * @vif: the vif to get the iftype from
  *
-- 
cgit v1.2.3


From 68eb1b791ac8da7c3d03967143f1417e2978bf5e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Nov 2025 16:08:10 +0100
Subject: wifi: mac80211: pass frame type to element parsing

This will be needed for UHR operation parsing. We already
pass whether or not the frame is an action frame; replace
that with the full frame type. Note this fixes a few cases
where 'false' was erroneously passed (mesh and TDLS) and
removes ieee802_11_parse_elems_crc() as it's unused.

Link: https://patch.msgid.link/20251105160810.a476d20a6e01.Ie659535f9357f2f9a3c73f8c059ccfc96bf93b54@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  1 +
 net/mac80211/agg-rx.c      |  7 +++++--
 net/mac80211/ibss.c        | 14 +++++++++-----
 net/mac80211/ieee80211_i.h | 21 ++++++---------------
 net/mac80211/mesh.c        | 26 ++++++++++++++++----------
 net/mac80211/mesh_hwmp.c   |  7 +++++--
 net/mac80211/mesh_plink.c  |  7 +++++--
 net/mac80211/mlme.c        | 45 ++++++++++++++++++++++++++++++++++-----------
 net/mac80211/parse.c       | 30 +++++++++++++++++++++++++-----
 net/mac80211/scan.c        |  6 +++++-
 net/mac80211/tdls.c        | 12 +++++++++---
 net/mac80211/tests/elems.c |  4 +++-
 12 files changed, 123 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 48ce05e1d203..6d4bc80caf96 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -43,6 +43,7 @@
 #define IEEE80211_FCTL_VERS		0x0003
 #define IEEE80211_FCTL_FTYPE		0x000c
 #define IEEE80211_FCTL_STYPE		0x00f0
+#define IEEE80211_FCTL_TYPE		(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)
 #define IEEE80211_FCTL_TODS		0x0100
 #define IEEE80211_FCTL_FROMDS		0x0200
 #define IEEE80211_FCTL_MOREFRAGS	0x0400
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index e38f46ffebfa..7da909d78c68 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -9,7 +9,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2010, Intel Corporation
  * Copyright(c) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  */
 
 /**
@@ -206,7 +206,10 @@ u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta,
 	if (elem_len <= 0)
 		return 0;
 
-	elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL);
+	elems = ieee802_11_parse_elems(elem_data, elem_len,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 
 	if (!elems || elems->parse_error || !elems->addba_ext_ie)
 		goto free;
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 6e36b09fe97f..168f84a1353b 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -9,7 +9,7 @@
  * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright(c) 2018-2024 Intel Corporation
+ * Copyright(c) 2018-2025 Intel Corporation
  */
 
 #include <linux/delay.h>
@@ -1554,6 +1554,7 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata,
 {
 	size_t baselen;
 	struct ieee802_11_elems *elems;
+	u16 type;
 
 	BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) !=
 		     offsetof(typeof(mgmt->u.beacon), variable));
@@ -1566,8 +1567,9 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata,
 	if (baselen > len)
 		return;
 
+	type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE;
 	elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable,
-				       len - baselen, false, NULL);
+				       len - baselen, type, NULL);
 
 	if (elems) {
 		ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems);
@@ -1616,9 +1618,11 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 			if (ies_len < 0)
 				break;
 
-			elems = ieee802_11_parse_elems(
-				mgmt->u.action.u.chan_switch.variable,
-				ies_len, true, NULL);
+			elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable,
+						       ies_len,
+						       IEEE80211_FTYPE_MGMT |
+						       IEEE80211_STYPE_ACTION,
+						       NULL);
 
 			if (elems && !elems->parse_error)
 				ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 898ccbc4ec64..7f1ce9fc01c7 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2422,7 +2422,8 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
  * @mode: connection mode for parsing
  * @start: pointer to the elements
  * @len: length of the elements
- * @action: %true if the elements came from an action frame
+ * @type: type of the frame the elements came from
+ *	(action, probe response, beacon, etc.)
  * @filter: bitmap of element IDs to filter out while calculating
  *	the element CRC
  * @crc: CRC starting value
@@ -2440,7 +2441,7 @@ struct ieee80211_elems_parse_params {
 	enum ieee80211_conn_mode mode;
 	const u8 *start;
 	size_t len;
-	bool action;
+	u8 type;
 	u64 filter;
 	u32 crc;
 	struct cfg80211_bss *bss;
@@ -2452,17 +2453,14 @@ struct ieee802_11_elems *
 ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params);
 
 static inline struct ieee802_11_elems *
-ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
-			   u64 filter, u32 crc,
-			   struct cfg80211_bss *bss)
+ieee802_11_parse_elems(const u8 *start, size_t len, u8 type,
+		       struct cfg80211_bss *bss)
 {
 	struct ieee80211_elems_parse_params params = {
 		.mode = IEEE80211_CONN_MODE_HIGHEST,
 		.start = start,
 		.len = len,
-		.action = action,
-		.filter = filter,
-		.crc = crc,
+		.type = type,
 		.bss = bss,
 		.link_id = -1,
 	};
@@ -2470,13 +2468,6 @@ ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 	return ieee802_11_parse_elems_full(&params);
 }
 
-static inline struct ieee802_11_elems *
-ieee802_11_parse_elems(const u8 *start, size_t len, bool action,
-		       struct cfg80211_bss *bss)
-{
-	return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss);
-}
-
 extern const int ieee802_1d_to_ac[8];
 
 static inline int ieee80211_ac_from_tid(int tid)
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index f37068a533f4..68901f1def0d 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2018 - 2024 Intel Corporation
+ * Copyright (C) 2018 - 2025 Intel Corporation
  * Authors:    Luis Carlos Cobo <luisca@cozybit.com>
  * 	       Javier Cardona <javier@cozybit.com>
  */
@@ -1410,7 +1410,10 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata,
 	if (baselen > len)
 		return;
 
-	elems = ieee802_11_parse_elems(pos, len - baselen, false, NULL);
+	elems = ieee802_11_parse_elems(pos, len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_PROBE_REQ,
+				       NULL);
 	if (!elems)
 		return;
 
@@ -1455,11 +1458,11 @@ free:
 }
 
 static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
-					u16 stype,
 					struct ieee80211_mgmt *mgmt,
 					size_t len,
 					struct ieee80211_rx_status *rx_status)
 {
+	u16 type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct ieee802_11_elems *elems;
@@ -1469,7 +1472,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 	enum nl80211_band band = rx_status->band;
 
 	/* ignore ProbeResp to foreign address */
-	if (stype == IEEE80211_STYPE_PROBE_RESP &&
+	if (type == (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP) &&
 	    !ether_addr_equal(mgmt->da, sdata->vif.addr))
 		return;
 
@@ -1478,8 +1481,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable,
-				       len - baselen,
-				       false, NULL);
+				       len - baselen, type, NULL);
 	if (!elems)
 		return;
 
@@ -1514,7 +1516,9 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (ifmsh->sync_ops)
-		ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, len,
+		ifmsh->sync_ops->rx_bcn_presp(sdata,
+					      type & IEEE80211_FCTL_STYPE,
+					      mgmt, len,
 					      elems->mesh_config, rx_status);
 free:
 	kfree(elems);
@@ -1622,7 +1626,10 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata,
 	pos = mgmt->u.action.u.chan_switch.variable;
 	baselen = offsetof(struct ieee80211_mgmt,
 			   u.action.u.chan_switch.variable);
-	elems = ieee802_11_parse_elems(pos, len - baselen, true, NULL);
+	elems = ieee802_11_parse_elems(pos, len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems)
 		return;
 
@@ -1699,8 +1706,7 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	switch (stype) {
 	case IEEE80211_STYPE_PROBE_RESP:
 	case IEEE80211_STYPE_BEACON:
-		ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len,
-					    rx_status);
+		ieee80211_mesh_rx_bcn_presp(sdata, mgmt, skb->len, rx_status);
 		break;
 	case IEEE80211_STYPE_PROBE_REQ:
 		ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 9101858525dd..a41b57bd11ff 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2019, 2021-2023 Intel Corporation
+ * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation
  * Author:     Luis Carlos Cobo <luisca@cozybit.com>
  */
 
@@ -951,7 +951,10 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
 
 	baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt;
 	elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable,
-				       len - baselen, false, NULL);
+				       len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems)
 		return;
 
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index cb45a5d2009d..04c931cd2063 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2008, 2009 open80211s Ltd.
- * Copyright (C) 2019, 2021-2024 Intel Corporation
+ * Copyright (C) 2019, 2021-2025 Intel Corporation
  * Author:     Luis Carlos Cobo <luisca@cozybit.com>
  */
 #include <linux/gfp.h>
@@ -1248,7 +1248,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
 		if (baselen > len)
 			return;
 	}
-	elems = ieee802_11_parse_elems(baseaddr, len - baselen, true, NULL);
+	elems = ieee802_11_parse_elems(baseaddr, len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (elems) {
 		mesh_process_plink_frame(sdata, mgmt, elems, rx_status);
 		kfree(elems);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2ee9eae89d05..804c3a95b7c6 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -999,6 +999,9 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata,
 		.from_ap = true,
 		.start = ies->data,
 		.len = ies->len,
+		.type = ies->from_beacon ?
+			IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON :
+			IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP,
 	};
 	struct ieee802_11_elems *elems;
 	struct ieee80211_supported_band *sband;
@@ -5177,7 +5180,9 @@ static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata)
 			continue;
 		}
 
-		elems = ieee802_11_parse_elems(ies->data, ies->len, false,
+		elems = ieee802_11_parse_elems(ies->data, ies->len,
+					       IEEE80211_FTYPE_MGMT |
+					       IEEE80211_STYPE_BEACON,
 					       NULL);
 		if (!elems) {
 			rcu_read_unlock();
@@ -5223,6 +5228,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 		.len = elem_len,
 		.link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id,
 		.from_ap = true,
+		.type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE,
 	};
 	bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ;
 	bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
@@ -6356,6 +6362,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 		.bss = NULL,
 		.link_id = -1,
 		.from_ap = true,
+		.type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE,
 	};
 	struct ieee802_11_elems *elems;
 	int ac;
@@ -7264,7 +7271,9 @@ ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata,
 						    (prof->sta_info_len - 1),
 						    len -
 						    (prof->sta_info_len - 1),
-						    false, NULL);
+						    IEEE80211_FTYPE_MGMT |
+						    IEEE80211_STYPE_BEACON,
+						    NULL);
 
 		/* memory allocation failed - let's hope that's transient */
 		if (!prof_elems)
@@ -7368,6 +7377,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 		.mode = link->u.mgd.conn.mode,
 		.link_id = -1,
 		.from_ap = true,
+		.type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE,
 	};
 
 	lockdep_assert_wiphy(local->hw.wiphy);
@@ -7970,7 +7980,10 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
 	ies_len  = len - offsetof(struct ieee80211_mgmt,
 				  u.action.u.ttlm_req.variable);
 	elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable,
-				       ies_len, true, NULL);
+				       ies_len,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems) {
 		ttlm_res = NEG_TTLM_RES_REJECT;
 		goto out;
@@ -8176,9 +8189,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 				break;
 
 			/* CSA IE cannot be overridden, no need for BSSID */
-			elems = ieee802_11_parse_elems(
-					mgmt->u.action.u.chan_switch.variable,
-					ies_len, true, NULL);
+			elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable,
+						       ies_len,
+						       IEEE80211_FTYPE_MGMT |
+						       IEEE80211_STYPE_ACTION,
+						       NULL);
 
 			if (elems && !elems->parse_error) {
 				enum ieee80211_csa_source src =
@@ -8205,9 +8220,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 			 * extended CSA IE can't be overridden, no need for
 			 * BSSID
 			 */
-			elems = ieee802_11_parse_elems(
-					mgmt->u.action.u.ext_chan_switch.variable,
-					ies_len, true, NULL);
+			elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable,
+						       ies_len,
+						       IEEE80211_FTYPE_MGMT |
+						       IEEE80211_STYPE_ACTION,
+						       NULL);
 
 			if (elems && !elems->parse_error) {
 				enum ieee80211_csa_source src;
@@ -10985,7 +11002,10 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata,
 		pos = scratch + sizeof(control);
 		len -= sizeof(control);
 
-		link_elems = ieee802_11_parse_elems(pos, len, false, NULL);
+		link_elems = ieee802_11_parse_elems(pos, len,
+						    IEEE80211_FTYPE_MGMT |
+						    IEEE80211_STYPE_ACTION,
+						    NULL);
 		if (!link_elems)
 			continue;
 
@@ -11036,7 +11056,10 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata,
 				 u.action.u.epcs.variable) -
 		IEEE80211_EPCS_ENA_RESP_BODY_LEN;
 
-	elems = ieee802_11_parse_elems(pos, ies_len, true, NULL);
+	elems = ieee802_11_parse_elems(pos, ies_len,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems)
 		return;
 
diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c
index c5e0f7f46004..bfc4ecb7a048 100644
--- a/net/mac80211/parse.c
+++ b/net/mac80211/parse.c
@@ -6,7 +6,7 @@
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * element parsing for mac80211
  */
@@ -286,6 +286,24 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 
 	bitmap_zero(seen_elems, 256);
 
+	switch (params->type) {
+	/* we don't need to parse assoc request, luckily (it's value 0) */
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ:
+	default:
+		WARN(1, "invalid frame type 0x%x for element parsing\n",
+		     params->type);
+		break;
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON:
+	case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION:
+	case IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON:
+		break;
+	}
+
 	for_each_element(elem, params->start, params->len) {
 		const struct element *subelem;
 		u8 elem_parse_failed;
@@ -566,7 +584,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 			if (params->mode < IEEE80211_CONN_MODE_VHT)
 				break;
 
-			if (!params->action) {
+			if (params->type != (IEEE80211_FTYPE_MGMT |
+					     IEEE80211_STYPE_ACTION)) {
 				elem_parse_failed =
 					IEEE80211_PARSE_ERR_UNEXPECTED_ELEM;
 				break;
@@ -582,7 +601,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 		case WLAN_EID_CHANNEL_SWITCH_WRAPPER:
 			if (params->mode < IEEE80211_CONN_MODE_VHT)
 				break;
-			if (params->action) {
+			if (params->type == (IEEE80211_FTYPE_MGMT |
+					     IEEE80211_STYPE_ACTION)) {
 				elem_parse_failed =
 					IEEE80211_PARSE_ERR_UNEXPECTED_ELEM;
 				break;
@@ -942,7 +962,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse,
 	sub->len = end - sub->start;
 
 	sub->mode = params->mode;
-	sub->action = params->action;
+	sub->type = params->type;
 	sub->from_ap = params->from_ap;
 	sub->link_id = -1;
 
@@ -1041,7 +1061,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
 		sub.start = elems_parse->scratch_pos;
 		sub.mode = params->mode;
 		sub.len = nontx_len;
-		sub.action = params->action;
+		sub.type = params->type;
 		sub.link_id = params->link_id;
 
 		/* consume the space used for non-transmitted profile */
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index bb9563f50e7b..5ef315ed3b0f 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -76,7 +76,11 @@ void ieee80211_inform_bss(struct wiphy *wiphy,
 	if (!update_data)
 		return;
 
-	elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL);
+	elems = ieee802_11_parse_elems(ies->data, ies->len,
+				       update_data->beacon ?
+					IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON :
+					IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP,
+				       NULL);
 	if (!elems)
 		return;
 
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index ba5fbacbeeda..dbbfe2d6842f 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -6,7 +6,7 @@
  * Copyright 2014, Intel Corporation
  * Copyright 2014  Intel Mobile Communications GmbH
  * Copyright 2015 - 2016 Intel Deutschland GmbH
- * Copyright (C) 2019, 2021-2024 Intel Corporation
+ * Copyright (C) 2019, 2021-2025 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -1783,7 +1783,10 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata,
 	}
 
 	elems = ieee802_11_parse_elems(tf->u.chan_switch_resp.variable,
-				       skb->len - baselen, false, NULL);
+				       skb->len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems) {
 		ret = -ENOMEM;
 		goto out;
@@ -1902,7 +1905,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
 	}
 
 	elems = ieee802_11_parse_elems(tf->u.chan_switch_req.variable,
-				       skb->len - baselen, false, NULL);
+				       skb->len - baselen,
+				       IEEE80211_FTYPE_MGMT |
+				       IEEE80211_STYPE_ACTION,
+				       NULL);
 	if (!elems)
 		return -ENOMEM;
 
diff --git a/net/mac80211/tests/elems.c b/net/mac80211/tests/elems.c
index a53c55a879a8..1039794a0183 100644
--- a/net/mac80211/tests/elems.c
+++ b/net/mac80211/tests/elems.c
@@ -2,7 +2,7 @@
 /*
  * KUnit tests for element parsing
  *
- * Copyright (C) 2023-2024 Intel Corporation
+ * Copyright (C) 2023-2025 Intel Corporation
  */
 #include <kunit/test.h>
 #include "../ieee80211_i.h"
@@ -15,6 +15,8 @@ static void mle_defrag(struct kunit *test)
 		.link_id = 12,
 		.from_ap = true,
 		.mode = IEEE80211_CONN_MODE_EHT,
+		/* type is not really relevant here */
+		.type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON,
 	};
 	struct ieee802_11_elems *parsed;
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From 473235677af46ecb167917887586646e9d70d9ff Mon Sep 17 00:00:00 2001
From: Chien Wong <m@xv97.com>
Date: Fri, 7 Nov 2025 22:23:32 +0800
Subject: wifi: cfg80211: fix doc of struct key_params

The seq in struct key_params is for many ciphers, including CCMP, GCMP,
CMAC, GMAC. In addition to get_key(), it is also used when setting keys.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251107142332.181308-1-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 84be0cdd1da0..d87c18e1b133 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -786,8 +786,7 @@ struct vif_params {
  * @key: key material
  * @key_len: length of key material
  * @cipher: cipher suite selector
- * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used
- *	with the get_key() callback, must be in little endian,
+ * @seq: sequence counter (IV/PN), must be in little endian,
  *	length given by @seq_len.
  * @seq_len: length of @seq.
  * @vlan_id: vlan_id for VLAN group key (if nonzero)
-- 
cgit v1.2.3


From 1de3d9e2cd3a3c6f503cd31ec1f552c9dd8cf8ca Mon Sep 17 00:00:00 2001
From: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Date: Thu, 6 Nov 2025 22:16:01 +0100
Subject: dt-bindings: clock: r8a779a0: Add ZG core clock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the core clock used by the GPU on the Renesas R-Car V3U
(R8A779A0) SoC.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251106211604.2766465-2-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r8a779a0-cpg-mssr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r8a779a0-cpg-mssr.h b/include/dt-bindings/clock/r8a779a0-cpg-mssr.h
index f1d737ca7ca1..124a6b8856df 100644
--- a/include/dt-bindings/clock/r8a779a0-cpg-mssr.h
+++ b/include/dt-bindings/clock/r8a779a0-cpg-mssr.h
@@ -51,5 +51,6 @@
 #define R8A779A0_CLK_CBFUSA		40
 #define R8A779A0_CLK_R			41
 #define R8A779A0_CLK_OSC		42
+#define R8A779A0_CLK_ZG			43
 
 #endif /* __DT_BINDINGS_CLOCK_R8A779A0_CPG_MSSR_H__ */
-- 
cgit v1.2.3


From aaa5abcc9d44d2c8484f779ab46d242d774cabcb Mon Sep 17 00:00:00 2001
From: Carl Worth <carl@os.amperecomputing.com>
Date: Thu, 25 Sep 2025 18:42:31 +0800
Subject: coresight: tmc: add the handle of the event to the path

The handle is essential for retrieving the AUX_EVENT of each CPU and is
required in perf mode. It has been added to the coresight_path so that
dependent devices can access it from the path when needed.

The existing bug can be reproduced with:
perf record -e cs_etm//k -C 0-9 dd if=/dev/zero of=/dev/null

Showing an oops as follows:
Unable to handle kernel paging request at virtual address 000f6e84934ed19e

Call trace:
 tmc_etr_get_buffer+0x30/0x80 [coresight_tmc] (P)
 catu_enable_hw+0xbc/0x3d0 [coresight_catu]
 catu_enable+0x70/0xe0 [coresight_catu]
 coresight_enable_path+0xb0/0x258 [coresight]

Fixes: 080ee83cc361 ("Coresight: Change functions to accept the coresight_path")
Signed-off-by: Carl Worth <carl@os.amperecomputing.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Co-developed-by: Jie Gan <jie.gan@oss.qualcomm.com>
Signed-off-by: Jie Gan <jie.gan@oss.qualcomm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-1-edd8a07c1646@oss.qualcomm.com
---
 drivers/hwtracing/coresight/coresight-etm-perf.c |  1 +
 drivers/hwtracing/coresight/coresight-tmc-etr.c  |  3 ++-
 include/linux/coresight.h                        | 10 ++++++----
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index f677c08233ba..5c256af6e54a 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -520,6 +520,7 @@ static void etm_event_start(struct perf_event *event, int flags)
 		goto out;
 
 	path = etm_event_cpu_path(event_data, cpu);
+	path->handle = handle;
 	/* We need a sink, no need to continue without one */
 	sink = coresight_get_sink(path);
 	if (WARN_ON_ONCE(!sink))
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 800be06598c1..60b0e0a6da05 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1334,7 +1334,8 @@ out:
 struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev,
 				   enum cs_mode mode, void *data)
 {
-	struct perf_output_handle *handle = data;
+	struct coresight_path *path = data;
+	struct perf_output_handle *handle = path->handle;
 	struct etr_perf_buffer *etr_perf;
 
 	switch (mode) {
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 6de59ce8ef8c..2626105e3719 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -332,12 +332,14 @@ static struct coresight_dev_list (var) = {				\
 
 /**
  * struct coresight_path - data needed by enable/disable path
- * @path_list:              path from source to sink.
- * @trace_id:          trace_id of the whole path.
+ * @path_list:		path from source to sink.
+ * @trace_id:		trace_id of the whole path.
+ * @handle:		handle of the aux_event.
  */
 struct coresight_path {
-	struct list_head	path_list;
-	u8			trace_id;
+	struct list_head		path_list;
+	u8				trace_id;
+	struct perf_output_handle	*handle;
 };
 
 enum cs_mode {
-- 
cgit v1.2.3


From 94baedb51dea4b0c97e3c9acd90953bec98d03e7 Mon Sep 17 00:00:00 2001
From: Jie Gan <jie.gan@oss.qualcomm.com>
Date: Thu, 25 Sep 2025 18:42:32 +0800
Subject: coresight: change helper_ops to accept coresight_path

Update the helper_enable and helper_disable functions to accept
coresight_path instead of a generic void *data, as coresight_path
encapsulates all the necessary data required by devices along the path.

Tested-by: Carl Worth <carl@os.amperecomputing.com>
Reviewed-by: Carl Worth <carl@os.amperecomputing.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Jie Gan <jie.gan@oss.qualcomm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-2-edd8a07c1646@oss.qualcomm.com
---
 drivers/hwtracing/coresight/coresight-catu.c      | 10 +++++-----
 drivers/hwtracing/coresight/coresight-core.c      | 20 ++++++++++++--------
 drivers/hwtracing/coresight/coresight-ctcu-core.c |  9 +++------
 drivers/hwtracing/coresight/coresight-cti-core.c  |  5 +++--
 drivers/hwtracing/coresight/coresight-cti.h       |  5 +++--
 drivers/hwtracing/coresight/coresight-tmc-etr.c   |  4 ++--
 drivers/hwtracing/coresight/coresight-tmc.h       |  3 ++-
 include/linux/coresight.h                         |  5 +++--
 8 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index a3ccb7034ae1..69b36bae97ab 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -397,7 +397,7 @@ static int catu_wait_for_ready(struct catu_drvdata *drvdata)
 }
 
 static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode,
-			  void *data)
+			  struct coresight_path *path)
 {
 	int rc;
 	u32 control, mode;
@@ -425,7 +425,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode,
 	etrdev = coresight_find_input_type(
 		csdev->pdata, CORESIGHT_DEV_TYPE_SINK, etr_subtype);
 	if (etrdev) {
-		etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, data);
+		etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, path);
 		if (IS_ERR(etr_buf))
 			return PTR_ERR(etr_buf);
 	}
@@ -455,7 +455,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode,
 }
 
 static int catu_enable(struct coresight_device *csdev, enum cs_mode mode,
-		       void *data)
+		       struct coresight_path *path)
 {
 	int rc = 0;
 	struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev);
@@ -463,7 +463,7 @@ static int catu_enable(struct coresight_device *csdev, enum cs_mode mode,
 	guard(raw_spinlock_irqsave)(&catu_drvdata->spinlock);
 	if (csdev->refcnt == 0) {
 		CS_UNLOCK(catu_drvdata->base);
-		rc = catu_enable_hw(catu_drvdata, mode, data);
+		rc = catu_enable_hw(catu_drvdata, mode, path);
 		CS_LOCK(catu_drvdata->base);
 	}
 	if (!rc)
@@ -488,7 +488,7 @@ static int catu_disable_hw(struct catu_drvdata *drvdata)
 	return rc;
 }
 
-static int catu_disable(struct coresight_device *csdev, void *__unused)
+static int catu_disable(struct coresight_device *csdev, struct coresight_path *path)
 {
 	int rc = 0;
 	struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev);
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 3267192f0c1c..f44ec9e5b692 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -355,17 +355,20 @@ static bool coresight_is_helper(struct coresight_device *csdev)
 }
 
 static int coresight_enable_helper(struct coresight_device *csdev,
-				   enum cs_mode mode, void *data)
+				   enum cs_mode mode,
+				   struct coresight_path *path)
 {
-	return helper_ops(csdev)->enable(csdev, mode, data);
+	return helper_ops(csdev)->enable(csdev, mode, path);
 }
 
-static void coresight_disable_helper(struct coresight_device *csdev, void *data)
+static void coresight_disable_helper(struct coresight_device *csdev,
+				     struct coresight_path *path)
 {
-	helper_ops(csdev)->disable(csdev, data);
+	helper_ops(csdev)->disable(csdev, path);
 }
 
-static void coresight_disable_helpers(struct coresight_device *csdev, void *data)
+static void coresight_disable_helpers(struct coresight_device *csdev,
+				      struct coresight_path *path)
 {
 	int i;
 	struct coresight_device *helper;
@@ -373,7 +376,7 @@ static void coresight_disable_helpers(struct coresight_device *csdev, void *data
 	for (i = 0; i < csdev->pdata->nr_outconns; ++i) {
 		helper = csdev->pdata->out_conns[i]->dest_dev;
 		if (helper && coresight_is_helper(helper))
-			coresight_disable_helper(helper, data);
+			coresight_disable_helper(helper, path);
 	}
 }
 
@@ -479,7 +482,8 @@ void coresight_disable_path(struct coresight_path *path)
 EXPORT_SYMBOL_GPL(coresight_disable_path);
 
 static int coresight_enable_helpers(struct coresight_device *csdev,
-				    enum cs_mode mode, void *data)
+				    enum cs_mode mode,
+				    struct coresight_path *path)
 {
 	int i, ret = 0;
 	struct coresight_device *helper;
@@ -489,7 +493,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev,
 		if (!helper || !coresight_is_helper(helper))
 			continue;
 
-		ret = coresight_enable_helper(helper, mode, data);
+		ret = coresight_enable_helper(helper, mode, path);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c
index c586495e9a08..abed15eb72b4 100644
--- a/drivers/hwtracing/coresight/coresight-ctcu-core.c
+++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c
@@ -156,17 +156,14 @@ static int ctcu_set_etr_traceid(struct coresight_device *csdev, struct coresight
 	return __ctcu_set_etr_traceid(csdev, traceid, port_num, enable);
 }
 
-static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode, void *data)
+static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode,
+		       struct coresight_path *path)
 {
-	struct coresight_path *path = (struct coresight_path *)data;
-
 	return ctcu_set_etr_traceid(csdev, path, true);
 }
 
-static int ctcu_disable(struct coresight_device *csdev, void *data)
+static int ctcu_disable(struct coresight_device *csdev, struct coresight_path *path)
 {
-	struct coresight_path *path = (struct coresight_path *)data;
-
 	return ctcu_set_etr_traceid(csdev, path, false);
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index 8fb30dd73fd2..bfbc365bb2ef 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -799,14 +799,15 @@ static void cti_pm_release(struct cti_drvdata *drvdata)
 }
 
 /** cti ect operations **/
-int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data)
+int cti_enable(struct coresight_device *csdev, enum cs_mode mode,
+	       struct coresight_path *path)
 {
 	struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev);
 
 	return cti_enable_hw(drvdata);
 }
 
-int cti_disable(struct coresight_device *csdev, void *data)
+int cti_disable(struct coresight_device *csdev, struct coresight_path *path)
 {
 	struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev);
 
diff --git a/drivers/hwtracing/coresight/coresight-cti.h b/drivers/hwtracing/coresight/coresight-cti.h
index 8362a47c939c..4f89091ee93f 100644
--- a/drivers/hwtracing/coresight/coresight-cti.h
+++ b/drivers/hwtracing/coresight/coresight-cti.h
@@ -216,8 +216,9 @@ int cti_add_connection_entry(struct device *dev, struct cti_drvdata *drvdata,
 			     const char *assoc_dev_name);
 struct cti_trig_con *cti_allocate_trig_con(struct device *dev, int in_sigs,
 					   int out_sigs);
-int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data);
-int cti_disable(struct coresight_device *csdev, void *data);
+int cti_enable(struct coresight_device *csdev, enum cs_mode mode,
+	       struct coresight_path *path);
+int cti_disable(struct coresight_device *csdev, struct coresight_path *path);
 void cti_write_all_hw_regs(struct cti_drvdata *drvdata);
 void cti_write_intack(struct device *dev, u32 ackval);
 void cti_write_single_reg(struct cti_drvdata *drvdata, int offset, u32 value);
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 60b0e0a6da05..51c6f73dd15c 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1332,9 +1332,9 @@ out:
 }
 
 struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev,
-				   enum cs_mode mode, void *data)
+				   enum cs_mode mode,
+				   struct coresight_path *path)
 {
-	struct coresight_path *path = data;
 	struct perf_output_handle *handle = path->handle;
 	struct etr_perf_buffer *etr_perf;
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h
index cbb4ba439158..95473d131032 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -442,7 +442,8 @@ struct coresight_device *tmc_etr_get_catu_device(struct tmc_drvdata *drvdata);
 void tmc_etr_set_catu_ops(const struct etr_buf_operations *catu);
 void tmc_etr_remove_catu_ops(void);
 struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev,
-				   enum cs_mode mode, void *data);
+				   enum cs_mode mode,
+				   struct coresight_path *path);
 extern const struct attribute_group coresight_etr_group;
 
 #endif
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 2626105e3719..2bee2e3bb1c6 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -424,8 +424,9 @@ struct coresight_ops_source {
  */
 struct coresight_ops_helper {
 	int (*enable)(struct coresight_device *csdev, enum cs_mode mode,
-		      void *data);
-	int (*disable)(struct coresight_device *csdev, void *data);
+		      struct coresight_path *path);
+	int (*disable)(struct coresight_device *csdev,
+		       struct coresight_path *path);
 };
 
 
-- 
cgit v1.2.3


From b139702a889692ec30702534ebb1ae2b11ed1cbf Mon Sep 17 00:00:00 2001
From: Jie Gan <jie.gan@oss.qualcomm.com>
Date: Thu, 25 Sep 2025 18:42:33 +0800
Subject: coresight: change the sink_ops to accept coresight_path

Update the sink_enable functions to accept coresight_path instead of
a generic void *data, as coresight_path encapsulates all the necessary
data required by devices along the path.

Tested-by: Carl Worth <carl@os.amperecomputing.com>
Reviewed-by: Carl Worth <carl@os.amperecomputing.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Jie Gan <jie.gan@oss.qualcomm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-3-edd8a07c1646@oss.qualcomm.com
---
 drivers/hwtracing/coresight/coresight-core.c     | 10 +++++-----
 drivers/hwtracing/coresight/coresight-dummy.c    |  2 +-
 drivers/hwtracing/coresight/coresight-etb10.c    |  8 ++++----
 drivers/hwtracing/coresight/coresight-etm-perf.c |  2 +-
 drivers/hwtracing/coresight/coresight-priv.h     |  3 +--
 drivers/hwtracing/coresight/coresight-sysfs.c    |  2 +-
 drivers/hwtracing/coresight/coresight-tmc-etf.c  | 10 ++++++----
 drivers/hwtracing/coresight/coresight-tmc-etr.c  | 10 ++++++----
 drivers/hwtracing/coresight/coresight-tpiu.c     |  2 +-
 drivers/hwtracing/coresight/coresight-trbe.c     |  4 ++--
 drivers/hwtracing/coresight/ultrasoc-smb.c       |  9 +++++----
 include/linux/coresight.h                        |  2 +-
 12 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index f44ec9e5b692..c660cf8adb1c 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -300,9 +300,10 @@ unlock:
 EXPORT_SYMBOL_GPL(coresight_add_helper);
 
 static int coresight_enable_sink(struct coresight_device *csdev,
-				 enum cs_mode mode, void *data)
+				 enum cs_mode mode,
+				 struct coresight_path *path)
 {
-	return sink_ops(csdev)->enable(csdev, mode, data);
+	return sink_ops(csdev)->enable(csdev, mode, path);
 }
 
 static void coresight_disable_sink(struct coresight_device *csdev)
@@ -501,8 +502,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev,
 	return 0;
 }
 
-int coresight_enable_path(struct coresight_path *path, enum cs_mode mode,
-			  void *sink_data)
+int coresight_enable_path(struct coresight_path *path, enum cs_mode mode)
 {
 	int ret = 0;
 	u32 type;
@@ -532,7 +532,7 @@ int coresight_enable_path(struct coresight_path *path, enum cs_mode mode,
 
 		switch (type) {
 		case CORESIGHT_DEV_TYPE_SINK:
-			ret = coresight_enable_sink(csdev, mode, sink_data);
+			ret = coresight_enable_sink(csdev, mode, path);
 			/*
 			 * Sink is the first component turned on. If we
 			 * failed to enable the sink, there are no components
diff --git a/drivers/hwtracing/coresight/coresight-dummy.c b/drivers/hwtracing/coresight/coresight-dummy.c
index aaa92b5081e3..14322c99e29d 100644
--- a/drivers/hwtracing/coresight/coresight-dummy.c
+++ b/drivers/hwtracing/coresight/coresight-dummy.c
@@ -52,7 +52,7 @@ static int dummy_source_trace_id(struct coresight_device *csdev, __maybe_unused
 }
 
 static int dummy_sink_enable(struct coresight_device *csdev, enum cs_mode mode,
-				void *data)
+			     struct coresight_path *path)
 {
 	dev_dbg(csdev->dev.parent, "Dummy sink enabled\n");
 
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 35db1b6093d1..6657602d8f2e 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -167,13 +167,13 @@ out:
 	return ret;
 }
 
-static int etb_enable_perf(struct coresight_device *csdev, void *data)
+static int etb_enable_perf(struct coresight_device *csdev, struct coresight_path *path)
 {
 	int ret = 0;
 	pid_t pid;
 	unsigned long flags;
 	struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-	struct perf_output_handle *handle = data;
+	struct perf_output_handle *handle = path->handle;
 	struct cs_buffers *buf = etm_perf_sink_config(handle);
 
 	raw_spin_lock_irqsave(&drvdata->spinlock, flags);
@@ -224,7 +224,7 @@ out:
 }
 
 static int etb_enable(struct coresight_device *csdev, enum cs_mode mode,
-		      void *data)
+		      struct coresight_path *path)
 {
 	int ret;
 
@@ -233,7 +233,7 @@ static int etb_enable(struct coresight_device *csdev, enum cs_mode mode,
 		ret = etb_enable_sysfs(csdev);
 		break;
 	case CS_MODE_PERF:
-		ret = etb_enable_perf(csdev, data);
+		ret = etb_enable_perf(csdev, path);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 5c256af6e54a..17afa0f4cdee 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -527,7 +527,7 @@ static void etm_event_start(struct perf_event *event, int flags)
 		goto fail_end_stop;
 
 	/* Nothing will happen without a path */
-	if (coresight_enable_path(path, CS_MODE_PERF, handle))
+	if (coresight_enable_path(path, CS_MODE_PERF))
 		goto fail_end_stop;
 
 	/* Finally enable the tracer */
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index 33e22b1ba043..fd896ac07942 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -135,8 +135,7 @@ static inline void CS_UNLOCK(void __iomem *addr)
 }
 
 void coresight_disable_path(struct coresight_path *path);
-int coresight_enable_path(struct coresight_path *path, enum cs_mode mode,
-			  void *sink_data);
+int coresight_enable_path(struct coresight_path *path, enum cs_mode mode);
 struct coresight_device *coresight_get_sink(struct coresight_path *path);
 struct coresight_device *coresight_get_sink_by_id(u32 id);
 struct coresight_device *
diff --git a/drivers/hwtracing/coresight/coresight-sysfs.c b/drivers/hwtracing/coresight/coresight-sysfs.c
index 5e52324aa9ac..d2a6ed8bcc74 100644
--- a/drivers/hwtracing/coresight/coresight-sysfs.c
+++ b/drivers/hwtracing/coresight/coresight-sysfs.c
@@ -215,7 +215,7 @@ int coresight_enable_sysfs(struct coresight_device *csdev)
 	if (!IS_VALID_CS_TRACE_ID(path->trace_id))
 		goto err_path;
 
-	ret = coresight_enable_path(path, CS_MODE_SYSFS, NULL);
+	ret = coresight_enable_path(path, CS_MODE_SYSFS);
 	if (ret)
 		goto err_path;
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 0f45ab5e5249..8882b1c4cdc0 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -246,13 +246,14 @@ out:
 	return ret;
 }
 
-static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data)
+static int tmc_enable_etf_sink_perf(struct coresight_device *csdev,
+				    struct coresight_path *path)
 {
 	int ret = 0;
 	pid_t pid;
 	unsigned long flags;
 	struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-	struct perf_output_handle *handle = data;
+	struct perf_output_handle *handle = path->handle;
 	struct cs_buffers *buf = etm_perf_sink_config(handle);
 
 	raw_spin_lock_irqsave(&drvdata->spinlock, flags);
@@ -304,7 +305,8 @@ static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data)
 }
 
 static int tmc_enable_etf_sink(struct coresight_device *csdev,
-			       enum cs_mode mode, void *data)
+			       enum cs_mode mode,
+			       struct coresight_path *path)
 {
 	int ret;
 
@@ -313,7 +315,7 @@ static int tmc_enable_etf_sink(struct coresight_device *csdev,
 		ret = tmc_enable_etf_sink_sysfs(csdev);
 		break;
 	case CS_MODE_PERF:
-		ret = tmc_enable_etf_sink_perf(csdev, data);
+		ret = tmc_enable_etf_sink_perf(csdev, path);
 		break;
 	/* We shouldn't be here */
 	default:
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 51c6f73dd15c..e0d83ee01b77 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1733,13 +1733,14 @@ out:
 	return size;
 }
 
-static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data)
+static int tmc_enable_etr_sink_perf(struct coresight_device *csdev,
+				    struct coresight_path *path)
 {
 	int rc = 0;
 	pid_t pid;
 	unsigned long flags;
 	struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-	struct perf_output_handle *handle = data;
+	struct perf_output_handle *handle = path->handle;
 	struct etr_perf_buffer *etr_perf = etm_perf_sink_config(handle);
 
 	raw_spin_lock_irqsave(&drvdata->spinlock, flags);
@@ -1787,13 +1788,14 @@ unlock_out:
 }
 
 static int tmc_enable_etr_sink(struct coresight_device *csdev,
-			       enum cs_mode mode, void *data)
+			       enum cs_mode mode,
+			       struct coresight_path *path)
 {
 	switch (mode) {
 	case CS_MODE_SYSFS:
 		return tmc_enable_etr_sink_sysfs(csdev);
 	case CS_MODE_PERF:
-		return tmc_enable_etr_sink_perf(csdev, data);
+		return tmc_enable_etr_sink_perf(csdev, path);
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index 9463afdbda8a..aaa44bc521c3 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -75,7 +75,7 @@ static void tpiu_enable_hw(struct csdev_access *csa)
 }
 
 static int tpiu_enable(struct coresight_device *csdev, enum cs_mode mode,
-		       void *__unused)
+		       struct coresight_path *path)
 {
 	struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
index 43643d2c5bdd..293715b4ff0e 100644
--- a/drivers/hwtracing/coresight/coresight-trbe.c
+++ b/drivers/hwtracing/coresight/coresight-trbe.c
@@ -1013,11 +1013,11 @@ err:
 }
 
 static int arm_trbe_enable(struct coresight_device *csdev, enum cs_mode mode,
-			   void *data)
+			   struct coresight_path *path)
 {
 	struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 	struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
-	struct perf_output_handle *handle = data;
+	struct perf_output_handle *handle = path->handle;
 	struct trbe_buf *buf = etm_perf_sink_config(handle);
 
 	WARN_ON(cpudata->cpu != smp_processor_id());
diff --git a/drivers/hwtracing/coresight/ultrasoc-smb.c b/drivers/hwtracing/coresight/ultrasoc-smb.c
index 26cfc939e5bd..8f7922a5e534 100644
--- a/drivers/hwtracing/coresight/ultrasoc-smb.c
+++ b/drivers/hwtracing/coresight/ultrasoc-smb.c
@@ -213,10 +213,11 @@ static void smb_enable_sysfs(struct coresight_device *csdev)
 	coresight_set_mode(csdev, CS_MODE_SYSFS);
 }
 
-static int smb_enable_perf(struct coresight_device *csdev, void *data)
+static int smb_enable_perf(struct coresight_device *csdev,
+			   struct coresight_path *path)
 {
 	struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent);
-	struct perf_output_handle *handle = data;
+	struct perf_output_handle *handle = path->handle;
 	struct cs_buffers *buf = etm_perf_sink_config(handle);
 	pid_t pid;
 
@@ -240,7 +241,7 @@ static int smb_enable_perf(struct coresight_device *csdev, void *data)
 }
 
 static int smb_enable(struct coresight_device *csdev, enum cs_mode mode,
-		      void *data)
+		      struct coresight_path *path)
 {
 	struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent);
 	int ret = 0;
@@ -261,7 +262,7 @@ static int smb_enable(struct coresight_device *csdev, enum cs_mode mode,
 		smb_enable_sysfs(csdev);
 		break;
 	case CS_MODE_PERF:
-		ret = smb_enable_perf(csdev, data);
+		ret = smb_enable_perf(csdev, path);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 2bee2e3bb1c6..56d0108658db 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -367,7 +367,7 @@ enum cs_mode {
  */
 struct coresight_ops_sink {
 	int (*enable)(struct coresight_device *csdev, enum cs_mode mode,
-		      void *data);
+		      struct coresight_path *path);
 	int (*disable)(struct coresight_device *csdev);
 	void *(*alloc_buffer)(struct coresight_device *csdev,
 			      struct perf_event *event, void **pages,
-- 
cgit v1.2.3


From 772ada50282b0c80343c8989147db816961f571d Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 10 Nov 2025 11:43:27 +0000
Subject: ASoC: cs35l56: Alter error codes for calibration routine

Adjust the error codes returned by the calibration debugfs files
to provide a wider range of errors and make them more consistent.

There is a limited number of standard errors and it's not always
easy to find an error code that accurately describes what happened.
Additionally, user code often uses strerror() or something similar
to report a generic error description. The original calibration
code used a limited set of errors to attempt to avoid user error
strings that would be confusing or unclear on a file read/write.
However, this restricts the ability to provide informative errors.

This limited error range didn't help very much with debugging so
it has been expanded, rather than worrying about what strerror()
would return.

The errors are now more consistent:

ENXIO       Calibration is not supported by the driver.
EOPNOTSUPP  The given calibration command is not supported.
EBUSY       Cannot calibrate because the amp is playing audio.
ERANGE      Calibration result was out-of-range.
ETIMEDOUT   Calibration did not complete.
EMSGSIZE    Blob written to cal_data is the wrong size.
ENODATA     No calibration data available to read from cal_data,
            or
            Blob written to cal_data does not contain calibration,
            or
            No calibration data available to save to UEFI.
EIO         General failure to communicate with the firmware, mainly
            indicating that firmware controls are missing.
EINVAL      Has its normal meaning that an invalid argument was passed.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251110114327.84370-1-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  3 +++
 sound/soc/codecs/cs35l56-shared.c | 29 ++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 82559be0f249..0a740a99ad31 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -265,6 +265,9 @@
 #define CS35L56_PS3_POLL_US				500
 #define CS35L56_PS3_TIMEOUT_US				300000
 
+#define CS35L56_CAL_STATUS_SUCCESS			1
+#define CS35L56_CAL_STATUS_OUT_OF_RANGE			3
+
 #define CS35L56_CONTROL_PORT_READY_US			2200
 #define CS35L56_HALO_STATE_POLL_US			1000
 #define CS35L56_HALO_STATE_TIMEOUT_US			250000
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index bbacac6bda81..fe0693c3f7de 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -1022,8 +1022,10 @@ static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
 		return ret;
 
 	ret = cs35l56_wait_for_ps3(cs35l56_base);
-	if (ret)
+	if (ret) {
+		ret = -EBUSY;
 		goto err_pm_put;
+	}
 
 	regmap_update_bits_check(cs35l56_base->regmap, CS35L56_MIXER_NGATE_CH1_CFG,
 				 CS35L56_AUX_NGATE_CHn_EN, 0, &ngate_ch1_was_enabled);
@@ -1038,7 +1040,7 @@ static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
 		if (!ctl) {
 			dev_err(cs35l56_base->dev, "Could not get %s control\n",
 				calibration_controls->status);
-			ret = -ENXIO;
+			ret = -EIO;
 			goto err;
 		}
 
@@ -1050,12 +1052,15 @@ static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
 					      0, &val, sizeof(val));
 		if (ret < 0) {
 			dev_err(cs35l56_base->dev, "Could not write %s: %d\n", "CALI_NORM_EN", ret);
+			ret = -EIO;
 			goto err;
 		}
 
 		ret = cs35l56_mbox_send(cs35l56_base, CS35L56_MBOX_CMD_AUDIO_CALIBRATION);
-		if (ret)
+		if (ret) {
+			ret = -EIO;
 			goto err;
+		}
 
 		if (read_poll_timeout(cs_dsp_coeff_read_ctrl, ret,
 				      (val == cpu_to_be32(1)),
@@ -1065,16 +1070,24 @@ static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
 				      ctl, 0, &val, sizeof(val))) {
 			dev_err(cs35l56_base->dev, "Calibration timed out (CAL_STATUS: %u)\n",
 				be32_to_cpu(val));
-			ret = -ETIMEDOUT;
-			goto err;
+			switch (be32_to_cpu(val)) {
+			case CS35L56_CAL_STATUS_OUT_OF_RANGE:
+				ret = -ERANGE;
+				goto err;
+			default:
+				ret = -ETIMEDOUT;
+				goto err;
+			}
 		}
 	}
 
 	cs35l56_base->cal_data_valid = false;
 	memset(&cal_data, 0, sizeof(cal_data));
 	ret = cs_amp_read_cal_coeffs(dsp, calibration_controls, &cal_data);
-	if (ret)
+	if (ret) {
+		ret = -EIO;
 		goto err;
+	}
 
 	dev_info(cs35l56_base->dev, "Cal status:%d calR:%d ambient:%d\n",
 		 cal_data.calStatus, cal_data.calR, cal_data.calAmbient);
@@ -1141,7 +1154,7 @@ ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
 			return ret;
 		break;
 	default:
-		return -ENXIO;
+		return -EOPNOTSUPP;
 	}
 
 	return count;
@@ -1170,6 +1183,8 @@ ssize_t cs35l56_cal_ambient_debugfs_write(struct cs35l56_base *cs35l56_base,
 		goto out;
 
 	ret = cs_amp_write_ambient_temp(cs35l56_base->dsp, cs35l56_base->calibration_controls, val);
+	if (ret)
+		ret = -EIO;
 out:
 	pm_runtime_put(cs35l56_base->dev);
 
-- 
cgit v1.2.3


From 22ea7b9d96e26147b7a3ea1be7aa106cc700907c Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:11 +0100
Subject: platform/x86: asus-wmi: export symbols used for read/write WMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export symbols for reading/writing WMI symbols using a namespace.
Existing functions:
- asus_wmi_evaluate_method
- asus_wmi_set_devstate
New function:
- asus_wmi_get_devstate_dsts

The new function is intended for use with the DSTS WMI method only and
avoids requiring the asus_wmi driver data to select the WMI method.

Co-developed-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://patch.msgid.link/20251102215319.3126879-2-denis.benato@linux.dev
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-wmi.c            | 46 ++++++++++++++++++++++++++++--
 include/linux/platform_data/x86/asus-wmi.h |  5 ++++
 2 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index e72a2b5d158e..c3e90517ce0f 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -390,7 +390,7 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval)
 {
 	return asus_wmi_evaluate_method3(method_id, arg0, arg1, 0, retval);
 }
-EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method);
+EXPORT_SYMBOL_NS_GPL(asus_wmi_evaluate_method, "ASUS_WMI");
 
 static int asus_wmi_evaluate_method5(u32 method_id,
 		u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 *retval)
@@ -554,12 +554,52 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval)
 	return 0;
 }
 
-int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param,
-				 u32 *retval)
+/**
+ * asus_wmi_get_devstate_dsts() - Get the WMI function state.
+ * @dev_id: The WMI method ID to call.
+ * @retval: A pointer to where to store the value returned from WMI.
+ *
+ * Returns:
+ * * %-ENODEV	- method ID is unsupported.
+ * * %0			- successful and retval is filled.
+ * * %other		- error from WMI call.
+ */
+int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval)
+{
+	int err;
+
+	err = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, retval);
+	if (err)
+		return err;
+
+	if ((*retval & ASUS_WMI_DSTS_PRESENCE_BIT) == 0x00)
+		return -ENODEV;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(asus_wmi_get_devstate_dsts, "ASUS_WMI");
+
+/**
+ * asus_wmi_set_devstate() - Set the WMI function state.
+ *
+ * Note: an asus_wmi_set_devstate() call must be paired with an
+ * asus_wmi_get_devstate_dsts() to check if the WMI function is supported.
+ *
+ * @dev_id: The WMI function to call.
+ * @ctrl_param: The argument to be used for this WMI function.
+ * @retval: A pointer to where to store the value returned from WMI.
+ *
+ * Returns:
+ * * %-ENODEV	- method ID is unsupported.
+ * * %0			- successful and retval is filled.
+ * * %other		- error from WMI call.
+ */
+int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval)
 {
 	return asus_wmi_evaluate_method(ASUS_WMI_METHODID_DEVS, dev_id,
 					ctrl_param, retval);
 }
+EXPORT_SYMBOL_NS_GPL(asus_wmi_set_devstate, "ASUS_WMI");
 
 /* Helper for special devices with magic return codes */
 static int asus_wmi_get_devstate_bits(struct asus_wmi *asus,
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 8a515179113d..dbd44d9fbb6f 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -166,6 +166,7 @@ enum asus_ally_mcu_hack {
 #if IS_REACHABLE(CONFIG_ASUS_WMI)
 void set_ally_mcu_hack(enum asus_ally_mcu_hack status);
 void set_ally_mcu_powersave(bool enabled);
+int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval);
 int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval);
 int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval);
 #else
@@ -179,6 +180,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval)
 {
 	return -ENODEV;
 }
+static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval)
+{
+	return -ENODEV;
+}
 static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1,
 					   u32 *retval)
 {
-- 
cgit v1.2.3


From 0d9a7085ec24983b5b8a4971c0eb6c756c75f1bf Mon Sep 17 00:00:00 2001
From: Bean Huo <beanhuo@micron.com>
Date: Sat, 8 Nov 2025 00:05:16 +0100
Subject: scsi: ufs: core: Convert string descriptor format macros to enum

Convert SD_ASCII_STD and SD_RAW from boolean macros to enum values for
improved code readability. This makes ufshcd_read_string_desc() calls
self-documenting by using explicit enum values instead of true/false.

Move the ufshcd_read_string_desc() declaration from include/ufs/ufshcd.h
to drivers/ufs/core/ufshcd-priv.h since this function is not exported.

Co-developed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Suggested-by: Avri Altman <Avri.Altman@sandisk.com>
Signed-off-by: Bean Huo <beanhuo@micron.com>
Link: https://patch.msgid.link/20251107230518.4060231-2-beanhuo@iokpp.de
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd-priv.h | 14 ++++++++++----
 drivers/ufs/core/ufshcd.c      |  8 +++-----
 include/ufs/ufshcd.h           |  4 ----
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index d0a2c963a27d..d74742a855b2 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -78,11 +78,17 @@ int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag);
 void ufshcd_release_scsi_cmd(struct ufs_hba *hba,
 			     struct ufshcd_lrb *lrbp);
 
-#define SD_ASCII_STD true
-#define SD_RAW false
-int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index,
-			    u8 **buf, bool ascii);
+/**
+ * enum ufs_descr_fmt - UFS string descriptor format
+ * @SD_RAW: Raw UTF-16 format
+ * @SD_ASCII_STD: Convert to null-terminated ASCII string
+ */
+enum ufs_descr_fmt {
+	SD_RAW = 0,
+	SD_ASCII_STD = 1,
+};
 
+int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, u8 **buf, enum ufs_descr_fmt fmt);
 int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd);
 int ufshcd_send_bsg_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd);
 
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8339fec975b9..2a653137a9ea 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -3773,16 +3773,14 @@ static inline char ufshcd_remove_non_printable(u8 ch)
  * @desc_index: descriptor index
  * @buf: pointer to buffer where descriptor would be read,
  *       the caller should free the memory.
- * @ascii: if true convert from unicode to ascii characters
- *         null terminated string.
+ * @fmt: if %SD_ASCII_STD, convert from UTF-16 to ASCII
  *
  * Return:
  * *      string size on success.
  * *      -ENOMEM: on allocation failure
  * *      -EINVAL: on a wrong parameter
  */
-int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index,
-			    u8 **buf, bool ascii)
+int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, u8 **buf, enum ufs_descr_fmt fmt)
 {
 	struct uc_string_id *uc_str;
 	u8 *str;
@@ -3811,7 +3809,7 @@ int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index,
 		goto out;
 	}
 
-	if (ascii) {
+	if (fmt == SD_ASCII_STD) {
 		ssize_t ascii_len;
 		int i;
 		/* remove header and divide by 2 to move from UTF16 to UTF8 */
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9425cfd9d00e..b4eb2fa58552 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1431,10 +1431,6 @@ static inline int ufshcd_disable_host_tx_lcc(struct ufs_hba *hba)
 void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
 void ufshcd_fixup_dev_quirks(struct ufs_hba *hba,
 			     const struct ufs_dev_quirk *fixups);
-#define SD_ASCII_STD true
-#define SD_RAW false
-int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index,
-			    u8 **buf, bool ascii);
 
 void ufshcd_hold(struct ufs_hba *hba);
 void ufshcd_release(struct ufs_hba *hba);
-- 
cgit v1.2.3


From b06b8c421485e0e96d7fd6aa614fb0b6f2778a03 Mon Sep 17 00:00:00 2001
From: Bean Huo <beanhuo@micron.com>
Date: Sat, 8 Nov 2025 00:05:18 +0100
Subject: scsi: ufs: core: Add OP-TEE based RPMB driver for UFS devices

Add OP-TEE based RPMB support for UFS devices. This enables secure RPMB
operations on UFS devices through OP-TEE, providing the same
functionality available for eMMC devices and extending kernel-based
secure storage support to UFS-based systems.

Benefits of OP-TEE based RPMB implementation:

 - Eliminates dependency on userspace supplicant for RPMB access

 - Enables early boot secure storage access (e.g., fTPM, secure UEFI
   variables)

 - Provides kernel-level RPMB access as soon as UFS driver is
   initialized

 - Removes complex initramfs dependencies and boot ordering requirements

 - Ensures reliable and deterministic secure storage operations

 - Supports both built-in and modular fTPM configurations

[mkp: make this build as a module]

Co-developed-by: Can Guo <can.guo@oss.qualcomm.com>
Signed-off-by: Can Guo <can.guo@oss.qualcomm.com>
Reviewed-by: Avri Altman <avri.altman@sandisk.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Bean Huo <beanhuo@micron.com>
Link: https://patch.msgid.link/20251107230518.4060231-4-beanhuo@iokpp.de
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/misc/Kconfig           |   2 +-
 drivers/ufs/core/Makefile      |   1 +
 drivers/ufs/core/ufs-rpmb.c    | 254 +++++++++++++++++++++++++++++++++++++++++
 drivers/ufs/core/ufshcd-priv.h |  13 +++
 drivers/ufs/core/ufshcd.c      |  86 +++++++++++++-
 include/ufs/ufs.h              |   5 +
 include/ufs/ufshcd.h           |   7 +-
 7 files changed, 361 insertions(+), 7 deletions(-)
 create mode 100644 drivers/ufs/core/ufs-rpmb.c

(limited to 'include')

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index b9c11f67315f..9d1de68dee27 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -106,7 +106,7 @@ config PHANTOM
 
 config RPMB
 	tristate "RPMB partition interface"
-	depends on MMC
+	depends on MMC || SCSI_UFSHCD
 	help
 	  Unified RPMB unit interface for RPMB capable devices such as eMMC and
 	  UFS. Provides interface for in-kernel security controllers to access
diff --git a/drivers/ufs/core/Makefile b/drivers/ufs/core/Makefile
index cf820fa09a04..51e1867e524e 100644
--- a/drivers/ufs/core/Makefile
+++ b/drivers/ufs/core/Makefile
@@ -2,6 +2,7 @@
 
 obj-$(CONFIG_SCSI_UFSHCD)		+= ufshcd-core.o
 ufshcd-core-y				+= ufshcd.o ufs-sysfs.o ufs-mcq.o
+ufshcd-core-$(CONFIG_RPMB)		+= ufs-rpmb.o
 ufshcd-core-$(CONFIG_DEBUG_FS)		+= ufs-debugfs.o
 ufshcd-core-$(CONFIG_SCSI_UFS_BSG)	+= ufs_bsg.o
 ufshcd-core-$(CONFIG_SCSI_UFS_CRYPTO)	+= ufshcd-crypto.o
diff --git a/drivers/ufs/core/ufs-rpmb.c b/drivers/ufs/core/ufs-rpmb.c
new file mode 100644
index 000000000000..ffad049872b9
--- /dev/null
+++ b/drivers/ufs/core/ufs-rpmb.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * UFS OP-TEE based RPMB Driver
+ *
+ * Copyright (C) 2025 Micron Technology, Inc.
+ * Copyright (C) 2025 Qualcomm Technologies, Inc.
+ *
+ * Authors:
+ *	Bean Huo <beanhuo@micron.com>
+ *	Can Guo <can.guo@oss.qualcomm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/rpmb.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <ufs/ufshcd.h>
+#include <linux/unaligned.h>
+#include "ufshcd-priv.h"
+
+#define UFS_RPMB_SEC_PROTOCOL		0xEC	/* JEDEC UFS application */
+#define UFS_RPMB_SEC_PROTOCOL_ID	0x01	/* JEDEC UFS RPMB protocol ID, CDB byte3 */
+
+static const struct bus_type ufs_rpmb_bus_type = {
+	.name = "ufs_rpmb",
+};
+
+/* UFS RPMB device structure */
+struct ufs_rpmb_dev {
+	u8 region_id;
+	struct device dev;
+	struct rpmb_dev *rdev;
+	struct ufs_hba *hba;
+	struct list_head node;
+};
+
+static int ufs_sec_submit(struct ufs_hba *hba, u16 spsp, void *buffer, size_t len, bool send)
+{
+	struct scsi_device *sdev = hba->ufs_rpmb_wlun;
+	u8 cdb[12] = { };
+
+	cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN;
+	cdb[1] = UFS_RPMB_SEC_PROTOCOL;
+	put_unaligned_be16(spsp, &cdb[2]);
+	put_unaligned_be32(len, &cdb[6]);
+
+	return scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
+				buffer, len, /*timeout=*/30 * HZ, 0, NULL);
+}
+
+/* UFS RPMB route frames implementation */
+static int ufs_rpmb_route_frames(struct device *dev, u8 *req, unsigned int req_len, u8 *resp,
+					unsigned int resp_len)
+{
+	struct ufs_rpmb_dev *ufs_rpmb = dev_get_drvdata(dev);
+	struct rpmb_frame *frm_out = (struct rpmb_frame *)req;
+	bool need_result_read = true;
+	u16 req_type, protocol_id;
+	struct ufs_hba *hba;
+	int ret;
+
+	if (!ufs_rpmb) {
+		dev_err(dev, "Missing driver data\n");
+		return -ENODEV;
+	}
+
+	hba = ufs_rpmb->hba;
+
+	req_type = be16_to_cpu(frm_out->req_resp);
+
+	switch (req_type) {
+	case RPMB_PROGRAM_KEY:
+		if (req_len != sizeof(struct rpmb_frame) || resp_len != sizeof(struct rpmb_frame))
+			return -EINVAL;
+		break;
+	case RPMB_GET_WRITE_COUNTER:
+		if (req_len != sizeof(struct rpmb_frame) || resp_len != sizeof(struct rpmb_frame))
+			return -EINVAL;
+		need_result_read = false;
+		break;
+	case RPMB_WRITE_DATA:
+		if (req_len % sizeof(struct rpmb_frame) || resp_len != sizeof(struct rpmb_frame))
+			return -EINVAL;
+		break;
+	case RPMB_READ_DATA:
+		if (req_len != sizeof(struct rpmb_frame) || resp_len % sizeof(struct rpmb_frame))
+			return -EINVAL;
+		need_result_read = false;
+		break;
+	default:
+		dev_err(dev, "Unknown request type=0x%04x\n", req_type);
+		return -EINVAL;
+	}
+
+	protocol_id = ufs_rpmb->region_id << 8 | UFS_RPMB_SEC_PROTOCOL_ID;
+
+	ret = ufs_sec_submit(hba, protocol_id, req, req_len, true);
+	if (ret) {
+		dev_err(dev, "Command failed with ret=%d\n", ret);
+		return ret;
+	}
+
+	if (need_result_read) {
+		struct rpmb_frame *frm_resp = (struct rpmb_frame *)resp;
+
+		memset(frm_resp, 0, sizeof(*frm_resp));
+		frm_resp->req_resp = cpu_to_be16(RPMB_RESULT_READ);
+		ret = ufs_sec_submit(hba, protocol_id, resp, resp_len, true);
+		if (ret) {
+			dev_err(dev, "Result read request failed with ret=%d\n", ret);
+			return ret;
+		}
+	}
+
+	if (!ret) {
+		ret = ufs_sec_submit(hba, protocol_id, resp, resp_len, false);
+		if (ret)
+			dev_err(dev, "Response read failed with ret=%d\n", ret);
+	}
+
+	return ret;
+}
+
+static void ufs_rpmb_device_release(struct device *dev)
+{
+	struct ufs_rpmb_dev *ufs_rpmb = dev_get_drvdata(dev);
+
+	rpmb_dev_unregister(ufs_rpmb->rdev);
+}
+
+/* UFS RPMB device registration */
+int ufs_rpmb_probe(struct ufs_hba *hba)
+{
+	struct ufs_rpmb_dev *ufs_rpmb, *it, *tmp;
+	struct rpmb_dev *rdev;
+	char *cid = NULL;
+	int region;
+	u32 cap;
+	int ret;
+
+	if (!hba->ufs_rpmb_wlun || hba->dev_info.b_advanced_rpmb_en) {
+		dev_info(hba->dev, "Skip OP-TEE RPMB registration\n");
+		return -ENODEV;
+	}
+
+	/* Check if device_id is available */
+	if (!hba->dev_info.device_id) {
+		dev_err(hba->dev, "UFS Device ID not available\n");
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&hba->rpmbs);
+
+	struct rpmb_descr descr = {
+		.type = RPMB_TYPE_UFS,
+		.route_frames = ufs_rpmb_route_frames,
+		.reliable_wr_count = hba->dev_info.rpmb_io_size,
+	};
+
+	for (region = 0; region < ARRAY_SIZE(hba->dev_info.rpmb_region_size); region++) {
+		cap = hba->dev_info.rpmb_region_size[region];
+		if (!cap)
+			continue;
+
+		ufs_rpmb = devm_kzalloc(hba->dev, sizeof(*ufs_rpmb), GFP_KERNEL);
+		if (!ufs_rpmb) {
+			ret = -ENOMEM;
+			goto err_out;
+		}
+
+		ufs_rpmb->hba = hba;
+		ufs_rpmb->dev.parent = &hba->ufs_rpmb_wlun->sdev_gendev;
+		ufs_rpmb->dev.bus = &ufs_rpmb_bus_type;
+		ufs_rpmb->dev.release = ufs_rpmb_device_release;
+		dev_set_name(&ufs_rpmb->dev, "ufs_rpmb%d", region);
+
+		/* Set driver data BEFORE device_register */
+		dev_set_drvdata(&ufs_rpmb->dev, ufs_rpmb);
+
+		ret = device_register(&ufs_rpmb->dev);
+		if (ret) {
+			dev_err(hba->dev, "Failed to register UFS RPMB device %d\n", region);
+			put_device(&ufs_rpmb->dev);
+			goto err_out;
+		}
+
+		/* Create unique ID by appending region number to device_id */
+		cid = kasprintf(GFP_KERNEL, "%s-R%d", hba->dev_info.device_id, region);
+		if (!cid) {
+			device_unregister(&ufs_rpmb->dev);
+			ret = -ENOMEM;
+			goto err_out;
+		}
+
+		descr.dev_id = cid;
+		descr.dev_id_len = strlen(cid);
+		descr.capacity = cap;
+
+		/* Register RPMB device */
+		rdev = rpmb_dev_register(&ufs_rpmb->dev, &descr);
+		if (IS_ERR(rdev)) {
+			dev_err(hba->dev, "Failed to register UFS RPMB device.\n");
+			device_unregister(&ufs_rpmb->dev);
+			ret = PTR_ERR(rdev);
+			goto err_out;
+		}
+
+		kfree(cid);
+		cid = NULL;
+
+		ufs_rpmb->rdev = rdev;
+		ufs_rpmb->region_id = region;
+
+		list_add_tail(&ufs_rpmb->node, &hba->rpmbs);
+
+		dev_info(hba->dev, "UFS RPMB region %d registered (capacity=%u)\n", region, cap);
+	}
+
+	return 0;
+err_out:
+	kfree(cid);
+	list_for_each_entry_safe(it, tmp, &hba->rpmbs, node) {
+		list_del(&it->node);
+		device_unregister(&it->dev);
+	}
+
+	return ret;
+}
+
+/* UFS RPMB remove handler */
+void ufs_rpmb_remove(struct ufs_hba *hba)
+{
+	struct ufs_rpmb_dev *ufs_rpmb, *tmp;
+
+	if (list_empty(&hba->rpmbs))
+		return;
+
+	/* Remove all registered RPMB devices */
+	list_for_each_entry_safe(ufs_rpmb, tmp, &hba->rpmbs, node) {
+		dev_info(hba->dev, "Removing UFS RPMB region %d\n", ufs_rpmb->region_id);
+		/* Remove from list first */
+		list_del(&ufs_rpmb->node);
+		/* Unregister device */
+		device_unregister(&ufs_rpmb->dev);
+	}
+
+	dev_info(hba->dev, "All UFS RPMB devices unregistered\n");
+}
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("OP-TEE UFS RPMB driver");
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index d74742a855b2..9ed2587bc4e8 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -417,4 +417,17 @@ static inline u32 ufshcd_mcq_get_sq_head_slot(struct ufs_hw_queue *q)
 	return val / sizeof(struct utp_transfer_req_desc);
 }
 
+#if IS_ENABLED(CONFIG_RPMB)
+int ufs_rpmb_probe(struct ufs_hba *hba);
+void ufs_rpmb_remove(struct ufs_hba *hba);
+#else
+static inline int ufs_rpmb_probe(struct ufs_hba *hba)
+{
+	return 0;
+}
+static inline void ufs_rpmb_remove(struct ufs_hba *hba)
+{
+}
+#endif
+
 #endif /* _UFSHCD_PRIV_H_ */
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index af7f87f27630..1a402b1ae37f 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -5254,10 +5254,15 @@ static void ufshcd_lu_init(struct ufs_hba *hba, struct scsi_device *sdev)
 	    desc_buf[UNIT_DESC_PARAM_LU_WR_PROTECT] == UFS_LU_POWER_ON_WP)
 		hba->dev_info.is_lu_power_on_wp = true;
 
-	/* In case of RPMB LU, check if advanced RPMB mode is enabled */
-	if (desc_buf[UNIT_DESC_PARAM_UNIT_INDEX] == UFS_UPIU_RPMB_WLUN &&
-	    desc_buf[RPMB_UNIT_DESC_PARAM_REGION_EN] & BIT(4))
-		hba->dev_info.b_advanced_rpmb_en = true;
+	/* In case of RPMB LU, check if advanced RPMB mode is enabled, and get region size */
+	if (desc_buf[UNIT_DESC_PARAM_UNIT_INDEX] == UFS_UPIU_RPMB_WLUN) {
+		if (desc_buf[RPMB_UNIT_DESC_PARAM_REGION_EN] & BIT(4))
+			hba->dev_info.b_advanced_rpmb_en = true;
+		hba->dev_info.rpmb_region_size[0] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION0_SIZE];
+		hba->dev_info.rpmb_region_size[1] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION1_SIZE];
+		hba->dev_info.rpmb_region_size[2] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION2_SIZE];
+		hba->dev_info.rpmb_region_size[3] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION3_SIZE];
+	}
 
 
 	kfree(desc_buf);
@@ -8187,8 +8192,11 @@ static int ufshcd_scsi_add_wlus(struct ufs_hba *hba)
 		ufshcd_upiu_wlun_to_scsi_wlun(UFS_UPIU_RPMB_WLUN), NULL);
 	if (IS_ERR(sdev_rpmb)) {
 		ret = PTR_ERR(sdev_rpmb);
+		hba->ufs_rpmb_wlun = NULL;
+		dev_err(hba->dev, "%s: RPMB WLUN not found\n", __func__);
 		goto remove_ufs_device_wlun;
 	}
+	hba->ufs_rpmb_wlun = sdev_rpmb;
 	ufshcd_blk_pm_runtime_init(sdev_rpmb);
 	scsi_device_put(sdev_rpmb);
 
@@ -8456,6 +8464,67 @@ static void ufs_init_rtc(struct ufs_hba *hba, u8 *desc_buf)
 	dev_info->rtc_update_period = 0;
 }
 
+/**
+ * ufshcd_create_device_id - Generate unique device identifier string
+ * @hba: per-adapter instance
+ * @desc_buf: device descriptor buffer
+ *
+ * Creates a unique device ID string combining manufacturer ID, spec version,
+ * model name, serial number (as hex), device version, and manufacture date.
+ *
+ * Returns: Allocated device ID string on success, NULL on failure
+ */
+static char *ufshcd_create_device_id(struct ufs_hba *hba, u8 *desc_buf)
+{
+	struct ufs_dev_info *dev_info = &hba->dev_info;
+	u16 manufacture_date;
+	u16 device_version;
+	u8 *serial_number;
+	char *serial_hex;
+	char *device_id;
+	u8 serial_index;
+	int serial_len;
+	int ret;
+
+	serial_index = desc_buf[DEVICE_DESC_PARAM_SN];
+
+	ret = ufshcd_read_string_desc(hba, serial_index, &serial_number, SD_RAW);
+	if (ret < 0) {
+		dev_err(hba->dev, "Failed reading Serial Number. err = %d\n", ret);
+		return NULL;
+	}
+
+	device_version = get_unaligned_be16(&desc_buf[DEVICE_DESC_PARAM_DEV_VER]);
+	manufacture_date = get_unaligned_be16(&desc_buf[DEVICE_DESC_PARAM_MANF_DATE]);
+
+	serial_len = ret;
+	/* Allocate buffer for hex string: 2 chars per byte + null terminator */
+	serial_hex = kzalloc(serial_len * 2 + 1, GFP_KERNEL);
+	if (!serial_hex) {
+		kfree(serial_number);
+		return NULL;
+	}
+
+	bin2hex(serial_hex, serial_number, serial_len);
+
+	/*
+	 * Device ID format is ABI with secure world - do not change without firmware
+	 * coordination.
+	 */
+	device_id = kasprintf(GFP_KERNEL, "%04X-%04X-%s-%s-%04X-%04X",
+			      dev_info->wmanufacturerid, dev_info->wspecversion,
+			      dev_info->model, serial_hex, device_version,
+			      manufacture_date);
+
+	kfree(serial_hex);
+	kfree(serial_number);
+
+	if (!device_id)
+		dev_warn(hba->dev, "Failed to allocate unique device ID\n");
+
+	return device_id;
+}
+
 static int ufs_get_device_desc(struct ufs_hba *hba)
 {
 	int err;
@@ -8507,6 +8576,9 @@ static int ufs_get_device_desc(struct ufs_hba *hba)
 		goto out;
 	}
 
+	/* Generate unique device ID */
+	dev_info->device_id = ufshcd_create_device_id(hba, desc_buf);
+
 	hba->luns_avail = desc_buf[DEVICE_DESC_PARAM_NUM_LU] +
 		desc_buf[DEVICE_DESC_PARAM_NUM_WLU];
 
@@ -8542,6 +8614,8 @@ static void ufs_put_device_desc(struct ufs_hba *hba)
 
 	kfree(dev_info->model);
 	dev_info->model = NULL;
+	kfree(dev_info->device_id);
+	dev_info->device_id = NULL;
 }
 
 /**
@@ -8685,6 +8759,8 @@ static int ufshcd_device_geo_params_init(struct ufs_hba *hba)
 	else if (desc_buf[GEOMETRY_DESC_PARAM_MAX_NUM_LUN] == 0)
 		hba->dev_info.max_lu_supported = 8;
 
+	hba->dev_info.rpmb_io_size = desc_buf[GEOMETRY_DESC_PARAM_RPMB_RW_SIZE];
+
 out:
 	kfree(desc_buf);
 	return err;
@@ -8871,6 +8947,7 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
 
 	ufs_bsg_probe(hba);
 	scsi_scan_host(hba->host);
+	ufs_rpmb_probe(hba);
 
 out:
 	return ret;
@@ -10425,6 +10502,7 @@ void ufshcd_remove(struct ufs_hba *hba)
 		ufshcd_rpm_get_sync(hba);
 	ufs_hwmon_remove(hba);
 	ufs_bsg_remove(hba);
+	ufs_rpmb_remove(hba);
 	ufs_sysfs_remove_nodes(hba->dev);
 	cancel_delayed_work_sync(&hba->ufs_rtc_update_work);
 	blk_mq_destroy_queue(hba->tmf_queue);
diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h
index 245a6a829ce9..ab8f6c07b5a2 100644
--- a/include/ufs/ufs.h
+++ b/include/ufs/ufs.h
@@ -651,6 +651,11 @@ struct ufs_dev_info {
 	u8 rtt_cap; /* bDeviceRTTCap */
 
 	bool hid_sup;
+
+	/* Unique device ID string (manufacturer+model+serial+version+date) */
+	char *device_id;
+	u8 rpmb_io_size;
+	u8 rpmb_region_size[4];
 };
 
 #endif /* End of Header */
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index b4eb2fa58552..7efef792c269 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -826,6 +826,7 @@ enum ufshcd_mcq_opr {
  * @host: Scsi_Host instance of the driver
  * @dev: device handle
  * @ufs_device_wlun: WLUN that controls the entire UFS device.
+ * @ufs_rpmb_wlun: RPMB WLUN SCSI device
  * @hwmon_device: device instance registered with the hwmon core.
  * @curr_dev_pwr_mode: active UFS device power mode.
  * @uic_link_state: active state of the link to the UFS device.
@@ -941,8 +942,8 @@ enum ufshcd_mcq_opr {
  * @pm_qos_mutex: synchronizes PM QoS request and status updates
  * @critical_health_count: count of critical health exceptions
  * @dev_lvl_exception_count: count of device level exceptions since last reset
- * @dev_lvl_exception_id: vendor specific information about the
- * device level exception event.
+ * @dev_lvl_exception_id: vendor specific information about the device level exception event.
+ * @rpmbs: list of OP-TEE RPMB devices (one per RPMB region)
  */
 struct ufs_hba {
 	void __iomem *mmio_base;
@@ -960,6 +961,7 @@ struct ufs_hba {
 	struct Scsi_Host *host;
 	struct device *dev;
 	struct scsi_device *ufs_device_wlun;
+	struct scsi_device *ufs_rpmb_wlun;
 
 #ifdef CONFIG_SCSI_UFS_HWMON
 	struct device *hwmon_device;
@@ -1117,6 +1119,7 @@ struct ufs_hba {
 	int critical_health_count;
 	atomic_t dev_lvl_exception_count;
 	u64 dev_lvl_exception_id;
+	struct list_head rpmbs;
 };
 
 /**
-- 
cgit v1.2.3


From 8989d328dfe7c7a3b9f4b9f0ef60006d277f81cc Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 5 Nov 2025 21:19:38 +0100
Subject: net: Helper to move packet data and metadata after skb_push/pull

Lay groundwork for fixing BPF helpers available to TC(X) programs.

When skb_push() or skb_pull() is called in a TC(X) ingress BPF program, the
skb metadata must be kept in front of the MAC header. Otherwise, BPF
programs using the __sk_buff->data_meta pseudo-pointer lose access to it.

Introduce a helper that moves both metadata and a specified number of
packet data bytes together, suitable as a drop-in replacement for
memmove().

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-1-5ceb08a9b37b@cloudflare.com
---
 include/linux/skbuff.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a7cc3d1f4fd1..ff90281ddf90 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4564,6 +4564,81 @@ static inline void skb_metadata_clear(struct sk_buff *skb)
 	skb_metadata_set(skb, 0);
 }
 
+/**
+ * skb_data_move - Move packet data and metadata after skb_push() or skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed or pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data
+ *
+ * Moves @n bytes of packet data, can be zero, and all bytes of skb metadata.
+ *
+ * Assumes metadata is located immediately before &sk_buff->data prior to the
+ * push/pull, and that sufficient headroom exists to hold it after an
+ * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued.
+ *
+ * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this
+ * helper directly.
+ */
+static inline void skb_data_move(struct sk_buff *skb, const int len,
+				 const unsigned int n)
+{
+	const u8 meta_len = skb_metadata_len(skb);
+	u8 *meta, *meta_end;
+
+	if (!len || (!n && !meta_len))
+		return;
+
+	if (!meta_len)
+		goto no_metadata;
+
+	meta_end = skb_metadata_end(skb);
+	meta = meta_end - meta_len;
+
+	if (WARN_ON_ONCE(meta_end + len != skb->data ||
+			 meta_len > skb_headroom(skb))) {
+		skb_metadata_clear(skb);
+		goto no_metadata;
+	}
+
+	memmove(meta + len, meta, meta_len + n);
+	return;
+
+no_metadata:
+	memmove(skb->data, skb->data - len, n);
+}
+
+/**
+ * skb_postpull_data_move - Move packet data and metadata after skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-pull &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpull_data_move(struct sk_buff *skb,
+					  const unsigned int len,
+					  const unsigned int n)
+{
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+	skb_data_move(skb, len, n);
+}
+
+/**
+ * skb_postpush_data_move - Move packet data and metadata after skb_push().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed onto &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpush_data_move(struct sk_buff *skb,
+					  const unsigned int len,
+					  const unsigned int n)
+{
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+	skb_data_move(skb, -len, n);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
-- 
cgit v1.2.3


From f38499ff45f567c932d0911e6a30b8ca022b9b52 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 5 Nov 2025 21:19:40 +0100
Subject: bpf: Unclone skb head on bpf_dynptr_write to skb metadata

Currently bpf_dynptr_from_skb_meta() marks the dynptr as read-only when
the skb is cloned, preventing writes to metadata.

Remove this restriction and unclone the skb head on bpf_dynptr_write() to
metadata, now that the metadata is preserved during uncloning. This makes
metadata dynptr consistent with skb dynptr, allowing writes regardless of
whether the skb is cloned.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-3-5ceb08a9b37b@cloudflare.com
---
 include/linux/filter.h |  9 +++++++++
 kernel/bpf/helpers.c   |  6 ++----
 net/core/filter.c      | 18 ++++++++++++------
 3 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index e116de7edc58..a104b3994230 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1781,6 +1781,8 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
 void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
 		      void *buf, unsigned long len, bool flush);
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+			       const void *from, u32 len, u64 flags);
 void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
@@ -1817,6 +1819,13 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi
 {
 }
 
+static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+					     const void *from, u32 len,
+					     u64 flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
 {
 	return ERR_PTR(-EOPNOTSUPP);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb25e70e0bdc..3e830fd31f5f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1842,10 +1842,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
 			return -EINVAL;
 		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
 	case BPF_DYNPTR_TYPE_SKB_META:
-		if (flags)
-			return -EINVAL;
-		memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
-		return 0;
+		return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+						  len, flags);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
 		return -EFAULT;
diff --git a/net/core/filter.c b/net/core/filter.c
index 52721efba332..673299fd3d58 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -12102,6 +12102,18 @@ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
 	return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
 }
 
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+			       const void *from, u32 len, u64 flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	if (unlikely(bpf_try_make_writable(skb, 0)))
+		return -EFAULT;
+
+	memmove(bpf_skb_meta_pointer(skb, offset), from, len);
+	return 0;
+}
+
 __bpf_kfunc_start_defs();
 __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
 				    struct bpf_dynptr *ptr__uninit)
@@ -12129,9 +12141,6 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
  * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
  * &__sk_buff->data_meta.
  *
- * If passed @skb_ is a clone which shares the data with the original, the
- * dynptr will be read-only. This limitation may be lifted in the future.
- *
  * Return:
  * * %0         - dynptr ready to use
  * * %-EINVAL   - invalid flags, dynptr set to null
@@ -12149,9 +12158,6 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
 
 	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
 
-	if (skb_cloned(skb))
-		bpf_dynptr_set_rdonly(ptr);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From b85be58e2f7cff47f7477ae61022644a198ee592 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 5 Nov 2025 21:19:41 +0100
Subject: vlan: Make vlan_remove_tag return nothing

All callers ignore the return value.

Prepare to reorder memmove() after skb_pull() which is a common pattern.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-4-5ceb08a9b37b@cloudflare.com
---
 include/linux/if_vlan.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 15e01935d3fa..afa5cc61a0fa 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -731,10 +731,8 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
  *
  * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
  * pointing at the MAC header.
- *
- * Returns: a new pointer to skb->data, or NULL on failure to pull.
  */
-static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
+static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 {
 	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
 
@@ -742,7 +740,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 
 	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
 	vlan_set_encap_proto(skb, vhdr);
-	return __skb_pull(skb, VLAN_HLEN);
+	__skb_pull(skb, VLAN_HLEN);
 }
 
 /**
-- 
cgit v1.2.3


From efd35c26239bed39193201e958d65e695231ccda Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 5 Nov 2025 21:19:42 +0100
Subject: bpf: Make bpf_skb_vlan_pop helper metadata-safe

Use the metadata-aware helper to move packet bytes after skb_pull(),
ensuring metadata remains valid after calling the BPF helper.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-5-5ceb08a9b37b@cloudflare.com
---
 include/linux/if_vlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index afa5cc61a0fa..4ecc2509b0d4 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -738,9 +738,9 @@ static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 
 	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
 
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
 	vlan_set_encap_proto(skb, vhdr);
 	__skb_pull(skb, VLAN_HLEN);
+	skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN);
 }
 
 /**
-- 
cgit v1.2.3


From 55ffc98b44d28e13a218306666d16f2c7236d0ae Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 5 Nov 2025 21:19:43 +0100
Subject: bpf: Make bpf_skb_vlan_push helper metadata-safe

Use the metadata-aware helper to move packet bytes after skb_push(),
ensuring metadata remains valid after calling the BPF helper.

Also, take care to reserve sufficient headroom for metadata to fit.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-6-5ceb08a9b37b@cloudflare.com
---
 include/linux/if_vlan.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 4ecc2509b0d4..f7f34eb15e06 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -355,16 +355,17 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
 					  __be16 vlan_proto, u16 vlan_tci,
 					  unsigned int mac_len)
 {
+	const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0;
 	struct vlan_ethhdr *veth;
 
-	if (skb_cow_head(skb, VLAN_HLEN) < 0)
+	if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0)
 		return -ENOMEM;
 
 	skb_push(skb, VLAN_HLEN);
 
 	/* Move the mac header sans proto to the beginning of the new header. */
 	if (likely(mac_len > ETH_TLEN))
-		memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
+		skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN);
 	if (skb_mac_header_was_set(skb))
 		skb->mac_header -= VLAN_HLEN;
 
-- 
cgit v1.2.3


From 15f295f55656658e65bdbc9b901d6b2e49d68d72 Mon Sep 17 00:00:00 2001
From: "D. Wythe" <alibuda@linux.alibaba.com>
Date: Fri, 7 Nov 2025 11:56:31 +0800
Subject: net/smc: bpf: Introduce generic hook for handshake flow

The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, introduce a new generic hook that allows
deciding at runtime whether to use SMC or not, based on criteria
including but not limited to local/remote IP addresses or ports.

Users can now write their own implementation via bpf_struct_ops to choose
whether to use SMC or not before the TCP three-way handshake is completed.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com
---
 include/net/netns/smc.h |   3 ++
 include/net/smc.h       |  53 ++++++++++++++++++
 net/ipv4/tcp_output.c   |  31 ++++++-----
 net/smc/Kconfig         |  10 ++++
 net/smc/Makefile        |   1 +
 net/smc/af_smc.c        |   9 ++++
 net/smc/smc_hs_bpf.c    | 140 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_hs_bpf.h    |  31 +++++++++++
 net/smc/smc_sysctl.c    |  91 +++++++++++++++++++++++++++++++
 9 files changed, 355 insertions(+), 14 deletions(-)
 create mode 100644 net/smc/smc_hs_bpf.c
 create mode 100644 net/smc/smc_hs_bpf.h

(limited to 'include')

diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index 6ceb12baec24..ed24c9f638ee 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -17,6 +17,9 @@ struct netns_smc {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header		*smc_hdr;
 #endif
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+	struct smc_hs_ctrl __rcu	*hs_ctrl;
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 	unsigned int			sysctl_autocorking_size;
 	unsigned int			sysctl_smcr_buf_type;
 	int				sysctl_smcr_testlink_time;
diff --git a/include/net/smc.h b/include/net/smc.h
index 08bee529ed8d..bfdc4c41f019 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -17,6 +17,8 @@
 #include <linux/wait.h>
 #include <linux/dibs.h>
 
+struct tcp_sock;
+struct inet_request_sock;
 struct sock;
 
 #define SMC_MAX_PNETID_LEN	16	/* Max. length of PNET id */
@@ -50,4 +52,55 @@ struct smcd_dev {
 	u8 going_away : 1;
 };
 
+#define SMC_HS_CTRL_NAME_MAX 16
+
+enum {
+	/* ops can be inherit from init_net */
+	SMC_HS_CTRL_FLAG_INHERITABLE = 0x1,
+
+	SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE,
+};
+
+struct smc_hs_ctrl {
+	/* private */
+
+	struct list_head list;
+	struct module *owner;
+
+	/* public */
+
+	/* unique name */
+	char name[SMC_HS_CTRL_NAME_MAX];
+	int flags;
+
+	/* Invoked before computing SMC option for SYN packets.
+	 * We can control whether to set SMC options by returning various value.
+	 * Return 0 to disable SMC, or return any other value to enable it.
+	 */
+	int (*syn_option)(struct tcp_sock *tp);
+
+	/* Invoked before Set up SMC options for SYN-ACK packets
+	 * We can control whether to respond SMC options by returning various
+	 * value. Return 0 to disable SMC, or return any other value to enable
+	 * it.
+	 */
+	int (*synack_option)(const struct tcp_sock *tp,
+			     struct inet_request_sock *ireq);
+};
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+#define smc_call_hsbpf(init_val, tp, func, ...) ({				\
+	typeof(init_val) __ret = (init_val);					\
+	struct smc_hs_ctrl *ctrl;						\
+	rcu_read_lock();							\
+	ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl);	\
+	if (ctrl && ctrl->func)							\
+		__ret = ctrl->func(tp, ##__VA_ARGS__);				\
+	rcu_read_unlock();							\
+	__ret;									\
+})
+#else
+#define smc_call_hsbpf(init_val, tp, ...)  ({ (void)(tp); (init_val); })
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
 #endif	/* _SMC_H */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f5df7a71f62..479afb714bdf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -40,6 +40,7 @@
 #include <net/tcp.h>
 #include <net/tcp_ecn.h>
 #include <net/mptcp.h>
+#include <net/smc.h>
 #include <net/proto_memory.h>
 #include <net/psp.h>
 
@@ -802,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 	mptcp_options_write(th, ptr, tp, opts);
 }
 
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
 			   struct tcp_out_options *opts,
 			   unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (tp->syn_smc) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-				opts->options |= OPTION_SMC;
-				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-			}
+	if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+		tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+		/* re-check syn_smc */
+		if (tp->syn_smc &&
+		    *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+			opts->options |= OPTION_SMC;
+			*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 		}
 	}
 #endif
 }
 
 static void smc_set_option_cond(const struct tcp_sock *tp,
-				const struct inet_request_sock *ireq,
+				struct inet_request_sock *ireq,
 				struct tcp_out_options *opts,
 				unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (tp->syn_smc && ireq->smc_ok) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-				opts->options |= OPTION_SMC;
-				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-			}
+	if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+		ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+		/* re-check smc_ok */
+		if (ireq->smc_ok &&
+		    *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+			opts->options |= OPTION_SMC;
+			*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 		}
 	}
 #endif
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index 99ecd59d1f4b..325addf83cc6 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -19,3 +19,13 @@ config SMC_DIAG
 	  smcss.
 
 	  if unsure, say Y.
+
+config SMC_HS_CTRL_BPF
+	bool "Generic eBPF hook for SMC handshake flow"
+	depends on SMC && BPF_SYSCALL
+	default y
+	help
+	  SMC_HS_CTRL_BPF enables support to register generic eBPF hook for SMC
+	  handshake flow, which offer much greater flexibility in modifying the behavior
+	  of the SMC protocol stack compared to a complete kernel-based approach. Select
+	  this option if you want filtring the handshake process via eBPF programs.
\ No newline at end of file
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 0e754cbc38f9..5368634c5dd6 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
 smc-y += smc_tracepoint.o smc_inet.o
 smc-$(CONFIG_SYSCTL) += smc_sysctl.o
+smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0ef3e16a8517..e388de8dca09 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -58,6 +58,7 @@
 #include "smc_tracepoint.h"
 #include "smc_sysctl.h"
 #include "smc_inet.h"
+#include "smc_hs_bpf.h"
 
 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
 						 * creation on server
@@ -3600,8 +3601,16 @@ static int __init smc_init(void)
 		pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
 		goto out_ulp;
 	}
+	rc = bpf_smc_hs_ctrl_init();
+	if (rc) {
+		pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__,
+		       rc);
+		goto out_inet;
+	}
 	static_branch_enable(&tcp_have_smc);
 	return 0;
+out_inet:
+	smc_inet_exit();
 out_ulp:
 	tcp_unregister_ulp(&smc_ulp_ops);
 out_ib:
diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c
new file mode 100644
index 000000000000..063d23d85850
--- /dev/null
+++ b/net/smc/smc_hs_bpf.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic hook for SMC handshake flow.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2025, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/rculist.h>
+
+#include "smc_hs_bpf.h"
+
+static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock);
+static LIST_HEAD(smc_hs_ctrl_list);
+
+static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl)
+{
+	int ret = 0;
+
+	spin_lock(&smc_hs_ctrl_list_lock);
+	/* already exist or duplicate name */
+	if (smc_hs_ctrl_find_by_name(ctrl->name))
+		ret = -EEXIST;
+	else
+		list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list);
+	spin_unlock(&smc_hs_ctrl_list_lock);
+	return ret;
+}
+
+static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl)
+{
+	spin_lock(&smc_hs_ctrl_list_lock);
+	list_del_rcu(&ctrl->list);
+	spin_unlock(&smc_hs_ctrl_list_lock);
+
+	/* Ensure that all readers to complete */
+	synchronize_rcu();
+}
+
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name)
+{
+	struct smc_hs_ctrl *ctrl;
+
+	list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) {
+		if (strcmp(ctrl->name, name) == 0)
+			return ctrl;
+	}
+	return NULL;
+}
+
+static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; }
+static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp,
+					      struct inet_request_sock *ireq)
+{
+	return 1;
+}
+
+static struct smc_hs_ctrl __smc_bpf_hs_ctrl = {
+	.syn_option	= __smc_bpf_stub_set_tcp_option,
+	.synack_option	= __smc_bpf_stub_set_tcp_option_cond,
+};
+
+static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; }
+
+static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link)
+{
+	if (link)
+		return -EOPNOTSUPP;
+
+	return smc_hs_ctrl_reg(kdata);
+}
+
+static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link)
+{
+	smc_hs_ctrl_unreg(kdata);
+}
+
+static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t,
+				       const struct btf_member *member,
+				       void *kdata, const void *udata)
+{
+	const struct smc_hs_ctrl *u_ctrl;
+	struct smc_hs_ctrl *k_ctrl;
+	u32 moff;
+
+	u_ctrl = (const struct smc_hs_ctrl *)udata;
+	k_ctrl = (struct smc_hs_ctrl *)kdata;
+
+	moff = __btf_member_bit_offset(t, member) / 8;
+	switch (moff) {
+	case offsetof(struct smc_hs_ctrl, name):
+		if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name,
+				     sizeof(u_ctrl->name)) <= 0)
+			return -EINVAL;
+		return 1;
+	case offsetof(struct smc_hs_ctrl, flags):
+		if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS)
+			return -EINVAL;
+		k_ctrl->flags = u_ctrl->flags;
+		return 1;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static const struct bpf_func_proto *
+bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
+	.get_func_proto		= bpf_smc_hs_func_proto,
+	.is_valid_access	= bpf_tracing_btf_ctx_access,
+};
+
+static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = {
+	.name		= "smc_hs_ctrl",
+	.init		= smc_bpf_hs_ctrl_init,
+	.reg		= smc_bpf_hs_ctrl_reg,
+	.unreg		= smc_bpf_hs_ctrl_unreg,
+	.cfi_stubs	= &__smc_bpf_hs_ctrl,
+	.verifier_ops	= &smc_bpf_verifier_ops,
+	.init_member	= smc_bpf_hs_ctrl_init_member,
+	.owner		= THIS_MODULE,
+};
+
+int bpf_smc_hs_ctrl_init(void)
+{
+	return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl);
+}
diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h
new file mode 100644
index 000000000000..f5f1807c079e
--- /dev/null
+++ b/net/smc/smc_hs_bpf.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic hook for SMC handshake flow.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2025, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#ifndef __SMC_HS_CTRL
+#define __SMC_HS_CTRL
+
+#include <net/smc.h>
+
+/* Find hs_ctrl by the target name, which required to be a c-string.
+ * Return NULL if no such ctrl was found,otherwise, return a valid ctrl.
+ *
+ * Note: Caller MUST ensure it's was invoked under rcu_read_lock.
+ */
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name);
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+int bpf_smc_hs_ctrl_init(void);
+#else
+static inline int bpf_smc_hs_ctrl_init(void) { return 0; }
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
+#endif /* __SMC_HS_CTRL */
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index 7b2471904d04..b1efed546243 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -12,12 +12,14 @@
 
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/bpf.h>
 #include <net/net_namespace.h>
 
 #include "smc.h"
 #include "smc_core.h"
 #include "smc_llc.h"
 #include "smc_sysctl.h"
+#include "smc_hs_bpf.h"
 
 static int min_sndbuf = SMC_BUF_MIN_SIZE;
 static int min_rcvbuf = SMC_BUF_MIN_SIZE;
@@ -32,6 +34,69 @@ static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
 static unsigned int smcr_max_wr_min = 2;
 static unsigned int smcr_max_wr_max = 2048;
 
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name)
+{
+	struct smc_hs_ctrl *ctrl = NULL;
+
+	rcu_read_lock();
+	/* null or empty name ask to clear current ctrl */
+	if (name && name[0]) {
+		ctrl = smc_hs_ctrl_find_by_name(name);
+		if (!ctrl) {
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+		/* no change, just return */
+		if (ctrl == rcu_dereference(net->smc.hs_ctrl)) {
+			rcu_read_unlock();
+			return 0;
+		}
+		if (!bpf_try_module_get(ctrl, ctrl->owner)) {
+			rcu_read_unlock();
+			return -EBUSY;
+		}
+	}
+	/* xhcg old ctrl with the new one atomically */
+	ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl)));
+	/* release old ctrl */
+	if (ctrl)
+		bpf_module_put(ctrl, ctrl->owner);
+
+	rcu_read_unlock();
+	return 0;
+}
+
+static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write,
+			    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl);
+	char val[SMC_HS_CTRL_NAME_MAX];
+	const struct ctl_table tbl = {
+		.data = val,
+		.maxlen = SMC_HS_CTRL_NAME_MAX,
+	};
+	struct smc_hs_ctrl *ctrl;
+	int ret;
+
+	rcu_read_lock();
+	ctrl = rcu_dereference(net->smc.hs_ctrl);
+	if (ctrl)
+		memcpy(val, ctrl->name, sizeof(ctrl->name));
+	else
+		val[0] = '\0';
+	rcu_read_unlock();
+
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+
+	if (write)
+		ret = smc_net_replace_smc_hs_ctrl(net, val);
+	return ret;
+}
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
 static struct ctl_table smc_table[] = {
 	{
 		.procname       = "autocorking_size",
@@ -119,6 +184,15 @@ static struct ctl_table smc_table[] = {
 		.extra1		= &smcr_max_wr_min,
 		.extra2		= &smcr_max_wr_max,
 	},
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+	{
+		.procname	= "hs_ctrl",
+		.data		= &init_net.smc.hs_ctrl,
+		.mode		= 0644,
+		.maxlen		= SMC_HS_CTRL_NAME_MAX,
+		.proc_handler	= proc_smc_hs_ctrl,
+	},
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 };
 
 int __net_init smc_sysctl_net_init(struct net *net)
@@ -129,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net)
 	table = smc_table;
 	if (!net_eq(net, &init_net)) {
 		int i;
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+		struct smc_hs_ctrl *ctrl;
+
+		rcu_read_lock();
+		ctrl = rcu_dereference(init_net.smc.hs_ctrl);
+		if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE &&
+		    bpf_try_module_get(ctrl, ctrl->owner))
+			rcu_assign_pointer(net->smc.hs_ctrl, ctrl);
+		rcu_read_unlock();
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 
 		table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
 		if (!table)
@@ -161,6 +245,9 @@ err_reg:
 	if (!net_eq(net, &init_net))
 		kfree(table);
 err_alloc:
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+	smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 	return -ENOMEM;
 }
 
@@ -170,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net)
 
 	table = net->smc.smc_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->smc.smc_hdr);
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+	smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
 	if (!net_eq(net, &init_net))
 		kfree(table);
 }
-- 
cgit v1.2.3


From 73edb26b06ea0eb5bd8c6cae5b2df212ae3c7ab5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 6 Nov 2025 22:34:06 +0000
Subject: sctp: Don't inherit do_auto_asconf in sctp_clone_sock().

syzbot reported list_del(&sp->auto_asconf_list) corruption
in sctp_destroy_sock().

The repro calls setsockopt(SCTP_AUTO_ASCONF, 1) to a SCTP
listener, calls accept(), and close()s the child socket.

setsockopt(SCTP_AUTO_ASCONF, 1) sets sp->do_auto_asconf
to 1 and links sp->auto_asconf_list to a per-netns list.

Both fields are placed after sp->pd_lobby in struct sctp_sock,
and sctp_copy_descendant() did not copy the fields before the
cited commit.

Also, sctp_clone_sock() did not set them explicitly.

In addition, sctp_auto_asconf_init() is called from
sctp_sock_migrate(), but it initialises the fields only
conditionally.

The two fields relied on __GFP_ZERO added in sk_alloc(),
but sk_clone() does not use it.

Let's clear newsp->do_auto_asconf in sctp_clone_sock().

[0]:
list_del corruption. prev->next should be ffff8880799e9148, but was ffff8880799e8808. (prev=ffff88803347d9f8)
kernel BUG at lib/list_debug.c:64!
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 0 UID: 0 PID: 6008 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025
RIP: 0010:__list_del_entry_valid_or_report+0x15a/0x190 lib/list_debug.c:62
Code: e8 7b 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 7c ee 92 fd 49 8b 17 48 c7 c7 80 0a bf 8b 48 89 de 4c 89 f9 e8 07 c6 94 fc 90 <0f> 0b 4c 89 f7 e8 4c 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 4d
RSP: 0018:ffffc90003067ad8 EFLAGS: 00010246
RAX: 000000000000006d RBX: ffff8880799e9148 RCX: b056988859ee6e00
RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
RBP: dffffc0000000000 R08: ffffc90003067807 R09: 1ffff9200060cf00
R10: dffffc0000000000 R11: fffff5200060cf01 R12: 1ffff1100668fb3f
R13: dffffc0000000000 R14: ffff88803347d9f8 R15: ffff88803347d9f8
FS:  00005555823e5500(0000) GS:ffff88812613e000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000200000000480 CR3: 00000000741ce000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 __list_del_entry_valid include/linux/list.h:132 [inline]
 __list_del_entry include/linux/list.h:223 [inline]
 list_del include/linux/list.h:237 [inline]
 sctp_destroy_sock+0xb4/0x370 net/sctp/socket.c:5163
 sk_common_release+0x75/0x310 net/core/sock.c:3961
 sctp_close+0x77e/0x900 net/sctp/socket.c:1550
 inet_release+0x144/0x190 net/ipv4/af_inet.c:437
 __sock_release net/socket.c:662 [inline]
 sock_close+0xc3/0x240 net/socket.c:1455
 __fput+0x44c/0xa70 fs/file_table.c:468
 task_work_run+0x1d4/0x260 kernel/task_work.c:227
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43
 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
 syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
 do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 16942cf4d3e3 ("sctp: Use sk_clone() in sctp_accept().")
Reported-by: syzbot+ba535cb417f106327741@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/690d2185.a70a0220.22f260.000e.GAE@google.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251106223418.1455510-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/structs.h | 4 ----
 net/sctp/socket.c          | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 5900196d65fd..affee44bd38e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -228,10 +228,6 @@ struct sctp_sock {
 
 	atomic_t pd_mode;
 
-	/* Fields after this point will be skipped on copies, like on accept
-	 * and peeloff operations
-	 */
-
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 38d2932acebf..d808096f5ab1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4885,6 +4885,7 @@ static struct sock *sctp_clone_sock(struct sock *sk,
 	}
 #endif
 
+	newsp->do_auto_asconf = 0;
 	skb_queue_head_init(&newsp->pd_lobby);
 
 	newsp->ep = sctp_endpoint_new(newsk, GFP_KERNEL);
-- 
cgit v1.2.3


From 7ff14c52049eafecdd72cd0a12cae6905876566a Mon Sep 17 00:00:00 2001
From: Simon Schippers <simon.schippers@tu-dortmund.de>
Date: Thu, 6 Nov 2025 18:56:15 +0100
Subject: usbnet: Add support for Byte Queue Limits (BQL)

In the current implementation, usbnet uses a fixed tx_qlen of:

USB2: 60 * 1518 bytes = 91.08 KB
USB3: 60 * 5 * 1518 bytes = 455.40 KB

Such large transmit queues can be problematic, especially for cellular
modems. For example, with a typical cellular link speed of 10 Mbit/s, a
fully occupied USB3 transmit queue results in:

455.40 KB / (10 Mbit/s / 8 bit/byte) = 364.32 ms

of additional latency.

This patch adds support for Byte Queue Limits (BQL) [1] to dynamically
manage the transmit queue size and reduce latency without sacrificing
throughput.

Testing was performed on various devices using the usbnet driver for
packet transmission:

- DELOCK 66045: USB3 to 2.5 GbE adapter (ax88179_178a)
- DELOCK 61969: USB2 to 1 GbE adapter (asix)
- Quectel RM520: 5G modem (qmi_wwan)
- USB2 Android tethering (cdc_ncm)

No performance degradation was observed for iperf3 TCP or UDP traffic,
while latency for a prioritized ping application was significantly
reduced. For example, using the USB3 to 2.5 GbE adapter, which was fully
utilized by iperf3 UDP traffic, the prioritized ping was improved from
1.6 ms to 0.6 ms. With the same setup but with a 100 Mbit/s Ethernet
connection, the prioritized ping was improved from 35 ms to 5 ms.

[1] https://lwn.net/Articles/469652/

Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106175615.26948-1-simon.schippers@tu-dortmund.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/usbnet.c   | 11 +++++++++++
 include/linux/usb/usbnet.h |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index f3087fb62f4f..3d10cf791c51 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -831,6 +831,7 @@ int usbnet_stop(struct net_device *net)
 
 	clear_bit(EVENT_DEV_OPEN, &dev->flags);
 	netif_stop_queue (net);
+	netdev_reset_queue(net);
 
 	netif_info(dev, ifdown, dev->net,
 		   "stop stats: rx/tx %lu/%lu, errs %lu/%lu\n",
@@ -939,6 +940,7 @@ int usbnet_open(struct net_device *net)
 	}
 
 	set_bit(EVENT_DEV_OPEN, &dev->flags);
+	netdev_reset_queue(net);
 	netif_start_queue (net);
 	netif_info(dev, ifup, dev->net,
 		   "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n",
@@ -1500,6 +1502,7 @@ netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net)
 	case 0:
 		netif_trans_update(net);
 		__usbnet_queue_skb(&dev->txq, skb, tx_start);
+		netdev_sent_queue(net, skb->len);
 		if (dev->txq.qlen >= TX_QLEN (dev))
 			netif_stop_queue (net);
 	}
@@ -1563,6 +1566,7 @@ static inline void usb_free_skb(struct sk_buff *skb)
 static void usbnet_bh(struct timer_list *t)
 {
 	struct usbnet		*dev = timer_container_of(dev, t, delay);
+	unsigned int bytes_compl = 0, pkts_compl = 0;
 	struct sk_buff		*skb;
 	struct skb_data		*entry;
 
@@ -1574,6 +1578,8 @@ static void usbnet_bh(struct timer_list *t)
 				usb_free_skb(skb);
 			continue;
 		case tx_done:
+			bytes_compl += skb->len;
+			pkts_compl++;
 			kfree(entry->urb->sg);
 			fallthrough;
 		case rx_cleanup:
@@ -1584,6 +1590,10 @@ static void usbnet_bh(struct timer_list *t)
 		}
 	}
 
+	spin_lock_bh(&dev->bql_spinlock);
+	netdev_completed_queue(dev->net, pkts_compl, bytes_compl);
+	spin_unlock_bh(&dev->bql_spinlock);
+
 	/* restart RX again after disabling due to high error rate */
 	clear_bit(EVENT_RX_KILL, &dev->flags);
 
@@ -1755,6 +1765,7 @@ usbnet_probe(struct usb_interface *udev, const struct usb_device_id *prod)
 	skb_queue_head_init (&dev->txq);
 	skb_queue_head_init (&dev->done);
 	skb_queue_head_init(&dev->rxq_pause);
+	spin_lock_init(&dev->bql_spinlock);
 	INIT_WORK(&dev->bh_work, usbnet_bh_work);
 	INIT_WORK (&dev->kevent, usbnet_deferred_kevent);
 	init_usb_anchor(&dev->deferred);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index a2d54122823d..2945923a8a95 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/types.h>
 #include <linux/usb.h>
+#include <linux/spinlock.h>
 
 /* interface from usbnet core to each USB networking link we handle */
 struct usbnet {
@@ -59,6 +60,7 @@ struct usbnet {
 	struct mutex		interrupt_mutex;
 	struct usb_anchor	deferred;
 	struct work_struct	bh_work;
+	spinlock_t		bql_spinlock;
 
 	struct work_struct	kevent;
 	unsigned long		flags;
-- 
cgit v1.2.3


From 62ed1b58224636185fa689db81224b8c8af46473 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Mon, 3 Nov 2025 20:57:57 +0800
Subject: md: allow configuring logical block size

Previously, raid array used the maximum logical block size (LBS)
of all member disks. Adding a larger LBS disk at runtime could
unexpectedly increase RAID's LBS, risking corruption of existing
partitions. This can be reproduced by:

```
  # LBS of sd[de] is 512 bytes, sdf is 4096 bytes.
  mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean

  # LBS is 512
  cat /sys/block/md0/queue/logical_block_size

  # create partition md0p1
  parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100%
  lsblk | grep md0p1

  # LBS becomes 4096 after adding sdf
  mdadm --add -q /dev/md0 /dev/sdf
  cat /sys/block/md0/queue/logical_block_size

  # partition lost
  partprobe /dev/md0
  lsblk | grep md0p1
```

Simply restricting larger-LBS disks is inflexible. In some scenarios,
only disks with 512 bytes LBS are available currently, but later, disks
with 4KB LBS may be added to the array.

Making LBS configurable is the best way to solve this scenario.
After this patch, the raid will:
  - store LBS in disk metadata
  - add a read-write sysfs 'mdX/logical_block_size'

Future mdadm should support setting LBS via metadata field during RAID
creation and the new sysfs. Though the kernel allows runtime LBS changes,
users should avoid modifying it after creating partitions or filesystems
to prevent compatibility issues.

Only 1.x metadata supports configurable LBS. 0.90 metadata inits all
fields to default values at auto-detect. Supporting 0.90 would require
more extensive changes and no such use case has been observed.

Note that many RAID paths rely on PAGE_SIZE alignment, including for
metadata I/O. A larger LBS than PAGE_SIZE will result in metadata
read/write failures. So this config should be prevented.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 Documentation/admin-guide/md.rst | 10 ++++++
 drivers/md/md-linear.c           |  1 +
 drivers/md/md.c                  | 77 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h                  |  1 +
 drivers/md/raid0.c               |  1 +
 drivers/md/raid1.c               |  1 +
 drivers/md/raid10.c              |  1 +
 drivers/md/raid5.c               |  1 +
 include/uapi/linux/raid/md_p.h   |  3 +-
 9 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index deed823eab01..dc7eab191caa 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -238,6 +238,16 @@ All md devices contain:
      the number of devices in a raid4/5/6, or to support external
      metadata formats which mandate such clipping.
 
+  logical_block_size
+     Configure the array's logical block size in bytes. This attribute
+     is only supported for 1.x meta. Write the value before starting
+     array. The final array LBS uses the maximum between this
+     configuration and LBS of all combined devices. Note that
+     LBS cannot exceed PAGE_SIZE before RAID supports folio.
+     WARNING: Arrays created on new kernel cannot be assembled at old
+     kernel due to padding check, Set module parameter 'check_new_feature'
+     to false to bypass, but data loss may occur.
+
   reshape_position
      This is either ``none`` or a sector number within the devices of
      the array where ``reshape`` is up to.  If this is set, the three
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 25a6ddedea65..8d7b82c4a723 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -72,6 +72,7 @@ static int linear_set_limits(struct mddev *mddev)
 
 	md_init_stacking_limits(&lim);
 	lim.max_hw_sectors = mddev->chunk_sectors;
+	lim.logical_block_size = mddev->logical_block_size;
 	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
 	lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
 	lim.io_min = mddev->chunk_sectors << 9;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9676e2477df6..7b5c5967568f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1999,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->dev_sectors = le64_to_cpu(sb->size);
+		mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.space = 0;
@@ -2208,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
+	sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
 	if (test_bit(FailFast, &rdev->flags))
 		sb->devflags |= FailFast1;
 	else
@@ -5936,6 +5938,68 @@ static struct md_sysfs_entry md_serialize_policy =
 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
        serialize_policy_store);
 
+static int mddev_set_logical_block_size(struct mddev *mddev,
+				unsigned int lbs)
+{
+	int err = 0;
+	struct queue_limits lim;
+
+	if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) {
+		pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
+		       mdname(mddev), lbs);
+		return -EINVAL;
+	}
+
+	lim = queue_limits_start_update(mddev->gendisk->queue);
+	lim.logical_block_size = lbs;
+	pr_info("%s: logical_block_size is changed, data may be lost\n",
+		mdname(mddev));
+	err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
+	if (err)
+		return err;
+
+	mddev->logical_block_size = lbs;
+	/* New lbs will be written to superblock after array is running */
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+	return 0;
+}
+
+static ssize_t
+lbs_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%u\n", mddev->logical_block_size);
+}
+
+static ssize_t
+lbs_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	unsigned int lbs;
+	int err;
+
+	/* Only 1.x meta supports configurable LBS */
+	if (mddev->major_version == 0)
+		return -EINVAL;
+
+	if (mddev->pers)
+		return -EBUSY;
+
+	err = kstrtouint(buf, 10, &lbs);
+	if (err < 0)
+		return -EINVAL;
+
+	/* If taking the lock fails it is not held, so do not unlock it */
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+
+	err = mddev_set_logical_block_size(mddev, lbs);
+	mddev_unlock(mddev);
+
+	return err ?: len;
+}
+
+static struct md_sysfs_entry md_logical_block_size =
+__ATTR(logical_block_size, 0644, lbs_show, lbs_store);
 
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
@@ -5958,6 +6022,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_consistency_policy.attr,
 	&md_fail_last_dev.attr,
 	&md_serialize_policy.attr,
+	&md_logical_block_size.attr,
 	NULL,
 };
 
@@ -6088,6 +6153,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
 			return -EINVAL;
 	}
 
+	/*
+	 * Until RAID gains folio support, the logical_block_size
+	 * must not be larger than the page size.
+	 */
+	if (lim->logical_block_size > PAGE_SIZE) {
+		pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+	mddev->logical_block_size = lim->logical_block_size;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
@@ -6699,6 +6775,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->chunk_sectors = 0;
 	mddev->ctime = mddev->utime = 0;
 	mddev->layout = 0;
+	mddev->logical_block_size = 0;
 	mddev->max_disks = 0;
 	mddev->events = 0;
 	mddev->can_decrease_events = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fd6e001c1d38..6985f2829bbd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -433,6 +433,7 @@ struct mddev {
 	sector_t			array_sectors; /* exported array size */
 	int				external_size; /* size managed
 							* externally */
+	unsigned int			logical_block_size;
 	__u64				events;
 	/* If the last 'event' was simply a clean->dirty transition, and
 	 * we didn't write it to the spares, then it is safe and simple
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fbf763401521..47aee1b1d4d1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -380,6 +380,7 @@ static int raid0_set_limits(struct mddev *mddev)
 	lim.max_hw_sectors = mddev->chunk_sectors;
 	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
 	lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
+	lim.logical_block_size = mddev->logical_block_size;
 	lim.io_min = mddev->chunk_sectors << 9;
 	lim.io_opt = lim.io_min * mddev->raid_disks;
 	lim.chunk_sectors = mddev->chunk_sectors;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 592a40233004..57d50465eed1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3213,6 +3213,7 @@ static int raid1_set_limits(struct mddev *mddev)
 	md_init_stacking_limits(&lim);
 	lim.max_write_zeroes_sectors = 0;
 	lim.max_hw_wzeroes_unmap_sectors = 0;
+	lim.logical_block_size = mddev->logical_block_size;
 	lim.features |= BLK_FEAT_ATOMIC_WRITES;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 14dcd5142eb4..84be4cc7e873 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4000,6 +4000,7 @@ static int raid10_set_queue_limits(struct mddev *mddev)
 	md_init_stacking_limits(&lim);
 	lim.max_write_zeroes_sectors = 0;
 	lim.max_hw_wzeroes_unmap_sectors = 0;
+	lim.logical_block_size = mddev->logical_block_size;
 	lim.io_min = mddev->chunk_sectors << 9;
 	lim.chunk_sectors = mddev->chunk_sectors;
 	lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 24b32a0c95b4..cdbc7eba5c54 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7745,6 +7745,7 @@ static int raid5_set_limits(struct mddev *mddev)
 	stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
 
 	md_init_stacking_limits(&lim);
+	lim.logical_block_size = mddev->logical_block_size;
 	lim.io_min = mddev->chunk_sectors << 9;
 	lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
 	lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index ac74133a4768..310068bb2a1d 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -291,7 +291,8 @@ struct mdp_superblock_1 {
 	__le64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */
 	__le32	sb_csum;	/* checksum up to devs[max_dev] */
 	__le32	max_dev;	/* size of devs[] array to consider */
-	__u8	pad3[64-32];	/* set to 0 when writing */
+	__le32  logical_block_size;	/* same as q->limits->logical_block_size */
+	__u8	pad3[64-36];	/* set to 0 when writing */
 
 	/* device state information. Indexed by dev_number.
 	 * 2 bytes per device
-- 
cgit v1.2.3


From 2b9a0f21fbb8a3b7df7faa5b7534897a86c44b98 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:13 +0100
Subject: ns: move namespace types into separate header

Add a dedicated header for namespace types.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-1-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns/ns_common_types.h | 205 +++++++++++++++++++++++++++++++++++++
 include/linux/ns_common.h          | 196 +----------------------------------
 2 files changed, 206 insertions(+), 195 deletions(-)
 create mode 100644 include/linux/ns/ns_common_types.h

(limited to 'include')

diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
new file mode 100644
index 000000000000..ccd1d1e116f6
--- /dev/null
+++ b/include/linux/ns/ns_common_types.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NS_COMMON_TYPES_H
+#define _LINUX_NS_COMMON_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+
+struct cgroup_namespace;
+struct dentry;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct proc_ns_operations;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations utsns_operations;
+
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ *     lifetime. Controls when the namespace structure itself is freed.
+ *     It also pins the namespace on the namespace trees whereas (2)
+ *     only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ *     Controls visibility of the namespace in the namespace trees.
+ *     Any live task that uses the namespace (via nsproxy or cred) holds
+ *     an active reference. Any open file descriptor or bind-mount of
+ *     the namespace holds an active reference. Once all tasks have
+ *     exited their namespaces and all file descriptors and
+ *     bind-mounts have been released the active reference count drops
+ *     to zero and the namespace becomes inactive. IOW, the namespace
+ *     cannot be listed or opened via file handles anymore.
+ *
+ *     Note that it is valid to transition from active to inactive and
+ *     back from inactive to active e.g., when resurrecting an inactive
+ *     namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ *   Namespace is actively used and visible to userspace. The namespace
+ *   can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ *   handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ *   No tasks are actively using the namespace and it isn't pinned by
+ *   any bind-mounts or open file descriptors anymore. But the namespace
+ *   is still kept alive by internal references. For example, the user
+ *   namespace could be pinned by an open file through file->f_cred
+ *   references when one of the now defunct tasks had opened a file and
+ *   handed the file descriptor off to another process via a UNIX
+ *   socket. Such references keep the namespace structure alive through
+ *   __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ *   No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ *   When the last task using the namespace exits it drops its active
+ *   references to all namespaces. However, user and pid namespaces
+ *   remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ *   An inactive namespace tree might be resurrected due to e.g., the
+ *   SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ *   When __ns_ref drops to zero the namespace is removed from the
+ *   namespaces trees and the memory is freed (after RCU grace period).
+ *
+ * Initial namespaces:
+ *   Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ *   __ns_ref_active = 1 and remain active forever.
+ */
+struct ns_common {
+	u32 ns_type;
+	struct dentry *stashed;
+	const struct proc_ns_operations *ops;
+	unsigned int inum;
+	refcount_t __ns_ref; /* do not use directly */
+	union {
+		struct {
+			u64 ns_id;
+			struct /* global namespace rbtree and list */ {
+				struct rb_node ns_unified_tree_node;
+				struct list_head ns_unified_list_node;
+			};
+			struct /* per type rbtree and list */ {
+				struct rb_node ns_tree_node;
+				struct list_head ns_list_node;
+			};
+			struct /* namespace ownership rbtree and list */ {
+				struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */
+				struct list_head ns_owner; /* list of namespaces owned by this namespace */
+				struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */
+				struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */
+			};
+			atomic_t __ns_ref_active; /* do not use directly */
+		};
+		struct rcu_head ns_rcu;
+	};
+};
+
+#define to_ns_common(__ns)                                    \
+	_Generic((__ns),                                      \
+		struct cgroup_namespace *:       &(__ns)->ns, \
+		const struct cgroup_namespace *: &(__ns)->ns, \
+		struct ipc_namespace *:          &(__ns)->ns, \
+		const struct ipc_namespace *:    &(__ns)->ns, \
+		struct mnt_namespace *:          &(__ns)->ns, \
+		const struct mnt_namespace *:    &(__ns)->ns, \
+		struct net *:                    &(__ns)->ns, \
+		const struct net *:              &(__ns)->ns, \
+		struct pid_namespace *:          &(__ns)->ns, \
+		const struct pid_namespace *:    &(__ns)->ns, \
+		struct time_namespace *:         &(__ns)->ns, \
+		const struct time_namespace *:   &(__ns)->ns, \
+		struct user_namespace *:         &(__ns)->ns, \
+		const struct user_namespace *:   &(__ns)->ns, \
+		struct uts_namespace *:          &(__ns)->ns, \
+		const struct uts_namespace *:    &(__ns)->ns)
+
+#define ns_init_inum(__ns)                                     \
+	_Generic((__ns),                                       \
+		struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+		struct ipc_namespace *:    IPC_NS_INIT_INO,    \
+		struct mnt_namespace *:    MNT_NS_INIT_INO,    \
+		struct net *:              NET_NS_INIT_INO,    \
+		struct pid_namespace *:    PID_NS_INIT_INO,    \
+		struct time_namespace *:   TIME_NS_INIT_INO,   \
+		struct user_namespace *:   USER_NS_INIT_INO,   \
+		struct uts_namespace *:    UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns)                                    \
+	_Generic((__ns),                                    \
+		struct cgroup_namespace *: &init_cgroup_ns, \
+		struct ipc_namespace *:    &init_ipc_ns,    \
+		struct mnt_namespace *:    &init_mnt_ns,     \
+		struct net *:              &init_net,       \
+		struct pid_namespace *:    &init_pid_ns,    \
+		struct time_namespace *:   &init_time_ns,   \
+		struct user_namespace *:   &init_user_ns,   \
+		struct uts_namespace *:    &init_uts_ns)
+
+#define ns_init_id(__ns)						\
+	_Generic((__ns),						\
+		struct cgroup_namespace *:	CGROUP_NS_INIT_ID,	\
+		struct ipc_namespace *:		IPC_NS_INIT_ID,		\
+		struct mnt_namespace *:		MNT_NS_INIT_ID,		\
+		struct net *:			NET_NS_INIT_ID,		\
+		struct pid_namespace *:		PID_NS_INIT_ID,		\
+		struct time_namespace *:	TIME_NS_INIT_ID,	\
+		struct user_namespace *:	USER_NS_INIT_ID,	\
+		struct uts_namespace *:		UTS_NS_INIT_ID)
+
+#define to_ns_operations(__ns)                                                                         \
+	_Generic((__ns),                                                                               \
+		struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+		struct ipc_namespace *:    (IS_ENABLED(CONFIG_IPC_NS)  ? &ipcns_operations    : NULL), \
+		struct mnt_namespace *:    &mntns_operations,                                          \
+		struct net *:              (IS_ENABLED(CONFIG_NET_NS)  ? &netns_operations    : NULL), \
+		struct pid_namespace *:    (IS_ENABLED(CONFIG_PID_NS)  ? &pidns_operations    : NULL), \
+		struct time_namespace *:   (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations   : NULL), \
+		struct user_namespace *:   (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations   : NULL), \
+		struct uts_namespace *:    (IS_ENABLED(CONFIG_UTS_NS)  ? &utsns_operations    : NULL))
+
+#define ns_common_type(__ns)                                \
+	_Generic((__ns),                                    \
+		struct cgroup_namespace *: CLONE_NEWCGROUP, \
+		struct ipc_namespace *:    CLONE_NEWIPC,    \
+		struct mnt_namespace *:    CLONE_NEWNS,     \
+		struct net *:              CLONE_NEWNET,    \
+		struct pid_namespace *:    CLONE_NEWPID,    \
+		struct time_namespace *:   CLONE_NEWTIME,   \
+		struct user_namespace *:   CLONE_NEWUSER,   \
+		struct uts_namespace *:    CLONE_NEWUTS)
+
+#endif /* _LINUX_NS_COMMON_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 66ea09b48377..6a4ca8c3b9c4 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -2,133 +2,12 @@
 #ifndef _LINUX_NS_COMMON_H
 #define _LINUX_NS_COMMON_H
 
+#include <linux/ns/ns_common_types.h>
 #include <linux/refcount.h>
-#include <linux/rbtree.h>
 #include <linux/vfsdebug.h>
 #include <uapi/linux/sched.h>
 #include <uapi/linux/nsfs.h>
 
-struct proc_ns_operations;
-
-struct cgroup_namespace;
-struct ipc_namespace;
-struct mnt_namespace;
-struct net;
-struct pid_namespace;
-struct time_namespace;
-struct user_namespace;
-struct uts_namespace;
-
-extern struct cgroup_namespace init_cgroup_ns;
-extern struct ipc_namespace init_ipc_ns;
-extern struct mnt_namespace init_mnt_ns;
-extern struct net init_net;
-extern struct pid_namespace init_pid_ns;
-extern struct time_namespace init_time_ns;
-extern struct user_namespace init_user_ns;
-extern struct uts_namespace init_uts_ns;
-
-extern const struct proc_ns_operations netns_operations;
-extern const struct proc_ns_operations utsns_operations;
-extern const struct proc_ns_operations ipcns_operations;
-extern const struct proc_ns_operations pidns_operations;
-extern const struct proc_ns_operations pidns_for_children_operations;
-extern const struct proc_ns_operations userns_operations;
-extern const struct proc_ns_operations mntns_operations;
-extern const struct proc_ns_operations cgroupns_operations;
-extern const struct proc_ns_operations timens_operations;
-extern const struct proc_ns_operations timens_for_children_operations;
-
-/*
- * Namespace lifetimes are managed via a two-tier reference counting model:
- *
- * (1) __ns_ref (refcount_t): Main reference count tracking memory
- *     lifetime. Controls when the namespace structure itself is freed.
- *     It also pins the namespace on the namespace trees whereas (2)
- *     only regulates their visibility to userspace.
- *
- * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
- *     Controls visibility of the namespace in the namespace trees.
- *     Any live task that uses the namespace (via nsproxy or cred) holds
- *     an active reference. Any open file descriptor or bind-mount of
- *     the namespace holds an active reference. Once all tasks have
- *     called exited their namespaces and all file descriptors and
- *     bind-mounts have been released the active reference count drops
- *     to zero and the namespace becomes inactive. IOW, the namespace
- *     cannot be listed or opened via file handles anymore.
- *
- *     Note that it is valid to transition from active to inactive and
- *     back from inactive to active e.g., when resurrecting an inactive
- *     namespace tree via the SIOCGSKNS ioctl().
- *
- * Relationship and lifecycle states:
- *
- * - Active (__ns_ref_active > 0):
- *   Namespace is actively used and visible to userspace. The namespace
- *   can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
- *   handles, or discovered via listns().
- *
- * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
- *   No tasks are actively using the namespace and it isn't pinned by
- *   any bind-mounts or open file descriptors anymore. But the namespace
- *   is still kept alive by internal references. For example, the user
- *   namespace could be pinned by an open file through file->f_cred
- *   references when one of the now defunct tasks had opened a file and
- *   handed the file descriptor off to another process via a UNIX
- *   sockets. Such references keep the namespace structure alive through
- *   __ns_ref but will not hold an active reference.
- *
- * - Destroyed (__ns_ref == 0):
- *   No references remain. The namespace is removed from the tree and freed.
- *
- * State transitions:
- *
- * Active -> Inactive:
- *   When the last task using the namespace exits it drops its active
- *   references to all namespaces. However, user and pid namespaces
- *   remain accessible until the task has been reaped.
- *
- * Inactive -> Active:
- *   An inactive namespace tree might be resurrected due to e.g., the
- *   SIOCGSKNS ioctl() on a socket.
- *
- * Inactive -> Destroyed:
- *   When __ns_ref drops to zero the namespace is removed from the
- *   namespaces trees and the memory is freed (after RCU grace period).
- *
- * Initial namespaces:
- *   Boot-time namespaces (init_net, init_pid_ns, etc.) start with
- *   __ns_ref_active = 1 and remain active forever.
- */
-struct ns_common {
-	u32 ns_type;
-	struct dentry *stashed;
-	const struct proc_ns_operations *ops;
-	unsigned int inum;
-	refcount_t __ns_ref; /* do not use directly */
-	union {
-		struct {
-			u64 ns_id;
-			struct /* global namespace rbtree and list */ {
-				struct rb_node ns_unified_tree_node;
-				struct list_head ns_unified_list_node;
-			};
-			struct /* per type rbtree and list */ {
-				struct rb_node ns_tree_node;
-				struct list_head ns_list_node;
-			};
-			struct /* namespace ownership rbtree and list */ {
-				struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */
-				struct list_head ns_owner; /* list of namespaces owned by this namespace */
-				struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */
-				struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */
-			};
-			atomic_t __ns_ref_active; /* do not use directly */
-		};
-		struct rcu_head ns_rcu;
-	};
-};
-
 bool is_current_namespace(struct ns_common *ns);
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
 void __ns_common_free(struct ns_common *ns);
@@ -147,79 +26,6 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns)
 	return ns->ns_id <= NS_LAST_INIT_ID;
 }
 
-#define to_ns_common(__ns)                                    \
-	_Generic((__ns),                                      \
-		struct cgroup_namespace *:       &(__ns)->ns, \
-		const struct cgroup_namespace *: &(__ns)->ns, \
-		struct ipc_namespace *:          &(__ns)->ns, \
-		const struct ipc_namespace *:    &(__ns)->ns, \
-		struct mnt_namespace *:          &(__ns)->ns, \
-		const struct mnt_namespace *:    &(__ns)->ns, \
-		struct net *:                    &(__ns)->ns, \
-		const struct net *:              &(__ns)->ns, \
-		struct pid_namespace *:          &(__ns)->ns, \
-		const struct pid_namespace *:    &(__ns)->ns, \
-		struct time_namespace *:         &(__ns)->ns, \
-		const struct time_namespace *:   &(__ns)->ns, \
-		struct user_namespace *:         &(__ns)->ns, \
-		const struct user_namespace *:   &(__ns)->ns, \
-		struct uts_namespace *:          &(__ns)->ns, \
-		const struct uts_namespace *:    &(__ns)->ns)
-
-#define ns_init_inum(__ns)                                     \
-	_Generic((__ns),                                       \
-		struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
-		struct ipc_namespace *:    IPC_NS_INIT_INO,    \
-		struct mnt_namespace *:    MNT_NS_INIT_INO,    \
-		struct net *:              NET_NS_INIT_INO,    \
-		struct pid_namespace *:    PID_NS_INIT_INO,    \
-		struct time_namespace *:   TIME_NS_INIT_INO,   \
-		struct user_namespace *:   USER_NS_INIT_INO,   \
-		struct uts_namespace *:    UTS_NS_INIT_INO)
-
-#define ns_init_ns(__ns)                                    \
-	_Generic((__ns),                                    \
-		struct cgroup_namespace *: &init_cgroup_ns, \
-		struct ipc_namespace *:    &init_ipc_ns,    \
-		struct mnt_namespace *:    &init_mnt_ns,     \
-		struct net *:              &init_net,       \
-		struct pid_namespace *:    &init_pid_ns,    \
-		struct time_namespace *:   &init_time_ns,   \
-		struct user_namespace *:   &init_user_ns,   \
-		struct uts_namespace *:    &init_uts_ns)
-
-#define ns_init_id(__ns)						\
-	_Generic((__ns),						\
-		struct cgroup_namespace *:	CGROUP_NS_INIT_ID,	\
-		struct ipc_namespace *:		IPC_NS_INIT_ID,		\
-		struct mnt_namespace *:		MNT_NS_INIT_ID,		\
-		struct net *:			NET_NS_INIT_ID,		\
-		struct pid_namespace *:		PID_NS_INIT_ID,		\
-		struct time_namespace *:	TIME_NS_INIT_ID,	\
-		struct user_namespace *:	USER_NS_INIT_ID,	\
-		struct uts_namespace *:		UTS_NS_INIT_ID)
-
-#define to_ns_operations(__ns)                                                                         \
-	_Generic((__ns),                                                                               \
-		struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
-		struct ipc_namespace *:    (IS_ENABLED(CONFIG_IPC_NS)  ? &ipcns_operations    : NULL), \
-		struct mnt_namespace *:    &mntns_operations,                                          \
-		struct net *:              (IS_ENABLED(CONFIG_NET_NS)  ? &netns_operations    : NULL), \
-		struct pid_namespace *:    (IS_ENABLED(CONFIG_PID_NS)  ? &pidns_operations    : NULL), \
-		struct time_namespace *:   (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations   : NULL), \
-		struct user_namespace *:   (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations   : NULL), \
-		struct uts_namespace *:    (IS_ENABLED(CONFIG_UTS_NS)  ? &utsns_operations    : NULL))
-
-#define ns_common_type(__ns)                                \
-	_Generic((__ns),                                    \
-		struct cgroup_namespace *: CLONE_NEWCGROUP, \
-		struct ipc_namespace *:    CLONE_NEWIPC,    \
-		struct mnt_namespace *:    CLONE_NEWNS,     \
-		struct net *:              CLONE_NEWNET,    \
-		struct pid_namespace *:    CLONE_NEWPID,    \
-		struct time_namespace *:   CLONE_NEWTIME,   \
-		struct user_namespace *:   CLONE_NEWUSER,   \
-		struct uts_namespace *:    CLONE_NEWUTS)
 
 #define NS_COMMON_INIT(nsname, refs)							\
 {											\
-- 
cgit v1.2.3


From ea1549e628ec51dcbea1d158301993364b754d75 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:14 +0100
Subject: nstree: decouple from ns_common header

Forward declare struct ns_common and remove the include of ns_common.h.
We want ns_common.h to possibly include nstree structures but not the
other way around.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-2-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/nstree.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 38674c6fa4f7..25040a98a92b 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -3,7 +3,6 @@
 #ifndef _LINUX_NSTREE_H
 #define _LINUX_NSTREE_H
 
-#include <linux/ns_common.h>
 #include <linux/nsproxy.h>
 #include <linux/rbtree.h>
 #include <linux/seqlock.h>
@@ -11,6 +10,8 @@
 #include <linux/cookie.h>
 #include <uapi/linux/nsfs.h>
 
+struct ns_common;
+
 extern struct ns_tree cgroup_ns_tree;
 extern struct ns_tree ipc_ns_tree;
 extern struct ns_tree mnt_ns_tree;
-- 
cgit v1.2.3


From 1c64fb02ac46f5ca93ac9f5470f124921b4713b7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:15 +0100
Subject: nstree: move nstree types into separate header

Introduce two new fundamental data structures for namespace tree
management in a separate header file.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-3-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns/nstree_types.h | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/nstree.h          |  1 +
 2 files changed, 37 insertions(+)
 create mode 100644 include/linux/ns/nstree_types.h

(limited to 'include')

diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
new file mode 100644
index 000000000000..6ee0c39686f8
--- /dev/null
+++ b/include/linux/ns/nstree_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_NSTREE_TYPES_H
+#define _LINUX_NSTREE_TYPES_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+/**
+ * struct ns_tree_root - Root of a namespace tree
+ * @ns_rb: Red-black tree root for efficient lookups
+ * @ns_list_head: List head for sequential iteration
+ *
+ * Each namespace tree maintains both an rbtree (for O(log n) lookups)
+ * and a list (for efficient sequential iteration). The list is kept in
+ * the same sorted order as the rbtree.
+ */
+struct ns_tree_root {
+	struct rb_root ns_rb;
+	struct list_head ns_list_head;
+};
+
+/**
+ * struct ns_tree_node - Node in a namespace tree
+ * @ns_node: Red-black tree node
+ * @ns_list_entry: List entry for sequential iteration
+ *
+ * Represents a namespace's position in a tree. Each namespace has
+ * multiple tree nodes for different trees (unified, per-type, owner).
+ */
+struct ns_tree_node {
+	struct rb_node ns_node;
+	struct list_head ns_list_entry;
+};
+
+#endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 25040a98a92b..0e275df7e99a 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -3,6 +3,7 @@
 #ifndef _LINUX_NSTREE_H
 #define _LINUX_NSTREE_H
 
+#include <linux/ns/nstree_types.h>
 #include <linux/nsproxy.h>
 #include <linux/rbtree.h>
 #include <linux/seqlock.h>
-- 
cgit v1.2.3


From d12ea8062fd31f02beeeb76a7884ab9bc4f5b197 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:16 +0100
Subject: nstree: add helper to operate on struct ns_tree_{node,root}

Add helpers that operate on the combined rbtree and rculist.
This will make the code a lot more manageable and legible.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-4-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/nstree.h |  8 +++++
 kernel/nstree.c        | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

(limited to 'include')

diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 0e275df7e99a..98b848cf2f1c 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -22,6 +22,14 @@ extern struct ns_tree time_ns_tree;
 extern struct ns_tree user_ns_tree;
 extern struct ns_tree uts_ns_tree;
 
+void ns_tree_node_init(struct ns_tree_node *node);
+void ns_tree_root_init(struct ns_tree_root *root);
+bool ns_tree_node_empty(const struct ns_tree_node *node);
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+				  struct ns_tree_root *root,
+				  int (*cmp)(struct rb_node *, const struct rb_node *));
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
+
 #define to_ns_tree(__ns)					\
 	_Generic((__ns),					\
 		struct cgroup_namespace *: &(cgroup_ns_tree),	\
diff --git a/kernel/nstree.c b/kernel/nstree.c
index 97404fb90749..fe71ff943f70 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -73,6 +73,91 @@ struct ns_tree time_ns_tree = {
 	.type = CLONE_NEWTIME,
 };
 
+/**
+ * ns_tree_node_init - Initialize a namespace tree node
+ * @node: The node to initialize
+ *
+ * Initializes both the rbtree node and list entry.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+	RB_CLEAR_NODE(&node->ns_node);
+	INIT_LIST_HEAD(&node->ns_list_entry);
+}
+
+/**
+ * ns_tree_root_init - Initialize a namespace tree root
+ * @root: The root to initialize
+ *
+ * Initializes both the rbtree root and list head.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+	root->ns_rb = RB_ROOT;
+	INIT_LIST_HEAD(&root->ns_list_head);
+}
+
+/**
+ * ns_tree_node_empty - Check if a namespace tree node is empty
+ * @node: The node to check
+ *
+ * Returns true if the node is not in any tree.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+	return RB_EMPTY_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+				  struct ns_tree_root *root,
+				  int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node *ret, *prev;
+
+	/* Add to rbtree */
+	ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+
+	/* Add to list in sorted order */
+	prev = rb_prev(&node->ns_node);
+	if (!prev) {
+		/* No previous node, add at head */
+		list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+	} else {
+		/* Add after previous node */
+		struct ns_tree_node *prev_node;
+		prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+		list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+	}
+
+	return ret;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Removes the node from both the rbtree and the list atomically.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+	rb_erase(&node->ns_node, &root->ns_rb);
+	RB_CLEAR_NODE(&node->ns_node);
+	list_bidir_del_rcu(&node->ns_list_entry);
+}
+
 static inline struct ns_common *node_to_ns(const struct rb_node *node)
 {
 	if (!node)
-- 
cgit v1.2.3


From a657bc8a75cf40c3d0814fe6488ba4af56528f42 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:17 +0100
Subject: nstree: switch to new structures

Switch the nstree management to the new combined structures.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-5-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c                     |   2 +-
 include/linux/ns/ns_common_types.h |  27 ++---
 include/linux/ns/nstree_types.h    |  19 ++++
 include/linux/ns_common.h          |  27 +++--
 include/linux/nstree.h             |  26 ++---
 kernel/nscommon.c                  |  13 +--
 kernel/nstree.c                    | 199 ++++++++++++++-----------------------
 7 files changed, 136 insertions(+), 177 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index eded33eeb647..ad19530a13b2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -138,7 +138,7 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 
 	if (!node)
 		return NULL;
-	ns = rb_entry(node, struct ns_common, ns_tree_node);
+	ns = rb_entry(node, struct ns_common, ns_tree_node.ns_node);
 	return container_of(ns, struct mnt_namespace, ns);
 }
 
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
index ccd1d1e116f6..b332b019b29c 100644
--- a/include/linux/ns/ns_common_types.h
+++ b/include/linux/ns/ns_common_types.h
@@ -3,6 +3,7 @@
 #define _LINUX_NS_COMMON_TYPES_H
 
 #include <linux/atomic.h>
+#include <linux/ns/nstree_types.h>
 #include <linux/rbtree.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
@@ -98,6 +99,13 @@ extern const struct proc_ns_operations utsns_operations;
  * Initial namespaces:
  *   Boot-time namespaces (init_net, init_pid_ns, etc.) start with
  *   __ns_ref_active = 1 and remain active forever.
+ *
+ * @ns_type: type of namespace (e.g., CLONE_NEWNET)
+ * @stashed: cached dentry to be used by the vfs
+ * @ops: namespace operations
+ * @inum: namespace inode number (quickly recycled for non-initial namespaces)
+ * @__ns_ref: main reference count (do not use directly)
+ * @ns_tree: namespace tree nodes and active reference count
  */
 struct ns_common {
 	u32 ns_type;
@@ -106,24 +114,7 @@ struct ns_common {
 	unsigned int inum;
 	refcount_t __ns_ref; /* do not use directly */
 	union {
-		struct {
-			u64 ns_id;
-			struct /* global namespace rbtree and list */ {
-				struct rb_node ns_unified_tree_node;
-				struct list_head ns_unified_list_node;
-			};
-			struct /* per type rbtree and list */ {
-				struct rb_node ns_tree_node;
-				struct list_head ns_list_node;
-			};
-			struct /* namespace ownership rbtree and list */ {
-				struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */
-				struct list_head ns_owner; /* list of namespaces owned by this namespace */
-				struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */
-				struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */
-			};
-			atomic_t __ns_ref_active; /* do not use directly */
-		};
+		struct ns_tree;
 		struct rcu_head ns_rcu;
 	};
 };
diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
index 6ee0c39686f8..2fb28ee31efb 100644
--- a/include/linux/ns/nstree_types.h
+++ b/include/linux/ns/nstree_types.h
@@ -33,4 +33,23 @@ struct ns_tree_node {
 	struct list_head ns_list_entry;
 };
 
+/**
+ * struct ns_tree - Namespace tree nodes and active reference count
+ * @ns_id: Unique namespace identifier
+ * @__ns_ref_active: Active reference count (do not use directly)
+ * @ns_unified_node: Node in the global namespace tree
+ * @ns_tree_node: Node in the per-type namespace tree
+ * @ns_owner_node: Node in the owner namespace's tree of owned namespaces
+ * @ns_owner_root: Root of the tree of namespaces owned by this namespace
+ *                 (only used when this namespace is an owner)
+ */
+struct ns_tree {
+	u64 ns_id;
+	atomic_t __ns_ref_active;
+	struct ns_tree_node ns_unified_node;
+	struct ns_tree_node ns_tree_node;
+	struct ns_tree_node ns_owner_node;
+	struct ns_tree_root ns_owner_root;
+};
+
 #endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 6a4ca8c3b9c4..f90509ee0900 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -26,20 +26,19 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns)
 	return ns->ns_id <= NS_LAST_INIT_ID;
 }
 
-
-#define NS_COMMON_INIT(nsname, refs)							\
-{											\
-	.ns_type		= ns_common_type(&nsname),				\
-	.ns_id			= ns_init_id(&nsname),					\
-	.inum			= ns_init_inum(&nsname),				\
-	.ops			= to_ns_operations(&nsname),				\
-	.stashed		= NULL,							\
-	.__ns_ref		= REFCOUNT_INIT(refs),					\
-	.__ns_ref_active	= ATOMIC_INIT(1),					\
-	.ns_list_node		= LIST_HEAD_INIT(nsname.ns.ns_list_node),		\
-	.ns_owner_entry		= LIST_HEAD_INIT(nsname.ns.ns_owner_entry),		\
-	.ns_owner		= LIST_HEAD_INIT(nsname.ns.ns_owner),			\
-	.ns_unified_list_node	= LIST_HEAD_INIT(nsname.ns.ns_unified_list_node),	\
+#define NS_COMMON_INIT(nsname, refs)									\
+{													\
+	.ns_type			= ns_common_type(&nsname),					\
+	.ns_id				= ns_init_id(&nsname),						\
+	.inum				= ns_init_inum(&nsname),					\
+	.ops				= to_ns_operations(&nsname),					\
+	.stashed			= NULL,								\
+	.__ns_ref			= REFCOUNT_INIT(refs),						\
+	.__ns_ref_active		= ATOMIC_INIT(1),						\
+	.ns_unified_node.ns_list_entry	= LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry),	\
+	.ns_tree_node.ns_list_entry	= LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry),		\
+	.ns_owner_node.ns_list_entry	= LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry),	\
+	.ns_owner_root.ns_list_head	= LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head),		\
 }
 
 #define ns_common_init(__ns)                     \
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 98b848cf2f1c..175e4625bfa6 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -13,14 +13,14 @@
 
 struct ns_common;
 
-extern struct ns_tree cgroup_ns_tree;
-extern struct ns_tree ipc_ns_tree;
-extern struct ns_tree mnt_ns_tree;
-extern struct ns_tree net_ns_tree;
-extern struct ns_tree pid_ns_tree;
-extern struct ns_tree time_ns_tree;
-extern struct ns_tree user_ns_tree;
-extern struct ns_tree uts_ns_tree;
+extern struct ns_tree_root cgroup_ns_tree;
+extern struct ns_tree_root ipc_ns_tree;
+extern struct ns_tree_root mnt_ns_tree;
+extern struct ns_tree_root net_ns_tree;
+extern struct ns_tree_root pid_ns_tree;
+extern struct ns_tree_root time_ns_tree;
+extern struct ns_tree_root user_ns_tree;
+extern struct ns_tree_root uts_ns_tree;
 
 void ns_tree_node_init(struct ns_tree_node *node);
 void ns_tree_root_init(struct ns_tree_root *root);
@@ -46,14 +46,14 @@ void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
 			 (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
 
 u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
 struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
 struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
-					 struct ns_tree *ns_tree,
+					 struct ns_tree_root *ns_tree,
 					 bool previous);
 
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
 {
 	__ns_tree_gen_id(ns, id);
 	__ns_tree_add_raw(ns, ns_tree);
@@ -91,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree,
 #define ns_tree_adjoined_rcu(__ns, __previous) \
 	__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
 
-#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))
 
 #endif /* _LINUX_NSTREE_H */
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c910b979e433..88f70baccb75 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 
 #include <linux/ns_common.h>
+#include <linux/nstree.h>
 #include <linux/proc_ns.h>
 #include <linux/user_namespace.h>
 #include <linux/vfsdebug.h>
@@ -61,14 +62,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	ns->ops = ops;
 	ns->ns_id = 0;
 	ns->ns_type = ns_type;
-	RB_CLEAR_NODE(&ns->ns_tree_node);
-	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
-	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
-	INIT_LIST_HEAD(&ns->ns_list_node);
-	INIT_LIST_HEAD(&ns->ns_unified_list_node);
-	ns->ns_owner_tree = RB_ROOT;
-	INIT_LIST_HEAD(&ns->ns_owner);
-	INIT_LIST_HEAD(&ns->ns_owner_entry);
+	ns_tree_node_init(&ns->ns_tree_node);
+	ns_tree_node_init(&ns->ns_unified_node);
+	ns_tree_node_init(&ns->ns_owner_node);
+	ns_tree_root_init(&ns->ns_owner_root);
 
 #ifdef CONFIG_DEBUG_VFS
 	ns_debug(ns, ops);
diff --git a/kernel/nstree.c b/kernel/nstree.c
index fe71ff943f70..6c7ec9fbf25f 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -9,68 +9,51 @@
 #include <linux/user_namespace.h>
 
 static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
-static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */
-static LIST_HEAD(ns_unified_list); /* protected by ns_tree_lock */
 
-/**
- * struct ns_tree - Namespace tree
- * @ns_tree: Rbtree of namespaces of a particular type
- * @ns_list: Sequentially walkable list of all namespaces of this type
- * @type: type of namespaces in this tree
- */
-struct ns_tree {
-	struct rb_root ns_tree;
-	struct list_head ns_list;
-	int type;
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
 };
 
-struct ns_tree mnt_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
-	.type = CLONE_NEWNS,
+struct ns_tree_root mnt_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
 };
 
-struct ns_tree net_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
-	.type = CLONE_NEWNET,
+struct ns_tree_root net_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
 };
 EXPORT_SYMBOL_GPL(net_ns_tree);
 
-struct ns_tree uts_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
-	.type = CLONE_NEWUTS,
+struct ns_tree_root uts_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
 };
 
-struct ns_tree user_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
-	.type = CLONE_NEWUSER,
+struct ns_tree_root user_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
 };
 
-struct ns_tree ipc_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
-	.type = CLONE_NEWIPC,
+struct ns_tree_root ipc_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
 };
 
-struct ns_tree pid_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
-	.type = CLONE_NEWPID,
+struct ns_tree_root pid_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
 };
 
-struct ns_tree cgroup_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
-	.type = CLONE_NEWCGROUP,
+struct ns_tree_root cgroup_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
 };
 
-struct ns_tree time_ns_tree = {
-	.ns_tree = RB_ROOT,
-	.ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
-	.type = CLONE_NEWTIME,
+struct ns_tree_root time_ns_tree = {
+	.ns_rb = RB_ROOT,
+	.ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
 };
 
 /**
@@ -162,21 +145,21 @@ static inline struct ns_common *node_to_ns(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
-	return rb_entry(node, struct ns_common, ns_tree_node);
+	return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
 }
 
 static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
-	return rb_entry(node, struct ns_common, ns_unified_tree_node);
+	return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
 }
 
 static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
 {
 	if (!node)
 		return NULL;
-	return rb_entry(node, struct ns_common, ns_owner_tree_node);
+	return rb_entry(node, struct ns_common, ns_owner_node.ns_node);
 }
 
 static int ns_id_cmp(u64 id_a, u64 id_b)
@@ -203,35 +186,22 @@ static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
 	return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
 }
 
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
 {
-	struct rb_node *node, *prev;
+	struct rb_node *node;
 	const struct proc_ns_operations *ops = ns->ops;
 
 	VFS_WARN_ON_ONCE(!ns->ns_id);
-	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
 
 	write_seqlock(&ns_tree_lock);
 
-	node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
-	/*
-	 * If there's no previous entry simply add it after the
-	 * head and if there is add it after the previous entry.
-	 */
-	prev = rb_prev(&ns->ns_tree_node);
-	if (!prev)
-		list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
-	else
-		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+	/* Add to per-type tree and list */
+	node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
 
 	/* Add to unified tree and list */
-	rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified);
-	prev = rb_prev(&ns->ns_unified_tree_node);
-	if (!prev)
-		list_add_rcu(&ns->ns_unified_list_node, &ns_unified_list);
-	else
-		list_add_rcu(&ns->ns_unified_list_node, &node_to_ns_unified(prev)->ns_unified_list_node);
+	ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
 
+	/* Add to owner's tree if applicable */
 	if (ops) {
 		struct user_namespace *user_ns;
 
@@ -241,15 +211,8 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 			struct ns_common *owner = &user_ns->ns;
 			VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
 
-			/* Insert into owner's rbtree */
-			rb_find_add_rcu(&ns->ns_owner_tree_node, &owner->ns_owner_tree, ns_cmp_owner);
-
-			/* Insert into owner's list in sorted order */
-			prev = rb_prev(&ns->ns_owner_tree_node);
-			if (!prev)
-				list_add_rcu(&ns->ns_owner_entry, &owner->ns_owner);
-			else
-				list_add_rcu(&ns->ns_owner_entry, &node_to_ns_owner(prev)->ns_owner_entry);
+			/* Insert into owner's tree and list */
+			ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
 		} else {
 			/* Only the initial user namespace doesn't have an owner. */
 			VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
@@ -260,36 +223,29 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	VFS_WARN_ON_ONCE(node);
 }
 
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
 {
 	const struct proc_ns_operations *ops = ns->ops;
 	struct user_namespace *user_ns;
 
-	VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
-	VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
-	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+	VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+	VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
 
 	write_seqlock(&ns_tree_lock);
-	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
-	RB_CLEAR_NODE(&ns->ns_tree_node);
-
-	list_bidir_del_rcu(&ns->ns_list_node);
 
-	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
-	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
+	/* Remove from per-type tree and list */
+	ns_tree_node_del(&ns->ns_tree_node, ns_tree);
 
-	list_bidir_del_rcu(&ns->ns_unified_list_node);
+	/* Remove from unified tree and list */
+	ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
 
-	/* Remove from owner's rbtree if this namespace has an owner */
+	/* Remove from owner's tree if applicable */
 	if (ops) {
 		user_ns = ops->owner(ns);
 		if (user_ns) {
 			struct ns_common *owner = &user_ns->ns;
-			rb_erase(&ns->ns_owner_tree_node, &owner->ns_owner_tree);
-			RB_CLEAR_NODE(&ns->ns_owner_tree_node);
+			ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
 		}
-
-		list_bidir_del_rcu(&ns->ns_owner_entry);
 	}
 
 	write_sequnlock(&ns_tree_lock);
@@ -320,7 +276,7 @@ static int ns_find_unified(const void *key, const struct rb_node *node)
 	return 0;
 }
 
-static struct ns_tree *ns_tree_from_type(int ns_type)
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
 {
 	switch (ns_type) {
 	case CLONE_NEWCGROUP:
@@ -351,7 +307,7 @@ static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
 
 	do {
 		seq = read_seqbegin(&ns_tree_lock);
-		node = rb_find_rcu(&ns_id, &ns_unified_tree, ns_find_unified);
+		node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
 		if (node)
 			break;
 	} while (read_seqretry(&ns_tree_lock, seq));
@@ -361,7 +317,7 @@ static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
 
 static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
 {
-	struct ns_tree *ns_tree;
+	struct ns_tree_root *ns_tree;
 	struct rb_node *node;
 	unsigned int seq;
 
@@ -371,7 +327,7 @@ static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
 
 	do {
 		seq = read_seqbegin(&ns_tree_lock);
-		node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+		node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
 		if (node)
 			break;
 	} while (read_seqretry(&ns_tree_lock, seq));
@@ -399,22 +355,20 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
  * there is no next/previous namespace, -ENOENT is returned.
  */
 struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
-					 struct ns_tree *ns_tree, bool previous)
+					 struct ns_tree_root *ns_tree, bool previous)
 {
 	struct list_head *list;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
 
 	if (previous)
-		list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+		list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
 	else
-		list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
-	if (list_is_head(list, &ns_tree->ns_list))
+		list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+	if (list_is_head(list, &ns_tree->ns_list_head))
 		return ERR_PTR(-ENOENT);
 
-	VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
-
-	return list_entry_rcu(list, struct ns_common, ns_list_node);
+	return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
 }
 
 /**
@@ -508,7 +462,7 @@ static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
 	VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
 
 	read_seqlock_excl(&ns_tree_lock);
-	node = owner->ns_owner_tree.rb_node;
+	node = owner->ns_owner_root.ns_rb.rb_node;
 
 	while (node) {
 		struct ns_common *ns;
@@ -638,16 +592,15 @@ static ssize_t do_listns_userns(struct klistns *kls)
 	}
 
 	ret = 0;
-	head = &to_ns_common(kls->user_ns)->ns_owner;
+	head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
 	kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
 
 	rcu_read_lock();
 
 	if (!first_ns)
-		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry);
-
-	for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids;
-	     ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) {
+		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_node.ns_list_entry);
+	for (ns = first_ns; &ns->ns_owner_node.ns_list_entry != head && nr_ns_ids;
+	     ns = list_entry_rcu(ns->ns_owner_node.ns_list_entry.next, typeof(*ns), ns_owner_node.ns_list_entry)) {
 		struct ns_common *valid;
 
 		valid = legitimize_ns(kls, ns);
@@ -682,7 +635,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
 static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
 {
 	struct ns_common *ret = NULL;
-	struct ns_tree *ns_tree = NULL;
+	struct ns_tree_root *ns_tree = NULL;
 	struct rb_node *node;
 
 	if (ns_type) {
@@ -693,9 +646,9 @@ static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
 
 	read_seqlock_excl(&ns_tree_lock);
 	if (ns_tree)
-		node = ns_tree->ns_tree.rb_node;
+		node = ns_tree->ns_rb.rb_node;
 	else
-		node = ns_unified_tree.rb_node;
+		node = ns_unified_root.ns_rb.rb_node;
 
 	while (node) {
 		struct ns_common *ns;
@@ -725,28 +678,28 @@ static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
 }
 
 static inline struct ns_common *first_ns_common(const struct list_head *head,
-						struct ns_tree *ns_tree)
+						struct ns_tree_root *ns_tree)
 {
 	if (ns_tree)
-		return list_entry_rcu(head->next, struct ns_common, ns_list_node);
-	return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node);
+		return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry);
+	return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry);
 }
 
 static inline struct ns_common *next_ns_common(struct ns_common *ns,
-					       struct ns_tree *ns_tree)
+					       struct ns_tree_root *ns_tree)
 {
 	if (ns_tree)
-		return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node);
-	return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node);
+		return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry);
+	return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry);
 }
 
 static inline bool ns_common_is_head(struct ns_common *ns,
 				     const struct list_head *head,
-				     struct ns_tree *ns_tree)
+				     struct ns_tree_root *ns_tree)
 {
 	if (ns_tree)
-		return &ns->ns_list_node == head;
-	return &ns->ns_unified_list_node == head;
+		return &ns->ns_tree_node.ns_list_entry == head;
+	return &ns->ns_unified_node.ns_list_entry == head;
 }
 
 static ssize_t do_listns(struct klistns *kls)
@@ -754,7 +707,7 @@ static ssize_t do_listns(struct klistns *kls)
 	u64 __user *ns_ids = kls->uns_ids;
 	size_t nr_ns_ids = kls->nr_ns_ids;
 	struct ns_common *ns, *first_ns = NULL, *prev = NULL;
-	struct ns_tree *ns_tree = NULL;
+	struct ns_tree_root *ns_tree = NULL;
 	const struct list_head *head;
 	u32 ns_type;
 	ssize_t ret;
@@ -779,9 +732,9 @@ static ssize_t do_listns(struct klistns *kls)
 
 	ret = 0;
 	if (ns_tree)
-		head = &ns_tree->ns_list;
+		head = &ns_tree->ns_list_head;
 	else
-		head = &ns_unified_list;
+		head = &ns_unified_root.ns_list_head;
 
 	rcu_read_lock();
 
-- 
cgit v1.2.3


From ed93c0697a8dcb70972a77bca2522a6a23ba6658 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:20 +0100
Subject: ns: make is_initial_namespace() argument const

We don't modify the data structure at all so pass it as const.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-8-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f90509ee0900..7e4df96b7411 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 void __ns_common_free(struct ns_common *ns);
 struct ns_common *__must_check ns_owner(struct ns_common *ns);
 
-static __always_inline bool is_initial_namespace(struct ns_common *ns)
+static __always_inline bool is_initial_namespace(const struct ns_common *ns)
 {
 	VFS_WARN_ON_ONCE(ns->inum == 0);
 	return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
-- 
cgit v1.2.3


From 6bf253855aa8c970d2191f87ee23f9f184ddaa79 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:21 +0100
Subject: ns: rename is_initial_namespace()

Rename is_initial_namespace() to is_ns_init_inum() and make it symmetrical
with the ns id variant.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-9-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 2 +-
 kernel/nscommon.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 7e4df96b7411..b9e8f21a6984 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 void __ns_common_free(struct ns_common *ns);
 struct ns_common *__must_check ns_owner(struct ns_common *ns);
 
-static __always_inline bool is_initial_namespace(const struct ns_common *ns)
+static __always_inline bool is_ns_init_inum(const struct ns_common *ns)
 {
 	VFS_WARN_ON_ONCE(ns->inum == 0);
 	return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 88f70baccb75..bdc3c86231d3 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -82,7 +82,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	 * active use (installed in nsproxy) and decremented when all
 	 * active uses are gone. Initial namespaces are always active.
 	 */
-	if (is_initial_namespace(ns))
+	if (is_ns_init_inum(ns))
 		atomic_set(&ns->__ns_ref_active, 1);
 	else
 		atomic_set(&ns->__ns_ref_active, 0);
-- 
cgit v1.2.3


From 657aeb436d70c66583cb2b5b6c65ca64bcf503a8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:24 +0100
Subject: ns: make all reference counts on initial namespace a nop

They are always active so no need to needlessly cacheline ping-pong.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-12-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index b9e8f21a6984..5b8f2f0163d7 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -62,6 +62,8 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com
 
 static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 {
+	if (is_ns_init_id(ns))
+		return false;
 	if (refcount_dec_and_test(&ns->__ns_ref)) {
 		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
 		return true;
@@ -71,6 +73,8 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 
 static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
 {
+	if (is_ns_init_id(ns))
+		return true;
 	if (refcount_inc_not_zero(&ns->__ns_ref))
 		return true;
 	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
@@ -82,12 +86,27 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns
 	return refcount_read(&ns->__ns_ref);
 }
 
+static __always_inline void __ns_ref_inc(struct ns_common *ns)
+{
+	if (is_ns_init_id(ns))
+		return;
+	refcount_inc(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
+							       spinlock_t *ns_lock)
+{
+	if (is_ns_init_id(ns))
+		return false;
+	return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
+}
+
 #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
-#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
+#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns)))
 #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
 #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-#define ns_ref_put_and_lock(__ns, __lock) \
-	refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+#define ns_ref_put_and_lock(__ns, __ns_lock) \
+	__ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock)
 
 #define ns_ref_active_read(__ns) \
 	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
-- 
cgit v1.2.3


From 2b60d56acc5b4fcab29fc323e6b82597ec78596f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:25 +0100
Subject: ns: add asserts for initial namespace reference counts

They always remain fixed at one. Notice when that assumption is broken.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-13-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 5b8f2f0163d7..dfb6b798ba82 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -60,10 +60,17 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com
 	return atomic_read(&ns->__ns_ref_active);
 }
 
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+	return refcount_read(&ns->__ns_ref);
+}
+
 static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 {
-	if (is_ns_init_id(ns))
+	if (is_ns_init_id(ns)) {
+		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
 		return false;
+	}
 	if (refcount_dec_and_test(&ns->__ns_ref)) {
 		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
 		return true;
@@ -73,31 +80,32 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 
 static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
 {
-	if (is_ns_init_id(ns))
+	if (is_ns_init_id(ns)) {
+		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
 		return true;
+	}
 	if (refcount_inc_not_zero(&ns->__ns_ref))
 		return true;
 	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
 	return false;
 }
 
-static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
-{
-	return refcount_read(&ns->__ns_ref);
-}
-
 static __always_inline void __ns_ref_inc(struct ns_common *ns)
 {
-	if (is_ns_init_id(ns))
+	if (is_ns_init_id(ns)) {
+		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
 		return;
+	}
 	refcount_inc(&ns->__ns_ref);
 }
 
 static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
 							       spinlock_t *ns_lock)
 {
-	if (is_ns_init_id(ns))
+	if (is_ns_init_id(ns)) {
+		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
 		return false;
+	}
 	return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
 }
 
-- 
cgit v1.2.3


From 7118daabb65585163fd70eb782f1fbbdb64968a6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:26 +0100
Subject: ns: add asserts for initial namespace active reference counts

They always remain fixed at one. Notice when that assumption is broken.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-14-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index dfb6b798ba82..43f709ab846a 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -69,6 +69,7 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
 {
 	if (is_ns_init_id(ns)) {
 		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
 		return false;
 	}
 	if (refcount_dec_and_test(&ns->__ns_ref)) {
@@ -82,6 +83,7 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
 {
 	if (is_ns_init_id(ns)) {
 		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
 		return true;
 	}
 	if (refcount_inc_not_zero(&ns->__ns_ref))
@@ -94,6 +96,7 @@ static __always_inline void __ns_ref_inc(struct ns_common *ns)
 {
 	if (is_ns_init_id(ns)) {
 		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
 		return;
 	}
 	refcount_inc(&ns->__ns_ref);
@@ -104,6 +107,7 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common
 {
 	if (is_ns_init_id(ns)) {
 		VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
 		return false;
 	}
 	return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
-- 
cgit v1.2.3


From 282879afa01936954a570e15b4088a89b6e1b549 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:27 +0100
Subject: pid: rely on common reference count behavior

Now that we changed the generic reference counting mechanism for all
namespaces to never manipulate reference counts of initial namespaces we
can drop the special handling for pid namespaces.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-15-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid_namespace.h | 3 +--
 kernel/pid_namespace.c        | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 445517a72ad0..0e7ae12c96d2 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
 
 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 {
-	if (ns != &init_pid_ns)
-		ns_ref_inc(ns);
+	ns_ref_inc(ns);
 	return ns;
 }
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 650be58d8d18..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
 
 void put_pid_ns(struct pid_namespace *ns)
 {
-	if (ns && ns != &init_pid_ns && ns_ref_put(ns))
+	if (ns && ns_ref_put(ns))
 		schedule_work(&ns->work);
 }
 EXPORT_SYMBOL_GPL(put_pid_ns);
-- 
cgit v1.2.3


From c2bbd2db521b018c59fb0ff8e1cdfa8ee907ba88 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Nov 2025 16:08:28 +0100
Subject: ns: drop custom reference count initialization for initial namespaces

Initial namespaces don't modify their reference count anymore.
They remain fixed at one so drop the custom refcount initializations.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-16-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c            | 2 +-
 include/linux/ns_common.h | 4 ++--
 init/version-timestamp.c  | 2 +-
 ipc/msgutil.c             | 2 +-
 kernel/cgroup/cgroup.c    | 2 +-
 kernel/pid.c              | 2 +-
 kernel/time/namespace.c   | 2 +-
 kernel/user.c             | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index efaff8680eaf..25289b869be1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5986,7 +5986,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 }
 
 struct mnt_namespace init_mnt_ns = {
-	.ns		= NS_COMMON_INIT(init_mnt_ns, 1),
+	.ns		= NS_COMMON_INIT(init_mnt_ns),
 	.user_ns	= &init_user_ns,
 	.passive	= REFCOUNT_INIT(1),
 	.mounts		= RB_ROOT,
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 43f709ab846a..136f6a322e53 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -26,14 +26,14 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns)
 	return ns->ns_id <= NS_LAST_INIT_ID;
 }
 
-#define NS_COMMON_INIT(nsname, refs)									\
+#define NS_COMMON_INIT(nsname)										\
 {													\
 	.ns_type			= ns_common_type(&nsname),					\
 	.ns_id				= ns_init_id(&nsname),						\
 	.inum				= ns_init_inum(&nsname),					\
 	.ops				= to_ns_operations(&nsname),					\
 	.stashed			= NULL,								\
-	.__ns_ref			= REFCOUNT_INIT(refs),						\
+	.__ns_ref			= REFCOUNT_INIT(1),						\
 	.__ns_ref_active		= ATOMIC_INIT(1),						\
 	.ns_unified_node.ns_list_entry	= LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry),	\
 	.ns_tree_node.ns_list_entry	= LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry),		\
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
index 56ded64fdfe4..375726e05f69 100644
--- a/init/version-timestamp.c
+++ b/init/version-timestamp.c
@@ -8,7 +8,7 @@
 #include <linux/utsname.h>
 
 struct uts_namespace init_uts_ns = {
-	.ns = NS_COMMON_INIT(init_uts_ns, 2),
+	.ns = NS_COMMON_INIT(init_uts_ns),
 	.name = {
 		.sysname	= UTS_SYSNAME,
 		.nodename	= UTS_NODENAME,
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 55a908ec0674..e28f0cecb2ec 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock);
  * and not CONFIG_IPC_NS.
  */
 struct ipc_namespace init_ipc_ns = {
-	.ns = NS_COMMON_INIT(init_ipc_ns, 1),
+	.ns = NS_COMMON_INIT(init_ipc_ns),
 	.user_ns = &init_user_ns,
 };
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 20ab84b2cf4e..2bf3951ca88f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -250,7 +250,7 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-	.ns		= NS_COMMON_INIT(init_cgroup_ns, 2),
+	.ns		= NS_COMMON_INIT(init_cgroup_ns),
 	.user_ns	= &init_user_ns,
 	.root_cset	= &init_css_set,
 };
diff --git a/kernel/pid.c b/kernel/pid.c
index a5a63dc0a491..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT;
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.ns = NS_COMMON_INIT(init_pid_ns, 2),
+	.ns = NS_COMMON_INIT(init_pid_ns),
 	.idr = IDR_INIT(init_pid_ns.idr),
 	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 19911f88e2b8..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -478,7 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = {
 };
 
 struct time_namespace init_time_ns = {
-	.ns		= NS_COMMON_INIT(init_time_ns, 3),
+	.ns		= NS_COMMON_INIT(init_time_ns),
 	.user_ns	= &init_user_ns,
 	.frozen_offsets	= true,
 };
diff --git a/kernel/user.c b/kernel/user.c
index 4b3132e786d9..7aef4e679a6a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
  * and 1 for... ?
  */
 struct user_namespace init_user_ns = {
-	.ns = NS_COMMON_INIT(init_user_ns, 3),
+	.ns = NS_COMMON_INIT(init_user_ns),
 	.uid_map = {
 		{
 			.extent[0] = {
-- 
cgit v1.2.3


From 8da7bea7db692e786165b71729fb68b7ff65ee56 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 31 Oct 2025 18:33:28 +0800
Subject: xsk: add indirect call for xsk_destruct_skb

Eric proposed adding indirect call wrappers for UDP and saw a huge
improvement[1]; the same approach can also be applied to the xsk
scenario.

This patch adds an indirect call for xsk and helps current copy mode
improve the performance by around 1% stably which was observed with
IXGBE at 10Gb/sec loaded. If the throughput grows, the positive effect
will be magnified. I applied this patch on top of batch xmit series[2],
and was able to see <5% improvement from our internal application
which is a little bit unstable though.

Use INDIRECT wrappers to keep xsk_destruct_skb static as it used to
be when the mitigation config is off.

Be aware of the freeing path that can be very hot since the frequency
can reach around 2,000,000 times per second with the xdpsock test.

[1]: https://lore.kernel.org/netdev/20251006193103.2684156-2-edumazet@google.com/
[2]: https://lore.kernel.org/all/20251021131209.41491-1-kerneljasonxing@gmail.com/

Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251031103328.95468-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/xdp_sock.h | 7 +++++++
 net/core/skbuff.c      | 8 +++++---
 net/xdp/xsk.c          | 3 ++-
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce587a225661..23e8861e8b25 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -125,6 +125,7 @@ struct xsk_tx_metadata_ops {
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(struct list_head *flush_list);
+INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
 
 /**
  *  xsk_tx_metadata_to_compl - Save enough relevant metadata information
@@ -218,6 +219,12 @@ static inline void __xsk_map_flush(struct list_head *flush_list)
 {
 }
 
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static inline void xsk_destruct_skb(struct sk_buff *skb)
+{
+}
+#endif
+
 static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
 					    struct xsk_tx_metadata_compl *compl)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d95658b738d1..4f4d7ab7057f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -81,6 +81,7 @@
 #include <net/page_pool/helpers.h>
 #include <net/psp/types.h>
 #include <net/dropreason.h>
+#include <net/xdp_sock.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -1140,12 +1141,13 @@ void skb_release_head_state(struct sk_buff *skb)
 	if (skb->destructor) {
 		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
 #ifdef CONFIG_INET
-		INDIRECT_CALL_3(skb->destructor,
+		INDIRECT_CALL_4(skb->destructor,
 				tcp_wfree, __sock_wfree, sock_wfree,
+				xsk_destruct_skb,
 				skb);
 #else
-		INDIRECT_CALL_1(skb->destructor,
-				sock_wfree,
+		INDIRECT_CALL_2(skb->destructor,
+				sock_wfree, xsk_destruct_skb,
 				skb);
 
 #endif
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ed8b612ec29d..bcfd400e9cf8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -602,7 +602,8 @@ static u32 xsk_get_num_desc(struct sk_buff *skb)
 	return XSKCB(skb)->num_descs;
 }
 
-static void xsk_destruct_skb(struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+void xsk_destruct_skb(struct sk_buff *skb)
 {
 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
 
-- 
cgit v1.2.3


From dca3aa666fbd71118905d88bb1c353881002b647 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sun, 9 Nov 2025 13:19:31 +0100
Subject: fs: move inode fields used during fast path lookup closer together

This should avoid *some* cache misses.

Successful path lookup is guaranteed to load at least ->i_mode,
->i_opflags and ->i_acl. At the same time the common case will avoid
looking at more fields.

struct inode is not guaranteed to have any particular alignment, notably
ext4 has it only aligned to 8 bytes meaning nearby fields might happen
to be on the same or only adjacent cache lines depending on luck (or no
luck).

According to pahole:
        umode_t                    i_mode;               /*     0     2 */
        short unsigned int         i_opflags;            /*     2     2 */
        kuid_t                     i_uid;                /*     4     4 */
        kgid_t                     i_gid;                /*     8     4 */
        unsigned int               i_flags;              /*    12     4 */
        struct posix_acl *         i_acl;                /*    16     8 */
        struct posix_acl *         i_default_acl;        /*    24     8 */

->i_acl is unnecessarily separated by 8 bytes from the other fields.
With struct inode being offset 48 bytes into the cacheline this means an
avoidable miss. Note it will still be there for the 56 byte case.

New layout:
        umode_t                    i_mode;               /*     0     2 */
        short unsigned int         i_opflags;            /*     2     2 */
        unsigned int               i_flags;              /*     4     4 */
        struct posix_acl *         i_acl;                /*     8     8 */
        struct posix_acl *         i_default_acl;        /*    16     8 */
        kuid_t                     i_uid;                /*    24     4 */
        kgid_t                     i_gid;                /*    28     4 */

I verified with pahole there are no size or hole changes.

This is stopgap until someone(tm) sanitizes the layout in the first
place, allocation methods aside.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251109121931.1285366-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0c0095b2b60..64dc2e2c281f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -781,14 +781,13 @@ enum inode_state_flags_t {
 struct inode {
 	umode_t			i_mode;
 	unsigned short		i_opflags;
-	kuid_t			i_uid;
-	kgid_t			i_gid;
 	unsigned int		i_flags;
-
 #ifdef CONFIG_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
 #endif
+	kuid_t			i_uid;
+	kgid_t			i_gid;
 
 	const struct inode_operations	*i_op;
 	struct super_block	*i_sb;
-- 
cgit v1.2.3


From e18efacc9c2f17b12c6e019cabad70a2989bd3a9 Mon Sep 17 00:00:00 2001
From: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Date: Mon, 10 Nov 2025 14:10:29 +0200
Subject: wifi: cfg80211/mac80211: clean up duplicate ap_power handling

Move duplicated ap_power type handling code to an inline
function in cfg80211.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110140806.959948da1cb5.I893b5168329fb3232f249c182a35c99804112da6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 24 ++++++++++++++++++++++++
 net/mac80211/mlme.c    | 20 +-------------------
 net/wireless/scan.c    | 15 +--------------
 3 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d87c18e1b133..1b257eaf8de5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -10134,6 +10134,30 @@ static inline int cfg80211_color_change_notify(struct net_device *dev,
 					 0, 0, link_id);
 }
 
+/**
+ * cfg80211_6ghz_power_type - determine AP regulatory power type
+ * @control: control flags
+ *
+ * Return: regulatory power type from &enum ieee80211_ap_reg_power
+ */
+static inline enum ieee80211_ap_reg_power
+cfg80211_6ghz_power_type(u8 control)
+{
+	switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
+	case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
+		return IEEE80211_REG_LPI_AP;
+	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
+		return IEEE80211_REG_SP_AP;
+	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
+		return IEEE80211_REG_VLP_AP;
+	default:
+		return IEEE80211_REG_UNSET_AP;
+	}
+}
+
 /**
  * cfg80211_links_removed - Notify about removed STA MLD setup links.
  * @dev: network device.
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 804c3a95b7c6..64230696f3e4 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6034,24 +6034,6 @@ ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata,
 			       conn->bw_limit, tmp.bw_limit);
 }
 
-static enum ieee80211_ap_reg_power
-ieee80211_ap_power_type(u8 control)
-{
-	switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
-	case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
-		return IEEE80211_REG_LPI_AP;
-	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
-		return IEEE80211_REG_SP_AP;
-	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
-		return IEEE80211_REG_VLP_AP;
-	default:
-		return IEEE80211_REG_UNSET_AP;
-	}
-}
-
 static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_link_data *link,
 				  int link_id,
@@ -6094,7 +6076,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 		he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation);
 		if (he_6ghz_oper)
 			link->conf->power_type =
-				ieee80211_ap_power_type(he_6ghz_oper->control);
+				cfg80211_6ghz_power_type(he_6ghz_oper->control);
 		else
 			link_info(link,
 				  "HE 6 GHz operation missing (on %d MHz), expect issues\n",
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 90a9187a6b13..68c4130d602f 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2230,20 +2230,7 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len)
 	if (!he_6ghz_oper)
 		return IEEE80211_REG_UNSET_AP;
 
-	switch (u8_get_bits(he_6ghz_oper->control,
-			    IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
-	case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
-		return IEEE80211_REG_LPI_AP;
-	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
-		return IEEE80211_REG_SP_AP;
-	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
-		return IEEE80211_REG_VLP_AP;
-	default:
-		return IEEE80211_REG_UNSET_AP;
-	}
+	return cfg80211_6ghz_power_type(he_6ghz_oper->control);
 }
 
 static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len,
-- 
cgit v1.2.3


From b54cf0f4495a8f3fa94245cdda7716792400299e Mon Sep 17 00:00:00 2001
From: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Date: Mon, 10 Nov 2025 14:10:30 +0200
Subject: wifi: cfg80211/mac80211: Add fallback mechanism for INDOOR_SP
 connection

Implement fallback to LPI mode when SP mode is not permitted
by regulatory constraints for INDOOR_SP connections.
Limit fallback mechanism to client mode.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110140806.8b43201a34ae.I37fc7bb5892eb9d044d619802e8f2095fde6b296@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 9 +++++++--
 net/mac80211/mlme.c    | 3 ++-
 net/wireless/core.h    | 3 ++-
 net/wireless/nl80211.c | 4 ++--
 net/wireless/scan.c    | 7 ++++---
 5 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1b257eaf8de5..625cb2c78361 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -10137,22 +10137,27 @@ static inline int cfg80211_color_change_notify(struct net_device *dev,
 /**
  * cfg80211_6ghz_power_type - determine AP regulatory power type
  * @control: control flags
+ * @client_flags: &enum ieee80211_channel_flags for station mode to enable
+ *	SP to LPI fallback, zero otherwise.
  *
  * Return: regulatory power type from &enum ieee80211_ap_reg_power
  */
 static inline enum ieee80211_ap_reg_power
-cfg80211_6ghz_power_type(u8 control)
+cfg80211_6ghz_power_type(u8 control, u32 client_flags)
 {
 	switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
 	case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
 		return IEEE80211_REG_LPI_AP;
 	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
-	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
 		return IEEE80211_REG_SP_AP;
 	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
 		return IEEE80211_REG_VLP_AP;
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
+		if (client_flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)
+			return IEEE80211_REG_LPI_AP;
+		return IEEE80211_REG_SP_AP;
 	default:
 		return IEEE80211_REG_UNSET_AP;
 	}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 64230696f3e4..c705d3f45aff 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6076,7 +6076,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 		he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation);
 		if (he_6ghz_oper)
 			link->conf->power_type =
-				cfg80211_6ghz_power_type(he_6ghz_oper->control);
+				cfg80211_6ghz_power_type(he_6ghz_oper->control,
+							 cbss->channel->flags);
 		else
 			link_info(link,
 				  "HE 6 GHz operation missing (on %d MHz), expect issues\n",
diff --git a/net/wireless/core.h b/net/wireless/core.h
index b6bd7f4d6385..82f343663e8f 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -550,7 +550,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 		    bool signal_valid, unsigned long ts);
 
 enum ieee80211_ap_reg_power
-cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len);
+cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len,
+			     u32 client_flags);
 
 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
 #define CFG80211_DEV_WARN_ON(cond)	WARN_ON(cond)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2187e148389d..29c92bc8291b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6748,7 +6748,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	beacon_check.relax = true;
 	beacon_check.reg_power =
 		cfg80211_get_6ghz_power_type(params->beacon.tail,
-					     params->beacon.tail_len);
+					     params->beacon.tail_len, 0);
 	if (!cfg80211_reg_check_beaconing(&rdev->wiphy, &params->chandef,
 					  &beacon_check)) {
 		err = -EINVAL;
@@ -6927,7 +6927,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	beacon_check.relax = true;
 	beacon_check.reg_power =
 		cfg80211_get_6ghz_power_type(params->beacon.tail,
-					     params->beacon.tail_len);
+					     params->beacon.tail_len, 0);
 	if (!cfg80211_reg_check_beaconing(&rdev->wiphy,
 					  &wdev->links[link_id].ap.chandef,
 					  &beacon_check)) {
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 68c4130d602f..7546647752fd 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2212,7 +2212,8 @@ struct cfg80211_inform_single_bss_data {
 };
 
 enum ieee80211_ap_reg_power
-cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len)
+cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len,
+			     u32 client_flags)
 {
 	const struct ieee80211_he_6ghz_oper *he_6ghz_oper;
 	struct ieee80211_he_operation *he_oper;
@@ -2230,13 +2231,13 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len)
 	if (!he_6ghz_oper)
 		return IEEE80211_REG_UNSET_AP;
 
-	return cfg80211_6ghz_power_type(he_6ghz_oper->control);
+	return cfg80211_6ghz_power_type(he_6ghz_oper->control, client_flags);
 }
 
 static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len,
 					   const u32 flags)
 {
-	switch (cfg80211_get_6ghz_power_type(elems, elems_len)) {
+	switch (cfg80211_get_6ghz_power_type(elems, elems_len, flags)) {
 	case IEEE80211_REG_LPI_AP:
 		return true;
 	case IEEE80211_REG_SP_AP:
-- 
cgit v1.2.3


From f5a538c07df26f5c601e41f7b9c7ade3e1e75803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Oct 2025 13:54:24 +0100
Subject: sched/deadline: Fix dl_server stop condition

Gabriele reported that the dl_server doesn't stop as expected.

The problem was found to be the fact that idle time and fair runtime are
treated equally. Both will count towards dl_server runtime and push the
activation forwards when it is in the zero-laxity wait state.

Notably:

  dl_server_update_idle()
    update_curr_dl_se()
      if (dl_defer && dl_throttled && dl_runtime_exceeded())
        hrtimer_try_to_cancel(); // stop timer
	replenish_dl_new_period()
	  deadline = now + dl_deadline; // fwd period
	  runtime = dl_runtime;
        start_dl_timer(); // restart timer

And while we do want idle time accounted towards the *current* activation of
the dl_server -- after all, a fair task could've run if we had any -- we don't
necessarily want idle time to cause or push forward an activation.

Introduce dl_defer_idle to make this distinction. It will be set once idle time
has pushed the activation forward; once set, idle time will only be allowed to
consume runtime but not to push the activation. This will then cause
dl_server_timer() to fire, which will stop the dl_server.

Any non-idle time accounting during this phase will clear dl_defer_idle, so
only a full period of idle will cause the dl_server to stop.

Reported-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251101000057.GA2184199@noisy.programming.kicks-ass.net
---
 include/linux/sched.h   | 15 +++++++++------
 kernel/sched/deadline.c | 40 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07576479c0ed..bb436ee1942d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -685,20 +685,22 @@ struct sched_dl_entity {
 	 *
 	 * @dl_server tells if this is a server entity.
 	 *
-	 * @dl_defer tells if this is a deferred or regular server. For
-	 * now only defer server exists.
-	 *
-	 * @dl_defer_armed tells if the deferrable server is waiting
-	 * for the replenishment timer to activate it.
-	 *
 	 * @dl_server_active tells if the dlserver is active(started).
 	 * dlserver is started on first cfs enqueue on an idle runqueue
 	 * and is stopped when a dequeue results in 0 cfs tasks on the
 	 * runqueue. In other words, dlserver is active only when cpu's
 	 * runqueue has atleast one cfs task.
 	 *
+	 * @dl_defer tells if this is a deferred or regular server. For
+	 * now only defer server exists.
+	 *
+	 * @dl_defer_armed tells if the deferrable server is waiting
+	 * for the replenishment timer to activate it.
+	 *
 	 * @dl_defer_running tells if the deferrable server is actually
 	 * running, skipping the defer phase.
+	 *
+	 * @dl_defer_idle tracks idle state
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_yielded        : 1;
@@ -709,6 +711,7 @@ struct sched_dl_entity {
 	unsigned int			dl_defer	  : 1;
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
+	unsigned int			dl_defer_idle     : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ece25caf379c..8307f24b8900 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1173,6 +1173,11 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
 		 */
 		rq->donor->sched_class->update_curr(rq);
 
+		if (dl_se->dl_defer_idle) {
+			dl_server_stop(dl_se);
+			return HRTIMER_NORESTART;
+		}
+
 		if (dl_se->dl_defer_armed) {
 			/*
 			 * First check if the server could consume runtime in background.
@@ -1420,10 +1425,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta
 }
 
 static inline void
-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
-			int flags);
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags);
+
 static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
 {
+	bool idle = rq->curr == rq->idle;
 	s64 scaled_delta_exec;
 
 	if (unlikely(delta_exec <= 0)) {
@@ -1444,6 +1450,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 
 	dl_se->runtime -= scaled_delta_exec;
 
+	if (dl_se->dl_defer_idle && !idle)
+		dl_se->dl_defer_idle = 0;
+
 	/*
 	 * The fair server can consume its runtime while throttled (not queued/
 	 * running as regular CFS).
@@ -1453,6 +1462,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 	 * starting a new period, pushing the activation.
 	 */
 	if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
+		/*
+		 * Non-servers would never get time accounted while throttled.
+		 */
+		WARN_ON_ONCE(!dl_server(dl_se));
+
+		/*
+		 * While the server is marked idle, do not push out the
+		 * activation further, instead wait for the period timer
+		 * to lapse and stop the server.
+		 */
+		if (dl_se->dl_defer_idle && idle) {
+			/*
+			 * The timer is at the zero-laxity point, this means
+			 * dl_server_stop() / dl_server_start() can happen
+			 * while now < deadline. This means update_dl_entity()
+			 * will not replenish. Additionally start_dl_timer()
+			 * will be set for 'deadline - runtime'. Negative
+			 * runtime will not do.
+			 */
+			dl_se->runtime = 0;
+			return;
+		}
+
 		/*
 		 * If the server was previously activated - the starving condition
 		 * took place, it this point it went away because the fair scheduler
@@ -1465,6 +1497,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 
 		replenish_dl_new_period(dl_se, dl_se->rq);
 
+		if (idle)
+			dl_se->dl_defer_idle = 1;
+
 		/*
 		 * Not being able to start the timer seems problematic. If it could not
 		 * be started for whatever reason, we need to "unthrottle" the DL server
@@ -1590,6 +1625,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
 	hrtimer_try_to_cancel(&dl_se->dl_timer);
 	dl_se->dl_defer_armed = 0;
 	dl_se->dl_throttled = 0;
+	dl_se->dl_defer_idle = 0;
 	dl_se->dl_server_active = 0;
 }
 
-- 
cgit v1.2.3


From 0e535824d0bcf7c9bb0532d902283c31c78cd6f3 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 7 Nov 2025 23:04:02 -0800
Subject: devlink: Introduce switchdev_inactive eswitch mode

Adds DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE attribute to UAPI and
documentation.

Before having traffic flow through an eswitch, a user may want to have the
ability to block traffic towards the FDB until FDB is fully programmed and
the user is ready to send traffic to it. For example: when two eswitches
are present for vports in a multi-PF setup, one eswitch may take over the
traffic from the other when the user chooses.
Before this takeover, a user may want to first program the inactive
eswitch and then once ready redirect traffic to this new eswitch.

switchdev modes transition semantics:

legacy->switchdev_inactive: Create switchdev mode normally, traffic not
  allowed to flow yet.

switchdev_inactive->switchdev: Enable traffic to flow.

switchdev->switchdev_inactive: Block traffic on the FDB, FDB and
  representors state and content are preserved.

When eswitch is configured to this mode, traffic is ignored/dropped on
this eswitch FDB, while current configuration is kept, e.g. FDB rules and
netdev representors are kept available, and FDB programming is allowed.

Example:
 # start inactive switchdev
devlink dev eswitch set pci/0000:08:00.1 mode switchdev_inactive
 # setup TC rules, representors etc ..
 # activate
devlink dev eswitch set pci/0000:08:00.1 mode switchdev

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251108070404.1551708-2-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/devlink.yaml                  |  2 ++
 Documentation/networking/devlink/devlink-eswitch-attr.rst | 13 +++++++++++++
 include/uapi/linux/devlink.h                              |  1 +
 net/devlink/netlink_gen.c                                 |  2 +-
 4 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index 3db59c965869..426d5aa7d955 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -99,6 +99,8 @@ definitions:
         name: legacy
       -
         name: switchdev
+      -
+        name: switchdev-inactive
   -
     type: enum
     name: eswitch-inline-mode
diff --git a/Documentation/networking/devlink/devlink-eswitch-attr.rst b/Documentation/networking/devlink/devlink-eswitch-attr.rst
index 08bb39ab1528..eafe09abc40c 100644
--- a/Documentation/networking/devlink/devlink-eswitch-attr.rst
+++ b/Documentation/networking/devlink/devlink-eswitch-attr.rst
@@ -39,6 +39,10 @@ The following is a list of E-Switch attributes.
          rules.
        * ``switchdev`` allows for more advanced offloading capabilities of
          the E-Switch to hardware.
+       * ``switchdev_inactive`` switchdev mode but starts inactive, doesn't allow traffic
+         until explicitly activated. This mode is useful for orchestrators that
+         want to prepare the device in switchdev mode but only activate it when
+         all configurations are done.
    * - ``inline-mode``
      - enum
      - Some HWs need the VF driver to put part of the packet
@@ -74,3 +78,12 @@ Example Usage
 
     # enable encap-mode with legacy mode
     $ devlink dev eswitch set pci/0000:08:00.0 mode legacy inline-mode none encap-mode basic
+
+    # start switchdev mode in inactive state
+    $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev_inactive
+
+    # setup switchdev configurations, representors, FDB entries, etc..
+    ...
+
+    # activate switchdev mode to allow traffic
+    $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index bcad11a787a5..157f11d3fb72 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -181,6 +181,7 @@ enum devlink_sb_threshold_type {
 enum devlink_eswitch_mode {
 	DEVLINK_ESWITCH_MODE_LEGACY,
 	DEVLINK_ESWITCH_MODE_SWITCHDEV,
+	DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE,
 };
 
 enum devlink_eswitch_inline_mode {
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index 9fd00977d59e..5ad435aee29d 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -229,7 +229,7 @@ static const struct nla_policy devlink_eswitch_get_nl_policy[DEVLINK_ATTR_DEV_NA
 static const struct nla_policy devlink_eswitch_set_nl_policy[DEVLINK_ATTR_ESWITCH_ENCAP_MODE + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
-	[DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 1),
+	[DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 2),
 	[DEVLINK_ATTR_ESWITCH_INLINE_MODE] = NLA_POLICY_MAX(NLA_U8, 3),
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = NLA_POLICY_MAX(NLA_U8, 1),
 };
-- 
cgit v1.2.3


From 9da611df15aa8d519f9947b88a5c733267cba888 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 7 Nov 2025 23:04:04 -0800
Subject: net/mlx5: E-Switch, support eswitch inactive mode

Add support for eswitch switchdev inactive mode.

Inactive mode: Drop all traffic going to FDB, remove
MPFS L2 rules and disconnect adjacent vports.

Active mode: Traffic flows through FDB, MPFS table is populated, and
adjacent vports are connected.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Adithya Jayachandran <ajayachandra@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251108070404.1551708-4-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../ethernet/mellanox/mlx5/core/esw/adj_vport.c    |  15 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   6 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 207 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c |   2 +-
 include/linux/mlx5/fs.h                            |   1 +
 6 files changed, 214 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c
index 0091ba697bae..250af09b5af2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c
@@ -4,13 +4,8 @@
 #include "fs_core.h"
 #include "eswitch.h"
 
-enum {
-	MLX5_ADJ_VPORT_DISCONNECT = 0x0,
-	MLX5_ADJ_VPORT_CONNECT = 0x1,
-};
-
-static int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev,
-				     u16 vport, bool connect)
+int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport,
+			      bool connect)
 {
 	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
 
@@ -24,7 +19,7 @@ static int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev,
 	MLX5_SET(modify_vport_state_in, in, egress_connect_valid, 1);
 	MLX5_SET(modify_vport_state_in, in, ingress_connect, connect);
 	MLX5_SET(modify_vport_state_in, in, egress_connect, connect);
-
+	MLX5_SET(modify_vport_state_in, in, admin_state, connect);
 	return mlx5_cmd_exec_in(dev, modify_vport_state, in);
 }
 
@@ -96,7 +91,6 @@ static int mlx5_esw_adj_vport_create(struct mlx5_eswitch *esw, u16 vhca_id,
 	if (err)
 		goto acl_ns_remove;
 
-	mlx5_esw_adj_vport_modify(esw->dev, vport_num, MLX5_ADJ_VPORT_CONNECT);
 	return 0;
 
 acl_ns_remove:
@@ -117,8 +111,7 @@ static void mlx5_esw_adj_vport_destroy(struct mlx5_eswitch *esw,
 
 	esw_debug(esw->dev, "Destroying adjacent vport %d for vhca_id 0x%x\n",
 		  vport_num, vport->vhca_id);
-	mlx5_esw_adj_vport_modify(esw->dev, vport_num,
-				  MLX5_ADJ_VPORT_DISCONNECT);
+
 	mlx5_esw_offloads_rep_remove(esw, vport);
 	mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering,
 					   vport->index);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 16eb99aba2a7..beaec450a734 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -264,6 +264,9 @@ struct mlx5_eswitch_fdb {
 
 		struct offloads_fdb {
 			struct mlx5_flow_namespace *ns;
+			struct mlx5_flow_table *drop_root;
+			struct mlx5_flow_handle *drop_root_rule;
+			struct mlx5_fc *drop_root_fc;
 			struct mlx5_flow_table *tc_miss_table;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
@@ -392,6 +395,7 @@ struct mlx5_eswitch {
 	struct mlx5_esw_offload offloads;
 	u32 last_vport_idx;
 	int                     mode;
+	bool                    offloads_inactive;
 	u16                     manager_vport;
 	u16                     first_host_vport;
 	u8			num_peers;
@@ -634,6 +638,8 @@ const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev);
 
 void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw);
 void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw);
+int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport,
+			      bool connect);
 
 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4092ea29c630..0b1a180ef238 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1577,6 +1577,7 @@ esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb)
 	attr.max_grp_num = esw->params.large_group_num;
 	attr.default_ft = miss_fdb;
 	attr.mapping = esw->offloads.reg_c0_obj_pool;
+	attr.fs_base_prio = FDB_BYPASS_PATH;
 
 	chains = mlx5_chains_create(dev, &attr);
 	if (IS_ERR(chains)) {
@@ -2355,6 +2356,131 @@ static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
 	mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
 }
 
+static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw)
+{
+	if (!esw->fdb_table.offloads.drop_root)
+		return;
+
+	esw_debug(esw->dev, "Destroying FDB drop root table %#x fc %#x\n",
+		  esw->fdb_table.offloads.drop_root->id,
+		  esw->fdb_table.offloads.drop_root_fc->id);
+	mlx5_del_flow_rules(esw->fdb_table.offloads.drop_root_rule);
+	/* Don't free flow counter here, can be reused on a later activation */
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.drop_root);
+	esw->fdb_table.offloads.drop_root_rule = NULL;
+	esw->fdb_table.offloads.drop_root = NULL;
+}
+
+static int mlx5_esw_fdb_drop_create(struct mlx5_eswitch *esw)
+{
+	struct mlx5_flow_destination drop_fc_dst = {};
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_handle *flow_rule;
+	struct mlx5_flow_table *table;
+	int err = 0, dst_num = 0;
+
+	if (esw->fdb_table.offloads.drop_root)
+		return 0;
+
+	root_ns = esw->fdb_table.offloads.ns;
+
+	ft_attr.prio = FDB_DROP_ROOT;
+	ft_attr.max_fte = 1;
+	ft_attr.autogroup.max_num_groups = 1;
+	table = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr);
+	if (IS_ERR(table)) {
+		esw_warn(dev, "Failed to create fdb drop root table, err %pe\n",
+			 table);
+		return PTR_ERR(table);
+	}
+
+	/* Drop FC reusable, create once on first deactivation of FDB */
+	if (!esw->fdb_table.offloads.drop_root_fc) {
+		struct mlx5_fc *counter = mlx5_fc_create(dev, 0);
+
+		err = PTR_ERR_OR_ZERO(counter);
+		if (err)
+			esw_warn(esw->dev, "create fdb drop fc err %d\n", err);
+		else
+			esw->fdb_table.offloads.drop_root_fc = counter;
+	}
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	if (esw->fdb_table.offloads.drop_root_fc) {
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		drop_fc_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		drop_fc_dst.counter = esw->fdb_table.offloads.drop_root_fc;
+		dst = &drop_fc_dst;
+		dst_num++;
+	}
+
+	flow_rule = mlx5_add_flow_rules(table, NULL, &flow_act, dst, dst_num);
+	err = PTR_ERR_OR_ZERO(flow_rule);
+	if (err) {
+		esw_warn(esw->dev,
+			 "fs offloads: Failed to add vport rx drop rule err %d\n",
+			 err);
+		goto err_flow_rule;
+	}
+
+	esw->fdb_table.offloads.drop_root = table;
+	esw->fdb_table.offloads.drop_root_rule = flow_rule;
+	esw_debug(esw->dev, "Created FDB drop root table %#x fc %#x\n",
+		  table->id, dst ? dst->counter->id : 0);
+	return 0;
+
+err_flow_rule:
+	/* no need to free drop fc, esw_offloads_steering_cleanup will do it */
+	mlx5_destroy_flow_table(table);
+	return err;
+}
+
+static void mlx5_esw_fdb_active(struct mlx5_eswitch *esw)
+{
+	struct mlx5_vport *vport;
+	unsigned long i;
+
+	mlx5_esw_fdb_drop_destroy(esw);
+	mlx5_mpfs_enable(esw->dev);
+
+	mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) {
+		if (!vport->adjacent)
+			continue;
+		esw_debug(esw->dev, "Connecting vport %d to eswitch\n",
+			  vport->vport);
+		mlx5_esw_adj_vport_modify(esw->dev, vport->vport, true);
+	}
+
+	esw->offloads_inactive = false;
+	esw_warn(esw->dev, "MPFS/FDB active\n");
+}
+
+static void mlx5_esw_fdb_inactive(struct mlx5_eswitch *esw)
+{
+	struct mlx5_vport *vport;
+	unsigned long i;
+
+	mlx5_mpfs_disable(esw->dev);
+	mlx5_esw_fdb_drop_create(esw);
+
+	mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) {
+		if (!vport->adjacent)
+			continue;
+		esw_debug(esw->dev, "Disconnecting vport %u from eswitch\n",
+			  vport->vport);
+
+		mlx5_esw_adj_vport_modify(esw->dev, vport->vport, false);
+	}
+
+	esw->offloads_inactive = true;
+	esw_warn(esw->dev, "MPFS/FDB inactive\n");
+}
+
 static int esw_offloads_start(struct mlx5_eswitch *esw,
 			      struct netlink_ext_ack *extack)
 {
@@ -3438,6 +3564,10 @@ create_indir_err:
 
 static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 {
+	mlx5_esw_fdb_drop_destroy(esw);
+	if (esw->fdb_table.offloads.drop_root_fc)
+		mlx5_fc_destroy(esw->dev, esw->fdb_table.offloads.drop_root_fc);
+	esw->fdb_table.offloads.drop_root_fc = NULL;
 	esw_destroy_vport_rx_drop_rule(esw);
 	esw_destroy_vport_rx_drop_group(esw);
 	esw_destroy_vport_rx_group(esw);
@@ -3600,6 +3730,11 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	if (err)
 		goto err_steering_init;
 
+	if (esw->offloads_inactive)
+		mlx5_esw_fdb_inactive(esw);
+	else
+		mlx5_esw_fdb_active(esw);
+
 	/* Representor will control the vport link state */
 	mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs)
 		vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN;
@@ -3666,6 +3801,9 @@ void esw_offloads_disable(struct mlx5_eswitch *esw)
 	esw_offloads_metadata_uninit(esw);
 	mlx5_rdma_disable_roce(esw->dev);
 	mlx5_esw_adjacent_vhcas_cleanup(esw);
+	/* must be done after vhcas cleanup to avoid adjacent vports connect */
+	if (esw->offloads_inactive)
+		mlx5_esw_fdb_active(esw); /* legacy mode always active */
 	mutex_destroy(&esw->offloads.termtbl_mutex);
 }
 
@@ -3676,6 +3814,7 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 		*mlx5_mode = MLX5_ESWITCH_LEGACY;
 		break;
 	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
+	case DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE:
 		*mlx5_mode = MLX5_ESWITCH_OFFLOADS;
 		break;
 	default:
@@ -3685,14 +3824,17 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 	return 0;
 }
 
-static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode)
+static int esw_mode_to_devlink(struct mlx5_eswitch *esw, u16 *mode)
 {
-	switch (mlx5_mode) {
+	switch (esw->mode) {
 	case MLX5_ESWITCH_LEGACY:
 		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
 		break;
 	case MLX5_ESWITCH_OFFLOADS:
-		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
+		if (esw->offloads_inactive)
+			*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE;
+		else
+			*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
 		break;
 	default:
 		return -EINVAL;
@@ -3798,6 +3940,45 @@ static bool mlx5_devlink_netdev_netns_immutable_set(struct devlink *devlink,
 	return ret;
 }
 
+/* Returns true when only changing between active and inactive switchdev mode */
+static bool mlx5_devlink_switchdev_active_mode_change(struct mlx5_eswitch *esw,
+						      u16 devlink_mode)
+{
+	/* current mode is not switchdev */
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
+		return false;
+
+	/* new mode is not switchdev */
+	if (devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV &&
+	    devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE)
+		return false;
+
+	/* already inactive: no change in current state */
+	if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE &&
+	    esw->offloads_inactive)
+		return false;
+
+	/* already active: no change in current state */
+	if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV &&
+	    !esw->offloads_inactive)
+		return false;
+
+	down_write(&esw->mode_lock);
+	esw->offloads_inactive = !esw->offloads_inactive;
+	esw->eswitch_operation_in_progress = true;
+	up_write(&esw->mode_lock);
+
+	if (esw->offloads_inactive)
+		mlx5_esw_fdb_inactive(esw);
+	else
+		mlx5_esw_fdb_active(esw);
+
+	down_write(&esw->mode_lock);
+	esw->eswitch_operation_in_progress = false;
+	up_write(&esw->mode_lock);
+	return true;
+}
+
 int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 				  struct netlink_ext_ack *extack)
 {
@@ -3812,12 +3993,16 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 	if (esw_mode_from_devlink(mode, &mlx5_mode))
 		return -EINVAL;
 
-	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) {
+	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && mlx5_get_sd(esw->dev)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured.");
 		return -EPERM;
 	}
 
+	/* Avoid try_lock, active/inactive mode change is not restricted */
+	if (mlx5_devlink_switchdev_active_mode_change(esw, mode))
+		return 0;
+
 	mlx5_lag_disable_change(esw->dev);
 	err = mlx5_esw_try_lock(esw);
 	if (err < 0) {
@@ -3840,7 +4025,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 	esw->eswitch_operation_in_progress = true;
 	up_write(&esw->mode_lock);
 
-	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV &&
+	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS &&
 	    !mlx5_devlink_netdev_netns_immutable_set(devlink, true)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't change E-Switch mode to switchdev when netdev net namespace has diverged from the devlink's.");
@@ -3848,25 +4033,27 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		goto skip;
 	}
 
-	if (mode == DEVLINK_ESWITCH_MODE_LEGACY)
+	if (mlx5_mode == MLX5_ESWITCH_LEGACY)
 		esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY;
 	mlx5_eswitch_disable_locked(esw);
-	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
+	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS) {
 		if (mlx5_devlink_trap_get_num_active(esw->dev)) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Can't change mode while devlink traps are active");
 			err = -EOPNOTSUPP;
 			goto skip;
 		}
+		esw->offloads_inactive =
+			(mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE);
 		err = esw_offloads_start(esw, extack);
-	} else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) {
+	} else if (mlx5_mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_offloads_stop(esw, extack);
 	} else {
 		err = -EINVAL;
 	}
 
 skip:
-	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && err)
+	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && err)
 		mlx5_devlink_netdev_netns_immutable_set(devlink, false);
 	down_write(&esw->mode_lock);
 	esw->eswitch_operation_in_progress = false;
@@ -3885,7 +4072,7 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	if (IS_ERR(esw))
 		return PTR_ERR(esw);
 
-	return esw_mode_to_devlink(esw->mode, mode);
+	return esw_mode_to_devlink(esw, mode);
 }
 
 static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 2db3ffb0a2b2..2ca3bddbdf05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3520,6 +3520,11 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 	if (!steering->fdb_root_ns)
 		return -ENOMEM;
 
+	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_DROP_ROOT, 1);
+	err = PTR_ERR_OR_ZERO(maj_prio);
+	if (err)
+		goto out_err;
+
 	err = create_fdb_bypass(steering);
 	if (err)
 		goto out_err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
index 99fb7a53add0..4a88a42ae4f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
@@ -167,7 +167,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac)
 		if (err)
 			goto free_l2table_index;
 		mlx5_core_dbg(dev, "MPFS entry %pM, set @index (%d)\n",
-			      l2addr->node.addr, l2addr->index);
+			      l2addr->node.addr, index);
 	}
 
 	l2addr->index = index;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6ac76a0c3827..7bf2449c53b2 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -116,6 +116,7 @@ enum mlx5_flow_namespace_type {
 };
 
 enum {
+	FDB_DROP_ROOT,
 	FDB_BYPASS_PATH,
 	FDB_CRYPTO_INGRESS,
 	FDB_TC_OFFLOAD,
-- 
cgit v1.2.3


From 4f739ed19d222de33b19ca639a34523fbbec20d0 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 14 Oct 2025 07:51:56 +0200
Subject: rv: Pass va_list to reactors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only thing the reactors can do with the passed in varargs is to
convert it into a va_list. Do that in a central helper instead.
It simplifies the reactors, removes some hairy macro-generated code
and introduces a convenient hook point to modify reactor behavior.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-1-0b9e51919ea8@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rv.h               | 11 +++++++++--
 include/rv/da_monitor.h          | 35 ++++++++++-------------------------
 include/rv/ltl_monitor.h         | 18 +++++-------------
 kernel/trace/rv/reactor_panic.c  |  6 +-----
 kernel/trace/rv/reactor_printk.c |  6 +-----
 kernel/trace/rv/rv_reactors.c    | 16 +++++++++++++++-
 6 files changed, 41 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 9520aab34bcb..b567b0191e67 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -88,7 +88,7 @@ union rv_task_monitor {
 struct rv_reactor {
 	const char		*name;
 	const char		*description;
-	__printf(1, 2) void	(*react)(const char *msg, ...);
+	__printf(1, 0) void	(*react)(const char *msg, va_list args);
 	struct list_head	list;
 };
 #endif
@@ -102,7 +102,7 @@ struct rv_monitor {
 	void			(*reset)(void);
 #ifdef CONFIG_RV_REACTORS
 	struct rv_reactor	*reactor;
-	__printf(1, 2) void	(*react)(const char *msg, ...);
+	__printf(1, 0) void	(*react)(const char *msg, va_list args);
 #endif
 	struct list_head	list;
 	struct rv_monitor	*parent;
@@ -119,11 +119,18 @@ void rv_put_task_monitor_slot(int slot);
 bool rv_reacting_on(void);
 int rv_unregister_reactor(struct rv_reactor *reactor);
 int rv_register_reactor(struct rv_reactor *reactor);
+__printf(2, 3)
+void rv_react(struct rv_monitor *monitor, const char *msg, ...);
 #else
 static inline bool rv_reacting_on(void)
 {
 	return false;
 }
+
+__printf(2, 3)
+static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...)
+{
+}
 #endif /* CONFIG_RV_REACTORS */
 
 #endif /* CONFIG_RV */
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 17fa4f6e5ea6..0cef64366538 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -16,34 +16,19 @@
 #include <linux/bug.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_RV_REACTORS
-
-#define DECLARE_RV_REACTING_HELPERS(name, type)							\
-static void cond_react_##name(type curr_state, type event)					\
-{												\
-	if (!rv_reacting_on() || !rv_##name.react)						\
-		return;										\
-	rv_##name.react("rv: monitor %s does not allow event %s on state %s\n",			\
-			#name,									\
-			model_get_event_name_##name(event),					\
-			model_get_state_name_##name(curr_state));				\
-}
-
-#else /* CONFIG_RV_REACTOR */
-
-#define DECLARE_RV_REACTING_HELPERS(name, type)							\
-static void cond_react_##name(type curr_state, type event)					\
-{												\
-	return;											\
-}
-#endif
-
 /*
  * Generic helpers for all types of deterministic automata monitors.
  */
 #define DECLARE_DA_MON_GENERIC_HELPERS(name, type)						\
 												\
-DECLARE_RV_REACTING_HELPERS(name, type)								\
+static void react_##name(type curr_state, type event)						\
+{												\
+	rv_react(&rv_##name,									\
+		 "rv: monitor %s does not allow event %s on state %s\n",			\
+		 #name,										\
+		 model_get_event_name_##name(event),						\
+		 model_get_state_name_##name(curr_state));					\
+}												\
 												\
 /*												\
  * da_monitor_reset_##name - reset a monitor and setting it to init state			\
@@ -126,7 +111,7 @@ da_event_##name(struct da_monitor *da_mon, enum events_##name event)				\
 	for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) {					\
 		next_state = model_get_next_state_##name(curr_state, event);			\
 		if (next_state == INVALID_STATE) {						\
-			cond_react_##name(curr_state, event);					\
+			react_##name(curr_state, event);					\
 			trace_error_##name(model_get_state_name_##name(curr_state),		\
 					   model_get_event_name_##name(event));			\
 			return false;								\
@@ -165,7 +150,7 @@ static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct
 	for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) {					\
 		next_state = model_get_next_state_##name(curr_state, event);			\
 		if (next_state == INVALID_STATE) {						\
-			cond_react_##name(curr_state, event);					\
+			react_##name(curr_state, event);					\
 			trace_error_##name(tsk->pid,						\
 					   model_get_state_name_##name(curr_state),		\
 					   model_get_event_name_##name(event));			\
diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index 5368cf5fd623..00c42b36f961 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -16,21 +16,12 @@
 #error "Please include $(MODEL_NAME).h generated by rvgen"
 #endif
 
-#ifdef CONFIG_RV_REACTORS
 #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME)
-static struct rv_monitor RV_MONITOR_NAME;
 
-static void rv_cond_react(struct task_struct *task)
-{
-	if (!rv_reacting_on() || !RV_MONITOR_NAME.react)
-		return;
-	RV_MONITOR_NAME.react("rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n",
-			      task->comm, task->pid);
-}
+#ifdef CONFIG_RV_REACTORS
+static struct rv_monitor RV_MONITOR_NAME;
 #else
-static void rv_cond_react(struct task_struct *task)
-{
-}
+extern struct rv_monitor RV_MONITOR_NAME;
 #endif
 
 static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT;
@@ -98,7 +89,8 @@ static void ltl_monitor_destroy(void)
 static void ltl_illegal_state(struct task_struct *task, struct ltl_monitor *mon)
 {
 	CONCATENATE(trace_error_, MONITOR_NAME)(task);
-	rv_cond_react(task);
+	rv_react(&RV_MONITOR_NAME, "rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n",
+		 task->comm, task->pid);
 }
 
 static void ltl_attempt_start(struct task_struct *task, struct ltl_monitor *mon)
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 74c6bcc2c749..76537b8a4343 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,13 +13,9 @@
 #include <linux/init.h>
 #include <linux/rv.h>
 
-__printf(1, 2) static void rv_panic_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args)
 {
-	va_list args;
-
-	va_start(args, msg);
 	vpanic(msg, args);
-	va_end(args);
 }
 
 static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 2dae2916c05f..48c934e315b3 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,13 +12,9 @@
 #include <linux/init.h>
 #include <linux/rv.h>
 
-__printf(1, 2) static void rv_printk_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args)
 {
-	va_list args;
-
-	va_start(args, msg);
 	vprintk_deferred(msg, args);
-	va_end(args);
 }
 
 static struct rv_reactor rv_printk = {
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index d32859fec238..cb1a5968055a 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -438,7 +438,7 @@ int reactor_populate_monitor(struct rv_monitor *mon)
 /*
  * Nop reactor register
  */
-__printf(1, 2) static void rv_nop_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args)
 {
 }
 
@@ -477,3 +477,17 @@ rm_available:
 out_err:
 	return -ENOMEM;
 }
+
+void rv_react(struct rv_monitor *monitor, const char *msg, ...)
+{
+	va_list args;
+
+	if (!rv_reacting_on() || !monitor->react)
+		return;
+
+	va_start(args, msg);
+
+	monitor->react(msg, args);
+
+	va_end(args);
+}
-- 
cgit v1.2.3


From 68f63cea46d3a410a41d9ab74d338038a22bc2ad Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 14 Oct 2025 07:51:57 +0200
Subject: rv: Make rv_reacting_on() static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no external users left.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-2-0b9e51919ea8@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rv.h            | 6 ------
 kernel/trace/rv/rv_reactors.c | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index b567b0191e67..92fd467547e7 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -116,17 +116,11 @@ int rv_get_task_monitor_slot(void);
 void rv_put_task_monitor_slot(int slot);
 
 #ifdef CONFIG_RV_REACTORS
-bool rv_reacting_on(void);
 int rv_unregister_reactor(struct rv_reactor *reactor);
 int rv_register_reactor(struct rv_reactor *reactor);
 __printf(2, 3)
 void rv_react(struct rv_monitor *monitor, const char *msg, ...);
 #else
-static inline bool rv_reacting_on(void)
-{
-	return false;
-}
-
 __printf(2, 3)
 static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...)
 {
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index cb1a5968055a..8c02426bc3bd 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -347,7 +347,7 @@ static bool __read_mostly reacting_on;
  *
  * Returns 1 if on, 0 otherwise.
  */
-bool rv_reacting_on(void)
+static bool rv_reacting_on(void)
 {
 	/* Ensures that concurrent monitors read consistent reacting_on */
 	smp_rmb();
-- 
cgit v1.2.3


From 69f3474a01e9867dd99fc4b703fa834ea1835c7d Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 11 Nov 2025 13:08:49 +0000
Subject: ASoC: cs35l56: Add control to read CAL_SET_STATUS

Create an ALSA control to read the value of the firmware
CAL_SET_STATUS control. This reports whether the firmware is
using a calibration blob or the default calibration from the
.bin file.

The firmware only reports a valid value in this register while
audio is actually playing and the internal PLL is locked to the
audio clock. Otherwise it returns a status of "unknown".

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251111130850.513969-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  9 ++++++++
 sound/soc/codecs/cs35l56-shared.c | 48 +++++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/cs35l56.c        | 16 +++++++++++++
 3 files changed, 73 insertions(+)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 0a740a99ad31..bd13958bf19d 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -16,6 +16,8 @@
 #include <linux/spi/spi.h>
 #include <sound/cs-amp-lib.h>
 
+struct snd_ctl_elem_value;
+
 #define CS35L56_DEVID					0x0000000
 #define CS35L56_REVID					0x0000004
 #define CS35L56_RELID					0x000000C
@@ -268,6 +270,10 @@
 #define CS35L56_CAL_STATUS_SUCCESS			1
 #define CS35L56_CAL_STATUS_OUT_OF_RANGE			3
 
+#define CS35L56_CAL_SET_STATUS_UNKNOWN			0
+#define CS35L56_CAL_SET_STATUS_DEFAULT			1
+#define CS35L56_CAL_SET_STATUS_SET			2
+
 #define CS35L56_CONTROL_PORT_READY_US			2200
 #define CS35L56_HALO_STATE_POLL_US			1000
 #define CS35L56_HALO_STATE_TIMEOUT_US			250000
@@ -363,6 +369,7 @@ extern const struct regmap_config cs35l63_regmap_i2c;
 extern const struct regmap_config cs35l63_regmap_sdw;
 
 extern const struct cirrus_amp_cal_controls cs35l56_calibration_controls;
+extern const char * const cs35l56_cal_set_status_text[3];
 
 extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC];
 extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC];
@@ -396,6 +403,8 @@ ssize_t cs35l56_cal_data_debugfs_write(struct cs35l56_base *cs35l56_base,
 void cs35l56_create_cal_debugfs(struct cs35l56_base *cs35l56_base,
 				const struct cs35l56_cal_debugfs_fops *fops);
 void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base);
+int cs35l56_cal_set_status_get(struct cs35l56_base *cs35l56_base,
+			       struct snd_ctl_elem_value *uvalue);
 int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 			     bool *fw_missing, unsigned int *fw_version);
 void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp);
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index fe0693c3f7de..4fba4127c40c 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -1262,6 +1262,54 @@ void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base)
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_remove_cal_debugfs, "SND_SOC_CS35L56_SHARED");
 
+const char * const cs35l56_cal_set_status_text[] = {
+	"Unknown", "Default", "Set",
+};
+EXPORT_SYMBOL_NS_GPL(cs35l56_cal_set_status_text, "SND_SOC_CS35L56_SHARED");
+
+int cs35l56_cal_set_status_get(struct cs35l56_base *cs35l56_base,
+			       struct snd_ctl_elem_value *uvalue)
+{
+	struct cs_dsp *dsp = cs35l56_base->dsp;
+	__be32 cal_set_status_be;
+	int alg_id;
+	int ret;
+
+	switch (cs35l56_base->type) {
+	case 0x54:
+	case 0x56:
+	case 0x57:
+		alg_id = 0x9f210;
+		break;
+	default:
+		alg_id = 0xbf210;
+		break;
+	}
+
+	scoped_guard(mutex, &dsp->pwr_lock) {
+		ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(dsp,
+							    "CAL_SET_STATUS",
+							    WMFW_ADSP2_YM, alg_id),
+					      0, &cal_set_status_be,
+					      sizeof(cal_set_status_be));
+	}
+	if (ret) {
+		uvalue->value.enumerated.item[0] = CS35L56_CAL_SET_STATUS_UNKNOWN;
+		return 0;
+	}
+
+	switch (be32_to_cpu(cal_set_status_be)) {
+	case CS35L56_CAL_SET_STATUS_DEFAULT:
+	case CS35L56_CAL_SET_STATUS_SET:
+		uvalue->value.enumerated.item[0] = be32_to_cpu(cal_set_status_be);
+		return 0;
+	default:
+		uvalue->value.enumerated.item[0] = CS35L56_CAL_SET_STATUS_UNKNOWN;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_cal_set_status_get, "SND_SOC_CS35L56_SHARED");
+
 int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 			     bool *fw_missing, unsigned int *fw_version)
 {
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index 091a72325507..e1eb7360b058 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -66,6 +66,18 @@ static int cs35l56_dspwait_put_volsw(struct snd_kcontrol *kcontrol,
 
 static DECLARE_TLV_DB_SCALE(vol_tlv, -10000, 25, 0);
 
+static SOC_ENUM_SINGLE_DECL(cs35l56_cal_set_status_enum, SND_SOC_NOPM, 0,
+			    cs35l56_cal_set_status_text);
+
+static int cs35l56_cal_set_status_ctl_get(struct snd_kcontrol *kcontrol,
+					  struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+
+	return cs35l56_cal_set_status_get(&cs35l56->base, ucontrol);
+}
+
 static const struct snd_kcontrol_new cs35l56_controls[] = {
 	SOC_SINGLE_EXT("Speaker Switch",
 		       CS35L56_MAIN_RENDER_USER_MUTE, 0, 1, 1,
@@ -83,6 +95,8 @@ static const struct snd_kcontrol_new cs35l56_controls[] = {
 	SOC_SINGLE_EXT("Posture Number", CS35L56_MAIN_POSTURE_NUMBER,
 		       0, 255, 0,
 		       cs35l56_dspwait_get_volsw, cs35l56_dspwait_put_volsw),
+	SOC_ENUM_EXT("CAL_SET_STATUS", cs35l56_cal_set_status_enum,
+		     cs35l56_cal_set_status_ctl_get, NULL),
 };
 
 static const struct snd_kcontrol_new cs35l63_controls[] = {
@@ -102,6 +116,8 @@ static const struct snd_kcontrol_new cs35l63_controls[] = {
 	SOC_SINGLE_EXT("Posture Number", CS35L63_MAIN_POSTURE_NUMBER,
 		       0, 255, 0,
 		       cs35l56_dspwait_get_volsw, cs35l56_dspwait_put_volsw),
+	SOC_ENUM_EXT("CAL_SET_STATUS", cs35l56_cal_set_status_enum,
+		     cs35l56_cal_set_status_ctl_get, NULL),
 };
 
 static SOC_VALUE_ENUM_SINGLE_DECL(cs35l56_asp1tx1_enum,
-- 
cgit v1.2.3


From 32172cf3cb543a04c41a1677c97a38e60cad05b6 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 11 Nov 2025 13:08:50 +0000
Subject: ASoC: cs35l56: Allow restoring factory calibration through ALSA
 control

Add an ALSA control (CAL_DATA) that can be used to restore amp calibration,
instead of using debugfs. A readback control (CAL_DATA_RB) is also added
for factory testing.

On ChromeOS the process that restores amp calibration from NVRAM has
limited permissions and cannot access debugfs. It requires an ALSA control
that it can write the calibration blob into. ChromeOS also restricts access
to ALSA controls, which avoids the risk of accidental or malicious
overwriting of good calibration data with bad data. As this control is not
needed for normal Linux-based distros it is a Kconfig option.

A separate control, CAL_DATA_RB, provides a readback of the current
calibration data, which could be either from a write to CAL_DATA or the
result of factory production-line calibration.

The write and read are intentionally separate controls to defeat "dumb"
save-and-restore tools like alsa-restore that assume it is safe to save
all control values and write them back in any order at some undefined
future time. Such behavior carries the risk of restoring stale or bad data
over the top of good data.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251111130850.513969-3-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  2 ++
 sound/soc/codecs/Kconfig          | 14 ++++++++
 sound/soc/codecs/cs35l56-shared.c |  5 +--
 sound/soc/codecs/cs35l56.c        | 67 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index bd13958bf19d..883f6a7e50aa 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -388,6 +388,8 @@ int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base);
 int cs35l56_runtime_resume_common(struct cs35l56_base *cs35l56_base, bool is_soundwire);
 void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp);
 int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base);
+int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base,
+			      const struct cirrus_amp_cal_data *data);
 ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base,
 					const char __user *from, size_t count,
 					loff_t *ppos);
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 6da2fff9323c..433af9bc7564 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -912,6 +912,20 @@ config SND_SOC_CS35L56_CAL_DEBUGFS
 	  Create debugfs entries used during factory-line manufacture
 	  for factory calibration.
 
+	  If unsure select "N".
+
+config SND_SOC_CS35L56_CAL_SET_CTRL
+	bool "CS35L56 ALSA control to restore factory calibration"
+	default N
+	select SND_SOC_CS35L56_CAL_SYSFS_COMMON
+	help
+	  Allow restoring factory calibration data through an ALSA
+	  control. This is only needed on platforms without UEFI or
+	  some other method of non-volatile storage that the driver
+	  can access directly.
+
+	  On most platforms this is not needed.
+
 	  If unsure select "N".
 endmenu
 
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 4fba4127c40c..7424e1353062 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -962,8 +962,8 @@ int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base)
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_get_calibration, "SND_SOC_CS35L56_SHARED");
 
-static int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base,
-				     const struct cirrus_amp_cal_data *data)
+int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base,
+			      const struct cirrus_amp_cal_data *data)
 {
 
 	/* Ignore if it is empty */
@@ -980,6 +980,7 @@ static int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base,
 
 	return 0;
 }
+EXPORT_SYMBOL_NS_GPL(cs35l56_stash_calibration, "SND_SOC_CS35L56_SHARED");
 
 static int cs35l56_perform_calibration(struct cs35l56_base *cs35l56_base)
 {
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index e1eb7360b058..6feef971024b 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -1040,6 +1040,67 @@ static const struct cs35l56_cal_debugfs_fops cs35l56_cal_debugfs_fops = {
 	},
 };
 
+static int cs35l56_cal_data_rb_ctl_get(struct snd_kcontrol *kcontrol,
+				    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+
+	if (!cs35l56->base.cal_data_valid)
+		return -ENODATA;
+
+	memcpy(ucontrol->value.bytes.data, &cs35l56->base.cal_data,
+	       sizeof(cs35l56->base.cal_data));
+
+	return 0;
+}
+
+static int cs35l56_cal_data_ctl_get(struct snd_kcontrol *kcontrol,
+				    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+
+	/*
+	 * This control is write-only but mixer libraries often try to read
+	 * a control before writing it. So we have to implement read.
+	 * Return zeros so a write of valid data will always be a change
+	 * from its "current value".
+	 */
+	memset(ucontrol->value.bytes.data, 0, sizeof(cs35l56->base.cal_data));
+
+	return 0;
+}
+
+static int cs35l56_cal_data_ctl_set(struct snd_kcontrol *kcontrol,
+				    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_kcontrol_chip(kcontrol);
+	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component);
+	const struct cirrus_amp_cal_data *cal_data = (const void *)ucontrol->value.bytes.data;
+	int ret;
+
+	if (cs35l56->base.cal_data_valid)
+		return -EACCES;
+
+	ret = cs35l56_stash_calibration(&cs35l56->base, cal_data);
+	if (ret)
+		return ret;
+
+	ret = cs35l56_new_cal_data_apply(cs35l56);
+	if (ret < 0)
+		return ret;
+
+	return 1;
+}
+
+static const struct snd_kcontrol_new cs35l56_cal_data_restore_controls[] = {
+	SND_SOC_BYTES_E("CAL_DATA", 0, sizeof(struct cirrus_amp_cal_data) / sizeof(u32),
+			cs35l56_cal_data_ctl_get, cs35l56_cal_data_ctl_set),
+	SND_SOC_BYTES_E("CAL_DATA_RB", 0, sizeof(struct cirrus_amp_cal_data) / sizeof(u32),
+			cs35l56_cal_data_rb_ctl_get, NULL),
+};
+
 static int cs35l56_set_fw_suffix(struct cs35l56_private *cs35l56)
 {
 	if (cs35l56->dsp.fwf_suffix)
@@ -1134,6 +1195,12 @@ static int cs35l56_component_probe(struct snd_soc_component *component)
 		break;
 	}
 
+	if (!ret && IS_ENABLED(CONFIG_SND_SOC_CS35L56_CAL_SET_CTRL)) {
+		ret = snd_soc_add_component_controls(component,
+						     cs35l56_cal_data_restore_controls,
+						     ARRAY_SIZE(cs35l56_cal_data_restore_controls));
+	}
+
 	if (ret)
 		return dev_err_probe(cs35l56->base.dev, ret, "unable to add controls\n");
 
-- 
cgit v1.2.3


From c07a491c1b735e0c27454ea5c27a446d43401b1e Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Fri, 31 Oct 2025 19:24:48 -0700
Subject: net: export netdev_get_by_index_lock()

Need to call netdev_get_by_index_lock() from io_uring/zcrx.c, but it is
currently private to net. Export the function in linux/netdevice.h.

Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.h            | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a687444b27..77c46a2823ec 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3401,6 +3401,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *netdev_get_by_index(struct net *net, int ifindex,
 				       netdevice_tracker *tracker, gfp_t gfp);
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
 struct net_device *netdev_get_by_name(struct net *net, const char *name,
 				      netdevice_tracker *tracker, gfp_t gfp);
 struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
diff --git a/net/core/dev.h b/net/core/dev.h
index 900880e8b5b4..df8a90fe89f8 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -29,7 +29,6 @@ struct napi_struct *
 netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 
-struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
 struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
 struct net_device *
 netdev_xa_find_lock(struct net *net, struct net_device *dev,
-- 
cgit v1.2.3


From 3b521bf8c51246466e2c337f1f2b60acfdfe82d6 Mon Sep 17 00:00:00 2001
From: Laurentiu Mihalcea <laurentiu.mihalcea@nxp.com>
Date: Tue, 4 Nov 2025 04:02:55 -0800
Subject: dt-bindings: clock: document 8ULP's SIM LPAV

Add documentation for i.MX8ULP's SIM LPAV module.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Laurentiu Mihalcea <laurentiu.mihalcea@nxp.com>
Link: https://lore.kernel.org/r/20251104120301.913-3-laurentiumihalcea111@gmail.com
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
---
 .../bindings/clock/fsl,imx8ulp-sim-lpav.yaml       | 72 ++++++++++++++++++++++
 include/dt-bindings/clock/imx8ulp-clock.h          |  5 ++
 include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h   | 16 +++++
 3 files changed, 93 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/fsl,imx8ulp-sim-lpav.yaml
 create mode 100644 include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/fsl,imx8ulp-sim-lpav.yaml b/Documentation/devicetree/bindings/clock/fsl,imx8ulp-sim-lpav.yaml
new file mode 100644
index 000000000000..662e07528d76
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/fsl,imx8ulp-sim-lpav.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/fsl,imx8ulp-sim-lpav.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX8ULP LPAV System Integration Module (SIM)
+
+maintainers:
+  - Laurentiu Mihalcea <laurentiu.mihalcea@nxp.com>
+
+description:
+  The i.MX8ULP LPAV subsystem contains a block control module known as
+  SIM LPAV, which offers functionalities such as clock gating or reset
+  line assertion/de-assertion.
+
+properties:
+  compatible:
+    const: fsl,imx8ulp-sim-lpav
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 3
+
+  clock-names:
+    items:
+      - const: bus
+      - const: core
+      - const: plat
+
+  '#clock-cells':
+    const: 1
+
+  '#reset-cells':
+    const: 1
+
+  mux-controller:
+    $ref: /schemas/mux/reg-mux.yaml#
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - '#clock-cells'
+  - '#reset-cells'
+  - mux-controller
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/imx8ulp-clock.h>
+
+    clock-controller@2da50000 {
+        compatible = "fsl,imx8ulp-sim-lpav";
+        reg = <0x2da50000 0x10000>;
+        clocks = <&cgc2 IMX8ULP_CLK_LPAV_BUS_DIV>,
+                 <&cgc2 IMX8ULP_CLK_HIFI_DIVCORE>,
+                 <&cgc2 IMX8ULP_CLK_HIFI_DIVPLAT>;
+        clock-names = "bus", "core", "plat";
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+
+        mux-controller {
+            compatible = "reg-mux";
+            #mux-control-cells = <1>;
+            mux-reg-masks = <0x8 0x00000200>;
+        };
+    };
diff --git a/include/dt-bindings/clock/imx8ulp-clock.h b/include/dt-bindings/clock/imx8ulp-clock.h
index 827404fadf5c..c62d84d093a9 100644
--- a/include/dt-bindings/clock/imx8ulp-clock.h
+++ b/include/dt-bindings/clock/imx8ulp-clock.h
@@ -255,4 +255,9 @@
 
 #define IMX8ULP_CLK_PCC5_END		56
 
+/* LPAV SIM */
+#define IMX8ULP_CLK_SIM_LPAV_HIFI_CORE		0
+#define IMX8ULP_CLK_SIM_LPAV_HIFI_PBCLK		1
+#define IMX8ULP_CLK_SIM_LPAV_HIFI_PLAT		2
+
 #endif
diff --git a/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h b/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h
new file mode 100644
index 000000000000..adf95bb26d21
--- /dev/null
+++ b/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright 2025 NXP
+ */
+
+#ifndef DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H
+#define DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H
+
+#define IMX8ULP_SIM_LPAV_HIFI4_DSP_DBG_RST	0
+#define IMX8ULP_SIM_LPAV_HIFI4_DSP_RST		1
+#define IMX8ULP_SIM_LPAV_HIFI4_DSP_STALL	2
+#define IMX8ULP_SIM_LPAV_DSI_RST_BYTE_N		3
+#define IMX8ULP_SIM_LPAV_DSI_RST_ESC_N		4
+#define IMX8ULP_SIM_LPAV_DSI_RST_DPI_N		5
+
+#endif /* DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H */
-- 
cgit v1.2.3


From 781f60e45bdfe351aad692ac0fa89e36f8bf4a36 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Mon, 10 Nov 2025 11:23:50 +0000
Subject: reset: mpfs: add non-auxiliary bus probing

While the auxiliary bus was a nice bandaid, and meant that re-writing
the representation of the clock regions in devicetree was not required,
it has run its course. The "mss_top_sysreg" region that contains the
clock and reset regions, also contains pinctrl and an interrupt
controller, so the time has come to rewrite the devicetree and probe the
reset controller from an mfd devicetree node, rather than implement
those drivers using the auxiliary bus. Wanting to avoid propagating this
naive/incorrect description of the hardware to the new pic64gx SoC is a
major motivating factor here.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Acked-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 drivers/clk/microchip/clk-mpfs.c |  4 +-
 drivers/reset/Kconfig            |  1 +
 drivers/reset/reset-mpfs.c       | 91 ++++++++++++++++++++++++----------------
 include/soc/microchip/mpfs.h     |  3 +-
 4 files changed, 60 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/microchip/clk-mpfs.c b/drivers/clk/microchip/clk-mpfs.c
index 484893e68b67..ee58304913ef 100644
--- a/drivers/clk/microchip/clk-mpfs.c
+++ b/drivers/clk/microchip/clk-mpfs.c
@@ -38,7 +38,7 @@ static const struct regmap_config mpfs_clk_regmap_config = {
 	.reg_stride = 4,
 	.val_bits = 32,
 	.val_format_endian = REGMAP_ENDIAN_LITTLE,
-	.max_register = REG_SUBBLK_CLOCK_CR,
+	.max_register = REG_SUBBLK_RESET_CR,
 };
 
 /*
@@ -502,7 +502,7 @@ static inline int mpfs_clk_old_format_probe(struct mpfs_clock_data *clk_data,
 	if (IS_ERR(clk_data->regmap))
 		return PTR_ERR(clk_data->regmap);
 
-	return mpfs_reset_controller_register(dev, clk_data->base + REG_SUBBLK_RESET_CR);
+	return mpfs_reset_controller_register(dev, clk_data->regmap);
 }
 
 static int mpfs_clk_probe(struct platform_device *pdev)
diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig
index 78b7078478d4..0ec4b7cd08d6 100644
--- a/drivers/reset/Kconfig
+++ b/drivers/reset/Kconfig
@@ -200,6 +200,7 @@ config RESET_PISTACHIO
 config RESET_POLARFIRE_SOC
 	bool "Microchip PolarFire SoC (MPFS) Reset Driver"
 	depends on MCHP_CLK_MPFS
+	depends on MFD_SYSCON
 	select AUXILIARY_BUS
 	default MCHP_CLK_MPFS
 	help
diff --git a/drivers/reset/reset-mpfs.c b/drivers/reset/reset-mpfs.c
index f6fa10e03ea8..8ffcc54ee6f6 100644
--- a/drivers/reset/reset-mpfs.c
+++ b/drivers/reset/reset-mpfs.c
@@ -9,11 +9,13 @@
 #include <linux/auxiliary_bus.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/slab.h>
+#include <linux/regmap.h>
 #include <linux/reset-controller.h>
+#include <linux/slab.h>
 #include <dt-bindings/clock/microchip,mpfs-clock.h>
 #include <soc/microchip/mpfs.h>
 
@@ -27,11 +29,10 @@
 #define MPFS_SLEEP_MIN_US	100
 #define MPFS_SLEEP_MAX_US	200
 
-/* block concurrent access to the soft reset register */
-static DEFINE_SPINLOCK(mpfs_reset_lock);
+#define REG_SUBBLK_RESET_CR	0x88u
 
 struct mpfs_reset {
-	void __iomem *base;
+	struct regmap *regmap;
 	struct reset_controller_dev rcdev;
 };
 
@@ -46,41 +47,25 @@ static inline struct mpfs_reset *to_mpfs_reset(struct reset_controller_dev *rcde
 static int mpfs_assert(struct reset_controller_dev *rcdev, unsigned long id)
 {
 	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
-	unsigned long flags;
-	u32 reg;
-
-	spin_lock_irqsave(&mpfs_reset_lock, flags);
-
-	reg = readl(rst->base);
-	reg |= BIT(id);
-	writel(reg, rst->base);
 
-	spin_unlock_irqrestore(&mpfs_reset_lock, flags);
+	return regmap_set_bits(rst->regmap, REG_SUBBLK_RESET_CR, BIT(id));
 
-	return 0;
 }
 
 static int mpfs_deassert(struct reset_controller_dev *rcdev, unsigned long id)
 {
 	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
-	unsigned long flags;
-	u32 reg;
-
-	spin_lock_irqsave(&mpfs_reset_lock, flags);
 
-	reg = readl(rst->base);
-	reg &= ~BIT(id);
-	writel(reg, rst->base);
+	return regmap_clear_bits(rst->regmap, REG_SUBBLK_RESET_CR, BIT(id));
 
-	spin_unlock_irqrestore(&mpfs_reset_lock, flags);
-
-	return 0;
 }
 
 static int mpfs_status(struct reset_controller_dev *rcdev, unsigned long id)
 {
 	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
-	u32 reg = readl(rst->base);
+	u32 reg;
+
+	regmap_read(rst->regmap, REG_SUBBLK_RESET_CR, &reg);
 
 	/*
 	 * It is safe to return here as MPFS_NUM_RESETS makes sure the sign bit
@@ -130,23 +115,58 @@ static int mpfs_reset_xlate(struct reset_controller_dev *rcdev,
 	return index - MPFS_PERIPH_OFFSET;
 }
 
-static int mpfs_reset_probe(struct auxiliary_device *adev,
-			    const struct auxiliary_device_id *id)
+static int mpfs_reset_mfd_probe(struct platform_device *pdev)
+{
+	struct reset_controller_dev *rcdev;
+	struct device *dev = &pdev->dev;
+	struct mpfs_reset *rst;
+
+	rst = devm_kzalloc(dev, sizeof(*rst), GFP_KERNEL);
+	if (!rst)
+		return -ENOMEM;
+
+	rcdev = &rst->rcdev;
+	rcdev->dev = dev;
+	rcdev->ops = &mpfs_reset_ops;
+
+	rcdev->of_node = pdev->dev.parent->of_node;
+	rcdev->of_reset_n_cells = 1;
+	rcdev->of_xlate = mpfs_reset_xlate;
+	rcdev->nr_resets = MPFS_NUM_RESETS;
+
+	rst->regmap = device_node_to_regmap(pdev->dev.parent->of_node);
+	if (IS_ERR(rst->regmap))
+		return dev_err_probe(dev, PTR_ERR(rst->regmap),
+				     "Failed to find syscon regmap\n");
+
+	return devm_reset_controller_register(dev, rcdev);
+}
+
+static struct platform_driver mpfs_reset_mfd_driver = {
+	.probe = mpfs_reset_mfd_probe,
+	.driver = {
+		.name = "mpfs-reset",
+	},
+};
+module_platform_driver(mpfs_reset_mfd_driver);
+
+static int mpfs_reset_adev_probe(struct auxiliary_device *adev,
+				 const struct auxiliary_device_id *id)
 {
-	struct device *dev = &adev->dev;
 	struct reset_controller_dev *rcdev;
+	struct device *dev = &adev->dev;
 	struct mpfs_reset *rst;
 
 	rst = devm_kzalloc(dev, sizeof(*rst), GFP_KERNEL);
 	if (!rst)
 		return -ENOMEM;
 
-	rst->base = (void __iomem *)adev->dev.platform_data;
+	rst->regmap = (struct regmap *)adev->dev.platform_data;
 
 	rcdev = &rst->rcdev;
 	rcdev->dev = dev;
-	rcdev->dev->parent = dev->parent;
 	rcdev->ops = &mpfs_reset_ops;
+
 	rcdev->of_node = dev->parent->of_node;
 	rcdev->of_reset_n_cells = 1;
 	rcdev->of_xlate = mpfs_reset_xlate;
@@ -155,12 +175,11 @@ static int mpfs_reset_probe(struct auxiliary_device *adev,
 	return devm_reset_controller_register(dev, rcdev);
 }
 
-int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base)
+int mpfs_reset_controller_register(struct device *clk_dev, struct regmap *map)
 {
 	struct auxiliary_device *adev;
 
-	adev = devm_auxiliary_device_create(clk_dev, "reset-mpfs",
-					    (__force void *)base);
+	adev = devm_auxiliary_device_create(clk_dev, "reset-mpfs", (void *)map);
 	if (!adev)
 		return -ENODEV;
 
@@ -176,12 +195,12 @@ static const struct auxiliary_device_id mpfs_reset_ids[] = {
 };
 MODULE_DEVICE_TABLE(auxiliary, mpfs_reset_ids);
 
-static struct auxiliary_driver mpfs_reset_driver = {
-	.probe		= mpfs_reset_probe,
+static struct auxiliary_driver mpfs_reset_aux_driver = {
+	.probe		= mpfs_reset_adev_probe,
 	.id_table	= mpfs_reset_ids,
 };
 
-module_auxiliary_driver(mpfs_reset_driver);
+module_auxiliary_driver(mpfs_reset_aux_driver);
 
 MODULE_DESCRIPTION("Microchip PolarFire SoC Reset Driver");
 MODULE_AUTHOR("Conor Dooley <conor.dooley@microchip.com>");
diff --git a/include/soc/microchip/mpfs.h b/include/soc/microchip/mpfs.h
index 0bd67e10b704..ec04c98a8b63 100644
--- a/include/soc/microchip/mpfs.h
+++ b/include/soc/microchip/mpfs.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/of_device.h>
+#include <linux/regmap.h>
 
 struct mpfs_sys_controller;
 
@@ -44,7 +45,7 @@ struct mtd_info *mpfs_sys_controller_get_flash(struct mpfs_sys_controller *mpfs_
 
 #if IS_ENABLED(CONFIG_MCHP_CLK_MPFS)
 #if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC)
-int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base);
+int mpfs_reset_controller_register(struct device *clk_dev, struct regmap *map);
 #else
 static inline int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base) { return 0; }
 #endif /* if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC) */
-- 
cgit v1.2.3


From cb46a58d77e5b433e9f4538faaa2a73970157e8d Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 10 Oct 2025 03:44:04 -0700
Subject: efi/memattr: Convert efi_memattr_init() return type to void

The efi_memattr_init() function's return values (0 and -ENOMEM) are never
checked by callers. Convert the function to return void since the return
status is unused.

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/memattr.c | 7 +++----
 include/linux/efi.h            | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c
index c38b1a335590..e727cc5909cb 100644
--- a/drivers/firmware/efi/memattr.c
+++ b/drivers/firmware/efi/memattr.c
@@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR;
  * Reserve the memory associated with the Memory Attributes configuration
  * table, if it exists.
  */
-int __init efi_memattr_init(void)
+void __init efi_memattr_init(void)
 {
 	efi_memory_attributes_table_t *tbl;
 	unsigned long size;
 
 	if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR)
-		return 0;
+		return;
 
 	tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl));
 	if (!tbl) {
 		pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n",
 		       efi_mem_attr_table);
-		return -ENOMEM;
+		return;
 	}
 
 	if (tbl->version > 2) {
@@ -61,7 +61,6 @@ int __init efi_memattr_init(void)
 
 unmap:
 	early_memunmap(tbl, sizeof(*tbl));
-	return 0;
 }
 
 /*
diff --git a/include/linux/efi.h b/include/linux/efi.h
index a98cc39e7aaa..0b9eb3d2ff97 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -772,7 +772,7 @@ extern unsigned long efi_mem_attr_table;
  */
 typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *, bool);
 
-extern int efi_memattr_init(void);
+extern void efi_memattr_init(void);
 extern int efi_memattr_apply_permissions(struct mm_struct *mm,
 					 efi_memattr_perm_setter fn);
 
-- 
cgit v1.2.3


From a2860501203cf7a2116adf3bb4e4c456c5750872 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 15 Oct 2025 22:56:37 +0200
Subject: efi/runtime-wrappers: Keep track of the efi_runtime_lock owner

The EFI runtime wrappers use a file local semaphore to serialize access
to the EFI runtime services. This means that any calls to the arch
wrappers around the runtime services will also be serialized, removing
the need for redundant locking.

For robustness, add a facility that allows those arch wrappers to assert
that the semaphore was taken by the current task.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/efi/runtime-wrappers.c | 17 ++++++++++++++++-
 include/linux/efi.h                     |  2 ++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index 708b777857d3..da8d29621644 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -202,6 +202,8 @@ void efi_call_virt_check_flags(unsigned long flags, const void *caller)
  */
 static DEFINE_SEMAPHORE(efi_runtime_lock, 1);
 
+static struct task_struct *efi_runtime_lock_owner;
+
 /*
  * Expose the EFI runtime lock to the UV platform
  */
@@ -219,6 +221,8 @@ static void __nocfi efi_call_rts(struct work_struct *work)
 	efi_status_t status = EFI_NOT_FOUND;
 	unsigned long flags;
 
+	efi_runtime_lock_owner = current;
+
 	arch_efi_call_virt_setup();
 	flags = efi_call_virt_save_flags();
 
@@ -310,6 +314,7 @@ static void __nocfi efi_call_rts(struct work_struct *work)
 
 	efi_rts_work.status = status;
 	complete(&efi_rts_work.efi_rts_comp);
+	efi_runtime_lock_owner = NULL;
 }
 
 static efi_status_t __efi_queue_work(enum efi_rts_ids id,
@@ -444,8 +449,10 @@ virt_efi_set_variable_nb(efi_char16_t *name, efi_guid_t *vendor, u32 attr,
 	if (down_trylock(&efi_runtime_lock))
 		return EFI_NOT_READY;
 
+	efi_runtime_lock_owner = current;
 	status = efi_call_virt_pointer(efi.runtime, set_variable, name, vendor,
 				       attr, data_size, data);
+	efi_runtime_lock_owner = NULL;
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -481,9 +488,11 @@ virt_efi_query_variable_info_nb(u32 attr, u64 *storage_space,
 	if (down_trylock(&efi_runtime_lock))
 		return EFI_NOT_READY;
 
+	efi_runtime_lock_owner = current;
 	status = efi_call_virt_pointer(efi.runtime, query_variable_info, attr,
 				       storage_space, remaining_space,
 				       max_variable_size);
+	efi_runtime_lock_owner = NULL;
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -509,12 +518,13 @@ virt_efi_reset_system(int reset_type, efi_status_t status,
 		return;
 	}
 
+	efi_runtime_lock_owner = current;
 	arch_efi_call_virt_setup();
 	efi_rts_work.efi_rts_id = EFI_RESET_SYSTEM;
 	arch_efi_call_virt(efi.runtime, reset_system, reset_type, status,
 			   data_size, data);
 	arch_efi_call_virt_teardown();
-
+	efi_runtime_lock_owner = NULL;
 	up(&efi_runtime_lock);
 }
 
@@ -587,3 +597,8 @@ efi_call_acpi_prm_handler(efi_status_t (__efiapi *handler_addr)(u64, void *),
 }
 
 #endif
+
+void efi_runtime_assert_lock_held(void)
+{
+	WARN_ON(efi_runtime_lock_owner != current);
+}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index a98cc39e7aaa..b23ff8b83219 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; }
 extern void efi_call_virt_check_flags(unsigned long flags, const void *caller);
 extern unsigned long efi_call_virt_save_flags(void);
 
+void efi_runtime_assert_lock_held(void);
+
 enum efi_secureboot_mode {
 	efi_secureboot_mode_unset,
 	efi_secureboot_mode_unknown,
-- 
cgit v1.2.3


From 3d176751e541362ff40c2478d6a2de41f8c62318 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sun, 9 Nov 2025 15:47:17 -0800
Subject: lib/crypto: polyval: Add POLYVAL library

Add support for POLYVAL to lib/crypto/.

This will replace the polyval crypto_shash algorithm and its use in the
hctr2 template, simplifying the code and reducing overhead.

Specifically, this commit introduces the POLYVAL library API and a
generic implementation of it.  Later commits will migrate the existing
architecture-optimized implementations of POLYVAL into lib/crypto/ and
add a KUnit test suite.

I've also rewritten the generic implementation completely, using a more
modern approach instead of the traditional table-based approach.  It's
now constant-time, requires no precomputation or dynamic memory
allocations, decreases the per-key memory usage from 4096 bytes to 16
bytes, and is faster than the old polyval-generic even on bulk data
reusing the same key (at least on x86_64, where I measured 15% faster).
We should do this for GHASH too, but for now just do it for POLYVAL.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251109234726.638437-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/polyval.h | 171 +++++++++++++++++++++++++-
 lib/crypto/Kconfig       |  10 ++
 lib/crypto/Makefile      |   8 ++
 lib/crypto/polyval.c     | 307 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 493 insertions(+), 3 deletions(-)
 create mode 100644 lib/crypto/polyval.c

(limited to 'include')

diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
index d2e63743e592..5ba4c248cad1 100644
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -1,14 +1,179 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * Common values for the Polyval hash algorithm
+ * POLYVAL library API
  *
- * Copyright 2021 Google LLC
+ * Copyright 2025 Google LLC
  */
 
 #ifndef _CRYPTO_POLYVAL_H
 #define _CRYPTO_POLYVAL_H
 
+#include <linux/string.h>
+#include <linux/types.h>
+
 #define POLYVAL_BLOCK_SIZE	16
 #define POLYVAL_DIGEST_SIZE	16
 
+/**
+ * struct polyval_elem - An element of the POLYVAL finite field
+ * @bytes: View of the element as a byte array (unioned with @lo and @hi)
+ * @lo: The low 64 terms of the element's polynomial
+ * @hi: The high 64 terms of the element's polynomial
+ *
+ * This represents an element of the finite field GF(2^128), using the POLYVAL
+ * convention: little-endian byte order and natural bit order.
+ */
+struct polyval_elem {
+	union {
+		u8 bytes[POLYVAL_BLOCK_SIZE];
+		struct {
+			__le64 lo;
+			__le64 hi;
+		};
+	};
+};
+
+/**
+ * struct polyval_key - Prepared key for POLYVAL
+ *
+ * This may contain just the raw key H, or it may contain precomputed key
+ * powers, depending on the platform's POLYVAL implementation.  Use
+ * polyval_preparekey() to initialize this.
+ */
+struct polyval_key {
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#error "Unhandled arch"
+#else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */
+	/** @h: The hash key H */
+	struct polyval_elem h;
+#endif /* !CONFIG_CRYPTO_LIB_POLYVAL_ARCH */
+};
+
+/**
+ * struct polyval_ctx - Context for computing a POLYVAL value
+ * @key: Pointer to the prepared POLYVAL key.  The user of the API is
+ *	 responsible for ensuring that the key lives as long as the context.
+ * @acc: The accumulator
+ * @partial: Number of data bytes processed so far modulo POLYVAL_BLOCK_SIZE
+ */
+struct polyval_ctx {
+	const struct polyval_key *key;
+	struct polyval_elem acc;
+	size_t partial;
+};
+
+/**
+ * polyval_preparekey() - Prepare a POLYVAL key
+ * @key: (output) The key structure to initialize
+ * @raw_key: The raw hash key
+ *
+ * Initialize a POLYVAL key structure from a raw key.  This may be a simple
+ * copy, or it may involve precomputing powers of the key, depending on the
+ * platform's POLYVAL implementation.
+ *
+ * Context: Any context.
+ */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+void polyval_preparekey(struct polyval_key *key,
+			const u8 raw_key[POLYVAL_BLOCK_SIZE]);
+
+#else
+static inline void polyval_preparekey(struct polyval_key *key,
+				      const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	/* Just a simple copy, so inline it. */
+	memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
+}
 #endif
+
+/**
+ * polyval_init() - Initialize a POLYVAL context for a new message
+ * @ctx: The context to initialize
+ * @key: The key to use.  Note that a pointer to the key is saved in the
+ *	 context, so the key must live at least as long as the context.
+ */
+static inline void polyval_init(struct polyval_ctx *ctx,
+				const struct polyval_key *key)
+{
+	*ctx = (struct polyval_ctx){ .key = key };
+}
+
+/**
+ * polyval_import_blkaligned() - Import a POLYVAL accumulator value
+ * @ctx: The context to initialize
+ * @key: The key to use.  Note that a pointer to the key is saved in the
+ *	 context, so the key must live at least as long as the context.
+ * @acc: The accumulator value to import.
+ *
+ * This imports an accumulator that was saved by polyval_export_blkaligned().
+ * The same key must be used.
+ */
+static inline void
+polyval_import_blkaligned(struct polyval_ctx *ctx,
+			  const struct polyval_key *key,
+			  const struct polyval_elem *acc)
+{
+	*ctx = (struct polyval_ctx){ .key = key, .acc = *acc };
+}
+
+/**
+ * polyval_export_blkaligned() - Export a POLYVAL accumulator value
+ * @ctx: The context to export the accumulator value from
+ * @acc: (output) The exported accumulator value
+ *
+ * This exports the accumulator from a POLYVAL context.  The number of data
+ * bytes processed so far must be a multiple of POLYVAL_BLOCK_SIZE.
+ */
+static inline void polyval_export_blkaligned(const struct polyval_ctx *ctx,
+					     struct polyval_elem *acc)
+{
+	*acc = ctx->acc;
+}
+
+/**
+ * polyval_update() - Update a POLYVAL context with message data
+ * @ctx: The context to update; must have been initialized
+ * @data: The message data
+ * @len: The data length in bytes.  Doesn't need to be block-aligned.
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * polyval_final() - Finish computing a POLYVAL value
+ * @ctx: The context to finalize
+ * @out: The output value
+ *
+ * If the total data length isn't a multiple of POLYVAL_BLOCK_SIZE, then the
+ * final block is automatically zero-padded.
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]);
+
+/**
+ * polyval() - Compute a POLYVAL value
+ * @key: The prepared key
+ * @data: The message data
+ * @len: The data length in bytes.  Doesn't need to be block-aligned.
+ * @out: The output value
+ *
+ * Context: Any context.
+ */
+static inline void polyval(const struct polyval_key *key,
+			   const u8 *data, size_t len,
+			   u8 out[POLYVAL_BLOCK_SIZE])
+{
+	struct polyval_ctx ctx;
+
+	polyval_init(&ctx, key);
+	polyval_update(&ctx, data, len);
+	polyval_final(&ctx, out);
+}
+
+#endif /* _CRYPTO_POLYVAL_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 7445054fc0ad..6545f0e83b83 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -135,6 +135,16 @@ config CRYPTO_LIB_POLY1305_RSIZE
 	default 9 if ARM || ARM64
 	default 1
 
+config CRYPTO_LIB_POLYVAL
+	tristate
+	help
+	  The POLYVAL library functions.  Select this if your module uses any of
+	  the functions from <crypto/polyval.h>.
+
+config CRYPTO_LIB_POLYVAL_ARCH
+	bool
+	depends on CRYPTO_LIB_POLYVAL && !UML
+
 config CRYPTO_LIB_CHACHA20POLY1305
 	tristate
 	select CRYPTO_LIB_CHACHA
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 5515e73bfd5e..055e44008805 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -198,6 +198,14 @@ clean-files += arm/poly1305-core.S \
 
 ################################################################################
 
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
+libpolyval-y := polyval.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
+CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+endif
+
+################################################################################
+
 obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
 libsha1-y := sha1.o
 ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c
new file mode 100644
index 000000000000..5796275f574a
--- /dev/null
+++ b/lib/crypto/polyval.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * POLYVAL library functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/polyval.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+/*
+ * POLYVAL is an almost-XOR-universal hash function.  Similar to GHASH, POLYVAL
+ * interprets the message as the coefficients of a polynomial in GF(2^128) and
+ * evaluates that polynomial at a secret point.  POLYVAL has a simple
+ * mathematical relationship with GHASH, but it uses a better field convention
+ * which makes it easier and faster to implement.
+ *
+ * POLYVAL is not a cryptographic hash function, and it should be used only by
+ * algorithms that are specifically designed to use it.
+ *
+ * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
+ * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
+ *
+ * POLYVAL is also used by HCTR2.  See "Length-preserving encryption with HCTR2"
+ * (https://eprint.iacr.org/2021/1441.pdf).
+ *
+ * This file provides a library API for POLYVAL.  This API can delegate to
+ * either a generic implementation or an architecture-optimized implementation.
+ *
+ * For the generic implementation, we don't use the traditional table approach
+ * to GF(2^128) multiplication.  That approach is not constant-time and requires
+ * a lot of memory.  Instead, we use a different approach which emulates
+ * carryless multiplication using standard multiplications by spreading the data
+ * bits apart using "holes".  This allows the carries to spill harmlessly.  This
+ * approach is borrowed from BoringSSL, which in turn credits BearSSL's
+ * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
+ * "holes" trick and a presentation by Shay Gueron
+ * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
+ * 256-bit => 128-bit reduction algorithm.
+ */
+
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+	/*
+	 * With 64-bit multiplicands and one term every 4 bits, there would be
+	 * up to 64 / 4 = 16 one bits per column when each multiplication is
+	 * written out as a series of additions in the schoolbook manner.
+	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
+	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
+	 *
+	 * Using one term every 5 bits would work.  However, that would cost
+	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
+	 *
+	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
+	 * one bits per column.  Then handle those 4 bits separately.
+	 */
+	u64 a0 = a & 0x1111111111111110;
+	u64 a1 = a & 0x2222222222222220;
+	u64 a2 = a & 0x4444444444444440;
+	u64 a3 = a & 0x8888888888888880;
+
+	u64 b0 = b & 0x1111111111111111;
+	u64 b1 = b & 0x2222222222222222;
+	u64 b2 = b & 0x4444444444444444;
+	u64 b3 = b & 0x8888888888888888;
+
+	/* Multiply the high 60 bits of @a by @b. */
+	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
+		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
+	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
+		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
+	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
+		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
+	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
+		  (a2 * (u128)b1) ^ (a3 * (u128)b0);
+
+	/* Multiply the low 4 bits of @a by @b. */
+	u64 e0 = -(a & 1) & b;
+	u64 e1 = -((a >> 1) & 1) & b;
+	u64 e2 = -((a >> 2) & 1) & b;
+	u64 e3 = -((a >> 3) & 1) & b;
+	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+	/* Add all the intermediate products together. */
+	*out_lo = (((u64)c0) & 0x1111111111111111) ^
+		  (((u64)c1) & 0x2222222222222222) ^
+		  (((u64)c2) & 0x4444444444444444) ^
+		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
+	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
+		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
+		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
+		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+}
+
+#else /* CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Do a 32 x 32 => 64 bit carryless multiplication. */
+static u64 clmul32(u32 a, u32 b)
+{
+	/*
+	 * With 32-bit multiplicands and one term every 4 bits, there are up to
+	 * 32 / 4 = 8 one bits per column when each multiplication is written
+	 * out as a series of additions in the schoolbook manner.  The value 8
+	 * fits in 4 bits, so the carries don't overflow into the next term.
+	 */
+	u32 a0 = a & 0x11111111;
+	u32 a1 = a & 0x22222222;
+	u32 a2 = a & 0x44444444;
+	u32 a3 = a & 0x88888888;
+
+	u32 b0 = b & 0x11111111;
+	u32 b1 = b & 0x22222222;
+	u32 b2 = b & 0x44444444;
+	u32 b3 = b & 0x88888888;
+
+	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
+		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
+	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
+		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
+	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
+		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
+	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
+		 (a2 * (u64)b1) ^ (a3 * (u64)b0);
+
+	/* Add all the intermediate products together. */
+	return (c0 & 0x1111111111111111) ^
+	       (c1 & 0x2222222222222222) ^
+	       (c2 & 0x4444444444444444) ^
+	       (c3 & 0x8888888888888888);
+}
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+	u32 a_lo = (u32)a;
+	u32 a_hi = a >> 32;
+	u32 b_lo = (u32)b;
+	u32 b_hi = b >> 32;
+
+	/* Karatsuba multiplication */
+	u64 lo = clmul32(a_lo, b_lo);
+	u64 hi = clmul32(a_hi, b_hi);
+	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+
+	*out_lo = lo ^ (mi << 32);
+	*out_hi = hi ^ (mi >> 32);
+}
+#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
+static void __maybe_unused
+polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
+{
+	u64 c0, c1, c2, c3, mi0, mi1;
+
+	/*
+	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
+	 * the 256-bit product in @c0 (low) through @c3 (high).
+	 */
+	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
+	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
+	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
+		&mi0, &mi1);
+	mi0 ^= c0 ^ c2;
+	mi1 ^= c1 ^ c3;
+	c1 ^= mi0;
+	c2 ^= mi1;
+
+	/*
+	 * Cancel out the low 128 bits of the product by adding multiples of
+	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
+	 * of which cancels out 64 bits.  Note that we break G(x) into three
+	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
+	 */
+
+	/*
+	 * First, add G(x) times c0 as follows:
+	 *
+	 * (c0, c1, c2) = (0,
+	 *                 c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
+	 *		   c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
+	 */
+	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
+	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
+
+	/*
+	 * Second, add G(x) times the new c1:
+	 *
+	 * (c1, c2, c3) = (0,
+	 *                 c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
+	 *		   c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
+	 */
+	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
+	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
+
+	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
+	a->lo = cpu_to_le64(c2);
+	a->hi = cpu_to_le64(c3);
+}
+
+static void __maybe_unused
+polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
+		       const u8 *data, size_t nblocks)
+{
+	do {
+		acc->lo ^= get_unaligned((__le64 *)data);
+		acc->hi ^= get_unaligned((__le64 *)(data + 8));
+		polyval_mul_generic(acc, key);
+		data += POLYVAL_BLOCK_SIZE;
+	} while (--nblocks);
+}
+
+/* Include the arch-optimized implementation of POLYVAL, if one is available. */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#include "polyval.h" /* $(SRCARCH)/polyval.h */
+void polyval_preparekey(struct polyval_key *key,
+			const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	polyval_preparekey_arch(key, raw_key);
+}
+EXPORT_SYMBOL_GPL(polyval_preparekey);
+#endif /* Else, polyval_preparekey() is an inline function. */
+
+/*
+ * polyval_mul_generic() and polyval_blocks_generic() take the key as a
+ * polyval_elem rather than a polyval_key, so that arch-optimized
+ * implementations with a different key format can use it as a fallback (if they
+ * have H^1 stored somewhere in their struct).  Thus, the following dispatch
+ * code is needed to pass the appropriate key argument.
+ */
+
+static void polyval_mul(struct polyval_ctx *ctx)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+	polyval_mul_arch(&ctx->acc, ctx->key);
+#else
+	polyval_mul_generic(&ctx->acc, &ctx->key->h);
+#endif
+}
+
+static void polyval_blocks(struct polyval_ctx *ctx,
+			   const u8 *data, size_t nblocks)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
+#else
+	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
+#endif
+}
+
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
+{
+	if (unlikely(ctx->partial)) {
+		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
+
+		len -= n;
+		while (n--)
+			ctx->acc.bytes[ctx->partial++] ^= *data++;
+		if (ctx->partial < POLYVAL_BLOCK_SIZE)
+			return;
+		polyval_mul(ctx);
+	}
+	if (len >= POLYVAL_BLOCK_SIZE) {
+		size_t nblocks = len / POLYVAL_BLOCK_SIZE;
+
+		polyval_blocks(ctx, data, nblocks);
+		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
+		len &= POLYVAL_BLOCK_SIZE - 1;
+	}
+	for (size_t i = 0; i < len; i++)
+		ctx->acc.bytes[i] ^= data[i];
+	ctx->partial = len;
+}
+EXPORT_SYMBOL_GPL(polyval_update);
+
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
+{
+	if (unlikely(ctx->partial))
+		polyval_mul(ctx);
+	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(polyval_final);
+
+#ifdef polyval_mod_init_arch
+static int __init polyval_mod_init(void)
+{
+	polyval_mod_init_arch();
+	return 0;
+}
+subsys_initcall(polyval_mod_init);
+
+static void __exit polyval_mod_exit(void)
+{
+}
+module_exit(polyval_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 37919e239ebb2cba573cca56292f7c39fa6d7415 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sun, 9 Nov 2025 15:47:19 -0800
Subject: lib/crypto: arm64/polyval: Migrate optimized code into library

Migrate the arm64 implementation of POLYVAL into lib/crypto/, wiring it
up to the POLYVAL library interface.  This makes the POLYVAL library be
properly optimized on arm64.

This drops the arm64 optimizations of polyval in the crypto_shash API.
That's fine, because polyval will be removed from crypto_shash entirely,
as it is unneeded there.  But even if it comes back, the crypto_shash
API could just be implemented on top of the library API, as usual.

Adjust the names and prototypes of the assembly functions to align more
closely with the rest of the library code.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251109234726.638437-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/arm64/crypto/Kconfig           |  10 -
 arch/arm64/crypto/Makefile          |   3 -
 arch/arm64/crypto/polyval-ce-core.S | 361 ------------------------------------
 arch/arm64/crypto/polyval-ce-glue.c | 158 ----------------
 include/crypto/polyval.h            |   8 +
 lib/crypto/Kconfig                  |   1 +
 lib/crypto/Makefile                 |   1 +
 lib/crypto/arm64/polyval-ce-core.S  | 359 +++++++++++++++++++++++++++++++++++
 lib/crypto/arm64/polyval.h          |  82 ++++++++
 9 files changed, 451 insertions(+), 532 deletions(-)
 delete mode 100644 arch/arm64/crypto/polyval-ce-core.S
 delete mode 100644 arch/arm64/crypto/polyval-ce-glue.c
 create mode 100644 lib/crypto/arm64/polyval-ce-core.S
 create mode 100644 lib/crypto/arm64/polyval.h

(limited to 'include')

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 376d6b50743f..bdd276a6e540 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -47,16 +47,6 @@ config CRYPTO_SM3_ARM64_CE
 	  Architecture: arm64 using:
 	  - ARMv8.2 Crypto Extensions
 
-config CRYPTO_POLYVAL_ARM64_CE
-	tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_POLYVAL
-	help
-	  POLYVAL hash function for HCTR2
-
-	  Architecture: arm64 using:
-	  - ARMv8 Crypto Extensions
-
 config CRYPTO_AES_ARM64
 	tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS"
 	select CRYPTO_AES
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index fd3d590fa113..1e330aa08d3f 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -29,9 +29,6 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
-obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
-polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
-
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S
deleted file mode 100644
index b5326540d2e3..000000000000
--- a/arch/arm64/crypto/polyval-ce-core.S
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Implementation of POLYVAL using ARMv8 Crypto Extensions.
- *
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
- * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
- * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
- * finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process  only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#define STRIDE_BLOCKS 8
-
-KEY_POWERS	.req	x0
-MSG		.req	x1
-BLOCKS_LEFT	.req	x2
-ACCUMULATOR	.req	x3
-KEY_START	.req	x10
-EXTRA_BYTES	.req	x11
-TMP	.req	x13
-
-M0	.req	v0
-M1	.req	v1
-M2	.req	v2
-M3	.req	v3
-M4	.req	v4
-M5	.req	v5
-M6	.req	v6
-M7	.req	v7
-KEY8	.req	v8
-KEY7	.req	v9
-KEY6	.req	v10
-KEY5	.req	v11
-KEY4	.req	v12
-KEY3	.req	v13
-KEY2	.req	v14
-KEY1	.req	v15
-PL	.req	v16
-PH	.req	v17
-TMP_V	.req	v18
-LO	.req	v20
-MI	.req	v21
-HI	.req	v22
-SUM	.req	v23
-GSTAR	.req	v24
-
-	.text
-
-	.arch	armv8-a+crypto
-	.align	4
-
-.Lgstar:
-	.quad	0xc200000000000000, 0xc200000000000000
-
-/*
- * Computes the product of two 128-bit polynomials in X and Y and XORs the
- * components of the 256-bit product into LO, MI, HI.
- *
- * Given:
- *  X = [X_1 : X_0]
- *  Y = [Y_1 : Y_0]
- *
- * We compute:
- *  LO += X_0 * Y_0
- *  MI += (X_0 + X_1) * (Y_0 + Y_1)
- *  HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * Karatsuba multiplication is used instead of Schoolbook multiplication because
- * it was found to be slightly faster on ARM64 CPUs.
- *
- */
-.macro karatsuba1 X Y
-	X .req \X
-	Y .req \Y
-	ext	v25.16b, X.16b, X.16b, #8
-	ext	v26.16b, Y.16b, Y.16b, #8
-	eor	v25.16b, v25.16b, X.16b
-	eor	v26.16b, v26.16b, Y.16b
-	pmull2	v28.1q, X.2d, Y.2d
-	pmull	v29.1q, X.1d, Y.1d
-	pmull	v27.1q, v25.1d, v26.1d
-	eor	HI.16b, HI.16b, v28.16b
-	eor	LO.16b, LO.16b, v29.16b
-	eor	MI.16b, MI.16b, v27.16b
-	.unreq X
-	.unreq Y
-.endm
-
-/*
- * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
- * them.
- */
-.macro karatsuba1_store X Y
-	X .req \X
-	Y .req \Y
-	ext	v25.16b, X.16b, X.16b, #8
-	ext	v26.16b, Y.16b, Y.16b, #8
-	eor	v25.16b, v25.16b, X.16b
-	eor	v26.16b, v26.16b, Y.16b
-	pmull2	HI.1q, X.2d, Y.2d
-	pmull	LO.1q, X.1d, Y.1d
-	pmull	MI.1q, v25.1d, v26.1d
-	.unreq X
-	.unreq Y
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- * [PH : PL] =
- *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
- */
-.macro karatsuba2
-	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
-	eor	v4.16b, HI.16b, MI.16b
-	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
-	eor	v4.16b, v4.16b, LO.16b
-	// v5 = [HI_0 : LO_1]
-	ext	v5.16b, LO.16b, HI.16b, #8
-	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
-	eor	v4.16b, v4.16b, v5.16b
-	// HI = [HI_0 : HI_1]
-	ext	HI.16b, HI.16b, HI.16b, #8
-	// LO = [LO_0 : LO_1]
-	ext	LO.16b, LO.16b, LO.16b, #8
-	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
-	ext	PH.16b, v4.16b, HI.16b, #8
-	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
-	ext	PL.16b, LO.16b, v4.16b, #8
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
- * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128.  To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right
- * shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- *   T = T_1 : T_0 = g*(x) * P_0
- *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-	DEST .req \dest
-	// TMP_V = T_1 : T_0 = P_0 * g*(x)
-	pmull	TMP_V.1q, PL.1d, GSTAR.1d
-	// TMP_V = T_0 : T_1
-	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
-	// TMP_V = P_1 + T_0 : P_0 + T_1
-	eor	TMP_V.16b, PL.16b, TMP_V.16b
-	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-	eor	PH.16b, PH.16b, TMP_V.16b
-	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
-	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
-	eor	DEST.16b, PH.16b, TMP_V.16b
-	.unreq DEST
-.endm
-
-/*
- * Compute Polyval on 8 blocks.
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- *
- * Sets PL, PH.
- */
-.macro full_stride reduce
-	eor		LO.16b, LO.16b, LO.16b
-	eor		MI.16b, MI.16b, MI.16b
-	eor		HI.16b, HI.16b, HI.16b
-
-	ld1		{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
-	ld1		{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
-
-	karatsuba1 M7 KEY1
-	.if \reduce
-	pmull	TMP_V.1q, PL.1d, GSTAR.1d
-	.endif
-
-	karatsuba1 M6 KEY2
-	.if \reduce
-	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
-	.endif
-
-	karatsuba1 M5 KEY3
-	.if \reduce
-	eor	TMP_V.16b, PL.16b, TMP_V.16b
-	.endif
-
-	karatsuba1 M4 KEY4
-	.if \reduce
-	eor	PH.16b, PH.16b, TMP_V.16b
-	.endif
-
-	karatsuba1 M3 KEY5
-	.if \reduce
-	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
-	.endif
-
-	karatsuba1 M2 KEY6
-	.if \reduce
-	eor	SUM.16b, PH.16b, TMP_V.16b
-	.endif
-
-	karatsuba1 M1 KEY7
-	eor	M0.16b, M0.16b, SUM.16b
-
-	karatsuba1 M0 KEY8
-	karatsuba2
-.endm
-
-/*
- * Handle any extra blocks after full_stride loop.
- */
-.macro partial_stride
-	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
-	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
-	ld1	{KEY1.16b}, [KEY_POWERS], #16
-
-	ld1	{TMP_V.16b}, [MSG], #16
-	eor	SUM.16b, SUM.16b, TMP_V.16b
-	karatsuba1_store KEY1 SUM
-	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1
-
-	tst	BLOCKS_LEFT, #4
-	beq	.Lpartial4BlocksDone
-	ld1	{M0.16b, M1.16b,  M2.16b, M3.16b}, [MSG], #64
-	ld1	{KEY8.16b, KEY7.16b, KEY6.16b,	KEY5.16b}, [KEY_POWERS], #64
-	karatsuba1 M0 KEY8
-	karatsuba1 M1 KEY7
-	karatsuba1 M2 KEY6
-	karatsuba1 M3 KEY5
-.Lpartial4BlocksDone:
-	tst	BLOCKS_LEFT, #2
-	beq	.Lpartial2BlocksDone
-	ld1	{M0.16b, M1.16b}, [MSG], #32
-	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
-	karatsuba1 M0 KEY8
-	karatsuba1 M1 KEY7
-.Lpartial2BlocksDone:
-	tst	BLOCKS_LEFT, #1
-	beq	.LpartialDone
-	ld1	{M0.16b}, [MSG], #16
-	ld1	{KEY8.16b}, [KEY_POWERS], #16
-	karatsuba1 M0 KEY8
-.LpartialDone:
-	karatsuba2
-	montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void pmull_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(pmull_polyval_mul)
-	adr	TMP, .Lgstar
-	ld1	{GSTAR.2d}, [TMP]
-	ld1	{v0.16b}, [x0]
-	ld1	{v1.16b}, [x1]
-	karatsuba1_store v0 v1
-	karatsuba2
-	montgomery_reduction SUM
-	st1	{SUM.16b}, [x0]
-	ret
-SYM_FUNC_END(pmull_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL.  This computes:
- *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * x0 - pointer to precomputed key powers h^8 ... h^1
- * x1 - pointer to message blocks
- * x2 - number of blocks to hash
- * x3 - pointer to accumulator
- *
- * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
- *			     size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(pmull_polyval_update)
-	adr	TMP, .Lgstar
-	mov	KEY_START, KEY_POWERS
-	ld1	{GSTAR.2d}, [TMP]
-	ld1	{SUM.16b}, [ACCUMULATOR]
-	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
-	blt .LstrideLoopExit
-	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
-	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
-	full_stride 0
-	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
-	blt .LstrideLoopExitReduce
-.LstrideLoop:
-	full_stride 1
-	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
-	bge	.LstrideLoop
-.LstrideLoopExitReduce:
-	montgomery_reduction SUM
-.LstrideLoopExit:
-	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
-	beq	.LskipPartial
-	partial_stride
-.LskipPartial:
-	st1	{SUM.16b}, [ACCUMULATOR]
-	ret
-SYM_FUNC_END(pmull_polyval_update)
diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c
deleted file mode 100644
index c4e653688ea0..000000000000
--- a/arch/arm64/crypto/polyval-ce-glue.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using ARMv8 Crypto Extensions
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication accelerated by
- * ARMv8 Crypto Extensions instructions to implement the finite field operations.
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/cpufeature.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define NUM_KEY_POWERS	8
-
-struct polyval_tfm_ctx {
-	/*
-	 * These powers must be in the order h^8, ..., h^1.
-	 */
-	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
-};
-
-struct polyval_desc_ctx {
-	u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator)
-{
-	kernel_neon_begin();
-	pmull_polyval_update(keys, in, nblocks, accumulator);
-	kernel_neon_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-	kernel_neon_begin();
-	pmull_polyval_mul(op1, op2);
-	kernel_neon_end();
-}
-
-static int polyval_arm64_setkey(struct crypto_shash *tfm,
-			const u8 *key, unsigned int keylen)
-{
-	struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-	int i;
-
-	if (keylen != POLYVAL_BLOCK_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-		internal_polyval_mul(tctx->key_powers[i],
-				     tctx->key_powers[i+1]);
-	}
-
-	return 0;
-}
-
-static int polyval_arm64_init(struct shash_desc *desc)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memset(dctx, 0, sizeof(*dctx));
-
-	return 0;
-}
-
-static int polyval_arm64_update(struct shash_desc *desc,
-			 const u8 *src, unsigned int srclen)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
-	unsigned int nblocks;
-
-	do {
-		/* allow rescheduling every 4K bytes */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-		src += nblocks * POLYVAL_BLOCK_SIZE;
-	} while (srclen >= POLYVAL_BLOCK_SIZE);
-
-	return srclen;
-}
-
-static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src,
-			       unsigned int len, u8 *dst)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
-
-	if (len) {
-		crypto_xor(dctx->buffer, src, len);
-		internal_polyval_mul(dctx->buffer,
-				     tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg polyval_alg = {
-	.digestsize	= POLYVAL_DIGEST_SIZE,
-	.init		= polyval_arm64_init,
-	.update		= polyval_arm64_update,
-	.finup		= polyval_arm64_finup,
-	.setkey		= polyval_arm64_setkey,
-	.descsize	= sizeof(struct polyval_desc_ctx),
-	.base		= {
-		.cra_name		= "polyval",
-		.cra_driver_name	= "polyval-ce",
-		.cra_priority		= 200,
-		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct polyval_tfm_ctx),
-		.cra_module		= THIS_MODULE,
-	},
-};
-
-static int __init polyval_ce_mod_init(void)
-{
-	return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_ce_mod_exit(void)
-{
-	crypto_unregister_shash(&polyval_alg);
-}
-
-module_cpu_feature_match(PMULL, polyval_ce_mod_init)
-module_exit(polyval_ce_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-ce");
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
index 5ba4c248cad1..f8aaf4275fbd 100644
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -39,10 +39,18 @@ struct polyval_elem {
  * This may contain just the raw key H, or it may contain precomputed key
  * powers, depending on the platform's POLYVAL implementation.  Use
  * polyval_preparekey() to initialize this.
+ *
+ * By H^i we mean H^(i-1) * H * x^-128, with base case H^1 = H.  I.e. the
+ * exponentiation repeats the POLYVAL dot operation, with its "extra" x^-128.
  */
 struct polyval_key {
 #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#ifdef CONFIG_ARM64
+	/** @h_powers: Powers of the hash key H^8 through H^1 */
+	struct polyval_elem h_powers[8];
+#else
 #error "Unhandled arch"
+#endif
 #else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */
 	/** @h: The hash key H */
 	struct polyval_elem h;
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 6545f0e83b83..430723994142 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -144,6 +144,7 @@ config CRYPTO_LIB_POLYVAL
 config CRYPTO_LIB_POLYVAL_ARCH
 	bool
 	depends on CRYPTO_LIB_POLYVAL && !UML
+	default y if ARM64 && KERNEL_MODE_NEON
 
 config CRYPTO_LIB_CHACHA20POLY1305
 	tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 055e44008805..2efa96afcb4b 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -202,6 +202,7 @@ obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
 libpolyval-y := polyval.o
 ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
 CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
 endif
 
 ################################################################################
diff --git a/lib/crypto/arm64/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S
new file mode 100644
index 000000000000..7c731a044d02
--- /dev/null
+++ b/lib/crypto/arm64/polyval-ce-core.S
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
+ * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+ACCUMULATOR	.req	x0
+KEY_POWERS	.req	x1
+MSG		.req	x2
+BLOCKS_LEFT	.req	x3
+KEY_START	.req	x10
+EXTRA_BYTES	.req	x11
+TMP	.req	x13
+
+M0	.req	v0
+M1	.req	v1
+M2	.req	v2
+M3	.req	v3
+M4	.req	v4
+M5	.req	v5
+M6	.req	v6
+M7	.req	v7
+KEY8	.req	v8
+KEY7	.req	v9
+KEY6	.req	v10
+KEY5	.req	v11
+KEY4	.req	v12
+KEY3	.req	v13
+KEY2	.req	v14
+KEY1	.req	v15
+PL	.req	v16
+PH	.req	v17
+TMP_V	.req	v18
+LO	.req	v20
+MI	.req	v21
+HI	.req	v22
+SUM	.req	v23
+GSTAR	.req	v24
+
+	.text
+
+	.arch	armv8-a+crypto
+	.align	4
+
+.Lgstar:
+	.quad	0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *  X = [X_1 : X_0]
+ *  Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *  LO += X_0 * Y_0
+ *  MI += (X_0 + X_1) * (Y_0 + Y_1)
+ *  HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
+.macro karatsuba1 X Y
+	X .req \X
+	Y .req \Y
+	ext	v25.16b, X.16b, X.16b, #8
+	ext	v26.16b, Y.16b, Y.16b, #8
+	eor	v25.16b, v25.16b, X.16b
+	eor	v26.16b, v26.16b, Y.16b
+	pmull2	v28.1q, X.2d, Y.2d
+	pmull	v29.1q, X.1d, Y.1d
+	pmull	v27.1q, v25.1d, v26.1d
+	eor	HI.16b, HI.16b, v28.16b
+	eor	LO.16b, LO.16b, v29.16b
+	eor	MI.16b, MI.16b, v27.16b
+	.unreq X
+	.unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+	X .req \X
+	Y .req \Y
+	ext	v25.16b, X.16b, X.16b, #8
+	ext	v26.16b, Y.16b, Y.16b, #8
+	eor	v25.16b, v25.16b, X.16b
+	eor	v26.16b, v26.16b, Y.16b
+	pmull2	HI.1q, X.2d, Y.2d
+	pmull	LO.1q, X.1d, Y.1d
+	pmull	MI.1q, v25.1d, v26.1d
+	.unreq X
+	.unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+	eor	v4.16b, HI.16b, MI.16b
+	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+	eor	v4.16b, v4.16b, LO.16b
+	// v5 = [HI_0 : LO_1]
+	ext	v5.16b, LO.16b, HI.16b, #8
+	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+	eor	v4.16b, v4.16b, v5.16b
+	// HI = [HI_0 : HI_1]
+	ext	HI.16b, HI.16b, HI.16b, #8
+	// LO = [LO_0 : LO_1]
+	ext	LO.16b, LO.16b, LO.16b, #8
+	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+	ext	PH.16b, v4.16b, HI.16b, #8
+	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+	ext	PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+	DEST .req \dest
+	// TMP_V = T_1 : T_0 = P_0 * g*(x)
+	pmull	TMP_V.1q, PL.1d, GSTAR.1d
+	// TMP_V = T_0 : T_1
+	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+	// TMP_V = P_1 + T_0 : P_0 + T_1
+	eor	TMP_V.16b, PL.16b, TMP_V.16b
+	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+	eor	PH.16b, PH.16b, TMP_V.16b
+	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
+	eor	DEST.16b, PH.16b, TMP_V.16b
+	.unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+	eor		LO.16b, LO.16b, LO.16b
+	eor		MI.16b, MI.16b, MI.16b
+	eor		HI.16b, HI.16b, HI.16b
+
+	ld1		{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+	ld1		{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+	karatsuba1 M7 KEY1
+	.if \reduce
+	pmull	TMP_V.1q, PL.1d, GSTAR.1d
+	.endif
+
+	karatsuba1 M6 KEY2
+	.if \reduce
+	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+	.endif
+
+	karatsuba1 M5 KEY3
+	.if \reduce
+	eor	TMP_V.16b, PL.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M4 KEY4
+	.if \reduce
+	eor	PH.16b, PH.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M3 KEY5
+	.if \reduce
+	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
+	.endif
+
+	karatsuba1 M2 KEY6
+	.if \reduce
+	eor	SUM.16b, PH.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M1 KEY7
+	eor	M0.16b, M0.16b, SUM.16b
+
+	karatsuba1 M0 KEY8
+	karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after full_stride loop.
+ */
+.macro partial_stride
+	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+	ld1	{KEY1.16b}, [KEY_POWERS], #16
+
+	ld1	{TMP_V.16b}, [MSG], #16
+	eor	SUM.16b, SUM.16b, TMP_V.16b
+	karatsuba1_store KEY1 SUM
+	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+	tst	BLOCKS_LEFT, #4
+	beq	.Lpartial4BlocksDone
+	ld1	{M0.16b, M1.16b,  M2.16b, M3.16b}, [MSG], #64
+	ld1	{KEY8.16b, KEY7.16b, KEY6.16b,	KEY5.16b}, [KEY_POWERS], #64
+	karatsuba1 M0 KEY8
+	karatsuba1 M1 KEY7
+	karatsuba1 M2 KEY6
+	karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+	tst	BLOCKS_LEFT, #2
+	beq	.Lpartial2BlocksDone
+	ld1	{M0.16b, M1.16b}, [MSG], #32
+	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+	karatsuba1 M0 KEY8
+	karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+	tst	BLOCKS_LEFT, #1
+	beq	.LpartialDone
+	ld1	{M0.16b}, [MSG], #16
+	ld1	{KEY8.16b}, [KEY_POWERS], #16
+	karatsuba1 M0 KEY8
+.LpartialDone:
+	karatsuba2
+	montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pmull(struct polyval_elem *a,
+ *			  const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pmull)
+	adr	TMP, .Lgstar
+	ld1	{GSTAR.2d}, [TMP]
+	ld1	{v0.16b}, [x0]
+	ld1	{v1.16b}, [x1]
+	karatsuba1_store v0 v1
+	karatsuba2
+	montgomery_reduction SUM
+	st1	{SUM.16b}, [x0]
+	ret
+SYM_FUNC_END(polyval_mul_pmull)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to accumulator
+ * x1 - pointer to precomputed key powers h^8 ... h^1
+ * x2 - pointer to message blocks
+ * x3 - number of blocks to hash
+ *
+ * void polyval_blocks_pmull(struct polyval_elem *acc,
+ *			     const struct polyval_key *key,
+ *			     const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pmull)
+	adr	TMP, .Lgstar
+	mov	KEY_START, KEY_POWERS
+	ld1	{GSTAR.2d}, [TMP]
+	ld1	{SUM.16b}, [ACCUMULATOR]
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	blt .LstrideLoopExit
+	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+	full_stride 0
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	blt .LstrideLoopExitReduce
+.LstrideLoop:
+	full_stride 1
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	bge	.LstrideLoop
+.LstrideLoopExitReduce:
+	montgomery_reduction SUM
+.LstrideLoopExit:
+	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	beq	.LskipPartial
+	partial_stride
+.LskipPartial:
+	st1	{SUM.16b}, [ACCUMULATOR]
+	ret
+SYM_FUNC_END(polyval_blocks_pmull)
diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h
new file mode 100644
index 000000000000..2486e80750d0
--- /dev/null
+++ b/lib/crypto/arm64/polyval.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, arm64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
+				  const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
+				     const struct polyval_key *key,
+				     const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+				    const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		kernel_neon_begin();
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_pmull(&key->h_powers[i],
+					  &key->h_powers[NUM_H_POWERS - 1]);
+		}
+		kernel_neon_end();
+	} else {
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_generic(&key->h_powers[i],
+					    &key->h_powers[NUM_H_POWERS - 1]);
+		}
+	}
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+			     const struct polyval_key *key)
+{
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		kernel_neon_begin();
+		polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
+		kernel_neon_end();
+	} else {
+		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+	}
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+				const struct polyval_key *key,
+				const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		do {
+			/* Allow rescheduling every 4 KiB. */
+			size_t n = min_t(size_t, nblocks,
+					 4096 / POLYVAL_BLOCK_SIZE);
+
+			kernel_neon_begin();
+			polyval_blocks_pmull(acc, key, data, n);
+			kernel_neon_end();
+			data += n * POLYVAL_BLOCK_SIZE;
+			nblocks -= n;
+		} while (nblocks);
+	} else {
+		polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+				       data, nblocks);
+	}
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(PMULL))
+		static_branch_enable(&have_pmull);
+}
-- 
cgit v1.2.3


From 4d8da35579daad0392d238460ed7e9629d49ca35 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sun, 9 Nov 2025 15:47:20 -0800
Subject: lib/crypto: x86/polyval: Migrate optimized code into library

Migrate the x86_64 implementation of POLYVAL into lib/crypto/, wiring it
up to the POLYVAL library interface.  This makes the POLYVAL library be
properly optimized on x86_64.

This drops the x86_64 optimizations of polyval in the crypto_shash API.
That's fine, since polyval will be removed from crypto_shash entirely
since it is unneeded there.  But even if it comes back, the crypto_shash
API could just be implemented on top of the library API, as usual.

Adjust the names and prototypes of the assembly functions to align more
closely with the rest of the library code.

Also replace a movaps instruction with movups to remove the assumption
that the key struct is 16-byte aligned.  Users can still align the key
if they want (and at least in this case, movups is just as fast as
movaps), but it's inconvenient to require it.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251109234726.638437-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/x86/crypto/Kconfig                |  10 -
 arch/x86/crypto/Makefile               |   3 -
 arch/x86/crypto/polyval-clmulni_asm.S  | 321 ---------------------------------
 arch/x86/crypto/polyval-clmulni_glue.c | 180 ------------------
 include/crypto/polyval.h               |   3 +
 lib/crypto/Kconfig                     |   1 +
 lib/crypto/Makefile                    |   1 +
 lib/crypto/x86/polyval-pclmul-avx.S    | 319 ++++++++++++++++++++++++++++++++
 lib/crypto/x86/polyval.h               |  83 +++++++++
 9 files changed, 407 insertions(+), 514 deletions(-)
 delete mode 100644 arch/x86/crypto/polyval-clmulni_asm.S
 delete mode 100644 arch/x86/crypto/polyval-clmulni_glue.c
 create mode 100644 lib/crypto/x86/polyval-pclmul-avx.S
 create mode 100644 lib/crypto/x86/polyval.h

(limited to 'include')

diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 48d3076b6053..3fd2423d3cf8 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
 	  Architecture: x86_64 using:
 	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_POLYVAL_CLMUL_NI
-	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on 64BIT
-	select CRYPTO_POLYVAL
-	help
-	  POLYVAL hash function for HCTR2
-
-	  Architecture: x86_64 using:
-	  - CLMUL-NI (carry-less multiplication new instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2d30d5d36145..4a24dd38da50 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -52,9 +52,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
deleted file mode 100644
index a6ebe4e7dd2b..000000000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process  only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section    .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
-	.quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
-	.set i, 0
-	.rept (\count)
-		schoolbook1_iteration i 0
-		.set i, (i +1)
-	.endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- *   X = [X_1 : X_0]
- *   Y = [Y_1 : Y_0]
- *
- * We compute:
- *   LO += X_0 * Y_0
- *   MI += X_0 * Y_1 + X_1 * Y_0
- *   HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
-	movups (16*\i)(MSG), %xmm0
-	.if (\i == 0 && \xor_sum == 1)
-		pxor SUM, %xmm0
-	.endif
-	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
-	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
-	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
-	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
-	vpxor %xmm2, MI, MI
-	vpxor %xmm1, LO, LO
-	vpxor %xmm4, HI, HI
-	vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
-	vpclmulqdq $0x01, %xmm0, %xmm1, MI
-	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
-	vpclmulqdq $0x00, %xmm0, %xmm1, LO
-	vpclmulqdq $0x11, %xmm0, %xmm1, HI
-	vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
-	vpslldq $8, MI, PL
-	vpsrldq $8, MI, PH
-	pxor LO, PL
-	pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
- * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128.  To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- *   T = T_1 : T_0 = g*(x) * P_0
- *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
-	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
-	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
-	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
-	vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
-	pxor LO, LO
-	pxor HI, HI
-	pxor MI, MI
-
-	schoolbook1_iteration 7 0
-	.if \reduce
-		vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 6 0
-	.if \reduce
-		pshufd $0b01001110, TMP_XMM, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 5 0
-	.if \reduce
-		pxor PL, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 4 0
-	.if \reduce
-		pxor TMP_XMM, PH
-	.endif
-
-	schoolbook1_iteration 3 0
-	.if \reduce
-		pclmulqdq $0x11, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 2 0
-	.if \reduce
-		vpxor TMP_XMM, PH, SUM
-	.endif
-
-	schoolbook1_iteration 1 0
-
-	schoolbook1_iteration 0 1
-
-	addq $(8*16), MSG
-	schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
-	mov BLOCKS_LEFT, TMP
-	shlq $4, TMP
-	addq $(16*STRIDE_BLOCKS), KEY_POWERS
-	subq TMP, KEY_POWERS
-
-	movups (MSG), %xmm0
-	pxor SUM, %xmm0
-	movaps (KEY_POWERS), %xmm1
-	schoolbook1_noload
-	dec BLOCKS_LEFT
-	addq $16, MSG
-	addq $16, KEY_POWERS
-
-	test $4, BLOCKS_LEFT
-	jz .Lpartial4BlocksDone
-	schoolbook1 4
-	addq $(4*16), MSG
-	addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
-	test $2, BLOCKS_LEFT
-	jz .Lpartial2BlocksDone
-	schoolbook1 2
-	addq $(2*16), MSG
-	addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
-	test $1, BLOCKS_LEFT
-	jz .LpartialDone
-	schoolbook1 1
-.LpartialDone:
-	schoolbook2
-	montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (%rdi), %xmm0
-	movups (%rsi), %xmm1
-	schoolbook1_noload
-	schoolbook2
-	montgomery_reduction SUM
-	movups SUM, (%rdi)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL.  This computes:
- *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- *	const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (ACCUMULATOR), SUM
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExit
-	full_stride 0
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExitReduce
-.LstrideLoop:
-	full_stride 1
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	jns .LstrideLoop
-.LstrideLoopExitReduce:
-	montgomery_reduction SUM
-.LstrideLoopExit:
-	add $STRIDE_BLOCKS, BLOCKS_LEFT
-	jz .LskipPartial
-	partial_stride
-.LskipPartial:
-	movups SUM, (ACCUMULATOR)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
deleted file mode 100644
index 6b466867f91a..000000000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN	16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS	8
-
-struct polyval_tfm_ctx {
-	/*
-	 * These powers must be in the order h^8, ..., h^1.
-	 */
-	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
-	u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
-	return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator)
-{
-	kernel_fpu_begin();
-	clmul_polyval_update(keys, in, nblocks, accumulator);
-	kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-	kernel_fpu_begin();
-	clmul_polyval_mul(op1, op2);
-	kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
-			const u8 *key, unsigned int keylen)
-{
-	struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
-	int i;
-
-	if (keylen != POLYVAL_BLOCK_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-		internal_polyval_mul(tctx->key_powers[i],
-				     tctx->key_powers[i+1]);
-	}
-
-	return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memset(dctx, 0, sizeof(*dctx));
-
-	return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
-			 const u8 *src, unsigned int srclen)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-	unsigned int nblocks;
-
-	do {
-		/* Allow rescheduling every 4K bytes. */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-		src += nblocks * POLYVAL_BLOCK_SIZE;
-	} while (srclen >= POLYVAL_BLOCK_SIZE);
-
-	return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
-			     unsigned int len, u8 *dst)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
-	if (len) {
-		crypto_xor(dctx->buffer, src, len);
-		internal_polyval_mul(dctx->buffer,
-				     tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg polyval_alg = {
-	.digestsize	= POLYVAL_DIGEST_SIZE,
-	.init		= polyval_x86_init,
-	.update		= polyval_x86_update,
-	.finup		= polyval_x86_finup,
-	.setkey		= polyval_x86_setkey,
-	.descsize	= sizeof(struct polyval_desc_ctx),
-	.base		= {
-		.cra_name		= "polyval",
-		.cra_driver_name	= "polyval-clmulni",
-		.cra_priority		= 200,
-		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
-		.cra_ctxsize		= POLYVAL_CTX_SIZE,
-		.cra_module		= THIS_MODULE,
-	},
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
-	if (!x86_match_cpu(pcmul_cpu_id))
-		return -ENODEV;
-
-	if (!boot_cpu_has(X86_FEATURE_AVX))
-		return -ENODEV;
-
-	return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
-	crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
index f8aaf4275fbd..b28b8ef11353 100644
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -48,6 +48,9 @@ struct polyval_key {
 #ifdef CONFIG_ARM64
 	/** @h_powers: Powers of the hash key H^8 through H^1 */
 	struct polyval_elem h_powers[8];
+#elif defined(CONFIG_X86)
+	/** @h_powers: Powers of the hash key H^8 through H^1 */
+	struct polyval_elem h_powers[8];
 #else
 #error "Unhandled arch"
 #endif
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 430723994142..9d04b3771ce2 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -145,6 +145,7 @@ config CRYPTO_LIB_POLYVAL_ARCH
 	bool
 	depends on CRYPTO_LIB_POLYVAL && !UML
 	default y if ARM64 && KERNEL_MODE_NEON
+	default y if X86_64
 
 config CRYPTO_LIB_CHACHA20POLY1305
 	tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 2efa96afcb4b..6580991f8e12 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -203,6 +203,7 @@ libpolyval-y := polyval.o
 ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
 CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
 libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
 endif
 
 ################################################################################
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S
new file mode 100644
index 000000000000..7f739465ad35
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section    .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+	.quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+	.set i, 0
+	.rept (\count)
+		schoolbook1_iteration i 0
+		.set i, (i +1)
+	.endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *   X = [X_1 : X_0]
+ *   Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *   LO += X_0 * Y_0
+ *   MI += X_0 * Y_1 + X_1 * Y_0
+ *   HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+	movups (16*\i)(MSG), %xmm0
+	.if (\i == 0 && \xor_sum == 1)
+		pxor SUM, %xmm0
+	.endif
+	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+	vpxor %xmm2, MI, MI
+	vpxor %xmm1, LO, LO
+	vpxor %xmm4, HI, HI
+	vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+	vpclmulqdq $0x01, %xmm0, %xmm1, MI
+	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+	vpclmulqdq $0x00, %xmm0, %xmm1, LO
+	vpclmulqdq $0x11, %xmm0, %xmm1, HI
+	vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+	vpslldq $8, MI, PL
+	vpsrldq $8, MI, PH
+	pxor LO, PL
+	pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
+	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
+	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+	vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+	pxor LO, LO
+	pxor HI, HI
+	pxor MI, MI
+
+	schoolbook1_iteration 7 0
+	.if \reduce
+		vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 6 0
+	.if \reduce
+		pshufd $0b01001110, TMP_XMM, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 5 0
+	.if \reduce
+		pxor PL, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 4 0
+	.if \reduce
+		pxor TMP_XMM, PH
+	.endif
+
+	schoolbook1_iteration 3 0
+	.if \reduce
+		pclmulqdq $0x11, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 2 0
+	.if \reduce
+		vpxor TMP_XMM, PH, SUM
+	.endif
+
+	schoolbook1_iteration 1 0
+
+	schoolbook1_iteration 0 1
+
+	addq $(8*16), MSG
+	schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+	mov BLOCKS_LEFT, TMP
+	shlq $4, TMP
+	addq $(16*STRIDE_BLOCKS), KEY_POWERS
+	subq TMP, KEY_POWERS
+
+	movups (MSG), %xmm0
+	pxor SUM, %xmm0
+	movups (KEY_POWERS), %xmm1
+	schoolbook1_noload
+	dec BLOCKS_LEFT
+	addq $16, MSG
+	addq $16, KEY_POWERS
+
+	test $4, BLOCKS_LEFT
+	jz .Lpartial4BlocksDone
+	schoolbook1 4
+	addq $(4*16), MSG
+	addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+	test $2, BLOCKS_LEFT
+	jz .Lpartial2BlocksDone
+	schoolbook1 2
+	addq $(2*16), MSG
+	addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+	test $1, BLOCKS_LEFT
+	jz .LpartialDone
+	schoolbook1 1
+.LpartialDone:
+	schoolbook2
+	montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ *			       const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (%rdi), %xmm0
+	movups (%rsi), %xmm1
+	schoolbook1_noload
+	schoolbook2
+	montgomery_reduction SUM
+	movups SUM, (%rdi)
+	FRAME_END
+	RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ *				  const struct polyval_key *key,
+ *				  const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (ACCUMULATOR), SUM
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExit
+	full_stride 0
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExitReduce
+.LstrideLoop:
+	full_stride 1
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	jns .LstrideLoop
+.LstrideLoopExitReduce:
+	montgomery_reduction SUM
+.LstrideLoopExit:
+	add $STRIDE_BLOCKS, BLOCKS_LEFT
+	jz .LskipPartial
+	partial_stride
+.LskipPartial:
+	movups SUM, (ACCUMULATOR)
+	FRAME_END
+	RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h
new file mode 100644
index 000000000000..ef8797521420
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+				       const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+					  const struct polyval_key *key,
+					  const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+				    const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_pclmul_avx(
+				&key->h_powers[i],
+				&key->h_powers[NUM_H_POWERS - 1]);
+		}
+		kernel_fpu_end();
+	} else {
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_generic(&key->h_powers[i],
+					    &key->h_powers[NUM_H_POWERS - 1]);
+		}
+	}
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+			     const struct polyval_key *key)
+{
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+		kernel_fpu_end();
+	} else {
+		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+	}
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+				const struct polyval_key *key,
+				const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		do {
+			/* Allow rescheduling every 4 KiB. */
+			size_t n = min_t(size_t, nblocks,
+					 4096 / POLYVAL_BLOCK_SIZE);
+
+			kernel_fpu_begin();
+			polyval_blocks_pclmul_avx(acc, key, data, n);
+			kernel_fpu_end();
+			data += n * POLYVAL_BLOCK_SIZE;
+			nblocks -= n;
+		} while (nblocks);
+	} else {
+		polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+				       data, nblocks);
+	}
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+	    boot_cpu_has(X86_FEATURE_AVX))
+		static_branch_enable(&have_pclmul_avx);
+}
-- 
cgit v1.2.3


From 693d1eaca940f277af24c74873ef2313816ff444 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Tue, 11 Nov 2025 18:58:35 +0000
Subject: coresight: Change device mode to atomic type

The device mode is defined as a local_t type, which cannot guarantee
SMP-safe access.

Change it to an atomic type and impose acquire/release ordering, which
ensures SMP-safe synchronisation and ordering between the mode setting
and the relevant operations.

Fixes: 22fd532eaa0c ("coresight: etm3x: adding operation mode for etm_enable()")
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Tested-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20251111-arm_coresight_power_management_fix-v6-1-f55553b6c8b3@arm.com
---
 include/linux/coresight.h | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 56d0108658db..2b48be97fcd0 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -251,15 +251,11 @@ struct coresight_trace_id_map {
  *		by @coresight_ops.
  * @access:	Device i/o access abstraction for this device.
  * @dev:	The device entity associated to this component.
- * @mode:	This tracer's mode, i.e sysFS, Perf or disabled. This is
- *		actually an 'enum cs_mode', but is stored in an atomic type.
- *		This is always accessed through local_read() and local_set(),
- *		but wherever it's done from within the Coresight device's lock,
- *		a non-atomic read would also work. This is the main point of
- *		synchronisation between code happening inside the sysfs mode's
- *		coresight_mutex and outside when running in Perf mode. A compare
- *		and exchange swap is done to atomically claim one mode or the
- *		other.
+ * @mode:	The device mode, i.e. sysFS, Perf or disabled. This is actually
+ *		an 'enum cs_mode' but stored in an atomic type. Access is always
+ *		through atomic APIs, ensuring SMP-safe synchronisation between
+ *		racing accesses from sysFS and Perf mode. A compare-and-exchange
+ *		operation is done to atomically claim one mode or the other.
  * @refcnt:	keep track of what is in use. Only access this outside of the
  *		device's spinlock when the coresight_mutex held and mode ==
  *		CS_MODE_SYSFS. Otherwise it must be accessed from inside the
@@ -288,7 +284,7 @@ struct coresight_device {
 	const struct coresight_ops *ops;
 	struct csdev_access access;
 	struct device dev;
-	local_t	mode;
+	atomic_t mode;
 	int refcnt;
 	bool orphan;
 	/* sink specific fields */
@@ -624,13 +620,14 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev)
 static inline bool coresight_take_mode(struct coresight_device *csdev,
 				       enum cs_mode new_mode)
 {
-	return local_cmpxchg(&csdev->mode, CS_MODE_DISABLED, new_mode) ==
-	       CS_MODE_DISABLED;
+	int curr = CS_MODE_DISABLED;
+
+	return atomic_try_cmpxchg_acquire(&csdev->mode, &curr, new_mode);
 }
 
 static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev)
 {
-	return local_read(&csdev->mode);
+	return atomic_read_acquire(&csdev->mode);
 }
 
 static inline void coresight_set_mode(struct coresight_device *csdev,
@@ -646,7 +643,7 @@ static inline void coresight_set_mode(struct coresight_device *csdev,
 	WARN(new_mode != CS_MODE_DISABLED && current_mode != CS_MODE_DISABLED &&
 	     current_mode != new_mode, "Device already in use\n");
 
-	local_set(&csdev->mode, new_mode);
+	atomic_set_release(&csdev->mode, new_mode);
 }
 
 struct coresight_device *coresight_register(struct coresight_desc *desc);
-- 
cgit v1.2.3


From 5422318e27d7a4662701f518e2e51b9f73a331b1 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 11 Nov 2025 14:24:48 +0200
Subject: net/mlx5: Expose definition for 1600Gbps link mode

This patch exposes new link mode for 1600Gbps, utilizing 8 lanes at
200Gbps per lane.

Co-developed-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1762863888-1092798-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/port.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 58770b86f793..1df9d9a57bbc 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -112,6 +112,7 @@ enum mlx5e_ext_link_mode {
 	MLX5E_400GAUI_2_400GBASE_CR2_KR2	= 17,
 	MLX5E_800GAUI_8_800GBASE_CR8_KR8	= 19,
 	MLX5E_800GAUI_4_800GBASE_CR4_KR4	= 20,
+	MLX5E_1600TAUI_8_1600TBASE_CR8_KR8	= 23,
 	MLX5E_EXT_LINK_MODES_NUMBER,
 };
 
-- 
cgit v1.2.3


From 4be9f3cc582a24b08f6580f65fa48a4d70332ab5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:43 -0500
Subject: filelock: rework the __break_lease API to use flags

Currently __break_lease takes both a type and an openmode. With the
addition of directory leases, that makes less sense. Declare a set of
LEASE_BREAK_* flags that can be used to control how lease breaks work
instead of requiring a type and an openmode.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-2-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/locks.c               | 29 +++++++++++++++++----------
 include/linux/filelock.h | 52 +++++++++++++++++++++++++++++++++++-------------
 2 files changed, 56 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/locks.c b/fs/locks.c
index b33c327c21dc..3cdd84a0fbed 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1529,24 +1529,31 @@ any_leases_conflict(struct inode *inode, struct file_lease *breaker)
 /**
  *	__break_lease	-	revoke all outstanding leases on file
  *	@inode: the inode of the file to return
- *	@mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
- *	    break all leases
- *	@type: FL_LEASE: break leases and delegations; FL_DELEG: break
- *	    only delegations
+ *	@flags: LEASE_BREAK_* flags
  *
  *	break_lease (inlined for speed) has checked there already is at least
  *	some kind of lock (maybe a lease) on this file.  Leases are broken on
- *	a call to open() or truncate().  This function can sleep unless you
- *	specified %O_NONBLOCK to your open().
+ *	a call to open() or truncate().  This function can block waiting for the
+ *	lease break unless you specify LEASE_BREAK_NONBLOCK.
  */
-int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+int __break_lease(struct inode *inode, unsigned int flags)
 {
-	int error = 0;
-	struct file_lock_context *ctx;
 	struct file_lease *new_fl, *fl, *tmp;
+	struct file_lock_context *ctx;
 	unsigned long break_time;
-	int want_write = (mode & O_ACCMODE) != O_RDONLY;
+	unsigned int type;
 	LIST_HEAD(dispose);
+	bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY);
+	int error = 0;
+
+	if (flags & LEASE_BREAK_LEASE)
+		type = FL_LEASE;
+	else if (flags & LEASE_BREAK_DELEG)
+		type = FL_DELEG;
+	else if (flags & LEASE_BREAK_LAYOUT)
+		type = FL_LAYOUT;
+	else
+		return -EINVAL;
 
 	new_fl = lease_alloc(NULL, type, want_write ? F_WRLCK : F_RDLCK);
 	if (IS_ERR(new_fl))
@@ -1595,7 +1602,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	if (list_empty(&ctx->flc_lease))
 		goto out;
 
-	if (mode & O_NONBLOCK) {
+	if (flags & LEASE_BREAK_NONBLOCK) {
 		trace_break_lease_noblock(inode, new_fl);
 		error = -EWOULDBLOCK;
 		goto out;
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index c2ce8ba05d06..47da6aa28d8d 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -212,7 +212,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 void locks_init_lease(struct file_lease *);
 void locks_free_lease(struct file_lease *fl);
 struct file_lease *locks_alloc_lease(void);
-int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
+
+#define LEASE_BREAK_LEASE		BIT(0)	// break leases and delegations
+#define LEASE_BREAK_DELEG		BIT(1)	// break delegations only
+#define LEASE_BREAK_LAYOUT		BIT(2)	// break layouts only
+#define LEASE_BREAK_NONBLOCK		BIT(3)	// non-blocking break
+#define LEASE_BREAK_OPEN_RDONLY		BIT(4)	// readonly open event
+
+int __break_lease(struct inode *inode, unsigned int flags);
 void lease_get_mtime(struct inode *, struct timespec64 *time);
 int generic_setlease(struct file *, int, struct file_lease **, void **priv);
 int kernel_setlease(struct file *, int, struct file_lease **, void **);
@@ -367,7 +374,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f
 	return -ENOLCK;
 }
 
-static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+static inline int __break_lease(struct inode *inode, unsigned int flags)
 {
 	return 0;
 }
@@ -428,6 +435,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 }
 
 #ifdef CONFIG_FILE_LOCKING
+static inline unsigned int openmode_to_lease_flags(unsigned int mode)
+{
+	unsigned int flags = 0;
+
+	if ((mode & O_ACCMODE) == O_RDONLY)
+		flags |= LEASE_BREAK_OPEN_RDONLY;
+	if (mode & O_NONBLOCK)
+		flags |= LEASE_BREAK_NONBLOCK;
+	return flags;
+}
+
 static inline int break_lease(struct inode *inode, unsigned int mode)
 {
 	struct file_lock_context *flctx;
@@ -443,11 +461,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 		return 0;
 	smp_mb();
 	if (!list_empty_careful(&flctx->flc_lease))
-		return __break_lease(inode, mode, FL_LEASE);
+		return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode));
 	return 0;
 }
 
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
 {
 	struct file_lock_context *flctx;
 
@@ -461,8 +479,10 @@ static inline int break_deleg(struct inode *inode, unsigned int mode)
 	if (!flctx)
 		return 0;
 	smp_mb();
-	if (!list_empty_careful(&flctx->flc_lease))
-		return __break_lease(inode, mode, FL_DELEG);
+	if (!list_empty_careful(&flctx->flc_lease)) {
+		flags |= LEASE_BREAK_DELEG;
+		return __break_lease(inode, flags);
+	}
 	return 0;
 }
 
@@ -470,7 +490,7 @@ static inline int try_break_deleg(struct inode *inode, struct inode **delegated_
 {
 	int ret;
 
-	ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);
+	ret = break_deleg(inode, LEASE_BREAK_NONBLOCK);
 	if (ret == -EWOULDBLOCK && delegated_inode) {
 		*delegated_inode = inode;
 		ihold(inode);
@@ -482,7 +502,7 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 {
 	int ret;
 
-	ret = break_deleg(*delegated_inode, O_WRONLY);
+	ret = break_deleg(*delegated_inode, 0);
 	iput(*delegated_inode);
 	*delegated_inode = NULL;
 	return ret;
@@ -491,20 +511,24 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 static inline int break_layout(struct inode *inode, bool wait)
 {
 	smp_mb();
-	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
-		return __break_lease(inode,
-				wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
-				FL_LAYOUT);
+	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) {
+		unsigned int flags = LEASE_BREAK_LAYOUT;
+
+		if (!wait)
+			flags |= LEASE_BREAK_NONBLOCK;
+
+		return __break_lease(inode, flags);
+	}
 	return 0;
 }
 
 #else /* !CONFIG_FILE_LOCKING */
-static inline int break_lease(struct inode *inode, unsigned int mode)
+static inline int break_lease(struct inode *inode, unsigned int mode)
 {
 	return 0;
 }
 
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 6976ed2dd0d59086d16d853ac9b21776be68aaad Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:44 -0500
Subject: filelock: add struct delegated_inode

The current API requires a pointer to an inode pointer. It's easy for
callers to get this wrong. Add a new delegated_inode structure and use
that to pass back any inode that needs to be waited on.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-3-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/attr.c                |  2 +-
 fs/namei.c               | 18 +++++++++---------
 fs/open.c                |  8 ++++----
 fs/posix_acl.c           |  8 ++++----
 fs/utimes.c              |  4 ++--
 fs/xattr.c               | 12 ++++++------
 include/linux/filelock.h | 36 +++++++++++++++++++++++++++---------
 include/linux/fs.h       |  9 +++++----
 include/linux/xattr.h    |  4 ++--
 9 files changed, 60 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index 795f231d00e8..b9ec6b47bab2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -415,7 +415,7 @@ EXPORT_SYMBOL(may_setattr);
  * performed on the raw inode simply pass @nop_mnt_idmap.
  */
 int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
-		  struct iattr *attr, struct inode **delegated_inode)
+		  struct iattr *attr, struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
diff --git a/fs/namei.c b/fs/namei.c
index 7377020a2cba..bf42f146f847 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4648,7 +4648,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * raw inode simply pass @nop_mnt_idmap.
  */
 int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
-	       struct dentry *dentry, struct inode **delegated_inode)
+	       struct dentry *dentry, struct delegated_inode *delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
 	int error = may_delete(idmap, dir, dentry, 0);
@@ -4706,7 +4706,7 @@ int do_unlinkat(int dfd, struct filename *name)
 	struct qstr last;
 	int type;
 	struct inode *inode = NULL;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	unsigned int lookup_flags = 0;
 retry:
 	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
@@ -4743,7 +4743,7 @@ exit3:
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 	inode = NULL;
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
@@ -4892,7 +4892,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  */
 int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 	     struct inode *dir, struct dentry *new_dentry,
-	     struct inode **delegated_inode)
+	     struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -4968,7 +4968,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
 	struct mnt_idmap *idmap;
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	int how = 0;
 	int error;
 
@@ -5012,7 +5012,7 @@ retry:
 			 new_dentry, &delegated_inode);
 out_dput:
 	end_creating_path(&new_path, new_dentry);
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error) {
 			path_put(&old_path);
@@ -5098,7 +5098,7 @@ int vfs_rename(struct renamedata *rd)
 	struct inode *new_dir = d_inode(rd->new_parent);
 	struct dentry *old_dentry = rd->old_dentry;
 	struct dentry *new_dentry = rd->new_dentry;
-	struct inode **delegated_inode = rd->delegated_inode;
+	struct delegated_inode *delegated_inode = rd->delegated_inode;
 	unsigned int flags = rd->flags;
 	bool is_dir = d_is_dir(old_dentry);
 	struct inode *source = old_dentry->d_inode;
@@ -5261,7 +5261,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	struct path old_path, new_path;
 	struct qstr old_last, new_last;
 	int old_type, new_type;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	unsigned int lookup_flags = 0, target_flags =
 		LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
 	bool should_retry = false;
@@ -5369,7 +5369,7 @@ exit4:
 exit3:
 	unlock_rename(new_path.dentry, old_path.dentry);
 exit_lock_rename:
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
diff --git a/fs/open.c b/fs/open.c
index 3d64372ecc67..fdaa6f08f6f4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -631,7 +631,7 @@ out:
 int chmod_common(const struct path *path, umode_t mode)
 {
 	struct inode *inode = path->dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	struct iattr newattrs;
 	int error;
 
@@ -651,7 +651,7 @@ retry_deleg:
 			      &newattrs, &delegated_inode);
 out_unlock:
 	inode_unlock(inode);
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
@@ -756,7 +756,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
 	struct mnt_idmap *idmap;
 	struct user_namespace *fs_userns;
 	struct inode *inode = path->dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	int error;
 	struct iattr newattrs;
 	kuid_t uid;
@@ -791,7 +791,7 @@ retry_deleg:
 		error = notify_change(idmap, path->dentry, &newattrs,
 				      &delegated_inode);
 	inode_unlock(inode);
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4050942ab52f..768f027c1428 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1091,7 +1091,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 	int acl_type;
 	int error;
 	struct inode *inode = d_inode(dentry);
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 
 	acl_type = posix_acl_type(acl_name);
 	if (acl_type < 0)
@@ -1141,7 +1141,7 @@ retry_deleg:
 out_inode_unlock:
 	inode_unlock(inode);
 
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
@@ -1212,7 +1212,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 	int acl_type;
 	int error;
 	struct inode *inode = d_inode(dentry);
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 
 	acl_type = posix_acl_type(acl_name);
 	if (acl_type < 0)
@@ -1249,7 +1249,7 @@ retry_deleg:
 out_inode_unlock:
 	inode_unlock(inode);
 
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
diff --git a/fs/utimes.c b/fs/utimes.c
index c7c7958e57b2..bf9f45bdef54 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -22,7 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times)
 	int error;
 	struct iattr newattrs;
 	struct inode *inode = path->dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 
 	if (times) {
 		if (!nsec_valid(times[0].tv_nsec) ||
@@ -66,7 +66,7 @@ retry_deleg:
 	error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs,
 			      &delegated_inode);
 	inode_unlock(inode);
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
diff --git a/fs/xattr.c b/fs/xattr.c
index 8851a5ef34f5..32d445fb60aa 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -274,7 +274,7 @@ int __vfs_setxattr_noperm(struct mnt_idmap *idmap,
 int
 __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry,
 		      const char *name, const void *value, size_t size,
-		      int flags, struct inode **delegated_inode)
+		      int flags, struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
@@ -305,7 +305,7 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	     const char *name, const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	const void  *orig_value = value;
 	int error;
 
@@ -322,7 +322,7 @@ retry_deleg:
 				      flags, &delegated_inode);
 	inode_unlock(inode);
 
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
@@ -533,7 +533,7 @@ EXPORT_SYMBOL(__vfs_removexattr);
 int
 __vfs_removexattr_locked(struct mnt_idmap *idmap,
 			 struct dentry *dentry, const char *name,
-			 struct inode **delegated_inode)
+			 struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
@@ -567,7 +567,7 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		const char *name)
 {
 	struct inode *inode = dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct delegated_inode delegated_inode = { };
 	int error;
 
 retry_deleg:
@@ -576,7 +576,7 @@ retry_deleg:
 					 name, &delegated_inode);
 	inode_unlock(inode);
 
-	if (delegated_inode) {
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 47da6aa28d8d..208d108df2d7 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -486,25 +486,35 @@ static inline int break_deleg(struct inode *inode, unsigned int flags)
 	return 0;
 }
 
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+struct delegated_inode {
+	struct inode *di_inode;
+};
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+	return di->di_inode;
+}
+
+static inline int try_break_deleg(struct inode *inode,
+				  struct delegated_inode *di)
 {
 	int ret;
 
 	ret = break_deleg(inode, LEASE_BREAK_NONBLOCK);
-	if (ret == -EWOULDBLOCK && delegated_inode) {
-		*delegated_inode = inode;
+	if (ret == -EWOULDBLOCK && di) {
+		di->di_inode = inode;
 		ihold(inode);
 	}
 	return ret;
 }
 
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *di)
 {
 	int ret;
 
-	ret = break_deleg(*delegated_inode, 0);
-	iput(*delegated_inode);
-	*delegated_inode = NULL;
+	ret = break_deleg(di->di_inode, 0);
+	iput(di->di_inode);
+	di->di_inode = NULL;
 	return ret;
 }
 
@@ -523,6 +533,13 @@ static inline int break_layout(struct inode *inode, bool wait)
 }
 
 #else /* !CONFIG_FILE_LOCKING */
+struct delegated_inode { };
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+	return false;
+}
+
 static inline int break_lease(struct inode *inode, bool wait)
 {
 	return 0;
@@ -533,12 +550,13 @@ static inline int break_deleg(struct inode *inode, unsigned int flags)
 	return 0;
 }
 
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+static inline int try_break_deleg(struct inode *inode,
+				  struct delegated_inode *delegated_inode)
 {
 	return 0;
 }
 
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *delegated_inode)
 {
 	BUG();
 	return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..909a88e3979d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -80,6 +80,7 @@ struct fs_context;
 struct fs_parameter_spec;
 struct file_kattr;
 struct iomap_ops;
+struct delegated_inode;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2119,10 +2120,10 @@ int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
 int vfs_symlink(struct mnt_idmap *, struct inode *,
 		struct dentry *, const char *);
 int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
-	     struct dentry *, struct inode **);
+	     struct dentry *, struct delegated_inode *);
 int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
 int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
-	       struct inode **);
+	       struct delegated_inode *);
 
 /**
  * struct renamedata - contains all information required for renaming
@@ -2140,7 +2141,7 @@ struct renamedata {
 	struct dentry *old_dentry;
 	struct dentry *new_parent;
 	struct dentry *new_dentry;
-	struct inode **delegated_inode;
+	struct delegated_inode *delegated_inode;
 	unsigned int flags;
 } __randomize_layout;
 
@@ -3071,7 +3072,7 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 #endif
 
 int notify_change(struct mnt_idmap *, struct dentry *,
-		  struct iattr *, struct inode **);
+		  struct iattr *, struct delegated_inode *);
 int inode_permission(struct mnt_idmap *, struct inode *, int);
 int generic_permission(struct mnt_idmap *, struct inode *, int);
 static inline int file_permission(struct file *file, int mask)
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 86b0d47984a1..64e9afe7d647 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *,
 			  const char *, const void *, size_t, int);
 int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *,
 			  const char *, const void *, size_t, int,
-			  struct inode **);
+			  struct delegated_inode *);
 int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *,
 		 const void *, size_t, int);
 int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
 int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *,
-			     const char *, struct inode **);
+			     const char *, struct delegated_inode *);
 int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
 
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
-- 
cgit v1.2.3


From e12d203b8c880061c0bf0339cad51e5851a33442 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:47 -0500
Subject: vfs: allow mkdir to wait for delegation break on parent

In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a new delegated_inode parameter to vfs_mkdir. All of the existing
callers set that to NULL for now, except for do_mkdirat which will
properly block until the lease is gone.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-6-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/base/devtmpfs.c  |  2 +-
 fs/cachefiles/namei.c    |  2 +-
 fs/ecryptfs/inode.c      |  2 +-
 fs/init.c                |  2 +-
 fs/namei.c               | 24 ++++++++++++++++++------
 fs/nfsd/nfs4recover.c    |  2 +-
 fs/nfsd/vfs.c            |  2 +-
 fs/overlayfs/overlayfs.h |  2 +-
 fs/smb/server/vfs.c      |  2 +-
 fs/xfs/scrub/orphanage.c |  2 +-
 include/linux/fs.h       |  2 +-
 11 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 9d4e46ad8352..0e79621cb0f7 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -180,7 +180,7 @@ static int dev_mkdir(const char *name, umode_t mode)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode);
+	dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL);
 	if (!IS_ERR(dentry))
 		/* mark as kernel-created inode */
 		d_inode(dentry)->i_private = &thread;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d1edb2ac3837..50c0f9c76d1f 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -130,7 +130,7 @@ retry:
 			goto mkdir_error;
 		ret = cachefiles_inject_write_error();
 		if (ret == 0)
-			subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
+			subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL);
 		else
 			subdir = ERR_PTR(ret);
 		if (IS_ERR(subdir)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ed1394da8d6b..35830b3144f8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -508,7 +508,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		goto out;
 
 	lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
-				 lower_dentry, mode);
+				 lower_dentry, mode, NULL);
 	rc = PTR_ERR(lower_dentry);
 	if (IS_ERR(lower_dentry))
 		goto out;
diff --git a/fs/init.c b/fs/init.c
index 07f592ccdba8..895f8a09a71a 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error) {
 		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
-				  dentry, mode);
+				  dentry, mode, NULL);
 		if (IS_ERR(dentry))
 			error = PTR_ERR(dentry);
 	}
diff --git a/fs/namei.c b/fs/namei.c
index 5bcf3e93d350..76c0587d991f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4407,10 +4407,11 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 
 /**
  * vfs_mkdir - create directory returning correct dentry if possible
- * @idmap:	idmap of the mount the inode was found from
- * @dir:	inode of the parent directory
- * @dentry:	dentry of the child directory
- * @mode:	mode of the child directory
+ * @idmap:		idmap of the mount the inode was found from
+ * @dir:		inode of the parent directory
+ * @dentry:		dentry of the child directory
+ * @mode:		mode of the child directory
+ * @delegated_inode:	returns parent inode, if the inode is delegated.
  *
  * Create a directory.
  *
@@ -4427,7 +4428,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
  * In case of an error the dentry is dput() and an ERR_PTR() is returned.
  */
 struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			 struct dentry *dentry, umode_t mode)
+			 struct dentry *dentry, umode_t mode,
+			 struct delegated_inode *delegated_inode)
 {
 	int error;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -4450,6 +4452,10 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (max_links && dir->i_nlink >= max_links)
 		goto err;
 
+	error = try_break_deleg(dir, delegated_inode);
+	if (error)
+		goto err;
+
 	de = dir->i_op->mkdir(idmap, dir, dentry, mode);
 	error = PTR_ERR(de);
 	if (IS_ERR(de))
@@ -4473,6 +4479,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
 	struct path path;
 	int error;
 	unsigned int lookup_flags = LOOKUP_DIRECTORY;
+	struct delegated_inode delegated_inode = { };
 
 retry:
 	dentry = filename_create(dfd, name, &path, lookup_flags);
@@ -4484,11 +4491,16 @@ retry:
 			mode_strip_umask(path.dentry->d_inode, mode));
 	if (!error) {
 		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
-				  dentry, mode);
+				   dentry, mode, &delegated_inode);
 		if (IS_ERR(dentry))
 			error = PTR_ERR(dentry);
 	}
 	end_creating_path(&path, dentry);
+	if (is_delegated(&delegated_inode)) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e2b9472e5c78..1f56834b2072 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * as well be forgiving and just succeed silently.
 		 */
 		goto out_put;
-	dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+	dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL);
 	if (IS_ERR(dentry))
 		status = PTR_ERR(dentry);
 out_put:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9cb20d4aeab1..97aef140cbf5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1558,7 +1558,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
-		dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
+		dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL);
 		if (IS_ERR(dchild)) {
 			host_err = PTR_ERR(dchild);
 		} else if (d_is_negative(dchild)) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index c8fd5951fc5e..0f65f9a5d54d 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -248,7 +248,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
 {
 	struct dentry *ret;
 
-	ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+	ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, NULL);
 	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret));
 	return ret;
 }
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 891ed2dc2b73..3d2190f26623 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -230,7 +230,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
 	idmap = mnt_idmap(path.mnt);
 	mode |= S_IFDIR;
 	d = dentry;
-	dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
+	dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode, NULL);
 	if (IS_ERR(dentry))
 		err = PTR_ERR(dentry);
 	else if (d_is_negative(dentry))
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 9c12cb844231..91c9d07b97f3 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -167,7 +167,7 @@ xrep_orphanage_create(
 	 */
 	if (d_really_is_negative(orphanage_dentry)) {
 		orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
-					     orphanage_dentry, 0750);
+					     orphanage_dentry, 0750, NULL);
 		error = PTR_ERR(orphanage_dentry);
 		if (IS_ERR(orphanage_dentry))
 			goto out_unlock_root;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 909a88e3979d..20bb4c8a4e8e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2114,7 +2114,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap,
 int vfs_create(struct mnt_idmap *, struct inode *,
 	       struct dentry *, umode_t, bool);
 struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
-			 struct dentry *, umode_t);
+			 struct dentry *, umode_t, struct delegated_inode *);
 int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
               umode_t, dev_t);
 int vfs_symlink(struct mnt_idmap *, struct inode *,
-- 
cgit v1.2.3


From 4fa76319cd0cc97ca54ff71c94814dc5b1983ad2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:48 -0500
Subject: vfs: allow rmdir to wait for delegation break on parent

In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

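Add a struct delegated_inode pointer argument to vfs_rmdir(). If it's
non-NULL and the parent's delegation can't be broken without blocking,
it is populated with the parent inode so that the caller can wait for
the break. Most existing in-kernel callers pass in a NULL pointer.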

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-7-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/base/devtmpfs.c  |  2 +-
 fs/ecryptfs/inode.c      |  2 +-
 fs/namei.c               | 22 +++++++++++++++++-----
 fs/nfsd/nfs4recover.c    |  4 ++--
 fs/nfsd/vfs.c            |  2 +-
 fs/overlayfs/overlayfs.h |  2 +-
 fs/smb/server/vfs.c      |  4 ++--
 include/linux/fs.h       |  3 ++-
 8 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 0e79621cb0f7..104025104ef7 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -261,7 +261,7 @@ static int dev_rmdir(const char *name)
 		return PTR_ERR(dentry);
 	if (d_inode(dentry)->i_private == &thread)
 		err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry),
-				dentry);
+				dentry, NULL);
 	else
 		err = -EPERM;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 35830b3144f8..88631291b325 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -540,7 +540,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 		if (d_unhashed(lower_dentry))
 			rc = -EINVAL;
 		else
-			rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
+			rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
 	}
 	if (!rc) {
 		clear_nlink(d_inode(dentry));
diff --git a/fs/namei.c b/fs/namei.c
index 76c0587d991f..9e0393a92091 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4522,9 +4522,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 
 /**
  * vfs_rmdir - remove directory
- * @idmap:	idmap of the mount the inode was found from
- * @dir:	inode of the parent directory
- * @dentry:	dentry of the child directory
+ * @idmap:		idmap of the mount the inode was found from
+ * @dir:		inode of the parent directory
+ * @dentry:		dentry of the child directory
+ * @delegated_inode:	returns parent inode, if it's delegated.
  *
  * Remove a directory.
  *
@@ -4535,7 +4536,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
  * raw inode simply pass @nop_mnt_idmap.
  */
 int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
-		     struct dentry *dentry)
+	      struct dentry *dentry, struct delegated_inode *delegated_inode)
 {
 	int error = may_delete(idmap, dir, dentry, 1);
 
@@ -4557,6 +4558,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		goto out;
 
+	error = try_break_deleg(dir, delegated_inode);
+	if (error)
+		goto out;
+
 	error = dir->i_op->rmdir(dir, dentry);
 	if (error)
 		goto out;
@@ -4583,6 +4588,7 @@ int do_rmdir(int dfd, struct filename *name)
 	struct qstr last;
 	int type;
 	unsigned int lookup_flags = 0;
+	struct delegated_inode delegated_inode = { };
 retry:
 	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
 	if (error)
@@ -4612,7 +4618,8 @@ retry:
 	error = security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit4;
-	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
+	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+			  dentry, &delegated_inode);
 exit4:
 	dput(dentry);
 exit3:
@@ -4620,6 +4627,11 @@ exit3:
 	mnt_drop_write(path.mnt);
 exit2:
 	path_put(&path);
+	if (is_delegated(&delegated_inode)) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1f56834b2072..30bae93931d9 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -337,7 +337,7 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn)
 	status = -ENOENT;
 	if (d_really_is_negative(dentry))
 		goto out;
-	status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry);
+	status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL);
 out:
 	dput(dentry);
 out_unlock:
@@ -427,7 +427,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 	if (nfs4_has_reclaimed_state(name, nn))
 		goto out_free;
 
-	status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child);
+	status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 97aef140cbf5..c400ea94ff2e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2108,7 +2108,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 				break;
 		}
 	} else {
-		host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry);
+		host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL);
 	}
 	fh_fill_post_attrs(fhp);
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0f65f9a5d54d..d215d7349489 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -206,7 +206,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
 static inline int ovl_do_rmdir(struct ovl_fs *ofs,
 			       struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry);
+	int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL);
 
 	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
 	return err;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 3d2190f26623..c5f0f3170d58 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -609,7 +609,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
 
 	idmap = mnt_idmap(path->mnt);
 	if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
-		err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
+		err = vfs_rmdir(idmap, d_inode(parent), path->dentry, NULL);
 		if (err && err != -ENOTEMPTY)
 			ksmbd_debug(VFS, "rmdir failed, err %d\n", err);
 	} else {
@@ -1090,7 +1090,7 @@ int ksmbd_vfs_unlink(struct file *filp)
 	dget(dentry);
 
 	if (S_ISDIR(d_inode(dentry)->i_mode))
-		err = vfs_rmdir(idmap, d_inode(dir), dentry);
+		err = vfs_rmdir(idmap, d_inode(dir), dentry, NULL);
 	else
 		err = vfs_unlink(idmap, d_inode(dir), dentry, NULL);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 20bb4c8a4e8e..12873214e1c7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2121,7 +2121,8 @@ int vfs_symlink(struct mnt_idmap *, struct inode *,
 		struct dentry *, const char *);
 int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
 	     struct dentry *, struct delegated_inode *);
-int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
+int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *,
+	      struct delegated_inode *);
 int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
 	       struct delegated_inode *);
 
-- 
cgit v1.2.3


From 85bbffcad7307e2ca6136be657cc21b0e1c42241 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:50 -0500
Subject: vfs: clean up argument list for vfs_create()

As Neil points out:

"I would be in favour of dropping the "dir" arg because it is always
d_inode(dentry->d_parent) which is stable."

...and...

"Also *every* caller of vfs_create() passes ".excl = true".  So maybe we
don't need that arg at all."

Drop both arguments from vfs_create() and fix up the callers.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-9-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ecryptfs/inode.c      |  3 +--
 fs/namei.c               | 11 ++++-------
 fs/nfsd/nfs3proc.c       |  2 +-
 fs/nfsd/vfs.c            |  3 +--
 fs/open.c                |  4 +---
 fs/overlayfs/overlayfs.h |  2 +-
 fs/smb/server/vfs.c      |  3 +--
 include/linux/fs.h       |  3 +--
 8 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 88631291b325..d109e3763a88 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -188,8 +188,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_create(&nop_mnt_idmap, lower_dir,
-				lower_dentry, mode, true);
+		rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
 		       "rc = [%d]\n", __func__, rc);
diff --git a/fs/namei.c b/fs/namei.c
index f439429bdfa2..9586c6aba6ea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3461,10 +3461,8 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
 /**
  * vfs_create - create new file
  * @idmap:	idmap of the mount the inode was found from
- * @dir:	inode of the parent directory
  * @dentry:	dentry of the child file
  * @mode:	mode of the child file
- * @want_excl:	whether the file must not yet exist
  *
  * Create a new file.
  *
@@ -3474,9 +3472,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
  * On non-idmapped mounts or if permission checking is to be performed on the
  * raw inode simply pass @nop_mnt_idmap.
  */
-int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
-	       struct dentry *dentry, umode_t mode, bool want_excl)
+int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode)
 {
+	struct inode *dir = d_inode(dentry->d_parent);
 	int error;
 
 	error = may_create(idmap, dir, dentry);
@@ -3490,7 +3488,7 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
+	error = dir->i_op->create(idmap, dir, dentry, mode, true);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -4383,8 +4381,7 @@ retry:
 	idmap = mnt_idmap(path.mnt);
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(idmap, path.dentry->d_inode,
-					   dentry, mode, true);
+			error = vfs_create(idmap, dentry, mode);
 			if (!error)
 				security_path_post_mknod(idmap, dentry);
 			break;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b6d03e1ef5f7..30ea7ffa2aff 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -344,7 +344,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	status = fh_fill_pre_attrs(fhp);
 	if (status != nfs_ok)
 		goto out;
-	host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true);
+	host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode);
 	if (host_err < 0) {
 		status = nfserrno(host_err);
 		goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c400ea94ff2e..464fd54675f3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1552,8 +1552,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = vfs_create(&nop_mnt_idmap, dirp, dchild,
-				      iap->ia_mode, true);
+		host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode);
 		if (!host_err)
 			nfsd_check_ignore_resizing(iap);
 		break;
diff --git a/fs/open.c b/fs/open.c
index fdaa6f08f6f4..e440f58e3ce8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1171,9 +1171,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 	if (IS_ERR(f))
 		return f;
 
-	error = vfs_create(mnt_idmap(path->mnt),
-			   d_inode(path->dentry->d_parent),
-			   path->dentry, mode, true);
+	error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode);
 	if (!error)
 		error = vfs_open(path, f);
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index d215d7349489..2bdc434941eb 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
 				struct inode *dir, struct dentry *dentry,
 				umode_t mode)
 {
-	int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true);
+	int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode);
 
 	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index c5f0f3170d58..83ece2de4b23 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -188,8 +188,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
 	}
 
 	mode |= S_IFREG;
-	err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
-			 dentry, mode, true);
+	err = vfs_create(mnt_idmap(path.mnt), dentry, mode);
 	if (!err) {
 		ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
 					d_inode(dentry));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 12873214e1c7..21876ef1fec9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2111,8 +2111,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap,
 /*
  * VFS helper functions..
  */
-int vfs_create(struct mnt_idmap *, struct inode *,
-	       struct dentry *, umode_t, bool);
+int vfs_create(struct mnt_idmap *, struct dentry *, umode_t);
 struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
 			 struct dentry *, umode_t, struct delegated_inode *);
 int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
-- 
cgit v1.2.3


From c826229c6a82fe1fe7b7752692f87a881eb4b545 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:51 -0500
Subject: vfs: make vfs_create break delegations on parent directory

In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a delegated_inode parameter to vfs_create. Most callers are
converted to pass in NULL, but do_mknodat() is changed to wait for a
delegation break if there is one.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-10-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ecryptfs/inode.c      |  2 +-
 fs/namei.c               | 15 +++++++++++++--
 fs/nfsd/nfs3proc.c       |  2 +-
 fs/nfsd/vfs.c            |  2 +-
 fs/open.c                |  2 +-
 fs/overlayfs/overlayfs.h |  2 +-
 fs/smb/server/vfs.c      |  2 +-
 include/linux/fs.h       |  3 ++-
 8 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d109e3763a88..3341f00dd087 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -188,7 +188,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode);
+		rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
 		       "rc = [%d]\n", __func__, rc);
diff --git a/fs/namei.c b/fs/namei.c
index 9586c6aba6ea..b20f053374a5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3463,6 +3463,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
  * @idmap:	idmap of the mount the inode was found from
  * @dentry:	dentry of the child file
  * @mode:	mode of the child file
+ * @di:		returns parent inode, if the inode is delegated.
  *
  * Create a new file.
  *
@@ -3472,7 +3473,8 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
  * On non-idmapped mounts or if permission checking is to be performed on the
  * raw inode simply pass @nop_mnt_idmap.
  */
-int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode)
+int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
+	       struct delegated_inode *di)
 {
 	struct inode *dir = d_inode(dentry->d_parent);
 	int error;
@@ -3486,6 +3488,9 @@ int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode)
 
 	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
 	error = security_inode_create(dir, dentry, mode);
+	if (error)
+		return error;
+	error = try_break_deleg(dir, di);
 	if (error)
 		return error;
 	error = dir->i_op->create(idmap, dir, dentry, mode, true);
@@ -4358,6 +4363,7 @@ static int may_mknod(umode_t mode)
 static int do_mknodat(int dfd, struct filename *name, umode_t mode,
 		unsigned int dev)
 {
+	struct delegated_inode di = { };
 	struct mnt_idmap *idmap;
 	struct dentry *dentry;
 	struct path path;
@@ -4381,7 +4387,7 @@ retry:
 	idmap = mnt_idmap(path.mnt);
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(idmap, dentry, mode);
+			error = vfs_create(idmap, dentry, mode, &di);
 			if (!error)
 				security_path_post_mknod(idmap, dentry);
 			break;
@@ -4396,6 +4402,11 @@ retry:
 	}
 out2:
 	end_creating_path(&path, dentry);
+	if (is_delegated(&di)) {
+		error = break_deleg_wait(&di);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 30ea7ffa2aff..2cb972b5ed99 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -344,7 +344,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	status = fh_fill_pre_attrs(fhp);
 	if (status != nfs_ok)
 		goto out;
-	host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode);
+	host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL);
 	if (host_err < 0) {
 		status = nfserrno(host_err);
 		goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 464fd54675f3..de5f46f8c6d3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1552,7 +1552,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode);
+		host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL);
 		if (!host_err)
 			nfsd_check_ignore_resizing(iap);
 		break;
diff --git a/fs/open.c b/fs/open.c
index e440f58e3ce8..92cf2e11781b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1171,7 +1171,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 	if (IS_ERR(f))
 		return f;
 
-	error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode);
+	error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL);
 	if (!error)
 		error = vfs_open(path, f);
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 2bdc434941eb..e30441cc9c63 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
 				struct inode *dir, struct dentry *dentry,
 				umode_t mode)
 {
-	int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode);
+	int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode, NULL);
 
 	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 83ece2de4b23..3747851b61c8 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -188,7 +188,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
 	}
 
 	mode |= S_IFREG;
-	err = vfs_create(mnt_idmap(path.mnt), dentry, mode);
+	err = vfs_create(mnt_idmap(path.mnt), dentry, mode, NULL);
 	if (!err) {
 		ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
 					d_inode(dentry));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21876ef1fec9..83b05aec4e10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2111,7 +2111,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap,
 /*
  * VFS helper functions..
  */
-int vfs_create(struct mnt_idmap *, struct dentry *, umode_t);
+int vfs_create(struct mnt_idmap *, struct dentry *, umode_t,
+	       struct delegated_inode *);
 struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
 			 struct dentry *, umode_t, struct delegated_inode *);
 int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
-- 
cgit v1.2.3


From e8960c1b2ee9ba75d65492b8e90e851d11e5f215 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:52 -0500
Subject: vfs: make vfs_mknod break delegations on parent directory

In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a new delegated_inode pointer to vfs_mknod() and have the
appropriate callers wait when there is an outstanding delegation. All
other callers just set the pointer to NULL.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-11-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/base/devtmpfs.c  |  2 +-
 fs/ecryptfs/inode.c      |  2 +-
 fs/init.c                |  2 +-
 fs/namei.c               | 23 +++++++++++++++--------
 fs/nfsd/vfs.c            |  2 +-
 fs/overlayfs/overlayfs.h |  2 +-
 include/linux/fs.h       |  4 ++--
 net/unix/af_unix.c       |  2 +-
 8 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 104025104ef7..2f576ecf1832 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -231,7 +231,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 		return PTR_ERR(dentry);
 
 	err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode,
-			dev->devt);
+			dev->devt, NULL);
 	if (!err) {
 		struct iattr newattrs;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3341f00dd087..83f06452476d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -564,7 +564,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
 	if (!rc)
 		rc = vfs_mknod(&nop_mnt_idmap, lower_dir,
-			       lower_dentry, mode, dev);
+			       lower_dentry, mode, dev, NULL);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
diff --git a/fs/init.c b/fs/init.c
index 895f8a09a71a..4f02260dd65b 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
 		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
-				  dentry, mode, new_decode_dev(dev));
+				  dentry, mode, new_decode_dev(dev), NULL);
 	end_creating_path(&path, dentry);
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index b20f053374a5..e9616134390f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4295,13 +4295,15 @@ inline struct dentry *start_creating_user_path(
 }
 EXPORT_SYMBOL(start_creating_user_path);
 
+
 /**
  * vfs_mknod - create device node or file
- * @idmap:	idmap of the mount the inode was found from
- * @dir:	inode of the parent directory
- * @dentry:	dentry of the child device node
- * @mode:	mode of the child device node
- * @dev:	device number of device to create
+ * @idmap:		idmap of the mount the inode was found from
+ * @dir:		inode of the parent directory
+ * @dentry:		dentry of the child device node
+ * @mode:		mode of the child device node
+ * @dev:		device number of device to create
+ * @delegated_inode:	returns parent inode, if the inode is delegated.
  *
  * Create a device node or file.
  *
@@ -4312,7 +4314,8 @@ EXPORT_SYMBOL(start_creating_user_path);
  * raw inode simply pass @nop_mnt_idmap.
  */
 int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
-	      struct dentry *dentry, umode_t mode, dev_t dev)
+	      struct dentry *dentry, umode_t mode, dev_t dev,
+	      struct delegated_inode *delegated_inode)
 {
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
 	int error = may_create(idmap, dir, dentry);
@@ -4336,6 +4339,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
+	error = try_break_deleg(dir, delegated_inode);
+	if (error)
+		return error;
+
 	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -4393,11 +4400,11 @@ retry:
 			break;
 		case S_IFCHR: case S_IFBLK:
 			error = vfs_mknod(idmap, path.dentry->d_inode,
-					  dentry, mode, new_decode_dev(dev));
+					  dentry, mode, new_decode_dev(dev), &di);
 			break;
 		case S_IFIFO: case S_IFSOCK:
 			error = vfs_mknod(idmap, path.dentry->d_inode,
-					  dentry, mode, 0);
+					  dentry, mode, 0, &di);
 			break;
 	}
 out2:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index de5f46f8c6d3..6684935007b1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1573,7 +1573,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	case S_IFIFO:
 	case S_IFSOCK:
 		host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild,
-				     iap->ia_mode, rdev);
+				     iap->ia_mode, rdev, NULL);
 		break;
 	default:
 		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e30441cc9c63..afd95721f76e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -257,7 +257,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs,
 			       struct inode *dir, struct dentry *dentry,
 			       umode_t mode, dev_t dev)
 {
-	int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev);
+	int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev, NULL);
 
 	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
 	return err;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83b05aec4e10..1a5d86cfafaa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2116,7 +2116,7 @@ int vfs_create(struct mnt_idmap *, struct dentry *, umode_t,
 struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
 			 struct dentry *, umode_t, struct delegated_inode *);
 int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
-              umode_t, dev_t);
+	      umode_t, dev_t, struct delegated_inode *);
 int vfs_symlink(struct mnt_idmap *, struct inode *,
 		struct dentry *, const char *);
 int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
@@ -2152,7 +2152,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap,
 			       struct inode *dir, struct dentry *dentry)
 {
 	return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
-			 WHITEOUT_DEV);
+			 WHITEOUT_DEV, NULL);
 }
 
 struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 768098dec231..db1fd8d6a84c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1399,7 +1399,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	idmap = mnt_idmap(parent.mnt);
 	err = security_path_mknod(&parent, dentry, mode, 0);
 	if (!err)
-		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
+		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL);
 	if (err)
 		goto out_path;
 	err = mutex_lock_interruptible(&u->bindlock);
-- 
cgit v1.2.3


From 92bf53577f01aad988f7f39f69163b41f94cfb7d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:53 -0500
Subject: vfs: make vfs_symlink break delegations on parent dir

In order to add directory delegation support, we must break delegations
on the parent on any change to the directory.

Add a delegated_inode parameter to vfs_symlink() and have it break the
delegation. do_symlinkat() can then wait on the delegation break before
proceeding.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-12-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ecryptfs/inode.c      |  2 +-
 fs/init.c                |  2 +-
 fs/namei.c               | 16 ++++++++++++++--
 fs/nfsd/vfs.c            |  2 +-
 fs/overlayfs/overlayfs.h |  2 +-
 include/linux/fs.h       |  2 +-
 6 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 83f06452476d..ba15e7359dfa 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -479,7 +479,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
 	if (rc)
 		goto out_lock;
 	rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry,
-			 encoded_symname);
+			 encoded_symname, NULL);
 	kfree(encoded_symname);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out_lock;
diff --git a/fs/init.c b/fs/init.c
index 4f02260dd65b..e0f5429c0a49 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname)
 	error = security_path_symlink(&path, dentry, oldname);
 	if (!error)
 		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
-				    dentry, oldname);
+				    dentry, oldname, NULL);
 	end_creating_path(&path, dentry);
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index e9616134390f..d5ab28947b2b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4845,6 +4845,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
  * @dir:	inode of the parent directory
  * @dentry:	dentry of the child symlink file
  * @oldname:	name of the file to link to
+ * @delegated_inode: returns parent inode, if the inode is delegated.
  *
  * Create a symlink.
  *
@@ -4855,7 +4856,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
  * raw inode simply pass @nop_mnt_idmap.
  */
 int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
-		struct dentry *dentry, const char *oldname)
+		struct dentry *dentry, const char *oldname,
+		struct delegated_inode *delegated_inode)
 {
 	int error;
 
@@ -4870,6 +4872,10 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
+	error = try_break_deleg(dir, delegated_inode);
+	if (error)
+		return error;
+
 	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -4883,6 +4889,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
 	struct dentry *dentry;
 	struct path path;
 	unsigned int lookup_flags = 0;
+	struct delegated_inode delegated_inode = { };
 
 	if (IS_ERR(from)) {
 		error = PTR_ERR(from);
@@ -4897,8 +4904,13 @@ retry:
 	error = security_path_symlink(&path, dentry, from->name);
 	if (!error)
 		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
-				    dentry, from->name);
+				    dentry, from->name, &delegated_inode);
 	end_creating_path(&path, dentry);
+	if (is_delegated(&delegated_inode)) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6684935007b1..28710da4cce7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1742,7 +1742,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = fh_fill_pre_attrs(fhp);
 	if (err != nfs_ok)
 		goto out_unlock;
-	host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path);
+	host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL);
 	err = nfserrno(host_err);
 	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
 	if (!err)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index afd95721f76e..5065961bd370 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -267,7 +267,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
 				 struct inode *dir, struct dentry *dentry,
 				 const char *oldname)
 {
-	int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname);
+	int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname, NULL);
 
 	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
 	return err;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1a5d86cfafaa..64323e618724 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2118,7 +2118,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
 int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
 	      umode_t, dev_t, struct delegated_inode *);
 int vfs_symlink(struct mnt_idmap *, struct inode *,
-		struct dentry *, const char *);
+		struct dentry *, const char *, struct delegated_inode *);
 int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
 	     struct dentry *, struct delegated_inode *);
 int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *,
-- 
cgit v1.2.3


From 1602bad16d7df82faca6d7c70821117684a66f49 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 11 Nov 2025 09:12:58 -0500
Subject: vfs: expose delegation support to userland

Now that support for recallable directory delegations is available,
expose this functionality to userland with new F_SETDELEG and F_GETDELEG
commands for fcntl().

Note that this also allows userland to request an FL_DELEG type lease on
files. Userland applications that do so will get signalled when there
are metadata changes, not just data changes (signalling only on data
changes is a limitation of FL_LEASE leases).

These commands accept a new "struct delegation" argument that contains a
flags field for future expansion.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fcntl.c                 | 13 +++++++++++++
 fs/locks.c                 | 45 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/filelock.h   | 12 ++++++++++++
 include/uapi/linux/fcntl.h | 11 +++++++++++
 4 files changed, 76 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 72f8433d9109..f93dbca08435 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -445,6 +445,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		struct file *filp)
 {
 	void __user *argp = (void __user *)arg;
+	struct delegation deleg;
 	int argi = (int)arg;
 	struct flock flock;
 	long err = -EINVAL;
@@ -550,6 +551,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SET_RW_HINT:
 		err = fcntl_set_rw_hint(filp, arg);
 		break;
+	case F_GETDELEG:
+		if (copy_from_user(&deleg, argp, sizeof(deleg)))
+			return -EFAULT;
+		err = fcntl_getdeleg(filp, &deleg);
+		if (!err && copy_to_user(argp, &deleg, sizeof(deleg)))
+			return -EFAULT;
+		break;
+	case F_SETDELEG:
+		if (copy_from_user(&deleg, argp, sizeof(deleg)))
+			return -EFAULT;
+		err = fcntl_setdeleg(fd, filp, &deleg);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/locks.c b/fs/locks.c
index dd290a87f58e..7f4ccc7974bc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1703,7 +1703,7 @@ EXPORT_SYMBOL(lease_get_mtime);
  *	XXX: sfr & willy disagree over whether F_INPROGRESS
  *	should be returned to userspace.
  */
-int fcntl_getlease(struct file *filp)
+static int __fcntl_getlease(struct file *filp, unsigned int flavor)
 {
 	struct file_lease *fl;
 	struct inode *inode = file_inode(filp);
@@ -1719,7 +1719,8 @@ int fcntl_getlease(struct file *filp)
 		list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
 			if (fl->c.flc_file != filp)
 				continue;
-			type = target_leasetype(fl);
+			if (fl->c.flc_flags & flavor)
+				type = target_leasetype(fl);
 			break;
 		}
 		spin_unlock(&ctx->flc_lock);
@@ -1730,6 +1731,19 @@ int fcntl_getlease(struct file *filp)
 	return type;
 }
 
+int fcntl_getlease(struct file *filp)
+{
+	return __fcntl_getlease(filp, FL_LEASE);
+}
+
+int fcntl_getdeleg(struct file *filp, struct delegation *deleg)
+{
+	if (deleg->d_flags != 0 || deleg->__pad != 0)
+		return -EINVAL;
+	deleg->d_type = __fcntl_getlease(filp, FL_DELEG);
+	return 0;
+}
+
 /**
  * check_conflicting_open - see if the given file points to an inode that has
  *			    an existing open that would conflict with the
@@ -2039,13 +2053,13 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg)
 {
 	struct file_lease *fl;
 	struct fasync_struct *new;
 	int error;
 
-	fl = lease_alloc(filp, FL_LEASE, arg);
+	fl = lease_alloc(filp, flavor, arg);
 	if (IS_ERR(fl))
 		return PTR_ERR(fl);
 
@@ -2081,7 +2095,28 @@ int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
 
 	if (arg == F_UNLCK)
 		return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
-	return do_fcntl_add_lease(fd, filp, arg);
+	return do_fcntl_add_lease(fd, filp, FL_LEASE, arg);
+}
+
+/**
+ *	fcntl_setdeleg	-	sets a delegation on an open file
+ *	@fd: open file descriptor
+ *	@filp: file pointer
+ *	@deleg: delegation request from userland
+ *
+ *	Call this fcntl to establish a delegation on the file.
+ *	Note that you also need to call %F_SETSIG to
+ *	receive a signal when the delegation is broken.
+ */
+int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg)
+{
+	/* For now, no flags are supported */
+	if (deleg->d_flags != 0 || deleg->__pad != 0)
+		return -EINVAL;
+
+	if (deleg->d_type == F_UNLCK)
+		return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
+	return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type);
 }
 
 /**
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 208d108df2d7..54b824c05299 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int,
 
 int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
 int fcntl_getlease(struct file *filp);
+int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg);
+int fcntl_getdeleg(struct file *filp, struct delegation *deleg);
 
 static inline bool lock_is_unlock(struct file_lock *fl)
 {
@@ -278,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp)
 	return F_UNLCK;
 }
 
+static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg)
+{
+	return -EINVAL;
+}
+
+static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg)
+{
+	return -EINVAL;
+}
+
 static inline bool lock_is_unlock(struct file_lock *fl)
 {
 	return false;
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 3741ea1b73d8..008fac15e573 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -79,6 +79,17 @@
  */
 #define RWF_WRITE_LIFE_NOT_SET	RWH_WRITE_LIFE_NOT_SET
 
+/* Set/Get delegations */
+#define F_GETDELEG		(F_LINUX_SPECIFIC_BASE + 15)
+#define F_SETDELEG		(F_LINUX_SPECIFIC_BASE + 16)
+
+/* Argument structure for F_GETDELEG and F_SETDELEG */
+struct delegation {
+	uint32_t	d_flags;	/* Must be 0 */
+	uint16_t	d_type;		/* F_RDLCK, F_WRLCK, F_UNLCK */
+	uint16_t	__pad;		/* Must be 0 */
+};
+
 /*
  * Types of directory notifications that may be requested.
  */
-- 
cgit v1.2.3


From a3f8f8662771285511ae26c4c8d3ba1cd22159b9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 5 Nov 2025 14:39:45 +0100
Subject: power: always freeze efivarfs

The efivarfs filesystems must always be frozen and thawed to resync
variable state. Make it so.

Link: https://patch.msgid.link/20251105-vorbild-zutreffen-fe00d1dd98db@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/efivarfs/super.c      |  1 +
 fs/super.c               | 13 ++++++++++---
 include/linux/fs.h       |  3 ++-
 kernel/power/hibernate.c |  9 +++------
 kernel/power/suspend.c   |  3 +--
 5 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 1f4d8ce56667..6de97565d5f7 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = {
 	.init_fs_context = efivarfs_init_fs_context,
 	.kill_sb = efivarfs_kill_sb,
 	.parameters = efivarfs_parameters,
+	.fs_flags = FS_POWER_FREEZE,
 };
 
 static __init int efivarfs_init(void)
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..277b84e5c279 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1183,11 +1183,14 @@ static inline bool get_active_super(struct super_block *sb)
 
 static const char *filesystems_freeze_ptr = "filesystems_freeze";
 
-static void filesystems_freeze_callback(struct super_block *sb, void *unused)
+static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr)
 {
 	if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
 		return;
 
+	if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE))
+		return;
+
 	if (!get_active_super(sb))
 		return;
 
@@ -1201,9 +1204,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused)
 	deactivate_super(sb);
 }
 
-void filesystems_freeze(void)
+void filesystems_freeze(bool freeze_all)
 {
-	__iterate_supers(filesystems_freeze_callback, NULL,
+	void *freeze_all_ptr = NULL;
+
+	if (freeze_all)
+		freeze_all_ptr = &freeze_all;
+	__iterate_supers(filesystems_freeze_callback, freeze_all_ptr,
 			 SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3ea98c6cce81..249a1da8440e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2689,6 +2689,7 @@ struct file_system_type {
 #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
 #define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_LBS			128	/* FS supports LBS */
+#define FS_POWER_FREEZE		256	/* Always freeze on suspend/hibernate */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
@@ -3606,7 +3607,7 @@ extern void drop_super_exclusive(struct super_block *sb);
 extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
 extern void iterate_supers_type(struct file_system_type *,
 			        void (*)(struct super_block *, void *), void *);
-void filesystems_freeze(void);
+void filesystems_freeze(bool freeze_all);
 void filesystems_thaw(void);
 
 extern int dcache_dir_open(struct inode *, struct file *);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 14e85ff23551..1f250ce036a0 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -825,8 +825,7 @@ int hibernate(void)
 		goto Restore;
 
 	ksys_sync_helper();
-	if (filesystem_freeze_enabled)
-		filesystems_freeze();
+	filesystems_freeze(filesystem_freeze_enabled);
 
 	error = freeze_processes();
 	if (error)
@@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
 	if (error)
 		goto restore;
 
-	if (filesystem_freeze_enabled)
-		filesystems_freeze();
+	filesystems_freeze(filesystem_freeze_enabled);
 
 	error = freeze_processes();
 	if (error)
@@ -1083,8 +1081,7 @@ static int software_resume(void)
 	if (error)
 		goto Restore;
 
-	if (filesystem_freeze_enabled)
-		filesystems_freeze();
+	filesystems_freeze(filesystem_freeze_enabled);
 
 	pm_pr_dbg("Preparing processes for hibernation restore.\n");
 	error = freeze_processes();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4bb4686c1c08..c933a63a9718 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state)
 	if (error)
 		goto Restore;
 
-	if (filesystem_freeze_enabled)
-		filesystems_freeze();
+	filesystems_freeze(filesystem_freeze_enabled);
 	trace_suspend_resume(TPS("freeze_processes"), 0, true);
 	error = suspend_freeze_processes();
 	trace_suspend_resume(TPS("freeze_processes"), 0, false);
-- 
cgit v1.2.3


From ad9c62bd8946621ed02ac94131a921222508a8bc Mon Sep 17 00:00:00 2001
From: Jiaqi Yan <jiaqiyan@google.com>
Date: Mon, 13 Oct 2025 18:59:01 +0000
Subject: KVM: arm64: VM exit to userspace to handle SEA

When APEI fails to handle a stage-2 synchronous external abort (SEA),
today KVM injects an asynchronous SError into the vCPU and then resumes
it, which usually results in an unpleasant guest kernel panic.

One major source of guest SEAs is a vCPU consuming a recoverable
uncorrected memory error (UER). Although the SError and the resulting
guest kernel panic effectively stop the propagation of corrupted memory,
the guest may re-use the corrupted memory if it is auto-rebooted; in the
worst case, the guest boot itself may run into poisoned memory. So there
is room to recover from a UER in a more graceful manner.

Alternatively KVM can redirect the synchronous SEA event to VMM to
- Reduce blast radius if possible. VMM can inject a SEA to VCPU via
  KVM's existing KVM_SET_VCPU_EVENTS API. If the memory poison
  consumption or fault is not from guest kernel, blast radius can be
  limited to the triggering thread in guest userspace, so VM can
  keep running.
- Allow VMM to protect from future memory poison consumption by
  unmapping the page from stage-2, or to notify the guest of the
  poisoned page so the guest kernel can unmap it from its stage-1 page
  table.
- Allow VMM to track SEA events that VM customers care about, to restart
  VM when certain number of distinct poison events have happened,
  to provide observability to customers in log management UI.

Introduce a userspace-visible feature to enable the VMM to handle SEA:
- KVM_CAP_ARM_SEA_TO_USER. As the alternative fallback behavior
  when host APEI fails to claim a SEA, userspace can opt in to this new
  capability to let KVM exit to userspace during SEA if it is not
  owned by host.
- KVM_EXIT_ARM_SEA. A new exit reason is introduced for this.
  KVM fills kvm_run.arm_sea with as much information as possible about
  the SEA, enabling VMM to emulate SEA to guest by itself.
  - Sanitized ESR_EL2. The general rule is to keep only the bits
    useful for userspace and relevant to guest memory.
  - Flags indicating if faulting guest physical address is valid.
  - Faulting guest physical and virtual addresses if valid.

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
Co-developed-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://msgid.link/20251013185903.1372553-2-jiaqiyan@google.com
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/arm.c              |  5 +++
 arch/arm64/kvm/mmu.c              | 68 ++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h          | 10 ++++++
 4 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 64302c438355..366bf337ef64 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -350,6 +350,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_GUEST_HAS_SVE			9
 	/* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
 #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS		10
+	/* Unhandled SEAs are taken to userspace */
+#define KVM_ARCH_FLAG_EXIT_SEA				11
 	unsigned long flags;
 
 	/* VM-wide vCPU feature set */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..511d2e8ef6c7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_SEA_TO_USER:
+		r = 0;
+		set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+		break;
 	default:
 		break;
 	}
@@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_COUNTER_OFFSET:
 	case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+	case KVM_CAP_ARM_SEA_TO_USER:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..58cb169727a6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1899,8 +1899,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	read_unlock(&vcpu->kvm->mmu_lock);
 }
 
+/*
+ * Returns true if the SEA should be handled locally within KVM if the abort
+ * is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+	/*
+	 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+	 * stage-2 PTW).
+	 */
+	if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+		return true;
+
+	/* KVM owns the VNCR when the vCPU isn't in a nested context. */
+	if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+		return true;
+
+	/*
+	 * Determining if an external abort during a table walk happened at
+	 * stage-2 is only possible with S1PTW is set. Otherwise, since KVM
+	 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+	 * PA of the stage-1 descriptor) can reach here and are reported
+	 * with a TTW ESR value.
+	 */
+	return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_run *run = vcpu->run;
+	u64 esr = kvm_vcpu_get_esr(vcpu);
+	u64 esr_mask = ESR_ELx_EC_MASK	|
+		       ESR_ELx_IL	|
+		       ESR_ELx_FnV	|
+		       ESR_ELx_EA	|
+		       ESR_ELx_CM	|
+		       ESR_ELx_WNR	|
+		       ESR_ELx_FSC;
+	u64 ipa;
+
 	/*
 	 * Give APEI the opportunity to claim the abort before handling it
 	 * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
@@ -1909,7 +1949,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
 	if (apei_claim_sea(NULL) == 0)
 		return 1;
 
-	return kvm_inject_serror(vcpu);
+	if (host_owns_sea(vcpu, esr) ||
+	    !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+		return kvm_inject_serror(vcpu);
+
+	/* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+	if (kvm_has_ras(kvm))
+		esr_mask |= ESR_ELx_SET_MASK;
+
+	/*
+	 * Exit to userspace, and provide faulting guest virtual and physical
+	 * addresses in case userspace wants to emulate SEA to guest by
+	 * writing to FAR_ELx and HPFAR_ELx registers.
+	 */
+	memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+	run->exit_reason = KVM_EXIT_ARM_SEA;
+	run->arm_sea.esr = esr & esr_mask;
+
+	if (!(esr & ESR_ELx_FnV))
+		run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+	ipa = kvm_vcpu_get_fault_ipa(vcpu);
+	if (ipa != INVALID_GPA) {
+		run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+		run->arm_sea.gpa = ipa;
+	}
+
+	return 0;
 }
 
 /**
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab020..1e541193e98d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -179,6 +179,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_LOONGARCH_IOCSR  38
 #define KVM_EXIT_MEMORY_FAULT     39
 #define KVM_EXIT_TDX              40
+#define KVM_EXIT_ARM_SEA          41
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -473,6 +474,14 @@ struct kvm_run {
 				} setup_event_notify;
 			};
 		} tdx;
+		/* KVM_EXIT_ARM_SEA */
+		struct {
+#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID	(1ULL << 0)
+			__u64 flags;
+			__u64 esr;
+			__u64 gva;
+			__u64 gpa;
+		} arm_sea;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -963,6 +972,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_RISCV_MP_STATE_RESET 242
 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
 #define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_ARM_SEA_TO_USER 245
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
-- 
cgit v1.2.3


From 4e5cba5bb6f37ceaba6a2628a171cbede02f969c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 11 Nov 2025 22:29:08 -0800
Subject: RDMA/cm: Correct typedef and bad line warnings

In include/rdma/ib_cm.h:

Correct a typedef's kernel-doc notation by adding the 'typedef' keyword
to it to avoid a warning.
Add a leading " *" to a kernel-doc line to avoid a warning.

Warning: ib_cm.h:289 function parameter 'ib_cm_handler' not described
 in 'int'
Warning: ib_cm.h:289 expecting prototype for ib_cm_handler().  Prototype
 was for int() instead
Warning: ib_cm.h:484 bad line: connection message in case duplicates
 are received.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251112062908.2711007-1-rdunlap@infradead.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/rdma/ib_cm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 1fa3786f82f4..4808a355de41 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -271,7 +271,7 @@ struct ib_cm_event {
 #define CM_APR_ATTR_ID		cpu_to_be16(0x001A)
 
 /**
- * ib_cm_handler - User-defined callback to process communication events.
+ * typedef ib_cm_handler - User-defined callback to process communication events.
  * @cm_id: Communication identifier associated with the reported event.
  * @event: Information about the communication event.
  *
@@ -482,7 +482,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
 
 /**
  * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a
-     connection message in case duplicates are received.
+ *   connection message in case duplicates are received.
  * @cm_id: Connection identifier associated with the connection message.
  */
 int ib_prepare_cm_mra(struct ib_cm_id *cm_id);
-- 
cgit v1.2.3


From 78f0e33cd6c939a555aa80dbed2fec6b333a7660 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 11 Nov 2025 06:28:15 +0000
Subject: fs/namespace: correctly handle errors returned by
 grab_requested_mnt_ns

grab_requested_mnt_ns was changed to return error codes on failure, but
its callers were not updated to check for error pointers, still checking
only for a NULL return value.

This commit updates the callers to use IS_ERR() or IS_ERR_OR_NULL() and
PTR_ERR() to correctly check for and propagate errors.

This also makes sure that the logic actually works and mount namespace
file descriptors can be used to refer to mounts.

Christian Brauner <brauner@kernel.org> says:

Rework the patch to be more ergonomic and in line with our overall error
handling patterns.

Fixes: 7b9d14af8777 ("fs: allow mount namespace fd")
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrei Vagin <avagin@google.com>
Link: https://patch.msgid.link/20251111062815.2546189-1-avagin@google.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 32 ++++++++++++++++----------------
 include/uapi/linux/mount.h |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index cc6e00e72437..2bad25709b2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -141,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 		kfree(ns);
 	}
 }
-DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *,
+	    if (!IS_ERR(_T)) mnt_ns_release(_T))
 
 static void mnt_ns_release_rcu(struct rcu_head *rcu)
 {
@@ -5726,7 +5727,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
 	if (ret)
 		return ret;
-	if (kreq->spare != 0)
+	if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
 		return -EINVAL;
 	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
 	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5743,16 +5744,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
 {
 	struct mnt_namespace *mnt_ns;
 
-	if (kreq->mnt_ns_id && kreq->spare)
-		return ERR_PTR(-EINVAL);
-
-	if (kreq->mnt_ns_id)
-		return lookup_mnt_ns(kreq->mnt_ns_id);
-
-	if (kreq->spare) {
+	if (kreq->mnt_ns_id) {
+		mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id);
+	} else if (kreq->mnt_ns_fd) {
 		struct ns_common *ns;
 
-		CLASS(fd, f)(kreq->spare);
+		CLASS(fd, f)(kreq->mnt_ns_fd);
 		if (fd_empty(f))
 			return ERR_PTR(-EBADF);
 
@@ -5767,6 +5764,8 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
 	} else {
 		mnt_ns = current->nsproxy->mnt_ns;
 	}
+	if (!mnt_ns)
+		return ERR_PTR(-ENOENT);
 
 	refcount_inc(&mnt_ns->passive);
 	return mnt_ns;
@@ -5791,8 +5790,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 		return ret;
 
 	ns = grab_requested_mnt_ns(&kreq);
-	if (!ns)
-		return -ENOENT;
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
 
 	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
 	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
@@ -5902,8 +5901,8 @@ static void __free_klistmount_free(const struct klistmount *kls)
 static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
 				     size_t nr_mnt_ids)
 {
-
 	u64 last_mnt_id = kreq->param;
+	struct mnt_namespace *ns;
 
 	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
 	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5917,9 +5916,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *
 	if (!kls->kmnt_ids)
 		return -ENOMEM;
 
-	kls->ns = grab_requested_mnt_ns(kreq);
-	if (!kls->ns)
-		return -ENOENT;
+	ns = grab_requested_mnt_ns(kreq);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
+	kls->ns = ns;
 
 	kls->mnt_parent_id = kreq->mnt_id;
 	return 0;
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 7fa67c2031a5..5d3f8c9e3a62 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -197,7 +197,7 @@ struct statmount {
  */
 struct mnt_id_req {
 	__u32 size;
-	__u32 spare;
+	__u32 mnt_ns_fd;
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
-- 
cgit v1.2.3


From 12741624645e098b2234a5ae341045a97473caf1 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 5 Nov 2025 22:20:24 +0100
Subject: fs: add iput_not_last()

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251105212025.807549-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 12 ++++++++++++
 include/linux/fs.h |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..cff1d3af0d57 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1967,6 +1967,18 @@ retry:
 }
 EXPORT_SYMBOL(iput);
 
+/**
+ *	iput_not_last	- put an inode assuming this is not the last reference
+ *	@inode: inode to put
+ */
+void iput_not_last(struct inode *inode)
+{
+	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode);
+
+	WARN_ON(atomic_sub_return(1, &inode->i_count) == 0);
+}
+EXPORT_SYMBOL(iput_not_last);
+
 #ifdef CONFIG_BLOCK
 /**
  *	bmap	- find a block number in a file
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 249a1da8440e..dd3b57cfadee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2824,6 +2824,7 @@ extern int current_umask(void);
 
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
+void iput_not_last(struct inode *);
 int inode_update_timestamps(struct inode *inode, int flags);
 int generic_update_time(struct inode *, int);
 
-- 
cgit v1.2.3


From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Tue, 11 Nov 2025 11:36:52 -0800
Subject: docs: document iomap writeback's iomap_finish_folio_write()
 requirement

Document that iomap_finish_folio_write() must be called after writeback
on the range completes.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst | 3 +++
 include/linux/iomap.h                          | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index c88205132039..4d30723be7fa 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -361,6 +361,9 @@ The fields are as follows:
     delalloc reservations to avoid having delalloc reservations for
     clean pagecache.
     This function must be supplied by the filesystem.
+    If this succeeds, iomap_finish_folio_write() must be called once writeback
+    completes for the range, regardless of whether the writeback succeeded or
+    failed.
 
   - ``writeback_submit``: Submit the previous built writeback context.
     Block based file systems should use the iomap_ioend_writeback_submit
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b1ac08c7474..a5032e456079 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -435,6 +435,10 @@ struct iomap_writeback_ops {
 	 * An existing mapping from a previous call to this method can be reused
 	 * by the file system if it is still valid.
 	 *
+	 * If this succeeds, iomap_finish_folio_write() must be called once
+	 * writeback completes for the range, regardless of whether the
+	 * writeback succeeded or failed.
+	 *
 	 * Returns the number of bytes processed or a negative errno.
 	 */
 	ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
-- 
cgit v1.2.3


From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Tue, 11 Nov 2025 11:36:53 -0800
Subject: iomap: optimize pending async writeback accounting

Pending writebacks must be accounted for to determine when all requests
have completed and writeback on the folio should be ended. Currently
this is done by atomically incrementing ifs->write_bytes_pending for
every range to be written back.

Instead, the number of atomic operations can be minimized by setting
ifs->write_bytes_pending to the folio size, internally tracking how many
bytes are written back asynchronously, and then after sending off all
the requests, decrementing ifs->write_bytes_pending by the number of
bytes not written back asynchronously. Now, for N ranges written back,
only N + 2 atomic operations are required instead of 2N + 2.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fuse/file.c         |  4 ++--
 fs/iomap/buffered-io.c | 58 +++++++++++++++++++++++++++++---------------------
 fs/iomap/ioend.c       |  2 --
 include/linux/iomap.h  |  2 --
 4 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8275b6681b9b..b343a6f37563 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1885,7 +1885,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
 		 * scope of the fi->lock alleviates xarray lock
 		 * contention and noticeably improves performance.
 		 */
-		iomap_finish_folio_write(inode, ap->folios[i], 1);
+		iomap_finish_folio_write(inode, ap->folios[i],
+					 ap->descs[i].length);
 
 	wake_up(&fi->page_waitq);
 }
@@ -2221,7 +2222,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
 		ap = &wpa->ia.ap;
 	}
 
-	iomap_start_folio_write(inode, folio, 1);
 	fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
 				      offset, len);
 	data->nr_bytes += len;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0eb439b523b1..1873a2f74883 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1641,16 +1641,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
 
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
-		size_t len)
+static void iomap_writeback_init(struct inode *inode, struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
 
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
-	if (ifs)
-		atomic_add(len, &ifs->write_bytes_pending);
+	if (ifs) {
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		/*
+		 * Set this to the folio size. After processing the folio for
+		 * writeback in iomap_writeback_folio(), we'll subtract any
+		 * ranges not written back.
+		 *
+		 * We do this because otherwise, we would have to atomically
+		 * increment ifs->write_bytes_pending every time a range in the
+		 * folio needs to be written back.
+		 */
+		atomic_set(&ifs->write_bytes_pending, folio_size(folio));
+	}
 }
-EXPORT_SYMBOL_GPL(iomap_start_folio_write);
 
 void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 		size_t len)
@@ -1667,7 +1676,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
 
 static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
 		struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
-		bool *wb_pending)
+		size_t *bytes_submitted)
 {
 	do {
 		ssize_t ret;
@@ -1681,11 +1690,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
 		pos += ret;
 
 		/*
-		 * Holes are not be written back by ->writeback_range, so track
+		 * Holes are not written back by ->writeback_range, so track
 		 * if we did handle anything that is not a hole here.
 		 */
 		if (wpc->iomap.type != IOMAP_HOLE)
-			*wb_pending = true;
+			*bytes_submitted += ret;
 	} while (rlen);
 
 	return 0;
@@ -1756,7 +1765,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
 	u64 end_aligned = 0;
-	bool wb_pending = false;
+	size_t bytes_submitted = 0;
 	int error = 0;
 	u32 rlen;
 
@@ -1776,14 +1785,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 			iomap_set_range_dirty(folio, 0, end_pos - pos);
 		}
 
-		/*
-		 * Keep the I/O completion handler from clearing the writeback
-		 * bit until we have submitted all blocks by adding a bias to
-		 * ifs->write_bytes_pending, which is dropped after submitting
-		 * all blocks.
-		 */
-		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
-		iomap_start_folio_write(inode, folio, 1);
+		iomap_writeback_init(inode, folio);
 	}
 
 	/*
@@ -1798,13 +1800,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 	end_aligned = round_up(end_pos, i_blocksize(inode));
 	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
 		error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
-				&wb_pending);
+				&bytes_submitted);
 		if (error)
 			break;
 		pos += rlen;
 	}
 
-	if (wb_pending)
+	if (bytes_submitted)
 		wpc->nr_folios++;
 
 	/*
@@ -1822,12 +1824,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 	 * bit ourselves right after unlocking the page.
 	 */
 	if (ifs) {
-		if (atomic_dec_and_test(&ifs->write_bytes_pending))
-			folio_end_writeback(folio);
-	} else {
-		if (!wb_pending)
-			folio_end_writeback(folio);
+		/*
+		 * Subtract any bytes that were initially accounted to
+		 * write_bytes_pending but skipped for writeback.
+		 */
+		size_t bytes_not_submitted = folio_size(folio) -
+				bytes_submitted;
+
+		if (bytes_not_submitted)
+			iomap_finish_folio_write(inode, folio,
+					bytes_not_submitted);
+	} else if (!bytes_submitted) {
+		folio_end_writeback(folio);
 	}
+
 	mapping_set_error(inode->i_mapping, error);
 	return error;
 }
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..86f44922ed3b 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -194,8 +194,6 @@ new_ioend:
 	if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
 		goto new_ioend;
 
-	iomap_start_folio_write(wpc->inode, folio, map_len);
-
 	/*
 	 * Clamp io_offset and io_size to the incore EOF so that ondisk
 	 * file size updates in the ioend completion are byte-accurate.
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index a5032e456079..b49e47f069db 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
 
 void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
 		int error);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
-		size_t len);
 void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 		size_t len);
 
-- 
cgit v1.2.3


From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Tue, 11 Nov 2025 11:36:54 -0800
Subject: iomap: simplify ->read_folio_range() error handling for reads

Instead of requiring that the caller calls iomap_finish_folio_read()
even if the ->read_folio_range() callback returns an error, account for
this internally in iomap, which makes the interface simpler and makes it
match the error handling expectations of writeback's ->writeback_range()
callback.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst |  7 ++-
 fs/fuse/file.c                                 | 10 +---
 fs/iomap/buffered-io.c                         | 63 ++++++++++++++------------
 include/linux/iomap.h                          |  5 +-
 4 files changed, 41 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 4d30723be7fa..64f4baf5750e 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -149,10 +149,9 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
 iomap calls these functions:
 
   - ``read_folio_range``: Called to read in the range. This must be provided
-    by the caller. The caller is responsible for calling
-    iomap_finish_folio_read() after reading in the folio range. This should be
-    done even if an error is encountered during the read. This returns 0 on
-    success or a negative error on failure.
+    by the caller. If this succeeds, iomap_finish_folio_read() must be called
+    after the range is read in, regardless of whether the read succeeded or
+    failed.
 
   - ``submit_read``: Submit any pending read requests. This function is
     optional.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b343a6f37563..7bcb650a9f26 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -922,13 +922,6 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
 
 	if (ctx->rac) {
 		ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
-		/*
-		 * If fuse_handle_readahead was successful, fuse_readpages_end
-		 * will do the iomap_finish_folio_read, else we need to call it
-		 * here
-		 */
-		if (ret)
-			iomap_finish_folio_read(folio, off, len, ret);
 	} else {
 		/*
 		 *  for non-readahead read requests, do reads synchronously
@@ -936,7 +929,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
 		 *  out-of-order reads
 		 */
 		ret = fuse_do_readfolio(file, folio, off, len);
-		iomap_finish_folio_read(folio, off, len, ret);
+		if (!ret)
+			iomap_finish_folio_read(folio, off, len, ret);
 	}
 	return ret;
 }
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1873a2f74883..c82b5b24d4b3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -398,7 +398,8 @@ static void iomap_read_init(struct folio *folio)
 		 * has already finished reading in the entire folio.
 		 */
 		spin_lock_irq(&ifs->state_lock);
-		ifs->read_bytes_pending += len + 1;
+		WARN_ON_ONCE(ifs->read_bytes_pending != 0);
+		ifs->read_bytes_pending = len + 1;
 		spin_unlock_irq(&ifs->state_lock);
 	}
 }
@@ -414,43 +415,47 @@ static void iomap_read_init(struct folio *folio)
  */
 static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
 {
-	struct iomap_folio_state *ifs;
-
-	/*
-	 * If there are no bytes submitted, this means we are responsible for
-	 * unlocking the folio here, since no IO helper has taken ownership of
-	 * it.
-	 */
-	if (!bytes_submitted) {
-		folio_unlock(folio);
-		return;
-	}
+	struct iomap_folio_state *ifs = folio->private;
 
-	ifs = folio->private;
 	if (ifs) {
 		bool end_read, uptodate;
-		/*
-		 * Subtract any bytes that were initially accounted to
-		 * read_bytes_pending but skipped for IO.
-		 * The +1 accounts for the bias we added in iomap_read_init().
-		 */
-		size_t bytes_not_submitted = folio_size(folio) + 1 -
-				bytes_submitted;
 
 		spin_lock_irq(&ifs->state_lock);
-		ifs->read_bytes_pending -= bytes_not_submitted;
-		/*
-		 * If !ifs->read_bytes_pending, this means all pending reads
-		 * by the IO helper have already completed, which means we need
-		 * to end the folio read here. If ifs->read_bytes_pending != 0,
-		 * the IO helper will end the folio read.
-		 */
-		end_read = !ifs->read_bytes_pending;
+		if (!ifs->read_bytes_pending) {
+			WARN_ON_ONCE(bytes_submitted);
+			end_read = true;
+		} else {
+			/*
+			 * Subtract any bytes that were initially accounted to
+			 * read_bytes_pending but skipped for IO. The +1
+			 * accounts for the bias we added in iomap_read_init().
+			 */
+			size_t bytes_not_submitted = folio_size(folio) + 1 -
+					bytes_submitted;
+			ifs->read_bytes_pending -= bytes_not_submitted;
+			/*
+			 * If !ifs->read_bytes_pending, this means all pending
+			 * reads by the IO helper have already completed, which
+			 * means we need to end the folio read here. If
+			 * ifs->read_bytes_pending != 0, the IO helper will end
+			 * the folio read.
+			 */
+			end_read = !ifs->read_bytes_pending;
+		}
 		if (end_read)
 			uptodate = ifs_is_fully_uptodate(folio, ifs);
 		spin_unlock_irq(&ifs->state_lock);
 		if (end_read)
 			folio_end_read(folio, uptodate);
+	} else if (!bytes_submitted) {
+		/*
+		 * If there were no bytes submitted, this means we are
+		 * responsible for unlocking the folio here, since no IO helper
+		 * has taken ownership of it. If there were bytes submitted,
+		 * then the IO helper will end the read via
+		 * iomap_finish_folio_read().
+		 */
+		folio_unlock(folio);
 	}
 }
 
@@ -498,10 +503,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 		} else {
 			if (!*bytes_submitted)
 				iomap_read_init(folio);
-			*bytes_submitted += plen;
 			ret = ctx->ops->read_folio_range(iter, ctx, plen);
 			if (ret)
 				return ret;
+			*bytes_submitted += plen;
 		}
 
 		ret = iomap_iter_advance(iter, plen);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index b49e47f069db..520e967cb501 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -495,9 +495,8 @@ struct iomap_read_ops {
 	/*
 	 * Read in a folio range.
 	 *
-	 * The caller is responsible for calling iomap_finish_folio_read() after
-	 * reading in the folio range. This should be done even if an error is
-	 * encountered during the read.
+	 * If this succeeds, iomap_finish_folio_read() must be called after the
+	 * range is read in, regardless of whether the read succeeded or failed.
 	 *
 	 * Returns 0 on success or a negative error on failure.
 	 */
-- 
cgit v1.2.3


From 395b95530343e7f4bdd2870190d985a222997fb6 Mon Sep 17 00:00:00 2001
From: Luis Henriques <luis@igalia.com>
Date: Tue, 16 Sep 2025 14:53:07 +0100
Subject: dcache: export shrink_dentry_list() and add new helper
 d_dispose_if_unused()

Add and export a new helper d_dispose_if_unused() which is simply a wrapper
around to_shrink_list(), to add an entry to a dispose list if it's not used
anymore.

Also export shrink_dentry_list() to kill all dentries in a dispose list.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Luis Henriques <luis@igalia.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/dcache.c            | 18 ++++++++++++------
 include/linux/dcache.h |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 035cccbc9276..bffb1b47a907 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1086,6 +1086,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
 	return de;
 }
 
+/*
+ * d_dispose_if_unused - queue a dentry on a dispose list if it is unused
+ * @dentry: dentry to examine
+ * @dispose: list handed to to_shrink_list() when the dentry is unused
+ *
+ * Takes @dentry->d_lock, and only if the reference count
+ * (d_lockref.count) is zero passes the dentry to to_shrink_list().
+ * A dentry with a non-zero count is left untouched.
+ */
+void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose)
+{
+	spin_lock(&dentry->d_lock);
+	if (!dentry->d_lockref.count)
+		to_shrink_list(dentry, dispose);
+	spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_dispose_if_unused);
+
 /*
  *	Try to kill dentries associated with this inode.
  * WARNING: you must own a reference to inode.
@@ -1096,12 +1105,8 @@ void d_prune_aliases(struct inode *inode)
 	struct dentry *dentry;
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
-		spin_lock(&dentry->d_lock);
-		if (!dentry->d_lockref.count)
-			to_shrink_list(dentry, &dispose);
-		spin_unlock(&dentry->d_lock);
-	}
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias)
+		d_dispose_if_unused(dentry, &dispose);
 	spin_unlock(&inode->i_lock);
 	shrink_dentry_list(&dispose);
 }
@@ -1141,6 +1146,7 @@ void shrink_dentry_list(struct list_head *list)
 		shrink_kill(dentry);
 	}
 }
+EXPORT_SYMBOL(shrink_dentry_list);
 
 static enum lru_status dentry_lru_isolate(struct list_head *item,
 		struct list_lru_one *lru, void *arg)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c83e02b94389..2bc1339bf6d0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *);
 
 extern struct dentry *d_find_alias(struct inode *);
 extern void d_prune_aliases(struct inode *);
+extern void d_dispose_if_unused(struct dentry *, struct list_head *);
+extern void shrink_dentry_list(struct list_head *);
 
 extern struct dentry *d_find_alias_rcu(struct inode *);
 
-- 
cgit v1.2.3


From 854e8df2ce6b02c8be40d6f26bd8aa700b375bb2 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 23 Oct 2025 10:21:42 +0200
Subject: fs/pipe: stop duplicating union pipe_index declaration

Now that we build with -fms-extensions, union pipe_index can be
included as an anonymous member in struct pipe_inode_info, avoiding
the duplication.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Link: https://patch.msgid.link/20251023082142.2104456-1-linux@rasmusvillemoes.dk
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pipe_fs_i.h | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 9d42d473d201..7f6a92ac9704 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t;
 typedef unsigned short pipe_index_t;
 #endif
 
-/*
- * We have to declare this outside 'struct pipe_inode_info',
- * but then we can't use 'union pipe_index' for an anonymous
- * union, so we end up having to duplicate this declaration
- * below. Annoying.
+/**
+ *	union pipe_index - pipe indices
+ *	@head: The point of buffer production
+ *	@tail: The point of buffer consumption
+ *	@head_tail: unsigned long union of @head and @tail
  */
 union pipe_index {
 	unsigned long head_tail;
@@ -63,9 +63,7 @@ union pipe_index {
  *	@mutex: mutex protecting the whole thing
  *	@rd_wait: reader wait point in case of empty pipe
  *	@wr_wait: writer wait point in case of full pipe
- *	@head: The point of buffer production
- *	@tail: The point of buffer consumption
- *	@head_tail: unsigned long union of @head and @tail
+ *	@pipe_index: the pipe indices
  *	@note_loss: The next read() should insert a data-lost message
  *	@max_usage: The maximum number of slots that may be used in the ring
  *	@ring_size: total number of buffers (should be a power of 2)
@@ -87,14 +85,7 @@ struct pipe_inode_info {
 	struct mutex mutex;
 	wait_queue_head_t rd_wait, wr_wait;
 
-	/* This has to match the 'union pipe_index' above */
-	union {
-		unsigned long head_tail;
-		struct {
-			pipe_index_t head;
-			pipe_index_t tail;
-		};
-	};
+	union pipe_index;
 
 	unsigned int max_usage;
 	unsigned int ring_size;
-- 
cgit v1.2.3


From e631df89cd5d638a9d7c152dd9b0a92643efab3e Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Fri, 7 Nov 2025 15:21:47 +0100
Subject: fs: speed up path lookup with cheaper handling of MAY_EXEC

The generic inode_permission() routine does work which is known to be of
no significance for lookup. There are checks for MAY_WRITE, while the
requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
is called to check for devices, but it is an invariant the inode is a
directory.

Absent a ->permission func, execution lands in generic_permission()
which checks upfront if the requested permission is granted for
everyone.

We can elide the branches which are guaranteed to be false and cut
straight to the check if everyone happens to be allowed MAY_EXEC on the
inode (which holds true most of the time).

Moreover, filesystems which provide their own ->permission routine can
take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
flag on their inodes, which they can legitimately do if their MAY_EXEC
handling matches generic_permission().

As a simple benchmark, as part of compilation gcc issues access(2) on
numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o

Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
before: 3797556
after:  3987789 (+5%)

Note: this depends on the not-yet-landed ext4 patch to mark inodes with
cache_no_acl()

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251107142149.989998-2-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c         | 43 +++++++++++++++++++++++++++++++++++++++++--
 include/linux/fs.h | 13 +++++++------
 2 files changed, 48 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index 1d4d17f24fb2..94cb52b01022 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Separate out file-system wide checks from inode-specific permission checks.
+ *
+ * Note: lookup_inode_permission_may_exec() does not call here. If you add
+ * MAY_EXEC checks, adjust it.
  */
 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 {
@@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
 }
 EXPORT_SYMBOL(inode_permission);
 
+/*
+ * lookup_inode_permission_may_exec - Check traversal right for given inode
+ *
+ * This is a special case routine for may_lookup() making assumptions specific
+ * to path traversal. Use inode_permission() if you are doing something else.
+ *
+ * Work is shaved off compared to inode_permission() as follows:
+ * - we know for a fact there is no MAY_WRITE to worry about
+ * - it is an invariant the inode is a directory
+ *
+ * Since the majority of real-world traversal happens on inodes which grant it for
+ * everyone, we check it upfront and only resort to more expensive work if it
+ * fails.
+ *
+ * Filesystems which have their own ->permission hook and consequently miss out
+ * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
+ * on their directory inodes.
+ */
+static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
+	struct inode *inode, int mask)
+{
+	/* Lookup already checked this to return -ENOTDIR */
+	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
+	/* Callers may only pass 0 or MAY_NOT_BLOCK; MAY_EXEC is added below. */
+	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
+
+	mask |= MAY_EXEC;
+
+	/* Neither fast-path flag set: defer to the full inode_permission(). */
+	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
+		return inode_permission(idmap, inode, mask);
+
+	/*
+	 * The shortcut is only taken when all three execute bits are set and
+	 * no_acl_inode() holds; anything else goes through the full check.
+	 */
+	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
+		return inode_permission(idmap, inode, mask);
+
+	/* Fast path: only the security hook is left to consult. */
+	return security_inode_permission(inode, mask);
+}
+
 /**
  * path_get - get a reference to a path
  * @path: path to get the reference to
@@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
 	int err, mask;
 
 	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
-	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
 	if (likely(!err))
 		return 0;
 
@@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
 	if (err != -ECHILD)	// hard error
 		return err;
 
-	return inode_permission(idmap, nd->inode, MAY_EXEC);
+	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..ff69734b9fde 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -659,13 +659,14 @@ is_uncached_acl(struct posix_acl *acl)
 	return (long)acl & 1;
 }
 
-#define IOP_FASTPERM	0x0001
-#define IOP_LOOKUP	0x0002
-#define IOP_NOFOLLOW	0x0004
-#define IOP_XATTR	0x0008
+#define IOP_FASTPERM		0x0001
+#define IOP_LOOKUP		0x0002
+#define IOP_NOFOLLOW		0x0004
+#define IOP_XATTR		0x0008
 #define IOP_DEFAULT_READLINK	0x0010
-#define IOP_MGTIME	0x0020
-#define IOP_CACHED_LINK	0x0040
+#define IOP_MGTIME		0x0020
+#define IOP_CACHED_LINK		0x0040
+#define IOP_FASTPERM_MAY_EXEC	0x0080
 
 /*
  * Inode state bits.  Protected by inode->i_lock
-- 
cgit v1.2.3


From 21b561dab1406e63740ebe240c7b69f19e1bcf58 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 5 Nov 2025 16:36:22 +0100
Subject: fs: hide dentry_cache behind runtime const machinery

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251105153622.758836-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/dcache.c                       | 6 ++++--
 include/asm-generic/vmlinux.lds.h | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 035cccbc9276..5c6282b03ba2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
-static struct kmem_cache *dentry_cache __ro_after_init;
+static struct kmem_cache *__dentry_cache __ro_after_init;
+#define dentry_cache runtime_const_ptr(__dentry_cache)
 
 const struct qstr empty_name = QSTR_INIT("", 0);
 EXPORT_SYMBOL(empty_name);
@@ -3222,9 +3223,10 @@ static void __init dcache_init(void)
 	 * but it is probably not worth it because of the cache nature
 	 * of the dcache.
 	 */
-	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+	__dentry_cache = KMEM_CACHE_USERCOPY(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
 		d_shortname.string);
+	runtime_const_init(ptr, __dentry_cache);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a9a2e732a65..20939d2445e7 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -955,7 +955,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 
 #define RUNTIME_CONST_VARIABLES						\
 		RUNTIME_CONST(shift, d_hash_shift)			\
-		RUNTIME_CONST(ptr, dentry_hashtable)
+		RUNTIME_CONST(ptr, dentry_hashtable)			\
+		RUNTIME_CONST(ptr, __dentry_cache)
 
 /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
 #define KUNIT_TABLE()							\
-- 
cgit v1.2.3


From f99eb098090e4c8bfca4190b545e20450fee8250 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:12 +0100
Subject: platform/x86: asus-armoury: move existing tunings to asus-armoury
 module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fw_attributes_class provides a much cleaner interface to all of the
attributes introduced to asus-wmi. This patch moves all of these extra
attributes over to fw_attributes_class, and shifts the bulk of these
definitions to a new kernel module to reduce the clutter of asus-wmi
with the intention of deprecating the asus-wmi attributes in future.

The work applies only to WMI methods which don't have a clearly defined
place within the sysfs and as a result ended up lumped together in
/sys/devices/platform/asus-nb-wmi/ with no standard API.

Where possible the fw attrs now implement defaults, min, max, scalar,
choices, etc. As an example dgpu_disable becomes:

/sys/class/firmware-attributes/asus-armoury/attributes/dgpu_disable/
├── current_value
├── display_name
├── possible_values
└── type

as do other attributes.

Co-developed-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://patch.msgid.link/20251102215319.3126879-3-denis.benato@linux.dev
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/hid/hid-asus.c                             |   1 +
 drivers/platform/x86/Kconfig                       |  12 +
 drivers/platform/x86/Makefile                      |   1 +
 drivers/platform/x86/asus-armoury.c                | 763 +++++++++++++++++++++
 drivers/platform/x86/asus-armoury.h                | 200 ++++++
 drivers/platform/x86/asus-wmi.c                    |  10 +-
 .../linux/platform_data/x86/asus-wmi-leds-ids.h    |  50 ++
 include/linux/platform_data/x86/asus-wmi.h         |  44 +-
 8 files changed, 1034 insertions(+), 47 deletions(-)
 create mode 100644 drivers/platform/x86/asus-armoury.c
 create mode 100644 drivers/platform/x86/asus-armoury.h
 create mode 100644 include/linux/platform_data/x86/asus-wmi-leds-ids.h

(limited to 'include')

diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c
index a444d41e53b6..472bca54642b 100644
--- a/drivers/hid/hid-asus.c
+++ b/drivers/hid/hid-asus.c
@@ -27,6 +27,7 @@
 #include <linux/hid.h>
 #include <linux/module.h>
 #include <linux/platform_data/x86/asus-wmi.h>
+#include <linux/platform_data/x86/asus-wmi-leds-ids.h>
 #include <linux/input/mt.h>
 #include <linux/usb.h> /* For to_usb_interface for T100 touchpad intf check */
 #include <linux/power_supply.h>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 1e9b84f1098f..ba0806b48bb9 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -264,6 +264,18 @@ config ASUS_WIRELESS
 	  If you choose to compile this driver as a module the module will be
 	  called asus-wireless.
 
+config ASUS_ARMOURY
+	tristate "ASUS Armoury driver"
+	depends on ASUS_WMI
+	select FW_ATTR_CLASS
+	help
+	  Say Y here if you have a WMI aware Asus machine and would like to use the
+	  firmware_attributes API to control various settings typically exposed in
+	  the ASUS Armoury Crate application available on Windows.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called asus-armoury.
+
 config ASUS_WMI
 	tristate "ASUS WMI Driver"
 	depends on ACPI_WMI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index d722e244a4a7..d59a2ed5932c 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_APPLE_GMUX)	+= apple-gmux.o
 # ASUS
 obj-$(CONFIG_ASUS_LAPTOP)	+= asus-laptop.o
 obj-$(CONFIG_ASUS_WIRELESS)	+= asus-wireless.o
+obj-$(CONFIG_ASUS_ARMOURY)	+= asus-armoury.o
 obj-$(CONFIG_ASUS_WMI)		+= asus-wmi.o
 obj-$(CONFIG_ASUS_NB_WMI)	+= asus-nb-wmi.o
 obj-$(CONFIG_ASUS_TF103C_DOCK)	+= asus-tf103c-dock.o
diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c
new file mode 100644
index 000000000000..81b4972df818
--- /dev/null
+++ b/drivers/platform/x86/asus-armoury.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Asus Armoury (WMI) attributes driver.
+ *
+ * This driver uses the fw_attributes class to expose various WMI functions
+ * that are present in many gaming and some non-gaming ASUS laptops.
+ *
+ * These typically don't fit anywhere else in the sysfs such as under LED class,
+ * hwmon or others, and are set in Windows using the ASUS Armoury Crate tool.
+ *
+ * Copyright(C) 2024 Luke Jones <luke@ljones.dev>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/acpi.h>
+#include <linux/array_size.h>
+#include <linux/bitfield.h>
+#include <linux/device.h>
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/kobject.h>
+#include <linux/kstrtox.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/platform_data/x86/asus-wmi.h>
+#include <linux/printk.h>
+#include <linux/sysfs.h>
+
+#include "asus-armoury.h"
+#include "firmware_attributes_class.h"
+
+#define ASUS_NB_WMI_EVENT_GUID "0B3CBB35-E3C2-45ED-91C2-4C5A6D195D1C"
+
+#define ASUS_MINI_LED_MODE_MASK   GENMASK(1, 0)
+/* Standard modes for devices with only on/off */
+#define ASUS_MINI_LED_OFF         0x00
+#define ASUS_MINI_LED_ON          0x01
+/* Like "on" but the effect is more vibrant or brighter */
+#define ASUS_MINI_LED_STRONG_MODE 0x02
+/* New modes for devices with 3 mini-led mode types */
+#define ASUS_MINI_LED_2024_WEAK   0x00
+#define ASUS_MINI_LED_2024_STRONG 0x01
+#define ASUS_MINI_LED_2024_OFF    0x02
+
+/* Driver-wide state for the asus-armoury firmware-attributes driver. */
+struct asus_armoury_priv {
+	/* Device whose kobject receives the KOBJ_CHANGE uevent on pending reboot. */
+	struct device *fw_attr_dev;
+	/* NOTE(review): presumably the kset holding the attribute groups - confirm. */
+	struct kset *fw_attr_kset;
+
+	/*
+	 * Mutex to protect eGPU activation/deactivation
+	 * sequences and dGPU connection status:
+	 * do not allow concurrent changes or changes
+	 * before a reboot if dGPU got disabled.
+	 */
+	struct mutex egpu_mutex;
+
+	/* WMI device ID used for mini-LED control (MINI_LED_MODE or MINI_LED_MODE2). */
+	u32 mini_led_dev_id;
+	/* WMI device ID used for the GPU MUX; tested for zero before being queried. */
+	u32 gpu_mux_dev_id;
+};
+
+static struct asus_armoury_priv asus_armoury = {
+	.egpu_mutex = __MUTEX_INITIALIZER(asus_armoury.egpu_mutex),
+};
+
+/* State reported through the "pending_reboot" attribute. */
+struct fw_attrs_group {
+	/* Set once a change that needs a reboot has been written. */
+	bool pending_reboot;
+};
+
+static struct fw_attrs_group fw_attrs = {
+	.pending_reboot = false,
+};
+
+/*
+ * Pairs an attribute group with a WMI device ID.
+ * NOTE(review): presumably the ID gates whether the group is registered -
+ * confirm against the users of this struct.
+ */
+struct asus_attr_group {
+	const struct attribute_group *attr_group;
+	u32 wmi_devid;
+};
+
+/*
+ * Record that a reboot is pending and emit a KOBJ_CHANGE uevent on the
+ * firmware-attributes device.
+ */
+static void asus_set_reboot_and_signal_event(void)
+{
+	fw_attrs.pending_reboot = true;
+	kobject_uevent(&asus_armoury.fw_attr_dev->kobj, KOBJ_CHANGE);
+}
+
+/* sysfs show handler: prints the pending_reboot flag as 0 or 1. */
+static ssize_t pending_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", fw_attrs.pending_reboot);
+}
+
+static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot);
+
+/* Only the "gpu_mux_mode" attribute is treated as needing a reboot. */
+static bool asus_bios_requires_reboot(struct kobj_attribute *attr)
+{
+	const char *name = attr->attr.name;
+
+	return strcmp(name, "gpu_mux_mode") == 0;
+}
+
+/**
+ * armoury_has_devstate() - Check presence of the WMI function state.
+ *
+ * @dev_id: The WMI method ID to check for presence.
+ *
+ * Returns: true iff the DSTS query succeeds and reports the presence bit.
+ */
+static bool armoury_has_devstate(u32 dev_id)
+{
+	/*
+	 * Start from zero so the debug print below never logs an
+	 * indeterminate value if the WMI call fails without writing it.
+	 */
+	u32 retval = 0;
+	int status;
+
+	status = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, &retval);
+	pr_debug("%s called (0x%08x), retval: 0x%08x\n", __func__, dev_id, retval);
+
+	return status == 0 && (retval & ASUS_WMI_DSTS_PRESENCE_BIT);
+}
+
+/**
+ * armoury_get_devstate() - Get the WMI function state.
+ * @attr: NULL or the kobj_attribute associated to called WMI function;
+ *        only used to name the attribute in the error message.
+ * @retval:
+ * * non-NULL pointer to where to store the value returned from WMI
+ * * with the function presence bit cleared.
+ * @dev_id: The WMI method ID to call.
+ *
+ * Intended usage is from sysfs attribute checking associated WMI function.
+ *
+ * Returns:
+ * * %0		- successful and retval is filled.
+ * * %other	- error from asus_wmi_get_devstate_dsts(), passed through
+ *		  unchanged.
+ */
+static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 dev_id)
+{
+	int err;
+
+	err = asus_wmi_get_devstate_dsts(dev_id, retval);
+	if (err) {
+		if (attr)
+			pr_err("Failed to get %s: %d\n", attr->attr.name, err);
+		else
+			pr_err("Failed to get devstate for 0x%x: %d\n", dev_id, err);
+
+		return err;
+	}
+
+	/*
+	 * asus_wmi_get_devstate_dsts will populate retval with WMI return, but
+	 * the true value is expressed when ASUS_WMI_DSTS_PRESENCE_BIT is clear.
+	 */
+	*retval &= ~ASUS_WMI_DSTS_PRESENCE_BIT;
+
+	return 0;
+}
+
+/**
+ * armoury_set_devstate() - Set the WMI function state.
+ * @attr: NULL or the kobj_attribute associated to called WMI function;
+ *        only used to name the attribute in error messages.
+ * @value: The new value to be set.
+ * @retval: Where to store the value returned from WMI or NULL.
+ * @dev_id: The WMI method ID to call.
+ *
+ * Intended usage is from sysfs attribute setting associated WMI function.
+ * Before calling the presence of the function should be checked.
+ *
+ * Every WMI write MUST go through this function to enforce safety checks.
+ *
+ * Results !1 is usually considered a fail by ASUS, but some WMI methods
+ * (like eGPU or CPU cores) do use > 1 to return a status code or similar:
+ * in these cases caller is interested in the actual return value
+ * and should perform relevant checks. When @retval is NULL this function
+ * itself only rejects a result of 0.
+ *
+ * Returns:
+ * * %-EIO	- @retval is NULL and the WMI function returned 0.
+ * * %0		- successful and retval (if non-NULL) is filled.
+ * * %other	- error from WMI call.
+ */
+static int armoury_set_devstate(struct kobj_attribute *attr,
+				     u32 value, u32 *retval, u32 dev_id)
+{
+	u32 result;
+	int err;
+
+	err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result);
+	if (err) {
+		if (attr)
+			pr_err("Failed to set %s: %d\n", attr->attr.name, err);
+		else
+			pr_err("Failed to set devstate for 0x%x: %d\n", dev_id, err);
+
+		return err;
+	}
+
+	/*
+	 * If retval == NULL caller is uninterested in return value:
+	 * perform the most common result check here.
+	 */
+	if (!retval && result == 0) {
+		/* attr may be NULL here just like in the error path above. */
+		if (attr)
+			pr_err("Failed to set %s: (result): 0x%x\n", attr->attr.name, result);
+		else
+			pr_err("Failed to set devstate for 0x%x: (result): 0x%x\n",
+			       dev_id, result);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Emit the indices 0 .. enum_values - 1 into buf as a ';'-separated list
+ * followed by a newline. Returns the sum of the sysfs_emit_at() results.
+ */
+static int armoury_attr_enum_list(char *buf, size_t enum_values)
+{
+	int written = 0;
+	size_t idx;
+
+	/* Every entry but the first is preceded by the separator. */
+	for (idx = 0; idx < enum_values; idx++)
+		written += sysfs_emit_at(buf, written, idx ? ";%zu" : "%zu", idx);
+
+	written += sysfs_emit_at(buf, written, "\n");
+	return written;
+}
+
+/*
+ * Generic store helper: parse a decimal u32 from buf, reject it unless it is
+ * within [min, max], write it to wmi_dev through armoury_set_devstate() and,
+ * on success, optionally cache it in *store_value. Notifies sysfs and flags
+ * a pending reboot for attributes that need one. Returns count on success
+ * or a negative errno.
+ */
+ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr,
+				const char *buf, size_t count, u32 min, u32 max,
+				u32 *store_value, u32 wmi_dev)
+{
+	u32 parsed;
+	int ret;
+
+	ret = kstrtou32(buf, 10, &parsed);
+	if (ret)
+		return ret;
+
+	if (!(parsed >= min && parsed <= max))
+		return -EINVAL;
+
+	ret = armoury_set_devstate(attr, parsed, NULL, wmi_dev);
+	if (ret)
+		return ret;
+
+	if (store_value)
+		*store_value = parsed;
+
+	sysfs_notify(kobj, NULL, attr->attr.name);
+
+	if (asus_bios_requires_reboot(attr))
+		asus_set_reboot_and_signal_event();
+
+	return count;
+}
+
+/*
+ * Generic show helper: read the state of wmi_dev and print it as an unsigned
+ * decimal. Returns the sysfs_emit() length or the error from the read.
+ */
+ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr,
+				char *buf, u32 wmi_dev)
+{
+	u32 state;
+	int ret = armoury_get_devstate(attr, &state, wmi_dev);
+
+	return ret ? ret : sysfs_emit(buf, "%u\n", state);
+}
+
+/* sysfs show handler for the "type" file of enumeration attributes. */
+static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	return sysfs_emit(buf, "enumeration\n");
+}
+
+/* Mini-LED mode **************************************************************/
+
+/*
+ * Values map for mini-led modes on 2023 and earlier models.
+ * The array index is the value exposed through sysfs; the entry is the
+ * firmware value. For this generation the two coincide (0 = off, 1 = on).
+ */
+static u32 mini_led_mode1_map[] = {
+	[0] = ASUS_MINI_LED_OFF,
+	[1] = ASUS_MINI_LED_ON,
+};
+
+/*
+ * Values map for mini-led modes on 2024 and later models.
+ * The array index is the value exposed through sysfs; the entry is the
+ * firmware value. The two differ here: index 0 (off) is firmware value
+ * 0x02, index 1 (weak) is 0x00 and index 2 (strong) is 0x01.
+ */
+static u32 mini_led_mode2_map[] = {
+	[0] = ASUS_MINI_LED_2024_OFF,
+	[1] = ASUS_MINI_LED_2024_WEAK,
+	[2] = ASUS_MINI_LED_2024_STRONG,
+};
+
+/*
+ * Show the current mini-LED mode.
+ *
+ * Reads the firmware state for the detected mini-LED device ID, masks out the
+ * mode bits and translates the firmware value back to its index in the map
+ * for that device ID, which is the value printed to userspace.
+ *
+ * Returns the sysfs_emit() length, -ENODEV for an unknown device ID, the error
+ * from the WMI read, or -EINVAL if the firmware value is not in the map.
+ */
+static ssize_t mini_led_mode_current_value_show(struct kobject *kobj,
+						struct kobj_attribute *attr, char *buf)
+{
+	u32 *mini_led_mode_map;
+	size_t mini_led_mode_map_size;
+	u32 i, mode;
+	int err;
+
+	switch (asus_armoury.mini_led_dev_id) {
+	case ASUS_WMI_DEVID_MINI_LED_MODE:
+		mini_led_mode_map = mini_led_mode1_map;
+		mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map);
+		break;
+
+	case ASUS_WMI_DEVID_MINI_LED_MODE2:
+		mini_led_mode_map = mini_led_mode2_map;
+		mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map);
+		break;
+
+	default:
+		pr_err("Unrecognized mini-LED device: %u\n", asus_armoury.mini_led_dev_id);
+		return -ENODEV;
+	}
+
+	err = armoury_get_devstate(attr, &mode, asus_armoury.mini_led_dev_id);
+	if (err)
+		return err;
+
+	/* Extract the mode bits from the value that was just read. */
+	mode = FIELD_GET(ASUS_MINI_LED_MODE_MASK, mode);
+
+	/* Report the map index whose firmware value matches. */
+	for (i = 0; i < mini_led_mode_map_size; i++) {
+		if (mode == mini_led_mode_map[i])
+			return sysfs_emit(buf, "%u\n", i);
+	}
+
+	pr_warn("Unrecognized mini-LED mode: %u\n", mode);
+	return -EINVAL;
+}
+
+/*
+ * Set the mini-LED mode.
+ *
+ * Userspace writes an index into the map for the detected mini-LED device ID
+ * (the same indices listed by possible_values); the firmware value stored at
+ * that index is what gets written through WMI.
+ *
+ * Returns count on success, the kstrtou32() error for malformed input,
+ * -EINVAL for an unknown device ID, -ENODEV for an out-of-range index, or the
+ * error from the WMI write.
+ */
+static ssize_t mini_led_mode_current_value_store(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 const char *buf, size_t count)
+{
+	u32 *mini_led_mode_map;
+	size_t mini_led_mode_map_size;
+	u32 mode;
+	int err;
+
+	err = kstrtou32(buf, 10, &mode);
+	if (err)
+		return err;
+
+	switch (asus_armoury.mini_led_dev_id) {
+	case ASUS_WMI_DEVID_MINI_LED_MODE:
+		mini_led_mode_map = mini_led_mode1_map;
+		mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map);
+		break;
+
+	case ASUS_WMI_DEVID_MINI_LED_MODE2:
+		mini_led_mode_map = mini_led_mode2_map;
+		mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map);
+		break;
+
+	default:
+		pr_err("Unrecognized mini-LED devid: %u\n", asus_armoury.mini_led_dev_id);
+		return -EINVAL;
+	}
+
+	/* Reject indices outside the map before using them as a subscript. */
+	if (mode >= mini_led_mode_map_size) {
+		pr_warn("mini-LED mode unrecognized device: %u\n", mode);
+		return -ENODEV;
+	}
+
+	/*
+	 * Write the firmware value the index maps to, not the raw index:
+	 * the two differ on devices using mini_led_mode2_map.
+	 */
+	err = armoury_set_devstate(attr, mini_led_mode_map[mode], NULL,
+				   asus_armoury.mini_led_dev_id);
+	if (err)
+		return err;
+
+	sysfs_notify(kobj, NULL, attr->attr.name);
+
+	return count;
+}
+
+/*
+ * List the selectable mini-LED mode indices for the detected device ID,
+ * or return -ENODEV when the device ID is not one of the known two.
+ */
+static ssize_t mini_led_mode_possible_values_show(struct kobject *kobj,
+						  struct kobj_attribute *attr, char *buf)
+{
+	size_t nr_modes;
+
+	switch (asus_armoury.mini_led_dev_id) {
+	case ASUS_WMI_DEVID_MINI_LED_MODE:
+		nr_modes = ARRAY_SIZE(mini_led_mode1_map);
+		break;
+	case ASUS_WMI_DEVID_MINI_LED_MODE2:
+		nr_modes = ARRAY_SIZE(mini_led_mode2_map);
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return armoury_attr_enum_list(buf, nr_modes);
+}
+ASUS_ATTR_GROUP_ENUM(mini_led_mode, "mini_led_mode", "Set the mini-LED backlight mode");
+
+/*
+ * Set the GPU display MUX mode.
+ *
+ * The input is parsed as a boolean: true writes 1 to the MUX device, false
+ * writes 0. A false value (which the warnings below call "dGPU mode") is
+ * refused with -ENODEV while the dGPU state reads non-zero and with -EBUSY
+ * while the eGPU state reads non-zero. On success a pending reboot is
+ * flagged and signalled.
+ *
+ * Returns count on success or a negative errno.
+ */
+static ssize_t gpu_mux_mode_current_value_store(struct kobject *kobj,
+						struct kobj_attribute *attr,
+						const char *buf, size_t count)
+{
+	/* u32 to match the armoury_get_devstate() out-parameter and %02X. */
+	u32 result;
+	bool optimus;
+	int err;
+
+	err = kstrtobool(buf, &optimus);
+	if (err)
+		return err;
+
+	if (armoury_has_devstate(ASUS_WMI_DEVID_DGPU)) {
+		err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_DGPU);
+		if (err)
+			return err;
+		if (result && !optimus) {
+			pr_warn("Cannot switch MUX to dGPU mode when dGPU is disabled: %02X\n",
+				result);
+			return -ENODEV;
+		}
+	}
+
+	if (armoury_has_devstate(ASUS_WMI_DEVID_EGPU)) {
+		err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU);
+		if (err)
+			return err;
+		if (result && !optimus) {
+			pr_warn("Cannot switch MUX to dGPU mode when eGPU is enabled\n");
+			return -EBUSY;
+		}
+	}
+
+	err = armoury_set_devstate(attr, optimus ? 1 : 0, NULL, asus_armoury.gpu_mux_dev_id);
+	if (err)
+		return err;
+
+	sysfs_notify(kobj, NULL, attr->attr.name);
+	asus_set_reboot_and_signal_event();
+
+	return count;
+}
+ASUS_WMI_SHOW_INT(gpu_mux_mode_current_value, asus_armoury.gpu_mux_dev_id);
+ASUS_ATTR_GROUP_BOOL(gpu_mux_mode, "gpu_mux_mode", "Set the GPU display MUX mode");
+
+/*
+ * Enable or disable the dGPU.
+ *
+ * The input is parsed as a boolean. When a MUX device ID is known and its
+ * state reads 0 (which the warning below calls "dGPU mode"), a request to
+ * disable is refused with -EBUSY. The WMI write itself is done under
+ * egpu_mutex.
+ *
+ * Returns count on success or a negative errno.
+ */
+static ssize_t dgpu_disable_current_value_store(struct kobject *kobj,
+						struct kobj_attribute *attr, const char *buf,
+						size_t count)
+{
+	/* u32 to match the armoury_get_devstate() out-parameter. */
+	u32 result;
+	bool disable;
+	int err;
+
+	err = kstrtobool(buf, &disable);
+	if (err)
+		return err;
+
+	if (asus_armoury.gpu_mux_dev_id) {
+		err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id);
+		if (err)
+			return err;
+		if (!result && disable) {
+			pr_warn("Cannot disable dGPU when the MUX is in dGPU mode\n");
+			return -EBUSY;
+		}
+	}
+
+	scoped_guard(mutex, &asus_armoury.egpu_mutex) {
+		err = armoury_set_devstate(attr, disable ? 1 : 0, NULL, ASUS_WMI_DEVID_DGPU);
+		if (err)
+			return err;
+	}
+
+	sysfs_notify(kobj, NULL, attr->attr.name);
+
+	return count;
+}
+ASUS_WMI_SHOW_INT(dgpu_disable_current_value, ASUS_WMI_DEVID_DGPU);
+ASUS_ATTR_GROUP_BOOL(dgpu_disable, "dgpu_disable", "Disable the dGPU");
+
+/* Values map for eGPU activation requests. */
+static u32 egpu_status_map[] = {
+	[0] = 0x00000000U,
+	[1] = 0x00000001U,
+	[2] = 0x00000101U,
+	[3] = 0x00000201U,
+};
+
+/*
+ * armoury_pci_rescan() - Performs a PCI rescan
+ *
+ * Bring up any GPU that has been hotplugged in the system.
+ */
+static void armoury_pci_rescan(void)
+{
+	struct pci_bus *b = NULL;
+
+	pci_lock_rescan_remove();
+	while ((b = pci_find_next_bus(b)) != NULL)
+		pci_rescan_bus(b);
+	pci_unlock_rescan_remove();
+}
+
+/*
+ * The ACPI call to enable the eGPU might also disable the internal dGPU,
+ * but this is not always the case and on certain models enabling the eGPU
+ * when the dGPU is either still active or has been disabled without rebooting
+ * will make both GPUs malfunction and the kernel will detect many
+ * PCI AER unrecoverable errors.
+ */
+static ssize_t egpu_enable_current_value_store(struct kobject *kobj, struct kobj_attribute *attr,
+							const char *buf, size_t count)
+{
+	int err;
+	u32 requested, enable, result;
+
+	err = kstrtou32(buf, 10, &requested);
+	if (err)
+		return err;
+
+	if (requested >= ARRAY_SIZE(egpu_status_map))
+		return -EINVAL;
+	enable = egpu_status_map[requested];
+
+	scoped_guard(mutex, &asus_armoury.egpu_mutex) {
+		/* Ensure the eGPU is connected before attempting to activate it. */
+		if (enable) {
+			err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU_CONNECTED);
+			if (err) {
+				pr_warn("Failed to get eGPU connection status: %d\n", err);
+				return err;
+			}
+			if (!result) {
+				pr_warn("Cannot activate eGPU while undetected\n");
+				return -ENOENT;
+			}
+		}
+
+		if (asus_armoury.gpu_mux_dev_id) {
+			err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id);
+			if (err)
+				return err;
+
+			if (!result && enable) {
+				pr_warn("Cannot enable eGPU when the MUX is in dGPU mode\n");
+				return -ENODEV;
+			}
+		}
+
+		err = armoury_set_devstate(attr, enable, &result, ASUS_WMI_DEVID_EGPU);
+		if (err) {
+			pr_err("Failed to set %s: %d\n", attr->attr.name, err);
+			return err;
+		}
+
+		/*
+		 * ACPI returns value 0x01 on success and 0x02 on a partial activation:
+		 * performing a pci rescan will bring up the device in pci-e 3.0 speed,
+		 * after a reboot the device will work at full speed.
+		 */
+		switch (result) {
+		case 0x01:
+			/*
+			 * When a GPU is in use it does not get disconnected even if
+			 * the ACPI call returns a success.
+			 */
+			if (!enable) {
+				err = armoury_get_devstate(attr, &result, ASUS_WMI_DEVID_EGPU);
+				if (err) {
+					pr_warn("Failed to ensure eGPU is deactivated: %d\n", err);
+					return err;
+				}
+
+				if (result != 0)
+					return -EBUSY;
+			}
+
+			pr_debug("Success changing the eGPU status\n");
+			break;
+		case 0x02:
+			pr_info("Success changing the eGPU status, a reboot is strongly advised\n");
+			asus_set_reboot_and_signal_event();
+			break;
+		default:
+			pr_err("Failed to change the eGPU status: wmi result is 0x%x\n", result);
+			return -EIO;
+		}
+	}
+
+	/*
+	 * Perform a PCI rescan: on every tested model this is necessary
+	 * to make the eGPU visible on the bus without rebooting.
+	 */
+	armoury_pci_rescan();
+
+	sysfs_notify(kobj, NULL, attr->attr.name);
+
+	return count;
+}
+
+static ssize_t egpu_enable_current_value_show(struct kobject *kobj, struct kobj_attribute *attr,
+						char *buf)
+{
+	int i, err;
+	u32 status;
+
+	scoped_guard(mutex, &asus_armoury.egpu_mutex) {
+		err = armoury_get_devstate(attr, &status, ASUS_WMI_DEVID_EGPU);
+		if (err)
+			return err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(egpu_status_map); i++) {
+		if (egpu_status_map[i] == status)
+			return sysfs_emit(buf, "%u\n", i);
+	}
+
+	return -EIO;
+}
+
+static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr,
+						char *buf)
+{
+	return armoury_attr_enum_list(buf, ARRAY_SIZE(egpu_status_map));
+}
+ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)");
+
+/* Simple attribute creation */
+ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n",
+			    "Show the current mode of charging");
+ASUS_ATTR_GROUP_BOOL_RW(boot_sound, "boot_sound", ASUS_WMI_DEVID_BOOT_SOUND,
+			"Set the boot POST sound");
+ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWERSAVE,
+			"Set MCU powersaving mode");
+ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD,
+			"Set the panel refresh overdrive");
+ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED,
+			"Show the eGPU connection status");
+
+/* If an attribute does not require any special case handling add it here */
+static const struct asus_attr_group armoury_attr_groups[] = {
+	{ &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED },
+	{ &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU },
+	{ &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU },
+
+	{ &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE },
+	{ &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND },
+	{ &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE },
+	{ &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD },
+};
+
+static int asus_fw_attr_add(void)
+{
+	int err, i;
+
+	asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0),
+						NULL, "%s", DRIVER_NAME);
+	if (IS_ERR(asus_armoury.fw_attr_dev)) {
+		err = PTR_ERR(asus_armoury.fw_attr_dev);
+		goto fail_class_get;
+	}
+
+	asus_armoury.fw_attr_kset = kset_create_and_add("attributes", NULL,
+						&asus_armoury.fw_attr_dev->kobj);
+	if (!asus_armoury.fw_attr_kset) {
+		err = -ENOMEM;
+		goto err_destroy_classdev;
+	}
+
+	err = sysfs_create_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr);
+	if (err) {
+		pr_err("Failed to create sysfs level attributes\n");
+		goto err_destroy_kset;
+	}
+
+	asus_armoury.mini_led_dev_id = 0;
+	if (armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE))
+		asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE;
+	else if (armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE2))
+		asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2;
+
+	if (asus_armoury.mini_led_dev_id) {
+		err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj,
+					 &mini_led_mode_attr_group);
+		if (err) {
+			pr_err("Failed to create sysfs-group for mini_led\n");
+			goto err_remove_file;
+		}
+	}
+
+	asus_armoury.gpu_mux_dev_id = 0;
+	if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX))
+		asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX;
+	else if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX_VIVO))
+		asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX_VIVO;
+
+	if (asus_armoury.gpu_mux_dev_id) {
+		err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj,
+					 &gpu_mux_mode_attr_group);
+		if (err) {
+			pr_err("Failed to create sysfs-group for gpu_mux\n");
+			goto err_remove_mini_led_group;
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(armoury_attr_groups); i++) {
+		if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid))
+			continue;
+
+		err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj,
+					 armoury_attr_groups[i].attr_group);
+		if (err) {
+			pr_err("Failed to create sysfs-group for %s\n",
+			       armoury_attr_groups[i].attr_group->name);
+			goto err_remove_groups;
+		}
+	}
+
+	return 0;
+
+err_remove_groups:
+	while (i--) {
+		if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid))
+			sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj,
+					   armoury_attr_groups[i].attr_group);
+	}
+	if (asus_armoury.gpu_mux_dev_id)
+		sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group);
+err_remove_mini_led_group:
+	if (asus_armoury.mini_led_dev_id)
+		sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group);
+err_remove_file:
+	sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr);
+err_destroy_kset:
+	kset_unregister(asus_armoury.fw_attr_kset);
+err_destroy_classdev:
+fail_class_get:
+	device_destroy(&firmware_attributes_class, MKDEV(0, 0));
+	return err;
+}
+
+/* Init / exit ****************************************************************/
+
+static int __init asus_fw_init(void)
+{
+	char *wmi_uid;
+
+	wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID);
+	if (!wmi_uid)
+		return -ENODEV;
+
+	/*
+	 * If the UID equals "ASUSWMI" then the firmware uses DCTS, which can't
+	 * be used by this driver; DSTS is required.
+	 */
+	if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI))
+		return -ENODEV;
+
+	return asus_fw_attr_add();
+}
+
+static void __exit asus_fw_exit(void)
+{
+	int i;
+
+	for (i = ARRAY_SIZE(armoury_attr_groups) - 1; i >= 0; i--) {
+		if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid))
+			sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj,
+					   armoury_attr_groups[i].attr_group);
+	}
+
+	if (asus_armoury.gpu_mux_dev_id)
+		sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group);
+
+	if (asus_armoury.mini_led_dev_id)
+		sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group);
+
+	sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr);
+	kset_unregister(asus_armoury.fw_attr_kset);
+	device_destroy(&firmware_attributes_class, MKDEV(0, 0));
+}
+
+module_init(asus_fw_init);
+module_exit(asus_fw_exit);
+
+MODULE_IMPORT_NS("ASUS_WMI");
+MODULE_AUTHOR("Luke Jones <luke@ljones.dev>");
+MODULE_DESCRIPTION("ASUS BIOS Configuration Driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("wmi:" ASUS_NB_WMI_EVENT_GUID);
diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h
new file mode 100644
index 000000000000..3a2a674a1b55
--- /dev/null
+++ b/drivers/platform/x86/asus-armoury.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Definitions for kernel modules using asus-armoury driver
+ *
+ * Copyright (c) 2024 Luke Jones <luke@ljones.dev>
+ */
+
+#ifndef _ASUS_ARMOURY_H_
+#define _ASUS_ARMOURY_H_
+
+#include <linux/platform_device.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+#define DRIVER_NAME "asus-armoury"
+
+/**
+ * armoury_attr_uint_store() - Send a uint to the WMI method if within min/max.
+ * @kobj: Pointer to the driver object.
+ * @attr: Pointer to the attribute calling this function.
+ * @buf: The buffer to read from, this is parsed to `uint` type.
+ * @count: Required by sysfs attribute macros, pass in from the calling attr.
+ * @min: Minimum accepted value. Below this returns -EINVAL.
+ * @max: Maximum accepted value. Above this returns -EINVAL.
+ * @store_value: Pointer to where the parsed value should be stored.
+ * @wmi_dev: The WMI function ID to use.
+ *
+ * This function is intended to be generic so it can be called from any "_store"
+ * attribute which works only with integers.
+ *
+ * Integers to be sent to the WMI method are range checked (inclusive) and
+ * an error is returned if out of range.
+ *
+ * If the value is valid and the WMI call succeeds then the sysfs attribute is
+ * notified, and if asus_bios_requires_reboot() is true then the reboot
+ * attribute is also notified.
+ *
+ * Returns: Either count, or an error.
+ */
+ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr,
+				const char *buf, size_t count, u32 min, u32 max,
+				u32 *store_value, u32 wmi_dev);
+
+/**
+ * armoury_attr_uint_show() - Receive a uint from a WMI method.
+ * @kobj: Pointer to the driver object.
+ * @attr: Pointer to the attribute calling this function.
+ * @buf: The buffer to write to, as an `uint` type.
+ * @wmi_dev: The WMI function ID to use.
+ *
+ * This function is intended to be generic so it can be called from any "_show"
+ * attribute which works only with integers.
+ *
+ * Returns: Either count, or an error.
+ */
+ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr,
+				char *buf, u32 wmi_dev);
+
+#define __ASUS_ATTR_RO(_func, _name)					\
+	{								\
+		.attr = { .name = __stringify(_name), .mode = 0444 },	\
+		.show = _func##_##_name##_show,				\
+	}
+
+#define __ASUS_ATTR_RO_AS(_name, _show)					\
+	{								\
+		.attr = { .name = __stringify(_name), .mode = 0444 },	\
+		.show = _show,						\
+	}
+
+#define __ASUS_ATTR_RW(_func, _name) \
+	__ATTR(_name, 0644, _func##_##_name##_show, _func##_##_name##_store)
+
+#define __WMI_STORE_INT(_attr, _min, _max, _wmi)				\
+	static ssize_t _attr##_store(struct kobject *kobj,			\
+				     struct kobj_attribute *attr,		\
+				     const char *buf, size_t count)		\
+	{									\
+		return armoury_attr_uint_store(kobj, attr, buf, count, _min,	\
+					_max, NULL, _wmi);			\
+	}
+
+#define ASUS_WMI_SHOW_INT(_attr, _wmi)						\
+	static ssize_t _attr##_show(struct kobject *kobj,			\
+				    struct kobj_attribute *attr, char *buf)	\
+	{									\
+		return armoury_attr_uint_show(kobj, attr, buf, _wmi);		\
+	}
+
+/* Create functions and attributes for use in other macros or on their own */
+
+/* Shows a formatted static variable */
+#define __ATTR_SHOW_FMT(_prop, _attrname, _fmt, _val)				\
+	static ssize_t _attrname##_##_prop##_show(				\
+		struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
+	{									\
+		return sysfs_emit(buf, _fmt, _val);				\
+	}									\
+	static struct kobj_attribute attr_##_attrname##_##_prop =		\
+		__ASUS_ATTR_RO(_attrname, _prop)
+
+#define __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)\
+	ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi);		\
+	static struct kobj_attribute attr_##_attrname##_current_value =		\
+		__ASUS_ATTR_RO(_attrname, current_value);			\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);		\
+	__ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible);		\
+	static struct kobj_attribute attr_##_attrname##_type =			\
+		__ASUS_ATTR_RO_AS(type, enum_type_show);			\
+	static struct attribute *_attrname##_attrs[] = {			\
+		&attr_##_attrname##_current_value.attr,				\
+		&attr_##_attrname##_display_name.attr,				\
+		&attr_##_attrname##_possible_values.attr,			\
+		&attr_##_attrname##_type.attr,					\
+		NULL								\
+	};									\
+	static const struct attribute_group _attrname##_attr_group = {		\
+		.name = _fsname, .attrs = _attrname##_attrs			\
+	}
+
+#define __ATTR_RW_INT_GROUP_ENUM(_attrname, _minv, _maxv, _wmi, _fsname,\
+				 _possible, _dispname)			\
+	__WMI_STORE_INT(_attrname##_current_value, _minv, _maxv, _wmi);	\
+	ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi);	\
+	static struct kobj_attribute attr_##_attrname##_current_value =	\
+		__ASUS_ATTR_RW(_attrname, current_value);		\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);	\
+	__ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible);	\
+	static struct kobj_attribute attr_##_attrname##_type =		\
+		__ASUS_ATTR_RO_AS(type, enum_type_show);		\
+	static struct attribute *_attrname##_attrs[] = {		\
+		&attr_##_attrname##_current_value.attr,			\
+		&attr_##_attrname##_display_name.attr,			\
+		&attr_##_attrname##_possible_values.attr,		\
+		&attr_##_attrname##_type.attr,				\
+		NULL							\
+	};								\
+	static const struct attribute_group _attrname##_attr_group = {	\
+		.name = _fsname, .attrs = _attrname##_attrs		\
+	}
+
+/* Boolean style enumeration, base macro. Requires adding show/store */
+#define __ATTR_GROUP_ENUM(_attrname, _fsname, _possible, _dispname)	\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);	\
+	__ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible);	\
+	static struct kobj_attribute attr_##_attrname##_type =		\
+		__ASUS_ATTR_RO_AS(type, enum_type_show);		\
+	static struct attribute *_attrname##_attrs[] = {		\
+		&attr_##_attrname##_current_value.attr,			\
+		&attr_##_attrname##_display_name.attr,			\
+		&attr_##_attrname##_possible_values.attr,		\
+		&attr_##_attrname##_type.attr,				\
+		NULL							\
+	};								\
+	static const struct attribute_group _attrname##_attr_group = {	\
+		.name = _fsname, .attrs = _attrname##_attrs		\
+	}
+
+#define ASUS_ATTR_GROUP_BOOL_RO(_attrname, _fsname, _wmi, _dispname)	\
+	__ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, "0;1", _dispname)
+
+
+#define ASUS_ATTR_GROUP_BOOL_RW(_attrname, _fsname, _wmi, _dispname)	\
+	__ATTR_RW_INT_GROUP_ENUM(_attrname, 0, 1, _wmi, _fsname, "0;1", _dispname)
+
+#define ASUS_ATTR_GROUP_ENUM_INT_RO(_attrname, _fsname, _wmi, _possible, _dispname)	\
+	__ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)
+
+/*
+ * Requires <name>_current_value_show(), <name>_current_value_store()
+ */
+#define ASUS_ATTR_GROUP_BOOL(_attrname, _fsname, _dispname)		\
+	static struct kobj_attribute attr_##_attrname##_current_value =	\
+		__ASUS_ATTR_RW(_attrname, current_value);		\
+	__ATTR_GROUP_ENUM(_attrname, _fsname, "0;1", _dispname)
+
+/*
+ * Requires <name>_current_value_show(), <name>_current_value_store()
+ * and <name>_possible_values_show()
+ */
+#define ASUS_ATTR_GROUP_ENUM(_attrname, _fsname, _dispname)			\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);		\
+	static struct kobj_attribute attr_##_attrname##_current_value =		\
+		__ASUS_ATTR_RW(_attrname, current_value);			\
+	static struct kobj_attribute attr_##_attrname##_possible_values =	\
+		__ASUS_ATTR_RO(_attrname, possible_values);			\
+	static struct kobj_attribute attr_##_attrname##_type =			\
+		__ASUS_ATTR_RO_AS(type, enum_type_show);			\
+	static struct attribute *_attrname##_attrs[] = {			\
+		&attr_##_attrname##_current_value.attr,				\
+		&attr_##_attrname##_display_name.attr,				\
+		&attr_##_attrname##_possible_values.attr,			\
+		&attr_##_attrname##_type.attr,					\
+		NULL								\
+	};									\
+	static const struct attribute_group _attrname##_attr_group = {		\
+		.name = _fsname, .attrs = _attrname##_attrs			\
+	}
+
+#endif /* _ASUS_ARMOURY_H_ */
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index c3e90517ce0f..ff98267e5981 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -15,6 +15,7 @@
 
 #include <linux/acpi.h>
 #include <linux/backlight.h>
+#include <linux/bits.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/dmi.h>
@@ -30,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
 #include <linux/platform_data/x86/asus-wmi.h>
+#include <linux/platform_data/x86/asus-wmi-leds-ids.h>
 #include <linux/platform_device.h>
 #include <linux/platform_profile.h>
 #include <linux/power_supply.h>
@@ -55,8 +57,6 @@ module_param(fnlock_default, bool, 0444);
 #define to_asus_wmi_driver(pdrv)					\
 	(container_of((pdrv), struct asus_wmi_driver, platform_driver))
 
-#define ASUS_WMI_MGMT_GUID	"97845ED0-4E6D-11DE-8A39-0800200C9A66"
-
 #define NOTIFY_BRNUP_MIN		0x11
 #define NOTIFY_BRNUP_MAX		0x1f
 #define NOTIFY_BRNDOWN_MIN		0x20
@@ -105,8 +105,6 @@ module_param(fnlock_default, bool, 0444);
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
 
-#define ASUS_ACPI_UID_ASUSWMI		"ASUSWMI"
-
 #define WMI_EVENT_MASK			0xFFFF
 
 #define FAN_CURVE_POINTS		8
@@ -561,8 +559,8 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval)
  *
  * Returns:
  * * %-ENODEV	- method ID is unsupported.
- * * %0			- successful and retval is filled.
- * * %other		- error from WMI call.
+ * * %0		- successful and retval is filled.
+ * * %other	- error from WMI call.
  */
 int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval)
 {
diff --git a/include/linux/platform_data/x86/asus-wmi-leds-ids.h b/include/linux/platform_data/x86/asus-wmi-leds-ids.h
new file mode 100644
index 000000000000..034a039c4e37
--- /dev/null
+++ b/include/linux/platform_data/x86/asus-wmi-leds-ids.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H
+#define __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H
+
+#include <linux/dmi.h>
+#include <linux/types.h>
+
+/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */
+#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS)
+static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = {
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA403U"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU605M"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "RC71L"),
+		},
+	},
+	{ },
+};
+#endif
+
+#endif	/* __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H */
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index dbd44d9fbb6f..8ea8925a0fc5 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -4,7 +4,9 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/dmi.h>
+
+#define ASUS_WMI_MGMT_GUID	"97845ED0-4E6D-11DE-8A39-0800200C9A66"
+#define ASUS_ACPI_UID_ASUSWMI	"ASUSWMI"
 
 /* WMI Methods */
 #define ASUS_WMI_METHODID_SPEC	        0x43455053 /* BIOS SPECification */
@@ -191,44 +193,4 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1,
 }
 #endif
 
-/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */
-static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = {
-	{
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_BOARD_NAME, "GA403U"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_BOARD_NAME, "GU605M"),
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_BOARD_NAME, "RC71L"),
-		},
-	},
-	{ },
-};
-
 #endif	/* __PLATFORM_DATA_X86_ASUS_WMI_H */
-- 
cgit v1.2.3


From 628cb03b15f2a0f10534979b3ea9c8befe87c381 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:13 +0100
Subject: platform/x86: asus-armoury: add panel_hd_mode attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add panel_hd_mode to toggle the panel mode between single and high
definition modes.

Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://patch.msgid.link/20251102215319.3126879-4-denis.benato@linux.dev
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-armoury.c        | 6 +++++-
 include/linux/platform_data/x86/asus-wmi.h | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c
index 81b4972df818..f0cb973a487e 100644
--- a/drivers/platform/x86/asus-armoury.c
+++ b/drivers/platform/x86/asus-armoury.c
@@ -96,7 +96,8 @@ static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot);
 
 static bool asus_bios_requires_reboot(struct kobj_attribute *attr)
 {
-	return !strcmp(attr->attr.name, "gpu_mux_mode");
+	return !strcmp(attr->attr.name, "gpu_mux_mode") ||
+	       !strcmp(attr->attr.name, "panel_hd_mode");
 }
 
 /**
@@ -607,6 +608,8 @@ ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWER
 			"Set MCU powersaving mode");
 ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD,
 			"Set the panel refresh overdrive");
+ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD,
+			"Set the panel HD mode to UHD<0> or FHD<1>");
 ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED,
 			"Show the eGPU connection status");
 
@@ -620,6 +623,7 @@ static const struct asus_attr_group armoury_attr_groups[] = {
 	{ &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND },
 	{ &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE },
 	{ &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD },
+	{ &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD },
 };
 
 static int asus_fw_attr_add(void)
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 8ea8925a0fc5..3cc235b20be4 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -75,6 +75,7 @@
 #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019
 
 /* Misc */
+#define ASUS_WMI_DEVID_PANEL_HD		0x0005001C
 #define ASUS_WMI_DEVID_PANEL_OD		0x00050019
 #define ASUS_WMI_DEVID_CAMERA		0x00060013
 #define ASUS_WMI_DEVID_LID_FLIP		0x00060062
-- 
cgit v1.2.3


From bfd3749d489ec0df27ed94ee3dfd9475fea27bf9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Nov 2025 09:18:04 -1000
Subject: sched_ext: Use shorter slice in bypass mode

There have been reported cases of bypass mode not making forward progress fast
enough. The 20ms default slice is unnecessarily long for bypass mode where the
primary goal is ensuring all tasks can make forward progress.

Introduce SCX_SLICE_BYPASS set to 5ms and make the scheduler automatically
switch to it when entering bypass mode. Also make the bypass slice value
tunable through the slice_bypass_us module parameter (adjustable between 100us
and 100ms) to make it easier to test whether slice durations are a factor in
problem cases.

v3: Use READ_ONCE/WRITE_ONCE for scx_slice_dfl access (Dan).

v2: Removed slice_dfl_us module parameter. Fixed typos (Andrea).

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 11 +++++++++++
 kernel/sched/ext.c        | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 42 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index eb776b094d36..60285c3d07cf 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -17,7 +17,18 @@
 enum scx_public_consts {
 	SCX_OPS_NAME_LEN	= 128,
 
+	/*
+	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler fails
+	 * to set the slice for a task that is selected for execution.
+	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
+	 * refill has been triggered.
+	 *
+	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
+	 * mode. As making forward progress for all tasks is the main goal of
+	 * the bypass mode, a shorter slice is used.
+	 */
 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
+	SCX_SLICE_BYPASS	=  5 * 1000000, /*  5ms */
 	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
 };
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 652a364e9e4c..1a9b28dd0961 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -143,6 +143,32 @@ static struct scx_dump_data scx_dump_data = {
 /* /sys/kernel/sched_ext interface */
 static struct kset *scx_kset;
 
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+	.set = set_slice_us,
+	.get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX	"sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+
+#undef MODULE_PARAM_PREFIX
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>
 
@@ -919,7 +945,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 
 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
 {
-	p->scx.slice = SCX_SLICE_DFL;
+	p->scx.slice = READ_ONCE(scx_slice_dfl);
 	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
@@ -2896,7 +2922,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
 	INIT_LIST_HEAD(&scx->runnable_node);
 	scx->runnable_at = jiffies;
 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
-	scx->slice = SCX_SLICE_DFL;
+	scx->slice = READ_ONCE(scx_slice_dfl);
 }
 
 void scx_pre_fork(struct task_struct *p)
@@ -3774,6 +3800,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
+		WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
 		bypass_timestamp = ktime_get_ns();
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -3782,6 +3809,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
+		WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
 				      ktime_get_ns() - bypass_timestamp);
@@ -4780,7 +4808,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			queue_flags |= DEQUEUE_CLASS;
 
 		scoped_guard (sched_change, p, queue_flags) {
-			p->scx.slice = SCX_SLICE_DFL;
+			p->scx.slice = READ_ONCE(scx_slice_dfl);
 			p->sched_class = new_class;
 		}
 	}
-- 
cgit v1.2.3


From 61debc251c1c9150c7bdfd5c028bc2d078e17d22 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Nov 2025 09:18:06 -1000
Subject: sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass
 mode

Bypass mode routes tasks through fallback dispatch queues. Originally a single
global DSQ, b7b3b2dbae73 ("sched_ext: Split the global DSQ per NUMA node")
changed this to per-node DSQs to resolve NUMA-related livelocks.

Dan Schatzberg found per-node DSQs can still livelock when many threads are
pinned to different small CPU subsets: each CPU must scan many incompatible
tasks to find runnable ones, causing severe contention with high CPU counts.

Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default
idle CPU selection and direct dispatch handle most cases well.

This introduces a failure mode when tasks concentrate on one CPU in
over-saturated systems. If the BPF scheduler severely skews placement before
triggering bypass, that CPU's queue may be too long to drain, causing RCU
stalls. A load balancer in a future patch will address this. The bypass DSQ is
separate from local DSQ to enable load balancing: local DSQs use rq locks,
preventing efficient scanning and transfer across CPUs, especially problematic
when systems are already contended.

v2: Clarified why bypass DSQ is separate from local DSQ (Andrea Righi).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h |  1 +
 kernel/sched/ext.c        | 16 +++++++++++++---
 kernel/sched/sched.h      |  1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 60285c3d07cf..3d3216ff9188 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
 	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
 	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
 	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43083602c15e..747391a3f6e3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 
 	if (scx_rq_bypassing(rq)) {
 		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
-		goto global;
+		goto bypass;
 	}
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ local:
 global:
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
+bypass:
+	dsq = &task_rq(p)->scx.bypass_dsq;
+	goto enqueue;
 
 enqueue:
 	/*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (consume_global_dsq(sch, rq))
 		goto has_tasks;
 
-	if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
-	    scx_rq_bypassing(rq) || !scx_rq_online(rq))
+	if (scx_rq_bypassing(rq)) {
+		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+			goto has_tasks;
+		else
+			goto no_tasks;
+	}
+
+	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
 		goto no_tasks;
 
 	dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
 		int  n = cpu_to_node(cpu);
 
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 27aae2a298f8..5991133a4849 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,6 +808,7 @@ struct scx_rq {
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
+	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
-- 
cgit v1.2.3


From 582f700e1bdc5978f41e3d8d65d3e16e34e9be8a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Nov 2025 09:18:12 -1000
Subject: sched_ext: Hook up hardlockup detector

A poorly behaving BPF scheduler can trigger hard lockup. For example, on a
large system with many tasks pinned to different subsets of CPUs, if the BPF
scheduler puts all tasks in a single DSQ and lets all CPUs at it, the DSQ lock
can be contended to the point where hardlockup triggers. Unfortunately,
hardlockup can be the first signal out of such situations, thus requiring
hardlockup handling.

Hook scx_hardlockup() into the hardlockup detector to try kicking out the
current scheduler in an attempt to recover the system to a good state. The
handling strategy can delay watchdog taking its own action by one polling
period; however, given that the only remediation for hardlockup is crash, this
is likely an acceptable trade-off.

v2: Add missing dummy scx_hardlockup() definition for
    !CONFIG_SCHED_CLASS_EXT (kernel test bot).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Emil Tsalapatis <etsal@meta.com>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h |  2 ++
 kernel/sched/ext.c        | 18 ++++++++++++++++++
 kernel/watchdog.c         |  9 +++++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 3d3216ff9188..d6c152475f5b 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -223,6 +223,7 @@ struct sched_ext_entity {
 void sched_ext_dead(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
+bool scx_hardlockup(void);
 bool scx_rcu_cpu_stall(void);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
@@ -230,6 +231,7 @@ bool scx_rcu_cpu_stall(void);
 static inline void sched_ext_dead(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
+static inline bool scx_hardlockup(void) { return false; }
 static inline bool scx_rcu_cpu_stall(void) { return false; }
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 85bb052459ec..b5c87a03f112 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3711,6 +3711,24 @@ void scx_softlockup(u32 dur_s)
 			smp_processor_id(), dur_s);
 }
 
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ */
+bool scx_hardlockup(void)
+{
+	if (!handle_lockup("hard lockup - CPU %d", smp_processor_id()))
+		return false;
+
+	printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+			smp_processor_id());
+	return true;
+}
+
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
  * @bypass: true for bypass, false for unbypass
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5b62d1002783..8dfac4a8f587 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 #ifdef CONFIG_SYSFS
 		++hardlockup_count;
 #endif
+		/*
+		 * A poorly behaving BPF scheduler can trigger hard lockup by
+		 * e.g. putting numerous affinitized tasks in a single queue and
+		 * directing all CPUs at it. The following call can return true
+		 * only once when sched_ext is enabled and will immediately
+		 * abort the BPF scheduler and print out a warning message.
+		 */
+		if (scx_hardlockup())
+			return;
 
 		/* Only print hardlockups once. */
 		if (per_cpu(watchdog_hardlockup_warned, cpu))
-- 
cgit v1.2.3


From d2974cc79f7139cc851b84ad4f77805e93c40fe1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Nov 2025 09:18:14 -1000
Subject: sched_ext: Factor out scx_dsq_list_node cursor initialization into
 INIT_DSQ_LIST_CURSOR

Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
macro in preparation for additional users.

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Acked-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 7 +++++++
 kernel/sched/ext.c        | 5 ++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index d6c152475f5b..70ee5c28a74d 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -149,6 +149,13 @@ struct scx_dsq_list_node {
 	u32			priv;		/* can be used by iter cursor */
 };
 
+#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)				\
+	(struct scx_dsq_list_node) {						\
+		.node = LIST_HEAD_INIT((__node).node),				\
+		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),			\
+		.priv = (__priv),						\
+	}
+
 /*
  * The following is embedded in task_struct and contains all fields necessary
  * for a task to be scheduled by SCX.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b5c87a03f112..56946aceeb28 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6253,9 +6253,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 	if (!kit->dsq)
 		return -ENOENT;
 
-	INIT_LIST_HEAD(&kit->cursor.node);
-	kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
-	kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+	kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
+					   READ_ONCE(kit->dsq->seq));
 
 	return 0;
 }
-- 
cgit v1.2.3


From 95d1df610cdc7497510cc710435a5c8c4e3db606 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Nov 2025 09:18:16 -1000
Subject: sched_ext: Implement load balancer for bypass mode

In bypass mode, tasks are queued on per-CPU bypass DSQs. While this works well
in most cases, there is a failure mode where a BPF scheduler can skew task
placement severely before triggering bypass in highly over-saturated systems.
If most tasks end up concentrated on a few CPUs, those CPUs can accumulate
queues that are too long to drain in a reasonable time, leading to RCU stalls
and hung tasks.

Implement a simple timer-based load balancer that redistributes tasks across
CPUs within each NUMA node. The balancer runs periodically (default 500ms,
tunable via bypass_lb_intv_us module parameter) and moves tasks from overloaded
CPUs to underloaded ones.

When moving tasks between bypass DSQs, the load balancer holds nested DSQ locks
to avoid dropping and reacquiring the donor DSQ lock on each iteration, as
donor DSQs can be very long and highly contended. Add the SCX_ENQ_NESTED flag
and use raw_spin_lock_nested() in dispatch_enqueue() to support this. The load
balancer timer function reads scx_bypass_depth locklessly to check whether
bypass mode is active. Use WRITE_ONCE() when updating scx_bypass_depth to pair
with the READ_ONCE() in the timer function.

This has been tested on a 192 CPU dual socket AMD EPYC machine with ~20k
runnable tasks running scx_cpu0. As scx_cpu0 queues all tasks to CPU0, almost
all tasks end up on CPU0 creating severe imbalance. Without the load balancer,
disabling the scheduler can lead to RCU stalls and hung tasks, taking a very
long time to complete. With the load balancer, disable completes in about a
second.

The load balancing operation can be monitored using the sched_ext_bypass_lb
tracepoint and disabled by setting bypass_lb_intv_us to 0.

v2: Lock both rq and DSQ in bypass_lb_cpu() and use dispatch_dequeue_locked()
    to prevent races with dispatch_dequeue() (Andrea Righi).

Cc: Andrea Righi <arighi@nvidia.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Emil Tsalapatis <etsal@meta.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/trace/events/sched_ext.h |  39 +++++++
 kernel/sched/ext.c               | 239 ++++++++++++++++++++++++++++++++++++++-
 kernel/sched/ext_internal.h      |   6 +
 3 files changed, 281 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
index 50e4b712735a..d1bf5acd59c5 100644
--- a/include/trace/events/sched_ext.h
+++ b/include/trace/events/sched_ext.h
@@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
 	)
 );
 
+TRACE_EVENT(sched_ext_bypass_lb,
+
+	TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
+		 __u32 before_min, __u32 before_max,
+		 __u32 after_min, __u32 after_max),
+
+	TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
+		before_min, before_max, after_min, after_max),
+
+	TP_STRUCT__entry(
+		__field(	__u32,		node		)
+		__field(	__u32,		nr_cpus		)
+		__field(	__u32,		nr_tasks	)
+		__field(	__u32,		nr_balanced	)
+		__field(	__u32,		before_min	)
+		__field(	__u32,		before_max	)
+		__field(	__u32,		after_min	)
+		__field(	__u32,		after_max	)
+	),
+
+	TP_fast_assign(
+		__entry->node		= node;
+		__entry->nr_cpus	= nr_cpus;
+		__entry->nr_tasks	= nr_tasks;
+		__entry->nr_balanced	= nr_balanced;
+		__entry->before_min	= before_min;
+		__entry->before_max	= before_max;
+		__entry->after_min	= after_min;
+		__entry->after_max	= after_max;
+	),
+
+	TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
+		  __entry->node, __entry->nr_cpus,
+		  __entry->nr_tasks, __entry->nr_balanced,
+		  __entry->before_min, __entry->after_min,
+		  __entry->before_max, __entry->after_max
+	)
+);
+
 #endif /* _TRACE_SCHED_EXT_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 10d8532f8d9b..c900667b25b8 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -34,6 +34,8 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
 static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
 static bool scx_aborting;
 static bool scx_init_task_enabled;
 static bool scx_switching_all;
@@ -149,6 +151,7 @@ static struct kset *scx_kset;
  */
 static u64 scx_slice_dfl = SCX_SLICE_DFL;
 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
 
 static int set_slice_us(const char *val, const struct kernel_param *kp)
 {
@@ -160,11 +163,23 @@ static const struct kernel_param_ops slice_us_param_ops = {
 	.get = param_get_uint,
 };
 
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+	.set = set_bypass_lb_intv_us,
+	.get = param_get_uint,
+};
+
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX	"sched_ext."
 
 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
 
 #undef MODULE_PARAM_PREFIX
 
@@ -962,7 +977,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 		     !RB_EMPTY_NODE(&p->scx.dsq_priq));
 
 	if (!is_local) {
-		raw_spin_lock(&dsq->lock);
+		raw_spin_lock_nested(&dsq->lock,
+			(enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
 		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
 			scx_error(sch, "attempting to dispatch to a destroyed dsq");
 			/* fall back to the global dsq */
@@ -3744,6 +3761,207 @@ bool scx_hardlockup(void)
 	return true;
 }
 
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+			 struct cpumask *donee_mask, struct cpumask *resched_mask,
+			 u32 nr_donor_target, u32 nr_donee_target)
+{
+	struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+	struct task_struct *p, *n;
+	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+	u32 nr_balanced = 0, min_delta_us;
+
+	/*
+	 * All we want to guarantee is reasonable forward progress. No reason to
+	 * fine tune. Assuming every task on @donor_dsq runs their full slice,
+	 * consider offloading iff the total queued duration is over the
+	 * threshold.
+	 */
+	min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+	if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+		return 0;
+
+	raw_spin_rq_lock_irq(rq);
+	raw_spin_lock(&donor_dsq->lock);
+	list_add(&cursor.node, &donor_dsq->list);
+resume:
+	n = container_of(&cursor, struct task_struct, scx.dsq_list);
+	n = nldsq_next_task(donor_dsq, n, false);
+
+	while ((p = n)) {
+		struct rq *donee_rq;
+		struct scx_dispatch_q *donee_dsq;
+		int donee;
+
+		n = nldsq_next_task(donor_dsq, n, false);
+
+		if (donor_dsq->nr <= nr_donor_target)
+			break;
+
+		if (cpumask_empty(donee_mask))
+			break;
+
+		donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+		if (donee >= nr_cpu_ids)
+			continue;
+
+		donee_rq = cpu_rq(donee);
+		donee_dsq = &donee_rq->scx.bypass_dsq;
+
+		/*
+		 * $p's rq is not locked but $p's DSQ lock protects its
+		 * scheduling properties making this test safe.
+		 */
+		if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+			continue;
+
+		/*
+		 * Moving $p from one non-local DSQ to another. The source rq
+		 * and DSQ are already locked. Do an abbreviated dequeue and
+		 * then perform enqueue without unlocking $donor_dsq.
+		 *
+		 * We don't want to drop and reacquire the lock on each
+		 * iteration as @donor_dsq can be very long and potentially
+		 * highly contended. Donee DSQs are less likely to be contended.
+		 * The nested locking is safe as only this LB moves tasks
+		 * between bypass DSQs.
+		 */
+		dispatch_dequeue_locked(p, donor_dsq);
+		dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+		/*
+		 * $donee might have been idle and need to be woken up. No need
+		 * to be clever. Kick every CPU that receives tasks.
+		 */
+		cpumask_set_cpu(donee, resched_mask);
+
+		if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+			cpumask_clear_cpu(donee, donee_mask);
+
+		nr_balanced++;
+		if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+			raw_spin_unlock(&donor_dsq->lock);
+			raw_spin_rq_unlock_irq(rq);
+			cpu_relax();
+			raw_spin_rq_lock_irq(rq);
+			raw_spin_lock(&donor_dsq->lock);
+			goto resume;
+		}
+	}
+
+	list_del_init(&cursor.node);
+	raw_spin_unlock(&donor_dsq->lock);
+	raw_spin_rq_unlock_irq(rq);
+
+	return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+	const struct cpumask *node_mask = cpumask_of_node(node);
+	struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+	struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+	u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+	u32 nr_target, nr_donor_target;
+	u32 before_min = U32_MAX, before_max = 0;
+	u32 after_min = U32_MAX, after_max = 0;
+	int cpu;
+
+	/* count the target tasks and CPUs */
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+		nr_tasks += nr;
+		nr_cpus++;
+
+		before_min = min(nr, before_min);
+		before_max = max(nr, before_max);
+	}
+
+	if (!nr_cpus)
+		return;
+
+	/*
+	 * We don't want CPUs to have more than $nr_donor_target tasks and
+	 * balancing to fill donee CPUs up to $nr_target. Once targets are
+	 * calculated, find the donee CPUs.
+	 */
+	nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+	nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
+
+	cpumask_clear(donee_mask);
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+			cpumask_set_cpu(cpu, donee_mask);
+	}
+
+	/* iterate !donee CPUs and see if they should be offloaded */
+	cpumask_clear(resched_mask);
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		struct rq *rq = cpu_rq(cpu);
+		struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+		if (cpumask_empty(donee_mask))
+			break;
+		if (cpumask_test_cpu(cpu, donee_mask))
+			continue;
+		if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+			continue;
+
+		nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+					     nr_donor_target, nr_target);
+	}
+
+	for_each_cpu(cpu, resched_mask) {
+		struct rq *rq = cpu_rq(cpu);
+
+		raw_spin_rq_lock_irq(rq);
+		resched_curr(rq);
+		raw_spin_rq_unlock_irq(rq);
+	}
+
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+		after_min = min(nr, after_min);
+		after_max = max(nr, after_max);
+
+	}
+
+	trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+				  before_min, before_max, after_min, after_max);
+}
+
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
+ * bypass DSQs can be overloaded. If there are enough tasks to saturate other
+ * lightly loaded CPUs, such imbalance can lead to very high execution latency
+ * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
+ * outcomes, a simple load balancing mechanism is implemented by the following
+ * timer which runs periodically while bypass mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
+{
+	struct scx_sched *sch;
+	int node;
+	u32 intv_us;
+
+	sch = rcu_dereference_all(scx_root);
+	if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+		return;
+
+	for_each_node_with_cpus(node)
+		bypass_lb_node(sch, node);
+
+	intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+	if (intv_us)
+		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
+}
+
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
  * @bypass: true for bypass, false for unbypass
@@ -3787,7 +4005,9 @@ static void scx_bypass(bool bypass)
 	sch = rcu_dereference_bh(scx_root);
 
 	if (bypass) {
-		scx_bypass_depth++;
+		u32 intv_us;
+
+		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
@@ -3795,8 +4015,15 @@ static void scx_bypass(bool bypass)
 		bypass_timestamp = ktime_get_ns();
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
+		intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+		if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+			scx_bypass_lb_timer.expires =
+				jiffies + usecs_to_jiffies(intv_us);
+			add_timer_global(&scx_bypass_lb_timer);
+		}
 	} else {
-		scx_bypass_depth--;
+		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
@@ -7052,6 +7279,12 @@ static int __init scx_init(void)
 		return ret;
 	}
 
+	if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+	    !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+		pr_err("sched_ext: Failed to allocate cpumasks\n");
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 __initcall(scx_init);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index dd6f25fb6159..386c677e4c9a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -23,6 +23,11 @@ enum scx_consts {
 	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
 	 */
 	SCX_TASK_ITER_BATCH		= 32,
+
+	SCX_BYPASS_LB_DFL_INTV_US	= 500 * USEC_PER_MSEC,
+	SCX_BYPASS_LB_DONOR_PCT		= 125,
+	SCX_BYPASS_LB_MIN_DELTA_DIV	= 4,
+	SCX_BYPASS_LB_BATCH		= 256,
 };
 
 enum scx_exit_kind {
@@ -963,6 +968,7 @@ enum scx_enq_flags {
 
 	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
 	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
+	SCX_ENQ_NESTED		= 1LLU << 58,
 };
 
 enum scx_deq_flags {
-- 
cgit v1.2.3


From 05d6f1cc2dc214c1491181be13f37d2a3a26f694 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 13 Oct 2025 11:12:02 +0200
Subject: compiler.h: remove ARCH_SEL()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Its last user was removed in commit 8ea815399c3f ("compiler: remove
__ADDRESSABLE_ASM{_STR,}() again").

Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5b45ea7dff3e..a9a2f8aae821 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -269,12 +269,6 @@ static inline void *offset_to_ptr(const int *off)
 
 #endif /* __ASSEMBLY__ */
 
-#ifdef CONFIG_64BIT
-#define ARCH_SEL(a,b) a
-#else
-#define ARCH_SEL(a,b) b
-#endif
-
 /*
  * Force the compiler to emit 'sym' as a symbol, so that we can reference
  * it from inline assembler. Necessary in case 'sym' could be inlined
-- 
cgit v1.2.3


From adc15829fb73e402903b7030729263b6ee4a7232 Mon Sep 17 00:00:00 2001
From: Sourabh Jain <sourabhjain@linux.ibm.com>
Date: Thu, 16 Oct 2025 19:58:31 +0530
Subject: crash: let architecture decide crash memory export to iomem_resource

With the generic crashkernel reservation, the kernel emits the following
warning on powerpc:

WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY
Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries
NIP:  c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000
REGS: c000000127cef8a0 TRAP: 0700   Not tainted (6.17.0-auto-12607-g5472d60c129f)
MSR:  8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 84000840  XER: 20040010
CFAR: c00000000017eed0 IRQMASK: 0
GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001
GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000
GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff
GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808
GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950
GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710
NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180
LR [c00000000201de34] add_system_ram_resources+0xf4/0x180
Call Trace:
add_system_ram_resources+0xf4/0x180 (unreliable)
do_one_initcall+0x60/0x36c
do_initcalls+0x120/0x220
kernel_init_freeable+0x23c/0x390
kernel_init+0x34/0x26c
ret_from_kernel_user_thread+0x14/0x1c

This warning occurs due to a conflict between crashkernel and System RAM
iomem resources.

The generic crashkernel reservation adds the crashkernel memory range to
/proc/iomem during early initialization. Later, all memblock ranges are
added to /proc/iomem as System RAM. If the crashkernel region overlaps
with any memblock range, it causes a conflict while adding those memblock
regions as iomem resources, triggering the above warning. The conflicting
memblock regions are then omitted from /proc/iomem.

For example, if the following crashkernel region is added to /proc/iomem:
20000000-11fffffff : Crash kernel

then the following memblock System RAM regions fail to be inserted:
00000000-7fffffff : System RAM
80000000-257fffffff : System RAM

Fix this by not adding the crashkernel memory to /proc/iomem on powerpc.
Introduce an architecture hook to let each architecture decide whether to
export the crashkernel region to /proc/iomem.

For more info, see commit c40dd2f766440 ("powerpc: Add System RAM
to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM
resource to prevent crashkernel conflict")

Note: Before switching to the generic crashkernel reservation, powerpc
never exported the crashkernel region to /proc/iomem.

Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com
Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation")
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++
 include/linux/crash_reserve.h            | 6 ++++++
 kernel/crash_reserve.c                   | 3 +++
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/arch/powerpc/include/asm/crash_reserve.h b/arch/powerpc/include/asm/crash_reserve.h
index 6467ce29b1fa..d1b570ddbf98 100644
--- a/arch/powerpc/include/asm/crash_reserve.h
+++ b/arch/powerpc/include/asm/crash_reserve.h
@@ -5,4 +5,12 @@
 /* crash kernel regions are Page size agliged */
 #define CRASH_ALIGN             PAGE_SIZE
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+	return false;
+}
+#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem
+#endif
+
 #endif /* _ASM_POWERPC_CRASH_RESERVE_H */
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index 7b44b41d0a20..f0dc03d94ca2 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 void __init reserve_crashkernel_cma(unsigned long long cma_size);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef arch_add_crash_res_to_iomem
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+	return true;
+}
+#endif
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
 #define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
 #endif
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 87bf4d41eabb..62e60e0223cf 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size)
 #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
 static __init int insert_crashkernel_resources(void)
 {
+	if (!arch_add_crash_res_to_iomem())
+		return 0;
+
 	if (crashk_res.start < crashk_res.end)
 		insert_resource(&iomem_resource, &crashk_res);
 
-- 
cgit v1.2.3


From 37ade54f386c829597f74b54bad335c12bd2a698 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Wed, 22 Oct 2025 10:28:04 +0200
Subject: taint/module: remove unnecessary taint_flag.module field

The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the
taint_flags table as per-module flags.  While this can be trivially
corrected, the issue can be avoided altogether by removing the
taint_flag.module field.

This is possible because, since commit 7fd8329ba502 ("taint/module: Clean
up global and module taint flags handling") in 2016, the handling of
module taint flags has been fully generic.  Specifically,
module_flags_taint() can print all flags, and the required output buffer
size is properly defined in terms of TAINT_FLAGS_COUNT.  The actual
per-module flags are always those added to module.taints by calls to
add_taint_module().

Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/panic.h |  1 -
 kernel/module/main.c  |  2 +-
 kernel/panic.c        | 46 +++++++++++++++++++++-------------------------
 3 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/panic.h b/include/linux/panic.h
index 6f972a66c13e..a00bc0937698 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -86,7 +86,6 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
 struct taint_flag {
 	char c_true;		/* character printed when tainted */
 	char c_false;		/* character printed when not tainted */
-	bool module;		/* also show as a per-module taint flag */
 	const char *desc;	/* verbose description of the set taint flag */
 };
 
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c66b26184936..6f219751df7e 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf)
 	int i;
 
 	for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
-		if (taint_flags[i].module && test_bit(i, &taints))
+		if (test_bit(i, &taints))
 			buf[l++] = taint_flags[i].c_true;
 	}
 
diff --git a/kernel/panic.c b/kernel/panic.c
index ec59cade1f83..ffceb6f13935 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -628,17 +628,13 @@ void panic(const char *fmt, ...)
 }
 EXPORT_SYMBOL(panic);
 
-#define TAINT_FLAG(taint, _c_true, _c_false, _module)			\
+#define TAINT_FLAG(taint, _c_true, _c_false)				\
 	[ TAINT_##taint ] = {						\
 		.c_true = _c_true, .c_false = _c_false,			\
-		.module = _module,					\
 		.desc = #taint,						\
 	}
 
 /*
- * TAINT_FORCED_RMMOD could be a per-module flag but the module
- * is being removed anyway.
- *
  * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT,
  * please also modify tools/debugging/kernel-chktaint and
  * Documentation/admin-guide/tainted-kernels.rst, including its
@@ -646,26 +642,26 @@ EXPORT_SYMBOL(panic);
  * /proc/sys/kernel/tainted.
  */
 const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
-	TAINT_FLAG(PROPRIETARY_MODULE,		'P', 'G', true),
-	TAINT_FLAG(FORCED_MODULE,		'F', ' ', true),
-	TAINT_FLAG(CPU_OUT_OF_SPEC,		'S', ' ', false),
-	TAINT_FLAG(FORCED_RMMOD,		'R', ' ', false),
-	TAINT_FLAG(MACHINE_CHECK,		'M', ' ', false),
-	TAINT_FLAG(BAD_PAGE,			'B', ' ', false),
-	TAINT_FLAG(USER,			'U', ' ', false),
-	TAINT_FLAG(DIE,				'D', ' ', false),
-	TAINT_FLAG(OVERRIDDEN_ACPI_TABLE,	'A', ' ', false),
-	TAINT_FLAG(WARN,			'W', ' ', false),
-	TAINT_FLAG(CRAP,			'C', ' ', true),
-	TAINT_FLAG(FIRMWARE_WORKAROUND,		'I', ' ', false),
-	TAINT_FLAG(OOT_MODULE,			'O', ' ', true),
-	TAINT_FLAG(UNSIGNED_MODULE,		'E', ' ', true),
-	TAINT_FLAG(SOFTLOCKUP,			'L', ' ', false),
-	TAINT_FLAG(LIVEPATCH,			'K', ' ', true),
-	TAINT_FLAG(AUX,				'X', ' ', true),
-	TAINT_FLAG(RANDSTRUCT,			'T', ' ', true),
-	TAINT_FLAG(TEST,			'N', ' ', true),
-	TAINT_FLAG(FWCTL,			'J', ' ', true),
+	TAINT_FLAG(PROPRIETARY_MODULE,		'P', 'G'),
+	TAINT_FLAG(FORCED_MODULE,		'F', ' '),
+	TAINT_FLAG(CPU_OUT_OF_SPEC,		'S', ' '),
+	TAINT_FLAG(FORCED_RMMOD,		'R', ' '),
+	TAINT_FLAG(MACHINE_CHECK,		'M', ' '),
+	TAINT_FLAG(BAD_PAGE,			'B', ' '),
+	TAINT_FLAG(USER,			'U', ' '),
+	TAINT_FLAG(DIE,				'D', ' '),
+	TAINT_FLAG(OVERRIDDEN_ACPI_TABLE,	'A', ' '),
+	TAINT_FLAG(WARN,			'W', ' '),
+	TAINT_FLAG(CRAP,			'C', ' '),
+	TAINT_FLAG(FIRMWARE_WORKAROUND,		'I', ' '),
+	TAINT_FLAG(OOT_MODULE,			'O', ' '),
+	TAINT_FLAG(UNSIGNED_MODULE,		'E', ' '),
+	TAINT_FLAG(SOFTLOCKUP,			'L', ' '),
+	TAINT_FLAG(LIVEPATCH,			'K', ' '),
+	TAINT_FLAG(AUX,				'X', ' '),
+	TAINT_FLAG(RANDSTRUCT,			'T', ' '),
+	TAINT_FLAG(TEST,			'N', ' '),
+	TAINT_FLAG(FWCTL,			'J', ' '),
 };
 
 #undef TAINT_FLAG
-- 
cgit v1.2.3


From d99dc586ca7c7729450af2ed39ca1483c0eb7b5c Mon Sep 17 00:00:00 2001
From: "Yury Norov (NVIDIA)" <yury.norov@gmail.com>
Date: Thu, 23 Oct 2025 13:16:06 -0400
Subject: uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with
CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is
enabled.  This pollutes exported symbols namespace, and spreads RUST
ifdefery in core files.

It's better to declare a corresponding helper under the rust/helpers,
similarly to how non-underscored copy_{from,to}_user() is handled.

[yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice]
  Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com
Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Trevor Gross <tmgross@umich.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/uaccess.h |  2 --
 lib/usercopy.c          |  4 ++--
 rust/helpers/uaccess.c  | 12 ++++++++++++
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 1beb5b395d81..01cbd7dd0ba3 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -152,8 +152,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
  * directly in the normal copy_to/from_user(), the other ones go
  * through an extern _copy_to/from_user(), which expands the same code
  * here.
- *
- * Rust code always uses the extern definition.
  */
 static inline __must_check unsigned long
 _inline_copy_from_user(void *to, const void __user *from, unsigned long n)
diff --git a/lib/usercopy.c b/lib/usercopy.c
index 7b17b83c8042..b00a3a957de6 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -12,7 +12,7 @@
 
 /* out-of-line parts */
 
-#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST)
+#if !defined(INLINE_COPY_FROM_USER)
 unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	return _inline_copy_from_user(to, from, n);
@@ -20,7 +20,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n
 EXPORT_SYMBOL(_copy_from_user);
 #endif
 
-#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST)
+#if !defined(INLINE_COPY_TO_USER)
 unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	return _inline_copy_to_user(to, from, n);
diff --git a/rust/helpers/uaccess.c b/rust/helpers/uaccess.c
index f49076f813cd..4629b2d15529 100644
--- a/rust/helpers/uaccess.c
+++ b/rust/helpers/uaccess.c
@@ -13,3 +13,15 @@ unsigned long rust_helper_copy_to_user(void __user *to, const void *from,
 {
 	return copy_to_user(to, from, n);
 }
+
+#ifdef INLINE_COPY_FROM_USER
+unsigned long rust_helper__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	return _inline_copy_from_user(to, from, n);
+}
+
+unsigned long rust_helper__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	return _inline_copy_to_user(to, from, n);
+}
+#endif
-- 
cgit v1.2.3


From 6c2e6e2c1af1809d1d9cdbd50ac80f54f5995bdb Mon Sep 17 00:00:00 2001
From: Ye Bin <yebin10@huawei.com>
Date: Sat, 25 Oct 2025 16:00:03 +0800
Subject: dynamic_debug: add support for print stack

In practical problem diagnosis, especially during the boot phase, it is
often desirable to know the call sequence.  However, currently, apart from
adding print statements and recompiling the kernel, there seems to be no
good alternative.  If dynamic_debug supported printing the call stack, it
would be very helpful for diagnosing issues.  This patch adds support for
the '+d' flag, which dumps the call stack.

Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com
Signed-off-by: Ye Bin <yebin10@huawei.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/dynamic-debug-howto.rst |  5 +++--
 include/linux/dynamic_debug.h                     | 17 ++++++++++++++---
 lib/dynamic_debug.c                               |  1 +
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst
index 7c036590cd07..095a63892257 100644
--- a/Documentation/admin-guide/dynamic-debug-howto.rst
+++ b/Documentation/admin-guide/dynamic-debug-howto.rst
@@ -223,12 +223,13 @@ The flags are::
   f    Include the function name
   s    Include the source file name
   l    Include line number
+  d    Include call trace
 
 For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only
 the ``p`` flag has meaning, other flags are ignored.
 
-Note the regexp ``^[-+=][fslmpt_]+$`` matches a flags specification.
-To clear all flags at once, use ``=_`` or ``-fslmpt``.
+Note the regexp ``^[-+=][fslmptd_]+$`` matches a flags specification.
+To clear all flags at once, use ``=_`` or ``-fslmptd``.
 
 
 Debug messages during Boot Process
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index ff44ec346162..05743900a116 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -38,11 +38,12 @@ struct _ddebug {
 #define _DPRINTK_FLAGS_INCL_LINENO	(1<<3)
 #define _DPRINTK_FLAGS_INCL_TID		(1<<4)
 #define _DPRINTK_FLAGS_INCL_SOURCENAME	(1<<5)
+#define _DPRINTK_FLAGS_INCL_STACK	(1<<6)
 
 #define _DPRINTK_FLAGS_INCL_ANY		\
 	(_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\
 	 _DPRINTK_FLAGS_INCL_LINENO  | _DPRINTK_FLAGS_INCL_TID |\
-	 _DPRINTK_FLAGS_INCL_SOURCENAME)
+	 _DPRINTK_FLAGS_INCL_SOURCENAME | _DPRINTK_FLAGS_INCL_STACK)
 
 #if defined DEBUG
 #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT
@@ -160,6 +161,12 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 			 const struct ib_device *ibdev,
 			 const char *fmt, ...);
 
+#define __dynamic_dump_stack(desc)				\
+{								\
+	if (desc.flags & _DPRINTK_FLAGS_INCL_STACK)		\
+		dump_stack();					\
+}
+
 #define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt)	\
 	static struct _ddebug  __aligned(8)			\
 	__section("__dyndbg") name = {				\
@@ -220,8 +227,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
  */
 #define __dynamic_func_call_cls(id, cls, fmt, func, ...) do {	\
 	DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt);	\
-	if (DYNAMIC_DEBUG_BRANCH(id))				\
+	if (DYNAMIC_DEBUG_BRANCH(id)) {				\
 		func(&id, ##__VA_ARGS__);			\
+		__dynamic_dump_stack(id);			\
+	}							\
 } while (0)
 #define __dynamic_func_call(id, fmt, func, ...)				\
 	__dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt,		\
@@ -229,8 +238,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 
 #define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) do {	\
 	DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt);		\
-	if (DYNAMIC_DEBUG_BRANCH(id))					\
+	if (DYNAMIC_DEBUG_BRANCH(id)) {					\
 		func(__VA_ARGS__);					\
+		__dynamic_dump_stack(id);				\
+	}								\
 } while (0)
 #define __dynamic_func_call_no_desc(id, fmt, func, ...)			\
 	__dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT,	\
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 5a007952f7f2..7d7892e57a01 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -95,6 +95,7 @@ static const struct { unsigned flag:8; char opt_char; } opt_array[] = {
 	{ _DPRINTK_FLAGS_INCL_SOURCENAME, 's' },
 	{ _DPRINTK_FLAGS_INCL_LINENO, 'l' },
 	{ _DPRINTK_FLAGS_INCL_TID, 't' },
+	{ _DPRINTK_FLAGS_INCL_STACK, 'd' },
 	{ _DPRINTK_FLAGS_NONE, '_' },
 };
 
-- 
cgit v1.2.3


From a0b8c6af29a4be3ca2ff9a95cf71e54db5d73e65 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 24 Oct 2025 21:51:20 +0100
Subject: lib/xxhash: remove more unused xxh functions

xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the
xxh32_state struct is also unused.

xxh64_copy_state() is also unused.

Remove them all.

(Also fixes a comment above the xxh64_state that referred to it as
xxh32_state).

Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org
Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xxhash.h | 46 +---------------------------------------------
 lib/xxhash.c           | 29 -----------------------------
 2 files changed, 1 insertion(+), 74 deletions(-)

(limited to 'include')

diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
index 27f57eca8cb1..587122e2c29c 100644
--- a/include/linux/xxhash.h
+++ b/include/linux/xxhash.h
@@ -141,21 +141,7 @@ static inline unsigned long xxhash(const void *input, size_t length,
  */
 
 /**
- * struct xxh32_state - private xxh32 state, do not use members directly
- */
-struct xxh32_state {
-	uint32_t total_len_32;
-	uint32_t large_len;
-	uint32_t v1;
-	uint32_t v2;
-	uint32_t v3;
-	uint32_t v4;
-	uint32_t mem32[4];
-	uint32_t memsize;
-};
-
-/**
- * struct xxh32_state - private xxh64 state, do not use members directly
+ * struct xxh64_state - private xxh64 state, do not use members directly
  */
 struct xxh64_state {
 	uint64_t total_len;
@@ -167,16 +153,6 @@ struct xxh64_state {
 	uint32_t memsize;
 };
 
-/**
- * xxh32_reset() - reset the xxh32 state to start a new hashing operation
- *
- * @state: The xxh32 state to reset.
- * @seed:  Initialize the hash state with this seed.
- *
- * Call this function on any xxh32_state to prepare for a new hashing operation.
- */
-void xxh32_reset(struct xxh32_state *state, uint32_t seed);
-
 /**
  * xxh64_reset() - reset the xxh64 state to start a new hashing operation
  *
@@ -210,24 +186,4 @@ int xxh64_update(struct xxh64_state *state, const void *input, size_t length);
  */
 uint64_t xxh64_digest(const struct xxh64_state *state);
 
-/*-**************************
- * Utils
- ***************************/
-
-/**
- * xxh32_copy_state() - copy the source state into the destination state
- *
- * @src: The source xxh32 state.
- * @dst: The destination xxh32 state.
- */
-void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src);
-
-/**
- * xxh64_copy_state() - copy the source state into the destination state
- *
- * @src: The source xxh64 state.
- * @dst: The destination xxh64 state.
- */
-void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src);
-
 #endif /* XXHASH_H */
diff --git a/lib/xxhash.c b/lib/xxhash.c
index cf629766f376..4125b3e3cf7f 100644
--- a/lib/xxhash.c
+++ b/lib/xxhash.c
@@ -73,21 +73,6 @@ static const uint64_t PRIME64_3 =  1609587929392839161ULL;
 static const uint64_t PRIME64_4 =  9650029242287828579ULL;
 static const uint64_t PRIME64_5 =  2870177450012600261ULL;
 
-/*-**************************
- *  Utils
- ***************************/
-void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src)
-{
-	memcpy(dst, src, sizeof(*dst));
-}
-EXPORT_SYMBOL(xxh32_copy_state);
-
-void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src)
-{
-	memcpy(dst, src, sizeof(*dst));
-}
-EXPORT_SYMBOL(xxh64_copy_state);
-
 /*-***************************
  * Simple Hash Functions
  ****************************/
@@ -239,20 +224,6 @@ EXPORT_SYMBOL(xxh64);
 /*-**************************************************
  * Advanced Hash Functions
  ***************************************************/
-void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed)
-{
-	/* use a local state for memcpy() to avoid strict-aliasing warnings */
-	struct xxh32_state state;
-
-	memset(&state, 0, sizeof(state));
-	state.v1 = seed + PRIME32_1 + PRIME32_2;
-	state.v2 = seed + PRIME32_2;
-	state.v3 = seed + 0;
-	state.v4 = seed - PRIME32_1;
-	memcpy(statePtr, &state, sizeof(state));
-}
-EXPORT_SYMBOL(xxh32_reset);
-
 void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
 {
 	/* use a local state for memcpy() to avoid strict-aliasing warnings */
-- 
cgit v1.2.3


From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 7 Nov 2025 13:41:17 -0400
Subject: vfio: Provide a get_region_info op

Instead of hooking the general ioctl op, have the core code directly
decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it.

This is intended to allow mechanical changes to the drivers to pull their
VFIO_DEVICE_GET_REGION_INFO into a function. Later patches will improve
the function signature to consolidate more code.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/vfio_pci_core.c | 9 ++++++---
 drivers/vfio/vfio_main.c         | 7 +++++++
 include/linux/vfio.h             | 2 ++
 include/linux/vfio_pci_core.h    | 2 ++
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 7dcf5439dedc..1dc350003f07 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -996,9 +996,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
 	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
 }
 
-static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
-					  struct vfio_region_info __user *arg)
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+				   struct vfio_region_info __user *arg)
 {
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
 	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
 	struct pci_dev *pdev = vdev->pdev;
 	struct vfio_region_info info;
@@ -1132,6 +1134,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
 
 	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info);
 
 static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
 				       struct vfio_irq_info __user *arg)
@@ -1458,7 +1461,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
 		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
 	case VFIO_DEVICE_GET_REGION_INFO:
-		return vfio_pci_ioctl_get_region_info(vdev, uarg);
+		return vfio_pci_ioctl_get_region_info(core_vdev, uarg);
 	case VFIO_DEVICE_IOEVENTFD:
 		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
 	case VFIO_DEVICE_PCI_HOT_RESET:
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 38c8e9350a60..a390163ce706 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1296,7 +1296,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
 		ret = vfio_ioctl_device_feature(device, uptr);
 		break;
 
+	case VFIO_DEVICE_GET_REGION_INFO:
+		if (!device->ops->get_region_info)
+			goto ioctl_fallback;
+		ret = device->ops->get_region_info(device, uptr);
+		break;
+
 	default:
+ioctl_fallback:
 		if (unlikely(!device->ops->ioctl))
 			ret = -EINVAL;
 		else
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index eb563f538dee..be5fcf8432e8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -132,6 +132,8 @@ struct vfio_device_ops {
 			 size_t count, loff_t *size);
 	long	(*ioctl)(struct vfio_device *vdev, unsigned int cmd,
 			 unsigned long arg);
+	int	(*get_region_info)(struct vfio_device *vdev,
+				   struct vfio_region_info __user *arg);
 	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
 	void	(*request)(struct vfio_device *vdev, unsigned int count);
 	int	(*match)(struct vfio_device *vdev, char *buf);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..160bc2e31ece 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		unsigned long arg);
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz);
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+				   struct vfio_region_info __user *arg);
 ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
 		size_t count, loff_t *ppos);
 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
-- 
cgit v1.2.3


From d604e1ec246d236deff57ac7e89e073dd911d60b Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 31 Oct 2025 13:39:09 -0700
Subject: scsi: core: Support allocating reserved commands

Quite a few drivers use management commands internally. These
commands typically use the same tag pool as regular SCSI commands. Tags
for these management commands are set aside before allocating the
block-mq tag bitmap for regular SCSI commands. The block layer already
supports this via the reserved tag mechanism. Add a new field
'nr_reserved_cmds' to the SCSI host template to instruct the block layer
to set aside a tag space for these management commands by using reserved
tags. Exclude reserved commands from .can_queue because .can_queue is
visible in sysfs.

[ bvanassche: modified patch title and patch description. Left out the
  following statements: "if (sht->nr_reserved_cmds)" and also
  "if (sdev->host->nr_reserved_cmds) flags |= BLK_MQ_REQ_RESERVED;". Moved
  nr_reserved_cmds declarations and statements close to the
  corresponding can_queue declarations and statements. See also
  https://lore.kernel.org/linux-scsi/20210503150333.130310-11-hare@suse.de/ ]

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-2-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c     |  1 +
 drivers/scsi/scsi_lib.c  |  3 ++-
 include/scsi/scsi_host.h | 21 ++++++++++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index eb224a338fa2..8b7f5fafa9e0 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -436,6 +436,7 @@ struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *sht, int priv
 	shost->hostt = sht;
 	shost->this_id = sht->this_id;
 	shost->can_queue = sht->can_queue;
+	shost->nr_reserved_cmds = sht->nr_reserved_cmds;
 	shost->sg_tablesize = sht->sg_tablesize;
 	shost->sg_prot_tablesize = sht->sg_prot_tablesize;
 	shost->cmd_per_lun = sht->cmd_per_lun;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d7e42293b864..d52bbbe5a357 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2083,7 +2083,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 		tag_set->ops = &scsi_mq_ops_no_commit;
 	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
 	tag_set->nr_maps = shost->nr_maps ? : 1;
-	tag_set->queue_depth = shost->can_queue;
+	tag_set->queue_depth = shost->can_queue + shost->nr_reserved_cmds;
+	tag_set->reserved_tags = shost->nr_reserved_cmds;
 	tag_set->cmd_size = cmd_size;
 	tag_set->numa_node = dev_to_node(shost->dma_dev);
 	if (shost->hostt->tag_alloc_policy_rr)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index f5a243261236..7b8f144ccf7d 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -375,10 +375,19 @@ struct scsi_host_template {
 	/*
 	 * This determines if we will use a non-interrupt driven
 	 * or an interrupt driven scheme.  It is set to the maximum number
-	 * of simultaneous commands a single hw queue in HBA will accept.
+	 * of simultaneous commands a single hw queue in HBA will accept
+	 * excluding internal commands.
 	 */
 	int can_queue;
 
+	/*
+	 * This determines how many commands the HBA will set aside
+	 * for internal commands. This number will be added to
+	 * @can_queue to calculate the maximum number of simultaneous
+	 * commands sent to the host.
+	 */
+	int nr_reserved_cmds;
+
 	/*
 	 * In many instances, especially where disconnect / reconnect are
 	 * supported, our host also has an ID on the SCSI bus.  If this is
@@ -611,7 +620,17 @@ struct Scsi_Host {
 	unsigned short max_cmd_len;
 
 	int this_id;
+
+	/*
+	 * Number of commands this host can handle at the same time.
+	 * This excludes reserved commands as specified by nr_reserved_cmds.
+	 */
 	int can_queue;
+	/*
+	 * Number of reserved commands to allocate, if any.
+	 */
+	unsigned int nr_reserved_cmds;
+
 	short cmd_per_lun;
 	short unsigned int sg_tablesize;
 	short unsigned int sg_prot_tablesize;
-- 
cgit v1.2.3


From d630fbf6fc8ce2fc95de7784de5499387b682dc1 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 31 Oct 2025 13:39:12 -0700
Subject: scsi: core: Support allocating a pseudo SCSI device

Allocate a pseudo SCSI device if 'nr_reserved_cmds' has been set. Pseudo
SCSI devices have the SCSI ID <max_id>:U64_MAX so they won't clash with
any devices the LLD might create. Pseudo SCSI devices are excluded from
scanning and will not show up in sysfs. Additionally, pseudo SCSI
devices are skipped by shost_for_each_device(). This prevents that the
SCSI error handler tries to submit a reset to a non-existent logical
unit.

Do not allocate a budget map for pseudo SCSI devices since the
cmd_per_lun limit does not apply to pseudo SCSI devices.

Do not perform queue depth ramp up / ramp down for pseudo SCSI devices.

Pseudo SCSI devices will be used to send internal commands to a storage
device.

[ bvanassche: edited patch description / renamed host_sdev into
  pseudo_sdev / unexported scsi_get_host_dev() / modified error path in
  scsi_get_pseudo_dev() / skip pseudo devices in __scsi_iterate_devices()
  and also when calling sdev_init(), sdev_configure() and sdev_destroy().
  See also
  https://lore.kernel.org/linux-scsi/20211125151048.103910-2-hare@suse.de/ ]

Reviewed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-5-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c       |  8 ++++++
 drivers/scsi/scsi.c        |  7 +++--
 drivers/scsi/scsi_priv.h   |  1 +
 drivers/scsi/scsi_scan.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/scsi/scsi_sysfs.c  |  5 +++-
 include/scsi/scsi_device.h | 16 +++++++++++
 include/scsi/scsi_host.h   |  6 +++++
 7 files changed, 106 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8b7f5fafa9e0..ad1476fb5035 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -307,6 +307,14 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	if (error)
 		goto out_del_dev;
 
+	if (shost->nr_reserved_cmds) {
+		shost->pseudo_sdev = scsi_get_pseudo_sdev(shost);
+		if (!shost->pseudo_sdev) {
+			error = -ENOMEM;
+			goto out_del_dev;
+		}
+	}
+
 	scsi_proc_host_add(shost);
 	scsi_autopm_put_host(shost);
 	return error;
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 589ae28b2c8b..76cdad063f7b 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -831,8 +831,11 @@ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost,
 	spin_lock_irqsave(shost->host_lock, flags);
 	while (list->next != &shost->__devices) {
 		next = list_entry(list->next, struct scsi_device, siblings);
-		/* skip devices that we can't get a reference to */
-		if (!scsi_device_get(next))
+		/*
+		 * Skip pseudo devices and also devices we can't get a
+		 * reference to.
+		 */
+		if (!scsi_device_is_pseudo_dev(next) && !scsi_device_get(next))
 			break;
 		next = NULL;
 		list = list->next;
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 5b2b19f5e8ec..d07ec15d6c00 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -135,6 +135,7 @@ extern int scsi_complete_async_scans(void);
 extern int scsi_scan_host_selected(struct Scsi_Host *, unsigned int,
 				   unsigned int, u64, enum scsi_scan_mode);
 extern void scsi_forget_host(struct Scsi_Host *);
+struct scsi_device *scsi_get_pseudo_sdev(struct Scsi_Host *);
 
 /* scsi_sysctl.c */
 #ifdef CONFIG_SYSCTL
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index de039efef290..7acbfcfc2172 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -349,6 +349,9 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 
 	scsi_sysfs_device_initialize(sdev);
 
+	if (scsi_device_is_pseudo_dev(sdev))
+		return sdev;
+
 	depth = sdev->host->cmd_per_lun ?: 1;
 
 	/*
@@ -1070,6 +1073,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 
 	sdev->sdev_bflags = *bflags;
 
+	if (scsi_device_is_pseudo_dev(sdev))
+		return SCSI_SCAN_LUN_PRESENT;
+
 	/*
 	 * No need to freeze the queue as it isn't reachable to anyone else yet.
 	 */
@@ -1213,6 +1219,12 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget,
 	if (!sdev)
 		goto out;
 
+	if (scsi_device_is_pseudo_dev(sdev)) {
+		if (bflagsp)
+			*bflagsp = BLIST_NOLUN;
+		return SCSI_SCAN_LUN_PRESENT;
+	}
+
 	result = kmalloc(result_len, GFP_KERNEL);
 	if (!result)
 		goto out_free_sdev;
@@ -2084,12 +2096,65 @@ void scsi_forget_host(struct Scsi_Host *shost)
  restart:
 	spin_lock_irqsave(shost->host_lock, flags);
 	list_for_each_entry(sdev, &shost->__devices, siblings) {
-		if (sdev->sdev_state == SDEV_DEL)
+		if (scsi_device_is_pseudo_dev(sdev) ||
+		    sdev->sdev_state == SDEV_DEL)
 			continue;
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		__scsi_remove_device(sdev);
 		goto restart;
 	}
 	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	/*
+	 * Remove the pseudo device last since it may be needed during removal
+	 * of other SCSI devices.
+	 */
+	if (shost->pseudo_sdev)
+		__scsi_remove_device(shost->pseudo_sdev);
 }
 
+/**
+ * scsi_get_pseudo_sdev() - Attach a pseudo SCSI device to a SCSI host
+ * @shost: Host that needs a pseudo SCSI device
+ *
+ * Lock status: None assumed.
+ *
+ * Returns:     The scsi_device or NULL
+ *
+ * Notes:
+ *	Attach a single scsi_device to the Scsi_Host. The primary aim for this
+ *	device is to serve as a container from which SCSI commands can be
+ *	allocated. Each SCSI command will carry a command tag allocated by the
+ *	block layer. These SCSI commands can be used by the LLDD to send
+ *	internal or passthrough commands without having to manage tag allocation
+ *	inside the LLDD.
+ */
+struct scsi_device *scsi_get_pseudo_sdev(struct Scsi_Host *shost)
+{
+	struct scsi_device *sdev = NULL;
+	struct scsi_target *starget;
+
+	guard(mutex)(&shost->scan_mutex);
+
+	if (!scsi_host_scan_allowed(shost))
+		goto out;
+
+	starget = scsi_alloc_target(&shost->shost_gendev, 0, shost->max_id);
+	if (!starget)
+		goto out;
+
+	sdev = scsi_alloc_sdev(starget, U64_MAX, NULL);
+	if (!sdev) {
+		scsi_target_reap(starget);
+		goto put_target;
+	}
+
+	sdev->borken = 0;
+
+put_target:
+	/* See also the get_device(dev) call in scsi_alloc_target(). */
+	put_device(&starget->dev);
+
+out:
+	return sdev;
+}
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 15ba493d2138..c37992147847 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1406,6 +1406,9 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev)
 	int error;
 	struct scsi_target *starget = sdev->sdev_target;
 
+	if (WARN_ON_ONCE(scsi_device_is_pseudo_dev(sdev)))
+		return -EINVAL;
+
 	error = scsi_target_add(starget);
 	if (error)
 		return error;
@@ -1513,7 +1516,7 @@ void __scsi_remove_device(struct scsi_device *sdev)
 	kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags);
 	cancel_work_sync(&sdev->requeue_work);
 
-	if (sdev->host->hostt->sdev_destroy)
+	if (!scsi_device_is_pseudo_dev(sdev) && sdev->host->hostt->sdev_destroy)
 		sdev->host->hostt->sdev_destroy(sdev);
 	transport_destroy_device(dev);
 
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 4c106342c4ae..918631088711 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -589,6 +589,22 @@ static inline unsigned int sdev_id(struct scsi_device *sdev)
 #define scmd_id(scmd) sdev_id((scmd)->device)
 #define scmd_channel(scmd) sdev_channel((scmd)->device)
 
+/**
+ * scsi_device_is_pseudo_dev() - Whether a device is a pseudo SCSI device.
+ * @sdev: SCSI device to examine
+ *
+ * A pseudo SCSI device can be used to allocate SCSI commands but does not show
+ * up in sysfs. Additionally, the logical unit information in *@sdev is made up.
+ *
+ * This function tests the LUN instead of comparing @sdev with
+ * @sdev->host->pseudo_sdev because this function may be called before
+ * @sdev->host->pseudo_sdev has been initialized.
+ */
+static inline bool scsi_device_is_pseudo_dev(struct scsi_device *sdev)
+{
+	return sdev->lun == U64_MAX;
+}
+
 /*
  * checks for positions of the SCSI state machine
  */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 7b8f144ccf7d..4f945a20d198 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -721,6 +721,12 @@ struct Scsi_Host {
 	/* ldm bits */
 	struct device		shost_gendev, shost_dev;
 
+	/*
+	 * A SCSI device structure used for sending internal commands to the
+	 * HBA. There is no corresponding logical unit inside the SCSI device.
+	 */
+	struct scsi_device *pseudo_sdev;
+
 	/*
 	 * Points to the transport data (if any) which is allocated
 	 * separately
-- 
cgit v1.2.3


From 11ea1de3fc4ba94127034cb01df63a666c4c9836 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Fri, 31 Oct 2025 13:39:13 -0700
Subject: scsi: core: Introduce .queue_reserved_command()

Reserved commands will be used by SCSI LLDs for submitting internal
commands. Since the SCSI host, target and device limits do not apply to
the reserved command use cases, bypass the SCSI host limit checks for
reserved commands. Introduce the .queue_reserved_command() callback for
reserved commands. Additionally, do not activate the SCSI error handler
if a reserved command fails such that reserved commands can be submitted
from inside the SCSI error handler.

[ bvanassche: modified patch title and patch description. Renamed
  .reserved_queuecommand() into .queue_reserved_command(). Changed
  the second argument of __blk_mq_end_request() from 0 into an error
  code in the completion path if cmd->result != 0. Rewrote the
  scsi_queue_rq() changes. See also
  https://lore.kernel.org/linux-scsi/1666693096-180008-5-git-send-email-john.garry@huawei.com/ ]

Cc: Hannes Reinecke <hare@suse.de>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20251031204029.2883185-6-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c     |  6 ++++++
 drivers/scsi/scsi_lib.c  | 54 ++++++++++++++++++++++++++++++++++--------------
 include/scsi/scsi_host.h |  6 ++++++
 3 files changed, 50 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ad1476fb5035..e047747d4ecf 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -231,6 +231,12 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 		goto fail;
 	}
 
+	if (shost->nr_reserved_cmds && !sht->queue_reserved_command) {
+		shost_printk(KERN_ERR, shost,
+			     "nr_reserved_cmds set but no method to queue\n");
+		goto fail;
+	}
+
 	/* Use min_t(int, ...) in case shost->can_queue exceeds SHRT_MAX */
 	shost->cmd_per_lun = min_t(int, shost->cmd_per_lun,
 				   shost->can_queue);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 53ff348b3a4c..d4e874bbf2ea 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1534,6 +1534,14 @@ static void scsi_complete(struct request *rq)
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
 	enum scsi_disposition disposition;
 
+	if (blk_mq_is_reserved_rq(rq)) {
+		/* Only pass-through requests are supported in this code path. */
+		WARN_ON_ONCE(!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd)));
+		scsi_mq_uninit_cmd(cmd);
+		__blk_mq_end_request(rq, scsi_result_to_blk_status(cmd->result));
+		return;
+	}
+
 	INIT_LIST_HEAD(&cmd->eh_entry);
 
 	atomic_inc(&cmd->device->iodone_cnt);
@@ -1823,25 +1831,31 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	WARN_ON_ONCE(cmd->budget_token < 0);
 
 	/*
-	 * If the device is not in running state we will reject some or all
-	 * commands.
+	 * Bypass the SCSI device, SCSI target and SCSI host checks for
+	 * reserved commands.
 	 */
-	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
-		ret = scsi_device_state_check(sdev, req);
-		if (ret != BLK_STS_OK)
-			goto out_put_budget;
-	}
+	if (!blk_mq_is_reserved_rq(req)) {
+		/*
+		 * If the device is not in running state we will reject some or
+		 * all commands.
+		 */
+		if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
+			ret = scsi_device_state_check(sdev, req);
+			if (ret != BLK_STS_OK)
+				goto out_put_budget;
+		}
 
-	ret = BLK_STS_RESOURCE;
-	if (!scsi_target_queue_ready(shost, sdev))
-		goto out_put_budget;
-	if (unlikely(scsi_host_in_recovery(shost))) {
-		if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
-			ret = BLK_STS_OFFLINE;
-		goto out_dec_target_busy;
+		ret = BLK_STS_RESOURCE;
+		if (!scsi_target_queue_ready(shost, sdev))
+			goto out_put_budget;
+		if (unlikely(scsi_host_in_recovery(shost))) {
+			if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
+				ret = BLK_STS_OFFLINE;
+			goto out_dec_target_busy;
+		}
+		if (!scsi_host_queue_ready(q, shost, sdev, cmd))
+			goto out_dec_target_busy;
 	}
-	if (!scsi_host_queue_ready(q, shost, sdev, cmd))
-		goto out_dec_target_busy;
 
 	/*
 	 * Only clear the driver-private command data if the LLD does not supply
@@ -1870,6 +1884,14 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;
 
 	blk_mq_start_request(req);
+	if (blk_mq_is_reserved_rq(req)) {
+		reason = shost->hostt->queue_reserved_command(shost, cmd);
+		if (reason) {
+			ret = BLK_STS_RESOURCE;
+			goto out_put_budget;
+		}
+		return BLK_STS_OK;
+	}
 	reason = scsi_dispatch_cmd(cmd);
 	if (reason) {
 		scsi_set_blocked(cmd, reason);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 4f945a20d198..e87cf7eadd26 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -86,6 +86,12 @@ struct scsi_host_template {
 	 */
 	int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
 
+	/*
+	 * Queue a reserved command (BLK_MQ_REQ_RESERVED). The .queuecommand()
+	 * documentation also applies to the .queue_reserved_command() callback.
+	 */
+	int (*queue_reserved_command)(struct Scsi_Host *, struct scsi_cmnd *);
+
 	/*
 	 * The commit_rqs function is used to trigger a hardware
 	 * doorbell after some requests have been queued with
-- 
cgit v1.2.3


From a2ab4e33286de37f3fe8f28f86f5f71d6b0ae3b0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 31 Oct 2025 13:39:14 -0700
Subject: scsi: core: Add scsi_{get,put}_internal_cmd() helpers

Add helper functions to allow LLDDs to allocate and free internal commands.

[ bvanassche: changed the 'nowait' argument into a 'flags' argument. See also
  https://lore.kernel.org/linux-scsi/20211125151048.103910-3-hare@suse.de/ ]

Reviewed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-7-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_lib.c    | 38 ++++++++++++++++++++++++++++++++++++++
 include/scsi/scsi_device.h |  4 ++++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d4e874bbf2ea..51ad2ad07e43 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2134,6 +2134,44 @@ void scsi_mq_free_tags(struct kref *kref)
 	complete(&shost->tagset_freed);
 }
 
+/**
+ * scsi_get_internal_cmd() - Allocate an internal SCSI command.
+ * @sdev: SCSI device from which to allocate the command
+ * @data_direction: Data direction for the allocated command
+ * @flags: request allocation flags, e.g. BLK_MQ_REQ_RESERVED or
+ *	BLK_MQ_REQ_NOWAIT.
+ *
+ * Allocates a SCSI command for internal LLDD use.
+ */
+struct scsi_cmnd *scsi_get_internal_cmd(struct scsi_device *sdev,
+					enum dma_data_direction data_direction,
+					blk_mq_req_flags_t flags)
+{
+	enum req_op op = data_direction == DMA_TO_DEVICE ? REQ_OP_DRV_OUT :
+							   REQ_OP_DRV_IN;
+	struct scsi_cmnd *scmd;
+	struct request *rq;
+
+	rq = scsi_alloc_request(sdev->request_queue, op, flags);
+	if (IS_ERR(rq))
+		return NULL;
+	scmd = blk_mq_rq_to_pdu(rq);
+	scmd->device = sdev;
+
+	return scmd;
+}
+EXPORT_SYMBOL_GPL(scsi_get_internal_cmd);
+
+/**
+ * scsi_put_internal_cmd() - Free an internal SCSI command.
+ * @scmd: SCSI command to be freed
+ */
+void scsi_put_internal_cmd(struct scsi_cmnd *scmd)
+{
+	blk_mq_free_request(blk_mq_rq_from_pdu(scmd));
+}
+EXPORT_SYMBOL_GPL(scsi_put_internal_cmd);
+
 /**
  * scsi_device_from_queue - return sdev associated with a request_queue
  * @q: The request queue to return the sdev from
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 918631088711..1e2e599517e9 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -558,6 +558,10 @@ int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd,
 		     const struct scsi_exec_args *args);
 void scsi_failures_reset_retries(struct scsi_failures *failures);
 
+struct scsi_cmnd *scsi_get_internal_cmd(struct scsi_device *sdev,
+					enum dma_data_direction data_direction,
+					blk_mq_req_flags_t flags);
+void scsi_put_internal_cmd(struct scsi_cmnd *scmd);
 extern void sdev_disable_disk_events(struct scsi_device *sdev);
 extern void sdev_enable_disk_events(struct scsi_device *sdev);
 extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t);
-- 
cgit v1.2.3


From 22089c218037ca7cd50d4fa20e8b5bd746a9b397 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 31 Oct 2025 13:39:31 -0700
Subject: scsi: ufs: core: Optimize the hot path

Set .cmd_size in the SCSI host template such that the SCSI core makes
struct scsi_cmnd and struct ufshcd_lrb adjacent. Convert the cmd->lrbp
and lrbp->cmd memory loads into pointer offset calculations. Remove the
data structure members that became superfluous, namely ufshcd_lrb.cmd
and ufs_hba.lrb. Since ufshcd_lrb.cmd is removed, this pointer cannot be
used anymore to test whether or not a command is a SCSI command.
Introduce a new function for this purpose, namely ufshcd_is_scsi_cmd().

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-24-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufs-mcq.c       |   9 +-
 drivers/ufs/core/ufshcd-crypto.h |  18 +--
 drivers/ufs/core/ufshcd-priv.h   |  41 ++++++-
 drivers/ufs/core/ufshcd.c        | 235 +++++++++++++++++++++------------------
 include/ufs/ufshcd.h             |   5 -
 5 files changed, 179 insertions(+), 129 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index 1de3360a7af1..776ff0896a2a 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -534,8 +534,8 @@ static int ufshcd_mcq_sq_start(struct ufs_hba *hba, struct ufs_hw_queue *hwq)
  */
 int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag)
 {
-	struct ufshcd_lrb *lrbp = &hba->lrb[task_tag];
-	struct scsi_cmnd *cmd = lrbp->cmd;
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, task_tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct ufs_hw_queue *hwq;
 	void __iomem *reg, *opr_sqd_base;
 	u32 nexus, id, val;
@@ -618,7 +618,8 @@ static void ufshcd_mcq_nullify_sqe(struct utp_transfer_req_desc *utrd)
 static bool ufshcd_mcq_sqe_search(struct ufs_hba *hba,
 				  struct ufs_hw_queue *hwq, int task_tag)
 {
-	struct ufshcd_lrb *lrbp = &hba->lrb[task_tag];
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, task_tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct utp_transfer_req_desc *utrd;
 	__le64  cmd_desc_base_addr;
 	bool ret = false;
@@ -669,7 +670,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
 	struct Scsi_Host *host = cmd->device->host;
 	struct ufs_hba *hba = shost_priv(host);
 	int tag = scsi_cmd_to_rq(cmd)->tag;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct ufs_hw_queue *hwq;
 	int err;
 
diff --git a/drivers/ufs/core/ufshcd-crypto.h b/drivers/ufs/core/ufshcd-crypto.h
index 89bb97c14c15..c148a5194378 100644
--- a/drivers/ufs/core/ufshcd-crypto.h
+++ b/drivers/ufs/core/ufshcd-crypto.h
@@ -38,10 +38,10 @@ ufshcd_prepare_req_desc_hdr_crypto(struct ufshcd_lrb *lrbp,
 }
 
 static inline int ufshcd_crypto_fill_prdt(struct ufs_hba *hba,
-					  struct ufshcd_lrb *lrbp)
+					  struct scsi_cmnd *cmd)
 {
-	struct scsi_cmnd *cmd = lrbp->cmd;
 	const struct bio_crypt_ctx *crypt_ctx = scsi_cmd_to_rq(cmd)->crypt_ctx;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 
 	if (crypt_ctx && hba->vops && hba->vops->fill_crypto_prdt)
 		return hba->vops->fill_crypto_prdt(hba, crypt_ctx,
@@ -51,17 +51,19 @@ static inline int ufshcd_crypto_fill_prdt(struct ufs_hba *hba,
 }
 
 static inline void ufshcd_crypto_clear_prdt(struct ufs_hba *hba,
-					    struct ufshcd_lrb *lrbp)
+					    struct scsi_cmnd *cmd)
 {
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
 	if (!(hba->quirks & UFSHCD_QUIRK_KEYS_IN_PRDT))
 		return;
 
-	if (!(scsi_cmd_to_rq(lrbp->cmd)->crypt_ctx))
+	if (!(scsi_cmd_to_rq(cmd)->crypt_ctx))
 		return;
 
 	/* Zeroize the PRDT because it can contain cryptographic keys. */
 	memzero_explicit(lrbp->ucd_prdt_ptr,
-			 ufshcd_sg_entry_size(hba) * scsi_sg_count(lrbp->cmd));
+			 ufshcd_sg_entry_size(hba) * scsi_sg_count(cmd));
 }
 
 bool ufshcd_crypto_enable(struct ufs_hba *hba);
@@ -82,13 +84,15 @@ ufshcd_prepare_req_desc_hdr_crypto(struct ufshcd_lrb *lrbp,
 				   struct request_desc_header *h) { }
 
 static inline int ufshcd_crypto_fill_prdt(struct ufs_hba *hba,
-					  struct ufshcd_lrb *lrbp)
+					  struct scsi_cmnd *cmd)
 {
 	return 0;
 }
 
 static inline void ufshcd_crypto_clear_prdt(struct ufs_hba *hba,
-					    struct ufshcd_lrb *lrbp) { }
+					    struct scsi_cmnd *cmd)
+{
+}
 
 static inline bool ufshcd_crypto_enable(struct ufs_hba *hba)
 {
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 749c0ab2a4ca..72d2766b19e3 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -77,8 +77,7 @@ bool ufshcd_cmd_inflight(struct scsi_cmnd *cmd);
 int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag);
 int ufshcd_mcq_abort(struct scsi_cmnd *cmd);
 int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag);
-void ufshcd_release_scsi_cmd(struct ufs_hba *hba,
-			     struct ufshcd_lrb *lrbp);
+void ufshcd_release_scsi_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd);
 
 #define SD_ASCII_STD true
 #define SD_RAW false
@@ -363,6 +362,44 @@ static inline bool ufs_is_valid_unit_desc_lun(struct ufs_dev_info *dev_info, u8
 	return lun == UFS_UPIU_RPMB_WLUN || (lun < dev_info->max_lu_supported);
 }
 
+/*
+ * Convert a block layer tag into a SCSI command pointer. This function is
+ * called once per I/O completion path and is also called from error paths.
+ */
+static inline struct scsi_cmnd *ufshcd_tag_to_cmd(struct ufs_hba *hba, u32 tag)
+{
+	struct blk_mq_tags *tags = hba->host->tag_set.shared_tags;
+	struct request *rq;
+
+	/*
+	 * Handle reserved tags differently because the UFS driver does not
+	 * call blk_mq_alloc_request() for allocating reserved requests.
+	 * Allocating reserved tags with blk_mq_alloc_request() would require
+	 * the following:
+	 * - Allocate an additional request queue from &hba->host->tag_set for
+	 *   allocating reserved requests from.
+	 * - For that request queue, allocate a SCSI device.
+	 * - Calling blk_mq_alloc_request(hba->dev_mgmt_queue, REQ_OP_DRV_OUT,
+	 *   BLK_MQ_REQ_RESERVED) for allocating a reserved request and
+	 *   blk_mq_free_request() for freeing reserved requests.
+	 * - Set the .device pointer for these reserved requests.
+	 * - Submit reserved requests with blk_execute_rq().
+	 * - Modify ufshcd_queuecommand() such that it handles reserved requests
+	 *   in another way than SCSI requests.
+	 * - Modify ufshcd_compl_one_cqe() such that it calls scsi_done() for
+	 *   device management commands.
+	 * - Modify all callback functions called by blk_mq_tagset_busy_iter()
+	 *   calls in the UFS driver and skip device management commands.
+	 */
+	rq = tag < UFSHCD_NUM_RESERVED ? tags->static_rqs[tag] :
+					 blk_mq_tag_to_rq(tags, tag);
+
+	if (WARN_ON_ONCE(!rq))
+		return NULL;
+
+	return blk_mq_rq_to_pdu(rq);
+}
+
 static inline void ufshcd_inc_sq_tail(struct ufs_hw_queue *q)
 	__must_hold(&q->sq_lock)
 {
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 1757aa0237da..ce657b2506fb 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -28,6 +28,7 @@
 #include <scsi/scsi_dbg.h>
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
 #include "ufshcd-priv.h"
 #include <ufs/ufs_quirks.h>
 #include <ufs/unipro.h>
@@ -483,7 +484,7 @@ static void ufshcd_add_command_trace(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 	u32 hwq_id = 0;
 	struct request *rq = scsi_cmd_to_rq(cmd);
 	unsigned int tag = rq->tag;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int transfer_len = -1;
 
 	/* trace UPIU also */
@@ -594,14 +595,13 @@ static void ufshcd_print_evt_hist(struct ufs_hba *hba)
 	ufshcd_vops_dbg_register_dump(hba);
 }
 
-static
-void ufshcd_print_tr(struct ufs_hba *hba, int tag, bool pr_prdt)
+static void ufshcd_print_tr(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+			    bool pr_prdt)
 {
-	const struct ufshcd_lrb *lrbp;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	const int tag = lrbp->task_tag;
 	int prdt_length;
 
-	lrbp = &hba->lrb[tag];
-
 	if (hba->monitor.enabled) {
 		dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", tag,
 			div_u64(lrbp->issue_time_stamp_local_clock, 1000));
@@ -644,7 +644,7 @@ static bool ufshcd_print_tr_iter(struct request *req, void *priv)
 	struct Scsi_Host *shost = sdev->host;
 	struct ufs_hba *hba = shost_priv(shost);
 
-	ufshcd_print_tr(hba, req->tag, *(bool *)priv);
+	ufshcd_print_tr(hba, blk_mq_rq_to_pdu(req), *(bool *)priv);
 
 	return true;
 }
@@ -2298,8 +2298,7 @@ static inline bool ufshcd_should_inform_monitor(struct ufs_hba *hba,
 						struct scsi_cmnd *cmd)
 {
 	const struct ufs_hba_monitor *m = &hba->monitor;
-	struct request *rq = scsi_cmd_to_rq(cmd);
-	struct ufshcd_lrb *lrbp = &hba->lrb[rq->tag];
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 
 	return m->enabled &&
 	       (!m->chunk_size || m->chunk_size == cmd->sdb.length) &&
@@ -2320,7 +2319,7 @@ static void ufshcd_start_monitor(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 static void ufshcd_update_monitor(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 {
 	struct request *req = scsi_cmd_to_rq(cmd);
-	struct ufshcd_lrb *lrbp = &hba->lrb[req->tag];
+	const struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int dir = ufshcd_monitor_opcode2dir(cmd->cmnd[0]);
 	unsigned long flags;
 
@@ -2350,17 +2349,26 @@ static void ufshcd_update_monitor(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 }
 
+/*
+ * Returns %true for SCSI commands and %false for device management commands.
+ * Must not be called for SCSI commands that have not yet been started.
+ */
+static bool ufshcd_is_scsi_cmd(struct scsi_cmnd *cmd)
+{
+	return blk_mq_request_started(scsi_cmd_to_rq(cmd));
+}
+
 /**
  * ufshcd_send_command - Send SCSI or device management commands
  * @hba: per adapter instance
- * @lrbp: Local reference block of SCSI command
+ * @cmd: SCSI command or device management command pointer
  * @hwq: pointer to hardware queue instance
  */
 static inline void ufshcd_send_command(struct ufs_hba *hba,
-				       struct ufshcd_lrb *lrbp,
+				       struct scsi_cmnd *cmd,
 				       struct ufs_hw_queue *hwq)
 {
-	struct scsi_cmnd *cmd = lrbp->cmd;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	unsigned long flags;
 
 	if (hba->monitor.enabled) {
@@ -2369,7 +2377,7 @@ static inline void ufshcd_send_command(struct ufs_hba *hba,
 		lrbp->compl_time_stamp = ktime_set(0, 0);
 		lrbp->compl_time_stamp_local_clock = 0;
 	}
-	if (cmd) {
+	if (ufshcd_is_scsi_cmd(cmd)) {
 		ufshcd_add_command_trace(hba, cmd, UFS_CMD_SEND);
 		ufshcd_clk_scaling_start_busy(hba);
 		if (unlikely(ufshcd_should_inform_monitor(hba, cmd)))
@@ -2389,7 +2397,8 @@ static inline void ufshcd_send_command(struct ufs_hba *hba,
 	} else {
 		spin_lock_irqsave(&hba->outstanding_lock, flags);
 		if (hba->vops && hba->vops->setup_xfer_req)
-			hba->vops->setup_xfer_req(hba, lrbp->task_tag, !!cmd);
+			hba->vops->setup_xfer_req(hba, lrbp->task_tag,
+						  ufshcd_is_scsi_cmd(cmd));
 		__set_bit(lrbp->task_tag, &hba->outstanding_reqs);
 		ufshcd_writel(hba, 1 << lrbp->task_tag,
 			      REG_UTP_TRANSFER_REQ_DOOR_BELL);
@@ -2399,11 +2408,12 @@ static inline void ufshcd_send_command(struct ufs_hba *hba,
 
 /**
  * ufshcd_copy_sense_data - Copy sense data in case of check condition
- * @lrbp: pointer to local reference block
+ * @cmd: SCSI command
  */
-static inline void ufshcd_copy_sense_data(struct ufshcd_lrb *lrbp)
+static inline void ufshcd_copy_sense_data(struct scsi_cmnd *cmd)
 {
-	u8 *const sense_buffer = lrbp->cmd->sense_buffer;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	u8 *const sense_buffer = cmd->sense_buffer;
 	u16 resp_len;
 	int len;
 
@@ -2708,13 +2718,13 @@ static void ufshcd_sgl_to_prdt(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, int
 /**
  * ufshcd_map_sg - Map scatter-gather list to prdt
  * @hba: per adapter instance
- * @lrbp: pointer to local reference block
+ * @cmd: SCSI command
  *
  * Return: 0 in case of success, non-zero value in case of failure.
  */
-static int ufshcd_map_sg(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
+static int ufshcd_map_sg(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 {
-	struct scsi_cmnd *cmd = lrbp->cmd;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int sg_segments = scsi_dma_map(cmd);
 
 	if (sg_segments < 0)
@@ -2722,7 +2732,7 @@ static int ufshcd_map_sg(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 
 	ufshcd_sgl_to_prdt(hba, lrbp, sg_segments, scsi_sglist(cmd));
 
-	return ufshcd_crypto_fill_prdt(hba, lrbp);
+	return ufshcd_crypto_fill_prdt(hba, cmd);
 }
 
 /**
@@ -2781,13 +2791,13 @@ ufshcd_prepare_req_desc_hdr(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 /**
  * ufshcd_prepare_utp_scsi_cmd_upiu() - fills the utp_transfer_req_desc,
  * for scsi commands
- * @lrbp: local reference block pointer
+ * @cmd: SCSI command
  * @upiu_flags: flags
  */
-static
-void ufshcd_prepare_utp_scsi_cmd_upiu(struct ufshcd_lrb *lrbp, u8 upiu_flags)
+static void ufshcd_prepare_utp_scsi_cmd_upiu(struct scsi_cmnd *cmd,
+					     u8 upiu_flags)
 {
-	struct scsi_cmnd *cmd = lrbp->cmd;
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct utp_upiu_req *ucd_req_ptr = lrbp->ucd_req_ptr;
 	unsigned short cdb_len;
 
@@ -2890,22 +2900,25 @@ static int ufshcd_compose_devman_upiu(struct ufs_hba *hba,
  * ufshcd_comp_scsi_upiu - UFS Protocol Information Unit(UPIU)
  *			   for SCSI Purposes
  * @hba: per adapter instance
- * @lrbp: pointer to local reference block
+ * @cmd: SCSI command
  */
-static void ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
+static void ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 {
-	struct request *rq = scsi_cmd_to_rq(lrbp->cmd);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	struct request *rq = scsi_cmd_to_rq(cmd);
 	unsigned int ioprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
 	u8 upiu_flags;
 
-	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, lrbp->cmd->sc_data_direction, 0);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags,
+				    cmd->sc_data_direction, 0);
 	if (ioprio_class == IOPRIO_CLASS_RT)
 		upiu_flags |= UPIU_CMD_FLAGS_CP;
-	ufshcd_prepare_utp_scsi_cmd_upiu(lrbp, upiu_flags);
+	ufshcd_prepare_utp_scsi_cmd_upiu(cmd, upiu_flags);
 }
 
-static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i)
+static void ufshcd_init_lrb(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 {
+	const int i = scsi_cmd_to_rq(cmd)->tag;
 	struct utp_transfer_cmd_desc *cmd_descp =
 		(void *)hba->ucdl_base_addr + i * ufshcd_get_ucd_size(hba);
 	struct utp_transfer_req_desc *utrdlp = hba->utrdl_base_addr;
@@ -2913,6 +2926,7 @@ static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i)
 		hba->ucdl_dma_addr + i * ufshcd_get_ucd_size(hba);
 	u16 response_offset = le16_to_cpu(utrdlp[i].response_upiu_offset);
 	u16 prdt_offset = le16_to_cpu(utrdlp[i].prd_table_offset);
+	struct ufshcd_lrb *lrb = scsi_cmd_priv(cmd);
 
 	lrb->utr_descriptor_ptr = utrdlp + i;
 	lrb->utrd_dma_addr =
@@ -2925,27 +2939,31 @@ static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i)
 	lrb->ucd_prdt_dma_addr = cmd_desc_element_addr + prdt_offset;
 }
 
-static void __ufshcd_setup_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-			       struct scsi_cmnd *cmd, u8 lun, int tag)
+static void __ufshcd_setup_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+			       u8 lun, int tag)
 {
-	ufshcd_init_lrb(hba, lrbp, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
+	ufshcd_init_lrb(hba, cmd);
 
 	memset(lrbp->ucd_req_ptr, 0, sizeof(*lrbp->ucd_req_ptr));
 
-	lrbp->cmd = cmd;
 	lrbp->task_tag = tag;
 	lrbp->lun = lun;
-	ufshcd_prepare_lrbp_crypto(cmd ? scsi_cmd_to_rq(cmd) : NULL, lrbp);
+	ufshcd_prepare_lrbp_crypto(ufshcd_is_scsi_cmd(cmd) ?
+				   scsi_cmd_to_rq(cmd) : NULL, lrbp);
 }
 
-static void ufshcd_setup_scsi_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-				  struct scsi_cmnd *cmd, u8 lun, int tag)
+static void ufshcd_setup_scsi_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+				  u8 lun, int tag)
 {
-	__ufshcd_setup_cmd(hba, lrbp, cmd, lun, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
+	__ufshcd_setup_cmd(hba, cmd, lun, tag);
 	lrbp->intr_cmd = !ufshcd_is_intr_aggr_allowed(hba);
 	lrbp->req_abort_skip = false;
 
-	ufshcd_comp_scsi_upiu(hba, lrbp);
+	ufshcd_comp_scsi_upiu(hba, cmd);
 }
 
 /**
@@ -3016,7 +3034,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 {
 	struct ufs_hba *hba = shost_priv(host);
 	int tag = scsi_cmd_to_rq(cmd)->tag;
-	struct ufshcd_lrb *lrbp;
 	int err = 0;
 	struct ufs_hw_queue *hwq = NULL;
 
@@ -3067,11 +3084,10 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
 	ufshcd_hold(hba);
 
-	lrbp = &hba->lrb[tag];
-
-	ufshcd_setup_scsi_cmd(hba, lrbp, cmd, ufshcd_scsi_to_upiu_lun(cmd->device->lun), tag);
+	ufshcd_setup_scsi_cmd(hba, cmd,
+			      ufshcd_scsi_to_upiu_lun(cmd->device->lun), tag);
 
-	err = ufshcd_map_sg(hba, lrbp);
+	err = ufshcd_map_sg(hba, cmd);
 	if (err) {
 		ufshcd_release(hba);
 		goto out;
@@ -3080,7 +3096,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	if (hba->mcq_enabled)
 		hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd));
 
-	ufshcd_send_command(hba, lrbp, hwq);
+	ufshcd_send_command(hba, cmd, hwq);
 
 out:
 	if (ufs_trigger_eh(hba)) {
@@ -3094,10 +3110,12 @@ out:
 	return err;
 }
 
-static void ufshcd_setup_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-			     enum dev_cmd_type cmd_type, u8 lun, int tag)
+static void ufshcd_setup_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+				 enum dev_cmd_type cmd_type, u8 lun, int tag)
 {
-	__ufshcd_setup_cmd(hba, lrbp, NULL, lun, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
+	__ufshcd_setup_cmd(hba, cmd, lun, tag);
 	lrbp->intr_cmd = true; /* No interrupt aggregation */
 	hba->dev_cmd.type = cmd_type;
 }
@@ -3105,10 +3123,12 @@ static void ufshcd_setup_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 /*
  * Return: 0 upon success; < 0 upon failure.
  */
-static int ufshcd_compose_dev_cmd(struct ufs_hba *hba,
-		struct ufshcd_lrb *lrbp, enum dev_cmd_type cmd_type, int tag)
+static int ufshcd_compose_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+				  enum dev_cmd_type cmd_type, int tag)
 {
-	ufshcd_setup_dev_cmd(hba, lrbp, cmd_type, 0, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
+	ufshcd_setup_dev_cmd(hba, cmd, cmd_type, 0, tag);
 
 	return ufshcd_compose_devman_upiu(hba, lrbp);
 }
@@ -3320,13 +3340,14 @@ static void ufshcd_dev_man_unlock(struct ufs_hba *hba)
  * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
  * < 0 if another error occurred.
  */
-static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-			  const u32 tag, int timeout)
+static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
+				const u32 tag, int timeout)
 {
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int err;
 
 	ufshcd_add_query_upiu_trace(hba, UFS_QUERY_SEND, lrbp->ucd_req_ptr);
-	ufshcd_send_command(hba, lrbp, hba->dev_cmd_queue);
+	ufshcd_send_command(hba, cmd, hba->dev_cmd_queue);
 	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
 
 	ufshcd_add_query_upiu_trace(hba, err ? UFS_QUERY_ERR : UFS_QUERY_COMP,
@@ -3351,17 +3372,17 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 		enum dev_cmd_type cmd_type, int timeout)
 {
 	const u32 tag = hba->reserved_slot;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
 	int err;
 
 	/* Protects use of hba->reserved_slot. */
 	lockdep_assert_held(&hba->dev_cmd.lock);
 
-	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
+	err = ufshcd_compose_dev_cmd(hba, cmd, cmd_type, tag);
 	if (unlikely(err))
 		return err;
 
-	return ufshcd_issue_dev_cmd(hba, lrbp, tag, timeout);
+	return ufshcd_issue_dev_cmd(hba, cmd, tag, timeout);
 }
 
 /**
@@ -3991,14 +4012,6 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba)
 	}
 
 skip_utmrdl:
-	/* Allocate memory for local reference block */
-	hba->lrb = devm_kcalloc(hba->dev,
-				hba->nutrs, sizeof(struct ufshcd_lrb),
-				GFP_KERNEL);
-	if (!hba->lrb) {
-		dev_err(hba->dev, "LRB Memory allocation failed\n");
-		goto out;
-	}
 	return 0;
 out:
 	return -ENOMEM;
@@ -5411,19 +5424,18 @@ static void ufshcd_sdev_destroy(struct scsi_device *sdev)
 
 /**
  * ufshcd_scsi_cmd_status - Update SCSI command result based on SCSI status
- * @lrbp: pointer to local reference block of completed command
+ * @cmd: SCSI command
  * @scsi_status: SCSI command status
  *
  * Return: value base on SCSI command status.
  */
-static inline int
-ufshcd_scsi_cmd_status(struct ufshcd_lrb *lrbp, int scsi_status)
+static inline int ufshcd_scsi_cmd_status(struct scsi_cmnd *cmd, int scsi_status)
 {
 	int result = 0;
 
 	switch (scsi_status) {
 	case SAM_STAT_CHECK_CONDITION:
-		ufshcd_copy_sense_data(lrbp);
+		ufshcd_copy_sense_data(cmd);
 		fallthrough;
 	case SAM_STAT_GOOD:
 		result |= DID_OK << 16 | scsi_status;
@@ -5431,7 +5443,7 @@ ufshcd_scsi_cmd_status(struct ufshcd_lrb *lrbp, int scsi_status)
 	case SAM_STAT_TASK_SET_FULL:
 	case SAM_STAT_BUSY:
 	case SAM_STAT_TASK_ABORTED:
-		ufshcd_copy_sense_data(lrbp);
+		ufshcd_copy_sense_data(cmd);
 		result |= scsi_status;
 		break;
 	default:
@@ -5445,15 +5457,16 @@ ufshcd_scsi_cmd_status(struct ufshcd_lrb *lrbp, int scsi_status)
 /**
  * ufshcd_transfer_rsp_status - Get overall status of the response
  * @hba: per adapter instance
- * @lrbp: pointer to local reference block of completed command
+ * @cmd: SCSI command
  * @cqe: pointer to the completion queue entry
  *
  * Return: result of the command to notify SCSI midlayer.
  */
-static inline int
-ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
-			   struct cq_entry *cqe)
+static inline int ufshcd_transfer_rsp_status(struct ufs_hba *hba,
+					     struct scsi_cmnd *cmd,
+					     struct cq_entry *cqe)
 {
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int result = 0;
 	int scsi_status;
 	enum utp_ocs ocs;
@@ -5467,7 +5480,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 	 * not set either flag.
 	 */
 	if (resid && !(upiu_flags & UPIU_RSP_FLAG_OVERFLOW))
-		scsi_set_resid(lrbp->cmd, resid);
+		scsi_set_resid(cmd, resid);
 
 	/* overall command status of utrd */
 	ocs = ufshcd_get_tr_ocs(lrbp, cqe);
@@ -5488,7 +5501,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 			 * to notify the SCSI midlayer of the command status
 			 */
 			scsi_status = lrbp->ucd_rsp_ptr->header.status;
-			result = ufshcd_scsi_cmd_status(lrbp, scsi_status);
+			result = ufshcd_scsi_cmd_status(cmd, scsi_status);
 
 			/*
 			 * Currently we are only supporting BKOPs exception
@@ -5553,7 +5566,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 	    (host_byte(result) != DID_REQUEUE) && !hba->silence_err_logs) {
 		if (cqe)
 			ufshcd_hex_dump("UPIU CQE: ", cqe, sizeof(struct cq_entry));
-		ufshcd_print_tr(hba, lrbp->task_tag, true);
+		ufshcd_print_tr(hba, cmd, true);
 	}
 	return result;
 }
@@ -5620,13 +5633,10 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status)
 }
 
 /* Release the resources allocated for processing a SCSI command. */
-void ufshcd_release_scsi_cmd(struct ufs_hba *hba,
-			     struct ufshcd_lrb *lrbp)
+void ufshcd_release_scsi_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 {
-	struct scsi_cmnd *cmd = lrbp->cmd;
-
 	scsi_dma_unmap(cmd);
-	ufshcd_crypto_clear_prdt(hba, lrbp);
+	ufshcd_crypto_clear_prdt(hba, cmd);
 	ufshcd_release(hba);
 	ufshcd_clk_scaling_update_busy(hba);
 }
@@ -5640,20 +5650,20 @@ void ufshcd_release_scsi_cmd(struct ufs_hba *hba,
 void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
 			  struct cq_entry *cqe)
 {
-	struct ufshcd_lrb *lrbp = &hba->lrb[task_tag];
-	struct scsi_cmnd *cmd = lrbp->cmd;
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, task_tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	enum utp_ocs ocs;
 
 	if (hba->monitor.enabled) {
 		lrbp->compl_time_stamp = ktime_get();
 		lrbp->compl_time_stamp_local_clock = local_clock();
 	}
-	if (cmd) {
+	if (ufshcd_is_scsi_cmd(cmd)) {
 		if (unlikely(ufshcd_should_inform_monitor(hba, cmd)))
 			ufshcd_update_monitor(hba, cmd);
 		ufshcd_add_command_trace(hba, cmd, UFS_CMD_COMP);
-		cmd->result = ufshcd_transfer_rsp_status(hba, lrbp, cqe);
-		ufshcd_release_scsi_cmd(hba, lrbp);
+		cmd->result = ufshcd_transfer_rsp_status(hba, cmd, cqe);
+		ufshcd_release_scsi_cmd(hba, cmd);
 		/* Do not touch lrbp after scsi done */
 		scsi_done(cmd);
 	} else {
@@ -5690,7 +5700,7 @@ static void ufshcd_clear_polled(struct ufs_hba *hba,
 	int tag;
 
 	for_each_set_bit(tag, completed_reqs, hba->nutrs) {
-		struct scsi_cmnd *cmd = hba->lrb[tag].cmd;
+		struct scsi_cmnd *cmd = scsi_host_find_tag(hba->host, tag);
 
 		if (!cmd)
 			continue;
@@ -5741,7 +5751,6 @@ static bool ufshcd_mcq_force_compl_one(struct request *rq, void *priv)
 	struct scsi_device *sdev = rq->q->queuedata;
 	struct Scsi_Host *shost = sdev->host;
 	struct ufs_hba *hba = shost_priv(shost);
-	struct ufshcd_lrb *lrbp = &hba->lrb[rq->tag];
 	struct ufs_hw_queue *hwq = ufshcd_mcq_req_to_hwq(hba, rq);
 
 	if (!hwq)
@@ -5756,7 +5765,7 @@ static bool ufshcd_mcq_force_compl_one(struct request *rq, void *priv)
 	scoped_guard(spinlock_irqsave, &hwq->cq_lock) {
 		if (!test_bit(SCMD_STATE_COMPLETE, &cmd->state)) {
 			set_host_byte(cmd, DID_REQUEUE);
-			ufshcd_release_scsi_cmd(hba, lrbp);
+			ufshcd_release_scsi_cmd(hba, cmd);
 			scsi_done(cmd);
 		}
 	}
@@ -6641,7 +6650,7 @@ static bool ufshcd_abort_one(struct request *rq, void *priv)
 
 	*ret = ufshcd_try_to_abort_task(hba, tag);
 	dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag,
-		hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1,
+		ufshcd_is_scsi_cmd(cmd) ? cmd->cmnd[0] : -1,
 		*ret ? "failed" : "succeeded");
 
 	return *ret == 0;
@@ -7371,14 +7380,15 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 					enum query_opcode desc_op)
 {
 	const u32 tag = hba->reserved_slot;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int err = 0;
 	u8 upiu_flags;
 
 	/* Protects use of hba->reserved_slot. */
 	lockdep_assert_held(&hba->dev_cmd.lock);
 
-	ufshcd_setup_dev_cmd(hba, lrbp, cmd_type, 0, tag);
+	ufshcd_setup_dev_cmd(hba, cmd, cmd_type, 0, tag);
 
 	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0);
 
@@ -7403,7 +7413,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 	 * bound to fail since dev_cmd.query and dev_cmd.type were left empty.
 	 * read the response directly ignoring all errors.
 	 */
-	ufshcd_issue_dev_cmd(hba, lrbp, tag, dev_cmd_timeout);
+	ufshcd_issue_dev_cmd(hba, cmd, tag, dev_cmd_timeout);
 
 	/* just copy the upiu response as it is */
 	memcpy(rsp_upiu, lrbp->ucd_rsp_ptr, sizeof(*rsp_upiu));
@@ -7518,7 +7528,8 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 			 enum dma_data_direction dir)
 {
 	const u32 tag = hba->reserved_slot;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int err = 0;
 	int result;
 	u8 upiu_flags;
@@ -7529,7 +7540,8 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 	/* Protects use of hba->reserved_slot. */
 	ufshcd_dev_man_lock(hba);
 
-	ufshcd_setup_dev_cmd(hba, lrbp, DEV_CMD_TYPE_RPMB, UFS_UPIU_RPMB_WLUN, tag);
+	ufshcd_setup_dev_cmd(hba, cmd, DEV_CMD_TYPE_RPMB, UFS_UPIU_RPMB_WLUN,
+			     tag);
 
 	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, ehs);
 
@@ -7546,7 +7558,7 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 
 	memset(lrbp->ucd_rsp_ptr, 0, sizeof(struct utp_upiu_rsp));
 
-	err = ufshcd_issue_dev_cmd(hba, lrbp, tag, ADVANCED_RPMB_REQ_TIMEOUT);
+	err = ufshcd_issue_dev_cmd(hba, cmd, tag, ADVANCED_RPMB_REQ_TIMEOUT);
 
 	if (!err) {
 		/* Just copy the upiu response as it is */
@@ -7647,11 +7659,12 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd)
 
 static void ufshcd_set_req_abort_skip(struct ufs_hba *hba, unsigned long bitmap)
 {
-	struct ufshcd_lrb *lrbp;
 	int tag;
 
 	for_each_set_bit(tag, &bitmap, hba->nutrs) {
-		lrbp = &hba->lrb[tag];
+		struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+		struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+
 		lrbp->req_abort_skip = true;
 	}
 }
@@ -7659,7 +7672,7 @@ static void ufshcd_set_req_abort_skip(struct ufs_hba *hba, unsigned long bitmap)
 /**
  * ufshcd_try_to_abort_task - abort a specific task
  * @hba: Pointer to adapter instance
- * @tag: Task tag/index to be aborted
+ * @tag: Tag of the task to be aborted
  *
  * Abort the pending command in device by sending UFS_ABORT_TASK task management
  * command, and in host controller by clearing the door-bell register. There can
@@ -7671,7 +7684,8 @@ static void ufshcd_set_req_abort_skip(struct ufs_hba *hba, unsigned long bitmap)
  */
 int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag)
 {
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	int err;
 	int poll_cnt;
 	u8 resp = 0xF;
@@ -7693,7 +7707,7 @@ int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag)
 				hba->dev,
 				"%s: cmd with tag %d not pending in the device.\n",
 				__func__, tag);
-			if (!ufshcd_cmd_inflight(lrbp->cmd)) {
+			if (!ufshcd_cmd_inflight(cmd)) {
 				dev_info(hba->dev,
 					 "%s: cmd with tag=%d completed.\n",
 					 __func__, tag);
@@ -7741,7 +7755,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	struct Scsi_Host *host = cmd->device->host;
 	struct ufs_hba *hba = shost_priv(host);
 	int tag = scsi_cmd_to_rq(cmd)->tag;
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	unsigned long flags;
 	int err = FAILED;
 	bool outstanding;
@@ -7776,9 +7790,9 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		ufshcd_print_evt_hist(hba);
 		ufshcd_print_host_state(hba);
 		ufshcd_print_pwr_info(hba);
-		ufshcd_print_tr(hba, tag, true);
+		ufshcd_print_tr(hba, cmd, true);
 	} else {
-		ufshcd_print_tr(hba, tag, false);
+		ufshcd_print_tr(hba, cmd, false);
 	}
 	hba->req_abort_count++;
 
@@ -7822,7 +7836,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto release;
 	}
 
-	err = ufshcd_try_to_abort_task(hba, tag);
+	err = ufshcd_try_to_abort_task(hba, lrbp->task_tag);
 	if (err) {
 		dev_err(hba->dev, "%s: failed with err %d\n", __func__, err);
 		ufshcd_set_req_abort_skip(hba, hba->outstanding_reqs);
@@ -7839,7 +7853,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	spin_unlock_irqrestore(&hba->outstanding_lock, flags);
 
 	if (outstanding)
-		ufshcd_release_scsi_cmd(hba, lrbp);
+		ufshcd_release_scsi_cmd(hba, cmd);
 
 	err = SUCCESS;
 
@@ -8919,8 +8933,6 @@ static void ufshcd_release_sdb_queue(struct ufs_hba *hba, int nutrs)
 	utrdl_size = sizeof(struct utp_transfer_req_desc) * nutrs;
 	dmam_free_coherent(hba->dev, utrdl_size, hba->utrdl_base_addr,
 			   hba->utrdl_dma_addr);
-
-	devm_kfree(hba->dev, hba->lrb);
 }
 
 static int ufshcd_alloc_mcq(struct ufs_hba *hba)
@@ -9191,6 +9203,7 @@ static const struct scsi_host_template ufshcd_driver_template = {
 	.name			= UFSHCD,
 	.proc_name		= UFSHCD,
 	.map_queues		= ufshcd_map_queues,
+	.cmd_size		= sizeof(struct ufshcd_lrb),
 	.init_cmd_priv		= ufshcd_init_cmd_priv,
 	.queuecommand		= ufshcd_queuecommand,
 	.nr_reserved_cmds	= UFSHCD_NUM_RESERVED,
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 00152e135fc9..fbed47b6c61f 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -161,7 +161,6 @@ struct ufs_pm_lvl_states {
  * @ucd_prdt_dma_addr: PRDT dma address for debug
  * @ucd_rsp_dma_addr: UPIU response dma address for debug
  * @ucd_req_dma_addr: UPIU request dma address for debug
- * @cmd: pointer to SCSI command
  * @scsi_status: SCSI status of the command
  * @command_type: SCSI, UFS, Query.
  * @task_tag: Task tag of the command
@@ -186,7 +185,6 @@ struct ufshcd_lrb {
 	dma_addr_t ucd_rsp_dma_addr;
 	dma_addr_t ucd_prdt_dma_addr;
 
-	struct scsi_cmnd *cmd;
 	int scsi_status;
 
 	int command_type;
@@ -833,7 +831,6 @@ enum ufshcd_mcq_opr {
  * @spm_lvl: desired UFS power management level during system PM.
  * @pm_op_in_progress: whether or not a PM operation is in progress.
  * @ahit: value of Auto-Hibernate Idle Timer register.
- * @lrb: local reference block
  * @outstanding_tasks: Bits representing outstanding task requests
  * @outstanding_lock: Protects @outstanding_reqs.
  * @outstanding_reqs: Bits representing outstanding transfer requests
@@ -976,8 +973,6 @@ struct ufs_hba {
 	/* Auto-Hibernate Idle Timer register value */
 	u32 ahit;
 
-	struct ufshcd_lrb *lrb;
-
 	unsigned long outstanding_tasks;
 	spinlock_t outstanding_lock;
 	unsigned long outstanding_reqs;
-- 
cgit v1.2.3


From 9a2c9500921d5ebbe96f7531adc73d9205c76485 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 31 Oct 2025 13:39:33 -0700
Subject: scsi: ufs: core: Remove the ufshcd_lrb task_tag member

Remove the task_tag member from struct ufshcd_lrb. Wherever the driver
used lrbp->task_tag, obtain the tag from the request instead, i.e. use
scsi_cmd_to_rq(cmd)->tag (rq->tag). This patch reduces the size of
struct ufshcd_lrb.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-26-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 62 +++++++++++++++++++++++------------------------
 include/ufs/ufshcd.h      |  1 -
 2 files changed, 30 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index cf2c08baa9ae..3e0fa433579d 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -599,7 +599,7 @@ static void ufshcd_print_tr(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 			    bool pr_prdt)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
-	const int tag = lrbp->task_tag;
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	int prdt_length;
 
 	if (hba->monitor.enabled) {
@@ -2369,6 +2369,7 @@ static inline void ufshcd_send_command(struct ufs_hba *hba,
 				       struct ufs_hw_queue *hwq)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	unsigned long flags;
 
 	if (hba->monitor.enabled) {
@@ -2397,11 +2398,10 @@ static inline void ufshcd_send_command(struct ufs_hba *hba,
 	} else {
 		spin_lock_irqsave(&hba->outstanding_lock, flags);
 		if (hba->vops && hba->vops->setup_xfer_req)
-			hba->vops->setup_xfer_req(hba, lrbp->task_tag,
+			hba->vops->setup_xfer_req(hba, tag,
 						  ufshcd_is_scsi_cmd(cmd));
-		__set_bit(lrbp->task_tag, &hba->outstanding_reqs);
-		ufshcd_writel(hba, 1 << lrbp->task_tag,
-			      REG_UTP_TRANSFER_REQ_DOOR_BELL);
+		__set_bit(tag, &hba->outstanding_reqs);
+		ufshcd_writel(hba, 1 << tag, REG_UTP_TRANSFER_REQ_DOOR_BELL);
 		spin_unlock_irqrestore(&hba->outstanding_lock, flags);
 	}
 }
@@ -2798,6 +2798,7 @@ static void ufshcd_prepare_utp_scsi_cmd_upiu(struct scsi_cmnd *cmd,
 					     u8 upiu_flags)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	struct utp_upiu_req *ucd_req_ptr = lrbp->ucd_req_ptr;
 	unsigned short cdb_len;
 
@@ -2805,11 +2806,11 @@ static void ufshcd_prepare_utp_scsi_cmd_upiu(struct scsi_cmnd *cmd,
 		.transaction_code = UPIU_TRANSACTION_COMMAND,
 		.flags = upiu_flags,
 		.lun = lrbp->lun,
-		.task_tag = lrbp->task_tag,
+		.task_tag = tag,
 		.command_set_type = UPIU_COMMAND_SET_TYPE_SCSI,
 	};
 
-	WARN_ON_ONCE(ucd_req_ptr->header.task_tag != lrbp->task_tag);
+	WARN_ON_ONCE(ucd_req_ptr->header.task_tag != tag);
 
 	ucd_req_ptr->sc.exp_data_transfer_len = cpu_to_be32(cmd->sdb.length);
 
@@ -2830,6 +2831,7 @@ static void ufshcd_prepare_utp_query_req_upiu(struct ufs_hba *hba,
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct utp_upiu_req *ucd_req_ptr = lrbp->ucd_req_ptr;
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	struct ufs_query *query = &hba->dev_cmd.query;
 	u16 len = be16_to_cpu(query->request.upiu_req.length);
 
@@ -2838,7 +2840,7 @@ static void ufshcd_prepare_utp_query_req_upiu(struct ufs_hba *hba,
 		.transaction_code = UPIU_TRANSACTION_QUERY_REQ,
 		.flags = upiu_flags,
 		.lun = lrbp->lun,
-		.task_tag = lrbp->task_tag,
+		.task_tag = tag,
 		.query_function = query->request.query_func,
 		/* Data segment length only need for WRITE_DESC */
 		.data_segment_length =
@@ -2861,12 +2863,13 @@ static inline void ufshcd_prepare_utp_nop_upiu(struct scsi_cmnd *cmd)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	struct utp_upiu_req *ucd_req_ptr = lrbp->ucd_req_ptr;
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 
 	memset(ucd_req_ptr, 0, sizeof(struct utp_upiu_req));
 
 	ucd_req_ptr->header = (struct utp_upiu_header){
 		.transaction_code = UPIU_TRANSACTION_NOP_OUT,
-		.task_tag = lrbp->task_tag,
+		.task_tag = tag,
 	};
 }
 
@@ -2951,7 +2954,6 @@ static void __ufshcd_setup_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 
 	memset(lrbp->ucd_req_ptr, 0, sizeof(*lrbp->ucd_req_ptr));
 
-	lrbp->task_tag = tag;
 	lrbp->lun = lun;
 	ufshcd_prepare_lrbp_crypto(ufshcd_is_scsi_cmd(cmd) ?
 				   scsi_cmd_to_rq(cmd) : NULL, lrbp);
@@ -3249,6 +3251,8 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba,
 		struct ufshcd_lrb *lrbp, int max_timeout)
 {
+	struct scsi_cmnd *cmd = (struct scsi_cmnd *)lrbp - 1;
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	unsigned long time_left = msecs_to_jiffies(max_timeout);
 	unsigned long flags;
 	bool pending;
@@ -3265,18 +3269,18 @@ retry:
 	} else {
 		err = -ETIMEDOUT;
 		dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n",
-			__func__, lrbp->task_tag);
+			__func__, tag);
 
 		/* MCQ mode */
 		if (hba->mcq_enabled) {
 			/* successfully cleared the command, retry if needed */
-			if (ufshcd_clear_cmd(hba, lrbp->task_tag) == 0)
+			if (ufshcd_clear_cmd(hba, tag) == 0)
 				err = -EAGAIN;
 			return err;
 		}
 
 		/* SDB mode */
-		if (ufshcd_clear_cmd(hba, lrbp->task_tag) == 0) {
+		if (ufshcd_clear_cmd(hba, tag) == 0) {
 			/* successfully cleared the command, retry if needed */
 			err = -EAGAIN;
 			/*
@@ -3285,11 +3289,9 @@ retry:
 			 * variable.
 			 */
 			spin_lock_irqsave(&hba->outstanding_lock, flags);
-			pending = test_bit(lrbp->task_tag,
-					   &hba->outstanding_reqs);
+			pending = test_bit(tag, &hba->outstanding_reqs);
 			if (pending)
-				__clear_bit(lrbp->task_tag,
-					    &hba->outstanding_reqs);
+				__clear_bit(tag, &hba->outstanding_reqs);
 			spin_unlock_irqrestore(&hba->outstanding_lock, flags);
 
 			if (!pending) {
@@ -3302,11 +3304,10 @@ retry:
 			}
 		} else {
 			dev_err(hba->dev, "%s: failed to clear tag %d\n",
-				__func__, lrbp->task_tag);
+				__func__, tag);
 
 			spin_lock_irqsave(&hba->outstanding_lock, flags);
-			pending = test_bit(lrbp->task_tag,
-					   &hba->outstanding_reqs);
+			pending = test_bit(tag, &hba->outstanding_reqs);
 			spin_unlock_irqrestore(&hba->outstanding_lock, flags);
 
 			if (!pending) {
@@ -5468,6 +5469,7 @@ static inline int ufshcd_transfer_rsp_status(struct ufs_hba *hba,
 					     struct cq_entry *cqe)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	const int tag = scsi_cmd_to_rq(cmd)->tag;
 	int result = 0;
 	int scsi_status;
 	enum utp_ocs ocs;
@@ -5539,10 +5541,8 @@ static inline int ufshcd_transfer_rsp_status(struct ufs_hba *hba,
 	case OCS_ABORTED:
 	case OCS_INVALID_COMMAND_STATUS:
 		result |= DID_REQUEUE << 16;
-		dev_warn(hba->dev,
-				"OCS %s from controller for tag %d\n",
-				(ocs == OCS_ABORTED ? "aborted" : "invalid"),
-				lrbp->task_tag);
+		dev_warn(hba->dev, "OCS %s from controller for tag %d\n",
+			 ocs == OCS_ABORTED ? "aborted" : "invalid", tag);
 		break;
 	case OCS_INVALID_CMD_TABLE_ATTR:
 	case OCS_INVALID_PRDT_ATTR:
@@ -5555,9 +5555,8 @@ static inline int ufshcd_transfer_rsp_status(struct ufs_hba *hba,
 	case OCS_GENERAL_CRYPTO_ERROR:
 	default:
 		result |= DID_ERROR << 16;
-		dev_err(hba->dev,
-				"OCS error from controller = %x for tag %d\n",
-				ocs, lrbp->task_tag);
+		dev_err(hba->dev, "OCS error from controller = %x for tag %d\n",
+			ocs, tag);
 		ufshcd_print_evt_hist(hba);
 		ufshcd_print_host_state(hba);
 		break;
@@ -7692,8 +7691,8 @@ int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag)
 	u8 resp = 0xF;
 
 	for (poll_cnt = 100; poll_cnt; poll_cnt--) {
-		err = ufshcd_issue_tm_cmd(hba, lrbp->lun, lrbp->task_tag,
-				UFS_QUERY_TASK, &resp);
+		err = ufshcd_issue_tm_cmd(hba, lrbp->lun, tag, UFS_QUERY_TASK,
+					  &resp);
 		if (!err && resp == UPIU_TASK_MANAGEMENT_FUNC_SUCCEEDED) {
 			/* cmd pending in the device */
 			dev_err(hba->dev, "%s: cmd pending in the device. tag = %d\n",
@@ -7726,8 +7725,7 @@ int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag)
 	if (!poll_cnt)
 		return -EBUSY;
 
-	err = ufshcd_issue_tm_cmd(hba, lrbp->lun, lrbp->task_tag,
-			UFS_ABORT_TASK, &resp);
+	err = ufshcd_issue_tm_cmd(hba, lrbp->lun, tag, UFS_ABORT_TASK, &resp);
 	if (err || resp != UPIU_TASK_MANAGEMENT_FUNC_COMPL) {
 		if (!err) {
 			err = resp; /* service response error */
@@ -7837,7 +7835,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto release;
 	}
 
-	err = ufshcd_try_to_abort_task(hba, lrbp->task_tag);
+	err = ufshcd_try_to_abort_task(hba, tag);
 	if (err) {
 		dev_err(hba->dev, "%s: failed with err %d\n", __func__, err);
 		ufshcd_set_req_abort_skip(hba, hba->outstanding_reqs);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index fbed47b6c61f..a92062f65455 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -188,7 +188,6 @@ struct ufshcd_lrb {
 	int scsi_status;
 
 	int command_type;
-	int task_tag;
 	u8 lun; /* UPIU LUN id field is only 8-bit wide */
 	bool intr_cmd;
 	bool req_abort_skip;
-- 
cgit v1.2.3


From 08b12cda6c44dc015bcc152613c35ee0ae8f37b9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 31 Oct 2025 13:39:36 -0700
Subject: scsi: ufs: core: Switch to scsi_get_internal_cmd()

Instead of storing the tag of the reserved command in hba->reserved_slot,
use scsi_get_internal_cmd() and scsi_put_internal_cmd() to allocate the
tag for the reserved command dynamically. Add
ufshcd_queue_reserved_command() for submitting reserved commands. Add
support in ufshcd_abort() for device management commands. Use
blk_execute_rq() for submitting reserved commands. Remove the code and
data structures that became superfluous. This includes
ufshcd_wait_for_dev_cmd(), hba->reserved_slot and ufs_dev_cmd.complete.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251031204029.2883185-29-bvanassche@acm.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufs-mcq.c     |  19 ++--
 drivers/ufs/core/ufshcd-priv.h |  25 +----
 drivers/ufs/core/ufshcd.c      | 225 ++++++++++++++++++++---------------------
 include/ufs/ufshcd.h           |   6 --
 4 files changed, 116 insertions(+), 159 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index 776ff0896a2a..9ab91b4c05b0 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -479,9 +479,6 @@ int ufshcd_mcq_init(struct ufs_hba *hba)
 		mutex_init(&hwq->sq_mutex);
 	}
 
-	/* The very first HW queue serves device commands */
-	hba->dev_cmd_queue = &hba->uhq[0];
-
 	host->host_tagset = 1;
 	return 0;
 }
@@ -536,6 +533,7 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag)
 {
 	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, task_tag);
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	struct request *rq = scsi_cmd_to_rq(cmd);
 	struct ufs_hw_queue *hwq;
 	void __iomem *reg, *opr_sqd_base;
 	u32 nexus, id, val;
@@ -544,15 +542,12 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag)
 	if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_RTC)
 		return -ETIMEDOUT;
 
-	if (task_tag != hba->reserved_slot) {
-		if (!cmd)
-			return -EINVAL;
-		hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd));
-		if (!hwq)
-			return 0;
-	} else {
-		hwq = hba->dev_cmd_queue;
-	}
+	if (!cmd)
+		return -EINVAL;
+
+	hwq = ufshcd_mcq_req_to_hwq(hba, rq);
+	if (!hwq)
+		return 0;
 
 	id = hwq->id;
 
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 72d2766b19e3..2f752a45db87 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -369,30 +369,7 @@ static inline bool ufs_is_valid_unit_desc_lun(struct ufs_dev_info *dev_info, u8
 static inline struct scsi_cmnd *ufshcd_tag_to_cmd(struct ufs_hba *hba, u32 tag)
 {
 	struct blk_mq_tags *tags = hba->host->tag_set.shared_tags;
-	struct request *rq;
-
-	/*
-	 * Handle reserved tags differently because the UFS driver does not
-	 * call blk_mq_alloc_request() for allocating reserved requests.
-	 * Allocating reserved tags with blk_mq_alloc_request() would require
-	 * the following:
-	 * - Allocate an additional request queue from &hba->host->tag_set for
-	 *   allocating reserved requests from.
-	 * - For that request queue, allocate a SCSI device.
-	 * - Calling blk_mq_alloc_request(hba->dev_mgmt_queue, REQ_OP_DRV_OUT,
-	 *   BLK_MQ_REQ_RESERVED) for allocating a reserved request and
-	 *   blk_mq_free_request() for freeing reserved requests.
-	 * - Set the .device pointer for these reserved requests.
-	 * - Submit reserved requests with blk_execute_rq().
-	 * - Modify ufshcd_queuecommand() such that it handles reserved requests
-	 *   in another way than SCSI requests.
-	 * - Modify ufshcd_compl_one_cqe() such that it calls scsi_done() for
-	 *   device management commands.
-	 * - Modify all callback functions called by blk_mq_tagset_busy_iter()
-	 *   calls in the UFS driver and skip device management commands.
-	 */
-	rq = tag < UFSHCD_NUM_RESERVED ? tags->static_rqs[tag] :
-					 blk_mq_tag_to_rq(tags, tag);
+	struct request *rq = blk_mq_tag_to_rq(tags, tag);
 
 	if (WARN_ON_ONCE(!rq))
 		return NULL;
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 2175b41262c8..ca17165f6f0e 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2350,13 +2350,10 @@ static void ufshcd_update_monitor(struct ufs_hba *hba, struct scsi_cmnd *cmd)
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 }
 
-/*
- * Returns %true for SCSI commands and %false for device management commands.
- * Must not be called for SCSI commands that have not yet been started.
- */
+/* Returns %true for SCSI commands and %false for device management commands. */
 static bool ufshcd_is_scsi_cmd(struct scsi_cmnd *cmd)
 {
-	return blk_mq_request_started(scsi_cmd_to_rq(cmd));
+	return !blk_mq_is_reserved_rq(scsi_cmd_to_rq(cmd));
 }
 
 /**
@@ -2487,7 +2484,6 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba)
 	hba->nutrs = (hba->capabilities & MASK_TRANSFER_REQUESTS_SLOTS_SDB) + 1;
 	hba->nutmrs =
 	((hba->capabilities & MASK_TASK_MANAGEMENT_REQUEST_SLOTS) >> 16) + 1;
-	hba->reserved_slot = 0;
 
 	hba->nortt = FIELD_GET(MASK_NUMBER_OUTSTANDING_RTT, hba->capabilities) + 1;
 
@@ -3116,6 +3112,20 @@ out:
 	return err;
 }
 
+static int ufshcd_queue_reserved_command(struct Scsi_Host *host,
+					 struct scsi_cmnd *cmd)
+{
+	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	struct request *rq = scsi_cmd_to_rq(cmd);
+	struct ufs_hba *hba = shost_priv(host);
+	struct ufs_hw_queue *hwq =
+		hba->mcq_enabled ? ufshcd_mcq_req_to_hwq(hba, rq) : NULL;
+
+	ufshcd_add_query_upiu_trace(hba, UFS_QUERY_SEND, lrbp->ucd_req_ptr);
+	ufshcd_send_command(hba, cmd, hwq);
+	return 0;
+}
+
 static void ufshcd_setup_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 				 enum dev_cmd_type cmd_type, u8 lun, int tag)
 {
@@ -3245,84 +3255,6 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 	return err;
 }
 
-/*
- * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
- * < 0 if another error occurred.
- */
-static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba,
-		struct ufshcd_lrb *lrbp, int max_timeout)
-{
-	struct scsi_cmnd *cmd = (struct scsi_cmnd *)lrbp - 1;
-	const int tag = scsi_cmd_to_rq(cmd)->tag;
-	unsigned long time_left = msecs_to_jiffies(max_timeout);
-	unsigned long flags;
-	bool pending;
-	int err;
-
-retry:
-	time_left = wait_for_completion_timeout(&hba->dev_cmd.complete,
-						time_left);
-
-	if (likely(time_left)) {
-		err = ufshcd_get_tr_ocs(lrbp, NULL);
-	} else {
-		err = -ETIMEDOUT;
-		dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n",
-			__func__, tag);
-
-		/* MCQ mode */
-		if (hba->mcq_enabled) {
-			/* successfully cleared the command, retry if needed */
-			if (ufshcd_clear_cmd(hba, tag) == 0)
-				err = -EAGAIN;
-			return err;
-		}
-
-		/* SDB mode */
-		if (ufshcd_clear_cmd(hba, tag) == 0) {
-			/* successfully cleared the command, retry if needed */
-			err = -EAGAIN;
-			/*
-			 * Since clearing the command succeeded we also need to
-			 * clear the task tag bit from the outstanding_reqs
-			 * variable.
-			 */
-			spin_lock_irqsave(&hba->outstanding_lock, flags);
-			pending = test_bit(tag, &hba->outstanding_reqs);
-			if (pending)
-				__clear_bit(tag, &hba->outstanding_reqs);
-			spin_unlock_irqrestore(&hba->outstanding_lock, flags);
-
-			if (!pending) {
-				/*
-				 * The completion handler ran while we tried to
-				 * clear the command.
-				 */
-				time_left = 1;
-				goto retry;
-			}
-		} else {
-			dev_err(hba->dev, "%s: failed to clear tag %d\n",
-				__func__, tag);
-
-			spin_lock_irqsave(&hba->outstanding_lock, flags);
-			pending = test_bit(tag, &hba->outstanding_reqs);
-			spin_unlock_irqrestore(&hba->outstanding_lock, flags);
-
-			if (!pending) {
-				/*
-				 * The completion handler ran while we tried to
-				 * clear the command.
-				 */
-				time_left = 1;
-				goto retry;
-			}
-		}
-	}
-
-	return err;
-}
-
 static void ufshcd_dev_man_lock(struct ufs_hba *hba)
 {
 	ufshcd_hold(hba);
@@ -3337,6 +3269,24 @@ static void ufshcd_dev_man_unlock(struct ufs_hba *hba)
 	ufshcd_release(hba);
 }
 
+static struct scsi_cmnd *ufshcd_get_dev_mgmt_cmd(struct ufs_hba *hba)
+{
+	/*
+	 * The caller must hold this lock to guarantee that the NOWAIT
+	 * allocation will succeed.
+	 */
+	lockdep_assert_held(&hba->dev_cmd.lock);
+
+	return scsi_get_internal_cmd(
+		hba->host->pseudo_sdev, DMA_TO_DEVICE,
+		BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+}
+
+static void ufshcd_put_dev_mgmt_cmd(struct scsi_cmnd *cmd)
+{
+	scsi_put_internal_cmd(cmd);
+}
+
 /*
  * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
  * < 0 if another error occurred.
@@ -3345,16 +3295,14 @@ static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 				const u32 tag, int timeout)
 {
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
-	int err;
-
-	ufshcd_add_query_upiu_trace(hba, UFS_QUERY_SEND, lrbp->ucd_req_ptr);
-	ufshcd_send_command(hba, cmd, hba->dev_cmd_queue);
-	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
-
-	ufshcd_add_query_upiu_trace(hba, err ? UFS_QUERY_ERR : UFS_QUERY_COMP,
-				    (struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
+	struct request *rq = scsi_cmd_to_rq(cmd);
+	blk_status_t sts;
 
-	return err;
+	rq->timeout = timeout;
+	sts = blk_execute_rq(rq, true);
+	if (sts != BLK_STS_OK)
+		return blk_status_to_errno(sts);
+	return lrbp->utr_descriptor_ptr->header.ocs;
 }
 
 /**
@@ -3372,23 +3320,31 @@ static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct scsi_cmnd *cmd,
 static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 		enum dev_cmd_type cmd_type, int timeout)
 {
-	const u32 tag = hba->reserved_slot;
-	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+	struct scsi_cmnd *cmd = ufshcd_get_dev_mgmt_cmd(hba);
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	u32 tag;
 	int err;
 
-	/* Protects use of hba->reserved_slot. */
+	/* Protects use of hba->dev_cmd. */
 	lockdep_assert_held(&hba->dev_cmd.lock);
 
+	if (WARN_ON_ONCE(!cmd))
+		return -ENOMEM;
+
+	tag = scsi_cmd_to_rq(cmd)->tag;
+
 	err = ufshcd_compose_dev_cmd(hba, cmd, cmd_type, tag);
 	if (unlikely(err))
-		return err;
+		goto out;
 
 	err = ufshcd_issue_dev_cmd(hba, cmd, tag, timeout);
-	if (err)
-		return err;
+	if (err == 0)
+		err = ufshcd_dev_cmd_completion(hba, lrbp);
 
-	return ufshcd_dev_cmd_completion(hba, lrbp);
+out:
+	ufshcd_put_dev_mgmt_cmd(cmd);
+
+	return err;
 }
 
 /**
@@ -5658,6 +5614,10 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	enum utp_ocs ocs;
 
+	if (WARN_ONCE(!cmd, "cqe->command_desc_base_addr = %#llx\n",
+		      le64_to_cpu(cqe->command_desc_base_addr)))
+		return;
+
 	if (hba->monitor.enabled) {
 		lrbp->compl_time_stamp = ktime_get();
 		lrbp->compl_time_stamp_local_clock = local_clock();
@@ -5668,15 +5628,21 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
 		ufshcd_add_command_trace(hba, cmd, UFS_CMD_COMP);
 		cmd->result = ufshcd_transfer_rsp_status(hba, cmd, cqe);
 		ufshcd_release_scsi_cmd(hba, cmd);
-		/* Do not touch lrbp after scsi done */
-		scsi_done(cmd);
 	} else {
 		if (cqe) {
 			ocs = cqe->overall_status & MASK_OCS;
 			lrbp->utr_descriptor_ptr->header.ocs = ocs;
+		} else {
+			ocs = lrbp->utr_descriptor_ptr->header.ocs;
 		}
-		complete(&hba->dev_cmd.complete);
+		ufshcd_add_query_upiu_trace(
+			hba,
+			ocs == OCS_SUCCESS ? UFS_QUERY_COMP : UFS_QUERY_ERR,
+			(struct utp_upiu_req *)lrbp->ucd_rsp_ptr);
+		cmd->result = 0;
 	}
+	/* Do not touch lrbp after scsi_done() has been called. */
+	scsi_done(cmd);
 }
 
 /**
@@ -7386,15 +7352,20 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 					enum dev_cmd_type cmd_type,
 					enum query_opcode desc_op)
 {
-	const u32 tag = hba->reserved_slot;
-	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
+	struct scsi_cmnd *cmd = ufshcd_get_dev_mgmt_cmd(hba);
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	u32 tag;
 	int err = 0;
 	u8 upiu_flags;
 
-	/* Protects use of hba->reserved_slot. */
+	/* Protects use of hba->dev_cmd. */
 	lockdep_assert_held(&hba->dev_cmd.lock);
 
+	if (WARN_ON_ONCE(!cmd))
+		return -ENOMEM;
+
+	tag = scsi_cmd_to_rq(cmd)->tag;
+
 	ufshcd_setup_dev_cmd(hba, cmd, cmd_type, 0, tag);
 
 	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0);
@@ -7417,7 +7388,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
 	err = ufshcd_issue_dev_cmd(hba, cmd, tag, dev_cmd_timeout);
 	if (err)
-		return err;
+		goto put_dev_mgmt_cmd;
 
 	/* just copy the upiu response as it is */
 	memcpy(rsp_upiu, lrbp->ucd_rsp_ptr, sizeof(*rsp_upiu));
@@ -7438,6 +7409,9 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 		}
 	}
 
+put_dev_mgmt_cmd:
+	ufshcd_put_dev_mgmt_cmd(cmd);
+
 	return err;
 }
 
@@ -7531,9 +7505,9 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 			 struct ufs_ehs *rsp_ehs, int sg_cnt, struct scatterlist *sg_list,
 			 enum dma_data_direction dir)
 {
-	const u32 tag = hba->reserved_slot;
-	struct scsi_cmnd *cmd = ufshcd_tag_to_cmd(hba, tag);
-	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
+	struct scsi_cmnd *cmd;
+	struct ufshcd_lrb *lrbp;
+	u32 tag;
 	int err = 0;
 	int result;
 	u8 upiu_flags;
@@ -7541,9 +7515,18 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 	u16 ehs_len;
 	int ehs = (hba->capabilities & MASK_EHSLUTRD_SUPPORTED) ? 2 : 0;
 
-	/* Protects use of hba->reserved_slot. */
 	ufshcd_dev_man_lock(hba);
 
+	cmd = ufshcd_get_dev_mgmt_cmd(hba);
+
+	if (WARN_ON_ONCE(!cmd)) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	lrbp = scsi_cmd_priv(cmd);
+	tag = scsi_cmd_to_rq(cmd)->tag;
+
 	ufshcd_setup_dev_cmd(hba, cmd, DEV_CMD_TYPE_RPMB, UFS_UPIU_RPMB_WLUN,
 			     tag);
 
@@ -7564,7 +7547,7 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 
 	err = ufshcd_issue_dev_cmd(hba, cmd, tag, ADVANCED_RPMB_REQ_TIMEOUT);
 	if (err)
-		return err;
+		goto put_dev_mgmt_cmd;
 
 	err = ufshcd_dev_cmd_completion(hba, lrbp);
 	if (!err) {
@@ -7590,6 +7573,10 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 		}
 	}
 
+put_dev_mgmt_cmd:
+	ufshcd_put_dev_mgmt_cmd(cmd);
+
+unlock:
 	ufshcd_dev_man_unlock(hba);
 
 	return err ? : result;
@@ -7760,7 +7747,8 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 {
 	struct Scsi_Host *host = cmd->device->host;
 	struct ufs_hba *hba = shost_priv(host);
-	int tag = scsi_cmd_to_rq(cmd)->tag;
+	struct request *rq = scsi_cmd_to_rq(cmd);
+	int tag = rq->tag;
 	struct ufshcd_lrb *lrbp = scsi_cmd_priv(cmd);
 	unsigned long flags;
 	int err = FAILED;
@@ -7790,7 +7778,8 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	 * to reduce repeated printouts. For other aborted requests only print
 	 * basic details.
 	 */
-	scsi_print_command(cmd);
+	if (ufshcd_is_scsi_cmd(cmd))
+		scsi_print_command(cmd);
 	if (!hba->req_abort_count) {
 		ufshcd_update_evt_hist(hba, UFS_EVT_ABORT, tag);
 		ufshcd_print_evt_hist(hba);
@@ -7842,7 +7831,10 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto release;
 	}
 
-	err = ufshcd_try_to_abort_task(hba, tag);
+	if (blk_mq_is_reserved_rq(rq))
+		err = ufshcd_clear_cmd(hba, tag);
+	else
+		err = ufshcd_try_to_abort_task(hba, tag);
 	if (err) {
 		dev_err(hba->dev, "%s: failed with err %d\n", __func__, err);
 		ufshcd_set_req_abort_skip(hba, hba->outstanding_reqs);
@@ -9212,6 +9204,7 @@ static const struct scsi_host_template ufshcd_driver_template = {
 	.cmd_size		= sizeof(struct ufshcd_lrb),
 	.init_cmd_priv		= ufshcd_init_cmd_priv,
 	.queuecommand		= ufshcd_queuecommand,
+	.queue_reserved_command	= ufshcd_queue_reserved_command,
 	.nr_reserved_cmds	= UFSHCD_NUM_RESERVED,
 	.mq_poll		= ufshcd_poll,
 	.sdev_init		= ufshcd_sdev_init,
@@ -10764,8 +10757,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	 */
 	hba->vcc_off_delay_us = 2000;
 
-	init_completion(&hba->dev_cmd.complete);
-
 	err = ufshcd_hba_init(hba);
 	if (err)
 		goto out_error;
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index a92062f65455..c07ba003a5cb 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -236,13 +236,11 @@ struct ufs_query {
  * struct ufs_dev_cmd - all assosiated fields with device management commands
  * @type: device management command type - Query, NOP OUT
  * @lock: lock to allow one command at a time
- * @complete: internal commands completion
  * @query: Device management query information
  */
 struct ufs_dev_cmd {
 	enum dev_cmd_type type;
 	struct mutex lock;
-	struct completion complete;
 	struct ufs_query query;
 };
 
@@ -838,7 +836,6 @@ enum ufshcd_mcq_opr {
  * @nutrs: Transfer Request Queue depth supported by controller
  * @nortt - Max outstanding RTTs supported by controller
  * @nutmrs: Task Management Queue depth supported by controller
- * @reserved_slot: Used to submit device commands. Protected by @dev_cmd.lock.
  * @ufs_version: UFS Version to which controller complies
  * @vops: pointer to variant specific operations
  * @vps: pointer to variant specific parameters
@@ -929,7 +926,6 @@ enum ufshcd_mcq_opr {
  * @res: array of resource info of MCQ registers
  * @mcq_base: Multi circular queue registers base address
  * @uhq: array of supported hardware queues
- * @dev_cmd_queue: Queue for issuing device management commands
  * @mcq_opr: MCQ operation and runtime registers
  * @ufs_rtc_update_work: A work for UFS RTC periodic update
  * @pm_qos_req: PM QoS request handle
@@ -981,7 +977,6 @@ struct ufs_hba {
 	int nortt;
 	u32 mcq_capabilities;
 	int nutmrs;
-	u32 reserved_slot;
 	u32 ufs_version;
 	const struct ufs_hba_variant_ops *vops;
 	struct ufs_hba_variant_params *vps;
@@ -1099,7 +1094,6 @@ struct ufs_hba {
 	bool mcq_esi_enabled;
 	void __iomem *mcq_base;
 	struct ufs_hw_queue *uhq;
-	struct ufs_hw_queue *dev_cmd_queue;
 	struct ufshcd_mcq_opr_info_t mcq_opr[OPR_MAX];
 
 	struct delayed_work ufs_rtc_update_work;
-- 
cgit v1.2.3


From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 7 Nov 2025 13:41:31 -0400
Subject: vfio: Add get_region_info_caps op

With this op the core does the copy to/from user for the info, and the
driver can return a cap chain through a struct vfio_info_cap * result.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/vfio_main.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/vfio.h     |  4 ++++
 2 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index f056e82ba350..48d034aede46 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1259,6 +1259,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
 	}
 }
 
+static long vfio_get_region_info(struct vfio_device *device,
+				 struct vfio_region_info __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+	struct vfio_region_info info = {};
+	struct vfio_info_cap caps = {};
+	int ret;
+
+	if (copy_from_user(&info, arg, minsz))
+		return -EFAULT;
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	if (device->ops->get_region_info_caps) {
+		ret = device->ops->get_region_info_caps(device, &info, &caps);
+		if (ret)
+			goto out_free;
+
+		if (caps.size) {
+			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+				info.cap_offset = 0;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user(arg + 1, caps.buf,
+						 caps.size)) {
+					ret = -EFAULT;
+					goto out_free;
+				}
+				info.cap_offset = sizeof(info);
+			}
+		}
+
+		if (copy_to_user(arg, &info, minsz)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+	} else if (device->ops->get_region_info) {
+		ret = device->ops->get_region_info(device, arg);
+		if (ret)
+			return ret;
+	} else {
+		return -EINVAL;
+	}
+
+out_free:
+	kfree(caps.buf);
+	return ret;
+}
+
 static long vfio_device_fops_unl_ioctl(struct file *filep,
 				       unsigned int cmd, unsigned long arg)
 {
@@ -1297,10 +1348,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
 		break;
 
 	case VFIO_DEVICE_GET_REGION_INFO:
-		if (unlikely(!device->ops->get_region_info))
-			ret = -EINVAL;
-		else
-			ret = device->ops->get_region_info(device, uptr);
+		ret = vfio_get_region_info(device, uptr);
 		break;
 
 	default:
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index be5fcf8432e8..6311ddc83770 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -21,6 +21,7 @@ struct kvm;
 struct iommufd_ctx;
 struct iommufd_device;
 struct iommufd_access;
+struct vfio_info_cap;
 
 /*
  * VFIO devices can be placed in a set, this allows all devices to share this
@@ -134,6 +135,9 @@ struct vfio_device_ops {
 			 unsigned long arg);
 	int	(*get_region_info)(struct vfio_device *vdev,
 				   struct vfio_region_info __user *arg);
+	int	(*get_region_info_caps)(struct vfio_device *vdev,
+					struct vfio_region_info *info,
+					struct vfio_info_cap *caps);
 	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
 	void	(*request)(struct vfio_device *vdev, unsigned int count);
 	int	(*match)(struct vfio_device *vdev, char *buf);
-- 
cgit v1.2.3


From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 7 Nov 2025 13:41:35 -0400
Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps

Since the core function signature changes it has to flow up to all
drivers.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c |  30 +++----
 drivers/vfio/pci/mlx5/main.c                   |   2 +-
 drivers/vfio/pci/nvgrace-gpu/main.c            |  51 +++---------
 drivers/vfio/pci/pds/vfio_dev.c                |   2 +-
 drivers/vfio/pci/qat/main.c                    |   2 +-
 drivers/vfio/pci/vfio_pci.c                    |   2 +-
 drivers/vfio/pci/vfio_pci_core.c               | 103 ++++++++++---------------
 drivers/vfio/pci/virtio/common.h               |   3 +-
 drivers/vfio/pci/virtio/legacy_io.c            |  26 ++-----
 drivers/vfio/pci/virtio/main.c                 |   6 +-
 include/linux/vfio_pci_core.h                  |   3 +-
 11 files changed, 80 insertions(+), 150 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 1fd3a2075ee4..cf45f6370c36 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
 }
 
 static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev,
-					  struct vfio_region_info __user *arg)
+					  struct vfio_region_info *info,
+					  struct vfio_info_cap *caps)
 {
 	struct vfio_pci_core_device *vdev =
 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
-	struct vfio_region_info info;
-	unsigned long minsz;
-
-	minsz = offsetofend(struct vfio_region_info, offset);
 
-	if (copy_from_user(&info, arg, minsz))
-		return -EFAULT;
+	if (info->index != VFIO_PCI_BAR2_REGION_INDEX)
+		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
 
-	if (info.argsz < minsz)
-		return -EINVAL;
+	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
 
-	if (info.index != VFIO_PCI_BAR2_REGION_INDEX)
-		return vfio_pci_ioctl_get_region_info(core_vdev, arg);
+	info->size = hisi_acc_get_resource_len(vdev, info->index);
 
-	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-
-	info.size = hisi_acc_get_resource_len(vdev, info.index);
-
-	info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+	info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
 		     VFIO_REGION_INFO_FLAG_MMAP;
-
-	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+	return 0;
 }
 
 static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
@@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.open_device = hisi_acc_vfio_pci_open_device,
 	.close_device = hisi_acc_vfio_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = hisi_acc_vfio_ioctl_get_region,
+	.get_region_info_caps = hisi_acc_vfio_ioctl_get_region,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = hisi_acc_vfio_pci_read,
 	.write = hisi_acc_vfio_pci_write,
@@ -1631,7 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
 	.open_device = hisi_acc_vfio_pci_open_device,
 	.close_device = vfio_pci_core_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index b7f941f8047e..9c5970411d07 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.open_device = mlx5vf_pci_open_device,
 	.close_device = mlx5vf_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index cab743a30dc3..5a6f77d5c81e 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -205,34 +205,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
 	return 0;
 }
 
-static int
-nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
-				  struct vfio_region_info __user *arg)
+static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+					     struct vfio_region_info *info,
+					     struct vfio_info_cap *caps)
 {
 	struct nvgrace_gpu_pci_core_device *nvdev =
 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
 			     core_device.vdev);
-	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
-	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 	struct vfio_region_info_cap_sparse_mmap *sparse;
-	struct vfio_region_info info;
 	struct mem_region *memregion;
 	u32 size;
 	int ret;
 
-	if (copy_from_user(&info, arg, minsz))
-		return -EFAULT;
-
-	if (info.argsz < minsz)
-		return -EINVAL;
-
 	/*
 	 * Request to determine the BAR region information. Send the
 	 * GPU memory information.
 	 */
-	memregion = nvgrace_gpu_memregion(info.index, nvdev);
+	memregion = nvgrace_gpu_memregion(info->index, nvdev);
 	if (!memregion)
-		return vfio_pci_ioctl_get_region_info(core_vdev, arg);
+		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
 
 	size = struct_size(sparse, areas, 1);
 
@@ -251,40 +242,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
 	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
 	sparse->header.version = 1;
 
-	ret = vfio_info_add_capability(&caps, &sparse->header, size);
+	ret = vfio_info_add_capability(caps, &sparse->header, size);
 	kfree(sparse);
 	if (ret)
 		return ret;
 
-	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
 	/*
 	 * The region memory size may not be power-of-2 aligned.
 	 * Given that the memory is a BAR and may not be
 	 * aligned, roundup to the next power-of-2.
 	 */
-	info.size = memregion->bar_size;
-	info.flags = VFIO_REGION_INFO_FLAG_READ |
+	info->size = memregion->bar_size;
+	info->flags = VFIO_REGION_INFO_FLAG_READ |
 		     VFIO_REGION_INFO_FLAG_WRITE |
 		     VFIO_REGION_INFO_FLAG_MMAP;
-
-	if (caps.size) {
-		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
-		if (info.argsz < sizeof(info) + caps.size) {
-			info.argsz = sizeof(info) + caps.size;
-			info.cap_offset = 0;
-		} else {
-			vfio_info_cap_shift(&caps, sizeof(info));
-			if (copy_to_user((void __user *)arg +
-					 sizeof(info), caps.buf,
-					 caps.size)) {
-				kfree(caps.buf);
-				return -EFAULT;
-			}
-			info.cap_offset = sizeof(info);
-		}
-		kfree(caps.buf);
-	}
-	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+	return 0;
 }
 
 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
@@ -686,7 +659,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
 	.open_device	= nvgrace_gpu_open_device,
 	.close_device	= nvgrace_gpu_close_device,
 	.ioctl		= nvgrace_gpu_ioctl,
-	.get_region_info = nvgrace_gpu_ioctl_get_region_info,
+	.get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
 	.device_feature	= vfio_pci_core_ioctl_feature,
 	.read		= nvgrace_gpu_read,
 	.write		= nvgrace_gpu_write,
@@ -707,7 +680,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
 	.open_device	= nvgrace_gpu_open_device,
 	.close_device	= vfio_pci_core_close_device,
 	.ioctl		= vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature	= vfio_pci_core_ioctl_feature,
 	.read		= vfio_pci_core_read,
 	.write		= vfio_pci_core_write,
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index 1946bc75d99b..be103c74e969 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = {
 	.open_device = pds_vfio_open_device,
 	.close_device = pds_vfio_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index 8452d9c1d11d..8fbdf7c6d666 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = {
 	.open_device = qat_vf_pci_open_device,
 	.close_device = qat_vf_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
 	.mmap = vfio_pci_core_mmap,
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 2d9122efc10b..a3e49d42c771 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.open_device	= vfio_pci_open_device,
 	.close_device	= vfio_pci_core_close_device,
 	.ioctl		= vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read		= vfio_pci_core_read,
 	.write		= vfio_pci_core_write,
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index f21d9026068c..57c0766fb9f8 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -997,43 +997,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
 }
 
 int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
-				   struct vfio_region_info __user *arg)
+				   struct vfio_region_info *info,
+				   struct vfio_info_cap *caps)
 {
 	struct vfio_pci_core_device *vdev =
 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
-	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
 	struct pci_dev *pdev = vdev->pdev;
-	struct vfio_region_info info;
-	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 	int i, ret;
 
-	if (copy_from_user(&info, arg, minsz))
-		return -EFAULT;
-
-	if (info.argsz < minsz)
-		return -EINVAL;
-
-	switch (info.index) {
+	switch (info->index) {
 	case VFIO_PCI_CONFIG_REGION_INDEX:
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.size = pdev->cfg_size;
-		info.flags = VFIO_REGION_INFO_FLAG_READ |
-			     VFIO_REGION_INFO_FLAG_WRITE;
+		info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = pdev->cfg_size;
+		info->flags = VFIO_REGION_INFO_FLAG_READ |
+			      VFIO_REGION_INFO_FLAG_WRITE;
 		break;
 	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.size = pci_resource_len(pdev, info.index);
-		if (!info.size) {
-			info.flags = 0;
+		info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = pci_resource_len(pdev, info->index);
+		if (!info->size) {
+			info->flags = 0;
 			break;
 		}
 
-		info.flags = VFIO_REGION_INFO_FLAG_READ |
-			     VFIO_REGION_INFO_FLAG_WRITE;
-		if (vdev->bar_mmap_supported[info.index]) {
-			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
-			if (info.index == vdev->msix_bar) {
-				ret = msix_mmappable_cap(vdev, &caps);
+		info->flags = VFIO_REGION_INFO_FLAG_READ |
+			      VFIO_REGION_INFO_FLAG_WRITE;
+		if (vdev->bar_mmap_supported[info->index]) {
+			info->flags |= VFIO_REGION_INFO_FLAG_MMAP;
+			if (info->index == vdev->msix_bar) {
+				ret = msix_mmappable_cap(vdev, caps);
 				if (ret)
 					return ret;
 			}
@@ -1045,9 +1037,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
 		size_t size;
 		u16 cmd;
 
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.flags = 0;
-		info.size = 0;
+		info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->flags = 0;
+		info->size = 0;
 
 		if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
 			/*
@@ -1057,16 +1049,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
 			cmd = vfio_pci_memory_lock_and_enable(vdev);
 			io = pci_map_rom(pdev, &size);
 			if (io) {
-				info.flags = VFIO_REGION_INFO_FLAG_READ;
+				info->flags = VFIO_REGION_INFO_FLAG_READ;
 				/* Report the BAR size, not the ROM size. */
-				info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+				info->size = pci_resource_len(pdev,
+							      PCI_ROM_RESOURCE);
 				pci_unmap_rom(pdev, io);
 			}
 			vfio_pci_memory_unlock_and_restore(vdev, cmd);
 		} else if (pdev->rom && pdev->romlen) {
-			info.flags = VFIO_REGION_INFO_FLAG_READ;
+			info->flags = VFIO_REGION_INFO_FLAG_READ;
 			/* Report BAR size as power of two. */
-			info.size = roundup_pow_of_two(pdev->romlen);
+			info->size = roundup_pow_of_two(pdev->romlen);
 		}
 
 		break;
@@ -1075,10 +1068,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
 		if (!vdev->has_vga)
 			return -EINVAL;
 
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.size = 0xc0000;
-		info.flags = VFIO_REGION_INFO_FLAG_READ |
-			     VFIO_REGION_INFO_FLAG_WRITE;
+		info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = 0xc0000;
+		info->flags = VFIO_REGION_INFO_FLAG_READ |
+			      VFIO_REGION_INFO_FLAG_WRITE;
 
 		break;
 	default: {
@@ -1087,52 +1080,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
 			.header.version = 1
 		};
 
-		if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+		if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
 			return -EINVAL;
-		info.index = array_index_nospec(
-			info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
+		info->index = array_index_nospec(
+			info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
 
-		i = info.index - VFIO_PCI_NUM_REGIONS;
+		i = info->index - VFIO_PCI_NUM_REGIONS;
 
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.size = vdev->region[i].size;
-		info.flags = vdev->region[i].flags;
+		info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = vdev->region[i].size;
+		info->flags = vdev->region[i].flags;
 
 		cap_type.type = vdev->region[i].type;
 		cap_type.subtype = vdev->region[i].subtype;
 
-		ret = vfio_info_add_capability(&caps, &cap_type.header,
+		ret = vfio_info_add_capability(caps, &cap_type.header,
 					       sizeof(cap_type));
 		if (ret)
 			return ret;
 
 		if (vdev->region[i].ops->add_capability) {
 			ret = vdev->region[i].ops->add_capability(
-				vdev, &vdev->region[i], &caps);
+				vdev, &vdev->region[i], caps);
 			if (ret)
 				return ret;
 		}
 	}
 	}
-
-	if (caps.size) {
-		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
-		if (info.argsz < sizeof(info) + caps.size) {
-			info.argsz = sizeof(info) + caps.size;
-			info.cap_offset = 0;
-		} else {
-			vfio_info_cap_shift(&caps, sizeof(info));
-			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
-				kfree(caps.buf);
-				return -EFAULT;
-			}
-			info.cap_offset = sizeof(*arg);
-		}
-
-		kfree(caps.buf);
-	}
-
-	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info);
 
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
index a10f2d92cb62..cb3d5e57d3a3 100644
--- a/drivers/vfio/pci/virtio/common.h
+++ b/drivers/vfio/pci/virtio/common.h
@@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev);
 #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
 int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
 int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
-				       struct vfio_region_info __user *arg);
+				       struct vfio_region_info *info,
+				       struct vfio_info_cap *caps);
 ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
 				const char __user *buf, size_t count,
 				loff_t *ppos);
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
index d735d5c4bd77..1ed349a55629 100644
--- a/drivers/vfio/pci/virtio/legacy_io.c
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user
 }
 
 int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
-				       struct vfio_region_info __user *arg)
+				       struct vfio_region_info *info,
+				       struct vfio_info_cap *caps)
 {
 	struct virtiovf_pci_core_device *virtvdev = container_of(
 		core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
-	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
-	struct vfio_region_info info = {};
 
-	if (copy_from_user(&info, arg, minsz))
-		return -EFAULT;
+	if (info->index != VFIO_PCI_BAR0_REGION_INDEX)
+		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
 
-	if (info.argsz < minsz)
-		return -EINVAL;
-
-	switch (info.index) {
-	case VFIO_PCI_BAR0_REGION_INDEX:
-		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-		info.size = virtvdev->bar0_virtual_buf_size;
-		info.flags = VFIO_REGION_INFO_FLAG_READ |
-			     VFIO_REGION_INFO_FLAG_WRITE;
-		return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
-	default:
-		return vfio_pci_ioctl_get_region_info(core_vdev, arg);
-	}
+	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+	info->size = virtvdev->bar0_virtual_buf_size;
+	info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+	return 0;
 }
 
 static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index d68096bc5252..d2e5cbca13c8 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
 	.open_device = virtiovf_pci_open_device,
 	.close_device = virtiovf_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
@@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
 	.open_device = virtiovf_pci_open_device,
 	.close_device = virtiovf_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = virtiovf_pci_ioctl_get_region_info,
+	.get_region_info_caps = virtiovf_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = virtiovf_pci_core_read,
 	.write = virtiovf_pci_core_write,
@@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
 	.open_device = virtiovf_pci_open_device,
 	.close_device = vfio_pci_core_close_device,
 	.ioctl = vfio_pci_core_ioctl,
-	.get_region_info = vfio_pci_ioctl_get_region_info,
+	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 160bc2e31ece..e74f94c17fbe 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz);
 int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
-				   struct vfio_region_info __user *arg);
+				   struct vfio_region_info *info,
+				   struct vfio_info_cap *caps);
 ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
 		size_t count, loff_t *ppos);
 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
-- 
cgit v1.2.3


From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 7 Nov 2025 13:41:38 -0400
Subject: vfio: Remove the get_region_info op

No driver uses it now; all are using get_region_info_caps().

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/vfio_main.c | 50 +++++++++++++++++++++---------------------------
 include/linux/vfio.h     |  2 --
 2 files changed, 22 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 48d034aede46..b8fe1a75e48a 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1267,42 +1267,36 @@ static long vfio_get_region_info(struct vfio_device *device,
 	struct vfio_info_cap caps = {};
 	int ret;
 
+	if (unlikely(!device->ops->get_region_info_caps))
+		return -EINVAL;
+
 	if (copy_from_user(&info, arg, minsz))
 		return -EFAULT;
 	if (info.argsz < minsz)
 		return -EINVAL;
 
-	if (device->ops->get_region_info_caps) {
-		ret = device->ops->get_region_info_caps(device, &info, &caps);
-		if (ret)
-			goto out_free;
-
-		if (caps.size) {
-			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
-			if (info.argsz < sizeof(info) + caps.size) {
-				info.argsz = sizeof(info) + caps.size;
-				info.cap_offset = 0;
-			} else {
-				vfio_info_cap_shift(&caps, sizeof(info));
-				if (copy_to_user(arg + 1, caps.buf,
-						 caps.size)) {
-					ret = -EFAULT;
-					goto out_free;
-				}
-				info.cap_offset = sizeof(info);
+	ret = device->ops->get_region_info_caps(device, &info, &caps);
+	if (ret)
+		goto out_free;
+
+	if (caps.size) {
+		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+		if (info.argsz < sizeof(info) + caps.size) {
+			info.argsz = sizeof(info) + caps.size;
+			info.cap_offset = 0;
+		} else {
+			vfio_info_cap_shift(&caps, sizeof(info));
+			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
+				ret = -EFAULT;
+				goto out_free;
 			}
+			info.cap_offset = sizeof(info);
 		}
+	}
 
-		if (copy_to_user(arg, &info, minsz)) {
-			ret = -EFAULT;
-			goto out_free;
-		}
-	} else if (device->ops->get_region_info) {
-		ret = device->ops->get_region_info(device, arg);
-		if (ret)
-			return ret;
-	} else {
-		return -EINVAL;
+	if (copy_to_user(arg, &info, minsz)){
+		ret = -EFAULT;
+		goto out_free;
 	}
 
 out_free:
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 6311ddc83770..8e1ddb48b9b5 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -133,8 +133,6 @@ struct vfio_device_ops {
 			 size_t count, loff_t *size);
 	long	(*ioctl)(struct vfio_device *vdev, unsigned int cmd,
 			 unsigned long arg);
-	int	(*get_region_info)(struct vfio_device *vdev,
-				   struct vfio_region_info __user *arg);
 	int	(*get_region_info_caps)(struct vfio_device *vdev,
 					struct vfio_region_info *info,
 					struct vfio_info_cap *caps);
-- 
cgit v1.2.3


From 044b9f1a7f4f3d41563007d0762c83a7d7505ac0 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Wed, 12 Nov 2025 08:46:14 +0100
Subject: PCI/PTM: Enable only if device advertises relevant role

We have a Switch Upstream Port (2b:00.0) that has a PTM Capability, but
doesn't advertise support for any PTM roles:

  Capabilities: [220 v1] Precision Time Measurement
                PTMCap: Requester- Responder- Root-

Linux enables PTM without looking into what roles it actually supports, and
apparently the Port immediately sends PTM Requests even though it doesn't
support the PTM Requester role. The messages include an invalid bus number,
so the Root Port detects an ACS Violation (see the PCIe r7.0, sec 6.12.1.1,
implementation note):

  pci 0000:2b:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port
  pci 0000:2b:00.0: PTM enabled, 4ns granularity
  pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1
  pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID)
  pcieport 0000:00:07.1:   device [8086:e44f] error status/mask=00200000/00000000
  pcieport 0000:00:07.1:    [21] ACSViol                (First)
  pcieport 0000:00:07.1: AER:   TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000

The TLP Header shows a 4 DW header, no data (001b) Msg with Local routing
(1 0100b) with Requester ID 0x0000 and PTM Request code (0x52).

Fix this by enabling PTM only if the following conditions are true (see sec
6.21.1 figure 6-21):

  - Endpoint must advertise PTM Requester Capable

  - Switch Upstream Port must advertise PTM Responder Capable

  - Root Port must advertise PTM Root Capable

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
[bhelgaas: commit log, comments]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20251112074614.1440266-1-mika.westerberg@linux.intel.com
---
 drivers/pci/pcie/ptm.c | 23 +++++++++++++++++++++++
 include/linux/pci.h    |  2 ++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c
index 65e4b008be00..ed0f9691e7d1 100644
--- a/drivers/pci/pcie/ptm.c
+++ b/drivers/pci/pcie/ptm.c
@@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev)
 		dev->ptm_granularity = 0;
 	}
 
+	if (cap & PCI_PTM_CAP_RES)
+		dev->ptm_responder = 1;
+	if (cap & PCI_PTM_CAP_REQ)
+		dev->ptm_requester = 1;
+
 	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
 	    pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM)
 		pci_enable_ptm(dev, NULL);
@@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev)
 			return -EINVAL;
 	}
 
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (!dev->ptm_root)
+			return -EINVAL;
+		break;
+	case PCI_EXP_TYPE_UPSTREAM:
+		if (!dev->ptm_responder)
+			return -EINVAL;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_LEG_END:
+		if (!dev->ptm_requester)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl);
 
 	ctrl |= PCI_PTM_CTRL_ENABLE;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..d5018cb5c331 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -500,6 +500,8 @@ struct pci_dev {
 #ifdef CONFIG_PCIE_PTM
 	u16		ptm_cap;		/* PTM Capability */
 	unsigned int	ptm_root:1;
+	unsigned int	ptm_responder:1;
+	unsigned int	ptm_requester:1;
 	unsigned int	ptm_enabled:1;
 	u8		ptm_granularity;
 #endif
-- 
cgit v1.2.3


From 6276c67f2bc4aeaf350a7cf889c33c38b3330ea9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 12 Nov 2025 09:39:44 -0800
Subject: x86: Restrict KVM-induced symbol exports to KVM modules where
 obvious/possible

Extend KVM's export macro framework to provide EXPORT_SYMBOL_FOR_KVM(),
and use the helper macro to export symbols for KVM throughout x86 if and
only if KVM will build one or more modules, and only for those modules.

To avoid unnecessary exports when CONFIG_KVM=m but kvm.ko will not be
built (because no vendor modules are selected), let arch code #define
EXPORT_SYMBOL_FOR_KVM to suppress/override the exports.

Note, the set of symbols to restrict to KVM was generated by manual search
and audit; any "misses" are due to human error, not some grand plan.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Kai Huang <kai.huang@intel.com>
Tested-by: Kai Huang <kai.huang@intel.com>
Link: https://patch.msgid.link/20251112173944.1380633-5-seanjc%40google.com
---
 arch/x86/entry/entry.S             |  7 ++--
 arch/x86/entry/entry_64.S          |  3 +-
 arch/x86/entry/entry_64_fred.S     |  3 +-
 arch/x86/events/amd/core.c         |  5 +--
 arch/x86/events/core.c             |  7 ++--
 arch/x86/events/intel/lbr.c        |  3 +-
 arch/x86/events/intel/pt.c         |  7 ++--
 arch/x86/include/asm/kvm_types.h   |  5 +++
 arch/x86/kernel/apic/apic.c        |  3 +-
 arch/x86/kernel/apic/apic_common.c |  3 +-
 arch/x86/kernel/cpu/amd.c          |  4 +--
 arch/x86/kernel/cpu/bugs.c         | 17 +++++-----
 arch/x86/kernel/cpu/bus_lock.c     |  3 +-
 arch/x86/kernel/cpu/common.c       |  7 ++--
 arch/x86/kernel/cpu/sgx/main.c     |  3 +-
 arch/x86/kernel/cpu/sgx/virt.c     |  5 +--
 arch/x86/kernel/e820.c             |  3 +-
 arch/x86/kernel/fpu/core.c         | 21 ++++++------
 arch/x86/kernel/fpu/xstate.c       |  7 ++--
 arch/x86/kernel/hw_breakpoint.c    |  3 +-
 arch/x86/kernel/irq.c              |  3 +-
 arch/x86/kernel/kvm.c              |  5 +--
 arch/x86/kernel/nmi.c              |  5 ++-
 arch/x86/kernel/process_64.c       |  5 ++-
 arch/x86/kernel/reboot.c           |  5 +--
 arch/x86/kernel/tsc.c              |  1 +
 arch/x86/lib/cache-smp.c           |  9 ++---
 arch/x86/lib/msr.c                 |  5 +--
 arch/x86/mm/pat/memtype.c          |  3 +-
 arch/x86/mm/tlb.c                  |  5 +--
 arch/x86/virt/vmx/tdx/tdx.c        | 69 +++++++++++++++++++-------------------
 include/linux/kvm_types.h          | 14 ++++++++
 32 files changed, 144 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 8e9a0cc20a4a..1d723c5ae9dd 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -4,6 +4,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/linkage.h>
 #include <linux/objtool.h>
 #include <asm/msr-index.h>
@@ -29,8 +30,7 @@ SYM_FUNC_START(write_ibpb)
 	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
 	RET
 SYM_FUNC_END(write_ibpb)
-/* For KVM */
-EXPORT_SYMBOL_GPL(write_ibpb);
+EXPORT_SYMBOL_FOR_KVM(write_ibpb);
 
 .popsection
 
@@ -48,8 +48,7 @@ SYM_CODE_START_NOALIGN(x86_verw_sel)
 	.word __KERNEL_DS
 .align L1_CACHE_BYTES, 0xcc
 SYM_CODE_END(x86_verw_sel);
-/* For KVM */
-EXPORT_SYMBOL_GPL(x86_verw_sel);
+EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
 
 .popsection
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..f9983a1907bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -19,6 +19,7 @@
  * - idtentry:		Define exception entry points.
  */
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
@@ -1566,5 +1567,5 @@ SYM_FUNC_START(clear_bhb_loop)
 	pop	%rbp
 	RET
 SYM_FUNC_END(clear_bhb_loop)
-EXPORT_SYMBOL_GPL(clear_bhb_loop)
+EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop)
 STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index fafbd3e68cb8..894f7f16eb80 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -4,6 +4,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 
 #include <asm/asm.h>
 #include <asm/fred.h>
@@ -146,5 +147,5 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm);
+EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index b20661b8621d..2dd9afb8dd9d 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -2,6 +2,7 @@
 #include <linux/perf_event.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -1569,7 +1570,7 @@ void amd_pmu_enable_virt(void)
 	/* Reload all events */
 	amd_pmu_reload_virt();
 }
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt);
 
 void amd_pmu_disable_virt(void)
 {
@@ -1586,4 +1587,4 @@ void amd_pmu_disable_virt(void)
 	/* Reload all events */
 	amd_pmu_reload_virt();
 }
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 745caa6c15a3..b5e397fa0835 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -20,6 +20,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/kdebug.h>
+#include <linux/kvm_types.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/clock.h>
 #include <linux/uaccess.h>
@@ -714,7 +715,7 @@ struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
 {
 	return static_call(x86_pmu_guest_get_msrs)(nr, data);
 }
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs);
 
 /*
  * There may be PMI landing after enabled=0. The PMI hitting could be before or
@@ -3106,7 +3107,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 	cap->pebs_ept		= x86_pmu.pebs_ept;
 }
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability);
 
 u64 perf_get_hw_event_config(int hw_event)
 {
@@ -3117,4 +3118,4 @@ u64 perf_get_hw_event_config(int hw_event)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
+EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7aa59966e7c3..72f2adcda7c6 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/kvm_types.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
@@ -1705,7 +1706,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 	lbr->info = x86_pmu.lbr_info;
 	lbr->has_callstack = x86_pmu_has_lbr_callstack();
 }
-EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
+EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr);
 
 struct event_constraint vlbr_constraint =
 	__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e8cf29d2b10c..44524a387c58 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -17,6 +17,7 @@
 #include <linux/limits.h>
 #include <linux/slab.h>
 #include <linux/device.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpuid/api.h>
 #include <asm/perf_event.h>
@@ -82,13 +83,13 @@ u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
 
 	return (c & cd->mask) >> shift;
 }
-EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap);
 
 u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
 {
 	return intel_pt_validate_cap(pt_pmu.caps, cap);
 }
-EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap);
 
 static ssize_t pt_cap_show(struct device *cdev,
 			   struct device_attribute *attr,
@@ -1590,7 +1591,7 @@ void intel_pt_handle_vmx(int on)
 
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx);
 
 /*
  * PMU callbacks
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index 23268a188e70..d7c704ed1be9 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -10,6 +10,11 @@
 #define KVM_SUB_MODULES kvm-intel
 #else
 #undef KVM_SUB_MODULES
+/*
+ * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff
+ * at least one vendor module is enabled.
+ */
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
 #endif
 
 #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..dcf4dc7a9eac 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,7 @@
 #include <linux/dmi.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/kvm_types.h>
 
 #include <xen/xen.h>
 
@@ -2316,7 +2317,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
 		dest |= msg->arch_addr_hi.destid_8_31 << 8;
 	return dest;
 }
-EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
 
 static void __init apic_bsp_up_setup(void)
 {
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 9ef3be866832..2ed3b5c88c7f 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: GPL-2.0
  */
 #include <linux/irq.h>
+#include <linux/kvm_types.h>
 #include <asm/apic.h>
 
 #include "local.h"
@@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu)
 	else
 		return BAD_APICID;
 }
-EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
 
 /*
  * Set up the logical destination ID when the APIC operates in logical
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8e36964a7721..69a3c02cab48 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -3,7 +3,7 @@
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
-
+#include <linux/kvm_types.h>
 #include <linux/io.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
@@ -1310,7 +1310,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
 
 	return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
 }
-EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
 
 static void zenbleed_check_cpu(void *unused)
 {
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 57c1d0ed36a5..d11a7655994e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,6 +16,7 @@
 #include <linux/sched/smt.h>
 #include <linux/pgtable.h>
 #include <linux/bpf.h>
+#include <linux/kvm_types.h>
 
 #include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
@@ -178,7 +179,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 /* Control IBPB on vCPU load */
 DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
-EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
 
 /* Control CPU buffer clear before idling (halt, mwait) */
 DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
@@ -197,7 +198,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
  * mitigation is required.
  */
 DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear);
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"mitigations: " fmt
@@ -365,7 +366,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
 		speculation_ctrl_update(tif);
 	}
 }
-EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
 
 static void x86_amd_ssb_disable(void)
 {
@@ -1031,7 +1032,7 @@ bool gds_ucode_mitigated(void)
 	return (gds_mitigation == GDS_MITIGATION_FULL ||
 		gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
 }
-EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
 
 void update_gds_msr(void)
 {
@@ -2858,7 +2859,7 @@ void x86_spec_ctrl_setup_ap(void)
 }
 
 bool itlb_multihit_kvm_mitigation;
-EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
@@ -2866,11 +2867,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
 /* Default mitigation for L1TF-affected CPUs */
 enum l1tf_mitigations l1tf_mitigation __ro_after_init =
 	IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(l1tf_mitigation);
-#endif
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
 enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
 
 /*
  * These CPUs all support 44bits physical address space internally in the
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 981f8b1f0792..dbc99a47be45 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -6,6 +6,7 @@
 #include <linux/workqueue.h>
 #include <linux/delay.h>
 #include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
 #include <asm/cpu_device_id.h>
 #include <asm/cmdline.h>
 #include <asm/traps.h>
@@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip)
 	force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
 	return false;
 }
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
 
 void bus_lock_init(void)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7d3512914ca..71bb04e6a5bc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,6 +7,7 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
@@ -460,14 +461,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear)
 		__write_cr4(newval);
 	}
 }
-EXPORT_SYMBOL(cr4_update_irqsoff);
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
 
 /* Read the CR4 shadow. */
 unsigned long cr4_read_shadow(void)
 {
 	return this_cpu_read(cpu_tlbstate.cr4);
 }
-EXPORT_SYMBOL_GPL(cr4_read_shadow);
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
 
 void cr4_init(void)
 {
@@ -722,7 +723,7 @@ void load_direct_gdt(int cpu)
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 }
-EXPORT_SYMBOL_GPL(load_direct_gdt);
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
 
 /* Load a fixmap remapping of the per-cpu GDT */
 void load_fixmap_gdt(int cpu)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 2de01b379aa3..fc8fb64d62f4 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -5,6 +5,7 @@
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/kvm_types.h>
 #include <linux/miscdevice.h>
 #include <linux/node.h>
 #include <linux/pagemap.h>
@@ -915,7 +916,7 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
 	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(sgx_set_attribute);
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
 
 static int __init sgx_init(void)
 {
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7aaa3652e31d..727f2570c8b9 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -5,6 +5,7 @@
  * Copyright(c) 2021 Intel Corporation.
  */
 
+#include <linux/kvm_types.h>
 #include <linux/miscdevice.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -363,7 +364,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
 	WARN_ON_ONCE(ret);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
 
 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
 			    void __user *secs)
@@ -432,4 +433,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(sgx_virt_einit);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c3acbd26408b..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -16,6 +16,7 @@
 #include <linux/firmware-map.h>
 #include <linux/sort.h>
 #include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
 
 #include <asm/e820/api.h>
 #include <asm/setup.h>
@@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
 {
 	return _e820__mapped_any(e820_table_firmware, start, end, type);
 }
-EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
 
 bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
 {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e88eacb1b5bb..da233f20ae6f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -18,6 +18,7 @@
 #include <uapi/asm/kvm.h>
 
 #include <linux/hardirq.h>
+#include <linux/kvm_types.h>
 #include <linux/pkeys.h>
 #include <linux/vmalloc.h>
 
@@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
 
 	return true;
 }
-EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
 
 void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
 {
@@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
 	gfpu->fpstate = NULL;
 	vfree(fpstate);
 }
-EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
 
 /*
   * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
@@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
 
 	return __xfd_enable_feature(xfeatures, guest_fpu);
 }
-EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
 
 #ifdef CONFIG_X86_64
 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
@@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
 		xfd_update_state(guest_fpu->fpstate);
 	fpregs_unlock();
 }
-EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
 
 /**
  * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
@@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void)
 		__this_cpu_write(xfd_state, fpstate->xfd);
 	}
 }
-EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
 #endif /* CONFIG_X86_64 */
 
 int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
@@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
 	fpregs_unlock();
 	return 0;
 }
-EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
 
 void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
 				    unsigned int size, u64 xfeatures, u32 pkru)
@@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
 		ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
 	}
 }
-EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
 
 int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
 				   u64 xcr0, u32 *vpkru)
@@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
 
 	return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
 }
-EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
 #endif /* CONFIG_KVM */
 
 void kernel_fpu_begin_mask(unsigned int kfpu_mask)
@@ -857,7 +858,7 @@ void switch_fpu_return(void)
 
 	fpregs_restore_userregs();
 }
-EXPORT_SYMBOL_GPL(switch_fpu_return);
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
 
 void fpregs_lock_and_load(void)
 {
@@ -892,7 +893,7 @@ void fpregs_assert_state_consistent(void)
 
 	WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
 }
-EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
 #endif
 
 void fpregs_mark_activate(void)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 28e4fd65c9da..48113c5193aa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -8,6 +8,7 @@
 #include <linux/compat.h>
 #include <linux/cpu.h>
 #include <linux/mman.h>
+#include <linux/kvm_types.h>
 #include <linux/nospec.h>
 #include <linux/pkeys.h>
 #include <linux/seq_file.h>
@@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 
 	return __raw_xsave_addr(xsave, xfeature_nr);
 }
-EXPORT_SYMBOL_GPL(get_xsave_addr);
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
 
 /*
  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
@@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu
 	if (addr)
 		memset(addr, 0, xstate_sizes[xfeature]);
 }
-EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void)
 {
 	return xstate_get_group_perm(true);
 }
-EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
 
 /**
  * fpu_xstate_prctl - xstate permission operations
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index b01644c949b2..f846c15f21ca 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
+#include <linux/kvm_types.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
@@ -489,7 +490,7 @@ void hw_breakpoint_restore(void)
 	set_debugreg(DR6_RESERVED, 6);
 	set_debugreg(__this_cpu_read(cpu_dr7), 7);
 }
-EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
 
 /*
  * Handle debug exception notifications.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 10721a125226..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/irq.h>
+#include <linux/kvm_types.h>
 
 #include <asm/irq_stack.h>
 #include <asm/apic.h>
@@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
 		synchronize_rcu();
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
 
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b67d7c59dca0..204765004c72 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/cc_platform.h>
 #include <linux/efi.h>
+#include <linux/kvm_types.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 	}
 	finish_swait(&n.wq, &wait);
 }
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
 
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
@@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
 
 	return flags;
 }
-EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
 
 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be93ec7255bf..3d239ed12744 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/atomic.h>
 #include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpu_entry_area.h>
 #include <asm/traps.h>
@@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
 {
 	exc_nmi(regs);
 }
-#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
-#endif
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
 #endif
 
 #ifdef CONFIG_NMI_CHECK_CPU
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 52a5c03c353c..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -30,6 +30,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/ptrace.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
@@ -303,9 +304,7 @@ void current_save_fsgs(void)
 	save_fsgs(current);
 	local_irq_restore(flags);
 }
-#if IS_ENABLED(CONFIG_KVM)
-EXPORT_SYMBOL_GPL(current_save_fsgs);
-#endif
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
 
 static __always_inline void loadseg(enum which_selector which,
 				    unsigned short sel)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
 #include <linux/objtool.h>
 #include <linux/pgtable.h>
 #include <linux/kexec.h>
+#include <linux/kvm_types.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
 
 	rcu_assign_pointer(cpu_emergency_virt_callback, callback);
 }
-EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
 
 void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
 {
@@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
 	rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
 	synchronize_rcu();
 }
-EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
 
 /*
  * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 87e749106dda..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/cpufreq.h>
 #include <linux/delay.h>
 #include <linux/clocksource.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
 #include <linux/static_key.h>
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index c5c60d07308c..824664c0ecbd 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -2,6 +2,7 @@
 #include <asm/paravirt.h>
 #include <linux/smp.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 
 static void __wbinvd(void *dummy)
 {
@@ -12,7 +13,7 @@ void wbinvd_on_cpu(int cpu)
 {
 	smp_call_function_single(cpu, __wbinvd, NULL, 1);
 }
-EXPORT_SYMBOL(wbinvd_on_cpu);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu);
 
 void wbinvd_on_all_cpus(void)
 {
@@ -24,7 +25,7 @@ void wbinvd_on_cpus_mask(struct cpumask *cpus)
 {
 	on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask);
 
 static void __wbnoinvd(void *dummy)
 {
@@ -35,10 +36,10 @@ void wbnoinvd_on_all_cpus(void)
 {
 	on_each_cpu(__wbnoinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus);
 
 void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
 {
 	on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4ef7c6dcbea6..dfdd1da89f36 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <asm/msr.h>
@@ -103,7 +104,7 @@ int msr_set_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, true);
 }
-EXPORT_SYMBOL_GPL(msr_set_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_set_bit);
 
 /**
  * msr_clear_bit - Clear @bit in a MSR @msr.
@@ -119,7 +120,7 @@ int msr_clear_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, false);
 }
-EXPORT_SYMBOL_GPL(msr_clear_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_clear_bit);
 
 #ifdef CONFIG_TRACEPOINTS
 void do_trace_write_msr(u32 msr, u64 val, int failed)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index b68200a0e0c6..8a3d9722f602 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -42,6 +42,7 @@
 #include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rbtree.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/cacheflush.h>
@@ -697,7 +698,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
 	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
 	       cm == _PAGE_CACHE_MODE_WC;
 }
-EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+EXPORT_SYMBOL_FOR_KVM(pat_pfn_immune_to_uc_mtrr);
 
 /**
  * memtype_reserve_io - Request a memory type mapping for a region of memory
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5d221709353e..f5b93e01e347 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <linux/task_work.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_context.h>
+#include <linux/kvm_types.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1582,7 +1583,7 @@ unsigned long __get_current_cr3_fast(void)
 	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
-EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast);
 
 /*
  * Flush one page in the kernel mapping
@@ -1723,7 +1724,7 @@ void __flush_tlb_all(void)
 		flush_tlb_local();
 	}
 }
-EXPORT_SYMBOL_GPL(__flush_tlb_all);
+EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index eac403248462..5ce4ebe99774 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -29,6 +29,7 @@
 #include <linux/acpi.h>
 #include <linux/suspend.h>
 #include <linux/idr.h>
+#include <linux/kvm_types.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -181,7 +182,7 @@ int tdx_cpu_enable(void)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable);
 
 /*
  * Add a memory region as a TDX memory block.  The caller must make sure
@@ -662,7 +663,7 @@ void tdx_quirk_reset_page(struct page *page)
 {
 	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
 }
-EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
 
 static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
 {
@@ -1216,7 +1217,7 @@ int tdx_enable(void)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdx_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_enable);
 
 static bool is_pamt_page(unsigned long phys)
 {
@@ -1477,13 +1478,13 @@ const struct tdx_sys_info *tdx_get_sysinfo(void)
 
 	return p;
 }
-EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo);
 
 u32 tdx_get_nr_guest_keyids(void)
 {
 	return tdx_nr_guest_keyids;
 }
-EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids);
 
 int tdx_guest_keyid_alloc(void)
 {
@@ -1491,13 +1492,13 @@ int tdx_guest_keyid_alloc(void)
 			       tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
 			       GFP_KERNEL);
 }
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc);
 
 void tdx_guest_keyid_free(unsigned int keyid)
 {
 	ida_free(&tdx_guest_keyid_pool, keyid);
 }
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free);
 
 static inline u64 tdx_tdr_pa(struct tdx_td *td)
 {
@@ -1521,7 +1522,7 @@ noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
 
 	return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_enter);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter);
 
 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 {
@@ -1533,7 +1534,7 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 	tdx_clflush_page(tdcs_page);
 	return seamcall(TDH_MNG_ADDCX, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
 
 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1553,7 +1554,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add);
 
 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1572,7 +1573,7 @@ u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add);
 
 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 {
@@ -1584,7 +1585,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 	tdx_clflush_page(tdcx_page);
 	return seamcall(TDH_VP_ADDCX, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
 
 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1603,7 +1604,7 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug);
 
 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1620,7 +1621,7 @@ u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u6
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_range_block);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block);
 
 u64 tdh_mng_key_config(struct tdx_td *td)
 {
@@ -1630,7 +1631,7 @@ u64 tdh_mng_key_config(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_KEY_CONFIG, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_key_config);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config);
 
 u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
 {
@@ -1642,7 +1643,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
 	tdx_clflush_page(td->tdr_page);
 	return seamcall(TDH_MNG_CREATE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_create);
 
 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
 {
@@ -1654,7 +1655,7 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
 	tdx_clflush_page(vp->tdvpr_page);
 	return seamcall(TDH_VP_CREATE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_create);
 
 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
 {
@@ -1671,7 +1672,7 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mng_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd);
 
 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1688,7 +1689,7 @@ u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mr_extend);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend);
 
 u64 tdh_mr_finalize(struct tdx_td *td)
 {
@@ -1698,7 +1699,7 @@ u64 tdh_mr_finalize(struct tdx_td *td)
 
 	return seamcall(TDH_MR_FINALIZE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mr_finalize);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize);
 
 u64 tdh_vp_flush(struct tdx_vp *vp)
 {
@@ -1708,7 +1709,7 @@ u64 tdh_vp_flush(struct tdx_vp *vp)
 
 	return seamcall(TDH_VP_FLUSH, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_flush);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush);
 
 u64 tdh_mng_vpflushdone(struct tdx_td *td)
 {
@@ -1718,7 +1719,7 @@ u64 tdh_mng_vpflushdone(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone);
 
 u64 tdh_mng_key_freeid(struct tdx_td *td)
 {
@@ -1728,7 +1729,7 @@ u64 tdh_mng_key_freeid(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_KEY_FREEID, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid);
 
 u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
 {
@@ -1744,7 +1745,7 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mng_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_init);
 
 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
 {
@@ -1761,7 +1762,7 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_vp_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd);
 
 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
 {
@@ -1774,7 +1775,7 @@ u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
 
 	return seamcall(TDH_VP_WR, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_wr);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr);
 
 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 {
@@ -1787,7 +1788,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 	/* apicid requires version == 1. */
 	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
 
 /*
  * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats.
@@ -1809,7 +1810,7 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim);
 
 u64 tdh_mem_track(struct tdx_td *td)
 {
@@ -1819,7 +1820,7 @@ u64 tdh_mem_track(struct tdx_td *td)
 
 	return seamcall(TDH_MEM_TRACK, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mem_track);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_track);
 
 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1836,7 +1837,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove);
 
 u64 tdh_phymem_cache_wb(bool resume)
 {
@@ -1846,7 +1847,7 @@ u64 tdh_phymem_cache_wb(bool resume)
 
 	return seamcall(TDH_PHYMEM_CACHE_WB, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
 
 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 {
@@ -1856,7 +1857,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
 
 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 {
@@ -1866,7 +1867,7 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid);
 
 #ifdef CONFIG_KEXEC_CORE
 void tdx_cpu_flush_cache_for_kexec(void)
@@ -1884,5 +1885,5 @@ void tdx_cpu_flush_cache_for_kexec(void)
 	wbinvd();
 	this_cpu_write(cache_state_incoherent, false);
 }
-EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec);
 #endif
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 490464c205b4..a568d8e6f4e8 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -11,8 +11,22 @@
 #ifdef KVM_SUB_MODULES
 #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \
 	EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES))
+#define EXPORT_SYMBOL_FOR_KVM(symbol) \
+	EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES))
 #else
 #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol)
+/*
+ * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only
+ * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but
+ * kvm.ko won't actually be built (due to lack of at least one submodule).
+ */
+#ifndef EXPORT_SYMBOL_FOR_KVM
+#if IS_MODULE(CONFIG_KVM)
+#define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm")
+#else
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
+#endif /* IS_MODULE(CONFIG_KVM) */
+#endif /* EXPORT_SYMBOL_FOR_KVM */
 #endif
 
 #ifndef __ASSEMBLER__
-- 
cgit v1.2.3


From f6a8919d61484ae9ca6b1855035fcfb2ba6e2af9 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Nov 2025 15:47:48 -0800
Subject: vmlinux.lds: Fix TEXT_MAIN to include .text.start and friends

Since:

  6568f14cb5ae ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")

the TEXT_MAIN macro uses a series of patterns to prevent the
.text.startup[.*] and .text.exit[.*] sections from getting
linked into the vmlinux runtime .text.

That commit is a tad too aggressive: it also inadvertently filters out
valid runtime text sections like .text.start and
.text.start.constprop.0, which can be generated for a function named
start() when -ffunction-sections is enabled.

As a result, those sections become orphans when building with
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION for arm:

  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/usb/host/sl811-hcd.o' being placed in section `.text.start.constprop.0'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/dvb-frontends/drxk_hard.o' being placed in section `.text.start.constprop.0'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start' from `drivers/media/dvb-frontends/stv0910.o' being placed in section `.text.start'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/pci/ddbridge/ddbridge-sx8.o' being placed in section `.text.start.constprop.0'

Fix that by explicitly adding the partial "substring" sections (.text.s,
.text.st, .text.sta, etc) and their cloned derivatives.

While this unfortunately means that TEXT_MAIN continues to grow,
these changes are ultimately necessary for proper support of
-ffunction-sections.

Fixes: 6568f14cb5ae ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/cd588144e63df901a656b06b566855019c4a931d.1762991150.git.jpoimboe@kernel.org
Closes: https://lore.kernel.org/oe-kbuild-all/202511040812.DFGedJiy-lkp@intel.com/
---
 include/asm-generic/vmlinux.lds.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index cc060adfdc75..8f92d665cb0f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -90,8 +90,9 @@
  * Support -ffunction-sections by matching .text and .text.*,
  * but exclude '.text..*', .text.startup[.*], and .text.exit[.*].
  *
- * .text.startup and .text.startup.* are matched later by INIT_TEXT.
- * .text.exit and .text.exit.* are matched later by EXIT_TEXT.
+ * .text.startup and .text.startup.* are matched later by INIT_TEXT, and
+ * .text.exit and .text.exit.* are matched later by EXIT_TEXT, so they must be
+ * explicitly excluded here.
  *
  * Other .text.* sections that are typically grouped separately, such as
  * .text.unlikely or .text.hot, must be matched explicitly before using
@@ -100,16 +101,16 @@
 #define TEXT_MAIN							\
 	.text								\
 	.text.[_0-9A-Za-df-rt-z]*					\
-	.text.s[_0-9A-Za-su-z]*						\
-	.text.st[_0-9A-Zb-z]*						\
-	.text.sta[_0-9A-Za-qs-z]*					\
-	.text.star[_0-9A-Za-su-z]*					\
-	.text.start[_0-9A-Za-tv-z]*					\
-	.text.startu[_0-9A-Za-oq-z]*					\
+	.text.s[_0-9A-Za-su-z]*		.text.s		.text.s.*	\
+	.text.st[_0-9A-Zb-z]*		.text.st	.text.st.*	\
+	.text.sta[_0-9A-Za-qs-z]*	.text.sta	.text.sta.*	\
+	.text.star[_0-9A-Za-su-z]*	.text.star	.text.star.*	\
+	.text.start[_0-9A-Za-tv-z]*	.text.start	.text.start.*	\
+	.text.startu[_0-9A-Za-oq-z]*	.text.startu	.text.startu.*	\
 	.text.startup[_0-9A-Za-z]*					\
-	.text.e[_0-9A-Za-wy-z]*						\
-	.text.ex[_0-9A-Za-hj-z]*					\
-	.text.exi[_0-9A-Za-su-z]*					\
+	.text.e[_0-9A-Za-wy-z]*		.text.e		.text.e.*	\
+	.text.ex[_0-9A-Za-hj-z]*	.text.ex	.text.ex.*	\
+	.text.exi[_0-9A-Za-su-z]*	.text.exi	.text.exi.*	\
 	.text.exit[_0-9A-Za-z]*
 
 /*
-- 
cgit v1.2.3


From 9c7dc1dd897a1cdcade9566ea4664b03fbabf4a4 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 12 Nov 2025 15:47:51 -0800
Subject: objtool: Warn on functions with ambiguous -ffunction-sections section
 names

When compiled with -ffunction-sections, a function named startup() will
be placed in .text.startup.  However, .text.startup is also used by the
compiler for functions with __attribute__((constructor)).

That creates an ambiguity for the vmlinux linker script, which needs to
differentiate those two cases.

Similar naming conflicts exist for functions named exit(), split(),
unlikely(), hot() and unknown().

One potential solution would be to use '#ifdef CC_USING_FUNCTION_SECTIONS'
to create two distinct implementations of the TEXT_MAIN macro.  However,
-ffunction-sections can be (and is) enabled or disabled on a per-object
basis (for example via ccflags-y or AUTOFDO_PROFILE).

So the recently unified TEXT_MAIN macro (commit 1ba9f8979426
("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros")) is
necessary.  This means there's no way for the linker script to
disambiguate things.

Instead, use objtool to warn on any function names whose resulting
section names might create ambiguity when the kernel is compiled (in
whole or in part) with -ffunction-sections.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/65fedea974fe14be487c8867a0b8d0e4a294ce1e.1762991150.git.jpoimboe@kernel.org
---
 include/asm-generic/vmlinux.lds.h       | 15 +++++++++++++++
 tools/objtool/Documentation/objtool.txt |  7 +++++++
 tools/objtool/check.c                   | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8f92d665cb0f..5efe1de2209b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -97,6 +97,21 @@
  * Other .text.* sections that are typically grouped separately, such as
  * .text.unlikely or .text.hot, must be matched explicitly before using
  * TEXT_MAIN.
+ *
+ * NOTE: builds *with* and *without* -ffunction-sections are both supported by
+ * this single macro.  Even with -ffunction-sections, there may be some objects
+ * NOT compiled with the flag due to the use of a specific Makefile override
+ * like ccflags-y or AUTOFDO_PROFILE_foo.o.  So this single catchall rule is
+ * needed to support mixed object builds.
+ *
+ * One implication is that functions named startup(), exit(), split(),
+ * unlikely(), hot(), and unknown() are not allowed in the kernel due to the
+ * ambiguity of their section names with -ffunction-sections.  For example,
+ * .text.startup could be __attribute__((constructor)) code in a *non*
+ * ffunction-sections object, which should be placed in .init.text; or it could
+ * be an actual function named startup() in an ffunction-sections object, which
+ * should be placed in .text.  Objtool will detect and complain about any such
+ * ambiguously named functions.
  */
 #define TEXT_MAIN							\
 	.text								\
diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt
index 9e97fc25b2d8..f88f8d28513a 100644
--- a/tools/objtool/Documentation/objtool.txt
+++ b/tools/objtool/Documentation/objtool.txt
@@ -456,6 +456,13 @@ the objtool maintainers.
     these special names and does not use module_init() / module_exit()
     macros to create them.
 
+13. file.o: warning: func() function name creates ambiguity with -ffunction-sections
+
+    Functions named startup(), exit(), split(), unlikely(), hot(), and
+    unknown() are not allowed due to the ambiguity of their section
+    names when compiled with -ffunction-sections.  For more information,
+    see the comment above TEXT_MAIN in include/asm-generic/vmlinux.lds.h.
+
 
 If the error doesn't seem to make sense, it could be a bug in objtool.
 Feel free to ask objtool maintainers for help.
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 57fac6ce3454..72c7f6f03350 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2663,6 +2663,37 @@ static int decode_sections(struct objtool_file *file)
 	return 0;
 }
 
+/*
+ * Certain function names are disallowed due to section name ambiguities
+ * introduced by -ffunction-sections.
+ *
+ * See the comment above TEXT_MAIN in include/asm-generic/vmlinux.lds.h.
+ */
+static int validate_function_names(struct objtool_file *file)
+{
+	struct symbol *func;
+	int warnings = 0;
+
+	for_each_sym(file->elf, func) {
+		if (!is_func_sym(func))
+			continue;
+
+		if (!strcmp(func->name, "startup")	|| strstarts(func->name, "startup.")	||
+		    !strcmp(func->name, "exit")		|| strstarts(func->name, "exit.")	||
+		    !strcmp(func->name, "split")	|| strstarts(func->name, "split.")	||
+		    !strcmp(func->name, "unlikely")	|| strstarts(func->name, "unlikely.")	||
+		    !strcmp(func->name, "hot")		|| strstarts(func->name, "hot.")	||
+		    !strcmp(func->name, "unknown")	|| strstarts(func->name, "unknown.")) {
+
+			WARN("%s() function name creates ambiguity with -ffunction-sections",
+			     func->name);
+			warnings++;
+		}
+	}
+
+	return warnings;
+}
+
 static bool is_special_call(struct instruction *insn)
 {
 	if (insn->type == INSN_CALL) {
@@ -4932,6 +4963,8 @@ int check(struct objtool_file *file)
 	if (!nr_insns)
 		goto out;
 
+	warnings += validate_function_names(file);
+
 	if (opts.retpoline)
 		warnings += validate_retpoline(file);
 
-- 
cgit v1.2.3


From fd3f646e1c9d783d1f4ef30e5376ccf315a8ae30 Mon Sep 17 00:00:00 2001
From: Isaac Scott <isaac.scott@ideasonboard.com>
Date: Wed, 29 Oct 2025 18:03:18 +0000
Subject: media: v4l: Add helper to get number of active lanes via a pad

Sometimes, users will not use all of the MIPI CSI 2 lanes available when
connecting to the MIPI CSI receiver of their device. Add a helper
function that checks the mbus_config for the device driver to allow
users to define the number of active data lanes through the
get_mbus_config op.

If the driver does not implement this op, fall back to using the maximum
number of lanes available.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Isaac Scott <isaac.scott@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-common.c | 29 +++++++++++++++++++++++++++++
 include/media/v4l2-common.h           | 20 ++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index b367d479d6b3..554c591e1113 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -573,6 +573,35 @@ s64 v4l2_get_link_freq(const struct media_pad *pad, unsigned int mul,
 	return v4l2_get_link_freq_ctrl(sd->ctrl_handler, mul, div);
 }
 EXPORT_SYMBOL_GPL(v4l2_get_link_freq);
+
+int v4l2_get_active_data_lanes(const struct media_pad *pad,
+			       unsigned int max_data_lanes)
+{
+	struct v4l2_mbus_config mbus_config = {};
+	struct v4l2_subdev *sd;
+	unsigned int lanes;
+	int ret;
+
+	sd = media_entity_to_v4l2_subdev(pad->entity);
+	ret = v4l2_subdev_call(sd, pad, get_mbus_config, pad->index,
+			       &mbus_config);
+	if (ret < 0 && ret != -ENOIOCTLCMD)
+		return ret;
+
+	/* This relies on the mbus_config being zeroed at init time */
+	lanes = mbus_config.bus.mipi_csi2.num_data_lanes;
+	if (!lanes)
+		return max_data_lanes;
+
+	if (lanes > max_data_lanes) {
+		dev_dbg(sd->dev, "Active data lanes (%u) exceeds max (%u)\n",
+			lanes, max_data_lanes);
+		return -EINVAL;
+	}
+
+	return lanes;
+}
+EXPORT_SYMBOL_GPL(v4l2_get_active_data_lanes);
 #endif
 
 /*
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index 5c0a7f6b5bb6..f8b1faced79c 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -581,6 +581,26 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat,
 #ifdef CONFIG_MEDIA_CONTROLLER
 s64 v4l2_get_link_freq(const struct media_pad *pad, unsigned int mul,
 		       unsigned int div);
+
+/**
+ * v4l2_get_active_data_lanes - Get number of active data lanes from driver
+ *
+ * @pad: The transmitter's media pad.
+ * @max_data_lanes: The maximum number of active data lanes supported by
+ *		    the MIPI CSI link in hardware.
+ *
+ * This function is intended for obtaining the number of data lanes that are
+ * actively being used by the driver for a MIPI CSI-2 device on a given media pad.
+ * This information is derived from a mbus_config fetched from a device driver
+ * using the get_mbus_config v4l2_subdev pad op.
+ *
+ * Return:
+ * * >0: Number of active data lanes
+ * * %-EINVAL: Number of active data lanes exceeds @max_data_lanes.
+ * * <0: Any other error code returned by the get_mbus_config pad op.
+ */
+int v4l2_get_active_data_lanes(const struct media_pad *pad,
+			       unsigned int max_data_lanes);
 #endif
 
 void v4l2_simplify_fraction(u32 *numerator, u32 *denominator,
-- 
cgit v1.2.3


From 2bcd3800f2da1be13b972858f63c66d035b1ec6d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 13 Nov 2025 00:09:15 +0000
Subject: slab: Reimplement page_slab()

In order to separate slabs from folios, we need to convert from any page
in a slab to the slab directly without going through a page to folio
conversion first.

Up to this point, page_slab() has followed the example of other memdesc
converters (page_folio(), page_ptdesc() etc) and just cast the pointer
to the requested type, regardless of whether the pointer is actually a
pointer to the correct type or not.

That changes with this commit; we check that the page actually belongs
to a slab and return NULL if it does not.  Other memdesc converters will
adopt this convention in future.

kfence was the only user of page_slab(), so adjust it to the new way
of working.  It will need to be touched again when we separate slab
from page.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Marco Elver <elver@google.com>
Cc: kasan-dev@googlegroups.com
Link: https://patch.msgid.link/20251113000932.1589073-2-willy@infradead.org
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Tested-by: Marco Elver <elver@google.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/page-flags.h | 14 +-------------
 mm/kfence/core.c           | 14 ++++++++------
 mm/slab.h                  | 28 ++++++++++++++++------------
 3 files changed, 25 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0091ad1986bf..6d5e44968eab 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable)
  */
 PAGE_TYPE_OPS(Guard, guard, guard)
 
-FOLIO_TYPE_OPS(slab, slab)
-
-/**
- * PageSlab - Determine if the page belongs to the slab allocator
- * @page: The page to test.
- *
- * Context: Any context.
- * Return: True for slab pages, false for any other kind of page.
- */
-static inline bool PageSlab(const struct page *page)
-{
-	return folio_test_slab(page_folio(page));
-}
+PAGE_TYPE_OPS(Slab, slab, slab)
 
 #ifdef CONFIG_HUGETLB_PAGE
 FOLIO_TYPE_OPS(hugetlb, hugetlb)
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 727c20c94ac5..e62b5516bf48 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void)
 	 * enters __slab_free() slow-path.
 	 */
 	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
-		struct slab *slab;
+		struct page *page;
 
 		if (!i || (i % 2))
 			continue;
 
-		slab = page_slab(pfn_to_page(start_pfn + i));
-		__folio_set_slab(slab_folio(slab));
+		page = pfn_to_page(start_pfn + i);
+		__SetPageSlab(page);
 #ifdef CONFIG_MEMCG
+		struct slab *slab = page_slab(page);
 		slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
 				 MEMCG_DATA_OBJEXTS;
 #endif
@@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void)
 
 reset_slab:
 	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
-		struct slab *slab;
+		struct page *page;
 
 		if (!i || (i % 2))
 			continue;
 
-		slab = page_slab(pfn_to_page(start_pfn + i));
+		page = pfn_to_page(start_pfn + i);
 #ifdef CONFIG_MEMCG
+		struct slab *slab = page_slab(page);
 		slab->obj_exts = 0;
 #endif
-		__folio_clear_slab(slab_folio(slab));
+		__ClearPageSlab(page);
 	}
 
 	return addr;
diff --git a/mm/slab.h b/mm/slab.h
index 078daecc7cf5..a64b9b2c8731 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -146,20 +146,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
 	struct slab *:		(struct folio *)s))
 
 /**
- * page_slab - Converts from first struct page to slab.
- * @p: The first (either head of compound or single) page of slab.
+ * page_slab - Converts from struct page to its slab.
+ * @page: A page which may or may not belong to a slab.
  *
- * A temporary wrapper to convert struct page to struct slab in situations where
- * we know the page is the compound head, or single order-0 page.
- *
- * Long-term ideally everything would work with struct slab directly or go
- * through folio to struct slab.
- *
- * Return: The slab which contains this page
+ * Return: The slab which contains this page or NULL if the page does
+ * not belong to a slab.  This includes pages returned from large kmalloc.
  */
-#define page_slab(p)		(_Generic((p),				\
-	const struct page *:	(const struct slab *)(p),		\
-	struct page *:		(struct slab *)(p)))
+static inline struct slab *page_slab(const struct page *page)
+{
+	unsigned long head;
+
+	head = READ_ONCE(page->compound_head);
+	if (head & 1)
+		page = (struct page *)(head - 1);
+	if (data_race(page->page_type >> 24) != PGTY_slab)
+		page = NULL;
+
+	return (struct slab *)page;
+}
 
 /**
  * slab_page - The first struct page allocated for a slab
-- 
cgit v1.2.3


From ee1ee8abc4197e21594ca29348629ccbfff4daec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 13 Nov 2025 00:09:16 +0000
Subject: slab: Remove folio references from __ksize()

In the future, we will separate slab, folio and page from each other
and calling virt_to_folio() on an address allocated from slab will
return NULL.  Delay the conversion from struct page to struct slab
until we know we're not dealing with a large kmalloc allocation.
There's a minor win for large kmalloc allocations as we avoid the
compound_head() hidden in virt_to_folio().

This deprecates calling ksize() on memory allocated by alloc_pages().
Today it becomes a warning and support will be removed entirely in
the future.

Introduce large_kmalloc_size() to abstract how we represent the size
of a large kmalloc allocation.  For now, this is the same as
page_size(), but it will change with separately allocated memdescs.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251113000932.1589073-3-willy@infradead.org
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/page-flags.h |  2 +-
 mm/slab.h                  | 10 ++++++++++
 mm/slab_common.c           | 23 ++++++++++++-----------
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6d5e44968eab..f7a0e4af0c73 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1064,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
  * Serialized with zone lock.
  */
 PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
-FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc)
+PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc)
 
 /**
  * PageHuge - Determine if the page belongs to hugetlbfs
diff --git a/mm/slab.h b/mm/slab.h
index a64b9b2c8731..31ccf0f6d3a1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -605,6 +605,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 	return s->size;
 }
 
+static inline unsigned int large_kmalloc_order(const struct page *page)
+{
+	return page[1].flags.f & 0xff;
+}
+
+static inline size_t large_kmalloc_size(const struct page *page)
+{
+	return PAGE_SIZE << large_kmalloc_order(page);
+}
+
 #ifdef CONFIG_SLUB_DEBUG
 void dump_unreclaimable_slab(void);
 #else
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 932d13ada36c..67ad2328276e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void)
  */
 size_t __ksize(const void *object)
 {
-	struct folio *folio;
+	const struct page *page;
+	const struct slab *slab;
 
 	if (unlikely(object == ZERO_SIZE_PTR))
 		return 0;
 
-	folio = virt_to_folio(object);
+	page = virt_to_page(object);
 
-	if (unlikely(!folio_test_slab(folio))) {
-		if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
-			return 0;
-		if (WARN_ON(object != folio_address(folio)))
-			return 0;
-		return folio_size(folio);
-	}
+	if (unlikely(PageLargeKmalloc(page)))
+		return large_kmalloc_size(page);
+
+	slab = page_slab(page);
+	/* Delete this after we're sure there are no users */
+	if (WARN_ON(!slab))
+		return page_size(page);
 
 #ifdef CONFIG_SLUB_DEBUG
-	skip_orig_size_check(folio_slab(folio)->slab_cache, object);
+	skip_orig_size_check(slab->slab_cache, object);
 #endif
 
-	return slab_ksize(folio_slab(folio)->slab_cache);
+	return slab_ksize(slab->slab_cache);
 }
 
 gfp_t kmalloc_fix_flags(gfp_t flags)
-- 
cgit v1.2.3


From 4f49088c162579a4ed049c555fe0cd188fd928c4 Mon Sep 17 00:00:00 2001
From: Khairul Anuar Romli <khairul.anuar.romli@altera.com>
Date: Wed, 8 Oct 2025 17:09:05 +0800
Subject: firmware: stratix10-svc: Add definition for voltage and temperature
 sensor

Add entry in Stratix 10 Service Layer to support temperature and voltage
sensor.

Signed-off-by: Khairul Anuar Romli <khairul.anuar.romli@altera.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
---
 drivers/firmware/stratix10-svc.c                   | 21 +++++++++++--
 include/linux/firmware/intel/stratix10-smc.h       | 34 ++++++++++++++++++++++
 .../linux/firmware/intel/stratix10-svc-client.h    |  8 ++++-
 3 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index e3f990d888d7..5a32c1054bee 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -34,7 +34,7 @@
  * timeout is set to 30 seconds (30 * 1000) at Intel Stratix10 SoC.
  */
 #define SVC_NUM_DATA_IN_FIFO			32
-#define SVC_NUM_CHANNEL				3
+#define SVC_NUM_CHANNEL				4
 #define FPGA_CONFIG_DATA_CLAIM_TIMEOUT_MS	200
 #define FPGA_CONFIG_STATUS_TIMEOUT_SEC		30
 #define BYTE_TO_WORD_SIZE              4
@@ -341,6 +341,8 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data,
 	case COMMAND_RSU_MAX_RETRY:
 	case COMMAND_RSU_DCMF_STATUS:
 	case COMMAND_FIRMWARE_VERSION:
+	case COMMAND_HWMON_READTEMP:
+	case COMMAND_HWMON_READVOLT:
 		cb_data->status = BIT(SVC_STATUS_OK);
 		cb_data->kaddr1 = &res.a1;
 		break;
@@ -525,7 +527,17 @@ static int svc_normal_to_secure_thread(void *data)
 			a1 = (unsigned long)pdata->paddr;
 			a2 = 0;
 			break;
-
+		/* for HWMON */
+		case COMMAND_HWMON_READTEMP:
+			a0 = INTEL_SIP_SMC_HWMON_READTEMP;
+			a1 = pdata->arg[0];
+			a2 = 0;
+			break;
+		case COMMAND_HWMON_READVOLT:
+			a0 = INTEL_SIP_SMC_HWMON_READVOLT;
+			a1 = pdata->arg[0];
+			a2 = 0;
+			break;
 		/* for polling */
 		case COMMAND_POLL_SERVICE_STATUS:
 			a0 = INTEL_SIP_SMC_SERVICE_COMPLETED;
@@ -1197,6 +1209,11 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
 	chans[2].name = SVC_CLIENT_FCS;
 	spin_lock_init(&chans[2].lock);
 
+	chans[3].scl = NULL;
+	chans[3].ctrl = controller;
+	chans[3].name = SVC_CLIENT_HWMON;
+	spin_lock_init(&chans[3].lock);
+
 	list_add_tail(&controller->node, &svc_ctrl);
 	platform_set_drvdata(pdev, controller);
 
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index ee80ca4bb0d0..7306dd243b2a 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -620,4 +620,38 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_FCS_GET_PROVISION_DATA \
 	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA)
 
+/**
+ * Request INTEL_SIP_SMC_HWMON_READTEMP
+ * Sync call to request temperature
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_HWMON_READTEMP, a1 Temperature Channel
+ * a2-a7 not used
+ *
+ * Return status
+ * a0 INTEL_SIP_SMC_STATUS_OK
+ * a1 Temperature Value
+ * a2-a3 not used
+ */
+#define INTEL_SIP_SMC_FUNCID_HWMON_READTEMP 32
+#define INTEL_SIP_SMC_HWMON_READTEMP \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READTEMP)
+
+/**
+ * Request INTEL_SIP_SMC_HWMON_READVOLT
+ * Sync call to request voltage
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_HWMON_READVOLT, a1 Voltage Channel
+ * a2-a7 not used
+ *
+ * Return status
+ * a0 INTEL_SIP_SMC_STATUS_OK
+ * a1 Voltage Value
+ * a2-a3 not used
+ */
+#define INTEL_SIP_SMC_FUNCID_HWMON_READVOLT 33
+#define INTEL_SIP_SMC_HWMON_READVOLT \
+	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT)
+
 #endif
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 60ed82112680..520004a5f15d 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -11,12 +11,14 @@
  *
  * fpga: for FPGA configuration
  * rsu: for remote status update
+ * hwmon: for hardware monitoring (voltage and temperature)
  */
 #define SVC_CLIENT_FPGA			"fpga"
 #define SVC_CLIENT_RSU			"rsu"
 #define SVC_CLIENT_FCS			"fcs"
+#define SVC_CLIENT_HWMON		"hwmon"
 
-/*
+/**
  * Status of the sent command, in bit number
  *
  * SVC_STATUS_OK:
@@ -70,6 +72,7 @@
 #define SVC_RSU_REQUEST_TIMEOUT_MS              300
 #define SVC_FCS_REQUEST_TIMEOUT_MS		2000
 #define SVC_COMPLETED_TIMEOUT_MS		30000
+#define SVC_HWMON_REQUEST_TIMEOUT_MS		300
 
 struct stratix10_svc_chan;
 
@@ -171,6 +174,9 @@ enum stratix10_svc_command_code {
 	COMMAND_MBOX_SEND_CMD = 100,
 	/* Non-mailbox SMC Call */
 	COMMAND_SMC_SVC_VERSION = 200,
+	/* for HWMON */
+	COMMAND_HWMON_READTEMP,
+	COMMAND_HWMON_READVOLT
 };
 
 /**
-- 
cgit v1.2.3


From bcb9f4f0706147afc62c48533276a18fe7b8f354 Mon Sep 17 00:00:00 2001
From: Mahesh Rao <mahesh.rao@altera.com>
Date: Mon, 27 Oct 2025 22:54:41 +0800
Subject: firmware: stratix10-svc: Add support for async communication

Introduce support for asynchronous communication with the Stratix10
service channel. Define new structures to enable asynchronous messaging
with the Secure Device Manager (SDM). Add and remove asynchronous
support for existing channels. Implement initialization and cleanup
routines for the asynchronous framework. Enable sending and polling of
messages to the SDM asynchronously.

The new public functions added are:
- stratix10_svc_add_async_client: Adds a client to the service channel.
- stratix10_svc_remove_async_client: Removes an asynchronous client from
        the service channel.
- stratix10_svc_async_send: Sends an asynchronous message to the SDM
        mailbox in EL3 secure firmware.
- stratix10_svc_async_poll: Polls the status of an asynchronous service
        request in EL3 secure firmware.
- stratix10_svc_async_done: Marks an asynchronous transaction as
        complete and frees up the resources.

These changes enhance the functionality of the Stratix10 service channel
by allowing for more efficient and flexible communication with the
firmware.

Signed-off-by: Mahesh Rao <mahesh.rao@altera.com>
Reviewed-by: Matthew Gerlach <matthew.gerlach@altera.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
---
 drivers/firmware/stratix10-svc.c                   | 656 ++++++++++++++++++++-
 include/linux/firmware/intel/stratix10-smc.h       |  25 +
 .../linux/firmware/intel/stratix10-svc-client.h    |  88 +++
 3 files changed, 765 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 9372a17d89b7..14bfa36a58ed 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -4,9 +4,12 @@
  * Copyright (C) 2025, Altera Corporation
  */
 
+#include <linux/atomic.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/genalloc.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
 #include <linux/io.h>
 #include <linux/kfifo.h>
 #include <linux/kthread.h>
@@ -44,6 +47,49 @@
 #define STRATIX10_RSU				"stratix10-rsu"
 #define INTEL_FCS				"intel-fcs"
 
+/* Maximum number of SDM client IDs. */
+#define MAX_SDM_CLIENT_IDS			16
+/* Client ID for SIP Service Version 1. */
+#define SIP_SVC_V1_CLIENT_ID			0x1
+/* Maximum number of SDM job IDs. */
+#define MAX_SDM_JOB_IDS				16
+/* Number of bits used for asynchronous transaction hashing. */
+#define ASYNC_TRX_HASH_BITS			3
+/**
+ * Total number of transaction IDs, which is a combination of
+ * client ID and job ID.
+ */
+#define TOTAL_TRANSACTION_IDS \
+	(MAX_SDM_CLIENT_IDS * MAX_SDM_JOB_IDS)
+
+/* Minimum major version of the ATF for Asynchronous transactions. */
+#define ASYNC_ATF_MINIMUM_MAJOR_VERSION		0x3
+/* Minimum minor version of the ATF for Asynchronous transactions.*/
+#define ASYNC_ATF_MINIMUM_MINOR_VERSION		0x0
+
+/* Job ID field in the transaction ID */
+#define STRATIX10_JOB_FIELD			GENMASK(3, 0)
+/* Client ID field in the transaction ID */
+#define STRATIX10_CLIENT_FIELD			GENMASK(7, 4)
+/* Transaction ID mask for Stratix10 service layer */
+#define STRATIX10_TRANS_ID_FIELD		GENMASK(7, 0)
+
+/* Macro to extract the job ID from a transaction ID. */
+#define STRATIX10_GET_JOBID(transaction_id) \
+	(FIELD_GET(STRATIX10_JOB_FIELD, transaction_id))
+/* Macro to set the job ID in a transaction ID. */
+#define STRATIX10_SET_JOBID(jobid) \
+	(FIELD_PREP(STRATIX10_JOB_FIELD, jobid))
+/* Macro to set the client ID in a transaction ID. */
+#define STRATIX10_SET_CLIENTID(clientid) \
+	(FIELD_PREP(STRATIX10_CLIENT_FIELD, clientid))
+/* Macro to set a transaction ID using a client ID and a job ID. */
+#define STRATIX10_SET_TRANSACTIONID(clientid, jobid) \
+	(STRATIX10_SET_CLIENTID(clientid) | STRATIX10_SET_JOBID(jobid))
+/* Macro to set a transaction ID for SIP SMC Async transactions */
+#define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \
+	(FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id))
+
 typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long,
 			     unsigned long, unsigned long, unsigned long,
 			     unsigned long, unsigned long,
@@ -64,7 +110,7 @@ struct stratix10_svc {
  * @sync_complete: state for a completion
  * @addr: physical address of shared memory block
  * @size: size of shared memory block
- * @invoke_fn: function to issue secure monitor or hypervisor call
+ * @invoke_fn: function used to issue the secure monitor or hypervisor call
  *
  * This struct is used to save physical address and size of shared memory
  * block. The shared memory blocked is allocated by secure monitor software
@@ -122,6 +168,74 @@ struct stratix10_svc_data {
 	u64 arg[3];
 };
 
+/**
+ * struct stratix10_svc_async_handler - Asynchronous handler for Stratix10
+ *                                      service layer
+ * @transaction_id: Unique identifier for the transaction
+ * @achan: Pointer to the asynchronous channel structure
+ * @cb_arg: Argument to be passed to the callback function
+ * @cb: Callback function to be called upon completion
+ * @msg: Pointer to the client message structure
+ * @next: Node in the hash list
+ * @res: Response structure to store result from the secure firmware
+ *
+ * This structure is used to handle asynchronous transactions in the
+ * Stratix10 service layer. It maintains the necessary information
+ * for processing and completing asynchronous requests.
+ */
+
+struct stratix10_svc_async_handler {
+	u8 transaction_id;
+	struct stratix10_async_chan *achan;
+	void *cb_arg;
+	async_callback_t cb;
+	struct stratix10_svc_client_msg *msg;
+	struct hlist_node next;
+	struct arm_smccc_1_2_regs res;
+};
+
+/**
+ * struct stratix10_async_chan - Structure representing an asynchronous channel
+ * @async_client_id: Unique client identifier for the asynchronous operation
+ * @job_id_pool: IDA pool of job IDs associated with this channel
+ */
+
+struct stratix10_async_chan {
+	unsigned long async_client_id;
+	struct ida job_id_pool;
+};
+
+/**
+ * struct stratix10_async_ctrl - Control structure for Stratix10
+ *                               asynchronous operations
+ * @initialized: Flag indicating whether the control structure has
+ *               been initialized
+ * @invoke_fn: Function pointer for invoking Stratix10 service calls
+ *             to EL3 secure firmware
+ * @async_id_pool: IDA pool from which asynchronous client IDs are
+ *                 allocated
+ * @common_achan_refcount: Atomic reference count for the common
+ *                         asynchronous channel usage
+ * @common_async_chan: Pointer to the common asynchronous channel
+ *                     structure
+ * @trx_list_lock: Spinlock for protecting the transaction list
+ *                     operations
+ * @trx_list: Hash table for managing asynchronous transactions
+ */
+
+struct stratix10_async_ctrl {
+	bool initialized;
+	void (*invoke_fn)(struct stratix10_async_ctrl *actrl,
+			  const struct arm_smccc_1_2_regs *args,
+			  struct arm_smccc_1_2_regs *res);
+	struct ida async_id_pool;
+	atomic_t common_achan_refcount;
+	struct stratix10_async_chan *common_async_chan;
+	/* spinlock to protect trx_list hash table */
+	spinlock_t trx_list_lock;
+	DECLARE_HASHTABLE(trx_list, ASYNC_TRX_HASH_BITS);
+};
+
 /**
  * struct stratix10_svc_controller - service controller
  * @dev: device
@@ -135,6 +249,7 @@ struct stratix10_svc_data {
  * @complete_status: state for completion
  * @svc_fifo_lock: protect access to service message data queue
  * @invoke_fn: function to issue secure monitor call or hypervisor call
+ * @actrl: async control structure
  *
  * This struct is used to create communication channels for service clients, to
  * handle secure monitor or hypervisor call.
@@ -151,6 +266,7 @@ struct stratix10_svc_controller {
 	struct completion complete_status;
 	spinlock_t svc_fifo_lock;
 	svc_invoke_fn *invoke_fn;
+	struct stratix10_async_ctrl actrl;
 };
 
 /**
@@ -159,15 +275,17 @@ struct stratix10_svc_controller {
  * @scl: pointer to service client which owns the channel
  * @name: service client name associated with the channel
  * @lock: protect access to the channel
+ * @async_chan: reference to asynchronous channel object for this channel
  *
- * This struct is used by service client to communicate with service layer, each
- * service client has its own channel created by service controller.
+ * This struct is used by service client to communicate with service layer.
+ * Each service client has its own channel created by service controller.
  */
 struct stratix10_svc_chan {
 	struct stratix10_svc_controller *ctrl;
 	struct stratix10_svc_client *scl;
 	char *name;
 	spinlock_t lock;
+	struct stratix10_async_chan *async_chan;
 };
 
 static LIST_HEAD(svc_ctrl);
@@ -942,6 +1060,525 @@ struct stratix10_svc_chan *stratix10_svc_request_channel_byname(
 }
 EXPORT_SYMBOL_GPL(stratix10_svc_request_channel_byname);
 
+/**
+ * stratix10_svc_add_async_client - Add an asynchronous client to the
+ * Stratix10 service channel.
+ * @chan: Pointer to the Stratix10 service channel structure.
+ * @use_unique_clientid: Boolean flag indicating whether to use a
+ * unique client ID.
+ *
+ * This function adds an asynchronous client to the specified
+ * Stratix10 service channel. If the `use_unique_clientid` flag is
+ * set to true, a unique client ID is allocated for the asynchronous
+ * channel. Otherwise, a common asynchronous channel is used.
+ *
+ * Return: 0 on success, or a negative error code on failure:
+ *         -EINVAL if the channel is NULL or the async controller is
+ *         not initialized.
+ *         -EALREADY if the async channel is already allocated.
+ *         -ENOMEM if memory allocation fails.
+ *         Other negative values if ID allocation fails.
+ */
+int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan,
+				   bool use_unique_clientid)
+{
+	struct stratix10_svc_controller *ctrl;
+	struct stratix10_async_ctrl *actrl;
+	struct stratix10_async_chan *achan;
+	int ret = 0;
+
+	if (!chan)
+		return -EINVAL;
+
+	ctrl = chan->ctrl;
+	actrl = &ctrl->actrl;
+
+	if (!actrl->initialized) {
+		dev_err(ctrl->dev, "Async controller not initialized\n");
+		return -EINVAL;
+	}
+
+	if (chan->async_chan) {
+		dev_err(ctrl->dev, "async channel already allocated\n");
+		return -EALREADY;
+	}
+
+	if (!use_unique_clientid &&
+	    atomic_read(&actrl->common_achan_refcount) > 0) {
+		chan->async_chan = actrl->common_async_chan;
+		atomic_inc(&actrl->common_achan_refcount);
+		return 0;
+	}
+
+	achan = kzalloc(sizeof(*achan), GFP_KERNEL);
+	if (!achan)
+		return -ENOMEM;
+
+	ida_init(&achan->job_id_pool);
+
+	/* ida_alloc_max() is inclusive; IDs must fit STRATIX10_CLIENT_FIELD */
+	ret = ida_alloc_max(&actrl->async_id_pool, MAX_SDM_CLIENT_IDS - 1,
+			    GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(ctrl->dev, "Failed to allocate async client id\n");
+		ida_destroy(&achan->job_id_pool);
+		kfree(achan);
+		return ret;
+	}
+
+	achan->async_client_id = ret;
+	chan->async_chan = achan;
+
+	if (!use_unique_clientid &&
+	    atomic_read(&actrl->common_achan_refcount) == 0) {
+		actrl->common_async_chan = achan;
+		atomic_inc(&actrl->common_achan_refcount);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stratix10_svc_add_async_client);
+
+/**
+ * stratix10_svc_remove_async_client - Remove an asynchronous client
+ *                                     from the Stratix10 service
+ *                                     channel.
+ * @chan: Pointer to the Stratix10 service channel structure.
+ *
+ * This function removes an asynchronous client associated with the
+ * given service channel. It checks if the channel and the
+ * asynchronous channel are valid, and then proceeds to decrement
+ * the reference count for the common asynchronous channel if
+ * applicable. If the reference count reaches zero, it destroys the
+ * job ID pool and deallocates the asynchronous client ID. For
+ * non-common asynchronous channels, it directly destroys the job ID
+ * pool, deallocates the asynchronous client ID, and frees the
+ * memory allocated for the asynchronous channel.
+ *
+ * Return: 0 on success, -EINVAL if the channel or asynchronous
+ *         channel is invalid.
+ */
+int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan)
+{
+	struct stratix10_svc_controller *ctrl;
+	struct stratix10_async_ctrl *actrl;
+	struct stratix10_async_chan *achan;
+
+	if (!chan)
+		return -EINVAL;
+
+	ctrl = chan->ctrl;
+	actrl = &ctrl->actrl;
+	achan = chan->async_chan;
+
+	if (!achan) {
+		dev_err(ctrl->dev, "async channel not allocated\n");
+		return -EINVAL;
+	}
+
+	if (achan == actrl->common_async_chan) {
+		/* Decrement and test in one step so only the last user frees */
+		if (atomic_dec_and_test(&actrl->common_achan_refcount)) {
+			ida_destroy(&achan->job_id_pool);
+			ida_free(&actrl->async_id_pool,
+				 achan->async_client_id);
+			kfree(achan);
+			actrl->common_async_chan = NULL;
+		}
+	} else {
+		ida_destroy(&achan->job_id_pool);
+		ida_free(&actrl->async_id_pool, achan->async_client_id);
+		kfree(achan);
+	}
+	chan->async_chan = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stratix10_svc_remove_async_client);
+
+/**
+ * stratix10_svc_async_send - Send an asynchronous message to the
+ *                            Stratix10 service
+ * @chan: Pointer to the service channel structure
+ * @msg: Pointer to the message to be sent
+ * @handler: Pointer to the handler for the asynchronous message
+ *           used by caller for later reference.
+ * @cb: Callback function to be called upon completion
+ * @cb_arg: Argument to be passed to the callback function
+ *
+ * This function sends an asynchronous message to the SDM mailbox in
+ * EL3 secure firmware. It performs various checks and setups,
+ * including allocating a job ID, setting up the transaction ID and
+ * packaging it to EL3 firmware. The function handles different
+ * commands by setting up the appropriate arguments for the SMC call.
+ * If the SMC call is successful, the handler is set up and the
+ * function returns 0. If the SMC call fails, appropriate error
+ * handling is performed along with cleanup of resources.
+ *
+ * Return: 0 on success, -EINVAL for invalid argument, -ENOMEM if
+ * memory is not available, -EAGAIN if EL3 firmware is busy, -EBADF
+ * if the message is rejected by EL3 firmware and -EIO on other
+ * errors from EL3 firmware.
+ */
+int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg,
+			     void **handler, async_callback_t cb, void *cb_arg)
+{
+	struct arm_smccc_1_2_regs args = { 0 }, res = { 0 };
+	struct stratix10_svc_async_handler *handle = NULL;
+	struct stratix10_svc_client_msg *p_msg = msg;
+	struct stratix10_svc_controller *ctrl;
+	struct stratix10_async_ctrl *actrl;
+	struct stratix10_async_chan *achan;
+	int ret = 0;
+
+	if (!chan || !msg || !handler)
+		return -EINVAL;
+
+	achan = chan->async_chan;
+	ctrl = chan->ctrl;
+	actrl = &ctrl->actrl;
+
+	if (!actrl->initialized) {
+		dev_err(ctrl->dev, "Async controller not initialized\n");
+		return -EINVAL;
+	}
+
+	if (!achan) {
+		dev_err(ctrl->dev, "Async channel not allocated\n");
+		return -EINVAL;
+	}
+
+	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* ida_alloc_max() is inclusive; job IDs must fit STRATIX10_JOB_FIELD */
+	ret = ida_alloc_max(&achan->job_id_pool, MAX_SDM_JOB_IDS - 1,
+			    GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(ctrl->dev, "Failed to allocate job id\n");
+		kfree(handle);
+		return -ENOMEM;
+	}
+
+	handle->transaction_id =
+		STRATIX10_SET_TRANSACTIONID(achan->async_client_id, ret);
+	handle->cb = cb;
+	handle->msg = p_msg;
+	handle->cb_arg = cb_arg;
+	handle->achan = achan;
+
+	/* set the transaction ID (client ID and job ID) in args.a1 */
+	args.a1 =
+		STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id);
+
+	switch (p_msg->command) {
+	default:
+		dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command);
+		ret = -EINVAL;
+		goto deallocate_id;
+	}
+
+	/*
+	 * There is a chance that during the execution of async_send()
+	 * in one core, an interrupt might be received in another core;
+	 * to mitigate this we are adding the handle to the DB and then
+	 * send the smc call. If the smc call is rejected or busy then
+	 * we will deallocate the handle for the client to retry again.
+	 */
+	scoped_guard(spinlock_bh, &actrl->trx_list_lock) {
+		hash_add(actrl->trx_list, &handle->next,
+			 handle->transaction_id);
+	}
+
+	actrl->invoke_fn(actrl, &args, &res);
+
+	switch (res.a0) {
+	case INTEL_SIP_SMC_STATUS_OK:
+		dev_dbg(ctrl->dev,
+			"Async message sent with transaction_id 0x%02x\n",
+			handle->transaction_id);
+		*handler = handle;
+		return 0;
+	case INTEL_SIP_SMC_STATUS_BUSY:
+		dev_warn(ctrl->dev, "Mailbox is busy, try after some time\n");
+		ret = -EAGAIN;
+		break;
+	case INTEL_SIP_SMC_STATUS_REJECTED:
+		dev_err(ctrl->dev, "Async message rejected\n");
+		ret = -EBADF;
+		break;
+	default:
+		dev_err(ctrl->dev,
+			"Failed to send async message ,got status as %ld\n",
+			res.a0);
+		ret = -EIO;
+	}
+
+	scoped_guard(spinlock_bh, &actrl->trx_list_lock) {
+		hash_del(&handle->next);
+	}
+
+deallocate_id:
+	ida_free(&achan->job_id_pool,
+		 STRATIX10_GET_JOBID(handle->transaction_id));
+	kfree(handle);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stratix10_svc_async_send);
+/**
+ * stratix10_svc_async_poll - Polls the status of an asynchronous
+ * transaction.
+ * @chan: Pointer to the service channel structure.
+ * @tx_handle: Handle to the transaction being polled.
+ * @data: Pointer to the callback data structure.
+ *
+ * This function polls the status of an asynchronous transaction
+ * identified by the given transaction handle. It ensures that the
+ * necessary structures are initialized and valid before proceeding
+ * with the poll operation. The function sets up the necessary
+ * arguments for the SMC call, invokes the call, and prepares the
+ * response data if the call is successful. If the call fails, the
+ * function returns the error mapped to the SVC status error.
+ *
+ * Return: 0 on success, -EINVAL if any input parameter or the
+ *         transaction handle is invalid or if the firmware reports
+ *         an error other than busy, -EAGAIN if the transaction is
+ *         still in progress.
+ */
+int stratix10_svc_async_poll(struct stratix10_svc_chan *chan,
+			     void *tx_handle,
+			     struct stratix10_svc_cb_data *data)
+{
+	struct stratix10_svc_async_handler *handle;
+	struct arm_smccc_1_2_regs args = { 0 };
+	struct stratix10_svc_controller *ctrl;
+	struct stratix10_async_ctrl *actrl;
+	struct stratix10_async_chan *achan;
+
+	if (!chan || !tx_handle || !data)
+		return -EINVAL;
+
+	ctrl = chan->ctrl;
+	actrl = &ctrl->actrl;
+	achan = chan->async_chan;
+
+	if (!achan) {
+		dev_err(ctrl->dev, "Async channel not allocated\n");
+		return -EINVAL;
+	}
+
+	handle = (struct stratix10_svc_async_handler *)tx_handle;
+	scoped_guard(spinlock_bh, &actrl->trx_list_lock) {
+		if (!hash_hashed(&handle->next)) {
+			dev_err(ctrl->dev, "Invalid transaction handler");
+			return -EINVAL;
+		}
+	}
+
+	args.a0 = INTEL_SIP_SMC_ASYNC_POLL;
+	args.a1 =
+		STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id);
+
+	actrl->invoke_fn(actrl, &args, &handle->res);
+
+	/*clear data for response*/
+	memset(data, 0, sizeof(*data));
+
+	if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) {
+		return 0;
+	} else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) {
+		dev_dbg(ctrl->dev, "async message is still in progress\n");
+		return -EAGAIN;
+	}
+
+	dev_err(ctrl->dev,
+		"Failed to poll async message ,got status as %ld\n",
+		handle->res.a0);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(stratix10_svc_async_poll);
+
+/**
+ * stratix10_svc_async_done - Completes an asynchronous transaction.
+ * @chan: Pointer to the service channel structure.
+ * @tx_handle: Handle to the transaction being completed.
+ *
+ * This function completes an asynchronous transaction identified by
+ * the given transaction handle. It ensures that the necessary
+ * structures are initialized and valid before proceeding with the
+ * completion operation. The function deallocates the transaction ID,
+ * frees the memory allocated for the handler, and removes the handler
+ * from the transaction list.
+ *
+ * Return: 0 on success, -EINVAL if any input parameter is invalid,
+ * or other negative error codes on failure.
+ */
+int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle)
+{
+	struct stratix10_svc_async_handler *handle;
+	struct stratix10_svc_controller *ctrl;
+	struct stratix10_async_chan *achan;
+	struct stratix10_async_ctrl *actrl;
+
+	if (!chan || !tx_handle)
+		return -EINVAL;
+
+	ctrl = chan->ctrl;
+	achan = chan->async_chan;
+	actrl = &ctrl->actrl;
+
+	if (!achan) {
+		dev_err(ctrl->dev, "async channel not allocated\n");
+		return -EINVAL;
+	}
+
+	handle = (struct stratix10_svc_async_handler *)tx_handle;
+	scoped_guard(spinlock_bh, &actrl->trx_list_lock) {
+		if (!hash_hashed(&handle->next)) {
+			dev_err(ctrl->dev, "Invalid transaction handle");
+			return -EINVAL;
+		}
+		hash_del(&handle->next);
+	}
+	ida_free(&achan->job_id_pool,
+		 STRATIX10_GET_JOBID(handle->transaction_id));
+	kfree(handle);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stratix10_svc_async_done);
+
+static inline void stratix10_smc_1_2(struct stratix10_async_ctrl *actrl,
+				     const struct arm_smccc_1_2_regs *args,
+				     struct arm_smccc_1_2_regs *res)
+{
+	arm_smccc_1_2_smc(args, res);
+}
+
+/**
+ * stratix10_svc_async_init - Initialize the Stratix10 service
+ *                            controller for asynchronous operations.
+ * @controller: Pointer to the Stratix10 service controller structure.
+ *
+ * This function initializes the asynchronous service controller by
+ * setting up the necessary data structures and initializing the
+ * transaction list.
+ *
+ * Return: 0 on success, -EINVAL if the controller is NULL, already
+ *         initialized, or the ATF version reported by the firmware is
+ *         too old for asynchronous operation, -EADDRINUSE if
+ *         SIP_SVC_V1_CLIENT_ID cannot be reserved.
+ */
+static int stratix10_svc_async_init(struct stratix10_svc_controller *controller)
+{
+	struct stratix10_async_ctrl *actrl;
+	struct arm_smccc_res res;
+	struct device *dev;
+	int ret;
+
+	if (!controller)
+		return -EINVAL;
+
+	actrl = &controller->actrl;
+
+	if (actrl->initialized)
+		return -EINVAL;
+
+	dev = controller->dev;
+
+	controller->invoke_fn(INTEL_SIP_SMC_SVC_VERSION, 0, 0, 0, 0, 0, 0, 0, &res);
+	if (res.a0 != INTEL_SIP_SMC_STATUS_OK ||
+	    !(res.a1 > ASYNC_ATF_MINIMUM_MAJOR_VERSION ||
+	      (res.a1 == ASYNC_ATF_MINIMUM_MAJOR_VERSION &&
+	       res.a2 >= ASYNC_ATF_MINIMUM_MINOR_VERSION))) {
+		dev_err(dev,
+			"Intel Service Layer Driver: ATF version is not compatible for async operation\n");
+		return -EINVAL;
+	}
+
+	actrl->invoke_fn = stratix10_smc_1_2;
+
+	ida_init(&actrl->async_id_pool);
+
+	/**
+	 * SIP_SVC_V1_CLIENT_ID is used by V1/stratix10_svc_send() clients
+	 * for communicating with SDM synchronously. We need to restrict
+	 * this in V3/stratix10_svc_async_send() usage to distinguish
+	 * between V1 and V3 messages in El3 firmware.
+	 */
+	ret = ida_alloc_range(&actrl->async_id_pool, SIP_SVC_V1_CLIENT_ID,
+			      SIP_SVC_V1_CLIENT_ID, GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(dev,
+			"Intel Service Layer Driver: Error on reserving SIP_SVC_V1_CLIENT_ID\n");
+		ida_destroy(&actrl->async_id_pool);
+		actrl->invoke_fn = NULL;
+		return -EADDRINUSE;
+	}
+
+	spin_lock_init(&actrl->trx_list_lock);
+	hash_init(actrl->trx_list);
+	atomic_set(&actrl->common_achan_refcount, 0);
+
+	actrl->initialized = true;
+	return 0;
+}
+
+/**
+ * stratix10_svc_async_exit - Clean up and exit the asynchronous
+ *                            service controller
+ * @ctrl: Pointer to the stratix10_svc_controller structure
+ *
+ * This function performs the necessary cleanup for the asynchronous
+ * service controller. It checks if the controller is valid and if it
+ * has been initialized. It then locks the transaction list and safely
+ * removes and deallocates each handler in the list. The function also
+ * removes any asynchronous clients associated with the controller's
+ * channels and destroys the asynchronous ID pool. Finally, it resets
+ * the invoke function pointer to NULL.
+ *
+ * Return: 0 on success, -EINVAL if the controller is invalid or not
+ *         initialized.
+ */
+static int stratix10_svc_async_exit(struct stratix10_svc_controller *ctrl)
+{
+	struct stratix10_svc_async_handler *handler;
+	struct stratix10_async_ctrl *actrl;
+	struct hlist_node *tmp;
+	int i;
+
+	if (!ctrl)
+		return -EINVAL;
+
+	actrl = &ctrl->actrl;
+
+	if (!actrl->initialized)
+		return -EINVAL;
+
+	actrl->initialized = false;
+
+	scoped_guard(spinlock_bh, &actrl->trx_list_lock) {
+		hash_for_each_safe(actrl->trx_list, i, tmp, handler, next) {
+			ida_free(&handler->achan->job_id_pool,
+				 STRATIX10_GET_JOBID(handler->transaction_id));
+			hash_del(&handler->next);
+			kfree(handler);
+		}
+	}
+
+	for (i = 0; i < SVC_NUM_CHANNEL; i++) {
+		if (ctrl->chans[i].async_chan) {
+			stratix10_svc_remove_async_client(&ctrl->chans[i]);
+			ctrl->chans[i].async_chan = NULL;
+		}
+	}
+
+	ida_destroy(&actrl->async_id_pool);
+	actrl->invoke_fn = NULL;
+
+	return 0;
+}
+
 /**
  * stratix10_svc_free_channel() - free service channel
  * @chan: service channel to be freed
@@ -1197,11 +1834,18 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
 	controller->invoke_fn = invoke_fn;
 	init_completion(&controller->complete_status);
 
+	ret = stratix10_svc_async_init(controller);
+	if (ret) {
+		dev_dbg(dev, "Intel Service Layer Driver: Error on stratix10_svc_async_init %d\n",
+			ret);
+		goto err_destroy_pool;
+	}
+
 	fifo_size = sizeof(struct stratix10_svc_data) * SVC_NUM_DATA_IN_FIFO;
 	ret = kfifo_alloc(&controller->svc_fifo, fifo_size, GFP_KERNEL);
 	if (ret) {
 		dev_err(dev, "failed to allocate FIFO\n");
-		goto err_destroy_pool;
+		goto err_async_exit;
 	}
 	spin_lock_init(&controller->svc_fifo_lock);
 
@@ -1277,6 +1921,8 @@ err_unregister_rsu_dev:
 	platform_device_unregister(svc->stratix10_svc_rsu);
 err_free_kfifo:
 	kfifo_free(&controller->svc_fifo);
+err_async_exit:
+	stratix10_svc_async_exit(controller);
 err_destroy_pool:
 	gen_pool_destroy(genpool);
 	return ret;
@@ -1287,6 +1933,8 @@ static void stratix10_svc_drv_remove(struct platform_device *pdev)
 	struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev);
 	struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev);
 
+	stratix10_svc_async_exit(ctrl);
+
 	of_platform_depopulate(ctrl->dev);
 
 	platform_device_unregister(svc->intel_svc_fcs);
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index 7306dd243b2a..3995d5d70cce 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2017-2018, Intel Corporation
+ * Copyright (C) 2025, Altera Corporation
  */
 
 #ifndef __STRATIX10_SMC_H
@@ -47,6 +48,10 @@
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \
 	ARM_SMCCC_OWNER_SIP, (func_num))
 
+#define INTEL_SIP_SMC_ASYNC_VAL(func_name)	\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, ARM_SMCCC_SMC_64, \
+	ARM_SMCCC_OWNER_SIP, (func_name))
+
 /**
  * Return values in INTEL_SIP_SMC_* call
  *
@@ -654,4 +659,24 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_HWMON_READVOLT \
 	INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT)
 
+/**
+ * Request INTEL_SIP_SMC_ASYNC_POLL
+ * Async call used by service driver at EL1 to query mailbox response from SDM.
+ *
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_ASYNC_POLL
+ * a1 transaction job id
+ * a2-17 will be used to return the response data
+ *
+ * Return status
+ * a0 INTEL_SIP_SMC_STATUS_OK
+ * a1-17 will contain the response values from mailbox for the previous send
+ * transaction
+ * Or
+ * a0 INTEL_SIP_SMC_STATUS_NO_RESPONSE
+ * a1-17 not used
+ */
+#define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8)
+#define INTEL_SIP_SMC_ASYNC_POLL \
+	INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL)
 #endif
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 520004a5f15d..532dd4bd76dd 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2017-2018, Intel Corporation
+ * Copyright (C) 2025, Altera Corporation
  */
 
 #ifndef __STRATIX10_SVC_CLIENT_H
@@ -290,5 +291,92 @@ int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg);
  * request process.
  */
 void stratix10_svc_done(struct stratix10_svc_chan *chan);
+
+/**
+ * typedef async_callback_t - A type definition for an asynchronous callback function.
+ *
+ * This type defines a function pointer for an asynchronous callback.
+ * The callback function takes a single argument, which is a pointer to
+ * user-defined data.
+ *
+ * @param cb_arg A pointer to user-defined data passed to the callback function.
+ */
+typedef void (*async_callback_t)(void *cb_arg);
+
+/**
+ * stratix10_svc_add_async_client - Add an asynchronous client to a Stratix 10
+ *                                  service channel.
+ * @chan: Pointer to the Stratix 10 service channel structure.
+ * @use_unique_clientid: Boolean flag indicating whether to use a unique client ID.
+ *
+ * This function registers an asynchronous client with the specified Stratix 10
+ * service channel. If the use_unique_clientid flag is set to true, a unique client
+ * ID will be assigned to the client.
+ *
+ * Return: 0 on success, or a negative error code on failure:
+ *         -EINVAL if the channel is NULL or the async controller is not initialized.
+ *         -EALREADY if the async channel is already allocated.
+ *         -ENOMEM if memory allocation fails.
+ *         Other negative values if ID allocation fails
+ */
+int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, bool use_unique_clientid);
+
+/**
+ * stratix10_svc_remove_async_client - Remove an asynchronous client from the Stratix 10
+ *                                     service channel.
+ * @chan: Pointer to the Stratix 10 service channel structure.
+ *
+ * This function removes an asynchronous client from the specified Stratix 10 service channel.
+ * It is typically used to clean up and release resources associated with the client.
+ *
+ * Return: 0 on success, -EINVAL if the channel or asynchronous channel is invalid.
+ */
+int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan);
+
+/**
+ * stratix10_svc_async_send - Send an asynchronous message to the SDM mailbox
+ *                            in EL3 secure firmware.
+ * @chan: Pointer to the service channel structure.
+ * @msg: Pointer to the message to be sent.
+ * @handler: Pointer to the handler object used by caller to track the transaction.
+ * @cb: Callback function to be called upon completion.
+ * @cb_arg: Argument to be passed to the callback function.
+ *
+ * This function sends a message asynchronously to the SDM mailbox in EL3 secure firmware
+ * and registers a callback function to be invoked when the operation completes.
+ *
+ * Return: 0 on success, and negative error codes on failure.
+ */
+int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, void **handler,
+			     async_callback_t cb, void *cb_arg);
+
+/**
+ * stratix10_svc_async_poll - Polls the status of an asynchronous service request.
+ * @chan: Pointer to the service channel structure.
+ * @tx_handle: Handle to the transaction being polled.
+ * @data: Pointer to the callback data structure to be filled with the result.
+ *
+ * This function checks the status of an asynchronous service request
+ * and fills the provided callback data structure with the result.
+ *
+ * Return: 0 on success, -EINVAL if any input parameter is invalid or if the
+ *         async controller is not initialized, -EAGAIN if the transaction is
+ *         still in progress, or other negative error codes on failure.
+ */
+int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, void *tx_handle,
+			     struct stratix10_svc_cb_data *data);
+
+/**
+ * stratix10_svc_async_done - Complete an asynchronous transaction
+ * @chan: Pointer to the service channel structure
+ * @tx_handle: Pointer to the transaction handle
+ *
+ * This function completes an asynchronous transaction by removing the
+ * transaction from the hash table and deallocating the associated resources.
+ *
+ * Return: 0 on success, -EINVAL on invalid input or errors.
+ */
+int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle);
+
 #endif
 
-- 
cgit v1.2.3


From ec52379341a1209826c3e0ae53674393724d2071 Mon Sep 17 00:00:00 2001
From: Mahesh Rao <mahesh.rao@altera.com>
Date: Mon, 27 Oct 2025 22:54:42 +0800
Subject: firmware: stratix10-svc: Add support for RSU commands in asynchronous
 framework

Integrate Remote System Update (RSU) service commands into the
asynchronous framework for communicating with SDM. This allows the RSU
commands to be processed asynchronously, improving the responsiveness
of the Stratix10 service channel.

The asynchronous framework now supports the following RSU commands:
* COMMAND_RSU_GET_SPT_TABLE
* COMMAND_RSU_STATUS
* COMMAND_RSU_NOTIFY

Signed-off-by: Mahesh Rao <mahesh.rao@altera.com>
Reviewed-by: Matthew Gerlach <matthew.gerlach@altera.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
---
 drivers/firmware/stratix10-svc.c                   | 72 ++++++++++++++++++++++
 include/linux/firmware/intel/stratix10-smc.h       | 52 ++++++++++++++++
 .../linux/firmware/intel/stratix10-svc-client.h    |  4 ++
 3 files changed, 128 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 14bfa36a58ed..3acfa067c5dd 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -90,6 +90,12 @@
 #define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \
 	(FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id))
 
+/* 10-bit mask for extracting the SDM status code */
+#define STRATIX10_SDM_STATUS_MASK GENMASK(9, 0)
+/* Macro to get the SDM mailbox error status */
+#define STRATIX10_GET_SDM_STATUS_CODE(status) \
+	(FIELD_GET(STRATIX10_SDM_STATUS_MASK, status))
+
 typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long,
 			     unsigned long, unsigned long, unsigned long,
 			     unsigned long, unsigned long,
@@ -1273,6 +1279,16 @@ int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg,
 		STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id);
 
 	switch (p_msg->command) {
+	case COMMAND_RSU_GET_SPT_TABLE:
+		args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_SPT;
+		break;
+	case COMMAND_RSU_STATUS:
+		args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS;
+		break;
+	case COMMAND_RSU_NOTIFY:
+		args.a0 = INTEL_SIP_SMC_ASYNC_RSU_NOTIFY;
+		args.a2 = p_msg->arg[0];
+		break;
 	default:
 		dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command);
 		ret = -EINVAL;
@@ -1326,6 +1342,56 @@ deallocate_id:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stratix10_svc_async_send);
+
+/**
+ * stratix10_svc_async_prepare_response - Prepare the response data for
+ * an asynchronous transaction.
+ * @chan: Pointer to the service channel structure.
+ * @handle: Pointer to the asynchronous handler structure.
+ * @data: Pointer to the callback data structure.
+ *
+ * This function prepares the response data for an asynchronous transaction. It
+ * extracts the response data from the SMC response structure and stores it in
+ * the callback data structure. The function also logs the completion of the
+ * asynchronous transaction.
+ *
+ * Return: 0 on success, -ENOENT if the command is invalid
+ */
+static int stratix10_svc_async_prepare_response(struct stratix10_svc_chan *chan,
+						struct stratix10_svc_async_handler *handle,
+						struct stratix10_svc_cb_data *data)
+{
+	struct stratix10_svc_client_msg *p_msg =
+		(struct stratix10_svc_client_msg *)handle->msg;
+	struct stratix10_svc_controller *ctrl = chan->ctrl;
+
+	data->status = STRATIX10_GET_SDM_STATUS_CODE(handle->res.a1);
+
+	switch (p_msg->command) {
+	case COMMAND_RSU_NOTIFY:
+		break;
+	case COMMAND_RSU_GET_SPT_TABLE:
+		data->kaddr1 = (void *)&handle->res.a2;
+		data->kaddr2 = (void *)&handle->res.a3;
+		break;
+	case COMMAND_RSU_STATUS:
+		/* COMMAND_RSU_STATUS has more elements than the cb_data
+		 * can accommodate, so passing the response structure to the
+		 * response function to be handled before done command is
+		 * executed by the client.
+		 */
+		data->kaddr1 = (void *)&handle->res;
+		break;
+
+	default:
+		dev_alert(ctrl->dev, "Invalid command\n ,%d", p_msg->command);
+		return -ENOENT;
+	}
+	dev_dbg(ctrl->dev, "Async message completed transaction_id 0x%02x\n",
+		handle->transaction_id);
+	return 0;
+}
+
 /**
  * stratix10_svc_async_poll - Polls the status of an asynchronous
  * transaction.
@@ -1355,6 +1421,7 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan,
 	struct stratix10_svc_controller *ctrl;
 	struct stratix10_async_ctrl *actrl;
 	struct stratix10_async_chan *achan;
+	int ret;
 
 	if (!chan || !tx_handle || !data)
 		return -EINVAL;
@@ -1386,6 +1453,11 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan,
 	memset(data, 0, sizeof(*data));
 
 	if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) {
+		ret = stratix10_svc_async_prepare_response(chan, handle, data);
+		if (ret) {
+			dev_err(ctrl->dev, "Error in preparation of response,%d\n", ret);
+			WARN_ON_ONCE(1);
+		}
 		return 0;
 	} else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) {
 		dev_dbg(ctrl->dev, "async message is still in progress\n");
diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h
index 3995d5d70cce..935dba3633b5 100644
--- a/include/linux/firmware/intel/stratix10-smc.h
+++ b/include/linux/firmware/intel/stratix10-smc.h
@@ -679,4 +679,56 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE)
 #define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8)
 #define INTEL_SIP_SMC_ASYNC_POLL \
 	INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL)
+
+/**
+ * Request INTEL_SIP_SMC_ASYNC_RSU_GET_SPT
+ * Async call to get RSU SPT from SDM.
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_SPT
+ * a1 transaction job id
+ * a2-a17 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED
+ * or INTEL_SIP_SMC_STATUS_BUSY
+ * a1-a17 not used
+ */
+#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT (0xEA)
+#define INTEL_SIP_SMC_ASYNC_RSU_GET_SPT \
+	INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT)
+
+/**
+ * Request INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS
+ * Async call to get RSU error status from SDM.
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS
+ * a1 transaction job id
+ * a2-a17 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED
+ * or INTEL_SIP_SMC_STATUS_BUSY
+ * a1-a17 not used
+ */
+#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS (0xEB)
+#define INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS \
+	INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS)
+
+/**
+ * Request INTEL_SIP_SMC_ASYNC_RSU_NOTIFY
+ * Async call to send NOTIFY value to SDM.
+ * Call register usage:
+ * a0 INTEL_SIP_SMC_ASYNC_RSU_NOTIFY
+ * a1 transaction job id
+ * a2 notify value
+ * a3-a17 not used
+ *
+ * Return status:
+ * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED
+ * or INTEL_SIP_SMC_STATUS_BUSY
+ * a1-a17 not used
+ */
+#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY (0xEC)
+#define INTEL_SIP_SMC_ASYNC_RSU_NOTIFY \
+	INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY)
 #endif
diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 532dd4bd76dd..1bcc56d14080 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -128,6 +128,9 @@ struct stratix10_svc_chan;
  * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status
  * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
  *
+ * @COMMAND_RSU_GET_SPT_TABLE: query firmware for SPT table
+ * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
+ *
  * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware,
  * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM
  *
@@ -162,6 +165,7 @@ enum stratix10_svc_command_code {
 	COMMAND_RSU_DCMF_VERSION,
 	COMMAND_RSU_DCMF_STATUS,
 	COMMAND_FIRMWARE_VERSION,
+	COMMAND_RSU_GET_SPT_TABLE,
 	/* for FCS */
 	COMMAND_FCS_REQUEST_SERVICE = 20,
 	COMMAND_FCS_SEND_CERTIFICATE,
-- 
cgit v1.2.3


From 524c3853831cf4f7e1db579e487c757c3065165c Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 22 Oct 2025 20:11:37 +0900
Subject: jbd2: use a per-journal lock_class_key for jbd2_trans_commit_key

syzbot is reporting the possibility of a deadlock due to sharing a
lock_class_key for jbd2_handle across ext4 and ocfs2. But this is a false
positive, for one disk partition can't have two filesystems at the same time.

Reported-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=6e493c165d26d6fcbf72
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <987110fc-5470-457a-a218-d286a09dd82f@I-love.SAKURA.ne.jp>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/jbd2/journal.c    | 6 ++++--
 include/linux/jbd2.h | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d480b94117cd..f43474002f50 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1521,7 +1521,6 @@ static journal_t *journal_init_common(struct block_device *bdev,
 			struct block_device *fs_dev,
 			unsigned long long start, int len, int blocksize)
 {
-	static struct lock_class_key jbd2_trans_commit_key;
 	journal_t *journal;
 	int err;
 	int n;
@@ -1530,6 +1529,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	if (!journal)
 		return ERR_PTR(-ENOMEM);
 
+	lockdep_register_key(&journal->jbd2_trans_commit_key);
 	journal->j_blocksize = blocksize;
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -1560,7 +1560,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	journal->j_max_batch_time = 15000; /* 15ms */
 	atomic_set(&journal->j_reserved_credits, 0);
 	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
-			 &jbd2_trans_commit_key, 0);
+			 &journal->jbd2_trans_commit_key, 0);
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
@@ -1611,6 +1611,7 @@ err_cleanup:
 	kfree(journal->j_wbuf);
 	jbd2_journal_destroy_revoke(journal);
 	journal_fail_superblock(journal);
+	lockdep_unregister_key(&journal->jbd2_trans_commit_key);
 	kfree(journal);
 	return ERR_PTR(err);
 }
@@ -2187,6 +2188,7 @@ int jbd2_journal_destroy(journal_t *journal)
 		jbd2_journal_destroy_revoke(journal);
 	kfree(journal->j_fc_wbuf);
 	kfree(journal->j_wbuf);
+	lockdep_unregister_key(&journal->jbd2_trans_commit_key);
 	kfree(journal);
 
 	return err;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 43b9297fe8a7..f5eaf76198f3 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1253,6 +1253,12 @@ struct journal_s
 	 */
 	struct lockdep_map	j_trans_commit_map;
 #endif
+	/**
+	 * @jbd2_trans_commit_key:
+	 *
+	 * "struct lock_class_key" for @j_trans_commit_map
+	 */
+	struct lock_class_key	jbd2_trans_commit_key;
 
 	/**
 	 * @j_fc_cleanup_callback:
-- 
cgit v1.2.3


From f694d215d34035cc64b1d176fd82db0d1f2428d4 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 11 Nov 2025 11:26:44 +0000
Subject: net: stmmac: always allocate mac_device_info

The ->setup() method implemented by dwmac-loongson and dwmac-sun8i
allocates the mac_device_info structure, as does stmmac_hwif_init().
This makes no sense.

Have stmmac_hwif_init() always allocate this structure, and pass it to
the ->setup() method to initialise when it is provided. Rename this
method to "mac_setup" to more accurately describe what it is doing.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vImWK-0000000DrIx-28vO@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 13 ++++---------
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c    | 11 +++--------
 drivers/net/ethernet/stmicro/stmmac/hwif.c           | 16 +++++++++-------
 include/linux/stmmac.h                               |  4 +++-
 4 files changed, 19 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
index 2a3ac0136cdb..dd2fc39ec3e2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
@@ -320,10 +320,9 @@ static int loongson_dwmac_dma_interrupt(struct stmmac_priv *priv,
 	return ret;
 }
 
-static struct mac_device_info *loongson_dwmac_setup(void *apriv)
+static int loongson_dwmac_setup(void *apriv, struct mac_device_info *mac)
 {
 	struct stmmac_priv *priv = apriv;
-	struct mac_device_info *mac;
 	struct stmmac_dma_ops *dma;
 	struct loongson_data *ld;
 	struct pci_dev *pdev;
@@ -331,13 +330,9 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv)
 	ld = priv->plat->bsp_priv;
 	pdev = to_pci_dev(priv->device);
 
-	mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
-	if (!mac)
-		return NULL;
-
 	dma = devm_kzalloc(priv->device, sizeof(*dma), GFP_KERNEL);
 	if (!dma)
-		return NULL;
+		return -ENOMEM;
 
 	/* The Loongson GMAC and GNET devices are based on the DW GMAC
 	 * v3.50a and v3.73a IP-cores. But the HW designers have changed
@@ -396,7 +391,7 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv)
 	mac->mii.clk_csr_shift = 2;
 	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
-	return mac;
+	return 0;
 }
 
 static int loongson_dwmac_msi_config(struct pci_dev *pdev,
@@ -598,7 +593,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
 		goto err_disable_device;
 
 	plat->bsp_priv = ld;
-	plat->setup = loongson_dwmac_setup;
+	plat->mac_setup = loongson_dwmac_setup;
 	plat->fix_soc_reset = loongson_dwmac_fix_reset;
 	plat->suspend = loongson_dwmac_suspend;
 	plat->resume = loongson_dwmac_resume;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 5d871b2cd111..7434d4bbb526 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -1040,15 +1040,10 @@ static const struct stmmac_ops sun8i_dwmac_ops = {
 	.set_mac_loopback = sun8i_dwmac_set_mac_loopback,
 };
 
-static struct mac_device_info *sun8i_dwmac_setup(void *ppriv)
+static int sun8i_dwmac_setup(void *ppriv, struct mac_device_info *mac)
 {
-	struct mac_device_info *mac;
 	struct stmmac_priv *priv = ppriv;
 
-	mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
-	if (!mac)
-		return NULL;
-
 	mac->pcsr = priv->ioaddr;
 	mac->mac = &sun8i_dwmac_ops;
 	mac->dma = &sun8i_dwmac_dma_ops;
@@ -1079,7 +1074,7 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv)
 	/* Synopsys Id is not available */
 	priv->synopsys_id = 0;
 
-	return mac;
+	return 0;
 }
 
 static struct regmap *sun8i_dwmac_get_syscon_from_dev(struct device_node *node)
@@ -1192,7 +1187,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	plat_dat->bsp_priv = gmac;
 	plat_dat->init = sun8i_dwmac_init;
 	plat_dat->exit = sun8i_dwmac_exit;
-	plat_dat->setup = sun8i_dwmac_setup;
+	plat_dat->mac_setup = sun8i_dwmac_setup;
 	plat_dat->tx_fifo_size = 4096;
 	plat_dat->rx_fifo_size = 16384;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index ee612cadbd77..014f7cd79a3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -347,17 +347,19 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 			priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET;
 	}
 
+	mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
+	if (!mac)
+		return -ENOMEM;
+
 	/* Check for HW specific setup first */
-	if (priv->plat->setup) {
-		mac = priv->plat->setup(priv);
+	if (priv->plat->mac_setup) {
+		ret = priv->plat->mac_setup(priv, mac);
+		if (ret)
+			return ret;
+
 		needs_setup = false;
-	} else {
-		mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
 	}
 
-	if (!mac)
-		return -ENOMEM;
-
 	spin_lock_init(&mac->irq_ctrl_lock);
 
 	/* Fallback to generic HW */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 48e9f1d4e17e..4f70a6551e68 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -192,6 +192,8 @@ enum dwmac_core_type {
 #define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP	BIT(12)
 #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY	BIT(13)
 
+struct mac_device_info;
+
 struct plat_stmmacenet_data {
 	enum dwmac_core_type core_type;
 	int bus_id;
@@ -266,7 +268,7 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	int (*suspend)(struct device *dev, void *priv);
 	int (*resume)(struct device *dev, void *priv);
-	struct mac_device_info *(*setup)(void *priv);
+	int (*mac_setup)(void *priv, struct mac_device_info *mac);
 	int (*clks_config)(void *priv, bool enabled);
 	int (*crosststamp)(ktime_t *device, struct system_counterval_t *system,
 			   void *ctx);
-- 
cgit v1.2.3


From 0f2620ffc41d117cc28bc053efe2dc837cf748dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Nov 2025 09:39:42 +0100
Subject: fault-inject: make enum fault_flags available unconditionally

This will allow using should_fail_ex from code without having to
make it conditional on CONFIG_FAULT_INJECTION.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113084022.1255121-2-hch@lst.de
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/fault-inject.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 8c829d28dcf3..58fd14c82270 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -8,6 +8,10 @@
 struct dentry;
 struct kmem_cache;
 
+enum fault_flags {
+	FAULT_NOWARN =	1 << 0,
+};
+
 #ifdef CONFIG_FAULT_INJECTION
 
 #include <linux/atomic.h>
@@ -36,10 +40,6 @@ struct fault_attr {
 	struct dentry *dname;
 };
 
-enum fault_flags {
-	FAULT_NOWARN =	1 << 0,
-};
-
 #define FAULT_ATTR_INITIALIZER {					\
 		.interval = 1,						\
 		.times = ATOMIC_INIT(1),				\
-- 
cgit v1.2.3


From 2647e2ecc096d2330d6b6a34a3a1f0a99828c14c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Nov 2025 10:48:57 +0000
Subject: io_uring/query: introduce zcrx query

Add a new query type IO_URING_QUERY_ZCRX returning the user some basic
information about the interface, which includes allowed flags for areas
and registration and supported IORING_REGISTER_ZCRX_CTRL subcodes.

There is also a chicken-egg problem with user provided refill queue
memory, where offsets and size information is returned after
registration, but to properly allocate memory you need to know it
beforehand, which is why the userspace currently has to guess the RQ
headers size and severely overestimates it. Return the size information.
It's split into "size" and "alignment" fields because for default
placement modes the user is interested in the aligned size, however if
it gets support for more flexible placement, it'll need to only know the
actual header size.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/query.h | 16 ++++++++++++++++
 io_uring/query.c                    | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 3539ccbfd064..fc0cb1580e47 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -18,6 +18,7 @@ struct io_uring_query_hdr {
 
 enum {
 	IO_URING_QUERY_OPCODES			= 0,
+	IO_URING_QUERY_ZCRX			= 1,
 
 	__IO_URING_QUERY_MAX,
 };
@@ -41,4 +42,19 @@ struct io_uring_query_opcode {
 	__u32	__pad;
 };
 
+struct io_uring_query_zcrx {
+	/* Bitmask of supported ZCRX_REG_* flags */
+	__u64 register_flags;
+	/* Bitmask of all supported IORING_ZCRX_AREA_* flags */
+	__u64 area_flags;
+	/* The number of supported ZCRX_CTRL_* opcodes */
+	__u32 nr_ctrl_opcodes;
+	__u32 __resv1;
+	/* The refill ring header size */
+	__u32 rq_hdr_size;
+	/* The alignment for the header */
+	__u32 rq_hdr_alignment;
+	__u64 __resv2;
+};
+
 #endif
diff --git a/io_uring/query.c b/io_uring/query.c
index e1435cdc2665..6f9fa5153903 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -4,9 +4,11 @@
 
 #include "query.h"
 #include "io_uring.h"
+#include "zcrx.h"
 
 union io_query_data {
 	struct io_uring_query_opcode opcodes;
+	struct io_uring_query_zcrx zcrx;
 };
 
 #define IO_MAX_QUERY_SIZE		sizeof(union io_query_data)
@@ -27,6 +29,20 @@ static ssize_t io_query_ops(union io_query_data *data)
 	return sizeof(*e);
 }
 
+static ssize_t io_query_zcrx(union io_query_data *data)
+{
+	struct io_uring_query_zcrx *e = &data->zcrx;
+
+	e->register_flags = ZCRX_REG_IMPORT;
+	e->area_flags = IORING_ZCRX_AREA_DMABUF;
+	e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST;
+	e->rq_hdr_size = sizeof(struct io_uring);
+	e->rq_hdr_alignment = L1_CACHE_BYTES;
+	e->__resv1 = 0;
+	e->__resv2 = 0;
+	return sizeof(*e);
+}
+
 static int io_handle_query_entry(struct io_ring_ctx *ctx,
 				 union io_query_data *data, void __user *uhdr,
 				 u64 *next_entry)
@@ -55,6 +71,9 @@ static int io_handle_query_entry(struct io_ring_ctx *ctx,
 	case IO_URING_QUERY_OPCODES:
 		ret = io_query_ops(data);
 		break;
+	case IO_URING_QUERY_ZCRX:
+		ret = io_query_zcrx(data);
+		break;
 	}
 
 	if (ret >= 0) {
-- 
cgit v1.2.3


From 4aaa9bc4d5921363490d95fe66c4db086a915799 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Nov 2025 10:48:58 +0000
Subject: io_uring/query: introduce rings info query

Same problem as with zcrx in the previous patch, the user needs to know
SQ/CQ header sizes to allocate memory before setup to use it for user
provided rings, i.e. IORING_SETUP_NO_MMAP, however that information is
only returned after registration, hence the user is guessing kernel
implementation details.

Return the header size and alignment, which is split with the same
motivation, to allow the user to know the real structure size without
alignment in case there will be more flexible placement schemes in the
future.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/query.h |  8 ++++++++
 io_uring/query.c                    | 13 +++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index fc0cb1580e47..2456e6c5ebb5 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -19,6 +19,7 @@ struct io_uring_query_hdr {
 enum {
 	IO_URING_QUERY_OPCODES			= 0,
 	IO_URING_QUERY_ZCRX			= 1,
+	IO_URING_QUERY_SCQ			= 2,
 
 	__IO_URING_QUERY_MAX,
 };
@@ -57,4 +58,11 @@ struct io_uring_query_zcrx {
 	__u64 __resv2;
 };
 
+struct io_uring_query_scq {
+	/* The SQ/CQ rings header size */
+	__u64 hdr_size;
+	/* The alignment for the header */
+	__u64 hdr_alignment;
+};
+
 #endif
diff --git a/io_uring/query.c b/io_uring/query.c
index 6f9fa5153903..e61b6221f87f 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -9,6 +9,7 @@
 union io_query_data {
 	struct io_uring_query_opcode opcodes;
 	struct io_uring_query_zcrx zcrx;
+	struct io_uring_query_scq scq;
 };
 
 #define IO_MAX_QUERY_SIZE		sizeof(union io_query_data)
@@ -43,6 +44,15 @@ static ssize_t io_query_zcrx(union io_query_data *data)
 	return sizeof(*e);
 }
 
+static ssize_t io_query_scq(union io_query_data *data)
+{
+	struct io_uring_query_scq *e = &data->scq;
+
+	e->hdr_size = sizeof(struct io_rings);
+	e->hdr_alignment = SMP_CACHE_BYTES;
+	return sizeof(*e);
+}
+
 static int io_handle_query_entry(struct io_ring_ctx *ctx,
 				 union io_query_data *data, void __user *uhdr,
 				 u64 *next_entry)
@@ -74,6 +84,9 @@ static int io_handle_query_entry(struct io_ring_ctx *ctx,
 	case IO_URING_QUERY_ZCRX:
 		ret = io_query_zcrx(data);
 		break;
+	case IO_URING_QUERY_SCQ:
+		ret = io_query_scq(data);
+		break;
 	}
 
 	if (ret >= 0) {
-- 
cgit v1.2.3


From d663976dad68de9b2e3df59cc31f0a24ee4c4511 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Nov 2025 10:46:12 +0000
Subject: io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL

It would be annoying and take a fair amount of boilerplate code to
implement each new zcrx feature as a separate io_uring register opcode.
Introduce IORING_REGISTER_ZCRX_CTRL, which will multiplex such calls to
zcrx. Note, there are no real users of the opcode in this patch.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 13 +++++++++++++
 io_uring/register.c           |  3 +++
 io_uring/zcrx.c               | 21 +++++++++++++++++++++
 io_uring/zcrx.h               |  6 ++++++
 4 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..0e1d353fab1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY			= 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+enum zcrx_ctrl_op {
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl {
+	__u32	zcrx_id;
+	__u32	op; /* see enum zcrx_ctrl_op */
+	__u64	__resv[8];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/register.c b/io_uring/register.c
index 334a457da3f7..fc66a5364483 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 149bf9d5b983..0b5f4320c7a9 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -941,6 +941,27 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *zcrx;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+		return -EFAULT;
+
+	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!zcrx)
+		return -ENXIO;
+	if (ctrl.op >= __ZCRX_CTRL_LAST)
+		return -EOPNOTSUPP;
+
+	return -EINVAL;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off, int len)
 {
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index c9b9bfae0547..f29edc22c91f 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -65,6 +65,7 @@ struct io_zcrx_ifq {
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +94,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
 {
 	return NULL;
 }
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+				void __user *arg, unsigned nr_arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
-- 
cgit v1.2.3


From 475eb39b00478b1898bc9080344dcd8e86c53c7a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Nov 2025 10:46:13 +0000
Subject: io_uring/zcrx: add sync refill queue flushing

Add a zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the
kernel to flush / consume entries from the refill queue. Just as with
the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address
cases where the refill queue becomes full, and the user can't return
buffers and needs to stash them. It's still a slow path, and the user
should size the refill queue appropriately, but it should be helpful for
handling temporary traffic spikes and other unpredictable conditions.

The interface is simpler compared to ZCRX_REFILL as it doesn't need
temporary refill entry arrays and gives natural batching, whereas
ZCRX_REFILL requires even more user logic to be somewhat efficient.

Also, add a structure for the operation. It's not currently used but
can serve for future improvements like limiting the number of buffers to
process, etc.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 10 +++++-
 io_uring/zcrx.c               | 74 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0e1d353fab1d..db47fced2cc6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1082,13 +1082,21 @@ struct io_uring_zcrx_ifq_reg {
 };
 
 enum zcrx_ctrl_op {
+	ZCRX_CTRL_FLUSH_RQ,
+
 	__ZCRX_CTRL_LAST,
 };
 
+struct zcrx_ctrl_flush_rq {
+	__u64		__resv[6];
+};
+
 struct zcrx_ctrl {
 	__u32	zcrx_id;
 	__u32	op; /* see enum zcrx_ctrl_op */
-	__u64	__resv[8];
+	__u64	__resv[2];
+
+	struct zcrx_ctrl_flush_rq	zc_flush;
 };
 
 #ifdef __cplusplus
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 0b5f4320c7a9..08c103af69bc 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -941,6 +941,71 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+			      struct io_zcrx_ifq *zcrx)
+{
+	unsigned int mask = zcrx->rq_entries - 1;
+	unsigned int i;
+
+	guard(spinlock_bh)(&zcrx->rq_lock);
+
+	nr = min(nr, io_zcrx_rqring_entries(zcrx));
+	for (i = 0; i < nr; i++) {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+		struct net_iov *niov;
+
+		if (!io_parse_rqe(rqe, zcrx, &niov))
+			break;
+		netmem_array[i] = net_iov_to_netmem(niov);
+	}
+
+	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+	return i;
+}
+
+#define ZCRX_FLUSH_BATCH 32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		netmem_ref netmem = netmems[i];
+		struct net_iov *niov = netmem_to_net_iov(netmem);
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		if (!page_pool_unref_and_test(netmem))
+			continue;
+		io_zcrx_return_niov(niov);
+	}
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			 struct zcrx_ctrl *ctrl)
+{
+	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+	netmem_ref netmems[ZCRX_FLUSH_BATCH];
+	unsigned total = 0;
+	unsigned nr;
+
+	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+		return -EINVAL;
+
+	do {
+		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+		zcrx_return_buffers(netmems, nr);
+		total += nr;
+
+		if (fatal_signal_pending(current))
+			break;
+		cond_resched();
+	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+	return 0;
+}
+
 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 {
 	struct zcrx_ctrl ctrl;
@@ -956,10 +1021,13 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
 	if (!zcrx)
 		return -ENXIO;
-	if (ctrl.op >= __ZCRX_CTRL_LAST)
-		return -EOPNOTSUPP;
 
-	return -EINVAL;
+	switch (ctrl.op) {
+	case ZCRX_CTRL_FLUSH_RQ:
+		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	}
+
+	return -EOPNOTSUPP;
 }
 
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
-- 
cgit v1.2.3


From d7af80b213e5675664b14f12240cb282e81773d5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 13 Nov 2025 10:46:16 +0000
Subject: io_uring/zcrx: export zcrx via a file

Add an option to wrap a zcrx instance into a file and expose it to the
user space. Currently, users can't do anything meaningful with the file,
but it'll be used in a later patch to import it into another io_uring
instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the
IORING_REGISTER_ZCRX_CTRL registration opcode.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 11 ++++++-
 io_uring/zcrx.c               | 68 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 72 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index db47fced2cc6..4bedc0310a55 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1083,6 +1083,7 @@ struct io_uring_zcrx_ifq_reg {
 
 enum zcrx_ctrl_op {
 	ZCRX_CTRL_FLUSH_RQ,
+	ZCRX_CTRL_EXPORT,
 
 	__ZCRX_CTRL_LAST,
 };
@@ -1091,12 +1092,20 @@ struct zcrx_ctrl_flush_rq {
 	__u64		__resv[6];
 };
 
+struct zcrx_ctrl_export {
+	__u32		zcrx_fd;
+	__u32 		__resv1[11];
+};
+
 struct zcrx_ctrl {
 	__u32	zcrx_id;
 	__u32	op; /* see enum zcrx_ctrl_op */
 	__u64	__resv[2];
 
-	struct zcrx_ctrl_flush_rq	zc_flush;
+	union {
+		struct zcrx_ctrl_export		zc_export;
+		struct zcrx_ctrl_flush_rq	zc_flush;
+	};
 };
 
 #ifdef __cplusplus
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index e60c5c00a611..815992aff246 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
 
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
@@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
 	}
 }
 
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -596,6 +606,55 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EFAULT;
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner		= THIS_MODULE,
+	.release	= zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+		       struct zcrx_ctrl *ctrl, void __user *arg)
+{
+	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(ce, sizeof(*ce)))
+		return -EINVAL;
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ce->zcrx_fd = fd;
+	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return 0;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -742,12 +801,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-
-		if (refcount_dec_and_test(&ifq->user_refs)) {
-			io_close_queue(ifq);
-			io_zcrx_scrub(ifq);
-		}
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
@@ -1028,6 +1082,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 	switch (ctrl.op) {
 	case ZCRX_CTRL_FLUSH_RQ:
 		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	case ZCRX_CTRL_EXPORT:
+		return zcrx_export(ctx, zcrx, &ctrl, arg);
 	}
 
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 00d91481279fb2df8c46d19090578afd523ca630 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 13 Nov 2025 10:46:18 +0000
Subject: io_uring/zcrx: share an ifq between rings

Add a way to share an ifq from a src ring that is real (i.e. bound to a
HW RX queue) with other rings. This is done by passing a new flag
IORING_ZCRX_IFQ_REG_IMPORT in the registration struct
io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  4 +++
 io_uring/zcrx.c               | 63 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4bedc0310a55..deb772222b6d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
 	__u64	__resv2[2];
 };
 
+enum zcrx_reg_flags {
+	ZCRX_REG_IMPORT	= 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index da7e556c349e..b99cf2c6670a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -660,6 +660,63 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
 	return 0;
 }
 
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	io_fill_zcrx_offsets(&reg->offsets);
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+
+	return 0;
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -685,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & ZCRX_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
-- 
cgit v1.2.3


From 36640d21fdfe0152c96e6cb9b58e3336291dfbaa Mon Sep 17 00:00:00 2001
From: Siddharth Vadapalli <s-vadapalli@ti.com>
Date: Wed, 29 Oct 2025 13:34:49 +0530
Subject: PCI: Export pci_get_host_bridge_device() for use by pci-keystone

The pci-keystone.c driver uses the 'pci_get_host_bridge_device()' helper.
Export it in preparation for enabling the pci-keystone.c driver to be built
as a loadable module.

Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20251029080547.1253757-2-s-vadapalli@ti.com
---
 drivers/pci/host-bridge.c | 1 +
 include/linux/pci.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c
index afa50b446567..be5ef6516cff 100644
--- a/drivers/pci/host-bridge.c
+++ b/drivers/pci/host-bridge.c
@@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev)
 	kobject_get(&bridge->kobj);
 	return bridge;
 }
+EXPORT_SYMBOL_GPL(pci_get_host_bridge_device);
 
 void  pci_put_host_bridge_device(struct device *dev)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..b253cbc27d36 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -646,6 +646,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv);
 struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev,
 						   size_t priv);
 void pci_free_host_bridge(struct pci_host_bridge *bridge);
+struct device *pci_get_host_bridge_device(struct pci_dev *dev);
 struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus);
 
 void pci_set_host_bridge_release(struct pci_host_bridge *bridge,
-- 
cgit v1.2.3


From 8d63e85c5b50f1dbfa0ccb214bd91fe5d7e2e860 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 4 Nov 2025 11:26:53 -0800
Subject: firmware: cs_dsp: fix kernel-doc warnings in a header file

Use correct kernel-doc format to avoid kernel-doc warnings in
include/linux/firmware/cirrus/cs_dsp_test_utils.h:

- mark one struct member as private: since the comment says that it is
private
- add ending ':' to struct members where needed

Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:30 struct
 member 'saw_bus_write' not described in 'cs_dsp_test'
Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'id' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'ver' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'xm_base_words' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'xm_size_words' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'ym_base_words' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'ym_size_words' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'zm_base_words' not described in 'cs_dsp_mock_alg_def'
Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct
 member 'zm_size_words' not described in 'cs_dsp_mock_alg_def'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251104192653.929157-1-rdunlap@infradead.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/cirrus/cs_dsp_test_utils.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h
index ecd821ed8064..1f97764fdfd7 100644
--- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h
+++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h
@@ -26,21 +26,21 @@ struct cs_dsp_test {
 
 	struct cs_dsp_test_local *local;
 
-	/* Following members are private */
+	/* private: Following members are private */
 	bool saw_bus_write;
 };
 
 /**
  * struct cs_dsp_mock_alg_def - Info for creating a mock algorithm entry.
  *
- * @id		  Algorithm ID.
- * @ver;	  Algorithm version.
- * @xm_base_words XM base address in DSP words.
- * @xm_size_words XM size in DSP words.
- * @ym_base_words YM base address in DSP words.
- * @ym_size_words YM size in DSP words.
- * @zm_base_words ZM base address in DSP words.
- * @zm_size_words ZM size in DSP words.
+ * @id:		   Algorithm ID.
+ * @ver:	   Algorithm version.
+ * @xm_base_words: XM base address in DSP words.
+ * @xm_size_words: XM size in DSP words.
+ * @ym_base_words: YM base address in DSP words.
+ * @ym_size_words: YM size in DSP words.
+ * @zm_base_words: ZM base address in DSP words.
+ * @zm_size_words: ZM size in DSP words.
  */
 struct cs_dsp_mock_alg_def {
 	unsigned int id;
-- 
cgit v1.2.3


From 280b7cdddc3d96c4887fdb31b6766e4db1b2f2a3 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Sat, 1 Nov 2025 05:00:31 +0000
Subject: dt-bindings: clock: renesas,r9a09g057-cpg: Add USB3.0 core clocks

Add definitions for USB3.0 core clocks in the R9A09G057 CPG DT bindings
header file.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251101050034.738807-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g057-cpg.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
index 5346a898ab60..f91d7f72922a 100644
--- a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
+++ b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
@@ -22,5 +22,9 @@
 #define R9A09G057_GBETH_0_CLK_PTP_REF_I		11
 #define R9A09G057_GBETH_1_CLK_PTP_REF_I		12
 #define R9A09G057_SPI_CLK_SPI			13
+#define R9A09G057_USB3_0_REF_ALT_CLK_P		14
+#define R9A09G057_USB3_0_CLKCORE		15
+#define R9A09G057_USB3_1_REF_ALT_CLK_P		16
+#define R9A09G057_USB3_1_CLKCORE		17
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G057_CPG_H__ */
-- 
cgit v1.2.3


From a95ce05cd0cc8b53f1559390c4e690bb8f79562f Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Sat, 1 Nov 2025 05:00:32 +0000
Subject: dt-bindings: clock: renesas,r9a09g056-cpg: Add USB3.0 core clocks

Add definitions for USB3.0 core clocks in the R9A09G056 CPG DT bindings
header file.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251101050034.738807-3-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g056-cpg.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
index a9af5af9e3a1..234dcf4f0f91 100644
--- a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
+++ b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
@@ -21,5 +21,7 @@
 #define R9A09G056_GBETH_0_CLK_PTP_REF_I		10
 #define R9A09G056_GBETH_1_CLK_PTP_REF_I		11
 #define R9A09G056_SPI_CLK_SPI			12
+#define R9A09G056_USB3_0_REF_ALT_CLK_P		13
+#define R9A09G056_USB3_0_CLKCORE		14
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G056_CPG_H__ */
-- 
cgit v1.2.3


From e5b5f8b7c26f72fe86b59979e51d8e6cf36ea903 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Nov 2025 18:14:40 -0800
Subject: PCI/TSM: Drop stub for pci_tsm_doe_transfer()

Just like pci_tsm_pf0_{con,de}structor(), in the CONFIG_PCI_TSM=n case there
should be no callers of pci_tsm_doe_transfer().

Reported-by: Xu Yilun <yilun.xu@linux.intel.com>
Closes: http://lore.kernel.org/aRFfk14DJWEVhC/R@yilunxu-OptiPlex-7050
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-3-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci-tsm.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h
index e921d30f9b6c..d7b078d5e272 100644
--- a/include/linux/pci-tsm.h
+++ b/include/linux/pci-tsm.h
@@ -147,11 +147,5 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev)
 static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev)
 {
 }
-static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type,
-				       const void *req, size_t req_sz,
-				       void *resp, size_t resp_sz)
-{
-	return -ENXIO;
-}
 #endif
 #endif /*__PCI_TSM_H */
-- 
cgit v1.2.3


From c16af019d9d6d23f211c82b5561f2ecd2a7dff54 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Nov 2025 18:14:41 -0800
Subject: resource: Introduce resource_assigned() for discerning active
 resources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A PCI bridge resource lifecycle involves both a "request" and "assign"
phase. At any point in time that resource may not yet be assigned, or may
have failed to assign (because it does not fit).

There are multiple conventions to determine when assignment has not
completed: IORESOURCE_UNSET, IORESOURCE_DISABLED, and checking whether the
resource is parented.

In code paths that are known to not be racing assignment, e.g. post
subsys_initcall(), the most reliable method to judge that a bridge resource
is assigned is to check the resource is parented [1].

Introduce a resource_assigned() helper for this purpose.

Link: http://lore.kernel.org/2b9f7f7b-d6a4-be59-14d4-7b4ffccfe373@linux.intel.com [1]
Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-4-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/ioport.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e8b2d6aa4013..9afa30f9346f 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -334,6 +334,15 @@ static inline bool resource_union(const struct resource *r1, const struct resour
 	return true;
 }
 
+/*
+ * Check if this resource is added to a resource tree or detached. Caller is
+ * responsible for not racing assignment.
+ */
+static inline bool resource_assigned(struct resource *res)
+{
+	return res->parent;
+}
+
 int find_resource_space(struct resource *root, struct resource *new,
 			resource_size_t size, struct resource_constraint *constraint);
 
-- 
cgit v1.2.3


From 4aa73c6051cb65046e6fa601b7877b5c1e6edc85 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 12 Nov 2025 21:46:24 +0100
Subject: net: dsa: remove definition of struct dsa_switch_driver

Since 93e86b3bc842 ("net: dsa: Remove legacy probing support")
this struct has no user any longer.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/4053a98f-052f-4dc1-a3d4-ed9b3d3cc7cb@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2df2e2ead9a8..97d5f401cfcf 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1314,11 +1314,6 @@ static inline int dsa_devlink_port_to_port(struct devlink_port *port)
 	return port->index;
 }
 
-struct dsa_switch_driver {
-	struct list_head	list;
-	const struct dsa_switch_ops *ops;
-};
-
 bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
 				 const unsigned char *addr, u16 vid,
 				 struct dsa_db db);
-- 
cgit v1.2.3


From 947643509279a605a09959a06d332bf027e8be57 Mon Sep 17 00:00:00 2001
From: Mikhail Kshevetskiy <mikhail.kshevetskiy@iopsys.eu>
Date: Mon, 10 Nov 2025 06:56:43 +0300
Subject: dt-bindings: clock: airoha: Add reset support to EN7523 clock binding

Introduce reset capability to EN7523 device-tree clock binding
documentation.

Signed-off-by: Mikhail Kshevetskiy <mikhail.kshevetskiy@iopsys.eu>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/clock/airoha,en7523-scu.yaml          |  3 +-
 include/dt-bindings/reset/airoha,en7523-reset.h    | 61 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 include/dt-bindings/reset/airoha,en7523-reset.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml b/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml
index fe2c5c1baf43..a8471367175b 100644
--- a/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml
+++ b/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml
@@ -64,8 +64,6 @@ allOf:
         reg:
           minItems: 2
 
-        '#reset-cells': false
-
   - if:
       properties:
         compatible:
@@ -85,6 +83,7 @@ examples:
       reg = <0x1fa20000 0x400>,
             <0x1fb00000 0x1000>;
       #clock-cells = <1>;
+      #reset-cells = <1>;
     };
 
   - |
diff --git a/include/dt-bindings/reset/airoha,en7523-reset.h b/include/dt-bindings/reset/airoha,en7523-reset.h
new file mode 100644
index 000000000000..211e8a23a21c
--- /dev/null
+++ b/include/dt-bindings/reset/airoha,en7523-reset.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024 iopsys Software Solutions AB.
+ * Copyright (C) 2025 Genexis AB.
+ *
+ * Author: Mikhail Kshevetskiy <mikhail.kshevetskiy@iopsys.eu>
+ *
+ * based on
+ *   include/dt-bindings/reset/airoha,en7581-reset.h
+ * by Lorenzo Bianconi <lorenzo@kernel.org>
+ */
+
+#ifndef __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_
+#define __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_
+
+/* RST_CTRL2 */
+#define EN7523_XPON_PHY_RST		 0
+#define EN7523_XSI_MAC_RST		 1
+#define EN7523_XSI_PHY_RST		 2
+#define EN7523_NPU_RST			 3
+#define EN7523_I2S_RST			 4
+#define EN7523_TRNG_RST			 5
+#define EN7523_TRNG_MSTART_RST		 6
+#define EN7523_DUAL_HSI0_RST		 7
+#define EN7523_DUAL_HSI1_RST		 8
+#define EN7523_HSI_RST			 9
+#define EN7523_DUAL_HSI0_MAC_RST	10
+#define EN7523_DUAL_HSI1_MAC_RST	11
+#define EN7523_HSI_MAC_RST		12
+#define EN7523_WDMA_RST			13
+#define EN7523_WOE0_RST			14
+#define EN7523_WOE1_RST			15
+#define EN7523_HSDMA_RST		16
+#define EN7523_I2C2RBUS_RST		17
+#define EN7523_TDMA_RST			18
+/* RST_CTRL1 */
+#define EN7523_PCM1_ZSI_ISI_RST		19
+#define EN7523_FE_PDMA_RST		20
+#define EN7523_FE_QDMA_RST		21
+#define EN7523_PCM_SPIWP_RST		22
+#define EN7523_CRYPTO_RST		23
+#define EN7523_TIMER_RST		24
+#define EN7523_PCM1_RST			25
+#define EN7523_UART_RST			26
+#define EN7523_GPIO_RST			27
+#define EN7523_GDMA_RST			28
+#define EN7523_I2C_MASTER_RST		29
+#define EN7523_PCM2_ZSI_ISI_RST		30
+#define EN7523_SFC_RST			31
+#define EN7523_UART2_RST		32
+#define EN7523_GDMP_RST			33
+#define EN7523_FE_RST			34
+#define EN7523_USB_HOST_P0_RST		35
+#define EN7523_GSW_RST			36
+#define EN7523_SFC2_PCM_RST		37
+#define EN7523_PCIE0_RST		38
+#define EN7523_PCIE1_RST		39
+#define EN7523_PCIE_HB_RST		40
+#define EN7523_XPON_MAC_RST		41
+
+#endif /* __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_ */
-- 
cgit v1.2.3


From a97fbc3ee3e2a536fafaff04f21f45472db71769 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 29 Oct 2025 17:33:30 +0100
Subject: syscore: Pass context data to callbacks

Several drivers can benefit from registering per-instance data along
with the syscore operations. To achieve this, move the modifiable fields
out of the syscore_ops structure and into a separate struct syscore that
can be registered with the framework. Add a void * driver data field for
drivers to store contextual data that will be passed to the syscore ops.

Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 arch/arm/mach-exynos/mcpm-exynos.c        | 12 +++--
 arch/arm/mach-exynos/suspend.c            | 48 +++++++++++-------
 arch/arm/mach-pxa/generic.h               |  6 +--
 arch/arm/mach-pxa/irq.c                   | 10 ++--
 arch/arm/mach-pxa/mfp-pxa2xx.c            | 10 ++--
 arch/arm/mach-pxa/mfp-pxa3xx.c            | 10 ++--
 arch/arm/mach-pxa/pxa25x.c                |  4 +-
 arch/arm/mach-pxa/pxa27x.c                |  4 +-
 arch/arm/mach-pxa/pxa3xx.c                |  4 +-
 arch/arm/mach-pxa/smemc.c                 | 12 +++--
 arch/arm/mach-s3c/irq-pm-s3c64xx.c        | 12 +++--
 arch/arm/mach-s5pv210/pm.c                | 10 ++--
 arch/arm/mach-versatile/integrator_ap.c   | 12 +++--
 arch/arm/mm/cache-b15-rac.c               | 12 +++--
 arch/loongarch/kernel/smp.c               | 12 +++--
 arch/mips/alchemy/common/dbdma.c          | 12 +++--
 arch/mips/alchemy/common/irq.c            | 24 ++++++---
 arch/mips/alchemy/common/usb.c            | 12 +++--
 arch/mips/pci/pci-alchemy.c               | 16 +++---
 arch/powerpc/platforms/cell/spu_base.c    | 10 ++--
 arch/powerpc/platforms/powermac/pic.c     | 12 +++--
 arch/powerpc/sysdev/fsl_lbc.c             | 12 +++--
 arch/powerpc/sysdev/fsl_pci.c             | 12 +++--
 arch/powerpc/sysdev/ipic.c                | 12 +++--
 arch/powerpc/sysdev/mpic.c                | 14 ++++--
 arch/powerpc/sysdev/mpic_timer.c          | 10 ++--
 arch/sh/mm/pmb.c                          | 10 ++--
 arch/x86/events/amd/ibs.c                 | 12 +++--
 arch/x86/hyperv/hv_init.c                 | 12 +++--
 arch/x86/kernel/amd_gart_64.c             | 10 ++--
 arch/x86/kernel/apic/apic.c               | 12 +++--
 arch/x86/kernel/apic/io_apic.c            | 17 +++++--
 arch/x86/kernel/cpu/aperfmperf.c          | 20 +++++---
 arch/x86/kernel/cpu/intel_epb.c           | 16 +++---
 arch/x86/kernel/cpu/mce/core.c            | 14 ++++--
 arch/x86/kernel/cpu/microcode/core.c      | 15 ++++--
 arch/x86/kernel/cpu/mtrr/legacy.c         | 12 +++--
 arch/x86/kernel/cpu/umwait.c              | 10 ++--
 arch/x86/kernel/i8237.c                   | 10 ++--
 arch/x86/kernel/i8259.c                   | 14 ++++--
 arch/x86/kernel/kvm.c                     | 12 +++--
 drivers/acpi/pci_link.c                   | 10 ++--
 drivers/acpi/sleep.c                      | 12 +++--
 drivers/base/firmware_loader/main.c       | 12 +++--
 drivers/base/syscore.c                    | 82 ++++++++++++++++---------------
 drivers/bus/mvebu-mbus.c                  | 16 +++---
 drivers/clk/at91/pmc.c                    | 12 +++--
 drivers/clk/imx/clk-vf610.c               | 12 +++--
 drivers/clk/ingenic/jz4725b-cgu.c         |  2 +-
 drivers/clk/ingenic/jz4740-cgu.c          |  2 +-
 drivers/clk/ingenic/jz4755-cgu.c          |  2 +-
 drivers/clk/ingenic/jz4760-cgu.c          |  2 +-
 drivers/clk/ingenic/jz4770-cgu.c          |  2 +-
 drivers/clk/ingenic/jz4780-cgu.c          |  2 +-
 drivers/clk/ingenic/pm.c                  | 14 ++++--
 drivers/clk/ingenic/pm.h                  |  2 +-
 drivers/clk/ingenic/tcu.c                 | 12 +++--
 drivers/clk/ingenic/x1000-cgu.c           |  2 +-
 drivers/clk/ingenic/x1830-cgu.c           |  2 +-
 drivers/clk/mvebu/common.c                | 12 +++--
 drivers/clk/rockchip/clk-rk3288.c         | 12 +++--
 drivers/clk/samsung/clk-s5pv210-audss.c   | 12 +++--
 drivers/clk/samsung/clk.c                 | 12 +++--
 drivers/clk/tegra/clk-tegra210.c          | 12 +++--
 drivers/clocksource/timer-armada-370-xp.c | 12 +++--
 drivers/cpuidle/cpuidle-psci.c            | 12 +++--
 drivers/gpio/gpio-mxc.c                   | 12 +++--
 drivers/gpio/gpio-pxa.c                   | 12 +++--
 drivers/gpio/gpio-sa1100.c                | 12 +++--
 drivers/hv/vmbus_drv.c                    | 14 ++++--
 drivers/iommu/amd/init.c                  | 16 +++---
 drivers/iommu/intel/iommu.c               | 12 +++--
 drivers/irqchip/exynos-combiner.c         | 14 ++++--
 drivers/irqchip/irq-armada-370-xp.c       | 12 +++--
 drivers/irqchip/irq-bcm7038-l1.c          | 12 +++--
 drivers/irqchip/irq-gic-v3-its.c          | 12 +++--
 drivers/irqchip/irq-i8259.c               | 12 +++--
 drivers/irqchip/irq-imx-gpcv2.c           | 16 +++---
 drivers/irqchip/irq-loongson-eiointc.c    | 12 +++--
 drivers/irqchip/irq-loongson-htpic.c      | 10 ++--
 drivers/irqchip/irq-loongson-htvec.c      | 12 +++--
 drivers/irqchip/irq-loongson-pch-lpc.c    | 12 +++--
 drivers/irqchip/irq-loongson-pch-pic.c    | 12 +++--
 drivers/irqchip/irq-mchp-eic.c            | 12 +++--
 drivers/irqchip/irq-mst-intc.c            | 12 +++--
 drivers/irqchip/irq-mtk-cirq.c            | 12 +++--
 drivers/irqchip/irq-renesas-rzg2l.c       | 12 +++--
 drivers/irqchip/irq-sa11x0.c              | 12 +++--
 drivers/irqchip/irq-sifive-plic.c         | 12 +++--
 drivers/irqchip/irq-sun6i-r.c             | 18 ++++---
 drivers/irqchip/irq-tegra.c               | 12 +++--
 drivers/irqchip/irq-vic.c                 | 12 +++--
 drivers/leds/trigger/ledtrig-cpu.c        | 14 ++++--
 drivers/macintosh/via-pmu.c               | 12 +++--
 drivers/power/reset/sc27xx-poweroff.c     | 10 ++--
 drivers/sh/clk/core.c                     | 10 ++--
 drivers/sh/intc/core.c                    | 12 +++--
 drivers/soc/bcm/brcmstb/biuctrl.c         | 12 +++--
 drivers/soc/tegra/pmc.c                   | 17 ++++---
 drivers/thermal/intel/intel_hfi.c         | 12 +++--
 drivers/xen/xen-acpi-processor.c          | 12 +++--
 include/linux/syscore_ops.h               | 15 ++++--
 kernel/cpu_pm.c                           | 12 +++--
 kernel/irq/generic-chip.c                 | 14 ++++--
 kernel/irq/pm.c                           | 11 +++--
 kernel/printk/printk.c                    | 11 +++--
 kernel/time/sched_clock.c                 | 22 +++++++--
 kernel/time/timekeeping.c                 | 22 +++++++--
 virt/kvm/kvm_main.c                       | 18 ++++---
 109 files changed, 898 insertions(+), 470 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c
index fd0dbeb93357..cb7d8a7b14e0 100644
--- a/arch/arm/mach-exynos/mcpm-exynos.c
+++ b/arch/arm/mach-exynos/mcpm-exynos.c
@@ -215,7 +215,7 @@ static const struct of_device_id exynos_dt_mcpm_match[] = {
 	{},
 };
 
-static void exynos_mcpm_setup_entry_point(void)
+static void exynos_mcpm_setup_entry_point(void *data)
 {
 	/*
 	 * U-Boot SPL is hardcoded to jump to the start of ns_sram_base_addr
@@ -228,10 +228,14 @@ static void exynos_mcpm_setup_entry_point(void)
 	__raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8);
 }
 
-static struct syscore_ops exynos_mcpm_syscore_ops = {
+static const struct syscore_ops exynos_mcpm_syscore_ops = {
 	.resume	= exynos_mcpm_setup_entry_point,
 };
 
+static struct syscore exynos_mcpm_syscore = {
+	.ops = &exynos_mcpm_syscore_ops,
+};
+
 static int __init exynos_mcpm_init(void)
 {
 	struct device_node *node;
@@ -300,9 +304,9 @@ static int __init exynos_mcpm_init(void)
 		pmu_raw_writel(value, EXYNOS_COMMON_OPTION(i));
 	}
 
-	exynos_mcpm_setup_entry_point();
+	exynos_mcpm_setup_entry_point(NULL);
 
-	register_syscore_ops(&exynos_mcpm_syscore_ops);
+	register_syscore(&exynos_mcpm_syscore);
 
 	return ret;
 }
diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
index 150a1e56dcae..22d723553f62 100644
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -53,9 +53,9 @@ struct exynos_pm_data {
 
 	void (*pm_prepare)(void);
 	void (*pm_resume_prepare)(void);
-	void (*pm_resume)(void);
-	int (*pm_suspend)(void);
 	int (*cpu_suspend)(unsigned long);
+
+	const struct syscore_ops *syscore_ops;
 };
 
 /* Used only on Exynos542x/5800 */
@@ -376,7 +376,7 @@ static void exynos5420_pm_prepare(void)
 }
 
 
-static int exynos_pm_suspend(void)
+static int exynos_pm_suspend(void *data)
 {
 	exynos_pm_central_suspend();
 
@@ -390,7 +390,7 @@ static int exynos_pm_suspend(void)
 	return 0;
 }
 
-static int exynos5420_pm_suspend(void)
+static int exynos5420_pm_suspend(void *data)
 {
 	u32 this_cluster;
 
@@ -408,7 +408,7 @@ static int exynos5420_pm_suspend(void)
 	return 0;
 }
 
-static void exynos_pm_resume(void)
+static void exynos_pm_resume(void *data)
 {
 	u32 cpuid = read_cpuid_part();
 
@@ -429,7 +429,7 @@ early_wakeup:
 	exynos_set_delayed_reset_assertion(true);
 }
 
-static void exynos3250_pm_resume(void)
+static void exynos3250_pm_resume(void *data)
 {
 	u32 cpuid = read_cpuid_part();
 
@@ -473,7 +473,7 @@ static void exynos5420_prepare_pm_resume(void)
 	}
 }
 
-static void exynos5420_pm_resume(void)
+static void exynos5420_pm_resume(void *data)
 {
 	unsigned long tmp;
 
@@ -596,41 +596,52 @@ static const struct platform_suspend_ops exynos_suspend_ops = {
 	.valid		= suspend_valid_only_mem,
 };
 
+static const struct syscore_ops exynos3250_syscore_ops = {
+	.suspend = exynos_pm_suspend,
+	.resume = exynos3250_pm_resume,
+};
+
 static const struct exynos_pm_data exynos3250_pm_data = {
 	.wkup_irq	= exynos3250_wkup_irq,
 	.wake_disable_mask = ((0xFF << 8) | (0x1F << 1)),
-	.pm_suspend	= exynos_pm_suspend,
-	.pm_resume	= exynos3250_pm_resume,
 	.pm_prepare	= exynos3250_pm_prepare,
 	.cpu_suspend	= exynos3250_cpu_suspend,
+	.syscore_ops	= &exynos3250_syscore_ops,
+};
+
+static const struct syscore_ops exynos_syscore_ops = {
+	.suspend = exynos_pm_suspend,
+	.resume = exynos_pm_resume,
 };
 
 static const struct exynos_pm_data exynos4_pm_data = {
 	.wkup_irq	= exynos4_wkup_irq,
 	.wake_disable_mask = ((0xFF << 8) | (0x1F << 1)),
-	.pm_suspend	= exynos_pm_suspend,
-	.pm_resume	= exynos_pm_resume,
 	.pm_prepare	= exynos_pm_prepare,
 	.cpu_suspend	= exynos_cpu_suspend,
+	.syscore_ops	= &exynos_syscore_ops,
 };
 
 static const struct exynos_pm_data exynos5250_pm_data = {
 	.wkup_irq	= exynos5250_wkup_irq,
 	.wake_disable_mask = ((0xFF << 8) | (0x1F << 1)),
-	.pm_suspend	= exynos_pm_suspend,
-	.pm_resume	= exynos_pm_resume,
 	.pm_prepare	= exynos_pm_prepare,
 	.cpu_suspend	= exynos_cpu_suspend,
+	.syscore_ops	= &exynos_syscore_ops,
+};
+
+static const struct syscore_ops exynos5420_syscore_ops = {
+	.resume = exynos5420_pm_resume,
+	.suspend = exynos5420_pm_suspend,
 };
 
 static const struct exynos_pm_data exynos5420_pm_data = {
 	.wkup_irq	= exynos5250_wkup_irq,
 	.wake_disable_mask = (0x7F << 7) | (0x1F << 1),
 	.pm_resume_prepare = exynos5420_prepare_pm_resume,
-	.pm_resume	= exynos5420_pm_resume,
-	.pm_suspend	= exynos5420_pm_suspend,
 	.pm_prepare	= exynos5420_pm_prepare,
 	.cpu_suspend	= exynos5420_cpu_suspend,
+	.syscore_ops	= &exynos5420_syscore_ops,
 };
 
 static const struct of_device_id exynos_pmu_of_device_ids[] __initconst = {
@@ -656,7 +667,7 @@ static const struct of_device_id exynos_pmu_of_device_ids[] __initconst = {
 	{ /*sentinel*/ },
 };
 
-static struct syscore_ops exynos_pm_syscore_ops;
+static struct syscore exynos_pm_syscore;
 
 void __init exynos_pm_init(void)
 {
@@ -684,10 +695,9 @@ void __init exynos_pm_init(void)
 	tmp |= pm_data->wake_disable_mask;
 	pmu_raw_writel(tmp, S5P_WAKEUP_MASK);
 
-	exynos_pm_syscore_ops.suspend	= pm_data->pm_suspend;
-	exynos_pm_syscore_ops.resume	= pm_data->pm_resume;
+	exynos_pm_syscore.ops = pm_data->syscore_ops;
 
-	register_syscore_ops(&exynos_pm_syscore_ops);
+	register_syscore(&exynos_pm_syscore);
 	suspend_set_ops(&exynos_suspend_ops);
 
 	/*
diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h
index c9c2c46ecead..caad4fca8de3 100644
--- a/arch/arm/mach-pxa/generic.h
+++ b/arch/arm/mach-pxa/generic.h
@@ -34,9 +34,9 @@ extern void __init pxa27x_map_io(void);
 extern void __init pxa3xx_init_irq(void);
 extern void __init pxa3xx_map_io(void);
 
-extern struct syscore_ops pxa_irq_syscore_ops;
-extern struct syscore_ops pxa2xx_mfp_syscore_ops;
-extern struct syscore_ops pxa3xx_mfp_syscore_ops;
+extern struct syscore pxa_irq_syscore;
+extern struct syscore pxa2xx_mfp_syscore;
+extern struct syscore pxa3xx_mfp_syscore;
 
 void __init pxa_set_ffuart_info(void *info);
 void __init pxa_set_btuart_info(void *info);
diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c
index 5bfce8aa4102..99acebbbf065 100644
--- a/arch/arm/mach-pxa/irq.c
+++ b/arch/arm/mach-pxa/irq.c
@@ -178,7 +178,7 @@ void __init pxa_init_irq(int irq_nr, int (*fn)(struct irq_data *, unsigned int))
 static unsigned long saved_icmr[MAX_INTERNAL_IRQS/32];
 static unsigned long saved_ipr[MAX_INTERNAL_IRQS];
 
-static int pxa_irq_suspend(void)
+static int pxa_irq_suspend(void *data)
 {
 	int i;
 
@@ -197,7 +197,7 @@ static int pxa_irq_suspend(void)
 	return 0;
 }
 
-static void pxa_irq_resume(void)
+static void pxa_irq_resume(void *data)
 {
 	int i;
 
@@ -219,11 +219,15 @@ static void pxa_irq_resume(void)
 #define pxa_irq_resume		NULL
 #endif
 
-struct syscore_ops pxa_irq_syscore_ops = {
+static const struct syscore_ops pxa_irq_syscore_ops = {
 	.suspend	= pxa_irq_suspend,
 	.resume		= pxa_irq_resume,
 };
 
+struct syscore pxa_irq_syscore = {
+	.ops = &pxa_irq_syscore_ops,
+};
+
 #ifdef CONFIG_OF
 static const struct of_device_id intc_ids[] __initconst = {
 	{ .compatible = "marvell,pxa-intc", },
diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c
index f5a3d890f682..d1347055fbe4 100644
--- a/arch/arm/mach-pxa/mfp-pxa2xx.c
+++ b/arch/arm/mach-pxa/mfp-pxa2xx.c
@@ -346,7 +346,7 @@ static unsigned long saved_gpdr[4];
 static unsigned long saved_gplr[4];
 static unsigned long saved_pgsr[4];
 
-static int pxa2xx_mfp_suspend(void)
+static int pxa2xx_mfp_suspend(void *data)
 {
 	int i;
 
@@ -385,7 +385,7 @@ static int pxa2xx_mfp_suspend(void)
 	return 0;
 }
 
-static void pxa2xx_mfp_resume(void)
+static void pxa2xx_mfp_resume(void *data)
 {
 	int i;
 
@@ -404,11 +404,15 @@ static void pxa2xx_mfp_resume(void)
 #define pxa2xx_mfp_resume	NULL
 #endif
 
-struct syscore_ops pxa2xx_mfp_syscore_ops = {
+static const struct syscore_ops pxa2xx_mfp_syscore_ops = {
 	.suspend	= pxa2xx_mfp_suspend,
 	.resume		= pxa2xx_mfp_resume,
 };
 
+struct syscore pxa2xx_mfp_syscore = {
+	.ops = &pxa2xx_mfp_syscore_ops,
+};
+
 static int __init pxa2xx_mfp_init(void)
 {
 	int i;
diff --git a/arch/arm/mach-pxa/mfp-pxa3xx.c b/arch/arm/mach-pxa/mfp-pxa3xx.c
index d16ab7451efe..fe7498fbb62b 100644
--- a/arch/arm/mach-pxa/mfp-pxa3xx.c
+++ b/arch/arm/mach-pxa/mfp-pxa3xx.c
@@ -27,13 +27,13 @@
  * a pull-down mode if they're an active low chip select, and we're
  * just entering standby.
  */
-static int pxa3xx_mfp_suspend(void)
+static int pxa3xx_mfp_suspend(void *data)
 {
 	mfp_config_lpm();
 	return 0;
 }
 
-static void pxa3xx_mfp_resume(void)
+static void pxa3xx_mfp_resume(void *data)
 {
 	mfp_config_run();
 
@@ -49,7 +49,11 @@ static void pxa3xx_mfp_resume(void)
 #define pxa3xx_mfp_resume	NULL
 #endif
 
-struct syscore_ops pxa3xx_mfp_syscore_ops = {
+static const struct syscore_ops pxa3xx_mfp_syscore_ops = {
 	.suspend	= pxa3xx_mfp_suspend,
 	.resume		= pxa3xx_mfp_resume,
 };
+
+struct syscore pxa3xx_mfp_syscore = {
+	.ops = &pxa3xx_mfp_syscore_ops,
+};
diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c
index 03e34841fc00..70509a599814 100644
--- a/arch/arm/mach-pxa/pxa25x.c
+++ b/arch/arm/mach-pxa/pxa25x.c
@@ -235,8 +235,8 @@ static int __init pxa25x_init(void)
 
 		pxa25x_init_pm();
 
-		register_syscore_ops(&pxa_irq_syscore_ops);
-		register_syscore_ops(&pxa2xx_mfp_syscore_ops);
+		register_syscore(&pxa_irq_syscore);
+		register_syscore(&pxa2xx_mfp_syscore);
 
 		if (!of_have_populated_dt()) {
 			software_node_register(&pxa2xx_gpiochip_node);
diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c
index f8382477d629..ff6361979038 100644
--- a/arch/arm/mach-pxa/pxa27x.c
+++ b/arch/arm/mach-pxa/pxa27x.c
@@ -337,8 +337,8 @@ static int __init pxa27x_init(void)
 
 		pxa27x_init_pm();
 
-		register_syscore_ops(&pxa_irq_syscore_ops);
-		register_syscore_ops(&pxa2xx_mfp_syscore_ops);
+		register_syscore(&pxa_irq_syscore);
+		register_syscore(&pxa2xx_mfp_syscore);
 
 		if (!of_have_populated_dt()) {
 			software_node_register(&pxa2xx_gpiochip_node);
diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c
index 1d1e5713464d..06c578ea658e 100644
--- a/arch/arm/mach-pxa/pxa3xx.c
+++ b/arch/arm/mach-pxa/pxa3xx.c
@@ -424,8 +424,8 @@ static int __init pxa3xx_init(void)
 		if (cpu_is_pxa320())
 			enable_irq_wake(IRQ_WAKEUP1);
 
-		register_syscore_ops(&pxa_irq_syscore_ops);
-		register_syscore_ops(&pxa3xx_mfp_syscore_ops);
+		register_syscore(&pxa_irq_syscore);
+		register_syscore(&pxa3xx_mfp_syscore);
 	}
 
 	return ret;
diff --git a/arch/arm/mach-pxa/smemc.c b/arch/arm/mach-pxa/smemc.c
index 2d2a321d82f8..fb93a8f28356 100644
--- a/arch/arm/mach-pxa/smemc.c
+++ b/arch/arm/mach-pxa/smemc.c
@@ -18,7 +18,7 @@ static unsigned long msc[2];
 static unsigned long sxcnfg, memclkcfg;
 static unsigned long csadrcfg[4];
 
-static int pxa3xx_smemc_suspend(void)
+static int pxa3xx_smemc_suspend(void *data)
 {
 	msc[0] = __raw_readl(MSC0);
 	msc[1] = __raw_readl(MSC1);
@@ -32,7 +32,7 @@ static int pxa3xx_smemc_suspend(void)
 	return 0;
 }
 
-static void pxa3xx_smemc_resume(void)
+static void pxa3xx_smemc_resume(void *data)
 {
 	__raw_writel(msc[0], MSC0);
 	__raw_writel(msc[1], MSC1);
@@ -46,11 +46,15 @@ static void pxa3xx_smemc_resume(void)
 	__raw_writel(0x2, CSMSADRCFG);
 }
 
-static struct syscore_ops smemc_syscore_ops = {
+static const struct syscore_ops smemc_syscore_ops = {
 	.suspend	= pxa3xx_smemc_suspend,
 	.resume		= pxa3xx_smemc_resume,
 };
 
+static struct syscore smemc_syscore = {
+	.ops = &smemc_syscore_ops,
+};
+
 static int __init smemc_init(void)
 {
 	if (cpu_is_pxa3xx()) {
@@ -64,7 +68,7 @@ static int __init smemc_init(void)
 		 */
 		__raw_writel(0x2, CSMSADRCFG);
 
-		register_syscore_ops(&smemc_syscore_ops);
+		register_syscore(&smemc_syscore);
 	}
 
 	return 0;
diff --git a/arch/arm/mach-s3c/irq-pm-s3c64xx.c b/arch/arm/mach-s3c/irq-pm-s3c64xx.c
index 4a1e935bada1..ab726c595001 100644
--- a/arch/arm/mach-s3c/irq-pm-s3c64xx.c
+++ b/arch/arm/mach-s3c/irq-pm-s3c64xx.c
@@ -58,7 +58,7 @@ static struct irq_grp_save {
 
 static u32 irq_uart_mask[SERIAL_SAMSUNG_UARTS];
 
-static int s3c64xx_irq_pm_suspend(void)
+static int s3c64xx_irq_pm_suspend(void *data)
 {
 	struct irq_grp_save *grp = eint_grp_save;
 	int i;
@@ -79,7 +79,7 @@ static int s3c64xx_irq_pm_suspend(void)
 	return 0;
 }
 
-static void s3c64xx_irq_pm_resume(void)
+static void s3c64xx_irq_pm_resume(void *data)
 {
 	struct irq_grp_save *grp = eint_grp_save;
 	int i;
@@ -100,18 +100,22 @@ static void s3c64xx_irq_pm_resume(void)
 	S3C_PMDBG("%s: IRQ configuration restored\n", __func__);
 }
 
-static struct syscore_ops s3c64xx_irq_syscore_ops = {
+static const struct syscore_ops s3c64xx_irq_syscore_ops = {
 	.suspend = s3c64xx_irq_pm_suspend,
 	.resume	 = s3c64xx_irq_pm_resume,
 };
 
+static struct syscore s3c64xx_irq_syscore = {
+	.ops = &s3c64xx_irq_syscore_ops,
+};
+
 static __init int s3c64xx_syscore_init(void)
 {
 	/* Appropriate drivers (pinctrl, uart) handle this when using DT. */
 	if (of_have_populated_dt() || !soc_is_s3c64xx())
 		return 0;
 
-	register_syscore_ops(&s3c64xx_irq_syscore_ops);
+	register_syscore(&s3c64xx_irq_syscore);
 
 	return 0;
 }
diff --git a/arch/arm/mach-s5pv210/pm.c b/arch/arm/mach-s5pv210/pm.c
index 6fa70f787df4..fa270750364c 100644
--- a/arch/arm/mach-s5pv210/pm.c
+++ b/arch/arm/mach-s5pv210/pm.c
@@ -195,20 +195,24 @@ static const struct platform_suspend_ops s5pv210_suspend_ops = {
 /*
  * Syscore operations used to delay restore of certain registers.
  */
-static void s5pv210_pm_resume(void)
+static void s5pv210_pm_resume(void *data)
 {
 	s3c_pm_do_restore_core(s5pv210_core_save, ARRAY_SIZE(s5pv210_core_save));
 }
 
-static struct syscore_ops s5pv210_pm_syscore_ops = {
+static const struct syscore_ops s5pv210_pm_syscore_ops = {
 	.resume		= s5pv210_pm_resume,
 };
 
+static struct syscore s5pv210_pm_syscore = {
+	.ops = &s5pv210_pm_syscore_ops,
+};
+
 /*
  * Initialization entry point.
  */
 void __init s5pv210_pm_init(void)
 {
-	register_syscore_ops(&s5pv210_pm_syscore_ops);
+	register_syscore(&s5pv210_pm_syscore);
 	suspend_set_ops(&s5pv210_suspend_ops);
 }
diff --git a/arch/arm/mach-versatile/integrator_ap.c b/arch/arm/mach-versatile/integrator_ap.c
index 4bd6712e9f52..ee90d6619d0d 100644
--- a/arch/arm/mach-versatile/integrator_ap.c
+++ b/arch/arm/mach-versatile/integrator_ap.c
@@ -63,13 +63,13 @@ static void __init ap_map_io(void)
 #ifdef CONFIG_PM
 static unsigned long ic_irq_enable;
 
-static int irq_suspend(void)
+static int irq_suspend(void *data)
 {
 	ic_irq_enable = readl(VA_IC_BASE + IRQ_ENABLE);
 	return 0;
 }
 
-static void irq_resume(void)
+static void irq_resume(void *data)
 {
 	/* disable all irq sources */
 	cm_clear_irqs();
@@ -83,14 +83,18 @@ static void irq_resume(void)
 #define irq_resume NULL
 #endif
 
-static struct syscore_ops irq_syscore_ops = {
+static const struct syscore_ops irq_syscore_ops = {
 	.suspend	= irq_suspend,
 	.resume		= irq_resume,
 };
 
+static struct syscore irq_syscore = {
+	.ops = &irq_syscore_ops,
+};
+
 static int __init irq_syscore_init(void)
 {
-	register_syscore_ops(&irq_syscore_ops);
+	register_syscore(&irq_syscore);
 
 	return 0;
 }
diff --git a/arch/arm/mm/cache-b15-rac.c b/arch/arm/mm/cache-b15-rac.c
index 6f63b90f9e1a..e7807356dfab 100644
--- a/arch/arm/mm/cache-b15-rac.c
+++ b/arch/arm/mm/cache-b15-rac.c
@@ -256,7 +256,7 @@ static int b15_rac_dead_cpu(unsigned int cpu)
 	return 0;
 }
 
-static int b15_rac_suspend(void)
+static int b15_rac_suspend(void *data)
 {
 	/* Suspend the read-ahead cache oeprations, forcing our cache
 	 * implementation to fallback to the regular ARMv7 calls.
@@ -271,7 +271,7 @@ static int b15_rac_suspend(void)
 	return 0;
 }
 
-static void b15_rac_resume(void)
+static void b15_rac_resume(void *data)
 {
 	/* Coming out of a S3 suspend/resume cycle, the read-ahead cache
 	 * register RAC_CONFIG0_REG will be restored to its default value, make
@@ -282,11 +282,15 @@ static void b15_rac_resume(void)
 	clear_bit(RAC_SUSPENDED, &b15_rac_flags);
 }
 
-static struct syscore_ops b15_rac_syscore_ops = {
+static const struct syscore_ops b15_rac_syscore_ops = {
 	.suspend	= b15_rac_suspend,
 	.resume		= b15_rac_resume,
 };
 
+static struct syscore b15_rac_syscore = {
+	.ops = &b15_rac_syscore_ops,
+};
+
 static int __init b15_rac_init(void)
 {
 	struct device_node *dn, *cpu_dn;
@@ -347,7 +351,7 @@ static int __init b15_rac_init(void)
 	}
 
 	if (IS_ENABLED(CONFIG_PM_SLEEP))
-		register_syscore_ops(&b15_rac_syscore_ops);
+		register_syscore(&b15_rac_syscore);
 
 	spin_lock(&rac_lock);
 	reg = __raw_readl(b15_rac_base + RAC_CONFIG0_REG);
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 46036d98da75..8b2fcb3fb874 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -535,28 +535,32 @@ int hibernate_resume_nonboot_cpu_disable(void)
  */
 #ifdef CONFIG_PM
 
-static int loongson_ipi_suspend(void)
+static int loongson_ipi_suspend(void *data)
 {
 	return 0;
 }
 
-static void loongson_ipi_resume(void)
+static void loongson_ipi_resume(void *data)
 {
 	iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_EN);
 }
 
-static struct syscore_ops loongson_ipi_syscore_ops = {
+static const struct syscore_ops loongson_ipi_syscore_ops = {
 	.resume         = loongson_ipi_resume,
 	.suspend        = loongson_ipi_suspend,
 };
 
+static struct syscore loongson_ipi_syscore = {
+	.ops = &loongson_ipi_syscore_ops,
+};
+
 /*
  * Enable boot cpu ipi before enabling nonboot cpus
  * during syscore_resume.
  */
 static int __init ipi_pm_init(void)
 {
-	register_syscore_ops(&loongson_ipi_syscore_ops);
+	register_syscore(&loongson_ipi_syscore);
 	return 0;
 }
 
diff --git a/arch/mips/alchemy/common/dbdma.c b/arch/mips/alchemy/common/dbdma.c
index 6a3c890f7bbf..6c2c2010bbae 100644
--- a/arch/mips/alchemy/common/dbdma.c
+++ b/arch/mips/alchemy/common/dbdma.c
@@ -982,7 +982,7 @@ u32 au1xxx_dbdma_put_dscr(u32 chanid, au1x_ddma_desc_t *dscr)
 
 static unsigned long alchemy_dbdma_pm_data[NUM_DBDMA_CHANS + 1][6];
 
-static int alchemy_dbdma_suspend(void)
+static int alchemy_dbdma_suspend(void *data)
 {
 	int i;
 	void __iomem *addr;
@@ -1019,7 +1019,7 @@ static int alchemy_dbdma_suspend(void)
 	return 0;
 }
 
-static void alchemy_dbdma_resume(void)
+static void alchemy_dbdma_resume(void *data)
 {
 	int i;
 	void __iomem *addr;
@@ -1044,11 +1044,15 @@ static void alchemy_dbdma_resume(void)
 	}
 }
 
-static struct syscore_ops alchemy_dbdma_syscore_ops = {
+static const struct syscore_ops alchemy_dbdma_syscore_ops = {
 	.suspend	= alchemy_dbdma_suspend,
 	.resume		= alchemy_dbdma_resume,
 };
 
+static struct syscore alchemy_dbdma_syscore = {
+	.ops = &alchemy_dbdma_syscore_ops,
+};
+
 static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable)
 {
 	int ret;
@@ -1071,7 +1075,7 @@ static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable)
 		printk(KERN_ERR "Cannot grab DBDMA interrupt!\n");
 	else {
 		dbdma_initialized = 1;
-		register_syscore_ops(&alchemy_dbdma_syscore_ops);
+		register_syscore(&alchemy_dbdma_syscore);
 	}
 
 	return ret;
diff --git a/arch/mips/alchemy/common/irq.c b/arch/mips/alchemy/common/irq.c
index da9f9220048f..2403afcd2fb9 100644
--- a/arch/mips/alchemy/common/irq.c
+++ b/arch/mips/alchemy/common/irq.c
@@ -758,7 +758,7 @@ static inline void alchemy_ic_resume_one(void __iomem *base, unsigned long *d)
 	wmb();
 }
 
-static int alchemy_ic_suspend(void)
+static int alchemy_ic_suspend(void *data)
 {
 	alchemy_ic_suspend_one((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR),
 			       alchemy_gpic_pmdata);
@@ -767,7 +767,7 @@ static int alchemy_ic_suspend(void)
 	return 0;
 }
 
-static void alchemy_ic_resume(void)
+static void alchemy_ic_resume(void *data)
 {
 	alchemy_ic_resume_one((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR),
 			      &alchemy_gpic_pmdata[7]);
@@ -775,7 +775,7 @@ static void alchemy_ic_resume(void)
 			      alchemy_gpic_pmdata);
 }
 
-static int alchemy_gpic_suspend(void)
+static int alchemy_gpic_suspend(void *data)
 {
 	void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR);
 	int i;
@@ -806,7 +806,7 @@ static int alchemy_gpic_suspend(void)
 	return 0;
 }
 
-static void alchemy_gpic_resume(void)
+static void alchemy_gpic_resume(void *data)
 {
 	void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR);
 	int i;
@@ -837,16 +837,24 @@ static void alchemy_gpic_resume(void)
 	wmb();
 }
 
-static struct syscore_ops alchemy_ic_pmops = {
+static const struct syscore_ops alchemy_ic_pmops = {
 	.suspend	= alchemy_ic_suspend,
 	.resume		= alchemy_ic_resume,
 };
 
-static struct syscore_ops alchemy_gpic_pmops = {
+static struct syscore alchemy_ic_pm = {
+	.ops = &alchemy_ic_pmops,
+};
+
+static const struct syscore_ops alchemy_gpic_pmops = {
 	.suspend	= alchemy_gpic_suspend,
 	.resume		= alchemy_gpic_resume,
 };
 
+static struct syscore alchemy_gpic_pm = {
+	.ops = &alchemy_gpic_pmops,
+};
+
 /******************************************************************************/
 
 /* create chained handlers for the 4 IC requests to the MIPS IRQ ctrl */
@@ -880,7 +888,7 @@ static void __init au1000_init_irq(struct alchemy_irqmap *map)
 
 	ic_init((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR));
 	ic_init((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR));
-	register_syscore_ops(&alchemy_ic_pmops);
+	register_syscore(&alchemy_ic_pm);
 	mips_cpu_irq_init();
 
 	/* register all 64 possible IC0+IC1 irq sources as type "none".
@@ -925,7 +933,7 @@ static void __init alchemy_gpic_init_irq(const struct alchemy_irqmap *dints)
 	int i;
 	void __iomem *bank_base;
 
-	register_syscore_ops(&alchemy_gpic_pmops);
+	register_syscore(&alchemy_gpic_pm);
 	mips_cpu_irq_init();
 
 	/* disable & ack all possible interrupt sources */
diff --git a/arch/mips/alchemy/common/usb.c b/arch/mips/alchemy/common/usb.c
index 5d618547ebf0..a55f32bf517c 100644
--- a/arch/mips/alchemy/common/usb.c
+++ b/arch/mips/alchemy/common/usb.c
@@ -580,22 +580,26 @@ static void alchemy_usb_pm(int susp)
 	}
 }
 
-static int alchemy_usb_suspend(void)
+static int alchemy_usb_suspend(void *data)
 {
 	alchemy_usb_pm(1);
 	return 0;
 }
 
-static void alchemy_usb_resume(void)
+static void alchemy_usb_resume(void *data)
 {
 	alchemy_usb_pm(0);
 }
 
-static struct syscore_ops alchemy_usb_pm_ops = {
+static const struct syscore_ops alchemy_usb_pm_syscore_ops = {
 	.suspend	= alchemy_usb_suspend,
 	.resume		= alchemy_usb_resume,
 };
 
+static struct syscore alchemy_usb_pm_syscore = {
+	.ops = &alchemy_usb_pm_syscore_ops,
+};
+
 static int __init alchemy_usb_init(void)
 {
 	int ret = 0;
@@ -620,7 +624,7 @@ static int __init alchemy_usb_init(void)
 	}
 
 	if (!ret)
-		register_syscore_ops(&alchemy_usb_pm_ops);
+		register_syscore(&alchemy_usb_pm_syscore);
 
 	return ret;
 }
diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c
index 58625d1b6465..6bfee0f71803 100644
--- a/arch/mips/pci/pci-alchemy.c
+++ b/arch/mips/pci/pci-alchemy.c
@@ -304,7 +304,7 @@ static int alchemy_pci_def_idsel(unsigned int devsel, int assert)
 }
 
 /* save PCI controller register contents. */
-static int alchemy_pci_suspend(void)
+static int alchemy_pci_suspend(void *data)
 {
 	struct alchemy_pci_context *ctx = __alchemy_pci_ctx;
 	if (!ctx)
@@ -326,7 +326,7 @@ static int alchemy_pci_suspend(void)
 	return 0;
 }
 
-static void alchemy_pci_resume(void)
+static void alchemy_pci_resume(void *data)
 {
 	struct alchemy_pci_context *ctx = __alchemy_pci_ctx;
 	if (!ctx)
@@ -354,9 +354,13 @@ static void alchemy_pci_resume(void)
 	alchemy_pci_wired_entry(ctx);	/* install it */
 }
 
-static struct syscore_ops alchemy_pci_pmops = {
-	.suspend	= alchemy_pci_suspend,
-	.resume		= alchemy_pci_resume,
+static const struct syscore_ops alchemy_pci_syscore_ops = {
+	.suspend = alchemy_pci_suspend,
+	.resume = alchemy_pci_resume,
+};
+
+static struct syscore alchemy_pci_syscore = {
+	.ops = &alchemy_pci_syscore_ops,
 };
 
 static int alchemy_pci_probe(struct platform_device *pdev)
@@ -478,7 +482,7 @@ static int alchemy_pci_probe(struct platform_device *pdev)
 
 	__alchemy_pci_ctx = ctx;
 	platform_set_drvdata(pdev, ctx);
-	register_syscore_ops(&alchemy_pci_pmops);
+	register_syscore(&alchemy_pci_syscore);
 	register_pci_controller(&ctx->alchemy_pci_ctrl);
 
 	dev_info(&pdev->dev, "PCI controller at %ld MHz\n",
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 2c07387201d0..2ddb93df4817 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -726,7 +726,7 @@ static inline void crash_register_spus(struct list_head *list)
 }
 #endif
 
-static void spu_shutdown(void)
+static void spu_shutdown(void *data)
 {
 	struct spu *spu;
 
@@ -738,10 +738,14 @@ static void spu_shutdown(void)
 	mutex_unlock(&spu_full_list_mutex);
 }
 
-static struct syscore_ops spu_syscore_ops = {
+static const struct syscore_ops spu_syscore_ops = {
 	.shutdown = spu_shutdown,
 };
 
+static struct syscore spu_syscore = {
+	.ops = &spu_syscore_ops,
+};
+
 static int __init init_spu_base(void)
 {
 	int i, ret = 0;
@@ -774,7 +778,7 @@ static int __init init_spu_base(void)
 	crash_register_spus(&spu_full_list);
 	mutex_unlock(&spu_full_list_mutex);
 	spu_add_dev_attr(&dev_attr_stat);
-	register_syscore_ops(&spu_syscore_ops);
+	register_syscore(&spu_syscore);
 
 	spu_init_affinity();
 
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index c37783a03d25..1959cc13438f 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -600,7 +600,7 @@ not_found:
 	return viaint;
 }
 
-static int pmacpic_suspend(void)
+static int pmacpic_suspend(void *data)
 {
 	int viaint = pmacpic_find_viaint();
 
@@ -621,7 +621,7 @@ static int pmacpic_suspend(void)
         return 0;
 }
 
-static void pmacpic_resume(void)
+static void pmacpic_resume(void *data)
 {
 	int i;
 
@@ -634,15 +634,19 @@ static void pmacpic_resume(void)
 			pmac_unmask_irq(irq_get_irq_data(i));
 }
 
-static struct syscore_ops pmacpic_syscore_ops = {
+static const struct syscore_ops pmacpic_syscore_ops = {
 	.suspend	= pmacpic_suspend,
 	.resume		= pmacpic_resume,
 };
 
+static struct syscore pmacpic_syscore = {
+	.ops = &pmacpic_syscore_ops,
+};
+
 static int __init init_pmacpic_syscore(void)
 {
 	if (pmac_irq_hw[0])
-		register_syscore_ops(&pmacpic_syscore_ops);
+		register_syscore(&pmacpic_syscore);
 	return 0;
 }
 
diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c
index 217cea150987..7ed07232a69a 100644
--- a/arch/powerpc/sysdev/fsl_lbc.c
+++ b/arch/powerpc/sysdev/fsl_lbc.c
@@ -350,7 +350,7 @@ err:
 #ifdef CONFIG_SUSPEND
 
 /* save lbc registers */
-static int fsl_lbc_syscore_suspend(void)
+static int fsl_lbc_syscore_suspend(void *data)
 {
 	struct fsl_lbc_ctrl *ctrl;
 	struct fsl_lbc_regs __iomem *lbc;
@@ -374,7 +374,7 @@ out:
 }
 
 /* restore lbc registers */
-static void fsl_lbc_syscore_resume(void)
+static void fsl_lbc_syscore_resume(void *data)
 {
 	struct fsl_lbc_ctrl *ctrl;
 	struct fsl_lbc_regs __iomem *lbc;
@@ -408,10 +408,14 @@ static const struct of_device_id fsl_lbc_match[] = {
 };
 
 #ifdef CONFIG_SUSPEND
-static struct syscore_ops lbc_syscore_pm_ops = {
+static const struct syscore_ops lbc_syscore_pm_ops = {
 	.suspend = fsl_lbc_syscore_suspend,
 	.resume = fsl_lbc_syscore_resume,
 };
+
+static struct syscore lbc_syscore_pm = {
+	.ops = &lbc_syscore_pm_ops,
+};
 #endif
 
 static struct platform_driver fsl_lbc_ctrl_driver = {
@@ -425,7 +429,7 @@ static struct platform_driver fsl_lbc_ctrl_driver = {
 static int __init fsl_lbc_init(void)
 {
 #ifdef CONFIG_SUSPEND
-	register_syscore_ops(&lbc_syscore_pm_ops);
+	register_syscore(&lbc_syscore_pm);
 #endif
 	return platform_driver_register(&fsl_lbc_ctrl_driver);
 }
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index ef7707ea0db7..4e501654cb41 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -1258,7 +1258,7 @@ static void fsl_pci_syscore_do_suspend(struct pci_controller *hose)
 	send_pme_turnoff_message(hose);
 }
 
-static int fsl_pci_syscore_suspend(void)
+static int fsl_pci_syscore_suspend(void *data)
 {
 	struct pci_controller *hose, *tmp;
 
@@ -1291,7 +1291,7 @@ static void fsl_pci_syscore_do_resume(struct pci_controller *hose)
 	setup_pci_atmu(hose);
 }
 
-static void fsl_pci_syscore_resume(void)
+static void fsl_pci_syscore_resume(void *data)
 {
 	struct pci_controller *hose, *tmp;
 
@@ -1299,10 +1299,14 @@ static void fsl_pci_syscore_resume(void)
 		fsl_pci_syscore_do_resume(hose);
 }
 
-static struct syscore_ops pci_syscore_pm_ops = {
+static const struct syscore_ops pci_syscore_pm_ops = {
 	.suspend = fsl_pci_syscore_suspend,
 	.resume = fsl_pci_syscore_resume,
 };
+
+static struct syscore pci_syscore_pm = {
+	.ops = &pci_syscore_pm_ops,
+};
 #endif
 
 void fsl_pcibios_fixup_phb(struct pci_controller *phb)
@@ -1359,7 +1363,7 @@ static struct platform_driver fsl_pci_driver = {
 static int __init fsl_pci_init(void)
 {
 #ifdef CONFIG_PM_SLEEP
-	register_syscore_ops(&pci_syscore_pm_ops);
+	register_syscore(&pci_syscore_pm);
 #endif
 	return platform_driver_register(&fsl_pci_driver);
 }
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index 70be2105865d..290ba8427239 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -817,7 +817,7 @@ static struct {
 	u32 sercr;
 } ipic_saved_state;
 
-static int ipic_suspend(void)
+static int ipic_suspend(void *data)
 {
 	struct ipic *ipic = primary_ipic;
 
@@ -848,7 +848,7 @@ static int ipic_suspend(void)
 	return 0;
 }
 
-static void ipic_resume(void)
+static void ipic_resume(void *data)
 {
 	struct ipic *ipic = primary_ipic;
 
@@ -870,18 +870,22 @@ static void ipic_resume(void)
 #define ipic_resume NULL
 #endif
 
-static struct syscore_ops ipic_syscore_ops = {
+static const struct syscore_ops ipic_syscore_ops = {
 	.suspend = ipic_suspend,
 	.resume = ipic_resume,
 };
 
+static struct syscore ipic_syscore = {
+	.ops = &ipic_syscore_ops,
+};
+
 static int __init init_ipic_syscore(void)
 {
 	if (!primary_ipic || !primary_ipic->regs)
 		return -ENODEV;
 
 	printk(KERN_DEBUG "Registering ipic system core operations\n");
-	register_syscore_ops(&ipic_syscore_ops);
+	register_syscore(&ipic_syscore);
 
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index ad7310bba00b..67e51998d1ae 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1944,7 +1944,7 @@ static void mpic_suspend_one(struct mpic *mpic)
 	}
 }
 
-static int mpic_suspend(void)
+static int mpic_suspend(void *data)
 {
 	struct mpic *mpic = mpics;
 
@@ -1986,7 +1986,7 @@ static void mpic_resume_one(struct mpic *mpic)
 	} /* end for loop */
 }
 
-static void mpic_resume(void)
+static void mpic_resume(void *data)
 {
 	struct mpic *mpic = mpics;
 
@@ -1996,19 +1996,23 @@ static void mpic_resume(void)
 	}
 }
 
-static struct syscore_ops mpic_syscore_ops = {
+static const struct syscore_ops mpic_syscore_ops = {
 	.resume = mpic_resume,
 	.suspend = mpic_suspend,
 };
 
+static struct syscore mpic_syscore = {
+	.ops = &mpic_syscore_ops,
+};
+
 static int mpic_init_sys(void)
 {
 	int rc;
 
-	register_syscore_ops(&mpic_syscore_ops);
+	register_syscore(&mpic_syscore);
 	rc = subsys_system_register(&mpic_subsys, NULL);
 	if (rc) {
-		unregister_syscore_ops(&mpic_syscore_ops);
+		unregister_syscore(&mpic_syscore);
 		pr_err("mpic: Failed to register subsystem!\n");
 		return rc;
 	}
diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c
index 7166e2e0baaf..60f5b3934b51 100644
--- a/arch/powerpc/sysdev/mpic_timer.c
+++ b/arch/powerpc/sysdev/mpic_timer.c
@@ -519,7 +519,7 @@ out:
 	kfree(priv);
 }
 
-static void mpic_timer_resume(void)
+static void mpic_timer_resume(void *data)
 {
 	struct timer_group_priv *priv;
 
@@ -535,10 +535,14 @@ static const struct of_device_id mpic_timer_ids[] = {
 	{},
 };
 
-static struct syscore_ops mpic_timer_syscore_ops = {
+static const struct syscore_ops mpic_timer_syscore_ops = {
 	.resume = mpic_timer_resume,
 };
 
+static struct syscore mpic_timer_syscore = {
+	.ops = &mpic_timer_syscore_ops,
+};
+
 static int __init mpic_timer_init(void)
 {
 	struct device_node *np = NULL;
@@ -546,7 +550,7 @@ static int __init mpic_timer_init(void)
 	for_each_matching_node(np, mpic_timer_ids)
 		timer_group_init(np);
 
-	register_syscore_ops(&mpic_timer_syscore_ops);
+	register_syscore(&mpic_timer_syscore);
 
 	if (list_empty(&timer_group_list))
 		return -ENODEV;
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index 68eb7cc6e564..482eec50f404 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -857,7 +857,7 @@ static int __init pmb_debugfs_init(void)
 subsys_initcall(pmb_debugfs_init);
 
 #ifdef CONFIG_PM
-static void pmb_syscore_resume(void)
+static void pmb_syscore_resume(void *data)
 {
 	struct pmb_entry *pmbe;
 	int i;
@@ -874,13 +874,17 @@ static void pmb_syscore_resume(void)
 	read_unlock(&pmb_rwlock);
 }
 
-static struct syscore_ops pmb_syscore_ops = {
+static const struct syscore_ops pmb_syscore_ops = {
 	.resume = pmb_syscore_resume,
 };
 
+static struct syscore pmb_syscore = {
+	.ops = &pmb_syscore_ops,
+};
+
 static int __init pmb_sysdev_init(void)
 {
-	register_syscore_ops(&pmb_syscore_ops);
+	register_syscore(&pmb_syscore);
 	return 0;
 }
 subsys_initcall(pmb_sysdev_init);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 112f43b23ebf..aca89f23d2e0 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1718,26 +1718,30 @@ static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
 
 #ifdef CONFIG_PM
 
-static int perf_ibs_suspend(void)
+static int perf_ibs_suspend(void *data)
 {
 	clear_APIC_ibs();
 	return 0;
 }
 
-static void perf_ibs_resume(void)
+static void perf_ibs_resume(void *data)
 {
 	ibs_eilvt_setup();
 	setup_APIC_ibs();
 }
 
-static struct syscore_ops perf_ibs_syscore_ops = {
+static const struct syscore_ops perf_ibs_syscore_ops = {
 	.resume		= perf_ibs_resume,
 	.suspend	= perf_ibs_suspend,
 };
 
+static struct syscore perf_ibs_syscore = {
+	.ops = &perf_ibs_syscore_ops,
+};
+
 static void perf_ibs_pm_init(void)
 {
-	register_syscore_ops(&perf_ibs_syscore_ops);
+	register_syscore(&perf_ibs_syscore);
 }
 
 #else
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index e890fd37e9c2..085ef4f2e73a 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -351,7 +351,7 @@ static int __init hv_pci_init(void)
 	return 1;
 }
 
-static int hv_suspend(void)
+static int hv_suspend(void *data)
 {
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 	int ret;
@@ -378,7 +378,7 @@ static int hv_suspend(void)
 	return ret;
 }
 
-static void hv_resume(void)
+static void hv_resume(void *data)
 {
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 	int ret;
@@ -405,11 +405,15 @@ static void hv_resume(void)
 }
 
 /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */
-static struct syscore_ops hv_syscore_ops = {
+static const struct syscore_ops hv_syscore_ops = {
 	.suspend	= hv_suspend,
 	.resume		= hv_resume,
 };
 
+static struct syscore hv_syscore = {
+	.ops = &hv_syscore_ops,
+};
+
 static void (* __initdata old_setup_percpu_clockev)(void);
 
 static void __init hv_stimer_setup_percpu_clockev(void)
@@ -569,7 +573,7 @@ skip_hypercall_pg_init:
 
 	x86_init.pci.arch_init = hv_pci_init;
 
-	register_syscore_ops(&hv_syscore_ops);
+	register_syscore(&hv_syscore);
 
 	if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
 		hv_get_partition_id();
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 3485d419c2f5..e6e68a31634c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -591,7 +591,7 @@ static void gart_fixup_northbridges(void)
 	}
 }
 
-static void gart_resume(void)
+static void gart_resume(void *data)
 {
 	pr_info("PCI-DMA: Resuming GART IOMMU\n");
 
@@ -600,11 +600,15 @@ static void gart_resume(void)
 	enable_gart_translations();
 }
 
-static struct syscore_ops gart_syscore_ops = {
+static const struct syscore_ops gart_syscore_ops = {
 	.resume		= gart_resume,
 
 };
 
+static struct syscore gart_syscore = {
+	.ops = &gart_syscore_ops,
+};
+
 /*
  * Private Northbridge GATT initialization in case we cannot use the
  * AGP driver for some reason.
@@ -650,7 +654,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
 
 	agp_gatt_table = gatt;
 
-	register_syscore_ops(&gart_syscore_ops);
+	register_syscore(&gart_syscore);
 
 	flush_gart();
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..fd87b1562c7e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2381,7 +2381,7 @@ static struct {
 	unsigned int apic_cmci;
 } apic_pm_state;
 
-static int lapic_suspend(void)
+static int lapic_suspend(void *data)
 {
 	unsigned long flags;
 	int maxlvt;
@@ -2429,7 +2429,7 @@ static int lapic_suspend(void)
 	return 0;
 }
 
-static void lapic_resume(void)
+static void lapic_resume(void *data)
 {
 	unsigned int l, h;
 	unsigned long flags;
@@ -2504,11 +2504,15 @@ static void lapic_resume(void)
  * are needed on every CPU up until machine_halt/restart/poweroff.
  */
 
-static struct syscore_ops lapic_syscore_ops = {
+static const struct syscore_ops lapic_syscore_ops = {
 	.resume		= lapic_resume,
 	.suspend	= lapic_suspend,
 };
 
+static struct syscore lapic_syscore = {
+	.ops = &lapic_syscore_ops,
+};
+
 static void apic_pm_activate(void)
 {
 	apic_pm_state.active = 1;
@@ -2518,7 +2522,7 @@ static int __init init_lapic_sysfs(void)
 {
 	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
 	if (boot_cpu_has(X86_FEATURE_APIC))
-		register_syscore_ops(&lapic_syscore_ops);
+		register_syscore(&lapic_syscore);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5ba2feb2c04c..84e200662ce6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2308,7 +2308,12 @@ static void resume_ioapic_id(int ioapic_idx)
 	}
 }
 
-static void ioapic_resume(void)
+static int ioapic_suspend(void *data)
+{
+	return save_ioapic_entries();
+}
+
+static void ioapic_resume(void *data)
 {
 	int ioapic_idx;
 
@@ -2318,14 +2323,18 @@ static void ioapic_resume(void)
 	restore_ioapic_entries();
 }
 
-static struct syscore_ops ioapic_syscore_ops = {
-	.suspend	= save_ioapic_entries,
+static const struct syscore_ops ioapic_syscore_ops = {
+	.suspend	= ioapic_suspend,
 	.resume		= ioapic_resume,
 };
 
+static struct syscore ioapic_syscore = {
+	.ops = &ioapic_syscore_ops,
+};
+
 static int __init ioapic_init_ops(void)
 {
-	register_syscore_ops(&ioapic_syscore_ops);
+	register_syscore(&ioapic_syscore);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index a315b0627dfb..7ffc78d5ebf2 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -37,7 +37,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
 	.seq = SEQCNT_ZERO(cpu_samples.seq)
 };
 
-static void init_counter_refs(void)
+static void init_counter_refs(void *data)
 {
 	u64 aperf, mperf;
 
@@ -289,16 +289,20 @@ out:
 }
 
 #ifdef CONFIG_PM_SLEEP
-static struct syscore_ops freq_invariance_syscore_ops = {
+static const struct syscore_ops freq_invariance_syscore_ops = {
 	.resume = init_counter_refs,
 };
 
-static void register_freq_invariance_syscore_ops(void)
+static struct syscore freq_invariance_syscore = {
+	.ops = &freq_invariance_syscore_ops,
+};
+
+static void register_freq_invariance_syscore(void)
 {
-	register_syscore_ops(&freq_invariance_syscore_ops);
+	register_syscore(&freq_invariance_syscore);
 }
 #else
-static inline void register_freq_invariance_syscore_ops(void) {}
+static inline void register_freq_invariance_syscore(void) {}
 #endif
 
 static void freq_invariance_enable(void)
@@ -308,7 +312,7 @@ static void freq_invariance_enable(void)
 		return;
 	}
 	static_branch_enable_cpuslocked(&arch_scale_freq_key);
-	register_freq_invariance_syscore_ops();
+	register_freq_invariance_syscore();
 	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
 }
 
@@ -535,7 +539,7 @@ static int __init bp_init_aperfmperf(void)
 	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
 		return 0;
 
-	init_counter_refs();
+	init_counter_refs(NULL);
 	bp_init_freq_invariance();
 	return 0;
 }
@@ -544,5 +548,5 @@ early_initcall(bp_init_aperfmperf);
 void ap_init_aperfmperf(void)
 {
 	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
-		init_counter_refs();
+		init_counter_refs(NULL);
 }
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index bc7671f920a7..2c56f8730f59 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -75,7 +75,7 @@ static u8 energ_perf_values[] = {
 	[EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
 };
 
-static int intel_epb_save(void)
+static int intel_epb_save(void *data)
 {
 	u64 epb;
 
@@ -89,7 +89,7 @@ static int intel_epb_save(void)
 	return 0;
 }
 
-static void intel_epb_restore(void)
+static void intel_epb_restore(void *data)
 {
 	u64 val = this_cpu_read(saved_epb);
 	u64 epb;
@@ -114,11 +114,15 @@ static void intel_epb_restore(void)
 	wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
 }
 
-static struct syscore_ops intel_epb_syscore_ops = {
+static const struct syscore_ops intel_epb_syscore_ops = {
 	.suspend = intel_epb_save,
 	.resume = intel_epb_restore,
 };
 
+static struct syscore intel_epb_syscore = {
+	.ops = &intel_epb_syscore_ops,
+};
+
 static const char * const energy_perf_strings[] = {
 	[EPB_INDEX_PERFORMANCE] = "performance",
 	[EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
@@ -185,7 +189,7 @@ static int intel_epb_online(unsigned int cpu)
 {
 	struct device *cpu_dev = get_cpu_device(cpu);
 
-	intel_epb_restore();
+	intel_epb_restore(NULL);
 	if (!cpuhp_tasks_frozen)
 		sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group);
 
@@ -199,7 +203,7 @@ static int intel_epb_offline(unsigned int cpu)
 	if (!cpuhp_tasks_frozen)
 		sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group);
 
-	intel_epb_save();
+	intel_epb_save(NULL);
 	return 0;
 }
 
@@ -230,7 +234,7 @@ static __init int intel_epb_init(void)
 	if (ret < 0)
 		goto err_out_online;
 
-	register_syscore_ops(&intel_epb_syscore_ops);
+	register_syscore(&intel_epb_syscore);
 	return 0;
 
 err_out_online:
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..23bfbc7dfb8e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2410,13 +2410,13 @@ static void vendor_disable_error_reporting(void)
 	mce_disable_error_reporting();
 }
 
-static int mce_syscore_suspend(void)
+static int mce_syscore_suspend(void *data)
 {
 	vendor_disable_error_reporting();
 	return 0;
 }
 
-static void mce_syscore_shutdown(void)
+static void mce_syscore_shutdown(void *data)
 {
 	vendor_disable_error_reporting();
 }
@@ -2426,7 +2426,7 @@ static void mce_syscore_shutdown(void)
  * Only one CPU is active at this time, the others get re-added later using
  * CPU hotplug:
  */
-static void mce_syscore_resume(void)
+static void mce_syscore_resume(void *data)
 {
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
@@ -2434,12 +2434,16 @@ static void mce_syscore_resume(void)
 	cr4_set_bits(X86_CR4_MCE);
 }
 
-static struct syscore_ops mce_syscore_ops = {
+static const struct syscore_ops mce_syscore_ops = {
 	.suspend	= mce_syscore_suspend,
 	.shutdown	= mce_syscore_shutdown,
 	.resume		= mce_syscore_resume,
 };
 
+static struct syscore mce_syscore = {
+	.ops = &mce_syscore_ops,
+};
+
 /*
  * mce_device: Sysfs support
  */
@@ -2840,7 +2844,7 @@ static __init int mcheck_init_device(void)
 	if (err < 0)
 		goto err_out_online;
 
-	register_syscore_ops(&mce_syscore_ops);
+	register_syscore(&mce_syscore);
 
 	return 0;
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f75c140906d0..3262756f8c32 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -812,8 +812,17 @@ void microcode_bsp_resume(void)
 		reload_early_microcode(cpu);
 }
 
-static struct syscore_ops mc_syscore_ops = {
-	.resume	= microcode_bsp_resume,
+static void microcode_bsp_syscore_resume(void *data)
+{
+	microcode_bsp_resume();
+}
+
+static const struct syscore_ops mc_syscore_ops = {
+	.resume	= microcode_bsp_syscore_resume,
+};
+
+static struct syscore mc_syscore = {
+	.ops = &mc_syscore_ops,
 };
 
 static int mc_cpu_online(unsigned int cpu)
@@ -892,7 +901,7 @@ static int __init microcode_init(void)
 		}
 	}
 
-	register_syscore_ops(&mc_syscore_ops);
+	register_syscore(&mc_syscore);
 	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
 			  mc_cpu_online, mc_cpu_down_prep);
 
diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c
index d25882fcf181..2415ffaaf02c 100644
--- a/arch/x86/kernel/cpu/mtrr/legacy.c
+++ b/arch/x86/kernel/cpu/mtrr/legacy.c
@@ -41,7 +41,7 @@ struct mtrr_value {
 
 static struct mtrr_value *mtrr_value;
 
-static int mtrr_save(void)
+static int mtrr_save(void *data)
 {
 	int i;
 
@@ -56,7 +56,7 @@ static int mtrr_save(void)
 	return 0;
 }
 
-static void mtrr_restore(void)
+static void mtrr_restore(void *data)
 {
 	int i;
 
@@ -69,11 +69,15 @@ static void mtrr_restore(void)
 	}
 }
 
-static struct syscore_ops mtrr_syscore_ops = {
+static const struct syscore_ops mtrr_syscore_ops = {
 	.suspend	= mtrr_save,
 	.resume		= mtrr_restore,
 };
 
+static struct syscore mtrr_syscore = {
+	.ops = &mtrr_syscore_ops,
+};
+
 void mtrr_register_syscore(void)
 {
 	mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL);
@@ -86,5 +90,5 @@ void mtrr_register_syscore(void)
 	 * TBD: is there any system with such CPU which supports
 	 * suspend/resume? If no, we should remove the code.
 	 */
-	register_syscore_ops(&mtrr_syscore_ops);
+	register_syscore(&mtrr_syscore);
 }
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 933fcd7ff250..e4a31c536642 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -86,15 +86,19 @@ static int umwait_cpu_offline(unsigned int cpu)
  * trust the firmware nor does it matter if the same value is written
  * again.
  */
-static void umwait_syscore_resume(void)
+static void umwait_syscore_resume(void *data)
 {
 	umwait_update_control_msr(NULL);
 }
 
-static struct syscore_ops umwait_syscore_ops = {
+static const struct syscore_ops umwait_syscore_ops = {
 	.resume	= umwait_syscore_resume,
 };
 
+static struct syscore umwait_syscore = {
+	.ops = &umwait_syscore_ops,
+};
+
 /* sysfs interface */
 
 /*
@@ -226,7 +230,7 @@ static int __init umwait_init(void)
 		return ret;
 	}
 
-	register_syscore_ops(&umwait_syscore_ops);
+	register_syscore(&umwait_syscore);
 
 	/*
 	 * Add umwait control interface. Ignore failure, so at least the
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index 2cd124ad9380..896d46b44284 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -19,7 +19,7 @@
  * in asm/dma.h.
  */
 
-static void i8237A_resume(void)
+static void i8237A_resume(void *data)
 {
 	unsigned long flags;
 	int i;
@@ -41,10 +41,14 @@ static void i8237A_resume(void)
 	release_dma_lock(flags);
 }
 
-static struct syscore_ops i8237_syscore_ops = {
+static const struct syscore_ops i8237_syscore_ops = {
 	.resume		= i8237A_resume,
 };
 
+static struct syscore i8237_syscore = {
+	.ops = &i8237_syscore_ops,
+};
+
 static int __init i8237A_init_ops(void)
 {
 	/*
@@ -70,7 +74,7 @@ static int __init i8237A_init_ops(void)
 	if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
 		return -ENODEV;
 
-	register_syscore_ops(&i8237_syscore_ops);
+	register_syscore(&i8237_syscore);
 	return 0;
 }
 device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 2bade73f49e3..f67063df6723 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -247,19 +247,19 @@ static void save_ELCR(char *trigger)
 	trigger[1] = inb(PIC_ELCR2) & 0xDE;
 }
 
-static void i8259A_resume(void)
+static void i8259A_resume(void *data)
 {
 	init_8259A(i8259A_auto_eoi);
 	restore_ELCR(irq_trigger);
 }
 
-static int i8259A_suspend(void)
+static int i8259A_suspend(void *data)
 {
 	save_ELCR(irq_trigger);
 	return 0;
 }
 
-static void i8259A_shutdown(void)
+static void i8259A_shutdown(void *data)
 {
 	/* Put the i8259A into a quiescent state that
 	 * the kernel initialization code can get it
@@ -269,12 +269,16 @@ static void i8259A_shutdown(void)
 	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
 }
 
-static struct syscore_ops i8259_syscore_ops = {
+static const struct syscore_ops i8259_syscore_ops = {
 	.suspend = i8259A_suspend,
 	.resume = i8259A_resume,
 	.shutdown = i8259A_shutdown,
 };
 
+static struct syscore i8259_syscore = {
+	.ops = &i8259_syscore_ops,
+};
+
 static void mask_8259A(void)
 {
 	unsigned long flags;
@@ -444,7 +448,7 @@ EXPORT_SYMBOL(legacy_pic);
 static int __init i8259A_init_ops(void)
 {
 	if (legacy_pic == &default_legacy_pic)
-		register_syscore_ops(&i8259_syscore_ops);
+		register_syscore(&i8259_syscore);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b67d7c59dca0..1500852ba03c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -720,7 +720,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 #endif
 
-static int kvm_suspend(void)
+static int kvm_suspend(void *data)
 {
 	u64 val = 0;
 
@@ -734,7 +734,7 @@ static int kvm_suspend(void)
 	return 0;
 }
 
-static void kvm_resume(void)
+static void kvm_resume(void *data)
 {
 	kvm_cpu_online(raw_smp_processor_id());
 
@@ -744,11 +744,15 @@ static void kvm_resume(void)
 #endif
 }
 
-static struct syscore_ops kvm_syscore_ops = {
+static const struct syscore_ops kvm_syscore_ops = {
 	.suspend	= kvm_suspend,
 	.resume		= kvm_resume,
 };
 
+static struct syscore kvm_syscore = {
+	.ops = &kvm_syscore_ops,
+};
+
 static void kvm_pv_guest_cpu_reboot(void *unused)
 {
 	kvm_guest_cpu_offline(true);
@@ -858,7 +862,7 @@ static void __init kvm_guest_init(void)
 	machine_ops.crash_shutdown = kvm_crash_shutdown;
 #endif
 
-	register_syscore_ops(&kvm_syscore_ops);
+	register_syscore(&kvm_syscore);
 
 	/*
 	 * Hard lockup detection is enabled by default. Disable it, as guests
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index e4560b33b8ad..bed7dc85612e 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -761,7 +761,7 @@ static int acpi_pci_link_resume(struct acpi_pci_link *link)
 	return 0;
 }
 
-static void irqrouter_resume(void)
+static void irqrouter_resume(void *data)
 {
 	struct acpi_pci_link *link;
 
@@ -888,10 +888,14 @@ static int __init acpi_irq_balance_set(char *str)
 
 __setup("acpi_irq_balance", acpi_irq_balance_set);
 
-static struct syscore_ops irqrouter_syscore_ops = {
+static const struct syscore_ops irqrouter_syscore_ops = {
 	.resume = irqrouter_resume,
 };
 
+static struct syscore irqrouter_syscore = {
+	.ops = &irqrouter_syscore_ops,
+};
+
 void __init acpi_pci_link_init(void)
 {
 	if (acpi_noirq)
@@ -904,6 +908,6 @@ void __init acpi_pci_link_init(void)
 		else
 			acpi_irq_balance = 0;
 	}
-	register_syscore_ops(&irqrouter_syscore_ops);
+	register_syscore(&irqrouter_syscore);
 	acpi_scan_add_handler(&pci_link_handler);
 }
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index c8ee8e42b0f6..aaf57d0aaa19 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -884,13 +884,13 @@ bool acpi_s2idle_wakeup(void)
 #ifdef CONFIG_PM_SLEEP
 static u32 saved_bm_rld;
 
-static int  acpi_save_bm_rld(void)
+static int  acpi_save_bm_rld(void *data)
 {
 	acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &saved_bm_rld);
 	return 0;
 }
 
-static void  acpi_restore_bm_rld(void)
+static void  acpi_restore_bm_rld(void *data)
 {
 	u32 resumed_bm_rld = 0;
 
@@ -901,14 +901,18 @@ static void  acpi_restore_bm_rld(void)
 	acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, saved_bm_rld);
 }
 
-static struct syscore_ops acpi_sleep_syscore_ops = {
+static const struct syscore_ops acpi_sleep_syscore_ops = {
 	.suspend = acpi_save_bm_rld,
 	.resume = acpi_restore_bm_rld,
 };
 
+static struct syscore acpi_sleep_syscore = {
+	.ops = &acpi_sleep_syscore_ops,
+};
+
 static void acpi_sleep_syscore_init(void)
 {
-	register_syscore_ops(&acpi_sleep_syscore_ops);
+	register_syscore(&acpi_sleep_syscore);
 }
 #else
 static inline void acpi_sleep_syscore_init(void) {}
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 6942c62fa59d..8191dbab92c4 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -1585,16 +1585,20 @@ static int fw_pm_notify(struct notifier_block *notify_block,
 }
 
 /* stop caching firmware once syscore_suspend is reached */
-static int fw_suspend(void)
+static int fw_suspend(void *data)
 {
 	fw_cache.state = FW_LOADER_NO_CACHE;
 	return 0;
 }
 
-static struct syscore_ops fw_syscore_ops = {
+static const struct syscore_ops fw_syscore_ops = {
 	.suspend = fw_suspend,
 };
 
+static struct syscore fw_syscore = {
+	.ops = &fw_syscore_ops,
+};
+
 static int __init register_fw_pm_ops(void)
 {
 	int ret;
@@ -1610,14 +1614,14 @@ static int __init register_fw_pm_ops(void)
 	if (ret)
 		return ret;
 
-	register_syscore_ops(&fw_syscore_ops);
+	register_syscore(&fw_syscore);
 
 	return ret;
 }
 
 static inline void unregister_fw_pm_ops(void)
 {
-	unregister_syscore_ops(&fw_syscore_ops);
+	unregister_syscore(&fw_syscore);
 	unregister_pm_notifier(&fw_cache.pm_notify);
 }
 #else
diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c
index 13db1f78d2ce..483adb796654 100644
--- a/drivers/base/syscore.c
+++ b/drivers/base/syscore.c
@@ -11,32 +11,32 @@
 #include <linux/suspend.h>
 #include <trace/events/power.h>
 
-static LIST_HEAD(syscore_ops_list);
-static DEFINE_MUTEX(syscore_ops_lock);
+static LIST_HEAD(syscore_list);
+static DEFINE_MUTEX(syscore_lock);
 
 /**
- * register_syscore_ops - Register a set of system core operations.
- * @ops: System core operations to register.
+ * register_syscore - Register a set of system core operations.
+ * @syscore: System core operations to register.
  */
-void register_syscore_ops(struct syscore_ops *ops)
+void register_syscore(struct syscore *syscore)
 {
-	mutex_lock(&syscore_ops_lock);
-	list_add_tail(&ops->node, &syscore_ops_list);
-	mutex_unlock(&syscore_ops_lock);
+	mutex_lock(&syscore_lock);
+	list_add_tail(&syscore->node, &syscore_list);
+	mutex_unlock(&syscore_lock);
 }
-EXPORT_SYMBOL_GPL(register_syscore_ops);
+EXPORT_SYMBOL_GPL(register_syscore);
 
 /**
- * unregister_syscore_ops - Unregister a set of system core operations.
- * @ops: System core operations to unregister.
+ * unregister_syscore - Unregister a set of system core operations.
+ * @syscore: System core operations to unregister.
  */
-void unregister_syscore_ops(struct syscore_ops *ops)
+void unregister_syscore(struct syscore *syscore)
 {
-	mutex_lock(&syscore_ops_lock);
-	list_del(&ops->node);
-	mutex_unlock(&syscore_ops_lock);
+	mutex_lock(&syscore_lock);
+	list_del(&syscore->node);
+	mutex_unlock(&syscore_lock);
 }
-EXPORT_SYMBOL_GPL(unregister_syscore_ops);
+EXPORT_SYMBOL_GPL(unregister_syscore);
 
 #ifdef CONFIG_PM_SLEEP
 /**
@@ -46,7 +46,7 @@ EXPORT_SYMBOL_GPL(unregister_syscore_ops);
  */
 int syscore_suspend(void)
 {
-	struct syscore_ops *ops;
+	struct syscore *syscore;
 	int ret = 0;
 
 	trace_suspend_resume(TPS("syscore_suspend"), 0, true);
@@ -59,25 +59,27 @@ int syscore_suspend(void)
 	WARN_ONCE(!irqs_disabled(),
 		"Interrupts enabled before system core suspend.\n");
 
-	list_for_each_entry_reverse(ops, &syscore_ops_list, node)
-		if (ops->suspend) {
-			pm_pr_dbg("Calling %pS\n", ops->suspend);
-			ret = ops->suspend();
+	list_for_each_entry_reverse(syscore, &syscore_list, node)
+		if (syscore->ops->suspend) {
+			pm_pr_dbg("Calling %pS\n", syscore->ops->suspend);
+			ret = syscore->ops->suspend(syscore->data);
 			if (ret)
 				goto err_out;
 			WARN_ONCE(!irqs_disabled(),
-				"Interrupts enabled after %pS\n", ops->suspend);
+				"Interrupts enabled after %pS\n",
+				syscore->ops->suspend);
 		}
 
 	trace_suspend_resume(TPS("syscore_suspend"), 0, false);
 	return 0;
 
  err_out:
-	pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend);
+	pr_err("PM: System core suspend callback %pS failed.\n",
+	       syscore->ops->suspend);
 
-	list_for_each_entry_continue(ops, &syscore_ops_list, node)
-		if (ops->resume)
-			ops->resume();
+	list_for_each_entry_continue(syscore, &syscore_list, node)
+		if (syscore->ops->resume)
+			syscore->ops->resume(syscore->data);
 
 	return ret;
 }
@@ -90,18 +92,19 @@ EXPORT_SYMBOL_GPL(syscore_suspend);
  */
 void syscore_resume(void)
 {
-	struct syscore_ops *ops;
+	struct syscore *syscore;
 
 	trace_suspend_resume(TPS("syscore_resume"), 0, true);
 	WARN_ONCE(!irqs_disabled(),
 		"Interrupts enabled before system core resume.\n");
 
-	list_for_each_entry(ops, &syscore_ops_list, node)
-		if (ops->resume) {
-			pm_pr_dbg("Calling %pS\n", ops->resume);
-			ops->resume();
+	list_for_each_entry(syscore, &syscore_list, node)
+		if (syscore->ops->resume) {
+			pm_pr_dbg("Calling %pS\n", syscore->ops->resume);
+			syscore->ops->resume(syscore->data);
 			WARN_ONCE(!irqs_disabled(),
-				"Interrupts enabled after %pS\n", ops->resume);
+				"Interrupts enabled after %pS\n",
+				syscore->ops->resume);
 		}
 	trace_suspend_resume(TPS("syscore_resume"), 0, false);
 }
@@ -113,16 +116,17 @@ EXPORT_SYMBOL_GPL(syscore_resume);
  */
 void syscore_shutdown(void)
 {
-	struct syscore_ops *ops;
+	struct syscore *syscore;
 
-	mutex_lock(&syscore_ops_lock);
+	mutex_lock(&syscore_lock);
 
-	list_for_each_entry_reverse(ops, &syscore_ops_list, node)
-		if (ops->shutdown) {
+	list_for_each_entry_reverse(syscore, &syscore_list, node)
+		if (syscore->ops->shutdown) {
 			if (initcall_debug)
-				pr_info("PM: Calling %pS\n", ops->shutdown);
-			ops->shutdown();
+				pr_info("PM: Calling %pS\n",
+					syscore->ops->shutdown);
+			syscore->ops->shutdown(syscore->data);
 		}
 
-	mutex_unlock(&syscore_ops_lock);
+	mutex_unlock(&syscore_lock);
 }
diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
index 00cb792bda18..dd94145c9b22 100644
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -1006,7 +1006,7 @@ static __init int mvebu_mbus_debugfs_init(void)
 }
 fs_initcall(mvebu_mbus_debugfs_init);
 
-static int mvebu_mbus_suspend(void)
+static int mvebu_mbus_suspend(void *data)
 {
 	struct mvebu_mbus_state *s = &mbus_state;
 	int win;
@@ -1040,7 +1040,7 @@ static int mvebu_mbus_suspend(void)
 	return 0;
 }
 
-static void mvebu_mbus_resume(void)
+static void mvebu_mbus_resume(void *data)
 {
 	struct mvebu_mbus_state *s = &mbus_state;
 	int win;
@@ -1069,9 +1069,13 @@ static void mvebu_mbus_resume(void)
 	}
 }
 
-static struct syscore_ops mvebu_mbus_syscore_ops = {
-	.suspend	= mvebu_mbus_suspend,
-	.resume		= mvebu_mbus_resume,
+static const struct syscore_ops mvebu_mbus_syscore_ops = {
+	.suspend = mvebu_mbus_suspend,
+	.resume = mvebu_mbus_resume,
+};
+
+static struct syscore mvebu_mbus_syscore = {
+	.ops = &mvebu_mbus_syscore_ops,
 };
 
 static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus,
@@ -1118,7 +1122,7 @@ static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus,
 		writel(UNIT_SYNC_BARRIER_ALL,
 		       mbus->mbuswins_base + UNIT_SYNC_BARRIER_OFF);
 
-	register_syscore_ops(&mvebu_mbus_syscore_ops);
+	register_syscore(&mvebu_mbus_syscore);
 
 	return 0;
 }
diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
index acf780a81589..2310f6f73162 100644
--- a/drivers/clk/at91/pmc.c
+++ b/drivers/clk/at91/pmc.c
@@ -115,7 +115,7 @@ struct pmc_data *pmc_data_allocate(unsigned int ncore, unsigned int nsystem,
 /* Address in SECURAM that say if we suspend to backup mode. */
 static void __iomem *at91_pmc_backup_suspend;
 
-static int at91_pmc_suspend(void)
+static int at91_pmc_suspend(void *data)
 {
 	unsigned int backup;
 
@@ -129,7 +129,7 @@ static int at91_pmc_suspend(void)
 	return clk_save_context();
 }
 
-static void at91_pmc_resume(void)
+static void at91_pmc_resume(void *data)
 {
 	unsigned int backup;
 
@@ -143,11 +143,15 @@ static void at91_pmc_resume(void)
 	clk_restore_context();
 }
 
-static struct syscore_ops pmc_syscore_ops = {
+static const struct syscore_ops pmc_syscore_ops = {
 	.suspend = at91_pmc_suspend,
 	.resume = at91_pmc_resume,
 };
 
+static struct syscore pmc_syscore = {
+	.ops = &pmc_syscore_ops,
+};
+
 static const struct of_device_id pmc_dt_ids[] = {
 	{ .compatible = "atmel,sama5d2-pmc" },
 	{ .compatible = "microchip,sama7g5-pmc", },
@@ -185,7 +189,7 @@ static int __init pmc_register_ops(void)
 		return -ENOMEM;
 	}
 
-	register_syscore_ops(&pmc_syscore_ops);
+	register_syscore(&pmc_syscore);
 
 	return 0;
 }
diff --git a/drivers/clk/imx/clk-vf610.c b/drivers/clk/imx/clk-vf610.c
index 9e11f1c7c397..41eb38552a9c 100644
--- a/drivers/clk/imx/clk-vf610.c
+++ b/drivers/clk/imx/clk-vf610.c
@@ -139,7 +139,7 @@ static struct clk * __init vf610_get_fixed_clock(
 	return clk;
 };
 
-static int vf610_clk_suspend(void)
+static int vf610_clk_suspend(void *data)
 {
 	int i;
 
@@ -156,7 +156,7 @@ static int vf610_clk_suspend(void)
 	return 0;
 }
 
-static void vf610_clk_resume(void)
+static void vf610_clk_resume(void *data)
 {
 	int i;
 
@@ -171,11 +171,15 @@ static void vf610_clk_resume(void)
 		writel_relaxed(ccgr[i], CCM_CCGRx(i));
 }
 
-static struct syscore_ops vf610_clk_syscore_ops = {
+static const struct syscore_ops vf610_clk_syscore_ops = {
 	.suspend = vf610_clk_suspend,
 	.resume = vf610_clk_resume,
 };
 
+static struct syscore vf610_clk_syscore = {
+	.ops = &vf610_clk_syscore_ops,
+};
+
 static void __init vf610_clocks_init(struct device_node *ccm_node)
 {
 	struct device_node *np;
@@ -462,7 +466,7 @@ static void __init vf610_clocks_init(struct device_node *ccm_node)
 	for (i = 0; i < ARRAY_SIZE(clks_init_on); i++)
 		clk_prepare_enable(clk[clks_init_on[i]]);
 
-	register_syscore_ops(&vf610_clk_syscore_ops);
+	register_syscore(&vf610_clk_syscore);
 
 	/* Add the clocks to provider list */
 	clk_data.clks = clk;
diff --git a/drivers/clk/ingenic/jz4725b-cgu.c b/drivers/clk/ingenic/jz4725b-cgu.c
index 590e9c85cb25..94cee44c854f 100644
--- a/drivers/clk/ingenic/jz4725b-cgu.c
+++ b/drivers/clk/ingenic/jz4725b-cgu.c
@@ -268,6 +268,6 @@ static void __init jz4725b_cgu_init(struct device_node *np)
 	if (retval)
 		pr_err("%s: failed to register CGU Clocks\n", __func__);
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 CLK_OF_DECLARE_DRIVER(jz4725b_cgu, "ingenic,jz4725b-cgu", jz4725b_cgu_init);
diff --git a/drivers/clk/ingenic/jz4740-cgu.c b/drivers/clk/ingenic/jz4740-cgu.c
index 3e0a30574ebb..2def3aedc8dd 100644
--- a/drivers/clk/ingenic/jz4740-cgu.c
+++ b/drivers/clk/ingenic/jz4740-cgu.c
@@ -266,6 +266,6 @@ static void __init jz4740_cgu_init(struct device_node *np)
 	if (retval)
 		pr_err("%s: failed to register CGU Clocks\n", __func__);
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-cgu", jz4740_cgu_init);
diff --git a/drivers/clk/ingenic/jz4755-cgu.c b/drivers/clk/ingenic/jz4755-cgu.c
index f2c2d848dab7..17cf5dcaece9 100644
--- a/drivers/clk/ingenic/jz4755-cgu.c
+++ b/drivers/clk/ingenic/jz4755-cgu.c
@@ -337,7 +337,7 @@ static void __init jz4755_cgu_init(struct device_node *np)
 	if (retval)
 		pr_err("%s: failed to register CGU Clocks\n", __func__);
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 /*
  * CGU has some children devices, this is useful for probing children devices
diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c
index e407f00bd594..372fe4b07992 100644
--- a/drivers/clk/ingenic/jz4760-cgu.c
+++ b/drivers/clk/ingenic/jz4760-cgu.c
@@ -436,7 +436,7 @@ static void __init jz4760_cgu_init(struct device_node *np)
 	if (retval)
 		pr_err("%s: failed to register CGU Clocks\n", __func__);
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 
 /* We only probe via devicetree, no need for a platform driver */
diff --git a/drivers/clk/ingenic/jz4770-cgu.c b/drivers/clk/ingenic/jz4770-cgu.c
index 6ae1740367f9..58f1d3bad677 100644
--- a/drivers/clk/ingenic/jz4770-cgu.c
+++ b/drivers/clk/ingenic/jz4770-cgu.c
@@ -456,7 +456,7 @@ static void __init jz4770_cgu_init(struct device_node *np)
 	if (retval)
 		pr_err("%s: failed to register CGU Clocks\n", __func__);
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 
 /* We only probe via devicetree, no need for a platform driver */
diff --git a/drivers/clk/ingenic/jz4780-cgu.c b/drivers/clk/ingenic/jz4780-cgu.c
index 07e2f3c5c454..1e88aef7ac0f 100644
--- a/drivers/clk/ingenic/jz4780-cgu.c
+++ b/drivers/clk/ingenic/jz4780-cgu.c
@@ -803,6 +803,6 @@ static void __init jz4780_cgu_init(struct device_node *np)
 		return;
 	}
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 CLK_OF_DECLARE_DRIVER(jz4780_cgu, "ingenic,jz4780-cgu", jz4780_cgu_init);
diff --git a/drivers/clk/ingenic/pm.c b/drivers/clk/ingenic/pm.c
index 341752b640d2..206d5cf2872f 100644
--- a/drivers/clk/ingenic/pm.c
+++ b/drivers/clk/ingenic/pm.c
@@ -15,7 +15,7 @@
 
 static void __iomem * __maybe_unused ingenic_cgu_base;
 
-static int __maybe_unused ingenic_cgu_pm_suspend(void)
+static int __maybe_unused ingenic_cgu_pm_suspend(void *data)
 {
 	u32 val = readl(ingenic_cgu_base + CGU_REG_LCR);
 
@@ -24,22 +24,26 @@ static int __maybe_unused ingenic_cgu_pm_suspend(void)
 	return 0;
 }
 
-static void __maybe_unused ingenic_cgu_pm_resume(void)
+static void __maybe_unused ingenic_cgu_pm_resume(void *data)
 {
 	u32 val = readl(ingenic_cgu_base + CGU_REG_LCR);
 
 	writel(val & ~LCR_LOW_POWER_MODE, ingenic_cgu_base + CGU_REG_LCR);
 }
 
-static struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = {
+static const struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = {
 	.suspend = ingenic_cgu_pm_suspend,
 	.resume = ingenic_cgu_pm_resume,
 };
 
-void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu)
+static struct syscore __maybe_unused ingenic_cgu_pm = {
+	.ops = &ingenic_cgu_pm_ops,
+};
+
+void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu)
 {
 	if (IS_ENABLED(CONFIG_PM_SLEEP)) {
 		ingenic_cgu_base = cgu->base;
-		register_syscore_ops(&ingenic_cgu_pm_ops);
+		register_syscore(&ingenic_cgu_pm);
 	}
 }
diff --git a/drivers/clk/ingenic/pm.h b/drivers/clk/ingenic/pm.h
index fa7540407b6b..0dcb57dc64cb 100644
--- a/drivers/clk/ingenic/pm.h
+++ b/drivers/clk/ingenic/pm.h
@@ -7,6 +7,6 @@
 
 struct ingenic_cgu;
 
-void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu);
+void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu);
 
 #endif /* DRIVERS_CLK_INGENIC_PM_H */
diff --git a/drivers/clk/ingenic/tcu.c b/drivers/clk/ingenic/tcu.c
index 7d04ef40b7cf..bc6a51da2072 100644
--- a/drivers/clk/ingenic/tcu.c
+++ b/drivers/clk/ingenic/tcu.c
@@ -455,7 +455,7 @@ err_free_tcu:
 	return ret;
 }
 
-static int __maybe_unused tcu_pm_suspend(void)
+static int __maybe_unused tcu_pm_suspend(void *data)
 {
 	struct ingenic_tcu *tcu = ingenic_tcu;
 
@@ -465,7 +465,7 @@ static int __maybe_unused tcu_pm_suspend(void)
 	return 0;
 }
 
-static void __maybe_unused tcu_pm_resume(void)
+static void __maybe_unused tcu_pm_resume(void *data)
 {
 	struct ingenic_tcu *tcu = ingenic_tcu;
 
@@ -473,11 +473,15 @@ static void __maybe_unused tcu_pm_resume(void)
 		clk_enable(tcu->clk);
 }
 
-static struct syscore_ops __maybe_unused tcu_pm_ops = {
+static const struct syscore_ops __maybe_unused tcu_pm_ops = {
 	.suspend = tcu_pm_suspend,
 	.resume = tcu_pm_resume,
 };
 
+static struct syscore __maybe_unused tcu_pm = {
+	.ops = &tcu_pm_ops,
+};
+
 static void __init ingenic_tcu_init(struct device_node *np)
 {
 	int ret = ingenic_tcu_probe(np);
@@ -486,7 +490,7 @@ static void __init ingenic_tcu_init(struct device_node *np)
 		pr_crit("Failed to initialize TCU clocks: %d\n", ret);
 
 	if (IS_ENABLED(CONFIG_PM_SLEEP))
-		register_syscore_ops(&tcu_pm_ops);
+		register_syscore(&tcu_pm);
 }
 
 CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-tcu", ingenic_tcu_init);
diff --git a/drivers/clk/ingenic/x1000-cgu.c b/drivers/clk/ingenic/x1000-cgu.c
index d80886caf393..d89bdfb7c219 100644
--- a/drivers/clk/ingenic/x1000-cgu.c
+++ b/drivers/clk/ingenic/x1000-cgu.c
@@ -556,7 +556,7 @@ static void __init x1000_cgu_init(struct device_node *np)
 		return;
 	}
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 /*
  * CGU has some children devices, this is useful for probing children devices
diff --git a/drivers/clk/ingenic/x1830-cgu.c b/drivers/clk/ingenic/x1830-cgu.c
index 0fd46e50a513..acf856e5009e 100644
--- a/drivers/clk/ingenic/x1830-cgu.c
+++ b/drivers/clk/ingenic/x1830-cgu.c
@@ -463,7 +463,7 @@ static void __init x1830_cgu_init(struct device_node *np)
 		return;
 	}
 
-	ingenic_cgu_register_syscore_ops(cgu);
+	ingenic_cgu_register_syscore(cgu);
 }
 /*
  * CGU has some children devices, this is useful for probing children devices
diff --git a/drivers/clk/mvebu/common.c b/drivers/clk/mvebu/common.c
index 785dbede4835..5adbbd91a6db 100644
--- a/drivers/clk/mvebu/common.c
+++ b/drivers/clk/mvebu/common.c
@@ -215,22 +215,26 @@ static struct clk *clk_gating_get_src(
 	return ERR_PTR(-ENODEV);
 }
 
-static int mvebu_clk_gating_suspend(void)
+static int mvebu_clk_gating_suspend(void *data)
 {
 	ctrl->saved_reg = readl(ctrl->base);
 	return 0;
 }
 
-static void mvebu_clk_gating_resume(void)
+static void mvebu_clk_gating_resume(void *data)
 {
 	writel(ctrl->saved_reg, ctrl->base);
 }
 
-static struct syscore_ops clk_gate_syscore_ops = {
+static const struct syscore_ops clk_gate_syscore_ops = {
 	.suspend = mvebu_clk_gating_suspend,
 	.resume = mvebu_clk_gating_resume,
 };
 
+static struct syscore clk_gate_syscore = {
+	.ops = &clk_gate_syscore_ops,
+};
+
 void __init mvebu_clk_gating_setup(struct device_node *np,
 				   const struct clk_gating_soc_desc *desc)
 {
@@ -284,7 +288,7 @@ void __init mvebu_clk_gating_setup(struct device_node *np,
 
 	of_clk_add_provider(np, clk_gating_get_src, ctrl);
 
-	register_syscore_ops(&clk_gate_syscore_ops);
+	register_syscore(&clk_gate_syscore);
 
 	return;
 gates_out:
diff --git a/drivers/clk/rockchip/clk-rk3288.c b/drivers/clk/rockchip/clk-rk3288.c
index 0a1e017df7c6..9cf3e1e43b78 100644
--- a/drivers/clk/rockchip/clk-rk3288.c
+++ b/drivers/clk/rockchip/clk-rk3288.c
@@ -871,7 +871,7 @@ static const int rk3288_saved_cru_reg_ids[] = {
 
 static u32 rk3288_saved_cru_regs[ARRAY_SIZE(rk3288_saved_cru_reg_ids)];
 
-static int rk3288_clk_suspend(void)
+static int rk3288_clk_suspend(void *data)
 {
 	int i, reg_id;
 
@@ -906,7 +906,7 @@ static int rk3288_clk_suspend(void)
 	return 0;
 }
 
-static void rk3288_clk_resume(void)
+static void rk3288_clk_resume(void *data)
 {
 	int i, reg_id;
 
@@ -923,11 +923,15 @@ static void rk3288_clk_shutdown(void)
 	writel_relaxed(0xf3030000, rk3288_cru_base + RK3288_MODE_CON);
 }
 
-static struct syscore_ops rk3288_clk_syscore_ops = {
+static const struct syscore_ops rk3288_clk_syscore_ops = {
 	.suspend = rk3288_clk_suspend,
 	.resume = rk3288_clk_resume,
 };
 
+static struct syscore rk3288_clk_syscore = {
+	.ops = &rk3288_clk_syscore_ops,
+};
+
 static void __init rk3288_common_init(struct device_node *np,
 				      enum rk3288_variant soc)
 {
@@ -976,7 +980,7 @@ static void __init rk3288_common_init(struct device_node *np,
 
 	rockchip_register_restart_notifier(ctx, RK3288_GLB_SRST_FST,
 					   rk3288_clk_shutdown);
-	register_syscore_ops(&rk3288_clk_syscore_ops);
+	register_syscore(&rk3288_clk_syscore);
 
 	rockchip_clk_of_add_provider(np, ctx);
 }
diff --git a/drivers/clk/samsung/clk-s5pv210-audss.c b/drivers/clk/samsung/clk-s5pv210-audss.c
index b1fd8fac3a4c..c9fcb23de183 100644
--- a/drivers/clk/samsung/clk-s5pv210-audss.c
+++ b/drivers/clk/samsung/clk-s5pv210-audss.c
@@ -36,7 +36,7 @@ static unsigned long reg_save[][2] = {
 	{ASS_CLK_GATE, 0},
 };
 
-static int s5pv210_audss_clk_suspend(void)
+static int s5pv210_audss_clk_suspend(void *data)
 {
 	int i;
 
@@ -46,7 +46,7 @@ static int s5pv210_audss_clk_suspend(void)
 	return 0;
 }
 
-static void s5pv210_audss_clk_resume(void)
+static void s5pv210_audss_clk_resume(void *data)
 {
 	int i;
 
@@ -54,10 +54,14 @@ static void s5pv210_audss_clk_resume(void)
 		writel(reg_save[i][1], reg_base + reg_save[i][0]);
 }
 
-static struct syscore_ops s5pv210_audss_clk_syscore_ops = {
+static const struct syscore_ops s5pv210_audss_clk_syscore_ops = {
 	.suspend	= s5pv210_audss_clk_suspend,
 	.resume		= s5pv210_audss_clk_resume,
 };
+
+static struct syscore s5pv210_audss_clk_syscore = {
+	.ops = &s5pv210_audss_clk_syscore_ops,
+};
 #endif /* CONFIG_PM_SLEEP */
 
 /* register s5pv210_audss clocks */
@@ -175,7 +179,7 @@ static int s5pv210_audss_clk_probe(struct platform_device *pdev)
 	}
 
 #ifdef CONFIG_PM_SLEEP
-	register_syscore_ops(&s5pv210_audss_clk_syscore_ops);
+	register_syscore(&s5pv210_audss_clk_syscore);
 #endif
 
 	return 0;
diff --git a/drivers/clk/samsung/clk.c b/drivers/clk/samsung/clk.c
index dbc9925ca8f4..c149ca6c2217 100644
--- a/drivers/clk/samsung/clk.c
+++ b/drivers/clk/samsung/clk.c
@@ -271,7 +271,7 @@ void __init samsung_clk_of_register_fixed_ext(struct samsung_clk_provider *ctx,
 }
 
 #ifdef CONFIG_PM_SLEEP
-static int samsung_clk_suspend(void)
+static int samsung_clk_suspend(void *data)
 {
 	struct samsung_clock_reg_cache *reg_cache;
 
@@ -284,7 +284,7 @@ static int samsung_clk_suspend(void)
 	return 0;
 }
 
-static void samsung_clk_resume(void)
+static void samsung_clk_resume(void *data)
 {
 	struct samsung_clock_reg_cache *reg_cache;
 
@@ -293,11 +293,15 @@ static void samsung_clk_resume(void)
 				reg_cache->rd_num);
 }
 
-static struct syscore_ops samsung_clk_syscore_ops = {
+static const struct syscore_ops samsung_clk_syscore_ops = {
 	.suspend = samsung_clk_suspend,
 	.resume = samsung_clk_resume,
 };
 
+static struct syscore samsung_clk_syscore = {
+	.ops = &samsung_clk_syscore_ops,
+};
+
 void samsung_clk_extended_sleep_init(void __iomem *reg_base,
 			const unsigned long *rdump,
 			unsigned long nr_rdump,
@@ -316,7 +320,7 @@ void samsung_clk_extended_sleep_init(void __iomem *reg_base,
 		panic("could not allocate register dump storage.\n");
 
 	if (list_empty(&clock_reg_cache_list))
-		register_syscore_ops(&samsung_clk_syscore_ops);
+		register_syscore(&samsung_clk_syscore);
 
 	reg_cache->reg_base = reg_base;
 	reg_cache->rd_num = nr_rdump;
diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c
index 412902f573b5..504d0ea997a5 100644
--- a/drivers/clk/tegra/clk-tegra210.c
+++ b/drivers/clk/tegra/clk-tegra210.c
@@ -3444,7 +3444,7 @@ static void tegra210_disable_cpu_clock(u32 cpu)
 static u32 spare_reg_ctx, misc_clk_enb_ctx, clk_msk_arm_ctx;
 static u32 cpu_softrst_ctx[3];
 
-static int tegra210_clk_suspend(void)
+static int tegra210_clk_suspend(void *data)
 {
 	unsigned int i;
 
@@ -3465,7 +3465,7 @@ static int tegra210_clk_suspend(void)
 	return 0;
 }
 
-static void tegra210_clk_resume(void)
+static void tegra210_clk_resume(void *data)
 {
 	unsigned int i;
 
@@ -3523,13 +3523,17 @@ static void tegra210_cpu_clock_resume(void)
 }
 #endif
 
-static struct syscore_ops tegra_clk_syscore_ops = {
+static const struct syscore_ops tegra_clk_syscore_ops = {
 #ifdef CONFIG_PM_SLEEP
 	.suspend = tegra210_clk_suspend,
 	.resume = tegra210_clk_resume,
 #endif
 };
 
+static struct syscore tegra_clk_syscore = {
+	.ops = &tegra_clk_syscore_ops,
+};
+
 static struct tegra_cpu_car_ops tegra210_cpu_car_ops = {
 	.wait_for_reset	= tegra210_wait_cpu_in_reset,
 	.disable_clock	= tegra210_disable_cpu_clock,
@@ -3813,6 +3817,6 @@ static void __init tegra210_clock_init(struct device_node *np)
 
 	tegra_cpu_car_ops = &tegra210_cpu_car_ops;
 
-	register_syscore_ops(&tegra_clk_syscore_ops);
+	register_syscore(&tegra_clk_syscore);
 }
 CLK_OF_DECLARE(tegra210, "nvidia,tegra210-car", tegra210_clock_init);
diff --git a/drivers/clocksource/timer-armada-370-xp.c b/drivers/clocksource/timer-armada-370-xp.c
index 54284c1c0651..f2b4cc40db93 100644
--- a/drivers/clocksource/timer-armada-370-xp.c
+++ b/drivers/clocksource/timer-armada-370-xp.c
@@ -207,14 +207,14 @@ static int armada_370_xp_timer_dying_cpu(unsigned int cpu)
 
 static u32 timer0_ctrl_reg, timer0_local_ctrl_reg;
 
-static int armada_370_xp_timer_suspend(void)
+static int armada_370_xp_timer_suspend(void *data)
 {
 	timer0_ctrl_reg = readl(timer_base + TIMER_CTRL_OFF);
 	timer0_local_ctrl_reg = readl(local_base + TIMER_CTRL_OFF);
 	return 0;
 }
 
-static void armada_370_xp_timer_resume(void)
+static void armada_370_xp_timer_resume(void *data)
 {
 	writel(0xffffffff, timer_base + TIMER0_VAL_OFF);
 	writel(0xffffffff, timer_base + TIMER0_RELOAD_OFF);
@@ -222,11 +222,15 @@ static void armada_370_xp_timer_resume(void)
 	writel(timer0_local_ctrl_reg, local_base + TIMER_CTRL_OFF);
 }
 
-static struct syscore_ops armada_370_xp_timer_syscore_ops = {
+static const struct syscore_ops armada_370_xp_timer_syscore_ops = {
 	.suspend	= armada_370_xp_timer_suspend,
 	.resume		= armada_370_xp_timer_resume,
 };
 
+static struct syscore armada_370_xp_timer_syscore = {
+	.ops = &armada_370_xp_timer_syscore_ops,
+};
+
 static unsigned long armada_370_delay_timer_read(void)
 {
 	return ~readl(timer_base + TIMER0_VAL_OFF);
@@ -324,7 +328,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np)
 		return res;
 	}
 
-	register_syscore_ops(&armada_370_xp_timer_syscore_ops);
+	register_syscore(&armada_370_xp_timer_syscore);
 	
 	return 0;
 }
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index b19bc60cc627..3372e1f90561 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -177,26 +177,30 @@ static void psci_idle_syscore_switch(bool suspend)
 	}
 }
 
-static int psci_idle_syscore_suspend(void)
+static int psci_idle_syscore_suspend(void *data)
 {
 	psci_idle_syscore_switch(true);
 	return 0;
 }
 
-static void psci_idle_syscore_resume(void)
+static void psci_idle_syscore_resume(void *data)
 {
 	psci_idle_syscore_switch(false);
 }
 
-static struct syscore_ops psci_idle_syscore_ops = {
+static const struct syscore_ops psci_idle_syscore_ops = {
 	.suspend = psci_idle_syscore_suspend,
 	.resume = psci_idle_syscore_resume,
 };
 
+static struct syscore psci_idle_syscore = {
+	.ops = &psci_idle_syscore_ops,
+};
+
 static void psci_idle_init_syscore(void)
 {
 	if (psci_cpuidle_use_syscore)
-		register_syscore_ops(&psci_idle_syscore_ops);
+		register_syscore(&psci_idle_syscore);
 }
 
 static void psci_idle_init_cpuhp(void)
diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c
index 52060b3ec745..d7666fe9dbf8 100644
--- a/drivers/gpio/gpio-mxc.c
+++ b/drivers/gpio/gpio-mxc.c
@@ -667,7 +667,7 @@ static const struct dev_pm_ops mxc_gpio_dev_pm_ops = {
 	RUNTIME_PM_OPS(mxc_gpio_runtime_suspend, mxc_gpio_runtime_resume, NULL)
 };
 
-static int mxc_gpio_syscore_suspend(void)
+static int mxc_gpio_syscore_suspend(void *data)
 {
 	struct mxc_gpio_port *port;
 	int ret;
@@ -684,7 +684,7 @@ static int mxc_gpio_syscore_suspend(void)
 	return 0;
 }
 
-static void mxc_gpio_syscore_resume(void)
+static void mxc_gpio_syscore_resume(void *data)
 {
 	struct mxc_gpio_port *port;
 	int ret;
@@ -701,11 +701,15 @@ static void mxc_gpio_syscore_resume(void)
 	}
 }
 
-static struct syscore_ops mxc_gpio_syscore_ops = {
+static const struct syscore_ops mxc_gpio_syscore_ops = {
 	.suspend = mxc_gpio_syscore_suspend,
 	.resume = mxc_gpio_syscore_resume,
 };
 
+static struct syscore mxc_gpio_syscore = {
+	.ops = &mxc_gpio_syscore_ops,
+};
+
 static struct platform_driver mxc_gpio_driver = {
 	.driver		= {
 		.name	= "gpio-mxc",
@@ -718,7 +722,7 @@ static struct platform_driver mxc_gpio_driver = {
 
 static int __init gpio_mxc_init(void)
 {
-	register_syscore_ops(&mxc_gpio_syscore_ops);
+	register_syscore(&mxc_gpio_syscore);
 
 	return platform_driver_register(&mxc_gpio_driver);
 }
diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c
index fa22f3faa163..664cf1eef494 100644
--- a/drivers/gpio/gpio-pxa.c
+++ b/drivers/gpio/gpio-pxa.c
@@ -747,7 +747,7 @@ static int __init pxa_gpio_dt_init(void)
 device_initcall(pxa_gpio_dt_init);
 
 #ifdef CONFIG_PM
-static int pxa_gpio_suspend(void)
+static int pxa_gpio_suspend(void *data)
 {
 	struct pxa_gpio_chip *pchip = pxa_gpio_chip;
 	struct pxa_gpio_bank *c;
@@ -768,7 +768,7 @@ static int pxa_gpio_suspend(void)
 	return 0;
 }
 
-static void pxa_gpio_resume(void)
+static void pxa_gpio_resume(void *data)
 {
 	struct pxa_gpio_chip *pchip = pxa_gpio_chip;
 	struct pxa_gpio_bank *c;
@@ -792,14 +792,18 @@ static void pxa_gpio_resume(void)
 #define pxa_gpio_resume		NULL
 #endif
 
-static struct syscore_ops pxa_gpio_syscore_ops = {
+static const struct syscore_ops pxa_gpio_syscore_ops = {
 	.suspend	= pxa_gpio_suspend,
 	.resume		= pxa_gpio_resume,
 };
 
+static struct syscore pxa_gpio_syscore = {
+	.ops = &pxa_gpio_syscore_ops,
+};
+
 static int __init pxa_gpio_sysinit(void)
 {
-	register_syscore_ops(&pxa_gpio_syscore_ops);
+	register_syscore(&pxa_gpio_syscore);
 	return 0;
 }
 postcore_initcall(pxa_gpio_sysinit);
diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c
index 7f6a62f5d1ee..1938ffa2f4f3 100644
--- a/drivers/gpio/gpio-sa1100.c
+++ b/drivers/gpio/gpio-sa1100.c
@@ -256,7 +256,7 @@ static void sa1100_gpio_handler(struct irq_desc *desc)
 	} while (mask);
 }
 
-static int sa1100_gpio_suspend(void)
+static int sa1100_gpio_suspend(void *data)
 {
 	struct sa1100_gpio_chip *sgc = &sa1100_gpio_chip;
 
@@ -275,19 +275,23 @@ static int sa1100_gpio_suspend(void)
 	return 0;
 }
 
-static void sa1100_gpio_resume(void)
+static void sa1100_gpio_resume(void *data)
 {
 	sa1100_update_edge_regs(&sa1100_gpio_chip);
 }
 
-static struct syscore_ops sa1100_gpio_syscore_ops = {
+static const struct syscore_ops sa1100_gpio_syscore_ops = {
 	.suspend	= sa1100_gpio_suspend,
 	.resume		= sa1100_gpio_resume,
 };
 
+static struct syscore sa1100_gpio_syscore = {
+	.ops = &sa1100_gpio_syscore_ops,
+};
+
 static int __init sa1100_gpio_init_devicefs(void)
 {
-	register_syscore_ops(&sa1100_gpio_syscore_ops);
+	register_syscore(&sa1100_gpio_syscore);
 	return 0;
 }
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 69591dc7bad2..67734dc73e16 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2801,7 +2801,7 @@ static void hv_crash_handler(struct pt_regs *regs)
 	hv_synic_disable_regs(cpu);
 };
 
-static int hv_synic_suspend(void)
+static int hv_synic_suspend(void *data)
 {
 	/*
 	 * When we reach here, all the non-boot CPUs have been offlined.
@@ -2828,7 +2828,7 @@ static int hv_synic_suspend(void)
 	return 0;
 }
 
-static void hv_synic_resume(void)
+static void hv_synic_resume(void *data)
 {
 	hv_synic_enable_regs(0);
 
@@ -2840,11 +2840,15 @@ static void hv_synic_resume(void)
 }
 
 /* The callbacks run only on CPU0, with irqs_disabled. */
-static struct syscore_ops hv_synic_syscore_ops = {
+static const struct syscore_ops hv_synic_syscore_ops = {
 	.suspend = hv_synic_suspend,
 	.resume = hv_synic_resume,
 };
 
+static struct syscore hv_synic_syscore = {
+	.ops = &hv_synic_syscore_ops,
+};
+
 static int __init hv_acpi_init(void)
 {
 	int ret;
@@ -2887,7 +2891,7 @@ static int __init hv_acpi_init(void)
 	hv_setup_kexec_handler(hv_kexec_handler);
 	hv_setup_crash_handler(hv_crash_handler);
 
-	register_syscore_ops(&hv_synic_syscore_ops);
+	register_syscore(&hv_synic_syscore);
 
 	return 0;
 
@@ -2901,7 +2905,7 @@ static void __exit vmbus_exit(void)
 {
 	int cpu;
 
-	unregister_syscore_ops(&hv_synic_syscore_ops);
+	unregister_syscore(&hv_synic_syscore);
 
 	hv_remove_kexec_handler();
 	hv_remove_crash_handler();
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..b763f4c9c5a7 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3024,7 +3024,7 @@ static void disable_iommus(void)
  * disable suspend until real resume implemented
  */
 
-static void amd_iommu_resume(void)
+static void amd_iommu_resume(void *data)
 {
 	struct amd_iommu *iommu;
 
@@ -3038,7 +3038,7 @@ static void amd_iommu_resume(void)
 	amd_iommu_enable_interrupts();
 }
 
-static int amd_iommu_suspend(void)
+static int amd_iommu_suspend(void *data)
 {
 	/* disable IOMMUs to go out of the way for BIOS */
 	disable_iommus();
@@ -3046,11 +3046,15 @@ static int amd_iommu_suspend(void)
 	return 0;
 }
 
-static struct syscore_ops amd_iommu_syscore_ops = {
+static const struct syscore_ops amd_iommu_syscore_ops = {
 	.suspend = amd_iommu_suspend,
 	.resume = amd_iommu_resume,
 };
 
+static struct syscore amd_iommu_syscore = {
+	.ops = &amd_iommu_syscore_ops,
+};
+
 static void __init free_iommu_resources(void)
 {
 	free_iommu_all();
@@ -3395,7 +3399,7 @@ static int __init state_next(void)
 		init_state = IOMMU_ENABLED;
 		break;
 	case IOMMU_ENABLED:
-		register_syscore_ops(&amd_iommu_syscore_ops);
+		register_syscore(&amd_iommu_syscore);
 		iommu_snp_enable();
 		ret = amd_iommu_init_pci();
 		init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT;
@@ -3498,12 +3502,12 @@ int __init amd_iommu_enable(void)
 
 void amd_iommu_disable(void)
 {
-	amd_iommu_suspend();
+	amd_iommu_suspend(NULL);
 }
 
 int amd_iommu_reenable(int mode)
 {
-	amd_iommu_resume();
+	amd_iommu_resume(NULL);
 
 	return 0;
 }
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e236c7ec221f..fdaf7f64dd33 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2303,7 +2303,7 @@ static void iommu_flush_all(void)
 	}
 }
 
-static int iommu_suspend(void)
+static int iommu_suspend(void *data)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu = NULL;
@@ -2330,7 +2330,7 @@ static int iommu_suspend(void)
 	return 0;
 }
 
-static void iommu_resume(void)
+static void iommu_resume(void *data)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu = NULL;
@@ -2361,14 +2361,18 @@ static void iommu_resume(void)
 	}
 }
 
-static struct syscore_ops iommu_syscore_ops = {
+static const struct syscore_ops iommu_syscore_ops = {
 	.resume		= iommu_resume,
 	.suspend	= iommu_suspend,
 };
 
+static struct syscore iommu_syscore = {
+	.ops = &iommu_syscore_ops,
+};
+
 static void __init init_iommu_pm_ops(void)
 {
-	register_syscore_ops(&iommu_syscore_ops);
+	register_syscore(&iommu_syscore);
 }
 
 #else
diff --git a/drivers/irqchip/exynos-combiner.c b/drivers/irqchip/exynos-combiner.c
index e7dfcf0cda43..495848442b35 100644
--- a/drivers/irqchip/exynos-combiner.c
+++ b/drivers/irqchip/exynos-combiner.c
@@ -200,12 +200,13 @@ static void __init combiner_init(void __iomem *combiner_base,
 
 /**
  * combiner_suspend - save interrupt combiner state before suspend
+ * @data: syscore context
  *
  * Save the interrupt enable set register for all combiner groups since
  * the state is lost when the system enters into a sleep state.
  *
  */
-static int combiner_suspend(void)
+static int combiner_suspend(void *data)
 {
 	int i;
 
@@ -218,12 +219,13 @@ static int combiner_suspend(void)
 
 /**
  * combiner_resume - restore interrupt combiner state after resume
+ * @data: syscore context
  *
  * Restore the interrupt enable set register for all combiner groups since
  * the state is lost when the system enters into a sleep state on suspend.
  *
  */
-static void combiner_resume(void)
+static void combiner_resume(void *data)
 {
 	int i;
 
@@ -240,11 +242,15 @@ static void combiner_resume(void)
 #define combiner_resume		NULL
 #endif
 
-static struct syscore_ops combiner_syscore_ops = {
+static const struct syscore_ops combiner_syscore_ops = {
 	.suspend	= combiner_suspend,
 	.resume		= combiner_resume,
 };
 
+static struct syscore combiner_syscore = {
+	.ops = &combiner_syscore_ops,
+};
+
 static int __init combiner_of_init(struct device_node *np,
 				   struct device_node *parent)
 {
@@ -264,7 +270,7 @@ static int __init combiner_of_init(struct device_node *np,
 
 	combiner_init(combiner_base, np);
 
-	register_syscore_ops(&combiner_syscore_ops);
+	register_syscore(&combiner_syscore);
 
 	return 0;
 }
diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c
index a44c49e985b7..a4d03a2d1569 100644
--- a/drivers/irqchip/irq-armada-370-xp.c
+++ b/drivers/irqchip/irq-armada-370-xp.c
@@ -726,7 +726,7 @@ static void __exception_irq_entry mpic_handle_irq(struct pt_regs *regs)
 	} while (1);
 }
 
-static int mpic_suspend(void)
+static int mpic_suspend(void *data)
 {
 	struct mpic *mpic = mpic_data;
 
@@ -735,7 +735,7 @@ static int mpic_suspend(void)
 	return 0;
 }
 
-static void mpic_resume(void)
+static void mpic_resume(void *data)
 {
 	struct mpic *mpic = mpic_data;
 	bool src0, src1;
@@ -788,11 +788,15 @@ static void mpic_resume(void)
 		mpic_ipi_resume(mpic);
 }
 
-static struct syscore_ops mpic_syscore_ops = {
+static const struct syscore_ops mpic_syscore_ops = {
 	.suspend	= mpic_suspend,
 	.resume		= mpic_resume,
 };
 
+static struct syscore mpic_syscore = {
+	.ops = &mpic_syscore_ops,
+};
+
 static int __init mpic_map_region(struct device_node *np, int index,
 				  void __iomem **base, phys_addr_t *phys_base)
 {
@@ -905,7 +909,7 @@ static int __init mpic_of_init(struct device_node *node, struct device_node *par
 						 mpic_handle_cascade_irq, mpic);
 	}
 
-	register_syscore_ops(&mpic_syscore_ops);
+	register_syscore(&mpic_syscore);
 
 	return 0;
 }
diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c
index 04fac0cc857f..cd9a56459f99 100644
--- a/drivers/irqchip/irq-bcm7038-l1.c
+++ b/drivers/irqchip/irq-bcm7038-l1.c
@@ -292,7 +292,7 @@ static int __init bcm7038_l1_init_one(struct device_node *dn,
 static LIST_HEAD(bcm7038_l1_intcs_list);
 static DEFINE_RAW_SPINLOCK(bcm7038_l1_intcs_lock);
 
-static int bcm7038_l1_suspend(void)
+static int bcm7038_l1_suspend(void *data)
 {
 	struct bcm7038_l1_chip *intc;
 	int boot_cpu, word;
@@ -318,7 +318,7 @@ static int bcm7038_l1_suspend(void)
 	return 0;
 }
 
-static void bcm7038_l1_resume(void)
+static void bcm7038_l1_resume(void *data)
 {
 	struct bcm7038_l1_chip *intc;
 	int boot_cpu, word;
@@ -339,11 +339,15 @@ static void bcm7038_l1_resume(void)
 	}
 }
 
-static struct syscore_ops bcm7038_l1_syscore_ops = {
+static const struct syscore_ops bcm7038_l1_syscore_ops = {
 	.suspend	= bcm7038_l1_suspend,
 	.resume		= bcm7038_l1_resume,
 };
 
+static struct syscore bcm7038_l1_syscore = {
+	.ops = &bcm7038_l1_syscore_ops,
+};
+
 static int bcm7038_l1_set_wake(struct irq_data *d, unsigned int on)
 {
 	struct bcm7038_l1_chip *intc = irq_data_get_irq_chip_data(d);
@@ -431,7 +435,7 @@ static int __init bcm7038_l1_of_init(struct device_node *dn,
 	raw_spin_unlock(&bcm7038_l1_intcs_lock);
 
 	if (list_is_singular(&bcm7038_l1_intcs_list))
-		register_syscore_ops(&bcm7038_l1_syscore_ops);
+		register_syscore(&bcm7038_l1_syscore);
 #endif
 
 	pr_info("registered BCM7038 L1 intc (%pOF, IRQs: %d)\n",
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 467cb78435a9..ada585bfa451 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -4992,7 +4992,7 @@ static void its_enable_quirks(struct its_node *its)
 				     its_quirks, its);
 }
 
-static int its_save_disable(void)
+static int its_save_disable(void *data)
 {
 	struct its_node *its;
 	int err = 0;
@@ -5028,7 +5028,7 @@ err:
 	return err;
 }
 
-static void its_restore_enable(void)
+static void its_restore_enable(void *data)
 {
 	struct its_node *its;
 	int ret;
@@ -5088,11 +5088,15 @@ static void its_restore_enable(void)
 	raw_spin_unlock(&its_lock);
 }
 
-static struct syscore_ops its_syscore_ops = {
+static const struct syscore_ops its_syscore_ops = {
 	.suspend = its_save_disable,
 	.resume = its_restore_enable,
 };
 
+static struct syscore its_syscore = {
+	.ops = &its_syscore_ops,
+};
+
 static void __init __iomem *its_map_one(struct resource *res, int *err)
 {
 	void __iomem *its_base;
@@ -5864,7 +5868,7 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
 		}
 	}
 
-	register_syscore_ops(&its_syscore_ops);
+	register_syscore(&its_syscore);
 
 	return 0;
 }
diff --git a/drivers/irqchip/irq-i8259.c b/drivers/irqchip/irq-i8259.c
index 91b2f587119c..cca77f9948a3 100644
--- a/drivers/irqchip/irq-i8259.c
+++ b/drivers/irqchip/irq-i8259.c
@@ -202,13 +202,13 @@ spurious_8259A_irq:
 	}
 }
 
-static void i8259A_resume(void)
+static void i8259A_resume(void *data)
 {
 	if (i8259A_auto_eoi >= 0)
 		init_8259A(i8259A_auto_eoi);
 }
 
-static void i8259A_shutdown(void)
+static void i8259A_shutdown(void *data)
 {
 	/* Put the i8259A into a quiescent state that
 	 * the kernel initialization code can get it
@@ -220,11 +220,15 @@ static void i8259A_shutdown(void)
 	}
 }
 
-static struct syscore_ops i8259_syscore_ops = {
+static const struct syscore_ops i8259_syscore_ops = {
 	.resume = i8259A_resume,
 	.shutdown = i8259A_shutdown,
 };
 
+static struct syscore i8259_syscore = {
+	.ops = &i8259_syscore_ops,
+};
+
 static void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
@@ -320,7 +324,7 @@ struct irq_domain * __init __init_i8259_irqs(struct device_node *node)
 
 	if (request_irq(irq, no_action, IRQF_NO_THREAD, "cascade", NULL))
 		pr_err("Failed to register cascade interrupt\n");
-	register_syscore_ops(&i8259_syscore_ops);
+	register_syscore(&i8259_syscore);
 	return domain;
 }
 
diff --git a/drivers/irqchip/irq-imx-gpcv2.c b/drivers/irqchip/irq-imx-gpcv2.c
index b91f5c14b405..04f7ba0657be 100644
--- a/drivers/irqchip/irq-imx-gpcv2.c
+++ b/drivers/irqchip/irq-imx-gpcv2.c
@@ -33,7 +33,7 @@ static void __iomem *gpcv2_idx_to_reg(struct gpcv2_irqchip_data *cd, int i)
 	return cd->gpc_base + cd->cpu2wakeup + i * 4;
 }
 
-static int gpcv2_wakeup_source_save(void)
+static int gpcv2_wakeup_source_save(void *data)
 {
 	struct gpcv2_irqchip_data *cd;
 	void __iomem *reg;
@@ -52,7 +52,7 @@ static int gpcv2_wakeup_source_save(void)
 	return 0;
 }
 
-static void gpcv2_wakeup_source_restore(void)
+static void gpcv2_wakeup_source_restore(void *data)
 {
 	struct gpcv2_irqchip_data *cd;
 	int i;
@@ -65,9 +65,13 @@ static void gpcv2_wakeup_source_restore(void)
 		writel_relaxed(cd->saved_irq_mask[i], gpcv2_idx_to_reg(cd, i));
 }
 
-static struct syscore_ops imx_gpcv2_syscore_ops = {
-	.suspend	= gpcv2_wakeup_source_save,
-	.resume		= gpcv2_wakeup_source_restore,
+static const struct syscore_ops gpcv2_syscore_ops = {
+	.suspend = gpcv2_wakeup_source_save,
+	.resume = gpcv2_wakeup_source_restore,
+};
+
+static struct syscore gpcv2_syscore = {
+	.ops = &gpcv2_syscore_ops,
 };
 
 static int imx_gpcv2_irq_set_wake(struct irq_data *d, unsigned int on)
@@ -276,7 +280,7 @@ static int __init imx_gpcv2_irqchip_init(struct device_node *node,
 	writel_relaxed(~0x1, cd->gpc_base + cd->cpu2wakeup);
 
 	imx_gpcv2_instance = cd;
-	register_syscore_ops(&imx_gpcv2_syscore_ops);
+	register_syscore(&gpcv2_syscore);
 
 	/*
 	 * Clear the OF_POPULATED flag set in of_irq_init so that
diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c
index 39e5a72ccd3c..ad2105685b48 100644
--- a/drivers/irqchip/irq-loongson-eiointc.c
+++ b/drivers/irqchip/irq-loongson-eiointc.c
@@ -407,21 +407,25 @@ static struct irq_domain *acpi_get_vec_parent(int node, struct acpi_vector_group
 	return NULL;
 }
 
-static int eiointc_suspend(void)
+static int eiointc_suspend(void *data)
 {
 	return 0;
 }
 
-static void eiointc_resume(void)
+static void eiointc_resume(void *data)
 {
 	eiointc_router_init(0);
 }
 
-static struct syscore_ops eiointc_syscore_ops = {
+static const struct syscore_ops eiointc_syscore_ops = {
 	.suspend = eiointc_suspend,
 	.resume = eiointc_resume,
 };
 
+static struct syscore eiointc_syscore = {
+	.ops = &eiointc_syscore_ops,
+};
+
 static int __init pch_pic_parse_madt(union acpi_subtable_headers *header,
 					const unsigned long end)
 {
@@ -540,7 +544,7 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq,
 	eiointc_router_init(0);
 
 	if (nr_pics == 1) {
-		register_syscore_ops(&eiointc_syscore_ops);
+		register_syscore(&eiointc_syscore);
 		cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_EIOINTC_STARTING,
 					  "irqchip/loongarch/eiointc:starting",
 					  eiointc_router_init, NULL);
diff --git a/drivers/irqchip/irq-loongson-htpic.c b/drivers/irqchip/irq-loongson-htpic.c
index f4abdf156de7..1c691c4be989 100644
--- a/drivers/irqchip/irq-loongson-htpic.c
+++ b/drivers/irqchip/irq-loongson-htpic.c
@@ -71,15 +71,19 @@ static void htpic_reg_init(void)
 	writel(0xffff, htpic->base + HTINT_EN_OFF);
 }
 
-static void htpic_resume(void)
+static void htpic_resume(void *data)
 {
 	htpic_reg_init();
 }
 
-struct syscore_ops htpic_syscore_ops = {
+static const struct syscore_ops htpic_syscore_ops = {
 	.resume		= htpic_resume,
 };
 
+static struct syscore htpic_syscore = {
+	.ops = &htpic_syscore_ops,
+};
+
 static int __init htpic_of_init(struct device_node *node, struct device_node *parent)
 {
 	unsigned int parent_irq[4];
@@ -130,7 +134,7 @@ static int __init htpic_of_init(struct device_node *node, struct device_node *pa
 						htpic_irq_dispatch, htpic);
 	}
 
-	register_syscore_ops(&htpic_syscore_ops);
+	register_syscore(&htpic_syscore);
 
 	return 0;
 
diff --git a/drivers/irqchip/irq-loongson-htvec.c b/drivers/irqchip/irq-loongson-htvec.c
index d8558eb35044..d2be8e954e92 100644
--- a/drivers/irqchip/irq-loongson-htvec.c
+++ b/drivers/irqchip/irq-loongson-htvec.c
@@ -159,7 +159,7 @@ static void htvec_reset(struct htvec *priv)
 	}
 }
 
-static int htvec_suspend(void)
+static int htvec_suspend(void *data)
 {
 	int i;
 
@@ -169,7 +169,7 @@ static int htvec_suspend(void)
 	return 0;
 }
 
-static void htvec_resume(void)
+static void htvec_resume(void *data)
 {
 	int i;
 
@@ -177,11 +177,15 @@ static void htvec_resume(void)
 		writel(htvec_priv->saved_vec_en[i], htvec_priv->base + HTVEC_EN_OFF + 4 * i);
 }
 
-static struct syscore_ops htvec_syscore_ops = {
+static const struct syscore_ops htvec_syscore_ops = {
 	.suspend = htvec_suspend,
 	.resume = htvec_resume,
 };
 
+static struct syscore htvec_syscore = {
+	.ops = &htvec_syscore_ops,
+};
+
 static int htvec_init(phys_addr_t addr, unsigned long size,
 		int num_parents, int parent_irq[], struct fwnode_handle *domain_handle)
 {
@@ -214,7 +218,7 @@ static int htvec_init(phys_addr_t addr, unsigned long size,
 
 	htvec_priv = priv;
 
-	register_syscore_ops(&htvec_syscore_ops);
+	register_syscore(&htvec_syscore);
 
 	return 0;
 
diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c
index 912bf50a5c7c..3a125f3e4287 100644
--- a/drivers/irqchip/irq-loongson-pch-lpc.c
+++ b/drivers/irqchip/irq-loongson-pch-lpc.c
@@ -151,7 +151,7 @@ static int pch_lpc_disabled(struct pch_lpc *priv)
 			(readl(priv->base + LPC_INT_STS) == 0xffffffff);
 }
 
-static int pch_lpc_suspend(void)
+static int pch_lpc_suspend(void *data)
 {
 	pch_lpc_priv->saved_reg_ctl = readl(pch_lpc_priv->base + LPC_INT_CTL);
 	pch_lpc_priv->saved_reg_ena = readl(pch_lpc_priv->base + LPC_INT_ENA);
@@ -159,18 +159,22 @@ static int pch_lpc_suspend(void)
 	return 0;
 }
 
-static void pch_lpc_resume(void)
+static void pch_lpc_resume(void *data)
 {
 	writel(pch_lpc_priv->saved_reg_ctl, pch_lpc_priv->base + LPC_INT_CTL);
 	writel(pch_lpc_priv->saved_reg_ena, pch_lpc_priv->base + LPC_INT_ENA);
 	writel(pch_lpc_priv->saved_reg_pol, pch_lpc_priv->base + LPC_INT_POL);
 }
 
-static struct syscore_ops pch_lpc_syscore_ops = {
+static const struct syscore_ops pch_lpc_syscore_ops = {
 	.suspend = pch_lpc_suspend,
 	.resume = pch_lpc_resume,
 };
 
+static struct syscore pch_lpc_syscore = {
+	.ops = &pch_lpc_syscore_ops,
+};
+
 int __init pch_lpc_acpi_init(struct irq_domain *parent,
 					struct acpi_madt_lpc_pic *acpi_pchlpc)
 {
@@ -222,7 +226,7 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent,
 
 	pch_lpc_priv = priv;
 	pch_lpc_handle = irq_handle;
-	register_syscore_ops(&pch_lpc_syscore_ops);
+	register_syscore(&pch_lpc_syscore);
 
 	return 0;
 
diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c
index 62e6bf3a0611..c6b369a974a7 100644
--- a/drivers/irqchip/irq-loongson-pch-pic.c
+++ b/drivers/irqchip/irq-loongson-pch-pic.c
@@ -278,7 +278,7 @@ static void pch_pic_reset(struct pch_pic *priv)
 	}
 }
 
-static int pch_pic_suspend(void)
+static int pch_pic_suspend(void *data)
 {
 	int i, j;
 
@@ -296,7 +296,7 @@ static int pch_pic_suspend(void)
 	return 0;
 }
 
-static void pch_pic_resume(void)
+static void pch_pic_resume(void *data)
 {
 	int i, j;
 
@@ -313,11 +313,15 @@ static void pch_pic_resume(void)
 	}
 }
 
-static struct syscore_ops pch_pic_syscore_ops = {
+static const struct syscore_ops pch_pic_syscore_ops = {
 	.suspend =  pch_pic_suspend,
 	.resume =  pch_pic_resume,
 };
 
+static struct syscore pch_pic_syscore = {
+	.ops = &pch_pic_syscore_ops,
+};
+
 static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base,
 			struct irq_domain *parent_domain, struct fwnode_handle *domain_handle,
 			u32 gsi_base)
@@ -356,7 +360,7 @@ static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base,
 	pch_pic_priv[nr_pics++] = priv;
 
 	if (nr_pics == 1)
-		register_syscore_ops(&pch_pic_syscore_ops);
+		register_syscore(&pch_pic_syscore);
 
 	return 0;
 
diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c
index 516a3a0e359c..be83e6e422c3 100644
--- a/drivers/irqchip/irq-mchp-eic.c
+++ b/drivers/irqchip/irq-mchp-eic.c
@@ -109,7 +109,7 @@ static int mchp_eic_irq_set_wake(struct irq_data *d, unsigned int on)
 	return 0;
 }
 
-static int mchp_eic_irq_suspend(void)
+static int mchp_eic_irq_suspend(void *data)
 {
 	unsigned int hwirq;
 
@@ -123,7 +123,7 @@ static int mchp_eic_irq_suspend(void)
 	return 0;
 }
 
-static void mchp_eic_irq_resume(void)
+static void mchp_eic_irq_resume(void *data)
 {
 	unsigned int hwirq;
 
@@ -135,11 +135,15 @@ static void mchp_eic_irq_resume(void)
 			       MCHP_EIC_SCFG(hwirq));
 }
 
-static struct syscore_ops mchp_eic_syscore_ops = {
+static const struct syscore_ops mchp_eic_syscore_ops = {
 	.suspend = mchp_eic_irq_suspend,
 	.resume = mchp_eic_irq_resume,
 };
 
+static struct syscore mchp_eic_syscore = {
+	.ops = &mchp_eic_syscore_ops,
+};
+
 static struct irq_chip mchp_eic_chip = {
 	.name		= "eic",
 	.flags		= IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SET_TYPE_MASKED,
@@ -257,7 +261,7 @@ static int mchp_eic_init(struct device_node *node, struct device_node *parent)
 		goto clk_unprepare;
 	}
 
-	register_syscore_ops(&mchp_eic_syscore_ops);
+	register_syscore(&mchp_eic_syscore);
 
 	pr_info("%pOF: EIC registered, nr_irqs %u\n", node, MCHP_EIC_NIRQ);
 
diff --git a/drivers/irqchip/irq-mst-intc.c b/drivers/irqchip/irq-mst-intc.c
index 9643cc3a77d7..7f760f555a76 100644
--- a/drivers/irqchip/irq-mst-intc.c
+++ b/drivers/irqchip/irq-mst-intc.c
@@ -143,7 +143,7 @@ static void mst_intc_polarity_restore(struct mst_intc_chip_data *cd)
 		writew_relaxed(cd->saved_polarity_conf[i], addr + i * 4);
 }
 
-static void mst_irq_resume(void)
+static void mst_irq_resume(void *data)
 {
 	struct mst_intc_chip_data *cd;
 
@@ -151,7 +151,7 @@ static void mst_irq_resume(void)
 		mst_intc_polarity_restore(cd);
 }
 
-static int mst_irq_suspend(void)
+static int mst_irq_suspend(void *data)
 {
 	struct mst_intc_chip_data *cd;
 
@@ -160,14 +160,18 @@ static int mst_irq_suspend(void)
 	return 0;
 }
 
-static struct syscore_ops mst_irq_syscore_ops = {
+static const struct syscore_ops mst_irq_syscore_ops = {
 	.suspend	= mst_irq_suspend,
 	.resume		= mst_irq_resume,
 };
 
+static struct syscore mst_irq_syscore = {
+	.ops = &mst_irq_syscore_ops,
+};
+
 static int __init mst_irq_pm_init(void)
 {
-	register_syscore_ops(&mst_irq_syscore_ops);
+	register_syscore(&mst_irq_syscore);
 	return 0;
 }
 late_initcall(mst_irq_pm_init);
diff --git a/drivers/irqchip/irq-mtk-cirq.c b/drivers/irqchip/irq-mtk-cirq.c
index de481ba340f8..9571f622774e 100644
--- a/drivers/irqchip/irq-mtk-cirq.c
+++ b/drivers/irqchip/irq-mtk-cirq.c
@@ -199,7 +199,7 @@ static const struct irq_domain_ops cirq_domain_ops = {
 };
 
 #ifdef CONFIG_PM_SLEEP
-static int mtk_cirq_suspend(void)
+static int mtk_cirq_suspend(void *data)
 {
 	void __iomem *reg;
 	u32 value, mask;
@@ -257,7 +257,7 @@ static int mtk_cirq_suspend(void)
 	return 0;
 }
 
-static void mtk_cirq_resume(void)
+static void mtk_cirq_resume(void *data)
 {
 	void __iomem *reg = mtk_cirq_reg(cirq_data, CIRQ_CONTROL);
 	u32 value;
@@ -272,14 +272,18 @@ static void mtk_cirq_resume(void)
 	writel_relaxed(value, reg);
 }
 
-static struct syscore_ops mtk_cirq_syscore_ops = {
+static const struct syscore_ops mtk_cirq_syscore_ops = {
 	.suspend	= mtk_cirq_suspend,
 	.resume		= mtk_cirq_resume,
 };
 
+static struct syscore mtk_cirq_syscore = {
+	.ops = &mtk_cirq_syscore_ops,
+};
+
 static void mtk_cirq_syscore_init(void)
 {
-	register_syscore_ops(&mtk_cirq_syscore_ops);
+	register_syscore(&mtk_cirq_syscore);
 }
 #else
 static inline void mtk_cirq_syscore_init(void) {}
diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c
index 2a54adeb4cc7..de88e80eac0c 100644
--- a/drivers/irqchip/irq-renesas-rzg2l.c
+++ b/drivers/irqchip/irq-renesas-rzg2l.c
@@ -399,7 +399,7 @@ static int rzg2l_irqc_set_type(struct irq_data *d, unsigned int type)
 	return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH);
 }
 
-static int rzg2l_irqc_irq_suspend(void)
+static int rzg2l_irqc_irq_suspend(void *data)
 {
 	struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache;
 	void __iomem *base = rzg2l_irqc_data->base;
@@ -411,7 +411,7 @@ static int rzg2l_irqc_irq_suspend(void)
 	return 0;
 }
 
-static void rzg2l_irqc_irq_resume(void)
+static void rzg2l_irqc_irq_resume(void *data)
 {
 	struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache;
 	void __iomem *base = rzg2l_irqc_data->base;
@@ -426,11 +426,15 @@ static void rzg2l_irqc_irq_resume(void)
 	writel_relaxed(cache->iitsr, base + IITSR);
 }
 
-static struct syscore_ops rzg2l_irqc_syscore_ops = {
+static const struct syscore_ops rzg2l_irqc_syscore_ops = {
 	.suspend	= rzg2l_irqc_irq_suspend,
 	.resume		= rzg2l_irqc_irq_resume,
 };
 
+static struct syscore rzg2l_irqc_syscore = {
+	.ops = &rzg2l_irqc_syscore_ops,
+};
+
 static const struct irq_chip rzg2l_irqc_chip = {
 	.name			= "rzg2l-irqc",
 	.irq_eoi		= rzg2l_irqc_eoi,
@@ -581,7 +585,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node *
 		return -ENOMEM;
 	}
 
-	register_syscore_ops(&rzg2l_irqc_syscore_ops);
+	register_syscore(&rzg2l_irqc_syscore);
 
 	/*
 	 * Prevent the cleanup function from invoking put_device by assigning
diff --git a/drivers/irqchip/irq-sa11x0.c b/drivers/irqchip/irq-sa11x0.c
index d8d4dff16276..e5f24c5f3f41 100644
--- a/drivers/irqchip/irq-sa11x0.c
+++ b/drivers/irqchip/irq-sa11x0.c
@@ -85,7 +85,7 @@ static struct sa1100irq_state {
 	unsigned int	iccr;
 } sa1100irq_state;
 
-static int sa1100irq_suspend(void)
+static int sa1100irq_suspend(void *data)
 {
 	struct sa1100irq_state *st = &sa1100irq_state;
 
@@ -102,7 +102,7 @@ static int sa1100irq_suspend(void)
 	return 0;
 }
 
-static void sa1100irq_resume(void)
+static void sa1100irq_resume(void *data)
 {
 	struct sa1100irq_state *st = &sa1100irq_state;
 
@@ -114,14 +114,18 @@ static void sa1100irq_resume(void)
 	}
 }
 
-static struct syscore_ops sa1100irq_syscore_ops = {
+static const struct syscore_ops sa1100irq_syscore_ops = {
 	.suspend	= sa1100irq_suspend,
 	.resume		= sa1100irq_resume,
 };
 
+static struct syscore sa1100irq_syscore = {
+	.ops = &sa1100irq_syscore_ops,
+};
+
 static int __init sa1100irq_init_devicefs(void)
 {
-	register_syscore_ops(&sa1100irq_syscore_ops);
+	register_syscore(&sa1100irq_syscore);
 	return 0;
 }
 
diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
index cbd7697bc148..4f59c0ca1537 100644
--- a/drivers/irqchip/irq-sifive-plic.c
+++ b/drivers/irqchip/irq-sifive-plic.c
@@ -245,7 +245,7 @@ static int plic_irq_set_type(struct irq_data *d, unsigned int type)
 	return IRQ_SET_MASK_OK;
 }
 
-static int plic_irq_suspend(void)
+static int plic_irq_suspend(void *data)
 {
 	unsigned int i, cpu;
 	unsigned long flags;
@@ -277,7 +277,7 @@ static int plic_irq_suspend(void)
 	return 0;
 }
 
-static void plic_irq_resume(void)
+static void plic_irq_resume(void *data)
 {
 	unsigned int i, index, cpu;
 	unsigned long flags;
@@ -308,11 +308,15 @@ static void plic_irq_resume(void)
 	}
 }
 
-static struct syscore_ops plic_irq_syscore_ops = {
+static const struct syscore_ops plic_irq_syscore_ops = {
 	.suspend	= plic_irq_suspend,
 	.resume		= plic_irq_resume,
 };
 
+static struct syscore plic_irq_syscore = {
+	.ops = &plic_irq_syscore_ops,
+};
+
 static int plic_irqdomain_map(struct irq_domain *d, unsigned int irq,
 			      irq_hw_number_t hwirq)
 {
@@ -678,7 +682,7 @@ done:
 			cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 					  "irqchip/sifive/plic:starting",
 					  plic_starting_cpu, plic_dying_cpu);
-			register_syscore_ops(&plic_irq_syscore_ops);
+			register_syscore(&plic_irq_syscore);
 			plic_global_setup_done = true;
 		}
 	}
diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c
index 37d4b29763bc..23251831c06e 100644
--- a/drivers/irqchip/irq-sun6i-r.c
+++ b/drivers/irqchip/irq-sun6i-r.c
@@ -268,7 +268,7 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = {
 	.free		= irq_domain_free_irqs_common,
 };
 
-static int sun6i_r_intc_suspend(void)
+static int sun6i_r_intc_suspend(void *data)
 {
 	u32 buf[BITS_TO_U32(MAX(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))];
 	int i;
@@ -284,7 +284,7 @@ static int sun6i_r_intc_suspend(void)
 	return 0;
 }
 
-static void sun6i_r_intc_resume(void)
+static void sun6i_r_intc_resume(void *data)
 {
 	int i;
 
@@ -294,17 +294,21 @@ static void sun6i_r_intc_resume(void)
 		writel_relaxed(0, base + SUN6I_IRQ_ENABLE(i));
 }
 
-static void sun6i_r_intc_shutdown(void)
+static void sun6i_r_intc_shutdown(void *data)
 {
-	sun6i_r_intc_suspend();
+	sun6i_r_intc_suspend(data);
 }
 
-static struct syscore_ops sun6i_r_intc_syscore_ops = {
+static const struct syscore_ops sun6i_r_intc_syscore_ops = {
 	.suspend	= sun6i_r_intc_suspend,
 	.resume		= sun6i_r_intc_resume,
 	.shutdown	= sun6i_r_intc_shutdown,
 };
 
+static struct syscore sun6i_r_intc_syscore = {
+	.ops = &sun6i_r_intc_syscore_ops,
+};
+
 static int __init sun6i_r_intc_init(struct device_node *node,
 				    struct device_node *parent,
 				    const struct sun6i_r_intc_variant *v)
@@ -346,10 +350,10 @@ static int __init sun6i_r_intc_init(struct device_node *node,
 		return -ENOMEM;
 	}
 
-	register_syscore_ops(&sun6i_r_intc_syscore_ops);
+	register_syscore(&sun6i_r_intc_syscore);
 
 	sun6i_r_intc_ack_nmi();
-	sun6i_r_intc_resume();
+	sun6i_r_intc_resume(NULL);
 
 	return 0;
 }
diff --git a/drivers/irqchip/irq-tegra.c b/drivers/irqchip/irq-tegra.c
index 66cbb9f77ff3..b6382cf6359a 100644
--- a/drivers/irqchip/irq-tegra.c
+++ b/drivers/irqchip/irq-tegra.c
@@ -132,7 +132,7 @@ static int tegra_set_wake(struct irq_data *d, unsigned int enable)
 	return 0;
 }
 
-static int tegra_ictlr_suspend(void)
+static int tegra_ictlr_suspend(void *data)
 {
 	unsigned long flags;
 	unsigned int i;
@@ -161,7 +161,7 @@ static int tegra_ictlr_suspend(void)
 	return 0;
 }
 
-static void tegra_ictlr_resume(void)
+static void tegra_ictlr_resume(void *data)
 {
 	unsigned long flags;
 	unsigned int i;
@@ -184,14 +184,18 @@ static void tegra_ictlr_resume(void)
 	local_irq_restore(flags);
 }
 
-static struct syscore_ops tegra_ictlr_syscore_ops = {
+static const struct syscore_ops tegra_ictlr_syscore_ops = {
 	.suspend	= tegra_ictlr_suspend,
 	.resume		= tegra_ictlr_resume,
 };
 
+static struct syscore tegra_ictlr_syscore = {
+	.ops = &tegra_ictlr_syscore_ops,
+};
+
 static void tegra_ictlr_syscore_init(void)
 {
-	register_syscore_ops(&tegra_ictlr_syscore_ops);
+	register_syscore(&tegra_ictlr_syscore);
 }
 #else
 #define tegra_set_wake	NULL
diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c
index 2bcdf216a000..e38104c5064e 100644
--- a/drivers/irqchip/irq-vic.c
+++ b/drivers/irqchip/irq-vic.c
@@ -120,7 +120,7 @@ static void resume_one_vic(struct vic_device *vic)
 	writel(~vic->soft_int, base + VIC_INT_SOFT_CLEAR);
 }
 
-static void vic_resume(void)
+static void vic_resume(void *data)
 {
 	int id;
 
@@ -146,7 +146,7 @@ static void suspend_one_vic(struct vic_device *vic)
 	writel(~vic->resume_irqs, base + VIC_INT_ENABLE_CLEAR);
 }
 
-static int vic_suspend(void)
+static int vic_suspend(void *data)
 {
 	int id;
 
@@ -156,11 +156,15 @@ static int vic_suspend(void)
 	return 0;
 }
 
-static struct syscore_ops vic_syscore_ops = {
+static const struct syscore_ops vic_syscore_ops = {
 	.suspend	= vic_suspend,
 	.resume		= vic_resume,
 };
 
+static struct syscore vic_syscore = {
+	.ops = &vic_syscore_ops,
+};
+
 /**
  * vic_pm_init - initcall to register VIC pm
  *
@@ -171,7 +175,7 @@ static struct syscore_ops vic_syscore_ops = {
 static int __init vic_pm_init(void)
 {
 	if (vic_id > 0)
-		register_syscore_ops(&vic_syscore_ops);
+		register_syscore(&vic_syscore);
 
 	return 0;
 }
diff --git a/drivers/leds/trigger/ledtrig-cpu.c b/drivers/leds/trigger/ledtrig-cpu.c
index 05848a2fecff..679323c2ccda 100644
--- a/drivers/leds/trigger/ledtrig-cpu.c
+++ b/drivers/leds/trigger/ledtrig-cpu.c
@@ -94,28 +94,32 @@ void ledtrig_cpu(enum cpu_led_event ledevt)
 }
 EXPORT_SYMBOL(ledtrig_cpu);
 
-static int ledtrig_cpu_syscore_suspend(void)
+static int ledtrig_cpu_syscore_suspend(void *data)
 {
 	ledtrig_cpu(CPU_LED_STOP);
 	return 0;
 }
 
-static void ledtrig_cpu_syscore_resume(void)
+static void ledtrig_cpu_syscore_resume(void *data)
 {
 	ledtrig_cpu(CPU_LED_START);
 }
 
-static void ledtrig_cpu_syscore_shutdown(void)
+static void ledtrig_cpu_syscore_shutdown(void *data)
 {
 	ledtrig_cpu(CPU_LED_HALTED);
 }
 
-static struct syscore_ops ledtrig_cpu_syscore_ops = {
+static const struct syscore_ops ledtrig_cpu_syscore_ops = {
 	.shutdown	= ledtrig_cpu_syscore_shutdown,
 	.suspend	= ledtrig_cpu_syscore_suspend,
 	.resume		= ledtrig_cpu_syscore_resume,
 };
 
+static struct syscore ledtrig_cpu_syscore = {
+	.ops = &ledtrig_cpu_syscore_ops,
+};
+
 static int ledtrig_online_cpu(unsigned int cpu)
 {
 	ledtrig_cpu(CPU_LED_START);
@@ -157,7 +161,7 @@ static int __init ledtrig_cpu_init(void)
 		led_trigger_register_simple(trig->name, &trig->_trig);
 	}
 
-	register_syscore_ops(&ledtrig_cpu_syscore_ops);
+	register_syscore(&ledtrig_cpu_syscore);
 
 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "leds/trigger:starting",
 				ledtrig_online_cpu, ledtrig_prepare_down_cpu);
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index b0f09c70f1ff..5fe47e784d43 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -2600,7 +2600,7 @@ void pmu_blink(int n)
 #if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32)
 int pmu_sys_suspended;
 
-static int pmu_syscore_suspend(void)
+static int pmu_syscore_suspend(void *data)
 {
 	/* Suspend PMU event interrupts */
 	pmu_suspend();
@@ -2614,7 +2614,7 @@ static int pmu_syscore_suspend(void)
 	return 0;
 }
 
-static void pmu_syscore_resume(void)
+static void pmu_syscore_resume(void *data)
 {
 	struct adb_request req;
 
@@ -2634,14 +2634,18 @@ static void pmu_syscore_resume(void)
 	pmu_sys_suspended = 0;
 }
 
-static struct syscore_ops pmu_syscore_ops = {
+static const struct syscore_ops pmu_syscore_ops = {
 	.suspend = pmu_syscore_suspend,
 	.resume = pmu_syscore_resume,
 };
 
+static struct syscore pmu_syscore = {
+	.ops = &pmu_syscore_ops,
+};
+
 static int pmu_syscore_register(void)
 {
-	register_syscore_ops(&pmu_syscore_ops);
+	register_syscore(&pmu_syscore);
 
 	return 0;
 }
diff --git a/drivers/power/reset/sc27xx-poweroff.c b/drivers/power/reset/sc27xx-poweroff.c
index 90287c31992c..393bd1c33b73 100644
--- a/drivers/power/reset/sc27xx-poweroff.c
+++ b/drivers/power/reset/sc27xx-poweroff.c
@@ -28,7 +28,7 @@ static struct regmap *regmap;
  * taking cpus down to avoid racing regmap or spi mutex lock when poweroff
  * system through PMIC.
  */
-static void sc27xx_poweroff_shutdown(void)
+static void sc27xx_poweroff_shutdown(void *data)
 {
 #ifdef CONFIG_HOTPLUG_CPU
 	int cpu;
@@ -40,10 +40,14 @@ static void sc27xx_poweroff_shutdown(void)
 #endif
 }
 
-static struct syscore_ops poweroff_syscore_ops = {
+static const struct syscore_ops poweroff_syscore_ops = {
 	.shutdown = sc27xx_poweroff_shutdown,
 };
 
+static struct syscore poweroff_syscore = {
+	.ops = &poweroff_syscore_ops,
+};
+
 static void sc27xx_poweroff_do_poweroff(void)
 {
 	/* Disable the external subsys connection's power firstly */
@@ -62,7 +66,7 @@ static int sc27xx_poweroff_probe(struct platform_device *pdev)
 		return -ENODEV;
 
 	pm_power_off = sc27xx_poweroff_do_poweroff;
-	register_syscore_ops(&poweroff_syscore_ops);
+	register_syscore(&poweroff_syscore);
 	return 0;
 }
 
diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c
index 7a73f5e4a1fc..f02e12dfa5f6 100644
--- a/drivers/sh/clk/core.c
+++ b/drivers/sh/clk/core.c
@@ -569,7 +569,7 @@ long clk_round_rate(struct clk *clk, unsigned long rate)
 EXPORT_SYMBOL_GPL(clk_round_rate);
 
 #ifdef CONFIG_PM
-static void clks_core_resume(void)
+static void clks_core_resume(void *data)
 {
 	struct clk *clkp;
 
@@ -588,13 +588,17 @@ static void clks_core_resume(void)
 	}
 }
 
-static struct syscore_ops clks_syscore_ops = {
+static const struct syscore_ops clks_syscore_ops = {
 	.resume = clks_core_resume,
 };
 
+static struct syscore clks_syscore = {
+	.ops = &clks_syscore_ops,
+};
+
 static int __init clk_syscore_init(void)
 {
-	register_syscore_ops(&clks_syscore_ops);
+	register_syscore(&clks_syscore);
 
 	return 0;
 }
diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c
index ea571eeb3078..3dde703b7766 100644
--- a/drivers/sh/intc/core.c
+++ b/drivers/sh/intc/core.c
@@ -394,7 +394,7 @@ err0:
 	return -ENOMEM;
 }
 
-static int intc_suspend(void)
+static int intc_suspend(void *data)
 {
 	struct intc_desc_int *d;
 
@@ -420,7 +420,7 @@ static int intc_suspend(void)
 	return 0;
 }
 
-static void intc_resume(void)
+static void intc_resume(void *data)
 {
 	struct intc_desc_int *d;
 
@@ -450,11 +450,15 @@ static void intc_resume(void)
 	}
 }
 
-struct syscore_ops intc_syscore_ops = {
+static const struct syscore_ops intc_syscore_ops = {
 	.suspend	= intc_suspend,
 	.resume		= intc_resume,
 };
 
+static struct syscore intc_syscore = {
+	.ops = &intc_syscore_ops,
+};
+
 const struct bus_type intc_subsys = {
 	.name		= "intc",
 	.dev_name	= "intc",
@@ -477,7 +481,7 @@ static int __init register_intc_devs(void)
 	struct intc_desc_int *d;
 	int error;
 
-	register_syscore_ops(&intc_syscore_ops);
+	register_syscore(&intc_syscore);
 
 	error = subsys_system_register(&intc_subsys, NULL);
 	if (!error) {
diff --git a/drivers/soc/bcm/brcmstb/biuctrl.c b/drivers/soc/bcm/brcmstb/biuctrl.c
index 364ddbe365c2..bd830649b60d 100644
--- a/drivers/soc/bcm/brcmstb/biuctrl.c
+++ b/drivers/soc/bcm/brcmstb/biuctrl.c
@@ -298,7 +298,7 @@ out:
 #ifdef CONFIG_PM_SLEEP
 static u32 cpubiuctrl_reg_save[NUM_CPU_BIUCTRL_REGS];
 
-static int brcmstb_cpu_credit_reg_suspend(void)
+static int brcmstb_cpu_credit_reg_suspend(void *data)
 {
 	unsigned int i;
 
@@ -311,7 +311,7 @@ static int brcmstb_cpu_credit_reg_suspend(void)
 	return 0;
 }
 
-static void brcmstb_cpu_credit_reg_resume(void)
+static void brcmstb_cpu_credit_reg_resume(void *data)
 {
 	unsigned int i;
 
@@ -322,10 +322,14 @@ static void brcmstb_cpu_credit_reg_resume(void)
 		cbc_writel(cpubiuctrl_reg_save[i], i);
 }
 
-static struct syscore_ops brcmstb_cpu_credit_syscore_ops = {
+static const struct syscore_ops brcmstb_cpu_credit_syscore_ops = {
 	.suspend = brcmstb_cpu_credit_reg_suspend,
 	.resume = brcmstb_cpu_credit_reg_resume,
 };
+
+static struct syscore brcmstb_cpu_credit_syscore = {
+	.ops = &brcmstb_cpu_credit_syscore_ops,
+};
 #endif
 
 
@@ -354,7 +358,7 @@ static int __init brcmstb_biuctrl_init(void)
 	a72_b53_rac_enable_all(np);
 	mcp_a72_b53_set();
 #ifdef CONFIG_PM_SLEEP
-	register_syscore_ops(&brcmstb_cpu_credit_syscore_ops);
+	register_syscore(&brcmstb_cpu_credit_syscore);
 #endif
 	ret = 0;
 out_put:
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 034a2a535a1e..93bbebd68001 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -466,7 +466,7 @@ struct tegra_pmc {
 	unsigned long *wake_type_dual_edge_map;
 	unsigned long *wake_sw_status_map;
 	unsigned long *wake_cntrl_level_map;
-	struct syscore_ops syscore;
+	struct syscore syscore;
 };
 
 static struct tegra_pmc *pmc = &(struct tegra_pmc) {
@@ -3147,7 +3147,7 @@ static void tegra186_pmc_process_wake_events(struct tegra_pmc *pmc, unsigned int
 	}
 }
 
-static void tegra186_pmc_wake_syscore_resume(void)
+static void tegra186_pmc_wake_syscore_resume(void *data)
 {
 	u32 status, mask;
 	unsigned int i;
@@ -3160,7 +3160,7 @@ static void tegra186_pmc_wake_syscore_resume(void)
 	}
 }
 
-static int tegra186_pmc_wake_syscore_suspend(void)
+static int tegra186_pmc_wake_syscore_suspend(void *data)
 {
 	wke_read_sw_wake_status(pmc);
 
@@ -3179,6 +3179,11 @@ static int tegra186_pmc_wake_syscore_suspend(void)
 	return 0;
 }
 
+static const struct syscore_ops tegra186_pmc_wake_syscore_ops = {
+	.suspend = tegra186_pmc_wake_syscore_suspend,
+	.resume = tegra186_pmc_wake_syscore_resume,
+};
+
 #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_ARM)
 static int tegra_pmc_suspend(struct device *dev)
 {
@@ -3829,10 +3834,8 @@ static const struct tegra_pmc_regs tegra186_pmc_regs = {
 
 static void tegra186_pmc_init(struct tegra_pmc *pmc)
 {
-	pmc->syscore.suspend = tegra186_pmc_wake_syscore_suspend;
-	pmc->syscore.resume = tegra186_pmc_wake_syscore_resume;
-
-	register_syscore_ops(&pmc->syscore);
+	pmc->syscore.ops = &tegra186_pmc_wake_syscore_ops;
+	register_syscore(&pmc->syscore);
 }
 
 static void tegra186_pmc_setup_irq_polarity(struct tegra_pmc *pmc,
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index bd2fca7dc017..8a2f441cd2ec 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -592,7 +592,7 @@ static void hfi_disable_instance(void *ptr)
 	hfi_disable();
 }
 
-static void hfi_syscore_resume(void)
+static void hfi_syscore_resume(void *data)
 {
 	/* This code runs only on the boot CPU. */
 	struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0);
@@ -603,7 +603,7 @@ static void hfi_syscore_resume(void)
 		hfi_enable_instance(hfi_instance);
 }
 
-static int hfi_syscore_suspend(void)
+static int hfi_syscore_suspend(void *data)
 {
 	/* No locking needed. There is no concurrency with CPU offline. */
 	hfi_disable();
@@ -611,11 +611,15 @@ static int hfi_syscore_suspend(void)
 	return 0;
 }
 
-static struct syscore_ops hfi_pm_ops = {
+static const struct syscore_ops hfi_pm_ops = {
 	.resume = hfi_syscore_resume,
 	.suspend = hfi_syscore_suspend,
 };
 
+static struct syscore hfi_pm = {
+	.ops = &hfi_pm_ops,
+};
+
 static int hfi_thermal_notify(struct notifier_block *nb, unsigned long state,
 			      void *_notify)
 {
@@ -710,7 +714,7 @@ void __init intel_hfi_init(void)
 	if (thermal_genl_register_notifier(&hfi_thermal_nb))
 		goto err_nl_notif;
 
-	register_syscore_ops(&hfi_pm_ops);
+	register_syscore(&hfi_pm);
 
 	return;
 
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 296703939846..f2e8eaf684ba 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -495,7 +495,7 @@ static void xen_acpi_processor_resume_worker(struct work_struct *dummy)
 		pr_info("ACPI data upload failed, error = %d\n", rc);
 }
 
-static void xen_acpi_processor_resume(void)
+static void xen_acpi_processor_resume(void *data)
 {
 	static DECLARE_WORK(wq, xen_acpi_processor_resume_worker);
 
@@ -509,10 +509,14 @@ static void xen_acpi_processor_resume(void)
 	schedule_work(&wq);
 }
 
-static struct syscore_ops xap_syscore_ops = {
+static const struct syscore_ops xap_syscore_ops = {
 	.resume	= xen_acpi_processor_resume,
 };
 
+static struct syscore xap_syscore = {
+	.ops = &xap_syscore_ops,
+};
+
 static int __init xen_acpi_processor_init(void)
 {
 	int i;
@@ -563,7 +567,7 @@ static int __init xen_acpi_processor_init(void)
 	if (rc)
 		goto err_unregister;
 
-	register_syscore_ops(&xap_syscore_ops);
+	register_syscore(&xap_syscore);
 
 	return 0;
 err_unregister:
@@ -580,7 +584,7 @@ static void __exit xen_acpi_processor_exit(void)
 {
 	int i;
 
-	unregister_syscore_ops(&xap_syscore_ops);
+	unregister_syscore(&xap_syscore);
 	bitmap_free(acpi_ids_done);
 	bitmap_free(acpi_id_present);
 	bitmap_free(acpi_id_cst_present);
diff --git a/include/linux/syscore_ops.h b/include/linux/syscore_ops.h
index ae4d48e4c970..ac6d71be5c38 100644
--- a/include/linux/syscore_ops.h
+++ b/include/linux/syscore_ops.h
@@ -11,14 +11,19 @@
 #include <linux/list.h>
 
 struct syscore_ops {
+	int (*suspend)(void *data);
+	void (*resume)(void *data);
+	void (*shutdown)(void *data);
+};
+
+struct syscore {
 	struct list_head node;
-	int (*suspend)(void);
-	void (*resume)(void);
-	void (*shutdown)(void);
+	const struct syscore_ops *ops;
+	void *data;
 };
 
-extern void register_syscore_ops(struct syscore_ops *ops);
-extern void unregister_syscore_ops(struct syscore_ops *ops);
+extern void register_syscore(struct syscore *syscore);
+extern void unregister_syscore(struct syscore *syscore);
 #ifdef CONFIG_PM_SLEEP
 extern int syscore_suspend(void);
 extern void syscore_resume(void);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index b0f0d15085db..7481fbb947d3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void)
 EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
 
 #ifdef CONFIG_PM
-static int cpu_pm_suspend(void)
+static int cpu_pm_suspend(void *data)
 {
 	int ret;
 
@@ -185,20 +185,24 @@ static int cpu_pm_suspend(void)
 	return ret;
 }
 
-static void cpu_pm_resume(void)
+static void cpu_pm_resume(void *data)
 {
 	cpu_cluster_pm_exit();
 	cpu_pm_exit();
 }
 
-static struct syscore_ops cpu_pm_syscore_ops = {
+static const struct syscore_ops cpu_pm_syscore_ops = {
 	.suspend = cpu_pm_suspend,
 	.resume = cpu_pm_resume,
 };
 
+static struct syscore cpu_pm_syscore = {
+	.ops = &cpu_pm_syscore_ops,
+};
+
 static int cpu_pm_init(void)
 {
-	register_syscore_ops(&cpu_pm_syscore_ops);
+	register_syscore(&cpu_pm_syscore);
 	return 0;
 }
 core_initcall(cpu_pm_init);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index bf59e37d650a..3cd0c40282c0 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
 }
 
 #ifdef CONFIG_PM
-static int irq_gc_suspend(void)
+static int irq_gc_suspend(void *data)
 {
 	struct irq_chip_generic *gc;
 
@@ -670,7 +670,7 @@ static int irq_gc_suspend(void)
 	return 0;
 }
 
-static void irq_gc_resume(void)
+static void irq_gc_resume(void *data)
 {
 	struct irq_chip_generic *gc;
 
@@ -693,7 +693,7 @@ static void irq_gc_resume(void)
 #define irq_gc_resume NULL
 #endif
 
-static void irq_gc_shutdown(void)
+static void irq_gc_shutdown(void *data)
 {
 	struct irq_chip_generic *gc;
 
@@ -709,15 +709,19 @@ static void irq_gc_shutdown(void)
 	}
 }
 
-static struct syscore_ops irq_gc_syscore_ops = {
+static const struct syscore_ops irq_gc_syscore_ops = {
 	.suspend = irq_gc_suspend,
 	.resume = irq_gc_resume,
 	.shutdown = irq_gc_shutdown,
 };
 
+static struct syscore irq_gc_syscore = {
+	.ops = &irq_gc_syscore_ops,
+};
+
 static int __init irq_gc_init_ops(void)
 {
-	register_syscore_ops(&irq_gc_syscore_ops);
+	register_syscore(&irq_gc_syscore);
 	return 0;
 }
 device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f7394729cedc..99ff65466d87 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq)
 
 /**
  * irq_pm_syscore_resume - enable interrupt lines early
+ * @data: syscore context
  *
  * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
  */
-static void irq_pm_syscore_resume(void)
+static void irq_pm_syscore_resume(void *data)
 {
 	resume_irqs(true);
 }
 
-static struct syscore_ops irq_pm_syscore_ops = {
+static const struct syscore_ops irq_pm_syscore_ops = {
 	.resume		= irq_pm_syscore_resume,
 };
 
+static struct syscore irq_pm_syscore = {
+	.ops = &irq_pm_syscore_ops,
+};
+
 static int __init irq_pm_init_ops(void)
 {
-	register_syscore_ops(&irq_pm_syscore_ops);
+	register_syscore(&irq_pm_syscore);
 	return 0;
 }
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5aee9ffb16b9..f852dce94015 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3639,12 +3639,13 @@ static bool legacy_kthread_create(void)
 
 /**
  * printk_kthreads_shutdown - shutdown all threaded printers
+ * @data: syscore context
  *
  * On system shutdown all threaded printers are stopped. This allows printk
  * to transition back to atomic printing, thus providing a robust mechanism
  * for the final shutdown/reboot messages to be output.
  */
-static void printk_kthreads_shutdown(void)
+static void printk_kthreads_shutdown(void *data)
 {
 	struct console *con;
 
@@ -3666,10 +3667,14 @@ static void printk_kthreads_shutdown(void)
 	console_list_unlock();
 }
 
-static struct syscore_ops printk_syscore_ops = {
+static const struct syscore_ops printk_syscore_ops = {
 	.shutdown = printk_kthreads_shutdown,
 };
 
+static struct syscore printk_syscore = {
+	.ops = &printk_syscore_ops,
+};
+
 /*
  * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
  * If any kthreads fail to start, those consoles are unregistered.
@@ -3737,7 +3742,7 @@ static void printk_kthreads_check_locked(void)
 
 static int __init printk_set_kthreads_ready(void)
 {
-	register_syscore_ops(&printk_syscore_ops);
+	register_syscore(&printk_syscore);
 
 	console_list_lock();
 	printk_kthreads_ready = true;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc1afec306b3..f39111830ca3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -296,6 +296,11 @@ int sched_clock_suspend(void)
 	return 0;
 }
 
+static int sched_clock_syscore_suspend(void *data)
+{
+	return sched_clock_suspend();
+}
+
 void sched_clock_resume(void)
 {
 	struct clock_read_data *rd = &cd.read_data[0];
@@ -305,14 +310,23 @@ void sched_clock_resume(void)
 	rd->read_sched_clock = cd.actual_read_sched_clock;
 }
 
-static struct syscore_ops sched_clock_ops = {
-	.suspend	= sched_clock_suspend,
-	.resume		= sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+	sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+	.suspend	= sched_clock_syscore_suspend,
+	.resume		= sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+	.ops = &sched_clock_syscore_ops,
 };
 
 static int __init sched_clock_syscore_init(void)
 {
-	register_syscore_ops(&sched_clock_ops);
+	register_syscore(&sched_clock_syscore);
 
 	return 0;
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b6974fce800c..f3513679ee09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1994,6 +1994,11 @@ void timekeeping_resume(void)
 	timerfd_resume();
 }
 
+static void timekeeping_syscore_resume(void *data)
+{
+	timekeeping_resume();
+}
+
 int timekeeping_suspend(void)
 {
 	struct timekeeper *tks = &tk_core.shadow_timekeeper;
@@ -2061,15 +2066,24 @@ int timekeeping_suspend(void)
 	return 0;
 }
 
+static int timekeeping_syscore_suspend(void *data)
+{
+	return timekeeping_suspend();
+}
+
 /* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
-	.resume		= timekeeping_resume,
-	.suspend	= timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+	.resume		= timekeeping_syscore_resume,
+	.suspend	= timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+	.ops = &timekeeping_syscore_ops,
 };
 
 static int __init timekeeping_init_ops(void)
 {
-	register_syscore_ops(&timekeeping_syscore_ops);
+	register_syscore(&timekeeping_syscore);
 	return 0;
 }
 device_initcall(timekeeping_init_ops);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 226faeaa8e56..b7675a58d663 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5629,7 +5629,7 @@ static int kvm_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-static void kvm_shutdown(void)
+static void kvm_shutdown(void *data)
 {
 	/*
 	 * Disable hardware virtualization and set kvm_rebooting to indicate
@@ -5647,7 +5647,7 @@ static void kvm_shutdown(void)
 	on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
 }
 
-static int kvm_suspend(void)
+static int kvm_suspend(void *data)
 {
 	/*
 	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
@@ -5664,7 +5664,7 @@ static int kvm_suspend(void)
 	return 0;
 }
 
-static void kvm_resume(void)
+static void kvm_resume(void *data)
 {
 	lockdep_assert_not_held(&kvm_usage_lock);
 	lockdep_assert_irqs_disabled();
@@ -5672,12 +5672,16 @@ static void kvm_resume(void)
 	WARN_ON_ONCE(kvm_enable_virtualization_cpu());
 }
 
-static struct syscore_ops kvm_syscore_ops = {
+static const struct syscore_ops kvm_syscore_ops = {
 	.suspend = kvm_suspend,
 	.resume = kvm_resume,
 	.shutdown = kvm_shutdown,
 };
 
+static struct syscore kvm_syscore = {
+	.ops = &kvm_syscore_ops,
+};
+
 int kvm_enable_virtualization(void)
 {
 	int r;
@@ -5694,7 +5698,7 @@ int kvm_enable_virtualization(void)
 	if (r)
 		goto err_cpuhp;
 
-	register_syscore_ops(&kvm_syscore_ops);
+	register_syscore(&kvm_syscore);
 
 	/*
 	 * Undo virtualization enabling and bail if the system is going down.
@@ -5716,7 +5720,7 @@ int kvm_enable_virtualization(void)
 	return 0;
 
 err_rebooting:
-	unregister_syscore_ops(&kvm_syscore_ops);
+	unregister_syscore(&kvm_syscore);
 	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
 err_cpuhp:
 	kvm_arch_disable_virtualization();
@@ -5732,7 +5736,7 @@ void kvm_disable_virtualization(void)
 	if (--kvm_usage_count)
 		return;
 
-	unregister_syscore_ops(&kvm_syscore_ops);
+	unregister_syscore(&kvm_syscore);
 	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
 	kvm_arch_disable_virtualization();
 }
-- 
cgit v1.2.3


From 0559730b8570259ef948e9083653f8a87baba182 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Mon, 22 Sep 2025 11:43:28 +0200
Subject: pwm: Drop unused function pwm_apply_args()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function pwm_apply_args() was introduced with the concept of atomic
PWM configuration and was needed for drivers not using this concept yet.
Now that all drivers have been converted accordingly, no callers are
left, so this function can be removed.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://patch.msgid.link/20250922094327.1143944-2-u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 include/linux/pwm.h | 33 ---------------------------------
 1 file changed, 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 549ac4aaad59..e59be4e382d1 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -611,39 +611,6 @@ devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode,
 }
 #endif
 
-static inline void pwm_apply_args(struct pwm_device *pwm)
-{
-	struct pwm_state state = { };
-
-	/*
-	 * PWM users calling pwm_apply_args() expect to have a fresh config
-	 * where the polarity and period are set according to pwm_args info.
-	 * The problem is, polarity can only be changed when the PWM is
-	 * disabled.
-	 *
-	 * PWM drivers supporting hardware readout may declare the PWM device
-	 * as enabled, and prevent polarity setting, which changes from the
-	 * existing behavior, where all PWM devices are declared as disabled
-	 * at startup (even if they are actually enabled), thus authorizing
-	 * polarity setting.
-	 *
-	 * To fulfill this requirement, we apply a new state which disables
-	 * the PWM device and set the reference period and polarity config.
-	 *
-	 * Note that PWM users requiring a smooth handover between the
-	 * bootloader and the kernel (like critical regulators controlled by
-	 * PWM devices) will have to switch to the atomic API and avoid calling
-	 * pwm_apply_args().
-	 */
-
-	state.enabled = false;
-	state.polarity = pwm->args.polarity;
-	state.period = pwm->args.period;
-	state.usage_power = false;
-
-	pwm_apply_might_sleep(pwm, &state);
-}
-
 struct pwm_lookup {
 	struct list_head list;
 	const char *provider;
-- 
cgit v1.2.3


From 37f0c7a8df7ad719a68fa1c2dbf066cfebc391a7 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Fri, 14 Nov 2025 11:07:04 +0200
Subject: block-dma: properly take MMIO path

In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to
indicate MMIO memory"), the DMA_ATTR_MMIO attribute was added to
describe MMIO addresses, for which any memory cache flushing must be
avoided, as an outcome of the discussion referenced in the Link tag
below.

In the case of a PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, the
blk-mq-dma logic treated this as a regular page and relied on the
"struct page" DMA flow. That flow performs CPU cache flushing, which
shouldn't be done here, and doesn't set the IOMMU_MMIO flag in the
DMA-IOMMU case.

As a solution, let's encode peer-to-peer transaction type in NVMe IOD
flags variable and provide it to blk-mq-dma API.

Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-dma.c            | 18 +++++++----
 drivers/nvme/host/pci.c       | 73 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/bio-integrity.h |  1 -
 include/linux/blk-integrity.h | 14 ---------
 include/linux/blk-mq-dma.h    | 28 ++++++++---------
 include/linux/blk_types.h     |  2 --
 6 files changed, 90 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index cebfead826ee..e9108ccaf4b0 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -92,8 +92,13 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
 static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
 		struct blk_dma_iter *iter, struct phys_vec *vec)
 {
+	unsigned int attrs = 0;
+
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+		attrs |= DMA_ATTR_MMIO;
+
 	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
-			rq_dma_dir(req), 0);
+			rq_dma_dir(req), attrs);
 	if (dma_mapping_error(dma_dev, iter->addr)) {
 		iter->status = BLK_STS_RESOURCE;
 		return false;
@@ -108,14 +113,18 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 {
 	enum dma_data_direction dir = rq_dma_dir(req);
 	unsigned int mapped = 0;
+	unsigned int attrs = 0;
 	int error;
 
 	iter->addr = state->addr;
 	iter->len = dma_iova_size(state);
 
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+		attrs |= DMA_ATTR_MMIO;
+
 	do {
 		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
-				vec->len, dir, 0);
+				vec->len, dir, attrs);
 		if (error)
 			break;
 		mapped += vec->len;
@@ -162,6 +171,7 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
 
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
+	iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;
 
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
@@ -173,10 +183,6 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
 				 phys_to_page(vec.paddr))) {
 	case PCI_P2PDMA_MAP_BUS_ADDR:
-		if (iter->iter.is_integrity)
-			bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
-		else
-			req->cmd_flags |= REQ_P2PDMA;
 		return blk_dma_map_bus(iter, &vec);
 	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
 		/*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d0dd836ccdb9..9085bed107fd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -260,8 +260,20 @@ enum nvme_iod_flags {
 	/* single segment dma mapping */
 	IOD_SINGLE_SEGMENT	= 1U << 2,
 
+	/* Data payload contains p2p memory */
+	IOD_DATA_P2P		= 1U << 3,
+
+	/* Metadata contains p2p memory */
+	IOD_META_P2P		= 1U << 4,
+
+	/* Data payload contains MMIO memory */
+	IOD_DATA_MMIO		= 1U << 5,
+
+	/* Metadata contains MMIO memory */
+	IOD_META_MMIO		= 1U << 6,
+
 	/* Metadata using non-coalesced MPTR */
-	IOD_SINGLE_META_SEGMENT	= 1U << 5,
+	IOD_SINGLE_META_SEGMENT	= 1U << 7,
 };
 
 struct nvme_dma_vec {
@@ -733,10 +745,12 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
 static void nvme_unmap_metadata(struct request *req)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+	enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
 	enum dma_data_direction dir = rq_dma_dir(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct device *dma_dev = nvmeq->dev->dev;
 	struct nvme_sgl_desc *sge = iod->meta_descriptor;
+	unsigned int attrs = 0;
 
 	if (iod->flags & IOD_SINGLE_META_SEGMENT) {
 		dma_unmap_page(dma_dev, iod->meta_dma,
@@ -745,13 +759,20 @@ static void nvme_unmap_metadata(struct request *req)
 		return;
 	}
 
-	if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
-					iod->meta_total_len)) {
+	if (iod->flags & IOD_META_P2P)
+		map = PCI_P2PDMA_MAP_BUS_ADDR;
+	else if (iod->flags & IOD_META_MMIO) {
+		map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+		attrs |= DMA_ATTR_MMIO;
+	}
+
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+			      iod->meta_total_len, map)) {
 		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
-			nvme_free_sgls(req, sge, &sge[1], 0);
+			nvme_free_sgls(req, sge, &sge[1], attrs);
 		else
 			dma_unmap_phys(dma_dev, iod->meta_dma,
-				       iod->meta_total_len, dir, 0);
+				       iod->meta_total_len, dir, attrs);
 	}
 
 	if (iod->meta_descriptor)
@@ -761,9 +782,11 @@ static void nvme_unmap_metadata(struct request *req)
 
 static void nvme_unmap_data(struct request *req)
 {
+	enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	struct device *dma_dev = nvmeq->dev->dev;
+	unsigned int attrs = 0;
 
 	if (iod->flags & IOD_SINGLE_SEGMENT) {
 		static_assert(offsetof(union nvme_data_ptr, prp1) ==
@@ -773,12 +796,20 @@ static void nvme_unmap_data(struct request *req)
 		return;
 	}
 
-	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+	if (iod->flags & IOD_DATA_P2P)
+		map = PCI_P2PDMA_MAP_BUS_ADDR;
+	else if (iod->flags & IOD_DATA_MMIO) {
+		map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+		attrs |= DMA_ATTR_MMIO;
+	}
+
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+			      map)) {
 		if (nvme_pci_cmd_use_sgl(&iod->cmd))
 			nvme_free_sgls(req, iod->descriptors[0],
-				       &iod->cmd.common.dptr.sgl, 0);
+				       &iod->cmd.common.dptr.sgl, attrs);
 		else
-			nvme_free_prps(req, 0);
+			nvme_free_prps(req, attrs);
 	}
 
 	if (iod->nr_descriptors)
@@ -1049,6 +1080,19 @@ static blk_status_t nvme_map_data(struct request *req)
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
+	switch (iter.p2pdma.map) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		iod->flags |= IOD_DATA_P2P;
+		break;
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		iod->flags |= IOD_DATA_MMIO;
+		break;
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		return BLK_STS_RESOURCE;
+	}
+
 	if (use_sgl == SGL_FORCED ||
 	    (use_sgl == SGL_SUPPORTED &&
 	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
@@ -1071,6 +1115,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
 						&iod->meta_dma_state, &iter))
 		return iter.status;
 
+	switch (iter.p2pdma.map) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		iod->flags |= IOD_META_P2P;
+		break;
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		iod->flags |= IOD_META_MMIO;
+		break;
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		return BLK_STS_RESOURCE;
+	}
+
 	if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
 		entries = 1;
 
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index 3d05296a5afe..21e4652dcfd2 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -13,7 +13,6 @@ enum bip_flags {
 	BIP_CHECK_GUARD		= 1 << 5, /* guard check */
 	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
 	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
-	BIP_P2P_DMA		= 1 << 8, /* using P2P address */
 
 	BIP_MEMPOOL		= 1 << 15, /* buffer backed by mempool */
 };
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c2030fd8ba0a..a6b84206eb94 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -33,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 
-static inline bool blk_rq_integrity_dma_unmap(struct request *req,
-		struct device *dma_dev, struct dma_iova_state *state,
-		size_t mapped_len)
-{
-	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA);
-}
-
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
@@ -129,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q,
 {
 	return 0;
 }
-static inline bool blk_rq_integrity_dma_unmap(struct request *req,
-		struct device *dma_dev, struct dma_iova_state *state,
-		size_t mapped_len)
-{
-	return false;
-}
 static inline int blk_rq_integrity_map_user(struct request *rq,
 					    void __user *ubuf,
 					    ssize_t bytes)
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 51829958d872..cb88fc791fbd 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -16,13 +16,13 @@ struct blk_dma_iter {
 	/* Output address range for this iteration */
 	dma_addr_t			addr;
 	u32				len;
+	struct pci_p2pdma_map_state	p2pdma;
 
 	/* Status code. Only valid when blk_rq_dma_map_iter_* returned false */
 	blk_status_t			status;
 
 	/* Internal to blk_rq_dma_map_iter_* */
 	struct blk_map_iter		iter;
-	struct pci_p2pdma_map_state	p2pdma;
 };
 
 bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
@@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
 }
 
 /**
- * blk_dma_unmap - try to DMA unmap a request
+ * blk_rq_dma_unmap - try to DMA unmap a request
  * @req:	request to unmap
  * @dma_dev:	device to unmap from
  * @state:	DMA IOVA state
  * @mapped_len: number of bytes to unmap
- * @is_p2p:	true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
+ * @map:	peer-to-peer mapping type
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
  */
-static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len,
+		enum pci_p2pdma_map_type map)
 {
-	if (is_p2p)
+	if (map == PCI_P2PDMA_MAP_BUS_ADDR)
 		return true;
 
 	if (dma_use_iova(state)) {
+		unsigned int attrs = 0;
+
+		if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+			attrs |= DMA_ATTR_MMIO;
+
 		dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
-				 0);
+				 attrs);
 		return true;
 	}
 
 	return !dma_need_unmap(dma_dev);
 }
-
-static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
-{
-	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-				req->cmd_flags & REQ_P2PDMA);
-}
-
 #endif /* BLK_MQ_DMA_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 53501ebb0623..d884cc1256ec 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -393,7 +393,6 @@ enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
-	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -426,7 +425,6 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
-- 
cgit v1.2.3


From cefd55bd2159f427228d44864747243946296739 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 11 Nov 2025 22:29:44 +0100
Subject: nsproxy: fix free_nsproxy() and simplify create_new_namespaces()

Make it possible to handle NULL being passed to the reference count
helpers instead of forcing the caller to handle this. Afterwards we can
nicely allow a cleanup guard to handle nsproxy freeing.

Active reference count handling is not done in nsproxy_free() but rather
in free_nsproxy() as nsproxy_free() is also called from setns() failure
paths where a new nsproxy has been prepared but has not been marked as
active via switch_task_namespaces().

Link: https://lore.kernel.org/690bfb9e.050a0220.2e3c35.0013.GAE@google.com
Link: https://patch.msgid.link/20251111-sakralbau-guthaben-7dcc277d337f@brauner
Fixes: 3c9820d5c64a ("ns: add active reference count")
Reported-by: syzbot+0b2e79f91ff6579bfa5b@syzkaller.appspotmail.com
Reported-by: syzbot+0a8655a80e189278487e@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/ns_common.h | 11 +++++++----
 include/linux/nsproxy.h   |  4 ++--
 kernel/nsproxy.c          | 36 ++++++++++++++++++++----------------
 3 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 136f6a322e53..825f5865bfc5 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -114,11 +114,14 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common
 }
 
 #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
-#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns)))
-#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
-#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
+#define ns_ref_inc(__ns) \
+	do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0)
+#define ns_ref_get(__ns) \
+	((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false)
+#define ns_ref_put(__ns) \
+	((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false)
 #define ns_ref_put_and_lock(__ns, __ns_lock) \
-	__ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock)
+	((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false)
 
 #define ns_ref_active_read(__ns) \
 	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index ac825eddec59..5a67648721c7 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -99,7 +99,7 @@ void get_cred_namespaces(struct task_struct *tsk);
 void exit_cred_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 int exec_task_namespaces(void);
-void free_nsproxy(struct nsproxy *ns);
+void deactivate_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
 	struct cred *, struct fs_struct *);
 int __init nsproxy_cache_init(void);
@@ -107,7 +107,7 @@ int __init nsproxy_cache_init(void);
 static inline void put_nsproxy(struct nsproxy *ns)
 {
 	if (refcount_dec_and_test(&ns->count))
-		free_nsproxy(ns);
+		deactivate_nsproxy(ns);
 }
 
 static inline void get_nsproxy(struct nsproxy *ns)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 94c2cfe0afa1..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -60,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void)
 	return nsproxy;
 }
 
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+	put_mnt_ns(ns->mnt_ns);
+	put_uts_ns(ns->uts_ns);
+	put_ipc_ns(ns->ipc_ns);
+	put_pid_ns(ns->pid_ns_for_children);
+	put_time_ns(ns->time_ns);
+	put_time_ns(ns->time_ns_for_children);
+	put_cgroup_ns(ns->cgroup_ns);
+	put_net(ns->net_ns);
+	kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+	nsproxy_ns_active_put(ns);
+	nsproxy_free(ns);
+}
+
 /*
  * Create new nsproxy and all of its the associated namespaces.
  * Return the newly created nsproxy.  Do not attach this to the task,
@@ -185,21 +204,6 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
 	return 0;
 }
 
-void free_nsproxy(struct nsproxy *ns)
-{
-	nsproxy_ns_active_put(ns);
-
-	put_mnt_ns(ns->mnt_ns);
-	put_uts_ns(ns->uts_ns);
-	put_ipc_ns(ns->ipc_ns);
-	put_pid_ns(ns->pid_ns_for_children);
-	put_time_ns(ns->time_ns);
-	put_time_ns(ns->time_ns_for_children);
-	put_cgroup_ns(ns->cgroup_ns);
-	put_net(ns->net_ns);
-	kmem_cache_free(nsproxy_cachep, ns);
-}
-
 /*
  * Called from unshare. Unshare all the namespaces part of nsproxy.
  * On success, returns the new nsproxy.
@@ -338,7 +342,7 @@ static void put_nsset(struct nsset *nsset)
 	if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
 		free_fs_struct(nsset->fs);
 	if (nsset->nsproxy)
-		free_nsproxy(nsset->nsproxy);
+		nsproxy_free(nsset->nsproxy);
 }
 
 static int prepare_nsset(unsigned flags, struct nsset *nsset)
-- 
cgit v1.2.3


From 4037d966f034ba5da2872c413b2ec17eca867e68 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:25 +1100
Subject: VFS: introduce start_dirop() and end_dirop()

The fact that directory operations (create,remove,rename) are protected
by a lock on the parent is known widely throughout the kernel.
In order to change this - to instead lock the target dentry  - it is
best to centralise this knowledge so it can be changed in one place.

This patch introduces start_dirop() which is local to VFS code.
It performs the required locking for create and remove.  Rename
will be handled separately.

Various functions with names like start_creating() or start_removing_path(),
some of which already exist, will export this functionality beyond the VFS.

end_dirop() is the partner of start_dirop().  It drops the lock and
releases the reference on the dentry.
It *is* exported so that various end_creating etc functions can be inline.

As vfs_mkdir() drops the dentry on error we cannot use end_dirop() as
that won't unlock when the dentry IS_ERR().  For now we need an explicit
unlock when dentry IS_ERR().  I hope to change vfs_mkdir() to unlock
when it drops a dentry so that explicit unlock can go away.

end_dirop() can always be called on the result of start_dirop(), but not
after vfs_mkdir().  After a vfs_mkdir() we still may need the explicit
unlock as seen in end_creating_path().

As well as adding start_dirop() and end_dirop()
this patch uses them in:
 - simple_start_creating (which requires sharing lookup_noperm_common()
        with libfs.c)
 - start_removing_path / start_removing_user_path_at
 - filename_create / end_creating_path()
 - do_rmdir(), do_unlinkat()

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-3-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h      |  3 ++
 fs/libfs.c         | 36 ++++++++++----------
 fs/namei.c         | 98 ++++++++++++++++++++++++++++++++++++++++--------------
 include/linux/fs.h |  2 ++
 4 files changed, 95 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/fs/internal.h b/fs/internal.h
index 9b2b4d116880..d08d5e2235e9 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -67,6 +67,9 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
 		const struct path *parentpath,
 		struct file *file, umode_t mode);
 struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+			   unsigned int lookup_flags);
+int lookup_noperm_common(struct qstr *qname, struct dentry *base);
 
 /*
  * namespace.c
diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..02371f45ef7d 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2289,27 +2289,25 @@ void stashed_dentry_prune(struct dentry *dentry)
 	cmpxchg(stashed, dentry, NULL);
 }
 
-/* parent must be held exclusive */
+/**
+ * simple_start_creating - prepare to create a given name
+ * @parent: directory in which to prepare to create the name
+ * @name:   the name to be created
+ *
+ * Required lock is taken and a lookup is performed prior to creating an
+ * object in a directory.  No permission checking is performed.
+ *
+ * Returns: a negative dentry on which vfs_create() or similar may
+ *  be attempted, or an error.
+ */
 struct dentry *simple_start_creating(struct dentry *parent, const char *name)
 {
-	struct dentry *dentry;
-	struct inode *dir = d_inode(parent);
+	struct qstr qname = QSTR(name);
+	int err;
 
-	inode_lock(dir);
-	if (unlikely(IS_DEADDIR(dir))) {
-		inode_unlock(dir);
-		return ERR_PTR(-ENOENT);
-	}
-	dentry = lookup_noperm(&QSTR(name), parent);
-	if (IS_ERR(dentry)) {
-		inode_unlock(dir);
-		return dentry;
-	}
-	if (dentry->d_inode) {
-		dput(dentry);
-		inode_unlock(dir);
-		return ERR_PTR(-EEXIST);
-	}
-	return dentry;
+	err = lookup_noperm_common(&qname, parent);
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL);
 }
 EXPORT_SYMBOL(simple_start_creating);
diff --git a/fs/namei.c b/fs/namei.c
index 7377020a2cba..3618efd4bcaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2765,6 +2765,48 @@ static int filename_parentat(int dfd, struct filename *name,
 	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
 }
 
+/**
+ * start_dirop - begin a create or remove dirop, performing locking and lookup
+ * @parent:       the dentry of the parent in which the operation will occur
+ * @name:         a qstr holding the name within that parent
+ * @lookup_flags: intent and other lookup flags.
+ *
+ * The lookup is performed and necessary locks are taken so that, on success,
+ * the returned dentry can be operated on safely.
+ * The qstr must already have the hash value calculated.
+ *
+ * Returns: a locked dentry, or an error.
+ *
+ */
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+			   unsigned int lookup_flags)
+{
+	struct dentry *dentry;
+	struct inode *dir = d_inode(parent);
+
+	inode_lock_nested(dir, I_MUTEX_PARENT);
+	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
+	if (IS_ERR(dentry))
+		inode_unlock(dir);
+	return dentry;
+}
+
+/**
+ * end_dirop - signal completion of a dirop
+ * @de: the dentry which was returned by start_dirop or similar.
+ *
+ * If the de is an error, nothing happens. Otherwise any lock taken to
+ * protect the dentry is dropped and the dentry itself is released (dput()).
+ */
+void end_dirop(struct dentry *de)
+{
+	if (!IS_ERR(de)) {
+		inode_unlock(de->d_parent->d_inode);
+		dput(de);
+	}
+}
+EXPORT_SYMBOL(end_dirop);
+
 /* does lookup, returns the object with parent locked */
 static struct dentry *__start_removing_path(int dfd, struct filename *name,
 					   struct path *path)
@@ -2781,10 +2823,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
 		return ERR_PTR(-EINVAL);
 	/* don't fail immediately if it's r/o, at least try to report other errors */
 	error = mnt_want_write(parent_path.mnt);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+	d = start_dirop(parent_path.dentry, &last, 0);
 	if (IS_ERR(d))
-		goto unlock;
+		goto drop;
 	if (error)
 		goto fail;
 	path->dentry = no_free_ptr(parent_path.dentry);
@@ -2792,10 +2833,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
 	return d;
 
 fail:
-	dput(d);
+	end_dirop(d);
 	d = ERR_PTR(error);
-unlock:
-	inode_unlock(parent_path.dentry->d_inode);
+drop:
 	if (!error)
 		mnt_drop_write(parent_path.mnt);
 	return d;
@@ -2910,7 +2950,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
+int lookup_noperm_common(struct qstr *qname, struct dentry *base)
 {
 	const char *name = qname->name;
 	u32 len = qname->len;
@@ -4223,21 +4263,18 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	 */
 	if (last.name[last.len] && !want_dir)
 		create_flags &= ~LOOKUP_CREATE;
-	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	dentry = lookup_one_qstr_excl(&last, path->dentry,
-				      reval_flag | create_flags);
+	dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
 	if (IS_ERR(dentry))
-		goto unlock;
+		goto out_drop_write;
 
 	if (unlikely(error))
 		goto fail;
 
 	return dentry;
 fail:
-	dput(dentry);
+	end_dirop(dentry);
 	dentry = ERR_PTR(error);
-unlock:
-	inode_unlock(path->dentry->d_inode);
+out_drop_write:
 	if (!error)
 		mnt_drop_write(path->mnt);
 out:
@@ -4256,11 +4293,26 @@ struct dentry *start_creating_path(int dfd, const char *pathname,
 }
 EXPORT_SYMBOL(start_creating_path);
 
+/**
+ * end_creating_path - finish a code section started by start_creating_path()
+ * @path: the path instantiated by start_creating_path()
+ * @dentry: the dentry returned by start_creating_path()
+ *
+ * end_creating_path() will unlock any locks taken by start_creating_path()
+ * and drop any references that were taken.  It should only be called
+ * if start_creating_path() returned a non-error.
+ * If vfs_mkdir() was called and it returned an error, that error *should*
+ * be passed to end_creating_path() together with the path.
+ */
 void end_creating_path(const struct path *path, struct dentry *dentry)
 {
-	if (!IS_ERR(dentry))
-		dput(dentry);
-	inode_unlock(path->dentry->d_inode);
+	if (IS_ERR(dentry))
+		/* The parent is still locked despite the error from
+		 * vfs_mkdir() - must unlock it.
+		 */
+		inode_unlock(path->dentry->d_inode);
+	else
+		end_dirop(dentry);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
@@ -4592,8 +4644,7 @@ retry:
 	if (error)
 		goto exit2;
 
-	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+	dentry = start_dirop(path.dentry, &last, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit3;
@@ -4602,9 +4653,8 @@ retry:
 		goto exit4;
 	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
 exit4:
-	dput(dentry);
+	end_dirop(dentry);
 exit3:
-	inode_unlock(path.dentry->d_inode);
 	mnt_drop_write(path.mnt);
 exit2:
 	path_put(&path);
@@ -4721,8 +4771,7 @@ retry:
 	if (error)
 		goto exit2;
 retry_deleg:
-	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+	dentry = start_dirop(path.dentry, &last, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
 
@@ -4737,9 +4786,8 @@ retry_deleg:
 		error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
 				   dentry, &delegated_inode);
 exit3:
-		dput(dentry);
+		end_dirop(dentry);
 	}
-	inode_unlock(path.dentry->d_inode);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 	inode = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..f4543612ef1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3609,6 +3609,8 @@ extern void iterate_supers_type(struct file_system_type *,
 void filesystems_freeze(void);
 void filesystems_thaw(void);
 
+void end_dirop(struct dentry *de);
+
 extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
-- 
cgit v1.2.3


From 7ab96df840e60eb933abfe65fc5fe44e72f16dc0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:27 +1100
Subject: VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()

start_creating() is similar to simple_start_creating() but is not so
simple.
It takes a qstr for the name, includes permission checking, and does NOT
report an error if the name already exists, returning a positive dentry
instead.

This is currently used by nfsd, cachefiles, and overlayfs.

end_creating() is called after the dentry has been used.
end_creating() drops the reference to the dentry as it is generally no
longer needed.  This is exactly the first section of end_creating_path()
so that function is changed to call the new end_creating().

These calls help encapsulate locking rules so that directory locking can
be changed.

Occasionally this change means that the parent lock is held for a
shorter period of time, for example in cachefiles_commit_tmpfile().
As this function now unlocks after an unlink and before the following
lookup, it is possible that the lookup could again find a positive
dentry, so a while loop is introduced there.

In overlayfs the ovl_lookup_temp() function has ovl_tempname()
split out to be used in ovl_start_creating_temp().  The other use
of ovl_lookup_temp() is preparing for a rename.  When rename handling
is updated, ovl_lookup_temp() will be removed.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-5-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/namei.c    |  41 +++++++++----------
 fs/namei.c               |  35 +++++++++++++----
 fs/nfsd/nfs3proc.c       |  14 +++----
 fs/nfsd/nfs4proc.c       |  14 +++----
 fs/nfsd/nfs4recover.c    |  16 +++-----
 fs/nfsd/nfsproc.c        |  11 +++---
 fs/nfsd/vfs.c            |  52 ++++++++++--------------
 fs/overlayfs/copy_up.c   |  19 ++++-----
 fs/overlayfs/dir.c       | 100 +++++++++++++++++++++++++++--------------------
 fs/overlayfs/overlayfs.h |   8 ++++
 fs/overlayfs/super.c     |  32 ++++++++-------
 include/linux/namei.h    |  33 ++++++++++++++++
 12 files changed, 215 insertions(+), 160 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d1edb2ac3837..0a136eb434da 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	_enter(",,%s", dirname);
 
 	/* search the current directory for the element name */
-	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
 retry:
 	ret = cachefiles_inject_read_error();
 	if (ret == 0)
-		subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir);
+		subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname));
 	else
 		subdir = ERR_PTR(ret);
 	trace_cachefiles_lookup(NULL, dir, subdir);
@@ -141,7 +140,7 @@ retry:
 		trace_cachefiles_mkdir(dir, subdir);
 
 		if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
-			dput(subdir);
+			end_creating(subdir, dir);
 			goto retry;
 		}
 		ASSERT(d_backing_inode(subdir));
@@ -154,7 +153,8 @@ retry:
 
 	/* Tell rmdir() it's not allowed to delete the subdir */
 	inode_lock(d_inode(subdir));
-	inode_unlock(d_inode(dir));
+	dget(subdir);
+	end_creating(subdir, dir);
 
 	if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
 		pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
@@ -196,14 +196,11 @@ mark_error:
 	return ERR_PTR(-EBUSY);
 
 mkdir_error:
-	inode_unlock(d_inode(dir));
-	if (!IS_ERR(subdir))
-		dput(subdir);
+	end_creating(subdir, dir);
 	pr_err("mkdir %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
 lookup_error:
-	inode_unlock(d_inode(dir));
 	ret = PTR_ERR(subdir);
 	pr_err("Lookup %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
@@ -679,36 +676,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
 
 	_enter(",%pD", object->file);
 
-	inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
 	ret = cachefiles_inject_read_error();
 	if (ret == 0)
-		dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
+		dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name));
 	else
 		dentry = ERR_PTR(ret);
 	if (IS_ERR(dentry)) {
 		trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
 					   cachefiles_trace_lookup_error);
 		_debug("lookup fail %ld", PTR_ERR(dentry));
-		goto out_unlock;
+		goto out;
 	}
 
-	if (!d_is_negative(dentry)) {
+	/*
+	 * This loop will only execute more than once if some other thread
+	 * races to create the object we are trying to create.
+	 */
+	while (!d_is_negative(dentry)) {
 		ret = cachefiles_unlink(volume->cache, object, fan, dentry,
 					FSCACHE_OBJECT_IS_STALE);
 		if (ret < 0)
-			goto out_dput;
+			goto out_end;
+
+		end_creating(dentry, fan);
 
-		dput(dentry);
 		ret = cachefiles_inject_read_error();
 		if (ret == 0)
-			dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
+			dentry = start_creating(&nop_mnt_idmap, fan,
+						&QSTR(object->d_name));
 		else
 			dentry = ERR_PTR(ret);
 		if (IS_ERR(dentry)) {
 			trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
 						   cachefiles_trace_lookup_error);
 			_debug("lookup fail %ld", PTR_ERR(dentry));
-			goto out_unlock;
+			goto out;
 		}
 	}
 
@@ -729,10 +731,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
 		success = true;
 	}
 
-out_dput:
-	dput(dentry);
-out_unlock:
-	inode_unlock(d_inode(fan));
+out_end:
+	end_creating(dentry, fan);
+out:
 	_leave(" = %u", success);
 	return success;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 9effaad115d9..9972b0257a4c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3221,6 +3221,33 @@ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
 }
 EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
 
+/**
+ * start_creating - prepare to create a given name with permission checking
+ * @idmap:  idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name:   the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory.  Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned, so
+ * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
+ * with -EEXIST.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating);
+
 #ifdef CONFIG_UNIX98_PTYS
 int path_pts(struct path *path)
 {
@@ -4306,13 +4333,7 @@ EXPORT_SYMBOL(start_creating_path);
  */
 void end_creating_path(const struct path *path, struct dentry *dentry)
 {
-	if (IS_ERR(dentry))
-		/* The parent is still locked despite the error from
-		 * vfs_mkdir() - must unlock it.
-		 */
-		inode_unlock(path->dentry->d_inode);
-	else
-		end_dirop(dentry);
+	end_creating(dentry, path->dentry);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b6d03e1ef5f7..e2aac0def2cb 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -281,14 +281,11 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (host_err)
 		return nfserrno(host_err);
 
-	inode_lock_nested(inode, I_MUTEX_PARENT);
-
-	child = lookup_one(&nop_mnt_idmap,
-			   &QSTR_LEN(argp->name, argp->len),
-			   parent);
+	child = start_creating(&nop_mnt_idmap, parent,
+			       &QSTR_LEN(argp->name, argp->len));
 	if (IS_ERR(child)) {
 		status = nfserrno(PTR_ERR(child));
-		goto out;
+		goto out_write;
 	}
 
 	if (d_really_is_negative(child)) {
@@ -367,9 +364,8 @@ set_attr:
 	status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
 
 out:
-	inode_unlock(inode);
-	if (child && !IS_ERR(child))
-		dput(child);
+	end_creating(child, parent);
+out_write:
 	fh_drop_write(fhp);
 	return status;
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e466cf52d7d7..b2c95e8e7c68 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -264,14 +264,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (is_create_with_attrs(open))
 		nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
 
-	inode_lock_nested(inode, I_MUTEX_PARENT);
-
-	child = lookup_one(&nop_mnt_idmap,
-			   &QSTR_LEN(open->op_fname, open->op_fnamelen),
-			   parent);
+	child = start_creating(&nop_mnt_idmap, parent,
+			       &QSTR_LEN(open->op_fname, open->op_fnamelen));
 	if (IS_ERR(child)) {
 		status = nfserrno(PTR_ERR(child));
-		goto out;
+		goto out_write;
 	}
 
 	if (d_really_is_negative(child)) {
@@ -379,10 +376,9 @@ set_attr:
 	if (attrs.na_aclerr)
 		open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
 out:
-	inode_unlock(inode);
+	end_creating(child, parent);
 	nfsd_attrs_free(&attrs);
-	if (child && !IS_ERR(child))
-		dput(child);
+out_write:
 	fh_drop_write(fhp);
 	return status;
 }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e2b9472e5c78..c247a7c3291c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -195,13 +195,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		goto out_creds;
 
 	dir = nn->rec_file->f_path.dentry;
-	/* lock the parent */
-	inode_lock(d_inode(dir));
 
-	dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir);
+	dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname));
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
-		goto out_unlock;
+		goto out;
 	}
 	if (d_really_is_positive(dentry))
 		/*
@@ -212,15 +210,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * In the 4.0 case, we should never get here; but we may
 		 * as well be forgiving and just succeed silently.
 		 */
-		goto out_put;
+		goto out_end;
 	dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
 	if (IS_ERR(dentry))
 		status = PTR_ERR(dentry);
-out_put:
-	if (!status)
-		dput(dentry);
-out_unlock:
-	inode_unlock(d_inode(dir));
+out_end:
+	end_creating(dentry, dir);
+out:
 	if (status == 0) {
 		if (nn->in_grace)
 			__nfsd4_create_reclaim_record_grace(clp, dname,
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 8f71f5748c75..ee1b16e921fd 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -306,18 +306,16 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 		goto done;
 	}
 
-	inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
-	dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len),
-			    dirfhp->fh_dentry);
+	dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry,
+				&QSTR_LEN(argp->name, argp->len));
 	if (IS_ERR(dchild)) {
 		resp->status = nfserrno(PTR_ERR(dchild));
-		goto out_unlock;
+		goto out_write;
 	}
 	fh_init(newfhp, NFS_FHSIZE);
 	resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
 	if (!resp->status && d_really_is_negative(dchild))
 		resp->status = nfserr_noent;
-	dput(dchild);
 	if (resp->status) {
 		if (resp->status != nfserr_noent)
 			goto out_unlock;
@@ -423,7 +421,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 	}
 
 out_unlock:
-	inode_unlock(dirfhp->fh_dentry->d_inode);
+	end_creating(dchild, dirfhp->fh_dentry);
+out_write:
 	fh_drop_write(dirfhp);
 done:
 	fh_put(dirfhp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9cb20d4aeab1..4efd3688e081 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1521,7 +1521,7 @@ nfsd_check_ignore_resizing(struct iattr *iap)
 		iap->ia_valid &= ~ATTR_SIZE;
 }
 
-/* The parent directory should already be locked: */
+/* The parent directory should already be locked - we will unlock */
 __be32
 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		   struct nfsd_attrs *attrs,
@@ -1587,8 +1587,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
 
 out:
-	if (!IS_ERR(dchild))
-		dput(dchild);
+	if (!err)
+		fh_fill_post_attrs(fhp);
+	end_creating(dchild, dentry);
 	return err;
 
 out_nfserr:
@@ -1626,28 +1627,26 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (host_err)
 		return nfserrno(host_err);
 
-	inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
-	dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+	dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
 	host_err = PTR_ERR(dchild);
-	if (IS_ERR(dchild)) {
-		err = nfserrno(host_err);
-		goto out_unlock;
-	}
+	if (IS_ERR(dchild))
+		return nfserrno(host_err);
+
 	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
 	/*
 	 * We unconditionally drop our ref to dchild as fh_compose will have
 	 * already grabbed its own ref for it.
 	 */
-	dput(dchild);
 	if (err)
 		goto out_unlock;
 	err = fh_fill_pre_attrs(fhp);
 	if (err != nfs_ok)
 		goto out_unlock;
 	err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
-	fh_fill_post_attrs(fhp);
+	return err;
+
 out_unlock:
-	inode_unlock(dentry->d_inode);
+	end_creating(dchild, dentry);
 	return err;
 }
 
@@ -1733,11 +1732,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
 	dentry = fhp->fh_dentry;
-	inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
-	dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+	dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
 	if (IS_ERR(dnew)) {
 		err = nfserrno(PTR_ERR(dnew));
-		inode_unlock(dentry->d_inode);
 		goto out_drop_write;
 	}
 	err = fh_fill_pre_attrs(fhp);
@@ -1750,11 +1747,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
 	fh_fill_post_attrs(fhp);
 out_unlock:
-	inode_unlock(dentry->d_inode);
+	end_creating(dnew, dentry);
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
-	dput(dnew);
-	if (err==0) err = cerr;
+	if (!err)
+		err = cerr;
 out_drop_write:
 	fh_drop_write(fhp);
 out:
@@ -1809,32 +1806,31 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 
 	ddir = ffhp->fh_dentry;
 	dirp = d_inode(ddir);
-	inode_lock_nested(dirp, I_MUTEX_PARENT);
+	dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len));
 
-	dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir);
 	if (IS_ERR(dnew)) {
 		host_err = PTR_ERR(dnew);
-		goto out_unlock;
+		goto out_drop_write;
 	}
 
 	dold = tfhp->fh_dentry;
 
 	err = nfserr_noent;
 	if (d_really_is_negative(dold))
-		goto out_dput;
+		goto out_unlock;
 	err = fh_fill_pre_attrs(ffhp);
 	if (err != nfs_ok)
-		goto out_dput;
+		goto out_unlock;
 	host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL);
 	fh_fill_post_attrs(ffhp);
-	inode_unlock(dirp);
+out_unlock:
+	end_creating(dnew, ddir);
 	if (!host_err) {
 		host_err = commit_metadata(ffhp);
 		if (!host_err)
 			host_err = commit_metadata(tfhp);
 	}
 
-	dput(dnew);
 out_drop_write:
 	fh_drop_write(tfhp);
 	if (host_err == -EBUSY) {
@@ -1849,12 +1845,6 @@ out_drop_write:
 	}
 out:
 	return err != nfs_ok ? err : nfserrno(host_err);
-
-out_dput:
-	dput(dnew);
-out_unlock:
-	inode_unlock(dirp);
-	goto out_drop_write;
 }
 
 static void
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index aac7e34f56c1..7a31ca9bdea2 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -613,9 +613,9 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
 	if (err)
 		goto out;
 
-	inode_lock_nested(udir, I_MUTEX_PARENT);
-	upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
-				 c->dentry->d_name.len);
+	upper = ovl_start_creating_upper(ofs, upperdir,
+					 &QSTR_LEN(c->dentry->d_name.name,
+						   c->dentry->d_name.len));
 	err = PTR_ERR(upper);
 	if (!IS_ERR(upper)) {
 		err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
@@ -626,9 +626,8 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
 			ovl_dentry_set_upper_alias(c->dentry);
 			ovl_dentry_update_reval(c->dentry, upper);
 		}
-		dput(upper);
+		end_creating(upper, upperdir);
 	}
-	inode_unlock(udir);
 	if (err)
 		goto out;
 
@@ -894,16 +893,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 	if (err)
 		goto out;
 
-	inode_lock_nested(udir, I_MUTEX_PARENT);
-
-	upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
-				 c->destname.len);
+	upper = ovl_start_creating_upper(ofs, c->destdir,
+					 &QSTR_LEN(c->destname.name,
+						   c->destname.len));
 	err = PTR_ERR(upper);
 	if (!IS_ERR(upper)) {
 		err = ovl_do_link(ofs, temp, udir, upper);
-		dput(upper);
+		end_creating(upper, c->destdir);
 	}
-	inode_unlock(udir);
 
 	if (err)
 		goto out;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index a5e9ddf3023b..f0728547f7d7 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -59,15 +59,21 @@ int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
 	return 0;
 }
 
-struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
+#define OVL_TEMPNAME_SIZE 20
+static void ovl_tempname(char name[OVL_TEMPNAME_SIZE])
 {
-	struct dentry *temp;
-	char name[20];
 	static atomic_t temp_id = ATOMIC_INIT(0);
 
 	/* counter is allowed to wrap, since temp dentries are ephemeral */
-	snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
+	snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id));
+}
+
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
+{
+	struct dentry *temp;
+	char name[OVL_TEMPNAME_SIZE];
 
+	ovl_tempname(name);
 	temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
 	if (!IS_ERR(temp) && temp->d_inode) {
 		pr_err("workdir/%s already exists\n", name);
@@ -78,48 +84,52 @@ struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
 	return temp;
 }
 
+static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs,
+					      struct dentry *workdir)
+{
+	char name[OVL_TEMPNAME_SIZE];
+
+	ovl_tempname(name);
+	return start_creating(ovl_upper_mnt_idmap(ofs), workdir,
+			      &QSTR(name));
+}
+
 static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
 {
 	int err;
-	struct dentry *whiteout;
+	struct dentry *whiteout, *link;
 	struct dentry *workdir = ofs->workdir;
 	struct inode *wdir = workdir->d_inode;
 
 	guard(mutex)(&ofs->whiteout_lock);
 
 	if (!ofs->whiteout) {
-		inode_lock_nested(wdir, I_MUTEX_PARENT);
-		whiteout = ovl_lookup_temp(ofs, workdir);
-		if (!IS_ERR(whiteout)) {
-			err = ovl_do_whiteout(ofs, wdir, whiteout);
-			if (err) {
-				dput(whiteout);
-				whiteout = ERR_PTR(err);
-			}
-		}
-		inode_unlock(wdir);
+		whiteout = ovl_start_creating_temp(ofs, workdir);
 		if (IS_ERR(whiteout))
 			return whiteout;
-		ofs->whiteout = whiteout;
+		err = ovl_do_whiteout(ofs, wdir, whiteout);
+		if (!err)
+			ofs->whiteout = dget(whiteout);
+		end_creating(whiteout, workdir);
+		if (err)
+			return ERR_PTR(err);
 	}
 
 	if (!ofs->no_shared_whiteout) {
-		inode_lock_nested(wdir, I_MUTEX_PARENT);
-		whiteout = ovl_lookup_temp(ofs, workdir);
-		if (!IS_ERR(whiteout)) {
-			err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
-			if (err) {
-				dput(whiteout);
-				whiteout = ERR_PTR(err);
-			}
-		}
-		inode_unlock(wdir);
-		if (!IS_ERR(whiteout))
-			return whiteout;
-		if (PTR_ERR(whiteout) != -EMLINK) {
-			pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n",
+		link = ovl_start_creating_temp(ofs, workdir);
+		if (IS_ERR(link))
+			return link;
+		err = ovl_do_link(ofs, ofs->whiteout, wdir, link);
+		if (!err)
+			whiteout = dget(link);
+		end_creating(link, workdir);
+		if (!err)
+			return whiteout;;
+
+		if (err != -EMLINK) {
+			pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%u)\n",
 				ofs->whiteout->d_inode->i_nlink,
-				PTR_ERR(whiteout));
+				err);
 			ofs->no_shared_whiteout = true;
 		}
 	}
@@ -252,10 +262,13 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 			       struct ovl_cattr *attr)
 {
 	struct dentry *ret;
-	inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT);
-	ret = ovl_create_real(ofs, workdir,
-			      ovl_lookup_temp(ofs, workdir), attr);
-	inode_unlock(workdir->d_inode);
+	ret = ovl_start_creating_temp(ofs, workdir);
+	if (IS_ERR(ret))
+		return ret;
+	ret = ovl_create_real(ofs, workdir, ret, attr);
+	if (!IS_ERR(ret))
+		dget(ret);
+	end_creating(ret, workdir);
 	return ret;
 }
 
@@ -354,18 +367,21 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 {
 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
 	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
-	struct inode *udir = upperdir->d_inode;
 	struct dentry *newdentry;
 	int err;
 
-	inode_lock_nested(udir, I_MUTEX_PARENT);
-	newdentry = ovl_create_real(ofs, upperdir,
-				    ovl_lookup_upper(ofs, dentry->d_name.name,
-						     upperdir, dentry->d_name.len),
-				    attr);
-	inode_unlock(udir);
+	newdentry = ovl_start_creating_upper(ofs, upperdir,
+					     &QSTR_LEN(dentry->d_name.name,
+						       dentry->d_name.len));
 	if (IS_ERR(newdentry))
 		return PTR_ERR(newdentry);
+	newdentry = ovl_create_real(ofs, upperdir, newdentry, attr);
+	if (IS_ERR(newdentry)) {
+		end_creating(newdentry, upperdir);
+		return PTR_ERR(newdentry);
+	}
+	dget(newdentry);
+	end_creating(newdentry, upperdir);
 
 	if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
 	    !ovl_allow_offline_changes(ofs)) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index c8fd5951fc5e..beeba96cfcb2 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -415,6 +415,14 @@ static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs,
 				   &QSTR_LEN(name, len), base);
 }
 
+static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs,
+						      struct dentry *parent,
+						      struct qstr *name)
+{
+	return start_creating(ovl_upper_mnt_idmap(ofs),
+			      parent, name);
+}
+
 static inline bool ovl_open_flags_need_copy_up(int flags)
 {
 	if (!flags)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 43ee4c7296a7..6e0816c1147a 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -310,8 +310,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
 	bool retried = false;
 
 retry:
-	inode_lock_nested(dir, I_MUTEX_PARENT);
-	work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
+	work = ovl_start_creating_upper(ofs, ofs->workbasedir, &QSTR(name));
 
 	if (!IS_ERR(work)) {
 		struct iattr attr = {
@@ -320,14 +319,13 @@ retry:
 		};
 
 		if (work->d_inode) {
+			dget(work);
+			end_creating(work, ofs->workbasedir);
+			if (persist)
+				return work;
 			err = -EEXIST;
-			inode_unlock(dir);
 			if (retried)
 				goto out_dput;
-
-			if (persist)
-				return work;
-
 			retried = true;
 			err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0);
 			dput(work);
@@ -338,7 +336,9 @@ retry:
 		}
 
 		work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
-		inode_unlock(dir);
+		if (!IS_ERR(work))
+			dget(work);
+		end_creating(work, ofs->workbasedir);
 		err = PTR_ERR(work);
 		if (IS_ERR(work))
 			goto out_err;
@@ -376,7 +376,6 @@ retry:
 		if (err)
 			goto out_dput;
 	} else {
-		inode_unlock(dir);
 		err = PTR_ERR(work);
 		goto out_err;
 	}
@@ -626,14 +625,17 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
 					   struct dentry *parent,
 					   const char *name, umode_t mode)
 {
-	size_t len = strlen(name);
 	struct dentry *child;
 
-	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
-	child = ovl_lookup_upper(ofs, name, parent, len);
-	if (!IS_ERR(child) && !child->d_inode)
-		child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode));
-	inode_unlock(parent->d_inode);
+	child = ovl_start_creating_upper(ofs, parent, &QSTR(name));
+	if (!IS_ERR(child)) {
+		if (!child->d_inode)
+			child = ovl_create_real(ofs, parent, child,
+						OVL_CATTR(mode));
+		if (!IS_ERR(child))
+			dget(child);
+		end_creating(child, parent);
+	}
 	dput(parent);
 
 	return child;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index fed86221c69c..3f92c1a16878 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -88,6 +88,39 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
 					    struct qstr *name,
 					    struct dentry *base);
 
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name);
+
+/**
+ * end_creating - finish action started with start_creating
+ * @child:  dentry returned by start_creating() or vfs_mkdir()
+ * @parent: dentry given to start_creating(),
+ *
+ * Unlock and release the child.
+ *
+ * Unlike end_dirop() this can only be called if start_creating() succeeded.
+ * It handles @child being an error as vfs_mkdir() might have converted the
+ * dentry to an error - in that case the parent still needs to be unlocked.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.  Even if vfs_mkdir() returns an error
+ * it must be given to end_creating().
+ *
+ * If vfs_mkdir() was not called, then @child will be a valid dentry and
+ * @parent will be ignored.
+ */
+static inline void end_creating(struct dentry *child, struct dentry *parent)
+{
+	if (IS_ERR(child))
+		/* The parent is still locked despite the error from
+		 * vfs_mkdir() - must unlock it.
+		 */
+		inode_unlock(parent->d_inode);
+	else
+		end_dirop(child);
+}
+
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *path, unsigned int flags);
 extern int follow_up(struct path *);
-- 
cgit v1.2.3


From bd6ede8a06e89ca5a94a8b51cea792705d1b8ca2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:28 +1100
Subject: VFS/nfsd/cachefiles/ovl: introduce start_removing() and
 end_removing()

start_removing() is similar to start_creating() but will only return a
positive dentry with the expectation that it will be removed.  This is
used by nfsd, cachefiles, and overlayfs.  They are changed to also use
end_removing() to terminate the action begun by start_removing().  This
is a simple alias for end_dirop().

Apart from changes to the error paths, as we no longer need to unlock on
a lookup error, an effect on callers is that they don't need to test if
the found dentry is positive or negative - they can be sure it is
positive.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-6-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/namei.c    | 32 ++++++++++++++------------------
 fs/namei.c               | 27 +++++++++++++++++++++++++++
 fs/nfsd/nfs4recover.c    | 18 +++++-------------
 fs/nfsd/vfs.c            | 26 ++++++++++----------------
 fs/overlayfs/dir.c       | 15 +++++++--------
 fs/overlayfs/overlayfs.h |  8 ++++++++
 include/linux/namei.h    | 18 ++++++++++++++++++
 7 files changed, 89 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 0a136eb434da..c7f0c6ab9b88 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -260,6 +260,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache,
  * - File backed objects are unlinked
  * - Directory backed objects are stuffed into the graveyard for userspace to
  *   delete
+ * On entry dir must be locked.  It will be unlocked on exit.
  */
 int cachefiles_bury_object(struct cachefiles_cache *cache,
 			   struct cachefiles_object *object,
@@ -274,28 +275,30 @@ int cachefiles_bury_object(struct cachefiles_cache *cache,
 
 	_enter(",'%pd','%pd'", dir, rep);
 
+	/* end_removing() will dput() @rep but we need to keep
+	 * a ref, so take one now.  This also stops the dentry
+	 * being negated when unlinked which we need.
+	 */
+	dget(rep);
+
 	if (rep->d_parent != dir) {
-		inode_unlock(d_inode(dir));
+		end_removing(rep);
 		_leave(" = -ESTALE");
 		return -ESTALE;
 	}
 
 	/* non-directories can just be unlinked */
 	if (!d_is_dir(rep)) {
-		dget(rep); /* Stop the dentry being negated if it's only pinned
-			    * by a file struct.
-			    */
 		ret = cachefiles_unlink(cache, object, dir, rep, why);
-		dput(rep);
+		end_removing(rep);
 
-		inode_unlock(d_inode(dir));
 		_leave(" = %d", ret);
 		return ret;
 	}
 
 	/* directories have to be moved to the graveyard */
 	_debug("move stale object to graveyard");
-	inode_unlock(d_inode(dir));
+	end_removing(rep);
 
 try_again:
 	/* first step is to make up a grave dentry in the graveyard */
@@ -749,26 +752,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
 	struct dentry *victim;
 	int ret = -ENOENT;
 
-	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+	victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename));
 
-	victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir);
 	if (IS_ERR(victim))
 		goto lookup_error;
-	if (d_is_negative(victim))
-		goto lookup_put;
 	if (d_inode(victim)->i_flags & S_KERNEL_FILE)
 		goto lookup_busy;
 	return victim;
 
 lookup_busy:
 	ret = -EBUSY;
-lookup_put:
-	inode_unlock(d_inode(dir));
-	dput(victim);
+	end_removing(victim);
 	return ERR_PTR(ret);
 
 lookup_error:
-	inode_unlock(d_inode(dir));
 	ret = PTR_ERR(victim);
 	if (ret == -ENOENT)
 		return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */
@@ -816,18 +813,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
 
 	ret = cachefiles_bury_object(cache, NULL, dir, victim,
 				     FSCACHE_OBJECT_WAS_CULLED);
+	dput(victim);
 	if (ret < 0)
 		goto error;
 
 	fscache_count_culled();
-	dput(victim);
 	_leave(" = 0");
 	return 0;
 
 error_unlock:
-	inode_unlock(d_inode(dir));
+	end_removing(victim);
 error:
-	dput(victim);
 	if (ret == -ENOENT)
 		return -ESTALE; /* Probably got retired by the netfs */
 
diff --git a/fs/namei.c b/fs/namei.c
index 9972b0257a4c..ae833dfa277c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3248,6 +3248,33 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
 }
 EXPORT_SYMBOL(start_creating);
 
+/**
+ * start_removing - prepare to remove a given name with permission checking
+ * @idmap:  idmap of the mount
+ * @parent: directory in which to find the name
+ * @name:   the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory.  Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing);
+
 #ifdef CONFIG_UNIX98_PTYS
 int path_pts(struct path *path)
 {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c247a7c3291c..3eefaa2202e3 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -324,20 +324,12 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn)
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name);
 
 	dir = nn->rec_file->f_path.dentry;
-	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
-	dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir);
-	if (IS_ERR(dentry)) {
-		status = PTR_ERR(dentry);
-		goto out_unlock;
-	}
-	status = -ENOENT;
-	if (d_really_is_negative(dentry))
-		goto out;
+	dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name));
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
 	status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry);
-out:
-	dput(dentry);
-out_unlock:
-	inode_unlock(d_inode(dir));
+	end_removing(dentry);
 	return status;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4efd3688e081..cd64ffe12e0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2044,7 +2044,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 {
 	struct dentry	*dentry, *rdentry;
 	struct inode	*dirp;
-	struct inode	*rinode;
+	struct inode	*rinode = NULL;
 	__be32		err;
 	int		host_err;
 
@@ -2063,24 +2063,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	dentry = fhp->fh_dentry;
 	dirp = d_inode(dentry);
-	inode_lock_nested(dirp, I_MUTEX_PARENT);
 
-	rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+	rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
+
 	host_err = PTR_ERR(rdentry);
 	if (IS_ERR(rdentry))
-		goto out_unlock;
+		goto out_drop_write;
 
-	if (d_really_is_negative(rdentry)) {
-		dput(rdentry);
-		host_err = -ENOENT;
-		goto out_unlock;
-	}
-	rinode = d_inode(rdentry);
 	err = fh_fill_pre_attrs(fhp);
 	if (err != nfs_ok)
 		goto out_unlock;
 
+	rinode = d_inode(rdentry);
+	/* Prevent truncation until after locks dropped */
 	ihold(rinode);
+
 	if (!type)
 		type = d_inode(rdentry)->i_mode & S_IFMT;
 
@@ -2102,10 +2099,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	}
 	fh_fill_post_attrs(fhp);
 
-	inode_unlock(dirp);
-	if (!host_err)
+out_unlock:
+	end_removing(rdentry);
+	if (!err && !host_err)
 		host_err = commit_metadata(fhp);
-	dput(rdentry);
 	iput(rinode);    /* truncate the inode here */
 
 out_drop_write:
@@ -2123,9 +2120,6 @@ out_nfserr:
 	}
 out:
 	return err != nfs_ok ? err : nfserrno(host_err);
-out_unlock:
-	inode_unlock(dirp);
-	goto out_drop_write;
 }
 
 /*
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index f0728547f7d7..f0b6e2e7c9d4 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -866,17 +866,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
 			goto out;
 	}
 
-	inode_lock_nested(dir, I_MUTEX_PARENT);
-	upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
-				 dentry->d_name.len);
+	upper = ovl_start_removing_upper(ofs, upperdir,
+					 &QSTR_LEN(dentry->d_name.name,
+						   dentry->d_name.len));
 	err = PTR_ERR(upper);
 	if (IS_ERR(upper))
-		goto out_unlock;
+		goto out_dput;
 
 	err = -ESTALE;
 	if ((opaquedir && upper != opaquedir) ||
 	    (!opaquedir && !ovl_matches_upper(dentry, upper)))
-		goto out_dput_upper;
+		goto out_unlock;
 
 	if (is_dir)
 		err = ovl_do_rmdir(ofs, dir, upper);
@@ -892,10 +892,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
 	 */
 	if (!err)
 		d_drop(dentry);
-out_dput_upper:
-	dput(upper);
 out_unlock:
-	inode_unlock(dir);
+	end_removing(upper);
+out_dput:
 	dput(opaquedir);
 out:
 	return err;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index beeba96cfcb2..49ad65f829dc 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -423,6 +423,14 @@ static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs,
 			      parent, name);
 }
 
+static inline struct dentry *ovl_start_removing_upper(struct ovl_fs *ofs,
+						      struct dentry *parent,
+						      struct qstr *name)
+{
+	return start_removing(ovl_upper_mnt_idmap(ofs),
+			      parent, name);
+}
+
 static inline bool ovl_open_flags_need_copy_up(int flags)
 {
 	if (!flags)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 3f92c1a16878..9ee76e88f3dd 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -90,6 +90,8 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
 
 struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name);
 
 /**
  * end_creating - finish action started with start_creating
@@ -121,6 +123,22 @@ static inline void end_creating(struct dentry *child, struct dentry *parent)
 		end_dirop(child);
 }
 
+/**
+ * end_removing - finish action started with start_removing
+ * @child:  dentry returned by start_removing()
+ * @parent: dentry given to start_removing()
+ *
+ * Unlock and release the child.
+ *
+ * This is identical to end_dirop().  It can be passed the result of
+ * start_removing() whether that was successful or not, but it is not needed
+ * if start_removing() failed.
+ */
+static inline void end_removing(struct dentry *child)
+{
+	end_dirop(child);
+}
+
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *path, unsigned int flags);
 extern int follow_up(struct path *);
-- 
cgit v1.2.3


From c9ba789dad15ba65662bba17595c0aeaa0cfcf1c Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:29 +1100
Subject: VFS: introduce start_creating_noperm() and start_removing_noperm()

xfs, fuse, ipc/mqueue need variants of start_creating or start_removing
which do not check permissions.
This patch adds _noperm versions of these functions.

Note that do_mq_open() was only calling mntget() so it could call
path_put() - it didn't really need an extra reference on the mnt.
Now it doesn't call mntget() and uses end_creating() which does
the dput() half of path_put().

Also mq_unlink() previously passed
   d_inode(dentry->d_parent)
as the dir inode to vfs_unlink().  This is after locking
   d_inode(mnt->mnt_root)
These two inodes are the same, but normally calls use the textual
parent.
So I've changed the vfs_unlink() call to be given d_inode(mnt->mnt_root).

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>

--
changes since v2:
 - dir arg passed to vfs_unlink() in mq_unlink() changed to match
   the dir passed to lookup_noperm()
 - restore assignment to path->mnt even though the mntget() is removed.

Link: https://patch.msgid.link/20251113002050.676694-7-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fuse/dir.c            | 19 ++++++++-----------
 fs/namei.c               | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/orphanage.c | 11 ++++-------
 include/linux/namei.h    |  2 ++
 ipc/mqueue.c             | 32 ++++++++++++--------------------
 5 files changed, 74 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ecaec0fea3a1..40ca94922349 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1397,27 +1397,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
 	if (!parent)
 		return -ENOENT;
 
-	inode_lock_nested(parent, I_MUTEX_PARENT);
 	if (!S_ISDIR(parent->i_mode))
-		goto unlock;
+		goto put_parent;
 
 	err = -ENOENT;
 	dir = d_find_alias(parent);
 	if (!dir)
-		goto unlock;
+		goto put_parent;
 
-	name->hash = full_name_hash(dir, name->name, name->len);
-	entry = d_lookup(dir, name);
+	entry = start_removing_noperm(dir, name);
 	dput(dir);
-	if (!entry)
-		goto unlock;
+	if (IS_ERR(entry))
+		goto put_parent;
 
 	fuse_dir_changed(parent);
 	if (!(flags & FUSE_EXPIRE_ONLY))
 		d_invalidate(entry);
 	fuse_invalidate_entry_cache(entry);
 
-	if (child_nodeid != 0 && d_really_is_positive(entry)) {
+	if (child_nodeid != 0) {
 		inode_lock(d_inode(entry));
 		if (get_node_id(d_inode(entry)) != child_nodeid) {
 			err = -ENOENT;
@@ -1445,10 +1443,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
 	} else {
 		err = 0;
 	}
-	dput(entry);
 
- unlock:
-	inode_unlock(parent);
+	end_removing(entry);
+ put_parent:
 	iput(parent);
 	return err;
 }
diff --git a/fs/namei.c b/fs/namei.c
index ae833dfa277c..696e4b794416 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3275,6 +3275,54 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
 }
 EXPORT_SYMBOL(start_removing);
 
+/**
+ * start_creating_noperm - prepare to create a given name without permission checking
+ * @parent: directory in which to prepare to create the name
+ * @name:   the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating_noperm(struct dentry *parent,
+				     struct qstr *name)
+{
+	int err = lookup_noperm_common(name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating_noperm);
+
+/**
+ * start_removing_noperm - prepare to remove a given name without permission checking
+ * @parent: directory in which to find the name
+ * @name:   the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_noperm(struct dentry *parent,
+				     struct qstr *name)
+{
+	int err = lookup_noperm_common(name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing_noperm);
+
 #ifdef CONFIG_UNIX98_PTYS
 int path_pts(struct path *path)
 {
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 9c12cb844231..e732605924a1 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -152,11 +152,10 @@ xrep_orphanage_create(
 	}
 
 	/* Try to find the orphanage directory. */
-	inode_lock_nested(root_inode, I_MUTEX_PARENT);
-	orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
+	orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE));
 	if (IS_ERR(orphanage_dentry)) {
 		error = PTR_ERR(orphanage_dentry);
-		goto out_unlock_root;
+		goto out_dput_root;
 	}
 
 	/*
@@ -170,7 +169,7 @@ xrep_orphanage_create(
 					     orphanage_dentry, 0750);
 		error = PTR_ERR(orphanage_dentry);
 		if (IS_ERR(orphanage_dentry))
-			goto out_unlock_root;
+			goto out_dput_orphanage;
 	}
 
 	/* Not a directory? Bail out. */
@@ -200,9 +199,7 @@ xrep_orphanage_create(
 	sc->orphanage_ilock_flags = 0;
 
 out_dput_orphanage:
-	dput(orphanage_dentry);
-out_unlock_root:
-	inode_unlock(VFS_I(sc->mp->m_rootip));
+	end_creating(orphanage_dentry, root_dentry);
 out_dput_root:
 	dput(root_dentry);
 out:
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9ee76e88f3dd..688e157d6afc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,8 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
 struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
+struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
 
 /**
  * end_creating - finish action started with start_creating
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 093551fe66a7..6d7610310003 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -913,13 +913,12 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
 		goto out_putname;
 
 	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
-	inode_lock(d_inode(root));
-	path.dentry = lookup_noperm(&QSTR(name->name), root);
+	path.dentry = start_creating_noperm(root, &QSTR(name->name));
 	if (IS_ERR(path.dentry)) {
 		error = PTR_ERR(path.dentry);
 		goto out_putfd;
 	}
-	path.mnt = mntget(mnt);
+	path.mnt = mnt;
 	error = prepare_open(path.dentry, oflag, ro, mode, name, attr);
 	if (!error) {
 		struct file *file = dentry_open(&path, oflag, current_cred());
@@ -928,13 +927,12 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
 		else
 			error = PTR_ERR(file);
 	}
-	path_put(&path);
 out_putfd:
 	if (error) {
 		put_unused_fd(fd);
 		fd = error;
 	}
-	inode_unlock(d_inode(root));
+	end_creating(path.dentry, root);
 	if (!ro)
 		mnt_drop_write(mnt);
 out_putname:
@@ -957,7 +955,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 	int err;
 	struct filename *name;
 	struct dentry *dentry;
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
 	struct vfsmount *mnt = ipc_ns->mq_mnt;
 
@@ -969,26 +967,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 	err = mnt_want_write(mnt);
 	if (err)
 		goto out_name;
-	inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
-	dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root);
+	dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
-		goto out_unlock;
+		goto out_drop_write;
 	}
 
 	inode = d_inode(dentry);
-	if (!inode) {
-		err = -ENOENT;
-	} else {
-		ihold(inode);
-		err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent),
-				 dentry, NULL);
-	}
-	dput(dentry);
-
-out_unlock:
-	inode_unlock(d_inode(mnt->mnt_root));
+	ihold(inode);
+	err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root),
+			 dentry, NULL);
+	end_removing(dentry);
 	iput(inode);
+
+out_drop_write:
 	mnt_drop_write(mnt);
 out_name:
 	putname(name);
-- 
cgit v1.2.3


From 7bb1eb45e43c4730cbc5a48b9e9295049fccdacb Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:31 +1100
Subject: VFS: introduce start_removing_dentry()

start_removing_dentry() is similar to start_removing() but instead of
providing a name for lookup, the target dentry is given.

start_removing_dentry() checks that the dentry is still hashed and in
the parent, and if so it locks and increases the refcount so that
end_removing() can be used to finish the operation.

This is used in cachefiles, overlayfs, smb/server, and apparmor.

There will be other users including ecryptfs.

As start_removing_dentry() takes an extra reference to the dentry (to be
put by end_removing()), there is no need to explicitly take an extra
reference to stop d_delete() from using dentry_unlink_inode() to negate
the dentry - as in cachefiles_delete_object(), and ksmbd_vfs_unlink().

cachefiles_bury_object() now gets an extra ref to the victim, which it
drops.  As it includes the needed end_removing() calls, the caller
doesn't need them.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-9-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/interface.c      | 11 +++++++----
 fs/cachefiles/namei.c          | 30 ++++++++++++++----------------
 fs/cachefiles/volume.c         |  9 ++++++---
 fs/namei.c                     | 33 +++++++++++++++++++++++++++++++++
 fs/overlayfs/dir.c             | 10 ++++------
 fs/overlayfs/readdir.c         |  8 ++++----
 fs/smb/server/vfs.c            | 27 ++++-----------------------
 include/linux/namei.h          |  2 ++
 security/apparmor/apparmorfs.c |  8 ++++----
 9 files changed, 78 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 3e63cfe15874..a08250d244ea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
 #include <linux/mount.h>
 #include <linux/xattr.h>
 #include <linux/file.h>
+#include <linux/namei.h>
 #include <linux/falloc.h>
 #include <trace/events/fscache.h>
 #include "internal.h"
@@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie)
 		if (!old_tmpfile) {
 			struct cachefiles_volume *volume = object->volume;
 			struct dentry *fan = volume->fanout[(u8)cookie->key_hash];
+			struct dentry *obj;
 
-			inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
-			cachefiles_bury_object(volume->cache, object, fan,
-					       old_file->f_path.dentry,
-					       FSCACHE_OBJECT_INVALIDATED);
+			obj = start_removing_dentry(fan, old_file->f_path.dentry);
+			if (!IS_ERR(obj))
+				cachefiles_bury_object(volume->cache, object,
+						       fan, obj,
+						       FSCACHE_OBJECT_INVALIDATED);
 		}
 		fput(old_file);
 	}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index c7f0c6ab9b88..0104ac00485d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -261,6 +261,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache,
  * - Directory backed objects are stuffed into the graveyard for userspace to
  *   delete
  * On entry dir must be locked.  It will be unlocked on exit.
+ * On entry there must be at least 2 refs on rep, one will be dropped on exit.
  */
 int cachefiles_bury_object(struct cachefiles_cache *cache,
 			   struct cachefiles_object *object,
@@ -275,12 +276,6 @@ int cachefiles_bury_object(struct cachefiles_cache *cache,
 
 	_enter(",'%pd','%pd'", dir, rep);
 
-	/* end_removing() will dput() @rep but we need to keep
-	 * a ref, so take one now.  This also stops the dentry
-	 * being negated when unlinked which we need.
-	 */
-	dget(rep);
-
 	if (rep->d_parent != dir) {
 		end_removing(rep);
 		_leave(" = -ESTALE");
@@ -425,13 +420,12 @@ int cachefiles_delete_object(struct cachefiles_object *object,
 
 	_enter(",OBJ%x{%pD}", object->debug_id, object->file);
 
-	/* Stop the dentry being negated if it's only pinned by a file struct. */
-	dget(dentry);
-
-	inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT);
-	ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
-	inode_unlock(d_backing_inode(fan));
-	dput(dentry);
+	dentry = start_removing_dentry(fan, dentry);
+	if (IS_ERR(dentry))
+		ret = PTR_ERR(dentry);
+	else
+		ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
+	end_removing(dentry);
 	return ret;
 }
 
@@ -644,9 +638,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
 
 	if (!d_is_reg(dentry)) {
 		pr_err("%pd is not a file\n", dentry);
-		inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
-		ret = cachefiles_bury_object(volume->cache, object, fan, dentry,
-					     FSCACHE_OBJECT_IS_WEIRD);
+		struct dentry *de = start_removing_dentry(fan, dentry);
+		if (IS_ERR(de))
+			ret = PTR_ERR(de);
+		else
+			ret = cachefiles_bury_object(volume->cache, object,
+						     fan, de,
+						     FSCACHE_OBJECT_IS_WEIRD);
 		dput(dentry);
 		if (ret < 0)
 			return false;
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index 781aac4ef274..90ba926f488e 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/namei.h>
 #include "internal.h"
 #include <trace/events/fscache.h>
 
@@ -58,9 +59,11 @@ retry:
 		if (ret < 0) {
 			if (ret != -ESTALE)
 				goto error_dir;
-			inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT);
-			cachefiles_bury_object(cache, NULL, cache->store, vdentry,
-					       FSCACHE_VOLUME_IS_WEIRD);
+			vdentry = start_removing_dentry(cache->store, vdentry);
+			if (!IS_ERR(vdentry))
+				cachefiles_bury_object(cache, NULL, cache->store,
+						       vdentry,
+						       FSCACHE_VOLUME_IS_WEIRD);
 			cachefiles_put_directory(volume->dentry);
 			cond_resched();
 			goto retry;
diff --git a/fs/namei.c b/fs/namei.c
index 696e4b794416..bfc443bec8a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3323,6 +3323,39 @@ struct dentry *start_removing_noperm(struct dentry *parent,
 }
 EXPORT_SYMBOL(start_removing_noperm);
 
+/**
+ * start_removing_dentry - prepare to remove a given dentry
+ * @parent: directory from which dentry should be removed
+ * @child:  the dentry to be removed
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and positive, a reference is taken and
+ * returned.  If not, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_removing_dentry(struct dentry *parent,
+				     struct dentry *child)
+{
+	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+	if (unlikely(IS_DEADDIR(parent->d_inode) ||
+		     child->d_parent != parent ||
+		     d_unhashed(child))) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EINVAL);
+	}
+	if (d_is_negative(child)) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-ENOENT);
+	}
+	return dget(child);
+}
+EXPORT_SYMBOL(start_removing_dentry);
+
 #ifdef CONFIG_UNIX98_PTYS
 int path_pts(struct path *path)
 {
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index f0b6e2e7c9d4..82b6ff0ab2d3 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -47,14 +47,12 @@ static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir,
 int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
 		struct dentry *wdentry)
 {
-	int err;
-
-	err = ovl_parent_lock(workdir, wdentry);
-	if (err)
-		return err;
+	wdentry = start_removing_dentry(workdir, wdentry);
+	if (IS_ERR(wdentry))
+		return PTR_ERR(wdentry);
 
 	ovl_cleanup_locked(ofs, workdir->d_inode, wdentry);
-	ovl_parent_unlock(workdir);
+	end_removing(wdentry);
 
 	return 0;
 }
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 1e9792cc557b..77ecc39fc33a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -1242,11 +1242,11 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
 	if (!d_is_dir(dentry) || level > 1)
 		return ovl_cleanup(ofs, parent, dentry);
 
-	err = ovl_parent_lock(parent, dentry);
-	if (err)
-		return err;
+	dentry = start_removing_dentry(parent, dentry);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 	err = ovl_do_rmdir(ofs, parent->d_inode, dentry);
-	ovl_parent_unlock(parent);
+	end_removing(dentry);
 	if (err) {
 		struct path path = { .mnt = mnt, .dentry = dentry };
 
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index ea0a06b0ae44..148c65d59e42 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -49,24 +49,6 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
 	i_uid_write(inode, i_uid_read(parent_inode));
 }
 
-/**
- * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
- * @parent: parent dentry
- * @child: child dentry
- *
- * Returns: %0 on success, %-ENOENT if the parent dentry is not stable
- */
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
-{
-	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
-	if (child->d_parent != parent) {
-		inode_unlock(d_inode(parent));
-		return -ENOENT;
-	}
-
-	return 0;
-}
-
 static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
 				 char *pathname, unsigned int flags,
 				 struct path *path, bool for_remove)
@@ -1082,18 +1064,17 @@ int ksmbd_vfs_unlink(struct file *filp)
 		return err;
 
 	dir = dget_parent(dentry);
-	err = ksmbd_vfs_lock_parent(dir, dentry);
-	if (err)
+	dentry = start_removing_dentry(dir, dentry);
+	err = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
 		goto out;
-	dget(dentry);
 
 	if (S_ISDIR(d_inode(dentry)->i_mode))
 		err = vfs_rmdir(idmap, d_inode(dir), dentry);
 	else
 		err = vfs_unlink(idmap, d_inode(dir), dentry, NULL);
 
-	dput(dentry);
-	inode_unlock(d_inode(dir));
+	end_removing(dentry);
 	if (err)
 		ksmbd_debug(VFS, "failed to delete, err %d\n", err);
 out:
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 688e157d6afc..7e916e9d7726 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -94,6 +94,8 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
 struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
 struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_removing_dentry(struct dentry *parent,
+				     struct dentry *child);
 
 /**
  * end_creating - finish action started with start_creating
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 391a586d0557..9d08d103f142 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -355,17 +355,17 @@ static void aafs_remove(struct dentry *dentry)
 	if (!dentry || IS_ERR(dentry))
 		return;
 
+	/* ->d_parent is stable as rename is not supported */
 	dir = d_inode(dentry->d_parent);
-	inode_lock(dir);
-	if (simple_positive(dentry)) {
+	dentry = start_removing_dentry(dentry->d_parent, dentry);
+	if (!IS_ERR(dentry) && simple_positive(dentry)) {
 		if (d_is_dir(dentry))
 			simple_rmdir(dir, dentry);
 		else
 			simple_unlink(dir, dentry);
 		d_delete(dentry);
-		dput(dentry);
 	}
-	inode_unlock(dir);
+	end_removing(dentry);
 	simple_release_fs(&aafs_mnt, &aafs_count);
 }
 
-- 
cgit v1.2.3


From ff7c4ea11a05c886f018fff4a4d4f4d68d951e25 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:32 +1100
Subject: VFS: add start_creating_killable() and start_removing_killable()

These are similar to start_creating() and start_removing(), but allow a
fatal signal to abort waiting for the lock.

They are used in btrfs for subvol creation and removal.

btrfs_may_create() no longer needs IS_DEADDIR() as
start_creating_killable() includes that check.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-10-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/btrfs/ioctl.c      | 41 ++++++++------------------
 fs/namei.c            | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/namei.h |  6 ++++
 3 files changed, 95 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 185bef0df1c2..4fbfdd8faf6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -904,14 +904,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent,
 	struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
 	int ret;
 
-	ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
-	if (ret == -EINTR)
-		return ret;
-
-	dentry = lookup_one(idmap, qname, parent);
-	ret = PTR_ERR(dentry);
+	dentry = start_creating_killable(idmap, parent, qname);
 	if (IS_ERR(dentry))
-		goto out_unlock;
+		return PTR_ERR(dentry);
 
 	ret = btrfs_may_create(idmap, dir, dentry);
 	if (ret)
@@ -940,9 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent,
 out_up_read:
 	up_read(&fs_info->subvol_sem);
 out_dput:
-	dput(dentry);
-out_unlock:
-	btrfs_inode_unlock(BTRFS_I(dir), 0);
+	end_creating(dentry, parent);
 	return ret;
 }
 
@@ -2417,18 +2410,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto free_subvol_name;
 	}
 
-	ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
-	if (ret == -EINTR)
-		goto free_subvol_name;
-	dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
+	dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name));
 	if (IS_ERR(dentry)) {
 		ret = PTR_ERR(dentry);
-		goto out_unlock_dir;
-	}
-
-	if (d_really_is_negative(dentry)) {
-		ret = -ENOENT;
-		goto out_dput;
+		goto out_end_removing;
 	}
 
 	inode = d_inode(dentry);
@@ -2449,7 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		 */
 		ret = -EPERM;
 		if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
-			goto out_dput;
+			goto out_end_removing;
 
 		/*
 		 * Do not allow deletion if the parent dir is the same
@@ -2460,21 +2445,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		 */
 		ret = -EINVAL;
 		if (root == dest)
-			goto out_dput;
+			goto out_end_removing;
 
 		ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
 		if (ret)
-			goto out_dput;
+			goto out_end_removing;
 	}
 
 	/* check if subvolume may be deleted by a user */
 	ret = btrfs_may_delete(idmap, dir, dentry, 1);
 	if (ret)
-		goto out_dput;
+		goto out_end_removing;
 
 	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
 		ret = -EINVAL;
-		goto out_dput;
+		goto out_end_removing;
 	}
 
 	btrfs_inode_lock(BTRFS_I(inode), 0);
@@ -2483,10 +2468,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (!ret)
 		d_delete_notify(dir, dentry);
 
-out_dput:
-	dput(dentry);
-out_unlock_dir:
-	btrfs_inode_unlock(BTRFS_I(dir), 0);
+out_end_removing:
+	end_removing(dentry);
 free_subvol_name:
 	kfree(subvol_name_ptr);
 free_parent:
diff --git a/fs/namei.c b/fs/namei.c
index bfc443bec8a9..04d2819bd351 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2778,19 +2778,33 @@ static int filename_parentat(int dfd, struct filename *name,
  * Returns: a locked dentry, or an error.
  *
  */
-struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
-			   unsigned int lookup_flags)
+static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
+				    unsigned int lookup_flags,
+				    unsigned int state)
 {
 	struct dentry *dentry;
 	struct inode *dir = d_inode(parent);
 
-	inode_lock_nested(dir, I_MUTEX_PARENT);
+	if (state == TASK_KILLABLE) {
+		int ret = down_write_killable_nested(&dir->i_rwsem,
+						     I_MUTEX_PARENT);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		inode_lock_nested(dir, I_MUTEX_PARENT);
+	}
 	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
 	if (IS_ERR(dentry))
 		inode_unlock(dir);
 	return dentry;
 }
 
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+			   unsigned int lookup_flags)
+{
+	return __start_dirop(parent, name, lookup_flags, TASK_NORMAL);
+}
+
 /**
  * end_dirop - signal completion of a dirop
  * @de: the dentry which was returned by start_dirop or similar.
@@ -3275,6 +3289,66 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
 }
 EXPORT_SYMBOL(start_removing);
 
+/**
+ * start_creating_killable - prepare to create a given name with permission checking
+ * @idmap:  idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name:   the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory.  Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_creating_killable);
+
+/**
+ * start_removing_killable - prepare to remove a given name with permission checking
+ * @idmap:  idmap of the mount
+ * @parent: directory in which to find the name
+ * @name:   the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory.  Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return __start_dirop(parent, name, 0, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_removing_killable);
+
 /**
  * start_creating_noperm - prepare to create a given name without permission checking
  * @parent: directory in which to prepare to create the name
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 7e916e9d7726..e5cff89679df 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,12 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
 struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
 			      struct qstr *name);
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name);
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name);
 struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
 struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
 struct dentry *start_removing_dentry(struct dentry *parent,
-- 
cgit v1.2.3


From 5c8752729970cc2323ba86817254749f7f21f163 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:33 +1100
Subject: VFS/nfsd/ovl: introduce start_renaming() and end_renaming()

start_renaming() combines name lookup and locking to prepare for rename.
It is used when two names need to be looked up as in nfsd and overlayfs -
cases where one or both dentries are already available will be handled
separately.

__start_renaming() avoids the inode_permission check and hash
calculation and is suitable after filename_parentat() in do_renameat2().
It subsumes quite a bit of code from that function.

start_renaming() does calculate the hash and check X permission and is
suitable elsewhere:
- nfsd_rename()
- ovl_rename()

In ovl, ovl_do_rename_rd() is factored out of ovl_do_rename(), which
itself will be gone by the end of the series.

Acked-by: Chuck Lever <chuck.lever@oracle.com> (for nfsd parts)
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: NeilBrown <neil@brown.name>

--
Changes since v3:
 - added missing dput() in ovl_rename when "whiteout" is non-NULL.

Changes since v2:
 - in __start_renaming() some labels have been renamed, and err
   is always set before a "goto out_foo" rather than passing the
   error in a dentry*.
 - ovl_do_rename() changed to call the new ovl_do_rename_rd() rather
   than keeping duplicate code
 - code around ovl_cleanup() call in ovl_rename() restructured.

Link: https://patch.msgid.link/20251113002050.676694-11-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c               | 197 ++++++++++++++++++++++++++++++++++-------------
 fs/nfsd/vfs.c            |  73 ++++++------------
 fs/overlayfs/dir.c       |  74 ++++++++----------
 fs/overlayfs/overlayfs.h |  23 ++++--
 include/linux/namei.h    |   3 +
 5 files changed, 218 insertions(+), 152 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index 04d2819bd351..0ee0a110b088 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3667,6 +3667,129 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
+/**
+ * __start_renaming - lookup and lock names for rename
+ * @rd:           rename data containing parent and flags, and
+ *                for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ * @old_last:     name of object in @rd.old_parent
+ * @new_last:     name of object in @rd.new_parent
+ *
+ * Look up two names and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentries are stored in @rd.old_dentry,
+ * @rd.new_dentry.  These references and the lock are dropped by
+ * end_renaming().
+ *
+ * The passed in qstrs must have the hash calculated, and no permission
+ * checking is performed.
+ *
+ * Returns: zero or an error.
+ */
+static int
+__start_renaming(struct renamedata *rd, int lookup_flags,
+		 struct qstr *old_last, struct qstr *new_last)
+{
+	struct dentry *trap;
+	struct dentry *d1, *d2;
+	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	int err;
+
+	if (rd->flags & RENAME_EXCHANGE)
+		target_flags = 0;
+	if (rd->flags & RENAME_NOREPLACE)
+		target_flags |= LOOKUP_EXCL;
+
+	trap = lock_rename(rd->old_parent, rd->new_parent);
+	if (IS_ERR(trap))
+		return PTR_ERR(trap);
+
+	d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
+				  lookup_flags);
+	err = PTR_ERR(d1);
+	if (IS_ERR(d1))
+		goto out_unlock;
+
+	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
+				  lookup_flags | target_flags);
+	err = PTR_ERR(d2);
+	if (IS_ERR(d2))
+		goto out_dput_d1;
+
+	if (d1 == trap) {
+		/* source is an ancestor of target */
+		err = -EINVAL;
+		goto out_dput_d2;
+	}
+
+	if (d2 == trap) {
+		/* target is an ancestor of source */
+		if (rd->flags & RENAME_EXCHANGE)
+			err = -EINVAL;
+		else
+			err = -ENOTEMPTY;
+		goto out_dput_d2;
+	}
+
+	rd->old_dentry = d1;
+	rd->new_dentry = d2;
+	return 0;
+
+out_dput_d2:
+	dput(d2);
+out_dput_d1:
+	dput(d1);
+out_unlock:
+	unlock_rename(rd->old_parent, rd->new_parent);
+	return err;
+}
+
+/**
+ * start_renaming - lookup and lock names for rename with permission checking
+ * @rd:           rename data containing parent and flags, and
+ *                for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ * @old_last:     name of object in @rd.old_parent
+ * @new_last:     name of object in @rd.new_parent
+ *
+ * Look up two names and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentries are stored in @rd.old_dentry,
+ * @rd.new_dentry.  These references and the lock are dropped by
+ * end_renaming().
+ *
+ * The passed in qstrs need not have the hash calculated, and basic
+ * eXecute permission checking is performed against @rd.mnt_idmap.
+ *
+ * Returns: zero or an error.
+ */
+int start_renaming(struct renamedata *rd, int lookup_flags,
+		   struct qstr *old_last, struct qstr *new_last)
+{
+	int err;
+
+	err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent);
+	if (err)
+		return err;
+	err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+	if (err)
+		return err;
+	return __start_renaming(rd, lookup_flags, old_last, new_last);
+}
+EXPORT_SYMBOL(start_renaming);
+
+void end_renaming(struct renamedata *rd)
+{
+	unlock_rename(rd->old_parent, rd->new_parent);
+	dput(rd->old_dentry);
+	dput(rd->new_dentry);
+}
+EXPORT_SYMBOL(end_renaming);
+
 /**
  * vfs_prepare_mode - prepare the mode to be used for a new inode
  * @idmap:	idmap of the mount the inode was found from
@@ -5504,14 +5627,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		 struct filename *to, unsigned int flags)
 {
 	struct renamedata rd;
-	struct dentry *old_dentry, *new_dentry;
-	struct dentry *trap;
 	struct path old_path, new_path;
 	struct qstr old_last, new_last;
 	int old_type, new_type;
 	struct inode *delegated_inode = NULL;
-	unsigned int lookup_flags = 0, target_flags =
-		LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	unsigned int lookup_flags = 0;
 	bool should_retry = false;
 	int error = -EINVAL;
 
@@ -5522,11 +5642,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	    (flags & RENAME_EXCHANGE))
 		goto put_names;
 
-	if (flags & RENAME_EXCHANGE)
-		target_flags = 0;
-	if (flags & RENAME_NOREPLACE)
-		target_flags |= LOOKUP_EXCL;
-
 retry:
 	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
 				  &old_last, &old_type);
@@ -5556,66 +5671,40 @@ retry:
 		goto exit2;
 
 retry_deleg:
-	trap = lock_rename(new_path.dentry, old_path.dentry);
-	if (IS_ERR(trap)) {
-		error = PTR_ERR(trap);
+	rd.old_parent	   = old_path.dentry;
+	rd.mnt_idmap	   = mnt_idmap(old_path.mnt);
+	rd.new_parent	   = new_path.dentry;
+	rd.delegated_inode = &delegated_inode;
+	rd.flags	   = flags;
+
+	error = __start_renaming(&rd, lookup_flags, &old_last, &new_last);
+	if (error)
 		goto exit_lock_rename;
-	}
 
-	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
-					  lookup_flags);
-	error = PTR_ERR(old_dentry);
-	if (IS_ERR(old_dentry))
-		goto exit3;
-	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
-					  lookup_flags | target_flags);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
 	if (flags & RENAME_EXCHANGE) {
-		if (!d_is_dir(new_dentry)) {
+		if (!d_is_dir(rd.new_dentry)) {
 			error = -ENOTDIR;
 			if (new_last.name[new_last.len])
-				goto exit5;
+				goto exit_unlock;
 		}
 	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!d_is_dir(old_dentry)) {
+	if (!d_is_dir(rd.old_dentry)) {
 		error = -ENOTDIR;
 		if (old_last.name[old_last.len])
-			goto exit5;
+			goto exit_unlock;
 		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
-			goto exit5;
-	}
-	/* source should not be ancestor of target */
-	error = -EINVAL;
-	if (old_dentry == trap)
-		goto exit5;
-	/* target should not be an ancestor of source */
-	if (!(flags & RENAME_EXCHANGE))
-		error = -ENOTEMPTY;
-	if (new_dentry == trap)
-		goto exit5;
+			goto exit_unlock;
+	}
 
-	error = security_path_rename(&old_path, old_dentry,
-				     &new_path, new_dentry, flags);
+	error = security_path_rename(&old_path, rd.old_dentry,
+				     &new_path, rd.new_dentry, flags);
 	if (error)
-		goto exit5;
+		goto exit_unlock;
 
-	rd.old_parent	   = old_path.dentry;
-	rd.old_dentry	   = old_dentry;
-	rd.mnt_idmap	   = mnt_idmap(old_path.mnt);
-	rd.new_parent	   = new_path.dentry;
-	rd.new_dentry	   = new_dentry;
-	rd.delegated_inode = &delegated_inode;
-	rd.flags	   = flags;
 	error = vfs_rename(&rd);
-exit5:
-	dput(new_dentry);
-exit4:
-	dput(old_dentry);
-exit3:
-	unlock_rename(new_path.dentry, old_path.dentry);
+exit_unlock:
+	end_renaming(&rd);
 exit_lock_rename:
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index cd64ffe12e0b..62109885d4db 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1885,11 +1885,12 @@ __be32
 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 			    struct svc_fh *tfhp, char *tname, int tlen)
 {
-	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
+	struct dentry	*fdentry, *tdentry;
 	int		type = S_IFDIR;
+	struct renamedata rd = {};
 	__be32		err;
 	int		host_err;
-	bool		close_cached = false;
+	struct dentry	*close_cached;
 
 	trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen);
 
@@ -1915,15 +1916,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		goto out;
 
 retry:
+	close_cached = NULL;
 	host_err = fh_want_write(ffhp);
 	if (host_err) {
 		err = nfserrno(host_err);
 		goto out;
 	}
 
-	trap = lock_rename(tdentry, fdentry);
-	if (IS_ERR(trap)) {
-		err = nfserr_xdev;
+	rd.mnt_idmap	= &nop_mnt_idmap;
+	rd.old_parent	= fdentry;
+	rd.new_parent	= tdentry;
+
+	host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen),
+				  &QSTR_LEN(tname, tlen));
+
+	if (host_err) {
+		err = nfserrno(host_err);
 		goto out_want_write;
 	}
 	err = fh_fill_pre_attrs(ffhp);
@@ -1933,48 +1941,23 @@ retry:
 	if (err != nfs_ok)
 		goto out_unlock;
 
-	odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry);
-	host_err = PTR_ERR(odentry);
-	if (IS_ERR(odentry))
-		goto out_nfserr;
+	type = d_inode(rd.old_dentry)->i_mode & S_IFMT;
+
+	if (d_inode(rd.new_dentry))
+		type = d_inode(rd.new_dentry)->i_mode & S_IFMT;
 
-	host_err = -ENOENT;
-	if (d_really_is_negative(odentry))
-		goto out_dput_old;
-	host_err = -EINVAL;
-	if (odentry == trap)
-		goto out_dput_old;
-	type = d_inode(odentry)->i_mode & S_IFMT;
-
-	ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry);
-	host_err = PTR_ERR(ndentry);
-	if (IS_ERR(ndentry))
-		goto out_dput_old;
-	if (d_inode(ndentry))
-		type = d_inode(ndentry)->i_mode & S_IFMT;
-	host_err = -ENOTEMPTY;
-	if (ndentry == trap)
-		goto out_dput_new;
-
-	if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
-	    nfsd_has_cached_files(ndentry)) {
-		close_cached = true;
-		goto out_dput_old;
+	if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
+	    nfsd_has_cached_files(rd.new_dentry)) {
+		close_cached = dget(rd.new_dentry);
+		goto out_unlock;
 	} else {
-		struct renamedata rd = {
-			.mnt_idmap	= &nop_mnt_idmap,
-			.old_parent	= fdentry,
-			.old_dentry	= odentry,
-			.new_parent	= tdentry,
-			.new_dentry	= ndentry,
-		};
 		int retries;
 
 		for (retries = 1;;) {
 			host_err = vfs_rename(&rd);
 			if (host_err != -EAGAIN || !retries--)
 				break;
-			if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
+			if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry)))
 				break;
 		}
 		if (!host_err) {
@@ -1983,11 +1966,6 @@ retry:
 				host_err = commit_metadata(ffhp);
 		}
 	}
- out_dput_new:
-	dput(ndentry);
- out_dput_old:
-	dput(odentry);
- out_nfserr:
 	if (host_err == -EBUSY) {
 		/*
 		 * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME
@@ -2006,7 +1984,7 @@ retry:
 		fh_fill_post_attrs(tfhp);
 	}
 out_unlock:
-	unlock_rename(tdentry, fdentry);
+	end_renaming(&rd);
 out_want_write:
 	fh_drop_write(ffhp);
 
@@ -2017,9 +1995,8 @@ out_want_write:
 	 * until this point and then reattempt the whole shebang.
 	 */
 	if (close_cached) {
-		close_cached = false;
-		nfsd_close_cached_files(ndentry);
-		dput(ndentry);
+		nfsd_close_cached_files(close_cached);
+		dput(close_cached);
 		goto retry;
 	}
 out:
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 82b6ff0ab2d3..18a4c7a5ddd2 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1124,9 +1124,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 	int err;
 	struct dentry *old_upperdir;
 	struct dentry *new_upperdir;
-	struct dentry *olddentry = NULL;
-	struct dentry *newdentry = NULL;
-	struct dentry *trap, *de;
+	struct renamedata rd = {};
 	bool old_opaque;
 	bool new_opaque;
 	bool cleanup_whiteout = false;
@@ -1136,6 +1134,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 	bool new_is_dir = d_is_dir(new);
 	bool samedir = olddir == newdir;
 	struct dentry *opaquedir = NULL;
+	struct dentry *whiteout = NULL;
 	const struct cred *old_cred = NULL;
 	struct ovl_fs *ofs = OVL_FS(old->d_sb);
 	LIST_HEAD(list);
@@ -1233,29 +1232,21 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 		}
 	}
 
-	trap = lock_rename(new_upperdir, old_upperdir);
-	if (IS_ERR(trap)) {
-		err = PTR_ERR(trap);
-		goto out_revert_creds;
-	}
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = old_upperdir;
+	rd.new_parent = new_upperdir;
+	rd.flags = flags;
 
-	de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
-			      old->d_name.len);
-	err = PTR_ERR(de);
-	if (IS_ERR(de))
-		goto out_unlock;
-	olddentry = de;
+	err = start_renaming(&rd, 0,
+			     &QSTR_LEN(old->d_name.name, old->d_name.len),
+			     &QSTR_LEN(new->d_name.name, new->d_name.len));
 
-	err = -ESTALE;
-	if (!ovl_matches_upper(old, olddentry))
-		goto out_unlock;
+	if (err)
+		goto out_revert_creds;
 
-	de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
-			      new->d_name.len);
-	err = PTR_ERR(de);
-	if (IS_ERR(de))
+	err = -ESTALE;
+	if (!ovl_matches_upper(old, rd.old_dentry))
 		goto out_unlock;
-	newdentry = de;
 
 	old_opaque = ovl_dentry_is_opaque(old);
 	new_opaque = ovl_dentry_is_opaque(new);
@@ -1263,15 +1254,15 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 	err = -ESTALE;
 	if (d_inode(new) && ovl_dentry_upper(new)) {
 		if (opaquedir) {
-			if (newdentry != opaquedir)
+			if (rd.new_dentry != opaquedir)
 				goto out_unlock;
 		} else {
-			if (!ovl_matches_upper(new, newdentry))
+			if (!ovl_matches_upper(new, rd.new_dentry))
 				goto out_unlock;
 		}
 	} else {
-		if (!d_is_negative(newdentry)) {
-			if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
+		if (!d_is_negative(rd.new_dentry)) {
+			if (!new_opaque || !ovl_upper_is_whiteout(ofs, rd.new_dentry))
 				goto out_unlock;
 		} else {
 			if (flags & RENAME_EXCHANGE)
@@ -1279,19 +1270,14 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 		}
 	}
 
-	if (olddentry == trap)
-		goto out_unlock;
-	if (newdentry == trap)
-		goto out_unlock;
-
-	if (olddentry->d_inode == newdentry->d_inode)
+	if (rd.old_dentry->d_inode == rd.new_dentry->d_inode)
 		goto out_unlock;
 
 	err = 0;
 	if (ovl_type_merge_or_lower(old))
 		err = ovl_set_redirect(old, samedir);
 	else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent))
-		err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
+		err = ovl_set_opaque_xerr(old, rd.old_dentry, -EXDEV);
 	if (err)
 		goto out_unlock;
 
@@ -1299,18 +1285,24 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 		err = ovl_set_redirect(new, samedir);
 	else if (!overwrite && new_is_dir && !new_opaque &&
 		 ovl_type_merge(old->d_parent))
-		err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
+		err = ovl_set_opaque_xerr(new, rd.new_dentry, -EXDEV);
 	if (err)
 		goto out_unlock;
 
-	err = ovl_do_rename(ofs, old_upperdir, olddentry,
-			    new_upperdir, newdentry, flags);
-	unlock_rename(new_upperdir, old_upperdir);
+	err = ovl_do_rename_rd(&rd);
+
+	if (!err && cleanup_whiteout)
+		whiteout = dget(rd.new_dentry);
+
+	end_renaming(&rd);
+
 	if (err)
 		goto out_revert_creds;
 
-	if (cleanup_whiteout)
-		ovl_cleanup(ofs, old_upperdir, newdentry);
+	if (whiteout) {
+		ovl_cleanup(ofs, old_upperdir, whiteout);
+		dput(whiteout);
+	}
 
 	if (overwrite && d_inode(new)) {
 		if (new_is_dir)
@@ -1336,14 +1328,12 @@ out_revert_creds:
 	else
 		ovl_drop_write(old);
 out:
-	dput(newdentry);
-	dput(olddentry);
 	dput(opaquedir);
 	ovl_cache_free(&list);
 	return err;
 
 out_unlock:
-	unlock_rename(new_upperdir, old_upperdir);
+	end_renaming(&rd);
 	goto out_revert_creds;
 }
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 49ad65f829dc..3cc85a893b5c 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -355,11 +355,24 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
 	return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);
 }
 
+static inline int ovl_do_rename_rd(struct renamedata *rd)
+{
+	int err;
+
+	pr_debug("rename(%pd2, %pd2, 0x%x)\n", rd->old_dentry, rd->new_dentry,
+		 rd->flags);
+	err = vfs_rename(rd);
+	if (err) {
+		pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
+			 rd->old_dentry, rd->new_dentry, err);
+	}
+	return err;
+}
+
 static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
 				struct dentry *olddentry, struct dentry *newdir,
 				struct dentry *newdentry, unsigned int flags)
 {
-	int err;
 	struct renamedata rd = {
 		.mnt_idmap	= ovl_upper_mnt_idmap(ofs),
 		.old_parent	= olddir,
@@ -369,13 +382,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
 		.flags		= flags,
 	};
 
-	pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
-	err = vfs_rename(&rd);
-	if (err) {
-		pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
-			 olddentry, newdentry, err);
-	}
-	return err;
+	return ovl_do_rename_rd(&rd);
 }
 
 static inline int ovl_do_whiteout(struct ovl_fs *ofs,
diff --git a/include/linux/namei.h b/include/linux/namei.h
index e5cff89679df..19c3d8e336d5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -156,6 +156,9 @@ extern int follow_up(struct path *);
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
+int start_renaming(struct renamedata *rd, int lookup_flags,
+		   struct qstr *old_last, struct qstr *new_last);
+void end_renaming(struct renamedata *rd);
 
 /**
  * mode_strip_umask - handle vfs umask stripping
-- 
cgit v1.2.3


From ac50950ca143fd637dec4f7457a9162e1a4344e8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:34 +1100
Subject: VFS/ovl/smb: introduce start_renaming_dentry()

Several callers perform a rename on a dentry they already have, and only
require lookup for the target name.  This includes smb/server and a few
different places in overlayfs.

start_renaming_dentry() performs the required lookup and takes the
required lock using lock_rename_child().

It is used in three places in overlayfs and in ksmbd_vfs_rename().

In the ksmbd case, the parent of the source is not important - the
source must be renamed from wherever it is.  So start_renaming_dentry()
allows rd->old_parent to be NULL and only checks it if it is non-NULL.
On success rd->old_parent will be the parent of old_dentry with an extra
reference taken.  Other start_renaming functions now also take the extra
reference and end_renaming() now drops this reference as well.

ovl_lookup_temp(), ovl_parent_lock(), and ovl_parent_unlock() are
all removed as they are no longer needed.

OVL_TEMPNAME_SIZE and ovl_tempname() are now declared in overlayfs.h so
that ovl_check_rename_whiteout() can access them.

ovl_copy_up_workdir() now always cleans up on error.

Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-12-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c               | 108 ++++++++++++++++++++++++++++++++++++++++++++---
 fs/overlayfs/copy_up.c   |  54 ++++++++++--------------
 fs/overlayfs/dir.c       |  19 +--------
 fs/overlayfs/overlayfs.h |   8 +---
 fs/overlayfs/super.c     |  22 +++++-----
 fs/overlayfs/util.c      |  11 -----
 fs/smb/server/vfs.c      |  60 +++++---------------------
 include/linux/namei.h    |   2 +
 8 files changed, 150 insertions(+), 134 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index 0ee0a110b088..5153ceddd37a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3669,7 +3669,7 @@ EXPORT_SYMBOL(unlock_rename);
 
 /**
  * __start_renaming - lookup and lock names for rename
- * @rd:           rename data containing parent and flags, and
+ * @rd:           rename data containing parents and flags, and
  *                for receiving found dentries
  * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
  *                LOOKUP_NO_SYMLINKS etc).
@@ -3680,8 +3680,8 @@ EXPORT_SYMBOL(unlock_rename);
  * rename.
  *
  * On success the found dentries are stored in @rd.old_dentry,
- * @rd.new_dentry.  These references and the lock are dropped by
- * end_renaming().
+ * @rd.new_dentry and an extra ref is taken on @rd.old_parent.
+ * These references and the lock are dropped by end_renaming().
  *
  * The passed in qstrs must have the hash calculated, and no permission
  * checking is performed.
@@ -3735,6 +3735,7 @@ __start_renaming(struct renamedata *rd, int lookup_flags,
 
 	rd->old_dentry = d1;
 	rd->new_dentry = d2;
+	dget(rd->old_parent);
 	return 0;
 
 out_dput_d2:
@@ -3748,7 +3749,7 @@ out_unlock:
 
 /**
  * start_renaming - lookup and lock names for rename with permission checking
- * @rd:           rename data containing parent and flags, and
+ * @rd:           rename data containing parents and flags, and
  *                for receiving found dentries
  * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
  *                LOOKUP_NO_SYMLINKS etc).
@@ -3759,8 +3760,8 @@ out_unlock:
  * rename.
  *
  * On success the found dentries are stored in @rd.old_dentry,
- * @rd.new_dentry.  These references and the lock are dropped by
- * end_renaming().
+ * @rd.new_dentry.  Also the refcount on @rd->old_parent is increased.
+ * These references and the lock are dropped by end_renaming().
  *
  * The passed in qstrs need not have the hash calculated, and basic
  * eXecute permission checking is performed against @rd.mnt_idmap.
@@ -3782,11 +3783,106 @@ int start_renaming(struct renamedata *rd, int lookup_flags,
 }
 EXPORT_SYMBOL(start_renaming);
 
+static int
+__start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+			struct dentry *old_dentry, struct qstr *new_last)
+{
+	struct dentry *trap;
+	struct dentry *d2;
+	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	int err;
+
+	if (rd->flags & RENAME_EXCHANGE)
+		target_flags = 0;
+	if (rd->flags & RENAME_NOREPLACE)
+		target_flags |= LOOKUP_EXCL;
+
+	/* Already have the dentry - need to be sure to lock the correct parent */
+	trap = lock_rename_child(old_dentry, rd->new_parent);
+	if (IS_ERR(trap))
+		return PTR_ERR(trap);
+	if (d_unhashed(old_dentry) ||
+	    (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
+		/* dentry was removed, or moved and explicit parent requested */
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
+				  lookup_flags | target_flags);
+	err = PTR_ERR(d2);
+	if (IS_ERR(d2))
+		goto out_unlock;
+
+	if (old_dentry == trap) {
+		/* source is an ancestor of target */
+		err = -EINVAL;
+		goto out_dput_d2;
+	}
+
+	if (d2 == trap) {
+		/* target is an ancestor of source */
+		if (rd->flags & RENAME_EXCHANGE)
+			err = -EINVAL;
+		else
+			err = -ENOTEMPTY;
+		goto out_dput_d2;
+	}
+
+	rd->old_dentry = dget(old_dentry);
+	rd->new_dentry = d2;
+	rd->old_parent = dget(old_dentry->d_parent);
+	return 0;
+
+out_dput_d2:
+	dput(d2);
+out_unlock:
+	unlock_rename(old_dentry->d_parent, rd->new_parent);
+	return err;
+}
+
+/**
+ * start_renaming_dentry - lookup and lock name for rename with permission checking
+ * @rd:           rename data containing parents and flags, and
+ *                for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ * @old_dentry:   dentry of name to move
+ * @new_last:     name of target in @rd.new_parent
+ *
+ * Look up the target name in @rd.new_parent and ensure the locks needed
+ * for the rename are in place.
+ *
+ * On success a reference to @old_dentry is stored in @rd.old_dentry, the
+ * found dentry is stored in @rd.new_dentry, and @rd.old_parent is confirmed
+ * to be the parent of @old_dentry (or set to it, if it was %NULL), with a
+ * reference taken so that end_renaming() has a stable dentry to unlock.
+ *
+ * References and the lock can be dropped with end_renaming().
+ *
+ * The passed in qstr need not have the hash calculated, and basic
+ * eXecute permission checking is performed against @rd.mnt_idmap.
+ *
+ * Returns: zero or an error.
+ */
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+			  struct dentry *old_dentry, struct qstr *new_last)
+{
+	int err;
+
+	err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+	if (err)
+		return err;
+	return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
+}
+EXPORT_SYMBOL(start_renaming_dentry);
+
 void end_renaming(struct renamedata *rd)
 {
 	unlock_rename(rd->old_parent, rd->new_parent);
 	dput(rd->old_dentry);
 	dput(rd->new_dentry);
+	dput(rd->old_parent);
 }
 EXPORT_SYMBOL(end_renaming);
 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 7a31ca9bdea2..27014ada11c7 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -523,8 +523,8 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
 {
 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
 	struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
-	struct dentry *index = NULL;
 	struct dentry *temp = NULL;
+	struct renamedata rd = {};
 	struct qstr name = { };
 	int err;
 
@@ -556,17 +556,15 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
 	if (err)
 		goto out;
 
-	err = ovl_parent_lock(indexdir, temp);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = indexdir;
+	rd.new_parent = indexdir;
+	err = start_renaming_dentry(&rd, 0, temp, &name);
 	if (err)
 		goto out;
-	index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
-	if (IS_ERR(index)) {
-		err = PTR_ERR(index);
-	} else {
-		err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0);
-		dput(index);
-	}
-	ovl_parent_unlock(indexdir);
+
+	err = ovl_do_rename_rd(&rd);
+	end_renaming(&rd);
 out:
 	if (err)
 		ovl_cleanup(ofs, indexdir, temp);
@@ -763,7 +761,8 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
 	struct inode *inode;
 	struct path path = { .mnt = ovl_upper_mnt(ofs) };
-	struct dentry *temp, *upper, *trap;
+	struct renamedata rd = {};
+	struct dentry *temp;
 	struct ovl_cu_creds cc;
 	int err;
 	struct ovl_cattr cattr = {
@@ -807,29 +806,24 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 	 * ovl_copy_up_data(), so lock workdir and destdir and make sure that
 	 * temp wasn't moved before copy up completion or cleanup.
 	 */
-	trap = lock_rename(c->workdir, c->destdir);
-	if (trap || temp->d_parent != c->workdir) {
-		/* temp or workdir moved underneath us? abort without cleanup */
-		dput(temp);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = c->workdir;
+	rd.new_parent = c->destdir;
+	rd.flags = 0;
+	err = start_renaming_dentry(&rd, 0, temp,
+				    &QSTR_LEN(c->destname.name, c->destname.len));
+	if (err) {
+		/* temp or workdir moved underneath us? map to -EIO */
 		err = -EIO;
-		if (!IS_ERR(trap))
-			unlock_rename(c->workdir, c->destdir);
-		goto out;
 	}
-
-	err = ovl_copy_up_metadata(c, temp);
 	if (err)
-		goto cleanup;
+		goto cleanup_unlocked;
 
-	upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
-				 c->destname.len);
-	err = PTR_ERR(upper);
-	if (IS_ERR(upper))
-		goto cleanup;
+	err = ovl_copy_up_metadata(c, temp);
+	if (!err)
+		err = ovl_do_rename_rd(&rd);
+	end_renaming(&rd);
 
-	err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
-	unlock_rename(c->workdir, c->destdir);
-	dput(upper);
 	if (err)
 		goto cleanup_unlocked;
 
@@ -850,8 +844,6 @@ out:
 
 	return err;
 
-cleanup:
-	unlock_rename(c->workdir, c->destdir);
 cleanup_unlocked:
 	ovl_cleanup(ofs, c->workdir, temp);
 	dput(temp);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 18a4c7a5ddd2..ac5b4475533e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -57,8 +57,7 @@ int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
 	return 0;
 }
 
-#define OVL_TEMPNAME_SIZE 20
-static void ovl_tempname(char name[OVL_TEMPNAME_SIZE])
+void ovl_tempname(char name[OVL_TEMPNAME_SIZE])
 {
 	static atomic_t temp_id = ATOMIC_INIT(0);
 
@@ -66,22 +65,6 @@ static void ovl_tempname(char name[OVL_TEMPNAME_SIZE])
 	snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id));
 }
 
-struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
-{
-	struct dentry *temp;
-	char name[OVL_TEMPNAME_SIZE];
-
-	ovl_tempname(name);
-	temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
-	if (!IS_ERR(temp) && temp->d_inode) {
-		pr_err("workdir/%s already exists\n", name);
-		dput(temp);
-		temp = ERR_PTR(-EIO);
-	}
-
-	return temp;
-}
-
 static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs,
 					      struct dentry *workdir)
 {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3cc85a893b5c..746bc4ad7b37 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -447,11 +447,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
 }
 
 /* util.c */
-int ovl_parent_lock(struct dentry *parent, struct dentry *child);
-static inline void ovl_parent_unlock(struct dentry *parent)
-{
-	inode_unlock(parent->d_inode);
-}
 int ovl_get_write_access(struct dentry *dentry);
 void ovl_put_write_access(struct dentry *dentry);
 void ovl_start_write(struct dentry *dentry);
@@ -888,7 +883,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs,
 			       struct dentry *parent, struct dentry *newdentry,
 			       struct ovl_cattr *attr);
 int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry);
-struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
+#define OVL_TEMPNAME_SIZE 20
+void ovl_tempname(char name[OVL_TEMPNAME_SIZE]);
 struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 			       struct ovl_cattr *attr);
 
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 6e0816c1147a..a721ef2b90e8 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -566,9 +566,10 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
 {
 	struct dentry *workdir = ofs->workdir;
 	struct dentry *temp;
-	struct dentry *dest;
 	struct dentry *whiteout;
 	struct name_snapshot name;
+	struct renamedata rd = {};
+	char name2[OVL_TEMPNAME_SIZE];
 	int err;
 
 	temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
@@ -576,23 +577,21 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
 	if (IS_ERR(temp))
 		return err;
 
-	err = ovl_parent_lock(workdir, temp);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = workdir;
+	rd.new_parent = workdir;
+	rd.flags = RENAME_WHITEOUT;
+	ovl_tempname(name2);
+	err = start_renaming_dentry(&rd, 0, temp, &QSTR(name2));
 	if (err) {
 		dput(temp);
 		return err;
 	}
-	dest = ovl_lookup_temp(ofs, workdir);
-	err = PTR_ERR(dest);
-	if (IS_ERR(dest)) {
-		dput(temp);
-		ovl_parent_unlock(workdir);
-		return err;
-	}
 
 	/* Name is inline and stable - using snapshot as a copy helper */
 	take_dentry_name_snapshot(&name, temp);
-	err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT);
-	ovl_parent_unlock(workdir);
+	err = ovl_do_rename_rd(&rd);
+	end_renaming(&rd);
 	if (err) {
 		if (err == -EINVAL)
 			err = 0;
@@ -616,7 +615,6 @@ cleanup_temp:
 	ovl_cleanup(ofs, workdir, temp);
 	release_dentry_name_snapshot(&name);
 	dput(temp);
-	dput(dest);
 
 	return err;
 }
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f76672f2e686..46387aeb6be6 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1548,14 +1548,3 @@ void ovl_copyattr(struct inode *inode)
 	i_size_write(inode, i_size_read(realinode));
 	spin_unlock(&inode->i_lock);
 }
-
-int ovl_parent_lock(struct dentry *parent, struct dentry *child)
-{
-	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
-	if (!child ||
-	    (!d_unhashed(child) && child->d_parent == parent))
-		return 0;
-
-	inode_unlock(parent->d_inode);
-	return -EINVAL;
-}
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 148c65d59e42..ea5ab5b0adb1 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -661,7 +661,6 @@ out1:
 int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
 		     char *newname, int flags)
 {
-	struct dentry *old_parent, *new_dentry, *trap;
 	struct dentry *old_child = old_path->dentry;
 	struct path new_path;
 	struct qstr new_last;
@@ -671,7 +670,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
 	struct ksmbd_file *parent_fp;
 	int new_type;
 	int err, lookup_flags = LOOKUP_NO_SYMLINKS;
-	int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
 
 	if (ksmbd_override_fsids(work))
 		return -ENOMEM;
@@ -682,14 +680,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
 		goto revert_fsids;
 	}
 
-	/*
-	 * explicitly handle file overwrite case, for compatibility with
-	 * filesystems that may not support rename flags (e.g: fuse)
-	 */
-	if (flags & RENAME_NOREPLACE)
-		target_lookup_flags |= LOOKUP_EXCL;
-	flags &= ~(RENAME_NOREPLACE);
-
 retry:
 	err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
 				     &new_path, &new_last, &new_type,
@@ -706,17 +696,14 @@ retry:
 	if (err)
 		goto out2;
 
-	trap = lock_rename_child(old_child, new_path.dentry);
-	if (IS_ERR(trap)) {
-		err = PTR_ERR(trap);
+	rd.mnt_idmap		= mnt_idmap(old_path->mnt);
+	rd.old_parent		= NULL;	/* any current parent is fine */
+	rd.new_parent		= new_path.dentry;
+	rd.flags		= flags;
+	rd.delegated_inode	= NULL;
+	err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last);
+	if (err)
 		goto out_drop_write;
-	}
-
-	old_parent = dget(old_child->d_parent);
-	if (d_unhashed(old_child)) {
-		err = -EINVAL;
-		goto out3;
-	}
 
 	parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent);
 	if (parent_fp) {
@@ -729,44 +716,17 @@ retry:
 		ksmbd_fd_put(work, parent_fp);
 	}
 
-	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
-					  lookup_flags | target_lookup_flags);
-	if (IS_ERR(new_dentry)) {
-		err = PTR_ERR(new_dentry);
-		goto out3;
-	}
-
-	if (d_is_symlink(new_dentry)) {
+	if (d_is_symlink(rd.new_dentry)) {
 		err = -EACCES;
-		goto out4;
-	}
-
-	if (old_child == trap) {
-		err = -EINVAL;
-		goto out4;
-	}
-
-	if (new_dentry == trap) {
-		err = -ENOTEMPTY;
-		goto out4;
+		goto out3;
 	}
 
-	rd.mnt_idmap		= mnt_idmap(old_path->mnt),
-	rd.old_parent		= old_parent,
-	rd.old_dentry		= old_child,
-	rd.new_parent		= new_path.dentry,
-	rd.new_dentry		= new_dentry,
-	rd.flags		= flags,
-	rd.delegated_inode	= NULL,
 	err = vfs_rename(&rd);
 	if (err)
 		ksmbd_debug(VFS, "vfs_rename failed err %d\n", err);
 
-out4:
-	dput(new_dentry);
 out3:
-	dput(old_parent);
-	unlock_rename(old_parent, new_path.dentry);
+	end_renaming(&rd);
 out_drop_write:
 	mnt_drop_write(old_path->mnt);
 out2:
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 19c3d8e336d5..f73001e3719a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -158,6 +158,8 @@ extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 int start_renaming(struct renamedata *rd, int lookup_flags,
 		   struct qstr *old_last, struct qstr *new_last);
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+			  struct dentry *old_dentry, struct qstr *new_last);
 void end_renaming(struct renamedata *rd);
 
 /**
-- 
cgit v1.2.3


From 833d2b3a072f7ff6005bf84c065c7cbda81a8aaa Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:35 +1100
Subject: Add start_renaming_two_dentries()

A few callers want to lock for a rename and already have both dentries.
Also debugfs does want to perform a lookup but doesn't want permission
checking, so start_renaming_dentry() cannot be used.

This patch introduces start_renaming_two_dentries() which is given both
dentries.  debugfs performs one lookup itself.  As it will only continue
with a negative dentry and as those cannot be renamed or unlinked, it is
safe to do the lookup before getting the rename locks.

overlayfs uses start_renaming_two_dentries() in three places and selinux
uses it twice in sel_make_policy_nodes().

In sel_make_policy_nodes() we now lock for rename twice instead of just
once so the combined operation is no longer atomic w.r.t the parent
directory locks.  As selinux_state.policy_mutex is held across the whole
operation this does not open up any interesting races.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-13-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/debugfs/inode.c           | 48 +++++++++++++++-----------------
 fs/namei.c                   | 65 ++++++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/dir.c           | 43 +++++++++++++++++++----------
 include/linux/namei.h        |  2 ++
 security/selinux/selinuxfs.c | 15 ++++++++--
 5 files changed, 131 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f241b9df642a..532bd7c46baf 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -842,7 +842,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
 	int error = 0;
 	const char *new_name;
 	struct name_snapshot old_name;
-	struct dentry *parent, *target;
+	struct dentry *target;
+	struct renamedata rd = {};
 	struct inode *dir;
 	va_list ap;
 
@@ -855,36 +856,35 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
 	if (!new_name)
 		return -ENOMEM;
 
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	inode_lock(dir);
+	rd.old_parent = dget_parent(dentry);
+	rd.new_parent = rd.old_parent;
+	rd.flags = RENAME_NOREPLACE;
+	target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent);
+	if (IS_ERR(target)) {
+		/* must still drop the parent reference and free new_name */
+		error = PTR_ERR(target);
+		goto out_put_parent;
+	}
 
-	take_dentry_name_snapshot(&old_name, dentry);
-
-	if (WARN_ON_ONCE(dentry->d_parent != parent)) {
-		error = -EINVAL;
-		goto out;
-	}
-	if (strcmp(old_name.name.name, new_name) == 0)
-		goto out;
-	target = lookup_noperm(&QSTR(new_name), parent);
-	if (IS_ERR(target)) {
-		error = PTR_ERR(target);
-		goto out;
-	}
-	if (d_really_is_positive(target)) {
-		dput(target);
-		error = -EINVAL;
+	error = start_renaming_two_dentries(&rd, dentry, target);
+	if (error) {
+		if (error == -EEXIST && target == dentry)
+			/* it isn't an error to rename a thing to itself */
+			error = 0;
 		goto out;
 	}
-	simple_rename_timestamp(dir, dentry, dir, target);
-	d_move(dentry, target);
-	dput(target);
+
+	dir = d_inode(rd.old_parent);
+	take_dentry_name_snapshot(&old_name, dentry);
+	simple_rename_timestamp(dir, dentry, dir, rd.new_dentry);
+	d_move(dentry, rd.new_dentry);
 	fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry);
-out:
 	release_dentry_name_snapshot(&old_name);
-	inode_unlock(dir);
-	dput(parent);
+	end_renaming(&rd);
+out:
+	dput(target);
+out_put_parent:
+	dput(rd.old_parent);
 	kfree_const(new_name);
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 5153ceddd37a..4a4b8b96c192 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3877,6 +3877,71 @@ int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 }
 EXPORT_SYMBOL(start_renaming_dentry);
 
+/**
+ * start_renaming_two_dentries - Lock two dentries in given parents for rename
+ * @rd:           rename data containing parents and flags
+ * @old_dentry:   dentry of name to move
+ * @new_dentry:   dentry to move to
+ *
+ * Ensure locks are in place for rename and check parentage is still correct.
+ *
+ * On success a reference to each dentry is stored in @rd.old_dentry and
+ * @rd.new_dentry, their parents are confirmed to be @rd.old_parent (set here
+ * if it was %NULL) and @rd.new_parent, and @rd.old_parent gains a reference.
+ *
+ * References and the lock can be dropped with end_renaming().
+ *
+ * Returns: zero or an error.
+ */
+int
+start_renaming_two_dentries(struct renamedata *rd,
+			    struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	struct dentry *trap;
+	int err;
+
+	/* Already have the dentry - need to be sure to lock the correct parent */
+	trap = lock_rename_child(old_dentry, rd->new_parent);
+	if (IS_ERR(trap))
+		return PTR_ERR(trap);
+	err = -EINVAL;
+	if (d_unhashed(old_dentry) ||
+	    (rd->old_parent && rd->old_parent != old_dentry->d_parent))
+		/* old_dentry was removed, or moved and explicit parent requested */
+		goto out_unlock;
+	if (d_unhashed(new_dentry) ||
+	    rd->new_parent != new_dentry->d_parent)
+		/* new_dentry was removed or moved */
+		goto out_unlock;
+
+	if (old_dentry == trap)
+		/* source is an ancestor of target */
+		goto out_unlock;
+
+	if (new_dentry == trap) {
+		/* target is an ancestor of source */
+		if (rd->flags & RENAME_EXCHANGE)
+			err = -EINVAL;
+		else
+			err = -ENOTEMPTY;
+		goto out_unlock;
+	}
+
+	err = -EEXIST;
+	if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE))
+		goto out_unlock;
+
+	rd->old_dentry = dget(old_dentry);
+	rd->new_dentry = dget(new_dentry);
+	rd->old_parent = dget(old_dentry->d_parent);
+	return 0;
+
+out_unlock:
+	unlock_rename(old_dentry->d_parent, rd->new_parent);
+	return err;
+}
+EXPORT_SYMBOL(start_renaming_two_dentries);
+
 void end_renaming(struct renamedata *rd)
 {
 	unlock_rename(rd->old_parent, rd->new_parent);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ac5b4475533e..b7f443932d93 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -123,6 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
 			     struct dentry *dentry)
 {
 	struct dentry *whiteout;
+	struct renamedata rd = {};
 	int err;
 	int flags = 0;
 
@@ -134,10 +135,14 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
 	if (d_is_dir(dentry))
 		flags = RENAME_EXCHANGE;
 
-	err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = ofs->workdir;
+	rd.new_parent = dir;
+	rd.flags = flags;
+	err = start_renaming_two_dentries(&rd, whiteout, dentry);
 	if (!err) {
-		err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags);
-		unlock_rename(ofs->workdir, dir);
+		err = ovl_do_rename_rd(&rd);
+		end_renaming(&rd);
 	}
 	if (err)
 		goto kill_whiteout;
@@ -388,6 +393,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
 	struct dentry *workdir = ovl_workdir(dentry);
 	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct renamedata rd = {};
 	struct path upperpath;
 	struct dentry *upper;
 	struct dentry *opaquedir;
@@ -413,7 +419,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	if (IS_ERR(opaquedir))
 		goto out;
 
-	err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = workdir;
+	rd.new_parent = upperdir;
+	rd.flags = RENAME_EXCHANGE;
+	err = start_renaming_two_dentries(&rd, opaquedir, upper);
 	if (err)
 		goto out_cleanup_unlocked;
 
@@ -431,8 +441,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	if (err)
 		goto out_cleanup;
 
-	err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE);
-	unlock_rename(workdir, upperdir);
+	err = ovl_do_rename_rd(&rd);
+	end_renaming(&rd);
 	if (err)
 		goto out_cleanup_unlocked;
 
@@ -445,7 +455,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	return opaquedir;
 
 out_cleanup:
-	unlock_rename(workdir, upperdir);
+	end_renaming(&rd);
 out_cleanup_unlocked:
 	ovl_cleanup(ofs, workdir, opaquedir);
 	dput(opaquedir);
@@ -468,6 +478,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
 	struct dentry *workdir = ovl_workdir(dentry);
 	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct renamedata rd = {};
 	struct dentry *upper;
 	struct dentry *newdentry;
 	int err;
@@ -499,7 +510,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 	if (IS_ERR(newdentry))
 		goto out_dput;
 
-	err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper);
+	rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+	rd.old_parent = workdir;
+	rd.new_parent = upperdir;
+	rd.flags = 0;
+	err = start_renaming_two_dentries(&rd, newdentry, upper);
 	if (err)
 		goto out_cleanup_unlocked;
 
@@ -536,16 +551,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 		if (err)
 			goto out_cleanup;
 
-		err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
-				    RENAME_EXCHANGE);
-		unlock_rename(workdir, upperdir);
+		rd.flags = RENAME_EXCHANGE;
+		err = ovl_do_rename_rd(&rd);
+		end_renaming(&rd);
 		if (err)
 			goto out_cleanup_unlocked;
 
 		ovl_cleanup(ofs, workdir, upper);
 	} else {
-		err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0);
-		unlock_rename(workdir, upperdir);
+		err = ovl_do_rename_rd(&rd);
+		end_renaming(&rd);
 		if (err)
 			goto out_cleanup_unlocked;
 	}
@@ -565,7 +580,7 @@ out:
 	return err;
 
 out_cleanup:
-	unlock_rename(workdir, upperdir);
+	end_renaming(&rd);
 out_cleanup_unlocked:
 	ovl_cleanup(ofs, workdir, newdentry);
 	dput(newdentry);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f73001e3719a..a99ac8b7e24a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -160,6 +160,8 @@ int start_renaming(struct renamedata *rd, int lookup_flags,
 		   struct qstr *old_last, struct qstr *new_last);
 int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
 			  struct dentry *old_dentry, struct qstr *new_last);
+int start_renaming_two_dentries(struct renamedata *rd,
+				struct dentry *old_dentry, struct dentry *new_dentry);
 void end_renaming(struct renamedata *rd);
 
 /**
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 232e087bce3e..404e08bf60ba 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -506,6 +506,7 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
 {
 	int ret = 0;
 	struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir;
+	struct renamedata rd = {};
 	unsigned int bool_num = 0;
 	char **bool_names = NULL;
 	int *bool_values = NULL;
@@ -539,9 +540,14 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
 	if (ret)
 		goto out;
 
-	lock_rename(tmp_parent, fsi->sb->s_root);
+	rd.old_parent = tmp_parent;
+	rd.new_parent = fsi->sb->s_root;
 
 	/* booleans */
+	ret = start_renaming_two_dentries(&rd, tmp_bool_dir, fsi->bool_dir);
+	if (ret)
+		goto out;
+
 	d_exchange(tmp_bool_dir, fsi->bool_dir);
 
 	swap(fsi->bool_num, bool_num);
@@ -549,12 +555,17 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
 	swap(fsi->bool_pending_values, bool_values);
 
 	fsi->bool_dir = tmp_bool_dir;
+	end_renaming(&rd);
 
 	/* classes */
+	ret = start_renaming_two_dentries(&rd, tmp_class_dir, fsi->class_dir);
+	if (ret)
+		goto out;
+
 	d_exchange(tmp_class_dir, fsi->class_dir);
 	fsi->class_dir = tmp_class_dir;
 
-	unlock_rename(tmp_parent, fsi->sb->s_root);
+	end_renaming(&rd);
 
 out:
 	sel_remove_old_bool_data(bool_num, bool_names, bool_values);
-- 
cgit v1.2.3


From f046fbb4d81d1b0c4a169707411e3cd540c03354 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:36 +1100
Subject: ecryptfs: use new start_creating/start_removing APIs

This requires the addition of start_creating_dentry() which is given the
dentry which has already been found, and asks for it to be locked and
its parent validated.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-14-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ecryptfs/inode.c   | 153 ++++++++++++++++++++++++--------------------------
 fs/namei.c            |  33 +++++++++++
 include/linux/namei.h |   2 +
 3 files changed, 107 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ed1394da8d6b..6a5bca89e752 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -24,18 +24,26 @@
 #include <linux/unaligned.h>
 #include "ecryptfs_kernel.h"
 
-static int lock_parent(struct dentry *dentry,
-		       struct dentry **lower_dentry,
-		       struct inode **lower_dir)
+static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry)
 {
-	struct dentry *lower_dir_dentry;
+	struct dentry *parent = dget_parent(dentry);
+	struct dentry *ret;
 
-	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
-	*lower_dir = d_inode(lower_dir_dentry);
-	*lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent),
+				    ecryptfs_dentry_to_lower(dentry));
+	dput(parent);
+	return ret;
+}
 
-	inode_lock_nested(*lower_dir, I_MUTEX_PARENT);
-	return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL;
+static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct dentry *ret;
+
+	ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent),
+				    ecryptfs_dentry_to_lower(dentry));
+	dput(parent);
+	return ret;
 }
 
 static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
@@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 	struct inode *lower_dir;
 	int rc;
 
-	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
-	dget(lower_dentry);	// don't even try to make the lower negative
-	if (!rc) {
-		if (d_unhashed(lower_dentry))
-			rc = -EINVAL;
-		else
-			rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry,
-					NULL);
-	}
+	lower_dentry = ecryptfs_start_removing_dentry(dentry);
+	if (IS_ERR(lower_dentry))
+		return PTR_ERR(lower_dentry);
+
+	lower_dir = lower_dentry->d_parent->d_inode;
+	rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
@@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 	set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
 	inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
 out_unlock:
-	dput(lower_dentry);
-	inode_unlock(lower_dir);
+	end_removing(lower_dentry);
 	if (!rc)
 		d_drop(dentry);
 	return rc;
@@ -186,10 +190,12 @@ ecryptfs_do_create(struct inode *directory_inode,
 	struct inode *lower_dir;
 	struct inode *inode;
 
-	rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
-	if (!rc)
-		rc = vfs_create(&nop_mnt_idmap, lower_dir,
-				lower_dentry, mode, true);
+	lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry);
+	if (IS_ERR(lower_dentry))
+		return ERR_CAST(lower_dentry);
+	lower_dir = lower_dentry->d_parent->d_inode;
+	rc = vfs_create(&nop_mnt_idmap, lower_dir,
+			lower_dentry, mode, true);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
 		       "rc = [%d]\n", __func__, rc);
@@ -205,7 +211,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 	fsstack_copy_attr_times(directory_inode, lower_dir);
 	fsstack_copy_inode_size(directory_inode, lower_dir);
 out_lock:
-	inode_unlock(lower_dir);
+	end_creating(lower_dentry, NULL);
 	return inode;
 }
 
@@ -433,10 +439,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	file_size_save = i_size_read(d_inode(old_dentry));
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
-	rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
-	if (!rc)
-		rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
-			      lower_new_dentry, NULL);
+	lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry);
+	if (IS_ERR(lower_new_dentry))
+		return PTR_ERR(lower_new_dentry);
+	lower_dir = lower_new_dentry->d_parent->d_inode;
+	rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
+		      lower_new_dentry, NULL);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -448,7 +456,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 		  ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
 	i_size_write(d_inode(new_dentry), file_size_save);
 out_lock:
-	inode_unlock(lower_dir);
+	end_creating(lower_new_dentry, NULL);
 	return rc;
 }
 
@@ -468,9 +476,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
 	size_t encoded_symlen;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 
-	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
-	if (rc)
-		goto out_lock;
+	lower_dentry = ecryptfs_start_creating_dentry(dentry);
+	if (IS_ERR(lower_dentry))
+		return PTR_ERR(lower_dentry);
+	lower_dir = lower_dentry->d_parent->d_inode;
+
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 		dir->i_sb)->mount_crypt_stat;
 	rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -490,7 +500,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
 	fsstack_copy_attr_times(dir, lower_dir);
 	fsstack_copy_inode_size(dir, lower_dir);
 out_lock:
-	inode_unlock(lower_dir);
+	end_creating(lower_dentry, NULL);
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return rc;
@@ -501,12 +511,14 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 {
 	int rc;
 	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
 	struct inode *lower_dir;
 
-	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
-	if (rc)
-		goto out;
-
+	lower_dentry = ecryptfs_start_creating_dentry(dentry);
+	if (IS_ERR(lower_dentry))
+		return lower_dentry;
+	lower_dir_dentry = dget(lower_dentry->d_parent);
+	lower_dir = lower_dir_dentry->d_inode;
 	lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
 				 lower_dentry, mode);
 	rc = PTR_ERR(lower_dentry);
@@ -522,7 +534,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	fsstack_copy_inode_size(dir, lower_dir);
 	set_nlink(dir, lower_dir->i_nlink);
 out:
-	inode_unlock(lower_dir);
+	end_creating(lower_dentry, lower_dir_dentry);
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return ERR_PTR(rc);
@@ -534,21 +546,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *lower_dir;
 	int rc;
 
-	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
-	dget(lower_dentry);	// don't even try to make the lower negative
-	if (!rc) {
-		if (d_unhashed(lower_dentry))
-			rc = -EINVAL;
-		else
-			rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
-	}
+	lower_dentry = ecryptfs_start_removing_dentry(dentry);
+	if (IS_ERR(lower_dentry))
+		return PTR_ERR(lower_dentry);
+	lower_dir = lower_dentry->d_parent->d_inode;
+
+	rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
 	if (!rc) {
 		clear_nlink(d_inode(dentry));
 		fsstack_copy_attr_times(dir, lower_dir);
 		set_nlink(dir, lower_dir->i_nlink);
 	}
-	dput(lower_dentry);
-	inode_unlock(lower_dir);
+	end_removing(lower_dentry);
 	if (!rc)
 		d_drop(dentry);
 	return rc;
@@ -562,10 +571,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	struct dentry *lower_dentry;
 	struct inode *lower_dir;
 
-	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
-	if (!rc)
-		rc = vfs_mknod(&nop_mnt_idmap, lower_dir,
-			       lower_dentry, mode, dev);
+	lower_dentry = ecryptfs_start_creating_dentry(dentry);
+	if (IS_ERR(lower_dentry))
+		return PTR_ERR(lower_dentry);
+	lower_dir = lower_dentry->d_parent->d_inode;
+
+	rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -574,7 +585,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	fsstack_copy_attr_times(dir, lower_dir);
 	fsstack_copy_inode_size(dir, lower_dir);
 out:
-	inode_unlock(lower_dir);
+	end_removing(lower_dentry);
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return rc;
@@ -590,7 +601,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	struct dentry *lower_new_dentry;
 	struct dentry *lower_old_dir_dentry;
 	struct dentry *lower_new_dir_dentry;
-	struct dentry *trap;
 	struct inode *target_inode;
 	struct renamedata rd = {};
 
@@ -605,31 +615,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 
 	target_inode = d_inode(new_dentry);
 
-	trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
-	if (IS_ERR(trap))
-		return PTR_ERR(trap);
-	dget(lower_new_dentry);
-	rc = -EINVAL;
-	if (lower_old_dentry->d_parent != lower_old_dir_dentry)
-		goto out_lock;
-	if (lower_new_dentry->d_parent != lower_new_dir_dentry)
-		goto out_lock;
-	if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
-		goto out_lock;
-	/* source should not be ancestor of target */
-	if (trap == lower_old_dentry)
-		goto out_lock;
-	/* target should not be ancestor of source */
-	if (trap == lower_new_dentry) {
-		rc = -ENOTEMPTY;
-		goto out_lock;
-	}
+	rd.mnt_idmap  = &nop_mnt_idmap;
+	rd.old_parent = lower_old_dir_dentry;
+	rd.new_parent = lower_new_dir_dentry;
+	rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry);
+	if (rc)
+		return rc;
 
-	rd.mnt_idmap		= &nop_mnt_idmap;
-	rd.old_parent		= lower_old_dir_dentry;
-	rd.old_dentry		= lower_old_dentry;
-	rd.new_parent		= lower_new_dir_dentry;
-	rd.new_dentry		= lower_new_dentry;
 	rc = vfs_rename(&rd);
 	if (rc)
 		goto out_lock;
@@ -640,8 +632,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	if (new_dir != old_dir)
 		fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
 out_lock:
-	dput(lower_new_dentry);
-	unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+	end_renaming(&rd);
 	return rc;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 4a4b8b96c192..8b7807cd1343 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3397,6 +3397,39 @@ struct dentry *start_removing_noperm(struct dentry *parent,
 }
 EXPORT_SYMBOL(start_removing_noperm);
 
+/**
+ * start_creating_dentry - prepare to create a given dentry
+ * @parent: directory in which dentry should be created
+ * @child:  the dentry to be created
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and negative a reference is taken and
+ * returned.  If not an error is returned.
+ *
+ * end_creating() should be called when creation is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_creating_dentry(struct dentry *parent,
+				     struct dentry *child)
+{
+	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+	if (unlikely(IS_DEADDIR(parent->d_inode) ||
+		     child->d_parent != parent ||
+		     d_unhashed(child))) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EINVAL);
+	}
+	if (d_is_positive(child)) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EEXIST);
+	}
+	return dget(child);
+}
+EXPORT_SYMBOL(start_creating_dentry);
+
 /**
  * start_removing_dentry - prepare to remove a given dentry
  * @parent: directory from which dentry should be removed
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a99ac8b7e24a..208aed1d6728 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -100,6 +100,8 @@ struct dentry *start_removing_killable(struct mnt_idmap *idmap,
 				       struct qstr *name);
 struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
 struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_creating_dentry(struct dentry *parent,
+				     struct dentry *child);
 struct dentry *start_removing_dentry(struct dentry *parent,
 				     struct dentry *child);
 
-- 
cgit v1.2.3


From fe497f0759e0efb949f9480911d00b6045c21f50 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:37 +1100
Subject: VFS: change vfs_mkdir() to unlock on failure.

vfs_mkdir() already drops the reference to the dentry on failure but it
leaves the parent locked.
This complicates end_creating() which needs to unlock the parent even
though the dentry is no longer available.

If we change vfs_mkdir() to unlock on failure as well as releasing the
dentry, we can remove the "parent" arg from end_creating() and simplify
the rules for calling it.

Note that cachefiles_get_directory() can choose to substitute an error
instead of actually calling vfs_mkdir(), for fault injection.  In that
case it needs to call end_creating(), just as vfs_mkdir() now does on
error.

ovl_create_real() will now unlock on error.  So the conditional
end_creating() after the call is removed, and end_creating() is called
internally on error.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-15-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/porting.rst | 13 +++++++++++++
 fs/btrfs/ioctl.c                      |  2 +-
 fs/cachefiles/namei.c                 | 16 +++++++++-------
 fs/ecryptfs/inode.c                   |  8 ++++----
 fs/namei.c                            |  4 ++--
 fs/nfsd/nfs3proc.c                    |  2 +-
 fs/nfsd/nfs4proc.c                    |  2 +-
 fs/nfsd/nfs4recover.c                 |  2 +-
 fs/nfsd/nfsproc.c                     |  2 +-
 fs/nfsd/vfs.c                         |  8 ++++----
 fs/overlayfs/copy_up.c                |  4 ++--
 fs/overlayfs/dir.c                    | 18 ++++++++----------
 fs/overlayfs/super.c                  |  6 +++---
 fs/xfs/scrub/orphanage.c              |  2 +-
 include/linux/namei.h                 | 28 +++++++++-------------------
 ipc/mqueue.c                          |  2 +-
 16 files changed, 61 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 7233b04668fc..76ff738a00f3 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1309,3 +1309,16 @@ a different length, use
 	vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len))
 
 instead.
+
+---
+
+**mandatory**
+
+vfs_mkdir() now returns a dentry - the one returned by ->mkdir().  If
+that dentry is different from the dentry passed in, including if it is
+an IS_ERR() dentry pointer, the original dentry is dput().
+
+When vfs_mkdir() returns an error - and so has both dput() the original
+dentry and provided no replacement - it also unlocks the parent.
+Consequently the return value from vfs_mkdir() can be passed to
+end_creating() and the parent will be unlocked precisely when necessary.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4fbfdd8faf6a..90ef777eae25 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -935,7 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent,
 out_up_read:
 	up_read(&fs_info->subvol_sem);
 out_dput:
-	end_creating(dentry, parent);
+	end_creating(dentry);
 	return ret;
 }
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 0104ac00485d..59327618ac42 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -128,10 +128,12 @@ retry:
 		if (ret < 0)
 			goto mkdir_error;
 		ret = cachefiles_inject_write_error();
-		if (ret == 0)
+		if (ret == 0) {
 			subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
-		else
+		} else {
+			end_creating(subdir);
 			subdir = ERR_PTR(ret);
+		}
 		if (IS_ERR(subdir)) {
 			trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
 						   cachefiles_trace_mkdir_error);
@@ -140,7 +142,7 @@ retry:
 		trace_cachefiles_mkdir(dir, subdir);
 
 		if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
-			end_creating(subdir, dir);
+			end_creating(subdir);
 			goto retry;
 		}
 		ASSERT(d_backing_inode(subdir));
@@ -154,7 +156,7 @@ retry:
 	/* Tell rmdir() it's not allowed to delete the subdir */
 	inode_lock(d_inode(subdir));
 	dget(subdir);
-	end_creating(subdir, dir);
+	end_creating(subdir);
 
 	if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
 		pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
@@ -196,7 +198,7 @@ mark_error:
 	return ERR_PTR(-EBUSY);
 
 mkdir_error:
-	end_creating(subdir, dir);
+	end_creating(subdir);
 	pr_err("mkdir %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
@@ -699,7 +701,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
 		if (ret < 0)
 			goto out_end;
 
-		end_creating(dentry, fan);
+		end_creating(dentry);
 
 		ret = cachefiles_inject_read_error();
 		if (ret == 0)
@@ -733,7 +735,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
 	}
 
 out_end:
-	end_creating(dentry, fan);
+	end_creating(dentry);
 out:
 	_leave(" = %u", success);
 	return success;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6a5bca89e752..2ad1db2cd2ec 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -211,7 +211,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 	fsstack_copy_attr_times(directory_inode, lower_dir);
 	fsstack_copy_inode_size(directory_inode, lower_dir);
 out_lock:
-	end_creating(lower_dentry, NULL);
+	end_creating(lower_dentry);
 	return inode;
 }
 
@@ -456,7 +456,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 		  ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
 	i_size_write(d_inode(new_dentry), file_size_save);
 out_lock:
-	end_creating(lower_new_dentry, NULL);
+	end_creating(lower_new_dentry);
 	return rc;
 }
 
@@ -500,7 +500,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
 	fsstack_copy_attr_times(dir, lower_dir);
 	fsstack_copy_inode_size(dir, lower_dir);
 out_lock:
-	end_creating(lower_dentry, NULL);
+	end_creating(lower_dentry);
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return rc;
@@ -534,7 +534,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	fsstack_copy_inode_size(dir, lower_dir);
 	set_nlink(dir, lower_dir->i_nlink);
 out:
-	end_creating(lower_dentry, lower_dir_dentry);
+	end_creating(lower_dentry);
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return ERR_PTR(rc);
diff --git a/fs/namei.c b/fs/namei.c
index 8b7807cd1343..d284ebae41bf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4832,7 +4832,7 @@ EXPORT_SYMBOL(start_creating_path);
  */
 void end_creating_path(const struct path *path, struct dentry *dentry)
 {
-	end_creating(dentry, path->dentry);
+	end_creating(dentry);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
@@ -5034,7 +5034,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	return dentry;
 
 err:
-	dput(dentry);
+	end_creating(dentry);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL(vfs_mkdir);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index e2aac0def2cb..6b39e4aff959 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -364,7 +364,7 @@ set_attr:
 	status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
 
 out:
-	end_creating(child, parent);
+	end_creating(child);
 out_write:
 	fh_drop_write(fhp);
 	return status;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b2c95e8e7c68..524cb07a477c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -376,7 +376,7 @@ set_attr:
 	if (attrs.na_aclerr)
 		open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
 out:
-	end_creating(child, parent);
+	end_creating(child);
 	nfsd_attrs_free(&attrs);
 out_write:
 	fh_drop_write(fhp);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 3eefaa2202e3..18c08395b273 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -215,7 +215,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	if (IS_ERR(dentry))
 		status = PTR_ERR(dentry);
 out_end:
-	end_creating(dentry, dir);
+	end_creating(dentry);
 out:
 	if (status == 0) {
 		if (nn->in_grace)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index ee1b16e921fd..28f03a6a3cc3 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -421,7 +421,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 	}
 
 out_unlock:
-	end_creating(dchild, dirfhp->fh_dentry);
+	end_creating(dchild);
 out_write:
 	fh_drop_write(dirfhp);
 done:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 62109885d4db..6e9a57863904 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1589,7 +1589,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 out:
 	if (!err)
 		fh_fill_post_attrs(fhp);
-	end_creating(dchild, dentry);
+	end_creating(dchild);
 	return err;
 
 out_nfserr:
@@ -1646,7 +1646,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	return err;
 
 out_unlock:
-	end_creating(dchild, dentry);
+	end_creating(dchild);
 	return err;
 }
 
@@ -1747,7 +1747,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
 	fh_fill_post_attrs(fhp);
 out_unlock:
-	end_creating(dnew, dentry);
+	end_creating(dnew);
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
 	if (!err)
@@ -1824,7 +1824,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL);
 	fh_fill_post_attrs(ffhp);
 out_unlock:
-	end_creating(dnew, ddir);
+	end_creating(dnew);
 	if (!host_err) {
 		host_err = commit_metadata(ffhp);
 		if (!host_err)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 27014ada11c7..36949856ddea 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -624,7 +624,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
 			ovl_dentry_set_upper_alias(c->dentry);
 			ovl_dentry_update_reval(c->dentry, upper);
 		}
-		end_creating(upper, upperdir);
+		end_creating(upper);
 	}
 	if (err)
 		goto out;
@@ -891,7 +891,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 	err = PTR_ERR(upper);
 	if (!IS_ERR(upper)) {
 		err = ovl_do_link(ofs, temp, udir, upper);
-		end_creating(upper, c->destdir);
+		end_creating(upper);
 	}
 
 	if (err)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index b7f443932d93..e097ef4e79d2 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -91,7 +91,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
 		err = ovl_do_whiteout(ofs, wdir, whiteout);
 		if (!err)
 			ofs->whiteout = dget(whiteout);
-		end_creating(whiteout, workdir);
+		end_creating(whiteout);
 		if (err)
 			return ERR_PTR(err);
 	}
@@ -103,7 +103,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
 		err = ovl_do_link(ofs, ofs->whiteout, wdir, link);
 		if (!err)
 			whiteout = dget(link);
-		end_creating(link, workdir);
+		end_creating(link);
 		if (!err)
 			return whiteout;;
 
@@ -187,7 +187,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
 			if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) {
 				pr_warn_ratelimited("wrong inherited casefold (%pd2)\n",
 						    newdentry);
-				dput(newdentry);
+				end_creating(newdentry);
 				err = -EINVAL;
 			}
 			break;
@@ -237,8 +237,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
 	}
 out:
 	if (err) {
-		if (!IS_ERR(newdentry))
-			dput(newdentry);
+		end_creating(newdentry);
 		return ERR_PTR(err);
 	}
 	return newdentry;
@@ -254,7 +253,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 	ret = ovl_create_real(ofs, workdir, ret, attr);
 	if (!IS_ERR(ret))
 		dget(ret);
-	end_creating(ret, workdir);
+	end_creating(ret);
 	return ret;
 }
 
@@ -362,12 +361,11 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 	if (IS_ERR(newdentry))
 		return PTR_ERR(newdentry);
 	newdentry = ovl_create_real(ofs, upperdir, newdentry, attr);
-	if (IS_ERR(newdentry)) {
-		end_creating(newdentry, upperdir);
+	if (IS_ERR(newdentry))
 		return PTR_ERR(newdentry);
-	}
+
 	dget(newdentry);
-	end_creating(newdentry, upperdir);
+	end_creating(newdentry);
 
 	if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
 	    !ovl_allow_offline_changes(ofs)) {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a721ef2b90e8..3acda985c8a3 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -320,7 +320,7 @@ retry:
 
 		if (work->d_inode) {
 			dget(work);
-			end_creating(work, ofs->workbasedir);
+			end_creating(work);
 			if (persist)
 				return work;
 			err = -EEXIST;
@@ -338,7 +338,7 @@ retry:
 		work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
 		if (!IS_ERR(work))
 			dget(work);
-		end_creating(work, ofs->workbasedir);
+		end_creating(work);
 		err = PTR_ERR(work);
 		if (IS_ERR(work))
 			goto out_err;
@@ -632,7 +632,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
 						OVL_CATTR(mode));
 		if (!IS_ERR(child))
 			dget(child);
-		end_creating(child, parent);
+		end_creating(child);
 	}
 	dput(parent);
 
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index e732605924a1..b77c2b6b6d44 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -199,7 +199,7 @@ xrep_orphanage_create(
 	sc->orphanage_ilock_flags = 0;
 
 out_dput_orphanage:
-	end_creating(orphanage_dentry, root_dentry);
+	end_creating(orphanage_dentry);
 out_dput_root:
 	dput(root_dentry);
 out:
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 208aed1d6728..0ef73d739a31 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -105,34 +105,24 @@ struct dentry *start_creating_dentry(struct dentry *parent,
 struct dentry *start_removing_dentry(struct dentry *parent,
 				     struct dentry *child);
 
-/**
- * end_creating - finish action started with start_creating
- * @child:  dentry returned by start_creating() or vfs_mkdir()
- * @parent: dentry given to start_creating(),
- *
- * Unlock and release the child.
+/* end_creating - finish action started with start_creating
+ * @child: dentry returned by start_creating() or vfs_mkdir()
  *
- * Unlike end_dirop() this can only be called if start_creating() succeeded.
- * It handles @child being and error as vfs_mkdir() might have converted the
- * dentry to an error - in that case the parent still needs to be unlocked.
+ * Unlock and release the child. This can be called after
+ * start_creating() whether that function succeeded or not,
+ * but it is not needed on failure.
  *
  * If vfs_mkdir() was called then the value returned from that function
  * should be given for @child rather than the original dentry, as vfs_mkdir()
- * may have provided a new dentry.  Even if vfs_mkdir() returns an error
- * it must be given to end_creating().
+ * may have provided a new dentry.
+ *
  *
  * If vfs_mkdir() was not called, then @child will be a valid dentry and
  * @parent will be ignored.
  */
-static inline void end_creating(struct dentry *child, struct dentry *parent)
+static inline void end_creating(struct dentry *child)
 {
-	if (IS_ERR(child))
-		/* The parent is still locked despite the error from
-		 * vfs_mkdir() - must unlock it.
-		 */
-		inode_unlock(parent->d_inode);
-	else
-		end_dirop(child);
+	end_dirop(child);
 }
 
 /**
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 6d7610310003..83d9466710d6 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -932,7 +932,7 @@ out_putfd:
 		put_unused_fd(fd);
 		fd = error;
 	}
-	end_creating(path.dentry, root);
+	end_creating(path.dentry);
 	if (!ro)
 		mnt_drop_write(mnt);
 out_putname:
-- 
cgit v1.2.3


From cf296b294c3bd8f7db229060efe677dfd49e46b6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Thu, 13 Nov 2025 11:18:38 +1100
Subject: VFS: introduce end_creating_keep()

Occasionally the caller of end_creating() wants to keep using the dentry.
Rather than requiring them to dget() the dentry (when not an error)
before calling end_creating(), provide end_creating_keep() which does
this.

cachefiles and overlayfs make use of this.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-16-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/namei.c |  3 +--
 fs/overlayfs/dir.c    |  8 ++------
 fs/overlayfs/super.c  | 11 +++--------
 include/linux/namei.h | 22 ++++++++++++++++++++++
 4 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 59327618ac42..ef22ac19545b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -155,8 +155,7 @@ retry:
 
 	/* Tell rmdir() it's not allowed to delete the subdir */
 	inode_lock(d_inode(subdir));
-	dget(subdir);
-	end_creating(subdir);
+	end_creating_keep(subdir);
 
 	if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
 		pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index e097ef4e79d2..10e2b7e41a7a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -251,10 +251,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 	if (IS_ERR(ret))
 		return ret;
 	ret = ovl_create_real(ofs, workdir, ret, attr);
-	if (!IS_ERR(ret))
-		dget(ret);
-	end_creating(ret);
-	return ret;
+	return end_creating_keep(ret);
 }
 
 static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -364,8 +361,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 	if (IS_ERR(newdentry))
 		return PTR_ERR(newdentry);
 
-	dget(newdentry);
-	end_creating(newdentry);
+	end_creating_keep(newdentry);
 
 	if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
 	    !ovl_allow_offline_changes(ofs)) {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3acda985c8a3..7b8fc1cab6eb 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -319,8 +319,7 @@ retry:
 		};
 
 		if (work->d_inode) {
-			dget(work);
-			end_creating(work);
+			end_creating_keep(work);
 			if (persist)
 				return work;
 			err = -EEXIST;
@@ -336,9 +335,7 @@ retry:
 		}
 
 		work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
-		if (!IS_ERR(work))
-			dget(work);
-		end_creating(work);
+		end_creating_keep(work);
 		err = PTR_ERR(work);
 		if (IS_ERR(work))
 			goto out_err;
@@ -630,9 +627,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
 		if (!child->d_inode)
 			child = ovl_create_real(ofs, parent, child,
 						OVL_CATTR(mode));
-		if (!IS_ERR(child))
-			dget(child);
-		end_creating(child);
+		end_creating_keep(child);
 	}
 	dput(parent);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 0ef73d739a31..3d82c6a19197 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -125,6 +125,28 @@ static inline void end_creating(struct dentry *child)
 	end_dirop(child);
 }
 
+/* end_creating_keep - finish action started with start_creating() and return result
+ * @child: dentry returned by start_creating() or vfs_mkdir()
+ *
+ * Take an extra reference on the child (unless it is an error), unlock and
+ * return it. This can be called after start_creating() whether that
+ * function succeeded or not, but it is not needed on failure.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.
+ *
+ * Returns: @child, which may be a dentry or an error.
+ *
+ */
+static inline struct dentry *end_creating_keep(struct dentry *child)
+{
+	if (!IS_ERR(child))
+		dget(child);
+	end_dirop(child);
+	return child;
+}
+
 /**
  * end_removing - finish action started with start_removing
  * @child:  dentry returned by start_removing()
-- 
cgit v1.2.3


From e36dbd1cf3dfc4ce18e9f7a80183b53cae257e30 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Fri, 4 Jul 2025 17:43:19 +0200
Subject: media: uapi: Introduce V4L2 generic ISP types

Introduce v4l2-isp.h in the Linux kernel uAPI.

The header includes types for generic ISP configuration parameters
and will be extended in the future with support for generic ISP statistics
formats.

Generic ISP parameters support is provided by introducing two new
types that represent an extensible and versioned buffer of ISP
configuration parameters.

The v4l2_isp_params_buffer represents the container for the ISP
configuration data block. The generic type is defined with a 0-sized
data member that the ISP driver implementations shall properly size
according to their capabilities. The v4l2_isp_params_block_header structure
represents the header to be prepended to each ISP configuration block.

Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Michael Riesch <michael.riesch@collabora.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 MAINTAINERS                         |   6 +++
 include/uapi/linux/media/v4l2-isp.h | 102 ++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 include/uapi/linux/media/v4l2-isp.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 42b20f33f3bb..97c8a4cdbc2c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26885,6 +26885,12 @@ F:	drivers/media/i2c/vd55g1.c
 F:	drivers/media/i2c/vd56g3.c
 F:	drivers/media/i2c/vgxy61.c
 
+V4L2 GENERIC ISP PARAMETERS AND STATISTIC FORMATS
+M:	Jacopo Mondi <jacopo.mondi@ideasonboard.com>
+L:	linux-media@vger.kernel.org
+S:	Maintained
+F:	include/uapi/linux/media/v4l2-isp.h
+
 VF610 NAND DRIVER
 M:	Stefan Agner <stefan@agner.ch>
 L:	linux-mtd@lists.infradead.org
diff --git a/include/uapi/linux/media/v4l2-isp.h b/include/uapi/linux/media/v4l2-isp.h
new file mode 100644
index 000000000000..779168f9058e
--- /dev/null
+++ b/include/uapi/linux/media/v4l2-isp.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Video4Linux2 generic ISP parameters and statistics support
+ *
+ * Copyright (C) 2025 Ideas On Board Oy
+ * Author: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
+ */
+
+#ifndef _UAPI_V4L2_ISP_H_
+#define _UAPI_V4L2_ISP_H_
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/**
+ * enum v4l2_isp_params_version - V4L2 ISP parameters versioning
+ *
+ * @V4L2_ISP_PARAMS_VERSION_V0: First version of the V4L2 ISP parameters format
+ *				(for compatibility)
+ * @V4L2_ISP_PARAMS_VERSION_V1: First version of the V4L2 ISP parameters format
+ *
+ * V0 and V1 are identical in order to support drivers compatible with the V4L2
+ * ISP parameters format already upstreamed which use either 0 or 1 as their
+ * versioning identifier. Both V0 and V1 refer to the first version of the
+ * V4L2 ISP parameters format.
+ *
+ * Future revisions of the V4L2 ISP parameters format should start from the
+ * value of 2.
+ */
+enum v4l2_isp_params_version {
+	V4L2_ISP_PARAMS_VERSION_V0 = 0,
+	V4L2_ISP_PARAMS_VERSION_V1
+};
+
+#define V4L2_ISP_PARAMS_FL_BLOCK_DISABLE	(1U << 0)
+#define V4L2_ISP_PARAMS_FL_BLOCK_ENABLE		(1U << 1)
+
+/*
+ * Reserve the first 8 bits for V4L2_ISP_PARAMS_FL_* flags.
+ *
+ * Driver-specific flags should be defined as:
+ * #define DRIVER_SPECIFIC_FLAG0     (1U << V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(0))
+ * #define DRIVER_SPECIFIC_FLAG1     (1U << V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(1))
+ */
+#define V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(n)       ((n) + 8)
+
+/**
+ * struct v4l2_isp_params_block_header - V4L2 extensible parameters block header
+ * @type: The parameters block type (driver-specific)
+ * @flags: A bitmask of block flags (driver-specific)
+ * @size: Size (in bytes) of the parameters block, including this header
+ *
+ * This structure represents the common part of all the ISP configuration
+ * blocks. Each parameters block shall embed an instance of this structure type
+ * as its first member, followed by the block-specific configuration data.
+ *
+ * The @type field is an ISP driver-specific value that identifies the block
+ * type. The @size field specifies the size of the parameters block.
+ *
+ * The @flags field is a bitmask of per-block flags V4L2_ISP_PARAMS_FL_* and
+ * driver-specific flags specified by the driver header.
+ */
+struct v4l2_isp_params_block_header {
+	__u16 type;
+	__u16 flags;
+	__u32 size;
+} __attribute__((aligned(8)));
+
+/**
+ * struct v4l2_isp_params_buffer - V4L2 extensible parameters configuration
+ * @version: The parameters buffer version (driver-specific)
+ * @data_size: The configuration data effective size, excluding this header
+ * @data: The configuration data
+ *
+ * This structure contains the configuration parameters of the ISP algorithms,
+ * serialized by userspace into a data buffer. Each configuration parameter
+ * block is represented by a block-specific structure which contains a
+ * :c:type:`v4l2_isp_params_block_header` entry as first member. Userspace
+ * populates the @data buffer with configuration parameters for the blocks that
+ * it intends to configure. As a consequence, the data buffer effective size
+ * changes according to the number of ISP blocks that userspace intends to
+ * configure and is set by userspace in the @data_size field.
+ *
+ * The parameters buffer is versioned by the @version field to allow modifying
+ * and extending its definition. Userspace shall populate the @version field to
+ * inform the driver about the version it intends to use. The driver will parse
+ * and handle the @data buffer according to the data layout specific to the
+ * indicated version and return an error if the desired version is not
+ * supported.
+ *
+ * For each ISP block that userspace wants to configure, a block-specific
+ * structure is appended to the @data buffer, one after the other without gaps
+ * in between. Userspace shall populate the @data_size field with the effective
+ * size, in bytes, of the @data buffer.
+ */
+struct v4l2_isp_params_buffer {
+	__u32 version;
+	__u32 data_size;
+	__u8 data[] __counted_by(data_size);
+};
+
+#endif /* _UAPI_V4L2_ISP_H_ */
-- 
cgit v1.2.3


From 1e8152db64bdee9f13e84e516c2b8a9bb10f025e Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Mon, 7 Jul 2025 14:13:53 +0200
Subject: media: uapi: Convert RkISP1 to V4L2 extensible params

With the introduction of common types for extensible parameters
format, convert the rkisp1-config.h header to use the new types.

Factor out the documentation that is now part of the common header
and only keep the driver-specific one in place.

The conversion to use common types doesn't impact userspace as the
new types are either identical to the ones already existing in the
RkISP1 uAPI or are 1-to-1 type convertible.

Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Michael Riesch <michael.riesch@collabora.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/uapi/linux/rkisp1-config.h | 107 +++++++++----------------------------
 1 file changed, 24 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h
index 3b060ea6eed7..b2d2a71f7baf 100644
--- a/include/uapi/linux/rkisp1-config.h
+++ b/include/uapi/linux/rkisp1-config.h
@@ -7,8 +7,13 @@
 #ifndef _UAPI_RKISP1_CONFIG_H
 #define _UAPI_RKISP1_CONFIG_H
 
+#ifdef __KERNEL__
+#include <linux/build_bug.h>
+#endif /* __KERNEL__ */
 #include <linux/types.h>
 
+#include <linux/media/v4l2-isp.h>
+
 /* Defect Pixel Cluster Detection */
 #define RKISP1_CIF_ISP_MODULE_DPCC		(1U << 0)
 /* Black Level Subtraction */
@@ -1158,79 +1163,26 @@ enum rkisp1_ext_params_block_type {
 	RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR,
 };
 
-#define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE	(1U << 0)
-#define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE	(1U << 1)
+/* For backward compatibility */
+#define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE	V4L2_ISP_PARAMS_FL_BLOCK_DISABLE
+#define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE	V4L2_ISP_PARAMS_FL_BLOCK_ENABLE
 
 /* A bitmask of parameters blocks supported on the current hardware. */
 #define RKISP1_CID_SUPPORTED_PARAMS_BLOCKS	(V4L2_CID_USER_RKISP1_BASE + 0x01)
 
 /**
- * struct rkisp1_ext_params_block_header - RkISP1 extensible parameters block
- *					   header
+ * rkisp1_ext_params_block_header - RkISP1 extensible parameters block header
  *
  * This structure represents the common part of all the ISP configuration
- * blocks. Each parameters block shall embed an instance of this structure type
- * as its first member, followed by the block-specific configuration data. The
- * driver inspects this common header to discern the block type and its size and
- * properly handle the block content by casting it to the correct block-specific
- * type.
+ * blocks and is identical to :c:type:`v4l2_isp_params_block_header`.
  *
- * The @type field is one of the values enumerated by
+ * The type field is one of the values enumerated by
  * :c:type:`rkisp1_ext_params_block_type` and specifies how the data should be
- * interpreted by the driver. The @size field specifies the size of the
- * parameters block and is used by the driver for validation purposes.
- *
- * The @flags field is a bitmask of per-block flags RKISP1_EXT_PARAMS_FL_*.
- *
- * When userspace wants to configure and enable an ISP block it shall fully
- * populate the block configuration and set the
- * RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE bit in the @flags field.
- *
- * When userspace simply wants to disable an ISP block the
- * RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bit should be set in @flags field. The
- * driver ignores the rest of the block configuration structure in this case.
- *
- * If a new configuration of an ISP block has to be applied userspace shall
- * fully populate the ISP block configuration and omit setting the
- * RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE and RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bits
- * in the @flags field.
- *
- * Setting both the RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE and
- * RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bits in the @flags field is not allowed
- * and not accepted by the driver.
- *
- * Userspace is responsible for correctly populating the parameters block header
- * fields (@type, @flags and @size) and the block-specific parameters.
- *
- * For example:
+ * interpreted by the driver.
  *
- * .. code-block:: c
- *
- *	void populate_bls(struct rkisp1_ext_params_block_header *block) {
- *		struct rkisp1_ext_params_bls_config *bls =
- *			(struct rkisp1_ext_params_bls_config *)block;
- *
- *		bls->header.type = RKISP1_EXT_PARAMS_BLOCK_ID_BLS;
- *		bls->header.flags = RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE;
- *		bls->header.size = sizeof(*bls);
- *
- *		bls->config.enable_auto = 0;
- *		bls->config.fixed_val.r = blackLevelRed_;
- *		bls->config.fixed_val.gr = blackLevelGreenR_;
- *		bls->config.fixed_val.gb = blackLevelGreenB_;
- *		bls->config.fixed_val.b = blackLevelBlue_;
- *	}
- *
- * @type: The parameters block type, see
- *	  :c:type:`rkisp1_ext_params_block_type`
- * @flags: A bitmask of block flags
- * @size: Size (in bytes) of the parameters block, including this header
+ * The flags field is a bitmask of per-block flags RKISP1_EXT_PARAMS_FL_*.
  */
-struct rkisp1_ext_params_block_header {
-	__u16 type;
-	__u16 flags;
-	__u32 size;
-};
+#define rkisp1_ext_params_block_header v4l2_isp_params_block_header
 
 /**
  * struct rkisp1_ext_params_bls_config - RkISP1 extensible params BLS config
@@ -1588,27 +1540,14 @@ struct rkisp1_ext_params_wdr_config {
  * @RKISP1_EXT_PARAM_BUFFER_V1: First version of RkISP1 extensible parameters
  */
 enum rksip1_ext_param_buffer_version {
-	RKISP1_EXT_PARAM_BUFFER_V1 = 1,
+	RKISP1_EXT_PARAM_BUFFER_V1 = V4L2_ISP_PARAMS_VERSION_V1,
 };
 
 /**
  * struct rkisp1_ext_params_cfg - RkISP1 extensible parameters configuration
  *
- * This struct contains the configuration parameters of the RkISP1 ISP
- * algorithms, serialized by userspace into a data buffer. Each configuration
- * parameter block is represented by a block-specific structure which contains a
- * :c:type:`rkisp1_ext_params_block_header` entry as first member. Userspace
- * populates the @data buffer with configuration parameters for the blocks that
- * it intends to configure. As a consequence, the data buffer effective size
- * changes according to the number of ISP blocks that userspace intends to
- * configure and is set by userspace in the @data_size field.
- *
- * The parameters buffer is versioned by the @version field to allow modifying
- * and extending its definition. Userspace shall populate the @version field to
- * inform the driver about the version it intends to use. The driver will parse
- * and handle the @data buffer according to the data layout specific to the
- * indicated version and return an error if the desired version is not
- * supported.
+ * This is the driver-specific implementation of
+ * :c:type:`v4l2_isp_params_buffer`.
  *
  * Currently the single RKISP1_EXT_PARAM_BUFFER_V1 version is supported.
  * When a new format version will be added, a mechanism for userspace to query
@@ -1624,11 +1563,6 @@ enum rksip1_ext_param_buffer_version {
  * the maximum value represents the blocks supported by the kernel driver,
  * independently of the device instance.
  *
- * For each ISP block that userspace wants to configure, a block-specific
- * structure is appended to the @data buffer, one after the other without gaps
- * in between nor overlaps. Userspace shall populate the @data_size field with
- * the effective size, in bytes, of the @data buffer.
- *
  * The expected memory layout of the parameters buffer is::
  *
  *	+-------------------- struct rkisp1_ext_params_cfg -------------------+
@@ -1678,4 +1612,11 @@ struct rkisp1_ext_params_cfg {
 	__u8 data[RKISP1_EXT_PARAMS_MAX_SIZE];
 };
 
+#ifdef __KERNEL__
+/* Make sure the header is type-convertible to the generic v4l2 params one */
+static_assert((sizeof(struct rkisp1_ext_params_cfg) -
+	      RKISP1_EXT_PARAMS_MAX_SIZE) ==
+	      sizeof(struct v4l2_isp_params_buffer));
+#endif /* __KERNEL__ */
+
 #endif /* _UAPI_RKISP1_CONFIG_H */
-- 
cgit v1.2.3


From 45662082855c6acd1719c11e077388cbccf3baf2 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Mon, 7 Jul 2025 14:18:52 +0200
Subject: media: uapi: Convert Amlogic C3 to V4L2 extensible params

With the introduction of common types for extensible parameters
format, convert the c3-isp-config.h header to use the new types.

Factor-out the documentation that is now part of the common header
and only keep the driver-specific one in place.

The conversion to use common types doesn't impact userspace as the
new types are either identical to the ones already existing in the
C3 ISP uAPI or are 1-to-1 type convertible.

Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Keke Li <keke.li@amlogic.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/uapi/linux/media/amlogic/c3-isp-config.h | 92 +++++++-----------------
 1 file changed, 24 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/media/amlogic/c3-isp-config.h b/include/uapi/linux/media/amlogic/c3-isp-config.h
index ed085ea62a57..0a3c1cc55ccb 100644
--- a/include/uapi/linux/media/amlogic/c3-isp-config.h
+++ b/include/uapi/linux/media/amlogic/c3-isp-config.h
@@ -6,8 +6,13 @@
 #ifndef _UAPI_C3_ISP_CONFIG_H_
 #define _UAPI_C3_ISP_CONFIG_H_
 
+#ifdef __KERNEL__
+#include <linux/build_bug.h>
+#endif /* __KERNEL__ */
 #include <linux/types.h>
 
+#include <linux/media/v4l2-isp.h>
+
 /*
  * Frames are split into zones of almost equal width and height - a zone is a
  * rectangular tile of a frame. The metering blocks within the ISP collect
@@ -141,7 +146,7 @@ struct c3_isp_stats_info {
  * @C3_ISP_PARAMS_BUFFER_V0: First version of C3 ISP parameters block
  */
 enum c3_isp_params_buffer_version {
-	C3_ISP_PARAMS_BUFFER_V0,
+	C3_ISP_PARAMS_BUFFER_V0 = V4L2_ISP_PARAMS_VERSION_V0,
 };
 
 /**
@@ -176,62 +181,23 @@ enum c3_isp_params_block_type {
 	C3_ISP_PARAMS_BLOCK_SENTINEL
 };
 
-#define C3_ISP_PARAMS_BLOCK_FL_DISABLE (1U << 0)
-#define C3_ISP_PARAMS_BLOCK_FL_ENABLE (1U << 1)
+/* For backward compatibility */
+#define C3_ISP_PARAMS_BLOCK_FL_DISABLE	V4L2_ISP_PARAMS_FL_BLOCK_DISABLE
+#define C3_ISP_PARAMS_BLOCK_FL_ENABLE	V4L2_ISP_PARAMS_FL_BLOCK_ENABLE
 
 /**
  * struct c3_isp_params_block_header - C3 ISP parameter block header
  *
  * This structure represents the common part of all the ISP configuration
- * blocks. Each parameters block shall embed an instance of this structure type
- * as its first member, followed by the block-specific configuration data. The
- * driver inspects this common header to discern the block type and its size and
- * properly handle the block content by casting it to the correct block-specific
- * type.
+ * blocks and is identical to :c:type:`v4l2_isp_params_block_header`.
  *
- * The @type field is one of the values enumerated by
+ * The type field is one of the values enumerated by
  * :c:type:`c3_isp_params_block_type` and specifies how the data should be
- * interpreted by the driver. The @size field specifies the size of the
- * parameters block and is used by the driver for validation purposes. The
- * @flags field is a bitmask of per-block flags C3_ISP_PARAMS_FL*.
- *
- * When userspace wants to disable an ISP block the
- * C3_ISP_PARAMS_BLOCK_FL_DISABLED bit should be set in the @flags field. In
- * this case userspace may optionally omit the remainder of the configuration
- * block, which will be ignored by the driver.
- *
- * When a new configuration of an ISP block needs to be applied userspace
- * shall fully populate the ISP block and omit setting the
- * C3_ISP_PARAMS_BLOCK_FL_DISABLED bit in the @flags field.
- *
- * Userspace is responsible for correctly populating the parameters block header
- * fields (@type, @flags and @size) and the block-specific parameters.
- *
- * For example:
- *
- * .. code-block:: c
+ * interpreted by the driver.
  *
- *	void populate_pst_gamma(struct c3_isp_params_block_header *block) {
- *		struct c3_isp_params_pst_gamma *gamma =
- *			(struct c3_isp_params_pst_gamma *)block;
- *
- *		gamma->header.type = C3_ISP_PARAMS_BLOCK_PST_GAMMA;
- *		gamma->header.flags = C3_ISP_PARAMS_BLOCK_FL_ENABLE;
- *		gamma->header.size = sizeof(*gamma);
- *
- *		for (unsigned int i = 0; i < 129; i++)
- *			gamma->pst_gamma_lut[i] = i;
- *	}
- *
- * @type: The parameters block type from :c:type:`c3_isp_params_block_type`
- * @flags: A bitmask of block flags
- * @size: Size (in bytes) of the parameters block, including this header
+ * The flags field is a bitmask of per-block flags C3_ISP_PARAMS_BLOCK_FL_*.
  */
-struct c3_isp_params_block_header {
-	__u16 type;
-	__u16 flags;
-	__u32 size;
-};
+#define c3_isp_params_block_header v4l2_isp_params_block_header
 
 /**
  * struct c3_isp_params_awb_gains - Gains for auto-white balance
@@ -498,26 +464,10 @@ struct c3_isp_params_blc {
 /**
  * struct c3_isp_params_cfg - C3 ISP configuration parameters
  *
- * This struct contains the configuration parameters of the C3 ISP
- * algorithms, serialized by userspace into an opaque data buffer. Each
- * configuration parameter block is represented by a block-specific structure
- * which contains a :c:type:`c3_isp_param_block_header` entry as first
- * member. Userspace populates the @data buffer with configuration parameters
- * for the blocks that it intends to configure. As a consequence, the data
- * buffer effective size changes according to the number of ISP blocks that
- * userspace intends to configure.
- *
- * The parameters buffer is versioned by the @version field to allow modifying
- * and extending its definition. Userspace should populate the @version field to
- * inform the driver about the version it intends to use. The driver will parse
- * and handle the @data buffer according to the data layout specific to the
- * indicated revision and return an error if the desired revision is not
- * supported.
- *
- * For each ISP block that userspace wants to configure, a block-specific
- * structure is appended to the @data buffer, one after the other without gaps
- * in between nor overlaps. Userspace shall populate the @total_size field with
- * the effective size, in bytes, of the @data buffer.
+ * This is the driver-specific implementation of
+ * :c:type:`v4l2_isp_params_buffer`.
+ *
+ * Currently only C3_ISP_PARAMS_BUFFER_V0 is supported.
  *
  * The expected memory layout of the parameters buffer is::
  *
@@ -561,4 +511,10 @@ struct c3_isp_params_cfg {
 	__u8 data[C3_ISP_PARAMS_MAX_SIZE];
 };
 
+#ifdef __KERNEL__
+/* Make sure the header is type-convertible to the generic v4l2 params one */
+static_assert((sizeof(struct c3_isp_params_cfg) - C3_ISP_PARAMS_MAX_SIZE) ==
+	      sizeof(struct v4l2_isp_params_buffer));
+#endif /* __KERNEL__ */
+
 #endif
-- 
cgit v1.2.3


From 3cb6de6fafb8fca55b14313e63f13ce10ecc6fc4 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Tue, 1 Jul 2025 18:57:17 +0200
Subject: media: v4l2-core: Introduce v4l2-isp.c

Add to the V4L2 framework helper functions to support drivers when
validating a buffer of V4L2 ISP parameters.

Drivers shall use v4l2_isp_params_validate_buffer_size() to verify the
size correctness of the data received from userspace, and after having
copied the data to a kernel-only memory location, complete the
validation by calling v4l2_isp_params_validate_buffer().

Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Michael Riesch <michael.riesch@collabora.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 MAINTAINERS                        |   2 +
 drivers/media/v4l2-core/Kconfig    |   4 ++
 drivers/media/v4l2-core/Makefile   |   1 +
 drivers/media/v4l2-core/v4l2-isp.c | 132 +++++++++++++++++++++++++++++++++++++
 include/media/v4l2-isp.h           |  91 +++++++++++++++++++++++++
 5 files changed, 230 insertions(+)
 create mode 100644 drivers/media/v4l2-core/v4l2-isp.c
 create mode 100644 include/media/v4l2-isp.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index a22249166e4a..e4c8b2d533bd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26890,6 +26890,8 @@ M:	Jacopo Mondi <jacopo.mondi@ideasonboard.com>
 L:	linux-media@vger.kernel.org
 S:	Maintained
 F:	Documentation/userspace-api/media/v4l/v4l2-isp.rst
+F:	drivers/media/v4l2-core/v4l2-isp.c
+F:	include/media/v4l2-isp.h
 F:	include/uapi/linux/media/v4l2-isp.h
 
 VF610 NAND DRIVER
diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig
index 331b8e535e5b..d50ccac9733c 100644
--- a/drivers/media/v4l2-core/Kconfig
+++ b/drivers/media/v4l2-core/Kconfig
@@ -82,3 +82,7 @@ config V4L2_CCI_I2C
 	depends on I2C
 	select REGMAP_I2C
 	select V4L2_CCI
+
+config V4L2_ISP
+	tristate
+	depends on VIDEOBUF2_CORE
diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile
index 2177b9d63a8f..329f0eadce99 100644
--- a/drivers/media/v4l2-core/Makefile
+++ b/drivers/media/v4l2-core/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_V4L2_CCI) += v4l2-cci.o
 obj-$(CONFIG_V4L2_FLASH_LED_CLASS) += v4l2-flash-led-class.o
 obj-$(CONFIG_V4L2_FWNODE) += v4l2-fwnode.o
 obj-$(CONFIG_V4L2_H264) += v4l2-h264.o
+obj-$(CONFIG_V4L2_ISP) += v4l2-isp.o
 obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o
 obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o
 obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o
diff --git a/drivers/media/v4l2-core/v4l2-isp.c b/drivers/media/v4l2-core/v4l2-isp.c
new file mode 100644
index 000000000000..756d2b4996cc
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2-isp.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Video4Linux2 generic ISP parameters and statistics support
+ *
+ * Copyright (C) 2025 Ideas On Board Oy
+ * Author: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
+ */
+
+#include <media/v4l2-isp.h>
+
+#include <linux/bitops.h>
+#include <linux/device.h>
+
+#include <media/videobuf2-core.h>
+
+int v4l2_isp_params_validate_buffer_size(struct device *dev,
+					 struct vb2_buffer *vb,
+					 size_t max_size)
+{
+	size_t header_size = offsetof(struct v4l2_isp_params_buffer, data);
+	size_t payload_size = vb2_get_plane_payload(vb, 0);
+
+	/* Payload size can't be greater than the destination buffer size */
+	if (payload_size > max_size) {
+		dev_dbg(dev, "Payload size is too large: %zu\n", payload_size);
+		return -EINVAL;
+	}
+
+	/* Payload size can't be smaller than the header size */
+	if (payload_size < header_size) {
+		dev_dbg(dev, "Payload size is too small: %zu\n", payload_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(v4l2_isp_params_validate_buffer_size);
+
+int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
+				    const struct v4l2_isp_params_buffer *buffer,
+				    const struct v4l2_isp_params_block_info *info,
+				    size_t num_blocks)
+{
+	size_t header_size = offsetof(struct v4l2_isp_params_buffer, data);
+	size_t payload_size = vb2_get_plane_payload(vb, 0);
+	size_t block_offset = 0;
+	size_t buffer_size;
+
+	/*
+	 * Currently only the first version of the V4L2 ISP parameters format is
+	 * supported. We accept both V0 and V1 to support existing drivers
+	 * compatible with V4L2 ISP that use either 0 or 1 as their "first
+	 * version" identifiers.
+	 */
+	if (buffer->version != V4L2_ISP_PARAMS_VERSION_V0 &&
+	    buffer->version != V4L2_ISP_PARAMS_VERSION_V1) {
+		dev_dbg(dev,
+			"Unsupported V4L2 ISP parameters format version: %u\n",
+			buffer->version);
+		return -EINVAL;
+	}
+
+	/* Validate the size reported in the header */
+	buffer_size = header_size + buffer->data_size;
+	if (buffer_size != payload_size) {
+		dev_dbg(dev, "Data size %zu and payload size %zu are different\n",
+			buffer_size, payload_size);
+		return -EINVAL;
+	}
+
+	/* Walk the list of ISP configuration blocks and validate them. */
+	buffer_size = buffer->data_size;
+	while (buffer_size >= sizeof(struct v4l2_isp_params_block_header)) {
+		const struct v4l2_isp_params_block_info *block_info;
+		const struct v4l2_isp_params_block_header *block;
+
+		block = (const struct v4l2_isp_params_block_header *)
+			(buffer->data + block_offset);
+
+		if (block->type >= num_blocks) {
+			dev_dbg(dev,
+				"Invalid block type %u at offset %zu\n",
+				block->type, block_offset);
+			return -EINVAL;
+		}
+
+		if (block->size > buffer_size) {
+			dev_dbg(dev, "Premature end of parameters data\n");
+			return -EINVAL;
+		}
+
+		/* It's invalid to specify both ENABLE and DISABLE. */
+		if ((block->flags & (V4L2_ISP_PARAMS_FL_BLOCK_ENABLE |
+				     V4L2_ISP_PARAMS_FL_BLOCK_DISABLE)) ==
+		     (V4L2_ISP_PARAMS_FL_BLOCK_ENABLE |
+		     V4L2_ISP_PARAMS_FL_BLOCK_DISABLE)) {
+			dev_dbg(dev, "Invalid block flags %x at offset %zu\n",
+				block->flags, block_offset);
+			return -EINVAL;
+		}
+
+		/*
+		 * Match the block reported size against the info provided
+		 * one, but allow the block to only contain the header in
+		 * case it is going to be disabled.
+		 */
+		block_info = &info[block->type];
+		if (block->size != block_info->size &&
+		    (!(block->flags & V4L2_ISP_PARAMS_FL_BLOCK_DISABLE) ||
+		    block->size != sizeof(*block))) {
+			dev_dbg(dev,
+				"Invalid block size %u (expected %zu) at offset %zu\n",
+				block->size, block_info->size, block_offset);
+			return -EINVAL;
+		}
+
+		block_offset += block->size;
+		buffer_size -= block->size;
+	}
+
+	if (buffer_size) {
+		dev_dbg(dev, "Unexpected data after the parameters buffer end\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(v4l2_isp_params_validate_buffer);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jacopo Mondi <jacopo.mondi@ideasonboard.com>");
+MODULE_DESCRIPTION("V4L2 generic ISP parameters and statistics helpers");
diff --git a/include/media/v4l2-isp.h b/include/media/v4l2-isp.h
new file mode 100644
index 000000000000..8b4695663699
--- /dev/null
+++ b/include/media/v4l2-isp.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Video4Linux2 generic ISP parameters and statistics support
+ *
+ * Copyright (C) 2025 Ideas On Board Oy
+ * Author: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
+ */
+
+#ifndef _V4L2_ISP_H_
+#define _V4L2_ISP_H_
+
+#include <linux/media/v4l2-isp.h>
+
+struct device;
+struct vb2_buffer;
+
+/**
+ * v4l2_isp_params_buffer_size - Calculate size of v4l2_isp_params_buffer
+ * @max_params_size: The total size of the ISP configuration blocks
+ *
+ * Users of the v4l2 extensible parameters will have differing sized data arrays
+ * depending on their specific parameter buffers. Drivers and userspace will
+ * need to be able to calculate the appropriate size of the struct to
+ * accommodate all ISP configuration blocks provided by the platform.
+ * This macro provides a convenient tool for the calculation.
+ */
+#define v4l2_isp_params_buffer_size(max_params_size) \
+	(offsetof(struct v4l2_isp_params_buffer, data) + (max_params_size))
+
+/**
+ * v4l2_isp_params_validate_buffer_size - Validate a V4L2 ISP buffer size
+ * @dev: the driver's device pointer
+ * @vb: the videobuf2 buffer
+ * @max_size: the maximum allowed buffer size
+ *
+ * This function performs validation of the size of a V4L2 ISP parameters buffer
+ * before the driver can access the actual data buffer content.
+ *
+ * After the size validation, drivers should copy the buffer content to a
+ * kernel-only memory area to prevent userspace from modifying it,
+ * before completing validation using v4l2_isp_params_validate_buffer().
+ *
+ * The @vb buffer as received from the vb2 .buf_prepare() operation is checked
+ * against @max_size and it's validated to be large enough to accommodate at
+ * least one ISP configuration block.
+ */
+int v4l2_isp_params_validate_buffer_size(struct device *dev,
+					 struct vb2_buffer *vb,
+					 size_t max_size);
+
+/**
+ * struct v4l2_isp_params_block_info - V4L2 ISP per-block info
+ * @size: the block expected size
+ *
+ * The v4l2_isp_params_block_info collects information of the ISP configuration
+ * blocks for validation purposes. It currently only contains the expected
+ * block size.
+ *
+ * Drivers shall prepare a list of block info, indexed by block type, one for
+ * each supported ISP block and correctly populate them with the expected block
+ * size.
+ */
+struct v4l2_isp_params_block_info {
+	size_t size;
+};
+
+/**
+ * v4l2_isp_params_validate_buffer - Validate a V4L2 ISP parameters buffer
+ * @dev: the driver's device pointer
+ * @vb: the videobuf2 buffer
+ * @buffer: the V4L2 ISP parameters buffer
+ * @info: the list of per-block validation info
+ * @num_blocks: the number of entries in @info
+ *
+ * This function completes the validation of a V4L2 ISP parameters buffer,
+ * verifying each configuration block correctness before the driver can use
+ * them to program the hardware.
+ *
+ * Drivers should use this function after having validated the correctness of
+ * the vb2 buffer sizes by using the v4l2_isp_params_validate_buffer_size()
+ * helper first. Once the buffer size has been validated, drivers should
+ * perform a copy of the user provided buffer into a kernel-only memory buffer
+ * to prevent userspace from modifying its content after it has been submitted
+ * to the driver, and then call this function to complete validation.
+ */
+int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
+				    const struct v4l2_isp_params_buffer *buffer,
+				    const struct v4l2_isp_params_block_info *info,
+				    size_t num_blocks);
+
+#endif /* _V4L2_ISP_H_ */
-- 
cgit v1.2.3


From ec4ac3cb7198070611987a6e91829fce0f4ce6d0 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:45 +0000
Subject: media: uapi: Add MEDIA_BUS_FMT_RGB202020_1X60 format code

The Mali-C55 ISP by ARM requires 20-bits per colour channel input on
the bus. Add a new media bus format code to represent it.

Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Nayden Kanchev <nayden.kanchev@arm.com>
Co-developed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 .../userspace-api/media/v4l/subdev-formats.rst     | 168 +++++++++++++++++++++
 include/uapi/linux/media-bus-format.h              |   3 +-
 2 files changed, 170 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/subdev-formats.rst b/Documentation/userspace-api/media/v4l/subdev-formats.rst
index 1904390df830..894592e15a2b 100644
--- a/Documentation/userspace-api/media/v4l/subdev-formats.rst
+++ b/Documentation/userspace-api/media/v4l/subdev-formats.rst
@@ -2225,6 +2225,174 @@ The following table list existing packed 48bit wide RGB formats.
 
     \endgroup
 
+The following table lists existing packed 60bit wide RGB formats.
+
+.. tabularcolumns:: |p{4.0cm}|p{0.7cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|
+
+.. _v4l2-mbus-pixelcode-rgb-60:
+
+.. raw:: latex
+
+    \begingroup
+    \tiny
+    \setlength{\tabcolsep}{2pt}
+
+.. flat-table:: 60bit RGB formats
+    :header-rows:  3
+    :stub-columns: 0
+    :widths: 36 7 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
+
+    * - Identifier
+      - Code
+      -
+      - :cspan:`31` Data organization
+    * -
+      -
+      - Bit
+      -
+      -
+      -
+      -
+      - 59
+      - 58
+      - 57
+      - 56
+      - 55
+      - 54
+      - 53
+      - 52
+      - 51
+      - 50
+      - 49
+      - 48
+      - 47
+      - 46
+      - 45
+      - 44
+      - 43
+      - 42
+      - 41
+      - 40
+      - 39
+      - 38
+      - 37
+      - 36
+      - 35
+      - 34
+      - 33
+      - 32
+    * -
+      -
+      -
+      - 31
+      - 30
+      - 29
+      - 28
+      - 27
+      - 26
+      - 25
+      - 24
+      - 23
+      - 22
+      - 21
+      - 20
+      - 19
+      - 18
+      - 17
+      - 16
+      - 15
+      - 14
+      - 13
+      - 12
+      - 11
+      - 10
+      - 9
+      - 8
+      - 7
+      - 6
+      - 5
+      - 4
+      - 3
+      - 2
+      - 1
+      - 0
+    * .. _MEDIA-BUS-FMT-RGB202020-1X60:
+
+      - MEDIA_BUS_FMT_RGB202020_1X60
+      - 0x1028
+      -
+      -
+      -
+      -
+      -
+      - r\ :sub:`19`
+      - r\ :sub:`18`
+      - r\ :sub:`17`
+      - r\ :sub:`16`
+      - r\ :sub:`15`
+      - r\ :sub:`14`
+      - r\ :sub:`13`
+      - r\ :sub:`12`
+      - r\ :sub:`11`
+      - r\ :sub:`10`
+      - r\ :sub:`9`
+      - r\ :sub:`8`
+      - r\ :sub:`7`
+      - r\ :sub:`6`
+      - r\ :sub:`5`
+      - r\ :sub:`4`
+      - r\ :sub:`3`
+      - r\ :sub:`2`
+      - r\ :sub:`1`
+      - r\ :sub:`0`
+      - g\ :sub:`19`
+      - g\ :sub:`18`
+      - g\ :sub:`17`
+      - g\ :sub:`16`
+      - g\ :sub:`15`
+      - g\ :sub:`14`
+      - g\ :sub:`13`
+      - g\ :sub:`12`
+    * -
+      -
+      -
+      - g\ :sub:`11`
+      - g\ :sub:`10`
+      - g\ :sub:`9`
+      - g\ :sub:`8`
+      - g\ :sub:`7`
+      - g\ :sub:`6`
+      - g\ :sub:`5`
+      - g\ :sub:`4`
+      - g\ :sub:`3`
+      - g\ :sub:`2`
+      - g\ :sub:`1`
+      - g\ :sub:`0`
+      - b\ :sub:`19`
+      - b\ :sub:`18`
+      - b\ :sub:`17`
+      - b\ :sub:`16`
+      - b\ :sub:`15`
+      - b\ :sub:`14`
+      - b\ :sub:`13`
+      - b\ :sub:`12`
+      - b\ :sub:`11`
+      - b\ :sub:`10`
+      - b\ :sub:`9`
+      - b\ :sub:`8`
+      - b\ :sub:`7`
+      - b\ :sub:`6`
+      - b\ :sub:`5`
+      - b\ :sub:`4`
+      - b\ :sub:`3`
+      - b\ :sub:`2`
+      - b\ :sub:`1`
+      - b\ :sub:`0`
+
+.. raw:: latex
+
+    \endgroup
+
 On LVDS buses, usually each sample is transferred serialized in seven
 time slots per pixel clock, on three (18-bit) or four (24-bit) or five (30-bit)
 differential data pairs at the same time. The remaining bits are used
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
index ff62056feed5..62ad82fd285a 100644
--- a/include/uapi/linux/media-bus-format.h
+++ b/include/uapi/linux/media-bus-format.h
@@ -34,7 +34,7 @@
 
 #define MEDIA_BUS_FMT_FIXED			0x0001
 
-/* RGB - next is	0x1028 */
+/* RGB - next is	0x1029 */
 #define MEDIA_BUS_FMT_RGB444_1X12		0x1016
 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE	0x1001
 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE	0x1002
@@ -74,6 +74,7 @@
 #define MEDIA_BUS_FMT_RGB888_1X36_CPADLO	0x1021
 #define MEDIA_BUS_FMT_RGB121212_1X36		0x1019
 #define MEDIA_BUS_FMT_RGB161616_1X48		0x101a
+#define MEDIA_BUS_FMT_RGB202020_1X60		0x1028
 
 /* YUV (including grey) - next is	0x202f */
 #define MEDIA_BUS_FMT_Y8_1X8			0x2001
-- 
cgit v1.2.3


From 2477ab037621632c3ec167187dc9e7afac2ba7f2 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:46 +0000
Subject: media: uapi: Add 20-bit bayer formats

The Mali-C55 requires input data be in 20-bit format, MSB aligned.
Add some new media bus format macros to represent that input format.

Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Co-developed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 .../userspace-api/media/v4l/subdev-formats.rst     | 252 ++++++++++++++++++++-
 include/uapi/linux/media-bus-format.h              |   6 +-
 2 files changed, 255 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/subdev-formats.rst b/Documentation/userspace-api/media/v4l/subdev-formats.rst
index 894592e15a2b..cf970750dd4c 100644
--- a/Documentation/userspace-api/media/v4l/subdev-formats.rst
+++ b/Documentation/userspace-api/media/v4l/subdev-formats.rst
@@ -2817,7 +2817,7 @@ organization is given as an example for the first pixel only.
     \tiny
     \setlength{\tabcolsep}{2pt}
 
-.. tabularcolumns:: |p{6.0cm}|p{0.7cm}|p{0.3cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|
+.. tabularcolumns:: |p{6.0cm}|p{0.7cm}|p{0.3cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|p{0.22cm}|
 
 .. _v4l2-mbus-pixelcode-bayer:
 
@@ -2830,10 +2830,14 @@ organization is given as an example for the first pixel only.
     * - Identifier
       - Code
       -
-      - :cspan:`15` Data organization
+      - :cspan:`19` Data organization
     * -
       -
       - Bit
+      - 19
+      - 18
+      - 17
+      - 16
       - 15
       - 14
       - 13
@@ -2863,6 +2867,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`7`
       - b\ :sub:`6`
       - b\ :sub:`5`
@@ -2884,6 +2892,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -2905,6 +2917,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -2926,6 +2942,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`7`
       - r\ :sub:`6`
       - r\ :sub:`5`
@@ -2947,6 +2967,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`7`
       - b\ :sub:`6`
       - b\ :sub:`5`
@@ -2968,6 +2992,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -2989,6 +3017,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -3010,6 +3042,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`7`
       - r\ :sub:`6`
       - r\ :sub:`5`
@@ -3031,6 +3067,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`7`
       - b\ :sub:`6`
       - b\ :sub:`5`
@@ -3052,6 +3092,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -3073,6 +3117,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`7`
       - g\ :sub:`6`
       - g\ :sub:`5`
@@ -3094,6 +3142,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`7`
       - r\ :sub:`6`
       - r\ :sub:`5`
@@ -3115,6 +3167,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - 0
       - 0
       - 0
@@ -3134,6 +3190,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`7`
       - b\ :sub:`6`
       - b\ :sub:`5`
@@ -3155,6 +3215,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`7`
       - b\ :sub:`6`
       - b\ :sub:`5`
@@ -3174,6 +3238,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - 0
       - 0
       - 0
@@ -3195,6 +3263,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`9`
       - b\ :sub:`8`
       - b\ :sub:`7`
@@ -3214,6 +3286,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`1`
       - b\ :sub:`0`
       - 0
@@ -3235,6 +3311,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`1`
       - b\ :sub:`0`
       - 0
@@ -3254,6 +3334,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`9`
       - b\ :sub:`8`
       - b\ :sub:`7`
@@ -3273,6 +3357,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`9`
       - b\ :sub:`8`
       - b\ :sub:`7`
@@ -3294,6 +3382,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`9`
       - g\ :sub:`8`
       - g\ :sub:`7`
@@ -3315,6 +3407,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`9`
       - g\ :sub:`8`
       - g\ :sub:`7`
@@ -3336,6 +3432,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`9`
       - r\ :sub:`8`
       - r\ :sub:`7`
@@ -3355,6 +3455,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`11`
       - b\ :sub:`10`
       - b\ :sub:`9`
@@ -3376,6 +3480,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`11`
       - g\ :sub:`10`
       - g\ :sub:`9`
@@ -3397,6 +3505,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`11`
       - g\ :sub:`10`
       - g\ :sub:`9`
@@ -3418,6 +3530,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`11`
       - r\ :sub:`10`
       - r\ :sub:`9`
@@ -3437,6 +3553,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`13`
       - b\ :sub:`12`
       - b\ :sub:`11`
@@ -3458,6 +3578,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`13`
       - g\ :sub:`12`
       - g\ :sub:`11`
@@ -3479,6 +3603,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`13`
       - g\ :sub:`12`
       - g\ :sub:`11`
@@ -3500,6 +3628,10 @@ organization is given as an example for the first pixel only.
       -
       -
       -
+      -
+      -
+      -
+      -
       - r\ :sub:`13`
       - r\ :sub:`12`
       - r\ :sub:`11`
@@ -3519,6 +3651,10 @@ organization is given as an example for the first pixel only.
       - MEDIA_BUS_FMT_SBGGR16_1X16
       - 0x301d
       -
+      -
+      -
+      -
+      -
       - b\ :sub:`15`
       - b\ :sub:`14`
       - b\ :sub:`13`
@@ -3540,6 +3676,10 @@ organization is given as an example for the first pixel only.
       - MEDIA_BUS_FMT_SGBRG16_1X16
       - 0x301e
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`15`
       - g\ :sub:`14`
       - g\ :sub:`13`
@@ -3561,6 +3701,10 @@ organization is given as an example for the first pixel only.
       - MEDIA_BUS_FMT_SGRBG16_1X16
       - 0x301f
       -
+      -
+      -
+      -
+      -
       - g\ :sub:`15`
       - g\ :sub:`14`
       - g\ :sub:`13`
@@ -3582,6 +3726,110 @@ organization is given as an example for the first pixel only.
       - MEDIA_BUS_FMT_SRGGB16_1X16
       - 0x3020
       -
+      -
+      -
+      -
+      -
+      - r\ :sub:`15`
+      - r\ :sub:`14`
+      - r\ :sub:`13`
+      - r\ :sub:`12`
+      - r\ :sub:`11`
+      - r\ :sub:`10`
+      - r\ :sub:`9`
+      - r\ :sub:`8`
+      - r\ :sub:`7`
+      - r\ :sub:`6`
+      - r\ :sub:`5`
+      - r\ :sub:`4`
+      - r\ :sub:`3`
+      - r\ :sub:`2`
+      - r\ :sub:`1`
+      - r\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-SBGGR20-1X20:
+
+      - MEDIA_BUS_FMT_SBGGR20_1X20
+      - 0x3021
+      -
+      - b\ :sub:`19`
+      - b\ :sub:`18`
+      - b\ :sub:`17`
+      - b\ :sub:`16`
+      - b\ :sub:`15`
+      - b\ :sub:`14`
+      - b\ :sub:`13`
+      - b\ :sub:`12`
+      - b\ :sub:`11`
+      - b\ :sub:`10`
+      - b\ :sub:`9`
+      - b\ :sub:`8`
+      - b\ :sub:`7`
+      - b\ :sub:`6`
+      - b\ :sub:`5`
+      - b\ :sub:`4`
+      - b\ :sub:`3`
+      - b\ :sub:`2`
+      - b\ :sub:`1`
+      - b\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-SGBRG20-1X20:
+
+      - MEDIA_BUS_FMT_SGBRG20_1X20
+      - 0x3022
+      -
+      - g\ :sub:`19`
+      - g\ :sub:`18`
+      - g\ :sub:`17`
+      - g\ :sub:`16`
+      - g\ :sub:`15`
+      - g\ :sub:`14`
+      - g\ :sub:`13`
+      - g\ :sub:`12`
+      - g\ :sub:`11`
+      - g\ :sub:`10`
+      - g\ :sub:`9`
+      - g\ :sub:`8`
+      - g\ :sub:`7`
+      - g\ :sub:`6`
+      - g\ :sub:`5`
+      - g\ :sub:`4`
+      - g\ :sub:`3`
+      - g\ :sub:`2`
+      - g\ :sub:`1`
+      - g\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-SGRBG20-1X20:
+
+      - MEDIA_BUS_FMT_SGRBG20_1X20
+      - 0x3023
+      -
+      - g\ :sub:`19`
+      - g\ :sub:`18`
+      - g\ :sub:`17`
+      - g\ :sub:`16`
+      - g\ :sub:`15`
+      - g\ :sub:`14`
+      - g\ :sub:`13`
+      - g\ :sub:`12`
+      - g\ :sub:`11`
+      - g\ :sub:`10`
+      - g\ :sub:`9`
+      - g\ :sub:`8`
+      - g\ :sub:`7`
+      - g\ :sub:`6`
+      - g\ :sub:`5`
+      - g\ :sub:`4`
+      - g\ :sub:`3`
+      - g\ :sub:`2`
+      - g\ :sub:`1`
+      - g\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-SRGGB20-1X20:
+
+      - MEDIA_BUS_FMT_SRGGB20_1X20
+      - 0x3024
+      -
+      - r\ :sub:`19`
+      - r\ :sub:`18`
+      - r\ :sub:`17`
+      - r\ :sub:`16`
       - r\ :sub:`15`
       - r\ :sub:`14`
       - r\ :sub:`13`
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
index 62ad82fd285a..6005f033e62c 100644
--- a/include/uapi/linux/media-bus-format.h
+++ b/include/uapi/linux/media-bus-format.h
@@ -124,7 +124,7 @@
 #define MEDIA_BUS_FMT_YUV16_1X48		0x202a
 #define MEDIA_BUS_FMT_UYYVYY16_0_5X48		0x202b
 
-/* Bayer - next is	0x3021 */
+/* Bayer - next is	0x3025 */
 #define MEDIA_BUS_FMT_SBGGR8_1X8		0x3001
 #define MEDIA_BUS_FMT_SGBRG8_1X8		0x3013
 #define MEDIA_BUS_FMT_SGRBG8_1X8		0x3002
@@ -157,6 +157,10 @@
 #define MEDIA_BUS_FMT_SGBRG16_1X16		0x301e
 #define MEDIA_BUS_FMT_SGRBG16_1X16		0x301f
 #define MEDIA_BUS_FMT_SRGGB16_1X16		0x3020
+#define MEDIA_BUS_FMT_SBGGR20_1X20		0x3021
+#define MEDIA_BUS_FMT_SGBRG20_1X20		0x3022
+#define MEDIA_BUS_FMT_SGRBG20_1X20		0x3023
+#define MEDIA_BUS_FMT_SRGGB20_1X20		0x3024
 
 /* JPEG compressed formats - next is	0x4002 */
 #define MEDIA_BUS_FMT_JPEG_1X8			0x4001
-- 
cgit v1.2.3


From 8d0bbed21ef737195277c0af8c30511fb72e608b Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:48 +0000
Subject: media: uapi: Add controls for Mali-C55 ISP

Add definitions and documentation for the custom control that will
be needed by the Mali-C55 ISP driver. This will be a read only
bitmask of the driver's capabilities, informing userspace of which
blocks are fitted and which are absent.

Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 .../userspace-api/media/drivers/index.rst          |  1 +
 .../userspace-api/media/drivers/mali-c55.rst       | 55 ++++++++++++++++++++++
 include/uapi/linux/media/arm/mali-c55-config.h     | 26 ++++++++++
 include/uapi/linux/v4l2-controls.h                 |  6 +++
 4 files changed, 88 insertions(+)
 create mode 100644 Documentation/userspace-api/media/drivers/mali-c55.rst
 create mode 100644 include/uapi/linux/media/arm/mali-c55-config.h

(limited to 'include')

diff --git a/Documentation/userspace-api/media/drivers/index.rst b/Documentation/userspace-api/media/drivers/index.rst
index d706cb47b112..02967c9b18d6 100644
--- a/Documentation/userspace-api/media/drivers/index.rst
+++ b/Documentation/userspace-api/media/drivers/index.rst
@@ -32,6 +32,7 @@ For more details see the file COPYING in the source distribution of Linux.
 	cx2341x-uapi
 	dw100
 	imx-uapi
+	mali-c55
 	max2175
 	npcm-video
 	omap3isp-uapi
diff --git a/Documentation/userspace-api/media/drivers/mali-c55.rst b/Documentation/userspace-api/media/drivers/mali-c55.rst
new file mode 100644
index 000000000000..21148b187856
--- /dev/null
+++ b/Documentation/userspace-api/media/drivers/mali-c55.rst
@@ -0,0 +1,55 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+Arm Mali-C55 ISP driver
+=======================
+
+The Arm Mali-C55 ISP driver implements a single driver-specific control:
+
+``V4L2_CID_MALI_C55_CAPABILITIES (bitmask)``
+    Reports the capabilities of the ISP as a bitmask of the blocks that are fitted.
+
+    .. flat-table:: Bitmask meaning definitions
+	:header-rows: 1
+	:widths: 2 4 8
+
+	* - Bit
+	  - Macro
+	  - Meaning
+        * - 0
+          - MALI_C55_GPS_PONG
+          - Pong configuration space is fitted in the ISP
+        * - 1
+          - MALI_C55_GPS_WDR
+          - WDR Framestitch, offset and gain is fitted in the ISP
+        * - 2
+          - MALI_C55_GPS_COMPRESSION
+          - Temper compression is fitted in the ISP
+        * - 3
+          - MALI_C55_GPS_TEMPER
+          - Temper is fitted in the ISP
+        * - 4
+          - MALI_C55_GPS_SINTER_LITE
+          - Sinter Lite is fitted in the ISP instead of the full Sinter version
+        * - 5
+          - MALI_C55_GPS_SINTER
+          - Sinter is fitted in the ISP
+        * - 6
+          - MALI_C55_GPS_IRIDIX_LTM
+          - Iridix local tone mapping is fitted in the ISP
+        * - 7
+          - MALI_C55_GPS_IRIDIX_GTM
+          - Iridix global tone mapping is fitted in the ISP
+        * - 8
+          - MALI_C55_GPS_CNR
+          - Colour noise reduction is fitted in the ISP
+        * - 9
+          - MALI_C55_GPS_FRSCALER
+          - The full resolution pipe scaler is fitted in the ISP
+        * - 10
+          - MALI_C55_GPS_DS_PIPE
+          - The downscale pipe is fitted in the ISP
+
+    The Mali-C55 ISP can be configured in a number of ways to include or exclude
+    blocks which may not be necessary. This control provides a way for the
+    driver to communicate to userspace which of the blocks are fitted in the
+    design.
\ No newline at end of file
diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h
new file mode 100644
index 000000000000..7fddece54ada
--- /dev/null
+++ b/include/uapi/linux/media/arm/mali-c55-config.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ARM Mali-C55 ISP Driver - Userspace API
+ *
+ * Copyright (C) 2023 Ideas on Board Oy
+ */
+
+#ifndef __UAPI_MALI_C55_CONFIG_H
+#define __UAPI_MALI_C55_CONFIG_H
+
+#include <linux/v4l2-controls.h>
+
+#define V4L2_CID_MALI_C55_CAPABILITIES	(V4L2_CID_USER_MALI_C55_BASE + 0x0)
+#define MALI_C55_GPS_PONG		(1U << 0)
+#define MALI_C55_GPS_WDR		(1U << 1)
+#define MALI_C55_GPS_COMPRESSION	(1U << 2)
+#define MALI_C55_GPS_TEMPER		(1U << 3)
+#define MALI_C55_GPS_SINTER_LITE	(1U << 4)
+#define MALI_C55_GPS_SINTER		(1U << 5)
+#define MALI_C55_GPS_IRIDIX_LTM		(1U << 6)
+#define MALI_C55_GPS_IRIDIX_GTM		(1U << 7)
+#define MALI_C55_GPS_CNR		(1U << 8)
+#define MALI_C55_GPS_FRSCALER		(1U << 9)
+#define MALI_C55_GPS_DS_PIPE		(1U << 10)
+
+#endif /* __UAPI_MALI_C55_CONFIG_H */
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 2d30107e047e..f84ed133a6c9 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -228,6 +228,12 @@ enum v4l2_colorfx {
  */
 #define V4L2_CID_USER_RKISP1_BASE		(V4L2_CID_USER_BASE + 0x1220)
 
+/*
+ * The base for the Arm Mali-C55 ISP driver controls.
+ * We reserve 16 controls for this driver
+ */
+#define V4L2_CID_USER_MALI_C55_BASE		(V4L2_CID_USER_BASE + 0x1230)
+
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
  * and the 'MPEG' part of the define is historical */
-- 
cgit v1.2.3


From 4d36f732366aeb32bf3486545e597500a3bf0994 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:52 +0000
Subject: media: Add MALI_C55_3A_STATS meta format

Add a new meta format for the Mali-C55 ISP's 3A Statistics along
with a new descriptor entry.

Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Nayden Kanchev  <nayden.kanchev@arm.com>
Co-developed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 1 +
 include/uapi/linux/videodev2.h       | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 01cf52c3ea33..bfab29938b8f 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1469,6 +1469,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_META_FMT_RK_ISP1_EXT_PARAMS:	descr = "Rockchip ISP1 Ext 3A Params"; break;
 	case V4L2_META_FMT_C3ISP_PARAMS:	descr = "Amlogic C3 ISP Parameters"; break;
 	case V4L2_META_FMT_C3ISP_STATS:		descr = "Amlogic C3 ISP Statistics"; break;
+	case V4L2_META_FMT_MALI_C55_STATS:	descr = "ARM Mali-C55 ISP 3A Statistics"; break;
 	case V4L2_PIX_FMT_NV12_8L128:	descr = "NV12 (8x128 Linear)"; break;
 	case V4L2_PIX_FMT_NV12M_8L128:	descr = "NV12M (8x128 Linear)"; break;
 	case V4L2_PIX_FMT_NV12_10BE_8L128:	descr = "10-bit NV12 (8x128 Linear, BE)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index becd08fdbddb..cba4b1311667 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -884,6 +884,9 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_RPI_FE_CFG	v4l2_fourcc('R', 'P', 'F', 'C') /* PiSP FE configuration */
 #define V4L2_META_FMT_RPI_FE_STATS	v4l2_fourcc('R', 'P', 'F', 'S') /* PiSP FE stats */
 
+/* Vendor specific - used for Arm Mali-C55 ISP */
+#define V4L2_META_FMT_MALI_C55_STATS	v4l2_fourcc('C', '5', '5', 'S') /* ARM Mali-C55 3A Statistics */
+
 #ifdef __KERNEL__
 /*
  * Line-based metadata formats. Remember to update v4l_fill_fmtdesc() when
-- 
cgit v1.2.3


From c7f832f6f8129bb666346cb4805805ad056059b7 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:53 +0000
Subject: media: uapi: Add 3a stats buffer for mali-c55

Describe the format of the 3A statistics buffers in the userspace API
header for the mali-c55 ISP.

Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Nayden Kanchev  <nayden.kanchev@arm.com>
Co-developed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 MAINTAINERS                                    |   1 +
 include/uapi/linux/media/arm/mali-c55-config.h | 170 +++++++++++++++++++++++++
 2 files changed, 171 insertions(+)

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index dc3719cb6120..193580ceb9f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2116,6 +2116,7 @@ F:	Documentation/admin-guide/media/mali-c55.rst
 F:	Documentation/devicetree/bindings/media/arm,mali-c55.yaml
 F:	Documentation/userspace-api/media/drivers/mali-c55.rst
 F:	drivers/media/platform/arm/mali-c55/
+F:	include/uapi/linux/media/arm/mali-c55-config.h
 
 ARM MALI PANTHOR DRM DRIVER
 M:	Boris Brezillon <boris.brezillon@collabora.com>
diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h
index 7fddece54ada..e31fb8ffa10a 100644
--- a/include/uapi/linux/media/arm/mali-c55-config.h
+++ b/include/uapi/linux/media/arm/mali-c55-config.h
@@ -8,6 +8,7 @@
 #ifndef __UAPI_MALI_C55_CONFIG_H
 #define __UAPI_MALI_C55_CONFIG_H
 
+#include <linux/types.h>
 #include <linux/v4l2-controls.h>
 
 #define V4L2_CID_MALI_C55_CAPABILITIES	(V4L2_CID_USER_MALI_C55_BASE + 0x0)
@@ -23,4 +24,173 @@
 #define MALI_C55_GPS_FRSCALER		(1U << 9)
 #define MALI_C55_GPS_DS_PIPE		(1U << 10)
 
+/*
+ * Frames are split into zones of almost equal width and height - a zone is a
+ * rectangular tile of a frame. The metering blocks within the ISP collect
+ * aggregated statistics per zone. A maximum of 15x15 zones can be configured,
+ * and so the statistics buffer within the hardware is sized to accommodate
+ * that.
+ *
+ * The utilised number of zones is runtime configurable.
+ */
+#define MALI_C55_MAX_ZONES	(15 * 15)
+
+/**
+ * struct mali_c55_ae_1024bin_hist - Auto Exposure 1024-bin histogram statistics
+ *
+ * @bins:	1024 element array of 16-bit pixel counts.
+ *
+ * The 1024-bin histogram module collects image-global but zone-weighted
+ * intensity distributions of pixels in fixed-width bins. The modules can be
+ * configured into different "plane modes" which affect the contents of the
+ * collected statistics. In plane mode 0, pixel intensities are taken regardless
+ * of colour plane into a single 1024-bin histogram with a bin width of 4. In
+ * plane mode 1, four 256-bin histograms with a bin width of 16 are collected -
+ * one for each CFA colour plane. In plane modes 4, 5, 6 and 7 two 512-bin
+ * histograms with a bin width of 8 are collected - in each mode one of the
+ * colour planes is collected into the first histogram and all the others are
+ * combined into the second. The histograms are stored consecutively in the bins
+ * array.
+ *
+ * The 16-bit pixel counts are stored as a 4-bit exponent in the most
+ * significant bits followed by a 12-bit mantissa. Conversion to a usable
+ * format can be done according to the following pseudo-code::
+ *
+ *	if (e == 0) {
+ *		bin = m * 2;
+ *	} else {
+ *		bin = (m + 4096) * 2^e
+ *	}
+ *
+ * where
+ *	e is the exponent value in range 0..15
+ *	m is the mantissa value in range 0..4095
+ *
+ * The pixels used in calculating the statistics can be masked using three
+ * methods:
+ *
+ * 1. Pixels can be skipped in X and Y directions independently.
+ * 2. Minimum/Maximum intensities can be configured
+ * 3. Zones can be differentially weighted, including 0 weighted to mask them
+ *
+ * The data for this histogram can be collected from different tap points in the
+ * ISP depending on configuration - after the white balance or digital gain
+ * blocks, or immediately after the input crossbar.
+ */
+struct mali_c55_ae_1024bin_hist {
+	__u16 bins[1024];
+} __attribute__((packed));
+
+/**
+ * struct mali_c55_ae_5bin_hist - Auto Exposure 5-bin histogram statistics
+ *
+ * @hist0:	16-bit normalised pixel count for the 0th intensity bin
+ * @hist1:	16-bit normalised pixel count for the 1st intensity bin
+ * @hist3:	16-bit normalised pixel count for the 3rd intensity bin
+ * @hist4:	16-bit normalised pixel count for the 4th intensity bin
+ *
+ * The ISP generates a 5-bin histogram of normalised pixel counts within bins of
+ * pixel intensity for each of 225 possible zones within a frame. The centre bin
+ * of the histogram for each zone is not available from the hardware and must be
+ * calculated by subtracting the values of hist0, hist1, hist3 and hist4 from
+ * 0xffff as in the following equation:
+ *
+ *	hist2 = 0xffff - (hist0 + hist1 + hist3 + hist4)
+ */
+struct mali_c55_ae_5bin_hist {
+	__u16 hist0;
+	__u16 hist1;
+	__u16 hist3;
+	__u16 hist4;
+} __attribute__((packed));
+
+/**
+ * struct mali_c55_awb_average_ratios - Auto White Balance colour ratios
+ *
+ * @avg_rg_gr:	Average R/G or G/R ratio in Q4.8 format.
+ * @avg_bg_br:	Average B/G or B/R ratio in Q4.8 format.
+ * @num_pixels:	The number of pixels used in the AWB calculation
+ *
+ * The ISP calculates and collects average colour ratios for each zone in an
+ * image and stores them in Q4.8 format (the lowest 8 bits are fractional, with
+ * bits [11:8] representing the integer). The exact ratios collected (either
+ * R/G, B/G or G/R, B/R) are configurable through the parameters buffer. The
+ * value of the 4 high bits is undefined.
+ */
+struct mali_c55_awb_average_ratios {
+	__u16 avg_rg_gr;
+	__u16 avg_bg_br;
+	__u32 num_pixels;
+} __attribute__((packed));
+
+/**
+ * struct mali_c55_af_statistics - Auto Focus edge and intensity statistics
+ *
+ * @intensity_stats:	Packed mantissa and exponent value for pixel intensity
+ * @edge_stats:		Packed mantissa and exponent values for edge intensity
+ *
+ * The ISP collects the squared sum of pixel intensities for each zone within a
+ * configurable Region of Interest on the frame. Additionally, the same data are
+ * collected after being passed through a bandpass filter which removes high and
+ * low frequency components - these are referred to as the edge statistics.
+ *
+ * The intensity and edge statistics for a zone can be used to calculate the
+ * contrast information for a zone
+ *
+ *	C = E2 / I2
+ *
+ * Where I2 is the intensity statistic for a zone and E2 is the edge statistic
+ * for that zone. Optimum focus is reached when C is at its maximum.
+ *
+ * The intensity and edge statistics are stored packed into a non-standard 16
+ * bit floating point format, where the 7 most significant bits represent the
+ * exponent and the 9 least significant bits the mantissa. This format can be
+ * unpacked with the following pseudocode::
+ *
+ *	if (e == 0) {
+ *		x = m;
+ *	} else {
+ *		x = 2^(e - 1) * (m + 2^9)
+ *	}
+ *
+ * where
+ *	e is the exponent value in range 0..127
+ *	m is the mantissa value in range 0..511
+ */
+struct mali_c55_af_statistics {
+	__u16 intensity_stats;
+	__u16 edge_stats;
+} __attribute__((packed));
+
+/**
+ * struct mali_c55_stats_buffer - 3A statistics for the mali-c55 ISP
+ *
+ * @ae_1024bin_hist:		1024-bin frame-global pixel intensity histogram
+ * @iridix_1024bin_hist:	Post-Iridix block 1024-bin histogram
+ * @ae_5bin_hists:		5-bin pixel intensity histograms for AEC
+ * @reserved1:			Undefined buffer space
+ * @awb_ratios:			Color balance ratios for Auto White Balance
+ * @reserved2:			Undefined buffer space
+ * @af_statistics:		Pixel intensity statistics for Auto Focus
+ * @reserved3:			Undefined buffer space
+ *
+ * This struct describes the metering statistics space in the Mali-C55 ISP's
+ * hardware in its entirety. The gaps between the defined areas are covered by
+ * the reserved members; their contents are undefined (they may not be 0) and
+ * should not be used. The @ae_5bin_hists, @awb_ratios and @af_statistics
+ * members are arrays of statistics per-zone. The zones are arranged in the
+ * array in raster order starting from the top left corner of the image.
+ */
+
+struct mali_c55_stats_buffer {
+	struct mali_c55_ae_1024bin_hist ae_1024bin_hist;
+	struct mali_c55_ae_1024bin_hist iridix_1024bin_hist;
+	struct mali_c55_ae_5bin_hist ae_5bin_hists[MALI_C55_MAX_ZONES];
+	__u32 reserved1[14];
+	struct mali_c55_awb_average_ratios awb_ratios[MALI_C55_MAX_ZONES];
+	__u32 reserved2[14];
+	struct mali_c55_af_statistics af_statistics[MALI_C55_MAX_ZONES];
+	__u32 reserved3[15];
+} __attribute__((packed));
+
 #endif /* __UAPI_MALI_C55_CONFIG_H */
-- 
cgit v1.2.3


From 1ab3cb233d61131b2d02650f8ed9e4e077fd4508 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:56 +0000
Subject: media: mali-c55: Add image formats for Mali-C55 parameters buffer

Add a new V4L2 meta format code for the Mali-C55 parameters.

Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Nayden Kanchev  <nayden.kanchev@arm.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 1 +
 include/uapi/linux/videodev2.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index bfab29938b8f..98512ea4cc5b 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1469,6 +1469,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_META_FMT_RK_ISP1_EXT_PARAMS:	descr = "Rockchip ISP1 Ext 3A Params"; break;
 	case V4L2_META_FMT_C3ISP_PARAMS:	descr = "Amlogic C3 ISP Parameters"; break;
 	case V4L2_META_FMT_C3ISP_STATS:		descr = "Amlogic C3 ISP Statistics"; break;
+	case V4L2_META_FMT_MALI_C55_PARAMS:	descr = "ARM Mali-C55 ISP Parameters"; break;
 	case V4L2_META_FMT_MALI_C55_STATS:	descr = "ARM Mali-C55 ISP 3A Statistics"; break;
 	case V4L2_PIX_FMT_NV12_8L128:	descr = "NV12 (8x128 Linear)"; break;
 	case V4L2_PIX_FMT_NV12M_8L128:	descr = "NV12M (8x128 Linear)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index cba4b1311667..add08188f068 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -885,6 +885,7 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_RPI_FE_STATS	v4l2_fourcc('R', 'P', 'F', 'S') /* PiSP FE stats */
 
 /* Vendor specific - used for Arm Mali-C55 ISP */
+#define V4L2_META_FMT_MALI_C55_PARAMS	v4l2_fourcc('C', '5', '5', 'P') /* ARM Mali-C55 Parameters */
 #define V4L2_META_FMT_MALI_C55_STATS	v4l2_fourcc('C', '5', '5', 'S') /* ARM Mali-C55 3A Statistics */
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3


From 08a99369f44eeb63eacc56fe42f4c67a6c7dbc37 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally@ideasonboard.com>
Date: Tue, 11 Nov 2025 16:15:57 +0000
Subject: media: uapi: Add parameters structs to mali-c55-config.h

Add structures describing the ISP parameters to mali-c55-config.h

Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Nayden Kanchev  <nayden.kanchev@arm.com>
Co-developed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/uapi/linux/media/arm/mali-c55-config.h | 598 +++++++++++++++++++++++++
 1 file changed, 598 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h
index e31fb8ffa10a..109082c5694f 100644
--- a/include/uapi/linux/media/arm/mali-c55-config.h
+++ b/include/uapi/linux/media/arm/mali-c55-config.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/v4l2-controls.h>
+#include <linux/media/v4l2-isp.h>
 
 #define V4L2_CID_MALI_C55_CAPABILITIES	(V4L2_CID_USER_MALI_C55_BASE + 0x0)
 #define MALI_C55_GPS_PONG		(1U << 0)
@@ -193,4 +194,601 @@ struct mali_c55_stats_buffer {
 	__u32 reserved3[15];
 } __attribute__((packed));
 
+/**
+ * enum mali_c55_param_buffer_version - Mali-C55 parameters block versioning
+ *
+ * @MALI_C55_PARAM_BUFFER_V1: First version of Mali-C55 parameters block
+ */
+enum mali_c55_param_buffer_version {
+	MALI_C55_PARAM_BUFFER_V1,
+};
+
+/**
+ * enum mali_c55_param_block_type - Enumeration of Mali-C55 parameter blocks
+ *
+ * This enumeration defines the types of Mali-C55 parameters block. Each block
+ * configures a specific processing block of the Mali-C55 ISP. The block
+ * type allows the driver to correctly interpret the parameters block data.
+ *
+ * It is the responsibility of userspace to correctly set the type of each
+ * parameters block.
+ *
+ * @MALI_C55_PARAM_BLOCK_SENSOR_OFFS: Sensor pre-shading black level offset
+ * @MALI_C55_PARAM_BLOCK_AEXP_HIST: Auto-exposure 1024-bin histogram
+ *				    configuration
+ * @MALI_C55_PARAM_BLOCK_AEXP_IHIST: Post-Iridix auto-exposure 1024-bin
+ *				     histogram configuration
+ * @MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS: Auto-exposure 1024-bin histogram
+ *					    weighting
+ * @MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS: Post-Iridix auto-exposure 1024-bin
+ *					     histogram weighting
+ * @MALI_C55_PARAM_BLOCK_DIGITAL_GAIN: Digital gain
+ * @MALI_C55_PARAM_BLOCK_AWB_GAINS: Auto-white balance gains
+ * @MALI_C55_PARAM_BLOCK_AWB_CONFIG: Auto-white balance statistics config
+ * @MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP: Auto-white balance gains for AEXP-0 tap
+ * @MALI_C55_PARAM_MESH_SHADING_CONFIG: Mesh shading tables configuration
+ * @MALI_C55_PARAM_MESH_SHADING_SELECTION: Mesh shading table selection
+ */
+enum mali_c55_param_block_type {
+	MALI_C55_PARAM_BLOCK_SENSOR_OFFS,
+	MALI_C55_PARAM_BLOCK_AEXP_HIST,
+	MALI_C55_PARAM_BLOCK_AEXP_IHIST,
+	MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS,
+	MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS,
+	MALI_C55_PARAM_BLOCK_DIGITAL_GAIN,
+	MALI_C55_PARAM_BLOCK_AWB_GAINS,
+	MALI_C55_PARAM_BLOCK_AWB_CONFIG,
+	MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP,
+	MALI_C55_PARAM_MESH_SHADING_CONFIG,
+	MALI_C55_PARAM_MESH_SHADING_SELECTION,
+};
+
+/**
+ * struct mali_c55_params_sensor_off_preshading - offset subtraction for each
+ *						  color channel
+ *
+ * Provides removal of the sensor black level from the sensor data. Separate
+ * offsets are provided for each of the four Bayer component color channels
+ * which are defaulted to R, Gr, Gb, B.
+ *
+ * header.type should be set to MALI_C55_PARAM_BLOCK_SENSOR_OFFS from
+ * :c:type:`mali_c55_param_block_type` for this block.
+ *
+ * @header: The Mali-C55 parameters block header
+ * @chan00: Offset for color channel 00 (default: R)
+ * @chan01: Offset for color channel 01 (default: Gr)
+ * @chan10: Offset for color channel 10 (default: Gb)
+ * @chan11: Offset for color channel 11 (default: B)
+ */
+struct mali_c55_params_sensor_off_preshading {
+	struct v4l2_isp_params_block_header header;
+	__u32 chan00;
+	__u32 chan01;
+	__u32 chan10;
+	__u32 chan11;
+};
+
+/**
+ * enum mali_c55_aexp_hist_tap_points - Tap points for the AEXP histogram
+ * @MALI_C55_AEXP_HIST_TAP_WB: After static white balance
+ * @MALI_C55_AEXP_HIST_TAP_FS: After WDR Frame Stitch
+ * @MALI_C55_AEXP_HIST_TAP_TPG: After the test pattern generator
+ */
+enum mali_c55_aexp_hist_tap_points {
+	MALI_C55_AEXP_HIST_TAP_WB = 0,
+	MALI_C55_AEXP_HIST_TAP_FS,
+	MALI_C55_AEXP_HIST_TAP_TPG,
+};
+
+/**
+ * enum mali_c55_aexp_skip_x - Horizontal pixel skipping
+ * @MALI_C55_AEXP_SKIP_X_EVERY_2ND: Collect every 2nd pixel horizontally
+ * @MALI_C55_AEXP_SKIP_X_EVERY_3RD: Collect every 3rd pixel horizontally
+ * @MALI_C55_AEXP_SKIP_X_EVERY_4TH: Collect every 4th pixel horizontally
+ * @MALI_C55_AEXP_SKIP_X_EVERY_5TH: Collect every 5th pixel horizontally
+ * @MALI_C55_AEXP_SKIP_X_EVERY_8TH: Collect every 8th pixel horizontally
+ * @MALI_C55_AEXP_SKIP_X_EVERY_9TH: Collect every 9th pixel horizontally
+ */
+enum mali_c55_aexp_skip_x {
+	MALI_C55_AEXP_SKIP_X_EVERY_2ND,
+	MALI_C55_AEXP_SKIP_X_EVERY_3RD,
+	MALI_C55_AEXP_SKIP_X_EVERY_4TH,
+	MALI_C55_AEXP_SKIP_X_EVERY_5TH,
+	MALI_C55_AEXP_SKIP_X_EVERY_8TH,
+	MALI_C55_AEXP_SKIP_X_EVERY_9TH
+};
+
+/**
+ * enum mali_c55_aexp_skip_y - Vertical pixel skipping
+ * @MALI_C55_AEXP_SKIP_Y_ALL: Collect every single pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_2ND: Collect every 2nd pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_3RD: Collect every 3rd pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_4TH: Collect every 4th pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_5TH: Collect every 5th pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_8TH: Collect every 8th pixel vertically
+ * @MALI_C55_AEXP_SKIP_Y_EVERY_9TH: Collect every 9th pixel vertically
+ */
+enum mali_c55_aexp_skip_y {
+	MALI_C55_AEXP_SKIP_Y_ALL,
+	MALI_C55_AEXP_SKIP_Y_EVERY_2ND,
+	MALI_C55_AEXP_SKIP_Y_EVERY_3RD,
+	MALI_C55_AEXP_SKIP_Y_EVERY_4TH,
+	MALI_C55_AEXP_SKIP_Y_EVERY_5TH,
+	MALI_C55_AEXP_SKIP_Y_EVERY_8TH,
+	MALI_C55_AEXP_SKIP_Y_EVERY_9TH
+};
+
+/**
+ * enum mali_c55_aexp_row_column_offset - Start from the first or second row or
+ *					  column
+ * @MALI_C55_AEXP_FIRST_ROW_OR_COL:	Start from the first row / column
+ * @MALI_C55_AEXP_SECOND_ROW_OR_COL:	Start from the second row / column
+ */
+enum mali_c55_aexp_row_column_offset {
+	MALI_C55_AEXP_FIRST_ROW_OR_COL = 1,	/* NOTE(review): struct mali_c55_params_aexp_hist docs describe offset_x/offset_y as 0 or 1 - confirm */
+	MALI_C55_AEXP_SECOND_ROW_OR_COL = 2,
+};
+
+/**
+ * enum mali_c55_aexp_hist_plane_mode - Mode for the AEXP Histograms
+ * @MALI_C55_AEXP_HIST_COMBINED: All color planes in one 1024-bin histogram
+ * @MALI_C55_AEXP_HIST_SEPARATE: Each color plane in one 256-bin histogram with a bin width of 16
+ * @MALI_C55_AEXP_HIST_FOCUS_00: Top left plane in the first bank, rest in second bank
+ * @MALI_C55_AEXP_HIST_FOCUS_01: Top right plane in the first bank, rest in second bank
+ * @MALI_C55_AEXP_HIST_FOCUS_10: Bottom left plane in the first bank, rest in second bank
+ * @MALI_C55_AEXP_HIST_FOCUS_11: Bottom right plane in the first bank, rest in second bank
+ *
+ * In the "focus" modes statistics are collected into two 512-bin histograms
+ * with a bin width of 8. One colour plane is in the first histogram with the
+ * remainder combined into the second. The four options represent which of the
+ * four positions in a bayer pattern are the focused plane.
+ */
+enum mali_c55_aexp_hist_plane_mode {
+	MALI_C55_AEXP_HIST_COMBINED = 0,
+	MALI_C55_AEXP_HIST_SEPARATE = 1,
+	MALI_C55_AEXP_HIST_FOCUS_00 = 4,
+	MALI_C55_AEXP_HIST_FOCUS_01 = 5,
+	MALI_C55_AEXP_HIST_FOCUS_10 = 6,
+	MALI_C55_AEXP_HIST_FOCUS_11 = 7,
+};
+
+/**
+ * struct mali_c55_params_aexp_hist - configuration for AEXP metering hists
+ *
+ * This struct allows users to configure the 1024-bin AEXP histograms. Broadly
+ * speaking the parameters allow you to mask particular regions of the image and
+ * to select different kinds of histogram.
+ *
+ * The skip_x, offset_x, skip_y and offset_y fields allow users to ignore or
+ * mask pixels in the frame by their position relative to the top left pixel.
+ * First, the skip_y, offset_x and offset_y fields define which of the pixels
+ * within each 2x2 region will be counted in the statistics.
+ *
+ * If skip_y == 0 then two pixels from each covered region will be counted. If
+ * both offset_x and offset_y are zero, then the two left-most pixels in each
+ * 2x2 pixel region will be counted. Setting offset_x = 1 will discount the top
+ * left pixel and count the top right pixel. Setting offset_y = 1 will discount
+ * the bottom left pixel and count the bottom right pixel.
+ *
+ * If skip_y != 0 then only a single pixel from each region covered by the
+ * pattern will be counted. In this case offset_x controls whether the pixel
+ * that's counted is in the left (if offset_x == 0) or right (if offset_x == 1)
+ * column and offset_y controls whether the pixel that's counted is in the top
+ * (if offset_y == 0) or bottom (if offset_y == 1) row.
+ *
+ * The skip_x and skip_y fields control how the 2x2 pixel region is repeated
+ * across the image data. The first instance of the region is always in the top
+ * left of the image data. The skip_x field controls how many pixels are ignored
+ * in the x direction before the pixel masking region is repeated. The skip_y
+ * field controls how many pixels are ignored in the y direction before the
+ * pixel masking region is repeated.
+ *
+ * These fields can be used to reduce the number of pixels counted for the
+ * statistics, but it's important to be careful to configure them correctly.
+ * Some combinations of values will result in colour components from the input
+ * data being ignored entirely, for example in the following configuration:
+ *
+ * skip_x = 0
+ * offset_x = 0
+ * skip_y = 0
+ * offset_y = 0
+ *
+ * Only the R and Gb components of RGGB data that was input would be collected.
+ * Similarly in the following configuration:
+ *
+ * skip_x = 0
+ * offset_x = 0
+ * skip_y = 1
+ * offset_y = 1
+ *
+ * Only the Gb component of RGGB data that was input would be collected. To
+ * correct things such that all 4 colour components were included it would be
+ * necessary to set the skip_x and skip_y fields in a way that resulted in all
+ * four colour components being collected:
+ *
+ * skip_x = 1
+ * offset_x = 0
+ * skip_y = 1
+ * offset_y = 1
+ *
+ * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AEXP_HIST or
+ * MALI_C55_PARAM_BLOCK_AEXP_IHIST from :c:type:`mali_c55_param_block_type`.
+ *
+ * @header:		The Mali-C55 parameters block header
+ * @skip_x:		Horizontal decimation. See enum mali_c55_aexp_skip_x
+ * @offset_x:		Skip the first column, or not. See enum mali_c55_aexp_row_column_offset
+ * @skip_y:		Vertical decimation. See enum mali_c55_aexp_skip_y
+ * @offset_y:		Skip the first row, or not. See enum mali_c55_aexp_row_column_offset
+ * @scale_bottom:	Scale pixels in bottom half of intensity range: 0=1x, 1=2x, 2=4x, 3=8x, 4=16x
+ * @scale_top:		Scale pixels in top half of intensity range: 0=1x, 1=2x, 2=4x, 3=8x, 4=16x
+ * @plane_mode:		Plane separation mode. See enum mali_c55_aexp_hist_plane_mode
+ * @tap_point:		Tap point for histogram from enum mali_c55_aexp_hist_tap_points.
+ *			This parameter is unused for the post-Iridix Histogram
+ */
+struct mali_c55_params_aexp_hist {
+	struct v4l2_isp_params_block_header header;
+	__u8 skip_x;
+	__u8 offset_x;
+	__u8 skip_y;
+	__u8 offset_y;
+	__u8 scale_bottom;
+	__u8 scale_top;
+	__u8 plane_mode;
+	__u8 tap_point;
+};
+
+/**
+ * struct mali_c55_params_aexp_weights - Array of weights for AEXP metering
+ *
+ * This struct allows users to configure the weighting for both of the 1024-bin
+ * AEXP histograms. The pixel data collected for each zone is multiplied by the
+ * corresponding weight from this array, which may be zero if the intention is
+ * to mask off the zone entirely.
+ *
+ * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS
+ * or MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS from :c:type:`mali_c55_param_block_type`.
+ *
+ * @header:		The Mali-C55 parameters block header
+ * @nodes_used_horiz:	Number of active zones horizontally [0..15]
+ * @nodes_used_vert:	Number of active zones vertically [0..15]
+ * @zone_weights:	Zone weighting. Index is row*col where 0,0 is the top
+ *			left zone continuing in raster order. Each zone can be
+ *			weighted in the range [0..15]. The number of rows and
+ *			columns is defined by @nodes_used_vert and
+ *			@nodes_used_horiz
+ */
+struct mali_c55_params_aexp_weights {
+	struct v4l2_isp_params_block_header header;
+	__u8 nodes_used_horiz;
+	__u8 nodes_used_vert;
+	__u8 zone_weights[MALI_C55_MAX_ZONES];
+};
+
+/**
+ * struct mali_c55_params_digital_gain - Digital gain value
+ *
+ * This struct carries a digital gain value to set in the ISP.
+ *
+ * header.type should be set to MALI_C55_PARAM_BLOCK_DIGITAL_GAIN from
+ * :c:type:`mali_c55_param_block_type` for this block.
+ *
+ * @header:	The Mali-C55 parameters block header
+ * @gain:	The digital gain value to apply, in Q5.8 format.
+ */
+struct mali_c55_params_digital_gain {
+	struct v4l2_isp_params_block_header header;
+	__u16 gain;
+};
+
+/**
+ * enum mali_c55_awb_stats_mode - Statistics mode for AWB
+ * @MALI_C55_AWB_MODE_GRBR: Statistics collected as Green/Red and Blue/Red ratios
+ * @MALI_C55_AWB_MODE_RGBG: Statistics collected as Red/Green and Blue/Green ratios
+ */
+enum mali_c55_awb_stats_mode {
+	MALI_C55_AWB_MODE_GRBR = 0,
+	MALI_C55_AWB_MODE_RGBG,
+};
+
+/**
+ * struct mali_c55_params_awb_gains - Gain settings for auto white balance
+ *
+ * This struct allows users to configure the gains for auto-white balance. There
+ * are four gain settings corresponding to each colour channel in the bayer
+ * domain. Although named generically, the association between the gain applied
+ * and the colour channel is done automatically within the ISP depending on the
+ * input format, and so the following mapping always holds true::
+ *
+ *	gain00 = R
+ *	gain01 = Gr
+ *	gain10 = Gb
+ *	gain11 = B
+ *
+ * All of the gains are stored in Q4.8 format.
+ *
+ * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AWB_GAINS or
+ * MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP from :c:type:`mali_c55_param_block_type`.
+ *
+ * @header:	The Mali-C55 parameters block header
+ * @gain00:	Multiplier for colour channel 00
+ * @gain01:	Multiplier for colour channel 01
+ * @gain10:	Multiplier for colour channel 10
+ * @gain11:	Multiplier for colour channel 11
+ */
+struct mali_c55_params_awb_gains {
+	struct v4l2_isp_params_block_header header;
+	__u16 gain00;
+	__u16 gain01;
+	__u16 gain10;
+	__u16 gain11;
+};
+
+/**
+ * enum mali_c55_params_awb_tap_points - Tap points for the AWB statistics
+ * @MALI_C55_AWB_STATS_TAP_PF: Immediately after the Purple Fringe block
+ * @MALI_C55_AWB_STATS_TAP_CNR: Immediately after the CNR block
+ */
+enum mali_c55_params_awb_tap_points {
+	MALI_C55_AWB_STATS_TAP_PF = 0,
+	MALI_C55_AWB_STATS_TAP_CNR,
+};
+
+/**
+ * struct mali_c55_params_awb_config - Stats settings for auto-white balance
+ *
+ * This struct allows the configuration of the statistics generated for auto
+ * white balance. Pixel intensity limits can be set to exclude overly bright or
+ * dark regions of an image from the statistics entirely. Colour ratio minima
+ * and maxima can be set to discount pixels whose ratios fall outside the
+ * defined boundaries; there are two sets of registers to do this - the
+ * "min/max" ratios which bound a region and the "high/low" ratios which further
+ * trim the upper and lower ratios. For example with the boundaries configured
+ * as follows, only pixels whose colour ratios fall into the region marked "A"
+ * would be counted::
+ *
+ *	                                                          cr_high
+ *	    2.0 |                                                   |
+ *	        |               cb_max --> _________________________v_____
+ *	    1.8 |                         |                         \    |
+ *	        |                         |                          \   |
+ *	    1.6 |                         |                           \  |
+ *	        |                         |                            \ |
+ *	 c  1.4 |               cb_low -->|\              A             \|<--  cb_high
+ *	 b      |                         | \                            |
+ *	    1.2 |                         |  \                           |
+ *	 r      |                         |   \                          |
+ *	 a  1.0 |              cb_min --> |____\_________________________|
+ *	 t      |                         ^    ^                         ^
+ *	 i  0.8 |                         |    |                         |
+ *	 o      |                      cr_min  |                       cr_max
+ *	 s  0.6 |                              |
+ *	        |                             cr_low
+ *	    0.4 |
+ *	        |
+ *	    0.2 |
+ *	        |
+ *	    0.0 |_______________________________________________________________
+ *	        0.0   0.2   0.4   0.6   0.8   1.0   1.2   1.4   1.6   1.8   2.0
+ *	                                   cr ratios
+ *
+ * header.type should be set to MALI_C55_PARAM_BLOCK_AWB_CONFIG from
+ * :c:type:`mali_c55_param_block_type` for this block.
+ *
+ * @header:		The Mali-C55 parameters block header
+ * @tap_point:		The tap point from enum mali_c55_params_awb_tap_points
+ * @stats_mode:		AWB statistics collection mode, see :c:type:`mali_c55_awb_stats_mode`
+ * @white_level:	Upper pixel intensity (I.E. raw pixel values) limit
+ * @black_level:	Lower pixel intensity (I.E. raw pixel values) limit
+ * @cr_max:		Maximum R/G ratio (Q4.8 format)
+ * @cr_min:		Minimum R/G ratio (Q4.8 format)
+ * @cb_max:		Maximum B/G ratio (Q4.8 format)
+ * @cb_min:		Minimum B/G ratio (Q4.8 format)
+ * @nodes_used_horiz:	Number of active zones horizontally [0..15]
+ * @nodes_used_vert:	Number of active zones vertically [0..15]
+ * @cr_high:		R/G ratio trim high (Q4.8 format)
+ * @cr_low:		R/G ratio trim low (Q4.8 format)
+ * @cb_high:		B/G ratio trim high (Q4.8 format)
+ * @cb_low:		B/G ratio trim low (Q4.8 format)
+ */
+struct mali_c55_params_awb_config {
+	struct v4l2_isp_params_block_header header;
+	__u8 tap_point;
+	__u8 stats_mode;
+	__u16 white_level;
+	__u16 black_level;
+	__u16 cr_max;
+	__u16 cr_min;
+	__u16 cb_max;
+	__u16 cb_min;
+	__u8 nodes_used_horiz;
+	__u8 nodes_used_vert;
+	__u16 cr_high;
+	__u16 cr_low;
+	__u16 cb_high;
+	__u16 cb_low;
+};
+
+#define MALI_C55_NUM_MESH_SHADING_ELEMENTS 3072
+
+/**
+ * struct mali_c55_params_mesh_shading_config - Mesh shading configuration
+ *
+ * The mesh shading correction module allows programming a separate table of
+ * either 16x16 or 32x32 node coefficients for 3 different light sources. The
+ * final correction coefficients applied are computed by blending the
+ * coefficients from two tables together.
+ *
+ * A page of 1024 32-bit integers is associated to each colour channel, with
+ * pages stored consecutively in memory. Each 32-bit integer packs 3 8-bit
+ * correction coefficients for a single node, one for each of the three light
+ * sources. The 8 most significant bits are unused. The following table
+ * describes the layout::
+ *
+ *	+----------- Page (Colour Plane) 0 -------------+
+ *	| @mesh[i]  | Mesh Point | Bits  | Light Source |
+ *	+-----------+------------+-------+--------------+
+ *	|         0 |        0,0 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|         1 |        0,1 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|       ... |        ... | ...   | ...          |
+ *	+-----------+------------+-------+--------------+
+ *	|      1023 |      31,31 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+----------- Page (Colour Plane) 1 -------------+
+ *	| @mesh[i]  | Mesh Point | Bits  | Light Source |
+ *	+-----------+------------+-------+--------------+
+ *	|      1024 |        0,0 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|      1025 |        0,1 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|       ... |        ... | ...   | ...          |
+ *	+-----------+------------+-------+--------------+
+ *	|      2047 |      31,31 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+----------- Page (Colour Plane) 2 -------------+
+ *	| @mesh[i]  | Mesh Point | Bits  | Light Source |
+ *	+-----------+------------+-------+--------------+
+ *	|      2048 |        0,0 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|      2049 |        0,1 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *	|       ... |        ... | ...   | ...          |
+ *	+-----------+------------+-------+--------------+
+ *	|      3071 |      31,31 | 16-23 | LS2          |
+ *	|           |            | 08-15 | LS1          |
+ *	|           |            | 00-07 | LS0          |
+ *	+-----------+------------+-------+--------------+
+ *
+ * The @mesh_scale member determines the precision and minimum and maximum gain.
+ * For example if @mesh_scale is 0 and therefore selects 0 - 2x gain, a value of
+ * 0 in a coefficient means 0.0 gain, a value of 128 means 1.0 gain and 255
+ * means 2.0 gain.
+ *
+ * header.type should be set to MALI_C55_PARAM_MESH_SHADING_CONFIG from
+ * :c:type:`mali_c55_param_block_type` for this block.
+ *
+ * @header:		The Mali-C55 parameters block header
+ * @mesh_show:		Output the mesh data rather than image data
+ * @mesh_scale:		Set the precision and maximum gain range of mesh shading
+ *				- 0 = 0-2x gain
+ *				- 1 = 0-4x gain
+ *				- 2 = 0-8x gain
+ *				- 3 = 0-16x gain
+ *				- 4 = 1-2x gain
+ *				- 5 = 1-3x gain
+ *				- 6 = 1-5x gain
+ *				- 7 = 1-9x gain
+ * @mesh_page_r:	Mesh page select for red colour plane [0..2]
+ * @mesh_page_g:	Mesh page select for green colour plane [0..2]
+ * @mesh_page_b:	Mesh page select for blue colour plane [0..2]
+ * @mesh_width:		Number of horizontal nodes minus 1 [15,31]
+ * @mesh_height:	Number of vertical nodes minus 1 [15,31]
+ * @mesh:		Mesh shading correction tables
+ */
+struct mali_c55_params_mesh_shading_config {
+	struct v4l2_isp_params_block_header header;
+	__u8 mesh_show;
+	__u8 mesh_scale;
+	__u8 mesh_page_r;
+	__u8 mesh_page_g;
+	__u8 mesh_page_b;
+	__u8 mesh_width;
+	__u8 mesh_height;
+	__u32 mesh[MALI_C55_NUM_MESH_SHADING_ELEMENTS];
+};
+
+/** enum mali_c55_params_mesh_alpha_bank - Mesh shading table bank selection
+ * @MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS1 - Select Light Sources 0 and 1
+ * @MALI_C55_MESH_ALPHA_BANK_LS1_AND_LS2 - Select Light Sources 1 and 2
+ * @MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS2 - Select Light Sources 0 and 2
+ */
+enum mali_c55_params_mesh_alpha_bank {
+	MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS1 = 0,
+	MALI_C55_MESH_ALPHA_BANK_LS1_AND_LS2 = 1,
+	MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS2 = 4
+};
+
+/**
+ * struct mali_c55_params_mesh_shading_selection - Mesh table selection
+ *
+ * The module computes the final correction coefficients by blending the ones
+ * from two light source tables, which are selected (independently for each
+ * colour channel) by the @mesh_alpha_bank_r/g/b fields.
+ *
+ * The final blended coefficients for each node are calculated using the
+ * following equation:
+ *
+ *     Final coefficient = (a * LS\ :sub:`b`\ + (256 - a) * LS\ :sub:`a`\) / 256
+ *
+ * Where a is the @mesh_alpha_r/g/b value, and LS\ :sub:`a`\ and LS\ :sub:`b`\
+ * are the node coefficients for the two tables selected by the
+ * @mesh_alpha_bank_r/g/b value.
+ *
+ * The scale of the applied correction may also be controlled by tuning the
+ * @mesh_strength member. This is a modifier to the final coefficients which can
+ * be used to globally reduce the gains applied.
+ *
+ * header.type should be set to MALI_C55_PARAM_MESH_SHADING_SELECTION from
+ * :c:type:`mali_c55_param_block_type` for this block.
+ *
+ * @header:		The Mali-C55 parameters block header
+ * @mesh_alpha_bank_r:	Red mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`)
+ * @mesh_alpha_bank_g:	Green mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`)
+ * @mesh_alpha_bank_b:	Blue mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`)
+ * @mesh_alpha_r:	Blend coefficient for R [0..255]
+ * @mesh_alpha_g:	Blend coefficient for G [0..255]
+ * @mesh_alpha_b:	Blend coefficient for B [0..255]
+ * @mesh_strength:	Mesh strength in Q4.12 format [0..4096]
+ */
+struct mali_c55_params_mesh_shading_selection {
+	struct v4l2_isp_params_block_header header;
+	__u8 mesh_alpha_bank_r;
+	__u8 mesh_alpha_bank_g;
+	__u8 mesh_alpha_bank_b;
+	__u8 mesh_alpha_r;
+	__u8 mesh_alpha_g;
+	__u8 mesh_alpha_b;
+	__u16 mesh_strength;
+};
+
+/**
+ * define MALI_C55_PARAMS_MAX_SIZE - Maximum size of all Mali C55 Parameters
+ *
+ * Though the parameters for the Mali-C55 are passed as optional blocks, the
+ * driver still needs to know the absolute maximum size so that it can allocate
+ * a buffer sized appropriately to accommodate userspace attempting to set all
+ * possible parameters in a single frame.
+ *
+ * Some structs are in this list multiple times. Where that's the case, it just
+ * reflects the fact that the same struct can be used with multiple different
+ * header types from :c:type:`mali_c55_param_block_type`.
+ */
+#define MALI_C55_PARAMS_MAX_SIZE				\
+	(sizeof(struct mali_c55_params_sensor_off_preshading) +	\
+	sizeof(struct mali_c55_params_aexp_hist) +		\
+	sizeof(struct mali_c55_params_aexp_weights) +		\
+	sizeof(struct mali_c55_params_aexp_hist) +		\
+	sizeof(struct mali_c55_params_aexp_weights) +		\
+	sizeof(struct mali_c55_params_digital_gain) +		\
+	sizeof(struct mali_c55_params_awb_gains) +		\
+	sizeof(struct mali_c55_params_awb_config) +		\
+	sizeof(struct mali_c55_params_awb_gains) +		\
+	sizeof(struct mali_c55_params_mesh_shading_config) +	\
+	sizeof(struct mali_c55_params_mesh_shading_selection))
+
 #endif /* __UAPI_MALI_C55_CONFIG_H */
-- 
cgit v1.2.3


From d619dd9a3d401063cc6d31cada98c99db449d381 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Fri, 14 Nov 2025 12:02:11 +0100
Subject: media: v4l2-isp: Rename block_info to block_type_info

The v4l2_isp_params_block_info structure contains validation information
that applies to a block -type- and not only to a specific ISP block
implementation.

Clarify this by renaming v4l2_isp_params_block_info to
v4l2_isp_params_block_type_info and updating the documentation and the
users of v4l2-isp accordingly.

Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 .../media/platform/amlogic/c3/isp/c3-isp-params.c  |  9 ++++----
 .../media/platform/arm/mali-c55/mali-c55-params.c  |  7 +++---
 .../media/platform/rockchip/rkisp1/rkisp1-params.c |  9 ++++----
 drivers/media/v4l2-core/v4l2-isp.c                 | 16 ++++++-------
 include/media/v4l2-isp.h                           | 26 +++++++++++-----------
 5 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/media/platform/amlogic/c3/isp/c3-isp-params.c b/drivers/media/platform/amlogic/c3/isp/c3-isp-params.c
index c2339d6dc107..6f9ca7a7dd88 100644
--- a/drivers/media/platform/amlogic/c3/isp/c3-isp-params.c
+++ b/drivers/media/platform/amlogic/c3/isp/c3-isp-params.c
@@ -536,7 +536,8 @@ static const c3_isp_block_handler c3_isp_params_handlers[] = {
 		.size = sizeof(struct c3_isp_params_ ## data), \
 	}
 
-static const struct v4l2_isp_params_block_info c3_isp_params_blocks_info[] = {
+static const struct v4l2_isp_params_block_type_info
+c3_isp_params_block_types_info[] = {
 	C3_ISP_PARAMS_BLOCK_INFO(AWB_GAINS, awb_gains),
 	C3_ISP_PARAMS_BLOCK_INFO(AWB_CONFIG, awb_config),
 	C3_ISP_PARAMS_BLOCK_INFO(AE_CONFIG, ae_config),
@@ -548,7 +549,7 @@ static const struct v4l2_isp_params_block_info c3_isp_params_blocks_info[] = {
 };
 
 static_assert(ARRAY_SIZE(c3_isp_params_handlers) ==
-	      ARRAY_SIZE(c3_isp_params_blocks_info));
+	      ARRAY_SIZE(c3_isp_params_block_types_info));
 
 static void c3_isp_params_cfg_blocks(struct c3_isp_params *params)
 {
@@ -781,8 +782,8 @@ static int c3_isp_params_vb2_buf_prepare(struct vb2_buffer *vb)
 
 	return v4l2_isp_params_validate_buffer(params->isp->dev, vb,
 					(struct v4l2_isp_params_buffer *)cfg,
-					c3_isp_params_blocks_info,
-					ARRAY_SIZE(c3_isp_params_blocks_info));
+					c3_isp_params_block_types_info,
+					ARRAY_SIZE(c3_isp_params_block_types_info));
 }
 
 static int c3_isp_params_vb2_buf_init(struct vb2_buffer *vb)
diff --git a/drivers/media/platform/arm/mali-c55/mali-c55-params.c b/drivers/media/platform/arm/mali-c55/mali-c55-params.c
index ce220a50d253..c643cd013e46 100644
--- a/drivers/media/platform/arm/mali-c55/mali-c55-params.c
+++ b/drivers/media/platform/arm/mali-c55/mali-c55-params.c
@@ -428,7 +428,8 @@ static const mali_c55_params_handler mali_c55_params_handlers[] = {
 	[MALI_C55_PARAM_MESH_SHADING_SELECTION] = &mali_c55_params_lsc_selection,
 };
 
-static const struct v4l2_isp_params_block_info mali_c55_params_blocks_info[] = {
+static const struct v4l2_isp_params_block_type_info
+mali_c55_params_block_types_info[] = {
 	[MALI_C55_PARAM_BLOCK_SENSOR_OFFS] = {
 		.size = sizeof(struct mali_c55_params_sensor_off_preshading),
 	},
@@ -599,8 +600,8 @@ static int mali_c55_params_buf_prepare(struct vb2_buffer *vb)
 	memcpy(buf->config, config, v4l2_isp_params_buffer_size(MALI_C55_PARAMS_MAX_SIZE));
 
 	return v4l2_isp_params_validate_buffer(mali_c55->dev, vb, buf->config,
-					       mali_c55_params_blocks_info,
-					       ARRAY_SIZE(mali_c55_params_blocks_info));
+					       mali_c55_params_block_types_info,
+					       ARRAY_SIZE(mali_c55_params_block_types_info));
 }
 
 static void mali_c55_params_buf_queue(struct vb2_buffer *vb)
diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
index 2dde0c62c8e6..c9f88635224c 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
@@ -2198,7 +2198,8 @@ static const struct rkisp1_ext_params_handler {
 		.size = sizeof(struct rkisp1_ext_params_ ## data ## _config), \
 	}
 
-static const struct v4l2_isp_params_block_info rkisp1_ext_params_blocks_info[] = {
+static const struct v4l2_isp_params_block_type_info
+rkisp1_ext_params_block_types_info[] = {
 	RKISP1_PARAMS_BLOCK_INFO(BLS, bls),
 	RKISP1_PARAMS_BLOCK_INFO(DPCC, dpcc),
 	RKISP1_PARAMS_BLOCK_INFO(SDG, sdg),
@@ -2223,7 +2224,7 @@ static const struct v4l2_isp_params_block_info rkisp1_ext_params_blocks_info[] =
 };
 
 static_assert(ARRAY_SIZE(rkisp1_ext_params_handlers) ==
-	      ARRAY_SIZE(rkisp1_ext_params_blocks_info));
+	      ARRAY_SIZE(rkisp1_ext_params_block_types_info));
 
 static void rkisp1_ext_params_config(struct rkisp1_params *params,
 				     struct rkisp1_ext_params_cfg *cfg,
@@ -2677,8 +2678,8 @@ static int rkisp1_params_prepare_ext_params(struct rkisp1_params *params,
 
 	return v4l2_isp_params_validate_buffer(params->rkisp1->dev, vb,
 				(struct v4l2_isp_params_buffer *)cfg,
-				rkisp1_ext_params_blocks_info,
-				ARRAY_SIZE(rkisp1_ext_params_blocks_info));
+				rkisp1_ext_params_block_types_info,
+				ARRAY_SIZE(rkisp1_ext_params_block_types_info));
 }
 
 static int rkisp1_params_vb2_buf_prepare(struct vb2_buffer *vb)
diff --git a/drivers/media/v4l2-core/v4l2-isp.c b/drivers/media/v4l2-core/v4l2-isp.c
index 756d2b4996cc..29831f7032e9 100644
--- a/drivers/media/v4l2-core/v4l2-isp.c
+++ b/drivers/media/v4l2-core/v4l2-isp.c
@@ -38,8 +38,8 @@ EXPORT_SYMBOL_GPL(v4l2_isp_params_validate_buffer_size);
 
 int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
 				    const struct v4l2_isp_params_buffer *buffer,
-				    const struct v4l2_isp_params_block_info *info,
-				    size_t num_blocks)
+				    const struct v4l2_isp_params_block_type_info *type_info,
+				    size_t num_block_types)
 {
 	size_t header_size = offsetof(struct v4l2_isp_params_buffer, data);
 	size_t payload_size = vb2_get_plane_payload(vb, 0);
@@ -71,13 +71,13 @@ int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
 	/* Walk the list of ISP configuration blocks and validate them. */
 	buffer_size = buffer->data_size;
 	while (buffer_size >= sizeof(struct v4l2_isp_params_block_header)) {
-		const struct v4l2_isp_params_block_info *block_info;
+		const struct v4l2_isp_params_block_type_info *info;
 		const struct v4l2_isp_params_block_header *block;
 
 		block = (const struct v4l2_isp_params_block_header *)
 			(buffer->data + block_offset);
 
-		if (block->type >= num_blocks) {
+		if (block->type >= num_block_types) {
 			dev_dbg(dev,
 				"Invalid block type %u at offset %zu\n",
 				block->type, block_offset);
@@ -100,17 +100,17 @@ int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
 		}
 
 		/*
-		 * Match the block reported size against the info provided
+		 * Match the block reported size against the type info provided
 		 * one, but allow the block to only contain the header in
 		 * case it is going to be disabled.
 		 */
-		block_info = &info[block->type];
-		if (block->size != block_info->size &&
+		info = &type_info[block->type];
+		if (block->size != info->size &&
 		    (!(block->flags & V4L2_ISP_PARAMS_FL_BLOCK_DISABLE) ||
 		    block->size != sizeof(*block))) {
 			dev_dbg(dev,
 				"Invalid block size %u (expected %zu) at offset %zu\n",
-				block->size, block_info->size, block_offset);
+				block->size, info->size, block_offset);
 			return -EINVAL;
 		}
 
diff --git a/include/media/v4l2-isp.h b/include/media/v4l2-isp.h
index 8b4695663699..f3a6d0edcb24 100644
--- a/include/media/v4l2-isp.h
+++ b/include/media/v4l2-isp.h
@@ -49,18 +49,18 @@ int v4l2_isp_params_validate_buffer_size(struct device *dev,
 					 size_t max_size);
 
 /**
- * struct v4l2_isp_params_block_info - V4L2 ISP per-block info
- * @size: the block expected size
+ * struct v4l2_isp_params_block_type_info - V4L2 ISP per-block-type info
+ * @size: the block type expected size
  *
- * The v4l2_isp_params_block_info collects information of the ISP configuration
- * blocks for validation purposes. It currently only contains the expected
- * block size.
+ * The v4l2_isp_params_block_type_info collects information of the ISP
+ * configuration block types for validation purposes. It currently only contains
+ * the expected block type size.
  *
- * Drivers shall prepare a list of block info, indexed by block type, one for
- * each supported ISP block and correctly populate them with the expected block
- * size.
+ * Drivers shall prepare a list of block type info, indexed by block type, one
+ * for each supported ISP block type and correctly populate them with the
+ * expected block type size.
  */
-struct v4l2_isp_params_block_info {
+struct v4l2_isp_params_block_type_info {
 	size_t size;
 };
 
@@ -69,8 +69,8 @@ struct v4l2_isp_params_block_info {
  * @dev: the driver's device pointer
  * @vb: the videobuf2 buffer
  * @buffer: the V4L2 ISP parameters buffer
- * @info: the list of per-block validation info
- * @num_blocks: the number of blocks
+ * @type_info: the array of per-block-type validation info
+ * @num_block_types: the number of block types in the type_info array
  *
  * This function completes the validation of a V4L2 ISP parameters buffer,
  * verifying each configuration block correctness before the driver can use
@@ -85,7 +85,7 @@ struct v4l2_isp_params_block_info {
  */
 int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb,
 				    const struct v4l2_isp_params_buffer *buffer,
-				    const struct v4l2_isp_params_block_info *info,
-				    size_t num_blocks);
+				    const struct v4l2_isp_params_block_type_info *type_info,
+				    size_t num_block_types);
 
 #endif /* _V4L2_ISP_H_ */
-- 
cgit v1.2.3


From c42ba5a87bdccbca11403b7ca8bad1a57b833732 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 10 Nov 2025 10:38:52 +0100
Subject: futex: Store time as ktime_t in restart block
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The futex core uses ktime_t to represent times, use that also for the
restart block.

This allows the simplification of the accessors.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-2-5d39cc93df4f@linutronix.de
---
 include/linux/restart_block.h | 2 +-
 kernel/futex/waitwake.c       | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 7e50bbc94e47..4f9316e7590d 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -32,7 +32,7 @@ struct restart_block {
 			u32 val;
 			u32 flags;
 			u32 bitset;
-			u64 time;
+			ktime_t time;
 			u32 __user *uaddr2;
 		} futex;
 		/* For nanosleep */
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index e2bbe5509ec2..1c2dd03f11ec 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = restart->futex.uaddr;
-	ktime_t t, *tp = NULL;
+	ktime_t *tp = NULL;
+
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT)
+		tp = &restart->futex.time;
 
-	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-		t = restart->futex.time;
-		tp = &t;
-	}
 	restart->fn = do_no_restart_syscall;
 
 	return (long)futex_wait(uaddr, restart->futex.flags,
-- 
cgit v1.2.3


From 4702f4eceb639b6af199151e352e570943619d98 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 10 Nov 2025 10:38:53 +0100
Subject: hrtimer: Store time as ktime_t in restart block
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hrtimer core uses ktime_t to represent times, use that also for the
restart block. CPU timers internally use nanoseconds instead of ktime_t
but use the same restart block, so use the correct accessors for those.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-3-5d39cc93df4f@linutronix.de
---
 include/linux/restart_block.h  | 2 +-
 kernel/time/hrtimer.c          | 4 ++--
 kernel/time/posix-cpu-timers.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 7e50bbc94e47..36ddfa1ec301 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -43,7 +43,7 @@ struct restart_block {
 				struct __kernel_timespec __user *rmtp;
 				struct old_timespec32 __user *compat_rmtp;
 			};
-			u64 expires;
+			ktime_t expires;
 		} nanosleep;
 		/* For poll */
 		struct {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 7e7b2b471bae..9c77e5c72556 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 	int ret;
 
 	hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
-	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+	hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
 	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
@@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
 
 	restart = &current->restart_block;
 	restart->nanosleep.clockid = t.timer.base->clockid;
-	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+	restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
 	set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
 	destroy_hrtimer_on_stack(&t.timer);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2e5b89d7d866..0de2bb7cbec0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * Report back to the user the time still remaining.
 		 */
 		restart = &current->restart_block;
-		restart->nanosleep.expires = expires;
+		restart->nanosleep.expires = ns_to_ktime(expires);
 		if (restart->nanosleep.type != TT_NONE)
 			error = nanosleep_copyout(restart, &it.it_value);
 	}
@@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 	clockid_t which_clock = restart_block->nanosleep.clockid;
 	struct timespec64 t;
 
-	t = ns_to_timespec64(restart_block->nanosleep.expires);
+	t = ktime_to_timespec64(restart_block->nanosleep.expires);
 
 	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
 }
-- 
cgit v1.2.3


From 0ca04993dac9b0d21ffbfd22bf54cc43ec2c49f2 Mon Sep 17 00:00:00 2001
From: "Mario Limonciello (AMD)" <superm1@kernel.org>
Date: Wed, 12 Nov 2025 16:40:23 -0600
Subject: PM: Introduce new PMSG_POWEROFF event

PMSG_POWEROFF will be used for the PM core to allow differentiating between
a hibernation or shutdown sequence when re-using callbacks for common code.

Hibernation is started by writing a hibernation method (such as 'platform',
'shutdown', or 'reboot') to use into /sys/power/disk and writing 'disk' to
/sys/power/state.

Shutdown is initiated with the reboot() syscall with arguments on whether
to halt the system or power it off.

Tested-by: Eric Naim <dnaim@cachyos.org>
Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
Link: https://patch.msgid.link/20251112224025.2051702-2-superm1@kernel.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c    | 5 +++++
 include/linux/pm.h           | 3 +++
 include/trace/events/power.h | 3 ++-
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 7a8807ec9a5d..38fc8a978b88 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -96,6 +96,8 @@ static const char *pm_verb(int event)
 		return "restore";
 	case PM_EVENT_RECOVER:
 		return "recover";
+	case PM_EVENT_POWEROFF:
+		return "poweroff";
 	default:
 		return "(unknown PM event)";
 	}
@@ -368,6 +370,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state)
 	case PM_EVENT_FREEZE:
 	case PM_EVENT_QUIESCE:
 		return ops->freeze;
+	case PM_EVENT_POWEROFF:
 	case PM_EVENT_HIBERNATE:
 		return ops->poweroff;
 	case PM_EVENT_THAW:
@@ -402,6 +405,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops,
 	case PM_EVENT_FREEZE:
 	case PM_EVENT_QUIESCE:
 		return ops->freeze_late;
+	case PM_EVENT_POWEROFF:
 	case PM_EVENT_HIBERNATE:
 		return ops->poweroff_late;
 	case PM_EVENT_THAW:
@@ -436,6 +440,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat
 	case PM_EVENT_FREEZE:
 	case PM_EVENT_QUIESCE:
 		return ops->freeze_noirq;
+	case PM_EVENT_POWEROFF:
 	case PM_EVENT_HIBERNATE:
 		return ops->poweroff_noirq;
 	case PM_EVENT_THAW:
diff --git a/include/linux/pm.h b/include/linux/pm.h
index a72e42eec130..7f69f739f613 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -508,6 +508,7 @@ const struct dev_pm_ops name = { \
  * RECOVER	Creation of a hibernation image or restoration of the main
  *		memory contents from a hibernation image has failed, call
  *		->thaw() and ->complete() for all devices.
+ * POWEROFF	System will power off, call ->poweroff() for all devices.
  *
  * The following PM_EVENT_ messages are defined for internal use by
  * kernel subsystems.  They are never issued by the PM core.
@@ -538,6 +539,7 @@ const struct dev_pm_ops name = { \
 #define PM_EVENT_USER		0x0100
 #define PM_EVENT_REMOTE		0x0200
 #define PM_EVENT_AUTO		0x0400
+#define PM_EVENT_POWEROFF	0x0800
 
 #define PM_EVENT_SLEEP		(PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE)
 #define PM_EVENT_USER_SUSPEND	(PM_EVENT_USER | PM_EVENT_SUSPEND)
@@ -552,6 +554,7 @@ const struct dev_pm_ops name = { \
 #define PMSG_QUIESCE	((struct pm_message){ .event = PM_EVENT_QUIESCE, })
 #define PMSG_SUSPEND	((struct pm_message){ .event = PM_EVENT_SUSPEND, })
 #define PMSG_HIBERNATE	((struct pm_message){ .event = PM_EVENT_HIBERNATE, })
+#define PMSG_POWEROFF	((struct pm_message){ .event = PM_EVENT_POWEROFF, })
 #define PMSG_RESUME	((struct pm_message){ .event = PM_EVENT_RESUME, })
 #define PMSG_THAW	((struct pm_message){ .event = PM_EVENT_THAW, })
 #define PMSG_RESTORE	((struct pm_message){ .event = PM_EVENT_RESTORE, })
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 82904291c2b8..370f8df2fdb4 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample,
 		{ PM_EVENT_HIBERNATE, "hibernate" }, \
 		{ PM_EVENT_THAW, "thaw" }, \
 		{ PM_EVENT_RESTORE, "restore" }, \
-		{ PM_EVENT_RECOVER, "recover" })
+		{ PM_EVENT_RECOVER, "recover" }, \
+		{ PM_EVENT_POWEROFF, "poweroff" })
 
 DEFINE_EVENT(cpu, cpu_frequency,
 
-- 
cgit v1.2.3


From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 27 Oct 2025 14:33:49 -0500
Subject: KVM: SEV: Consolidate the SEV policy bits in a single header file

Consolidate SEV policy bit definitions into a single file. Use
include/linux/psp-sev.h to hold the definitions and remove the current
definitions from the arch/x86/kvm/svm/sev.c and arch/x86/kvm/svm/svm.h
files.

No functional change intended.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/sev.c  | 16 ++++------------
 arch/x86/kvm/svm/svm.h  |  3 ---
 include/linux/psp-sev.h | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0835c664fbfd..f04589ae76bb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -65,15 +65,7 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
 #define AP_RESET_HOLD_NAE_EVENT		1
 #define AP_RESET_HOLD_MSR_PROTO		2
 
-/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
-#define SNP_POLICY_MASK_API_MINOR	GENMASK_ULL(7, 0)
-#define SNP_POLICY_MASK_API_MAJOR	GENMASK_ULL(15, 8)
-#define SNP_POLICY_MASK_SMT		BIT_ULL(16)
-#define SNP_POLICY_MASK_RSVD_MBO	BIT_ULL(17)
-#define SNP_POLICY_MASK_DEBUG		BIT_ULL(19)
-#define SNP_POLICY_MASK_SINGLE_SOCKET	BIT_ULL(20)
-
-#define SNP_POLICY_MASK_VALID		(SNP_POLICY_MASK_API_MINOR	| \
+#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR	| \
 					 SNP_POLICY_MASK_API_MAJOR	| \
 					 SNP_POLICY_MASK_SMT		| \
 					 SNP_POLICY_MASK_RSVD_MBO	| \
@@ -2207,7 +2199,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (params.flags)
 		return -EINVAL;
 
-	if (params.policy & ~SNP_POLICY_MASK_VALID)
+	if (params.policy & ~KVM_SNP_POLICY_MASK_VALID)
 		return -EINVAL;
 
 	/* Check for policy bits that must be set */
@@ -5085,10 +5077,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
 
 	/* Check if the SEV policy allows debugging */
 	if (sev_snp_guest(vcpu->kvm)) {
-		if (!(sev->policy & SNP_POLICY_DEBUG))
+		if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
 			return NULL;
 	} else {
-		if (sev->policy & SEV_POLICY_NODBG)
+		if (sev->policy & SEV_POLICY_MASK_NODBG)
 			return NULL;
 	}
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6765a5e433ce..a9f6c1ece63d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -117,9 +117,6 @@ struct kvm_sev_info {
 	cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
 };
 
-#define SEV_POLICY_NODBG	BIT_ULL(0)
-#define SNP_POLICY_DEBUG	BIT_ULL(19)
-
 struct kvm_svm {
 	struct kvm kvm;
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index e0dbcb4b4fd9..27c92543bf38 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -14,6 +14,25 @@
 
 #include <uapi/linux/psp-sev.h>
 
+/* As defined by SEV API, under "Guest Policy". */
+#define SEV_POLICY_MASK_NODBG			BIT(0)
+#define SEV_POLICY_MASK_NOKS			BIT(1)
+#define SEV_POLICY_MASK_ES			BIT(2)
+#define SEV_POLICY_MASK_NOSEND			BIT(3)
+#define SEV_POLICY_MASK_DOMAIN			BIT(4)
+#define SEV_POLICY_MASK_SEV			BIT(5)
+#define SEV_POLICY_MASK_API_MAJOR		GENMASK(23, 16)
+#define SEV_POLICY_MASK_API_MINOR		GENMASK(31, 24)
+
+/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
+#define SNP_POLICY_MASK_API_MINOR		GENMASK_ULL(7, 0)
+#define SNP_POLICY_MASK_API_MAJOR		GENMASK_ULL(15, 8)
+#define SNP_POLICY_MASK_SMT			BIT_ULL(16)
+#define SNP_POLICY_MASK_RSVD_MBO		BIT_ULL(17)
+#define SNP_POLICY_MASK_MIGRATE_MA		BIT_ULL(18)
+#define SNP_POLICY_MASK_DEBUG			BIT_ULL(19)
+#define SNP_POLICY_MASK_SINGLE_SOCKET		BIT_ULL(20)
+
 #define SEV_FW_BLOB_MAX_SIZE	0x4000	/* 16KB */
 
 /**
-- 
cgit v1.2.3


From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 27 Oct 2025 14:33:50 -0500
Subject: crypto: ccp - Add an API to return the supported SEV-SNP policy bits

Supported policy bits are dependent on the level of SEV firmware that is
currently running. Create an API to return the supported policy bits for
the current level of firmware.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 drivers/crypto/ccp/sev-dev.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/psp-sev.h      | 18 ++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 0d13d47c164b..db7c7c50cebc 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void)
 }
 EXPORT_SYMBOL_GPL(sev_platform_shutdown);
 
+u64 sev_get_snp_policy_bits(void)
+{
+	struct psp_device *psp = psp_master;
+	struct sev_device *sev;
+	u64 policy_bits;
+
+	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+		return 0;
+
+	if (!psp || !psp->sev_data)
+		return 0;
+
+	sev = psp->sev_data;
+
+	policy_bits = SNP_POLICY_MASK_BASE;
+
+	if (sev->snp_plat_status.feature_info) {
+		if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED)
+			policy_bits |= SNP_POLICY_MASK_RAPL_DIS;
+
+		if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED)
+			policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM;
+
+		if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED)
+			policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS;
+
+		if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED)
+			policy_bits |= SNP_POLICY_MASK_CXL_ALLOW;
+
+		if (sev_version_greater_or_equal(1, 58))
+			policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE;
+	}
+
+	return policy_bits;
+}
+EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits);
+
 void sev_dev_destroy(struct psp_device *psp)
 {
 	struct sev_device *sev = psp->sev_data;
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 27c92543bf38..abcdee256c65 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -32,6 +32,20 @@
 #define SNP_POLICY_MASK_MIGRATE_MA		BIT_ULL(18)
 #define SNP_POLICY_MASK_DEBUG			BIT_ULL(19)
 #define SNP_POLICY_MASK_SINGLE_SOCKET		BIT_ULL(20)
+#define SNP_POLICY_MASK_CXL_ALLOW		BIT_ULL(21)
+#define SNP_POLICY_MASK_MEM_AES_256_XTS		BIT_ULL(22)
+#define SNP_POLICY_MASK_RAPL_DIS		BIT_ULL(23)
+#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM	BIT_ULL(24)
+#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE	BIT_ULL(25)
+
+/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */
+#define SNP_POLICY_MASK_BASE	(SNP_POLICY_MASK_API_MINOR		| \
+				 SNP_POLICY_MASK_API_MAJOR		| \
+				 SNP_POLICY_MASK_SMT			| \
+				 SNP_POLICY_MASK_RSVD_MBO		| \
+				 SNP_POLICY_MASK_MIGRATE_MA		| \
+				 SNP_POLICY_MASK_DEBUG			| \
+				 SNP_POLICY_MASK_SINGLE_SOCKET)
 
 #define SEV_FW_BLOB_MAX_SIZE	0x4000	/* 16KB */
 
@@ -868,7 +882,10 @@ struct snp_feature_info {
 	u32 edx;
 } __packed;
 
+#define SNP_RAPL_DISABLE_SUPPORTED		BIT(2)
 #define SNP_CIPHER_TEXT_HIDING_SUPPORTED	BIT(3)
+#define SNP_AES_256_XTS_POLICY_SUPPORTED	BIT(4)
+#define SNP_CXL_ALLOW_POLICY_SUPPORTED		BIT(5)
 
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
@@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask);
 void snp_free_firmware_page(void *addr);
 void sev_platform_shutdown(void);
 bool sev_is_snp_ciphertext_hiding_supported(void);
+u64 sev_get_snp_policy_bits(void);
 
 #else	/* !CONFIG_CRYPTO_DEV_SP_PSP */
 
-- 
cgit v1.2.3


From 337b1b566db087347194e4543ddfdfa5645275cc Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 18:26:23 +0200
Subject: PCI: Fix restoring BARs on BAR resize rollback path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BAR resize operation is implemented in the pci_resize_resource() and
pbus_reassign_bridge_resources() functions. pci_resize_resource() can be
called either from __resource_resize_store() from sysfs or directly by the
driver for the Endpoint Device.

pci_resize_resource() requires that the caller has released the device
resources that share the bridge window with the BAR to be resized as
otherwise the bridge window is pinned in place and cannot be changed.

pbus_reassign_bridge_resources() rolls back resources if the resize
operation fails, but rollback is performed only for the bridge windows.
Because releasing the device resources is done by the caller of the BAR
resize interface, these functions performing the BAR resize do not have
access to the device resources as they were before the resize.

pbus_reassign_bridge_resources() could try __pci_bridge_assign_resources()
after rolling back the bridge windows as they were, however, it will not
guarantee the resources are assigned due to differences in how FW and the
kernel assign the resources (alignment of the start address and tail).

To perform rollback robustly, the BAR resize interface has to be altered to
also release the device resources that share the bridge window with the BAR
to be resized.

Also, remove restoring of the entries on the failed list, as the saved list
should now contain both the bridge windows and the device resources, so the
extra restore is duplicated work.

Some drivers (currently only amdgpu) want to prevent releasing some
resources. Add exclude_bars param to pci_resize_resource() and make amdgpu
pass its register BAR (BAR 2 or 5), which should never be released during
resize operation. Normally 64-bit prefetchable resources do not share a
bridge window with the 32-bit only register BAR, but there are various
fallbacks in the resource assignment logic which may make the resources
share the bridge window in rare cases.

This change (together with the driver side changes) is to counter the
resource releases that had to be done to prevent resource tree corruption
in the ("PCI: Release assigned resource before restoring them") change. As
such, it likely restores functionality in cases where device resources were
released to avoid resource tree conflicts which appeared to be "working"
when such conflicts were not correctly detected by the kernel.

Reported-by: Simon Richter <Simon.Richter@hogyros.de>
Link: https://lore.kernel.org/linux-pci/f9a8c975-f5d3-4dd2-988e-4371a1433a60@hogyros.de/
Reported-by: Alex Bennée <alex.bennee@linaro.org>
Link: https://lore.kernel.org/linux-pci/874irqop6b.fsf@draig.linaro.org/
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash amdgpu BAR selection from
https://lore.kernel.org/r/20251114103053.13778-1-ilpo.jarvinen@linux.intel.com]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Alex Bennée <alex.bennee@linaro.org> # AVA, AMD GPU
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patch.msgid.link/20251113162628.5946-7-ilpo.jarvinen@linux.intel.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +-
 drivers/gpu/drm/i915/gt/intel_region_lmem.c |   2 +-
 drivers/gpu/drm/xe/xe_vram.c                |   2 +-
 drivers/pci/pci-sysfs.c                     |  17 +----
 drivers/pci/pci.h                           |   4 +-
 drivers/pci/setup-bus.c                     | 100 +++++++++++++++++++++-------
 drivers/pci/setup-res.c                     |  23 ++-----
 include/linux/pci.h                         |   3 +-
 8 files changed, 93 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a899fb4de29..bf0bc38e1c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1736,7 +1736,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 
 	pci_release_resource(adev->pdev, 0);
 
-	r = pci_resize_resource(adev->pdev, 0, rbar_size);
+	r = pci_resize_resource(adev->pdev, 0, rbar_size,
+				(adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
+								  : 1 << 2);
 	if (r == -ENOSPC)
 		dev_info(adev->dev,
 			 "Not enough PCI address space for a large BAR.");
diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
index 51bb27e10a4f..7699e8fcf5ed 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -37,7 +37,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size)
 
 	_release_bars(pdev);
 
-	ret = pci_resize_resource(pdev, resno, bar_size);
+	ret = pci_resize_resource(pdev, resno, bar_size, 0);
 	if (ret) {
 		drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n",
 			 resno, 1 << bar_size, ERR_PTR(ret));
diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index b44ebf50fedb..00dd027057df 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -36,7 +36,7 @@ _resize_bar(struct xe_device *xe, int resno, resource_size_t size)
 	if (pci_resource_len(pdev, resno))
 		pci_release_resource(pdev, resno);
 
-	ret = pci_resize_resource(pdev, resno, bar_size);
+	ret = pci_resize_resource(pdev, resno, bar_size, 0);
 	if (ret) {
 		drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n",
 			 resno, 1 << bar_size, ERR_PTR(ret));
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 9d6f74bd95f8..2a1b5456c2dc 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct pci_bus *bus = pdev->bus;
-	struct resource *b_win, *res;
 	unsigned long size;
-	int ret, i;
+	int ret;
 	u16 cmd;
 
 	if (kstrtoul(buf, 0, &size) < 0)
 		return -EINVAL;
 
-	b_win = pbus_select_window(bus, pci_resource_n(pdev, n));
-	if (!b_win)
-		return -EINVAL;
-
 	device_lock(dev);
 	if (dev->driver || pci_num_vf(pdev)) {
 		ret = -EBUSY;
@@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
 
 	pci_remove_resource_files(pdev);
 
-	pci_dev_for_each_resource(pdev, res, i) {
-		if (i >= PCI_BRIDGE_RESOURCES)
-			break;
-
-		if (b_win == pbus_select_window(bus, res))
-			pci_release_resource(pdev, i);
-	}
-
-	ret = pci_resize_resource(pdev, n, size);
+	ret = pci_resize_resource(pdev, n, size, 0);
 
 	pci_assign_unassigned_bus_resources(bus);
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index bf1a577e9623..9893ea12d1f2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -421,8 +421,10 @@ enum pci_bar_type {
 struct device *pci_get_host_bridge_device(struct pci_dev *dev);
 void pci_put_host_bridge_device(struct device *dev);
 
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size);
+int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size,
+				       int exclude_bars);
 unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge);
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 
 int pci_configure_extended_tags(struct pci_dev *dev, void *ign);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 51f5e5a80b54..7e268960954b 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2420,18 +2420,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources);
  * release it when possible. If the bridge window contains assigned
  * resources, it cannot be released.
  */
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
+static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res,
+					  struct list_head *saved)
 {
 	unsigned long type = res->flags;
 	struct pci_dev_resource *dev_res;
 	struct pci_dev *bridge = NULL;
-	LIST_HEAD(saved);
 	LIST_HEAD(added);
 	LIST_HEAD(failed);
 	unsigned int i;
-	int ret;
-
-	down_read(&pci_bus_sem);
+	int ret = 0;
 
 	while (!pci_is_root_bus(bus)) {
 		bridge = bus->self;
@@ -2443,9 +2441,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
 
 		/* Ignore BARs which are still in use */
 		if (!res->child) {
-			ret = add_to_list(&saved, bridge, res, 0, 0);
+			ret = add_to_list(saved, bridge, res, 0, 0);
 			if (ret)
-				goto cleanup;
+				return ret;
 
 			pci_release_resource(bridge, i);
 		} else {
@@ -2468,34 +2466,78 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
 		free_list(&added);
 
 	if (!list_empty(&failed)) {
-		if (pci_required_resource_failed(&failed, type)) {
+		if (pci_required_resource_failed(&failed, type))
 			ret = -ENOSPC;
-			goto cleanup;
-		}
-		/* Only resources with unrelated types failed (again) */
 		free_list(&failed);
+		if (ret)
+			return ret;
+
+		/* Only resources with unrelated types failed (again) */
 	}
 
-	list_for_each_entry(dev_res, &saved, list) {
+	list_for_each_entry(dev_res, saved, list) {
 		struct pci_dev *dev = dev_res->dev;
 
 		/* Skip the bridge we just assigned resources for */
 		if (bridge == dev)
 			continue;
 
+		if (!dev->subordinate)
+			continue;
+
 		pci_setup_bridge(dev->subordinate);
 	}
 
-	free_list(&saved);
-	up_read(&pci_bus_sem);
 	return 0;
+}
 
-cleanup:
-	/* Restore size and flags */
-	list_for_each_entry(dev_res, &failed, list)
-		restore_dev_resource(dev_res);
-	free_list(&failed);
+int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size,
+				       int exclude_bars)
+{
+	struct resource *res = pci_resource_n(pdev, resno);
+	struct pci_dev_resource *dev_res;
+	struct pci_bus *bus = pdev->bus;
+	struct resource *b_win, *r;
+	LIST_HEAD(saved);
+	unsigned int i;
+	int ret = 0;
+
+	b_win = pbus_select_window(bus, res);
+	if (!b_win)
+		return -EINVAL;
+
+	pci_dev_for_each_resource(pdev, r, i) {
+		if (i >= PCI_BRIDGE_RESOURCES)
+			break;
+
+		if (exclude_bars & BIT(i))
+			continue;
 
+		if (b_win != pbus_select_window(bus, r))
+			continue;
+
+		ret = add_to_list(&saved, pdev, r, 0, 0);
+		if (ret)
+			goto restore;
+		pci_release_resource(pdev, i);
+	}
+
+	pci_resize_resource_set_size(pdev, resno, size);
+
+	if (!bus->self)
+		goto out;
+
+	down_read(&pci_bus_sem);
+	ret = pbus_reassign_bridge_resources(bus, res, &saved);
+	if (ret)
+		goto restore;
+
+out:
+	up_read(&pci_bus_sem);
+	free_list(&saved);
+	return ret;
+
+restore:
 	/* Revert to the old configuration */
 	list_for_each_entry(dev_res, &saved, list) {
 		struct resource *res = dev_res->res;
@@ -2510,13 +2552,21 @@ cleanup:
 
 		restore_dev_resource(dev_res);
 
-		pci_claim_resource(dev, i);
-		pci_setup_bridge(dev->subordinate);
-	}
-	up_read(&pci_bus_sem);
-	free_list(&saved);
+		ret = pci_claim_resource(dev, i);
+		if (ret)
+			continue;
 
-	return ret;
+		if (i < PCI_BRIDGE_RESOURCES) {
+			const char *res_name = pci_resource_name(dev, i);
+
+			pci_update_resource(dev, i);
+			pci_info(dev, "%s %pR: old value restored\n",
+				 res_name, res);
+		}
+		if (dev->subordinate)
+			pci_setup_bridge(dev->subordinate);
+	}
+	goto out;
 }
 
 void pci_assign_unassigned_bus_resources(struct pci_bus *bus)
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 3d0b0b3f60c4..e4486d7030c0 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -444,8 +444,7 @@ static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev,
 	return cmd & PCI_COMMAND_MEMORY;
 }
 
-static void pci_resize_resource_set_size(struct pci_dev *dev, int resno,
-					 int size)
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size)
 {
 	resource_size_t res_size = pci_rebar_size_to_bytes(size);
 	struct resource *res = pci_resource_n(dev, resno);
@@ -456,9 +455,9 @@ static void pci_resize_resource_set_size(struct pci_dev *dev, int resno,
 	resource_set_size(res, res_size);
 }
 
-int pci_resize_resource(struct pci_dev *dev, int resno, int size)
+int pci_resize_resource(struct pci_dev *dev, int resno, int size,
+			int exclude_bars)
 {
-	struct resource *res = pci_resource_n(dev, resno);
 	struct pci_host_bridge *host;
 	int old, ret;
 	u32 sizes;
@@ -468,10 +467,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size)
 	if (host->preserve_config)
 		return -ENOTSUPP;
 
-	/* Make sure the resource isn't assigned before resizing it. */
-	if (!(res->flags & IORESOURCE_UNSET))
-		return -EBUSY;
-
 	if (pci_resize_is_memory_decoding_enabled(dev, resno))
 		return -EBUSY;
 
@@ -490,19 +485,13 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size)
 	if (ret)
 		return ret;
 
-	pci_resize_resource_set_size(dev, resno, size);
-
-	/* Check if the new config works by trying to assign everything. */
-	if (dev->bus->self) {
-		ret = pbus_reassign_bridge_resources(dev->bus, res);
-		if (ret)
-			goto error_resize;
-	}
+	ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars);
+	if (ret)
+		goto error_resize;
 	return 0;
 
 error_resize:
 	pci_rebar_set_size(dev, resno, old);
-	pci_resize_resource_set_size(dev, resno, old);
 	return ret;
 }
 EXPORT_SYMBOL(pci_resize_resource);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..34ff295cd2e3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1428,7 +1428,8 @@ static inline int pci_rebar_bytes_to_size(u64 bytes)
 }
 
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
-int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size);
+int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
+				     int exclude_bars);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 bool pci_device_is_present(struct pci_dev *pdev);
 void pci_ignore_hotplug(struct pci_dev *dev);
-- 
cgit v1.2.3


From 876e15943e9205096441cbe520dc9ccf82df8344 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 20:00:44 +0200
Subject: PCI: Move pci_rebar_bytes_to_size() and clean it up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move pci_rebar_bytes_to_size() from include/linux/pci.h to rebar.c as it
does not look very trivial and is not expected to be performance critical.

Convert literals to use a newly added PCI_REBAR_MIN_SIZE define.

Also add kernel doc for the function as the function is exported.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Michael J. Ruhl <mjruhl@habana.ai>
Link: https://patch.msgid.link/20251113180053.27944-3-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/rebar.c | 23 +++++++++++++++++++++++
 include/linux/pci.h | 10 +++-------
 2 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index f6ed7e4893a7..f29810fe0a58 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -7,11 +7,34 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
+#include <linux/log2.h>
 #include <linux/pci.h>
+#include <linux/sizes.h>
 #include <linux/types.h>
 
 #include "pci.h"
 
+#define PCI_REBAR_MIN_SIZE	((resource_size_t)SZ_1M)
+
+/**
+ * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size
+ * @bytes: size in bytes
+ *
+ * Convert size in bytes to encoded BAR Size in Resizable BAR Capability
+ * (PCIe r6.2, sec. 7.8.6.3).
+ *
+ * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ */
+int pci_rebar_bytes_to_size(u64 bytes)
+{
+	int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE);
+
+	bytes = roundup_pow_of_two(bytes);
+
+	return max(ilog2(bytes), rebar_minsize) - rebar_minsize;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size);
+
 void pci_rebar_init(struct pci_dev *pdev)
 {
 	pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 34ff295cd2e3..628dda63b9e0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1419,17 +1419,13 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_release_resource(struct pci_dev *dev, int resno);
-static inline int pci_rebar_bytes_to_size(u64 bytes)
-{
-	bytes = roundup_pow_of_two(bytes);
-
-	/* Return BAR size as defined in the resizable BAR specification */
-	return max(ilog2(bytes), 20) - 20;
-}
 
+/* Resizable BAR related routines */
+int pci_rebar_bytes_to_size(u64 bytes);
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
 				     int exclude_bars);
+
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 bool pci_device_is_present(struct pci_dev *pdev);
 void pci_ignore_hotplug(struct pci_dev *dev);
-- 
cgit v1.2.3


From a337869885083131e575c6367c679f4da4b68bb0 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 20:00:45 +0200
Subject: PCI: Move pci_rebar_size_to_bytes() and export it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pci_rebar_size_to_bytes() is in drivers/pci/pci.h but would be useful for
endpoint drivers as well.

Move the function to rebar.c and export it.

In addition, convert the literal to where the number comes from
(PCI_REBAR_MIN_SIZE).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patch.msgid.link/20251113180053.27944-4-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/pci.h   |  4 ----
 drivers/pci/rebar.c | 12 ++++++++++++
 include/linux/pci.h |  1 +
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 41df35920632..a1e7dbeb0f2c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1024,10 +1024,6 @@ void pci_rebar_init(struct pci_dev *pdev);
 void pci_restore_rebar_state(struct pci_dev *pdev);
 int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
 int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
-static inline u64 pci_rebar_size_to_bytes(int size)
-{
-	return 1ULL << (size + 20);
-}
 
 struct device_node;
 
diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index f29810fe0a58..a81cb3fcddef 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -35,6 +35,18 @@ int pci_rebar_bytes_to_size(u64 bytes)
 }
 EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size);
 
+/**
+ * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes
+ * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: BAR size in bytes
+ */
+resource_size_t pci_rebar_size_to_bytes(int size)
+{
+	return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE));
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes);
+
 void pci_rebar_init(struct pci_dev *pdev)
 {
 	pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 628dda63b9e0..33b27e0c4f3e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1422,6 +1422,7 @@ int pci_release_resource(struct pci_dev *dev, int resno);
 
 /* Resizable BAR related routines */
 int pci_rebar_bytes_to_size(u64 bytes);
+resource_size_t pci_rebar_size_to_bytes(int size);
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
 				     int exclude_bars);
-- 
cgit v1.2.3


From bb1fabd0d94efc29f88f86fb996c40ac06db3669 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 20:00:47 +0200
Subject: PCI: Add pci_rebar_size_supported() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many callers of pci_rebar_get_possible_sizes() are interested in finding
out if a particular encoded BAR Size (PCIe r7.0, sec 7.8.6.3) is supported
by the particular BAR.

Add pci_rebar_size_supported() into PCI core to make it easy for the
drivers to determine if the BAR size is supported or not.

Use the new function in pci_resize_resource() and in
pci_iov_vf_bar_set_size().

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patch.msgid.link/20251113180053.27944-6-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/iov.c   |  8 +-------
 drivers/pci/rebar.c | 25 +++++++++++++++++++------
 include/linux/pci.h |  1 +
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 04b675e90963..71ed85d38508 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -1339,19 +1339,13 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
  */
 int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size)
 {
-	u32 sizes;
-
 	if (!pci_resource_is_iov(resno))
 		return -EINVAL;
 
 	if (pci_iov_is_memory_decoding_enabled(dev))
 		return -EBUSY;
 
-	sizes = pci_rebar_get_possible_sizes(dev, resno);
-	if (!sizes)
-		return -ENOTSUPP;
-
-	if (!(sizes & BIT(size)))
+	if (!pci_rebar_size_supported(dev, resno, size))
 		return -EINVAL;
 
 	return pci_rebar_set_size(dev, resno, size);
diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index 399660cb12fa..1c90b606b8d4 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -3,6 +3,7 @@
  * PCI Resizable BAR Extended Capability handling.
  */
 
+#include <linux/bits.h>
 #include <linux/bitfield.h>
 #include <linux/errno.h>
 #include <linux/export.h>
@@ -124,6 +125,23 @@ u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
 }
 EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
 
+/**
+ * pci_rebar_size_supported - check if size is supported for BAR
+ * @pdev: PCI device
+ * @bar: BAR to check
+ * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: %true if @bar is resizable and @size is supported, otherwise
+ *	   %false.
+ */
+bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size)
+{
+	u64 sizes = pci_rebar_get_possible_sizes(pdev, bar);
+
+	return BIT(size) & sizes;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
+
 /**
  * pci_rebar_get_current_size - get the current size of a Resizable BAR
  * @pdev: PCI device
@@ -252,7 +270,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size,
 {
 	struct pci_host_bridge *host;
 	int old, ret;
-	u32 sizes;
 
 	/* Check if we must preserve the firmware's resource assignment */
 	host = pci_find_host_bridge(dev->bus);
@@ -262,11 +279,7 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size,
 	if (pci_resize_is_memory_decoding_enabled(dev, resno))
 		return -EBUSY;
 
-	sizes = pci_rebar_get_possible_sizes(dev, resno);
-	if (!sizes)
-		return -ENOTSUPP;
-
-	if (!(sizes & BIT(size)))
+	if (!pci_rebar_size_supported(dev, resno, size))
 		return -EINVAL;
 
 	old = pci_rebar_get_current_size(dev, resno);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 33b27e0c4f3e..0ef827cfaf0c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1424,6 +1424,7 @@ int pci_release_resource(struct pci_dev *dev, int resno);
 int pci_rebar_bytes_to_size(u64 bytes);
 resource_size_t pci_rebar_size_to_bytes(int size);
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
+bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
 				     int exclude_bars);
 
-- 
cgit v1.2.3


From 1c680f2acdbb3b64965962ca060a6daa6379575d Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 20:00:50 +0200
Subject: PCI: Add pci_rebar_get_max_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add pci_rebar_get_max_size() to allow simplifying code that wants to know
the maximum possible size for a Resizable BAR.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patch.msgid.link/20251113180053.27944-9-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/rebar.c | 23 +++++++++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index 1c90b606b8d4..e99b89bd5e51 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -5,6 +5,7 @@
 
 #include <linux/bits.h>
 #include <linux/bitfield.h>
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
@@ -142,6 +143,28 @@ bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size)
 }
 EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
 
+/**
+ * pci_rebar_get_max_size - get the maximum supported size of a BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the largest supported size of a resizable BAR as an encoded BAR Size.
+ *
+ * Return: the encoded maximum BAR size as defined in the PCIe spec
+ *	   (0=1MB, 31=128TB), or %-ENOENT on error.
+ */
+int pci_rebar_get_max_size(struct pci_dev *pdev, int bar)
+{
+	u32 sizes;
+
+	sizes = pci_rebar_get_possible_sizes(pdev, bar);
+	if (!sizes)
+		return -ENOENT;
+
+	return __fls(sizes);
+}
+EXPORT_SYMBOL_GPL(pci_rebar_get_max_size);
+
 /**
  * pci_rebar_get_current_size - get the current size of a Resizable BAR
  * @pdev: PCI device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0ef827cfaf0c..898bc3a4e8e7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1425,6 +1425,7 @@ int pci_rebar_bytes_to_size(u64 bytes);
 resource_size_t pci_rebar_size_to_bytes(int size);
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size);
+int pci_rebar_get_max_size(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
 				     int exclude_bars);
 
-- 
cgit v1.2.3


From bf0a90fc907e47344f88e5b9b241082184dbac27 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 13 Nov 2025 20:00:53 +0200
Subject: PCI: Convert BAR sizes bitmasks to u64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe r7.0, sec 7.8.6, defines resizable BAR sizes beyond the currently
supported maximum of 128TB, which will require more than u32 to store the
entire bitmask.

Convert Resizable BAR related functions to use u64 bitmask for BAR sizes to
make the typing more future-proof.

The support for the larger BAR sizes themselves is not added at this point.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patch.msgid.link/20251113180053.27944-12-ilpo.jarvinen@linux.intel.com
---
 drivers/gpu/drm/xe/xe_vram.c | 2 +-
 drivers/pci/iov.c            | 2 +-
 drivers/pci/pci-sysfs.c      | 2 +-
 drivers/pci/rebar.c          | 4 ++--
 include/linux/pci.h          | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index 524469f8a4bd..10f8a73e190b 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -69,7 +69,7 @@ static void resize_vram_bar(struct xe_device *xe)
 
 		if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) {
 			drm_info(&xe->drm,
-				 "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n",
+				 "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. Leaving default: %lluMiB\n",
 				 (u64)pci_rebar_size_to_bytes(rebar_size) >> 20,
 				 pci_rebar_get_possible_sizes(pdev, LMEM_BAR),
 				 (u64)current_size >> 20);
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 71ed85d38508..00784a60ba80 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -1367,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
 u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs)
 {
 	u64 vf_len = pci_resource_len(dev, resno);
-	u32 sizes;
+	u64 sizes;
 
 	if (!num_vfs)
 		return 0;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 2a1b5456c2dc..cb512bf0df7c 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf)
 	pci_config_pm_runtime_get(pdev);
 
 	ret = sysfs_emit(buf, "%016llx\n",
-			 (u64)pci_rebar_get_possible_sizes(pdev, n));
+			 pci_rebar_get_possible_sizes(pdev, n));
 
 	pci_config_pm_runtime_put(pdev);
 
diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index e99b89bd5e51..7f6dece19138 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -105,7 +105,7 @@ static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
  * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if
  *	   BAR isn't resizable.
  */
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
+u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
 {
 	int pos;
 	u32 cap;
@@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
  */
 int pci_rebar_get_max_size(struct pci_dev *pdev, int bar)
 {
-	u32 sizes;
+	u64 sizes;
 
 	sizes = pci_rebar_get_possible_sizes(pdev, bar);
 	if (!sizes)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 898bc3a4e8e7..4b7f4c08b5c7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1423,7 +1423,7 @@ int pci_release_resource(struct pci_dev *dev, int resno);
 /* Resizable BAR related routines */
 int pci_rebar_bytes_to_size(u64 bytes);
 resource_size_t pci_rebar_size_to_bytes(int size);
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
+u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size);
 int pci_rebar_get_max_size(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size,
-- 
cgit v1.2.3


From 4518767be9089ea4f54754ad27364d6134fc46e2 Mon Sep 17 00:00:00 2001
From: Jianyun Gao <jianyungao89@gmail.com>
Date: Sat, 27 Sep 2025 17:34:10 +0800
Subject: time: Fix a few typos in time[r] related code comments

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20250927093411.1509275-1-jianyungao89@gmail.com
---
 include/linux/delay.h         | 8 ++++----
 kernel/time/posix-timers.c    | 2 +-
 kernel/time/timer_migration.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 89866bab100d..46412c00033a 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max,
  * @min:	Minimum time in microseconds to sleep
  * @max:	Maximum time in microseconds to sleep
  *
- * For basic information please refere to usleep_range_state().
+ * For basic information please refer to usleep_range_state().
  *
  * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
  */
@@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max)
  * @min:	Minimum time in microseconds to sleep
  * @max:	Maximum time in microseconds to sleep
  *
- * For basic information please refere to usleep_range_state().
+ * For basic information please refer to usleep_range_state().
  *
  * The sleeping task has the state TASK_IDLE during the sleep to prevent
- * contribution to the load avarage.
+ * contribution to the load average.
  */
 static inline void usleep_range_idle(unsigned long min, unsigned long max)
 {
@@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max)
  * ssleep - wrapper for seconds around msleep
  * @seconds:	Requested sleep duration in seconds
  *
- * Please refere to msleep() for detailed information.
+ * Please refer to msleep() for detailed information.
  */
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index aa3120104a51..36dbb8146517 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
  *    sys_clock_settime(). The kernel internal timekeeping is always using
  *    nanoseconds precision independent of the clocksource device which is
  *    used to read the time from. The resolution of that device only
- *    affects the presicion of the time returned by sys_clock_gettime().
+ *    affects the precision of the time returned by sys_clock_gettime().
  *
  * Returns:
  *	0		Success. @tp contains the resolution
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 19ddfa96b9df..57e38674e56e 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -708,7 +708,7 @@ void tmigr_cpu_activate(void)
 /*
  * Returns true, if there is nothing to be propagated to the next level
  *
- * @data->firstexp is set to expiry of first gobal event of the (top level of
+ * @data->firstexp is set to expiry of first global event of the (top level of
  * the) hierarchy, but only when hierarchy is completely idle.
  *
  * The child and group states need to be read under the lock, to prevent a race
-- 
cgit v1.2.3


From ef8057b07c72a817537856b98d6e7493b9404eaf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 13 Nov 2025 20:33:33 +0100
Subject: PM: runtime: Wrapper macros for ACQUIRE()/ACQUIRE_ERR()

Add wrapper macros for ACQUIRE()/ACQUIRE_ERR() and runtime PM
usage counter guards introduced recently: pm_runtime_active_try,
pm_runtime_active_auto_try, pm_runtime_active_try_enabled, and
pm_runtime_active_auto_try_enabled.

The new macros should be more straightforward to use.

For example, they can be used for rewriting a piece of code like below:

        ACQUIRE(pm_runtime_active_try, pm)(dev);
        if ((ret = ACQUIRE_ERR(pm_runtime_active_try, &pm)))
                return ret;

in the following way:

        PM_RUNTIME_ACQUIRE(dev, pm);
        if ((ret = PM_RUNTIME_ACQUIRE_ERR(&pm)))
                return ret;

If the original code does not care about the specific error code
returned when attempting to resume the device:

        ACQUIRE(pm_runtime_active_try, pm)(dev);
        if (ACQUIRE_ERR(pm_runtime_active_try, &pm))
                return -ENXIO;

it may be changed like this:

        PM_RUNTIME_ACQUIRE(dev, pm);
        if (PM_RUNTIME_ACQUIRE_ERR(&pm))
                return -ENXIO;

Link: https://lore.kernel.org/linux-pm/5068916.31r3eYUQgx@rafael.j.wysocki/
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/3400866.aeNJFYEL58@rafael.j.wysocki
---
 include/linux/pm_runtime.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 0b436e15f4cd..911d7a4d32c1 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try,
 DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled,
 		  pm_runtime_resume_and_get(_T), _RET == 0)
 
+/* ACQUIRE() wrapper macros for the guards defined above. */
+
+#define PM_RUNTIME_ACQUIRE(_dev, _var)			\
+	ACQUIRE(pm_runtime_active_try, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var)	\
+	ACQUIRE(pm_runtime_active_auto_try, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var)	\
+	ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var)	\
+	ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev)
+
+/*
+ * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active.
+ *
+ * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the
+ * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with
+ * any of them) and if it is nonzero, avoid accessing the given device.
+ */
+#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr)	\
+	ACQUIRE_ERR(pm_runtime_active, _var_ptr)
+
 /**
  * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0.
  * @dev: Target device.
-- 
cgit v1.2.3


From 1dcb98bbb7538d4b9015d47c934acdf5ea86045c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Nov 2025 15:33:41 -1000
Subject: sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs

With the buddy lockup detector, smp_processor_id() returns the detecting CPU,
not the locked CPU, making scx_hardlockup()'s printouts confusing. Pass the
locked CPU number from watchdog_hardlockup_check() as a parameter instead.

Also add kerneldoc comments to handle_lockup(), scx_hardlockup(), and
scx_rcu_cpu_stall() documenting their return value semantics.

Suggested-by: Doug Anderson <dianders@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h |  4 ++--
 kernel/sched/ext.c        | 25 ++++++++++++++++++++++---
 kernel/watchdog.c         |  2 +-
 3 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 70ee5c28a74d..bcb962d5ee7d 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -230,7 +230,7 @@ struct sched_ext_entity {
 void sched_ext_dead(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
-bool scx_hardlockup(void);
+bool scx_hardlockup(int cpu);
 bool scx_rcu_cpu_stall(void);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
@@ -238,7 +238,7 @@ bool scx_rcu_cpu_stall(void);
 static inline void sched_ext_dead(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
-static inline bool scx_hardlockup(void) { return false; }
+static inline bool scx_hardlockup(int cpu) { return false; }
 static inline bool scx_rcu_cpu_stall(void) { return false; }
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8a3b8f64a06b..918573f3f088 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3687,6 +3687,17 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
 	return false;
 }
 
+/**
+ * handle_lockup - sched_ext common lockup handler
+ * @fmt: format string
+ *
+ * Called on system stall or lockup condition and initiates abort of sched_ext
+ * if enabled, which may resolve the reported lockup.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the lockup. %false if sched_ext is not enabled or abort was already
+ * initiated by someone else.
+ */
 static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
 {
 	struct scx_sched *sch;
@@ -3718,6 +3729,10 @@ static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
  * that may not be caused by the current BPF scheduler, try kicking out the
  * current scheduler in an attempt to recover the system to a good state before
  * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
  */
 bool scx_rcu_cpu_stall(void)
 {
@@ -3750,14 +3765,18 @@ void scx_softlockup(u32 dur_s)
  * numerous affinitized tasks in a single queue and directing all CPUs at it.
  * Try kicking out the current scheduler in an attempt to recover the system to
  * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hard lockup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
  */
-bool scx_hardlockup(void)
+bool scx_hardlockup(int cpu)
 {
-	if (!handle_lockup("hard lockup - CPU %d", smp_processor_id()))
+	if (!handle_lockup("hard lockup - CPU %d", cpu))
 		return false;
 
 	printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
-			smp_processor_id());
+			cpu);
 	return true;
 }
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8dfac4a8f587..873020a2a581 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -203,7 +203,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 		 * only once when sched_ext is enabled and will immediately
 		 * abort the BPF scheduler and print out a warning message.
 		 */
-		if (scx_hardlockup())
+		if (scx_hardlockup(cpu))
 			return;
 
 		/* Only print hardlockups once. */
-- 
cgit v1.2.3


From f86e51399c2a911a5b01d441de513f17bf773856 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@linux.intel.com>
Date: Thu, 13 Nov 2025 17:02:27 -0800
Subject: PCI/IDE: Add Address Association Register setup for downstream MMIO

The address ranges for downstream Address Association Registers need to
cover memory addresses for all functions (PFs/VFs/downstream devices)
managed by a Device Security Manager (DSM). The proposed solution is to get
the memory (32-bit only) range and prefetchable-memory (64-bit capable)
range from the immediate ancestor downstream port (either the direct-attach
RP or deepest switch port when switch attached).

Similar to RID association, address associations will be set by default if
hardware sets 'Number of Address Association Register Blocks' in the
'Selective IDE Stream Capability Register' to a non-zero value. TSM drivers
can opt-out of the settings by zero'ing out unwanted / unsupported address
ranges. E.g. TDX Connect only supports prefetchable (64-bit capable)
memory ranges for the Address Association setting.

If the immediate downstream port provides both a memory range and
prefetchable-memory range, but the IDE partner port only provides 1 Address
Association Register block then the TSM driver can pick which range to
associate, or let the PCI core prioritize memory.

Note, the Address Association Register setup for upstream requests is still
uncertain so is not included.

Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Co-developed-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251114010227.567693-1-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/ide.c       | 117 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/pci-ide.h |  32 +++++++++++++
 include/linux/pci.h     |   5 +++
 3 files changed, 145 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
index da5b1acccbb4..2e0856a4307b 100644
--- a/drivers/pci/ide.c
+++ b/drivers/pci/ide.c
@@ -155,8 +155,11 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev)
 {
 	/* EP, RP, + HB Stream allocation */
 	struct stream_index __stream[PCI_IDE_HB + 1];
+	struct pci_bus_region pref_assoc = { 0, -1 };
+	struct pci_bus_region mem_assoc = { 0, -1 };
+	struct resource *mem, *pref;
 	struct pci_host_bridge *hb;
-	struct pci_dev *rp;
+	struct pci_dev *rp, *br;
 	int num_vf, rid_end;
 
 	if (!pci_is_pcie(pdev))
@@ -197,6 +200,21 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev)
 	else
 		rid_end = pci_dev_id(pdev);
 
+	br = pci_upstream_bridge(pdev);
+	if (!br)
+		return NULL;
+
+	/*
+	 * Check if the device consumes memory and/or prefetch-memory. Setup
+	 * downstream address association ranges for each.
+	 */
+	mem = pci_resource_n(br, PCI_BRIDGE_MEM_WINDOW);
+	pref = pci_resource_n(br, PCI_BRIDGE_PREF_MEM_WINDOW);
+	if (resource_assigned(mem))
+		pcibios_resource_to_bus(br->bus, &mem_assoc, mem);
+	if (resource_assigned(pref))
+		pcibios_resource_to_bus(br->bus, &pref_assoc, pref);
+
 	*ide = (struct pci_ide) {
 		.pdev = pdev,
 		.partner = {
@@ -204,11 +222,16 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev)
 				.rid_start = pci_dev_id(rp),
 				.rid_end = pci_dev_id(rp),
 				.stream_index = no_free_ptr(ep_stream)->stream_index,
+				/* Disable upstream address association */
+				.mem_assoc = { 0, -1 },
+				.pref_assoc = { 0, -1 },
 			},
 			[PCI_IDE_RP] = {
 				.rid_start = pci_dev_id(pdev),
 				.rid_end = rid_end,
 				.stream_index = no_free_ptr(rp_stream)->stream_index,
+				.mem_assoc = mem_assoc,
+				.pref_assoc = pref_assoc,
 			},
 		},
 		.host_bridge_stream = no_free_ptr(hb_stream)->stream_index,
@@ -385,6 +408,63 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide,
 	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val);
 }
 
+#define SEL_ADDR1_LOWER GENMASK(31, 20)
+#define SEL_ADDR_UPPER GENMASK_ULL(63, 32)
+#define PREP_PCI_IDE_SEL_ADDR1(base, limit)			\
+	(FIELD_PREP(PCI_IDE_SEL_ADDR_1_VALID, 1) |		\
+	 FIELD_PREP(PCI_IDE_SEL_ADDR_1_BASE_LOW,		\
+		    FIELD_GET(SEL_ADDR1_LOWER, (base))) |	\
+	 FIELD_PREP(PCI_IDE_SEL_ADDR_1_LIMIT_LOW,		\
+		    FIELD_GET(SEL_ADDR1_LOWER, (limit))))
+
+static void mem_assoc_to_regs(struct pci_bus_region *region,
+			      struct pci_ide_regs *regs, int idx)
+{
+	/* convert to u64 range for bitfield size checks */
+	struct range r = { region->start, region->end };
+
+	regs->addr[idx].assoc1 = PREP_PCI_IDE_SEL_ADDR1(r.start, r.end);
+	regs->addr[idx].assoc2 = FIELD_GET(SEL_ADDR_UPPER, r.end);
+	regs->addr[idx].assoc3 = FIELD_GET(SEL_ADDR_UPPER, r.start);
+}
+
+/**
+ * pci_ide_stream_to_regs() - convert IDE settings to association register values
+ * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
+ * @ide: registered IDE settings descriptor
+ * @regs: output register values
+ */
+static void pci_ide_stream_to_regs(struct pci_dev *pdev, struct pci_ide *ide,
+				   struct pci_ide_regs *regs)
+{
+	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	int assoc_idx = 0;
+
+	memset(regs, 0, sizeof(*regs));
+
+	if (!settings)
+		return;
+
+	regs->rid1 = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end);
+
+	regs->rid2 = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) |
+		     FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) |
+		     FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev));
+
+	if (pdev->nr_ide_mem && pci_bus_region_size(&settings->mem_assoc)) {
+		mem_assoc_to_regs(&settings->mem_assoc, regs, assoc_idx);
+		assoc_idx++;
+	}
+
+	if (pdev->nr_ide_mem > assoc_idx &&
+	    pci_bus_region_size(&settings->pref_assoc)) {
+		mem_assoc_to_regs(&settings->pref_assoc, regs, assoc_idx);
+		assoc_idx++;
+	}
+
+	regs->nr_addr = assoc_idx;
+}
+
 /**
  * pci_ide_stream_setup() - program settings to Selective IDE Stream registers
  * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port
@@ -398,22 +478,34 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide,
 void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide)
 {
 	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
+	struct pci_ide_regs regs;
 	int pos;
-	u32 val;
 
 	if (!settings)
 		return;
 
+	pci_ide_stream_to_regs(pdev, ide, &regs);
+
 	pos = sel_ide_offset(pdev, settings);
 
-	val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end);
-	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val);
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, regs.rid1);
+	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, regs.rid2);
 
-	val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) |
-	      FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) |
-	      FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev));
+	for (int i = 0; i < regs.nr_addr; i++) {
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i),
+				       regs.addr[i].assoc1);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i),
+				       regs.addr[i].assoc2);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i),
+				       regs.addr[i].assoc3);
+	}
 
-	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val);
+	/* clear extra unused address association blocks */
+	for (int i = regs.nr_addr; i < pdev->nr_ide_mem; i++) {
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0);
+	}
 
 	/*
 	 * Setup control register early for devices that expect
@@ -436,7 +528,7 @@ EXPORT_SYMBOL_GPL(pci_ide_stream_setup);
 void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide)
 {
 	struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide);
-	int pos;
+	int pos, i;
 
 	if (!settings)
 		return;
@@ -444,6 +536,13 @@ void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide)
 	pos = sel_ide_offset(pdev, settings);
 
 	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0);
+
+	for (i = 0; i < pdev->nr_ide_mem; i++) {
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0);
+	}
+
 	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0);
 	pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0);
 	settings->setup = 0;
diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h
index d0f10f3c89fc..93194338e4d0 100644
--- a/include/linux/pci-ide.h
+++ b/include/linux/pci-ide.h
@@ -28,21 +28,53 @@ enum pci_ide_partner_select {
  * @rid_start: Partner Port Requester ID range start
  * @rid_end: Partner Port Requester ID range end
  * @stream_index: Selective IDE Stream Register Block selection
+ * @mem_assoc: PCI bus memory address association for targeting peer partner
+ * @pref_assoc: PCI bus prefetchable memory address association for
+ *		targeting peer partner
  * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of
  *		    address and RID association registers
  * @setup: flag to track whether to run pci_ide_stream_teardown() for this
  *	   partner slot
  * @enable: flag whether to run pci_ide_stream_disable() for this partner slot
+ *
+ * By default, pci_ide_stream_alloc() initializes @mem_assoc and @pref_assoc
+ * with the immediate ancestor downstream port memory ranges (i.e. Type 1
+ * Configuration Space Header values). Caller may zero size ({0, -1}) the range
+ * to drop it from consideration at pci_ide_stream_setup() time.
  */
 struct pci_ide_partner {
 	u16 rid_start;
 	u16 rid_end;
 	u8 stream_index;
+	struct pci_bus_region mem_assoc;
+	struct pci_bus_region pref_assoc;
 	unsigned int default_stream:1;
 	unsigned int setup:1;
 	unsigned int enable:1;
 };
 
+/**
+ * struct pci_ide_regs - Hardware register association settings for Selective
+ *			 IDE Streams
+ * @rid1: IDE RID Association Register 1
+ * @rid2: IDE RID Association Register 2
+ * @addr: Up to two address association blocks (IDE Address Association Register
+ *	  1 through 3) for MMIO and prefetchable MMIO
+ * @nr_addr: Number of address association blocks initialized
+ *
+ * See pci_ide_stream_to_regs()
+ */
+struct pci_ide_regs {
+	u32 rid1;
+	u32 rid2;
+	struct {
+		u32 assoc1;
+		u32 assoc2;
+		u32 assoc3;
+	} addr[2];
+	int nr_addr;
+};
+
 /**
  * struct pci_ide - PCIe Selective IDE Stream descriptor
  * @pdev: PCIe Endpoint in the pci_ide_partner pair
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2c8dbae4916c..ba39ca78b382 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -870,6 +870,11 @@ struct pci_bus_region {
 	pci_bus_addr_t	end;
 };
 
+static inline pci_bus_addr_t pci_bus_region_size(const struct pci_bus_region *region)
+{
+	return region->end - region->start + 1;
+}
+
 struct pci_dynids {
 	spinlock_t		lock;	/* Protects list, index */
 	struct list_head	list;	/* For IDs added at runtime */
-- 
cgit v1.2.3


From 079115370d00c78ef69b31dd15def90adf2aa579 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Nov 2025 18:14:43 -0800
Subject: PCI/IDE: Initialize an ID for all IDE streams

The PCIe spec defines two types of streams - selective and link.  Each
stream has an ID from the same bucket so a stream ID does not tell the
type.  The spec defines an "enable" bit for every stream and requires
stream IDs to be unique among all enabled streams but there is no such
requirement for disabled streams.

However, when IDE_KM is programming keys, an IDE-capable device needs
to know the type of stream being programmed to write it directly to
the hardware as keys are relatively large, possibly many of them and
devices often struggle with keeping around rather big data not being
used.

Walk through all streams on a device and initialise the IDs to some
unique number, both link and selective.

The weakest part of this proposal is the host bridge ide_stream_ids_ida.
Technically, a Stream ID only needs to be unique within a given partner
pair. However, with "anonymous" / unassigned streams there is no convenient
place to track the available ids. Proceed with an ida in the host bridge
for now, but consider moving this tracking to be an ide_stream_ids_ida per
device.

Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-6-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/ide.c       | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h       |   2 +
 drivers/pci/remove.c    |   1 +
 include/linux/pci-ide.h |   6 +++
 include/linux/pci.h     |   1 +
 5 files changed, 139 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c
index 2e0856a4307b..f0ef474e1a0d 100644
--- a/drivers/pci/ide.c
+++ b/drivers/pci/ide.c
@@ -35,8 +35,50 @@ static int sel_ide_offset(struct pci_dev *pdev,
 				settings->stream_index, pdev->nr_ide_mem);
 }
 
+static bool reserve_stream_index(struct pci_dev *pdev, u8 idx)
+{
+	int ret;
+
+	ret = ida_alloc_range(&pdev->ide_stream_ida, idx, idx, GFP_KERNEL);
+	return ret >= 0;
+}
+
+static bool reserve_stream_id(struct pci_host_bridge *hb, u8 id)
+{
+	int ret;
+
+	ret = ida_alloc_range(&hb->ide_stream_ids_ida, id, id, GFP_KERNEL);
+	return ret >= 0;
+}
+
+static bool claim_stream(struct pci_host_bridge *hb, u8 stream_id,
+			 struct pci_dev *pdev, u8 stream_idx)
+{
+	dev_info(&hb->dev, "Stream ID %d active at init\n", stream_id);
+	if (!reserve_stream_id(hb, stream_id)) {
+		dev_info(&hb->dev, "Failed to claim %s Stream ID %d\n",
+			 stream_id == PCI_IDE_RESERVED_STREAM_ID ? "reserved" :
+								   "active",
+			 stream_id);
+		return false;
+	}
+
+	/* No stream index to reserve in the Link IDE case */
+	if (!pdev)
+		return true;
+
+	if (!reserve_stream_index(pdev, stream_idx)) {
+		pci_info(pdev, "Failed to claim active Selective Stream %d\n",
+			 stream_idx);
+		return false;
+	}
+
+	return true;
+}
+
 void pci_ide_init(struct pci_dev *pdev)
 {
+	struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus);
 	u16 nr_link_ide, nr_ide_mem, nr_streams;
 	u16 ide_cap;
 	u32 val;
@@ -83,6 +125,7 @@ void pci_ide_init(struct pci_dev *pdev)
 		int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem);
 		int nr_assoc;
 		u32 val;
+		u8 id;
 
 		pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val);
 
@@ -98,6 +141,51 @@ void pci_ide_init(struct pci_dev *pdev)
 		}
 
 		nr_ide_mem = nr_assoc;
+
+		/*
+		 * Claim Stream IDs and Selective Stream blocks that are already
+		 * active on the device
+		 */
+		pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CTL, &val);
+		id = FIELD_GET(PCI_IDE_SEL_CTL_ID, val);
+		if ((val & PCI_IDE_SEL_CTL_EN) &&
+		    !claim_stream(hb, id, pdev, i))
+			return;
+	}
+
+	/* Reserve link stream-ids that are already active on the device */
+	for (u16 i = 0; i < nr_link_ide; ++i) {
+		int pos = ide_cap + PCI_IDE_LINK_STREAM_0 + i * PCI_IDE_LINK_BLOCK_SIZE;
+		u8 id;
+
+		pci_read_config_dword(pdev, pos + PCI_IDE_LINK_CTL_0, &val);
+		id = FIELD_GET(PCI_IDE_LINK_CTL_ID, val);
+		if ((val & PCI_IDE_LINK_CTL_EN) &&
+		    !claim_stream(hb, id, NULL, -1))
+			return;
+	}
+
+	for (u16 i = 0; i < nr_streams; i++) {
+		int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem);
+
+		pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val);
+		if (val & PCI_IDE_SEL_CTL_EN)
+			continue;
+		val &= ~PCI_IDE_SEL_CTL_ID;
+		val |= FIELD_PREP(PCI_IDE_SEL_CTL_ID, PCI_IDE_RESERVED_STREAM_ID);
+		pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val);
+	}
+
+	for (u16 i = 0; i < nr_link_ide; ++i) {
+		int pos = ide_cap + PCI_IDE_LINK_STREAM_0 +
+			  i * PCI_IDE_LINK_BLOCK_SIZE;
+
+		pci_read_config_dword(pdev, pos, &val);
+		if (val & PCI_IDE_LINK_CTL_EN)
+			continue;
+		val &= ~PCI_IDE_LINK_CTL_ID;
+		val |= FIELD_PREP(PCI_IDE_LINK_CTL_ID, PCI_IDE_RESERVED_STREAM_ID);
+		pci_write_config_dword(pdev, pos, val);
 	}
 
 	pdev->ide_cap = ide_cap;
@@ -301,6 +389,28 @@ void pci_ide_stream_release(struct pci_ide *ide)
 }
 EXPORT_SYMBOL_GPL(pci_ide_stream_release);
 
+struct pci_ide_stream_id {
+	struct pci_host_bridge *hb;
+	u8 stream_id;
+};
+
+static struct pci_ide_stream_id *
+request_stream_id(struct pci_host_bridge *hb, u8 stream_id,
+		  struct pci_ide_stream_id *sid)
+{
+	if (!reserve_stream_id(hb, stream_id))
+		return NULL;
+
+	*sid = (struct pci_ide_stream_id) {
+		.hb = hb,
+		.stream_id = stream_id,
+	};
+
+	return sid;
+}
+DEFINE_FREE(free_stream_id, struct pci_ide_stream_id *,
+	    if (_T) ida_free(&_T->hb->ide_stream_ids_ida, _T->stream_id))
+
 /**
  * pci_ide_stream_register() - Prepare to activate an IDE Stream
  * @ide: IDE settings descriptor
@@ -313,6 +423,7 @@ int pci_ide_stream_register(struct pci_ide *ide)
 {
 	struct pci_dev *pdev = ide->pdev;
 	struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus);
+	struct pci_ide_stream_id __sid;
 	u8 ep_stream, rp_stream;
 	int rc;
 
@@ -321,6 +432,13 @@ int pci_ide_stream_register(struct pci_ide *ide)
 		return -ENXIO;
 	}
 
+	struct pci_ide_stream_id *sid __free(free_stream_id) =
+		request_stream_id(hb, ide->stream_id, &__sid);
+	if (!sid) {
+		pci_err(pdev, "Setup fail: Stream ID %d in use\n", ide->stream_id);
+		return -EBUSY;
+	}
+
 	ep_stream = ide->partner[PCI_IDE_EP].stream_index;
 	rp_stream = ide->partner[PCI_IDE_RP].stream_index;
 	const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d",
@@ -335,6 +453,9 @@ int pci_ide_stream_register(struct pci_ide *ide)
 
 	ide->name = no_free_ptr(name);
 
+	/* Stream ID reservation recorded in @ide is now successfully registered */
+	retain_and_null_ptr(sid);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_ide_stream_register);
@@ -353,6 +474,7 @@ void pci_ide_stream_unregister(struct pci_ide *ide)
 
 	sysfs_remove_link(&hb->dev.kobj, ide->name);
 	kfree(ide->name);
+	ida_free(&hb->ide_stream_ids_ida, ide->stream_id);
 	ide->name = NULL;
 }
 EXPORT_SYMBOL_GPL(pci_ide_stream_unregister);
@@ -616,6 +738,8 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb)
 {
 	hb->nr_ide_streams = 256;
 	ida_init(&hb->ide_stream_ida);
+	ida_init(&hb->ide_stream_ids_ida);
+	reserve_stream_id(hb, PCI_IDE_RESERVED_STREAM_ID);
 }
 
 static ssize_t available_secure_streams_show(struct device *dev,
@@ -684,3 +808,8 @@ void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr)
 	sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group);
 }
 EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE");
+
+void pci_ide_destroy(struct pci_dev *pdev)
+{
+	ida_destroy(&pdev->ide_stream_ida);
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f6ffe5ee4717..641c0b53c4e3 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -616,10 +616,12 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { }
 #ifdef CONFIG_PCI_IDE
 void pci_ide_init(struct pci_dev *dev);
 void pci_ide_init_host_bridge(struct pci_host_bridge *hb);
+void pci_ide_destroy(struct pci_dev *dev);
 extern const struct attribute_group pci_ide_attr_group;
 #else
 static inline void pci_ide_init(struct pci_dev *dev) { }
 static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { }
+static inline void pci_ide_destroy(struct pci_dev *dev) { }
 #endif
 
 #ifdef CONFIG_PCI_TSM
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 803391892c4a..417a9ea59117 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -70,6 +70,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
 	up_write(&pci_bus_sem);
 
 	pci_doe_destroy(dev);
+	pci_ide_destroy(dev);
 	pcie_aspm_exit_link_state(dev);
 	pci_bridge_d3_update(dev);
 	pci_pwrctrl_unregister(&dev->dev);
diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h
index 93194338e4d0..37a1ad9501b0 100644
--- a/include/linux/pci-ide.h
+++ b/include/linux/pci-ide.h
@@ -97,6 +97,12 @@ struct pci_ide {
 	struct tsm_dev *tsm_dev;
 };
 
+/*
+ * Some devices need help with aliased stream-ids even for idle streams. Use
+ * this id as the "never enabled" place holder.
+ */
+#define PCI_IDE_RESERVED_STREAM_ID 255
+
 void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr);
 struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev,
 					    struct pci_ide *ide);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ba39ca78b382..52a235c61023 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -619,6 +619,7 @@ struct pci_host_bridge {
 #ifdef CONFIG_PCI_IDE
 	u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */
 	struct ida ide_stream_ida;
+	struct ida ide_stream_ids_ida; /* track unique ids per domain */
 #endif
 	u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */
 	int (*map_irq)(const struct pci_dev *, u8, u8);
-- 
cgit v1.2.3


From 50cbec192f5317e29be993e2a634bbbdfcf0230e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Nov 2025 18:14:44 -0800
Subject: PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs

After a PCIe device has established a secure link and session between a TEE
Security Manager (TSM) and its local Device Security Manager (DSM), the
device or its subfunctions are candidates to be bound to a private memory
context, a TVM. A PCIe device function interface assigned to a TVM is a TEE
Device Interface (TDI).

pci_tsm_bind() requests the low-level TSM driver to associate the
device with private MMIO and private IOMMU context resources of a given TVM
represented by a @kvm argument. A device in the bound state corresponds to
the TDISP protocol LOCKED state and awaits validation by the TVM. It is a
'struct pci_tsm_link_ops' operation because, similar to IDE establishment,
it involves host side resource establishment and context setup on behalf of
the guest. It is also expected to be performed lazily to allow for
operation of the device in non-confidential "shared" context for pre-lock
configuration.

Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-7-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/tsm.c       | 109 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/pci-tsm.h |  34 +++++++++++++++
 2 files changed, 142 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c
index 6a2849f77adc..39de91a47a26 100644
--- a/drivers/pci/tsm.c
+++ b/drivers/pci/tsm.c
@@ -270,6 +270,95 @@ static int remove_fn(struct pci_dev *pdev, void *data)
 	return 0;
 }
 
+/*
+ * Note, this helper only returns an error code and takes an argument for
+ * compatibility with the pci_walk_bus() callback prototype. pci_tsm_unbind()
+ * always succeeds.
+ */
+static int __pci_tsm_unbind(struct pci_dev *pdev, void *data)
+{
+	struct pci_tdi *tdi;
+	struct pci_tsm_pf0 *tsm_pf0;
+
+	lockdep_assert_held(&pci_tsm_rwsem);
+
+	if (!pdev->tsm)
+		return 0;
+
+	tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
+	guard(mutex)(&tsm_pf0->lock);
+
+	tdi = pdev->tsm->tdi;
+	if (!tdi)
+		return 0;
+
+	to_pci_tsm_ops(pdev->tsm)->unbind(tdi);
+	pdev->tsm->tdi = NULL;
+
+	return 0;
+}
+
+void pci_tsm_unbind(struct pci_dev *pdev)
+{
+	guard(rwsem_read)(&pci_tsm_rwsem);
+	__pci_tsm_unbind(pdev, NULL);
+}
+EXPORT_SYMBOL_GPL(pci_tsm_unbind);
+
+/**
+ * pci_tsm_bind() - Bind @pdev as a TDI for @kvm
+ * @pdev: PCI device function to bind
+ * @kvm: Private memory attach context
+ * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ *
+ * Context: Caller is responsible for constraining the bind lifetime to the
+ * registered state of the device. For example, pci_tsm_bind() /
+ * pci_tsm_unbind() limited to the VFIO driver bound state of the device.
+ */
+int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id)
+{
+	struct pci_tsm_pf0 *tsm_pf0;
+	struct pci_tdi *tdi;
+
+	if (!kvm)
+		return -EINVAL;
+
+	guard(rwsem_read)(&pci_tsm_rwsem);
+
+	if (!pdev->tsm)
+		return -EINVAL;
+
+	if (!is_link_tsm(pdev->tsm->tsm_dev))
+		return -ENXIO;
+
+	tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
+	guard(mutex)(&tsm_pf0->lock);
+
+	/* Resolve races to bind a TDI */
+	if (pdev->tsm->tdi) {
+		if (pdev->tsm->tdi->kvm != kvm)
+			return -EBUSY;
+		return 0;
+	}
+
+	tdi = to_pci_tsm_ops(pdev->tsm)->bind(pdev, kvm, tdi_id);
+	if (IS_ERR(tdi))
+		return PTR_ERR(tdi);
+
+	pdev->tsm->tdi = tdi;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_tsm_bind);
+
+static void pci_tsm_unbind_all(struct pci_dev *pdev)
+{
+	pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL);
+	__pci_tsm_unbind(pdev, NULL);
+}
+
 static void __pci_tsm_disconnect(struct pci_dev *pdev)
 {
 	struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
@@ -278,6 +367,8 @@ static void __pci_tsm_disconnect(struct pci_dev *pdev)
 	/* disconnect() mutually exclusive with subfunction pci_tsm_init() */
 	lockdep_assert_held_write(&pci_tsm_rwsem);
 
+	pci_tsm_unbind_all(pdev);
+
 	/*
 	 * disconnect() is uninterruptible as it may be called for device
 	 * teardown
@@ -439,6 +530,22 @@ static struct pci_dev *find_dsm_dev(struct pci_dev *pdev)
 	return NULL;
 }
 
+/**
+ * pci_tsm_tdi_constructor() - base 'struct pci_tdi' initialization for link TSMs
+ * @pdev: PCI device function representing the TDI
+ * @tdi: context to initialize
+ * @kvm: Private memory attach context
+ * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM
+ */
+void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi,
+			     struct kvm *kvm, u32 tdi_id)
+{
+	tdi->pdev = pdev;
+	tdi->kvm = kvm;
+	tdi->tdi_id = tdi_id;
+}
+EXPORT_SYMBOL_GPL(pci_tsm_tdi_constructor);
+
 /**
  * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs
  * @pdev: The PCI device
@@ -532,7 +639,7 @@ int pci_tsm_register(struct tsm_dev *tsm_dev)
 
 static void pci_tsm_fn_exit(struct pci_dev *pdev)
 {
-	/* TODO: unbind the fn */
+	__pci_tsm_unbind(pdev, NULL);
 	tsm_remove(pdev->tsm);
 }
 
diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h
index d7b078d5e272..a5e297677917 100644
--- a/include/linux/pci-tsm.h
+++ b/include/linux/pci-tsm.h
@@ -6,6 +6,8 @@
 
 struct pci_tsm;
 struct tsm_dev;
+struct kvm;
+enum pci_tsm_req_scope;
 
 /*
  * struct pci_tsm_ops - manage confidential links and security state
@@ -29,12 +31,16 @@ struct pci_tsm_ops {
 	 * @connect: establish / validate a secure connection (e.g. IDE)
 	 *	     with the device
 	 * @disconnect: teardown the secure link
+	 * @bind: bind a TDI in preparation for it to be accepted by a TVM
+	 * @unbind: remove a TDI from secure operation with a TVM
 	 *
 	 * Context: @probe, @remove, @connect, and @disconnect run under
 	 * pci_tsm_rwsem held for write to sync with TSM unregistration and
 	 * mutual exclusion of @connect and @disconnect. @connect and
 	 * @disconnect additionally run under the DSM lock (struct
 	 * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions.
+	 * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM
+	 * lock.
 	 */
 	struct_group_tagged(pci_tsm_link_ops, link_ops,
 		struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev,
@@ -42,6 +48,9 @@ struct pci_tsm_ops {
 		void (*remove)(struct pci_tsm *tsm);
 		int (*connect)(struct pci_dev *pdev);
 		void (*disconnect)(struct pci_dev *pdev);
+		struct pci_tdi *(*bind)(struct pci_dev *pdev,
+					struct kvm *kvm, u32 tdi_id);
+		void (*unbind)(struct pci_tdi *tdi);
 	);
 
 	/*
@@ -61,12 +70,25 @@ struct pci_tsm_ops {
 	);
 };
 
+/**
+ * struct pci_tdi - Core TEE I/O Device Interface (TDI) context
+ * @pdev: host side representation of guest-side TDI
+ * @kvm: TEE VM context of bound TDI
+ * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM
+ */
+struct pci_tdi {
+	struct pci_dev *pdev;
+	struct kvm *kvm;
+	u32 tdi_id;
+};
+
 /**
  * struct pci_tsm - Core TSM context for a given PCIe endpoint
  * @pdev: Back ref to device function, distinguishes type of pci_tsm context
  * @dsm_dev: PCI Device Security Manager for link operations on @pdev
  * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device
  *	     Function Security operations
+ * @tdi: TDI context established by the @bind link operation
  *
  * This structure is wrapped by low level TSM driver data and returned by
  * probe()/lock(), it is freed by the corresponding remove()/unlock().
@@ -82,6 +104,7 @@ struct pci_tsm {
 	struct pci_dev *pdev;
 	struct pci_dev *dsm_dev;
 	struct tsm_dev *tsm_dev;
+	struct pci_tdi *tdi;
 };
 
 /**
@@ -139,6 +162,10 @@ int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm,
 void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm);
 int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
 			 size_t req_sz, void *resp, size_t resp_sz);
+int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id);
+void pci_tsm_unbind(struct pci_dev *pdev);
+void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi,
+			     struct kvm *kvm, u32 tdi_id);
 #else
 static inline int pci_tsm_register(struct tsm_dev *tsm_dev)
 {
@@ -147,5 +174,12 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev)
 static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev)
 {
 }
+static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id)
+{
+	return -ENXIO;
+}
+static inline void pci_tsm_unbind(struct pci_dev *pdev)
+{
+}
 #endif
 #endif /*__PCI_TSM_H */
-- 
cgit v1.2.3


From c316c75d57fbb34e2305690813f4dbec9311f2b0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 12 Nov 2025 18:14:45 -0800
Subject: PCI/TSM: Add pci_tsm_guest_req() for managing TDIs

A PCIe device function interface assigned to a TVM is a TEE Device
Interface (TDI). A TDI instantiated by pci_tsm_bind() needs additional
steps taken by the TVM to be accepted into the TVM's Trusted Compute
Boundary (TCB) and transitioned to the RUN state.

pci_tsm_guest_req() is a channel for the guest to request TDISP collateral,
like Device Interface Reports, and effect TDISP state changes, like
LOCKED->RUN transitions. Similar to IDE establishment and pci_tsm_bind(),
these are long running operations involving SPDM message passing via the
DOE mailbox.

The path for a TVM to invoke pci_tsm_guest_req() is:
* TSM triggers exit via guest-to-host-interface ABI (implementation specific)
* VMM invokes handler (KVM handle_exit() -> userspace io)
* handler issues request (userspace io handler -> ioctl() ->
  pci_tsm_guest_req())
* handler supplies response
* VMM posts response, notifies/re-enters TVM

This path is purely a transport for messages from TVM to platform TSM. By
design the host kernel does not and must not care about the content of
these messages. I.e. the host kernel is not in the TCB of the TVM.

As this is an opaque passthrough interface, similar to fwctl, the kernel
requires that implementations stay within the bounds defined by 'enum
pci_tsm_req_scope'. Violation of those expectations likely has market and
regulatory consequences. Out of scope requests are blocked by default.

Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-8-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/tsm.c       | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-tsm.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 120 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c
index 39de91a47a26..5e57501f693e 100644
--- a/drivers/pci/tsm.c
+++ b/drivers/pci/tsm.c
@@ -353,6 +353,66 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id)
 }
 EXPORT_SYMBOL_GPL(pci_tsm_bind);
 
+/**
+ * pci_tsm_guest_req() - helper to marshal guest requests to the TSM driver
+ * @pdev: @pdev representing a bound tdi
+ * @scope: caller asserts this passthrough request is limited to TDISP operations
+ * @req_in: Input payload forwarded from the guest
+ * @in_len: Length of @req_in
+ * @req_out: Output payload buffer response to the guest
+ * @out_len: Length of @req_out on input, bytes filled in @req_out on output
+ * @tsm_code: Optional TSM arch specific result code for the guest TSM
+ *
+ * This is a common entry point for requests triggered by userspace KVM-exit
+ * service handlers responding to TDI information or state change requests. The
+ * scope parameter limits requests to TDISP state management, or limited debug.
+ * This path is only suitable for commands and results that are the host kernel
+ * has no use, the host is only facilitating guest to TSM communication.
+ *
+ * Returns 0 on success and -error on failure and positive "residue" on success
+ * but @req_out is filled with less then @out_len, or @req_out is NULL and a
+ * residue number of bytes were not consumed from @req_in.  On success or
+ * failure @tsm_code may be populated with a TSM implementation specific result
+ * code for the guest to consume.
+ *
+ * Context: Caller is responsible for calling this within the pci_tsm_bind()
+ * state of the TDI.
+ */
+ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope,
+			  sockptr_t req_in, size_t in_len, sockptr_t req_out,
+			  size_t out_len, u64 *tsm_code)
+{
+	struct pci_tsm_pf0 *tsm_pf0;
+	struct pci_tdi *tdi;
+	int rc;
+
+	/* Forbid requests that are not directly related to TDISP operations */
+	if (scope > PCI_TSM_REQ_STATE_CHANGE)
+		return -EINVAL;
+
+	ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &lock)))
+		return rc;
+
+	if (!pdev->tsm)
+		return -ENXIO;
+
+	if (!is_link_tsm(pdev->tsm->tsm_dev))
+		return -ENXIO;
+
+	tsm_pf0 = to_pci_tsm_pf0(pdev->tsm);
+	ACQUIRE(mutex_intr, ops_lock)(&tsm_pf0->lock);
+	if ((rc = ACQUIRE_ERR(mutex_intr, &ops_lock)))
+		return rc;
+
+	tdi = pdev->tsm->tdi;
+	if (!tdi)
+		return -ENXIO;
+	return to_pci_tsm_ops(pdev->tsm)->guest_req(tdi, scope, req_in, in_len,
+						    req_out, out_len, tsm_code);
+}
+EXPORT_SYMBOL_GPL(pci_tsm_guest_req);
+
 static void pci_tsm_unbind_all(struct pci_dev *pdev)
 {
 	pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL);
diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h
index a5e297677917..a6435aba03f9 100644
--- a/include/linux/pci-tsm.h
+++ b/include/linux/pci-tsm.h
@@ -3,6 +3,7 @@
 #define __PCI_TSM_H
 #include <linux/mutex.h>
 #include <linux/pci.h>
+#include <linux/sockptr.h>
 
 struct pci_tsm;
 struct tsm_dev;
@@ -33,14 +34,15 @@ struct pci_tsm_ops {
 	 * @disconnect: teardown the secure link
 	 * @bind: bind a TDI in preparation for it to be accepted by a TVM
 	 * @unbind: remove a TDI from secure operation with a TVM
+	 * @guest_req: marshal TVM information and state change requests
 	 *
 	 * Context: @probe, @remove, @connect, and @disconnect run under
 	 * pci_tsm_rwsem held for write to sync with TSM unregistration and
 	 * mutual exclusion of @connect and @disconnect. @connect and
 	 * @disconnect additionally run under the DSM lock (struct
 	 * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions.
-	 * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM
-	 * lock.
+	 * @bind, @unbind, and @guest_req run under pci_tsm_rwsem held for read
+	 * and the DSM lock.
 	 */
 	struct_group_tagged(pci_tsm_link_ops, link_ops,
 		struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev,
@@ -51,6 +53,11 @@ struct pci_tsm_ops {
 		struct pci_tdi *(*bind)(struct pci_dev *pdev,
 					struct kvm *kvm, u32 tdi_id);
 		void (*unbind)(struct pci_tdi *tdi);
+		ssize_t (*guest_req)(struct pci_tdi *tdi,
+				     enum pci_tsm_req_scope scope,
+				     sockptr_t req_in, size_t in_len,
+				     sockptr_t req_out, size_t out_len,
+				     u64 *tsm_code);
 	);
 
 	/*
@@ -152,6 +159,46 @@ static inline bool is_pci_tsm_pf0(struct pci_dev *pdev)
 	return PCI_FUNC(pdev->devfn) == 0;
 }
 
+/**
+ * enum pci_tsm_req_scope - Scope of guest requests to be validated by TSM
+ *
+ * Guest requests are a transport for a TVM to communicate with a TSM + DSM for
+ * a given TDI. A TSM driver is responsible for maintaining the kernel security
+ * model and limit commands that may affect the host, or are otherwise outside
+ * the typical TDISP operational model.
+ */
+enum pci_tsm_req_scope {
+	/**
+	 * @PCI_TSM_REQ_INFO: Read-only, without side effects, request for
+	 * typical TDISP collateral information like Device Interface Reports.
+	 * No device secrets are permitted, and no device state is changed.
+	 */
+	PCI_TSM_REQ_INFO = 0,
+	/**
+	 * @PCI_TSM_REQ_STATE_CHANGE: Request to change the TDISP state from
+	 * UNLOCKED->LOCKED, LOCKED->RUN, or other architecture specific state
+	 * changes to support those transitions for a TDI. No other (unrelated
+	 * to TDISP) device / host state, configuration, or data change is
+	 * permitted.
+	 */
+	PCI_TSM_REQ_STATE_CHANGE = 1,
+	/**
+	 * @PCI_TSM_REQ_DEBUG_READ: Read-only request for debug information
+	 *
+	 * A method to facilitate TVM information retrieval outside of typical
+	 * TDISP operational requirements. No device secrets are permitted.
+	 */
+	PCI_TSM_REQ_DEBUG_READ = 2,
+	/**
+	 * @PCI_TSM_REQ_DEBUG_WRITE: Device state changes for debug purposes
+	 *
+	 * The request may affect the operational state of the device outside of
+	 * the TDISP operational model. If allowed, requires CAP_SYS_RAW_IO, and
+	 * will taint the kernel.
+	 */
+	PCI_TSM_REQ_DEBUG_WRITE = 3,
+};
+
 #ifdef CONFIG_PCI_TSM
 int pci_tsm_register(struct tsm_dev *tsm_dev);
 void pci_tsm_unregister(struct tsm_dev *tsm_dev);
@@ -166,6 +213,9 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id);
 void pci_tsm_unbind(struct pci_dev *pdev);
 void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi,
 			     struct kvm *kvm, u32 tdi_id);
+ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope,
+			  sockptr_t req_in, size_t in_len, sockptr_t req_out,
+			  size_t out_len, u64 *tsm_code);
 #else
 static inline int pci_tsm_register(struct tsm_dev *tsm_dev)
 {
@@ -181,5 +231,13 @@ static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id
 static inline void pci_tsm_unbind(struct pci_dev *pdev)
 {
 }
+static inline ssize_t pci_tsm_guest_req(struct pci_dev *pdev,
+					enum pci_tsm_req_scope scope,
+					sockptr_t req_in, size_t in_len,
+					sockptr_t req_out, size_t out_len,
+					u64 *tsm_code)
+{
+	return -ENXIO;
+}
 #endif
 #endif /*__PCI_TSM_H */
-- 
cgit v1.2.3


From 6d650ae9282bcec1e76205b44cb8f17e2265052e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 13 Nov 2025 14:03:57 +0000
Subject: tcp: gro: inline tcp_gro_pull_header()

tcp_gro_pull_header() is used in GRO fast path, inline it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251113140358.58242-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/gro.h      | 27 +++++++++++++++++++++++++++
 include/net/tcp.h      |  1 -
 net/ipv4/tcp_offload.c | 27 ---------------------------
 3 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index e3affb2e2ca8..b65f631c521d 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -593,4 +593,31 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
 
+static inline struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+	unsigned int thlen, hlen, off;
+	struct tcphdr *th;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*th);
+	th = skb_gro_header(skb, hlen, off);
+	if (unlikely(!th))
+		return NULL;
+
+	thlen = th->doff * 4;
+	if (unlikely(thlen < sizeof(*th)))
+		return NULL;
+
+	hlen = off + thlen;
+	if (!skb_gro_may_pull(skb, hlen)) {
+		th = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!th))
+			return NULL;
+	}
+
+	skb_gro_pull(skb, thlen);
+
+	return th;
+}
+
 #endif /* _NET_GRO_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4833ec7903ec..0deb5e9dd911 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2313,7 +2313,6 @@ void tcp_v4_destroy_sock(struct sock *sk);
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
-struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
 struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
 				struct tcphdr *th);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2cb93da93abc..fdda18b1abda 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -282,33 +282,6 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
 	return NULL;
 }
 
-struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
-{
-	unsigned int thlen, hlen, off;
-	struct tcphdr *th;
-
-	off = skb_gro_offset(skb);
-	hlen = off + sizeof(*th);
-	th = skb_gro_header(skb, hlen, off);
-	if (unlikely(!th))
-		return NULL;
-
-	thlen = th->doff * 4;
-	if (thlen < sizeof(*th))
-		return NULL;
-
-	hlen = off + thlen;
-	if (!skb_gro_may_pull(skb, hlen)) {
-		th = skb_gro_header_slow(skb, hlen, off);
-		if (unlikely(!th))
-			return NULL;
-	}
-
-	skb_gro_pull(skb, thlen);
-
-	return th;
-}
-
 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
 				struct tcphdr *th)
 {
-- 
cgit v1.2.3


From 06ac470658190e97518f131df01c9c530c293320 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 13 Nov 2025 19:45:01 +0800
Subject: sctp: Remove unused declaration sctp_auth_init_hmacs()

Commit bf40785fa437 ("sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk
authentication") removed the implementation but left the declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Link: https://patch.msgid.link/20251113114501.32905-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/auth.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index 3d5879e08e78..6f2cd562b1de 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -72,7 +72,6 @@ struct sctp_shared_key *sctp_auth_get_shkey(
 int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
 				struct sctp_association *asoc,
 				gfp_t gfp);
-int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp);
 const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id);
 const struct sctp_hmac *
 sctp_auth_asoc_get_hmac(const struct sctp_association *asoc);
-- 
cgit v1.2.3


From 4cc1aa469cd6b714adc958547a4866247bfd60a9 Mon Sep 17 00:00:00 2001
From: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Date: Fri, 17 Oct 2025 11:58:17 -0700
Subject: mshv: Fix deposit memory in MSHV_ROOT_HVCALL

When the MSHV_ROOT_HVCALL ioctl is executing a hypercall, and gets
HV_STATUS_INSUFFICIENT_MEMORY, it deposits memory and then returns
-EAGAIN to userspace. The expectation is that the VMM will retry.

However, some VMM code in the wild doesn't do this and simply fails.
Rather than force the VMM to retry, change the ioctl to deposit
memory on demand and immediately retry the hypercall as is done with
all the other hypercall helper functions.

In addition to making the ioctl easier to use, removing the need for
multiple syscalls improves performance.

There is a complication: unlike the other hypercall helper functions,
in MSHV_ROOT_HVCALL the input is opaque to the kernel. This is
problematic for rep hypercalls, because the next part of the input
list can't be copied on each loop after depositing pages (this was
the original reason for returning -EAGAIN in this case).

Introduce hv_do_rep_hypercall_ex(), which adds a 'rep_start'
parameter. This solves the issue, allowing the deposit loop in
MSHV_ROOT_HVCALL to restart a rep hypercall after depositing pages
partway through.

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root_main.c    | 58 ++++++++++++++++++++++--------------------
 include/asm-generic/mshyperv.h | 17 ++++++++++---
 2 files changed, 44 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 8a42d9961466..a599bbf1f9b8 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -159,6 +159,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
 	unsigned int pages_order;
 	void *input_pg = NULL;
 	void *output_pg = NULL;
+	u16 reps_completed;
 
 	if (copy_from_user(&args, user_args, sizeof(args)))
 		return -EFAULT;
@@ -210,41 +211,42 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
 	 */
 	*(u64 *)input_pg = partition->pt_id;
 
-	if (args.reps)
-		status = hv_do_rep_hypercall(args.code, args.reps, 0,
-					     input_pg, output_pg);
-	else
-		status = hv_do_hypercall(args.code, input_pg, output_pg);
-
-	if (hv_result(status) == HV_STATUS_CALL_PENDING) {
-		if (is_async) {
-			mshv_async_hvcall_handler(partition, &status);
-		} else { /* Paranoia check. This shouldn't happen! */
-			ret = -EBADFD;
-			goto free_pages_out;
+	reps_completed = 0;
+	do {
+		if (args.reps) {
+			status = hv_do_rep_hypercall_ex(args.code, args.reps,
+							0, reps_completed,
+							input_pg, output_pg);
+			reps_completed = hv_repcomp(status);
+		} else {
+			status = hv_do_hypercall(args.code, input_pg, output_pg);
 		}
-	}
 
-	if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
-		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
-		if (!ret)
-			ret = -EAGAIN;
-	} else if (!hv_result_success(status)) {
-		ret = hv_result_to_errno(status);
-	}
+		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+			if (is_async) {
+				mshv_async_hvcall_handler(partition, &status);
+			} else { /* Paranoia check. This shouldn't happen! */
+				ret = -EBADFD;
+				goto free_pages_out;
+			}
+		}
+
+		if (hv_result_success(status))
+			break;
+
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+			ret = hv_result_to_errno(status);
+		else
+			ret = hv_call_deposit_pages(NUMA_NO_NODE,
+						    partition->pt_id, 1);
+	} while (!ret);
 
-	/*
-	 * Always return the status and output data regardless of result.
-	 * The VMM may need it to determine how to proceed. E.g. the status may
-	 * contain the number of reps completed if a rep hypercall partially
-	 * succeeded.
-	 */
 	args.status = hv_result(status);
-	args.reps = args.reps ? hv_repcomp(status) : 0;
+	args.reps = reps_completed;
 	if (copy_to_user(user_args, &args, sizeof(args)))
 		ret = -EFAULT;
 
-	if (output_pg &&
+	if (!ret && output_pg &&
 	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
 		ret = -EFAULT;
 
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 64ba6bc807d9..b89c7e3a2047 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -124,10 +124,12 @@ static inline unsigned int hv_repcomp(u64 status)
 
 /*
  * Rep hypercalls. Callers of this functions are supposed to ensure that
- * rep_count and varhead_size comply with Hyper-V hypercall definition.
+ * rep_count, varhead_size, and rep_start comply with Hyper-V hypercall
+ * definition.
  */
-static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
-				      void *input, void *output)
+static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count,
+					 u16 varhead_size, u16 rep_start,
+					 void *input, void *output)
 {
 	u64 control = code;
 	u64 status;
@@ -135,6 +137,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 
 	control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
 	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
+	control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET;
 
 	do {
 		status = hv_do_hypercall(control, input, output);
@@ -152,6 +155,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 	return status;
 }
 
+/* For the typical case where rep_start is 0 */
+static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
+				      void *input, void *output)
+{
+	return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0,
+				      input, output);
+}
+
 /* Generate the guest OS identifier as described in the Hyper-V TLFS */
 static inline u64 hv_generate_guest_id(u64 kernel_version)
 {
-- 
cgit v1.2.3


From 3e1b611515d286c6725028e17170f7143e5e51fc Mon Sep 17 00:00:00 2001
From: Tianyu Lan <ltykernel@gmail.com>
Date: Thu, 18 Sep 2025 11:00:20 -0400
Subject: drivers: hv: Allow vmbus message synic interrupt injected from
 Hyper-V

When Secure AVIC is enabled, the VMBus driver should
call the x2apic Secure AVIC interface to allow Hyper-V
to inject the VMBus message interrupt.

Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/hyperv/hv_apic.c      | 5 +++++
 drivers/hv/hv.c                | 2 ++
 drivers/hv/hv_common.c         | 5 +++++
 include/asm-generic/mshyperv.h | 1 +
 4 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index e669053b637d..a8de503def37 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
 	wrmsrq(HV_X64_MSR_ICR, reg_val);
 }
 
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+	apic_update_vector(cpu, vector, set);
+}
+
 static u32 hv_apic_read(u32 reg)
 {
 	u32 reg_val, hi;
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index b14c5f9e0ef2..ec5d10839e0f 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -307,6 +307,7 @@ void hv_synic_enable_regs(unsigned int cpu)
 	}
 
 	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+	hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
 
 	/* Setup the shared SINT. */
 	if (vmbus_irq != -1)
@@ -350,6 +351,7 @@ void hv_synic_disable_regs(unsigned int cpu)
 	/* Need to correctly cleanup in the case of SMP!!! */
 	/* Disable the interrupt */
 	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+	hv_enable_coco_interrupt(cpu, vmbus_interrupt, false);
 
 	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
 	/*
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index e109a620c83f..3e41f686daa5 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -716,6 +716,11 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
 }
 EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
 
+void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+}
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
+
 void hv_identify_partition_type(void)
 {
 	/* Assume guest role */
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index b89c7e3a2047..db84aced1658 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -347,6 +347,7 @@ bool hv_is_isolation_supported(void);
 bool hv_isolation_type_snp(void);
 u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
 u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
-- 
cgit v1.2.3


From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Wed, 8 Oct 2025 16:34:04 -0700
Subject: Drivers: hv: VMBus protocol version 6.0

Confidential VMBus is supported starting with protocol
version 6.0.

Provide the required definitions. No functional changes.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Reviewed-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/hyperv_vmbus.h   |  2 ++
 drivers/hv/vmbus_drv.c      | 12 ++++++++
 include/hyperv/hvgdk_mini.h |  1 +
 include/linux/hyperv.h      | 69 ++++++++++++++++++++++++++++++++-------------
 4 files changed, 65 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 0b450e53161e..4a01797d4851 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -333,6 +333,8 @@ extern const struct vmbus_channel_message_table_entry
 
 /* General vmbus interface */
 
+bool vmbus_is_confidential(void);
+
 struct hv_device *vmbus_device_create(const guid_t *type,
 				      const guid_t *instance,
 				      struct vmbus_channel *channel);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 69591dc7bad2..3c414560fa5f 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -56,6 +56,18 @@ static long __percpu *vmbus_evt;
 int vmbus_irq;
 int vmbus_interrupt;
 
+/*
+ * If the Confidential VMBus is used, the data on the "wire" is not
+ * visible to either the host or the hypervisor.
+ */
+static bool is_confidential;
+
+bool vmbus_is_confidential(void)
+{
+	return is_confidential;
+}
+EXPORT_SYMBOL_GPL(vmbus_is_confidential);
+
 /*
  * The panic notifier below is responsible solely for unloading the
  * vmbus connection, which is necessary in a panic event.
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 77abddfc750e..7f730a0e54e6 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -260,6 +260,7 @@ union hv_hypervisor_version_info {
 #define HYPERV_CPUID_VIRT_STACK_PROPERTIES	 0x40000082
 /* Support for the extended IOAPIC RTE format */
 #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE	 BIT(2)
+#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE	 BIT(3)
 
 #define HYPERV_HYPERVISOR_PRESENT_BIT		 0x80000000
 #define HYPERV_CPUID_MIN			 0x40000005
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 59826c89171c..dfc516c1c719 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent(
  * Linux kernel.
  */
 
-#define VERSION_WS2008  ((0 << 16) | (13))
-#define VERSION_WIN7    ((1 << 16) | (1))
-#define VERSION_WIN8    ((2 << 16) | (4))
-#define VERSION_WIN8_1    ((3 << 16) | (0))
-#define VERSION_WIN10 ((4 << 16) | (0))
-#define VERSION_WIN10_V4_1 ((4 << 16) | (1))
-#define VERSION_WIN10_V5 ((5 << 16) | (0))
-#define VERSION_WIN10_V5_1 ((5 << 16) | (1))
-#define VERSION_WIN10_V5_2 ((5 << 16) | (2))
-#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
+#define VMBUS_MAKE_VERSION(MAJ, MIN)	((((u32)MAJ) << 16) | (MIN))
+#define VERSION_WS2008					VMBUS_MAKE_VERSION(0, 13)
+#define VERSION_WIN7					VMBUS_MAKE_VERSION(1, 1)
+#define VERSION_WIN8					VMBUS_MAKE_VERSION(2, 4)
+#define VERSION_WIN8_1					VMBUS_MAKE_VERSION(3, 0)
+#define VERSION_WIN10					VMBUS_MAKE_VERSION(4, 0)
+#define VERSION_WIN10_V4_1				VMBUS_MAKE_VERSION(4, 1)
+#define VERSION_WIN10_V5				VMBUS_MAKE_VERSION(5, 0)
+#define VERSION_WIN10_V5_1				VMBUS_MAKE_VERSION(5, 1)
+#define VERSION_WIN10_V5_2				VMBUS_MAKE_VERSION(5, 2)
+#define VERSION_WIN10_V5_3				VMBUS_MAKE_VERSION(5, 3)
+#define VERSION_WIN10_V6_0				VMBUS_MAKE_VERSION(6, 0)
 
 /* Make maximum size of pipe payload of 16K */
 #define MAX_PIPE_DATA_PAYLOAD		(sizeof(u8) * 16384)
@@ -335,14 +337,22 @@ struct vmbus_channel_offer {
 } __packed;
 
 /* Server Flags */
-#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE	1
-#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES	2
-#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS		4
-#define VMBUS_CHANNEL_NAMED_PIPE_MODE			0x10
-#define VMBUS_CHANNEL_LOOPBACK_OFFER			0x100
-#define VMBUS_CHANNEL_PARENT_OFFER			0x200
-#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION	0x400
-#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER		0x2000
+#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE		0x0001
+/*
+ * This flag indicates that the channel is offered by the paravisor, and must
+ * use encrypted memory for the channel ring buffer.
+ */
+#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER			0x0002
+/*
+ * This flag indicates that the channel is offered by the paravisor, and must
+ * use encrypted memory for GPA direct packets and additional GPADLs.
+ */
+#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY		0x0004
+#define VMBUS_CHANNEL_NAMED_PIPE_MODE					0x0010
+#define VMBUS_CHANNEL_LOOPBACK_OFFER					0x0100
+#define VMBUS_CHANNEL_PARENT_OFFER						0x0200
+#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION	0x0400
+#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER				0x2000
 
 struct vmpacket_descriptor {
 	u16 type;
@@ -621,6 +631,12 @@ struct vmbus_channel_relid_released {
 	u32 child_relid;
 } __packed;
 
+/*
+ * Used by the paravisor only, means that the encrypted ring buffers and
+ * the encrypted external memory are supported
+ */
+#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS	0x10
+
 struct vmbus_channel_initiate_contact {
 	struct vmbus_channel_message_header header;
 	u32 vmbus_version_requested;
@@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact {
 		struct {
 			u8	msg_sint;
 			u8	msg_vtl;
-			u8	reserved[6];
+			u8	reserved[2];
+			u32 feature_flags; /* VMBus version 6.0 */
 		};
 	};
 	u64 monitor_page1;
@@ -1003,6 +1020,10 @@ struct vmbus_channel {
 
 	/* boolean to control visibility of sysfs for ring buffer */
 	bool ring_sysfs_visible;
+	/* The ring buffer is encrypted */
+	bool co_ring_buffer;
+	/* The external memory is encrypted */
+	bool co_external_memory;
 };
 
 #define lock_requestor(channel, flags)					\
@@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id,
 			     u64 rqst_addr);
 u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id);
 
+static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o)
+{
+	return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER);
+}
+
+static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o)
+{
+	return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY);
+}
+
 static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o)
 {
 	return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER);
-- 
cgit v1.2.3


From 7c8b6c326d830ca5c6b95f390c703966e14167e6 Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Wed, 8 Oct 2025 16:34:05 -0700
Subject: arch/x86: mshyperv: Discover Confidential VMBus availability

Confidential VMBus requires enabling paravisor SynIC, and
the x86_64 guest has to inspect the Virtualization Stack (VS)
CPUID leaf to see if Confidential VMBus is available. If it is,
the guest shall enable the paravisor SynIC.

Read the relevant data from the VS CPUID leaf. Refactor the
code to avoid repeating CPUID and add flags to the struct
ms_hyperv_info. For ARM64, the flag for Confidential VMBus
is not set which provides the desired behaviour for now as
it is not available on ARM64 just yet. Once ARM64 CCA guests
are supported, this flag will be set unconditionally when
running such a guest.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c | 28 +++++++++++++++-------------
 include/asm-generic/mshyperv.h |  2 ++
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 217a41b63df3..1369fae7d8a9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -440,7 +440,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
 
 static void __init ms_hyperv_init_platform(void)
 {
-	int hv_max_functions_eax;
+	int hv_max_functions_eax, eax;
 
 #ifdef CONFIG_PARAVIRT
 	pv_info.name = "Hyper-V";
@@ -478,6 +478,19 @@ static void __init ms_hyperv_init_platform(void)
 		pr_info("Hyper-V: running on a nested hypervisor\n");
 	}
 
+	/*
+	 * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+	 * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+	 * root) will not get them because the nested (L1) hypervisor filters them out.
+	 * These are handled through intercept processing by the Windows Hyper-V stack
+	 * or the paravisor.
+	 */
+	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+	ms_hyperv.confidential_vmbus_available =
+		eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+	ms_hyperv.msi_ext_dest_id =
+		eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
 	if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
 	    ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
 		x86_platform.calibrate_tsc = hv_get_tsc_khz;
@@ -678,21 +691,10 @@ static bool __init ms_hyperv_x2apic_available(void)
  * pci-hyperv host bridge.
  *
  * Note: for a Hyper-V root partition, this will always return false.
- * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
- * default, they are implemented as intercepts by the Windows Hyper-V stack.
- * Even a nested root partition (L2 root) will not get them because the
- * nested (L1) hypervisor filters them out.
  */
 static bool __init ms_hyperv_msi_ext_dest_id(void)
 {
-	u32 eax;
-
-	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
-	if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
-		return false;
-
-	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
-	return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+	return ms_hyperv.msi_ext_dest_id;
 }
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index db84aced1658..8da1893365f0 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -62,6 +62,8 @@ struct ms_hyperv_info {
 		};
 	};
 	u64 shared_gpa_boundary;
+	bool msi_ext_dest_id;
+	bool confidential_vmbus_available;
 };
 extern struct ms_hyperv_info ms_hyperv;
 extern bool hv_nested;
-- 
cgit v1.2.3


From e6eeb3c782739cd1613a8da856b878b99f741943 Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Wed, 8 Oct 2025 16:34:06 -0700
Subject: arch: hyperv: Get/set SynIC synth.registers via paravisor

The existing Hyper-V wrappers for getting and setting MSRs are
hv_get/set_msr(). Via hv_get/set_non_nested_msr(), they detect
when running in a CoCo VM with a paravisor, and use the TDX or
SNP guest-host communication protocol to bypass the paravisor
and go directly to the host hypervisor for SynIC MSRs. The "set"
function also implements the required special handling for the
SINT MSRs.

Provide functions that allow manipulating the SynIC registers
through the paravisor. Move vmbus_signal_eom() to a more
appropriate location (which also avoids breaking KVM).

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Reviewed-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c | 20 +++++++++++++++++++
 drivers/hv/hv_common.c         | 11 +++++++++++
 drivers/hv/hyperv_vmbus.h      | 44 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/mshyperv.h | 42 ++--------------------------------------
 4 files changed, 77 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 1369fae7d8a9..eb9f1d7eec61 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -86,6 +86,26 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value)
 }
 EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
 
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+	if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+		return ~0ULL;
+	return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+	if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+		return;
+	native_write_msr(reg, val);
+}
+
 u64 hv_get_msr(unsigned int reg)
 {
 	if (hv_nested)
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 3e41f686daa5..c39472ae8345 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -721,6 +721,17 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool
 }
 EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
 
+u64 __weak hv_para_get_synic_register(unsigned int reg)
+{
+	return ~0ULL;
+}
+EXPORT_SYMBOL_GPL(hv_para_get_synic_register);
+
+void __weak hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_synic_register);
+
 void hv_identify_partition_type(void)
 {
 	/* Assume guest role */
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 4a01797d4851..9ac6f5520287 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/bitops.h>
 #include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
 #include <linux/atomic.h>
 #include <linux/hyperv.h>
 #include <linux/interrupt.h>
@@ -335,6 +336,49 @@ extern const struct vmbus_channel_message_table_entry
 
 bool vmbus_is_confidential(void);
 
+#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
+/* Free the message slot and signal end-of-message if required */
+static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
+{
+	/*
+	 * On crash we're reading some other CPU's message page and we need
+	 * to be careful: this other CPU may already have cleared the header
+	 * and the host may already have delivered some other message there.
+	 * In case we blindly write msg->header.message_type we're going
+	 * to lose it. We can still lose a message of the same type but
+	 * we count on the fact that there can only be one
+	 * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
+	 * on crash.
+	 */
+	if (cmpxchg(&msg->header.message_type, old_msg_type,
+		    HVMSG_NONE) != old_msg_type)
+		return;
+
+	/*
+	 * The cmpxchg() above does an implicit memory barrier to
+	 * ensure the write to MessageType (ie set to
+	 * HVMSG_NONE) happens before we read the
+	 * MessagePending and EOMing. Otherwise, the EOMing
+	 * will not deliver any more messages since there is
+	 * no empty slot
+	 */
+	if (msg->header.message_flags.msg_pending) {
+		/*
+		 * This will cause message queue rescan to
+		 * possibly deliver another msg from the
+		 * hypervisor
+		 */
+		if (vmbus_is_confidential())
+			hv_para_set_synic_register(HV_MSR_EOM, 0);
+		else
+			hv_set_msr(HV_MSR_EOM, 0);
+	}
+}
+
+extern int vmbus_interrupt;
+extern int vmbus_irq;
+#endif /* CONFIG_HYPERV_VMBUS */
+
 struct hv_device *vmbus_device_create(const guid_t *type,
 				      const guid_t *instance,
 				      struct vmbus_channel *channel);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 8da1893365f0..c328265de624 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -176,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version)
 	return guest_id;
 }
 
-#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
-/* Free the message slot and signal end-of-message if required */
-static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
-{
-	/*
-	 * On crash we're reading some other CPU's message page and we need
-	 * to be careful: this other CPU may already had cleared the header
-	 * and the host may already had delivered some other message there.
-	 * In case we blindly write msg->header.message_type we're going
-	 * to lose it. We can still lose a message of the same type but
-	 * we count on the fact that there can only be one
-	 * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
-	 * on crash.
-	 */
-	if (cmpxchg(&msg->header.message_type, old_msg_type,
-		    HVMSG_NONE) != old_msg_type)
-		return;
-
-	/*
-	 * The cmxchg() above does an implicit memory barrier to
-	 * ensure the write to MessageType (ie set to
-	 * HVMSG_NONE) happens before we read the
-	 * MessagePending and EOMing. Otherwise, the EOMing
-	 * will not deliver any more messages since there is
-	 * no empty slot
-	 */
-	if (msg->header.message_flags.msg_pending) {
-		/*
-		 * This will cause message queue rescan to
-		 * possibly deliver another msg from the
-		 * hypervisor
-		 */
-		hv_set_msr(HV_MSR_EOM, 0);
-	}
-}
-
-extern int vmbus_interrupt;
-extern int vmbus_irq;
-#endif /* CONFIG_HYPERV_VMBUS */
-
 int hv_get_hypervisor_version(union hv_hypervisor_version_info *info);
 
 void hv_setup_vmbus_handler(void (*handler)(void));
@@ -350,6 +310,8 @@ bool hv_isolation_type_snp(void);
 u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
 u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
 void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set);
+u64 hv_para_get_synic_register(unsigned int reg);
+void hv_para_set_synic_register(unsigned int reg, u64 val);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
-- 
cgit v1.2.3


From a156ad8c508209ce22f3213d25c3c2ae1774a57d Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Wed, 8 Oct 2025 16:34:07 -0700
Subject: arch/x86: mshyperv: Trap on access for some synthetic MSRs

hv_set_non_nested_msr() has special handling for SINT MSRs
when a paravisor is present. In addition to updating the MSR on the
host, the mirror MSR in the paravisor is updated, including with the
proxy bit. But with Confidential VMBus, the proxy bit must not be
used, so add a special case to skip it.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Reviewed-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Tianyu Lan <tiala@microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c | 29 +++++++++++++++++++++++++----
 drivers/hv/hv_common.c         |  5 +++++
 include/asm-generic/mshyperv.h |  1 +
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index eb9f1d7eec61..80a641a6ac48 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -28,6 +28,7 @@
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/reboot.h>
+#include <asm/msr.h>
 #include <asm/nmi.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/msr.h>
@@ -39,6 +40,12 @@ bool hv_nested;
 struct ms_hyperv_info ms_hyperv;
 
 #if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
 static inline unsigned int hv_get_nested_msr(unsigned int reg)
 {
 	if (hv_is_sint_msr(reg))
@@ -75,17 +82,31 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
 void hv_set_non_nested_msr(unsigned int reg, u64 value)
 {
 	if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+		/* The hypervisor will get the intercept. */
 		hv_ivm_msr_write(reg, value);
 
-		/* Write proxy bit via wrmsl instruction */
-		if (hv_is_sint_msr(reg))
-			wrmsrq(reg, value | 1 << 20);
+		/* Using wrmsrq so the following goes to the paravisor. */
+		if (hv_is_sint_msr(reg)) {
+			union hv_synic_sint sint = { .as_uint64 = value };
+
+			sint.proxy = hv_para_sint_proxy;
+			native_wrmsrq(reg, sint.as_uint64);
+		}
 	} else {
-		wrmsrq(reg, value);
+		native_wrmsrq(reg, value);
 	}
 }
 EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
 
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+	hv_para_sint_proxy = enable;
+}
+
 /*
  * Get the SynIC register value from the paravisor.
  */
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index c39472ae8345..4b44da7f23b2 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -721,6 +721,11 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool
 }
 EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
 
+void __weak hv_para_set_sint_proxy(bool enable)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy);
+
 u64 __weak hv_para_get_synic_register(unsigned int reg)
 {
 	return ~0ULL;
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index c328265de624..ecedab554c80 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -310,6 +310,7 @@ bool hv_isolation_type_snp(void);
 u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
 u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
 void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set);
+void hv_para_set_sint_proxy(bool enable);
 u64 hv_para_get_synic_register(unsigned int reg);
 void hv_para_set_synic_register(unsigned int reg, u64 val);
 void hyperv_cleanup(void);
-- 
cgit v1.2.3


From 59aeea195948fd507cef2e439a5a964b8432750e Mon Sep 17 00:00:00 2001
From: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
Date: Fri, 10 Oct 2025 14:55:48 -0700
Subject: mshv: Add the HVCALL_GET_PARTITION_PROPERTY_EX hypercall

This hypercall can be used to fetch extended properties of a
partition. Extended properties are properties with values larger than
a u64. Some of these also need additional input arguments.

Add a helper function for using the hypercall in the mshv_root driver.

Signed-off-by: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Anirudh Rayabharam <anirudh@anirudhrb.com>
Reviewed-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Reviewed-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Reviewed-by: Tianyu Lan <tiala@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root.h         |  2 ++
 drivers/hv/mshv_root_hv_call.c | 31 +++++++++++++++++++++++++++++++
 include/hyperv/hvgdk_mini.h    |  1 +
 include/hyperv/hvhdk.h         | 40 ++++++++++++++++++++++++++++++++++++++++
 include/hyperv/hvhdk_mini.h    | 26 ++++++++++++++++++++++++++
 5 files changed, 100 insertions(+)

(limited to 'include')

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index db6b42db2fdc..0e62badfc9f1 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -303,6 +303,8 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type,
 int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 				   u64 page_struct_count, u32 host_access,
 				   u32 flags, u8 acquire);
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
+				      void *property_value, size_t property_value_sz);
 
 extern struct mshv_root mshv_root;
 extern enum hv_scheduler_type hv_scheduler_type;
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index c9c274f29c3c..8049e51c45dc 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -590,6 +590,37 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
 	return hv_result_to_errno(status);
 }
 
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code,
+				      u64 arg, void *property_value,
+				      size_t property_value_sz)
+{
+	u64 status;
+	unsigned long flags;
+	struct hv_input_get_partition_property_ex *input;
+	struct hv_output_get_partition_property_ex *output;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	memset(input, 0, sizeof(*input));
+	input->partition_id = partition_id;
+	input->property_code = property_code;
+	input->arg = arg;
+	status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output);
+
+	if (!hv_result_success(status)) {
+		local_irq_restore(flags);
+		hv_status_debug(status, "\n");
+		return hv_result_to_errno(status);
+	}
+	memcpy(property_value, &output->property_value, property_value_sz);
+
+	local_irq_restore(flags);
+
+	return 0;
+}
+
 int
 hv_call_clear_virtual_interrupt(u64 partition_id)
 {
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 7f730a0e54e6..af85b1c36b6e 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -491,6 +491,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_GET_VP_STATE				0x00e3
 #define HVCALL_SET_VP_STATE				0x00e4
 #define HVCALL_GET_VP_CPUID_VALUES			0x00f4
+#define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
 #define HVCALL_MMIO_READ				0x0106
 #define HVCALL_MMIO_WRITE				0x0107
 
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index b4067ada02cf..416c0d45b793 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -376,6 +376,46 @@ struct hv_input_set_partition_property {
 	u64 property_value;
 } __packed;
 
+union hv_partition_property_arg {
+	u64 as_uint64;
+	struct {
+		union {
+			u32 arg;
+			u32 vp_index;
+		};
+		u16 reserved0;
+		u8 reserved1;
+		u8 object_type;
+	} __packed;
+};
+
+struct hv_input_get_partition_property_ex {
+	u64 partition_id;
+	u32 property_code; /* enum hv_partition_property_code */
+	u32 padding;
+	union {
+		union hv_partition_property_arg arg_data;
+		u64 arg;
+	};
+} __packed;
+
+/*
+ * NOTE: Should use hv_input_set_partition_property_ex_header to compute this
+ * size, but hv_input_get_partition_property_ex is identical so it suffices
+ */
+#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \
+	(HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex))
+
+union hv_partition_property_ex {
+	u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE];
+	struct hv_partition_property_vmm_capabilities vmm_capabilities;
+	/* More fields to be filled in when needed */
+};
+
+struct hv_output_get_partition_property_ex {
+	union hv_partition_property_ex property_value;
+} __packed;
+
 enum hv_vp_state_page_type {
 	HV_VP_STATE_PAGE_REGISTERS = 0,
 	HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1,
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index 858f6a3925b3..bf2ce27dfcc5 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -96,8 +96,34 @@ enum hv_partition_property_code {
 	HV_PARTITION_PROPERTY_XSAVE_STATES                      = 0x00060007,
 	HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE		= 0x00060008,
 	HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY		= 0x00060009,
+
+	/* Extended properties with larger property values */
+	HV_PARTITION_PROPERTY_VMM_CAPABILITIES			= 0x00090007,
 };
 
+#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT		1
+#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT	59
+
+struct hv_partition_property_vmm_capabilities {
+	u16 bank_count;
+	u16 reserved[3];
+	union {
+		u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT];
+		struct {
+			u64 map_gpa_preserve_adjustable: 1;
+			u64 vmm_can_provide_overlay_gpfn: 1;
+			u64 vp_affinity_property: 1;
+#if IS_ENABLED(CONFIG_ARM64)
+			u64 vmm_can_provide_gic_overlay_locations: 1;
+#else
+			u64 reservedbit3: 1;
+#endif
+			u64 assignable_synthetic_proc_features: 1;
+			u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT;
+		} __packed;
+	};
+} __packed;
+
 enum hv_snp_status {
 	HV_SNP_STATUS_NONE = 0,
 	HV_SNP_STATUS_AVAILABLE = 1,
-- 
cgit v1.2.3


From d62313bdf5961b5f815f0b212f029cf146a8a804 Mon Sep 17 00:00:00 2001
From: Jinank Jain <jinankjain@linux.microsoft.com>
Date: Fri, 10 Oct 2025 14:55:51 -0700
Subject: mshv: Introduce new hypercall to map stats page for L1VH partitions

Introduce HVCALL_MAP_STATS_PAGE2 which provides a map location (GPFN)
to map the stats to. This hypercall is required for L1VH partitions,
depending on the hypervisor version. This uses the same check as the
state page map location: mshv_use_overlay_gpfn().

Add hv_map_stats_page() and hv_unmap_stats_page() helpers; the map helper
uses this new hypercall or the old one depending on availability.

For unmapping, the original HVCALL_UNMAP_STATS_PAGE works for both
cases.

Signed-off-by: Jinank Jain <jinankjain@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root.h         | 10 ++---
 drivers/hv/mshv_root_hv_call.c | 95 +++++++++++++++++++++++++++++++++++++++---
 drivers/hv/mshv_root_main.c    | 22 +++++-----
 include/hyperv/hvgdk_mini.h    |  1 +
 include/hyperv/hvhdk_mini.h    |  7 ++++
 5 files changed, 115 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 5b57d894358a..3eb815011b46 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -297,11 +297,11 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
 int hv_call_disconnect_port(u64 connection_partition_id,
 			    union hv_connection_id connection_id);
 int hv_call_notify_port_ring_empty(u32 sint_index);
-int hv_call_map_stat_page(enum hv_stats_object_type type,
-			  const union hv_stats_object_identity *identity,
-			  void **addr);
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
-			    const union hv_stats_object_identity *identity);
+int hv_map_stats_page(enum hv_stats_object_type type,
+		      const union hv_stats_object_identity *identity,
+		      void **addr);
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+			const union hv_stats_object_identity *identity);
 int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 				   u64 page_struct_count, u32 host_access,
 				   u32 flags, u8 acquire);
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 6dac9fcc092c..caf02cfa49c9 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -807,9 +807,51 @@ hv_call_notify_port_ring_empty(u32 sint_index)
 	return hv_result_to_errno(status);
 }
 
-int hv_call_map_stat_page(enum hv_stats_object_type type,
-			  const union hv_stats_object_identity *identity,
-			  void **addr)
+static int hv_call_map_stats_page2(enum hv_stats_object_type type,
+				   const union hv_stats_object_identity *identity,
+				   u64 map_location)
+{
+	unsigned long flags;
+	struct hv_input_map_stats_page2 *input;
+	u64 status;
+	int ret;
+
+	if (!map_location || !mshv_use_overlay_gpfn())
+		return -EINVAL;
+
+	do {
+		local_irq_save(flags);
+		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+		memset(input, 0, sizeof(*input));
+		input->type = type;
+		input->identity = *identity;
+		input->map_location = map_location;
+
+		status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL);
+
+		local_irq_restore(flags);
+
+		ret = hv_result_to_errno(status);
+
+		if (!ret)
+			break;
+
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+			hv_status_debug(status, "\n");
+			break;
+		}
+
+		ret = hv_call_deposit_pages(NUMA_NO_NODE,
+					    hv_current_partition_id, 1);
+	} while (!ret);
+
+	return ret;
+}
+
+static int hv_call_map_stats_page(enum hv_stats_object_type type,
+				  const union hv_stats_object_identity *identity,
+				  void **addr)
 {
 	unsigned long flags;
 	struct hv_input_map_stats_page *input;
@@ -848,8 +890,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type,
 	return ret;
 }
 
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
-			    const union hv_stats_object_identity *identity)
+int hv_map_stats_page(enum hv_stats_object_type type,
+		      const union hv_stats_object_identity *identity,
+		      void **addr)
+{
+	int ret;
+	struct page *allocated_page = NULL;
+
+	if (!addr)
+		return -EINVAL;
+
+	if (mshv_use_overlay_gpfn()) {
+		allocated_page = alloc_page(GFP_KERNEL);
+		if (!allocated_page)
+			return -ENOMEM;
+
+		ret = hv_call_map_stats_page2(type, identity,
+					      page_to_pfn(allocated_page));
+		*addr = page_address(allocated_page);
+	} else {
+		ret = hv_call_map_stats_page(type, identity, addr);
+	}
+
+	if (ret && allocated_page) {
+		__free_page(allocated_page);
+		*addr = NULL;
+	}
+
+	return ret;
+}
+
+static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
+				    const union hv_stats_object_identity *identity)
 {
 	unsigned long flags;
 	struct hv_input_unmap_stats_page *input;
@@ -868,6 +940,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type,
 	return hv_result_to_errno(status);
 }
 
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+			const union hv_stats_object_identity *identity)
+{
+	int ret;
+
+	ret = hv_call_unmap_stats_page(type, identity);
+
+	if (mshv_use_overlay_gpfn() && page_addr)
+		__free_page(virt_to_page(page_addr));
+
+	return ret;
+}
+
 int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 				   u64 page_struct_count, u32 host_access,
 				   u32 flags, u8 acquire)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 288695a859f7..7684645ef00d 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -843,7 +843,8 @@ mshv_vp_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
+static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+				void *stats_pages[])
 {
 	union hv_stats_object_identity identity = {
 		.vp.partition_id = partition_id,
@@ -851,10 +852,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
 	};
 
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
-	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
 
 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
-	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
 }
 
 static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
@@ -867,14 +868,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
 	int err;
 
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
-	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
-				    &stats_pages[HV_STATS_AREA_SELF]);
+	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+				&stats_pages[HV_STATS_AREA_SELF]);
 	if (err)
 		return err;
 
 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
-	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
-				    &stats_pages[HV_STATS_AREA_PARENT]);
+	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+				&stats_pages[HV_STATS_AREA_PARENT]);
 	if (err)
 		goto unmap_self;
 
@@ -882,7 +883,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
 
 unmap_self:
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
-	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
 	return err;
 }
 
@@ -990,7 +991,7 @@ free_vp:
 	kfree(vp);
 unmap_stats_pages:
 	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-		mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
+		mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
 unmap_ghcb_page:
 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
@@ -1742,7 +1743,8 @@ static void destroy_partition(struct mshv_partition *partition)
 				continue;
 
 			if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
+				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+						    (void **)vp->vp_stats_pages);
 
 			if (vp->vp_register_page) {
 				(void)hv_unmap_vp_state_page(partition->pt_id,
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index af85b1c36b6e..f6e31d1c3267 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -494,6 +494,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
 #define HVCALL_MMIO_READ				0x0106
 #define HVCALL_MMIO_WRITE				0x0107
+#define HVCALL_MAP_STATS_PAGE2				0x0131
 
 /* HV_HYPERCALL_INPUT */
 #define HV_HYPERCALL_RESULT_MASK	GENMASK_ULL(15, 0)
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index bf2ce27dfcc5..064bf735cab6 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -177,6 +177,13 @@ struct hv_input_map_stats_page {
 	union hv_stats_object_identity identity;
 } __packed;
 
+struct hv_input_map_stats_page2 {
+	u32 type; /* enum hv_stats_object_type */
+	u32 padding;
+	union hv_stats_object_identity identity;
+	u64 map_location;
+} __packed;
+
 struct hv_output_map_stats_page {
 	u64 map_location;
 } __packed;
-- 
cgit v1.2.3


From 56c3feb3cc17b764f51191fd3dc461ab55a7b803 Mon Sep 17 00:00:00 2001
From: Mukesh Rathor <mrathor@linux.microsoft.com>
Date: Mon, 6 Oct 2025 15:42:04 -0700
Subject: hyperv: Add two new hypercall numbers to guest ABI public header

In preparation for the subsequent crashdump patches, copy two hypercall
numbers to the guest ABI header published by Hyper-V. One notifies the
hypervisor of an event that occurs in the root partition; the other asks
the hypervisor to disable itself.

Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/hyperv/hvgdk_mini.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index f6e31d1c3267..7499a679e60a 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -470,6 +470,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_MAP_DEVICE_INTERRUPT			0x007c
 #define HVCALL_UNMAP_DEVICE_INTERRUPT			0x007d
 #define HVCALL_RETARGET_INTERRUPT			0x007e
+#define HVCALL_NOTIFY_PARTITION_EVENT                   0x0087
 #define HVCALL_NOTIFY_PORT_RING_EMPTY			0x008b
 #define HVCALL_REGISTER_INTERCEPT_RESULT		0x0091
 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT			0x0094
@@ -494,6 +495,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
 #define HVCALL_MMIO_READ				0x0106
 #define HVCALL_MMIO_WRITE				0x0107
+#define HVCALL_DISABLE_HYP_EX                           0x010f
 #define HVCALL_MAP_STATS_PAGE2				0x0131
 
 /* HV_HYPERCALL_INPUT */
-- 
cgit v1.2.3


From e0a975ecd2e671664d208723476eeabb3baf08be Mon Sep 17 00:00:00 2001
From: Mukesh Rathor <mrathor@linux.microsoft.com>
Date: Mon, 6 Oct 2025 15:42:05 -0700
Subject: hyperv: Add definitions for hypervisor crash dump support

Add data structures for hypervisor crash dump support to the hypervisor
host ABI header file. Details of their usages are in subsequent commits.

Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/hyperv/hvhdk_mini.h | 55 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index 064bf735cab6..f2d7b50de7a4 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -142,6 +142,17 @@ enum hv_system_property {
 	/* Add more values when needed */
 	HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15,
 	HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21,
+	HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47,
+};
+
+#define HV_PFN_RANGE_PGBITS 24  /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */
+union hv_pfn_range {            /* HV_SPA_PAGE_RANGE */
+	u64 as_uint64;
+	struct {
+		/* 39:0: base pfn.  63:40: additional pages */
+		u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS;
+		u64 add_pfns : HV_PFN_RANGE_PGBITS;
+	} __packed;
 };
 
 enum hv_dynamic_processor_feature_property {
@@ -168,6 +179,8 @@ struct hv_output_get_system_property {
 #if IS_ENABLED(CONFIG_X86)
 		u64 hv_processor_feature_value;
 #endif
+		union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */
+		u64 hv_tramp_pa;                /* CrashdumpTrampolineAddress */
 	};
 } __packed;
 
@@ -267,6 +280,48 @@ union hv_gpa_page_access_state {
 	u8 as_uint8;
 } __packed;
 
+enum hv_crashdump_action {
+	HV_CRASHDUMP_NONE = 0,
+	HV_CRASHDUMP_SUSPEND_ALL_VPS,
+	HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE,
+	HV_CRASHDUMP_STATE_SAVED,
+	HV_CRASHDUMP_ENTRY,
+};
+
+struct hv_partition_event_root_crashdump_input {
+	u32 crashdump_action; /* enum hv_crashdump_action */
+} __packed;
+
+struct hv_input_disable_hyp_ex {   /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */
+	u64 rip;
+	u64 arg;
+} __packed;
+
+struct hv_crashdump_area {	   /* HV_CRASHDUMP_AREA */
+	u32 version;
+	union {
+		u32 flags_as_uint32;
+		struct {
+			u32 cda_valid : 1;
+			u32 cda_unused : 31;
+		} __packed;
+	};
+	/* more unused fields */
+} __packed;
+
+union hv_partition_event_input {
+	struct hv_partition_event_root_crashdump_input crashdump_input;
+};
+
+enum hv_partition_event {
+	HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2,
+};
+
+struct hv_input_notify_partition_event {
+	u32 event;      /* enum hv_partition_event */
+	union hv_partition_event_input input;
+} __packed;
+
 struct hv_lp_startup_status {
 	u64 hv_status;
 	u64 substatus1;
-- 
cgit v1.2.3


From f91bc8f61abf0e1d23108ae9871c60d7612a09b2 Mon Sep 17 00:00:00 2001
From: Magnus Kulke <magnuskulke@linux.microsoft.com>
Date: Thu, 6 Nov 2025 14:13:31 -0800
Subject: mshv: Allow mappings that overlap in uaddr

Currently the MSHV driver rejects mappings that would overlap in
userspace.

Some VMMs require the same memory to be mapped to different parts of
the guest's address space, and so working around this restriction is
difficult.

The hypervisor itself doesn't prohibit mappings that overlap in uaddr
(really in SPA, i.e. system physical addresses), so supporting this in the
driver doesn't require any extra work: only the checks need to be
removed.

Since no userspace code until now has been able to overlap regions in
userspace, relaxing this constraint can't break any existing code.

Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root_main.c | 8 ++------
 include/uapi/linux/mshv.h   | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index a8f3c5f3ce34..3f73c468e975 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1188,12 +1188,8 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
 
 	/* Reject overlapping regions */
 	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
-		u64 rg_size = rg->nr_pages << HV_HYP_PAGE_SHIFT;
-
-		if ((mem->guest_pfn + nr_pages <= rg->start_gfn ||
-		     rg->start_gfn + rg->nr_pages <= mem->guest_pfn) &&
-		    (mem->userspace_addr + mem->size <= rg->start_uaddr ||
-		     rg->start_uaddr + rg_size <= mem->userspace_addr))
+		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
 			continue;
 
 		return -EEXIST;
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 876bfe4e4227..374f75e198bc 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -89,7 +89,7 @@ enum {
  * @rsvd: MBZ
  *
  * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA).
- * Mappings can't overlap in GPA space or userspace.
+ * Mappings can't overlap in GPA space.
  * To unmap, these fields must match an existing mapping.
  */
 struct mshv_user_mem_region {
-- 
cgit v1.2.3


From c91fe5f162f278d4aa960d06d2dbc42f9857593a Mon Sep 17 00:00:00 2001
From: Muminul Islam <muislam@microsoft.com>
Date: Thu, 13 Nov 2025 11:45:33 -0800
Subject: mshv: Extend create partition ioctl to support cpu features

The existing mshv create partition ioctl does not provide a way to
specify which cpu features are enabled in the guest. Instead, it
attempts to enable all features and those that are not supported are
silently disabled by the hypervisor.

This was done to reduce unnecessary complexity and is sufficient for
many cases. However, new scenarios require fine-grained control over
these features.

Define a new mshv_create_partition_v2 structure which supports
passing the disabled processor and xsave feature bits through to the
create partition hypercall directly.

Introduce a new flag MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES which enables
the new structure. If unset, the original mshv_create_partition struct
is used, with the old behavior of enabling all features.

Co-developed-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Muminul Islam <muislam@microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root_main.c | 118 ++++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/mshv.h   |  34 +++++++++++++
 2 files changed, 131 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 3f73c468e975..45c7a5fea1cf 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1855,43 +1855,119 @@ add_partition(struct mshv_partition *partition)
 	return 0;
 }
 
-static long
-mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
+	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);
+
+static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
+					struct hv_partition_creation_properties *cr_props,
+					union hv_partition_isolation_properties *isol_props)
 {
-	struct mshv_create_partition args;
-	u64 creation_flags;
-	struct hv_partition_creation_properties creation_properties = {};
-	union hv_partition_isolation_properties isolation_properties = {};
-	struct mshv_partition *partition;
-	struct file *file;
-	int fd;
-	long ret;
+	int i;
+	struct mshv_create_partition_v2 args;
+	union hv_partition_processor_features *disabled_procs;
+	union hv_partition_processor_xsave_features *disabled_xsave;
 
-	if (copy_from_user(&args, user_arg, sizeof(args)))
+	/* First, copy v1 struct in case user is on previous versions */
+	if (copy_from_user(&args, user_arg,
+			   sizeof(struct mshv_create_partition)))
 		return -EFAULT;
 
 	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
 	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
 		return -EINVAL;
 
+	disabled_procs = &cr_props->disabled_processor_features;
+	disabled_xsave = &cr_props->disabled_processor_xsave_features;
+
+	/* Check if user provided newer struct with feature fields */
+	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
+		if (copy_from_user(&args, user_arg, sizeof(args)))
+			return -EFAULT;
+
+		/* Re-validate v1 fields after second copy_from_user() */
+		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+			return -EINVAL;
+
+		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
+		    mshv_field_nonzero(args, pt_rsvd) ||
+		    mshv_field_nonzero(args, pt_rsvd1))
+			return -EINVAL;
+
+		/*
+		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
+		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
+		 * (i.e. 2).
+		 *
+		 * Further banks (index >= 2) will be modifiable as 'early'
+		 * properties via the set partition property hypercall.
+		 */
+		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
+
+#if IS_ENABLED(CONFIG_X86_64)
+		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
+#else
+		/*
+		 * In practice this field is ignored on arm64, but safer to
+		 * zero it in case it is ever used.
+		 */
+		disabled_xsave->as_uint64 = 0;
+
+		if (mshv_field_nonzero(args, pt_rsvd2))
+			return -EINVAL;
+#endif
+	} else {
+		/*
+		 * v1 behavior: try to enable everything. The hypervisor will
+		 * disable features that are not supported. The banks can be
+		 * queried via the get partition property hypercall.
+		 */
+		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+			disabled_procs->as_uint64[i] = 0;
+
+		disabled_xsave->as_uint64 = 0;
+	}
+
 	/* Only support EXO partitions */
-	creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
-			 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+
+	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
+		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
+		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
+		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
 
-	if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
-		creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
-	if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
-		creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
-	if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
-		creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+	isol_props->as_uint64 = 0;
 
 	switch (args.pt_isolation) {
 	case MSHV_PT_ISOLATION_NONE:
-		isolation_properties.isolation_type =
-			HV_PARTITION_ISOLATION_TYPE_NONE;
+		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
 		break;
 	}
 
+	return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+	u64 creation_flags;
+	struct hv_partition_creation_properties creation_properties;
+	union hv_partition_isolation_properties isolation_properties;
+	struct mshv_partition *partition;
+	struct file *file;
+	int fd;
+	long ret;
+
+	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
+					  &creation_properties,
+					  &isolation_properties);
+	if (ret)
+		return ret;
+
 	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
 	if (!partition)
 		return -ENOMEM;
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 374f75e198bc..b645d17cc531 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -26,6 +26,7 @@ enum {
 	MSHV_PT_BIT_LAPIC,
 	MSHV_PT_BIT_X2APIC,
 	MSHV_PT_BIT_GPA_SUPER_PAGES,
+	MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES,
 	MSHV_PT_BIT_COUNT,
 };
 
@@ -41,6 +42,8 @@ enum {
  * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_*
  * @pt_isolation: MSHV_PT_ISOLATION_*
  *
+ * This is the initial/v1 version for backward compatibility.
+ *
  * Returns a file descriptor to act as a handle to a guest partition.
  * At this point the partition is not yet initialized in the hypervisor.
  * Some operations must be done with the partition in this state, e.g. setting
@@ -52,6 +55,37 @@ struct mshv_create_partition {
 	__u64 pt_isolation;
 };
 
+#define MSHV_NUM_CPU_FEATURES_BANKS 2
+
+/**
+ * struct mshv_create_partition_v2
+ *
+ * This is an extended version of the above initial MSHV_CREATE_PARTITION
+ * ioctl and allows for the following additional parameters:
+ *
+ * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS.
+ * @pt_cpu_fbanks: Disabled processor feature banks array.
+ * @pt_disabled_xsave: Disabled xsave feature bits.
+ *
+ * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create
+ * partition hypercall.
+ *
+ * Returns : same as above original mshv_create_partition
+ */
+struct mshv_create_partition_v2 {
+	__u64 pt_flags;
+	__u64 pt_isolation;
+	__u16 pt_num_cpu_fbanks;
+	__u8  pt_rsvd[6];		/* MBZ */
+	__u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS];
+	__u64 pt_rsvd1[2];		/* MBZ */
+#if defined(__x86_64__)
+	__u64 pt_disabled_xsave;
+#else
+	__u64 pt_rsvd2;			/* MBZ */
+#endif
+} __packed;
+
 /* /dev/mshv */
 #define MSHV_CREATE_PARTITION	_IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition)
 
-- 
cgit v1.2.3


From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001
From: Naman Jain <namjain@linux.microsoft.com>
Date: Thu, 13 Nov 2025 04:41:47 +0000
Subject: static_call: allow using STATIC_CALL_TRAMP_STR() from assembly

STATIC_CALL_TRAMP_STR() could not be used from .S files because
static_call_types.h was not safe to include in assembly as it pulled in C
types/constructs that are unavailable under __ASSEMBLY__.
Make the header assembly-friendly by adding __ASSEMBLY__ checks and
providing only the minimal definitions needed for assembly, so that it
can be safely included by .S code. This enables emitting the static call
trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly
sources, to be used with the 'call' instruction. Also, move certain
definitions out of the __ASSEMBLY__ checks in compiler_types.h to meet
the dependencies.

No functional change for C compilation.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/compiler_types.h          | 8 ++++----
 include/linux/static_call_types.h       | 4 ++++
 tools/include/linux/static_call_types.h | 4 ++++
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 59288a2c1ad2..6897d4d5cb28 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -11,6 +11,10 @@
 #define __has_builtin(x) (0)
 #endif
 
+/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
+#define ___PASTE(a, b) a##b
+#define __PASTE(a, b) ___PASTE(a, b)
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __builtin_warning(x, y...) (1)
 #endif /* __CHECKER__ */
 
-/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
-#define ___PASTE(a,b) a##b
-#define __PASTE(a,b) ___PASTE(a,b)
-
 #ifdef __KERNEL__
 
 /* Attributes */
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index 5a00b8b2cf9f..cfb6ddeb292b 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -25,6 +25,8 @@
 #define STATIC_CALL_SITE_INIT 2UL	/* init section */
 #define STATIC_CALL_SITE_FLAGS 3UL
 
+#ifndef __ASSEMBLY__
+
 /*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _STATIC_CALL_TYPES_H */
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index 5a00b8b2cf9f..cfb6ddeb292b 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -25,6 +25,8 @@
 #define STATIC_CALL_SITE_INIT 2UL	/* init section */
 #define STATIC_CALL_SITE_FLAGS 3UL
 
+#ifndef __ASSEMBLY__
+
 /*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _STATIC_CALL_TYPES_H */
-- 
cgit v1.2.3


From 4a09126a33638945a1640e064ed73e983b51ae07 Mon Sep 17 00:00:00 2001
From: Michael Riesch <michael.riesch@collabora.com>
Date: Fri, 14 Nov 2025 16:20:13 +0100
Subject: media: dt-bindings: video-interfaces: add defines for sampling modes

Add defines for the pixel clock sampling modes (rising edge, falling edge,
dual edge) for parallel video interfaces.
This avoids hardcoded constants in device tree sources.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Michael Riesch <michael.riesch@wolfvision.net>
Reviewed-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Signed-off-by: Michael Riesch <michael.riesch@collabora.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/dt-bindings/media/video-interfaces.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/media/video-interfaces.h b/include/dt-bindings/media/video-interfaces.h
index 88b9d05d8075..0b19c9b2e627 100644
--- a/include/dt-bindings/media/video-interfaces.h
+++ b/include/dt-bindings/media/video-interfaces.h
@@ -20,4 +20,8 @@
 #define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CAB	4
 #define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CBA	5
 
+#define MEDIA_PCLK_SAMPLE_FALLING_EDGE		0
+#define MEDIA_PCLK_SAMPLE_RISING_EDGE		1
+#define MEDIA_PCLK_SAMPLE_DUAL_EDGE		2
+
 #endif /* __DT_BINDINGS_MEDIA_VIDEO_INTERFACES_H__ */
-- 
cgit v1.2.3


From b3bc229b54e780fe02a41ec65a0cb06acf7ac1d9 Mon Sep 17 00:00:00 2001
From: Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
Date: Fri, 10 Oct 2025 16:03:13 +0800
Subject: dt-bindings: watchdog: aspeed,ast2400-wdt: Add support for AST2700

Add support for the AST2700 SoC in the ASPEED watchdog device tree
bindings. This includes:

- Adding "aspeed,ast2700-wdt" to the compatible string list.
- Extending the "aspeed,reset-mask" property description for AST2700.
- Defining AST2700-specific reset mask bits in aspeed-wdt.h,
  covering RESET1 to RESET5.

Signed-off-by: Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 .../bindings/watchdog/aspeed,ast2400-wdt.yaml      |   8 +-
 include/dt-bindings/watchdog/aspeed-wdt.h          | 138 +++++++++++++++++++++
 2 files changed, 144 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/watchdog/aspeed,ast2400-wdt.yaml b/Documentation/devicetree/bindings/watchdog/aspeed,ast2400-wdt.yaml
index be78a9865584..9322cb5b462a 100644
--- a/Documentation/devicetree/bindings/watchdog/aspeed,ast2400-wdt.yaml
+++ b/Documentation/devicetree/bindings/watchdog/aspeed,ast2400-wdt.yaml
@@ -15,6 +15,7 @@ properties:
       - aspeed,ast2400-wdt
       - aspeed,ast2500-wdt
       - aspeed,ast2600-wdt
+      - aspeed,ast2700-wdt
 
   reg:
     maxItems: 1
@@ -87,13 +88,15 @@ properties:
   aspeed,reset-mask:
     $ref: /schemas/types.yaml#/definitions/uint32-array
     minItems: 1
-    maxItems: 2
+    maxItems: 5
     description: >
       A bitmask indicating which peripherals will be reset if the watchdog
       timer expires. On AST2500 SoCs this should be a single word defined using
       the AST2500_WDT_RESET_* macros; on AST2600 SoCs this should be a two-word
       array with the first word defined using the AST2600_WDT_RESET1_* macros,
-      and the second word defined using the AST2600_WDT_RESET2_* macros.
+      and the second word defined using the AST2600_WDT_RESET2_* macros; on
+      AST2700 SoCs this should be a five-word array defined using the
+      AST2700_WDT_RESET1_* to AST2700_WDT_RESET5_* macros.
 
 required:
   - compatible
@@ -114,6 +117,7 @@ allOf:
           enum:
             - aspeed,ast2500-wdt
             - aspeed,ast2600-wdt
+            - aspeed,ast2700-wdt
   - if:
       required:
         - aspeed,ext-active-high
diff --git a/include/dt-bindings/watchdog/aspeed-wdt.h b/include/dt-bindings/watchdog/aspeed-wdt.h
index 7ae6d84b2bd9..89fa31ffce2d 100644
--- a/include/dt-bindings/watchdog/aspeed-wdt.h
+++ b/include/dt-bindings/watchdog/aspeed-wdt.h
@@ -89,4 +89,142 @@
 
 #define AST2600_WDT_RESET2_DEFAULT 0x03fffff1
 
+#define AST2700_WDT_RESET1_CPU		(1 << 0)
+#define AST2700_WDT_RESET1_DRAM		(1 << 1)
+#define AST2700_WDT_RESET1_SLI0		(1 << 2)
+#define AST2700_WDT_RESET1_EHCI		(1 << 3)
+#define AST2700_WDT_RESET1_HACE		(1 << 4)
+#define AST2700_WDT_RESET1_SOC_MISC0	(1 << 5)
+#define AST2700_WDT_RESET1_VIDEO	(1 << 6)
+#define AST2700_WDT_RESET1_2D_GRAPHIC	(1 << 7)
+#define AST2700_WDT_RESET1_RAVS0	(1 << 8)
+#define AST2700_WDT_RESET1_RAVS1	(1 << 9)
+#define AST2700_WDT_RESET1_GPIO0	(1 << 10)
+#define AST2700_WDT_RESET1_SSP		(1 << 11)
+#define AST2700_WDT_RESET1_TSP		(1 << 12)
+#define AST2700_WDT_RESET1_CRT		(1 << 13)
+#define AST2700_WDT_RESET1_USB20_HOST	(1 << 14)
+#define AST2700_WDT_RESET1_USB11_HOST	(1 << 15)
+#define AST2700_WDT_RESET1_UFS		(1 << 16)
+#define AST2700_WDT_RESET1_EMMC		(1 << 17)
+#define AST2700_WDT_RESET1_AHB_TO_PCIE1	(1 << 18)
+#define AST2700_WDT_RESET1_XDMA0	(1 << 22)
+#define AST2700_WDT_RESET1_MCTP1	(1 << 23)
+#define AST2700_WDT_RESET1_MCTP0	(1 << 24)
+#define AST2700_WDT_RESET1_JTAG0	(1 << 25)
+#define AST2700_WDT_RESET1_ECC		(1 << 26)
+#define AST2700_WDT_RESET1_XDMA1	(1 << 27)
+#define AST2700_WDT_RESET1_DP		(1 << 28)
+#define AST2700_WDT_RESET1_DP_MCU	(1 << 29)
+#define AST2700_WDT_RESET1_AHB_TO_PCIE0	(1 << 31)
+
+#define AST2700_WDT_RESET1_DEFAULT 0x8207ff71
+
+#define AST2700_WDT_RESET2_USB3_A_HOST	(1 << 0)
+#define AST2700_WDT_RESET2_USB3_A_VHUB3	(1 << 1)
+#define AST2700_WDT_RESET2_USB3_A_VHUB2	(1 << 2)
+#define AST2700_WDT_RESET2_USB3_B_HOST	(1 << 3)
+#define AST2700_WDT_RESET2_USB3_B_VHUB3	(1 << 4)
+#define AST2700_WDT_RESET2_USB3_B_VHUB2	(1 << 5)
+#define AST2700_WDT_RESET2_SM3		(1 << 6)
+#define AST2700_WDT_RESET2_SM4		(1 << 7)
+#define AST2700_WDT_RESET2_SHA3		(1 << 8)
+#define AST2700_WDT_RESET2_RSA		(1 << 9)
+
+#define AST2700_WDT_RESET2_DEFAULT 0x000003f6
+
+#define AST2700_WDT_RESET3_LPC0		(1 << 0)
+#define AST2700_WDT_RESET3_LPC1		(1 << 1)
+#define AST2700_WDT_RESET3_MDIO		(1 << 2)
+#define AST2700_WDT_RESET3_PECI		(1 << 3)
+#define AST2700_WDT_RESET3_PWM		(1 << 4)
+#define AST2700_WDT_RESET3_MAC0		(1 << 5)
+#define AST2700_WDT_RESET3_MAC1		(1 << 6)
+#define AST2700_WDT_RESET3_MAC2		(1 << 7)
+#define AST2700_WDT_RESET3_ADC		(1 << 8)
+#define AST2700_WDT_RESET3_SDC		(1 << 9)
+#define AST2700_WDT_RESET3_ESPI0	(1 << 10)
+#define AST2700_WDT_RESET3_ESPI1	(1 << 11)
+#define AST2700_WDT_RESET3_JTAG1	(1 << 12)
+#define AST2700_WDT_RESET3_SPI0		(1 << 13)
+#define AST2700_WDT_RESET3_SPI1		(1 << 14)
+#define AST2700_WDT_RESET3_SPI2		(1 << 15)
+#define AST2700_WDT_RESET3_I3C0		(1 << 16)
+#define AST2700_WDT_RESET3_I3C1		(1 << 17)
+#define AST2700_WDT_RESET3_I3C2		(1 << 18)
+#define AST2700_WDT_RESET3_I3C3		(1 << 19)
+#define AST2700_WDT_RESET3_I3C4		(1 << 20)
+#define AST2700_WDT_RESET3_I3C5		(1 << 21)
+#define AST2700_WDT_RESET3_I3C6		(1 << 22)
+#define AST2700_WDT_RESET3_I3C7		(1 << 23)
+#define AST2700_WDT_RESET3_I3C8		(1 << 24)
+#define AST2700_WDT_RESET3_I3C9		(1 << 25)
+#define AST2700_WDT_RESET3_I3C10	(1 << 26)
+#define AST2700_WDT_RESET3_I3C11	(1 << 27)
+#define AST2700_WDT_RESET3_I3C12	(1 << 28)
+#define AST2700_WDT_RESET3_I3C13	(1 << 29)
+#define AST2700_WDT_RESET3_I3C14	(1 << 30)
+#define AST2700_WDT_RESET3_I3C15	(1 << 31)
+
+#define AST2700_WDT_RESET3_DEFAULT 0x000093ec
+
+#define AST2700_WDT_RESET4_FMC		(1 << 0)
+#define AST2700_WDT_RESET4_SOC_MISC1	(1 << 1)
+#define AST2700_WDT_RESET4_AHB		(1 << 2)
+#define AST2700_WDT_RESET4_SLI1		(1 << 3)
+#define AST2700_WDT_RESET4_UART0	(1 << 4)
+#define AST2700_WDT_RESET4_UART1	(1 << 5)
+#define AST2700_WDT_RESET4_UART2	(1 << 6)
+#define AST2700_WDT_RESET4_UART3	(1 << 7)
+#define AST2700_WDT_RESET4_I2C_MONITOR	(1 << 8)
+#define AST2700_WDT_RESET4_HOST_TO_SPI1	(1 << 9)
+#define AST2700_WDT_RESET4_HOST_TO_SPI2	(1 << 10)
+#define AST2700_WDT_RESET4_GPIO1	(1 << 11)
+#define AST2700_WDT_RESET4_FSI		(1 << 12)
+#define AST2700_WDT_RESET4_CANBUS	(1 << 13)
+#define AST2700_WDT_RESET4_MCTP		(1 << 14)
+#define AST2700_WDT_RESET4_XDMA		(1 << 15)
+#define AST2700_WDT_RESET4_UART5	(1 << 16)
+#define AST2700_WDT_RESET4_UART6	(1 << 17)
+#define AST2700_WDT_RESET4_UART7	(1 << 18)
+#define AST2700_WDT_RESET4_UART8	(1 << 19)
+#define AST2700_WDT_RESET4_BOOT_MCU	(1 << 20)
+#define AST2700_WDT_RESET4_IO_MCU	(1 << 21)
+#define AST2700_WDT_RESET4_LTPI0	(1 << 22)
+#define AST2700_WDT_RESET4_VGA_LINK	(1 << 23)
+#define AST2700_WDT_RESET4_LTPI1	(1 << 24)
+#define AST2700_WDT_RESET4_LTPI_PHY	(1 << 25)
+#define AST2700_WDT_RESET4_ACE		(1 << 26)
+#define AST2700_WDT_RESET4_LTPI_GPIO0	(1 << 28)
+#define AST2700_WDT_RESET4_LTPI_GPIO1	(1 << 29)
+#define AST2700_WDT_RESET4_AHB_TO_PCIE1	(1 << 30)
+#define AST2700_WDT_RESET4_I3C_DMA	(1 << 31)
+
+#define AST2700_WDT_RESET4_DEFAULT 0x40303803
+
+#define AST2700_WDT_RESET5_I2C_GLOBAL	(1 << 0)
+#define AST2700_WDT_RESET5_I2C0		(1 << 1)
+#define AST2700_WDT_RESET5_I2C1		(1 << 2)
+#define AST2700_WDT_RESET5_I2C2		(1 << 3)
+#define AST2700_WDT_RESET5_I2C3		(1 << 4)
+#define AST2700_WDT_RESET5_I2C4		(1 << 5)
+#define AST2700_WDT_RESET5_I2C5		(1 << 6)
+#define AST2700_WDT_RESET5_I2C6		(1 << 7)
+#define AST2700_WDT_RESET5_I2C7		(1 << 8)
+#define AST2700_WDT_RESET5_I2C8		(1 << 9)
+#define AST2700_WDT_RESET5_I2C9		(1 << 10)
+#define AST2700_WDT_RESET5_I2C10	(1 << 11)
+#define AST2700_WDT_RESET5_I2C11	(1 << 12)
+#define AST2700_WDT_RESET5_I2C12	(1 << 13)
+#define AST2700_WDT_RESET5_I2C13	(1 << 14)
+#define AST2700_WDT_RESET5_I2C14	(1 << 15)
+#define AST2700_WDT_RESET5_I2C15	(1 << 16)
+#define AST2700_WDT_RESET5_UHCI		(1 << 17)
+#define AST2700_WDT_RESET5_USB2_C_UART	(1 << 18)
+#define AST2700_WDT_RESET5_USB2_C	(1 << 19)
+#define AST2700_WDT_RESET5_USB2_D_UART	(1 << 20)
+#define AST2700_WDT_RESET5_USB2_D	(1 << 21)
+
+#define AST2700_WDT_RESET5_DEFAULT 0x00320000
+
 #endif
-- 
cgit v1.2.3


From 4051a9115ad24bb9a691774730ca9c1dd56de665 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 17 Sep 2025 22:19:10 -0400
Subject: new helper: simple_remove_by_name()

simple_recursive_removal(), but instead of victim dentry it takes
parent + name.

Used to be open-coded in fs/fuse/control.c, but there's no need to expose
the guts of that thing there and there are other potential users, so
let's lift it into libfs...

Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/control.c  |  7 +------
 fs/libfs.c         | 13 +++++++++++++
 include/linux/fs.h |  2 ++
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 5247df896c5d..3dca752127ff 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -290,18 +290,13 @@ static void remove_one(struct dentry *dentry)
  */
 void fuse_ctl_remove_conn(struct fuse_conn *fc)
 {
-	struct dentry *dentry;
 	char name[32];
 
 	if (!fuse_control_sb || fc->no_control)
 		return;
 
 	sprintf(name, "%u", fc->dev);
-	dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root);
-	if (!IS_ERR(dentry)) {
-		simple_recursive_removal(dentry, remove_one);
-		dput(dentry);	// paired with lookup_noperm_positive_unlocked()
-	}
+	simple_remove_by_name(fuse_control_sb->s_root, name, remove_one);
 }
 
 static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..d029aff41f66 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -655,6 +655,19 @@ void simple_recursive_removal(struct dentry *dentry,
 }
 EXPORT_SYMBOL(simple_recursive_removal);
 
+void simple_remove_by_name(struct dentry *parent, const char *name,
+                           void (*callback)(struct dentry *))
+{
+	struct dentry *dentry;
+
+	dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent);
+	if (!IS_ERR(dentry)) {
+		simple_recursive_removal(dentry, callback);
+		dput(dentry);	// paired with lookup_noperm_positive_unlocked()
+	}
+}
+EXPORT_SYMBOL(simple_remove_by_name);
+
 /* caller holds parent directory with I_MUTEX_PARENT */
 void locked_recursive_removal(struct dentry *dentry,
                               void (*callback)(struct dentry *))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..28bd4e8d3892 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3631,6 +3631,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *,
 			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
+extern void simple_remove_by_name(struct dentry *, const char *,
+                              void (*callback)(struct dentry *));
 extern void locked_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
-- 
cgit v1.2.3


From 1552ddc7fade1ae55af298580ef6c913b8db74bc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 19 Sep 2025 17:46:01 -0400
Subject: new helper: simple_done_creating()

should be paired with simple_start_creating() - unlocks parent and
drops dentry reference.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 8 ++++++++
 include/linux/fs.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index d029aff41f66..a033f35493d0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2326,3 +2326,11 @@ struct dentry *simple_start_creating(struct dentry *parent, const char *name)
 	return dentry;
 }
 EXPORT_SYMBOL(simple_start_creating);
+
+/* parent must have been held exclusive since simple_start_creating() */
+void simple_done_creating(struct dentry *child)
+{
+	inode_unlock(child->d_parent->d_inode);
+	dput(child);
+}
+EXPORT_SYMBOL(simple_done_creating);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 28bd4e8d3892..f5037c556f61 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3662,6 +3662,7 @@ extern int simple_fill_super(struct super_block *, unsigned long,
 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
 extern void simple_release_fs(struct vfsmount **mount, int *count);
 struct dentry *simple_start_creating(struct dentry *, const char *);
+void simple_done_creating(struct dentry *);
 
 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 			loff_t *ppos, const void *from, size_t available);
-- 
cgit v1.2.3


From 8a210cacf5dc2a6210ee42aeca5cd03b2400876f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Mar 2025 19:15:35 -0500
Subject: introduce a flag for explicitly marking persistently pinned dentries

Some filesystems use a kinda-sorta controlled dentry refcount leak to pin
dentries of created objects in dcache (and undo it when removing those).
Reference is grabbed and not released, but it's not actually _stored_
anywhere.  That works, but it's hard to follow and verify; among other
things, we have no way to tell _which_ of the increments is intended
to be an unpaired one.  Worse, on removal we need to decide whether
the reference had already been dropped, which can be non-trivial if
that removal is on umount and we need to figure out if this dentry is
pinned due to e.g. unlink() not done.  Usually that is handled by using
kill_litter_super() as ->kill_sb(), but there are open-coded special
cases of the same (consider e.g. /proc/self).

Things get simpler if we introduce a new dentry flag (DCACHE_PERSISTENT)
marking those "leaked" dentries.  Having it set claims responsibility
for +1 in refcount.

The end result this series is aiming for:

* get these unbalanced dget() and dput() replaced with new primitives that
  would, in addition to adjusting refcount, set and clear persistency flag.
* instead of having kill_litter_super() mess with removing the remaining
  "leaked" references (e.g. for all tmpfs files that hadn't been removed
  prior to umount), have the regular shrink_dcache_for_umount() strip
  DCACHE_PERSISTENT of all dentries, dropping the corresponding
  reference if it had been set.  After that kill_litter_super() becomes
  an equivalent of kill_anon_super().

Doing that in a single step is not feasible - it would affect too many places
in too many filesystems.  It has to be split into a series.

Here we
	* introduce the new flag
	* teach shrink_dcache_for_umount() to handle it (i.e. remove
and drop refcount on anything that survives to umount with that flag
still set)
	* teach kill_litter_super() that anything with that flag does
*not* need to be unpinned.

Next commits will add primitives for maintaining that flag and convert the
common helpers to those.  After that - a long series of per-filesystem
patches converting to those primitives.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 27 ++++++++++++++++++++++-----
 include/linux/dcache.h |  1 +
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 035cccbc9276..f2c9f4fef2a2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1511,6 +1511,15 @@ out:
 	return ret;
 }
 
+static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry)
+{
+	if (dentry->d_flags & DCACHE_PERSISTENT) {
+		dentry->d_flags &= ~DCACHE_PERSISTENT;
+		dentry->d_lockref.count--;
+	}
+	return select_collect(_data, dentry);
+}
+
 static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
 {
 	struct select_data *data = _data;
@@ -1539,18 +1548,20 @@ out:
 }
 
 /**
- * shrink_dcache_parent - prune dcache
+ * shrink_dcache_tree - prune dcache
  * @parent: parent of entries to prune
+ * @for_umount: true if we want to unpin the persistent ones
  *
  * Prune the dcache to remove unused children of the parent dentry.
  */
-void shrink_dcache_parent(struct dentry *parent)
+static void shrink_dcache_tree(struct dentry *parent, bool for_umount)
 {
 	for (;;) {
 		struct select_data data = {.start = parent};
 
 		INIT_LIST_HEAD(&data.dispose);
-		d_walk(parent, &data, select_collect);
+		d_walk(parent, &data,
+			for_umount ? select_collect_umount : select_collect);
 
 		if (!list_empty(&data.dispose)) {
 			shrink_dentry_list(&data.dispose);
@@ -1575,6 +1586,11 @@ void shrink_dcache_parent(struct dentry *parent)
 			shrink_dentry_list(&data.dispose);
 	}
 }
+
+void shrink_dcache_parent(struct dentry *parent)
+{
+	shrink_dcache_tree(parent, false);
+}
 EXPORT_SYMBOL(shrink_dcache_parent);
 
 static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
@@ -1601,7 +1617,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
 
 static void do_one_tree(struct dentry *dentry)
 {
-	shrink_dcache_parent(dentry);
+	shrink_dcache_tree(dentry, true);
 	d_walk(dentry, dentry, umount_check);
 	d_drop(dentry);
 	dput(dentry);
@@ -3111,7 +3127,8 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 {
 	struct dentry *root = data;
 	if (dentry != root) {
-		if (d_unhashed(dentry) || !dentry->d_inode)
+		if (d_unhashed(dentry) || !dentry->d_inode ||
+		    dentry->d_flags & DCACHE_PERSISTENT)
 			return D_WALK_SKIP;
 
 		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c83e02b94389..94b58655322a 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -225,6 +225,7 @@ enum dentry_flags {
 	DCACHE_PAR_LOOKUP		= BIT(24),	/* being looked up (with parent locked shared) */
 	DCACHE_DENTRY_CURSOR		= BIT(25),
 	DCACHE_NORCU			= BIT(26),	/* No RCU delay for freeing */
+	DCACHE_PERSISTENT		= BIT(27)
 };
 
 #define DCACHE_MANAGED_DENTRY \
-- 
cgit v1.2.3


From bacdf1d70bbe2027619c7bbbe48b379a806a9678 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Mar 2025 19:38:04 -0500
Subject: primitives for maintaining persistency

* d_make_persistent(dentry, inode) - bump refcount, mark persistent and
make hashed positive.  Return value is a borrowed reference to dentry;
it can be used until something removes persistency (at the very least,
until the parent gets unlocked, but some filesystems may have stronger
exclusion).

* d_make_discardable() - remove persistency mark and drop reference.

d_make_persistent() is similar to combination of d_instantiate(), dget()
and setting flag.  The only difference is that unlike d_instantiate()
it accepts hashed and unhashed negatives alike.  It is always called in
strong locking environment (parent held exclusive, or, in some cases,
dentry coming from d_alloc_name()); if we ever start using it with parent
held only shared and dentry coming from d_alloc_parallel(), we'll need
to copy the in-lookup logics from __d_add().

d_make_discardable() is equivalent to combination of removing flag and
dput(); since flag removal requires ->d_lock, there's no point trying
to avoid taking that for refcount decrement as fast_dput() does.
The slow path of dput() has been taken into a helper and reused in
d_make_discardable() instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 74 ++++++++++++++++++++++++++++++++++++++++----------
 include/linux/dcache.h |  2 ++
 2 files changed, 61 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index f2c9f4fef2a2..3cc6c3876177 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -869,6 +869,24 @@ locked:
 	return false;
 }
 
+static void finish_dput(struct dentry *dentry)
+	__releases(dentry->d_lock)
+	__releases(RCU)
+{
+	while (lock_for_kill(dentry)) {
+		rcu_read_unlock();
+		dentry = __dentry_kill(dentry);
+		if (!dentry)
+			return;
+		if (retain_dentry(dentry, true)) {
+			spin_unlock(&dentry->d_lock);
+			return;
+		}
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+	spin_unlock(&dentry->d_lock);
+}
 
 /* 
  * This is dput
@@ -906,22 +924,28 @@ void dput(struct dentry *dentry)
 		rcu_read_unlock();
 		return;
 	}
-	while (lock_for_kill(dentry)) {
-		rcu_read_unlock();
-		dentry = __dentry_kill(dentry);
-		if (!dentry)
-			return;
-		if (retain_dentry(dentry, true)) {
-			spin_unlock(&dentry->d_lock);
-			return;
-		}
-		rcu_read_lock();
-	}
-	rcu_read_unlock();
-	spin_unlock(&dentry->d_lock);
+	finish_dput(dentry);
 }
 EXPORT_SYMBOL(dput);
 
+void d_make_discardable(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	/*
+	 * By the end of the series we'll add
+	 * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT));
+	 * here, but while object removal is done by a few common helpers,
+	 * object creation tends to be open-coded (if nothing else, new inode
+	 * needs to be set up), so adding a warning from the very beginning
+	 * would make for much messier patch series.
+	 */
+	dentry->d_flags &= ~DCACHE_PERSISTENT;
+	dentry->d_lockref.count--;
+	rcu_read_lock();
+	finish_dput(dentry);
+}
+EXPORT_SYMBOL(d_make_discardable);
+
 static void to_shrink_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
@@ -1939,7 +1963,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	unsigned add_flags = d_flags_for_inode(inode);
 	WARN_ON(d_in_lookup(dentry));
 
-	spin_lock(&dentry->d_lock);
 	/*
 	 * The negative counter only tracks dentries on the LRU. Don't dec if
 	 * d_lru is on another list.
@@ -1952,7 +1975,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	__d_set_inode_and_type(dentry, inode, add_flags);
 	raw_write_seqcount_end(&dentry->d_seq);
 	fsnotify_update_flags(dentry);
-	spin_unlock(&dentry->d_lock);
 }
 
 /**
@@ -1976,7 +1998,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 	if (inode) {
 		security_d_instantiate(entry, inode);
 		spin_lock(&inode->i_lock);
+		spin_lock(&entry->d_lock);
 		__d_instantiate(entry, inode);
+		spin_unlock(&entry->d_lock);
 		spin_unlock(&inode->i_lock);
 	}
 }
@@ -1995,7 +2019,9 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	security_d_instantiate(entry, inode);
 	spin_lock(&inode->i_lock);
+	spin_lock(&entry->d_lock);
 	__d_instantiate(entry, inode);
+	spin_unlock(&entry->d_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW & ~I_CREATING;
 	/*
@@ -2754,6 +2780,24 @@ void d_add(struct dentry *entry, struct inode *inode)
 }
 EXPORT_SYMBOL(d_add);
 
+struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode)
+{
+	WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
+	WARN_ON(!inode);
+	security_d_instantiate(dentry, inode);
+	spin_lock(&inode->i_lock);
+	spin_lock(&dentry->d_lock);
+	__d_instantiate(dentry, inode);
+	dentry->d_flags |= DCACHE_PERSISTENT;
+	dget_dlock(dentry);
+	if (d_unhashed(dentry))
+		__d_rehash(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&inode->i_lock);
+	return dentry;
+}
+EXPORT_SYMBOL(d_make_persistent);
+
 static void swap_names(struct dentry *dentry, struct dentry *target)
 {
 	if (unlikely(dname_external(target))) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 94b58655322a..6ec4066825e3 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -611,5 +611,7 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry)
 }
 
 void set_default_d_op(struct super_block *, const struct dentry_operations *);
+struct dentry *d_make_persistent(struct dentry *, struct inode *);
+void d_make_discardable(struct dentry *dentry);
 
 #endif	/* __LINUX_DCACHE_H */
-- 
cgit v1.2.3


From 23cbc7a795853bc7a8d0512b7c686ef879f6e909 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Feb 2024 01:55:36 -0500
Subject: procfs: make /self and /thread_self dentries persistent

... and there's no need to remember those pointers anywhere - ->kill_sb()
no longer needs to bother since kill_anon_super() will take care of
them anyway and proc_pid_readdir() only wants the inumbers, which
we had in a couple of static variables all along.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c          |  6 ++----
 fs/proc/internal.h      |  1 +
 fs/proc/root.c          | 14 ++++----------
 fs/proc/self.c          | 10 +++-------
 fs/proc/thread_self.c   | 11 +++--------
 include/linux/proc_fs.h |  2 --
 6 files changed, 13 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6299878e3d97..869677a26332 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3585,14 +3585,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 		return 0;
 
 	if (pos == TGID_OFFSET - 2) {
-		struct inode *inode = d_inode(fs_info->proc_self);
-		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+		if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK))
 			return 0;
 		ctx->pos = pos = pos + 1;
 	}
 	if (pos == TGID_OFFSET - 1) {
-		struct inode *inode = d_inode(fs_info->proc_thread_self);
-		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+		if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK))
 			return 0;
 		ctx->pos = pos = pos + 1;
 	}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d1598576506c..c1e8eb984da8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -373,6 +373,7 @@ static inline void proc_tty_init(void) {}
 extern struct proc_dir_entry proc_root;
 
 extern void proc_self_init(void);
+extern unsigned self_inum, thread_self_inum;
 
 /*
  * task_[no]mmu.c
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1e24e085c7d5..d8ca41d823e4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -347,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb)
 {
 	struct proc_fs_info *fs_info = proc_sb_info(sb);
 
-	if (!fs_info) {
-		kill_anon_super(sb);
-		return;
-	}
-
-	dput(fs_info->proc_self);
-	dput(fs_info->proc_thread_self);
-
 	kill_anon_super(sb);
-	put_pid_ns(fs_info->pid_ns);
-	kfree_rcu(fs_info, rcu);
+	if (fs_info) {
+		put_pid_ns(fs_info->pid_ns);
+		kfree_rcu(fs_info, rcu);
+	}
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/proc/self.c b/fs/proc/self.c
index b46fbfd22681..62d2c0cfe35c 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = {
 	.get_link	= proc_self_get_link,
 };
 
-static unsigned self_inum __ro_after_init;
+unsigned self_inum __ro_after_init;
 
 int proc_setup_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct proc_fs_info *fs_info = proc_sb_info(s);
 	struct dentry *self;
 	int ret = -ENOMEM;
 
@@ -51,18 +50,15 @@ int proc_setup_self(struct super_block *s)
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_self_inode_operations;
-			d_add(self, inode);
+			d_make_persistent(self, inode);
 			ret = 0;
-		} else {
-			dput(self);
 		}
+		dput(self);
 	}
 	inode_unlock(root_inode);
 
 	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/self\n");
-	else
-		fs_info->proc_self = self;
 
 	return ret;
 }
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 0e5050d6ab64..d6113dbe58e0 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = {
 	.get_link	= proc_thread_self_get_link,
 };
 
-static unsigned thread_self_inum __ro_after_init;
+unsigned thread_self_inum __ro_after_init;
 
 int proc_setup_thread_self(struct super_block *s)
 {
 	struct inode *root_inode = d_inode(s->s_root);
-	struct proc_fs_info *fs_info = proc_sb_info(s);
 	struct dentry *thread_self;
 	int ret = -ENOMEM;
 
@@ -51,19 +50,15 @@ int proc_setup_thread_self(struct super_block *s)
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_thread_self_inode_operations;
-			d_add(thread_self, inode);
+			d_make_persistent(thread_self, inode);
 			ret = 0;
-		} else {
-			dput(thread_self);
 		}
+		dput(thread_self);
 	}
 	inode_unlock(root_inode);
 
 	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
-	else
-		fs_info->proc_thread_self = thread_self;
-
 	return ret;
 }
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index f139377f4b31..19d1c5e5f335 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -66,8 +66,6 @@ enum proc_pidonly {
 
 struct proc_fs_info {
 	struct pid_namespace *pid_ns;
-	struct dentry *proc_self;        /* For /proc/self */
-	struct dentry *proc_thread_self; /* For /proc/thread-self */
 	kgid_t pid_gid;
 	enum proc_hidepid hide_pid;
 	enum proc_pidonly pidonly;
-- 
cgit v1.2.3


From 566a414558aec1ab263ab8709fa783dfa2e34325 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 2 Oct 2025 10:00:52 -0400
Subject: svcrdma: Increase the server's default RPC/RDMA credit grant

The range of commits from commit e3274026e2ec ("SUNRPC: move all of
xprt handling into svc_xprt_handle()") to commit 15d39883ee7d
("SUNRPC: change the back-channel queue to lwq") enabled NFSD
performance to scale better as the number of nfsd threads is
increased. These commits were merged in v6.7.

Now that the nfsd thread count can scale to more threads, permit
individual clients to make more use of those threads. Increase the
RPC/RDMA per-connection credit grant from 64 to 128 -- same as the
Linux NFS client.

Simple single client fio-based benchmarking so far shows only
improvement, no regression.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 22704c2e5b9b..57f4fd94166a 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -131,7 +131,7 @@ static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp)
  */
 enum {
 	RPCRDMA_LISTEN_BACKLOG	= 10,
-	RPCRDMA_MAX_REQUESTS	= 64,
+	RPCRDMA_MAX_REQUESTS	= 128,
 	RPCRDMA_MAX_BC_REQUESTS	= 2,
 };
 
-- 
cgit v1.2.3


From 6b3b697d65d46a0f640216a3f6c72856c159c567 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 13 Oct 2025 09:54:53 -0400
Subject: sunrpc: allocate a separate bvec array for socket sends

svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of
rq_bvec as the start, but doesn't reduce the array length by one, which
could lead to an array overrun. Also, rq_bvec is always rq_maxpages in
length, which can be too short in some cases, since the TCP record
marker consumes a slot.

Fix both problems by adding a separate bvec array to the svc_sock that
is specifically for sending. For TCP, make this array one slot longer
than rq_maxpages, to account for the record marker. For UDP, only
allocate as large an array as we need since it's limited to 64k of
payload.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svcsock.h |  3 +++
 net/sunrpc/svcsock.c           | 55 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 51 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 963bbe251e52..de37069aba90 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -26,6 +26,9 @@ struct svc_sock {
 	void			(*sk_odata)(struct sock *);
 	void			(*sk_owspace)(struct sock *);
 
+	/* For sends (protected by xpt_mutex) */
+	struct bio_vec		*sk_bvec;
+
 	/* private TCP part */
 	/* On-the-wire fragment header: */
 	__be32			sk_marker;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0cb9c4d45745..93de79020a2d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -68,6 +68,17 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
+/*
+ * For UDP:
+ * 1 for header page
+ * enough pages for RPCSVC_MAXPAYLOAD_UDP
+ * 1 in case payload is not aligned
+ * 1 for tail page
+ */
+enum {
+	SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1
+};
+
 /* To-do: to avoid tying up an nfsd thread while waiting for a
  * handshake request, the request could instead be deferred.
  */
@@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
 	if (svc_xprt_is_dead(xprt))
 		goto out_notconn;
 
-	count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr);
+	count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr);
 
-	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
 		      count, rqstp->rq_res.len);
 	err = sock_sendmsg(svsk->sk_sock, &msg);
 	if (err == -ECONNREFUSED) {
 		/* ICMP error on earlier request. */
-		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
 			      count, rqstp->rq_res.len);
 		err = sock_sendmsg(svsk->sk_sock, &msg);
 	}
@@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
 	int ret;
 
 	/* The stream record marker is copied into a temporary page
-	 * fragment buffer so that it can be included in rq_bvec.
+	 * fragment buffer so that it can be included in sk_bvec.
 	 */
 	buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
 			      GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 	memcpy(buf, &marker, sizeof(marker));
-	bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker));
+	bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker));
 
-	count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages,
+	count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages,
 				&rqstp->rq_res);
 
-	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
 		      1 + count, sizeof(marker) + rqstp->rq_res.len);
 	ret = sock_sendmsg(svsk->sk_sock, &msg);
 	page_frag_free(buf);
@@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv)
 	spin_unlock_bh(&serv->sv_lock);
 }
 
+static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags)
+{
+	switch (sock->type) {
+	case SOCK_STREAM:
+		/* +1 for TCP record marker */
+		if (flags & SVC_SOCK_TEMPORARY)
+			return svc_serv_maxpages(serv) + 1;
+		return 0;
+	case SOCK_DGRAM:
+		return SUNRPC_MAX_UDP_SENDPAGES;
+	}
+	return -EINVAL;
+}
+
 /*
  * Initialize socket for RPC use and create svc_sock struct
  */
@@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	struct svc_sock	*svsk;
 	struct sock	*inet;
 	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+	int		sendpages;
 	unsigned long	pages;
 
+	sendpages = svc_sock_sendpages(serv, sock, flags);
+	if (sendpages < 0)
+		return ERR_PTR(sendpages);
+
 	pages = svc_serv_maxpages(serv);
 	svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL);
 	if (!svsk)
 		return ERR_PTR(-ENOMEM);
+
+	if (sendpages) {
+		svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL);
+		if (!svsk->sk_bvec) {
+			kfree(svsk);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
 	svsk->sk_maxpages = pages;
 
 	inet = sock->sk;
@@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 				     inet->sk_protocol,
 				     ntohs(inet_sk(inet)->inet_sport));
 		if (err < 0) {
+			kfree(svsk->sk_bvec);
 			kfree(svsk);
 			return ERR_PTR(err);
 		}
@@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sock_release(sock);
 
 	page_frag_cache_drain(&svsk->sk_frag_cache);
+	kfree(svsk->sk_bvec);
 	kfree(svsk);
 }
-- 
cgit v1.2.3


From dd9896d41fdf1050934d6a46a1c5ca2164284e72 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Sat, 15 Nov 2025 19:06:26 +0100
Subject: ASoC: Intel: avs: Allow the topology to carry NHLT data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Typically the hardware configuration for I2S and DMIC devices resides
in the Non-HDAudio Link Table (NHLT) that is part of the ACPI tree. As
the NHLTs existing in the field are not always perfect, workaround
mechanisms are provided to patch them.

Currently the avs-driver is utilizing the ->blob_fmt override (see
topology.h and struct avs_tplg_modcfg_ext) when there is a valid entry
within a NHLT to configure the hardware for specific format but its
descriptor (header) is invalid.

A separate case is when there is no correct hardware configuration at
all within the NHLT available in the system. Patching the header won't
help and forcing ad-hoc BIOS updates for dated systems is not feasible.
Allowing the topology to carry the data is the solution of choice as
replacing a userspace file that is part of /lib/firmware/intel/ is less
invasive than a BIOS update and solves the problem.

Co-developed-by: Amadeusz Sławiński <amade@asmblr.net>
Signed-off-by: Amadeusz Sławiński <amade@asmblr.net>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://patch.msgid.link/20251115180627.3589520-2-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/intel/avs/tokens.h |   5 ++
 sound/soc/intel/avs/topology.c        | 106 +++++++++++++++++++++++++++++++++-
 sound/soc/intel/avs/topology.h        |   7 +++
 3 files changed, 115 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h
index f3ff6aae09a9..f7cbbfb00227 100644
--- a/include/uapi/sound/intel/avs/tokens.h
+++ b/include/uapi/sound/intel/avs/tokens.h
@@ -21,6 +21,7 @@ enum avs_tplg_token {
 	AVS_TKN_MANIFEST_NUM_BINDINGS_U32		= 8,
 	AVS_TKN_MANIFEST_NUM_CONDPATH_TMPLS_U32		= 9,
 	AVS_TKN_MANIFEST_NUM_INIT_CONFIGS_U32		= 10,
+	AVS_TKN_MANIFEST_NUM_NHLT_CONFIGS_U32		= 11,
 
 	/* struct avs_tplg_library */
 	AVS_TKN_LIBRARY_ID_U32				= 101,
@@ -160,6 +161,10 @@ enum avs_tplg_token {
 	AVS_TKN_INIT_CONFIG_ID_U32			= 2401,
 	AVS_TKN_INIT_CONFIG_PARAM_U8			= 2402,
 	AVS_TKN_INIT_CONFIG_LENGTH_U32			= 2403,
+
+	/* struct avs_tplg_nhlt_config */
+	AVS_TKN_NHLT_CONFIG_ID_U32			= 2501,
+	AVS_TKN_NHLT_CONFIG_SIZE_U32			= 2502,
 };
 
 #endif
diff --git a/sound/soc/intel/avs/topology.c b/sound/soc/intel/avs/topology.c
index dfe8cf505381..48fdbaef56dd 100644
--- a/sound/soc/intel/avs/topology.c
+++ b/sound/soc/intel/avs/topology.c
@@ -420,6 +420,22 @@ static int parse_link_formatted_string(struct snd_soc_component *comp, void *ele
 	return 0;
 }
 
+static int avs_parse_nhlt_config_size(struct snd_soc_component *comp, void *elem, void *object,
+				      u32 offset)
+{
+	struct snd_soc_tplg_vendor_value_elem *tuple = elem;
+	struct acpi_nhlt_config **blob = (struct acpi_nhlt_config **)((u8 *)object + offset);
+	u32 size;
+
+	size = le32_to_cpu(tuple->value);
+	*blob = devm_kzalloc(comp->card->dev, struct_size(*blob, capabilities, size), GFP_KERNEL);
+	if (!*blob)
+		return -ENOMEM;
+
+	(*blob)->capabilities_size = size;
+	return 0;
+}
+
 static int
 parse_dictionary_header(struct snd_soc_component *comp,
 			struct snd_soc_tplg_vendor_array *tuples,
@@ -1651,12 +1667,14 @@ static const struct avs_tplg_token_parser mod_init_config_parsers[] = {
 
 static int avs_tplg_parse_initial_configs(struct snd_soc_component *comp,
 					   struct snd_soc_tplg_vendor_array *tuples,
-					   u32 block_size)
+					   u32 block_size, u32 *offset)
 {
 	struct avs_soc_component *acomp = to_avs_soc_component(comp);
 	struct avs_tplg *tplg = acomp->tplg;
 	int ret, i;
 
+	*offset = 0;
+
 	/* Parse tuple section telling how many init configs there are. */
 	ret = parse_dictionary_header(comp, tuples, (void **)&tplg->init_configs,
 				      &tplg->num_init_configs,
@@ -1666,6 +1684,7 @@ static int avs_tplg_parse_initial_configs(struct snd_soc_component *comp,
 		return ret;
 
 	block_size -= le32_to_cpu(tuples->size);
+	*offset += le32_to_cpu(tuples->size);
 	/* With header parsed, move on to parsing entries. */
 	tuples = avs_tplg_vendor_array_next(tuples);
 
@@ -1681,6 +1700,7 @@ static int avs_tplg_parse_initial_configs(struct snd_soc_component *comp,
 		 */
 		tmp = avs_tplg_vendor_array_next(tuples);
 		esize = le32_to_cpu(tuples->size) + le32_to_cpu(tmp->size);
+		*offset += esize;
 
 		ret = parse_dictionary_entries(comp, tuples, esize, config, 1, sizeof(*config),
 					       AVS_TKN_INIT_CONFIG_ID_U32,
@@ -1692,6 +1712,7 @@ static int avs_tplg_parse_initial_configs(struct snd_soc_component *comp,
 		/* handle raw data section */
 		init_config_data = (void *)tuples + esize;
 		esize = config->length;
+		*offset += esize;
 
 		config->data = devm_kmemdup(comp->card->dev, init_config_data, esize, GFP_KERNEL);
 		if (!config->data)
@@ -1704,6 +1725,70 @@ static int avs_tplg_parse_initial_configs(struct snd_soc_component *comp,
 	return 0;
 }
 
+static const struct avs_tplg_token_parser mod_nhlt_config_parsers[] = {
+	{
+		.token = AVS_TKN_NHLT_CONFIG_ID_U32,
+		.type = SND_SOC_TPLG_TUPLE_TYPE_WORD,
+		.offset = offsetof(struct avs_tplg_nhlt_config, id),
+		.parse = avs_parse_word_token,
+	},
+	{
+		.token = AVS_TKN_NHLT_CONFIG_SIZE_U32,
+		.type = SND_SOC_TPLG_TUPLE_TYPE_WORD,
+		.offset = offsetof(struct avs_tplg_nhlt_config, blob),
+		.parse = avs_parse_nhlt_config_size,
+	},
+};
+
+static int avs_tplg_parse_nhlt_configs(struct snd_soc_component *comp,
+				       struct snd_soc_tplg_vendor_array *tuples,
+				       u32 block_size)
+{
+	struct avs_soc_component *acomp = to_avs_soc_component(comp);
+	struct avs_tplg *tplg = acomp->tplg;
+	int ret, i;
+
+	/* Parse the header section to know how many entries there are. */
+	ret = parse_dictionary_header(comp, tuples, (void **)&tplg->nhlt_configs,
+				      &tplg->num_nhlt_configs,
+				      sizeof(*tplg->nhlt_configs),
+				      AVS_TKN_MANIFEST_NUM_NHLT_CONFIGS_U32);
+	if (ret)
+		return ret;
+
+	block_size -= le32_to_cpu(tuples->size);
+	/* With the header parsed, move on to parsing entries. */
+	tuples = avs_tplg_vendor_array_next(tuples);
+
+	for (i = 0; i < tplg->num_nhlt_configs && block_size > 0; i++) {
+		struct avs_tplg_nhlt_config *config;
+		u32 esize;
+
+		config = &tplg->nhlt_configs[i];
+		esize = le32_to_cpu(tuples->size);
+
+		ret = parse_dictionary_entries(comp, tuples, esize, config, 1, sizeof(*config),
+					       AVS_TKN_NHLT_CONFIG_ID_U32,
+					       mod_nhlt_config_parsers,
+					       ARRAY_SIZE(mod_nhlt_config_parsers));
+		if (ret)
+			return ret;
+		/* With tuples parsed, the blob shall be allocated. */
+		if (!config->blob)
+			return -EINVAL;
+
+		/* Consume the raw data and move to the next entry. */
+		memcpy(config->blob->capabilities, (u8 *)tuples + esize,
+		       config->blob->capabilities_size);
+		esize += config->blob->capabilities_size;
+
+		block_size -= esize;
+		tuples = avs_tplg_vendor_array_at(tuples, esize);
+	}
+
+	return 0;
+}
+
 static int avs_route_load(struct snd_soc_component *comp, int index,
 			  struct snd_soc_dapm_route *route)
 {
@@ -2008,11 +2093,26 @@ static int avs_manifest(struct snd_soc_component *comp, int index,
 	tuples = avs_tplg_vendor_array_at(tuples, offset);
 
 	/* Initial configs dictionary. */
-	ret = avs_tplg_parse_initial_configs(comp, tuples, remaining);
+	ret = avs_tplg_parse_initial_configs(comp, tuples, remaining, &offset);
 	if (ret < 0)
 		return ret;
 
-	return 0;
+	remaining -= offset;
+	tuples = avs_tplg_vendor_array_at(tuples, offset);
+
+	ret = avs_tplg_vendor_array_lookup(tuples, remaining,
+					   AVS_TKN_MANIFEST_NUM_NHLT_CONFIGS_U32, &offset);
+	if (ret == -ENOENT)
+		return 0;
+	if (ret) {
+		dev_err(comp->dev, "NHLT config lookup failed: %d\n", ret);
+		return ret;
+	}
+
+	tuples = avs_tplg_vendor_array_at(tuples, offset);
+
+	/* NHLT configs dictionary. */
+	return avs_tplg_parse_nhlt_configs(comp, tuples, remaining);
 }
 
 enum {
diff --git a/sound/soc/intel/avs/topology.h b/sound/soc/intel/avs/topology.h
index 1e83fccf2ea2..61d50960ef06 100644
--- a/sound/soc/intel/avs/topology.h
+++ b/sound/soc/intel/avs/topology.h
@@ -37,6 +37,8 @@ struct avs_tplg {
 	u32 num_condpath_tmpls;
 	struct avs_tplg_init_config *init_configs;
 	u32 num_init_configs;
+	struct avs_tplg_nhlt_config *nhlt_configs;
+	u32 num_nhlt_configs;
 
 	struct list_head path_tmpl_list;
 };
@@ -175,6 +177,11 @@ struct avs_tplg_init_config {
 	void *data;
 };
 
+struct avs_tplg_nhlt_config {
+	u32 id;
+	struct acpi_nhlt_config *blob;
+};
+
 struct avs_tplg_path {
 	u32 id;
 
-- 
cgit v1.2.3


From d5c8b7902a41625ea328b52c78ebe750fbf6fef7 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Sat, 15 Nov 2025 19:06:27 +0100
Subject: ASoC: Intel: avs: Honor NHLT override when setting up a path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case the topology provides an NHLT configuration, use it instead of
relying on the table in the ACPI tree. Only gateway-related modules, e.g.
Copier, care about the process. For those, the order of fetching the
hardware configuration becomes:

1) check if NHLT override is set,
2) check if NHLT descriptor override is set,
3) use NHLT from ACPI directly

Such an approach ensures no conflicts exist between 1) and 2) and that 1)
always takes precedence.

Co-developed-by: Amadeusz Sławiński <amade@asmblr.net>
Signed-off-by: Amadeusz Sławiński <amade@asmblr.net>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://patch.msgid.link/20251115180627.3589520-3-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/intel/avs/tokens.h |  1 +
 sound/soc/intel/avs/path.c            | 13 +++++++++----
 sound/soc/intel/avs/topology.c        |  7 +++++++
 sound/soc/intel/avs/topology.h        |  1 +
 4 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h
index f7cbbfb00227..3ff6d9150822 100644
--- a/include/uapi/sound/intel/avs/tokens.h
+++ b/include/uapi/sound/intel/avs/tokens.h
@@ -125,6 +125,7 @@ enum avs_tplg_token {
 	AVS_TKN_MOD_KCONTROL_ID_U32			= 1707,
 	AVS_TKN_MOD_INIT_CONFIG_NUM_IDS_U32		= 1708,
 	AVS_TKN_MOD_INIT_CONFIG_ID_U32			= 1709,
+	AVS_TKN_MOD_NHLT_CONFIG_ID_U32			= 1710,
 
 	/* struct avs_tplg_path_template */
 	AVS_TKN_PATH_TMPL_ID_U32			= 1801,
diff --git a/sound/soc/intel/avs/path.c b/sound/soc/intel/avs/path.c
index 7aa20fcf1a33..c8b586aced20 100644
--- a/sound/soc/intel/avs/path.c
+++ b/sound/soc/intel/avs/path.c
@@ -210,9 +210,11 @@ int avs_path_set_constraint(struct avs_dev *adev, struct avs_tplg_path_template
 					continue;
 				}
 
-				blob = avs_nhlt_config_or_default(adev, module_template);
-				if (IS_ERR(blob))
-					continue;
+				if (!module_template->nhlt_config) {
+					blob = avs_nhlt_config_or_default(adev, module_template);
+					if (IS_ERR(blob))
+						continue;
+				}
 
 				rlist[i] = path_template->fe_fmt->sampling_freq;
 				clist[i] = path_template->fe_fmt->num_channels;
@@ -382,7 +384,10 @@ static int avs_fill_gtw_config(struct avs_dev *adev, struct avs_copier_gtw_cfg *
 	struct acpi_nhlt_config *blob;
 	size_t gtw_size;
 
-	blob = avs_nhlt_config_or_default(adev, t);
+	if (t->nhlt_config)
+		blob = t->nhlt_config->blob;
+	else
+		blob = avs_nhlt_config_or_default(adev, t);
 	if (IS_ERR(blob))
 		return PTR_ERR(blob);
 
diff --git a/sound/soc/intel/avs/topology.c b/sound/soc/intel/avs/topology.c
index 48fdbaef56dd..9033f683393c 100644
--- a/sound/soc/intel/avs/topology.c
+++ b/sound/soc/intel/avs/topology.c
@@ -350,6 +350,7 @@ AVS_DEFINE_PTR_PARSER(modcfg_base, struct avs_tplg_modcfg_base, modcfgs_base);
 AVS_DEFINE_PTR_PARSER(modcfg_ext, struct avs_tplg_modcfg_ext, modcfgs_ext);
 AVS_DEFINE_PTR_PARSER(pplcfg, struct avs_tplg_pplcfg, pplcfgs);
 AVS_DEFINE_PTR_PARSER(binding, struct avs_tplg_binding, bindings);
+AVS_DEFINE_PTR_PARSER(nhlt_config, struct avs_tplg_nhlt_config, nhlt_configs);
 
 static int
 parse_audio_format_bitfield(struct snd_soc_component *comp, void *elem, void *object, u32 offset)
@@ -1200,6 +1201,12 @@ static const struct avs_tplg_token_parser module_parsers[] = {
 		.offset = offsetof(struct avs_tplg_module, num_config_ids),
 		.parse = avs_parse_byte_token,
 	},
+	{
+		.token = AVS_TKN_MOD_NHLT_CONFIG_ID_U32,
+		.type = SND_SOC_TPLG_TUPLE_TYPE_WORD,
+		.offset = offsetof(struct avs_tplg_module, nhlt_config),
+		.parse = avs_parse_nhlt_config_ptr,
+	},
 };
 
 static const struct avs_tplg_token_parser init_config_parsers[] = {
diff --git a/sound/soc/intel/avs/topology.h b/sound/soc/intel/avs/topology.h
index 61d50960ef06..1cf7455b6c01 100644
--- a/sound/soc/intel/avs/topology.h
+++ b/sound/soc/intel/avs/topology.h
@@ -223,6 +223,7 @@ struct avs_tplg_module {
 	u32 ctl_id;
 	u32 num_config_ids;
 	u32 *config_ids;
+	struct avs_tplg_nhlt_config *nhlt_config;
 
 	struct avs_tplg_pipeline *owner;
 	/* Pipeline modules management. */
-- 
cgit v1.2.3


From 4d5c668c268b7812ff15452d303974ce247ad378 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 11 Nov 2025 00:17:48 +0000
Subject: ASoC: soc.h: convert to snd_soc_dapm_xxx()

This patch converts below functions.

dapm->dev					-> snd_soc_dapm_to_dev()
dapm->card					-> snd_soc_dapm_to_card()
dapm->component					-> snd_soc_dapm_to_component()

dapm_kcontrol_get_value()			-> snd_soc_dapm_kcontrol_get_value()

snd_soc_component_enable_pin()			-> snd_soc_dapm_enable_pin()
snd_soc_component_enable_pin_unlocked()		-> snd_soc_dapm_enable_pin_unlocked()
snd_soc_component_disable_pin()			-> snd_soc_dapm_disable_pin()
snd_soc_component_disable_pin_unlocked()	-> snd_soc_dapm_disable_pin_unlocked()
snd_soc_component_nc_pin()			-> snd_soc_dapm_nc_pin()
snd_soc_component_nc_pin_unlocked()		-> snd_soc_dapm_nc_pin_unlocked()
snd_soc_component_get_pin_status()		-> snd_soc_dapm_get_pin_status()
snd_soc_component_force_enable_pin()		-> snd_soc_dapm_force_enable_pin()
snd_soc_component_force_enable_pin_unlocked()	-> snd_soc_dapm_force_enable_pin_unlocked()
snd_soc_component_force_bias_level()		-> snd_soc_dapm_force_bias_level()
snd_soc_component_get_bias_level()		-> snd_soc_dapm_get_bias_level()
snd_soc_component_init_bias_level()		-> snd_soc_dapm_init_bias_level()
snd_soc_component_get_dapm()			-> snd_soc_component_to_dapm()

snd_soc_dapm_kcontrol_component()		-> snd_soc_dapm_kcontrol_to_component()
snd_soc_dapm_kcontrol_widget()			-> snd_soc_dapm_kcontrol_to_widget()
snd_soc_dapm_kcontrol_dapm()			-> snd_soc_dapm_kcontrol_to_dapm()
snd_soc_dapm_np_pin()				-> snd_soc_dapm_disable_pin()

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/874ir1a0cz.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 53b4129ee97a..37dc6f6fc63f 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1467,22 +1467,22 @@ static inline void _snd_soc_dapm_mutex_assert_held_c(struct snd_soc_card *card)
 
 static inline void _snd_soc_dapm_mutex_lock_root_d(struct snd_soc_dapm_context *dapm)
 {
-	_snd_soc_dapm_mutex_lock_root_c(dapm->card);
+	_snd_soc_dapm_mutex_lock_root_c(snd_soc_dapm_to_card(dapm));
 }
 
 static inline void _snd_soc_dapm_mutex_lock_d(struct snd_soc_dapm_context *dapm)
 {
-	_snd_soc_dapm_mutex_lock_c(dapm->card);
+	_snd_soc_dapm_mutex_lock_c(snd_soc_dapm_to_card(dapm));
 }
 
 static inline void _snd_soc_dapm_mutex_unlock_d(struct snd_soc_dapm_context *dapm)
 {
-	_snd_soc_dapm_mutex_unlock_c(dapm->card);
+	_snd_soc_dapm_mutex_unlock_c(snd_soc_dapm_to_card(dapm));
 }
 
 static inline void _snd_soc_dapm_mutex_assert_held_d(struct snd_soc_dapm_context *dapm)
 {
-	_snd_soc_dapm_mutex_assert_held_c(dapm->card);
+	_snd_soc_dapm_mutex_assert_held_c(snd_soc_dapm_to_card(dapm));
 }
 
 #define snd_soc_dapm_mutex_lock_root(x) _Generic((x),			\
-- 
cgit v1.2.3


From 8855eb7d29400fb7b2882da33725db2801c410e4 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 11 Nov 2025 00:17:52 +0000
Subject: ASoC: asoc.h: convert to snd_soc_dapm_xxx()

This patch converts below functions.

dapm->dev					-> snd_soc_dapm_to_dev()
dapm->card					-> snd_soc_dapm_to_card()
dapm->component					-> snd_soc_dapm_to_component()

dapm_kcontrol_get_value()			-> snd_soc_dapm_kcontrol_get_value()

snd_soc_component_enable_pin()			-> snd_soc_dapm_enable_pin()
snd_soc_component_enable_pin_unlocked()		-> snd_soc_dapm_enable_pin_unlocked()
snd_soc_component_disable_pin()			-> snd_soc_dapm_disable_pin()
snd_soc_component_disable_pin_unlocked()	-> snd_soc_dapm_disable_pin_unlocked()
snd_soc_component_nc_pin()			-> snd_soc_dapm_nc_pin()
snd_soc_component_nc_pin_unlocked()		-> snd_soc_dapm_nc_pin_unlocked()
snd_soc_component_get_pin_status()		-> snd_soc_dapm_get_pin_status()
snd_soc_component_force_enable_pin()		-> snd_soc_dapm_force_enable_pin()
snd_soc_component_force_enable_pin_unlocked()	-> snd_soc_dapm_force_enable_pin_unlocked()
snd_soc_component_force_bias_level()		-> snd_soc_dapm_force_bias_level()
snd_soc_component_get_bias_level()		-> snd_soc_dapm_get_bias_level()
snd_soc_component_init_bias_level()		-> snd_soc_dapm_init_bias_level()
snd_soc_component_get_dapm()			-> snd_soc_component_to_dapm()

snd_soc_dapm_kcontrol_component()		-> snd_soc_dapm_kcontrol_to_component()
snd_soc_dapm_kcontrol_widget()			-> snd_soc_dapm_kcontrol_to_widget()
snd_soc_dapm_kcontrol_dapm()			-> snd_soc_dapm_kcontrol_to_dapm()
snd_soc_dapm_np_pin()				-> snd_soc_dapm_disable_pin()

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/87346la0cv.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/trace/events/asoc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h
index 6696dbcc2b96..4a645549164e 100644
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -27,8 +27,8 @@ DECLARE_EVENT_CLASS(snd_soc_dapm,
 	TP_ARGS(dapm, val),
 
 	TP_STRUCT__entry(
-		__string(	card_name,	dapm->card->name)
-		__string(	comp_name,	dapm->component ? dapm->component->name : "(none)")
+		__string(	card_name,	snd_soc_dapm_to_card(dapm)->name)
+		__string(	comp_name,	snd_soc_dapm_to_component(dapm) ? snd_soc_dapm_to_component(dapm)->name : "(none)")
 		__field(	int,		val)
 	),
 
-- 
cgit v1.2.3


From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 29 Sep 2025 11:46:43 +0100
Subject: mm/thp: drop follow_devmap_pmd() default stub

follow_devmap_pmd() has already been dropped by commit fd2825b0760a
("mm/gup: remove pXX_devmap usage from get_user_pages()").  The fallback
stub in the header, which is now redundant, can be dropped as well.

Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 71ac78b9f834..fee4cf7fa300 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
 	return;
 }
 
-static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
-	unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
-	return NULL;
-}
-
 static inline bool thp_migration_supported(void)
 {
 	return false;
-- 
cgit v1.2.3


From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 7 Oct 2025 14:20:29 +0200
Subject: mm/vmalloc: defer freeing partly initialized vm_struct

__vmalloc_area_node() may call free_vmap_area() or vfree() on error paths,
both of which can sleep.  This becomes problematic if the function is
invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is
passed via gfp_mask.

To fix this, unify error paths and defer the cleanup of partly initialized
vm_struct objects to a workqueue.  This ensures that freeing happens in a
process context and avoids invalid sleeps in atomic regions.

Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmalloc.h |  6 +++++-
 mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb54b7b3202f..1e43181369f1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
 #endif
 
 struct vm_struct {
-	struct vm_struct	*next;
+	union {
+		struct vm_struct *next;	  /* Early registration of vm_areas. */
+		struct llist_node llnode; /* Asynchronous freeing on error paths. */
+	};
+
 	void			*addr;
 	unsigned long		size;
 	unsigned long		flags;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d83c01caaabe..9e29dd767c41 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3687,6 +3687,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 	return nr_allocated;
 }
 
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+	struct vm_struct *area, *tmp;
+	struct llist_node *head;
+
+	head = llist_del_all(&pending_vm_area_cleanup);
+	if (!head)
+		return;
+
+	llist_for_each_entry_safe(area, tmp, head, llnode) {
+		if (!area->pages)
+			free_vm_area(area);
+		else
+			vfree(area->addr);
+	}
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+		schedule_work(&cleanup_vm_area);
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, unsigned int page_shift,
 				 int node)
@@ -3718,8 +3747,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, failed to allocated page array size %lu",
 			nr_small_pages * PAGE_SIZE, array_size);
-		free_vm_area(area);
-		return NULL;
+		goto fail;
 	}
 
 	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3796,7 +3824,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	return area->addr;
 
 fail:
-	vfree(area->addr);
+	defer_vm_area_cleanup(area);
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 7 Oct 2025 14:20:30 +0200
Subject: mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node()

Make __vmalloc_area_node() respect non-blocking GFP masks such as
GFP_ATOMIC and GFP_NOWAIT.

- Add memalloc_apply_gfp_scope()/memalloc_restore_scope()
  helpers to apply a proper scope.
- Apply memalloc_apply_gfp_scope()/memalloc_restore_scope()
  around vmap_pages_range() for page table setup.
- Set "nofail" to false if a non-blocking mask is used, as
  they are mutually exclusive.

This is particularly important for page table allocations that internally
use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are
applied.  For example:

<snip>
__pte_alloc_kernel()
  pte_alloc_one_kernel(&init_mm);
    pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0);
<snip>

Note: in most cases, PTE entries are established only up to the level
required by current vmap space usage, meaning the page tables are
typically fully populated during the mapping process.

Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmalloc.h |  2 ++
 mm/vmalloc.c            | 52 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1e43181369f1..e8e94f90d686 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object);
 static inline bool vmalloc_dump_obj(void *object) { return false; }
 #endif
 
+unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask);
+void memalloc_restore_scope(unsigned int flags);
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9e29dd767c41..d8bcd87239b5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3716,6 +3716,42 @@ static void defer_vm_area_cleanup(struct vm_struct *area)
 		schedule_work(&cleanup_vm_area);
 }
 
+/*
+ * Page tables allocations ignore external GFP. Enforces it by
+ * the memalloc scope API. It is used by vmalloc internals and
+ * KASAN shadow population only.
+ *
+ * GFP to scope mapping:
+ *
+ * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
+ * GFP_NOFS - memalloc_nofs_save()
+ * GFP_NOIO - memalloc_noio_save()
+ *
+ * Returns a flag cookie to pair with restore.
+ */
+unsigned int
+memalloc_apply_gfp_scope(gfp_t gfp_mask)
+{
+	unsigned int flags = 0;
+
+	if (!gfpflags_allow_blocking(gfp_mask))
+		flags = memalloc_noreclaim_save();
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+		flags = memalloc_nofs_save();
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+		flags = memalloc_noio_save();
+
+	/* 0 - no scope applied. */
+	return flags;
+}
+
+void
+memalloc_restore_scope(unsigned int flags)
+{
+	if (flags)
+		memalloc_flags_restore(flags);
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, unsigned int page_shift,
 				 int node)
@@ -3732,6 +3768,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
 
+	/* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */
+	if (!gfpflags_allow_blocking(gfp_mask))
+		nofail = false;
+
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
 
@@ -3797,22 +3837,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	 * page tables allocations ignore external gfp mask, enforce it
 	 * by the scope API
 	 */
-	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
-		flags = memalloc_nofs_save();
-	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
-		flags = memalloc_noio_save();
-
+	flags = memalloc_apply_gfp_scope(gfp_mask);
 	do {
 		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
 			page_shift);
 		if (nofail && (ret < 0))
 			schedule_timeout_uninterruptible(1);
 	} while (nofail && (ret < 0));
-
-	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
-		memalloc_nofs_restore(flags);
-	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
-		memalloc_noio_restore(flags);
+	memalloc_restore_scope(flags);
 
 	if (ret < 0) {
 		warn_alloc(gfp_mask, NULL,
-- 
cgit v1.2.3


From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 7 Oct 2025 14:20:32 +0200
Subject: kmsan: remove hard-coded GFP_KERNEL flags

kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays
with GFP_KERNEL, which may sleep.  This is inconsistent with vmalloc() as
it will support non-blocking requests later.

Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use
it internally for its demand.

Please note, the subsequent __vmap_pages_range_noflush() still uses
GFP_KERNEL and can sleep.  If a caller runs under reclaim constraints
where sleeping is forbidden, it must establish the appropriate scope
itself using the memalloc scope API.

Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmsan.h |  6 ++++--
 mm/internal.h         |  4 ++--
 mm/kmsan/shadow.c     |  6 +++---
 mm/percpu-vm.c        |  2 +-
 mm/vmalloc.c          | 26 +++++++++++++++++---------
 5 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index f2fd221107bb..7da9fd506b39 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr);
  * @prot:	page protection flags used for vmap.
  * @pages:	array of pages.
  * @page_shift:	page_shift passed to vmap_range_noflush().
+ * @gfp_mask:	gfp_mask to use internally.
  *
  * KMSAN maps shadow and origin pages of @pages into contiguous ranges in
  * vmalloc metadata address range. Returns 0 on success, callers must check
@@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start,
 						unsigned long end,
 						pgprot_t prot,
 						struct page **pages,
-						unsigned int page_shift);
+						unsigned int page_shift,
+						gfp_t gfp_mask);
 
 /**
  * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr)
 
 static inline int __must_check kmsan_vmap_pages_range_noflush(
 	unsigned long start, unsigned long end, pgprot_t prot,
-	struct page **pages, unsigned int page_shift)
+	struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
 {
 	return 0;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..e623c8103358 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 #ifdef CONFIG_MMU
 void __init vmalloc_init(void);
 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
-                pgprot_t prot, struct page **pages, unsigned int page_shift);
+	pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask);
 unsigned int get_vm_area_page_order(struct vm_struct *vm);
 #else
 static inline void vmalloc_init(void)
@@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void)
 
 static inline
 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
-                pgprot_t prot, struct page **pages, unsigned int page_shift)
+	pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
 {
 	return -EINVAL;
 }
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 55fdea199aaf..e7f554a31bb4 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
 
 int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
 				   pgprot_t prot, struct page **pages,
-				   unsigned int page_shift)
+				   unsigned int page_shift, gfp_t gfp_mask)
 {
 	unsigned long shadow_start, origin_start, shadow_end, origin_end;
 	struct page **s_pages, **o_pages;
@@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
 		return 0;
 
 	nr = (end - start) / PAGE_SIZE;
-	s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
-	o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
+	s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask);
+	o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask);
 	if (!s_pages || !o_pages) {
 		err = -ENOMEM;
 		goto ret;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index cd69caf6aa8d..4f5937090590 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 			    int nr_pages)
 {
 	return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
-					PAGE_KERNEL, pages, PAGE_SHIFT);
+			PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL);
 }
 
 /**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d8bcd87239b5..d7e7049e01f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -671,16 +671,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 }
 
 int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
-		pgprot_t prot, struct page **pages, unsigned int page_shift)
+		pgprot_t prot, struct page **pages, unsigned int page_shift,
+		gfp_t gfp_mask)
 {
 	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
-						 page_shift);
+						page_shift, gfp_mask);
 
 	if (ret)
 		return ret;
 	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
 }
 
+static int __vmap_pages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift,
+		gfp_t gfp_mask)
+{
+	int err;
+
+	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask);
+	flush_cache_vmap(addr, end);
+	return err;
+}
+
 /**
  * vmap_pages_range - map pages to a kernel virtual address
  * @addr: start of the VM area to map
@@ -696,11 +708,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 int vmap_pages_range(unsigned long addr, unsigned long end,
 		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
-	int err;
-
-	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
-	flush_cache_vmap(addr, end);
-	return err;
+	return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
 }
 
 static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
@@ -3839,8 +3847,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	 */
 	flags = memalloc_apply_gfp_scope(gfp_mask);
 	do {
-		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
-			page_shift);
+		ret = __vmap_pages_range(addr, addr + size, prot, area->pages,
+				page_shift, nested_gfp);
 		if (nofail && (ret < 0))
 			schedule_timeout_uninterruptible(1);
 	} while (nofail && (ret < 0));
-- 
cgit v1.2.3


From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 7 Oct 2025 14:20:33 +0200
Subject: mm: skip might_alloc() warnings when PF_MEMALLOC is set

might_alloc() catches invalid blocking allocations in contexts where
sleeping is not allowed.

However when PF_MEMALLOC is set, the page allocator already skips reclaim
and other blocking paths.  In such cases, a blocking gfp_mask does not
actually lead to blocking, so triggering might_alloc() splats is
misleading.

Adjust might_alloc() to skip warnings when the current task has
PF_MEMALLOC set, matching the allocator's actual blocking behaviour.

Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/mm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 0232d983b715..a74582aed747 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask)
 	fs_reclaim_acquire(gfp_mask);
 	fs_reclaim_release(gfp_mask);
 
+	if (current->flags & PF_MEMALLOC)
+		return;
+
 	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
 }
 
-- 
cgit v1.2.3


From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001
From: xu xin <xu.xin16@zte.com.cn>
Date: Tue, 7 Oct 2025 18:28:21 +0800
Subject: mm/ksm: fix exec/fork inheritance support for prctl

Patch series "ksm: fix exec/fork inheritance", v2.

This series fixes exec/fork inheritance.  See the detailed description of
the issue below.


This patch (of 2):

Background
==========

commit d7597f59d1d33 ("mm: add new api to enable ksm per process")
introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by
prctl() so that the process's VMAs are forcibly scanned by ksmd.

Subsequently, commit 3c6f33b7273a ("mm/ksm: support fork/exec for prctl")
supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve().

Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl")
fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY
by adding the mm_slot to ksm_mm_head in __bprm_mm_init().

Problem
=======

In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY
during exec/fork can fail.  For example, when the scanning frequency of
ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may
still fail to pass it to the newly exec'd process.  This happens because
ksm_execve() is executed too early in the do_execve flow (prematurely
adding the new mm_struct to the ksm_mm_slot list).

As a result, before do_execve completes, ksmd may have already performed a
scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus
clearing its MMF_VM_MERGE_ANY flag.  Consequently, when the new program
executes, the inherited MMF_VM_MERGE_ANY flag has already been lost.

Root reason
===========

Commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clears
the flag MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE VMAs.

Solution
========

Firstly, don't clear MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE
VMAs, because the mm_struct may have only just been added to the
ksm_mm_slot list, and its process may not yet have officially started
running or performed mmap/brk to allocate anonymous VMAs.

Secondly, in ksm_vma_flags(), recheck MMF_VM_MERGEABLE if the process has
MMF_VM_MERGE_ANY set and, if it has been cleared, call __ksm_enter() to
create an mm_slot and add it to ksm_scan_list again.

Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn
Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn
Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl")
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process")
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Cc: Stefan Roesch <shr@devkernel.io>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jinjiang Tu <tujinjiang@huawei.com>
Cc: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h |  4 ++--
 mm/ksm.c            | 20 +++++++++++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 067538fc4d58..c982694c987b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -17,7 +17,7 @@
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, vm_flags_t *vm_flags);
-vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
 			 vm_flags_t vm_flags);
 int ksm_enable_merge_any(struct mm_struct *mm);
 int ksm_disable_merge_any(struct mm_struct *mm);
@@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm);
 
 #else  /* !CONFIG_KSM */
 
-static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm,
 		const struct file *file, vm_flags_t vm_flags)
 {
 	return vm_flags;
diff --git a/mm/ksm.c b/mm/ksm.c
index cdefba633856..4f672f4f2140 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2712,8 +2712,14 @@ no_vmas:
 		spin_unlock(&ksm_mmlist_lock);
 
 		mm_slot_free(mm_slot_cache, mm_slot);
+		/*
+		 * Only clear MMF_VM_MERGEABLE. We must not clear
+		 * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
+		 * perhaps their mm_struct has just been added to ksm_mm_slot
+		 * list, and its process has not yet officially started running
+		 * or has not yet performed mmap/brk to allocate anonymous VMAS.
+		 */
 		mm_flags_clear(MMF_VM_MERGEABLE, mm);
-		mm_flags_clear(MMF_VM_MERGE_ANY, mm);
 		mmap_read_unlock(mm);
 		mmdrop(mm);
 	} else {
@@ -2831,12 +2837,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
  *
  * Returns: @vm_flags possibly updated to mark mergeable.
  */
-vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
 			 vm_flags_t vm_flags)
 {
 	if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
-	    __ksm_should_add_vma(file, vm_flags))
+	    __ksm_should_add_vma(file, vm_flags)) {
 		vm_flags |= VM_MERGEABLE;
+		/*
+		 * Generally, the flags here always include MMF_VM_MERGEABLE.
+		 * However, in rare cases, this flag may be cleared by ksmd who
+		 * scans a cycle without finding any mergeable vma.
+		 */
+		if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm)))
+			__ksm_enter(mm);
+	}
 
 	return vm_flags;
 }
-- 
cgit v1.2.3


From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 3 Oct 2025 16:53:04 +0100
Subject: mm: consistently use current->mm in mm_get_unmapped_area()

mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() /
arch_get_unmapped_area_topdown(), both of which search current->mm for
some free space.  Neither takes an mm_struct - they implicitly operate on
current->mm.

But the wrapper takes an mm_struct and uses it to decide whether to search
bottom up or top down.  All callers pass in current->mm for this, so
everything is working consistently.  But it feels like an accident waiting
to happen; eventually someone will call that function with a different mm,
expecting to find free space in it, but what gets returned is free space
in the current mm.

So let's simplify by removing the parameter and have the wrapper use
current->mm to decide which end to start at.  Now everything is consistent
and self-documenting.

Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/sparc/kernel/sys_sparc_64.c |  6 +++---
 arch/x86/kernel/cpu/sgx/driver.c |  2 +-
 drivers/char/mem.c               |  2 +-
 drivers/dax/device.c             |  5 ++---
 fs/hugetlbfs/inode.c             |  3 +--
 fs/proc/inode.c                  |  2 +-
 fs/ramfs/file-mmu.c              |  2 +-
 include/linux/sched/mm.h         |  9 ++++-----
 io_uring/memmap.c                |  2 +-
 kernel/bpf/arena.c               |  2 +-
 kernel/bpf/syscall.c             |  2 +-
 mm/huge_memory.c                 |  4 ++--
 mm/mmap.c                        | 17 +++++++----------
 mm/shmem.c                       |  8 +++-----
 14 files changed, 29 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 55faf2effa46..dbf118b40601 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 
 	if (flags & MAP_FIXED) {
 		/* Ok, don't mess with it. */
-		return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
+		return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags);
 	}
 	flags &= ~MAP_SHARED;
 
@@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 		align_goal = (64UL * 1024);
 
 	do {
-		addr = mm_get_unmapped_area(current->mm, NULL, orig_addr,
+		addr = mm_get_unmapped_area(NULL, orig_addr,
 					    len + (align_goal - PAGE_SIZE), pgoff, flags);
 		if (!(addr & ~PAGE_MASK)) {
 			addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL);
@@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 	 * be obtained.
 	 */
 	if (addr & ~PAGE_MASK)
-		addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
+		addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags);
 
 	return addr;
 }
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 7f8d1e11dbee..3b3efadb8cae 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
 	if (flags & MAP_FIXED)
 		return addr;
 
-	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 34b815901b20..db1ca53a6d01 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -542,7 +542,7 @@ static unsigned long get_unmapped_area_zero(struct file *file,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	return thp_get_unmapped_area(file, addr, len, pgoff, flags);
 #else
-	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
 #endif
 }
 #endif /* CONFIG_MMU */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 2bb40a6060af..7f1ed0db8337 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -330,14 +330,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 	if ((off + len_align) < off)
 		goto out;
 
-	addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align,
-					  pgoff, flags);
+	addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags);
 	if (!IS_ERR_VALUE(addr_align)) {
 		addr_align += (off - addr_align) & (align - 1);
 		return addr_align;
 	}
  out:
-	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
 }
 
 static const struct address_space_operations dev_dax_aops = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f42548ee9083..ce8e40d35032 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -184,8 +184,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr)
 		addr0 = ALIGN(addr, huge_page_size(h));
 
-	return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff,
-					    flags, 0);
+	return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0);
 }
 
 /*
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d9b7ef122343..2d3425cfa94b 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
 		return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
 
 #ifdef CONFIG_MMU
-	return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
+	return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags);
 #endif
 
 	return orig_addr;
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index b11f5b20b78b..c3ed1c5117b2 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
 		unsigned long addr, unsigned long len, unsigned long pgoff,
 		unsigned long flags)
 {
-	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
 }
 
 const struct file_operations ramfs_file_operations = {
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index a74582aed747..0e1d73955fa5 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			       unsigned long len, unsigned long pgoff,
 			       unsigned long flags, vm_flags_t);
 
-unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
-				   unsigned long addr, unsigned long len,
-				   unsigned long pgoff, unsigned long flags);
+unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr,
+				   unsigned long len, unsigned long pgoff,
+				   unsigned long flags);
 
-unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
-					   struct file *filp,
+unsigned long mm_get_unmapped_area_vmflags(struct file *filp,
 					   unsigned long addr,
 					   unsigned long len,
 					   unsigned long pgoff,
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index add03ca75cb9..63fcfa757bb8 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -387,7 +387,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 #else
 	addr = 0UL;
 #endif
-	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
 }
 
 #else /* !CONFIG_MMU */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1074ac4459f2..872dc0e41c65 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
 			return -EINVAL;
 	}
 
-	ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
+	ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags);
 	if (IS_ERR_VALUE(ret))
 		return ret;
 	if ((ret >> 32) == ((ret + len - 1) >> 32))
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a129746bd6c..d77685f2c6cb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr
 	if (map->ops->map_get_unmapped_area)
 		return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
 #ifdef CONFIG_MMU
-	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
 #else
 	return addr;
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2f2a521e5d68..32479ae27400 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1127,7 +1127,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	if (len_pad < len || (off + len_pad) < off)
 		return 0;
 
-	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
+	ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad,
 					   off >> PAGE_SHIFT, flags, vm_flags);
 
 	/*
@@ -1164,7 +1164,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 	if (ret)
 		return ret;
 
-	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
+	return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags,
 					    vm_flags);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 5fd3b80fda1d..644f02071a41 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 }
 #endif
 
-unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
-					   unsigned long addr, unsigned long len,
-					   unsigned long pgoff, unsigned long flags,
-					   vm_flags_t vm_flags)
+unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+					   unsigned long len, unsigned long pgoff,
+					   unsigned long flags, vm_flags_t vm_flags)
 {
-	if (mm_flags_test(MMF_TOPDOWN, mm))
+	if (mm_flags_test(MMF_TOPDOWN, current->mm))
 		return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
 						      flags, vm_flags);
 	return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		addr = thp_get_unmapped_area_vmflags(file, addr, len,
 						     pgoff, flags, vm_flags);
 	} else {
-		addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
+		addr = mm_get_unmapped_area_vmflags(file, addr, len,
 						    pgoff, flags, vm_flags);
 	}
 	if (IS_ERR_VALUE(addr))
@@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 }
 
 unsigned long
-mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
-		     unsigned long addr, unsigned long len,
+mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		     unsigned long pgoff, unsigned long flags)
 {
-	return mm_get_unmapped_area_vmflags(mm, file, addr, len,
-					    pgoff, flags, 0);
+	return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0);
 }
 EXPORT_SYMBOL(mm_get_unmapped_area);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 58701d14dd96..0eecb486a0cb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2759,8 +2759,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
-				    flags);
+	addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);
 
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return addr;
@@ -2838,8 +2837,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 	if (inflated_len < len)
 		return addr;
 
-	inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
-					     inflated_len, 0, flags);
+	inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags);
 	if (IS_ERR_VALUE(inflated_addr))
 		return addr;
 	if (inflated_addr & ~PAGE_MASK)
@@ -5775,7 +5773,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long addr, unsigned long len,
 				      unsigned long pgoff, unsigned long flags)
 {
-	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
 }
 #endif
 
-- 
cgit v1.2.3


From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Date: Thu, 9 Oct 2025 20:54:03 +0500
Subject: kasan: cleanup of kasan_enabled() checks

Deduplication of kasan_enabled() checks which are already used by callers.

* Altered functions:

check_page_allocation
	Delete the check because callers have it already in __wrappers in
	include/linux/kasan.h:
		__kasan_kfree_large
		__kasan_mempool_poison_pages
		__kasan_mempool_poison_object

kasan_populate_vmalloc, kasan_release_vmalloc
	Add __wrappers in include/linux/kasan.h.
	They are called externally in mm/vmalloc.c.

__kasan_unpoison_vmalloc, __kasan_poison_vmalloc
	Delete checks because there're already kasan_enabled() checks
	in respective __wrappers in include/linux/kasan.h.

release_free_meta -- Delete the check because the higher caller path
	has it already. See the stack trace:

	__kasan_slab_free -- has the check already
	__kasan_mempool_poison_object -- has the check already
		poison_slab_object
			kasan_save_free_info
				release_free_meta
					kasan_enabled() -- Delete here

Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 20 ++++++++++++++++++--
 mm/kasan/common.c     |  3 ---
 mm/kasan/generic.c    |  3 ---
 mm/kasan/shadow.c     | 20 ++++----------------
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d12e1a5f5a9a..f335c1d7b61d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { }
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
 void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
+int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
+static inline int kasan_populate_vmalloc(unsigned long addr,
+					 unsigned long size, gfp_t gfp_mask)
+{
+	if (kasan_enabled())
+		return __kasan_populate_vmalloc(addr, size, gfp_mask);
+	return 0;
+}
+void __kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
 			   unsigned long free_region_end,
 			   unsigned long flags);
+static inline void kasan_release_vmalloc(unsigned long start, unsigned long end,
+			   unsigned long free_region_start,
+			   unsigned long free_region_end,
+			   unsigned long flags)
+{
+	if (kasan_enabled())
+		return __kasan_release_vmalloc(start, end, free_region_start,
+					 free_region_end, flags);
+}
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index d4c14359feaf..22e5d67ff064 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
 
 static inline bool check_page_allocation(void *ptr, unsigned long ip)
 {
-	if (!kasan_enabled())
-		return false;
-
 	if (ptr != page_address(virt_to_head_page(ptr))) {
 		kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
 		return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 516b49accc4f..2b8e73f5f6a7 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta)
 
 static void release_free_meta(const void *object, struct kasan_free_meta *meta)
 {
-	if (!kasan_enabled())
-		return;
-
 	/* Check if free meta is valid. */
 	if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META)
 		return;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index a30d84bfdd52..29a751a8a08d 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask
 	return 0;
 }
 
-static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
+static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask)
 {
 	unsigned long nr_pages, nr_total = PFN_UP(end - start);
 	struct vmalloc_populate_data data;
@@ -395,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_
 	return ret;
 }
 
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
+int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
 {
 	unsigned long shadow_start, shadow_end;
 	int ret;
 
-	if (!kasan_enabled())
-		return 0;
-
 	if (!is_vmalloc_or_module_addr((void *)addr))
 		return 0;
 
@@ -424,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas
 	shadow_start = PAGE_ALIGN_DOWN(shadow_start);
 	shadow_end = PAGE_ALIGN(shadow_end);
 
-	ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
+	ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask);
 	if (ret)
 		return ret;
 
@@ -566,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  * pages entirely covered by the free region, we will not run in to any
  * trouble - any simultaneous allocations will be for disjoint regions.
  */
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
+void __kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
 			   unsigned long free_region_end,
 			   unsigned long flags)
@@ -575,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 	unsigned long region_start, region_end;
 	unsigned long size;
 
-	if (!kasan_enabled())
-		return;
-
 	region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
 	region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
 
@@ -626,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
 	 * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
 	 */
 
-	if (!kasan_enabled())
-		return (void *)start;
-
 	if (!is_vmalloc_or_module_addr(start))
 		return (void *)start;
 
@@ -651,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
  */
 void __kasan_poison_vmalloc(const void *start, unsigned long size)
 {
-	if (!kasan_enabled())
-		return;
-
 	if (!is_vmalloc_or_module_addr(start))
 		return;
 
-- 
cgit v1.2.3


From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Tue, 14 Oct 2025 21:09:16 +0530
Subject: drivers/base/node: fold register_node() into register_one_node()

Patch series "drivers/base/node: fold node register and unregister
functions", v2.

The first patch merges register_one_node() and register_node(), leaving a
single register_node() function.

The second patch merges unregister_one_node() and unregister_node(),
leaving a single unregister_node() function.

There are no functional changes in these patches.


This patch (of 2):

register_node() is only called from register_one_node().  This patch folds
register_node() into its only caller and renames register_one_node() to
register_node().

This reduces unnecessary indirection and simplifies the code structure.
No functional changes are introduced.

[akpm@linux-foundation.org: fix kerneldoc, per David]
Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Aboorva Devarajan <aboorvad@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/platforms/pseries/pci_dlpar.c |  2 +-
 arch/x86/mm/numa.c                         |  4 +--
 drivers/base/node.c                        | 52 ++++++++++++------------------
 include/linux/node.h                       |  4 +--
 mm/memory_hotplug.c                        |  4 +--
 mm/mm_init.c                               |  2 +-
 6 files changed, 28 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index aeb8633a3d00..8c77ec7980de 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
 	nid = of_node_to_nid(dn);
 	if (likely((nid) >= 0)) {
 		if (!node_online(nid)) {
-			if (register_one_node(nid)) {
+			if (register_node(nid)) {
 				pr_err("PCI: Failed to register node %d\n", nid);
 			} else {
 				update_numa_distance(dn);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c24890c40138..7a97327140df 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -262,7 +262,7 @@ void __init init_gi_nodes(void)
 	 * bringup_nonboot_cpus
 	 *  cpu_up
 	 *   __try_online_node
-	 *    register_one_node
+	 *    register_node
 	 * because node_subsys is not initialized yet.
 	 * TODO remove dependency on node_online
 	 */
@@ -303,7 +303,7 @@ void __init init_cpu_to_node(void)
 		 * bringup_nonboot_cpus
 		 *  cpu_up
 		 *   __try_online_node
-		 *    register_one_node
+		 *    register_node
 		 * because node_subsys is not initialized yet.
 		 * TODO remove dependency on node_online
 		 */
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 83aeb0518e1d..17d7b90403ff 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -676,33 +676,6 @@ static void node_device_release(struct device *dev)
 	kfree(to_node(dev));
 }
 
-/*
- * register_node - Setup a sysfs device for a node.
- * @num - Node number to use when creating the device.
- *
- * Initialize and register the node device.
- */
-static int register_node(struct node *node, int num)
-{
-	int error;
-
-	node->dev.id = num;
-	node->dev.bus = &node_subsys;
-	node->dev.release = node_device_release;
-	node->dev.groups = node_dev_groups;
-	error = device_register(&node->dev);
-
-	if (error) {
-		put_device(&node->dev);
-	} else {
-		hugetlb_register_node(node);
-		compaction_register_node(node);
-		reclaim_register_node(node);
-	}
-
-	return error;
-}
-
 /**
  * unregister_node - unregister a node device
  * @node: node going away
@@ -907,7 +880,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-int register_one_node(int nid)
+/**
+ * register_node - Initialize and register the node device.
+ * @nid: Node number to use when creating the device.
+ *
+ * Return: 0 on success, -errno otherwise
+ */
+int register_node(int nid)
 {
 	int error;
 	int cpu;
@@ -918,14 +897,23 @@ int register_one_node(int nid)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&node->access_list);
-	node_devices[nid] = node;
 
-	error = register_node(node_devices[nid], nid);
+	node->dev.id = nid;
+	node->dev.bus = &node_subsys;
+	node->dev.release = node_device_release;
+	node->dev.groups = node_dev_groups;
+
+	error = device_register(&node->dev);
 	if (error) {
-		node_devices[nid] = NULL;
+		put_device(&node->dev);
 		return error;
 	}
 
+	node_devices[nid] = node;
+	hugetlb_register_node(node);
+	compaction_register_node(node);
+	reclaim_register_node(node);
+
 	/* link cpu under this node */
 	for_each_present_cpu(cpu) {
 		if (cpu_to_node(cpu) == nid)
@@ -1018,7 +1006,7 @@ void __init node_dev_init(void)
 	 * to already created cpu devices.
 	 */
 	for_each_online_node(i) {
-		ret =  register_one_node(i);
+		ret =  register_node(i);
 		if (ret)
 			panic("%s() failed to add node: %d\n", __func__, ret);
 	}
diff --git a/include/linux/node.h b/include/linux/node.h
index 866e3323f1fd..b7028d3ec3b4 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
 #ifdef CONFIG_NUMA
 extern void node_dev_init(void);
 /* Core of the node registration - only memory hotplug should use this */
-extern int register_one_node(int nid);
+int register_node(int nid);
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
@@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 static inline void node_dev_init(void)
 {
 }
-static inline int register_one_node(int nid)
+static inline int register_node(int nid)
 {
 	return 0;
 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0be83039c3b5..6c050d867031 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1311,7 +1311,7 @@ static int __try_online_node(int nid, bool set_node_online)
 
 	if (set_node_online) {
 		node_set_online(nid);
-		ret = register_one_node(nid);
+		ret = register_node(nid);
 		BUG_ON(ret);
 	}
 out:
@@ -1542,7 +1542,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error_memblock_remove;
 	if (ret) {
 		node_set_online(nid);
-		ret = register_one_node(nid);
+		ret = register_node(nid);
 		if (WARN_ON(ret)) {
 			node_set_offline(nid);
 			goto error_memblock_remove;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7712d887b696..c6812b4dbb2e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 		free_area_init_node(nid);
 
 		/*
-		 * No sysfs hierarchy will be created via register_one_node()
+		 * No sysfs hierarchy will be created via register_node()
 		 *for memory-less node because here it's not marked as N_MEMORY
 		 *and won't be set online later. The benefit is userspace
 		 *program won't be confused by sysfs files/directories of
-- 
cgit v1.2.3


From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Tue, 14 Oct 2025 21:09:17 +0530
Subject: drivers/base/node: fold unregister_node() into unregister_one_node()

unregister_node() is only called from unregister_one_node().  This patch
folds unregister_node() into its only caller and renames
unregister_one_node() to unregister_node().

This reduces unnecessary indirection and simplifies the code structure.
No functional changes are introduced.

[donettom@linux.ibm.com: remove extra spaces before @nid and "All"]
  Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com
Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Aboorva Devarajan <aboorvad@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/node.c  | 38 +++++++++++++++++---------------------
 include/linux/node.h |  6 ++----
 mm/memory_hotplug.c  |  4 ++--
 3 files changed, 21 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 17d7b90403ff..00cf4532f121 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -676,23 +676,6 @@ static void node_device_release(struct device *dev)
 	kfree(to_node(dev));
 }
 
-/**
- * unregister_node - unregister a node device
- * @node: node going away
- *
- * Unregisters a node device @node.  All the devices on the node must be
- * unregistered before calling this function.
- */
-void unregister_node(struct node *node)
-{
-	hugetlb_unregister_node(node);
-	compaction_unregister_node(node);
-	reclaim_unregister_node(node);
-	node_remove_accesses(node);
-	node_remove_caches(node);
-	device_unregister(&node->dev);
-}
-
 struct node *node_devices[MAX_NUMNODES];
 
 /*
@@ -924,13 +907,26 @@ int register_node(int nid)
 
 	return error;
 }
-
-void unregister_one_node(int nid)
+/**
+ * unregister_node - unregister a node device
+ * @nid: nid of the node going away
+ *
+ * Unregisters the node device at node id @nid. All the devices on the
+ * node must be unregistered before calling this function.
+ */
+void unregister_node(int nid)
 {
-	if (!node_devices[nid])
+	struct node *node = node_devices[nid];
+
+	if (!node)
 		return;
 
-	unregister_node(node_devices[nid]);
+	hugetlb_unregister_node(node);
+	compaction_unregister_node(node);
+	reclaim_unregister_node(node);
+	node_remove_accesses(node);
+	node_remove_caches(node);
+	device_unregister(&node->dev);
 	node_devices[nid] = NULL;
 }
 
diff --git a/include/linux/node.h b/include/linux/node.h
index b7028d3ec3b4..0269b064ba65 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void)
 }
 #endif
 
-extern void unregister_node(struct node *node);
-
 struct node_notify {
 	int nid;
 };
@@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
 extern void node_dev_init(void);
 /* Core of the node registration - only memory hotplug should use this */
 int register_node(int nid);
-extern void unregister_one_node(int nid);
+void unregister_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
@@ -193,7 +191,7 @@ static inline int register_node(int nid)
 {
 	return 0;
 }
-static inline int unregister_one_node(int nid)
+static inline int unregister_node(int nid)
 {
 	return 0;
 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6c050d867031..94a8f6e8811a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1596,7 +1596,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 error:
 	if (new_node) {
 		node_set_offline(nid);
-		unregister_one_node(nid);
+		unregister_node(nid);
 	}
 error_memblock_remove:
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
@@ -2201,7 +2201,7 @@ void try_offline_node(int nid)
 	 * node now.
 	 */
 	node_set_offline(nid);
-	unregister_one_node(nid);
+	unregister_node(nid);
 }
 EXPORT_SYMBOL(try_offline_node);
 
-- 
cgit v1.2.3


From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001
From: Joshua Hahn <joshua.hahnjy@gmail.com>
Date: Tue, 14 Oct 2025 07:50:08 -0700
Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection

Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5.

Motivation & Approach
=====================

While testing workloads with high sustained memory pressure on large
machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly
high number of softlockups.  Further investigation showed that the zone
lock in free_pcppages_bulk was being held for a long time, and was called
to free 2k+ pages over 100 times just during boot.

This causes starvation in other processes for the zone lock, which can
lead to the system stalling as multiple threads cannot make progress
without the locks.  We can see these issues manifesting as warnings:

[ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU
[ 4512.604370] rcu:     20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426
[ 4512.626401] rcu:              hardirqs   softirqs   csw/system
[ 4512.638793] rcu:      number:        0        145            0
[ 4512.651177] rcu:     cputime:       30      10410          174   ==> 10558(ms)
[ 4512.666657] rcu:     (t=21077 jiffies g=783665 q=1242213 ncpus=316)

While these warnings don't indicate a crash or a kernel panic, they do
point to the underlying issue of lock contention.  To prevent starvation
in both locks, batch the freeing of pages using pcp->batch.

Because free_pcppages_bulk is called with the pcp lock and acquires the
zone lock, relinquishing and reacquiring the locks are only effective when
both of them are broken together (unless the system was built with queued
spinlocks).  Thus, instead of modifying free_pcppages_bulk to break both
locks, batch the freeing from its callers instead.

A similar fix has been implemented in the Meta fleet, and we have seen
significantly fewer softlockups.

Testing
=======
The following are a few synthetic benchmarks, made on three machines. The
first is a large machine with 754GiB memory and 316 processors.
The second is a relatively smaller machine with 251GiB memory and 176
processors. The third and final is the smallest of the three, which has 62GiB
memory and 36 processors.

On all machines, I kick off a kernel build with -j$(nproc).
Negative delta is better (faster compilation).

Large machine (754GiB memory, 316 processors)
make -j$(nproc)
+------------+---------------+-----------+
| Metric (s) | Variation (%) | Delta(%)  |
+------------+---------------+-----------+
| real       |        0.8070 |  - 1.4865 |
| user       |        0.2823 |  + 0.4081 |
| sys        |        5.0267 |  -11.8737 |
+------------+---------------+-----------+

Medium machine (251GiB memory, 176 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.2806 |  +0.0351 |
| user       |        0.0994 |  +0.3170 |
| sys        |        0.6229 |  -0.6277 |
+------------+---------------+----------+

Small machine (62GiB memory, 36 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.1503 |  -2.6585 |
| user       |        0.0431 |  -2.2984 |
| sys        |        0.1870 |  -3.2013 |
+------------+---------------+----------+

Here, variation is the coefficient of variation, i.e.  standard deviation
/ mean.

Based on these results, the degree to which this reduces lock contention
varies.  For the largest and smallest machines that I ran the tests on,
there seems to be a significant reduction.  There are also some
performance increases visible from userspace.

Interestingly, the performance gains don't scale with the size of the
machine; rather, the gain seems to dip for the medium-sized machine.  One
possible theory is that because the high watermark depends on both memory
and the number of local CPUs, what impacts zone contention the most is not
these individual values, but rather the ratio of mem:processors.


This patch (of 5):

Currently, refresh_cpu_vm_stats returns an int, indicating how many
changes were made during its updates.  Using this information, callers
like vmstat_update can heuristically determine if more work will be done
in the future.

However, all of refresh_cpu_vm_stats's callers either (a) ignore the
result, only caring about performing the updates, or (b) only care about
whether changes were made, but not *how many* changes were made.

Simplify the code by returning a bool instead to indicate if updates
were made.

In addition, simplify fold_diff and decay_pcp_high to return a bool
for the same reason.

Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com
Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Chris Mason <clm@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h |  2 +-
 mm/page_alloc.c     |  8 ++++----
 mm/vmstat.c         | 28 +++++++++++++++-------------
 3 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 623bee335383..b155929af5b1 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order);
 #define free_page(addr) free_pages((addr), 0)
 
 void page_alloc_init_cpuhp(void);
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 10a908793b4c..f057ce5ea7da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2557,10 +2557,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  * Called from the vmstat counter updater to decay the PCP high.
  * Return whether there are addition works to do.
  */
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	int high_min, to_drain, batch;
-	int todo = 0;
+	bool todo = false;
 
 	high_min = READ_ONCE(pcp->high_min);
 	batch = READ_ONCE(pcp->batch);
@@ -2573,7 +2573,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
 		pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
 				 pcp->high - (pcp->high >> 3), high_min);
 		if (pcp->high > high_min)
-			todo++;
+			todo = true;
 	}
 
 	to_drain = pcp->count - pcp->high;
@@ -2581,7 +2581,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
 		spin_lock(&pcp->lock);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
 		spin_unlock(&pcp->lock);
-		todo++;
+		todo = true;
 	}
 
 	return todo;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bb09c032eecf..98855f31294d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state);
 
 /*
  * Fold a differential into the global counters.
- * Returns the number of counters updated.
+ * Returns whether counters were updated.
  */
 static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
-	int changes = 0;
+	bool changed = false;
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 		if (zone_diff[i]) {
 			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
-			changes++;
+			changed = true;
 	}
 
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 		if (node_diff[i]) {
 			atomic_long_add(node_diff[i], &vm_node_stat[i]);
-			changes++;
+			changed = true;
 	}
-	return changes;
+	return changed;
 }
 
 /*
@@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int *node_diff)
  * with the global counters. These could cause remote node cache line
  * bouncing and will have to be only done when necessary.
  *
- * The function returns the number of global counters updated.
+ * The function returns whether global counters were updated.
  */
-static int refresh_cpu_vm_stats(bool do_pagesets)
+static bool refresh_cpu_vm_stats(bool do_pagesets)
 {
 	struct pglist_data *pgdat;
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
-	int changes = 0;
+	bool changed = false;
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
@@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		if (do_pagesets) {
 			cond_resched();
 
-			changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+			if (decay_pcp_high(zone, this_cpu_ptr(pcp)))
+				changed = true;
 #ifdef CONFIG_NUMA
 			/*
 			 * Deal with draining the remote pageset of this
@@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			}
 
 			if (__this_cpu_dec_return(pcp->expire)) {
-				changes++;
+				changed = true;
 				continue;
 			}
 
 			if (__this_cpu_read(pcp->count)) {
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
-				changes++;
+				changed = true;
 			}
 #endif
 		}
@@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 	}
 
-	changes += fold_diff(global_zone_diff, global_node_diff);
-	return changes;
+	if (fold_diff(global_zone_diff, global_node_diff))
+		changed = true;
+	return changed;
 }
 
 /*
-- 
cgit v1.2.3


From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 16 Oct 2025 09:10:35 -0700
Subject: memcg: net: track network throttling due to memcg memory pressure

The kernel can throttle network sockets if the memory cgroup associated
with the corresponding socket is under memory pressure.  The throttling
actions include clamping the transmit window, failing to expand receive or
send buffers, aggressively pruning the out-of-order receive queue,
deferring FIN to a retransmitted packet and more.  Let's add a memcg
metric to track such throttling actions.

At the moment, memcg memory pressure is defined through vmpressure; in the
future it may be defined using PSI, or we may add a more flexible way for
users to define memory pressure, maybe through eBPF.  However, the
potential throttling actions will remain the same, so this newly
introduced metric will continue to track throttling actions irrespective
of how memcg memory pressure is defined.

Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Daniel Sedlak <daniel.sedlak@cdn77.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 4 ++++
 include/linux/memcontrol.h              | 1 +
 include/net/sock.h                      | 6 +++++-
 kernel/cgroup/cgroup.c                  | 1 +
 mm/memcontrol.c                         | 3 +++
 5 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0e6c67ac585a..3345961c30ac 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1515,6 +1515,10 @@ The following nested keys are defined.
           oom_group_kill
                 The number of times a group OOM has occurred.
 
+          sock_throttled
+                The number of times network sockets associated with
+                this cgroup are throttled.
+
   memory.events.local
 	Similar to memory.events but the fields in the file are local
 	to the cgroup i.e. not hierarchical. The file modified event
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 873e510d6f8d..5fe254813123 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,6 +52,7 @@ enum memcg_memory_event {
 	MEMCG_SWAP_HIGH,
 	MEMCG_SWAP_MAX,
 	MEMCG_SWAP_FAIL,
+	MEMCG_SOCK_THROTTLED,
 	MEMCG_NR_MEMORY_EVENTS,
 };
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 60bcb13f045c..ff7d49af1619 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
 #endif /* CONFIG_MEMCG_V1 */
 
 	do {
-		if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
+		if (time_before64(get_jiffies_64(),
+				  mem_cgroup_get_socket_pressure(memcg))) {
+			memcg_memory_event(mem_cgroup_from_sk(sk),
+					   MEMCG_SOCK_THROTTLED);
 			return true;
+		}
 	} while ((memcg = parent_mem_cgroup(memcg)));
 
 	return false;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index fdee387f0d6b..8df671c59987 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4704,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 	}
 	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
 }
+EXPORT_SYMBOL_GPL(cgroup_file_notify);
 
 /**
  * cgroup_file_show - show or hide a hidden cgroup file
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3ae5cbcaed75..976412c8196e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL(root_mem_cgroup);
 
 /* Active memory cgroup to use from an interrupt context */
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -4463,6 +4464,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
 		   atomic_long_read(&events[MEMCG_OOM_KILL]));
 	seq_printf(m, "oom_group_kill %lu\n",
 		   atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
+	seq_printf(m, "sock_throttled %lu\n",
+		   atomic_long_read(&events[MEMCG_SOCK_THROTTLED]));
 }
 
 static int memory_events_show(struct seq_file *m, void *v)
-- 
cgit v1.2.3


From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Fri, 17 Oct 2025 15:53:07 +0800
Subject: mm: vmscan: simplify the logic for activating dirty file folios

After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer
attempt to write back filesystem folios through reclaim.

However, in the shrink_folio_list() function, there still remains some
logic related to writeback control of dirty file folios.  The original
logic was that, for direct reclaim, or when folio_test_reclaim() is false,
or the PGDAT_DIRTY flag is not set, the dirty file folios would be
directly activated to avoid being scanned again; otherwise, it would try to
write back the dirty file folios.  However, since we can no longer perform
writeback on dirty folios, the dirty file folios will still be activated.

Additionally, under the original logic, if we continue to try to write
back dirty file folios, we will also check the references flag,
sc->may_writepage, and may_enter_fs(), which may result in dirty file
folios being left in the inactive list.  This is unreasonable.  Even if
these dirty folios are scanned again, we still cannot clean them.

Therefore, the checks on these dirty file folios appear to be redundant
and can be removed.  Dirty file folios should be directly moved to the
active list to avoid being scanned again.  Since we set the PG_reclaim
flag for the dirty folios, once the writeback is completed, they will be
moved back to the tail of the inactive list to be retried for quick
reclaim.

Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  4 ----
 mm/vmscan.c            | 25 +++----------------------
 2 files changed, 3 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7fb7331c5725..4398e027f450 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1060,10 +1060,6 @@ struct zone {
 } ____cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
-	PGDAT_DIRTY,			/* reclaim scanning has recently found
-					 * many dirty file pages at the tail
-					 * of the LRU.
-					 */
 	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
 					 */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e53ac12cc802..ecc90517b791 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1409,21 +1409,7 @@ retry:
 
 		mapping = folio_mapping(folio);
 		if (folio_test_dirty(folio)) {
-			/*
-			 * Only kswapd can writeback filesystem folios
-			 * to avoid risk of stack overflow. But avoid
-			 * injecting inefficient single-folio I/O into
-			 * flusher writeback as much as possible: only
-			 * write folios when we've encountered many
-			 * dirty folios, and when we've already scanned
-			 * the rest of the LRU for clean folios and see
-			 * the same dirty folios again (with the reclaim
-			 * flag set).
-			 */
-			if (folio_is_file_lru(folio) &&
-			    (!current_is_kswapd() ||
-			     !folio_test_reclaim(folio) ||
-			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+			if (folio_is_file_lru(folio)) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principle to folio_deactivate()
@@ -1432,7 +1418,8 @@ retry:
 				 */
 				node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
 						nr_pages);
-				folio_set_reclaim(folio);
+				if (!folio_test_reclaim(folio))
+					folio_set_reclaim(folio);
 
 				goto activate_locked;
 			}
@@ -6127,11 +6114,6 @@ again:
 		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
 			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
-		/* Allow kswapd to start writing pages during reclaim.*/
-		if (sc->nr.unqueued_dirty &&
-			sc->nr.unqueued_dirty == sc->nr.file_taken)
-			set_bit(PGDAT_DIRTY, &pgdat->flags);
-
 		/*
 		 * If kswapd scans pages marked for immediate
 		 * reclaim and under writeback (nr_immediate), it
@@ -6872,7 +6854,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
 
 	clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
 	clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
-	clear_bit(PGDAT_DIRTY, &pgdat->flags);
 	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
 }
 
-- 
cgit v1.2.3


From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 17 Oct 2025 14:26:53 -0700
Subject: mm/damon: document damos_quota_goal->nid use case

Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node
memory usage".

Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup
per-NUMA node memory utilization.  Expected use cases are cgroup level
access-aware NUMA memory managements, such as memory tiering or proactive
reclamation on cgroup-based multi-tenant NUMA systems.

Background
==========

The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly
recommended way for modern DAMOS use cases.  Using it, users can specify
what system status they want to achieve with what access-aware system
operations.  For example, reclaim cold memory aiming for 0.5 percent of
memory pressure (proactive reclaim), or migrate hot and cold memory
between NUMA nodes having different speed (memory tiering).  Then DAMOS
automatically adjusts the aggressiveness of the system operation (e.g.,
increase/decrease reclaim target coldness threshold) based on current
status of the system.

The use case is limited by the supported system status metrics for
specifying the target system status.  Two new system metrics for per-node
memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were
recently added to extend the use cases for access-aware NUMA nodes
management, such as memory tiering.  Those are expected to be useful for
not only memory tiering but also general access-aware inter-NUMA node page
migration, though.

Limitation
----------

The per-node memory usage based auto-tuning can be applied only
system-wide.  For cgroups-based multi-tenant systems, it could arguably
harm fairness.  For example, a cgroup may use faster NUMA node memory
more than other cgroups, depending on their access patterns.  If the users
of each cgroup are promised the same quality and amount of system
resources, this can arguably be an unfair situation.

DAMOS supports cgroup level system operations via DAMOS filter.  But the
quota auto-tuning system is not aware of cgroups.

New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage
===================================================================

To overcome the limitation, introduce two new DAMOS quota auto-tuning goal
metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP.  Those can be
thought of as variants of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that are
extended for cgroups.

The two metrics specify the per-cgroup, per-node amount of used and unused
memory as a ratio to the total memory of the node.  For example, let's
assume a system has two NUMA nodes of size 100 GiB and 50 GiB, and two
cgroups are using 40 GiB and 60 GiB of node 0, and 20 GiB and 10 GiB of
node 1, respectively, as illustrated by the table below.

                     node-0    node-1
    Total memory     100 GiB   50 GiB
    Cgroup A usage   40 GiB    20 GiB
    Cgroup B usage   60 GiB    10 GiB

Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node
are 40 GiB / 100 GiB = 4000 bp (40 percent) and 60 GiB / 100 GiB = 6000
bp (60 percent), respectively.  Those for the second node are 20 GiB / 50
GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent),
respectively.

DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are 60 GiB / 100 GiB =
6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB
/ 50 GiB = 8000 bp, respectively.

    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp

    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp

Using these, users can specify the per-cgroup, per-node amount of [un]used
memory that DAMOS should achieve as a result of the auto-tuning.

Example Usecase: Cgroup Level Memory Tiering
============================================

Let's suppose a typical and simple tiered memory system.  The system
has two NUMA nodes.  The first node (node 0) is CPU-attached and fast.
The second node (node 1) is CPU-unattached and slow.  It runs two cgroups
that desire to use about 30 percent and 70 percent of the faster node as
much as possible for their hot data, respectively.  Then, the user can
implement DAMOS-based memory tiering for the system using the DAMON
user-space tool (damo), like below.

    # ./damo start \
    	`# kdamond for node 1 (slow)` \
        --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \
	    `# promotion scheme for cgroup a` \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/a \
            --damos_filter allow young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \
	    \
	    `# promotion scheme for cgroup b` \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/b \
            --damos_filter allow young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_used_bp 69.7% 0 /workloads/b \
	    \
    	`# kdamond for node 0 (fast)` \
        --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \
            `# demotion scheme for cgroup a` \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/a \
            --damos_filter reject young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_free_bp 70.5% 0 /workloads/a \
	    \
            `# demotion scheme for cgroup b` \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/b \
            --damos_filter reject young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_free_bp 30.5% 0 /workloads/b \
	    \
            --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \
        --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1

With the command, the user-space tool will ask DAMON to spawn two kernel
threads, each for monitoring accesses to node 1 (slow) and node 0 (fast),
respectively.  It installs two DAMOS schemes on each thread.  Let's call
them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup
a/b" in the order.  The promotion schemes are installed on the DAMON
thread for node 1 (slow), and demotion schemes are installed on the DAMON
thread for node 0 (fast).

Cgroup Level Hot Pages Migration (Promotion)
--------------------------------------------

Promotion schemes will find memory regions on node 1 (slow) in which some
access was detected.  The schemes will then migrate the found memory to
node 0 (fast), hottest pages first.

For accurate and effective migration, these schemes use two page level
filters.  First, the migration will be filtered for only cgroup A and
cgroup B.  That is, "promotion scheme for cgroup B" will not do the
migration if the page is for cgroup A.  Secondly, the schemes will ignore
pages that have their page table's Accessed bits unset.  The per-page
Accessed bit check logic will also unset the bit if it was set, for the
next check.

For controlled amounts of system resource consumption and aiming on the
target memory usage, the schemes use quotas setup.  The migration is
limited to be done only up to 200 MiB per second, to limit the peak system
resource usage.  And DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for
29.7% and 69.7% of node 0 (fast), respectively.  The target value is lower
than the high level goal (30% and 70% system memory), to give headroom on
node 0 (fast).  DAMOS will adjust the speed of the pages migration based
on the target and current per-cgroup node 0 memory usage.  For example, if
cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more
of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second.  If
cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages
migration from node 1 to node 0 will be slowed and eventually stopped.

Cgroup Level Cold Pages Migration (Demotion)
--------------------------------------------

Demotion schemes are similar to promotion schemes, but differ in filtering
setup and quota tuning setup.  Those filter out pages having their page
table Accessed bits set.  And set 70.5% and 30.5% of node 0 memory free
rate for the cgroup A and B, respectively.  Hence, if promotion schemes or
something make cgroup A and/or B use more than 29.5% and 69.5% of node 0,
demotion schemes will start migrating cold pages of appropriate cgroups in
node 0 to node 1, under the 200 MiB per second speed cap, while adjusting
the speed based on how much more than wanted memory is being used.

The quota target values are set to overlap with promotion targets, to keep
a minimum level of page exchanges between the nodes.  This is to avoid a
case that the target memory utilization is met, and then access pattern
changes (pages in node 1 become hotter than pages in node 0) while the
memory utilization is unchanged.  Without the overlap, neither promotion
of hotter pages in node 1, nor demotion of colder pages in node 0 will
happen since both goals are met.  As a result, the faster and slower node
will unexpectedly serve cold and hot data.

Test: Per-cgroup Memory Tiering
===============================

I ran a simplified cgroup level memory tiering using the feature, and
confirmed it works as intended.

Setup
-----

I configured a QEMU virtual machine representing a simplified version of
the system described in the above cgroup level memory tiering example
use case.  The system equips 40 CPU cores and two NUMA nodes each having
30 GiB physical memory.  The first node (node 0) represents the faster
NUMA node, and the second node (node 1) represents the slower NUMA node.
In specific, below qemu command line options are used.

    [...]
    -object memory-backend-ram,size=30G,id=m0 \
    -object memory-backend-ram,size=30G,id=m1 \
    -numa node,cpus=0-39,memdev=m0 \
    -numa node,memdev=m1 \
    [...]

I booted the virtual machine with a kernel to which this patch series is
applied.  On the virtual machine, I created two cgroups, namely workload_a
and workload_b.  And ran a test program in each cgroup, resulting in one
process per cgroup.  The test program allocates 10 GiB memory and evenly
splits it into 10 regions.  After the allocation, it repeatedly accesses the
first region for one minute, then the second one for one minute, and so
on.  After the one minute repeated access for the 10-th region is done, it
repeats the access from the first region.  So the process has 10 GiB of
data in total, but only 1 GiB of it is hot at a given moment, and the hot
data is gradually changed.

While the processes are running, run DAMON for a simple access-aware
memory tiering using below script.  It migrates hot and cold data of the
cgroups into node 0 and node 1, aiming the first and the second cgroups
(workload_a and workload_b, respectively) utilizing about 9.7 percent and
19.7 percent of node 0, respectively.

Note that this setup is a simplified version of the above example use
case, for ease of test.  Also note that we assigned 30 GiB physical memory
to node 0, but DAMON in this setup works for only 27 GiB of the memory.
It is due to an internal implementation detail of DAMON user-space tool
that is not really important for this test.

    #!/bin/bash
    damo start \
        --numa_node 1 \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_a \
                --damos_filter allow young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_b \
                --damos_filter allow young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \
        --numa_node 0 \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_a \
                --damos_filter reject young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_b \
                --damos_filter reject young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \
                --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \
        --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1

After starting DAMON, the pages are continuously migrated across nodes.  A
few minutes later, the memory usage of the cgroups converges into the
aimed amounts, and keeps the level, as expected.  To confirm the status is
kept in the target level as expected, I collected the memory usage stat of
the cgroups using memory.numa_stat file, after the stats are converged.  I
repeat the stat collection 42 times with 5 seconds delay between each of
the collections.  The results are as below:

    node0_memory_usage  average  stdev
    workload_a          2.79GiB  522.06MiB
    workload_b          5.15GiB  739.10MiB

The average values are quite close to the targeted values: 27 GiB * 9.7% =
2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB for workload_b.
Some level of variance is expected, given the overlap of the
promotion/demotion targets and the dynamic data access pattern of the
workloads.  Given that, the measured variances are at a reasonable level.

Patches Sequence
================

The first patch (patch 1) updates the kernel-doc comment of
damos_quota_goal struct to clarify usage of optional fields of the struct,
since later patches will add such optional fields.

Following four patches (patches 2-5) implement a new DAMOS quota goal
metric for per-cgroup per-node memory usage.  Those extend the core layer
interface for the new metric (patch 2), implement the metric value
calculation on the core layer (patch 3), add DAMON sysfs interface file
for the target cgroup specification (patch 4), and implement support of
the new metric on DAMON sysfs interface (patch 5).

Next two patches implement the second new DAMOS quota goal metric for
per-cgroup per-node free (or, unused) memory.  Those implement it in the
core layer (patch 6) and DAMON sysfs interface (patch 7), extending the
existing implementation for memory usage metric.

Final three patches update the design (patch 8), the usage (patch 9), and
the ABI (patch 10) documents for the changes that are introduced by this
patch series.


This patch (of 10):

damos_quota_goal kerneldoc comment is not explaining when @nid is used.
Update the comment for that.

Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index cae8c613c5fc..dc9c310e0e75 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -176,6 +176,9 @@ enum damos_quota_goal_metric {
  * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually
  * entered by the user, probably inside the kdamond callbacks.  Otherwise,
  * DAMON sets @current_value with self-measured value of @metric.
+ *
+ * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node
+ * id of the target node to account the used/free memory.
  */
 struct damos_quota_goal {
 	enum damos_quota_goal_metric metric;
-- 
cgit v1.2.3


From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 17 Oct 2025 14:26:54 -0700
Subject: mm/damon: add DAMOS quota goal type for per-memcg per-node memory
 usage

Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node
memory usage.  For specifying the cgroup of the interest, add a field,
namely memcg_id, to damos_quota_goal struct.

Note that this commit is only implementing the interface.  The handling of
the interface (the metric value calculation) will be implemented in the
following commit.

Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index dc9c310e0e75..0d63ceb7e6ef 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -147,6 +147,7 @@ enum damos_action {
  * @DAMOS_QUOTA_SOME_MEM_PSI_US:	System level some memory PSI in us.
  * @DAMOS_QUOTA_NODE_MEM_USED_BP:	MemUsed ratio of a node.
  * @DAMOS_QUOTA_NODE_MEM_FREE_BP:	MemFree ratio of a node.
+ * @DAMOS_QUOTA_NODE_MEMCG_USED_BP:	MemUsed ratio of a node for a cgroup.
  * @NR_DAMOS_QUOTA_GOAL_METRICS:	Number of DAMOS quota goal metrics.
  *
  * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -156,6 +157,7 @@ enum damos_quota_goal_metric {
 	DAMOS_QUOTA_SOME_MEM_PSI_US,
 	DAMOS_QUOTA_NODE_MEM_USED_BP,
 	DAMOS_QUOTA_NODE_MEM_FREE_BP,
+	DAMOS_QUOTA_NODE_MEMCG_USED_BP,
 	NR_DAMOS_QUOTA_GOAL_METRICS,
 };
 
@@ -166,6 +168,7 @@ enum damos_quota_goal_metric {
  * @current_value:	Current value of @metric.
  * @last_psi_total:	Last measured total PSI
  * @nid:		Node id.
+ * @memcg_id:		Memcg id.
  * @list:		List head for siblings.
  *
  * Data structure for getting the current score of the quota tuning goal.  The
@@ -179,6 +182,9 @@ enum damos_quota_goal_metric {
  *
  * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node
  * id of the target node to account the used/free memory.
+ *
+ * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents
+ * the node id and the cgroup to account the used memory for.
  */
 struct damos_quota_goal {
 	enum damos_quota_goal_metric metric;
@@ -187,7 +193,10 @@ struct damos_quota_goal {
 	/* metric-dependent fields */
 	union {
 		u64 last_psi_total;
-		int nid;
+		struct {
+			int nid;
+			unsigned short memcg_id;
+		};
 	};
 	struct list_head list;
 };
-- 
cgit v1.2.3


From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 17 Oct 2025 14:26:58 -0700
Subject: mm/damon/core: add DAMOS quota goal metric for per-memcg per-numa
 free memory

Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory
portion.  The value of the metric is implemented as the entire memory of
the given NUMA node subtracted by the given cgroup's usage.  So from a
perspective, "unused" could be a better term than "free".  But arguably it
is not very clear what is better, so use the term "free".

Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  6 ++++--
 mm/damon/core.c       | 10 ++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 0d63ceb7e6ef..0edf41d36ea1 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -148,6 +148,7 @@ enum damos_action {
  * @DAMOS_QUOTA_NODE_MEM_USED_BP:	MemUsed ratio of a node.
  * @DAMOS_QUOTA_NODE_MEM_FREE_BP:	MemFree ratio of a node.
  * @DAMOS_QUOTA_NODE_MEMCG_USED_BP:	MemUsed ratio of a node for a cgroup.
+ * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP:	MemFree ratio of a node for a cgroup.
  * @NR_DAMOS_QUOTA_GOAL_METRICS:	Number of DAMOS quota goal metrics.
  *
  * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -158,6 +159,7 @@ enum damos_quota_goal_metric {
 	DAMOS_QUOTA_NODE_MEM_USED_BP,
 	DAMOS_QUOTA_NODE_MEM_FREE_BP,
 	DAMOS_QUOTA_NODE_MEMCG_USED_BP,
+	DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
 	NR_DAMOS_QUOTA_GOAL_METRICS,
 };
 
@@ -183,8 +185,8 @@ enum damos_quota_goal_metric {
  * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node
  * id of the target node to account the used/free memory.
  *
- * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents
- * the node id and the cgroup to account the used memory for.
+ * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id
+ * represents the node id and the cgroup to account the used memory for.
  */
 struct damos_quota_goal {
 	enum damos_quota_goal_metric metric;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8aa8d269df90..a9c11d2d37b0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -790,6 +790,7 @@ static void damos_commit_quota_goal_union(
 		dst->nid = src->nid;
 		break;
 	case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+	case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
 		dst->nid = src->nid;
 		dst->memcg_id = src->memcg_id;
 		break;
@@ -2046,7 +2047,7 @@ static unsigned long damos_get_node_memcg_used_bp(
 {
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
-	unsigned long used_pages;
+	unsigned long used_pages, numerator;
 	struct sysinfo i;
 
 	rcu_read_lock();
@@ -2066,7 +2067,11 @@ static unsigned long damos_get_node_memcg_used_bp(
 	used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE);
 
 	si_meminfo_node(&i, goal->nid);
-	return used_pages * 10000 / i.totalram;
+	if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
+		numerator = used_pages;
+	else	/* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
+		numerator = i.totalram - used_pages;
+	return numerator * 10000 / i.totalram;
 }
 #else
 static __kernel_ulong_t damos_get_node_mem_bp(
@@ -2101,6 +2106,7 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
 		goal->current_value = damos_get_node_mem_bp(goal);
 		break;
 	case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+	case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
 		goal->current_value = damos_get_node_memcg_used_bp(goal);
 		break;
 	default:
-- 
cgit v1.2.3


From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001
From: Quanmin Yan <yanquanmin1@huawei.com>
Date: Mon, 20 Oct 2025 21:01:24 +0800
Subject: mm/damon: add a min_sz_region parameter to
 damon_set_region_biggest_system_ram_default()

Patch series "mm/damon: fixes for address alignment issues in
DAMON_LRU_SORT and DAMON_RECLAIM", v2.

In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply
DAMON_MIN_REGION as the core address alignment, and the monitoring target
address ranges would be aligned on DAMON_MIN_REGION * addr_unit.  When
users 1) set addr_unit to a value larger than 1, and 2) set the monitoring
target address range as not aligned on DAMON_MIN_REGION * addr_unit, it
will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly
large physical address ranges.

For example, if the user sets the monitoring target address range to [4,
8) and addr_unit as 1024, the aimed monitoring target address range is [4
KiB, 8 KiB).  Assuming DAMON_MIN_REGION is 4096, so resulting target
address range will be [0, 4096) in the DAMON core layer address system,
and [0, 4 MiB) in the physical address space, which is an unexpected
range.

To fix the issue, add a min_sz_region parameter to
damon_set_region_biggest_system_ram_default() and use it when calling
damon_set_regions(), replacing the direct use of DAMON_MIN_REGION.


This patch (of 2):

In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the
core address alignment, and the monitoring target address ranges would be
aligned on DAMON_MIN_REGION * addr_unit.  When users 1) set addr_unit to a
value larger than 1, and 2) set the monitoring target address range as not
aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to
operate on unexpectedly large physical address ranges.

For example, if the user sets the monitoring target address range to [4,
8) and addr_unit as 1024, the aimed monitoring target address range is [4
KiB, 8 KiB).  Assuming DAMON_MIN_REGION is 4096, so resulting target
address range will be [0, 4096) in the DAMON core layer address system,
and [0, 4 MiB) in the physical address space, which is an unexpected
range.

To fix the issue, add a min_sz_region parameter to
damon_set_region_biggest_system_ram_default() and use it when calling
damon_set_regions(), replacing the direct use of DAMON_MIN_REGION.

Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com
Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com
Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 ++-
 mm/damon/core.c       | 6 ++++--
 mm/damon/lru_sort.c   | 3 ++-
 mm/damon/reclaim.c    | 3 ++-
 mm/damon/stat.c       | 3 ++-
 5 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 0edf41d36ea1..9ee026c2db53 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
 int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
 
 int damon_set_region_biggest_system_ram_default(struct damon_target *t,
-				unsigned long *start, unsigned long *end);
+				unsigned long *start, unsigned long *end,
+				unsigned long min_sz_region);
 
 #endif	/* CONFIG_DAMON */
 
diff --git a/mm/damon/core.c b/mm/damon/core.c
index a9c11d2d37b0..82546d138a5a 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2818,6 +2818,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
  * @t:		The monitoring target to set the region.
  * @start:	The pointer to the start address of the region.
  * @end:	The pointer to the end address of the region.
+ * @min_sz_region:	Minimum region size.
  *
  * This function sets the region of @t as requested by @start and @end.  If the
  * values of @start and @end are zero, however, this function finds the biggest
@@ -2828,7 +2829,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
  * Return: 0 on success, negative error code otherwise.
  */
 int damon_set_region_biggest_system_ram_default(struct damon_target *t,
-			unsigned long *start, unsigned long *end)
+			unsigned long *start, unsigned long *end,
+			unsigned long min_sz_region)
 {
 	struct damon_addr_range addr_range;
 
@@ -2841,7 +2843,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
 
 	addr_range.start = *start;
 	addr_range.end = *end;
-	return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
+	return damon_set_regions(t, &addr_range, 1, min_sz_region);
 }
 
 /*
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 42b9a656f9de..49b4bc294f4e 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void)
 
 	err = damon_set_region_biggest_system_ram_default(param_target,
 					&monitor_region_start,
-					&monitor_region_end);
+					&monitor_region_end,
+					param_ctx->min_sz_region);
 	if (err)
 		goto out;
 	err = damon_commit_ctx(ctx, param_ctx);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 7ba3d0f9a19a..e30811cafe90 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void)
 
 	err = damon_set_region_biggest_system_ram_default(param_target,
 					&monitor_region_start,
-					&monitor_region_end);
+					&monitor_region_end,
+					DAMON_MIN_REGION);
 	if (err)
 		goto out;
 	err = damon_commit_ctx(ctx, param_ctx);
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index bf8626859902..ed8e3629d31a 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
 	if (!target)
 		goto free_out;
 	damon_add_target(ctx, target);
-	if (damon_set_region_biggest_system_ram_default(target, &start, &end))
+	if (damon_set_region_biggest_system_ram_default(target, &start, &end,
+							ctx->min_sz_region))
 		goto free_out;
 	return ctx;
 free_out:
-- 
cgit v1.2.3


From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:21 +0100
Subject: mm: add vma_desc_size(), vma_desc_pages() helpers

It's useful to be able to determine the size of a VMA descriptor range
used on f_op->mmap_prepare, expressed both in bytes and pages, so add
helpers for both and update code that could make use of it to do so.

Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ntfs3/file.c    |  2 +-
 include/linux/mm.h | 10 ++++++++++
 mm/secretmem.c     |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 4c90ec2fa2ea..2f344e1ed756 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -332,7 +332,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc)
 
 	if (rw) {
 		u64 to = min_t(loff_t, i_size_read(inode),
-			       from + desc->end - desc->start);
+			       from + vma_desc_size(desc));
 
 		if (is_sparsed(ni)) {
 			/* Allocate clusters for rw map. */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..5752b0c516f2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
+{
+	return desc->end - desc->start;
+}
+
+static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
+{
+	return vma_desc_size(desc) >> PAGE_SHIFT;
+}
+
 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
 				unsigned long vm_start, unsigned long vm_end)
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 9b0f5d9ec6f4..37f6d1097853 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file)
 
 static int secretmem_mmap_prepare(struct vm_area_desc *desc)
 {
-	const unsigned long len = desc->end - desc->start;
+	const unsigned long len = vma_desc_size(desc);
 
 	if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
 		return -EINVAL;
-- 
cgit v1.2.3


From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:24 +0100
Subject: mm: add remap_pfn_range_prepare(), remap_pfn_range_complete()

We need the ability to split PFN remap between updating the VMA and
performing the actual remap, in order to do away with the legacy f_op->mmap
hook.

To do so, update the PFN remap code to provide shared logic, and also make
remap_pfn_range_notrack() static, as its one user, io_mapping_map_user()
was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c").

Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor
and PFN parameters, and remap_pfn_range_complete() which accepts the same
parameters as remap_pfn_range().

remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so
it must be supplied with a correct PFN to do so.

While we're here, also clean up the duplicated #ifdef
__HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block.

We keep these internal to mm as they should only be used by internal
helpers.

Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murohy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  22 +++++++--
 mm/internal.h      |   4 ++
 mm/memory.c        | 132 +++++++++++++++++++++++++++++++++++------------------
 3 files changed, 110 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5752b0c516f2..ca5565f4fac4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp);
  */
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
+/*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ *   VM_IO tells people not to look at these pages
+ *	(accesses can have side effects).
+ *   VM_PFNMAP tells the core MM that the base pages are just
+ *	raw PFN mappings, and do not have a "struct page" associated
+ *	with them.
+ *   VM_DONTEXPAND
+ *      Disable vma merging and expanding with mremap().
+ *   VM_DONTDUMP
+ *      Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+
 /* This mask prevents VMA from being scanned with khugepaged */
 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
 
@@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 
 struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
 		unsigned long addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
-			unsigned long pfn, unsigned long size, pgprot_t);
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t prot);
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
 			struct page **pages, unsigned long *num);
diff --git a/mm/internal.h b/mm/internal.h
index 56a9a714709a..5ca1e7842b19 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1677,4 +1677,8 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
 void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
 int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
 
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn);
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index f13b20b702f6..8e02b8d75535 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2900,6 +2900,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
+static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
+		unsigned long end, unsigned long vm_start, unsigned long vm_end,
+		unsigned long pfn, pgoff_t *vm_pgoff_p)
+{
+	/*
+	 * There's a horrible special case to handle copy-on-write
+	 * behaviour that some programs depend on. We mark the "original"
+	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+	 * See vm_normal_page() for details.
+	 */
+	if (is_cow_mapping(vm_flags)) {
+		if (addr != vm_start || end != vm_end)
+			return -EINVAL;
+		*vm_pgoff_p = pfn;
+	}
+
+	return 0;
+}
+
 static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
 		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
@@ -2912,31 +2931,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
 	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
 		return -EINVAL;
 
-	/*
-	 * Physically remapped pages are special. Tell the
-	 * rest of the world about it:
-	 *   VM_IO tells people not to look at these pages
-	 *	(accesses can have side effects).
-	 *   VM_PFNMAP tells the core MM that the base pages are just
-	 *	raw PFN mappings, and do not have a "struct page" associated
-	 *	with them.
-	 *   VM_DONTEXPAND
-	 *      Disable vma merging and expanding with mremap().
-	 *   VM_DONTDUMP
-	 *      Omit vma from core dump, even when VM_IO turned off.
-	 *
-	 * There's a horrible special case to handle copy-on-write
-	 * behaviour that some programs depend on. We mark the "original"
-	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
-	 * See vm_normal_page() for details.
-	 */
-	if (is_cow_mapping(vma->vm_flags)) {
-		if (addr != vma->vm_start || end != vma->vm_end)
-			return -EINVAL;
-		vma->vm_pgoff = pfn;
-	}
-
-	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+	VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -2957,7 +2952,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
  * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
  * must have pre-validated the caching bits of the pgprot_t.
  */
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
 		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
@@ -3002,23 +2997,9 @@ void pfnmap_track_ctx_release(struct kref *ref)
 	pfnmap_untrack(ctx->pfn, ctx->size);
 	kfree(ctx);
 }
-#endif /* __HAVE_PFNMAP_TRACKING */
 
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-#ifdef __HAVE_PFNMAP_TRACKING
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-		    unsigned long pfn, unsigned long size, pgprot_t prot)
+static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	struct pfnmap_track_ctx *ctx = NULL;
 	int err;
@@ -3054,15 +3035,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	return err;
 }
 
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return remap_pfn_range_track(vma, addr, pfn, size, prot);
+}
 #else
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-		    unsigned long pfn, unsigned long size, pgprot_t prot)
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
 }
 #endif
+
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+	/*
+	 * We set addr=VMA start, end=VMA end here, so this won't fail, but we
+	 * check it again on complete and will fail there if specified addr is
+	 * invalid.
+	 */
+	get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
+			desc->start, desc->end, pfn, &desc->pgoff);
+	desc->vm_flags |= VM_REMAP_FLAGS;
+}
+
+static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size)
+{
+	unsigned long end = addr + PAGE_ALIGN(size);
+	int err;
+
+	err = get_remap_pgoff(vma->vm_flags, addr, end,
+			      vma->vm_start, vma->vm_end,
+			      pfn, &vma->vm_pgoff);
+	if (err)
+		return err;
+
+	vm_flags_set(vma, VM_REMAP_FLAGS);
+	return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	int err;
+
+	err = remap_pfn_range_prepare_vma(vma, addr, pfn, size);
+	if (err)
+		return err;
+
+	return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
 EXPORT_SYMBOL(remap_pfn_range);
 
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
 /**
  * vm_iomap_memory - remap memory to userspace
  * @vma: user vma to map to
-- 
cgit v1.2.3


From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:25 +0100
Subject: mm: abstract io_remap_pfn_range() based on PFN

The only instances in which we customise this function are ones in which we
customise the PFN used.

Instances where architectures were not passing the pgprot value through
pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so
we can simply always pass pgprot through this function.

Use this fact to simplify the use of io_remap_pfn_range(), by abstracting
the PFN via io_remap_pfn_range_pfn() and using this instead of providing a
general io_remap_pfn_range() function per-architecture.

Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/csky/include/asm/pgtable.h     |  3 ---
 arch/mips/alchemy/common/setup.c    |  9 +++++----
 arch/mips/include/asm/pgtable.h     |  5 ++---
 arch/sparc/include/asm/pgtable_32.h | 12 ++++--------
 arch/sparc/include/asm/pgtable_64.h | 12 ++++--------
 include/linux/mm.h                  | 19 ++++++++++++++-----
 6 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 5a394be09c35..d606afbabce1 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
 #define update_mmu_cache(vma, addr, ptep) \
 	update_mmu_cache_range(NULL, vma, addr, ptep, 1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #endif /* __ASM_CSKY_PGTABLE_H */
diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c
index a7a6d31a7a41..c35b4f809d51 100644
--- a/arch/mips/alchemy/common/setup.c
+++ b/arch/mips/alchemy/common/setup.c
@@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size)
 	return phys_addr;
 }
 
-int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr,
-		unsigned long pfn, unsigned long size, pgprot_t prot)
+unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+		unsigned long size)
 {
 	phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size);
 
-	return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot);
+	return phys_addr >> PAGE_SHIFT;
 }
-EXPORT_SYMBOL(io_remap_pfn_range);
+EXPORT_SYMBOL(io_remap_pfn_range_pfn);
+
 #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index ae73ecf4c41a..9c06a612d33a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
  */
 #ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR
 phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size);
-int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr,
-		unsigned long pfn, unsigned long size, pgprot_t prot);
-#define io_remap_pfn_range io_remap_pfn_range
+unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size);
+#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
 #else
 #define fixup_bigphys_addr(addr, size)	(addr)
 #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index f1538a48484a..a9f802d1dd64 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -395,12 +395,8 @@ __get_iospace (unsigned long addr)
 #define GET_IOSPACE(pfn)		(pfn >> (BITS_PER_LONG - 4))
 #define GET_PFN(pfn)			(pfn & 0x0fffffffUL)
 
-int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long,
-		    unsigned long, pgprot_t);
-
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
-				     unsigned long from, unsigned long pfn,
-				     unsigned long size, pgprot_t prot)
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+		unsigned long size)
 {
 	unsigned long long offset, space, phys_base;
 
@@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 	space = GET_IOSPACE(pfn);
 	phys_base = offset | (space << 32ULL);
 
-	return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
+	return phys_base >> PAGE_SHIFT;
 }
-#define io_remap_pfn_range io_remap_pfn_range
+#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
 
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 64b85ff9c766..615f460c50af 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr);
 #define GET_IOSPACE(pfn)		(pfn >> (BITS_PER_LONG - 4))
 #define GET_PFN(pfn)			(pfn & 0x0fffffffffffffffUL)
 
-int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long,
-		    unsigned long, pgprot_t);
-
 void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pte_t pte);
 
@@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm,
 	return 0;
 }
 
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
-				     unsigned long from, unsigned long pfn,
-				     unsigned long size, pgprot_t prot)
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+		unsigned long size)
 {
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 	int space = GET_IOSPACE(pfn);
@@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 
 	phys_base = offset | (((unsigned long) space) << 32UL);
 
-	return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
+	return phys_base >> PAGE_SHIFT;
 }
-#define io_remap_pfn_range io_remap_pfn_range
+#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
 
 static inline unsigned long __untagged_addr(unsigned long start)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca5565f4fac4..4441ceec913f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
 	return VM_FAULT_NOPAGE;
 }
 
-#ifndef io_remap_pfn_range
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
-				     unsigned long addr, unsigned long pfn,
-				     unsigned long size, pgprot_t prot)
+#ifndef io_remap_pfn_range_pfn
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+		unsigned long size)
 {
-	return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
+	return pfn;
 }
 #endif
 
+static inline int io_remap_pfn_range(struct vm_area_struct *vma,
+				     unsigned long addr, unsigned long orig_pfn,
+				     unsigned long size, pgprot_t orig_prot)
+{
+	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+	const pgprot_t prot = pgprot_decrypted(orig_prot);
+
+	return remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
 static inline vm_fault_t vmf_error(int err)
 {
 	if (err == -ENOMEM)
-- 
cgit v1.2.3


From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:27 +0100
Subject: mm: add ability to take further action in vm_area_desc

Some drivers/filesystems need to perform additional tasks after the VMA is
set up.  This is typically in the form of pre-population.

The forms of pre-population most likely to be performed are a PFN remap
or the insertion of normal folios and PFNs into a mixed map.

We start by implementing the PFN remap functionality, ensuring that we
perform the appropriate actions at the appropriate time - that is setting
flags at the point of .mmap_prepare, and performing the actual remap at the
point at which the VMA is fully established.

This prevents the driver from doing anything too crazy with a VMA at any
stage, and we retain complete control over how the mm functionality is
applied.

Unfortunately callers still do often require some kind of custom action,
so we add optional success_hook/error_hook callbacks to allow the caller
to do something after the action has succeeded or failed.

This is done at the point when the VMA has already been established, so
the harm that can be done is limited.

The error hook can be used to filter errors if necessary.

There may be cases in which the caller absolutely must hold the file rmap
lock until the operation is entirely complete. It is an edge case, but
certainly the hugetlbfs mmap hook requires it.

To accommodate this, we add the hide_from_rmap_until_complete flag to the
mmap_action type. In this case, if a new VMA is allocated, we will hold the
file rmap lock until the operation is entirely completed (including any
success/error hooks).

Note that we do not need to update __compat_vma_mmap() to accommodate this
flag, as this function will be invoked from an .mmap handler whose VMA is
not yet visible, so we implicitly hide it from the rmap.

If any error arises on these final actions, we simply unmap the VMA
altogether.

Also update the stacked filesystem compatibility layer to utilise the
action behaviour, and update the VMA tests accordingly.

While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap()
as we are now performing actions invoked by the mmap_prepare in addition to
just the mmap_prepare hook.

Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h               |   6 +-
 include/linux/mm.h               |  74 ++++++++++++++++++++
 include/linux/mm_types.h         |  53 ++++++++++++++
 mm/util.c                        | 146 ++++++++++++++++++++++++++++++++++++---
 mm/vma.c                         | 113 ++++++++++++++++++++++--------
 tools/testing/vma/vma_internal.h |  98 ++++++++++++++++++++++++--
 6 files changed, 441 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..8cf9547a881c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file)
 	return true;
 }
 
-int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+int __compat_vma_mmap(const struct file_operations *f_op,
 		struct file *file, struct vm_area_struct *vma);
-int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
+int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
 
 static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if (file->f_op->mmap_prepare)
-		return compat_vma_mmap_prepare(file, vma);
+		return compat_vma_mmap(file, vma);
 
 	return file->f_op->mmap(file, vma);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4441ceec913f..2d060081caa5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
 	return vma_desc_size(desc) >> PAGE_SHIFT;
 }
 
+/**
+ * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN
+ * remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_remap(struct vm_area_desc *desc,
+				     unsigned long start,
+				     unsigned long start_pfn,
+				     unsigned long size)
+{
+	struct mmap_action *action = &desc->action;
+
+	/* [start, start + size) must be within the VMA. */
+	WARN_ON_ONCE(start < desc->start || start >= desc->end);
+	WARN_ON_ONCE(start + size > desc->end);
+
+	action->type = MMAP_REMAP_PFN;
+	action->remap.start = start;
+	action->remap.start_pfn = start_pfn;
+	action->remap.size = size;
+	action->remap.pgprot = desc->page_prot;
+}
+
+/**
+ * mmap_action_remap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_remap_full(struct vm_area_desc *desc,
+					  unsigned long start_pfn)
+{
+	mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+/**
+ * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN
+ * I/O remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_ioremap(struct vm_area_desc *desc,
+				       unsigned long start,
+				       unsigned long start_pfn,
+				       unsigned long size)
+{
+	mmap_action_remap(desc, start, start_pfn, size);
+	desc->action.type = MMAP_IO_REMAP_PFN;
+}
+
+/**
+ * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN I/O remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
+					  unsigned long start_pfn)
+{
+	mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+void mmap_action_prepare(struct mmap_action *action,
+			 struct vm_area_desc *desc);
+int mmap_action_complete(struct mmap_action *action,
+			 struct vm_area_struct *vma);
+
 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
 				unsigned long vm_start, unsigned long vm_end)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..5021047485a9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -773,6 +773,56 @@ struct pfnmap_track_ctx {
 };
 #endif
 
+/* What action should be taken after an .mmap_prepare call is complete? */
+enum mmap_action_type {
+	MMAP_NOTHING,		/* Mapping is complete, no further action. */
+	MMAP_REMAP_PFN,		/* Remap PFN range. */
+	MMAP_IO_REMAP_PFN,	/* I/O remap PFN range. */
+};
+
+/*
+ * Describes an action an mmap_prepare hook can instruct to be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+	union {
+		/* Remap range. */
+		struct {
+			unsigned long start;
+			unsigned long start_pfn;
+			unsigned long size;
+			pgprot_t pgprot;
+		} remap;
+	};
+	enum mmap_action_type type;
+
+	/*
+	 * If specified, this hook is invoked after the selected action has been
+	 * successfully completed. Note that the VMA write lock is still held.
+	 *
+	 * The absolute minimum ought to be done here.
+	 *
+	 * Returns 0 on success, or an error code.
+	 */
+	int (*success_hook)(const struct vm_area_struct *vma);
+
+	/*
+	 * If specified, this hook is invoked when an error occurred when
+	 * attempting the selected action.
+	 *
+	 * The hook can return an error code in order to filter the error, but
+	 * it is not valid to clear the error here.
+	 */
+	int (*error_hook)(int err);
+
+	/*
+	 * This should be set in rare instances where the operation required
+	 * that the rmap should not be able to access the VMA until
+	 * completely set up.
+	 */
+	bool hide_from_rmap_until_complete :1;
+};
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -796,6 +846,9 @@ struct vm_area_desc {
 	/* Write-only fields. */
 	const struct vm_operations_struct *vm_ops;
 	void *private_data;
+
+	/* Take further action? */
+	struct mmap_action action;
 };
 
 /*
diff --git a/mm/util.c b/mm/util.c
index 8989d5767528..97cae40c0209 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio);
 #endif
 
 /**
- * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
+ * __compat_vma_mmap() - See description for compat_vma_mmap()
  * for details. This is the same operation, only with a specific file operations
  * struct which may or may not be the same as vma->vm_file->f_op.
  * @f_op: The file operations whose .mmap_prepare() hook is specified.
@@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio);
  * @vma: The VMA to apply the .mmap_prepare() hook to.
  * Returns: 0 on success or error.
  */
-int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+int __compat_vma_mmap(const struct file_operations *f_op,
 		struct file *file, struct vm_area_struct *vma)
 {
 	struct vm_area_desc desc = {
@@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op,
 		.vm_file = vma->vm_file,
 		.vm_flags = vma->vm_flags,
 		.page_prot = vma->vm_page_prot,
+
+		.action.type = MMAP_NOTHING, /* Default */
 	};
 	int err;
 
 	err = f_op->mmap_prepare(&desc);
 	if (err)
 		return err;
-	set_vma_from_desc(vma, &desc);
 
-	return 0;
+	mmap_action_prepare(&desc.action, &desc);
+	set_vma_from_desc(vma, &desc);
+	return mmap_action_complete(&desc.action, vma);
 }
-EXPORT_SYMBOL(__compat_vma_mmap_prepare);
+EXPORT_SYMBOL(__compat_vma_mmap);
 
 /**
- * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
- * existing VMA.
+ * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA and execute any requested actions.
  * @file: The file which possesss an f_op->mmap_prepare() hook.
  * @vma: The VMA to apply the .mmap_prepare() hook to.
  *
@@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare);
  * .mmap_prepare() hook, as we are in a different context when we invoke the
  * .mmap() hook, already having a VMA to deal with.
  *
- * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * compat_vma_mmap() is a compatibility function that takes VMA state,
  * establishes a struct vm_area_desc descriptor, passes to the underlying
  * .mmap_prepare() hook and applies any changes performed by it.
  *
@@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare);
  *
  * Returns: 0 on success or error.
  */
-int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	return __compat_vma_mmap_prepare(file->f_op, file, vma);
+	return __compat_vma_mmap(file->f_op, file, vma);
 }
-EXPORT_SYMBOL(compat_vma_mmap_prepare);
+EXPORT_SYMBOL(compat_vma_mmap);
 
 static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
 			 const struct page *page)
@@ -1280,6 +1283,127 @@ again:
 	}
 }
 
+static int mmap_action_finish(struct mmap_action *action,
+		const struct vm_area_struct *vma, int err)
+{
+	/*
+	 * If an error occurs, unmap the VMA altogether and return an error. We
+	 * only clear the newly allocated VMA, since this function is only
+	 * invoked if we do NOT merge, so we only clean up the VMA we created.
+	 */
+	if (err) {
+		const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+		do_munmap(current->mm, vma->vm_start, len, NULL);
+
+		if (action->error_hook) {
+			/* We may want to filter the error. */
+			err = action->error_hook(err);
+
+			/* The caller should not clear the error. */
+			VM_WARN_ON_ONCE(!err);
+		}
+		return err;
+	}
+
+	if (action->success_hook)
+		return action->success_hook(vma);
+
+	return 0;
+}
+
+#ifdef CONFIG_MMU
+/**
+ * mmap_action_prepare - Perform preparatory setup for a VMA descriptor
+ * action which needs to be performed.
+ * @desc: The VMA descriptor to prepare for @action.
+ * @action: The action to perform.
+ */
+void mmap_action_prepare(struct mmap_action *action,
+			 struct vm_area_desc *desc)
+{
+	switch (action->type) {
+	case MMAP_NOTHING:
+		break;
+	case MMAP_REMAP_PFN:
+		remap_pfn_range_prepare(desc, action->remap.start_pfn);
+		break;
+	case MMAP_IO_REMAP_PFN:
+		io_remap_pfn_range_prepare(desc, action->remap.start_pfn,
+					   action->remap.size);
+		break;
+	}
+}
+EXPORT_SYMBOL(mmap_action_prepare);
+
+/**
+ * mmap_action_complete - Execute VMA descriptor action.
+ * @action: The action to perform.
+ * @vma: The VMA to perform the action upon.
+ *
+ * Similar to mmap_action_prepare().
+ *
+ * Return: 0 on success, or error, at which point the VMA will be unmapped.
+ */
+int mmap_action_complete(struct mmap_action *action,
+			 struct vm_area_struct *vma)
+{
+	int err = 0;
+
+	switch (action->type) {
+	case MMAP_NOTHING:
+		break;
+	case MMAP_REMAP_PFN:
+		err = remap_pfn_range_complete(vma, action->remap.start,
+				action->remap.start_pfn, action->remap.size,
+				action->remap.pgprot);
+		break;
+	case MMAP_IO_REMAP_PFN:
+		err = io_remap_pfn_range_complete(vma, action->remap.start,
+				action->remap.start_pfn, action->remap.size,
+				action->remap.pgprot);
+		break;
+	}
+
+	return mmap_action_finish(action, vma, err);
+}
+EXPORT_SYMBOL(mmap_action_complete);
+#else
+void mmap_action_prepare(struct mmap_action *action,
+			struct vm_area_desc *desc)
+{
+	switch (action->type) {
+	case MMAP_NOTHING:
+		break;
+	case MMAP_REMAP_PFN:
+	case MMAP_IO_REMAP_PFN:
+		WARN_ON_ONCE(1); /* nommu cannot handle these. */
+		break;
+	}
+}
+EXPORT_SYMBOL(mmap_action_prepare);
+
+int mmap_action_complete(struct mmap_action *action,
+			struct vm_area_struct *vma)
+{
+	int err = 0;
+
+	switch (action->type) {
+	case MMAP_NOTHING:
+		break;
+	case MMAP_REMAP_PFN:
+	case MMAP_IO_REMAP_PFN:
+		WARN_ON_ONCE(1); /* nommu cannot handle this. */
+
+		err = -EINVAL;
+		break;
+	}
+
+	return mmap_action_finish(action, vma, err);
+}
+EXPORT_SYMBOL(mmap_action_complete);
+#endif
+
 #ifdef CONFIG_MMU
 /**
  * folio_pte_batch - detect a PTE batch for a large folio
diff --git a/mm/vma.c b/mm/vma.c
index eb2f711c03a1..919d1fc63a52 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -34,7 +34,9 @@ struct mmap_state {
 	struct maple_tree mt_detach;
 
 	/* Determine if we can check KSM flags early in mmap() logic. */
-	bool check_ksm_early;
+	bool check_ksm_early :1;
+	/* If we map new, hold the file rmap lock on mapping. */
+	bool hold_file_rmap_lock :1;
 };
 
 #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
@@ -1754,7 +1756,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
 		unlink_file_vma_batch_process(vb);
 }
 
-static void vma_link_file(struct vm_area_struct *vma)
+static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping;
@@ -1763,7 +1765,8 @@ static void vma_link_file(struct vm_area_struct *vma)
 		mapping = file->f_mapping;
 		i_mmap_lock_write(mapping);
 		__vma_link_file(vma, mapping);
-		i_mmap_unlock_write(mapping);
+		if (!hold_rmap_lock)
+			i_mmap_unlock_write(mapping);
 	}
 }
 
@@ -1777,7 +1780,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
 
 	vma_start_write(vma);
 	vma_iter_store_new(&vmi, vma);
-	vma_link_file(vma);
+	vma_link_file(vma, /* hold_rmap_lock= */false);
 	mm->map_count++;
 	validate_mm(mm);
 	return 0;
@@ -2311,17 +2314,33 @@ static void update_ksm_flags(struct mmap_state *map)
 	map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
 }
 
+static void set_desc_from_map(struct vm_area_desc *desc,
+		const struct mmap_state *map)
+{
+	desc->start = map->addr;
+	desc->end = map->end;
+
+	desc->pgoff = map->pgoff;
+	desc->vm_file = map->file;
+	desc->vm_flags = map->vm_flags;
+	desc->page_prot = map->page_prot;
+}
+
 /*
  * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be
  * unmapped once the map operation is completed, check limits, account mapping
  * and clean up any pre-existing VMAs.
  *
+ * As a result it sets up the @map and @desc objects.
+ *
  * @map: Mapping state.
+ * @desc: VMA descriptor
  * @uf:  Userfaultfd context list.
  *
  * Returns: 0 on success, error code otherwise.
  */
-static int __mmap_setup(struct mmap_state *map, struct list_head *uf)
+static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc,
+			struct list_head *uf)
 {
 	int error;
 	struct vma_iterator *vmi = map->vmi;
@@ -2378,6 +2397,7 @@ static int __mmap_setup(struct mmap_state *map, struct list_head *uf)
 	 */
 	vms_clean_up_area(vms, &map->mas_detach);
 
+	set_desc_from_map(desc, map);
 	return 0;
 }
 
@@ -2479,7 +2499,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	vma_start_write(vma);
 	vma_iter_store_new(vmi, vma);
 	map->mm->map_count++;
-	vma_link_file(vma);
+	vma_link_file(vma, map->hold_file_rmap_lock);
 
 	/*
 	 * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
@@ -2539,6 +2559,17 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
 	vma_set_page_prot(vma);
 }
 
+static void call_action_prepare(struct mmap_state *map,
+				struct vm_area_desc *desc)
+{
+	struct mmap_action *action = &desc->action;
+
+	mmap_action_prepare(action, desc);
+
+	if (action->hide_from_rmap_until_complete)
+		map->hold_file_rmap_lock = true;
+}
+
 /*
  * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
  * specifies it.
@@ -2550,34 +2581,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
  *
  * Returns 0 on success, or an error code otherwise.
  */
-static int call_mmap_prepare(struct mmap_state *map)
+static int call_mmap_prepare(struct mmap_state *map,
+		struct vm_area_desc *desc)
 {
 	int err;
-	struct vm_area_desc desc = {
-		.mm = map->mm,
-		.file = map->file,
-		.start = map->addr,
-		.end = map->end,
-
-		.pgoff = map->pgoff,
-		.vm_file = map->file,
-		.vm_flags = map->vm_flags,
-		.page_prot = map->page_prot,
-	};
 
 	/* Invoke the hook. */
-	err = vfs_mmap_prepare(map->file, &desc);
+	err = vfs_mmap_prepare(map->file, desc);
 	if (err)
 		return err;
 
+	call_action_prepare(map, desc);
+
 	/* Update fields permitted to be changed. */
-	map->pgoff = desc.pgoff;
-	map->file = desc.vm_file;
-	map->vm_flags = desc.vm_flags;
-	map->page_prot = desc.page_prot;
+	map->pgoff = desc->pgoff;
+	map->file = desc->vm_file;
+	map->vm_flags = desc->vm_flags;
+	map->page_prot = desc->page_prot;
 	/* User-defined fields. */
-	map->vm_ops = desc.vm_ops;
-	map->vm_private_data = desc.private_data;
+	map->vm_ops = desc->vm_ops;
+	map->vm_private_data = desc->private_data;
 
 	return 0;
 }
@@ -2619,22 +2642,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
 	return false;
 }
 
+static int call_action_complete(struct mmap_state *map,
+				struct vm_area_desc *desc,
+				struct vm_area_struct *vma)
+{
+	struct mmap_action *action = &desc->action;
+	int ret;
+
+	ret = mmap_action_complete(action, vma);
+
+	/* If we held the file rmap we need to release it. */
+	if (map->hold_file_rmap_lock) {
+		struct file *file = vma->vm_file;
+
+		i_mmap_unlock_write(file->f_mapping);
+	}
+	return ret;
+}
+
 static unsigned long __mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
 		struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	int error;
 	bool have_mmap_prepare = file && file->f_op->mmap_prepare;
 	VMA_ITERATOR(vmi, mm, addr);
 	MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+	struct vm_area_desc desc = {
+		.mm = mm,
+		.file = file,
+		.action = {
+			.type = MMAP_NOTHING, /* Default to no further action. */
+		},
+	};
+	bool allocated_new = false;
+	int error;
 
 	map.check_ksm_early = can_set_ksm_flags_early(&map);
 
-	error = __mmap_setup(&map, uf);
+	error = __mmap_setup(&map, &desc, uf);
 	if (!error && have_mmap_prepare)
-		error = call_mmap_prepare(&map);
+		error = call_mmap_prepare(&map, &desc);
 	if (error)
 		goto abort_munmap;
 
@@ -2653,6 +2702,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 		error = __mmap_new_vma(&map, &vma);
 		if (error)
 			goto unacct_error;
+		allocated_new = true;
 	}
 
 	if (have_mmap_prepare)
@@ -2660,6 +2710,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 
 	__mmap_complete(&map, vma);
 
+	if (have_mmap_prepare && allocated_new) {
+		error = call_action_complete(&map, &desc, vma);
+
+		if (error)
+			return error;
+	}
+
 	return addr;
 
 	/* Accounting was done by __mmap_setup(). */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index dc976a285ad2..d873667704e8 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -275,6 +275,57 @@ struct mm_struct {
 
 struct vm_area_struct;
 
+
+/* What action should be taken after an .mmap_prepare call is complete? */
+enum mmap_action_type {
+	MMAP_NOTHING,		/* Mapping is complete, no further action. */
+	MMAP_REMAP_PFN,		/* Remap PFN range. */
+	MMAP_IO_REMAP_PFN,	/* I/O remap PFN range. */
+};
+
+/*
+ * Describes an action an mmap_prepare hook can instruct to be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+	union {
+		/* Remap range. */
+		struct {
+			unsigned long start;
+			unsigned long start_pfn;
+			unsigned long size;
+			pgprot_t pgprot;
+		} remap;
+	};
+	enum mmap_action_type type;
+
+	/*
+	 * If specified, this hook is invoked after the selected action has been
+	 * successfully completed. Note that the VMA write lock is still held.
+	 *
+	 * The absolute minimum ought to be done here.
+	 *
+	 * Returns 0 on success, or an error code.
+	 */
+	int (*success_hook)(const struct vm_area_struct *vma);
+
+	/*
+	 * If specified, this hook is invoked when an error occurs while
+	 * attempting the selected action.
+	 *
+	 * The hook can return an error code in order to filter the error, but
+	 * it is not valid to clear the error here.
+	 */
+	int (*error_hook)(int err);
+
+	/*
+	 * This should be set in rare instances where the operation requires
+	 * that the rmap should not be able to access the VMA until
+	 * completely set up.
+	 */
+	bool hide_from_rmap_until_complete :1;
+};
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -298,6 +349,9 @@ struct vm_area_desc {
 	/* Write-only fields. */
 	const struct vm_operations_struct *vm_ops;
 	void *private_data;
+
+	/* Take further action? */
+	struct mmap_action action;
 };
 
 struct file_operations {
@@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma)
 static inline void set_vma_from_desc(struct vm_area_struct *vma,
 		struct vm_area_desc *desc);
 
-static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+static inline void mmap_action_prepare(struct mmap_action *action,
+					   struct vm_area_desc *desc)
+{
+}
+
+static inline int mmap_action_complete(struct mmap_action *action,
+					   struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+static inline int __compat_vma_mmap(const struct file_operations *f_op,
 		struct file *file, struct vm_area_struct *vma)
 {
 	struct vm_area_desc desc = {
 		.mm = vma->vm_mm,
-		.file = vma->vm_file,
+		.file = file,
 		.start = vma->vm_start,
 		.end = vma->vm_end,
 
@@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op,
 		.vm_file = vma->vm_file,
 		.vm_flags = vma->vm_flags,
 		.page_prot = vma->vm_page_prot,
+
+		.action.type = MMAP_NOTHING, /* Default */
 	};
 	int err;
 
 	err = f_op->mmap_prepare(&desc);
 	if (err)
 		return err;
-	set_vma_from_desc(vma, &desc);
 
-	return 0;
+	mmap_action_prepare(&desc.action, &desc);
+	set_vma_from_desc(vma, &desc);
+	return mmap_action_complete(&desc.action, vma);
 }
 
-static inline int compat_vma_mmap_prepare(struct file *file,
+static inline int compat_vma_mmap(struct file *file,
 		struct vm_area_struct *vma)
 {
-	return __compat_vma_mmap_prepare(file->f_op, file, vma);
+	return __compat_vma_mmap(file->f_op, file, vma);
 }
 
 /* Did the driver provide valid mmap hook configuration? */
@@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file)
 static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if (file->f_op->mmap_prepare)
-		return compat_vma_mmap_prepare(file, vma);
+		return compat_vma_mmap(file, vma);
 
 	return file->f_op->mmap(file, vma);
 }
@@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
 	return vm_flags;
 }
 
+static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+}
+
+static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t pgprot)
+{
+	return 0;
+}
+
+static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
+		struct list_head *uf)
+{
+	return 0;
+}
+
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:29 +0100
Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare

Since we can now perform actions after the VMA is established via
mmap_prepare, use desc->action.success_hook to set up the hugetlb lock
once the VMA is set up.

We also make changes throughout hugetlbfs to make this possible.

Note that we must hide newly established hugetlb VMAs from the rmap until
the operation is entirely complete as we establish a hugetlb lock during
VMA setup that can be raced by rmap users.

Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c           | 46 ++++++++++++++++++-------
 include/linux/hugetlb.h        |  9 +++--
 include/linux/hugetlb_inline.h | 15 +++++---
 mm/hugetlb.c                   | 77 ++++++++++++++++++++++++------------------
 4 files changed, 95 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ce8e40d35032..3919fca56553 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 #define PGOFF_LOFFT_MAX \
 	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
 {
+	/* Unfortunately we have to reassign vma->vm_private_data. */
+	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
+}
+
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+	struct file *file = desc->file;
 	struct inode *inode = file_inode(file);
 	loff_t len, vma_len;
 	int ret;
@@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * way when do_mmap unwinds (may be important on powerpc
 	 * and ia64).
 	 */
-	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
-	vma->vm_ops = &hugetlb_vm_ops;
+	desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+	desc->vm_ops = &hugetlb_vm_ops;
 
 	/*
 	 * page based offset in vm_pgoff could be sufficiently large to
@@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * sizeof(unsigned long).  So, only check in those instances.
 	 */
 	if (sizeof(unsigned long) == sizeof(loff_t)) {
-		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+		if (desc->pgoff & PGOFF_LOFFT_MAX)
 			return -EINVAL;
 	}
 
 	/* must be huge page aligned */
-	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+	if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
 
-	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
-	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	vma_len = (loff_t)vma_desc_size(desc);
+	len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
 	/* check for overflow */
 	if (len < vma_len)
 		return -EINVAL;
@@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	ret = -ENOMEM;
 
-	vm_flags = vma->vm_flags;
+	vm_flags = desc->vm_flags;
 	/*
 	 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
 	 * reserving here. Note: only for SHM hugetlbfs file, the inode
@@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		vm_flags |= VM_NORESERVE;
 
 	if (hugetlb_reserve_pages(inode,
-				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma,
-				vm_flags) < 0)
+			desc->pgoff >> huge_page_order(h),
+			len >> huge_page_shift(h), desc,
+			vm_flags) < 0)
 		goto out;
 
 	ret = 0;
-	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+	if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
 		i_size_write(inode, len);
 out:
 	inode_unlock(inode);
 
+	if (!ret) {
+		/* Allocate the VMA lock after we set it up. */
+		desc->action.success_hook = hugetlb_file_mmap_prepare_success;
+		/*
+		 * We cannot permit the rmap finding this VMA in the time
+		 * between the VMA being inserted into the VMA tree and the
+		 * completion/success hook being invoked.
+		 *
+		 * This is because we establish a per-VMA hugetlb lock which can
+		 * be raced by rmap.
+		 */
+		desc->action.hide_from_rmap_until_complete = true;
+	}
 	return ret;
 }
 
@@ -1220,7 +1240,7 @@ static void init_once(void *foo)
 
 static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
-	.mmap			= hugetlbfs_file_mmap,
+	.mmap_prepare		= hugetlbfs_file_mmap_prepare,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2387513d6ae5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
 long hugetlb_reserve_pages(struct inode *inode, long from, long to,
-						struct vm_area_struct *vma,
-						vm_flags_t vm_flags);
+			   struct vm_area_desc *desc, vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 
 static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
 
+static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..a27aa0162918 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -2,22 +2,27 @@
 #ifndef _LINUX_HUGETLB_INLINE_H
 #define _LINUX_HUGETLB_INLINE_H
 
-#ifdef CONFIG_HUGETLB_PAGE
-
 #include <linux/mm.h>
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
-	return !!(vma->vm_flags & VM_HUGETLB);
+	return !!(vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
 	return false;
 }
 
 #endif
 
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return is_vm_hugetlb_flags(vma->vm_flags);
+}
+
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7774c286b3b7..86e672fcb305 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, bool take_locks);
@@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 	}
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
 	struct hugetlb_vma_lock *vma_lock;
 
 	/* Only establish in (flags) sharable vmas */
 	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-		return;
+		return 0;
 
 	/* Should never get here with non-NULL vm_private_data */
 	if (vma->vm_private_data)
-		return;
+		return -EINVAL;
 
 	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
 	if (!vma_lock) {
@@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 		 * allocation failure.
 		 */
 		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-		return;
+		return -EINVAL;
 	}
 
 	kref_init(&vma_lock->refs);
 	init_rwsem(&vma_lock->rw_sema);
 	vma_lock->vma = vma;
 	vma->vm_private_data = vma_lock;
+
+	return 0;
 }
 
 /* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1201,20 +1206,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	}
 }
 
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+	VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 
-	set_vma_private_data(vma, (unsigned long)map);
+	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
 
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
 
-	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+	desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+	desc->private_data = (void *)((unsigned long)desc->private_data | flags);
 }
 
 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+	return ((unsigned long)desc->private_data) & flag;
+}
+
 bool __vma_private_lock(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
  */
 
 long hugetlb_reserve_pages(struct inode *inode,
-					long from, long to,
-					struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
+		long from, long to,
+		struct vm_area_desc *desc,
+		vm_flags_t vm_flags)
 {
 	long chg = -1, add = -1, spool_resv, gbl_resv;
 	struct hstate *h = hstate_inode(inode);
@@ -7287,12 +7307,6 @@ long hugetlb_reserve_pages(struct inode *inode,
 		return -EINVAL;
 	}
 
-	/*
-	 * vma specific semaphore used for pmd sharing and fault/truncation
-	 * synchronization
-	 */
-	hugetlb_vma_lock_alloc(vma);
-
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
@@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * Shared mappings base their reservation on the number of pages that
 	 * are already allocated on behalf of the file. Private mappings need
 	 * to reserve the full area even if read-only as mprotect() may be
-	 * called to make the mapping read-write. Assume !vma is a shm mapping
+	 * called to make the mapping read-write. Assume !desc is a shm mapping
 	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		/*
 		 * resv_map can not be NULL as hugetlb_reserve_pages is only
 		 * called for inodes for which resv_maps were created (see
@@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode *inode,
 
 		chg = to - from;
 
-		set_vma_resv_map(vma, resv_map);
-		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+		set_vma_desc_resv_map(desc, resv_map);
+		set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
@@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 				chg * pages_per_huge_page(h), &h_cg) < 0)
 		goto out_err;
 
-	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+	if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
 		 * of the resv_map.
 		 */
@@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * consumed reservations are stored in the map. Hence, nothing
 	 * else has to be done for private mappings here
 	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
 		if (unlikely(add < 0)) {
@@ -7423,16 +7437,15 @@ out_uncharge_cgroup:
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
 out_err:
-	hugetlb_vma_lock_free(vma);
-	if (!vma || vma->vm_flags & VM_MAYSHARE)
+	if (!desc || desc->vm_flags & VM_MAYSHARE)
 		/* Only call region_abort if the region_chg succeeded but the
 		 * region_add failed or didn't run.
 		 */
 		if (chg >= 0 && add < 0)
 			region_abort(resv_map, from, to, regions_needed);
-	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+	if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
 		kref_put(&resv_map->refs, resv_map_release);
-		set_vma_resv_map(vma, NULL);
+		set_vma_desc_resv_map(desc, NULL);
 	}
 	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }
-- 
cgit v1.2.3


From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 20 Oct 2025 13:11:30 +0100
Subject: mm: add shmem_zero_setup_desc()

Add the ability to set up a shared anonymous mapping based on a VMA
descriptor rather than a VMA.

This is a prerequisite for converting the char mm driver to use the
mmap_prepare hook.

Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h |  3 ++-
 mm/shmem.c               | 41 +++++++++++++++++++++++++++++++++--------
 2 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..5b368f9549d6 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
 					    unsigned long flags);
 extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
 		const char *name, loff_t size, unsigned long flags);
-extern int shmem_zero_setup(struct vm_area_struct *);
+int shmem_zero_setup(struct vm_area_struct *vma);
+int shmem_zero_setup_desc(struct vm_area_desc *desc);
 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
diff --git a/mm/shmem.c b/mm/shmem.c
index 8b9fcdd144c8..da1df4270309 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5880,14 +5880,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
 }
 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
 
-/**
- * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
+static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
 {
-	struct file *file;
-	loff_t size = vma->vm_end - vma->vm_start;
+	loff_t size = end - start;
 
 	/*
 	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
@@ -5895,7 +5890,18 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	 * accessible to the user through its mapping, use S_PRIVATE flag to
 	 * bypass file security, in the same way as shmem_kernel_file_setup().
 	 */
-	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
+	return shmem_kernel_file_setup("dev/zero", size, vm_flags);
+}
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
+
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -5907,6 +5913,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	return 0;
 }
 
+/**
+ * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
+ * descriptor for convenience.
+ * @desc: Describes VMA
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup_desc(struct vm_area_desc *desc)
+{
+	struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	desc->vm_file = file;
+	desc->vm_ops = &shmem_anon_vm_ops;
+
+	return 0;
+}
+
 /**
  * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
  * @mapping:	the folio's address_space
-- 
cgit v1.2.3


From 5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Tue, 21 Oct 2025 16:44:25 -0700
Subject: memcg: manually uninline __memcg_memory_event

__memcg_memory_event() has been unnecessarily marked inline even when it
is not really performance critical.  It is usually called to track extreme
conditions.  Over time, it has evolved to include more functionality
and inlining it is causing more harm than good.

Before the patch:
$ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o
   text    data     bss     dec     hex filename
  35645   10574    4192   50411    c4eb mm/memcontrol.o
  54738    1658       0   56396    dc4c net/ipv4/tcp_input.o
  34644    1065       0   35709    8b7d net/ipv4/tcp_output.o

After the patch:
$ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o
   text    data     bss     dec     hex filename
  35137   10446    4192   49775    c26f mm/memcontrol.o
  54322    1562       0   55884    da4c net/ipv4/tcp_input.o
  34492    1017       0   35509    8ab5 net/ipv4/tcp_output.o

[akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph]
Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 32 ++------------------------------
 mm/memcontrol.c            | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5fe254813123..8c0f15e5978f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	count_memcg_events_mm(mm, idx, 1);
 }
 
-static inline void __memcg_memory_event(struct mem_cgroup *memcg,
-					enum memcg_memory_event event,
-					bool allow_spinning)
-{
-	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
-			  event == MEMCG_SWAP_FAIL;
-
-	/* For now only MEMCG_MAX can happen with !allow_spinning context. */
-	VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
-
-	atomic_long_inc(&memcg->memory_events_local[event]);
-	if (!swap_event && allow_spinning)
-		cgroup_file_notify(&memcg->events_local_file);
-
-	do {
-		atomic_long_inc(&memcg->memory_events[event]);
-		if (allow_spinning) {
-			if (swap_event)
-				cgroup_file_notify(&memcg->swap_events_file);
-			else
-				cgroup_file_notify(&memcg->events_file);
-		}
-
-		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-			break;
-		if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
-			break;
-	} while ((memcg = parent_mem_cgroup(memcg)) &&
-		 !mem_cgroup_is_root(memcg));
-}
+void __memcg_memory_event(struct mem_cgroup *memcg,
+			  enum memcg_memory_event event, bool allow_spinning);
 
 static inline void memcg_memory_event(struct mem_cgroup *memcg,
 				      enum memcg_memory_event event)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 976412c8196e..025da46d9959 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1626,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
 	return page_counter_read(&memcg->memory);
 }
 
+void __memcg_memory_event(struct mem_cgroup *memcg,
+			  enum memcg_memory_event event, bool allow_spinning)
+{
+	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
+			  event == MEMCG_SWAP_FAIL;
+
+	/* For now only MEMCG_MAX can happen with !allow_spinning context. */
+	VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
+
+	atomic_long_inc(&memcg->memory_events_local[event]);
+	if (!swap_event && allow_spinning)
+		cgroup_file_notify(&memcg->events_local_file);
+
+	do {
+		atomic_long_inc(&memcg->memory_events[event]);
+		if (allow_spinning) {
+			if (swap_event)
+				cgroup_file_notify(&memcg->swap_events_file);
+			else
+				cgroup_file_notify(&memcg->events_file);
+		}
+
+		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+			break;
+		if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+			break;
+	} while ((memcg = parent_mem_cgroup(memcg)) &&
+		 !mem_cgroup_is_root(memcg));
+}
+EXPORT_SYMBOL_GPL(__memcg_memory_event);
+
 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
-- 
cgit v1.2.3


From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 22 Oct 2025 16:26:28 +0800
Subject: mm: add a ptdesc flag to mark kernel page tables

The page tables used to map the kernel and userspace often have very
different handling rules.  There are frequently *_kernel() variants of
functions just for kernel page tables.  That's not great and has led to
code duplication.

Instead of having completely separate call paths, allow a 'ptdesc' to be
marked as being for kernel mappings.  Introduce helpers to set and clear
this status.

Note: this uses the PG_referenced bit.  Page flags are a great fit for
this since it is truly a single bit of information.  Use PG_referenced
itself because it's a fairly benign flag (as opposed to things like
PG_lock).  It's also (according to Willy) unlikely to go away any time
soon.

PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE.  It does not need to be
cleared before freeing the page, and pages coming out of the allocator
should have it cleared.  Regardless, introduce an API to clear it anyway.
Having symmetry in the API makes it easier to change the underlying
implementation later, like if there was a need to move to a
PAGE_FLAGS_CHECK_AT_FREE bit.

Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2d060081caa5..5c887c4ea29e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 #endif /* CONFIG_MMU */
 
 enum pt_flags {
+	PT_kernel = PG_referenced,
 	PT_reserved = PG_reserved,
 	/* High bits are used for zone/node/section */
 };
@@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
 	return test_bit(PT_reserved, &pt->pt_flags.f);
 }
 
+/**
+ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
+ * @ptdesc: The ptdesc to be marked
+ *
+ * Kernel page tables often need special handling. Set a flag so that
+ * the handling code knows this ptdesc will not be used for userspace.
+ */
+static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
+{
+	set_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
+ * @ptdesc: The ptdesc to be unmarked
+ *
+ * Use when the ptdesc is no longer used to map the kernel and no longer
+ * needs special handling.
+ */
+static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
+{
+	/*
+	 * Note: the 'PG_referenced' bit does not strictly need to be
+	 * cleared before freeing the page. But this is nice for
+	 * symmetry.
+	 */
+	clear_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
+ * @ptdesc: The ptdesc being tested
+ *
+ * Call to tell if the ptdesc is used to map the kernel.
+ */
+static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
+{
+	return test_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
 /**
  * pagetable_alloc - Allocate pagetables
  * @gfp:    GFP flags
-- 
cgit v1.2.3


From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 22 Oct 2025 16:26:29 +0800
Subject: mm: actually mark kernel page table pages

Now that the API is in place, mark kernel page table pages just after they
are allocated.  Unmark them just before they are freed.

Note: Unconditionally clearing the 'kernel' marking (via
ptdesc_clear_kernel()) would be functionally identical to what is here.
But having the if() makes it logically clear that this function can be
used for kernel and non-kernel page tables.

Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/pgalloc.h | 18 ++++++++++++++++++
 include/linux/mm.h            |  3 +++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..b9d2a7c79b93 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 		return NULL;
 	}
 
+	ptdesc_set_kernel(ptdesc);
+
 	return ptdesc_address(ptdesc);
 }
 #define __pte_alloc_one_kernel(...)	alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
@@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad
 		pagetable_free(ptdesc);
 		return NULL;
 	}
+
+	if (mm == &init_mm)
+		ptdesc_set_kernel(ptdesc);
+
 	return ptdesc_address(ptdesc);
 }
 #define pmd_alloc_one(...)	alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
@@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long
 		return NULL;
 
 	pagetable_pud_ctor(ptdesc);
+
+	if (mm == &init_mm)
+		ptdesc_set_kernel(ptdesc);
+
 	return ptdesc_address(ptdesc);
 }
 #define __pud_alloc_one(...)	alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
@@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long
 		return NULL;
 
 	pagetable_p4d_ctor(ptdesc);
+
+	if (mm == &init_mm)
+		ptdesc_set_kernel(ptdesc);
+
 	return ptdesc_address(ptdesc);
 }
 #define __p4d_alloc_one(...)	alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))
@@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order
 		return NULL;
 
 	pagetable_pgd_ctor(ptdesc);
+
+	if (mm == &init_mm)
+		ptdesc_set_kernel(ptdesc);
+
 	return ptdesc_address(ptdesc);
 }
 #define __pgd_alloc(...)	alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5c887c4ea29e..8f46048875a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt)
 {
 	struct page *page = ptdesc_page(pt);
 
+	if (ptdesc_test_kernel(pt))
+		ptdesc_clear_kernel(pt);
+
 	__free_pages(page, compound_order(page));
 }
 
-- 
cgit v1.2.3


From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 22 Oct 2025 16:26:31 +0800
Subject: mm: introduce pure page table freeing function

The pages used for ptdescs are currently freed back to the allocator in a
single location.  They will shortly be freed from a second location.

Create a simple helper that just frees them back to the allocator.

Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f46048875a7..88c0a0fae43a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
 }
 #define pagetable_alloc(...)	alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
 
+static inline void __pagetable_free(struct ptdesc *pt)
+{
+	struct page *page = ptdesc_page(pt);
+
+	__free_pages(page, compound_order(page));
+}
+
 /**
  * pagetable_free - Free pagetables
  * @pt:	The page table descriptor
@@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
  */
 static inline void pagetable_free(struct ptdesc *pt)
 {
-	struct page *page = ptdesc_page(pt);
-
 	if (ptdesc_test_kernel(pt))
 		ptdesc_clear_kernel(pt);
 
-	__free_pages(page, compound_order(page));
+	__pagetable_free(pt);
 }
 
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
-- 
cgit v1.2.3


From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 22 Oct 2025 16:26:33 +0800
Subject: mm: introduce deferred freeing for kernel page tables

This introduces a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE.  When enabled, this mechanism defers the
freeing of pages that are used as page tables for kernel address mappings.
These pages are now queued to a work struct instead of being freed
immediately.

This deferred freeing allows for batch-freeing of page tables, providing a
safe context for performing a single expensive operation (TLB flush) for a
batch of kernel page tables instead of performing that expensive operation
for each page table.

Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   | 16 +++++++++++++---
 mm/Kconfig           |  3 +++
 mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 88c0a0fae43a..a6fd9f5aaf30 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
 	__free_pages(page, compound_order(page));
 }
 
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+	__pagetable_free(pt);
+}
+#endif
 /**
  * pagetable_free - Free pagetables
  * @pt:	The page table descriptor
@@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt)
  */
 static inline void pagetable_free(struct ptdesc *pt)
 {
-	if (ptdesc_test_kernel(pt))
+	if (ptdesc_test_kernel(pt)) {
 		ptdesc_clear_kernel(pt);
-
-	__pagetable_free(pt);
+		pagetable_free_kernel(pt);
+	} else {
+		__pagetable_free(pt);
+	}
 }
 
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4971436c8697..682a5c39a1a6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS
 	def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
 		 (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 
+config ASYNC_KERNEL_PGTABLE_FREE
+	def_bool n
+
 # TODO: Allow to be enabled without THP
 config ARCH_SUPPORTS_HUGE_PFNMAP
 	def_bool n
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..1c7caa8ef164 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -406,3 +406,40 @@ again:
 	pte_unmap_unlock(pte, ptl);
 	goto again;
 }
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+	struct list_head list;
+	/* protect above ptdesc lists */
+	spinlock_t lock;
+	struct work_struct work;
+} kernel_pgtable_work = {
+	.list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+	struct ptdesc *pt, *next;
+	LIST_HEAD(page_list);
+
+	spin_lock(&kernel_pgtable_work.lock);
+	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+	spin_unlock(&kernel_pgtable_work.lock);
+
+	list_for_each_entry_safe(pt, next, &page_list, pt_list)
+		__pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+	spin_lock(&kernel_pgtable_work.lock);
+	list_add(&pt->pt_list, &kernel_pgtable_work.list);
+	spin_unlock(&kernel_pgtable_work.lock);
+
+	schedule_work(&kernel_pgtable_work.work);
+}
+#endif
-- 
cgit v1.2.3


From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 22 Oct 2025 16:26:34 +0800
Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space

Introduce a new IOMMU interface to flush IOTLB paging cache entries for
the CPU kernel address space.  This interface is invoked from the x86
architecture code that manages combined user and kernel page tables,
specifically before any kernel page table page is freed and reused.

This addresses the main issue with vfree() which is a common occurrence
and can be triggered by unprivileged users.  While this resolves the
primary problem, it doesn't address an extremely rare case related to
memory unplug of memory that was present as reserved memory at boot, which
cannot be triggered by unprivileged users.  The discussion can be found at
the link below.

Enable SVA on x86 architecture since the IOMMU can now receive
notification to flush the paging cache before freeing the CPU kernel page
table pages.

Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Suggested-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/Kconfig          |  1 +
 drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++----
 include/linux/iommu.h     |  4 ++++
 mm/pgtable-generic.c      |  2 ++
 4 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa3b616af03a..a3700766a8c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -279,6 +279,7 @@ config X86
 	select HAVE_PCI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select ASYNC_KERNEL_PGTABLE_FREE	if IOMMU_SVA
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_MERGE_VMAS
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index a0442faad952..d236aef80a8d 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -10,6 +10,8 @@
 #include "iommu-priv.h"
 
 static DEFINE_MUTEX(iommu_sva_lock);
+static bool iommu_sva_present;
+static LIST_HEAD(iommu_sva_mms);
 static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 						   struct mm_struct *mm);
 
@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
 		return ERR_PTR(-ENOSPC);
 	}
 	iommu_mm->pasid = pasid;
+	iommu_mm->mm = mm;
 	INIT_LIST_HEAD(&iommu_mm->sva_domains);
 	/*
 	 * Make sure the write to mm->iommu_mm is not reordered in front of
@@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 	if (!group)
 		return ERR_PTR(-ENODEV);
 
-	if (IS_ENABLED(CONFIG_X86))
-		return ERR_PTR(-EOPNOTSUPP);
-
 	mutex_lock(&iommu_sva_lock);
 
 	/* Allocate mm->pasid if necessary. */
@@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 	if (ret)
 		goto out_free_domain;
 	domain->users = 1;
-	list_add(&domain->next, &mm->iommu_mm->sva_domains);
 
+	if (list_empty(&iommu_mm->sva_domains)) {
+		if (list_empty(&iommu_sva_mms))
+			iommu_sva_present = true;
+		list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+	}
+	list_add(&domain->next, &iommu_mm->sva_domains);
 out:
 	refcount_set(&handle->users, 1);
 	mutex_unlock(&iommu_sva_lock);
@@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
 		list_del(&domain->next);
 		iommu_domain_free(domain);
 	}
+
+	if (list_empty(&iommu_mm->sva_domains)) {
+		list_del(&iommu_mm->mm_list_elm);
+		if (list_empty(&iommu_sva_mms))
+			iommu_sva_present = false;
+	}
+
 	mutex_unlock(&iommu_sva_lock);
 	kfree(handle);
 }
@@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 
 	return domain;
 }
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+	struct iommu_mm_data *iommu_mm;
+
+	guard(mutex)(&iommu_sva_lock);
+	if (!iommu_sva_present)
+		return;
+
+	list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+		mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c30d12e16473..66e4abb2df0d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1134,7 +1134,9 @@ struct iommu_sva {
 
 struct iommu_mm_data {
 	u32			pasid;
+	struct mm_struct	*mm;
 	struct list_head	sva_domains;
+	struct list_head	mm_list_elm;
 };
 
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm);
 void iommu_sva_unbind_device(struct iommu_sva *handle);
 u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
 #else
 static inline struct iommu_sva *
 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
 }
 
 static inline void mm_pasid_drop(struct mm_struct *mm) {}
+static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
 #endif /* CONFIG_IOMMU_SVA */
 
 #ifdef CONFIG_IOMMU_IOPF
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 1c7caa8ef164..8c22be79b734 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mm_inline.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
@@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work)
 	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
 	spin_unlock(&kernel_pgtable_work.lock);
 
+	iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
 	list_for_each_entry_safe(pt, next, &page_list, pt_list)
 		__pagetable_free(pt);
 }
-- 
cgit v1.2.3


From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Fri, 24 Oct 2025 02:00:41 +0800
Subject: mm, swap: cleanup swap entry allocation parameter

We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap:
use the swap table for the swap cache and switch API").  Before that
commit the GFP parameter was already almost identical for all callers, so
nothing was changed by that commit.  The swap table just moved the GFP to a
lower layer, making it better defined and dependent on whether the
allocation is atomic or may sleep.

Now this parameter is no longer used, just remove it.  No behavior change.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 4 ++--
 mm/shmem.c           | 2 +-
 mm/swapfile.c        | 3 +--
 mm/vmscan.c          | 4 ++--
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e818fbade1e2..a4b264817735 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void)
 }
 
 extern void si_swapinfo(struct sysinfo *);
-int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
+int folio_alloc_swap(struct folio *folio);
 bool folio_free_swap(struct folio *folio);
 void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
+static inline int folio_alloc_swap(struct folio *folio)
 {
 	return -EINVAL;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index da1df4270309..e1dc2d8e939c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1617,7 +1617,7 @@ try_split:
 		folio_mark_uptodate(folio);
 	}
 
-	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
+	if (!folio_alloc_swap(folio)) {
 		bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
 		int error;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 808052319c0b..d87b562ae661 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1417,7 +1417,6 @@ static bool swap_sync_discard(void)
 /**
  * folio_alloc_swap - allocate swap space for a folio
  * @folio: folio we want to move to swap
- * @gfp: gfp mask for shadow nodes
  *
  * Allocate swap space for the folio and add the folio to the
  * swap cache.
@@ -1425,7 +1424,7 @@ static bool swap_sync_discard(void)
  * Context: Caller needs to hold the folio lock.
  * Return: Whether the folio was added to the swap cache.
  */
-int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+int folio_alloc_swap(struct folio *folio)
 {
 	unsigned int order = folio_order(folio);
 	unsigned int size = 1 << order;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ecc90517b791..c23c9616052a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1318,7 +1318,7 @@ retry:
 					    split_folio_to_list(folio, folio_list))
 						goto activate_locked;
 				}
-				if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
+				if (folio_alloc_swap(folio)) {
 					int __maybe_unused order = folio_order(folio);
 
 					if (!folio_test_large(folio))
@@ -1334,7 +1334,7 @@ retry:
 					}
 #endif
 					count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
-					if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
+					if (folio_alloc_swap(folio))
 						goto activate_locked_split;
 				}
 				/*
-- 
cgit v1.2.3


From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 22 Oct 2025 18:25:25 -0700
Subject: mm/damon/core: add damon_target->obsolete for pin-point removal

Patch series "mm/damon: support pin-point targets removal".

DAMON maintains the targets in a list, and allows committing only an
entire list of targets having the new parameters.  Targets having same
index on the lists are treated as matching source and destination
targets.  If an existing target cannot find a matching one in the
sources list, the target is removed.  This means that there is no way to
remove only a specific monitoring target in the middle of the current
targets list.

Such pin-point target removal is really needed in some use cases,
though.  Monitoring access patterns on virtual address spaces of
processes that spawned from the same ancestor is one example.  If a
process of the group is terminated, the user may want to remove the
matching DAMON target as soon as possible, to save in-kernel memory
usage for the unnecessary target data.  The user may also want to do
that without turning DAMON off or removing unnecessary targets, to keep
the current monitoring results for other active processes.

Extend DAMON kernel API and sysfs ABI to support the pin-point removal
in the following way.  For API, add a new damon_target field, namely
'obsolete'.  If the field on parameters commit source target is set, it
means the matching destination target is obsolete.  Then the parameters
commit logic removes the destination target from the existing targets
list.  For sysfs ABI, add a new file under the target directory, namely
'obsolete_target'.  It is connected with the 'obsolete' field of the
commit source targets, so internally using the new API.

Also add a selftest for the new feature.  The related helper scripts for
manipulating the sysfs interface and dumping in-kernel DAMON status are
also extended for this.  Note that the selftest part was initially
posted as an individual RFC series [1], but now merged into this one.

Bijan Tabatabai originally reported this issue, and participated in the
solution design on a GitHub issue [1] for the DAMON user-space tool.


This patch (of 9):

DAMON's monitoring targets parameters update function,
damon_commit_targets(), is not providing a way to remove a target in the
middle of the existing targets list.  Extend the API by adding a field to
struct damon_target.  If the field of a damon_commit_targets() source
target is set, it indicates the matching target on the existing targets
list is obsolete.  damon_commit_targets() understands that and removes
those from the list, while respecting the index based matching for other
non-obsolete targets.

Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org
Link: https://github.com/damonitor/damo/issues/36 [1]
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  6 ++++++
 mm/damon/core.c       | 10 +++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9ee026c2db53..f3566b978cdf 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -91,17 +91,23 @@ struct damon_region {
  * @nr_regions:		Number of monitoring target regions of this target.
  * @regions_list:	Head of the monitoring target regions of this target.
  * @list:		List head for siblings.
+ * @obsolete:		Whether the commit destination target is obsolete.
  *
  * Each monitoring context could have multiple targets.  For example, a context
  * for virtual memory address spaces could have multiple target processes.  The
  * @pid should be set for appropriate &struct damon_operations including the
  * virtual address spaces monitoring operations.
+ *
+ * @obsolete is used only for damon_commit_targets() source targets, to specify
+ * the matching destination targets are obsolete.  Read damon_commit_targets()
+ * to see how it is handled.
  */
 struct damon_target {
 	struct pid *pid;
 	unsigned int nr_regions;
 	struct list_head regions_list;
 	struct list_head list;
+	bool obsolete;
 };
 
 /**
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 769da97fcb26..06ad359024ad 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -479,6 +479,7 @@ struct damon_target *damon_new_target(void)
 	t->nr_regions = 0;
 	INIT_LIST_HEAD(&t->regions_list);
 	INIT_LIST_HEAD(&t->list);
+	t->obsolete = false;
 
 	return t;
 }
@@ -1187,7 +1188,11 @@ static int damon_commit_targets(
 
 	damon_for_each_target_safe(dst_target, next, dst) {
 		src_target = damon_nth_target(i++, src);
-		if (src_target) {
+		/*
+		 * If src target is obsolete, do not commit the parameters to
+		 * the dst target, and further remove the dst target.
+		 */
+		if (src_target && !src_target->obsolete) {
 			err = damon_commit_target(
 					dst_target, damon_target_has_pid(dst),
 					src_target, damon_target_has_pid(src),
@@ -1210,6 +1215,9 @@ static int damon_commit_targets(
 	damon_for_each_target_safe(src_target, next, src) {
 		if (j++ < i)
 			continue;
+		/* target to remove has no matching dst */
+		if (src_target->obsolete)
+			return -EINVAL;
 		new_target = damon_new_target();
 		if (!new_target)
 			return -ENOMEM;
-- 
cgit v1.2.3


From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 24 Oct 2025 10:09:02 +0100
Subject: mm/vma: small VMA lock cleanups

We declare vma_start_read() as a static function in mm/mmap_lock.c, so
there is no need to provide a stub for !CONFIG_PER_VMA_LOCK.

__is_vma_write_locked() is declared in a header and should therefore be
static inline.

Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to
make precedence clear.

Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmap_lock.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 2c9fffa58714..e05da70dc0cb 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt)
 	 * a detached vma happens only in vma_mark_detached() and is a rare
 	 * case, therefore most of the time there will be no unnecessary wakeup.
 	 */
-	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+	return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1;
 }
 
 static inline void vma_refcount_put(struct vm_area_struct *vma)
@@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
 }
 
 /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
 {
 	mmap_assert_write_locked(vma->vm_mm);
 
@@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
 	return true;
 }
 static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
-						    struct vm_area_struct *vma)
-		{ return NULL; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 21 Oct 2025 03:56:38 +0100
Subject: mm: make INVALID_PHYS_ADDR a generic macro

INVALID_PHYS_ADDR has very similar definitions across the code base.
Hence just move that inside header <linux/mm.h> for more generic usage.
Also drop the now redundant ones which are no longer required.

Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>	[s390]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/mmu.c                  | 2 --
 arch/s390/boot/vmem.c                | 1 -
 drivers/vdpa/vdpa_user/iova_domain.h | 2 --
 include/linux/mm.h                   | 2 ++
 kernel/dma/swiotlb.c                 | 2 --
 5 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b8d37eb037fc..94e29e3574ff 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -470,8 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 	mutex_unlock(&fixmap_lock);
 }
 
-#define INVALID_PHYS_ADDR	(-1ULL)
-
 static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
 				       enum pgtable_type pgtable_type)
 {
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index cea3de4dce8c..fbe64ffdfb96 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -16,7 +16,6 @@
 #include "decompressor.h"
 #include "boot.h"
 
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 struct ctlreg __bootdata_preserved(s390_invalid_asce);
 
 #ifdef CONFIG_PROC_FS
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
index 775cad5238f3..a923971a64f5 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.h
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -17,8 +17,6 @@
 
 #define IOVA_START_PFN 1
 
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-
 #define BOUNCE_MAP_SHIFT	12
 #define BOUNCE_MAP_SIZE	(1 << BOUNCE_MAP_SHIFT)
 #define BOUNCE_MAP_MASK	(~(BOUNCE_MAP_SIZE - 1))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a6fd9f5aaf30..7bcd9e6fbc3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly;
 # endif
 #endif
 
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
 #include <asm/page.h>
 #include <asm/processor.h>
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 0d37da3d95b6..a547c7693135 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -61,8 +61,6 @@
  */
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
 
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-
 /**
  * struct io_tlb_slot - IO TLB slot descriptor
  * @orig_addr:	The original address corresponding to a mapped entry.
-- 
cgit v1.2.3


From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 28 Oct 2025 11:43:07 +0800
Subject: mm/swap: do not choose swap device according to numa node

Patch series "mm/swapfile.c: select swap devices of default priority round
robin", v5.

Currently, on system with multiple swap devices, swap allocation will
select one swap device according to priority.  The swap device with the
highest priority will be chosen to allocate firstly.

People can specify a priority from 0 to 32767 when swapon a swap device,
or the system will set it from -2 then downwards by default.  Meanwhile,
on NUMA system, the swap device with node_id will be considered first on
that NUMA node of the node_id.

In the current code, an array of plist, swap_avail_heads[nid], is used to
organize swap devices on each NUMA node.  For each NUMA node, there is a
plist organizing all swap devices.  The 'prio' value in the plist is the
negated value of the device's priority due to plist being sorted from low
to high.  The swap device owning one node_id will be promoted to the front
position on that NUMA node, then other swap devices are put in order of
their default priority.

E.g. I have a system with 8 NUMA nodes, and I set up 4 zram partitions as
swap devices.

Current behaviour:
Their priorities will be (note that -1 is skipped):
NAME       TYPE      SIZE USED PRIO
/dev/zram0 partition  16G   0B   -2
/dev/zram1 partition  16G   0B   -3
/dev/zram2 partition  16G   0B   -4
/dev/zram3 partition  16G   0B   -5

And their positions in the 8 swap_avail_lists[nid] will be:
swap_avail_lists[0]: /* node 0's available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:1     prio:3     prio:4     prio:5
swap_avail_lists[1]: /* node 1's available swap device list */
zram1   -> zram0   -> zram2   -> zram3
prio:1     prio:2     prio:4     prio:5
swap_avail_lists[2]: /* node 2's available swap device list */
zram2   -> zram0   -> zram1   -> zram3
prio:1     prio:2     prio:3     prio:5
swap_avail_lists[3]: /* node 3's available swap device list */
zram3   -> zram0   -> zram1   -> zram2
prio:1     prio:2     prio:3     prio:4
swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:2     prio:3     prio:4     prio:5

The adjustment for swap device with node_id intended to decrease the
pressure of lock contention for one swap device by taking different swap
device on different node.  The adjustment was introduced in commit
a2468cc9bfdf ("swap: choose swap device according to numa node").
However, the adjustment is a little coarse-grained.  On the node, the swap
device sharing the node's id will always be selected firstly by node's
CPUs until exhausted, then next one.  And on other nodes where no swap
device shares its node id, swap device with priority '-2' will be selected
firstly until exhausted, then next with priority '-3'.

This is the swapon output while a high pressure vm-scalability test is
running.  It clearly shows that zram0 is heavily used until it is
exhausted.

===================================
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE  USED PRIO
/dev/zram0 partition  16G 15.7G   -2
/dev/zram1 partition  16G  3.4G   -3
/dev/zram2 partition  16G  3.4G   -4
/dev/zram3 partition  16G  2.6G   -5

The node based strategy for selecting a swap device is much better than
the old way of selecting swap devices one by one.  However it is still
unreasonable, because swap devices are assumed to have similar access
speed if no priority is specified at swapon.  It's unfair and doesn't
make sense that one swap device gets a higher priority than another just
because it was swapped on first.

So in this patchset, a change is made to select swap devices round robin
if they have the default priority.  In code, the plist array
swap_avail_heads[nid] is replaced with a single plist swap_avail_head,
which reverts commit a2468cc9bfdf.  Meanwhile, on top of the revert, a
further change is made so that any device without a specified priority
gets the same default priority '-1'.  Swap devices with a specified
priority are still always put foremost; this is not impacted.  If you
care about their different access speeds, then use 'swapon -p xx' to
assign priorities to your swap devices.

New behaviour:

swap_avail_list: /* one global available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:1     prio:1     prio:1     prio:1

This is the swapon output while the high pressure vm-scalability test is
running; all devices are selected round robin:
=======================================
[root@hp-dl385g10-03 linux]# swapon
NAME       TYPE      SIZE  USED PRIO
/dev/zram0 partition  16G 12.6G   -1
/dev/zram1 partition  16G 12.6G   -1
/dev/zram2 partition  16G 12.6G   -1
/dev/zram3 partition  16G 12.6G   -1

With the change, we can see about an 18% efficiency improvement, as shown below:

vm-scalability test:
====================
Test with:
usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap)
                           Before:          After:
System time:               637.92 s         526.74 s      (lower is better)
Sum Throughput:            3546.56 MB/s     4207.56 MB/s  (higher is better)
Single process Throughput: 114.40 MB/s      135.72 MB/s   (higher is better)
free latency:              10138455.99 us   6810119.01 us (lower is better)


This patch (of 2):

This reverts commit a2468cc9bfdf ("swap: choose swap device according to
numa node").

After this patch, the behaviour will change back to pre-commit
a2468cc9bfdf.  This means the priority will be set from -1 then downwards
by default, and when swapping, it will exhaust swap devices one by one
according to priority from high to low.  This is preparation work for a
later change.

[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE   USED PRIO
/dev/zram0 partition  16G    16G   -1
/dev/zram1 partition  16G 966.2M   -2
/dev/zram2 partition  16G     0B   -3
/dev/zram3 partition  16G     0B   -4

Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Kairui Song <kasong@tencent.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/index.rst     |  1 -
 Documentation/admin-guide/mm/swap_numa.rst | 78 -----------------------------
 include/linux/swap.h                       | 11 +---
 mm/swapfile.c                              | 80 ++++++------------------------
 4 files changed, 15 insertions(+), 155 deletions(-)
 delete mode 100644 Documentation/admin-guide/mm/swap_numa.rst

(limited to 'include')

diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index ebc83ca20fdc..bbb563cba5d2 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -39,7 +39,6 @@ the Linux memory management.
    shrinker_debugfs
    slab
    soft-dirty
-   swap_numa
    transhuge
    userfaultfd
    zswap
diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst
deleted file mode 100644
index 2e630627bcee..000000000000
--- a/Documentation/admin-guide/mm/swap_numa.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-===========================================
-Automatically bind swap device to numa node
-===========================================
-
-If the system has more than one swap device and swap device has the node
-information, we can make use of this information to decide which swap
-device to use in get_swap_pages() to get better performance.
-
-
-How to use this feature
-=======================
-
-Swap device has priority and that decides the order of it to be used. To make
-use of automatically binding, there is no need to manipulate priority settings
-for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
-swapB, with swapA attached to node 0 and swapB attached to node 1, are going
-to be swapped on. Simply swapping them on by doing::
-
-	# swapon /dev/swapA
-	# swapon /dev/swapB
-
-Then node 0 will use the two swap devices in the order of swapA then swapB and
-node 1 will use the two swap devices in the order of swapB then swapA. Note
-that the order of them being swapped on doesn't matter.
-
-A more complex example on a 4 node machine. Assume 6 swap devices are going to
-be swapped on: swapA and swapB are attached to node 0, swapC is attached to
-node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
-The way to swap them on is the same as above::
-
-	# swapon /dev/swapA
-	# swapon /dev/swapB
-	# swapon /dev/swapC
-	# swapon /dev/swapD
-	# swapon /dev/swapE
-	# swapon /dev/swapF
-
-Then node 0 will use them in the order of::
-
-	swapA/swapB -> swapC -> swapD -> swapE -> swapF
-
-swapA and swapB will be used in a round robin mode before any other swap device.
-
-node 1 will use them in the order of::
-
-	swapC -> swapA -> swapB -> swapD -> swapE -> swapF
-
-node 2 will use them in the order of::
-
-	swapD/swapE -> swapA -> swapB -> swapC -> swapF
-
-Similaly, swapD and swapE will be used in a round robin mode before any
-other swap devices.
-
-node 3 will use them in the order of::
-
-	swapF -> swapA -> swapB -> swapC -> swapD -> swapE
-
-
-Implementation details
-======================
-
-The current code uses a priority based list, swap_avail_list, to decide
-which swap device to use and if multiple swap devices share the same
-priority, they are used round robin. This change here replaces the single
-global swap_avail_list with a per-numa-node list, i.e. for each numa node,
-it sees its own priority based list of available swap devices. Swap
-device's priority can be promoted on its matching node's swap_avail_list.
-
-The current swap device's priority is set as: user can set a >=0 value,
-or the system will pick one starting from -1 then downwards. The priority
-value in the swap_avail_list is the negated value of the swap device's
-due to plist being sorted from low to high. The new policy doesn't change
-the semantics for priority >=0 cases, the previous starting from -1 then
-downwards now becomes starting from -2 then downwards and -1 is reserved
-as the promoted value. So if multiple swap devices are attached to the same
-node, they will all be promoted to priority -1 on that node's plist and will
-be used round robin before any other swap devices.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a4b264817735..38ca3df68716 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -301,16 +301,7 @@ struct swap_info_struct {
 	struct work_struct discard_work; /* discard worker */
 	struct work_struct reclaim_work; /* reclaim worker */
 	struct list_head discard_clusters; /* discard clusters list */
-	struct plist_node avail_lists[]; /*
-					   * entries in swap_avail_heads, one
-					   * entry per node.
-					   * Must be last as the number of the
-					   * array is nr_node_ids, which is not
-					   * a fixed value so have to allocate
-					   * dynamically.
-					   * And it has to be an array so that
-					   * plist_for_each_* can work.
-					   */
+	struct plist_node avail_list;   /* entry in swap_avail_head */
 };
 
 static inline swp_entry_t page_swap_entry(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 125d893bb706..ce3580e2f4f4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages;
 EXPORT_SYMBOL_GPL(nr_swap_pages);
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
-static int least_priority = -1;
+static int least_priority;
 unsigned long swapfile_maximum_size;
 #ifdef CONFIG_MIGRATION
 bool swap_migration_ad_supported;
@@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
 static DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -1130,7 +1130,6 @@ done:
 /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
 static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 {
-	int nid;
 	unsigned long pages;
 
 	spin_lock(&swap_avail_lock);
@@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 			goto skip;
 	}
 
-	for_each_node(nid)
-		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+	plist_del(&si->avail_list, &swap_avail_head);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -1169,7 +1167,6 @@ skip:
 /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
 static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 {
-	int nid;
 	long val;
 	unsigned long pages;
 
@@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 			goto skip;
 	}
 
-	for_each_node(nid)
-		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+	plist_add(&si->avail_list, &swap_avail_head);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry,
 static bool swap_alloc_slow(swp_entry_t *entry,
 			    int order)
 {
-	int node;
 	unsigned long offset;
 	struct swap_info_struct *si, *next;
 
-	node = numa_node_id();
 	spin_lock(&swap_avail_lock);
 start_over:
-	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
 		/* Rotate the device and switch to a new cluster */
-		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+		plist_requeue(&si->avail_list, &swap_avail_head);
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
 			offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
@@ -1380,7 +1374,7 @@ start_over:
 		 * still in the swap_avail_head list then try it, otherwise
 		 * start over if we have not gotten any slots.
 		 */
-		if (plist_node_empty(&next->avail_lists[node]))
+		if (plist_node_empty(&si->avail_list))
 			goto start_over;
 	}
 	spin_unlock(&swap_avail_lock);
@@ -1394,11 +1388,10 @@ start_over:
 static bool swap_sync_discard(void)
 {
 	bool ret = false;
-	int nid = numa_node_id();
 	struct swap_info_struct *si, *next;
 
 	spin_lock(&swap_avail_lock);
-	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) {
+	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
 			if (si->flags & SWP_PAGE_DISCARD)
@@ -2709,25 +2702,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	return generic_swapfile_activate(sis, swap_file, span);
 }
 
-static int swap_node(struct swap_info_struct *si)
-{
-	struct block_device *bdev;
-
-	if (si->bdev)
-		bdev = si->bdev;
-	else
-		bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
-	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
 static void setup_swap_info(struct swap_info_struct *si, int prio,
 			    unsigned char *swap_map,
 			    struct swap_cluster_info *cluster_info,
 			    unsigned long *zeromap)
 {
-	int i;
-
 	if (prio >= 0)
 		si->prio = prio;
 	else
@@ -2737,16 +2716,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
 	 * low-to-high, while swap ordering is high-to-low
 	 */
 	si->list.prio = -si->prio;
-	for_each_node(i) {
-		if (si->prio >= 0)
-			si->avail_lists[i].prio = -si->prio;
-		else {
-			if (swap_node(si) == i)
-				si->avail_lists[i].prio = 1;
-			else
-				si->avail_lists[i].prio = -si->prio;
-		}
-	}
+	si->avail_list.prio = -si->prio;
 	si->swap_map = swap_map;
 	si->cluster_info = cluster_info;
 	si->zeromap = zeromap;
@@ -2919,15 +2889,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	del_from_avail_list(p, true);
 	if (p->prio < 0) {
 		struct swap_info_struct *si = p;
-		int nid;
 
 		plist_for_each_entry_continue(si, &swap_active_head, list) {
 			si->prio++;
 			si->list.prio--;
-			for_each_node(nid) {
-				if (si->avail_lists[nid].prio != 1)
-					si->avail_lists[nid].prio--;
-			}
+			si->avail_list.prio--;
 		}
 		least_priority++;
 	}
@@ -3168,9 +3134,8 @@ static struct swap_info_struct *alloc_swap_info(void)
 	struct swap_info_struct *p;
 	struct swap_info_struct *defer = NULL;
 	unsigned int type;
-	int i;
 
-	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+	p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL);
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
@@ -3209,8 +3174,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
-	for_each_node(i)
-		plist_node_init(&p->avail_lists[i], 0);
+	plist_node_init(&p->avail_list, 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	if (defer) {
@@ -3467,9 +3431,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!swap_avail_heads)
-		return -ENOMEM;
-
 	si = alloc_swap_info();
 	if (IS_ERR(si))
 		return PTR_ERR(si);
@@ -4079,7 +4040,6 @@ static bool __has_usable_swap(void)
 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 {
 	struct swap_info_struct *si, *next;
-	int nid = folio_nid(folio);
 
 	if (!(gfp & __GFP_IO))
 		return;
@@ -4098,8 +4058,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 		return;
 
 	spin_lock(&swap_avail_lock);
-	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
-				  avail_lists[nid]) {
+	plist_for_each_entry_safe(si, next, &swap_avail_head,
+				  avail_list) {
 		if (si->bdev) {
 			blkcg_schedule_throttle(si->bdev->bd_disk, true);
 			break;
@@ -4111,18 +4071,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 
 static int __init swapfile_init(void)
 {
-	int nid;
-
-	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
-					 GFP_KERNEL);
-	if (!swap_avail_heads) {
-		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
-		return -ENOMEM;
-	}
-
-	for_each_node(nid)
-		plist_head_init(&swap_avail_heads[nid]);
-
 	swapfile_maximum_size = arch_max_swapfile_size();
 
 	/*
-- 
cgit v1.2.3


From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001
From: Israel Batista <linux@israelbatista.dev.br>
Date: Wed, 29 Oct 2025 19:56:28 +0000
Subject: mm: convert memory block states (MEM_*) macros to enum

Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2.

The MEM_* constants indicating the state of a memory block are currently
defined as macros, meaning their definitions will be omitted from the
debuginfo on most kernel builds.  This makes it harder for debuggers to
correctly map the block state at runtime, which can be quite useful when
analysing errors related to memory hot plugging and unplugging with tools
such as drgn.

Converting the constants to an enum ensures the correct information is
emitted by the compiler and available for the debugger, without needing to
hard-code them into the debugger and track their changes.

This patch series aims to replace the current macros with a newly created
enum named memory_block_state, while also taking advantage of the compile
time guarantees that we get when using enums.

The first patch does the conversion of the macros to an enum, while the
2nd and 3rd patches use this enum to clean up some type declarations and
make sure that only valid values are used.


This patch (of 3):

Converting the MEM_* constants from macros to an enum ensures that their
values will be correctly emitted in the debug symbols, making it easier to
trace the meaning of each value when debugging with tools such as drgn,
without the need to hard-code the values.

Since the values are mutually exclusive and they are not exposed directly
to userspace, I also dropped the misleading pattern (1<<X) that made it
look like they were combinable flags.

Link: https://lkml.kernel.org/r/20251029195617.2210700-1-linux@israelbatista.dev.br
Link: https://lkml.kernel.org/r/20251029195617.2210700-2-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory.h | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0c214256216f..f4e358477c6a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -64,6 +64,18 @@ struct memory_group {
 	};
 };
 
+enum memory_block_state {
+	/* These states are exposed to userspace as text strings in sysfs */
+	MEM_ONLINE,		/* exposed to userspace */
+	MEM_GOING_OFFLINE,	/* exposed to userspace */
+	MEM_OFFLINE,		/* exposed to userspace */
+	MEM_GOING_ONLINE,
+	MEM_CANCEL_ONLINE,
+	MEM_CANCEL_OFFLINE,
+	MEM_PREPARE_ONLINE,
+	MEM_FINISH_OFFLINE,
+};
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
@@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
 unsigned long memory_block_size_bytes(void);
 int set_memory_block_size_order(unsigned int order);
 
-/* These states are exposed to userspace as text strings in sysfs */
-#define	MEM_ONLINE		(1<<0) /* exposed to userspace */
-#define	MEM_GOING_OFFLINE	(1<<1) /* exposed to userspace */
-#define	MEM_OFFLINE		(1<<2) /* exposed to userspace */
-#define	MEM_GOING_ONLINE	(1<<3)
-#define	MEM_CANCEL_ONLINE	(1<<4)
-#define	MEM_CANCEL_OFFLINE	(1<<5)
-#define	MEM_PREPARE_ONLINE	(1<<6)
-#define	MEM_FINISH_OFFLINE	(1<<7)
-
 struct memory_notify {
 	/*
 	 * The altmap_start_pfn and altmap_nr_pages fields are designated for
-- 
cgit v1.2.3


From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001
From: Israel Batista <linux@israelbatista.dev.br>
Date: Wed, 29 Oct 2025 19:56:30 +0000
Subject: mm: change type of state in struct memory_block

The state of a memory block should be restricted to values specified in
the documentation of the memory hotplug API.  However, since the state
field in the memory_block struct was defined as an unsigned long, this
restriction was not enforced at compile time.

With the introduction of the enum memory_block_state, it is now possible
to incorporate the desired semantics in the field declaration and enforce
these restrictions at compile time.

[akpm@linux-foundation.org: fix whitespace, per Randy]
Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/memory.c  | 2 +-
 include/linux/memory.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 6d84a02cfa5d..3d17dd774947 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -198,7 +198,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 		break;
 	default:
 		WARN_ON(1);
-		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
+		return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state);
 	}
 
 	return sysfs_emit(buf, "%s\n", output);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f4e358477c6a..ca20cbdd71f2 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -78,7 +78,7 @@ enum memory_block_state {
 
 struct memory_block {
 	unsigned long start_section_nr;
-	unsigned long state;		/* serialized by the dev->lock */
+	enum memory_block_state state;	/* serialized by the dev->lock */
 	int online_type;		/* for passing data to online routine */
 	int nid;			/* NID for this memory block */
 	/*
-- 
cgit v1.2.3


From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001
From: Israel Batista <linux@israelbatista.dev.br>
Date: Wed, 29 Oct 2025 19:56:32 +0000
Subject: mm: change type of parameter for memory_notify

memory_notify() is responsible for sending events related to memory
hotplugging to a notification queue.  Since all the events must match one
of the values from the enum memory_block_state, it is appropriate to
change the function parameter type to make this condition explicit at
compile time.

Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/memory.c  | 4 ++--
 include/linux/memory.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 3d17dd774947..c03f3b5e5e6f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -204,9 +204,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 	return sysfs_emit(buf, "%s\n", output);
 }
 
-int memory_notify(unsigned long val, void *v)
+int memory_notify(enum memory_block_state state, void *v)
 {
-	return blocking_notifier_call_chain(&memory_chain, val, v);
+	return blocking_notifier_call_chain(&memory_chain, state, v);
 }
 
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ca20cbdd71f2..ca3eb1db6cc8 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb)
 static inline void unregister_memory_notifier(struct notifier_block *nb)
 {
 }
-static inline int memory_notify(unsigned long val, void *v)
+static inline int memory_notify(enum memory_block_state state, void *v)
 {
 	return 0;
 }
@@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
 				struct memory_group *group);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
-extern int memory_notify(unsigned long val, void *v);
+extern int memory_notify(enum memory_block_state state, void *v);
 extern struct memory_block *find_memory_block(unsigned long section_nr);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
-- 
cgit v1.2.3


From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Sun, 2 Nov 2025 18:44:33 +0000
Subject: mm: handle poisoning of pfn without struct pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Poison (or ECC) errors can be very common on a large size cluster.  The
kernel MM currently does not handle ECC errors / poison on a memory region
that is not backed by struct pages.  If a memory region mapped using
remap_pfn_range() for example, but not added to the kernel, MM will not
have associated struct pages.  Add a new mechanism to handle memory
failure on such memory.

Make kernel MM expose a function to allow modules managing the device
memory to register the device memory SPA and the address space associated
with it.  MM maintains this information as an interval tree.  On poison, MM
can search for the range that the poisoned PFN belongs to and use the
address_space to determine the mapping VMA.

In this implementation, kernel MM follows the following sequence that is
largely similar to the memory_failure() handler for struct page backed
memory:

1. memory_failure() is triggered on reception of a poison error.  An
   absence of struct page is detected and consequently
   memory_failure_pfn() is executed.

2. memory_failure_pfn() collects the processes mapped to the PFN.

3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
   faulty PFN using kill_procs().

Note that there is one primary difference versus the handling of the
poison on struct pages, which is to skip unmapping the faulty PFN.
This is done to handle the huge PFNMAP support added recently [1] that
enables VM_PFNMAP vmas to map at PMD or PUD level.  A poison to a PFN
mapped in such a way would require breaking the PMD/PUD mapping into PTEs
that will get mirrored into the S2.  This can greatly increase the cost of
table walks and have a major performance impact.

Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]
Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Cc: Aniket Agashe <aniketa@nvidia.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: Smita Koralahalli Channabasappa <smita.koralahallichannabasappa@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tarun Gupta <targupta@nvidia.com>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                    |   1 +
 include/linux/memory-failure.h |  17 +++++
 include/linux/mm.h             |   1 +
 include/ras/ras_event.h        |   1 +
 mm/Kconfig                     |   1 +
 mm/memory-failure.c            | 145 ++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/memory-failure.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 2625bc3d53d8..5cf6873569d3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11557,6 +11557,7 @@ M:	Miaohe Lin <linmiaohe@huawei.com>
 R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
 L:	linux-mm@kvack.org
 S:	Maintained
+F:	include/linux/memory-failure.h
 F:	mm/hwpoison-inject.c
 F:	mm/memory-failure.c
 
diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
new file mode 100644
index 000000000000..bc326503d2d2
--- /dev/null
+++ b/include/linux/memory-failure.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_FAILURE_H
+#define _LINUX_MEMORY_FAILURE_H
+
+#include <linux/interval_tree.h>
+
+struct pfn_address_space;
+
+struct pfn_address_space {
+	struct interval_tree_node node;
+	struct address_space *mapping;
+};
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space);
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
+
+#endif /* _LINUX_MEMORY_FAILURE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7bcd9e6fbc3c..b636d12bb651 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4285,6 +4285,7 @@ enum mf_action_page_type {
 	MF_MSG_DAX,
 	MF_MSG_UNSPLIT_THP,
 	MF_MSG_ALREADY_POISONED,
+	MF_MSG_PFN_MAP,
 	MF_MSG_UNKNOWN,
 };
 
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c8cd0f00c845..fecfeb7c8be7 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -375,6 +375,7 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DAX, "dax page" )					\
 	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )			\
 	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )		\
+	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
 	EMe ( MF_MSG_UNKNOWN, "unknown page" )
 
 /*
diff --git a/mm/Kconfig b/mm/Kconfig
index eae03b14f7de..d548976d0e0a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -741,6 +741,7 @@ config MEMORY_FAILURE
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
 	select RAS
+	select INTERVAL_TREE
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 560884dd6250..77391b6f9f76 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/memory-failure.h>
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
@@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = {
 	}
 };
 
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
 /*
  * Return values:
  *   1:   the page is dissolved (if needed) and taken off from buddy,
@@ -885,6 +890,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DAX]			= "dax page",
 	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
 	[MF_MSG_ALREADY_POISONED]	= "already poisoned page",
+	[MF_MSG_PFN_MAP]                = "non struct page pfn",
 	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
@@ -1277,7 +1283,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 {
 	trace_memory_failure_event(pfn, type, result);
 
-	if (type != MF_MSG_ALREADY_POISONED) {
+	if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
 		num_poisoned_pages_inc(pfn);
 		update_per_node_mf_stats(pfn, result);
 	}
@@ -2147,6 +2153,135 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
 	kill_procs(&tokill, true, pfn, flags);
 }
 
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	guard(mutex)(&pfn_space_lock);
+
+	if (interval_tree_iter_first(&pfn_space_itree,
+				     pfn_space->node.start,
+				     pfn_space->node.last))
+		return -EBUSY;
+
+	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	guard(mutex)(&pfn_space_lock);
+
+	if (interval_tree_iter_first(&pfn_space_itree,
+				     pfn_space->node.start,
+				     pfn_space->node.last))
+		interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+static void add_to_kill_pfn(struct task_struct *tsk,
+			    struct vm_area_struct *vma,
+			    struct list_head *to_kill,
+			    unsigned long pfn)
+{
+	struct to_kill *tk;
+
+	tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
+	if (!tk) {
+		pr_info("Unable to kill proc %d\n", tsk->pid);
+		return;
+	}
+
+	/* Check for pgoff not backed by struct page */
+	tk->addr = vma_address(vma, pfn, 1);
+	tk->size_shift = PAGE_SHIFT;
+
+	if (tk->addr == -EFAULT)
+		pr_info("Unable to find address %lx in %s\n",
+			pfn, tsk->comm);
+
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Collect processes when the error hit a PFN not backed by struct page.
+ */
+static void collect_procs_pfn(struct address_space *mapping,
+			      unsigned long pfn, struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	rcu_read_lock();
+	for_each_process(tsk) {
+		struct task_struct *t = tsk;
+
+		t = task_early_kill(tsk, true);
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill_pfn(t, vma, to_kill, pfn);
+		}
+	}
+	rcu_read_unlock();
+	i_mmap_unlock_read(mapping);
+}
+
+/**
+ * memory_failure_pfn - Handle memory failure on a page not backed by
+ *                      struct page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * Return:
+ *   0             - success,
+ *   -EBUSY        - Page PFN does not belong to any address space mapping.
+ */
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	struct interval_tree_node *node;
+	LIST_HEAD(tokill);
+
+	scoped_guard(mutex, &pfn_space_lock) {
+		bool mf_handled = false;
+
+		/*
+		 * Modules register with MM the address space mapping to
+		 * the device memory they manage. Iterate to identify
+		 * exactly which address space has mapped to this failing
+		 * PFN.
+		 */
+		for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+		     node = interval_tree_iter_next(node, pfn, pfn)) {
+			struct pfn_address_space *pfn_space =
+				container_of(node, struct pfn_address_space, node);
+
+			collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+
+			mf_handled = true;
+		}
+
+		if (!mf_handled)
+			return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED);
+	}
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	kill_procs(&tokill, true, pfn, flags);
+
+	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
+}
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -2196,6 +2331,14 @@ int memory_failure(unsigned long pfn, int flags)
 		if (res == 0)
 			goto unlock_mutex;
 
+		if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+			/*
+			 * The PFN is not backed by struct page.
+			 */
+			res = memory_failure_pfn(pfn, flags);
+			goto unlock_mutex;
+		}
+
 		if (pfn_valid(pfn)) {
 			pgmap = get_dev_pagemap(pfn);
 			put_ref_page(pfn, flags);
-- 
cgit v1.2.3


From a73d4a055622d0973e371382b16a13f9795ffec7 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Fri, 31 Oct 2025 12:21:31 +0100
Subject: drivers/xen/xenbus: Replace deprecated strcpy in
 xenbus_transaction_end

strcpy() is deprecated; inline the read-only string instead. Fix the
function comment and use bool instead of int while we're at it.

Link: https://github.com/KSPP/linux/issues/88
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20251031112145.103257-2-thorsten.blum@linux.dev>
---
 drivers/xen/xenbus/xenbus_xs.c | 14 ++++----------
 include/xen/xenbus.h           |  2 +-
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 7c6c12925326..bddc714877c1 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -546,18 +546,12 @@ int xenbus_transaction_start(struct xenbus_transaction *t)
 EXPORT_SYMBOL_GPL(xenbus_transaction_start);
 
 /* End a transaction.
- * If abandon is true, transaction is discarded instead of committed.
+ * If abort is true, transaction is discarded instead of committed.
  */
-int xenbus_transaction_end(struct xenbus_transaction t, int abort)
+int xenbus_transaction_end(struct xenbus_transaction t, bool abort)
 {
-	char abortstr[2];
-
-	if (abort)
-		strcpy(abortstr, "F");
-	else
-		strcpy(abortstr, "T");
-
-	return xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
+	return xs_error(xs_single(t, XS_TRANSACTION_END, abort ? "F" : "T",
+				  NULL));
 }
 EXPORT_SYMBOL_GPL(xenbus_transaction_end);
 
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 7dab04cf4a36..c94caf852aea 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -158,7 +158,7 @@ int xenbus_exists(struct xenbus_transaction t,
 		  const char *dir, const char *node);
 int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
 int xenbus_transaction_start(struct xenbus_transaction *t);
-int xenbus_transaction_end(struct xenbus_transaction t, int abort);
+int xenbus_transaction_end(struct xenbus_transaction t, bool abort);
 
 /* Single read and scanf: returns -errno or num scanned if > 0. */
 __scanf(4, 5)
-- 
cgit v1.2.3


From 197b3f3c70d61ff1c7ca24f66d567e06fe8ed3d9 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 12 Nov 2025 14:55:30 +0100
Subject: string: provide strends()

Implement a function for checking if a string ends with a different
string and add its kunit test cases.

Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-1-b51f97b1abd8@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/string.h   | 18 ++++++++++++++++++
 lib/tests/string_kunit.c | 13 +++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index fdd3442c6bcb..929d05d1247c 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -562,4 +562,22 @@ static inline bool strstarts(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 
+/**
+ * strends - Check if a string ends with another string.
+ * @str: NUL-terminated string to check against @suffix
+ * @suffix: NUL-terminated string defining the suffix to look for in @str
+ *
+ * Returns:
+ * True if @str ends with @suffix. False in all other cases.
+ */
+static inline bool strends(const char *str, const char *suffix)
+{
+	unsigned int str_len = strlen(str), suffix_len = strlen(suffix);
+
+	if (str_len < suffix_len)
+		return false;
+
+	return !(strcmp(str + str_len - suffix_len, suffix));
+}
+
 #endif /* _LINUX_STRING_H_ */
diff --git a/lib/tests/string_kunit.c b/lib/tests/string_kunit.c
index 0ed7448a26d3..f9a8e557ba77 100644
--- a/lib/tests/string_kunit.c
+++ b/lib/tests/string_kunit.c
@@ -602,6 +602,18 @@ static void string_test_memtostr(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, dest[7], '\0');
 }
 
+static void string_test_strends(struct kunit *test)
+{
+	KUNIT_EXPECT_TRUE(test, strends("foo-bar", "bar"));
+	KUNIT_EXPECT_TRUE(test, strends("foo-bar", "-bar"));
+	KUNIT_EXPECT_TRUE(test, strends("foobar", "foobar"));
+	KUNIT_EXPECT_TRUE(test, strends("foobar", ""));
+	KUNIT_EXPECT_FALSE(test, strends("bar", "foobar"));
+	KUNIT_EXPECT_FALSE(test, strends("", "foo"));
+	KUNIT_EXPECT_FALSE(test, strends("foobar", "ba"));
+	KUNIT_EXPECT_TRUE(test, strends("", ""));
+}
+
 static struct kunit_case string_test_cases[] = {
 	KUNIT_CASE(string_test_memset16),
 	KUNIT_CASE(string_test_memset32),
@@ -623,6 +635,7 @@ static struct kunit_case string_test_cases[] = {
 	KUNIT_CASE(string_test_strlcat),
 	KUNIT_CASE(string_test_strtomem),
 	KUNIT_CASE(string_test_memtostr),
+	KUNIT_CASE(string_test_strends),
 	{}
 };
 
-- 
cgit v1.2.3


From eb374f764a7012eda28019266a6d9191670c4fa5 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 12 Nov 2025 14:55:35 +0100
Subject: gpio: provide gpiod_is_shared()

Provide an interface allowing consumers to check if a GPIO descriptor
represents a GPIO that can potentially be shared by multiple consumers
at the same time. This is exposed to allow subsystems that already
work around the limitations of the current non-exclusive GPIO handling
in some ways, to gradually convert to relying on the new shared GPIO
feature of GPIOLIB.

Extend the gpiolib-shared module to mark the GPIO shared proxy
descriptors with a flag checked by the new interface.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-6-b51f97b1abd8@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib-shared.c | 18 ++++++++++++++++++
 drivers/gpio/gpiolib.c        | 20 ++++++++++++++++++++
 drivers/gpio/gpiolib.h        |  1 +
 include/linux/gpio/consumer.h |  9 +++++++++
 4 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib-shared.c b/drivers/gpio/gpiolib-shared.c
index 1fcbf53b6eca..fa1d16635ea7 100644
--- a/drivers/gpio/gpiolib-shared.c
+++ b/drivers/gpio/gpiolib-shared.c
@@ -314,6 +314,24 @@ int gpio_device_setup_shared(struct gpio_device *gdev)
 
 	guard(mutex)(&gpio_shared_lock);
 
+	list_for_each_entry(entry, &gpio_shared_list, list) {
+		list_for_each_entry(ref, &entry->refs, list) {
+			if (gdev->dev.parent == &ref->adev.dev) {
+				/*
+				 * This is a shared GPIO proxy. Mark its
+				 * descriptor as such and return here.
+				 */
+				__set_bit(GPIOD_FLAG_SHARED_PROXY,
+					  &gdev->descs[0].flags);
+				return 0;
+			}
+		}
+	}
+
+	/*
+	 * This is not a shared GPIO proxy but it still may be the device
+	 * exposing shared pins. Find them and create the proxy devices.
+	 */
 	list_for_each_entry(entry, &gpio_shared_list, list) {
 		if (!device_match_fwnode(&gdev->dev, entry->fwnode))
 			continue;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 1e4c99179712..678d07dc768c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3990,6 +3990,26 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name)
 }
 EXPORT_SYMBOL_GPL(gpiod_set_consumer_name);
 
+/**
+ * gpiod_is_shared() - check if this GPIO can be shared by multiple consumers
+ * @desc: GPIO to inspect
+ *
+ * Returns:
+ * True if this GPIO can be shared by multiple consumers at once. False if it's
+ * a regular, exclusive GPIO.
+ *
+ * Note:
+ * This function returning true does not mean that this GPIO is currently being
+ * shared. It means the GPIO core has registered the fact that the firmware
+ * configuration indicates that it can be shared by multiple consumers and is
+ * in charge of arbitrating the access.
+ */
+bool gpiod_is_shared(const struct gpio_desc *desc)
+{
+	return test_bit(GPIOD_FLAG_SHARED_PROXY, &desc->flags);
+}
+EXPORT_SYMBOL_GPL(gpiod_is_shared);
+
 /**
  * gpiod_to_irq() - return the IRQ corresponding to a GPIO
  * @desc: gpio whose IRQ will be returned (already requested)
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index 09ac92ed0853..abd870fb4a3b 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -205,6 +205,7 @@ struct gpio_desc {
 #define GPIOD_FLAG_EVENT_CLOCK_REALTIME	18 /* GPIO CDEV reports REALTIME timestamps in events */
 #define GPIOD_FLAG_EVENT_CLOCK_HTE	19 /* GPIO CDEV reports hardware timestamps in events */
 #define GPIOD_FLAG_SHARED		20 /* GPIO is shared by multiple consumers */
+#define GPIOD_FLAG_SHARED_PROXY		21 /* GPIO is a virtual proxy to a physically shared pin. */
 
 	/* Connection label */
 	struct gpio_desc_label __rcu *label;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 00df68c51405..a8acb7c0b5af 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -167,6 +167,8 @@ int gpiod_cansleep(const struct gpio_desc *desc);
 int gpiod_to_irq(const struct gpio_desc *desc);
 int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name);
 
+bool gpiod_is_shared(const struct gpio_desc *desc);
+
 /* Convert between the old gpio_ and new gpiod_ interfaces */
 struct gpio_desc *gpio_to_desc(unsigned gpio);
 int desc_to_gpio(const struct gpio_desc *desc);
@@ -520,6 +522,13 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc,
 	return -EINVAL;
 }
 
+static inline bool gpiod_is_shared(const struct gpio_desc *desc)
+{
+	/* GPIO can never have been requested */
+	WARN_ON(desc);
+	return false;
+}
+
 static inline struct gpio_desc *gpio_to_desc(unsigned gpio)
 {
 	return NULL;
-- 
cgit v1.2.3


From b98994cb9bc24f5c7575c86650f96c384576fdfa Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Mon, 17 Nov 2025 02:54:19 +0000
Subject: mtd: spinand: esmt: add support for F50L1G41LC

This adds support for ESMT F50L1G41LC, which appears to be an updated
version of the already supported F50L1G41LB.
Add esmt_8c SPI_NAND manufacturer to account for the newly used vendor
ID with support for the ESMT F50L1G41LC chip.

Link: https://github.com/openwrt/openwrt/pull/15214#issuecomment-3514824435
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c |  1 +
 drivers/mtd/nand/spi/esmt.c | 24 ++++++++++++++++++++++++
 include/linux/mtd/spinand.h |  1 +
 3 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index f92133b8e1a6..d207286572d8 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -1227,6 +1227,7 @@ static const struct nand_ops spinand_ops = {
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&alliancememory_spinand_manufacturer,
 	&ato_spinand_manufacturer,
+	&esmt_8c_spinand_manufacturer,
 	&esmt_c8_spinand_manufacturer,
 	&fmsh_spinand_manufacturer,
 	&foresee_spinand_manufacturer,
diff --git a/drivers/mtd/nand/spi/esmt.c b/drivers/mtd/nand/spi/esmt.c
index 9a9325c0bc49..e60e4ac1fd6f 100644
--- a/drivers/mtd/nand/spi/esmt.c
+++ b/drivers/mtd/nand/spi/esmt.c
@@ -12,6 +12,7 @@
 
 /* ESMT uses GigaDevice 0xc8 JECDEC ID on some SPI NANDs */
 #define SPINAND_MFR_ESMT_C8			0xc8
+#define SPINAND_MFR_ESMT_8C			0x8c
 
 #define ESMT_F50L1G41LB_CFG_OTP_PROTECT		BIT(7)
 #define ESMT_F50L1G41LB_CFG_OTP_LOCK		\
@@ -184,6 +185,21 @@ static const struct spinand_fact_otp_ops f50l1g41lb_fact_otp_ops = {
 	.read = spinand_fact_otp_read,
 };
 
+
+static const struct spinand_info esmt_8c_spinand_table[] = {
+	SPINAND_INFO("F50L1G41LC",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x2C),
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&f50l1g41lb_ooblayout, NULL),
+		     SPINAND_USER_OTP_INFO(28, 2, &f50l1g41lb_user_otp_ops),
+		     SPINAND_FACT_OTP_INFO(2, 0, &f50l1g41lb_fact_otp_ops)),
+};
+
 static const struct spinand_info esmt_c8_spinand_table[] = {
 	SPINAND_INFO("F50L1G41LB",
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x01, 0x7f,
@@ -224,6 +240,14 @@ static const struct spinand_info esmt_c8_spinand_table[] = {
 static const struct spinand_manufacturer_ops esmt_spinand_manuf_ops = {
 };
 
+const struct spinand_manufacturer esmt_8c_spinand_manufacturer = {
+	.id = SPINAND_MFR_ESMT_8C,
+	.name = "ESMT",
+	.chips = esmt_8c_spinand_table,
+	.nchips = ARRAY_SIZE(esmt_8c_spinand_table),
+	.ops = &esmt_spinand_manuf_ops,
+};
+
 const struct spinand_manufacturer esmt_c8_spinand_manufacturer = {
 	.id = SPINAND_MFR_ESMT_C8,
 	.name = "ESMT",
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 927c10d78769..ce76f5c632e1 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -354,6 +354,7 @@ struct spinand_manufacturer {
 /* SPI NAND manufacturers */
 extern const struct spinand_manufacturer alliancememory_spinand_manufacturer;
 extern const struct spinand_manufacturer ato_spinand_manufacturer;
+extern const struct spinand_manufacturer esmt_8c_spinand_manufacturer;
 extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer;
 extern const struct spinand_manufacturer fmsh_spinand_manufacturer;
 extern const struct spinand_manufacturer foresee_spinand_manufacturer;
-- 
cgit v1.2.3


From e678c2a0063ec931642b3c5935fb0c3c1282b6b3 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Tue, 4 Nov 2025 14:16:44 +0200
Subject: PCI: Add Intel Nova Lake S audio Device ID

Add Nova Lake S (NVL-S) audio Device ID.

The ID will be used by HDA legacy, SOF audio stack and the driver
to determine which audio stack should be used (intel-dsp-config).

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251104121650.21872-2-peter.ujfalusi@linux.intel.com
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 92ffc4373f6d..a9a089566b7c 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -3075,6 +3075,7 @@
 #define PCI_DEVICE_ID_INTEL_5100_22	0x65f6
 #define PCI_DEVICE_ID_INTEL_IOAT_SCNB	0x65ff
 #define PCI_DEVICE_ID_INTEL_HDA_FCL	0x67a8
+#define PCI_DEVICE_ID_INTEL_HDA_NVL_S	0x6e50
 #define PCI_DEVICE_ID_INTEL_82371SB_0	0x7000
 #define PCI_DEVICE_ID_INTEL_82371SB_1	0x7010
 #define PCI_DEVICE_ID_INTEL_82371SB_2	0x7020
-- 
cgit v1.2.3


From 2bd7bf3ccc83074dbaf53c941539732652451b09 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Tue, 4 Nov 2025 14:16:46 +0200
Subject: ASoC: Intel: soc-acpi: add NVL match tables

For now the tables are basic and only cover mockup devices.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251104121650.21872-4-peter.ujfalusi@linux.intel.com
---
 include/sound/soc-acpi-intel-match.h              |  2 ++
 sound/soc/intel/common/Makefile                   |  1 +
 sound/soc/intel/common/soc-acpi-intel-nvl-match.c | 41 +++++++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 sound/soc/intel/common/soc-acpi-intel-nvl-match.c

(limited to 'include')

diff --git a/include/sound/soc-acpi-intel-match.h b/include/sound/soc-acpi-intel-match.h
index daed7123df9d..382029724e85 100644
--- a/include/sound/soc-acpi-intel-match.h
+++ b/include/sound/soc-acpi-intel-match.h
@@ -34,6 +34,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_lnl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_machines[];
+extern struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_machines[];
 
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cnl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cfl_sdw_machines[];
@@ -46,6 +47,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_lnl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[];
 extern struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_sdw_machines[];
+extern struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_sdw_machines[];
 
 /*
  * generic table used for HDA codec-based platforms, possibly with
diff --git a/sound/soc/intel/common/Makefile b/sound/soc/intel/common/Makefile
index 7822bcae6c69..dbfd9e2ac015 100644
--- a/sound/soc/intel/common/Makefile
+++ b/sound/soc/intel/common/Makefile
@@ -11,6 +11,7 @@ snd-soc-acpi-intel-match-y := soc-acpi-intel-byt-match.o soc-acpi-intel-cht-matc
 	soc-acpi-intel-arl-match.o \
 	soc-acpi-intel-lnl-match.o \
 	soc-acpi-intel-ptl-match.o \
+	soc-acpi-intel-nvl-match.o \
 	soc-acpi-intel-hda-match.o \
 	soc-acpi-intel-sdw-mockup-match.o sof-function-topology-lib.o
 
diff --git a/sound/soc/intel/common/soc-acpi-intel-nvl-match.c b/sound/soc/intel/common/soc-acpi-intel-nvl-match.c
new file mode 100644
index 000000000000..b8695d47e55b
--- /dev/null
+++ b/sound/soc/intel/common/soc-acpi-intel-nvl-match.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * soc-acpi-intel-nvl-match.c - tables and support for NVL ACPI enumeration.
+ *
+ * Copyright (c) 2025, Intel Corporation.
+ *
+ */
+
+#include <sound/soc-acpi.h>
+#include <sound/soc-acpi-intel-match.h>
+#include "soc-acpi-intel-sdw-mockup-match.h"
+
+struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_machines[] = {
+	{},
+};
+EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_nvl_machines);
+
+/* this table is used when there is no I2S codec present */
+struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_sdw_machines[] = {
+	/* mockup tests need to be first */
+	{
+		.link_mask = GENMASK(3, 0),
+		.links = sdw_mockup_headset_2amps_mic,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-nvl-rt711-rt1308-rt715.tplg",
+	},
+	{
+		.link_mask = BIT(0) | BIT(1) | BIT(3),
+		.links = sdw_mockup_headset_1amp_mic,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-nvl-rt711-rt1308-mono-rt715.tplg",
+	},
+	{
+		.link_mask = GENMASK(2, 0),
+		.links = sdw_mockup_mic_headset_1amp,
+		.drv_name = "sof_sdw",
+		.sof_tplg_filename = "sof-nvl-rt715-rt711-rt1308-mono.tplg",
+	},
+	{},
+};
+EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_nvl_sdw_machines);
-- 
cgit v1.2.3


From 33cf66d88306663d16e4759e9d24766b0aaa2e17 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 7 Nov 2025 17:01:31 +0100
Subject: sched/fair: Proportional newidle balance

Add a randomized algorithm that runs newidle balancing proportional to
its success rate.

This improves schbench significantly:

 6.18-rc4:			2.22 Mrps/s
 6.18-rc4+revert:		2.04 Mrps/s
 6.18-rc4+revert+random:	2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

 6.17:			-6%
 6.17+revert:		 0%
 6.17+revert+random:	-1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
---
 include/linux/sched/topology.h |  3 +++
 kernel/sched/core.c            |  3 +++
 kernel/sched/fair.c            | 44 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/features.h        |  5 +++++
 kernel/sched/sched.h           |  7 +++++++
 kernel/sched/topology.c        |  6 ++++++
 6 files changed, 64 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf12aa6e..45c0022b91ce 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
 	/* idle_balance() stats */
+	unsigned int newidle_call;
+	unsigned int newidle_success;
+	unsigned int newidle_ratio;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 699db3f46df6..9f10cfbdc228 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
 {
 	sched_init_numa(NUMA_NO_NODE);
 
+	prandom_init_once(&sched_rnd_state);
+
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index abcbb67dd785..1855975b8248 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12224,11 +12224,27 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+	sd->newidle_call++;
+	sd->newidle_success += success;
+
+	if (sd->newidle_call >= 1024) {
+		sd->newidle_ratio = sd->newidle_success;
+		sd->newidle_call /= 2;
+		sd->newidle_success /= 2;
+	}
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
 {
 	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
 	unsigned long now = jiffies;
 
+	if (cost)
+		update_newidle_stats(sd, success);
+
 	if (cost > sd->max_newidle_lb_cost) {
 		/*
 		 * Track max cost of a domain to make sure to not delay the
@@ -12276,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains.
 		 */
-		need_decay = update_newidle_cost(sd, 0);
+		need_decay = update_newidle_cost(sd, 0, 0);
 		max_cost += sd->max_newidle_lb_cost;
 
 		/*
@@ -12912,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			unsigned int weight = 1;
+
+			if (sched_feat(NI_RANDOM)) {
+				/*
+				 * Throw a 1k sided dice; and only run
+				 * newidle_balance according to the success
+				 * rate.
+				 */
+				u32 d1k = sched_rng() % 1024;
+				weight = 1 + sd->newidle_ratio;
+				if (d1k > weight) {
+					update_newidle_stats(sd, 0);
+					continue;
+				}
+				weight = (1024 + weight/2) / weight;
+			}
 
 			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
@@ -12919,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 
 			t1 = sched_clock_cpu(this_cpu);
 			domain_cost = t1 - t0;
-			update_newidle_cost(sd, domain_cost);
-
 			curr_cost += domain_cost;
 			t0 = t1;
+
+			/*
+			 * Track max cost of a domain to make sure to not delay the
+			 * next wakeup on the CPU.
+			 */
+			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
 		}
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0607def744af..980d92bab8ab 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index def9ab7b59d4..b419a4d98461 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
+#include <linux/prandom.h>
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
 }
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 711076aa4980..cf643a5ddedd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+
+		/* 50% success rate */
+		.newidle_call		= 512,
+		.newidle_success	= 256,
+		.newidle_ratio		= 512,
+
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
-- 
cgit v1.2.3


From 96498e804cb6629e02747336a0a33e4955449732 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 17 Nov 2025 17:12:47 +0100
Subject: spi: davinci: remove platform data header

There are no longer any board files including the DaVinci SPI platform
data header. Let's move the bits and pieces that are used in the driver
into the driver .c file itself and remove the header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://patch.msgid.link/20251117-davinci-spi-v2-1-cd799d17f04a@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-davinci.c                 | 64 ++++++++++++++++++++++++++-
 include/linux/platform_data/spi-davinci.h | 73 -------------------------------
 2 files changed, 62 insertions(+), 75 deletions(-)
 delete mode 100644 include/linux/platform_data/spi-davinci.h

(limited to 'include')

diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c
index a29934422356..21a14e800eed 100644
--- a/drivers/spi/spi-davinci.c
+++ b/drivers/spi/spi-davinci.c
@@ -9,6 +9,7 @@
 #include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/platform_data/edma.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
 #include <linux/clk.h>
@@ -19,8 +20,6 @@
 #include <linux/spi/spi_bitbang.h>
 #include <linux/slab.h>
 
-#include <linux/platform_data/spi-davinci.h>
-
 #define CS_DEFAULT	0xFF
 
 #define SPIFMT_PHASE_MASK	BIT(16)
@@ -98,8 +97,69 @@
 #define SPIDEF		0x4c
 #define SPIFMT0		0x50
 
+#define SPI_IO_TYPE_POLL	1
+#define SPI_IO_TYPE_DMA		2
+
 #define DMA_MIN_BYTES	16
 
+enum {
+	SPI_VERSION_1, /* For DM355/DM365/DM6467 */
+	SPI_VERSION_2, /* For DA8xx */
+};
+
+/**
+ * struct davinci_spi_platform_data - Platform data for SPI master device on DaVinci
+ *
+ * @version:	version of the SPI IP. Different DaVinci devices have slightly
+ *		varying versions of the same IP.
+ * @num_chipselect: number of chipselects supported by this SPI master
+ * @intr_line:	interrupt line used to connect the SPI IP to the ARM interrupt
+ *		controller within the SoC. Possible values are 0 and 1.
+ * @prescaler_limit: max clock prescaler value
+ * @cshold_bug:	set this to true if the SPI controller on your chip requires
+ *		a write to CSHOLD bit in between transfers (like in DM355).
+ * @dma_event_q: DMA event queue to use if SPI_IO_TYPE_DMA is used for any
+ *		device on the bus.
+ */
+struct davinci_spi_platform_data {
+	u8			version;
+	u8			num_chipselect;
+	u8			intr_line;
+	u8			prescaler_limit;
+	bool			cshold_bug;
+	enum dma_event_q	dma_event_q;
+};
+
+/**
+ * struct davinci_spi_config - Per-chip-select configuration for SPI slave devices
+ *
+ * @wdelay:	amount of delay between transmissions. Measured in number of
+ *		SPI module clocks.
+ * @odd_parity:	polarity of parity flag at the end of transmit data stream.
+ *		0 - odd parity, 1 - even parity.
+ * @parity_enable: enable transmission of parity at end of each transmit
+ *		data stream.
+ * @io_type:	type of IO transfer. Choose between polled, interrupt and DMA.
+ * @timer_disable: disable chip-select timers (setup and hold)
+ * @c2tdelay:	chip-select setup time. Measured in number of SPI module clocks.
+ * @t2cdelay:	chip-select hold time. Measured in number of SPI module clocks.
+ * @t2edelay:	transmit data finished to SPI ENAn pin inactive time. Measured
+ *		in number of SPI clocks.
+ * @c2edelay:	chip-select active to SPI ENAn signal active time. Measured in
+ *		number of SPI clocks.
+ */
+struct davinci_spi_config {
+	u8	wdelay;
+	u8	odd_parity;
+	u8	parity_enable;
+	u8	io_type;
+	u8	timer_disable;
+	u8	c2tdelay;
+	u8	t2cdelay;
+	u8	t2edelay;
+	u8	c2edelay;
+};
+
 /* SPI Controller driver's private data. */
 struct davinci_spi {
 	struct spi_bitbang	bitbang;
diff --git a/include/linux/platform_data/spi-davinci.h b/include/linux/platform_data/spi-davinci.h
deleted file mode 100644
index 2cb5cc70fd9d..000000000000
--- a/include/linux/platform_data/spi-davinci.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright 2009 Texas Instruments.
- */
-
-#ifndef __ARCH_ARM_DAVINCI_SPI_H
-#define __ARCH_ARM_DAVINCI_SPI_H
-
-#include <linux/platform_data/edma.h>
-
-#define SPI_INTERN_CS	0xFF
-
-enum {
-	SPI_VERSION_1, /* For DM355/DM365/DM6467 */
-	SPI_VERSION_2, /* For DA8xx */
-};
-
-/**
- * davinci_spi_platform_data - Platform data for SPI master device on DaVinci
- *
- * @version:	version of the SPI IP. Different DaVinci devices have slightly
- *		varying versions of the same IP.
- * @num_chipselect: number of chipselects supported by this SPI master
- * @intr_line:	interrupt line used to connect the SPI IP to the ARM interrupt
- *		controller withn the SoC. Possible values are 0 and 1.
- * @cshold_bug:	set this to true if the SPI controller on your chip requires
- *		a write to CSHOLD bit in between transfers (like in DM355).
- * @dma_event_q: DMA event queue to use if SPI_IO_TYPE_DMA is used for any
- *		device on the bus.
- */
-struct davinci_spi_platform_data {
-	u8			version;
-	u8			num_chipselect;
-	u8			intr_line;
-	u8			prescaler_limit;
-	bool			cshold_bug;
-	enum dma_event_q	dma_event_q;
-};
-
-/**
- * davinci_spi_config - Per-chip-select configuration for SPI slave devices
- *
- * @wdelay:	amount of delay between transmissions. Measured in number of
- *		SPI module clocks.
- * @odd_parity:	polarity of parity flag at the end of transmit data stream.
- *		0 - odd parity, 1 - even parity.
- * @parity_enable: enable transmission of parity at end of each transmit
- *		data stream.
- * @io_type:	type of IO transfer. Choose between polled, interrupt and DMA.
- * @timer_disable: disable chip-select timers (setup and hold)
- * @c2tdelay:	chip-select setup time. Measured in number of SPI module clocks.
- * @t2cdelay:	chip-select hold time. Measured in number of SPI module clocks.
- * @t2edelay:	transmit data finished to SPI ENAn pin inactive time. Measured
- *		in number of SPI clocks.
- * @c2edelay:	chip-select active to SPI ENAn signal active time. Measured in
- *		number of SPI clocks.
- */
-struct davinci_spi_config {
-	u8	wdelay;
-	u8	odd_parity;
-	u8	parity_enable;
-#define SPI_IO_TYPE_INTR	0
-#define SPI_IO_TYPE_POLL	1
-#define SPI_IO_TYPE_DMA		2
-	u8	io_type;
-	u8	timer_disable;
-	u8	c2tdelay;
-	u8	t2cdelay;
-	u8	t2edelay;
-	u8	c2edelay;
-};
-
-#endif	/* __ARCH_ARM_DAVINCI_SPI_H */
-- 
cgit v1.2.3


From f49ae86483c494ddc793d889f6df5ea68d138569 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 17 Nov 2025 10:47:54 +0000
Subject: memregion: Drop unused IORES_DESC_* parameter from
 cpu_cache_invalidate_memregion()

The res_desc parameter was originally introduced for documentation purposes
and with the idea that, with HDM-DB, CXL invalidation could be triggered
from the device. That has not come to pass, and the continued existence of
the option is confusing when we add a range in the following patch which
might not be a strict subset of the res_desc. So avoid that confusion by
dropping the parameter.

Link: https://lore.kernel.org/linux-mm/686eedb25ed02_24471002e@dwillia2-xfh.jf.intel.com.notmuch/
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 arch/x86/mm/pat/set_memory.c | 2 +-
 drivers/cxl/core/region.c    | 2 +-
 drivers/nvdimm/region.c      | 2 +-
 drivers/nvdimm/region_devs.c | 2 +-
 include/linux/memregion.h    | 7 +++----
 5 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 8834c76f91c9..4019b17fb65e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void)
 }
 EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
 
-int cpu_cache_invalidate_memregion(int res_desc)
+int cpu_cache_invalidate_memregion(void)
 {
 	if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
 		return -ENXIO;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 71cc42d05248..d7fa76810f82 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -228,7 +228,7 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
 		return -ENXIO;
 	}
 
-	cpu_cache_invalidate_memregion(IORES_DESC_CXL);
+	cpu_cache_invalidate_memregion();
 	return 0;
 }
 
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 88dc062af5f8..c43506448edf 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev)
 	 * here is ok.
 	 */
 	if (cpu_cache_has_invalidate_memregion())
-		cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
+		cpu_cache_invalidate_memregion();
 }
 
 static int child_notify(struct device *dev, void *data)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index de1ee5ebc851..3cdd93d40997 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region)
 		}
 	}
 
-	cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
+	cpu_cache_invalidate_memregion();
 out:
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index c01321467789..945646bde825 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -26,8 +26,7 @@ static inline void memregion_free(int id)
 
 /**
  * cpu_cache_invalidate_memregion - drop any CPU cached data for
- *     memregions described by @res_desc
- * @res_desc: one of the IORES_DESC_* types
+ *     memregion
  *
  * Perform cache maintenance after a memory event / operation that
  * changes the contents of physical memory in a cache-incoherent manner.
@@ -46,7 +45,7 @@ static inline void memregion_free(int id)
  * the cache maintenance.
  */
 #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
-int cpu_cache_invalidate_memregion(int res_desc);
+int cpu_cache_invalidate_memregion(void);
 bool cpu_cache_has_invalidate_memregion(void);
 #else
 static inline bool cpu_cache_has_invalidate_memregion(void)
@@ -54,7 +53,7 @@ static inline bool cpu_cache_has_invalidate_memregion(void)
 	return false;
 }
 
-static inline int cpu_cache_invalidate_memregion(int res_desc)
+static inline int cpu_cache_invalidate_memregion(void)
 {
 	WARN_ON_ONCE("CPU cache invalidation required");
 	return -ENXIO;
-- 
cgit v1.2.3


From b43652d867cf2a5f31b14e3d9a320ad01fca0992 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Mon, 17 Nov 2025 10:47:55 +0000
Subject: memregion: Support fine grained invalidate by
 cpu_cache_invalidate_memregion()

Extend cpu_cache_invalidate_memregion() to support invalidating a
particular range of memory by introducing start and length parameters.
Control of types of invalidation is left for when use cases turn up. For
now everything is Clean and Invalidate.

Where the range is unknown, use the provided cpu_cache_invalidate_all()
helper to act as documentation of intent in a fashion that is clearer than
passing (0, -1) to cpu_cache_invalidate_memregion().

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 arch/x86/mm/pat/set_memory.c |  2 +-
 drivers/cxl/core/region.c    |  5 ++++-
 drivers/nvdimm/region.c      |  2 +-
 drivers/nvdimm/region_devs.c |  2 +-
 include/linux/memregion.h    | 13 +++++++++++--
 5 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 4019b17fb65e..292c7202faed 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void)
 }
 EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
 
-int cpu_cache_invalidate_memregion(void)
+int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len)
 {
 	if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
 		return -ENXIO;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index d7fa76810f82..410e41cef5d3 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -228,7 +228,10 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
 		return -ENXIO;
 	}
 
-	cpu_cache_invalidate_memregion();
+	if (!cxlr->params.res)
+		return -ENXIO;
+	cpu_cache_invalidate_memregion(cxlr->params.res->start,
+				       resource_size(cxlr->params.res));
 	return 0;
 }
 
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index c43506448edf..42e982db5b04 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev)
 	 * here is ok.
 	 */
 	if (cpu_cache_has_invalidate_memregion())
-		cpu_cache_invalidate_memregion();
+		cpu_cache_invalidate_all();
 }
 
 static int child_notify(struct device *dev, void *data)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 3cdd93d40997..e27fc380f6c0 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region)
 		}
 	}
 
-	cpu_cache_invalidate_memregion();
+	cpu_cache_invalidate_all();
 out:
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index 945646bde825..a55f62cc5266 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -27,6 +27,9 @@ static inline void memregion_free(int id)
 /**
  * cpu_cache_invalidate_memregion - drop any CPU cached data for
  *     memregion
+ * @start: start physical address of the target memory region.
+ * @len: length of the target memory region. -1 for all the regions of
+ *       the target type.
  *
  * Perform cache maintenance after a memory event / operation that
  * changes the contents of physical memory in a cache-incoherent manner.
@@ -45,7 +48,7 @@ static inline void memregion_free(int id)
  * the cache maintenance.
  */
 #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
-int cpu_cache_invalidate_memregion(void);
+int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len);
 bool cpu_cache_has_invalidate_memregion(void);
 #else
 static inline bool cpu_cache_has_invalidate_memregion(void)
@@ -53,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void)
 	return false;
 }
 
-static inline int cpu_cache_invalidate_memregion(void)
+static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len)
 {
 	WARN_ON_ONCE("CPU cache invalidation required");
 	return -ENXIO;
 }
 #endif
+
+static inline int cpu_cache_invalidate_all(void)
+{
+	return cpu_cache_invalidate_memregion(0, -1);
+}
+
 #endif /* _MEMREGION_H_ */
-- 
cgit v1.2.3


From e275d9091c01b3b46f3ec534ce4ac77cffc9e3ae Mon Sep 17 00:00:00 2001
From: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Date: Fri, 14 Nov 2025 03:43:18 -0800
Subject: net: mana: Move hardware counter stats from per-port to per-VF
 context

Move hardware counter (HC) statistics from mana_port_context to
mana_context to enable sharing stats across multiple network ports
on the same MANA VF. Previously, each network port queried
hardware counters independently using MANA_QUERY_GF_STAT command
(GF = Generic Function stats from GDMA hardware), resulting in
redundant queries when multiple ports existed on the same device.

Isolate hardware counter stats by introducing mana_ethtool_hc_stats
in mana_context and update the code to ensure all stats are properly
reported via ethtool -S <interface>, maintaining consistency with
previous behavior.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763120599-6331-2-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 67 ++++++++---------
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 85 ++++++++++++----------
 include/net/mana/mana.h                            | 14 ++--
 3 files changed, 90 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index cccd5b63cee6..d8ce4402c696 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2809,11 +2809,12 @@ int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
 	return 0;
 }
 
-void mana_query_gf_stats(struct mana_port_context *apc)
+void mana_query_gf_stats(struct mana_context *ac)
 {
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct mana_query_gf_stat_resp resp = {};
 	struct mana_query_gf_stat_req req = {};
-	struct net_device *ndev = apc->ndev;
+	struct device *dev = gc->dev;
 	int err;
 
 	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_GF_STAT,
@@ -2847,52 +2848,52 @@ void mana_query_gf_stats(struct mana_port_context *apc)
 			STATISTICS_FLAGS_HC_TX_BCAST_BYTES |
 			STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR;
 
-	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+	err = mana_send_request(ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(ndev, "Failed to query GF stats: %d\n", err);
+		dev_err(dev, "Failed to query GF stats: %d\n", err);
 		return;
 	}
 	err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_GF_STAT,
 				   sizeof(resp));
 	if (err || resp.hdr.status) {
-		netdev_err(ndev, "Failed to query GF stats: %d, 0x%x\n", err,
-			   resp.hdr.status);
+		dev_err(dev, "Failed to query GF stats: %d, 0x%x\n", err,
+			resp.hdr.status);
 		return;
 	}
 
-	apc->eth_stats.hc_rx_discards_no_wqe = resp.rx_discards_nowqe;
-	apc->eth_stats.hc_rx_err_vport_disabled = resp.rx_err_vport_disabled;
-	apc->eth_stats.hc_rx_bytes = resp.hc_rx_bytes;
-	apc->eth_stats.hc_rx_ucast_pkts = resp.hc_rx_ucast_pkts;
-	apc->eth_stats.hc_rx_ucast_bytes = resp.hc_rx_ucast_bytes;
-	apc->eth_stats.hc_rx_bcast_pkts = resp.hc_rx_bcast_pkts;
-	apc->eth_stats.hc_rx_bcast_bytes = resp.hc_rx_bcast_bytes;
-	apc->eth_stats.hc_rx_mcast_pkts = resp.hc_rx_mcast_pkts;
-	apc->eth_stats.hc_rx_mcast_bytes = resp.hc_rx_mcast_bytes;
-	apc->eth_stats.hc_tx_err_gf_disabled = resp.tx_err_gf_disabled;
-	apc->eth_stats.hc_tx_err_vport_disabled = resp.tx_err_vport_disabled;
-	apc->eth_stats.hc_tx_err_inval_vportoffset_pkt =
+	ac->hc_stats.hc_rx_discards_no_wqe = resp.rx_discards_nowqe;
+	ac->hc_stats.hc_rx_err_vport_disabled = resp.rx_err_vport_disabled;
+	ac->hc_stats.hc_rx_bytes = resp.hc_rx_bytes;
+	ac->hc_stats.hc_rx_ucast_pkts = resp.hc_rx_ucast_pkts;
+	ac->hc_stats.hc_rx_ucast_bytes = resp.hc_rx_ucast_bytes;
+	ac->hc_stats.hc_rx_bcast_pkts = resp.hc_rx_bcast_pkts;
+	ac->hc_stats.hc_rx_bcast_bytes = resp.hc_rx_bcast_bytes;
+	ac->hc_stats.hc_rx_mcast_pkts = resp.hc_rx_mcast_pkts;
+	ac->hc_stats.hc_rx_mcast_bytes = resp.hc_rx_mcast_bytes;
+	ac->hc_stats.hc_tx_err_gf_disabled = resp.tx_err_gf_disabled;
+	ac->hc_stats.hc_tx_err_vport_disabled = resp.tx_err_vport_disabled;
+	ac->hc_stats.hc_tx_err_inval_vportoffset_pkt =
 					     resp.tx_err_inval_vport_offset_pkt;
-	apc->eth_stats.hc_tx_err_vlan_enforcement =
+	ac->hc_stats.hc_tx_err_vlan_enforcement =
 					     resp.tx_err_vlan_enforcement;
-	apc->eth_stats.hc_tx_err_eth_type_enforcement =
+	ac->hc_stats.hc_tx_err_eth_type_enforcement =
 					     resp.tx_err_ethtype_enforcement;
-	apc->eth_stats.hc_tx_err_sa_enforcement = resp.tx_err_SA_enforcement;
-	apc->eth_stats.hc_tx_err_sqpdid_enforcement =
+	ac->hc_stats.hc_tx_err_sa_enforcement = resp.tx_err_SA_enforcement;
+	ac->hc_stats.hc_tx_err_sqpdid_enforcement =
 					     resp.tx_err_SQPDID_enforcement;
-	apc->eth_stats.hc_tx_err_cqpdid_enforcement =
+	ac->hc_stats.hc_tx_err_cqpdid_enforcement =
 					     resp.tx_err_CQPDID_enforcement;
-	apc->eth_stats.hc_tx_err_mtu_violation = resp.tx_err_mtu_violation;
-	apc->eth_stats.hc_tx_err_inval_oob = resp.tx_err_inval_oob;
-	apc->eth_stats.hc_tx_bytes = resp.hc_tx_bytes;
-	apc->eth_stats.hc_tx_ucast_pkts = resp.hc_tx_ucast_pkts;
-	apc->eth_stats.hc_tx_ucast_bytes = resp.hc_tx_ucast_bytes;
-	apc->eth_stats.hc_tx_bcast_pkts = resp.hc_tx_bcast_pkts;
-	apc->eth_stats.hc_tx_bcast_bytes = resp.hc_tx_bcast_bytes;
-	apc->eth_stats.hc_tx_mcast_pkts = resp.hc_tx_mcast_pkts;
-	apc->eth_stats.hc_tx_mcast_bytes = resp.hc_tx_mcast_bytes;
-	apc->eth_stats.hc_tx_err_gdma = resp.tx_err_gdma;
+	ac->hc_stats.hc_tx_err_mtu_violation = resp.tx_err_mtu_violation;
+	ac->hc_stats.hc_tx_err_inval_oob = resp.tx_err_inval_oob;
+	ac->hc_stats.hc_tx_bytes = resp.hc_tx_bytes;
+	ac->hc_stats.hc_tx_ucast_pkts = resp.hc_tx_ucast_pkts;
+	ac->hc_stats.hc_tx_ucast_bytes = resp.hc_tx_ucast_bytes;
+	ac->hc_stats.hc_tx_bcast_pkts = resp.hc_tx_bcast_pkts;
+	ac->hc_stats.hc_tx_bcast_bytes = resp.hc_tx_bcast_bytes;
+	ac->hc_stats.hc_tx_mcast_pkts = resp.hc_tx_mcast_pkts;
+	ac->hc_stats.hc_tx_mcast_bytes = resp.hc_tx_mcast_bytes;
+	ac->hc_stats.hc_tx_err_gdma = resp.tx_err_gdma;
 }
 
 void mana_query_phy_stats(struct mana_port_context *apc)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index a1afa75a9463..3dfd96146424 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -15,66 +15,69 @@ struct mana_stats_desc {
 static const struct mana_stats_desc mana_eth_stats[] = {
 	{"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)},
 	{"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)},
-	{"hc_rx_discards_no_wqe", offsetof(struct mana_ethtool_stats,
+	{"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
+	{"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					tx_cqe_unknown_type)},
+	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
+					rx_coalesced_err)},
+	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					rx_cqe_unknown_type)},
+};
+
+static const struct mana_stats_desc mana_hc_stats[] = {
+	{"hc_rx_discards_no_wqe", offsetof(struct mana_ethtool_hc_stats,
 					   hc_rx_discards_no_wqe)},
-	{"hc_rx_err_vport_disabled", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_err_vport_disabled", offsetof(struct mana_ethtool_hc_stats,
 					      hc_rx_err_vport_disabled)},
-	{"hc_rx_bytes", offsetof(struct mana_ethtool_stats, hc_rx_bytes)},
-	{"hc_rx_ucast_pkts", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_bytes", offsetof(struct mana_ethtool_hc_stats, hc_rx_bytes)},
+	{"hc_rx_ucast_pkts", offsetof(struct mana_ethtool_hc_stats,
 				      hc_rx_ucast_pkts)},
-	{"hc_rx_ucast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_ucast_bytes", offsetof(struct mana_ethtool_hc_stats,
 				       hc_rx_ucast_bytes)},
-	{"hc_rx_bcast_pkts", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_bcast_pkts", offsetof(struct mana_ethtool_hc_stats,
 				      hc_rx_bcast_pkts)},
-	{"hc_rx_bcast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_bcast_bytes", offsetof(struct mana_ethtool_hc_stats,
 				       hc_rx_bcast_bytes)},
-	{"hc_rx_mcast_pkts", offsetof(struct mana_ethtool_stats,
-			hc_rx_mcast_pkts)},
-	{"hc_rx_mcast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_rx_mcast_pkts", offsetof(struct mana_ethtool_hc_stats,
+				      hc_rx_mcast_pkts)},
+	{"hc_rx_mcast_bytes", offsetof(struct mana_ethtool_hc_stats,
 				       hc_rx_mcast_bytes)},
-	{"hc_tx_err_gf_disabled", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_err_gf_disabled", offsetof(struct mana_ethtool_hc_stats,
 					   hc_tx_err_gf_disabled)},
-	{"hc_tx_err_vport_disabled", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_err_vport_disabled", offsetof(struct mana_ethtool_hc_stats,
 					      hc_tx_err_vport_disabled)},
 	{"hc_tx_err_inval_vportoffset_pkt",
-	 offsetof(struct mana_ethtool_stats,
+	 offsetof(struct mana_ethtool_hc_stats,
 		  hc_tx_err_inval_vportoffset_pkt)},
-	{"hc_tx_err_vlan_enforcement", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_err_vlan_enforcement", offsetof(struct mana_ethtool_hc_stats,
 						hc_tx_err_vlan_enforcement)},
 	{"hc_tx_err_eth_type_enforcement",
-	 offsetof(struct mana_ethtool_stats, hc_tx_err_eth_type_enforcement)},
-	{"hc_tx_err_sa_enforcement", offsetof(struct mana_ethtool_stats,
+	 offsetof(struct mana_ethtool_hc_stats, hc_tx_err_eth_type_enforcement)},
+	{"hc_tx_err_sa_enforcement", offsetof(struct mana_ethtool_hc_stats,
 					      hc_tx_err_sa_enforcement)},
 	{"hc_tx_err_sqpdid_enforcement",
-	 offsetof(struct mana_ethtool_stats, hc_tx_err_sqpdid_enforcement)},
+	 offsetof(struct mana_ethtool_hc_stats, hc_tx_err_sqpdid_enforcement)},
 	{"hc_tx_err_cqpdid_enforcement",
-	 offsetof(struct mana_ethtool_stats, hc_tx_err_cqpdid_enforcement)},
-	{"hc_tx_err_mtu_violation", offsetof(struct mana_ethtool_stats,
+	 offsetof(struct mana_ethtool_hc_stats, hc_tx_err_cqpdid_enforcement)},
+	{"hc_tx_err_mtu_violation", offsetof(struct mana_ethtool_hc_stats,
 					     hc_tx_err_mtu_violation)},
-	{"hc_tx_err_inval_oob", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_err_inval_oob", offsetof(struct mana_ethtool_hc_stats,
 					 hc_tx_err_inval_oob)},
-	{"hc_tx_err_gdma", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_err_gdma", offsetof(struct mana_ethtool_hc_stats,
 				    hc_tx_err_gdma)},
-	{"hc_tx_bytes", offsetof(struct mana_ethtool_stats, hc_tx_bytes)},
-	{"hc_tx_ucast_pkts", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_bytes", offsetof(struct mana_ethtool_hc_stats, hc_tx_bytes)},
+	{"hc_tx_ucast_pkts", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_ucast_pkts)},
-	{"hc_tx_ucast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_ucast_bytes", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_ucast_bytes)},
-	{"hc_tx_bcast_pkts", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_bcast_pkts", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_bcast_pkts)},
-	{"hc_tx_bcast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_bcast_bytes", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_bcast_bytes)},
-	{"hc_tx_mcast_pkts", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_mcast_pkts", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_mcast_pkts)},
-	{"hc_tx_mcast_bytes", offsetof(struct mana_ethtool_stats,
+	{"hc_tx_mcast_bytes", offsetof(struct mana_ethtool_hc_stats,
 					hc_tx_mcast_bytes)},
-	{"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
-	{"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
-					tx_cqe_unknown_type)},
-	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
-					rx_coalesced_err)},
-	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
-					rx_cqe_unknown_type)},
 };
 
 static const struct mana_stats_desc mana_phy_stats[] = {
@@ -138,7 +141,7 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	if (stringset != ETH_SS_STATS)
 		return -EINVAL;
 
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) +
+	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
 			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
 }
 
@@ -150,10 +153,12 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 
 	if (stringset != ETH_SS_STATS)
 		return;
-
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
 		ethtool_puts(&data, mana_eth_stats[i].name);
 
+	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
+		ethtool_puts(&data, mana_hc_stats[i].name);
+
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
 		ethtool_puts(&data, mana_phy_stats[i].name);
 
@@ -186,6 +191,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	void *eth_stats = &apc->eth_stats;
+	void *hc_stats = &apc->ac->hc_stats;
 	void *phy_stats = &apc->phy_stats;
 	struct mana_stats_rx *rx_stats;
 	struct mana_stats_tx *tx_stats;
@@ -208,7 +214,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	if (!apc->port_is_up)
 		return;
 	/* we call mana function to update stats from GDMA */
-	mana_query_gf_stats(apc);
+	mana_query_gf_stats(apc->ac);
 
 	/* We call this mana function to get the phy stats from GDMA and includes
 	 * aggregate tx/rx drop counters, Per-TC(Traffic Channel) tx/rx and pause
@@ -219,6 +225,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	for (q = 0; q < ARRAY_SIZE(mana_eth_stats); q++)
 		data[i++] = *(u64 *)(eth_stats + mana_eth_stats[q].offset);
 
+	for (q = 0; q < ARRAY_SIZE(mana_hc_stats); q++)
+		data[i++] = *(u64 *)(hc_stats + mana_hc_stats[q].offset);
+
 	for (q = 0; q < ARRAY_SIZE(mana_phy_stats); q++)
 		data[i++] = *(u64 *)(phy_stats + mana_phy_stats[q].offset);
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8906901535f5..3484f42803e3 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -375,6 +375,13 @@ struct mana_tx_qp {
 struct mana_ethtool_stats {
 	u64 stop_queue;
 	u64 wake_queue;
+	u64 tx_cqe_err;
+	u64 tx_cqe_unknown_type;
+	u64 rx_coalesced_err;
+	u64 rx_cqe_unknown_type;
+};
+
+struct mana_ethtool_hc_stats {
 	u64 hc_rx_discards_no_wqe;
 	u64 hc_rx_err_vport_disabled;
 	u64 hc_rx_bytes;
@@ -402,10 +409,6 @@ struct mana_ethtool_stats {
 	u64 hc_tx_mcast_pkts;
 	u64 hc_tx_mcast_bytes;
 	u64 hc_tx_err_gdma;
-	u64 tx_cqe_err;
-	u64 tx_cqe_unknown_type;
-	u64 rx_coalesced_err;
-	u64 rx_cqe_unknown_type;
 };
 
 struct mana_ethtool_phy_stats {
@@ -473,6 +476,7 @@ struct mana_context {
 	u16 num_ports;
 	u8 bm_hostmode;
 
+	struct mana_ethtool_hc_stats hc_stats;
 	struct mana_eq *eqs;
 	struct dentry *mana_eqs_debugfs;
 
@@ -577,7 +581,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq,
 struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
 void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
 int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
-void mana_query_gf_stats(struct mana_port_context *apc);
+void mana_query_gf_stats(struct mana_context *ac);
 int mana_query_link_cfg(struct mana_port_context *apc);
 int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 		      int enable_clamping);
-- 
cgit v1.2.3


From be4f1d67ec56f23f37714ac73c01094e63c7ff28 Mon Sep 17 00:00:00 2001
From: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Date: Fri, 14 Nov 2025 03:43:19 -0800
Subject: net: mana: Add standard counter rx_missed_errors

Report standard counter stats->rx_missed_errors
using hc_rx_discards_no_wqe from the hardware.

Add a delayed work item that runs mana_query_gf_stats()
every 2 seconds to keep the info in hc_stats up to date,
and define a driver capability flag to notify hardware
of the periodic queries.

To avoid repeated failures and log flooding, the work is not
rescheduled if mana_query_gf_stats() fails with an HWC timeout
error; in that case the stats are reset to 0. Other errors are
treated as transient and do not need a VF reset for recovery,
so the work continues to be rescheduled.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763120599-6331-3-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 36 ++++++++++++++++++++--
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c |  2 --
 include/net/mana/gdma.h                            |  6 +++-
 include/net/mana/mana.h                            |  6 +++-
 4 files changed, 43 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d8ce4402c696..13f47be7aca6 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -534,6 +534,11 @@ static void mana_get_stats64(struct net_device *ndev,
 
 	netdev_stats_to_stats64(st, &ndev->stats);
 
+	if (apc->ac->hwc_timeout_occurred)
+		netdev_warn_once(ndev, "HWC timeout occurred\n");
+
+	st->rx_missed_errors = apc->ac->hc_stats.hc_rx_discards_no_wqe;
+
 	for (q = 0; q < num_queues; q++) {
 		rx_stats = &apc->rxqs[q]->stats;
 
@@ -2809,7 +2814,7 @@ int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
 	return 0;
 }
 
-void mana_query_gf_stats(struct mana_context *ac)
+int mana_query_gf_stats(struct mana_context *ac)
 {
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct mana_query_gf_stat_resp resp = {};
@@ -2852,14 +2857,14 @@ void mana_query_gf_stats(struct mana_context *ac)
 				sizeof(resp));
 	if (err) {
 		dev_err(dev, "Failed to query GF stats: %d\n", err);
-		return;
+		return err;
 	}
 	err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_GF_STAT,
 				   sizeof(resp));
 	if (err || resp.hdr.status) {
 		dev_err(dev, "Failed to query GF stats: %d, 0x%x\n", err,
 			resp.hdr.status);
-		return;
+		return err;
 	}
 
 	ac->hc_stats.hc_rx_discards_no_wqe = resp.rx_discards_nowqe;
@@ -2894,6 +2899,8 @@ void mana_query_gf_stats(struct mana_context *ac)
 	ac->hc_stats.hc_tx_mcast_pkts = resp.hc_tx_mcast_pkts;
 	ac->hc_stats.hc_tx_mcast_bytes = resp.hc_tx_mcast_bytes;
 	ac->hc_stats.hc_tx_err_gdma = resp.tx_err_gdma;
+
+	return 0;
 }
 
 void mana_query_phy_stats(struct mana_port_context *apc)
@@ -3428,6 +3435,24 @@ int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type even
 	return 0;
 }
 
+#define MANA_GF_STATS_PERIOD (2 * HZ)
+
+static void mana_gf_stats_work_handler(struct work_struct *work)
+{
+	struct mana_context *ac =
+		container_of(to_delayed_work(work), struct mana_context, gf_stats_work);
+	int err;
+
+	err = mana_query_gf_stats(ac);
+	if (err == -ETIMEDOUT) {
+		/* HWC timeout detected - reset stats and stop rescheduling */
+		ac->hwc_timeout_occurred = true;
+		memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
+		return;
+	}
+	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
+}
+
 int mana_probe(struct gdma_dev *gd, bool resuming)
 {
 	struct gdma_context *gc = gd->gdma_context;
@@ -3520,6 +3545,10 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 	}
 
 	err = add_adev(gd, "eth");
+
+	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
+	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
+
 out:
 	if (err) {
 		mana_remove(gd, false);
@@ -3544,6 +3573,7 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	int i;
 
 	disable_work_sync(&ac->link_change_work);
+	cancel_delayed_work_sync(&ac->gf_stats_work);
 
 	/* adev currently doesn't support suspending, always remove it */
 	if (gd->adev)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 3dfd96146424..99e811208683 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -213,8 +213,6 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 
 	if (!apc->port_is_up)
 		return;
-	/* we call mana function to update stats from GDMA */
-	mana_query_gf_stats(apc->ac);
 
 	/* We call this mana function to get the phy stats from GDMA and includes
 	 * aggregate tx/rx drop counters, Per-TC(Traffic Channel) tx/rx and pause
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 637f42485dba..2e4f2f3175e5 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -592,6 +592,9 @@ enum {
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
 #define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
 
+/* Driver can send HWC periodically to query stats */
+#define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
@@ -601,7 +604,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
 	 GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
-	 GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE)
+	 GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \
+	 GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 3484f42803e3..d37f4cea0ac3 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -480,6 +480,10 @@ struct mana_context {
 	struct mana_eq *eqs;
 	struct dentry *mana_eqs_debugfs;
 
+	/* Workqueue for querying hardware stats */
+	struct delayed_work gf_stats_work;
+	bool hwc_timeout_occurred;
+
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
 
 	/* Link state change work */
@@ -581,7 +585,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq,
 struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
 void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
 int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
-void mana_query_gf_stats(struct mana_context *ac);
+int mana_query_gf_stats(struct mana_context *ac);
 int mana_query_link_cfg(struct mana_port_context *apc);
 int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 		      int enable_clamping);
-- 
cgit v1.2.3


From 24afd7827efb7c69adfc41835390470e3eec4740 Mon Sep 17 00:00:00 2001
From: Inochi Amaoto <inochiama@gmail.com>
Date: Fri, 14 Nov 2025 08:38:04 +0800
Subject: net: phy: Add helper for fixing RGMII PHY mode based on internal mac
 delay

The "phy-mode" property of the devicetree now indicates whether the
PCB adds the delay, which means the MAC needs to modify the PHY mode
based on whether the MAC itself adds an internal delay.

This modification is similar for many ethernet drivers. To simplify
the code, define the helper phy_fix_phy_mode_for_mac_delays(interface,
mac_txid, mac_rxid) to fix the PHY mode based on whether the MAC adds
an internal delay.

Suggested-by: Russell King (Oracle) <linux@armlinux.org.uk>
Signed-off-by: Inochi Amaoto <inochiama@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251114003805.494387-3-inochiama@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-core.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h        |  3 +++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 605ca20ae192..0c63e6ba2cb0 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -101,6 +101,49 @@ const char *phy_rate_matching_to_str(int rate_matching)
 }
 EXPORT_SYMBOL_GPL(phy_rate_matching_to_str);
 
+/**
+ * phy_fix_phy_mode_for_mac_delays - Convenience function for fixing PHY
+ * mode based on whether mac adds internal delay
+ *
+ * @interface: The current interface mode of the port
+ * @mac_txid: True if the mac adds internal tx delay
+ * @mac_rxid: True if the mac adds internal rx delay
+ *
+ * Return: fixed PHY mode, or PHY_INTERFACE_MODE_NA if the interface can
+ * not apply the internal delay
+ */
+phy_interface_t phy_fix_phy_mode_for_mac_delays(phy_interface_t interface,
+						bool mac_txid, bool mac_rxid)
+{
+	if (!phy_interface_mode_is_rgmii(interface))
+		return interface;
+
+	if (mac_txid && mac_rxid) {
+		if (interface == PHY_INTERFACE_MODE_RGMII_ID)
+			return PHY_INTERFACE_MODE_RGMII;
+		return PHY_INTERFACE_MODE_NA;
+	}
+
+	if (mac_txid) {
+		if (interface == PHY_INTERFACE_MODE_RGMII_ID)
+			return PHY_INTERFACE_MODE_RGMII_RXID;
+		if (interface == PHY_INTERFACE_MODE_RGMII_TXID)
+			return PHY_INTERFACE_MODE_RGMII;
+		return PHY_INTERFACE_MODE_NA;
+	}
+
+	if (mac_rxid) {
+		if (interface == PHY_INTERFACE_MODE_RGMII_ID)
+			return PHY_INTERFACE_MODE_RGMII_TXID;
+		if (interface == PHY_INTERFACE_MODE_RGMII_RXID)
+			return PHY_INTERFACE_MODE_RGMII;
+		return PHY_INTERFACE_MODE_NA;
+	}
+
+	return interface;
+}
+EXPORT_SYMBOL_GPL(phy_fix_phy_mode_for_mac_delays);
+
 /**
  * phy_interface_num_ports - Return the number of links that can be carried by
  *			     a given MAC-PHY physical link. Returns 0 if this is
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bf5457341ca8..65b0c3ca6a2b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2040,6 +2040,9 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev)
 	return phydev->is_pseudo_fixed_link;
 }
 
+phy_interface_t phy_fix_phy_mode_for_mac_delays(phy_interface_t interface,
+						bool mac_txid, bool mac_rxid);
+
 int phy_save_page(struct phy_device *phydev);
 int phy_select_page(struct phy_device *phydev, int page);
 int phy_restore_page(struct phy_device *phydev, int oldpage, int ret);
-- 
cgit v1.2.3


From fc45aee66223253ec5547094d7552819914abdfb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 10 Mar 2025 00:06:29 -0400
Subject: get rid of kill_litter_super()

Not used anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  7 +++++++
 fs/dcache.c                           | 21 ---------------------
 fs/internal.h                         |  1 -
 fs/super.c                            |  8 --------
 include/linux/dcache.h                |  1 -
 include/linux/fs.h                    |  1 -
 6 files changed, 7 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 7233b04668fc..4921b3b0662a 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1309,3 +1309,10 @@ a different length, use
 	vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len))
 
 instead.
+
+---
+
+**mandatory**
+
+kill_litter_super() is gone; convert to DCACHE_PERSISTENT use (as all
+in-tree filesystems have done).
diff --git a/fs/dcache.c b/fs/dcache.c
index 3cc6c3876177..5ee2e78a91b3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3167,27 +3167,6 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 }
 EXPORT_SYMBOL(is_subdir);
 
-static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
-{
-	struct dentry *root = data;
-	if (dentry != root) {
-		if (d_unhashed(dentry) || !dentry->d_inode ||
-		    dentry->d_flags & DCACHE_PERSISTENT)
-			return D_WALK_SKIP;
-
-		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
-			dentry->d_flags |= DCACHE_GENOCIDE;
-			dentry->d_lockref.count--;
-		}
-	}
-	return D_WALK_CONTINUE;
-}
-
-void d_genocide(struct dentry *parent)
-{
-	d_walk(parent, parent, d_genocide_kill);
-}
-
 void d_mark_tmpfile(struct file *file, struct inode *inode)
 {
 	struct dentry *dentry = file->f_path.dentry;
diff --git a/fs/internal.h b/fs/internal.h
index 9b2b4d116880..144686af6c36 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -227,7 +227,6 @@ extern void shrink_dcache_for_umount(struct super_block *);
 extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name, unsigned *seq);
-extern void d_genocide(struct dentry *);
 
 /*
  * pipe.c
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..ee001f684d2a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1284,14 +1284,6 @@ void kill_anon_super(struct super_block *sb)
 }
 EXPORT_SYMBOL(kill_anon_super);
 
-void kill_litter_super(struct super_block *sb)
-{
-	if (sb->s_root)
-		d_genocide(sb->s_root);
-	kill_anon_super(sb);
-}
-EXPORT_SYMBOL(kill_litter_super);
-
 int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
 {
 	return set_anon_super(sb, NULL);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6ec4066825e3..20a85144a00e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -198,7 +198,6 @@ enum dentry_flags {
 	DCACHE_REFERENCED		= BIT(6),	/* Recently used, don't discard. */
 	DCACHE_DONTCACHE		= BIT(7),	/* Purge from memory on final dput() */
 	DCACHE_CANT_MOUNT		= BIT(8),
-	DCACHE_GENOCIDE			= BIT(9),
 	DCACHE_SHRINK_LIST		= BIT(10),
 	DCACHE_OP_WEAK_REVALIDATE	= BIT(11),
 	/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f5037c556f61..95933ceaae51 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2728,7 +2728,6 @@ void retire_super(struct super_block *sb);
 void generic_shutdown_super(struct super_block *sb);
 void kill_block_super(struct super_block *sb);
 void kill_anon_super(struct super_block *sb);
-void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
-- 
cgit v1.2.3


From ca459ca70f60ce05445845eca74c788b0d5ddb1b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 Oct 2025 18:34:49 -0400
Subject: kill securityfs_recursive_remove()

it's an unused alias for securityfs_remove()

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/security.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 92ac3f27b973..9e710cfee744 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2258,8 +2258,6 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
-#define securityfs_recursive_remove securityfs_remove
-
 #ifdef CONFIG_BPF_SYSCALL
 union bpf_attr;
 struct bpf_map;
-- 
cgit v1.2.3


From eb028c33451af08bb34f45c6be6967ef1c98cbd1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Oct 2025 18:32:21 -0400
Subject: d_make_discardable(): warn if given a non-persistent dentry

At this point there are very few call chains that might lead to
d_make_discardable() on a dentry that hadn't been made persistent:
calls of simple_unlink() and simple_rmdir() in configfs and
apparmorfs.

Both filesystems do pin (part of) their contents in dcache, but
they are currently playing very unusual games with that.  Converting
them to more usual patterns might be possible, but it's definitely
going to be a long series of changes in both cases.

For now the easiest solution is to have both stop using simple_unlink()
and simple_rmdir() - that allows us to make d_make_discardable() warn
when given a non-persistent dentry.

Rather than giving them full-blown private copies (with calls of
d_make_discardable() replaced with dput()), let's pull the parts of
simple_unlink() and simple_rmdir() that deal with timestamps and link
counts into separate helpers (__simple_unlink() and __simple_rmdir()
resp.) and have those used by configfs and apparmorfs.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/configfs/dir.c              | 10 ++++++++--
 fs/configfs/inode.c            |  3 ++-
 fs/dcache.c                    |  9 +--------
 fs/libfs.c                     | 21 +++++++++++++++++----
 include/linux/fs.h             |  2 ++
 security/apparmor/apparmorfs.c | 13 +++++++++----
 6 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 81f4f06bc87e..e8f2f44012e9 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -400,8 +400,14 @@ static void remove_dir(struct dentry * d)
 
 	configfs_remove_dirent(d);
 
-	if (d_really_is_positive(d))
-		simple_rmdir(d_inode(parent),d);
+	if (d_really_is_positive(d)) {
+		if (likely(simple_empty(d))) {
+			__simple_rmdir(d_inode(parent),d);
+			dput(d);
+		} else {
+			pr_warn("remove_dir (%pd): attributes remain", d);
+		}
+	}
 
 	pr_debug(" o %pd removing done (%d)\n", d, d_count(d));
 
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 1d2e3a5738d1..bcda3372e141 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -211,7 +211,8 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
-			simple_unlink(d_inode(parent), dentry);
+			__simple_unlink(d_inode(parent), dentry);
+			dput(dentry);
 		} else
 			spin_unlock(&dentry->d_lock);
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index 5ee2e78a91b3..824d620bb563 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -931,14 +931,7 @@ EXPORT_SYMBOL(dput);
 void d_make_discardable(struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
-	/*
-	 * By the end of the series we'll add
-	 * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT);
-	 * here, but while object removal is done by a few common helpers,
-	 * object creation tends to be open-coded (if nothing else, new inode
-	 * needs to be set up), so adding a warning from the very beginning
-	 * would make for much messier patch series.
-	 */
+	WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT));
 	dentry->d_flags &= ~DCACHE_PERSISTENT;
 	dentry->d_lockref.count--;
 	rcu_read_lock();
diff --git a/fs/libfs.c b/fs/libfs.c
index 80f288a771e3..0aa630e7eb00 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -790,13 +790,27 @@ out:
 }
 EXPORT_SYMBOL(simple_empty);
 
-int simple_unlink(struct inode *dir, struct dentry *dentry)
+void __simple_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
 
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	drop_nlink(inode);
+}
+EXPORT_SYMBOL(__simple_unlink);
+
+void __simple_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	drop_nlink(d_inode(dentry));
+	__simple_unlink(dir, dentry);
+	drop_nlink(dir);
+}
+EXPORT_SYMBOL(__simple_rmdir);
+
+int simple_unlink(struct inode *dir, struct dentry *dentry)
+{
+	__simple_unlink(dir, dentry);
 	d_make_discardable(dentry);
 	return 0;
 }
@@ -807,9 +821,8 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!simple_empty(dentry))
 		return -ENOTEMPTY;
 
-	drop_nlink(d_inode(dentry));
-	simple_unlink(dir, dentry);
-	drop_nlink(dir);
+	__simple_rmdir(dir, dentry);
+	d_make_discardable(dentry);
 	return 0;
 }
 EXPORT_SYMBOL(simple_rmdir);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 95933ceaae51..ef842adbd418 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3621,6 +3621,8 @@ extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
+extern void __simple_unlink(struct inode *, struct dentry *);
+extern void __simple_rmdir(struct inode *, struct dentry *);
 void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry);
 extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 391a586d0557..9b9090d38ea2 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -358,10 +358,15 @@ static void aafs_remove(struct dentry *dentry)
 	dir = d_inode(dentry->d_parent);
 	inode_lock(dir);
 	if (simple_positive(dentry)) {
-		if (d_is_dir(dentry))
-			simple_rmdir(dir, dentry);
-		else
-			simple_unlink(dir, dentry);
+		if (d_is_dir(dentry)) {
+			if (!WARN_ON(!simple_empty(dentry))) {
+				__simple_rmdir(dir, dentry);
+				dput(dentry);
+			}
+		} else {
+			__simple_unlink(dir, dentry);
+			dput(dentry);
+		}
 		d_delete(dentry);
 		dput(dentry);
 	}
-- 
cgit v1.2.3


From 7dc211c1159d991db609bdf4b0fb9033c04adcbc Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Sat, 15 Nov 2025 10:23:43 +0000
Subject: bpf: Fix invalid prog->stats access when update_effective_progs fails

Syzkaller triggers an invalid memory access issue following fault
injection in update_effective_progs. The issue can be described as
follows:

__cgroup_bpf_detach
  update_effective_progs
    compute_effective_progs
      bpf_prog_array_alloc <-- fault inject
  purge_effective_progs
    /* change to dummy_bpf_prog */
    array->items[index] = &dummy_bpf_prog.prog

---softirq start---
__do_softirq
  ...
    __cgroup_bpf_run_filter_skb
      __bpf_prog_run_save_cb
        bpf_prog_run
          stats = this_cpu_ptr(prog->stats)
          /* invalid memory access */
          flags = u64_stats_update_begin_irqsave(&stats->syncp)
---softirq end---

  static_branch_dec(&cgroup_bpf_enabled_key[atype])

The reason is that fault injection caused update_effective_progs to fail
and then changed the original prog into dummy_bpf_prog.prog in
purge_effective_progs. Then a softirq came, and accessing the members of
dummy_bpf_prog.prog in the softirq triggers invalid mem access.

To fix it, skip updating stats when stats is NULL.

Fixes: 492ecee892c2 ("bpf: enable program stats")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/r/20251115102343.2200727-1-pulehui@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 12 +++++++-----
 kernel/bpf/syscall.c   |  3 +++
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 973233b82dc1..569de3b14279 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 
 		duration = sched_clock() - start;
-		stats = this_cpu_ptr(prog->stats);
-		flags = u64_stats_update_begin_irqsave(&stats->syncp);
-		u64_stats_inc(&stats->cnt);
-		u64_stats_add(&stats->nsecs, duration);
-		u64_stats_update_end_irqrestore(&stats->syncp, flags);
+		if (likely(prog->stats)) {
+			stats = this_cpu_ptr(prog->stats);
+			flags = u64_stats_update_begin_irqsave(&stats->syncp);
+			u64_stats_inc(&stats->cnt);
+			u64_stats_add(&stats->nsecs, duration);
+			u64_stats_update_end_irqrestore(&stats->syncp, flags);
+		}
 	} else {
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2a441185f81..792623a7c90b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2463,6 +2463,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
 	struct bpf_prog_stats *stats;
 	unsigned int flags;
 
+	if (unlikely(!prog->stats))
+		return;
+
 	stats = this_cpu_ptr(prog->stats);
 	flags = u64_stats_update_begin_irqsave(&stats->syncp);
 	u64_stats_inc(&stats->misses);
-- 
cgit v1.2.3


From 945865a0ddf3e3950aea32e23e10d815ee9b21bc Mon Sep 17 00:00:00 2001
From: Antheas Kapenekakis <lkml@antheas.dev>
Date: Sun, 26 Oct 2025 20:16:34 +0100
Subject: ALSA: hda/tas2781: fix speaker id retrieval for multiple probes

Currently, on ASUS projects, the TAS2781 codec attaches the speaker GPIO
to the first tasdevice_priv instance using devm. This causes
tas2781_read_acpi to fail on subsequent probes since the GPIO is already
managed by the first device. This causes a failure on Xbox Ally X,
because it has two amplifiers, and prevents us from quirking both the
Xbox Ally and Xbox Ally X in the realtek codec driver.

It is unnecessary to attach the GPIO to a device as it is static.
Therefore, instead of attaching it and then reading it when loading the
firmware, read its value directly in tas2781_read_acpi and store it in
the private data structure. Then, make reading the value non-fatal so
that ASUS projects that miss a speaker pin can still work, perhaps using
fallback firmware.

Fixes: 4e7035a75da9 ("ALSA: hda/tas2781: Add speaker id check for ASUS projects")
Cc: stable@vger.kernel.org # 6.17
Signed-off-by: Antheas Kapenekakis <lkml@antheas.dev>
Reviewed-by: Baojun Xu <baojun.xu@ti.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251026191635.2447593-1-lkml@antheas.dev
---
 include/sound/tas2781.h                        |  2 +-
 sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 44 +++++++++++++++-----------
 2 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index 0fbcdb15c74b..29d15ba65f04 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -197,7 +197,6 @@ struct tasdevice_priv {
 	struct acoustic_data acou_data;
 #endif
 	struct tasdevice_fw *fmw;
-	struct gpio_desc *speaker_id;
 	struct gpio_desc *reset;
 	struct mutex codec_lock;
 	struct regmap *regmap;
@@ -215,6 +214,7 @@ struct tasdevice_priv {
 	unsigned int magic_num;
 	unsigned int chip_id;
 	unsigned int sysclk;
+	int speaker_id;
 
 	int irq;
 	int cur_prog;
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
index 0357401a6023..c8619995b1d7 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
@@ -87,6 +87,7 @@ static const struct acpi_gpio_mapping tas2781_speaker_id_gpios[] = {
 
 static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid)
 {
+	struct gpio_desc *speaker_id;
 	struct acpi_device *adev;
 	struct device *physdev;
 	LIST_HEAD(resources);
@@ -119,19 +120,31 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid)
 	/* Speaker id was needed for ASUS projects. */
 	ret = kstrtou32(sub, 16, &subid);
 	if (!ret && upper_16_bits(subid) == PCI_VENDOR_ID_ASUSTEK) {
-		ret = devm_acpi_dev_add_driver_gpios(p->dev,
-			tas2781_speaker_id_gpios);
-		if (ret < 0)
+		ret = acpi_dev_add_driver_gpios(adev, tas2781_speaker_id_gpios);
+		if (ret < 0) {
 			dev_err(p->dev, "Failed to add driver gpio %d.\n",
 				ret);
-		p->speaker_id = devm_gpiod_get(p->dev, "speakerid", GPIOD_IN);
-		if (IS_ERR(p->speaker_id)) {
-			dev_err(p->dev, "Failed to get Speaker id.\n");
-			ret = PTR_ERR(p->speaker_id);
-			goto err;
+			p->speaker_id = -1;
+			goto end_2563;
+		}
+
+		speaker_id = fwnode_gpiod_get_index(acpi_fwnode_handle(adev),
+			"speakerid", 0, GPIOD_IN, NULL);
+		if (!IS_ERR(speaker_id)) {
+			p->speaker_id = gpiod_get_value_cansleep(speaker_id);
+			dev_dbg(p->dev, "Got speaker id gpio from ACPI: %d.\n",
+				p->speaker_id);
+			gpiod_put(speaker_id);
+		} else {
+			p->speaker_id = -1;
+			ret = PTR_ERR(speaker_id);
+			dev_err(p->dev, "Get speaker id gpio failed %d.\n",
+				ret);
 		}
+
+		acpi_dev_remove_driver_gpios(adev);
 	} else {
-		p->speaker_id = NULL;
+		p->speaker_id = -1;
 	}
 
 end_2563:
@@ -432,23 +445,16 @@ static void tasdevice_dspfw_init(void *context)
 	struct tas2781_hda *tas_hda = dev_get_drvdata(tas_priv->dev);
 	struct tas2781_hda_i2c_priv *hda_priv = tas_hda->hda_priv;
 	struct hda_codec *codec = tas_priv->codec;
-	int ret, spk_id;
+	int ret;
 
 	tasdevice_dsp_remove(tas_priv);
 	tas_priv->fw_state = TASDEVICE_DSP_FW_PENDING;
-	if (tas_priv->speaker_id != NULL) {
-		// Speaker id need to be checked for ASUS only.
-		spk_id = gpiod_get_value(tas_priv->speaker_id);
-		if (spk_id < 0) {
-			// Speaker id is not valid, use default.
-			dev_dbg(tas_priv->dev, "Wrong spk_id = %d\n", spk_id);
-			spk_id = 0;
-		}
+	if (tas_priv->speaker_id >= 0) {
 		snprintf(tas_priv->coef_binaryname,
 			  sizeof(tas_priv->coef_binaryname),
 			  "TAS2XXX%04X%d.bin",
 			  lower_16_bits(codec->core.subsystem_id),
-			  spk_id);
+			  tas_priv->speaker_id);
 	} else {
 		snprintf(tas_priv->coef_binaryname,
 			  sizeof(tas_priv->coef_binaryname),
-- 
cgit v1.2.3


From ae8966b7b5bd69b86209cc34bcca1ba9f18b68e6 Mon Sep 17 00:00:00 2001
From: Peter Hutterer <peter.hutterer@who-t.net>
Date: Thu, 6 Nov 2025 21:45:34 +1000
Subject: Input: rename INPUT_PROP_HAPTIC_TOUCHPAD to INPUT_PROP_PRESSUREPAD

And expand it to encompass all pressure pads.

Definition: "pressure pad" as used here includes all touchpads that
use physical pressure to convert to click, without physical hinges. Also
called haptic touchpads in general parlance, Synaptics calls them
ForcePads.

Most (all?) pressure pads are currently advertised as
INPUT_PROP_BUTTONPAD. The suggestion to identify them as pressure pads
by defining the resolution on ABS_MT_PRESSURE has been in the docs since
commit 20ccc8dd38a3 ("Documentation: input: define
ABS_PRESSURE/ABS_MT_PRESSURE resolution as grams") but few devices
provide this information.

In userspace it's thus impossible to determine whether a device is a
true pressure pad (pressure equals pressure) or a normal clickpad
(pressure equals finger size).

Commit 7075ae4ac9db ("Input: add INPUT_PROP_HAPTIC_TOUCHPAD") introduces
INPUT_PROP_HAPTIC_TOUCHPAD but restricted it to those touchpads that
have support for userspace-controlled effects. Let's expand and rename
that definition to include all pressure pad touchpads since those that
do support FF effects can be identified by the presence of the
FF_HAPTIC bit.

This means:
- clickpad: INPUT_PROP_BUTTONPAD
- pressurepad: INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD
- pressurepad with configurable haptics:
  INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD + FF_HAPTIC

Signed-off-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://patch.msgid.link/20251106114534.GA405512@tassie
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 Documentation/input/event-codes.rst    | 25 ++++++++++++++++++-------
 drivers/hid/hid-haptic.c               |  2 +-
 include/uapi/linux/input-event-codes.h |  2 +-
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst
index 1ead9bb8d9c6..4424cbff251f 100644
--- a/Documentation/input/event-codes.rst
+++ b/Documentation/input/event-codes.rst
@@ -400,19 +400,30 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz).
 All other axes retain their meaning. A device must not mix
 regular directional axes and accelerometer axes on the same event node.
 
-INPUT_PROP_HAPTIC_TOUCHPAD
---------------------------
+INPUT_PROP_PRESSUREPAD
+----------------------
+
+The INPUT_PROP_PRESSUREPAD property indicates that the device provides
+simulated haptic feedback (e.g. a vibrator motor situated below the surface)
+instead of physical haptic feedback (e.g. a hinge). This property is only set
+if the device:
 
-The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device:
-- supports simple haptic auto and manual triggering
 - can differentiate between at least 5 fingers
 - uses correct resolution for the X/Y (units and value)
-- reports correct force per touch, and correct units for them (newtons or grams)
 - follows the MT protocol type B
 
+If the simulated haptic feedback is controllable by userspace the device must:
+
+- support simple haptic auto and manual triggering, and
+- report correct force per touch, and correct units for them (newtons or grams), and
+- provide the EV_FF FF_HAPTIC force feedback effect.
+
 Summing up, such devices follow the MS spec for input devices in
-Win8 and Win8.1, and in addition support the Simple haptic controller HID table,
-and report correct units for the pressure.
+Win8 and Win8.1, and in addition may support the Simple haptic controller HID
+table, and report correct units for the pressure.
+
+Where applicable, this property is set in addition to INPUT_PROP_BUTTONPAD; it
+does not replace that property.
 
 Guidelines
 ==========
diff --git a/drivers/hid/hid-haptic.c b/drivers/hid/hid-haptic.c
index aa090684c1f2..fc8a9997f815 100644
--- a/drivers/hid/hid-haptic.c
+++ b/drivers/hid/hid-haptic.c
@@ -86,7 +86,7 @@ int hid_haptic_input_configured(struct hid_device *hdev,
 	if (hi->application == HID_DG_TOUCHPAD) {
 		if (haptic->auto_trigger_report &&
 		    haptic->manual_trigger_report) {
-			__set_bit(INPUT_PROP_HAPTIC_TOUCHPAD, hi->input->propbit);
+			__set_bit(INPUT_PROP_PRESSUREPAD, hi->input->propbit);
 			return 1;
 		}
 		return 0;
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 9cd89bcc1d9c..30f3c9eaafaa 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -27,7 +27,7 @@
 #define INPUT_PROP_TOPBUTTONPAD		0x04	/* softbuttons at top of pad */
 #define INPUT_PROP_POINTING_STICK	0x05	/* is a pointing stick */
 #define INPUT_PROP_ACCELEROMETER	0x06	/* has accelerometer */
-#define INPUT_PROP_HAPTIC_TOUCHPAD	0x07	/* is a haptic touchpad */
+#define INPUT_PROP_PRESSUREPAD		0x07	/* pressure triggers clicks */
 
 #define INPUT_PROP_MAX			0x1f
 #define INPUT_PROP_CNT			(INPUT_PROP_MAX + 1)
-- 
cgit v1.2.3


From 9c7dacf5d51910f34a3bd709403f6a82ffc8c960 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:14 +0100
Subject: platform/x86: asus-armoury: add apu-mem control support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the APU memory size control under the asus-armoury module using
the fw_attributes class.

This allows the APU allocated memory size to be adjusted depending on
the user's priority. A reboot is required after the change.

Co-developed-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://patch.msgid.link/20251102215319.3126879-5-denis.benato@linux.dev
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-armoury.c        | 98 ++++++++++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h |  2 +
 2 files changed, 100 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c
index f0cb973a487e..1b972260c5dd 100644
--- a/drivers/platform/x86/asus-armoury.c
+++ b/drivers/platform/x86/asus-armoury.c
@@ -174,6 +174,7 @@ static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 de
  * and should perform relevant checks.
  *
  * Returns:
+ * * %-EINVAL	- attempt to set a dangerous or unsupported value.
  * * %-EIO	- WMI function returned an error.
  * * %0		- successful and retval is filled.
  * * %other	- error from WMI call.
@@ -184,6 +185,26 @@ static int armoury_set_devstate(struct kobj_attribute *attr,
 	u32 result;
 	int err;
 
+	/*
+	 * Prevent developers from bricking devices or issuing dangerous
+	 * commands that can be difficult or impossible to recover from.
+	 */
+	switch (dev_id) {
+	case ASUS_WMI_DEVID_APU_MEM:
+		/*
+		 * A hard reset might suffice to save the device,
+		 * but there is no value in sending these commands.
+		 */
+		if (value == 0x100 || value == 0x101) {
+			pr_err("Refusing to set APU memory to unsafe value: 0x%x\n", value);
+			return -EINVAL;
+		}
+		break;
+	default:
+		/* No problems are known for this dev_id */
+		break;
+	}
+
 	err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result);
 	if (err) {
 		if (attr)
@@ -599,6 +620,82 @@ static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kob
 }
 ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)");
 
+/* Device memory available to APU */
+
+/*
+ * Values map for APU reserved memory (index + 1 number of GB).
+ * Some look out of order, but are actually correct.
+ */
+static u32 apu_mem_map[] = {
+	[0] = 0x000, /* called "AUTO" on the BIOS, is the minimum available */
+	[1] = 0x102,
+	[2] = 0x103,
+	[3] = 0x104,
+	[4] = 0x105,
+	[5] = 0x107,
+	[6] = 0x108,
+	[7] = 0x109,
+	[8] = 0x106,
+};
+
+static ssize_t apu_mem_current_value_show(struct kobject *kobj, struct kobj_attribute *attr,
+					  char *buf)
+{
+	int err;
+	u32 mem;
+
+	err = armoury_get_devstate(attr, &mem, ASUS_WMI_DEVID_APU_MEM);
+	if (err)
+		return err;
+
+	/* After 0x000 is set, a read will return 0x100 */
+	if (mem == 0x100)
+		return sysfs_emit(buf, "0\n");
+
+	for (unsigned int i = 0; i < ARRAY_SIZE(apu_mem_map); i++) {
+		if (apu_mem_map[i] == mem)
+			return sysfs_emit(buf, "%u\n", i);
+	}
+
+	pr_warn("Unrecognised value for APU mem 0x%08x\n", mem);
+	return -EIO;
+}
+
+static ssize_t apu_mem_current_value_store(struct kobject *kobj, struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	int result, err;
+	u32 requested, mem;
+
+	result = kstrtou32(buf, 10, &requested);
+	if (result)
+		return result;
+
+	if (requested >= ARRAY_SIZE(apu_mem_map))
+		return -EINVAL;
+	mem = apu_mem_map[requested];
+
+	err = armoury_set_devstate(attr, mem, NULL, ASUS_WMI_DEVID_APU_MEM);
+	if (err) {
+		pr_warn("Failed to set apu_mem 0x%x: %d\n", mem, err);
+		return err;
+	}
+
+	pr_info("APU memory changed to %uGB, reboot required\n", requested + 1);
+	sysfs_notify(kobj, NULL, attr->attr.name);
+
+	asus_set_reboot_and_signal_event();
+
+	return count;
+}
+
+static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr,
+					    char *buf)
+{
+	return armoury_attr_enum_list(buf, ARRAY_SIZE(apu_mem_map));
+}
+ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use");
+
 /* Simple attribute creation */
 ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n",
 			    "Show the current mode of charging");
@@ -618,6 +715,7 @@ static const struct asus_attr_group armoury_attr_groups[] = {
 	{ &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED },
 	{ &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU },
 	{ &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU },
+	{ &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM },
 
 	{ &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE },
 	{ &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND },
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 3cc235b20be4..9a6433d08973 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -136,6 +136,8 @@
 /* dgpu on/off */
 #define ASUS_WMI_DEVID_DGPU		0x00090020
 
+#define ASUS_WMI_DEVID_APU_MEM		0x000600C1
+
 /* gpu mux switch, 0 = dGPU, 1 = Optimus */
 #define ASUS_WMI_DEVID_GPU_MUX		0x00090016
 #define ASUS_WMI_DEVID_GPU_MUX_VIVO	0x00090026
-- 
cgit v1.2.3


From 7725a2dc58632cb44eeef2e5b959ab7b7931298d Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:16 +0100
Subject: platform/x86: asus-armoury: add screen auto-brightness toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add screen_auto_brightness toggle supported on some laptops.

Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://patch.msgid.link/20251102215319.3126879-7-denis.benato@linux.dev
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-armoury.c        | 4 ++++
 include/linux/platform_data/x86/asus-wmi.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c
index 1b972260c5dd..c1dbaed409d2 100644
--- a/drivers/platform/x86/asus-armoury.c
+++ b/drivers/platform/x86/asus-armoury.c
@@ -707,6 +707,9 @@ ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD,
 			"Set the panel refresh overdrive");
 ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD,
 			"Set the panel HD mode to UHD<0> or FHD<1>");
+ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness",
+			ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS,
+			"Set the panel brightness to Off<0> or On<1>");
 ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED,
 			"Show the eGPU connection status");
 
@@ -722,6 +725,7 @@ static const struct asus_attr_group armoury_attr_groups[] = {
 	{ &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE },
 	{ &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD },
 	{ &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD },
+	{ &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS },
 };
 
 static int asus_fw_attr_add(void)
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 9a6433d08973..3af075baf9f7 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -82,6 +82,7 @@
 #define ASUS_WMI_DEVID_LID_FLIP_ROG	0x00060077
 #define ASUS_WMI_DEVID_MINI_LED_MODE	0x0005001E
 #define ASUS_WMI_DEVID_MINI_LED_MODE2	0x0005002E
+#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS	0x0005002A
 
 /* Storage */
 #define ASUS_WMI_DEVID_CARDREADER	0x00080013
-- 
cgit v1.2.3


From d849a9f2380d5287d5133eac5bae602a147b86c2 Mon Sep 17 00:00:00 2001
From: Denis Benato <denis.benato@linux.dev>
Date: Sun, 2 Nov 2025 22:53:18 +0100
Subject: platform/x86: asus-wmi: rename ASUS_WMI_DEVID_PPT_FPPT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maintain power-related WMI macros naming consistency:
rename ASUS_WMI_DEVID_PPT_FPPT to ASUS_WMI_DEVID_PPT_PL3_FPPT.

Link: https://lore.kernel.org/all/cad7b458-5a7a-4975-94a1-d0c74f6f3de5@oracle.com/

Suggested-by: ALOK TIWARI <alok.a.tiwari@oracle.com>
Signed-off-by: Denis Benato <denis.benato@linux.dev>
Link: https://.../
Link: https://patch.msgid.link/20251102215319.3126879-9-denis.benato@linux.dev
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-wmi.c            | 4 ++--
 include/linux/platform_data/x86/asus-wmi.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 6de633d4a748..64cfc0bf98dd 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -1218,7 +1218,7 @@ static ssize_t ppt_fppt_store(struct device *dev,
 	if (value < PPT_TOTAL_MIN || value > PPT_TOTAL_MAX)
 		return -EINVAL;
 
-	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_FPPT, value, &result);
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_PL3_FPPT, value, &result);
 	if (err) {
 		pr_warn("Failed to set ppt_fppt: %d\n", err);
 		return err;
@@ -4602,7 +4602,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 	else if (attr == &dev_attr_ppt_pl1_spl.attr)
 		devid = ASUS_WMI_DEVID_PPT_PL1_SPL;
 	else if (attr == &dev_attr_ppt_fppt.attr)
-		devid = ASUS_WMI_DEVID_PPT_FPPT;
+		devid = ASUS_WMI_DEVID_PPT_PL3_FPPT;
 	else if (attr == &dev_attr_ppt_apu_sppt.attr)
 		devid = ASUS_WMI_DEVID_PPT_APU_SPPT;
 	else if (attr == &dev_attr_ppt_platform_sppt.attr)
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 3af075baf9f7..e7c95e9d29db 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -107,7 +107,7 @@
 #define ASUS_WMI_DEVID_PPT_PL1_SPL		0x001200A3
 #define ASUS_WMI_DEVID_PPT_APU_SPPT		0x001200B0
 #define ASUS_WMI_DEVID_PPT_PLAT_SPPT	0x001200B1
-#define ASUS_WMI_DEVID_PPT_FPPT			0x001200C1
+#define ASUS_WMI_DEVID_PPT_PL3_FPPT		0x001200C1
 #define ASUS_WMI_DEVID_NV_DYN_BOOST		0x001200C0
 #define ASUS_WMI_DEVID_NV_THERM_TARGET	0x001200C2
 
-- 
cgit v1.2.3


From 39ae6c50e599aa0cf62ea3d0dcf06492f7690ed7 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sun, 2 Nov 2025 22:53:19 +0100
Subject: platform/x86: asus-armoury: add ppt_* and nv_* tuning knobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the ppt_* and nv_* tuning knobs that are available via WMI methods
and adds proper min/max levels plus defaults.

The min/max are defined by ASUS and typically gained by looking at what
they allow in the ASUS Armoury Crate application - ASUS does not share
the values outside of this. It could also be possible to gain the AMD
values by use of ryzenadj and testing for the minimum stable value.

The general rule of thumb for adding to the match table is that if the
model range has a single CPU used throughout, then the DMI match can
omit the last letter of the model number as this is the GPU model.

If a min or max value is not provided it is assumed that the particular
setting is not supported, e.g. when ppt_pl2_sppt_min/max is not set.
If a <ppt_setting>_def is not set then the default is assumed to be
<ppt_setting>_max.

It is assumed that at least AC settings are available so that the
firmware attributes will be created - if no DC table is available
and power is on DC, then reading the attributes returns -ENODEV.

Co-developed-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Denis Benato <denis.benato@linux.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Mateusz Schyboll <dragonn@op.pl>
Tested-by: Porfet Lillian <porfet828@gmail.com>
Link: https://patch.msgid.link/20251102215319.3126879-10-denis.benato@linux.dev
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-armoury.c        |  302 ++++++-
 drivers/platform/x86/asus-armoury.h        | 1294 ++++++++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h |    3 +
 3 files changed, 1593 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c
index c1dbaed409d2..d6aba68515e2 100644
--- a/drivers/platform/x86/asus-armoury.c
+++ b/drivers/platform/x86/asus-armoury.c
@@ -30,6 +30,7 @@
 #include <linux/pci.h>
 #include <linux/platform_data/x86/asus-wmi.h>
 #include <linux/printk.h>
+#include <linux/power_supply.h>
 #include <linux/sysfs.h>
 
 #include "asus-armoury.h"
@@ -48,6 +49,33 @@
 #define ASUS_MINI_LED_2024_STRONG 0x01
 #define ASUS_MINI_LED_2024_OFF    0x02
 
+/* Power tunable attribute name defines */
+#define ATTR_PPT_PL1_SPL        "ppt_pl1_spl"
+#define ATTR_PPT_PL2_SPPT       "ppt_pl2_sppt"
+#define ATTR_PPT_PL3_FPPT       "ppt_pl3_fppt"
+#define ATTR_PPT_APU_SPPT       "ppt_apu_sppt"
+#define ATTR_PPT_PLATFORM_SPPT  "ppt_platform_sppt"
+#define ATTR_NV_DYNAMIC_BOOST   "nv_dynamic_boost"
+#define ATTR_NV_TEMP_TARGET     "nv_temp_target"
+#define ATTR_NV_BASE_TGP        "nv_base_tgp"
+#define ATTR_NV_TGP             "nv_tgp"
+
+#define ASUS_ROG_TUNABLE_DC 0
+#define ASUS_ROG_TUNABLE_AC 1
+
+struct rog_tunables {
+	const struct power_limits *power_limits;
+	u32 ppt_pl1_spl;			// cpu
+	u32 ppt_pl2_sppt;			// cpu
+	u32 ppt_pl3_fppt;			// cpu
+	u32 ppt_apu_sppt;			// plat
+	u32 ppt_platform_sppt;		// plat
+
+	u32 nv_dynamic_boost;
+	u32 nv_temp_target;
+	u32 nv_tgp;
+};
+
 struct asus_armoury_priv {
 	struct device *fw_attr_dev;
 	struct kset *fw_attr_kset;
@@ -60,6 +88,9 @@ struct asus_armoury_priv {
 	 */
 	struct mutex egpu_mutex;
 
+	/* Index 0 for DC, 1 for AC */
+	struct rog_tunables *rog_tunables[2];
+
 	u32 mini_led_dev_id;
 	u32 gpu_mux_dev_id;
 };
@@ -290,6 +321,12 @@ static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr,
 	return sysfs_emit(buf, "enumeration\n");
 }
 
+static ssize_t int_type_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return sysfs_emit(buf, "integer\n");
+}
+
 /* Mini-LED mode **************************************************************/
 
 /* Values map for mini-led modes on 2023 and earlier models. */
@@ -696,6 +733,15 @@ static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_at
 }
 ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use");
 
+/* Define helper to access the current power mode tunable values */
+static inline struct rog_tunables *get_current_tunables(void)
+{
+	if (power_supply_is_system_supplied())
+		return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC];
+
+	return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC];
+}
+
 /* Simple attribute creation */
 ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n",
 			    "Show the current mode of charging");
@@ -712,6 +758,24 @@ ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness",
 			"Set the panel brightness to Off<0> or On<1>");
 ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED,
 			"Show the eGPU connection status");
+ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl1_spl, ATTR_PPT_PL1_SPL, ASUS_WMI_DEVID_PPT_PL1_SPL,
+			    "Set the CPU slow package limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl2_sppt, ATTR_PPT_PL2_SPPT, ASUS_WMI_DEVID_PPT_PL2_SPPT,
+			    "Set the CPU fast package limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl3_fppt, ATTR_PPT_PL3_FPPT, ASUS_WMI_DEVID_PPT_PL3_FPPT,
+			    "Set the CPU fastest package limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_apu_sppt, ATTR_PPT_APU_SPPT, ASUS_WMI_DEVID_PPT_APU_SPPT,
+			    "Set the APU package limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_platform_sppt, ATTR_PPT_PLATFORM_SPPT, ASUS_WMI_DEVID_PPT_PLAT_SPPT,
+			    "Set the platform package limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(nv_dynamic_boost, ATTR_NV_DYNAMIC_BOOST, ASUS_WMI_DEVID_NV_DYN_BOOST,
+			    "Set the Nvidia dynamic boost limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(nv_temp_target, ATTR_NV_TEMP_TARGET, ASUS_WMI_DEVID_NV_THERM_TARGET,
+			    "Set the Nvidia max thermal limit");
+ASUS_ATTR_GROUP_ROG_TUNABLE(nv_tgp, ATTR_NV_TGP, ASUS_WMI_DEVID_DGPU_SET_TGP,
+			    "Set the additional TGP on top of the base TGP");
+ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(nv_base_tgp, ATTR_NV_BASE_TGP, ASUS_WMI_DEVID_DGPU_BASE_TGP,
+				  "Read the base TGP value");
 
 /* If an attribute does not require any special case handling add it here */
 static const struct asus_attr_group armoury_attr_groups[] = {
@@ -720,6 +784,16 @@ static const struct asus_attr_group armoury_attr_groups[] = {
 	{ &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU },
 	{ &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM },
 
+	{ &ppt_pl1_spl_attr_group, ASUS_WMI_DEVID_PPT_PL1_SPL },
+	{ &ppt_pl2_sppt_attr_group, ASUS_WMI_DEVID_PPT_PL2_SPPT },
+	{ &ppt_pl3_fppt_attr_group, ASUS_WMI_DEVID_PPT_PL3_FPPT },
+	{ &ppt_apu_sppt_attr_group, ASUS_WMI_DEVID_PPT_APU_SPPT },
+	{ &ppt_platform_sppt_attr_group, ASUS_WMI_DEVID_PPT_PLAT_SPPT },
+	{ &nv_dynamic_boost_attr_group, ASUS_WMI_DEVID_NV_DYN_BOOST },
+	{ &nv_temp_target_attr_group, ASUS_WMI_DEVID_NV_THERM_TARGET },
+	{ &nv_base_tgp_attr_group, ASUS_WMI_DEVID_DGPU_BASE_TGP },
+	{ &nv_tgp_attr_group, ASUS_WMI_DEVID_DGPU_SET_TGP },
+
 	{ &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE },
 	{ &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND },
 	{ &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE },
@@ -728,8 +802,76 @@ static const struct asus_attr_group armoury_attr_groups[] = {
 	{ &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS },
 };
 
+/**
+ * is_power_tunable_attr - Test whether an attribute name is a ROG power tunable
+ * @name: Attribute group name to look up
+ *
+ * Compares @name against the fixed list of PPT and Nvidia tunable names.
+ *
+ * Return: true when @name is one of the listed tunables, false otherwise
+ */
+static bool is_power_tunable_attr(const char *name)
+{
+	static const char * const tunable_names[] = {
+		ATTR_PPT_PL1_SPL, ATTR_PPT_PL2_SPPT, ATTR_PPT_PL3_FPPT,
+		ATTR_PPT_APU_SPPT, ATTR_PPT_PLATFORM_SPPT,
+		ATTR_NV_DYNAMIC_BOOST, ATTR_NV_TEMP_TARGET,
+		ATTR_NV_BASE_TGP, ATTR_NV_TGP,
+	};
+	unsigned int idx = 0;
+
+	while (idx < ARRAY_SIZE(tunable_names)) {
+		if (strcmp(name, tunable_names[idx++]) == 0)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * has_valid_limit - Report whether a power tunable has a usable maximum
+ * @name: Attribute group name of the tunable
+ * @limits: Limits table to consult; may be NULL
+ *
+ * Looks up the *_max field that belongs to @name. A NULL @limits, a name that
+ * matches no tunable, and a maximum of zero all yield false.
+ *
+ * Return: true when the matching maximum is non-zero, false otherwise
+ */
+static bool has_valid_limit(const char *name, const struct power_limits *limits)
+{
+	if (!limits)
+		return false;
+
+	if (strcmp(name, ATTR_PPT_PL1_SPL) == 0)
+		return limits->ppt_pl1_spl_max > 0;
+	if (strcmp(name, ATTR_PPT_PL2_SPPT) == 0)
+		return limits->ppt_pl2_sppt_max > 0;
+	if (strcmp(name, ATTR_PPT_PL3_FPPT) == 0)
+		return limits->ppt_pl3_fppt_max > 0;
+	if (strcmp(name, ATTR_PPT_APU_SPPT) == 0)
+		return limits->ppt_apu_sppt_max > 0;
+	if (strcmp(name, ATTR_PPT_PLATFORM_SPPT) == 0)
+		return limits->ppt_platform_sppt_max > 0;
+	if (strcmp(name, ATTR_NV_DYNAMIC_BOOST) == 0)
+		return limits->nv_dynamic_boost_max > 0;
+	if (strcmp(name, ATTR_NV_TEMP_TARGET) == 0)
+		return limits->nv_temp_target_max > 0;
+
+	/* The base TGP attribute is gated on the same maximum as nv_tgp */
+	if (strcmp(name, ATTR_NV_BASE_TGP) == 0 ||
+	    strcmp(name, ATTR_NV_TGP) == 0)
+		return limits->nv_tgp_max > 0;
+
+	return false;
+}
+
 static int asus_fw_attr_add(void)
 {
+	const struct rog_tunables *const ac_rog_tunables = asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC];
+	const struct power_limits *limits;	/* AC limits used to gate tunable creation */
+	bool should_create;
+	const char *name;
 	int err, i;
 
 	asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0),
@@ -786,12 +928,28 @@ static int asus_fw_attr_add(void)
 		if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid))
 			continue;
 
-		err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj,
-					 armoury_attr_groups[i].attr_group);
-		if (err) {
-			pr_err("Failed to create sysfs-group for %s\n",
-			       armoury_attr_groups[i].attr_group->name);
-			goto err_remove_groups;
+		/* Always create by default, unless PPT is not present */
+		should_create = true;
+		name = armoury_attr_groups[i].attr_group->name;
+
+		/* Check if this is a power-related tunable requiring limits */
+		if (ac_rog_tunables && ac_rog_tunables->power_limits &&
+		    is_power_tunable_attr(name)) {
+			limits = ac_rog_tunables->power_limits;
+			/* Check only AC: if not present then DC won't be either */
+			should_create = has_valid_limit(name, limits);
+			if (!should_create)
+				pr_debug("Missing max value for tunable %s\n", name);
+		}
+
+		if (should_create) {
+			err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj,
+						 armoury_attr_groups[i].attr_group);
+			if (err) {
+				pr_err("Failed to create sysfs-group for %s\n",
+				       armoury_attr_groups[i].attr_group->name);
+				goto err_remove_groups;
+			}
 		}
 	}
 
@@ -820,6 +978,132 @@ fail_class_get:
 
 /* Init / exit ****************************************************************/
 
+/*
+ * Seed the AC and DC ROG tunables from the DMI-matched limits (*_def, else *_max).
+ */
+static void init_rog_tunables(void)
+{
+	const struct power_limits *ac_limits, *dc_limits;
+	struct rog_tunables *ac_rog_tunables = NULL, *dc_rog_tunables = NULL;
+	const struct power_data *power_data;
+	const struct dmi_system_id *dmi_id;
+
+	/* Match the system against the power_limits table */
+	dmi_id = dmi_first_match(power_limits);
+	if (!dmi_id) {
+		pr_warn("No matching power limits found for this system\n");
+		return;
+	}
+
+	/* Get the power data for this system */
+	power_data = dmi_id->driver_data;
+	if (!power_data) {
+		pr_info("No power data available for this system\n");
+		return;
+	}
+
+	/* Initialize AC power tunables */
+	ac_limits = power_data->ac_data;
+	if (ac_limits) {
+		ac_rog_tunables = kzalloc(sizeof(*ac_rog_tunables), GFP_KERNEL);
+		if (!ac_rog_tunables)
+			goto err_nomem;
+
+		asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC] = ac_rog_tunables;
+		ac_rog_tunables->power_limits = ac_limits;
+
+		/* Set initial AC values: *_def when non-zero, otherwise *_max */
+		ac_rog_tunables->ppt_pl1_spl =
+			ac_limits->ppt_pl1_spl_def ?
+				ac_limits->ppt_pl1_spl_def :
+				ac_limits->ppt_pl1_spl_max;
+
+		ac_rog_tunables->ppt_pl2_sppt =
+			ac_limits->ppt_pl2_sppt_def ?
+				ac_limits->ppt_pl2_sppt_def :
+				ac_limits->ppt_pl2_sppt_max;
+
+		ac_rog_tunables->ppt_pl3_fppt =
+			ac_limits->ppt_pl3_fppt_def ?
+				ac_limits->ppt_pl3_fppt_def :
+				ac_limits->ppt_pl3_fppt_max;
+
+		ac_rog_tunables->ppt_apu_sppt =
+			ac_limits->ppt_apu_sppt_def ?
+				ac_limits->ppt_apu_sppt_def :
+				ac_limits->ppt_apu_sppt_max;
+
+		ac_rog_tunables->ppt_platform_sppt =
+			ac_limits->ppt_platform_sppt_def ?
+				ac_limits->ppt_platform_sppt_def :
+				ac_limits->ppt_platform_sppt_max;
+
+		ac_rog_tunables->nv_dynamic_boost =
+			ac_limits->nv_dynamic_boost_max;
+		ac_rog_tunables->nv_temp_target =
+			ac_limits->nv_temp_target_max;
+		ac_rog_tunables->nv_tgp = ac_limits->nv_tgp_max;
+
+		pr_debug("AC power limits initialized for %s\n", dmi_id->matches[0].substr);
+	} else {
+		pr_debug("No AC PPT limits defined\n");
+	}
+
+	/* Initialize DC power tunables */
+	dc_limits = power_data->dc_data;
+	if (dc_limits) {
+		dc_rog_tunables = kzalloc(sizeof(*dc_rog_tunables), GFP_KERNEL);
+		if (!dc_rog_tunables)
+			goto err_nomem;	/* also releases the AC set */
+
+		asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC] = dc_rog_tunables;
+		dc_rog_tunables->power_limits = dc_limits;
+
+		/* Set initial DC values: *_def when non-zero, otherwise *_max */
+		dc_rog_tunables->ppt_pl1_spl =
+			dc_limits->ppt_pl1_spl_def ?
+				dc_limits->ppt_pl1_spl_def :
+				dc_limits->ppt_pl1_spl_max;
+
+		dc_rog_tunables->ppt_pl2_sppt =
+			dc_limits->ppt_pl2_sppt_def ?
+				dc_limits->ppt_pl2_sppt_def :
+				dc_limits->ppt_pl2_sppt_max;
+
+		dc_rog_tunables->ppt_pl3_fppt =
+			dc_limits->ppt_pl3_fppt_def ?
+				dc_limits->ppt_pl3_fppt_def :
+				dc_limits->ppt_pl3_fppt_max;
+
+		dc_rog_tunables->ppt_apu_sppt =
+			dc_limits->ppt_apu_sppt_def ?
+				dc_limits->ppt_apu_sppt_def :
+				dc_limits->ppt_apu_sppt_max;
+
+		dc_rog_tunables->ppt_platform_sppt =
+			dc_limits->ppt_platform_sppt_def ?
+				dc_limits->ppt_platform_sppt_def :
+				dc_limits->ppt_platform_sppt_max;
+
+		dc_rog_tunables->nv_dynamic_boost =
+			dc_limits->nv_dynamic_boost_max;
+		dc_rog_tunables->nv_temp_target =
+			dc_limits->nv_temp_target_max;
+		dc_rog_tunables->nv_tgp = dc_limits->nv_tgp_max;
+
+		pr_debug("DC power limits initialized for %s\n", dmi_id->matches[0].substr);
+	} else {
+		pr_debug("No DC PPT limits defined\n");
+	}
+
+	return;
+
+err_nomem:
+	kfree(ac_rog_tunables);	/* no-op when the AC allocation itself failed */
+	asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC] = NULL;	/* never leave a dangling slot */
+	pr_err("Failed to allocate memory for tunables\n");
+}
+
 static int __init asus_fw_init(void)
 {
 	char *wmi_uid;
@@ -835,6 +1119,9 @@ static int __init asus_fw_init(void)
 	if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI))
 		return -ENODEV;
 
+	init_rog_tunables();
+
+	/* Must always be last step to ensure data is available */
 	return asus_fw_attr_add();
 }
 
@@ -857,6 +1144,9 @@ static void __exit asus_fw_exit(void)
 	sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr);
 	kset_unregister(asus_armoury.fw_attr_kset);
 	device_destroy(&firmware_attributes_class, MKDEV(0, 0));
+
+	kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]);
+	kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]);
 }
 
 module_init(asus_fw_init);
diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h
index 3a2a674a1b55..548c66c590a6 100644
--- a/drivers/platform/x86/asus-armoury.h
+++ b/drivers/platform/x86/asus-armoury.h
@@ -8,6 +8,7 @@
 #ifndef _ASUS_ARMOURY_H_
 #define _ASUS_ARMOURY_H_
 
+#include <linux/dmi.h>
 #include <linux/platform_device.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
@@ -197,4 +198,1297 @@ ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr
 		.name = _fsname, .attrs = _attrname##_attrs			\
 	}
 
+#define ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(_attrname, _fsname, _wmi, _dispname)	\
+	ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi);		\
+	static struct kobj_attribute attr_##_attrname##_current_value =		\
+		__ASUS_ATTR_RO(_attrname, current_value);			\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);		\
+	static struct kobj_attribute attr_##_attrname##_type =			\
+		__ASUS_ATTR_RO_AS(type, int_type_show);				\
+	static struct attribute *_attrname##_attrs[] = {			\
+		&attr_##_attrname##_current_value.attr,				\
+		&attr_##_attrname##_display_name.attr,				\
+		&attr_##_attrname##_type.attr, NULL				\
+	};									\
+	static const struct attribute_group _attrname##_attr_group = {		\
+		.name = _fsname, .attrs = _attrname##_attrs			\
+	}
+
+/*
+ * ROG PPT attributes need a slightly different setup as they
+ * require rog_tunables members.
+ */
+
+#define __ROG_TUNABLE_SHOW(_prop, _attrname, _val)				\
+	static ssize_t _attrname##_##_prop##_show(				\
+		struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
+	{									\
+		struct rog_tunables *tunables = get_current_tunables();		\
+										\
+		if (!tunables || !tunables->power_limits)			\
+			return -ENODEV;						\
+										\
+		return sysfs_emit(buf, "%d\n", tunables->power_limits->_val);	\
+	}									\
+	static struct kobj_attribute attr_##_attrname##_##_prop =		\
+		__ASUS_ATTR_RO(_attrname, _prop)
+
+#define __ROG_TUNABLE_SHOW_DEFAULT(_attrname)					\
+	static ssize_t _attrname##_default_value_show(				\
+		struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
+	{									\
+		struct rog_tunables *tunables = get_current_tunables();		\
+										\
+		if (!tunables || !tunables->power_limits)			\
+			return -ENODEV;						\
+										\
+		return sysfs_emit(						\
+			buf, "%d\n",						\
+			tunables->power_limits->_attrname##_def ?		\
+				tunables->power_limits->_attrname##_def :	\
+				tunables->power_limits->_attrname##_max);	\
+	}									\
+	static struct kobj_attribute attr_##_attrname##_default_value =		\
+		__ASUS_ATTR_RO(_attrname, default_value)
+
+#define __ROG_TUNABLE_RW(_attr, _wmi)						\
+	static ssize_t _attr##_current_value_store(				\
+		struct kobject *kobj, struct kobj_attribute *attr,		\
+		const char *buf, size_t count)					\
+	{									\
+		struct rog_tunables *tunables = get_current_tunables();		\
+										\
+		if (!tunables || !tunables->power_limits)			\
+			return -ENODEV;						\
+										\
+		if (tunables->power_limits->_attr##_min ==			\
+		    tunables->power_limits->_attr##_max)			\
+			return -EINVAL;						\
+										\
+		return armoury_attr_uint_store(kobj, attr, buf, count,		\
+				       tunables->power_limits->_attr##_min,	\
+				       tunables->power_limits->_attr##_max,	\
+				       &tunables->_attr, _wmi);			\
+	}									\
+	static ssize_t _attr##_current_value_show(				\
+		struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
+	{									\
+		struct rog_tunables *tunables = get_current_tunables();		\
+										\
+		if (!tunables)							\
+			return -ENODEV;						\
+										\
+		return sysfs_emit(buf, "%u\n", tunables->_attr);		\
+	}									\
+	static struct kobj_attribute attr_##_attr##_current_value =		\
+		__ASUS_ATTR_RW(_attr, current_value)
+
+#define ASUS_ATTR_GROUP_ROG_TUNABLE(_attrname, _fsname, _wmi, _dispname)	\
+	__ROG_TUNABLE_RW(_attrname, _wmi);				\
+	__ROG_TUNABLE_SHOW_DEFAULT(_attrname);				\
+	__ROG_TUNABLE_SHOW(min_value, _attrname, _attrname##_min);	\
+	__ROG_TUNABLE_SHOW(max_value, _attrname, _attrname##_max);	\
+	__ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1);	\
+	__ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname);	\
+	static struct kobj_attribute attr_##_attrname##_type =		\
+		__ASUS_ATTR_RO_AS(type, int_type_show);			\
+	static struct attribute *_attrname##_attrs[] = {		\
+		&attr_##_attrname##_current_value.attr,			\
+		&attr_##_attrname##_default_value.attr,			\
+		&attr_##_attrname##_min_value.attr,			\
+		&attr_##_attrname##_max_value.attr,			\
+		&attr_##_attrname##_scalar_increment.attr,		\
+		&attr_##_attrname##_display_name.attr,			\
+		&attr_##_attrname##_type.attr,				\
+		NULL							\
+	};								\
+	static const struct attribute_group _attrname##_attr_group = {	\
+		.name = _fsname, .attrs = _attrname##_attrs		\
+	}
+
+/* Default is always the maximum value unless *_def is specified (i.e. non-zero) */
+struct power_limits {
+	u8 ppt_pl1_spl_min;
+	u8 ppt_pl1_spl_def;
+	u8 ppt_pl1_spl_max;
+	u8 ppt_pl2_sppt_min;
+	u8 ppt_pl2_sppt_def;
+	u8 ppt_pl2_sppt_max;
+	u8 ppt_pl3_fppt_min;
+	u8 ppt_pl3_fppt_def;
+	u8 ppt_pl3_fppt_max;
+	u8 ppt_apu_sppt_min;
+	u8 ppt_apu_sppt_def;
+	u8 ppt_apu_sppt_max;
+	u8 ppt_platform_sppt_min;
+	u8 ppt_platform_sppt_def;
+	u8 ppt_platform_sppt_max;
+	/* Nvidia GPU specific; init_rog_tunables() always starts these at max */
+	u8 nv_dynamic_boost_def; // read by the default_value macro only; 0 reports max
+	u8 nv_dynamic_boost_min;
+	u8 nv_dynamic_boost_max;
+	u8 nv_temp_target_def; // read by the default_value macro only; 0 reports max
+	u8 nv_temp_target_min;
+	u8 nv_temp_target_max;
+	u8 nv_tgp_def; // read by the default_value macro only; 0 reports max
+	u8 nv_tgp_min;
+	u8 nv_tgp_max;
+};
+
+struct power_data {
+		const struct power_limits *ac_data;	// limits for the AC tunables set; may be NULL
+		const struct power_limits *dc_data;	// limits for the DC tunables set; may be NULL
+		bool requires_fan_curve;	// NOTE(review): no reader visible in this chunk — confirm consumer
+};
+
+/*
+ * For each available attribute there must be a min and a max.
+ * _def is not required and will be assumed to be default == max if missing.
+ */
+static const struct dmi_system_id power_limits[] = {
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA401W"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 75,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 30,
+				.ppt_pl2_sppt_min = 31,
+				.ppt_pl2_sppt_max = 44,
+				.ppt_pl3_fppt_min = 45,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA507N"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 45,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA507R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80
+			},
+			.dc_data = NULL,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA507X"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 85,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 45,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA507Z"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 105,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 15,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 85,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 60,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA607P"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 30,
+				.ppt_pl1_spl_def = 100,
+				.ppt_pl1_spl_max = 135,
+				.ppt_pl2_sppt_min = 30,
+				.ppt_pl2_sppt_def = 115,
+				.ppt_pl2_sppt_max = 135,
+				.ppt_pl3_fppt_min = 30,
+				.ppt_pl3_fppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 115,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_def = 45,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_def = 60,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 25,
+				.ppt_pl3_fppt_max = 80,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA608WI"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 90,
+				.ppt_pl1_spl_max = 90,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 90,
+				.ppt_pl2_sppt_max = 90,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 90,
+				.ppt_pl3_fppt_max = 90,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 115,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 45,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 65,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA617NS"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 15,
+				.ppt_apu_sppt_max = 80,
+				.ppt_platform_sppt_min = 30,
+				.ppt_platform_sppt_max = 120,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 25,
+				.ppt_apu_sppt_max = 35,
+				.ppt_platform_sppt_min = 45,
+				.ppt_platform_sppt_max = 100,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA617NT"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 15,
+				.ppt_apu_sppt_max = 80,
+				.ppt_platform_sppt_min = 30,
+				.ppt_platform_sppt_max = 115,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 15,
+				.ppt_apu_sppt_max = 45,
+				.ppt_platform_sppt_min = 30,
+				.ppt_platform_sppt_max = 50,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FA617XS"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 15,
+				.ppt_apu_sppt_max = 80,
+				.ppt_platform_sppt_min = 30,
+				.ppt_platform_sppt_max = 120,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 25,
+				.ppt_apu_sppt_max = 35,
+				.ppt_platform_sppt_min = 45,
+				.ppt_platform_sppt_max = 100,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FX507VI"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 135,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 60,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FX507VV"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_def = 115,
+				.ppt_pl1_spl_max = 135,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 60,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "FX507Z"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 90,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 15,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 60,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA401Q"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_max = 80,
+			},
+			.dc_data = NULL,
+		},
+	},
+	{
+		.matches = {
+			// This model is full AMD. No Nvidia dGPU.
+			DMI_MATCH(DMI_BOARD_NAME, "GA402R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 15,
+				.ppt_apu_sppt_max = 80,
+				.ppt_platform_sppt_min = 30,
+				.ppt_platform_sppt_max = 115,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_apu_sppt_min = 25,
+				.ppt_apu_sppt_def = 30,
+				.ppt_apu_sppt_max = 45,
+				.ppt_platform_sppt_min = 40,
+				.ppt_platform_sppt_max = 60,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA402X"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 35,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_def = 65,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 35,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA403U"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 65,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 35,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA503R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 35,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 65,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 25,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 60,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA605W"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 85,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 31,
+				.ppt_pl2_sppt_max = 44,
+				.ppt_pl3_fppt_min = 45,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU603Z"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 60,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 40,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 40,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			}
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU604V"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 65,
+				.ppt_pl1_spl_max = 120,
+				.ppt_pl2_sppt_min = 65,
+				.ppt_pl2_sppt_max = 150,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 40,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 40,
+				.ppt_pl2_sppt_max = 60,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU605CW"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 45,
+				.ppt_pl1_spl_max = 85,
+				.ppt_pl2_sppt_min = 56,
+				.ppt_pl2_sppt_max = 110,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 80,
+				.nv_tgp_def = 90,
+				.nv_tgp_max = 110,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 85,
+				.ppt_pl2_sppt_min = 32,
+				.ppt_pl2_sppt_max = 110,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU605CX"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 45,
+				.ppt_pl1_spl_max = 85,
+				.ppt_pl2_sppt_min = 56,
+				.ppt_pl2_sppt_max = 110,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 95,
+				.nv_tgp_def = 100,
+				.nv_tgp_max = 110,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 85,
+				.ppt_pl2_sppt_min = 32,
+				.ppt_pl2_sppt_max = 110,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU605M"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 90,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 38,
+				.ppt_pl2_sppt_max = 53,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GV301Q"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 65,
+				.ppt_pl2_sppt_max = 80,
+			},
+			.dc_data = NULL,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GV301R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 45,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 54,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 35,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GV601R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 35,
+				.ppt_pl1_spl_max = 90,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 100,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 80,
+				.ppt_pl3_fppt_max = 125,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 28,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 54,
+				.ppt_pl2_sppt_max = 60,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 80,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GV601V"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_def = 100,
+				.ppt_pl1_spl_max = 110,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 40,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 40,
+				.ppt_pl2_sppt_max = 60,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GX650P"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 110,
+				.ppt_pl1_spl_max = 130,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 125,
+				.ppt_pl2_sppt_max = 130,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 125,
+				.ppt_pl3_fppt_max = 135,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_def = 25,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_def = 35,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_def = 42,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G513I"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				/* Yes this laptop is very limited */
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_max = 80,
+			},
+			.dc_data = NULL,
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G513QM"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				/* Yes this laptop is very limited */
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 100,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_max = 190,
+			},
+			.dc_data = NULL,
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G513R"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 35,
+				.ppt_pl1_spl_max = 90,
+				.ppt_pl2_sppt_min = 54,
+				.ppt_pl2_sppt_max = 100,
+				.ppt_pl3_fppt_min = 54,
+				.ppt_pl3_fppt_max = 125,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 50,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 50,
+				.ppt_pl3_fppt_min = 28,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G614J"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 140,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 175,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 55,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 70,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G634J"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 140,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 175,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 55,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 70,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G713PV"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 30,
+				.ppt_pl1_spl_def = 120,
+				.ppt_pl1_spl_max = 130,
+				.ppt_pl2_sppt_min = 65,
+				.ppt_pl2_sppt_def = 125,
+				.ppt_pl2_sppt_max = 130,
+				.ppt_pl3_fppt_min = 65,
+				.ppt_pl3_fppt_def = 125,
+				.ppt_pl3_fppt_max = 130,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 75,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G733C"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 170,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 175,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 35,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G733P"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 30,
+				.ppt_pl1_spl_def = 100,
+				.ppt_pl1_spl_max = 130,
+				.ppt_pl2_sppt_min = 65,
+				.ppt_pl2_sppt_def = 125,
+				.ppt_pl2_sppt_max = 130,
+				.ppt_pl3_fppt_min = 65,
+				.ppt_pl3_fppt_def = 125,
+				.ppt_pl3_fppt_max = 130,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 65,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 65,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 75,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G814J"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 140,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 140,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 55,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 70,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "G834J"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 28,
+				.ppt_pl1_spl_max = 140,
+				.ppt_pl2_sppt_min = 28,
+				.ppt_pl2_sppt_max = 175,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 25,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 55,
+				.ppt_pl2_sppt_min = 25,
+				.ppt_pl2_sppt_max = 70,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+			.requires_fan_curve = true,
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "H7606W"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 15,
+				.ppt_pl1_spl_max = 80,
+				.ppt_pl2_sppt_min = 35,
+				.ppt_pl2_sppt_max = 80,
+				.ppt_pl3_fppt_min = 35,
+				.ppt_pl3_fppt_max = 80,
+				.nv_dynamic_boost_min = 5,
+				.nv_dynamic_boost_max = 20,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+				.nv_tgp_min = 55,
+				.nv_tgp_max = 85,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 25,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 31,
+				.ppt_pl2_sppt_max = 44,
+				.ppt_pl3_fppt_min = 45,
+				.ppt_pl3_fppt_max = 65,
+				.nv_temp_target_min = 75,
+				.nv_temp_target_max = 87,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "RC71"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_max = 30,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_max = 43,
+				.ppt_pl3_fppt_min = 15,
+				.ppt_pl3_fppt_max = 53,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_def = 15,
+				.ppt_pl1_spl_max = 25,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_def = 20,
+				.ppt_pl2_sppt_max = 30,
+				.ppt_pl3_fppt_min = 15,
+				.ppt_pl3_fppt_def = 25,
+				.ppt_pl3_fppt_max = 35,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "RC72"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_max = 30,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_max = 43,
+				.ppt_pl3_fppt_min = 15,
+				.ppt_pl3_fppt_max = 53,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_def = 17,
+				.ppt_pl1_spl_max = 25,
+				.ppt_pl2_sppt_min = 15,
+				.ppt_pl2_sppt_def = 24,
+				.ppt_pl2_sppt_max = 30,
+				.ppt_pl3_fppt_min = 15,
+				.ppt_pl3_fppt_def = 30,
+				.ppt_pl3_fppt_max = 35,
+			},
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "RC73XA"),
+		},
+		.driver_data = &(struct power_data) {
+			.ac_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 14,
+				.ppt_pl2_sppt_max = 45,
+				.ppt_pl3_fppt_min = 19,
+				.ppt_pl3_fppt_max = 55,
+			},
+			.dc_data = &(struct power_limits) {
+				.ppt_pl1_spl_min = 7,
+				.ppt_pl1_spl_def = 17,
+				.ppt_pl1_spl_max = 35,
+				.ppt_pl2_sppt_min = 13,
+				.ppt_pl2_sppt_def = 21,
+				.ppt_pl2_sppt_max = 45,
+				.ppt_pl3_fppt_min = 19,
+				.ppt_pl3_fppt_def = 26,
+				.ppt_pl3_fppt_max = 55,
+			},
+		},
+	},
+	{}
+};
+
 #endif /* _ASUS_ARMOURY_H_ */
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index e7c95e9d29db..419491d4abca 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -139,6 +139,9 @@
 
 #define ASUS_WMI_DEVID_APU_MEM		0x000600C1
 
+#define ASUS_WMI_DEVID_DGPU_BASE_TGP	0x00120099
+#define ASUS_WMI_DEVID_DGPU_SET_TGP	0x00120098
+
 /* gpu mux switch, 0 = dGPU, 1 = Optimus */
 #define ASUS_WMI_DEVID_GPU_MUX		0x00090016
 #define ASUS_WMI_DEVID_GPU_MUX_VIVO	0x00090026
-- 
cgit v1.2.3


From 32e3fee88a4ac183541b478f5bc94084ea76436c Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Tue, 11 Nov 2025 14:11:24 +0100
Subject: platform/x86: wmi: Remove extern keyword from prototypes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The external function prototypes do not need the "extern" keyword.
Remove it to silence the associated checkpatch warnings.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://patch.msgid.link/20251111131125.3379-4-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/wmi.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 10751c8e5e6a..665ea7dc8a92 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -36,13 +36,10 @@ struct wmi_device {
  */
 #define to_wmi_device(device)	container_of_const(device, struct wmi_device, dev)
 
-extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
-					  u8 instance, u32 method_id,
-					  const struct acpi_buffer *in,
-					  struct acpi_buffer *out);
+acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id,
+				   const struct acpi_buffer *in, struct acpi_buffer *out);
 
-extern union acpi_object *wmidev_block_query(struct wmi_device *wdev,
-					     u8 instance);
+union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance);
 
 acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in);
 
@@ -81,9 +78,9 @@ struct wmi_driver {
  */
 #define to_wmi_driver(drv)	container_of_const(drv, struct wmi_driver, driver)
 
-extern int __must_check __wmi_driver_register(struct wmi_driver *driver,
-					      struct module *owner);
-extern void wmi_driver_unregister(struct wmi_driver *driver);
+int __must_check __wmi_driver_register(struct wmi_driver *driver, struct module *owner);
+
+void wmi_driver_unregister(struct wmi_driver *driver);
 
 /**
  * wmi_driver_register() - Helper macro to register a WMI driver
-- 
cgit v1.2.3


From 6eb2e056b0e418718fc5a3cfe79bdb41d9a2851d Mon Sep 17 00:00:00 2001
From: Dnyaneshwar Bhadane <dnyaneshwar.bhadane@intel.com>
Date: Mon, 22 Sep 2025 20:33:15 +0530
Subject: drm/pcids: Split PTL pciids group to make wcl subplatform

To form the WCL platform as a subplatform of PTL in definition,
WCL PCI IDs are split into a separate group from PTL.
So update the pciidlist struct to cover all the pci ids.

v2:
- Squash wcl description in single patch for display and xe.(jani,gustavo)

Fixes: 3c0f211bc8fc ("drm/xe: Add Wildcat Lake device IDs to PTL list")
Signed-off-by: Dnyaneshwar Bhadane <dnyaneshwar.bhadane@intel.com>
Reviewed-by: Gustavo Sousa <gustavo.sousa@intel.com>
Signed-off-by: Suraj Kandpal <suraj.kandpal@intel.com>
Link: https://lore.kernel.org/r/20250922150317.2334680-2-dnyaneshwar.bhadane@intel.com
(cherry picked from commit 32620e176443bf23ec81bfe8f177c6721a904864)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
[Rodrigo added the Fixes tag when porting it to fixes]
---
 drivers/gpu/drm/i915/display/intel_display_device.c | 1 +
 drivers/gpu/drm/xe/xe_pci.c                         | 1 +
 include/drm/intel/pciids.h                          | 5 ++++-
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/display/intel_display_device.c b/drivers/gpu/drm/i915/display/intel_display_device.c
index a002bc6ce7b0..a9a36176096f 100644
--- a/drivers/gpu/drm/i915/display/intel_display_device.c
+++ b/drivers/gpu/drm/i915/display/intel_display_device.c
@@ -1482,6 +1482,7 @@ static const struct {
 	INTEL_LNL_IDS(INTEL_DISPLAY_DEVICE, &lnl_desc),
 	INTEL_BMG_IDS(INTEL_DISPLAY_DEVICE, &bmg_desc),
 	INTEL_PTL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc),
+	INTEL_WCL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc),
 };
 
 static const struct {
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 9a6df79fc5b6..89cc6d32f041 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -375,6 +375,7 @@ static const struct pci_device_id pciidlist[] = {
 	INTEL_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc),
 	INTEL_BMG_IDS(INTEL_VGA_DEVICE, &bmg_desc),
 	INTEL_PTL_IDS(INTEL_VGA_DEVICE, &ptl_desc),
+	INTEL_WCL_IDS(INTEL_VGA_DEVICE, &ptl_desc),
 	{ }
 };
 MODULE_DEVICE_TABLE(pci, pciidlist);
diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index da6301a6fcea..69d4ae92d822 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -877,7 +877,10 @@
 	MACRO__(0xB08F, ## __VA_ARGS__), \
 	MACRO__(0xB090, ## __VA_ARGS__), \
 	MACRO__(0xB0A0, ## __VA_ARGS__), \
-	MACRO__(0xB0B0, ## __VA_ARGS__), \
+	MACRO__(0xB0B0, ## __VA_ARGS__)
+
+/* WCL */
+#define INTEL_WCL_IDS(MACRO__, ...) \
 	MACRO__(0xFD80, ## __VA_ARGS__), \
 	MACRO__(0xFD81, ## __VA_ARGS__)
 
-- 
cgit v1.2.3


From 80adaccf0e1c8c8fff44be2d959f6dba80af0491 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Tue, 18 Nov 2025 13:52:13 +0300
Subject: rseq: Delete duplicate if statement in rseq_virt_userspace_exit()

This if statement is indented weirdly.  It's a duplicate and doesn't
affect runtime (beyond wasting a little time).  Delete it.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/aRxP3YcwscrP1BU_@stanley.mountain
---
 include/linux/rseq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index b5e4803c4ebe..bf8a6bf315f3 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -126,7 +126,6 @@ static inline void rseq_force_update(void)
  */
 static inline void rseq_virt_userspace_exit(void)
 {
-	if (current->rseq.event.sched_switch)
 	/*
 	 * The generic optimization for deferring RSEQ updates until the next
 	 * exit relies on having a dedicated TIF_RSEQ.
-- 
cgit v1.2.3


From 5bebe8de19264946d398ead4e6c20c229454a552 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 18 Nov 2025 08:21:27 -0800
Subject: mm/huge_memory: Fix initialization of huge zero folio

The recent fix to properly initialize the tags of the huge zero folio
had an unfortunate not-so-subtle side effect: it caused the actual
*contents* of the huge zero folio to not be initialized at all when the
hardware didn't support the memory tagging.

The reason was the unfortunate semantics of tag_clear_highpage(): on
hardware that didn't do the tagging, it would silently just not do
anything at all.  And since this is done only on arm64 with MTE support,
that basically meant most hardware.

It wasn't necessarily immediately obvious since the huge zero page isn't
necessarily very heavily used - or because it might already be zero
because all-zeroes is the most common pattern.  But it ends up causing
random odd user space failures when you do hit it.

The unfortunate semantics have been around for a while, but became a
real bug only when we started actively using __GFP_ZEROTAGS in the
generic get_huge_zero_folio() function - before that, it had only ever
been used in code that checked that the hardware supported it.

Fix this by simply changing the semantics of tag_clear_highpage() to
return whether it actually successfully did something or not.  While at
it, also make it initialize multiple pages in one go, since that's
actually what the only caller wants it to do and it simplifies the whole
logic.

Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio")
Link: https://lore.kernel.org/all/20251117082023.90176-1-00107082@163.com/
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reported-and-tested-by: David Wang <00107082@163.com>
Reported-and-tested-by: Carlos Llamas <cmllamas@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/page.h |  4 ++--
 arch/arm64/mm/fault.c         | 21 +++++++++++----------
 include/linux/highmem.h       |  6 ++++--
 mm/page_alloc.c               |  9 ++-------
 4 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 2312e6ee595f..258cca4b4873 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -33,8 +33,8 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 						unsigned long vaddr);
 #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio
 
-void tag_clear_highpage(struct page *to);
-#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+bool tag_clear_highpages(struct page *to, int numpages);
+#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES
 
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 125dfa6c613b..a193b6a5d1e6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -967,20 +967,21 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 	return vma_alloc_folio(flags, 0, vma, vaddr);
 }
 
-void tag_clear_highpage(struct page *page)
+bool tag_clear_highpages(struct page *page, int numpages)
 {
 	/*
 	 * Check if MTE is supported and fall back to clear_highpage().
 	 * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
-	 * post_alloc_hook() will invoke tag_clear_highpage().
+	 * post_alloc_hook() will invoke tag_clear_highpages().
 	 */
-	if (!system_supports_mte()) {
-		clear_highpage(page);
-		return;
-	}
+	if (!system_supports_mte())
+		return false;
 
-	/* Newly allocated page, shouldn't have been tagged yet */
-	WARN_ON_ONCE(!try_page_mte_tagging(page));
-	mte_zero_clear_page_tags(page_address(page));
-	set_page_mte_tagged(page);
+	/* Newly allocated pages, shouldn't have been tagged yet */
+	for (int i = 0; i < numpages; i++, page++) {
+		WARN_ON_ONCE(!try_page_mte_tagging(page));
+		mte_zero_clear_page_tags(page_address(page));
+		set_page_mte_tagged(page);
+	}
+	return true;
 }
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 105cc4c00cc3..abc20f9810fd 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page)
 	kunmap_local(kaddr);
 }
 
-#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES
 
-static inline void tag_clear_highpage(struct page *page)
+/* Return false to let people know we did not initialize the pages */
+static inline bool tag_clear_highpages(struct page *page, int numpages)
 {
+	return false;
 }
 
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..ed82ee55e66a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	 * If memory tags should be zeroed
 	 * (which happens only when memory should be initialized as well).
 	 */
-	if (zero_tags) {
-		/* Initialize both memory and memory tags. */
-		for (i = 0; i != 1 << order; ++i)
-			tag_clear_highpage(page + i);
+	if (zero_tags)
+		init = !tag_clear_highpages(page, 1 << order);
 
-		/* Take note that memory was initialized by the loop above. */
-		init = false;
-	}
 	if (!should_skip_kasan_unpoison(gfp_flags) &&
 	    kasan_unpoison_pages(page, order, init)) {
 		/* Take note that memory was initialized by KASAN. */
-- 
cgit v1.2.3


From c57210bc15371caa06a5d4040e7d8aaeed4cb661 Mon Sep 17 00:00:00 2001
From: Alexey Minnekhanov <alexeymin@postmarketos.org>
Date: Sun, 16 Nov 2025 04:12:33 +0300
Subject: dt-bindings: clock: mmcc-sdm660: Add missing MDSS reset

Add definition for display subsystem reset control, so display
driver can reset display controller properly, clearing any
configuration left there by bootloader. Since 6.17 after
PM domains rework it became necessary for display to function.

Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state")
Cc: stable@vger.kernel.org # 6.17
Signed-off-by: Alexey Minnekhanov <alexeymin@postmarketos.org>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20251116-sdm660-mdss-reset-v2-1-6219bec0a97f@postmarketos.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,mmcc-sdm660.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,mmcc-sdm660.h b/include/dt-bindings/clock/qcom,mmcc-sdm660.h
index f9dbc21cb5c7..ee2a89dae72d 100644
--- a/include/dt-bindings/clock/qcom,mmcc-sdm660.h
+++ b/include/dt-bindings/clock/qcom,mmcc-sdm660.h
@@ -157,6 +157,7 @@
 #define BIMC_SMMU_GDSC							7
 
 #define CAMSS_MICRO_BCR				 0
+#define MDSS_BCR				1
 
 #endif
 
-- 
cgit v1.2.3


From 23818ebb9c76bac8dfedec252cf33157230efc23 Mon Sep 17 00:00:00 2001
From: Xuyang Dong <dongxuyang@eswincomputing.com>
Date: Tue, 30 Sep 2025 17:32:18 +0800
Subject: dt-bindings: reset: eswin: Documentation for eic7700 SoC

Add device tree binding documentation and header file for the ESWIN
eic7700 reset controller module.

Signed-off-by: Yifeng Huang <huangyifeng@eswincomputing.com>
Signed-off-by: Xuyang Dong <dongxuyang@eswincomputing.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 .../bindings/reset/eswin,eic7700-reset.yaml        |  42 +++
 include/dt-bindings/reset/eswin,eic7700-reset.h    | 298 +++++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/reset/eswin,eic7700-reset.yaml
 create mode 100644 include/dt-bindings/reset/eswin,eic7700-reset.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/eswin,eic7700-reset.yaml b/Documentation/devicetree/bindings/reset/eswin,eic7700-reset.yaml
new file mode 100644
index 000000000000..cf2fdb907571
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/eswin,eic7700-reset.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reset/eswin,eic7700-reset.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ESWIN EIC7700 SoC reset controller
+
+maintainers:
+  - Yifeng Huang <huangyifeng@eswincomputing.com>
+  - Xuyang Dong <dongxuyang@eswincomputing.com>
+
+description:
+  The system reset controller can be used to reset various peripheral
+  controllers in ESWIN eic7700 SoC.
+
+properties:
+  compatible:
+    const: eswin,eic7700-reset
+
+  reg:
+    maxItems: 1
+
+  '#reset-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/reset/eswin,eic7700-reset.h>
+
+    reset-controller@51828300 {
+        compatible = "eswin,eic7700-reset";
+        reg = <0x51828300 0x200>;
+        #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/reset/eswin,eic7700-reset.h b/include/dt-bindings/reset/eswin,eic7700-reset.h
new file mode 100644
index 000000000000..a370c9f74307
--- /dev/null
+++ b/include/dt-bindings/reset/eswin,eic7700-reset.h
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright 2025, Beijing ESWIN Computing Technology Co., Ltd..
+ * All rights reserved.
+ *
+ * Device Tree binding constants for EIC7700 reset controller.
+ *
+ * Authors:
+ *	Yifeng Huang <huangyifeng@eswincomputing.com>
+ *	Xuyang Dong <dongxuyang@eswincomputing.com>
+ */
+
+#ifndef __DT_ESWIN_EIC7700_RESET_H__
+#define __DT_ESWIN_EIC7700_RESET_H__
+
+#define EIC7700_RESET_NOC_NSP		0
+#define EIC7700_RESET_NOC_CFG		1
+#define EIC7700_RESET_RNOC_NSP		2
+#define EIC7700_RESET_SNOC_TCU		3
+#define EIC7700_RESET_SNOC_U84		4
+#define EIC7700_RESET_SNOC_PCIE_XSR	5
+#define EIC7700_RESET_SNOC_PCIE_XMR	6
+#define EIC7700_RESET_SNOC_PCIE_PR	7
+#define EIC7700_RESET_SNOC_NPU		8
+#define EIC7700_RESET_SNOC_JTAG		9
+#define EIC7700_RESET_SNOC_DSP		10
+#define EIC7700_RESET_SNOC_DDRC1_P2	11
+#define EIC7700_RESET_SNOC_DDRC1_P1	12
+#define EIC7700_RESET_SNOC_DDRC0_P2	13
+#define EIC7700_RESET_SNOC_DDRC0_P1	14
+#define EIC7700_RESET_SNOC_D2D		15
+#define EIC7700_RESET_SNOC_AON		16
+#define EIC7700_RESET_GPU_AXI		17
+#define EIC7700_RESET_GPU_CFG		18
+#define EIC7700_RESET_GPU_GRAY		19
+#define EIC7700_RESET_GPU_JONES		20
+#define EIC7700_RESET_GPU_SPU		21
+#define EIC7700_RESET_DSP_AXI		22
+#define EIC7700_RESET_DSP_CFG		23
+#define EIC7700_RESET_DSP_DIV4		24
+#define EIC7700_RESET_DSP_DIV0		25
+#define EIC7700_RESET_DSP_DIV1		26
+#define EIC7700_RESET_DSP_DIV2		27
+#define EIC7700_RESET_DSP_DIV3		28
+#define EIC7700_RESET_D2D_AXI		29
+#define EIC7700_RESET_D2D_CFG		30
+#define EIC7700_RESET_D2D_PRST		31
+#define EIC7700_RESET_D2D_RAW_PCS	32
+#define EIC7700_RESET_D2D_RX		33
+#define EIC7700_RESET_D2D_TX		34
+#define EIC7700_RESET_D2D_CORE		35
+#define EIC7700_RESET_DDR1_ARST		36
+#define EIC7700_RESET_DDR1_TRACE	37
+#define EIC7700_RESET_DDR0_ARST		38
+#define EIC7700_RESET_DDR_CFG		39
+#define EIC7700_RESET_DDR0_TRACE	40
+#define EIC7700_RESET_DDR_CORE		41
+#define EIC7700_RESET_DDR_PRST		42
+#define EIC7700_RESET_TCU_AXI		43
+#define EIC7700_RESET_TCU_CFG		44
+#define EIC7700_RESET_TCU_TBU0		45
+#define EIC7700_RESET_TCU_TBU1		46
+#define EIC7700_RESET_TCU_TBU2		47
+#define EIC7700_RESET_TCU_TBU3		48
+#define EIC7700_RESET_TCU_TBU4		49
+#define EIC7700_RESET_TCU_TBU5		50
+#define EIC7700_RESET_TCU_TBU6		51
+#define EIC7700_RESET_TCU_TBU7		52
+#define EIC7700_RESET_TCU_TBU8		53
+#define EIC7700_RESET_TCU_TBU9		54
+#define EIC7700_RESET_TCU_TBU10		55
+#define EIC7700_RESET_TCU_TBU11		56
+#define EIC7700_RESET_TCU_TBU12		57
+#define EIC7700_RESET_TCU_TBU13		58
+#define EIC7700_RESET_TCU_TBU14		59
+#define EIC7700_RESET_TCU_TBU15		60
+#define EIC7700_RESET_TCU_TBU16		61
+#define EIC7700_RESET_NPU_AXI		62
+#define EIC7700_RESET_NPU_CFG		63
+#define EIC7700_RESET_NPU_CORE		64
+#define EIC7700_RESET_NPU_E31CORE	65
+#define EIC7700_RESET_NPU_E31BUS	66
+#define EIC7700_RESET_NPU_E31DBG	67
+#define EIC7700_RESET_NPU_LLC		68
+#define EIC7700_RESET_HSP_AXI		69
+#define EIC7700_RESET_HSP_CFG		70
+#define EIC7700_RESET_HSP_POR		71
+#define EIC7700_RESET_MSHC0_PHY		72
+#define EIC7700_RESET_MSHC1_PHY		73
+#define EIC7700_RESET_MSHC2_PHY		74
+#define EIC7700_RESET_MSHC0_TXRX	75
+#define EIC7700_RESET_MSHC1_TXRX	76
+#define EIC7700_RESET_MSHC2_TXRX	77
+#define EIC7700_RESET_SATA_ASIC0	78
+#define EIC7700_RESET_SATA_OOB		79
+#define EIC7700_RESET_SATA_PMALIVE	80
+#define EIC7700_RESET_SATA_RBC		81
+#define EIC7700_RESET_DMA0		82
+#define EIC7700_RESET_HSP_DMA		83
+#define EIC7700_RESET_USB0_VAUX		84
+#define EIC7700_RESET_USB1_VAUX		85
+#define EIC7700_RESET_HSP_SD1_PRST	86
+#define EIC7700_RESET_HSP_SD0_PRST	87
+#define EIC7700_RESET_HSP_EMMC_PRST	88
+#define EIC7700_RESET_HSP_DMA_PRST	89
+#define EIC7700_RESET_HSP_SD1_ARST	90
+#define EIC7700_RESET_HSP_SD0_ARST	91
+#define EIC7700_RESET_HSP_EMMC_ARST	92
+#define EIC7700_RESET_HSP_DMA_ARST	93
+#define EIC7700_RESET_HSP_ETH1_ARST	94
+#define EIC7700_RESET_HSP_ETH0_ARST	95
+#define EIC7700_RESET_SATA_ARST		96
+#define EIC7700_RESET_PCIE_CFG		97
+#define EIC7700_RESET_PCIE_POWEUP	98
+#define EIC7700_RESET_PCIE_PERST	99
+#define EIC7700_RESET_I2C0		100
+#define EIC7700_RESET_I2C1		101
+#define EIC7700_RESET_I2C2		102
+#define EIC7700_RESET_I2C3		103
+#define EIC7700_RESET_I2C4		104
+#define EIC7700_RESET_I2C5		105
+#define EIC7700_RESET_I2C6		106
+#define EIC7700_RESET_I2C7		107
+#define EIC7700_RESET_I2C8		108
+#define EIC7700_RESET_I2C9		109
+#define EIC7700_RESET_FAN		110
+#define EIC7700_RESET_PVT0		111
+#define EIC7700_RESET_PVT1		112
+#define EIC7700_RESET_MBOX0		113
+#define EIC7700_RESET_MBOX1		114
+#define EIC7700_RESET_MBOX2		115
+#define EIC7700_RESET_MBOX3		116
+#define EIC7700_RESET_MBOX4		117
+#define EIC7700_RESET_MBOX5		118
+#define EIC7700_RESET_MBOX6		119
+#define EIC7700_RESET_MBOX7		120
+#define EIC7700_RESET_MBOX8		121
+#define EIC7700_RESET_MBOX9		122
+#define EIC7700_RESET_MBOX10		123
+#define EIC7700_RESET_MBOX11		124
+#define EIC7700_RESET_MBOX12		125
+#define EIC7700_RESET_MBOX13		126
+#define EIC7700_RESET_MBOX14		127
+#define EIC7700_RESET_MBOX15		128
+#define EIC7700_RESET_UART0		129
+#define EIC7700_RESET_UART1		130
+#define EIC7700_RESET_UART2		131
+#define EIC7700_RESET_UART3		132
+#define EIC7700_RESET_UART4		133
+#define EIC7700_RESET_GPIO0		134
+#define EIC7700_RESET_GPIO1		135
+#define EIC7700_RESET_TIMER		136
+#define EIC7700_RESET_SSI0		137
+#define EIC7700_RESET_SSI1		138
+#define EIC7700_RESET_WDT0		139
+#define EIC7700_RESET_WDT1		140
+#define EIC7700_RESET_WDT2		141
+#define EIC7700_RESET_WDT3		142
+#define EIC7700_RESET_LSP_CFG		143
+#define EIC7700_RESET_U84_CORE0		144
+#define EIC7700_RESET_U84_CORE1		145
+#define EIC7700_RESET_U84_CORE2		146
+#define EIC7700_RESET_U84_CORE3		147
+#define EIC7700_RESET_U84_BUS		148
+#define EIC7700_RESET_U84_DBG		149
+#define EIC7700_RESET_U84_TRACECOM	150
+#define EIC7700_RESET_U84_TRACE0	151
+#define EIC7700_RESET_U84_TRACE1	152
+#define EIC7700_RESET_U84_TRACE2	153
+#define EIC7700_RESET_U84_TRACE3	154
+#define EIC7700_RESET_SCPU_CORE		155
+#define EIC7700_RESET_SCPU_BUS		156
+#define EIC7700_RESET_SCPU_DBG		157
+#define EIC7700_RESET_LPCPU_CORE	158
+#define EIC7700_RESET_LPCPU_BUS		159
+#define EIC7700_RESET_LPCPU_DBG		160
+#define EIC7700_RESET_VC_CFG		161
+#define EIC7700_RESET_VC_AXI		162
+#define EIC7700_RESET_VC_MONCFG		163
+#define EIC7700_RESET_JD_CFG		164
+#define EIC7700_RESET_JD_AXI		165
+#define EIC7700_RESET_JE_CFG		166
+#define EIC7700_RESET_JE_AXI		167
+#define EIC7700_RESET_VD_CFG		168
+#define EIC7700_RESET_VD_AXI		169
+#define EIC7700_RESET_VE_AXI		170
+#define EIC7700_RESET_VE_CFG		171
+#define EIC7700_RESET_G2D_CORE		172
+#define EIC7700_RESET_G2D_CFG		173
+#define EIC7700_RESET_G2D_AXI		174
+#define EIC7700_RESET_VI_AXI		175
+#define EIC7700_RESET_VI_CFG		176
+#define EIC7700_RESET_VI_DWE		177
+#define EIC7700_RESET_DVP		178
+#define EIC7700_RESET_ISP0		179
+#define EIC7700_RESET_ISP1		180
+#define EIC7700_RESET_SHUTTR0		181
+#define EIC7700_RESET_SHUTTR1		182
+#define EIC7700_RESET_SHUTTR2		183
+#define EIC7700_RESET_SHUTTR3		184
+#define EIC7700_RESET_SHUTTR4		185
+#define EIC7700_RESET_SHUTTR5		186
+#define EIC7700_RESET_VO_MIPI		187
+#define EIC7700_RESET_VO_PRST		188
+#define EIC7700_RESET_VO_HDMI_PRST	189
+#define EIC7700_RESET_VO_HDMI_PHY	190
+#define EIC7700_RESET_VO_HDMI		191
+#define EIC7700_RESET_VO_I2S		192
+#define EIC7700_RESET_VO_I2S_PRST	193
+#define EIC7700_RESET_VO_AXI		194
+#define EIC7700_RESET_VO_CFG		195
+#define EIC7700_RESET_VO_DC		196
+#define EIC7700_RESET_VO_DC_PRST	197
+#define EIC7700_RESET_BOOTSPI_HRST	198
+#define EIC7700_RESET_BOOTSPI		199
+#define EIC7700_RESET_ANO1		200
+#define EIC7700_RESET_ANO0		201
+#define EIC7700_RESET_DMA1_ARST		202
+#define EIC7700_RESET_DMA1_HRST		203
+#define EIC7700_RESET_FPRT		204
+#define EIC7700_RESET_HBLOCK		205
+#define EIC7700_RESET_SECSR		206
+#define EIC7700_RESET_OTP		207
+#define EIC7700_RESET_PKA		208
+#define EIC7700_RESET_SPACC		209
+#define EIC7700_RESET_TRNG		210
+#define EIC7700_RESET_TIMER0_0		211
+#define EIC7700_RESET_TIMER0_1		212
+#define EIC7700_RESET_TIMER0_2		213
+#define EIC7700_RESET_TIMER0_3		214
+#define EIC7700_RESET_TIMER0_4		215
+#define EIC7700_RESET_TIMER0_5		216
+#define EIC7700_RESET_TIMER0_6		217
+#define EIC7700_RESET_TIMER0_7		218
+#define EIC7700_RESET_TIMER0_N		219
+#define EIC7700_RESET_TIMER1_0		220
+#define EIC7700_RESET_TIMER1_1		221
+#define EIC7700_RESET_TIMER1_2		222
+#define EIC7700_RESET_TIMER1_3		223
+#define EIC7700_RESET_TIMER1_4		224
+#define EIC7700_RESET_TIMER1_5		225
+#define EIC7700_RESET_TIMER1_6		226
+#define EIC7700_RESET_TIMER1_7		227
+#define EIC7700_RESET_TIMER1_N		228
+#define EIC7700_RESET_TIMER2_0		229
+#define EIC7700_RESET_TIMER2_1		230
+#define EIC7700_RESET_TIMER2_2		231
+#define EIC7700_RESET_TIMER2_3		232
+#define EIC7700_RESET_TIMER2_4		233
+#define EIC7700_RESET_TIMER2_5		234
+#define EIC7700_RESET_TIMER2_6		235
+#define EIC7700_RESET_TIMER2_7		236
+#define EIC7700_RESET_TIMER2_N		237
+#define EIC7700_RESET_TIMER3_0		238
+#define EIC7700_RESET_TIMER3_1		239
+#define EIC7700_RESET_TIMER3_2		240
+#define EIC7700_RESET_TIMER3_3		241
+#define EIC7700_RESET_TIMER3_4		242
+#define EIC7700_RESET_TIMER3_5		243
+#define EIC7700_RESET_TIMER3_6		244
+#define EIC7700_RESET_TIMER3_7		245
+#define EIC7700_RESET_TIMER3_N		246
+#define EIC7700_RESET_RTC		247
+#define EIC7700_RESET_MNOC_SNOC_NSP	248
+#define EIC7700_RESET_MNOC_VC		249
+#define EIC7700_RESET_MNOC_CFG		250
+#define EIC7700_RESET_MNOC_HSP		251
+#define EIC7700_RESET_MNOC_GPU		252
+#define EIC7700_RESET_MNOC_DDRC1_P3	253
+#define EIC7700_RESET_MNOC_DDRC0_P3	254
+#define EIC7700_RESET_RNOC_VO		255
+#define EIC7700_RESET_RNOC_VI		256
+#define EIC7700_RESET_RNOC_SNOC_NSP	257
+#define EIC7700_RESET_RNOC_CFG		258
+#define EIC7700_RESET_MNOC_DDRC1_P4	259
+#define EIC7700_RESET_MNOC_DDRC0_P4	260
+#define EIC7700_RESET_CNOC_VO_CFG	261
+#define EIC7700_RESET_CNOC_VI_CFG	262
+#define EIC7700_RESET_CNOC_VC_CFG	263
+#define EIC7700_RESET_CNOC_TCU_CFG	264
+#define EIC7700_RESET_CNOC_PCIE_CFG	265
+#define EIC7700_RESET_CNOC_NPU_CFG	266
+#define EIC7700_RESET_CNOC_LSP_CFG	267
+#define EIC7700_RESET_CNOC_HSP_CFG	268
+#define EIC7700_RESET_CNOC_GPU_CFG	269
+#define EIC7700_RESET_CNOC_DSPT_CFG	270
+#define EIC7700_RESET_CNOC_DDRT1_CFG	271
+#define EIC7700_RESET_CNOC_DDRT0_CFG	272
+#define EIC7700_RESET_CNOC_D2D_CFG	273
+#define EIC7700_RESET_CNOC_CFG		274
+#define EIC7700_RESET_CNOC_CLMM_CFG	275
+#define EIC7700_RESET_CNOC_AON_CFG	276
+#define EIC7700_RESET_LNOC_CFG		277
+#define EIC7700_RESET_LNOC_NPU_LLC	278
+#define EIC7700_RESET_LNOC_DDRC1_P0	279
+#define EIC7700_RESET_LNOC_DDRC0_P0	280
+
+#endif /* __DT_ESWIN_EIC7700_RESET_H__ */
-- 
cgit v1.2.3


From 8bffbfdc01dff26f17f8b382266e71d48e63c5e9 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 22 Oct 2025 15:51:32 +0200
Subject: reset: remove legacy reset lookup code

There are no more users of this code. Let's remove the exported symbols
and the implementation from reset core.

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
[p.zabel@pengutronix.de: folded in 8e6ec20e-8965-4b42-99fc-0462269ff2f1@paulmck-laptop]
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c             | 124 +--------------------------------------
 include/linux/reset-controller.h |  33 -----------
 2 files changed, 3 insertions(+), 154 deletions(-)

(limited to 'include')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 22f67fc77ae5..20cd50804ba1 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -25,9 +25,6 @@
 static DEFINE_MUTEX(reset_list_mutex);
 static LIST_HEAD(reset_controller_list);
 
-static DEFINE_MUTEX(reset_lookup_mutex);
-static LIST_HEAD(reset_lookup_list);
-
 /* Protects reset_gpio_lookup_list */
 static DEFINE_MUTEX(reset_gpio_lookup_mutex);
 static LIST_HEAD(reset_gpio_lookup_list);
@@ -190,33 +187,6 @@ int devm_reset_controller_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_reset_controller_register);
 
-/**
- * reset_controller_add_lookup - register a set of lookup entries
- * @lookup: array of reset lookup entries
- * @num_entries: number of entries in the lookup array
- */
-void reset_controller_add_lookup(struct reset_control_lookup *lookup,
-				 unsigned int num_entries)
-{
-	struct reset_control_lookup *entry;
-	unsigned int i;
-
-	mutex_lock(&reset_lookup_mutex);
-	for (i = 0; i < num_entries; i++) {
-		entry = &lookup[i];
-
-		if (!entry->dev_id || !entry->provider) {
-			pr_warn("%s(): reset lookup entry badly specified, skipping\n",
-				__func__);
-			continue;
-		}
-
-		list_add_tail(&entry->list, &reset_lookup_list);
-	}
-	mutex_unlock(&reset_lookup_mutex);
-}
-EXPORT_SYMBOL_GPL(reset_controller_add_lookup);
-
 static inline struct reset_control_array *
 rstc_to_array(struct reset_control *rstc) {
 	return container_of(rstc, struct reset_control_array, base);
@@ -1081,75 +1051,12 @@ out_put:
 }
 EXPORT_SYMBOL_GPL(__of_reset_control_get);
 
-static struct reset_controller_dev *
-__reset_controller_by_name(const char *name)
-{
-	struct reset_controller_dev *rcdev;
-
-	lockdep_assert_held(&reset_list_mutex);
-
-	list_for_each_entry(rcdev, &reset_controller_list, list) {
-		if (!rcdev->dev)
-			continue;
-
-		if (!strcmp(name, dev_name(rcdev->dev)))
-			return rcdev;
-	}
-
-	return NULL;
-}
-
-static struct reset_control *
-__reset_control_get_from_lookup(struct device *dev, const char *con_id,
-				enum reset_control_flags flags)
-{
-	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
-	const struct reset_control_lookup *lookup;
-	struct reset_controller_dev *rcdev;
-	const char *dev_id = dev_name(dev);
-	struct reset_control *rstc = NULL;
-
-	mutex_lock(&reset_lookup_mutex);
-
-	list_for_each_entry(lookup, &reset_lookup_list, list) {
-		if (strcmp(lookup->dev_id, dev_id))
-			continue;
-
-		if ((!con_id && !lookup->con_id) ||
-		    ((con_id && lookup->con_id) &&
-		     !strcmp(con_id, lookup->con_id))) {
-			mutex_lock(&reset_list_mutex);
-			rcdev = __reset_controller_by_name(lookup->provider);
-			if (!rcdev) {
-				mutex_unlock(&reset_list_mutex);
-				mutex_unlock(&reset_lookup_mutex);
-				/* Reset provider may not be ready yet. */
-				return ERR_PTR(-EPROBE_DEFER);
-			}
-
-			flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL;
-
-			rstc = __reset_control_get_internal(rcdev,
-							    lookup->index,
-							    flags);
-			mutex_unlock(&reset_list_mutex);
-			break;
-		}
-	}
-
-	mutex_unlock(&reset_lookup_mutex);
-
-	if (!rstc)
-		return optional ? NULL : ERR_PTR(-ENOENT);
-
-	return rstc;
-}
-
 struct reset_control *__reset_control_get(struct device *dev, const char *id,
 					  int index, enum reset_control_flags flags)
 {
 	bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED;
 	bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED;
+	bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL;
 
 	if (WARN_ON(shared && acquired))
 		return ERR_PTR(-EINVAL);
@@ -1157,7 +1064,7 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id,
 	if (dev->of_node)
 		return __of_reset_control_get(dev->of_node, id, index, flags);
 
-	return __reset_control_get_from_lookup(dev, id, flags);
+	return optional ? NULL : ERR_PTR(-ENOENT);
 }
 EXPORT_SYMBOL_GPL(__reset_control_get);
 
@@ -1492,31 +1399,6 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags)
 }
 EXPORT_SYMBOL_GPL(devm_reset_control_array_get);
 
-static int reset_control_get_count_from_lookup(struct device *dev)
-{
-	const struct reset_control_lookup *lookup;
-	const char *dev_id;
-	int count = 0;
-
-	if (!dev)
-		return -EINVAL;
-
-	dev_id = dev_name(dev);
-	mutex_lock(&reset_lookup_mutex);
-
-	list_for_each_entry(lookup, &reset_lookup_list, list) {
-		if (!strcmp(lookup->dev_id, dev_id))
-			count++;
-	}
-
-	mutex_unlock(&reset_lookup_mutex);
-
-	if (count == 0)
-		count = -ENOENT;
-
-	return count;
-}
-
 /**
  * reset_control_get_count - Count number of resets available with a device
  *
@@ -1530,6 +1412,6 @@ int reset_control_get_count(struct device *dev)
 	if (dev->of_node)
 		return of_reset_control_get_count(dev->of_node);
 
-	return reset_control_get_count_from_lookup(dev);
+	return -ENOENT;
 }
 EXPORT_SYMBOL_GPL(reset_control_get_count);
diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h
index 357df16ede32..46514cb1b9e0 100644
--- a/include/linux/reset-controller.h
+++ b/include/linux/reset-controller.h
@@ -26,31 +26,6 @@ struct module;
 struct device_node;
 struct of_phandle_args;
 
-/**
- * struct reset_control_lookup - represents a single lookup entry
- *
- * @list: internal list of all reset lookup entries
- * @provider: name of the reset controller device controlling this reset line
- * @index: ID of the reset controller in the reset controller device
- * @dev_id: name of the device associated with this reset line
- * @con_id: name of the reset line (can be NULL)
- */
-struct reset_control_lookup {
-	struct list_head list;
-	const char *provider;
-	unsigned int index;
-	const char *dev_id;
-	const char *con_id;
-};
-
-#define RESET_LOOKUP(_provider, _index, _dev_id, _con_id)		\
-	{								\
-		.provider = _provider,					\
-		.index = _index,					\
-		.dev_id = _dev_id,					\
-		.con_id = _con_id,					\
-	}
-
 /**
  * struct reset_controller_dev - reset controller entity that might
  *                               provide multiple reset controls
@@ -90,9 +65,6 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev);
 struct device;
 int devm_reset_controller_register(struct device *dev,
 				   struct reset_controller_dev *rcdev);
-
-void reset_controller_add_lookup(struct reset_control_lookup *lookup,
-				 unsigned int num_entries);
 #else
 static inline int reset_controller_register(struct reset_controller_dev *rcdev)
 {
@@ -108,11 +80,6 @@ static inline int devm_reset_controller_register(struct device *dev,
 {
 	return 0;
 }
-
-static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup,
-					       unsigned int num_entries)
-{
-}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 5334eb9de76c74e24821aae89e111e27398b5add Mon Sep 17 00:00:00 2001
From: Yao Zi <ziyao@disroot.org>
Date: Tue, 14 Oct 2025 13:10:28 +0000
Subject: dt-bindings: reset: thead,th1520-reset: Remove non-VO-subsystem
 resets

Registers in control of TH1520_RESET_ID_{NPU,WDT0,WDT1} belong to the AP
reset controller, not the VO one, which is documented as
"thead,th1520-reset" and is the only reset controller supported for
TH1520 for now.

Let's remove the IDs, leaving them to be implemented by AP-subsystem
reset controller in the future.

Fixes: 30e7573babdc ("dt-bindings: reset: Add T-HEAD TH1520 SoC Reset Controller")
Signed-off-by: Yao Zi <ziyao@disroot.org>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Drew Fustini <fustini@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/dt-bindings/reset/thead,th1520-reset.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/reset/thead,th1520-reset.h b/include/dt-bindings/reset/thead,th1520-reset.h
index ee799286c175..e51d6314d131 100644
--- a/include/dt-bindings/reset/thead,th1520-reset.h
+++ b/include/dt-bindings/reset/thead,th1520-reset.h
@@ -9,9 +9,6 @@
 
 #define TH1520_RESET_ID_GPU		0
 #define TH1520_RESET_ID_GPU_CLKGEN	1
-#define TH1520_RESET_ID_NPU		2
-#define TH1520_RESET_ID_WDT0		3
-#define TH1520_RESET_ID_WDT1		4
 #define TH1520_RESET_ID_DPU_AHB		5
 #define TH1520_RESET_ID_DPU_AXI		6
 #define TH1520_RESET_ID_DPU_CORE	7
-- 
cgit v1.2.3


From a35ac6f3bdb135debc8e1ff599d0009bc64dc329 Mon Sep 17 00:00:00 2001
From: Yao Zi <ziyao@disroot.org>
Date: Tue, 14 Oct 2025 13:10:29 +0000
Subject: dt-bindings: reset: thead,th1520-reset: Add controllers for more
 subsys

The TH1520 SoC is divided into several subsystems, most of which have
distinct reset controllers. Let's document the reset controllers other
than the one for the VO subsystem, along with the IDs for their reset
signals.

Signed-off-by: Yao Zi <ziyao@disroot.org>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Drew Fustini <fustini@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 .../bindings/reset/thead,th1520-reset.yaml         |   8 +-
 include/dt-bindings/reset/thead,th1520-reset.h     | 216 +++++++++++++++++++++
 2 files changed, 223 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/thead,th1520-reset.yaml b/Documentation/devicetree/bindings/reset/thead,th1520-reset.yaml
index f2e91d0add7a..7b5053c177fe 100644
--- a/Documentation/devicetree/bindings/reset/thead,th1520-reset.yaml
+++ b/Documentation/devicetree/bindings/reset/thead,th1520-reset.yaml
@@ -16,7 +16,13 @@ maintainers:
 properties:
   compatible:
     enum:
-      - thead,th1520-reset
+      - thead,th1520-reset # Reset controller for VO subsystem
+      - thead,th1520-reset-ao
+      - thead,th1520-reset-ap
+      - thead,th1520-reset-dsp
+      - thead,th1520-reset-misc
+      - thead,th1520-reset-vi
+      - thead,th1520-reset-vp
 
   reg:
     maxItems: 1
diff --git a/include/dt-bindings/reset/thead,th1520-reset.h b/include/dt-bindings/reset/thead,th1520-reset.h
index e51d6314d131..ba6805b6b12a 100644
--- a/include/dt-bindings/reset/thead,th1520-reset.h
+++ b/include/dt-bindings/reset/thead,th1520-reset.h
@@ -7,6 +7,200 @@
 #ifndef _DT_BINDINGS_TH1520_RESET_H
 #define _DT_BINDINGS_TH1520_RESET_H
 
+/* AO Subsystem */
+#define TH1520_RESET_ID_SYSTEM		0
+#define TH1520_RESET_ID_RTC_APB		1
+#define TH1520_RESET_ID_RTC_REF		2
+#define TH1520_RESET_ID_AOGPIO_DB	3
+#define TH1520_RESET_ID_AOGPIO_APB	4
+#define TH1520_RESET_ID_AOI2C_APB	5
+#define TH1520_RESET_ID_PVT_APB		6
+#define TH1520_RESET_ID_E902_CORE	7
+#define TH1520_RESET_ID_E902_HAD	8
+#define TH1520_RESET_ID_AOTIMER_APB	9
+#define TH1520_RESET_ID_AOTIMER_CORE	10
+#define TH1520_RESET_ID_AOWDT_APB	11
+#define TH1520_RESET_ID_APSYS		12
+#define TH1520_RESET_ID_NPUSYS		13
+#define TH1520_RESET_ID_DDRSYS		14
+#define TH1520_RESET_ID_AXI_AP2CP	15
+#define TH1520_RESET_ID_AXI_CP2AP	16
+#define TH1520_RESET_ID_AXI_CP2SRAM	17
+#define TH1520_RESET_ID_AUDSYS_CORE	18
+#define TH1520_RESET_ID_AUDSYS_IOPMP	19
+#define TH1520_RESET_ID_AUDSYS		20
+#define TH1520_RESET_ID_DSP0		21
+#define TH1520_RESET_ID_DSP1		22
+#define TH1520_RESET_ID_GPU_MODULE	23
+#define TH1520_RESET_ID_VDEC		24
+#define TH1520_RESET_ID_VENC		25
+#define TH1520_RESET_ID_ADC_APB		26
+#define TH1520_RESET_ID_AUDGPIO_DB	27
+#define TH1520_RESET_ID_AUDGPIO_APB	28
+#define TH1520_RESET_ID_AOUART_IF	29
+#define TH1520_RESET_ID_AOUART_APB	30
+#define TH1520_RESET_ID_SRAM_AXI_P0	31
+#define TH1520_RESET_ID_SRAM_AXI_P1	32
+#define TH1520_RESET_ID_SRAM_AXI_P2	33
+#define TH1520_RESET_ID_SRAM_AXI_P3	34
+#define TH1520_RESET_ID_SRAM_AXI_P4	35
+#define TH1520_RESET_ID_SRAM_AXI_CORE	36
+#define TH1520_RESET_ID_SE		37
+
+/* AP Subsystem */
+#define TH1520_RESET_ID_BROM			0
+#define TH1520_RESET_ID_C910_TOP		1
+#define TH1520_RESET_ID_NPU			2
+#define TH1520_RESET_ID_WDT0			3
+#define TH1520_RESET_ID_WDT1			4
+#define TH1520_RESET_ID_C910_C0			5
+#define TH1520_RESET_ID_C910_C1			6
+#define TH1520_RESET_ID_C910_C2			7
+#define TH1520_RESET_ID_C910_C3			8
+#define TH1520_RESET_ID_CHIP_DBG_CORE		9
+#define TH1520_RESET_ID_CHIP_DBG_AXI		10
+#define TH1520_RESET_ID_AXI4_CPUSYS2_AXI	11
+#define TH1520_RESET_ID_AXI4_CPUSYS2_APB	12
+#define TH1520_RESET_ID_X2H_CPUSYS		13
+#define TH1520_RESET_ID_AHB2_CPUSYS		14
+#define TH1520_RESET_ID_APB3_CPUSYS		15
+#define TH1520_RESET_ID_MBOX0_APB		16
+#define TH1520_RESET_ID_MBOX1_APB		17
+#define TH1520_RESET_ID_MBOX2_APB		18
+#define TH1520_RESET_ID_MBOX3_APB		19
+#define TH1520_RESET_ID_TIMER0_APB		20
+#define TH1520_RESET_ID_TIMER0_CORE		21
+#define TH1520_RESET_ID_TIMER1_APB		22
+#define TH1520_RESET_ID_TIMER1_CORE		23
+#define TH1520_RESET_ID_PERISYS_AHB		24
+#define TH1520_RESET_ID_PERISYS_APB1		25
+#define TH1520_RESET_ID_PERISYS_APB2		26
+#define TH1520_RESET_ID_GMAC0_APB		27
+#define TH1520_RESET_ID_GMAC0_AHB		28
+#define TH1520_RESET_ID_GMAC0_CLKGEN		29
+#define TH1520_RESET_ID_GMAC0_AXI		30
+#define TH1520_RESET_ID_UART0_APB		31
+#define TH1520_RESET_ID_UART0_IF		32
+#define TH1520_RESET_ID_UART1_APB		33
+#define TH1520_RESET_ID_UART1_IF		34
+#define TH1520_RESET_ID_UART2_APB		35
+#define TH1520_RESET_ID_UART2_IF		36
+#define TH1520_RESET_ID_UART3_APB		37
+#define TH1520_RESET_ID_UART3_IF		38
+#define TH1520_RESET_ID_UART4_APB		39
+#define TH1520_RESET_ID_UART4_IF		40
+#define TH1520_RESET_ID_UART5_APB		41
+#define TH1520_RESET_ID_UART5_IF		42
+#define TH1520_RESET_ID_QSPI0_IF		43
+#define TH1520_RESET_ID_QSPI0_APB		44
+#define TH1520_RESET_ID_QSPI1_IF		45
+#define TH1520_RESET_ID_QSPI1_APB		46
+#define TH1520_RESET_ID_SPI_IF			47
+#define TH1520_RESET_ID_SPI_APB			48
+#define TH1520_RESET_ID_I2C0_APB		49
+#define TH1520_RESET_ID_I2C0_CORE		50
+#define TH1520_RESET_ID_I2C1_APB		51
+#define TH1520_RESET_ID_I2C1_CORE		52
+#define TH1520_RESET_ID_I2C2_APB		53
+#define TH1520_RESET_ID_I2C2_CORE		54
+#define TH1520_RESET_ID_I2C3_APB		55
+#define TH1520_RESET_ID_I2C3_CORE		56
+#define TH1520_RESET_ID_I2C4_APB		57
+#define TH1520_RESET_ID_I2C4_CORE		58
+#define TH1520_RESET_ID_I2C5_APB		59
+#define TH1520_RESET_ID_I2C5_CORE		60
+#define TH1520_RESET_ID_GPIO0_DB		61
+#define TH1520_RESET_ID_GPIO0_APB		62
+#define TH1520_RESET_ID_GPIO1_DB		63
+#define TH1520_RESET_ID_GPIO1_APB		64
+#define TH1520_RESET_ID_GPIO2_DB		65
+#define TH1520_RESET_ID_GPIO2_APB		66
+#define TH1520_RESET_ID_PWM_COUNTER		67
+#define TH1520_RESET_ID_PWM_APB			68
+#define TH1520_RESET_ID_PADCTRL0_APB		69
+#define TH1520_RESET_ID_CPU2PERI_X2H		70
+#define TH1520_RESET_ID_CPU2AON_X2H		71
+#define TH1520_RESET_ID_AON2CPU_A2X		72
+#define TH1520_RESET_ID_NPUSYS_AXI		73
+#define TH1520_RESET_ID_NPUSYS_AXI_APB		74
+#define TH1520_RESET_ID_CPU2VP_X2P		75
+#define TH1520_RESET_ID_CPU2VI_X2H		76
+#define TH1520_RESET_ID_BMU_AXI			77
+#define TH1520_RESET_ID_BMU_APB			78
+#define TH1520_RESET_ID_DMAC_CPUSYS_AXI		79
+#define TH1520_RESET_ID_DMAC_CPUSYS_AHB		80
+#define TH1520_RESET_ID_SPINLOCK		81
+#define TH1520_RESET_ID_CFG2TEE			82
+#define TH1520_RESET_ID_DSMART			83
+#define TH1520_RESET_ID_GPIO3_DB		84
+#define TH1520_RESET_ID_GPIO3_APB		85
+#define TH1520_RESET_ID_PERI_I2S		86
+#define TH1520_RESET_ID_PERI_APB3		87
+#define TH1520_RESET_ID_PERI2PERI1_APB		88
+#define TH1520_RESET_ID_VPSYS_APB		89
+#define TH1520_RESET_ID_PERISYS_APB4		90
+#define TH1520_RESET_ID_GMAC1_APB		91
+#define TH1520_RESET_ID_GMAC1_AHB		92
+#define TH1520_RESET_ID_GMAC1_CLKGEN		93
+#define TH1520_RESET_ID_GMAC1_AXI		94
+#define TH1520_RESET_ID_GMAC_AXI		95
+#define TH1520_RESET_ID_GMAC_AXI_APB		96
+#define TH1520_RESET_ID_PADCTRL1_APB		97
+#define TH1520_RESET_ID_VOSYS_AXI		98
+#define TH1520_RESET_ID_VOSYS_AXI_APB		99
+#define TH1520_RESET_ID_VOSYS_AXI_X2X		100
+#define TH1520_RESET_ID_MISC2VP_X2X		101
+#define TH1520_RESET_ID_DSPSYS			102
+#define TH1520_RESET_ID_VISYS			103
+#define TH1520_RESET_ID_VOSYS			104
+#define TH1520_RESET_ID_VPSYS			105
+
+/* DSP Subsystem */
+#define TH1520_RESET_ID_X2X_DSP1	0
+#define TH1520_RESET_ID_X2X_DSP0	1
+#define TH1520_RESET_ID_X2X_SLAVE_DSP1	2
+#define TH1520_RESET_ID_X2X_SLAVE_DSP0	3
+#define TH1520_RESET_ID_DSP0_CORE	4
+#define TH1520_RESET_ID_DSP0_DEBUG	5
+#define TH1520_RESET_ID_DSP0_APB	6
+#define TH1520_RESET_ID_DSP1_CORE	7
+#define TH1520_RESET_ID_DSP1_DEBUG	8
+#define TH1520_RESET_ID_DSP1_APB	9
+#define TH1520_RESET_ID_DSPSYS_APB	10
+#define TH1520_RESET_ID_AXI4_DSPSYS_SLV	11
+#define TH1520_RESET_ID_AXI4_DSPSYS	12
+#define TH1520_RESET_ID_AXI4_DSP_RS	13
+
+/* MISC Subsystem */
+#define TH1520_RESET_ID_EMMC_SDIO_CLKGEN	0
+#define TH1520_RESET_ID_EMMC			1
+#define TH1520_RESET_ID_MISCSYS_AXI		2
+#define TH1520_RESET_ID_MISCSYS_AXI_APB		3
+#define TH1520_RESET_ID_SDIO0			4
+#define TH1520_RESET_ID_SDIO1			5
+#define TH1520_RESET_ID_USB3_APB		6
+#define TH1520_RESET_ID_USB3_PHY		7
+#define TH1520_RESET_ID_USB3_VCC		8
+
+/* VI Subsystem */
+#define TH1520_RESET_ID_ISP0		0
+#define TH1520_RESET_ID_ISP1		1
+#define TH1520_RESET_ID_CSI0_APB	2
+#define TH1520_RESET_ID_CSI1_APB	3
+#define TH1520_RESET_ID_CSI2_APB	4
+#define TH1520_RESET_ID_MIPI_FIFO	5
+#define TH1520_RESET_ID_ISP_VENC_APB	6
+#define TH1520_RESET_ID_VIPRE_APB	7
+#define TH1520_RESET_ID_VIPRE_AXI	8
+#define TH1520_RESET_ID_DW200_APB	9
+#define TH1520_RESET_ID_VISYS3_AXI	10
+#define TH1520_RESET_ID_VISYS2_AXI	11
+#define TH1520_RESET_ID_VISYS1_AXI	12
+#define TH1520_RESET_ID_VISYS_AXI	13
+#define TH1520_RESET_ID_VISYS_APB	14
+#define TH1520_RESET_ID_ISP_VENC_AXI	15
+
+/* VO Subsystem */
 #define TH1520_RESET_ID_GPU		0
 #define TH1520_RESET_ID_GPU_CLKGEN	1
 #define TH1520_RESET_ID_DPU_AHB		5
@@ -16,5 +210,27 @@
 #define TH1520_RESET_ID_DSI1_APB	9
 #define TH1520_RESET_ID_HDMI		10
 #define TH1520_RESET_ID_HDMI_APB	11
+#define TH1520_RESET_ID_VOAXI		12
+#define TH1520_RESET_ID_VOAXI_APB	13
+#define TH1520_RESET_ID_X2H_DPU_AXI	14
+#define TH1520_RESET_ID_X2H_DPU_AHB	15
+#define TH1520_RESET_ID_X2H_DPU1_AXI	16
+#define TH1520_RESET_ID_X2H_DPU1_AHB	17
+
+/* VP Subsystem */
+#define TH1520_RESET_ID_VPSYS_AXI_APB	0
+#define TH1520_RESET_ID_VPSYS_AXI	1
+#define TH1520_RESET_ID_FCE_APB		2
+#define TH1520_RESET_ID_FCE_CORE	3
+#define TH1520_RESET_ID_FCE_X2X_MASTER	4
+#define TH1520_RESET_ID_FCE_X2X_SLAVE	5
+#define TH1520_RESET_ID_G2D_APB		6
+#define TH1520_RESET_ID_G2D_ACLK	7
+#define TH1520_RESET_ID_G2D_CORE	8
+#define TH1520_RESET_ID_VDEC_APB	9
+#define TH1520_RESET_ID_VDEC_ACLK	10
+#define TH1520_RESET_ID_VDEC_CORE	11
+#define TH1520_RESET_ID_VENC_APB	12
+#define TH1520_RESET_ID_VENC_CORE	13
 
 #endif /* _DT_BINDINGS_TH1520_RESET_H */
-- 
cgit v1.2.3


From f3d8b64ee46c9b4b0b82b1a4642027728bac95b8 Mon Sep 17 00:00:00 2001
From: Encrow Thorne <jyc0019@gmail.com>
Date: Mon, 10 Nov 2025 14:10:37 +0800
Subject: reset: fix BIT macro reference

RESET_CONTROL_FLAGS_BIT_* macros use BIT(), but reset.h does not
include bits.h. This causes compilation errors when including
reset.h standalone.

Include bits.h to make reset.h self-contained.

Suggested-by: Troy Mitchell <troy.mitchell@linux.dev>
Reviewed-by: Troy Mitchell <troy.mitchell@linux.dev>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Encrow Thorne <jyc0019@gmail.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index 840d75d172f6..44f9e3415f92 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_RESET_H_
 #define _LINUX_RESET_H_
 
+#include <linux/bits.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/types.h>
-- 
cgit v1.2.3


From 4edf654be5471659e3260be0a557eaa2ece668ab Mon Sep 17 00:00:00 2001
From: Peter Griffin <peter.griffin@linaro.org>
Date: Wed, 12 Nov 2025 16:27:06 +0000
Subject: phy: add new phy_notify_state() api

Add a new phy_notify_state() API that notifies and configures a PHY for a
given state transition.

This is intended to be used by phy drivers which need to do some runtime
configuration of parameters that can't be handled by phy_calibrate() or
phy_power_{on|off}().

The first usage of this API is in the Samsung UFS phy that needs to issue
some register writes when entering and exiting the hibernate link state.

Signed-off-by: Peter Griffin <peter.griffin@linaro.org>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://patch.msgid.link/20251112-phy-notify-pmstate-v5-1-39df622d8fcb@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/phy-core.c  | 25 +++++++++++++++++++++++++
 include/linux/phy/phy.h | 19 +++++++++++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index 04a5a34e7a95..60be8af984bf 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -520,6 +520,31 @@ int phy_notify_disconnect(struct phy *phy, int port)
 }
 EXPORT_SYMBOL_GPL(phy_notify_disconnect);
 
+/**
+ * phy_notify_state() - phy state notification
+ * @phy: the PHY returned by phy_get(); may be NULL, which is treated as a no-op
+ * @state: the state being signalled, passed through to ->notify_phystate()
+ *
+ * Notify the PHY of a state transition so that it can reconfigure itself.
+ * The ->notify_phystate() callback is optional and is called with phy->mutex held.
+ *
+ * Returns: %0 if @phy is NULL or has no callback, else the callback's return value
+ */
+int phy_notify_state(struct phy *phy, union phy_notify state)
+{
+	int ret;
+
+	if (!phy || !phy->ops->notify_phystate)
+		return 0;
+
+	mutex_lock(&phy->mutex);
+	ret = phy->ops->notify_phystate(phy, state);
+	mutex_unlock(&phy->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phy_notify_state);
+
 /**
  * phy_configure() - Changes the phy parameters
  * @phy: the phy returned by phy_get()
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 13add0c2c407..2af0d01ebb39 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -53,6 +53,15 @@ enum phy_media {
 	PHY_MEDIA_DAC,
 };
 
+enum phy_ufs_state {
+	PHY_UFS_HIBERN8_ENTER,
+	PHY_UFS_HIBERN8_EXIT,
+};
+
+union phy_notify {
+	enum phy_ufs_state ufs_state;
+};
+
 /**
  * union phy_configure_opts - Opaque generic phy configuration
  *
@@ -83,6 +92,7 @@ union phy_configure_opts {
  * @set_speed: set the speed of the phy (optional)
  * @reset: resetting the phy
  * @calibrate: calibrate the phy
+ * @notify_phystate: notify and configure the phy for a particular state
  * @release: ops to be performed while the consumer relinquishes the PHY
  * @owner: the module owner containing the ops
  */
@@ -132,6 +142,7 @@ struct phy_ops {
 	int	(*connect)(struct phy *phy, int port);
 	int	(*disconnect)(struct phy *phy, int port);
 
+	int	(*notify_phystate)(struct phy *phy, union phy_notify state);
 	void	(*release)(struct phy *phy);
 	struct module *owner;
 };
@@ -255,6 +266,7 @@ int phy_reset(struct phy *phy);
 int phy_calibrate(struct phy *phy);
 int phy_notify_connect(struct phy *phy, int port);
 int phy_notify_disconnect(struct phy *phy, int port);
+int phy_notify_state(struct phy *phy, union phy_notify state);
 static inline int phy_get_bus_width(struct phy *phy)
 {
 	return phy->attrs.bus_width;
@@ -412,6 +424,13 @@ static inline int phy_notify_disconnect(struct phy *phy, int index)
 	return -ENOSYS;
 }
 
+static inline int phy_notify_state(struct phy *phy, union phy_notify state)
+{
+	if (!phy)	/* a NULL PHY is accepted and reported as success */
+		return 0;
+	return -ENOSYS;	/* a non-NULL PHY cannot be notified by this inline version */
+}
+
 static inline int phy_configure(struct phy *phy,
 				union phy_configure_opts *opts)
 {
-- 
cgit v1.2.3


From 01ba82702957225218c54c06ad2c2d468b83f510 Mon Sep 17 00:00:00 2001
From: Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
Date: Sat, 1 Nov 2025 09:29:33 +0530
Subject: PCI: Add .assert_perst() to control PCIe PERST#

Controller driver probes first, enables link training and scans the bus.
When the PCI bridge is found, its child DT nodes will be scanned and
pwrctrl devices will be created if needed. By the time pwrctrl driver probe
gets called, link training is already enabled by controller driver.

Certain devices like TC9563, which uses the PCI pwrctrl framework, need to
configure the device before the PCIe link is up.

As the controller driver already enables link training as part of its
probe, the moment device is powered on, controller and device participate
in link training and link can come up immediately and may not have time to
configure the device.

So we need to stop the link training by using assert_perst() by asserting
PERST# and de-asserting PERST# after device is configured.

Signed-off-by: Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251101-tc9563-v9-2-de3429f7787a@oss.qualcomm.com
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..ed5dac663e96 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -829,6 +829,7 @@ struct pci_ops {
 	void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where);
 	int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
 	int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
+	int (*assert_perst)(struct pci_bus *bus, bool assert);
 };
 
 /*
-- 
cgit v1.2.3


From f5cb3ee251b4f9db2761aced191f10579bd7e64e Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 11 Nov 2025 14:06:17 +0000
Subject: ASoC: SDCA: Add companion amp Function

Add companion amp into the list of allowed SDCA Functions. More work
will be required to fully support companion amp, but this will let parts
including companion amp functions boot and it is a good first step to
proper support.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Tested-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251111140617.2997454-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 3 +++
 sound/soc/sdca/sdca_functions.c | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 99cb978f7099..c97861508a15 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -64,6 +64,7 @@ struct sdca_function_desc;
  * @SDCA_FUNCTION_TYPE_RJ: Retaskable jack.
  * @SDCA_FUNCTION_TYPE_SIMPLE_JACK: Subset of UAJ.
  * @SDCA_FUNCTION_TYPE_HID: Human Interface Device, for e.g. buttons.
+ * @SDCA_FUNCTION_TYPE_COMPANION_AMP: Sources audio from another amp.
  * @SDCA_FUNCTION_TYPE_IMP_DEF: Implementation-defined function.
  *
  * SDCA Function Types from SDCA specification v1.0a Section 5.1.2
@@ -83,6 +84,7 @@ enum sdca_function_type {
 	SDCA_FUNCTION_TYPE_RJ				= 0x07,
 	SDCA_FUNCTION_TYPE_SIMPLE_JACK			= 0x08,
 	SDCA_FUNCTION_TYPE_HID				= 0x0A,
+	SDCA_FUNCTION_TYPE_COMPANION_AMP		= 0x0B,
 	SDCA_FUNCTION_TYPE_IMP_DEF			= 0x1F,
 };
 
@@ -96,6 +98,7 @@ enum sdca_function_type {
 #define	SDCA_FUNCTION_TYPE_RJ_NAME			"RJ"
 #define	SDCA_FUNCTION_TYPE_SIMPLE_NAME			"SimpleJack"
 #define	SDCA_FUNCTION_TYPE_HID_NAME			"HID"
+#define	SDCA_FUNCTION_TYPE_COMPANION_AMP_NAME		"CompanionAmp"
 #define	SDCA_FUNCTION_TYPE_IMP_DEF_NAME			"ImplementationDefined"
 
 /**
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 49b98fe2d854..0fcc8e82041e 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -79,6 +79,8 @@ static const char *get_sdca_function_name(u32 function_type)
 		return SDCA_FUNCTION_TYPE_SPEAKER_MIC_NAME;
 	case SDCA_FUNCTION_TYPE_RJ:
 		return SDCA_FUNCTION_TYPE_RJ_NAME;
+	case SDCA_FUNCTION_TYPE_COMPANION_AMP:
+		return SDCA_FUNCTION_TYPE_COMPANION_AMP_NAME;
 	case SDCA_FUNCTION_TYPE_IMP_DEF:
 		return SDCA_FUNCTION_TYPE_IMP_DEF_NAME;
 	default:
-- 
cgit v1.2.3


From d9d0be59be2580f2c5e4b7217aafb980e8c371cf Mon Sep 17 00:00:00 2001
From: Martijn de Gouw <martijn.de.gouw@prodrive-technologies.com>
Date: Mon, 17 Nov 2025 21:22:14 +0100
Subject: regulator: pca9450: Add support for setting debounce settings

Make the different debounce timers configurable from the devicetree.
Depending on the board design, these have to be set differently from the
default register values.

Signed-off-by: Martijn de Gouw <martijn.de.gouw@prodrive-technologies.com>
Link: https://patch.msgid.link/20251117202215.1936139-2-martijn.de.gouw@prodrive-technologies.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/pca9450-regulator.c | 158 ++++++++++++++++++++++++++++++----
 include/linux/regulator/pca9450.h     |  32 +++++++
 2 files changed, 171 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c
index 247f12df8974..93154c9c98dd 100644
--- a/drivers/regulator/pca9450-regulator.c
+++ b/drivers/regulator/pca9450-regulator.c
@@ -1147,6 +1147,143 @@ static int pca9450_i2c_restart_handler(struct sys_off_data *data)
 	return 0;
 }
 
+/* Program the reset, debounce, step-time and level-translator settings from DT; 0 or error. */
+static int pca9450_of_init(struct pca9450 *pca9450)
+{
+	struct i2c_client *i2c = container_of(pca9450->dev, struct i2c_client, dev);
+	int ret;
+	unsigned int val, reset_ctrl, rstb_deb_ctrl;
+	unsigned int t_on_deb, t_off_deb, t_on_step, t_off_step, t_restart;
+
+	if (of_property_read_bool(i2c->dev.of_node, "nxp,wdog_b-warm-reset"))
+		reset_ctrl = WDOG_B_CFG_WARM;
+	else
+		reset_ctrl = WDOG_B_CFG_COLD_LDO12;
+
+	/* Set reset behavior on assertion of WDOG_B signal */
+	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL,
+				 WDOG_B_CFG_MASK, reset_ctrl);
+	if (ret)
+		return dev_err_probe(&i2c->dev, ret, "Failed to set WDOG_B reset behavior\n");
+
+	/* Prefer the "nxp," vendor prefix; fall back to the "npx," spelling for existing DTs. */
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,pmic-rst-b-debounce-ms", &val);
+	if (ret == -EINVAL)
+		ret = of_property_read_u32(i2c->dev.of_node, "npx,pmic-rst-b-debounce-ms", &val);
+	if (ret == -EINVAL)	/* neither spelling is present: use 50 ms */
+		rstb_deb_ctrl = T_PMIC_RST_DEB_50MS;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 10: rstb_deb_ctrl = T_PMIC_RST_DEB_10MS; break;
+		case 50: rstb_deb_ctrl = T_PMIC_RST_DEB_50MS; break;
+		case 100: rstb_deb_ctrl = T_PMIC_RST_DEB_100MS; break;
+		case 500: rstb_deb_ctrl = T_PMIC_RST_DEB_500MS; break;
+		case 1000: rstb_deb_ctrl = T_PMIC_RST_DEB_1S; break;
+		case 2000: rstb_deb_ctrl = T_PMIC_RST_DEB_2S; break;
+		case 4000: rstb_deb_ctrl = T_PMIC_RST_DEB_4S; break;
+		case 8000: rstb_deb_ctrl = T_PMIC_RST_DEB_8S; break;
+		default: return -EINVAL;
+		}
+	}
+	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL,
+				 T_PMIC_RST_DEB_MASK, rstb_deb_ctrl);
+	if (ret)
+		return dev_err_probe(&i2c->dev, ret, "Failed to set PMIC_RST_B debounce time\n");
+
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,pmic-on-req-on-debounce-us", &val);
+	if (ret == -EINVAL)
+		t_on_deb = T_ON_DEB_20MS;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 120: t_on_deb = T_ON_DEB_120US; break;
+		case 20000: t_on_deb = T_ON_DEB_20MS; break;
+		case 100000: t_on_deb = T_ON_DEB_100MS; break;
+		case 750000: t_on_deb = T_ON_DEB_750MS; break;
+		default: return -EINVAL;
+		}
+	}
+
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,pmic-on-req-off-debounce-us", &val);
+	if (ret == -EINVAL)
+		t_off_deb = T_OFF_DEB_120US;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 120: t_off_deb = T_OFF_DEB_120US; break;
+		case 2000: t_off_deb = T_OFF_DEB_2MS; break;
+		default: return -EINVAL;
+		}
+	}
+
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,power-on-step-ms", &val);
+	if (ret == -EINVAL)
+		t_on_step = T_ON_STEP_2MS;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 1: t_on_step = T_ON_STEP_1MS; break;
+		case 2: t_on_step = T_ON_STEP_2MS; break;
+		case 4: t_on_step = T_ON_STEP_4MS; break;
+		case 8: t_on_step = T_ON_STEP_8MS; break;
+		default: return -EINVAL;
+		}
+	}
+
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,power-down-step-ms", &val);
+	if (ret == -EINVAL)
+		t_off_step = T_OFF_STEP_8MS;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 2: t_off_step = T_OFF_STEP_2MS; break;
+		case 4: t_off_step = T_OFF_STEP_4MS; break;
+		case 8: t_off_step = T_OFF_STEP_8MS; break;
+		case 16: t_off_step = T_OFF_STEP_16MS; break;
+		default: return -EINVAL;
+		}
+	}
+
+	ret = of_property_read_u32(i2c->dev.of_node, "nxp,restart-ms", &val);
+	if (ret == -EINVAL)
+		t_restart = T_RESTART_250MS;
+	else if (ret)
+		return ret;
+	else {
+		switch (val) {
+		case 250: t_restart = T_RESTART_250MS; break;
+		case 500: t_restart = T_RESTART_500MS; break;
+		default: return -EINVAL;
+		}
+	}
+
+	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_PWRCTRL,
+				 T_ON_DEB_MASK | T_OFF_DEB_MASK | T_ON_STEP_MASK |
+				 T_OFF_STEP_MASK | T_RESTART_MASK,
+				 t_on_deb | t_off_deb | t_on_step |
+				 t_off_step | t_restart);
+	if (ret)
+		return dev_err_probe(&i2c->dev, ret,
+				     "Failed to set PWR_CTRL debounce configuration\n");
+
+	if (of_property_read_bool(i2c->dev.of_node, "nxp,i2c-lt-enable")) {
+		/* Enable I2C Level Translator */
+		ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_CONFIG2,
+					 I2C_LT_MASK, I2C_LT_ON_STANDBY_RUN);
+		if (ret)
+			return dev_err_probe(&i2c->dev, ret,
+					     "Failed to enable I2C level translator\n");
+	}
+
+	return 0;
+}
+
 static int pca9450_i2c_probe(struct i2c_client *i2c)
 {
 	enum pca9450_chip_type type = (unsigned int)(uintptr_t)
@@ -1156,7 +1293,6 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
 	struct regulator_dev *ldo5;
 	struct pca9450 *pca9450;
 	unsigned int device_id, i;
-	unsigned int reset_ctrl;
 	int ret;
 
 	pca9450 = devm_kzalloc(&i2c->dev, sizeof(struct pca9450), GFP_KERNEL);
@@ -1254,25 +1390,9 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
 	if (ret)
 		return dev_err_probe(&i2c->dev, ret,  "Failed to clear PRESET_EN bit\n");
 
-	if (of_property_read_bool(i2c->dev.of_node, "nxp,wdog_b-warm-reset"))
-		reset_ctrl = WDOG_B_CFG_WARM;
-	else
-		reset_ctrl = WDOG_B_CFG_COLD_LDO12;
-
-	/* Set reset behavior on assertion of WDOG_B signal */
-	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL,
-				 WDOG_B_CFG_MASK, reset_ctrl);
+	ret = pca9450_of_init(pca9450);
 	if (ret)
-		return dev_err_probe(&i2c->dev, ret, "Failed to set WDOG_B reset behavior\n");
-
-	if (of_property_read_bool(i2c->dev.of_node, "nxp,i2c-lt-enable")) {
-		/* Enable I2C Level Translator */
-		ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_CONFIG2,
-					 I2C_LT_MASK, I2C_LT_ON_STANDBY_RUN);
-		if (ret)
-			return dev_err_probe(&i2c->dev, ret,
-					     "Failed to enable I2C level translator\n");
-	}
+		return dev_err_probe(&i2c->dev, ret, "Unable to parse OF data\n");
 
 	/*
 	 * For LDO5 we need to be able to check the status of the SD_VSEL input in
diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h
index 85b4fecc10d8..0df8b3c48082 100644
--- a/include/linux/regulator/pca9450.h
+++ b/include/linux/regulator/pca9450.h
@@ -223,12 +223,44 @@ enum {
 #define IRQ_THERM_105			0x02
 #define IRQ_THERM_125			0x01
 
+/* PCA9450_REG_PWRCTRL bits */
+#define T_ON_DEB_MASK			0xC0
+#define T_ON_DEB_120US			(0 << 6)
+#define T_ON_DEB_20MS			(1 << 6)
+#define T_ON_DEB_100MS			(2 << 6)
+#define T_ON_DEB_750MS			(3 << 6)
+#define T_OFF_DEB_MASK			0x20
+#define T_OFF_DEB_120US			(0 << 5)
+#define T_OFF_DEB_2MS			(1 << 5)
+#define T_ON_STEP_MASK			0x18
+#define T_ON_STEP_1MS			(0 << 3)
+#define T_ON_STEP_2MS			(1 << 3)
+#define T_ON_STEP_4MS			(2 << 3)
+#define T_ON_STEP_8MS			(3 << 3)
+#define T_OFF_STEP_MASK			0x06
+#define T_OFF_STEP_2MS			(0 << 1)
+#define T_OFF_STEP_4MS			(1 << 1)
+#define T_OFF_STEP_8MS			(2 << 1)
+#define T_OFF_STEP_16MS			(3 << 1)
+#define T_RESTART_MASK			0x01
+#define T_RESTART_250MS			0
+#define T_RESTART_500MS			1
+
 /* PCA9450_REG_RESET_CTRL bits */
 #define WDOG_B_CFG_MASK			0xC0
 #define WDOG_B_CFG_NONE			0x00
 #define WDOG_B_CFG_WARM			0x40
 #define WDOG_B_CFG_COLD_LDO12		0x80
 #define WDOG_B_CFG_COLD			0xC0
+#define T_PMIC_RST_DEB_MASK		0x07
+#define T_PMIC_RST_DEB_10MS		0x00
+#define T_PMIC_RST_DEB_50MS		0x01
+#define T_PMIC_RST_DEB_100MS		0x02
+#define T_PMIC_RST_DEB_500MS		0x03
+#define T_PMIC_RST_DEB_1S		0x04
+#define T_PMIC_RST_DEB_2S		0x05
+#define T_PMIC_RST_DEB_4S		0x06
+#define T_PMIC_RST_DEB_8S		0x07
 
 /* PCA9450_REG_CONFIG2 bits */
 #define I2C_LT_MASK			0x03
-- 
cgit v1.2.3


From d85b56af22f371409cbf667bab26f938e6528d2e Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 15 Oct 2025 17:56:30 +0200
Subject: efi: Fix trailing whitespace in header file

Resolve an issue with the coding style.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 0b9eb3d2ff97..60e994096e20 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -290,7 +290,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor,
 					 unsigned long *data_size, void *data);
 typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name,
 					      efi_guid_t *vendor);
-typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, 
+typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor,
 					 u32 attr, unsigned long data_size,
 					 void *data);
 typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count);
-- 
cgit v1.2.3


From 17029cdd8f9d0182a6499e0b7bfc6391e8463091 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 15 Oct 2025 17:56:33 +0200
Subject: efi/libstub: gop: Add support for reading EDID

Add support for EFI_EDID_DISCOVERED_PROTOCOL and EFI_EDID_ACTIVE_PROTOCOL
as defined in UEFI 2.8, sec 12.9. Define GUIDs and data structures in the
respective header files.

In the GOP setup function, read the EDID of the primary GOP device. First
try EFI_EDID_ACTIVE_PROTOCOL, which supports user-specified EDID data. Or
else try EFI_EDID_DISCOVERED_PROTOCOL, which returns the display device's
native EDID. If no EDID could be retrieved, clear the storage.

Rename efi_setup_gop() to efi_setup_graphics() to reflect the changes.
Let callers pass an optional instance of struct edid_info, if they are
interested.

While screen_info and edid_info come from the same device handle, they
should be considered independent data. The former refers to the graphics
mode, the latter refers to the display device. GOP devices might not
provide both.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efi-stub.c |  2 +-
 drivers/firmware/efi/libstub/efistub.h  | 31 +++++++++++++++++++++++++++-
 drivers/firmware/efi/libstub/gop.c      | 36 ++++++++++++++++++++++++++++++++-
 drivers/firmware/efi/libstub/x86-stub.c |  2 +-
 include/linux/efi.h                     |  2 ++
 5 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index 874f63b4a383..9cb814c5ba1b 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void)
 {
 	struct screen_info *si, tmp = {};
 
-	if (efi_setup_gop(&tmp) != EFI_SUCCESS)
+	if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS)
 		return NULL;
 
 	si = alloc_screen_info();
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index f5ba032863a9..b2fb0c3fa721 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -34,6 +34,9 @@
 #define EFI_ALLOC_LIMIT		ULONG_MAX
 #endif
 
+struct edid_info;
+struct screen_info;
+
 extern bool efi_no5lvl;
 extern bool efi_nochunk;
 extern bool efi_nokaslr;
@@ -578,6 +581,32 @@ union efi_graphics_output_protocol {
 	} mixed_mode;
 };
 
+typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t;
+
+union efi_edid_discovered_protocol {
+	struct {
+		u32 size_of_edid;
+		u8 *edid;
+	};
+	struct {
+		u32 size_of_edid;
+		u32 edid;
+	} mixed_mode;
+};
+
+typedef union efi_edid_active_protocol efi_edid_active_protocol_t;
+
+union efi_edid_active_protocol {
+	struct {
+		u32 size_of_edid;
+		u8 *edid;
+	};
+	struct {
+		u32 size_of_edid;
+		u32 edid;
+	} mixed_mode;
+};
+
 typedef union {
 	struct {
 		u32			revision;
@@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline);
 
 void efi_parse_option_graphics(char *option);
 
-efi_status_t efi_setup_gop(struct screen_info *si);
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid);
 
 efi_status_t handle_cmdline_files(efi_loaded_image_t *image,
 				  const efi_char16_t *optstr,
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
index 02459ef0f18c..72d74436a7a4 100644
--- a/drivers/firmware/efi/libstub/gop.c
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <asm/efi.h>
 #include <asm/setup.h>
+#include <video/edid.h>
 
 #include "efistub.h"
 
@@ -413,6 +414,14 @@ static void setup_screen_info(struct screen_info *si, const efi_graphics_output_
 	si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
 }
 
+static void setup_edid_info(struct edid_info *edid, u32 gop_size_of_edid, u8 *gop_edid)
+{
+	if (!gop_edid || gop_size_of_edid < 128)	/* no EDID, or fewer than 128 bytes: clear */
+		memset(edid->dummy, 0, sizeof(edid->dummy));
+	else	/* copy the firmware's EDID, capped at sizeof(edid->dummy) bytes */
+		memcpy(edid->dummy, gop_edid, min(gop_size_of_edid, sizeof(edid->dummy)));
+}
+
 static efi_handle_t find_handle_with_primary_gop(unsigned long num, const efi_handle_t handles[],
 						 efi_graphics_output_protocol_t **found_gop)
 {
@@ -469,7 +478,7 @@ static efi_handle_t find_handle_with_primary_gop(unsigned long num, const efi_ha
 	return first_gop_handle;
 }
 
-efi_status_t efi_setup_gop(struct screen_info *si)
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid)
 {
 	efi_handle_t *handles __free(efi_pool) = NULL;
 	efi_handle_t handle;
@@ -494,5 +503,30 @@ efi_status_t efi_setup_gop(struct screen_info *si)
 	if (si)
 		setup_screen_info(si, gop);
 
+	/* Display EDID for primary GOP */
+	if (edid) {
+		efi_edid_discovered_protocol_t *discovered_edid;
+		efi_edid_active_protocol_t *active_edid;
+		u32 gop_size_of_edid = 0;
+		u8 *gop_edid = NULL;
+
+		status = efi_bs_call(handle_protocol, handle, &EFI_EDID_ACTIVE_PROTOCOL_GUID,
+				     (void **)&active_edid);
+		if (status == EFI_SUCCESS) {
+			gop_size_of_edid = active_edid->size_of_edid;
+			gop_edid = active_edid->edid;
+		} else {
+			status = efi_bs_call(handle_protocol, handle,
+					     &EFI_EDID_DISCOVERED_PROTOCOL_GUID,
+					     (void **)&discovered_edid);
+			if (status == EFI_SUCCESS) {
+				gop_size_of_edid = discovered_edid->size_of_edid;
+				gop_edid = discovered_edid->edid;
+			}
+		}
+
+		setup_edid_info(edid, gop_size_of_edid, gop_edid);
+	}
+
 	return EFI_SUCCESS;
 }
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index dd9e19c31dd7..6e51cca72684 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -488,7 +488,7 @@ static void setup_graphics(struct boot_params *boot_params)
 {
 	struct screen_info *si = memset(&boot_params->screen_info, 0, sizeof(*si));
 
-	efi_setup_gop(si);
+	efi_setup_graphics(si, NULL);
 }
 
 static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status)
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 60e994096e20..a01f3fe20dab 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -373,6 +373,8 @@ void efi_native_runtime_setup(void);
 #define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID	EFI_GUID(0x8b843e20, 0x8132, 0x4852,  0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c)
 #define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID	EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2,  0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e)
 #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID	EFI_GUID(0x9042a9de, 0x23dc, 0x4a38,  0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a)
+#define EFI_EDID_DISCOVERED_PROTOCOL_GUID	EFI_GUID(0x1c0c34f6, 0xd380, 0x41fa,  0xa0, 0x49, 0x8a, 0xd0, 0x6c, 0x1a, 0x66, 0xaa)
+#define EFI_EDID_ACTIVE_PROTOCOL_GUID		EFI_GUID(0xbd8c1056, 0x9f36, 0x44ec,  0x92, 0xa8, 0xa6, 0x33, 0x7f, 0x81, 0x79, 0x86)
 #define EFI_PCI_IO_PROTOCOL_GUID		EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5,  0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a)
 #define EFI_FILE_INFO_ID			EFI_GUID(0x09576e92, 0x6d3f, 0x11d2,  0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
 #define EFI_SYSTEM_RESOURCE_TABLE_GUID		EFI_GUID(0xb122a263, 0x3661, 0x4f68,  0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80)
-- 
cgit v1.2.3


From 4d24145a7833c14a6521dfab57c5f10076a0110f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 11 Nov 2025 15:49:45 +0100
Subject: devres: Remove unused devm_free_percpu()

Remove unused devm_free_percpu().

By the way, it was never used in the drivers/ from day 1.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251111145046.997309-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/base/devres.c                            | 25 ------------------------
 include/linux/device.h                           |  1 -
 3 files changed, 27 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 2b36ebde9cec..0198ac65e874 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -383,7 +383,6 @@ NET
 
 PER-CPU MEM
   devm_alloc_percpu()
-  devm_free_percpu()
 
 PCI
   devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index c948c88d3956..f54db6d138ab 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -1222,13 +1222,6 @@ static void devm_percpu_release(struct device *dev, void *pdata)
 	free_percpu(p);
 }
 
-static int devm_percpu_match(struct device *dev, void *data, void *p)
-{
-	struct devres *devr = container_of(data, struct devres, data);
-
-	return *(void **)devr->data == p;
-}
-
 /**
  * __devm_alloc_percpu - Resource-managed alloc_percpu
  * @dev: Device to allocate per-cpu memory for
@@ -1264,21 +1257,3 @@ void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
 	return pcpu;
 }
 EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
-
-/**
- * devm_free_percpu - Resource-managed free_percpu
- * @dev: Device this memory belongs to
- * @pdata: Per-cpu memory to free
- *
- * Free memory allocated with devm_alloc_percpu().
- */
-void devm_free_percpu(struct device *dev, void __percpu *pdata)
-{
-	/*
-	 * Use devres_release() to prevent memory leakage as
-	 * devm_free_pages() does.
-	 */
-	WARN_ON(devres_release(dev, devm_percpu_release, devm_percpu_match,
-			       (void *)(__force unsigned long)pdata));
-}
-EXPORT_SYMBOL_GPL(devm_free_percpu);
diff --git a/include/linux/device.h b/include/linux/device.h
index b031ff71a5bd..0c6377f6631c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -298,7 +298,6 @@ void device_remove_bin_file(struct device *dev,
 
 void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
 				   size_t align);
-void devm_free_percpu(struct device *dev, void __percpu *pdata);
 
 struct device_dma_parameters {
 	/*
-- 
cgit v1.2.3


From 42adb2d4ef24d2834cbd3bb96a6660826ae763da Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 14 Nov 2025 13:04:06 -0800
Subject: fs: Add the __data_racy annotation to backing_dev_info.ra_pages

Some but not all .ra_pages changes happen while block layer I/O is paused
with blk_mq_freeze_queue(). Filesystems may read .ra_pages even while
block layer I/O is paused, e.g. from inside their .fadvise callback.
Annotating all .ra_pages reads with READ_ONCE() would be cumbersome.
Hence, add the __data_racy annotation to the .ra_pages member
variable.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev-defs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c5c9d89c73ed..30f4bd9ff7c8 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -168,7 +168,9 @@ struct backing_dev_info {
 	u64 id;
 	struct rb_node rb_node; /* keyed by ->id */
 	struct list_head bdi_list;
-	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
+	/* max readahead in PAGE_SIZE units */
+	unsigned long __data_racy ra_pages;
+
 	unsigned long io_pages;	/* max allowed IO size */
 
 	struct kref refcnt;	/* Reference counter for the structure */
-- 
cgit v1.2.3


From 935a20d1bebf6236076785fac3ff81e3931834e9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 14 Nov 2025 13:04:07 -0800
Subject: block: Remove queue freezing from several sysfs store callbacks

Freezing the request queue from inside sysfs store callbacks may cause a
deadlock in combination with the dm-multipath driver and the
queue_if_no_path option. Additionally, freezing the request queue slows
down system boot on systems where sysfs attributes are set synchronously.

Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue()
calls from the store callbacks that do not strictly need them.
Add the __data_racy annotation to request_queue.rq_timeout to suppress
KCSAN data race reports about the rq_timeout reads.

This patch may cause a small delay in applying the new settings.

For all the attributes affected by this patch, I/O will complete
correctly whether the old or the new value of the attribute is used.

This patch affects the following sysfs attributes:
* io_poll_delay
* io_timeout
* nomerges
* read_ahead_kb
* rq_affinity

Here is an example of a deadlock triggered by running test srp/002
if this patch is not applied:

task:multipathd
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 schedule_preempt_disabled+0x1c/0x30
 __mutex_lock+0xb89/0x1650
 mutex_lock_nested+0x1f/0x30
 dm_table_set_restrictions+0x823/0xdf0
 __bind+0x166/0x590
 dm_swap_table+0x2a7/0x490
 do_resume+0x1b1/0x610
 dev_suspend+0x55/0x1a0
 ctl_ioctl+0x3a5/0x7e0
 dm_ctl_ioctl+0x12/0x20
 __x64_sys_ioctl+0x127/0x1a0
 x64_sys_call+0xe2b/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>
task:(udev-worker)
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 blk_mq_freeze_queue_wait+0xf2/0x140
 blk_mq_freeze_queue_nomemsave+0x23/0x30
 queue_ra_store+0x14e/0x290
 queue_attr_store+0x23e/0x2c0
 sysfs_kf_write+0xde/0x140
 kernfs_fop_write_iter+0x3b2/0x630
 vfs_write+0x4fd/0x1390
 ksys_write+0xfd/0x230
 __x64_sys_write+0x76/0xc0
 x64_sys_call+0x276/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Cc: Martin Wilck <mwilck@suse.com>
Cc: Benjamin Marzinski <bmarzins@redhat.com>
Cc: stable@vger.kernel.org
Fixes: af2814149883 ("block: freeze the queue in queue_attr_store")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c      | 26 ++++++++------------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 76c47fe9b8d6..8684c57498cc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -143,21 +143,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
 {
 	unsigned long ra_kb;
 	ssize_t ret;
-	unsigned int memflags;
 	struct request_queue *q = disk->queue;
 
 	ret = queue_var_store(&ra_kb, page, count);
 	if (ret < 0)
 		return ret;
 	/*
-	 * ->ra_pages is protected by ->limits_lock because it is usually
-	 * calculated from the queue limits by queue_limits_commit_update.
+	 * The ->ra_pages change below is protected by ->limits_lock because it
+	 * is usually calculated from the queue limits by
+	 * queue_limits_commit_update().
+	 *
+	 * bdi->ra_pages reads are not serialized against bdi->ra_pages writes.
+	 * Use WRITE_ONCE() to write bdi->ra_pages once.
 	 */
 	mutex_lock(&q->limits_lock);
-	memflags = blk_mq_freeze_queue(q);
-	disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+	WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10));
 	mutex_unlock(&q->limits_lock);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	return ret;
 }
@@ -375,21 +376,18 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
 				    size_t count)
 {
 	unsigned long nm;
-	unsigned int memflags;
 	struct request_queue *q = disk->queue;
 	ssize_t ret = queue_var_store(&nm, page, count);
 
 	if (ret < 0)
 		return ret;
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
 	blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
 	if (nm == 2)
 		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	else if (nm)
 		blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	return ret;
 }
@@ -409,7 +407,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
 #ifdef CONFIG_SMP
 	struct request_queue *q = disk->queue;
 	unsigned long val;
-	unsigned int memflags;
 
 	ret = queue_var_store(&val, page, count);
 	if (ret < 0)
@@ -421,7 +418,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
 	 * are accessed individually using atomic test_bit operation. So we
 	 * don't grab any lock while updating these flags.
 	 */
-	memflags = blk_mq_freeze_queue(q);
 	if (val == 2) {
 		blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
 		blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
@@ -432,7 +428,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
 		blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
 		blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
 	}
-	blk_mq_unfreeze_queue(q, memflags);
 #endif
 	return ret;
 }
@@ -446,11 +441,9 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
 static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
 				size_t count)
 {
-	unsigned int memflags;
 	ssize_t ret = count;
 	struct request_queue *q = disk->queue;
 
-	memflags = blk_mq_freeze_queue(q);
 	if (!(q->limits.features & BLK_FEAT_POLL)) {
 		ret = -EINVAL;
 		goto out;
@@ -459,7 +452,6 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
 	pr_info_ratelimited("writes to the poll attribute are ignored.\n");
 	pr_info_ratelimited("please use driver specific parameters instead.\n");
 out:
-	blk_mq_unfreeze_queue(q, memflags);
 	return ret;
 }
 
@@ -472,7 +464,7 @@ static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
 static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
 				  size_t count)
 {
-	unsigned int val, memflags;
+	unsigned int val;
 	int err;
 	struct request_queue *q = disk->queue;
 
@@ -480,9 +472,7 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
 	if (err || val == 0)
 		return -EINVAL;
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_queue_rq_timeout(q, msecs_to_jiffies(val));
-	blk_mq_unfreeze_queue(q, memflags);
 
 	return count;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fff8a80dbd2..cb4ba09959ee 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -495,7 +495,7 @@ struct request_queue {
 	 */
 	unsigned long		queue_flags;
 
-	unsigned int		rq_timeout;
+	unsigned int __data_racy rq_timeout;
 
 	unsigned int		queue_depth;
 
-- 
cgit v1.2.3


From b190eaea57803da00a4318ba12359625337be9e8 Mon Sep 17 00:00:00 2001
From: Taniya Das <taniya.das@oss.qualcomm.com>
Date: Tue, 18 Nov 2025 12:47:08 +0530
Subject: dt-bindings: clock: qcom: Add SM8750 video clock controller

Add compatible string for SM8750 video clock controller and the bindings
for SM8750 Qualcomm SoC.

Signed-off-by: Taniya Das <taniya.das@oss.qualcomm.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20251118-sm8750-videocc-v2-v4-4-049882a70c9f@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,sm8450-videocc.yaml        |  5 ++-
 include/dt-bindings/clock/qcom,sm8750-videocc.h    | 40 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/clock/qcom,sm8750-videocc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml
index fcd2727dae46..b31bd8335529 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
 title: Qualcomm Video Clock & Reset Controller on SM8450
 
 maintainers:
-  - Taniya Das <quic_tdas@quicinc.com>
+  - Taniya Das <taniya.das@oss.qualcomm.com>
   - Jagadeesh Kona <quic_jkona@quicinc.com>
 
 description: |
@@ -17,6 +17,7 @@ description: |
   See also:
     include/dt-bindings/clock/qcom,sm8450-videocc.h
     include/dt-bindings/clock/qcom,sm8650-videocc.h
+    include/dt-bindings/clock/qcom,sm8750-videocc.h
 
 properties:
   compatible:
@@ -25,6 +26,7 @@ properties:
       - qcom,sm8475-videocc
       - qcom,sm8550-videocc
       - qcom,sm8650-videocc
+      - qcom,sm8750-videocc
       - qcom,x1e80100-videocc
 
   clocks:
@@ -61,6 +63,7 @@ allOf:
             enum:
               - qcom,sm8450-videocc
               - qcom,sm8550-videocc
+              - qcom,sm8750-videocc
     then:
       required:
         - required-opps
diff --git a/include/dt-bindings/clock/qcom,sm8750-videocc.h b/include/dt-bindings/clock/qcom,sm8750-videocc.h
new file mode 100644
index 000000000000..f3bfa2ba5160
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sm8750-videocc.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_VIDEO_CC_SM8750_H
+#define _DT_BINDINGS_CLK_QCOM_VIDEO_CC_SM8750_H
+
+/* VIDEO_CC clocks */
+#define VIDEO_CC_AHB_CLK					0
+#define VIDEO_CC_AHB_CLK_SRC					1
+#define VIDEO_CC_MVS0_CLK					2
+#define VIDEO_CC_MVS0_CLK_SRC					3
+#define VIDEO_CC_MVS0_DIV_CLK_SRC				4
+#define VIDEO_CC_MVS0_FREERUN_CLK				5
+#define VIDEO_CC_MVS0_SHIFT_CLK					6
+#define VIDEO_CC_MVS0C_CLK					7
+#define VIDEO_CC_MVS0C_DIV2_DIV_CLK_SRC				8
+#define VIDEO_CC_MVS0C_FREERUN_CLK				9
+#define VIDEO_CC_MVS0C_SHIFT_CLK				10
+#define VIDEO_CC_PLL0						11
+#define VIDEO_CC_SLEEP_CLK					12
+#define VIDEO_CC_SLEEP_CLK_SRC					13
+#define VIDEO_CC_XO_CLK						14
+#define VIDEO_CC_XO_CLK_SRC					15
+
+/* VIDEO_CC power domains */
+#define VIDEO_CC_MVS0_GDSC					0
+#define VIDEO_CC_MVS0C_GDSC					1
+
+/* VIDEO_CC resets */
+#define VIDEO_CC_INTERFACE_BCR					0
+#define VIDEO_CC_MVS0_BCR					1
+#define VIDEO_CC_MVS0C_CLK_ARES					2
+#define VIDEO_CC_MVS0C_BCR					3
+#define VIDEO_CC_MVS0_FREERUN_CLK_ARES				4
+#define VIDEO_CC_MVS0C_FREERUN_CLK_ARES				5
+#define VIDEO_CC_XO_CLK_ARES					6
+
+#endif
-- 
cgit v1.2.3


From c84b824d3a8f14bedec8108cb8061da761180f49 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Date: Tue, 18 Nov 2025 18:33:11 +0100
Subject: dt-bindings: clock: qcom: x1e80100-dispcc: Add USB4 router link
 resets

The router link clock branches also feature some reset logic, which is
required to properly power sequence the hardware for DP tunneling over
USB4.

Describe these missing resets.

Signed-off-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251118-topic-usb4_x1e_dispcc-v1-1-14c68d842c71@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,x1e80100-dispcc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,x1e80100-dispcc.h b/include/dt-bindings/clock/qcom,x1e80100-dispcc.h
index d4a83e4fd0d1..49b3a9e5ce4a 100644
--- a/include/dt-bindings/clock/qcom,x1e80100-dispcc.h
+++ b/include/dt-bindings/clock/qcom,x1e80100-dispcc.h
@@ -90,6 +90,9 @@
 #define DISP_CC_MDSS_CORE_BCR					0
 #define DISP_CC_MDSS_CORE_INT2_BCR				1
 #define DISP_CC_MDSS_RSCC_BCR					2
+#define DISP_CC_MDSS_DPTX0_USB_ROUTER_LINK_INTF_CLK_ARES	3
+#define DISP_CC_MDSS_DPTX1_USB_ROUTER_LINK_INTF_CLK_ARES	4
+#define DISP_CC_MDSS_DPTX2_USB_ROUTER_LINK_INTF_CLK_ARES	5
 
 /* DISP_CC GDSCR */
 #define MDSS_GDSC						0
-- 
cgit v1.2.3


From 0e854e55356908386605714e66f98c3985d9e266 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Fri, 14 Nov 2025 12:13:23 -0800
Subject: bpf: Always charge/uncharge memory when allocating/unlinking storage
 elements

Since commit a96a44aba556 ("bpf: bpf_sk_storage: Fix invalid wait
context lockdep report"), {charge,uncharge}_mem are always true when
allocating a bpf_local_storage_elem or unlinking a bpf_local_storage_elem
from local storage, so drop these arguments. No functional change.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-2-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  2 +-
 kernel/bpf/bpf_local_storage.c    | 22 ++++++++++------------
 net/core/bpf_sk_storage.c         |  2 +-
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 782f58feea35..3663eabcc3ff 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -184,7 +184,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
-		bool charge_mem, bool swap_uptrs, gfp_t gfp_flags);
+		bool swap_uptrs, gfp_t gfp_flags);
 
 void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b931fbceb54d..400bdf8a3eb2 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,11 +73,11 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
+		void *value, bool swap_uptrs, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
-	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+	if (mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
 	if (smap->bpf_ma) {
@@ -106,8 +106,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		return selem;
 	}
 
-	if (charge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
+	mem_uncharge(smap, owner, smap->elem_size);
 
 	return NULL;
 }
@@ -284,7 +283,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
  */
 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 					    struct bpf_local_storage_elem *selem,
-					    bool uncharge_mem, struct hlist_head *free_selem_list)
+					    struct hlist_head *free_selem_list)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -297,8 +296,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	 * The owner may be freed once the last selem is unlinked
 	 * from local_storage.
 	 */
-	if (uncharge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
+	mem_uncharge(smap, owner, smap->elem_size);
 
 	free_local_storage = hlist_is_singular_node(&selem->snode,
 						    &local_storage->list);
@@ -393,7 +391,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, &selem_free_list);
+			local_storage, selem, &selem_free_list);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	bpf_selem_free_list(&selem_free_list, reuse_now);
@@ -582,7 +580,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
@@ -616,7 +614,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	/* A lookup has just been done before and concluded a new selem is
 	 * needed. The chance of an unnecessary alloc is unlikely.
 	 */
-	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
 	if (!alloc_selem)
 		return ERR_PTR(-ENOMEM);
 
@@ -656,7 +654,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						true, &old_selem_free_list);
+						&old_selem_free_list);
 	}
 
 unlock:
@@ -762,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, &free_selem_list);
+			local_storage, selem, &free_selem_list);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d3fbaf89a698..bd3c686edc0b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -136,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
 {
 	struct bpf_local_storage_elem *copy_selem;
 
-	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
+	copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
 	if (!copy_selem)
 		return NULL;
 
-- 
cgit v1.2.3


From e76a33e1c7186526c2c133af73ea70da9275e1ba Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Fri, 14 Nov 2025 12:13:24 -0800
Subject: bpf: Remove smap argument from bpf_selem_free()

Since selem already saves a pointer to smap, use it instead of an
additional argument in bpf_selem_free(). This requires moving the
SDATA(selem)->smap assignment from bpf_selem_link_map() to
bpf_selem_alloc() since bpf_selem_free() may be called without the
selem being linked to smap in bpf_local_storage_update().

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-3-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  1 -
 kernel/bpf/bpf_local_storage.c    | 19 ++++++++++---------
 net/core/bpf_sk_storage.c         |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 3663eabcc3ff..4ab137e75f33 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -187,7 +187,6 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool swap_uptrs, gfp_t gfp_flags);
 
 void bpf_selem_free(struct bpf_local_storage_elem *selem,
-		    struct bpf_local_storage_map *smap,
 		    bool reuse_now);
 
 int
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 400bdf8a3eb2..95a5ea618cc5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -97,6 +97,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	}
 
 	if (selem) {
+		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+
 		if (value) {
 			/* No need to call check_and_init_map_value as memory is zero init */
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -227,9 +229,12 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 }
 
 void bpf_selem_free(struct bpf_local_storage_elem *selem,
-		    struct bpf_local_storage_map *smap,
 		    bool reuse_now)
 {
+	struct bpf_local_storage_map *smap;
+
+	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
 	if (!smap->bpf_ma) {
 		/* Only task storage has uptrs and task storage
 		 * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
@@ -263,7 +268,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
 {
 	struct bpf_local_storage_elem *selem;
-	struct bpf_local_storage_map *smap;
 	struct hlist_node *n;
 
 	/* The "_safe" iteration is needed.
@@ -271,10 +275,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
 	 * but bpf_selem_free will use the selem->rcu_head
 	 * which is union-ized with the selem->free_node.
 	 */
-	hlist_for_each_entry_safe(selem, n, list, free_node) {
-		smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-		bpf_selem_free(selem, smap, reuse_now);
-	}
+	hlist_for_each_entry_safe(selem, n, list, free_node)
+		bpf_selem_free(selem, reuse_now);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -432,7 +434,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&b->lock, flags);
-	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
@@ -586,7 +587,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 
 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
-			bpf_selem_free(selem, smap, true);
+			bpf_selem_free(selem, true);
 			mem_uncharge(smap, owner, smap->elem_size);
 			return ERR_PTR(err);
 		}
@@ -662,7 +663,7 @@ unlock:
 	bpf_selem_free_list(&old_selem_free_list, false);
 	if (alloc_selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		bpf_selem_free(alloc_selem, smap, true);
+		bpf_selem_free(alloc_selem, true);
 	}
 	return err ? ERR_PTR(err) : SDATA(selem);
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index bd3c686edc0b..850dd736ccd1 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -196,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		} else {
 			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
-				bpf_selem_free(copy_selem, smap, true);
+				bpf_selem_free(copy_selem, true);
 				atomic_sub(smap->elem_size,
 					   &newsk->sk_omem_alloc);
 				bpf_map_put(map);
-- 
cgit v1.2.3


From 39a460c4253e4a437b6b372f462c0c043026784d Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Fri, 14 Nov 2025 12:13:25 -0800
Subject: bpf: Save memory allocation info in bpf_local_storage

Save the memory allocation method used for bpf_local_storage in the
struct explicitly so that we don't need to go through the hassle to
find out the info. When a later patch replaces BPF memory allocator
with kmalloc_nolock(), bpf_local_storage_free() will no longer need
smap->storage_ma to return the memory and completely remove the
dependency on smap in bpf_local_storage_free().

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-4-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  1 +
 kernel/bpf/bpf_local_storage.c    | 52 ++++++---------------------------------
 2 files changed, 9 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 4ab137e75f33..7fef0cec8340 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -97,6 +97,7 @@ struct bpf_local_storage {
 				 */
 	struct rcu_head rcu;
 	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
+	bool bpf_ma;
 };
 
 /* U16_MAX is much more than enough for sk local storage
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 95a5ea618cc5..3c04b9d85860 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -157,12 +157,12 @@ static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
 
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_map *smap,
-				   bool bpf_ma, bool reuse_now)
+				   bool reuse_now)
 {
 	if (!local_storage)
 		return;
 
-	if (!bpf_ma) {
+	if (!local_storage->bpf_ma) {
 		__bpf_local_storage_free(local_storage, reuse_now);
 		return;
 	}
@@ -336,47 +336,12 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	return free_local_storage;
 }
 
-static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
-				 struct bpf_local_storage_map *storage_smap,
-				 struct bpf_local_storage_elem *selem)
-{
-
-	struct bpf_local_storage_map *selem_smap;
-
-	/* local_storage->smap may be NULL. If it is, get the bpf_ma
-	 * from any selem in the local_storage->list. The bpf_ma of all
-	 * local_storage and selem should have the same value
-	 * for the same map type.
-	 *
-	 * If the local_storage->list is already empty, the caller will not
-	 * care about the bpf_ma value also because the caller is not
-	 * responsible to free the local_storage.
-	 */
-
-	if (storage_smap)
-		return storage_smap->bpf_ma;
-
-	if (!selem) {
-		struct hlist_node *n;
-
-		n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
-					  bpf_rcu_lock_held());
-		if (!n)
-			return false;
-
-		selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
-	}
-	selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
-	return selem_smap->bpf_ma;
-}
-
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 				     bool reuse_now)
 {
 	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
-	bool bpf_ma, free_local_storage = false;
+	bool free_local_storage = false;
 	HLIST_HEAD(selem_free_list);
 	unsigned long flags;
 
@@ -388,7 +353,6 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 					      bpf_rcu_lock_held());
 	storage_smap = rcu_dereference_check(local_storage->smap,
 					     bpf_rcu_lock_held());
-	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
 
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
@@ -399,7 +363,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	bpf_selem_free_list(&selem_free_list, reuse_now);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
+		bpf_local_storage_free(local_storage, storage_smap, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -506,6 +470,7 @@ int bpf_local_storage_alloc(void *owner,
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
 	storage->owner = owner;
+	storage->bpf_ma = smap->bpf_ma;
 
 	bpf_selem_link_storage_nolock(storage, first_selem);
 	bpf_selem_link_map(smap, first_selem);
@@ -542,7 +507,7 @@ int bpf_local_storage_alloc(void *owner,
 	return 0;
 
 uncharge:
-	bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
+	bpf_local_storage_free(storage, smap, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -731,13 +696,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
 	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
-	bool bpf_ma, free_storage = false;
+	bool free_storage = false;
 	HLIST_HEAD(free_selem_list);
 	struct hlist_node *n;
 	unsigned long flags;
 
 	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
-	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
 
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
@@ -768,7 +732,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	bpf_selem_free_list(&free_selem_list, true);
 
 	if (free_storage)
-		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
+		bpf_local_storage_free(local_storage, storage_smap, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
-- 
cgit v1.2.3


From f484f4a3e058b5641670ebaeb301c06589848521 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Fri, 14 Nov 2025 12:13:26 -0800
Subject: bpf: Replace bpf memory allocator with kmalloc_nolock() in local
 storage

Replace bpf memory allocator with kmalloc_nolock() to reduce memory
wastage due to preallocation.

In bpf_selem_free(), an selem now needs to wait for an RCU grace period
before being freed when reuse_now == true. Therefore, rcu_barrier()
should always be called in bpf_local_storage_map_free().

In bpf_local_storage_free(), since smap->storage_ma is no longer needed
to return the memory, the function is now independent from smap.

Remove the outdated comment in bpf_local_storage_alloc(). We already
free selem after an RCU grace period in bpf_local_storage_update() when
bpf_local_storage_alloc() failed the cmpxchg since commit c0d63f309186
("bpf: Add bpf_selem_free()").

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-5-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |   8 +-
 kernel/bpf/bpf_local_storage.c    | 152 +++++++++++---------------------------
 2 files changed, 48 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 7fef0cec8340..66432248cd81 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -53,9 +53,7 @@ struct bpf_local_storage_map {
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
-	struct bpf_mem_alloc selem_ma;
-	struct bpf_mem_alloc storage_ma;
-	bool bpf_ma;
+	bool use_kmalloc_nolock;
 };
 
 struct bpf_local_storage_data {
@@ -97,7 +95,7 @@ struct bpf_local_storage {
 				 */
 	struct rcu_head rcu;
 	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
-	bool bpf_ma;
+	bool use_kmalloc_nolock;
 };
 
 /* U16_MAX is much more than enough for sk local storage
@@ -131,7 +129,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache,
-			    bool bpf_ma);
+			    bool use_kmalloc_nolock);
 
 void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
 				      struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 3c04b9d85860..e2fe6c32822b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -80,17 +80,9 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
-	if (smap->bpf_ma) {
-		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
-		if (selem)
-			/* Keep the original bpf_map_kzalloc behavior
-			 * before started using the bpf_mem_cache_alloc.
-			 *
-			 * No need to use zero_map_value. The bpf_selem_free()
-			 * only does bpf_mem_cache_free when there is
-			 * no other bpf prog is using the selem.
-			 */
-			memset(SDATA(selem)->data, 0, smap->map.value_size);
+	if (smap->use_kmalloc_nolock) {
+		selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+					       __GFP_ZERO, NUMA_NO_NODE);
 	} else {
 		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
 					gfp_flags | __GFP_NOWARN);
@@ -113,7 +105,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
 static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
@@ -128,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 		kfree_rcu(local_storage, rcu);
 }
 
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+				     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(local_storage, rcu);
+	else
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     __bpf_local_storage_free_trace_rcu);
+}
+
 static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-	bpf_mem_cache_raw_free(local_storage);
+	kfree_nolock(local_storage);
 }
 
 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
@@ -144,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
-/* Handle bpf_ma == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
-				     bool vanilla_rcu)
-{
-	if (vanilla_rcu)
-		kfree_rcu(local_storage, rcu);
-	else
-		call_rcu_tasks_trace(&local_storage->rcu,
-				     __bpf_local_storage_free_trace_rcu);
-}
-
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
-				   struct bpf_local_storage_map *smap,
 				   bool reuse_now)
 {
 	if (!local_storage)
 		return;
 
-	if (!local_storage->bpf_ma) {
+	if (!local_storage->use_kmalloc_nolock) {
 		__bpf_local_storage_free(local_storage, reuse_now);
 		return;
 	}
 
-	if (!reuse_now) {
-		call_rcu_tasks_trace(&local_storage->rcu,
-				     bpf_local_storage_free_trace_rcu);
+	if (reuse_now) {
+		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 		return;
 	}
 
-	if (smap)
-		bpf_mem_cache_free(&smap->storage_ma, local_storage);
-	else
-		/* smap could be NULL if the selem that triggered
-		 * this 'local_storage' creation had been long gone.
-		 * In this case, directly do call_rcu().
-		 */
-		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+	call_rcu_tasks_trace(&local_storage->rcu,
+			     bpf_local_storage_free_trace_rcu);
 }
 
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
 static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
@@ -195,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 		kfree_rcu(selem, rcu);
 }
 
-/* Handle bpf_ma == false */
+/* Handle use_kmalloc_nolock == false */
 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
 			     bool vanilla_rcu)
 {
@@ -217,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 	migrate_disable();
 	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 	migrate_enable();
-	bpf_mem_cache_raw_free(selem);
+	kfree_nolock(selem);
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -235,11 +219,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 
 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
 
-	if (!smap->bpf_ma) {
-		/* Only task storage has uptrs and task storage
-		 * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
-		 * for task storage, so this bpf_obj_free_fields() won't unpin
-		 * any uptr.
+	if (!smap->use_kmalloc_nolock) {
+		/*
+		 * No uptr will be unpinned even when reuse_now == false,
+		 * since uptr is only supported in task local storage, where
+		 * smap->use_kmalloc_nolock == true.
 		 */
 		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 		__bpf_selem_free(selem, reuse_now);
@@ -247,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 	}
 
 	if (reuse_now) {
-		/* reuse_now == true only happens when the storage owner
-		 * (e.g. task_struct) is being destructed or the map itself
-		 * is being destructed (ie map_free). In both cases,
-		 * no bpf prog can have a hold on the selem. It is
-		 * safe to unpin the uptrs and free the selem now.
-		 */
-		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-		/* Instead of using the vanilla call_rcu(),
-		 * bpf_mem_cache_free will be able to reuse selem
-		 * immediately.
+		/*
+		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
 		 */
-		bpf_mem_cache_free(&smap->selem_ma, selem);
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
 		return;
 	}
 
@@ -339,7 +316,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 				     bool reuse_now)
 {
-	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
 	HLIST_HEAD(selem_free_list);
@@ -351,8 +327,6 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 
 	local_storage = rcu_dereference_check(selem->local_storage,
 					      bpf_rcu_lock_held());
-	storage_smap = rcu_dereference_check(local_storage->smap,
-					     bpf_rcu_lock_held());
 
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
@@ -363,7 +337,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	bpf_selem_free_list(&selem_free_list, reuse_now);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, storage_smap, reuse_now);
+		bpf_local_storage_free(local_storage, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -456,8 +430,9 @@ int bpf_local_storage_alloc(void *owner,
 	if (err)
 		return err;
 
-	if (smap->bpf_ma)
-		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+	if (smap->use_kmalloc_nolock)
+		storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+						 __GFP_ZERO, NUMA_NO_NODE);
 	else
 		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
 					  gfp_flags | __GFP_NOWARN);
@@ -470,7 +445,7 @@ int bpf_local_storage_alloc(void *owner,
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
 	storage->owner = owner;
-	storage->bpf_ma = smap->bpf_ma;
+	storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
 
 	bpf_selem_link_storage_nolock(storage, first_selem);
 	bpf_selem_link_map(smap, first_selem);
@@ -492,22 +467,12 @@ int bpf_local_storage_alloc(void *owner,
 		bpf_selem_unlink_map(first_selem);
 		err = -EAGAIN;
 		goto uncharge;
-
-		/* Note that even first_selem was linked to smap's
-		 * bucket->list, first_selem can be freed immediately
-		 * (instead of kfree_rcu) because
-		 * bpf_local_storage_map_free() does a
-		 * synchronize_rcu_mult (waiting for both sleepable and
-		 * normal programs) before walking the bucket->list.
-		 * Hence, no one is accessing selem from the
-		 * bucket->list under rcu_read_lock().
-		 */
 	}
 
 	return 0;
 
 uncharge:
-	bpf_local_storage_free(storage, smap, true);
+	bpf_local_storage_free(storage, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -694,15 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
-	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
 	bool free_storage = false;
 	HLIST_HEAD(free_selem_list);
 	struct hlist_node *n;
 	unsigned long flags;
 
-	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
-
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added to or deleted from the
@@ -732,7 +694,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	bpf_selem_free_list(&free_selem_list, true);
 
 	if (free_storage)
-		bpf_local_storage_free(local_storage, storage_smap, true);
+		bpf_local_storage_free(local_storage, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -745,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
 	return usage;
 }
 
-/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
- * A deadlock free allocator is useful for storage that the bpf prog can easily
- * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
- * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
- * memory immediately. To be reuse-immediate safe, the owner destruction
- * code path needs to go through a rcu grace period before calling
- * bpf_local_storage_destroy().
- *
- * When bpf_ma == false, the kmalloc and kfree are used.
- */
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache,
-			    bool bpf_ma)
+			    bool use_kmalloc_nolock)
 {
 	struct bpf_local_storage_map *smap;
 	unsigned int i;
@@ -792,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 
 	/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
 	 * preemptible context. Thus, enforce all storages to use
-	 * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+	 * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
 	 */
-	smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
-	if (smap->bpf_ma) {
-		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
-		if (err)
-			goto free_smap;
-
-		err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
-		if (err) {
-			bpf_mem_alloc_destroy(&smap->selem_ma);
-			goto free_smap;
-		}
-	}
+	smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
@@ -875,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
-	if (smap->bpf_ma) {
+	if (smap->use_kmalloc_nolock) {
 		rcu_barrier_tasks_trace();
-		if (!rcu_trace_implies_rcu_gp())
-			rcu_barrier();
-		bpf_mem_alloc_destroy(&smap->selem_ma);
-		bpf_mem_alloc_destroy(&smap->storage_ma);
+		rcu_barrier();
 	}
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
-- 
cgit v1.2.3


From fbb9933666e31f84c62e9620e9ec4d220ee31ab4 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Mon, 17 Nov 2025 23:42:08 +0200
Subject: net/mlx5: Abort new commands if all command slots are stalled

In case of a FW issue, FW might not respond to commands, causing a
kernel lockout for a long period of time, e.g. rtnl_lock being held
while ethtool tries to collect stats and waits for FW to respond to
multiple commands, all of which will eventually time out.

While there's no immediate indication of the FW lockout, we can safely
assume that something is wrong when all command slots are busy and in
a timeout state and no FW completion was received on any of them.

In such case, start immediately failing new commands.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 55 +++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                   |  1 +
 2 files changed, 56 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 722282cebce9..5b08e5ffe0e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -181,6 +181,7 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd, struct mlx5_cmd_work_ent *ent)
 static void cmd_free_index(struct mlx5_cmd *cmd, int idx)
 {
 	lockdep_assert_held(&cmd->alloc_lock);
+	cmd->ent_arr[idx] = NULL;
 	set_bit(idx, &cmd->vars.bitmask);
 }
 
@@ -1200,6 +1201,44 @@ out_err:
 	return err;
 }
 
+/* Check if all command slots are stalled (timed out and not recovered).
+ * Returns true if every slot timed out on a recent command and has not been
+ * completed by FW yet (the stalled state); returns false otherwise, i.e. at
+ * least one slot is not stalled.
+ *
+ * In this odd "all_stalled" situation, this serves as a protection mechanism
+ * to avoid blocking the kernel for long periods of time in case FW is not
+ * responding to commands.
+ */
+static bool mlx5_cmd_all_stalled(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	bool all_stalled = true;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+
+	/* at least one command slot is free */
+	if (bitmap_weight(&cmd->vars.bitmask, cmd->vars.max_reg_cmds) > 0) {
+		all_stalled = false;
+		goto out;
+	}
+
+	for_each_clear_bit(i, &cmd->vars.bitmask, cmd->vars.max_reg_cmds) {
+		struct mlx5_cmd_work_ent *ent = dev->cmd.ent_arr[i];
+
+		if (!test_bit(MLX5_CMD_ENT_STATE_TIMEDOUT, &ent->state)) {
+			all_stalled = false;
+			break;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+
+	return all_stalled;
+}
+
 /*  Notes:
  *    1. Callback functions may not sleep
  *    2. page queue commands do not support asynchrous completion
@@ -1230,6 +1269,15 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
 	if (callback && page_queue)
 		return -EINVAL;
 
+	if (!page_queue && mlx5_cmd_all_stalled(dev)) {
+		mlx5_core_err_rl(dev,
+				 "All CMD slots are stalled, aborting command\n");
+		/* there's no reason to wait and block the whole kernel if FW
+		 * isn't currently responding to all slots, fail immediately
+		 */
+		return -EAGAIN;
+	}
+
 	ent = cmd_alloc_ent(cmd, in, out, uout, uout_size,
 			    callback, context, page_queue);
 	if (IS_ERR(ent))
@@ -1700,6 +1748,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force
 		if (test_bit(i, &vector)) {
 			ent = cmd->ent_arr[i];
 
+			if (forced && ent->ret == -ETIMEDOUT)
+				set_bit(MLX5_CMD_ENT_STATE_TIMEDOUT,
+					&ent->state);
+			else if (!forced) /* real FW completion */
+				clear_bit(MLX5_CMD_ENT_STATE_TIMEDOUT,
+					  &ent->state);
+
 			/* if we already completed the command, ignore it */
 			if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP,
 						&ent->state)) {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 046396269ccf..7aec53371cf0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -819,6 +819,7 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
 
 enum {
 	MLX5_CMD_ENT_STATE_PENDING_COMP,
+	MLX5_CMD_ENT_STATE_TIMEDOUT,
 };
 
 struct mlx5_cmd_work_ent {
-- 
cgit v1.2.3


From 922a6f34c1756d2b0c35d9b2d915b8af19e85965 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 18 Nov 2025 10:46:31 +0800
Subject: autofs: dont trigger mount if it cant succeed

If a mount namespace contains autofs mounts that are propagation
private, and there is no namespace-specific automount daemon to handle
possible automounting, then attempted path resolution will loop until
MAXSYMLINKS is reached before failing, causing quite a bit of noise in
the log.

Add a check for this in autofs ->d_automount() so that the VFS can
immediately return an error in this case. Since the mount is propagation
private an EPERM return seems most appropriate.

Suggested-by: Christian Brauner <brauner@kernel.org>

Signed-off-by: Ian Kent <raven@themaw.net>
Link: https://patch.msgid.link/20251118024631.10854-2-raven@themaw.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/autofs/autofs_i.h  | 5 +++++
 fs/autofs/dev-ioctl.c | 1 +
 fs/autofs/inode.c     | 1 +
 fs/autofs/root.c      | 8 ++++++++
 fs/namespace.c        | 6 ++++++
 include/linux/fs.h    | 1 +
 6 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 23cea74f9933..4fd555528c5d 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
+#include <uapi/linux/mount.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/uaccess.h>
@@ -27,6 +28,9 @@
 #include <linux/magic.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include "../mount.h"
+#include <linux/ns_common.h>
+
 
 /* This is the range of ioctl() numbers we claim as ours */
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
@@ -114,6 +118,7 @@ struct autofs_sb_info {
 	int pipefd;
 	struct file *pipe;
 	struct pid *oz_pgrp;
+	u64 mnt_ns_id;
 	int version;
 	int sub_version;
 	int min_proto;
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index d8dd150cbd74..ed2efe4e97b2 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -381,6 +381,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		swap(sbi->oz_pgrp, new_pid);
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
+		sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
 		sbi->flags &= ~AUTOFS_SBI_CATATONIC;
 	}
 out:
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index f5c16ffba013..732aee76a24c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void)
 	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
 	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
 	sbi->pipefd = -1;
+	sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
 
 	set_autofs_type_indirect(&sbi->type);
 	mutex_init(&sbi->wq_mutex);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 174c7205fee4..d10df9d89d1c 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 	if (autofs_oz_mode(sbi))
 		return NULL;
 
+	/* Refuse to trigger mount if current namespace is not the owner
+	 * and the mount is propagation private.
+	 */
+	if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) {
+		if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE)
+			return ERR_PTR(-EPERM);
+	}
+
 	/*
 	 * If an expire request is pending everyone must wait.
 	 * If the expire fails we're still mounted so continue
diff --git a/fs/namespace.c b/fs/namespace.c
index d82910f33dc4..27bb12693cba 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5150,6 +5150,12 @@ static u64 mnt_to_propagation_flags(struct mount *m)
 	return propagation;
 }
 
+u64 vfsmount_to_propagation_flags(struct vfsmount *mnt)
+{
+	return mnt_to_propagation_flags(real_mount(mnt));
+}
+EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags);
+
 static void statmount_sb_basic(struct kstatmount *s)
 {
 	struct super_block *sb = s->mnt->mnt_sb;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..a5c2077ce6ed 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3269,6 +3269,7 @@ extern struct file * open_exec(const char *);
 /* fs/dcache.c -- generic fs support functions */
 extern bool is_subdir(struct dentry *, struct dentry *);
 extern bool path_is_under(const struct path *, const struct path *);
+u64 vfsmount_to_propagation_flags(struct vfsmount *mnt);
 
 extern char *file_path(struct file *, char *, int);
 
-- 
cgit v1.2.3


From 0a75f3d90e7ab9cd182327fca4b4e3bce379afe5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 11 Nov 2025 15:49:46 +0100
Subject: devres: Move devm_alloc_percpu() and related to devres.h

Move devm_alloc_percpu() and related to devres.h where it belongs.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251111145046.997309-3-andriy.shevchenko@linux.intel.com
[ Fix minor typo in commit message. - Danilo ]
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
---
 include/linux/device.h        | 18 ------------------
 include/linux/device/devres.h | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 0c6377f6631c..0be95294b6e6 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -281,24 +281,6 @@ int __must_check device_create_bin_file(struct device *dev,
 void device_remove_bin_file(struct device *dev,
 			    const struct bin_attribute *attr);
 
-/**
- * devm_alloc_percpu - Resource-managed alloc_percpu
- * @dev: Device to allocate per-cpu memory for
- * @type: Type to allocate per-cpu memory for
- *
- * Managed alloc_percpu. Per-cpu memory allocated with this function is
- * automatically freed on driver detach.
- *
- * RETURNS:
- * Pointer to allocated memory on success, NULL on failure.
- */
-#define devm_alloc_percpu(dev, type)      \
-	((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
-						      __alignof__(type)))
-
-void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
-				   size_t align);
-
 struct device_dma_parameters {
 	/*
 	 * a low level driver may set these to teach IOMMU code about
diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h
index 8c5f57e0d613..9c1e3d643d69 100644
--- a/include/linux/device/devres.h
+++ b/include/linux/device/devres.h
@@ -9,6 +9,7 @@
 #include <linux/stdarg.h>
 #include <linux/types.h>
 #include <asm/bug.h>
+#include <asm/percpu.h>
 
 struct device;
 struct device_node;
@@ -96,6 +97,22 @@ devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap);
 char * __printf(3, 4) __malloc
 devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
 
+/**
+ * devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @type: Type to allocate per-cpu memory for
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+#define devm_alloc_percpu(dev, type)      \
+	((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), __alignof__(type)))
+
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align);
+
 unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order);
 void devm_free_pages(struct device *dev, unsigned long addr);
 
-- 
cgit v1.2.3


From a0c83150eea5807dbedf786f55cd49b14af118a8 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Wed, 12 Nov 2025 09:10:10 +0530
Subject: platform/x86/intel: Introduce Intel Elkhart Lake PSE I/O
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel Elkhart Lake Programmable Service Engine (PSE) includes two PCI
devices, each of which exposes two different capabilities, GPIO and
Timed I/O, as a single PCI function through shared MMIO with the
layout below.

GPIO: 0x0000 - 0x1000
TIO:  0x1000 - 0x2000

This driver enumerates the PCI parent device and creates auxiliary child
devices for these capabilities. The actual functionalities are provided
by their respective auxiliary drivers.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20251112034040.457801-2-raag.jadav@intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 MAINTAINERS                             |  7 +++
 drivers/platform/x86/intel/Kconfig      | 13 +++++
 drivers/platform/x86/intel/Makefile     |  1 +
 drivers/platform/x86/intel/ehl_pse_io.c | 86 +++++++++++++++++++++++++++++++++
 include/linux/ehl_pse_io_aux.h          | 24 +++++++++
 5 files changed, 131 insertions(+)
 create mode 100644 drivers/platform/x86/intel/ehl_pse_io.c
 create mode 100644 include/linux/ehl_pse_io_aux.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3da2c26a796b..00e2cb65ddec 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12505,6 +12505,13 @@ F:	drivers/gpu/drm/xe/
 F:	include/drm/intel/
 F:	include/uapi/drm/xe_drm.h
 
+INTEL ELKHART LAKE PSE I/O DRIVER
+M:	Raag Jadav <raag.jadav@intel.com>
+L:	platform-driver-x86@vger.kernel.org
+S:	Supported
+F:	drivers/platform/x86/intel/ehl_pse_io.c
+F:	include/linux/ehl_pse_io_aux.h
+
 INTEL ETHERNET DRIVERS
 M:	Tony Nguyen <anthony.l.nguyen@intel.com>
 M:	Przemek Kitszel <przemyslaw.kitszel@intel.com>
diff --git a/drivers/platform/x86/intel/Kconfig b/drivers/platform/x86/intel/Kconfig
index 19a2246f2770..2900407d6095 100644
--- a/drivers/platform/x86/intel/Kconfig
+++ b/drivers/platform/x86/intel/Kconfig
@@ -41,6 +41,19 @@ config INTEL_VBTN
 	  To compile this driver as a module, choose M here: the module will
 	  be called intel_vbtn.
 
+config INTEL_EHL_PSE_IO
+	tristate "Intel Elkhart Lake PSE I/O driver"
+	depends on PCI
+	select AUXILIARY_BUS
+	help
+	  Select this option to enable Intel Elkhart Lake PSE GPIO and Timed
+	  I/O support. This driver enumerates the PCI parent device and
+	  creates auxiliary child devices for these capabilities. The actual
+	  functionalities are provided by their respective auxiliary drivers.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called intel_ehl_pse_io.
+
 config INTEL_INT0002_VGPIO
 	tristate "Intel ACPI INT0002 Virtual GPIO driver"
 	depends on GPIOLIB && ACPI && PM_SLEEP
diff --git a/drivers/platform/x86/intel/Makefile b/drivers/platform/x86/intel/Makefile
index 78acb414e154..138b13756158 100644
--- a/drivers/platform/x86/intel/Makefile
+++ b/drivers/platform/x86/intel/Makefile
@@ -21,6 +21,7 @@ intel-target-$(CONFIG_INTEL_HID_EVENT)		+= hid.o
 intel-target-$(CONFIG_INTEL_VBTN)		+= vbtn.o
 
 # Intel miscellaneous drivers
+intel-target-$(CONFIG_INTEL_EHL_PSE_IO)		+= ehl_pse_io.o
 intel-target-$(CONFIG_INTEL_INT0002_VGPIO)	+= int0002_vgpio.o
 intel-target-$(CONFIG_INTEL_ISHTP_ECLITE)	+= ishtp_eclite.o
 intel-target-$(CONFIG_INTEL_OAKTRAIL)		+= oaktrail.o
diff --git a/drivers/platform/x86/intel/ehl_pse_io.c b/drivers/platform/x86/intel/ehl_pse_io.c
new file mode 100644
index 000000000000..861e14808b35
--- /dev/null
+++ b/drivers/platform/x86/intel/ehl_pse_io.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Elkhart Lake Programmable Service Engine (PSE) I/O
+ *
+ * Copyright (c) 2025 Intel Corporation.
+ *
+ * Author: Raag Jadav <raag.jadav@intel.com>
+ */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/device/devres.h>
+#include <linux/errno.h>
+#include <linux/gfp_types.h>
+#include <linux/ioport.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include <linux/ehl_pse_io_aux.h>
+
+#define EHL_PSE_IO_DEV_SIZE	SZ_4K
+
+static int ehl_pse_io_dev_create(struct pci_dev *pci, const char *name, int idx)
+{
+	struct device *dev = &pci->dev;
+	struct auxiliary_device *adev;
+	struct ehl_pse_io_data *data;
+	resource_size_t start, offset;
+	u32 id;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	id = (pci_domain_nr(pci->bus) << 16) | pci_dev_id(pci);
+	start = pci_resource_start(pci, 0);
+	offset = EHL_PSE_IO_DEV_SIZE * idx;
+
+	data->mem = DEFINE_RES_MEM(start + offset, EHL_PSE_IO_DEV_SIZE);
+	data->irq = pci_irq_vector(pci, idx);
+
+	adev = __devm_auxiliary_device_create(dev, EHL_PSE_IO_NAME, name, data, id);
+
+	return adev ? 0 : -ENODEV;
+}
+
+static int ehl_pse_io_probe(struct pci_dev *pci, const struct pci_device_id *id)
+{
+	int ret;
+
+	ret = pcim_enable_device(pci);
+	if (ret)
+		return ret;
+
+	pci_set_master(pci);
+
+	ret = pci_alloc_irq_vectors(pci, 2, 2, PCI_IRQ_MSI);
+	if (ret < 0)
+		return ret;
+
+	ret = ehl_pse_io_dev_create(pci, EHL_PSE_GPIO_NAME, 0);
+	if (ret)
+		return ret;
+
+	return ehl_pse_io_dev_create(pci, EHL_PSE_TIO_NAME, 1);
+}
+
+static const struct pci_device_id ehl_pse_io_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x4b88) },
+	{ PCI_VDEVICE(INTEL, 0x4b89) },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, ehl_pse_io_ids);
+
+static struct pci_driver ehl_pse_io_driver = {
+	.name		= EHL_PSE_IO_NAME,
+	.id_table	= ehl_pse_io_ids,
+	.probe		= ehl_pse_io_probe,
+};
+module_pci_driver(ehl_pse_io_driver);
+
+MODULE_AUTHOR("Raag Jadav <raag.jadav@intel.com>");
+MODULE_DESCRIPTION("Intel Elkhart Lake PSE I/O driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/ehl_pse_io_aux.h b/include/linux/ehl_pse_io_aux.h
new file mode 100644
index 000000000000..afb8587ee5fb
--- /dev/null
+++ b/include/linux/ehl_pse_io_aux.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel Elkhart Lake PSE I/O Auxiliary Device
+ *
+ * Copyright (c) 2025 Intel Corporation.
+ *
+ * Author: Raag Jadav <raag.jadav@intel.com>
+ */
+
+#ifndef _EHL_PSE_IO_AUX_H_
+#define _EHL_PSE_IO_AUX_H_
+
+#include <linux/ioport.h>
+
+#define EHL_PSE_IO_NAME		"ehl_pse_io"
+#define EHL_PSE_GPIO_NAME	"gpio"
+#define EHL_PSE_TIO_NAME	"pps_tio"
+
+struct ehl_pse_io_data {
+	struct resource mem;
+	int irq;
+};
+
+#endif /* _EHL_PSE_IO_AUX_H_ */
-- 
cgit v1.2.3


From c200892b46ba3df3dd210b7117a463ec283600c3 Mon Sep 17 00:00:00 2001
From: Coiby Xu <coxu@redhat.com>
Date: Wed, 19 Nov 2025 22:03:25 +0800
Subject: ima: Access decompressed kernel module to verify appended signature

Currently, when in-kernel module decompression (CONFIG_MODULE_DECOMPRESS)
is enabled, IMA has no way to verify the appended module signature as it
can't decompress the module.

Define a new kernel_read_file_id enumerator, READING_MODULE_COMPRESSED,
so IMA can calculate the compressed kernel module data hash on
READING_MODULE_COMPRESSED and defer appraising/measuring it until
READING_MODULE, when the module has been decompressed.

Before enabling in-kernel module decompression, a kernel module in
initramfs can still be loaded with ima_policy=secure_boot. So adjust the
kernel module rule in secure_boot policy to allow either an IMA
signature OR an appended signature i.e. to use
"appraise func=MODULE_CHECK appraise_type=imasig|modsig".

Reported-by: Karel Srot <ksrot@redhat.com>
Suggested-by: Mimi Zohar <zohar@linux.ibm.com>
Suggested-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Coiby Xu <coxu@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/kernel_read_file.h    |  1 +
 kernel/module/main.c                | 17 ++++++++++++++---
 security/integrity/ima/ima_main.c   | 24 ++++++++++++++++--------
 security/integrity/ima/ima_policy.c |  3 ++-
 security/ipe/hooks.c                |  1 +
 security/selinux/hooks.c            |  5 +++--
 6 files changed, 37 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 90451e2e12bd..d613a7b4dd35 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -14,6 +14,7 @@
 	id(KEXEC_INITRAMFS, kexec-initramfs)	\
 	id(POLICY, security-policy)		\
 	id(X509_CERTIFICATE, x509-certificate)	\
+	id(MODULE_COMPRESSED, kernel-module-compressed) \
 	id(MAX_ID, )
 
 #define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c66b26184936..7b3ec2fa6e7c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3675,24 +3675,35 @@ static int idempotent_wait_for_completion(struct idempotent *u)
 
 static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
 {
+	bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE);
 	struct load_info info = { };
 	void *buf = NULL;
 	int len;
+	int err;
 
-	len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
+	len = kernel_read_file(f, 0, &buf, INT_MAX, NULL,
+			       compressed ? READING_MODULE_COMPRESSED :
+					    READING_MODULE);
 	if (len < 0) {
 		mod_stat_inc(&failed_kreads);
 		return len;
 	}
 
-	if (flags & MODULE_INIT_COMPRESSED_FILE) {
-		int err = module_decompress(&info, buf, len);
+	if (compressed) {
+		err = module_decompress(&info, buf, len);
 		vfree(buf); /* compressed data is no longer needed */
 		if (err) {
 			mod_stat_inc(&failed_decompress);
 			mod_stat_add_long(len, &invalid_decompress_bytes);
 			return err;
 		}
+		err = security_kernel_post_read_file(f, (char *)info.hdr, info.len,
+						     READING_MODULE);
+		if (err) {
+			mod_stat_inc(&failed_kreads);
+			free_copy(&info, flags);
+			return err;
+		}
 	} else {
 		info.hdr = buf;
 		info.len = len;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index ebaebccfbe9a..edd0fd3d77a0 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -235,7 +235,8 @@ static void ima_file_free(struct file *file)
 
 static int process_measurement(struct file *file, const struct cred *cred,
 			       struct lsm_prop *prop, char *buf, loff_t size,
-			       int mask, enum ima_hooks func)
+			       int mask, enum ima_hooks func,
+			       enum kernel_read_file_id read_id)
 {
 	struct inode *real_inode, *inode = file_inode(file);
 	struct ima_iint_cache *iint = NULL;
@@ -406,6 +407,12 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	if (rc != 0 && rc != -EBADF && rc != -EINVAL)
 		goto out_locked;
 
+	/* Defer measuring/appraising kernel modules to READING_MODULE */
+	if (read_id == READING_MODULE_COMPRESSED) {
+		must_appraise = 0;
+		goto out_locked;
+	}
+
 	if (!pathbuf)	/* ima_rdwr_violation possibly pre-fetched */
 		pathname = ima_d_path(&file->f_path, &pathbuf, filename);
 
@@ -486,14 +493,14 @@ static int ima_file_mmap(struct file *file, unsigned long reqprot,
 
 	if (reqprot & PROT_EXEC) {
 		ret = process_measurement(file, current_cred(), &prop, NULL,
-					  0, MAY_EXEC, MMAP_CHECK_REQPROT);
+					  0, MAY_EXEC, MMAP_CHECK_REQPROT, 0);
 		if (ret)
 			return ret;
 	}
 
 	if (prot & PROT_EXEC)
 		return process_measurement(file, current_cred(), &prop, NULL,
-					   0, MAY_EXEC, MMAP_CHECK);
+					   0, MAY_EXEC, MMAP_CHECK, 0);
 
 	return 0;
 }
@@ -577,7 +584,7 @@ static int ima_bprm_check(struct linux_binprm *bprm)
 
 	security_current_getlsmprop_subj(&prop);
 	return process_measurement(bprm->file, current_cred(),
-				   &prop, NULL, 0, MAY_EXEC, BPRM_CHECK);
+				   &prop, NULL, 0, MAY_EXEC, BPRM_CHECK, 0);
 }
 
 /**
@@ -607,7 +614,7 @@ static int ima_creds_check(struct linux_binprm *bprm, const struct file *file)
 
 	security_current_getlsmprop_subj(&prop);
 	return process_measurement((struct file *)file, bprm->cred, &prop, NULL,
-				   0, MAY_EXEC, CREDS_CHECK);
+				   0, MAY_EXEC, CREDS_CHECK, 0);
 }
 
 /**
@@ -655,7 +662,7 @@ static int ima_file_check(struct file *file, int mask)
 	security_current_getlsmprop_subj(&prop);
 	return process_measurement(file, current_cred(), &prop, NULL, 0,
 				   mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
-					   MAY_APPEND), FILE_CHECK);
+					   MAY_APPEND), FILE_CHECK, 0);
 }
 
 static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf,
@@ -874,12 +881,13 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
 	func = read_idmap[read_id] ?: FILE_CHECK;
 	security_current_getlsmprop_subj(&prop);
 	return process_measurement(file, current_cred(), &prop, NULL, 0,
-				   MAY_READ, func);
+				   MAY_READ, func, 0);
 }
 
 const int read_idmap[READING_MAX_ID] = {
 	[READING_FIRMWARE] = FIRMWARE_CHECK,
 	[READING_MODULE] = MODULE_CHECK,
+	[READING_MODULE_COMPRESSED] = MODULE_CHECK,
 	[READING_KEXEC_IMAGE] = KEXEC_KERNEL_CHECK,
 	[READING_KEXEC_INITRAMFS] = KEXEC_INITRAMFS_CHECK,
 	[READING_POLICY] = POLICY_CHECK
@@ -917,7 +925,7 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size,
 	func = read_idmap[read_id] ?: FILE_CHECK;
 	security_current_getlsmprop_subj(&prop);
 	return process_measurement(file, current_cred(), &prop, buf, size,
-				   MAY_READ, func);
+				   MAY_READ, func, read_id);
 }
 
 /**
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 164d62832f8e..7468afaab686 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -244,7 +244,8 @@ static struct ima_rule_entry build_appraise_rules[] __ro_after_init = {
 
 static struct ima_rule_entry secure_boot_rules[] __ro_after_init = {
 	{.action = APPRAISE, .func = MODULE_CHECK,
-	 .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
+	 .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED |
+		  IMA_CHECK_BLACKLIST},
 	{.action = APPRAISE, .func = FIRMWARE_CHECK,
 	 .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
 	{.action = APPRAISE, .func = KEXEC_KERNEL_CHECK,
diff --git a/security/ipe/hooks.c b/security/ipe/hooks.c
index d0323b81cd8f..1053a4acf589 100644
--- a/security/ipe/hooks.c
+++ b/security/ipe/hooks.c
@@ -118,6 +118,7 @@ int ipe_kernel_read_file(struct file *file, enum kernel_read_file_id id,
 		op = IPE_OP_FIRMWARE;
 		break;
 	case READING_MODULE:
+	case READING_MODULE_COMPRESSED:
 		op = IPE_OP_KERNEL_MODULE;
 		break;
 	case READING_KEXEC_INITRAMFS:
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dfc22da42f30..c1ff69d5d76e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4275,7 +4275,7 @@ static int selinux_kernel_read_file(struct file *file,
 {
 	int rc = 0;
 
-	BUILD_BUG_ON_MSG(READING_MAX_ID > 7,
+	BUILD_BUG_ON_MSG(READING_MAX_ID > 8,
 			 "New kernel_read_file_id introduced; update SELinux!");
 
 	switch (id) {
@@ -4283,6 +4283,7 @@ static int selinux_kernel_read_file(struct file *file,
 		rc = selinux_kernel_load_from_file(file, SYSTEM__FIRMWARE_LOAD);
 		break;
 	case READING_MODULE:
+	case READING_MODULE_COMPRESSED:
 		rc = selinux_kernel_load_from_file(file, SYSTEM__MODULE_LOAD);
 		break;
 	case READING_KEXEC_IMAGE:
@@ -4311,7 +4312,7 @@ static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents)
 {
 	int rc = 0;
 
-	BUILD_BUG_ON_MSG(LOADING_MAX_ID > 7,
+	BUILD_BUG_ON_MSG(LOADING_MAX_ID > 8,
 			 "New kernel_load_data_id introduced; update SELinux!");
 
 	switch (id) {
-- 
cgit v1.2.3


From 79301c7d605a10efea35af08167e0a362d8dffb1 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@linux.alibaba.com>
Date: Fri, 14 Nov 2025 16:54:02 +0800
Subject: mm: add spurious fault fixing support for huge pmd

The page faults may be spurious because of the racy access to the page
table.  For example, a non-populated virtual page is accessed on 2
CPUs simultaneously, thus the page faults are triggered on both CPUs.
However, it's possible that one CPU (say CPU A) cannot find the reason
for the page fault if the other CPU (say CPU B) has changed the page
table before the PTE is checked on CPU A.  Most of the time, the
spurious page faults can be ignored safely.  However, if the page
fault is for the write access, it's possible that a stale read-only
TLB entry exists in the local CPU and needs to be flushed on some
architectures.  This is called the spurious page fault fixing.

In the current kernel, there is spurious fault fixing support for pte,
but not for huge pmd because no architectures need it. But in the
next patch in the series, we will change the write protection fault
handling logic on arm64, so that some stale huge pmd entries may
remain in the TLB. These entries need to be flushed via the huge pmd
spurious fault fixing mechanism.

Signed-off-by: Huang Ying <ying.huang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Will Deacon <will@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <yang@os.amperecomputing.com>
Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/huge_mm.h |  2 +-
 include/linux/pgtable.h |  4 ++++
 mm/huge_memory.c        | 33 ++++++++++++++++----------
 mm/internal.h           |  2 +-
 mm/memory.c             | 62 ++++++++++++++++++++++++++++++++++++-------------
 5 files changed, 73 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc985..887a632ce7a0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf);
+bool huge_pmd_set_accessed(struct vm_fault *vmf);
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 32e8457ad535..ee3148ef87f6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
 #endif
 
+#ifndef flush_tlb_fix_spurious_fault_pmd
+#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
+#endif
+
 /*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1b81680b4225..6a8679907eaa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1641,17 +1641,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+/**
+ * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
+ * @vma: The VMA covering @addr
+ * @addr: The virtual address
+ * @pmd: pmd pointer into the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write)
 {
-	pmd_t _pmd;
+	pmd_t entry;
 
-	_pmd = pmd_mkyoung(*pmd);
+	entry = pmd_mkyoung(*pmd);
 	if (write)
-		_pmd = pmd_mkdirty(_pmd);
+		entry = pmd_mkdirty(entry);
 	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-				  pmd, _pmd, write))
+				  pmd, entry, write)) {
 		update_mmu_cache_pmd(vma, addr, pmd);
+		return true;
+	}
+
+	return false;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1841,18 +1854,14 @@ unlock:
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void huge_pmd_set_accessed(struct vm_fault *vmf)
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
 {
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
-	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
-		goto unlock;
-
-	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
+		return false;
 
-unlock:
-	spin_unlock(vmf->ptl);
+	return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
 }
 
 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..27ad37a41868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
  */
 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 	       pud_t *pud, bool write);
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write);
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..6e5a08c4fd2e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6115,6 +6115,45 @@ split:
 	return VM_FAULT_FALLBACK;
 }
 
+/*
+ * The page faults may be spurious because of the racy access to the
+ * page table.  For example, a non-populated virtual page is accessed
+ * on 2 CPUs simultaneously, thus the page faults are triggered on
+ * both CPUs.  However, it's possible that one CPU (say CPU A) cannot
+ * find the reason for the page fault if the other CPU (say CPU B) has
+ * changed the page table before the PTE is checked on CPU A.  Most of
+ * the time, the spurious page faults can be ignored safely.  However,
+ * if the page fault is for the write access, it's possible that a
+ * stale read-only TLB entry exists in the local CPU and needs to be
+ * flushed on some architectures.  This is called the spurious page
+ * fault fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default and used as such on most architectures, while
+ * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
+ * used as such on most architectures.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+			       enum pgtable_level ptlevel)
+{
+	/* Skip spurious TLB flush for retried page fault */
+	if (vmf->flags & FAULT_FLAG_TRIED)
+		return;
+	/*
+	 * This is needed only for protection faults but the arch code
+	 * is not yet telling us if this is a protection fault or not.
+	 * This still avoids useless tlb flushes for .text page faults
+	 * with threads.
+	 */
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		if (ptlevel == PGTABLE_LEVEL_PTE)
+			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+						     vmf->pte);
+		else
+			flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+							 vmf->pmd);
+	}
+}
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -6196,23 +6235,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 	entry = pte_mkyoung(entry);
 	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
-				vmf->flags & FAULT_FLAG_WRITE)) {
+				vmf->flags & FAULT_FLAG_WRITE))
 		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
 				vmf->pte, 1);
-	} else {
-		/* Skip spurious TLB flush for retried page fault */
-		if (vmf->flags & FAULT_FLAG_TRIED)
-			goto unlock;
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (vmf->flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
-						     vmf->pte);
-	}
+	else
+		fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return 0;
@@ -6309,7 +6336,10 @@ retry_pud:
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
 			} else {
-				huge_pmd_set_accessed(&vmf);
+				vmf.ptl = pmd_lock(mm, vmf.pmd);
+				if (!huge_pmd_set_accessed(&vmf))
+					fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+				spin_unlock(vmf.ptl);
 				return 0;
 			}
 		}
-- 
cgit v1.2.3


From 4acbfb6c116be5989d5a0e38a48deca2d5b8bb92 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 22 Sep 2025 10:21:06 +0800
Subject: PM: wakeup: Add out-of-band system wakeup support for devices

Some devices can wake up the system from suspend even when their power
domains are turned off. This is possible because their system-wakeup logic
resides in an always-on power domain - indicating that they support
out-of-band system wakeup.

Currently, PM domain core doesn't power off such devices if they are marked
as system wakeup sources. To better represent devices with out-of-band
wakeup capability, this patch introduces a new flag out_band_wakeup in
'struct dev_pm_info'.

Two helper APIs are added:
 - device_set_out_band_wakeup() - to mark a device as having out-of-band
   wakeup capability.
 - device_out_band_wakeup() - to query the flag.

This allows the PM core and drivers to distinguish between regular and
out-of-band wakeup sources, enabling more accurate power management
decisions.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/main.c |  1 +
 include/linux/pm.h        |  1 +
 include/linux/pm_wakeup.h | 17 +++++++++++++++++
 3 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e83503bdc1fd..bcfb170baca6 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -2126,6 +2126,7 @@ static int device_prepare(struct device *dev, pm_message_t state)
 	device_lock(dev);
 
 	dev->power.wakeup_path = false;
+	dev->power.out_band_wakeup = false;
 
 	if (dev->power.no_pm_callbacks)
 		goto unlock;
diff --git a/include/linux/pm.h b/include/linux/pm.h
index cc7b2dc28574..5b28a4f2e87e 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -684,6 +684,7 @@ struct dev_pm_info {
 	bool			smart_suspend:1;	/* Owned by the PM core */
 	bool			must_resume:1;		/* Owned by the PM core */
 	bool			may_skip_resume:1;	/* Set by subsystems */
+	bool			out_band_wakeup:1;
 	bool			strict_midlayer:1;
 #else
 	bool			should_wakeup:1;
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index c838b4a30f87..41e8f344a205 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -94,6 +94,16 @@ static inline void device_set_wakeup_path(struct device *dev)
 	dev->power.wakeup_path = true;
 }
 
+static inline void device_set_out_band_wakeup(struct device *dev)
+{
+	dev->power.out_band_wakeup = true;
+}
+
+static inline bool device_out_band_wakeup(struct device *dev)
+{
+	return dev->power.out_band_wakeup;
+}
+
 /* drivers/base/power/wakeup.c */
 extern struct wakeup_source *wakeup_source_register(struct device *dev,
 						    const char *name);
@@ -162,6 +172,13 @@ static inline bool device_wakeup_path(struct device *dev)
 
 static inline void device_set_wakeup_path(struct device *dev) {}
 
+static inline void device_set_out_band_wakeup(struct device *dev) {}
+
+static inline bool device_out_band_wakeup(struct device *dev)
+{
+	return false;
+}
+
 static inline void __pm_stay_awake(struct wakeup_source *ws) {}
 
 static inline void pm_stay_awake(struct device *dev) {}
-- 
cgit v1.2.3


From 854825367a1d28b3b6c757134460d0fe29a0b4a6 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 22 Sep 2025 10:21:09 +0800
Subject: usb: chipidea: ci_hdrc_imx: Set out of band wakeup for i.MX95

i.MX95 USB2 inside HSIOMIX can still wake up Linux even if the HSIOMIX
power domain (digital logic) is off, because always-on logic retains the
wakeup capability. This is an out-of-band wakeup capability.

So use device_set_out_band_wakeup() for i.MX95 to make sure USB2 can
wake up the system even if the HSIOMIX power domain is off.

Tested-by: Xu Yang <xu.yang_2@nxp.com>
Reviewed-by: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/usb/chipidea/ci_hdrc_imx.c | 11 ++++++++++-
 include/linux/usb/chipidea.h       |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c
index d7c2a1a3c271..d4ee9e16332f 100644
--- a/drivers/usb/chipidea/ci_hdrc_imx.c
+++ b/drivers/usb/chipidea/ci_hdrc_imx.c
@@ -79,6 +79,10 @@ static const struct ci_hdrc_imx_platform_flag imx8ulp_usb_data = {
 		CI_HDRC_HAS_PORTSC_PEC_MISSED,
 };
 
+static const struct ci_hdrc_imx_platform_flag imx95_usb_data = {
+	.flags = CI_HDRC_SUPPORTS_RUNTIME_PM | CI_HDRC_OUT_BAND_WAKEUP,
+};
+
 static const struct ci_hdrc_imx_platform_flag s32g_usb_data = {
 	.flags = CI_HDRC_DISABLE_HOST_STREAMING,
 };
@@ -94,6 +98,7 @@ static const struct of_device_id ci_hdrc_imx_dt_ids[] = {
 	{ .compatible = "fsl,imx7d-usb", .data = &imx7d_usb_data},
 	{ .compatible = "fsl,imx7ulp-usb", .data = &imx7ulp_usb_data},
 	{ .compatible = "fsl,imx8ulp-usb", .data = &imx8ulp_usb_data},
+	{ .compatible = "fsl,imx95-usb", .data = &imx95_usb_data},
 	{ .compatible = "nxp,s32g2-usb", .data = &s32g_usb_data},
 	{ /* sentinel */ }
 };
@@ -704,9 +709,13 @@ static int ci_hdrc_imx_suspend(struct device *dev)
 
 	pinctrl_pm_select_sleep_state(dev);
 
-	if (data->wakeup_irq > 0 && device_may_wakeup(dev))
+	if (data->wakeup_irq > 0 && device_may_wakeup(dev)) {
 		enable_irq_wake(data->wakeup_irq);
 
+		if (data->plat_data->flags & CI_HDRC_OUT_BAND_WAKEUP)
+			device_set_out_band_wakeup(dev);
+	}
+
 	return ret;
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index e17ebeee24e3..c6451191d2de 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -66,6 +66,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_HAS_PORTSC_PEC_MISSED	BIT(17)
 #define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS	BIT(18)
 #define	CI_HDRC_HAS_SHORT_PKT_LIMIT	BIT(19)
+#define	CI_HDRC_OUT_BAND_WAKEUP		BIT(20)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From ccde6525183c5489de293cf91a441585fff3c847 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 5 Nov 2025 10:54:07 +0100
Subject: smp: Introduce a helper function to check for pending IPIs

When governors used during cpuidle try to find the optimal idle state
for a CPU or a group of CPUs, they are known to fail quite often. One
reason for this is that they do not take into account whether an IPI has
been scheduled for any of the CPUs that are affected by the selected
idle state.

To enable pending IPIs to be taken into account for cpuidle decisions,
introduce a new helper function, cpus_peek_for_pending_ipi().

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/smp.h |  5 +++++
 kernel/smp.c        | 22 ++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 18e9c918325e..91d0ecf3b8d3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -168,6 +168,7 @@ int smp_call_function_any(const struct cpumask *mask,
 
 void kick_all_cpus_sync(void);
 void wake_up_all_idle_cpus(void);
+bool cpus_peek_for_pending_ipi(const struct cpumask *mask);
 
 /*
  * Generic and arch helpers
@@ -216,6 +217,10 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
 
 static inline void kick_all_cpus_sync(void) {  }
 static inline void wake_up_all_idle_cpus(void) {  }
+static inline bool cpus_peek_for_pending_ipi(const struct cpumask *mask)
+{
+	return false;
+}
 
 #define setup_max_cpus 0
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 02f52291fae4..f349960f79ca 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1087,6 +1087,28 @@ void wake_up_all_idle_cpus(void)
 }
 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
 
+/**
+ * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs
+ * @mask: The CPU mask for the CPUs to check.
+ *
+ * This function walks through the @mask to check if there are any pending IPIs
+ * scheduled, for any of the CPUs in the @mask. It does not guarantee
+ * correctness as it only provides a racy snapshot.
+ *
+ * Returns true if there is a pending IPI scheduled and false otherwise.
+ */
+bool cpus_peek_for_pending_ipi(const struct cpumask *mask)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask) {
+		if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu)))
+			return true;
+	}
+
+	return false;
+}
+
 /**
  * struct smp_call_on_cpu_struct - Call a function on a specific CPU
  * @work: &work_struct
-- 
cgit v1.2.3


From 796e29b857aed89f83f70f2c199585c45db5dc0f Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:31 +0000
Subject: ACPI / PPTT: Add a helper to fill a cpumask from a processor
 container

The ACPI MPAM table uses the UID of a processor container specified in
the PPTT to indicate the subset of CPUs and cache topology that can
access each MPAM System Component (MSC).

This information is not directly useful to the kernel. The equivalent
cpumask is needed instead.

Add a helper to find the processor container by its id, then walk
the possible CPUs to fill a cpumask with the CPUs that have this
processor container as a parent.

CC: Dave Martin <dave.martin@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/pptt.c  | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h |  3 ++
 2 files changed, 87 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index 54676e3d82dd..b8248c0092fe 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -817,3 +817,87 @@ int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
 	return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE,
 					  ACPI_PPTT_ACPI_IDENTICAL);
 }
+
+/**
+ * acpi_pptt_get_child_cpus() - Find all the CPUs below a PPTT
+ * processor hierarchy node
+ *
+ * @table_hdr:		A reference to the PPTT table
+ * @parent_node:	A pointer to the processor hierarchy node in the
+ *			table_hdr
+ * @cpus:		A cpumask to fill with the CPUs below @parent_node
+ *
+ * Walks up the PPTT from every possible CPU to find if the provided
+ * @parent_node is a parent of this CPU.
+ */
+static void acpi_pptt_get_child_cpus(struct acpi_table_header *table_hdr,
+				     struct acpi_pptt_processor *parent_node,
+				     cpumask_t *cpus)
+{
+	struct acpi_pptt_processor *cpu_node;
+	u32 acpi_id;
+	int cpu;
+
+	cpumask_clear(cpus);
+
+	for_each_possible_cpu(cpu) {
+		acpi_id = get_acpi_id_for_cpu(cpu);
+		cpu_node = acpi_find_processor_node(table_hdr, acpi_id);
+
+		while (cpu_node) {
+			if (cpu_node == parent_node) {
+				cpumask_set_cpu(cpu, cpus);
+				break;
+			}
+			cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent);
+		}
+	}
+}
+
+/**
+ * acpi_pptt_get_cpus_from_container() - Populate a cpumask with all CPUs in a
+ *                                       processor container
+ * @acpi_cpu_id:	The UID of the processor container
+ * @cpus:		The resulting CPU mask
+ *
+ * Find the specified Processor Container, and fill @cpus with all the cpus
+ * below it.
+ *
+ * Not all 'Processor Hierarchy' entries in the PPTT are either a CPU
+ * or a Processor Container, they may exist purely to describe a
+ * Private resource. CPUs have to be leaves, so a Processor Container
+ * is a non-leaf that has the 'ACPI Processor ID valid' flag set.
+ */
+void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus)
+{
+	struct acpi_table_header *table_hdr;
+	struct acpi_subtable_header *entry;
+	unsigned long table_end;
+	u32 proc_sz;
+
+	cpumask_clear(cpus);
+
+	table_hdr = acpi_get_pptt();
+	if (!table_hdr)
+		return;
+
+	table_end = (unsigned long)table_hdr + table_hdr->length;
+	entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr,
+			     sizeof(struct acpi_table_pptt));
+	proc_sz = sizeof(struct acpi_pptt_processor);
+	while ((unsigned long)entry + proc_sz <= table_end) {
+		if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) {
+			struct acpi_pptt_processor *cpu_node;
+
+			cpu_node = (struct acpi_pptt_processor *)entry;
+			if (cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID &&
+			    !acpi_pptt_leaf_node(table_hdr, cpu_node) &&
+			    cpu_node->acpi_processor_id == acpi_cpu_id) {
+				acpi_pptt_get_child_cpus(table_hdr, cpu_node, cpus);
+				break;
+			}
+		}
+		entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry,
+				     entry->length);
+	}
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..4752ebd48132 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1541,6 +1541,7 @@ int find_acpi_cpu_topology(unsigned int cpu, int level);
 int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
+void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus);
 #else
 static inline int acpi_pptt_cpu_is_thread(unsigned int cpu)
 {
@@ -1562,6 +1563,8 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
 {
 	return -EINVAL;
 }
+static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id,
+						     cpumask_t *cpus) { }
 #endif
 
 void acpi_arch_init(void);
-- 
cgit v1.2.3


From 41a7bb39fede8ecc053c261b86cdfadea45b7b10 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:34 +0000
Subject: ACPI / PPTT: Find cache level by cache-id

The MPAM table identifies caches by id. The MPAM driver also wants to know
the cache level to determine if the platform is of the shape that can be
managed via resctrl. Cacheinfo has this information, but only for CPUs that
are online.

Waiting for all CPUs to come online is a problem for platforms where
CPUs are brought online late by user-space.

Add a helper that walks every possible cache, until it finds the one
identified by cache-id, then return the level.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/pptt.c  | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h |  5 ++++
 2 files changed, 71 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index ef39b176dc00..da49b56a1ef2 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -932,3 +932,69 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus)
 				     entry->length);
 	}
 }
+
+/**
+ * find_acpi_cache_level_from_id() - Get the level of the specified cache
+ * @cache_id: The id field of the cache
+ *
+ * Determine the level relative to any CPU for the cache identified by
+ * cache_id. This allows the property to be found even if the CPUs are offline.
+ *
+ * The returned level can be used to group caches that are peers.
+ *
+ * The PPTT table must be rev 3 or later.
+ *
+ * If one CPU's L2 is shared with another CPU as L3, this function will return
+ * an unpredictable value.
+ *
+ * Return: -ENOENT if the PPTT doesn't exist, the revision isn't supported or
+ * the cache cannot be found.
+ * Otherwise returns a value which represents the level of the specified cache.
+ */
+int find_acpi_cache_level_from_id(u32 cache_id)
+{
+	int cpu;
+	struct acpi_table_header *table;
+
+	table = acpi_get_pptt();
+	if (!table)
+		return -ENOENT;
+
+	if (table->revision < 3)
+		return -ENOENT;
+
+	for_each_possible_cpu(cpu) {
+		bool empty;
+		int level = 1;
+		u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu);
+		struct acpi_pptt_cache *cache;
+		struct acpi_pptt_processor *cpu_node;
+
+		cpu_node = acpi_find_processor_node(table, acpi_cpu_id);
+		if (!cpu_node)
+			continue;
+
+		do {
+			int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED};
+
+			empty = true;
+			for (int i = 0; i < ARRAY_SIZE(cache_type); i++) {
+				struct acpi_pptt_cache_v1_full *cache_v1;
+
+				cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i],
+							     level, &cpu_node);
+				if (!cache)
+					continue;
+
+				empty = false;
+
+				cache_v1 = upgrade_pptt_cache(cache);
+				if (cache_v1 && cache_v1->cache_id == cache_id)
+					return level;
+			}
+			level++;
+		} while (!empty);
+	}
+
+	return -ENOENT;
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4752ebd48132..be074bdfd4d1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1542,6 +1542,7 @@ int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus);
+int find_acpi_cache_level_from_id(u32 cache_id);
 #else
 static inline int acpi_pptt_cpu_is_thread(unsigned int cpu)
 {
@@ -1565,6 +1566,10 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
 }
 static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id,
 						     cpumask_t *cpus) { }
+static inline int find_acpi_cache_level_from_id(u32 cache_id)
+{
+	return -ENOENT;
+}
 #endif
 
 void acpi_arch_init(void);
-- 
cgit v1.2.3


From a39a723a6f1ed9a1602ccf8dd56392402afa7339 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:35 +0000
Subject: ACPI / PPTT: Add a helper to fill a cpumask from a cache_id

MPAM identifies CPUs by the cache_id in the PPTT cache structure.

The driver needs to know which CPUs are associated with the cache.
The CPUs may not all be online, so cacheinfo does not have the
information.

Add a helper to pull this information out of the PPTT.

CC: Rohit Mathew <Rohit.Mathew@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/pptt.c  | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h |  6 +++++
 2 files changed, 71 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index da49b56a1ef2..de5f8c018333 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -998,3 +998,68 @@ int find_acpi_cache_level_from_id(u32 cache_id)
 
 	return -ENOENT;
 }
+
+/**
+ * acpi_pptt_get_cpumask_from_cache_id() - Get the cpus associated with the
+ *					   specified cache
+ * @cache_id: The id field of the cache
+ * @cpus: Where to build the cpumask
+ *
+ * Determine which CPUs are below this cache in the PPTT. This allows the property
+ * to be found even if the CPUs are offline.
+ *
+ * The PPTT table must be rev 3 or later. @cpus is always cleared first.
+ *
+ * Return: -ENOENT if the PPTT doesn't exist or is older than rev 3. Otherwise
+ * returns 0 with the matching cpus set; @cpus is left empty if none matched.
+ */
+int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus)
+{
+	int cpu;
+	struct acpi_table_header *table;
+
+	cpumask_clear(cpus);
+
+	table = acpi_get_pptt();
+	if (!table)
+		return -ENOENT;
+
+	if (table->revision < 3)
+		return -ENOENT;
+
+	for_each_possible_cpu(cpu) {
+		bool empty;
+		int level = 1;
+		u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu);
+		struct acpi_pptt_cache *cache;
+		struct acpi_pptt_processor *cpu_node;
+
+		cpu_node = acpi_find_processor_node(table, acpi_cpu_id);
+		if (!cpu_node)
+			continue;
+
+		do {
+			int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED};
+
+			empty = true;
+			for (int i = 0; i < ARRAY_SIZE(cache_type); i++) {
+				struct acpi_pptt_cache_v1_full *cache_v1;
+
+				cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i],
+							     level, &cpu_node);
+
+				if (!cache)
+					continue;
+
+				empty = false;
+
+				cache_v1 = upgrade_pptt_cache(cache);
+				if (cache_v1 && cache_v1->cache_id == cache_id)
+					cpumask_set_cpu(cpu, cpus);
+			}
+			level++;
+		} while (!empty);	/* stop after the first level with no cache */
+	}
+
+	return 0;	/* also when nothing matched: @cpus is then empty */
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index be074bdfd4d1..a9dbacabdf89 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1543,6 +1543,7 @@ int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus);
 int find_acpi_cache_level_from_id(u32 cache_id);
+int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus);
 #else
 static inline int acpi_pptt_cpu_is_thread(unsigned int cpu)
 {
@@ -1570,6 +1571,11 @@ static inline int find_acpi_cache_level_from_id(u32 cache_id)
 {
 	return -ENOENT;
 }
+static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id,
+						      cpumask_t *cpus)
+{
+	return -ENOENT;
+}
 #endif
 
 void acpi_arch_init(void);
-- 
cgit v1.2.3


From f5915600cc4ca0338a37d5a8a4032e25d939156b Mon Sep 17 00:00:00 2001
From: Ben Horgan <ben.horgan@arm.com>
Date: Wed, 19 Nov 2025 12:22:37 +0000
Subject: platform: Define platform_device_put cleanup handler

Define a cleanup helper for use with __free to destroy platform devices
automatically when the pointer goes out of scope. This is only intended to
be used in error cases and so should be used with return_ptr() or
no_free_ptr() directly to avoid the automatic destruction on success.

A first use of this is introduced in a subsequent commit.

Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/platform_device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 074754c23d33..23a30ada2d4c 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -232,6 +232,7 @@ extern int platform_device_add_data(struct platform_device *pdev,
 extern int platform_device_add(struct platform_device *pdev);
 extern void platform_device_del(struct platform_device *pdev);
 extern void platform_device_put(struct platform_device *pdev);
+DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T))
 
 struct platform_driver {
 	int (*probe)(struct platform_device *);
-- 
cgit v1.2.3


From 96f4a4d53e6660d9b62e8d739388267fbb660e9f Mon Sep 17 00:00:00 2001
From: Ben Horgan <ben.horgan@arm.com>
Date: Wed, 19 Nov 2025 12:22:38 +0000
Subject: ACPI: Define acpi_put_table cleanup handler and
 acpi_get_table_pointer() helper

Define a cleanup helper for use with __free to release the acpi table when
the pointer goes out of scope. Also, introduce the helper
acpi_get_table_pointer() to simplify a commonly used pattern involving
acpi_get_table().

These are first used in a subsequent commit.

Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/acpi.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a9dbacabdf89..ac8797f95236 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -8,6 +8,7 @@
 #ifndef _LINUX_ACPI_H
 #define _LINUX_ACPI_H
 
+#include <linux/cleanup.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>	/* for struct resource */
 #include <linux/resource_ext.h>
@@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void);
 void acpi_table_init_complete (void);
 int acpi_table_init (void);
 
+/* Fetch an ACPI table as a pointer; any lookup failure becomes ERR_PTR(-ENOENT) */
+static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance)
+{
+	struct acpi_table_header *table;
+
+	if (ACPI_FAILURE(acpi_get_table(signature, instance, &table)))
+		return ERR_PTR(-ENOENT);
+	return table;
+}
+DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T))
+
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
 int __init_or_acpilib acpi_table_parse_entries(char *id,
 		unsigned long table_size, int entry_id,
-- 
cgit v1.2.3


From 115c5325beae7199219ab7c12ec2a2af8dea6c3c Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:39 +0000
Subject: ACPI / MPAM: Parse the MPAM table

Add code to parse the arm64 specific MPAM table, looking up the cache
level from the PPTT and feeding the end result into the MPAM driver.

This happens in two stages. Platform devices are created first for the
MSC devices. Once the driver probes it calls acpi_mpam_parse_resources()
to discover the RIS entries the MSC contains.

For now the MPAM hook mpam_ris_create() is stubbed out, but will update
the MPAM driver with optional discovered data about the RIS entries.

CC: Carl Worth <carl@os.amperecomputing.com>
Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en
Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig          |   1 +
 drivers/acpi/arm64/Kconfig  |   3 +
 drivers/acpi/arm64/Makefile |   1 +
 drivers/acpi/arm64/mpam.c   | 411 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/tables.c       |   2 +-
 include/linux/arm_mpam.h    |  47 +++++
 6 files changed, 464 insertions(+), 1 deletion(-)
 create mode 100644 drivers/acpi/arm64/mpam.c
 create mode 100644 include/linux/arm_mpam.h

(limited to 'include')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 67015d51f7b5..c5e66d5d72cd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2025,6 +2025,7 @@ config ARM64_TLB_RANGE
 
 config ARM64_MPAM
 	bool "Enable support for MPAM"
+	select ACPI_MPAM if ACPI
 	help
 	  Memory System Resource Partitioning and Monitoring (MPAM) is an
 	  optional extension to the Arm architecture that allows each
diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig
index b3ed6212244c..f2fd79f22e7d 100644
--- a/drivers/acpi/arm64/Kconfig
+++ b/drivers/acpi/arm64/Kconfig
@@ -21,3 +21,6 @@ config ACPI_AGDI
 
 config ACPI_APMT
 	bool
+
+config ACPI_MPAM
+	bool
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 05ecde9eaabe..9390b57cb564 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) 	+= apmt.o
 obj-$(CONFIG_ACPI_FFH)		+= ffh.o
 obj-$(CONFIG_ACPI_GTDT) 	+= gtdt.o
 obj-$(CONFIG_ACPI_IORT) 	+= iort.o
+obj-$(CONFIG_ACPI_MPAM) 	+= mpam.o
 obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o
 obj-$(CONFIG_ARM_AMBA)		+= amba.o
 obj-y				+= dma.o init.o
diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c
new file mode 100644
index 000000000000..84963a20c3e7
--- /dev/null
+++ b/drivers/acpi/arm64/mpam.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */
+
+#define pr_fmt(fmt) "ACPI MPAM: " fmt
+
+#include <linux/acpi.h>
+#include <linux/arm_mpam.h>
+#include <linux/bits.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/platform_device.h>
+
+#include <acpi/processor.h>
+
+/*
+ * Flags for acpi_table_mpam_msc.*_interrupt_flags.
+ * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet.
+ */
+#define ACPI_MPAM_MSC_IRQ_MODE                              BIT(0)
+#define ACPI_MPAM_MSC_IRQ_TYPE_MASK                         GENMASK(2, 1)
+#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED                        0
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK                BIT(3)
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR           0
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID                    BIT(4)
+
+/*
+ * Encodings for the MSC node body interface type field.
+ * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet.
+ */
+#define ACPI_MPAM_MSC_IFACE_MMIO   0x00
+#define ACPI_MPAM_MSC_IFACE_PCC    0x0a
+
+static bool _is_ppi_partition(u32 flags)
+{
+	u32 aff_type, is_ppi;
+	bool ret;
+
+	is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags);
+	if (!is_ppi)
+		return false;
+
+	aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags);
+	ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER);
+	if (ret)
+		pr_err_once("Partitioned interrupts not supported\n");
+
+	return ret;
+}
+
+/*
+ * Register the wired interrupt @intid described by @flags for @pdev.
+ * Returns the value of acpi_register_gsi(), or -EINVAL if @intid is zero, the
+ * interrupt is partitioned, or it is not of the wired type.
+ */
+static int acpi_mpam_register_irq(struct platform_device *pdev,
+				  u32 intid, u32 flags)
+{
+	int irq, trigger;
+
+	if (!intid || _is_ppi_partition(flags))
+		return -EINVAL;
+
+	if (FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags) !=
+	    ACPI_MPAM_MSC_IRQ_TYPE_WIRED)
+		return -EINVAL;
+
+	trigger = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE, flags);
+	irq = acpi_register_gsi(&pdev->dev, intid, trigger, ACPI_ACTIVE_HIGH);
+	if (irq < 0)
+		pr_err_once("Failed to register interrupt 0x%x with ACPI\n", intid);
+
+	return irq;
+}
+
+/*
+ * Append to @res (advancing @res_idx) each MSC interrupt that can be registered.
+ */
+static void acpi_mpam_parse_irqs(struct platform_device *pdev,
+				 struct acpi_mpam_msc_node *tbl_msc,
+				 struct resource *res, int *res_idx)
+{
+	int irq;
+
+	irq = acpi_mpam_register_irq(pdev, tbl_msc->overflow_interrupt,
+				     tbl_msc->overflow_interrupt_flags);
+	if (irq > 0)
+		res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow");
+
+	irq = acpi_mpam_register_irq(pdev, tbl_msc->error_interrupt,
+				     tbl_msc->error_interrupt_flags);
+	if (irq > 0)
+		res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error");
+}
+
+/* Pass the resource @res to mpam_ris_create(); other locator types return 0 */
+static int acpi_mpam_parse_resource(struct mpam_msc *msc,
+				    struct acpi_mpam_resource_node *res)
+{
+	u32 cache_id;
+	int level, nid;
+
+	if (res->locator_type == ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE) {
+		cache_id = res->locator.cache_locator.cache_reference;
+		level = find_acpi_cache_level_from_id(cache_id);
+		if (level <= 0) {
+			pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id);
+			return -EINVAL;
+		}
+		return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, level, cache_id);
+	}
+
+	/* These get discovered later and are treated as unknown */
+	if (res->locator_type != ACPI_MPAM_LOCATION_TYPE_MEMORY)
+		return 0;
+
+	nid = pxm_to_node(res->locator.memory_locator.proximity_domain);
+	if (nid == NUMA_NO_NODE) {
+		pr_debug("Bad proximity domain %lld, using node 0 instead\n",
+			 res->locator.memory_locator.proximity_domain);
+		nid = 0;
+	}
+	return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY,
+			       MPAM_CLASS_ID_DEFAULT, nid);
+}
+
+/* Walk the resource nodes following @tbl_msc, parsing each one for @msc */
+int acpi_mpam_parse_resources(struct mpam_msc *msc,
+			      struct acpi_mpam_msc_node *tbl_msc)
+{
+	char *pos = (char *)(tbl_msc + 1);
+	char *end = (char *)tbl_msc + tbl_msc->length;
+	struct acpi_mpam_resource_node *node;
+	int i, err;
+
+	for (i = 0; i < tbl_msc->num_resource_nodes; i++) {
+		u64 deps_limit;
+
+		if (pos + sizeof(*node) > end)
+			return -EINVAL;
+
+		node = (struct acpi_mpam_resource_node *)pos;
+		/* More dependencies than could fit in the rest of the entry? */
+		deps_limit = (u64)(end - pos) / sizeof(struct acpi_mpam_func_deps);
+		if (node->num_functional_deps > deps_limit) {
+			pr_debug("MSC has impossible number of functional dependencies\n");
+			return -EINVAL;
+		}
+
+		err = acpi_mpam_parse_resource(msc, node);
+		if (err)
+			return err;
+
+		/* Step over this node and the dependencies that trail it */
+		pos += sizeof(*node) +
+		       node->num_functional_deps * sizeof(struct acpi_mpam_func_deps);
+	}
+
+	return 0;
+}
+
+/*
+ * Create the device power management link for @pdev and report whether
+ * @acpi_id was set to an id usable for CPU affinity. @acpi_id is only written
+ * when the linked device's HID is ACPI_PROCESSOR_CONTAINER_HID.
+ */
+static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc,
+				     struct platform_device *pdev,
+				     u32 *acpi_id)
+{
+	char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1] = { 0 };
+	bool acpi_id_valid = false;
+	struct acpi_device *buddy;
+	char uid[11];
+	int len;
+
+	/* Copy into a zeroed buffer one byte larger so hid is NUL-terminated */
+	memcpy(hid, &tbl_msc->hardware_id_linked_device,
+	       sizeof(tbl_msc->hardware_id_linked_device));
+
+	if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) {
+		*acpi_id = tbl_msc->instance_id_linked_device;
+		acpi_id_valid = true;
+	}
+
+	len = snprintf(uid, sizeof(uid), "%u", tbl_msc->instance_id_linked_device);
+	if (len >= sizeof(uid)) {
+		pr_debug("Failed to convert uid of device for power management.\n");
+		return acpi_id_valid;
+	}
+
+	buddy = acpi_dev_get_first_match_dev(hid, uid, -1);
+	if (buddy) {
+		device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS);
+		acpi_dev_put(buddy);
+	}
+
+	return acpi_id_valid;
+}
+
+/*
+ * Map the table's interface_type to @iface; -EINVAL if it is not recognised.
+ */
+static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc,
+				 enum mpam_msc_iface *iface)
+{
+	if (tbl_msc->interface_type == ACPI_MPAM_MSC_IFACE_MMIO)
+		*iface = MPAM_IFACE_MMIO;
+	else if (tbl_msc->interface_type == ACPI_MPAM_MSC_IFACE_PCC)
+		*iface = MPAM_IFACE_PCC;
+	else
+		return -EINVAL;
+	return 0;
+}
+
+static struct platform_device * __init acpi_mpam_parse_msc(struct acpi_mpam_msc_node *tbl_msc)
+{
+	struct platform_device *pdev __free(platform_device_put) =
+		platform_device_alloc("mpam_msc", tbl_msc->identifier);
+	int next_res = 0, next_prop = 0, err;
+	/* Up to three properties (pcc, nrdy, affinity) and a sentinel */
+	struct property_entry props[4] = { 0 };
+	/* At most one mmio and two irq resources, no sentinel. */
+	struct resource res[3] = { 0 };
+	struct acpi_device *companion;
+	enum mpam_msc_iface iface;
+	char uid[16];
+	u32 acpi_id;
+
+	if (!pdev)
+		return ERR_PTR(-ENOMEM);
+
+	/* Some power management is described in the namespace: look for a companion */
+	err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier);
+	if (err > 0 && err < sizeof(uid)) {
+		companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1);
+		if (companion) {
+			ACPI_COMPANION_SET(&pdev->dev, companion);
+			acpi_dev_put(companion);
+		} else {
+			pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier);
+		}
+	}
+
+	if (decode_interface_type(tbl_msc, &iface)) {
+		pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (iface == MPAM_IFACE_MMIO) {
+		res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address,
+						       tbl_msc->mmio_size,
+						       "MPAM:MSC");
+	} else if (iface == MPAM_IFACE_PCC) {
+		props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel",
+							tbl_msc->base_address);
+	}
+
+	acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res);
+
+	WARN_ON_ONCE(next_res > ARRAY_SIZE(res));	/* checked after the writes above */
+	err = platform_device_add_resources(pdev, res, next_res);
+	if (err)
+		return ERR_PTR(err);
+
+	props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us",
+						tbl_msc->max_nrdy_usec);
+
+	/*
+	 * The MSC's CPU affinity is described via its linked power
+	 * management device, but only if it points at a Processor or
+	 * Processor Container.
+	 */
+	if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id))
+		props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", acpi_id);
+
+	WARN_ON_ONCE(next_prop > ARRAY_SIZE(props) - 1);	/* last slot is the sentinel */
+	err = device_create_managed_software_node(&pdev->dev, props, NULL);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * Stash the table entry for acpi_mpam_parse_resources() to discover
+	 * what this MSC controls.
+	 */
+	err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length);
+	if (err)
+		return ERR_PTR(err);
+
+	err = platform_device_add(pdev);
+	if (err)
+		return ERR_PTR(err);
+
+	return_ptr(pdev);	/* success: skip the __free() platform_device_put() */
+}
+
+/* Create a platform device for each usable MSC described by the MPAM table */
+static int __init acpi_mpam_parse(void)
+{
+	char *table_end, *table_offset;
+	struct acpi_mpam_msc_node *tbl_msc;
+	struct platform_device *pdev;
+
+	if (acpi_disabled || !system_supports_mpam())
+		return 0;
+
+	struct acpi_table_header *table __free(acpi_put_table) =
+		acpi_get_table_pointer(ACPI_SIG_MPAM, 0);
+	if (IS_ERR(table))
+		return 0;
+
+	if (table->revision < 1) {
+		pr_debug("MPAM ACPI table revision %d not supported\n", table->revision);
+		return 0;
+	}
+
+	table_offset = (char *)(table + 1);
+	table_end = (char *)table + table->length;
+
+	while (table_offset < table_end) {
+		tbl_msc = (struct acpi_mpam_msc_node *)table_offset;
+		/* A length shorter than the header (e.g. 0) would stall the walk */
+		if (table_offset + sizeof(*tbl_msc) > table_end ||
+		    tbl_msc->length < sizeof(*tbl_msc) ||
+		    tbl_msc->length > table_end - table_offset) {
+			pr_err("MSC entry is truncated or overlaps end of ACPI table\n");
+			return -EINVAL;
+		}
+		table_offset += tbl_msc->length;
+
+		/*
+		 * Don't parse an MSC that has reserved fields set. It is still
+		 * counted by acpi_mpam_count_msc(), so the MPAM driver can't probe
+		 * against all MSC and will never be enabled: safe system-wide
+		 * partid and pmg ranges can't be determined in this situation.
+		 */
+		if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) {
+			pr_err_once("Unrecognised MSC, MPAM not usable\n");
+			pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier);
+			continue;
+		}
+
+		if (!tbl_msc->mmio_size) {
+			pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier);
+			continue;
+		}
+
+		pdev = acpi_mpam_parse_msc(tbl_msc);
+		if (IS_ERR(pdev))
+			return PTR_ERR(pdev);
+	}
+
+	return 0;
+}
+
+/**
+ * acpi_mpam_count_msc() - Count the number of MSC described by firmware.
+ *
+ * Return: the number of MSCs with a non-zero mmio_size, 0 if there is no usable
+ * MPAM table, or -EINVAL if an MSC entry has an invalid length.
+ * This can be called before or in parallel with acpi_mpam_parse().
+ */
+int acpi_mpam_count_msc(void)
+{
+	char *table_end, *table_offset;
+	struct acpi_mpam_msc_node *tbl_msc;
+	int count = 0;
+
+	if (acpi_disabled || !system_supports_mpam())
+		return 0;
+
+	struct acpi_table_header *table __free(acpi_put_table) =
+		acpi_get_table_pointer(ACPI_SIG_MPAM, 0);
+
+	if (IS_ERR(table))
+		return 0;
+
+	if (table->revision < 1)
+		return 0;
+
+	table_offset = (char *)(table + 1);
+	table_end = (char *)table + table->length;
+
+	while (table_offset < table_end) {
+		tbl_msc = (struct acpi_mpam_msc_node *)table_offset;
+
+		if (table_offset + sizeof(*tbl_msc) > table_end)
+			return -EINVAL;
+		if (tbl_msc->length < sizeof(*tbl_msc))
+			return -EINVAL;
+		if (tbl_msc->length > table_end - table_offset)
+			return -EINVAL;
+		table_offset += tbl_msc->length;
+
+		if (!tbl_msc->mmio_size)
+			continue;
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * Call after ACPI devices have been created, which happens behind acpi_scan_init()
+ * called from subsys_initcall(). PCC requires the mailbox driver, which is
+ * initialised from postcore_initcall().
+ */
+subsys_initcall_sync(acpi_mpam_parse);
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 57fc8bc56166..4286e4af1092 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst
 	ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT,
 	ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT,
 	ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI,
-	ACPI_SIG_NBFT, ACPI_SIG_SWFT};
+	ACPI_SIG_NBFT, ACPI_SIG_SWFT, ACPI_SIG_MPAM};
 
 #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header)
 
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
new file mode 100644
index 000000000000..4b7f335181e0
--- /dev/null
+++ b/include/linux/arm_mpam.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Arm Ltd. */
+
+#ifndef __LINUX_ARM_MPAM_H
+#define __LINUX_ARM_MPAM_H
+
+#include <linux/acpi.h>
+#include <linux/types.h>
+
+struct mpam_msc;
+
+enum mpam_msc_iface {
+	MPAM_IFACE_MMIO,	/* a real MPAM MSC */
+	MPAM_IFACE_PCC,		/* a fake MPAM MSC */
+};
+
+enum mpam_class_types {
+	MPAM_CLASS_CACHE,	/* Caches, e.g. L2, L3 */
+	MPAM_CLASS_MEMORY,	/* Main memory */
+	MPAM_CLASS_UNKNOWN,	/* Everything else, e.g. SMMU */
+};
+
+#define MPAM_CLASS_ID_DEFAULT	255
+
+#ifdef CONFIG_ACPI_MPAM
+int acpi_mpam_parse_resources(struct mpam_msc *msc,
+			      struct acpi_mpam_msc_node *tbl_msc);
+
+int acpi_mpam_count_msc(void);
+#else
+static inline int acpi_mpam_parse_resources(struct mpam_msc *msc,
+					    struct acpi_mpam_msc_node *tbl_msc)
+{
+	return -EINVAL;
+}
+
+static inline int acpi_mpam_count_msc(void) { return -EINVAL; }
+#endif
+
+static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+				  enum mpam_class_types type, u8 class_id,
+				  int component_id)
+{
+	return -EINVAL;
+}
+
+#endif /* __LINUX_ARM_MPAM_H */
-- 
cgit v1.2.3


From 01fb4b8224726aa0f2170b63e4685cf0eec85d8d Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:41 +0000
Subject: arm_mpam: Add the class and component structures for firmware
 described ris

An MSC is a container of resources, each identified by their RIS index.
Some RIS are described by firmware to provide their position in the system.
Others are discovered when the driver probes the hardware.

To configure a resource it needs to be found by its class, e.g. 'L2'.
There are two kinds of grouping, a class is a set of components, which
are visible to user-space as there are likely to be multiple instances
of the L2 cache. (e.g. one per cluster or package)

Add support for creating and destroying structures to allow a hierarchy
of resources to be created.

Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/resctrl/mpam_devices.c  | 392 +++++++++++++++++++++++++++++++++++++++-
 drivers/resctrl/mpam_internal.h |  94 ++++++++++
 include/linux/arm_mpam.h        |   5 +
 3 files changed, 490 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index e097e852f9c3..f1dcf9bb14f2 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -36,6 +36,383 @@ struct srcu_struct mpam_srcu;
  */
 static atomic_t mpam_num_msc;
 
+/*
+ * An MSC is a physical container for controls and monitors, each identified by
+ * their RIS index. These share a base-address, interrupts and some MMIO
+ * registers. A vMSC is a virtual container for RIS in an MSC that control or
+ * monitor the same thing. Members of a vMSC are all RIS in the same MSC, but
+ * not all RIS in an MSC share a vMSC.
+ *
+ * Components are a group of vMSC that control or monitor the same thing but
+ * are from different MSC, so have different base-address, interrupts etc.
+ * Classes are the set of components of the same type.
+ *
+ * The features of a vMSC are the union of the features of the RIS it contains.
+ * The features of a Class and Component are the common subset of the vMSC
+ * they contain.
+ *
+ * e.g. The system cache may have bandwidth controls on multiple interfaces,
+ * for regulating traffic from devices independently of traffic from CPUs.
+ * If these are two RIS in one MSC, they will be treated as controlling
+ * different things, and will not share a vMSC/component/class.
+ *
+ * e.g. The L2 may have one MSC and two RIS, one for cache-controls another
+ * for bandwidth. These two RIS are members of the same vMSC.
+ *
+ * e.g. The set of RIS that make up the L2 are grouped as a component. These
+ * are sometimes termed slices. They should be configured the same, as if there
+ * were only one.
+ *
+ * e.g. The SoC probably has more than one L2, each attached to a distinct set
+ * of CPUs. All the L2 components are grouped as a class.
+ *
+ * When creating an MSC, struct mpam_msc is added to the mpam_all_msc list,
+ * then linked via struct mpam_ris to a vmsc, component and class.
+ * The same MSC may exist under different class->component->vmsc paths, but the
+ * RIS index will be unique.
+ */
+LIST_HEAD(mpam_classes);
+
+/* List of all objects that can be free()d after synchronize_srcu() */
+static LLIST_HEAD(mpam_garbage);
+
+static inline void init_garbage(struct mpam_garbage *garbage)
+{
+	init_llist_node(&garbage->llist);
+}
+
+#define add_to_garbage(x)				\
+do {							\
+	__typeof__(x) _x = (x);				\
+	_x->garbage.to_free = _x;			\
+	llist_add(&_x->garbage.llist, &mpam_garbage);	\
+} while (0)
+
+static void mpam_free_garbage(void)
+{
+	struct mpam_garbage *iter, *tmp;
+	struct llist_node *to_free = llist_del_all(&mpam_garbage);
+
+	if (!to_free)
+		return;
+
+	synchronize_srcu(&mpam_srcu);
+
+	llist_for_each_entry_safe(iter, tmp, to_free, llist) {
+		if (iter->pdev)
+			devm_kfree(&iter->pdev->dev, iter->to_free);
+		else
+			kfree(iter->to_free);
+	}
+}
+
+static struct mpam_class *
+mpam_class_alloc(u8 level_idx, enum mpam_class_types type)
+{
+	struct mpam_class *class;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	class = kzalloc(sizeof(*class), GFP_KERNEL);
+	if (!class)
+		return ERR_PTR(-ENOMEM);
+	init_garbage(&class->garbage);
+
+	INIT_LIST_HEAD_RCU(&class->components);
+	/* Affinity is updated when ris are added */
+	class->level = level_idx;
+	class->type = type;
+	INIT_LIST_HEAD_RCU(&class->classes_list);
+
+	list_add_rcu(&class->classes_list, &mpam_classes);
+
+	return class;
+}
+
+static void mpam_class_destroy(struct mpam_class *class)
+{
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_del_rcu(&class->classes_list);
+	add_to_garbage(class);
+}
+
+static struct mpam_class *
+mpam_class_find(u8 level_idx, enum mpam_class_types type)
+{
+	struct mpam_class *class;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_for_each_entry(class, &mpam_classes, classes_list) {
+		if (class->type == type && class->level == level_idx)
+			return class;
+	}
+
+	return mpam_class_alloc(level_idx, type);
+}
+
+static struct mpam_component *
+mpam_component_alloc(struct mpam_class *class, int id)
+{
+	struct mpam_component *comp;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	comp = kzalloc(sizeof(*comp), GFP_KERNEL);
+	if (!comp)
+		return ERR_PTR(-ENOMEM);
+	init_garbage(&comp->garbage);
+
+	comp->comp_id = id;
+	INIT_LIST_HEAD_RCU(&comp->vmsc);
+	/* Affinity is updated when RIS are added */
+	INIT_LIST_HEAD_RCU(&comp->class_list);
+	comp->class = class;
+
+	list_add_rcu(&comp->class_list, &class->components);
+
+	return comp;
+}
+
+static void mpam_component_destroy(struct mpam_component *comp)
+{
+	struct mpam_class *class = comp->class;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_del_rcu(&comp->class_list);
+	add_to_garbage(comp);
+
+	if (list_empty(&class->components))
+		mpam_class_destroy(class);
+}
+
+static struct mpam_component *
+mpam_component_find(struct mpam_class *class, int id)
+{
+	struct mpam_component *comp;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_for_each_entry(comp, &class->components, class_list) {
+		if (comp->comp_id == id)
+			return comp;
+	}
+
+	return mpam_component_alloc(class, id);
+}
+
+static struct mpam_vmsc *
+mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc)
+{
+	struct mpam_vmsc *vmsc;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	vmsc = kzalloc(sizeof(*vmsc), GFP_KERNEL);
+	if (!vmsc)
+		return ERR_PTR(-ENOMEM);
+	init_garbage(&vmsc->garbage);
+
+	INIT_LIST_HEAD_RCU(&vmsc->ris);
+	INIT_LIST_HEAD_RCU(&vmsc->comp_list);
+	vmsc->comp = comp;
+	vmsc->msc = msc;
+
+	list_add_rcu(&vmsc->comp_list, &comp->vmsc);
+
+	return vmsc;
+}
+
+static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc)
+{
+	struct mpam_component *comp = vmsc->comp;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_del_rcu(&vmsc->comp_list);
+	add_to_garbage(vmsc);
+
+	if (list_empty(&comp->vmsc))
+		mpam_component_destroy(comp);
+}
+
+static struct mpam_vmsc *
+mpam_vmsc_find(struct mpam_component *comp, struct mpam_msc *msc)
+{
+	struct mpam_vmsc *vmsc;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	list_for_each_entry(vmsc, &comp->vmsc, comp_list) {
+		if (vmsc->msc->id == msc->id)
+			return vmsc;
+	}
+
+	return mpam_vmsc_alloc(comp, msc);
+}
+
+/*
+ * The cacheinfo structures are only populated when CPUs are online.
+ * This helper walks the acpi tables to include offline CPUs too.
+ */
+int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
+				   cpumask_t *affinity)
+{
+	return acpi_pptt_get_cpumask_from_cache_id(cache_id, affinity);
+}
+
+/*
+ * cpumask_of_node() only knows about online CPUs. This can't tell us whether
+ * a class is represented on all possible CPUs.
+ */
+static void get_cpumask_from_node_id(u32 node_id, cpumask_t *affinity)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (node_id == cpu_to_node(cpu))
+			cpumask_set_cpu(cpu, affinity);
+	}
+}
+
+static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity,
+				 enum mpam_class_types type,
+				 struct mpam_class *class,
+				 struct mpam_component *comp)
+{
+	int err;
+
+	switch (type) {
+	case MPAM_CLASS_CACHE:
+		err = mpam_get_cpumask_from_cache_id(comp->comp_id, class->level,
+						     affinity);
+		if (err) {
+			dev_warn_once(&msc->pdev->dev,
+				      "Failed to determine CPU affinity\n");
+			return err;
+		}
+
+		if (cpumask_empty(affinity))
+			dev_warn_once(&msc->pdev->dev, "no CPUs associated with cache node\n");
+
+		break;
+	case MPAM_CLASS_MEMORY:
+		get_cpumask_from_node_id(comp->comp_id, affinity);
+		/* affinity may be empty for CPU-less memory nodes */
+		break;
+	case MPAM_CLASS_UNKNOWN:
+		return 0;
+	}
+
+	cpumask_and(affinity, affinity, &msc->accessibility);
+
+	return 0;
+}
+
+static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx,
+				  enum mpam_class_types type, u8 class_id,
+				  int component_id)
+{
+	int err;
+	struct mpam_vmsc *vmsc;
+	struct mpam_msc_ris *ris;
+	struct mpam_class *class;
+	struct mpam_component *comp;
+	struct platform_device *pdev = msc->pdev;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	if (ris_idx > MPAM_MSC_MAX_NUM_RIS)
+		return -EINVAL;
+
+	if (test_and_set_bit(ris_idx, &msc->ris_idxs))
+		return -EBUSY;
+
+	ris = devm_kzalloc(&msc->pdev->dev, sizeof(*ris), GFP_KERNEL);
+	if (!ris)
+		return -ENOMEM;
+	init_garbage(&ris->garbage);
+	ris->garbage.pdev = pdev;
+
+	class = mpam_class_find(class_id, type);
+	if (IS_ERR(class))
+		return PTR_ERR(class);
+
+	comp = mpam_component_find(class, component_id);
+	if (IS_ERR(comp)) {
+		if (list_empty(&class->components))
+			mpam_class_destroy(class);
+		return PTR_ERR(comp);
+	}
+
+	vmsc = mpam_vmsc_find(comp, msc);
+	if (IS_ERR(vmsc)) {
+		if (list_empty(&comp->vmsc))
+			mpam_component_destroy(comp);
+		return PTR_ERR(vmsc);
+	}
+
+	err = mpam_ris_get_affinity(msc, &ris->affinity, type, class, comp);
+	if (err) {
+		if (list_empty(&vmsc->ris))
+			mpam_vmsc_destroy(vmsc);
+		return err;
+	}
+
+	ris->ris_idx = ris_idx;
+	INIT_LIST_HEAD_RCU(&ris->msc_list);
+	INIT_LIST_HEAD_RCU(&ris->vmsc_list);
+	ris->vmsc = vmsc;
+
+	cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity);
+	cpumask_or(&class->affinity, &class->affinity, &ris->affinity);
+	list_add_rcu(&ris->vmsc_list, &vmsc->ris);
+	list_add_rcu(&ris->msc_list, &msc->ris);
+
+	return 0;
+}
+
+static void mpam_ris_destroy(struct mpam_msc_ris *ris)
+{
+	struct mpam_vmsc *vmsc = ris->vmsc;
+	struct mpam_msc *msc = vmsc->msc;
+	struct mpam_component *comp = vmsc->comp;
+	struct mpam_class *class = comp->class;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	/*
+	 * It is assumed affinities don't overlap. If they do the class becomes
+	 * unusable immediately.
+	 */
+	cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity);
+	cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity);
+	clear_bit(ris->ris_idx, &msc->ris_idxs);
+	list_del_rcu(&ris->msc_list);
+	list_del_rcu(&ris->vmsc_list);
+	add_to_garbage(ris);
+
+	if (list_empty(&vmsc->ris))
+		mpam_vmsc_destroy(vmsc);
+}
+
+int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+		    enum mpam_class_types type, u8 class_id, int component_id)
+{
+	int err;
+
+	mutex_lock(&mpam_list_lock);
+	err = mpam_ris_create_locked(msc, ris_idx, type, class_id,
+				     component_id);
+	mutex_unlock(&mpam_list_lock);
+	if (err)
+		mpam_free_garbage();
+
+	return err;
+}
+
 /*
  * An MSC can control traffic from a set of CPUs, but may only be accessible
  * from a (hopefully wider) set of CPUs. The common reason for this is power
@@ -56,14 +433,25 @@ static void update_msc_accessibility(struct mpam_msc *msc)
 		acpi_pptt_get_cpus_from_container(affinity_id, &msc->accessibility);
 }
 
+/*
+ * There are two ways of reaching a struct mpam_msc_ris. Via the
+ * class->component->vmsc->ris, or via the msc.
+ * When destroying the msc, the other side needs unlinking and cleaning up too.
+ */
 static void mpam_msc_destroy(struct mpam_msc *msc)
 {
 	struct platform_device *pdev = msc->pdev;
+	struct mpam_msc_ris *ris, *tmp;
 
 	lockdep_assert_held(&mpam_list_lock);
 
+	list_for_each_entry_safe(ris, tmp, &msc->ris, msc_list)
+		mpam_ris_destroy(ris);
+
 	list_del_rcu(&msc->all_msc_list);
 	platform_set_drvdata(pdev, NULL);
+
+	add_to_garbage(msc);
 }
 
 static void mpam_msc_drv_remove(struct platform_device *pdev)
@@ -74,7 +462,7 @@ static void mpam_msc_drv_remove(struct platform_device *pdev)
 	mpam_msc_destroy(msc);
 	mutex_unlock(&mpam_list_lock);
 
-	synchronize_srcu(&mpam_srcu);
+	mpam_free_garbage();
 }
 
 static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev)
@@ -90,6 +478,8 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev)
 	msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
 	if (!msc)
 		return ERR_PTR(-ENOMEM);
+	init_garbage(&msc->garbage);
+	msc->garbage.pdev = pdev;
 
 	err = devm_mutex_init(dev, &msc->probe_lock);
 	if (err)
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index 540066903eca..8f7a28d2c021 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -7,11 +7,30 @@
 #include <linux/arm_mpam.h>
 #include <linux/cpumask.h>
 #include <linux/io.h>
+#include <linux/llist.h>
 #include <linux/mutex.h>
+#include <linux/srcu.h>
 #include <linux/types.h>
 
+#define MPAM_MSC_MAX_NUM_RIS	16
+
 struct platform_device;
 
+/*
+ * Structures protected by SRCU may not be freed for a surprising amount of
+ * time (especially if perf is running). To ensure the MPAM error interrupt can
+ * tear down all the structures, build a list of objects that can be garbage
+ * collected once synchronize_srcu() has returned.
+ * If pdev is non-NULL, use devm_kfree().
+ */
+struct mpam_garbage {
+	/* member of mpam_garbage */
+	struct llist_node	llist;
+
+	void			*to_free;
+	struct platform_device	*pdev;
+};
+
 struct mpam_msc {
 	/* member of mpam_all_msc */
 	struct list_head	all_msc_list;
@@ -45,5 +64,80 @@ struct mpam_msc {
 
 	void __iomem		*mapped_hwpage;
 	size_t			mapped_hwpage_sz;
+
+	struct mpam_garbage	garbage;
+};
+
+struct mpam_class {
+	/* mpam_components in this class */
+	struct list_head	components;
+
+	cpumask_t		affinity;
+
+	u8			level;
+	enum mpam_class_types	type;
+
+	/* member of mpam_classes */
+	struct list_head	classes_list;
+
+	struct mpam_garbage	garbage;
+};
+
+struct mpam_component {
+	u32			comp_id;
+
+	/* mpam_vmsc in this component */
+	struct list_head	vmsc;
+
+	cpumask_t		affinity;
+
+	/* member of mpam_class:components */
+	struct list_head	class_list;
+
+	/* parent: */
+	struct mpam_class	*class;
+
+	struct mpam_garbage	garbage;
+};
+
+struct mpam_vmsc {
+	/* member of mpam_component:vmsc_list */
+	struct list_head	comp_list;
+
+	/* mpam_msc_ris in this vmsc */
+	struct list_head	ris;
+
+	/* All RIS in this vMSC are members of this MSC */
+	struct mpam_msc		*msc;
+
+	/* parent: */
+	struct mpam_component	*comp;
+
+	struct mpam_garbage	garbage;
+};
+
+struct mpam_msc_ris {
+	u8			ris_idx;
+
+	cpumask_t		affinity;
+
+	/* member of mpam_vmsc:ris */
+	struct list_head	vmsc_list;
+
+	/* member of mpam_msc:ris */
+	struct list_head	msc_list;
+
+	/* parent: */
+	struct mpam_vmsc	*vmsc;
+
+	struct mpam_garbage	garbage;
 };
+
+/* List of all classes - protected by srcu */
+extern struct srcu_struct mpam_srcu;
+extern struct list_head mpam_classes;
+
+int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
+				   cpumask_t *affinity);
+
 #endif /* MPAM_INTERNAL_H */
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
index 4b7f335181e0..13a8ac5c2cbd 100644
--- a/include/linux/arm_mpam.h
+++ b/include/linux/arm_mpam.h
@@ -37,11 +37,16 @@ static inline int acpi_mpam_parse_resources(struct mpam_msc *msc,
 static inline int acpi_mpam_count_msc(void) { return -EINVAL; }
 #endif
 
+#ifdef CONFIG_ARM64_MPAM_DRIVER
+int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+		    enum mpam_class_types type, u8 class_id, int component_id);
+#else
 static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 				  enum mpam_class_types type, u8 class_id,
 				  int component_id)
 {
 	return -EINVAL;
 }
+#endif
 
 #endif /* __LINUX_ARM_MPAM_H */
-- 
cgit v1.2.3


From bd221f9f82afb616887e0b88b43fbb937479d744 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 19 Nov 2025 12:22:44 +0000
Subject: arm_mpam: Probe hardware to find the supported partid/pmg values

CPUs can generate traffic with a range of PARTID and PMG values,
but each MSC may also have its own maximum size for these fields.
Before MPAM can be used, the driver needs to probe each RIS on
each MSC, to find the system-wide smallest value that can be used.
The limits from requestors (e.g. CPUs) also need taking into account.

While doing this, RIS entries that firmware didn't describe are created
under MPAM_CLASS_UNKNOWN.

This adds the low level MSC write accessors.

While we're here, implement the mpam_register_requestor() call
for the arch code to register the CPU limits. Future callers of this
will tell us about the SMMU and ITS.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Ben Horgan <ben.horgan@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Fenghua Yu <fenghuay@nvidia.com>
Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/resctrl/mpam_devices.c  | 148 +++++++++++++++++++++++++++++++++++++++-
 drivers/resctrl/mpam_internal.h |   6 ++
 include/linux/arm_mpam.h        |  14 ++++
 3 files changed, 167 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 51284f55ae9b..3d9b87a9727a 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -6,6 +6,7 @@
 #include <linux/acpi.h>
 #include <linux/atomic.h>
 #include <linux/arm_mpam.h>
+#include <linux/bitfield.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
@@ -42,6 +43,15 @@ static atomic_t mpam_num_msc;
 static int mpam_cpuhp_state;
 static DEFINE_MUTEX(mpam_cpuhp_state_lock);
 
+/*
+ * The smallest common values for any CPU or MSC in the system.
+ * Generating traffic outside this range will result in screaming interrupts.
+ */
+u16 mpam_partid_max;
+u8 mpam_pmg_max;
+static bool partid_max_init, partid_max_published;
+static DEFINE_SPINLOCK(partid_max_lock);
+
 /*
  * mpam is enabled once all devices have been probed from CPU online callbacks,
  * scheduled via this work_struct. If access to an MSC depends on a CPU that
@@ -143,6 +153,70 @@ static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg)
 
 #define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg)
 
+static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val)
+{
+	WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz);
+	WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility));
+
+	writel_relaxed(val, msc->mapped_hwpage + reg);
+}
+
+static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val)
+{
+	lockdep_assert_held_once(&msc->part_sel_lock);
+	__mpam_write_reg(msc, reg, val);
+}
+
+#define mpam_write_partsel_reg(msc, reg, val)  _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val)
+
+static u64 mpam_msc_read_idr(struct mpam_msc *msc)
+{
+	u64 idr_high = 0, idr_low;
+
+	lockdep_assert_held(&msc->part_sel_lock);
+
+	idr_low = mpam_read_partsel_reg(msc, IDR);
+	if (FIELD_GET(MPAMF_IDR_EXT, idr_low))
+		idr_high = mpam_read_partsel_reg(msc, IDR + 4);
+
+	return (idr_high << 32) | idr_low;
+}
+
+static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc)
+{
+	lockdep_assert_held(&msc->part_sel_lock);
+
+	mpam_write_partsel_reg(msc, PART_SEL, partsel);
+}
+
+static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc)
+{
+	u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) |
+		      FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, partid);
+
+	__mpam_part_sel_raw(partsel, msc);
+}
+
+int mpam_register_requestor(u16 partid_max, u8 pmg_max)
+{
+	guard(spinlock)(&partid_max_lock);
+	if (!partid_max_init) {
+		mpam_partid_max = partid_max;
+		mpam_pmg_max = pmg_max;
+		partid_max_init = true;
+	} else if (!partid_max_published) {
+		mpam_partid_max = min(mpam_partid_max, partid_max);
+		mpam_pmg_max = min(mpam_pmg_max, pmg_max);
+	} else {
+		/* New requestors can't lower the values */
+		if (partid_max < mpam_partid_max || pmg_max < mpam_pmg_max)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mpam_register_requestor);
+
 static struct mpam_class *
 mpam_class_alloc(u8 level_idx, enum mpam_class_types type)
 {
@@ -450,9 +524,35 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 	return err;
 }
 
+static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc,
+						   u8 ris_idx)
+{
+	int err;
+	struct mpam_msc_ris *ris;
+
+	lockdep_assert_held(&mpam_list_lock);
+
+	if (!test_bit(ris_idx, &msc->ris_idxs)) {
+		err = mpam_ris_create_locked(msc, ris_idx, MPAM_CLASS_UNKNOWN,
+					     0, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	list_for_each_entry(ris, &msc->ris, msc_list) {
+		if (ris->ris_idx == ris_idx)
+			return ris;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
 static int mpam_msc_hw_probe(struct mpam_msc *msc)
 {
 	u64 idr;
+	u16 partid_max;
+	u8 ris_idx, pmg_max;
+	struct mpam_msc_ris *ris;
 	struct device *dev = &msc->pdev->dev;
 
 	lockdep_assert_held(&msc->probe_lock);
@@ -463,6 +563,40 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
 		return -EIO;
 	}
 
+	/* Grab an IDR value to find out how many RIS there are */
+	mutex_lock(&msc->part_sel_lock);
+	idr = mpam_msc_read_idr(msc);
+	mutex_unlock(&msc->part_sel_lock);
+
+	msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr);
+
+	/* Use these values so partid/pmg always starts with a valid value */
+	msc->partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr);
+	msc->pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr);
+
+	for (ris_idx = 0; ris_idx <= msc->ris_max; ris_idx++) {
+		mutex_lock(&msc->part_sel_lock);
+		__mpam_part_sel(ris_idx, 0, msc);
+		idr = mpam_msc_read_idr(msc);
+		mutex_unlock(&msc->part_sel_lock);
+
+		partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr);
+		pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr);
+		msc->partid_max = min(msc->partid_max, partid_max);
+		msc->pmg_max = min(msc->pmg_max, pmg_max);
+
+		mutex_lock(&mpam_list_lock);
+		ris = mpam_get_or_create_ris(msc, ris_idx);
+		mutex_unlock(&mpam_list_lock);
+		if (IS_ERR(ris))
+			return PTR_ERR(ris);
+	}
+
+	spin_lock(&partid_max_lock);
+	mpam_partid_max = min(mpam_partid_max, msc->partid_max);
+	mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max);
+	spin_unlock(&partid_max_lock);
+
 	msc->probed = true;
 
 	return 0;
@@ -682,10 +816,20 @@ static struct platform_driver mpam_msc_driver = {
 
 static void mpam_enable_once(void)
 {
+	/*
+	 * Once the cpuhp callbacks have been changed, mpam_partid_max can no
+	 * longer change.
+	 */
+	spin_lock(&partid_max_lock);
+	partid_max_published = true;
+	spin_unlock(&partid_max_lock);
+
 	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline,
 				      "mpam:online");
 
-	pr_info("MPAM enabled\n");
+	/* Use printk() to avoid the pr_fmt adding the function name. */
+	printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n",
+	       mpam_partid_max + 1, mpam_pmg_max + 1);
 }
 
 void mpam_disable(struct work_struct *ignored)
@@ -751,4 +895,6 @@ static int __init mpam_msc_driver_init(void)
 
 	return platform_driver_register(&mpam_msc_driver);
 }
+
+/* Must occur after arm64_mpam_register_cpus() from arch_initcall() */
 subsys_initcall(mpam_msc_driver_init);
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index 4e1538d29783..768a58a3ab27 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -49,6 +49,8 @@ struct mpam_msc {
 	 */
 	struct mutex		probe_lock;
 	bool			probed;
+	u16			partid_max;
+	u8			pmg_max;
 	unsigned long		ris_idxs;
 	u32			ris_max;
 
@@ -138,6 +140,10 @@ struct mpam_msc_ris {
 extern struct srcu_struct mpam_srcu;
 extern struct list_head mpam_classes;
 
+/* System wide partid/pmg values */
+extern u16 mpam_partid_max;
+extern u8 mpam_pmg_max;
+
 /* Scheduled work callback to enable mpam once all MSC have been probed */
 void mpam_enable(struct work_struct *work);
 void mpam_disable(struct work_struct *work);
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
index 13a8ac5c2cbd..7f00c5285a32 100644
--- a/include/linux/arm_mpam.h
+++ b/include/linux/arm_mpam.h
@@ -49,4 +49,18 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 }
 #endif
 
+/**
+ * mpam_register_requestor() - Register a requestor with the MPAM driver
+ * @partid_max:		The maximum PARTID value the requestor can generate.
+ * @pmg_max:		The maximum PMG value the requestor can generate.
+ *
+ * Registers a requestor with the MPAM driver to ensure the chosen system-wide
+ * minimum PARTID and PMG values will allow the requestor's features to be used.
+ *
+ * Returns an error if the registration is too late, and a larger PARTID/PMG
+ * value has been advertised to user-space. In this case the requestor should
+ * not use its MPAM features. Returns 0 on success.
+ */
+int mpam_register_requestor(u16 partid_max, u8 pmg_max);
+
 #endif /* __LINUX_ARM_MPAM_H */
-- 
cgit v1.2.3


From 934fa943b53795339486cc0026b3ab7ad39dc600 Mon Sep 17 00:00:00 2001
From: Aditya Garg <gargaditya@linux.microsoft.com>
Date: Tue, 18 Nov 2025 03:11:08 -0800
Subject: net: mana: Handle SKB if TX SGEs exceed hardware limit

The MANA hardware supports a maximum of 30 scatter-gather entries (SGEs)
per TX WQE. Exceeding this limit can cause TX failures.
Add ndo_features_check() callback to validate SKB layout before
transmission. For GSO SKBs that would exceed the hardware SGE limit, clear
NETIF_F_GSO_MASK to enforce software segmentation in the stack.
Add a fallback in mana_start_xmit() to linearize non-GSO SKBs that still
exceed the SGE limit.

Also, add an ethtool counter for linearized SKBs.

Co-developed-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763464269-10431-2-git-send-email-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 40 ++++++++++++++++++++--
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c |  2 ++
 include/net/mana/gdma.h                            |  8 ++++-
 include/net/mana/mana.h                            |  1 +
 4 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 13f47be7aca6..7b49ab005e2d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -11,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/pci.h>
 #include <linux/export.h>
+#include <linux/skbuff.h>
 
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
@@ -329,6 +330,21 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	cq = &apc->tx_qp[txq_idx].tx_cq;
 	tx_stats = &txq->stats;
 
+	BUILD_BUG_ON(MAX_TX_WQE_SGL_ENTRIES != MANA_MAX_TX_WQE_SGL_ENTRIES);
+	if (MAX_SKB_FRAGS + 2 > MAX_TX_WQE_SGL_ENTRIES &&
+	    skb_shinfo(skb)->nr_frags + 2 > MAX_TX_WQE_SGL_ENTRIES) {
+		/* GSO skbs that exceed the hardware SGE limit are not expected
+		 * here, as the mana_features_check() callback handles them.
+		 */
+		if (skb_linearize(skb)) {
+			netdev_warn_once(ndev, "Failed to linearize skb with nr_frags=%d and is_gso=%d\n",
+					 skb_shinfo(skb)->nr_frags,
+					 skb_is_gso(skb));
+			goto tx_drop_count;
+		}
+		apc->eth_stats.tx_linear_pkt_cnt++;
+	}
+
 	pkg.tx_oob.s_oob.vcq_num = cq->gdma_id;
 	pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame;
 
@@ -442,8 +458,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 		}
 	}
 
-	WARN_ON_ONCE(pkg.wqe_req.num_sge > MAX_TX_WQE_SGL_ENTRIES);
-
 	if (pkg.wqe_req.num_sge <= ARRAY_SIZE(pkg.sgl_array)) {
 		pkg.wqe_req.sgl = pkg.sgl_array;
 	} else {
@@ -518,6 +532,25 @@ tx_drop:
 	return NETDEV_TX_OK;
 }
 
+#if (MAX_SKB_FRAGS + 2 > MANA_MAX_TX_WQE_SGL_ENTRIES)
+static netdev_features_t mana_features_check(struct sk_buff *skb,
+					     struct net_device *ndev,
+					     netdev_features_t features)
+{
+	if (skb_shinfo(skb)->nr_frags + 2 > MAX_TX_WQE_SGL_ENTRIES) {
+		/* Exceeds HW SGE limit.
+		 * GSO case:
+		 *   Disable GSO so the stack will software-segment the skb
+		 *   into smaller skbs that fit the SGE budget.
+		 * Non-GSO case:
+		 *   The xmit path will attempt skb_linearize() as a fallback.
+		 */
+		features &= ~NETIF_F_GSO_MASK;
+	}
+	return features;
+}
+#endif
+
 static void mana_get_stats64(struct net_device *ndev,
 			     struct rtnl_link_stats64 *st)
 {
@@ -883,6 +916,9 @@ static const struct net_device_ops mana_devops = {
 	.ndo_open		= mana_open,
 	.ndo_stop		= mana_close,
 	.ndo_select_queue	= mana_select_queue,
+#if (MAX_SKB_FRAGS + 2 > MANA_MAX_TX_WQE_SGL_ENTRIES)
+	.ndo_features_check	= mana_features_check,
+#endif
 	.ndo_start_xmit		= mana_start_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_get_stats64	= mana_get_stats64,
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 99e811208683..0e2f4343ac67 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -18,6 +18,8 @@ static const struct mana_stats_desc mana_eth_stats[] = {
 	{"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
 	{"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
 					tx_cqe_unknown_type)},
+	{"tx_linear_pkt_cnt", offsetof(struct mana_ethtool_stats,
+				       tx_linear_pkt_cnt)},
 	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
 					rx_coalesced_err)},
 	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 2e4f2f3175e5..a4cf307859f8 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -486,6 +486,8 @@ struct gdma_wqe {
 #define INLINE_OOB_SMALL_SIZE 8
 #define INLINE_OOB_LARGE_SIZE 24
 
+#define MANA_MAX_TX_WQE_SGL_ENTRIES 30
+
 #define MAX_TX_WQE_SIZE 512
 #define MAX_RX_WQE_SIZE 256
 
@@ -592,6 +594,9 @@ enum {
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
 #define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
 
+/* Driver supports linearizing the skb when num_sge exceeds hardware limit */
+#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
+
 /* Driver can send HWC periodically to query stats */
 #define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21)
 
@@ -605,7 +610,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
 	 GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \
-	 GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY)
+	 GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \
+	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index d37f4cea0ac3..fb28b3cac067 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -377,6 +377,7 @@ struct mana_ethtool_stats {
 	u64 wake_queue;
 	u64 tx_cqe_err;
 	u64 tx_cqe_unknown_type;
+	u64 tx_linear_pkt_cnt;
 	u64 rx_coalesced_err;
 	u64 rx_cqe_unknown_type;
 };
-- 
cgit v1.2.3


From 45120304e84171fd215c1b57b15b285446d15106 Mon Sep 17 00:00:00 2001
From: Aditya Garg <gargaditya@linux.microsoft.com>
Date: Tue, 18 Nov 2025 03:11:09 -0800
Subject: net: mana: Drop TX skb on post_work_request failure and unmap
 resources

Drop TX packets when posting the work request fails and ensure DMA
mappings are always cleaned up.

Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763464269-10431-3-git-send-email-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 6 +-----
 drivers/net/ethernet/microsoft/mana/mana_en.c   | 7 +++----
 include/net/mana/mana.h                         | 1 +
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index effe0a2f207a..8fd70b34807a 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1300,7 +1300,6 @@ int mana_gd_post_work_request(struct gdma_queue *wq,
 			      struct gdma_posted_wqe_info *wqe_info)
 {
 	u32 client_oob_size = wqe_req->inline_oob_size;
-	struct gdma_context *gc;
 	u32 sgl_data_size;
 	u32 max_wqe_size;
 	u32 wqe_size;
@@ -1330,11 +1329,8 @@ int mana_gd_post_work_request(struct gdma_queue *wq,
 	if (wqe_size > max_wqe_size)
 		return -EINVAL;
 
-	if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) {
-		gc = wq->gdma_dev->gdma_context;
-		dev_err(gc->dev, "unsuccessful flow control!\n");
+	if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq))
 		return -ENOSPC;
-	}
 
 	if (wqe_info)
 		wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 7b49ab005e2d..1ad154f9db1a 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -492,9 +492,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
 	if (err) {
 		(void)skb_dequeue_tail(&txq->pending_skbs);
+		mana_unmap_skb(skb, apc);
 		netdev_warn(ndev, "Failed to post TX OOB: %d\n", err);
-		err = NETDEV_TX_BUSY;
-		goto tx_busy;
+		goto free_sgl_ptr;
 	}
 
 	err = NETDEV_TX_OK;
@@ -514,7 +514,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	tx_stats->bytes += len + ((num_gso_seg - 1) * gso_hs);
 	u64_stats_update_end(&tx_stats->syncp);
 
-tx_busy:
 	if (netif_tx_queue_stopped(net_txq) && mana_can_tx(gdma_sq)) {
 		netif_tx_wake_queue(net_txq);
 		apc->eth_stats.wake_queue++;
@@ -1687,7 +1686,7 @@ static int mana_move_wq_tail(struct gdma_queue *wq, u32 num_units)
 	return 0;
 }
 
-static void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc)
+void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc)
 {
 	struct mana_skb_head *ash = (struct mana_skb_head *)skb->head;
 	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index fb28b3cac067..d7e089c6b694 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -593,6 +593,7 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 void mana_query_phy_stats(struct mana_port_context *apc);
 int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
 void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
+void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc);
 
 extern const struct ethtool_ops mana_ethtool_ops;
 extern struct dentry *mana_debugfs_root;
-- 
cgit v1.2.3


From 3fee828789b1cf294a8fc83ad8a37f644c174fae Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Sun, 16 Nov 2025 22:45:36 +0200
Subject: net/mlx5: Move the esw mode notifier chain outside the devlink lock

The esw mode change notifier chain is initialized/cleaned up in
mlx5_init_one() / mlx5_uninit_one() with the devlink lock held.

Move the notifier head from the eswitch struct into mlx5_priv directly,
and initialize it outside the critical section. This will allow notifier
registration to happen earlier in the init procedure in subsequent
patches.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c    | 13 +++++++------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h    |  7 ++++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c       |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c |  6 +++---
 include/linux/mlx5/driver.h                          |  1 +
 5 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 25af8bd7f077..3adf2b1cd26a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1474,7 +1474,7 @@ static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode)
 
 	info.new_mode = mode;
 
-	blocking_notifier_call_chain(&esw->n_head, 0, &info);
+	blocking_notifier_call_chain(&esw->dev->priv.esw_n_head, 0, &info);
 }
 
 static int mlx5_esw_egress_acls_init(struct mlx5_core_dev *dev)
@@ -2050,7 +2050,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC;
 	else
 		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head);
 
 	esw_info(dev,
 		 "Total vports %d, per vport: max uc(%d) max mc(%d)\n",
@@ -2379,14 +2378,16 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 		dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS);
 }
 
-int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *nb)
+int mlx5_esw_event_notifier_register(struct mlx5_core_dev *dev,
+				     struct notifier_block *nb)
 {
-	return blocking_notifier_chain_register(&esw->n_head, nb);
+	return blocking_notifier_chain_register(&dev->priv.esw_n_head, nb);
 }
 
-void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *nb)
+void mlx5_esw_event_notifier_unregister(struct mlx5_core_dev *dev,
+					struct notifier_block *nb)
 {
-	blocking_notifier_chain_unregister(&esw->n_head, nb);
+	blocking_notifier_chain_unregister(&dev->priv.esw_n_head, nb);
 }
 
 /**
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index beaec450a734..ad1073f7b79f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -403,7 +403,6 @@ struct mlx5_eswitch {
 	struct {
 		u32             large_group_num;
 	}  params;
-	struct blocking_notifier_head n_head;
 	struct xarray paired;
 	struct mlx5_devcom_comp_dev *devcom;
 	u16 enabled_ipsec_vf_count;
@@ -864,8 +863,10 @@ struct mlx5_esw_event_info {
 	u16 new_mode;
 };
 
-int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n);
-void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n);
+int mlx5_esw_event_notifier_register(struct mlx5_core_dev *dev,
+				     struct notifier_block *n);
+void mlx5_esw_event_notifier_unregister(struct mlx5_core_dev *dev,
+					struct notifier_block *n);
 
 bool mlx5_esw_hold(struct mlx5_core_dev *dev);
 void mlx5_esw_release(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 612fc4de9d3c..05f16f3e9c4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1834,6 +1834,8 @@ static int mlx5_notifiers_init(struct mlx5_core_dev *dev)
 		return err;
 	}
 
+	BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.esw_n_head);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 3304f25cc805..2ece4983d33f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -481,7 +481,7 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 	xa_init(&table->function_ids);
 	dev->priv.sf_table = table;
 	table->esw_nb.notifier_call = mlx5_sf_esw_event;
-	err = mlx5_esw_event_notifier_register(dev->priv.eswitch, &table->esw_nb);
+	err = mlx5_esw_event_notifier_register(dev, &table->esw_nb);
 	if (err)
 		goto reg_err;
 
@@ -496,7 +496,7 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 	return 0;
 
 vhca_err:
-	mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb);
+	mlx5_esw_event_notifier_unregister(dev, &table->esw_nb);
 reg_err:
 	mutex_destroy(&table->sf_state_lock);
 	kfree(table);
@@ -513,7 +513,7 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 
 	mlx5_blocking_notifier_unregister(dev, &table->mdev_nb);
 	mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb);
-	mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb);
+	mlx5_esw_event_notifier_unregister(dev, &table->esw_nb);
 	mutex_destroy(&table->sf_state_lock);
 	WARN_ON(!xa_empty(&table->function_ids));
 	kfree(table);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7aec53371cf0..9a4a5112a59e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -599,6 +599,7 @@ struct mlx5_priv {
 
 	struct mlx5_flow_steering *steering;
 	struct mlx5_mpfs        *mpfs;
+	struct blocking_notifier_head esw_n_head;
 	struct mlx5_eswitch     *eswitch;
 	struct mlx5_core_sriov	sriov;
 	struct mlx5_lag		*lag;
-- 
cgit v1.2.3


From d3a356db853bc2dfb51034eacafd41aca7dd4c37 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Sun, 16 Nov 2025 22:45:37 +0200
Subject: net/mlx5: Move the vhca event notifier outside of the devlink lock

The vhca event notifier consists of an atomic notifier for vhca state
changes (used for SF events), multiple workqueues and a blocking
notifier chain for delivering the vhca state change events for further
processing.

This patch moves the vhca notifier head outside of mlx5_init_one() /
mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit()
functions.

This allows called notifiers to grab the PF devlink lock which was
previously impossible because it would create a circular lock
dependency.

mlx5_vhca_event_stop() is now called earlier in the cleanup phase and
flushes the workqueues to ensure that after the call, there are no
pending events. This simplifies the cleanup flow for vhca event
consumers.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  3 +-
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c   |  1 -
 .../net/ethernet/mellanox/mlx5/core/sf/hw_table.c  |  1 +
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.c    | 69 ++++++++--------------
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.h    |  5 ++
 include/linux/mlx5/driver.h                        |  4 +-
 6 files changed, 35 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 05f16f3e9c4f..097ba7ab90a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1438,12 +1438,12 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 {
 	mlx5_eswitch_disable(dev->priv.eswitch);
 	mlx5_devlink_traps_unregister(priv_to_devlink(dev));
+	mlx5_vhca_event_stop(dev);
 	mlx5_sf_dev_table_destroy(dev);
 	mlx5_sriov_detach(dev);
 	mlx5_lag_remove_mdev(dev);
 	mlx5_ec_cleanup(dev);
 	mlx5_sf_hw_table_destroy(dev);
-	mlx5_vhca_event_stop(dev);
 	mlx5_fs_core_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
 	mlx5_rsc_dump_cleanup(dev);
@@ -1835,6 +1835,7 @@ static int mlx5_notifiers_init(struct mlx5_core_dev *dev)
 	}
 
 	BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.esw_n_head);
+	mlx5_vhca_state_notifier_init(dev);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
index 99219ea52c4b..a68a8ee24dce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
@@ -381,7 +381,6 @@ void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 
 	mlx5_sf_dev_destroy_active_works(table);
 	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
-	mlx5_vhca_event_work_queues_flush(dev);
 
 	/* Now that event handler is not running, it is safe to destroy
 	 * the sf device without race.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
index 1f613320fe07..a14b1aa5fb5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
@@ -389,6 +389,7 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev)
 		return;
 
 	mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb);
+
 	/* Dealloc SFs whose firmware event has been missed. */
 	mlx5_sf_hw_table_dealloc_all(table);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
index cda01ba441ae..b04cf6cf8956 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
@@ -9,15 +9,9 @@
 #define CREATE_TRACE_POINTS
 #include "diag/vhca_tracepoint.h"
 
-struct mlx5_vhca_state_notifier {
-	struct mlx5_core_dev *dev;
-	struct mlx5_nb nb;
-	struct blocking_notifier_head n_head;
-};
-
 struct mlx5_vhca_event_work {
 	struct work_struct work;
-	struct mlx5_vhca_state_notifier *notifier;
+	struct mlx5_core_dev *dev;
 	struct mlx5_vhca_state_event event;
 };
 
@@ -95,16 +89,14 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *
 	mlx5_vhca_event_arm(dev, event->function_id);
 	trace_mlx5_sf_vhca_event(dev, event);
 
-	blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event);
+	blocking_notifier_call_chain(&dev->priv.vhca_state_n_head, 0, event);
 }
 
 static void mlx5_vhca_state_work_handler(struct work_struct *_work)
 {
 	struct mlx5_vhca_event_work *work = container_of(_work, struct mlx5_vhca_event_work, work);
-	struct mlx5_vhca_state_notifier *notifier = work->notifier;
-	struct mlx5_core_dev *dev = notifier->dev;
 
-	mlx5_vhca_event_notify(dev, &work->event);
+	mlx5_vhca_event_notify(work->dev, &work->event);
 	kfree(work);
 }
 
@@ -116,8 +108,8 @@ void mlx5_vhca_events_work_enqueue(struct mlx5_core_dev *dev, int idx, struct wo
 static int
 mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data)
 {
-	struct mlx5_vhca_state_notifier *notifier =
-				mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb);
+	struct mlx5_core_dev *dev = mlx5_nb_cof(nb, struct mlx5_core_dev,
+						priv.vhca_state_nb);
 	struct mlx5_vhca_event_work *work;
 	struct mlx5_eqe *eqe = data;
 	int wq_idx;
@@ -126,10 +118,10 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
 	if (!work)
 		return NOTIFY_DONE;
 	INIT_WORK(&work->work, &mlx5_vhca_state_work_handler);
-	work->notifier = notifier;
+	work->dev = dev;
 	work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id);
 	wq_idx = work->event.function_id % MLX5_DEV_MAX_WQS;
-	mlx5_vhca_events_work_enqueue(notifier->dev, wq_idx, &work->work);
+	mlx5_vhca_events_work_enqueue(dev, wq_idx, &work->work);
 	return NOTIFY_OK;
 }
 
@@ -145,9 +137,15 @@ void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
 	MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_teardown_request, 1);
 }
 
+void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev)
+{
+	BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.vhca_state_n_head);
+	MLX5_NB_INIT(&dev->priv.vhca_state_nb, mlx5_vhca_state_change_notifier,
+		     VHCA_STATE_CHANGE);
+}
+
 int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 {
-	struct mlx5_vhca_state_notifier *notifier;
 	char wq_name[MLX5_CMD_WQ_MAX_NAME];
 	struct mlx5_vhca_events *events;
 	int err, i;
@@ -160,7 +158,6 @@ int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 
 	events->dev = dev;
-	dev->priv.vhca_events = events;
 	for (i = 0; i < MLX5_DEV_MAX_WQS; i++) {
 		snprintf(wq_name, MLX5_CMD_WQ_MAX_NAME, "mlx5_vhca_event%d", i);
 		events->handler[i].wq = create_singlethread_workqueue(wq_name);
@@ -169,20 +166,10 @@ int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 			goto err_create_wq;
 		}
 	}
+	dev->priv.vhca_events = events;
 
-	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
-	if (!notifier) {
-		err = -ENOMEM;
-		goto err_notifier;
-	}
-
-	dev->priv.vhca_state_notifier = notifier;
-	notifier->dev = dev;
-	BLOCKING_INIT_NOTIFIER_HEAD(&notifier->n_head);
-	MLX5_NB_INIT(&notifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE);
 	return 0;
 
-err_notifier:
 err_create_wq:
 	for (--i; i >= 0; i--)
 		destroy_workqueue(events->handler[i].wq);
@@ -211,8 +198,6 @@ void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
 	if (!mlx5_vhca_event_supported(dev))
 		return;
 
-	kfree(dev->priv.vhca_state_notifier);
-	dev->priv.vhca_state_notifier = NULL;
 	vhca_events = dev->priv.vhca_events;
 	for (i = 0; i < MLX5_DEV_MAX_WQS; i++)
 		destroy_workqueue(vhca_events->handler[i].wq);
@@ -221,34 +206,30 @@ void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
 
 void mlx5_vhca_event_start(struct mlx5_core_dev *dev)
 {
-	struct mlx5_vhca_state_notifier *notifier;
-
-	if (!dev->priv.vhca_state_notifier)
+	if (!mlx5_vhca_event_supported(dev))
 		return;
 
-	notifier = dev->priv.vhca_state_notifier;
-	mlx5_eq_notifier_register(dev, &notifier->nb);
+	mlx5_eq_notifier_register(dev, &dev->priv.vhca_state_nb);
 }
 
 void mlx5_vhca_event_stop(struct mlx5_core_dev *dev)
 {
-	struct mlx5_vhca_state_notifier *notifier;
-
-	if (!dev->priv.vhca_state_notifier)
+	if (!mlx5_vhca_event_supported(dev))
 		return;
 
-	notifier = dev->priv.vhca_state_notifier;
-	mlx5_eq_notifier_unregister(dev, &notifier->nb);
+	mlx5_eq_notifier_unregister(dev, &dev->priv.vhca_state_nb);
+
+	/* Flush workqueues of all pending events. */
+	mlx5_vhca_event_work_queues_flush(dev);
 }
 
 int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb)
 {
-	if (!dev->priv.vhca_state_notifier)
-		return -EOPNOTSUPP;
-	return blocking_notifier_chain_register(&dev->priv.vhca_state_notifier->n_head, nb);
+	return blocking_notifier_chain_register(&dev->priv.vhca_state_n_head,
+						nb);
 }
 
 void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
 {
-	blocking_notifier_chain_unregister(&dev->priv.vhca_state_notifier->n_head, nb);
+	blocking_notifier_chain_unregister(&dev->priv.vhca_state_n_head, nb);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
index 1725ba64f8af..52790423874c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
@@ -18,6 +18,7 @@ static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev)
 }
 
 void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap);
+void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev);
 int mlx5_vhca_event_init(struct mlx5_core_dev *dev);
 void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev);
 void mlx5_vhca_event_start(struct mlx5_core_dev *dev);
@@ -37,6 +38,10 @@ static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *s
 {
 }
 
+static inline void mlx5_vhca_state_notifier_init(struct mlx5_core_dev *dev)
+{
+}
+
 static inline int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 {
 	return 0;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9a4a5112a59e..88afb2788dc9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -488,7 +488,6 @@ struct mlx5_devcom_dev;
 struct mlx5_fw_reset;
 struct mlx5_eq_table;
 struct mlx5_irq_table;
-struct mlx5_vhca_state_notifier;
 struct mlx5_sf_dev_table;
 struct mlx5_sf_hw_table;
 struct mlx5_sf_table;
@@ -615,7 +614,8 @@ struct mlx5_priv {
 	struct mlx5_bfreg_data		bfregs;
 	struct mlx5_sq_bfreg bfreg;
 #ifdef CONFIG_MLX5_SF
-	struct mlx5_vhca_state_notifier *vhca_state_notifier;
+	struct mlx5_nb vhca_state_nb;
+	struct blocking_notifier_head vhca_state_n_head;
 	struct mlx5_sf_dev_table *sf_dev_table;
 	struct mlx5_core_dev *parent_mdev;
 #endif
-- 
cgit v1.2.3


From e63c9c5f0a4802deea81a48c2c40d0af56153e8a Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Sun, 16 Nov 2025 22:45:38 +0200
Subject: net/mlx5: Move the SF HW table notifier outside the devlink lock

Move the SF HW table notifier registration/unregistration outside of
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have an SF HW table
themselves.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 17 +++---
 .../net/ethernet/mellanox/mlx5/core/sf/hw_table.c  | 62 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h    |  9 +++-
 include/linux/mlx5/driver.h                        |  1 +
 4 files changed, 54 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 097ba7ab90a4..843ee452239f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1377,12 +1377,6 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 
 	mlx5_vhca_event_start(dev);
 
-	err = mlx5_sf_hw_table_create(dev);
-	if (err) {
-		mlx5_core_err(dev, "sf table create failed %d\n", err);
-		goto err_vhca;
-	}
-
 	err = mlx5_ec_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to init embedded CPU\n");
@@ -1411,8 +1405,6 @@ err_sriov:
 	mlx5_lag_remove_mdev(dev);
 	mlx5_ec_cleanup(dev);
 err_ec:
-	mlx5_sf_hw_table_destroy(dev);
-err_vhca:
 	mlx5_vhca_event_stop(dev);
 err_set_hca:
 	mlx5_fs_core_cleanup(dev);
@@ -1837,11 +1829,20 @@ static int mlx5_notifiers_init(struct mlx5_core_dev *dev)
 	BLOCKING_INIT_NOTIFIER_HEAD(&dev->priv.esw_n_head);
 	mlx5_vhca_state_notifier_init(dev);
 
+	err = mlx5_sf_hw_notifier_init(dev);
+	if (err)
+		goto err_sf_hw_notifier;
+
 	return 0;
+
+err_sf_hw_notifier:
+	mlx5_events_cleanup(dev);
+	return err;
 }
 
 static void mlx5_notifiers_cleanup(struct mlx5_core_dev *dev)
 {
+	mlx5_sf_hw_notifier_cleanup(dev);
 	mlx5_events_cleanup(dev);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
index a14b1aa5fb5a..bd968f3b3855 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
@@ -30,9 +30,7 @@ enum mlx5_sf_hwc_index {
 };
 
 struct mlx5_sf_hw_table {
-	struct mlx5_core_dev *dev;
 	struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */
-	struct notifier_block vhca_nb;
 	struct mlx5_sf_hwc_table hwc[MLX5_SF_HWC_MAX];
 };
 
@@ -71,14 +69,16 @@ mlx5_sf_table_fn_to_hwc(struct mlx5_sf_hw_table *table, u16 fn_id)
 	return NULL;
 }
 
-static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 controller,
+static int mlx5_sf_hw_table_id_alloc(struct mlx5_core_dev *dev,
+				     struct mlx5_sf_hw_table *table,
+				     u32 controller,
 				     u32 usr_sfnum)
 {
 	struct mlx5_sf_hwc_table *hwc;
 	int free_idx = -1;
 	int i;
 
-	hwc = mlx5_sf_controller_to_hwc(table->dev, controller);
+	hwc = mlx5_sf_controller_to_hwc(dev, controller);
 	if (!hwc->sfs)
 		return -ENOSPC;
 
@@ -100,11 +100,13 @@ static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 control
 	return free_idx;
 }
 
-static void mlx5_sf_hw_table_id_free(struct mlx5_sf_hw_table *table, u32 controller, int id)
+static void mlx5_sf_hw_table_id_free(struct mlx5_core_dev *dev,
+				     struct mlx5_sf_hw_table *table,
+				     u32 controller, int id)
 {
 	struct mlx5_sf_hwc_table *hwc;
 
-	hwc = mlx5_sf_controller_to_hwc(table->dev, controller);
+	hwc = mlx5_sf_controller_to_hwc(dev, controller);
 	hwc->sfs[id].allocated = false;
 	hwc->sfs[id].pending_delete = false;
 }
@@ -120,7 +122,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr
 		return -EOPNOTSUPP;
 
 	mutex_lock(&table->table_lock);
-	sw_id = mlx5_sf_hw_table_id_alloc(table, controller, usr_sfnum);
+	sw_id = mlx5_sf_hw_table_id_alloc(dev, table, controller, usr_sfnum);
 	if (sw_id < 0) {
 		err = sw_id;
 		goto exist_err;
@@ -151,7 +153,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr
 vhca_err:
 	mlx5_cmd_dealloc_sf(dev, hw_fn_id);
 err:
-	mlx5_sf_hw_table_id_free(table, controller, sw_id);
+	mlx5_sf_hw_table_id_free(dev, table, controller, sw_id);
 exist_err:
 	mutex_unlock(&table->table_lock);
 	return err;
@@ -165,7 +167,7 @@ void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id)
 	mutex_lock(&table->table_lock);
 	hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id);
 	mlx5_cmd_dealloc_sf(dev, hw_fn_id);
-	mlx5_sf_hw_table_id_free(table, controller, id);
+	mlx5_sf_hw_table_id_free(dev, table, controller, id);
 	mutex_unlock(&table->table_lock);
 }
 
@@ -216,10 +218,12 @@ static void mlx5_sf_hw_table_hwc_dealloc_all(struct mlx5_core_dev *dev,
 	}
 }
 
-static void mlx5_sf_hw_table_dealloc_all(struct mlx5_sf_hw_table *table)
+static void mlx5_sf_hw_table_dealloc_all(struct mlx5_core_dev *dev,
+					 struct mlx5_sf_hw_table *table)
 {
-	mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_EXTERNAL]);
-	mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_LOCAL]);
+	mlx5_sf_hw_table_hwc_dealloc_all(dev,
+					 &table->hwc[MLX5_SF_HWC_EXTERNAL]);
+	mlx5_sf_hw_table_hwc_dealloc_all(dev, &table->hwc[MLX5_SF_HWC_LOCAL]);
 }
 
 static int mlx5_sf_hw_table_hwc_init(struct mlx5_sf_hwc_table *hwc, u16 max_fn, u16 base_id)
@@ -301,7 +305,6 @@ int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev)
 	}
 
 	mutex_init(&table->table_lock);
-	table->dev = dev;
 	dev->priv.sf_hw_table = table;
 
 	base_id = mlx5_sf_start_function_id(dev);
@@ -338,19 +341,22 @@ void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev)
 	mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]);
 	mutex_destroy(&table->table_lock);
 	kfree(table);
+	dev->priv.sf_hw_table = NULL;
 res_unregister:
 	mlx5_sf_hw_table_res_unregister(dev);
 }
 
 static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data)
 {
-	struct mlx5_sf_hw_table *table = container_of(nb, struct mlx5_sf_hw_table, vhca_nb);
+	struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev,
+						 priv.sf_hw_table_vhca_nb);
+	struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table;
 	const struct mlx5_vhca_state_event *event = data;
 	struct mlx5_sf_hwc_table *hwc;
 	struct mlx5_sf_hw *sf_hw;
 	u16 sw_id;
 
-	if (event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED)
+	if (!table || event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED)
 		return 0;
 
 	hwc = mlx5_sf_table_fn_to_hwc(table, event->function_id);
@@ -365,20 +371,28 @@ static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode
 	 * Hence recycle the sf hardware id for reuse.
 	 */
 	if (sf_hw->allocated && sf_hw->pending_delete)
-		mlx5_sf_hw_table_hwc_sf_free(table->dev, hwc, sw_id);
+		mlx5_sf_hw_table_hwc_sf_free(dev, hwc, sw_id);
 	mutex_unlock(&table->table_lock);
 	return 0;
 }
 
-int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev)
+int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev)
 {
-	struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table;
-
-	if (!table)
+	if (mlx5_core_is_sf(dev))
 		return 0;
 
-	table->vhca_nb.notifier_call = mlx5_sf_hw_vhca_event;
-	return mlx5_vhca_event_notifier_register(dev, &table->vhca_nb);
+	dev->priv.sf_hw_table_vhca_nb.notifier_call = mlx5_sf_hw_vhca_event;
+	return mlx5_vhca_event_notifier_register(dev,
+						 &dev->priv.sf_hw_table_vhca_nb);
+}
+
+void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev)
+{
+	if (mlx5_core_is_sf(dev))
+		return;
+
+	mlx5_vhca_event_notifier_unregister(dev,
+					    &dev->priv.sf_hw_table_vhca_nb);
 }
 
 void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev)
@@ -388,10 +402,8 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev)
 	if (!table)
 		return;
 
-	mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb);
-
 	/* Dealloc SFs whose firmware event has been missed. */
-	mlx5_sf_hw_table_dealloc_all(table);
+	mlx5_sf_hw_table_dealloc_all(dev, table);
 }
 
 bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 89559a37997a..3922dacffae8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -12,7 +12,8 @@
 int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev);
 void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev);
 
-int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev);
+int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev);
+void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev);
 void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev);
 
 int mlx5_sf_table_init(struct mlx5_core_dev *dev);
@@ -44,11 +45,15 @@ static inline void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev)
 {
 }
 
-static inline int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev)
+static inline int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev)
 {
 	return 0;
 }
 
+static inline void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
 static inline void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev)
 {
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 88afb2788dc9..d6c5bcebdaca 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -620,6 +620,7 @@ struct mlx5_priv {
 	struct mlx5_core_dev *parent_mdev;
 #endif
 #ifdef CONFIG_MLX5_SF_MANAGER
+	struct notifier_block sf_hw_table_vhca_nb;
 	struct mlx5_sf_hw_table *sf_hw_table;
 	struct mlx5_sf_table *sf_table;
 #endif
-- 
cgit v1.2.3


From d4a0acbd94c2a93bf308a9fde9ab6719f5d98c7a Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Sun, 16 Nov 2025 22:45:39 +0200
Subject: net/mlx5: Move the SF table notifiers outside the devlink lock

Move the SF table notifiers registration/unregistration outside of
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have an SF table
themselves and thus don't need notifiers.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  7 ++
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   | 90 ++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h    | 11 +++
 include/linux/mlx5/driver.h                        |  3 +
 4 files changed, 78 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 843ee452239f..0c3613ef39b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1833,8 +1833,14 @@ static int mlx5_notifiers_init(struct mlx5_core_dev *dev)
 	if (err)
 		goto err_sf_hw_notifier;
 
+	err = mlx5_sf_notifiers_init(dev);
+	if (err)
+		goto err_sf_notifiers;
+
 	return 0;
 
+err_sf_notifiers:
+	mlx5_sf_hw_notifier_cleanup(dev);
 err_sf_hw_notifier:
 	mlx5_events_cleanup(dev);
 	return err;
@@ -1842,6 +1848,7 @@ err_sf_hw_notifier:
 
 static void mlx5_notifiers_cleanup(struct mlx5_core_dev *dev)
 {
+	mlx5_sf_notifiers_cleanup(dev);
 	mlx5_sf_hw_notifier_cleanup(dev);
 	mlx5_events_cleanup(dev);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 2ece4983d33f..b82323b8449e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -31,9 +31,6 @@ struct mlx5_sf_table {
 	struct mlx5_core_dev *dev; /* To refer from notifier context. */
 	struct xarray function_ids; /* function id based lookup. */
 	struct mutex sf_state_lock; /* Serializes sf state among user cmds & vhca event handler. */
-	struct notifier_block esw_nb;
-	struct notifier_block vhca_nb;
-	struct notifier_block mdev_nb;
 };
 
 static struct mlx5_sf *
@@ -391,11 +388,16 @@ static bool mlx5_sf_state_update_check(const struct mlx5_sf *sf, u8 new_state)
 
 static int mlx5_sf_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data)
 {
-	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, vhca_nb);
+	struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev,
+						 priv.sf_table_vhca_nb);
+	struct mlx5_sf_table *table = dev->priv.sf_table;
 	const struct mlx5_vhca_state_event *event = data;
 	bool update = false;
 	struct mlx5_sf *sf;
 
+	if (!table)
+		return 0;
+
 	mutex_lock(&table->sf_state_lock);
 	sf = mlx5_sf_lookup_by_function_id(table, event->function_id);
 	if (!sf)
@@ -407,7 +409,7 @@ static int mlx5_sf_vhca_event(struct notifier_block *nb, unsigned long opcode, v
 	update = mlx5_sf_state_update_check(sf, event->new_vhca_state);
 	if (update)
 		sf->hw_state = event->new_vhca_state;
-	trace_mlx5_sf_update_state(table->dev, sf->port_index, sf->controller,
+	trace_mlx5_sf_update_state(dev, sf->port_index, sf->controller,
 				   sf->hw_fn_id, sf->hw_state);
 unlock:
 	mutex_unlock(&table->sf_state_lock);
@@ -425,12 +427,16 @@ static void mlx5_sf_del_all(struct mlx5_sf_table *table)
 
 static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, void *data)
 {
-	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, esw_nb);
+	struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev,
+						 priv.sf_table_esw_nb);
 	const struct mlx5_esw_event_info *mode = data;
 
+	if (!dev->priv.sf_table)
+		return 0;
+
 	switch (mode->new_mode) {
 	case MLX5_ESWITCH_LEGACY:
-		mlx5_sf_del_all(table);
+		mlx5_sf_del_all(dev->priv.sf_table);
 		break;
 	default:
 		break;
@@ -441,15 +447,16 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi
 
 static int mlx5_sf_mdev_event(struct notifier_block *nb, unsigned long event, void *data)
 {
-	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, mdev_nb);
+	struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev,
+						 priv.sf_table_mdev_nb);
 	struct mlx5_sf_peer_devlink_event_ctx *event_ctx = data;
+	struct mlx5_sf_table *table = dev->priv.sf_table;
 	int ret = NOTIFY_DONE;
 	struct mlx5_sf *sf;
 
-	if (event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK)
+	if (!table || event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK)
 		return NOTIFY_DONE;
 
-
 	mutex_lock(&table->sf_state_lock);
 	sf = mlx5_sf_lookup_by_function_id(table, event_ctx->fn_id);
 	if (!sf)
@@ -464,10 +471,40 @@ out:
 	return ret;
 }
 
+int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	if (mlx5_core_is_sf(dev))
+		return 0;
+
+	dev->priv.sf_table_esw_nb.notifier_call = mlx5_sf_esw_event;
+	err = mlx5_esw_event_notifier_register(dev, &dev->priv.sf_table_esw_nb);
+	if (err)
+		return err;
+
+	dev->priv.sf_table_vhca_nb.notifier_call = mlx5_sf_vhca_event;
+	err = mlx5_vhca_event_notifier_register(dev,
+						&dev->priv.sf_table_vhca_nb);
+	if (err)
+		goto vhca_err;
+
+	dev->priv.sf_table_mdev_nb.notifier_call = mlx5_sf_mdev_event;
+	err = mlx5_blocking_notifier_register(dev, &dev->priv.sf_table_mdev_nb);
+	if (err)
+		goto mdev_err;
+
+	return 0;
+mdev_err:
+	mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_table_vhca_nb);
+vhca_err:
+	mlx5_esw_event_notifier_unregister(dev, &dev->priv.sf_table_esw_nb);
+	return err;
+}
+
 int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_sf_table *table;
-	int err;
 
 	if (!mlx5_sf_table_supported(dev) || !mlx5_vhca_event_supported(dev))
 		return 0;
@@ -480,28 +517,18 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 	table->dev = dev;
 	xa_init(&table->function_ids);
 	dev->priv.sf_table = table;
-	table->esw_nb.notifier_call = mlx5_sf_esw_event;
-	err = mlx5_esw_event_notifier_register(dev, &table->esw_nb);
-	if (err)
-		goto reg_err;
-
-	table->vhca_nb.notifier_call = mlx5_sf_vhca_event;
-	err = mlx5_vhca_event_notifier_register(table->dev, &table->vhca_nb);
-	if (err)
-		goto vhca_err;
-
-	table->mdev_nb.notifier_call = mlx5_sf_mdev_event;
-	mlx5_blocking_notifier_register(dev, &table->mdev_nb);
 
 	return 0;
+}
 
-vhca_err:
-	mlx5_esw_event_notifier_unregister(dev, &table->esw_nb);
-reg_err:
-	mutex_destroy(&table->sf_state_lock);
-	kfree(table);
-	dev->priv.sf_table = NULL;
-	return err;
+void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev)
+{
+	if (mlx5_core_is_sf(dev))
+		return;
+
+	mlx5_blocking_notifier_unregister(dev, &dev->priv.sf_table_mdev_nb);
+	mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_table_vhca_nb);
+	mlx5_esw_event_notifier_unregister(dev, &dev->priv.sf_table_esw_nb);
 }
 
 void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
@@ -511,9 +538,6 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 	if (!table)
 		return;
 
-	mlx5_blocking_notifier_unregister(dev, &table->mdev_nb);
-	mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb);
-	mlx5_esw_event_notifier_unregister(dev, &table->esw_nb);
 	mutex_destroy(&table->sf_state_lock);
 	WARN_ON(!xa_empty(&table->function_ids));
 	kfree(table);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 3922dacffae8..d8a934a0e968 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -16,7 +16,9 @@ int mlx5_sf_hw_notifier_init(struct mlx5_core_dev *dev);
 void mlx5_sf_hw_notifier_cleanup(struct mlx5_core_dev *dev);
 void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev);
 
+int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev);
 int mlx5_sf_table_init(struct mlx5_core_dev *dev);
+void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev);
 void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev);
 bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev);
 
@@ -58,11 +60,20 @@ static inline void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev)
 {
 }
 
+static inline int mlx5_sf_notifiers_init(struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
 static inline int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 {
 	return 0;
 }
 
+static inline void mlx5_sf_notifiers_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
 static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 {
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d6c5bcebdaca..6af62047a614 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -622,6 +622,9 @@ struct mlx5_priv {
 #ifdef CONFIG_MLX5_SF_MANAGER
 	struct notifier_block sf_hw_table_vhca_nb;
 	struct mlx5_sf_hw_table *sf_hw_table;
+	struct notifier_block sf_table_esw_nb;
+	struct notifier_block sf_table_vhca_nb;
+	struct notifier_block sf_table_mdev_nb;
 	struct mlx5_sf_table *sf_table;
 #endif
 	struct blocking_notifier_head lag_nh;
-- 
cgit v1.2.3


From 64ad6470c882fcaecfa4a1da96ea94de7ca0dc80 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Sun, 16 Nov 2025 22:45:40 +0200
Subject: net/mlx5: Move SF dev table notifier registration outside the PF
 devlink lock

This completes the previous patches by moving notifier registration for
SF dev tables outside the devlink locked critical section in
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have a SF HW table
themselves.

After this patch, notifiers can grab the PF devlink lock (soon to be
necessary) without creating a locking cycle.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  7 ++++
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c   | 47 ++++++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h   | 11 +++++
 include/linux/mlx5/driver.h                        |  1 +
 4 files changed, 49 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0c3613ef39b1..024339ce41f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1837,8 +1837,14 @@ static int mlx5_notifiers_init(struct mlx5_core_dev *dev)
 	if (err)
 		goto err_sf_notifiers;
 
+	err = mlx5_sf_dev_notifier_init(dev);
+	if (err)
+		goto err_sf_dev_notifier;
+
 	return 0;
 
+err_sf_dev_notifier:
+	mlx5_sf_notifiers_cleanup(dev);
 err_sf_notifiers:
 	mlx5_sf_hw_notifier_cleanup(dev);
 err_sf_hw_notifier:
@@ -1848,6 +1854,7 @@ err_sf_hw_notifier:
 
 static void mlx5_notifiers_cleanup(struct mlx5_core_dev *dev)
 {
+	mlx5_sf_dev_notifier_cleanup(dev);
 	mlx5_sf_notifiers_cleanup(dev);
 	mlx5_sf_hw_notifier_cleanup(dev);
 	mlx5_events_cleanup(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
index a68a8ee24dce..f310bde3d11f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
@@ -16,7 +16,6 @@ struct mlx5_sf_dev_table {
 	struct xarray devices;
 	phys_addr_t base_address;
 	u64 sf_bar_length;
-	struct notifier_block nb;
 	struct workqueue_struct *active_wq;
 	struct work_struct work;
 	u8 stop_active_wq:1;
@@ -156,18 +155,23 @@ static void mlx5_sf_dev_del(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_de
 static int
 mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_code, void *data)
 {
-	struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb);
+	struct mlx5_core_dev *dev = container_of(nb, struct mlx5_core_dev,
+						 priv.sf_dev_nb);
+	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
 	const struct mlx5_vhca_state_event *event = data;
 	struct mlx5_sf_dev *sf_dev;
 	u16 max_functions;
 	u16 sf_index;
 	u16 base_id;
 
-	max_functions = mlx5_sf_max_functions(table->dev);
+	if (!table)
+		return 0;
+
+	max_functions = mlx5_sf_max_functions(dev);
 	if (!max_functions)
 		return 0;
 
-	base_id = mlx5_sf_start_function_id(table->dev);
+	base_id = mlx5_sf_start_function_id(dev);
 	if (event->function_id < base_id || event->function_id >= (base_id + max_functions))
 		return 0;
 
@@ -177,19 +181,19 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_
 	case MLX5_VHCA_STATE_INVALID:
 	case MLX5_VHCA_STATE_ALLOCATED:
 		if (sf_dev)
-			mlx5_sf_dev_del(table->dev, sf_dev, sf_index);
+			mlx5_sf_dev_del(dev, sf_dev, sf_index);
 		break;
 	case MLX5_VHCA_STATE_TEARDOWN_REQUEST:
 		if (sf_dev)
-			mlx5_sf_dev_del(table->dev, sf_dev, sf_index);
+			mlx5_sf_dev_del(dev, sf_dev, sf_index);
 		else
-			mlx5_core_err(table->dev,
+			mlx5_core_err(dev,
 				      "SF DEV: teardown state for invalid dev index=%d sfnum=0x%x\n",
 				      sf_index, event->sw_function_id);
 		break;
 	case MLX5_VHCA_STATE_ACTIVE:
 		if (!sf_dev)
-			mlx5_sf_dev_add(table->dev, sf_index, event->function_id,
+			mlx5_sf_dev_add(dev, sf_index, event->function_id,
 					event->sw_function_id);
 		break;
 	default:
@@ -315,6 +319,15 @@ static void mlx5_sf_dev_destroy_active_works(struct mlx5_sf_dev_table *table)
 	}
 }
 
+int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev)
+{
+	if (mlx5_core_is_sf(dev))
+		return 0;
+
+	dev->priv.sf_dev_nb.notifier_call = mlx5_sf_dev_state_change_handler;
+	return mlx5_vhca_event_notifier_register(dev, &dev->priv.sf_dev_nb);
+}
+
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_sf_dev_table *table;
@@ -329,17 +342,12 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 		goto table_err;
 	}
 
-	table->nb.notifier_call = mlx5_sf_dev_state_change_handler;
 	table->dev = dev;
 	table->sf_bar_length = 1 << (MLX5_CAP_GEN(dev, log_min_sf_size) + 12);
 	table->base_address = pci_resource_start(dev->pdev, 2);
 	xa_init(&table->devices);
 	dev->priv.sf_dev_table = table;
 
-	err = mlx5_vhca_event_notifier_register(dev, &table->nb);
-	if (err)
-		goto vhca_err;
-
 	err = mlx5_sf_dev_create_active_works(table);
 	if (err)
 		goto add_active_err;
@@ -351,10 +359,8 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 
 arm_err:
 	mlx5_sf_dev_destroy_active_works(table);
-add_active_err:
-	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
 	mlx5_vhca_event_work_queues_flush(dev);
-vhca_err:
+add_active_err:
 	kfree(table);
 	dev->priv.sf_dev_table = NULL;
 table_err:
@@ -372,6 +378,14 @@ static void mlx5_sf_dev_destroy_all(struct mlx5_sf_dev_table *table)
 	}
 }
 
+void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev)
+{
+	if (mlx5_core_is_sf(dev))
+		return;
+
+	mlx5_vhca_event_notifier_unregister(dev, &dev->priv.sf_dev_nb);
+}
+
 void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 {
 	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
@@ -380,7 +394,6 @@ void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 		return;
 
 	mlx5_sf_dev_destroy_active_works(table);
-	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
 
 	/* Now that event handler is not running, it is safe to destroy
 	 * the sf device without race.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
index b99131e95e37..3ab0449c770c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
@@ -25,7 +25,9 @@ struct mlx5_sf_peer_devlink_event_ctx {
 	int err;
 };
 
+int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev);
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev);
+void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev);
 void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev);
 
 int mlx5_sf_driver_register(void);
@@ -35,10 +37,19 @@ bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev);
 
 #else
 
+static inline int mlx5_sf_dev_notifier_init(struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
 static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
 {
 }
 
+static inline void mlx5_sf_dev_notifier_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
 static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 {
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6af62047a614..1c54aa6f74fb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -616,6 +616,7 @@ struct mlx5_priv {
 #ifdef CONFIG_MLX5_SF
 	struct mlx5_nb vhca_state_nb;
 	struct blocking_notifier_head vhca_state_n_head;
+	struct notifier_block sf_dev_nb;
 	struct mlx5_sf_dev_table *sf_dev_table;
 	struct mlx5_core_dev *parent_mdev;
 #endif
-- 
cgit v1.2.3


From a77f0ad44fde89874654ba48f461209fb0382107 Mon Sep 17 00:00:00 2001
From: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Date: Wed, 12 Nov 2025 11:10:23 +0200
Subject: wifi: cfg80211: Add support for 6GHz AP role not relevant AP type

Add IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT
and map it to IEEE80211_REG_LPI_AP for safe regulatory compliance
when AP role classification is not applicable.
Use LPI as safe fallback to prevent power limit violations.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251112110828.856283677cc7.I36138a34847c3b4e680974bf347dde844448f3bc@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211-he.h | 1 +
 include/net/cfg80211.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211-he.h b/include/linux/ieee80211-he.h
index 904d50db5bb8..a08c446fbb04 100644
--- a/include/linux/ieee80211-he.h
+++ b/include/linux/ieee80211-he.h
@@ -548,6 +548,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
 #define IEEE80211_6GHZ_CTRL_REG_VLP_AP			2
 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP		3
 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD	4
+#define IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT	7
 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP		8
 
 /**
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 625cb2c78361..3d3ed1932262 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -10148,6 +10148,7 @@ cfg80211_6ghz_power_type(u8 control, u32 client_flags)
 	switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
 	case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
+	case IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT:
 		return IEEE80211_REG_LPI_AP;
 	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
-- 
cgit v1.2.3


From ee19b52c31b3b111f140c1affd88eca1ed11edd0 Mon Sep 17 00:00:00 2001
From: André Draszik <andre.draszik@linaro.org>
Date: Fri, 14 Nov 2025 14:10:59 +0000
Subject: mfd: sec: Use chained IRQs for s2mpg10
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On S2MPG10 (and similar PMICs like S2MPG11), top-level interrupt status and
mask registers exist which need to be unmasked to get the PMIC
interrupts. This additional status doesn't seem to exist on other PMICs
in the S2MP* family, and the S2MPG10 driver is manually dealing with
masking and unmasking currently.

The correct approach here is to register this hierarchy as chained
interrupts, though, without any additional manual steps. Doing so will
also simplify addition of other, similar, PMICs (like S2MPG11) in the
future.

Update the driver to do just that.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://patch.msgid.link/20251114-s2mpg10-chained-irq-v1-1-34ddfa49c4cd@linaro.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/sec-acpm.c          | 23 +------------
 drivers/mfd/sec-irq.c           | 73 +++++++++++++++++++++++++++++++++++++++--
 include/linux/mfd/samsung/irq.h |  6 ++++
 3 files changed, 77 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/sec-acpm.c b/drivers/mfd/sec-acpm.c
index 8b31c816d65b..36622069a788 100644
--- a/drivers/mfd/sec-acpm.c
+++ b/drivers/mfd/sec-acpm.c
@@ -325,11 +325,6 @@ static struct regmap *sec_pmic_acpm_regmap_init(struct device *dev,
 	return regmap;
 }
 
-static void sec_pmic_acpm_mask_common_irqs(void *regmap_common)
-{
-	regmap_write(regmap_common, S2MPG10_COMMON_INT_MASK, S2MPG10_COMMON_INT_SRC);
-}
-
 static int sec_pmic_acpm_probe(struct platform_device *pdev)
 {
 	struct regmap *regmap_common, *regmap_pmic, *regmap;
@@ -360,15 +355,10 @@ static int sec_pmic_acpm_probe(struct platform_device *pdev)
 	shared_ctx->speedy_channel = pdata->speedy_channel;
 
 	regmap_common = sec_pmic_acpm_regmap_init(dev, shared_ctx, SEC_PMIC_ACPM_ACCESSTYPE_COMMON,
-						  pdata->regmap_cfg_common, false);
+						  pdata->regmap_cfg_common, true);
 	if (IS_ERR(regmap_common))
 		return PTR_ERR(regmap_common);
 
-	/* Mask all interrupts from 'common' block, until successful init */
-	ret = regmap_write(regmap_common, S2MPG10_COMMON_INT_MASK, S2MPG10_COMMON_INT_SRC);
-	if (ret)
-		return dev_err_probe(dev, ret, "failed to mask common block interrupts\n");
-
 	regmap_pmic = sec_pmic_acpm_regmap_init(dev, shared_ctx, SEC_PMIC_ACPM_ACCESSTYPE_PMIC,
 						pdata->regmap_cfg_pmic, false);
 	if (IS_ERR(regmap_pmic))
@@ -391,17 +381,6 @@ static int sec_pmic_acpm_probe(struct platform_device *pdev)
 	if (device_property_read_bool(dev, "wakeup-source"))
 		devm_device_init_wakeup(dev);
 
-	/* Unmask PMIC interrupt from 'common' block, now that everything is in place. */
-	ret = regmap_clear_bits(regmap_common, S2MPG10_COMMON_INT_MASK,
-				S2MPG10_COMMON_INT_SRC_PMIC);
-	if (ret)
-		return dev_err_probe(dev, ret, "failed to unmask PMIC interrupt\n");
-
-	/* Mask all interrupts from 'common' block on shutdown */
-	ret = devm_add_action_or_reset(dev, sec_pmic_acpm_mask_common_irqs, regmap_common);
-	if (ret)
-		return ret;
-
 	return 0;
 }
 
diff --git a/drivers/mfd/sec-irq.c b/drivers/mfd/sec-irq.c
index c5c80b1ba104..d992e41e716d 100644
--- a/drivers/mfd/sec-irq.c
+++ b/drivers/mfd/sec-irq.c
@@ -20,6 +20,12 @@
 #include "sec-core.h"
 
 static const struct regmap_irq s2mpg10_irqs[] = {
+	REGMAP_IRQ_REG(S2MPG10_COMMON_IRQ_PMIC, 0, S2MPG10_COMMON_INT_SRC_PMIC),
+	/* No documentation or other reference for remaining bits */
+	REGMAP_IRQ_REG(S2MPG10_COMMON_IRQ_UNUSED, 0, GENMASK(7, 1)),
+};
+
+static const struct regmap_irq s2mpg10_pmic_irqs[] = {
 	REGMAP_IRQ_REG(S2MPG10_IRQ_PWRONF, 0, S2MPG10_IRQ_PWRONF_MASK),
 	REGMAP_IRQ_REG(S2MPG10_IRQ_PWRONR, 0, S2MPG10_IRQ_PWRONR_MASK),
 	REGMAP_IRQ_REG(S2MPG10_IRQ_JIGONBF, 0, S2MPG10_IRQ_JIGONBF_MASK),
@@ -183,11 +189,20 @@ static const struct regmap_irq s5m8767_irqs[] = {
 /* All S2MPG10 interrupt sources are read-only and don't require clearing */
 static const struct regmap_irq_chip s2mpg10_irq_chip = {
 	.name = "s2mpg10",
+	.status_base = S2MPG10_COMMON_INT,
+	.mask_base = S2MPG10_COMMON_INT_MASK,
+	.num_regs = 1,
 	.irqs = s2mpg10_irqs,
 	.num_irqs = ARRAY_SIZE(s2mpg10_irqs),
-	.num_regs = 6,
+};
+
+static const struct regmap_irq_chip s2mpg10_irq_chip_pmic = {
+	.name = "s2mpg10-pmic",
 	.status_base = S2MPG10_PMIC_INT1,
 	.mask_base = S2MPG10_PMIC_INT1M,
+	.num_regs = 6,
+	.irqs = s2mpg10_pmic_irqs,
+	.num_irqs = ARRAY_SIZE(s2mpg10_pmic_irqs),
 };
 
 static const struct regmap_irq_chip s2mps11_irq_chip = {
@@ -253,6 +268,59 @@ static const struct regmap_irq_chip s5m8767_irq_chip = {
 	.ack_base = S5M8767_REG_INT1,
 };
 
+static int s2mpg1x_add_chained_irq_chip(struct device *dev, struct regmap *regmap, int pirq,
+					struct regmap_irq_chip_data *parent,
+					const struct regmap_irq_chip *chip,
+					struct regmap_irq_chip_data **data)
+{
+	int irq, ret;
+
+	irq = regmap_irq_get_virq(parent, pirq);
+	if (irq < 0)
+		return dev_err_probe(dev, irq, "Failed to get parent vIRQ(%d) for chip %s\n", pirq,
+				     chip->name);
+
+	ret = devm_regmap_add_irq_chip(dev, regmap, irq, IRQF_ONESHOT | IRQF_SHARED, 0, chip, data);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to add %s IRQ chip\n", chip->name);
+
+	return 0;
+}
+
+static int sec_irq_init_s2mpg1x(struct sec_pmic_dev *sec_pmic)
+{
+	const struct regmap_irq_chip *irq_chip, *chained_irq_chip;
+	struct regmap_irq_chip_data *irq_data;
+	struct regmap *regmap_common;
+	int chained_pirq;
+	int ret;
+
+	switch (sec_pmic->device_type) {
+	case S2MPG10:
+		irq_chip = &s2mpg10_irq_chip;
+		chained_irq_chip = &s2mpg10_irq_chip_pmic;
+		chained_pirq = S2MPG10_COMMON_IRQ_PMIC;
+		break;
+	default:
+		return dev_err_probe(sec_pmic->dev, -EINVAL, "Unsupported device type %d\n",
+				     sec_pmic->device_type);
+	};
+
+	regmap_common = dev_get_regmap(sec_pmic->dev, "common");
+	if (!regmap_common)
+		return dev_err_probe(sec_pmic->dev, -EINVAL, "No 'common' regmap %d\n",
+				     sec_pmic->device_type);
+
+	ret = devm_regmap_add_irq_chip(sec_pmic->dev, regmap_common, sec_pmic->irq, IRQF_ONESHOT, 0,
+				       irq_chip, &irq_data);
+	if (ret)
+		return dev_err_probe(sec_pmic->dev, ret, "Failed to add %s IRQ chip\n",
+				     irq_chip->name);
+
+	return s2mpg1x_add_chained_irq_chip(sec_pmic->dev, sec_pmic->regmap_pmic, chained_pirq,
+					    irq_data, chained_irq_chip, &sec_pmic->irq_data);
+}
+
 int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 {
 	const struct regmap_irq_chip *sec_irq_chip;
@@ -268,8 +336,7 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 		sec_irq_chip = &s2mps14_irq_chip;
 		break;
 	case S2MPG10:
-		sec_irq_chip = &s2mpg10_irq_chip;
-		break;
+		return sec_irq_init_s2mpg1x(sec_pmic);
 	case S2MPS11X:
 		sec_irq_chip = &s2mps11_irq_chip;
 		break;
diff --git a/include/linux/mfd/samsung/irq.h b/include/linux/mfd/samsung/irq.h
index b4805cbd949b..8402a5f8e18a 100644
--- a/include/linux/mfd/samsung/irq.h
+++ b/include/linux/mfd/samsung/irq.h
@@ -57,6 +57,12 @@ enum s2mpa01_irq {
 #define S2MPA01_IRQ_B24_TSD_MASK	(1 << 4)
 #define S2MPA01_IRQ_B35_TSD_MASK	(1 << 5)
 
+enum s2mpg10_common_irq {
+	/* Top-level (common) block */
+	S2MPG10_COMMON_IRQ_PMIC,
+	S2MPG10_COMMON_IRQ_UNUSED,
+};
+
 enum s2mpg10_irq {
 	/* PMIC */
 	S2MPG10_IRQ_PWRONF,
-- 
cgit v1.2.3


From 4255545a28f75fb6082b6f91d1e7ada28383ab22 Mon Sep 17 00:00:00 2001
From: Chien Wong <m@xv97.com>
Date: Thu, 13 Nov 2025 22:05:08 +0800
Subject: wifi: mac80211: add generic MMIE struct defines

The added struct is needed when writing a generic handler for both CMAC-128
and CMAC-256.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-3-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 6d4bc80caf96..d55d8ea3a8be 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1207,7 +1207,7 @@ struct ieee80211_mgmt {
 #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u)
 
 
-/* Management MIC information element (IEEE 802.11w) */
+/* Management MIC information element (IEEE 802.11w) for CMAC */
 struct ieee80211_mmie {
 	u8 element_id;
 	u8 length;
@@ -1225,6 +1225,15 @@ struct ieee80211_mmie_16 {
 	u8 mic[16];
 } __packed;
 
+/* Management MIC information element (IEEE 802.11w) for all variants */
+struct ieee80211_mmie_var {
+	u8 element_id;
+	u8 length;
+	__le16 key_id;
+	u8 sequence_number[6];
+	u8 mic[]; /* 8 or 16 bytes */
+} __packed;
+
 struct ieee80211_vendor_ie {
 	u8 element_id;
 	u8 len;
@@ -1889,6 +1898,9 @@ enum ieee80211_radio_measurement_actioncode {
 #define IEEE80211_GCMP_HDR_LEN		8
 #define IEEE80211_GCMP_MIC_LEN		16
 #define IEEE80211_GCMP_PN_LEN		6
+#define IEEE80211_CMAC_128_MIC_LEN	8
+#define IEEE80211_CMAC_256_MIC_LEN	16
+#define IEEE80211_GMAC_MIC_LEN		16
 
 #define FILS_NONCE_LEN			16
 #define FILS_MAX_KEK_LEN		64
-- 
cgit v1.2.3


From 77d7dc8bef482e987036bc204136bbda552d95cd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:26:45 +0100
Subject: sched/mmcid: Revert the complex CID management

The CID management is a complex beast, which affects both scheduling and
task migration. The compaction mechanism forces random tasks of a process
into task work on exit to user space causing latency spikes.

Revert to the initial simple bitmap allocation mechanics, which are known
to have scalability issues, as that allows a replacement functionality to
be built up gradually in a reviewable way.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.068197830@linutronix.de
---
 include/linux/mm_types.h |  53 +----
 kernel/fork.c            |   5 +-
 kernel/sched/core.c      | 517 ++---------------------------------------------
 kernel/sched/sched.h     | 289 ++++----------------------
 4 files changed, 64 insertions(+), 800 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..63b8c1209e7b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -922,13 +922,9 @@ struct vm_area_struct {
 #define vma_policy(vma) NULL
 #endif
 
-#ifdef CONFIG_SCHED_MM_CID
 struct mm_cid {
-	u64 time;
-	int cid;
-	int recent_cid;
+	unsigned int cid;
 };
-#endif
 
 /*
  * Opaque type representing current mm_struct flag state. Must be accessed via
@@ -1000,12 +996,6 @@ struct mm_struct {
 		 * runqueue locks.
 		 */
 		struct mm_cid __percpu *pcpu_cid;
-		/*
-		 * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
-		 *
-		 * When the next mm_cid scan is due (in jiffies).
-		 */
-		unsigned long mm_cid_next_scan;
 		/**
 		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
 		 *
@@ -1013,14 +1003,6 @@ struct mm_struct {
 		 * threads allowed CPUs.
 		 */
 		unsigned int nr_cpus_allowed;
-		/**
-		 * @max_nr_cid: Maximum number of allowed concurrency
-		 *              IDs allocated.
-		 *
-		 * Track the highest number of allowed concurrency IDs
-		 * allocated for the mm.
-		 */
-		atomic_t max_nr_cid;
 		/**
 		 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
 		 *
@@ -1371,35 +1353,7 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
 
 #ifdef CONFIG_SCHED_MM_CID
 
-enum mm_cid_state {
-	MM_CID_UNSET = -1U,		/* Unset state has lazy_put flag set. */
-	MM_CID_LAZY_PUT = (1U << 31),
-};
-
-static inline bool mm_cid_is_unset(int cid)
-{
-	return cid == MM_CID_UNSET;
-}
-
-static inline bool mm_cid_is_lazy_put(int cid)
-{
-	return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
-}
-
-static inline bool mm_cid_is_valid(int cid)
-{
-	return !(cid & MM_CID_LAZY_PUT);
-}
-
-static inline int mm_cid_set_lazy_put(int cid)
-{
-	return cid | MM_CID_LAZY_PUT;
-}
-
-static inline int mm_cid_clear_lazy_put(int cid)
-{
-	return cid & ~MM_CID_LAZY_PUT;
-}
+#define	MM_CID_UNSET	(~0U)
 
 /*
  * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
@@ -1432,11 +1386,8 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
 
 		pcpu_cid->cid = MM_CID_UNSET;
-		pcpu_cid->recent_cid = MM_CID_UNSET;
-		pcpu_cid->time = 0;
 	}
 	mm->nr_cpus_allowed = p->nr_cpus_allowed;
-	atomic_set(&mm->max_nr_cid, 0);
 	raw_spin_lock_init(&mm->cpus_allowed_lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..9d9afe453ef1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,10 +955,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #endif
 
 #ifdef CONFIG_SCHED_MM_CID
-	tsk->mm_cid = -1;
-	tsk->last_mm_cid = -1;
+	tsk->mm_cid = MM_CID_UNSET;
+	tsk->last_mm_cid = MM_CID_UNSET;
 	tsk->mm_cid_active = 0;
-	tsk->migrate_from_cpu = -1;
 #endif
 	return tsk;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 579a8e93578f..11a173596e0d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2128,8 +2128,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_on_rq_migrating(p))
 		flags |= ENQUEUE_MIGRATED;
-	if (flags & ENQUEUE_MIGRATED)
-		sched_mm_cid_migrate_to(rq, p);
 
 	enqueue_task(rq, p, flags);
 
@@ -3329,7 +3327,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		sched_mm_cid_migrate_from(p);
 		perf_event_task_migrate(p);
 	}
 
@@ -5280,9 +5277,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 *
 	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
-	 *
-	 * switch_mm_cid() needs to be updated if the barriers provided
-	 * by context_switch() are modified.
 	 */
 	if (!next->mm) {                                // to kernel
 		enter_lazy_tlb(prev->active_mm, next);
@@ -5312,8 +5306,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		}
 	}
 
-	/* switch_mm_cid() requires the memory barriers above. */
-	switch_mm_cid(rq, prev, next);
+	switch_mm_cid(prev, next);
 
 	/*
 	 * Tell rseq that the task was scheduled in. Must be after
@@ -5604,7 +5597,6 @@ void sched_tick(void)
 		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
 	sched_core_tick(rq);
-	task_tick_mm_cid(rq, donor);
 	scx_tick(rq);
 
 	rq_unlock(rq, &rf);
@@ -10376,522 +10368,47 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
 }
 
 #ifdef CONFIG_SCHED_MM_CID
-
 /*
- * @cid_lock: Guarantee forward-progress of cid allocation.
- *
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
-/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
- *
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
-/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
- *
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
- *
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
- *
- *      X = Y = 0
- *
- *      w[X]=1          w[Y]=1
- *      MB              MB
- *      r[Y]=y          r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
- *
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
- *
- * (TSA) Store to rq->curr with transition from (N) to (Y)
- *
- * (TSB) Store to rq->curr with transition from (Y) to (N)
- *
- * On the remote-clear side, there is one transition we care about:
- *
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
- *
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
- *
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
- *
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
- *
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
- *
- * Scenario A) (TSA)+(TMA) (from next task perspective)
- *
- * CPU0                                      CPU1
- *
- * Context switch CS-1                       Remote-clear
- *   - store to rq->curr: (N)->(Y) (TSA)     - cmpxchg to *pcpu_id to LAZY (TMA)
- *                                             (implied barrier after cmpxchg)
- *   - switch_mm_cid()
- *     - memory barrier (see switch_mm_cid()
- *       comment explaining how this barrier
- *       is combined with other scheduler
- *       barriers)
- *     - mm_cid_get (next)
- *       - READ_ONCE(*pcpu_cid)              - rcu_dereference(src_rq->curr)
- *
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
- *
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
- *
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
- *
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
- *
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
  */
-
-void sched_mm_cid_migrate_from(struct task_struct *t)
-{
-	t->migrate_from_cpu = task_cpu(t);
-}
-
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
-					  struct task_struct *t,
-					  struct mm_cid *src_pcpu_cid)
-{
-	struct mm_struct *mm = t->mm;
-	struct task_struct *src_task;
-	int src_cid, last_mm_cid;
-
-	if (!mm)
-		return -1;
-
-	last_mm_cid = t->last_mm_cid;
-	/*
-	 * If the migrated task has no last cid, or if the current
-	 * task on src rq uses the cid, it means the source cid does not need
-	 * to be moved to the destination cpu.
-	 */
-	if (last_mm_cid == -1)
-		return -1;
-	src_cid = READ_ONCE(src_pcpu_cid->cid);
-	if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
-		return -1;
-
-	/*
-	 * If we observe an active task using the mm on this rq, it means we
-	 * are not the last task to be migrated from this cpu for this mm, so
-	 * there is no need to move src_cid to the destination cpu.
-	 */
-	guard(rcu)();
-	src_task = rcu_dereference(src_rq->curr);
-	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-		t->last_mm_cid = -1;
-		return -1;
-	}
-
-	return src_cid;
-}
-
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
-					      struct task_struct *t,
-					      struct mm_cid *src_pcpu_cid,
-					      int src_cid)
-{
-	struct task_struct *src_task;
-	struct mm_struct *mm = t->mm;
-	int lazy_cid;
-
-	if (src_cid == -1)
-		return -1;
-
-	/*
-	 * Attempt to clear the source cpu cid to move it to the destination
-	 * cpu.
-	 */
-	lazy_cid = mm_cid_set_lazy_put(src_cid);
-	if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
-		return -1;
-
-	/*
-	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
-	 * rq->curr->mm matches the scheduler barrier in context_switch()
-	 * between store to rq->curr and load of prev and next task's
-	 * per-mm/cpu cid.
-	 *
-	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
-	 * rq->curr->mm_cid_active matches the barrier in
-	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
-	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
-	 * load of per-mm/cpu cid.
-	 */
-
-	/*
-	 * If we observe an active task using the mm on this rq after setting
-	 * the lazy-put flag, this task will be responsible for transitioning
-	 * from lazy-put flag set to MM_CID_UNSET.
-	 */
-	scoped_guard (rcu) {
-		src_task = rcu_dereference(src_rq->curr);
-		if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-			/*
-			 * We observed an active task for this mm, there is therefore
-			 * no point in moving this cid to the destination cpu.
-			 */
-			t->last_mm_cid = -1;
-			return -1;
-		}
-	}
-
-	/*
-	 * The src_cid is unused, so it can be unset.
-	 */
-	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
-		return -1;
-	WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
-	return src_cid;
-}
-
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
-{
-	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
-	struct mm_struct *mm = t->mm;
-	int src_cid, src_cpu;
-	bool dst_cid_is_set;
-	struct rq *src_rq;
-
-	lockdep_assert_rq_held(dst_rq);
-
-	if (!mm)
-		return;
-	src_cpu = t->migrate_from_cpu;
-	if (src_cpu == -1) {
-		t->last_mm_cid = -1;
-		return;
-	}
-	/*
-	 * Move the src cid if the dst cid is unset. This keeps id
-	 * allocation closest to 0 in cases where few threads migrate around
-	 * many CPUs.
-	 *
-	 * If destination cid or recent cid is already set, we may have
-	 * to just clear the src cid to ensure compactness in frequent
-	 * migrations scenarios.
-	 *
-	 * It is not useful to clear the src cid when the number of threads is
-	 * greater or equal to the number of allowed CPUs, because user-space
-	 * can expect that the number of allowed cids can reach the number of
-	 * allowed CPUs.
-	 */
-	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
-	dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
-			 !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
-	if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
-		return;
-	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
-	src_rq = cpu_rq(src_cpu);
-	src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
-	if (src_cid == -1)
-		return;
-	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
-							    src_cid);
-	if (src_cid == -1)
-		return;
-	if (dst_cid_is_set) {
-		__mm_cid_put(mm, src_cid);
-		return;
-	}
-	/* Move src_cid to dst cpu. */
-	mm_cid_snapshot_time(dst_rq, mm);
-	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
-	WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
-}
-
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
-				      int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *t;
-	int cid, lazy_cid;
-
-	cid = READ_ONCE(pcpu_cid->cid);
-	if (!mm_cid_is_valid(cid))
-		return;
-
-	/*
-	 * Clear the cpu cid if it is set to keep cid allocation compact.  If
-	 * there happens to be other tasks left on the source cpu using this
-	 * mm, the next task using this mm will reallocate its cid on context
-	 * switch.
-	 */
-	lazy_cid = mm_cid_set_lazy_put(cid);
-	if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
-		return;
-
-	/*
-	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
-	 * rq->curr->mm matches the scheduler barrier in context_switch()
-	 * between store to rq->curr and load of prev and next task's
-	 * per-mm/cpu cid.
-	 *
-	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
-	 * rq->curr->mm_cid_active matches the barrier in
-	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
-	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
-	 * load of per-mm/cpu cid.
-	 */
-
-	/*
-	 * If we observe an active task using the mm on this rq after setting
-	 * the lazy-put flag, that task will be responsible for transitioning
-	 * from lazy-put flag set to MM_CID_UNSET.
-	 */
-	scoped_guard (rcu) {
-		t = rcu_dereference(rq->curr);
-		if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
-			return;
-	}
-
-	/*
-	 * The cid is unused, so it can be unset.
-	 * Disable interrupts to keep the window of cid ownership without rq
-	 * lock small.
-	 */
-	scoped_guard (irqsave) {
-		if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
-			__mm_cid_put(mm, cid);
-	}
-}
-
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct mm_cid *pcpu_cid;
-	struct task_struct *curr;
-	u64 rq_clock;
-
-	/*
-	 * rq->clock load is racy on 32-bit but one spurious clear once in a
-	 * while is irrelevant.
-	 */
-	rq_clock = READ_ONCE(rq->clock);
-	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
-
-	/*
-	 * In order to take care of infrequently scheduled tasks, bump the time
-	 * snapshot associated with this cid if an active task using the mm is
-	 * observed on this rq.
-	 */
-	scoped_guard (rcu) {
-		curr = rcu_dereference(rq->curr);
-		if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
-			WRITE_ONCE(pcpu_cid->time, rq_clock);
-			return;
-		}
-	}
-
-	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
-		return;
-	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
-}
-
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
-					     int weight)
-{
-	struct mm_cid *pcpu_cid;
-	int cid;
-
-	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
-	cid = READ_ONCE(pcpu_cid->cid);
-	if (!mm_cid_is_valid(cid) || cid < weight)
-		return;
-	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
-}
-
-static void task_mm_cid_work(struct callback_head *work)
-{
-	unsigned long now = jiffies, old_scan, next_scan;
-	struct task_struct *t = current;
-	struct cpumask *cidmask;
-	struct mm_struct *mm;
-	int weight, cpu;
-
-	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
-
-	work->next = work;	/* Prevent double-add */
-	if (t->flags & PF_EXITING)
-		return;
-	mm = t->mm;
-	if (!mm)
-		return;
-	old_scan = READ_ONCE(mm->mm_cid_next_scan);
-	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
-	if (!old_scan) {
-		unsigned long res;
-
-		res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
-		if (res != old_scan)
-			old_scan = res;
-		else
-			old_scan = next_scan;
-	}
-	if (time_before(now, old_scan))
-		return;
-	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
-		return;
-	cidmask = mm_cidmask(mm);
-	/* Clear cids that were not recently used. */
-	for_each_possible_cpu(cpu)
-		sched_mm_cid_remote_clear_old(mm, cpu);
-	weight = cpumask_weight(cidmask);
-	/*
-	 * Clear cids that are greater or equal to the cidmask weight to
-	 * recompact it.
-	 */
-	for_each_possible_cpu(cpu)
-		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-
-void init_sched_mm_cid(struct task_struct *t)
-{
-	struct mm_struct *mm = t->mm;
-	int mm_users = 0;
-
-	if (mm) {
-		mm_users = atomic_read(&mm->mm_users);
-		if (mm_users == 1)
-			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
-	}
-	t->cid_work.next = &t->cid_work;	/* Protect against double add */
-	init_task_work(&t->cid_work, task_mm_cid_work);
-}
-
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
-{
-	struct callback_head *work = &curr->cid_work;
-	unsigned long now = jiffies;
-
-	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
-	    work->next != work)
-		return;
-	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
-		return;
-
-	/* No page allocation under rq lock */
-	task_work_add(curr, work, TWA_RESUME);
-}
-
 void sched_mm_cid_exit_signals(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq *rq;
 
-	if (!mm)
+	if (!mm || !t->mm_cid_active)
 		return;
 
-	preempt_disable();
-	rq = this_rq();
-	guard(rq_lock_irqsave)(rq);
-	preempt_enable_no_resched();	/* holding spinlock */
-	WRITE_ONCE(t->mm_cid_active, 0);
-	/*
-	 * Store t->mm_cid_active before loading per-mm/cpu cid.
-	 * Matches barrier in sched_mm_cid_remote_clear_old().
-	 */
-	smp_mb();
-	mm_cid_put(mm);
-	t->last_mm_cid = t->mm_cid = -1;
+	guard(preempt)();
+	t->mm_cid_active = 0;
+	if (t->mm_cid != MM_CID_UNSET) {
+		cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm));
+		t->mm_cid = MM_CID_UNSET;
+	}
 }
 
+/* Deactivate MM CID allocation across execve() */
 void sched_mm_cid_before_execve(struct task_struct *t)
 {
-	struct mm_struct *mm = t->mm;
-	struct rq *rq;
-
-	if (!mm)
-		return;
-
-	preempt_disable();
-	rq = this_rq();
-	guard(rq_lock_irqsave)(rq);
-	preempt_enable_no_resched();	/* holding spinlock */
-	WRITE_ONCE(t->mm_cid_active, 0);
-	/*
-	 * Store t->mm_cid_active before loading per-mm/cpu cid.
-	 * Matches barrier in sched_mm_cid_remote_clear_old().
-	 */
-	smp_mb();
-	mm_cid_put(mm);
-	t->last_mm_cid = t->mm_cid = -1;
+	sched_mm_cid_exit_signals(t);
 }
 
+/* Reactivate MM CID after successful execve() */
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq *rq;
 
 	if (!mm)
 		return;
 
-	preempt_disable();
-	rq = this_rq();
-	scoped_guard (rq_lock_irqsave, rq) {
-		preempt_enable_no_resched();	/* holding spinlock */
-		WRITE_ONCE(t->mm_cid_active, 1);
-		/*
-		 * Store t->mm_cid_active before loading per-mm/cpu cid.
-		 * Matches barrier in sched_mm_cid_remote_clear_old().
-		 */
-		smp_mb();
-		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
-	}
+	guard(preempt)();
+	t->mm_cid_active = 1;
+	mm_cid_select(t);
 }
 
 void sched_mm_cid_fork(struct task_struct *t)
 {
-	WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+	WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET);
 	t->mm_cid_active = 1;
 }
 #endif /* CONFIG_SCHED_MM_CID */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4838dda75b10..bf227c27b889 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3540,286 +3540,83 @@ extern void sched_dynamic_update(int mode);
 extern const char *preempt_modes[];
 
 #ifdef CONFIG_SCHED_MM_CID
-
-#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
-#define MM_CID_SCAN_DELAY	100			/* 100ms */
-
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
-
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
-
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
-{
-	if (cid < 0)
-		return;
-	cpumask_clear_cpu(cid, mm_cidmask(mm));
-}
-
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
+static inline void init_sched_mm_cid(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
-	int cid;
+	unsigned int max_cid;
 
-	lockdep_assert_irqs_disabled();
-	cid = __this_cpu_read(pcpu_cid->cid);
-	if (!mm_cid_is_lazy_put(cid) ||
-	    !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+	if (!mm)
 		return;
-	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
-}
 
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
-{
-	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
-	int cid, res;
-
-	lockdep_assert_irqs_disabled();
-	cid = __this_cpu_read(pcpu_cid->cid);
-	for (;;) {
-		if (mm_cid_is_unset(cid))
-			return MM_CID_UNSET;
-		/*
-		 * Attempt transition from valid or lazy-put to unset.
-		 */
-		res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
-		if (res == cid)
-			break;
-		cid = res;
-	}
-	return cid;
+	/* Preset last_mm_cid */
+	max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
+	t->last_mm_cid = max_cid - 1;
 }
 
-static inline void mm_cid_put(struct mm_struct *mm)
+static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
 {
-	int cid;
+	struct mm_struct *mm = t->mm;
 
-	lockdep_assert_irqs_disabled();
-	cid = mm_cid_pcpu_unset(mm);
-	if (cid == MM_CID_UNSET)
-		return;
-	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+	if (cid >= max_cids)
+		return false;
+	if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
+		return false;
+	t->mm_cid = t->last_mm_cid = cid;
+	__this_cpu_write(mm->pcpu_cid->cid, cid);
+	return true;
 }
 
-static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+static inline bool mm_cid_get(struct task_struct *t)
 {
-	struct cpumask *cidmask = mm_cidmask(mm);
-	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
-	int cid, max_nr_cid, allowed_max_nr_cid;
+	struct mm_struct *mm = t->mm;
+	unsigned int max_cids;
 
-	/*
-	 * After shrinking the number of threads or reducing the number
-	 * of allowed cpus, reduce the value of max_nr_cid so expansion
-	 * of cid allocation will preserve cache locality if the number
-	 * of threads or allowed cpus increase again.
-	 */
-	max_nr_cid = atomic_read(&mm->max_nr_cid);
-	while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
-					   atomic_read(&mm->mm_users))),
-	       max_nr_cid > allowed_max_nr_cid) {
-		/* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
-		if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
-			max_nr_cid = allowed_max_nr_cid;
-			break;
-		}
-	}
-	/* Try to re-use recent cid. This improves cache locality. */
-	cid = __this_cpu_read(pcpu_cid->recent_cid);
-	if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
-	    !cpumask_test_and_set_cpu(cid, cidmask))
-		return cid;
-	/*
-	 * Expand cid allocation if the maximum number of concurrency
-	 * IDs allocated (max_nr_cid) is below the number cpus allowed
-	 * and number of threads. Expanding cid allocation as much as
-	 * possible improves cache locality.
-	 */
-	cid = max_nr_cid;
-	while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
-		/* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
-		if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
-			continue;
-		if (!cpumask_test_and_set_cpu(cid, cidmask))
-			return cid;
-	}
-	/*
-	 * Find the first available concurrency id.
-	 * Retry finding first zero bit if the mask is temporarily
-	 * filled. This only happens during concurrent remote-clear
-	 * which owns a cid without holding a rq lock.
-	 */
-	for (;;) {
-		cid = cpumask_first_zero(cidmask);
-		if (cid < READ_ONCE(mm->nr_cpus_allowed))
-			break;
-		cpu_relax();
-	}
-	if (cpumask_test_and_set_cpu(cid, cidmask))
-		return -1;
+	max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
 
-	return cid;
-}
+	/* Try to reuse the last CID of this task */
+	if (__mm_cid_get(t, t->last_mm_cid, max_cids))
+		return true;
 
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
-{
-	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+	/* Try to reuse the last CID of this mm on this CPU */
+	if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids))
+		return true;
 
-	lockdep_assert_rq_held(rq);
-	WRITE_ONCE(pcpu_cid->time, rq->clock);
+	/* Try the first zero bit in the cidmask. */
+	return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids);
 }
 
-static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
-			       struct mm_struct *mm)
+static inline void mm_cid_select(struct task_struct *t)
 {
-	int cid;
-
 	/*
-	 * All allocations (even those using the cid_lock) are lock-free. If
-	 * use_cid_lock is set, hold the cid_lock to perform cid allocation to
-	 * guarantee forward progress.
+	 * mm_cid_get() can fail when the maximum CID, which is determined
+	 * by min(mm->nr_cpus_allowed, mm->mm_users), changes concurrently.
+	 * That's a transient failure as there cannot be more tasks
+	 * concurrently on a CPU (or about to be scheduled in) than that.
 	 */
-	if (!READ_ONCE(use_cid_lock)) {
-		cid = __mm_cid_try_get(t, mm);
-		if (cid >= 0)
-			goto end;
-		raw_spin_lock(&cid_lock);
-	} else {
-		raw_spin_lock(&cid_lock);
-		cid = __mm_cid_try_get(t, mm);
-		if (cid >= 0)
-			goto unlock;
-	}
-
-	/*
-	 * cid concurrently allocated. Retry while forcing following
-	 * allocations to use the cid_lock to ensure forward progress.
-	 */
-	WRITE_ONCE(use_cid_lock, 1);
-	/*
-	 * Set use_cid_lock before allocation. Only care about program order
-	 * because this is only required for forward progress.
-	 */
-	barrier();
-	/*
-	 * Retry until it succeeds. It is guaranteed to eventually succeed once
-	 * all newcoming allocations observe the use_cid_lock flag set.
-	 */
-	do {
-		cid = __mm_cid_try_get(t, mm);
-		cpu_relax();
-	} while (cid < 0);
-	/*
-	 * Allocate before clearing use_cid_lock. Only care about
-	 * program order because this is for forward progress.
-	 */
-	barrier();
-	WRITE_ONCE(use_cid_lock, 0);
-unlock:
-	raw_spin_unlock(&cid_lock);
-end:
-	mm_cid_snapshot_time(rq, mm);
-
-	return cid;
-}
-
-static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
-			     struct mm_struct *mm)
-{
-	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
-	int cid;
-
-	lockdep_assert_rq_held(rq);
-	cid = __this_cpu_read(pcpu_cid->cid);
-	if (mm_cid_is_valid(cid)) {
-		mm_cid_snapshot_time(rq, mm);
-		return cid;
-	}
-	if (mm_cid_is_lazy_put(cid)) {
-		if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
-			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+	for (;;) {
+		if (mm_cid_get(t))
+			break;
 	}
-	cid = __mm_cid_get(rq, t, mm);
-	__this_cpu_write(pcpu_cid->cid, cid);
-	__this_cpu_write(pcpu_cid->recent_cid, cid);
-
-	return cid;
 }
 
-static inline void switch_mm_cid(struct rq *rq,
-				 struct task_struct *prev,
-				 struct task_struct *next)
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
 {
-	/*
-	 * Provide a memory barrier between rq->curr store and load of
-	 * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
-	 *
-	 * Should be adapted if context_switch() is modified.
-	 */
-	if (!next->mm) {                                // to kernel
-		/*
-		 * user -> kernel transition does not guarantee a barrier, but
-		 * we can use the fact that it performs an atomic operation in
-		 * mmgrab().
-		 */
-		if (prev->mm)                           // from user
-			smp_mb__after_mmgrab();
-		/*
-		 * kernel -> kernel transition does not change rq->curr->mm
-		 * state. It stays NULL.
-		 */
-	} else {                                        // to user
-		/*
-		 * kernel -> user transition does not provide a barrier
-		 * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
-		 * Provide it here.
-		 */
-		if (!prev->mm) {                        // from kernel
-			smp_mb();
-		} else {				// from user
-			/*
-			 * user->user transition relies on an implicit
-			 * memory barrier in switch_mm() when
-			 * current->mm changes. If the architecture
-			 * switch_mm() does not have an implicit memory
-			 * barrier, it is emitted here.  If current->mm
-			 * is unchanged, no barrier is needed.
-			 */
-			smp_mb__after_switch_mm();
-		}
-	}
 	if (prev->mm_cid_active) {
-		mm_cid_snapshot_time(rq, prev->mm);
-		mm_cid_put_lazy(prev);
-		prev->mm_cid = -1;
+		if (prev->mm_cid != MM_CID_UNSET)
+			cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm));
+		prev->mm_cid = MM_CID_UNSET;
 	}
+
 	if (next->mm_cid_active) {
-		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+		mm_cid_select(next);
 		rseq_sched_set_task_mm_cid(next, next->mm_cid);
 	}
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
 static inline void init_sched_mm_cid(struct task_struct *t) { }
+static inline void mm_cid_select(struct task_struct *t) { }
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
-- 
cgit v1.2.3


From 8cea569ca785060b8c5cc7800713ddc3b1548a94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:26:47 +0100
Subject: sched/mmcid: Use proper data structures

Having a lot of CID functionality specific members in struct task_struct
and struct mm_struct is not really making the code easier to read.

Encapsulate the CID specific parts in data structures and keep them
separate from the stuff they are embedded in.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.131573768@linutronix.de
---
 include/linux/mm_types.h   | 56 ++++++++++++----------------------------------
 include/linux/rseq_types.h | 42 ++++++++++++++++++++++++++++++++++
 include/linux/sched.h      | 11 ++-------
 init/init_task.c           |  3 +++
 kernel/fork.c              |  6 ++---
 kernel/sched/core.c        | 16 ++++++-------
 kernel/sched/sched.h       | 26 ++++++++++-----------
 7 files changed, 85 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 63b8c1209e7b..e4818e932a1d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
 #include <linux/seqlock.h>
 #include <linux/percpu_counter.h>
 #include <linux/types.h>
+#include <linux/rseq_types.h>
 #include <linux/bitmap.h>
 
 #include <asm/mmu.h>
@@ -922,10 +923,6 @@ struct vm_area_struct {
 #define vma_policy(vma) NULL
 #endif
 
-struct mm_cid {
-	unsigned int cid;
-};
-
 /*
  * Opaque type representing current mm_struct flag state. Must be accessed via
  * mm_flags_xxx() helper functions.
@@ -987,30 +984,9 @@ struct mm_struct {
 		 */
 		atomic_t mm_users;
 
-#ifdef CONFIG_SCHED_MM_CID
-		/**
-		 * @pcpu_cid: Per-cpu current cid.
-		 *
-		 * Keep track of the currently allocated mm_cid for each cpu.
-		 * The per-cpu mm_cid values are serialized by their respective
-		 * runqueue locks.
-		 */
-		struct mm_cid __percpu *pcpu_cid;
-		/**
-		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
-		 *
-		 * Number of CPUs allowed in the union of all mm's
-		 * threads allowed CPUs.
-		 */
-		unsigned int nr_cpus_allowed;
-		/**
-		 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
-		 *
-		 * Provide mutual exclusion for mm cpus_allowed and
-		 * mm nr_cpus_allowed updates.
-		 */
-		raw_spinlock_t cpus_allowed_lock;
-#endif
+		/* MM CID related storage */
+		struct mm_mm_cid mm_cid;
+
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
 #endif
@@ -1352,9 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
 }
 
 #ifdef CONFIG_SCHED_MM_CID
-
-#define	MM_CID_UNSET	(~0U)
-
 /*
  * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
  */
@@ -1383,20 +1356,20 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	int i;
 
 	for_each_possible_cpu(i) {
-		struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
+		struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i);
 
-		pcpu_cid->cid = MM_CID_UNSET;
+		pcpu->cid = MM_CID_UNSET;
 	}
-	mm->nr_cpus_allowed = p->nr_cpus_allowed;
-	raw_spin_lock_init(&mm->cpus_allowed_lock);
+	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+	raw_spin_lock_init(&mm->mm_cid.lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
 }
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
 {
-	mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
-	if (!mm->pcpu_cid)
+	mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu);
+	if (!mm->mm_cid.pcpu)
 		return -ENOMEM;
 	mm_init_cid(mm, p);
 	return 0;
@@ -1405,8 +1378,8 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
 
 static inline void mm_destroy_cid(struct mm_struct *mm)
 {
-	free_percpu(mm->pcpu_cid);
-	mm->pcpu_cid = NULL;
+	free_percpu(mm->mm_cid.pcpu);
+	mm->mm_cid.pcpu = NULL;
 }
 
 static inline unsigned int mm_cid_size(void)
@@ -1421,10 +1394,9 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
 	if (!mm)
 		return;
 	/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
-	raw_spin_lock(&mm->cpus_allowed_lock);
+	guard(raw_spinlock)(&mm->mm_cid.lock);
 	cpumask_or(mm_allowed, mm_allowed, cpumask);
-	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
-	raw_spin_unlock(&mm->cpus_allowed_lock);
+	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
 }
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 9c7a34154de8..e444dd267c7a 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -90,4 +90,46 @@ struct rseq_data {
 struct rseq_data { };
 #endif /* !CONFIG_RSEQ */
 
+#ifdef CONFIG_SCHED_MM_CID
+
+#define MM_CID_UNSET	(~0U)
+
+/**
+ * struct sched_mm_cid - Storage for per task MM CID data
+ * @active:	MM CID is active for the task
+ * @cid:	The CID associated to the task
+ * @last_cid:	The last CID associated to the task
+ */
+struct sched_mm_cid {
+	unsigned int		active;
+	unsigned int		cid;
+	unsigned int		last_cid;
+};
+
+/**
+ * struct mm_cid_pcpu - Storage for per CPU MM_CID data
+ * @cid:	The CID associated to the CPU
+ */
+struct mm_cid_pcpu {
+	unsigned int	cid;
+};
+
+/**
+ * struct mm_mm_cid - Storage for per MM CID data
+ * @pcpu:		Per CPU storage for CIDs associated to a CPU
+ * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
+ *			is growth only.
+ * @lock:		Spinlock to protect all fields except @pcpu. It also protects
+ *			the MM cid cpumask and the MM cidmask bitmap.
+ */
+struct mm_mm_cid {
+	struct mm_cid_pcpu	__percpu *pcpu;
+	unsigned int		nr_cpus_allowed;
+	raw_spinlock_t		lock;
+};
+#else /* CONFIG_SCHED_MM_CID */
+struct mm_mm_cid { };
+struct sched_mm_cid { };
+#endif /* !CONFIG_SCHED_MM_CID */
+
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e47abc8685d7..64f080d6ed6e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,14 +1407,7 @@ struct task_struct {
 #endif /* CONFIG_NUMA_BALANCING */
 
 	struct rseq_data		rseq;
-
-#ifdef CONFIG_SCHED_MM_CID
-	int				mm_cid;		/* Current cid in mm */
-	int				last_mm_cid;	/* Most recent cid in mm */
-	int				migrate_from_cpu;
-	int				mm_cid_active;	/* Whether cid bitmap is active */
-	struct callback_head		cid_work;
-#endif
+	struct sched_mm_cid		mm_cid;
 
 	struct tlbflush_unmap_batch	tlb_ubc;
 
@@ -2308,7 +2301,7 @@ void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit_signals(struct task_struct *t);
 static inline int task_mm_cid(struct task_struct *t)
 {
-	return t->mm_cid;
+	return t->mm_cid.cid;
 }
 #else
 static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
diff --git a/init/init_task.c b/init/init_task.c
index a55e2189206f..5d122699b664 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -223,6 +223,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
+#ifdef CONFIG_SCHED_MM_CID
+	.mm_cid		= { .cid = MM_CID_UNSET, },
+#endif
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d9afe453ef1..74bc7c9f1bb3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,9 +955,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #endif
 
 #ifdef CONFIG_SCHED_MM_CID
-	tsk->mm_cid = MM_CID_UNSET;
-	tsk->last_mm_cid = MM_CID_UNSET;
-	tsk->mm_cid_active = 0;
+	tsk->mm_cid.cid = MM_CID_UNSET;
+	tsk->mm_cid.last_cid = MM_CID_UNSET;
+	tsk->mm_cid.active = 0;
 #endif
 	return tsk;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11a173596e0d..b1aa7d1055ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10376,14 +10376,14 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
 
-	if (!mm || !t->mm_cid_active)
+	if (!mm || !t->mm_cid.active)
 		return;
 
 	guard(preempt)();
-	t->mm_cid_active = 0;
-	if (t->mm_cid != MM_CID_UNSET) {
-		cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm));
-		t->mm_cid = MM_CID_UNSET;
+	t->mm_cid.active = 0;
+	if (t->mm_cid.cid != MM_CID_UNSET) {
+		cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm));
+		t->mm_cid.cid = MM_CID_UNSET;
 	}
 }
 
@@ -10402,14 +10402,14 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 		return;
 
 	guard(preempt)();
-	t->mm_cid_active = 1;
+	t->mm_cid.active = 1;
 	mm_cid_select(t);
 }
 
 void sched_mm_cid_fork(struct task_struct *t)
 {
-	WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET);
-	t->mm_cid_active = 1;
+	WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
+	t->mm_cid.active = 1;
 }
 #endif /* CONFIG_SCHED_MM_CID */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf227c27b889..a17f04f075e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3549,8 +3549,8 @@ static inline void init_sched_mm_cid(struct task_struct *t)
 		return;
 
 	/* Preset last_mm_cid */
-	max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
-	t->last_mm_cid = max_cid - 1;
+	max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
+	t->mm_cid.last_cid = max_cid - 1;
 }
 
 static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
@@ -3561,8 +3561,8 @@ static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigne
 		return false;
 	if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
 		return false;
-	t->mm_cid = t->last_mm_cid = cid;
-	__this_cpu_write(mm->pcpu_cid->cid, cid);
+	t->mm_cid.cid = t->mm_cid.last_cid = cid;
+	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
 	return true;
 }
 
@@ -3571,14 +3571,14 @@ static inline bool mm_cid_get(struct task_struct *t)
 	struct mm_struct *mm = t->mm;
 	unsigned int max_cids;
 
-	max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
+	max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
 
 	/* Try to reuse the last CID of this task */
-	if (__mm_cid_get(t, t->last_mm_cid, max_cids))
+	if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
 		return true;
 
 	/* Try to reuse the last CID of this mm on this CPU */
-	if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids))
+	if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
 		return true;
 
 	/* Try the first zero bit in the cidmask. */
@@ -3601,15 +3601,15 @@ static inline void mm_cid_select(struct task_struct *t)
 
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
 {
-	if (prev->mm_cid_active) {
-		if (prev->mm_cid != MM_CID_UNSET)
-			cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm));
-		prev->mm_cid = MM_CID_UNSET;
+	if (prev->mm_cid.active) {
+		if (prev->mm_cid.cid != MM_CID_UNSET)
+			cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm));
+		prev->mm_cid.cid = MM_CID_UNSET;
 	}
 
-	if (next->mm_cid_active) {
+	if (next->mm_cid.active) {
 		mm_cid_select(next);
-		rseq_sched_set_task_mm_cid(next, next->mm_cid);
+		rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
 	}
 }
 
-- 
cgit v1.2.3


From be4463fa2c7185823d2989562162d578b45a89ae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:26:49 +0100
Subject: sched/mmcid: Cacheline align MM CID storage

Both the per CPU storage and the data in mm_struct are heavily used in
context switch. As they can end up next to other frequently modified data,
they are subject to false sharing.

Make them cache line aligned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.194111661@linutronix.de
---
 include/linux/rseq_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index e444dd267c7a..d7e8071b626a 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -112,7 +112,7 @@ struct sched_mm_cid {
  */
 struct mm_cid_pcpu {
 	unsigned int	cid;
-};
+}____cacheline_aligned_in_smp;
 
 /**
  * struct mm_mm_cid - Storage for per MM CID data
@@ -126,7 +126,7 @@ struct mm_mm_cid {
 	struct mm_cid_pcpu	__percpu *pcpu;
 	unsigned int		nr_cpus_allowed;
 	raw_spinlock_t		lock;
-};
+}____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
 struct sched_mm_cid { };
-- 
cgit v1.2.3


From b08ef5fc8fa01ae5285bef5ff783bbb425d1fb08 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:26:53 +0100
Subject: sched/mmcid: Move scheduler code out of global header

This is only used in the scheduler core code, so there is no point in having
it in a global header.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Link: https://patch.msgid.link/20251119172549.321259077@linutronix.de
---
 include/linux/mm_types.h | 13 -------------
 kernel/sched/core.c      | 20 ++++++++++++++++++--
 2 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e4818e932a1d..67a7bdf772f7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1387,27 +1387,14 @@ static inline unsigned int mm_cid_size(void)
 	return 2 * cpumask_size();	/* mm_cpus_allowed(), mm_cidmask(). */
 }
 
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
-{
-	struct cpumask *mm_allowed = mm_cpus_allowed(mm);
-
-	if (!mm)
-		return;
-	/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
-	guard(raw_spinlock)(&mm->mm_cid.lock);
-	cpumask_or(mm_allowed, mm_allowed, cpumask);
-	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
-}
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
 static inline void mm_destroy_cid(struct mm_struct *mm) { }
-
 static inline unsigned int mm_cid_size(void)
 {
 	return 0;
 }
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
 #endif /* CONFIG_SCHED_MM_CID */
 
 struct mmu_gather;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b667171b4422..f5e37c233b01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2669,6 +2669,8 @@ out_unlock:
 	return 0;
 }
 
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2728,7 +2730,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
 		put_prev_task(rq, p);
 
 	p->sched_class->set_cpus_allowed(p, ctx);
-	mm_set_cpus_allowed(p->mm, ctx->new_mask);
+	mm_update_cpus_allowed(p->mm, ctx->new_mask);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -10372,6 +10374,18 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  * When a task exits, the MM CID held by the task is not longer required as
  * the task cannot return to user space.
  */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
+{
+	struct cpumask *mm_allowed = mm_cpus_allowed(mm);
+
+	if (!mm)
+		return;
+	/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
+	guard(raw_spinlock)(&mm->mm_cid.lock);
+	cpumask_or(mm_allowed, mm_allowed, affmsk);
+	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
+}
+
 void sched_mm_cid_exit_signals(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
@@ -10411,7 +10425,9 @@ void sched_mm_cid_fork(struct task_struct *t)
 	WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
 	t->mm_cid.active = 1;
 }
-#endif /* CONFIG_SCHED_MM_CID */
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-- 
cgit v1.2.3


From 437cb3ded25038d5280d21de489ce78c745118d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:26:57 +0100
Subject: cpumask: Introduce cpumask_weighted_or()

CID management ORs two cpumasks and then calculates the weight of the
result. That's inefficient as it has to walk the same data twice. As
this is done with the runqueue lock held, there is a real benefit in
speeding this up. Depending on the system this results in 10-20% fewer
cycles spent with the runqueue lock held for a 4K cpumask.

Provide cpumask_weighted_or() and the corresponding bitmap functions which
return the weight of the OR result right away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.448263340@linutronix.de
---
 include/linux/bitmap.h  | 15 +++++++++++++++
 include/linux/cpumask.h | 16 ++++++++++++++++
 lib/bitmap.c            |  6 ++++++
 3 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 595217b7a6e7..b0395e4ccf90 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@ struct device;
  *  bitmap_copy(dst, src, nbits)                *dst = *src
  *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
  *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
+ *  bitmap_weighted_or(dst, src1, src2, nbits)	*dst = *src1 | *src2. Returns Hamming Weight of dst
  *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
  *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
  *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
+				  const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 		  const unsigned long *bitmap2, unsigned int nbits);
 bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -337,6 +340,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1,
 		__bitmap_or(dst, src1, src2, nbits);
 }
 
+static __always_inline
+unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1,
+				const unsigned long *src2, unsigned int nbits)
+{
+	if (small_const_nbits(nbits)) {
+		*dst = *src1 | *src2;
+		return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
+	} else {
+		return __bitmap_weighted_or(dst, src1, src2, nbits);
+	}
+}
+
 static __always_inline
 void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 		const unsigned long *src2, unsigned int nbits)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff8f41ab7ce6..feba06eb0a42 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -728,6 +728,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
 				      cpumask_bits(src2p), small_cpumask_bits);
 }
 
+/**
+ * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Return: The number of bits set in the resulting cpumask @dstp
+ */
+static __always_inline
+unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p,
+				 const struct cpumask *src2p)
+{
+	return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p),
+				  cpumask_bits(src2p), small_cpumask_bits);
+}
+
 /**
  * cpumask_xor - *dstp = *src1p ^ *src2p
  * @dstp: the cpumask result
diff --git a/lib/bitmap.c b/lib/bitmap.c
index b97692854966..9dc526507875 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -355,6 +355,12 @@ unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_weight_andnot);
 
+unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
+				  const unsigned long *bitmap2, unsigned int bits)
+{
+	return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits);
+}
+
 void __bitmap_set(unsigned long *map, unsigned int start, int len)
 {
 	unsigned long *p = map + BIT_WORD(start);
-- 
cgit v1.2.3


From b11890683380a36b8488229f818d5e76e8204587 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Wed, 19 Nov 2025 15:13:14 +0100
Subject: ata: libata-scsi: Fix system suspend for a security locked drive

Commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status
handling") fixed ata_to_sense_error() to properly generate sense key
ABORTED COMMAND (without any additional sense code), instead of the
previous bogus sense key ILLEGAL REQUEST with the additional sense code
UNALIGNED WRITE COMMAND, for a failed command.

However, this broke suspend for Security locked drives (drives that have
Security enabled, and have not been Security unlocked by boot firmware).

The reason for this is that the SCSI disk driver, for the Synchronize
Cache command only, treats any sense data with sense key ILLEGAL REQUEST
as a successful command (regardless of ASC / ASCQ).

After commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error()
status handling") the code that treats any sense data with sense key
ILLEGAL REQUEST as a successful command is no longer applicable, so the
command fails, which causes the system suspend to be aborted:

  sd 1:0:0:0: PM: dpm_run_callback(): scsi_bus_suspend returns -5
  sd 1:0:0:0: PM: failed to suspend async: error -5
  PM: Some devices failed to suspend, or early wake event detected

To make suspend work once again, for a Security locked device only,
return sense data LOGICAL UNIT ACCESS NOT AUTHORIZED, the actual sense
data which a real SCSI device would have returned if locked.
The SCSI disk driver treats this sense data as a successful command.

Cc: stable@vger.kernel.org
Reported-by: Ilia Baryshnikov <qwelias@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220704
Fixes: cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling")
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-scsi.c | 7 +++++++
 include/linux/ata.h       | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 3fb84f690644..434774e71fe6 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -992,6 +992,13 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
 		return;
 	}
 
+	if (ata_id_is_locked(dev->id)) {
+		/* Security locked */
+		/* LOGICAL UNIT ACCESS NOT AUTHORIZED */
+		ata_scsi_set_sense(dev, cmd, DATA_PROTECT, 0x74, 0x71);
+		return;
+	}
+
 	if (!(qc->flags & ATA_QCFLAG_RTF_FILLED)) {
 		ata_dev_dbg(dev,
 			    "Missing result TF: reporting aborted command\n");
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 792e10a09787..c9013e472aa3 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -566,6 +566,7 @@ struct ata_bmdma_prd {
 #define ata_id_has_ncq(id)	((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8))
 #define ata_id_queue_depth(id)	(((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1)
 #define ata_id_removable(id)	((id)[ATA_ID_CONFIG] & (1 << 7))
+#define ata_id_is_locked(id)	(((id)[ATA_ID_DLF] & 0x7) == 0x7)
 #define ata_id_has_atapi_AN(id)	\
 	((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
 	  ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
-- 
cgit v1.2.3


From 78cfd833bc04c0398ca4cfc64704350aebe4d4c2 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 13:06:39 +0000
Subject: firmware: cs_dsp: Factor out common debugfs string read

cs_dsp_debugfs_wmfw_read() and cs_dsp_debugfs_bin_read() were identical
except for which struct member they printed. Move all this duplicated
code into a common function cs_dsp_debugfs_string_read().

The check for dsp->booted has been removed because this is redundant.
The two strings are set when the DSP is booted and cleared when the
DSP is powered-down.

Access to the string char * must be protected by the pwr_lock mutex. The
string is passed into cs_dsp_debugfs_string_read() as a pointer to the
char * so that the mutex lock can also be factored out into
cs_dsp_debugfs_string_read().

wmfw_file_name and bin_file_name members of struct cs_dsp have been
changed to const char *. It makes for a better API to pass a const
pointer into cs_dsp_debugfs_string_read().

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120130640.1169780-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 45 ++++++++++++++++------------------
 include/linux/firmware/cirrus/cs_dsp.h |  4 +--
 2 files changed, 23 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index f51047d8ea64..58e41751dbc1 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -9,6 +9,7 @@
  *                         Cirrus Logic International Semiconductor Ltd.
  */
 
+#include <linux/cleanup.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
@@ -410,24 +411,30 @@ static void cs_dsp_debugfs_clear(struct cs_dsp *dsp)
 	dsp->bin_file_name = NULL;
 }
 
+static ssize_t cs_dsp_debugfs_string_read(struct cs_dsp *dsp,
+					  char __user *user_buf,
+					  size_t count, loff_t *ppos,
+					  const char **pstr)
+{
+	const char *str;
+
+	scoped_guard(mutex, &dsp->pwr_lock) {
+		str = *pstr;
+		if (!str)
+			return 0;
+
+		return simple_read_from_buffer(user_buf, count, ppos, str, strlen(str));
+	}
+}
+
 static ssize_t cs_dsp_debugfs_wmfw_read(struct file *file,
 					char __user *user_buf,
 					size_t count, loff_t *ppos)
 {
 	struct cs_dsp *dsp = file->private_data;
-	ssize_t ret;
 
-	mutex_lock(&dsp->pwr_lock);
-
-	if (!dsp->wmfw_file_name || !dsp->booted)
-		ret = 0;
-	else
-		ret = simple_read_from_buffer(user_buf, count, ppos,
-					      dsp->wmfw_file_name,
-					      strlen(dsp->wmfw_file_name));
-
-	mutex_unlock(&dsp->pwr_lock);
-	return ret;
+	return cs_dsp_debugfs_string_read(dsp, user_buf, count, ppos,
+					  &dsp->wmfw_file_name);
 }
 
 static ssize_t cs_dsp_debugfs_bin_read(struct file *file,
@@ -435,19 +442,9 @@ static ssize_t cs_dsp_debugfs_bin_read(struct file *file,
 				       size_t count, loff_t *ppos)
 {
 	struct cs_dsp *dsp = file->private_data;
-	ssize_t ret;
-
-	mutex_lock(&dsp->pwr_lock);
 
-	if (!dsp->bin_file_name || !dsp->booted)
-		ret = 0;
-	else
-		ret = simple_read_from_buffer(user_buf, count, ppos,
-					      dsp->bin_file_name,
-					      strlen(dsp->bin_file_name));
-
-	mutex_unlock(&dsp->pwr_lock);
-	return ret;
+	return cs_dsp_debugfs_string_read(dsp, user_buf, count, ppos,
+					  &dsp->bin_file_name);
 }
 
 static const struct {
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index a66eb7624730..69959032f8f5 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -188,8 +188,8 @@ struct cs_dsp {
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_root;
-	char *wmfw_file_name;
-	char *bin_file_name;
+	const char *wmfw_file_name;
+	const char *bin_file_name;
 #endif
 };
 
-- 
cgit v1.2.3


From d5089fffe1db04a802b028c2ef4875be1ed452a3 Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Mon, 17 Nov 2025 18:21:53 +0800
Subject: ASoC: tas2781: Add tas2568/2574/5806m/5806md/5830 support

TAS5806M, TAS5806MD and TAS5830 have an on-chip DSP without current/voltage
feedback and are in the same family as TAS58XX.
TAS2568 and TAS2574 are in the same family as TAS257X.

Signed-off-by: Baojun Xu <baojun.xu@ti.com>
Link: https://patch.msgid.link/20251117102153.30644-2-baojun.xu@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781.h        |  5 +++++
 sound/soc/codecs/tas2781-i2c.c | 29 ++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index c3b4c43dd2bf..711142cb9918 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -117,15 +117,20 @@ enum audio_device {
 	TAS2120,
 	TAS2320,
 	TAS2563,
+	TAS2568,
 	TAS2570,
 	TAS2572,
+	TAS2574,
 	TAS2781,
 	TAS5802,
+	TAS5806M,
+	TAS5806MD,
 	TAS5815,
 	TAS5822,
 	TAS5825,
 	TAS5827,
 	TAS5828,
+	TAS5830,
 	TAS_OTHERS,
 };
 
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index 7bd98ff5a250..0a86964dfcfd 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -105,15 +105,20 @@ static const struct i2c_device_id tasdevice_id[] = {
 	{ "tas2120", TAS2120 },
 	{ "tas2320", TAS2320 },
 	{ "tas2563", TAS2563 },
+	{ "tas2568", TAS2568 },
 	{ "tas2570", TAS2570 },
 	{ "tas2572", TAS2572 },
+	{ "tas2574", TAS2574 },
 	{ "tas2781", TAS2781 },
 	{ "tas5802", TAS5802 },
+	{ "tas5806m", TAS5806M },
+	{ "tas5806md", TAS5806MD },
 	{ "tas5815", TAS5815 },
 	{ "tas5822", TAS5822 },
 	{ "tas5825", TAS5825 },
 	{ "tas5827", TAS5827 },
 	{ "tas5828", TAS5828 },
+	{ "tas5830", TAS5830 },
 	{}
 };
 MODULE_DEVICE_TABLE(i2c, tasdevice_id);
@@ -125,15 +130,20 @@ static const struct of_device_id tasdevice_of_match[] = {
 	{ .compatible = "ti,tas2120" },
 	{ .compatible = "ti,tas2320" },
 	{ .compatible = "ti,tas2563" },
+	{ .compatible = "ti,tas2568" },
 	{ .compatible = "ti,tas2570" },
 	{ .compatible = "ti,tas2572" },
+	{ .compatible = "ti,tas2574" },
 	{ .compatible = "ti,tas2781" },
 	{ .compatible = "ti,tas5802" },
+	{ .compatible = "ti,tas5806m" },
+	{ .compatible = "ti,tas5806md" },
 	{ .compatible = "ti,tas5815" },
 	{ .compatible = "ti,tas5822" },
 	{ .compatible = "ti,tas5825" },
 	{ .compatible = "ti,tas5827" },
 	{ .compatible = "ti,tas5828" },
+	{ .compatible = "ti,tas5830" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, tasdevice_of_match);
@@ -1643,8 +1653,10 @@ static void tasdevice_fw_ready(const struct firmware *fmw,
 	case TAS2118:
 	case TAS2120:
 	case TAS2320:
+	case TAS2568:
 	case TAS2570:
 	case TAS2572:
+	case TAS2574:
 		goto out;
 	}
 	if (tas_priv->name_prefix)
@@ -1670,9 +1682,7 @@ static void tasdevice_fw_ready(const struct firmware *fmw,
 	}
 	tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK;
 
-	/* There is no calibration required for
-	 * TAS5802/TAS5815/TAS5822/TAS5825/TAS5827/TAS5828.
-	 */
+	/* There is no calibration required for TAS58XX. */
 	if (tas_priv->chip_id < TAS5802) {
 		ret = tasdevice_create_cali_ctrls(tas_priv);
 		if (ret) {
@@ -1728,11 +1738,14 @@ out:
 		case TAS2563:
 		case TAS2781:
 		case TAS5802:
+		case TAS5806M:
+		case TAS5806MD:
 		case TAS5815:
 		case TAS5822:
 		case TAS5825:
 		case TAS5827:
 		case TAS5828:
+		case TAS5830:
 			/* If DSP FW fail, DSP kcontrol won't be created. */
 			tasdevice_dsp_remove(tas_priv);
 		}
@@ -1883,8 +1896,10 @@ static int tasdevice_codec_probe(struct snd_soc_component *codec)
 	case TAS2118:
 	case TAS2120:
 	case TAS2320:
+	case TAS2568:
 	case TAS2570:
 	case TAS2572:
+	case TAS2574:
 		p = (struct snd_kcontrol_new *)tas2x20_snd_controls;
 		size = ARRAY_SIZE(tas2x20_snd_controls);
 		tas_priv->dvc_tlv_table = tas2x20_dvc_table;
@@ -1894,11 +1909,14 @@ static int tasdevice_codec_probe(struct snd_soc_component *codec)
 		size = ARRAY_SIZE(tas2781_snd_controls);
 		break;
 	case TAS5802:
+	case TAS5806M:
+	case TAS5806MD:
 	case TAS5815:
 	case TAS5822:
 	case TAS5825:
 	case TAS5827:
 	case TAS5828:
+	case TAS5830:
 		p = (struct snd_kcontrol_new *)tas5825_snd_controls;
 		size = ARRAY_SIZE(tas5825_snd_controls);
 		break;
@@ -2072,15 +2090,20 @@ static const struct acpi_device_id tasdevice_acpi_match[] = {
 	{ "TXNW2120", TAS2120 },
 	{ "TXNW2320", TAS2320 },
 	{ "TXNW2563", TAS2563 },
+	{ "TXNW2568", TAS2568 },
 	{ "TXNW2570", TAS2570 },
 	{ "TXNW2572", TAS2572 },
+	{ "TXNW2574", TAS2574 },
 	{ "TXNW2781", TAS2781 },
 	{ "TXNW5802", TAS5802 },
+	{ "TXNW806M", TAS5806M },
+	{ "TXNW806D", TAS5806MD },
 	{ "TXNW5815", TAS5815 },
 	{ "TXNW5822", TAS5822 },
 	{ "TXNW5825", TAS5825 },
 	{ "TXNW5827", TAS5827 },
 	{ "TXNW5828", TAS5828 },
+	{ "TXNW5830", TAS5830 },
 	{},
 };
 
-- 
cgit v1.2.3


From 6f87b41303d3c4280a57b4f7360022a0951b43dd Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 18 Nov 2025 11:04:03 +0100
Subject: string: fix kerneldoc formatting in strends()

strends() kernel doc should have used `@str:` format for arguments
instead of `@str -`.

Fixes: 197b3f3c70d6 ("string: provide strends()")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/all/20251118134748.40f03b9c@canb.auug.org.au/
Link: https://lore.kernel.org/r/20251118-strends-follow-up-v1-1-d3f8ef750f59@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/string.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 929d05d1247c..69e9256592f8 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -564,8 +564,8 @@ static inline bool strstarts(const char *str, const char *prefix)
 
 /**
  * strends - Check if a string ends with another string.
- * @str - NULL-terminated string to check against @suffix
- * @suffix - NULL-terminated string defining the suffix to look for in @str
+ * @str: NULL-terminated string to check against @suffix
+ * @suffix: NULL-terminated string defining the suffix to look for in @str
  *
  * Returns:
  * True if @str ends with @suffix. False in all other cases.
-- 
cgit v1.2.3


From 8278cb72c60399f6dc6300c409879fb4c7291513 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang@linux.dev>
Date: Sat, 15 Nov 2025 21:47:46 +0800
Subject: of/fdt: Consolidate duplicate code into helper functions

Currently, there are many pieces of nearly identical code scattered across
different places. Consolidate the duplicate code into helper functions to
improve maintainability and reduce the likelihood of errors.

Signed-off-by: Yuntao Wang <yuntao.wang@linux.dev>
Link: https://patch.msgid.link/20251115134753.179931-2-yuntao.wang@linux.dev
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/fdt.c       | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/of_fdt.h |  9 +++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 0edd639898a6..0c18bdefbbee 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -625,6 +625,47 @@ const void *__init of_get_flat_dt_prop(unsigned long node, const char *name,
 	return fdt_getprop(initial_boot_params, node, name, size);
 }
 
+const __be32 *__init of_flat_dt_get_addr_size_prop(unsigned long node,
+						   const char *name,
+						   int *entries)
+{
+	const __be32 *prop;
+	int len, elen = (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32);
+
+	prop = of_get_flat_dt_prop(node, name, &len);
+	if (!prop || len % elen) {
+		*entries = 0;
+		return NULL;
+	}
+
+	*entries = len / elen;
+	return prop;
+}
+
+bool __init of_flat_dt_get_addr_size(unsigned long node, const char *name,
+				     u64 *addr, u64 *size)
+{
+	const __be32 *prop;
+	int entries;
+
+	prop = of_flat_dt_get_addr_size_prop(node, name, &entries);
+	if (!prop || entries != 1)
+		return false;
+
+	of_flat_dt_read_addr_size(prop, 0, addr, size);
+	return true;
+}
+
+void __init of_flat_dt_read_addr_size(const __be32 *prop, int entry_index,
+				      u64 *addr, u64 *size)
+{
+	int entry_cells = dt_root_addr_cells + dt_root_size_cells;
+	prop += entry_cells * entry_index;
+
+	*addr = dt_mem_next_cell(dt_root_addr_cells, &prop);
+	*size = dt_mem_next_cell(dt_root_size_cells, &prop);
+}
+
 /**
  * of_fdt_is_compatible - Return true if given node from the given blob has
  * compat in its compatible list
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index b8d6c0c20876..51dadbaa3d63 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -55,6 +55,15 @@ extern int of_get_flat_dt_subnode_by_name(unsigned long node,
 					  const char *uname);
 extern const void *of_get_flat_dt_prop(unsigned long node, const char *name,
 				       int *size);
+
+extern const __be32 *of_flat_dt_get_addr_size_prop(unsigned long node,
+						   const char *name,
+						   int *entries);
+extern bool of_flat_dt_get_addr_size(unsigned long node, const char *name,
+				     u64 *addr, u64 *size);
+extern void of_flat_dt_read_addr_size(const __be32 *prop, int entry_index,
+				      u64 *addr, u64 *size);
+
 extern int of_flat_dt_is_compatible(unsigned long node, const char *name);
 extern unsigned long of_get_flat_dt_root(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
-- 
cgit v1.2.3


From d1cadd4bfc2802c6f73b1739dbceef7513afc591 Mon Sep 17 00:00:00 2001
From: David Laight <david.laight.linux@gmail.com>
Date: Wed, 19 Nov 2025 22:41:28 +0000
Subject: nodemask: use min() instead of min_t()

min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'.
Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long'
and so cannot discard significant bits.

In this case the 'unsigned long' value is small enough that the result
is ok.

Detected by an extra check added to min_t().

Signed-off-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/nodemask.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 7ad1f5c7407e..bd38648c998d 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -245,18 +245,18 @@ static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int n
 }
 
 /* FIXME: better would be to fix all architectures to never return
-          > MAX_NUMNODES, then the silly min_ts could be dropped. */
+          > MAX_NUMNODES, then the silly min()s could be dropped. */
 
 #define first_node(src) __first_node(&(src))
 static __always_inline unsigned int __first_node(const nodemask_t *srcp)
 {
-	return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
+	return min(MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
 }
 
 #define next_node(n, src) __next_node((n), &(src))
 static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp)
 {
-	return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
+	return min(MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
 
 /*
@@ -293,8 +293,7 @@ static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node)
 #define first_unset_node(mask) __first_unset_node(&(mask))
 static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp)
 {
-	return min_t(unsigned int, MAX_NUMNODES,
-			find_first_zero_bit(maskp->bits, MAX_NUMNODES));
+	return min(MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES));
 }
 
 #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)
-- 
cgit v1.2.3


From d7cdbbc93c564902169e854e78716a7b5e6cb241 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 20 Nov 2025 14:23:58 +0100
Subject: software node: allow referencing firmware nodes

At the moment software nodes can only reference other software nodes.
This is a limitation for devices created, for instance, on the auxiliary
bus with a dynamic software node attached which cannot reference devices
the firmware node of which is "real" (as an OF node or otherwise).

Make it possible for a software node to reference all firmware nodes in
addition to static software nodes. To that end: add a second pointer to
struct software_node_ref_args of type struct fwnode_handle. The core
swnode code will first check the swnode pointer and if it's NULL, it
will assume the fwnode pointer should be set.

Software node graphs remain the same, as in: the remote endpoints still
have to be software nodes.

Acked-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Tested-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/base/swnode.c    | 24 ++++++++++++++++++++++--
 include/linux/property.h | 13 ++++++++++---
 2 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 6b1ee75a908f..16a8301c25d6 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -535,7 +535,24 @@ software_node_get_reference_args(const struct fwnode_handle *fwnode,
 	ref_array = prop->pointer;
 	ref = &ref_array[index];
 
-	refnode = software_node_fwnode(ref->node);
+	/*
+	 * A software node can reference other software nodes or firmware
+	 * nodes (which are the abstraction layer sitting on top of them).
+	 * This is done to ensure we can create references to static software
+	 * nodes before they're registered with the firmware node framework.
+	 * At the time the reference is being resolved, we expect the swnodes
+	 * in question to already have been registered and to be backed by
+	 * a firmware node. This is why we use the fwnode API below to read the
+	 * relevant properties and bump the reference count.
+	 */
+
+	if (ref->swnode)
+		refnode = software_node_fwnode(ref->swnode);
+	else if (ref->fwnode)
+		refnode = ref->fwnode;
+	else
+		return -EINVAL;
+
 	if (!refnode)
 		return -ENOENT;
 
@@ -633,7 +650,10 @@ software_node_graph_get_remote_endpoint(const struct fwnode_handle *fwnode)
 
 	ref = prop->pointer;
 
-	return software_node_get(software_node_fwnode(ref[0].node));
+	if (!ref->swnode)
+		return NULL;
+
+	return software_node_get(software_node_fwnode(ref->swnode));
 }
 
 static struct fwnode_handle *
diff --git a/include/linux/property.h b/include/linux/property.h
index 50b26589dd70..272bfbdea7bf 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -355,19 +355,26 @@ struct software_node;
 
 /**
  * struct software_node_ref_args - Reference property with additional arguments
- * @node: Reference to a software node
+ * @swnode: Reference to a software node
+ * @fwnode: Alternative reference to a firmware node handle
  * @nargs: Number of elements in @args array
  * @args: Integer arguments
  */
 struct software_node_ref_args {
-	const struct software_node *node;
+	const struct software_node *swnode;
+	struct fwnode_handle *fwnode;
 	unsigned int nargs;
 	u64 args[NR_FWNODE_REFERENCE_ARGS];
 };
 
 #define SOFTWARE_NODE_REFERENCE(_ref_, ...)			\
 (const struct software_node_ref_args) {				\
-	.node = _ref_,						\
+	.swnode = _Generic(_ref_,				\
+			   const struct software_node *: _ref_,	\
+			   default: NULL),			\
+	.fwnode = _Generic(_ref_,				\
+			   struct fwnode_handle *: _ref_,	\
+			   default: NULL),			\
 	.nargs = COUNT_ARGS(__VA_ARGS__),			\
 	.args = { __VA_ARGS__ },				\
 }
-- 
cgit v1.2.3


From cf6ec18ea6e12569b83af2709d0bd0cc09da198f Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 13:44:34 +0000
Subject: ASoC: soc.h: Add SOC_ENUM_EXT_ACC() to allow setting access flags

Add a macro SOC_ENUM_EXT_ACC() to allow the access permission flags
to be set. This is the same as SOC_ENUM_EXT() but with an extra
argument for the access flags.

This will be used by the cs35l56.c driver to create a read-only
volatile enum. It's preferable to avoid custom control macros in codec
drivers. Code maintenance is easier if all control macros are defined
together in soc.h.

This commit only creates this one macro that is actually going to be used.
There's no point cluttering soc.h with unused macros - that just adds a
maintenance burden. People can add equivalents for the other macros if
they need them.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120134437.1179191-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 37dc6f6fc63f..b1b6b6a497da 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -319,6 +319,13 @@ struct platform_device;
 #define SOC_VALUE_ENUM_EXT(xname, xenum, xhandler_get, xhandler_put) \
 	SOC_ENUM_EXT(xname, xenum, xhandler_get, xhandler_put)
 
+#define SOC_ENUM_EXT_ACC(xname, xenum, xhandler_get, xhandler_put, xaccess) \
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \
+	.access = xaccess, \
+	.info = snd_soc_info_enum_double, \
+	.get = xhandler_get, .put = xhandler_put, \
+	.private_value = (unsigned long)&xenum }
+
 #define SND_SOC_BYTES(xname, xbase, xregs)		      \
 {	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname,   \
 	.info = snd_soc_bytes_info, .get = snd_soc_bytes_get, \
-- 
cgit v1.2.3


From d7a82707f19c7a11ce42dd46cb22ca34a58cc9b0 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 13:44:35 +0000
Subject: ASoC: soc.h: Add SND_SOC_BYTES_E_ACC() to allow setting access flags

Add a macro SND_SOC_BYTES_E_ACC() to allow the access permission flags
to be set. This is the same as SND_SOC_BYTES_E() but with an extra
argument for the access flags.

This will be used by the cs35l56.c driver to create a read-only
volatile byte control. It's preferable to avoid custom control macros
in codec drivers. Code maintenance is easier if all control macros are
defined together in soc.h.

This commit only creates this one macro that is actually going to be used.
There's no point cluttering soc.h with unused macros - that just adds a
maintenance burden. People can add equivalents for the other macros if
they need them.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120134437.1179191-3-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index b1b6b6a497da..aa0fe6b80293 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -338,6 +338,13 @@ struct platform_device;
 	.put = xhandler_put, .private_value = \
 		((unsigned long)&(struct soc_bytes) \
 		{.base = xbase, .num_regs = xregs }) }
+#define SND_SOC_BYTES_E_ACC(xname, xbase, xregs, xhandler_get, xhandler_put, xaccess) \
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \
+	.access = xaccess, \
+	.info = snd_soc_bytes_info, .get = xhandler_get, \
+	.put = xhandler_put, .private_value = \
+		((unsigned long)&(struct soc_bytes) \
+		{.base = xbase, .num_regs = xregs }) }
 
 #define SND_SOC_BYTES_MASK(xname, xbase, xregs, xmask)	      \
 {	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname,   \
-- 
cgit v1.2.3


From 1f382215119a0bc165e766e5bc424b3d3e8dae35 Mon Sep 17 00:00:00 2001
From: Pingfan Liu <piliu@redhat.com>
Date: Wed, 19 Nov 2025 17:55:24 +0800
Subject: cgroup/cpuset: Introduce cpuset_cpus_allowed_locked()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cpuset_cpus_allowed() uses a reader lock that is sleepable under RT,
which means it cannot be called inside raw_spin_lock_t context.

Introduce a new cpuset_cpus_allowed_locked() helper that performs the
same function as cpuset_cpus_allowed() except that the caller must have
acquired the cpuset_mutex so that no further locking will be needed.

Suggested-by: Waiman Long <longman@redhat.com>
Signed-off-by: Pingfan Liu <piliu@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: linux-kernel@vger.kernel.org
To: cgroups@vger.kernel.org
Reviewed-by: Chen Ridong <chenridong@huawei.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h |  9 ++++++++-
 kernel/cgroup/cpuset.c | 51 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 44 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2ddb256187b5..a98d3330385c 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -74,6 +74,7 @@ extern void inc_dl_tasks_cs(struct task_struct *task);
 extern void dec_dl_tasks_cs(struct task_struct *task);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
+extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern bool cpuset_cpu_is_isolated(int cpu);
@@ -195,10 +196,16 @@ static inline void dec_dl_tasks_cs(struct task_struct *task) { }
 static inline void cpuset_lock(void) { }
 static inline void cpuset_unlock(void) { }
 
+static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
+					struct cpumask *mask)
+{
+	cpumask_copy(mask, task_cpu_possible_mask(p));
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
-	cpumask_copy(mask, task_cpu_possible_mask(p));
+	cpuset_cpus_allowed_locked(p, mask);
 }
 
 static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 976bce6e5673..ec8bebc66469 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4160,24 +4160,13 @@ void __init cpuset_init_smp(void)
 	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
-/**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
- *
- * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
- * attached to the specified @tsk.  Guaranteed to return some non-empty
- * subset of cpu_active_mask, even if this means going outside the
- * tasks cpuset, except when the task is in the top cpuset.
- **/
-
-void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
 {
-	unsigned long flags;
 	struct cpuset *cs;
 
-	spin_lock_irqsave(&callback_lock, flags);
-
 	cs = task_cs(tsk);
 	if (cs != &top_cpuset)
 		guarantee_active_cpus(tsk, pmask);
@@ -4197,7 +4186,39 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 		if (!cpumask_intersects(pmask, cpu_active_mask))
 			cpumask_copy(pmask, possible_mask);
 	}
+}
 
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similar to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+	lockdep_assert_held(&cpuset_mutex);
+	__cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk.  Guaranteed to return some non-empty
+ * subset of cpu_active_mask, even if this means going outside the
+ * tasks cpuset, except when the task is in the top cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
+	__cpuset_cpus_allowed_locked(tsk, pmask);
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
-- 
cgit v1.2.3


From 3efee7362dbf896072af1c1aaeaf9fd6e235c591 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 15:56:57 +0000
Subject: ASoC: SDCA: Add stubs for FDL helper functions

In the case the SDCA IRQ is built in but FDL support is not, stub
functions are required for the FDL helpers to avoid build failures. The
FDL IRQs likely shouldn't get triggered in this case, however they would
still be a part of the build.

Fixes: 71f7990a34cd ("ASoC: SDCA: Add FDL library for XU entities")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511200419.SbU6YvjE-lkp@intel.com/
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120155657.2181751-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_fdl.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h
index f4ba809cb203..fbaf4b384c8a 100644
--- a/include/sound/sdca_fdl.h
+++ b/include/sound/sdca_fdl.h
@@ -64,6 +64,8 @@ struct fdl_state {
 					 SDCA_CTL_XU_FDLD_ACK_TRANSFER | \
 					 SDCA_CTL_XU_FDLD_NEEDS_SET)
 
+#if IS_ENABLED(CONFIG_SND_SOC_SDCA_FDL)
+
 int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt);
 int sdca_fdl_process(struct sdca_interrupt *interrupt);
 int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
@@ -72,4 +74,32 @@ int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function,
 int sdca_reset_function(struct device *dev, struct sdca_function_data *function,
 			struct regmap *regmap);
 
+#else
+
+static inline int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt)
+{
+	return 0;
+}
+
+static inline int sdca_fdl_process(struct sdca_interrupt *interrupt)
+{
+	return 0;
+}
+
+static inline int sdca_fdl_sync(struct device *dev,
+				struct sdca_function_data *function,
+				struct sdca_interrupt_info *info)
+{
+	return 0;
+}
+
+static inline int sdca_reset_function(struct device *dev,
+				      struct sdca_function_data *function,
+				      struct regmap *regmap)
+{
+	return 0;
+}
+
+#endif // CONFIG_SND_SOC_SDCA_FDL
+
 #endif // __SDCA_FDL_H__
-- 
cgit v1.2.3


From 5fe65824b74c0414f105f0535437108cd6c31cc7 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 15:30:12 +0000
Subject: ASoC: SDCA: Add missing forward declaration in header

The structure sdca_function_desc contains a fwnode_handle which is
undefined if the user doesn't pull in an appropriate header. Add a
forward declaration to avoid this.

Fixes: 996bf834d0b6 ("ASoC: SDCA: Add code to parse Function information")
Tested-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Tested-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120153023.2105663-4-ckeepax@opensource.cirrus.com
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/sound/sdca.h b/include/sound/sdca.h
index d38cdbfeb35f..d58d60221277 100644
--- a/include/sound/sdca.h
+++ b/include/sound/sdca.h
@@ -13,6 +13,7 @@
 #include <linux/kconfig.h>
 
 struct acpi_table_swft;
+struct fwnode_handle;
 struct sdw_slave;
 
 #define SDCA_MAX_FUNCTION_COUNT 8
-- 
cgit v1.2.3


From 5acf17b6df5e759bfb8bc0a75fadcbb3e363a17b Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 15:30:19 +0000
Subject: ASoC: SDCA: Add helper to write initialization writes

Add a helper function to write out the SDCA blind initialization writes.

Acked-by: Vinod Koul <vkoul@kernel.org>
Tested-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Tested-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120153023.2105663-11-ckeepax@opensource.cirrus.com
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_regmap.h  |  2 ++
 sound/soc/sdca/sdca_regmap.c | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_regmap.h b/include/sound/sdca_regmap.h
index b2e3c2ad2bb8..792540a530fc 100644
--- a/include/sound/sdca_regmap.h
+++ b/include/sound/sdca_regmap.h
@@ -27,5 +27,7 @@ int sdca_regmap_populate_constants(struct device *dev, struct sdca_function_data
 
 int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap,
 			       struct sdca_function_data *function);
+int sdca_regmap_write_init(struct device *dev, struct regmap *regmap,
+			   struct sdca_function_data *function);
 
 #endif // __SDCA_REGMAP_H__
diff --git a/sound/soc/sdca/sdca_regmap.c b/sound/soc/sdca/sdca_regmap.c
index 6fbb241d9d35..2cca9a9c71ea 100644
--- a/sound/soc/sdca/sdca_regmap.c
+++ b/sound/soc/sdca/sdca_regmap.c
@@ -355,3 +355,19 @@ int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap,
 	return 0;
 }
 EXPORT_SYMBOL_NS(sdca_regmap_write_defaults, "SND_SOC_SDCA");
+
+int sdca_regmap_write_init(struct device *dev, struct regmap *regmap,
+			   struct sdca_function_data *function)
+{
+	struct sdca_init_write *init = function->init_table;
+	int ret, i;
+
+	for (i = 0; i < function->num_init_table; i++) {
+		ret = regmap_write(regmap, init[i].addr, init[i].val);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_regmap_write_init, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From 4496d1c65bad7a3a32d2e09aaf3c54bc562c3fcc Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Thu, 20 Nov 2025 15:30:20 +0000
Subject: ASoC: SDCA: add function devices

Use the auxiliary bus to register/unregister subdevices for each
function. Each function will be handled with a separate driver,
matched using a name.

If a vendor wants to override a specific function driver, they could
use a custom name to match with a custom function driver.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Tested-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Tested-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120153023.2105663-12-ckeepax@opensource.cirrus.com
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca.h                  |  13 ++++
 sound/soc/sdca/Kconfig                |   1 +
 sound/soc/sdca/Makefile               |   4 +-
 sound/soc/sdca/sdca_function_device.c | 117 ++++++++++++++++++++++++++++++++++
 sound/soc/sdca/sdca_function_device.h |  15 +++++
 5 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100644 sound/soc/sdca/sdca_function_device.c
 create mode 100644 sound/soc/sdca/sdca_function_device.h

(limited to 'include')

diff --git a/include/sound/sdca.h b/include/sound/sdca.h
index d58d60221277..67ff3c88705d 100644
--- a/include/sound/sdca.h
+++ b/include/sound/sdca.h
@@ -15,18 +15,21 @@
 struct acpi_table_swft;
 struct fwnode_handle;
 struct sdw_slave;
+struct sdca_dev;
 
 #define SDCA_MAX_FUNCTION_COUNT 8
 
 /**
  * struct sdca_function_desc - short descriptor for an SDCA Function
  * @node: firmware node for the Function.
+ * @func_dev: pointer to SDCA function device.
  * @name: Human-readable string.
  * @type: Function topology type.
  * @adr: ACPI address (used for SDCA register access).
  */
 struct sdca_function_desc {
 	struct fwnode_handle *node;
+	struct sdca_dev *func_dev;
 	const char *name;
 	u32 type;
 	u8 adr;
@@ -59,6 +62,8 @@ void sdca_lookup_functions(struct sdw_slave *slave);
 void sdca_lookup_swft(struct sdw_slave *slave);
 void sdca_lookup_interface_revision(struct sdw_slave *slave);
 bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk);
+int sdca_dev_register_functions(struct sdw_slave *slave);
+void sdca_dev_unregister_functions(struct sdw_slave *slave);
 
 #else
 
@@ -69,6 +74,14 @@ static inline bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_qu
 {
 	return false;
 }
+
+static inline int sdca_dev_register_functions(struct sdw_slave *slave)
+{
+	return 0;
+}
+
+static inline void sdca_dev_unregister_functions(struct sdw_slave *slave) {}
+
 #endif
 
 #endif
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
index a73920d07073..e7f36d668f15 100644
--- a/sound/soc/sdca/Kconfig
+++ b/sound/soc/sdca/Kconfig
@@ -4,6 +4,7 @@ menu "SoundWire (SDCA)"
 config SND_SOC_SDCA
 	tristate
 	depends on ACPI
+	select AUXILIARY_BUS
 	help
 	  This option enables support for the MIPI SoundWire Device
 	  Class for Audio (SDCA).
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index be911c399bbd..babe3fa2bb3f 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-snd-soc-sdca-y := sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o \
-		  sdca_ump.o
+snd-soc-sdca-y := sdca_functions.o sdca_device.o sdca_function_device.o \
+		  sdca_regmap.o sdca_asoc.o sdca_ump.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_HID) += sdca_hid.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_IRQ) += sdca_interrupts.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_FDL) += sdca_fdl.o
diff --git a/sound/soc/sdca/sdca_function_device.c b/sound/soc/sdca/sdca_function_device.c
new file mode 100644
index 000000000000..91c49d7389db
--- /dev/null
+++ b/sound/soc/sdca/sdca_function_device.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2024 Intel Corporation.
+
+/*
+ * SDCA Function Device management
+ */
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/soundwire/sdw.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+#include "sdca_function_device.h"
+
+/*
+ * A SoundWire device can have multiple SDCA functions identified by
+ * their type and ADR. There can be multiple SoundWire devices per
+ * link, or multiple devices spread across multiple links. An IDA is
+ * required to identify each instance.
+ */
+static DEFINE_IDA(sdca_function_ida);
+
+static void sdca_dev_release(struct device *dev)
+{
+	struct auxiliary_device *auxdev = to_auxiliary_dev(dev);
+	struct sdca_dev *sdev = auxiliary_dev_to_sdca_dev(auxdev);
+
+	ida_free(&sdca_function_ida, auxdev->id);
+	kfree(sdev);
+}
+
+/* alloc, init and add an SDCA function device */
+static struct sdca_dev *sdca_dev_register(struct device *parent,
+					  struct sdca_function_desc *function_desc)
+{
+	struct sdca_dev *sdev;
+	struct auxiliary_device *auxdev;
+	int ret;
+	int rc;
+
+	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+	if (!sdev)
+		return ERR_PTR(-ENOMEM);
+
+	auxdev = &sdev->auxdev;
+	auxdev->name = function_desc->name;
+	auxdev->dev.parent = parent;
+	auxdev->dev.fwnode = function_desc->node;
+	auxdev->dev.release = sdca_dev_release;
+
+	sdev->function.desc = function_desc;
+
+	rc = ida_alloc(&sdca_function_ida, GFP_KERNEL);
+	if (rc < 0) {
+		kfree(sdev);
+		return ERR_PTR(rc);
+	}
+	auxdev->id = rc;
+
+	/* now follow the two-step init/add sequence */
+	ret = auxiliary_device_init(auxdev);
+	if (ret < 0) {
+		dev_err(parent, "failed to initialize SDCA function dev %s\n",
+			function_desc->name);
+		ida_free(&sdca_function_ida, auxdev->id);
+		kfree(sdev);
+		return ERR_PTR(ret);
+	}
+
+	ret = auxiliary_device_add(auxdev);
+	if (ret < 0) {
+		dev_err(parent, "failed to add SDCA function dev %s\n",
+			sdev->auxdev.name);
+		/* sdev will be freed with the put_device() and .release sequence */
+		auxiliary_device_uninit(&sdev->auxdev);
+		return ERR_PTR(ret);
+	}
+
+	return sdev;
+}
+
+static void sdca_dev_unregister(struct sdca_dev *sdev)
+{
+	auxiliary_device_delete(&sdev->auxdev);
+	auxiliary_device_uninit(&sdev->auxdev);
+}
+
+int sdca_dev_register_functions(struct sdw_slave *slave)
+{
+	struct sdca_device_data *sdca_data = &slave->sdca_data;
+	int i;
+
+	for (i = 0; i < sdca_data->num_functions; i++) {
+		struct sdca_dev *func_dev;
+
+		func_dev = sdca_dev_register(&slave->dev, &sdca_data->function[i]);
+		/* sdca_dev_register() reports failure as an ERR_PTR(), never NULL */
+		if (IS_ERR(func_dev))
+			return PTR_ERR(func_dev);
+
+		sdca_data->function[i].func_dev = func_dev;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_dev_register_functions, "SND_SOC_SDCA");
+
+void sdca_dev_unregister_functions(struct sdw_slave *slave)
+{
+	struct sdca_device_data *sdca_data = &slave->sdca_data;
+	int i;
+
+	for (i = 0; i < sdca_data->num_functions; i++)
+		sdca_dev_unregister(sdca_data->function[i].func_dev);
+}
+EXPORT_SYMBOL_NS(sdca_dev_unregister_functions, "SND_SOC_SDCA");
diff --git a/sound/soc/sdca/sdca_function_device.h b/sound/soc/sdca/sdca_function_device.h
new file mode 100644
index 000000000000..5adf7551d3a4
--- /dev/null
+++ b/sound/soc/sdca/sdca_function_device.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/* Copyright(c) 2024 Intel Corporation. */
+
+#ifndef __SDCA_FUNCTION_DEVICE_H
+#define __SDCA_FUNCTION_DEVICE_H
+
+struct sdca_dev {
+	struct auxiliary_device auxdev;
+	struct sdca_function_data function;
+};
+
+#define auxiliary_dev_to_sdca_dev(auxiliary_dev)		\
+	container_of(auxiliary_dev, struct sdca_dev, auxdev)
+
+#endif
-- 
cgit v1.2.3


From 2d877d0659cb69cc0677ee2805e9521966d70ac5 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 20 Nov 2025 15:30:21 +0000
Subject: ASoC: SDCA: Add basic SDCA class driver

Add a device level driver as the entry point for the class driver.
Additional auxiliary drivers will be registered to support each function
within the device. This driver will register those function drivers and
provide the device level functionality, such as monitoring bus
attach/detach, the device level register map, and the root for the IRQ
handling.

Co-developed-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Tested-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Tested-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251120153023.2105663-13-ckeepax@opensource.cirrus.com
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_registers.h |   2 +
 sound/soc/sdca/Kconfig                  |  10 ++
 sound/soc/sdca/Makefile                 |   4 +
 sound/soc/sdca/sdca_class.c             | 304 ++++++++++++++++++++++++++++++++
 sound/soc/sdca/sdca_class.h             |  37 ++++
 5 files changed, 357 insertions(+)
 create mode 100644 sound/soc/sdca/sdca_class.c
 create mode 100644 sound/soc/sdca/sdca_class.h

(limited to 'include')

diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h
index 0a5939285583..cae8a0a5a9b0 100644
--- a/include/linux/soundwire/sdw_registers.h
+++ b/include/linux/soundwire/sdw_registers.h
@@ -355,4 +355,6 @@
 /* Check the reserved and fixed bits in address */
 #define SDW_SDCA_VALID_CTL(reg) (((reg) & (GENMASK(31, 25) | BIT(18) | BIT(13))) == BIT(30))
 
+#define SDW_SDCA_MAX_REGISTER			0x47FFFFFF
+
 #endif /* __SDW_REGISTERS_H */
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
index e7f36d668f15..cbce51be0ba9 100644
--- a/sound/soc/sdca/Kconfig
+++ b/sound/soc/sdca/Kconfig
@@ -37,4 +37,14 @@ config SND_SOC_SDCA_FDL
 config SND_SOC_SDCA_OPTIONAL
 	def_tristate SND_SOC_SDCA || !SND_SOC_SDCA
 
+config SND_SOC_SDCA_CLASS
+	tristate "SDCA Class Driver"
+	depends on SND_SOC_SDCA
+	select SND_SOC_SDCA_FDL
+	select SND_SOC_SDCA_HID
+	select SND_SOC_SDCA_IRQ
+	help
+	  This option enables support for the SDCA Class driver which should
+	  support any class compliant SDCA part.
+
 endmenu
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index babe3fa2bb3f..95db4cef3483 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -6,4 +6,8 @@ snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_HID) += sdca_hid.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_IRQ) += sdca_interrupts.o
 snd-soc-sdca-$(CONFIG_SND_SOC_SDCA_FDL) += sdca_fdl.o
 
+snd-soc-sdca-class-y := sdca_class.o
+
 obj-$(CONFIG_SND_SOC_SDCA) += snd-soc-sdca.o
+
+obj-$(CONFIG_SND_SOC_SDCA_CLASS) += snd-soc-sdca-class.o
diff --git a/sound/soc/sdca/sdca_class.c b/sound/soc/sdca/sdca_class.c
new file mode 100644
index 000000000000..349d32933ba8
--- /dev/null
+++ b/sound/soc/sdca/sdca_class.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_registers.h>
+#include <linux/soundwire/sdw_type.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+#include <sound/sdca_interrupts.h>
+#include <sound/sdca_regmap.h>
+#include "sdca_class.h"
+
+#define CLASS_SDW_ATTACH_TIMEOUT_MS	5000
+
+static int class_read_prop(struct sdw_slave *sdw)
+{
+	struct sdw_slave_prop *prop = &sdw->prop;
+
+	sdw_slave_read_prop(sdw);
+
+	prop->use_domain_irq = true;
+	prop->scp_int1_mask = SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY |
+			      SDW_SCP_INT1_IMPL_DEF;
+
+	return 0;
+}
+
+static int class_sdw_update_status(struct sdw_slave *sdw, enum sdw_slave_status status)
+{
+	struct sdca_class_drv *drv = dev_get_drvdata(&sdw->dev);
+
+	switch (status) {
+	case SDW_SLAVE_ATTACHED:
+		dev_dbg(drv->dev, "device attach\n");
+
+		drv->attached = true;
+
+		complete(&drv->device_attach);
+		break;
+	case SDW_SLAVE_UNATTACHED:
+		dev_dbg(drv->dev, "device detach\n");
+
+		drv->attached = false;
+
+		reinit_completion(&drv->device_attach);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static const struct sdw_slave_ops class_sdw_ops = {
+	.read_prop	= class_read_prop,
+	.update_status	= class_sdw_update_status,
+};
+
+static void class_regmap_lock(void *data)
+{
+	struct mutex *lock = data;
+
+	mutex_lock(lock);
+}
+
+static void class_regmap_unlock(void *data)
+{
+	struct mutex *lock = data;
+
+	mutex_unlock(lock);
+}
+
+static int class_wait_for_attach(struct sdca_class_drv *drv)
+{
+	if (!drv->attached) {
+		unsigned long timeout = msecs_to_jiffies(CLASS_SDW_ATTACH_TIMEOUT_MS);
+		unsigned long time;
+
+		time = wait_for_completion_timeout(&drv->device_attach, timeout);
+		if (!time) {
+			dev_err(drv->dev, "timed out waiting for device re-attach\n");
+			return -ETIMEDOUT;
+		}
+	}
+
+	regcache_cache_only(drv->dev_regmap, false);
+
+	return 0;
+}
+
+static bool class_dev_regmap_volatile(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case SDW_SCP_SDCA_INTMASK1 ... SDW_SCP_SDCA_INTMASK4:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static bool class_dev_regmap_precious(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case SDW_SCP_SDCA_INT1 ... SDW_SCP_SDCA_INT4:
+	case SDW_SCP_SDCA_INTMASK1 ... SDW_SCP_SDCA_INTMASK4:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static const struct regmap_config class_dev_regmap_config = {
+	.name			= "sdca-device",
+	.reg_bits		= 32,
+	.val_bits		= 8,
+
+	.max_register		= SDW_SDCA_MAX_REGISTER,
+	.volatile_reg		= class_dev_regmap_volatile,
+	.precious_reg		= class_dev_regmap_precious,
+
+	.cache_type		= REGCACHE_MAPLE,
+
+	.lock			= class_regmap_lock,
+	.unlock			= class_regmap_unlock,
+};
+
+static void class_boot_work(struct work_struct *work)
+{
+	struct sdca_class_drv *drv = container_of(work,
+						  struct sdca_class_drv,
+						  boot_work);
+	int ret;
+
+	ret = class_wait_for_attach(drv);
+	if (ret)
+		goto err;
+
+	drv->irq_info = sdca_irq_allocate(drv->dev, drv->dev_regmap,
+					  drv->sdw->irq);
+	if (IS_ERR(drv->irq_info))
+		goto err;
+
+	ret = sdca_dev_register_functions(drv->sdw);
+	if (ret)
+		goto err;
+
+	dev_dbg(drv->dev, "boot work complete\n");
+
+	pm_runtime_mark_last_busy(drv->dev);
+	pm_runtime_put_autosuspend(drv->dev);
+
+	return;
+
+err:
+	pm_runtime_put_sync(drv->dev);
+}
+
+static void class_dev_remove(void *data)
+{
+	struct sdca_class_drv *drv = data;
+
+	cancel_work_sync(&drv->boot_work);
+
+	sdca_dev_unregister_functions(drv->sdw);
+}
+
+static int class_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id *id)
+{
+	struct device *dev = &sdw->dev;
+	struct sdca_device_data *data = &sdw->sdca_data;
+	struct regmap_config *dev_config;
+	struct sdca_class_drv *drv;
+	int ret;
+
+	sdca_lookup_swft(sdw);
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	dev_config = devm_kmemdup(dev, &class_dev_regmap_config,
+				  sizeof(*dev_config), GFP_KERNEL);
+	if (!dev_config)
+		return -ENOMEM;
+
+	drv->functions = devm_kcalloc(dev, data->num_functions,
+				      sizeof(*drv->functions),
+				      GFP_KERNEL);
+	if (!drv->functions)
+		return -ENOMEM;
+
+	drv->dev = dev;
+	drv->sdw = sdw;
+	mutex_init(&drv->regmap_lock);
+
+	dev_set_drvdata(drv->dev, drv);
+
+	INIT_WORK(&drv->boot_work, class_boot_work);
+	init_completion(&drv->device_attach);
+
+	dev_config->lock_arg = &drv->regmap_lock;
+
+	drv->dev_regmap = devm_regmap_init_sdw(sdw, dev_config);
+	if (IS_ERR(drv->dev_regmap))
+		return dev_err_probe(drv->dev, PTR_ERR(drv->dev_regmap),
+				     "failed to create device regmap\n");
+
+	regcache_cache_only(drv->dev_regmap, true);
+
+	pm_runtime_set_autosuspend_delay(dev, 250);
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_set_active(dev);
+	pm_runtime_get_noresume(dev);
+
+	ret = devm_pm_runtime_enable(dev);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action_or_reset(dev, class_dev_remove, drv);
+	if (ret)
+		return ret;
+
+	queue_work(system_long_wq, &drv->boot_work);
+
+	return 0;
+}
+
+static int class_runtime_suspend(struct device *dev)
+{
+	struct sdca_class_drv *drv = dev_get_drvdata(dev);
+
+	/*
+	 * Whilst the driver doesn't power the chip down here, going into runtime
+	 * suspend lets the SoundWire bus power down, which means the driver
+	 * can't communicate with the device any more.
+	 */
+	regcache_cache_only(drv->dev_regmap, true);
+
+	return 0;
+}
+
+static int class_runtime_resume(struct device *dev)
+{
+	struct sdca_class_drv *drv = dev_get_drvdata(dev);
+	int ret;
+
+	ret = class_wait_for_attach(drv);
+	if (ret)
+		goto err;
+
+	regcache_mark_dirty(drv->dev_regmap);
+
+	ret = regcache_sync(drv->dev_regmap);
+	if (ret) {
+		dev_err(drv->dev, "failed to restore cache: %d\n", ret);
+		goto err;
+	}
+
+	return 0;
+
+err:
+	regcache_cache_only(drv->dev_regmap, true);
+
+	return ret;
+}
+
+static const struct dev_pm_ops class_pm_ops = {
+	RUNTIME_PM_OPS(class_runtime_suspend, class_runtime_resume, NULL)
+};
+
+static const struct sdw_device_id class_sdw_id[] = {
+	SDW_SLAVE_ENTRY(0x01FA, 0x4245, 0),
+	{}
+};
+MODULE_DEVICE_TABLE(sdw, class_sdw_id);
+
+static struct sdw_driver class_sdw_driver = {
+	.driver = {
+		.name		= "sdca_class",
+		.pm		= pm_ptr(&class_pm_ops),
+	},
+
+	.probe		= class_sdw_probe,
+	.id_table	= class_sdw_id,
+	.ops		= &class_sdw_ops,
+};
+module_sdw_driver(class_sdw_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SDCA Class Driver");
+MODULE_IMPORT_NS("SND_SOC_SDCA");
diff --git a/sound/soc/sdca/sdca_class.h b/sound/soc/sdca/sdca_class.h
new file mode 100644
index 000000000000..bb4c9dd12429
--- /dev/null
+++ b/sound/soc/sdca/sdca_class.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright (C) 2025 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef __SDCA_CLASS_H__
+#define __SDCA_CLASS_H__
+
+#include <linux/completion.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct device;
+struct regmap;
+struct sdw_slave;
+struct sdca_function_data;
+
+struct sdca_class_drv {
+	struct device *dev;
+	struct regmap *dev_regmap;
+	struct sdw_slave *sdw;
+
+	struct sdca_function_data *functions;
+	struct sdca_interrupt_info *irq_info;
+
+	struct mutex regmap_lock;
+	struct work_struct boot_work;
+	struct completion device_attach;
+
+	bool attached;
+};
+
+#endif /* __SDCA_CLASS_H__ */
-- 
cgit v1.2.3


From f58ef9d1d1355b15443719df95081f193067ab88 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:20 +0200
Subject: PCI/P2PDMA: Separate the mmap() support from the core logic

Currently the P2PDMA code requires a pgmap and a struct page to
function. This was serving three important purposes:

 - DMA API compatibility, where scatterlist required a struct page as
   input

 - Life cycle management, the percpu_ref is used to prevent UAF during
   device hot unplug

 - A way to get the P2P provider data through the pci_p2pdma_pagemap

The DMA API now has a new flow, and has gained phys_addr_t support, so
it no longer needs struct pages to perform P2P mapping.

Lifecycle management can be delegated to the user, DMABUF for instance
has a suitable invalidation protocol that does not require struct page.

Finding the P2P provider data can also be managed by the caller
without need to look it up from the phys_addr.

Split the P2PDMA code into two layers. The optional upper layer,
effectively, provides a way to mmap() P2P memory into a VMA by
providing struct page, pgmap, a genalloc and sysfs.

The lower layer provides the actual P2P infrastructure and is wrapped
up in a new struct p2pdma_provider. Rework the mmap layer to use new
p2pdma_provider based APIs.

Drivers that do not want to put P2P memory into VMA's can allocate a
struct p2pdma_provider after probe() starts and free it before
remove() completes. When DMA mapping the driver must convey the struct
p2pdma_provider to the DMA mapping code along with a phys_addr of the
MMIO BAR slice to map. The driver must ensure that no DMA mapping
outlives the lifetime of the struct p2pdma_provider.

The intended target of this new API layer is DMABUF. There is usually
only a single p2pdma_provider for a DMABUF exporter. Most drivers can
establish the p2pdma_provider during probe, access the single instance
during DMABUF attach and use that to drive the DMA mapping.

DMABUF provides an invalidation mechanism that can guarantee all DMA
is halted and the DMA mappings are undone prior to destroying the
struct p2pdma_provider. This ensures there is no UAF through DMABUFs
that are lingering past driver removal.

The new p2pdma_provider layer cannot be used to create P2P memory that
can be mapped into VMA's, be used with pin_user_pages(), O_DIRECT, and
so on. These use cases must still use the mmap() layer. The
p2pdma_provider layer is principally for DMABUF-like use cases where
DMABUF natively manages the life cycle and access instead of
vmas/pin_user_pages()/struct page.

In addition, remove the bus_off field from pci_p2pdma_map_state since
it duplicates information already available in the pgmap structure.
The bus_offset is only used in one location (pci_p2pdma_bus_addr_map)
and is always identical to pgmap->bus_offset.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-1-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/pci/p2pdma.c       | 43 +++++++++++++++++++++++--------------------
 include/linux/pci-p2pdma.h | 19 ++++++++++++++-----
 2 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 78e108e47254..59cd6fb40e83 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -28,9 +28,8 @@ struct pci_p2pdma {
 };
 
 struct pci_p2pdma_pagemap {
-	struct pci_dev *provider;
-	u64 bus_offset;
 	struct dev_pagemap pgmap;
+	struct p2pdma_provider mem;
 };
 
 static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
@@ -204,8 +203,8 @@ static void p2pdma_page_free(struct page *page)
 {
 	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
 	/* safe to dereference while a reference is held to the percpu ref */
-	struct pci_p2pdma *p2pdma =
-		rcu_dereference_protected(pgmap->provider->p2pdma, 1);
+	struct pci_p2pdma *p2pdma = rcu_dereference_protected(
+		to_pci_dev(pgmap->mem.owner)->p2pdma, 1);
 	struct percpu_ref *ref;
 
 	gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
@@ -270,14 +269,15 @@ out:
 
 static void pci_p2pdma_unmap_mappings(void *data)
 {
-	struct pci_dev *pdev = data;
+	struct pci_p2pdma_pagemap *p2p_pgmap = data;
 
 	/*
 	 * Removing the alloc attribute from sysfs will call
 	 * unmap_mapping_range() on the inode, teardown any existing userspace
 	 * mappings and prevent new ones from being created.
 	 */
-	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+	sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj,
+				     &p2pmem_alloc_attr.attr,
 				     p2pmem_group.name);
 }
 
@@ -328,10 +328,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->nr_range = 1;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->ops = &p2pdma_pgmap_ops;
-
-	p2p_pgmap->provider = pdev;
-	p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
-		pci_resource_start(pdev, bar);
+	p2p_pgmap->mem.owner = &pdev->dev;
+	p2p_pgmap->mem.bus_offset =
+		pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar);
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
@@ -340,7 +339,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	}
 
 	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
-					 pdev);
+					 p2p_pgmap);
 	if (error)
 		goto pages_free;
 
@@ -972,16 +971,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
-						    struct device *dev)
+static enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
 {
 	enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
-	struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
+	struct pci_dev *pdev = to_pci_dev(provider->owner);
 	struct pci_dev *client;
 	struct pci_p2pdma *p2pdma;
 	int dist;
 
-	if (!provider->p2pdma)
+	if (!pdev->p2pdma)
 		return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
 	if (!dev_is_pci(dev))
@@ -990,7 +989,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 	client = to_pci_dev(dev);
 
 	rcu_read_lock();
-	p2pdma = rcu_dereference(provider->p2pdma);
+	p2pdma = rcu_dereference(pdev->p2pdma);
 
 	if (p2pdma)
 		type = xa_to_value(xa_load(&p2pdma->map_types,
@@ -998,7 +997,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 	rcu_read_unlock();
 
 	if (type == PCI_P2PDMA_MAP_UNKNOWN)
-		return calc_map_type_and_dist(provider, client, &dist, true);
+		return calc_map_type_and_dist(pdev, client, &dist, true);
 
 	return type;
 }
@@ -1006,9 +1005,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
 		struct device *dev, struct page *page)
 {
-	state->pgmap = page_pgmap(page);
-	state->map = pci_p2pdma_map_type(state->pgmap, dev);
-	state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+	struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
+
+	if (state->mem == &p2p_pgmap->mem)
+		return;
+
+	state->mem = &p2p_pgmap->mem;
+	state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev);
 }
 
 /**
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 951f81a38f3a..1400f3ad4299 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -16,6 +16,16 @@
 struct block_device;
 struct scatterlist;
 
+/**
+ * struct p2pdma_provider
+ *
+ * A p2pdma provider is a range of MMIO address space available to the CPU.
+ */
+struct p2pdma_provider {
+	struct device *owner;
+	u64 bus_offset;
+};
+
 #ifdef CONFIG_PCI_P2PDMA
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		u64 offset);
@@ -139,11 +149,11 @@ enum pci_p2pdma_map_type {
 };
 
 struct pci_p2pdma_map_state {
-	struct dev_pagemap *pgmap;
+	struct p2pdma_provider *mem;
 	enum pci_p2pdma_map_type map;
-	u64 bus_off;
 };
 
+
 /* helper for pci_p2pdma_state(), do not use directly */
 void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
 		struct device *dev, struct page *page);
@@ -162,8 +172,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
 		struct page *page)
 {
 	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
-		if (state->pgmap != page_pgmap(page))
-			__pci_p2pdma_update_state(state, dev, page);
+		__pci_p2pdma_update_state(state, dev, page);
 		return state->map;
 	}
 	return PCI_P2PDMA_MAP_NONE;
@@ -181,7 +190,7 @@ static inline dma_addr_t
 pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr)
 {
 	WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR);
-	return paddr + state->bus_off;
+	return paddr + state->mem->bus_offset;
 }
 
 #endif /* _LINUX_PCI_P2P_H */
-- 
cgit v1.2.3


From d4504262f745e48c1739c8b864f779b4b0f9de80 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:21 +0200
Subject: PCI/P2PDMA: Simplify bus address mapping API

Update the pci_p2pdma_bus_addr_map() function to take a direct pointer
to the p2pdma_provider structure instead of the pci_p2pdma_map_state.
This simplifies the API by removing the need for callers to extract
the provider from the state structure.

The change updates all callers across the kernel (block layer, IOMMU,
DMA direct, and HMM) to pass the provider pointer directly, making
the code more explicit and reducing unnecessary indirection. This
also removes the runtime warning check since callers now have direct
control over which provider they use.

Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-2-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 block/blk-mq-dma.c         | 2 +-
 drivers/iommu/dma-iommu.c  | 4 ++--
 include/linux/pci-p2pdma.h | 7 +++----
 kernel/dma/direct.c        | 4 ++--
 mm/hmm.c                   | 2 +-
 5 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 449950029872..a1b623744b2f 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -85,7 +85,7 @@ static inline bool blk_can_dma_map_iova(struct request *req,
 
 static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
 {
-	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+	iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
 	iter->len = vec->len;
 	return true;
 }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7944a3af4545..e52d19d2e833 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 			 * as a bus address, __finalise_sg() will copy the dma
 			 * address into the output segment.
 			 */
-			s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
-						sg_phys(s));
+			s->dma_address = pci_p2pdma_bus_addr_map(
+				p2pdma_state.mem, sg_phys(s));
 			sg_dma_len(s) = sg->length;
 			sg_dma_mark_bus_address(s);
 			continue;
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 1400f3ad4299..9516ef97b17a 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -181,16 +181,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
 /**
  * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address
  *			     for a PCI_P2PDMA_MAP_BUS_ADDR transfer.
- * @state:	P2P state structure
+ * @provider:	P2P provider structure
  * @paddr:	physical address to map
  *
  * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer.
  */
 static inline dma_addr_t
-pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr)
+pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr)
 {
-	WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR);
-	return paddr + state->mem->bus_offset;
+	return paddr + provider->bus_offset;
 }
 
 #endif /* _LINUX_PCI_P2P_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..d8b3dfc598b2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 			}
 			break;
 		case PCI_P2PDMA_MAP_BUS_ADDR:
-			sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
-					sg_phys(sg));
+			sg->dma_address = pci_p2pdma_bus_addr_map(
+				p2pdma_state.mem, sg_phys(sg));
 			sg_dma_mark_bus_address(sg);
 			continue;
 		default:
diff --git a/mm/hmm.c b/mm/hmm.c
index 87562914670a..9bf0b831a029 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -811,7 +811,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
 		break;
 	case PCI_P2PDMA_MAP_BUS_ADDR:
 		pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
-		return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+		return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr);
 	default:
 		return DMA_MAPPING_ERROR;
 	}
-- 
cgit v1.2.3


From 372d6d1b8ae3cdfe6b0638a0a848c6865ec94567 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:22 +0200
Subject: PCI/P2PDMA: Refactor to separate core P2P functionality from memory
 allocation

Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA
functionality from the optional memory allocation layer. This creates a
two-tier architecture:

The core layer provides P2P mapping functionality for physical addresses
based on PCI device MMIO BARs and integrates with the DMA API for
mapping operations. This layer is required for all P2PDMA users.

The optional upper layer provides memory allocation capabilities
including gen_pool allocator, struct page support, and sysfs interface
for user space access.

This separation allows subsystems like DMABUF to use only the core P2P
mapping functionality without the overhead of memory allocation features
they don't need. The core functionality is now available through the
new pcim_p2pdma_provider() function that returns a p2pdma_provider
structure.

Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-3-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/pci/p2pdma.c       | 151 +++++++++++++++++++++++++++++++++++----------
 include/linux/pci-p2pdma.h |  11 ++++
 2 files changed, 131 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 59cd6fb40e83..855d3493634c 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -25,11 +25,12 @@ struct pci_p2pdma {
 	struct gen_pool *pool;
 	bool p2pmem_published;
 	struct xarray map_types;
+	struct p2pdma_provider mem[PCI_STD_NUM_BARS];
 };
 
 struct pci_p2pdma_pagemap {
 	struct dev_pagemap pgmap;
-	struct p2pdma_provider mem;
+	struct p2pdma_provider *mem;
 };
 
 static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
@@ -204,7 +205,7 @@ static void p2pdma_page_free(struct page *page)
 	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
 	/* safe to dereference while a reference is held to the percpu ref */
 	struct pci_p2pdma *p2pdma = rcu_dereference_protected(
-		to_pci_dev(pgmap->mem.owner)->p2pdma, 1);
+		to_pci_dev(pgmap->mem->owner)->p2pdma, 1);
 	struct percpu_ref *ref;
 
 	gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
@@ -227,44 +228,123 @@ static void pci_p2pdma_release(void *data)
 
 	/* Flush and disable pci_alloc_p2p_mem() */
 	pdev->p2pdma = NULL;
-	synchronize_rcu();
+	if (p2pdma->pool)
+		synchronize_rcu();
+	xa_destroy(&p2pdma->map_types);
+
+	if (!p2pdma->pool)
+		return;
 
 	gen_pool_destroy(p2pdma->pool);
 	sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
-	xa_destroy(&p2pdma->map_types);
 }
 
-static int pci_p2pdma_setup(struct pci_dev *pdev)
+/**
+ * pcim_p2pdma_init - Initialise peer-to-peer DMA providers
+ * @pdev: The PCI device to enable P2PDMA for
+ *
+ * Allocate and set up the device-managed data structures that @pdev needs
+ * for P2PDMA, including mapping type tracking. Does nothing if they exist.
+ *
+ * Return: 0 on success or if already initialised, negative errno on failure.
+ */
+int pcim_p2pdma_init(struct pci_dev *pdev)
 {
-	int error = -ENOMEM;
 	struct pci_p2pdma *p2p;
+	int i, ret;
+
+	p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+	if (p2p)
+		return 0;
 
 	p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
 	if (!p2p)
 		return -ENOMEM;
 
 	xa_init(&p2p->map_types);
+	/*
+	 * Iterate over all standard PCI BARs and record only those that
+	 * correspond to MMIO regions. Skip non-memory resources (e.g. I/O
+	 * port BARs) since they cannot be used for peer-to-peer (P2P)
+	 * transactions.
+	 */
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
 
-	p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
-	if (!p2p->pool)
-		goto out;
+		p2p->mem[i].owner = &pdev->dev;
+		p2p->mem[i].bus_offset =
+			pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
+	}
 
-	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
-	if (error)
-		goto out_pool_destroy;
+	ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
+	if (ret)
+		goto out_p2p;
 
-	error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
-	if (error)
+	rcu_assign_pointer(pdev->p2pdma, p2p);
+	return 0;
+
+out_p2p:
+	devm_kfree(&pdev->dev, p2p);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_init);
+
+/**
+ * pcim_p2pdma_provider - Get peer-to-peer DMA provider
+ * @pdev: The PCI device to enable P2PDMA for
+ * @bar: BAR index to get provider
+ *
+ * This function gets peer-to-peer DMA provider for a PCI device. The lifetime
+ * of the provider (and of course the MMIO) is bound to the lifetime of the
+ * driver. A driver calling this function must ensure that all references to the
+ * provider, and any DMA mappings created for any MMIO, are all cleaned up
+ * before the driver remove() completes.
+ *
+ * Since P2P is almost always shared with a second driver this means some system
+ * to notify, invalidate and revoke the MMIO's DMA must be in place to use this
+ * function. For example a revoke can be built using DMABUF.
+ */
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar)
+{
+	struct pci_p2pdma *p2p;
+
+	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
+		return NULL;
+
+	p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+	if (WARN_ON(!p2p))
+		/* Someone forgot to call pcim_p2pdma_init() first */
+		return NULL;
+
+	return &p2p->mem[bar];
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_provider);
+
+static int pci_p2pdma_setup_pool(struct pci_dev *pdev)
+{
+	struct pci_p2pdma *p2pdma;
+	int ret;
+
+	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+	if (p2pdma->pool)
+		/* The pool is already set up, nothing to do. */
+		return 0;
+
+	p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
+	if (!p2pdma->pool)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
+	if (ret)
 		goto out_pool_destroy;
 
-	rcu_assign_pointer(pdev->p2pdma, p2p);
 	return 0;
 
 out_pool_destroy:
-	gen_pool_destroy(p2p->pool);
-out:
-	devm_kfree(&pdev->dev, p2p);
-	return error;
+	gen_pool_destroy(p2pdma->pool);
+	p2pdma->pool = NULL;
+	return ret;
 }
 
 static void pci_p2pdma_unmap_mappings(void *data)
@@ -276,7 +356,7 @@ static void pci_p2pdma_unmap_mappings(void *data)
 	 * unmap_mapping_range() on the inode, teardown any existing userspace
 	 * mappings and prevent new ones from being created.
 	 */
-	sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj,
+	sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj,
 				     &p2pmem_alloc_attr.attr,
 				     p2pmem_group.name);
 }
@@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			    u64 offset)
 {
 	struct pci_p2pdma_pagemap *p2p_pgmap;
+	struct p2pdma_provider *mem;
 	struct dev_pagemap *pgmap;
 	struct pci_p2pdma *p2pdma;
 	void *addr;
@@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	if (size + offset > pci_resource_len(pdev, bar))
 		return -EINVAL;
 
-	if (!pdev->p2pdma) {
-		error = pci_p2pdma_setup(pdev);
-		if (error)
-			return error;
-	}
+	error = pcim_p2pdma_init(pdev);
+	if (error)
+		return error;
+
+	error = pci_p2pdma_setup_pool(pdev);
+	if (error)
+		return error;
+
+	mem = pcim_p2pdma_provider(pdev, bar);
+	/*
+	 * We checked validity of BAR prior to call
+	 * to pcim_p2pdma_provider. It should never return NULL.
+	 */
+	if (WARN_ON(!mem))
+		return -EINVAL;
 
 	p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
 	if (!p2p_pgmap)
@@ -328,9 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->nr_range = 1;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->ops = &p2pdma_pgmap_ops;
-	p2p_pgmap->mem.owner = &pdev->dev;
-	p2p_pgmap->mem.bus_offset =
-		pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar);
+	p2p_pgmap->mem = mem;
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
@@ -1007,11 +1096,11 @@ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
 {
 	struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
 
-	if (state->mem == &p2p_pgmap->mem)
+	if (state->mem == p2p_pgmap->mem)
 		return;
 
-	state->mem = &p2p_pgmap->mem;
-	state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev);
+	state->mem = p2p_pgmap->mem;
+	state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev);
 }
 
 /**
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 9516ef97b17a..15471252817b 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -27,6 +27,8 @@ struct p2pdma_provider {
 };
 
 #ifdef CONFIG_PCI_P2PDMA
+int pcim_p2pdma_init(struct pci_dev *pdev);
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar);
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		u64 offset);
 int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
@@ -44,6 +46,15 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
 			       bool use_p2pdma);
 #else /* CONFIG_PCI_P2PDMA */
+static inline int pcim_p2pdma_init(struct pci_dev *pdev)
+{
+	return -EOPNOTSUPP;
+}
+static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev,
+							   int bar)
+{
+	return NULL;
+}
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
 		size_t size, u64 offset)
 {
-- 
cgit v1.2.3


From 395698bd2cd7639b85784a4a8f5ddb7a581e353c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:23 +0200
Subject: PCI/P2PDMA: Provide an access to pci_p2pdma_map_type() function

Provide access to the pci_p2pdma_map_type() function to allow subsystems
to determine the appropriate mapping type for P2PDMA transfers between
a provider and a target device.

The pci_p2pdma_map_type() function is the core P2P layer version of
the existing public, but struct page focused, pci_p2pdma_state()
function. It returns the same result. It is required to use the p2p
subsystem from drivers that don't use the struct page layer.

Like __pci_p2pdma_update_state() it is not an exported function. The
idea is that only subsystem code will implement mapping helpers for
taking in phys_addr_t lists, this is deliberately not made accessible
to every driver to prevent abuse.

Following patches will use this function to implement a shared DMA
mapping helper for DMABUF.

Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-4-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/pci/p2pdma.c       | 14 ++++++--
 include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++++++---------------------
 2 files changed, 58 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 855d3493634c..981a76b6b7c0 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -1060,8 +1060,18 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
-static enum pci_p2pdma_map_type
-pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+/**
+ * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers
+ * @provider: P2PDMA provider structure
+ * @dev: Target device for the transfer
+ *
+ * Determines how peer-to-peer DMA transfers should be mapped between
+ * the provider and the target device. The mapping type indicates whether
+ * the transfer can be done directly through PCI switches or must go
+ * through the host bridge.
+ */
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+					     struct device *dev)
 {
 	enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
 	struct pci_dev *pdev = to_pci_dev(provider->owner);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 15471252817b..517e121d2598 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -26,6 +26,45 @@ struct p2pdma_provider {
 	u64 bus_offset;
 };
 
+enum pci_p2pdma_map_type {
+	/*
+	 * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
+	 * the mapping type has been calculated. Exported routines for the API
+	 * will never return this value.
+	 */
+	PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+	/*
+	 * PCI_P2PDMA_MAP_NONE: Not a PCI P2PDMA transfer.
+	 */
+	PCI_P2PDMA_MAP_NONE,
+
+	/*
+	 * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+	 * traverse the host bridge and the host bridge is not in the
+	 * allowlist. DMA Mapping routines should return an error when
+	 * this is returned.
+	 */
+	PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+	/*
+	 * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
+	 * each other directly through a PCI switch and the transaction will
+	 * not traverse the host bridge. Such a mapping should program
+	 * the DMA engine with PCI bus addresses.
+	 */
+	PCI_P2PDMA_MAP_BUS_ADDR,
+
+	/*
+	 * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+	 * to each other, but the transaction traverses a host bridge on the
+	 * allowlist. In this case, a normal mapping either with CPU physical
+	 * addresses (in the case of dma-direct) or IOVA addresses (in the
+	 * case of IOMMUs) should be used to program the DMA engine.
+	 */
+	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
 #ifdef CONFIG_PCI_P2PDMA
 int pcim_p2pdma_init(struct pci_dev *pdev);
 struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar);
@@ -45,6 +84,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 			    bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
 			       bool use_p2pdma);
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+					     struct device *dev);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pcim_p2pdma_init(struct pci_dev *pdev)
 {
@@ -106,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
 {
 	return sprintf(page, "none\n");
 }
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+{
+	return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
 #endif /* CONFIG_PCI_P2PDMA */
 
 
@@ -120,45 +166,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
 	return pci_p2pmem_find_many(&client, 1);
 }
 
-enum pci_p2pdma_map_type {
-	/*
-	 * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
-	 * the mapping type has been calculated. Exported routines for the API
-	 * will never return this value.
-	 */
-	PCI_P2PDMA_MAP_UNKNOWN = 0,
-
-	/*
-	 * Not a PCI P2PDMA transfer.
-	 */
-	PCI_P2PDMA_MAP_NONE,
-
-	/*
-	 * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
-	 * traverse the host bridge and the host bridge is not in the
-	 * allowlist. DMA Mapping routines should return an error when
-	 * this is returned.
-	 */
-	PCI_P2PDMA_MAP_NOT_SUPPORTED,
-
-	/*
-	 * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
-	 * each other directly through a PCI switch and the transaction will
-	 * not traverse the host bridge. Such a mapping should program
-	 * the DMA engine with PCI bus addresses.
-	 */
-	PCI_P2PDMA_MAP_BUS_ADDR,
-
-	/*
-	 * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
-	 * to each other, but the transaction traverses a host bridge on the
-	 * allowlist. In this case, a normal mapping either with CPU physical
-	 * addresses (in the case of dma-direct) or IOVA addresses (in the
-	 * case of IOMMUs) should be used to program the DMA engine.
-	 */
-	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
 struct pci_p2pdma_map_state {
 	struct p2pdma_provider *mem;
 	enum pci_p2pdma_map_type map;
-- 
cgit v1.2.3


From 3aa31a8bb11e47c0ff2b306988d1756b810c1c3c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:25 +0200
Subject: dma-buf: provide phys_vec to scatter-gather mapping routine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dma_buf_phys_vec_to_sgt() and dma_buf_free_sgt() helpers to convert
an array of MMIO physical address ranges into scatter-gather tables with
proper DMA mapping.

These common functions are a starting point and support any PCI
drivers creating mappings from their BAR's MMIO addresses. VFIO is one
case, as shortly will be RDMA. We can review existing DRM drivers to
refactor them separately. We hope this will evolve into routines to
help common DRM that include mixed CPU and MMIO mappings.

Compared to the dma_map_resource() abuse this implementation handles
the complicated PCI P2P scenarios properly, especially when an IOMMU
is enabled:

 - Direct bus address mapping without IOVA allocation for
   PCI_P2PDMA_MAP_BUS_ADDR, using pci_p2pdma_bus_addr_map(). This
   happens if the IOMMU is enabled but the PCIe switch ACS flags allow
   transactions to avoid the host bridge.

   Further, this handles the slightly obscure case of MMIO with a
   phys_addr_t that is different from the physical BAR programming
   (bus offset). The phys_addr_t is converted to a dma_addr_t and
   accommodates this effect. This enables certain real systems to
   work, especially on ARM platforms.

 - Mapping through host bridge with IOVA allocation and DMA_ATTR_MMIO
   attribute for MMIO memory regions (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE).
   This happens when the IOMMU is enabled and the ACS flags are forcing
   all traffic to the IOMMU - ie for virtualization systems.

 - Cases where P2P is not supported through the host bridge/CPU. The
   P2P subsystem is the proper place to detect this and block it.

Helper functions fill_sg_entry() and calc_sg_nents() handle the
scatter-gather table construction, splitting large regions into
UINT_MAX-sized chunks to fit within sg->length field limits.

Since the physical address based DMA API forbids use of the CPU list
of the scatterlist this will produce a mangled scatterlist that has
a fully zero-length and NULL'd CPU list. The list is 0 length,
all the struct page pointers are NULL and zero sized. This is stronger
and more robust than the existing mangle_sg_table() technique. It is
a future project to migrate DMABUF as a subsystem away from using
scatterlist for this data structure.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-6-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/dma-buf/Makefile          |   2 +-
 drivers/dma-buf/dma-buf-mapping.c | 248 ++++++++++++++++++++++++++++++++++++++
 include/linux/dma-buf-mapping.h   |  17 +++
 include/linux/dma-buf.h           |  11 ++
 4 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dma-buf/dma-buf-mapping.c
 create mode 100644 include/linux/dma-buf-mapping.h

(limited to 'include')

diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile
index 70ec901edf2c..2008fb7481b3 100644
--- a/drivers/dma-buf/Makefile
+++ b/drivers/dma-buf/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \
-	 dma-fence-unwrap.o dma-resv.o
+	 dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o
 obj-$(CONFIG_DMABUF_HEAPS)	+= dma-heap.o
 obj-$(CONFIG_DMABUF_HEAPS)	+= heaps/
 obj-$(CONFIG_SYNC_FILE)		+= sync_file.o
diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c
new file mode 100644
index 000000000000..b4819811a64a
--- /dev/null
+++ b/drivers/dma-buf/dma-buf-mapping.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DMA BUF Mapping Helpers
+ *
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/dma-resv.h>
+
+/* Fill SG entries of <= UINT_MAX bytes each; returns the next entry. */
+static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length,
+					 dma_addr_t addr)
+{
+	unsigned int len, nents, i;
+
+	nents = DIV_ROUND_UP(length, UINT_MAX);
+	for (i = 0; i < nents; i++) {
+		len = min_t(size_t, length, UINT_MAX);
+		length -= len;
+		/*
+		 * DMABUF abuses scatterlist: there is no CPU list, only the
+		 * DMA list. Keep the page values NULL/zero so importers
+		 * can't use them; the phys_addr based DMA API does not
+		 * require the CPU list for mapping or unmapping.
+		 */
+		sg_set_page(sgl, NULL, 0, 0);
+		/* Widen i first: i * UINT_MAX would wrap in 32 bits. */
+		sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX;
+		sg_dma_len(sgl) = len;
+		sgl = sg_next(sgl);
+	}
+
+	return sgl;
+}
+
+/* Count the SG entries needed, splitting at UINT_MAX bytes per entry. */
+static unsigned int calc_sg_nents(struct dma_iova_state *state,
+				  struct dma_buf_phys_vec *phys_vec,
+				  size_t nr_ranges, size_t size)
+{
+	unsigned int nents = 0;
+	size_t i;
+
+	if (!state || !dma_use_iova(state)) {
+		for (i = 0; i < nr_ranges; i++)
+			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
+	} else {
+		/*
+		 * IOVA case: a single range of size bytes, split only as
+		 * needed so that each piece fits sg->length.
+		 */
+		nents = DIV_ROUND_UP(size, UINT_MAX);
+	}
+
+	return nents;
+}
+
+/**
+ * struct dma_buf_dma - holds DMA mapping information
+ * @sgt:    Scatter-gather table handed out to the caller
+ * @state:  DMA IOVA state, NULL when mapping by PCI bus address
+ * @size:   Total size of the DMA transfer
+ */
+struct dma_buf_dma {
+	struct sg_table sgt;
+	struct dma_iova_state *state;
+	size_t size;
+};
+
+/**
+ * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment
+ * from arrays of physical vectors. This function is intended for MMIO memory
+ * only.
+ * @attach:	[in]	attachment whose scatterlist is to be returned
+ * @provider:	[in]	p2pdma provider
+ * @phys_vec:	[in]	array of physical vectors
+ * @nr_ranges:	[in]	number of entries in phys_vec array
+ * @size:	[in]	total size of phys_vec
+ * @dir:	[in]	direction of DMA transfer
+ *
+ * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR
+ * on error. May return -EINTR if it is interrupted by a signal.
+ *
+ * On success, the DMA addresses and lengths in the returned scatterlist are
+ * PAGE_SIZE aligned.
+ *
+ * A mapping must be unmapped by using dma_buf_free_sgt().
+ *
+ * NOTE: This function is intended for exporters. If direct traffic routing is
+ * mandatory, the exporter should call pci_p2pdma_map_type() before calling
+ * this function.
+ */
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+					 struct p2pdma_provider *provider,
+					 struct dma_buf_phys_vec *phys_vec,
+					 size_t nr_ranges, size_t size,
+					 enum dma_data_direction dir)
+{
+	size_t i, mapped_len = 0;
+	struct dma_buf_dma *dma;
+	struct scatterlist *sgl;
+	unsigned int nents;
+	dma_addr_t addr;
+	int ret;
+
+	if (WARN_ON(!attach || !attach->dmabuf || !provider))
+		/* This function is supposed to work on MMIO memory only */
+		return ERR_PTR(-EINVAL);
+
+	/* Only dereference attach once it is known to be valid. */
+	dma_resv_assert_held(attach->dmabuf->resv);
+
+	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+	if (!dma)
+		return ERR_PTR(-ENOMEM);
+
+	switch (pci_p2pdma_map_type(provider, attach->dev)) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		/* No IOVA is needed at all for this flow. */
+		break;
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL);
+		if (!dma->state) {
+			ret = -ENOMEM;
+			goto err_free_dma;
+		}
+
+		/* Unchecked: dma_use_iova() below selects the path. */
+		dma_iova_try_alloc(attach->dev, dma->state, 0, size);
+		break;
+	default:
+		ret = -EINVAL;
+		goto err_free_dma;
+	}
+
+	nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size);
+	ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO);
+	if (ret)
+		goto err_free_state;
+
+	sgl = dma->sgt.sgl;
+
+	for (i = 0; i < nr_ranges; i++) {
+		if (!dma->state) {
+			addr = pci_p2pdma_bus_addr_map(provider,
+						       phys_vec[i].paddr);
+		} else if (dma_use_iova(dma->state)) {
+			/* Link at the running offset within the IOVA. */
+			ret = dma_iova_link(attach->dev, dma->state,
+					    phys_vec[i].paddr, mapped_len,
+					    phys_vec[i].len, dir,
+					    DMA_ATTR_MMIO);
+			if (ret)
+				goto err_unmap_dma;
+
+			mapped_len += phys_vec[i].len;
+		} else {
+			addr = dma_map_phys(attach->dev, phys_vec[i].paddr,
+					    phys_vec[i].len, dir,
+					    DMA_ATTR_MMIO);
+			ret = dma_mapping_error(attach->dev, addr);
+			if (ret)
+				goto err_unmap_dma;
+		}
+
+		if (!dma->state || !dma_use_iova(dma->state))
+			sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
+	}
+
+	if (dma->state && dma_use_iova(dma->state)) {
+		WARN_ON_ONCE(mapped_len != size);
+		ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len);
+		if (ret)
+			goto err_unmap_dma;
+
+		sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr);
+	}
+
+	dma->size = size;
+
+	/*
+	 * No CPU list included — set orig_nents = 0 so others can detect
+	 * this via SG table (use nents only).
+	 */
+	dma->sgt.orig_nents = 0;
+
+	/*
+	 * SGL must be NULL to indicate that SGL is the last one
+	 * and we allocated correct number of entries in sg_alloc_table()
+	 */
+	WARN_ON_ONCE(sgl);
+	return &dma->sgt;
+
+err_unmap_dma:
+	if (!i || !dma->state) {
+		; /* Do nothing */
+	} else if (dma_use_iova(dma->state)) {
+		dma_iova_destroy(attach->dev, dma->state, mapped_len, dir,
+				 DMA_ATTR_MMIO);
+	} else {
+		for_each_sgtable_dma_sg(&dma->sgt, sgl, i)
+			dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+				       sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+	}
+	sg_free_table(&dma->sgt);
+err_free_state:
+	kfree(dma->state);
+err_free_dma:
+	kfree(dma);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF");
+
+/**
+ * dma_buf_free_sgt - unmaps the buffer
+ * @attach:	[in]	attachment to unmap buffer from
+ * @sgt:	[in]	scatterlist info of the buffer to unmap
+ * @dir:	[in]	direction of DMA transfer
+ *
+ * This unmaps a DMA mapping for @attach obtained
+ * by dma_buf_phys_vec_to_sgt() and frees @sgt.
+ */
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+		      enum dma_data_direction dir)
+{
+	struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt);
+	int i;
+
+	dma_resv_assert_held(attach->dmabuf->resv);
+
+	if (!dma->state) {
+		; /* Do nothing */
+	} else if (dma_use_iova(dma->state)) {
+		dma_iova_destroy(attach->dev, dma->state, dma->size, dir,
+				 DMA_ATTR_MMIO);
+	} else {
+		struct scatterlist *sgl;
+
+		for_each_sgtable_dma_sg(sgt, sgl, i)
+			dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+				       sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+	}
+
+	sg_free_table(sgt);
+	kfree(dma->state);
+	kfree(dma);
+
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF");
diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h
new file mode 100644
index 000000000000..a3c0ce2d3a42
--- /dev/null
+++ b/include/linux/dma-buf-mapping.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * DMA BUF Mapping Helpers
+ *
+ */
+#ifndef __DMA_BUF_MAPPING_H__
+#define __DMA_BUF_MAPPING_H__
+#include <linux/dma-buf.h>
+
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+					 struct p2pdma_provider *provider,
+					 struct dma_buf_phys_vec *phys_vec,
+					 size_t nr_ranges, size_t size,
+					 enum dma_data_direction dir);
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+		      enum dma_data_direction dir);
+#endif
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d58e329ac0e7..0bc492090237 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/dma-fence.h>
 #include <linux/wait.h>
+#include <linux/pci-p2pdma.h>
 
 struct device;
 struct dma_buf;
@@ -530,6 +531,16 @@ struct dma_buf_export_info {
 	void *priv;
 };
 
+/**
+ * struct dma_buf_phys_vec - describe a contiguous chunk of memory
+ * @paddr:   physical address of that chunk
+ * @len:     length of that chunk
+ */
+struct dma_buf_phys_vec {
+	phys_addr_t paddr;
+	size_t len;
+};
+
 /**
  * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters
  * @name: export-info name
-- 
cgit v1.2.3


From 64a5dedcff801072154a806102d731ecdf0e7552 Mon Sep 17 00:00:00 2001
From: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date: Thu, 20 Nov 2025 11:28:26 +0200
Subject: vfio: Export vfio device get and put registration helpers

These helpers are useful for managing additional references taken
on the device from other associated VFIO modules.

Original-patch-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Alex Mastro <amastro@fb.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/vfio_main.c | 2 ++
 include/linux/vfio.h     | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 38c8e9350a60..9aa4a5d081e8 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device)
 	if (refcount_dec_and_test(&device->refcount))
 		complete(&device->comp);
 }
+EXPORT_SYMBOL_GPL(vfio_device_put_registration);
 
 bool vfio_device_try_get_registration(struct vfio_device *device)
 {
 	return refcount_inc_not_zero(&device->refcount);
 }
+EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
 
 /*
  * VFIO driver API
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index eb563f538dee..217ba4ef1752 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -297,6 +297,8 @@ static inline void vfio_put_device(struct vfio_device *device)
 int vfio_register_group_dev(struct vfio_device *device);
 int vfio_register_emulated_iommu_dev(struct vfio_device *device);
 void vfio_unregister_group_dev(struct vfio_device *device);
+bool vfio_device_try_get_registration(struct vfio_device *device);
+void vfio_device_put_registration(struct vfio_device *device);
 
 int vfio_assign_device_set(struct vfio_device *device, void *set_id);
 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set);
-- 
cgit v1.2.3


From 8312cab5ff4702389a86129051eba6ea046a71a1 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Thu, 20 Nov 2025 15:56:47 +0100
Subject: timers/migration: Rename 'online' bit to 'available'

The timer migration hierarchy excludes offline CPUs via the
tmigr_is_not_available function, which is essentially checking the
online bit for the CPU.

Rename the online bit to available and all references in function names
and tracepoint to generalise the concept of available CPUs.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251120145653.296659-2-gmonaco@redhat.com
---
 include/trace/events/timer_migration.h |  4 ++--
 kernel/time/timer_migration.c          | 24 ++++++++++++------------
 kernel/time/timer_migration.h          |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 47db5eaf2f9a..61171b13c687 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active,
 	TP_ARGS(tmc)
 );
 
-DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online,
+DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available,
 
 	TP_PROTO(struct tmigr_cpu *tmc),
 
 	TP_ARGS(tmc)
 );
 
-DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline,
+DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable,
 
 	TP_PROTO(struct tmigr_cpu *tmc),
 
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 57e38674e56e..2cfebed35e22 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -429,7 +429,7 @@ static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
 {
-	return !(tmc->tmgroup && tmc->online);
+	return !(tmc->tmgroup && tmc->available);
 }
 
 /*
@@ -926,7 +926,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
 	 * updated the event takes care when hierarchy is completely
 	 * idle. Otherwise the migrator does it as the event is enqueued.
 	 */
-	if (!tmc->online || tmc->remote || tmc->cpuevt.ignore ||
+	if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
 	    now < tmc->cpuevt.nextevt.expires) {
 		raw_spin_unlock_irq(&tmc->lock);
 		return;
@@ -973,7 +973,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
 	 * (See also section "Required event and timerqueue update after a
 	 * remote expiry" in the documentation at the top)
 	 */
-	if (!tmc->online || !tmc->idle) {
+	if (!tmc->available || !tmc->idle) {
 		timer_unlock_remote_bases(cpu);
 		goto unlock;
 	}
@@ -1422,19 +1422,19 @@ static long tmigr_trigger_active(void *unused)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
 
-	WARN_ON_ONCE(!tmc->online || tmc->idle);
+	WARN_ON_ONCE(!tmc->available || tmc->idle);
 
 	return 0;
 }
 
-static int tmigr_cpu_offline(unsigned int cpu)
+static int tmigr_clear_cpu_available(unsigned int cpu)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
 	int migrator;
 	u64 firstexp;
 
 	raw_spin_lock_irq(&tmc->lock);
-	tmc->online = false;
+	tmc->available = false;
 	WRITE_ONCE(tmc->wakeup, KTIME_MAX);
 
 	/*
@@ -1442,7 +1442,7 @@ static int tmigr_cpu_offline(unsigned int cpu)
 	 * offline; Therefore nextevt value is set to KTIME_MAX
 	 */
 	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
-	trace_tmigr_cpu_offline(tmc);
+	trace_tmigr_cpu_unavailable(tmc);
 	raw_spin_unlock_irq(&tmc->lock);
 
 	if (firstexp != KTIME_MAX) {
@@ -1453,7 +1453,7 @@ static int tmigr_cpu_offline(unsigned int cpu)
 	return 0;
 }
 
-static int tmigr_cpu_online(unsigned int cpu)
+static int tmigr_set_cpu_available(unsigned int cpu)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
 
@@ -1462,11 +1462,11 @@ static int tmigr_cpu_online(unsigned int cpu)
 		return -EINVAL;
 
 	raw_spin_lock_irq(&tmc->lock);
-	trace_tmigr_cpu_online(tmc);
+	trace_tmigr_cpu_available(tmc);
 	tmc->idle = timer_base_is_idle();
 	if (!tmc->idle)
 		__tmigr_cpu_activate(tmc);
-	tmc->online = true;
+	tmc->available = true;
 	raw_spin_unlock_irq(&tmc->lock);
 	return 0;
 }
@@ -1758,7 +1758,7 @@ static int tmigr_add_cpu(unsigned int cpu)
 		 * The (likely) current CPU is expected to be online in the hierarchy,
 		 * otherwise the old root may not be active as expected.
 		 */
-		WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->online);
+		WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
 		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
 	}
 
@@ -1854,7 +1854,7 @@ static int __init tmigr_init(void)
 		goto err;
 
 	ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
-				tmigr_cpu_online, tmigr_cpu_offline);
+				tmigr_set_cpu_available, tmigr_clear_cpu_available);
 	if (ret)
 		goto err;
 
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index ae19f70f8170..70879cde6fdd 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -97,7 +97,7 @@ struct tmigr_group {
  */
 struct tmigr_cpu {
 	raw_spinlock_t		lock;
-	bool			online;
+	bool			available;
 	bool			idle;
 	bool			remote;
 	struct tmigr_group	*tmgroup;
-- 
cgit v1.2.3


From b56651007fc018effe695a68d48caa6970b23094 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 20 Nov 2025 15:56:52 +0100
Subject: cpumask: Add initialiser to use cleanup helpers

Now we can simplify code that allocates cpumasks for local needs.

Automatic variables have to be initialized at declaration, or at least
before any possibility for the logic to return, so that the compiler
doesn't try to call the associated destructor function on a random stack
value.

Because cpumask_var_t, depending on the CPUMASK_OFFSTACK config, is
either a pointer or an array, we have to have a macro for initialization.

So define a CPUMASK_VAR_NULL macro, which allows to init struct cpumask
pointer with NULL when CPUMASK_OFFSTACK is enabled, and effectively a
no-op when CPUMASK_OFFSTACK is disabled (initialisation optimised out
with -O2).

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-7-gmonaco@redhat.com
---
 include/linux/cpumask.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff8f41ab7ce6..68be522449ec 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1005,6 +1005,7 @@ static __always_inline unsigned int cpumask_size(void)
 
 #define this_cpu_cpumask_var_ptr(x)	this_cpu_read(x)
 #define __cpumask_var_read_mostly	__read_mostly
+#define CPUMASK_VAR_NULL		NULL
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 
@@ -1051,6 +1052,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask)
 
 #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
 #define __cpumask_var_read_mostly
+#define CPUMASK_VAR_NULL {}
 
 static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
-- 
cgit v1.2.3


From 7dec062cfcf27808dbb70a0b231d1a698792743d Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Thu, 20 Nov 2025 15:56:53 +0100
Subject: timers/migration: Exclude isolated cpus from hierarchy

The timer migration mechanism allows active CPUs to pull timers from
idle ones to improve the overall idle time. This is however undesired
when CPU intensive workloads run on isolated cores, as the algorithm
would move the timers from housekeeping to isolated cores, negatively
affecting the isolation.

Exclude isolated cores from the timer migration algorithm, extend the
concept of unavailable cores, currently used for offline ones, to
isolated ones:
* A core is unavailable if isolated or offline;
* A core is available if non isolated and online;

A core is considered unavailable as isolated if it belongs to:
* the isolcpus (domain) list
* an isolated cpuset
Except if it is:
* in the nohz_full list (already idle for the hierarchy)
* the nohz timekeeper core (must be available to handle global timers)

CPUs are added to the hierarchy during late boot, excluding isolated
ones; the hierarchy is also adapted when the cpuset isolation changes.

Due to how the timer migration algorithm works, any CPU that is part of
the hierarchy can have its global timers pulled by remote CPUs and has to
pull remote timers; only skipping pulling remote timers would break the
logic.
For this reason, prevent isolated CPUs from pulling remote global
timers, but also the other way around: any global timer started on an
isolated CPU will run there. This does not break the concept of
isolation (global timers don't come from outside the CPU) and, if
considered inappropriate, can usually be mitigated with other isolation
techniques (e.g. IRQ pinning).

This effect was noticed on a 128 cores machine running oslat on the
isolated cores (1-31,33-63,65-95,97-127). The tool monopolises CPUs,
and the CPU with lowest count in a timer migration hierarchy (here 1
and 65) appears as always active and continuously pulls global timers,
from the housekeeping CPUs. This ends up moving driver work (e.g.
delayed work) to isolated CPUs and causes latency spikes:

before the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:     1203 10 3 4 ... 5 (us)

after the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:      10 4 3 4 3 ... 5 (us)

The same behaviour was observed on a machine with as few as 20 cores /
40 threads with isolcpus set to: 1-9,11-39 with rtla-osnoise-top.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John B. Wyatt IV <jwyatt@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-8-gmonaco@redhat.com
---
 include/linux/timer.h         |   9 +++
 kernel/cgroup/cpuset.c        |   3 +
 kernel/time/timer_migration.c | 143 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)

(limited to 'include')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 0414d9e6b4fc..62e1cea71125 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu);
 #define timers_dead_cpu		NULL
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask);
+#else
+static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cf34623fe66f..bfc3b319e1c0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1350,6 +1350,9 @@ static void update_isolation_cpumasks(bool isolcpus_updated)
 
 	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
 	WARN_ON_ONCE(ret < 0);
+
+	ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+	WARN_ON_ONCE(ret < 0);
 }
 
 /**
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index a01c7f8bdf52..18dda1aa782d 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/timerqueue.h>
 #include <trace/events/ipi.h>
+#include <linux/sched/isolation.h>
 
 #include "timer_migration.h"
 #include "tick-internal.h"
@@ -427,8 +428,13 @@ static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 /*
  * CPUs available for timer migration.
  * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ * Additionally tmigr_available_mutex serializes set/clear operations with each other.
  */
 static cpumask_var_t tmigr_available_cpumask;
+static DEFINE_MUTEX(tmigr_available_mutex);
+
+/* Enabled during late initcall */
+static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
 
 #define TMIGR_NONE	0xFF
 #define BIT_CNT		8
@@ -438,6 +444,33 @@ static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
 	return !(tmc->tmgroup && tmc->available);
 }
 
+/*
+ * Returns true if @cpu should be excluded from the hierarchy as isolated.
+ * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs
+ * are still part of the hierarchy but become idle (from a tick and timer
+ * migration perspective) when they stop their tick. This lets the timekeeping
+ * CPU handle their global timers. Marking also isolated CPUs as idle would be
+ * too costly, hence they are completely excluded from the hierarchy.
+ * This check is necessary, for instance, to prevent offline isolated CPUs from
+ * being incorrectly marked as available once getting back online.
+ *
+ * This function returns false during early boot and the isolation logic is
+ * enabled only after isolated CPUs are marked as unavailable at late boot.
+ * The tick CPU can be isolated at boot, however we cannot mark it as
+ * unavailable to avoid having no global migrator for the nohz_full CPUs. This
+ * should be ensured by the callers of this function: implicitly from hotplug
+ * callbacks and explicitly in tmigr_init_isolation() and
+ * tmigr_isolated_exclude_cpumask().
+ */
+static inline bool tmigr_is_isolated(int cpu)
+{
+	if (!static_branch_unlikely(&tmigr_exclude_isolated))
+		return false;
+	return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
+		cpuset_cpu_is_isolated(cpu)) &&
+	       housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+}
+
 /*
  * Returns true, when @childmask corresponds to the group migrator or when the
  * group is not active - so no migrator is set.
@@ -1439,8 +1472,12 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
 	int migrator;
 	u64 firstexp;
 
+	guard(mutex)(&tmigr_available_mutex);
+
 	cpumask_clear_cpu(cpu, tmigr_available_cpumask);
 	scoped_guard(raw_spinlock_irq, &tmc->lock) {
+		if (!tmc->available)
+			return 0;
 		tmc->available = false;
 		WRITE_ONCE(tmc->wakeup, KTIME_MAX);
 
@@ -1468,8 +1505,15 @@ static int tmigr_set_cpu_available(unsigned int cpu)
 	if (WARN_ON_ONCE(!tmc->tmgroup))
 		return -EINVAL;
 
+	if (tmigr_is_isolated(cpu))
+		return 0;
+
+	guard(mutex)(&tmigr_available_mutex);
+
 	cpumask_set_cpu(cpu, tmigr_available_cpumask);
 	scoped_guard(raw_spinlock_irq, &tmc->lock) {
+		if (tmc->available)
+			return 0;
 		trace_tmigr_cpu_available(tmc);
 		tmc->idle = timer_base_is_idle();
 		if (!tmc->idle)
@@ -1479,6 +1523,105 @@ static int tmigr_set_cpu_available(unsigned int cpu)
 	return 0;
 }
 
+static void tmigr_cpu_isolate(struct work_struct *ignored)
+{
+	tmigr_clear_cpu_available(smp_processor_id());
+}
+
+static void tmigr_cpu_unisolate(struct work_struct *ignored)
+{
+	tmigr_set_cpu_available(smp_processor_id());
+}
+
+/**
+ * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
+ * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
+ *
+ * This function can be called from cpuset code to provide the new set of
+ * isolated CPUs that should be excluded from the hierarchy.
+ * Online CPUs not present in exclude_cpumask but already excluded are brought
+ * back to the hierarchy.
+ * Functions to isolate/unisolate need to be called locally and can sleep.
+ */
+int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+	struct work_struct __percpu *works __free(free_percpu) =
+		alloc_percpu(struct work_struct);
+	cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	if (!works)
+		return -ENOMEM;
+	if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * First set previously isolated CPUs as available (unisolate).
+	 * This cpumask contains only CPUs that switched to available now.
+	 */
+	cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
+	cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
+
+	for_each_cpu(cpu, cpumask) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, tmigr_cpu_unisolate);
+		schedule_work_on(cpu, work);
+	}
+	for_each_cpu(cpu, cpumask)
+		flush_work(per_cpu_ptr(works, cpu));
+
+	/*
+	 * Then clear previously available CPUs (isolate).
+	 * This cpumask contains only CPUs that switched to not available now.
+	 * There cannot be overlap with the newly available ones.
+	 */
+	cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
+	cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+	/*
+	 * Handle this here and not in the cpuset code because exclude_cpumask
+	 * might include also the tick CPU if included in isolcpus.
+	 */
+	for_each_cpu(cpu, cpumask) {
+		if (!tick_nohz_cpu_hotpluggable(cpu)) {
+			cpumask_clear_cpu(cpu, cpumask);
+			break;
+		}
+	}
+
+	for_each_cpu(cpu, cpumask) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, tmigr_cpu_isolate);
+		schedule_work_on(cpu, work);
+	}
+	for_each_cpu(cpu, cpumask)
+		flush_work(per_cpu_ptr(works, cpu));
+
+	return 0;
+}
+
+static int __init tmigr_init_isolation(void)
+{
+	cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+
+	static_branch_enable(&tmigr_exclude_isolated);
+
+	if (!housekeeping_enabled(HK_TYPE_DOMAIN))
+		return 0;
+	if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+	/* Protect against RCU torture hotplug testing */
+	guard(cpus_read_lock)();
+	return tmigr_isolated_exclude_cpumask(cpumask);
+}
+late_initcall(tmigr_init_isolation);
+
 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 			     int node)
 {
-- 
cgit v1.2.3


From d0d9a9629f505ac70e1ffd172e092ff71f5d989a Mon Sep 17 00:00:00 2001
From: Elaine Zhang <zhangqing@rock-chips.com>
Date: Tue, 11 Nov 2025 10:57:35 +0800
Subject: dt-bindings: clock, reset: Add support for rv1126b

Add clock and reset ID defines for rv1126b.
Also add documentation for the rv1126b CRU core.

Signed-off-by: Elaine Zhang <zhangqing@rock-chips.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251111025738.869847-3-zhangqing@rock-chips.com
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 .../bindings/clock/rockchip,rv1126b-cru.yaml       |  52 +++
 include/dt-bindings/clock/rockchip,rv1126b-cru.h   | 392 ++++++++++++++++++++
 include/dt-bindings/reset/rockchip,rv1126b-cru.h   | 405 +++++++++++++++++++++
 3 files changed, 849 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml
 create mode 100644 include/dt-bindings/clock/rockchip,rv1126b-cru.h
 create mode 100644 include/dt-bindings/reset/rockchip,rv1126b-cru.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml
new file mode 100644
index 000000000000..04b0a5c51e4e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rv1126b-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RV1126B Clock and Reset Unit
+
+maintainers:
+  - Elaine Zhang <zhangqing@rock-chips.com>
+  - Heiko Stuebner <heiko@sntech.de>
+
+description:
+  The rv1126b clock controller generates the clock and also implements a
+  reset controller for SoC peripherals.
+
+properties:
+  compatible:
+    enum:
+      - rockchip,rv1126b-cru
+
+  reg:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 1
+
+  "#reset-cells":
+    const: 1
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    const: xin24m
+
+required:
+  - compatible
+  - reg
+  - "#clock-cells"
+  - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    clock-controller@20000000 {
+      compatible = "rockchip,rv1126b-cru";
+      reg = <0x20000000 0xc0000>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/clock/rockchip,rv1126b-cru.h b/include/dt-bindings/clock/rockchip,rv1126b-cru.h
new file mode 100644
index 000000000000..721d50a1419f
--- /dev/null
+++ b/include/dt-bindings/clock/rockchip,rv1126b-cru.h
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */
+/*
+ * Copyright (c) 2025 Rockchip Electronics Co., Ltd.
+ * Author: Elaine Zhang <zhangqing@rock-chips.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RV1126B_H
+#define _DT_BINDINGS_CLK_ROCKCHIP_RV1126B_H
+
+/* pll clocks */
+#define PLL_GPLL				0
+#define PLL_CPLL				1
+#define PLL_AUPLL				2
+#define ARMCLK					3
+#define SCLK_DDR				4
+
+/* clk (clocks) */
+#define CLK_CPLL_DIV20				5
+#define CLK_CPLL_DIV10				6
+#define CLK_CPLL_DIV8				7
+#define CLK_GPLL_DIV8				8
+#define CLK_GPLL_DIV6				9
+#define CLK_GPLL_DIV4				10
+#define CLK_CPLL_DIV3				11
+#define CLK_GPLL_DIV3				12
+#define CLK_CPLL_DIV2				13
+#define CLK_GPLL_DIV2				14
+#define CLK_CM_FRAC0				15
+#define CLK_CM_FRAC1				16
+#define CLK_CM_FRAC2				17
+#define CLK_UART_FRAC0				18
+#define CLK_UART_FRAC1				19
+#define CLK_AUDIO_FRAC0				20
+#define CLK_AUDIO_FRAC1				21
+#define CLK_AUDIO_INT0				22
+#define CLK_AUDIO_INT1				23
+#define SCLK_UART0_SRC				24
+#define SCLK_UART1				25
+#define SCLK_UART2				26
+#define SCLK_UART3				27
+#define SCLK_UART4				28
+#define SCLK_UART5				29
+#define SCLK_UART6				30
+#define SCLK_UART7				31
+#define MCLK_SAI0				32
+#define MCLK_SAI1				33
+#define MCLK_SAI2				34
+#define MCLK_PDM				35
+#define CLKOUT_PDM				36
+#define MCLK_ASRC0				37
+#define MCLK_ASRC1				38
+#define MCLK_ASRC2				39
+#define MCLK_ASRC3				40
+#define CLK_ASRC0				41
+#define CLK_ASRC1				42
+#define CLK_CORE_PLL				43
+#define CLK_NPU_PLL				44
+#define CLK_VEPU_PLL				45
+#define CLK_ISP_PLL				46
+#define CLK_AISP_PLL				47
+#define CLK_SARADC0_SRC				48
+#define CLK_SARADC1_SRC				49
+#define CLK_SARADC2_SRC				50
+#define HCLK_NPU_ROOT				51
+#define PCLK_NPU_ROOT				52
+#define ACLK_VEPU_ROOT				53
+#define HCLK_VEPU_ROOT				54
+#define PCLK_VEPU_ROOT				55
+#define CLK_CORE_RGA_SRC			56
+#define ACLK_GMAC_ROOT				57
+#define ACLK_VI_ROOT				58
+#define HCLK_VI_ROOT				59
+#define PCLK_VI_ROOT				60
+#define DCLK_VICAP_ROOT				61
+#define CLK_SYS_DSMC_ROOT			62
+#define ACLK_VDO_ROOT				63
+#define ACLK_RKVDEC_ROOT			64
+#define HCLK_VDO_ROOT				65
+#define PCLK_VDO_ROOT				66
+#define DCLK_OOC_SRC				67
+#define DCLK_VOP				68
+#define DCLK_DECOM_SRC				69
+#define PCLK_DDR_ROOT				70
+#define ACLK_SYSMEM_SRC				71
+#define ACLK_TOP_ROOT				72
+#define ACLK_BUS_ROOT				73
+#define HCLK_BUS_ROOT				74
+#define PCLK_BUS_ROOT				75
+#define CCLK_SDMMC0				76
+#define CCLK_SDMMC1				77
+#define CCLK_EMMC				78
+#define SCLK_2X_FSPI0				79
+#define CLK_GMAC_PTP_REF_SRC			80
+#define CLK_GMAC_125M				81
+#define CLK_TIMER_ROOT				82
+#define TCLK_WDT_NS_SRC				83
+#define TCLK_WDT_S_SRC				84
+#define TCLK_WDT_HPMCU				85
+#define CLK_CAN0				86
+#define CLK_CAN1				87
+#define PCLK_PERI_ROOT				88
+#define ACLK_PERI_ROOT				89
+#define CLK_I2C_BUS_SRC				90
+#define CLK_SPI0				91
+#define CLK_SPI1				92
+#define BUSCLK_PMU_SRC				93
+#define CLK_PWM0				94
+#define CLK_PWM2				95
+#define CLK_PWM3				96
+#define CLK_PKA_RKCE_SRC			97
+#define ACLK_RKCE_SRC				98
+#define ACLK_VCP_ROOT				99
+#define HCLK_VCP_ROOT				100
+#define PCLK_VCP_ROOT				101
+#define CLK_CORE_FEC_SRC			102
+#define CLK_CORE_AVSP_SRC			103
+#define CLK_50M_GMAC_IOBUF_VI			104
+#define PCLK_TOP_ROOT				105
+#define CLK_MIPI0_OUT2IO			106
+#define CLK_MIPI1_OUT2IO			107
+#define CLK_MIPI2_OUT2IO			108
+#define CLK_MIPI3_OUT2IO			109
+#define CLK_CIF_OUT2IO				110
+#define CLK_MAC_OUT2IO				111
+#define MCLK_SAI0_OUT2IO			112
+#define MCLK_SAI1_OUT2IO			113
+#define MCLK_SAI2_OUT2IO			114
+#define CLK_CM_FRAC0_SRC			115
+#define CLK_CM_FRAC1_SRC			116
+#define CLK_CM_FRAC2_SRC			117
+#define CLK_UART_FRAC0_SRC			118
+#define CLK_UART_FRAC1_SRC			119
+#define CLK_AUDIO_FRAC0_SRC			120
+#define CLK_AUDIO_FRAC1_SRC			121
+#define ACLK_NPU_ROOT				122
+#define HCLK_RKNN				123
+#define ACLK_RKNN				124
+#define PCLK_GPIO3				125
+#define DBCLK_GPIO3				126
+#define PCLK_IOC_VCCIO3				127
+#define PCLK_SARADC0				128
+#define CLK_SARADC0				129
+#define HCLK_SDMMC1				130
+#define HCLK_VEPU				131
+#define ACLK_VEPU				132
+#define CLK_CORE_VEPU				133
+#define HCLK_FEC				134
+#define ACLK_FEC				135
+#define CLK_CORE_FEC				136
+#define HCLK_AVSP				137
+#define ACLK_AVSP				138
+#define BUSCLK_PMU1_ROOT			139
+#define HCLK_AISP				140
+#define ACLK_AISP				141
+#define CLK_CORE_AISP				142
+#define CLK_CORE_ISP_ROOT			143
+#define PCLK_DSMC				144
+#define ACLK_DSMC				145
+#define HCLK_CAN0				146
+#define HCLK_CAN1				147
+#define PCLK_GPIO2				148
+#define DBCLK_GPIO2				149
+#define PCLK_GPIO4				150
+#define DBCLK_GPIO4				151
+#define PCLK_GPIO5				152
+#define DBCLK_GPIO5				153
+#define PCLK_GPIO6				154
+#define DBCLK_GPIO6				155
+#define PCLK_GPIO7				156
+#define DBCLK_GPIO7				157
+#define PCLK_IOC_VCCIO2				158
+#define PCLK_IOC_VCCIO4				159
+#define PCLK_IOC_VCCIO5				160
+#define PCLK_IOC_VCCIO6				161
+#define PCLK_IOC_VCCIO7				162
+#define HCLK_ISP				163
+#define ACLK_ISP				164
+#define CLK_CORE_ISP				165
+#define HCLK_VICAP				166
+#define ACLK_VICAP				167
+#define DCLK_VICAP				168
+#define ISP0CLK_VICAP				169
+#define HCLK_VPSS				170
+#define ACLK_VPSS				171
+#define CLK_CORE_VPSS				172
+#define PCLK_CSI2HOST0				173
+#define DCLK_CSI2HOST0				174
+#define PCLK_CSI2HOST1				175
+#define DCLK_CSI2HOST1				176
+#define PCLK_CSI2HOST2				177
+#define DCLK_CSI2HOST2				178
+#define PCLK_CSI2HOST3				179
+#define DCLK_CSI2HOST3				180
+#define HCLK_SDMMC0				181
+#define ACLK_GMAC				182
+#define PCLK_GMAC				183
+#define CLK_GMAC_PTP_REF			184
+#define PCLK_CSIPHY0				185
+#define PCLK_CSIPHY1				186
+#define PCLK_MACPHY				187
+#define PCLK_SARADC1				188
+#define CLK_SARADC1				189
+#define PCLK_SARADC2				190
+#define CLK_SARADC2				191
+#define ACLK_RKVDEC				192
+#define HCLK_RKVDEC				193
+#define CLK_HEVC_CA_RKVDEC			194
+#define ACLK_VOP				195
+#define HCLK_VOP				196
+#define HCLK_RKJPEG				197
+#define ACLK_RKJPEG				198
+#define ACLK_RKMMU_DECOM			199
+#define HCLK_RKMMU_DECOM			200
+#define DCLK_DECOM				201
+#define ACLK_DECOM				202
+#define PCLK_DECOM				203
+#define PCLK_MIPI_DSI				204
+#define PCLK_DSIPHY				205
+#define ACLK_OOC				206
+#define ACLK_SYSMEM				207
+#define PCLK_DDRC				208
+#define PCLK_DDRMON				209
+#define CLK_TIMER_DDRMON			210
+#define PCLK_DFICTRL				211
+#define PCLK_DDRPHY				212
+#define PCLK_DMA2DDR				213
+#define CLK_RCOSC_SRC				214
+#define BUSCLK_PMU_MUX				215
+#define BUSCLK_PMU_ROOT				216
+#define PCLK_PMU				217
+#define CLK_XIN_RC_DIV				218
+#define CLK_32K					219
+#define PCLK_PMU_GPIO0				220
+#define DBCLK_PMU_GPIO0				221
+#define PCLK_PMU_HP_TIMER			222
+#define CLK_PMU_HP_TIMER			223
+#define CLK_PMU_32K_HP_TIMER			224
+#define PCLK_PWM1				225
+#define CLK_PWM1				226
+#define CLK_OSC_PWM1				227
+#define CLK_RC_PWM1				228
+#define CLK_FREQ_PWM1				229
+#define CLK_COUNTER_PWM1			230
+#define PCLK_I2C2				231
+#define CLK_I2C2				232
+#define PCLK_UART0				233
+#define SCLK_UART0				234
+#define PCLK_RCOSC_CTRL				235
+#define CLK_OSC_RCOSC_CTRL			236
+#define CLK_REF_RCOSC_CTRL			237
+#define PCLK_IOC_PMUIO0				238
+#define CLK_REFOUT				239
+#define CLK_PREROLL				240
+#define CLK_PREROLL_32K				241
+#define HCLK_PMU_SRAM				242
+#define PCLK_WDT_LPMCU				243
+#define TCLK_WDT_LPMCU				244
+#define CLK_LPMCU				245
+#define CLK_LPMCU_RTC				246
+#define PCLK_LPMCU_MAILBOX			247
+#define HCLK_OOC				248
+#define PCLK_SPI2AHB				249
+#define HCLK_SPI2AHB				250
+#define HCLK_FSPI1				251
+#define HCLK_XIP_FSPI1				252
+#define SCLK_1X_FSPI1				253
+#define PCLK_IOC_PMUIO1				254
+#define PCLK_AUDIO_ADC_PMU			255
+#define MCLK_AUDIO_ADC_PMU			256
+#define MCLK_AUDIO_ADC_DIV4_PMU			257
+#define MCLK_LPSAI				258
+#define ACLK_GIC400				259
+#define PCLK_WDT_NS				260
+#define TCLK_WDT_NS				261
+#define PCLK_WDT_HPMCU				262
+#define HCLK_CACHE				263
+#define PCLK_HPMCU_MAILBOX			264
+#define PCLK_HPMCU_INTMUX			265
+#define CLK_HPMCU				266
+#define CLK_HPMCU_RTC				267
+#define PCLK_RKDMA				268
+#define ACLK_RKDMA				269
+#define PCLK_DCF				270
+#define ACLK_DCF				271
+#define HCLK_RGA				272
+#define ACLK_RGA				273
+#define CLK_CORE_RGA				274
+#define PCLK_TIMER				275
+#define CLK_TIMER0				276
+#define CLK_TIMER1				277
+#define CLK_TIMER2				278
+#define CLK_TIMER3				279
+#define CLK_TIMER4				280
+#define CLK_TIMER5				281
+#define PCLK_I2C0				282
+#define CLK_I2C0				283
+#define PCLK_I2C1				284
+#define CLK_I2C1				285
+#define PCLK_I2C3				286
+#define CLK_I2C3				287
+#define PCLK_I2C4				288
+#define CLK_I2C4				289
+#define PCLK_I2C5				290
+#define CLK_I2C5				291
+#define PCLK_SPI0				292
+#define PCLK_SPI1				293
+#define PCLK_PWM0				294
+#define CLK_OSC_PWM0				295
+#define CLK_RC_PWM0				296
+#define PCLK_PWM2				297
+#define CLK_OSC_PWM2				298
+#define CLK_RC_PWM2				299
+#define PCLK_PWM3				300
+#define CLK_OSC_PWM3				301
+#define CLK_RC_PWM3				302
+#define PCLK_UART1				303
+#define PCLK_UART2				304
+#define PCLK_UART3				305
+#define PCLK_UART4				306
+#define PCLK_UART5				307
+#define PCLK_UART6				308
+#define PCLK_UART7				309
+#define PCLK_TSADC				310
+#define CLK_TSADC				311
+#define HCLK_SAI0				312
+#define HCLK_SAI1				313
+#define HCLK_SAI2				314
+#define HCLK_RKDSM				315
+#define MCLK_RKDSM				316
+#define HCLK_PDM				317
+#define HCLK_ASRC0				318
+#define HCLK_ASRC1				319
+#define PCLK_AUDIO_ADC_BUS			320
+#define MCLK_AUDIO_ADC_BUS			321
+#define MCLK_AUDIO_ADC_DIV4_BUS			322
+#define PCLK_RKCE				323
+#define HCLK_NS_RKCE				324
+#define PCLK_OTPC_NS				325
+#define CLK_SBPI_OTPC_NS			326
+#define CLK_USER_OTPC_NS			327
+#define CLK_OTPC_ARB				328
+#define PCLK_OTP_MASK				329
+#define CLK_TSADC_PHYCTRL			330
+#define LRCK_SRC_ASRC0				331
+#define LRCK_DST_ASRC0				332
+#define LRCK_SRC_ASRC1				333
+#define LRCK_DST_ASRC1				334
+#define PCLK_KEY_READER				335
+#define ACLK_NSRKCE				336
+#define CLK_PKA_NSRKCE				337
+#define PCLK_RTC_ROOT				338
+#define PCLK_GPIO1				339
+#define DBCLK_GPIO1				340
+#define PCLK_IOC_VCCIO1				341
+#define ACLK_USB3OTG				342
+#define CLK_REF_USB3OTG				343
+#define CLK_SUSPEND_USB3OTG			344
+#define HCLK_USB2HOST				345
+#define HCLK_ARB_USB2HOST			346
+#define PCLK_RTC_TEST				347
+#define HCLK_EMMC				348
+#define HCLK_FSPI0				349
+#define HCLK_XIP_FSPI0				350
+#define PCLK_PIPEPHY				351
+#define PCLK_USB2PHY				352
+#define CLK_REF_PIPEPHY_CPLL_SRC		353
+#define CLK_REF_PIPEPHY				354
+#define HCLK_VPSL				355
+#define ACLK_VPSL				356
+#define CLK_CORE_VPSL				357
+#define CLK_MACPHY				358
+#define HCLK_RKRNG_NS				359
+#define HCLK_RKRNG_S_NS				360
+#define CLK_AISP_PLL_SRC			361
+
+/* secure clks */
+#define CLK_USER_OTPC_S				362
+#define CLK_SBPI_OTPC_S				363
+#define PCLK_OTPC_S				364
+#define PCLK_KEY_READER_S			365
+#define HCLK_KL_RKCE_S				366
+#define HCLK_RKCE_S				367
+#define PCLK_WDT_S				368
+#define TCLK_WDT_S				369
+#define CLK_STIMER0				370
+#define CLK_STIMER1				371
+#define PLK_STIMER				372
+#define HCLK_RKRNG_S				373
+#define CLK_PKA_RKCE_S				374
+#define ACLK_RKCE_S				375
+
+#endif
diff --git a/include/dt-bindings/reset/rockchip,rv1126b-cru.h b/include/dt-bindings/reset/rockchip,rv1126b-cru.h
new file mode 100644
index 000000000000..a7712db319d0
--- /dev/null
+++ b/include/dt-bindings/reset/rockchip,rv1126b-cru.h
@@ -0,0 +1,405 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */
+/*
+ * Copyright (c) 2025 Rockchip Electronics Co., Ltd.
+ * Author: Elaine Zhang <zhangqing@rock-chips.com>
+ */
+
+#ifndef _DT_BINDINGS_RESET_ROCKCHIP_RV1126B_H
+#define _DT_BINDINGS_RESET_ROCKCHIP_RV1126B_H
+
+/* ==========================list all of reset fields id=========================== */
+/* TOPCRU-->SOFTRST_CON00 */
+
+/* TOPCRU-->SOFTRST_CON15 */
+#define SRST_P_CRU				0
+#define SRST_P_CRU_BIU				1
+
+/* BUSCRU-->SOFTRST_CON00 */
+#define SRST_A_TOP_BIU				2
+#define SRST_A_RKCE_BIU				3
+#define SRST_A_BUS_BIU				4
+#define SRST_H_BUS_BIU				5
+#define SRST_P_BUS_BIU				6
+#define SRST_P_CRU_BUS				7
+#define SRST_P_SYS_GRF				8
+#define SRST_H_BOOTROM				9
+#define SRST_A_GIC400				10
+#define SRST_A_SPINLOCK				11
+#define SRST_P_WDT_NS				12
+#define SRST_T_WDT_NS				13
+
+/* BUSCRU-->SOFTRST_CON01 */
+#define SRST_P_WDT_HPMCU			14
+#define SRST_T_WDT_HPMCU			15
+#define SRST_H_CACHE				16
+#define SRST_P_HPMCU_MAILBOX			17
+#define SRST_P_HPMCU_INTMUX			18
+#define SRST_HPMCU_FULL_CLUSTER			19
+#define SRST_HPMCU_PWUP				20
+#define SRST_HPMCU_ONLY_CORE			21
+#define SRST_T_HPMCU_JTAG			22
+#define SRST_P_RKDMA				23
+#define SRST_A_RKDMA				24
+
+/* BUSCRU-->SOFTRST_CON02 */
+#define SRST_P_DCF				25
+#define SRST_A_DCF				26
+#define SRST_H_RGA				27
+#define SRST_A_RGA				28
+#define SRST_CORE_RGA				29
+#define SRST_P_TIMER				30
+#define SRST_TIMER0				31
+#define SRST_TIMER1				32
+#define SRST_TIMER2				33
+#define SRST_TIMER3				34
+#define SRST_TIMER4				35
+#define SRST_TIMER5				36
+#define SRST_A_RKCE				37
+#define SRST_PKA_RKCE				38
+#define SRST_H_RKRNG_S				39
+#define SRST_H_RKRNG_NS				40
+
+/* BUSCRU-->SOFTRST_CON03 */
+#define SRST_P_I2C0				41
+#define SRST_I2C0				42
+#define SRST_P_I2C1				43
+#define SRST_I2C1				44
+#define SRST_P_I2C3				45
+#define SRST_I2C3				46
+#define SRST_P_I2C4				47
+#define SRST_I2C4				48
+#define SRST_P_I2C5				49
+#define SRST_I2C5				50
+#define SRST_P_SPI0				51
+#define SRST_SPI0				52
+#define SRST_P_SPI1				53
+#define SRST_SPI1				54
+
+/* BUSCRU-->SOFTRST_CON04 */
+#define SRST_P_PWM0				55
+#define SRST_PWM0				56
+#define SRST_P_PWM2				57
+#define SRST_PWM2				58
+#define SRST_P_PWM3				59
+#define SRST_PWM3				60
+
+/* BUSCRU-->SOFTRST_CON05 */
+#define SRST_P_UART1				61
+#define SRST_S_UART1				62
+#define SRST_P_UART2				63
+#define SRST_S_UART2				64
+#define SRST_P_UART3				65
+#define SRST_S_UART3				66
+#define SRST_P_UART4				67
+#define SRST_S_UART4				68
+#define SRST_P_UART5				69
+#define SRST_S_UART5				70
+#define SRST_P_UART6				71
+#define SRST_S_UART6				72
+#define SRST_P_UART7				73
+#define SRST_S_UART7				74
+
+/* BUSCRU-->SOFTRST_CON06 */
+#define SRST_P_TSADC				75
+#define SRST_TSADC				76
+#define SRST_H_SAI0				77
+#define SRST_M_SAI0				78
+#define SRST_H_SAI1				79
+#define SRST_M_SAI1				80
+#define SRST_H_SAI2				81
+#define SRST_M_SAI2				82
+#define SRST_H_RKDSM				83
+#define SRST_M_RKDSM				84
+#define SRST_H_PDM				85
+#define SRST_M_PDM				86
+#define SRST_PDM				87
+
+/* BUSCRU-->SOFTRST_CON07 */
+#define SRST_H_ASRC0				88
+#define SRST_ASRC0				89
+#define SRST_H_ASRC1				90
+#define SRST_ASRC1				91
+#define SRST_P_AUDIO_ADC_BUS			92
+#define SRST_M_AUDIO_ADC_BUS			93
+#define SRST_P_RKCE				94
+#define SRST_H_NS_RKCE				95
+#define SRST_P_OTPC_NS				96
+#define SRST_SBPI_OTPC_NS			97
+#define SRST_USER_OTPC_NS			98
+#define SRST_OTPC_ARB				99
+#define SRST_P_OTP_MASK				100
+
+/* PERICRU-->SOFTRST_CON00 */
+#define SRST_A_PERI_BIU				101
+#define SRST_P_PERI_BIU				102
+#define SRST_P_RTC_BIU				103
+#define SRST_P_CRU_PERI				104
+#define SRST_P_PERI_GRF				105
+#define SRST_P_GPIO1				106
+#define SRST_DB_GPIO1				107
+#define SRST_P_IOC_VCCIO1			108
+#define SRST_A_USB3OTG				109
+#define SRST_H_USB2HOST				110
+#define SRST_H_ARB_USB2HOST			111
+#define SRST_P_RTC_TEST				112
+
+/* PERICRU-->SOFTRST_CON01 */
+#define SRST_H_EMMC				113
+#define SRST_H_FSPI0				114
+#define SRST_H_XIP_FSPI0			115
+#define SRST_S_2X_FSPI0				116
+#define SRST_UTMI_USB2HOST			117
+#define SRST_REF_PIPEPHY			118
+#define SRST_P_PIPEPHY				119
+#define SRST_P_PIPEPHY_GRF			120
+#define SRST_P_USB2PHY				121
+#define SRST_POR_USB2PHY			122
+#define SRST_OTG_USB2PHY			123
+#define SRST_HOST_USB2PHY			124
+
+/* CORECRU-->SOFTRST_CON00 */
+#define SRST_REF_PVTPLL_CORE			125
+#define SRST_NCOREPORESET0			126
+#define SRST_NCORESET0				127
+#define SRST_NCOREPORESET1			128
+#define SRST_NCORESET1				129
+#define SRST_NCOREPORESET2			130
+#define SRST_NCORESET2				131
+#define SRST_NCOREPORESET3			132
+#define SRST_NCORESET3				133
+#define SRST_NDBGRESET				134
+#define SRST_NL2RESET				135
+
+/* CORECRU-->SOFTRST_CON01 */
+#define SRST_A_CORE_BIU				136
+#define SRST_P_CORE_BIU				137
+#define SRST_H_CORE_BIU				138
+#define SRST_P_DBG				139
+#define SRST_POT_DBG				140
+#define SRST_NT_DBG				141
+#define SRST_P_CORE_PVTPLL			142
+#define SRST_P_CRU_CORE				143
+#define SRST_P_CORE_GRF				144
+#define SRST_P_DFT2APB				145
+
+/* PMUCRU-->SOFTRST_CON00 */
+#define SRST_H_PMU_BIU				146
+#define SRST_P_PMU_GPIO0			147
+#define SRST_DB_PMU_GPIO0			148
+#define SRST_P_PMU_HP_TIMER			149
+#define SRST_PMU_HP_TIMER			150
+#define SRST_PMU_32K_HP_TIMER			151
+
+/* PMUCRU-->SOFTRST_CON01 */
+#define SRST_P_PWM1				152
+#define SRST_PWM1				153
+#define SRST_P_I2C2				154
+#define SRST_I2C2				155
+#define SRST_P_UART0				156
+#define SRST_S_UART0				157
+
+/* PMUCRU-->SOFTRST_CON02 */
+#define SRST_P_RCOSC_CTRL			158
+#define SRST_REF_RCOSC_CTRL			159
+#define SRST_P_IOC_PMUIO0			160
+#define SRST_P_CRU_PMU				161
+#define SRST_P_PMU_GRF				162
+#define SRST_PREROLL				163
+#define SRST_PREROLL_32K			164
+#define SRST_H_PMU_SRAM				165
+
+/* PMUCRU-->SOFTRST_CON03 */
+#define SRST_P_WDT_LPMCU			166
+#define SRST_T_WDT_LPMCU			167
+#define SRST_LPMCU_FULL_CLUSTER			168
+#define SRST_LPMCU_PWUP				169
+#define SRST_LPMCU_ONLY_CORE			170
+#define SRST_T_LPMCU_JTAG			171
+#define SRST_P_LPMCU_MAILBOX			172
+
+/* PMU1CRU-->SOFTRST_CON00 */
+#define SRST_P_SPI2AHB				173
+#define SRST_H_SPI2AHB				174
+#define SRST_H_FSPI1				175
+#define SRST_H_XIP_FSPI1			176
+#define SRST_S_1X_FSPI1				177
+#define SRST_P_IOC_PMUIO1			178
+#define SRST_P_CRU_PMU1				179
+#define SRST_P_AUDIO_ADC_PMU			180
+#define SRST_M_AUDIO_ADC_PMU			181
+#define SRST_H_PMU1_BIU				182
+
+/* PMU1CRU-->SOFTRST_CON01 */
+#define SRST_P_LPDMA				183
+#define SRST_A_LPDMA				184
+#define SRST_H_LPSAI				185
+#define SRST_M_LPSAI				186
+#define SRST_P_AOA_TDD				187
+#define SRST_P_AOA_FE				188
+#define SRST_P_AOA_AAD				189
+#define SRST_P_AOA_APB				190
+#define SRST_P_AOA_SRAM				191
+
+/* DDRCRU-->SOFTRST_CON00 */
+#define SRST_P_DDR_BIU				192
+#define SRST_P_DDRC				193
+#define SRST_P_DDRMON				194
+#define SRST_TIMER_DDRMON			195
+#define SRST_P_DFICTRL				196
+#define SRST_P_DDR_GRF				197
+#define SRST_P_CRU_DDR				198
+#define SRST_P_DDRPHY				199
+#define SRST_P_DMA2DDR				200
+
+/* SUBDDRCRU-->SOFTRST_CON00 */
+#define SRST_A_SYSMEM_BIU			201
+#define SRST_A_SYSMEM				202
+#define SRST_A_DDR_BIU				203
+#define SRST_A_DDRSCH0_CPU			204
+#define SRST_A_DDRSCH1_NPU			205
+#define SRST_A_DDRSCH2_POE			206
+#define SRST_A_DDRSCH3_VI			207
+#define SRST_CORE_DDRC				208
+#define SRST_DDRMON				209
+#define SRST_DFICTRL				210
+#define SRST_RS					211
+#define SRST_A_DMA2DDR				212
+#define SRST_DDRPHY				213
+
+/* VICRU-->SOFTRST_CON00 */
+#define SRST_REF_PVTPLL_ISP			214
+#define SRST_A_GMAC_BIU				215
+#define SRST_A_VI_BIU				216
+#define SRST_H_VI_BIU				217
+#define SRST_P_VI_BIU				218
+#define SRST_P_CRU_VI				219
+#define SRST_P_VI_GRF				220
+#define SRST_P_VI_PVTPLL			221
+#define SRST_P_DSMC				222
+#define SRST_A_DSMC				223
+#define SRST_H_CAN0				224
+#define SRST_CAN0				225
+#define SRST_H_CAN1				226
+#define SRST_CAN1				227
+
+/* VICRU-->SOFTRST_CON01 */
+#define SRST_P_GPIO2				228
+#define SRST_DB_GPIO2				229
+#define SRST_P_GPIO4				230
+#define SRST_DB_GPIO4				231
+#define SRST_P_GPIO5				232
+#define SRST_DB_GPIO5				233
+#define SRST_P_GPIO6				234
+#define SRST_DB_GPIO6				235
+#define SRST_P_GPIO7				236
+#define SRST_DB_GPIO7				237
+#define SRST_P_IOC_VCCIO2			238
+#define SRST_P_IOC_VCCIO4			239
+#define SRST_P_IOC_VCCIO5			240
+#define SRST_P_IOC_VCCIO6			241
+#define SRST_P_IOC_VCCIO7			242
+
+/* VICRU-->SOFTRST_CON02 */
+#define SRST_CORE_ISP				243
+#define SRST_H_VICAP				244
+#define SRST_A_VICAP				245
+#define SRST_D_VICAP				246
+#define SRST_ISP0_VICAP				247
+#define SRST_CORE_VPSS				248
+#define SRST_CORE_VPSL				249
+#define SRST_P_CSI2HOST0			250
+#define SRST_P_CSI2HOST1			251
+#define SRST_P_CSI2HOST2			252
+#define SRST_P_CSI2HOST3			253
+#define SRST_H_SDMMC0				254
+#define SRST_A_GMAC				255
+#define SRST_P_CSIPHY0				256
+#define SRST_P_CSIPHY1				257
+
+/* VICRU-->SOFTRST_CON03 */
+#define SRST_P_MACPHY				258
+#define SRST_MACPHY				259
+#define SRST_P_SARADC1				260
+#define SRST_SARADC1				261
+#define SRST_P_SARADC2				262
+#define SRST_SARADC2				263
+
+/* VEPUCRU-->SOFTRST_CON00 */
+#define SRST_REF_PVTPLL_VEPU			264
+#define SRST_A_VEPU_BIU				265
+#define SRST_H_VEPU_BIU				266
+#define SRST_P_VEPU_BIU				267
+#define SRST_P_CRU_VEPU				268
+#define SRST_P_VEPU_GRF				269
+#define SRST_P_GPIO3				270
+#define SRST_DB_GPIO3				271
+#define SRST_P_IOC_VCCIO3			272
+#define SRST_P_SARADC0				273
+#define SRST_SARADC0				274
+#define SRST_H_SDMMC1				275
+
+/* VEPUCRU-->SOFTRST_CON01 */
+#define SRST_P_VEPU_PVTPLL			276
+#define SRST_H_VEPU				277
+#define SRST_A_VEPU				278
+#define SRST_CORE_VEPU				279
+
+/* NPUCRU-->SOFTRST_CON00 */
+#define SRST_REF_PVTPLL_NPU			280
+#define SRST_A_NPU_BIU				281
+#define SRST_H_NPU_BIU				282
+#define SRST_P_NPU_BIU				283
+#define SRST_P_CRU_NPU				284
+#define SRST_P_NPU_GRF				285
+#define SRST_P_NPU_PVTPLL			286
+#define SRST_H_RKNN				287
+#define SRST_A_RKNN				288
+
+/* VDOCRU-->SOFTRST_CON00 */
+#define SRST_A_RKVDEC_BIU			289
+#define SRST_A_VDO_BIU				290
+#define SRST_H_VDO_BIU				291
+#define SRST_P_VDO_BIU				292
+#define SRST_P_CRU_VDO				293
+#define SRST_P_VDO_GRF				294
+#define SRST_A_RKVDEC				295
+#define SRST_H_RKVDEC				296
+#define SRST_HEVC_CA_RKVDEC			297
+#define SRST_A_VOP				298
+#define SRST_H_VOP				299
+#define SRST_D_VOP				300
+#define SRST_A_OOC				301
+#define SRST_H_OOC				302
+#define SRST_D_OOC				303
+
+/* VDOCRU-->SOFTRST_CON01 */
+#define SRST_H_RKJPEG				304
+#define SRST_A_RKJPEG				305
+#define SRST_A_RKMMU_DECOM			306
+#define SRST_H_RKMMU_DECOM			307
+#define SRST_D_DECOM				308
+#define SRST_A_DECOM				309
+#define SRST_P_DECOM				310
+#define SRST_P_MIPI_DSI				311
+#define SRST_P_DSIPHY				312
+
+/* VCPCRU-->SOFTRST_CON00 */
+#define SRST_REF_PVTPLL_VCP			313
+#define SRST_A_VCP_BIU				314
+#define SRST_H_VCP_BIU				315
+#define SRST_P_VCP_BIU				316
+#define SRST_P_CRU_VCP				317
+#define SRST_P_VCP_GRF				318
+#define SRST_P_VCP_PVTPLL			319
+#define SRST_A_AISP_BIU				320
+#define SRST_H_AISP_BIU				321
+#define SRST_CORE_AISP				322
+
+/* VCPCRU-->SOFTRST_CON01 */
+#define SRST_H_FEC				323
+#define SRST_A_FEC				324
+#define SRST_CORE_FEC				325
+#define SRST_H_AVSP				326
+#define SRST_A_AVSP				327
+
+#endif
-- 
cgit v1.2.3


From 84692a1519b32d61ff882cf24a9eda900961acad Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Nov 2025 11:15:56 -0800
Subject: io_uring/kbuf: remove obsolete buf_nr_pages and update comments

The buf_nr_pages field in io_buffer_list was previously used to
determine whether the buffer list uses ring-provided buffers or classic
provided buffers. This is now determined by checking the IOBL_BUF_RING
flag.

Remove the buf_nr_pages field and update related comments.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 4 ++--
 io_uring/kbuf.h                | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 92780764d5fa..e1adb0d20a0a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,8 +327,8 @@ struct io_ring_ctx {
 
 		/*
 		 * Modifications are protected by ->uring_lock and ->mmap_lock.
-		 * The flags, buf_pages and buf_nr_pages fields should be stable
-		 * once published.
+		 * The buffer list's io mapped region should be stable once
+		 * published.
 		 */
 		struct xarray		io_bl_xa;
 
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index ada382ff38d7..bf15e26520d3 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -14,8 +14,8 @@ enum {
 
 struct io_buffer_list {
 	/*
-	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
-	 * then these are classic provided buffers and ->buf_list is used.
+	 * If the IOBL_BUF_RING flag is set, then buf_ring is used. If not, then
+	 * these are classic provided buffers and ->buf_list is used.
 	 */
 	union {
 		struct list_head buf_list;
@@ -27,7 +27,6 @@ struct io_buffer_list {
 	__u16 bgid;
 
 	/* below is for ring provided buffers */
-	__u16 buf_nr_pages;
 	__u16 nr_entries;
 	__u16 head;
 	__u16 mask;
-- 
cgit v1.2.3


From c04507ac500e2cc8048000c2a849588227554e06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 16 Nov 2025 21:51:07 +0100
Subject: sched: Provide and use set_need_resched_current()

set_tsk_need_resched(current) requires set_preempt_need_resched() to
work correctly outside of the scheduler.

Provide set_need_resched_current() which wraps this correctly and replace
all the open coded instances.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251116174750.665769842@linutronix.de
---
 arch/s390/mm/pfault.c    |  3 +--
 include/linux/sched.h    |  7 +++++++
 kernel/rcu/tiny.c        |  8 +++-----
 kernel/rcu/tree.c        | 14 +++++---------
 kernel/rcu/tree_exp.h    |  3 +--
 kernel/rcu/tree_plugin.h |  9 +++------
 kernel/rcu/tree_stall.h  |  3 +--
 7 files changed, 21 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
index e6175d75e4b0..2f829448c719 100644
--- a/arch/s390/mm/pfault.c
+++ b/arch/s390/mm/pfault.c
@@ -199,8 +199,7 @@ block:
 			 * return to userspace schedule() to block.
 			 */
 			__set_current_state(TASK_UNINTERRUPTIBLE);
-			set_tsk_need_resched(tsk);
-			set_preempt_need_resched();
+			set_need_resched_current();
 		}
 	}
 out:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bb436ee1942d..021d05aa941a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2058,6 +2058,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
+static inline void set_need_resched_current(void)
+{
+	lockdep_assert_irqs_disabled();
+	set_tsk_need_resched(current);
+	set_preempt_need_resched();
+}
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c1ebfd51768b..585cade21010 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -70,12 +70,10 @@ void rcu_qs(void)
  */
 void rcu_sched_clock_irq(int user)
 {
-	if (user) {
+	if (user)
 		rcu_qs();
-	} else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
-		set_tsk_need_resched(current);
-		set_preempt_need_resched();
-	}
+	else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+		set_need_resched_current();
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8293bae1dec1..85b82a7007b9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user)
 	/* The load-acquire pairs with the store-release setting to true. */
 	if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
 		/* Idle and userspace execution already are quiescent states. */
-		if (!rcu_is_cpu_rrupt_from_idle() && !user) {
-			set_tsk_need_resched(current);
-			set_preempt_need_resched();
-		}
+		if (!rcu_is_cpu_rrupt_from_idle() && !user)
+			set_need_resched_current();
 		__this_cpu_write(rcu_data.rcu_urgent_qs, false);
 	}
 	rcu_flavor_sched_clock_irq(user);
@@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work)
 /* Perform RCU core processing work for the current CPU.  */
 static __latent_entropy void rcu_core(void)
 {
-	unsigned long flags;
 	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;
 
@@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void)
 	if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
 		rcu_preempt_deferred_qs(current);
 	} else if (rcu_preempt_need_deferred_qs(current)) {
-		set_tsk_need_resched(current);
-		set_preempt_need_resched();
+		guard(irqsave)();
+		set_need_resched_current();
 	}
 
 	/* Update RCU state based on any recent quiescent states. */
@@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void)
 	/* No grace period and unregistered callbacks? */
 	if (!rcu_gp_in_progress() &&
 	    rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
-		local_irq_save(flags);
+		guard(irqsave)();
 		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 			rcu_accelerate_cbs_unlocked(rnp, rdp);
-		local_irq_restore(flags);
 	}
 
 	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6058a734090c..96c49c56fc14 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void)
 	__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
 	/* Store .exp before .rcu_urgent_qs. */
 	smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
-	set_tsk_need_resched(current);
-	set_preempt_need_resched();
+	set_need_resched_current();
 }
 
 #ifdef CONFIG_PREEMPT_RCU
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d85763336b3c..dbe2d02be824 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Also if no expediting and no possible deboosting,
 			// slow is OK.  Plus nohz_full CPUs eventually get
 			// tick enabled.
-			set_tsk_need_resched(current);
-			set_preempt_need_resched();
+			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
 			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
@@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user)
 	if (rcu_preempt_depth() > 0 ||
 	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
 		/* No QS, force context switch if deferred. */
-		if (rcu_preempt_need_deferred_qs(t)) {
-			set_tsk_need_resched(t);
-			set_preempt_need_resched();
-		}
+		if (rcu_preempt_need_deferred_qs(t))
+			set_need_resched_current();
 	} else if (rcu_preempt_need_deferred_qs(t)) {
 		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
 		return;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d16afeb11506..b67532cb8770 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
 	 * progress and it could be we're stuck in kernel space without context
 	 * switches for an entirely unreasonable amount of time.
 	 */
-	set_tsk_need_resched(current);
-	set_preempt_need_resched();
+	set_need_resched_current();
 }
 
 static bool csd_lock_suppress_rcu_stall;
-- 
cgit v1.2.3


From 898f94465205e33295c29333a82a249b8f90aa74 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 23 Oct 2025 09:12:39 -0400
Subject: lockd: don't allow locking on reexported NFSv2/3

Since commit 9254c8ae9b81 ("nfsd: disallow file locking and delegations
for NFSv4 reexport"), file locking when reexporting an NFS mount via
NFSv4 is expressly prohibited by nfsd. Do the same in lockd:

Add a new nlmsvc_file_cannot_lock() helper that tests whether file
locking is disallowed for a given file, and have the lock and share
operations return nlm_lck_denied_nolocks when it is.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Olga Kornievskaia <okorniev@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svclock.c          | 12 ++++++++++++
 fs/lockd/svcshare.c         |  6 ++++++
 include/linux/lockd/lockd.h |  9 ++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index a31dc9588eb8..3a3d05cfe09a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -495,6 +495,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) {
 		async_block = wait;
 		wait = 0;
@@ -621,6 +624,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	if (locks_in_grace(SVC_NET(rqstp))) {
 		ret = nlm_lck_denied_grace_period;
 		goto out;
@@ -678,6 +684,9 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	/* First, cancel any lock that might be there */
 	nlmsvc_cancel_blocked(net, file, lock);
 
@@ -715,6 +724,9 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	if (locks_in_grace(net))
 		return nlm_lck_denied_grace_period;
 
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index ade4931b2da2..88c81ce1148d 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -32,6 +32,9 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
 	struct xdr_netobj	*oh = &argp->lock.oh;
 	u8			*ohdata;
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	for (share = file->f_shares; share; share = share->s_next) {
 		if (share->s_host == host && nlm_cmp_owner(share, oh))
 			goto update;
@@ -72,6 +75,9 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
 	struct nlm_share	*share, **shpp;
 	struct xdr_netobj	*oh = &argp->lock.oh;
 
+	if (nlmsvc_file_cannot_lock(file))
+		return nlm_lck_denied_nolocks;
+
 	for (shpp = &file->f_shares; (share = *shpp) != NULL;
 					shpp = &share->s_next) {
 		if (share->s_host == host && nlm_cmp_owner(share, oh)) {
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c8f0f9458f2c..330e38776bb2 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -12,6 +12,7 @@
 
 /* XXX: a lot of this should really be under fs/lockd. */
 
+#include <linux/exportfs.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
@@ -307,7 +308,7 @@ void		  nlmsvc_invalidate_all(void);
 int           nlmsvc_unlock_all_by_sb(struct super_block *sb);
 int           nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr);
 
-static inline struct file *nlmsvc_file_file(struct nlm_file *file)
+static inline struct file *nlmsvc_file_file(const struct nlm_file *file)
 {
 	return file->f_file[O_RDONLY] ?
 	       file->f_file[O_RDONLY] : file->f_file[O_WRONLY];
@@ -318,6 +319,12 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
 	return file_inode(nlmsvc_file_file(file));
 }
 
+static inline bool
+nlmsvc_file_cannot_lock(const struct nlm_file *file)
+{
+	return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op);
+}
+
 static inline int __nlm_privileged_request4(const struct sockaddr *sap)
 {
 	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-- 
cgit v1.2.3


From 340b59816bc417c306cd76b867914cfb4f386d2d Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 4 Nov 2025 16:57:09 +0800
Subject: mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma()

Kill mm_wr_locked, since commit f8e97613fed2 ("mm: convert VM_PFNMAP
tracking to pfnmap_track() + pfnmap_untrack()") removed its user.

Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               |  2 +-
 mm/memory.c                      | 12 ++++--------
 mm/mmap.c                        |  2 +-
 mm/vma.c                         |  5 ++---
 tools/testing/vma/vma_internal.h |  3 +--
 5 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b636d12bb651..df9f258a017c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2480,7 +2480,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 }
 void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *start_vma, unsigned long start,
-		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
+		unsigned long end, unsigned long tree_end);
 
 struct mmu_notifier_range;
 
diff --git a/mm/memory.c b/mm/memory.c
index 8d8c36adafa8..b09de6274da3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2023,8 +2023,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 
 static void unmap_single_vma(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr,
-		struct zap_details *details, bool mm_wr_locked)
+		unsigned long end_addr, struct zap_details *details)
 {
 	unsigned long start = max(vma->vm_start, start_addr);
 	unsigned long end;
@@ -2070,7 +2069,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
  * @tree_end: The maximum index to check
- * @mm_wr_locked: lock flag
  *
  * Unmap all pages in the vma list.
  *
@@ -2085,8 +2083,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  */
 void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end,
-		bool mm_wr_locked)
+		unsigned long end_addr, unsigned long tree_end)
 {
 	struct mmu_notifier_range range;
 	struct zap_details details = {
@@ -2102,8 +2099,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unsigned long start = start_addr;
 		unsigned long end = end_addr;
 		hugetlb_zap_begin(vma, &start, &end);
-		unmap_single_vma(tlb, vma, start, end, &details,
-				 mm_wr_locked);
+		unmap_single_vma(tlb, vma, start, end, &details);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
 	} while (vma && likely(!xa_is_zero(vma)));
@@ -2139,7 +2135,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb,
 	 * unmap 'address-end' not 'range.start-range.end' as range
 	 * could have been expanded for hugetlb pmd sharing.
 	 */
-	unmap_single_vma(tlb, vma, address, end, details, false);
+	unmap_single_vma(tlb, vma, address, end, details);
 	mmu_notifier_invalidate_range_end(&range);
 	if (is_vm_hugetlb_page(vma)) {
 		/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 644f02071a41..4f51ca644903 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1274,7 +1274,7 @@ void exit_mmap(struct mm_struct *mm)
 	tlb_gather_mmu_fullmm(&tlb, mm);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
+	unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
 	mmap_read_unlock(mm);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 919d1fc63a52..0c5e391fe2e2 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -483,8 +483,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
-		   /* mm_wr_locked = */ true);
+	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
@@ -1228,7 +1227,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
 	update_hiwater_rss(vms->vma->vm_mm);
 	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
-		   vms->vma_count, mm_wr_locked);
+		   vms->vma_count);
 
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index d873667704e8..c68d382dac81 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -848,8 +848,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
 
 static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		      struct vm_area_struct *vma, unsigned long start_addr,
-		      unsigned long end_addr, unsigned long tree_end,
-		      bool mm_wr_locked)
+		      unsigned long end_addr, unsigned long tree_end)
 {
 }
 
-- 
cgit v1.2.3


From 5dba5cc2e0ffa76f2f6c8922a04469dc9602c396 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 18 Nov 2025 10:17:43 +0000
Subject: mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps

Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4.

Currently, guard regions are not visible to users except through
/proc/$pid/pagemap, with no explicit visibility at the VMA level.

This makes the feature less useful, as it isn't entirely apparent which
VMAs may have these entries present, especially when performing actions
which walk through memory regions such as those performed by CRIU.

This series addresses this issue by introducing the VM_MAYBE_GUARD flag
which fulfils this role, updating the smaps logic to display an entry for
these.

The semantics of this flag are that a guard region MAY be present if set
(we cannot be sure, as we can't efficiently track whether an
MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if
not set the VMA definitely does NOT have any guard regions present.

It's problematic to establish this flag without further action, because
that means that VMAs with guard regions in them become non-mergeable with
adjacent VMAs for no especially good reason.

To work around this, this series also introduces the concept of 'sticky'
VMA flags - that is flags which:

a. if set in one VMA and not in another still permit those VMAs to be
   merged (if otherwise compatible).

b. When they are merged, the resultant VMA must have the flag set.

The VMA logic is updated to propagate these flags correctly.

Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve
an issue with file-backed guard regions - previously these established an
anon_vma object for file-backed mappings solely to have vma_needs_copy()
correctly propagate guard region mappings to child processes.

We introduce a new flag alias VM_COPY_ON_FORK (which currently only
specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly
for this flag and to copy page tables if it is present, which resolves
this issue.

Additionally, we add the ability for allow-listed VMA flags to be
atomically writable with only mmap/VMA read locks held.

The only flag we allow so far is VM_MAYBE_GUARD, which we carefully ensure
does not cause any races by being allowed to do so.

This allows us to maintain guard region installation as a read-locked
operation and not endure the overhead of obtaining a write lock here.

Finally we introduce extensive VMA userland tests to assert that the
sticky VMA logic behaves correctly as well as guard region self tests to
assert that smaps visibility is correctly implemented.


This patch (of 9):

Currently, if a user needs to determine if guard regions are present in a
range, they have to scan all VMAs (or have knowledge of which ones might
have guard regions).

Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to
pagemap") and the related commit a516403787e0 ("fs/proc: extend the
PAGEMAP_SCAN ioctl to report guard regions"), users can use either
/proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this
operation at a virtual address level.

This is not ideal, and it gives no visibility at a /proc/$pid/smaps level
that guard regions exist in ranges.

This patch remedies the situation by establishing a new VMA flag,
VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is
uncertain because we cannot reasonably determine whether a
MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and
additionally VMAs may change across merge/split).

We utilise 0x800 for this flag which makes it available to 32-bit
architectures also, a flag that was previously used by VM_DENYWRITE, which
was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't
been reused yet.

We also update the smaps logic and documentation to identify these VMAs.

Another major use of this functionality is that we can use it to identify
that we ought to copy page tables on fork.

We do not actually implement usage of this flag in mm/madvise.c yet as we
need to allow some VMA flags to be applied atomically under mmap/VMA read
lock in order to avoid the need to acquire a write lock for this purpose.

Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/filesystems/proc.rst | 5 +++--
 fs/proc/task_mmu.c                 | 1 +
 include/linux/mm.h                 | 3 +++
 include/trace/events/mmflags.h     | 1 +
 mm/memory.c                        | 4 ++++
 tools/testing/vma/vma_internal.h   | 1 +
 6 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 0b86a8022fa1..8256e857e2d7 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -553,7 +553,7 @@ otherwise.
 kernel flags associated with the particular virtual memory area in two letter
 encoded manner. The codes are the following:
 
-    ==    =======================================
+    ==    =============================================================
     rd    readable
     wr    writeable
     ex    executable
@@ -591,7 +591,8 @@ encoded manner. The codes are the following:
     sl    sealed
     lf    lock on fault pages
     dp    always lazily freeable mapping
-    ==    =======================================
+    gu    maybe contains guard regions (if not set, definitely doesn't)
+    ==    =============================================================
 
 Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fc35a0543f01..db16ed91c269 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1146,6 +1146,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_MAYSHARE)]	= "ms",
 		[ilog2(VM_GROWSDOWN)]	= "gd",
 		[ilog2(VM_PFNMAP)]	= "pf",
+		[ilog2(VM_MAYBE_GUARD)]	= "gu",
 		[ilog2(VM_LOCKED)]	= "lo",
 		[ilog2(VM_IO)]		= "io",
 		[ilog2(VM_SEQ_READ)]	= "sr",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index df9f258a017c..36b9418c00fc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -271,6 +271,8 @@ extern struct rw_semaphore nommu_region_sem;
 extern unsigned int kobjsize(const void *objp);
 #endif
 
+#define VM_MAYBE_GUARD_BIT 11
+
 /*
  * vm_flags in vm_area_struct, see mm_types.h.
  * When changing, update also include/trace/events/mmflags.h
@@ -296,6 +298,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_UFFD_MISSING	0
 #endif /* CONFIG_MMU */
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
+#define VM_MAYBE_GUARD	BIT(VM_MAYBE_GUARD_BIT)	/* The VMA maybe contains guard regions. */
 #define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */
 
 #define VM_LOCKED	0x00002000
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index aa441f593e9a..a6e5a44c9b42 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -213,6 +213,7 @@ IF_HAVE_PG_ARCH_3(arch_3)
 	{VM_UFFD_MISSING,		"uffd_missing"	},		\
 IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,	"uffd_minor"	)		\
 	{VM_PFNMAP,			"pfnmap"	},		\
+	{VM_MAYBE_GUARD,		"maybe_guard"	},		\
 	{VM_UFFD_WP,			"uffd_wp"	},		\
 	{VM_LOCKED,			"locked"	},		\
 	{VM_IO,				"io"		},		\
diff --git a/mm/memory.c b/mm/memory.c
index b09de6274da3..d1728d0538d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1478,6 +1478,10 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 	if (src_vma->anon_vma)
 		return true;
 
+	/* Guard regions have modified page tables that require copying. */
+	if (src_vma->vm_flags & VM_MAYBE_GUARD)
+		return true;
+
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.  Fork
 	 * becomes much lighter when there are big shared or private readonly
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index c68d382dac81..46acb4df45de 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -56,6 +56,7 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_MAYEXEC	0x00000040
 #define VM_GROWSDOWN	0x00000100
 #define VM_PFNMAP	0x00000400
+#define VM_MAYBE_GUARD	0x00000800
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000
 #define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
-- 
cgit v1.2.3


From 568822502383acd57d7cc1c72ee43932c45a9524 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 18 Nov 2025 10:17:44 +0000
Subject: mm: add atomic VMA flags and set VM_MAYBE_GUARD as such

This patch adds the ability to atomically set VMA flags with only the mmap
read/VMA read lock held.

As this could be hugely problematic for VMA flags in general given that
all other accesses are non-atomic and serialised by the mmap/VMA locks, we
implement this with a strict allow-list - that is, only designated flags
are allowed to do this.

We make VM_MAYBE_GUARD one of these flags.

Link: https://lkml.kernel.org/r/97e57abed09f2663077ed7a36fb8206e243171a9.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 36b9418c00fc..03776aab3837 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -518,6 +518,9 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
 
+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
 /* Arch-specific flags to clear when updating VM flags on protection change */
 #ifndef VM_ARCH_CLEAR
 # define VM_ARCH_CLEAR	VM_NONE
@@ -860,6 +863,47 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
 	__vm_flags_mod(vma, set, clear);
 }
 
+static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
+				       int bit)
+{
+	const vm_flags_t mask = BIT(bit);
+
+	/* Only specific flags are permitted */
+	if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
+		return false;
+
+	return true;
+}
+
+/*
+ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
+ * valid flags are allowed to do this.
+ */
+static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit)
+{
+	/* mmap read lock/VMA read lock must be held. */
+	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+		vma_assert_locked(vma);
+
+	if (__vma_flag_atomic_valid(vma, bit))
+		set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags));
+}
+
+/*
+ * Test for VMA flag atomically. Requires no locks. Only specific valid flags
+ * are allowed to do this.
+ *
+ * This is necessarily racey, so callers must ensure that serialisation is
+ * achieved through some other means, or that races are permissible.
+ */
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit)
+{
+	if (__vma_flag_atomic_valid(vma, bit))
+		return test_bit(bit, &vma->vm_flags);
+
+	return false;
+}
+
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
 {
 	vma->vm_ops = NULL;
-- 
cgit v1.2.3


From 64212ba02e66e705cabce188453ba4e61e9d7325 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 18 Nov 2025 10:17:46 +0000
Subject: mm: implement sticky VMA flags

It is useful to be able to designate that certain flags are 'sticky', that
is, if two VMAs are merged one with a flag of this nature and one without,
the merged VMA sets this flag.

As a result we ignore these flags for the purposes of determining VMA flag
differences between VMAs being considered for merge.

This patch therefore updates the VMA merge logic to perform this action,
with flags possessing this property being described in the VM_STICKY
bitmap.

Those flags which ought to be ignored for the purposes of VMA merge are
described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also
updated to use.

As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it
already had this behaviour, alongside VM_STICKY as sticky flags by
implication must not disallow merge.

Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its
own right, but this change is out of scope for this series.

The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result
of this change, once the VMA flag is set upon guard region installation,
VMAs with guard ranges will now not have their merge behaviour impacted as
a result and can be freely merged with other VMAs without VM_MAYBE_GUARD
set.

Also update the comments for vma_modify_flags() to directly reference
sticky flags now we have established the concept.

We also update the VMA userland tests to account for the changes.

Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               | 28 ++++++++++++++++++++++++++++
 mm/vma.c                         | 28 +++++++++++++++-------------
 mm/vma.h                         | 10 ++++------
 tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 03776aab3837..fea113d1d723 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -527,6 +527,34 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 #define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
 
+/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ *                  mapped page tables may contain metadata not described by the
+ *                  VMA and thus any merged VMA may also contain this metadata,
+ *                  and thus we must make this flag sticky.
+ */
+#define VM_STICKY VM_MAYBE_GUARD
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but
+ *                dirty bit -- the caller should mark merged VMA as dirty. If
+ *                dirty bit won't be excluded from comparison, we increase
+ *                pressure on the memory system forcing the kernel to generate
+ *                new VMAs when old one could be extended instead.
+ *
+ *    VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ *                'sticky'. If any sticky flags exist in either VMA, we simply
+ *                set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff --git a/mm/vma.c b/mm/vma.c
index 47469c036a72..4e21c988054d 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -89,15 +89,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
 
 	if (!mpol_equal(vmg->policy, vma_policy(vma)))
 		return false;
-	/*
-	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
-	 * match the flags but dirty bit -- the caller should mark
-	 * merged VMA as dirty. If dirty bit won't be excluded from
-	 * comparison, we increase pressure on the memory system forcing
-	 * the kernel to generate new VMAs when old one could be
-	 * extended instead.
-	 */
-	if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
+	if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE)
 		return false;
 	if (vma->vm_file != vmg->file)
 		return false;
@@ -808,6 +800,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
 static __must_check struct vm_area_struct *vma_merge_existing_range(
 		struct vma_merge_struct *vmg)
 {
+	vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY;
 	struct vm_area_struct *middle = vmg->middle;
 	struct vm_area_struct *prev = vmg->prev;
 	struct vm_area_struct *next;
@@ -900,11 +893,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 	if (merge_right) {
 		vma_start_write(next);
 		vmg->target = next;
+		sticky_flags |= (next->vm_flags & VM_STICKY);
 	}
 
 	if (merge_left) {
 		vma_start_write(prev);
 		vmg->target = prev;
+		sticky_flags |= (prev->vm_flags & VM_STICKY);
 	}
 
 	if (merge_both) {
@@ -974,6 +969,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 	if (err || commit_merge(vmg))
 		goto abort;
 
+	vm_flags_set(vmg->target, sticky_flags);
 	khugepaged_enter_vma(vmg->target, vmg->vm_flags);
 	vmg->state = VMA_MERGE_SUCCESS;
 	return vmg->target;
@@ -1124,6 +1120,10 @@ int vma_expand(struct vma_merge_struct *vmg)
 	bool remove_next = false;
 	struct vm_area_struct *target = vmg->target;
 	struct vm_area_struct *next = vmg->next;
+	vm_flags_t sticky_flags;
+
+	sticky_flags = vmg->vm_flags & VM_STICKY;
+	sticky_flags |= target->vm_flags & VM_STICKY;
 
 	VM_WARN_ON_VMG(!target, vmg);
 
@@ -1133,6 +1133,7 @@ int vma_expand(struct vma_merge_struct *vmg)
 	if (next && (target != next) && (vmg->end == next->vm_end)) {
 		int ret;
 
+		sticky_flags |= next->vm_flags & VM_STICKY;
 		remove_next = true;
 		/* This should already have been checked by this point. */
 		VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
@@ -1159,6 +1160,7 @@ int vma_expand(struct vma_merge_struct *vmg)
 	if (commit_merge(vmg))
 		goto nomem;
 
+	vm_flags_set(target, sticky_flags);
 	return 0;
 
 nomem:
@@ -1654,9 +1656,9 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
 		return ret;
 
 	/*
-	 * For a merge to succeed, the flags must match those requested. For
-	 * flags which do not obey typical merge rules (i.e. do not need to
-	 * match), we must let the caller know about them.
+	 * For a merge to succeed, the flags must match those
+	 * requested. However, sticky flags may have been retained, so propagate
+	 * them to the caller.
 	 */
 	if (vmg.state == VMA_MERGE_SUCCESS)
 		*vm_flags_ptr = ret->vm_flags;
@@ -1906,7 +1908,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
 	return a->vm_end == b->vm_start &&
 		mpol_equal(vma_policy(a), vma_policy(b)) &&
 		a->vm_file == b->vm_file &&
-		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) &&
 		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
 }
 
diff --git a/mm/vma.h b/mm/vma.h
index 75f1d9c7204b..abada6a64c4e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -273,17 +273,15 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
  * @start: The start of the range to update. May be offset within @vma.
  * @end: The exclusive end of the range to update, may be offset within @vma.
  * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is
- * about to be set to. On merge, this will be updated to include any additional
- * flags which remain in place.
+ * about to be set to. On merge, this will be updated to include sticky flags.
  *
  * IMPORTANT: The actual modification being requested here is NOT applied,
  * rather the VMA is perhaps split, perhaps merged to accommodate the change,
  * and the caller is expected to perform the actual modification.
  *
- * In order to account for VMA flags which may persist (e.g. soft-dirty), the
- * @vm_flags_ptr parameter points to the requested flags which are then updated
- * so the caller, should they overwrite any existing flags, correctly retains
- * these.
+ * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points
+ * to the requested flags which are then updated so the caller, should they
+ * overwrite any existing flags, correctly retains these.
  *
  * Returns: A VMA which contains the range @start to @end ready to have its
  * flags altered to *@vm_flags.
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 46acb4df45de..73c2025777e6 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -117,6 +117,34 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_SEALED	VM_NONE
 #endif
 
+/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ *                  mapped page tables may contain metadata not described by the
+ *                  VMA and thus any merged VMA may also contain this metadata,
+ *                  and thus we must make this flag sticky.
+ */
+#define VM_STICKY VM_MAYBE_GUARD
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but
+ *                dirty bit -- the caller should mark merged VMA as dirty. If
+ *                dirty bit won't be excluded from comparison, we increase
+ *                pressure on the memory system forcing the kernel to generate
+ *                new VMAs when old one could be extended instead.
+ *
+ *    VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ *                'sticky'. If any sticky flags exist in either VMA, we simply
+ *                set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
+
 #define FIRST_USER_ADDRESS	0UL
 #define USER_PGTABLES_CEILING	0UL
 
-- 
cgit v1.2.3


From ab04b530e7e8bd5cf9fb0c1ad20e0deee8f569ec Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 18 Nov 2025 10:17:47 +0000
Subject: mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one

Gather all the VMA flags whose presence implies that page tables must be
copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this
rather than specifying individual flags in vma_needs_copy().

We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies
that there may be metadata contained in the page tables (that is - guard
markers) which otherwise would not and could not be propagated upon fork.

This was already being done manually previously in vma_needs_copy(), but
this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and
VM_UFFD_WP all of which imply the same.

Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too
- because equally a flag being VM_STICKY indicates that the VMA contains
metadata that is not propagated by being faulted in - i.e.  that the VMA
metadata does not fully describe the VMA alone, and thus we must propagate
whatever metadata there is on a fork.

However, for maximum flexibility, we do not make this necessarily the case
here.

Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               | 26 ++++++++++++++++++++++++++
 mm/memory.c                      | 18 ++++--------------
 tools/testing/vma/vma_internal.h | 26 ++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fea113d1d723..af2904aeb163 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -555,6 +555,32 @@ extern unsigned int kobjsize(const void *objp);
  */
 #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
 
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconstituted upon page fault, so necessitate page table copying upon fork.
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ *                           reasonably reconstructed on page fault.
+ *
+ *              VM_UFFD_WP - Encodes metadata about an installed uffd
+ *                           write protect handler, which cannot be
+ *                           reconstructed on page fault.
+ *
+ *                           We always copy pgtables when dst_vma has uffd-wp
+ *                           enabled even if it's file-backed
+ *                           (e.g. shmem). Because when uffd-wp is enabled,
+ *                           pgtable contains uffd-wp protection information,
+ *                           that's something we can't retrieve from page cache,
+ *                           and skip copying will lose those info.
+ *
+ *          VM_MAYBE_GUARD - Could contain page guard region markers which
+ *                           by design are a property of the page tables
+ *                           only and thus cannot be reconstructed on page
+ *                           fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff --git a/mm/memory.c b/mm/memory.c
index d1728d0538d6..27bc457b32c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1463,25 +1463,15 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 static bool
 vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
+	if (src_vma->vm_flags & VM_COPY_ON_FORK)
+		return true;
 	/*
-	 * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
-	 * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
-	 * contains uffd-wp protection information, that's something we can't
-	 * retrieve from page cache, and skip copying will lose those info.
+	 * The presence of an anon_vma indicates an anonymous VMA has page
+	 * tables which naturally cannot be reconstituted on page fault.
 	 */
-	if (userfaultfd_wp(dst_vma))
-		return true;
-
-	if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
-		return true;
-
 	if (src_vma->anon_vma)
 		return true;
 
-	/* Guard regions have modified page tables that require copying. */
-	if (src_vma->vm_flags & VM_MAYBE_GUARD)
-		return true;
-
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.  Fork
 	 * becomes much lighter when there are big shared or private readonly
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 73c2025777e6..233819a9e7ee 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -145,6 +145,32 @@ extern unsigned long dac_mmap_min_addr;
  */
 #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
 
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconstituted upon page fault, so necessitate page table copying upon fork.
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ *                           reasonably reconstructed on page fault.
+ *
+ *              VM_UFFD_WP - Encodes metadata about an installed uffd
+ *                           write protect handler, which cannot be
+ *                           reconstructed on page fault.
+ *
+ *                           We always copy pgtables when dst_vma has uffd-wp
+ *                           enabled even if it's file-backed
+ *                           (e.g. shmem). Because when uffd-wp is enabled,
+ *                           pgtable contains uffd-wp protection information,
+ *                           that's something we can't retrieve from page cache,
+ *                           and skip copying will lose those info.
+ *
+ *          VM_MAYBE_GUARD - Could contain page guard region markers which
+ *                           by design are a property of the page tables
+ *                           only and thus cannot be reconstructed on page
+ *                           fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
 #define FIRST_USER_ADDRESS	0UL
 #define USER_PGTABLES_CEILING	0UL
 
-- 
cgit v1.2.3


From 05be0287955970b043a0742e85b6c285dea4f286 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 7 Nov 2025 17:55:36 +0800
Subject: mm: remove unnecessary __GFP_HIGHMEM in __p*d_alloc_one_*()

__{pgd,p4d,pud,pmd,pte}_alloc_one_*() always allocate pages with GFP flag
GFP_PGTABLE_KERNEL/GFP_PGTABLE_USER.  These two macros are defined as
follows:

 #define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO)
 #define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

There is no __GFP_HIGHMEM in them, so we needn't clear __GFP_HIGHMEM
explicitly.

Link: https://lkml.kernel.org/r/20251109021817.346181-1-chenhuacai@loongson.cn
Link: https://lkml.kernel.org/r/20251107095536.3101371-1-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/pgalloc.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index b9d2a7c79b93..57137d3ac159 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -18,8 +18,7 @@
  */
 static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 {
-	struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
-			~__GFP_HIGHMEM, 0);
+	struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL, 0);
 
 	if (!ptdesc)
 		return NULL;
@@ -178,7 +177,6 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long
 
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
-	gfp &= ~__GFP_HIGHMEM;
 
 	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
@@ -236,7 +234,6 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long
 
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
-	gfp &= ~__GFP_HIGHMEM;
 
 	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
@@ -284,7 +281,6 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order
 
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
-	gfp &= ~__GFP_HIGHMEM;
 
 	ptdesc = pagetable_alloc_noprof(gfp, order);
 	if (!ptdesc)
-- 
cgit v1.2.3


From bc8e51c05ad50a5a0b02114d3cc94d151a332595 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Fri, 7 Nov 2025 15:40:41 -0800
Subject: mm: memcg: dump memcg protection info on oom or alloc failures

Currently kernel dumps memory state on oom and allocation failures.  One
of the questions usually raised on those dumps is why the kernel has not
reclaimed the reclaimable memory instead of triggering oom.  One potential
reason is the usage of memory protection provided by memcg.  So, let's
also dump the memory protected by the memcg in such reports to ease the
debugging.

Link: https://lkml.kernel.org/r/20251107234041.3632644-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  5 +++++
 mm/memcontrol.c            | 13 +++++++++++++
 mm/oom_kill.c              |  1 +
 mm/page_alloc.c            |  1 +
 4 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8c0f15e5978f..966f7c1a0128 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1764,6 +1764,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 
 bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
 
+void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1830,6 +1831,10 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
 {
 	return true;
 }
+
+static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 025da46d9959..bfc986da3289 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5635,3 +5635,16 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
 {
 	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
 }
+
+void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+
+	pr_warn("Memory cgroup min protection %lukB -- low protection %lukB",
+		K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE),
+		K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE));
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c145b0feecc1..5eb11fbba704 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -472,6 +472,7 @@ static void dump_header(struct oom_control *oc)
 		if (should_dump_unreclaim_slab())
 			dump_unreclaimable_slab();
 	}
+	mem_cgroup_show_protected_memory(oc->memcg);
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(oc);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e4efda1158b2..26be5734253f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3977,6 +3977,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 		filter &= ~SHOW_MEM_FILTER_NODES;
 
 	__show_mem(filter, nodemask, gfp_zone(gfp_mask));
+	mem_cgroup_show_protected_memory(NULL);
 }
 
 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
-- 
cgit v1.2.3


From 2197bb60f89077603cc580ff752c5cf6388c1099 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 10 Nov 2025 20:32:01 +0000
Subject: mm: add vma_start_write_killable()

Patch series "vma_start_write_killable", v2.

When we added the VMA lock, we made a major oversight in not adding a
killable variant.  That can run us into trouble where a thread takes the
VMA lock for read (eg handling a page fault) and then goes out to lunch
for an hour (eg doing reclaim).  Another thread tries to modify the VMA,
taking the mmap_lock for write, then attempts to lock the VMA for write.
That blocks on the first thread, and ensures that every other page fault
now tries to take the mmap_lock for read.  Because everything's in an
uninterruptible sleep, we can't kill the task, which makes me angry.

This patchset just adds vma_start_write_killable() and converts one caller
to use it.  Most users are somewhat tricky to convert, so expect follow-up
individual patches per call-site which need careful analysis to make sure
we've done proper cleanup.


This patch (of 2):

The vma can be held read-locked for a substantial period of time, eg if
memory allocation needs to go into reclaim.  It's useful to be able to
send fatal signals to threads which are waiting for the write lock.

Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Chris Li <chriscli@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/process_addrs.rst |  9 ++++++++-
 include/linux/mmap_lock.h          | 30 ++++++++++++++++++++++++++++--
 mm/mmap_lock.c                     | 34 +++++++++++++++++++++++++---------
 tools/testing/vma/vma_internal.h   |  8 ++++++++
 4 files changed, 69 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst
index be49e2a269e4..7f2f3e87071d 100644
--- a/Documentation/mm/process_addrs.rst
+++ b/Documentation/mm/process_addrs.rst
@@ -48,7 +48,8 @@ Terminology
 * **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves
   as a read/write semaphore in practice. A VMA read lock is obtained via
   :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a
-  write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked
+  write lock via vma_start_write() or vma_start_write_killable()
+  (all VMA write locks are unlocked
   automatically when the mmap write lock is released). To take a VMA write lock
   you **must** have already acquired an :c:func:`!mmap_write_lock`.
 * **rmap locks** - When trying to access VMAs through the reverse mapping via a
@@ -907,3 +908,9 @@ Stack expansion
 Stack expansion throws up additional complexities in that we cannot permit there
 to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to
 prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`.
+
+------------------------
+Functions and structures
+------------------------
+
+.. kernel-doc:: include/linux/mmap_lock.h
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e05da70dc0cb..d53f72dba7fe 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -195,7 +195,8 @@ static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned in
 	return (vma->vm_lock_seq == *mm_lock_seq);
 }
 
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+		int state);
 
 /*
  * Begin writing to a VMA.
@@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	__vma_start_write(vma, mm_lock_seq);
+	__vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * vma_start_write_killable - Begin writing to a VMA.
+ * @vma: The VMA we are going to modify.
+ *
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ *
+ * Context: May sleep while waiting for readers to drop the vma read lock.
+ * Caller must already hold the mmap_lock for write.
+ *
+ * Return: 0 for a successful acquisition.  -EINTR if a fatal signal was
+ * received.
+ */
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+	unsigned int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return 0;
+	return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -283,6 +307,8 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
 static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma) { return 0; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 		{ mmap_assert_write_locked(vma->vm_mm); }
 static inline void vma_assert_attached(struct vm_area_struct *vma) {}
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 0a0db5849b8e..39f341caf32c 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -45,8 +45,15 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 
 #ifdef CONFIG_MMU
 #ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+/*
+ * Return value: 0 if vma detached,
+ * 1 if vma attached with no readers,
+ * -EINTR if signal received,
+ */
+static inline int __vma_enter_locked(struct vm_area_struct *vma,
+		bool detaching, int state)
 {
+	int err;
 	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
 
 	/* Additional refcnt if the vma is attached. */
@@ -58,15 +65,19 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
 	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
 	 */
 	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
-		return false;
+		return 0;
 
 	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
-	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
 		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
-		   TASK_UNINTERRUPTIBLE);
+		   state);
+	if (err) {
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+		return err;
+	}
 	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 
-	return true;
+	return 1;
 }
 
 static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
@@ -75,16 +86,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
 	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
 }
 
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+		int state)
 {
-	bool locked;
+	int locked;
 
 	/*
 	 * __vma_enter_locked() returns false immediately if the vma is not
 	 * attached, otherwise it waits until refcnt is indicating that vma
 	 * is attached with no readers.
 	 */
-	locked = __vma_enter_locked(vma, false);
+	locked = __vma_enter_locked(vma, false, state);
+	if (locked < 0)
+		return locked;
 
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
@@ -100,6 +114,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 		__vma_exit_locked(vma, &detached);
 		WARN_ON_ONCE(detached); /* vma should remain attached */
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__vma_start_write);
 
@@ -118,7 +134,7 @@ void vma_mark_detached(struct vm_area_struct *vma)
 	 */
 	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
 		/* Wait until vma is detached with no readers. */
-		if (__vma_enter_locked(vma, true)) {
+		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
 			bool detached;
 
 			__vma_exit_locked(vma, &detached);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 233819a9e7ee..73a899ba2686 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -952,6 +952,14 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	vma->vm_lock_seq++;
 }
 
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+	/* Used to indicate to tests that a write operation has begun. */
+	vma->vm_lock_seq++;
+	return 0;
+}
+
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 unsigned long start,
 					 unsigned long end,
-- 
cgit v1.2.3


From 8b02baf37311754518dfe78073583db03fbb0c07 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 12 Nov 2025 07:41:04 -0800
Subject: mm/damon: rename damos core filter helpers to have word core

Patch series "mm/damon: misc cleanups".

Yet another batch of misc cleanups and refactoring for DAMON code, tests,
and documents.

The first two patches (1 and 2) rename DAMOS core filter related code
for readability.

Three following patches (3-5) refactor page table walk callback functions
in DAMON, as suggested by Hugh and David, and I promised.

Next two patches (6 and 7) refactor DAMON core layer kunit test and sysfs
interface selftest to be simple and deduplicated.

Final two patches (8 and 9) fix up sphinx and grammatical errors on
documents.


This patch (of 9):

DAMOS filters handled by the core layer are called core filters, while
those handled by the ops layer are called ops filters.  They share the
same type but are managed in different places since core filters are
evaluated before the ops filters.  They also have different helper
functions that depend on their managed places.

The helper functions for ops filters have the '_ops_' keyword in their
names, so it is easy to tell they are for ops filters.  Meanwhile, the
helper functions for core filters do not have the 'core' keyword in their
names.  This makes it easy to mistakenly use them for ops filters.
Actually there was such a bug.

To avoid future mistakes from similar confusions, rename DAMOS core
filters helper functions to have a keyword 'core' on their names.

Link: https://lkml.kernel.org/r/20251112154114.66053-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251112154114.66053-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .clang-format         |  4 ++--
 include/linux/damon.h |  4 ++--
 mm/damon/core.c       | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index f371a13b4d19..748efbe791ad 100644
--- a/.clang-format
+++ b/.clang-format
@@ -140,8 +140,8 @@ ForEachMacros:
   - 'damon_for_each_scheme_safe'
   - 'damon_for_each_target'
   - 'damon_for_each_target_safe'
-  - 'damos_for_each_filter'
-  - 'damos_for_each_filter_safe'
+  - 'damos_for_each_core_filter'
+  - 'damos_for_each_core_filter_safe'
   - 'damos_for_each_ops_filter'
   - 'damos_for_each_ops_filter_safe'
   - 'damos_for_each_quota_goal'
diff --git a/include/linux/damon.h b/include/linux/damon.h
index f3566b978cdf..6e3db165fe60 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -871,10 +871,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
 #define damos_for_each_quota_goal_safe(goal, next, quota) \
 	list_for_each_entry_safe(goal, next, &(quota)->goals, list)
 
-#define damos_for_each_filter(f, scheme) \
+#define damos_for_each_core_filter(f, scheme) \
 	list_for_each_entry(f, &(scheme)->filters, list)
 
-#define damos_for_each_filter_safe(f, next, scheme) \
+#define damos_for_each_core_filter_safe(f, next, scheme) \
 	list_for_each_entry_safe(f, next, &(scheme)->filters, list)
 
 #define damos_for_each_ops_filter(f, scheme) \
diff --git a/mm/damon/core.c b/mm/damon/core.c
index a14cc73c2cab..d4cb11ced13f 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -450,7 +450,7 @@ void damon_destroy_scheme(struct damos *s)
 	damos_for_each_quota_goal_safe(g, g_next, &s->quota)
 		damos_destroy_quota_goal(g);
 
-	damos_for_each_filter_safe(f, next, s)
+	damos_for_each_core_filter_safe(f, next, s)
 		damos_destroy_filter(f);
 
 	damos_for_each_ops_filter_safe(f, next, s)
@@ -864,12 +864,12 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src)
 	return 0;
 }
 
-static struct damos_filter *damos_nth_filter(int n, struct damos *s)
+static struct damos_filter *damos_nth_core_filter(int n, struct damos *s)
 {
 	struct damos_filter *filter;
 	int i = 0;
 
-	damos_for_each_filter(filter, s) {
+	damos_for_each_core_filter(filter, s) {
 		if (i++ == n)
 			return filter;
 	}
@@ -923,15 +923,15 @@ static int damos_commit_core_filters(struct damos *dst, struct damos *src)
 	struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
 	int i = 0, j = 0;
 
-	damos_for_each_filter_safe(dst_filter, next, dst) {
-		src_filter = damos_nth_filter(i++, src);
+	damos_for_each_core_filter_safe(dst_filter, next, dst) {
+		src_filter = damos_nth_core_filter(i++, src);
 		if (src_filter)
 			damos_commit_filter(dst_filter, src_filter);
 		else
 			damos_destroy_filter(dst_filter);
 	}
 
-	damos_for_each_filter_safe(src_filter, next, src) {
+	damos_for_each_core_filter_safe(src_filter, next, src) {
 		if (j++ < i)
 			continue;
 
@@ -1767,7 +1767,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
 	struct damos_filter *filter;
 
 	s->core_filters_allowed = false;
-	damos_for_each_filter(filter, s) {
+	damos_for_each_core_filter(filter, s) {
 		if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
 			if (filter->allow)
 				s->core_filters_allowed = true;
-- 
cgit v1.2.3


From 53298afe456e62ad2c2dc8bc7aa54bb86a67ba2f Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 12 Nov 2025 07:41:05 -0800
Subject: mm/damon: rename damos->filters to damos->core_filters

DAMOS filters that are handled by the ops layer are linked to
damos->ops_filters.  Owing to the ops_ prefix on the name, it is easy to
understand it is for ops layer handled filters.  The other types of
filters, which are handled by the core layer, are linked to
damos->filters.  Because of the name, it is easy to mistakenly assume that
the list holds all filters rather than only the core layer handled ones.
Avoid such confusion by renaming the field to core_filters.

Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h                                   | 10 +++++-----
 mm/damon/core.c                                         |  6 +++---
 mm/damon/tests/core-kunit.h                             |  4 ++--
 tools/testing/selftests/damon/drgn_dump_damon_status.py |  8 ++++----
 tools/testing/selftests/damon/sysfs.py                  |  2 +-
 5 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 6e3db165fe60..3813373a9200 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -492,7 +492,7 @@ struct damos_migrate_dests {
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
  * @migrate_dests:	Destination nodes if @action is "migrate_{hot,cold}".
  * @target_nid:		Destination node if @action is "migrate_{hot,cold}".
- * @filters:		Additional set of &struct damos_filter for &action.
+ * @core_filters:	Additional set of &struct damos_filter for &action.
  * @ops_filters:	ops layer handling &struct damos_filter objects list.
  * @last_applied:	Last @action applied ops-managing entity.
  * @stat:		Statistics of this scheme.
@@ -518,7 +518,7 @@ struct damos_migrate_dests {
  *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
- * &filters
+ * &core_filters
  *
  * The minimum entity that @action can be applied depends on the underlying
  * &struct damon_operations.  Since it may not be aligned with the core layer
@@ -562,7 +562,7 @@ struct damos {
 			struct damos_migrate_dests migrate_dests;
 		};
 	};
-	struct list_head filters;
+	struct list_head core_filters;
 	struct list_head ops_filters;
 	void *last_applied;
 	struct damos_stat stat;
@@ -872,10 +872,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
 	list_for_each_entry_safe(goal, next, &(quota)->goals, list)
 
 #define damos_for_each_core_filter(f, scheme) \
-	list_for_each_entry(f, &(scheme)->filters, list)
+	list_for_each_entry(f, &(scheme)->core_filters, list)
 
 #define damos_for_each_core_filter_safe(f, next, scheme) \
-	list_for_each_entry_safe(f, next, &(scheme)->filters, list)
+	list_for_each_entry_safe(f, next, &(scheme)->core_filters, list)
 
 #define damos_for_each_ops_filter(f, scheme) \
 	list_for_each_entry(f, &(scheme)->ops_filters, list)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index d4cb11ced13f..aedb315b075a 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -306,7 +306,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f)
 	if (damos_filter_for_ops(f->type))
 		list_add_tail(&f->list, &s->ops_filters);
 	else
-		list_add_tail(&f->list, &s->filters);
+		list_add_tail(&f->list, &s->core_filters);
 }
 
 static void damos_del_filter(struct damos_filter *f)
@@ -397,7 +397,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 	 */
 	scheme->next_apply_sis = 0;
 	scheme->walk_completed = false;
-	INIT_LIST_HEAD(&scheme->filters);
+	INIT_LIST_HEAD(&scheme->core_filters);
 	INIT_LIST_HEAD(&scheme->ops_filters);
 	scheme->stat = (struct damos_stat){};
 	INIT_LIST_HEAD(&scheme->list);
@@ -995,7 +995,7 @@ static void damos_set_filters_default_reject(struct damos *s)
 		s->core_filters_default_reject = false;
 	else
 		s->core_filters_default_reject =
-			damos_filters_default_reject(&s->filters);
+			damos_filters_default_reject(&s->core_filters);
 	s->ops_filters_default_reject =
 		damos_filters_default_reject(&s->ops_filters);
 }
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 0d2d8cda8631..4380d0312d24 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -876,7 +876,7 @@ static void damos_test_commit_filter(struct kunit *test)
 static void damos_test_help_initailize_scheme(struct damos *scheme)
 {
 	INIT_LIST_HEAD(&scheme->quota.goals);
-	INIT_LIST_HEAD(&scheme->filters);
+	INIT_LIST_HEAD(&scheme->core_filters);
 	INIT_LIST_HEAD(&scheme->ops_filters);
 }
 
@@ -1140,7 +1140,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
 	struct damos scheme;
 	struct damos_filter *target_filter, *anon_filter;
 
-	INIT_LIST_HEAD(&scheme.filters);
+	INIT_LIST_HEAD(&scheme.core_filters);
 	INIT_LIST_HEAD(&scheme.ops_filters);
 
 	damos_set_filters_default_reject(&scheme);
diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py
index cb4fdbe68acb..5374d18d1fa8 100755
--- a/tools/testing/selftests/damon/drgn_dump_damon_status.py
+++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py
@@ -175,11 +175,11 @@ def scheme_to_dict(scheme):
         ['target_nid', int],
         ['migrate_dests', damos_migrate_dests_to_dict],
         ])
-    filters = []
+    core_filters = []
     for f in list_for_each_entry(
-            'struct damos_filter', scheme.filters.address_of_(), 'list'):
-        filters.append(damos_filter_to_dict(f))
-    dict_['filters'] = filters
+            'struct damos_filter', scheme.core_filters.address_of_(), 'list'):
+        core_filters.append(damos_filter_to_dict(f))
+    dict_['core_filters'] = core_filters
     ops_filters = []
     for f in list_for_each_entry(
             'struct damos_filter', scheme.ops_filters.address_of_(), 'list'):
diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py
index b34aea0a6775..b4c5ef5c4d69 100755
--- a/tools/testing/selftests/damon/sysfs.py
+++ b/tools/testing/selftests/damon/sysfs.py
@@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump):
     assert_watermarks_committed(scheme.watermarks, dump['wmarks'])
     # TODO: test filters directory
     for idx, f in enumerate(scheme.core_filters.filters):
-        assert_filter_committed(f, dump['filters'][idx])
+        assert_filter_committed(f, dump['core_filters'][idx])
     for idx, f in enumerate(scheme.ops_filters.filters):
         assert_filter_committed(f, dump['ops_filters'][idx])
 
-- 
cgit v1.2.3


From 6707915e030a3258868355f989b80140c1a45bbe Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 17 Nov 2025 17:33:38 +0000
Subject: mm: propagate VM_SOFTDIRTY on merge

Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2.

Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by
establishing a new VMA, or via merge) as implemented in __mmap_complete()
and do_brk_flags().

However, when performing a merge of existing mappings such as when
performing mprotect(), we may lose the VM_SOFTDIRTY flag.

Now that we have the concept of 'sticky' VMA flags - that is, flags which
both don't prevent a merge and, importantly, are propagated to merged
VMAs - making VM_SOFTDIRTY sticky seems a sensible alternative to the
existing special-casing of VM_SOFTDIRTY.

We additionally add a self-test that demonstrates that this logic behaves
as expected.


This patch (of 2):

Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by
establishing a new VMA, or via merge) as implemented in __mmap_complete()
and do_brk_flags().

However, when performing a merge of existing mappings such as when
performing mprotect(), we may lose the VM_SOFTDIRTY flag.

This is because currently we simply ignore VM_SOFTDIRTY for the purposes
of merge, so one VMA may possess the flag and another not, and whichever
happens to be the target VMA will be the one upon which the merge is
performed which may or may not have VM_SOFTDIRTY set.

Now that we have the concept of 'sticky' VMA flags, let's make
VM_SOFTDIRTY one, which solves this issue.

Additionally update VMA userland tests to propagate changes.

[akpm@linux-foundation.org: update comments, per Lorenzo]
  Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local
Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Andrey Vagin <avagin@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               | 15 +++++++--------
 tools/testing/vma/vma_internal.h | 18 ++++++------------
 2 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index af2904aeb163..bf660d5b6e97 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -532,28 +532,27 @@ extern unsigned int kobjsize(const void *objp);
  * possesses it but the other does not, the merged VMA should nonetheless have
  * applied to it:
  *
+ *   VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ *                  references cleared via /proc/$pid/clear_refs, any merged VMA
+ *                  should be considered soft-dirty also as it operates at a VMA
+ *                  granularity.
+ *
  * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
  *                  mapped page tables may contain metadata not described by the
  *                  VMA and thus any merged VMA may also contain this metadata,
  *                  and thus we must make this flag sticky.
  */
-#define VM_STICKY VM_MAYBE_GUARD
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
 
 /*
  * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
  * of these flags and the other not does not preclude a merge.
  *
- * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but
- *                dirty bit -- the caller should mark merged VMA as dirty. If
- *                dirty bit won't be excluded from comparison, we increase
- *                pressure on the memory system forcing the kernel to generate
- *                new VMAs when old one could be extended instead.
- *
  *    VM_STICKY - When merging VMAs, VMA flags must match, unless they are
  *                'sticky'. If any sticky flags exist in either VMA, we simply
  *                set all of them on the merged VMA.
  */
-#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
+#define VM_IGNORE_MERGE VM_STICKY
 
 /*
  * Flags which should result in page tables being copied on fork. These are
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 73a899ba2686..81b501f51948 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -122,28 +122,22 @@ extern unsigned long dac_mmap_min_addr;
  * possesses it but the other does not, the merged VMA should nonetheless have
  * applied to it:
  *
- * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
- *                  mapped page tables may contain metadata not described by the
- *                  VMA and thus any merged VMA may also contain this metadata,
- *                  and thus we must make this flag sticky.
+ *   VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ *                  references cleared via /proc/$pid/clear_refs, any merged VMA
+ *                  should be considered soft-dirty also as it operates at a VMA
+ *                  granularity.
  */
-#define VM_STICKY VM_MAYBE_GUARD
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
 
 /*
  * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
  * of these flags and the other not does not preclude a merge.
  *
- * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but
- *                dirty bit -- the caller should mark merged VMA as dirty. If
- *                dirty bit won't be excluded from comparison, we increase
- *                pressure on the memory system forcing the kernel to generate
- *                new VMAs when old one could be extended instead.
- *
  *    VM_STICKY - When merging VMAs, VMA flags must match, unless they are
  *                'sticky'. If any sticky flags exist in either VMA, we simply
  *                set all of them on the merged VMA.
  */
-#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY)
+#define VM_IGNORE_MERGE VM_STICKY
 
 /*
  * Flags which should result in page tables being copied on fork. These are
-- 
cgit v1.2.3


From 4015b979767125cf8a2233a145a3b3af78bfd8fb Mon Sep 17 00:00:00 2001
From: Chris Lu <chris.lu@mediatek.com>
Date: Wed, 12 Nov 2025 15:53:34 +0800
Subject: Bluetooth: btusb: mediatek: Fix kernel crash when releasing mtk iso
 interface

When performing reset tests and encountering abnormal card drop issues
that lead to a kernel crash, it is necessary to perform a null check
before releasing resources to avoid attempting to release a null pointer.

<4>[   29.158070] Hardware name: Google Quigon sku196612/196613 board (DT)
<4>[   29.158076] Workqueue: hci0 hci_cmd_sync_work [bluetooth]
<4>[   29.158154] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
<4>[   29.158162] pc : klist_remove+0x90/0x158
<4>[   29.158174] lr : klist_remove+0x88/0x158
<4>[   29.158180] sp : ffffffc0846b3c00
<4>[   29.158185] pmr_save: 000000e0
<4>[   29.158188] x29: ffffffc0846b3c30 x28: ffffff80cd31f880 x27: ffffff80c1bdc058
<4>[   29.158199] x26: dead000000000100 x25: ffffffdbdc624ea3 x24: ffffff80c1bdc4c0
<4>[   29.158209] x23: ffffffdbdc62a3e6 x22: ffffff80c6c07000 x21: ffffffdbdc829290
<4>[   29.158219] x20: 0000000000000000 x19: ffffff80cd3e0648 x18: 000000031ec97781
<4>[   29.158229] x17: ffffff80c1bdc4a8 x16: ffffffdc10576548 x15: ffffff80c1180428
<4>[   29.158238] x14: 0000000000000000 x13: 000000000000e380 x12: 0000000000000018
<4>[   29.158248] x11: ffffff80c2a7fd10 x10: 0000000000000000 x9 : 0000000100000000
<4>[   29.158257] x8 : 0000000000000000 x7 : 7f7f7f7f7f7f7f7f x6 : 2d7223ff6364626d
<4>[   29.158266] x5 : 0000008000000000 x4 : 0000000000000020 x3 : 2e7325006465636e
<4>[   29.158275] x2 : ffffffdc11afeff8 x1 : 0000000000000000 x0 : ffffffdc11be4d0c
<4>[   29.158285] Call trace:
<4>[   29.158290]  klist_remove+0x90/0x158
<4>[   29.158298]  device_release_driver_internal+0x20c/0x268
<4>[   29.158308]  device_release_driver+0x1c/0x30
<4>[   29.158316]  usb_driver_release_interface+0x70/0x88
<4>[   29.158325]  btusb_mtk_release_iso_intf+0x68/0xd8 [btusb (HASH:e8b6 5)]
<4>[   29.158347]  btusb_mtk_reset+0x5c/0x480 [btusb (HASH:e8b6 5)]
<4>[   29.158361]  hci_cmd_sync_work+0x10c/0x188 [bluetooth (HASH:a4fa 6)]
<4>[   29.158430]  process_scheduled_works+0x258/0x4e8
<4>[   29.158441]  worker_thread+0x300/0x428
<4>[   29.158448]  kthread+0x108/0x1d0
<4>[   29.158455]  ret_from_fork+0x10/0x20
<0>[   29.158467] Code: 91343000 940139d1 f9400268 927ff914 (f9401297)
<4>[   29.158474] ---[ end trace 0000000000000000 ]---
<0>[   29.167129] Kernel panic - not syncing: Oops: Fatal exception
<2>[   29.167144] SMP: stopping secondary CPUs
<4>[   29.167158] ------------[ cut here ]------------

Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions")
Signed-off-by: Chris Lu <chris.lu@mediatek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c        | 34 +++++++++++++++++++++++++++-------
 include/net/bluetooth/hci_core.h |  1 -
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index a722446ec73d..202a845e0236 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2711,9 +2711,16 @@ static int btusb_recv_event_realtek(struct hci_dev *hdev, struct sk_buff *skb)
 
 static void btusb_mtk_claim_iso_intf(struct btusb_data *data)
 {
-	struct btmtk_data *btmtk_data = hci_get_priv(data->hdev);
+	struct btmtk_data *btmtk_data;
 	int err;
 
+	if (!data->hdev)
+		return;
+
+	btmtk_data = hci_get_priv(data->hdev);
+	if (!btmtk_data)
+		return;
+
 	/*
 	 * The function usb_driver_claim_interface() is documented to need
 	 * locks held if it's not called from a probe routine. The code here
@@ -2735,17 +2742,30 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data)
 
 static void btusb_mtk_release_iso_intf(struct hci_dev *hdev)
 {
-	struct btmtk_data *btmtk_data = hci_get_priv(hdev);
+	struct btmtk_data *btmtk_data;
+
+	if (!hdev)
+		return;
+
+	btmtk_data = hci_get_priv(hdev);
+	if (!btmtk_data)
+		return;
 
 	if (test_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags)) {
 		usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor);
 		clear_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags);
 
-		dev_kfree_skb_irq(btmtk_data->isopkt_skb);
-		btmtk_data->isopkt_skb = NULL;
-		usb_set_intfdata(btmtk_data->isopkt_intf, NULL);
-		usb_driver_release_interface(&btusb_driver,
-					     btmtk_data->isopkt_intf);
+		if (btmtk_data->isopkt_skb) {
+			dev_kfree_skb_irq(btmtk_data->isopkt_skb);
+			btmtk_data->isopkt_skb = NULL;
+		}
+
+		if (btmtk_data->isopkt_intf) {
+			usb_set_intfdata(btmtk_data->isopkt_intf, NULL);
+			usb_driver_release_interface(&btusb_driver,
+						     btmtk_data->isopkt_intf);
+			btmtk_data->isopkt_intf = NULL;
+		}
 	}
 
 	clear_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags);
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b8100dbfe5d7..32b1c08c8bba 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -749,7 +749,6 @@ struct hci_conn {
 
 	__u8		remote_cap;
 	__u8		remote_auth;
-	__u8		remote_id;
 
 	unsigned int	sent;
 
-- 
cgit v1.2.3


From 79a2d4678ba90bdba577dc3af88cc900d6dcd5ee Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 15 Nov 2025 18:43:55 +0200
Subject: Bluetooth: hci_core: lookup hci_conn on RX path on protocol side

The hdev lock/lookup/unlock/use pattern in the packet RX path doesn't
ensure hci_conn* is not concurrently modified/deleted. This locking
appears to be a leftover from before conn_hash started using RCU in
commit bf4c63252490b ("Bluetooth: convert conn hash to RCU"),
and it is not clear whether it has had any purpose since then.

Currently, there are code paths that delete hci_conn* from contexts other
than the ordered hdev->workqueue in which the RX work runs. E.g.
commit 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync")
introduced some of these, and there probably were a few others before
it.  It's better to do the locking so that even if these run
concurrently no UAF is possible.

Move the lookup of hci_conn and associated socket-specific conn to
protocol recv handlers, and do them within a single critical section
to cover hci_conn* usage and lookup.

syzkaller has reported a crash that appears to be this issue:

    [Task hdev->workqueue]          [Task 2]
                                    hci_disconnect_all_sync
    l2cap_recv_acldata(hcon)
                                      hci_conn_get(hcon)
                                      hci_abort_conn_sync(hcon)
                                        hci_dev_lock
      hci_dev_lock
                                        hci_conn_del(hcon)
      v-------------------------------- hci_dev_unlock
                                      hci_conn_put(hcon)
      conn = hcon->l2cap_data (UAF)

Fixes: 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync")
Reported-by: syzbot+d32d77220b92eddd89ad@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d32d77220b92eddd89ad
Signed-off-by: Pauli Virtanen <pav@iki.fi>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 20 +++++++----
 net/bluetooth/hci_core.c         | 73 ++++++++++++++--------------------------
 net/bluetooth/iso.c              | 30 ++++++++++++++---
 net/bluetooth/l2cap_core.c       | 23 ++++++++++---
 net/bluetooth/sco.c              | 35 ++++++++++++++-----
 5 files changed, 108 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 32b1c08c8bba..0cb87687837f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -856,11 +856,12 @@ extern struct mutex hci_cb_list_lock;
 /* ----- HCI interface to upper protocols ----- */
 int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
 int l2cap_disconn_ind(struct hci_conn *hcon);
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+		       u16 flags);
 
 #if IS_ENABLED(CONFIG_BT_BREDR)
 int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb);
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb);
 #else
 static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 				  __u8 *flags)
@@ -868,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 	return 0;
 }
 
-static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle,
+				   struct sk_buff *skb)
 {
+	kfree_skb(skb);
+	return -ENOENT;
 }
 #endif
 
 #if IS_ENABLED(CONFIG_BT_LE)
 int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+	     u16 flags);
 #else
 static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 				  __u8 *flags)
 {
 	return 0;
 }
-static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
-			    u16 flags)
+
+static inline int iso_recv(struct hci_dev *hdev, u16 handle,
+			   struct sk_buff *skb, u16 flags)
 {
+	kfree_skb(skb);
+	return -ENOENT;
 }
 #endif
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 1920e3d62bda..8ccec73dce45 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3832,13 +3832,14 @@ static void hci_tx_work(struct work_struct *work)
 static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_acl_hdr *hdr;
-	struct hci_conn *conn;
 	__u16 handle, flags;
+	int err;
 
 	hdr = skb_pull_data(skb, sizeof(*hdr));
 	if (!hdr) {
 		bt_dev_err(hdev, "ACL packet too small");
-		goto drop;
+		kfree_skb(skb);
+		return;
 	}
 
 	handle = __le16_to_cpu(hdr->handle);
@@ -3850,36 +3851,27 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 
 	hdev->stat.acl_rx++;
 
-	hci_dev_lock(hdev);
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	hci_dev_unlock(hdev);
-
-	if (conn) {
-		hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF);
-
-		/* Send to upper protocol */
-		l2cap_recv_acldata(conn, skb, flags);
-		return;
-	} else {
+	err = l2cap_recv_acldata(hdev, handle, skb, flags);
+	if (err == -ENOENT)
 		bt_dev_err(hdev, "ACL packet for unknown connection handle %d",
 			   handle);
-	}
-
-drop:
-	kfree_skb(skb);
+	else if (err)
+		bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d",
+			   handle, err);
 }
 
 /* SCO data packet */
 static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_sco_hdr *hdr;
-	struct hci_conn *conn;
 	__u16 handle, flags;
+	int err;
 
 	hdr = skb_pull_data(skb, sizeof(*hdr));
 	if (!hdr) {
 		bt_dev_err(hdev, "SCO packet too small");
-		goto drop;
+		kfree_skb(skb);
+		return;
 	}
 
 	handle = __le16_to_cpu(hdr->handle);
@@ -3891,34 +3883,28 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 
 	hdev->stat.sco_rx++;
 
-	hci_dev_lock(hdev);
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	hci_dev_unlock(hdev);
+	hci_skb_pkt_status(skb) = flags & 0x03;
 
-	if (conn) {
-		/* Send to upper protocol */
-		hci_skb_pkt_status(skb) = flags & 0x03;
-		sco_recv_scodata(conn, skb);
-		return;
-	} else {
+	err = sco_recv_scodata(hdev, handle, skb);
+	if (err == -ENOENT)
 		bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
 				       handle);
-	}
-
-drop:
-	kfree_skb(skb);
+	else if (err)
+		bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d",
+			   handle, err);
 }
 
 static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_iso_hdr *hdr;
-	struct hci_conn *conn;
 	__u16 handle, flags;
+	int err;
 
 	hdr = skb_pull_data(skb, sizeof(*hdr));
 	if (!hdr) {
 		bt_dev_err(hdev, "ISO packet too small");
-		goto drop;
+		kfree_skb(skb);
+		return;
 	}
 
 	handle = __le16_to_cpu(hdr->handle);
@@ -3928,22 +3914,13 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
 	bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
 		   handle, flags);
 
-	hci_dev_lock(hdev);
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	hci_dev_unlock(hdev);
-
-	if (!conn) {
+	err = iso_recv(hdev, handle, skb, flags);
+	if (err == -ENOENT)
 		bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
 			   handle);
-		goto drop;
-	}
-
-	/* Send to upper protocol */
-	iso_recv(conn, skb, flags);
-	return;
-
-drop:
-	kfree_skb(skb);
+	else if (err)
+		bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d",
+			   handle, err);
 }
 
 static bool hci_req_is_complete(struct hci_dev *hdev)
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 3d98cb6291da..616c2fef91d2 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -2314,14 +2314,31 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
 	iso_conn_del(hcon, bt_to_errno(reason));
 }
 
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags)
 {
-	struct iso_conn *conn = hcon->iso_data;
+	struct hci_conn *hcon;
+	struct iso_conn *conn;
 	struct skb_shared_hwtstamps *hwts;
 	__u16 pb, ts, len, sn;
 
-	if (!conn)
-		goto drop;
+	hci_dev_lock(hdev);
+
+	hcon = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!hcon) {
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+		return -ENOENT;
+	}
+
+	conn = iso_conn_hold_unless_zero(hcon->iso_data);
+	hcon = NULL;
+
+	hci_dev_unlock(hdev);
+
+	if (!conn) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
 
 	pb     = hci_iso_flags_pb(flags);
 	ts     = hci_iso_flags_ts(flags);
@@ -2377,7 +2394,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 			hci_skb_pkt_status(skb) = flags & 0x03;
 			hci_skb_pkt_seqnum(skb) = sn;
 			iso_recv_frame(conn, skb);
-			return;
+			goto done;
 		}
 
 		if (pb == ISO_SINGLE) {
@@ -2455,6 +2472,9 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 
 drop:
 	kfree_skb(skb);
+done:
+	iso_conn_put(conn);
+	return 0;
 }
 
 static struct hci_cb iso_cb = {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 35c57657bcf4..07b493331fd7 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -7510,13 +7510,24 @@ struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c)
 	return c;
 }
 
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle,
+		       struct sk_buff *skb, u16 flags)
 {
+	struct hci_conn *hcon;
 	struct l2cap_conn *conn;
 	int len;
 
-	/* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */
-	hci_dev_lock(hcon->hdev);
+	/* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */
+	hci_dev_lock(hdev);
+
+	hcon = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!hcon) {
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+		return -ENOENT;
+	}
+
+	hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF);
 
 	conn = hcon->l2cap_data;
 
@@ -7524,12 +7535,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 		conn = l2cap_conn_add(hcon);
 
 	conn = l2cap_conn_hold_unless_zero(conn);
+	hcon = NULL;
 
-	hci_dev_unlock(hcon->hdev);
+	hci_dev_unlock(hdev);
 
 	if (!conn) {
 		kfree_skb(skb);
-		return;
+		return -EINVAL;
 	}
 
 	BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags);
@@ -7643,6 +7655,7 @@ drop:
 unlock:
 	mutex_unlock(&conn->lock);
 	l2cap_conn_put(conn);
+	return 0;
 }
 
 static struct hci_cb l2cap_cb = {
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ab0cf442d57b..298c2a9ab4df 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1458,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
 	sco_conn_del(hcon, bt_to_errno(reason));
 }
 
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb)
 {
-	struct sco_conn *conn = hcon->sco_data;
+	struct hci_conn *hcon;
+	struct sco_conn *conn;
 
-	if (!conn)
-		goto drop;
+	hci_dev_lock(hdev);
+
+	hcon = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!hcon) {
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+		return -ENOENT;
+	}
+
+	conn = sco_conn_hold_unless_zero(hcon->sco_data);
+	hcon = NULL;
+
+	hci_dev_unlock(hdev);
+
+	if (!conn) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
 
 	BT_DBG("conn %p len %u", conn, skb->len);
 
-	if (skb->len) {
+	if (skb->len)
 		sco_recv_frame(conn, skb);
-		return;
-	}
+	else
+		kfree_skb(skb);
 
-drop:
-	kfree_skb(skb);
+	sco_conn_put(conn);
+	return 0;
 }
 
 static struct hci_cb sco_cb = {
-- 
cgit v1.2.3


From 760fc597c33d5a727507c8bb19d6ab87a8c5885b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 30 Oct 2025 12:44:18 +0100
Subject: panic: sys_info: align constant definition names with parameters

Align constant definition names with parameters to make it easier to map.
It's also better to maintain and extend the names while keeping their
uniqueness.

Link: https://lkml.kernel.org/r/20251030132007.3742368-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sys_info.h | 2 +-
 kernel/panic.c           | 2 +-
 lib/sys_info.c           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h
index 89d77dc4f2ed..a5bc3ea3d44b 100644
--- a/include/linux/sys_info.h
+++ b/include/linux/sys_info.h
@@ -14,7 +14,7 @@
 #define SYS_INFO_LOCKS			0x00000008
 #define SYS_INFO_FTRACE			0x00000010
 #define SYS_INFO_PANIC_CONSOLE_REPLAY	0x00000020
-#define SYS_INFO_ALL_CPU_BT		0x00000040
+#define SYS_INFO_ALL_BT			0x00000040
 #define SYS_INFO_BLOCKED_TASKS		0x00000080
 
 void sys_info(unsigned long si_mask);
diff --git a/kernel/panic.c b/kernel/panic.c
index ffceb6f13935..a9af1bbe16b0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void)
  */
 static void panic_other_cpus_shutdown(bool crash_kexec)
 {
-	if (panic_print & SYS_INFO_ALL_CPU_BT)
+	if (panic_print & SYS_INFO_ALL_BT)
 		panic_trigger_all_cpu_backtrace();
 
 	/*
diff --git a/lib/sys_info.c b/lib/sys_info.c
index d542a024406a..6b0188b30227 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -23,7 +23,7 @@ static const struct sys_info_name  si_names[] = {
 	{ SYS_INFO_TIMERS,		"timers" },
 	{ SYS_INFO_LOCKS,		"locks" },
 	{ SYS_INFO_FTRACE,		"ftrace" },
-	{ SYS_INFO_ALL_CPU_BT,		"all_bt" },
+	{ SYS_INFO_ALL_BT,		"all_bt" },
 	{ SYS_INFO_BLOCKED_TASKS,	"blocked_tasks" },
 };
 
@@ -118,7 +118,7 @@ void sys_info(unsigned long si_mask)
 	if (si_mask & SYS_INFO_FTRACE)
 		ftrace_dump(DUMP_ALL);
 
-	if (si_mask & SYS_INFO_ALL_CPU_BT)
+	if (si_mask & SYS_INFO_ALL_BT)
 		trigger_all_cpu_backtrace();
 
 	if (si_mask & SYS_INFO_BLOCKED_TASKS)
-- 
cgit v1.2.3


From bd97c976419126ee3e9acd4957f6f16a90316643 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 4 Nov 2025 19:38:34 +0100
Subject: util_macros.h: fix kernel-doc for u64_to_user_ptr()

The added documentation to u64_to_user_ptr() misspelled the function name.
Fix it.

Link: https://lkml.kernel.org/r/20251104183834.1046584-1-andriy.shevchenko@linux.intel.com
Fixes: 029c896c4105 ("kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexandru Ardelean <aardelean@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/util_macros.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h
index 9373962aade9..2eb528058d0d 100644
--- a/include/linux/util_macros.h
+++ b/include/linux/util_macros.h
@@ -136,10 +136,10 @@
 #define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
 
 /**
- * to_user_ptr - cast a pointer passed as u64 from user space to void __user *
+ * u64_to_user_ptr - cast a pointer passed as u64 from user space to void __user *
  * @x: The u64 value from user space, usually via IOCTL
  *
- * to_user_ptr() simply casts a pointer passed as u64 from user space to void
+ * u64_to_user_ptr() simply casts a pointer passed as u64 from user space to void
  * __user * correctly. Using this lets us get rid of all the tiresome casts.
  */
 #define u64_to_user_ptr(x)		\
-- 
cgit v1.2.3


From 6480241f31f543333ed0c7a209962412461f6e41 Mon Sep 17 00:00:00 2001
From: David Laight <david.laight.linux@gmail.com>
Date: Wed, 5 Nov 2025 20:10:30 +0000
Subject: lib: add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing mul_u64_u64_div_u64() rounds down, a 'rounding up' variant
needs 'divisor - 1' adding in between the multiply and divide so cannot
easily be done by a caller.

Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d and
implement the 'round down' and 'round up' using it.

Update the x86-64 asm to optimise for 'c' being a constant zero.

Add kernel-doc definitions for all three functions.

Link: https://lkml.kernel.org/r/20251105201035.64043-5-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/div64.h | 20 ++++++++++--------
 include/linux/math64.h       | 48 +++++++++++++++++++++++++++++++++++++++++++-
 lib/math/div64.c             | 14 +++++++------
 3 files changed, 67 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..6d8a3de3f43a 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -84,21 +84,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
  * Will generate an #DE when the result doesn't fit u64, could fix with an
  * __ex_table[] entry when it becomes an issue.
  */
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
 {
-	u64 q;
+	u64 rdx;
 
-	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" (mul), "rm" (div)
-				: "rdx");
+	asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
 
-	return q;
+	if (!statically_true(!add))
+		asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
+			[lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
+
+	asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
+
+	return rax;
 }
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
 
 static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
 {
-	return mul_u64_u64_div_u64(a, mul, div);
+	return mul_u64_add_u64_div_u64(a, mul, 0, div);
 }
 #define mul_u64_u32_div	mul_u64_u32_div
 
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6aaccc1626ab..e889d850b7f1 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -282,7 +282,53 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */
 
-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+/**
+ * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @c: unsigned 64bit addend
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * add a third value and then divide by a fourth.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: (@a * @b + @c) / @d
+ */
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+/**
+ * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide by a third value.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: @a * @b / @d
+ */
+#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
+
+/**
+ * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide and round up.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: (@a * @b + @d - 1) / @d
+ */
+#define mul_u64_u64_div_u64_roundup(a, b, d) \
+	({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
+
 
 /**
  * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 4a4b1aa9e6e1..a88391b8ada0 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -183,13 +183,13 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
 
-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
+#ifndef mul_u64_add_u64_div_u64
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
 {
 #if defined(__SIZEOF_INT128__)
 
 	/* native 64x64=128 bits multiplication */
-	u128 prod = (u128)a * b;
+	u128 prod = (u128)a * b + c;
 	u64 n_lo = prod, n_hi = prod >> 64;
 
 #else
@@ -198,8 +198,10 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
 	u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
 	u64 x, y, z;
 
-	x = (u64)a_lo * b_lo;
-	y = (u64)a_lo * b_hi + (u32)(x >> 32);
+	/* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
+	x = (u64)a_lo * b_lo + (u32)c;
+	y = (u64)a_lo * b_hi + (u32)(c >> 32);
+	y += (u32)(x >> 32);
 	z = (u64)a_hi * b_hi + (u32)(y >> 32);
 	y = (u64)a_hi * b_lo + (u32)y;
 	z += (u32)(y >> 32);
@@ -265,5 +267,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
 
 	return res;
 }
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
 #endif
-- 
cgit v1.2.3


From 630f96a687def5616d6fa7f069adcea158320909 Mon Sep 17 00:00:00 2001
From: David Laight <david.laight.linux@gmail.com>
Date: Wed, 5 Nov 2025 20:10:33 +0000
Subject: lib: mul_u64_u64_div_u64(): optimise multiply on 32bit x86
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a +
u32_b).  As well as the extra instructions it can generate a lot of spills
to stack (including spills of constant zeros and even multiplies by
constant zero).

mul_u32_u32() already exists to optimise the multiply.  Add a similar
add_u64_u32() for the addition.  Disable both for clang - it generates
better code without them.

Move the 64x64 => 128 multiply into a static inline helper function for
code clarity.  No need for the a/b_hi/lo variables, the implicit casts on
the function calls do the work for us.  Should have minimal effect on the
generated code.

Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply in
mul_u64_add_u64_div_u64().

Link: https://lkml.kernel.org/r/20251105201035.64043-8-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/div64.h | 19 +++++++++++++++++++
 include/linux/math64.h       | 11 +++++++++++
 lib/math/div64.c             | 40 ++++++++++++++++++++++++++--------------
 3 files changed, 56 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 6d8a3de3f43a..30fd06ede751 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 }
 #define div_u64_rem	div_u64_rem
 
+/*
+ * gcc tends to zero extend 32bit values and do full 64bit maths.
+ * Define asm functions that avoid this.
+ * (clang generates better code for the C versions.)
+ */
+#ifndef __clang__
 static inline u64 mul_u32_u32(u32 a, u32 b)
 {
 	u32 high, low;
@@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 }
 #define mul_u32_u32 mul_u32_u32
 
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+	u32 high = a >> 32, low = a;
+
+	asm ("addl %[b], %[low]; adcl $0, %[high]"
+		: [low] "+r" (low), [high] "+r" (high)
+		: [b] "rm" (b) );
+
+	return low | (u64)high << 32;
+}
+#define add_u64_u32 add_u64_u32
+#endif
+
 /*
  * __div64_32() is never called on x86, so prevent the
  * generic definition from getting built.
diff --git a/include/linux/math64.h b/include/linux/math64.h
index e889d850b7f1..cc305206d89f 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -158,6 +158,17 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 }
 #endif
 
+#ifndef add_u64_u32
+/*
+ * Many a GCC version also messes this up.
+ * Zero extending b and then spilling everything to stack.
+ */
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+	return a + b;
+}
+#endif
+
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
 #ifndef mul_u64_u32_shr
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 18a9ba26c418..bb57a48ce36a 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -186,33 +186,45 @@ EXPORT_SYMBOL(iter_div_u64_rem);
 #endif
 
 #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
-u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
-{
+
+#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c)
+
 #if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
 
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
 	/* native 64x64=128 bits multiplication */
 	u128 prod = (u128)a * b + c;
-	u64 n_lo = prod, n_hi = prod >> 64;
+
+	*p_lo = prod;
+	return prod >> 64;
+}
 
 #else
 
-	/* perform a 64x64=128 bits multiplication manually */
-	u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
+	/* perform a 64x64=128 bits multiplication in 32bit chunks */
 	u64 x, y, z;
 
 	/* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
-	x = (u64)a_lo * b_lo + (u32)c;
-	y = (u64)a_lo * b_hi + (u32)(c >> 32);
-	y += (u32)(x >> 32);
-	z = (u64)a_hi * b_hi + (u32)(y >> 32);
-	y = (u64)a_hi * b_lo + (u32)y;
-	z += (u32)(y >> 32);
-	x = (y << 32) + (u32)x;
-
-	u64 n_lo = x, n_hi = z;
+	x = mul_add(a, b, c);
+	y = mul_add(a, b >> 32, c >> 32);
+	y = add_u64_u32(y, x >> 32);
+	z = mul_add(a >> 32, b >> 32, y >> 32);
+	y = mul_add(a >> 32, b, y);
+	*p_lo = (y << 32) + (u32)x;
+	return add_u64_u32(z, y >> 32);
+}
 
 #endif
 
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
+{
+	u64 n_lo, n_hi;
+
+	n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
+
 	if (!n_hi)
 		return div64_u64(n_lo, d);
 
-- 
cgit v1.2.3


From f3fb126fdc9e148da38a6e25d7fc609774a99fc3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 6 Nov 2025 16:20:51 +0100
Subject: math.h: amend abs() kernel-doc and add a note about signed type
 limits

- amend the kernel-doc so the description is decoupled from the
  parameter descriptions.

- add a note to explain behaviour for the signed types when supplied
  value is the minimum (e.g., INT_MIN for int type).

Link: https://lkml.kernel.org/r/20251106152051.2361551-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/math.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/math.h b/include/linux/math.h
index 0198c92cbe3e..6dc1d1d32fbc 100644
--- a/include/linux/math.h
+++ b/include/linux/math.h
@@ -148,11 +148,16 @@ __STRUCT_FRACT(u32)
 
 /**
  * abs - return absolute value of an argument
- * @x: the value.  If it is unsigned type, it is converted to signed type first.
- *     char is treated as if it was signed (regardless of whether it really is)
- *     but the macro's return type is preserved as char.
+ * @x: the value.
  *
- * Return: an absolute value of x.
+ * If it is unsigned type, @x is converted to signed type first.
+ * char is treated as if it was signed (regardless of whether it really is)
+ * but the macro's return type is preserved as char.
+ *
+ * NOTE, for signed type if @x is the minimum, the returned result is undefined
+ * as there is not enough bits to represent it as a positive number.
+ *
+ * Return: an absolute value of @x.
  */
 #define abs(x)	__abs_choose_expr(x, long long,				\
 		__abs_choose_expr(x, long,				\
-- 
cgit v1.2.3


From 242b872239f6a7deacbc20ab9406ea40cb738ec6 Mon Sep 17 00:00:00 2001
From: Xie Yuanbin <qq570070308@gmail.com>
Date: Sun, 9 Nov 2025 16:37:15 +0800
Subject: include/linux/once_lite.h: fix judgment in WARN_ONCE with clang

For c code:
```c
extern int xx;
void test(void)
{
	if (WARN_ONCE(xx, "x"))
		__asm__ volatile ("nop":::);
}
```

Clang will generate the following assembly code:
```assemble
test:
	movl	xx(%rip), %eax // Assume xx == 0 (likely case)
	testl	%eax, %eax // judge once
	je	.LBB0_3    // jump to .LBB0_3
	testb	$1, test.__already_done(%rip)
	je	.LBB0_2
.LBB0_3:
	testl	%eax, %eax // judge again
	je	.LBB0_5    // jump to .LBB0_5
.LBB0_4:
	nop
.LBB0_5:
	retq
	// omit
```

In the above code, `xx == 0` should be the likely case, yet on that path
xx is tested twice.

Test info:
1. kernel source:
linux-next
commit 9c0826a5d9aa4d52206d ("Add linux-next specific files for 20251107")
2. compiler:
clang: Debian clang version 21.1.4 (8) with
Debian LLD 21.1.4 (compatible with GNU linkers)
3. config:
base on default x86_64_defconfig, and setting:
CONFIG_MITIGATION_RETHUNK=n
CONFIG_STACKPROTECTOR=n

Add unlikely to __ret_cond to help the compiler optimize correctly.

[akpm@linux-foundation.org: undo whitespace changes]
Link: https://lkml.kernel.org/r/20251109083715.24495-1-qq570070308@gmail.com
Signed-off-by: Xie Yuanbin <qq570070308@gmail.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Maninder Singh <maninder1.s@samsung.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/once_lite.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h
index 27de7bc32a06..236592c4eeb1 100644
--- a/include/linux/once_lite.h
+++ b/include/linux/once_lite.h
@@ -16,7 +16,7 @@
 		bool __ret_cond = !!(condition);			\
 		bool __ret_once = false;				\
 									\
-		if (unlikely(__ret_cond && !__already_done)) {		\
+		if (unlikely(__ret_cond) && unlikely(!__already_done)) {\
 			__already_done = true;				\
 			__ret_once = true;				\
 		}							\
-- 
cgit v1.2.3


From f1e2ca801c54dfc09d6a5540207cec25e8d43f6f Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Fri, 14 Nov 2025 14:00:45 +0800
Subject: lib/base64: add support for multiple variants

Patch series "lib/base64: add generic encoder/decoder, migrate users", v5.

This series introduces a generic Base64 encoder/decoder to the kernel
library, eliminating duplicated implementations and delivering significant
performance improvements.

The Base64 API has been extended to support multiple variants (Standard,
URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501.  The API now
takes a variant parameter and an option to control padding.  As part of
this series, users are migrated to the new interface while preserving
their specific formats: fscrypt now uses BASE64_URLSAFE, Ceph uses
BASE64_IMAP, and NVMe is updated to BASE64_STD.

On the encoder side, the implementation processes input in 3-byte blocks,
mapping 24 bits directly to 4 output symbols.  This avoids bit-by-bit
streaming and reduces loop overhead, achieving about a 2.7x speedup
compared to previous implementations.

On the decoder side, replace strchr() lookups with per-variant reverse
tables and process input in 4-character groups.  Each group is mapped to
numeric values and combined into 3 bytes.  Padded and unpadded forms are
validated explicitly, rejecting invalid '=' usage and enforcing tail
rules.  This improves throughput by ~43-52x.


This patch (of 6):

Extend the base64 API to support multiple variants (standard, URL-safe,
and IMAP) as defined in RFC 4648 and RFC 3501.  The API now takes a
variant parameter and an option to control padding.  Update NVMe auth code
to use the new interface with BASE64_STD.

Link: https://lkml.kernel.org/r/20251114055829.87814-1-409411716@gms.tku.edu.tw
Link: https://lkml.kernel.org/r/20251114060045.88792-1-409411716@gms.tku.edu.tw
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: David Laight <david.laight.linux@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/nvme/common/auth.c |  4 +--
 include/linux/base64.h     | 10 ++++++--
 lib/base64.c               | 62 +++++++++++++++++++++++++++-------------------
 3 files changed, 46 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 1f51fbebd9fa..e07e7d4bf8b6 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -178,7 +178,7 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 	if (!key)
 		return ERR_PTR(-ENOMEM);
 
-	key_len = base64_decode(secret, allocated_len, key->key);
+	key_len = base64_decode(secret, allocated_len, key->key, true, BASE64_STD);
 	if (key_len < 0) {
 		pr_debug("base64 key decoding error %d\n",
 			 key_len);
@@ -663,7 +663,7 @@ int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len,
 	if (ret)
 		goto out_free_digest;
 
-	ret = base64_encode(digest, digest_len, enc);
+	ret = base64_encode(digest, digest_len, enc, true, BASE64_STD);
 	if (ret < hmac_len) {
 		ret = -ENOKEY;
 		goto out_free_digest;
diff --git a/include/linux/base64.h b/include/linux/base64.h
index 660d4cb1ef31..a2c6c9222da3 100644
--- a/include/linux/base64.h
+++ b/include/linux/base64.h
@@ -8,9 +8,15 @@
 
 #include <linux/types.h>
 
+enum base64_variant {
+	BASE64_STD,       /* RFC 4648 (standard) */
+	BASE64_URLSAFE,   /* RFC 4648 (base64url) */
+	BASE64_IMAP,      /* RFC 3501 */
+};
+
 #define BASE64_CHARS(nbytes)   DIV_ROUND_UP((nbytes) * 4, 3)
 
-int base64_encode(const u8 *src, int len, char *dst);
-int base64_decode(const char *src, int len, u8 *dst);
+int base64_encode(const u8 *src, int len, char *dst, bool padding, enum base64_variant variant);
+int base64_decode(const char *src, int len, u8 *dst, bool padding, enum base64_variant variant);
 
 #endif /* _LINUX_BASE64_H */
diff --git a/lib/base64.c b/lib/base64.c
index b736a7a431c5..a7c20a8e8e98 100644
--- a/lib/base64.c
+++ b/lib/base64.c
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * base64.c - RFC4648-compliant base64 encoding
+ * base64.c - Base64 with support for multiple variants
  *
  * Copyright (c) 2020 Hannes Reinecke, SUSE
  *
  * Based on the base64url routines from fs/crypto/fname.c
- * (which are using the URL-safe base64 encoding),
- * modified to use the standard coding table from RFC4648 section 4.
+ * (which are using the URL-safe Base64 encoding),
+ * modified to support multiple Base64 variants.
  */
 
 #include <linux/kernel.h>
@@ -15,26 +15,31 @@
 #include <linux/string.h>
 #include <linux/base64.h>
 
-static const char base64_table[65] =
-	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char base64_tables[][65] = {
+	[BASE64_STD] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",
+	[BASE64_URLSAFE] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_",
+	[BASE64_IMAP] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,",
+};
 
 /**
- * base64_encode() - base64-encode some binary data
+ * base64_encode() - Base64-encode some binary data
  * @src: the binary data to encode
  * @srclen: the length of @src in bytes
- * @dst: (output) the base64-encoded string.  Not NUL-terminated.
+ * @dst: (output) the Base64-encoded string.  Not NUL-terminated.
+ * @padding: whether to append '=' padding characters
+ * @variant: which base64 variant to use
  *
- * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
- * by RFC 4648, including the  '='-padding.
+ * Encodes data using the selected Base64 variant.
  *
- * Return: the length of the resulting base64-encoded string in bytes.
+ * Return: the length of the resulting Base64-encoded string in bytes.
  */
-int base64_encode(const u8 *src, int srclen, char *dst)
+int base64_encode(const u8 *src, int srclen, char *dst, bool padding, enum base64_variant variant)
 {
 	u32 ac = 0;
 	int bits = 0;
 	int i;
 	char *cp = dst;
+	const char *base64_table = base64_tables[variant];
 
 	for (i = 0; i < srclen; i++) {
 		ac = (ac << 8) | src[i];
@@ -48,44 +53,49 @@ int base64_encode(const u8 *src, int srclen, char *dst)
 		*cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
 		bits -= 6;
 	}
-	while (bits < 0) {
-		*cp++ = '=';
-		bits += 2;
+	if (padding) {
+		while (bits < 0) {
+			*cp++ = '=';
+			bits += 2;
+		}
 	}
 	return cp - dst;
 }
 EXPORT_SYMBOL_GPL(base64_encode);
 
 /**
- * base64_decode() - base64-decode a string
+ * base64_decode() - Base64-decode a string
  * @src: the string to decode.  Doesn't need to be NUL-terminated.
  * @srclen: the length of @src in bytes
  * @dst: (output) the decoded binary data
+ * @padding: whether '=' padding characters are accepted in @src
+ * @variant: which base64 variant to use
  *
- * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
- * specified by RFC 4648, including the  '='-padding.
+ * Decodes a string using the selected Base64 variant.
  *
  * This implementation hasn't been optimized for performance.
  *
  * Return: the length of the resulting decoded binary data in bytes,
- *	   or -1 if the string isn't a valid base64 string.
+ *	   or -1 if the string isn't a valid Base64 string.
  */
-int base64_decode(const char *src, int srclen, u8 *dst)
+int base64_decode(const char *src, int srclen, u8 *dst, bool padding, enum base64_variant variant)
 {
 	u32 ac = 0;
 	int bits = 0;
 	int i;
 	u8 *bp = dst;
+	const char *base64_table = base64_tables[variant];
 
 	for (i = 0; i < srclen; i++) {
 		const char *p = strchr(base64_table, src[i]);
-
-		if (src[i] == '=') {
-			ac = (ac << 6);
-			bits += 6;
-			if (bits >= 8)
-				bits -= 8;
-			continue;
+		if (padding) {
+			if (src[i] == '=') {
+				ac = (ac << 6);
+				bits += 6;
+				if (bits >= 8)
+					bits -= 8;
+				continue;
+			}
 		}
 		if (p == NULL || src[i] == 0)
 			return -1;
-- 
cgit v1.2.3


From 9031b852c97f1db52180878aed66ca08946eca93 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Tue, 18 Nov 2025 17:32:50 +0000
Subject: uaccess: gate _copy_[to|from]_user on !INLINE_COPY_FROM_USER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These methods only exist when INLINE_COPY_FROM_USER is disabled, so update
the header file to reflect that.

This fixes the following error on builds that enable both RUST and
INLINE_COPY_FROM_USER.

ERROR: modpost: "_copy_from_user" [samples/rust/rust_misc_device.ko] undefined!
ERROR: modpost: "_copy_to_user" [samples/rust/rust_misc_device.ko] undefined!

This error is triggered because when a method is available both as a
rust_helper_* and normal method, Rust will call the normal method.

[akpm@linux-foundation.org: s/INLINE_COPY_FROM_USER/INLINE_COPY_TO_USER/, per Alice]
Link: https://lkml.kernel.org/r/20251118173250.2821388-1-aliceryhl@google.com
Fixes: d99dc586ca7c ("uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST")
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/uaccess.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 01cbd7dd0ba3..5594012160da 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -181,8 +181,10 @@ fail:
 	memset(to + (n - res), 0, res);
 	return res;
 }
+#ifndef INLINE_COPY_FROM_USER
 extern __must_check unsigned long
 _copy_from_user(void *, const void __user *, unsigned long);
+#endif
 
 static inline __must_check unsigned long
 _inline_copy_to_user(void __user *to, const void *from, unsigned long n)
@@ -196,8 +198,10 @@ _inline_copy_to_user(void __user *to, const void *from, unsigned long n)
 	}
 	return n;
 }
+#ifndef INLINE_COPY_TO_USER
 extern __must_check unsigned long
 _copy_to_user(void __user *, const void *, unsigned long);
+#endif
 
 static __always_inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long n)
-- 
cgit v1.2.3


From e0940c672ab4228caa33bcd7cc0ad8017482c2f1 Mon Sep 17 00:00:00 2001
From: Nirbhay Sharma <nirbhay.lkd@gmail.com>
Date: Fri, 21 Nov 2025 02:16:21 +0530
Subject: bpf: Document cfi_stubs and owner fields in struct bpf_struct_ops

Add missing kernel-doc documentation for the cfi_stubs and owner
fields in struct bpf_struct_ops to fix the following warnings:

  Warning: include/linux/bpf.h:1931 struct member 'cfi_stubs' not
  described in 'bpf_struct_ops'
  Warning: include/linux/bpf.h:1931 struct member 'owner' not
  described in 'bpf_struct_ops'

The cfi_stubs field was added in commit 2cd3e3772e41 ("x86/cfi,bpf:
Fix bpf_struct_ops CFI") to provide CFI stub functions for trampolines,
and the owner field is used for module reference counting.

Signed-off-by: Nirbhay Sharma <nirbhay.lkd@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251120204620.59571-2-nirbhay.lkd@gmail.com
---
 include/linux/bpf.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 09d5dc541d1c..30fb40421405 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1922,12 +1922,14 @@ struct btf_member;
  *	      reason, if this callback is not defined, the check is skipped as
  *	      the struct_ops map will have final verification performed in
  *	      @reg.
- * @type: BTF type.
- * @value_type: Value type.
+ * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs
+ *	       provide the correct Control Flow Integrity hashes for the
+ *	       trampolines generated by BPF struct_ops.
+ * @owner: The module that owns this struct_ops. Used for module reference
+ *	   counting to ensure the module providing the struct_ops cannot be
+ *	   unloaded while in use.
  * @name: The name of the struct bpf_struct_ops object.
  * @func_models: Func models
- * @type_id: BTF type id.
- * @value_id: BTF value id.
  */
 struct bpf_struct_ops {
 	const struct bpf_verifier_ops *verifier_ops;
-- 
cgit v1.2.3


From 6d5dea68246ecb190a50a7fecbaf7f8c1ddb15e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 19 Nov 2025 08:48:12 +0000
Subject: tcp: tcp_moderate_rcvbuf is only used in rx path

sysctl_tcp_moderate_rcvbuf is only used from tcp_rcvbuf_grow().

Move it to netns_ipv4_read_rx group.

Remove various CACHELINE_ASSERT_GROUP_SIZE() from netns_ipv4_struct_check(),
as they have no real benefit but cause pain for all changes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251119084813.3684576-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst | 2 +-
 include/net/netns/ipv4.h                                      | 2 +-
 net/core/net_namespace.c                                      | 9 ++-------
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
index 6e7b20afd2d4..5d5d54fb6ab1 100644
--- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
+++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
@@ -102,7 +102,7 @@ u8                              sysctl_tcp_app_win
 u8                              sysctl_tcp_frto                                                                      tcp_enter_loss
 u8                              sysctl_tcp_nometrics_save                                                            TCP_LAST_ACK/tcp_update_metrics
 u8                              sysctl_tcp_no_ssthresh_metrics_save                                                  TCP_LAST_ACK/tcp_(update/init)_metrics
-u8                              sysctl_tcp_moderate_rcvbuf                   read_mostly         read_mostly         tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx)
+u8                              sysctl_tcp_moderate_rcvbuf                                       read_mostly         tcp_rcvbuf_grow()
 u8                              sysctl_tcp_tso_win_divisor                   read_mostly                             tcp_tso_should_defer(tcp_write_xmit)
 u8                              sysctl_tcp_workaround_signed_windows                                                 tcp_select_window
 int                             sysctl_tcp_limit_output_bytes                read_mostly                             tcp_small_queue_check(tcp_write_xmit)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index de9d36acc8e2..11837d3ccc0a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -74,11 +74,11 @@ struct netns_ipv4 {
 
 	/* TXRX readonly hotpath cache lines */
 	__cacheline_group_begin(netns_ipv4_read_txrx);
-	u8 sysctl_tcp_moderate_rcvbuf;
 	__cacheline_group_end(netns_ipv4_read_txrx);
 
 	/* RX readonly hotpath cache line */
 	__cacheline_group_begin(netns_ipv4_read_rx);
+	u8 sysctl_tcp_moderate_rcvbuf;
 	u8 sysctl_ip_early_demux;
 	u8 sysctl_tcp_early_demux;
 	u8 sysctl_tcp_l3mdev_accept;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index adcfef55a66f..c8adbbe01451 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1223,14 +1223,10 @@ static void __init netns_ipv4_struct_check(void)
 				      sysctl_tcp_wmem);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
 				      sysctl_ip_fwd_use_pmtu);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
-
-	/* TXRX readonly hotpath cache lines */
-	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
-				      sysctl_tcp_moderate_rcvbuf);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
 
 	/* RX readonly hotpath cache line */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+				      sysctl_tcp_moderate_rcvbuf);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_ip_early_demux);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
@@ -1241,7 +1237,6 @@ static void __init netns_ipv4_struct_check(void)
 				      sysctl_tcp_reordering);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_rmem);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
 }
 #endif
 
-- 
cgit v1.2.3


From ecfea98b7d0d56c5bf2df3fc02c5501afa5cef6f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 19 Nov 2025 08:48:13 +0000
Subject: tcp: add net.ipv4.tcp_rcvbuf_low_rtt

This is a follow-up to commit aa251c84636c ("tcp: fix too slow
tcp_rcvbuf_grow() action"), which brought back the issue that I tried
to fix in commit 65c5287892e9 ("tcp: fix sk_rcvbuf overshoot").

We also recently increased tcp_rmem[2] to 32 MB in commit 572be9bf9d0d
("tcp: increase tcp_rmem[2] to 32 MB")

The idea of this patch is to not let tcp_rcvbuf_grow() grow sk->sk_rcvbuf
too fast for small RTT flows. If sk->sk_rcvbuf is too big, this can
force NIC driver to not recycle pages from their page pool, and also
can cause cache evictions for DDIO enabled cpus/NIC, as receivers
are usually slower than senders.

Add net.ipv4.tcp_rcvbuf_low_rtt sysctl, set by default to 1000 usec (1 ms)

If RTT is smaller than the sysctl value, use the RTT/tcp_rcvbuf_low_rtt
ratio to control sk_rcvbuf inflation.

Tested:

Pair of hosts with a 200Gbit IDPF NIC. Using netperf/netserver

Client initiates 8 TCP bulk flows, asking netserver to use CPU #10 only.

super_netperf 8 -H server -T,10 -l 30

On server, use perf -e tcp:tcp_rcvbuf_grow while test is running.

Before:

sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1
perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230
 1153.051201: tcp:tcp_rcvbuf_grow: time=398 rtt_us=382 copied=6905856 inq=180224 space=6115328 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1153.138752: tcp:tcp_rcvbuf_grow: time=446 rtt_us=413 copied=5529600 inq=180224 space=4505600 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil
 1153.361484: tcp:tcp_rcvbuf_grow: time=415 rtt_us=380 copied=7061504 inq=204800 space=6725632 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1153.457642: tcp:tcp_rcvbuf_grow: time=483 rtt_us=421 copied=5885952 inq=720896 space=4407296 ooo=0 scaling_ratio=240 rcvbuf=23763511 rcv_ssthresh=22223271 window_clamp=22278291 rcv_wnd=21430272 famil
 1153.466002: tcp:tcp_rcvbuf_grow: time=308 rtt_us=281 copied=3244032 inq=180224 space=2883584 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41713664 famil
 1153.747792: tcp:tcp_rcvbuf_grow: time=394 rtt_us=332 copied=4460544 inq=585728 space=3063808 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41373696 famil
 1154.260747: tcp:tcp_rcvbuf_grow: time=652 rtt_us=226 copied=10977280 inq=737280 space=9486336 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29197743 window_clamp=29217691 rcv_wnd=28368896 fami
 1154.375019: tcp:tcp_rcvbuf_grow: time=461 rtt_us=443 copied=7573504 inq=507904 space=6856704 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25288704 famil
 1154.463072: tcp:tcp_rcvbuf_grow: time=494 rtt_us=408 copied=7983104 inq=200704 space=7065600 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25579520 famil
 1154.474658: tcp:tcp_rcvbuf_grow: time=507 rtt_us=459 copied=5586944 inq=540672 space=4718592 ooo=0 scaling_ratio=240 rcvbuf=17852266 rcv_ssthresh=16692999 window_clamp=16736499 rcv_wnd=16056320 famil
 1154.584657: tcp:tcp_rcvbuf_grow: time=494 rtt_us=427 copied=8126464 inq=204800 space=7782400 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1154.702117: tcp:tcp_rcvbuf_grow: time=480 rtt_us=406 copied=5734400 inq=180224 space=5349376 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil
 1155.941595: tcp:tcp_rcvbuf_grow: time=717 rtt_us=670 copied=11042816 inq=3784704 space=7159808 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=14614528 fam
 1156.384735: tcp:tcp_rcvbuf_grow: time=529 rtt_us=473 copied=9011200 inq=180224 space=7258112 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=18018304 famil
 1157.821676: tcp:tcp_rcvbuf_grow: time=529 rtt_us=272 copied=8224768 inq=602112 space=6545408 ooo=0 scaling_ratio=240 rcvbuf=67000000 rcv_ssthresh=62793576 window_clamp=62812500 rcv_wnd=62115840 famil
 1158.906379: tcp:tcp_rcvbuf_grow: time=710 rtt_us=445 copied=11845632 inq=540672 space=10240000 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29205935 window_clamp=29217691 rcv_wnd=28536832 fam
 1164.600160: tcp:tcp_rcvbuf_grow: time=841 rtt_us=430 copied=12976128 inq=1290240 space=11304960 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29212591 window_clamp=29217691 rcv_wnd=27856896 fa
 1165.163572: tcp:tcp_rcvbuf_grow: time=845 rtt_us=800 copied=12632064 inq=540672 space=7921664 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25912795 window_clamp=25937095 rcv_wnd=25260032 fami
 1165.653464: tcp:tcp_rcvbuf_grow: time=388 rtt_us=309 copied=4493312 inq=180224 space=3874816 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41995899 window_clamp=42050919 rcv_wnd=41713664 famil
 1166.651211: tcp:tcp_rcvbuf_grow: time=556 rtt_us=553 copied=6328320 inq=540672 space=5554176 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=20946944 famil

After:

sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1000
perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230
 1457.053149: tcp:tcp_rcvbuf_grow: time=128 rtt_us=24 copied=1441792 inq=40960 space=1269760 ooo=0 scaling_ratio=240 rcvbuf=2960741 rcv_ssthresh=2605474 window_clamp=2775694 rcv_wnd=2568192 family=AF_I
 1458.000778: tcp:tcp_rcvbuf_grow: time=128 rtt_us=31 copied=1441792 inq=24576 space=1400832 ooo=0 scaling_ratio=240 rcvbuf=3060163 rcv_ssthresh=2810042 window_clamp=2868902 rcv_wnd=2674688 family=AF_I
 1458.088059: tcp:tcp_rcvbuf_grow: time=190 rtt_us=110 copied=3227648 inq=385024 space=2781184 ooo=0 scaling_ratio=240 rcvbuf=6728240 rcv_ssthresh=6252705 window_clamp=6307725 rcv_wnd=5799936 family=AF
 1458.148549: tcp:tcp_rcvbuf_grow: time=232 rtt_us=129 copied=3956736 inq=237568 space=2842624 ooo=0 scaling_ratio=240 rcvbuf=6731333 rcv_ssthresh=6252705 window_clamp=6310624 rcv_wnd=5918720 family=AF
 1458.466861: tcp:tcp_rcvbuf_grow: time=193 rtt_us=83 copied=2949120 inq=180224 space=2457600 ooo=0 scaling_ratio=240 rcvbuf=5751438 rcv_ssthresh=5357689 window_clamp=5391973 rcv_wnd=5054464 family=AF_
 1458.775476: tcp:tcp_rcvbuf_grow: time=257 rtt_us=127 copied=4304896 inq=352256 space=3346432 ooo=0 scaling_ratio=240 rcvbuf=8067131 rcv_ssthresh=7523275 window_clamp=7562935 rcv_wnd=7061504 family=AF
 1458.776631: tcp:tcp_rcvbuf_grow: time=200 rtt_us=96 copied=3260416 inq=143360 space=2768896 ooo=0 scaling_ratio=240 rcvbuf=6397256 rcv_ssthresh=5938567 window_clamp=5997427 rcv_wnd=5828608 family=AF_
 1459.707973: tcp:tcp_rcvbuf_grow: time=215 rtt_us=96 copied=2506752 inq=163840 space=1388544 ooo=0 scaling_ratio=240 rcvbuf=3068867 rcv_ssthresh=2768282 window_clamp=2877062 rcv_wnd=2555904 family=AF_
 1460.246494: tcp:tcp_rcvbuf_grow: time=231 rtt_us=80 copied=3756032 inq=204800 space=3117056 ooo=0 scaling_ratio=240 rcvbuf=7288091 rcv_ssthresh=6773725 window_clamp=6832585 rcv_wnd=6471680 family=AF_
 1460.714596: tcp:tcp_rcvbuf_grow: time=270 rtt_us=110 copied=4714496 inq=311296 space=3719168 ooo=0 scaling_ratio=240 rcvbuf=8957739 rcv_ssthresh=8339020 window_clamp=8397880 rcv_wnd=7933952 family=AF
 1462.029977: tcp:tcp_rcvbuf_grow: time=101 rtt_us=19 copied=1105920 inq=40960 space=1036288 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=1986560 family=AF_I
 1462.802385: tcp:tcp_rcvbuf_grow: time=89 rtt_us=45 copied=1069056 inq=0 space=1064960 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=2035712 family=AF_INET6
 1462.918648: tcp:tcp_rcvbuf_grow: time=105 rtt_us=33 copied=1441792 inq=180224 space=1069056 ooo=0 scaling_ratio=240 rcvbuf=2383282 rcv_ssthresh=2091684 window_clamp=2234326 rcv_wnd=1896448 family=AF_
 1463.222533: tcp:tcp_rcvbuf_grow: time=273 rtt_us=144 copied=4603904 inq=385024 space=3469312 ooo=0 scaling_ratio=240 rcvbuf=8422564 rcv_ssthresh=7891053 window_clamp=7896153 rcv_wnd=7409664 family=AF
 1466.519312: tcp:tcp_rcvbuf_grow: time=130 rtt_us=23 copied=1343488 inq=0 space=1261568 ooo=0 scaling_ratio=240 rcvbuf=2780158 rcv_ssthresh=2493778 window_clamp=2606398 rcv_wnd=2494464 family=AF_INET6
 1466.681003: tcp:tcp_rcvbuf_grow: time=128 rtt_us=21 copied=1441792 inq=12288 space=1343488 ooo=0 scaling_ratio=240 rcvbuf=2932027 rcv_ssthresh=2578555 window_clamp=2748775 rcv_wnd=2568192 family=AF_I
 1470.689959: tcp:tcp_rcvbuf_grow: time=255 rtt_us=122 copied=3932160 inq=204800 space=3551232 ooo=0 scaling_ratio=240 rcvbuf=8182038 rcv_ssthresh=7647384 window_clamp=7670660 rcv_wnd=7442432 family=AF
 1471.754154: tcp:tcp_rcvbuf_grow: time=188 rtt_us=95 copied=2138112 inq=577536 space=1429504 ooo=0 scaling_ratio=240 rcvbuf=3113650 rcv_ssthresh=2806426 window_clamp=2919046 rcv_wnd=2248704 family=AF_
 1476.813542: tcp:tcp_rcvbuf_grow: time=269 rtt_us=99 copied=3088384 inq=180224 space=2564096 ooo=0 scaling_ratio=240 rcvbuf=6219470 rcv_ssthresh=5771893 window_clamp=5830753 rcv_wnd=5509120 family=AF_
 1477.738309: tcp:tcp_rcvbuf_grow: time=166 rtt_us=54 copied=1777664 inq=180224 space=1417216 ooo=0 scaling_ratio=240 rcvbuf=3117118 rcv_ssthresh=2874958 window_clamp=2922298 rcv_wnd=2613248 family=AF_

We can see sk_rcvbuf values are much smaller, and that rtt_us (estimation of rtt
from a receiver point of view) is kept small, instead of being bloated.

No difference in throughput.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/20251119084813.3684576-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst                 | 10 ++++++++++
 .../networking/net_cachelines/netns_ipv4_sysctl.rst    |  1 +
 include/net/netns/ipv4.h                               |  1 +
 net/core/net_namespace.c                               |  2 ++
 net/ipv4/sysctl_net_ipv4.c                             |  9 +++++++++
 net/ipv4/tcp_input.c                                   | 18 ++++++++++++++----
 net/ipv4/tcp_ipv4.c                                    |  1 +
 7 files changed, 38 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index f4ad739a6b53..bc9a01606daf 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -673,6 +673,16 @@ tcp_moderate_rcvbuf - BOOLEAN
 
 	Default: 1 (enabled)
 
+tcp_rcvbuf_low_rtt - INTEGER
+	rcvbuf autotuning can overestimate the final socket rcvbuf, which
+	can lead to cache thrashing for high throughput flows.
+
+	For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax
+	rcvbuf growth: Few additional ms to reach the final (and smaller)
+	rcvbuf is a good tradeoff.
+
+	Default: 1000 (1 ms)
+
 tcp_mtu_probing - INTEGER
 	Controls TCP Packetization-Layer Path MTU Discovery.  Takes three
 	values:
diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
index 5d5d54fb6ab1..beaf1880a19b 100644
--- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
+++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
@@ -103,6 +103,7 @@ u8                              sysctl_tcp_frto
 u8                              sysctl_tcp_nometrics_save                                                            TCP_LAST_ACK/tcp_update_metrics
 u8                              sysctl_tcp_no_ssthresh_metrics_save                                                  TCP_LAST_ACK/tcp_(update/init)_metrics
 u8                              sysctl_tcp_moderate_rcvbuf                                       read_mostly         tcp_rcvbuf_grow()
+int                             sysctl_tcp_rcvbuf_low_rtt                                        read_mostly         tcp_rcvbuf_grow()
 u8                              sysctl_tcp_tso_win_divisor                   read_mostly                             tcp_tso_should_defer(tcp_write_xmit)
 u8                              sysctl_tcp_workaround_signed_windows                                                 tcp_select_window
 int                             sysctl_tcp_limit_output_bytes                read_mostly                             tcp_small_queue_check(tcp_write_xmit)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 11837d3ccc0a..2dbd46fc4734 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -85,6 +85,7 @@ struct netns_ipv4 {
 	/* 3 bytes hole, try to pack */
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_rmem[3];
+	int sysctl_tcp_rcvbuf_low_rtt;
 	__cacheline_group_end(netns_ipv4_read_rx);
 
 	struct inet_timewait_death_row tcp_death_row;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c8adbbe01451..dfad7c03b809 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1227,6 +1227,8 @@ static void __init netns_ipv4_struct_check(void)
 	/* RX readonly hotpath cache line */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_moderate_rcvbuf);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+				      sysctl_tcp_rcvbuf_low_rtt);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_ip_early_demux);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 35367f8e2da3..a1a50a5c80dc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1342,6 +1342,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dou8vec_minmax,
 	},
+	{
+		.procname	= "tcp_rcvbuf_low_rtt",
+		.data		= &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
 	{
 		.procname	= "tcp_tso_win_divisor",
 		.data		= &init_net.ipv4.sysctl_tcp_tso_win_divisor,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9df5d7515605..198f8a0d37be 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -896,6 +896,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
 	const struct net *net = sock_net(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 rcvwin, rcvbuf, cap, oldval;
+	u32 rtt_threshold, rtt_us;
 	u64 grow;
 
 	oldval = tp->rcvq_space.space;
@@ -908,10 +909,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
 	/* DRS is always one RTT late. */
 	rcvwin = newval << 1;
 
-	/* slow start: allow the sender to double its rate. */
-	grow = (u64)rcvwin * (newval - oldval);
-	do_div(grow, oldval);
-	rcvwin += grow << 1;
+	rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+	rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
+	if (rtt_us < rtt_threshold) {
+		/* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
+		 * It might take a few additional ms to reach 'line rate',
+		 * but will avoid sk_rcvbuf inflation and poor cache use.
+		 */
+		grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
+	} else {
+		/* slow start: allow the sender to double its rate. */
+		grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
+	}
+	rcvwin += grow;
 
 	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6fcaecb67284..e0bb8d9e2d9c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3566,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
 	net->ipv4.sysctl_tcp_frto = 2;
 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
 	/* This limits the percentage of the congestion window which we
 	 * will allow a single TSO frame to consume.  Building TSO frames
 	 * which are too large can cause TCP streams to be bursty.
-- 
cgit v1.2.3


From 85081acc6b1188f2a6e5e605dc644225fcdf327f Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 19 Nov 2025 10:03:50 +0000
Subject: net: stmmac: pass struct device to init()/exit() methods

As struct plat_stmmacenet_data is not platform_device specific, pass
a struct device into the init() and exit() methods to allow them to
become independent of the underlying device.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Chen-Yu Tsai <wens@kernel.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLf2U-0000000FMN2-0SLg@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/stmicro/stmmac/dwmac-anarion.c    |  4 ++--
 .../net/ethernet/stmicro/stmmac/dwmac-eic7700.c    |  4 ++--
 .../net/ethernet/stmicro/stmmac/dwmac-loongson1.c  | 12 +++++-----
 .../ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c  |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c    | 14 +++++------
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c    |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c  | 10 ++++----
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c  |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 28 ++++++++++++----------
 include/linux/stmmac.h                             |  4 ++--
 12 files changed, 47 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
index 84072c8ed741..5e0fc31762d9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
@@ -34,7 +34,7 @@ static void gmac_write_reg(struct anarion_gmac *gmac, uint8_t reg, uint32_t val)
 	writel(val, gmac->ctl_block + reg);
 }
 
-static int anarion_gmac_init(struct platform_device *pdev, void *priv)
+static int anarion_gmac_init(struct device *dev, void *priv)
 {
 	uint32_t sw_config;
 	struct anarion_gmac *gmac = priv;
@@ -52,7 +52,7 @@ static int anarion_gmac_init(struct platform_device *pdev, void *priv)
 	return 0;
 }
 
-static void anarion_gmac_exit(struct platform_device *pdev, void *priv)
+static void anarion_gmac_exit(struct device *dev, void *priv)
 {
 	struct anarion_gmac *gmac = priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-eic7700.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-eic7700.c
index 1dcf2037001e..bcb8e000e720 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-eic7700.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-eic7700.c
@@ -58,14 +58,14 @@ static int eic7700_clks_config(void *priv, bool enabled)
 	return ret;
 }
 
-static int eic7700_dwmac_init(struct platform_device *pdev, void *priv)
+static int eic7700_dwmac_init(struct device *dev, void *priv)
 {
 	struct eic7700_qos_priv *dwc = priv;
 
 	return eic7700_clks_config(dwc, true);
 }
 
-static void eic7700_dwmac_exit(struct platform_device *pdev, void *priv)
+static void eic7700_dwmac_exit(struct device *dev, void *priv)
 {
 	struct eic7700_qos_priv *dwc = priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson1.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson1.c
index 894ee66f5c9b..de9aba756aac 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson1.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson1.c
@@ -48,7 +48,7 @@ struct ls1x_dwmac {
 struct ls1x_data {
 	int (*setup)(struct platform_device *pdev,
 		     struct plat_stmmacenet_data *plat_dat);
-	int (*init)(struct platform_device *pdev, void *bsp_priv);
+	int (*init)(struct device *dev, void *bsp_priv);
 };
 
 static int ls1b_dwmac_setup(struct platform_device *pdev,
@@ -79,7 +79,7 @@ static int ls1b_dwmac_setup(struct platform_device *pdev,
 	return 0;
 }
 
-static int ls1b_dwmac_syscon_init(struct platform_device *pdev, void *priv)
+static int ls1b_dwmac_syscon_init(struct device *dev, void *priv)
 {
 	struct ls1x_dwmac *dwmac = priv;
 	struct plat_stmmacenet_data *plat = dwmac->plat_dat;
@@ -98,7 +98,7 @@ static int ls1b_dwmac_syscon_init(struct platform_device *pdev, void *priv)
 					   GMAC0_USE_TXCLK | GMAC0_USE_PWM01);
 			break;
 		default:
-			dev_err(&pdev->dev, "Unsupported PHY mode %u\n",
+			dev_err(dev, "Unsupported PHY mode %u\n",
 				plat->phy_interface);
 			return -EOPNOTSUPP;
 		}
@@ -122,7 +122,7 @@ static int ls1b_dwmac_syscon_init(struct platform_device *pdev, void *priv)
 					   GMAC1_USE_TXCLK | GMAC1_USE_PWM23);
 			break;
 		default:
-			dev_err(&pdev->dev, "Unsupported PHY mode %u\n",
+			dev_err(dev, "Unsupported PHY mode %u\n",
 				plat->phy_interface);
 			return -EOPNOTSUPP;
 		}
@@ -133,7 +133,7 @@ static int ls1b_dwmac_syscon_init(struct platform_device *pdev, void *priv)
 	return 0;
 }
 
-static int ls1c_dwmac_syscon_init(struct platform_device *pdev, void *priv)
+static int ls1c_dwmac_syscon_init(struct device *dev, void *priv)
 {
 	struct ls1x_dwmac *dwmac = priv;
 	struct plat_stmmacenet_data *plat = dwmac->plat_dat;
@@ -143,7 +143,7 @@ static int ls1c_dwmac_syscon_init(struct platform_device *pdev, void *priv)
 	phy_intf_sel = stmmac_get_phy_intf_sel(plat->phy_interface);
 	if (phy_intf_sel != PHY_INTF_SEL_GMII_MII &&
 	    phy_intf_sel != PHY_INTF_SEL_RMII) {
-		dev_err(&pdev->dev, "Unsupported PHY-mode %u\n",
+		dev_err(dev, "Unsupported PHY-mode %u\n",
 			plat->phy_interface);
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c
index bc7bb975803c..be7f5eb2cdcf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c
@@ -91,7 +91,7 @@ static struct phylink_pcs *renesas_gmac_select_pcs(struct stmmac_priv *priv,
 	return priv->hw->phylink_pcs;
 }
 
-static int renesas_gbeth_init(struct platform_device *pdev, void *priv)
+static int renesas_gbeth_init(struct device *dev, void *priv)
 {
 	struct plat_stmmacenet_data *plat_dat;
 	struct renesas_gbeth *gbeth = priv;
@@ -113,7 +113,7 @@ static int renesas_gbeth_init(struct platform_device *pdev, void *priv)
 	return ret;
 }
 
-static void renesas_gbeth_exit(struct platform_device *pdev, void *priv)
+static void renesas_gbeth_exit(struct device *dev, void *priv)
 {
 	struct plat_stmmacenet_data *plat_dat;
 	struct renesas_gbeth *gbeth = priv;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
index 2b7ad64bfdf7..5a485ee98fa7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c
@@ -47,7 +47,7 @@ static int s32_gmac_write_phy_intf_select(struct s32_priv_data *gmac)
 	return 0;
 }
 
-static int s32_gmac_init(struct platform_device *pdev, void *priv)
+static int s32_gmac_init(struct device *dev, void *priv)
 {
 	struct s32_priv_data *gmac = priv;
 	int ret;
@@ -55,31 +55,31 @@ static int s32_gmac_init(struct platform_device *pdev, void *priv)
 	/* Set initial TX interface clock */
 	ret = clk_prepare_enable(gmac->tx_clk);
 	if (ret) {
-		dev_err(&pdev->dev, "Can't enable tx clock\n");
+		dev_err(dev, "Can't enable tx clock\n");
 		return ret;
 	}
 	ret = clk_set_rate(gmac->tx_clk, GMAC_INTF_RATE_125M);
 	if (ret) {
-		dev_err(&pdev->dev, "Can't set tx clock\n");
+		dev_err(dev, "Can't set tx clock\n");
 		goto err_tx_disable;
 	}
 
 	/* Set initial RX interface clock */
 	ret = clk_prepare_enable(gmac->rx_clk);
 	if (ret) {
-		dev_err(&pdev->dev, "Can't enable rx clock\n");
+		dev_err(dev, "Can't enable rx clock\n");
 		goto err_tx_disable;
 	}
 	ret = clk_set_rate(gmac->rx_clk, GMAC_INTF_RATE_125M);
 	if (ret) {
-		dev_err(&pdev->dev, "Can't set rx clock\n");
+		dev_err(dev, "Can't set rx clock\n");
 		goto err_txrx_disable;
 	}
 
 	/* Set interface mode */
 	ret = s32_gmac_write_phy_intf_select(gmac);
 	if (ret) {
-		dev_err(&pdev->dev, "Can't set PHY interface mode\n");
+		dev_err(dev, "Can't set PHY interface mode\n");
 		goto err_txrx_disable;
 	}
 
@@ -92,7 +92,7 @@ err_tx_disable:
 	return ret;
 }
 
-static void s32_gmac_exit(struct platform_device *pdev, void *priv)
+static void s32_gmac_exit(struct device *dev, void *priv)
 {
 	struct s32_priv_data *gmac = priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 49d651948e2b..a2b52d2c4eb6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -551,7 +551,7 @@ static struct phylink_pcs *socfpga_dwmac_select_pcs(struct stmmac_priv *priv,
 	return priv->hw->phylink_pcs;
 }
 
-static int socfpga_dwmac_init(struct platform_device *pdev, void *bsp_priv)
+static int socfpga_dwmac_init(struct device *dev, void *bsp_priv)
 {
 	struct socfpga_dwmac *dwmac = bsp_priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index b0509ab6b31c..f50547b67fbc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -229,14 +229,14 @@ static int sti_dwmac_parse_data(struct sti_dwmac *dwmac,
 	return 0;
 }
 
-static int sti_dwmac_init(struct platform_device *pdev, void *bsp_priv)
+static int sti_dwmac_init(struct device *dev, void *bsp_priv)
 {
 	struct sti_dwmac *dwmac = bsp_priv;
 
 	return clk_prepare_enable(dwmac->clk);
 }
 
-static void sti_dwmac_exit(struct platform_device *pdev, void *bsp_priv)
+static void sti_dwmac_exit(struct device *dev, void *bsp_priv)
 {
 	struct sti_dwmac *dwmac = bsp_priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 7434d4bbb526..8aa496ac85cc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -571,16 +571,16 @@ static const struct stmmac_dma_ops sun8i_dwmac_dma_ops = {
 
 static int sun8i_dwmac_power_internal_phy(struct stmmac_priv *priv);
 
-static int sun8i_dwmac_init(struct platform_device *pdev, void *priv)
+static int sun8i_dwmac_init(struct device *dev, void *priv)
 {
-	struct net_device *ndev = platform_get_drvdata(pdev);
+	struct net_device *ndev = dev_get_drvdata(dev);
 	struct sunxi_priv_data *gmac = priv;
 	int ret;
 
 	if (gmac->regulator) {
 		ret = regulator_enable(gmac->regulator);
 		if (ret) {
-			dev_err(&pdev->dev, "Fail to enable regulator\n");
+			dev_err(dev, "Fail to enable regulator\n");
 			return ret;
 		}
 	}
@@ -1005,7 +1005,7 @@ static void sun8i_dwmac_unset_syscon(struct sunxi_priv_data *gmac)
 				   (H3_EPHY_SHUTDOWN | H3_EPHY_SELECT));
 }
 
-static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv)
+static void sun8i_dwmac_exit(struct device *dev, void *priv)
 {
 	struct sunxi_priv_data *gmac = priv;
 
@@ -1265,7 +1265,7 @@ static void sun8i_dwmac_shutdown(struct platform_device *pdev)
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
 
-	sun8i_dwmac_exit(pdev, gmac);
+	sun8i_dwmac_exit(&pdev->dev, gmac);
 }
 
 static const struct of_device_id sun8i_dwmac_match[] = {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index 7f560d78209d..52593ba3a3a3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -27,7 +27,7 @@ struct sunxi_priv_data {
 #define SUN7I_GMAC_GMII_RGMII_RATE	125000000
 #define SUN7I_GMAC_MII_RATE		25000000
 
-static int sun7i_gmac_init(struct platform_device *pdev, void *priv)
+static int sun7i_gmac_init(struct device *dev, void *priv)
 {
 	struct sunxi_priv_data *gmac = priv;
 	int ret = 0;
@@ -58,7 +58,7 @@ static int sun7i_gmac_init(struct platform_device *pdev, void *priv)
 	return ret;
 }
 
-static void sun7i_gmac_exit(struct platform_device *pdev, void *priv)
+static void sun7i_gmac_exit(struct device *dev, void *priv)
 {
 	struct sunxi_priv_data *gmac = priv;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
index a3378046b061..e291028ba56e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
@@ -186,7 +186,7 @@ static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat)
 	return 0;
 }
 
-static int thead_dwmac_init(struct platform_device *pdev, void *priv)
+static int thead_dwmac_init(struct device *dev, void *priv)
 {
 	struct thead_dwmac *dwmac = priv;
 	unsigned int reg;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 1fefa6c55db1..feccb8a3e7e8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -747,40 +747,40 @@ EXPORT_SYMBOL_GPL(stmmac_get_platform_resources);
 
 /**
  * stmmac_pltfr_init
- * @pdev: pointer to the platform device
+ * @dev: pointer to the device structure
  * @plat: driver data platform structure
  * Description: Call the platform's init callback (if any) and propagate
  * the return value.
  */
-static int stmmac_pltfr_init(struct platform_device *pdev,
+static int stmmac_pltfr_init(struct device *dev,
 			     struct plat_stmmacenet_data *plat)
 {
 	int ret = 0;
 
 	if (plat->init)
-		ret = plat->init(pdev, plat->bsp_priv);
+		ret = plat->init(dev, plat->bsp_priv);
 
 	return ret;
 }
 
 /**
  * stmmac_pltfr_exit
- * @pdev: pointer to the platform device
+ * @dev: pointer to the device structure
  * @plat: driver data platform structure
  * Description: Call the platform's exit callback (if any).
  */
-static void stmmac_pltfr_exit(struct platform_device *pdev,
+static void stmmac_pltfr_exit(struct device *dev,
 			      struct plat_stmmacenet_data *plat)
 {
 	if (plat->exit)
-		plat->exit(pdev, plat->bsp_priv);
+		plat->exit(dev, plat->bsp_priv);
 }
 
 static int stmmac_plat_suspend(struct device *dev, void *bsp_priv)
 {
 	struct stmmac_priv *priv = netdev_priv(dev_get_drvdata(dev));
 
-	stmmac_pltfr_exit(to_platform_device(dev), priv->plat);
+	stmmac_pltfr_exit(dev, priv->plat);
 
 	return 0;
 }
@@ -789,7 +789,7 @@ static int stmmac_plat_resume(struct device *dev, void *bsp_priv)
 {
 	struct stmmac_priv *priv = netdev_priv(dev_get_drvdata(dev));
 
-	return stmmac_pltfr_init(to_platform_device(dev), priv->plat);
+	return stmmac_pltfr_init(dev, priv->plat);
 }
 
 /**
@@ -804,6 +804,7 @@ int stmmac_pltfr_probe(struct platform_device *pdev,
 		       struct plat_stmmacenet_data *plat,
 		       struct stmmac_resources *res)
 {
+	struct device *dev = &pdev->dev;
 	int ret;
 
 	if (!plat->suspend && plat->exit)
@@ -811,13 +812,13 @@ int stmmac_pltfr_probe(struct platform_device *pdev,
 	if (!plat->resume && plat->init)
 		plat->resume = stmmac_plat_resume;
 
-	ret = stmmac_pltfr_init(pdev, plat);
+	ret = stmmac_pltfr_init(dev, plat);
 	if (ret)
 		return ret;
 
-	ret = stmmac_dvr_probe(&pdev->dev, plat, res);
+	ret = stmmac_dvr_probe(dev, plat, res);
 	if (ret) {
-		stmmac_pltfr_exit(pdev, plat);
+		stmmac_pltfr_exit(dev, plat);
 		return ret;
 	}
 
@@ -866,9 +867,10 @@ void stmmac_pltfr_remove(struct platform_device *pdev)
 	struct net_device *ndev = platform_get_drvdata(pdev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	struct plat_stmmacenet_data *plat = priv->plat;
+	struct device *dev = &pdev->dev;
 
-	stmmac_dvr_remove(&pdev->dev);
-	stmmac_pltfr_exit(pdev, plat);
+	stmmac_dvr_remove(dev);
+	stmmac_pltfr_exit(dev, plat);
 }
 EXPORT_SYMBOL_GPL(stmmac_pltfr_remove);
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 4f70a6551e68..673b068fdadf 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -264,8 +264,8 @@ struct plat_stmmacenet_data {
 			  unsigned int mode,
 			  phy_interface_t interface);
 	void (*ptp_clk_freq_config)(struct stmmac_priv *priv);
-	int (*init)(struct platform_device *pdev, void *priv);
-	void (*exit)(struct platform_device *pdev, void *priv);
+	int (*init)(struct device *dev, void *priv);
+	void (*exit)(struct device *dev, void *priv);
 	int (*suspend)(struct device *dev, void *priv);
 	int (*resume)(struct device *dev, void *priv);
 	int (*mac_setup)(void *priv, struct mac_device_info *mac);
-- 
cgit v1.2.3


From 6ff3310ca28298e363c78143b6a2f20312421f4e Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 19 Nov 2025 10:23:30 +0000
Subject: net: stmmac: move stmmac_axi_blen_to_mask() to stmmac_main.c

Move the call to stmmac_axi_blen_to_mask() out of the individual
MAC version drivers into the main code in stmmac_init_dma_engine(),
passing the resulting value through a new member, axi_blen_regval,
in the struct stmmac_axi structure.

There is now no need for stmmac_axi_blen_to_mask() to use
u32p_replace_bits(), so use FIELD_PREP() instead.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLW-0000000FMb1-0zKV@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c | 3 +--
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c    | 3 +--
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c  | 3 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c   | 8 ++++++--
 include/linux/stmmac.h                              | 1 +
 5 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index b6476a1bfeab..6d9b8fac3c6d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -41,8 +41,7 @@ static void dwmac1000_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	 * set). Note that the UNDEF bit is readonly, and is the inverse of
 	 * Bus Mode bit 16.
 	 */
-	stmmac_axi_blen_to_mask(&value, axi->axi_blen,
-				ARRAY_SIZE(axi->axi_blen));
+	value = (value & ~DMA_AXI_BLEN_MASK) | axi->axi_blen_regval;
 
 	writel(value, ioaddr + DMA_AXI_BUS_MODE);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 90d03c7b29f4..7b513324cfb0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -40,8 +40,7 @@ static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	 * set). Note that the UNDEF bit is readonly, and is the inverse of
 	 * Bus Mode bit 16.
 	 */
-	stmmac_axi_blen_to_mask(&value, axi->axi_blen,
-				ARRAY_SIZE(axi->axi_blen));
+	value = (value & ~DMA_AXI_BLEN_MASK) | axi->axi_blen_regval;
 
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
index 8a2cb6ca9588..cc1bdc0975d5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
@@ -106,8 +106,7 @@ static void dwxgmac2_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	 * set). Note that the UNDEF bit is readonly, and is the inverse of
 	 * Bus Mode bit 16.
 	 */
-	stmmac_axi_blen_to_mask(&value, axi->axi_blen,
-				ARRAY_SIZE(axi->axi_blen));
+	value = (value & ~DMA_AXI_BLEN_MASK) | axi->axi_blen_regval;
 
 	writel(value, ioaddr + XGMAC_DMA_SYSBUS_MODE);
 	writel(XGMAC_TDPS, ioaddr + XGMAC_TX_EDMA_CTRL);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 0b1e571f70f0..aac82ddfb8c0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -223,7 +223,7 @@ void stmmac_axi_blen_to_mask(u32 *regval, const u32 *blen, size_t len)
 		val |= burst >> 2;
 	}
 
-	u32p_replace_bits(regval, val, DMA_AXI_BLEN_MASK);
+	*regval = FIELD_PREP(DMA_AXI_BLEN_MASK, val);
 }
 
 /**
@@ -3212,8 +3212,12 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	/* DMA Configuration */
 	stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg);
 
-	if (priv->plat->axi)
+	if (priv->plat->axi) {
+		/* Encode the AXI burst length to a register value */
+		stmmac_axi_blen_to_mask(&priv->plat->axi->axi_blen_regval,
+					priv->plat->axi->axi_blen, AXI_BLEN);
 		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
+	}
 
 	/* DMA CSR Channel configuration */
 	for (chan = 0; chan < dma_csr_ch; chan++) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 673b068fdadf..d1a41fe0825f 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -113,6 +113,7 @@ struct stmmac_axi {
 	u32 axi_wr_osr_lmt;
 	u32 axi_rd_osr_lmt;
 	bool axi_kbbe;
+	u32 axi_blen_regval;
 	u32 axi_blen[AXI_BLEN];
 	bool axi_fb;
 	bool axi_mb;
-- 
cgit v1.2.3


From efd3c8cc52bb9583183ebb83c8c55b23bf97cb2f Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 19 Nov 2025 10:23:40 +0000
Subject: net: stmmac: remove axi_blen array

Remove the axi_blen array from struct stmmac_axi as we set this array,
and then immediately convert it to the register value, never looking at
the array again. Thus, the array can be function local rather than part
of a run-time allocated long-lived struct.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLg-0000000FMbD-1vmh@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 11 ++---------
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c       |  3 ---
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c        |  4 ----
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c   |  5 +++--
 include/linux/stmmac.h                                  |  1 -
 5 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
index bd06f26a27b4..d043bad4a862 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
@@ -38,8 +38,6 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev,
 {
 	struct device *dev = &pdev->dev;
 	u32 burst_map = 0;
-	u32 bit_index = 0;
-	u32 a_index = 0;
 
 	if (!plat_dat->axi) {
 		plat_dat->axi = devm_kzalloc(&pdev->dev,
@@ -83,13 +81,8 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev,
 	}
 	device_property_read_u32(dev, "snps,burst-map", &burst_map);
 
-	/* converts burst-map bitmask to burst array */
-	for (bit_index = 0; bit_index < 7; bit_index++)
-		if (burst_map & (1 << bit_index))
-			plat_dat->axi->axi_blen[a_index++] = 4 << bit_index;
-
-	stmmac_axi_blen_to_mask(&plat_dat->axi->axi_blen_regval,
-				plat_dat->axi->axi_blen, a_index);
+	plat_dat->axi->axi_blen_regval = FIELD_PREP(DMA_AXI_BLEN_MASK,
+						    burst_map);
 
 	/* dwc-qos needs GMAC4, AAL, TSO and PMT */
 	plat_dat->core_type = DWMAC_CORE_GMAC4;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index e94605d3d185..aad1be1ec4c1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -652,9 +652,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->axi->axi_rd_osr_lmt = 1;
 	plat->axi->axi_blen_regval = DMA_AXI_BLEN4 | DMA_AXI_BLEN8 |
 				     DMA_AXI_BLEN16;
-	plat->axi->axi_blen[0] = 4;
-	plat->axi->axi_blen[1] = 8;
-	plat->axi->axi_blen[2] = 16;
 
 	plat->ptp_max_adj = plat->clk_ptp_rate;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index e1036150fae2..afb1c53ca6f8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -94,10 +94,6 @@ static int snps_gmac5_default_data(struct pci_dev *pdev,
 	plat->axi->axi_fb = false;
 	plat->axi->axi_blen_regval = DMA_AXI_BLEN4 | DMA_AXI_BLEN8 |
 				     DMA_AXI_BLEN16 | DMA_AXI_BLEN32;
-	plat->axi->axi_blen[0] = 4;
-	plat->axi->axi_blen[1] = 8;
-	plat->axi->axi_blen[2] = 16;
-	plat->axi->axi_blen[3] = 32;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 656d4adedabe..8979a50b5507 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -95,6 +95,7 @@ static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev)
 {
 	struct device_node *np;
 	struct stmmac_axi *axi;
+	u32 axi_blen[AXI_BLEN];
 
 	np = of_parse_phandle(pdev->dev.of_node, "snps,axi-config", 0);
 	if (!np)
@@ -117,8 +118,8 @@ static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev)
 		axi->axi_wr_osr_lmt = 1;
 	if (of_property_read_u32(np, "snps,rd_osr_lmt", &axi->axi_rd_osr_lmt))
 		axi->axi_rd_osr_lmt = 1;
-	of_property_read_u32_array(np, "snps,blen", axi->axi_blen, AXI_BLEN);
-	stmmac_axi_blen_to_mask(&axi->axi_blen_regval, axi->axi_blen, AXI_BLEN);
+	of_property_read_u32_array(np, "snps,blen", axi_blen, AXI_BLEN);
+	stmmac_axi_blen_to_mask(&axi->axi_blen_regval, axi_blen, AXI_BLEN);
 	of_node_put(np);
 
 	return axi;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index d1a41fe0825f..f1054b9c2d8a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -114,7 +114,6 @@ struct stmmac_axi {
 	u32 axi_rd_osr_lmt;
 	bool axi_kbbe;
 	u32 axi_blen_regval;
-	u32 axi_blen[AXI_BLEN];
 	bool axi_fb;
 	bool axi_mb;
 	bool axi_rb;
-- 
cgit v1.2.3


From 491c5dc98b848c4781addd514caed95039e5366c Mon Sep 17 00:00:00 2001
From: Yael Chemla <ychemla@nvidia.com>
Date: Wed, 19 Nov 2025 22:48:15 +0200
Subject: net: ethtool: Add support for 1600Gbps speed

Add support for 1600Gbps link modes based on 200Gbps per lane [1].
This includes the adopted IEEE 802.3dj copper and optical PMDs that use
200G/lane signaling [2].

Add the following PMD types:
- KR8 (backplane)
- CR8 (copper cable)
- DR8 (SMF 500m)
- DR8-2 (SMF 2km)

These modes are defined in the 802.3dj specifications.
References:
[1] https://www.ieee802.org/3/dj/public/23_03/opsasnick_3dj_01a_2303.pdf
[2] https://www.ieee802.org/3/dj/projdoc/objectives_P802d3dj_240314.pdf

Signed-off-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/1763585297-1243980-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-caps.h   | 1 +
 drivers/net/phy/phy-core.c   | 4 +++-
 drivers/net/phy/phy_caps.c   | 2 ++
 include/uapi/linux/ethtool.h | 5 +++++
 net/ethtool/common.c         | 8 ++++++++
 5 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-caps.h b/drivers/net/phy/phy-caps.h
index b7f0c6a3037a..4951a39f3828 100644
--- a/drivers/net/phy/phy-caps.h
+++ b/drivers/net/phy/phy-caps.h
@@ -29,6 +29,7 @@ enum {
 	LINK_CAPA_200000FD,
 	LINK_CAPA_400000FD,
 	LINK_CAPA_800000FD,
+	LINK_CAPA_1600000FD,
 
 	__LINK_CAPA_MAX,
 };
diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 0c63e6ba2cb0..277c034bc32f 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -17,7 +17,7 @@
  */
 const char *phy_speed_to_str(int speed)
 {
-	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 121,
+	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 125,
 		"Enum ethtool_link_mode_bit_indices and phylib are out of sync. "
 		"If a speed or mode has been added please update phy_speed_to_str "
 		"and the PHY settings array.\n");
@@ -55,6 +55,8 @@ const char *phy_speed_to_str(int speed)
 		return "400Gbps";
 	case SPEED_800000:
 		return "800Gbps";
+	case SPEED_1600000:
+		return "1600Gbps";
 	case SPEED_UNKNOWN:
 		return "Unknown";
 	default:
diff --git a/drivers/net/phy/phy_caps.c b/drivers/net/phy/phy_caps.c
index 23c808b59b6f..3a05982b39bf 100644
--- a/drivers/net/phy/phy_caps.c
+++ b/drivers/net/phy/phy_caps.c
@@ -25,6 +25,7 @@ static struct link_capabilities link_caps[__LINK_CAPA_MAX] __ro_after_init = {
 	{ SPEED_200000, DUPLEX_FULL, {0} }, /* LINK_CAPA_200000FD */
 	{ SPEED_400000, DUPLEX_FULL, {0} }, /* LINK_CAPA_400000FD */
 	{ SPEED_800000, DUPLEX_FULL, {0} }, /* LINK_CAPA_800000FD */
+	{ SPEED_1600000, DUPLEX_FULL, {0} }, /* LINK_CAPA_1600000FD */
 };
 
 static int speed_duplex_to_capa(int speed, unsigned int duplex)
@@ -52,6 +53,7 @@ static int speed_duplex_to_capa(int speed, unsigned int duplex)
 	case SPEED_200000: return LINK_CAPA_200000FD;
 	case SPEED_400000: return LINK_CAPA_400000FD;
 	case SPEED_800000: return LINK_CAPA_800000FD;
+	case SPEED_1600000: return LINK_CAPA_1600000FD;
 	}
 
 	return -EINVAL;
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 8bd5ea5469d9..eb7ff2602fbb 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2077,6 +2077,10 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT	 = 118,
 	ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT	 = 119,
 	ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT	 = 120,
+	ETHTOOL_LINK_MODE_1600000baseCR8_Full_BIT	 = 121,
+	ETHTOOL_LINK_MODE_1600000baseKR8_Full_BIT	 = 122,
+	ETHTOOL_LINK_MODE_1600000baseDR8_Full_BIT	 = 123,
+	ETHTOOL_LINK_MODE_1600000baseDR8_2_Full_BIT	 = 124,
 
 	/* must be last entry */
 	__ETHTOOL_LINK_MODE_MASK_NBITS
@@ -2190,6 +2194,7 @@ enum ethtool_link_mode_bit_indices {
 #define SPEED_200000		200000
 #define SPEED_400000		400000
 #define SPEED_800000		800000
+#define SPEED_1600000		1600000
 
 #define SPEED_UNKNOWN		-1
 
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 55223ebc2a7e..369c05cf8163 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -233,6 +233,10 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
 	__DEFINE_LINK_MODE_NAME(800000, DR4_2, Full),
 	__DEFINE_LINK_MODE_NAME(800000, SR4, Full),
 	__DEFINE_LINK_MODE_NAME(800000, VR4, Full),
+	__DEFINE_LINK_MODE_NAME(1600000, CR8, Full),
+	__DEFINE_LINK_MODE_NAME(1600000, KR8, Full),
+	__DEFINE_LINK_MODE_NAME(1600000, DR8, Full),
+	__DEFINE_LINK_MODE_NAME(1600000, DR8_2, Full),
 };
 static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
 
@@ -422,6 +426,10 @@ const struct link_mode_info link_mode_params[] = {
 	__DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full),
 	__DEFINE_LINK_MODE_PARAMS(800000, SR4, Full),
 	__DEFINE_LINK_MODE_PARAMS(800000, VR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full),
 };
 static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
 EXPORT_SYMBOL_GPL(link_mode_params);
-- 
cgit v1.2.3


From d10f26a7abbd3dd5d59bac1acdca117385b54ea9 Mon Sep 17 00:00:00 2001
From: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Date: Fri, 14 Nov 2025 15:53:58 +0900
Subject: dt-bindings: clock: tmpv770x: Remove definition of number of clocks

Remove the definitions of number of clocks from bindings because they
prevent adding new clocks. Since the previous patch removed all references
within the driver, they can now be deleted.

The same for resets and plls.

Signed-off-by: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/toshiba,tmpv770x.h | 3 ---
 include/dt-bindings/reset/toshiba,tmpv770x.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/toshiba,tmpv770x.h b/include/dt-bindings/clock/toshiba,tmpv770x.h
index 5fce713001fd..89189c4f6a52 100644
--- a/include/dt-bindings/clock/toshiba,tmpv770x.h
+++ b/include/dt-bindings/clock/toshiba,tmpv770x.h
@@ -11,7 +11,6 @@
 #define TMPV770X_PLL_PIDDRCPLL		4
 #define TMPV770X_PLL_PIVOIFPLL		5
 #define TMPV770X_PLL_PIIMGERPLL		6
-#define TMPV770X_NR_PLL		7
 
 /* Clocks */
 #define TMPV770X_CLK_PIPLL1_DIV1	0
@@ -141,7 +140,6 @@
 #define TMPV770X_CLK_PIREFCLK		124
 #define TMPV770X_CLK_SBUS		125
 #define TMPV770X_CLK_BUSLCK		126
-#define TMPV770X_NR_CLK			127
 
 /* Reset */
 #define TMPV770X_RESET_PIETHER_2P5M	0
@@ -176,6 +174,5 @@
 #define TMPV770X_RESET_PIPCMIF		29
 #define TMPV770X_RESET_PICKMON		30
 #define TMPV770X_RESET_SBUSCLK		31
-#define TMPV770X_NR_RESET		32
 
 #endif /*_DT_BINDINGS_CLOCK_TOSHIBA_TMPV770X_H_ */
diff --git a/include/dt-bindings/reset/toshiba,tmpv770x.h b/include/dt-bindings/reset/toshiba,tmpv770x.h
index c1007acb1941..bedfe253fa36 100644
--- a/include/dt-bindings/reset/toshiba,tmpv770x.h
+++ b/include/dt-bindings/reset/toshiba,tmpv770x.h
@@ -36,6 +36,5 @@
 #define TMPV770X_RESET_PIPCMIF		29
 #define TMPV770X_RESET_PICKMON		30
 #define TMPV770X_RESET_SBUSCLK		31
-#define TMPV770X_NR_RESET		32
 
 #endif /*_DT_BINDINGS_RESET_TOSHIBA_TMPV770X_H_ */
-- 
cgit v1.2.3


From beeff790c5679b3eacc8ee7021f775f447f47603 Mon Sep 17 00:00:00 2001
From: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Date: Fri, 14 Nov 2025 16:05:11 +0900
Subject: dt-bindings: clock: tmpv770x: Add VIIF clocks

Add clock and reset identifiers for the Video Input Interface.
These identifiers support two instances: VIIF0 and VIIF1.

Signed-off-by: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/toshiba,tmpv770x.h | 11 +++++++++++
 include/dt-bindings/reset/toshiba,tmpv770x.h |  8 ++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/toshiba,tmpv770x.h b/include/dt-bindings/clock/toshiba,tmpv770x.h
index 89189c4f6a52..a36c89266686 100644
--- a/include/dt-bindings/clock/toshiba,tmpv770x.h
+++ b/include/dt-bindings/clock/toshiba,tmpv770x.h
@@ -140,6 +140,9 @@
 #define TMPV770X_CLK_PIREFCLK		124
 #define TMPV770X_CLK_SBUS		125
 #define TMPV770X_CLK_BUSLCK		126
+#define TMPV770X_CLK_VIIFBS1_L2ISP	127
+#define TMPV770X_CLK_VIIFBS1_L1ISP	128
+#define TMPV770X_CLK_VIIFBS1_PROC	129
 
 /* Reset */
 #define TMPV770X_RESET_PIETHER_2P5M	0
@@ -174,5 +177,13 @@
 #define TMPV770X_RESET_PIPCMIF		29
 #define TMPV770X_RESET_PICKMON		30
 #define TMPV770X_RESET_SBUSCLK		31
+#define TMPV770X_RESET_VIIFBS0		32
+#define TMPV770X_RESET_VIIFBS0_APB	33
+#define TMPV770X_RESET_VIIFBS0_L2ISP	34
+#define TMPV770X_RESET_VIIFBS0_L1ISP	35
+#define TMPV770X_RESET_VIIFBS1		36
+#define TMPV770X_RESET_VIIFBS1_APB	37
+#define TMPV770X_RESET_VIIFBS1_L2ISP	38
+#define TMPV770X_RESET_VIIFBS1_L1ISP	39
 
 #endif /*_DT_BINDINGS_CLOCK_TOSHIBA_TMPV770X_H_ */
diff --git a/include/dt-bindings/reset/toshiba,tmpv770x.h b/include/dt-bindings/reset/toshiba,tmpv770x.h
index bedfe253fa36..9452bef31425 100644
--- a/include/dt-bindings/reset/toshiba,tmpv770x.h
+++ b/include/dt-bindings/reset/toshiba,tmpv770x.h
@@ -36,5 +36,13 @@
 #define TMPV770X_RESET_PIPCMIF		29
 #define TMPV770X_RESET_PICKMON		30
 #define TMPV770X_RESET_SBUSCLK		31
+#define TMPV770X_RESET_VIIFBS0		32
+#define TMPV770X_RESET_VIIFBS0_APB	33
+#define TMPV770X_RESET_VIIFBS0_L2ISP	34
+#define TMPV770X_RESET_VIIFBS0_L1ISP	35
+#define TMPV770X_RESET_VIIFBS1		36
+#define TMPV770X_RESET_VIIFBS1_APB	37
+#define TMPV770X_RESET_VIIFBS1_L2ISP	38
+#define TMPV770X_RESET_VIIFBS1_L1ISP	39
 
 #endif /*_DT_BINDINGS_RESET_TOSHIBA_TMPV770X_H_ */
-- 
cgit v1.2.3


From 011d133bb988f80d597a9cbdab659414ba7ff72b Mon Sep 17 00:00:00 2001
From: Daniel Zahka <daniel.zahka@gmail.com>
Date: Tue, 18 Nov 2025 18:50:31 -0800
Subject: devlink: pass extack through to devlink_param::get()

Allow devlink_param::get() handlers to report error messages via
extack. This function is called in a few different contexts, but not
all of them will have a valid extack to use.

When devlink_param::get() is called from param_get_doit or
param_get_dumpit contexts, pass the extack through so that drivers can
report errors when retrieving param values. devlink_param::get() is
called from the context of devlink_param_notify(), pass NULL in for
the extack.

Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-2-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c   |  6 ++++--
 drivers/net/ethernet/amd/pds_core/core.h              |  3 ++-
 drivers/net/ethernet/amd/pds_core/devlink.c           |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c     |  6 ++++--
 drivers/net/ethernet/intel/i40e/i40e_devlink.c        |  3 ++-
 drivers/net/ethernet/intel/ice/devlink/devlink.c      | 14 ++++++++++----
 .../net/ethernet/marvell/octeontx2/af/rvu_devlink.c   | 15 ++++++++++-----
 .../net/ethernet/marvell/octeontx2/nic/otx2_devlink.c |  6 ++++--
 drivers/net/ethernet/mellanox/mlx4/main.c             |  6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c     |  3 ++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c    |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c     |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c    |  3 ++-
 .../net/ethernet/mellanox/mlx5/core/lib/nv_param.c    |  9 ++++++---
 .../net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c   |  3 ++-
 drivers/net/ethernet/netronome/nfp/devlink_param.c    |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_devlink.c         |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     |  3 ++-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c              |  3 ++-
 drivers/net/ethernet/ti/cpsw_new.c                    |  6 ++++--
 drivers/net/wwan/iosm/iosm_ipc_devlink.c              |  3 ++-
 include/net/devlink.h                                 |  3 ++-
 include/net/dsa.h                                     |  3 ++-
 net/devlink/param.c                                   | 19 +++++++++++--------
 net/dsa/devlink.c                                     |  3 ++-
 25 files changed, 89 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
index 215a1a8ba7e9..07a74f702c3a 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
+++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
@@ -24,7 +24,8 @@ static int otx2_cpt_dl_egrp_delete(struct devlink *dl, u32 id,
 }
 
 static int otx2_cpt_dl_uc_info(struct devlink *dl, u32 id,
-			       struct devlink_param_gset_ctx *ctx)
+			       struct devlink_param_gset_ctx *ctx,
+			       struct netlink_ext_ack *extack)
 {
 	ctx->val.vstr[0] = '\0';
 
@@ -32,7 +33,8 @@ static int otx2_cpt_dl_uc_info(struct devlink *dl, u32 id,
 }
 
 static int otx2_cpt_dl_t106_mode_get(struct devlink *dl, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
 	struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 0b53a1fab46d..4a6b35c84dab 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -255,7 +255,8 @@ int pdsc_dl_flash_update(struct devlink *dl,
 			 struct devlink_flash_update_params *params,
 			 struct netlink_ext_ack *extack);
 int pdsc_dl_enable_get(struct devlink *dl, u32 id,
-		       struct devlink_param_gset_ctx *ctx);
+		       struct devlink_param_gset_ctx *ctx,
+		       struct netlink_ext_ack *extack);
 int pdsc_dl_enable_set(struct devlink *dl, u32 id,
 		       struct devlink_param_gset_ctx *ctx,
 		       struct netlink_ext_ack *extack);
diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c
index d8dc39da4161..b576be626a29 100644
--- a/drivers/net/ethernet/amd/pds_core/devlink.c
+++ b/drivers/net/ethernet/amd/pds_core/devlink.c
@@ -22,7 +22,8 @@ pdsc_viftype *pdsc_dl_find_viftype_by_id(struct pdsc *pdsc,
 }
 
 int pdsc_dl_enable_get(struct devlink *dl, u32 id,
-		       struct devlink_param_gset_ctx *ctx)
+		       struct devlink_param_gset_ctx *ctx,
+		       struct netlink_ext_ack *extack)
 {
 	struct pdsc *pdsc = devlink_priv(dl);
 	struct pdsc_viftype *vt_entry;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 67ca02d84c97..15de802bbac4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -1086,7 +1086,8 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg,
 }
 
 static int bnxt_dl_nvm_param_get(struct devlink *dl, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = bnxt_get_bp_from_dl(dl);
 	struct hwrm_nvm_get_variable_input *req;
@@ -1168,7 +1169,8 @@ static int bnxt_dl_msix_validate(struct devlink *dl, u32 id,
 }
 
 static int bnxt_remote_dev_reset_get(struct devlink *dl, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = bnxt_get_bp_from_dl(dl);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.c b/drivers/net/ethernet/intel/i40e/i40e_devlink.c
index bc205e3077c7..229179ccc131 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_devlink.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.c
@@ -24,7 +24,8 @@ static int i40e_max_mac_per_vf_set(struct devlink *devlink,
 
 static int i40e_max_mac_per_vf_get(struct devlink *devlink,
 				   u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct i40e_pf *pf = devlink_priv(devlink);
 
diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c
index 938914abbe06..d88b7f3fd1f9 100644
--- a/drivers/net/ethernet/intel/ice/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c
@@ -610,11 +610,13 @@ exit_release_res:
  * @devlink: pointer to the devlink instance
  * @id: the parameter ID to set
  * @ctx: context to store the parameter value
+ * @extack: netlink extended ACK structure
  *
  * Return: zero on success and negative value on failure.
  */
 static int ice_devlink_tx_sched_layers_get(struct devlink *devlink, u32 id,
-					   struct devlink_param_gset_ctx *ctx)
+					   struct devlink_param_gset_ctx *ctx,
+					   struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	int err;
@@ -1349,7 +1351,8 @@ static const struct devlink_ops ice_sf_devlink_ops;
 
 static int
 ice_devlink_enable_roce_get(struct devlink *devlink, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+			    struct devlink_param_gset_ctx *ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	struct iidc_rdma_core_dev_info *cdev;
@@ -1415,7 +1418,8 @@ ice_devlink_enable_roce_validate(struct devlink *devlink, u32 id,
 
 static int
 ice_devlink_enable_iw_get(struct devlink *devlink, u32 id,
-			  struct devlink_param_gset_ctx *ctx)
+			  struct devlink_param_gset_ctx *ctx,
+			  struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	struct iidc_rdma_core_dev_info *cdev;
@@ -1522,11 +1526,13 @@ static int ice_devlink_local_fwd_str_to_mode(const char *mode_str)
  * @devlink: Pointer to the devlink instance.
  * @id: The parameter ID to set.
  * @ctx: Context to store the parameter value.
+ * @extack: netlink extended ACK structure
  *
  * Return: Zero.
  */
 static int ice_devlink_local_fwd_get(struct devlink *devlink, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	struct ice_port_info *pi;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
index 3735372539bd..0f9953eaf1b0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
@@ -1233,7 +1233,8 @@ static int rvu_af_dl_dwrr_mtu_set(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_dl_dwrr_mtu_get(struct devlink *devlink, u32 id,
-				  struct devlink_param_gset_ctx *ctx)
+				  struct devlink_param_gset_ctx *ctx,
+				  struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1259,7 +1260,8 @@ enum rvu_af_dl_param_id {
 };
 
 static int rvu_af_npc_exact_feature_get(struct devlink *devlink, u32 id,
-					struct devlink_param_gset_ctx *ctx)
+					struct devlink_param_gset_ctx *ctx,
+					struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1314,7 +1316,8 @@ static int rvu_af_npc_exact_feature_validate(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_dl_npc_mcam_high_zone_percent_get(struct devlink *devlink, u32 id,
-						    struct devlink_param_gset_ctx *ctx)
+						    struct devlink_param_gset_ctx *ctx,
+						    struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1376,7 +1379,8 @@ static int rvu_af_dl_npc_mcam_high_zone_percent_validate(struct devlink *devlink
 }
 
 static int rvu_af_dl_npc_def_rule_cntr_get(struct devlink *devlink, u32 id,
-					   struct devlink_param_gset_ctx *ctx)
+					   struct devlink_param_gset_ctx *ctx,
+					   struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1402,7 +1406,8 @@ static int rvu_af_dl_npc_def_rule_cntr_set(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_dl_nix_maxlf_get(struct devlink *devlink, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
index e13ae5484c19..a72694219df4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
@@ -48,7 +48,8 @@ static int otx2_dl_mcam_count_set(struct devlink *devlink, u32 id,
 }
 
 static int otx2_dl_mcam_count_get(struct devlink *devlink, u32 id,
-				  struct devlink_param_gset_ctx *ctx)
+				  struct devlink_param_gset_ctx *ctx,
+				  struct netlink_ext_ack *extack)
 {
 	struct otx2_devlink *otx2_dl = devlink_priv(devlink);
 	struct otx2_nic *pfvf = otx2_dl->pfvf;
@@ -84,7 +85,8 @@ static int otx2_dl_ucast_flt_cnt_set(struct devlink *devlink, u32 id,
 }
 
 static int otx2_dl_ucast_flt_cnt_get(struct devlink *devlink, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct otx2_devlink *otx2_dl = devlink_priv(devlink);
 	struct otx2_nic *pfvf = otx2_dl->pfvf;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 03d2fc7d9b09..2de226951e19 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -174,7 +174,8 @@ MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is defaul
 static atomic_t pf_loading = ATOMIC_INIT(0);
 
 static int mlx4_devlink_ierr_reset_get(struct devlink *devlink, u32 id,
-				       struct devlink_param_gset_ctx *ctx)
+				       struct devlink_param_gset_ctx *ctx,
+				       struct netlink_ext_ack *extack)
 {
 	ctx->val.vbool = !!mlx4_internal_err_reset;
 	return 0;
@@ -189,7 +190,8 @@ static int mlx4_devlink_ierr_reset_set(struct devlink *devlink, u32 id,
 }
 
 static int mlx4_devlink_crdump_snapshot_get(struct devlink *devlink, u32 id,
-					    struct devlink_param_gset_ctx *ctx)
+					    struct devlink_param_gset_ctx *ctx,
+					    struct netlink_ext_ack *extack)
 {
 	struct mlx4_priv *priv = devlink_priv(devlink);
 	struct mlx4_dev *dev = &priv->dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 3adf2b1cd26a..4b7a1ce7f406 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1969,7 +1969,8 @@ static int mlx5_devlink_esw_multiport_set(struct devlink *devlink, u32 id,
 }
 
 static int mlx5_devlink_esw_multiport_get(struct devlink *devlink, u32 id,
-					  struct devlink_param_gset_ctx *ctx)
+					  struct devlink_param_gset_ctx *ctx,
+					  struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8ebca0d17f65..8de6c7f6c294 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2618,7 +2618,8 @@ done:
 }
 
 static int esw_port_metadata_get(struct devlink *devlink, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 2b755a0035ce..0a6031a64c6f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3833,7 +3833,8 @@ static int mlx5_fs_mode_set(struct devlink *devlink, u32 id,
 }
 
 static int mlx5_fs_mode_get(struct devlink *devlink, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+			    struct devlink_param_gset_ctx *ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 89e399606877..2bceb42c98cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -73,7 +73,8 @@ static int mlx5_fw_reset_enable_remote_dev_reset_set(struct devlink *devlink, u3
 }
 
 static int mlx5_fw_reset_enable_remote_dev_reset_get(struct devlink *devlink, u32 id,
-						     struct devlink_param_gset_ctx *ctx)
+						     struct devlink_param_gset_ctx *ctx,
+						     struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_fw_reset *fw_reset;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c
index 459a0b4d08e6..70cb22fa96db 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c
@@ -200,7 +200,8 @@ static const char *const
 
 static int
 mlx5_nv_param_devlink_cqe_compress_get(struct devlink *devlink, u32 id,
-				       struct devlink_param_gset_ctx *ctx)
+				       struct devlink_param_gset_ctx *ctx,
+				       struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {};
@@ -302,7 +303,8 @@ static int mlx5_nv_param_read_per_host_pf_conf(struct mlx5_core_dev *dev,
 }
 
 static int mlx5_devlink_enable_sriov_get(struct devlink *devlink, u32 id,
-					 struct devlink_param_gset_ctx *ctx)
+					 struct devlink_param_gset_ctx *ctx,
+					 struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {};
@@ -413,7 +415,8 @@ static int mlx5_devlink_enable_sriov_set(struct devlink *devlink, u32 id,
 }
 
 static int mlx5_devlink_total_vfs_get(struct devlink *devlink, u32 id,
-				      struct devlink_param_gset_ctx *ctx)
+				      struct devlink_param_gset_ctx *ctx,
+				      struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
index b1d08e958bf9..69f9da9fb305 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
@@ -1489,7 +1489,8 @@ mlxsw_sp_acl_tcam_vregion_rehash(struct mlxsw_sp *mlxsw_sp,
 
 static int
 mlxsw_sp_acl_tcam_region_rehash_intrvl_get(struct devlink *devlink, u32 id,
-					   struct devlink_param_gset_ctx *ctx)
+					   struct devlink_param_gset_ctx *ctx,
+					   struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 	struct mlxsw_sp_acl_tcam *tcam;
diff --git a/drivers/net/ethernet/netronome/nfp/devlink_param.c b/drivers/net/ethernet/netronome/nfp/devlink_param.c
index 0e1a3800f371..85e3b19e6165 100644
--- a/drivers/net/ethernet/netronome/nfp/devlink_param.c
+++ b/drivers/net/ethernet/netronome/nfp/devlink_param.c
@@ -81,7 +81,8 @@ static const struct nfp_devlink_param_u8_arg nfp_devlink_u8_args[] = {
 
 static int
 nfp_devlink_param_u8_get(struct devlink *devlink, u32 id,
-			 struct devlink_param_gset_ctx *ctx)
+			 struct devlink_param_gset_ctx *ctx,
+			 struct netlink_ext_ack *extack)
 {
 	const struct nfp_devlink_param_u8_arg *arg;
 	struct nfp_pf *pf = devlink_priv(devlink);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
index 94c5689b5abd..0c5278c0598c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
@@ -121,7 +121,8 @@ void qed_fw_reporters_destroy(struct devlink *devlink)
 }
 
 static int qed_dl_param_get(struct devlink *dl, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+			    struct devlink_param_gset_ctx *ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct qed_devlink *qed_dl = devlink_priv(dl);
 	struct qed_dev *cdev;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b1aa236d8051..6cacedb2c9b3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7531,7 +7531,8 @@ static int stmmac_dl_ts_coarse_set(struct devlink *dl, u32 id,
 }
 
 static int stmmac_dl_ts_coarse_get(struct devlink *dl, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct stmmac_devlink_priv *dl_priv = devlink_priv(dl);
 	struct stmmac_priv *priv = dl_priv->stmmac_priv;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index d5f358ec9820..5924db6be3fe 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -3068,7 +3068,8 @@ static void am65_cpsw_init_host_port_emac(struct am65_cpsw_common *common)
 }
 
 static int am65_cpsw_dl_switch_mode_get(struct devlink *dl, u32 id,
-					struct devlink_param_gset_ctx *ctx)
+					struct devlink_param_gset_ctx *ctx,
+					struct netlink_ext_ack *extack)
 {
 	struct am65_cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct am65_cpsw_common *common = dl_priv->common;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 8b9e2078c602..ab88d4c02cbd 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1618,7 +1618,8 @@ static const struct devlink_ops cpsw_devlink_ops = {
 };
 
 static int cpsw_dl_switch_mode_get(struct devlink *dl, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct cpsw_common *cpsw = dl_priv->cpsw;
@@ -1753,7 +1754,8 @@ exit:
 }
 
 static int cpsw_dl_ale_ctrl_get(struct devlink *dl, u32 id,
-				struct devlink_param_gset_ctx *ctx)
+				struct devlink_param_gset_ctx *ctx,
+				struct netlink_ext_ack *extack)
 {
 	struct cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct cpsw_common *cpsw = dl_priv->cpsw;
diff --git a/drivers/net/wwan/iosm/iosm_ipc_devlink.c b/drivers/net/wwan/iosm/iosm_ipc_devlink.c
index 33d6342124bc..301a9d294d30 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_devlink.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_devlink.c
@@ -21,7 +21,8 @@ static struct iosm_coredump_file_info list[IOSM_NOF_CD_REGION] = {
 
 /* Get the param values for the specific param ID's */
 static int ipc_devlink_get_param(struct devlink *dl, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct iosm_devlink *ipc_devlink = devlink_priv(dl);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index d01046ef0577..5f479227144d 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -490,7 +490,8 @@ struct devlink_param {
 	enum devlink_param_type type;
 	unsigned long supported_cmodes;
 	int (*get)(struct devlink *devlink, u32 id,
-		   struct devlink_param_gset_ctx *ctx);
+		   struct devlink_param_gset_ctx *ctx,
+		   struct netlink_ext_ack *extack);
 	int (*set)(struct devlink *devlink, u32 id,
 		   struct devlink_param_gset_ctx *ctx,
 		   struct netlink_ext_ack *extack);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 97d5f401cfcf..e40cdc12f7f3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1251,7 +1251,8 @@ struct dsa_switch_ops {
 			     dsa_devlink_param_get, dsa_devlink_param_set, NULL)
 
 int dsa_devlink_param_get(struct devlink *dl, u32 id,
-			  struct devlink_param_gset_ctx *ctx);
+			  struct devlink_param_gset_ctx *ctx,
+			  struct netlink_ext_ack *extack);
 int dsa_devlink_param_set(struct devlink *dl, u32 id,
 			  struct devlink_param_gset_ctx *ctx,
 			  struct netlink_ext_ack *extack);
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 6b233b13b69a..3dbd023e4c36 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -174,11 +174,12 @@ devlink_param_cmode_is_supported(const struct devlink_param *param,
 
 static int devlink_param_get(struct devlink *devlink,
 			     const struct devlink_param *param,
-			     struct devlink_param_gset_ctx *ctx)
+			     struct devlink_param_gset_ctx *ctx,
+			     struct netlink_ext_ack *extack)
 {
 	if (!param->get)
 		return -EOPNOTSUPP;
-	return param->get(devlink, param->id, ctx);
+	return param->get(devlink, param->id, ctx, extack);
 }
 
 static int devlink_param_set(struct devlink *devlink,
@@ -250,7 +251,8 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 				 unsigned int port_index,
 				 struct devlink_param_item *param_item,
 				 enum devlink_command cmd,
-				 u32 portid, u32 seq, int flags)
+				 u32 portid, u32 seq, int flags,
+				 struct netlink_ext_ack *extack)
 {
 	union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
 	bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
@@ -275,7 +277,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 				return -EOPNOTSUPP;
 		} else {
 			ctx.cmode = i;
-			err = devlink_param_get(devlink, param, &ctx);
+			err = devlink_param_get(devlink, param, &ctx, extack);
 			if (err)
 				return err;
 			param_value[i] = ctx.val;
@@ -357,7 +359,7 @@ static void devlink_param_notify(struct devlink *devlink,
 	if (!msg)
 		return;
 	err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd,
-				    0, 0, 0);
+				    0, 0, 0, NULL);
 	if (err) {
 		nlmsg_free(msg);
 		return;
@@ -400,7 +402,8 @@ static int devlink_nl_param_get_dump_one(struct sk_buff *msg,
 		err = devlink_nl_param_fill(msg, devlink, 0, param_item,
 					    DEVLINK_CMD_PARAM_GET,
 					    NETLINK_CB(cb->skb).portid,
-					    cb->nlh->nlmsg_seq, flags);
+					    cb->nlh->nlmsg_seq, flags,
+					    cb->extack);
 		if (err == -EOPNOTSUPP) {
 			err = 0;
 		} else if (err) {
@@ -509,8 +512,8 @@ int devlink_nl_param_get_doit(struct sk_buff *skb,
 		return -ENOMEM;
 
 	err = devlink_nl_param_fill(msg, devlink, 0, param_item,
-				    DEVLINK_CMD_PARAM_GET,
-				    info->snd_portid, info->snd_seq, 0);
+				    DEVLINK_CMD_PARAM_GET, info->snd_portid,
+				    info->snd_seq, 0, info->extack);
 	if (err) {
 		nlmsg_free(msg);
 		return err;
diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c
index f41f9fc2194e..ed342f345692 100644
--- a/net/dsa/devlink.c
+++ b/net/dsa/devlink.c
@@ -182,7 +182,8 @@ static const struct devlink_ops dsa_devlink_ops = {
 };
 
 int dsa_devlink_param_get(struct devlink *dl, u32 id,
-			  struct devlink_param_gset_ctx *ctx)
+			  struct devlink_param_gset_ctx *ctx,
+			  struct netlink_ext_ack *extack)
 {
 	struct dsa_switch *ds = dsa_devlink_to_ds(dl);
 
-- 
cgit v1.2.3


From 2a367002ed321e884276c3d7232a362ddd1bf7d6 Mon Sep 17 00:00:00 2001
From: Daniel Zahka <daniel.zahka@gmail.com>
Date: Tue, 18 Nov 2025 18:50:33 -0800
Subject: devlink: support default values for param-get and param-set

Support querying and resetting to default param values.

Introduce two new devlink netlink attrs:
DEVLINK_ATTR_PARAM_VALUE_DEFAULT and
DEVLINK_ATTR_PARAM_RESET_DEFAULT. The former is used to contain an
optional parameter value inside of the param_value nested
attribute. The latter is used in param-set requests from userspace to
indicate that the driver should reset the param to its default value.

To implement this, two new functions are added to the devlink driver
api: devlink_param::get_default() and
devlink_param::reset_default(). These callbacks allow drivers to
implement default param actions for runtime and permanent cmodes. For
driverinit params, the core latches the last value set by a driver via
devl_param_driverinit_value_set(), and uses that as the default value
for a param.

Because default parameter values are optional, encoding a bool default
with a netlink flag type would make it impossible to distinguish a
default value of false from a default that was not provided. For this
reason, when a DEVLINK_PARAM_TYPE_BOOL has an associated default value,
the default value is encoded using a u8 type.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-4-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/devlink.yaml           |   9 ++
 .../networking/devlink/devlink-params.rst          |  10 ++
 include/net/devlink.h                              |  42 +++++++++
 include/uapi/linux/devlink.h                       |   3 +
 net/devlink/netlink_gen.c                          |   5 +-
 net/devlink/param.c                                | 105 ++++++++++++++++++---
 6 files changed, 160 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index 426d5aa7d955..837112da6738 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -859,6 +859,14 @@ attribute-sets:
         name: health-reporter-burst-period
         type: u64
         doc: Time (in msec) for recoveries before starting the grace period.
+
+      # TODO: fill in the attributes in between
+
+      -
+        name: param-reset-default
+        type: flag
+        doc: Request restoring parameter to its default value.
+        value: 183
   -
     name: dl-dev-stats
     subset-of: devlink
@@ -1793,6 +1801,7 @@ operations:
             - param-type
             # param-value-data is missing here as the type is variable
             - param-value-cmode
+            - param-reset-default
 
     -
       name: region-get
diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index c0597d456641..ea17756dcda6 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -41,6 +41,16 @@ In order for ``driverinit`` parameters to take effect, the driver must
 support reloading via the ``devlink-reload`` command. This command will
 request a reload of the device driver.
 
+Default parameter values
+=========================
+
+Drivers may optionally export default values for parameters of cmode
+``runtime`` and ``permanent``. For ``driverinit`` parameters, the last
+value set by the driver will be used as the default value. Drivers can
+also support resetting params with cmode ``runtime`` and ``permanent``
+to their default values. Resetting ``driverinit`` params is supported
+by devlink core without additional driver support needed.
+
 .. _devlink_params_generic:
 
 Generic configuration parameters
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5f479227144d..cb839e0435a1 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -479,6 +479,10 @@ struct devlink_flash_notify {
  * @set: set parameter value, used for runtime and permanent
  *       configuration modes
  * @validate: validate input value is applicable (within value range, etc.)
+ * @get_default: get parameter default value, used for runtime and permanent
+ *               configuration modes
+ * @reset_default: reset parameter to default value, used for runtime and permanent
+ *                 configuration modes
  *
  * This struct should be used by the driver to fill the data for
  * a parameter it registers.
@@ -498,6 +502,12 @@ struct devlink_param {
 	int (*validate)(struct devlink *devlink, u32 id,
 			union devlink_param_value val,
 			struct netlink_ext_ack *extack);
+	int (*get_default)(struct devlink *devlink, u32 id,
+			   struct devlink_param_gset_ctx *ctx,
+			   struct netlink_ext_ack *extack);
+	int (*reset_default)(struct devlink *devlink, u32 id,
+			     enum devlink_param_cmode cmode,
+			     struct netlink_ext_ack *extack);
 };
 
 struct devlink_param_item {
@@ -509,6 +519,7 @@ struct devlink_param_item {
 							 * until reload.
 							 */
 	bool driverinit_value_new_valid;
+	union devlink_param_value driverinit_default;
 };
 
 enum devlink_param_generic_id {
@@ -630,6 +641,37 @@ enum devlink_param_generic_id {
 	.validate = _validate,						\
 }
 
+#define DEVLINK_PARAM_GENERIC_WITH_DEFAULTS(_id, _cmodes, _get, _set,	      \
+					    _validate, _get_default,	      \
+					    _reset_default)		      \
+{									      \
+	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				      \
+	.name = DEVLINK_PARAM_GENERIC_##_id##_NAME,			      \
+	.type = DEVLINK_PARAM_GENERIC_##_id##_TYPE,			      \
+	.generic = true,						      \
+	.supported_cmodes = _cmodes,					      \
+	.get = _get,							      \
+	.set = _set,							      \
+	.validate = _validate,						      \
+	.get_default = _get_default,					      \
+	.reset_default = _reset_default,				      \
+}
+
+#define DEVLINK_PARAM_DRIVER_WITH_DEFAULTS(_id, _name, _type, _cmodes,	      \
+					   _get, _set, _validate,	      \
+					   _get_default, _reset_default)      \
+{									      \
+	.id = _id,							      \
+	.name = _name,							      \
+	.type = _type,							      \
+	.supported_cmodes = _cmodes,					      \
+	.get = _get,							      \
+	.set = _set,							      \
+	.validate = _validate,						      \
+	.get_default = _get_default,					      \
+	.reset_default = _reset_default,				      \
+}
+
 /* Identifier of board design */
 #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID	"board.id"
 /* Revision of board design */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 157f11d3fb72..e7d6b6d13470 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -639,6 +639,9 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,	/* u64 */
 
+	DEVLINK_ATTR_PARAM_VALUE_DEFAULT,	/* dynamic */
+	DEVLINK_ATTR_PARAM_RESET_DEFAULT,	/* flag */
+
 	/* Add new attributes above here, update the spec in
 	 * Documentation/netlink/specs/devlink.yaml and re-generate
 	 * net/devlink/netlink_gen.c.
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index 5ad435aee29d..580985025f49 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -301,12 +301,13 @@ static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV
 };
 
 /* DEVLINK_CMD_PARAM_SET - do */
-static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VALUE_CMODE + 1] = {
+static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_RESET_DEFAULT + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate),
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2),
+	[DEVLINK_ATTR_PARAM_RESET_DEFAULT] = { .type = NLA_FLAG, },
 };
 
 /* DEVLINK_CMD_REGION_GET - do */
@@ -919,7 +920,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
 		.doit		= devlink_nl_param_set_doit,
 		.post_doit	= devlink_nl_post_doit,
 		.policy		= devlink_param_set_nl_policy,
-		.maxattr	= DEVLINK_ATTR_PARAM_VALUE_CMODE,
+		.maxattr	= DEVLINK_ATTR_PARAM_RESET_DEFAULT,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 3aa14ef345f0..e0ea93eded43 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -192,9 +192,32 @@ static int devlink_param_set(struct devlink *devlink,
 	return param->set(devlink, param->id, ctx, extack);
 }
 
+static int devlink_param_get_default(struct devlink *devlink,
+				     const struct devlink_param *param,
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
+{
+	if (!param->get_default)
+		return -EOPNOTSUPP;
+
+	return param->get_default(devlink, param->id, ctx, extack);
+}
+
+static int devlink_param_reset_default(struct devlink *devlink,
+				       const struct devlink_param *param,
+				       enum devlink_param_cmode cmode,
+				       struct netlink_ext_ack *extack)
+{
+	if (!param->reset_default)
+		return -EOPNOTSUPP;
+
+	return param->reset_default(devlink, param->id, cmode, extack);
+}
+
 static int
 devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type,
-			   int nla_type, union devlink_param_value val)
+			   int nla_type, union devlink_param_value val,
+			   bool flag_as_u8)
 {
 	switch (type) {
 	case DEVLINK_PARAM_TYPE_U8:
@@ -218,8 +241,16 @@ devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type,
 			return -EMSGSIZE;
 		break;
 	case DEVLINK_PARAM_TYPE_BOOL:
-		if (val.vbool && nla_put_flag(msg, nla_type))
-			return -EMSGSIZE;
+		/* default values of type bool are encoded with u8, so that
+		 * false can be distinguished from not present
+		 */
+		if (flag_as_u8) {
+			if (nla_put_u8(msg, nla_type, val.vbool))
+				return -EMSGSIZE;
+		} else {
+			if (val.vbool && nla_put_flag(msg, nla_type))
+				return -EMSGSIZE;
+		}
 		break;
 	}
 	return 0;
@@ -229,7 +260,9 @@ static int
 devlink_nl_param_value_fill_one(struct sk_buff *msg,
 				enum devlink_param_type type,
 				enum devlink_param_cmode cmode,
-				union devlink_param_value val)
+				union devlink_param_value val,
+				union devlink_param_value default_val,
+				bool has_default)
 {
 	struct nlattr *param_value_attr;
 	int err = -EMSGSIZE;
@@ -243,10 +276,19 @@ devlink_nl_param_value_fill_one(struct sk_buff *msg,
 		goto value_nest_cancel;
 
 	err = devlink_nl_param_value_put(msg, type,
-					 DEVLINK_ATTR_PARAM_VALUE_DATA, val);
+					 DEVLINK_ATTR_PARAM_VALUE_DATA,
+					 val, false);
 	if (err)
 		goto value_nest_cancel;
 
+	if (has_default) {
+		err = devlink_nl_param_value_put(msg, type,
+						 DEVLINK_ATTR_PARAM_VALUE_DEFAULT,
+						 default_val, true);
+		if (err)
+			goto value_nest_cancel;
+	}
+
 	nla_nest_end(msg, param_value_attr);
 	return 0;
 
@@ -262,7 +304,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 				 u32 portid, u32 seq, int flags,
 				 struct netlink_ext_ack *extack)
 {
+	union devlink_param_value default_value[DEVLINK_PARAM_CMODE_MAX + 1];
 	union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+	bool default_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
 	bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
 	const struct devlink_param *param = param_item->param;
 	struct devlink_param_gset_ctx ctx;
@@ -283,12 +327,26 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 				param_value[i] = param_item->driverinit_value;
 			else
 				return -EOPNOTSUPP;
+
+			if (param_item->driverinit_value_valid) {
+				default_value[i] = param_item->driverinit_default;
+				default_value_set[i] = true;
+			}
 		} else {
 			ctx.cmode = i;
 			err = devlink_param_get(devlink, param, &ctx, extack);
 			if (err)
 				return err;
 			param_value[i] = ctx.val;
+
+			err = devlink_param_get_default(devlink, param, &ctx,
+							extack);
+			if (!err) {
+				default_value[i] = ctx.val;
+				default_value_set[i] = true;
+			} else if (err != -EOPNOTSUPP) {
+				return err;
+			}
 		}
 		param_value_set[i] = true;
 	}
@@ -325,7 +383,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 		if (!param_value_set[i])
 			continue;
 		err = devlink_nl_param_value_fill_one(msg, param->type,
-						      i, param_value[i]);
+						      i, param_value[i],
+						      default_value[i],
+						      default_value_set[i]);
 		if (err)
 			goto values_list_nest_cancel;
 	}
@@ -542,6 +602,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 	struct devlink_param_item *param_item;
 	const struct devlink_param *param;
 	union devlink_param_value value;
+	bool reset_default;
 	int err = 0;
 
 	param_item = devlink_param_get_from_info(params, info);
@@ -553,13 +614,18 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 		return err;
 	if (param_type != param->type)
 		return -EINVAL;
-	err = devlink_param_value_get_from_info(param, info, &value);
-	if (err)
-		return err;
-	if (param->validate) {
-		err = param->validate(devlink, param->id, value, info->extack);
+
+	reset_default = info->attrs[DEVLINK_ATTR_PARAM_RESET_DEFAULT];
+	if (!reset_default) {
+		err = devlink_param_value_get_from_info(param, info, &value);
 		if (err)
 			return err;
+		if (param->validate) {
+			err = param->validate(devlink, param->id, value,
+					      info->extack);
+			if (err)
+				return err;
+		}
 	}
 
 	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
@@ -569,6 +635,15 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 		return -EOPNOTSUPP;
 
 	if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+		if (reset_default) {
+			if (!param_item->driverinit_value_valid) {
+				NL_SET_ERR_MSG(info->extack,
+					       "Default value not available");
+				return -EOPNOTSUPP;
+			}
+			value = param_item->driverinit_default;
+		}
+
 		param_item->driverinit_value_new = value;
 		param_item->driverinit_value_new_valid = true;
 	} else {
@@ -576,7 +651,12 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 			return -EOPNOTSUPP;
 		ctx.val = value;
 		ctx.cmode = cmode;
-		err = devlink_param_set(devlink, param, &ctx, info->extack);
+		if (reset_default)
+			err = devlink_param_reset_default(devlink, param, cmode,
+							  info->extack);
+		else
+			err = devlink_param_set(devlink, param, &ctx,
+						info->extack);
 		if (err)
 			return err;
 	}
@@ -824,6 +904,7 @@ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 
 	param_item->driverinit_value = init_val;
 	param_item->driverinit_value_valid = true;
+	param_item->driverinit_default = init_val;
 
 	devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
 }
-- 
cgit v1.2.3


From 5d74781ebc86c5fa9e9d6934024c505412de9b52 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 20 Nov 2025 11:28:29 +0200
Subject: vfio/pci: Add dma-buf export support for MMIO regions

Add support for exporting PCI device MMIO regions through dma-buf,
enabling safe sharing of non-struct page memory with controlled
lifetime management. This allows RDMA and other subsystems to import
dma-buf FDs and build them into memory regions for PCI P2P operations.

The implementation provides a revocable attachment mechanism using
dma-buf move operations. MMIO regions are normally pinned as BARs
don't change physical addresses, but access is revoked when the VFIO
device is closed or a PCI reset is issued. This ensures kernel
self-defense against potentially hostile userspace.

Currently VFIO can take MMIO regions from the device's BAR and map
them into a PFNMAP VMA with special PTEs. This mapping type ensures
the memory cannot be used with things like pin_user_pages(), hmm, and
so on. In practice only the user process CPU and KVM can safely make
use of these VMAs. When VFIO shuts down these VMAs are cleaned by
unmap_mapping_range() to prevent any UAF of the MMIO beyond driver
unbind.

However, VFIO type 1 has an insecure behavior where it uses
follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back
into the IOMMU. This has a long history of enabling P2P DMA inside
VMs, but has serious lifetime problems by allowing a UAF of the MMIO
after the VFIO driver has been unbound.

Introduce DMABUF as a new safe way to export a FD based handle for the
MMIO regions. This can be consumed by existing DMABUF importers like
RDMA or DRM without opening a UAF. A following series will add an
importer to iommufd to obsolete the type 1 code and allow safe
UAF-free MMIO P2P in VM cases.

DMABUF has a built in synchronous invalidation mechanism called
move_notify. VFIO keeps track of all drivers importing its MMIO and
can invoke a synchronous invalidation callback to tell the importing
drivers to DMA unmap and forget about the MMIO pfns. This process is
called revoke. This synchronous invalidation fully prevents any
lifecycle problems. VFIO will do this before unbinding its driver
ensuring there is no UAF of the MMIO beyond the driver lifecycle.

Further, VFIO has additional behavior to block access to the MMIO
during things like Function Level Reset. This is because some poor
platforms may experience an MCE type crash when touching MMIO of a PCI
device that is undergoing a reset. Today this is done by using
unmap_mapping_range() on the VMAs. Extend that into the DMABUF world
and temporarily revoke the MMIO from the DMABUF importers during FLR
as well. This will more robustly prevent an errant P2P from possibly
upsetting the platform.

A DMABUF FD is a preferred handle for MMIO compared to using something
like a pgmap because:
 - VFIO is supported, including its P2P feature, on archs that don't
   support pgmap
 - PCI devices have all sorts of BAR sizes, including ones smaller
   than a section so a pgmap cannot always be created
 - It is undesirable to waste a lot of memory for struct pages,
   especially for a case like a GPU with ~100GB of BAR size
 - We want a synchronous revoke semantic to support FLR with light
   hardware requirements

Use the P2P subsystem to help generate the DMA mapping. This is a
significant upgrade over the abuse of dma_map_resource() that has
historically been used by DMABUF exporters. Experience with an OOT
version of this patch shows that real systems do need this. This
approach deals with all the P2P scenarios:
 - Non-zero PCI bus_offset
 - ACS flags routing traffic to the IOMMU
 - ACS flags that bypass the IOMMU - though vfio noiommu is required
   to hit this.

There will be further work to formalize the revoke semantic in
DMABUF. For now this acts like a move_notify dynamic exporter where
importer fault handling will get a failure when they attempt to map.
This means that only fully restartable fault capable importers can
import the VFIO DMABUFs. A future revoke semantic should open this up
to more HW as the HW only needs to invalidate, not handle restartable
faults.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/Kconfig           |   3 +
 drivers/vfio/pci/Makefile          |   1 +
 drivers/vfio/pci/vfio_pci.c        |   5 +
 drivers/vfio/pci/vfio_pci_config.c |  22 ++-
 drivers/vfio/pci/vfio_pci_core.c   |  18 ++-
 drivers/vfio/pci/vfio_pci_dmabuf.c | 316 +++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
 include/linux/vfio_pci_core.h      |  42 +++++
 include/uapi/linux/vfio.h          |  28 ++++
 9 files changed, 453 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c

(limited to 'include')

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..2b9fca00e9e8 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM
 
 	  To enable s390x KVM vfio-pci extensions, say Y.
 
+config VFIO_PCI_DMABUF
+	def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
+
 source "drivers/vfio/pci/mlx5/Kconfig"
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..53f59226ae01 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
 
 vfio-pci-y := vfio_pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac10f14417f2..6d41cf26b539 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
 };
 
+static const struct vfio_pci_device_ops vfio_pci_dev_ops = {
+	.get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_core_device *vdev;
@@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return PTR_ERR(vdev);
 
 	dev_set_drvdata(&pdev->dev, vdev);
+	vdev->pci_ops = &vfio_pci_dev_ops;
 	ret = vfio_pci_core_register_device(vdev);
 	if (ret)
 		goto out_put_vdev;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..1f6008eabf23 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
 		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
 
-		if (!new_mem)
+		if (!new_mem) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
-		else
+			vfio_pci_dma_buf_move(vdev, true);
+		} else {
 			down_write(&vdev->memory_lock);
+		}
 
 		/*
 		 * If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		*virt_cmd &= cpu_to_le16(~mask);
 		*virt_cmd |= cpu_to_le16(new_cmd & mask);
 
+		if (__vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
 		up_write(&vdev->memory_lock);
 	}
 
@@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
 static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
 					  pci_power_t state)
 {
-	if (state >= PCI_D3hot)
+	if (state >= PCI_D3hot) {
 		vfio_pci_zap_and_down_write_memory_lock(vdev);
-	else
+		vfio_pci_dma_buf_move(vdev, true);
+	} else {
 		down_write(&vdev->memory_lock);
+	}
 
 	vfio_pci_set_power_state(vdev, state);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, false);
 			up_write(&vdev->memory_lock);
 		}
 	}
@@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, false);
 			up_write(&vdev->memory_lock);
 		}
 	}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 142b84b3f225..9449cf44c18a 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
 	 * semaphore.
 	 */
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
+	vfio_pci_dma_buf_move(vdev, true);
+
 	if (vdev->pm_runtime_engaged) {
 		up_write(&vdev->memory_lock);
 		return -EINVAL;
@@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
 	 */
 	down_write(&vdev->memory_lock);
 	__vfio_pci_runtime_pm_exit(vdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 #endif
 	vfio_pci_core_disable(vdev);
 
+	vfio_pci_dma_buf_cleanup(vdev);
+
 	mutex_lock(&vdev->igate);
 	if (vdev->err_trigger) {
 		eventfd_ctx_put(vdev->err_trigger);
@@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 	 */
 	vfio_pci_set_power_state(vdev, PCI_D0);
 
+	vfio_pci_dma_buf_move(vdev, true);
 	ret = pci_try_reset_function(vdev->pdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 
 	return ret;
@@ -1511,6 +1520,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_DMA_BUF:
+		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
@@ -2095,6 +2106,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
 	ret = pcim_p2pdma_init(vdev->pdev);
 	if (ret && ret != -EOPNOTSUPP)
 		return ret;
+	INIT_LIST_HEAD(&vdev->dmabufs);
 	init_rwsem(&vdev->memory_lock);
 	xa_init(&vdev->ctx);
 
@@ -2459,6 +2471,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 			break;
 		}
 
+		vfio_pci_dma_buf_move(vdev, true);
 		vfio_pci_zap_bars(vdev);
 	}
 
@@ -2487,8 +2500,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 
 err_undo:
 	list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
-					 vdev.dev_set_list)
+					 vdev.dev_set_list) {
+		if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
 		up_write(&vdev->memory_lock);
+	}
 
 	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
 		pm_runtime_put(&vdev->pdev->dev);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 000000000000..6698f540bdac
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+	struct dma_buf *dmabuf;
+	struct vfio_pci_core_device *vdev;
+	struct list_head dmabufs_elm;
+	size_t size;
+	struct dma_buf_phys_vec *phys_vec;
+	struct p2pdma_provider *provider;
+	u32 nr_ranges;
+	u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+				   struct dma_buf_attachment *attachment)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	if (!attachment->peer2peer)
+		return -EOPNOTSUPP;
+
+	if (priv->revoked)
+		return -ENODEV;
+
+	return 0;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+		     enum dma_data_direction dir)
+{
+	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+
+	dma_resv_assert_held(priv->dmabuf->resv);
+
+	if (priv->revoked)
+		return ERR_PTR(-ENODEV);
+
+	return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
+				       priv->phys_vec, priv->nr_ranges,
+				       priv->size, dir);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+				   struct sg_table *sgt,
+				   enum dma_data_direction dir)
+{
+	dma_buf_free_sgt(attachment, sgt, dir);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	/*
+	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+	 * The refcount prevents both.
+	 */
+	if (priv->vdev) {
+		down_write(&priv->vdev->memory_lock);
+		list_del_init(&priv->dmabufs_elm);
+		up_write(&priv->vdev->memory_lock);
+		vfio_device_put_registration(&priv->vdev->vdev);
+	}
+	kfree(priv->phys_vec);
+	kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+	.attach = vfio_pci_dma_buf_attach,
+	.map_dma_buf = vfio_pci_dma_buf_map,
+	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
+	.release = vfio_pci_dma_buf_release,
+};
+
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+				struct vfio_region_dma_range *dma_ranges,
+				size_t nr_ranges, phys_addr_t start,
+				phys_addr_t len)
+{
+	phys_addr_t max_addr;
+	unsigned int i;
+
+	max_addr = start + len;
+	for (i = 0; i < nr_ranges; i++) {
+		phys_addr_t end;
+
+		if (!dma_ranges[i].length)
+			return -EINVAL;
+
+		if (check_add_overflow(start, dma_ranges[i].offset,
+				       &phys_vec[i].paddr) ||
+		    check_add_overflow(phys_vec[i].paddr,
+				       dma_ranges[i].length, &end))
+			return -EOVERFLOW;
+		if (end > max_addr)
+			return -EINVAL;
+
+		phys_vec[i].len = dma_ranges[i].length;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec);
+
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+				  struct p2pdma_provider **provider,
+				  unsigned int region_index,
+				  struct dma_buf_phys_vec *phys_vec,
+				  struct vfio_region_dma_range *dma_ranges,
+				  size_t nr_ranges)
+{
+	struct pci_dev *pdev = vdev->pdev;
+
+	*provider = pcim_p2pdma_provider(pdev, region_index);
+	if (!*provider)
+		return -EINVAL;
+
+	return vfio_pci_core_fill_phys_vec(
+		phys_vec, dma_ranges, nr_ranges,
+		pci_resource_start(pdev, region_index),
+		pci_resource_len(pdev, region_index));
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);
+
+static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf,
+				 struct vfio_region_dma_range *dma_ranges,
+				 size_t *lengthp)
+{
+	size_t length = 0;
+	u32 i;
+
+	for (i = 0; i < dma_buf->nr_ranges; i++) {
+		u64 offset = dma_ranges[i].offset;
+		u64 len = dma_ranges[i].length;
+
+		if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+			return -EINVAL;
+
+		if (check_add_overflow(length, len, &length))
+			return -EINVAL;
+	}
+
+	/*
+	 * dma_iova_try_alloc() will WARN on if userspace proposes a size that
+	 * is too big, eg with lots of ranges.
+	 */
+	if ((u64)(length) & DMA_IOVA_USE_SWIOTLB)
+		return -EINVAL;
+
+	*lengthp = length;
+	return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz)
+{
+	struct vfio_device_feature_dma_buf get_dma_buf = {};
+	struct vfio_region_dma_range *dma_ranges;
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct vfio_pci_dma_buf *priv;
+	size_t length;
+	int ret;
+
+	if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys)
+		return -EOPNOTSUPP;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(get_dma_buf));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+		return -EFAULT;
+
+	if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
+		return -EINVAL;
+
+	/*
+	 * For PCI the region_index is the BAR number like everything else.
+	 */
+	if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
+		return -ENODEV;
+
+	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
+				       sizeof(*dma_ranges));
+	if (IS_ERR(dma_ranges))
+		return PTR_ERR(dma_ranges);
+
+	ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
+	if (ret)
+		goto err_free_ranges;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto err_free_ranges;
+	}
+	priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
+				 GFP_KERNEL);
+	if (!priv->phys_vec) {
+		ret = -ENOMEM;
+		goto err_free_priv;
+	}
+
+	priv->vdev = vdev;
+	priv->nr_ranges = get_dma_buf.nr_ranges;
+	priv->size = length;
+	ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
+					     get_dma_buf.region_index,
+					     priv->phys_vec, dma_ranges,
+					     priv->nr_ranges);
+	if (ret)
+		goto err_free_phys;
+
+	kfree(dma_ranges);
+	dma_ranges = NULL;
+
+	if (!vfio_device_try_get_registration(&vdev->vdev)) {
+		ret = -ENODEV;
+		goto err_free_phys;
+	}
+
+	exp_info.ops = &vfio_pci_dmabuf_ops;
+	exp_info.size = priv->size;
+	exp_info.flags = get_dma_buf.open_flags;
+	exp_info.priv = priv;
+
+	priv->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(priv->dmabuf)) {
+		ret = PTR_ERR(priv->dmabuf);
+		goto err_dev_put;
+	}
+
+	/* dma_buf_put() now frees priv */
+	INIT_LIST_HEAD(&priv->dmabufs_elm);
+	down_write(&vdev->memory_lock);
+	dma_resv_lock(priv->dmabuf->resv, NULL);
+	priv->revoked = !__vfio_pci_memory_enabled(vdev);
+	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+	dma_resv_unlock(priv->dmabuf->resv);
+	up_write(&vdev->memory_lock);
+
+	/*
+	 * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+	 * will be released.
+	 */
+	ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+	if (ret < 0)
+		goto err_dma_buf;
+	return ret;
+
+err_dma_buf:
+	dma_buf_put(priv->dmabuf);
+err_dev_put:
+	vfio_device_put_registration(&vdev->vdev);
+err_free_phys:
+	kfree(priv->phys_vec);
+err_free_priv:
+	kfree(priv);
+err_free_ranges:
+	kfree(dma_ranges);
+	return ret;
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		if (priv->revoked != revoked) {
+			dma_resv_lock(priv->dmabuf->resv, NULL);
+			priv->revoked = revoked;
+			dma_buf_move_notify(priv->dmabuf);
+			dma_resv_unlock(priv->dmabuf->resv);
+		}
+		fput(priv->dmabuf->file);
+	}
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	down_write(&vdev->memory_lock);
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		dma_resv_lock(priv->dmabuf->resv, NULL);
+		list_del_init(&priv->dmabufs_elm);
+		priv->vdev = NULL;
+		priv->revoked = true;
+		dma_buf_move_notify(priv->dmabuf);
+		dma_resv_unlock(priv->dmabuf->resv);
+		vfio_device_put_registration(&vdev->vdev);
+		fput(priv->dmabuf->file);
+	}
+	up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..28a405f8b97c 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+			      struct vfio_device_feature_dma_buf __user *arg,
+			      size_t argsz)
+{
+	return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+					 bool revoked)
+{
+}
+#endif
+
 #endif
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..c9466ba323fa 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -26,6 +26,8 @@
 
 struct vfio_pci_core_device;
 struct vfio_pci_region;
+struct p2pdma_provider;
+struct dma_buf_phys_vec;
 
 struct vfio_pci_regops {
 	ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
@@ -49,9 +51,48 @@ struct vfio_pci_region {
 	u32				flags;
 };
 
+struct vfio_pci_device_ops {
+	int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev,
+			       struct p2pdma_provider **provider,
+			       unsigned int region_index,
+			       struct dma_buf_phys_vec *phys_vec,
+			       struct vfio_region_dma_range *dma_ranges,
+			       size_t nr_ranges);
+};
+
+#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+				struct vfio_region_dma_range *dma_ranges,
+				size_t nr_ranges, phys_addr_t start,
+				phys_addr_t len);
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+				  struct p2pdma_provider **provider,
+				  unsigned int region_index,
+				  struct dma_buf_phys_vec *phys_vec,
+				  struct vfio_region_dma_range *dma_ranges,
+				  size_t nr_ranges);
+#else
+static inline int
+vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+			    struct vfio_region_dma_range *dma_ranges,
+			    size_t nr_ranges, phys_addr_t start,
+			    phys_addr_t len)
+{
+	return -EINVAL;
+}
+static inline int vfio_pci_core_get_dmabuf_phys(
+	struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider,
+	unsigned int region_index, struct dma_buf_phys_vec *phys_vec,
+	struct vfio_region_dma_range *dma_ranges, size_t nr_ranges)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 struct vfio_pci_core_device {
 	struct vfio_device	vdev;
 	struct pci_dev		*pdev;
+	const struct vfio_pci_device_ops *pci_ops;
 	void __iomem		*barmap[PCI_STD_NUM_BARS];
 	bool			bar_mmap_supported[PCI_STD_NUM_BARS];
 	u8			*pci_config_map;
@@ -94,6 +135,7 @@ struct vfio_pci_core_device {
 	struct vfio_pci_core_device	*sriov_pf_core_dev;
 	struct notifier_block	nb;
 	struct rw_semaphore	memory_lock;
+	struct list_head	dmabufs;
 };
 
 /* Will be exported for vfio pci drivers usage */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..ac2329f24141 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/stddef.h>
 
 #define VFIO_API_VERSION	0
 
@@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
+ * regions selected.
+ *
+ * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
+ * etc. offset/length specify a slice of the region to create the dmabuf from.
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * flags should be 0.
+ *
+ * Return: The fd number on success, -1 and errno is set on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_region_dma_range {
+	__u64 offset;
+	__u64 length;
+};
+
+struct vfio_device_feature_dma_buf {
+	__u32	region_index;
+	__u32	open_flags;
+	__u32   flags;
+	__u32   nr_ranges;
+	struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges);
+};
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
cgit v1.2.3


From 05954511b73e748d0370549ad9dd9cd95297d97a Mon Sep 17 00:00:00 2001
From: Jason Tian <jason@os.amperecomputing.com>
Date: Thu, 14 Aug 2025 09:52:52 -0700
Subject: RAS: Report all ARM processor CPER information to userspace

The ARM processor CPER record was added in UEFI v2.6 and remained
unchanged up to v2.10.

Yet, the original arm_event trace code added by

  e9279e83ad1f ("trace, ras: add ARM processor error trace event")

is incomplete, as it only traces some fields of UEFI 2.6 table N.16, not
exporting any information from tables N.17 to N.29 of the record.

This is not enough for the user to be able to figure out what has
exactly happened or to take appropriate action.

According to the UEFI v2.9 specification chapter N2.4.4, the ARM
processor error section includes:

- several (ERR_INFO_NUM) ARM processor error information structures
  (Tables N.17 to N.20);
- several (CONTEXT_INFO_NUM) ARM processor context information
  structures (Tables N.21 to N.29);
- several vendor specific error information structures. The
  size is given by Section Length minus the size of the other
  fields.

In addition, it also exports two fields that are parsed by the GHES
driver when firmware reports it, e.g.:

- error severity
- CPU logical index

Report all of this information to userspace via the ARM tracepoint so
that userspace can properly record the error and make decisions related
to CPU core isolation according to error severity and other info.

The updated ARM trace event now contains the following fields:

======================================  =============================
UEFI field on table N.16                ARM Processor trace fields
======================================  =============================
Validation                              handled when filling data for
                                        affinity MPIDR and running
                                        state.
ERR_INFO_NUM                            pei_len
CONTEXT_INFO_NUM                        ctx_len
Section Length                          indirectly reported by
                                        pei_len, ctx_len and oem_len
Error affinity level                    affinity
MPIDR_EL1                               mpidr
MIDR_EL1                                midr
Running State                           running_state
PSCI State                              psci_state
Processor Error Information Structure   pei_err - count at pei_len
Processor Context                       ctx_err - count at ctx_len
Vendor Specific Error Info              oem - count at oem_len
======================================  =============================

It should be noted that decoding of tables N.17 to N.29, if needed, will
be handled in userspace. That gives more flexibility, as there won't be
any need to flood the kernel with micro-architecture specific error
decoding.

Also, decoding the other fields requires complex logic, and should be
done for each of the several values inside the record field.  So, let
userspace daemons like rasdaemon decode them, parsing such tables and
having vendor-specific and micro-architecture-specific decoders.

 [mchehab: modified description, solved merge conflicts and fixed coding style]

Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
Co-developed-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com> # rebased
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Shiju Jose <shiju.jose@huawei.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Fixes: e9279e83ad1f ("trace, ras: add ARM processor error trace event")
Link: https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/acpi/apei/ghes.c | 11 ++++-------
 drivers/ras/ras.c        | 40 +++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h      | 16 +++++++++++++---
 include/ras/ras_event.h  | 49 +++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 99 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 97ee19f2cae0..7d2466b51504 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -552,7 +552,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 }
 
 static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
-				       int sev, bool sync)
+				     int sev, bool sync)
 {
 	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
 	int flags = sync ? MF_ACTION_REQUIRED : 0;
@@ -560,9 +560,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 	int sec_sev, i;
 	char *p;
 
-	log_arm_hw_error(err);
-
 	sec_sev = ghes_severity(gdata->error_severity);
+	log_arm_hw_error(err, sec_sev);
 	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
 		return false;
 
@@ -895,11 +894,9 @@ static void ghes_do_proc(struct ghes *ghes,
 
 			arch_apei_report_mem_error(sev, mem_err);
 			queued = ghes_handle_memory_failure(gdata, sev, sync);
-		}
-		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
+		} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			ghes_handle_aer(gdata);
-		}
-		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
+		} else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev, sync);
 		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
 			struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index ac0e132ccc3e..2a5b5a9fdcb3 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -53,9 +53,45 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
 }
 EXPORT_SYMBOL_GPL(log_non_standard_event);
 
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
 {
-	trace_arm_event(err);
+	struct cper_arm_err_info *err_info;
+	struct cper_arm_ctx_info *ctx_info;
+	u8 *ven_err_data;
+	u32 ctx_len = 0;
+	int n, sz, cpu;
+	s32 vsei_len;
+	u32 pei_len;
+	u8 *pei_err, *ctx_err;
+
+	pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+	pei_err = (u8 *)(err + 1);
+
+	err_info = (struct cper_arm_err_info *)(err + 1);
+	ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+	ctx_err = (u8 *)ctx_info;
+
+	for (n = 0; n < err->context_info_num; n++) {
+		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+		ctx_len += sz;
+	}
+
+	vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len);
+	if (vsei_len < 0) {
+		pr_warn(FW_BUG "section length: %d\n", err->section_length);
+		pr_warn(FW_BUG "section length is too small\n");
+		pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
+		vsei_len = 0;
+	}
+	ven_err_data = (u8 *)ctx_info;
+
+	cpu = GET_LOGICAL_INDEX(err->mpidr);
+	if (cpu < 0)
+		cpu = -1;
+
+	trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+			ven_err_data, (u32)vsei_len, sev, cpu);
 }
 
 static int __init ras_init(void)
diff --git a/include/linux/ras.h b/include/linux/ras.h
index a64182bc72ad..468941bfe855 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -24,8 +24,7 @@ int __init parse_cec_param(char *str);
 void log_non_standard_event(const guid_t *sec_type,
 			    const guid_t *fru_id, const char *fru_text,
 			    const u8 sev, const u8 *err, const u32 len);
-void log_arm_hw_error(struct cper_sec_proc_arm *err);
-
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
 #else
 static inline void
 log_non_standard_event(const guid_t *sec_type,
@@ -33,7 +32,7 @@ log_non_standard_event(const guid_t *sec_type,
 		       const u8 sev, const u8 *err, const u32 len)
 { return; }
 static inline void
-log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; }
 #endif
 
 struct atl_err {
@@ -53,4 +52,15 @@ static inline unsigned long
 amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 #endif /* CONFIG_AMD_ATL */
 
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+#include <asm/smp_plat.h>
+/*
+ * Include ARM-specific SMP header which provides a function mapping mpidr to
+ * CPU logical index.
+ */
+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK)
+#else
+#define GET_LOGICAL_INDEX(mpidr) -EINVAL
+#endif /* CONFIG_ARM || CONFIG_ARM64 */
+
 #endif /* __RAS_H__ */
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c8cd0f00c845..c9f0b1018bcc 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -168,11 +168,25 @@ TRACE_EVENT(mc_event,
  * This event is generated when hardware detects an ARM processor error
  * has occurred. UEFI 2.6 spec section N.2.4.4.
  */
+#define APEIL "ARM Processor Err Info data len"
+#define APEID "ARM Processor Err Info raw data"
+#define APECIL "ARM Processor Err Context Info data len"
+#define APECID "ARM Processor Err Context Info raw data"
+#define VSEIL "Vendor Specific Err Info data len"
+#define VSEID "Vendor Specific Err Info raw data"
 TRACE_EVENT(arm_event,
 
-	TP_PROTO(const struct cper_sec_proc_arm *proc),
+	TP_PROTO(const struct cper_sec_proc_arm *proc,
+		 const u8 *pei_err,
+		 const u32 pei_len,
+		 const u8 *ctx_err,
+		 const u32 ctx_len,
+		 const u8 *oem,
+		 const u32 oem_len,
+		 u8 sev,
+		 int cpu),
 
-	TP_ARGS(proc),
+	TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu),
 
 	TP_STRUCT__entry(
 		__field(u64, mpidr)
@@ -180,6 +194,14 @@ TRACE_EVENT(arm_event,
 		__field(u32, running_state)
 		__field(u32, psci_state)
 		__field(u8, affinity)
+		__field(u32, pei_len)
+		__dynamic_array(u8, pei_buf, pei_len)
+		__field(u32, ctx_len)
+		__dynamic_array(u8, ctx_buf, ctx_len)
+		__field(u32, oem_len)
+		__dynamic_array(u8, oem_buf, oem_len)
+		__field(u8, sev)
+		__field(int, cpu)
 	),
 
 	TP_fast_assign(
@@ -199,12 +221,29 @@ TRACE_EVENT(arm_event,
 			__entry->running_state = ~0;
 			__entry->psci_state = ~0;
 		}
+		__entry->pei_len = pei_len;
+		memcpy(__get_dynamic_array(pei_buf), pei_err, pei_len);
+		__entry->ctx_len = ctx_len;
+		memcpy(__get_dynamic_array(ctx_buf), ctx_err, ctx_len);
+		__entry->oem_len = oem_len;
+		memcpy(__get_dynamic_array(oem_buf), oem, oem_len);
+		__entry->sev = sev;
+		__entry->cpu = cpu;
 	),
 
-	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
-		  "running state: %d; PSCI state: %d",
+	TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+		  "running state: %d; PSCI state: %d; "
+		  "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
+		  __entry->cpu,
+		  __entry->sev,
 		  __entry->affinity, __entry->mpidr, __entry->midr,
-		  __entry->running_state, __entry->psci_state)
+		  __entry->running_state, __entry->psci_state,
+		  APEIL, __entry->pei_len, APEID,
+		  __print_hex(__get_dynamic_array(pei_buf), __entry->pei_len),
+		  APECIL, __entry->ctx_len, APECID,
+		  __print_hex(__get_dynamic_array(ctx_buf), __entry->ctx_len),
+		  VSEIL, __entry->oem_len, VSEID,
+		  __print_hex(__get_dynamic_array(oem_buf), __entry->oem_len))
 );
 
 /*
-- 
cgit v1.2.3


From a976d790f49499ccaa0f991788ad8ebf92e7fd5c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Aug 2025 09:52:54 -0700
Subject: efi/cper: Add a new helper function to print bitmasks

Add a helper function to print a string with names associated
to each bit field.

A typical example is:

	const char * const bits[] = {
		"bit 3 name",
		"bit 4 name",
		"bit 5 name",
	};
	char str[120];
        unsigned int bitmask = BIT(3) | BIT(5);

	#define MASK  GENMASK(5,3)

	cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask),
			 bits, ARRAY_SIZE(bits));

The above code fills string "str" with "bit 3 name|bit 5 name".

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/cper.c | 60 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cper.h        |  2 ++
 2 files changed, 62 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 928409199a1a..79ba688a64f8 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -12,6 +12,7 @@
  * Specification version 2.4.
  */
 
+#include <linux/bitmap.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/time.h>
@@ -106,6 +107,65 @@ void cper_print_bits(const char *pfx, unsigned int bits,
 		printk("%s\n", buf);
 }
 
+/**
+ * cper_bits_to_str - return a string for set bits
+ * @buf: buffer to store the output string
+ * @buf_size: size of the output string buffer
+ * @bits: bit mask
+ * @strs: string array, indexed by bit position
+ * @strs_size: size of the string array: @strs
+ *
+ * For each set bit in @bits, append the string at that bit position in
+ * @strs to @buf, separating consecutive entries with a '|' character.
+ *
+ * A typical example is::
+ *
+ *	const char * const bits[] = {
+ *		"bit 3 name",
+ *		"bit 4 name",
+ *		"bit 5 name",
+ *	};
+ *	char str[120];
+ *	unsigned int bitmask = BIT(3) | BIT(5);
+ *	#define MASK GENMASK(5,3)
+ *
+ *	cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask),
+ *			 bits, ARRAY_SIZE(bits));
+ *
+ * The above code fills the string ``str`` with ``bit 3 name|bit 5 name``.
+ *
+ * Return: number of bytes stored or an error code if lower than zero.
+ */
+int cper_bits_to_str(char *buf, int buf_size, unsigned long bits,
+		     const char * const strs[], unsigned int strs_size)
+{
+	int len = buf_size;
+	char *str = buf;
+	int i, size;
+
+	if (buf_size <= 0)
+		return -EINVAL;
+	*buf = '\0';
+
+	for_each_set_bit(i, &bits, strs_size) {
+		/* Add '|' only if a NUL still fits after it, so @buf stays terminated. */
+		if (*buf) {
+			if (len < 2)
+				return -E2BIG;
+			*str++ = '|';
+			len--;
+		}
+
+		size = strscpy(str, strs[i], len);
+		if (size < 0)
+			return size;
+		len -= size;
+		str += size;
+	}
+	return buf_size - len;
+}
+EXPORT_SYMBOL_GPL(cper_bits_to_str);
+
 static const char * const proc_type_strs[] = {
 	"IA32/X64",
 	"IA64",
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 0ed60a91eca9..58f40477c824 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -588,6 +588,8 @@ const char *cper_mem_err_type_str(unsigned int);
 const char *cper_mem_err_status_str(u64 status);
 void cper_print_bits(const char *prefix, unsigned int bits,
 		     const char * const strs[], unsigned int strs_size);
+int cper_bits_to_str(char *buf, int buf_size, unsigned long bits,
+		     const char * const strs[], unsigned int strs_size);
 void cper_mem_err_pack(const struct cper_sec_mem_err *,
 		       struct cper_mem_err_compact *);
 const char *cper_mem_err_unpack(struct trace_seq *,
-- 
cgit v1.2.3


From 96b010536ee020e716d28d9b359a4bcd18800aeb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Aug 2025 09:52:55 -0700
Subject: efi/cper: align ARM CPER type with UEFI 2.9A/2.10 specs

Up to UEFI spec 2.9, the type byte of CPER struct for ARM processor
was defined simply as:

Type at byte offset 4:

	- Cache error
	- TLB Error
	- Bus Error
	- Micro-architectural Error
	All other values are reserved

Yet, there was no information about how this would be encoded.

Spec 2.9A errata corrected it by defining:

	- Bit 1 - Cache Error
	- Bit 2 - TLB Error
	- Bit 3 - Bus Error
	- Bit 4 - Micro-architectural Error
	All other values are reserved

That actually aligns with the values already defined on older
versions at N.2.4.1. Generic Processor Error Section.

Spec 2.10 also preserves the same encoding as 2.9A.

Adjust CPER and GHES handling code for both generic and ARM
processors to properly handle UEFI 2.9A and 2.10 encoding.

Link: https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-information
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/acpi/apei/ghes.c        | 16 ++++++++-----
 drivers/firmware/efi/cper-arm.c | 50 ++++++++++++++++++++---------------------
 include/linux/cper.h            | 10 ++++-----
 3 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 7d2466b51504..56107aa00274 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -22,6 +22,7 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/acpi.h>
+#include <linux/bitfield.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
 #include <linux/timer.h>
@@ -556,6 +557,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 {
 	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
 	int flags = sync ? MF_ACTION_REQUIRED : 0;
+	char error_type[120];
 	bool queued = false;
 	int sec_sev, i;
 	char *p;
@@ -568,9 +570,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 	p = (char *)(err + 1);
 	for (i = 0; i < err->err_info_num; i++) {
 		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
-		bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
+		bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
 		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
-		const char *error_type = "unknown error";
 
 		/*
 		 * The field (err_info->error_info & BIT(26)) is fixed to set to
@@ -584,12 +585,15 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
 			continue;
 		}
 
-		if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
-			error_type = cper_proc_error_type_strs[err_info->type];
+		cper_bits_to_str(error_type, sizeof(error_type),
+				 FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+				 cper_proc_error_type_strs,
+				 ARRAY_SIZE(cper_proc_error_type_strs));
 
 		pr_warn_ratelimited(FW_WARN GHES_PFX
-				    "Unhandled processor error type: %s\n",
-				    error_type);
+				    "Unhandled processor error type 0x%02x: %s%s\n",
+				    err_info->type, error_type,
+				    (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
 		p += err_info->length;
 	}
 
diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
index 6ff781e47147..76542a53e202 100644
--- a/drivers/firmware/efi/cper-arm.c
+++ b/drivers/firmware/efi/cper-arm.c
@@ -93,15 +93,11 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
 	bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
 	bool time_out, access_mode;
 
-	/* If the type is unknown, bail. */
-	if (type > CPER_ARM_MAX_TYPE)
-		return;
-
 	/*
 	 * Vendor type errors have error information values that are vendor
 	 * specific.
 	 */
-	if (type == CPER_ARM_VENDOR_ERROR)
+	if (type & CPER_ARM_VENDOR_ERROR)
 		return;
 
 	if (error_info & CPER_ARM_ERR_VALID_TRANSACTION_TYPE) {
@@ -116,43 +112,38 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
 	if (error_info & CPER_ARM_ERR_VALID_OPERATION_TYPE) {
 		op_type = ((error_info >> CPER_ARM_ERR_OPERATION_SHIFT)
 			   & CPER_ARM_ERR_OPERATION_MASK);
-		switch (type) {
-		case CPER_ARM_CACHE_ERROR:
+		if (type & CPER_ARM_CACHE_ERROR) {
 			if (op_type < ARRAY_SIZE(arm_cache_err_op_strs)) {
-				printk("%soperation type: %s\n", pfx,
+				printk("%scache error, operation type: %s\n", pfx,
 				       arm_cache_err_op_strs[op_type]);
 			}
-			break;
-		case CPER_ARM_TLB_ERROR:
+		}
+		if (type & CPER_ARM_TLB_ERROR) {
 			if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
-				printk("%soperation type: %s\n", pfx,
+				printk("%sTLB error, operation type: %s\n", pfx,
 				       arm_tlb_err_op_strs[op_type]);
 			}
-			break;
-		case CPER_ARM_BUS_ERROR:
+		}
+		if (type & CPER_ARM_BUS_ERROR) {
 			if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
-				printk("%soperation type: %s\n", pfx,
+				printk("%sbus error, operation type: %s\n", pfx,
 				       arm_bus_err_op_strs[op_type]);
 			}
-			break;
 		}
 	}
 
 	if (error_info & CPER_ARM_ERR_VALID_LEVEL) {
 		level = ((error_info >> CPER_ARM_ERR_LEVEL_SHIFT)
 			 & CPER_ARM_ERR_LEVEL_MASK);
-		switch (type) {
-		case CPER_ARM_CACHE_ERROR:
+		if (type & CPER_ARM_CACHE_ERROR)
 			printk("%scache level: %d\n", pfx, level);
-			break;
-		case CPER_ARM_TLB_ERROR:
+
+		if (type & CPER_ARM_TLB_ERROR)
 			printk("%sTLB level: %d\n", pfx, level);
-			break;
-		case CPER_ARM_BUS_ERROR:
+
+		if (type & CPER_ARM_BUS_ERROR)
 			printk("%saffinity level at which the bus error occurred: %d\n",
 			       pfx, level);
-			break;
-		}
 	}
 
 	if (error_info & CPER_ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
@@ -241,6 +232,7 @@ void cper_print_proc_arm(const char *pfx,
 	struct cper_arm_err_info *err_info;
 	struct cper_arm_ctx_info *ctx_info;
 	char newpfx[64], infopfx[ARRAY_SIZE(newpfx) + 1];
+	char error_type[120];
 
 	printk("%sMIDR: 0x%016llx\n", pfx, proc->midr);
 
@@ -289,9 +281,15 @@ void cper_print_proc_arm(const char *pfx,
 				       newpfx);
 		}
 
-		printk("%serror_type: %d, %s\n", newpfx, err_info->type,
-			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
-			cper_proc_error_type_strs[err_info->type] : "unknown");
+		cper_bits_to_str(error_type, sizeof(error_type),
+				 FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+				 cper_proc_error_type_strs,
+				 ARRAY_SIZE(cper_proc_error_type_strs));
+
+		printk("%serror_type: 0x%02x: %s%s\n", newpfx, err_info->type,
+		       error_type,
+		       (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
+
 		if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) {
 			printk("%serror_info: 0x%016llx\n", newpfx,
 			       err_info->error_info);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 58f40477c824..5b1236d8c65b 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -297,11 +297,11 @@ enum {
 #define CPER_ARM_INFO_FLAGS_PROPAGATED		BIT(2)
 #define CPER_ARM_INFO_FLAGS_OVERFLOW		BIT(3)
 
-#define CPER_ARM_CACHE_ERROR			0
-#define CPER_ARM_TLB_ERROR			1
-#define CPER_ARM_BUS_ERROR			2
-#define CPER_ARM_VENDOR_ERROR			3
-#define CPER_ARM_MAX_TYPE			CPER_ARM_VENDOR_ERROR
+#define CPER_ARM_ERR_TYPE_MASK			GENMASK(4,1)
+#define CPER_ARM_CACHE_ERROR			BIT(1)
+#define CPER_ARM_TLB_ERROR			BIT(2)
+#define CPER_ARM_BUS_ERROR			BIT(3)
+#define CPER_ARM_VENDOR_ERROR			BIT(4)
 
 #define CPER_ARM_ERR_VALID_TRANSACTION_TYPE	BIT(0)
 #define CPER_ARM_ERR_VALID_OPERATION_TYPE	BIT(1)
-- 
cgit v1.2.3


From 93863f3f859a626347ce2ec18947b11357b4ca14 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Thu, 20 Nov 2025 12:14:20 -0800
Subject: kbuild: Check for functions with ambiguous -ffunction-sections
 section names

Commit 9c7dc1dd897a ("objtool: Warn on functions with ambiguous
-ffunction-sections section names") only works for drivers which are
compiled on architectures supported by objtool.

Make a script to perform the same check for all architectures.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/a6a49644a34964f7e02f3a8ce43af03e72817180.1763669451.git.jpoimboe@kernel.org
---
 include/asm-generic/vmlinux.lds.h |  2 +-
 scripts/Makefile.vmlinux_o        |  4 ++++
 scripts/check-function-names.sh   | 25 +++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100755 scripts/check-function-names.sh

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5efe1de2209b..0cdae6f809b5 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -110,7 +110,7 @@
  * .text.startup could be __attribute__((constructor)) code in a *non*
  * ffunction-sections object, which should be placed in .init.text; or it could
  * be an actual function named startup() in an ffunction-sections object, which
- * should be placed in .text.  Objtool will detect and complain about any such
+ * should be placed in .text.  The build will detect and complain about any such
  * ambiguously named functions.
  */
 #define TEXT_MAIN							\
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 20533cc0b1ee..527352c222ff 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -63,11 +63,15 @@ quiet_cmd_ld_vmlinux.o = LD      $@
 	--start-group $(KBUILD_VMLINUX_LIBS) --end-group \
 	$(cmd_objtool)
 
+cmd_check_function_names = $(srctree)/scripts/check-function-names.sh $@
+
 define rule_ld_vmlinux.o
 	$(call cmd_and_savecmd,ld_vmlinux.o)
 	$(call cmd,gen_objtooldep)
+	$(call cmd,check_function_names)
 endef
 
+
 vmlinux.o: $(initcalls-lds) vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE
 	$(call if_changed_rule,ld_vmlinux.o)
 
diff --git a/scripts/check-function-names.sh b/scripts/check-function-names.sh
new file mode 100755
index 000000000000..410042591cfc
--- /dev/null
+++ b/scripts/check-function-names.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Certain function names are disallowed due to section name ambiguities
+# introduced by -ffunction-sections.
+#
+# See the comment above TEXT_MAIN in include/asm-generic/vmlinux.lds.h.
+
+objfile="$1"
+
+if [ ! -f "$objfile" ]; then
+	echo "usage: $0 <file.o>" >&2
+	exit 1
+fi
+# Honour $NM when the build exports it; fall back to plain nm otherwise.
+bad_symbols=$(${NM:-nm} "$objfile" | awk '$2 ~ /^[TtWw]$/ {print $3}' | grep -E '^(startup|exit|split|unlikely|hot|unknown)(\.|$)')
+
+if [ -n "$bad_symbols" ]; then
+	echo "$bad_symbols" | while read -r sym; do
+		echo "$objfile: error: $sym() function name creates ambiguity with -ffunction-sections" >&2
+	done
+	exit 1
+fi
+
+exit 0
-- 
cgit v1.2.3


From 8e8678e740ecde2ae4a0404fd9b4ed2b726e236d Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Tue, 8 Jul 2025 12:57:57 +0000
Subject: KVM: s390: Add capability that forwards operation exceptions

Setting KVM_CAP_S390_USER_OPEREXEC will forward all operation
exceptions to user space. This also includes the 0x0000 instructions
managed by KVM_CAP_S390_USER_INSTR0. It's helpful if user space wants
to emulate instructions which do not (yet) have an opcode.

While we're at it refine the documentation for
KVM_CAP_S390_USER_INSTR0.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst                   |  17 ++-
 arch/s390/include/asm/kvm_host.h                 |   1 +
 arch/s390/kvm/intercept.c                        |   3 +
 arch/s390/kvm/kvm-s390.c                         |   7 ++
 include/uapi/linux/kvm.h                         |   1 +
 tools/testing/selftests/kvm/Makefile.kvm         |   1 +
 tools/testing/selftests/kvm/s390/user_operexec.c | 140 +++++++++++++++++++++++
 7 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/kvm/s390/user_operexec.c

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 72b2fae99a83..1bc2a84c59ee 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7820,7 +7820,7 @@ where 0xff represents CPUs 0-7 in cluster 0.
 :Architectures: s390
 :Parameters: none
 
-With this capability enabled, all illegal instructions 0x0000 (2 bytes) will
+With this capability enabled, the illegal instruction 0x0000 (2 bytes) will
 be intercepted and forwarded to user space. User space can use this
 mechanism e.g. to realize 2-byte software breakpoints. The kernel will
 not inject an operating exception for these instructions, user space has
@@ -8703,6 +8703,21 @@ This capability indicate to the userspace whether a PFNMAP memory region
 can be safely mapped as cacheable. This relies on the presence of
 force write back (FWB) feature support on the hardware.
 
+7.45 KVM_CAP_S390_USER_OPEREXEC
+-------------------------------
+
+:Architectures: s390
+:Parameters: none
+
+When this capability is enabled KVM forwards all operation exceptions
+that it doesn't handle itself to user space. This also includes the
+0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. This is
+helpful if user space wants to emulate instructions which are not
+(yet) implemented in hardware.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 22cedcaea475..1e4829c70216 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -648,6 +648,7 @@ struct kvm_arch {
 	int user_sigp;
 	int user_stsi;
 	int user_instr0;
+	int user_operexec;
 	struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
 	wait_queue_head_t ipte_wq;
 	int ipte_lock_count;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index c7908950c1f4..420ae62977e2 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -471,6 +471,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.sie_block->ipa == 0xb256)
 		return handle_sthyi(vcpu);
 
+	if (vcpu->kvm->arch.user_operexec)
+		return -EOPNOTSUPP;
+
 	if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
 		return -EOPNOTSUPP;
 	rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 70ebc54b1bb1..56d4730b7c41 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -606,6 +606,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_S390_DIAG318:
 	case KVM_CAP_IRQFD_RESAMPLE:
+	case KVM_CAP_S390_USER_OPEREXEC:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -921,6 +922,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
 			 r ? "(not available)" : "(success)");
 		break;
+	case KVM_CAP_S390_USER_OPEREXEC:
+		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC");
+		kvm->arch.user_operexec = 1;
+		icpt_operexc_on_all_vcpus(kvm);
+		r = 0;
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab020..8ab07396ce3b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -963,6 +963,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_RISCV_MP_STATE_RESET 242
 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
 #define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_S390_USER_OPEREXEC 245
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 148d427ff24b..87e429206bb8 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -194,6 +194,7 @@ TEST_GEN_PROGS_s390 += s390/debug_test
 TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test
 TEST_GEN_PROGS_s390 += s390/shared_zeropage_test
 TEST_GEN_PROGS_s390 += s390/ucontrol_test
+TEST_GEN_PROGS_s390 += s390/user_operexec
 TEST_GEN_PROGS_s390 += rseq_test
 
 TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON)
diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c
new file mode 100644
index 000000000000..714906c1d12a
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390/user_operexec.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Test operation exception forwarding.
+ *
+ * Copyright IBM Corp. 2025
+ *
+ * Authors:
+ *  Janosch Frank <frankja@linux.ibm.com>
+ */
+#include "kselftest.h"
+#include "kvm_util.h"
+#include "test_util.h"
+#include "sie.h"
+
+#include <linux/kvm.h>
+
+static void guest_code_instr0(void)
+{
+	asm(".word 0x0000");
+}
+
+static void test_user_instr0(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int rc;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0);
+	TEST_ASSERT_EQ(0, rc);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0);
+
+	kvm_vm_free(vm);
+}
+
+static void guest_code_user_operexec(void)
+{
+	asm(".word 0x0807");
+}
+
+static void test_user_operexec(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int rc;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0);
+	TEST_ASSERT_EQ(0, rc);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807);
+
+	kvm_vm_free(vm);
+
+	/*
+	 * Since user_operexec is the superset it can be used for the
+	 * 0 instruction.
+	 */
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0);
+	TEST_ASSERT_EQ(0, rc);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0);
+
+	kvm_vm_free(vm);
+}
+
+/* combine user_instr0 and user_operexec */
+static void test_user_operexec_combined(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int rc;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0);
+	TEST_ASSERT_EQ(0, rc);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0);
+	TEST_ASSERT_EQ(0, rc);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807);
+
+	kvm_vm_free(vm);
+
+	/* Reverse enablement order */
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0);
+	TEST_ASSERT_EQ(0, rc);
+	rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0);
+	TEST_ASSERT_EQ(0, rc);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC);
+	TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807);
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Run all tests above.
+ *
+ * Enablement after VCPU has been added is automatically tested since
+ * we enable the capability after VCPU creation.
+ */
+static struct testdef {
+	const char *name;
+	void (*test)(void);
+} testlist[] = {
+	{ "instr0", test_user_instr0 },
+	{ "operexec", test_user_operexec },
+	{ "operexec_combined", test_user_operexec_combined},
+};
+
+int main(int argc, char *argv[])
+{
+	int idx;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_USER_INSTR0));
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(testlist));
+	for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) {
+		testlist[idx].test();
+		ksft_test_result_pass("%s\n", testlist[idx].name);
+	}
+	ksft_finished();
+}
-- 
cgit v1.2.3


From d292dbb5640c5b73b5ad889ae31fe889a2bf3137 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Jun 2025 14:59:32 +0200
Subject: bug: Add BUG_FORMAT infrastructure

Add BUG_FORMAT; an architecture opt-in feature that allows adding the
WARN_printf() format string to the bug_entry table.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.223371452@infradead.org
---
 include/asm-generic/bug.h |  7 +++++++
 lib/bug.c                 | 39 ++++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 2d9f61346dab..c7a1407b8669 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -42,6 +42,13 @@ struct bug_entry {
 #else
 	signed int	bug_addr_disp;
 #endif
+#ifdef HAVE_ARCH_BUG_FORMAT
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+	const char	*format;
+#else
+	signed int	format_disp;
+#endif
+#endif
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
 	const char	*file;
diff --git a/lib/bug.c b/lib/bug.c
index b1f07459c2ee..6e57199ed1f7 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -139,6 +139,19 @@ void bug_get_file_line(struct bug_entry *bug, const char **file,
 #endif
 }
 
+static const char *bug_get_format(struct bug_entry *bug)
+{
+	const char *format = NULL;
+#ifdef HAVE_ARCH_BUG_FORMAT
+#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+	format = (const char *)&bug->format_disp + bug->format_disp;
+#else
+	format = bug->format;
+#endif
+#endif
+	return format;
+}
+
 struct bug_entry *find_bug(unsigned long bugaddr)
 {
 	struct bug_entry *bug;
@@ -150,11 +163,19 @@ struct bug_entry *find_bug(unsigned long bugaddr)
 	return module_find_bug(bugaddr);
 }
 
+static void __warn_printf(const char *fmt)
+{
+	if (!fmt)
+		return;
+
+	printk("%s", fmt);
+}
+
 static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
-	struct bug_entry *bug;
-	const char *file;
-	unsigned line, warning, once, done;
+	bool warning, once, done, no_cut, has_args;
+	const char *file, *fmt;
+	unsigned line;
 
 	if (!is_valid_bugaddr(bugaddr))
 		return BUG_TRAP_TYPE_NONE;
@@ -166,10 +187,12 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
 	disable_trace_on_warning();
 
 	bug_get_file_line(bug, &file, &line);
+	fmt = bug_get_format(bug);
 
-	warning = (bug->flags & BUGFLAG_WARNING) != 0;
-	once = (bug->flags & BUGFLAG_ONCE) != 0;
-	done = (bug->flags & BUGFLAG_DONE) != 0;
+	warning  = bug->flags & BUGFLAG_WARNING;
+	once     = bug->flags & BUGFLAG_ONCE;
+	done     = bug->flags & BUGFLAG_DONE;
+	no_cut   = bug->flags & BUGFLAG_NO_CUT_HERE;
 
 	if (warning && once) {
 		if (done)
@@ -187,8 +210,10 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
 	 * "cut here" line now. WARN() issues its own "cut here" before the
 	 * extra debugging message it writes before triggering the handler.
 	 */
-	if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
+	if (!no_cut) {
 		printk(KERN_DEFAULT CUT_HERE);
+		__warn_printf(fmt);
+	}
 
 	if (warning) {
 		/* this is a WARN_ON rather than BUG/BUG_ON */
-- 
cgit v1.2.3


From 30b82568b04e279d0d99482db036f1bdfecac522 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Jun 2025 15:01:38 +0200
Subject: bug: Clean up CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Three repeated CONFIG_GENERIC_BUG_RELATIVE_POINTERS #ifdefs right
after one another yield unreadable code. Add a helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.341703850@infradead.org
---
 include/asm-generic/bug.h | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index c7a1407b8669..9ee622ae0c9a 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -35,26 +35,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 #ifdef CONFIG_BUG
 
-#ifdef CONFIG_GENERIC_BUG
-struct bug_entry {
 #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-	unsigned long	bug_addr;
+#define BUG_REL(type, name) type name
 #else
-	signed int	bug_addr_disp;
+#define BUG_REL(type, name) signed int name##_disp
 #endif
+
+#ifdef CONFIG_GENERIC_BUG
+struct bug_entry {
+	BUG_REL(unsigned long, bug_addr);
 #ifdef HAVE_ARCH_BUG_FORMAT
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-	const char	*format;
-#else
-	signed int	format_disp;
-#endif
+	BUG_REL(const char *, format);
 #endif
 #ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-	const char	*file;
-#else
-	signed int	file_disp;
-#endif
+	BUG_REL(const char *, file);
 	unsigned short	line;
 #endif
 	unsigned short	flags;
-- 
cgit v1.2.3


From 5c47b7f3d1a9d7589026a201abb8ad445f029246 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 7 Jun 2025 10:51:24 +0200
Subject: bug: Add BUG_FORMAT_ARGS infrastructure

Add BUG_FORMAT_ARGS; when an architecture is able to provide a va_list
given pt_regs, use this to print format arguments.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.457339417@infradead.org
---
 include/asm-generic/bug.h |  1 +
 lib/bug.c                 | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 9ee622ae0c9a..228873e13b95 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -13,6 +13,7 @@
 #define BUGFLAG_ONCE		(1 << 1)
 #define BUGFLAG_DONE		(1 << 2)
 #define BUGFLAG_NO_CUT_HERE	(1 << 3)	/* CUT_HERE already sent */
+#define BUGFLAG_ARGS		(1 << 4)
 #define BUGFLAG_TAINT(taint)	((taint) << 8)
 #define BUG_GET_TAINT(bug)	((bug)->flags >> 8)
 #endif
diff --git a/lib/bug.c b/lib/bug.c
index 6e57199ed1f7..8100258a2004 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -163,11 +163,23 @@ struct bug_entry *find_bug(unsigned long bugaddr)
 	return module_find_bug(bugaddr);
 }
 
-static void __warn_printf(const char *fmt)
+static void __warn_printf(const char *fmt, struct pt_regs *regs)
 {
 	if (!fmt)
 		return;
 
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+	if (regs) {
+		struct arch_va_list _args;
+		va_list *args = __warn_args(&_args, regs);
+
+		if (args) {
+			vprintk(fmt, *args);
+			return;
+		}
+	}
+#endif
+
 	printk("%s", fmt);
 }
 
@@ -193,6 +205,7 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
 	once     = bug->flags & BUGFLAG_ONCE;
 	done     = bug->flags & BUGFLAG_DONE;
 	no_cut   = bug->flags & BUGFLAG_NO_CUT_HERE;
+	has_args = bug->flags & BUGFLAG_ARGS;
 
 	if (warning && once) {
 		if (done)
@@ -212,7 +225,7 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
 	 */
 	if (!no_cut) {
 		printk(KERN_DEFAULT CUT_HERE);
-		__warn_printf(fmt);
+		__warn_printf(fmt, has_args ? regs : NULL);
 	}
 
 	if (warning) {
-- 
cgit v1.2.3


From 7d2c27a0ec5ecec980b623ded45758918c00b164 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 7 Jun 2025 10:52:28 +0200
Subject: bug: Add report_bug_entry()

Add a report_bug() variant where the bug_entry is already known. This
is useful when the exception instruction is not instantiated per-site,
but instead has a single instance. In such a case the bug_entry
address might be passed along in a known register or something.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.575795595@infradead.org
---
 include/linux/bug.h |  8 ++++++++
 lib/bug.c           | 28 +++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index a9948a9f1093..17a4933c611b 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -42,6 +42,7 @@ void bug_get_file_line(struct bug_entry *bug, const char **file,
 struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
+enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs);
 
 /* These are defined by the architecture */
 int is_valid_bugaddr(unsigned long addr);
@@ -62,6 +63,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 }
 
 struct bug_entry;
+
+static inline enum bug_trap_type
+report_bug_entry(struct bug_entry *bug, struct pt_regs *regs)
+{
+	return BUG_TRAP_TYPE_BUG;
+}
+
 static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
 				     unsigned int *line)
 {
diff --git a/lib/bug.c b/lib/bug.c
index 8100258a2004..581a66b88c5c 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -183,18 +183,20 @@ static void __warn_printf(const char *fmt, struct pt_regs *regs)
 	printk("%s", fmt);
 }
 
-static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs)
 {
 	bool warning, once, done, no_cut, has_args;
 	const char *file, *fmt;
 	unsigned line;
 
-	if (!is_valid_bugaddr(bugaddr))
-		return BUG_TRAP_TYPE_NONE;
+	if (!bug) {
+		if (!is_valid_bugaddr(bugaddr))
+			return BUG_TRAP_TYPE_NONE;
 
-	bug = find_bug(bugaddr);
-	if (!bug)
-		return BUG_TRAP_TYPE_NONE;
+		bug = find_bug(bugaddr);
+		if (!bug)
+			return BUG_TRAP_TYPE_NONE;
+	}
 
 	disable_trace_on_warning();
 
@@ -244,13 +246,25 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
 	return BUG_TRAP_TYPE_BUG;
 }
 
+enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs)
+{
+	enum bug_trap_type ret;
+	bool rcu = false;
+
+	rcu = warn_rcu_enter();
+	ret = __report_bug(bug, 0, regs);
+	warn_rcu_exit(rcu);
+
+	return ret;
+}
+
 enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	enum bug_trap_type ret;
 	bool rcu = false;
 
 	rcu = warn_rcu_enter();
-	ret = __report_bug(bugaddr, regs);
+	ret = __report_bug(NULL, bugaddr, regs);
 	warn_rcu_exit(rcu);
 
 	return ret;
-- 
cgit v1.2.3


From 3fd45b871fde00f4fac96318a136bd256ec0b90b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 10 Nov 2025 11:58:37 +0100
Subject: bug: Implement WARN_ON() using __WARN_FLAGS()

This completes 3bc3c9c3ab6d ("bugs/core: Pass down the condition
string of WARN_ON_ONCE(cond) warnings to __WARN_FLAGS()") and makes
WARN_ON() and WARN_ON_ONCE() behaviour consistent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.690999560@infradead.org
---
 include/asm-generic/bug.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 228873e13b95..4bfbeae30c42 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -109,21 +109,35 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 	} while (0)
 #else
 #define __WARN()		__WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN))
+
 #define __WARN_printf(taint, arg...) do {				\
 		instrumentation_begin();				\
 		__warn_printk(arg);					\
 		__WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
 		instrumentation_end();					\
 	} while (0)
-#define WARN_ON_ONCE(condition) ({				\
-	int __ret_warn_on = !!(condition);			\
-	if (unlikely(__ret_warn_on))				\
-		__WARN_FLAGS("["#condition"] ",			\
-			     BUGFLAG_ONCE |			\
-			     BUGFLAG_TAINT(TAINT_WARN));	\
-	unlikely(__ret_warn_on);				\
+
+#ifndef WARN_ON
+#define WARN_ON(condition) ({						\
+	int __ret_warn_on = !!(condition);				\
+	if (unlikely(__ret_warn_on))					\
+		__WARN_FLAGS("["#condition"] ",				\
+			     BUGFLAG_TAINT(TAINT_WARN));		\
+	unlikely(__ret_warn_on);					\
+})
+#endif
+
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(condition) ({					\
+	int __ret_warn_on = !!(condition);				\
+	if (unlikely(__ret_warn_on))					\
+		__WARN_FLAGS("["#condition"] ",				\
+			     BUGFLAG_ONCE |				\
+			     BUGFLAG_TAINT(TAINT_WARN));		\
+	unlikely(__ret_warn_on);					\
 })
 #endif
+#endif /* __WARN_FLAGS */
 
 /* used internally by panic.c */
 
-- 
cgit v1.2.3


From b9b2c455f462b67954bee5f17c3d68355d37586f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Jun 2025 15:08:36 +0200
Subject: bug: Allow architectures to provide __WARN_printf()

In addition to providing __WARN_FLAGS(), allow an architecture to also
provide __WARN_printf().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.807154591@infradead.org
---
 include/asm-generic/bug.h | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 4bfbeae30c42..21d2c8f88d49 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -100,23 +100,9 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
 		       const char *fmt, ...);
 extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 
-#ifndef __WARN_FLAGS
-#define __WARN()		__WARN_printf(TAINT_WARN, NULL)
-#define __WARN_printf(taint, arg...) do {				\
-		instrumentation_begin();				\
-		warn_slowpath_fmt(__FILE__, __LINE__, taint, arg);	\
-		instrumentation_end();					\
-	} while (0)
-#else
+#ifdef __WARN_FLAGS
 #define __WARN()		__WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN))
 
-#define __WARN_printf(taint, arg...) do {				\
-		instrumentation_begin();				\
-		__warn_printk(arg);					\
-		__WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
-		instrumentation_end();					\
-	} while (0)
-
 #ifndef WARN_ON
 #define WARN_ON(condition) ({						\
 	int __ret_warn_on = !!(condition);				\
@@ -139,6 +125,27 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #endif
 #endif /* __WARN_FLAGS */
 
+#if defined(__WARN_FLAGS) && !defined(__WARN_printf)
+#define __WARN_printf(taint, arg...) do {				\
+		instrumentation_begin();				\
+		__warn_printk(arg);					\
+		__WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+		instrumentation_end();					\
+	} while (0)
+#endif
+
+#ifndef __WARN_printf
+#define __WARN_printf(taint, arg...) do {				\
+		instrumentation_begin();				\
+		warn_slowpath_fmt(__FILE__, __LINE__, taint, arg);	\
+		instrumentation_end();					\
+	} while (0)
+#endif
+
+#ifndef __WARN
+#define __WARN()		__WARN_printf(TAINT_WARN, NULL)
+#endif
+
 /* used internally by panic.c */
 
 #ifndef WARN_ON
-- 
cgit v1.2.3


From a67df6d1b939ca98e1ad403f53e3ee57299b8c44 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Tue, 11 Nov 2025 14:46:10 +0100
Subject: uapi: cdc.h: cleanly provide for more interfaces and countries

The spec requires at least one slave interface or country code,
respectively. It allows multiple ones. This needs to be clearly stated
in the UAPI. This is subject to sanity checking in
cdc_parse_cdc_header(), thus we can trust the length.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Link: https://patch.msgid.link/20251111134641.4118827-1-oneukum@suse.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/cdc-acm.c  |  2 +-
 include/uapi/linux/usb/cdc.h | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 73f9476774ae..54be4aa1dcb2 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1475,7 +1475,7 @@ made_compressed_probe:
 		if (!acm->country_codes)
 			goto skip_countries;
 		acm->country_code_size = cfd->bLength - 4;
-		memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0,
+		memcpy(acm->country_codes, cfd->wCountryCodes,
 							cfd->bLength - 4);
 		acm->country_rel_date = cfd->iCountryCodeRelDate;
 
diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h
index 1924cf665448..7bd5d12d8b26 100644
--- a/include/uapi/linux/usb/cdc.h
+++ b/include/uapi/linux/usb/cdc.h
@@ -104,8 +104,10 @@ struct usb_cdc_union_desc {
 	__u8	bDescriptorSubType;
 
 	__u8	bMasterInterface0;
-	__u8	bSlaveInterface0;
-	/* ... and there could be other slave interfaces */
+	union {
+		__u8	bSlaveInterface0;
+		__DECLARE_FLEX_ARRAY(__u8, bSlaveInterfaces);
+	};
 } __attribute__ ((packed));
 
 /* "Country Selection Functional Descriptor" from CDC spec 5.2.3.9 */
@@ -115,8 +117,10 @@ struct usb_cdc_country_functional_desc {
 	__u8	bDescriptorSubType;
 
 	__u8	iCountryCodeRelDate;
-	__le16	wCountyCode0;
-	/* ... and there can be a lot of country codes */
+	union {
+		__le16	wCountryCode0;
+		__DECLARE_FLEX_ARRAY(__le16, wCountryCodes);
+	};
 } __attribute__ ((packed));
 
 /* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */
-- 
cgit v1.2.3


From a75a5b148b4e1d7c0525359be455d5a54024b714 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 14 Nov 2025 19:37:55 +0100
Subject: usb: ohci-da8xx: remove unused platform data

We no longer support any board files for DaVinci in mainline and so
struct da8xx_ohci_root_hub is no longer used. Remove it together with
all the code it's used for.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://patch.msgid.link/20251114-davinci-usb-v1-1-737380353a74@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ohci-da8xx.c             | 17 -----------------
 include/linux/platform_data/usb-davinci.h | 22 ----------------------
 2 files changed, 39 deletions(-)
 delete mode 100644 include/linux/platform_data/usb-davinci.h

(limited to 'include')

diff --git a/drivers/usb/host/ohci-da8xx.c b/drivers/usb/host/ohci-da8xx.c
index 3c5ca2d7c92e..0938c0e7a8b6 100644
--- a/drivers/usb/host/ohci-da8xx.c
+++ b/drivers/usb/host/ohci-da8xx.c
@@ -18,7 +18,6 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/phy/phy.h>
-#include <linux/platform_data/usb-davinci.h>
 #include <linux/regulator/consumer.h>
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
@@ -166,17 +165,6 @@ static int ohci_da8xx_has_oci(struct usb_hcd *hcd)
 	return 0;
 }
 
-static int ohci_da8xx_has_potpgt(struct usb_hcd *hcd)
-{
-	struct device *dev		= hcd->self.controller;
-	struct da8xx_ohci_root_hub *hub	= dev_get_platdata(dev);
-
-	if (hub && hub->potpgt)
-		return 1;
-
-	return 0;
-}
-
 static int ohci_da8xx_regulator_event(struct notifier_block *nb,
 				unsigned long event, void *data)
 {
@@ -228,7 +216,6 @@ static int ohci_da8xx_register_notify(struct usb_hcd *hcd)
 static int ohci_da8xx_reset(struct usb_hcd *hcd)
 {
 	struct device *dev		= hcd->self.controller;
-	struct da8xx_ohci_root_hub *hub	= dev_get_platdata(dev);
 	struct ohci_hcd	*ohci		= hcd_to_ohci(hcd);
 	int result;
 	u32 rh_a;
@@ -266,10 +253,6 @@ static int ohci_da8xx_reset(struct usb_hcd *hcd)
 		rh_a &= ~RH_A_NOCP;
 		rh_a |=  RH_A_OCPM;
 	}
-	if (ohci_da8xx_has_potpgt(hcd)) {
-		rh_a &= ~RH_A_POTPGT;
-		rh_a |= hub->potpgt << 24;
-	}
 	ohci_writel(ohci, rh_a, &ohci->regs->roothub.a);
 
 	return result;
diff --git a/include/linux/platform_data/usb-davinci.h b/include/linux/platform_data/usb-davinci.h
deleted file mode 100644
index 879f5c78b91a..000000000000
--- a/include/linux/platform_data/usb-davinci.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * USB related definitions
- *
- * Copyright (C) 2009 MontaVista Software, Inc. <source@mvista.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-
-#ifndef __ASM_ARCH_USB_H
-#define __ASM_ARCH_USB_H
-
-/* Passed as the platform data to the OHCI driver */
-struct	da8xx_ohci_root_hub {
-	/* Time from power on to power good (in 2 ms units) */
-	u8	potpgt;
-};
-
-void davinci_setup_usb(unsigned mA, unsigned potpgt_ms);
-
-#endif	/* ifndef __ASM_ARCH_USB_H */
-- 
cgit v1.2.3


From c460697d3472d4252917fba9bbc1d1a23eafc124 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Mon, 17 Nov 2025 10:47:56 +0000
Subject: lib: Support ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION

ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION provides the mechanism for
invalidating certain memory regions in a cache-incoherent manner. Currently
this is used by NVDIMM and CXL memory drivers in cases where it is
necessary to flush all data from caches by physical address range.
The operations in question are effectively memory hotplug, where stale
data might otherwise remain in the caches.

This is separate from the invalidates done to enable use of non-coherent
DMA masters, primarily in terms of when it is needed (not related to DMA
mappings) and how deep the flush must push data. The flushes done for
non-coherent DMA only need to reach the Point of Coherence of a single host
(which is often nearer CPUs and DMA masters than the physical storage).
This operation must push the data out of non architectural caches
(memory-side caches, write buffers etc) and typically all the way to the
memory device.

In some architectures these operations are supported by system components
that may become available only later in boot as they are either present
on a discoverable bus, or via a firmware description of an MMIO interface
(e.g. ACPI DSDT). Provide a framework to handle this case.

Architectures can opt in for this support via
CONFIG_GENERIC_CPU_CACHE_MAINTENANCE

Add a registration framework. Each driver provides an ops structure and
the first op is Write Back and Invalidate by PA Range. The driver may
over invalidate.

For systems that can perform this operation asynchronously, an optional
completion check operation is also provided. If present, it must be called
to ensure that the action has finished. This provides a considerable
performance advantage if multiple agents are involved in the maintenance
operation.

When multiple agents are present in the system, each should register with
this framework and the core code will issue the invalidate to all of them
before checking for completion on each. This is done to avoid the need for
filtering in the core code, which can become complex when interleaving,
potentially across different cache coherency hardware, is going on. It
is easier to tell everyone and let those who don't care do nothing.

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
---
 include/linux/cache_coherency.h |  61 ++++++++++++++++++
 lib/Kconfig                     |   3 +
 lib/Makefile                    |   2 +
 lib/cache_maint.c               | 138 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 204 insertions(+)
 create mode 100644 include/linux/cache_coherency.h
 create mode 100644 lib/cache_maint.c

(limited to 'include')

diff --git a/include/linux/cache_coherency.h b/include/linux/cache_coherency.h
new file mode 100644
index 000000000000..cc81c5733e31
--- /dev/null
+++ b/include/linux/cache_coherency.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cache coherency maintenance operation device drivers
+ *
+ * Copyright Huawei 2025
+ */
+#ifndef _LINUX_CACHE_COHERENCY_H_
+#define _LINUX_CACHE_COHERENCY_H_
+
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/types.h>
+
+struct cc_inval_params {
+	phys_addr_t addr;
+	size_t size;
+};
+
+struct cache_coherency_ops_inst;
+
+struct cache_coherency_ops {
+	int (*wbinv)(struct cache_coherency_ops_inst *cci,
+		     struct cc_inval_params *invp);
+	int (*done)(struct cache_coherency_ops_inst *cci);
+};
+
+struct cache_coherency_ops_inst {
+	struct kref kref;
+	struct list_head node;
+	const struct cache_coherency_ops *ops;
+};
+
+int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci);
+void cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci);
+
+struct cache_coherency_ops_inst *
+_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops,
+				    size_t size);
+/**
+ * cache_coherency_ops_instance_alloc - Allocate cache coherency ops instance
+ * @ops: Cache maintenance operations
+ * @drv_struct: structure that contains the struct cache_coherency_ops_inst
+ * @member: Name of the struct cache_coherency_ops_inst member in @drv_struct.
+ *
+ * This allocates a driver specific structure and initializes the
+ * cache_coherency_ops_inst embedded in the drv_struct. Upon success the
+ * pointer must be freed via cache_coherency_ops_instance_put().
+ *
+ * Returns a &drv_struct * on success, %NULL on error.
+ */
+#define cache_coherency_ops_instance_alloc(ops, drv_struct, member)	    \
+	({								    \
+		static_assert(__same_type(struct cache_coherency_ops_inst,  \
+					  ((drv_struct *)NULL)->member));   \
+		static_assert(offsetof(drv_struct, member) == 0);	    \
+		(drv_struct *)_cache_coherency_ops_instance_alloc(ops,	    \
+			sizeof(drv_struct));				    \
+	})
+void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index c483951b624f..cd8e5844f9bb 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -543,6 +543,9 @@ config MEMREGION
 config ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
 	bool
 
+config GENERIC_CPU_CACHE_MAINTENANCE
+	bool
+
 config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
 	bool
 
diff --git a/lib/Makefile b/lib/Makefile
index 392ff808c9b9..eed20c50f358 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -130,6 +130,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
+obj-$(CONFIG_GENERIC_CPU_CACHE_MAINTENANCE) += cache_maint.o
+
 lib-y += logic_pio.o
 
 lib-$(CONFIG_INDIRECT_IOMEM) += logic_iomem.o
diff --git a/lib/cache_maint.c b/lib/cache_maint.c
new file mode 100644
index 000000000000..9256a9ffc34c
--- /dev/null
+++ b/lib/cache_maint.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic support for Memory System Cache Maintenance operations.
+ *
+ * Coherency maintenance drivers register with this simple framework that will
+ * iterate over each registered instance to first kick off invalidation and
+ * then to wait until it is complete.
+ *
+ * If no implementations are registered yet cpu_cache_has_invalidate_memregion()
+ * will return false. If this runs concurrently with unregistration then a
+ * race exists but this is no worse than the case where the operations instance
+ * responsible for a given memory region has not yet registered.
+ */
+#include <linux/cache_coherency.h>
+#include <linux/cleanup.h>
+#include <linux/container_of.h>
+#include <linux/export.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/memregion.h>
+#include <linux/module.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+
+static LIST_HEAD(cache_ops_instance_list);
+static DECLARE_RWSEM(cache_ops_instance_list_lock);
+
+static void __cache_coherency_ops_instance_free(struct kref *kref)
+{
+	struct cache_coherency_ops_inst *cci =
+		container_of(kref, struct cache_coherency_ops_inst, kref);
+	kfree(cci);
+}
+
+void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci)
+{
+	kref_put(&cci->kref, __cache_coherency_ops_instance_free);
+}
+EXPORT_SYMBOL_GPL(cache_coherency_ops_instance_put);
+
+static int cache_inval_one(struct cache_coherency_ops_inst *cci, void *data)
+{
+	if (!cci->ops)
+		return -EINVAL;
+
+	return cci->ops->wbinv(cci, data);
+}
+
+static int cache_inval_done_one(struct cache_coherency_ops_inst *cci)
+{
+	if (!cci->ops)
+		return -EINVAL;
+
+	if (!cci->ops->done)
+		return 0;
+
+	return cci->ops->done(cci);
+}
+
+static int cache_invalidate_memregion(phys_addr_t addr, size_t size)
+{
+	int ret;
+	struct cache_coherency_ops_inst *cci;
+	struct cc_inval_params params = {
+		.addr = addr,
+		.size = size,
+	};
+
+	guard(rwsem_read)(&cache_ops_instance_list_lock);
+	list_for_each_entry(cci, &cache_ops_instance_list, node) {
+		ret = cache_inval_one(cci, &params);
+		if (ret)
+			return ret;
+	}
+	list_for_each_entry(cci, &cache_ops_instance_list, node) {
+		ret = cache_inval_done_one(cci);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+struct cache_coherency_ops_inst *
+_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops,
+				    size_t size)
+{
+	struct cache_coherency_ops_inst *cci;
+
+	if (!ops || !ops->wbinv)
+		return NULL;
+
+	cci = kzalloc(size, GFP_KERNEL);
+	if (!cci)
+		return NULL;
+
+	cci->ops = ops;
+	INIT_LIST_HEAD(&cci->node);
+	kref_init(&cci->kref);
+
+	return cci;
+}
+EXPORT_SYMBOL_NS_GPL(_cache_coherency_ops_instance_alloc, "CACHE_COHERENCY");
+
+int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci)
+{
+	guard(rwsem_write)(&cache_ops_instance_list_lock);
+	list_add(&cci->node, &cache_ops_instance_list);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cache_coherency_ops_instance_register, "CACHE_COHERENCY");
+
+void cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci)
+{
+	guard(rwsem_write)(&cache_ops_instance_list_lock);
+	list_del(&cci->node);
+}
+EXPORT_SYMBOL_NS_GPL(cache_coherency_ops_instance_unregister, "CACHE_COHERENCY");
+
+int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len)
+{
+	return cache_invalidate_memregion(start, len);
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
+
+/*
+ * Used for optimization / debug purposes only as removal can race
+ *
+ * Machines that do not support invalidation, e.g. VMs, will not have any
+ * operations instance to register and so this will always return false.
+ */
+bool cpu_cache_has_invalidate_memregion(void)
+{
+	guard(rwsem_read)(&cache_ops_instance_list_lock);
+	return !list_empty(&cache_ops_instance_list);
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
-- 
cgit v1.2.3


From 1d6c915819f5b805c35487b6ce5923e31a28266b Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Thu, 20 Nov 2025 16:05:38 -0800
Subject: powercap: intel_rapl: Prepare read_raw() interface for atomic-context
 callers

The current read_raw() implementation of the TPMI, MMIO and MSR
interfaces does not distinguish between atomic and non-atomic callers.

rapl_msr_read_raw() uses rdmsrq_safe_on_cpu(), which can sleep and
issue cross CPU calls. When MSR-based RAPL PMU support is enabled, PMU
event handlers can invoke this function from atomic context where
sleeping or rescheduling is not allowed. In atomic context, the caller
is already executing on the target CPU, so a direct rdmsrq() is
sufficient.

To support such usage, introduce an atomic flag to the read_raw()
interface to allow callers to pass the context information. Modify the
common RAPL code to propagate this flag, and set the flag to reflect
the calling contexts.

Utilize the atomic flag in rapl_msr_read_raw() to perform a direct MSR
read with rdmsrq() when running in atomic context, and add a sanity check
to ensure the target CPU matches the current CPU for such use cases.

The TPMI and MMIO implementations do not require special atomic
handling, so the flag is ignored in those paths.

This is a preparatory patch for adding MSR-based RAPL PMU support.

Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Subject tweak ]
Link: https://patch.msgid.link/20251121000539.386069-2-sathyanarayanan.kuppuswamy@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c               | 24 ++++++++++++----------
 drivers/powercap/intel_rapl_msr.c                  | 16 ++++++++++++++-
 drivers/powercap/intel_rapl_tpmi.c                 |  2 +-
 .../intel/int340x_thermal/processor_thermal_rapl.c |  2 +-
 include/linux/intel_rapl.h                         |  2 +-
 5 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 57bebd07c7d0..47ec34d4c099 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -253,7 +253,8 @@ struct rapl_primitive_info {
 static void rapl_init_domains(struct rapl_package *rp);
 static int rapl_read_data_raw(struct rapl_domain *rd,
 			      enum rapl_primitives prim,
-			      bool xlate, u64 *data);
+			      bool xlate, u64 *data,
+			      bool atomic);
 static int rapl_write_data_raw(struct rapl_domain *rd,
 			       enum rapl_primitives prim,
 			       unsigned long long value);
@@ -289,7 +290,7 @@ static int get_energy_counter(struct powercap_zone *power_zone,
 	cpus_read_lock();
 	rd = power_zone_to_rapl_domain(power_zone);
 
-	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
+	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) {
 		*energy_raw = energy_now;
 		cpus_read_unlock();
 
@@ -830,7 +831,8 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
  * 63-------------------------- 31--------------------------- 0
  */
 static int rapl_read_data_raw(struct rapl_domain *rd,
-			      enum rapl_primitives prim, bool xlate, u64 *data)
+			      enum rapl_primitives prim, bool xlate, u64 *data,
+			      bool atomic)
 {
 	u64 value;
 	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
@@ -852,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 
 	ra.mask = rpi->mask;
 
-	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
+	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) {
 		pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name);
 		return -EIO;
 	}
@@ -904,7 +906,7 @@ static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
 	if (!is_pl_valid(rd, pl))
 		return -EINVAL;
 
-	return rapl_read_data_raw(rd, prim, xlate, data);
+	return rapl_read_data_raw(rd, prim, xlate, data, false);
 }
 
 static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
@@ -941,7 +943,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd)
 
 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
 	ra.mask = ~0;
-	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
+	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) {
 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
 			ra.reg.val, rd->rp->name, rd->name);
 		return -ENODEV;
@@ -969,7 +971,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd)
 
 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
 	ra.mask = ~0;
-	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
+	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) {
 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
 			ra.reg.val, rd->rp->name, rd->name);
 		return -ENODEV;
@@ -1156,7 +1158,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd)
 
 	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
 	ra.mask = ~0;
-	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) {
+	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) {
 		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
 			ra.reg.val, rd->rp->name, rd->name);
 		return -ENODEV;
@@ -1328,7 +1330,7 @@ static void rapl_update_domain_data(struct rapl_package *rp)
 			struct rapl_primitive_info *rpi = get_rpi(rp, prim);
 
 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
-						rpi->unit, &val))
+						rpi->unit, &val, false))
 				rp->domains[dmn].rdd.primitives[prim] = val;
 		}
 	}
@@ -1428,7 +1430,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp)
 	 */
 
 	ra.mask = ENERGY_STATUS_MASK;
-	if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value)
+	if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value)
 		return -ENODEV;
 
 	return 0;
@@ -1639,7 +1641,7 @@ static u64 event_read_counter(struct perf_event *event)
 	if (event->hw.idx < 0)
 		return 0;
 
-	ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
+	ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true);
 
 	/* Return 0 for failed read */
 	if (ret)
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index c6b9a7debc35..6e3c50af0912 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -102,12 +102,26 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	return 0;
 }
 
-static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic)
 {
+	/*
+	 * When called from atomic-context (eg PMU event handler)
+	 * perform MSR read directly using rdmsrq().
+	 */
+	if (atomic) {
+		if (unlikely(smp_processor_id() != cpu))
+			return -EIO;
+
+		rdmsrq(ra->reg.msr, ra->value);
+		goto out;
+	}
+
 	if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) {
 		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu);
 		return -EIO;
 	}
+
+out:
 	ra->value &= ra->mask;
 	return 0;
 }
diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c
index 82201bf4685d..0a0b85f4528b 100644
--- a/drivers/powercap/intel_rapl_tpmi.c
+++ b/drivers/powercap/intel_rapl_tpmi.c
@@ -60,7 +60,7 @@ static DEFINE_MUTEX(tpmi_rapl_lock);
 
 static struct powercap_control_type *tpmi_control_type;
 
-static int tpmi_rapl_read_raw(int id, struct reg_action *ra)
+static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic)
 {
 	if (!ra->reg.mmio)
 		return -EINVAL;
diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c
index bde2cc386afd..bf51a17c5be6 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c
@@ -19,7 +19,7 @@ static const struct rapl_mmio_regs rapl_mmio_default = {
 	.limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2),
 };
 
-static int rapl_mmio_read_raw(int cpu, struct reg_action *ra)
+static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic)
 {
 	if (!ra->reg.mmio)
 		return -EINVAL;
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index c0397423d3a8..e9ade2ff4af6 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -152,7 +152,7 @@ struct rapl_if_priv {
 	union rapl_reg reg_unit;
 	union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 	int limits[RAPL_DOMAIN_MAX];
-	int (*read_raw)(int id, struct reg_action *ra);
+	int (*read_raw)(int id, struct reg_action *ra, bool atomic);
 	int (*write_raw)(int id, struct reg_action *ra);
 	void *defaults;
 	void *rpi;
-- 
cgit v1.2.3


From 7923ae7698cf9728501974d76d8ea712686281bc Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@amd.com>
Date: Wed, 12 Nov 2025 18:57:29 -0600
Subject: x86,fs/resctrl: Detect io_alloc feature

AMD's SDCIAE (SDCI Allocation Enforcement) PQE feature enables system software
to control the portions of L3 cache used for direct insertion of data from I/O
devices into the L3 cache.

Introduce a generic resctrl cache resource property "io_alloc_capable" as the
first part of the new "io_alloc" resctrl feature that will support AMD's
SDCIAE. Any architecture can set a cache resource as "io_alloc_capable" if
a portion of the cache can be allocated for I/O traffic.

Set the "io_alloc_capable" property for the L3 cache resource on x86 (AMD)
systems that support SDCIAE.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/df85a9a6081674fd3ef6b4170920485512ce2ded.1762995456.git.babu.moger@amd.com
---
 arch/x86/kernel/cpu/resctrl/core.c | 7 +++++++
 include/linux/resctrl.h            | 3 +++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 2b2935b3df8d..3792ab4819dc 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -274,6 +274,11 @@ static void rdt_get_cdp_config(int level)
 	rdt_resources_all[level].r_resctrl.cdp_capable = true;
 }
 
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+	r->cache.io_alloc_capable = true;
+}
+
 static void rdt_get_cdp_l3_config(void)
 {
 	rdt_get_cdp_config(RDT_RESOURCE_L3);
@@ -855,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void)
 		rdt_get_cache_alloc_cfg(1, r);
 		if (rdt_cpu_has(X86_FEATURE_CDP_L3))
 			rdt_get_cdp_l3_config();
+		if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+			rdt_set_io_alloc_capable(r);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index a7d92718b653..533f240dbe21 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -206,6 +206,8 @@ struct rdt_mon_domain {
  * @arch_has_sparse_bitmasks:	True if a bitmask like f00f is valid.
  * @arch_has_per_cpu_cfg:	True if QOS_CFG register for this cache
  *				level has CPU scope.
+ * @io_alloc_capable:	True if portion of the cache can be configured
+ *			for I/O traffic.
  */
 struct resctrl_cache {
 	unsigned int	cbm_len;
@@ -213,6 +215,7 @@ struct resctrl_cache {
 	unsigned int	shareable_bits;
 	bool		arch_has_sparse_bitmasks;
 	bool		arch_has_per_cpu_cfg;
+	bool		io_alloc_capable;
 };
 
 /**
-- 
cgit v1.2.3


From 556d2892aa715286d840a74216c8fff885559261 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@amd.com>
Date: Wed, 12 Nov 2025 18:57:30 -0600
Subject: x86,fs/resctrl: Implement "io_alloc" enable/disable handlers

"io_alloc" is the generic name of the new resctrl feature that enables system
software to configure the portion of cache allocated for I/O traffic. On AMD
systems, "io_alloc" resctrl feature is backed by AMD's L3 Smart Data Cache
Injection Allocation Enforcement (SDCIAE).

Introduce the architecture-specific functions that resctrl fs should call to
enable, disable, or check status of the "io_alloc" feature. Change SDCIAE state
by setting (to enable) or clearing (to disable) bit 1 of
MSR_IA32_L3_QOS_EXT_CFG on all logical processors within the cache domain.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/9e9070100c320eab5368e088a3642443dee95ed7.1762995456.git.babu.moger@amd.com
---
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 40 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/resctrl/internal.h    |  5 ++++
 include/linux/resctrl.h                   | 21 ++++++++++++++++
 3 files changed, 66 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1189c0df4ad7..b20e705606b8 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
 
 	return hw_dom->ctrl_val[idx];
 }
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+	return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+	bool *enable = arg;
+
+	if (*enable)
+		msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+	else
+		msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+	struct rdt_ctrl_domain *d;
+
+	/* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+	lockdep_assert_cpus_held();
+
+	/* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+	list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+		on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+	if (hw_res->r_resctrl.cache.io_alloc_capable &&
+	    hw_res->sdciae_enabled != enable) {
+		_resctrl_sdciae_enable(r, enable);
+		hw_res->sdciae_enabled = enable;
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 9f4c2f0aaf5c..4a916c84a322 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -46,6 +46,9 @@ struct arch_mbm_state {
 #define ABMC_EXTENDED_EVT_ID		BIT(31)
 #define ABMC_EVT_ID			BIT(0)
 
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT		1
+
 /**
  * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
  *			       a resource for a control function
@@ -112,6 +115,7 @@ struct msr_param {
  * @mbm_width:		Monitor width, to detect and correct for overflow.
  * @cdp_enabled:	CDP state of this resource
  * @mbm_cntr_assign_enabled:	ABMC feature is enabled
+ * @sdciae_enabled:	SDCIAE feature (backing "io_alloc") is enabled.
  *
  * Members of this structure are either private to the architecture
  * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
@@ -126,6 +130,7 @@ struct rdt_hw_resource {
 	unsigned int		mbm_width;
 	bool			cdp_enabled;
 	bool			mbm_cntr_assign_enabled;
+	bool			sdciae_enabled;
 };
 
 static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 533f240dbe21..54701668b3df 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -657,6 +657,27 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
 			     u32 closid, u32 rmid, int cntr_id,
 			     enum resctrl_event_id eventid);
 
+/**
+ * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature.
+ * @r:		The resctrl resource.
+ * @enable:	Enable (true) or disable (false) io_alloc on resource @r.
+ *
+ * This can be called from any CPU.
+ *
+ * Return:
+ * 0 on success, <0 on error.
+ */
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable);
+
+/**
+ * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state.
+ * @r:		The resctrl resource.
+ *
+ * Return:
+ * true if io_alloc is enabled or false if disabled.
+ */
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r);
+
 extern unsigned int resctrl_rmid_realloc_threshold;
 extern unsigned int resctrl_rmid_realloc_limit;
 
-- 
cgit v1.2.3


From e40f5a6bf88a781d5f81bc6b8aab9ac31d8c98dd Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 19 Nov 2025 17:03:54 +0100
Subject: bpf: correct stack liveness for tail calls

This updates bpf_insn_successors() to reflect that control flow might
jump over the instructions between a tail call and the function exit.
Without this, the verifier might assume that some writes to the parent
stack always happen, which is not the case.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin Teichmann <martin.teichmann@xfel.eu>
Link: https://lore.kernel.org/r/20251119160355.1160932-4-martin.teichmann@xfel.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  5 +++--
 kernel/bpf/liveness.c        |  7 ++++---
 kernel/bpf/verifier.c        | 29 +++++++++++++++++++++++++++--
 3 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5441341f1ab9..8d0b60fa5f2b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -527,7 +527,6 @@ struct bpf_insn_aux_data {
 		struct {
 			u32 map_index;		/* index into used_maps[] */
 			u32 map_off;		/* offset from value base address */
-			struct bpf_iarray *jt;	/* jump table for gotox instruction */
 		};
 		struct {
 			enum bpf_reg_type reg_type;	/* type of pseudo_btf_id */
@@ -550,6 +549,7 @@ struct bpf_insn_aux_data {
 		/* remember the offset of node field within type to rewrite */
 		u64 insert_off;
 	};
+	struct bpf_iarray *jt;	/* jump table for gotox or bpf_tailcall call instruction */
 	struct btf_struct_meta *kptr_struct_meta;
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
@@ -652,6 +652,7 @@ struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u32 postorder_start; /* The idx to the env->cfg.insn_postorder */
+	u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */
 	u16 stack_depth; /* max. stack depth used by this function */
 	u16 stack_extra;
 	/* offsets in range [stack_depth .. fastcall_stack_off)
@@ -669,9 +670,9 @@ struct bpf_subprog_info {
 	bool keep_fastcall_stack: 1;
 	bool changes_pkt_data: 1;
 	bool might_sleep: 1;
+	u8 arg_cnt:3;
 
 	enum priv_stack_mode priv_stack_mode;
-	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };
 
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index a7240013fd9d..60db5d655495 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -482,11 +482,12 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = &prog->insnsi[idx];
 	const struct opcode_info *opcode_info;
-	struct bpf_iarray *succ;
+	struct bpf_iarray *succ, *jt;
 	int insn_sz;
 
-	if (unlikely(insn_is_gotox(insn)))
-		return env->insn_aux_data[idx].jt;
+	jt = env->insn_aux_data[idx].jt;
+	if (unlikely(jt))
+		return jt;
 
 	/* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
 	succ = env->succ;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9426367fc911..0828718a8ba7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3555,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			subprog[cur_subprog].has_ld_abs = true;
 		if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
 			goto next;
-		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+		if (BPF_OP(code) == BPF_CALL)
 			goto next;
+		if (BPF_OP(code) == BPF_EXIT) {
+			subprog[cur_subprog].exit_idx = i;
+			goto next;
+		}
 		off = i + bpf_jmp_offset(&insn[i]) + 1;
 		if (off < subprog_start || off >= subprog_end) {
 			verbose(env, "jump out of range from insn %d to %d\n", i, off);
@@ -18156,6 +18160,25 @@ static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
 	return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
 }
 
+static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
+{
+	static struct bpf_subprog_info *subprog;
+	struct bpf_iarray *jt;
+
+	if (env->insn_aux_data[t].jt)
+		return 0;
+
+	jt = iarray_realloc(NULL, 2);
+	if (!jt)
+		return -ENOMEM;
+
+	subprog = bpf_find_containing_subprog(env, t);
+	jt->items[0] = t + 1;
+	jt->items[1] = subprog->exit_idx;
+	env->insn_aux_data[t].jt = jt;
+	return 0;
+}
+
 /* Visits the instruction at index t and returns one of the following:
  *  < 0 - an error occurred
  *  DONE_EXPLORING - the instruction was fully explored
@@ -18216,6 +18239,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 				mark_subprog_might_sleep(env, t);
 			if (bpf_helper_changes_pkt_data(insn->imm))
 				mark_subprog_changes_pkt_data(env, t);
+			if (insn->imm == BPF_FUNC_tail_call)
+				visit_tailcall_insn(env, t);
 		} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 			struct bpf_kfunc_call_arg_meta meta;
 
@@ -21477,7 +21502,7 @@ static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len
 	int i;
 
 	for (i = start; i < end; i++) {
-		if (insn_is_gotox(&insns[i])) {
+		if (aux_data[i].jt) {
 			kvfree(aux_data[i].jt);
 			aux_data[i].jt = NULL;
 		}
-- 
cgit v1.2.3


From c7dcb041ce7d32c0becd43e8f99f993365e6bd20 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Thu, 13 Nov 2025 18:57:08 -0800
Subject: crypto: ansi_cprng - Remove unused ansi_cprng algorithm

Remove ansi_cprng, since it's obsolete and unused, as confirmed at
https://lore.kernel.org/r/aQxpnckYMgAAOLpZ@gondor.apana.org.au/

This was originally added in 2008, apparently as a FIPS approved random
number generator.  Whether this has ever belonged upstream is
questionable.  Either way, ansi_cprng is no longer usable for this
purpose, since it's been superseded by the more modern algorithms in
crypto/drbg.c, and FIPS itself no longer allows it.  (NIST SP 800-131A
Rev 1 (2015) says that RNGs based on ANSI X9.31 will be disallowed after
2015.  NIST SP 800-131A Rev 2 (2019) confirms they are now disallowed.)

Therefore, there is no reason to keep it around.

Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Haotian Zhang <vulab@iscas.ac.cn>
Cc: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/userspace-if.rst       |   7 +-
 MAINTAINERS                                 |   1 -
 arch/arm/configs/axm55xx_defconfig          |   1 -
 arch/arm/configs/clps711x_defconfig         |   1 -
 arch/arm/configs/dove_defconfig             |   1 -
 arch/arm/configs/ep93xx_defconfig           |   1 -
 arch/arm/configs/jornada720_defconfig       |   1 -
 arch/arm/configs/keystone_defconfig         |   1 -
 arch/arm/configs/lpc32xx_defconfig          |   1 -
 arch/arm/configs/mmp2_defconfig             |   1 -
 arch/arm/configs/mv78xx0_defconfig          |   1 -
 arch/arm/configs/omap1_defconfig            |   1 -
 arch/arm/configs/orion5x_defconfig          |   1 -
 arch/arm/configs/pxa168_defconfig           |   1 -
 arch/arm/configs/pxa3xx_defconfig           |   1 -
 arch/arm/configs/pxa910_defconfig           |   1 -
 arch/arm/configs/spitz_defconfig            |   1 -
 arch/arm64/configs/defconfig                |   1 -
 arch/hexagon/configs/comet_defconfig        |   1 -
 arch/m68k/configs/amcore_defconfig          |   1 -
 arch/m68k/configs/amiga_defconfig           |   1 -
 arch/m68k/configs/apollo_defconfig          |   1 -
 arch/m68k/configs/atari_defconfig           |   1 -
 arch/m68k/configs/bvme6000_defconfig        |   1 -
 arch/m68k/configs/hp300_defconfig           |   1 -
 arch/m68k/configs/mac_defconfig             |   1 -
 arch/m68k/configs/multi_defconfig           |   1 -
 arch/m68k/configs/mvme147_defconfig         |   1 -
 arch/m68k/configs/mvme16x_defconfig         |   1 -
 arch/m68k/configs/q40_defconfig             |   1 -
 arch/m68k/configs/stmark2_defconfig         |   1 -
 arch/m68k/configs/sun3_defconfig            |   1 -
 arch/m68k/configs/sun3x_defconfig           |   1 -
 arch/mips/configs/decstation_64_defconfig   |   1 -
 arch/mips/configs/decstation_defconfig      |   1 -
 arch/mips/configs/decstation_r4k_defconfig  |   1 -
 arch/s390/configs/debug_defconfig           |   1 -
 arch/s390/configs/defconfig                 |   1 -
 arch/sh/configs/ap325rxa_defconfig          |   1 -
 arch/sh/configs/apsh4a3a_defconfig          |   1 -
 arch/sh/configs/apsh4ad0a_defconfig         |   1 -
 arch/sh/configs/dreamcast_defconfig         |   1 -
 arch/sh/configs/ecovec24_defconfig          |   1 -
 arch/sh/configs/edosk7760_defconfig         |   1 -
 arch/sh/configs/espt_defconfig              |   1 -
 arch/sh/configs/hp6xx_defconfig             |   1 -
 arch/sh/configs/landisk_defconfig           |   1 -
 arch/sh/configs/lboxre2_defconfig           |   1 -
 arch/sh/configs/migor_defconfig             |   1 -
 arch/sh/configs/r7780mp_defconfig           |   1 -
 arch/sh/configs/r7785rp_defconfig           |   1 -
 arch/sh/configs/rts7751r2d1_defconfig       |   1 -
 arch/sh/configs/rts7751r2dplus_defconfig    |   1 -
 arch/sh/configs/sdk7780_defconfig           |   1 -
 arch/sh/configs/sdk7786_defconfig           |   1 -
 arch/sh/configs/se7206_defconfig            |   1 -
 arch/sh/configs/se7343_defconfig            |   1 -
 arch/sh/configs/se7705_defconfig            |   1 -
 arch/sh/configs/se7712_defconfig            |   1 -
 arch/sh/configs/se7721_defconfig            |   1 -
 arch/sh/configs/se7722_defconfig            |   1 -
 arch/sh/configs/se7724_defconfig            |   1 -
 arch/sh/configs/se7750_defconfig            |   1 -
 arch/sh/configs/se7751_defconfig            |   1 -
 arch/sh/configs/se7780_defconfig            |   1 -
 arch/sh/configs/sh03_defconfig              |   1 -
 arch/sh/configs/sh2007_defconfig            |   1 -
 arch/sh/configs/sh7710voipgw_defconfig      |   1 -
 arch/sh/configs/sh7757lcr_defconfig         |   1 -
 arch/sh/configs/sh7763rdp_defconfig         |   1 -
 arch/sh/configs/sh7785lcr_32bit_defconfig   |   1 -
 arch/sh/configs/sh7785lcr_defconfig         |   1 -
 arch/sh/configs/shmin_defconfig             |   1 -
 arch/sh/configs/shx3_defconfig              |   1 -
 arch/sh/configs/titan_defconfig             |   1 -
 arch/sh/configs/ul2_defconfig               |   1 -
 arch/sh/configs/urquell_defconfig           |   1 -
 arch/sparc/configs/sparc32_defconfig        |   1 -
 arch/sparc/configs/sparc64_defconfig        |   1 -
 arch/xtensa/configs/audio_kc705_defconfig   |   1 -
 arch/xtensa/configs/generic_kc705_defconfig |   1 -
 arch/xtensa/configs/iss_defconfig           |   1 -
 arch/xtensa/configs/nommu_kc705_defconfig   |   1 -
 arch/xtensa/configs/smp_lx200_defconfig     |   1 -
 arch/xtensa/configs/virt_defconfig          |   1 -
 arch/xtensa/configs/xip_kc705_defconfig     |   1 -
 crypto/Kconfig                              |  13 +-
 crypto/Makefile                             |   1 -
 crypto/ansi_cprng.c                         | 474 ----------------------------
 crypto/tcrypt.c                             |   4 -
 crypto/testmgr.c                            |  97 ------
 crypto/testmgr.h                            | 106 -------
 include/crypto/rng.h                        |  11 +-
 93 files changed, 9 insertions(+), 789 deletions(-)
 delete mode 100644 crypto/ansi_cprng.c

(limited to 'include')

diff --git a/Documentation/crypto/userspace-if.rst b/Documentation/crypto/userspace-if.rst
index f80f243e227e..8158b363cd98 100644
--- a/Documentation/crypto/userspace-if.rst
+++ b/Documentation/crypto/userspace-if.rst
@@ -302,10 +302,9 @@ follows:
 
 
 Depending on the RNG type, the RNG must be seeded. The seed is provided
-using the setsockopt interface to set the key. For example, the
-ansi_cprng requires a seed. The DRBGs do not require a seed, but may be
-seeded. The seed is also known as a *Personalization String* in NIST SP 800-90A
-standard.
+using the setsockopt interface to set the key. The SP800-90A DRBGs do
+not require a seed, but may be seeded. The seed is also known as a
+*Personalization String* in NIST SP 800-90A standard.
 
 Using the read()/recvmsg() system calls, random numbers can be obtained.
 The kernel generates at most 128 bytes in one call. If user space
diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..5493ee49646f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6605,7 +6605,6 @@ CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
 M:	Neil Horman <nhorman@tuxdriver.com>
 L:	linux-crypto@vger.kernel.org
 S:	Maintained
-F:	crypto/ansi_cprng.c
 F:	crypto/rng.c
 
 CS3308 MEDIA DRIVER
diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig
index 516689dc6cf1..f35d1e7efc7d 100644
--- a/arch/arm/configs/axm55xx_defconfig
+++ b/arch/arm/configs/axm55xx_defconfig
@@ -233,4 +233,3 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_DEBUG_USER=y
 CONFIG_CRYPTO_GCM=y
 CONFIG_CRYPTO_SHA256=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig
index 6fa3477e6b02..f66d502ce2ef 100644
--- a/arch/arm/configs/clps711x_defconfig
+++ b/arch/arm/configs/clps711x_defconfig
@@ -75,5 +75,4 @@ CONFIG_MINIX_FS=y
 CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
 CONFIG_EARLY_PRINTK=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig
index d76eb12d29a7..23d0b61402e4 100644
--- a/arch/arm/configs/dove_defconfig
+++ b/arch/arm/configs/dove_defconfig
@@ -126,7 +126,6 @@ CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_SHA512=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_DEV_MARVELL_CESA=y
 CONFIG_PRINTK_TIME=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/arm/configs/ep93xx_defconfig b/arch/arm/configs/ep93xx_defconfig
index 2248afaf35b5..facdd4902470 100644
--- a/arch/arm/configs/ep93xx_defconfig
+++ b/arch/arm/configs/ep93xx_defconfig
@@ -119,4 +119,3 @@ CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
 CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/jornada720_defconfig b/arch/arm/configs/jornada720_defconfig
index e6ec768f42e2..d57285cfefb2 100644
--- a/arch/arm/configs/jornada720_defconfig
+++ b/arch/arm/configs/jornada720_defconfig
@@ -92,4 +92,3 @@ CONFIG_NLS_UTF8=m
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_FTRACE is not set
 CONFIG_DEBUG_LL=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig
index c1291ca290b2..b0cadd878152 100644
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -228,7 +228,6 @@ CONFIG_CRYPTO_DES=y
 CONFIG_CRYPTO_CBC=y
 CONFIG_CRYPTO_CTR=y
 CONFIG_CRYPTO_XCBC=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_USER_API_SKCIPHER=y
 CONFIG_DMA_CMA=y
diff --git a/arch/arm/configs/lpc32xx_defconfig b/arch/arm/configs/lpc32xx_defconfig
index 9afccd76446b..2bddb0924a8c 100644
--- a/arch/arm/configs/lpc32xx_defconfig
+++ b/arch/arm/configs/lpc32xx_defconfig
@@ -177,7 +177,6 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ASCII=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
 # CONFIG_CRYPTO_HW is not set
 CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
diff --git a/arch/arm/configs/mmp2_defconfig b/arch/arm/configs/mmp2_defconfig
index 842a989baa27..d38e8d36fef2 100644
--- a/arch/arm/configs/mmp2_defconfig
+++ b/arch/arm/configs/mmp2_defconfig
@@ -78,4 +78,3 @@ CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
 CONFIG_DEBUG_MMP_UART3=y
 CONFIG_EARLY_PRINTK=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/mv78xx0_defconfig b/arch/arm/configs/mv78xx0_defconfig
index 3343f72de7ea..e9d6af34c4e1 100644
--- a/arch/arm/configs/mv78xx0_defconfig
+++ b/arch/arm/configs/mv78xx0_defconfig
@@ -121,4 +121,3 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_SCHEDSTATS=y
 CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig
index 661e5d6894bd..c574aa090acb 100644
--- a/arch/arm/configs/omap1_defconfig
+++ b/arch/arm/configs/omap1_defconfig
@@ -220,7 +220,6 @@ CONFIG_CRYPTO_ECB=y
 CONFIG_CRYPTO_PCBC=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_FONTS=y
 CONFIG_FONT_8x8=y
 CONFIG_FONT_8x16=y
diff --git a/arch/arm/configs/orion5x_defconfig b/arch/arm/configs/orion5x_defconfig
index 62b9c6102789..1194ae1458f7 100644
--- a/arch/arm/configs/orion5x_defconfig
+++ b/arch/arm/configs/orion5x_defconfig
@@ -145,4 +145,3 @@ CONFIG_LATENCYTOP=y
 # CONFIG_FTRACE is not set
 CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/pxa168_defconfig b/arch/arm/configs/pxa168_defconfig
index 4748c7d33cb8..8cbca84fe33a 100644
--- a/arch/arm/configs/pxa168_defconfig
+++ b/arch/arm/configs/pxa168_defconfig
@@ -48,4 +48,3 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_DEBUG_PREEMPT is not set
 CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 381356faf382..07d422f0ff34 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -106,5 +106,4 @@ CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
 # CONFIG_FTRACE is not set
 CONFIG_DEBUG_USER=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/arm/configs/pxa910_defconfig b/arch/arm/configs/pxa910_defconfig
index 49b59c600ae1..71ed0d73f8a9 100644
--- a/arch/arm/configs/pxa910_defconfig
+++ b/arch/arm/configs/pxa910_defconfig
@@ -59,4 +59,3 @@ CONFIG_DEBUG_USER=y
 CONFIG_DEBUG_LL=y
 CONFIG_DEBUG_MMP_UART2=y
 CONFIG_EARLY_PRINTK=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/arm/configs/spitz_defconfig b/arch/arm/configs/spitz_defconfig
index ac2a0f998c73..cd27bb960436 100644
--- a/arch/arm/configs/spitz_defconfig
+++ b/arch/arm/configs/spitz_defconfig
@@ -228,7 +228,6 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index e3a2d37bd104..41328593e74b 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1783,7 +1783,6 @@ CONFIG_CRYPTO_CHACHA20=m
 CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_SHA3_ARM64=m
diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig
index c6108f000288..f748400ac4c8 100644
--- a/arch/hexagon/configs/comet_defconfig
+++ b/arch/hexagon/configs/comet_defconfig
@@ -70,7 +70,6 @@ CONFIG_INET=y
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
 CONFIG_FRAME_WARN=0
 CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 60767811e34a..88832e9cd7cb 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -86,5 +86,4 @@ CONFIG_PANIC_ON_OOPS=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_CRYPTO_ECHAINIV is not set
-CONFIG_CRYPTO_ANSI_CPRNG=y
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index fba8089c9fb3..62fb72988c2c 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -591,7 +591,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 6af37716384c..1efc03aa034e 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -548,7 +548,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 471f4ec3730d..9edcafbe5cd2 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -568,7 +568,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 28492ef51457..516984ec3f93 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -540,7 +540,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 2fbefb16b72e..689e0179b763 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -550,7 +550,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index deec5df3f35a..1ab00a8041f8 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -567,7 +567,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 301a05c12577..2520e9b222c5 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -654,7 +654,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 0d401db0e8f8..5b28388b13a5 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -540,7 +540,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 90fb5b6bcf83..ab71f83efef2 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -541,7 +541,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index b89b0f7fe2da..25d7f98c76a4 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -557,7 +557,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig
index 7787a4dd7c3c..70637b6ddbdd 100644
--- a/arch/m68k/configs/stmark2_defconfig
+++ b/arch/m68k/configs/stmark2_defconfig
@@ -84,7 +84,6 @@ CONFIG_FSCACHE=y
 CONFIG_CRAMFS=y
 CONFIG_SQUASHFS=y
 CONFIG_ROMFS_FS=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
 # CONFIG_CRYPTO_HW is not set
 CONFIG_PRINTK_TIME=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 8cc372c4df72..7dd129b3e767 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -538,7 +538,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index f4569f64c6e4..217cf847051d 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -538,7 +538,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 CONFIG_CRYPTO_USER_API_HASH=m
diff --git a/arch/mips/configs/decstation_64_defconfig b/arch/mips/configs/decstation_64_defconfig
index 85a4472cb058..b738c4c28c28 100644
--- a/arch/mips/configs/decstation_64_defconfig
+++ b/arch/mips/configs/decstation_64_defconfig
@@ -200,7 +200,6 @@ CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig
index a3b2c8da2dde..60b87cf63df3 100644
--- a/arch/mips/configs/decstation_defconfig
+++ b/arch/mips/configs/decstation_defconfig
@@ -195,7 +195,6 @@ CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig
index a476717b8a6a..ef2d18bc593a 100644
--- a/arch/mips/configs/decstation_r4k_defconfig
+++ b/arch/mips/configs/decstation_r4k_defconfig
@@ -195,7 +195,6 @@ CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_DRBG_HASH=y
 CONFIG_CRYPTO_DRBG_CTR=y
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index b31c1df90257..38781f69a51b 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -800,7 +800,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 161dad7ef211..3fe746c15281 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -784,7 +784,6 @@ CONFIG_CRYPTO_842=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ZSTD=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
 CONFIG_CRYPTO_JITTERENTROPY_OSR=1
 CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig
index b6f36c938f1d..fe053a0d85d0 100644
--- a/arch/sh/configs/ap325rxa_defconfig
+++ b/arch/sh/configs/ap325rxa_defconfig
@@ -98,4 +98,3 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_CRYPTO=y
 CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig
index 9c2644443c4d..c9e711dac7cb 100644
--- a/arch/sh/configs/apsh4a3a_defconfig
+++ b/arch/sh/configs/apsh4a3a_defconfig
@@ -87,5 +87,4 @@ CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 # CONFIG_FTRACE is not set
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig
index 137573610ec4..e3f524692cff 100644
--- a/arch/sh/configs/apsh4ad0a_defconfig
+++ b/arch/sh/configs/apsh4ad0a_defconfig
@@ -117,4 +117,3 @@ CONFIG_DETECT_HUNG_TASK=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_DEBUG_VM=y
 CONFIG_DWARF_UNWINDER=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig
index 0c9f2030bb7c..4573d5d64989 100644
--- a/arch/sh/configs/dreamcast_defconfig
+++ b/arch/sh/configs/dreamcast_defconfig
@@ -66,6 +66,5 @@ CONFIG_LOGO=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_HUGETLBFS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=y
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index e76694aace25..9475c19a3d97 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -127,4 +127,3 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_DEBUG_FS=y
 CONFIG_CRYPTO=y
 CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index f427a95bcd21..368cf25e87a4 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -111,4 +111,3 @@ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_CRYPTO=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index da176f100e00..8c83d20e6f6f 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -109,4 +109,3 @@ CONFIG_NLS_KOI8_U=y
 CONFIG_NLS_UTF8=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig
index 3582af15ad86..04a9fcb4342a 100644
--- a/arch/sh/configs/hp6xx_defconfig
+++ b/arch/sh/configs/hp6xx_defconfig
@@ -54,5 +54,4 @@ CONFIG_CRYPTO_CBC=y
 CONFIG_CRYPTO_ECB=y
 CONFIG_CRYPTO_PCBC=y
 CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index 924bb3233b0b..7899cb92d87d 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -110,4 +110,3 @@ CONFIG_SMB_FS=m
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_CODEPAGE_932=y
 CONFIG_SH_STANDARD_BIOS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig
index 0307bb2be79f..f58714ea4b1d 100644
--- a/arch/sh/configs/lboxre2_defconfig
+++ b/arch/sh/configs/lboxre2_defconfig
@@ -57,4 +57,3 @@ CONFIG_TMPFS=y
 CONFIG_ROMFS_FS=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_SH_STANDARD_BIOS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index 31dbd8888aaa..7cdaa909ffd6 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -87,5 +87,4 @@ CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
 CONFIG_ROOT_NFS=y
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index f28b8c4181c2..f268d206a5b1 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -104,4 +104,3 @@ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 3a4239f20ff1..dbbd1661ac0f 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -102,4 +102,3 @@ CONFIG_4KSTACKS=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig
index 69568cc40396..0c54ab2b06e6 100644
--- a/arch/sh/configs/rts7751r2d1_defconfig
+++ b/arch/sh/configs/rts7751r2d1_defconfig
@@ -86,4 +86,3 @@ CONFIG_TMPFS=y
 CONFIG_MINIX_FS=y
 CONFIG_NLS_CODEPAGE_932=y
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index ecb4bdb5bb58..3173b616b2cb 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -91,4 +91,3 @@ CONFIG_TMPFS=y
 CONFIG_MINIX_FS=y
 CONFIG_NLS_CODEPAGE_932=y
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index 9870d16d9711..98cf3e20ddec 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -135,4 +135,3 @@ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_SH_STANDARD_BIOS=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig
index 07894f13441e..2803a8e9c3b4 100644
--- a/arch/sh/configs/sdk7786_defconfig
+++ b/arch/sh/configs/sdk7786_defconfig
@@ -212,4 +212,3 @@ CONFIG_FUNCTION_TRACER=y
 CONFIG_DMA_API_DEBUG=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DWARF_UNWINDER=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 64f9308ee586..d67158f69c52 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -99,5 +99,4 @@ CONFIG_FRAME_POINTER=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig
index 75db12fb9ad1..4e7b4364757d 100644
--- a/arch/sh/configs/se7343_defconfig
+++ b/arch/sh/configs/se7343_defconfig
@@ -92,4 +92,3 @@ CONFIG_CRAMFS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_NFSD=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig
index 1752ddc2694a..ad55a4d9d57d 100644
--- a/arch/sh/configs/se7705_defconfig
+++ b/arch/sh/configs/se7705_defconfig
@@ -51,4 +51,3 @@ CONFIG_PROC_KCORE=y
 CONFIG_JFFS2_FS=y
 CONFIG_NFS_FS=y
 CONFIG_ROOT_NFS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig
index 8770a72e6a63..11d390801d5f 100644
--- a/arch/sh/configs/se7712_defconfig
+++ b/arch/sh/configs/se7712_defconfig
@@ -95,4 +95,3 @@ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_FRAME_POINTER=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig
index b15c6406a0e8..a31008d0d513 100644
--- a/arch/sh/configs/se7721_defconfig
+++ b/arch/sh/configs/se7721_defconfig
@@ -121,4 +121,3 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_FRAME_POINTER=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig
index 5327a2f70980..37239ec2753c 100644
--- a/arch/sh/configs/se7722_defconfig
+++ b/arch/sh/configs/se7722_defconfig
@@ -54,4 +54,3 @@ CONFIG_PRINTK_TIME=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 CONFIG_SH_STANDARD_BIOS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
index 9501e69eb886..4b62bb860e67 100644
--- a/arch/sh/configs/se7724_defconfig
+++ b/arch/sh/configs/se7724_defconfig
@@ -127,4 +127,3 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_CRYPTO=y
 CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig
index a1e25d7de8a6..83ae513e5462 100644
--- a/arch/sh/configs/se7750_defconfig
+++ b/arch/sh/configs/se7750_defconfig
@@ -52,4 +52,3 @@ CONFIG_ROOT_NFS=y
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_MSDOS_PARTITION is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig
index 8b5fe4ec16bc..8f5ddab3c106 100644
--- a/arch/sh/configs/se7751_defconfig
+++ b/arch/sh/configs/se7751_defconfig
@@ -42,4 +42,3 @@ CONFIG_EXT2_FS=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_JFFS2_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig
index 21303304eda2..12463b766120 100644
--- a/arch/sh/configs/se7780_defconfig
+++ b/arch/sh/configs/se7780_defconfig
@@ -102,4 +102,3 @@ CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig
index 4d75c92cac10..518526dfdfad 100644
--- a/arch/sh/configs/sh03_defconfig
+++ b/arch/sh/configs/sh03_defconfig
@@ -119,6 +119,5 @@ CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_DEFLATE=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=y
diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig
index cc6292b3235a..4e06ea7bcc30 100644
--- a/arch/sh/configs/sh2007_defconfig
+++ b/arch/sh/configs/sh2007_defconfig
@@ -191,5 +191,4 @@ CONFIG_CRYPTO_TEA=y
 CONFIG_CRYPTO_TWOFISH=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig
index 5b151bb2bc43..c8625ba5be79 100644
--- a/arch/sh/configs/sh7710voipgw_defconfig
+++ b/arch/sh/configs/sh7710voipgw_defconfig
@@ -51,4 +51,3 @@ CONFIG_THERMAL=y
 # CONFIG_DNOTIFY is not set
 CONFIG_JFFS2_FS=y
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig
index 48a0f9beb116..9d2de154765b 100644
--- a/arch/sh/configs/sh7757lcr_defconfig
+++ b/arch/sh/configs/sh7757lcr_defconfig
@@ -81,4 +81,3 @@ CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 # CONFIG_FTRACE is not set
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index b77b3313157e..394f92bd6416 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -111,4 +111,3 @@ CONFIG_NLS_KOI8_U=y
 CONFIG_NLS_UTF8=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_DEBUG_FS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index 44f9b2317f09..a51f16c079aa 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -145,5 +145,4 @@ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_LATENCYTOP=y
 # CONFIG_FTRACE is not set
 CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig
index aec74b0e7003..389edf8dd99f 100644
--- a/arch/sh/configs/sh7785lcr_defconfig
+++ b/arch/sh/configs/sh7785lcr_defconfig
@@ -113,5 +113,4 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
 CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig
index bfeb004f130e..11ac2e66ec7e 100644
--- a/arch/sh/configs/shmin_defconfig
+++ b/arch/sh/configs/shmin_defconfig
@@ -49,4 +49,3 @@ CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_SH_STANDARD_BIOS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index 9a0df5ea3866..0795ddcc603c 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -98,4 +98,3 @@ CONFIG_DEBUG_VM=y
 CONFIG_FRAME_POINTER=y
 CONFIG_SH_STANDARD_BIOS=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index 8ef72b8dbcd3..876db502354a 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -262,4 +262,3 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig
index 103b81ec1ffb..8f5235dd96ca 100644
--- a/arch/sh/configs/ul2_defconfig
+++ b/arch/sh/configs/ul2_defconfig
@@ -81,4 +81,3 @@ CONFIG_NLS_CODEPAGE_932=y
 CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_CRYPTO_MICHAEL_MIC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index 00ef62133b04..5197ee3167f2 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -143,5 +143,4 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FTRACE is not set
 # CONFIG_DUMP_CODE is not set
 CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig
index f6341b063b01..e021ecfb5a77 100644
--- a/arch/sparc/configs/sparc32_defconfig
+++ b/arch/sparc/configs/sparc32_defconfig
@@ -92,5 +92,4 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 7a7c4dec2925..7e52da881175 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -228,7 +228,6 @@ CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_VCC=m
 CONFIG_PATA_CMD64X=y
 CONFIG_IP_PNP=y
diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig
index f2af1a32c9c7..5c14c9fa1fe8 100644
--- a/arch/xtensa/configs/audio_kc705_defconfig
+++ b/arch/xtensa/configs/audio_kc705_defconfig
@@ -133,4 +133,3 @@ CONFIG_STACKTRACE=y
 CONFIG_RCU_TRACE=y
 # CONFIG_FTRACE is not set
 # CONFIG_S32C1I_SELFTEST is not set
-CONFIG_CRYPTO_ANSI_CPRNG=y
diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig
index 4427907becca..836082830c8e 100644
--- a/arch/xtensa/configs/generic_kc705_defconfig
+++ b/arch/xtensa/configs/generic_kc705_defconfig
@@ -121,4 +121,3 @@ CONFIG_RCU_TRACE=y
 # CONFIG_FTRACE is not set
 CONFIG_LD_NO_RELAX=y
 # CONFIG_S32C1I_SELFTEST is not set
-CONFIG_CRYPTO_ANSI_CPRNG=y
diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig
index 32ce8fb068f0..324266824fae 100644
--- a/arch/xtensa/configs/iss_defconfig
+++ b/arch/xtensa/configs/iss_defconfig
@@ -28,4 +28,3 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 # CONFIG_FRAME_POINTER is not set
 CONFIG_DETECT_HUNG_TASK=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig
index 5828228522ba..90cf64372518 100644
--- a/arch/xtensa/configs/nommu_kc705_defconfig
+++ b/arch/xtensa/configs/nommu_kc705_defconfig
@@ -122,4 +122,3 @@ CONFIG_RCU_TRACE=y
 # CONFIG_FTRACE is not set
 # CONFIG_LD_NO_RELAX is not set
 # CONFIG_CRYPTO_ECHAINIV is not set
-CONFIG_CRYPTO_ANSI_CPRNG=y
diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig
index 326966ca7831..9e0112454059 100644
--- a/arch/xtensa/configs/smp_lx200_defconfig
+++ b/arch/xtensa/configs/smp_lx200_defconfig
@@ -125,4 +125,3 @@ CONFIG_RCU_TRACE=y
 # CONFIG_FTRACE is not set
 CONFIG_LD_NO_RELAX=y
 # CONFIG_S32C1I_SELFTEST is not set
-CONFIG_CRYPTO_ANSI_CPRNG=y
diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig
index e37048985b47..e2df7db318a2 100644
--- a/arch/xtensa/configs/virt_defconfig
+++ b/arch/xtensa/configs/virt_defconfig
@@ -92,7 +92,6 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_FONTS=y
 CONFIG_PRINTK_TIME=y
diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig
index ee47438f9b51..9ddb9bf6c5fd 100644
--- a/arch/xtensa/configs/xip_kc705_defconfig
+++ b/arch/xtensa/configs/xip_kc705_defconfig
@@ -98,7 +98,6 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b9afd8505b89..a7997759cbd6 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -25,7 +25,7 @@ menu "Crypto core or helper"
 
 config CRYPTO_FIPS
 	bool "FIPS 200 compliance"
-	depends on (CRYPTO_ANSI_CPRNG || CRYPTO_DRBG) && CRYPTO_SELFTESTS
+	depends on CRYPTO_DRBG && CRYPTO_SELFTESTS
 	depends on (MODULE_SIG || !MODULES)
 	help
 	  This option enables the fips boot option which is
@@ -1169,17 +1169,6 @@ endmenu
 
 menu "Random number generation"
 
-config CRYPTO_ANSI_CPRNG
-	tristate "ANSI PRNG (Pseudo Random Number Generator)"
-	select CRYPTO_AES
-	select CRYPTO_RNG
-	help
-	  Pseudo RNG (random number generator) (ANSI X9.31 Appendix A.2.4)
-
-	  This uses the AES cipher algorithm.
-
-	  Note that this option must be enabled if CRYPTO_FIPS is selected
-
 menuconfig CRYPTO_DRBG_MENU
 	tristate "NIST SP800-90A DRBG (Deterministic Random Bit Generator)"
 	help
diff --git a/crypto/Makefile b/crypto/Makefile
index c47f2bf5db61..75e0d9d45795 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -163,7 +163,6 @@ obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
 obj-$(CONFIG_CRYPTO_XXHASH) += xxhash_generic.o
 obj-$(CONFIG_CRYPTO_842) += 842.o
 obj-$(CONFIG_CRYPTO_RNG2) += rng.o
-obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o
 obj-$(CONFIG_CRYPTO_DRBG) += drbg.o
 obj-$(CONFIG_CRYPTO_JITTERENTROPY) += jitterentropy_rng.o
 CFLAGS_jitterentropy.o = -O0
diff --git a/crypto/ansi_cprng.c b/crypto/ansi_cprng.c
deleted file mode 100644
index 153523ce6076..000000000000
--- a/crypto/ansi_cprng.c
+++ /dev/null
@@ -1,474 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PRNG: Pseudo Random Number Generator
- *       Based on NIST Recommended PRNG From ANSI X9.31 Appendix A.2.4 using
- *       AES 128 cipher
- *
- *  (C) Neil Horman <nhorman@tuxdriver.com>
- */
-
-#include <crypto/internal/cipher.h>
-#include <crypto/internal/rng.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/string.h>
-
-#define DEFAULT_PRNG_KEY "0123456789abcdef"
-#define DEFAULT_PRNG_KSZ 16
-#define DEFAULT_BLK_SZ 16
-#define DEFAULT_V_SEED "zaybxcwdveuftgsh"
-
-/*
- * Flags for the prng_context flags field
- */
-
-#define PRNG_FIXED_SIZE 0x1
-#define PRNG_NEED_RESET 0x2
-
-/*
- * Note: DT is our counter value
- *	 I is our intermediate value
- *	 V is our seed vector
- * See http://csrc.nist.gov/groups/STM/cavp/documents/rng/931rngext.pdf
- * for implementation details
- */
-
-
-struct prng_context {
-	spinlock_t prng_lock;
-	unsigned char rand_data[DEFAULT_BLK_SZ];
-	unsigned char last_rand_data[DEFAULT_BLK_SZ];
-	unsigned char DT[DEFAULT_BLK_SZ];
-	unsigned char I[DEFAULT_BLK_SZ];
-	unsigned char V[DEFAULT_BLK_SZ];
-	u32 rand_data_valid;
-	struct crypto_cipher *tfm;
-	u32 flags;
-};
-
-static int dbg;
-
-static void hexdump(char *note, unsigned char *buf, unsigned int len)
-{
-	if (dbg) {
-		printk(KERN_CRIT "%s", note);
-		print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET,
-				16, 1,
-				buf, len, false);
-	}
-}
-
-#define dbgprint(format, args...) do {\
-if (dbg)\
-	printk(format, ##args);\
-} while (0)
-
-static void xor_vectors(unsigned char *in1, unsigned char *in2,
-			unsigned char *out, unsigned int size)
-{
-	int i;
-
-	for (i = 0; i < size; i++)
-		out[i] = in1[i] ^ in2[i];
-
-}
-/*
- * Returns DEFAULT_BLK_SZ bytes of random data per call
- * returns 0 if generation succeeded, <0 if something went wrong
- */
-static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test)
-{
-	int i;
-	unsigned char tmp[DEFAULT_BLK_SZ];
-	unsigned char *output = NULL;
-
-
-	dbgprint(KERN_CRIT "Calling _get_more_prng_bytes for context %p\n",
-		ctx);
-
-	hexdump("Input DT: ", ctx->DT, DEFAULT_BLK_SZ);
-	hexdump("Input I: ", ctx->I, DEFAULT_BLK_SZ);
-	hexdump("Input V: ", ctx->V, DEFAULT_BLK_SZ);
-
-	/*
-	 * This algorithm is a 3 stage state machine
-	 */
-	for (i = 0; i < 3; i++) {
-
-		switch (i) {
-		case 0:
-			/*
-			 * Start by encrypting the counter value
-			 * This gives us an intermediate value I
-			 */
-			memcpy(tmp, ctx->DT, DEFAULT_BLK_SZ);
-			output = ctx->I;
-			hexdump("tmp stage 0: ", tmp, DEFAULT_BLK_SZ);
-			break;
-		case 1:
-
-			/*
-			 * Next xor I with our secret vector V
-			 * encrypt that result to obtain our
-			 * pseudo random data which we output
-			 */
-			xor_vectors(ctx->I, ctx->V, tmp, DEFAULT_BLK_SZ);
-			hexdump("tmp stage 1: ", tmp, DEFAULT_BLK_SZ);
-			output = ctx->rand_data;
-			break;
-		case 2:
-			/*
-			 * First check that we didn't produce the same
-			 * random data that we did last time around through this
-			 */
-			if (!memcmp(ctx->rand_data, ctx->last_rand_data,
-					DEFAULT_BLK_SZ)) {
-				if (cont_test) {
-					panic("cprng %p Failed repetition check!\n",
-						ctx);
-				}
-
-				printk(KERN_ERR
-					"ctx %p Failed repetition check!\n",
-					ctx);
-
-				ctx->flags |= PRNG_NEED_RESET;
-				return -EINVAL;
-			}
-			memcpy(ctx->last_rand_data, ctx->rand_data,
-				DEFAULT_BLK_SZ);
-
-			/*
-			 * Lastly xor the random data with I
-			 * and encrypt that to obtain a new secret vector V
-			 */
-			xor_vectors(ctx->rand_data, ctx->I, tmp,
-				DEFAULT_BLK_SZ);
-			output = ctx->V;
-			hexdump("tmp stage 2: ", tmp, DEFAULT_BLK_SZ);
-			break;
-		}
-
-
-		/* do the encryption */
-		crypto_cipher_encrypt_one(ctx->tfm, output, tmp);
-
-	}
-
-	/*
-	 * Now update our DT value
-	 */
-	for (i = DEFAULT_BLK_SZ - 1; i >= 0; i--) {
-		ctx->DT[i] += 1;
-		if (ctx->DT[i] != 0)
-			break;
-	}
-
-	dbgprint("Returning new block for context %p\n", ctx);
-	ctx->rand_data_valid = 0;
-
-	hexdump("Output DT: ", ctx->DT, DEFAULT_BLK_SZ);
-	hexdump("Output I: ", ctx->I, DEFAULT_BLK_SZ);
-	hexdump("Output V: ", ctx->V, DEFAULT_BLK_SZ);
-	hexdump("New Random Data: ", ctx->rand_data, DEFAULT_BLK_SZ);
-
-	return 0;
-}
-
-/* Our exported functions */
-static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx,
-				int do_cont_test)
-{
-	unsigned char *ptr = buf;
-	unsigned int byte_count = (unsigned int)nbytes;
-	int err;
-
-
-	spin_lock_bh(&ctx->prng_lock);
-
-	err = -EINVAL;
-	if (ctx->flags & PRNG_NEED_RESET)
-		goto done;
-
-	/*
-	 * If the FIXED_SIZE flag is on, only return whole blocks of
-	 * pseudo random data
-	 */
-	err = -EINVAL;
-	if (ctx->flags & PRNG_FIXED_SIZE) {
-		if (nbytes < DEFAULT_BLK_SZ)
-			goto done;
-		byte_count = DEFAULT_BLK_SZ;
-	}
-
-	/*
-	 * Return 0 in case of success as mandated by the kernel
-	 * crypto API interface definition.
-	 */
-	err = 0;
-
-	dbgprint(KERN_CRIT "getting %d random bytes for context %p\n",
-		byte_count, ctx);
-
-
-remainder:
-	if (ctx->rand_data_valid == DEFAULT_BLK_SZ) {
-		if (_get_more_prng_bytes(ctx, do_cont_test) < 0) {
-			memset(buf, 0, nbytes);
-			err = -EINVAL;
-			goto done;
-		}
-	}
-
-	/*
-	 * Copy any data less than an entire block
-	 */
-	if (byte_count < DEFAULT_BLK_SZ) {
-empty_rbuf:
-		while (ctx->rand_data_valid < DEFAULT_BLK_SZ) {
-			*ptr = ctx->rand_data[ctx->rand_data_valid];
-			ptr++;
-			byte_count--;
-			ctx->rand_data_valid++;
-			if (byte_count == 0)
-				goto done;
-		}
-	}
-
-	/*
-	 * Now copy whole blocks
-	 */
-	for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) {
-		if (ctx->rand_data_valid == DEFAULT_BLK_SZ) {
-			if (_get_more_prng_bytes(ctx, do_cont_test) < 0) {
-				memset(buf, 0, nbytes);
-				err = -EINVAL;
-				goto done;
-			}
-		}
-		if (ctx->rand_data_valid > 0)
-			goto empty_rbuf;
-		memcpy(ptr, ctx->rand_data, DEFAULT_BLK_SZ);
-		ctx->rand_data_valid += DEFAULT_BLK_SZ;
-		ptr += DEFAULT_BLK_SZ;
-	}
-
-	/*
-	 * Now go back and get any remaining partial block
-	 */
-	if (byte_count)
-		goto remainder;
-
-done:
-	spin_unlock_bh(&ctx->prng_lock);
-	dbgprint(KERN_CRIT "returning %d from get_prng_bytes in context %p\n",
-		err, ctx);
-	return err;
-}
-
-static void free_prng_context(struct prng_context *ctx)
-{
-	crypto_free_cipher(ctx->tfm);
-}
-
-static int reset_prng_context(struct prng_context *ctx,
-			      const unsigned char *key, size_t klen,
-			      const unsigned char *V, const unsigned char *DT)
-{
-	int ret;
-	const unsigned char *prng_key;
-
-	spin_lock_bh(&ctx->prng_lock);
-	ctx->flags |= PRNG_NEED_RESET;
-
-	prng_key = (key != NULL) ? key : (unsigned char *)DEFAULT_PRNG_KEY;
-
-	if (!key)
-		klen = DEFAULT_PRNG_KSZ;
-
-	if (V)
-		memcpy(ctx->V, V, DEFAULT_BLK_SZ);
-	else
-		memcpy(ctx->V, DEFAULT_V_SEED, DEFAULT_BLK_SZ);
-
-	if (DT)
-		memcpy(ctx->DT, DT, DEFAULT_BLK_SZ);
-	else
-		memset(ctx->DT, 0, DEFAULT_BLK_SZ);
-
-	memset(ctx->rand_data, 0, DEFAULT_BLK_SZ);
-	memset(ctx->last_rand_data, 0, DEFAULT_BLK_SZ);
-
-	ctx->rand_data_valid = DEFAULT_BLK_SZ;
-
-	ret = crypto_cipher_setkey(ctx->tfm, prng_key, klen);
-	if (ret) {
-		dbgprint(KERN_CRIT "PRNG: setkey() failed flags=%x\n",
-			crypto_cipher_get_flags(ctx->tfm));
-		goto out;
-	}
-
-	ret = 0;
-	ctx->flags &= ~PRNG_NEED_RESET;
-out:
-	spin_unlock_bh(&ctx->prng_lock);
-	return ret;
-}
-
-static int cprng_init(struct crypto_tfm *tfm)
-{
-	struct prng_context *ctx = crypto_tfm_ctx(tfm);
-
-	spin_lock_init(&ctx->prng_lock);
-	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
-	if (IS_ERR(ctx->tfm)) {
-		dbgprint(KERN_CRIT "Failed to alloc tfm for context %p\n",
-				ctx);
-		return PTR_ERR(ctx->tfm);
-	}
-
-	if (reset_prng_context(ctx, NULL, DEFAULT_PRNG_KSZ, NULL, NULL) < 0)
-		return -EINVAL;
-
-	/*
-	 * after allocation, we should always force the user to reset
-	 * so they don't inadvertently use the insecure default values
-	 * without specifying them intentially
-	 */
-	ctx->flags |= PRNG_NEED_RESET;
-	return 0;
-}
-
-static void cprng_exit(struct crypto_tfm *tfm)
-{
-	free_prng_context(crypto_tfm_ctx(tfm));
-}
-
-static int cprng_get_random(struct crypto_rng *tfm,
-			    const u8 *src, unsigned int slen,
-			    u8 *rdata, unsigned int dlen)
-{
-	struct prng_context *prng = crypto_rng_ctx(tfm);
-
-	return get_prng_bytes(rdata, dlen, prng, 0);
-}
-
-/*
- *  This is the cprng_registered reset method the seed value is
- *  interpreted as the tuple { V KEY DT}
- *  V and KEY are required during reset, and DT is optional, detected
- *  as being present by testing the length of the seed
- */
-static int cprng_reset(struct crypto_rng *tfm,
-		       const u8 *seed, unsigned int slen)
-{
-	struct prng_context *prng = crypto_rng_ctx(tfm);
-	const u8 *key = seed + DEFAULT_BLK_SZ;
-	const u8 *dt = NULL;
-
-	if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ)
-		return -EINVAL;
-
-	if (slen >= (2 * DEFAULT_BLK_SZ + DEFAULT_PRNG_KSZ))
-		dt = key + DEFAULT_PRNG_KSZ;
-
-	reset_prng_context(prng, key, DEFAULT_PRNG_KSZ, seed, dt);
-
-	if (prng->flags & PRNG_NEED_RESET)
-		return -EINVAL;
-	return 0;
-}
-
-#ifdef CONFIG_CRYPTO_FIPS
-static int fips_cprng_get_random(struct crypto_rng *tfm,
-				 const u8 *src, unsigned int slen,
-				 u8 *rdata, unsigned int dlen)
-{
-	struct prng_context *prng = crypto_rng_ctx(tfm);
-
-	return get_prng_bytes(rdata, dlen, prng, 1);
-}
-
-static int fips_cprng_reset(struct crypto_rng *tfm,
-			    const u8 *seed, unsigned int slen)
-{
-	u8 rdata[DEFAULT_BLK_SZ];
-	const u8 *key = seed + DEFAULT_BLK_SZ;
-	int rc;
-
-	struct prng_context *prng = crypto_rng_ctx(tfm);
-
-	if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ)
-		return -EINVAL;
-
-	/* fips strictly requires seed != key */
-	if (!memcmp(seed, key, DEFAULT_PRNG_KSZ))
-		return -EINVAL;
-
-	rc = cprng_reset(tfm, seed, slen);
-
-	if (!rc)
-		goto out;
-
-	/* this primes our continuity test */
-	rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0);
-	prng->rand_data_valid = DEFAULT_BLK_SZ;
-
-out:
-	return rc;
-}
-#endif
-
-static struct rng_alg rng_algs[] = { {
-	.generate		= cprng_get_random,
-	.seed			= cprng_reset,
-	.seedsize		= DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ,
-	.base			=	{
-		.cra_name		= "stdrng",
-		.cra_driver_name	= "ansi_cprng",
-		.cra_priority		= 100,
-		.cra_ctxsize		= sizeof(struct prng_context),
-		.cra_module		= THIS_MODULE,
-		.cra_init		= cprng_init,
-		.cra_exit		= cprng_exit,
-	}
-#ifdef CONFIG_CRYPTO_FIPS
-}, {
-	.generate		= fips_cprng_get_random,
-	.seed			= fips_cprng_reset,
-	.seedsize		= DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ,
-	.base			=	{
-		.cra_name		= "fips(ansi_cprng)",
-		.cra_driver_name	= "fips_ansi_cprng",
-		.cra_priority		= 300,
-		.cra_ctxsize		= sizeof(struct prng_context),
-		.cra_module		= THIS_MODULE,
-		.cra_init		= cprng_init,
-		.cra_exit		= cprng_exit,
-	}
-#endif
-} };
-
-/* Module initalization */
-static int __init prng_mod_init(void)
-{
-	return crypto_register_rngs(rng_algs, ARRAY_SIZE(rng_algs));
-}
-
-static void __exit prng_mod_fini(void)
-{
-	crypto_unregister_rngs(rng_algs, ARRAY_SIZE(rng_algs));
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Software Pseudo Random Number Generator");
-MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
-module_param(dbg, int, 0);
-MODULE_PARM_DESC(dbg, "Boolean to enable debugging (0/1 == off/on)");
-module_init(prng_mod_init);
-module_exit(prng_mod_fini);
-MODULE_ALIAS_CRYPTO("stdrng");
-MODULE_ALIAS_CRYPTO("ansi_cprng");
-MODULE_IMPORT_NS("CRYPTO_INTERNAL");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index d1d88debbd71..ea58a4f6dd86 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1758,10 +1758,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret = min(ret, tcrypt_test("hmac(streebog512)"));
 		break;
 
-	case 150:
-		ret = min(ret, tcrypt_test("ansi_cprng"));
-		break;
-
 	case 151:
 		ret = min(ret, tcrypt_test("rfc4106(gcm(aes))"));
 		break;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 6a490aaa71b9..dc22b4f28633 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -117,11 +117,6 @@ struct hash_test_suite {
 	unsigned int count;
 };
 
-struct cprng_test_suite {
-	const struct cprng_testvec *vecs;
-	unsigned int count;
-};
-
 struct drbg_test_suite {
 	const struct drbg_testvec *vecs;
 	unsigned int count;
@@ -154,7 +149,6 @@ struct alg_test_desc {
 		struct cipher_test_suite cipher;
 		struct comp_test_suite comp;
 		struct hash_test_suite hash;
-		struct cprng_test_suite cprng;
 		struct drbg_test_suite drbg;
 		struct akcipher_test_suite akcipher;
 		struct sig_test_suite sig;
@@ -3442,68 +3436,6 @@ out:
 	return ret;
 }
 
-static int test_cprng(struct crypto_rng *tfm,
-		      const struct cprng_testvec *template,
-		      unsigned int tcount)
-{
-	const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm));
-	int err = 0, i, j, seedsize;
-	u8 *seed;
-	char result[32];
-
-	seedsize = crypto_rng_seedsize(tfm);
-
-	seed = kmalloc(seedsize, GFP_KERNEL);
-	if (!seed) {
-		printk(KERN_ERR "alg: cprng: Failed to allocate seed space "
-		       "for %s\n", algo);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < tcount; i++) {
-		memset(result, 0, 32);
-
-		memcpy(seed, template[i].v, template[i].vlen);
-		memcpy(seed + template[i].vlen, template[i].key,
-		       template[i].klen);
-		memcpy(seed + template[i].vlen + template[i].klen,
-		       template[i].dt, template[i].dtlen);
-
-		err = crypto_rng_reset(tfm, seed, seedsize);
-		if (err) {
-			printk(KERN_ERR "alg: cprng: Failed to reset rng "
-			       "for %s\n", algo);
-			goto out;
-		}
-
-		for (j = 0; j < template[i].loops; j++) {
-			err = crypto_rng_get_bytes(tfm, result,
-						   template[i].rlen);
-			if (err < 0) {
-				printk(KERN_ERR "alg: cprng: Failed to obtain "
-				       "the correct amount of random data for "
-				       "%s (requested %d)\n", algo,
-				       template[i].rlen);
-				goto out;
-			}
-		}
-
-		err = memcmp(result, template[i].result,
-			     template[i].rlen);
-		if (err) {
-			printk(KERN_ERR "alg: cprng: Test %d failed for %s\n",
-			       i, algo);
-			hexdump(result, template[i].rlen);
-			err = -EINVAL;
-			goto out;
-		}
-	}
-
-out:
-	kfree(seed);
-	return err;
-}
-
 static int alg_test_cipher(const struct alg_test_desc *desc,
 			   const char *driver, u32 type, u32 mask)
 {
@@ -3550,29 +3482,6 @@ static int alg_test_comp(const struct alg_test_desc *desc, const char *driver,
 	return err;
 }
 
-static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
-			  u32 type, u32 mask)
-{
-	struct crypto_rng *rng;
-	int err;
-
-	rng = crypto_alloc_rng(driver, type, mask);
-	if (IS_ERR(rng)) {
-		if (PTR_ERR(rng) == -ENOENT)
-			return 0;
-		printk(KERN_ERR "alg: cprng: Failed to load transform for %s: "
-		       "%ld\n", driver, PTR_ERR(rng));
-		return PTR_ERR(rng);
-	}
-
-	err = test_cprng(rng, desc->suite.cprng.vecs, desc->suite.cprng.count);
-
-	crypto_free_rng(rng);
-
-	return err;
-}
-
-
 static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
 			  const char *driver, u32 type, u32 mask)
 {
@@ -4170,12 +4079,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.aead = __VECS(aegis128_tv_template)
 		}
-	}, {
-		.alg = "ansi_cprng",
-		.test = alg_test_cprng,
-		.suite = {
-			.cprng = __VECS(ansi_cprng_aes_tv_template)
-		}
 	}, {
 		.alg = "authenc(hmac(md5),ecb(cipher_null))",
 		.generic_driver = "authenc(hmac-md5-lib,ecb-cipher_null)",
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 268231227282..7a69185b86e8 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -119,18 +119,6 @@ struct aead_testvec {
 	int crypt_error;
 };
 
-struct cprng_testvec {
-	const char *key;
-	const char *dt;
-	const char *v;
-	const char *result;
-	unsigned char klen;
-	unsigned short dtlen;
-	unsigned short vlen;
-	unsigned short rlen;
-	unsigned short loops;
-};
-
 struct drbg_testvec {
 	const unsigned char *entropy;
 	size_t entropylen;
@@ -22376,100 +22364,6 @@ static const struct aead_testvec aegis128_tv_template[] = {
 	},
 };
 
-/*
- * ANSI X9.31 Continuous Pseudo-Random Number Generator (AES mode)
- * test vectors, taken from Appendix B.2.9 and B.2.10:
- *     http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf
- * Only AES-128 is supported at this time.
- */
-static const struct cprng_testvec ansi_cprng_aes_tv_template[] = {
-	{
-		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
-			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
-		.klen	= 16,
-		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
-			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xf9",
-		.dtlen	= 16,
-		.v	= "\x80\x00\x00\x00\x00\x00\x00\x00"
-			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-		.vlen	= 16,
-		.result	= "\x59\x53\x1e\xd1\x3b\xb0\xc0\x55"
-			  "\x84\x79\x66\x85\xc1\x2f\x76\x41",
-		.rlen	= 16,
-		.loops	= 1,
-	}, {
-		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
-			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
-		.klen	= 16,
-		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
-			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfa",
-		.dtlen	= 16,
-		.v	= "\xc0\x00\x00\x00\x00\x00\x00\x00"
-			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-		.vlen	= 16,
-		.result	= "\x7c\x22\x2c\xf4\xca\x8f\xa2\x4c"
-			  "\x1c\x9c\xb6\x41\xa9\xf3\x22\x0d",
-		.rlen	= 16,
-		.loops	= 1,
-	}, {
-		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
-			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
-		.klen	= 16,
-		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
-			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfb",
-		.dtlen	= 16,
-		.v	= "\xe0\x00\x00\x00\x00\x00\x00\x00"
-			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-		.vlen	= 16,
-		.result	= "\x8a\xaa\x00\x39\x66\x67\x5b\xe5"
-			  "\x29\x14\x28\x81\xa9\x4d\x4e\xc7",
-		.rlen	= 16,
-		.loops	= 1,
-	}, {
-		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
-			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
-		.klen	= 16,
-		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
-			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfc",
-		.dtlen	= 16,
-		.v	= "\xf0\x00\x00\x00\x00\x00\x00\x00"
-			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-		.vlen	= 16,
-		.result	= "\x88\xdd\xa4\x56\x30\x24\x23\xe5"
-			  "\xf6\x9d\xa5\x7e\x7b\x95\xc7\x3a",
-		.rlen	= 16,
-		.loops	= 1,
-	}, {
-		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
-			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
-		.klen	= 16,
-		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
-			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfd",
-		.dtlen	= 16,
-		.v	= "\xf8\x00\x00\x00\x00\x00\x00\x00"
-			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-		.vlen	= 16,
-		.result	= "\x05\x25\x92\x46\x61\x79\xd2\xcb"
-			  "\x78\xc4\x0b\x14\x0a\x5a\x9a\xc8",
-		.rlen	= 16,
-		.loops	= 1,
-	}, {	/* Monte Carlo Test */
-		.key	= "\x9f\x5b\x51\x20\x0b\xf3\x34\xb5"
-			  "\xd8\x2b\xe8\xc3\x72\x55\xc8\x48",
-		.klen	= 16,
-		.dt	= "\x63\x76\xbb\xe5\x29\x02\xba\x3b"
-			  "\x67\xc9\x25\xfa\x70\x1f\x11\xac",
-		.dtlen	= 16,
-		.v	= "\x57\x2c\x8e\x76\x87\x26\x47\x97"
-			  "\x7e\x74\xfb\xdd\xc4\x95\x01\xd1",
-		.vlen	= 16,
-		.result	= "\x48\xe9\xbd\x0d\x06\xee\x18\xfb"
-			  "\xe4\x57\x90\xd5\xc3\xfc\x9b\x73",
-		.rlen	= 16,
-		.loops	= 10000,
-	},
-};
-
 /*
  * SP800-90A DRBG Test vectors from
  * http://csrc.nist.gov/groups/STM/cavp/documents/drbg/drbgtestvectors.zip
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index f8224cc390f8..d451b54b322a 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -169,12 +169,11 @@ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm,
  *
  * The reset function completely re-initializes the random number generator
  * referenced by the cipher handle by clearing the current state. The new state
- * is initialized with the caller provided seed or automatically, depending
- * on the random number generator type (the ANSI X9.31 RNG requires
- * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding).
- * The seed is provided as a parameter to this function call. The provided seed
- * should have the length of the seed size defined for the random number
- * generator as defined by crypto_rng_seedsize.
+ * is initialized with the caller provided seed or automatically, depending on
+ * the random number generator type. (The SP800-90A DRBGs perform an automatic
+ * seeding.) The seed is provided as a parameter to this function call. The
+ * provided seed should have the length of the seed size defined for the random
+ * number generator as defined by crypto_rng_seedsize.
  *
  * Return: 0 if the setting of the key was successful; < 0 if an error occurred
  */
-- 
cgit v1.2.3


From 4dffc9bbffb9ccfcda730d899c97c553599e7ca8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 15 Nov 2025 15:08:16 -0800
Subject: crypto: scatterwalk - Fix memcpy_sglist() to always succeed

The original implementation of memcpy_sglist() was broken because it
didn't handle scatterlists that describe exactly the same memory, which
is a case that many callers rely on.  The current implementation is
broken too because it calls the skcipher_walk functions which can fail.
It ignores any errors from those functions.

Fix it by replacing it with a new implementation written from scratch.
It always succeeds.  It's also a bit faster, since it avoids the
overhead of skcipher_walk.  skcipher_walk includes a lot of
functionality (such as alignmask handling) that's irrelevant here.

Reported-by: Colin Ian King <coking@nvidia.com>
Closes: https://lore.kernel.org/r/20251114122620.111623-1-coking@nvidia.com
Fixes: 131bdceca1f0 ("crypto: scatterwalk - Add memcpy_sglist")
Fixes: 0f8d42bf128d ("crypto: scatterwalk - Move skcipher walk and use it for memcpy_sglist")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/scatterwalk.c         | 97 ++++++++++++++++++++++++++++++++++++++------
 include/crypto/scatterwalk.h | 52 ++++++++++++++----------
 2 files changed, 115 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c
index 1d010e2a1b1a..b95e5974e327 100644
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -101,26 +101,97 @@ void memcpy_to_sglist(struct scatterlist *sg, unsigned int start,
 }
 EXPORT_SYMBOL_GPL(memcpy_to_sglist);
 
+/**
+ * memcpy_sglist() - Copy data from one scatterlist to another
+ * @dst: The destination scatterlist.  Can be NULL if @nbytes == 0.
+ * @src: The source scatterlist.  Can be NULL if @nbytes == 0.
+ * @nbytes: Number of bytes to copy
+ *
+ * The scatterlists can describe exactly the same memory, in which case this
+ * function is a no-op.  No other overlaps are supported.
+ *
+ * Context: Any context
+ */
 void memcpy_sglist(struct scatterlist *dst, struct scatterlist *src,
 		   unsigned int nbytes)
 {
-	struct skcipher_walk walk = {};
+	unsigned int src_offset, dst_offset;
 
-	if (unlikely(nbytes == 0)) /* in case sg == NULL */
+	if (unlikely(nbytes == 0)) /* in case src and/or dst is NULL */
 		return;
 
-	walk.total = nbytes;
-
-	scatterwalk_start(&walk.in, src);
-	scatterwalk_start(&walk.out, dst);
+	src_offset = src->offset;
+	dst_offset = dst->offset;
+	for (;;) {
+		/* Compute the length to copy this step. */
+		unsigned int len = min3(src->offset + src->length - src_offset,
+					dst->offset + dst->length - dst_offset,
+					nbytes);
+		struct page *src_page = sg_page(src);
+		struct page *dst_page = sg_page(dst);
+		const void *src_virt;
+		void *dst_virt;
+
+		if (IS_ENABLED(CONFIG_HIGHMEM)) {
+			/* HIGHMEM: we may have to actually map the pages. */
+			const unsigned int src_oip = offset_in_page(src_offset);
+			const unsigned int dst_oip = offset_in_page(dst_offset);
+			const unsigned int limit = PAGE_SIZE;
+
+			/* Further limit len to not cross a page boundary. */
+			len = min3(len, limit - src_oip, limit - dst_oip);
+
+			/* Compute the source and destination pages. */
+			src_page += src_offset / PAGE_SIZE;
+			dst_page += dst_offset / PAGE_SIZE;
+
+			if (src_page != dst_page) {
+				/* Copy between different pages. */
+				memcpy_page(dst_page, dst_oip,
+					    src_page, src_oip, len);
+				flush_dcache_page(dst_page);
+			} else if (src_oip != dst_oip) {
+				/* Copy between different parts of same page. */
+				dst_virt = kmap_local_page(dst_page);
+				memcpy(dst_virt + dst_oip, dst_virt + src_oip,
+				       len);
+				kunmap_local(dst_virt);
+				flush_dcache_page(dst_page);
+			} /* Else, it's the same memory.  No action needed. */
+		} else {
+			/*
+			 * !HIGHMEM: no mapping needed.  Just work in the linear
+			 * buffer of each sg entry.  Note that we can cross page
+			 * boundaries, as they are not significant in this case.
+			 */
+			src_virt = page_address(src_page) + src_offset;
+			dst_virt = page_address(dst_page) + dst_offset;
+			if (src_virt != dst_virt) {
+				memcpy(dst_virt, src_virt, len);
+				if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+					__scatterwalk_flush_dcache_pages(
+						dst_page, dst_offset, len);
+			} /* Else, it's the same memory.  No action needed. */
+		}
+		nbytes -= len;
+		if (nbytes == 0) /* No more to copy? */
+			break;
 
-	skcipher_walk_first(&walk, true);
-	do {
-		if (walk.src.virt.addr != walk.dst.virt.addr)
-			memcpy(walk.dst.virt.addr, walk.src.virt.addr,
-			       walk.nbytes);
-		skcipher_walk_done(&walk, 0);
-	} while (walk.nbytes);
+		/*
+		 * There's more to copy.  Advance the offsets by the length
+		 * copied this step, and advance the sg entries as needed.
+		 */
+		src_offset += len;
+		if (src_offset >= src->offset + src->length) {
+			src = sg_next(src);
+			src_offset = src->offset;
+		}
+		dst_offset += len;
+		if (dst_offset >= dst->offset + dst->length) {
+			dst = sg_next(dst);
+			dst_offset = dst->offset;
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(memcpy_sglist);
 
diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h
index 83d14376ff2b..f485454e3955 100644
--- a/include/crypto/scatterwalk.h
+++ b/include/crypto/scatterwalk.h
@@ -227,6 +227,34 @@ static inline void scatterwalk_done_src(struct scatter_walk *walk,
 	scatterwalk_advance(walk, nbytes);
 }
 
+/*
+ * Flush the dcache of any pages that overlap the region
+ * [offset, offset + nbytes) relative to base_page.
+ *
+ * This should be called only when ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE, to ensure
+ * that all relevant code (including the call to sg_page() in the caller, if
+ * applicable) gets fully optimized out when !ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
+ */
+static inline void __scatterwalk_flush_dcache_pages(struct page *base_page,
+						    unsigned int offset,
+						    unsigned int nbytes)
+{
+	unsigned int num_pages;
+
+	base_page += offset / PAGE_SIZE;
+	offset %= PAGE_SIZE;
+
+	/*
+	 * This is an overflow-safe version of
+	 * num_pages = DIV_ROUND_UP(offset + nbytes, PAGE_SIZE).
+	 */
+	num_pages = nbytes / PAGE_SIZE;
+	num_pages += DIV_ROUND_UP(offset + (nbytes % PAGE_SIZE), PAGE_SIZE);
+
+	for (unsigned int i = 0; i < num_pages; i++)
+		flush_dcache_page(base_page + i);
+}
+
 /**
  * scatterwalk_done_dst() - Finish one step of a walk of destination scatterlist
  * @walk: the scatter_walk
@@ -240,27 +268,9 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk,
 					unsigned int nbytes)
 {
 	scatterwalk_unmap(walk);
-	/*
-	 * Explicitly check ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE instead of just
-	 * relying on flush_dcache_page() being a no-op when not implemented,
-	 * since otherwise the BUG_ON in sg_page() does not get optimized out.
-	 * This also avoids having to consider whether the loop would get
-	 * reliably optimized out or not.
-	 */
-	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) {
-		struct page *base_page;
-		unsigned int offset;
-		int start, end, i;
-
-		base_page = sg_page(walk->sg);
-		offset = walk->offset;
-		start = offset >> PAGE_SHIFT;
-		end = start + (nbytes >> PAGE_SHIFT);
-		end += (offset_in_page(offset) + offset_in_page(nbytes) +
-			PAGE_SIZE - 1) >> PAGE_SHIFT;
-		for (i = start; i < end; i++)
-			flush_dcache_page(base_page + i);
-	}
+	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+		__scatterwalk_flush_dcache_pages(sg_page(walk->sg),
+						 walk->offset, nbytes);
 	scatterwalk_advance(walk, nbytes);
 }
 
-- 
cgit v1.2.3


From 20d868a77f11ba050fe96e7b8efb8ec3b6f2737f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 15 Nov 2025 15:08:17 -0800
Subject: Revert "crypto: scatterwalk - Move skcipher walk and use it for
 memcpy_sglist"

This reverts commit 0f8d42bf128d349ad490e87d5574d211245e40f1, with the
memcpy_sglist() part dropped.

Now that memcpy_sglist() no longer uses the skcipher_walk code, the
skcipher_walk code can be moved back to where it belongs.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/scatterwalk.c               | 248 -----------------------------------
 crypto/skcipher.c                  | 261 ++++++++++++++++++++++++++++++++++++-
 include/crypto/algapi.h            |  12 ++
 include/crypto/internal/skcipher.h |  48 ++++++-
 include/crypto/scatterwalk.h       |  65 +--------
 5 files changed, 316 insertions(+), 318 deletions(-)

(limited to 'include')

diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c
index b95e5974e327..be0e24843806 100644
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -10,25 +10,10 @@
  */
 
 #include <crypto/scatterwalk.h>
-#include <linux/crypto.h>
-#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/scatterlist.h>
-#include <linux/slab.h>
-
-enum {
-	SKCIPHER_WALK_SLOW = 1 << 0,
-	SKCIPHER_WALK_COPY = 1 << 1,
-	SKCIPHER_WALK_DIFF = 1 << 2,
-	SKCIPHER_WALK_SLEEP = 1 << 3,
-};
-
-static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk)
-{
-	return walk->flags & SKCIPHER_WALK_SLEEP ? GFP_KERNEL : GFP_ATOMIC;
-}
 
 void scatterwalk_skip(struct scatter_walk *walk, unsigned int nbytes)
 {
@@ -217,236 +202,3 @@ struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2],
 	return dst;
 }
 EXPORT_SYMBOL_GPL(scatterwalk_ffwd);
-
-static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize)
-{
-	unsigned alignmask = walk->alignmask;
-	unsigned n;
-	void *buffer;
-
-	if (!walk->buffer)
-		walk->buffer = walk->page;
-	buffer = walk->buffer;
-	if (!buffer) {
-		/* Min size for a buffer of bsize bytes aligned to alignmask */
-		n = bsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
-
-		buffer = kzalloc(n, skcipher_walk_gfp(walk));
-		if (!buffer)
-			return skcipher_walk_done(walk, -ENOMEM);
-		walk->buffer = buffer;
-	}
-
-	buffer = PTR_ALIGN(buffer, alignmask + 1);
-	memcpy_from_scatterwalk(buffer, &walk->in, bsize);
-	walk->out.__addr = buffer;
-	walk->in.__addr = walk->out.addr;
-
-	walk->nbytes = bsize;
-	walk->flags |= SKCIPHER_WALK_SLOW;
-
-	return 0;
-}
-
-static int skcipher_next_copy(struct skcipher_walk *walk)
-{
-	void *tmp = walk->page;
-
-	scatterwalk_map(&walk->in);
-	memcpy(tmp, walk->in.addr, walk->nbytes);
-	scatterwalk_unmap(&walk->in);
-	/*
-	 * walk->in is advanced later when the number of bytes actually
-	 * processed (which might be less than walk->nbytes) is known.
-	 */
-
-	walk->in.__addr = tmp;
-	walk->out.__addr = tmp;
-	return 0;
-}
-
-static int skcipher_next_fast(struct skcipher_walk *walk)
-{
-	unsigned long diff;
-
-	diff = offset_in_page(walk->in.offset) -
-	       offset_in_page(walk->out.offset);
-	diff |= (u8 *)(sg_page(walk->in.sg) + (walk->in.offset >> PAGE_SHIFT)) -
-		(u8 *)(sg_page(walk->out.sg) + (walk->out.offset >> PAGE_SHIFT));
-
-	scatterwalk_map(&walk->out);
-	walk->in.__addr = walk->out.__addr;
-
-	if (diff) {
-		walk->flags |= SKCIPHER_WALK_DIFF;
-		scatterwalk_map(&walk->in);
-	}
-
-	return 0;
-}
-
-static int skcipher_walk_next(struct skcipher_walk *walk)
-{
-	unsigned int bsize;
-	unsigned int n;
-
-	n = walk->total;
-	bsize = min(walk->stride, max(n, walk->blocksize));
-	n = scatterwalk_clamp(&walk->in, n);
-	n = scatterwalk_clamp(&walk->out, n);
-
-	if (unlikely(n < bsize)) {
-		if (unlikely(walk->total < walk->blocksize))
-			return skcipher_walk_done(walk, -EINVAL);
-
-slow_path:
-		return skcipher_next_slow(walk, bsize);
-	}
-	walk->nbytes = n;
-
-	if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) {
-		if (!walk->page) {
-			gfp_t gfp = skcipher_walk_gfp(walk);
-
-			walk->page = (void *)__get_free_page(gfp);
-			if (!walk->page)
-				goto slow_path;
-		}
-		walk->flags |= SKCIPHER_WALK_COPY;
-		return skcipher_next_copy(walk);
-	}
-
-	return skcipher_next_fast(walk);
-}
-
-static int skcipher_copy_iv(struct skcipher_walk *walk)
-{
-	unsigned alignmask = walk->alignmask;
-	unsigned ivsize = walk->ivsize;
-	unsigned aligned_stride = ALIGN(walk->stride, alignmask + 1);
-	unsigned size;
-	u8 *iv;
-
-	/* Min size for a buffer of stride + ivsize, aligned to alignmask */
-	size = aligned_stride + ivsize +
-	       (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
-
-	walk->buffer = kmalloc(size, skcipher_walk_gfp(walk));
-	if (!walk->buffer)
-		return -ENOMEM;
-
-	iv = PTR_ALIGN(walk->buffer, alignmask + 1) + aligned_stride;
-
-	walk->iv = memcpy(iv, walk->iv, walk->ivsize);
-	return 0;
-}
-
-int skcipher_walk_first(struct skcipher_walk *walk, bool atomic)
-{
-	if (WARN_ON_ONCE(in_hardirq()))
-		return -EDEADLK;
-
-	walk->flags = atomic ? 0 : SKCIPHER_WALK_SLEEP;
-
-	walk->buffer = NULL;
-	if (unlikely(((unsigned long)walk->iv & walk->alignmask))) {
-		int err = skcipher_copy_iv(walk);
-		if (err)
-			return err;
-	}
-
-	walk->page = NULL;
-
-	return skcipher_walk_next(walk);
-}
-EXPORT_SYMBOL_GPL(skcipher_walk_first);
-
-/**
- * skcipher_walk_done() - finish one step of a skcipher_walk
- * @walk: the skcipher_walk
- * @res: number of bytes *not* processed (>= 0) from walk->nbytes,
- *	 or a -errno value to terminate the walk due to an error
- *
- * This function cleans up after one step of walking through the source and
- * destination scatterlists, and advances to the next step if applicable.
- * walk->nbytes is set to the number of bytes available in the next step,
- * walk->total is set to the new total number of bytes remaining, and
- * walk->{src,dst}.virt.addr is set to the next pair of data pointers.  If there
- * is no more data, or if an error occurred (i.e. -errno return), then
- * walk->nbytes and walk->total are set to 0 and all resources owned by the
- * skcipher_walk are freed.
- *
- * Return: 0 or a -errno value.  If @res was a -errno value then it will be
- *	   returned, but other errors may occur too.
- */
-int skcipher_walk_done(struct skcipher_walk *walk, int res)
-{
-	unsigned int n = walk->nbytes; /* num bytes processed this step */
-	unsigned int total = 0; /* new total remaining */
-
-	if (!n)
-		goto finish;
-
-	if (likely(res >= 0)) {
-		n -= res; /* subtract num bytes *not* processed */
-		total = walk->total - n;
-	}
-
-	if (likely(!(walk->flags & (SKCIPHER_WALK_SLOW |
-				    SKCIPHER_WALK_COPY |
-				    SKCIPHER_WALK_DIFF)))) {
-		scatterwalk_advance(&walk->in, n);
-	} else if (walk->flags & SKCIPHER_WALK_DIFF) {
-		scatterwalk_done_src(&walk->in, n);
-	} else if (walk->flags & SKCIPHER_WALK_COPY) {
-		scatterwalk_advance(&walk->in, n);
-		scatterwalk_map(&walk->out);
-		memcpy(walk->out.addr, walk->page, n);
-	} else { /* SKCIPHER_WALK_SLOW */
-		if (res > 0) {
-			/*
-			 * Didn't process all bytes.  Either the algorithm is
-			 * broken, or this was the last step and it turned out
-			 * the message wasn't evenly divisible into blocks but
-			 * the algorithm requires it.
-			 */
-			res = -EINVAL;
-			total = 0;
-		} else
-			memcpy_to_scatterwalk(&walk->out, walk->out.addr, n);
-		goto dst_done;
-	}
-
-	scatterwalk_done_dst(&walk->out, n);
-dst_done:
-
-	if (res > 0)
-		res = 0;
-
-	walk->total = total;
-	walk->nbytes = 0;
-
-	if (total) {
-		if (walk->flags & SKCIPHER_WALK_SLEEP)
-			cond_resched();
-		walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY |
-				 SKCIPHER_WALK_DIFF);
-		return skcipher_walk_next(walk);
-	}
-
-finish:
-	/* Short-circuit for the common/fast path. */
-	if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
-		goto out;
-
-	if (walk->iv != walk->oiv)
-		memcpy(walk->oiv, walk->iv, walk->ivsize);
-	if (walk->buffer != walk->page)
-		kfree(walk->buffer);
-	if (walk->page)
-		free_page((unsigned long)walk->page);
-
-out:
-	return res;
-}
-EXPORT_SYMBOL_GPL(skcipher_walk_done);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 8fa5d9686d08..14a820cb06c7 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -17,6 +17,7 @@
 #include <linux/cryptouser.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -27,14 +28,258 @@
 
 #define CRYPTO_ALG_TYPE_SKCIPHER_MASK	0x0000000e
 
+enum {
+	SKCIPHER_WALK_SLOW = 1 << 0,
+	SKCIPHER_WALK_COPY = 1 << 1,
+	SKCIPHER_WALK_DIFF = 1 << 2,
+	SKCIPHER_WALK_SLEEP = 1 << 3,
+};
+
 static const struct crypto_type crypto_skcipher_type;
 
+static int skcipher_walk_next(struct skcipher_walk *walk);
+
+static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk)
+{
+	return walk->flags & SKCIPHER_WALK_SLEEP ? GFP_KERNEL : GFP_ATOMIC;
+}
+
 static inline struct skcipher_alg *__crypto_skcipher_alg(
 	struct crypto_alg *alg)
 {
 	return container_of(alg, struct skcipher_alg, base);
 }
 
+/**
+ * skcipher_walk_done() - finish one step of a skcipher_walk
+ * @walk: the skcipher_walk
+ * @res: number of bytes *not* processed (>= 0) from walk->nbytes,
+ *	 or a -errno value to terminate the walk due to an error
+ *
+ * This function cleans up after one step of walking through the source and
+ * destination scatterlists, and advances to the next step if applicable.
+ * walk->nbytes is set to the number of bytes available in the next step,
+ * walk->total is set to the new total number of bytes remaining, and
+ * walk->{src,dst}.virt.addr is set to the next pair of data pointers.  If there
+ * is no more data, or if an error occurred (i.e. -errno return), then
+ * walk->nbytes and walk->total are set to 0 and all resources owned by the
+ * skcipher_walk are freed.
+ *
+ * Return: 0 or a -errno value.  If @res was a -errno value then it will be
+ *	   returned, but other errors may occur too.
+ */
+int skcipher_walk_done(struct skcipher_walk *walk, int res)
+{
+	unsigned int n = walk->nbytes; /* num bytes processed this step */
+	unsigned int total = 0; /* new total remaining */
+
+	if (!n)
+		goto finish;
+
+	if (likely(res >= 0)) {
+		n -= res; /* subtract num bytes *not* processed */
+		total = walk->total - n;
+	}
+
+	if (likely(!(walk->flags & (SKCIPHER_WALK_SLOW |
+				    SKCIPHER_WALK_COPY |
+				    SKCIPHER_WALK_DIFF)))) {
+		scatterwalk_advance(&walk->in, n);
+	} else if (walk->flags & SKCIPHER_WALK_DIFF) {
+		scatterwalk_done_src(&walk->in, n);
+	} else if (walk->flags & SKCIPHER_WALK_COPY) {
+		scatterwalk_advance(&walk->in, n);
+		scatterwalk_map(&walk->out);
+		memcpy(walk->out.addr, walk->page, n);
+	} else { /* SKCIPHER_WALK_SLOW */
+		if (res > 0) {
+			/*
+			 * Didn't process all bytes.  Either the algorithm is
+			 * broken, or this was the last step and it turned out
+			 * the message wasn't evenly divisible into blocks but
+			 * the algorithm requires it.
+			 */
+			res = -EINVAL;
+			total = 0;
+		} else
+			memcpy_to_scatterwalk(&walk->out, walk->out.addr, n);
+		goto dst_done;
+	}
+
+	scatterwalk_done_dst(&walk->out, n);
+dst_done:
+
+	if (res > 0)
+		res = 0;
+
+	walk->total = total;
+	walk->nbytes = 0;
+
+	if (total) {
+		if (walk->flags & SKCIPHER_WALK_SLEEP)
+			cond_resched();
+		walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY |
+				 SKCIPHER_WALK_DIFF);
+		return skcipher_walk_next(walk);
+	}
+
+finish:
+	/* Short-circuit for the common/fast path. */
+	if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
+		goto out;
+
+	if (walk->iv != walk->oiv)
+		memcpy(walk->oiv, walk->iv, walk->ivsize);
+	if (walk->buffer != walk->page)
+		kfree(walk->buffer);
+	if (walk->page)
+		free_page((unsigned long)walk->page);
+
+out:
+	return res;
+}
+EXPORT_SYMBOL_GPL(skcipher_walk_done);
+
+static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize)
+{
+	unsigned alignmask = walk->alignmask;
+	unsigned n;
+	void *buffer;
+
+	if (!walk->buffer)
+		walk->buffer = walk->page;
+	buffer = walk->buffer;
+	if (!buffer) {
+		/* Min size for a buffer of bsize bytes aligned to alignmask */
+		n = bsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
+
+		buffer = kzalloc(n, skcipher_walk_gfp(walk));
+		if (!buffer)
+			return skcipher_walk_done(walk, -ENOMEM);
+		walk->buffer = buffer;
+	}
+
+	buffer = PTR_ALIGN(buffer, alignmask + 1);
+	memcpy_from_scatterwalk(buffer, &walk->in, bsize);
+	walk->out.__addr = buffer;
+	walk->in.__addr = walk->out.addr;
+
+	walk->nbytes = bsize;
+	walk->flags |= SKCIPHER_WALK_SLOW;
+
+	return 0;
+}
+
+static int skcipher_next_copy(struct skcipher_walk *walk)
+{
+	void *tmp = walk->page;
+
+	scatterwalk_map(&walk->in);
+	memcpy(tmp, walk->in.addr, walk->nbytes);
+	scatterwalk_unmap(&walk->in);
+	/*
+	 * walk->in is advanced later when the number of bytes actually
+	 * processed (which might be less than walk->nbytes) is known.
+	 */
+
+	walk->in.__addr = tmp;
+	walk->out.__addr = tmp;
+	return 0;
+}
+
+static int skcipher_next_fast(struct skcipher_walk *walk)
+{
+	unsigned long diff;
+
+	diff = offset_in_page(walk->in.offset) -
+	       offset_in_page(walk->out.offset);
+	diff |= (u8 *)(sg_page(walk->in.sg) + (walk->in.offset >> PAGE_SHIFT)) -
+		(u8 *)(sg_page(walk->out.sg) + (walk->out.offset >> PAGE_SHIFT));
+
+	scatterwalk_map(&walk->out);
+	walk->in.__addr = walk->out.__addr;
+
+	if (diff) {
+		walk->flags |= SKCIPHER_WALK_DIFF;
+		scatterwalk_map(&walk->in);
+	}
+
+	return 0;
+}
+
+static int skcipher_walk_next(struct skcipher_walk *walk)
+{
+	unsigned int bsize;
+	unsigned int n;
+
+	n = walk->total;
+	bsize = min(walk->stride, max(n, walk->blocksize));
+	n = scatterwalk_clamp(&walk->in, n);
+	n = scatterwalk_clamp(&walk->out, n);
+
+	if (unlikely(n < bsize)) {
+		if (unlikely(walk->total < walk->blocksize))
+			return skcipher_walk_done(walk, -EINVAL);
+
+slow_path:
+		return skcipher_next_slow(walk, bsize);
+	}
+	walk->nbytes = n;
+
+	if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) {
+		if (!walk->page) {
+			gfp_t gfp = skcipher_walk_gfp(walk);
+
+			walk->page = (void *)__get_free_page(gfp);
+			if (!walk->page)
+				goto slow_path;
+		}
+		walk->flags |= SKCIPHER_WALK_COPY;
+		return skcipher_next_copy(walk);
+	}
+
+	return skcipher_next_fast(walk);
+}
+
+static int skcipher_copy_iv(struct skcipher_walk *walk)
+{
+	unsigned alignmask = walk->alignmask;
+	unsigned ivsize = walk->ivsize;
+	unsigned aligned_stride = ALIGN(walk->stride, alignmask + 1);
+	unsigned size;
+	u8 *iv;
+
+	/* Min size for a buffer of stride + ivsize, aligned to alignmask */
+	size = aligned_stride + ivsize +
+	       (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
+
+	walk->buffer = kmalloc(size, skcipher_walk_gfp(walk));
+	if (!walk->buffer)
+		return -ENOMEM;
+
+	iv = PTR_ALIGN(walk->buffer, alignmask + 1) + aligned_stride;
+
+	walk->iv = memcpy(iv, walk->iv, walk->ivsize);
+	return 0;
+}
+
+static int skcipher_walk_first(struct skcipher_walk *walk)
+{
+	if (WARN_ON_ONCE(in_hardirq()))
+		return -EDEADLK;
+
+	walk->buffer = NULL;
+	if (unlikely(((unsigned long)walk->iv & walk->alignmask))) {
+		int err = skcipher_copy_iv(walk);
+		if (err)
+			return err;
+	}
+
+	walk->page = NULL;
+
+	return skcipher_walk_next(walk);
+}
+
 int skcipher_walk_virt(struct skcipher_walk *__restrict walk,
 		       struct skcipher_request *__restrict req, bool atomic)
 {
@@ -49,8 +294,10 @@ int skcipher_walk_virt(struct skcipher_walk *__restrict walk,
 	walk->nbytes = 0;
 	walk->iv = req->iv;
 	walk->oiv = req->iv;
-	if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP))
-		atomic = true;
+	if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic)
+		walk->flags = SKCIPHER_WALK_SLEEP;
+	else
+		walk->flags = 0;
 
 	if (unlikely(!walk->total))
 		return 0;
@@ -67,7 +314,7 @@ int skcipher_walk_virt(struct skcipher_walk *__restrict walk,
 	else
 		walk->stride = alg->walksize;
 
-	return skcipher_walk_first(walk, atomic);
+	return skcipher_walk_first(walk);
 }
 EXPORT_SYMBOL_GPL(skcipher_walk_virt);
 
@@ -80,8 +327,10 @@ static int skcipher_walk_aead_common(struct skcipher_walk *__restrict walk,
 	walk->nbytes = 0;
 	walk->iv = req->iv;
 	walk->oiv = req->iv;
-	if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP))
-		atomic = true;
+	if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic)
+		walk->flags = SKCIPHER_WALK_SLEEP;
+	else
+		walk->flags = 0;
 
 	if (unlikely(!walk->total))
 		return 0;
@@ -94,7 +343,7 @@ static int skcipher_walk_aead_common(struct skcipher_walk *__restrict walk,
 	walk->ivsize = crypto_aead_ivsize(tfm);
 	walk->alignmask = crypto_aead_alignmask(tfm);
 
-	return skcipher_walk_first(walk, atomic);
+	return skcipher_walk_first(walk);
 }
 
 int skcipher_walk_aead_encrypt(struct skcipher_walk *__restrict walk,
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index fc4574940636..05deea9dac5e 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -107,6 +107,18 @@ struct crypto_queue {
 	unsigned int max_qlen;
 };
 
+struct scatter_walk {
+	/* Must be the first member, see struct skcipher_walk. */
+	union {
+		void *const addr;
+
+		/* Private API field, do not touch. */
+		union crypto_no_such_thing *__addr;
+	};
+	struct scatterlist *sg;
+	unsigned int offset;
+};
+
 struct crypto_attr_alg {
 	char name[CRYPTO_MAX_ALG_NAME];
 };
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index d5aa535263f6..0cad8e7364c8 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -10,7 +10,6 @@
 
 #include <crypto/algapi.h>
 #include <crypto/internal/cipher.h>
-#include <crypto/scatterwalk.h>
 #include <crypto/skcipher.h>
 #include <linux/types.h>
 
@@ -55,6 +54,47 @@ struct crypto_lskcipher_spawn {
 	struct crypto_spawn base;
 };
 
+struct skcipher_walk {
+	union {
+		/* Virtual address of the source. */
+		struct {
+			struct {
+				const void *const addr;
+			} virt;
+		} src;
+
+		/* Private field for the API, do not use. */
+		struct scatter_walk in;
+	};
+
+	union {
+		/* Virtual address of the destination. */
+		struct {
+			struct {
+				void *const addr;
+			} virt;
+		} dst;
+
+		/* Private field for the API, do not use. */
+		struct scatter_walk out;
+	};
+
+	unsigned int nbytes;
+	unsigned int total;
+
+	u8 *page;
+	u8 *buffer;
+	u8 *oiv;
+	void *iv;
+
+	unsigned int ivsize;
+
+	int flags;
+	unsigned int blocksize;
+	unsigned int stride;
+	unsigned int alignmask;
+};
+
 static inline struct crypto_instance *skcipher_crypto_instance(
 	struct skcipher_instance *inst)
 {
@@ -171,6 +211,7 @@ void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count);
 int lskcipher_register_instance(struct crypto_template *tmpl,
 				struct lskcipher_instance *inst);
 
+int skcipher_walk_done(struct skcipher_walk *walk, int res);
 int skcipher_walk_virt(struct skcipher_walk *__restrict walk,
 		       struct skcipher_request *__restrict req,
 		       bool atomic);
@@ -181,6 +222,11 @@ int skcipher_walk_aead_decrypt(struct skcipher_walk *__restrict walk,
 			       struct aead_request *__restrict req,
 			       bool atomic);
 
+static inline void skcipher_walk_abort(struct skcipher_walk *walk)
+{
+	skcipher_walk_done(walk, -ECANCELED);
+}
+
 static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm)
 {
 	return crypto_tfm_ctx(&tfm->base);
diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h
index f485454e3955..624fab589c2c 100644
--- a/include/crypto/scatterwalk.h
+++ b/include/crypto/scatterwalk.h
@@ -11,64 +11,11 @@
 #ifndef _CRYPTO_SCATTERWALK_H
 #define _CRYPTO_SCATTERWALK_H
 
-#include <linux/errno.h>
+#include <crypto/algapi.h>
+
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
-#include <linux/types.h>
-
-struct scatter_walk {
-	/* Must be the first member, see struct skcipher_walk. */
-	union {
-		void *const addr;
-
-		/* Private API field, do not touch. */
-		union crypto_no_such_thing *__addr;
-	};
-	struct scatterlist *sg;
-	unsigned int offset;
-};
-
-struct skcipher_walk {
-	union {
-		/* Virtual address of the source. */
-		struct {
-			struct {
-				const void *const addr;
-			} virt;
-		} src;
-
-		/* Private field for the API, do not use. */
-		struct scatter_walk in;
-	};
-
-	union {
-		/* Virtual address of the destination. */
-		struct {
-			struct {
-				void *const addr;
-			} virt;
-		} dst;
-
-		/* Private field for the API, do not use. */
-		struct scatter_walk out;
-	};
-
-	unsigned int nbytes;
-	unsigned int total;
-
-	u8 *page;
-	u8 *buffer;
-	u8 *oiv;
-	void *iv;
-
-	unsigned int ivsize;
-
-	int flags;
-	unsigned int blocksize;
-	unsigned int stride;
-	unsigned int alignmask;
-};
 
 static inline void scatterwalk_crypto_chain(struct scatterlist *head,
 					    struct scatterlist *sg, int num)
@@ -306,12 +253,4 @@ struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2],
 				     struct scatterlist *src,
 				     unsigned int len);
 
-int skcipher_walk_first(struct skcipher_walk *walk, bool atomic);
-int skcipher_walk_done(struct skcipher_walk *walk, int res);
-
-static inline void skcipher_walk_abort(struct skcipher_walk *walk)
-{
-	skcipher_walk_done(walk, -ECANCELED);
-}
-
 #endif  /* _CRYPTO_SCATTERWALK_H */
-- 
cgit v1.2.3


From 4167096cb964325ed88cd558f5b0c61fcaab44c1 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Mon, 17 Nov 2025 20:04:09 +0000
Subject: bpf: support nested rcu critical sections

Currently, nested rcu critical sections are rejected by the verifier and
rcu_lock state is managed by a boolean variable. Add support for nested
rcu critical sections by making active_rcu_locks a counter similar to
active_preempt_locks. bpf_rcu_read_lock() increments this counter and
bpf_rcu_read_unlock() decrements it, MEM_RCU -> PTR_UNTRUSTED transition
happens when active_rcu_locks drops to 0.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251117200411.25563-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |  2 +-
 kernel/bpf/verifier.c                              | 47 ++++++++++------------
 .../selftests/bpf/prog_tests/rcu_read_lock.c       |  2 +-
 3 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8d0b60fa5f2b..130bcbd66f60 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -416,7 +416,7 @@ struct bpf_verifier_state {
 	u32 active_irq_id;
 	u32 active_lock_id;
 	void *active_lock_ptr;
-	bool active_rcu_lock;
+	u32 active_rcu_locks;
 
 	bool speculative;
 	bool in_sleepable;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0828718a8ba7..2e170be647bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1437,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf
 	dst->acquired_refs = src->acquired_refs;
 	dst->active_locks = src->active_locks;
 	dst->active_preempt_locks = src->active_preempt_locks;
-	dst->active_rcu_lock = src->active_rcu_lock;
+	dst->active_rcu_locks = src->active_rcu_locks;
 	dst->active_irq_id = src->active_irq_id;
 	dst->active_lock_id = src->active_lock_id;
 	dst->active_lock_ptr = src->active_lock_ptr;
@@ -5889,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env)
  */
 static bool in_rcu_cs(struct bpf_verifier_env *env)
 {
-	return env->cur_state->active_rcu_lock ||
+	return env->cur_state->active_rcu_locks ||
 	       env->cur_state->active_locks ||
 	       !in_sleepable(env);
 }
@@ -10744,7 +10744,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 
 		if (env->subprog_info[subprog].might_sleep &&
-		    (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
+		    (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
 		     env->cur_state->active_irq_id || !in_sleepable(env))) {
 			verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
 				     "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
@@ -11327,7 +11327,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
 		return -EINVAL;
 	}
 
-	if (check_lock && env->cur_state->active_rcu_lock) {
+	if (check_lock && env->cur_state->active_rcu_locks) {
 		verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
 		return -EINVAL;
 	}
@@ -11465,7 +11465,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
 /* Check if we're in a sleepable context. */
 static inline bool in_sleepable_context(struct bpf_verifier_env *env)
 {
-	return !env->cur_state->active_rcu_lock &&
+	return !env->cur_state->active_rcu_locks &&
 	       !env->cur_state->active_preempt_locks &&
 	       !env->cur_state->active_irq_id &&
 	       in_sleepable(env);
@@ -11531,7 +11531,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return err;
 	}
 
-	if (env->cur_state->active_rcu_lock) {
+	if (env->cur_state->active_rcu_locks) {
 		if (fn->might_sleep) {
 			verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
 				func_id_name(func_id), func_id);
@@ -14038,36 +14038,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
 	preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
 
-	if (env->cur_state->active_rcu_lock) {
+	if (rcu_lock) {
+		env->cur_state->active_rcu_locks++;
+	} else if (rcu_unlock) {
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
 		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
 
-		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
-			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
-			return -EACCES;
-		}
-
-		if (rcu_lock) {
-			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+		if (env->cur_state->active_rcu_locks == 0) {
+			verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
 			return -EINVAL;
-		} else if (rcu_unlock) {
+		}
+		if (--env->cur_state->active_rcu_locks == 0) {
 			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
 				if (reg->type & MEM_RCU) {
 					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
 					reg->type |= PTR_UNTRUSTED;
 				}
 			}));
-			env->cur_state->active_rcu_lock = false;
-		} else if (sleepable) {
-			verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
-			return -EACCES;
 		}
-	} else if (rcu_lock) {
-		env->cur_state->active_rcu_lock = true;
-	} else if (rcu_unlock) {
-		verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
-		return -EINVAL;
+	} else if (sleepable && env->cur_state->active_rcu_locks) {
+		verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+		return -EACCES;
+	}
+
+	if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+		verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+		return -EACCES;
 	}
 
 	if (env->cur_state->active_preempt_locks) {
@@ -19387,7 +19384,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
 	if (old->active_preempt_locks != cur->active_preempt_locks)
 		return false;
 
-	if (old->active_rcu_lock != cur->active_rcu_lock)
+	if (old->active_rcu_locks != cur->active_rcu_locks)
 		return false;
 
 	if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
index c9f855e5da24..451a5d9ff4cb 100644
--- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
@@ -28,6 +28,7 @@ static void test_success(void)
 	bpf_program__set_autoload(skel->progs.two_regions, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_1, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_2, true);
+	bpf_program__set_autoload(skel->progs.nested_rcu_region, true);
 	bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true);
 	bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true);
 	bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true);
@@ -78,7 +79,6 @@ static const char * const inproper_region_tests[] = {
 	"non_sleepable_rcu_mismatch",
 	"inproper_sleepable_helper",
 	"inproper_sleepable_kfunc",
-	"nested_rcu_region",
 	"rcu_read_lock_global_subprog_lock",
 	"rcu_read_lock_global_subprog_unlock",
 	"rcu_read_lock_sleepable_helper_global_subprog",
-- 
cgit v1.2.3


From ac529d86ad26d632d3c70b7c5b839282a3294d2f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Nov 2025 09:39:48 +0100
Subject: mempool: add mempool_{alloc,free}_bulk

Add a version of the mempool allocator that works for batch allocations
of multiple objects.  Calling mempool_alloc in a loop is not safe because
it could deadlock if multiple threads are performing such an allocation
at the same time.

As an extra benefit the interface is built so that the same array can be
used for alloc_pages_bulk / release_pages so that at least for page
backed mempools the fast path can use a nice batch optimization.

Note that mempool_alloc_bulk does not take a gfp_mask argument as it
must always be able to sleep and doesn't support any non-trivial
modifiers.  NOFS or NOIO constraints must be set through the scoped API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113084022.1255121-8-hch@lst.de
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mempool.h |   6 ++
 mm/mempool.c            | 177 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 141 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 34941a4b9026..e914fec0e119 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -66,9 +66,15 @@ extern void mempool_destroy(mempool_t *pool);
 extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc;
 #define mempool_alloc(...)						\
 	alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
+int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
+		unsigned int count, unsigned int allocated);
+#define mempool_alloc_bulk(...)						\
+	alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__))
 
 extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
+unsigned int mempool_free_bulk(struct mempool *pool, void **elem,
+		unsigned int count);
 
 /*
  * A mempool_alloc_t and mempool_free_t that get the memory from
diff --git a/mm/mempool.c b/mm/mempool.c
index 6bcc319d547d..b45bcf415147 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -21,11 +21,21 @@
 #include "slab.h"
 
 static DECLARE_FAULT_ATTR(fail_mempool_alloc);
+static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk);
 
 static int __init mempool_faul_inject_init(void)
 {
-	return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc",
+	int error;
+
+	error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc",
 			NULL, &fail_mempool_alloc));
+	if (error)
+		return error;
+
+	/* booting will fail on error return here, don't bother to cleanup */
+	return PTR_ERR_OR_ZERO(
+		fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL,
+		&fail_mempool_alloc_bulk));
 }
 late_initcall(mempool_faul_inject_init);
 
@@ -380,15 +390,22 @@ out:
 }
 EXPORT_SYMBOL(mempool_resize);
 
-static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask)
+static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems,
+		unsigned int count, unsigned int allocated,
+		gfp_t gfp_mask)
 {
 	unsigned long flags;
-	void *element;
+	unsigned int i;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	if (unlikely(!pool->curr_nr))
+	if (unlikely(pool->curr_nr < count - allocated))
 		goto fail;
-	element = remove_element(pool);
+	for (i = 0; i < count; i++) {
+		if (!elems[i]) {
+			elems[i] = remove_element(pool);
+			allocated++;
+		}
+	}
 	spin_unlock_irqrestore(&pool->lock, flags);
 
 	/* Paired with rmb in mempool_free(), read comment there. */
@@ -398,8 +415,9 @@ static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask)
 	 * Update the allocation stack trace as this is more useful for
 	 * debugging.
 	 */
-	kmemleak_update_trace(element);
-	return element;
+	for (i = 0; i < count; i++)
+		kmemleak_update_trace(elems[i]);
+	return allocated;
 
 fail:
 	if (gfp_mask & __GFP_DIRECT_RECLAIM) {
@@ -421,7 +439,7 @@ fail:
 		spin_unlock_irqrestore(&pool->lock, flags);
 	}
 
-	return NULL;
+	return allocated;
 }
 
 /*
@@ -437,6 +455,65 @@ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask)
 	return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
 }
 
+/**
+ * mempool_alloc_bulk - allocate multiple elements from a memory pool
+ * @pool:	pointer to the memory pool
+ * @elems:	partially or fully populated elements array
+ * @count:	number of entries in @elems that need to be allocated
+ * @allocated:	number of entries in @elems already allocated
+ *
+ * Allocate elements for each slot in @elems that is %NULL. This is done by
+ * first calling into the alloc_fn supplied at pool initialization time, and
+ * dipping into the reserved pool when alloc_fn fails to allocate an element.
+ *
+ * On return all @count elements in @elems will be populated.
+ *
+ * Return: Always 0.  If it wasn't for %$#^$ alloc tags, it would return void.
+ */
+int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
+		unsigned int count, unsigned int allocated)
+{
+	gfp_t gfp_mask = GFP_KERNEL;
+	gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
+	unsigned int i = 0;
+
+	VM_WARN_ON_ONCE(count > pool->min_nr);
+	might_alloc(gfp_mask);
+
+	/*
+	 * If an error is injected, fail all elements in a bulk allocation so
+	 * that we stress the multiple elements missing path.
+	 */
+	if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) {
+		pr_info("forcing mempool usage for %pS\n",
+				(void *)_RET_IP_);
+		goto use_pool;
+	}
+
+repeat_alloc:
+	/*
+	 * Try to allocate the elements using the allocation callback first as
+	 * that might succeed even when the caller's bulk allocation did not.
+	 */
+	for (i = 0; i < count; i++) {
+		if (elems[i])
+			continue;
+		elems[i] = pool->alloc(gfp_temp, pool->pool_data);
+		if (unlikely(!elems[i]))
+			goto use_pool;
+		allocated++;
+	}
+
+	return 0;
+
+use_pool:
+	allocated = mempool_alloc_from_pool(pool, elems, count, allocated,
+			gfp_temp);
+	gfp_temp = gfp_mask;
+	goto repeat_alloc;
+}
+EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof);
+
 /**
  * mempool_alloc - allocate an element from a memory pool
  * @pool:	pointer to the memory pool
@@ -478,8 +555,7 @@ repeat_alloc:
 		 * sleep in mempool_alloc_from_pool.  Retry the allocation
 		 * with all flags set in that case.
 		 */
-		element = mempool_alloc_from_pool(pool, gfp_temp);
-		if (!element) {
+		if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) {
 			if (gfp_temp != gfp_mask) {
 				gfp_temp = gfp_mask;
 				goto repeat_alloc;
@@ -508,26 +584,33 @@ EXPORT_SYMBOL(mempool_alloc_noprof);
  */
 void *mempool_alloc_preallocated(mempool_t *pool)
 {
-	return mempool_alloc_from_pool(pool, GFP_NOWAIT);
+	void *element = NULL;
+
+	mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT);
+	return element;
 }
 EXPORT_SYMBOL(mempool_alloc_preallocated);
 
 /**
- * mempool_free - return an element to a mempool
- * @element:	pointer to element
+ * mempool_free_bulk - return elements to a mempool
  * @pool:	pointer to the memory pool
+ * @elems:	elements to return
+ * @count:	number of elements to return
  *
- * Returns @element to @pool if it needs replenishing, else frees it using
- * the free_fn callback in @pool.
+ * Returns a number of elements from the start of @elems to @pool if @pool
+ * needs replenishing and sets their slots in @elems to NULL.  Other elements
+ * are left in @elems.
  *
- * This function only sleeps if the free_fn callback sleeps.
+ * Return: number of elements transferred to @pool.  Elements are always
+ * transferred from the beginning of @elems, so the return value can be used as
+ * an offset into @elems for freeing the remaining elements in the caller.
  */
-void mempool_free(void *element, mempool_t *pool)
+unsigned int mempool_free_bulk(struct mempool *pool, void **elems,
+		unsigned int count)
 {
 	unsigned long flags;
-
-	if (unlikely(element == NULL))
-		return;
+	unsigned int freed = 0;
+	bool added = false;
 
 	/*
 	 * Paired with the wmb in mempool_alloc().  The preceding read is
@@ -561,21 +644,6 @@ void mempool_free(void *element, mempool_t *pool)
 	 * Waiters happen iff curr_nr is 0 and the above guarantee also
 	 * ensures that there will be frees which return elements to the
 	 * pool waking up the waiters.
-	 */
-	if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
-		spin_lock_irqsave(&pool->lock, flags);
-		if (likely(pool->curr_nr < pool->min_nr)) {
-			add_element(pool, element);
-			spin_unlock_irqrestore(&pool->lock, flags);
-			if (wq_has_sleeper(&pool->wait))
-				wake_up(&pool->wait);
-			return;
-		}
-		spin_unlock_irqrestore(&pool->lock, flags);
-	}
-
-	/*
-	 * Handle the min_nr = 0 edge case:
 	 *
 	 * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
 	 * so waiters sleeping on pool->wait would never be woken by the
@@ -583,20 +651,45 @@ void mempool_free(void *element, mempool_t *pool)
 	 * allocation of element when both min_nr and curr_nr are 0, and
 	 * any active waiters are properly awakened.
 	 */
-	if (unlikely(pool->min_nr == 0 &&
+	if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
+		spin_lock_irqsave(&pool->lock, flags);
+		while (pool->curr_nr < pool->min_nr && freed < count) {
+			add_element(pool, elems[freed++]);
+			added = true;
+		}
+		spin_unlock_irqrestore(&pool->lock, flags);
+	} else if (unlikely(pool->min_nr == 0 &&
 		     READ_ONCE(pool->curr_nr) == 0)) {
+		/* Handle the min_nr = 0 edge case: */
 		spin_lock_irqsave(&pool->lock, flags);
 		if (likely(pool->curr_nr == 0)) {
-			add_element(pool, element);
-			spin_unlock_irqrestore(&pool->lock, flags);
-			if (wq_has_sleeper(&pool->wait))
-				wake_up(&pool->wait);
-			return;
+			add_element(pool, elems[freed++]);
+			added = true;
 		}
 		spin_unlock_irqrestore(&pool->lock, flags);
 	}
 
-	pool->free(element, pool->pool_data);
+	if (unlikely(added) && wq_has_sleeper(&pool->wait))
+		wake_up(&pool->wait);
+
+	return freed;
+}
+EXPORT_SYMBOL_GPL(mempool_free_bulk);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element:	element to return
+ * @pool:	pointer to the memory pool
+ *
+ * Returns @element to @pool if it needs replenishing, else frees it using
+ * the free_fn callback in @pool.
+ *
+ * This function only sleeps if the free_fn callback sleeps.
+ */
+void mempool_free(void *element, struct mempool *pool)
+{
+	if (likely(element) && !mempool_free_bulk(pool, &element, 1))
+		pool->free(element, pool->pool_data);
 }
 EXPORT_SYMBOL(mempool_free);
 
-- 
cgit v1.2.3


From 8b41fb80a2cc023591f47d63b094e96af9c2c615 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Nov 2025 09:39:50 +0100
Subject: mempool: remove mempool_{init,create}_kvmalloc_pool

This was added for bcachefs and is unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113084022.1255121-10-hch@lst.de
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mempool.h | 13 -------------
 mm/mempool.c            | 13 -------------
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index e914fec0e119..d9332485e8ca 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -103,19 +103,6 @@ void mempool_kfree(void *element, void *pool_data);
 	mempool_create((_min_nr), mempool_kmalloc, mempool_kfree,	\
 		       (void *)(unsigned long)(_size))
 
-void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
-void mempool_kvfree(void *element, void *pool_data);
-
-static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-	return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
-}
-
-static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
-{
-	return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
-}
-
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
  * allocates pages of the order specified by pool_data
diff --git a/mm/mempool.c b/mm/mempool.c
index 9ec3a04a0130..0e1e015998e7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -728,19 +728,6 @@ void mempool_kfree(void *element, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_kfree);
 
-void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
-{
-	size_t size = (size_t)pool_data;
-	return kvmalloc(size, gfp_mask);
-}
-EXPORT_SYMBOL(mempool_kvmalloc);
-
-void mempool_kvfree(void *element, void *pool_data)
-{
-	kvfree(element);
-}
-EXPORT_SYMBOL(mempool_kvfree);
-
 /*
  * A simple mempool-backed page allocator that allocates pages
  * of the order specified by pool_data.
-- 
cgit v1.2.3


From 0cab6873b7305abdd0acd95ee8cfa56b983500da Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Nov 2025 09:39:51 +0100
Subject: mempool: de-typedef

Switch all uses of the deprecated mempool_t typedef in the core mempool
code to use struct mempool instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113084022.1255121-11-hch@lst.de
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mempool.h | 39 +++++++++++++++++++-------------------
 mm/mempool.c            | 50 +++++++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index d9332485e8ca..e8e440e04a06 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -27,32 +27,31 @@ typedef struct mempool {
 	wait_queue_head_t wait;
 } mempool_t;
 
-static inline bool mempool_initialized(mempool_t *pool)
+static inline bool mempool_initialized(struct mempool *pool)
 {
 	return pool->elements != NULL;
 }
 
-static inline bool mempool_is_saturated(mempool_t *pool)
+static inline bool mempool_is_saturated(struct mempool *pool)
 {
 	return READ_ONCE(pool->curr_nr) >= pool->min_nr;
 }
 
-void mempool_exit(mempool_t *pool);
-int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
-		      mempool_free_t *free_fn, void *pool_data,
-		      gfp_t gfp_mask, int node_id);
-
-int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
-		 mempool_free_t *free_fn, void *pool_data);
+void mempool_exit(struct mempool *pool);
+int mempool_init_node(struct mempool *pool, int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data, gfp_t gfp_mask, int node_id);
+int mempool_init_noprof(struct mempool *pool, int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data);
 #define mempool_init(...)						\
 	alloc_hooks(mempool_init_noprof(__VA_ARGS__))
 
-extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-			mempool_free_t *free_fn, void *pool_data);
-
-extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
-			mempool_free_t *free_fn, void *pool_data,
-			gfp_t gfp_mask, int nid);
+struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+		mempool_free_t *free_fn, void *pool_data);
+struct mempool *mempool_create_node_noprof(int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data, gfp_t gfp_mask, int nid);
 #define mempool_create_node(...)					\
 	alloc_hooks(mempool_create_node_noprof(__VA_ARGS__))
 
@@ -60,10 +59,10 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_
 	mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data,	\
 			    GFP_KERNEL, NUMA_NO_NODE)
 
-extern int mempool_resize(mempool_t *pool, int new_min_nr);
-extern void mempool_destroy(mempool_t *pool);
+int mempool_resize(struct mempool *pool, int new_min_nr);
+void mempool_destroy(struct mempool *pool);
 
-extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc;
+void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc;
 #define mempool_alloc(...)						\
 	alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
 int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
@@ -71,8 +70,8 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
 #define mempool_alloc_bulk(...)						\
 	alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__))
 
-extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
-extern void mempool_free(void *element, mempool_t *pool);
+void *mempool_alloc_preallocated(struct mempool *pool) __malloc;
+void mempool_free(void *element, struct mempool *pool);
 unsigned int mempool_free_bulk(struct mempool *pool, void **elem,
 		unsigned int count);
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 0e1e015998e7..89ab7bba5c9c 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -40,7 +40,7 @@ static int __init mempool_faul_inject_init(void)
 late_initcall(mempool_faul_inject_init);
 
 #ifdef CONFIG_SLUB_DEBUG_ON
-static void poison_error(mempool_t *pool, void *element, size_t size,
+static void poison_error(struct mempool *pool, void *element, size_t size,
 			 size_t byte)
 {
 	const int nr = pool->curr_nr;
@@ -57,7 +57,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size,
 	dump_stack();
 }
 
-static void __check_element(mempool_t *pool, void *element, size_t size)
+static void __check_element(struct mempool *pool, void *element, size_t size)
 {
 	u8 *obj = element;
 	size_t i;
@@ -73,7 +73,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
 	memset(obj, POISON_INUSE, size);
 }
 
-static void check_element(mempool_t *pool, void *element)
+static void check_element(struct mempool *pool, void *element)
 {
 	/* Skip checking: KASAN might save its metadata in the element. */
 	if (kasan_enabled())
@@ -102,7 +102,7 @@ static void __poison_element(void *element, size_t size)
 	obj[size - 1] = POISON_END;
 }
 
-static void poison_element(mempool_t *pool, void *element)
+static void poison_element(struct mempool *pool, void *element)
 {
 	/* Skip poisoning: KASAN might save its metadata in the element. */
 	if (kasan_enabled())
@@ -123,15 +123,16 @@ static void poison_element(mempool_t *pool, void *element)
 	}
 }
 #else /* CONFIG_SLUB_DEBUG_ON */
-static inline void check_element(mempool_t *pool, void *element)
+static inline void check_element(struct mempool *pool, void *element)
 {
 }
-static inline void poison_element(mempool_t *pool, void *element)
+static inline void poison_element(struct mempool *pool, void *element)
 {
 }
 #endif /* CONFIG_SLUB_DEBUG_ON */
 
-static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
+static __always_inline bool kasan_poison_element(struct mempool *pool,
+		void *element)
 {
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
 		return kasan_mempool_poison_object(element);
@@ -141,7 +142,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
 	return true;
 }
 
-static void kasan_unpoison_element(mempool_t *pool, void *element)
+static void kasan_unpoison_element(struct mempool *pool, void *element)
 {
 	if (pool->alloc == mempool_kmalloc)
 		kasan_mempool_unpoison_object(element, (size_t)pool->pool_data);
@@ -153,7 +154,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
 					     (unsigned long)pool->pool_data);
 }
 
-static __always_inline void add_element(mempool_t *pool, void *element)
+static __always_inline void add_element(struct mempool *pool, void *element)
 {
 	BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
 	poison_element(pool, element);
@@ -161,7 +162,7 @@ static __always_inline void add_element(mempool_t *pool, void *element)
 		pool->elements[pool->curr_nr++] = element;
 }
 
-static void *remove_element(mempool_t *pool)
+static void *remove_element(struct mempool *pool)
 {
 	void *element = pool->elements[--pool->curr_nr];
 
@@ -182,7 +183,7 @@ static void *remove_element(mempool_t *pool)
  * May be called on a zeroed but uninitialized mempool (i.e. allocated with
  * kzalloc()).
  */
-void mempool_exit(mempool_t *pool)
+void mempool_exit(struct mempool *pool)
 {
 	while (pool->curr_nr) {
 		void *element = remove_element(pool);
@@ -201,7 +202,7 @@ EXPORT_SYMBOL(mempool_exit);
  * Free all reserved elements in @pool and @pool itself.  This function
  * only sleeps if the free_fn() function sleeps.
  */
-void mempool_destroy(mempool_t *pool)
+void mempool_destroy(struct mempool *pool)
 {
 	if (unlikely(!pool))
 		return;
@@ -211,9 +212,9 @@ void mempool_destroy(mempool_t *pool)
 }
 EXPORT_SYMBOL(mempool_destroy);
 
-int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
-		      mempool_free_t *free_fn, void *pool_data,
-		      gfp_t gfp_mask, int node_id)
+int mempool_init_node(struct mempool *pool, int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data, gfp_t gfp_mask, int node_id)
 {
 	spin_lock_init(&pool->lock);
 	pool->min_nr	= min_nr;
@@ -263,8 +264,9 @@ EXPORT_SYMBOL(mempool_init_node);
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
-			mempool_free_t *free_fn, void *pool_data)
+int mempool_init_noprof(struct mempool *pool, int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data)
 {
 	return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
 				 pool_data, GFP_KERNEL, NUMA_NO_NODE);
@@ -290,11 +292,11 @@ EXPORT_SYMBOL(mempool_init_noprof);
  *
  * Return: pointer to the created memory pool object or %NULL on error.
  */
-mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
-				      mempool_free_t *free_fn, void *pool_data,
-				      gfp_t gfp_mask, int node_id)
+struct mempool *mempool_create_node_noprof(int min_nr,
+		mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+		void *pool_data, gfp_t gfp_mask, int node_id)
 {
-	mempool_t *pool;
+	struct mempool *pool;
 
 	pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
 	if (!pool)
@@ -328,7 +330,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof);
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int mempool_resize(mempool_t *pool, int new_min_nr)
+int mempool_resize(struct mempool *pool, int new_min_nr)
 {
 	void *element;
 	void **new_elements;
@@ -530,7 +532,7 @@ EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof);
  * an element.  Allocation failure can only happen when @gfp_mask does not
  * include %__GFP_DIRECT_RECLAIM.
  */
-void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask)
 {
 	gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
 	void *element;
@@ -582,7 +584,7 @@ EXPORT_SYMBOL(mempool_alloc_noprof);
  * Return: pointer to the allocated element or %NULL if no elements are
  * available.
  */
-void *mempool_alloc_preallocated(mempool_t *pool)
+void *mempool_alloc_preallocated(struct mempool *pool)
 {
 	void *element = NULL;
 
-- 
cgit v1.2.3


From 447c4e8338dbfad517769d26b53d633b88d51184 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Date: Thu, 30 Oct 2025 20:26:28 +0200
Subject: PM / devfreq: Move governor.h to a public header location

Some device drivers (and out-of-tree modules) might want to define
device-specific device governors. Rather than restricting all of them to
be a part of drivers/devfreq/ (which is not possible for out-of-tree
drivers anyway) move governor.h to include/linux/devfreq-governor.h and
update all drivers to use it.

The devfreq_cpu_data is only used internally, by the passive governor,
so it is moved to the driver source rather than being a part of the
public interface.

Reported-by: Robie Basak <robibasa@qti.qualcomm.com>
Acked-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Acked-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Link: https://patchwork.kernel.org/project/linux-pm/patch/20251030-governor-public-v2-1-432a11a9975a@oss.qualcomm.com/
---
 drivers/devfreq/devfreq.c                 |   2 +-
 drivers/devfreq/governor.h                | 127 ------------------------------
 drivers/devfreq/governor_passive.c        |  27 ++++++-
 drivers/devfreq/governor_performance.c    |   2 +-
 drivers/devfreq/governor_powersave.c      |   2 +-
 drivers/devfreq/governor_simpleondemand.c |   2 +-
 drivers/devfreq/governor_userspace.c      |   2 +-
 drivers/devfreq/hisi_uncore_freq.c        |   3 +-
 drivers/devfreq/tegra30-devfreq.c         |   3 +-
 include/linux/devfreq-governor.h          | 102 ++++++++++++++++++++++++
 10 files changed, 135 insertions(+), 137 deletions(-)
 delete mode 100644 drivers/devfreq/governor.h
 create mode 100644 include/linux/devfreq-governor.h

(limited to 'include')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 2e8d01d47f69..00979f2e0e27 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -20,6 +20,7 @@
 #include <linux/stat.h>
 #include <linux/pm_opp.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/workqueue.h>
 #include <linux/platform_device.h>
 #include <linux/list.h>
@@ -28,7 +29,6 @@
 #include <linux/of.h>
 #include <linux/pm_qos.h>
 #include <linux/units.h>
-#include "governor.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/devfreq.h>
diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h
deleted file mode 100644
index 0adfebc0467a..000000000000
--- a/drivers/devfreq/governor.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * governor.h - internal header for devfreq governors.
- *
- * Copyright (C) 2011 Samsung Electronics
- *	MyungJoo Ham <myungjoo.ham@samsung.com>
- *
- * This header is for devfreq governors in drivers/devfreq/
- */
-
-#ifndef _GOVERNOR_H
-#define _GOVERNOR_H
-
-#include <linux/devfreq.h>
-
-#define DEVFREQ_NAME_LEN			16
-
-#define to_devfreq(DEV)	container_of((DEV), struct devfreq, dev)
-
-/* Devfreq events */
-#define DEVFREQ_GOV_START			0x1
-#define DEVFREQ_GOV_STOP			0x2
-#define DEVFREQ_GOV_UPDATE_INTERVAL		0x3
-#define DEVFREQ_GOV_SUSPEND			0x4
-#define DEVFREQ_GOV_RESUME			0x5
-
-#define DEVFREQ_MIN_FREQ			0
-#define DEVFREQ_MAX_FREQ			ULONG_MAX
-
-/*
- * Definition of the governor feature flags
- * - DEVFREQ_GOV_FLAG_IMMUTABLE
- *   : This governor is never changeable to other governors.
- * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN
- *   : The devfreq won't schedule the work for this governor.
- */
-#define DEVFREQ_GOV_FLAG_IMMUTABLE			BIT(0)
-#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN			BIT(1)
-
-/*
- * Definition of governor attribute flags except for common sysfs attributes
- * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL
- *   : Indicate polling_interval sysfs attribute
- * - DEVFREQ_GOV_ATTR_TIMER
- *   : Indicate timer sysfs attribute
- */
-#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL		BIT(0)
-#define DEVFREQ_GOV_ATTR_TIMER				BIT(1)
-
-/**
- * struct devfreq_cpu_data - Hold the per-cpu data
- * @node:	list node
- * @dev:	reference to cpu device.
- * @first_cpu:	the cpumask of the first cpu of a policy.
- * @opp_table:	reference to cpu opp table.
- * @cur_freq:	the current frequency of the cpu.
- * @min_freq:	the min frequency of the cpu.
- * @max_freq:	the max frequency of the cpu.
- *
- * This structure stores the required cpu_data of a cpu.
- * This is auto-populated by the governor.
- */
-struct devfreq_cpu_data {
-	struct list_head node;
-
-	struct device *dev;
-	unsigned int first_cpu;
-
-	struct opp_table *opp_table;
-	unsigned int cur_freq;
-	unsigned int min_freq;
-	unsigned int max_freq;
-};
-
-/**
- * struct devfreq_governor - Devfreq policy governor
- * @node:		list node - contains registered devfreq governors
- * @name:		Governor's name
- * @attrs:		Governor's sysfs attribute flags
- * @flags:		Governor's feature flags
- * @get_target_freq:	Returns desired operating frequency for the device.
- *			Basically, get_target_freq will run
- *			devfreq_dev_profile.get_dev_status() to get the
- *			status of the device (load = busy_time / total_time).
- * @event_handler:      Callback for devfreq core framework to notify events
- *                      to governors. Events include per device governor
- *                      init and exit, opp changes out of devfreq, suspend
- *                      and resume of per device devfreq during device idle.
- *
- * Note that the callbacks are called with devfreq->lock locked by devfreq.
- */
-struct devfreq_governor {
-	struct list_head node;
-
-	const char name[DEVFREQ_NAME_LEN];
-	const u64 attrs;
-	const u64 flags;
-	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
-	int (*event_handler)(struct devfreq *devfreq,
-				unsigned int event, void *data);
-};
-
-void devfreq_monitor_start(struct devfreq *devfreq);
-void devfreq_monitor_stop(struct devfreq *devfreq);
-void devfreq_monitor_suspend(struct devfreq *devfreq);
-void devfreq_monitor_resume(struct devfreq *devfreq);
-void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay);
-
-int devfreq_add_governor(struct devfreq_governor *governor);
-int devfreq_remove_governor(struct devfreq_governor *governor);
-
-int devm_devfreq_add_governor(struct device *dev,
-			      struct devfreq_governor *governor);
-
-int devfreq_update_status(struct devfreq *devfreq, unsigned long freq);
-int devfreq_update_target(struct devfreq *devfreq, unsigned long freq);
-void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq,
-			    unsigned long *max_freq);
-
-static inline int devfreq_update_stats(struct devfreq *df)
-{
-	if (!df->profile->get_dev_status)
-		return -EINVAL;
-
-	return df->profile->get_dev_status(df->dev.parent, &df->last_status);
-}
-#endif /* _GOVERNOR_H */
diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c
index 953cf9a1e9f7..8cd6f9a59f64 100644
--- a/drivers/devfreq/governor_passive.c
+++ b/drivers/devfreq/governor_passive.c
@@ -14,8 +14,33 @@
 #include <linux/slab.h>
 #include <linux/device.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/units.h>
-#include "governor.h"
+
+/**
+ * struct devfreq_cpu_data - Hold the per-cpu data
+ * @node:	list node
+ * @dev:	reference to cpu device.
+ * @first_cpu:	the cpumask of the first cpu of a policy.
+ * @opp_table:	reference to cpu opp table.
+ * @cur_freq:	the current frequency of the cpu.
+ * @min_freq:	the min frequency of the cpu.
+ * @max_freq:	the max frequency of the cpu.
+ *
+ * This structure stores the required cpu_data of a cpu.
+ * This is auto-populated by the governor.
+ */
+struct devfreq_cpu_data {
+	struct list_head node;
+
+	struct device *dev;
+	unsigned int first_cpu;
+
+	struct opp_table *opp_table;
+	unsigned int cur_freq;
+	unsigned int min_freq;
+	unsigned int max_freq;
+};
 
 static struct devfreq_cpu_data *
 get_parent_cpu_data(struct devfreq_passive_data *p_data,
diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c
index 2e4e981446fa..fdb22bf512cf 100644
--- a/drivers/devfreq/governor_performance.c
+++ b/drivers/devfreq/governor_performance.c
@@ -7,8 +7,8 @@
  */
 
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/module.h>
-#include "governor.h"
 
 static int devfreq_performance_func(struct devfreq *df,
 				    unsigned long *freq)
diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c
index f059e8814804..ee2d6ec8a512 100644
--- a/drivers/devfreq/governor_powersave.c
+++ b/drivers/devfreq/governor_powersave.c
@@ -7,8 +7,8 @@
  */
 
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/module.h>
-#include "governor.h"
 
 static int devfreq_powersave_func(struct devfreq *df,
 				  unsigned long *freq)
diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c
index c23435736367..9c69b96df5f9 100644
--- a/drivers/devfreq/governor_simpleondemand.c
+++ b/drivers/devfreq/governor_simpleondemand.c
@@ -9,8 +9,8 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/math64.h>
-#include "governor.h"
 
 /* Default constants for DevFreq-Simple-Ondemand (DFSO) */
 #define DFSO_UPTHRESHOLD	(90)
diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c
index 175de0c0b50e..395174f93960 100644
--- a/drivers/devfreq/governor_userspace.c
+++ b/drivers/devfreq/governor_userspace.c
@@ -9,11 +9,11 @@
 #include <linux/slab.h>
 #include <linux/device.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/kstrtox.h>
 #include <linux/pm.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
-#include "governor.h"
 
 struct userspace_data {
 	unsigned long user_frequency;
diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c
index 96d1815059e3..b8e4621c57eb 100644
--- a/drivers/devfreq/hisi_uncore_freq.c
+++ b/drivers/devfreq/hisi_uncore_freq.c
@@ -9,6 +9,7 @@
 #include <linux/bits.h>
 #include <linux/cleanup.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/device.h>
 #include <linux/dev_printk.h>
 #include <linux/errno.h>
@@ -26,8 +27,6 @@
 #include <linux/units.h>
 #include <acpi/pcc.h>
 
-#include "governor.h"
-
 struct hisi_uncore_pcc_data {
 	u16 status;
 	u16 resv;
diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c
index 4a4f0106ab9d..77cbb204087c 100644
--- a/drivers/devfreq/tegra30-devfreq.c
+++ b/drivers/devfreq/tegra30-devfreq.c
@@ -9,6 +9,7 @@
 #include <linux/clk.h>
 #include <linux/cpufreq.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq-governor.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/irq.h>
@@ -21,8 +22,6 @@
 
 #include <soc/tegra/fuse.h>
 
-#include "governor.h"
-
 #define ACTMON_GLB_STATUS					0x0
 #define ACTMON_GLB_PERIOD_CTRL					0x4
 
diff --git a/include/linux/devfreq-governor.h b/include/linux/devfreq-governor.h
new file mode 100644
index 000000000000..dfdd0160a29f
--- /dev/null
+++ b/include/linux/devfreq-governor.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * devfreq-governor.h - internal header for devfreq governors.
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ *	MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This header is for devfreq governors
+ */
+
+#ifndef __LINUX_DEVFREQ_DEVFREQ_H__
+#define __LINUX_DEVFREQ_DEVFREQ_H__
+
+#include <linux/devfreq.h>
+
+#define DEVFREQ_NAME_LEN			16
+
+#define to_devfreq(DEV)	container_of((DEV), struct devfreq, dev)
+
+/* Devfreq events */
+#define DEVFREQ_GOV_START			0x1
+#define DEVFREQ_GOV_STOP			0x2
+#define DEVFREQ_GOV_UPDATE_INTERVAL		0x3
+#define DEVFREQ_GOV_SUSPEND			0x4
+#define DEVFREQ_GOV_RESUME			0x5
+
+#define DEVFREQ_MIN_FREQ			0
+#define DEVFREQ_MAX_FREQ			ULONG_MAX
+
+/*
+ * Definition of the governor feature flags
+ * - DEVFREQ_GOV_FLAG_IMMUTABLE
+ *   : This governor is never changeable to other governors.
+ * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN
+ *   : The devfreq won't schedule the work for this governor.
+ */
+#define DEVFREQ_GOV_FLAG_IMMUTABLE			BIT(0)
+#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN			BIT(1)
+
+/*
+ * Definition of governor attribute flags except for common sysfs attributes
+ * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL
+ *   : Indicate polling_interval sysfs attribute
+ * - DEVFREQ_GOV_ATTR_TIMER
+ *   : Indicate timer sysfs attribute
+ */
+#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL		BIT(0)
+#define DEVFREQ_GOV_ATTR_TIMER				BIT(1)
+
+/**
+ * struct devfreq_governor - Devfreq policy governor
+ * @node:		list node - contains registered devfreq governors
+ * @name:		Governor's name
+ * @attrs:		Governor's sysfs attribute flags
+ * @flags:		Governor's feature flags
+ * @get_target_freq:	Returns desired operating frequency for the device.
+ *			Basically, get_target_freq will run
+ *			devfreq_dev_profile.get_dev_status() to get the
+ *			status of the device (load = busy_time / total_time).
+ * @event_handler:      Callback for devfreq core framework to notify events
+ *                      to governors. Events include per device governor
+ *                      init and exit, opp changes out of devfreq, suspend
+ *                      and resume of per device devfreq during device idle.
+ *
+ * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */
+struct devfreq_governor {
+	struct list_head node;
+
+	const char name[DEVFREQ_NAME_LEN];
+	const u64 attrs;
+	const u64 flags;
+	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+	int (*event_handler)(struct devfreq *devfreq,
+				unsigned int event, void *data);
+};
+
+void devfreq_monitor_start(struct devfreq *devfreq);
+void devfreq_monitor_stop(struct devfreq *devfreq);
+void devfreq_monitor_suspend(struct devfreq *devfreq);
+void devfreq_monitor_resume(struct devfreq *devfreq);
+void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay);
+
+int devfreq_add_governor(struct devfreq_governor *governor);
+int devfreq_remove_governor(struct devfreq_governor *governor);
+
+int devm_devfreq_add_governor(struct device *dev,
+			      struct devfreq_governor *governor);
+
+int devfreq_update_status(struct devfreq *devfreq, unsigned long freq);
+int devfreq_update_target(struct devfreq *devfreq, unsigned long freq);
+void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq,
+			    unsigned long *max_freq);
+
+static inline int devfreq_update_stats(struct devfreq *df)
+{
+	if (!df->profile->get_dev_status)
+		return -EINVAL;
+
+	return df->profile->get_dev_status(df->dev.parent, &df->last_status);
+}
+#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */
-- 
cgit v1.2.3


From 074e16d58e6b78612c22ff611aa469ee929cc37f Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 23 Nov 2025 06:48:19 +0100
Subject: compiler_types: introduce at_least parameter decoration pseudo
 keyword
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clang and recent gcc support warning if they are able to prove that the
user is passing to a function an array that is too short in size. For
example:

    void blah(unsigned char herp[at_least 7]);
    static void schma(void)
    {
        unsigned char good[] = { 1, 2, 3, 4, 5, 6, 7 };
        unsigned char bad[] = { 1, 2, 3, 4, 5, 6 };
        blah(good);
        blah(bad);
    }

The underlying C notation here is `static 7`, which this commit makes
explicit by allowing us to write it as `at_least 7`. It means that it's
incorrect to pass an array with fewer than 7 elements. This is section
6.7.5.3 of C99:

    If the keyword static also appears within the [ and ] of the array
    type derivation, then for each call to the function, the value of
    the corresponding actual argument shall provide access to the first
    element of an array with at least as many elements as specified by
    the size expression.

Here is the output from gcc 15:

    zx2c4@thinkpad /tmp $ gcc -c a.c
    a.c: In function ‘schma’:
    a.c:9:9: warning: ‘blah’ accessing 7 bytes in a region of size 6 [-Wstringop-overflow=]
        9 |         blah(bad);
          |         ^~~~~~~~~
    a.c:9:9: note: referencing argument 1 of type ‘unsigned char[7]’
    a.c:2:6: note: in a call to function ‘blah’
        2 | void blah(unsigned char herp[at_least 7]);
          |      ^~~~

And from clang 21:

    zx2c4@thinkpad /tmp $ clang -c a.c
    a.c:9:2: warning: array argument is too small; contains 6 elements, callee requires at least 7
          [-Warray-bounds]
        9 |         blah(bad);
          |         ^    ~~~
    a.c:2:25: note: callee declares array parameter as static here
        2 | void blah(unsigned char herp[at_least 7]);
          |                         ^   ~~~~~~~~~~
    1 warning generated.

So these are covered by, variously, -Wstringop-overflow and
-Warray-bounds.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251123054819.2371989-3-Jason@zx2c4.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/linux/compiler_types.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 59288a2c1ad2..51f0dccdb54d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -394,6 +394,21 @@ struct ftrace_likely_data {
 #define __counted_by_be(member)	__counted_by(member)
 #endif
 
+/*
+ * This designates the minimum number of elements a passed array parameter must
+ * have. For example:
+ *
+ *     void some_function(u8 param[at_least 7]);
+ *
+ * If a caller passes an array with fewer than 7 elements, the compiler will
+ * emit a warning.
+ */
+#ifndef __CHECKER__
+#define at_least static
+#else
+#define at_least
+#endif
+
 /* Do not trap wrapping arithmetic within an annotated function. */
 #ifdef CONFIG_UBSAN_INTEGER_WRAP
 # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow")))
-- 
cgit v1.2.3


From ac653d57ad8bb873c1c68fe77a1dee81cc1e365d Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 23 Nov 2025 06:48:20 +0100
Subject: lib/crypto: chacha20poly1305: Statically check fixed array lengths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several parameters of the chacha20poly1305 functions require arrays of
an exact length. Use the new at_least keyword to instruct gcc and
clang to statically check that the caller is passing an object of at
least that length.

Here it is in action, with this faulty patch to wireguard's cookie.h:

     struct cookie_checker {
     	u8 secret[NOISE_HASH_LEN];
    -	u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN];
    +	u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN - 1];
     	u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN];

If I try compiling this code, I get this helpful warning:

  CC      drivers/net/wireguard/cookie.o
drivers/net/wireguard/cookie.c: In function ‘wg_cookie_message_create’:
drivers/net/wireguard/cookie.c:193:9: warning: ‘xchacha20poly1305_encrypt’ reading 32 bytes from a region of size 31 [-Wstringop-overread]
  193 |         xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN,
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  194 |                                   macs->mac1, COOKIE_LEN, dst->nonce,
      |                                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  195 |                                   checker->cookie_encryption_key);
      |                                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/wireguard/cookie.c:193:9: note: referencing argument 7 of type ‘const u8 *’ {aka ‘const unsigned char *’}
In file included from drivers/net/wireguard/messages.h:10,
                 from drivers/net/wireguard/cookie.h:9,
                 from drivers/net/wireguard/cookie.c:6:
include/crypto/chacha20poly1305.h:28:6: note: in a call to function ‘xchacha20poly1305_encrypt’
   28 | void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251123054819.2371989-4-Jason@zx2c4.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/chacha20poly1305.h | 19 ++++++++++---------
 lib/crypto/chacha20poly1305.c     | 18 +++++++++---------
 2 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h
index d2ac3ff7dc1e..0f71b037702d 100644
--- a/include/crypto/chacha20poly1305.h
+++ b/include/crypto/chacha20poly1305.h
@@ -18,32 +18,33 @@ enum chacha20poly1305_lengths {
 void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			      const u8 *ad, const size_t ad_len,
 			      const u64 nonce,
-			      const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+			      const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 bool __must_check
 chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 			 const u8 *ad, const size_t ad_len, const u64 nonce,
-			 const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+			 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+			       const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+			       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 bool __must_check xchacha20poly1305_decrypt(
-	u8 *dst, const u8 *src, const size_t src_len, const u8 *ad,
-	const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-	const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+	u8 *dst, const u8 *src, const size_t src_len,
+	const u8 *ad, const size_t ad_len,
+	const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+	const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
 
 bool chacha20poly1305_selftest(void);
 
diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index 0b49d6aedefd..212ce33562af 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -89,7 +89,7 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			      const u8 *ad, const size_t ad_len,
 			      const u64 nonce,
-			      const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			      const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	struct chacha_state chacha_state;
 	u32 k[CHACHA_KEY_WORDS];
@@ -111,8 +111,8 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt);
 
 void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			       const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+			       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	struct chacha_state chacha_state;
 
@@ -170,7 +170,7 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 			      const u8 *ad, const size_t ad_len,
 			      const u64 nonce,
-			      const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			      const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	struct chacha_state chacha_state;
 	u32 k[CHACHA_KEY_WORDS];
@@ -195,8 +195,8 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt);
 
 bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			       const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+			       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	struct chacha_state chacha_state;
 
@@ -211,7 +211,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 				       const size_t src_len,
 				       const u8 *ad, const size_t ad_len,
 				       const u64 nonce,
-				       const u8 key[CHACHA20POLY1305_KEY_SIZE],
+				       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE],
 				       int encrypt)
 {
 	const u8 *pad0 = page_address(ZERO_PAGE(0));
@@ -335,7 +335,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE])
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
 						 nonce, key, 1);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
 bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE])
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	if (unlikely(src_len < POLY1305_DIGEST_SIZE))
 		return false;
-- 
cgit v1.2.3


From 1b31b43bf5c2b7ae8b0f9acac036354ea28b0397 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:01 -0800
Subject: lib/crypto: chacha: Add at_least decoration to fixed-size array
 params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the chacha library functions.  This causes clang to warn
when a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/chacha.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 38e26dff27b0..1cc301a48469 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -38,18 +38,18 @@ struct chacha_state {
 };
 
 void chacha_block_generic(struct chacha_state *state,
-			  u8 out[CHACHA_BLOCK_SIZE], int nrounds);
+			  u8 out[at_least CHACHA_BLOCK_SIZE], int nrounds);
 static inline void chacha20_block(struct chacha_state *state,
-				  u8 out[CHACHA_BLOCK_SIZE])
+				  u8 out[at_least CHACHA_BLOCK_SIZE])
 {
 	chacha_block_generic(state, out, 20);
 }
 
 void hchacha_block_generic(const struct chacha_state *state,
-			   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+			   u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);
 
 void hchacha_block(const struct chacha_state *state,
-		   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+		   u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);
 
 enum chacha_constants { /* expand 32-byte k */
 	CHACHA_CONSTANT_EXPA = 0x61707865U,
@@ -67,8 +67,8 @@ static inline void chacha_init_consts(struct chacha_state *state)
 }
 
 static inline void chacha_init(struct chacha_state *state,
-			       const u32 key[CHACHA_KEY_WORDS],
-			       const u8 iv[CHACHA_IV_SIZE])
+			       const u32 key[at_least CHACHA_KEY_WORDS],
+			       const u8 iv[at_least CHACHA_IV_SIZE])
 {
 	chacha_init_consts(state);
 	state->x[4]  = key[0];
-- 
cgit v1.2.3


From 2143d622cdf3bf93e61f2e0a728487bc871785e5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:02 -0800
Subject: lib/crypto: curve25519: Add at_least decoration to fixed-size array
 params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the curve25519 library functions.  This causes clang to
warn when a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/curve25519.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h
index db63a5577c00..2362b48f8741 100644
--- a/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -13,24 +13,28 @@ enum curve25519_lengths {
 	CURVE25519_KEY_SIZE = 32
 };
 
-void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
-			const u8 scalar[CURVE25519_KEY_SIZE],
-			const u8 point[CURVE25519_KEY_SIZE]);
+void curve25519_generic(u8 out[at_least CURVE25519_KEY_SIZE],
+			const u8 scalar[at_least CURVE25519_KEY_SIZE],
+			const u8 point[at_least CURVE25519_KEY_SIZE]);
 
-bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
-			     const u8 secret[CURVE25519_KEY_SIZE],
-			     const u8 basepoint[CURVE25519_KEY_SIZE]);
+bool __must_check
+curve25519(u8 mypublic[at_least CURVE25519_KEY_SIZE],
+	   const u8 secret[at_least CURVE25519_KEY_SIZE],
+	   const u8 basepoint[at_least CURVE25519_KEY_SIZE]);
 
-bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
-					     const u8 secret[CURVE25519_KEY_SIZE]);
+bool __must_check
+curve25519_generate_public(u8 pub[at_least CURVE25519_KEY_SIZE],
+			   const u8 secret[at_least CURVE25519_KEY_SIZE]);
 
-static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
+static inline void
+curve25519_clamp_secret(u8 secret[at_least CURVE25519_KEY_SIZE])
 {
 	secret[0] &= 248;
 	secret[31] = (secret[31] & 127) | 64;
 }
 
-static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
+static inline void
+curve25519_generate_secret(u8 secret[at_least CURVE25519_KEY_SIZE])
 {
 	get_random_bytes_wait(secret, CURVE25519_KEY_SIZE);
 	curve25519_clamp_secret(secret);
-- 
cgit v1.2.3


From 580f1d31dff62b0f0034304bd75f169b8fec6f36 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:03 -0800
Subject: lib/crypto: md5: Add at_least decoration to fixed-size array params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the md5 library functions.  This causes clang to warn when
a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/md5.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/crypto/md5.h b/include/crypto/md5.h
index c9aa5c3abc53..c47aedfe67ec 100644
--- a/include/crypto/md5.h
+++ b/include/crypto/md5.h
@@ -76,7 +76,7 @@ void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len);
  *
  * Context: Any context.
  */
-void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+void md5_final(struct md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]);
 
 /**
  * md5() - Compute MD5 message digest in one shot
@@ -86,7 +86,7 @@ void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]);
+void md5(const u8 *data, size_t len, u8 out[at_least MD5_DIGEST_SIZE]);
 
 /**
  * struct hmac_md5_key - Prepared key for HMAC-MD5
@@ -173,7 +173,7 @@ static inline void hmac_md5_update(struct hmac_md5_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]);
 
 /**
  * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key
@@ -187,7 +187,8 @@ void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_md5(const struct hmac_md5_key *key,
-	      const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]);
+	      const u8 *data, size_t data_len,
+	      u8 out[at_least MD5_DIGEST_SIZE]);
 
 /**
  * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key
@@ -204,6 +205,6 @@ void hmac_md5(const struct hmac_md5_key *key,
  */
 void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			  const u8 *data, size_t data_len,
-			  u8 out[MD5_DIGEST_SIZE]);
+			  u8 out[at_least MD5_DIGEST_SIZE]);
 
 #endif /* _CRYPTO_MD5_H */
-- 
cgit v1.2.3


From c2099fa61664e8fe8844cccdb7d1d18a5f0f94d1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:04 -0800
Subject: lib/crypto: poly1305: Add at_least decoration to fixed-size array
 params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the poly1305 library functions.  This causes clang to warn
when a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/poly1305.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index d4daeec8da19..190beb427c6d 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -59,7 +59,7 @@ struct poly1305_desc_ctx {
 };
 
 void poly1305_init(struct poly1305_desc_ctx *desc,
-		   const u8 key[POLY1305_KEY_SIZE]);
+		   const u8 key[at_least POLY1305_KEY_SIZE]);
 void poly1305_update(struct poly1305_desc_ctx *desc,
 		     const u8 *src, unsigned int nbytes);
 void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest);
-- 
cgit v1.2.3


From d5cc4e731de7edb1a2b7940d0f0badf9956dddb7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:05 -0800
Subject: lib/crypto: sha1: Add at_least decoration to fixed-size array params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the sha1 library functions.  This causes clang to warn
when a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha1.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h
index 162a529ec841..27f08b972931 100644
--- a/include/crypto/sha1.h
+++ b/include/crypto/sha1.h
@@ -84,7 +84,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len);
  *
  * Context: Any context.
  */
-void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+void sha1_final(struct sha1_ctx *ctx, u8 out[at_least SHA1_DIGEST_SIZE]);
 
 /**
  * sha1() - Compute SHA-1 message digest in one shot
@@ -94,7 +94,7 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
+void sha1(const u8 *data, size_t len, u8 out[at_least SHA1_DIGEST_SIZE]);
 
 /**
  * struct hmac_sha1_key - Prepared key for HMAC-SHA1
@@ -181,7 +181,8 @@ static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx,
+		     u8 out[at_least SHA1_DIGEST_SIZE]);
 
 /**
  * hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key
@@ -195,7 +196,8 @@ void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_sha1(const struct hmac_sha1_key *key,
-	       const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]);
+	       const u8 *data, size_t data_len,
+	       u8 out[at_least SHA1_DIGEST_SIZE]);
 
 /**
  * hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key
@@ -212,6 +214,6 @@ void hmac_sha1(const struct hmac_sha1_key *key,
  */
 void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			   const u8 *data, size_t data_len,
-			   u8 out[SHA1_DIGEST_SIZE]);
+			   u8 out[at_least SHA1_DIGEST_SIZE]);
 
 #endif /* _CRYPTO_SHA1_H */
-- 
cgit v1.2.3


From 4f0382b0901b43552b600f8e5f806295778b0fb0 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 22 Nov 2025 11:42:06 -0800
Subject: lib/crypto: sha2: Add at_least decoration to fixed-size array params

Add the at_least (i.e. 'static') decoration to the fixed-size array
parameters of the sha2 library functions.  This causes clang to warn
when a too-small array of known size is passed.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20251122194206.31822-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 53 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index e5dafb935cc8..7bb8fe169daf 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -190,7 +190,7 @@ static inline void sha224_update(struct sha224_ctx *ctx,
  *
  * Context: Any context.
  */
-void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+void sha224_final(struct sha224_ctx *ctx, u8 out[at_least SHA224_DIGEST_SIZE]);
 
 /**
  * sha224() - Compute SHA-224 message digest in one shot
@@ -200,7 +200,7 @@ void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
+void sha224(const u8 *data, size_t len, u8 out[at_least SHA224_DIGEST_SIZE]);
 
 /**
  * struct hmac_sha224_key - Prepared key for HMAC-SHA224
@@ -287,7 +287,8 @@ static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+		       u8 out[at_least SHA224_DIGEST_SIZE]);
 
 /**
  * hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key
@@ -301,7 +302,8 @@ void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_sha224(const struct hmac_sha224_key *key,
-		 const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]);
+		 const u8 *data, size_t data_len,
+		 u8 out[at_least SHA224_DIGEST_SIZE]);
 
 /**
  * hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key
@@ -318,7 +320,7 @@ void hmac_sha224(const struct hmac_sha224_key *key,
  */
 void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
-			     u8 out[SHA224_DIGEST_SIZE]);
+			     u8 out[at_least SHA224_DIGEST_SIZE]);
 
 /**
  * struct sha256_ctx - Context for hashing a message with SHA-256
@@ -363,7 +365,7 @@ static inline void sha256_update(struct sha256_ctx *ctx,
  *
  * Context: Any context.
  */
-void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+void sha256_final(struct sha256_ctx *ctx, u8 out[at_least SHA256_DIGEST_SIZE]);
 
 /**
  * sha256() - Compute SHA-256 message digest in one shot
@@ -373,7 +375,7 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
+void sha256(const u8 *data, size_t len, u8 out[at_least SHA256_DIGEST_SIZE]);
 
 /**
  * sha256_finup_2x() - Compute two SHA-256 digests from a common initial
@@ -390,8 +392,9 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
  * Context: Any context.
  */
 void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
-		     const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
-		     u8 out2[SHA256_DIGEST_SIZE]);
+		     const u8 *data2, size_t len,
+		     u8 out1[at_least SHA256_DIGEST_SIZE],
+		     u8 out2[at_least SHA256_DIGEST_SIZE]);
 
 /**
  * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
@@ -488,7 +491,8 @@ static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+		       u8 out[at_least SHA256_DIGEST_SIZE]);
 
 /**
  * hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key
@@ -502,7 +506,8 @@ void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_sha256(const struct hmac_sha256_key *key,
-		 const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]);
+		 const u8 *data, size_t data_len,
+		 u8 out[at_least SHA256_DIGEST_SIZE]);
 
 /**
  * hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key
@@ -519,7 +524,7 @@ void hmac_sha256(const struct hmac_sha256_key *key,
  */
 void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
-			     u8 out[SHA256_DIGEST_SIZE]);
+			     u8 out[at_least SHA256_DIGEST_SIZE]);
 
 /* State for the SHA-512 (and SHA-384) compression function */
 struct sha512_block_state {
@@ -598,7 +603,7 @@ static inline void sha384_update(struct sha384_ctx *ctx,
  *
  * Context: Any context.
  */
-void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+void sha384_final(struct sha384_ctx *ctx, u8 out[at_least SHA384_DIGEST_SIZE]);
 
 /**
  * sha384() - Compute SHA-384 message digest in one shot
@@ -608,7 +613,7 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]);
+void sha384(const u8 *data, size_t len, u8 out[at_least SHA384_DIGEST_SIZE]);
 
 /**
  * struct hmac_sha384_key - Prepared key for HMAC-SHA384
@@ -695,7 +700,8 @@ static inline void hmac_sha384_update(struct hmac_sha384_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+		       u8 out[at_least SHA384_DIGEST_SIZE]);
 
 /**
  * hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key
@@ -709,7 +715,8 @@ void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_sha384(const struct hmac_sha384_key *key,
-		 const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]);
+		 const u8 *data, size_t data_len,
+		 u8 out[at_least SHA384_DIGEST_SIZE]);
 
 /**
  * hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key
@@ -726,7 +733,7 @@ void hmac_sha384(const struct hmac_sha384_key *key,
  */
 void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
-			     u8 out[SHA384_DIGEST_SIZE]);
+			     u8 out[at_least SHA384_DIGEST_SIZE]);
 
 /**
  * struct sha512_ctx - Context for hashing a message with SHA-512
@@ -771,7 +778,7 @@ static inline void sha512_update(struct sha512_ctx *ctx,
  *
  * Context: Any context.
  */
-void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+void sha512_final(struct sha512_ctx *ctx, u8 out[at_least SHA512_DIGEST_SIZE]);
 
 /**
  * sha512() - Compute SHA-512 message digest in one shot
@@ -781,7 +788,7 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
  *
  * Context: Any context.
  */
-void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]);
+void sha512(const u8 *data, size_t len, u8 out[at_least SHA512_DIGEST_SIZE]);
 
 /**
  * struct hmac_sha512_key - Prepared key for HMAC-SHA512
@@ -868,7 +875,8 @@ static inline void hmac_sha512_update(struct hmac_sha512_ctx *ctx,
  *
  * Context: Any context.
  */
-void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+		       u8 out[at_least SHA512_DIGEST_SIZE]);
 
 /**
  * hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key
@@ -882,7 +890,8 @@ void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
  * Context: Any context.
  */
 void hmac_sha512(const struct hmac_sha512_key *key,
-		 const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]);
+		 const u8 *data, size_t data_len,
+		 u8 out[at_least SHA512_DIGEST_SIZE]);
 
 /**
  * hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key
@@ -899,6 +908,6 @@ void hmac_sha512(const struct hmac_sha512_key *key,
  */
 void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
-			     u8 out[SHA512_DIGEST_SIZE]);
+			     u8 out[at_least SHA512_DIGEST_SIZE]);
 
 #endif /* _CRYPTO_SHA2_H */
-- 
cgit v1.2.3


From 441244d4273a8037b265fd254dfdaca5fa736ee2 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <okorniev@redhat.com>
Date: Tue, 4 Nov 2025 17:29:25 -0500
Subject: SUNRPC: cleanup common code in backchannel request

Create a helper function for common code between rdma
and tcp backchannel handling of the backchannel request.
Make sure that access to the transport's bc_serv pointer is protected
by the bc_pa_lock.

Signed-off-by: Olga Kornievskaia <okorniev@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/bc_xprt.h    |  1 +
 net/sunrpc/backchannel_rqst.c     | 19 ++++++++++++++++---
 net/sunrpc/xprtrdma/backchannel.c |  8 ++------
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index f22bf915dcf6..178f34ad8db6 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -25,6 +25,7 @@ void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task,
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
+void xprt_enqueue_bc_request(struct rpc_rqst *req);
 
 /* Socket backchannel transport methods */
 int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs);
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index caa94cf57123..efddea0f4b8b 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -354,7 +354,6 @@ found:
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 {
 	struct rpc_xprt *xprt = req->rq_xprt;
-	struct svc_serv *bc_serv = xprt->bc_serv;
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
@@ -365,7 +364,21 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 	set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
 
 	dprintk("RPC:       add callback request to list\n");
+	xprt_enqueue_bc_request(req);
+}
+
+void xprt_enqueue_bc_request(struct rpc_rqst *req)
+{
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct svc_serv *bc_serv;
+
 	xprt_get(xprt);
-	lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
-	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+	spin_lock(&xprt->bc_pa_lock);
+	bc_serv = xprt->bc_serv;
+	if (bc_serv) {
+		lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
+		svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+	}
+	spin_unlock(&xprt->bc_pa_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_enqueue_bc_request);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 8c817e755262..2f0f9618dd05 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/bc_xprt.h>
 
 #include "xprt_rdma.h"
 #include <trace/events/rpcrdma.h>
@@ -220,7 +221,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 			     struct rpcrdma_rep *rep)
 {
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-	struct svc_serv *bc_serv;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
 	struct xdr_buf *buf;
@@ -261,11 +261,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	trace_xprtrdma_cb_call(r_xprt, rqst);
 
 	/* Queue rqst for ULP's callback service */
-	bc_serv = xprt->bc_serv;
-	xprt_get(xprt);
-	lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
-
-	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
+	xprt_enqueue_bc_request(rqst);
 
 	r_xprt->rx_stats.bcall_count++;
 	return;
-- 
cgit v1.2.3


From 6f8b26c90a4d645fd5c944c41a6f0fd61ec27c50 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <okorniev@redhat.com>
Date: Tue, 4 Nov 2025 17:29:26 -0500
Subject: SUNRPC: new helper function for stopping backchannel server

Create a new backchannel helper that stops the backchannel server
and clears the transport's bc_serv pointer under the bc_pa_lock.

Signed-off-by: Olga Kornievskaia <okorniev@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/bc_xprt.h |  6 ++++++
 net/sunrpc/backchannel_rqst.c  | 16 ++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 178f34ad8db6..98939cb664cf 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -32,6 +32,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs);
 void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs);
 void xprt_free_bc_rqst(struct rpc_rqst *req);
 unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt);
+void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv);
 
 /*
  * Determine if a shared backchannel is in use
@@ -69,5 +70,10 @@ static inline void set_bc_enabled(struct svc_serv *serv)
 static inline void xprt_free_bc_request(struct rpc_rqst *req)
 {
 }
+
+static inline void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv)
+{
+	svc_destroy(serv);
+}
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 #endif /* _LINUX_SUNRPC_BC_XPRT_H */
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index efddea0f4b8b..68b1fcdea8f0 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -24,6 +24,22 @@ unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
 	return BC_MAX_SLOTS;
 }
 
+/*
+ * Helper function to nullify backchannel server pointer in transport.
+ * We need to synchronize setting the pointer to NULL (done after
+ * the backchannel server is shut down) with the usage of that pointer
+ * by the backchannel request processing routines
+ * xprt_complete_bc_request() and rpcrdma_bc_receive_call().
+ */
+void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv)
+{
+	spin_lock(&xprt->bc_pa_lock);
+	svc_destroy(serv);
+	xprt->bc_serv = NULL;
+	spin_unlock(&xprt->bc_pa_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_svc_destroy_nullify_bc);
+
 /*
  * Helper routines that track the number of preallocation elements
  * on the transport.
-- 
cgit v1.2.3


From 130ae65c01862e1ed30ef5ff2258990d7628f360 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <anna.schumaker@oracle.com>
Date: Tue, 4 Nov 2025 10:06:41 -0500
Subject: NFS: Add support for sending GDD_GETATTR

I add this to the existing GETATTR compound as an optional extra step that
we can send if the "dir_deleg" flag is set to 'true'. Actually enabling
this value will happen in a later patch.

Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4xdr.c        | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_xdr.h |   7 ++++
 2 files changed, 113 insertions(+)

(limited to 'include')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1d0e6c10f921..b6fe30577fab 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -393,6 +393,20 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_get_dir_deleg_maxsz (op_encode_hdr_maxsz + \
+				    4 /* gdda_signal_deleg_avail */ + \
+				    8 /* gdda_notification_types */ + \
+				    nfstime4_maxsz /* gdda_child_attr_delay */ + \
+				    nfstime4_maxsz /* gdda_dir_attr_delay */ + \
+				    nfs4_fattr_bitmap_maxsz /* gdda_child_attributes */ + \
+				    nfs4_fattr_bitmap_maxsz /* gdda_dir_attributes */)
+#define decode_get_dir_deleg_maxsz (op_decode_hdr_maxsz + \
+				    4 /* gddrnf_status */ + \
+				    encode_verifier_maxsz /* gddr_cookieverf */ + \
+				    encode_stateid_maxsz /* gddr_stateid */ + \
+				    8 /* gddr_notification */ + \
+				    nfs4_fattr_maxsz /* gddr_child_attributes */ + \
+				    nfs4_fattr_maxsz /* gddr_dir_attributes */)
 #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
 				1 /* layout type */ + \
@@ -444,6 +458,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
+#define encode_get_dir_deleg_maxsz 0
+#define decode_get_dir_deleg_maxsz 0
 #define encode_layoutreturn_maxsz 0
 #define decode_layoutreturn_maxsz 0
 #define encode_layoutget_maxsz	0
@@ -631,11 +647,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
+				encode_get_dir_deleg_maxsz + \
 				encode_getattr_maxsz + \
 				encode_renew_maxsz)
 #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
+				decode_get_dir_deleg_maxsz + \
 				decode_getattr_maxsz + \
 				decode_renew_maxsz)
 #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \
@@ -2007,6 +2025,33 @@ static void encode_sequence(struct xdr_stream *xdr,
 }
 
 #ifdef CONFIG_NFS_V4_1
+static void
+encode_get_dir_delegation(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	struct timespec64 ts = { 0, 0 };
+	u32 notifications[1] = { 0 };
+	u32 attributes[1] = { 0 };
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_GET_DIR_DELEGATION, decode_get_dir_deleg_maxsz, hdr);
+
+	/* We don't handle CB_RECALLABLE_OBJ_AVAIL yet. */
+	xdr_stream_encode_bool(xdr, false);
+
+	xdr_encode_bitmap4(xdr, notifications, ARRAY_SIZE(notifications));
+
+	/* Request no delay on attribute updates */
+	p = reserve_space(xdr, 12 + 12);
+	p = xdr_encode_nfstime4(p, &ts);
+	xdr_encode_nfstime4(p, &ts);
+
+	/* Requested child attributes */
+	xdr_encode_bitmap4(xdr, attributes, ARRAY_SIZE(attributes));
+
+	/* Requested dir attributes */
+	xdr_encode_bitmap4(xdr, attributes, ARRAY_SIZE(attributes));
+}
+
 static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
 		     const struct nfs4_getdeviceinfo_args *args,
@@ -2142,6 +2187,11 @@ static void encode_free_stateid(struct xdr_stream *xdr,
 	encode_nfs4_stateid(xdr, &args->stateid);
 }
 #else
+static inline void
+encode_get_dir_delegation(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+}
+
 static inline void
 encode_layoutreturn(struct xdr_stream *xdr,
 		    const struct nfs4_layoutreturn_args *args,
@@ -2356,6 +2406,8 @@ static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
+	if (args->get_dir_deleg)
+		encode_get_dir_delegation(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
@@ -5994,6 +6046,49 @@ static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 	return decode_stateid(xdr, stateid);
 }
 
+static int decode_get_dir_delegation(struct xdr_stream *xdr,
+				     struct nfs4_getattr_res *res)
+{
+	struct nfs4_gdd_res *gdd_res = res->gdd_res;
+	nfs4_verifier cookieverf;
+	u32 bitmap[1];
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GET_DIR_DELEGATION);
+	if (status)
+		return status;
+
+	if (xdr_stream_decode_u32(xdr, &gdd_res->status))
+		return -EIO;
+
+	if (gdd_res->status == GDD4_UNAVAIL)
+		return xdr_inline_decode(xdr, 4) ? 0 : -EIO;
+
+	status = decode_verifier(xdr, &cookieverf);
+	if (status)
+		return status;
+
+	status = decode_delegation_stateid(xdr, &gdd_res->deleg);
+	if (status)
+		return status;
+
+	/* Decode supported notification types. */
+	status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+	if (status < 0)
+		return status;
+
+	/* Decode supported child attributes. */
+	status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+	if (status < 0)
+		return status;
+
+	/* Decode supported dir attributes. */
+	status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+	if (status < 0)
+		return status;
+	return 0;
+}
+
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
 				struct nfs4_getdeviceinfo_res *res)
 {
@@ -6208,6 +6303,12 @@ static int decode_free_stateid(struct xdr_stream *xdr,
 	return res->status;
 }
 #else
+static int decode_get_dir_delegation(struct xdr_stream *xdr,
+				     struct nfs4_getattr_res *res)
+{
+	return 0;
+}
+
 static inline
 int decode_layoutreturn(struct xdr_stream *xdr,
 			       struct nfs4_layoutreturn_res *res)
@@ -6525,6 +6626,11 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
+	if (res->gdd_res) {
+		status = decode_get_dir_delegation(xdr, res);
+		if (status)
+			goto out;
+	}
 	status = decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 31463286402f..8bf6cba96c46 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1092,12 +1092,19 @@ struct nfs4_getattr_arg {
 	struct nfs4_sequence_args	seq_args;
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
+	bool				get_dir_deleg;
+};
+
+struct nfs4_gdd_res {
+	u32				status;
+	nfs4_stateid			deleg;
 };
 
 struct nfs4_getattr_res {
 	struct nfs4_sequence_res	seq_res;
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
+	struct nfs4_gdd_res *		gdd_res;
 };
 
 struct nfs4_link_arg {
-- 
cgit v1.2.3


From 156b0948293362b036caf49e6e4d97cae30201de Mon Sep 17 00:00:00 2001
From: Anna Schumaker <anna.schumaker@oracle.com>
Date: Tue, 4 Nov 2025 10:06:42 -0500
Subject: NFS: Request a directory delegation on ACCESS, CREATE, and UNLINK

This patch adds a new flag: NFS_INO_REQ_DIR_DELEG to signal that a
directory wants to request a directory delegation the next time it does
a GETATTR. I have the client request a directory delegation when doing
an access, create, or unlink call since these calls indicate that a user
is working with a directory.

Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/delegation.c       |  1 +
 fs/nfs/delegation.h       |  6 ++++++
 fs/nfs/nfs4proc.c         | 55 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/nfs_fs.h    |  1 +
 include/linux/nfs_fs_sb.h |  1 +
 5 files changed, 60 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 9d3a5f29f17f..b4c192f00e94 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -379,6 +379,7 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
 	delegation->inode = NULL;
 	rcu_assign_pointer(nfsi->delegation, NULL);
 	spin_unlock(&delegation->lock);
+	clear_bit(NFS_INO_REQ_DIR_DELEG, &nfsi->flags);
 	return delegation;
 }
 
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 08ec2e9c68a4..def50e8a83bf 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -124,6 +124,12 @@ static inline int nfs_have_delegated_mtime(struct inode *inode)
 						 NFS_DELEGATION_FLAG_TIME);
 }
 
+static inline void nfs_request_directory_delegation(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode))
+		set_bit(NFS_INO_REQ_DIR_DELEG, &NFS_I(inode)->flags);
+}
+
 int nfs4_delegation_hash_alloc(struct nfs_server *server);
 
 #endif
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3b436ba2ed3b..99edc1d8d7aa 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4470,6 +4470,28 @@ out:
 	return status;
 }
 
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+static bool should_request_dir_deleg(struct inode *inode)
+{
+	if (!inode)
+		return false;
+	if (!S_ISDIR(inode->i_mode))
+		return false;
+	if (!nfs_server_capable(inode, NFS_CAP_DIR_DELEG))
+		return false;
+	if (!test_and_clear_bit(NFS_INO_REQ_DIR_DELEG, &(NFS_I(inode)->flags)))
+		return false;
+	if (nfs4_have_delegation(inode, FMODE_READ, 0))
+		return false;
+	return true;
+}
+#else
+static bool should_request_dir_deleg(struct inode *inode)
+{
+	return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr, struct inode *inode)
 {
@@ -4487,7 +4509,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
+	struct nfs4_gdd_res gdd_res;
 	unsigned short task_flags = 0;
+	int status;
 
 	if (nfs4_has_session(server->nfs_client))
 		task_flags = RPC_TASK_MOVEABLE;
@@ -4496,11 +4520,26 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
 		task_flags |= RPC_TASK_TIMEOUT;
 
+	args.get_dir_deleg = should_request_dir_deleg(inode);
+	if (args.get_dir_deleg)
+		res.gdd_res = &gdd_res;
+
 	nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0);
 	nfs_fattr_init(fattr);
 	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
-	return nfs4_do_call_sync(server->client, server, &msg,
-			&args.seq_args, &res.seq_res, task_flags);
+
+	status = nfs4_do_call_sync(server->client, server, &msg,
+				   &args.seq_args, &res.seq_res, task_flags);
+	if (args.get_dir_deleg) {
+		if (status == -EOPNOTSUPP) {
+			server->caps &= ~NFS_CAP_DIR_DELEG;
+		} else if (status == 0 && gdd_res.status == GDD4_OK) {
+			status = nfs_inode_set_delegation(inode, current_cred(),
+							  FMODE_READ, &gdd_res.deleg,
+							  0, NFS4_OPEN_DELEGATE_READ);
+		}
+	}
+	return status;
 }
 
 int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -4513,8 +4552,10 @@ int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	do {
 		err = _nfs4_proc_getattr(server, fhandle, fattr, inode);
 		trace_nfs4_getattr(server, fhandle, fattr, err);
-		err = nfs4_handle_exception(server, err,
-				&exception);
+		if (err == -EOPNOTSUPP)
+			exception.retry = true;
+		else
+			err = nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -4778,6 +4819,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 	int status = 0;
 
 	if (!nfs4_have_delegation(inode, FMODE_READ, 0)) {
+		nfs_request_directory_delegation(inode);
 		res.fattr = nfs_alloc_fattr();
 		if (res.fattr == NULL)
 			return -ENOMEM;
@@ -4885,6 +4927,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
+	nfs_request_directory_delegation(dir);
+
 	if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
 		sattr->ia_mode &= ~current_umask();
 	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
@@ -4981,6 +5025,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg,
 	nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0);
 
 	nfs_fattr_init(res->dir_attr);
+	nfs_request_directory_delegation(d_inode(dentry->d_parent));
 
 	if (inode) {
 		nfs4_inode_return_delegation(inode);
@@ -10832,6 +10877,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_DIR_DELEG
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
@@ -10858,6 +10904,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.minor_version = 2,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_DIR_DELEG
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c585939b6cd6..a6624edb7226 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -344,6 +344,7 @@ struct nfs4_copy_state {
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
 #define NFS_INO_ODIRECT		(12)		/* I/O setting is O_DIRECT */
+#define NFS_INO_REQ_DIR_DELEG	(13)		/* Request a directory delegation */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d30c0245031c..4ba04de6b1ca 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -305,6 +305,7 @@ struct nfs_server {
 #define NFS_CAP_REBOOT_LAYOUTRETURN	(1U << 8)
 #define NFS_CAP_OFFLOAD_STATUS	(1U << 9)
 #define NFS_CAP_ZERO_RANGE	(1U << 10)
+#define NFS_CAP_DIR_DELEG	(1U << 11)
 #define NFS_CAP_OPEN_XOR	(1U << 12)
 #define NFS_CAP_DELEGTIME	(1U << 13)
 #define NFS_CAP_POSIX_LOCK	(1U << 14)
-- 
cgit v1.2.3


From 2da211670782637fd2d4fbba06f91d1e7c70dc0c Mon Sep 17 00:00:00 2001
From: Anna Schumaker <anna.schumaker@oracle.com>
Date: Tue, 4 Nov 2025 10:06:43 -0500
Subject: NFS: Request a directory delegation during RENAME

If we notice that we're renaming a file within a directory then we take
that as a sign that the user is working with the current directory and
may want a delegation to avoid extra revalidations when possible.

The nfs_request_directory_delegation() function exists within the NFS v4
module, so I add an extra flag to rename_setup() to indicate if a dentry
is being renamed within the same parent directory.

Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs3proc.c       | 3 ++-
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/proc.c           | 3 ++-
 fs/nfs/unlink.c         | 3 ++-
 include/linux/nfs_xdr.h | 3 ++-
 5 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a4cb67573aa7..1181f9cc6dbd 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -483,7 +483,8 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 static void
 nfs3_proc_rename_setup(struct rpc_message *msg,
 		struct dentry *old_dentry,
-		struct dentry *new_dentry)
+		struct dentry *new_dentry,
+		struct inode *same_parent)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 99edc1d8d7aa..6691a44866b6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5060,7 +5060,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 
 static void nfs4_proc_rename_setup(struct rpc_message *msg,
 		struct dentry *old_dentry,
-		struct dentry *new_dentry)
+		struct dentry *new_dentry,
+		struct inode *same_parent)
 {
 	struct nfs_renameargs *arg = msg->rpc_argp;
 	struct nfs_renameres *res = msg->rpc_resp;
@@ -5071,6 +5072,8 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg,
 		nfs4_inode_make_writeable(old_inode);
 	if (new_inode)
 		nfs4_inode_return_delegation(new_inode);
+	if (same_parent)
+		nfs_request_directory_delegation(same_parent);
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
 	res->server = NFS_SB(old_dentry->d_sb);
 	nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 63e71310b9f6..39df80e4ae6f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -353,7 +353,8 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 static void
 nfs_proc_rename_setup(struct rpc_message *msg,
 		struct dentry *old_dentry,
-		struct dentry *new_dentry)
+		struct dentry *new_dentry,
+		struct inode *same_parent)
 {
 	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
 }
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b55467911648..4db818c0f9dd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -390,7 +390,8 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 
 	nfs_sb_active(old_dir->i_sb);
 
-	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry);
+	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry,
+					old_dir == new_dir ? old_dir : NULL);
 
 	return rpc_run_task(&task_setup_data);
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8bf6cba96c46..79fe2dfb470f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1808,7 +1808,8 @@ struct nfs_rpc_ops {
 	int	(*unlink_done) (struct rpc_task *, struct inode *);
 	void	(*rename_setup)  (struct rpc_message *msg,
 			struct dentry *old_dentry,
-			struct dentry *new_dentry);
+			struct dentry *new_dentry,
+			struct inode *same_parent);
 	void	(*rename_rpc_prepare)(struct rpc_task *task, struct nfs_renamedata *);
 	int	(*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir);
 	int	(*link)    (struct inode *, struct inode *, const struct qstr *);
-- 
cgit v1.2.3


From 84898f8e9cea06f8178fc5ca53f068180f7bfba0 Mon Sep 17 00:00:00 2001
From: Finley Xiao <finley.xiao@rock-chips.com>
Date: Fri, 21 Nov 2025 15:53:49 +0800
Subject: dt-bindings: clock: rockchip: Add RK3506 clock and reset unit

Add device tree bindings for clock and reset unit on RK3506 SoC.
Add clock and reset IDs for RK3506 SoC.

Signed-off-by: Finley Xiao <finley.xiao@rock-chips.com>
Signed-off-by: Elaine Zhang <zhangqing@rock-chips.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251121075350.2564860-2-zhangqing@rock-chips.com
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 .../bindings/clock/rockchip,rk3506-cru.yaml        |  55 ++++
 include/dt-bindings/clock/rockchip,rk3506-cru.h    | 285 +++++++++++++++++++++
 include/dt-bindings/reset/rockchip,rk3506-cru.h    | 211 +++++++++++++++
 3 files changed, 551 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/rockchip,rk3506-cru.yaml
 create mode 100644 include/dt-bindings/clock/rockchip,rk3506-cru.h
 create mode 100644 include/dt-bindings/reset/rockchip,rk3506-cru.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3506-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3506-cru.yaml
new file mode 100644
index 000000000000..ca940475336c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3506-cru.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3506-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3506 Clock and Reset Unit (CRU)
+
+maintainers:
+  - Finley Xiao <finley.xiao@rock-chips.com>
+  - Heiko Stuebner <heiko@sntech.de>
+
+description:
+  The RK3506 CRU generates the clock and also implements reset for SoC
+  peripherals.
+
+properties:
+  compatible:
+    const: rockchip,rk3506-cru
+
+  reg:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 1
+
+  "#reset-cells":
+    const: 1
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    const: xin
+
+required:
+  - compatible
+  - reg
+  - "#clock-cells"
+  - "#reset-cells"
+  - clocks
+  - clock-names
+
+additionalProperties: false
+
+examples:
+  - |
+    clock-controller@ff9a0000 {
+      compatible = "rockchip,rk3506-cru";
+      reg = <0xff9a0000 0x20000>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      clocks = <&xin24m>;
+      clock-names = "xin";
+    };
diff --git a/include/dt-bindings/clock/rockchip,rk3506-cru.h b/include/dt-bindings/clock/rockchip,rk3506-cru.h
new file mode 100644
index 000000000000..71d7dda23cc9
--- /dev/null
+++ b/include/dt-bindings/clock/rockchip,rk3506-cru.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023-2025 Rockchip Electronics Co., Ltd.
+ * Author: Finley Xiao <finley.xiao@rock-chips.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RK3506_H
+#define _DT_BINDINGS_CLK_ROCKCHIP_RK3506_H
+
+/* cru plls */
+#define PLL_GPLL			0
+#define PLL_V0PLL			1
+#define PLL_V1PLL			2
+
+/* cru-clocks indices */
+#define ARMCLK				3
+#define CLK_DDR				4
+#define XIN24M_GATE			5
+#define CLK_GPLL_GATE			6
+#define CLK_V0PLL_GATE			7
+#define CLK_V1PLL_GATE			8
+#define CLK_GPLL_DIV			9
+#define CLK_GPLL_DIV_100M		10
+#define CLK_V0PLL_DIV			11
+#define CLK_V1PLL_DIV			12
+#define CLK_INT_VOICE_MATRIX0		13
+#define CLK_INT_VOICE_MATRIX1		14
+#define CLK_INT_VOICE_MATRIX2		15
+#define CLK_FRAC_UART_MATRIX0_MUX	16
+#define CLK_FRAC_UART_MATRIX1_MUX	17
+#define CLK_FRAC_VOICE_MATRIX0_MUX	18
+#define CLK_FRAC_VOICE_MATRIX1_MUX	19
+#define CLK_FRAC_COMMON_MATRIX0_MUX	20
+#define CLK_FRAC_COMMON_MATRIX1_MUX	21
+#define CLK_FRAC_COMMON_MATRIX2_MUX	22
+#define CLK_FRAC_UART_MATRIX0		23
+#define CLK_FRAC_UART_MATRIX1		24
+#define CLK_FRAC_VOICE_MATRIX0		25
+#define CLK_FRAC_VOICE_MATRIX1		26
+#define CLK_FRAC_COMMON_MATRIX0		27
+#define CLK_FRAC_COMMON_MATRIX1		28
+#define CLK_FRAC_COMMON_MATRIX2		29
+#define CLK_REF_USBPHY_TOP		30
+#define CLK_REF_DPHY_TOP		31
+#define ACLK_CORE_ROOT			32
+#define PCLK_CORE_ROOT			33
+#define PCLK_DBG			34
+#define PCLK_CORE_GRF			35
+#define PCLK_CORE_CRU			36
+#define CLK_CORE_EMA_DETECT		37
+#define CLK_REF_PVTPLL_CORE		38
+#define PCLK_GPIO1			39
+#define DBCLK_GPIO1			40
+#define ACLK_CORE_PERI_ROOT		41
+#define HCLK_CORE_PERI_ROOT		42
+#define PCLK_CORE_PERI_ROOT		43
+#define CLK_DSMC			44
+#define ACLK_DSMC			45
+#define PCLK_DSMC			46
+#define CLK_FLEXBUS_TX			47
+#define CLK_FLEXBUS_RX			48
+#define ACLK_FLEXBUS			49
+#define HCLK_FLEXBUS			50
+#define ACLK_DSMC_SLV			51
+#define HCLK_DSMC_SLV			52
+#define ACLK_BUS_ROOT			53
+#define HCLK_BUS_ROOT			54
+#define PCLK_BUS_ROOT			55
+#define ACLK_SYSRAM			56
+#define HCLK_SYSRAM			57
+#define ACLK_DMAC0			58
+#define ACLK_DMAC1			59
+#define HCLK_M0				60
+#define PCLK_BUS_GRF			61
+#define PCLK_TIMER			62
+#define CLK_TIMER0_CH0			63
+#define CLK_TIMER0_CH1			64
+#define CLK_TIMER0_CH2			65
+#define CLK_TIMER0_CH3			66
+#define CLK_TIMER0_CH4			67
+#define CLK_TIMER0_CH5			68
+#define PCLK_WDT0			69
+#define TCLK_WDT0			70
+#define PCLK_WDT1			71
+#define TCLK_WDT1			72
+#define PCLK_MAILBOX			73
+#define PCLK_INTMUX			74
+#define PCLK_SPINLOCK			75
+#define PCLK_DDRC			76
+#define HCLK_DDRPHY			77
+#define PCLK_DDRMON			78
+#define CLK_DDRMON_OSC			79
+#define PCLK_STDBY			80
+#define HCLK_USBOTG0			81
+#define HCLK_USBOTG0_PMU		82
+#define CLK_USBOTG0_ADP			83
+#define HCLK_USBOTG1			84
+#define HCLK_USBOTG1_PMU		85
+#define CLK_USBOTG1_ADP			86
+#define PCLK_USBPHY			87
+#define ACLK_DMA2DDR			88
+#define PCLK_DMA2DDR			89
+#define STCLK_M0			90
+#define CLK_DDRPHY			91
+#define CLK_DDRC_SRC			92
+#define ACLK_DDRC_0			93
+#define ACLK_DDRC_1			94
+#define CLK_DDRC			95
+#define CLK_DDRMON			96
+#define HCLK_LSPERI_ROOT		97
+#define PCLK_LSPERI_ROOT		98
+#define PCLK_UART0			99
+#define PCLK_UART1			100
+#define PCLK_UART2			101
+#define PCLK_UART3			102
+#define PCLK_UART4			103
+#define SCLK_UART0			104
+#define SCLK_UART1			105
+#define SCLK_UART2			106
+#define SCLK_UART3			107
+#define SCLK_UART4			108
+#define PCLK_I2C0			109
+#define CLK_I2C0			110
+#define PCLK_I2C1			111
+#define CLK_I2C1			112
+#define PCLK_I2C2			113
+#define CLK_I2C2			114
+#define PCLK_PWM1			115
+#define CLK_PWM1			116
+#define CLK_OSC_PWM1			117
+#define CLK_RC_PWM1			118
+#define CLK_FREQ_PWM1			119
+#define CLK_COUNTER_PWM1		120
+#define PCLK_SPI0			121
+#define CLK_SPI0			122
+#define PCLK_SPI1			123
+#define CLK_SPI1			124
+#define PCLK_GPIO2			125
+#define DBCLK_GPIO2			126
+#define PCLK_GPIO3			127
+#define DBCLK_GPIO3			128
+#define PCLK_GPIO4			129
+#define DBCLK_GPIO4			130
+#define HCLK_CAN0			131
+#define CLK_CAN0			132
+#define HCLK_CAN1			133
+#define CLK_CAN1			134
+#define HCLK_PDM			135
+#define MCLK_PDM			136
+#define CLKOUT_PDM			137
+#define MCLK_SPDIFTX			138
+#define HCLK_SPDIFTX			139
+#define HCLK_SPDIFRX			140
+#define MCLK_SPDIFRX			141
+#define MCLK_SAI0			142
+#define HCLK_SAI0			143
+#define MCLK_OUT_SAI0			144
+#define MCLK_SAI1			145
+#define HCLK_SAI1			146
+#define MCLK_OUT_SAI1			147
+#define HCLK_ASRC0			148
+#define CLK_ASRC0			149
+#define HCLK_ASRC1			150
+#define CLK_ASRC1			151
+#define PCLK_CRU			152
+#define PCLK_PMU_ROOT			153
+#define MCLK_ASRC0			154
+#define MCLK_ASRC1			155
+#define MCLK_ASRC2			156
+#define MCLK_ASRC3			157
+#define LRCK_ASRC0_SRC			158
+#define LRCK_ASRC0_DST			159
+#define LRCK_ASRC1_SRC			160
+#define LRCK_ASRC1_DST			161
+#define ACLK_HSPERI_ROOT		162
+#define HCLK_HSPERI_ROOT		163
+#define PCLK_HSPERI_ROOT		164
+#define CCLK_SRC_SDMMC			165
+#define HCLK_SDMMC			166
+#define HCLK_FSPI			167
+#define SCLK_FSPI			168
+#define PCLK_SPI2			169
+#define ACLK_MAC0			170
+#define ACLK_MAC1			171
+#define PCLK_MAC0			172
+#define PCLK_MAC1			173
+#define CLK_MAC_ROOT			174
+#define CLK_MAC0			175
+#define CLK_MAC1			176
+#define MCLK_SAI2			177
+#define HCLK_SAI2			178
+#define MCLK_OUT_SAI2			179
+#define MCLK_SAI3_SRC			180
+#define HCLK_SAI3			181
+#define MCLK_SAI3			182
+#define MCLK_OUT_SAI3			183
+#define MCLK_SAI4_SRC			184
+#define HCLK_SAI4			185
+#define MCLK_SAI4			186
+#define HCLK_DSM			187
+#define MCLK_DSM			188
+#define PCLK_AUDIO_ADC			189
+#define MCLK_AUDIO_ADC			190
+#define MCLK_AUDIO_ADC_DIV4		191
+#define PCLK_SARADC			192
+#define CLK_SARADC			193
+#define PCLK_OTPC_NS			194
+#define CLK_SBPI_OTPC_NS		195
+#define CLK_USER_OTPC_NS		196
+#define PCLK_UART5			197
+#define SCLK_UART5			198
+#define PCLK_GPIO234_IOC		199
+#define CLK_MAC_PTP_ROOT		200
+#define CLK_MAC0_PTP			201
+#define CLK_MAC1_PTP			202
+#define CLK_SPI2			203
+#define ACLK_VIO_ROOT			204
+#define HCLK_VIO_ROOT			205
+#define PCLK_VIO_ROOT			206
+#define HCLK_RGA			207
+#define ACLK_RGA			208
+#define CLK_CORE_RGA			209
+#define ACLK_VOP			210
+#define HCLK_VOP			211
+#define DCLK_VOP			212
+#define PCLK_DPHY			213
+#define PCLK_DSI_HOST			214
+#define PCLK_TSADC			215
+#define CLK_TSADC			216
+#define CLK_TSADC_TSEN			217
+#define PCLK_GPIO1_IOC			218
+#define PCLK_OTPC_S			219
+#define CLK_SBPI_OTPC_S			220
+#define CLK_USER_OTPC_S			221
+#define PCLK_OTP_MASK			222
+#define PCLK_KEYREADER			223
+#define HCLK_BOOTROM			224
+#define PCLK_DDR_SERVICE		225
+#define HCLK_CRYPTO_S			226
+#define HCLK_KEYLAD			227
+#define CLK_CORE_CRYPTO			228
+#define CLK_PKA_CRYPTO			229
+#define CLK_CORE_CRYPTO_S		230
+#define CLK_PKA_CRYPTO_S		231
+#define ACLK_CRYPTO_S			232
+#define HCLK_RNG_S			233
+#define CLK_CORE_CRYPTO_NS		234
+#define CLK_PKA_CRYPTO_NS		235
+#define ACLK_CRYPTO_NS			236
+#define HCLK_CRYPTO_NS			237
+#define HCLK_RNG			238
+#define CLK_PMU				239
+#define PCLK_PMU			240
+#define CLK_PMU_32K			241
+#define PCLK_PMU_CRU			242
+#define PCLK_PMU_GRF			243
+#define PCLK_GPIO0_IOC			244
+#define PCLK_GPIO0			245
+#define DBCLK_GPIO0			246
+#define PCLK_GPIO1_SHADOW		247
+#define DBCLK_GPIO1_SHADOW		248
+#define PCLK_PMU_HP_TIMER		249
+#define CLK_PMU_HP_TIMER		250
+#define CLK_PMU_HP_TIMER_32K		251
+#define PCLK_PWM0			252
+#define CLK_PWM0			253
+#define CLK_OSC_PWM0			254
+#define CLK_RC_PWM0			255
+#define CLK_MAC_OUT			256
+#define CLK_REF_OUT0			257
+#define CLK_REF_OUT1			258
+#define CLK_32K_FRAC			259
+#define CLK_32K_RC			260
+#define CLK_32K				261
+#define CLK_32K_PMU			262
+#define PCLK_TOUCH_KEY			263
+#define CLK_TOUCH_KEY			264
+#define CLK_REF_PHY_PLL			265
+#define CLK_REF_PHY_PMU_MUX		266
+#define CLK_WIFI_OUT			267
+#define CLK_V0PLL_REF			268
+#define CLK_V1PLL_REF			269
+#define CLK_32K_FRAC_MUX		270
+
+#endif
diff --git a/include/dt-bindings/reset/rockchip,rk3506-cru.h b/include/dt-bindings/reset/rockchip,rk3506-cru.h
new file mode 100644
index 000000000000..31c0d4aa410f
--- /dev/null
+++ b/include/dt-bindings/reset/rockchip,rk3506-cru.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023-2025 Rockchip Electronics Co., Ltd.
+ * Author: Finley Xiao <finley.xiao@rock-chips.com>
+ */
+
+#ifndef _DT_BINDINGS_REST_ROCKCHIP_RK3506_H
+#define _DT_BINDINGS_REST_ROCKCHIP_RK3506_H
+
+/* CRU-->SOFTRST_CON00 */
+#define SRST_NCOREPORESET0_AC	0
+#define SRST_NCOREPORESET1_AC	1
+#define SRST_NCOREPORESET2_AC	2
+#define SRST_NCORESET0_AC	3
+#define SRST_NCORESET1_AC	4
+#define SRST_NCORESET2_AC	5
+#define SRST_NL2RESET_AC	6
+#define SRST_A_CORE_BIU_AC	7
+#define SRST_H_M0_AC		8
+
+/* CRU-->SOFTRST_CON02 */
+#define SRST_NDBGRESET		9
+#define SRST_P_CORE_BIU		10
+#define SRST_PMU		11
+
+/* CRU-->SOFTRST_CON03 */
+#define SRST_P_DBG		12
+#define SRST_POT_DBG		13
+#define SRST_P_CORE_GRF		14
+#define SRST_CORE_EMA_DETECT	15
+#define SRST_REF_PVTPLL_CORE	16
+#define SRST_P_GPIO1		17
+#define SRST_DB_GPIO1		18
+
+/* CRU-->SOFTRST_CON04 */
+#define SRST_A_CORE_PERI_BIU	19
+#define SRST_A_DSMC		20
+#define SRST_P_DSMC		21
+#define SRST_FLEXBUS		22
+#define SRST_A_FLEXBUS		23
+#define SRST_H_FLEXBUS		24
+#define SRST_A_DSMC_SLV		25
+#define SRST_H_DSMC_SLV		26
+#define SRST_DSMC_SLV		27
+
+/* CRU-->SOFTRST_CON05 */
+#define SRST_A_BUS_BIU		28
+#define SRST_H_BUS_BIU		29
+#define SRST_P_BUS_BIU		30
+#define SRST_A_SYSRAM		31
+#define SRST_H_SYSRAM		32
+#define SRST_A_DMAC0		33
+#define SRST_A_DMAC1		34
+#define SRST_H_M0		35
+#define SRST_M0_JTAG		36
+#define SRST_H_CRYPTO		37
+
+/* CRU-->SOFTRST_CON06 */
+#define SRST_H_RNG		38
+#define SRST_P_BUS_GRF		39
+#define SRST_P_TIMER0		40
+#define SRST_TIMER0_CH0		41
+#define SRST_TIMER0_CH1		42
+#define SRST_TIMER0_CH2		43
+#define SRST_TIMER0_CH3		44
+#define SRST_TIMER0_CH4		45
+#define SRST_TIMER0_CH5		46
+#define SRST_P_WDT0		47
+#define SRST_T_WDT0		48
+#define SRST_P_WDT1		49
+#define SRST_T_WDT1		50
+#define SRST_P_MAILBOX		51
+#define SRST_P_INTMUX		52
+#define SRST_P_SPINLOCK		53
+
+/* CRU-->SOFTRST_CON07 */
+#define SRST_P_DDRC		54
+#define SRST_H_DDRPHY		55
+#define SRST_P_DDRMON		56
+#define SRST_DDRMON_OSC		57
+#define SRST_P_DDR_LPC		58
+#define SRST_H_USBOTG0		59
+#define SRST_USBOTG0_ADP	60
+#define SRST_H_USBOTG1		61
+#define SRST_USBOTG1_ADP	62
+#define SRST_P_USBPHY		63
+#define SRST_USBPHY_POR		64
+#define SRST_USBPHY_OTG0	65
+#define SRST_USBPHY_OTG1	66
+
+/* CRU-->SOFTRST_CON08 */
+#define SRST_A_DMA2DDR		67
+#define SRST_P_DMA2DDR		68
+
+/* CRU-->SOFTRST_CON09 */
+#define SRST_USBOTG0_UTMI	69
+#define SRST_USBOTG1_UTMI	70
+
+/* CRU-->SOFTRST_CON10 */
+#define SRST_A_DDRC_0		71
+#define SRST_A_DDRC_1		72
+#define SRST_A_DDR_BIU		73
+#define SRST_DDRC		74
+#define SRST_DDRMON		75
+
+/* CRU-->SOFTRST_CON11 */
+#define SRST_H_LSPERI_BIU	76
+#define SRST_P_UART0		77
+#define SRST_P_UART1		78
+#define SRST_P_UART2		79
+#define SRST_P_UART3		80
+#define SRST_P_UART4		81
+#define SRST_UART0		82
+#define SRST_UART1		83
+#define SRST_UART2		84
+#define SRST_UART3		85
+#define SRST_UART4		86
+#define SRST_P_I2C0		87
+#define SRST_I2C0		88
+
+/* CRU-->SOFTRST_CON12 */
+#define SRST_P_I2C1		89
+#define SRST_I2C1		90
+#define SRST_P_I2C2		91
+#define SRST_I2C2		92
+#define SRST_P_PWM1		93
+#define SRST_PWM1		94
+#define SRST_P_SPI0		95
+#define SRST_SPI0		96
+#define SRST_P_SPI1		97
+#define SRST_SPI1		98
+#define SRST_P_GPIO2		99
+#define SRST_DB_GPIO2		100
+
+/* CRU-->SOFTRST_CON13 */
+#define SRST_P_GPIO3		101
+#define SRST_DB_GPIO3		102
+#define SRST_P_GPIO4		103
+#define SRST_DB_GPIO4		104
+#define SRST_H_CAN0		105
+#define SRST_CAN0		106
+#define SRST_H_CAN1		107
+#define SRST_CAN1		108
+#define SRST_H_PDM		109
+#define SRST_M_PDM		110
+#define SRST_PDM		111
+#define SRST_SPDIFTX		112
+#define SRST_H_SPDIFTX		113
+#define SRST_H_SPDIFRX		114
+#define SRST_SPDIFRX		115
+#define SRST_M_SAI0		116
+
+/* CRU-->SOFTRST_CON14 */
+#define SRST_H_SAI0		117
+#define SRST_M_SAI1		118
+#define SRST_H_SAI1		119
+#define SRST_H_ASRC0		120
+#define SRST_ASRC0		121
+#define SRST_H_ASRC1		122
+#define SRST_ASRC1		123
+
+/* CRU-->SOFTRST_CON17 */
+#define SRST_H_HSPERI_BIU	124
+#define SRST_H_SDMMC		125
+#define SRST_H_FSPI		126
+#define SRST_S_FSPI		127
+#define SRST_P_SPI2		128
+#define SRST_A_MAC0		129
+#define SRST_A_MAC1		130
+
+/* CRU-->SOFTRST_CON18 */
+#define SRST_M_SAI2		131
+#define SRST_H_SAI2		132
+#define SRST_H_SAI3		133
+#define SRST_M_SAI3		134
+#define SRST_H_SAI4		135
+#define SRST_M_SAI4		136
+#define SRST_H_DSM		137
+#define SRST_M_DSM		138
+#define SRST_P_AUDIO_ADC	139
+#define SRST_M_AUDIO_ADC	140
+
+/* CRU-->SOFTRST_CON19 */
+#define SRST_P_SARADC		141
+#define SRST_SARADC		142
+#define SRST_SARADC_PHY		143
+#define SRST_P_OTPC_NS		144
+#define SRST_SBPI_OTPC_NS	145
+#define SRST_USER_OTPC_NS	146
+#define SRST_P_UART5		147
+#define SRST_UART5		148
+#define SRST_P_GPIO234_IOC	149
+
+/* CRU-->SOFTRST_CON21 */
+#define SRST_A_VIO_BIU		150
+#define SRST_H_VIO_BIU		151
+#define SRST_H_RGA		152
+#define SRST_A_RGA		153
+#define SRST_CORE_RGA		154
+#define SRST_A_VOP		155
+#define SRST_H_VOP		156
+#define SRST_VOP		157
+#define SRST_P_DPHY		158
+#define SRST_P_DSI_HOST		159
+#define SRST_P_TSADC		160
+#define SRST_TSADC		161
+
+/* CRU-->SOFTRST_CON22 */
+#define SRST_P_GPIO1_IOC	162
+
+#endif
-- 
cgit v1.2.3


From 0f1f9b5e47cec229dc2127481807823b75e933b0 Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Thu, 20 Nov 2025 17:15:15 +0200
Subject: RDMA/core: Add new IB rate for XDR (8x) support

Add the new rates as defined in the Infiniband spec for XDR and 8x
link width support.

Furthermore, modify the utility conversion methods accordingly.

Reference: IB Spec Release 1.8

Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Link: https://patch.msgid.link/20251120-speed-8-v1-1-e6a7efef8cb8@nvidia.com
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/verbs.c | 3 +++
 include/rdma/ib_verbs.h         | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3a5f81402d2f..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
 	case IB_RATE_400_GBPS: return 160;
 	case IB_RATE_600_GBPS: return 240;
 	case IB_RATE_800_GBPS: return 320;
+	case IB_RATE_1600_GBPS: return 640;
 	default:	       return  -1;
 	}
 }
@@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
 	case 160: return IB_RATE_400_GBPS;
 	case 240: return IB_RATE_600_GBPS;
 	case 320: return IB_RATE_800_GBPS;
+	case 640: return IB_RATE_1600_GBPS;
 	default:  return IB_RATE_PORT_CURRENT;
 	}
 }
@@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
 	case IB_RATE_400_GBPS: return 425000;
 	case IB_RATE_600_GBPS: return 637500;
 	case IB_RATE_800_GBPS: return 850000;
+	case IB_RATE_1600_GBPS: return 1700000;
 	default:	       return -1;
 	}
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0a85af610b6b..6aad66bc5dd7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -859,6 +859,7 @@ enum ib_rate {
 	IB_RATE_400_GBPS = 21,
 	IB_RATE_600_GBPS = 22,
 	IB_RATE_800_GBPS = 23,
+	IB_RATE_1600_GBPS = 25,
 };
 
 /**
-- 
cgit v1.2.3


From e950d1f84d3c16e86dd1b6066c3ac3958099fa79 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Wed, 19 Nov 2025 15:37:56 +0100
Subject: s390/percpu: Get rid of ARCH_MODULE_NEEDS_WEAK_PER_CPU

Since the rework of the kernel virtual address space [1] the module area
and the kernel image are within the same 4GB area. Therefore there is no
need for the weak per cpu workaround for modules anymore. Remove it.

[1] commit c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces")

Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/Kconfig              | 1 -
 arch/s390/include/asm/percpu.h | 8 --------
 include/linux/percpu-defs.h    | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9914db771e45..cb143bf782f8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -140,7 +140,6 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-	select ARCH_MODULE_NEEDS_WEAK_PER_CPU
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 965886dfe954..5899f57f17d1 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -12,14 +12,6 @@
  */
 #define __my_cpu_offset get_lowcore()->percpu_offset
 
-/*
- * For 64 bit module code, the module may be more than 4G above the
- * per cpu area, use weak definitions to force the compiler to
- * generate external references.
- * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
- * in the Kconfig.
- */
-
 /*
  * We use a compare-and-swap loop since that uses less cpu cycles than
  * disabling and enabling interrupts like the generic variant would do.
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 12d90360f6db..43c854a273c3 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -52,7 +52,7 @@
 	__section(".discard") __attribute__((unused))
 
 /*
- * s390 and alpha modules require percpu variables to be defined as
+ * alpha modules require percpu variables to be defined as
  * weak to force the compiler to generate GOT based external
  * references for them.  This is necessary because percpu sections
  * will be located outside of the usually addressable area.
-- 
cgit v1.2.3


From 385aab8fccd7a8746b9f1a17f3c1e38498a14bc7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 8 Oct 2025 12:41:48 +0200
Subject: wifi: mt76: wed: use proper wed reference in mt76 wed driver
 callbacks

The MT7996 driver can use both the wed and wed_hif2 devices to offload
traffic from/to the wireless NIC. The current codebase assumes that the
primary wed device is always the one used in wed callbacks, resulting in
the following crash if the hw runs wed_hif2 (e.g. 6GHz link).

[  297.455876] Unable to handle kernel read from unreadable memory at virtual address 000000000000080a
[  297.464928] Mem abort info:
[  297.467722]   ESR = 0x0000000096000005
[  297.471461]   EC = 0x25: DABT (current EL), IL = 32 bits
[  297.476766]   SET = 0, FnV = 0
[  297.479809]   EA = 0, S1PTW = 0
[  297.482940]   FSC = 0x05: level 1 translation fault
[  297.487809] Data abort info:
[  297.490679]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[  297.496156]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[  297.501196]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[  297.506500] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000107480000
[  297.512927] [000000000000080a] pgd=08000001097fb003, p4d=08000001097fb003, pud=08000001097fb003, pmd=0000000000000000
[  297.523532] Internal error: Oops: 0000000096000005 [#1] SMP
[  297.715393] CPU: 2 UID: 0 PID: 45 Comm: kworker/u16:2 Tainted: G           O       6.12.50 #0
[  297.723908] Tainted: [O]=OOT_MODULE
[  297.727384] Hardware name: Banana Pi BPI-R4 (2x SFP+) (DT)
[  297.732857] Workqueue: nf_ft_offload_del nf_flow_rule_route_ipv6 [nf_flow_table]
[  297.740254] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[  297.747205] pc : mt76_wed_offload_disable+0x64/0xa0 [mt76]
[  297.752688] lr : mtk_wed_flow_remove+0x58/0x80
[  297.757126] sp : ffffffc080fe3ae0
[  297.760430] x29: ffffffc080fe3ae0 x28: ffffffc080fe3be0 x27: 00000000deadbef7
[  297.767557] x26: ffffff80c5ebca00 x25: 0000000000000001 x24: ffffff80c85f4c00
[  297.774683] x23: ffffff80c1875b78 x22: ffffffc080d42cd0 x21: ffffffc080660018
[  297.781809] x20: ffffff80c6a076d0 x19: ffffff80c6a043c8 x18: 0000000000000000
[  297.788935] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000000
[  297.796060] x14: 0000000000000019 x13: ffffff80c0ad8ec0 x12: 00000000fa83b2da
[  297.803185] x11: ffffff80c02700c0 x10: ffffff80c0ad8ec0 x9 : ffffff81fef96200
[  297.810311] x8 : ffffff80c02700c0 x7 : ffffff80c02700d0 x6 : 0000000000000002
[  297.817435] x5 : 0000000000000400 x4 : 0000000000000000 x3 : 0000000000000000
[  297.824561] x2 : 0000000000000001 x1 : 0000000000000800 x0 : ffffff80c6a063c8
[  297.831686] Call trace:
[  297.834123]  mt76_wed_offload_disable+0x64/0xa0 [mt76]
[  297.839254]  mtk_wed_flow_remove+0x58/0x80
[  297.843342]  mtk_flow_offload_cmd+0x434/0x574
[  297.847689]  mtk_wed_setup_tc_block_cb+0x30/0x40
[  297.852295]  nf_flow_offload_ipv6_hook+0x7f4/0x964 [nf_flow_table]
[  297.858466]  nf_flow_rule_route_ipv6+0x438/0x4a4 [nf_flow_table]
[  297.864463]  process_one_work+0x174/0x300
[  297.868465]  worker_thread+0x278/0x430
[  297.872204]  kthread+0xd8/0xdc
[  297.875251]  ret_from_fork+0x10/0x20
[  297.878820] Code: 928b5ae0 8b000273 91400a60 f943fa61 (79401421)
[  297.884901] ---[ end trace 0000000000000000 ]---

Fix the issue by detecting the proper wed reference to use when running
wed callbacks.

Fixes: 83eafc9251d6 ("wifi: mt76: mt7996: add wed tx support")
Tested-by: Daniel Pawlik <pawlik.dan@gmail.com>
Tested-by: Matteo Croce <teknoraver@meta.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251008-wed-fixes-v1-1-8f7678583385@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 drivers/net/wireless/mediatek/mt76/mt76.h        |  9 +++++++++
 drivers/net/wireless/mediatek/mt76/mt7996/mmio.c |  1 +
 drivers/net/wireless/mediatek/mt76/wed.c         | 10 +++++-----
 include/linux/soc/mediatek/mtk_wed.h             |  1 +
 4 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 86b812f68c97..e0a036b03b03 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -1260,6 +1260,15 @@ static inline int mt76_wed_dma_setup(struct mt76_dev *dev, struct mt76_queue *q,
 #define mt76_dereference(p, dev) \
 	rcu_dereference_protected(p, lockdep_is_held(&(dev)->mutex))
 
+static inline struct mt76_dev *mt76_wed_to_dev(struct mtk_wed_device *wed)
+{
+#ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	if (wed->wlan.hif2)
+		return container_of(wed, struct mt76_dev, mmio.wed_hif2);
+#endif /* CONFIG_NET_MEDIATEK_SOC_WED */
+	return container_of(wed, struct mt76_dev, mmio.wed);
+}
+
 static inline struct mt76_wcid *
 __mt76_wcid_ptr(struct mt76_dev *dev, u16 idx)
 {
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c
index d14b626ee511..80db102ed809 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c
@@ -595,6 +595,7 @@ int mt7996_mmio_wed_init(struct mt7996_dev *dev, void *pdev_ptr,
 
 	wed->wlan.nbuf = MT7996_HW_TOKEN_SIZE;
 	wed->wlan.token_start = MT7996_TOKEN_SIZE - wed->wlan.nbuf;
+	wed->wlan.hif2 = hif2;
 
 	wed->wlan.amsdu_max_subframes = 8;
 	wed->wlan.amsdu_max_len = 1536;
diff --git a/drivers/net/wireless/mediatek/mt76/wed.c b/drivers/net/wireless/mediatek/mt76/wed.c
index 907a8e43e72a..fbd7e59c73aa 100644
--- a/drivers/net/wireless/mediatek/mt76/wed.c
+++ b/drivers/net/wireless/mediatek/mt76/wed.c
@@ -8,7 +8,7 @@
 
 void mt76_wed_release_rx_buf(struct mtk_wed_device *wed)
 {
-	struct mt76_dev *dev = container_of(wed, struct mt76_dev, mmio.wed);
+	struct mt76_dev *dev = mt76_wed_to_dev(wed);
 	int i;
 
 	for (i = 0; i < dev->rx_token_size; i++) {
@@ -31,8 +31,8 @@ EXPORT_SYMBOL_GPL(mt76_wed_release_rx_buf);
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 u32 mt76_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 {
-	struct mt76_dev *dev = container_of(wed, struct mt76_dev, mmio.wed);
 	struct mtk_wed_bm_desc *desc = wed->rx_buf_ring.desc;
+	struct mt76_dev *dev = mt76_wed_to_dev(wed);
 	struct mt76_queue *q = &dev->q_rx[MT_RXQ_MAIN];
 	struct mt76_txwi_cache *t = NULL;
 	int i;
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(mt76_wed_init_rx_buf);
 
 int mt76_wed_offload_enable(struct mtk_wed_device *wed)
 {
-	struct mt76_dev *dev = container_of(wed, struct mt76_dev, mmio.wed);
+	struct mt76_dev *dev = mt76_wed_to_dev(wed);
 
 	spin_lock_bh(&dev->token_lock);
 	dev->token_size = wed->wlan.token_start;
@@ -164,7 +164,7 @@ EXPORT_SYMBOL_GPL(mt76_wed_dma_setup);
 
 void mt76_wed_offload_disable(struct mtk_wed_device *wed)
 {
-	struct mt76_dev *dev = container_of(wed, struct mt76_dev, mmio.wed);
+	struct mt76_dev *dev = mt76_wed_to_dev(wed);
 
 	spin_lock_bh(&dev->token_lock);
 	dev->token_size = dev->drv->token_size;
@@ -174,7 +174,7 @@ EXPORT_SYMBOL_GPL(mt76_wed_offload_disable);
 
 void mt76_wed_reset_complete(struct mtk_wed_device *wed)
 {
-	struct mt76_dev *dev = container_of(wed, struct mt76_dev, mmio.wed);
+	struct mt76_dev *dev = mt76_wed_to_dev(wed);
 
 	complete(&dev->mmio.wed_reset_complete);
 }
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index c4ff6bab176d..3fa93bd65004 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -154,6 +154,7 @@ struct mtk_wed_device {
 		bool wcid_512;
 		bool hw_rro;
 		bool msi;
+		bool hif2;
 
 		u16 token_start;
 		unsigned int nbuf;
-- 
cgit v1.2.3


From 7fb554b1b623c7da845521604bd05fa9570d07bc Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 17 Oct 2025 10:50:32 +0200
Subject: wifi: mt76: Introduce the NPU generic layer

Add the NPU generic layer in mt76 module. NPU will be used to enable
traffic forward offloading between the MT76 NIC and the Airoha ethernet one
available on the Airoha EN7581 SoC using Netfilter Flowtable APIs.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-4-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 drivers/net/wireless/mediatek/mt76/Kconfig    |   4 +
 drivers/net/wireless/mediatek/mt76/Makefile   |   1 +
 drivers/net/wireless/mediatek/mt76/dma.c      |  41 ++-
 drivers/net/wireless/mediatek/mt76/dma.h      |  36 ++
 drivers/net/wireless/mediatek/mt76/mac80211.c |   6 +-
 drivers/net/wireless/mediatek/mt76/mt76.h     | 135 +++++++
 drivers/net/wireless/mediatek/mt76/npu.c      | 501 ++++++++++++++++++++++++++
 include/linux/soc/airoha/airoha_offload.h     |   1 +
 8 files changed, 718 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/wireless/mediatek/mt76/npu.c

(limited to 'include')

diff --git a/drivers/net/wireless/mediatek/mt76/Kconfig b/drivers/net/wireless/mediatek/mt76/Kconfig
index d0aff94075d6..502303622a53 100644
--- a/drivers/net/wireless/mediatek/mt76/Kconfig
+++ b/drivers/net/wireless/mediatek/mt76/Kconfig
@@ -37,6 +37,10 @@ config MT792x_USB
 	tristate
 	select MT76_USB
 
+config MT76_NPU
+	bool
+	depends on MT76_CORE
+
 source "drivers/net/wireless/mediatek/mt76/mt76x0/Kconfig"
 source "drivers/net/wireless/mediatek/mt76/mt76x2/Kconfig"
 source "drivers/net/wireless/mediatek/mt76/mt7603/Kconfig"
diff --git a/drivers/net/wireless/mediatek/mt76/Makefile b/drivers/net/wireless/mediatek/mt76/Makefile
index 53e7bcefe770..1d42adfe8030 100644
--- a/drivers/net/wireless/mediatek/mt76/Makefile
+++ b/drivers/net/wireless/mediatek/mt76/Makefile
@@ -12,6 +12,7 @@ mt76-y := \
 	mmio.o util.o trace.o dma.o mac80211.o debugfs.o eeprom.o \
 	tx.o agg-rx.o mcu.o wed.o scan.o channel.o
 
+mt76-$(CONFIG_MT76_NPU) += npu.o
 mt76-$(CONFIG_PCI) += pci.o
 mt76-$(CONFIG_NL80211_TESTMODE) += testmode.o
 
diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index a4c1b6c66488..f240016ed9f0 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -189,10 +189,15 @@ static void
 mt76_dma_sync_idx(struct mt76_dev *dev, struct mt76_queue *q)
 {
 	Q_WRITE(q, desc_base, q->desc_dma);
-	if (q->flags & MT_QFLAG_WED_RRO_EN)
+	if ((q->flags & MT_QFLAG_WED_RRO_EN) && !mt76_npu_device_active(dev))
 		Q_WRITE(q, ring_size, MT_DMA_RRO_EN | q->ndesc);
 	else
 		Q_WRITE(q, ring_size, q->ndesc);
+
+	if (mt76_queue_is_npu_tx(q)) {
+		writel(q->desc_dma, &q->regs->desc_base);
+		writel(q->ndesc, &q->regs->ring_size);
+	}
 	q->head = Q_READ(q, dma_idx);
 	q->tail = q->head;
 }
@@ -204,7 +209,7 @@ void mt76_dma_queue_reset(struct mt76_dev *dev, struct mt76_queue *q,
 		return;
 
 	if (!mt76_queue_is_wed_rro_ind(q) &&
-	    !mt76_queue_is_wed_rro_rxdmad_c(q)) {
+	    !mt76_queue_is_wed_rro_rxdmad_c(q) && !mt76_queue_is_npu(q)) {
 		int i;
 
 		/* clear descriptors */
@@ -415,6 +420,7 @@ mt76_dma_tx_cleanup(struct mt76_dev *dev, struct mt76_queue *q, bool flush)
 
 	while (q->queued > 0 && q->tail != last) {
 		mt76_dma_tx_cleanup_idx(dev, q, q->tail, &entry);
+		mt76_npu_txdesc_cleanup(q, q->tail);
 		mt76_queue_tx_complete(dev, q, &entry);
 
 		if (entry.txwi) {
@@ -649,6 +655,10 @@ mt76_dma_tx_queue_skb(struct mt76_phy *phy, struct mt76_queue *q,
 	if (test_bit(MT76_RESET, &phy->state))
 		goto free_skb;
 
+	/* TODO: Take into account unlinear skbs */
+	if (mt76_npu_device_active(dev) && skb_linearize(skb))
+		goto free_skb;
+
 	t = mt76_get_txwi(dev);
 	if (!t)
 		goto free_skb;
@@ -696,6 +706,9 @@ mt76_dma_tx_queue_skb(struct mt76_phy *phy, struct mt76_queue *q,
 	if (ret < 0)
 		goto unmap;
 
+	if (mt76_npu_device_active(dev))
+		return mt76_npu_dma_add_buf(phy, q, skb, &tx_info.buf[1], txwi);
+
 	return mt76_dma_add_buf(dev, q, tx_info.buf, tx_info.nbuf,
 				tx_info.info, tx_info.skb, t);
 
@@ -796,8 +809,15 @@ mt76_dma_alloc_queue(struct mt76_dev *dev, struct mt76_queue *q,
 	q->hw_idx = idx;
 	q->dev = dev;
 
-	size = mt76_queue_is_wed_rro_ind(q) ? sizeof(struct mt76_wed_rro_desc)
-					    : sizeof(struct mt76_desc);
+	if (mt76_queue_is_wed_rro_ind(q))
+		size = sizeof(struct mt76_wed_rro_desc);
+	else if (mt76_queue_is_npu_tx(q))
+		size = sizeof(struct airoha_npu_tx_dma_desc);
+	else if (mt76_queue_is_npu_rx(q))
+		size = sizeof(struct airoha_npu_rx_dma_desc);
+	else
+		size = sizeof(struct mt76_desc);
+
 	q->desc = dmam_alloc_coherent(dev->dma_dev, q->ndesc * size,
 				      &q->desc_dma, GFP_KERNEL);
 	if (!q->desc)
@@ -813,6 +833,7 @@ mt76_dma_alloc_queue(struct mt76_dev *dev, struct mt76_queue *q,
 	if (ret)
 		return ret;
 
+	mt76_npu_queue_setup(dev, q);
 	ret = mt76_wed_dma_setup(dev, q, false);
 	if (ret)
 		return ret;
@@ -840,6 +861,11 @@ mt76_dma_rx_cleanup(struct mt76_dev *dev, struct mt76_queue *q)
 	if (!q->ndesc)
 		return;
 
+	if (mt76_queue_is_npu(q)) {
+		mt76_npu_queue_cleanup(dev, q);
+		return;
+	}
+
 	do {
 		spin_lock_bh(&q->lock);
 		buf = mt76_dma_dequeue(dev, q, true, NULL, NULL, &more, NULL);
@@ -870,7 +896,7 @@ mt76_dma_rx_reset(struct mt76_dev *dev, enum mt76_rxq_id qid)
 		return;
 
 	if (!mt76_queue_is_wed_rro_ind(q) &&
-	    !mt76_queue_is_wed_rro_rxdmad_c(q)) {
+	    !mt76_queue_is_wed_rro_rxdmad_c(q) && !mt76_queue_is_npu(q)) {
 		int i;
 
 		for (i = 0; i < q->ndesc; i++)
@@ -890,7 +916,10 @@ mt76_dma_rx_reset(struct mt76_dev *dev, enum mt76_rxq_id qid)
 		return;
 
 	mt76_dma_sync_idx(dev, q);
-	mt76_dma_rx_fill_buf(dev, q, false);
+	if (mt76_queue_is_npu(q))
+		mt76_npu_fill_rx_queue(dev, q);
+	else
+		mt76_dma_rx_fill(dev, q, false);
 }
 
 static void
diff --git a/drivers/net/wireless/mediatek/mt76/dma.h b/drivers/net/wireless/mediatek/mt76/dma.h
index 19bc768913ff..4a63de6c5bf5 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.h
+++ b/drivers/net/wireless/mediatek/mt76/dma.h
@@ -70,6 +70,42 @@
 		writel(_val, &(_q)->regs->_field);			\
 } while (0)
 
+#elif IS_ENABLED(CONFIG_MT76_NPU)
+
+#define Q_READ(_q, _field) ({						\
+	u32 _offset = offsetof(struct mt76_queue_regs, _field);		\
+	u32 _val = 0;							\
+	if ((_q)->flags & MT_QFLAG_NPU) {				\
+		struct airoha_npu *npu;					\
+									\
+		rcu_read_lock();					\
+		npu = rcu_dereference((_q)->dev->mmio.npu);		\
+		if (npu)						\
+			regmap_read(npu->regmap,			\
+				    ((_q)->wed_regs + _offset), &_val);	\
+		rcu_read_unlock();					\
+	} else {							\
+		_val = readl(&(_q)->regs->_field);			\
+	}								\
+	_val;								\
+})
+
+#define Q_WRITE(_q, _field, _val)	do {				\
+	u32 _offset = offsetof(struct mt76_queue_regs, _field);		\
+	if ((_q)->flags & MT_QFLAG_NPU) {				\
+		struct airoha_npu *npu;					\
+									\
+		rcu_read_lock();					\
+		npu = rcu_dereference((_q)->dev->mmio.npu);		\
+		if (npu)						\
+			regmap_write(npu->regmap,			\
+				     ((_q)->wed_regs + _offset), _val);	\
+		rcu_read_unlock();					\
+	} else {							\
+		writel(_val, &(_q)->regs->_field);			\
+	}								\
+} while (0)
+
 #else
 
 #define Q_READ(_q, _field)		readl(&(_q)->regs->_field)
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 4be47aea2253..72ed3c825fa6 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -630,6 +630,8 @@ int mt76_create_page_pool(struct mt76_dev *dev, struct mt76_queue *q)
 	case MT_RXQ_MAIN:
 	case MT_RXQ_BAND1:
 	case MT_RXQ_BAND2:
+	case MT_RXQ_NPU0:
+	case MT_RXQ_NPU1:
 		pp_params.pool_size = 256;
 		break;
 	default:
@@ -814,6 +816,7 @@ void mt76_free_device(struct mt76_dev *dev)
 		destroy_workqueue(dev->wq);
 		dev->wq = NULL;
 	}
+	mt76_npu_deinit(dev);
 	ieee80211_free_hw(dev->hw);
 }
 EXPORT_SYMBOL_GPL(mt76_free_device);
@@ -1553,7 +1556,8 @@ void mt76_rx_poll_complete(struct mt76_dev *dev, enum mt76_rxq_id q,
 
 	while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL) {
 		mt76_check_sta(dev, skb);
-		if (mtk_wed_device_active(&dev->mmio.wed))
+		if (mtk_wed_device_active(&dev->mmio.wed) ||
+		    mt76_npu_device_active(dev))
 			__skb_queue_tail(&frames, skb);
 		else
 			mt76_rx_aggr_reorder(skb, &frames);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index a66b89bc4b4a..d05e83ea1cac 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -13,6 +13,7 @@
 #include <linux/leds.h>
 #include <linux/usb.h>
 #include <linux/average.h>
+#include <linux/soc/airoha/airoha_offload.h>
 #include <linux/soc/mediatek/mtk_wed.h>
 #include <net/mac80211.h>
 #include <net/page_pool/helpers.h>
@@ -34,6 +35,7 @@
 #define MT_QFLAG_WED_RRO	BIT(6)
 #define MT_QFLAG_WED_RRO_EN	BIT(7)
 #define MT_QFLAG_EMI_EN		BIT(8)
+#define MT_QFLAG_NPU		BIT(9)
 
 #define __MT_WED_Q(_type, _n)	(MT_QFLAG_WED | \
 				 FIELD_PREP(MT_QFLAG_WED_TYPE, _type) | \
@@ -48,6 +50,12 @@
 #define MT_WED_RRO_Q_IND	__MT_WED_RRO_Q(MT76_WED_RRO_Q_IND, 0)
 #define MT_WED_RRO_Q_RXDMAD_C	__MT_WED_RRO_Q(MT76_WED_RRO_Q_RXDMAD_C, 0)
 
+#define __MT_NPU_Q(_type, _n)	(MT_QFLAG_NPU | \
+				 FIELD_PREP(MT_QFLAG_WED_TYPE, _type) | \
+				 FIELD_PREP(MT_QFLAG_WED_RING, _n))
+#define MT_NPU_Q_TX(_n)		__MT_NPU_Q(MT76_WED_Q_TX, _n)
+#define MT_NPU_Q_RX(_n)		__MT_NPU_Q(MT76_WED_Q_RX, _n)
+
 struct mt76_dev;
 struct mt76_phy;
 struct mt76_wcid;
@@ -139,6 +147,8 @@ enum mt76_rxq_id {
 	MT_RXQ_TXFREE_BAND2,
 	MT_RXQ_RRO_IND,
 	MT_RXQ_RRO_RXDMAD_C,
+	MT_RXQ_NPU0,
+	MT_RXQ_NPU1,
 	__MT_RXQ_MAX
 };
 
@@ -707,6 +717,11 @@ struct mt76_mmio {
 	struct mtk_wed_device wed_hif2;
 	struct completion wed_reset;
 	struct completion wed_reset_complete;
+
+	struct airoha_ppe_dev __rcu *ppe_dev;
+	struct airoha_npu __rcu *npu;
+	phys_addr_t phy_addr;
+	int npu_type;
 };
 
 struct mt76_rx_status {
@@ -1617,6 +1632,109 @@ int mt76_testmode_dump(struct ieee80211_hw *hw, struct sk_buff *skb,
 int mt76_testmode_set_state(struct mt76_phy *phy, enum mt76_testmode_state state);
 int mt76_testmode_alloc_skb(struct mt76_phy *phy, u32 len);
 
+#ifdef CONFIG_MT76_NPU
+void mt76_npu_check_ppe(struct mt76_dev *dev, struct sk_buff *skb,
+			u32 info);
+int mt76_npu_dma_add_buf(struct mt76_phy *phy, struct mt76_queue *q,
+			 struct sk_buff *skb, struct mt76_queue_buf *buf,
+			 void *txwi_ptr);
+int mt76_npu_rx_queue_init(struct mt76_dev *dev, struct mt76_queue *q);
+int mt76_npu_fill_rx_queue(struct mt76_dev *dev, struct mt76_queue *q);
+void mt76_npu_queue_cleanup(struct mt76_dev *dev, struct mt76_queue *q);
+void mt76_npu_disable_irqs(struct mt76_dev *dev);
+int mt76_npu_init(struct mt76_dev *dev, phys_addr_t phy_addr, int type);
+void mt76_npu_deinit(struct mt76_dev *dev);
+void mt76_npu_queue_setup(struct mt76_dev *dev, struct mt76_queue *q);
+void mt76_npu_txdesc_cleanup(struct mt76_queue *q, int index);
+int mt76_npu_net_setup_tc(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			  struct net_device *dev, enum tc_setup_type type,
+			  void *type_data);
+#else
+static inline void mt76_npu_check_ppe(struct mt76_dev *dev,
+				      struct sk_buff *skb, u32 info)
+{
+}
+
+static inline int mt76_npu_dma_add_buf(struct mt76_phy *phy,
+				       struct mt76_queue *q,
+				       struct sk_buff *skb,
+				       struct mt76_queue_buf *buf,
+				       void *txwi_ptr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int mt76_npu_fill_rx_queue(struct mt76_dev *dev,
+					 struct mt76_queue *q)
+{
+	return 0;
+}
+
+static inline void mt76_npu_queue_cleanup(struct mt76_dev *dev,
+					  struct mt76_queue *q)
+{
+}
+
+static inline void mt76_npu_disable_irqs(struct mt76_dev *dev)
+{
+}
+
+static inline int mt76_npu_init(struct mt76_dev *dev, phys_addr_t phy_addr,
+				int type)
+{
+	return 0;
+}
+
+static inline void mt76_npu_deinit(struct mt76_dev *dev)
+{
+}
+
+static inline void mt76_npu_queue_setup(struct mt76_dev *dev,
+					struct mt76_queue *q)
+{
+}
+
+static inline void mt76_npu_txdesc_cleanup(struct mt76_queue *q,
+					   int index)
+{
+}
+
+static inline int mt76_npu_net_setup_tc(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					struct net_device *dev,
+					enum tc_setup_type type,
+					void *type_data)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_MT76_NPU */
+
+static inline bool mt76_npu_device_active(struct mt76_dev *dev)
+{
+	return !!rcu_access_pointer(dev->mmio.npu);
+}
+
+static inline bool mt76_ppe_device_active(struct mt76_dev *dev)
+{
+	return !!rcu_access_pointer(dev->mmio.ppe_dev);
+}
+
+static inline int mt76_npu_send_msg(struct airoha_npu *npu, int ifindex,
+				    enum airoha_npu_wlan_set_cmd cmd,
+				    u32 val, gfp_t gfp)
+{
+	return airoha_npu_wlan_send_msg(npu, ifindex, cmd, &val, sizeof(val),
+					gfp);
+}
+
+static inline int mt76_npu_get_msg(struct airoha_npu *npu, int ifindex,
+				   enum airoha_npu_wlan_get_cmd cmd,
+				   u32 *val, gfp_t gfp)
+{
+	return airoha_npu_wlan_get_msg(npu, ifindex, cmd, val, sizeof(*val),
+				       gfp);
+}
+
 static inline void mt76_testmode_reset(struct mt76_phy *phy, bool disable)
 {
 #ifdef CONFIG_NL80211_TESTMODE
@@ -1858,6 +1976,23 @@ static inline bool mt76_queue_is_emi(struct mt76_queue *q)
 	return q->flags & MT_QFLAG_EMI_EN;
 }
 
+static inline bool mt76_queue_is_npu(struct mt76_queue *q)
+{
+	return q->flags & MT_QFLAG_NPU;
+}
+
+static inline bool mt76_queue_is_npu_tx(struct mt76_queue *q)
+{
+	return mt76_queue_is_npu(q) &&
+	       FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_TX;
+}
+
+static inline bool mt76_queue_is_npu_rx(struct mt76_queue *q)
+{
+	return mt76_queue_is_npu(q) &&
+	       FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX;
+}
+
 struct mt76_txwi_cache *
 mt76_token_release(struct mt76_dev *dev, int token, bool *wake);
 int mt76_token_consume(struct mt76_dev *dev, struct mt76_txwi_cache **ptxwi);
diff --git a/drivers/net/wireless/mediatek/mt76/npu.c b/drivers/net/wireless/mediatek/mt76/npu.c
new file mode 100644
index 000000000000..ec36975f6dc9
--- /dev/null
+++ b/drivers/net/wireless/mediatek/mt76/npu.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025 AIROHA Inc
+ * Author: Lorenzo Bianconi <lorenzo@kernel.org>
+ */
+#include <linux/kernel.h>
+#include <net/flow_offload.h>
+#include <net/pkt_cls.h>
+
+#include "mt76.h"
+#include "dma.h"
+#include "mt76_connac.h"
+
+#define MT76_NPU_RX_BUF_SIZE	(1800 + \
+				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+int mt76_npu_fill_rx_queue(struct mt76_dev *dev, struct mt76_queue *q)
+{
+	int nframes = 0;
+
+	while (q->queued < q->ndesc - 1) {
+		struct airoha_npu_rx_dma_desc *desc = (void *)q->desc;
+		struct mt76_queue_entry *e = &q->entry[q->head];
+		struct page *page;
+		int offset;
+
+		e->buf = mt76_get_page_pool_buf(q, &offset, q->buf_size);
+		if (!e->buf)
+			break;
+
+		e->dma_len[0] = SKB_WITH_OVERHEAD(q->buf_size);
+		page = virt_to_head_page(e->buf);
+		e->dma_addr[0] = page_pool_get_dma_addr(page) + offset;
+
+		memset(&desc[q->head], 0, sizeof(*desc));
+		desc[q->head].addr = e->dma_addr[0];
+
+		q->head = (q->head + 1) % q->ndesc;
+		q->queued++;
+		nframes++;
+	}
+
+	return nframes;
+}
+
+void mt76_npu_queue_cleanup(struct mt76_dev *dev, struct mt76_queue *q)
+{
+	spin_lock_bh(&q->lock);
+	while (q->queued > 0) {
+		struct mt76_queue_entry *e = &q->entry[q->tail];
+
+		dma_sync_single_for_cpu(dev->dma_dev, e->dma_addr[0],
+					e->dma_len[0],
+					page_pool_get_dma_dir(q->page_pool));
+		mt76_put_page_pool_buf(e->buf, false);
+		q->tail = (q->tail + 1) % q->ndesc;
+		q->queued--;
+	}
+	spin_unlock_bh(&q->lock);
+}
+
+static struct sk_buff *mt76_npu_dequeue(struct mt76_dev *dev,
+					struct mt76_queue *q,
+					u32 *info)
+{
+	struct airoha_npu_rx_dma_desc *desc = (void *)q->desc;
+	int i, nframes, index = q->tail;
+	struct sk_buff *skb = NULL;
+
+	nframes = FIELD_GET(NPU_RX_DMA_PKT_COUNT_MASK, desc[index].info);
+	nframes = max_t(int, nframes, 1);
+
+	for (i = 0; i < nframes; i++) {
+		struct mt76_queue_entry *e = &q->entry[index];
+		int len = FIELD_GET(NPU_RX_DMA_DESC_CUR_LEN_MASK,
+				    desc[index].ctrl);
+
+		if (!FIELD_GET(NPU_RX_DMA_DESC_DONE_MASK, desc[index].ctrl)) {
+			dev_kfree_skb(skb);
+			return NULL;
+		}
+
+		dma_sync_single_for_cpu(dev->dma_dev, e->dma_addr[0],
+					e->dma_len[0],
+					page_pool_get_dma_dir(q->page_pool));
+
+		if (!skb) {
+			skb = napi_build_skb(e->buf, q->buf_size);
+			if (!skb)
+				return NULL;
+
+			__skb_put(skb, len);
+			skb_reset_mac_header(skb);
+			skb_mark_for_recycle(skb);
+		} else {
+			struct skb_shared_info *shinfo = skb_shinfo(skb);
+			struct page *page = virt_to_head_page(e->buf);
+			int nr_frags = shinfo->nr_frags;
+
+			if (nr_frags < ARRAY_SIZE(shinfo->frags))
+				skb_add_rx_frag(skb, nr_frags, page,
+						e->buf - page_address(page),
+						len, q->buf_size);
+		}
+
+		*info = desc[index].info;
+		index = (index + 1) % q->ndesc;
+	}
+	q->tail = index;
+	q->queued -= i;
+	Q_WRITE(q, dma_idx, q->tail);
+
+	return skb;
+}
+
+void mt76_npu_check_ppe(struct mt76_dev *dev, struct sk_buff *skb,
+			u32 info)
+{
+	struct airoha_ppe_dev *ppe_dev;
+	u16 reason, hash;
+
+	if (!mt76_npu_device_active(dev))
+		return;
+
+	rcu_read_lock();
+
+	ppe_dev = rcu_dereference(dev->mmio.ppe_dev);
+	if (!ppe_dev)
+		goto out;
+
+	hash = FIELD_GET(NPU_RX_DMA_FOE_ID_MASK, info);
+	skb_set_hash(skb, hash, PKT_HASH_TYPE_L4);
+
+	reason = FIELD_GET(NPU_RX_DMA_CRSN_MASK, info);
+	if (reason == PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) {
+		skb_set_mac_header(skb, 0);
+		airoha_ppe_dev_check_skb(ppe_dev, skb, hash, true);
+	}
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(mt76_npu_check_ppe);
+
+static int mt76_npu_rx_poll(struct napi_struct *napi, int budget)
+{
+	struct mt76_dev *dev = mt76_priv(napi->dev);
+	enum mt76_rxq_id qid = napi - dev->napi;
+	struct airoha_npu *npu;
+	int done = 0;
+
+	rcu_read_lock();
+
+	npu = rcu_dereference(dev->mmio.npu);
+	if (!npu)
+		goto out;
+
+	while (done < budget) {
+		struct sk_buff *skb;
+		u32 info = 0;
+
+		skb = mt76_npu_dequeue(dev, &dev->q_rx[qid], &info);
+		if (!skb)
+			break;
+
+		dev->drv->rx_skb(dev, qid, skb, &info);
+		mt76_rx_poll_complete(dev, qid, napi);
+		done++;
+	}
+
+	mt76_npu_fill_rx_queue(dev, &dev->q_rx[qid]);
+out:
+	if (done < budget && napi_complete(napi))
+		dev->drv->rx_poll_complete(dev, qid);
+
+	rcu_read_unlock();
+
+	return done;
+}
+
+static irqreturn_t mt76_npu_irq_handler(int irq, void *q_instance)
+{
+	struct mt76_queue *q = q_instance;
+	struct mt76_dev *dev = q->dev;
+	int qid = q - &dev->q_rx[0];
+	int index = qid - MT_RXQ_NPU0;
+	struct airoha_npu *npu;
+	u32 status;
+
+	rcu_read_lock();
+
+	npu = rcu_dereference(dev->mmio.npu);
+	if (!npu)
+		goto out;
+
+	status = airoha_npu_wlan_get_irq_status(npu, index);
+	airoha_npu_wlan_set_irq_status(npu, status);
+
+	airoha_npu_wlan_disable_irq(npu, index);
+	napi_schedule(&dev->napi[qid]);
+out:
+	rcu_read_unlock();
+
+	return IRQ_HANDLED;
+}
+
+int mt76_npu_dma_add_buf(struct mt76_phy *phy, struct mt76_queue *q,
+			 struct sk_buff *skb, struct mt76_queue_buf *buf,
+			 void *txwi_ptr)
+{
+	u16 txwi_len = min_t(u16, phy->dev->drv->txwi_size, NPU_TXWI_LEN);
+	struct airoha_npu_tx_dma_desc *desc = (void *)q->desc;
+	int ret;
+
+	/* TODO: Take into account non-linear skbs */
+	memcpy(desc[q->head].txwi, txwi_ptr, txwi_len);
+	desc[q->head].addr = buf->addr;
+	desc[q->head].ctrl = FIELD_PREP(NPU_TX_DMA_DESC_VEND_LEN_MASK, txwi_len) |
+			     FIELD_PREP(NPU_TX_DMA_DESC_LEN_MASK, skb->len) |
+			     NPU_TX_DMA_DESC_DONE_MASK;
+
+	ret = q->head;
+	q->entry[q->head].skip_buf0 = true;
+	q->entry[q->head].skip_buf1 = true;
+	q->entry[q->head].txwi = NULL;
+	q->entry[q->head].skb = NULL;
+	q->entry[q->head].wcid = 0xffff;
+
+	q->head = (q->head + 1) % q->ndesc;
+	q->queued++;
+
+	return ret;
+}
+
+void mt76_npu_txdesc_cleanup(struct mt76_queue *q, int index)
+{
+	struct airoha_npu_tx_dma_desc *desc = (void *)q->desc;
+
+	if (!mt76_queue_is_npu_tx(q))
+		return;
+
+	desc[index].ctrl &= ~NPU_TX_DMA_DESC_DONE_MASK;
+}
+
+void mt76_npu_queue_setup(struct mt76_dev *dev, struct mt76_queue *q)
+{
+	int qid = FIELD_GET(MT_QFLAG_WED_RING, q->flags);
+	bool xmit = mt76_queue_is_npu_tx(q);
+	struct airoha_npu *npu;
+
+	if (!mt76_queue_is_npu(q))
+		return;
+
+	npu = rcu_dereference_protected(dev->mmio.npu, &dev->mutex);
+	if (npu)
+		q->wed_regs = airoha_npu_wlan_get_queue_addr(npu, qid, xmit);
+}
+
+int mt76_npu_rx_queue_init(struct mt76_dev *dev, struct mt76_queue *q)
+{
+	int err, irq, qid = q - &dev->q_rx[0];
+	int size, index = qid - MT_RXQ_NPU0;
+	struct airoha_npu *npu;
+	const char *name;
+
+	mutex_lock(&dev->mutex);
+	npu = rcu_dereference_protected(dev->mmio.npu,
+					lockdep_is_held(&dev->mutex));
+	irq = npu && index < ARRAY_SIZE(npu->irqs) ? npu->irqs[index]
+						   : -EINVAL;
+	if (irq < 0) {
+		err = irq;
+		goto out;
+	}
+
+	q->flags = MT_NPU_Q_RX(index);
+	size = qid == MT_RXQ_NPU1 ? NPU_RX1_DESC_NUM : NPU_RX0_DESC_NUM;
+	err = dev->queue_ops->alloc(dev, q, 0, size,
+				    MT76_NPU_RX_BUF_SIZE, 0);
+	if (err)
+		goto out;
+
+	name = devm_kasprintf(dev->dev, GFP_KERNEL, "mt76-npu.%d", index);
+	if (!name) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = devm_request_irq(dev->dev, irq, mt76_npu_irq_handler,
+			       IRQF_SHARED, name, q);
+	if (err)
+		goto out;
+
+	netif_napi_add(dev->napi_dev, &dev->napi[qid], mt76_npu_rx_poll);
+	mt76_npu_fill_rx_queue(dev, q);
+	napi_enable(&dev->napi[qid]);
+out:
+	mutex_unlock(&dev->mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mt76_npu_rx_queue_init);
+
+static int mt76_npu_setup_tc_block_cb(enum tc_setup_type type,
+				      void *type_data, void *cb_priv)
+{
+	struct mt76_phy *phy = cb_priv;
+	struct mt76_dev *dev = phy->dev;
+	struct airoha_ppe_dev *ppe_dev;
+	int err = -EOPNOTSUPP;
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&dev->mutex);
+	ppe_dev = rcu_dereference_protected(dev->mmio.ppe_dev,
+					    lockdep_is_held(&dev->mutex));
+	if (ppe_dev)
+		err = airoha_ppe_dev_setup_tc_block_cb(ppe_dev, type_data);
+
+	mutex_unlock(&dev->mutex);
+
+	return err;
+}
+
+static int mt76_npu_setup_tc_block(struct mt76_phy *phy,
+				   struct net_device *dev,
+				   struct flow_block_offload *f)
+{
+	flow_setup_cb_t *cb = mt76_npu_setup_tc_block_cb;
+	static LIST_HEAD(block_cb_list);
+	struct flow_block_cb *block_cb;
+
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	f->driver_block_list = &block_cb_list;
+	switch (f->command) {
+	case FLOW_BLOCK_BIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (block_cb) {
+			flow_block_cb_incref(block_cb);
+			return 0;
+		}
+
+		block_cb = flow_block_cb_alloc(cb, dev, phy, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_incref(block_cb);
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &block_cb_list);
+		return 0;
+	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		if (!flow_block_cb_decref(block_cb)) {
+			flow_block_cb_remove(block_cb, f);
+			list_del(&block_cb->driver_list);
+		}
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+int mt76_npu_net_setup_tc(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			  struct net_device *dev, enum tc_setup_type type,
+			  void *type_data)
+{
+	struct mt76_phy *phy = hw->priv;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	if (!mt76_npu_device_active(phy->dev))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+	case TC_SETUP_FT:
+		return mt76_npu_setup_tc_block(phy, dev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(mt76_npu_net_setup_tc);
+
+void mt76_npu_disable_irqs(struct mt76_dev *dev)
+{
+	struct airoha_npu *npu;
+	int i;
+
+	rcu_read_lock();
+
+	npu = rcu_dereference(dev->mmio.npu);
+	if (!npu)
+		goto unlock;
+
+	for (i = MT_RXQ_NPU0; i <= MT_RXQ_NPU1; i++) {
+		int qid = i - MT_RXQ_NPU0;
+		u32 status;
+
+		status = airoha_npu_wlan_get_irq_status(npu, qid);
+		airoha_npu_wlan_set_irq_status(npu, status);
+		airoha_npu_wlan_disable_irq(npu, qid);
+	}
+unlock:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(mt76_npu_disable_irqs);
+
+int mt76_npu_init(struct mt76_dev *dev, phys_addr_t phy_addr, int type)
+{
+	struct airoha_ppe_dev *ppe_dev;
+	struct airoha_npu *npu;
+	int err = 0;
+
+	/* NPU offloading is only supported by MT7992 */
+	if (!is_mt7992(dev))
+		return 0;
+
+	mutex_lock(&dev->mutex);
+
+	npu = airoha_npu_get(dev->dev);
+	if (IS_ERR(npu)) {
+		request_module("airoha-npu");
+		npu = airoha_npu_get(dev->dev);
+	}
+
+	if (IS_ERR(npu)) {
+		err = PTR_ERR(npu);
+		goto error_unlock;
+	}
+
+	ppe_dev = airoha_ppe_get_dev(dev->dev);
+	if (IS_ERR(ppe_dev)) {
+		request_module("airoha-eth");
+		ppe_dev = airoha_ppe_get_dev(dev->dev);
+	}
+
+	if (IS_ERR(ppe_dev)) {
+		err = PTR_ERR(ppe_dev);
+		goto error_npu_put;
+	}
+
+	err = airoha_npu_wlan_init_reserved_memory(npu);
+	if (err)
+		goto error_ppe_put;
+
+	dev->dma_dev = npu->dev;
+	dev->mmio.phy_addr = phy_addr;
+	dev->mmio.npu_type = type;
+	/* NPU offloading requires HW-RRO for RX packet reordering. */
+	dev->hwrro_mode = MT76_HWRRO_V3_1;
+
+	rcu_assign_pointer(dev->mmio.npu, npu);
+	rcu_assign_pointer(dev->mmio.ppe_dev, ppe_dev);
+	synchronize_rcu();
+
+	mutex_unlock(&dev->mutex);
+
+	return 0;
+
+error_ppe_put:
+	airoha_ppe_put_dev(ppe_dev);
+error_npu_put:
+	airoha_npu_put(npu);
+error_unlock:
+	mutex_unlock(&dev->mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mt76_npu_init);
+
+void mt76_npu_deinit(struct mt76_dev *dev)
+{
+	struct airoha_ppe_dev *ppe_dev;
+	struct airoha_npu *npu;
+
+	mutex_lock(&dev->mutex);
+
+	npu = rcu_replace_pointer(dev->mmio.npu, NULL,
+				  lockdep_is_held(&dev->mutex));
+	if (npu)
+		airoha_npu_put(npu);
+
+	ppe_dev = rcu_replace_pointer(dev->mmio.ppe_dev, NULL,
+				      lockdep_is_held(&dev->mutex));
+	if (ppe_dev)
+		airoha_ppe_put_dev(ppe_dev);
+
+	mutex_unlock(&dev->mutex);
+
+	mt76_npu_queue_cleanup(dev, &dev->q_rx[MT_RXQ_NPU0]);
+	mt76_npu_queue_cleanup(dev, &dev->q_rx[MT_RXQ_NPU1]);
+}
diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h
index 6f66eb339b3f..4d23cbb7d407 100644
--- a/include/linux/soc/airoha/airoha_offload.h
+++ b/include/linux/soc/airoha/airoha_offload.h
@@ -6,6 +6,7 @@
 #ifndef AIROHA_OFFLOAD_H
 #define AIROHA_OFFLOAD_H
 
+#include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 
-- 
cgit v1.2.3


From cbbfba4847b8a5299d36e002bf864b21bb83295d Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 11 Nov 2025 11:37:55 +0000
Subject: perf: Add perf_event_attr::config4

Arm FEAT_SPE_FDS adds the ability to filter on the data source of a
packet using another 64-bits of event filtering control. As the existing
perf_event_attr::configN fields are all used up for SPE PMU, an
additional field is needed. Add a new 'config4' field.

Reviewed-by: Leo Yan <leo.yan@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/uapi/linux/perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..0d0ed85ad8cb 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER6			120	/* Add: aux_sample_size */
 #define PERF_ATTR_SIZE_VER7			128	/* Add: sig_data */
 #define PERF_ATTR_SIZE_VER8			136	/* Add: config3 */
+#define PERF_ATTR_SIZE_VER9			144	/* Add: config4 */
 
 /*
  * 'struct perf_event_attr' contains various attributes that define
@@ -543,6 +544,7 @@ struct perf_event_attr {
 	__u64	sig_data;
 
 	__u64	config3; /* extension of config2 */
+	__u64	config4; /* extension of config3 */
 };
 
 /*
-- 
cgit v1.2.3


From 935419b9fb74ab2643583fce750cb774c9b5faa6 Mon Sep 17 00:00:00 2001
From: Dinh Nguyen <dinguyen@kernel.org>
Date: Fri, 14 Nov 2025 12:58:15 -0600
Subject: firmware: stratix10-svc: fix make htmldocs warning

Stephen Rothwell reports htmldocs warnings when merging char-misc tree:

WARNING: include/linux/firmware/intel/stratix10-svc-client.h:22 This comment
starts with '/**', but isn't a kernel-doc comment.

WARNING: include/linux/firmware/intel/stratix10-svc-client.h:184 Enum value
'COMMAND_HWMON_READTEMP' not described in enum 'stratix10_svc_command_code'

WARNING: include/linux/firmware/intel/stratix10-svc-client.h:184 Enum value
'COMMAND_HWMON_READVOLT' not described in enum 'stratix10_svc_command_code'

WARNING: include/linux/firmware/intel/stratix10-svc-client.h:307 function
parameter 'cb_arg' not described in 'async_callback_t'

Fixes: 4f49088c1625 ("firmware: stratix10-svc: Add definition for voltage and temperature sensor")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251114153920.1c5df700@canb.auug.org.au/
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Link: https://patch.msgid.link/20251114185815.358423-3-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/intel/stratix10-svc-client.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index 1bcc56d14080..d290060f4c73 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -19,7 +19,7 @@
 #define SVC_CLIENT_FCS			"fcs"
 #define SVC_CLIENT_HWMON		"hwmon"
 
-/**
+/*
  * Status of the sent command, in bit number
  *
  * SVC_STATUS_OK:
@@ -148,6 +148,12 @@ struct stratix10_svc_chan;
  *
  * @COMMAND_FCS_RANDOM_NUMBER_GEN: generate a random number, return status
  * is SVC_STATUS_OK, SVC_STATUS_ERROR
+ *
+ * @COMMAND_HWMON_READTEMP: query the temperature from the hardware monitor,
+ * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
+ *
+ * @COMMAND_HWMON_READVOLT: query the voltage from the hardware monitor,
+ * return status is SVC_STATUS_OK or SVC_STATUS_ERROR
  */
 enum stratix10_svc_command_code {
 	/* for FPGA */
@@ -303,7 +309,7 @@ void stratix10_svc_done(struct stratix10_svc_chan *chan);
  * The callback function takes a single argument, which is a pointer to
  * user-defined data.
  *
- * @param cb_arg A pointer to user-defined data passed to the callback function.
+ * @cb_arg: Argument to be passed to the callback function.
  */
 typedef void (*async_callback_t)(void *cb_arg);
 
-- 
cgit v1.2.3


From e6ab504633e4c06e35377ecf3c8cbc304de79858 Mon Sep 17 00:00:00 2001
From: Dave Penkler <dpenkler@gmail.com>
Date: Mon, 17 Nov 2025 15:40:21 +0100
Subject: staging: gpib: Destage gpib

Move the gpib drivers out of staging and into the "real" part of the
kernel.  This entails:

 - Remove the gpib Kconfig menu and Makefile build rule from staging.
 - Remove gpib/uapi from the header file search path in subdir-ccflags
   of the gpib Makefile.
 - Move the gpib/uapi files to include/uapi/linux.
 - Move the gpib tree out of staging to drivers.
 - Remove the word "Linux" from the gpib Kconfig file.
 - Add the gpib Kconfig menu and Makefile build rule to drivers.

Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Link: https://patch.msgid.link/20251117144021.23569-5-dpenkler@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                                        |    4 +-
 drivers/Kconfig                                    |    2 +
 drivers/Makefile                                   |    1 +
 drivers/gpib/Kconfig                               |  255 ++
 drivers/gpib/Makefile                              |   20 +
 drivers/gpib/TODO                                  |   10 +
 drivers/gpib/agilent_82350b/Makefile               |    2 +
 drivers/gpib/agilent_82350b/agilent_82350b.c       |  896 +++++++
 drivers/gpib/agilent_82350b/agilent_82350b.h       |  157 ++
 drivers/gpib/agilent_82357a/Makefile               |    4 +
 drivers/gpib/agilent_82357a/agilent_82357a.c       | 1691 ++++++++++++
 drivers/gpib/agilent_82357a/agilent_82357a.h       |  182 ++
 drivers/gpib/cb7210/Makefile                       |    3 +
 drivers/gpib/cb7210/cb7210.c                       | 1598 ++++++++++++
 drivers/gpib/cb7210/cb7210.h                       |  203 ++
 drivers/gpib/cec/Makefile                          |    3 +
 drivers/gpib/cec/cec.h                             |   20 +
 drivers/gpib/cec/cec_gpib.c                        |  393 +++
 drivers/gpib/common/Makefile                       |    6 +
 drivers/gpib/common/gpib_os.c                      | 2271 +++++++++++++++++
 drivers/gpib/common/iblib.c                        |  717 ++++++
 drivers/gpib/common/ibsys.h                        |   34 +
 drivers/gpib/eastwood/Makefile                     |    3 +
 drivers/gpib/eastwood/fluke_gpib.c                 | 1180 +++++++++
 drivers/gpib/eastwood/fluke_gpib.h                 |  146 ++
 drivers/gpib/fmh_gpib/Makefile                     |    2 +
 drivers/gpib/fmh_gpib/fmh_gpib.c                   | 1754 +++++++++++++
 drivers/gpib/fmh_gpib/fmh_gpib.h                   |  177 ++
 drivers/gpib/gpio/Makefile                         |    4 +
 drivers/gpib/gpio/gpib_bitbang.c                   | 1469 +++++++++++
 drivers/gpib/hp_82335/Makefile                     |    4 +
 drivers/gpib/hp_82335/hp82335.c                    |  371 +++
 drivers/gpib/hp_82335/hp82335.h                    |   52 +
 drivers/gpib/hp_82341/Makefile                     |    2 +
 drivers/gpib/hp_82341/hp_82341.c                   |  907 +++++++
 drivers/gpib/hp_82341/hp_82341.h                   |  165 ++
 drivers/gpib/include/amcc5920.h                    |   49 +
 drivers/gpib/include/amccs5933.h                   |   59 +
 drivers/gpib/include/gpibP.h                       |   41 +
 drivers/gpib/include/gpib_cmd.h                    |  112 +
 drivers/gpib/include/gpib_pci_ids.h                |   23 +
 drivers/gpib/include/gpib_proto.h                  |   49 +
 drivers/gpib/include/gpib_state_machines.h         |   23 +
 drivers/gpib/include/gpib_types.h                  |  381 +++
 drivers/gpib/include/nec7210.h                     |  141 ++
 drivers/gpib/include/nec7210_registers.h           |  218 ++
 drivers/gpib/include/plx9050.h                     |   72 +
 drivers/gpib/include/quancom_pci.h                 |   22 +
 drivers/gpib/include/tms9914.h                     |  280 ++
 drivers/gpib/include/tnt4882_registers.h           |  192 ++
 drivers/gpib/ines/Makefile                         |    3 +
 drivers/gpib/ines/ines.h                           |  165 ++
 drivers/gpib/ines/ines_gpib.c                      | 1500 +++++++++++
 drivers/gpib/lpvo_usb_gpib/Makefile                |    3 +
 drivers/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c         | 2025 +++++++++++++++
 drivers/gpib/nec7210/Makefile                      |    4 +
 drivers/gpib/nec7210/board.h                       |   19 +
 drivers/gpib/nec7210/nec7210.c                     | 1121 ++++++++
 drivers/gpib/ni_usb/Makefile                       |    4 +
 drivers/gpib/ni_usb/ni_usb_gpib.c                  | 2678 ++++++++++++++++++++
 drivers/gpib/ni_usb/ni_usb_gpib.h                  |  226 ++
 drivers/gpib/pc2/Makefile                          |    5 +
 drivers/gpib/pc2/pc2_gpib.c                        |  684 +++++
 drivers/gpib/tms9914/Makefile                      |    6 +
 drivers/gpib/tms9914/tms9914.c                     |  914 +++++++
 drivers/gpib/tnt4882/Makefile                      |    6 +
 drivers/gpib/tnt4882/mite.c                        |  133 +
 drivers/gpib/tnt4882/mite.h                        |  234 ++
 drivers/gpib/tnt4882/tnt4882_gpib.c                | 1838 ++++++++++++++
 drivers/staging/Kconfig                            |    2 -
 drivers/staging/Makefile                           |    1 -
 drivers/staging/gpib/Kconfig                       |  255 --
 drivers/staging/gpib/Makefile                      |   20 -
 drivers/staging/gpib/TODO                          |   10 -
 drivers/staging/gpib/agilent_82350b/Makefile       |    2 -
 .../staging/gpib/agilent_82350b/agilent_82350b.c   |  896 -------
 .../staging/gpib/agilent_82350b/agilent_82350b.h   |  157 --
 drivers/staging/gpib/agilent_82357a/Makefile       |    4 -
 .../staging/gpib/agilent_82357a/agilent_82357a.c   | 1691 ------------
 .../staging/gpib/agilent_82357a/agilent_82357a.h   |  182 --
 drivers/staging/gpib/cb7210/Makefile               |    3 -
 drivers/staging/gpib/cb7210/cb7210.c               | 1598 ------------
 drivers/staging/gpib/cb7210/cb7210.h               |  203 --
 drivers/staging/gpib/cec/Makefile                  |    3 -
 drivers/staging/gpib/cec/cec.h                     |   20 -
 drivers/staging/gpib/cec/cec_gpib.c                |  393 ---
 drivers/staging/gpib/common/Makefile               |    6 -
 drivers/staging/gpib/common/gpib_os.c              | 2271 -----------------
 drivers/staging/gpib/common/iblib.c                |  717 ------
 drivers/staging/gpib/common/ibsys.h                |   34 -
 drivers/staging/gpib/eastwood/Makefile             |    3 -
 drivers/staging/gpib/eastwood/fluke_gpib.c         | 1180 ---------
 drivers/staging/gpib/eastwood/fluke_gpib.h         |  146 --
 drivers/staging/gpib/fmh_gpib/Makefile             |    2 -
 drivers/staging/gpib/fmh_gpib/fmh_gpib.c           | 1754 -------------
 drivers/staging/gpib/fmh_gpib/fmh_gpib.h           |  177 --
 drivers/staging/gpib/gpio/Makefile                 |    4 -
 drivers/staging/gpib/gpio/gpib_bitbang.c           | 1469 -----------
 drivers/staging/gpib/hp_82335/Makefile             |    4 -
 drivers/staging/gpib/hp_82335/hp82335.c            |  371 ---
 drivers/staging/gpib/hp_82335/hp82335.h            |   52 -
 drivers/staging/gpib/hp_82341/Makefile             |    2 -
 drivers/staging/gpib/hp_82341/hp_82341.c           |  907 -------
 drivers/staging/gpib/hp_82341/hp_82341.h           |  165 --
 drivers/staging/gpib/include/amcc5920.h            |   49 -
 drivers/staging/gpib/include/amccs5933.h           |   59 -
 drivers/staging/gpib/include/gpibP.h               |   41 -
 drivers/staging/gpib/include/gpib_cmd.h            |  112 -
 drivers/staging/gpib/include/gpib_pci_ids.h        |   23 -
 drivers/staging/gpib/include/gpib_proto.h          |   49 -
 drivers/staging/gpib/include/gpib_state_machines.h |   23 -
 drivers/staging/gpib/include/gpib_types.h          |  381 ---
 drivers/staging/gpib/include/nec7210.h             |  141 --
 drivers/staging/gpib/include/nec7210_registers.h   |  218 --
 drivers/staging/gpib/include/plx9050.h             |   72 -
 drivers/staging/gpib/include/quancom_pci.h         |   22 -
 drivers/staging/gpib/include/tms9914.h             |  280 --
 drivers/staging/gpib/include/tnt4882_registers.h   |  192 --
 drivers/staging/gpib/ines/Makefile                 |    3 -
 drivers/staging/gpib/ines/ines.h                   |  165 --
 drivers/staging/gpib/ines/ines_gpib.c              | 1500 -----------
 drivers/staging/gpib/lpvo_usb_gpib/Makefile        |    3 -
 drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 2025 ---------------
 drivers/staging/gpib/nec7210/Makefile              |    4 -
 drivers/staging/gpib/nec7210/board.h               |   19 -
 drivers/staging/gpib/nec7210/nec7210.c             | 1121 --------
 drivers/staging/gpib/ni_usb/Makefile               |    4 -
 drivers/staging/gpib/ni_usb/ni_usb_gpib.c          | 2678 --------------------
 drivers/staging/gpib/ni_usb/ni_usb_gpib.h          |  226 --
 drivers/staging/gpib/pc2/Makefile                  |    5 -
 drivers/staging/gpib/pc2/pc2_gpib.c                |  684 -----
 drivers/staging/gpib/tms9914/Makefile              |    6 -
 drivers/staging/gpib/tms9914/tms9914.c             |  914 -------
 drivers/staging/gpib/tnt4882/Makefile              |    6 -
 drivers/staging/gpib/tnt4882/mite.c                |  133 -
 drivers/staging/gpib/tnt4882/mite.h                |  234 --
 drivers/staging/gpib/tnt4882/tnt4882_gpib.c        | 1838 --------------
 drivers/staging/gpib/uapi/gpib.h                   |  104 -
 drivers/staging/gpib/uapi/gpib_ioctl.h             |  167 --
 include/uapi/linux/gpib.h                          |  104 +
 include/uapi/linux/gpib_ioctl.h                    |  167 ++
 141 files changed, 28208 insertions(+), 28206 deletions(-)
 create mode 100644 drivers/gpib/Kconfig
 create mode 100644 drivers/gpib/Makefile
 create mode 100644 drivers/gpib/TODO
 create mode 100644 drivers/gpib/agilent_82350b/Makefile
 create mode 100644 drivers/gpib/agilent_82350b/agilent_82350b.c
 create mode 100644 drivers/gpib/agilent_82350b/agilent_82350b.h
 create mode 100644 drivers/gpib/agilent_82357a/Makefile
 create mode 100644 drivers/gpib/agilent_82357a/agilent_82357a.c
 create mode 100644 drivers/gpib/agilent_82357a/agilent_82357a.h
 create mode 100644 drivers/gpib/cb7210/Makefile
 create mode 100644 drivers/gpib/cb7210/cb7210.c
 create mode 100644 drivers/gpib/cb7210/cb7210.h
 create mode 100644 drivers/gpib/cec/Makefile
 create mode 100644 drivers/gpib/cec/cec.h
 create mode 100644 drivers/gpib/cec/cec_gpib.c
 create mode 100644 drivers/gpib/common/Makefile
 create mode 100644 drivers/gpib/common/gpib_os.c
 create mode 100644 drivers/gpib/common/iblib.c
 create mode 100644 drivers/gpib/common/ibsys.h
 create mode 100644 drivers/gpib/eastwood/Makefile
 create mode 100644 drivers/gpib/eastwood/fluke_gpib.c
 create mode 100644 drivers/gpib/eastwood/fluke_gpib.h
 create mode 100644 drivers/gpib/fmh_gpib/Makefile
 create mode 100644 drivers/gpib/fmh_gpib/fmh_gpib.c
 create mode 100644 drivers/gpib/fmh_gpib/fmh_gpib.h
 create mode 100644 drivers/gpib/gpio/Makefile
 create mode 100644 drivers/gpib/gpio/gpib_bitbang.c
 create mode 100644 drivers/gpib/hp_82335/Makefile
 create mode 100644 drivers/gpib/hp_82335/hp82335.c
 create mode 100644 drivers/gpib/hp_82335/hp82335.h
 create mode 100644 drivers/gpib/hp_82341/Makefile
 create mode 100644 drivers/gpib/hp_82341/hp_82341.c
 create mode 100644 drivers/gpib/hp_82341/hp_82341.h
 create mode 100644 drivers/gpib/include/amcc5920.h
 create mode 100644 drivers/gpib/include/amccs5933.h
 create mode 100644 drivers/gpib/include/gpibP.h
 create mode 100644 drivers/gpib/include/gpib_cmd.h
 create mode 100644 drivers/gpib/include/gpib_pci_ids.h
 create mode 100644 drivers/gpib/include/gpib_proto.h
 create mode 100644 drivers/gpib/include/gpib_state_machines.h
 create mode 100644 drivers/gpib/include/gpib_types.h
 create mode 100644 drivers/gpib/include/nec7210.h
 create mode 100644 drivers/gpib/include/nec7210_registers.h
 create mode 100644 drivers/gpib/include/plx9050.h
 create mode 100644 drivers/gpib/include/quancom_pci.h
 create mode 100644 drivers/gpib/include/tms9914.h
 create mode 100644 drivers/gpib/include/tnt4882_registers.h
 create mode 100644 drivers/gpib/ines/Makefile
 create mode 100644 drivers/gpib/ines/ines.h
 create mode 100644 drivers/gpib/ines/ines_gpib.c
 create mode 100644 drivers/gpib/lpvo_usb_gpib/Makefile
 create mode 100644 drivers/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
 create mode 100644 drivers/gpib/nec7210/Makefile
 create mode 100644 drivers/gpib/nec7210/board.h
 create mode 100644 drivers/gpib/nec7210/nec7210.c
 create mode 100644 drivers/gpib/ni_usb/Makefile
 create mode 100644 drivers/gpib/ni_usb/ni_usb_gpib.c
 create mode 100644 drivers/gpib/ni_usb/ni_usb_gpib.h
 create mode 100644 drivers/gpib/pc2/Makefile
 create mode 100644 drivers/gpib/pc2/pc2_gpib.c
 create mode 100644 drivers/gpib/tms9914/Makefile
 create mode 100644 drivers/gpib/tms9914/tms9914.c
 create mode 100644 drivers/gpib/tnt4882/Makefile
 create mode 100644 drivers/gpib/tnt4882/mite.c
 create mode 100644 drivers/gpib/tnt4882/mite.h
 create mode 100644 drivers/gpib/tnt4882/tnt4882_gpib.c
 delete mode 100644 drivers/staging/gpib/Kconfig
 delete mode 100644 drivers/staging/gpib/Makefile
 delete mode 100644 drivers/staging/gpib/TODO
 delete mode 100644 drivers/staging/gpib/agilent_82350b/Makefile
 delete mode 100644 drivers/staging/gpib/agilent_82350b/agilent_82350b.c
 delete mode 100644 drivers/staging/gpib/agilent_82350b/agilent_82350b.h
 delete mode 100644 drivers/staging/gpib/agilent_82357a/Makefile
 delete mode 100644 drivers/staging/gpib/agilent_82357a/agilent_82357a.c
 delete mode 100644 drivers/staging/gpib/agilent_82357a/agilent_82357a.h
 delete mode 100644 drivers/staging/gpib/cb7210/Makefile
 delete mode 100644 drivers/staging/gpib/cb7210/cb7210.c
 delete mode 100644 drivers/staging/gpib/cb7210/cb7210.h
 delete mode 100644 drivers/staging/gpib/cec/Makefile
 delete mode 100644 drivers/staging/gpib/cec/cec.h
 delete mode 100644 drivers/staging/gpib/cec/cec_gpib.c
 delete mode 100644 drivers/staging/gpib/common/Makefile
 delete mode 100644 drivers/staging/gpib/common/gpib_os.c
 delete mode 100644 drivers/staging/gpib/common/iblib.c
 delete mode 100644 drivers/staging/gpib/common/ibsys.h
 delete mode 100644 drivers/staging/gpib/eastwood/Makefile
 delete mode 100644 drivers/staging/gpib/eastwood/fluke_gpib.c
 delete mode 100644 drivers/staging/gpib/eastwood/fluke_gpib.h
 delete mode 100644 drivers/staging/gpib/fmh_gpib/Makefile
 delete mode 100644 drivers/staging/gpib/fmh_gpib/fmh_gpib.c
 delete mode 100644 drivers/staging/gpib/fmh_gpib/fmh_gpib.h
 delete mode 100644 drivers/staging/gpib/gpio/Makefile
 delete mode 100644 drivers/staging/gpib/gpio/gpib_bitbang.c
 delete mode 100644 drivers/staging/gpib/hp_82335/Makefile
 delete mode 100644 drivers/staging/gpib/hp_82335/hp82335.c
 delete mode 100644 drivers/staging/gpib/hp_82335/hp82335.h
 delete mode 100644 drivers/staging/gpib/hp_82341/Makefile
 delete mode 100644 drivers/staging/gpib/hp_82341/hp_82341.c
 delete mode 100644 drivers/staging/gpib/hp_82341/hp_82341.h
 delete mode 100644 drivers/staging/gpib/include/amcc5920.h
 delete mode 100644 drivers/staging/gpib/include/amccs5933.h
 delete mode 100644 drivers/staging/gpib/include/gpibP.h
 delete mode 100644 drivers/staging/gpib/include/gpib_cmd.h
 delete mode 100644 drivers/staging/gpib/include/gpib_pci_ids.h
 delete mode 100644 drivers/staging/gpib/include/gpib_proto.h
 delete mode 100644 drivers/staging/gpib/include/gpib_state_machines.h
 delete mode 100644 drivers/staging/gpib/include/gpib_types.h
 delete mode 100644 drivers/staging/gpib/include/nec7210.h
 delete mode 100644 drivers/staging/gpib/include/nec7210_registers.h
 delete mode 100644 drivers/staging/gpib/include/plx9050.h
 delete mode 100644 drivers/staging/gpib/include/quancom_pci.h
 delete mode 100644 drivers/staging/gpib/include/tms9914.h
 delete mode 100644 drivers/staging/gpib/include/tnt4882_registers.h
 delete mode 100644 drivers/staging/gpib/ines/Makefile
 delete mode 100644 drivers/staging/gpib/ines/ines.h
 delete mode 100644 drivers/staging/gpib/ines/ines_gpib.c
 delete mode 100644 drivers/staging/gpib/lpvo_usb_gpib/Makefile
 delete mode 100644 drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
 delete mode 100644 drivers/staging/gpib/nec7210/Makefile
 delete mode 100644 drivers/staging/gpib/nec7210/board.h
 delete mode 100644 drivers/staging/gpib/nec7210/nec7210.c
 delete mode 100644 drivers/staging/gpib/ni_usb/Makefile
 delete mode 100644 drivers/staging/gpib/ni_usb/ni_usb_gpib.c
 delete mode 100644 drivers/staging/gpib/ni_usb/ni_usb_gpib.h
 delete mode 100644 drivers/staging/gpib/pc2/Makefile
 delete mode 100644 drivers/staging/gpib/pc2/pc2_gpib.c
 delete mode 100644 drivers/staging/gpib/tms9914/Makefile
 delete mode 100644 drivers/staging/gpib/tms9914/tms9914.c
 delete mode 100644 drivers/staging/gpib/tnt4882/Makefile
 delete mode 100644 drivers/staging/gpib/tnt4882/mite.c
 delete mode 100644 drivers/staging/gpib/tnt4882/mite.h
 delete mode 100644 drivers/staging/gpib/tnt4882/tnt4882_gpib.c
 delete mode 100644 drivers/staging/gpib/uapi/gpib.h
 delete mode 100644 drivers/staging/gpib/uapi/gpib_ioctl.h
 create mode 100644 include/uapi/linux/gpib.h
 create mode 100644 include/uapi/linux/gpib_ioctl.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index df07d1a3c28d..a6055a910be6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10628,7 +10628,9 @@ F:	drivers/platform/x86/gpd-pocket-fan.c
 GPIB DRIVERS
 M:	Dave Penkler <dpenkler@gmail.com>
 S:	Maintained
-F:	drivers/staging/gpib/
+F:	drivers/gpib/
+F:	include/uapi/linux/gpib.h
+F:	include/uapi/linux/gpib_ioctl.h
 
 GPIO ACPI SUPPORT
 M:	Mika Westerberg <westeri@kernel.org>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 4915a63866b0..01602581b880 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -161,6 +161,8 @@ source "drivers/greybus/Kconfig"
 
 source "drivers/comedi/Kconfig"
 
+source "drivers/gpib/Kconfig"
+
 source "drivers/staging/Kconfig"
 
 source "drivers/platform/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8e1ffa4358d5..d275b1526cdd 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -150,6 +150,7 @@ obj-$(CONFIG_VHOST_IOTLB)	+= vhost/
 obj-$(CONFIG_VHOST)		+= vhost/
 obj-$(CONFIG_GREYBUS)		+= greybus/
 obj-$(CONFIG_COMEDI)		+= comedi/
+obj-$(CONFIG_GPIB)		+= gpib/
 obj-$(CONFIG_STAGING)		+= staging/
 obj-y				+= platform/
 
diff --git a/drivers/gpib/Kconfig b/drivers/gpib/Kconfig
new file mode 100644
index 000000000000..eeb50956ce85
--- /dev/null
+++ b/drivers/gpib/Kconfig
@@ -0,0 +1,255 @@
+# SPDX-License-Identifier: GPL-2.0
+menuconfig GPIB
+	tristate "GPIB drivers"
+	help
+	  Enable support for GPIB cards and dongles.  GPIB is the
+	  General Purpose Interface Bus which conforms to the IEEE488
+	  standard.
+
+	  This set of drivers can be used with the corresponding user
+	  space library that can be found on Sourceforge under linux-gpib.
+	  Select the drivers for your hardware from the list.
+
+if GPIB
+
+config GPIB_COMMON
+	tristate "GPIB core"
+	help
+
+	  Core common driver for all GPIB drivers. It provides the
+	  interface for the userland library
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called gpib_common
+
+config GPIB_AGILENT_82350B
+	tristate "Agilent 8235xx PCI(e) adapters"
+	depends on PCI
+	select GPIB_COMMON
+	select GPIB_TMS9914
+	help
+	  Enable support for HP/Agilent/Keysight boards
+	    82350A
+	    82350B
+	    82351A
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called agilent_82350b.
+
+config GPIB_AGILENT_82357A
+	tristate "Agilent 82357a/b USB dongles"
+	select GPIB_COMMON
+	depends on USB
+	help
+	  Enable support for Agilent/Keysight 82357x USB dongles.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called agilent_82357a.
+
+config GPIB_CEC_PCI
+	tristate "CEC PCI board"
+	depends on PCI
+	depends on HAS_IOPORT
+	select GPIB_COMMON
+	select GPIB_NEC7210
+	help
+	  Enable support for Capital Equipment Corporation PCI-488
+	  and Keithley KPCI-488 boards.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called cec_gpib.
+
+config GPIB_NI_PCI_ISA
+	tristate "NI PCI/ISA compatible boards"
+	depends on ISA_BUS || PCI || PCMCIA
+	depends on HAS_IOPORT
+	depends on PCMCIA || !PCMCIA
+	depends on HAS_IOPORT_MAP
+	select GPIB_COMMON
+	select GPIB_NEC7210
+	help
+	  Enable support for National Instruments boards based
+	  on TNT4882 chips:
+	     AT-GPIB (with NAT4882 chip)
+	     AT-GPIB (with NEC7210 chip)
+	     AT-GPIB/TNT
+	     PCI-GPIB
+	     PCIe-GPIB
+	     PCI-GPIB+
+	     PCM-GPIB
+	     PXI-GPIB
+	     PCMCIA-GPIB
+	     and Capital Equipment Corporation CEC-488 board.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called tnt4882.
+
+config GPIB_CB7210
+       tristate "Measurement Computing compatible boards"
+	depends on HAS_IOPORT
+	depends on ISA_BUS || PCI || PCMCIA
+	depends on PCMCIA || !PCMCIA
+       select GPIB_COMMON
+	select GPIB_NEC7210
+       help
+       Enable support for Measurement Computing (Computer Boards):
+       CPCI_GPIB, ISA-GPIB, ISA-GPIB/LC, PCI-GPIB/1M, PCI-GPIB/300K and
+       PCMCIA-GPIB
+       Quancom PCIGPIB-1 with MC cb7210 chip
+
+	  To compile this driver as a module, choose M here: the module will be called cb7210.
+
+config GPIB_NI_USB
+	tristate "NI USB dongles"
+	select GPIB_COMMON
+	depends on USB
+	help
+	  Enable support for National Instruments
+	       GPIB-USB-B
+	       GPIB-USB-HS
+	       GPIB-USB-HS+
+	   Keithley
+	       KUSB-488
+	       KUSB-488A
+	   Measurement Computing (Computer Boards)
+	       USB-488
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called ni_usb.
+
+config GPIB_FLUKE
+       tristate "Fluke"
+	depends on OF
+       select GPIB_COMMON
+       select GPIB_NEC7210
+       help
+         GPIB driver for Fluke based cda devices.
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called fluke_gpib
+
+config GPIB_FMH
+       tristate "FMH FPGA based devices"
+       select GPIB_COMMON
+       select GPIB_NEC7210
+       depends on !PPC
+       depends on OF && PCI
+       help
+         GPIB driver for fmhess FPGA based devices
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called fmh_gpib
+
+config GPIB_GPIO
+       tristate "RPi GPIO bitbang"
+	depends on ARCH_BCM2835 || COMPILE_TEST
+       select GPIB_COMMON
+       help
+         GPIB bitbang driver for Raspberry Pi GPIO adapters
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called gpib_bitbang
+
+config GPIB_HP82335
+       tristate "HP82335/HP27209"
+	depends on ISA_BUS
+       select GPIB_COMMON
+       select GPIB_TMS9914
+       help
+         GPIB driver for HP82335 and HP27209 boards
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called hp82335
+
+
+config GPIB_HP82341
+       tristate "HP82341x"
+       select GPIB_COMMON
+       select GPIB_TMS9914
+       depends on ISA_BUS || EISA
+       help
+         GPIB driver for HP82341 A/B/C/D boards
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called hp82341
+
+config GPIB_INES
+       tristate "INES"
+	depends on PCI || ISA_BUS || PCMCIA
+	depends on PCMCIA || !PCMCIA
+	depends on HAS_IOPORT
+       select GPIB_COMMON
+       select GPIB_NEC7210
+       help
+         GPIB driver for Ines compatible boards
+	 Ines
+	    GPIB-HS-NT
+	    GPIB for Compact PCI
+	    GPIB for PCI
+	    GPIB for PCMCIA
+	    GPIB PC/104
+	 Hameg
+	    HO80-2
+	 Quancom
+	    PCIGPIB-1 based on Ines iGPIB 72010 chip
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called ines_gpib
+
+
+config GPIB_PCMCIA
+       def_bool y
+       depends on PCMCIA && (GPIB_NI_PCI_ISA || GPIB_CB7210 || GPIB_INES)
+       help
+         Enable PCMCIA/CardBus support for National Instruments,
+	 Measurement Computing and Ines boards.
+
+config GPIB_LPVO
+       tristate "LPVO DIY USB GPIB"
+       select GPIB_COMMON
+       depends on USB
+       help
+         Enable support for LPVO Self-made usb-gpib adapter
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called lpvo_usb_gpib
+
+config GPIB_PC2
+       tristate "PC2 PC2a"
+	depends on ISA_BUS
+	depends on HAS_IOPORT
+       select GPIB_COMMON
+       select GPIB_NEC7210
+       help
+         Enable support for pc2 and pc2a compatible adapters
+	    Capital Equipment Corporation PC-488
+	    CONTEC GP-IB(PC)
+	    Hameg HO80
+	    Iotech GP488B
+	    Keithley MBC-488
+	    Measurement Computing ISA-GPIB-PCA2
+	    National Instruments PCII, PCIIa and PCII/IIa
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called pc2_gpib
+
+
+config GPIB_TMS9914
+       tristate
+       select GPIB_COMMON
+       help
+         Enable support for TMS 9914 chip.
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called tms9914
+
+config GPIB_NEC7210
+       tristate
+       select GPIB_COMMON
+       help
+         Enable support for NEC 7210 compatible chips.
+
+	 To compile this driver as a module, choose M here: the module will be
+	 called nec7210
+
+endif # GPIB
diff --git a/drivers/gpib/Makefile b/drivers/gpib/Makefile
new file mode 100644
index 000000000000..2d44fed2a743
--- /dev/null
+++ b/drivers/gpib/Makefile
@@ -0,0 +1,20 @@
+
+subdir-ccflags-y += -I$(src)/include
+
+obj-$(CONFIG_GPIB_AGILENT_82350B) += agilent_82350b/
+obj-$(CONFIG_GPIB_AGILENT_82357A) += agilent_82357a/
+obj-$(CONFIG_GPIB_CB7210) += cb7210/
+obj-$(CONFIG_GPIB_CEC_PCI) += cec/
+obj-$(CONFIG_GPIB_COMMON) += common/
+obj-$(CONFIG_GPIB_FLUKE) += eastwood/
+obj-$(CONFIG_GPIB_FMH) += fmh_gpib/
+obj-$(CONFIG_GPIB_GPIO) += gpio/
+obj-$(CONFIG_GPIB_HP82335) += hp_82335/
+obj-$(CONFIG_GPIB_HP82341) += hp_82341/
+obj-$(CONFIG_GPIB_INES) += ines/
+obj-$(CONFIG_GPIB_LPVO) += lpvo_usb_gpib/
+obj-$(CONFIG_GPIB_NEC7210) += nec7210/
+obj-$(CONFIG_GPIB_NI_USB) += ni_usb/
+obj-$(CONFIG_GPIB_PC2) += pc2/
+obj-$(CONFIG_GPIB_TMS9914) += tms9914/
+obj-$(CONFIG_GPIB_NI_PCI_ISA) += tnt4882/
diff --git a/drivers/gpib/TODO b/drivers/gpib/TODO
new file mode 100644
index 000000000000..ac07dd90b4ef
--- /dev/null
+++ b/drivers/gpib/TODO
@@ -0,0 +1,10 @@
+TODO:
+- checkpatch.pl fixes
+  These checks should be ignored:
+    CHECK:ALLOC_SIZEOF_STRUCT: Prefer kmalloc(sizeof(*board->private_data)...) over kmalloc(sizeof(struct xxx_priv)...)
+    ./gpio/gpib_bitbang.c:50: ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in parentheses
+  This warning will be addressed later:  WARNING:UNDOCUMENTED_DT_STRING: DT compatible string
+- resolve XXX notes where possible
+- fix FIXME notes
+- clean-up commented-out code
+- fix typos
diff --git a/drivers/gpib/agilent_82350b/Makefile b/drivers/gpib/agilent_82350b/Makefile
new file mode 100644
index 000000000000..f24e1e713a63
--- /dev/null
+++ b/drivers/gpib/agilent_82350b/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_GPIB_AGILENT_82350B) += agilent_82350b.o
diff --git a/drivers/gpib/agilent_82350b/agilent_82350b.c b/drivers/gpib/agilent_82350b/agilent_82350b.c
new file mode 100644
index 000000000000..01a5bb43cd2d
--- /dev/null
+++ b/drivers/gpib/agilent_82350b/agilent_82350b.c
@@ -0,0 +1,896 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *   copyright            : (C) 2002, 2004 by Frank Mori Hess              *
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "agilent_82350b.h"
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/dma.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for Agilent 82350b");
+
+static int read_transfer_counter(struct agilent_82350b_priv *a_priv);
+static unsigned short read_and_clear_event_status(struct gpib_board *board);
+static void set_transfer_counter(struct agilent_82350b_priv *a_priv, int count);
+static int agilent_82350b_write(struct gpib_board *board, u8 *buffer,
+				size_t length, int send_eoi, size_t *bytes_written);
+
+static int agilent_82350b_accel_read(struct gpib_board *board, u8 *buffer,
+				     size_t length, int *end, size_t *bytes_read)
+/* FIFO-assisted read through the board SRAM; returns 0 or a negative errno */
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &a_priv->tms9914_priv;
+	int retval = 0;
+	unsigned short event_status;
+	int i, num_fifo_bytes;
+	/* hardware doesn't support checking for end-of-string character when using fifo */
+	if (tms_priv->eos_flags & REOS)
+		return tms9914_read(board, tms_priv, buffer, length, end, bytes_read);
+
+	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
+
+	read_and_clear_event_status(board);
+	*end = 0;
+	*bytes_read = 0;
+	if (length == 0)
+		return 0;
+	/* disable fifo for the moment */
+	writeb(DIRECTION_GPIB_TO_HOST, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+	/* handle corner case of board not in holdoff and one byte might slip in early */
+	if (tms_priv->holdoff_active == 0 && length > 1) {
+		size_t num_bytes;
+
+		retval = tms9914_read(board, tms_priv, buffer, 1, end, &num_bytes);
+		*bytes_read += num_bytes;
+		if (retval < 0 || *end)
+			return retval;
+		++buffer;
+		--length;
+	}
+	tms9914_set_holdoff_mode(tms_priv, TMS9914_HOLDOFF_EOI);
+	tms9914_release_holdoff(tms_priv);
+	i = 0;
+	num_fifo_bytes = length - 1;	/* keep the final byte out of the FIFO transfer */
+	/* disable BI interrupts */
+	write_byte(tms_priv, tms_priv->imr0_bits & ~HR_BIIE, IMR0);
+	while (i < num_fifo_bytes && *end == 0) {
+		int block_size;
+		int j;
+		int count;
+
+		block_size = min(num_fifo_bytes - i, agilent_82350b_fifo_size);
+		set_transfer_counter(a_priv, block_size);
+		writeb(ENABLE_TI_TO_SRAM | DIRECTION_GPIB_TO_HOST,
+		       a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+		if (agilent_82350b_fifo_is_halted(a_priv))
+			writeb(RESTART_STREAM_BIT, a_priv->gpib_base + STREAM_STATUS_REG);
+
+		clear_bit(READ_READY_BN, &tms_priv->state);
+
+		retval = wait_event_interruptible(board->wait,
+						  ((event_status =
+						    read_and_clear_event_status(board)) &
+						   (TERM_COUNT_STATUS_BIT |
+						    BUFFER_END_STATUS_BIT)) ||
+						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
+						  test_bit(TIMO_NUM, &board->status));
+		if (retval) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		count = block_size - read_transfer_counter(a_priv);
+		for (j = 0; j < count && i < num_fifo_bytes; ++j)
+			buffer[i++] = readb(a_priv->sram_base + j);
+		if (event_status & BUFFER_END_STATUS_BIT) {
+			clear_bit(RECEIVED_END_BN, &tms_priv->state);
+
+			tms_priv->holdoff_active = 1;
+			*end = 1;
+		}
+		if (test_bit(TIMO_NUM, &board->status)) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+	}
+	/* re-enable BI interrupts */
+	write_byte(tms_priv, tms_priv->imr0_bits, IMR0);
+	*bytes_read += i;
+	buffer += i;
+	length -= i;
+	writeb(DIRECTION_GPIB_TO_HOST, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+	if (retval < 0)
+		return retval;
+	/* read last bytes if we haven't received an END yet */
+	if (*end == 0) {
+		size_t num_bytes;
+		/* try to make sure we holdoff after last byte read */
+		retval = tms9914_read(board, tms_priv, buffer, length, end, &num_bytes);
+		*bytes_read += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+/* Map a wait_event_interruptible() result plus the board state onto an errno (0 if none) */
+static int translate_wait_return_value(struct gpib_board *board, int retval)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	int rc = 0;
+
+	if (retval)
+		rc = -ERESTARTSYS;
+	else if (test_bit(TIMO_NUM, &board->status))
+		rc = -ETIMEDOUT;
+	else if (test_bit(DEV_CLEAR_BN, &a_priv->tms9914_priv.state))
+		rc = -EINTR;
+	return rc;
+}
+
+/* FIFO-assisted write; the first byte and the EOI byte go through agilent_82350b_write() */
+static int agilent_82350b_accel_write(struct gpib_board *board, u8 *buffer,
+				      size_t length, int send_eoi,
+				      size_t *bytes_written)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &a_priv->tms9914_priv;
+	int i, j;
+	unsigned short event_status;
+	int retval = 0;
+	int fifotransferlength = length;
+	int block_size = 0;
+	size_t num_bytes;
+
+	*bytes_written = 0;
+	/* an empty buffer has no last byte to carry EOI, so never index buffer[-1] */
+	if (send_eoi && length > 0)
+		--fifotransferlength;
+
+	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
+	writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+	event_status = read_and_clear_event_status(board);
+
+#ifdef EXPERIMENTAL
+	/* wait for previous BO to complete if any */
+	retval = wait_event_interruptible(board->wait,
+					  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
+					  test_bit(WRITE_READY_BN, &tms_priv->state) ||
+					  test_bit(TIMO_NUM, &board->status));
+	retval = translate_wait_return_value(board, retval);
+
+	if (retval)
+		return retval;
+#endif
+
+	if (fifotransferlength > 0) {
+		retval = agilent_82350b_write(board, buffer, 1, 0, &num_bytes);
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+
+	write_byte(tms_priv, tms_priv->imr0_bits & ~HR_BOIE, IMR0);
+	for (i = 1; i < fifotransferlength;) {
+		clear_bit(WRITE_READY_BN, &tms_priv->state);
+
+		block_size = min(fifotransferlength - i, agilent_82350b_fifo_size);
+		set_transfer_counter(a_priv, block_size);
+		for (j = 0; j < block_size; ++j, ++i) {
+			/* load data into board's sram */
+			writeb(buffer[i], a_priv->sram_base + j);
+		}
+		writeb(ENABLE_TI_TO_SRAM, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+
+		if (agilent_82350b_fifo_is_halted(a_priv))
+			writeb(RESTART_STREAM_BIT, a_priv->gpib_base + STREAM_STATUS_REG);
+
+		retval = wait_event_interruptible(board->wait,
+						  ((event_status =
+						    read_and_clear_event_status(board)) &
+						   TERM_COUNT_STATUS_BIT) ||
+						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
+						  test_bit(TIMO_NUM, &board->status));
+		writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+		num_bytes = block_size - read_transfer_counter(a_priv);
+
+		*bytes_written += num_bytes;
+		retval = translate_wait_return_value(board, retval);
+		if (retval)
+			break;
+	}
+	write_byte(tms_priv, tms_priv->imr0_bits, IMR0);
+	if (retval < 0)
+		return retval;
+
+	if (send_eoi && length > 0) {
+		retval = agilent_82350b_write(board, buffer + fifotransferlength, 1, send_eoi,
+					      &num_bytes);
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+static unsigned short read_and_clear_event_status(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	unsigned short pending;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	pending = a_priv->event_status_bits;	/* bits accumulated by the interrupt handler */
+	a_priv->event_status_bits = 0;
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	return pending;
+}
+
+/* IRQ handler: services the tms9914 and latches buffer-end/term-count events for waiters */
+static irqreturn_t agilent_82350b_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	irqreturn_t handled = IRQ_NONE;
+	int isr0 = 0, isr1 = 0;
+	unsigned long flags;
+	int event_status;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	event_status = readb(a_priv->gpib_base + EVENT_STATUS_REG);
+	if (event_status & IRQ_STATUS_BIT)
+		handled = IRQ_HANDLED;
+
+	/* hand the chip's own interrupt status over to the tms9914 library */
+	if (event_status & TMS9914_IRQ_STATUS_BIT) {
+		isr0 = read_byte(&a_priv->tms9914_priv, ISR0);
+		isr1 = read_byte(&a_priv->tms9914_priv, ISR1);
+		tms9914_interrupt_have_status(board, &a_priv->tms9914_priv, isr0, isr1);
+	}
+	/* write-clear status bits */
+	if (event_status & (BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT)) {
+		writeb(event_status & (BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT),
+		       a_priv->gpib_base + EVENT_STATUS_REG);
+		a_priv->event_status_bits |= event_status;
+		wake_up_interruptible(&board->wait);
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return handled;
+}
+
+static void agilent_82350b_detach(struct gpib_board *board);
+
+/* Read the LO/MID transfer count registers and return their 15-bit negation */
+static int read_transfer_counter(struct agilent_82350b_priv *a_priv)
+{
+	int lo = readb(a_priv->gpib_base + XFER_COUNT_LO_REG);
+	int mid = readb(a_priv->gpib_base + XFER_COUNT_MID_REG);
+	int raw = (lo & 0xff) | ((mid << 8) & 0x7f00);
+
+	/* set_transfer_counter() stores -count, so negate within 15 bits */
+	return ~(raw - 1) & 0x7fff;
+}
+
+static void set_transfer_counter(struct agilent_82350b_priv *a_priv, int count)
+{
+	int neg_count = -count;	/* the registers are loaded with the negated count */
+
+	writeb(neg_count & 0xff, a_priv->gpib_base + XFER_COUNT_LO_REG);
+	writeb((neg_count >> 8) & 0xff, a_priv->gpib_base + XFER_COUNT_MID_REG);
+	/* I don't think the hi count reg is even used, but oh well */
+	writeb((neg_count >> 16) & 0xf, a_priv->gpib_base + XFER_COUNT_HI_REG);
+}
+
+/* wrappers for interface functions */
+static int agilent_82350b_read(struct gpib_board *board, u8 *buffer,
+			       size_t length, int *end, size_t *bytes_read)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_read(board, &a_priv->tms9914_priv, buffer, length, end, bytes_read);
+}
+
+/* Unaccelerated write: the whole transfer is handed to tms9914_write() */
+static int agilent_82350b_write(struct gpib_board *board, u8 *buffer,
+				size_t length, int send_eoi, size_t *bytes_written)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_write(board, &a_priv->tms9914_priv, buffer, length, send_eoi, bytes_written);
+}
+
+/* Send command bytes by delegating to tms9914_command() */
+static int agilent_82350b_command(struct gpib_board *board, u8 *buffer,
+				  size_t length, size_t *bytes_written)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_command(board, &a_priv->tms9914_priv, buffer, length, bytes_written);
+}
+
+/* Take control through the tms9914 library's "workaround" variant */
+static int agilent_82350b_take_control(struct gpib_board *board, int synchronous)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_take_control_workaround(board, &a_priv->tms9914_priv, synchronous);
+}
+
+/* Go to standby by delegating to tms9914_go_to_standby() */
+static int agilent_82350b_go_to_standby(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_go_to_standby(board, &a_priv->tms9914_priv);
+}
+
+/* Set or clear system-controller mode in the board registers, then in the tms9914 */
+static int agilent_82350b_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	const u8 ic_bits = request_control ? IC_SYSTEM_CONTROLLER_BIT : 0;
+
+	if (request_control)
+		a_priv->card_mode_bits |= CM_SYSTEM_CONTROLLER_BIT;
+	else
+		a_priv->card_mode_bits &= ~CM_SYSTEM_CONTROLLER_BIT;
+	/* INTERNAL_CONFIG_REG is only written on models other than the 82350A */
+	if (a_priv->model != MODEL_82350A)
+		writeb(ic_bits, a_priv->gpib_base + INTERNAL_CONFIG_REG);
+	writeb(a_priv->card_mode_bits, a_priv->gpib_base + CARD_MODE_REG);
+	return tms9914_request_system_control(board, &a_priv->tms9914_priv, request_control);
+}
+
+/* Assert or release interface clear by delegating to tms9914_interface_clear() */
+static void agilent_82350b_interface_clear(struct gpib_board *board, int assert)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_interface_clear(board, &a_priv->tms9914_priv, assert);
+}
+
+static void agilent_82350b_remote_enable(struct gpib_board *board, int enable)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_remote_enable(board, &a_priv->tms9914_priv, enable);
+}
+
+/* Enable end-of-string detection by delegating to tms9914_enable_eos() */
+static int agilent_82350b_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_enable_eos(board, &a_priv->tms9914_priv, eos_byte, compare_8_bits);
+}
+
+static void agilent_82350b_disable_eos(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_disable_eos(board, &a_priv->tms9914_priv);
+}
+
+/* Refresh and return the board status by delegating to tms9914_update_status() */
+static unsigned int agilent_82350b_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_update_status(board, &a_priv->tms9914_priv, clear_mask);
+}
+
+/* Set the primary address by delegating to tms9914_primary_address() */
+static int agilent_82350b_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_primary_address(board, &a_priv->tms9914_priv, address);
+}
+
+static int agilent_82350b_secondary_address(struct gpib_board *board,
+					    unsigned int address, int enable)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_secondary_address(board, &a_priv->tms9914_priv, address, enable);
+}
+
+static int agilent_82350b_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_parallel_poll(board, &a_priv->tms9914_priv, result);
+}
+
+/* Configure parallel poll by delegating to tms9914_parallel_poll_configure() */
+static void agilent_82350b_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_parallel_poll_configure(board, &a_priv->tms9914_priv, config);
+}
+
+static void agilent_82350b_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_parallel_poll_response(board, &a_priv->tms9914_priv, ist);
+}
+
+static void agilent_82350b_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_serial_poll_response(board, &a_priv->tms9914_priv, status);
+}
+
+static u8 agilent_82350b_serial_poll_status(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_serial_poll_status(board, &a_priv->tms9914_priv);
+}
+
+static int agilent_82350b_line_status(const struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	return tms9914_line_status(board, &a_priv->tms9914_priv);
+}
+
+/* Set the T1 delay in the tms9914 and in T1_DELAY_REG; returns the programmed delay */
+static int agilent_82350b_t1_delay(struct gpib_board *board, unsigned int nanosec)
+{
+	static const int nanosec_per_clock = 30;
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	unsigned int clocks = DIV_ROUND_UP(nanosec, nanosec_per_clock);
+
+	tms9914_t1_delay(board, &a_priv->tms9914_priv, nanosec);
+
+	if (clocks > 0xff)
+		clocks = 0xff;
+	writeb(clocks, a_priv->gpib_base + T1_DELAY_REG);
+	return clocks * nanosec_per_clock;
+}
+
+static void agilent_82350b_return_to_local(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+
+	tms9914_return_to_local(board, &a_priv->tms9914_priv);
+}
+
+static int agilent_82350b_allocate_private(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = kzalloc(sizeof(struct agilent_82350b_priv), GFP_KERNEL);
+
+	board->private_data = a_priv;	/* NULL on allocation failure, as before */
+	return a_priv ? 0 : -ENOMEM;
+}
+
+static void agilent_82350b_free_private(struct gpib_board *board)
+{
+	kfree(board->private_data);
+	board->private_data = NULL;	/* detach tests this pointer before touching hardware */
+}
+
+static int init_82350a_hardware(struct gpib_board *board,
+				const struct gpib_board_config *config)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	static const unsigned int firmware_length = 5302;
+	unsigned int borg_status;
+	static const unsigned int timeout = 1000;
+	int i, j;
+	const char *firmware_data = config->init_data;
+	const unsigned int plx_cntrl_static_bits = PLX9050_WAITO_NOT_USER0_SELECT_BIT |
+		PLX9050_USER0_OUTPUT_BIT |
+		PLX9050_LLOCK_NOT_USER1_SELECT_BIT |
+		PLX9050_USER1_OUTPUT_BIT |
+		PLX9050_USER2_OUTPUT_BIT |
+		PLX9050_USER3_OUTPUT_BIT |
+		PLX9050_PCI_READ_MODE_BIT |
+		PLX9050_PCI_WRITE_MODE_BIT |
+		PLX9050_PCI_RETRY_DELAY_BITS(64) |
+		PLX9050_DIRECT_SLAVE_LOCK_ENABLE_BIT;
+
+	/* upload the 82350A firmware (config->init_data) unless the borg already reports done */
+	borg_status = readb(a_priv->borg_base);
+	if ((borg_status & BORG_DONE_BIT))
+		return 0;
+	/* need to programme borg */
+	if (!config->init_data || config->init_data_length != firmware_length) {
+		dev_err(board->gpib_dev, "the 82350A board requires firmware after powering on.\n");
+		return -EIO;
+	}
+	dev_dbg(board->gpib_dev, "Loading firmware...\n");
+
+	/* tickle the borg: drive PLX9050 USER3 data high, low, high with 1-2 ms pauses */
+	writel(plx_cntrl_static_bits | PLX9050_USER3_DATA_BIT,
+	       a_priv->plx_base + PLX9050_CNTRL_REG);
+	usleep_range(1000, 2000);
+	writel(plx_cntrl_static_bits, a_priv->plx_base + PLX9050_CNTRL_REG);
+	usleep_range(1000, 2000);
+	writel(plx_cntrl_static_bits | PLX9050_USER3_DATA_BIT,
+	       a_priv->plx_base + PLX9050_CNTRL_REG);
+	usleep_range(1000, 2000);
+	/* feed the image byte by byte, polling for BORG_READY_BIT before each write */
+	for (i = 0; i < config->init_data_length; ++i) {
+		for (j = 0; j < timeout && (readb(a_priv->borg_base) & BORG_READY_BIT) == 0; ++j) {
+			if (need_resched())
+				schedule();
+			usleep_range(10, 20);
+		}
+		if (j == timeout) {
+			dev_err(board->gpib_dev, "timed out loading firmware.\n");
+			return -ETIMEDOUT;
+		}
+		writeb(firmware_data[i], a_priv->gpib_base + CONFIG_DATA_REG);
+	}
+	for (j = 0; j < timeout && (readb(a_priv->borg_base) & BORG_DONE_BIT) == 0; ++j) {
+		if (need_resched())
+			schedule();
+		usleep_range(10, 20);
+	}
+	if (j == timeout) {
+		dev_err(board->gpib_dev, "timed out waiting for firmware load to complete.\n");
+		return -ETIMEDOUT;
+	}
+	dev_dbg(board->gpib_dev, " ...done.\n");
+	return 0;
+}
+
+/* Fill the SRAM with (offset & 0xff) and read it back; 0 on success, -EIO on mismatch */
+static int test_sram(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	/* NOTE(review): always sized from SRAM_82350A_REGION, even on 82350B/82351A - confirm */
+	const unsigned int sram_length = pci_resource_len(a_priv->pci_device, SRAM_82350A_REGION);
+	const unsigned int byte_mask = 0xff;
+	unsigned int offset;
+
+	for (offset = 0; offset < sram_length; ++offset) {
+		writeb(offset & byte_mask, a_priv->sram_base + offset);
+		if (need_resched())
+			schedule();
+	}
+	for (offset = 0; offset < sram_length; ++offset) {
+		unsigned int read_value = readb(a_priv->sram_base + offset);
+
+		if ((offset & byte_mask) != read_value) {
+			dev_err(board->gpib_dev, "SRAM test failed at %d wanted %d got %d\n",
+				offset, (offset & byte_mask), read_value);
+			return -EIO;
+		}
+		if (need_resched())
+			schedule();
+	}
+	dev_dbg(board->gpib_dev, "SRAM test passed 0x%x bytes checked\n", sram_length);
+	return 0;
+}
+
+static int agilent_82350b_generic_attach(struct gpib_board *board,
+					 const struct gpib_board_config *config,
+					 int use_fifos)
+/* Find, map and initialise the board; use_fifos selects the FIFO event/interrupt setup */
+{
+	struct agilent_82350b_priv *a_priv;
+	struct tms9914_priv *tms_priv;
+	int retval;
+
+	board->status = 0;
+	/* NOTE(review): error returns below release nothing - presumably detach runs; confirm */
+	if (agilent_82350b_allocate_private(board))
+		return -ENOMEM;
+	a_priv = board->private_data;
+	a_priv->using_fifos = use_fifos;
+	tms_priv = &a_priv->tms9914_priv;
+	tms_priv->read_byte = tms9914_iomem_read_byte;
+	tms_priv->write_byte = tms9914_iomem_write_byte;
+	tms_priv->offset = 1;
+
+	/* find board */
+	a_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_AGILENT,
+						 PCI_DEVICE_ID_82350B, NULL);
+	if (a_priv->pci_device) {
+		a_priv->model = MODEL_82350B;
+		dev_dbg(board->gpib_dev, "Agilent 82350B board found\n");
+
+	} else	{
+		a_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_AGILENT,
+							 PCI_DEVICE_ID_82351A, NULL);
+		if (a_priv->pci_device)	{
+			a_priv->model = MODEL_82351A;
+			dev_dbg(board->gpib_dev, "Agilent 82351B board found\n");
+			/* NOTE(review): message says 82351B, matched ID/model is 82351A - confirm */
+		} else {
+			a_priv->pci_device = gpib_pci_get_subsys(config, PCI_VENDOR_ID_PLX,
+								 PCI_DEVICE_ID_PLX_9050,
+								 PCI_VENDOR_ID_HP,
+								 PCI_SUBDEVICE_ID_82350A,
+								 a_priv->pci_device);
+			if (a_priv->pci_device) {
+				a_priv->model = MODEL_82350A;
+				dev_dbg(board->gpib_dev, "HP/Agilent 82350A board found\n");
+			} else {
+				dev_err(board->gpib_dev, "no 82350/82351 board found\n");
+				return -ENODEV;
+			}
+		}
+	}
+	if (pci_enable_device(a_priv->pci_device)) {
+		dev_err(board->gpib_dev, "error enabling pci device\n");
+		return -EIO;
+	}
+	if (pci_request_regions(a_priv->pci_device, DRV_NAME))
+		return -ENOMEM;
+	switch (a_priv->model) {
+	case MODEL_82350A:
+		a_priv->plx_base = ioremap(pci_resource_start(a_priv->pci_device, PLX_MEM_REGION),
+					   pci_resource_len(a_priv->pci_device, PLX_MEM_REGION));
+		dev_dbg(board->gpib_dev, "plx base address remapped to 0x%p\n", a_priv->plx_base);
+		a_priv->gpib_base = ioremap(pci_resource_start(a_priv->pci_device,
+							       GPIB_82350A_REGION),
+					    pci_resource_len(a_priv->pci_device,
+							     GPIB_82350A_REGION));
+		dev_dbg(board->gpib_dev, "chip base address remapped to 0x%p\n", a_priv->gpib_base);
+		tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG;
+		a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device,
+							       SRAM_82350A_REGION),
+					    pci_resource_len(a_priv->pci_device,
+							     SRAM_82350A_REGION));
+		dev_dbg(board->gpib_dev, "sram base address remapped to 0x%p\n", a_priv->sram_base);
+		a_priv->borg_base = ioremap(pci_resource_start(a_priv->pci_device,
+							       BORG_82350A_REGION),
+					    pci_resource_len(a_priv->pci_device,
+							     BORG_82350A_REGION));
+		dev_dbg(board->gpib_dev, "borg base address remapped to 0x%p\n", a_priv->borg_base);
+
+		retval = init_82350a_hardware(board, config);
+		if (retval < 0)
+			return retval;
+		break;
+	case MODEL_82350B:
+	case MODEL_82351A:
+		a_priv->gpib_base = ioremap(pci_resource_start(a_priv->pci_device, GPIB_REGION),
+					    pci_resource_len(a_priv->pci_device, GPIB_REGION));
+		dev_dbg(board->gpib_dev, "chip base address remapped to 0x%p\n", a_priv->gpib_base);
+		tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG;
+		a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device, SRAM_REGION),
+					    pci_resource_len(a_priv->pci_device, SRAM_REGION));
+		dev_dbg(board->gpib_dev, "sram base address remapped to 0x%p\n", a_priv->sram_base);
+		a_priv->misc_base = ioremap(pci_resource_start(a_priv->pci_device, MISC_REGION),
+					    pci_resource_len(a_priv->pci_device, MISC_REGION));
+		dev_dbg(board->gpib_dev, "misc base address remapped to 0x%p\n", a_priv->misc_base);
+		break;
+	default:
+		dev_err(board->gpib_dev, "invalid board\n");
+		return -ENODEV;
+	}
+	/* NOTE(review): none of the ioremap() results above is checked before being used */
+	retval = test_sram(board);
+	if (retval < 0)
+		return retval;
+
+	if (request_irq(a_priv->pci_device->irq, agilent_82350b_interrupt,
+			IRQF_SHARED, DRV_NAME, board)) {
+		dev_err(board->gpib_dev, "failed to obtain irq %d\n", a_priv->pci_device->irq);
+		return -EIO;
+	}
+	a_priv->irq = a_priv->pci_device->irq;
+	dev_dbg(board->gpib_dev, " IRQ %d\n", a_priv->irq);
+
+	writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
+	a_priv->card_mode_bits = ENABLE_PCI_IRQ_BIT;
+	writeb(a_priv->card_mode_bits, a_priv->gpib_base + CARD_MODE_REG);
+
+	if (a_priv->model == MODEL_82350A) {
+		/* enable PCI interrupts for 82350a */
+		writel(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR2_POLARITY_BIT |
+		       PLX9050_PCI_INTR_EN_BIT,
+		       a_priv->plx_base + PLX9050_INTCSR_REG);
+	}
+
+	if (use_fifos) {
+		writeb(ENABLE_BUFFER_END_EVENTS_BIT | ENABLE_TERM_COUNT_EVENTS_BIT,
+		       a_priv->gpib_base + EVENT_ENABLE_REG);
+		writeb(ENABLE_TERM_COUNT_INTERRUPT_BIT | ENABLE_BUFFER_END_INTERRUPT_BIT |
+		       ENABLE_TMS9914_INTERRUPTS_BIT, a_priv->gpib_base + INTERRUPT_ENABLE_REG);
+		/* write-clear event status bits */
+		writeb(BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT,
+		       a_priv->gpib_base + EVENT_STATUS_REG);
+	} else {
+		writeb(0, a_priv->gpib_base + EVENT_ENABLE_REG);
+		writeb(ENABLE_TMS9914_INTERRUPTS_BIT,
+		       a_priv->gpib_base + INTERRUPT_ENABLE_REG);
+	}
+	board->t1_nano_sec = agilent_82350b_t1_delay(board, 2000);
+	tms9914_board_reset(tms_priv);
+
+	tms9914_online(board, tms_priv);
+
+	return 0;
+}
+
+static int agilent_82350b_unaccel_attach(struct gpib_board *board,
+					 const struct gpib_board_config *config)
+{
+	return agilent_82350b_generic_attach(board, config, 0);	/* use_fifos = 0 */
+}
+
+static int agilent_82350b_accel_attach(struct gpib_board *board,
+				       const struct gpib_board_config *config)
+{
+	return agilent_82350b_generic_attach(board, config, 1);	/* use_fifos = 1 */
+}
+
+/* Undo attach: mask PLX irqs, free the irq, reset the chip, unmap, release PCI resources */
+static void agilent_82350b_detach(struct gpib_board *board)
+{
+	struct agilent_82350b_priv *a_priv = board->private_data;
+	struct tms9914_priv *tms_priv;
+
+	if (a_priv) {
+		if (a_priv->plx_base) /* disable interrupts */
+			writel(0, a_priv->plx_base + PLX9050_INTCSR_REG);
+		/* NOTE(review): attach's pci_enable_device() has no pci_disable_device() here */
+		tms_priv = &a_priv->tms9914_priv;
+		if (a_priv->irq)
+			free_irq(a_priv->irq, board);
+		if (a_priv->gpib_base) {
+			tms9914_board_reset(tms_priv);
+			if (a_priv->misc_base)
+				iounmap(a_priv->misc_base);
+			if (a_priv->borg_base)
+				iounmap(a_priv->borg_base);
+			if (a_priv->sram_base)
+				iounmap(a_priv->sram_base);
+			iounmap(a_priv->gpib_base);
+			if (a_priv->plx_base)
+				iounmap(a_priv->plx_base);
+			pci_release_regions(a_priv->pci_device);
+		}
+		if (a_priv->pci_device)
+			pci_dev_put(a_priv->pci_device);
+	}
+	agilent_82350b_free_private(board);
+}
+
+static struct gpib_interface agilent_82350b_unaccel_interface = { /* plain tms9914 read/write */
+	.name = "agilent_82350b_unaccel",
+	.attach = agilent_82350b_unaccel_attach,
+	.detach = agilent_82350b_detach,
+	.read = agilent_82350b_read,
+	.write = agilent_82350b_write,
+	.command = agilent_82350b_command,
+	.request_system_control = agilent_82350b_request_system_control,
+	.take_control = agilent_82350b_take_control,
+	.go_to_standby = agilent_82350b_go_to_standby,
+	.interface_clear = agilent_82350b_interface_clear,
+	.remote_enable = agilent_82350b_remote_enable,
+	.enable_eos = agilent_82350b_enable_eos,
+	.disable_eos = agilent_82350b_disable_eos,
+	.parallel_poll = agilent_82350b_parallel_poll,
+	.parallel_poll_configure = agilent_82350b_parallel_poll_configure,
+	.parallel_poll_response = agilent_82350b_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, /* XXX */
+	.line_status = agilent_82350b_line_status,
+	.update_status = agilent_82350b_update_status,
+	.primary_address = agilent_82350b_primary_address,
+	.secondary_address = agilent_82350b_secondary_address,
+	.serial_poll_response = agilent_82350b_serial_poll_response,
+	.serial_poll_status = agilent_82350b_serial_poll_status,
+	.t1_delay = agilent_82350b_t1_delay,
+	.return_to_local = agilent_82350b_return_to_local,
+};
+
+static struct gpib_interface agilent_82350b_interface = { /* FIFO-assisted read/write */
+	.name = "agilent_82350b",
+	.attach = agilent_82350b_accel_attach,
+	.detach = agilent_82350b_detach,
+	.read = agilent_82350b_accel_read,
+	.write = agilent_82350b_accel_write,
+	.command = agilent_82350b_command,
+	.request_system_control = agilent_82350b_request_system_control,
+	.take_control = agilent_82350b_take_control,
+	.go_to_standby = agilent_82350b_go_to_standby,
+	.interface_clear = agilent_82350b_interface_clear,
+	.remote_enable = agilent_82350b_remote_enable,
+	.enable_eos = agilent_82350b_enable_eos,
+	.disable_eos = agilent_82350b_disable_eos,
+	.parallel_poll = agilent_82350b_parallel_poll,
+	.parallel_poll_configure = agilent_82350b_parallel_poll_configure,
+	.parallel_poll_response = agilent_82350b_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, /* XXX */
+	.line_status = agilent_82350b_line_status,
+	.update_status = agilent_82350b_update_status,
+	.primary_address = agilent_82350b_primary_address,
+	.secondary_address = agilent_82350b_secondary_address,
+	.serial_poll_response = agilent_82350b_serial_poll_response,
+	.serial_poll_status = agilent_82350b_serial_poll_status,
+	.t1_delay = agilent_82350b_t1_delay,
+	.return_to_local = agilent_82350b_return_to_local,
+};
+
+static int agilent_82350b_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+
+{
+	return 0;	/* nothing done here; boards are looked up in the gpib attach callbacks */
+}
+
+static const struct pci_device_id agilent_82350b_pci_table[] = { /* same IDs attach looks for */
+	{ PCI_VENDOR_ID_PLX,     PCI_DEVICE_ID_PLX_9050, PCI_VENDOR_ID_HP,
+	  PCI_SUBDEVICE_ID_82350A, 0, 0, 0 },
+	{ PCI_VENDOR_ID_AGILENT, PCI_DEVICE_ID_82350B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ PCI_VENDOR_ID_AGILENT, PCI_DEVICE_ID_82351A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ 0 }
+};
+MODULE_DEVICE_TABLE(pci, agilent_82350b_pci_table);
+
+static struct pci_driver agilent_82350b_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = agilent_82350b_pci_table,
+	.probe = &agilent_82350b_pci_probe	/* stub that always returns 0 */
+};
+
+static int __init agilent_82350b_init_module(void)
+{
+	int result;
+
+	result = pci_register_driver(&agilent_82350b_pci_driver);
+	if (result) {
+		pr_err("pci_register_driver failed: error = %d\n", result);
+		return result;
+	}
+	/* register the unaccelerated interface first, then the FIFO-accelerated one */
+	result = gpib_register_driver(&agilent_82350b_unaccel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_unaccel;
+	}
+
+	result = gpib_register_driver(&agilent_82350b_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_interface;
+	}
+
+	return 0;
+	/* error unwinding: undo the registrations in reverse order */
+err_interface:
+	gpib_unregister_driver(&agilent_82350b_unaccel_interface);
+err_unaccel:
+	pci_unregister_driver(&agilent_82350b_pci_driver);
+
+	return result;
+}
+
+static void __exit agilent_82350b_exit_module(void)
+{
+	gpib_unregister_driver(&agilent_82350b_interface);
+	gpib_unregister_driver(&agilent_82350b_unaccel_interface);
+	/* the PCI driver was registered first, so it is unregistered last */
+	pci_unregister_driver(&agilent_82350b_pci_driver);
+}
+
+module_init(agilent_82350b_init_module);
+module_exit(agilent_82350b_exit_module);
diff --git a/drivers/gpib/agilent_82350b/agilent_82350b.h b/drivers/gpib/agilent_82350b/agilent_82350b.h
new file mode 100644
index 000000000000..ef841957297f
--- /dev/null
+++ b/drivers/gpib/agilent_82350b/agilent_82350b.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002, 2004 by Frank Mori Hess             *
+ ***************************************************************************/
+
+#include "gpibP.h"
+#include "plx9050.h"
+#include "tms9914.h"
+
+enum pci_vendor_ids {	/* vendor ID matched for the 82350B/82351A entries */
+	PCI_VENDOR_ID_AGILENT = 0x15bc,
+};
+
+enum pci_device_ids {	/* device IDs paired with PCI_VENDOR_ID_AGILENT */
+	PCI_DEVICE_ID_82350B = 0x0b01,
+	PCI_DEVICE_ID_82351A = 0x1218
+};
+
+enum pci_subdevice_ids {	/* subsystem device ID used to match the PLX9050-based 82350A */
+	PCI_SUBDEVICE_ID_82350A = 0x10b0,
+};
+
+enum pci_regions_82350a {	/* PCI resource indices used on the 82350A */
+	PLX_MEM_REGION  = 0,
+	PLX_IO_REGION   = 1,
+	GPIB_82350A_REGION = 2,
+	SRAM_82350A_REGION = 3,
+	BORG_82350A_REGION = 4
+};
+
+enum pci_regions_82350b {	/* PCI resource indices used on the 82350B/82351A */
+	GPIB_REGION = 0,
+	SRAM_REGION = 1,
+	MISC_REGION = 2,
+};
+
+enum board_model {	/* stored in agilent_82350b_priv.model */
+	MODEL_82350A,
+	MODEL_82350B,
+	MODEL_82351A
+};
+
+/* per-board driver state, stored in gpib_board.private_data */
+struct agilent_82350b_priv {
+	struct tms9914_priv tms9914_priv;
+	struct pci_dev *pci_device;	/* reference dropped with pci_dev_put() in detach */
+	void __iomem *plx_base;	/* 82350a only */
+	void __iomem *gpib_base;	/* board registers; tms9914 at TMS9914_BASE_REG */
+	void __iomem *sram_base;	/* data buffer used by the accelerated read/write paths */
+	void __iomem *misc_base;	/* mapped for 82350B/82351A only */
+	void __iomem *borg_base;	/* mapped for 82350A only; firmware load status */
+	int irq;	/* set once request_irq() has succeeded */
+	unsigned short card_mode_bits;	/* value last written to CARD_MODE_REG */
+	unsigned short event_status_bits;	/* accumulated by the interrupt handler */
+	enum board_model model;
+	bool using_fifos;	/* copied from attach's use_fifos argument */
+};
+
+/* registers */
+enum agilent_82350b_gpib_registers
+/* the driver adds these offsets to agilent_82350b_priv.gpib_base */
+{
+	CARD_MODE_REG = 0x1,
+	CONFIG_DATA_REG = 0x2, /* 82350A specific */
+	INTERRUPT_ENABLE_REG = 0x3,
+	EVENT_STATUS_REG = 0x4,
+	EVENT_ENABLE_REG = 0x5,
+	STREAM_STATUS_REG = 0x7,
+	DEBUG_RAM0_REG = 0x8,
+	DEBUG_RAM1_REG = 0x9,
+	DEBUG_RAM2_REG = 0xa,
+	DEBUG_RAM3_REG = 0xb,
+	XFER_COUNT_LO_REG = 0xc,
+	XFER_COUNT_MID_REG = 0xd,
+	XFER_COUNT_HI_REG = 0xe,
+	TMS9914_BASE_REG = 0x10,
+	INTERNAL_CONFIG_REG = 0x18,
+	IMR0_READ_REG = 0x19, /* read */
+	T1_DELAY_REG = 0x19, /* write */
+	IMR1_READ_REG = 0x1a,
+	ADR_READ_REG = 0x1b,
+	SPMR_READ_REG = 0x1c,
+	PPR_READ_REG = 0x1d,
+	CDOR_READ_REG = 0x1e,
+	SRAM_ACCESS_CONTROL_REG = 0x1f,
+};
+
+enum card_mode_bits
+/* bits of CARD_MODE_REG */
+{
+	ACTIVE_CONTROLLER_BIT = 0x2, /* read-only */
+	CM_SYSTEM_CONTROLLER_BIT = 0x8,
+	ENABLE_BUS_MONITOR_BIT = 0x10,
+	ENABLE_PCI_IRQ_BIT = 0x20,
+};
+
+enum interrupt_enable_bits
+/* bits written to INTERRUPT_ENABLE_REG */
+{
+	ENABLE_TMS9914_INTERRUPTS_BIT = 0x1,
+	ENABLE_BUFFER_END_INTERRUPT_BIT = 0x10,
+	ENABLE_TERM_COUNT_INTERRUPT_BIT = 0x20,
+};
+
+enum event_enable_bits
+/* bits written to EVENT_ENABLE_REG */
+{
+	ENABLE_BUFFER_END_EVENTS_BIT = 0x10,
+	ENABLE_TERM_COUNT_EVENTS_BIT = 0x20,
+};
+
+enum event_status_bits
+/* bits of EVENT_STATUS_REG, as tested by the interrupt handler */
+{
+	TMS9914_IRQ_STATUS_BIT = 0x1,
+	IRQ_STATUS_BIT = 0x2,
+	BUFFER_END_STATUS_BIT = 0x10, /* write-clear */
+	TERM_COUNT_STATUS_BIT = 0x20, /* write-clear */
+};
+
+enum stream_status_bits
+/* bits of STREAM_STATUS_REG; the same bit has a different meaning on read and on write */
+{
+	HALTED_STATUS_BIT = 0x1, /* read */
+	RESTART_STREAM_BIT = 0x1, /* write */
+};
+
+enum internal_config_bits
+/* bits written to INTERNAL_CONFIG_REG */
+{
+	IC_SYSTEM_CONTROLLER_BIT = 0x80,
+};
+
+enum sram_access_control_bits
+/* bits written to SRAM_ACCESS_CONTROL_REG */
+{
+	DIRECTION_GPIB_TO_HOST = 0x20, /* transfer direction */
+	ENABLE_TI_TO_SRAM = 0x40, /* enable fifo */
+	ENABLE_FAST_TALKER = 0x80 /* added for 82350A (not used) */
+};
+
+enum borg_bits
+/* status bits read from borg_base while loading the 82350A firmware */
+{
+	BORG_READY_BIT = 0x40,
+	BORG_DONE_BIT = 0x80
+};
+
+static const int agilent_82350b_fifo_size = 0x8000; /* upper bound for one FIFO block */
+
+static inline int agilent_82350b_fifo_is_halted(struct agilent_82350b_priv *a_priv)
+/* nonzero when HALTED_STATUS_BIT is set in STREAM_STATUS_REG */
+{
+	return readb(a_priv->gpib_base + STREAM_STATUS_REG) & HALTED_STATUS_BIT;
+}
+
diff --git a/drivers/gpib/agilent_82357a/Makefile b/drivers/gpib/agilent_82357a/Makefile
new file mode 100644
index 000000000000..81a55c257a6e
--- /dev/null
+++ b/drivers/gpib/agilent_82357a/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_GPIB_AGILENT_82357A) += agilent_82357a.o
+
+
diff --git a/drivers/gpib/agilent_82357a/agilent_82357a.c b/drivers/gpib/agilent_82357a/agilent_82357a.c
new file mode 100644
index 000000000000..77c8e549b208
--- /dev/null
+++ b/drivers/gpib/agilent_82357a/agilent_82357a.c
@@ -0,0 +1,1691 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *	driver for Agilent 82357A/B usb to gpib adapters		   *
+ *    copyright		   : (C) 2004 by Frank Mori Hess		   *
+ ***************************************************************************/
+
+#define _GNU_SOURCE
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "agilent_82357a.h"
+#include "gpibP.h"
+#include "tms9914.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for Agilent 82357A/B usb adapters");
+/* Table of USB interface pointers, capped at MAX_NUM_82357A_INTERFACES entries. */
+#define MAX_NUM_82357A_INTERFACES 128
+static struct usb_interface *agilent_82357a_driver_interfaces[MAX_NUM_82357A_INTERFACES];
+static DEFINE_MUTEX(agilent_82357a_hotplug_lock); // protect board insertion and removal
+
+static unsigned int agilent_82357a_update_status(struct gpib_board *board,
+						 unsigned int clear_mask);
+
+static int agilent_82357a_take_control_internal(struct gpib_board *board, int synchronous);
+/* Bulk URB completion callback: signals the completion stored in the URB's context. */
+static void agilent_82357a_bulk_complete(struct urb *urb)
+{
+	struct agilent_82357a_urb_ctx *urb_ctx = urb->context;
+
+	complete(&urb_ctx->complete);
+}
+/* bulk_timer expiry: flag the shared URB context as timed out and wake its waiter. */
+static void agilent_82357a_timeout_handler(struct timer_list *t)
+{
+	struct agilent_82357a_priv *a_priv =
+		timer_container_of(a_priv, t, bulk_timer);
+
+	a_priv->context.timed_out = 1;
+	/* wakes the thread blocked on this context in the bulk send/receive helpers */
+	complete(&a_priv->context.complete);
+}
+/* Send data_length bytes of data on the bulk-out endpoint and sleep until the URB completes,
+ * the timer fires (-ETIMEDOUT) or a signal arrives (-ERESTARTSYS); timeout_msecs == 0 arms
+ * no timer. *actual_data_length stays 0 unless the URB completed before the timeout. */
+static int agilent_82357a_send_bulk_msg(struct agilent_82357a_priv *a_priv, void *data,
+					int data_length, int *actual_data_length,
+					int timeout_msecs)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	unsigned int out_pipe;
+	struct agilent_82357a_urb_ctx *context = &a_priv->context;
+
+	*actual_data_length = 0;
+	retval = mutex_lock_interruptible(&a_priv->bulk_alloc_lock);
+	if (retval)
+		return retval;
+	if (!a_priv->bus_interface) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -ENODEV;
+	}
+	if (a_priv->bulk_urb) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -EAGAIN;
+	}
+	a_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!a_priv->bulk_urb) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -ENOMEM;
+	}
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	out_pipe = usb_sndbulkpipe(usb_dev, a_priv->bulk_out_endpoint);
+	init_completion(&context->complete);
+	context->timed_out = 0;
+	usb_fill_bulk_urb(a_priv->bulk_urb, usb_dev, out_pipe, data, data_length,
+			  &agilent_82357a_bulk_complete, context);
+
+	if (timeout_msecs)
+		mod_timer(&a_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
+	retval = usb_submit_urb(a_priv->bulk_urb, GFP_KERNEL);
+	if (retval) {
+		dev_err(&usb_dev->dev, "failed to submit bulk out urb, retval=%i\n", retval);
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		goto cleanup;
+	}
+	mutex_unlock(&a_priv->bulk_alloc_lock);
+	if (wait_for_completion_interruptible(&context->complete)) {
+		retval = -ERESTARTSYS;
+		goto cleanup;
+	}
+	if (context->timed_out) {
+		retval = -ETIMEDOUT;
+	} else {
+		retval = a_priv->bulk_urb->status;
+		*actual_data_length = a_priv->bulk_urb->actual_length;
+	}
+cleanup:
+	/* always sync: timer_pending() is false while the handler is still running */
+	if (timeout_msecs)
+		timer_delete_sync(&a_priv->bulk_timer);
+	mutex_lock(&a_priv->bulk_alloc_lock);
+	if (a_priv->bulk_urb) {
+		usb_kill_urb(a_priv->bulk_urb);
+		usb_free_urb(a_priv->bulk_urb);
+		a_priv->bulk_urb = NULL;
+	}
+	mutex_unlock(&a_priv->bulk_alloc_lock);
+	return retval;
+}
+/* Receive up to data_length bytes on the bulk-in endpoint; returns 0 or a negative errno. */
+static int agilent_82357a_receive_bulk_msg(struct agilent_82357a_priv *a_priv, void *data,
+					   int data_length, int *actual_data_length,
+					   int timeout_msecs)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	unsigned int in_pipe;
+	struct agilent_82357a_urb_ctx *context = &a_priv->context;
+
+	*actual_data_length = 0;
+	retval = mutex_lock_interruptible(&a_priv->bulk_alloc_lock);
+	if (retval)
+		return retval;
+	if (!a_priv->bus_interface) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -ENODEV;
+	}
+	if (a_priv->bulk_urb) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -EAGAIN;
+	}
+	a_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!a_priv->bulk_urb) {
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		return -ENOMEM;
+	}
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	in_pipe = usb_rcvbulkpipe(usb_dev, AGILENT_82357_BULK_IN_ENDPOINT);
+	init_completion(&context->complete);
+	context->timed_out = 0;
+	usb_fill_bulk_urb(a_priv->bulk_urb, usb_dev, in_pipe, data, data_length,
+			  &agilent_82357a_bulk_complete, context);
+	/* the timeout timer is armed only for a nonzero timeout_msecs */
+	if (timeout_msecs)
+		mod_timer(&a_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
+	/* submit while still holding bulk_alloc_lock; it is dropped before sleeping */
+	retval = usb_submit_urb(a_priv->bulk_urb, GFP_KERNEL);
+	if (retval) {
+		dev_err(&usb_dev->dev, "failed to submit bulk in urb, retval=%i\n", retval);
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		goto cleanup;
+	}
+	mutex_unlock(&a_priv->bulk_alloc_lock);
+	if (wait_for_completion_interruptible(&context->complete)) {
+		retval = -ERESTARTSYS;
+		goto cleanup;
+	}
+	if (context->timed_out)	{
+		retval = -ETIMEDOUT;
+		goto cleanup;
+	}
+	retval = a_priv->bulk_urb->status;
+	*actual_data_length = a_priv->bulk_urb->actual_length;
+cleanup:
+	if (timeout_msecs)
+		timer_delete_sync(&a_priv->bulk_timer);
+	/* kill the URB (it may still be in flight) and free it under bulk_alloc_lock */
+	mutex_lock(&a_priv->bulk_alloc_lock);
+	if (a_priv->bulk_urb) {
+		usb_kill_urb(a_priv->bulk_urb);
+		usb_free_urb(a_priv->bulk_urb);
+		a_priv->bulk_urb = NULL;
+	}
+	mutex_unlock(&a_priv->bulk_alloc_lock);
+	return retval;
+}
+/* Control-IN transfer wrapper: returns usb_control_msg()'s result, or -ENODEV / a lock error. */
+static int agilent_82357a_receive_control_msg(struct agilent_82357a_priv *a_priv, __u8 request,
+					      __u8 requesttype, __u16 value,  __u16 index,
+					      void *data, __u16 size, int timeout_msecs)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	unsigned int in_pipe;
+	/* control_alloc_lock is held across the whole usb_control_msg() call */
+	retval = mutex_lock_interruptible(&a_priv->control_alloc_lock);
+	if (retval)
+		return retval;
+	if (!a_priv->bus_interface) {
+		mutex_unlock(&a_priv->control_alloc_lock);
+		return -ENODEV;
+	}
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	in_pipe = usb_rcvctrlpipe(usb_dev, AGILENT_82357_CONTROL_ENDPOINT);
+	retval = usb_control_msg(usb_dev, in_pipe, request, requesttype, value, index, data,
+				 size, timeout_msecs);
+	mutex_unlock(&a_priv->control_alloc_lock);
+	return retval;
+}
+/* Hex-dump length bytes of raw_data to the kernel log at KERN_INFO, 8 bytes per row. */
+static void agilent_82357a_dump_raw_block(const u8 *raw_data, int length)
+{
+	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 8, 1, raw_data, length, true);
+}
+/* Write num_writes (at most 31) address/value pairs with one DATA_PIPE_CMD_WR_REGS bulk-out
+ * command, then read back and validate the adapter's bulk-in reply.
+ * Returns 0 on success or a negative errno. */
+static int agilent_82357a_write_registers(struct agilent_82357a_priv *a_priv,
+					  const struct agilent_82357a_register_pairlet *writes,
+					  int num_writes)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	int retval;
+	u8 *out_data, *in_data;
+	int out_data_length, in_data_length;
+	int bytes_written, bytes_read;
+	int i = 0;
+	int j;
+	static const int bytes_per_write = 2;
+	static const int header_length = 2;
+	static const int max_writes = 31;
+
+	if (num_writes > max_writes) {
+		dev_err(&usb_dev->dev, "bug! num_writes=%i too large\n", num_writes);
+		return -EIO;
+	}
+	out_data_length = num_writes * bytes_per_write + header_length;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+
+	out_data[i++] = DATA_PIPE_CMD_WR_REGS;
+	out_data[i++] = num_writes;
+	for (j = 0; j < num_writes; j++) {
+		out_data[i++] = writes[j].address;
+		out_data[i++] = writes[j].value;
+	}
+
+	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
+	if (retval) {
+		kfree(out_data);
+		return retval;
+	}
+	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return retval;
+	}
+	in_data_length = 0x20;
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
+						 &bytes_read, 1000);
+	mutex_unlock(&a_priv->bulk_transfer_lock);
+
+	if (retval) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		agilent_82357a_dump_raw_block(in_data, bytes_read);
+		retval = -EIO;
+	} else if (in_data[0] != (0xff & ~DATA_PIPE_CMD_WR_REGS)) {
+		dev_err(&usb_dev->dev, "bulk command=0x%x != ~DATA_PIPE_CMD_WR_REGS\n", in_data[0]);
+		retval = -EIO;
+	} else if (in_data[1]) {
+		dev_err(&usb_dev->dev, "nonzero error code 0x%x in DATA_PIPE_CMD_WR_REGS response\n",
+			in_data[1]);
+		retval = -EIO;
+	}
+	/* single exit so in_data is released on the validation-error paths as well */
+	kfree(in_data);
+	return retval;
+}
+/* Read num_reads (at most 62) registers with one DATA_PIPE_CMD_RD_REGS command and store
+ * the returned bytes in reads[].value. With blocking == 0 the bulk transfer lock is only
+ * try-locked (-EAGAIN when busy). Returns 0 or a negative errno. */
+static int agilent_82357a_read_registers(struct agilent_82357a_priv *a_priv,
+					 struct agilent_82357a_register_pairlet *reads,
+					 int num_reads, int blocking)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	int retval;
+	u8 *out_data, *in_data;
+	int out_data_length, in_data_length;
+	int bytes_written, bytes_read;
+	int i = 0;
+	int j;
+	static const int header_length = 2;
+	static const int max_reads = 62;
+
+	if (num_reads > max_reads) {
+		dev_err(&usb_dev->dev, "bug! num_reads=%i too large\n", num_reads);
+		return -EIO;
+	}
+	out_data_length = num_reads + header_length;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = DATA_PIPE_CMD_RD_REGS;
+	out_data[i++] = num_reads;
+	for (j = 0; j < num_reads; j++)
+		out_data[i++] = reads[j].address;
+
+	if (blocking) {
+		retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
+		if (retval) {
+			kfree(out_data);
+			return retval;
+		}
+	} else {
+		retval = mutex_trylock(&a_priv->bulk_transfer_lock);
+		if (retval == 0) {
+			kfree(out_data);
+			return -EAGAIN;
+		}
+	}
+	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return retval;
+	}
+	in_data_length = 0x20;
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
+						 &bytes_read, 10000);
+	mutex_unlock(&a_priv->bulk_transfer_lock);
+	if (retval) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		agilent_82357a_dump_raw_block(in_data, bytes_read);
+		retval = -EIO;
+	} else if (bytes_read < header_length + num_reads) { /* never parse bytes not received */
+		dev_err(&usb_dev->dev, "short RD_REGS response, bytes_read=%i\n", bytes_read);
+		retval = -EIO;
+	} else if (in_data[0] != (0xff & ~DATA_PIPE_CMD_RD_REGS)) {
+		dev_err(&usb_dev->dev, "bulk command=0x%x != ~DATA_PIPE_CMD_RD_REGS\n", in_data[0]);
+		retval = -EIO;
+	} else if (in_data[1]) {
+		dev_err(&usb_dev->dev, "nonzero error code 0x%x in DATA_PIPE_CMD_RD_REGS response\n",
+			in_data[1]);
+		retval = -EIO;
+	} else {
+		for (j = 0; j < num_reads; j++)
+			reads[j].value = in_data[header_length + j];
+	}
+	kfree(in_data);	/* single exit: also covers the validation-error paths */
+	return retval;
+}
+/* Send an XFER_ABORT control request, adding XA_FLUSH when flush is set; 0 or -errno. */
+static int agilent_82357a_abort(struct agilent_82357a_priv *a_priv, int flush)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	int retval = 0;
+	int receive_control_retval;
+	u16 wIndex = 0;
+	u8 *status_data;
+	static const unsigned int status_data_len = 2;
+
+	status_data = kmalloc(status_data_len, GFP_KERNEL);
+	if (!status_data)
+		return -ENOMEM;
+	/* NOTE(review): only a negative result counts as failure; a short reply is not detected */
+	if (flush)
+		wIndex |= XA_FLUSH;
+	receive_control_retval = agilent_82357a_receive_control_msg(a_priv,
+								    agilent_82357a_control_request,
+								    USB_DIR_IN | USB_TYPE_VENDOR |
+								    USB_RECIP_DEVICE, XFER_ABORT,
+								    wIndex, status_data,
+								    status_data_len, 100);
+	if (receive_control_retval < 0)	{
+		dev_err(&usb_dev->dev, "82357a_receive_control_msg() returned %i\n",
+			receive_control_retval);
+		retval = -EIO;
+		goto cleanup;
+	}
+	if (status_data[0] != (~XFER_ABORT & 0xff)) {
+		dev_err(&usb_dev->dev, "major code=0x%x != ~XFER_ABORT\n", status_data[0]);
+		retval = -EIO;
+		goto cleanup;
+	}
+	switch (status_data[1])	{
+	case UGP_SUCCESS:
+		retval = 0;
+		break;
+	case UGP_ERR_FLUSHING:
+		if (flush) {
+			retval = 0;
+			break;
+		}
+		fallthrough;
+	case UGP_ERR_FLUSHING_ALREADY:
+	default:
+		dev_err(&usb_dev->dev, "abort returned error code=0x%x\n", status_data[1]);
+		retval = -EIO;
+		break;
+	}
+
+cleanup:
+	kfree(status_data);
+	return retval;
+}
+
+// interface functions
+int agilent_82357a_command(struct gpib_board *board, u8 *buffer, size_t length,
+			   size_t *bytes_written);
+/* Read up to length bytes from the bus into buffer via a DATA_PIPE_CMD_READ bulk exchange.
+ * Sets *nbytes to the bytes copied and *end when the adapter flags EOI/EOS. Returns 0 or a
+ * negative errno; data that arrived before a timeout is still copied out. */
+static int agilent_82357a_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+			       size_t *nbytes)
+{
+	int retval;
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	int out_data_length, in_data_length;
+	int bytes_written, bytes_read;
+	int i = 0;
+	u8 trailing_flags;
+	unsigned long start_jiffies = jiffies;
+	int msec_timeout;
+
+	*nbytes = 0;
+	*end = 0;
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	out_data_length = 0x9;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = DATA_PIPE_CMD_READ;
+	out_data[i++] = 0;	// primary address when ARF_NO_ADDR is not set
+	out_data[i++] = 0;	// secondary address when ARF_NO_ADDR is not set
+	out_data[i] = ARF_NO_ADDRESS | ARF_END_ON_EOI;
+	if (a_priv->eos_mode & REOS)
+		out_data[i] |= ARF_END_ON_EOS_CHAR;
+	++i;
+	out_data[i++] = length & 0xff;
+	out_data[i++] = (length >> 8) & 0xff;
+	out_data[i++] = (length >> 16) & 0xff;
+	out_data[i++] = (length >> 24) & 0xff;
+	out_data[i++] = a_priv->eos_char;
+	msec_timeout = (board->usec_timeout + 999) / 1000;
+	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
+	if (retval) {
+		kfree(out_data);
+		return retval;
+	}
+	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, msec_timeout);
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		if (retval < 0)
+			return retval;
+		return -EIO;
+	}
+	in_data_length = length + 1;
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+	if (board->usec_timeout != 0)
+		msec_timeout -= jiffies_to_msecs(jiffies - start_jiffies) - 1;
+	/* a zero budget only means "wait forever" when no timeout was configured at all */
+	if (msec_timeout > 0 || board->usec_timeout == 0) {
+		retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
+							 &bytes_read, msec_timeout);
+	} else {
+		retval = -ETIMEDOUT;
+		bytes_read = 0;
+	}
+	if (retval == -ETIMEDOUT) {
+		int extra_bytes_read;
+		int extra_bytes_retval;
+
+		agilent_82357a_abort(a_priv, 1);
+		extra_bytes_retval = agilent_82357a_receive_bulk_msg(a_priv, in_data + bytes_read,
+								     in_data_length - bytes_read,
+								     &extra_bytes_read, 100);
+		bytes_read += extra_bytes_read;
+		if (extra_bytes_retval) {
+			dev_err(&usb_dev->dev, "extra_bytes_retval=%i, bytes_read=%i\n",
+				extra_bytes_retval, bytes_read);
+			agilent_82357a_abort(a_priv, 0);
+		}
+	} else if (retval) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		agilent_82357a_abort(a_priv, 0);
+	}
+	mutex_unlock(&a_priv->bulk_transfer_lock);
+	if (bytes_read > length + 1) {
+		bytes_read = length + 1;
+		dev_warn(&usb_dev->dev, "bytes_read > length? truncating\n");
+	}
+	if (bytes_read >= 1) {
+		memcpy(buffer, in_data, bytes_read - 1);
+		trailing_flags = in_data[bytes_read - 1];
+		*nbytes = bytes_read - 1;
+		if (trailing_flags & (ATRF_EOI | ATRF_EOS))
+			*end = 1;
+	}
+	kfree(in_data);
+	/*
+	 * Fix for a bug in 9914A that does not return the contents of ADSR
+	 * when the board is in listener active state and ATN is not asserted.
+	 * Set ATN here to obtain a valid board level ibsta
+	 */
+	agilent_82357a_take_control_internal(board, 0);
+
+	// FIXME check trailing flags for error
+	return retval;
+}
+/* Shared write path for data and command bytes (send_commands adds AWF_ATN); 0 or -errno. */
+static ssize_t agilent_82357a_generic_write(struct gpib_board *board,
+					    u8 *buffer, size_t length,
+					    int send_commands, int send_eoi,
+					    size_t *bytes_written)
+{
+	int retval;
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data = NULL;
+	u8 *status_data = NULL;
+	int out_data_length;
+	int raw_bytes_written;
+	int i = 0, j;
+	int msec_timeout;
+	unsigned short bsr, adsr;
+	struct agilent_82357a_register_pairlet read_reg;
+
+	*bytes_written = 0;
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	out_data_length = length + 0x8;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = DATA_PIPE_CMD_WRITE;
+	out_data[i++] = 0; // primary address when AWF_NO_ADDRESS is not set
+	out_data[i++] = 0; // secondary address when AWF_NO_ADDRESS is not set
+	out_data[i] = AWF_NO_ADDRESS | AWF_NO_FAST_TALKER_FIRST_BYTE;
+	if (send_commands)
+		out_data[i] |= AWF_ATN | AWF_NO_FAST_TALKER;
+	if (send_eoi)
+		out_data[i] |= AWF_SEND_EOI;
+	++i;
+	out_data[i++] = length & 0xff;
+	out_data[i++] = (length >> 8) & 0xff;
+	out_data[i++] = (length >> 16) & 0xff;
+	out_data[i++] = (length >> 24) & 0xff;
+	for (j = 0; j < length; j++)
+		out_data[i++] = buffer[j];
+	/* forget any earlier write-complete flag before starting this transfer */
+	clear_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags);
+
+	msec_timeout = (board->usec_timeout + 999) / 1000;
+	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
+	if (retval) {
+		kfree(out_data);
+		return retval;
+	}
+	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &raw_bytes_written,
+					      msec_timeout);
+	kfree(out_data);
+	if (retval || raw_bytes_written != i) {
+		agilent_82357a_abort(a_priv, 0);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, raw_bytes_written=%i, i=%i\n",
+			retval, raw_bytes_written, i);
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		if (retval < 0)
+			return retval;
+		return -EIO;
+	}
+	/* sleep until the write-complete flag is set or TIMO_NUM appears in board->status */
+	retval = wait_event_interruptible(board->wait,
+					  test_bit(AIF_WRITE_COMPLETE_BN,
+						   &a_priv->interrupt_flags) ||
+					  test_bit(TIMO_NUM, &board->status));
+	if (retval) {
+		dev_dbg(&usb_dev->dev, "wait write complete interrupted\n");
+		agilent_82357a_abort(a_priv, 0);
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return -ERESTARTSYS;
+	}
+
+	if (test_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags) == 0) {
+		dev_dbg(&usb_dev->dev, "write timed out ibs %i, tmo %i\n",
+			test_bit(TIMO_NUM, &board->status), msec_timeout);
+
+		agilent_82357a_abort(a_priv, 0);
+
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		/* lock dropped first: agilent_82357a_read_registers() takes bulk_transfer_lock itself */
+		read_reg.address = BSR;
+		retval = agilent_82357a_read_registers(a_priv, &read_reg, 1, 1);
+		if (retval) {
+			dev_err(&usb_dev->dev, "read_registers() returned error\n");
+			return -ETIMEDOUT;
+		}
+
+		bsr = read_reg.value;
+		dev_dbg(&usb_dev->dev, "write aborted bsr 0x%x\n", bsr);
+
+		if (send_commands) {/* check for no listeners */
+			if ((bsr & BSR_ATN_BIT) && !(bsr & (BSR_NDAC_BIT | BSR_NRFD_BIT))) {
+				dev_dbg(&usb_dev->dev, "No listener on command\n");
+				clear_bit(TIMO_NUM, &board->status);
+				return -ENOTCONN; // no listener on bus
+			}
+		} else {
+			read_reg.address = ADSR;
+			retval = agilent_82357a_read_registers(a_priv, &read_reg, 1, 1);
+			if (retval) {
+				dev_err(&usb_dev->dev, "read_registers() returned error\n");
+				return -ETIMEDOUT;
+			}
+			adsr = read_reg.value;
+			if ((adsr & HR_TA) && !(bsr & (BSR_NDAC_BIT | BSR_NRFD_BIT))) {
+				dev_dbg(&usb_dev->dev, "No listener on write\n");
+				clear_bit(TIMO_NUM, &board->status);
+				return -ECOMM;
+			}
+		}
+
+		return -ETIMEDOUT;
+	}
+	/* write completed: fetch the XFER_STATUS block, which carries the byte count */
+	status_data = kmalloc(STATUS_DATA_LEN, GFP_KERNEL);
+	if (!status_data) {
+		mutex_unlock(&a_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+
+	retval = agilent_82357a_receive_control_msg(a_priv, agilent_82357a_control_request,
+						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+						    XFER_STATUS, 0, status_data, STATUS_DATA_LEN,
+						    100);
+	mutex_unlock(&a_priv->bulk_transfer_lock);
+	if (retval < 0)	{
+		dev_err(&usb_dev->dev, "receive_control_msg() returned %i\n", retval);
+		kfree(status_data);
+		return -EIO;
+	}
+	*bytes_written	= (u32)status_data[2];
+	*bytes_written |= (u32)status_data[3] << 8;
+	*bytes_written |= (u32)status_data[4] << 16;
+	*bytes_written |= (u32)status_data[5] << 24;
+
+	kfree(status_data);
+	return 0;
+}
+/* Data write: agilent_82357a_generic_write() with send_commands = 0. */
+static int agilent_82357a_write(struct gpib_board *board, u8 *buffer,
+				size_t length, int send_eoi, size_t *bytes_written)
+{
+	return agilent_82357a_generic_write(board, buffer, length, 0, send_eoi, bytes_written);
+}
+/* Command write: agilent_82357a_generic_write() with send_commands = 1 and send_eoi = 0. */
+int agilent_82357a_command(struct gpib_board *board, u8 *buffer, size_t length,
+			   size_t *bytes_written)
+{
+	return agilent_82357a_generic_write(board, buffer, length, 1, 0, bytes_written);
+}
+/*
+ * Write AUX_TCS (synchronous) or AUX_TCA (asynchronous) to the 9914 AUXCR register.
+ * Returns the result of agilent_82357a_write_registers().
+ */
+int agilent_82357a_take_control_internal(struct gpib_board *board, int synchronous)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	struct agilent_82357a_register_pairlet aux_cmd;
+	int status;
+
+	aux_cmd.address = AUXCR;
+	aux_cmd.value = synchronous ? AUX_TCS : AUX_TCA;
+	status = agilent_82357a_write_registers(a_priv, &aux_cmd, 1);
+	if (status)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+
+	return status;
+}
+/* Take control asynchronously and poll for ATN; synchronous requests always get -ETIMEDOUT. */
+static int agilent_82357a_take_control(struct gpib_board *board, int synchronous)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	const int timeout = 10;
+	int i;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+
+/*
+ * It looks like the 9914 does not handle tcs properly.
+ * See comment above tms9914_take_control_workaround() in
+ * drivers/gpib/tms9914/tms9914_aux.c
+ */
+	if (synchronous)
+		return -ETIMEDOUT;
+	/* the write's return value is ignored: success is decided by polling ATN_NUM below */
+	agilent_82357a_take_control_internal(board, synchronous);
+	// busy wait until ATN is asserted
+	for (i = 0; i < timeout; ++i) {
+		agilent_82357a_update_status(board, 0);
+		if (test_bit(ATN_NUM, &board->status))
+			break;
+		udelay(1);
+	}
+	if (i == timeout)
+		return -ETIMEDOUT;
+	return 0;
+}
+/* Write AUX_GTS to AUXCR; returns -ENODEV without a bound interface, otherwise always 0. */
+static int agilent_82357a_go_to_standby(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet write;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	/* NOTE(review): a failed register write is only logged; the function still returns 0 */
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	write.address = AUXCR;
+	write.value = AUX_GTS;
+	retval = agilent_82357a_write_registers(a_priv, &write, 1);
+	if (retval)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+	return 0;
+}
+/*
+ * Claim system-controller status: issue AUX_RQC and write hw_control_bits (with
+ * SYSTEM_CONTROLLER set) to HW_CONTROL. A request to release control gets -EINVAL.
+ */
+static int agilent_82357a_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct agilent_82357a_register_pairlet writes[2];
+	struct usb_device *usb_dev;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	if (!request_control)
+		return -EINVAL;
+
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	a_priv->hw_control_bits |= SYSTEM_CONTROLLER;
+
+	/* 82357B needs bit to be set in 9914 AUXCR register */
+	writes[0].address = AUXCR;
+	writes[0].value = AUX_RQC;
+	writes[1].address = HW_CONTROL;
+	writes[1].value = a_priv->hw_control_bits;
+
+	retval = agilent_82357a_write_registers(a_priv, writes, 2);
+	if (retval)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+	return retval;
+}
+/* Write AUX_SIC to AUXCR, with AUX_CS added when assert is set; asserting also sets is_cic. */
+static void agilent_82357a_interface_clear(struct gpib_board *board, int assert)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet write;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return; // -ENODEV;
+
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	write.address = AUXCR;
+	write.value = AUX_SIC;
+	if (assert) {
+		write.value |= AUX_CS;
+		a_priv->is_cic = 1;
+	}
+	retval = agilent_82357a_write_registers(a_priv, &write, 1);
+	if (retval)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+}
+/* Write AUX_SRE to AUXCR (plus AUX_CS when enabling) and record the request in ren_state. */
+static void agilent_82357a_remote_enable(struct gpib_board *board, int enable)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet write;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return; //-ENODEV;
+	/* ren_state below is updated even when the register write fails */
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	write.address = AUXCR;
+	write.value = AUX_SRE;
+	if (enable)
+		write.value |= AUX_CS;
+	retval = agilent_82357a_write_registers(a_priv, &write, 1);
+	if (retval)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+	a_priv->ren_state = enable;
+	return;// 0;
+}
+/* Store the EOS byte and set eos_mode to REOS | BIN; only 8-bit comparison is accepted. */
+static int agilent_82357a_enable_eos(struct gpib_board *board, u8 eos_byte,
+				     int compare_8_bits)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	if (compare_8_bits == 0)
+		return -EOPNOTSUPP;
+
+	a_priv->eos_char = eos_byte;
+	a_priv->eos_mode = REOS | BIN;
+	return 0;
+}
+/* Clear REOS from eos_mode; the other mode bits and eos_char are left untouched. */
+static void agilent_82357a_disable_eos(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+
+	a_priv->eos_mode &= ~REOS;
+}
+/* Clear clear_mask bits, then refresh board->status from ADSR and BSR; returns board->status. */
+static unsigned int agilent_82357a_update_status(struct gpib_board *board,
+						 unsigned int clear_mask)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet address_status, bus_status;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV; /* NOTE(review): negative errno through an unsigned return type */
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	board->status &= ~clear_mask;
+	if (a_priv->is_cic)
+		set_bit(CIC_NUM, &board->status);
+	else
+		clear_bit(CIC_NUM, &board->status);
+	address_status.address = ADSR;
+	retval = agilent_82357a_read_registers(a_priv, &address_status, 1, 0);
+	if (retval) {
+		if (retval != -EAGAIN)
+			dev_err(&usb_dev->dev, "read_registers() returned error\n");
+		return board->status;
+	}
+	// check for remote/local
+	if (address_status.value & HR_REM)
+		set_bit(REM_NUM, &board->status);
+	else
+		clear_bit(REM_NUM, &board->status);
+	// check for lockout
+	if (address_status.value & HR_LLO)
+		set_bit(LOK_NUM, &board->status);
+	else
+		clear_bit(LOK_NUM, &board->status);
+	// check for ATN
+	if (address_status.value & HR_ATN)
+		set_bit(ATN_NUM, &board->status);
+	else
+		clear_bit(ATN_NUM, &board->status);
+	// check for talker/listener addressed
+	if (address_status.value & HR_TA)
+		set_bit(TACS_NUM, &board->status);
+	else
+		clear_bit(TACS_NUM, &board->status);
+	if (address_status.value & HR_LA)
+		set_bit(LACS_NUM, &board->status);
+	else
+		clear_bit(LACS_NUM, &board->status);
+	/* register reads here are non-blocking; on failure the status gathered so far is returned */
+	bus_status.address = BSR;
+	retval = agilent_82357a_read_registers(a_priv, &bus_status, 1, 0);
+	if (retval) {
+		if (retval != -EAGAIN)
+			dev_err(&usb_dev->dev, "read_registers() returned error\n");
+		return board->status;
+	}
+	if (bus_status.value & BSR_SRQ_BIT)
+		set_bit(SRQI_NUM, &board->status);
+	else
+		clear_bit(SRQI_NUM, &board->status);
+
+	return board->status;
+}
+/* Program the primary GPIB address (masked with ADDRESS_MASK) into the ADR register.
+ * Returns 0, -ENODEV when no USB interface is bound, or the register-write error. */
+static int agilent_82357a_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet write;
+	int retval;
+
+	/* usb_dev must not be derived before this check: bus_interface may be NULL */
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	// put primary address in address0
+	write.address = ADR;
+	write.value = address & ADDRESS_MASK;
+	retval = agilent_82357a_write_registers(a_priv, &write, 1);
+	if (retval)
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+	return retval;
+}
+
+/* Secondary addressing cannot be enabled here: enable != 0 yields -EOPNOTSUPP, else 0. */
+static int agilent_82357a_secondary_address(struct gpib_board *board,
+					    unsigned int address, int enable)
+{
+	/* board and address are accepted but never used */
+	return enable ? -EOPNOTSUPP : 0;
+}
+/* Run a parallel poll: request it, read the result byte from CPTR, then clear poll state. */
+static int agilent_82357a_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	struct agilent_82357a_register_pairlet writes[2];
+	struct agilent_82357a_register_pairlet read;
+	int retval;
+	/* NOTE(review): if the CPTR read fails, the function returns before poll state is cleared */
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	// execute parallel poll
+	writes[0].address = AUXCR;
+	writes[0].value = AUX_CS | AUX_RPP;
+	writes[1].address = HW_CONTROL;
+	writes[1].value = a_priv->hw_control_bits & ~NOT_PARALLEL_POLL;
+	retval = agilent_82357a_write_registers(a_priv, writes, 2);
+	if (retval) {
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+		return retval;
+	}
+	udelay(2);	// silly, since usb write will take way longer
+	read.address = CPTR;
+	retval = agilent_82357a_read_registers(a_priv, &read, 1, 1);
+	if (retval) {
+		dev_err(&usb_dev->dev, "read_registers() returned error\n");
+		return retval;
+	}
+	*result = read.value;
+	// clear parallel poll state
+	writes[0].address = HW_CONTROL;
+	writes[0].value = a_priv->hw_control_bits | NOT_PARALLEL_POLL;
+	writes[1].address = AUXCR;
+	writes[1].value = AUX_RPP;
+	retval = agilent_82357a_write_registers(a_priv, writes, 2);
+	if (retval) {
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+		return retval;
+	}
+	return 0;
+}
+
+/* Deliberately empty: config is ignored (see the note in the body). */
+static void agilent_82357a_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	/* board can only be system controller */
+}
+
+/* Deliberately empty: ist is ignored (see the note in the body). */
+static void agilent_82357a_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	/* board can only be system controller */
+}
+
+/* Deliberately empty: status is ignored (see the note in the body). */
+static void agilent_82357a_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	/* board can only be system controller */
+}
+/* Always reports a serial poll status byte of 0. */
+static u8 agilent_82357a_serial_poll_status(struct gpib_board *board)
+{
+	// board can only be system controller
+	return 0;
+}
+
+/* Deliberately empty (see the note in the body). */
+static void agilent_82357a_return_to_local(struct gpib_board *board)
+{
+	/* board can only be system controller */
+}
+/* Report the GPIB control lines from one non-blocking BSR read: VALID_ALL plus a BUS_* flag
+ * per BSR bit that is set, or a negative errno (-EAGAIN when the bulk lock is busy). */
+static int agilent_82357a_line_status(const struct gpib_board *board)
+{
+	static const struct {
+		int bsr_bit;
+		int bus_flag;
+	} line_map[] = {
+		{ BSR_REN_BIT, BUS_REN }, { BSR_IFC_BIT, BUS_IFC },
+		{ BSR_SRQ_BIT, BUS_SRQ }, { BSR_EOI_BIT, BUS_EOI },
+		{ BSR_NRFD_BIT, BUS_NRFD }, { BSR_NDAC_BIT, BUS_NDAC },
+		{ BSR_DAV_BIT, BUS_DAV }, { BSR_ATN_BIT, BUS_ATN },
+	};
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct agilent_82357a_register_pairlet bus_status;
+	struct usb_device *usb_dev;
+	unsigned int k;
+	int status = VALID_ALL;
+	int retval;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	bus_status.address = BSR;
+	retval = agilent_82357a_read_registers(a_priv, &bus_status, 1, 0);
+	if (retval) {
+		if (retval != -EAGAIN)
+			dev_err(&usb_dev->dev, "read_registers() returned error\n");
+		return retval;
+	}
+
+	for (k = 0; k < ARRAY_SIZE(line_map); k++) {
+		if (bus_status.value & line_map[k].bsr_bit)
+			status |= line_map[k].bus_flag;
+	}
+	return status;
+}
+/* Round *nanosec to the nearest 21 ns register count, clamped to 0x11..0x72, and write the
+ * resulting delay back. Clamping precedes narrowing so huge inputs saturate at the maximum. */
+static unsigned short nanosec_to_fast_talker_bits(unsigned int *nanosec)
+{
+	static const unsigned int nanosec_per_bit = 21;
+	static const unsigned int max_value = 0x72;
+	static const unsigned int min_value = 0x11;
+	unsigned int bits = max_value;
+
+	if (*nanosec < max_value * nanosec_per_bit)
+		bits = (*nanosec + nanosec_per_bit / 2) / nanosec_per_bit;
+	if (bits < min_value)
+		bits = min_value;
+	*nanosec = bits * nanosec_per_bit;
+	return bits;
+}
+
+/*
+ * Program the FAST_TALKER_T1 register for the requested T1 delay.
+ *
+ * Returns the delay in nanoseconds after rounding/clamping by
+ * nanosec_to_fast_talker_bits(), or -ENODEV when there is no USB interface.
+ * A failed register write is logged, but the rounded delay is still
+ * returned.
+ */
+static int agilent_82357a_t1_delay(struct gpib_board *board, unsigned int nanosec)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct agilent_82357a_register_pairlet t1_reg;
+	struct usb_device *usb_dev;
+
+	if (!a_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+
+	t1_reg.address = FAST_TALKER_T1;
+	t1_reg.value = nanosec_to_fast_talker_bits(&nanosec);
+	if (agilent_82357a_write_registers(a_priv, &t1_reg, 1))
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+
+	return nanosec;
+}
+
+/*
+ * Completion handler for the interrupt-in URB.
+ *
+ * On success the first byte of the transfer buffer is decoded: read/write
+ * completion bits are latched into a_priv->interrupt_flags, an SRQ bit sets
+ * SRQI_NUM in board->status, and waiters on board->wait are woken.  The URB
+ * is resubmitted unless it was unlinked.
+ */
+static void agilent_82357a_interrupt_complete(struct urb *urb)
+{
+	struct gpib_board *board = urb->context;
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	const int urb_status = urb->status;
+	const u8 *data = urb->transfer_buffer;
+	unsigned long reported;
+
+	/* unlinked, don't resubmit */
+	if (urb_status == -ECONNRESET || urb_status == -ENOENT || urb_status == -ESHUTDOWN)
+		return;
+
+	/* any other error skips the decoding and goes straight to resubmit */
+	if (urb_status == 0) {
+		reported = data[0];
+		if (test_bit(AIF_READ_COMPLETE_BN, &reported))
+			set_bit(AIF_READ_COMPLETE_BN, &a_priv->interrupt_flags);
+		if (test_bit(AIF_WRITE_COMPLETE_BN, &reported))
+			set_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags);
+		if (test_bit(AIF_SRQ_BN, &reported))
+			set_bit(SRQI_NUM, &board->status);
+
+		wake_up_interruptible(&board->wait);
+	}
+
+	if (usb_submit_urb(a_priv->interrupt_urb, GFP_ATOMIC))
+		dev_err(&usb_dev->dev, "failed to resubmit interrupt urb\n");
+}
+
+/*
+ * Allocate the interrupt buffer and interrupt URB and submit the URB for
+ * the first time.  Runs under a_priv->interrupt_alloc_lock.
+ *
+ * Returns 0 on success, the error from mutex_lock_interruptible(), -ENODEV
+ * when there is no USB interface, -ENOMEM on allocation failure, or the
+ * error from usb_submit_urb().  On failure neither the buffer nor the URB
+ * stays allocated.
+ */
+static int agilent_82357a_setup_urbs(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev;
+	int int_pipe;
+	int retval;
+
+	retval = mutex_lock_interruptible(&a_priv->interrupt_alloc_lock);
+	if (retval)
+		return retval;
+	if (!a_priv->bus_interface) {
+		retval = -ENODEV;
+		goto setup_exit;
+	}
+
+	a_priv->interrupt_buffer = kmalloc(INTERRUPT_BUF_LEN, GFP_KERNEL);
+	if (!a_priv->interrupt_buffer) {
+		retval = -ENOMEM;
+		goto setup_exit;
+	}
+	a_priv->interrupt_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!a_priv->interrupt_urb) {
+		retval = -ENOMEM;
+		goto setup_exit;
+	}
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	int_pipe = usb_rcvintpipe(usb_dev, a_priv->interrupt_in_endpoint);
+	usb_fill_int_urb(a_priv->interrupt_urb, usb_dev, int_pipe, a_priv->interrupt_buffer,
+			 INTERRUPT_BUF_LEN, &agilent_82357a_interrupt_complete, board, 1);
+	retval = usb_submit_urb(a_priv->interrupt_urb, GFP_KERNEL);
+	if (retval) {
+		usb_free_urb(a_priv->interrupt_urb);
+		a_priv->interrupt_urb = NULL;
+		dev_err(&usb_dev->dev, "failed to submit first interrupt urb, retval=%i\n", retval);
+		goto setup_exit;
+	}
+	mutex_unlock(&a_priv->interrupt_alloc_lock);
+	return 0;
+
+setup_exit:
+	kfree(a_priv->interrupt_buffer);
+	/* do not leave a dangling pointer behind for later cleanup code */
+	a_priv->interrupt_buffer = NULL;
+	mutex_unlock(&a_priv->interrupt_alloc_lock);
+	return retval;
+}
+
+/*
+ * Kill the interrupt and bulk URBs, if they exist.
+ * Does nothing when a_priv is NULL or there is no USB interface.
+ */
+static void agilent_82357a_cleanup_urbs(struct agilent_82357a_priv *a_priv)
+{
+	if (!a_priv || !a_priv->bus_interface)
+		return;
+
+	if (a_priv->interrupt_urb)
+		usb_kill_urb(a_priv->interrupt_urb);
+	if (a_priv->bulk_urb)
+		usb_kill_urb(a_priv->bulk_urb);
+}
+
+/*
+ * Free the interrupt URB and its buffer and reset both pointers.
+ * Does nothing when a_priv is NULL.
+ */
+static void agilent_82357a_release_urbs(struct agilent_82357a_priv *a_priv)
+{
+	if (!a_priv)
+		return;
+
+	usb_free_urb(a_priv->interrupt_urb);
+	a_priv->interrupt_urb = NULL;
+	kfree(a_priv->interrupt_buffer);
+	/* reset the buffer pointer too, so a repeated release cannot double free */
+	a_priv->interrupt_buffer = NULL;
+}
+
+/*
+ * Allocate zeroed private data for the board and initialise its mutexes.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int agilent_82357a_allocate_private(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+
+	mutex_init(&priv->bulk_transfer_lock);
+	mutex_init(&priv->bulk_alloc_lock);
+	mutex_init(&priv->control_alloc_lock);
+	mutex_init(&priv->interrupt_alloc_lock);
+	return 0;
+}
+
+/* Free the board's private data and clear the pointer to it. */
+static void agilent_82357a_free_private(struct gpib_board *board)
+{
+	void *priv = board->private_data;
+
+	kfree(priv);
+	board->private_data = NULL;
+}
+
+#define INIT_NUM_REG_WRITES 18
+/*
+ * Reset the adapter and program its initial register state.
+ *
+ * Sequence: turn the fail LED on and write RESET_SPACEBALL, sleep for the
+ * 2000 us timeout, write the batch of INIT_NUM_REG_WRITES set-up registers,
+ * then read HW_CONTROL back to seed a_priv->hw_control_bits.
+ * board->t1_nano_sec is updated to the rounded T1 delay.
+ *
+ * Returns 0 on success, -EIO when a register access fails, or -ERESTARTSYS
+ * when the sleep ends before its timeout.
+ */
+static int agilent_82357a_init(struct gpib_board *board)
+{
+	const unsigned short aux_cmds[] = {
+		AUX_NBAF, AUX_HLDE, AUX_TON, AUX_LON, AUX_RSV2,
+		AUX_INVAL, AUX_RPP, AUX_STDL, AUX_VSTDL,
+	};
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	struct agilent_82357a_register_pairlet writes[INIT_NUM_REG_WRITES];
+	struct agilent_82357a_register_pairlet hw_control;
+	unsigned int t1_ns;
+	unsigned int k;
+	int n = 0;
+
+	/* first batch: fail LED on, reset to power-up state */
+	writes[n].address = LED_CONTROL;
+	writes[n++].value = FAIL_LED_ON;
+	writes[n].address = RESET_TO_POWERUP;
+	writes[n++].value = RESET_SPACEBALL;
+	if (agilent_82357a_write_registers(a_priv, writes, n)) {
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+		return -EIO;
+	}
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (schedule_timeout(usec_to_jiffies(2000)))
+		return -ERESTARTSYS;
+
+	/* second batch: auxiliary commands first, in table order */
+	n = 0;
+	for (k = 0; k < sizeof(aux_cmds) / sizeof(aux_cmds[0]); k++) {
+		writes[n].address = AUXCR;
+		writes[n++].value = aux_cmds[k];
+	}
+
+	/* T1 delay: store back what the register value really represents */
+	t1_ns = board->t1_nano_sec;
+	writes[n].address = FAST_TALKER_T1;
+	writes[n++].value = nanosec_to_fast_talker_bits(&t1_ns);
+	board->t1_nano_sec = t1_ns;
+
+	writes[n].address = ADR;
+	writes[n++].value = board->pad & ADDRESS_MASK;
+	writes[n].address = PPR;
+	writes[n++].value = 0;
+	writes[n].address = SPMR;
+	writes[n++].value = 0;
+	writes[n].address = PROTOCOL_CONTROL;
+	writes[n++].value = WRITE_COMPLETE_INTERRUPT_EN;
+	writes[n].address = IMR0;
+	writes[n++].value = HR_BOIE | HR_BIIE;
+	writes[n].address = IMR1;
+	writes[n++].value = HR_SRQIE;
+	// turn off reset state
+	writes[n].address = AUXCR;
+	writes[n++].value = AUX_CHIP_RESET;
+	writes[n].address = LED_CONTROL;
+	writes[n++].value = FIRMWARE_LED_CONTROL;
+
+	/* n equals INIT_NUM_REG_WRITES here: 9 aux commands plus 9 registers */
+	if (agilent_82357a_write_registers(a_priv, writes, n)) {
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+		return -EIO;
+	}
+
+	hw_control.address = HW_CONTROL;
+	if (agilent_82357a_read_registers(a_priv, &hw_control, 1, 1)) {
+		dev_err(&usb_dev->dev, "read_registers() returned error\n");
+		return -EIO;
+	}
+	a_priv->hw_control_bits = (hw_control.value & ~0x7) | NOT_TI_RESET | NOT_PARALLEL_POLL;
+
+	return 0;
+}
+
+/*
+ * Check whether a USB interface satisfies the board configuration.
+ *
+ * Returns 0 when gpib_match_device_path() returns 0 for the configured
+ * device path, or when a serial number is configured and the device has
+ * either no serial string or a different one.  Returns 1 otherwise.
+ */
+static inline int agilent_82357a_device_match(struct usb_interface *interface,
+					      const struct gpib_board_config *config)
+{
+	struct usb_device * const usbdev = interface_to_usbdev(interface);
+
+	if (gpib_match_device_path(&interface->dev, config->device_path) == 0)
+		return 0;
+	if (config->serial_number) {
+		/* usbdev->serial is NULL for devices without a serial string */
+		if (!usbdev->serial)
+			return 0;
+		if (strcmp(usbdev->serial, config->serial_number) != 0)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Attach a gpib board to the first free, matching 82357 USB interface.
+ *
+ * Allocates private data, claims the interface by storing the board in its
+ * intfdata, selects the endpoints for the product id, sets up the interrupt
+ * URB and the bulk timer, and initialises the adapter.  Runs under
+ * agilent_82357a_hotplug_lock.
+ *
+ * Returns the result of agilent_82357a_init() on success.  On failure the
+ * interface claim and the private data are released again and a negative
+ * errno is returned.
+ */
+static int agilent_82357a_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int retval;
+	int i;
+	unsigned int product_id;
+	struct agilent_82357a_priv *a_priv;
+	struct usb_device *usb_dev;
+
+	if (mutex_lock_interruptible(&agilent_82357a_hotplug_lock))
+		return -ERESTARTSYS;
+
+	retval = agilent_82357a_allocate_private(board);
+	if (retval < 0) {
+		mutex_unlock(&agilent_82357a_hotplug_lock);
+		return retval;
+	}
+	a_priv = board->private_data;
+	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
+		if (agilent_82357a_driver_interfaces[i] &&
+		    !usb_get_intfdata(agilent_82357a_driver_interfaces[i]) &&
+		    agilent_82357a_device_match(agilent_82357a_driver_interfaces[i], config)) {
+			a_priv->bus_interface = agilent_82357a_driver_interfaces[i];
+			usb_set_intfdata(agilent_82357a_driver_interfaces[i], board);
+			break;
+		}
+	}
+	if (i == MAX_NUM_82357A_INTERFACES) {
+		dev_err(board->gpib_dev,
+			"No supported adapters found, have you loaded its firmware?\n");
+		retval = -ENODEV;
+		goto attach_fail;
+	}
+	/* an interface was claimed, so usb_dev is valid from here on */
+	usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	product_id = le16_to_cpu(usb_dev->descriptor.idProduct);
+	switch (product_id) {
+	case USB_DEVICE_ID_AGILENT_82357A:
+		a_priv->bulk_out_endpoint = AGILENT_82357A_BULK_OUT_ENDPOINT;
+		a_priv->interrupt_in_endpoint = AGILENT_82357A_INTERRUPT_IN_ENDPOINT;
+		break;
+	case USB_DEVICE_ID_AGILENT_82357B:
+		a_priv->bulk_out_endpoint = AGILENT_82357B_BULK_OUT_ENDPOINT;
+		a_priv->interrupt_in_endpoint = AGILENT_82357B_INTERRUPT_IN_ENDPOINT;
+		break;
+	default:
+		dev_err(&usb_dev->dev, "bug, unhandled product_id in switch?\n");
+		retval = -EIO;
+		goto attach_fail;
+	}
+
+	retval = agilent_82357a_setup_urbs(board);
+	if (retval < 0)
+		goto attach_fail;
+
+	timer_setup(&a_priv->bulk_timer, agilent_82357a_timeout_handler, 0);
+
+	board->t1_nano_sec = 800;
+
+	retval = agilent_82357a_init(board);
+
+	if (retval < 0)	{
+		agilent_82357a_cleanup_urbs(a_priv);
+		agilent_82357a_release_urbs(a_priv);
+		goto attach_fail;
+	}
+
+	dev_info(&usb_dev->dev, "bus %d dev num %d attached to gpib%d, interface %i\n",
+		 usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+	return retval;
+
+attach_fail:
+	/*
+	 * Drop the claim on the interface.  Otherwise its intfdata keeps
+	 * pointing at this board and the "!usb_get_intfdata()" test above
+	 * would skip the interface on every later attach.
+	 */
+	if (a_priv->bus_interface)
+		usb_set_intfdata(a_priv->bus_interface, NULL);
+	agilent_82357a_free_private(board);
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+	return retval;
+}
+
+/*
+ * Put the adapter into an idle state: assert the tms9914 chip reset, clear
+ * NOT_TI_RESET in the cached hardware control bits and write them out, and
+ * zero PROTOCOL_CONTROL, IMR0, IMR1 and LED_CONTROL.
+ *
+ * Returns 0 on success or -EIO when the register write fails.
+ */
+static int agilent_82357a_go_idle(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *a_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
+	struct agilent_82357a_register_pairlet writes[0x20];
+	int n = 0;
+
+	// turn on tms9914 reset state
+	writes[n].address = AUXCR;
+	writes[n++].value = AUX_CS | AUX_CHIP_RESET;
+
+	a_priv->hw_control_bits &= ~NOT_TI_RESET;
+	writes[n].address = HW_CONTROL;
+	writes[n++].value = a_priv->hw_control_bits;
+
+	writes[n].address = PROTOCOL_CONTROL;
+	writes[n++].value = 0;
+	writes[n].address = IMR0;
+	writes[n++].value = 0;
+	writes[n].address = IMR1;
+	writes[n++].value = 0;
+	writes[n].address = LED_CONTROL;
+	writes[n++].value = 0;
+
+	if (agilent_82357a_write_registers(a_priv, writes, n)) {
+		dev_err(&usb_dev->dev, "write_registers() returned error\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * Detach the board from its adapter.
+ *
+ * If a USB interface is still bound, the adapter is put into idle state and
+ * the interface's intfdata is cleared.  The URBs are then killed and
+ * released under the three allocation locks, and the private data is freed.
+ * Runs under agilent_82357a_hotplug_lock.
+ */
+static void agilent_82357a_detach(struct gpib_board *board)
+{
+	struct agilent_82357a_priv *a_priv;
+
+	mutex_lock(&agilent_82357a_hotplug_lock);
+
+	a_priv = board->private_data;
+	if (a_priv) {
+		if (a_priv->bus_interface) {
+			agilent_82357a_go_idle(board);
+			usb_set_intfdata(a_priv->bus_interface, NULL);
+		}
+		mutex_lock(&a_priv->control_alloc_lock);
+		mutex_lock(&a_priv->bulk_alloc_lock);
+		mutex_lock(&a_priv->interrupt_alloc_lock);
+		agilent_82357a_cleanup_urbs(a_priv);
+		agilent_82357a_release_urbs(a_priv);
+		/*
+		 * The mutexes live inside a_priv: drop them (in reverse
+		 * order) before the memory is freed, a held mutex must not
+		 * be freed.
+		 */
+		mutex_unlock(&a_priv->interrupt_alloc_lock);
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		mutex_unlock(&a_priv->control_alloc_lock);
+		agilent_82357a_free_private(board);
+	}
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+}
+
+/* gpib core callbacks and flags for the agilent_82357a driver */
+static struct gpib_interface agilent_82357a_gpib_interface = {
+	.name = "agilent_82357a",
+
+	/* board life cycle */
+	.attach = agilent_82357a_attach,
+	.detach = agilent_82357a_detach,
+
+	/* data and command transfer */
+	.read = agilent_82357a_read,
+	.write = agilent_82357a_write,
+	.command = agilent_82357a_command,
+
+	/* bus control */
+	.take_control = agilent_82357a_take_control,
+	.go_to_standby = agilent_82357a_go_to_standby,
+	.request_system_control = agilent_82357a_request_system_control,
+	.interface_clear = agilent_82357a_interface_clear,
+	.remote_enable = agilent_82357a_remote_enable,
+	.return_to_local = agilent_82357a_return_to_local,
+
+	/* end-of-string handling */
+	.enable_eos = agilent_82357a_enable_eos,
+	.disable_eos = agilent_82357a_disable_eos,
+
+	/* polling */
+	.parallel_poll = agilent_82357a_parallel_poll,
+	.parallel_poll_configure = agilent_82357a_parallel_poll_configure,
+	.parallel_poll_response = agilent_82357a_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.serial_poll_response = agilent_82357a_serial_poll_response,
+	.serial_poll_status = agilent_82357a_serial_poll_status,
+
+	/* status, addressing and timing */
+	.line_status = agilent_82357a_line_status,
+	.update_status = agilent_82357a_update_status,
+	.primary_address = agilent_82357a_primary_address,
+	.secondary_address = agilent_82357a_secondary_address,
+	.t1_delay = agilent_82357a_t1_delay,
+
+	/* flags */
+	.no_7_bit_eos = 1,
+	.skip_check_for_command_acceptors = 1,
+};
+
+/*
+ * USB ids this driver binds to.  Only the non-PREINIT product ids are
+ * listed.  The table is never written, so it is const.
+ */
+static const struct usb_device_id agilent_82357a_driver_device_table[] = {
+	{USB_DEVICE(USB_VENDOR_ID_AGILENT, USB_DEVICE_ID_AGILENT_82357A)},
+	{USB_DEVICE(USB_VENDOR_ID_AGILENT, USB_DEVICE_ID_AGILENT_82357B)},
+	{} /* Terminating entry */
+};
+MODULE_DEVICE_TABLE(usb, agilent_82357a_driver_device_table);
+
+/*
+ * USB probe: record the interface in the first free slot of
+ * agilent_82357a_driver_interfaces[] and take a reference on its device.
+ * Runs under agilent_82357a_hotplug_lock.
+ *
+ * Returns 0 on success, -ERESTARTSYS when interrupted while waiting for the
+ * lock, -1 when the table is full, or -ENOMEM when the path buffer cannot
+ * be allocated.  On failure no slot and no device reference are kept.
+ */
+static int agilent_82357a_driver_probe(struct usb_interface *interface,
+				       const struct usb_device_id *id)
+{
+	int i;
+	char *path;
+	static const int path_length = 1024;
+	struct usb_device *usb_dev;
+
+	if (mutex_lock_interruptible(&agilent_82357a_hotplug_lock))
+		return -ERESTARTSYS;
+	usb_dev = usb_get_dev(interface_to_usbdev(interface));
+	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
+		if (!agilent_82357a_driver_interfaces[i]) {
+			agilent_82357a_driver_interfaces[i] = interface;
+			usb_set_intfdata(interface, NULL);
+			dev_dbg(&usb_dev->dev, "set bus interface %i to address 0x%p\n",
+				i, interface);
+			break;
+		}
+	}
+	if (i == MAX_NUM_82357A_INTERFACES) {
+		/* log before dropping our reference on the device */
+		dev_err(&usb_dev->dev, "out of space in agilent_82357a_driver_interfaces[]\n");
+		usb_put_dev(usb_dev);
+		mutex_unlock(&agilent_82357a_hotplug_lock);
+		return -1;
+	}
+	path = kmalloc(path_length, GFP_KERNEL);
+	if (!path) {
+		/*
+		 * A failed probe gets no disconnect() callback, so give the
+		 * slot back here instead of leaving a stale interface
+		 * pointer in the table.
+		 */
+		agilent_82357a_driver_interfaces[i] = NULL;
+		usb_put_dev(usb_dev);
+		mutex_unlock(&agilent_82357a_hotplug_lock);
+		return -ENOMEM;
+	}
+	usb_make_path(usb_dev, path, path_length);
+	dev_info(&usb_dev->dev, "probe succeeded for path: %s\n", path);
+	kfree(path);
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+	return 0;
+}
+
+/*
+ * USB disconnect: remove the interface from
+ * agilent_82357a_driver_interfaces[].  If a board is attached to it, its
+ * URBs are killed and its bus_interface pointer is cleared under the three
+ * allocation locks.  The device reference is dropped in every case.
+ */
+static void agilent_82357a_driver_disconnect(struct usb_interface *interface)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	struct agilent_82357a_priv *a_priv;
+	struct gpib_board *board;
+	int slot;
+
+	mutex_lock(&agilent_82357a_hotplug_lock);
+
+	for (slot = 0; slot < MAX_NUM_82357A_INTERFACES; ++slot) {
+		if (agilent_82357a_driver_interfaces[slot] == interface)
+			break;
+	}
+	if (slot == MAX_NUM_82357A_INTERFACES) {
+		dev_err(&usb_dev->dev, "unable to find interface - bug?\n");
+		goto disconnect_exit;
+	}
+
+	board = usb_get_intfdata(interface);
+	a_priv = board ? board->private_data : NULL;
+	if (a_priv) {
+		mutex_lock(&a_priv->control_alloc_lock);
+		mutex_lock(&a_priv->bulk_alloc_lock);
+		mutex_lock(&a_priv->interrupt_alloc_lock);
+		agilent_82357a_cleanup_urbs(a_priv);
+		a_priv->bus_interface = NULL;
+		mutex_unlock(&a_priv->interrupt_alloc_lock);
+		mutex_unlock(&a_priv->bulk_alloc_lock);
+		mutex_unlock(&a_priv->control_alloc_lock);
+	}
+	agilent_82357a_driver_interfaces[slot] = NULL;
+
+disconnect_exit:
+	usb_put_dev(usb_dev);
+
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+}
+
+/*
+ * USB suspend: if a board is attached to the interface, abort transfers,
+ * put the adapter into idle state and kill its URBs.
+ *
+ * Returns 0, or the error from agilent_82357a_go_idle().  An interface that
+ * is unknown or has no attached board is not an error.
+ */
+static int agilent_82357a_driver_suspend(struct usb_interface *interface, pm_message_t message)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	struct agilent_82357a_priv *a_priv;
+	struct gpib_board *board;
+	int retval = 0;
+	int slot;
+
+	mutex_lock(&agilent_82357a_hotplug_lock);
+
+	for (slot = 0; slot < MAX_NUM_82357A_INTERFACES; ++slot) {
+		if (agilent_82357a_driver_interfaces[slot] == interface)
+			break;
+	}
+	if (slot == MAX_NUM_82357A_INTERFACES)
+		goto suspend_exit;
+
+	board = usb_get_intfdata(interface);
+	if (!board)
+		goto suspend_exit;
+	a_priv = board->private_data;
+	if (!a_priv)
+		goto suspend_exit;
+
+	/* NOTE(review): abort is issued twice; the reason is not visible here */
+	agilent_82357a_abort(a_priv, 0);
+	agilent_82357a_abort(a_priv, 0);
+	retval = agilent_82357a_go_idle(board);
+	if (retval) {
+		dev_err(&usb_dev->dev, "failed to go idle, retval=%i\n",
+			retval);
+		goto suspend_exit;
+	}
+	mutex_lock(&a_priv->interrupt_alloc_lock);
+	agilent_82357a_cleanup_urbs(a_priv);
+	mutex_unlock(&a_priv->interrupt_alloc_lock);
+	dev_dbg(&usb_dev->dev,
+		"bus %d dev num %d gpib %d, interface %i suspended\n",
+		usb_dev->bus->busnum, usb_dev->devnum,
+		board->minor, slot);
+
+suspend_exit:
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+
+	return retval;
+}
+
+/*
+ * USB resume: resubmit the interrupt URB, re-initialise the adapter and
+ * restore system-controller, IFC and REN state for the attached board.
+ *
+ * Returns -ENOENT when no attached board is found for the interface, the
+ * error from usb_submit_urb() or agilent_82357a_init() when those fail, and
+ * otherwise the result of agilent_82357a_request_system_control().  Returns
+ * 0 when the board has no private data.
+ */
+static int agilent_82357a_driver_resume(struct usb_interface *interface)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	struct agilent_82357a_priv *a_priv;
+	struct gpib_board *board;
+	int retval = 0;
+	int i;
+
+	mutex_lock(&agilent_82357a_hotplug_lock);
+
+	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i)	{
+		if (agilent_82357a_driver_interfaces[i] != interface)
+			continue;
+		board = usb_get_intfdata(interface);
+		if (board)
+			break;
+	}
+	if (i == MAX_NUM_82357A_INTERFACES) {
+		retval = -ENOENT;
+		goto resume_exit;
+	}
+
+	a_priv = board->private_data;
+	if (!a_priv)
+		goto resume_exit;
+
+	if (a_priv->interrupt_urb) {
+		mutex_lock(&a_priv->interrupt_alloc_lock);
+		retval = usb_submit_urb(a_priv->interrupt_urb, GFP_KERNEL);
+		if (retval)
+			dev_err(&usb_dev->dev, "failed to resubmit interrupt urb in resume, retval=%i\n",
+				retval);
+		mutex_unlock(&a_priv->interrupt_alloc_lock);
+		if (retval)
+			goto resume_exit;
+	}
+
+	retval = agilent_82357a_init(board);
+	if (retval < 0)
+		goto resume_exit;
+
+	// set/unset system controller
+	retval = agilent_82357a_request_system_control(board, board->master);
+	// toggle ifc if master
+	if (board->master) {
+		agilent_82357a_interface_clear(board, 1);
+		usleep_range(200, 250);
+		agilent_82357a_interface_clear(board, 0);
+	}
+	// assert/unassert REN
+	agilent_82357a_remote_enable(board, a_priv->ren_state);
+
+	dev_dbg(&usb_dev->dev,
+		"bus %d dev num %d gpib%d, interface %i resumed\n",
+		usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
+
+resume_exit:
+	mutex_unlock(&agilent_82357a_hotplug_lock);
+
+	return retval;
+}
+
+/* USB driver glue: hotplug and power-management callbacks plus the id table */
+static struct usb_driver agilent_82357a_bus_driver = {
+	.name = DRV_NAME,
+	.id_table = agilent_82357a_driver_device_table,
+
+	/* hotplug */
+	.probe = agilent_82357a_driver_probe,
+	.disconnect = agilent_82357a_driver_disconnect,
+
+	/* power management */
+	.suspend = agilent_82357a_driver_suspend,
+	.resume = agilent_82357a_driver_resume,
+};
+
+/*
+ * Module init: clear the interface table, then register the USB driver and
+ * the gpib driver.  The USB driver is deregistered again if the gpib
+ * registration fails.  Returns 0 or the error of the failing registration.
+ */
+static int __init agilent_82357a_init_module(void)
+{
+	int slot;
+	int err;
+
+	for (slot = 0; slot < MAX_NUM_82357A_INTERFACES; ++slot)
+		agilent_82357a_driver_interfaces[slot] = NULL;
+
+	err = usb_register(&agilent_82357a_bus_driver);
+	if (err) {
+		pr_err("usb_register failed: error = %d\n", err);
+		return err;
+	}
+
+	err = gpib_register_driver(&agilent_82357a_gpib_interface, THIS_MODULE);
+	if (!err)
+		return 0;
+
+	pr_err("gpib_register_driver failed: error = %d\n", err);
+	usb_deregister(&agilent_82357a_bus_driver);
+	return err;
+}
+
+/* Module exit: undo the registrations of init in reverse order. */
+static void __exit agilent_82357a_exit_module(void)
+{
+	/* gpib driver was registered last, so it goes first */
+	gpib_unregister_driver(&agilent_82357a_gpib_interface);
+
+	usb_deregister(&agilent_82357a_bus_driver);
+}
+
+module_init(agilent_82357a_init_module);
+module_exit(agilent_82357a_exit_module);
diff --git a/drivers/gpib/agilent_82357a/agilent_82357a.h b/drivers/gpib/agilent_82357a/agilent_82357a.h
new file mode 100644
index 000000000000..33ac558e5552
--- /dev/null
+++ b/drivers/gpib/agilent_82357a/agilent_82357a.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *   copyright            : (C) 2004 by Frank Mori Hess                    *
+ ***************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/usb.h>
+#include <linux/timer.h>
+#include <linux/compiler_attributes.h>
+#include "gpibP.h"
+#include "tms9914.h"
+
+/* USB vendor id used in the driver's device table */
+enum usb_vendor_ids {
+	USB_VENDOR_ID_AGILENT = 0x0957,
+};
+
+enum usb_device_ids {
+	USB_DEVICE_ID_AGILENT_82357A = 0x0107,		// device id once firmware is loaded
+	USB_DEVICE_ID_AGILENT_82357A_PREINIT = 0x0007,	// device id before firmware is loaded
+	USB_DEVICE_ID_AGILENT_82357B = 0x0718,		// device id once firmware is loaded
+	USB_DEVICE_ID_AGILENT_82357B_PREINIT = 0x0518,	// device id before firmware is loaded
+};
+
+/*
+ * USB endpoint numbers.  The control and bulk-in endpoints carry no model
+ * suffix; bulk-out and interrupt-in differ between the 82357A and 82357B.
+ */
+enum endpoint_addresses {
+	/* common */
+	AGILENT_82357_CONTROL_ENDPOINT = 0x0,
+	AGILENT_82357_BULK_IN_ENDPOINT = 0x2,
+
+	/* 82357A */
+	AGILENT_82357A_BULK_OUT_ENDPOINT = 0x4,
+	AGILENT_82357A_INTERRUPT_IN_ENDPOINT = 0x6,
+
+	/* 82357B */
+	AGILENT_82357B_BULK_OUT_ENDPOINT = 0x6,
+	AGILENT_82357B_INTERRUPT_IN_ENDPOINT = 0x8,
+};
+
+/* Command codes for the data pipe (names suggest bulk transfers) */
+enum bulk_commands {
+	DATA_PIPE_CMD_WRITE = 1,
+	DATA_PIPE_CMD_READ = 3,
+	DATA_PIPE_CMD_WR_REGS = 4,
+	DATA_PIPE_CMD_RD_REGS = 5,
+};
+
+/* Read request flags, one bit each */
+enum agilent_82357a_read_flags {
+	ARF_END_ON_EOI = 1 << 0,
+	ARF_NO_ADDRESS = 1 << 1,
+	ARF_END_ON_EOS_CHAR = 1 << 2,
+	ARF_SPOLL = 1 << 3,
+};
+
+/* Trailing read flags, one bit each */
+enum agilent_82357a_trailing_read_flags {
+	ATRF_EOI = 1 << 0,
+	ATRF_ATN = 1 << 1,
+	ATRF_IFC = 1 << 2,
+	ATRF_EOS = 1 << 3,
+	ATRF_ABORT = 1 << 4,
+	ATRF_COUNT = 1 << 5,
+	ATRF_DEAD_BUS = 1 << 6,
+	ATRF_UNADDRESSED = 1 << 7,
+};
+
+/* Write request flags, one bit each (bits 5 and 6 are not defined here) */
+enum agilent_82357a_write_flags {
+	AWF_SEND_EOI = 1 << 0,
+	AWF_NO_FAST_TALKER_FIRST_BYTE = 1 << 1,
+	AWF_NO_FAST_TALKER = 1 << 2,
+	AWF_NO_ADDRESS = 1 << 3,
+	AWF_ATN = 1 << 4,
+	AWF_SEPARATE_HEADER = 1 << 7,
+};
+
+/*
+ * Bit numbers (not masks) of the interrupt flags, for use with
+ * test_bit()/set_bit().
+ */
+enum agilent_82357a_interrupt_flag_bit_numbers {
+	AIF_SRQ_BN = 0,
+	AIF_WRITE_COMPLETE_BN,
+	AIF_READ_COMPLETE_BN,
+};
+
+enum agilent_82357_error_codes {
+	UGP_SUCCESS = 0,
+	UGP_ERR_INVALID_CMD = 1,
+	UGP_ERR_INVALID_PARAM = 2,
+	UGP_ERR_INVALID_REG = 3,
+	UGP_ERR_GPIB_READ = 4,
+	UGP_ERR_GPIB_WRITE = 5,
+	UGP_ERR_FLUSHING = 6,
+	UGP_ERR_FLUSHING_ALREADY = 7,
+	UGP_ERR_UNSUPPORTED = 8,
+	UGP_ERR_OTHER  = 9
+};
+
+/* Control values for transfer abort and transfer status */
+enum agilent_82357_control_values {
+	XFER_ABORT = 0xa0,
+	XFER_STATUS = 0xb0
+};
+
+/* Transfer status bits */
+enum xfer_status_bits {
+	XS_COMPLETED = 1 << 0,
+	XS_READ = 1 << 1,
+};
+
+/* Transfer completion bits, one bit each */
+enum xfer_status_completion_bits {
+	XSC_EOI = 1 << 0,
+	XSC_ATN = 1 << 1,
+	XSC_IFC = 1 << 2,
+	XSC_EOS = 1 << 3,
+	XSC_ABORT = 1 << 4,
+	XSC_COUNT = 1 << 5,
+	XSC_DEAD_BUS = 1 << 6,
+	XSC_BUS_NOT_ADDRESSED = 1 << 7,
+};
+
+/* Transfer abort types */
+enum xfer_abort_type {
+	XA_FLUSH = 1,
+};
+
+#define STATUS_DATA_LEN 8
+#define INTERRUPT_BUF_LEN 8
+
+/* Per-URB context: a completion plus a one-bit timeout flag */
+struct agilent_82357a_urb_ctx {
+	struct completion complete;
+	unsigned int timed_out : 1;
+};
+
+/* Local data for each 82357 device */
+struct agilent_82357a_priv {
+	/* USB interface the board is bound to, NULL when there is none */
+	struct usb_interface *bus_interface;
+	unsigned short eos_char;
+	unsigned short eos_mode;
+	unsigned short hw_control_bits;
+	unsigned long interrupt_flags;
+	struct urb *bulk_urb;
+	struct urb *interrupt_urb;
+	u8 *interrupt_buffer;
+	/* bulk transfer lock */
+	struct mutex bulk_transfer_lock;
+	/* bulk transfer allocation lock */
+	struct mutex bulk_alloc_lock;
+	/* interrupt allocation lock */
+	struct mutex interrupt_alloc_lock;
+	/* control message allocation lock */
+	struct mutex control_alloc_lock;
+	struct timer_list bulk_timer;
+	struct agilent_82357a_urb_ctx context;
+	unsigned int bulk_out_endpoint;
+	unsigned int interrupt_in_endpoint;
+	unsigned int is_cic : 1;
+	unsigned int ren_state : 1;
+};
+
+/* One register access: the register address and the value for it */
+struct agilent_82357a_register_pairlet {
+	short address;		/* register to access */
+	unsigned short value;	/* value written, or value read back */
+};
+
+/* Firmware register addresses */
+enum firmware_registers {
+	HW_CONTROL = 0x0a,
+	LED_CONTROL = 0x0b,
+	RESET_TO_POWERUP = 0x0c,
+	PROTOCOL_CONTROL = 0x0d,
+	FAST_TALKER_T1 = 0x0e,
+};
+
+/* Bits of the HW_CONTROL register (bits 4 and 6 are not defined here) */
+enum hardware_control_bits {
+	NOT_TI_RESET = 1 << 0,
+	SYSTEM_CONTROLLER = 1 << 1,
+	NOT_PARALLEL_POLL = 1 << 2,
+	OSCILLATOR_5V_ON = 1 << 3,
+	OUTPUT_5V_ON = 1 << 5,
+	CPLD_3V_ON = 1 << 7,
+};
+
+/* Bits of the LED_CONTROL register */
+enum led_control_bits {
+	FIRMWARE_LED_CONTROL = 1 << 0,
+	FAIL_LED_ON = 1 << 5,
+	READY_LED_ON = 1 << 6,
+	ACCESS_LED_ON = 1 << 7,
+};
+
+/* Bits of the RESET_TO_POWERUP register */
+enum reset_to_powerup_bits {
+	/* wait 2 millisec after sending */
+	RESET_SPACEBALL = 1 << 0,
+};
+
+/* Bits of the PROTOCOL_CONTROL register */
+enum protocol_control_bits {
+	WRITE_COMPLETE_INTERRUPT_EN = 1 << 0,
+};
+
+/* Request code for control messages (presumably the USB bRequest value - confirm) */
+static const int agilent_82357a_control_request = 4;
+
diff --git a/drivers/gpib/cb7210/Makefile b/drivers/gpib/cb7210/Makefile
new file mode 100644
index 000000000000..d239ae80b415
--- /dev/null
+++ b/drivers/gpib/cb7210/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GPIB_CB7210) += cb7210.o
+
+
diff --git a/drivers/gpib/cb7210/cb7210.c b/drivers/gpib/cb7210/cb7210.c
new file mode 100644
index 000000000000..3e2397898a9b
--- /dev/null
+++ b/drivers/gpib/cb7210/cb7210.c
@@ -0,0 +1,1598 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * Measurement Computing boards using cb7210.2 and cbi488.2 chips
+ *    copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "cb7210.h"
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/dma.h>
+#include <linux/bitops.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include "gpib_pci_ids.h"
+#include "quancom_pci.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver Measurement Computing boards using cb7210.2 and cbi488.2");
+
+static int cb7210_read(struct gpib_board *board, u8 *buffer, size_t length,
+		       int *end, size_t *bytes_read);
+
+/*
+ * Returns 1 when HS_STATUS reports both the MSB and the LSB half of the
+ * receive fifo as not empty, 0 otherwise.
+ */
+static inline int have_fifo_word(const struct cb7210_priv *cb_priv)
+{
+	const int both_halves = HS_RX_MSB_NOT_EMPTY | HS_RX_LSB_NOT_EMPTY;
+
+	return (cb7210_read_byte(cb_priv, HS_STATUS) & both_halves) == both_halves;
+}
+
+/*
+ * Enable or disable the input fifo.  Runs under board->spinlock.
+ *
+ * Enabling clears in_fifo_half_full and the HR_DMAI bit in IMR2, writes the
+ * enable and clear-interrupt bits to HS_MODE, programs the interrupt level
+ * and leaves HS_RX_ENABLE set in hs_mode_bits.  Disabling clears HR_DMAI,
+ * removes the enable bits from HS_MODE and clears READ_READY_BN.
+ */
+static inline void input_fifo_enable(struct gpib_board *board, int enable)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	if (enable) {
+		cb_priv->in_fifo_half_full = 0;
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+
+		cb7210_write_byte(cb_priv, HS_RX_ENABLE | HS_TX_ENABLE | HS_CLR_SRQ_INT |
+				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT | cb_priv->hs_mode_bits,
+				  HS_MODE);
+
+		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+
+		cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
+
+		cb_priv->hs_mode_bits |= HS_RX_ENABLE;
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+	} else {
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+
+		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
+		/*
+		 * Pass the bare register offset, as every other
+		 * cb7210_write_byte() call in this file does, rather than an
+		 * offset with the io base already added in.
+		 */
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+
+		clear_bit(READ_READY_BN, &nec_priv->state);
+	}
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+}
+
+/*
+ * Read data through the input fifo.
+ *
+ * The main loop waits for the fifo (or an END, device clear or timeout
+ * condition), then drains it one 16-bit word at a time, low byte first.
+ * Afterwards a possible single leftover byte is fetched, the fifo is
+ * disabled, and one more byte is read through the nec7210 path if
+ * READ_READY_BN is set.
+ *
+ * *bytes_read is set to the number of bytes stored in buffer and *end is
+ * set when END was received.  Returns 0, -EIO when there is no fifo io
+ * base, -EINVAL when length is not larger than the fifo size, or
+ * -ERESTARTSYS, -ETIMEDOUT or -EINTR.
+ */
+static int fifo_read(struct gpib_board *board, struct cb7210_priv *cb_priv, u8 *buffer,
+		     size_t length, int *end, size_t *bytes_read)
+{
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned long irq_flags;
+	ssize_t result = 0;
+	int fifo_status;
+	u16 fifo_word;
+
+	*bytes_read = 0;
+	if (cb_priv->fifo_iobase == 0)	{
+		dev_err(board->gpib_dev, "fifo iobase is zero!\n");
+		return -EIO;
+	}
+	*end = 0;
+	if (length <= cb7210_fifo_size)	{
+		dev_err(board->gpib_dev, " bug! fifo read length < fifo size\n");
+		return -EINVAL;
+	}
+
+	input_fifo_enable(board, 1);
+
+	while (*bytes_read + cb7210_fifo_size < length)	{
+		/* enable the HR_DMAI bit in IMR2 while waiting */
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
+
+		if (wait_event_interruptible(board->wait,
+					     (cb_priv->in_fifo_half_full &&
+					      have_fifo_word(cb_priv)) ||
+					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			result = -ERESTARTSYS;
+			nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+			break;
+		}
+
+		spin_lock_irqsave(&board->spinlock, irq_flags);
+
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+
+		/* drain complete words: low byte first, then high byte */
+		while (have_fifo_word(cb_priv))	{
+			fifo_word = inw(cb_priv->fifo_iobase + DIR);
+			buffer[(*bytes_read)++] = fifo_word & 0xff;
+			buffer[(*bytes_read)++] = (fifo_word >> 8) & 0xff;
+		}
+
+		cb_priv->in_fifo_half_full = 0;
+
+		fifo_status = cb7210_read_byte(cb_priv, HS_STATUS);
+
+		spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state)) {
+			*end = 1;
+			break;
+		}
+		if (fifo_status & HS_FIFO_FULL)
+			break;
+		if (test_bit(TIMO_NUM, &board->status))	{
+			result = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &nec_priv->state)) {
+			result = -EINTR;
+			break;
+		}
+	}
+
+	/* a lone byte may be left in the LSB half of the fifo */
+	fifo_status = cb7210_read_byte(cb_priv, HS_STATUS);
+	if (fifo_status & HS_RX_LSB_NOT_EMPTY) {
+		fifo_word = inw(cb_priv->fifo_iobase + DIR);
+		buffer[(*bytes_read)++] = fifo_word & 0xff;
+	}
+
+	input_fifo_enable(board, 0);
+
+	if (wait_event_interruptible(board->wait,
+				     test_bit(READ_READY_BN, &nec_priv->state) ||
+				     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		result = -ERESTARTSYS;
+
+	/* later checks override earlier ones */
+	if (test_bit(TIMO_NUM, &board->status))
+		result = -ETIMEDOUT;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		result = -EINTR;
+	if (test_bit(READ_READY_BN, &nec_priv->state)) {
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDA);
+		buffer[(*bytes_read)++] = nec7210_read_data_in(board, nec_priv, end);
+	}
+
+	return result;
+}
+
+/*
+ * Accelerated read using the input fifo.
+ *
+ * Falls back to cb7210_read() when the request is shorter than the fifo
+ * size plus 3 bytes or when HR_REOS is set in auxa_bits.  Otherwise the
+ * first byte is read through the nec7210 path, the bulk through
+ * fifo_read(), and one final byte through cb7210_read().
+ *
+ * *bytes_read and *end are updated as data arrives.  Returns 0 or a
+ * negative errno (-ERESTARTSYS, -ETIMEDOUT, -EINTR, or an error from the
+ * helpers).
+ */
+static int cb7210_accel_read(struct gpib_board *board, u8 *buffer,
+			     size_t length, int *end, size_t *bytes_read)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	size_t chunk;
+	ssize_t err;
+
+	*bytes_read = 0;
+	// deal with limitations of fifo
+	if (length < cb7210_fifo_size + 3 || (nec_priv->auxa_bits & HR_REOS))
+		return cb7210_read(board, buffer, length, end, bytes_read);
+	*end = 0;
+
+	/* first byte goes through the nec7210 path */
+	nec7210_release_rfd_holdoff(board, nec_priv);
+
+	if (wait_event_interruptible(board->wait,
+				     test_bit(READ_READY_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		return -ERESTARTSYS;
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+
+	nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
+	buffer[(*bytes_read)++] = nec7210_read_data_in(board, nec_priv, end);
+	if (*end)
+		return 0;
+
+	/* bulk of the data through the fifo, keeping one byte back */
+	nec7210_release_rfd_holdoff(board, nec_priv);
+
+	err = fifo_read(board, cb_priv, &buffer[*bytes_read], length - *bytes_read - 1,
+			end, &chunk);
+	*bytes_read += chunk;
+	if (err < 0)
+		return err;
+	if (*end)
+		return 0;
+
+	/* last byte through the plain read path */
+	err = cb7210_read(board, &buffer[*bytes_read], 1, end, &chunk);
+	*bytes_read += chunk;
+
+	return err < 0 ? err : 0;
+}
+
+/*
+ * Returns 1 when HS_STATUS reports neither half of the transmit fifo as
+ * not empty, 0 otherwise.
+ */
+static int output_fifo_empty(const struct cb7210_priv *cb_priv)
+{
+	const int tx_pending = HS_TX_MSB_NOT_EMPTY | HS_TX_LSB_NOT_EMPTY;
+
+	return !(cb7210_read_byte(cb_priv, HS_STATUS) & tx_pending);
+}
+
+/*
+ * Enable or disable the output fifo.  Runs under board->spinlock.
+ *
+ * Disabling removes the enable bits from HS_MODE, clears HR_DMAO in IMR2
+ * and sets HR_DOIE in IMR1.  Enabling does the reverse on IMR1/IMR2, writes
+ * the enable and clear-interrupt bits to HS_MODE, leaves HS_TX_ENABLE set
+ * in hs_mode_bits, programs the interrupt level and clears WRITE_READY_BN.
+ */
+static inline void output_fifo_enable(struct gpib_board *board, int enable)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+
+	if (!enable) {
+		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
+		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, HR_DOIE);
+	} else {
+		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
+
+		cb7210_write_byte(cb_priv, HS_RX_ENABLE | HS_TX_ENABLE | HS_CLR_SRQ_INT |
+				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT | cb_priv->hs_mode_bits,
+				  HS_MODE);
+
+		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
+		cb_priv->hs_mode_bits |= HS_TX_ENABLE;
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+
+		cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
+
+		clear_bit(WRITE_READY_BN, &nec_priv->state);
+	}
+
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+
+/*
+ * Send @length bytes from @buffer through the 16-bit output fifo.
+ *
+ * Bytes are combined in pairs (buffer[n] in the low byte, buffer[n + 1] in
+ * the high byte) and written with outw(), so every chunk must be a multiple
+ * of cb7210_fifo_width; a chunk that is not aborts the loop with -EINVAL.
+ *
+ * *bytes_written is set to the number of bytes pushed into the fifo (0 on
+ * the early returns).  Returns 0 on success or a negative errno: -EINVAL,
+ * -ERESTARTSYS (interrupted wait), -ETIMEDOUT (TIMO_NUM set), -EIO
+ * (BUS_ERROR_BN set) or -EINTR (DEV_CLEAR_BN set).  The status checks after
+ * the final wait overwrite any error recorded inside the loop.
+ */
+static int fifo_write(struct gpib_board *board, u8 *buffer, size_t length,
+		      size_t *bytes_written)
+{
+	size_t count = 0;
+	ssize_t retval = 0;
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned int num_bytes, i;
+	unsigned long flags;
+
+	*bytes_written = 0;
+	if (cb_priv->fifo_iobase == 0) {
+		dev_err(board->gpib_dev, "fifo iobase is zero!\n");
+		return -EINVAL;
+	}
+	if (length == 0)
+		return 0;
+
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
+	clear_bit(BUS_ERROR_BN, &nec_priv->state);
+
+	output_fifo_enable(board, 1);
+
+	while (count < length) {
+		// wait until byte is ready to be sent
+		if (wait_event_interruptible(board->wait,
+					     cb_priv->out_fifo_half_empty ||
+					     output_fifo_empty(cb_priv) ||
+					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		/* error code for these conditions is assigned after the loop */
+		if (test_bit(TIMO_NUM, &board->status) ||
+		    test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+		    test_bit(BUS_ERROR_BN, &nec_priv->state))
+			break;
+
+		/* empty fifo takes all but one word, otherwise refill half of it */
+		if (output_fifo_empty(cb_priv))
+			num_bytes = cb7210_fifo_size - cb7210_fifo_width;
+		else
+			num_bytes = cb7210_fifo_size / 2;
+		if (num_bytes + count > length)
+			num_bytes = length - count;
+		if (num_bytes % cb7210_fifo_width) {
+			dev_err(board->gpib_dev, " bug! fifo write with odd number of bytes\n");
+			retval = -EINVAL;
+			break;
+		}
+
+		spin_lock_irqsave(&board->spinlock, flags);
+		for (i = 0; i < num_bytes / cb7210_fifo_width; i++) {
+			u16 word;
+
+			word = buffer[count++] & 0xff;
+			word |= (buffer[count++] << 8) & 0xff00;
+			outw(word, cb_priv->fifo_iobase + CDOR);
+		}
+		cb_priv->out_fifo_half_empty = 0;
+		/* pulse the EOI-empty and half-full clear bits, then restore the mode */
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits |
+				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT, HS_MODE);
+		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+	}
+	// wait last byte has been sent
+	if (wait_event_interruptible(board->wait,
+				     output_fifo_empty(cb_priv) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status))) {
+		retval = -ERESTARTSYS;
+	}
+	/* later checks win: device clear overrides bus error overrides timeout */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_bit(BUS_ERROR_BN, &nec_priv->state))
+		retval = -EIO;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+
+	output_fifo_enable(board, 0);
+
+	*bytes_written = count;
+	return retval;
+}
+
+/*
+ * Accelerated write: push as much of @buffer as possible through the fifo
+ * and send the remainder with nec7210_write().
+ *
+ * The fifo part is the largest multiple of cb7210_fifo_width that still
+ * leaves at least one byte over (it is empty when @length does not exceed
+ * cb7210_fifo_width), so the final byte(s) and @send_eoi are always handled
+ * by nec7210_write().
+ *
+ * *bytes_written accumulates the bytes reported by both stages.  Returns a
+ * negative value from fifo_write() on failure, otherwise the result of
+ * nec7210_write().
+ */
+static int cb7210_accel_write(struct gpib_board *board, u8 *buffer,
+			      size_t length, int send_eoi, size_t *bytes_written)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned long fifo_len = (length > cb7210_fifo_width) ? length - 1 : 0;
+	unsigned long tail_len;
+	size_t xferred;
+	int status;
+
+	/* round the fifo portion down to a whole number of fifo words */
+	fifo_len -= fifo_len % cb7210_fifo_width;
+	tail_len = length - fifo_len;
+
+	*bytes_written = 0;
+
+	status = fifo_write(board, buffer, fifo_len, &xferred);
+	*bytes_written += xferred;
+	if (status < 0)
+		return status;
+
+	status = nec7210_write(board, nec_priv, buffer + fifo_len, tail_len,
+			       send_eoi, &xferred);
+	*bytes_written += xferred;
+
+	return status;
+}
+
+/*
+ * Sample the GPIB bus lines.
+ *
+ * Reads the paged BUS_STATUS register and returns VALID_ALL combined with
+ * the BUS_* flag of every line whose BSR_*_BIT reads back as zero.
+ */
+static int cb7210_line_status(const struct gpib_board *board)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	int bsr_bits = cb7210_paged_read_byte(cb_priv, BUS_STATUS, BUS_STATUS_PAGE);
+	int line_state = VALID_ALL;
+
+	if (!(bsr_bits & BSR_REN_BIT))
+		line_state |= BUS_REN;
+	if (!(bsr_bits & BSR_IFC_BIT))
+		line_state |= BUS_IFC;
+	if (!(bsr_bits & BSR_SRQ_BIT))
+		line_state |= BUS_SRQ;
+	if (!(bsr_bits & BSR_EOI_BIT))
+		line_state |= BUS_EOI;
+	if (!(bsr_bits & BSR_NRFD_BIT))
+		line_state |= BUS_NRFD;
+	if (!(bsr_bits & BSR_NDAC_BIT))
+		line_state |= BUS_NDAC;
+	if (!(bsr_bits & BSR_DAV_BIT))
+		line_state |= BUS_DAV;
+	if (!(bsr_bits & BSR_ATN_BIT))
+		line_state |= BUS_ATN;
+
+	return line_state;
+}
+
+/*
+ * Set the T1 delay through nec7210_t1_delay() and select the matching
+ * speed mode: AUX_HI_SPEED is written when @nano_sec is at most 350,
+ * AUX_LO_SPEED otherwise.
+ *
+ * Returns 350 in the high speed case, otherwise whatever
+ * nec7210_t1_delay() returned.
+ */
+static int cb7210_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned int retval;
+
+	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
+
+	if (nano_sec <= 350) {
+		write_byte(nec_priv, AUX_HI_SPEED, AUXMR);
+		retval = 350;
+	} else {
+		write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
+	}
+	return retval;
+}
+
+static irqreturn_t cb7210_locked_internal_interrupt(struct gpib_board *board);
+
+/*
+ * GPIB interrupt service routines
+ */
+
+/*
+ * Interrupt handler for the PCI boards; @arg is the struct gpib_board.
+ *
+ * Acknowledges the interrupt at the PCI bridge chip first (returning
+ * IRQ_NONE for an AMCC S5933 that shows neither INBOX_INTR_CS_BIT nor
+ * INTR_ASSERTED_BIT) and then runs the common handler under the board
+ * spinlock.
+ */
+static irqreturn_t cb_pci_interrupt(int irq, void *arg)
+{
+	int bits;
+	struct gpib_board *board = arg;
+	struct cb7210_priv *priv = board->private_data;
+
+	// first task check if this is really our interrupt in a shared irq environment
+	switch (priv->pci_chip)	{
+	case PCI_CHIP_AMCC_S5933:
+		if ((inl(priv->amcc_iobase + INTCSR_REG) &
+		     (INBOX_INTR_CS_BIT | INTR_ASSERTED_BIT)) == 0)
+			return IRQ_NONE;
+
+		// read incoming mailbox to clear mailbox full flag
+		inl(priv->amcc_iobase + INCOMING_MAILBOX_REG(3));
+		// clear amccs5933 interrupt
+		bits = INBOX_FULL_INTR_BIT | INBOX_BYTE_BITS(3) |
+			INBOX_SELECT_BITS(3) |	INBOX_INTR_CS_BIT;
+		outl(bits, priv->amcc_iobase + INTCSR_REG);
+		break;
+	case PCI_CHIP_QUANCOM:
+		/* an asserted Quancom interrupt is acknowledged by rewriting the enable bit */
+		if ((inb(nec7210_iobase(priv) + QUANCOM_IRQ_CONTROL_STATUS_REG) &
+		     QUANCOM_IRQ_ASSERTED_BIT))
+			outb(QUANCOM_IRQ_ENABLE_BIT, nec7210_iobase(priv) +
+			     QUANCOM_IRQ_CONTROL_STATUS_REG);
+		break;
+	default:
+		break;
+	}
+	return cb7210_locked_internal_interrupt(arg);
+}
+
+/*
+ * Common interrupt work for all cb7210 boards.
+ *
+ * Reads either the high speed status (when a fifo direction is enabled in
+ * hs_mode_bits) or ISR1, plus ISR2, hands ISR1/ISR2 to
+ * nec7210_interrupt_have_status(), then records and clears the fifo
+ * related events.  Waiters on board->wait are woken only when at least one
+ * fifo event had to be cleared.  Always returns IRQ_HANDLED.
+ *
+ * No locking is done here; cb7210_locked_internal_interrupt() wraps this
+ * function with board->spinlock.
+ */
+static irqreturn_t cb7210_internal_interrupt(struct gpib_board *board)
+{
+	int hs_status, status1, status2;
+	struct cb7210_priv *priv = board->private_data;
+	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
+	int clear_bits;
+
+	/* only one of HS_STATUS and ISR1 is read; the other is reported as 0 */
+	if ((priv->hs_mode_bits & HS_ENABLE_MASK)) {
+		status1 = 0;
+		hs_status = cb7210_read_byte(priv, HS_STATUS);
+	} else {
+		hs_status = 0;
+		status1 = read_byte(nec_priv, ISR1);
+	}
+	status2 = read_byte(nec_priv, ISR2);
+	nec7210_interrupt_have_status(board, nec_priv, status1, status2);
+
+	dev_dbg(board->gpib_dev, "status 0x%x, mode 0x%x\n", hs_status, priv->hs_mode_bits);
+
+	clear_bits = 0;
+
+	/* the half-full flag is attributed to whichever fifo direction is enabled */
+	if (hs_status & HS_HALF_FULL) {
+		if (priv->hs_mode_bits & HS_TX_ENABLE)
+			priv->out_fifo_half_empty = 1;
+		else if (priv->hs_mode_bits & HS_RX_ENABLE)
+			priv->in_fifo_half_full = 1;
+		clear_bits |= HS_CLR_HF_INT;
+	}
+
+	if (hs_status & HS_SRQ_INT) {
+		set_bit(SRQI_NUM, &board->status);
+		clear_bits |= HS_CLR_SRQ_INT;
+	}
+
+	if ((hs_status & HS_EOI_INT)) {
+		clear_bits |= HS_CLR_EOI_EMPTY_INT;
+		set_bit(RECEIVED_END_BN, &nec_priv->state);
+		if ((nec_priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDE)
+			set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+	}
+
+	/* transmit fifo ran empty: clear that event as well */
+	if ((priv->hs_mode_bits & HS_TX_ENABLE) &&
+	    (hs_status & (HS_TX_MSB_NOT_EMPTY | HS_TX_LSB_NOT_EMPTY)) == 0)
+		clear_bits |= HS_CLR_EOI_EMPTY_INT;
+
+	if (clear_bits) {
+		/* pulse the clear bits, then restore the plain mode bits */
+		cb7210_write_byte(priv, priv->hs_mode_bits | clear_bits, HS_MODE);
+		cb7210_write_byte(priv, priv->hs_mode_bits, HS_MODE);
+		wake_up_interruptible(&board->wait);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Run cb7210_internal_interrupt() with board->spinlock held and local
+ * interrupts saved/disabled; returns that handler's result.
+ */
+static irqreturn_t cb7210_locked_internal_interrupt(struct gpib_board *board)
+{
+	irqreturn_t handled;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	handled = cb7210_internal_interrupt(board);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return handled;
+}
+
+/*
+ * Interrupt handler registered by cb_isa_attach(); @arg is the struct
+ * gpib_board.  Calls the common handler directly, without taking
+ * board->spinlock.
+ */
+static irqreturn_t cb7210_interrupt(int irq, void *arg)
+{
+	return cb7210_internal_interrupt(arg);
+}
+
+static int cb_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static int cb_isa_attach(struct gpib_board *board, const struct gpib_board_config *config);
+
+static void cb_pci_detach(struct gpib_board *board);
+static void cb_isa_detach(struct gpib_board *board);
+
+// wrappers for interface functions
+/* Interface hook: forwards to nec7210_read() using the board's nec7210 state. */
+static int cb7210_read(struct gpib_board *board, u8 *buffer, size_t length,
+		       int *end, size_t *bytes_read)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
+}
+
+/* Interface hook: forwards to nec7210_write() using the board's nec7210 state. */
+static int cb7210_write(struct gpib_board *board, u8 *buffer, size_t length,
+			int send_eoi, size_t *bytes_written)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
+}
+
+/* Interface hook: forwards to nec7210_command() using the board's nec7210 state. */
+static int cb7210_command(struct gpib_board *board, u8 *buffer, size_t length,
+			  size_t *bytes_written)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
+}
+
+/* Interface hook: forwards to nec7210_take_control(). */
+static int cb7210_take_control(struct gpib_board *board, int synchronous)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
+}
+
+/* Interface hook: forwards to nec7210_go_to_standby(). */
+static int cb7210_go_to_standby(struct gpib_board *board)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_go_to_standby(board, &priv->nec7210_priv);
+}
+
+/*
+ * Interface hook: set (request_control != 0) or clear HS_SYS_CONTROL in the
+ * cached hs_mode_bits, write them to HS_MODE, then forward the request to
+ * nec7210_request_system_control() and return its result.
+ */
+static int cb7210_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct cb7210_priv *priv = board->private_data;
+	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
+
+	if (request_control)
+		priv->hs_mode_bits |= HS_SYS_CONTROL;
+	else
+		priv->hs_mode_bits &= ~HS_SYS_CONTROL;
+
+	cb7210_write_byte(priv, priv->hs_mode_bits, HS_MODE);
+	return nec7210_request_system_control(board, nec_priv, request_control);
+}
+
+/* Interface hook: forwards to nec7210_interface_clear(). */
+static void cb7210_interface_clear(struct gpib_board *board, int assert)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
+}
+
+/* Interface hook: forwards to nec7210_remote_enable(). */
+static void cb7210_remote_enable(struct gpib_board *board, int enable)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
+}
+
+/* Interface hook: forwards to nec7210_enable_eos(). */
+static int cb7210_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
+}
+
+/* Interface hook: forwards to nec7210_disable_eos(). */
+static void cb7210_disable_eos(struct gpib_board *board)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_disable_eos(board, &priv->nec7210_priv);
+}
+
+/* Interface hook: forwards to nec7210_update_status() and returns its result. */
+static unsigned int cb7210_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
+}
+
+/* Interface hook: forwards to nec7210_primary_address(). */
+static int cb7210_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_primary_address(board, &priv->nec7210_priv, address);
+}
+
+/* Interface hook: forwards to nec7210_secondary_address(). */
+static int cb7210_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
+}
+
+/* Interface hook: forwards to nec7210_parallel_poll(). */
+static int cb7210_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
+}
+
+/* Interface hook: forwards to nec7210_parallel_poll_configure(). */
+static void cb7210_parallel_poll_configure(struct gpib_board *board, u8 configuration)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, configuration);
+}
+
+/* Interface hook: forwards to nec7210_parallel_poll_response(). */
+static void cb7210_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
+}
+
+/* Interface hook: forwards to nec7210_serial_poll_response(). */
+static void cb7210_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
+}
+
+/* Interface hook: forwards to nec7210_serial_poll_status() and returns its result. */
+static u8 cb7210_serial_poll_status(struct gpib_board *board)
+{
+	struct cb7210_priv *priv = board->private_data;
+
+	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
+}
+
+/*
+ * Interface hook: issue the AUX_RTL2 auxiliary command followed, after a
+ * one microsecond delay, by AUX_RTL.
+ */
+static void cb7210_return_to_local(struct gpib_board *board)
+{
+	struct cb7210_priv *priv = board->private_data;
+	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
+
+	write_byte(nec_priv, AUX_RTL2, AUXMR);
+	udelay(1);
+	write_byte(nec_priv, AUX_RTL, AUXMR);
+}
+
+/*
+ * "cbi_pci_unaccel": PCI boards using the plain nec7210 read/write paths
+ * (cb7210_read/cb7210_write) instead of the fifo-accelerated ones.
+ */
+static struct gpib_interface cb_pci_unaccel_interface = {
+	.name = "cbi_pci_unaccel",
+	.attach = cb_pci_attach,
+	.detach = cb_pci_detach,
+	.read = cb7210_read,
+	.write = cb7210_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * "cbi_pci_accel": PCI boards using the fifo-accelerated read/write paths
+ * (cb7210_accel_read/cb7210_accel_write).
+ */
+static struct gpib_interface cb_pci_accel_interface = {
+	.name = "cbi_pci_accel",
+	.attach = cb_pci_attach,
+	.detach = cb_pci_detach,
+	.read = cb7210_accel_read,
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * "cbi_pci": PCI boards under the plain name.  Uses the same
+ * fifo-accelerated read/write hooks as "cbi_pci_accel";
+ * .local_parallel_poll_mode is left unset here.
+ */
+static struct gpib_interface cb_pci_interface = {
+	.name = "cbi_pci",
+	.attach = cb_pci_attach,
+	.detach = cb_pci_detach,
+	.read = cb7210_accel_read,
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * "cbi_isa_unaccel": ISA boards using the plain nec7210 read/write paths
+ * (cb7210_read/cb7210_write) instead of the fifo-accelerated ones.
+ */
+static struct gpib_interface cb_isa_unaccel_interface = {
+	.name = "cbi_isa_unaccel",
+	.attach = cb_isa_attach,
+	.detach = cb_isa_detach,
+	.read = cb7210_read,
+	.write = cb7210_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * "cbi_isa": ISA boards under the plain name.  Uses the same
+ * fifo-accelerated read/write hooks as "cbi_isa_accel";
+ * .local_parallel_poll_mode is left unset here.
+ */
+static struct gpib_interface cb_isa_interface = {
+	.name = "cbi_isa",
+	.attach = cb_isa_attach,
+	.detach = cb_isa_detach,
+	.read = cb7210_accel_read,
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * "cbi_isa_accel": ISA boards using the fifo-accelerated read/write paths
+ * (cb7210_accel_read/cb7210_accel_write).
+ */
+static struct gpib_interface cb_isa_accel_interface = {
+	.name = "cbi_isa_accel",
+	.attach = cb_isa_attach,
+	.detach = cb_isa_detach,
+	.read = cb7210_accel_read,
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+/*
+ * Allocate zeroed cb7210 private data for @board, store it in
+ * board->private_data and initialise the embedded nec7210 state.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails, in which case
+ * board->private_data is left NULL.
+ */
+static int cb7210_allocate_private(struct gpib_board *board)
+{
+	struct cb7210_priv *priv;
+
+	/* kzalloc() replaces the open-coded kmalloc() + memset() pair */
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+	init_nec7210_private(&priv->nec7210_priv);
+	return 0;
+}
+
+/*
+ * Free the private data hanging off @board and clear the pointer so a
+ * repeated detach sees NULL.
+ */
+static void cb7210_generic_detach(struct gpib_board *board)
+{
+	kfree(board->private_data);
+	board->private_data = NULL;
+}
+
+/*
+ * Attach steps common to every cb7210 board: reset board->status, allocate
+ * the private data and fill in the nec7210 register accessors, register
+ * offset and chip type.
+ *
+ * Returns 0 on success or -ENOMEM if the private data cannot be allocated.
+ */
+static int cb7210_generic_attach(struct gpib_board *board)
+{
+	struct cb7210_priv *priv;
+	struct nec7210_priv *nec;
+
+	board->status = 0;
+
+	if (cb7210_allocate_private(board) != 0)
+		return -ENOMEM;
+
+	priv = board->private_data;
+	nec = &priv->nec7210_priv;
+
+	nec->type = CB7210;
+	nec->offset = cb7210_reg_offset;
+	nec->read_byte = nec7210_locking_ioport_read_byte;
+	nec->write_byte = nec7210_locking_ioport_write_byte;
+
+	return 0;
+}
+
+/*
+ * Bring the chip into its initial state: reset it through HS_INT_LEVEL,
+ * program the interrupt level, reset the nec7210 core, clear pending high
+ * speed interrupts, leave only HS_HF_INT_EN in hs_mode_bits, select low
+ * speed mode and the clock register, put the board online and request the
+ * pseudo irq.
+ *
+ * Returns 0 on success or -1 if the pseudo irq cannot be requested.
+ * NOTE(review): cb_pci_interrupt is used as pseudo-irq handler for the ISA
+ * boards too, since cb_isa_attach() also ends up here - confirm intended.
+ */
+static int cb7210_init(struct cb7210_priv *cb_priv, struct gpib_board *board)
+{
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+
+	cb7210_write_byte(cb_priv, HS_RESET7210, HS_INT_LEVEL);
+	cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
+
+	nec7210_board_reset(nec_priv, board);
+	/* both fifo directions on plus every HS_CLR_* bit, before settling the mode */
+	cb7210_write_byte(cb_priv, HS_TX_ENABLE | HS_RX_ENABLE | HS_CLR_SRQ_INT |
+			  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT, HS_MODE);
+
+	cb_priv->hs_mode_bits = HS_HF_INT_EN;
+	cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
+
+	write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
+	/*
+	 * set clock register for maximum (20 MHz) driving frequency
+	 * ICR should be set to clock in megahertz (1-15) and to zero
+	 * for clocks faster than 15 MHz (max 20MHz)
+	 */
+	write_byte(nec_priv, ICR | 0, AUXMR);
+
+	if (cb_priv->pci_chip == PCI_CHIP_QUANCOM) {
+		/* change interrupt polarity */
+		nec_priv->auxb_bits |= HR_INV;
+		write_byte(nec_priv, nec_priv->auxb_bits, AUXMR);
+	}
+	nec7210_board_online(nec_priv, board);
+
+	/* poll so we can detect assertion of ATN */
+	if (gpib_request_pseudo_irq(board, cb_pci_interrupt)) {
+		pr_err("failed to allocate pseudo_irq\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Attach a PCI board.
+ *
+ * Looks for a supported device in this order: CBOARDS PCI-GPIB, CBOARDS
+ * CPCI-GPIB (both AMCC S5933 based), then Quancom GPIB.  The device is
+ * enabled, its regions are requested, the I/O bases are taken from the
+ * BARs, a shared interrupt is requested and, for the AMCC chip, mailbox
+ * interrupts are enabled before cb7210_init() runs.
+ *
+ * Returns 0 on success or a negative errno (-ENOMEM, -ENODEV, -EIO,
+ * -EBUSY) or the result of cb7210_init().
+ * NOTE(review): the error returns do not undo earlier steps; presumably
+ * the caller invokes cb_pci_detach() after a failed attach - confirm.
+ */
+static int cb_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct cb7210_priv *cb_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = 0;
+	int bits;
+	int retval;
+
+	retval = cb7210_generic_attach(board);
+	if (retval)
+		return retval;
+
+	cb_priv = board->private_data;
+	nec_priv = &cb_priv->nec7210_priv;
+
+	cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_CBOARDS,
+						  PCI_DEVICE_ID_CBOARDS_PCI_GPIB, NULL);
+	if (cb_priv->pci_device)
+		cb_priv->pci_chip = PCI_CHIP_AMCC_S5933;
+	if (!cb_priv->pci_device) {
+		cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_CBOARDS,
+							  PCI_DEVICE_ID_CBOARDS_CPCI_GPIB, NULL);
+		if (cb_priv->pci_device)
+			cb_priv->pci_chip = PCI_CHIP_AMCC_S5933;
+	}
+	if (!cb_priv->pci_device) {
+		cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_QUANCOM,
+							  PCI_DEVICE_ID_QUANCOM_GPIB, NULL);
+		if (cb_priv->pci_device) {
+			cb_priv->pci_chip = PCI_CHIP_QUANCOM;
+			/* Quancom boards use a register spacing of 4 */
+			nec_priv->offset = 4;
+		}
+	}
+	if (!cb_priv->pci_device) {
+		dev_err(board->gpib_dev, "no supported boards found.\n");
+		return -ENODEV;
+	}
+
+	if (pci_enable_device(cb_priv->pci_device)) {
+		dev_err(board->gpib_dev, "error enabling pci device\n");
+		return -EIO;
+	}
+
+	if (pci_request_regions(cb_priv->pci_device, DRV_NAME))
+		return -EBUSY;
+	switch (cb_priv->pci_chip) {
+	case PCI_CHIP_AMCC_S5933:
+		/* BAR 0: AMCC bridge, BAR 1: nec7210 registers, BAR 2: fifo */
+		cb_priv->amcc_iobase = pci_resource_start(cb_priv->pci_device, 0);
+		nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 1);
+		cb_priv->fifo_iobase = pci_resource_start(cb_priv->pci_device, 2);
+		break;
+	case PCI_CHIP_QUANCOM:
+		/* single BAR shared by the nec7210 registers and the fifo */
+		nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 0);
+		cb_priv->fifo_iobase = nec_priv->iobase;
+		break;
+	default:
+		dev_err(board->gpib_dev, "bug! unhandled pci_chip=%i\n", cb_priv->pci_chip);
+		return -EIO;
+	}
+	isr_flags |= IRQF_SHARED;
+	if (request_irq(cb_priv->pci_device->irq, cb_pci_interrupt, isr_flags, DRV_NAME, board)) {
+		dev_err(board->gpib_dev, "can't request IRQ %d\n",
+			cb_priv->pci_device->irq);
+		return -EBUSY;
+	}
+	/* cb_priv->irq is only set once the irq is really owned */
+	cb_priv->irq = cb_priv->pci_device->irq;
+
+	switch (cb_priv->pci_chip) {
+	case PCI_CHIP_AMCC_S5933:
+		// make sure mailbox flags are clear
+		inl(cb_priv->amcc_iobase + INCOMING_MAILBOX_REG(3));
+		// enable interrupts on amccs5933 chip
+		bits = INBOX_FULL_INTR_BIT | INBOX_BYTE_BITS(3) | INBOX_SELECT_BITS(3) |
+			INBOX_INTR_CS_BIT;
+		outl(bits, cb_priv->amcc_iobase + INTCSR_REG);
+		break;
+	default:
+		break;
+	}
+	return cb7210_init(cb_priv, board);
+}
+
+/*
+ * Detach a PCI board: release the pseudo irq, quiesce and free the real
+ * interrupt, reset the chip, release the PCI regions, drop the device
+ * reference and free the private data.
+ *
+ * Each step is guarded by the field the matching attach step filled in, so
+ * this is safe to call on a partially attached board.
+ */
+static void cb_pci_detach(struct gpib_board *board)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (cb_priv) {
+		gpib_free_pseudo_irq(board);
+		nec_priv = &cb_priv->nec7210_priv;
+		if (cb_priv->irq) {
+			/*
+			 * Only AMCC S5933 boards have an INTCSR register.
+			 * amcc_iobase stays zero on Quancom boards, so an
+			 * unconditional write would hit an unrelated I/O port.
+			 */
+			if (cb_priv->pci_chip == PCI_CHIP_AMCC_S5933)
+				outl(0, cb_priv->amcc_iobase + INTCSR_REG);
+			free_irq(cb_priv->irq, board);
+		}
+		if (nec_priv->iobase) {
+			nec7210_board_reset(nec_priv, board);
+			pci_release_regions(cb_priv->pci_device);
+		}
+		if (cb_priv->pci_device)
+			pci_dev_put(cb_priv->pci_device);
+	}
+	cb7210_generic_detach(board);
+}
+
+/*
+ * Attach an ISA board using the I/O base (config->ibbase) and interrupt
+ * (config->ibirq) supplied by the caller.
+ *
+ * Returns 0 on success, -ENOMEM, -EBUSY (I/O region or irq unavailable) or
+ * the result of cb7210_init().
+ * NOTE(review): an irq that irq_bits() maps to 0 is only logged; the
+ * attach carries on regardless - confirm this is intended.
+ * NOTE(review): the error returns do not undo earlier steps; presumably
+ * the caller invokes cb_isa_detach() after a failed attach - confirm.
+ */
+static int cb_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int isr_flags = 0;
+	struct cb7210_priv *cb_priv;
+	struct nec7210_priv *nec_priv;
+	unsigned int bits;
+	int retval;
+
+	retval = cb7210_generic_attach(board);
+	if (retval)
+		return retval;
+	cb_priv = board->private_data;
+	nec_priv = &cb_priv->nec7210_priv;
+	if (!request_region(config->ibbase, cb7210_iosize, DRV_NAME)) {
+		dev_err(board->gpib_dev, "ioports starting at 0x%x are already in use\n",
+			config->ibbase);
+		return -EBUSY;
+	}
+	/* iobase is only set once the region is owned; detach keys off it */
+	nec_priv->iobase = config->ibbase;
+	cb_priv->fifo_iobase = nec7210_iobase(cb_priv);
+
+	bits = irq_bits(config->ibirq);
+	if (bits == 0)
+		dev_err(board->gpib_dev, "board incapable of using irq %i, try 2-5, 7, 10, or 11\n",
+			config->ibirq);
+
+	// install interrupt handler
+	if (request_irq(config->ibirq, cb7210_interrupt, isr_flags, DRV_NAME, board)) {
+		dev_err(board->gpib_dev, "failed to obtain IRQ %d\n", config->ibirq);
+		return -EBUSY;
+	}
+	cb_priv->irq = config->ibirq;
+
+	return cb7210_init(cb_priv, board);
+}
+
+/*
+ * Detach an ISA board: release the pseudo irq, free the interrupt if one
+ * was obtained, reset the chip and release the I/O region if it was
+ * claimed, then free the private data.
+ */
+static void cb_isa_detach(struct gpib_board *board)
+{
+	struct cb7210_priv *cb_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (cb_priv) {
+		gpib_free_pseudo_irq(board);
+		nec_priv = &cb_priv->nec7210_priv;
+		if (cb_priv->irq)
+			free_irq(cb_priv->irq, board);
+		/* iobase is non-zero only after request_region() succeeded */
+		if (nec_priv->iobase) {
+			nec7210_board_reset(nec_priv, board);
+			release_region(nec7210_iobase(cb_priv), cb7210_iosize);
+		}
+	}
+	cb7210_generic_detach(board);
+}
+
+/*
+ * PCI probe callback.  Does nothing and reports success; the hardware is
+ * set up in cb_pci_attach() instead.
+ */
+static int cb7210_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	return 0;
+}
+
+/*
+ * PCI ids handled by this driver; the same three vendor/device pairs that
+ * cb_pci_attach() searches for.  Terminated by an all-zero entry.
+ */
+static const struct pci_device_id cb7210_pci_table[] = {
+	{PCI_VENDOR_ID_CBOARDS, PCI_DEVICE_ID_CBOARDS_PCI_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{PCI_VENDOR_ID_CBOARDS, PCI_DEVICE_ID_CBOARDS_CPCI_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{PCI_VENDOR_ID_QUANCOM, PCI_DEVICE_ID_QUANCOM_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ 0 }
+};
+MODULE_DEVICE_TABLE(pci, cb7210_pci_table);
+
+/* PCI driver description; no .remove callback is provided. */
+static struct pci_driver cb7210_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = cb7210_pci_table,
+	.probe = &cb7210_pci_probe
+};
+
+/***************************************************************************
+ *  Support for computer boards pcmcia-gpib card
+ *
+ *  Based on gpib PCMCIA client driver written by Claus Schroeter
+ *  (clausi@chemie.fu-berlin.de), which was adapted from the
+ *  pcmcia skeleton example (presumably David Hinds)
+ ***************************************************************************/
+
+#ifdef CONFIG_GPIB_PCMCIA
+
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+
+#include <pcmcia/cistpl.h>
+#include <pcmcia/ds.h>
+
+/*
+ * The event() function is this driver's Card Services event handler.
+ * It will be called by Card Services when an appropriate card status
+ * event is received.  The config() and release() entry points are
+ * used to configure or release a socket, in response to card insertion
+ * and ejection events.	 They are invoked from the gpib event
+ * handler.
+ */
+
+static int cb_gpib_config(struct pcmcia_device	*link);
+static void cb_gpib_release(struct pcmcia_device  *link);
+static int cb_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static void cb_pcmcia_detach(struct gpib_board *board);
+
+/*
+ * The PCMCIA device most recently seen by cb_gpib_probe().
+ *
+ * This is a single pointer rather than a list, so only one card instance
+ * is remembered at a time.
+ */
+
+static	struct pcmcia_device  *curr_dev;
+
+/*
+ *  A dev_link_t structure has fields for most things that are needed
+ *  to keep track of a socket, but there will usually be some device
+ *  specific information that also needs to be kept track of.  The
+ *  'priv' pointer in a dev_link_t structure can be used to point to
+ *  a device-specific private data structure, like this.
+ *
+ *  A driver needs to provide a dev_node_t structure for each device
+ *  on a card.	In some cases, there is only one device per card (for
+ *  example, ethernet cards, modems).  In other cases, there may be
+ *  many actual or logical devices (SCSI adapters, memory cards with
+ *  multiple partitions).  The dev_node_t structures need to be kept
+ *  in a linked list starting at the 'dev' field of a dev_link_t
+ *  structure.	We allocate them in the card's private data structure,
+ * because they generally can't be allocated dynamically.
+ */
+
+/* Per-card private data, stored in pcmcia_device->priv by cb_gpib_probe(). */
+struct local_info {
+	/* the PCMCIA device this instance belongs to */
+	struct pcmcia_device	*p_dev;
+	/* gpib board using the card; when non-NULL it is detached on removal */
+	struct gpib_board		*dev;
+};
+
+/*
+ *  cb_gpib_probe() creates an "instance" of the driver for one PCMCIA
+ *  card: it allocates the local data, describes the two 16 port I/O
+ *  windows the card needs and configures the socket straight away
+ *  through cb_gpib_config().
+ *
+ *  Returns 0 on success, -ENOMEM if the local data cannot be allocated,
+ *  or the error from cb_gpib_config(), in which case the local data is
+ *  freed again.
+ */
+
+static int cb_gpib_probe(struct pcmcia_device *link)
+{
+	struct local_info *info;
+	int ret;
+
+	/* Allocate space for private device-specific data */
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->p_dev = link;
+	link->priv = info;
+
+	/* The io structure describes IO port mapping */
+	link->resource[0]->end = 16;
+	link->resource[0]->flags &= ~IO_DATA_PATH_WIDTH;
+	link->resource[0]->flags |= IO_DATA_PATH_WIDTH_AUTO;
+	link->resource[1]->end = 16;
+	link->resource[1]->flags &= ~IO_DATA_PATH_WIDTH;
+	link->resource[1]->flags |= IO_DATA_PATH_WIDTH_16;
+	link->io_lines = 10;
+
+	/* General socket configuration */
+	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
+	link->config_index = 1;
+	link->config_regs = PRESENT_OPTION;
+
+	/* Register with Card Services */
+	curr_dev = link;
+	ret = cb_gpib_config(link);
+	if (ret)
+		goto free_info;
+
+	return 0;
+
+free_info:
+	kfree(info);
+	return ret;
+}
+
+/*
+ *   This deletes a driver "instance": a gpib board still using the card
+ *   is detached, the PCMCIA device is disabled and the local data is
+ *   freed.
+ */
+
+static void cb_gpib_remove(struct pcmcia_device *link)
+{
+	struct local_info *info = link->priv;
+
+	if (info->dev)
+		cb_pcmcia_detach(info->dev);
+	cb_gpib_release(link);
+
+	kfree(info);
+}
+
+/*
+ * Callback for pcmcia_loop_config(): try to reserve the I/O windows for
+ * @link.  @priv_data is unused.  Returns the result of pcmcia_request_io().
+ */
+static int cb_gpib_config_iteration(struct pcmcia_device *link, void *priv_data)
+{
+	return pcmcia_request_io(link);
+}
+
+/*
+ *   gpib_config() is scheduled to run after a CARD_INSERTION event
+ *   is received, to configure the PCMCIA socket, and to make the
+ *   ethernet device available to the system.
+ */
+
+static int cb_gpib_config(struct pcmcia_device *link)
+{
+	int err;
+
+	err = pcmcia_loop_config(link, &cb_gpib_config_iteration, NULL);
+	if (err) {
+		dev_warn(&link->dev, "no configuration found\n");
+		goto disable;
+	}
+
+	/* This actually configures the socket: I/O windows and interrupt mapping */
+	err = pcmcia_enable_device(link);
+	if (err) {
+		dev_warn(&link->dev, "pcmcia_enable_device failed\n");
+		goto disable;
+	}
+
+	return 0;
+
+disable:
+	/* both failures disable the device and are reported as -ENODEV */
+	cb_gpib_release(link);
+	return -ENODEV;
+} /* cb_gpib_config */
+
+/*
+ * cb_gpib_release() releases the PCMCIA configuration by disabling
+ * the device.  It is called from the error paths of cb_gpib_config()
+ * and from cb_gpib_remove().
+ */
+
+static void cb_gpib_release(struct pcmcia_device *link)
+{
+	pcmcia_disable_device(link);	/* the only teardown performed here */
+}
+
+static int cb_gpib_suspend(struct pcmcia_device *link)
+{
+	// Suspend callback of cb_gpib_cs_driver.  Nothing is saved or shut
+	// down here; the only action is a warning when link->open is set.
+
+	if (link->open)
+		dev_warn(&link->dev, "Device still open\n");
+	// success is reported unconditionally
+
+	return 0;
+}
+
+static int cb_gpib_resume(struct pcmcia_device *link)
+{
+	// Resume callback of cb_gpib_cs_driver: it runs the same socket
+	// configuration step that cb_gpib_probe() uses.
+
+	/*
+	 * No board state is restored here.  When cb_gpib_config() fails
+	 * it has already disabled the device again and the result is
+	 * -ENODEV; otherwise 0 is returned.
+	 */
+	return cb_gpib_config(link);
+}
+
+/*====================================================================*/
+
+static const struct pcmcia_device_id cb_pcmcia_ids[] = {	/* never modified at run time */
+	PCMCIA_DEVICE_MANF_CARD(0x01c5, 0x0005),
+	PCMCIA_DEVICE_NULL
+};
+MODULE_DEVICE_TABLE(pcmcia, cb_pcmcia_ids);
+
+static struct pcmcia_driver cb_gpib_cs_driver = {	/* registered in cb7210_init_module() */
+	.name           = "cb_gpib_cs",
+	.owner		= THIS_MODULE,
+	.id_table	= cb_pcmcia_ids,
+	.probe		= cb_gpib_probe,
+	.remove		= cb_gpib_remove,
+	.suspend	= cb_gpib_suspend,
+	.resume		= cb_gpib_resume,
+};
+
+static void cb_pcmcia_cleanup_module(void)
+{
+	pcmcia_unregister_driver(&cb_gpib_cs_driver);	/* called from cb7210_exit_module() */
+}
+
+static struct gpib_interface cb_pcmcia_unaccel_interface = {
+	.name = "cbi_pcmcia_unaccel",
+	.attach = cb_pcmcia_attach,
+	.detach = cb_pcmcia_detach,
+	.read = cb7210_read,	// plain (non-"accel") transfer routines
+	.write = cb7210_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX: no handler is provided
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+static struct gpib_interface cb_pcmcia_interface = {
+	.name = "cbi_pcmcia",
+	.attach = cb_pcmcia_attach,
+	.detach = cb_pcmcia_detach,
+	.read = cb7210_accel_read,	// same "accel" routines as cb_pcmcia_accel_interface
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX: no handler is provided
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+static struct gpib_interface cb_pcmcia_accel_interface = {
+	.name = "cbi_pcmcia_accel",
+	.attach = cb_pcmcia_attach,
+	.detach = cb_pcmcia_detach,
+	.read = cb7210_accel_read,	// differs from the "unaccel" table only in read/write
+	.write = cb7210_accel_write,
+	.command = cb7210_command,
+	.take_control = cb7210_take_control,
+	.go_to_standby = cb7210_go_to_standby,
+	.request_system_control = cb7210_request_system_control,
+	.interface_clear = cb7210_interface_clear,
+	.remote_enable = cb7210_remote_enable,
+	.enable_eos = cb7210_enable_eos,
+	.disable_eos = cb7210_disable_eos,
+	.parallel_poll = cb7210_parallel_poll,
+	.parallel_poll_configure = cb7210_parallel_poll_configure,
+	.parallel_poll_response = cb7210_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX: no handler is provided
+	.line_status = cb7210_line_status,
+	.update_status = cb7210_update_status,
+	.primary_address = cb7210_primary_address,
+	.secondary_address = cb7210_secondary_address,
+	.serial_poll_response = cb7210_serial_poll_response,
+	.serial_poll_status = cb7210_serial_poll_status,
+	.t1_delay = cb7210_t1_delay,
+	.return_to_local = cb7210_return_to_local,
+};
+
+static int cb_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct cb7210_priv *cb_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	if (!curr_dev) {
+		dev_err(board->gpib_dev, "no cb pcmcia cards found\n");
+		return -ENODEV;
+	}
+
+	retval = cb7210_generic_attach(board);
+	if (retval)
+		return retval;
+
+	cb_priv = board->private_data;
+	nec_priv = &cb_priv->nec7210_priv;
+
+	/* claim exactly cb7210_iosize ports: that is what cb_pcmcia_detach() releases */
+	if (!request_region(curr_dev->resource[0]->start, cb7210_iosize, DRV_NAME)) {
+		dev_err(board->gpib_dev, "ioports starting at 0x%lx are already in use\n",
+			(unsigned long)curr_dev->resource[0]->start);
+		return -EBUSY;
+	}
+	nec_priv->iobase = curr_dev->resource[0]->start;	/* non-zero: region is held */
+	cb_priv->fifo_iobase = curr_dev->resource[0]->start;
+
+	if (request_irq(curr_dev->irq, cb7210_interrupt, IRQF_SHARED, DRV_NAME, board)) {
+		dev_err(board->gpib_dev, "failed to request IRQ %d\n", curr_dev->irq);
+		return -EBUSY;
+	}
+	cb_priv->irq = curr_dev->irq;	/* non-zero: cb_pcmcia_detach() will free_irq() */
+
+	return cb7210_init(cb_priv, board);
+}
+
+static void cb_pcmcia_detach(struct gpib_board *board)
+{
+	struct cb7210_priv *cb = board->private_data;
+
+	if (cb) {	/* without private data only the generic detach runs */
+		struct nec7210_priv *nec = &cb->nec7210_priv;
+
+		gpib_free_pseudo_irq(board);
+		if (cb->irq)
+			free_irq(cb->irq, board);
+		if (nec->iobase) {
+			nec7210_board_reset(nec, board);
+			release_region(nec7210_iobase(cb), cb7210_iosize);
+		}
+	}
+	cb7210_generic_detach(board);
+}
+
+#endif /* CONFIG_GPIB_PCMCIA */
+
+static int __init cb7210_init_module(void)
+{
+	int ret;	/* result of the most recent registration call */
+
+	ret = pci_register_driver(&cb7210_pci_driver);
+	if (ret) {
+		pr_err("pci_register_driver failed: error = %d\n", ret);
+		return ret;	/* nothing registered yet, nothing to unwind */
+	}
+
+	ret = gpib_register_driver(&cb_pci_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci;
+	}
+
+	ret = gpib_register_driver(&cb_isa_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_isa;
+	}
+
+	ret = gpib_register_driver(&cb_pci_accel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci_accel;
+	}
+
+	ret = gpib_register_driver(&cb_pci_unaccel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci_unaccel;
+	}
+
+	ret = gpib_register_driver(&cb_isa_accel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_isa_accel;
+	}
+
+	ret = gpib_register_driver(&cb_isa_unaccel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_isa_unaccel;
+	}
+
+#ifdef CONFIG_GPIB_PCMCIA	/* three PCMCIA interfaces, then the PCMCIA driver itself */
+	ret = gpib_register_driver(&cb_pcmcia_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia;
+	}
+
+	ret = gpib_register_driver(&cb_pcmcia_accel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_accel;
+	}
+
+	ret = gpib_register_driver(&cb_pcmcia_unaccel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_unaccel;
+	}
+
+	ret = pcmcia_register_driver(&cb_gpib_cs_driver);
+	if (ret) {
+		pr_err("pcmcia_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_driver;
+	}
+#endif
+
+	return 0;
+
+#ifdef CONFIG_GPIB_PCMCIA	/* unwind: each label undoes the steps that succeeded before it */
+err_pcmcia_driver:
+	gpib_unregister_driver(&cb_pcmcia_unaccel_interface);
+err_pcmcia_unaccel:
+	gpib_unregister_driver(&cb_pcmcia_accel_interface);
+err_pcmcia_accel:
+	gpib_unregister_driver(&cb_pcmcia_interface);
+err_pcmcia:
+#endif
+	gpib_unregister_driver(&cb_isa_unaccel_interface);
+err_isa_unaccel:
+	gpib_unregister_driver(&cb_isa_accel_interface);
+err_isa_accel:
+	gpib_unregister_driver(&cb_pci_unaccel_interface);
+err_pci_unaccel:
+	gpib_unregister_driver(&cb_pci_accel_interface);
+err_pci_accel:
+	gpib_unregister_driver(&cb_isa_interface);
+err_isa:
+	gpib_unregister_driver(&cb_pci_interface);
+err_pci:
+	pci_unregister_driver(&cb7210_pci_driver);
+
+	return ret;	/* error code of the registration that failed */
+}
+
+static void __exit cb7210_exit_module(void)
+{
+	gpib_unregister_driver(&cb_pci_interface);	/* same interfaces as registered at init */
+	gpib_unregister_driver(&cb_isa_interface);
+	gpib_unregister_driver(&cb_pci_accel_interface);
+	gpib_unregister_driver(&cb_pci_unaccel_interface);
+	gpib_unregister_driver(&cb_isa_accel_interface);
+	gpib_unregister_driver(&cb_isa_unaccel_interface);
+#ifdef CONFIG_GPIB_PCMCIA
+	gpib_unregister_driver(&cb_pcmcia_interface);
+	gpib_unregister_driver(&cb_pcmcia_accel_interface);
+	gpib_unregister_driver(&cb_pcmcia_unaccel_interface);
+	cb_pcmcia_cleanup_module();	/* unregisters cb_gpib_cs_driver */
+#endif
+
+	pci_unregister_driver(&cb7210_pci_driver);	/* registered first, removed last */
+}
+
+module_init(cb7210_init_module);
+module_exit(cb7210_exit_module);
diff --git a/drivers/gpib/cb7210/cb7210.h b/drivers/gpib/cb7210/cb7210.h
new file mode 100644
index 000000000000..ddc841ff87ae
--- /dev/null
+++ b/drivers/gpib/cb7210/cb7210.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#include "nec7210.h"
+#include "gpibP.h"
+#include "amccs5933.h"
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+
+enum {	// PCI device IDs, going by the constant names, of the PCI and CPCI GPIB boards
+	PCI_DEVICE_ID_CBOARDS_PCI_GPIB = 0x6,
+	PCI_DEVICE_ID_CBOARDS_CPCI_GPIB = 0xe,
+};
+
+enum pci_chip {	// type of the pci_chip member of struct cb7210_priv
+	PCI_CHIP_NONE = 0,	// also the value found in zero-initialised private data
+	PCI_CHIP_AMCC_S5933,
+	PCI_CHIP_QUANCOM
+};
+
+// private_data for cb7210 boards (board->private_data points at one of these)
+struct cb7210_priv {
+	struct nec7210_priv nec7210_priv;	// embedded nec7210 state: iobase, offset, register_page_lock
+	struct pci_dev *pci_device;
+	// base address of amccs5933 pci chip
+	unsigned long amcc_iobase;
+	unsigned long fifo_iobase;	// the PCMCIA attach sets this to the same base as nec7210_priv.iobase
+	unsigned int irq;	// recorded after request_irq() succeeds; detach frees it when non-zero
+	enum pci_chip pci_chip;
+	u8 hs_mode_bits;	// presumably a software copy of the HS_MODE register value -- confirm
+	unsigned out_fifo_half_empty : 1;
+	unsigned in_fifo_half_full : 1;
+};
+
+// pci-gpib register offset (spacing between consecutive registers, in ports)
+static const int cb7210_reg_offset = 1;
+
+// number of ioports used (10); cb_pcmcia_detach() releases a region of this size
+static const int cb7210_iosize = 10;
+
+// fifo size in bytes
+static const int cb7210_fifo_size = 2048;
+static const int cb7210_fifo_width = 2;	// presumably also in bytes -- confirm
+
+// cb7210 specific registers and bits (register numbers, not port addresses)
+enum cb7210_regs {
+	BUS_STATUS = 0x7,	// presumably read with page BUS_STATUS_PAGE selected -- confirm
+};
+
+enum cb7210_page_in {	// page numbers, as taken by the "page" argument of the paged accessors
+	BUS_STATUS_PAGE = 1,
+};
+
+enum hs_regs {	// register numbers accepted by cb7210_read_byte()/cb7210_write_byte()
+	// write registers
+	HS_MODE = 0x8,	/* HS_MODE register */
+	HS_INT_LEVEL = 0x9,	/* HS_INT_LEVEL register */
+	// read registers
+	HS_STATUS = 0x8,	/* HS_STATUS register, same number as HS_MODE */
+};
+
+static inline u32 nec7210_iobase(const struct cb7210_priv *cb_priv)
+{
+	return cb_priv->nec7210_priv.iobase;	// port base stored in the embedded nec7210 state
+}
+
+static inline int cb7210_page_in_bits(unsigned int page)
+{
+	return 0x50 | (page & 0xf);	// low nibble carries the page; callers write the result to AUXMR
+}
+
+static inline u8 cb7210_paged_read_byte(struct cb7210_priv *cb_priv,
+					unsigned int register_num, unsigned int page)
+{
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	u8 retval;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, flags);	// page select and read stay together
+	outb(cb7210_page_in_bits(page), nec7210_iobase(cb_priv) + AUXMR * nec_priv->offset);
+	udelay(1);	// 1 us between the AUXMR write and the register read
+	retval = inb(nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
+	return retval;
+}
+
+// don't use for register_num < 8, since it doesn't lock
+static inline u8 cb7210_read_byte(const struct cb7210_priv *cb_priv,
+				  enum hs_regs register_num)
+{
+	const struct nec7210_priv *nec = &cb_priv->nec7210_priv;
+
+	/* direct port read: no page selection and no locking */
+	return inb(nec7210_iobase(cb_priv) + register_num * nec->offset);
+	/* the port value is narrowed to u8 by the return type */
+}
+
+static inline void cb7210_paged_write_byte(struct cb7210_priv *cb_priv, u8 data,
+					   unsigned int register_num, unsigned int page)
+{
+	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, flags);	// page select and write stay together
+	outb(cb7210_page_in_bits(page), nec7210_iobase(cb_priv) + AUXMR * nec_priv->offset);
+	udelay(1);	// 1 us between the AUXMR write and the data write
+	outb(data, nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
+}
+
+// don't use for register_num < 8, since it doesn't lock (or select a page)
+static inline void cb7210_write_byte(const struct cb7210_priv *cb_priv, u8 data,
+				     enum hs_regs register_num)
+{
+	const struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
+
+	outb(data, nec7210_iobase(cb_priv) + register_num * nec_priv->offset);	// port = base + num * offset
+}
+
+enum bus_status_bits {	// one mask per bus line; presumably bits of the BUS_STATUS register -- confirm
+	BSR_ATN_BIT = 0x1,
+	BSR_EOI_BIT = 0x2,
+	BSR_SRQ_BIT = 0x4,
+	BSR_IFC_BIT = 0x8,
+	BSR_REN_BIT = 0x10,
+	BSR_DAV_BIT = 0x20,
+	BSR_NRFD_BIT = 0x40,
+	BSR_NDAC_BIT = 0x80,
+};
+
+/* CBI 488.2 HS control */
+
+/*
+ * when both bit 0 and 1 are set, it
+ *   1 clears the transmit state machine to an initial condition
+ *   2 clears any residual interrupts left latched on cbi488.2
+ *   3 resets all control bits in HS_MODE to zero
+ *   4 enables TX empty interrupts
+ * when both bit 0 and 1 are zero, then the high speed mode is disabled
+ */
+enum hs_mode_bits {
+	HS_ENABLE_MASK = 0x3,	// equals HS_TX_ENABLE | HS_RX_ENABLE
+	HS_TX_ENABLE = (1 << 0),
+	HS_RX_ENABLE = (1 << 1),
+	HS_HF_INT_EN = (1 << 3),	// bit 2 has no name here
+	HS_CLR_SRQ_INT = (1 << 4),
+	HS_CLR_EOI_EMPTY_INT = (1 << 5),
+	HS_CLR_HF_INT = (1 << 6),
+	HS_SYS_CONTROL = (1 << 7),
+};
+
+/* CBI 488.2 status bits; presumably those of the HS_STATUS read register -- confirm */
+enum hs_status_bits {
+	HS_FIFO_FULL = (1 << 0),
+	HS_HALF_FULL = (1 << 1),
+	HS_SRQ_INT = (1 << 2),
+	HS_EOI_INT = (1 << 3),
+	HS_TX_MSB_NOT_EMPTY = (1 << 4),
+	HS_RX_MSB_NOT_EMPTY = (1 << 5),
+	HS_TX_LSB_NOT_EMPTY = (1 << 6),
+	HS_RX_LSB_NOT_EMPTY = (1 << 7),
+};
+
+/* CBI488.2 hs_int_level register bits (see also irq_bits() below) */
+enum hs_int_level_bits {
+	HS_RESET7210 = (1 << 7),	// the only bit named here
+};
+
+static inline unsigned int irq_bits(unsigned int irq)
+{
+	switch (irq) {	// IRQ number -> 3-bit code; presumably the board's IRQ select field -- confirm
+	case 2:
+	case 3:
+	case 4:
+	case 5:
+		return irq - 1;	// IRQ 2..5 map to 1..4
+	case 7:
+		return 0x5;
+	case 10:
+		return 0x6;
+	case 11:
+		return 0x7;
+	default:
+		return 0;	// every other IRQ number yields 0
+	}
+}
+
+enum cb7210_aux_cmds {
+/*
+ * AUX_RTL2 is an undocumented aux command which makes the cb7210 assert
+ * the local rtl message and keep it asserted.  It is used together with
+ * the cb7210's own implementation of the normal nec7210 AUX_RTL
+ * aux command, which
+ * only toggles the rtl message between on and off.
+ */
+	AUX_RTL2 = 0xd,
+	AUX_LO_SPEED = 0x40,
+	AUX_HI_SPEED = 0x41,
+};
diff --git a/drivers/gpib/cec/Makefile b/drivers/gpib/cec/Makefile
new file mode 100644
index 000000000000..b7141e23d4e0
--- /dev/null
+++ b/drivers/gpib/cec/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_GPIB_CEC_PCI) += cec_gpib.o
+
diff --git a/drivers/gpib/cec/cec.h b/drivers/gpib/cec/cec.h
new file mode 100644
index 000000000000..3ce2869c7429
--- /dev/null
+++ b/drivers/gpib/cec/cec.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#include "nec7210.h"
+#include "gpibP.h"
+#include "plx9050.h"
+
+struct cec_priv  {	// board->private_data of the CEC driver
+	struct nec7210_priv nec7210_priv;	// state handed to the nec7210_*() helpers
+	struct pci_dev *pci_device;	// reference obtained in cec_pci_attach(), dropped in detach
+	// base address for plx9052 pci chip
+	unsigned long plx_iobase;
+	unsigned int irq;	// set after request_irq() succeeds; detach frees it when non-zero
+};
+
+// offset between consecutive nec7210 registers; copied to nec_priv->offset at attach
+static const int cec_reg_offset = 1;
diff --git a/drivers/gpib/cec/cec_gpib.c b/drivers/gpib/cec/cec_gpib.c
new file mode 100644
index 000000000000..dbf9b95baabc
--- /dev/null
+++ b/drivers/gpib/cec/cec_gpib.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *   copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "cec.h"
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+#include <asm/dma.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for CEC PCI and PCMCIA boards");
+
+/*
+ * GPIB interrupt service routines
+ */
+
+static irqreturn_t cec_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;	/* the cookie cec_pci_attach() gave to request_irq() */
+	struct cec_priv *cec = board->private_data;
+	unsigned long irq_state;
+	irqreturn_t handled;
+
+	spin_lock_irqsave(&board->spinlock, irq_state);	/* nec7210 handler runs locked */
+	handled = nec7210_interrupt(board, &cec->nec7210_priv);
+	spin_unlock_irqrestore(&board->spinlock, irq_state);
+	return handled;
+}
+
+#define CEC_VENDOR_ID 0x12fc
+#define CEC_DEV_ID    0x5cec
+#define CEC_SUBID 0x9050
+
+static int cec_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
+
+static void cec_pci_detach(struct gpib_board *board);
+
+// wrappers for interface functions
+static int cec_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+		    size_t *bytes_read)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_read(board, &cec->nec7210_priv, buffer, length, end, bytes_read);
+}
+
+static int cec_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+		     size_t *bytes_written)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_write(board, &cec->nec7210_priv, buffer, length, send_eoi, bytes_written);
+}
+
+static int cec_command(struct gpib_board *board, u8 *buffer,
+		       size_t length, size_t *bytes_written)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_command(board, &cec->nec7210_priv, buffer, length, bytes_written);
+}
+
+static int cec_take_control(struct gpib_board *board, int synchronous)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_take_control(board, &cec->nec7210_priv, synchronous);
+}
+
+static int cec_go_to_standby(struct gpib_board *board)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_go_to_standby(board, &cec->nec7210_priv);
+}
+
+static int cec_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_request_system_control(board, &cec->nec7210_priv, request_control);
+}
+
+static void cec_interface_clear(struct gpib_board *board, int assert)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_interface_clear(board, &cec->nec7210_priv, assert);
+}
+
+static void cec_remote_enable(struct gpib_board *board, int enable)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_remote_enable(board, &cec->nec7210_priv, enable);
+}
+
+static int cec_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_enable_eos(board, &cec->nec7210_priv, eos_byte, compare_8_bits);
+}
+
+static void cec_disable_eos(struct gpib_board *board)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_disable_eos(board, &cec->nec7210_priv);
+}
+
+static unsigned int cec_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_update_status(board, &cec->nec7210_priv, clear_mask);
+}
+
+static int cec_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_primary_address(board, &cec->nec7210_priv, address);
+}
+
+static int cec_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_secondary_address(board, &cec->nec7210_priv, address, enable);
+}
+
+static int cec_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_parallel_poll(board, &cec->nec7210_priv, result);
+}
+
+static void cec_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_parallel_poll_configure(board, &cec->nec7210_priv, config);
+}
+
+static void cec_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_parallel_poll_response(board, &cec->nec7210_priv, ist);
+}
+
+static void cec_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_serial_poll_response(board, &cec->nec7210_priv, status);
+}
+
+static u8 cec_serial_poll_status(struct gpib_board *board)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_serial_poll_status(board, &cec->nec7210_priv);
+}
+
+static int cec_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	return nec7210_t1_delay(board, &cec->nec7210_priv, nano_sec);
+}
+
+static void cec_return_to_local(struct gpib_board *board)
+{
+	struct cec_priv *cec = board->private_data;	/* per-board CEC state */
+
+	nec7210_return_to_local(board, &cec->nec7210_priv);
+}
+
+static struct gpib_interface cec_pci_interface = {	// registered in cec_init_module()
+	.name = "cec_pci",
+	.attach = cec_pci_attach,
+	.detach = cec_pci_detach,
+	.read = cec_read,	// the cec_*() entries are thin wrappers around nec7210_*()
+	.write = cec_write,
+	.command = cec_command,
+	.take_control = cec_take_control,
+	.go_to_standby = cec_go_to_standby,
+	.request_system_control = cec_request_system_control,
+	.interface_clear = cec_interface_clear,
+	.remote_enable = cec_remote_enable,
+	.enable_eos = cec_enable_eos,
+	.disable_eos = cec_disable_eos,
+	.parallel_poll = cec_parallel_poll,
+	.parallel_poll_configure = cec_parallel_poll_configure,
+	.parallel_poll_response = cec_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX: no handler is provided
+	.line_status = NULL,	// XXX: no handler is provided
+	.update_status = cec_update_status,
+	.primary_address = cec_primary_address,
+	.secondary_address = cec_secondary_address,
+	.serial_poll_response = cec_serial_poll_response,
+	.serial_poll_status = cec_serial_poll_status,
+	.t1_delay = cec_t1_delay,
+	.return_to_local = cec_return_to_local,
+};
+
+static int cec_allocate_private(struct gpib_board *board)
+{
+	struct cec_priv *priv;
+
+	/* zeroed allocation; board->private_data ends up NULL on failure */
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+	init_nec7210_private(&priv->nec7210_priv);
+	return 0;
+}
+
+static void cec_free_private(struct gpib_board *board)
+{
+	kfree(board->private_data);
+	board->private_data = NULL;	// later detach calls then see no private data
+}
+
+static int cec_generic_attach(struct gpib_board *board)
+{
+	struct cec_priv *cec;
+	struct nec7210_priv *nec;
+
+	board->status = 0;
+
+	if (cec_allocate_private(board) != 0)
+		return -ENOMEM;
+	cec = board->private_data;
+	nec = &cec->nec7210_priv;
+	nec->type = NEC7210;	// guess
+	nec->offset = cec_reg_offset;
+	nec->read_byte = nec7210_ioport_read_byte;	/* ioport accessors, going by their names */
+	nec->write_byte = nec7210_ioport_write_byte;
+	return 0;
+}
+
+static void cec_init(struct cec_priv *cec_priv, const struct gpib_board *board)
+{
+	struct nec7210_priv *nec_priv = &cec_priv->nec7210_priv;
+
+	nec7210_board_reset(nec_priv, board);	/* first step: reset the chip */
+
+	/* set internal counter register for 8 MHz input clock */
+	write_byte(nec_priv, ICR | 8, AUXMR);
+
+	nec7210_board_online(nec_priv, board);	/* last step, after the clock setting */
+}
+
+static int cec_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct cec_priv *cec_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	/* allocates board->private_data and sets up the nec7210 accessors */
+	retval = cec_generic_attach(board);
+	if (retval)
+		return retval;
+
+	cec_priv = board->private_data;
+	nec_priv = &cec_priv->nec7210_priv;
+
+	// find board
+	cec_priv->pci_device = NULL;
+	while ((cec_priv->pci_device =
+		gpib_pci_get_device(config, CEC_VENDOR_ID,
+				    CEC_DEV_ID, cec_priv->pci_device)))	{
+		// check for board with plx9050 controller
+		if (cec_priv->pci_device->subsystem_device == CEC_SUBID)
+			break;
+	}
+	if (!cec_priv->pci_device) {
+		dev_err(board->gpib_dev, "no cec PCI board found\n");
+		return -ENODEV;
+	}
+
+	if (pci_enable_device(cec_priv->pci_device)) {
+		dev_err(board->gpib_dev, "error enabling pci device\n");
+		return -EIO;
+	}
+
+	if (pci_request_regions(cec_priv->pci_device, "cec-gpib"))
+		return -EBUSY;
+
+	cec_priv->plx_iobase = pci_resource_start(cec_priv->pci_device, 1);
+	nec_priv->iobase = pci_resource_start(cec_priv->pci_device, 3);
+
+	/* cec_pci_detach() keys its cleanup on iobase and irq being non-zero */
+	if (request_irq(cec_priv->pci_device->irq, cec_interrupt, IRQF_SHARED, DRV_NAME, board)) {
+		dev_err(board->gpib_dev, "failed to obtain IRQ %d\n", cec_priv->pci_device->irq);
+		return -EBUSY;
+	}
+	cec_priv->irq = cec_priv->pci_device->irq;
+	if (gpib_request_pseudo_irq(board, cec_interrupt)) {
+		dev_err(board->gpib_dev, "failed to allocate pseudo irq\n");
+		return -EBUSY;	/* a real errno instead of a bare -1 */
+	}
+	cec_init(cec_priv, board);
+
+	// enable interrupts on plx chip
+	outl(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR1_POLARITY_BIT | PLX9050_PCI_INTR_EN_BIT,
+	     cec_priv->plx_iobase + PLX9050_INTCSR_REG);
+
+	return 0;
+}
+
+static void cec_pci_detach(struct gpib_board *board)
+{
+	struct cec_priv *cec = board->private_data;
+
+	if (cec) {
+		struct nec7210_priv *nec = &cec->nec7210_priv;
+
+		gpib_free_pseudo_irq(board);
+		if (cec->irq) {
+			/* mask the plx9050 interrupt sources before dropping the handler */
+			outl(0, cec->plx_iobase + PLX9050_INTCSR_REG);
+			free_irq(cec->irq, board);
+		}
+		if (nec->iobase) {
+			nec7210_board_reset(nec, board);
+			pci_release_regions(cec->pci_device);
+		}
+		if (cec->pci_device)
+			pci_dev_put(cec->pci_device);
+	}
+	cec_free_private(board);
+}
+
+static int cec_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	return 0;	/* no per-device setup here; the board is located in cec_pci_attach() */
+}
+
+static const struct pci_device_id cec_pci_table[] = {	// the IDs cec_pci_attach() searches for
+	{CEC_VENDOR_ID, CEC_DEV_ID, PCI_ANY_ID, CEC_SUBID, 0, 0, 0 },
+	{0}	// all-zero terminating entry
+};
+MODULE_DEVICE_TABLE(pci, cec_pci_table);
+
+static struct pci_driver cec_pci_driver = {	// registered in cec_init_module()
+	.name = DRV_NAME,
+	.id_table = cec_pci_table,
+	.probe = &cec_pci_probe	// no .remove callback is set
+};
+
+static int __init cec_init_module(void)
+{
+	int result;
+
+	result = pci_register_driver(&cec_pci_driver);
+	if (result) {
+		pr_err("pci_register_driver failed: error = %d\n", result);
+		return result;
+	}
+
+	result = gpib_register_driver(&cec_pci_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		pci_unregister_driver(&cec_pci_driver);	/* don't leave it registered */
+	}
+
+	return result;	/* 0 when both registrations succeeded */
+}
+
+static void __exit cec_exit_module(void)
+{
+	gpib_unregister_driver(&cec_pci_interface);	/* reverse order of cec_init_module() */
+
+	pci_unregister_driver(&cec_pci_driver);
+}
+
+module_init(cec_init_module);
+module_exit(cec_exit_module);
diff --git a/drivers/gpib/common/Makefile b/drivers/gpib/common/Makefile
new file mode 100644
index 000000000000..460586edb574
--- /dev/null
+++ b/drivers/gpib/common/Makefile
@@ -0,0 +1,6 @@
+
+obj-$(CONFIG_GPIB_COMMON) += gpib_common.o
+
+gpib_common-objs := gpib_os.o iblib.o
+
+
diff --git a/drivers/gpib/common/gpib_os.c b/drivers/gpib/common/gpib_os.c
new file mode 100644
index 000000000000..9dbbac8b8436
--- /dev/null
+++ b/drivers/gpib/common/gpib_os.c
@@ -0,0 +1,2271 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *    copyright            : (C) 2001, 2004 by Frank Mori Hess
+ ***************************************************************************
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+
+#include "ibsys.h"
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <linux/kmod.h>
+#include <linux/uaccess.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB base support");
+MODULE_ALIAS_CHARDEV_MAJOR(GPIB_CODE);
+
+static int board_type_ioctl(struct gpib_file_private *file_priv,
+			    struct gpib_board *board, unsigned long arg);
+static int read_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+		      unsigned long arg);
+static int write_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+		       unsigned long arg);
+static int command_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+			 unsigned long arg);
+static int open_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg);
+static int close_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg);
+static int serial_poll_ioctl(struct gpib_board *board, unsigned long arg);
+static int wait_ioctl(struct gpib_file_private *file_priv,
+		      struct gpib_board *board, unsigned long arg);
+static int parallel_poll_ioctl(struct gpib_board *board, unsigned long arg);
+static int online_ioctl(struct gpib_board *board, unsigned long arg);
+static int remote_enable_ioctl(struct gpib_board *board, unsigned long arg);
+static int take_control_ioctl(struct gpib_board *board, unsigned long arg);
+static int line_status_ioctl(struct gpib_board *board, unsigned long arg);
+static int pad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		     unsigned long arg);
+static int sad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		     unsigned long arg);
+static int eos_ioctl(struct gpib_board *board, unsigned long arg);
+static int request_service_ioctl(struct gpib_board *board, unsigned long arg);
+static int request_service2_ioctl(struct gpib_board *board, unsigned long arg);
+static int iobase_ioctl(struct gpib_board_config *config, unsigned long arg);
+static int irq_ioctl(struct gpib_board_config *config, unsigned long arg);
+static int dma_ioctl(struct gpib_board_config *config, unsigned long arg);
+static int autospoll_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+			   unsigned long arg);
+static int mutex_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		       unsigned long arg);
+static int timeout_ioctl(struct gpib_board *board, unsigned long arg);
+static int status_bytes_ioctl(struct gpib_board *board, unsigned long arg);
+static int board_info_ioctl(const struct gpib_board *board, unsigned long arg);
+static int ppc_ioctl(struct gpib_board *board, unsigned long arg);
+static int set_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg);
+static int get_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg);
+static int query_board_rsv_ioctl(struct gpib_board *board, unsigned long arg);
+static int interface_clear_ioctl(struct gpib_board *board, unsigned long arg);
+static int select_pci_ioctl(struct gpib_board_config *config, unsigned long arg);
+static int select_device_path_ioctl(struct gpib_board_config *config, unsigned long arg);
+static int event_ioctl(struct gpib_board *board, unsigned long arg);
+static int request_system_control_ioctl(struct gpib_board *board, unsigned long arg);
+static int t1_delay_ioctl(struct gpib_board *board, unsigned long arg);
+
+static int cleanup_open_devices(struct gpib_file_private *file_priv, struct gpib_board *board);
+
+static int pop_gpib_event_nolock(struct gpib_board *board,
+				 struct gpib_event_queue *queue, short *event_type);
+
+/*
+ * Timer functions
+ */
+
+/*
+ * Watchdog timer callback: set the TIMO_NUM bit in board->status and wake
+ * up anyone sleeping on board->wait.
+ */
+static void watchdog_timeout(struct timer_list *t)
+{
+	struct gpib_board *brd = timer_container_of(brd, t, timer);
+
+	set_bit(TIMO_NUM, &brd->status);
+	wake_up_interruptible(&brd->wait);
+}
+
+/*
+ * Start the board's timeout watchdog.
+ *
+ * The TIMO_NUM status bit is cleared and, when usec_timeout is non-zero,
+ * watchdog_timeout() is scheduled usec_to_jiffies(usec_timeout) ticks from
+ * now.  A timeout of zero only clears the bit.  If the timer is already
+ * pending, an error is logged and nothing else is done.
+ */
+void os_start_timer(struct gpib_board *board, unsigned int usec_timeout)
+{
+	struct timer_list *watchdog = &board->timer;
+
+	if (timer_pending(watchdog)) {
+		dev_err(board->gpib_dev, "bug! timer already running?\n");
+		return;
+	}
+	clear_bit(TIMO_NUM, &board->status);
+
+	if (usec_timeout == 0)
+		return;
+
+	watchdog->function = watchdog_timeout;
+	/* set number of ticks */
+	mod_timer(watchdog, jiffies + usec_to_jiffies(usec_timeout));
+}
+
+/* Stop the board's timeout watchdog if it is currently pending. */
+void os_remove_timer(struct gpib_board *board)
+{
+	if (!timer_pending(&board->timer))
+		return;
+
+	timer_delete_sync(&board->timer);
+}
+
+/* Return 1 if the TIMO_NUM bit is set in board->status, 0 otherwise. */
+int io_timed_out(struct gpib_board *board)
+{
+	return test_bit(TIMO_NUM, &board->status) ? 1 : 0;
+}
+
+/*
+ * Pseudo interrupt polling period in jiffies: HZ / 100, rounded up.
+ *
+ * This is a function rather than a constant because SUSE defines HZ as a
+ * function call to get_hz().
+ */
+static inline int pseudo_irq_period(void)
+{
+	return DIV_ROUND_UP(HZ, 100);
+}
+
+/*
+ * Timer callback emulating an interrupt: call the installed handler with
+ * irq number 0 and the owning board, then re-arm the timer for another
+ * period for as long as the pseudo irq is marked active.
+ */
+static void pseudo_irq_handler(struct timer_list *t)
+{
+	struct gpib_pseudo_irq *pirq = timer_container_of(pirq, t, timer);
+
+	if (!pirq->handler)
+		pr_err("gpib: bug! pseudo_irq.handler is NULL\n");
+	else
+		pirq->handler(0, pirq->board);
+
+	if (!atomic_read(&pirq->active))
+		return;
+
+	mod_timer(&pirq->timer, jiffies + pseudo_irq_period());
+}
+
+/*
+ * Install a timer-driven pseudo interrupt for a board.
+ *
+ * @handler is called from pseudo_irq_handler() once per
+ * pseudo_irq_period() jiffies until gpib_free_pseudo_irq() is called.
+ *
+ * Returns 0 on success, or -1 if the board already has a pseudo interrupt
+ * handler installed or its timer pending.
+ */
+int gpib_request_pseudo_irq(struct gpib_board *board, irqreturn_t (*handler)(int, void *))
+{
+	struct gpib_pseudo_irq *pirq = &board->pseudo_irq;
+
+	if (pirq->handler || timer_pending(&pirq->timer)) {
+		dev_err(board->gpib_dev, "only one pseudo interrupt per board allowed\n");
+		return -1;
+	}
+
+	pirq->board = board;
+	pirq->handler = handler;
+	pirq->timer.function = pseudo_irq_handler;
+
+	/* mark active before arming so the callback keeps re-arming itself */
+	atomic_set(&pirq->active, 1);
+	mod_timer(&pirq->timer, jiffies + pseudo_irq_period());
+
+	return 0;
+}
+EXPORT_SYMBOL(gpib_request_pseudo_irq);
+
+/*
+ * Remove a board's pseudo interrupt: mark it inactive so the callback
+ * stops re-arming, delete the timer synchronously and clear the handler.
+ */
+void gpib_free_pseudo_irq(struct gpib_board *board)
+{
+	struct gpib_pseudo_irq *pirq = &board->pseudo_irq;
+
+	atomic_set(&pirq->active, 0);
+	timer_delete_sync(&pirq->timer);
+
+	pirq->handler = NULL;
+}
+EXPORT_SYMBOL(gpib_free_pseudo_irq);
+
+/* usec_timeout value passed to serial_poll_all() by autopoll_all_devices() */
+static const unsigned int serial_timeout = 1000000;
+
+/* Number of queued status bytes for @dev; a NULL @dev counts as zero. */
+unsigned int num_status_bytes(const struct gpib_status_queue *dev)
+{
+	return dev ? dev->num_status_bytes : 0;
+}
+
+/*
+ * Push a status byte onto the back of the device's status byte fifo.
+ *
+ * The fifo holds at most max_num_status_bytes entries.  When it is full the
+ * oldest byte is discarded to make room and device->dropped_byte is set, so
+ * that the next pop_status_byte() call reports the loss.
+ *
+ * Returns 0 on success, -ENOMEM if the new entry cannot be allocated, or
+ * the error from pop_status_byte() if the oldest byte cannot be discarded.
+ */
+int push_status_byte(struct gpib_board *board, struct gpib_status_queue *device, u8 poll_byte)
+{
+	struct list_head *head = &device->status_bytes;
+	struct gpib_status_byte *status;
+	static const unsigned int max_num_status_bytes = 1024;
+	int retval;
+
+	if (num_status_bytes(device) >= max_num_status_bytes) {
+		u8 lost_byte;
+
+		/*
+		 * pop_status_byte() returns -EPIPE without removing anything
+		 * while dropped_byte is set, so the flag must be clear for the
+		 * discard and is only raised once the discard has been tried.
+		 */
+		device->dropped_byte = 0;
+		retval = pop_status_byte(board, device, &lost_byte);
+		device->dropped_byte = 1;
+		if (retval < 0)
+			return retval;
+	}
+
+	status = kmalloc(sizeof(*status), GFP_KERNEL);
+	if (!status)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&status->list);
+	status->poll_byte = poll_byte;
+
+	list_add_tail(&status->list, head);
+
+	device->num_status_bytes++;
+
+	dev_dbg(board->gpib_dev, "pushed status byte 0x%x, %i in queue\n",
+		(int)poll_byte, num_status_bytes(device));
+
+	return 0;
+}
+
+/*
+ * Pop the oldest status byte from the front of the device's status byte
+ * fifo.
+ *
+ * Returns 0 with the byte stored in *poll_byte, -EIO if the fifo is empty,
+ * or -EPIPE if device->dropped_byte was set.  In the -EPIPE case the flag
+ * is cleared and nothing is removed from the fifo.
+ */
+int pop_status_byte(struct gpib_board *board, struct gpib_status_queue *device, u8 *poll_byte)
+{
+	struct list_head *fifo = &device->status_bytes;
+	struct gpib_status_byte *oldest;
+
+	if (num_status_bytes(device) == 0 || list_empty(fifo))
+		return -EIO;
+
+	if (device->dropped_byte) {
+		device->dropped_byte = 0;
+		return -EPIPE;
+	}
+
+	oldest = list_first_entry(fifo, struct gpib_status_byte, list);
+	*poll_byte = oldest->poll_byte;
+
+	list_del(&oldest->list);
+	kfree(oldest);
+
+	device->num_status_bytes--;
+
+	dev_dbg(board->gpib_dev, "popped status byte 0x%x, %i in queue\n",
+		(int)*poll_byte, num_status_bytes(device));
+
+	return 0;
+}
+
+/*
+ * Look up the status queue on board->device_list whose address matches
+ * @pad / @sad according to gpib_address_equal().  Returns NULL when no
+ * entry matches.
+ */
+struct gpib_status_queue *get_gpib_status_queue(struct gpib_board *board, unsigned int pad, int sad)
+{
+	struct gpib_status_queue *queue;
+
+	list_for_each_entry(queue, &board->device_list, list) {
+		if (gpib_address_equal(queue->pad, queue->sad, pad, sad))
+			return queue;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fetch a serial poll byte for the device at @pad / @sad.  A byte already
+ * queued for that device is returned first; when none is queued, a serial
+ * poll is performed with dvrsp().
+ */
+int get_serial_poll_byte(struct gpib_board *board, unsigned int pad, int sad,
+			 unsigned int usec_timeout, u8 *poll_byte)
+{
+	struct gpib_status_queue *queue = get_gpib_status_queue(board, pad, sad);
+
+	if (!num_status_bytes(queue))
+		return dvrsp(board, pad, sad, usec_timeout, poll_byte);
+
+	return pop_status_byte(board, queue, poll_byte);
+}
+
+/*
+ * Serial poll every device on the board's device list while holding
+ * board->user_mutex and board->big_gpib_mutex (taken in that order).
+ *
+ * Returns the result of serial_poll_all(), or -ERESTARTSYS if taking either
+ * mutex was interrupted.  On success, sleepers on board->wait are woken.
+ */
+int autopoll_all_devices(struct gpib_board *board)
+{
+	int retval;
+
+	if (mutex_lock_interruptible(&board->user_mutex))
+		return -ERESTARTSYS;
+	if (mutex_lock_interruptible(&board->big_gpib_mutex)) {
+		mutex_unlock(&board->user_mutex);
+		return -ERESTARTSYS;
+	}
+
+	dev_dbg(board->gpib_dev, "autopoll has board lock\n");
+
+	retval = serial_poll_all(board, serial_timeout);
+	if (retval >= 0) {
+		dev_dbg(board->gpib_dev, "complete\n");
+		/* wake the wait queue in case someone is waiting on RQS */
+		wake_up_interruptible(&board->wait);
+	}
+
+	mutex_unlock(&board->big_gpib_mutex);
+	mutex_unlock(&board->user_mutex);
+
+	return retval;
+}
+
+/*
+ * Begin a serial poll sequence: take control with ibcac(), then send UNL,
+ * the board's own listen address (plus secondary address if it has one)
+ * and SPE.  The watchdog timer runs for the duration of the call.
+ *
+ * Returns 0 on success, the ibcac() error, or -EIO if the command bytes
+ * could not all be written.
+ */
+static int setup_serial_poll(struct gpib_board *board, unsigned int usec_timeout)
+{
+	u8 cmd_string[8];
+	size_t bytes_written;
+	int len = 0;
+	int ret;
+
+	os_start_timer(board, usec_timeout);
+
+	ret = ibcac(board, 1, 1);
+	if (ret < 0)
+		goto out;
+
+	cmd_string[len++] = UNL;
+	cmd_string[len++] = MLA(board->pad);	/* controller's listen address */
+	if (board->sad >= 0)
+		cmd_string[len++] = MSA(board->sad);
+	cmd_string[len++] = SPE;	/* serial poll enable */
+
+	ret = board->interface->command(board, cmd_string, len, &bytes_written);
+	if (ret < 0 || bytes_written < len) {
+		dev_dbg(board->gpib_dev, "failed to setup serial poll\n");
+		ret = -EIO;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	os_remove_timer(board);
+	return ret;
+}
+
+/*
+ * Read one serial poll response byte from the device at @pad / @sad.
+ *
+ * Takes control with ibcac(), sends the device's talk address (and
+ * secondary address when @sad >= 0), calls ibgts() and reads a single byte
+ * into *result.  The watchdog timer runs for the duration of the call.
+ *
+ * Returns 0 on success, the ibcac() error, or -EIO if addressing or the
+ * read failed.
+ */
+static int read_serial_poll_byte(struct gpib_board *board, unsigned int pad,
+				 int sad, unsigned int usec_timeout, u8 *result)
+{
+	u8 cmd_string[8];
+	size_t nbytes;
+	int end_flag;
+	int len = 0;
+	int ret;
+
+	dev_dbg(board->gpib_dev, "entering  pad=%i sad=%i\n", pad, sad);
+
+	os_start_timer(board, usec_timeout);
+
+	ret = ibcac(board, 1, 1);
+	if (ret < 0)
+		goto out;
+
+	/* send talk address */
+	cmd_string[len++] = MTA(pad);
+	if (sad >= 0)
+		cmd_string[len++] = MSA(sad);
+
+	ret = board->interface->command(board, cmd_string, len, &nbytes);
+	if (ret < 0 || nbytes < len) {
+		dev_err(board->gpib_dev, "failed to setup serial poll\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	ibgts(board);
+
+	/* read poll result */
+	ret = board->interface->read(board, result, 1, &end_flag, &nbytes);
+	if (ret < 0 || nbytes < 1) {
+		dev_err(board->gpib_dev, "serial poll failed\n");
+		ret = -EIO;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	os_remove_timer(board);
+	return ret;
+}
+
+/*
+ * End a serial poll sequence: take control with ibcac() and send SPD
+ * followed by UNT.  The watchdog timer runs for the duration of the call.
+ *
+ * Returns 0 on success, the ibcac() error, or -EIO if the two command
+ * bytes could not be written.
+ */
+static int cleanup_serial_poll(struct gpib_board *board, unsigned int usec_timeout)
+{
+	u8 cmd_string[8];
+	size_t bytes_written;
+	int ret;
+
+	os_start_timer(board, usec_timeout);
+
+	ret = ibcac(board, 1, 1);
+	if (ret < 0)
+		goto out;
+
+	cmd_string[0] = SPD;	/* disable serial poll bytes */
+	cmd_string[1] = UNT;
+
+	ret = board->interface->command(board, cmd_string, 2, &bytes_written);
+	if (ret < 0 || bytes_written < 2) {
+		dev_err(board->gpib_dev, "failed to disable serial poll\n");
+		ret = -EIO;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	os_remove_timer(board);
+	return ret;
+}
+
+/*
+ * Serial poll a single device: set up the poll, read one response byte
+ * into *result, then always run the cleanup sequence.
+ *
+ * Returns 0 on success.  A setup or read error takes precedence; otherwise
+ * an error from the cleanup step is returned rather than being dropped.
+ */
+static int serial_poll_single(struct gpib_board *board, unsigned int pad, int sad,
+			      unsigned int usec_timeout, u8 *result)
+{
+	int retval, cleanup_retval;
+
+	retval = setup_serial_poll(board, usec_timeout);
+	if (retval < 0)
+		return retval;
+	retval = read_serial_poll_byte(board, pad, sad, usec_timeout, result);
+	/* cleanup runs even when the read failed */
+	cleanup_retval = cleanup_serial_poll(board, usec_timeout);
+	if (retval < 0)
+		return retval;
+	if (cleanup_retval < 0)
+		return cleanup_retval;
+
+	return 0;
+}
+
+/*
+ * Serial poll every device on board->device_list.
+ *
+ * Responses that have request_service_bit set are queued on the device
+ * with push_status_byte().  Devices whose poll or push fails are skipped.
+ *
+ * Returns the number of status bytes queued, 0 if the device list is
+ * empty, or a negative error if the setup or cleanup sequence failed.
+ */
+int serial_poll_all(struct gpib_board *board, unsigned int usec_timeout)
+{
+	struct gpib_status_queue *device;
+	unsigned int num_bytes = 0;
+	u8 result;
+	int retval;
+
+	if (list_empty(&board->device_list))
+		return 0;
+
+	retval = setup_serial_poll(board, usec_timeout);
+	if (retval < 0)
+		return retval;
+
+	list_for_each_entry(device, &board->device_list, list) {
+		if (read_serial_poll_byte(board, device->pad, device->sad,
+					  usec_timeout, &result) < 0)
+			continue;
+		if (!(result & request_service_bit))
+			continue;
+		if (push_status_byte(board, device, result) < 0)
+			continue;
+		num_bytes++;
+	}
+
+	retval = cleanup_serial_poll(board, usec_timeout);
+	if (retval < 0)
+		return retval;
+
+	return num_bytes;
+}
+
+/*
+ * DVRSP
+ * This function performs a serial poll of the device with primary
+ * address pad and secondary address sad. If the device has no
+ * secondary address, pass a negative number in for this argument.  At the
+ * end of a successful serial poll the response is returned in result.
+ * SPD and UNT are sent at the completion of the poll.
+ *
+ * Returns the result of serial_poll_single(), -ETIMEDOUT if the watchdog
+ * fired during the poll, or -1 if the board is not CIC or the address is
+ * out of range.
+ */
+
+int dvrsp(struct gpib_board *board, unsigned int pad, int sad,
+	  unsigned int usec_timeout, u8 *result)
+{
+	int status = ibstatus(board);
+	int retval;
+
+	if ((status & CIC) == 0) {
+		dev_err(board->gpib_dev, "not CIC during serial poll\n");
+		return -1;
+	}
+
+	if (pad > MAX_GPIB_PRIMARY_ADDRESS || sad > MAX_GPIB_SECONDARY_ADDRESS || sad < -1) {
+		/* message needs its newline so it is not merged with the next log line */
+		dev_err(board->gpib_dev, "bad address for serial poll\n");
+		return -1;
+	}
+
+	retval = serial_poll_single(board, pad, sad, usec_timeout, result);
+	/* a timeout overrides whatever the poll itself returned */
+	if (io_timed_out(board))
+		retval = -ETIMEDOUT;
+
+	return retval;
+}
+
+/*
+ * Translate a descriptor handle into the descriptor pointer stored in
+ * file_priv->descriptors[].  Returns NULL (after logging) for a handle
+ * outside 0..GPIB_MAX_NUM_DESCRIPTORS-1; the stored pointer is returned
+ * as-is otherwise.
+ */
+static struct gpib_descriptor *handle_to_descriptor(const struct gpib_file_private *file_priv,
+						    int handle)
+{
+	if (handle >= 0 && handle < GPIB_MAX_NUM_DESCRIPTORS)
+		return file_priv->descriptors[handle];
+
+	pr_err("gpib: invalid handle %i\n", handle);
+	return NULL;
+}
+
+/*
+ * Initialise per-open-file state: zero the structure, allocate descriptor 0
+ * and mark it as the board descriptor, then initialise descriptors_mutex.
+ *
+ * Returns 0 on success or -ENOMEM if descriptor 0 cannot be allocated.
+ */
+static int init_gpib_file_private(struct gpib_file_private *priv)
+{
+	struct gpib_descriptor *board_desc;
+
+	memset(priv, 0, sizeof(*priv));
+	atomic_set(&priv->holding_mutex, 0);
+
+	board_desc = kmalloc(sizeof(*board_desc), GFP_KERNEL);
+	priv->descriptors[0] = board_desc;
+	if (!board_desc) {
+		pr_err("gpib: failed to allocate default board descriptor\n");
+		return -ENOMEM;
+	}
+
+	init_gpib_descriptor(board_desc);
+	board_desc->is_board = 1;
+	mutex_init(&priv->descriptors_mutex);
+	return 0;
+}
+
+/*
+ * open() handler for the gpib character device.
+ *
+ * Allocates and initialises the per-file private data, asks for the
+ * "gpib<minor>" module on first use of the board and, if the board already
+ * has an interface attached, takes a reference on its provider module.
+ *
+ * Returns 0 on success, -ENXIO for an invalid minor, -ENOMEM if the private
+ * data cannot be allocated or initialised, or -EIO if the provider module
+ * reference cannot be taken.  Nothing stays allocated on failure.
+ */
+int ibopen(struct inode *inode, struct file *filep)
+{
+	unsigned int minor = iminor(inode);
+	struct gpib_board *board;
+	struct gpib_file_private *priv;
+	int retval;
+
+	if (minor >= GPIB_MAX_NUM_BOARDS) {
+		pr_err("gpib: invalid minor number of device file\n");
+		return -ENXIO;
+	}
+
+	board = &board_array[minor];
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	/* do not continue with a half-initialised private structure */
+	retval = init_gpib_file_private(priv);
+	if (retval) {
+		kfree(priv);
+		return retval;
+	}
+	filep->private_data = priv;
+
+	if (board->use_count == 0) {
+		retval = request_module("gpib%i", minor);
+		if (retval)
+			dev_dbg(board->gpib_dev, "request module returned %i\n", retval);
+	}
+	if (board->interface) {
+		if (!try_module_get(board->provider_module)) {
+			dev_err(board->gpib_dev, "try_module_get() failed\n");
+			/* free here: the private data would otherwise leak */
+			filep->private_data = NULL;
+			kfree(priv->descriptors[0]);
+			kfree(priv);
+			return -EIO;
+		}
+		board->use_count++;
+		priv->got_module = 1;
+	}
+	return 0;
+}
+
+/*
+ * release() handler for the gpib character device.
+ *
+ * Drops this file's autopoll registration, cleans up its open devices,
+ * releases board->user_mutex if this file is marked as holding it, drops
+ * the provider module reference taken on its behalf and frees the per-file
+ * private data.
+ *
+ * Returns 0, or -ENODEV for an invalid minor.
+ */
+int ibclose(struct inode *inode, struct file *filep)
+{
+	struct gpib_file_private *priv = filep->private_data;
+	unsigned int minor = iminor(inode);
+	struct gpib_descriptor *board_desc;
+	struct gpib_board *board;
+
+	if (minor >= GPIB_MAX_NUM_BOARDS) {
+		pr_err("gpib: invalid minor number of device file\n");
+		return -ENODEV;
+	}
+
+	if (!priv)
+		return 0;
+
+	board = &board_array[minor];
+
+	board_desc = handle_to_descriptor(priv, 0);
+	if (!board_desc) {
+		dev_err(board->gpib_dev, "Unexpected null gpib_descriptor\n");
+	} else if (board_desc->autopoll_enabled) {
+		dev_dbg(board->gpib_dev, "decrementing autospollers\n");
+		if (board->autospollers > 0)
+			board->autospollers--;
+		else
+			dev_err(board->gpib_dev,
+				"Attempt to decrement zero autospollers\n");
+	}
+
+	cleanup_open_devices(priv, board);
+
+	if (atomic_read(&priv->holding_mutex))
+		mutex_unlock(&board->user_mutex);
+
+	if (priv->got_module && board->use_count) {
+		module_put(board->provider_module);
+		--board->use_count;
+	}
+
+	kfree(priv);
+	filep->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * ioctl() dispatcher for the gpib character device.
+ *
+ * Commands are handled in stages, each stage adding a precondition:
+ *   1. CFCBOARDTYPE and IBONL are always dispatched.
+ *   2. Everything else needs board->interface to be set; a provider module
+ *      reference is taken for this file if it does not hold one yet.
+ *   3. Configuration commands are dispatched whether or not the board is
+ *      online.
+ *   4. The remaining commands need board->online.
+ *   5. The last group additionally needs current->pid to match
+ *      board->locking_pid.
+ *
+ * board->big_gpib_mutex is held while dispatching, except for IBMUTEX and
+ * the I/O commands (IBCMD, IBRD, IBWRT), which drop it before calling their
+ * handler and return directly.
+ *
+ * Returns the handler's result, -ENODEV for an invalid minor or missing
+ * interface, -ERESTARTSYS if taking the mutex was interrupted, -EIO if the
+ * module reference cannot be taken, -EINVAL if the board is offline,
+ * -EPERM on a pid mismatch, or -ENOTTY for an unknown command.
+ */
+long ibioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor = iminor(file_inode(filep));
+	struct gpib_board *board;
+	struct gpib_file_private *file_priv = filep->private_data;
+	long retval = -ENOTTY;
+
+	if (minor >= GPIB_MAX_NUM_BOARDS) {
+		pr_err("gpib: invalid minor number of device file\n");
+		return -ENODEV;
+	}
+	board = &board_array[minor];
+
+	if (mutex_lock_interruptible(&board->big_gpib_mutex))
+		return -ERESTARTSYS;
+
+	dev_dbg(board->gpib_dev, "ioctl %d, interface=%s, use=%d, onl=%d\n",
+		cmd & 0xff,
+		board->interface ? board->interface->name : "",
+		board->use_count,
+		board->online);
+
+	/* stage 1: commands dispatched before the interface check below */
+	switch (cmd) {
+	case CFCBOARDTYPE:
+		retval = board_type_ioctl(file_priv, board, arg);
+		goto done;
+	case IBONL:
+		retval = online_ioctl(board, arg);
+		goto done;
+	default:
+		break;
+	}
+	/* stage 2: everything below needs an attached interface */
+	if (!board->interface) {
+		dev_err(board->gpib_dev, "no gpib board configured\n");
+		retval = -ENODEV;
+		goto done;
+	}
+	/* make sure this file holds a reference on the provider module */
+	if (file_priv->got_module == 0)	{
+		if (!try_module_get(board->provider_module)) {
+			dev_err(board->gpib_dev, "try_module_get() failed\n");
+			retval = -EIO;
+			goto done;
+		}
+		file_priv->got_module = 1;
+		board->use_count++;
+	}
+	/* stage 3: commands dispatched before the online check below */
+	switch (cmd) {
+	case CFCBASE:
+		retval = iobase_ioctl(&board->config, arg);
+		goto done;
+	case CFCIRQ:
+		retval = irq_ioctl(&board->config, arg);
+		goto done;
+	case CFCDMA:
+		retval = dma_ioctl(&board->config, arg);
+		goto done;
+	case IBAUTOSPOLL:
+		retval = autospoll_ioctl(board, file_priv, arg);
+		goto done;
+	case IBBOARD_INFO:
+		retval = board_info_ioctl(board, arg);
+		goto done;
+	case IBMUTEX:
+		/*
+		 * Need to unlock board->big_gpib_mutex before potentially locking board->user_mutex
+		 * to maintain consistent locking order
+		 */
+		mutex_unlock(&board->big_gpib_mutex);
+		return mutex_ioctl(board, file_priv, arg);
+	case IBPAD:
+		retval = pad_ioctl(board, file_priv, arg);
+		goto done;
+	case IBSAD:
+		retval = sad_ioctl(board, file_priv, arg);
+		goto done;
+	case IBSELECT_PCI:
+		retval = select_pci_ioctl(&board->config, arg);
+		goto done;
+	case IBSELECT_DEVICE_PATH:
+		retval = select_device_path_ioctl(&board->config, arg);
+		goto done;
+	default:
+		break;
+	}
+
+	/* stage 4: everything below needs the board to be online */
+	if (!board->online) {
+		retval = -EINVAL;
+		goto done;
+	}
+
+	switch (cmd) {
+	case IBEVENT:
+		retval = event_ioctl(board, arg);
+		goto done;
+	case IBCLOSEDEV:
+		retval = close_dev_ioctl(filep, board, arg);
+		goto done;
+	case IBOPENDEV:
+		retval = open_dev_ioctl(filep, board, arg);
+		goto done;
+	case IBSPOLL_BYTES:
+		retval = status_bytes_ioctl(board, arg);
+		goto done;
+	case IBWAIT:
+		retval = wait_ioctl(file_priv, board, arg);
+		/*
+		 * NOTE(review): this path returns without unlocking
+		 * board->big_gpib_mutex here; presumably wait_ioctl() has
+		 * already released it when it returns -ERESTARTSYS - confirm.
+		 */
+		if (retval == -ERESTARTSYS)
+			return retval;
+		goto done;
+	case IBLINES:
+		retval = line_status_ioctl(board, arg);
+		goto done;
+	case IBLOC:
+		board->interface->return_to_local(board);
+		retval = 0;
+		goto done;
+	default:
+		break;
+	}
+
+	/* stage 5: remaining commands are only allowed for board->locking_pid */
+	spin_lock(&board->locking_pid_spinlock);
+	if (current->pid != board->locking_pid)	{
+		spin_unlock(&board->locking_pid_spinlock);
+		retval = -EPERM;
+		goto done;
+	}
+	spin_unlock(&board->locking_pid_spinlock);
+
+	switch (cmd) {
+	case IB_T1_DELAY:
+		retval = t1_delay_ioctl(board, arg);
+		goto done;
+	case IBCAC:
+		retval = take_control_ioctl(board, arg);
+		goto done;
+	case IBCMD:
+		/*
+		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
+		 * before we call them.
+		 */
+		mutex_unlock(&board->big_gpib_mutex);
+		return command_ioctl(file_priv, board, arg);
+	case IBEOS:
+		retval = eos_ioctl(board, arg);
+		goto done;
+	case IBGTS:
+		retval = ibgts(board);
+		goto done;
+	case IBPPC:
+		retval = ppc_ioctl(board, arg);
+		goto done;
+	case IBPP2_SET:
+		retval = set_local_ppoll_mode_ioctl(board, arg);
+		goto done;
+	case IBPP2_GET:
+		retval = get_local_ppoll_mode_ioctl(board, arg);
+		goto done;
+	case IBQUERY_BOARD_RSV:
+		retval = query_board_rsv_ioctl(board, arg);
+		goto done;
+	case IBRD:
+		/*
+		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
+		 * before we call them.
+		 */
+		mutex_unlock(&board->big_gpib_mutex);
+		return read_ioctl(file_priv, board, arg);
+	case IBRPP:
+		retval = parallel_poll_ioctl(board, arg);
+		goto done;
+	case IBRSC:
+		retval = request_system_control_ioctl(board, arg);
+		goto done;
+	case IBRSP:
+		retval = serial_poll_ioctl(board, arg);
+		goto done;
+	case IBRSV:
+		retval = request_service_ioctl(board, arg);
+		goto done;
+	case IBRSV2:
+		retval = request_service2_ioctl(board, arg);
+		goto done;
+	case IBSIC:
+		retval = interface_clear_ioctl(board, arg);
+		goto done;
+	case IBSRE:
+		retval = remote_enable_ioctl(board, arg);
+		goto done;
+	case IBTMO:
+		retval = timeout_ioctl(board, arg);
+		goto done;
+	case IBWRT:
+		/*
+		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
+		 * before we call them.
+		 */
+		mutex_unlock(&board->big_gpib_mutex);
+		return write_ioctl(file_priv, board, arg);
+	default:
+		retval = -ENOTTY;
+		goto done;
+	}
+
+	/* common exit for every path that still holds big_gpib_mutex */
+done:
+	mutex_unlock(&board->big_gpib_mutex);
+	dev_dbg(board->gpib_dev, "ioctl done status = 0x%lx\n", board->status);
+	return retval;
+}
+
+/*
+ * CFCBOARDTYPE: attach the registered interface whose name matches the
+ * name supplied by user space to this board.
+ *
+ * Requires CAP_SYS_ADMIN and an offline board.  The board's existing
+ * use_count references are moved from the old provider module to the new
+ * one, and the calling file gains a module reference if it did not hold
+ * one already.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EBUSY if the board
+ * is online, -EFAULT if the argument cannot be copied, -EIO if a module
+ * reference cannot be taken, or -EINVAL if no registered interface has
+ * that name.
+ */
+static int board_type_ioctl(struct gpib_file_private *file_priv,
+			    struct gpib_board *board, unsigned long arg)
+{
+	struct list_head *list_ptr;
+	struct gpib_board_type_ioctl cmd;
+	int retval;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (board->online)
+		return -EBUSY;
+
+	retval = copy_from_user(&cmd, (void __user *)arg,
+				sizeof(struct gpib_board_type_ioctl));
+	if (retval)
+		return -EFAULT;
+
+	for (list_ptr = registered_drivers.next; list_ptr != &registered_drivers;
+	     list_ptr = list_ptr->next) {
+		struct gpib_interface_list *entry;
+
+		entry = list_entry(list_ptr, struct gpib_interface_list, list);
+		/*
+		 * NOTE(review): strcmp() relies on cmd.name, which comes from
+		 * user space, being NUL-terminated - confirm whether
+		 * termination should be forced after copy_from_user().
+		 */
+		if (strcmp(entry->interface->name, cmd.name) == 0) {
+			int i;
+			int had_module = file_priv->got_module;
+
+			/* drop every reference held on the old provider module */
+			if (board->use_count) {
+				for (i = 0; i < board->use_count; ++i)
+					module_put(board->provider_module);
+				board->interface = NULL;
+				file_priv->got_module = 0;
+			}
+			board->interface = entry->interface;
+			board->provider_module = entry->module;
+			/* re-take the same number of references on the new module */
+			for (i = 0; i < board->use_count; ++i) {
+				if (!try_module_get(entry->module)) {
+					/* keep use_count equal to the references actually held */
+					board->use_count = i;
+					return -EIO;
+				}
+			}
+			/* this file had no reference before: take one for it */
+			if (had_module == 0) {
+				if (!try_module_get(entry->module))
+					return -EIO;
+				++board->use_count;
+			}
+			file_priv->got_module = 1;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * IBRD: read from the bus into a user buffer.
+ *
+ * Data is read with ibrd() into board->buffer in chunks of at most
+ * board->buffer_length bytes and copied out to user space, starting at
+ * offset completed_transfer_count of the user buffer.  The loop ends when
+ * the requested count is reached, the end flag is set, a read returns no
+ * bytes or an error, or a copy to user space fails.  The updated command
+ * structure (completed count and end flag) is copied back to user space.
+ *
+ * Returns 0 when the transfer completed or ended with the end flag set,
+ * the ibrd() error otherwise, -EINVAL for an inconsistent count or bad
+ * handle, or -EFAULT if user memory could not be accessed.
+ */
+static int read_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+		      unsigned long arg)
+{
+	struct gpib_read_write_ioctl read_cmd;
+	u8 __user *userbuf;
+	unsigned long remain;
+	int end_flag = 0;
+	int retval;
+	ssize_t read_ret = 0;
+	struct gpib_descriptor *desc;
+	size_t nbytes;
+
+	retval = copy_from_user(&read_cmd, (void __user *)arg, sizeof(read_cmd));
+	if (retval)
+		return -EFAULT;
+
+	/* also guarantees that 'remain' below cannot underflow */
+	if (read_cmd.completed_transfer_count > read_cmd.requested_transfer_count)
+		return -EINVAL;
+
+	desc = handle_to_descriptor(file_priv, read_cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	/* the ioctl field must be wide enough to hold a user pointer */
+	if (WARN_ON_ONCE(sizeof(userbuf) > sizeof(read_cmd.buffer_ptr)))
+		return -EFAULT;
+
+	userbuf = (u8 __user *)(unsigned long)read_cmd.buffer_ptr;
+	userbuf += read_cmd.completed_transfer_count;
+
+	remain = read_cmd.requested_transfer_count - read_cmd.completed_transfer_count;
+
+	/* Check write access to buffer */
+	if (!access_ok(userbuf, remain))
+		return -EFAULT;
+
+	atomic_set(&desc->io_in_progress, 1);
+
+	/* Read buffer loads till we fill the user supplied buffer */
+	while (remain > 0 && end_flag == 0) {
+		nbytes = 0;
+		read_ret = ibrd(board, board->buffer, (board->buffer_length < remain) ?
+				board->buffer_length : remain, &end_flag, &nbytes);
+		if (nbytes == 0)
+			break;
+		/* hand over whatever was read, even if ibrd() reported an error */
+		retval = copy_to_user(userbuf, board->buffer, nbytes);
+		if (retval) {
+			retval = -EFAULT;
+			break;
+		}
+		remain -= nbytes;
+		userbuf += nbytes;
+		if (read_ret < 0)
+			break;
+	}
+	read_cmd.completed_transfer_count = read_cmd.requested_transfer_count - remain;
+	read_cmd.end = end_flag;
+	/*
+	 * suppress errors (for example due to timeout or interruption by device clear)
+	 * if all bytes got sent.  This prevents races that can occur in the various drivers
+	 * if a device receives a device clear immediately after a transfer completes and
+	 * the driver code wasn't careful enough to handle that case.
+	 */
+	if (remain == 0 || end_flag)
+		read_ret = 0;
+	/* report progress back unless a copy to user space already failed */
+	if (retval == 0)
+		retval = copy_to_user((void __user *)arg, &read_cmd, sizeof(read_cmd));
+
+	atomic_set(&desc->io_in_progress, 0);
+
+	wake_up_interruptible(&board->wait);
+	if (retval)
+		return -EFAULT;
+
+	return read_ret;
+}
+
+/*
+ * IBCMD: send command bytes from a user buffer.
+ *
+ * Bytes are copied from user space into board->buffer in chunks of at most
+ * board->buffer_length bytes and sent with ibcmd(), starting at offset
+ * completed_transfer_count of the user buffer.  ibcmd() is called at least
+ * once, even when nothing remains to be sent.  The updated command
+ * structure (completed count, end cleared) is copied back to user space.
+ *
+ * The 'end' field of the incoming command is used as a flag asking for
+ * desc->io_in_progress to be left set on return (see the comment near the
+ * end of the function).
+ *
+ * Returns the result of the last ibcmd() call, -EINVAL for an inconsistent
+ * count or bad handle, or -EFAULT if user memory could not be accessed.
+ */
+static int command_ioctl(struct gpib_file_private *file_priv,
+			 struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_read_write_ioctl cmd;
+	u8 __user *userbuf;
+	unsigned long remain;
+	int retval;
+	int fault = 0;
+	struct gpib_descriptor *desc;
+	size_t bytes_written;
+	int no_clear_io_in_prog;
+
+	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
+	if (retval)
+		return -EFAULT;
+
+	/* also guarantees that 'remain' below cannot underflow */
+	if (cmd.completed_transfer_count > cmd.requested_transfer_count)
+		return -EINVAL;
+
+	desc = handle_to_descriptor(file_priv, cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	userbuf = (u8 __user *)(unsigned long)cmd.buffer_ptr;
+	userbuf += cmd.completed_transfer_count;
+
+	/* remember the caller's flag, then clear it in the reply */
+	no_clear_io_in_prog = cmd.end;
+	cmd.end = 0;
+
+	remain = cmd.requested_transfer_count - cmd.completed_transfer_count;
+
+	/* Check read access to buffer */
+	if (!access_ok(userbuf, remain))
+		return -EFAULT;
+
+	/*
+	 * Write buffer loads till we empty the user supplied buffer.
+	 * Call drivers at least once, even if remain is zero, in
+	 * order to allow them to insure previous commands were
+	 * completely finished, in the case of a restarted ioctl.
+	 */
+
+	atomic_set(&desc->io_in_progress, 1);
+
+	do {
+		fault = copy_from_user(board->buffer, userbuf, (board->buffer_length < remain) ?
+				       board->buffer_length : remain);
+		if (fault) {
+			retval = -EFAULT;
+			bytes_written = 0;
+		} else {
+			retval = ibcmd(board, board->buffer, (board->buffer_length < remain) ?
+				       board->buffer_length : remain, &bytes_written);
+		}
+		remain -= bytes_written;
+		userbuf += bytes_written;
+		/* any error ends the transfer and clears io_in_progress */
+		if (retval < 0) {
+			atomic_set(&desc->io_in_progress, 0);
+
+			wake_up_interruptible(&board->wait);
+			break;
+		}
+	} while (remain > 0);
+
+	cmd.completed_transfer_count = cmd.requested_transfer_count - remain;
+
+	/* report progress back unless a copy from user space already failed */
+	if (fault == 0)
+		fault = copy_to_user((void __user *)arg, &cmd, sizeof(cmd));
+
+	/*
+	 * no_clear_io_in_prog (cmd.end) is true when io_in_progress should
+	 * not be set to zero because the cmd in progress is the address setup
+	 * operation for an async read or write. This causes CMPL not to be set
+	 * in general_ibstatus until the async read or write completes.
+	 */
+	if (!no_clear_io_in_prog || fault)
+		atomic_set(&desc->io_in_progress, 0);
+
+	wake_up_interruptible(&board->wait);
+	if (fault)
+		return -EFAULT;
+
+	return retval;
+}
+
+/*
+ * IBWRT: write data from a user buffer to the bus.
+ *
+ * Bytes are copied from user space into board->buffer in chunks of at most
+ * board->buffer_length bytes and sent with ibwrt(), starting at offset
+ * completed_transfer_count of the user buffer.  EOI is requested only for
+ * the final chunk, and only when write_cmd.end is non-zero.  The updated
+ * command structure (completed count) is copied back to user space.
+ *
+ * Returns 0 when every byte was written, the ibwrt() error otherwise,
+ * -EINVAL for an inconsistent count or bad handle, or -EFAULT if user
+ * memory could not be accessed.
+ */
+static int write_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+		       unsigned long arg)
+{
+	struct gpib_read_write_ioctl write_cmd;
+	u8 __user *userbuf;
+	unsigned long remain;
+	int retval = 0;
+	int fault;
+	struct gpib_descriptor *desc;
+
+	fault = copy_from_user(&write_cmd, (void __user *)arg, sizeof(write_cmd));
+	if (fault)
+		return -EFAULT;
+
+	/* also guarantees that 'remain' below cannot underflow */
+	if (write_cmd.completed_transfer_count > write_cmd.requested_transfer_count)
+		return -EINVAL;
+
+	desc = handle_to_descriptor(file_priv, write_cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	userbuf = (u8 __user *)(unsigned long)write_cmd.buffer_ptr;
+	userbuf += write_cmd.completed_transfer_count;
+
+	remain = write_cmd.requested_transfer_count - write_cmd.completed_transfer_count;
+
+	/* Check read access to buffer */
+	if (!access_ok(userbuf, remain))
+		return -EFAULT;
+
+	atomic_set(&desc->io_in_progress, 1);
+
+	/* Write buffer loads till we empty the user supplied buffer */
+	while (remain > 0) {
+		int send_eoi;
+		size_t bytes_written = 0;
+
+		/* only the last chunk may carry EOI */
+		send_eoi = remain <= board->buffer_length && write_cmd.end;
+		fault = copy_from_user(board->buffer, userbuf, (board->buffer_length < remain) ?
+				       board->buffer_length : remain);
+		if (fault) {
+			retval = -EFAULT;
+			break;
+		}
+		retval = ibwrt(board, board->buffer, (board->buffer_length < remain) ?
+			       board->buffer_length : remain, send_eoi, &bytes_written);
+		remain -= bytes_written;
+		userbuf += bytes_written;
+		if (retval < 0)
+			break;
+	}
+	write_cmd.completed_transfer_count = write_cmd.requested_transfer_count - remain;
+	/*
+	 * suppress errors (for example due to timeout or interruption by device clear)
+	 * if all bytes got sent.  This prevents races that can occur in the various drivers
+	 * if a device receives a device clear immediately after a transfer completes and
+	 * the driver code wasn't careful enough to handle that case.
+	 */
+	if (remain == 0)
+		retval = 0;
+	/* report progress back unless a copy from user space already failed */
+	if (fault == 0)
+		fault = copy_to_user((void __user *)arg, &write_cmd, sizeof(write_cmd));
+
+	atomic_set(&desc->io_in_progress, 0);
+
+	wake_up_interruptible(&board->wait);
+	if (fault)
+		return -EFAULT;
+
+	return retval;
+}
+
+/*
+ * IBSPOLL_BYTES: report how many status bytes are queued for the device at
+ * the pad/sad given by user space (0 if that address has no status queue).
+ *
+ * Returns 0 on success or -EFAULT if user memory could not be accessed.
+ */
+static int status_bytes_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_spoll_bytes_ioctl cmd;
+	struct gpib_status_queue *queue;
+
+	if (copy_from_user(&cmd, (void __user *)arg, sizeof(cmd)))
+		return -EFAULT;
+
+	queue = get_gpib_status_queue(board, cmd.pad, cmd.sad);
+	cmd.num_bytes = queue ? num_status_bytes(queue) : 0;
+
+	if (copy_to_user((void __user *)arg, &cmd, sizeof(cmd)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Take a reference on the status queue for address @pad / @sad on the list
+ * @head.  An existing entry has its reference_count incremented; otherwise
+ * a new entry with a count of one is allocated and added to the list.
+ *
+ * Returns 0 on success or -ENOMEM if a new entry cannot be allocated.
+ */
+static int increment_open_device_count(struct gpib_board *board, struct list_head *head,
+				       unsigned int pad, int sad)
+{
+	struct gpib_status_queue *device;
+
+	/* address already open: just bump its open count */
+	list_for_each_entry(device, head, list) {
+		if (!gpib_address_equal(device->pad, device->sad, pad, sad))
+			continue;
+
+		dev_dbg(board->gpib_dev, "incrementing open count for pad %i, sad %i\n",
+			device->pad, device->sad);
+		device->reference_count++;
+		return 0;
+	}
+
+	/* first open of this address: allocate a new status queue */
+	device = kmalloc(sizeof(*device), GFP_ATOMIC);
+	if (!device)
+		return -ENOMEM;
+
+	init_gpib_status_queue(device);
+	device->pad = pad;
+	device->sad = sad;
+	device->reference_count = 1;
+
+	list_add(&device->list, head);
+
+	dev_dbg(board->gpib_dev, "opened pad %i, sad %i\n", device->pad, device->sad);
+
+	return 0;
+}
+
+/*
+ * Drop @count references from the status queue for address pad/sad.
+ *
+ * The entry is unlinked from @head and freed once its reference count
+ * reaches zero.
+ *
+ * Returns 0 on success, or -EINVAL if the address has no entry in the list
+ * or @count exceeds the references currently held.
+ */
+static int subtract_open_device_count(struct gpib_board *board, struct list_head *head,
+				      unsigned int pad, int sad, unsigned int count)
+{
+	struct gpib_status_queue *device = NULL;
+	struct gpib_status_queue *pos;
+
+	list_for_each_entry(pos, head, list) {
+		if (gpib_address_equal(pos->pad, pos->sad, pad, sad)) {
+			device = pos;
+			break;
+		}
+	}
+
+	if (!device) {
+		dev_err(board->gpib_dev, "bug! tried to close address that was never opened!\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(board->gpib_dev, "decrementing open count for pad %i, sad %i\n",
+		device->pad, device->sad);
+	if (count > device->reference_count) {
+		dev_err(board->gpib_dev, "bug! in %s()\n", __func__);
+		return -EINVAL;
+	}
+
+	device->reference_count -= count;
+	if (device->reference_count != 0)
+		return 0;
+
+	/* last reference gone: retire the queue entry */
+	dev_dbg(board->gpib_dev, "closing pad %i, sad %i\n",
+		device->pad, device->sad);
+	list_del(&device->list);
+	kfree(device);
+
+	return 0;
+}
+
+/* Drop a single reference on address pad/sad; see subtract_open_device_count(). */
+static inline int decrement_open_device_count(struct gpib_board *board, struct list_head *head,
+					      unsigned int pad, int sad)
+{
+	const unsigned int one_reference = 1;
+
+	return subtract_open_device_count(board, head, pad, sad, one_reference);
+}
+
+/*
+ * Release every descriptor still held in @file_priv.
+ *
+ * For each non-board descriptor the matching reference on the board's device
+ * list is dropped, then the descriptor is freed and its slot cleared.
+ *
+ * A failed reference drop no longer aborts the walk: stopping early would
+ * leak the current descriptor and every one after it.  The first error seen
+ * is remembered and returned once all slots have been released.
+ *
+ * Returns 0 on success or the first negative error from
+ * decrement_open_device_count().
+ */
+static int cleanup_open_devices(struct gpib_file_private *file_priv, struct gpib_board *board)
+{
+	int first_error = 0;
+	int i;
+
+	for (i = 0; i < GPIB_MAX_NUM_DESCRIPTORS; i++) {
+		struct gpib_descriptor *desc = file_priv->descriptors[i];
+		int retval;
+
+		if (!desc)
+			continue;
+
+		if (desc->is_board == 0) {
+			retval = decrement_open_device_count(board, &board->device_list, desc->pad,
+							     desc->sad);
+			if (retval < 0 && first_error == 0)
+				first_error = retval;
+		}
+		kfree(desc);
+		file_priv->descriptors[i] = NULL;
+	}
+
+	return first_error;
+}
+
+/*
+ * Open a new descriptor for the address given by user space.
+ *
+ * Copies in a struct gpib_open_dev_ioctl, picks the first free slot in the
+ * file's descriptor table, takes a reference on the address in the board's
+ * device list and returns the slot index to the caller in the handle field.
+ *
+ * The descriptor is only published in the table after every step that can
+ * fail has succeeded, all under descriptors_mutex.  A failed reference grab
+ * or a failed copy back to user space therefore leaves no slot occupied by a
+ * handle the caller never received and no stray device reference behind.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -ERESTARTSYS if
+ * interrupted while waiting for the mutex, -ERANGE if the table is full,
+ * -ENOMEM on allocation failure, or the error from
+ * increment_open_device_count().
+ */
+static int open_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_file_private *file_priv = filep->private_data;
+	struct gpib_open_dev_ioctl open_dev_cmd;
+	struct gpib_descriptor *desc;
+	int retval;
+	int i;
+
+	retval = copy_from_user(&open_dev_cmd, (void __user *)arg, sizeof(open_dev_cmd));
+	if (retval)
+		return -EFAULT;
+
+	if (mutex_lock_interruptible(&file_priv->descriptors_mutex))
+		return -ERESTARTSYS;
+
+	for (i = 0; i < GPIB_MAX_NUM_DESCRIPTORS; i++)
+		if (!file_priv->descriptors[i])
+			break;
+	if (i == GPIB_MAX_NUM_DESCRIPTORS) {
+		retval = -ERANGE;
+		goto out_unlock;
+	}
+
+	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		retval = -ENOMEM;
+		goto out_unlock;
+	}
+	init_gpib_descriptor(desc);
+	desc->pad = open_dev_cmd.pad;
+	desc->sad = open_dev_cmd.sad;
+	desc->is_board = open_dev_cmd.is_board;
+
+	retval = increment_open_device_count(board, &board->device_list, open_dev_cmd.pad,
+					     open_dev_cmd.sad);
+	if (retval < 0)
+		goto out_free;
+
+	/*
+	 * clear stuck srq state, since we may be able to find service request on
+	 * the new device
+	 */
+	atomic_set(&board->stuck_srq, 0);
+
+	open_dev_cmd.handle = i;
+	if (copy_to_user((void __user *)arg, &open_dev_cmd, sizeof(open_dev_cmd))) {
+		retval = -EFAULT;
+		goto out_drop_count;
+	}
+
+	/* nothing can fail any more: make the descriptor visible */
+	file_priv->descriptors[i] = desc;
+	mutex_unlock(&file_priv->descriptors_mutex);
+
+	return 0;
+
+out_drop_count:
+	decrement_open_device_count(board, &board->device_list, open_dev_cmd.pad,
+				    open_dev_cmd.sad);
+out_free:
+	kfree(desc);
+out_unlock:
+	mutex_unlock(&file_priv->descriptors_mutex);
+	return retval;
+}
+
+/*
+ * Close the descriptor whose handle is given by user space.
+ *
+ * Drops the descriptor's reference on the board's device list, frees the
+ * descriptor and clears its slot.  The slot is examined and cleared under
+ * descriptors_mutex, the same lock open_dev_ioctl() holds while filling
+ * slots, so two closers cannot free the same descriptor and a closer cannot
+ * race with a concurrent open.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for an out of
+ * range or unused handle, -ERESTARTSYS if interrupted while waiting for the
+ * mutex, or the error from decrement_open_device_count().
+ */
+static int close_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_file_private *file_priv = filep->private_data;
+	struct gpib_close_dev_ioctl cmd;
+	struct gpib_descriptor *desc;
+	int retval;
+
+	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
+	if (retval)
+		return -EFAULT;
+
+	if (cmd.handle >= GPIB_MAX_NUM_DESCRIPTORS)
+		return -EINVAL;
+
+	if (mutex_lock_interruptible(&file_priv->descriptors_mutex))
+		return -ERESTARTSYS;
+
+	desc = file_priv->descriptors[cmd.handle];
+	if (!desc) {
+		retval = -EINVAL;
+		goto out_unlock;
+	}
+
+	retval = decrement_open_device_count(board, &board->device_list, desc->pad, desc->sad);
+	if (retval < 0)
+		goto out_unlock;
+
+	file_priv->descriptors[cmd.handle] = NULL;
+	kfree(desc);
+	retval = 0;
+
+out_unlock:
+	mutex_unlock(&file_priv->descriptors_mutex);
+	return retval;
+}
+
+/*
+ * Serial poll one device on behalf of user space.
+ *
+ * Copies in a struct gpib_serial_poll_ioctl, polls the device at its pad/sad
+ * using the board's current timeout and copies the struct back with
+ * status_byte filled in.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, or the negative error
+ * from get_serial_poll_byte().
+ */
+static int serial_poll_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	struct gpib_serial_poll_ioctl serial_cmd;
+	int err;
+
+	if (copy_from_user(&serial_cmd, uarg, sizeof(serial_cmd)))
+		return -EFAULT;
+
+	err = get_serial_poll_byte(board, serial_cmd.pad, serial_cmd.sad, board->usec_timeout,
+				   &serial_cmd.status_byte);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(uarg, &serial_cmd, sizeof(serial_cmd)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Wait for board/device status conditions on behalf of user space.
+ *
+ * Copies in a struct gpib_wait_ioctl, resolves its handle to a descriptor,
+ * calls ibwait() with the requested wait/clear/set masks and timeout, and
+ * copies the struct (including the resulting ibsta) back.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for a handle
+ * with no descriptor, or the negative error from ibwait().
+ */
+static int wait_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
+		      unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	struct gpib_descriptor *desc;
+	struct gpib_wait_ioctl wait_cmd;
+	int err;
+
+	if (copy_from_user(&wait_cmd, uarg, sizeof(wait_cmd)))
+		return -EFAULT;
+
+	desc = handle_to_descriptor(file_priv, wait_cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	err = ibwait(board, wait_cmd.wait_mask, wait_cmd.clear_mask,
+		     wait_cmd.set_mask, &wait_cmd.ibsta, wait_cmd.usec_timeout, desc);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(uarg, &wait_cmd, sizeof(wait_cmd)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Run a parallel poll via ibrpp() and hand the one-byte result to user space.
+ *
+ * Returns 0 on success, the negative error from ibrpp(), or -EFAULT if the
+ * result cannot be copied out.
+ */
+static int parallel_poll_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	u8 result;
+	int err;
+
+	err = ibrpp(board, &result);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user((void __user *)arg, &result, sizeof(result)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Bring the board online or take it offline (CAP_SYS_ADMIN only).
+ *
+ * Copies in a struct gpib_online_ioctl.  When it carries initialisation data
+ * (init_data_length > 0) that data is copied into a temporary vmalloc buffer
+ * hung off board->config for the duration of ibonline()/iboffline() and freed
+ * again before returning.
+ *
+ * Compared with the previous version:
+ *  - board->config is not touched until the caller has passed the capability
+ *    check and the command has been copied in;
+ *  - the pointer-width sanity check runs before the allocation, so that
+ *    path no longer leaks the buffer;
+ *  - every exit path leaves board->config.init_data NULL and
+ *    init_data_length 0, so no dangling pointer survives a failed copy.
+ *
+ * Returns the result of ibonline()/iboffline(), or -EPERM, -EFAULT, -ENOMEM.
+ */
+static int online_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_online_ioctl online_cmd;
+	void __user *init_data;
+	int retval;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	retval = copy_from_user(&online_cmd, (void __user *)arg, sizeof(online_cmd));
+	if (retval)
+		return -EFAULT;
+
+	board->config.init_data = NULL;
+	board->config.init_data_length = 0;
+
+	if (online_cmd.init_data_length > 0) {
+		/* the user pointer travels in init_data_ptr; it must be wide enough */
+		if (WARN_ON_ONCE(sizeof(init_data) > sizeof(online_cmd.init_data_ptr)))
+			return -EFAULT;
+
+		board->config.init_data = vmalloc(online_cmd.init_data_length);
+		if (!board->config.init_data)
+			return -ENOMEM;
+
+		init_data = (void __user *)(unsigned long)(online_cmd.init_data_ptr);
+		if (copy_from_user(board->config.init_data, init_data,
+				   online_cmd.init_data_length)) {
+			retval = -EFAULT;
+			goto out_free;
+		}
+		board->config.init_data_length = online_cmd.init_data_length;
+	}
+
+	if (online_cmd.online)
+		retval = ibonline(board);
+	else
+		retval = iboffline(board);
+
+out_free:
+	/* vfree(NULL) is a no-op, so this also covers the "no init data" case */
+	vfree(board->config.init_data);
+	board->config.init_data = NULL;
+	board->config.init_data_length = 0;
+	return retval;
+}
+
+/*
+ * Read an int flag from user space and pass it to ibsre().
+ *
+ * Returns the result of ibsre(), or -EFAULT if the flag cannot be read.
+ */
+static int remote_enable_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	int enable;
+
+	if (copy_from_user(&enable, (void __user *)arg, sizeof(enable)))
+		return -EFAULT;
+
+	return ibsre(board, enable);
+}
+
+/*
+ * Read an int "synchronous" flag from user space and call ibcac() with it,
+ * always allowing the asynchronous fallback (third argument 1).
+ *
+ * Returns the result of ibcac(), or -EFAULT if the flag cannot be read.
+ */
+static int take_control_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	const int fallback_to_async = 1;
+	int synchronous;
+
+	if (copy_from_user(&synchronous, (void __user *)arg, sizeof(synchronous)))
+		return -EFAULT;
+
+	return ibcac(board, synchronous, fallback_to_async);
+}
+
+/*
+ * Fetch the bus line state with iblines() and copy it to user space as a
+ * short.
+ *
+ * Returns 0 on success, the negative error from iblines(), or -EFAULT if the
+ * value cannot be copied out.
+ */
+static int line_status_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	short line_state;
+	int err;
+
+	err = iblines(board, &line_state);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user((void __user *)arg, &line_state, sizeof(line_state)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Change the primary address associated with a descriptor.
+ *
+ * For the board descriptor the new address is applied with ibpad().  For a
+ * device descriptor the reference held on the board's device list is moved
+ * from the old pad/sad to the new one.
+ *
+ * The reference on the new address is taken before the old one is dropped.
+ * Previously the old reference was dropped first, so a failed allocation for
+ * the new address left the descriptor pointing at an address with no list
+ * entry.  With this order a failure leaves the descriptor and the device
+ * list exactly as they were.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for a handle
+ * with no descriptor, or the error from ibpad() or the reference helpers.
+ */
+static int pad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		     unsigned long arg)
+{
+	struct gpib_pad_ioctl cmd;
+	int retval;
+	struct gpib_descriptor *desc;
+
+	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
+	if (retval)
+		return -EFAULT;
+
+	desc = handle_to_descriptor(file_priv, cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	if (desc->is_board) {
+		retval = ibpad(board, cmd.pad);
+		if (retval < 0)
+			return retval;
+		return 0;
+	}
+
+	retval = increment_open_device_count(board, &board->device_list, cmd.pad,
+					     desc->sad);
+	if (retval < 0)
+		return retval;
+
+	retval = decrement_open_device_count(board, &board->device_list, desc->pad,
+					     desc->sad);
+	if (retval < 0) {
+		/* undo the reference just taken so the list is unchanged */
+		decrement_open_device_count(board, &board->device_list, cmd.pad,
+					    desc->sad);
+		return retval;
+	}
+
+	desc->pad = cmd.pad;
+
+	return 0;
+}
+
+/*
+ * Change the secondary address associated with a descriptor.
+ *
+ * For the board descriptor the new address is applied with ibsad().  For a
+ * device descriptor the reference held on the board's device list is moved
+ * from the old pad/sad to the new one.
+ *
+ * The reference on the new address is taken before the old one is dropped.
+ * Previously the old reference was dropped first, so a failed allocation for
+ * the new address left the descriptor pointing at an address with no list
+ * entry.  With this order a failure leaves the descriptor and the device
+ * list exactly as they were.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for a handle
+ * with no descriptor, or the error from ibsad() or the reference helpers.
+ */
+static int sad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		     unsigned long arg)
+{
+	struct gpib_sad_ioctl cmd;
+	int retval;
+	struct gpib_descriptor *desc;
+
+	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
+	if (retval)
+		return -EFAULT;
+
+	desc = handle_to_descriptor(file_priv, cmd.handle);
+	if (!desc)
+		return -EINVAL;
+
+	if (desc->is_board) {
+		retval = ibsad(board, cmd.sad);
+		if (retval < 0)
+			return retval;
+		return 0;
+	}
+
+	retval = increment_open_device_count(board, &board->device_list, desc->pad,
+					     cmd.sad);
+	if (retval < 0)
+		return retval;
+
+	retval = decrement_open_device_count(board, &board->device_list, desc->pad,
+					     desc->sad);
+	if (retval < 0) {
+		/* undo the reference just taken so the list is unchanged */
+		decrement_open_device_count(board, &board->device_list, desc->pad,
+					    cmd.sad);
+		return retval;
+	}
+
+	desc->sad = cmd.sad;
+
+	return 0;
+}
+
+/*
+ * Copy a struct gpib_eos_ioctl from user space and pass its eos and
+ * eos_flags members to ibeos().
+ *
+ * Returns the result of ibeos(), or -EFAULT if the struct cannot be read.
+ */
+static int eos_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_eos_ioctl eos_cmd;
+
+	if (copy_from_user(&eos_cmd, (void __user *)arg, sizeof(eos_cmd)))
+		return -EFAULT;
+
+	return ibeos(board, eos_cmd.eos, eos_cmd.eos_flags);
+}
+
+/*
+ * Read a one-byte status value from user space and call ibrsv2() with it,
+ * passing the byte masked with request_service_bit as the third argument.
+ *
+ * Returns the result of ibrsv2(), or -EFAULT if the byte cannot be read.
+ */
+static int request_service_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	u8 status_byte;
+
+	if (copy_from_user(&status_byte, (void __user *)arg, sizeof(status_byte)))
+		return -EFAULT;
+
+	return ibrsv2(board, status_byte, status_byte & request_service_bit);
+}
+
+/*
+ * Copy a struct gpib_request_service2 from user space and pass its
+ * status_byte and new_reason_for_service members to ibrsv2().
+ *
+ * Returns the result of ibrsv2(), or -EFAULT if the struct cannot be read.
+ */
+static int request_service2_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_request_service2 req;
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	return ibrsv2(board, req.status_byte, req.new_reason_for_service);
+}
+
+/*
+ * Store a user-supplied 64-bit base address in config->ibbase
+ * (CAP_SYS_ADMIN only).
+ *
+ * Returns 0 on success, -EPERM without the capability, or -EFAULT if the
+ * value cannot be read or a pointer would not fit in the 64-bit field.
+ */
+static int iobase_ioctl(struct gpib_board_config *config, unsigned long arg)
+{
+	u64 base_addr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&base_addr, (void __user *)arg, sizeof(base_addr)))
+		return -EFAULT;
+
+	/* the value is handed over as a u64; make sure a pointer fits in it */
+	if (WARN_ON_ONCE(sizeof(void *) > sizeof(base_addr)))
+		return -EFAULT;
+
+	config->ibbase = base_addr;
+	return 0;
+}
+
+/*
+ * Store a user-supplied unsigned int in config->ibirq (CAP_SYS_ADMIN only).
+ *
+ * Returns 0 on success, -EPERM without the capability, or -EFAULT if the
+ * value cannot be read.
+ */
+static int irq_ioctl(struct gpib_board_config *config, unsigned long arg)
+{
+	unsigned int irq_number;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&irq_number, (void __user *)arg, sizeof(irq_number)))
+		return -EFAULT;
+
+	config->ibirq = irq_number;
+	return 0;
+}
+
+/*
+ * Store a user-supplied unsigned int in config->ibdma (CAP_SYS_ADMIN only).
+ *
+ * Returns 0 on success, -EPERM without the capability, or -EFAULT if the
+ * value cannot be read.
+ */
+static int dma_ioctl(struct gpib_board_config *config, unsigned long arg)
+{
+	unsigned int channel;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&channel, (void __user *)arg, sizeof(channel)))
+		return -EFAULT;
+
+	config->ibdma = channel;
+	return 0;
+}
+
+/*
+ * Enable or disable automatic serial polling for this open file.
+ *
+ * Reads a short flag from user space.  The per-file state lives in the board
+ * descriptor (handle 0); board->autospollers counts how many descriptors
+ * currently have it enabled.
+ *
+ * handle_to_descriptor() can return NULL (every other caller checks for it),
+ * and close_dev_ioctl() accepts handle 0, so the descriptor is validated
+ * before use instead of being dereferenced blindly.
+ *
+ * Returns 0 on success, -EFAULT if the flag cannot be read, or -EINVAL if
+ * there is no board descriptor, on a disable without a matching enable, or
+ * if the autospoller count is already zero.
+ */
+static int autospoll_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+			   unsigned long arg)
+{
+	short enable;
+	int retval;
+	struct gpib_descriptor *desc;
+
+	retval = copy_from_user(&enable, (void __user *)arg, sizeof(enable));
+	if (retval)
+		return -EFAULT;
+
+	desc = handle_to_descriptor(file_priv, 0); /* board handle is 0 */
+	if (!desc)
+		return -EINVAL;
+
+	if (enable) {
+		if (!desc->autopoll_enabled) {
+			board->autospollers++;
+			desc->autopoll_enabled = 1;
+		}
+		return 0;
+	}
+
+	if (!desc->autopoll_enabled) {
+		dev_err(board->gpib_dev, "autopoll disable requested before enable\n");
+		return -EINVAL;
+	}
+
+	desc->autopoll_enabled = 0;
+	if (board->autospollers > 0) {
+		board->autospollers--;
+		return 0;
+	}
+
+	dev_err(board->gpib_dev, "tried to set number of autospollers negative\n");
+	return -EINVAL;
+}
+
+/*
+ * Lock or unlock the board's user mutex on behalf of the calling task.
+ *
+ * Reads an int flag from user space.  Non-zero acquires board->user_mutex
+ * (interruptibly) and records current->pid as the locker; zero releases it,
+ * but only if the caller's pid matches the recorded locker.
+ *
+ * Returns 0 on success, -EFAULT if the flag cannot be read, -ERESTARTSYS if
+ * the lock wait was interrupted, or -EPERM when a task other than the locker
+ * tries to unlock.
+ */
+static int mutex_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
+		       unsigned long arg)
+{
+	int lock_mutex;
+
+	if (copy_from_user(&lock_mutex, (void __user *)arg, sizeof(lock_mutex)))
+		return -EFAULT;
+
+	if (!lock_mutex) {
+		/* unlock path: only the recorded locker may release */
+		spin_lock(&board->locking_pid_spinlock);
+		if (current->pid != board->locking_pid) {
+			dev_err(board->gpib_dev, "bug! pid %i tried to release mutex held by pid %i\n",
+				current->pid, board->locking_pid);
+			spin_unlock(&board->locking_pid_spinlock);
+			return -EPERM;
+		}
+		board->locking_pid = 0;
+		spin_unlock(&board->locking_pid_spinlock);
+
+		atomic_set(&file_priv->holding_mutex, 0);
+
+		mutex_unlock(&board->user_mutex);
+		dev_dbg(board->gpib_dev, "unlocked board mutex\n");
+		return 0;
+	}
+
+	/* lock path */
+	if (mutex_lock_interruptible(&board->user_mutex))
+		return -ERESTARTSYS;
+
+	spin_lock(&board->locking_pid_spinlock);
+	board->locking_pid = current->pid;
+	spin_unlock(&board->locking_pid_spinlock);
+
+	atomic_set(&file_priv->holding_mutex, 1);
+
+	dev_dbg(board->gpib_dev, "locked board mutex\n");
+	return 0;
+}
+
+/*
+ * Set the board's I/O timeout from an unsigned int supplied by user space.
+ *
+ * The debug message prints the value with %u; it is an unsigned int and the
+ * former %i showed large timeouts as negative numbers.
+ *
+ * Returns 0 on success or -EFAULT if the value cannot be read.
+ */
+static int timeout_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	unsigned int timeout;
+	int retval;
+
+	retval = copy_from_user(&timeout, (void __user *)arg, sizeof(timeout));
+	if (retval)
+		return -EFAULT;
+
+	board->usec_timeout = timeout;
+	dev_dbg(board->gpib_dev, "timeout set to %u usec\n", timeout);
+
+	return 0;
+}
+
+/*
+ * Apply a parallel poll configuration request from user space.
+ *
+ * Copies in a struct gpib_ppoll_config_ioctl.  If set_ist or clear_ist is
+ * given (set_ist wins when both are), board->ist is updated and pushed to
+ * the interface via parallel_poll_response().  A non-zero config value is
+ * then passed to ibppc().
+ *
+ * Returns 0 on success, -EFAULT if the struct cannot be read, or the
+ * negative error from ibppc().
+ */
+static int ppc_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_ppoll_config_ioctl cmd;
+	int err;
+
+	if (copy_from_user(&cmd, (void __user *)arg, sizeof(cmd)))
+		return -EFAULT;
+
+	if (cmd.set_ist || cmd.clear_ist) {
+		board->ist = cmd.set_ist ? 1 : 0;
+		board->interface->parallel_poll_response(board, board->ist);
+	}
+
+	if (!cmd.config)
+		return 0;
+
+	err = ibppc(board, cmd.config);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Switch local parallel poll mode on or off.
+ *
+ * Reads a short from user space; any non-zero value enables the mode.  The
+ * new state is stored in board->local_ppoll_mode and passed to the
+ * interface's local_parallel_poll_mode() hook.
+ *
+ * Returns 0 on success, -EFAULT if the value cannot be read, or -ENOENT if
+ * the interface does not provide the hook.
+ */
+static int set_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	short mode;
+
+	if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
+		return -EFAULT;
+
+	if (!board->interface->local_parallel_poll_mode)
+		return -ENOENT;
+
+	board->local_ppoll_mode = (mode != 0);
+	board->interface->local_parallel_poll_mode(board, board->local_ppoll_mode);
+
+	return 0;
+}
+
+/*
+ * Copy board->local_ppoll_mode to user space as a short.
+ *
+ * Returns 0 on success or -EFAULT if the value cannot be copied out.
+ */
+static int get_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	short mode = board->local_ppoll_mode;
+
+	if (copy_to_user((void __user *)arg, &mode, sizeof(mode)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Copy the value returned by the interface's serial_poll_status() hook to
+ * user space as an int.
+ *
+ * Returns 0 on success or -EFAULT if the value cannot be copied out.
+ */
+static int query_board_rsv_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	int status = board->interface->serial_poll_status(board);
+
+	if (copy_to_user((void __user *)arg, &status, sizeof(status)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Fill a struct gpib_board_info_ioctl from the board's current state and
+ * copy it to user space.
+ *
+ * autopolling is reported as 1 whenever board->autospollers is non-zero.
+ *
+ * Returns 0 on success or -EFAULT if the struct cannot be copied out.
+ */
+static int board_info_ioctl(const struct gpib_board *board, unsigned long arg)
+{
+	struct gpib_board_info_ioctl info = { };
+
+	info.pad = board->pad;
+	info.sad = board->sad;
+	info.parallel_poll_configuration = board->parallel_poll_configuration;
+	info.is_system_controller = board->master;
+	info.autopolling = board->autospollers ? 1 : 0;
+	info.t1_delay = board->t1_nano_sec;
+	info.ist = board->ist;
+	info.no_7_bit_eos = board->interface->no_7_bit_eos;
+
+	if (copy_to_user((void __user *)arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Read an unsigned int duration from user space and pass it to ibsic().
+ *
+ * Returns the result of ibsic(), or -EFAULT if the value cannot be read.
+ */
+static int interface_clear_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	unsigned int usec_duration;
+
+	if (copy_from_user(&usec_duration, (void __user *)arg, sizeof(usec_duration)))
+		return -EFAULT;
+
+	return ibsic(board, usec_duration);
+}
+
+/*
+ * Store the PCI bus and slot chosen by user space in @config
+ * (CAP_SYS_ADMIN only).
+ *
+ * Returns 0 on success, -EPERM without the capability, or -EFAULT if the
+ * selection cannot be read.
+ */
+static int select_pci_ioctl(struct gpib_board_config *config, unsigned long arg)
+{
+	struct gpib_select_pci_ioctl selection;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&selection, (void __user *)arg, sizeof(selection)))
+		return -EFAULT;
+
+	config->pci_slot = selection.pci_slot;
+	config->pci_bus = selection.pci_bus;
+
+	return 0;
+}
+
+/*
+ * Replace config->device_path with a path supplied by user space
+ * (CAP_SYS_ADMIN only).
+ *
+ * The request struct is staged in a vmalloc buffer and its path is forcibly
+ * NUL-terminated.  An empty path clears config->device_path.
+ *
+ * The new string is duplicated before the old one is released, and a failed
+ * duplication is reported as -ENOMEM with the old path left in place.
+ * Previously the old path was freed first and an allocation failure was
+ * silently reported as success.
+ *
+ * Returns 0 on success, -EPERM without the capability, -ENOMEM on
+ * allocation failure, or -EFAULT if the request cannot be read.
+ */
+static int select_device_path_ioctl(struct gpib_board_config *config, unsigned long arg)
+{
+	struct gpib_select_device_path_ioctl *selection;
+	char *new_path = NULL;
+	int retval;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	selection = vmalloc(sizeof(struct gpib_select_device_path_ioctl));
+	if (!selection)
+		return -ENOMEM;
+
+	if (copy_from_user(selection, (void __user *)arg,
+			   sizeof(struct gpib_select_device_path_ioctl))) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/* user space is not trusted to terminate the string */
+	selection->device_path[sizeof(selection->device_path) - 1] = '\0';
+
+	if (selection->device_path[0] != '\0') {
+		new_path = kstrdup(selection->device_path, GFP_KERNEL);
+		if (!new_path) {
+			retval = -ENOMEM;
+			goto out;
+		}
+	}
+
+	kfree(config->device_path);
+	config->device_path = new_path;
+	retval = 0;
+
+out:
+	vfree(selection);
+	return retval;
+}
+
+/* Return the event count recorded in @queue (its num_events member). */
+unsigned int num_gpib_events(const struct gpib_event_queue *queue)
+{
+	const unsigned int pending = queue->num_events;
+
+	return pending;
+}
+
+/*
+ * Append an event to the board's event queue.
+ *
+ * Takes no lock itself; push_gpib_event() calls it with the queue lock held.
+ *
+ * The queue is capped at max_num_events.  When it is full the oldest event
+ * is discarded to make room and dropped_event is set so the reader can be
+ * told that something was lost.
+ *
+ * pop_gpib_event_nolock() refuses to pop while dropped_event is set (it
+ * clears the flag and returns -EPIPE).  Setting the flag before popping, as
+ * was done previously, meant a full queue never evicted anything: the pop
+ * failed, the new event was thrown away and the flag ended up cleared, so
+ * the loss was never reported.  The flag is therefore cleared around the
+ * eviction and set afterwards.
+ *
+ * Returns 0 on success, -ENOMEM if the event cannot be allocated, or the
+ * error from pop_gpib_event_nolock() if the eviction fails.
+ */
+static int push_gpib_event_nolock(struct gpib_board *board, short event_type)
+{
+	struct gpib_event_queue *queue = &board->event_queue;
+	struct list_head *head = &queue->event_head;
+	struct gpib_event *event;
+	static const unsigned int max_num_events = 1024;
+	int retval;
+
+	if (num_gpib_events(queue) >= max_num_events) {
+		short lost_event;
+
+		/* flag must be clear for the pop to actually remove an event */
+		queue->dropped_event = 0;
+		retval = pop_gpib_event_nolock(board, queue, &lost_event);
+		queue->dropped_event = 1;
+		if (retval < 0)
+			return retval;
+	}
+
+	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	if (!event) {
+		queue->dropped_event = 1;
+		dev_err(board->gpib_dev, "failed to allocate memory for event\n");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&event->list);
+	event->event_type = event_type;
+
+	list_add_tail(&event->list, head);
+
+	queue->num_events++;
+
+	dev_dbg(board->gpib_dev, "pushed event %i, %i in queue\n",
+		(int)event_type, num_gpib_events(queue));
+
+	return 0;
+}
+
+/*
+ * Push an event onto the back of the board's event queue.
+ *
+ * The queue is modified under its spinlock with interrupts disabled.
+ * Afterwards, and regardless of whether the push succeeded, DTAS is set in
+ * board->status for EVENT_DEV_TRG and DCAS for EVENT_DEV_CLR.
+ *
+ * Returns the result of push_gpib_event_nolock().
+ */
+int push_gpib_event(struct gpib_board *board, short event_type)
+{
+	struct gpib_event_queue *queue = &board->event_queue;
+	unsigned long irq_flags;
+	int result;
+
+	spin_lock_irqsave(&queue->lock, irq_flags);
+	result = push_gpib_event_nolock(board, event_type);
+	spin_unlock_irqrestore(&queue->lock, irq_flags);
+
+	if (event_type == EVENT_DEV_TRG)
+		board->status |= DTAS;
+	if (event_type == EVENT_DEV_CLR)
+		board->status |= DCAS;
+
+	return result;
+}
+EXPORT_SYMBOL(push_gpib_event);
+
+/*
+ * Remove the oldest event from @queue and report its type.
+ *
+ * Takes no lock itself; pop_gpib_event() calls it with the queue lock held.
+ *
+ * Returns 0 with *event_type set (EVENT_NONE when the queue is empty),
+ * -EIO if the count says events exist but the list is empty, or -EPIPE
+ * (clearing the flag, removing nothing) if an event was dropped earlier.
+ * *event_type is not written on error.
+ */
+static int pop_gpib_event_nolock(struct gpib_board *board,
+				 struct gpib_event_queue *queue, short *event_type)
+{
+	struct list_head *head = &queue->event_head;
+	struct gpib_event *oldest;
+
+	if (num_gpib_events(queue) == 0) {
+		*event_type = EVENT_NONE;
+		return 0;
+	}
+
+	if (list_empty(head))
+		return -EIO;
+
+	if (queue->dropped_event) {
+		queue->dropped_event = 0;
+		return -EPIPE;
+	}
+
+	oldest = list_entry(head->next, struct gpib_event, list);
+	*event_type = oldest->event_type;
+
+	list_del(&oldest->list);
+	kfree(oldest);
+	queue->num_events--;
+
+	dev_dbg(board->gpib_dev, "popped event %i, %i in queue\n",
+		(int)*event_type, num_gpib_events(queue));
+
+	return 0;
+}
+
+/*
+ * Pop the event at the front of @queue.
+ *
+ * Wraps pop_gpib_event_nolock() in the queue spinlock with interrupts
+ * disabled and returns its result.
+ */
+int pop_gpib_event(struct gpib_board *board, struct gpib_event_queue *queue, short *event_type)
+{
+	unsigned long irq_flags;
+	int result;
+
+	spin_lock_irqsave(&queue->lock, irq_flags);
+	result = pop_gpib_event_nolock(board, queue, event_type);
+	spin_unlock_irqrestore(&queue->lock, irq_flags);
+
+	return result;
+}
+
+/*
+ * Pop the next event from the board's event queue and copy its type to user
+ * space as a short.
+ *
+ * Returns 0 on success, the negative error from pop_gpib_event(), or
+ * -EFAULT if the value cannot be copied out.
+ */
+static int event_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	short event;
+	int err;
+
+	err = pop_gpib_event(board, &board->event_queue, &event);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user((void __user *)arg, &event, sizeof(event)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Read an int flag from user space and pass it to ibrsc().
+ *
+ * Returns the result of ibrsc(), or -EFAULT if the flag cannot be read.
+ */
+static int request_system_control_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	int request_control;
+
+	if (copy_from_user(&request_control, (void __user *)arg, sizeof(request_control)))
+		return -EFAULT;
+
+	return ibrsc(board, request_control);
+}
+
+/*
+ * Set the T1 delay through the interface's t1_delay() hook.
+ *
+ * Reads an unsigned int from user space and hands it to the hook; the
+ * non-negative value the hook returns is stored in board->t1_nano_sec.
+ *
+ * Returns 0 on success, -ENOENT if the interface has no t1_delay() hook,
+ * -EFAULT if the value cannot be read, or the negative error from the hook.
+ */
+static int t1_delay_ioctl(struct gpib_board *board, unsigned long arg)
+{
+	unsigned int delay;
+	int applied;
+
+	if (!board->interface->t1_delay)
+		return -ENOENT;
+
+	if (copy_from_user(&delay, (void __user *)arg, sizeof(delay)))
+		return -EFAULT;
+
+	applied = board->interface->t1_delay(board, delay);
+	if (applied < 0)
+		return applied;
+
+	board->t1_nano_sec = applied;
+	return 0;
+}
+
+/*
+ * File operations for the "gpib" character device.  Native and compat
+ * ioctls share the same handler; seeking is not supported.
+ */
+static const struct file_operations ib_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ibopen,
+	.release	= ibclose,
+	.unlocked_ioctl	= ibioctl,
+	.compat_ioctl	= ibioctl,
+	.llseek		= NULL,
+};
+
+struct gpib_board board_array[GPIB_MAX_NUM_BOARDS];	/* one slot per board; index == minor */
+
+LIST_HEAD(registered_drivers);	/* gpib_interface_list entries added by gpib_register_driver() */
+
+/*
+ * Reset a descriptor to its defaults: device (not board) descriptor at
+ * primary address 0, secondary address -1, autopoll off and no I/O in
+ * progress.
+ */
+void init_gpib_descriptor(struct gpib_descriptor *desc)
+{
+	atomic_set(&desc->io_in_progress, 0);
+	desc->autopoll_enabled = 0;
+	desc->is_board = 0;
+	desc->sad = -1;
+	desc->pad = 0;
+}
+
+/*
+ * Add @interface, together with the module that provides it, to the list of
+ * registered drivers.
+ *
+ * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
+ */
+int gpib_register_driver(struct gpib_interface *interface, struct module *provider_module)
+{
+	struct gpib_interface_list *node = kmalloc(sizeof(*node), GFP_KERNEL);
+
+	if (!node)
+		return -ENOMEM;
+
+	node->module = provider_module;
+	node->interface = interface;
+	list_add(&node->list, &registered_drivers);
+
+	return 0;
+}
+EXPORT_SYMBOL(gpib_register_driver);
+
+/*
+ * Remove @interface from the driver list.
+ *
+ * Every board currently bound to the interface is taken offline and detached
+ * first (with a warning if it is still in use), then every list entry that
+ * refers to the interface is unlinked and freed.
+ */
+void gpib_unregister_driver(struct gpib_interface *interface)
+{
+	struct gpib_interface_list *entry, *next;
+	struct gpib_board *board;
+
+	for (board = board_array; board < board_array + GPIB_MAX_NUM_BOARDS; board++) {
+		if (board->interface != interface)
+			continue;
+
+		if (board->use_count > 0)
+			pr_warn("gpib: Warning: deregistered interface %s in use\n",
+				interface->name);
+		iboffline(board);
+		board->interface = NULL;
+	}
+
+	/* entries are freed while walking, hence the _safe iterator */
+	list_for_each_entry_safe(entry, next, &registered_drivers, list) {
+		if (entry->interface != interface)
+			continue;
+
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+EXPORT_SYMBOL(gpib_unregister_driver);
+
+/*
+ * Zero a board configuration and set pci_bus and pci_slot to -1; the PCI
+ * lookup helpers treat negative values as "match any".
+ */
+static void init_gpib_board_config(struct gpib_board_config *config)
+{
+	memset(config, 0, sizeof(*config));
+
+	config->pci_slot = -1;
+	config->pci_bus = -1;
+}
+
+/*
+ * Put a struct gpib_board into its initial, unattached state: no interface,
+ * no I/O buffer, locks/waitqueue/timer initialised, empty device list and
+ * event queue, and default addressing and timeout values.
+ */
+void init_gpib_board(struct gpib_board *board)
+{
+	board->interface = NULL;
+	board->provider_module = NULL;
+	board->buffer = NULL;
+	board->buffer_length = 0;
+	board->status = 0;
+	/* synchronisation primitives */
+	init_waitqueue_head(&board->wait);
+	mutex_init(&board->user_mutex);
+	mutex_init(&board->big_gpib_mutex);
+	board->locking_pid = 0;
+	spin_lock_init(&board->locking_pid_spinlock);
+	spin_lock_init(&board->spinlock);
+	/* timer is set up without a callback here */
+	timer_setup(&board->timer, NULL, 0);
+	board->dev = NULL;
+	board->gpib_dev = NULL;
+	init_gpib_board_config(&board->config);
+	board->private_data = NULL;
+	board->use_count = 0;
+	INIT_LIST_HEAD(&board->device_list);
+	/* default address: primary 0, secondary -1 */
+	board->pad = 0;
+	board->sad = -1;
+	/* default I/O timeout: 3000000 usec (3 seconds) */
+	board->usec_timeout = 3000000;
+	board->parallel_poll_configuration = 0;
+	board->online = 0;
+	board->autospollers = 0;
+	board->autospoll_task = NULL;
+	init_event_queue(&board->event_queue);
+	/* minor stays -1 until a caller assigns one (see init_board_array()) */
+	board->minor = -1;
+	init_gpib_pseudo_irq(&board->pseudo_irq);
+	board->master = 1;
+	atomic_set(&board->stuck_srq, 0);
+	board->local_ppoll_mode = 0;
+}
+
+/*
+ * Make sure the board has an I/O buffer, allocating a 0x4000-byte one with
+ * vmalloc() if none exists yet.
+ *
+ * Returns 0 if a buffer is available or -ENOMEM if the allocation fails (in
+ * which case buffer_length is reset to 0).
+ */
+int gpib_allocate_board(struct gpib_board *board)
+{
+	if (board->buffer)
+		return 0;
+
+	board->buffer_length = 0x4000;
+	board->buffer = vmalloc(board->buffer_length);
+	if (board->buffer)
+		return 0;
+
+	board->buffer_length = 0;
+	return -ENOMEM;
+}
+
+/*
+ * Undo gpib_allocate_board(): free the I/O buffer if there is one, then
+ * drain the board's event queue by popping until it reports no events.
+ */
+void gpib_deallocate_board(struct gpib_board *board)
+{
+	struct gpib_event_queue *queue = &board->event_queue;
+	short discarded;
+
+	if (board->buffer) {
+		vfree(board->buffer);
+		board->buffer_length = 0;
+		board->buffer = NULL;
+	}
+
+	while (num_gpib_events(queue) != 0)
+		pop_gpib_event(board, queue, &discarded);
+}
+
+/*
+ * Initialise @length boards starting at @board_array and give each one a
+ * minor number equal to its index.
+ */
+static void init_board_array(struct gpib_board *board_array, unsigned int length)
+{
+	int i = 0;
+
+	while (i < length) {
+		struct gpib_board *board = &board_array[i];
+
+		init_gpib_board(board);
+		board->minor = i;
+		i++;
+	}
+}
+
+/*
+ * Initialise a status queue entry: both list heads empty, all counters and
+ * the dropped_byte flag zero.
+ */
+void init_gpib_status_queue(struct gpib_status_queue *device)
+{
+	device->reference_count = 0;
+	device->num_status_bytes = 0;
+	device->dropped_byte = 0;
+
+	INIT_LIST_HEAD(&device->status_bytes);
+	INIT_LIST_HEAD(&device->list);
+}
+
+static struct class *gpib_class;	/* created in module init, destroyed in module exit */
+
+/*
+ * Module init: initialise the board array, register the "gpib" character
+ * device on major GPIB_CODE, create the "gpib_common" class and one
+ * "gpib%i" device node per board.
+ *
+ * device_create() returns an ERR_PTR on failure.  Its result is now checked
+ * so an error pointer is never stored in gpib_dev (which the dev_dbg() and
+ * dev_err() calls elsewhere in this file use as their device), and
+ * everything set up so far is torn down again before the error is returned.
+ *
+ * Returns 0 on success, -EIO if the character device cannot be registered,
+ * or the error from class_create()/device_create().
+ */
+static int __init gpib_common_init_module(void)
+{
+	int retval;
+	int i;
+
+	pr_info("GPIB core driver\n");
+	init_board_array(board_array, GPIB_MAX_NUM_BOARDS);
+	if (register_chrdev(GPIB_CODE, "gpib", &ib_fops)) {
+		pr_err("gpib: can't get major %d\n", GPIB_CODE);
+		return -EIO;
+	}
+	gpib_class = class_create("gpib_common");
+	if (IS_ERR(gpib_class)) {
+		pr_err("gpib: failed to create gpib class\n");
+		retval = PTR_ERR(gpib_class);
+		goto err_unregister_chrdev;
+	}
+	for (i = 0; i < GPIB_MAX_NUM_BOARDS; ++i) {
+		struct device *dev;
+
+		dev = device_create(gpib_class, NULL, MKDEV(GPIB_CODE, i), NULL, "gpib%i", i);
+		if (IS_ERR(dev)) {
+			pr_err("gpib: failed to create device gpib%i\n", i);
+			retval = PTR_ERR(dev);
+			goto err_destroy_devices;
+		}
+		board_array[i].gpib_dev = dev;
+	}
+
+	return 0;
+
+err_destroy_devices:
+	/* i is the index that failed; unwind the ones created before it */
+	while (--i >= 0) {
+		device_destroy(gpib_class, MKDEV(GPIB_CODE, i));
+		board_array[i].gpib_dev = NULL;
+	}
+	class_destroy(gpib_class);
+err_unregister_chrdev:
+	unregister_chrdev(GPIB_CODE, "gpib");
+	return retval;
+}
+
+/*
+ * Module exit: destroy the per-board device nodes, then the class, then
+ * release the character device registration.
+ */
+static void __exit gpib_common_exit_module(void)
+{
+	int minor = 0;
+
+	while (minor < GPIB_MAX_NUM_BOARDS) {
+		device_destroy(gpib_class, MKDEV(GPIB_CODE, minor));
+		minor++;
+	}
+
+	class_destroy(gpib_class);
+	unregister_chrdev(GPIB_CODE, "gpib");
+}
+
+/*
+ * Check whether @dev's kobject path equals @device_path_in.
+ *
+ * A NULL @device_path_in means "no constraint" and always matches.
+ *
+ * The error message now ends in a newline like every other log message in
+ * this file, and the path buffer is freed in one place.
+ *
+ * Returns 1 on a match (or when no path was requested), 0 on a mismatch or
+ * if the device path cannot be obtained.
+ */
+int gpib_match_device_path(struct device *dev, const char *device_path_in)
+{
+	char *device_path;
+	int matches;
+
+	if (!device_path_in)
+		return 1;
+
+	device_path = kobject_get_path(&dev->kobj, GFP_KERNEL);
+	if (!device_path) {
+		dev_err(dev, "kobject_get_path returned NULL.\n");
+		return 0;
+	}
+
+	matches = strcmp(device_path_in, device_path) == 0;
+	kfree(device_path);
+
+	return matches;
+}
+EXPORT_SYMBOL(gpib_match_device_path);
+
+/*
+ * Find the next PCI device with the given vendor/device id that also
+ * satisfies the constraints in @config.
+ *
+ * The search continues after @from, using pci_get_device().  A candidate is
+ * accepted when its bus number and slot match config->pci_bus and
+ * config->pci_slot (negative values match anything) and
+ * gpib_match_device_path() accepts config->device_path.
+ *
+ * Returns the matching device, or NULL when the search is exhausted.
+ */
+struct pci_dev *gpib_pci_get_device(const struct gpib_board_config *config, unsigned int vendor_id,
+				    unsigned int device_id, struct pci_dev *from)
+{
+	struct pci_dev *candidate;
+
+	for (candidate = pci_get_device(vendor_id, device_id, from); candidate;
+	     candidate = pci_get_device(vendor_id, device_id, candidate)) {
+		if ((config->pci_bus < 0 || config->pci_bus == candidate->bus->number) &&
+		    (config->pci_slot < 0 || config->pci_slot == PCI_SLOT(candidate->devfn)) &&
+		    gpib_match_device_path(&candidate->dev, config->device_path) != 0)
+			return candidate;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(gpib_pci_get_device);
+
+/*
+ * Find the next PCI device with the given vendor/device and subsystem ids
+ * that also satisfies the constraints in @config.
+ *
+ * The search continues after @from, using pci_get_subsys().  A candidate is
+ * accepted when its bus number and slot match config->pci_bus and
+ * config->pci_slot (negative values match anything) and
+ * gpib_match_device_path() accepts config->device_path.
+ *
+ * Returns the matching device, or NULL when the search is exhausted.
+ */
+struct pci_dev *gpib_pci_get_subsys(const struct gpib_board_config *config, unsigned int vendor_id,
+				    unsigned int device_id, unsigned int ss_vendor,
+				    unsigned int ss_device,
+				    struct pci_dev *from)
+{
+	struct pci_dev *candidate;
+
+	for (candidate = pci_get_subsys(vendor_id, device_id, ss_vendor, ss_device, from);
+	     candidate;
+	     candidate = pci_get_subsys(vendor_id, device_id, ss_vendor, ss_device, candidate)) {
+		if ((config->pci_bus < 0 || config->pci_bus == candidate->bus->number) &&
+		    (config->pci_slot < 0 || config->pci_slot == PCI_SLOT(candidate->devfn)) &&
+		    gpib_match_device_path(&candidate->dev, config->device_path) != 0)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+module_init(gpib_common_init_module);
+module_exit(gpib_common_exit_module);
+
diff --git a/drivers/gpib/common/iblib.c b/drivers/gpib/common/iblib.c
new file mode 100644
index 000000000000..7cbb6a467177
--- /dev/null
+++ b/drivers/gpib/common/iblib.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *    copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define dev_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ibsys.h"
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+/*
+ * IBCAC
+ * Leave controller standby and become active controller again, i.e.
+ * assert ATN.  Going from controller idle to controller active is
+ * not done here; that requires ibsic.
+ *
+ * sync:              non-zero asks for a synchronous take-control.
+ * fallback_to_async: non-zero retries asynchronously when the
+ *                    synchronous attempt ends with -ETIMEDOUT.
+ *
+ * Returns -EINVAL when the board is not CIC, 0 when ATN is already
+ * asserted, otherwise the result of the take_control() callback.
+ */
+int ibcac(struct gpib_board *board, int sync, int fallback_to_async)
+{
+	const int board_status = ibstatus(board);
+	int ret;
+
+	if (!(board_status & CIC))
+		return -EINVAL;
+	if (board_status & ATN)
+		return 0;
+
+	/*
+	 * tcs (take control synchronously) can only work while the
+	 * controller is a listener.  Report that case as -ETIMEDOUT,
+	 * the one error code that allows the async fallback below.
+	 */
+	if (sync && !(board_status & LACS))
+		ret = -ETIMEDOUT;
+	else
+		ret = board->interface->take_control(board, sync);
+
+	if (fallback_to_async && sync && ret == -ETIMEDOUT)
+		ret = board->interface->take_control(board, 0);
+
+	board->interface->update_status(board, 0);
+	return ret;
+}
+
+/*
+ * Asserting ATN should make every connected device leave the acceptor
+ * idle state and listen for command bytes.  So when ATN is asserted
+ * and neither NDAC nor NRFD is asserted, no devices are present and
+ * ibcmd should fail right away (-ENOTCONN).
+ * Some gpib hardware sees itself asserting NDAC/NRFD while it is
+ * controller in charge; there this check does nothing useful (but
+ * should not cause any harm either).
+ * Drivers that do not need the check (ni_usb for example) may set
+ * skip_check_for_command_acceptors in their interface to avoid the
+ * useless overhead.
+ */
+static int check_for_command_acceptors(struct gpib_board *board)
+{
+	int bus;
+
+	if (board->interface->skip_check_for_command_acceptors ||
+	    !board->interface->line_status)
+		return 0;
+
+	udelay(2); // allow time for devices to respond to ATN if it was just asserted
+
+	bus = board->interface->line_status(board);
+	if (bus < 0)
+		return bus;
+
+	/* the handshake lines only count when the driver reports both as valid */
+	if (!(bus & VALID_NRFD) || !(bus & VALID_NDAC))
+		return 0;
+	if (bus & (BUS_NRFD | BUS_NDAC))
+		return 0;
+
+	return -ENOTCONN;
+}
+
+/*
+ * IBCMD
+ * Write 'length' command bytes from buf to the GPIB.  The
+ * command operation terminates only on I/O complete.
+ *
+ * NOTE:
+ *      1.  Prior to beginning the command, the interface is
+ *          placed in the controller active state.
+ *      2.  Before calling ibcmd for the first time, ibsic
+ *          must be called to initialize the GPIB and enable
+ *          the interface to leave the controller idle state.
+ *      3.  Returns -EINVAL when the board is not CIC and
+ *          -ETIMEDOUT when io_timed_out() reports a timeout.
+ */
+int ibcmd(struct gpib_board *board, u8 *buf, size_t length, size_t *bytes_written)
+{
+	ssize_t result;
+
+	*bytes_written = 0;
+
+	if (!(ibstatus(board) & CIC))
+		return -EINVAL;
+
+	os_start_timer(board, board->usec_timeout);
+
+	/* each step runs only if every earlier one returned 0 */
+	result = ibcac(board, 1, 1);
+	if (result == 0)
+		result = check_for_command_acceptors(board);
+	if (result == 0)
+		result = board->interface->command(board, buf, length, bytes_written);
+
+	os_remove_timer(board);
+
+	/* a timeout overrides whatever the steps above returned */
+	if (io_timed_out(board))
+		return -ETIMEDOUT;
+
+	return result;
+}
+
+/*
+ * IBGTS
+ * Go to the controller standby state from the controller
+ * active state, i.e., turn ATN off.
+ * Returns -EINVAL if the board is not CIC, else go_to_standby()'s result.
+ */
+int ibgts(struct gpib_board *board)
+{
+	int ret;
+
+	if (!(ibstatus(board) & CIC))
+		return -EINVAL;
+
+	ret = board->interface->go_to_standby(board);
+
+	/* let the driver update its status afterwards; no bits are cleared */
+	board->interface->update_status(board, 0);
+
+	return ret;
+}
+
+/* Wake condition of the autospoll thread, evaluated under big_gpib_mutex. */
+static int autospoll_wait_should_wake_up(struct gpib_board *board)
+{
+	int wake = 0;
+
+	mutex_lock(&board->big_gpib_mutex);
+	/* SRQI is only consumed once all the other conditions hold */
+	if (board->master && board->autospollers > 0 && !atomic_read(&board->stuck_srq))
+		wake = test_and_clear_bit(SRQI_NUM, &board->status) != 0;
+	mutex_unlock(&board->big_gpib_mutex);
+
+	return wake;
+}
+
+/* Kthread body: run one autopoll pass each time the wake condition is satisfied. */
+static int autospoll_thread(void *board_void)
+{
+	struct gpib_board *board = board_void;
+	int retval = 0, still_wanted;
+
+	dev_dbg(board->gpib_dev, "entering autospoll thread\n");
+
+	for (;;) {
+		wait_event_interruptible(board->wait,
+					 kthread_should_stop() ||
+					 autospoll_wait_should_wake_up(board));
+		dev_dbg(board->gpib_dev, "autospoll wait satisfied\n");
+		if (kthread_should_stop())
+			break;
+
+		/* make sure we are still good after we have lock */
+		mutex_lock(&board->big_gpib_mutex);
+		still_wanted = board->autospollers > 0 && board->master != 0;
+		mutex_unlock(&board->big_gpib_mutex);
+		if (!still_wanted)
+			continue;
+
+		/* NOTE(review): if try_module_get() fails, retval is stale below */
+		if (!try_module_get(board->provider_module)) {
+			dev_err(board->gpib_dev, "try_module_get() failed!\n");
+		} else {
+			retval = autopoll_all_devices(board);
+			module_put(board->provider_module);
+		}
+		if (retval <= 0) {
+			dev_err(board->gpib_dev, "stuck SRQ\n");
+			atomic_set(&board->stuck_srq, 1);	// XXX could be better
+			set_bit(SRQI_NUM, &board->status);
+		}
+	}
+	return retval;
+}
+
+/* Attach the driver and start the autospoll thread; returns 0 or a negative errno. */
+int ibonline(struct gpib_board *board)
+{
+	int retval;
+
+	if (board->online)
+		return -EBUSY;
+	if (!board->interface)
+		return -ENODEV;
+	retval = gpib_allocate_board(board);
+	if (retval < 0)
+		return retval;
+
+	board->dev = NULL;
+	board->local_ppoll_mode = 0;
+	retval = board->interface->attach(board, &board->config);
+	if (retval < 0) {
+		board->interface->detach(board);
+		return retval;
+	}
+	/* nios2nommu on 2.6.11 uclinux: the autospoll thread caused huge slowdowns */
+#ifndef CONFIG_NIOS2
+	board->autospoll_task = kthread_run(&autospoll_thread, board,
+					    "gpib%d_autospoll_kthread", board->minor);
+	if (IS_ERR(board->autospoll_task)) {
+		/* hand back the real errno, not IS_ERR()'s boolean, and drop the ERR_PTR */
+		retval = PTR_ERR(board->autospoll_task);
+		board->autospoll_task = NULL;
+		dev_err(board->gpib_dev, "failed to create autospoll thread\n");
+		board->interface->detach(board);
+		return retval;
+	}
+#endif
+	board->online = 1;
+	dev_dbg(board->gpib_dev, "board online\n");
+
+	return 0;
+}
+
+/*
+ * Stop the autospoll thread, detach the driver and mark the board offline.
+ * XXX need to make sure board is generally not in use (grab board lock?)
+ */
+int iboffline(struct gpib_board *board)
+{
+	if (!board->online)
+		return 0;
+	if (!board->interface)
+		return -ENODEV;
+	if (!IS_ERR_OR_NULL(board->autospoll_task)) {
+		int stop_result = kthread_stop(board->autospoll_task);
+
+		if (stop_result)
+			dev_err(board->gpib_dev, "kthread_stop returned %i\n", stop_result);
+		board->autospoll_task = NULL;
+	}
+
+	board->interface->detach(board);
+	gpib_deallocate_board(board);
+	board->online = 0;
+	dev_dbg(board->gpib_dev, "board offline\n");
+	return 0;
+}
+
+/*
+ * IBLINES
+ * Poll the GPIB control lines and return their status in *lines.
+ *
+ *      LSB (bits 0-7)  -  VALID lines mask (lines that can be monitored).
+ * Next LSB (bits 8-15) - STATUS lines mask (lines that are currently set).
+ *
+ * *lines stays 0 if the driver has no line_status() or that call fails.
+ */
+int iblines(const struct gpib_board *board, short *lines)
+{
+	int bus_state;
+
+	*lines = 0;
+	if (!board->interface->line_status)
+		return 0;
+	bus_state = board->interface->line_status(board);
+	if (bus_state >= 0)
+		*lines = bus_state;
+	return bus_state < 0 ? bus_state : 0;
+}
+
+/*
+ * IBRD
+ * Read up to 'length' bytes of data from the GPIB into buf.  End
+ * on detection of END (EOI and or EOS) and set 'end_flag'.
+ * The number of bytes read is accumulated in *nbytes.
+ * NOTE:
+ *      1.  The interface is placed in the controller standby
+ *          state prior to beginning the read.
+ *      2.  Prior to calling ibrd, the intended devices as well
+ *          as the interface board itself must be addressed by
+ *          calling ibcmd.
+ */
+int ibrd(struct gpib_board *board, u8 *buf, size_t length, int *end_flag, size_t *nbytes)
+{
+	ssize_t ret = 0;
+	size_t chunk;
+
+	*nbytes = 0;
+	*end_flag = 0;
+	if (length == 0)
+		return 0;
+
+	if (board->master) {
+		int standby = ibgts(board);
+
+		if (standby < 0)
+			return standby;
+	}
+	/*
+	 * XXX resetting the timer here could make timeouts take longer than they should,
+	 * since read_ioctl calls this in a loop; writes/commands probably have the same problem
+	 */
+	os_start_timer(board, board->usec_timeout);
+
+	for (;;) {
+		ret = board->interface->read(board, buf, length - *nbytes, end_flag, &chunk);
+		if (ret < 0)
+			break;
+
+		buf += chunk;
+		*nbytes += chunk;
+		if (need_resched())
+			schedule();
+		/* done on non-zero status, nothing read, a full buffer or END */
+		if (ret != 0 || *nbytes == 0 || *nbytes >= length || *end_flag != 0)
+			break;
+	}
+	os_remove_timer(board);
+
+	return ret;
+}
+
+/*
+ * IBRPP
+ * Conduct a parallel poll and return the byte in *result.
+ *
+ * NOTE:
+ *	1.  Prior to conducting the poll the interface is placed
+ *	    in the controller active state; -1 is returned if that fails.
+ */
+int ibrpp(struct gpib_board *board, u8 *result)
+{
+	int retval;
+
+	os_start_timer(board, board->usec_timeout);
+	if (ibcac(board, 1, 1))
+		retval = -1;	/* historical error value, kept for callers */
+	else
+		retval = board->interface->parallel_poll(board, result);
+	/* stop the timer on every path, including a failed ibcac() */
+	os_remove_timer(board);
+
+	return retval;
+}
+
+int ibppc(struct gpib_board *board, u8 configuration)
+{
+	const u8 ppc_bits = configuration & 0x1f;	/* only the low five bits are kept */
+
+	board->interface->parallel_poll_configure(board, ppc_bits);
+	board->parallel_poll_configuration = ppc_bits;
+	return 0;
+}
+
+/* Set the serial poll status byte; returns -EINVAL while the board is CIC. */
+int ibrsv2(struct gpib_board *board, u8 status_byte, int new_reason_for_service)
+{
+	const unsigned int mss = status_byte & request_service_bit;
+
+	if (ibstatus(board) & CIC)
+		return -EINVAL;
+	/* new_reason_for_service is rejected unless the request bit is set */
+	if (!mss && new_reason_for_service)
+		return -EINVAL;
+
+	if (board->interface->serial_poll_response2) {
+		board->interface->serial_poll_response2(board, status_byte, new_reason_for_service);
+		return 0;
+	}
+
+	/* fall back on simpler serial_poll_response if the behavior would be the same */
+	if (!board->interface->serial_poll_response || (mss && !new_reason_for_service))
+		return -EOPNOTSUPP;
+
+	board->interface->serial_poll_response(board, status_byte);
+	return 0;
+}
+
+/*
+ * IBSIC
+ * Send IFC for usec_duration microseconds, clamped to 100..1000.
+ *
+ * NOTE:
+ *	1.  Ibsic must be called prior to the first call to
+ *	    ibcmd in order to initialize the bus and enable the
+ *	    interface to leave the controller idle state.
+ *	2.  Fails with -EINVAL unless board->master is set.
+ */
+int ibsic(struct gpib_board *board, unsigned int usec_duration)
+{
+	unsigned int pulse_usec;
+
+	if (!board->master)
+		return -EINVAL;
+
+	pulse_usec = clamp(usec_duration, 100U, 1000U);
+
+	dev_dbg(board->gpib_dev, "sending interface clear, delay = %ius\n", pulse_usec);
+	board->interface->interface_clear(board, 1);
+	udelay(pulse_usec);
+	board->interface->interface_clear(board, 0);
+
+	return 0;
+}
+
+/* Request or release system control; board->master is only updated on success. */
+int ibrsc(struct gpib_board *board, int request_control)
+{
+	int err;
+
+	if (!board->interface->request_system_control)
+		return -EPERM;
+
+	err = board->interface->request_system_control(board, request_control);
+	if (!err) {
+		/* record the outcome of the request as a 0/1 flag */
+		board->master = !!request_control;
+	}
+
+	return err;
+}
+
+/*
+ * IBSRE
+ * Send REN true if enable is non-zero or false if enable is zero.
+ * Fails with -EINVAL unless board->master is set.
+ */
+int ibsre(struct gpib_board *board, int enable)
+{
+	if (!board->master)
+		return -EINVAL;
+
+	board->interface->remote_enable(board, enable);
+	if (enable == 0)	/* sleep 100-150 us after clearing REN */
+		usleep_range(100, 150);
+	return 0;
+}
+
+/*
+ * IBPAD
+ * Change the primary GPIB address of the interface board.  The address
+ * must be 0 through 30.  ibonl resets the address to PAD.
+ */
+int ibpad(struct gpib_board *board, unsigned int addr)
+{
+	if (addr > MAX_GPIB_PRIMARY_ADDRESS)
+		return -EINVAL;
+	board->pad = addr;
+	/* the driver is only told about the address while the board is online */
+	if (board->online)
+		board->interface->primary_address(board, board->pad);
+	dev_dbg(board->gpib_dev, "set primary addr to %i\n", board->pad);
+	return 0;
+}
+
+/*
+ * IBSAD
+ * Change the secondary GPIB address of the interface board.  A negative
+ * address disables secondary addressing.  ibonl resets the address to SAD.
+ * NOTE(review): the historical comment said 0 through 30, but the check
+ * accepts up to MAX_GPIB_SECONDARY_ADDRESS (31) - confirm the intended limit.
+ */
+int ibsad(struct gpib_board *board, int addr)
+{
+	if (addr > MAX_GPIB_SECONDARY_ADDRESS)
+		return -EINVAL;
+	board->sad = addr;
+	if (board->online) {
+		const int enabled = board->sad >= 0;
+
+		board->interface->secondary_address(board, enabled ? board->sad : 0, enabled);
+	}
+	dev_dbg(board->gpib_dev, "set secondary addr to %i\n", board->sad);
+
+	return 0;
+}
+
+/*
+ * IBEOS
+ * Configure end-of-string handling from eosflags.
+ *
+ * With REOS set, the driver's enable_eos() is called for byte 'eos' with
+ * (eosflags & BIN) as its last argument; without REOS, disable_eos() is
+ * called.  Any flag outside EOS_MASK is rejected with -EINVAL.
+ */
+int ibeos(struct gpib_board *board, int eos, int eosflags)
+{
+	if (eosflags & ~EOS_MASK)
+		return -EINVAL;
+
+	if (eosflags & REOS)
+		return board->interface->enable_eos(board, eos, eosflags & BIN);
+
+	board->interface->disable_eos(board);
+	return 0;
+}
+
+int ibstatus(struct gpib_board *board)
+{
+	return general_ibstatus(board, NULL, 0, 0, NULL);	/* no device, masks or descriptor */
+}
+
+/*
+ * Build the status word for a board: driver status (TIMO masked, SRQI taken from the
+ * bus lines when valid), RQS for a device queue, CMPL for a descriptor, EVENT for the board.
+ */
+int general_ibstatus(struct gpib_board *board, const struct gpib_status_queue *device,
+		     int clear_mask, int set_mask, struct gpib_descriptor *desc)
+{
+	int status = 0;
+	short bus;
+
+	if (board->private_data) {
+		/*
+		 * XXX should probably stop having drivers use TIMO bit in
+		 * board->status to avoid confusion
+		 */
+		status = board->interface->update_status(board, clear_mask) & ~TIMO;
+		/* get real SRQI status if we can */
+		if (iblines(board, &bus) == 0 && (bus & VALID_SRQ)) {
+			if (bus & BUS_SRQ)
+				status |= SRQI;
+			else
+				status &= ~SRQI;
+		}
+	}
+	if (device && num_status_bytes(device))
+		status |= RQS;
+
+	if (desc) {
+		if (set_mask & CMPL)
+			atomic_set(&desc->io_in_progress, 0);
+		else if (clear_mask & CMPL)
+			atomic_set(&desc->io_in_progress, 1);
+
+		if (atomic_read(&desc->io_in_progress))
+			status &= ~CMPL;
+		else
+			status |= CMPL;
+	}
+	if (num_gpib_events(&board->event_queue))
+		status |= EVENT;
+	else
+		status &= ~EVENT;
+
+	return status;
+}
+
+struct wait_info {
+	struct gpib_board *board;	/* board whose wait queue wait_timeout() wakes */
+	struct timer_list timer;	/* on-stack timer; its callback is wait_timeout() */
+	int timed_out;			/* set to 1 by wait_timeout() */
+	unsigned long usec_timeout;	/* in microseconds; 0 means the timer is never armed */
+};
+
+static void wait_timeout(struct timer_list *t)
+{
+	struct wait_info *info = timer_container_of(info, t, timer);
+
+	info->timed_out = 1;	/* wait_satisfied() reports this as TIMO */
+	wake_up_interruptible(&info->board->wait);
+}
+
+static void init_wait_info(struct wait_info *winfo)
+{
+	timer_setup_on_stack(&winfo->timer, wait_timeout, 0);	/* on-stack timer */
+	winfo->timed_out = 0;
+	winfo->board = NULL;	/* ibwait() sets the board before arming the timer */
+}
+
+/*
+ * Wait condition for ibwait(): non-zero ends the wait.  On a match the status
+ * is stored in *status.  NOTE(review): -ERESTARTSYS from the lock attempt is
+ * non-zero too, so it also ends the wait without writing *status - confirm.
+ */
+static int wait_satisfied(struct wait_info *winfo, struct gpib_status_queue *status_queue,
+			  int wait_mask, int *status, struct gpib_descriptor *desc)
+{
+	struct gpib_board *board = winfo->board;
+	int snapshot;
+
+	if (mutex_lock_interruptible(&board->big_gpib_mutex))
+		return -ERESTARTSYS;
+	snapshot = general_ibstatus(board, status_queue, 0, 0, desc);
+	mutex_unlock(&board->big_gpib_mutex);
+
+	/* TIMO here reflects the wait timer only */
+	snapshot = winfo->timed_out ? (snapshot | TIMO) : (snapshot & ~TIMO);
+	if (!(wait_mask & snapshot))
+		return 0;	// XXX does wait for END work?
+
+	*status = snapshot;
+	return 1;
+}
+
+/*
+ * Arm the wait timer for winfo->usec_timeout microseconds; a timeout of 0 arms nothing.
+ */
+static void start_wait_timer(struct wait_info *winfo)
+{
+	winfo->timed_out = 0;
+	if (winfo->usec_timeout)
+		mod_timer(&winfo->timer, jiffies + usec_to_jiffies(winfo->usec_timeout));
+}
+
+static void remove_wait_timer(struct wait_info *winfo)
+{
+	timer_delete_sync(&winfo->timer);	/* stop the timer first ... */
+	timer_destroy_on_stack(&winfo->timer);	/* ... then undo timer_setup_on_stack() */
+}
+
+/*
+ * IBWAIT
+ * Check or wait for a GPIB event to occur.  wait_mask is a bit vector matching
+ * the status bit vector, with a bit set for each condition that can terminate
+ * the wait; if it is 0 no condition is waited for and *status is just updated.
+ * Locking: board->big_gpib_mutex is dropped for the wait and re-taken only on
+ * success; both -ERESTARTSYS returns leave it unlocked (presumably held on entry).
+ */
+int ibwait(struct gpib_board *board, int wait_mask, int clear_mask, int set_mask,
+	   int *status, unsigned long usec_timeout, struct gpib_descriptor *desc)
+{
+	int retval = 0;
+	struct gpib_status_queue *status_queue;
+	struct wait_info winfo;
+
+	if (desc->is_board)
+		status_queue = NULL;
+	else
+		status_queue = get_gpib_status_queue(board, desc->pad, desc->sad);
+
+	if (wait_mask == 0) {
+		*status = general_ibstatus(board, status_queue, clear_mask, set_mask, desc);
+		return 0;
+	}
+
+	mutex_unlock(&board->big_gpib_mutex);
+
+	init_wait_info(&winfo);
+	winfo.board = board;
+	winfo.usec_timeout = usec_timeout;
+	start_wait_timer(&winfo);
+
+	if (wait_event_interruptible(board->wait, wait_satisfied(&winfo, status_queue,
+								 wait_mask, status, desc))) {
+		dev_dbg(board->gpib_dev, "wait interrupted\n");
+		retval = -ERESTARTSYS;
+	}
+	remove_wait_timer(&winfo);
+
+	if (retval)
+		return retval;
+	if (mutex_lock_interruptible(&board->big_gpib_mutex))
+		return -ERESTARTSYS;
+
+	/* make sure we only clear status bits that we are reporting */
+	if (*status & clear_mask || set_mask)
+		general_ibstatus(board, status_queue, *status & clear_mask, set_mask, NULL);
+
+	return 0;
+}
+
+/*
+ * IBWRT
+ * Write cnt bytes of data from buf to the GPIB.  The write
+ * operation terminates only on I/O complete.
+ * *bytes_written is zeroed first, so it is valid on every return path.
+ * NOTE:
+ *      1.  Prior to beginning the write, the interface is
+ *          placed in the controller standby state.
+ *      2.  Prior to calling ibwrt, the intended devices as
+ *          well as the interface board itself must be
+ *          addressed by calling ibcmd.
+ */
+int ibwrt(struct gpib_board *board, u8 *buf, size_t cnt, int send_eoi, size_t *bytes_written)
+{
+	int ret;
+
+	/* like ibcmd/ibrd: never leave the caller's count unset on an early return */
+	*bytes_written = 0;
+	if (cnt == 0)
+		return 0;
+
+	if (board->master) {
+		ret = ibgts(board);
+		if (ret < 0)
+			return ret;
+	}
+	os_start_timer(board, board->usec_timeout);
+	ret = board->interface->write(board, buf, cnt, send_eoi, bytes_written);
+
+	if (io_timed_out(board))
+		ret = -ETIMEDOUT;
+	os_remove_timer(board);
+
+	return ret;
+}
+
diff --git a/drivers/gpib/common/ibsys.h b/drivers/gpib/common/ibsys.h
new file mode 100644
index 000000000000..e5a148f513a8
--- /dev/null
+++ b/drivers/gpib/common/ibsys.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "gpibP.h"
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm/irq.h>
+#include <asm/dma.h>
+
+#define MAX_GPIB_PRIMARY_ADDRESS 30	/* largest address ibpad() accepts */
+#define MAX_GPIB_SECONDARY_ADDRESS 31	/* largest address ibsad() accepts */
+
+int gpib_allocate_board(struct gpib_board *board);
+void gpib_deallocate_board(struct gpib_board *board);
+
+unsigned int num_status_bytes(const struct gpib_status_queue *dev);
+int push_status_byte(struct gpib_board *board, struct gpib_status_queue *device,
+		     u8 poll_byte);
+int pop_status_byte(struct gpib_board *board, struct gpib_status_queue *device,
+		    u8 *poll_byte);
+struct gpib_status_queue *get_gpib_status_queue(struct gpib_board *board,
+						unsigned int pad, int sad);
+int get_serial_poll_byte(struct gpib_board *board, unsigned int pad, int sad,
+			 unsigned int usec_timeout, u8 *poll_byte);
+int autopoll_all_devices(struct gpib_board *board);
diff --git a/drivers/gpib/eastwood/Makefile b/drivers/gpib/eastwood/Makefile
new file mode 100644
index 000000000000..384825195f77
--- /dev/null
+++ b/drivers/gpib/eastwood/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_GPIB_FLUKE) += fluke_gpib.o
+
diff --git a/drivers/gpib/eastwood/fluke_gpib.c b/drivers/gpib/eastwood/fluke_gpib.c
new file mode 100644
index 000000000000..3ae848e3f738
--- /dev/null
+++ b/drivers/gpib/eastwood/fluke_gpib.c
@@ -0,0 +1,1180 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * GPIB Driver for Fluke cda devices.  Basically, it's a driver for a (bugfixed)
+ * cb7210 connected to channel 0 of a pl330 dma controller.
+ *    Author: Frank Mori Hess <fmh6jj@gmail.com>
+ *   copyright: (C) 2006, 2010, 2015 Fluke Corporation
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "fluke_gpib.h"
+
+#include "gpibP.h"
+#include <linux/dma-mapping.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB Driver for Fluke cda devices");
+
+static int fluke_attach_holdoff_all(struct gpib_board *board,
+				    const struct gpib_board_config *config);
+static int fluke_attach_holdoff_end(struct gpib_board *board,
+				    const struct gpib_board_config *config);
+static void fluke_detach(struct gpib_board *board);
+static int fluke_config_dma(struct gpib_board *board, int output);
+static irqreturn_t fluke_gpib_internal_interrupt(struct gpib_board *board);
+
+static struct platform_device *fluke_gpib_pdev;
+
+static u8 fluke_locking_read_byte(struct nec7210_priv *nec_priv, unsigned int register_number)
+{
+	unsigned long irq_flags;
+	u8 value;	/* register contents, read under register_page_lock */
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, irq_flags);
+	value = fluke_read_byte_nolock(nec_priv, register_number);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, irq_flags);
+	return value;
+}
+
+static void fluke_locking_write_byte(struct nec7210_priv *nec_priv, u8 byte,
+				     unsigned int register_number)
+{
+	unsigned long irq_flags;	/* saved IRQ state for register_page_lock */
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, irq_flags);
+	fluke_write_byte_nolock(nec_priv, byte, register_number);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, irq_flags);
+}
+
+/* The interface callbacks below forward to the nec7210 library with this board's chip state. */
+static int fluke_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+		      size_t *bytes_read)
+{
+	struct fluke_priv *e_priv = board->private_data;
+
+	return nec7210_read(board, &e_priv->nec7210_priv, buffer, length, end, bytes_read);
+}
+
+static int fluke_write(struct gpib_board *board, u8 *buffer, size_t length,
+		       int send_eoi, size_t *bytes_written)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_write(board, &e_priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
+}
+
+static int fluke_command(struct gpib_board *board, u8 *buffer,
+			 size_t length, size_t *bytes_written)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_command(board, &e_priv->nec7210_priv, buffer, length, bytes_written);
+}
+
+static int fluke_take_control(struct gpib_board *board, int synchronous)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_take_control(board, &e_priv->nec7210_priv, synchronous);
+}
+
+static int fluke_go_to_standby(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_go_to_standby(board, &e_priv->nec7210_priv);
+}
+
+/* Forward a system-control request or release to the nec7210 library. */
+static int fluke_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct fluke_priv *e_priv = board->private_data;
+
+	return nec7210_request_system_control(board, &e_priv->nec7210_priv, request_control);
+}
+
+static void fluke_interface_clear(struct gpib_board *board, int assert)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_interface_clear(board, &e_priv->nec7210_priv, assert);
+}
+
+static void fluke_remote_enable(struct gpib_board *board, int enable)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_remote_enable(board, &e_priv->nec7210_priv, enable);
+}
+
+static int fluke_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_enable_eos(board, &e_priv->nec7210_priv, eos_byte, compare_8_bits);
+}
+
+static void fluke_disable_eos(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_disable_eos(board, &e_priv->nec7210_priv);
+}
+
+static unsigned int fluke_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_update_status(board, &e_priv->nec7210_priv, clear_mask);
+}
+
+static int fluke_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_primary_address(board, &e_priv->nec7210_priv, address);
+}
+
+static int fluke_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_secondary_address(board, &e_priv->nec7210_priv, address, enable);
+}
+
+static int fluke_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_parallel_poll(board, &e_priv->nec7210_priv, result);
+}
+
+static void fluke_parallel_poll_configure(struct gpib_board *board, u8 configuration)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_parallel_poll_configure(board, &e_priv->nec7210_priv, configuration);
+}
+
+static void fluke_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_parallel_poll_response(board, &e_priv->nec7210_priv, ist);
+}
+
+static void fluke_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	nec7210_serial_poll_response(board, &e_priv->nec7210_priv, status);
+}
+
+static u8 fluke_serial_poll_status(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;	/* holds the nec7210 state */
+
+	return nec7210_serial_poll_status(board, &e_priv->nec7210_priv);
+}
+
+static void fluke_return_to_local(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *chip = &e_priv->nec7210_priv;
+
+	write_byte(chip, AUX_RTL2, AUXMR);	/* two auxiliary commands, 1 us apart */
+	udelay(1);
+	write_byte(chip, AUX_RTL, AUXMR);
+}
+
+/* A cleared bit in the BUS_STATUS register sets the matching BUS_* flag in the result. */
+static int fluke_line_status(const struct gpib_board *board)
+{
+	static const struct {
+		int bsr_bit;
+		int bus_line;
+	} line_map[] = {
+		{ BSR_REN_BIT, BUS_REN },
+		{ BSR_IFC_BIT, BUS_IFC },
+		{ BSR_SRQ_BIT, BUS_SRQ },
+		{ BSR_EOI_BIT, BUS_EOI },
+		{ BSR_NRFD_BIT, BUS_NRFD },
+		{ BSR_NDAC_BIT, BUS_NDAC },
+		{ BSR_DAV_BIT, BUS_DAV },
+		{ BSR_ATN_BIT, BUS_ATN },
+	};
+	struct fluke_priv *e_priv = board->private_data;
+	int bsr_bits = fluke_paged_read_byte(e_priv, BUS_STATUS, BUS_STATUS_PAGE);
+	int status = VALID_ALL;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(line_map); i++) {
+		/* VALID_ALL is always reported; only the BUS_* bits vary */
+		if ((bsr_bits & line_map[i].bsr_bit) == 0)
+			status |= line_map[i].bus_line;
+	}
+
+	return status;
+}
+
+/*
+ * Set the T1 delay through the nec7210 library, then write AUX_HI_SPEED for requests
+ * of 350 ns or less (reporting 350) and AUX_LO_SPEED otherwise.
+ */
+static int fluke_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	const bool hi_speed = nano_sec <= 350;
+	unsigned int delay_ns = nec7210_t1_delay(board, nec_priv, nano_sec);
+
+	write_byte(nec_priv, hi_speed ? AUX_HI_SPEED : AUX_LO_SPEED, AUXMR);
+	if (hi_speed)
+		delay_ns = 350;
+	return delay_ns;
+}
+
+static int lacs_or_read_ready(struct gpib_board *board)
+{
+	const struct fluke_priv *e_priv = board->private_data;
+	const struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long irq_flags;
+	int ready;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);	/* both bits read under the lock */
+	ready = test_bit(LACS_NUM, &board->status) || test_bit(READ_READY_BN, &nec_priv->state);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	return ready;
+}
+
+/*
+ * Sleep until a read can do something useful.  Not essential; it only keeps
+ * the RFD holdoff from being released pointlessly.
+ */
+static int wait_for_read(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int interrupted, timed_out;
+
+	interrupted = wait_event_interruptible(board->wait,
+					       lacs_or_read_ready(board) ||
+					       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	timed_out = test_bit(TIMO_NUM, &board->status);
+	/* precedence: device clear, then timeout, then interrupted sleep */
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+	if (timed_out)
+		return -ETIMEDOUT;
+	return interrupted ? -ERESTARTSYS : 0;
+}
+
+/*
+ * Check if the SH state machine is in SGNS.  We check twice since there is a very small chance
+ * we could be blowing through SGNS from SIDS to SDYS if there is already a
+ * byte available in the handshake state machine.  We are interested
+ * in the case where the handshake is stuck in SGNS due to no byte being
+ * available to the chip (and thus we can be confident a dma transfer will
+ * result in at least one byte making it into the chip).  This matters
+ * because we want to be confident before sending a "send eoi" auxiliary
+ * command that we will be able to also put the associated data byte
+ * in the chip before any potential timeout.
+ */
+static int source_handshake_is_sgns(struct fluke_priv *e_priv)
+{
+	unsigned int sh_bits;
+	int pass;
+
+	for (pass = 0; pass < 2; pass++) {
+		sh_bits = fluke_paged_read_byte(e_priv, STATE1_REG, STATE1_PAGE);
+		if ((sh_bits & SOURCE_HANDSHAKE_MASK) != SOURCE_HANDSHAKE_SGNS_BITS)
+			return 0;
+	}
+	return 1;
+}
+
+static int source_handshake_is_sids_or_sgns(struct fluke_priv *e_priv)
+{
+	unsigned int sh_state = fluke_paged_read_byte(e_priv, STATE1_REG, STATE1_PAGE);
+
+	sh_state &= SOURCE_HANDSHAKE_MASK;
+	/* one register read; either of the two states counts */
+	if (sh_state == SOURCE_HANDSHAKE_SGNS_BITS)
+		return 1;
+	return sh_state == SOURCE_HANDSHAKE_SIDS_BITS;
+}
+
+/*
+ * Wait until the gpib chip is ready to accept a data out byte.
+ * If the chip is SGNS it is probably waiting for a byte to
+ * be written to it.
+ */
+static int wait_for_data_out_ready(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int interrupted, timed_out;
+
+	interrupted = wait_event_interruptible(board->wait,
+					       (test_bit(TACS_NUM, &board->status) &&
+						source_handshake_is_sgns(e_priv)) ||
+					       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	timed_out = test_bit(TIMO_NUM, &board->status);
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+	if (timed_out)
+		return -ETIMEDOUT;
+	return interrupted ? -ERESTARTSYS : 0;
+}
+
+static int wait_for_sids_or_sgns(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int interrupted, timed_out;
+
+	interrupted = wait_event_interruptible(board->wait,
+					       source_handshake_is_sids_or_sgns(e_priv) ||
+					       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	timed_out = test_bit(TIMO_NUM, &board->status);
+	/* precedence: device clear, then timeout, then interrupted sleep */
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+	if (timed_out)
+		return -ETIMEDOUT;
+	return interrupted ? -ERESTARTSYS : 0;
+}
+
+static void fluke_dma_callback(void *arg)
+{
+	struct gpib_board *board = arg;
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	/* dma descriptor callback: set HR_DOIE | HR_DIIE in IMR1 and wake board->wait */
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE | HR_DIIE, HR_DOIE | HR_DIIE);
+	wake_up_interruptible(&board->wait);
+	/* call the interrupt handler directly (spinlock held), then clear both DMA flags */
+	fluke_gpib_internal_interrupt(board);
+	clear_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
+	clear_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+}
+
+/*
+ * Send @length bytes from @buffer over the bus with one DMA transfer.
+ *
+ * The data is first copied into the driver's bounce buffer
+ * (e_priv->dma_buffer), so @length must not exceed
+ * e_priv->dma_buffer_size.
+ *
+ * @board:         board to write to
+ * @buffer:        data to send
+ * @length:        number of bytes to send
+ * @bytes_written: set to the value of the hardware write transfer counter
+ *                 (masked with write_transfer_counter_mask) after the
+ *                 transfer; 0 on early failure
+ *
+ * Returns 0 on success, -EFAULT if @length is too large or the transfer
+ * counter reports more bytes than requested, -ENOMEM if the buffer cannot
+ * be mapped or no descriptor is available, -ETIMEDOUT, -EINTR (device
+ * clear), -EIO (bus error), -ERESTARTSYS, or the error from
+ * fluke_config_dma().
+ */
+static int fluke_dma_write(struct gpib_board *board, u8 *buffer, size_t length,
+			   size_t *bytes_written)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long flags;
+	int retval = 0;
+	dma_addr_t address;
+	struct dma_async_tx_descriptor *tx_desc;
+
+	*bytes_written = 0;
+
+	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
+		return -EFAULT;
+	dmaengine_terminate_all(e_priv->dma_channel);
+	// write-clear counter
+	writel(0x0, e_priv->write_transfer_counter);
+
+	memcpy(e_priv->dma_buffer, buffer, length);
+	address = dma_map_single(board->dev, e_priv->dma_buffer,
+				 length, DMA_TO_DEVICE);
+	/* nothing has been mapped on failure, so do not go through cleanup */
+	if (dma_mapping_error(board->dev, address)) {
+		dev_err(board->gpib_dev, "failed to map dma buffer\n");
+		return -ENOMEM;
+	}
+	/* program dma controller */
+	retval = fluke_config_dma(board, 1);
+	if (retval)
+		goto cleanup;
+
+	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, address, length, DMA_MEM_TO_DEV,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!tx_desc) {
+		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
+		retval = -ENOMEM;
+		goto cleanup;
+	}
+	tx_desc->callback = fluke_dma_callback;
+	tx_desc->callback_param = board;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
+	dmaengine_submit(tx_desc);
+	dma_async_issue_pending(e_priv->dma_channel);
+
+	clear_bit(WRITE_READY_BN, &nec_priv->state);
+	set_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	// suspend until message is sent
+	if (wait_event_interruptible(board->wait,
+				     ((readl(e_priv->write_transfer_counter) &
+				       write_transfer_counter_mask) == length) ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status))) {
+		retval = -ERESTARTSYS;
+	}
+	/* later checks override earlier ones: bus error > device clear > timeout */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
+		retval = -EIO;
+	// disable board's dma
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
+
+	dmaengine_terminate_all(e_priv->dma_channel);
+	// make sure fluke_dma_callback got called
+	if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state))
+		fluke_dma_callback(board);
+
+	/*
+	 * if everything went fine, try to wait until last byte is actually
+	 * transmitted across gpib (but don't try _too_ hard)
+	 */
+	if (retval == 0)
+		retval = wait_for_sids_or_sgns(board);
+
+	*bytes_written = readl(e_priv->write_transfer_counter) & write_transfer_counter_mask;
+	/* fall through to cleanup even on a bogus count so the mapping is released */
+	if (WARN_ON_ONCE(*bytes_written > length))
+		retval = -EFAULT;
+
+cleanup:
+	dma_unmap_single(board->dev, address, length, DMA_TO_DEVICE);
+	return retval;
+}
+
+/*
+ * DMA-accelerated write.
+ *
+ * Sends @length bytes from @buffer in chunks of at most
+ * e_priv->dma_buffer_size via fluke_dma_write().  When @send_eoi is
+ * non-zero the final byte is held back and sent in a separate transfer
+ * after issuing AUX_SEOI.
+ *
+ * @bytes_written: total number of bytes reported written by
+ *                 fluke_dma_write(); always initialised, also on failure
+ *
+ * Returns 0 on success, -ENXIO if no dma channel was obtained at attach
+ * time, -EFAULT on an internal accounting error, or the error returned by
+ * wait_for_data_out_ready() / fluke_dma_write().
+ */
+static int fluke_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
+			     int send_eoi, size_t *bytes_written)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t remainder = length;
+	size_t transfer_size;
+	int retval = 0;
+	size_t dma_remainder = remainder;
+
+	/* report zero bytes on every early exit, including the no-dma case */
+	*bytes_written = 0;
+
+	if (!e_priv->dma_channel) {
+		dev_err(board->gpib_dev, "No dma channel available, cannot do accel write.\n");
+		return -ENXIO;
+	}
+
+	if (length < 1)
+		return 0;
+
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+
+	/* the last byte is sent separately, together with EOI */
+	if (send_eoi)
+		--dma_remainder;
+
+	while (dma_remainder > 0) {
+		size_t num_bytes;
+
+		retval = wait_for_data_out_ready(board);
+		if (retval < 0)
+			break;
+
+		transfer_size = (e_priv->dma_buffer_size < dma_remainder) ?
+			e_priv->dma_buffer_size : dma_remainder;
+		retval = fluke_dma_write(board, buffer, transfer_size, &num_bytes);
+		/* count partial progress even when the transfer failed */
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			break;
+		dma_remainder -= num_bytes;
+		remainder -= num_bytes;
+		buffer += num_bytes;
+		if (need_resched())
+			schedule();
+	}
+	if (retval < 0)
+		return retval;
+	// handle sending of last byte with eoi
+	if (send_eoi) {
+		size_t num_bytes;
+
+		if (WARN_ON_ONCE(remainder != 1))
+			return -EFAULT;
+
+		/*
+		 * wait until we are sure we will be able to write the data byte
+		 * into the chip before we send AUX_SEOI.  This prevents a timeout
+		 * scenario where we send AUX_SEOI but then timeout without getting
+		 * any bytes into the gpib chip.  This will result in the first byte
+		 * of the next write having a spurious EOI set on the first byte.
+		 */
+		retval = wait_for_data_out_ready(board);
+		if (retval < 0)
+			return retval;
+
+		write_byte(nec_priv, AUX_SEOI, AUXMR);
+		retval = fluke_dma_write(board, buffer, remainder, &num_bytes);
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+/*
+ * Pause @chan and return the residue reported by dmaengine_tx_status()
+ * for @cookie, or the negative error from dmaengine_pause().
+ */
+static int fluke_get_dma_residue(struct dma_chan *chan, dma_cookie_t cookie)
+{
+	struct dma_tx_state tx_state;
+	int err = dmaengine_pause(chan);
+
+	if (err < 0) {
+		pr_err("dma pause failed?\n");
+		return err;
+	}
+
+	/*
+	 * hardware doesn't support resume, so dont call this
+	 * method unless the dma transfer is done.
+	 */
+	dmaengine_tx_status(chan, cookie, &tx_state);
+	return tx_state.residue;
+}
+
+/*
+ * Receive up to @length bytes with one DMA transfer.
+ *
+ * Data lands in the driver's bounce buffer (e_priv->dma_buffer) and is
+ * copied to @buffer afterwards.  The caller must keep @length within
+ * e_priv->dma_buffer_size.
+ *
+ * @board:      board to read from
+ * @buffer:     destination for the received bytes
+ * @length:     maximum number of bytes to receive
+ * @end:        set to 1 if RECEIVED_END_BN was set and no byte is left
+ *              waiting (READ_READY_BN clear), otherwise 0
+ * @bytes_read: set to the number of bytes transferred
+ *              (@length minus the dma residue)
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be mapped, -EIO if
+ * no descriptor is available, -EFAULT if the dma engine reports an
+ * impossible residue, -ETIMEDOUT, -EINTR (device clear), -ERESTARTSYS, or
+ * the error from fluke_config_dma().
+ */
+static int fluke_dma_read(struct gpib_board *board, u8 *buffer,
+			  size_t length, int *end, size_t *bytes_read)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int retval = 0;
+	unsigned long flags;
+	int residue;
+	bool bad_residue;
+	dma_addr_t bus_address;
+	struct dma_async_tx_descriptor *tx_desc;
+	dma_cookie_t dma_cookie;
+	int i;
+	static const int timeout = 10;
+
+	*bytes_read = 0;
+	*end = 0;
+	if (length == 0)
+		return 0;
+
+	bus_address = dma_map_single(board->dev, e_priv->dma_buffer,
+				     length, DMA_FROM_DEVICE);
+	if (dma_mapping_error(board->dev, bus_address)) {
+		dev_err(board->gpib_dev, "failed to map dma buffer\n");
+		return -ENOMEM;
+	}
+
+	/* program dma controller */
+	retval = fluke_config_dma(board, 0);
+	if (retval) {
+		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+		return retval;
+	}
+	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel,
+					      bus_address, length, DMA_DEV_TO_MEM,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!tx_desc) {
+		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
+		/* unmap with the same device the buffer was mapped for */
+		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+		return -EIO;
+	}
+	tx_desc->callback = fluke_dma_callback;
+	tx_desc->callback_param = board;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	// enable nec7210 dma
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
+
+	dma_cookie = dmaengine_submit(tx_desc);
+	dma_async_issue_pending(e_priv->dma_channel);
+
+	set_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
+	clear_bit(READ_READY_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	// wait for data to transfer
+	if (wait_event_interruptible(board->wait,
+				     test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state) == 0 ||
+				     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status))) {
+		retval = -ERESTARTSYS;
+	}
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+
+	/*
+	 * If we woke up because of end, wait until the dma transfer has pulled
+	 * the data byte associated with the end before we cancel the dma transfer.
+	 */
+	if (test_bit(RECEIVED_END_BN, &nec_priv->state)) {
+		for (i = 0; i < timeout; ++i) {
+			if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state) == 0)
+				break;
+			if ((read_byte(nec_priv, ADR0) & DATA_IN_STATUS) == 0)
+				break;
+			usleep_range(10, 15);
+		}
+		if (i == timeout)
+			pr_warn("fluke_gpib: timeout waiting for dma to transfer end data byte.\n");
+	}
+
+	// stop the dma transfer
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+	/*
+	 * delay a little just to make sure any bytes in dma controller's fifo get
+	 * written to memory before we disable it
+	 */
+	usleep_range(10, 15);
+	residue = fluke_get_dma_residue(e_priv->dma_channel, dma_cookie);
+	/*
+	 * A bogus residue is fatal for this transfer, but the channel must
+	 * still be stopped and the buffer unmapped before bailing out.
+	 */
+	bad_residue = WARN_ON_ONCE(residue > length || residue < 0);
+	if (!bad_residue)
+		*bytes_read += length - residue;
+	dmaengine_terminate_all(e_priv->dma_channel);
+	// make sure fluke_dma_callback got called
+	if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state))
+		fluke_dma_callback(board);
+
+	dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+	if (bad_residue)
+		return -EFAULT;
+	memcpy(buffer, e_priv->dma_buffer, *bytes_read);
+
+	/*
+	 * If we got an end interrupt, figure out if it was
+	 * associated with the last byte we dma'd or with a
+	 * byte still sitting on the cb7210.
+	 */
+	spin_lock_irqsave(&board->spinlock, flags);
+	if (test_bit(READ_READY_BN, &nec_priv->state) == 0) {
+		/*
+		 * There is no byte sitting on the cb7210.  If we
+		 * saw an end interrupt, we need to deal with it now
+		 */
+		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
+			*end = 1;
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return retval;
+}
+
+/*
+ * DMA-accelerated read.
+ *
+ * Waits for wait_for_read(), releases the RFD holdoff, then pulls in up
+ * to @length bytes in chunks of at most e_priv->dma_buffer_size using
+ * fluke_dma_read().  Stops early when fluke_dma_read() reports END or
+ * fails.
+ *
+ * @end:        set non-zero when fluke_dma_read() reported END
+ * @bytes_read: total number of bytes stored in @buffer
+ *
+ * Returns the status of the last step performed (negative on error).
+ */
+static int fluke_accel_read(struct gpib_board *board, u8 *buffer, size_t length,
+			    int *end, size_t *bytes_read)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t bytes_left;
+	int status;
+
+	*end = 0;
+	*bytes_read = 0;
+
+	smp_mb__before_atomic();
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+	smp_mb__after_atomic();
+
+	status = wait_for_read(board);
+	if (status < 0)
+		return status;
+
+	nec7210_release_rfd_holdoff(board, nec_priv);
+
+	for (bytes_left = length; bytes_left > 0; ) {
+		size_t chunk = bytes_left;
+		size_t got;
+
+		/* never ask for more than the bounce buffer can hold */
+		if (e_priv->dma_buffer_size < bytes_left)
+			chunk = e_priv->dma_buffer_size;
+
+		status = fluke_dma_read(board, buffer, chunk, end, &got);
+		bytes_left -= got;
+		buffer += got;
+		*bytes_read += got;
+		if (*end || status < 0)
+			break;
+		if (need_resched())
+			schedule();
+	}
+
+	return status;
+}
+
+/*
+ * "fluke_unaccel" interface: reads and writes go through fluke_read() and
+ * fluke_write() rather than the dma-based fluke_accel_*() routines.
+ * Attaches with fluke_attach_holdoff_all().
+ */
+static struct gpib_interface fluke_unaccel_interface = {
+	.name = "fluke_unaccel",
+	.attach = fluke_attach_holdoff_all,
+	.detach = fluke_detach,
+	.read = fluke_read,
+	.write = fluke_write,
+	.command = fluke_command,
+	.take_control = fluke_take_control,
+	.go_to_standby = fluke_go_to_standby,
+	.request_system_control = fluke_request_system_control,
+	.interface_clear = fluke_interface_clear,
+	.remote_enable = fluke_remote_enable,
+	.enable_eos = fluke_enable_eos,
+	.disable_eos = fluke_disable_eos,
+	.parallel_poll = fluke_parallel_poll,
+	.parallel_poll_configure = fluke_parallel_poll_configure,
+	.parallel_poll_response = fluke_parallel_poll_response,
+	.line_status = fluke_line_status,
+	.update_status = fluke_update_status,
+	.primary_address = fluke_primary_address,
+	.secondary_address = fluke_secondary_address,
+	.serial_poll_response = fluke_serial_poll_response,
+	.serial_poll_status = fluke_serial_poll_status,
+	.t1_delay = fluke_t1_delay,
+	.return_to_local = fluke_return_to_local,
+};
+
+/*
+ * fluke_hybrid uses dma for writes but not for reads.  Added
+ * to deal with occasional corruption of bytes seen when doing dma
+ * reads.  From looking at the cb7210 vhdl, I believe the corruption
+ * is due to a hardware bug triggered by the cpu reading a cb7210
+ * register just as the dma controller is also doing a read.
+ */
+
+/*
+ * "fluke_hybrid" interface: .write is the dma-based fluke_accel_write()
+ * while .read stays on fluke_read().  Attaches with
+ * fluke_attach_holdoff_all().
+ */
+static struct gpib_interface fluke_hybrid_interface = {
+	.name = "fluke_hybrid",
+	.attach = fluke_attach_holdoff_all,
+	.detach = fluke_detach,
+	.read = fluke_read,
+	.write = fluke_accel_write,
+	.command = fluke_command,
+	.take_control = fluke_take_control,
+	.go_to_standby = fluke_go_to_standby,
+	.request_system_control = fluke_request_system_control,
+	.interface_clear = fluke_interface_clear,
+	.remote_enable = fluke_remote_enable,
+	.enable_eos = fluke_enable_eos,
+	.disable_eos = fluke_disable_eos,
+	.parallel_poll = fluke_parallel_poll,
+	.parallel_poll_configure = fluke_parallel_poll_configure,
+	.parallel_poll_response = fluke_parallel_poll_response,
+	.line_status = fluke_line_status,
+	.update_status = fluke_update_status,
+	.primary_address = fluke_primary_address,
+	.secondary_address = fluke_secondary_address,
+	.serial_poll_response = fluke_serial_poll_response,
+	.serial_poll_status = fluke_serial_poll_status,
+	.t1_delay = fluke_t1_delay,
+	.return_to_local = fluke_return_to_local,
+};
+
+/*
+ * "fluke" interface: both .read and .write use the dma-based
+ * fluke_accel_read() / fluke_accel_write().  Unlike the other two
+ * variants it attaches with fluke_attach_holdoff_end().
+ */
+static struct gpib_interface fluke_interface = {
+	.name = "fluke",
+	.attach = fluke_attach_holdoff_end,
+	.detach = fluke_detach,
+	.read = fluke_accel_read,
+	.write = fluke_accel_write,
+	.command = fluke_command,
+	.take_control = fluke_take_control,
+	.go_to_standby = fluke_go_to_standby,
+	.request_system_control = fluke_request_system_control,
+	.interface_clear = fluke_interface_clear,
+	.remote_enable = fluke_remote_enable,
+	.enable_eos = fluke_enable_eos,
+	.disable_eos = fluke_disable_eos,
+	.parallel_poll = fluke_parallel_poll,
+	.parallel_poll_configure = fluke_parallel_poll_configure,
+	.parallel_poll_response = fluke_parallel_poll_response,
+	.line_status = fluke_line_status,
+	.update_status = fluke_update_status,
+	.primary_address = fluke_primary_address,
+	.secondary_address = fluke_secondary_address,
+	.serial_poll_response = fluke_serial_poll_response,
+	.serial_poll_status = fluke_serial_poll_status,
+	.t1_delay = fluke_t1_delay,
+	.return_to_local = fluke_return_to_local,
+};
+
+/*
+ * Core interrupt processing.  Callers in this file invoke it with
+ * board->spinlock held (fluke_gpib_interrupt() and fluke_dma_callback()).
+ *
+ * Reads the three interrupt status registers, pushes EVENT_IFC when the
+ * FLUKE_IFCI_BIT is set in ISR0, hands ISR1/ISR2 to the generic nec7210
+ * handler and keeps READ_READY_BN in step with the DATA_IN_STATUS bit.
+ *
+ * Returns IRQ_HANDLED if either the IFC event or the nec7210 handler
+ * accounted for the interrupt, otherwise IRQ_NONE.
+ */
+irqreturn_t fluke_gpib_internal_interrupt(struct gpib_board *board)
+{
+	int status0, status1, status2;
+	struct fluke_priv *priv = board->private_data;
+	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
+	int retval = IRQ_NONE;
+
+	/* a data byte is waiting in the chip: note it before reading the ISRs */
+	if (read_byte(nec_priv, ADR0) & DATA_IN_STATUS)
+		set_bit(READ_READY_BN, &nec_priv->state);
+
+	status0 = fluke_paged_read_byte(priv, ISR0_IMR0, ISR0_IMR0_PAGE);
+	status1 = read_byte(nec_priv, ISR1);
+	status2 = read_byte(nec_priv, ISR2);
+
+	if (status0 & FLUKE_IFCI_BIT) {
+		push_gpib_event(board, EVENT_IFC);
+		retval = IRQ_HANDLED;
+	}
+
+	if (nec7210_interrupt_have_status(board, nec_priv, status1, status2) == IRQ_HANDLED)
+		retval = IRQ_HANDLED;
+
+	/*
+	 * Re-evaluate after the generic handler ran: with data still pending,
+	 * READ_READY_BN is made to mirror RFD_HOLDOFF_BN.
+	 */
+	if (read_byte(nec_priv, ADR0) & DATA_IN_STATUS)	{
+		if (test_bit(RFD_HOLDOFF_BN, &nec_priv->state))
+			set_bit(READ_READY_BN, &nec_priv->state);
+		else
+			clear_bit(READ_READY_BN, &nec_priv->state);
+	}
+
+	if (retval == IRQ_HANDLED)
+		wake_up_interruptible(&board->wait);
+
+	return retval;
+}
+
+/*
+ * Interrupt entry point, used both for request_irq() and as the pseudo-irq
+ * handler.  Takes board->spinlock and defers to
+ * fluke_gpib_internal_interrupt().
+ *
+ * @irq: unused
+ * @arg: the struct gpib_board registered with the handler
+ */
+static irqreturn_t fluke_gpib_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	unsigned long flags;
+	irqreturn_t retval;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	retval = fluke_gpib_internal_interrupt(board);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return retval;
+}
+
+/*
+ * Allocate and initialise the per-board private data together with its
+ * dma bounce buffer, and attach it to board->private_data.
+ *
+ * Returns 0 on success or -ENOMEM.  When the bounce buffer allocation
+ * fails, board->private_data stays set; fluke_generic_detach() is able to
+ * free it (NOTE(review): confirm the caller runs detach after a failed
+ * attach).
+ */
+static int fluke_allocate_private(struct gpib_board *board)
+{
+	struct fluke_priv *priv;
+
+	/* zeroed allocation, so every pointer/flag in priv starts out clear */
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+	init_nec7210_private(&priv->nec7210_priv);
+	priv->dma_buffer_size = 0x7ff;
+	priv->dma_buffer = kmalloc(priv->dma_buffer_size, GFP_KERNEL);
+	if (!priv->dma_buffer)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Free the private data allocated by fluke_allocate_private(), including
+ * the dma bounce buffer, and clear board->private_data.  Does nothing when
+ * no private data is attached.
+ */
+static void fluke_generic_detach(struct gpib_board *board)
+{
+	struct fluke_priv *priv = board->private_data;
+
+	if (!priv)
+		return;
+
+	kfree(priv->dma_buffer);
+	kfree(priv);
+	board->private_data = NULL;
+}
+
+// generic part of attach functions shared by all cb7210 boards
+/*
+ * Resets board->status, allocates the private data and wires up the
+ * nec7210 register accessors, register stride and chip type.
+ *
+ * Returns 0 on success or the error from fluke_allocate_private().
+ */
+static int fluke_generic_attach(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	board->status = 0;
+
+	retval = fluke_allocate_private(board);
+	if (retval < 0)
+		return retval;
+	e_priv = board->private_data;
+	nec_priv = &e_priv->nec7210_priv;
+	/* register access goes through the fluke locking accessors */
+	nec_priv->read_byte = fluke_locking_read_byte;
+	nec_priv->write_byte = fluke_locking_write_byte;
+	nec_priv->offset = fluke_reg_offset;
+	nec_priv->type = CB7210;
+	return 0;
+}
+
+/*
+ * Configure the slave dma channel for the next transfer.
+ *
+ * @board:  board whose dma channel is configured
+ * @output: non-zero for memory-to-device (write), zero for
+ *          device-to-memory (read)
+ *
+ * The device-side address is the start of the dma port resource; bursts
+ * and bus widths are one byte.
+ *
+ * Returns the result of dmaengine_slave_config().
+ */
+static int fluke_config_dma(struct gpib_board *board, int output)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	/*
+	 * Start from an all-zero config: the dma driver may look at fields
+	 * that are not set below, and they must not hold stack garbage.
+	 */
+	struct dma_slave_config config = {};
+
+	config.src_maxburst = 1;
+	config.dst_maxburst = 1;
+	config.device_fc = true;
+
+	if (output) {
+		config.direction = DMA_MEM_TO_DEV;
+		config.src_addr = 0;
+		config.dst_addr = e_priv->dma_port_res->start;
+		config.src_addr_width = 1;
+		config.dst_addr_width = 1;
+	} else {
+		config.direction = DMA_DEV_TO_MEM;
+		config.src_addr = e_priv->dma_port_res->start;
+		config.dst_addr = 0;
+		config.src_addr_width = 1;
+		config.dst_addr_width = 1;
+	}
+	return dmaengine_slave_config(e_priv->dma_channel, &config);
+}
+
+/*
+ * Bring the chip into its operating state: reset it, program speed and
+ * clock, select @handshake_mode, put the board online, request the
+ * pseudo-irq and finally write FLUKE_IFCIE_BIT to the paged IMR0 register.
+ *
+ * Returns 0 on success or -EINVAL if the pseudo-irq cannot be obtained.
+ */
+static int fluke_init(struct fluke_priv *e_priv, struct gpib_board *board, int handshake_mode)
+{
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_board_reset(nec_priv, board);
+	write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
+	/*
+	 * set clock register for driving frequency
+	 * ICR should be set to clock in megahertz (1-15) and to zero
+	 * for clocks faster than 15 MHz (max 20MHz)
+	 */
+	write_byte(nec_priv, ICR | 10, AUXMR);
+	nec7210_set_handshake_mode(board, nec_priv, handshake_mode);
+
+	nec7210_board_online(nec_priv, board);
+
+	/* poll so we can detect ATN changes */
+	if (gpib_request_pseudo_irq(board, fluke_gpib_interrupt)) {
+		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
+		return -EINVAL;
+	}
+
+	fluke_paged_write_byte(e_priv, FLUKE_IFCIE_BIT, ISR0_IMR0, ISR0_IMR0_PAGE);
+	return 0;
+}
+
+/*
+ * This function is passed to dma_request_channel() in order to
+ * select the pl330 dma channel which has been hardwired to
+ * the gpib controller.
+ *
+ * @filter_param is unused.  Returns true only for the channel whose
+ * chan_id is 0.
+ */
+static bool gpib_dma_channel_filter(struct dma_chan *chan, void *filter_param)
+{
+	// select the channel which is wired to the gpib chip
+	return chan->chan_id == 0;
+}
+
+/*
+ * Common attach implementation behind fluke_attach_holdoff_all() and
+ * fluke_attach_holdoff_end().
+ *
+ * Claims the three platform memory resources (0: gpib registers, 1: dma
+ * port, 2: write transfer counter), maps resources 0 and 2, requests the
+ * interrupt and a slave dma channel, then runs fluke_init().
+ *
+ * @config is not used.  @handshake_mode is passed on to fluke_init().
+ *
+ * Error paths return without undoing earlier steps; what was acquired is
+ * recorded in the private data, which fluke_detach() inspects.
+ * NOTE(review): this presumably relies on the caller invoking detach after
+ * a failed attach - confirm.
+ */
+static int fluke_attach_impl(struct gpib_board *board, const struct gpib_board_config *config,
+			     unsigned int handshake_mode)
+{
+	struct fluke_priv *e_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = 0;
+	int retval;
+	int irq;
+	struct resource *res;
+	dma_cap_mask_t dma_cap;
+
+	/* the platform device is recorded by fluke_gpib_probe() */
+	if (!fluke_gpib_pdev) {
+		dev_err(board->gpib_dev, "No fluke device was found, attach failed.\n");
+		return -ENODEV;
+	}
+
+	retval = fluke_generic_attach(board);
+	if (retval)
+		return retval;
+
+	e_priv = board->private_data;
+	nec_priv = &e_priv->nec7210_priv;
+	nec_priv->offset = fluke_reg_offset;
+	board->dev = &fluke_gpib_pdev->dev;
+
+	/* resource 0: gpib chip registers */
+	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource\n");
+		return -ENODEV;
+	}
+
+	if (request_mem_region(res->start,
+			       resource_size(res),
+			       fluke_gpib_pdev->name) == NULL) {
+		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
+		return -ENXIO;
+	}
+	e_priv->gpib_iomem_res = res;
+
+	nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start,
+				     resource_size(e_priv->gpib_iomem_res));
+	if (!nec_priv->mmiobase) {
+		dev_err(&fluke_gpib_pdev->dev, "Could not map I/O memory\n");
+		return -ENOMEM;
+	}
+
+	/* resource 1: dma port; only claimed, its start address is given to the dma engine */
+	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 1);
+	if (!res) {
+		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource for gpib dma port\n");
+		return -ENODEV;
+	}
+	if (request_mem_region(res->start,
+			       resource_size(res),
+			       fluke_gpib_pdev->name) == NULL) {
+		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
+		return -ENXIO;
+	}
+	e_priv->dma_port_res = res;
+
+	/* resource 2: write transfer counter */
+	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 2);
+	if (!res) {
+		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource for write transfer counter\n");
+		return -ENODEV;
+	}
+
+	if (request_mem_region(res->start,
+			       resource_size(res),
+			       fluke_gpib_pdev->name) == NULL) {
+		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
+		return -ENXIO;
+	}
+	e_priv->write_transfer_counter_res = res;
+
+	e_priv->write_transfer_counter = ioremap(e_priv->write_transfer_counter_res->start,
+						 resource_size(e_priv->write_transfer_counter_res));
+	if (!e_priv->write_transfer_counter) {
+		dev_err(&fluke_gpib_pdev->dev, "Could not map I/O memory\n");
+		return -ENOMEM;
+	}
+
+	irq = platform_get_irq(fluke_gpib_pdev, 0);
+	if (irq < 0)
+		return -EBUSY;
+	retval = request_irq(irq, fluke_gpib_interrupt, isr_flags, fluke_gpib_pdev->name, board);
+	if (retval) {
+		dev_err(&fluke_gpib_pdev->dev,
+			"cannot register interrupt handler err=%d\n",
+			retval);
+		return retval;
+	}
+	/* recorded only after request_irq() succeeded; detach frees it when non-zero */
+	e_priv->irq = irq;
+
+	dma_cap_zero(dma_cap);
+	dma_cap_set(DMA_SLAVE, dma_cap);
+	e_priv->dma_channel = dma_request_channel(dma_cap, gpib_dma_channel_filter, NULL);
+	if (!e_priv->dma_channel) {
+		dev_err(board->gpib_dev, "failed to allocate a dma channel.\n");
+		/*
+		 * we don't error out here because unaccel interface will still
+		 * work without dma
+		 */
+	}
+
+	return fluke_init(e_priv, board, handshake_mode);
+}
+
+/* Attach with the HR_HLDA handshake mode (used by the unaccel and hybrid interfaces). */
+int fluke_attach_holdoff_all(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	return fluke_attach_impl(board, config, HR_HLDA);
+}
+
+/* Attach with the HR_HLDE handshake mode (used by the fully accelerated "fluke" interface). */
+int fluke_attach_holdoff_end(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	return fluke_attach_impl(board, config, HR_HLDE);
+}
+
+/*
+ * Undo fluke_attach_impl(): release the dma channel and pseudo-irq, quiesce
+ * the chip, free the interrupt, unmap and release the register regions,
+ * and free the private data.
+ *
+ * Each step is guarded by the field that attach sets once the matching
+ * resource has been acquired, so this also cleans up after a partially
+ * completed attach.
+ */
+void fluke_detach(struct gpib_board *board)
+{
+	struct fluke_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (e_priv) {
+		if (e_priv->dma_channel)
+			dma_release_channel(e_priv->dma_channel);
+		gpib_free_pseudo_irq(board);
+		nec_priv = &e_priv->nec7210_priv;
+
+		if (nec_priv->mmiobase) {
+			fluke_paged_write_byte(e_priv, 0, ISR0_IMR0, ISR0_IMR0_PAGE);
+			nec7210_board_reset(nec_priv, board);
+		}
+		if (e_priv->irq)
+			free_irq(e_priv->irq, board);
+		/*
+		 * Unmap only after the irq is gone: the handler reads chip
+		 * registers through these mappings.
+		 */
+		if (e_priv->write_transfer_counter)
+			iounmap(e_priv->write_transfer_counter);
+		if (nec_priv->mmiobase)
+			iounmap(nec_priv->mmiobase);
+		if (e_priv->write_transfer_counter_res) {
+			release_mem_region(e_priv->write_transfer_counter_res->start,
+					   resource_size(e_priv->write_transfer_counter_res));
+		}
+		if (e_priv->dma_port_res) {
+			release_mem_region(e_priv->dma_port_res->start,
+					   resource_size(e_priv->dma_port_res));
+		}
+		if (e_priv->gpib_iomem_res)
+			release_mem_region(e_priv->gpib_iomem_res->start,
+					   resource_size(e_priv->gpib_iomem_res));
+	}
+	fluke_generic_detach(board);
+}
+
+/*
+ * Platform probe: only remembers the platform device in fluke_gpib_pdev
+ * for later use by fluke_attach_impl().  Always returns 0.
+ */
+static int fluke_gpib_probe(struct platform_device *pdev)
+{
+	fluke_gpib_pdev = pdev;
+	return 0;
+}
+
+/* Device-tree match table; the zeroed entry terminates the list. */
+static const struct of_device_id fluke_gpib_of_match[] = {
+	{ .compatible = "flk,fgpib-4.0"},
+	{ {0} }
+};
+MODULE_DEVICE_TABLE(of, fluke_gpib_of_match);
+
+/* Platform driver bound through fluke_gpib_of_match; no .remove callback is provided. */
+static struct platform_driver fluke_gpib_platform_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.of_match_table = fluke_gpib_of_match,
+	},
+	.probe = &fluke_gpib_probe
+};
+
+/*
+ * Module init: register the platform driver, then the three gpib
+ * interfaces (unaccel, hybrid, fluke).  On failure everything registered
+ * so far is unregistered in reverse order and the error is returned.
+ */
+static int __init fluke_init_module(void)
+{
+	int result;
+
+	result = platform_driver_register(&fluke_gpib_platform_driver);
+	if (result) {
+		pr_err("platform_driver_register failed: error = %d\n", result);
+		return result;
+	}
+
+	result = gpib_register_driver(&fluke_unaccel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_unaccel;
+	}
+
+	result = gpib_register_driver(&fluke_hybrid_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_hybrid;
+	}
+
+	result = gpib_register_driver(&fluke_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_interface;
+	}
+
+	return 0;
+
+	/* each label undoes the registrations that succeeded before the failing step */
+err_interface:
+	gpib_unregister_driver(&fluke_hybrid_interface);
+err_hybrid:
+	gpib_unregister_driver(&fluke_unaccel_interface);
+err_unaccel:
+	platform_driver_unregister(&fluke_gpib_platform_driver);
+
+	return result;
+}
+
+/* Module exit: unregister the three gpib interfaces, then the platform driver. */
+static void __exit fluke_exit_module(void)
+{
+	gpib_unregister_driver(&fluke_unaccel_interface);
+	gpib_unregister_driver(&fluke_hybrid_interface);
+	gpib_unregister_driver(&fluke_interface);
+	platform_driver_unregister(&fluke_gpib_platform_driver);
+}
+
+module_init(fluke_init_module);
+module_exit(fluke_exit_module);
diff --git a/drivers/gpib/eastwood/fluke_gpib.h b/drivers/gpib/eastwood/fluke_gpib.h
new file mode 100644
index 000000000000..493c200d0bbf
--- /dev/null
+++ b/drivers/gpib/eastwood/fluke_gpib.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *   Author: Frank Mori Hess <fmh6jj@gmail.com>
+ *   copyright: (C) 2006, 2010, 2015 Fluke Corporation
+ ***************************************************************************/
+
+#include <linux/compiler.h>
+#include <linux/dmaengine.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include "nec7210.h"
+
+/* Per-board private data of the fluke gpib driver. */
+struct fluke_priv {
+	struct nec7210_priv nec7210_priv;	/* state of the generic nec7210 layer */
+	struct resource *gpib_iomem_res;	/* claimed region of the gpib chip registers */
+	struct resource *write_transfer_counter_res;	/* claimed region of the write transfer counter */
+	struct resource *dma_port_res;	/* claimed region of the dma port; its start is the dma device address */
+	int irq;	/* interrupt number once requested, otherwise 0 */
+	struct dma_chan *dma_channel;	/* slave dma channel, NULL if none could be obtained */
+	u8 *dma_buffer;	/* bounce buffer used for all dma transfers */
+	int dma_buffer_size;	/* size of dma_buffer in bytes */
+	void __iomem *write_transfer_counter;	/* mapping of the write transfer counter register */
+};
+
+// cb7210 specific registers and bits
+/*
+ * Register numbers.  The matching *_PAGE constants in enum cb7210_page_in
+ * suggest these are reached through the paged accessors
+ * (fluke_paged_read_byte() / fluke_paged_write_byte()).
+ */
+enum cb7210_regs {
+	STATE1_REG = 0x4,
+	ISR0_IMR0 = 0x6,
+	BUS_STATUS = 0x7
+};
+
+/* Page numbers to pass as the "page" argument of the paged register accessors. */
+enum cb7210_page_in {
+	ISR0_IMR0_PAGE = 1,
+	BUS_STATUS_PAGE = 1,
+	STATE1_PAGE = 1
+};
+
+/* IMR0 -- Interrupt Mode Register 0 (written via the ISR0_IMR0 paged register) */
+enum imr0_bits {
+	FLUKE_IFCIE_BIT = 0x8,	/* interface clear interrupt */
+};
+
+/* ISR0 -- Interrupt Status Register 0 (read via the ISR0_IMR0 paged register) */
+enum isr0_bits {
+	FLUKE_IFCI_BIT = 0x8,	/* interface clear interrupt */
+};
+
+/*
+ * Source handshake state values; SOURCE_HANDSHAKE_MASK selects the field.
+ * NOTE(review): presumably read from STATE1_REG - confirm against the
+ * source_handshake_is_*() helpers.
+ */
+enum state1_bits {
+	SOURCE_HANDSHAKE_SIDS_BITS = 0x0, /* source idle state */
+	SOURCE_HANDSHAKE_SGNS_BITS = 0x1, /* source generate state */
+	SOURCE_HANDSHAKE_SDYS_BITS = 0x2, /* source delay state */
+	SOURCE_HANDSHAKE_STRS_BITS = 0x5, /* source transfer state */
+	SOURCE_HANDSHAKE_MASK = 0x7
+};
+
+/*
+ * we customized the cb7210 vhdl to give the "data in" status
+ * on the unused bit 7 of the address0 register.
+ * The driver tests it with read_byte(nec_priv, ADR0) & DATA_IN_STATUS.
+ */
+enum cb7210_address0 {
+	DATA_IN_STATUS = 0x80
+};
+
+/*
+ * Build the value written to AUXMR to select register page @page:
+ * 0x50 combined with the low four bits of @page.
+ */
+static inline int cb7210_page_in_bits(unsigned int page)
+{
+	const unsigned int page_in_cmd = 0x50;
+	const unsigned int page_mask = 0xf;
+
+	return page_in_cmd | (page & page_mask);
+}
+
+// don't use without locking nec_priv->register_page_lock
+/*
+ * Read register @register_num with a 32-bit access and return its low
+ * byte.  Registers are nec_priv->offset bytes apart.
+ */
+static inline u8 fluke_read_byte_nolock(struct nec7210_priv *nec_priv,
+					int register_num)
+{
+	return readl(nec_priv->mmiobase + register_num * nec_priv->offset);
+}
+
+// don't use without locking nec_priv->register_page_lock
+/*
+ * Write @data to register @register_num using a 32-bit access.
+ * Registers are nec_priv->offset bytes apart.
+ */
+static inline void fluke_write_byte_nolock(struct nec7210_priv *nec_priv, u8 data,
+					   int register_num)
+{
+	writel(data, nec_priv->mmiobase + register_num * nec_priv->offset);
+}
+
+/*
+ * Read a paged register: select @page through AUXMR, wait 1us, then read
+ * @register_num.  The whole sequence runs under register_page_lock so no
+ * other register access can slip in between page select and read.
+ */
+static inline u8 fluke_paged_read_byte(struct fluke_priv *e_priv,
+				       unsigned int register_num, unsigned int page)
+{
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	u8 retval;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
+	fluke_write_byte_nolock(nec_priv, cb7210_page_in_bits(page), AUXMR);
+	udelay(1);
+	/* chip auto clears the page after a read */
+	retval = fluke_read_byte_nolock(nec_priv, register_num);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
+	return retval;
+}
+
+/*
+ * Write @data to a paged register: select @page through AUXMR, wait 1us,
+ * then write @register_num, all under register_page_lock.
+ */
+static inline void fluke_paged_write_byte(struct fluke_priv *e_priv, u8 data,
+					  unsigned int register_num, unsigned int page)
+{
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
+	fluke_write_byte_nolock(nec_priv, cb7210_page_in_bits(page), AUXMR);
+	udelay(1);
+	fluke_write_byte_nolock(nec_priv, data, register_num);
+	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
+}
+
+/*
+ * One bit per gpib control/handshake line.
+ * NOTE(review): presumably the layout of the BUS_STATUS register - confirm.
+ */
+enum bus_status_bits {
+	BSR_ATN_BIT = 0x1,
+	BSR_EOI_BIT = 0x2,
+	BSR_SRQ_BIT = 0x4,
+	BSR_IFC_BIT = 0x8,
+	BSR_REN_BIT = 0x10,
+	BSR_DAV_BIT = 0x20,
+	BSR_NRFD_BIT = 0x40,
+	BSR_NDAC_BIT = 0x80,
+};
+
+/* cb7210 specific auxiliary commands (written to AUXMR). */
+enum cb7210_aux_cmds {
+/*
+ * AUX_RTL2 is an undocumented aux command which causes cb7210 to assert
+ * (and keep asserted) local rtl message.  This is used in conjunction
+ * with the (stupid) cb7210 implementation
+ * of the normal nec7210 AUX_RTL aux command, which
+ * causes the rtl message to toggle between on and off.
+ */
+	AUX_RTL2 = 0xd,
+	AUX_NBAF = 0xe,	// new byte available false (also clears seoi)
+	AUX_LO_SPEED = 0x40,	// issued by fluke_init() right after the board reset
+	AUX_HI_SPEED = 0x41,
+};
+
+/* Driver-wide constants. */
+enum {
+	fluke_reg_offset = 4,	/* byte stride between consecutive chip registers */
+	fluke_num_regs = 8,
+	write_transfer_counter_mask = 0x7ff,	/* valid bits of the write transfer counter */
+};
diff --git a/drivers/gpib/fmh_gpib/Makefile b/drivers/gpib/fmh_gpib/Makefile
new file mode 100644
index 000000000000..cc4d9e7cd5cd
--- /dev/null
+++ b/drivers/gpib/fmh_gpib/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_GPIB_FMH) += fmh_gpib.o
diff --git a/drivers/gpib/fmh_gpib/fmh_gpib.c b/drivers/gpib/fmh_gpib/fmh_gpib.c
new file mode 100644
index 000000000000..f7bfb4a8e553
--- /dev/null
+++ b/drivers/gpib/fmh_gpib/fmh_gpib.c
@@ -0,0 +1,1754 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * GPIB Driver for fmh_gpib_core, see
+ * https://github.com/fmhess/fmh_gpib_core
+ *
+ * More specifically, it is a driver for the hardware arrangement described by
+ *  src/examples/fmh_gpib_top.vhd in the fmh_gpib_core repository.
+ *
+ * Author: Frank Mori Hess <fmh6jj@gmail.com>
+ * Copyright: (C) 2006, 2010, 2015 Fluke Corporation
+ *	(C) 2017 Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "fmh_gpib.h"
+
+#include "gpibP.h"
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB Driver for fmh_gpib_core");
+MODULE_AUTHOR("Frank Mori Hess <fmh6jj@gmail.com>");
+
+/*
+ * Forward declarations.  Several of these (the interrupt handlers, the
+ * attach/detach hooks and fmh_gpib_config_dma()) are referenced by code
+ * further down before any definition has been seen.
+ */
+static irqreturn_t fmh_gpib_interrupt(int irq, void *arg);
+static int fmh_gpib_attach_holdoff_all(struct gpib_board *board,
+				       const struct gpib_board_config *config);
+static int fmh_gpib_attach_holdoff_end(struct gpib_board *board,
+				       const struct gpib_board_config *config);
+static void fmh_gpib_detach(struct gpib_board *board);
+static int fmh_gpib_pci_attach_holdoff_all(struct gpib_board *board,
+					   const struct gpib_board_config *config);
+static int fmh_gpib_pci_attach_holdoff_end(struct gpib_board *board,
+					   const struct gpib_board_config *config);
+static void fmh_gpib_pci_detach(struct gpib_board *board);
+static int fmh_gpib_config_dma(struct gpib_board *board, int output);
+static irqreturn_t fmh_gpib_internal_interrupt(struct gpib_board *board);
+static struct platform_driver fmh_gpib_platform_driver;
+static struct pci_driver fmh_gpib_pci_driver;
+
+// wrappers for interface functions
+/* Read hook: passes the request unchanged to nec7210_read(). */
+static int fmh_gpib_read(struct gpib_board *board, u8 *buffer, size_t length,
+			 int *end, size_t *bytes_read)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_read(board, nec_priv, buffer, length, end, bytes_read);
+}
+
+/* Write hook: passes the request unchanged to nec7210_write(). */
+static int fmh_gpib_write(struct gpib_board *board, u8 *buffer, size_t length,
+			  int send_eoi, size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_write(board, nec_priv, buffer, length, send_eoi, bytes_written);
+}
+
+/* Command hook: passes the request unchanged to nec7210_command(). */
+static int fmh_gpib_command(struct gpib_board *board, u8 *buffer, size_t length,
+			    size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_command(board, nec_priv, buffer, length, bytes_written);
+}
+
+/* Take-control hook: delegates to nec7210_take_control(). */
+static int fmh_gpib_take_control(struct gpib_board *board, int synchronous)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_take_control(board, nec_priv, synchronous);
+}
+
+/* Go-to-standby hook: delegates to nec7210_go_to_standby(). */
+static int fmh_gpib_go_to_standby(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_go_to_standby(board, nec_priv);
+}
+
+/* System-control hook: delegates to nec7210_request_system_control(). */
+static int fmh_gpib_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct fmh_priv *e_priv = board->private_data;
+
+	return nec7210_request_system_control(board, &e_priv->nec7210_priv,
+					      request_control);
+}
+
+/* Interface-clear hook: delegates to nec7210_interface_clear(). */
+static void fmh_gpib_interface_clear(struct gpib_board *board, int assert)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_interface_clear(board, nec_priv, assert);
+}
+
+/* Remote-enable hook: delegates to nec7210_remote_enable(). */
+static void fmh_gpib_remote_enable(struct gpib_board *board, int enable)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_remote_enable(board, nec_priv, enable);
+}
+
+/* Enable-EOS hook: delegates to nec7210_enable_eos(). */
+static int fmh_gpib_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_enable_eos(board, nec_priv, eos_byte, compare_8_bits);
+}
+
+/* Disable-EOS hook: delegates to nec7210_disable_eos(). */
+static void fmh_gpib_disable_eos(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_disable_eos(board, nec_priv);
+}
+
+/* Status hook: delegates to nec7210_update_status(). */
+static unsigned int fmh_gpib_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_update_status(board, nec_priv, clear_mask);
+}
+
+/* Primary-address hook: delegates to nec7210_primary_address(). */
+static int fmh_gpib_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_primary_address(board, nec_priv, address);
+}
+
+/* Secondary-address hook: delegates to nec7210_secondary_address(). */
+static int fmh_gpib_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_secondary_address(board, nec_priv, address, enable);
+}
+
+/* Parallel-poll hook: delegates to nec7210_parallel_poll(). */
+static int fmh_gpib_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_parallel_poll(board, nec_priv, result);
+}
+
+/* Parallel-poll configure hook: delegates to nec7210_parallel_poll_configure(). */
+static void fmh_gpib_parallel_poll_configure(struct gpib_board *board, u8 configuration)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_parallel_poll_configure(board, nec_priv, configuration);
+}
+
+/* Parallel-poll response hook: delegates to nec7210_parallel_poll_response(). */
+static void fmh_gpib_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	nec7210_parallel_poll_response(board, nec_priv, ist);
+}
+
+/*
+ * Select local or remote parallel poll configuration mode by writing the
+ * auxiliary I register through AUXMR: LOCAL_PPOLL_MODE_BIT is set when
+ * @local is nonzero and left clear otherwise.
+ */
+static void fmh_gpib_local_parallel_poll_mode(struct gpib_board *board, int local)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	/*
+	 * For fmh_gpib_core, remote parallel poll config mode is unaffected by the
+	 * state of the disable bit of the parallel poll register (unlike the tnt4882).
+	 * So, the remote case needs nothing beyond a cleared mode bit.
+	 */
+	unsigned int aux_i = AUX_I_REG;
+
+	if (local)
+		aux_i |= LOCAL_PPOLL_MODE_BIT;
+
+	write_byte(&e_priv->nec7210_priv, aux_i, AUXMR);
+}
+
+/*
+ * Update the serial poll status byte and the service request state.
+ *
+ * If the request-service bit is set in @status and @new_reason_for_service
+ * is nonzero, srq_pending is raised, SPOLL_NUM is cleared in board->status
+ * and AUX_REQT is issued.  If the request-service bit is clear, srq_pending
+ * is dropped and AUX_REQF is issued.  In every case @status with the
+ * request-service bit masked off is then written to SPMR.  The whole
+ * sequence runs under board->spinlock.
+ */
+static void fmh_gpib_serial_poll_response2(struct gpib_board *board, u8 status,
+					   int new_reason_for_service)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	const int mss = status & request_service_bit;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	if (mss && new_reason_for_service) {
+		nec_priv->srq_pending = 1;
+		clear_bit(SPOLL_NUM, &board->status);
+		/*
+		 * It may seem like a race to issue reqt before updating
+		 * the status byte, but it is not.  The chip does not
+		 * issue the reqt until the SPMR is written to at
+		 * a later time.
+		 */
+		write_byte(nec_priv, AUX_REQT, AUXMR);
+	} else if (!mss) {
+		nec_priv->srq_pending = 0;
+		write_byte(nec_priv, AUX_REQF, AUXMR);
+	}
+	/*
+	 * Bit 6 of the status byte must always be zeroed before it is written
+	 * to the SPMR, to ensure we are using serial poll mode SP1 and not
+	 * accidentally triggering mode SP3.
+	 */
+	write_byte(nec_priv, status & ~request_service_bit, SPMR);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+
+/* Serial-poll status hook: delegates to nec7210_serial_poll_status(). */
+static u8 fmh_gpib_serial_poll_status(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+
+	return nec7210_serial_poll_status(board, nec_priv);
+}
+
+/*
+ * Return-to-local hook: issues AUX_RTL2 and, one microsecond later,
+ * AUX_RTL through the auxiliary mode register.
+ */
+static void fmh_gpib_return_to_local(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+
+	write_byte(&e_priv->nec7210_priv, AUX_RTL2, AUXMR);
+	udelay(1);
+	write_byte(&e_priv->nec7210_priv, AUX_RTL, AUXMR);
+}
+
+/*
+ * Report the state of the bus control lines.
+ *
+ * Reads BUS_STATUS_REG once and returns VALID_ALL with the matching BUS_*
+ * flag or'ed in for every line whose BSR_*_BIT reads back as zero.
+ */
+static int fmh_gpib_line_status(const struct gpib_board *board)
+{
+	/* pairs each status register bit with the flag reported for it */
+	static const struct {
+		int bsr_mask;
+		int bus_flag;
+	} line_map[] = {
+		{ BSR_REN_BIT, BUS_REN },
+		{ BSR_IFC_BIT, BUS_IFC },
+		{ BSR_SRQ_BIT, BUS_SRQ },
+		{ BSR_EOI_BIT, BUS_EOI },
+		{ BSR_NRFD_BIT, BUS_NRFD },
+		{ BSR_NDAC_BIT, BUS_NDAC },
+		{ BSR_DAV_BIT, BUS_DAV },
+		{ BSR_ATN_BIT, BUS_ATN },
+	};
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int line_state = VALID_ALL;
+	int bsr_bits;
+	unsigned int i;
+
+	bsr_bits = read_byte(nec_priv, BUS_STATUS_REG);
+
+	for (i = 0; i < ARRAY_SIZE(line_map); ++i) {
+		if ((bsr_bits & line_map[i].bsr_mask) == 0)
+			line_state |= line_map[i].bus_flag;
+	}
+
+	return line_state;
+}
+
+/*
+ * Set the T1 delay.
+ *
+ * nec7210_t1_delay() is called first.  Then AUX_HI_SPEED is issued when
+ * @nano_sec is at most 350 (and 350 is returned), otherwise AUX_LO_SPEED is
+ * issued and the value from nec7210_t1_delay() is returned.
+ */
+static int fmh_gpib_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	const int high_speed = nano_sec <= 350;
+	unsigned int t1;
+
+	t1 = nec7210_t1_delay(board, nec_priv, nano_sec);
+
+	write_byte(nec_priv, high_speed ? AUX_HI_SPEED : AUX_LO_SPEED, AUXMR);
+	if (high_speed)
+		t1 = 350;
+
+	return t1;
+}
+
+/*
+ * Returns nonzero if LACS_NUM is set in board->status or READ_READY_BN is
+ * set in the nec7210 state; both bits are sampled under board->spinlock.
+ */
+static int lacs_or_read_ready(struct gpib_board *board)
+{
+	const struct fmh_priv *e_priv = board->private_data;
+	const struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long irq_flags;
+	int ready;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	ready = test_bit(LACS_NUM, &board->status) ||
+		test_bit(READ_READY_BN, &nec_priv->state);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return ready;
+}
+
+/*
+ * Sleep on board->wait until lacs_or_read_ready() is true, or DEV_CLEAR_BN
+ * or TIMO_NUM gets set.
+ *
+ * Returns 0, -ERESTARTSYS if the sleep was interrupted, -ETIMEDOUT if
+ * TIMO_NUM is set, or -EINTR if DEV_CLEAR_BN was set (the bit is cleared
+ * here).  When several apply, the one listed last wins.
+ */
+static int wait_for_read(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int ret;
+
+	ret = wait_event_interruptible(board->wait,
+				       lacs_or_read_ready(board) ||
+				       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				       test_bit(TIMO_NUM, &board->status));
+	if (ret)
+		ret = -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		ret = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		ret = -EINTR;
+	return ret;
+}
+
+/*
+ * Sleep on board->wait until FIFO_CONTROL_STATUS_REG reports
+ * RX_FIFO_HALF_FULL, or RECEIVED_END_BN, DEV_CLEAR_BN or TIMO_NUM gets set.
+ *
+ * Returns 0, -ERESTARTSYS if the sleep was interrupted, -ETIMEDOUT if
+ * TIMO_NUM is set, or -EINTR if DEV_CLEAR_BN was set (the bit is cleared
+ * here).  When several apply, the one listed last wins.
+ */
+static int wait_for_rx_fifo_half_full_or_end(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int ret;
+
+	ret = wait_event_interruptible(board->wait,
+				       (fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) &
+					RX_FIFO_HALF_FULL) ||
+				       test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+				       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				       test_bit(TIMO_NUM, &board->status));
+	if (ret)
+		ret = -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		ret = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		ret = -EINTR;
+	return ret;
+}
+
+/*
+ * Wait until the gpib chip is ready to accept a data out byte, i.e. until
+ * TACS_NUM is set in board->status and EXT_STATUS_1_REG shows
+ * DATA_OUT_STATUS_BIT, or until DEV_CLEAR_BN or TIMO_NUM gets set.
+ *
+ * Returns 0, -ERESTARTSYS if the sleep was interrupted, -ETIMEDOUT if
+ * TIMO_NUM is set, or -EINTR if DEV_CLEAR_BN was set (the bit is cleared
+ * here).  When several apply, the one listed last wins.
+ */
+static int wait_for_data_out_ready(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int ret;
+
+	ret = wait_event_interruptible(board->wait,
+				       (test_bit(TACS_NUM, &board->status) &&
+					(read_byte(nec_priv, EXT_STATUS_1_REG) &
+					 DATA_OUT_STATUS_BIT)) ||
+				       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				       test_bit(TIMO_NUM, &board->status));
+	if (ret)
+		ret = -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		ret = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		ret = -EINTR;
+
+	return ret;
+}
+
+/*
+ * DMA completion callback installed on the descriptors prepared by
+ * fmh_gpib_dma_write() and fmh_gpib_dma_read(); @arg is the gpib_board.
+ * Those two functions also call it directly, after terminating the
+ * channel, if their DMA-in-progress bit is still set.
+ */
+static void fmh_gpib_dma_callback(void *arg)
+{
+	struct gpib_board *board = arg;
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	/* set HR_DOIE and HR_DIIE in IMR1 again; the dma setup paths clear them */
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE | HR_DIIE, HR_DOIE | HR_DIIE);
+	wake_up_interruptible(&board->wait);
+
+	/* run the interrupt handler body while still holding the board lock */
+	fmh_gpib_internal_interrupt(board);
+
+	/* the waiters in the dma read/write paths test these bits */
+	clear_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
+	clear_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+}
+
+/*
+ * Returns true (1) when all the bytes of a write have been transferred to
+ * the chip and successfully transferred out over the gpib bus: the fifo
+ * transfer counter must have reached zero and EXT_STATUS_1_REG must show
+ * DATA_OUT_STATUS_BIT.  The status register is only read once the counter
+ * is zero.  Returns 0 otherwise.
+ */
+static int fmh_gpib_all_bytes_are_sent(struct fmh_priv *e_priv)
+{
+	return !(fifos_read(e_priv, FIFO_XFER_COUNTER_REG) & fifo_xfer_counter_mask) &&
+		(read_byte(&e_priv->nec7210_priv, EXT_STATUS_1_REG) & DATA_OUT_STATUS_BIT);
+}
+
+/*
+ * Send @length bytes from @buffer with a single DMA transfer.
+ *
+ * The data is copied into the driver's dma_buffer, which is mapped for the
+ * device, and the caller then sleeps until fmh_gpib_all_bytes_are_sent()
+ * is true or a bus error, device clear or timeout is flagged.
+ *
+ * @length must not exceed e_priv->dma_buffer_size.
+ * @bytes_written receives @length minus what is left in the fifo transfer
+ * counter once the transfer has been stopped.
+ *
+ * Returns 0 on success, or:
+ *   -EFAULT       @length too large, or the transfer counter is inconsistent
+ *   -ENOMEM       the buffer could not be mapped or no descriptor was available
+ *   -ERESTARTSYS  the sleep was interrupted by a signal
+ *   -ETIMEDOUT    TIMO_NUM was set
+ *   -EINTR        DEV_CLEAR_BN was set (cleared here)
+ *   -EIO          BUS_ERROR_BN was set (cleared here)
+ * or whatever fmh_gpib_config_dma() returned on failure.
+ */
+static int fmh_gpib_dma_write(struct gpib_board *board, u8 *buffer, size_t length,
+			      size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned long flags;
+	int retval = 0;
+	dma_addr_t address;
+	struct dma_async_tx_descriptor *tx_desc;
+
+	*bytes_written = 0;
+	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
+		return -EFAULT;
+	dmaengine_terminate_all(e_priv->dma_channel);
+	memcpy(e_priv->dma_buffer, buffer, length);
+	address = dma_map_single(board->dev, e_priv->dma_buffer, length, DMA_TO_DEVICE);
+	if (dma_mapping_error(board->dev, address)) {
+		/* the handle is unusable: it must be neither programmed nor unmapped */
+		dev_err(board->gpib_dev, "dma mapping error in dma write!\n");
+		return -ENOMEM;
+	}
+	/* program dma controller */
+	retval = fmh_gpib_config_dma(board, 1);
+	if (retval)
+		goto cleanup;
+
+	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, address, length, DMA_MEM_TO_DEV,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!tx_desc) {
+		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
+		retval = -ENOMEM;
+		goto cleanup;
+	}
+	tx_desc->callback = fmh_gpib_dma_callback;
+	tx_desc->callback_param = board;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
+	fifos_write(e_priv, TX_FIFO_DMA_REQUEST_ENABLE | TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
+
+	dmaengine_submit(tx_desc);
+	dma_async_issue_pending(e_priv->dma_channel);
+	clear_bit(WRITE_READY_BN, &nec_priv->state);
+	set_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	// suspend until message is sent
+	if (wait_event_interruptible(board->wait,
+				     fmh_gpib_all_bytes_are_sent(e_priv) ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+
+	/* later checks deliberately override earlier ones */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
+		retval = -EIO;
+	// disable board's dma
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
+	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+
+	dmaengine_terminate_all(e_priv->dma_channel);
+	// make sure fmh_gpib_dma_callback got called
+	if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state))
+		fmh_gpib_dma_callback(board);
+
+	*bytes_written = length - (fifos_read(e_priv, FIFO_XFER_COUNTER_REG) &
+				   fifo_xfer_counter_mask);
+	/* fall through to cleanup so the mapping is released on this path too */
+	if (WARN_ON_ONCE(*bytes_written > length))
+		retval = -EFAULT;
+cleanup:
+	dma_unmap_single(board->dev, address, length, DMA_TO_DEVICE);
+	return retval;
+}
+
+/*
+ * DMA-based write hook.
+ *
+ * Sends @length bytes from @buffer in chunks of at most
+ * e_priv->dma_buffer_size through fmh_gpib_dma_write().  When @send_eoi is
+ * set, the final byte is held back from the chunk loop and sent on its own
+ * after AUX_SEOI has been issued.  @bytes_written accumulates the counts
+ * reported by fmh_gpib_dma_write().
+ *
+ * Returns 0 on success (including @length == 0), -ENXIO if there is no dma
+ * channel, -EFAULT on an internal accounting mismatch, or the error
+ * returned by wait_for_data_out_ready() / fmh_gpib_dma_write().
+ */
+static int fmh_gpib_accel_write(struct gpib_board *board, u8 *buffer,
+				size_t length, int send_eoi, size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t remainder = length;
+	size_t transfer_size;
+	ssize_t retval = 0;
+	size_t dma_remainder = remainder;
+
+	if (!e_priv->dma_channel) {
+		dev_err(board->gpib_dev, "No dma channel available, cannot do accel write.");
+		return -ENXIO;
+	}
+
+	*bytes_written = 0;
+	if (length < 1)
+		return 0;
+
+	smp_mb__before_atomic();
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+	smp_mb__after_atomic();
+
+	/* keep the last byte out of the bulk loop; it is sent with EOI below */
+	if (send_eoi)
+		--dma_remainder;
+
+	while (dma_remainder > 0) {
+		size_t num_bytes;
+
+		retval = wait_for_data_out_ready(board);
+		if (retval < 0)
+			break;
+
+		transfer_size = (e_priv->dma_buffer_size < dma_remainder) ?
+			e_priv->dma_buffer_size : dma_remainder;
+		retval = fmh_gpib_dma_write(board, buffer, transfer_size, &num_bytes);
+		/* count whatever went out, even if the transfer failed part way */
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			break;
+		dma_remainder -= num_bytes;
+		remainder -= num_bytes;
+		buffer += num_bytes;
+		if (need_resched())
+			schedule();
+	}
+	if (retval < 0)
+		return retval;
+	// handle sending of last byte with eoi
+	if (send_eoi) {
+		size_t num_bytes;
+
+		/* exactly the held-back byte must be left at this point */
+		if (WARN_ON_ONCE(remainder != 1))
+			return -EFAULT;
+
+		/*
+		 * wait until we are sure we will be able to write the data byte
+		 * into the chip before we send AUX_SEOI.  This prevents a timeout
+		 * scenario where we send AUX_SEOI but then timeout without getting
+		 * any bytes into the gpib chip.  This will result in the first byte
+		 * of the next write having a spurious EOI set on the first byte.
+		 */
+		retval = wait_for_data_out_ready(board);
+		if (retval < 0)
+			return retval;
+
+		write_byte(nec_priv, AUX_SEOI, AUXMR);
+		retval = fmh_gpib_dma_write(board, buffer, remainder, &num_bytes);
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			return retval;
+		remainder -= num_bytes;
+	}
+	return 0;
+}
+
+/*
+ * Pause @chan and return the residue reported by dmaengine_tx_status()
+ * for @cookie, or the negative error from dmaengine_pause() if pausing
+ * failed.
+ *
+ * dma330 hardware doesn't support resume, so don't call this unless the
+ * dma transfer is done.
+ */
+static int fmh_gpib_get_dma_residue(struct dma_chan *chan, dma_cookie_t cookie)
+{
+	struct dma_tx_state tx_state;
+	int err = dmaengine_pause(chan);
+
+	if (err < 0) {
+		pr_err("dma pause failed?\n");
+		return err;
+	}
+
+	dmaengine_tx_status(chan, cookie, &tx_state);
+	return tx_state.residue;
+}
+
+/*
+ * Sleep on board->wait until TACS_NUM is set in board->status and
+ * FIFO_CONTROL_STATUS_REG reports TX_FIFO_HALF_EMPTY, or until DEV_CLEAR_BN
+ * or TIMO_NUM gets set.
+ *
+ * Returns 0, -ERESTARTSYS if the sleep was interrupted, -ETIMEDOUT if
+ * TIMO_NUM is set, or -EINTR if DEV_CLEAR_BN was set (the bit is cleared
+ * here).  When several apply, the one listed last wins.
+ */
+static int wait_for_tx_fifo_half_empty(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int ret;
+
+	ret = wait_event_interruptible(board->wait,
+				       (test_bit(TACS_NUM, &board->status) &&
+					(fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) &
+					 TX_FIFO_HALF_EMPTY)) ||
+				       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				       test_bit(TIMO_NUM, &board->status));
+	if (ret)
+		ret = -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		ret = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		ret = -EINTR;
+
+	return ret;
+}
+
+/*
+ * Supports writing a chunk of data whose length must fit into the hardware's
+ * xfer counter (at most fifo_xfer_counter_mask bytes); called in a loop by
+ * fmh_gpib_fifo_write().
+ *
+ * Bytes are pushed into FIFO_DATA_REG by the cpu, up to half a fifo at a
+ * time, each burst preceded by a wait for the tx fifo to be half empty.
+ * With @send_eoi set, the last byte is written with FIFO_DATA_EOI_FLAG.
+ * @bytes_written receives @length minus what remains in the xfer counter.
+ *
+ * Returns 0 on success, -EFAULT if @length is too large or the counter is
+ * inconsistent, -EIO on a bus error, or -ERESTARTSYS / -ETIMEDOUT / -EINTR
+ * from the waits.
+ */
+static int fmh_gpib_fifo_write_countable(struct gpib_board *board, u8 *buffer,
+					 size_t length, int send_eoi, size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int retval = 0;
+	unsigned int remainder;
+
+	*bytes_written = 0;
+	if (WARN_ON_ONCE(length > fifo_xfer_counter_mask))
+		return -EFAULT;
+
+	/* load the byte count, clear the tx fifo, route data out through it */
+	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
+	fifos_write(e_priv, TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
+
+	remainder = length;
+	while (remainder > 0) {
+		int i;
+
+		fifos_write(e_priv, TX_FIFO_HALF_EMPTY_INTERRUPT_ENABLE, FIFO_CONTROL_STATUS_REG);
+		retval = wait_for_tx_fifo_half_empty(board);
+		if (retval < 0)
+			goto cleanup;
+
+		/* the fifo has at least half its depth free: fill that much */
+		for (i = 0; i < fmh_gpib_half_fifo_size(e_priv) && remainder > 0; ++i) {
+			unsigned int data_value = *buffer;
+
+			/* tag only the very last byte of the chunk with EOI */
+			if (send_eoi && remainder == 1)
+				data_value |= FIFO_DATA_EOI_FLAG;
+			fifos_write(e_priv, data_value, FIFO_DATA_REG);
+			++buffer;
+			--remainder;
+		}
+	}
+
+	// suspend until last byte is sent
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, HR_DOIE);
+	if (wait_event_interruptible(board->wait,
+				     fmh_gpib_all_bytes_are_sent(e_priv) ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+
+	/* later checks deliberately override earlier ones */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
+		retval = -EIO;
+
+cleanup:
+	/* undo the register setup made above, whether or not the write finished */
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
+	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+
+	*bytes_written = length - (fifos_read(e_priv, FIFO_XFER_COUNTER_REG) &
+				   fifo_xfer_counter_mask);
+	if (WARN_ON_ONCE(*bytes_written > length))
+		return -EFAULT;
+
+	return retval;
+}
+
+/*
+ * Fifo-based (cpu driven) write of @length bytes from @buffer.
+ *
+ * The data is split into chunks that fit the hardware transfer counter and
+ * each chunk is handed to fmh_gpib_fifo_write_countable().  @send_eoi is
+ * only passed on for the final chunk.  @bytes_written accumulates the
+ * per-chunk counts.
+ *
+ * Returns 0 on success (including @length == 0) or the first negative
+ * error from wait_for_data_out_ready() / fmh_gpib_fifo_write_countable().
+ */
+static int fmh_gpib_fifo_write(struct gpib_board *board, u8 *buffer, size_t length,
+			       int send_eoi, size_t *bytes_written)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t remainder = length;
+	size_t transfer_size;
+	ssize_t retval = 0;
+
+	*bytes_written = 0;
+	if (length < 1)
+		return 0;
+
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+
+	while (remainder > 0) {
+		size_t num_bytes;
+		int last_pass;
+
+		retval = wait_for_data_out_ready(board);
+		if (retval < 0)
+			break;
+
+		if (fifo_xfer_counter_mask < remainder)	{
+			// round transfer size to a multiple of half fifo size
+			transfer_size = (fifo_xfer_counter_mask /
+					 fmh_gpib_half_fifo_size(e_priv)) *
+				fmh_gpib_half_fifo_size(e_priv);
+			last_pass = 0;
+		} else {
+			/* everything left fits into the counter in one go */
+			transfer_size = remainder;
+			last_pass = 1;
+		}
+		retval = fmh_gpib_fifo_write_countable(board, buffer, transfer_size,
+						       last_pass && send_eoi, &num_bytes);
+		/* count whatever went out, even if the chunk failed part way */
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			break;
+		remainder -= num_bytes;
+		buffer += num_bytes;
+		if (need_resched())
+			schedule();
+	}
+
+	return retval;
+}
+
+/*
+ * Receive up to @length bytes into @buffer with a single DMA transfer.
+ *
+ * The device writes into the driver's dma_buffer; once the transfer has
+ * been stopped the received bytes are copied to @buffer and anything still
+ * sitting in the rx fifo is read out by hand.
+ *
+ * @length must not exceed e_priv->dma_buffer_size.
+ * @end is set to 1 if at least one byte was read, READ_READY_BN is clear
+ * and RECEIVED_END_BN was set (the latter is cleared here).
+ * @bytes_read receives the number of bytes stored in @buffer.
+ *
+ * Returns 0 on success (including @length == 0), or:
+ *   -EFAULT       @length too large, or the dma residue is out of range
+ *   -ENOMEM       the buffer could not be mapped
+ *   -EIO          no dma descriptor could be prepared
+ *   -ERESTARTSYS  the sleep was interrupted by a signal
+ *   -ETIMEDOUT    TIMO_NUM was set
+ *   -EINTR        DEV_CLEAR_BN was set
+ * or whatever fmh_gpib_config_dma() returned on failure.
+ */
+static int fmh_gpib_dma_read(struct gpib_board *board, u8 *buffer,
+			     size_t length, int *end, size_t *bytes_read)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int retval = 0;
+	unsigned long flags;
+	int residue;
+	int wait_retval;
+	dma_addr_t bus_address;
+	struct dma_async_tx_descriptor *tx_desc;
+	dma_cookie_t dma_cookie;
+
+	*bytes_read = 0;
+	*end = 0;
+	if (length == 0)
+		return 0;
+	/* dma_buffer is mapped for @length bytes below, so it must be that big */
+	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
+		return -EFAULT;
+
+	bus_address = dma_map_single(board->dev, e_priv->dma_buffer,
+				     length, DMA_FROM_DEVICE);
+	if (dma_mapping_error(board->dev, bus_address)) {
+		/* the handle is unusable: it must be neither programmed nor unmapped */
+		dev_err(board->gpib_dev, "dma mapping error in dma read!\n");
+		return -ENOMEM;
+	}
+
+	/* program dma controller */
+	retval = fmh_gpib_config_dma(board, 0);
+	if (retval) {
+		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+		return retval;
+	}
+	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, bus_address,
+					      length, DMA_DEV_TO_MEM,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!tx_desc)  {
+		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
+		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+		return -EIO;
+	}
+	tx_desc->callback = fmh_gpib_dma_callback;
+	tx_desc->callback_param = board;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	// enable nec7210 dma
+	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
+	fifos_write(e_priv, RX_FIFO_DMA_REQUEST_ENABLE | RX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
+
+	dma_cookie = dmaengine_submit(tx_desc);
+	dma_async_issue_pending(e_priv->dma_channel);
+
+	set_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	// wait for data to transfer
+	wait_retval = wait_event_interruptible(board->wait,
+					       test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state)
+					       == 0 ||
+					       test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+					       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	if (wait_retval)
+		retval = -ERESTARTSYS;
+
+	/* later checks deliberately override earlier ones */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		retval = -EINTR;
+	// stop the dma transfer
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+	/*
+	 * give time for pl330 to transfer any in-flight data, since
+	 * pl330 will throw it away when dmaengine_pause is called.
+	 */
+	usleep_range(10, 15);
+	residue = fmh_gpib_get_dma_residue(e_priv->dma_channel, dma_cookie);
+
+	/*
+	 * Tear the transfer down before looking at the residue, so that a
+	 * bogus residue cannot leave the channel paused or the buffer mapped.
+	 */
+	dmaengine_terminate_all(e_priv->dma_channel);
+	// make sure fmh_gpib_dma_callback got called
+	if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state))
+		fmh_gpib_dma_callback(board);
+
+	dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
+
+	if (WARN_ON_ONCE(residue < 0 || (size_t)residue > length))
+		return -EFAULT;
+	*bytes_read += length - residue;
+	memcpy(buffer, e_priv->dma_buffer, *bytes_read);
+
+	/* Manually read any dregs out of fifo. */
+	while ((fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) & RX_FIFO_EMPTY) == 0) {
+		if ((*bytes_read) >= length) {
+			dev_err(board->dev, "unexpected extra bytes in rx fifo, discarding!  bytes_read=%d length=%d residue=%d\n",
+				(int)(*bytes_read), (int)length, (int)residue);
+			break;
+		}
+		buffer[(*bytes_read)++] = fifos_read(e_priv, FIFO_DATA_REG) & fifo_data_mask;
+	}
+
+	/*
+	 * If we got an end interrupt, figure out if it was
+	 * associated with the last byte we dma'd or with a
+	 * byte still sitting on the cb7210.
+	 */
+	spin_lock_irqsave(&board->spinlock, flags);
+	if (*bytes_read > 0 && test_bit(READ_READY_BN, &nec_priv->state) == 0) {
+		/*
+		 * If there is no byte sitting on the cb7210 and we
+		 * saw an end, we need to deal with it now
+		 */
+		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
+			*end = 1;
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return retval;
+}
+
+/*
+ * Release the RFD holdoff, unless an end byte is waiting on the chip.
+ *
+ * Runs under board->spinlock.  RFD_HOLDOFF_BN in the nec7210 state is
+ * updated to match what was done; when an end byte is already present
+ * nothing is touched.
+ */
+static void fmh_gpib_release_rfd_holdoff(struct gpib_board *board, struct fmh_priv *e_priv)
+{
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	unsigned int ext_status_1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	ext_status_1 = read_byte(nec_priv, EXT_STATUS_1_REG);
+
+	/*
+	 * if there is an end byte sitting on the chip, don't release
+	 * holdoff.  We want it left set after we read out the end
+	 * byte.
+	 */
+	if ((ext_status_1 & (DATA_IN_STATUS_BIT | END_STATUS_BIT)) !=
+	    (DATA_IN_STATUS_BIT | END_STATUS_BIT))	{
+		/* only issue the release command if a holdoff is actually active */
+		if (ext_status_1 & RFD_HOLDOFF_STATUS_BIT)
+			write_byte(nec_priv, AUX_FH, AUXMR);
+
+		/*
+		 * Check if an end byte raced in before we executed the AUX_FH command.
+		 * If it did, we want to make sure the rfd holdoff is in effect.  The end
+		 * byte can arrive since
+		 * AUX_RFD_HOLDOFF_ASAP doesn't immediately force the acceptor handshake
+		 * to leave ACRS.
+		 */
+		if ((read_byte(nec_priv, EXT_STATUS_1_REG) &
+		     (RFD_HOLDOFF_STATUS_BIT | DATA_IN_STATUS_BIT | END_STATUS_BIT)) ==
+		    (DATA_IN_STATUS_BIT | END_STATUS_BIT)) {
+			write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
+			set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+		} else {
+			clear_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+		}
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+}
+
+/*
+ * DMA-based read hook.
+ *
+ * Waits until wait_for_read() succeeds, releases the RFD holdoff, then
+ * reads up to @length bytes into @buffer in chunks of at most
+ * e_priv->dma_buffer_size through fmh_gpib_dma_read().  Reading stops
+ * early when a chunk reports an end or an error.  Before returning, the
+ * RFD holdoff is re-armed if RFD_HOLDOFF_BN is not already set.
+ *
+ * @end and @bytes_read are outputs: the end indication from the last
+ * chunk and the total byte count.
+ *
+ * Returns 0 on success or the negative error from wait_for_read() /
+ * fmh_gpib_dma_read().
+ */
+static int fmh_gpib_accel_read(struct gpib_board *board, u8 *buffer, size_t length,
+			       int *end, size_t *bytes_read)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t remain = length;
+	size_t transfer_size;
+	int retval = 0;
+	size_t dma_nbytes;
+	unsigned long flags;
+
+	smp_mb__before_atomic();
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+	smp_mb__after_atomic();
+	*end = 0;
+	*bytes_read = 0;
+
+	retval = wait_for_read(board);
+	if (retval < 0)
+		return retval;
+
+	fmh_gpib_release_rfd_holdoff(board, e_priv);
+	while (remain > 0) {
+		transfer_size = (e_priv->dma_buffer_size < remain) ?
+			e_priv->dma_buffer_size : remain;
+		retval = fmh_gpib_dma_read(board, buffer, transfer_size, end, &dma_nbytes);
+		/* account for the bytes received before acting on end/error */
+		remain -= dma_nbytes;
+		buffer += dma_nbytes;
+		*bytes_read += dma_nbytes;
+		if (*end)
+			break;
+		if (retval < 0)
+			break;
+		if (need_resched())
+			schedule();
+	}
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	if (test_bit(RFD_HOLDOFF_BN, &nec_priv->state) == 0) {
+		write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
+		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return retval;
+}
+
+/*
+ * Read a chunk of data whose length is within the limits of the hardware's
+ * xfer counter (at most fifo_xfer_counter_mask bytes).  Called in a loop
+ * from fmh_gpib_fifo_read().
+ *
+ * Bytes are pulled out of FIFO_DATA_REG by the cpu, up to half a fifo at a
+ * time, each burst preceded by a wait for the rx fifo to be half full or
+ * for an end to be flagged.  A byte carrying FIFO_DATA_EOI_FLAG sets @end
+ * and finishes the chunk.  Never more than @length bytes are stored in
+ * @buffer.
+ *
+ * @bytes_read receives the number of bytes stored.
+ *
+ * Returns 0 on success (including @length == 0), -EFAULT if @length does
+ * not fit the xfer counter, or -ERESTARTSYS / -ETIMEDOUT / -EINTR from
+ * wait_for_rx_fifo_half_full_or_end().
+ */
+static int fmh_gpib_fifo_read_countable(struct gpib_board *board, u8 *buffer,
+					size_t length, int *end, size_t *bytes_read)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	int retval = 0;
+
+	*bytes_read = 0;
+	*end = 0;
+	if (length == 0)
+		return 0;
+	/* same guard as the write side: the counter cannot hold more than this */
+	if (WARN_ON_ONCE(length > fifo_xfer_counter_mask))
+		return -EFAULT;
+
+	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
+	fifos_write(e_priv, RX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
+
+	while (*bytes_read < length && *end == 0) {
+		int i;
+
+		fifos_write(e_priv, RX_FIFO_HALF_FULL_INTERRUPT_ENABLE, FIFO_CONTROL_STATUS_REG);
+		retval = wait_for_rx_fifo_half_full_or_end(board);
+		if (retval < 0)
+			goto cleanup;
+
+		/*
+		 * Drain at most half a fifo, but never past the end of the
+		 * caller's buffer: @length need not be a multiple of the
+		 * half fifo size.
+		 */
+		for (i = 0; i < fmh_gpib_half_fifo_size(e_priv) &&
+		     *bytes_read < length && *end == 0; ++i) {
+			unsigned int data_value;
+
+			data_value = fifos_read(e_priv, FIFO_DATA_REG);
+			buffer[(*bytes_read)++] = data_value & fifo_data_mask;
+			if (data_value & FIFO_DATA_EOI_FLAG)
+				*end = 1;
+		}
+	}
+
+cleanup:
+	// stop the transfer
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+
+	/* Manually read any dregs out of fifo. */
+	while ((fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) & RX_FIFO_EMPTY) == 0) {
+		unsigned int data_value;
+
+		if ((*bytes_read) >= length) {
+			dev_err(board->dev, "unexpected extra bytes in rx fifo, discarding!  bytes_read=%d length=%d\n",
+				(int)(*bytes_read), (int)length);
+			break;
+		}
+		data_value = fifos_read(e_priv, FIFO_DATA_REG);
+		buffer[(*bytes_read)++] = data_value & fifo_data_mask;
+		if (data_value & FIFO_DATA_EOI_FLAG)
+			*end = 1;
+	}
+
+	return retval;
+}
+
+/*
+ * Fifo-based (cpu driven) read of up to @length bytes into @buffer.
+ *
+ * After wait_for_read() succeeds and the RFD holdoff has been released,
+ * the request is split into chunks that fit the hardware transfer counter
+ * and each chunk is handed to fmh_gpib_fifo_read_countable().  Reading
+ * stops early when a chunk reports an end or an error.  If no end was
+ * seen, the RFD holdoff is re-armed before returning.
+ *
+ * @end and @bytes_read are outputs: the end indication from the last
+ * chunk and the total byte count.
+ *
+ * Returns 0 on success or the negative error from wait_for_read() /
+ * fmh_gpib_fifo_read_countable().
+ */
+static int fmh_gpib_fifo_read(struct gpib_board *board, u8 *buffer, size_t length,
+			      int *end, size_t *bytes_read)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	size_t remain = length;
+	size_t transfer_size;
+	int retval = 0;
+	size_t nbytes;
+	unsigned long flags;
+
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
+	*end = 0;
+	*bytes_read = 0;
+
+	/*
+	 * Do a little prep with data in interrupt so that following wait_for_read()
+	 * will wake up if a data byte is received.
+	 */
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, HR_DIIE);
+	/* run the interrupt handler by hand (irq number 0 is passed) */
+	fmh_gpib_interrupt(0, board);
+
+	retval = wait_for_read(board);
+	if (retval < 0)
+		return retval;
+
+	fmh_gpib_release_rfd_holdoff(board, e_priv);
+	while (remain > 0) {
+		if (fifo_xfer_counter_mask < remain) {
+			// round transfer size to a multiple of half fifo size
+			transfer_size = (fifo_xfer_counter_mask /
+					 fmh_gpib_half_fifo_size(e_priv)) *
+				fmh_gpib_half_fifo_size(e_priv);
+		} else {
+			/* everything left fits into the counter in one go */
+			transfer_size = remain;
+		}
+		retval = fmh_gpib_fifo_read_countable(board, buffer, transfer_size, end, &nbytes);
+		/* account for the bytes received before acting on end/error */
+		remain -= nbytes;
+		buffer += nbytes;
+		*bytes_read += nbytes;
+		if (*end)
+			break;
+		if (retval < 0)
+			break;
+		if (need_resched())
+			schedule();
+	}
+
+	if (*end == 0)	{
+		spin_lock_irqsave(&board->spinlock, flags);
+		write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
+		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+	}
+
+	return retval;
+}
+
+static struct gpib_interface fmh_gpib_unaccel_interface = {	/* platform-device board type without dma */
+	.name = "fmh_gpib_unaccel",
+	.attach = fmh_gpib_attach_holdoff_all,	/* HR_HLDA handshake mode, no dma channel requested */
+	.detach = fmh_gpib_detach,
+	.read = fmh_gpib_read,
+	.write = fmh_gpib_write,
+	.command = fmh_gpib_command,
+	.take_control = fmh_gpib_take_control,
+	.go_to_standby = fmh_gpib_go_to_standby,
+	.request_system_control = fmh_gpib_request_system_control,
+	.interface_clear = fmh_gpib_interface_clear,
+	.remote_enable = fmh_gpib_remote_enable,
+	.enable_eos = fmh_gpib_enable_eos,
+	.disable_eos = fmh_gpib_disable_eos,
+	.parallel_poll = fmh_gpib_parallel_poll,
+	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
+	.parallel_poll_response = fmh_gpib_parallel_poll_response,
+	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
+	.line_status = fmh_gpib_line_status,
+	.update_status = fmh_gpib_update_status,
+	.primary_address = fmh_gpib_primary_address,
+	.secondary_address = fmh_gpib_secondary_address,
+	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
+	.serial_poll_status = fmh_gpib_serial_poll_status,
+	.t1_delay = fmh_gpib_t1_delay,
+	.return_to_local = fmh_gpib_return_to_local,
+};
+
+static struct gpib_interface fmh_gpib_interface = {	/* platform-device board type that requests a dma channel */
+	.name = "fmh_gpib",
+	.attach = fmh_gpib_attach_holdoff_end,	/* HR_HLDE handshake mode, requests the "rxtx" dma channel */
+	.detach = fmh_gpib_detach,
+	.read = fmh_gpib_accel_read,
+	.write = fmh_gpib_accel_write,
+	.command = fmh_gpib_command,
+	.take_control = fmh_gpib_take_control,
+	.go_to_standby = fmh_gpib_go_to_standby,
+	.request_system_control = fmh_gpib_request_system_control,
+	.interface_clear = fmh_gpib_interface_clear,
+	.remote_enable = fmh_gpib_remote_enable,
+	.enable_eos = fmh_gpib_enable_eos,
+	.disable_eos = fmh_gpib_disable_eos,
+	.parallel_poll = fmh_gpib_parallel_poll,
+	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
+	.parallel_poll_response = fmh_gpib_parallel_poll_response,
+	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
+	.line_status = fmh_gpib_line_status,
+	.update_status = fmh_gpib_update_status,
+	.primary_address = fmh_gpib_primary_address,
+	.secondary_address = fmh_gpib_secondary_address,
+	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
+	.serial_poll_status = fmh_gpib_serial_poll_status,
+	.t1_delay = fmh_gpib_t1_delay,
+	.return_to_local = fmh_gpib_return_to_local,
+};
+
+static struct gpib_interface fmh_gpib_pci_interface = {	/* PCI board type; reads go through fmh_gpib_fifo_read() */
+	.name = "fmh_gpib_pci",
+	.attach = fmh_gpib_pci_attach_holdoff_end,	/* HR_HLDE; fails unless fifo interrupts look supported */
+	.detach = fmh_gpib_pci_detach,
+	.read = fmh_gpib_fifo_read,
+	.write = fmh_gpib_fifo_write,
+	.command = fmh_gpib_command,
+	.take_control = fmh_gpib_take_control,
+	.go_to_standby = fmh_gpib_go_to_standby,
+	.request_system_control = fmh_gpib_request_system_control,
+	.interface_clear = fmh_gpib_interface_clear,
+	.remote_enable = fmh_gpib_remote_enable,
+	.enable_eos = fmh_gpib_enable_eos,
+	.disable_eos = fmh_gpib_disable_eos,
+	.parallel_poll = fmh_gpib_parallel_poll,
+	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
+	.parallel_poll_response = fmh_gpib_parallel_poll_response,
+	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
+	.line_status = fmh_gpib_line_status,
+	.update_status = fmh_gpib_update_status,
+	.primary_address = fmh_gpib_primary_address,
+	.secondary_address = fmh_gpib_secondary_address,
+	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
+	.serial_poll_status = fmh_gpib_serial_poll_status,
+	.t1_delay = fmh_gpib_t1_delay,
+	.return_to_local = fmh_gpib_return_to_local,
+};
+
+static struct gpib_interface fmh_gpib_pci_unaccel_interface = {	/* PCI board type; same read/write as fmh_gpib_unaccel */
+	.name = "fmh_gpib_pci_unaccel",
+	.attach = fmh_gpib_pci_attach_holdoff_all,	/* HR_HLDA handshake mode */
+	.detach = fmh_gpib_pci_detach,
+	.read = fmh_gpib_read,
+	.write = fmh_gpib_write,
+	.command = fmh_gpib_command,
+	.take_control = fmh_gpib_take_control,
+	.go_to_standby = fmh_gpib_go_to_standby,
+	.request_system_control = fmh_gpib_request_system_control,
+	.interface_clear = fmh_gpib_interface_clear,
+	.remote_enable = fmh_gpib_remote_enable,
+	.enable_eos = fmh_gpib_enable_eos,
+	.disable_eos = fmh_gpib_disable_eos,
+	.parallel_poll = fmh_gpib_parallel_poll,
+	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
+	.parallel_poll_response = fmh_gpib_parallel_poll_response,
+	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
+	.line_status = fmh_gpib_line_status,
+	.update_status = fmh_gpib_update_status,
+	.primary_address = fmh_gpib_primary_address,
+	.secondary_address = fmh_gpib_secondary_address,
+	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
+	.serial_poll_status = fmh_gpib_serial_poll_status,
+	.t1_delay = fmh_gpib_t1_delay,
+	.return_to_local = fmh_gpib_return_to_local,
+};
+
+/*
+ * Service every interrupt source of the board.  fmh_gpib_interrupt() calls
+ * this with board->spinlock held.
+ *
+ * ISR0, ISR1, ISR2 and the fifo status are sampled first.  An IFC interrupt is
+ * queued as EVENT_IFC, ISR1/ISR2 are handed to nec7210_interrupt_have_status(),
+ * EXT_STATUS_1 is mirrored into nec_priv->state, and a pending fifo threshold
+ * interrupt is acknowledged by clearing the fifo control register.
+ *
+ * Returns IRQ_HANDLED (after waking board->wait) if any source was serviced,
+ * IRQ_NONE otherwise.
+ */
+irqreturn_t fmh_gpib_internal_interrupt(struct gpib_board *board)
+{
+	struct fmh_priv *priv = board->private_data;
+	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
+	unsigned int isr0, isr1, isr2, fifo_status, ext_status;
+	irqreturn_t handled = IRQ_NONE;
+
+	isr0 = read_byte(nec_priv, ISR0_IMR0_REG);
+	isr1 = read_byte(nec_priv, ISR1);
+	isr2 = read_byte(nec_priv, ISR2);
+	fifo_status = fifos_read(priv, FIFO_CONTROL_STATUS_REG);
+
+	if (isr0 & IFC_INTERRUPT_BIT) {
+		push_gpib_event(board, EVENT_IFC);
+		handled = IRQ_HANDLED;
+	}
+
+	if (nec7210_interrupt_have_status(board, nec_priv, isr1, isr2) == IRQ_HANDLED)
+		handled = IRQ_HANDLED;
+
+	/* Each of these state bits simply tracks one EXT_STATUS_1 bit. */
+	ext_status = read_byte(nec_priv, EXT_STATUS_1_REG);
+	assign_bit(READ_READY_BN, &nec_priv->state, ext_status & DATA_IN_STATUS_BIT);
+	assign_bit(WRITE_READY_BN, &nec_priv->state, ext_status & DATA_OUT_STATUS_BIT);
+	assign_bit(COMMAND_READY_BN, &nec_priv->state, ext_status & COMMAND_OUT_STATUS_BIT);
+	assign_bit(RFD_HOLDOFF_BN, &nec_priv->state, ext_status & RFD_HOLDOFF_STATUS_BIT);
+
+	/*
+	 * RECEIVED_END is only set while a data byte is still sitting in the
+	 * chip, so that it is not spuriously set again after a read has
+	 * cleared it.  With END set but no data pending the bit is left alone.
+	 */
+	if (!(ext_status & END_STATUS_BIT))
+		clear_bit(RECEIVED_END_BN, &nec_priv->state);
+	else if (ext_status & DATA_IN_STATUS_BIT)
+		set_bit(RECEIVED_END_BN, &nec_priv->state);
+
+	/*
+	 * For both fifo threshold interrupts below we really only want to
+	 * clear the matching interrupt enable bit in FIFO_CONTROL_STATUS_REG.
+	 * Writing zero is less careful: it also disables DMA requests and the
+	 * other direction's fifo interrupt.  That is fine, since those should
+	 * never be in use at the same time as the interrupt being serviced.
+	 */
+	if ((fifo_status & TX_FIFO_HALF_EMPTY_INTERRUPT_IS_ENABLED) &&
+	    (fifo_status & TX_FIFO_HALF_EMPTY)) {
+		fifos_write(priv, 0x0, FIFO_CONTROL_STATUS_REG);
+		handled = IRQ_HANDLED;
+	}
+
+	if ((fifo_status & RX_FIFO_HALF_FULL_INTERRUPT_IS_ENABLED) &&
+	    (fifo_status & RX_FIFO_HALF_FULL)) {
+		fifos_write(priv, 0x0, FIFO_CONTROL_STATUS_REG);
+		handled = IRQ_HANDLED;
+	}
+
+	if (handled != IRQ_HANDLED)
+		return handled;
+
+	wake_up_interruptible(&board->wait);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Interrupt handler registered with request_irq(); @arg is the gpib board.
+ * Runs fmh_gpib_internal_interrupt() under board->spinlock with local
+ * interrupts disabled and returns its result.  @irq is not used.
+ */
+irqreturn_t fmh_gpib_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	unsigned long irq_flags;
+	irqreturn_t status;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	status = fmh_gpib_internal_interrupt(board);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return status;
+}
+
+/*
+ * Allocate the zeroed driver private data for @board, initialise the embedded
+ * nec7210 state and allocate the 0x800-byte dma buffer.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure nothing stays allocated and
+ * board->private_data is NULL, so a later detach has nothing to free.
+ */
+static int fmh_gpib_allocate_private(struct gpib_board *board)
+{
+	struct fmh_priv *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+
+	init_nec7210_private(&priv->nec7210_priv);
+	priv->dma_buffer_size = 0x800;
+	priv->dma_buffer = kmalloc(priv->dma_buffer_size, GFP_KERNEL);
+	if (!priv->dma_buffer) {
+		/* do not depend on the caller running detach to avoid a leak */
+		kfree(priv);
+		board->private_data = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Bus-independent part of detach: free the dma buffer and the private data
+ * (if any) and clear the driver data of board->dev (if set), which the
+ * platform attach path uses to mark a device as taken.
+ */
+static void fmh_gpib_generic_detach(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+
+	if (e_priv) {
+		kfree(e_priv->dma_buffer);
+		kfree(e_priv);
+		board->private_data = NULL;
+	}
+
+	if (!board->dev)
+		return;
+	dev_set_drvdata(board->dev, NULL);
+}
+
+/*
+ * Generic part of the attach functions: reset board->status, allocate the
+ * private data and set up the nec7210 register accessors with a register
+ * spacing of 1 and chip type CB7210.
+ *
+ * Returns 0, or the error from fmh_gpib_allocate_private().
+ */
+static int fmh_gpib_generic_attach(struct gpib_board *board)
+{
+	struct nec7210_priv *nec_priv;
+	struct fmh_priv *e_priv;
+	int err;
+
+	board->status = 0;
+
+	err = fmh_gpib_allocate_private(board);
+	if (err < 0)
+		return err;
+
+	e_priv = board->private_data;
+	nec_priv = &e_priv->nec7210_priv;
+	nec_priv->type = CB7210;
+	nec_priv->offset = 1;
+	nec_priv->read_byte = gpib_cs_read_byte;
+	nec_priv->write_byte = gpib_cs_write_byte;
+
+	return 0;
+}
+
+/*
+ * Program the dma channel for a transfer to (@output != 0) or from
+ * (@output == 0) the fifo data register.
+ *
+ * Transfers are one byte wide with device_fc set; the burst size is the
+ * fifo's reported burst length, or 1 if none was reported.
+ * Returns the result of dmaengine_slave_config().
+ */
+static int fmh_gpib_config_dma(struct gpib_board *board, int output)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	const u32 burst = e_priv->dma_burst_length < 1 ? 1 : e_priv->dma_burst_length;
+	const phys_addr_t fifo_data = e_priv->dma_port_res->start +
+		FIFO_DATA_REG * fifo_reg_offset;
+	/*
+	 * The initializer zeroes every member not named here.  The config used
+	 * to be an uninitialised stack object, so the dmaengine driver could
+	 * see garbage in the members this driver never sets.
+	 */
+	struct dma_slave_config config = {
+		.device_fc = true,
+		.src_maxburst = burst,
+		.dst_maxburst = burst,
+		.src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
+		.dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
+	};
+
+	/* only the device side has an address; the memory side stays 0 */
+	if (output) {
+		config.direction = DMA_MEM_TO_DEV;
+		config.dst_addr = fifo_data;
+	} else {
+		config.direction = DMA_DEV_TO_MEM;
+		config.src_addr = fifo_data;
+	}
+	return dmaengine_slave_config(e_priv->dma_channel, &config);
+}
+
+/*
+ * Put the hardware into its initial state: clear both fifos, reset the chip,
+ * select AUX_LO_SPEED and @handshake_mode, probe for fifo interrupt support,
+ * bring the board online, enable the IFC and ATN interrupts and finally
+ * request an RFD holdoff.  Always returns 0.
+ */
+static int fmh_gpib_init(struct fmh_priv *e_priv, struct gpib_board *board, int handshake_mode)
+{
+	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
+	const unsigned int tx_idle_bits = TX_FIFO_EMPTY | TX_FIFO_HALF_EMPTY;
+	unsigned int fifo_status;
+	unsigned long irq_flags;
+
+	fifos_write(e_priv, RX_FIFO_CLEAR | TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
+
+	nec7210_board_reset(nec_priv, board);
+	write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
+	nec7210_set_handshake_mode(board, nec_priv, handshake_mode);
+
+	/*
+	 * Heuristically check if hardware supports fifo half full/empty
+	 * interrupts: right after the clear, the tx fifo must read back as
+	 * both empty and half empty.
+	 */
+	fifo_status = fifos_read(e_priv, FIFO_CONTROL_STATUS_REG);
+	e_priv->supports_fifo_interrupts = (fifo_status & tx_idle_bits) == tx_idle_bits;
+
+	nec7210_board_online(nec_priv, board);
+
+	write_byte(nec_priv, IFC_INTERRUPT_ENABLE_BIT | ATN_INTERRUPT_ENABLE_BIT, ISR0_IMR0_REG);
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
+	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Match callback for driver_find_device(); @data is the gpib_board_config.
+ *
+ * A device is rejected (0) if it already carries driver data, if
+ * gpib_match_device_path() returns 0 for the configured device path, or if a
+ * serial number was requested.  Otherwise it matches (1).
+ */
+static int fmh_gpib_device_match(struct device *dev, const void *data)
+{
+	const struct gpib_board_config *config = data;
+
+	if (dev_get_drvdata(dev) ||
+	    gpib_match_device_path(dev, config->device_path) == 0 ||
+	    config->serial_number)	// driver doesn't support selection by serial number
+		return 0;
+
+	dev_dbg(dev, "matched: %s\n", of_node_full_name(dev_of_node((dev))));
+	return 1;
+}
+
+/*
+ * Attach @board to the first unattached platform device matching @config.
+ *
+ * Claims and maps the "gpib_control_status" and "dma_fifos" memory resources,
+ * installs the shared interrupt handler, requests the "rxtx" dma channel when
+ * @acquire_dma is set and initialises the chip in @handshake_mode.
+ *
+ * Returns 0 or a negative errno.  Nothing is unwound here on failure: whatever
+ * was acquired stays recorded in the board and its private data, which is what
+ * fmh_gpib_detach() releases.
+ */
+static int fmh_gpib_attach_impl(struct gpib_board *board, const struct gpib_board_config *config,
+				unsigned int handshake_mode, int acquire_dma)
+{
+	struct fmh_priv *e_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+	int irq;
+	struct resource *res;
+	struct platform_device *pdev;
+
+	board->dev = driver_find_device(&fmh_gpib_platform_driver.driver,
+					NULL, (const void *)config, &fmh_gpib_device_match);
+	if (!board->dev)	{
+		dev_err(board->gpib_dev,
+			"No matching fmh_gpib_core device was found, attach failed.\n");
+		return -ENODEV;
+	}
+	// currently only used to mark the device as already attached
+	dev_set_drvdata(board->dev, board);
+	pdev = to_platform_device(board->dev);
+
+	retval = fmh_gpib_generic_attach(board);
+	if (retval)
+		return retval;
+
+	e_priv = board->private_data;
+	nec_priv = &e_priv->nec7210_priv;
+
+	/* gpib control/status registers */
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "gpib_control_status");
+	if (!res) {
+		dev_err(board->dev, "Unable to locate mmio resource\n");
+		return -ENODEV;
+	}
+
+	if (request_mem_region(res->start,
+			       resource_size(res),
+			       pdev->name) == NULL) {
+		dev_err(board->dev, "cannot claim registers\n");
+		return -ENXIO;
+	}
+	/* recorded only once claimed, so detach releases exactly what we own */
+	e_priv->gpib_iomem_res = res;
+
+	nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start,
+				     resource_size(e_priv->gpib_iomem_res));
+	if (!nec_priv->mmiobase) {
+		dev_err(board->dev, "Could not map I/O memory\n");
+		return -ENOMEM;
+	}
+	dev_dbg(board->dev, "iobase %pr remapped to %p\n",
+		e_priv->gpib_iomem_res, nec_priv->mmiobase);
+
+	/* fifo registers */
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dma_fifos");
+	if (!res) {
+		dev_err(board->dev, "Unable to locate mmio resource for gpib dma port\n");
+		return -ENODEV;
+	}
+	if (request_mem_region(res->start,
+			       resource_size(res),
+			       pdev->name) == NULL) {
+		dev_err(board->dev, "cannot claim registers\n");
+		return -ENXIO;
+	}
+	e_priv->dma_port_res = res;
+	e_priv->fifo_base = ioremap(e_priv->dma_port_res->start,
+				    resource_size(e_priv->dma_port_res));
+	if (!e_priv->fifo_base) {
+		dev_err(board->dev, "Could not map I/O memory for fifos\n");
+		return -ENOMEM;
+	}
+	/* both values are passed as unsigned long, hence %lx and %lu */
+	dev_dbg(board->dev, "dma fifos 0x%lx remapped to %p, length=%lu\n",
+		(unsigned long)e_priv->dma_port_res->start, e_priv->fifo_base,
+		(unsigned long)resource_size(e_priv->dma_port_res));
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return -EBUSY;
+	retval = request_irq(irq, fmh_gpib_interrupt, IRQF_SHARED, pdev->name, board);
+	if (retval) {
+		dev_err(board->dev,
+			"cannot register interrupt handler err=%d\n",
+			retval);
+		return retval;
+	}
+	e_priv->irq = irq;
+
+	if (acquire_dma) {
+		e_priv->dma_channel = dma_request_slave_channel(board->dev, "rxtx");
+		if (!e_priv->dma_channel) {
+			dev_err(board->dev, "failed to acquire dma channel \"rxtx\".\n");
+			return -EIO;
+		}
+	}
+	/*
+	 * in the future we might want to know the half-fifo size
+	 * (dma_burst_length) even when not using dma, so go ahead and
+	 * initialize it unconditionally.
+	 */
+	e_priv->dma_burst_length = fifos_read(e_priv, FIFO_MAX_BURST_LENGTH_REG) &
+		fifo_max_burst_length_mask;
+
+	return fmh_gpib_init(e_priv, board, handshake_mode);
+}
+
+/* Attach the platform-device board in HR_HLDA handshake mode, without a dma channel. */
+int fmh_gpib_attach_holdoff_all(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	const int acquire_dma = 0;
+
+	return fmh_gpib_attach_impl(board, config, HR_HLDA, acquire_dma);
+}
+
+/* Attach the platform-device board in HR_HLDE handshake mode and request its dma channel. */
+int fmh_gpib_attach_holdoff_end(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	const int acquire_dma = 1;
+
+	return fmh_gpib_attach_impl(board, config, HR_HLDE, acquire_dma);
+}
+
+/*
+ * Detach a platform-device board.  Releases, in this order and only where
+ * present: the dma channel, the irq, the hardware state (fifo control and
+ * IMR0 cleared, chip reset), both register mappings and both memory regions.
+ * Then frees the private data and drops the device reference taken at attach.
+ */
+void fmh_gpib_detach(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+	struct resource *res;
+
+	if (!e_priv)
+		goto release_device;
+	nec_priv = &e_priv->nec7210_priv;
+
+	if (e_priv->dma_channel)
+		dma_release_channel(e_priv->dma_channel);
+	if (e_priv->irq)
+		free_irq(e_priv->irq, board);
+
+	/* last register accesses, while the windows are still mapped */
+	if (e_priv->fifo_base)
+		fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+	if (nec_priv->mmiobase) {
+		write_byte(nec_priv, 0, ISR0_IMR0_REG);
+		nec7210_board_reset(nec_priv, board);
+	}
+
+	if (e_priv->fifo_base)
+		iounmap(e_priv->fifo_base);
+	if (nec_priv->mmiobase)
+		iounmap(nec_priv->mmiobase);
+
+	res = e_priv->dma_port_res;
+	if (res)
+		release_mem_region(res->start, resource_size(res));
+	res = e_priv->gpib_iomem_res;
+	if (res)
+		release_mem_region(res->start, resource_size(res));
+
+release_device:
+	fmh_gpib_generic_detach(board);
+
+	if (!board->dev)
+		return;
+	put_device(board->dev);
+	board->dev = NULL;
+}
+
+/*
+ * Attach @board to the PCI device selected by @config.
+ *
+ * Allocates the private data, enables the device, requests its regions, maps
+ * the gpib control/status registers and (if that resource is memory) the fifo
+ * registers, installs the shared interrupt handler when the device has an irq
+ * and initialises the chip in @handshake_mode.
+ *
+ * Returns 0 or a negative errno.  Nothing is unwound here on failure; what was
+ * acquired stays recorded for fmh_gpib_pci_detach().
+ */
+static int fmh_gpib_pci_attach_impl(struct gpib_board *board,
+				    const struct gpib_board_config *config,
+				    unsigned int handshake_mode)
+{
+	struct fmh_priv *e_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+	struct pci_dev *pci_device;
+
+	retval = fmh_gpib_generic_attach(board);
+	if (retval)
+		return retval;
+
+	e_priv = board->private_data;
+	nec_priv = &e_priv->nec7210_priv;
+
+	// find board
+	pci_device = gpib_pci_get_device(config, BOGUS_PCI_VENDOR_ID_FLUKE,
+					 BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER, NULL);
+	if (!pci_device)	{
+		dev_err(board->gpib_dev,
+			"No matching fmh_gpib_core pci device was found, attach failed.\n");
+		return -ENODEV;
+	}
+	board->dev = &pci_device->dev;
+
+	// bladerunner prototype has offset of 4 between gpib control/status registers
+	nec_priv->offset = 4;
+
+	if (pci_enable_device(pci_device)) {
+		dev_err(board->dev, "error enabling pci device\n");
+		return -EIO;
+	}
+	if (pci_request_regions(pci_device, KBUILD_MODNAME)) {
+		dev_err(board->dev, "pci_request_regions failed\n");
+		return -EIO;
+	}
+	e_priv->gpib_iomem_res = &pci_device->resource[gpib_control_status_pci_resource_index];
+	e_priv->dma_port_res =	&pci_device->resource[gpib_fifo_pci_resource_index];
+
+	nec_priv->mmiobase = ioremap(pci_resource_start(pci_device,
+							gpib_control_status_pci_resource_index),
+				     pci_resource_len(pci_device,
+						      gpib_control_status_pci_resource_index));
+	if (!nec_priv->mmiobase) {
+		/* fmh_gpib_init() below accesses these registers unconditionally */
+		dev_err(board->dev, "Could not map I/O memory\n");
+		return -ENOMEM;
+	}
+	dev_dbg(board->dev, "base address for gpib control/status registers remapped to 0x%p\n",
+		nec_priv->mmiobase);
+
+	/* A NULL fifo_base is tolerated: fifos_read()/fifos_write() check for it. */
+	if (e_priv->dma_port_res->flags & IORESOURCE_MEM) {
+		e_priv->fifo_base = ioremap(pci_resource_start(pci_device,
+							       gpib_fifo_pci_resource_index),
+					    pci_resource_len(pci_device,
+							     gpib_fifo_pci_resource_index));
+		dev_dbg(board->dev, "base address for gpib fifo registers remapped to 0x%p\n",
+			e_priv->fifo_base);
+	} else {
+		e_priv->fifo_base = NULL;
+		dev_dbg(board->dev, "hardware has no gpib fifo registers.\n");
+	}
+
+	if (pci_device->irq) {
+		retval = request_irq(pci_device->irq, fmh_gpib_interrupt, IRQF_SHARED,
+				     KBUILD_MODNAME, board);
+		if (retval) {
+			dev_err(board->dev, "cannot register interrupt handler err=%d\n", retval);
+			return retval;
+		}
+	}
+	e_priv->irq = pci_device->irq;
+
+	e_priv->dma_burst_length = fifos_read(e_priv, FIFO_MAX_BURST_LENGTH_REG) &
+		fifo_max_burst_length_mask;
+
+	return fmh_gpib_init(e_priv, board, handshake_mode);
+}
+
+/* Attach the PCI board in HR_HLDA handshake mode. */
+int fmh_gpib_pci_attach_holdoff_all(struct gpib_board *board,
+				    const struct gpib_board_config *config)
+{
+	const unsigned int handshake_mode = HR_HLDA;
+
+	return fmh_gpib_pci_attach_impl(board, config, handshake_mode);
+}
+
+/*
+ * Attach the PCI board in HR_HLDE handshake mode.
+ *
+ * Returns the error from fmh_gpib_pci_attach_impl(), or -EIO if the attach
+ * itself worked but fmh_gpib_init() did not detect fifo interrupt support.
+ * In the -EIO case the board is left attached; nothing is undone here.
+ */
+int fmh_gpib_pci_attach_holdoff_end(struct gpib_board *board,
+				    const struct gpib_board_config *config)
+{
+	struct fmh_priv *e_priv;
+	int retval;
+
+	retval = fmh_gpib_pci_attach_impl(board, config, HR_HLDE);
+	if (retval)
+		return retval;
+
+	e_priv = board->private_data;
+	if (e_priv && !e_priv->supports_fifo_interrupts) {
+		dev_err(board->gpib_dev,
+			"your fmh_gpib_core does not appear to support fifo interrupts.  Try the fmh_gpib_pci_unaccel board type instead.\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * Detach a PCI board: free the irq, clear the fifo control and IMR0
+ * registers, reset the chip, unmap both register windows, release the PCI
+ * regions, free the private data and drop the pci device reference.
+ *
+ * Each step is skipped when the corresponding resource is not recorded.
+ */
+void fmh_gpib_pci_detach(struct gpib_board *board)
+{
+	struct fmh_priv *e_priv = board->private_data;
+	struct pci_dev *pci_device = NULL;
+	struct nec7210_priv *nec_priv;
+
+	if (e_priv)	{
+		nec_priv = &e_priv->nec7210_priv;
+		if (board->dev)
+			pci_device = to_pci_dev(board->dev);
+
+		if (e_priv->irq)
+			free_irq(e_priv->irq, board);
+		if (e_priv->fifo_base)
+			fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
+		if (nec_priv->mmiobase) {
+			write_byte(nec_priv, 0, ISR0_IMR0_REG);
+			nec7210_board_reset(nec_priv, board);
+		}
+		if (e_priv->fifo_base)
+			iounmap(e_priv->fifo_base);
+		if (nec_priv->mmiobase)
+			iounmap(nec_priv->mmiobase);
+		if (e_priv->dma_port_res || e_priv->gpib_iomem_res)
+			pci_release_regions(pci_device);
+	}
+
+	/* touches board->dev, so it has to run while we still hold the reference */
+	fmh_gpib_generic_detach(board);
+
+	if (pci_device) {
+		pci_dev_put(pci_device);
+		/* do not leave a stale pointer behind for a later attach or detach */
+		board->dev = NULL;
+	}
+}
+
+static int fmh_gpib_platform_probe(struct platform_device *pdev)
+{
+	return 0;	/* nothing to set up here; resources are claimed in fmh_gpib_attach_impl() */
+}
+
+static const struct of_device_id fmh_gpib_of_match[] = {	/* used as .of_match_table of the platform driver */
+	{ .compatible = "fmhess,fmh_gpib_core"},
+	{ {0} }	/* terminating all-zero entry */
+};
+MODULE_DEVICE_TABLE(of, fmh_gpib_of_match);
+
+static struct platform_driver fmh_gpib_platform_driver = {	/* searched by fmh_gpib_attach_impl() via driver_find_device() */
+	.driver = {
+		.name = DRV_NAME,
+		.of_match_table = fmh_gpib_of_match,
+	},
+	.probe = &fmh_gpib_platform_probe	/* stub that always returns 0 */
+};
+
+static int fmh_gpib_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	return 0;	/* nothing to set up here; the device is enabled and mapped in fmh_gpib_pci_attach_impl() */
+}
+
+static const struct pci_device_id fmh_gpib_pci_match[] = {
+	{ BOGUS_PCI_VENDOR_ID_FLUKE, BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER, 0, 0, 0 },	/* same ids fmh_gpib_pci_attach_impl() looks up */
+	{ 0 }	/* terminating all-zero entry */
+};
+MODULE_DEVICE_TABLE(pci, fmh_gpib_pci_match);
+
+static struct pci_driver fmh_gpib_pci_driver = {	/* registered in fmh_gpib_init_module() */
+	.name = DRV_NAME,
+	.id_table = fmh_gpib_pci_match,
+	.probe = &fmh_gpib_pci_probe	/* stub that always returns 0 */
+};
+
+/*
+ * Module init: register the platform driver, the PCI driver and then the four
+ * gpib interfaces.  If any step fails, everything registered before it is
+ * unregistered in reverse order and the error is returned.
+ */
+static int __init fmh_gpib_init_module(void)
+{
+	/* in registration order; the error path walks this table backwards */
+	struct gpib_interface *const interfaces[] = {
+		&fmh_gpib_unaccel_interface,
+		&fmh_gpib_interface,
+		&fmh_gpib_pci_unaccel_interface,
+		&fmh_gpib_pci_interface,
+	};
+	size_t i;
+	int result;
+
+	result = platform_driver_register(&fmh_gpib_platform_driver);
+	if (result) {
+		pr_err("platform_driver_register failed: error = %d\n", result);
+		return result;
+	}
+
+	result = pci_register_driver(&fmh_gpib_pci_driver);
+	if (result) {
+		pr_err("pci_register_driver failed: error = %d\n", result);
+		goto err_pci_driver;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(interfaces); i++) {
+		result = gpib_register_driver(interfaces[i], THIS_MODULE);
+		if (result) {
+			pr_err("gpib_register_driver failed: error = %d\n", result);
+			goto err_interfaces;
+		}
+	}
+
+	return 0;
+
+err_interfaces:
+	/* i is the index that failed; undo i - 1 down to 0 */
+	while (i--)
+		gpib_unregister_driver(interfaces[i]);
+	pci_unregister_driver(&fmh_gpib_pci_driver);
+err_pci_driver:
+	platform_driver_unregister(&fmh_gpib_platform_driver);
+
+	return result;
+}
+
+/*
+ * Module exit: unregister the four gpib interfaces first, then the PCI and
+ * platform drivers they attach through.
+ */
+static void __exit fmh_gpib_exit_module(void)
+{
+	/* gpib interfaces */
+	gpib_unregister_driver(&fmh_gpib_pci_interface);
+	gpib_unregister_driver(&fmh_gpib_pci_unaccel_interface);
+	gpib_unregister_driver(&fmh_gpib_unaccel_interface);
+	gpib_unregister_driver(&fmh_gpib_interface);
+
+	/* bus drivers */
+	pci_unregister_driver(&fmh_gpib_pci_driver);
+	platform_driver_unregister(&fmh_gpib_platform_driver);
+}
+
+module_init(fmh_gpib_init_module);
+module_exit(fmh_gpib_exit_module);
diff --git a/drivers/gpib/fmh_gpib/fmh_gpib.h b/drivers/gpib/fmh_gpib/fmh_gpib.h
new file mode 100644
index 000000000000..e7602d7e1401
--- /dev/null
+++ b/drivers/gpib/fmh_gpib/fmh_gpib.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    Author: Frank Mori Hess <fmh6jj@gmail.com>
+ *   Copyright: (C) 2006, 2010, 2015 Fluke Corporation
+ *	(C) 2017 Frank Mori Hess
+ ***************************************************************************/
+
+#include <linux/dmaengine.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include "nec7210.h"
+
+static const int fifo_reg_offset = 2;	/* byte distance between consecutive fifo registers */
+
+static const int gpib_control_status_pci_resource_index;	/* no initializer, i.e. PCI resource 0 */
+static const int gpib_fifo_pci_resource_index = 1;	/* PCI resource holding the fifo registers */
+
+/* We don't have a real pci vendor/device id, the following will need to be
+ * patched to match prototype hardware.
+ */
+#define BOGUS_PCI_VENDOR_ID_FLUKE 0xffff
+#define BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER 0x0
+
+struct fmh_priv {	/* per-board driver state, stored in board->private_data */
+	struct nec7210_priv nec7210_priv;
+	struct resource *gpib_iomem_res;	/* gpib control/status register resource */
+	struct resource *write_transfer_counter_res;
+	struct resource *dma_port_res;	/* fifo register resource */
+	int irq;	/* irq number given to request_irq() */
+	struct dma_chan *dma_channel;
+	u8 *dma_buffer;
+	int dma_buffer_size;	/* allocation size of dma_buffer */
+	int dma_burst_length;	/* masked FIFO_MAX_BURST_LENGTH_REG value */
+	void __iomem *fifo_base;	/* mapped fifo registers, NULL if not mapped */
+	unsigned supports_fifo_interrupts : 1;	/* result of the heuristic in fmh_gpib_init() */
+};
+
+/* The half-fifo size is the burst length read from the hardware at attach time. */
+static inline int fmh_gpib_half_fifo_size(struct fmh_priv *priv)
+{
+	const int burst_length = priv->dma_burst_length;
+
+	return burst_length;
+}
+
+// registers beyond the nec7210 register set
+enum fmh_gpib_regs {
+	EXT_STATUS_1_REG = 0x9,	/* bits: enum ext_status_1_bits */
+	STATE1_REG = 0xc,
+	ISR0_IMR0_REG = 0xe,	/* read as ISR0, written as IMR0 */
+	BUS_STATUS_REG = 0xf
+};
+
+/* IMR0 -- Interrupt Mode Register 0 */
+enum imr0_bits {
+	ATN_INTERRUPT_ENABLE_BIT = 0x4,
+	IFC_INTERRUPT_ENABLE_BIT = 0x8
+};
+
+/* ISR0 -- Interrupt Status Register 0 */
+enum isr0_bits {
+	ATN_INTERRUPT_BIT = 0x4,
+	IFC_INTERRUPT_BIT = 0x8	/* reported as EVENT_IFC by the interrupt handler */
+};
+
+enum state1_bits {
+	SOURCE_HANDSHAKE_SIDS_BITS = 0x0, /* source idle state */
+	SOURCE_HANDSHAKE_SGNS_BITS = 0x1, /* source generate state */
+	SOURCE_HANDSHAKE_SDYS_BITS = 0x2, /* source delay state */
+	SOURCE_HANDSHAKE_STRS_BITS = 0x5, /* source transfer state */
+	SOURCE_HANDSHAKE_MASK = 0x7
+};
+
+enum fmh_gpib_auxmr_bits {
+	AUX_I_REG = 0xe0,
+};
+
+enum aux_reg_i_bits {
+	LOCAL_PPOLL_MODE_BIT = 0x4
+};
+
+enum ext_status_1_bits {	/* mirrored into nec7210_priv.state by fmh_gpib_internal_interrupt() */
+	DATA_IN_STATUS_BIT = 0x01,	/* -> READ_READY_BN */
+	DATA_OUT_STATUS_BIT = 0x02,	/* -> WRITE_READY_BN */
+	COMMAND_OUT_STATUS_BIT = 0x04,	/* -> COMMAND_READY_BN */
+	RFD_HOLDOFF_STATUS_BIT = 0x08,	/* -> RFD_HOLDOFF_BN */
+	END_STATUS_BIT = 0x10	/* -> RECEIVED_END_BN, set only while DATA_IN_STATUS_BIT is also set */
+};
+
+/* dma fifo reg and bits */
+enum dma_fifo_regs {	/* register numbers; fifos_read()/fifos_write() scale them by fifo_reg_offset */
+	FIFO_DATA_REG = 0x0,
+	FIFO_CONTROL_STATUS_REG = 0x1,	/* written with fifo_control_bits, read back as fifo_status_bits */
+	FIFO_XFER_COUNTER_REG = 0x2,
+	FIFO_MAX_BURST_LENGTH_REG = 0x3	/* source of fmh_priv.dma_burst_length */
+};
+
+enum fifo_data_bits {
+	FIFO_DATA_EOI_FLAG = 0x100	/* flag bit above the data byte in a word read from FIFO_DATA_REG */
+};
+
+enum fifo_control_bits {
+	TX_FIFO_DMA_REQUEST_ENABLE = 0x0001,
+	TX_FIFO_CLEAR = 0x0002,
+	TX_FIFO_HALF_EMPTY_INTERRUPT_ENABLE = 0x0008,
+	RX_FIFO_DMA_REQUEST_ENABLE = 0x0100,
+	RX_FIFO_CLEAR = 0x0200,
+	RX_FIFO_HALF_FULL_INTERRUPT_ENABLE = 0x0800
+};
+
+enum fifo_status_bits {
+	TX_FIFO_EMPTY = 0x0001,
+	TX_FIFO_FULL = 0x0002,
+	TX_FIFO_HALF_EMPTY = 0x0004,
+	TX_FIFO_HALF_EMPTY_INTERRUPT_IS_ENABLED = 0x0008,
+	TX_FIFO_DMA_REQUEST_IS_ENABLED = 0x0010,
+	RX_FIFO_EMPTY = 0x0100,
+	RX_FIFO_FULL = 0x0200,
+	RX_FIFO_HALF_FULL = 0x0400,
+	RX_FIFO_HALF_FULL_INTERRUPT_IS_ENABLED = 0x0800,
+	RX_FIFO_DMA_REQUEST_IS_ENABLED = 0x1000
+};
+
+static const unsigned int fifo_data_mask = 0x00ff;	/* data byte within a FIFO_DATA_REG word */
+static const unsigned int fifo_xfer_counter_mask = 0x0fff;	/* also bounds one chunk in fmh_gpib_fifo_read() */
+static const unsigned int fifo_max_burst_length_mask = 0x00ff;	/* applied to FIFO_MAX_BURST_LENGTH_REG */
+
+/*
+ * Read the gpib control/status register @register_num with readb().
+ * Registers are nec_priv->offset bytes apart, starting at nec_priv->mmiobase.
+ */
+static inline u8 gpib_cs_read_byte(struct nec7210_priv *nec_priv,
+				   unsigned int register_num)
+{
+	const unsigned int byte_offset = register_num * nec_priv->offset;
+
+	return readb(nec_priv->mmiobase + byte_offset);
+}
+
+/*
+ * Write @data to the gpib control/status register @register_num with writeb().
+ * Registers are nec_priv->offset bytes apart, starting at nec_priv->mmiobase.
+ */
+static inline void gpib_cs_write_byte(struct nec7210_priv *nec_priv, u8 data,
+				      unsigned int register_num)
+{
+	const unsigned int byte_offset = register_num * nec_priv->offset;
+
+	writeb(data, nec_priv->mmiobase + byte_offset);
+}
+
+/*
+ * Read the 16-bit fifo register @register_num (see enum dma_fifo_regs).
+ * Returns 0 without touching the hardware when the fifo registers are not
+ * mapped (fifo_base is NULL).
+ */
+static inline uint16_t fifos_read(struct fmh_priv *fmh_priv, int register_num)
+{
+	void __iomem *base = fmh_priv->fifo_base;
+
+	if (base)
+		return readw(base + register_num * fifo_reg_offset);
+	return 0;
+}
+
+/*
+ * Write @data to the 16-bit fifo register @register_num (see enum
+ * dma_fifo_regs).  Does nothing when the fifo registers are not mapped
+ * (fifo_base is NULL).
+ */
+static inline void fifos_write(struct fmh_priv *fmh_priv, uint16_t data, int register_num)
+{
+	void __iomem *base = fmh_priv->fifo_base;
+
+	if (base)
+		writew(data, base + register_num * fifo_reg_offset);
+}
+
+enum bus_status_bits {
+	BSR_ATN_BIT = 0x01,
+	BSR_EOI_BIT = 0x02,
+	BSR_SRQ_BIT = 0x04,
+	BSR_IFC_BIT = 0x08,
+	BSR_REN_BIT = 0x10,
+	BSR_DAV_BIT = 0x20,
+	BSR_NRFD_BIT = 0x40,
+	BSR_NDAC_BIT = 0x80,
+};
+
+enum fmh_gpib_aux_cmds {	/* values written to AUXMR */
+	/* AUX_RTL2 is an auxiliary command which causes the cb7210 to assert
+	 * (and keep asserted) the local rtl message.  This is used in conjunction
+	 * with the normal nec7210 AUX_RTL command, which
+	 * pulses the rtl message, having the effect of clearing rtl if it was left
+	 * asserted by AUX_RTL2.
+	 */
+	AUX_RTL2 = 0x0d,
+	AUX_RFD_HOLDOFF_ASAP = 0x15,	/* issued together with setting RFD_HOLDOFF_BN */
+	AUX_REQT = 0x18,
+	AUX_REQF = 0x19,
+	AUX_LO_SPEED = 0x40,	/* selected by fmh_gpib_init() */
+	AUX_HI_SPEED = 0x41
+};
diff --git a/drivers/gpib/gpio/Makefile b/drivers/gpib/gpio/Makefile
new file mode 100644
index 000000000000..00ea52abdda7
--- /dev/null
+++ b/drivers/gpib/gpio/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_GPIB_GPIO) += gpib_bitbang.o
+
+
diff --git a/drivers/gpib/gpio/gpib_bitbang.c b/drivers/gpib/gpio/gpib_bitbang.c
new file mode 100644
index 000000000000..374cd61355e9
--- /dev/null
+++ b/drivers/gpib/gpio/gpib_bitbang.c
@@ -0,0 +1,1469 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*************************************************************************
+ *  This code has been developed at the Institute of Sensor and Actuator  *
+ *  Systems (Technical University of Vienna, Austria) to enable the GPIO  *
+ *  lines (e.g. of a raspberry pi) to function as a GPIB master device	  *
+ *									  *
+ *  authors		 : Thomas Klima					  *
+ *			   Marcello Carla'				  *
+ *			   Dave Penkler					  *
+ *									  *
+ *  copyright		 : (C) 2016 Thomas Klima			  *
+ *									  *
+ *************************************************************************/
+
+/*
+ * limitations:
+ *	works only on RPi
+ *	cannot function as non-CIC system controller with SN7516x because
+ *	SN75161B cannot simultaneously make ATN input with IFC and REN as
+ *	outputs.
+ * not implemented:
+ *	parallel poll
+ *	return2local
+ *	device support (non master operation)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define NAME KBUILD_MODNAME
+
+#define ENABLE_IRQ(IRQ, TYPE) irq_set_irq_type(IRQ, TYPE)
+#define DISABLE_IRQ(IRQ) irq_set_irq_type(IRQ, IRQ_TYPE_NONE)
+
+/*
+ * Debug print levels:
+ *  0 = load/unload info and errors that make the driver fail;
+ *  1 = + warnings for unforeseen events that may break the current
+ *	 operation and lead to a timeout, but do not affect the
+ *       driver integrity (mainly unexpected interrupts);
+ *  2 = + trace of function calls;
+ *  3 = + trace of protocol codes;
+ *  4 = + trace of interrupt operation.
+ */
+#define dbg_printk(level, frm, ...)					\
+	do { if (debug >= (level))					\
+			dev_dbg(board->gpib_dev, frm, ## __VA_ARGS__); } \
+	while (0)
+
+#define LINVAL gpiod_get_value(DAV),		\
+		gpiod_get_value(NRFD),		\
+		gpiod_get_value(NDAC),		\
+		gpiod_get_value(SRQ)
+#define LINFMT "DAV: %d	 NRFD:%d  NDAC: %d SRQ: %d"
+
+#include "gpibP.h"
+#include "gpib_state_machines.h"
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/gpio/driver.h>
+#include <linux/gpio/machine.h>
+#include <linux/gpio.h>
+#include <linux/irq.h>
+
+static int sn7516x_used = 1, sn7516x;
+module_param(sn7516x_used, int, 0660);
+
+#define PINMAP_0 "elektronomikon"
+#define PINMAP_1 "gpib4pi-1.1"
+#define PINMAP_2 "yoga"
+static char *pin_map = PINMAP_0;
+module_param(pin_map, charp, 0660);
+MODULE_PARM_DESC(pin_map, " valid values: " PINMAP_0 " " PINMAP_1 " " PINMAP_2);
+
+/**********************************************
+ *  Signal pairing and pin wiring between the *
+ *  Raspberry-Pi connector and the GPIB bus   *
+ *					      *
+ *		 signal		  pin wiring  *
+ *	      GPIB  Pi-gpio	GPIB  ->  RPi *
+ **********************************************
+ */
+enum lines_t {
+	D01_pin_nr =  20,     /*   1  ->  38  */
+	D02_pin_nr =  26,     /*   2  ->  37  */
+	D03_pin_nr =  16,     /*   3  ->  36  */
+	D04_pin_nr =  19,     /*   4  ->  35  */
+	D05_pin_nr =  13,     /*  13  ->  33  */
+	D06_pin_nr =  12,     /*  14  ->  32  */
+	D07_pin_nr =   6,     /*  15  ->  31  */
+	D08_pin_nr =   5,     /*  16  ->  29  */
+	EOI_pin_nr =   9,     /*   5  ->  21  */
+	DAV_pin_nr =  10,     /*   6  ->  19  */
+	NRFD_pin_nr = 24,     /*   7  ->  18  */
+	NDAC_pin_nr = 23,     /*   8  ->  16  */
+	IFC_pin_nr =  22,     /*   9  ->  15  */
+	SRQ_pin_nr =  11,     /*  10  ->  23  */
+	_ATN_pin_nr = 25,     /*  11  ->  22  */
+	REN_pin_nr =  27,     /*  17  ->  13  */
+/*
+ *  GROUND PINS
+ *    12,18,19,20,21,22,23,24  => 14,20,25,30,34,39
+ */
+
+/*
+ *  These lines are used to control the external
+ *  SN75160/161 driver chips when used.
+ *  When not used there is reduced fan out;
+ *  currently tested with up to 4 devices.
+ */
+
+/*		 Pi GPIO	RPI   75161B 75160B   Description       */
+	PE_pin_nr =    7,    /*	 26  ->	  nc	 11   Pullup Enable     */
+	DC_pin_nr =    8,    /*	 24  ->	  12	 nc   Direction control */
+	TE_pin_nr =   18,    /*	 12  ->	   2	  1   Talk Enable       */
+	ACT_LED_pin_nr = 4,  /*	  7  ->	 LED  */
+
+/* YOGA adapter uses different pinout to ease layout */
+	YOGA_D03_pin_nr =  13,
+	YOGA_D04_pin_nr =  12,
+	YOGA_D05_pin_nr =  21,
+	YOGA_D06_pin_nr =  19,
+};
+
+/*
+ * GPIO descriptors and pins - WARNING: STRICTLY KEEP ITEMS ORDER
+ */
+
+#define GPIB_PINS 16
+#define SN7516X_PINS 4
+#define NUM_PINS (GPIB_PINS + SN7516X_PINS)
+
+#define ACT_LED_ON do {						\
+		if (ACT_LED)					\
+			gpiod_direction_output(ACT_LED, 1);	\
+	} while (0)
+#define ACT_LED_OFF do {					\
+		if (ACT_LED)					\
+			gpiod_direction_output(ACT_LED, 0);	\
+	} while (0)
+
+static struct gpio_desc *all_descriptors[GPIB_PINS + SN7516X_PINS];
+
+#define D01 all_descriptors[0]
+#define D02 all_descriptors[1]
+#define D03 all_descriptors[2]
+#define D04 all_descriptors[3]
+#define D05 all_descriptors[4]
+#define D06 all_descriptors[5]
+#define D07 all_descriptors[6]
+#define D08 all_descriptors[7]
+
+#define EOI all_descriptors[8]
+#define NRFD all_descriptors[9]
+#define IFC all_descriptors[10]
+#define _ATN all_descriptors[11]
+#define REN all_descriptors[12]
+#define DAV all_descriptors[13]
+#define NDAC all_descriptors[14]
+#define SRQ all_descriptors[15]
+
+#define PE all_descriptors[16]
+#define DC all_descriptors[17]
+#define TE all_descriptors[18]
+#define ACT_LED all_descriptors[19]
+
+/* YOGA adapter uses a global enable for the buffer chips, re-using the TE pin */
+#define YOGA_ENABLE TE
+
+static int gpios_vector[] = {
+	D01_pin_nr,
+	D02_pin_nr,
+	D03_pin_nr,
+	D04_pin_nr,
+	D05_pin_nr,
+	D06_pin_nr,
+	D07_pin_nr,
+	D08_pin_nr,
+
+	EOI_pin_nr,
+	NRFD_pin_nr,
+	IFC_pin_nr,
+	_ATN_pin_nr,
+	REN_pin_nr,
+	DAV_pin_nr,
+	NDAC_pin_nr,
+	SRQ_pin_nr,
+
+	PE_pin_nr,
+	DC_pin_nr,
+	TE_pin_nr,
+	ACT_LED_pin_nr
+};
+
+/* Lookup table for general GPIOs */
+
+static struct gpiod_lookup_table gpib_gpio_table_1 = {
+	// for bcm2835/6
+	.dev_id = "",	 // device id of board device
+	.table = {
+		GPIO_LOOKUP_IDX("GPIO_GCLK",  U16_MAX, NULL,  4, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO5",	  U16_MAX, NULL,  5, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO6",	  U16_MAX, NULL,  6, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("SPI_CE1_N",  U16_MAX, NULL,  7, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("SPI_CE0_N",  U16_MAX, NULL,  8, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("SPI_MISO",	  U16_MAX, NULL,  9, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("SPI_MOSI",	  U16_MAX, NULL, 10, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("SPI_SCLK",	  U16_MAX, NULL, 11, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO12",	  U16_MAX, NULL, 12, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO13",	  U16_MAX, NULL, 13, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO16",	  U16_MAX, NULL, 16, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO17",	  U16_MAX, NULL, 17, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO18",	  U16_MAX, NULL, 18, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO19",	  U16_MAX, NULL, 19, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO20",	  U16_MAX, NULL, 20, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO21",	  U16_MAX, NULL, 21, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO22",	  U16_MAX, NULL, 22, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO23",	  U16_MAX, NULL, 23, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO24",	  U16_MAX, NULL, 24, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO25",	  U16_MAX, NULL, 25, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO26",	  U16_MAX, NULL, 26, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO27",	  U16_MAX, NULL, 27, GPIO_ACTIVE_HIGH),
+		{ }
+	},
+};
+
+static struct gpiod_lookup_table gpib_gpio_table_0 = {
+	.dev_id = "",	 // device id of board device
+	.table = {
+		// for bcm27xx based pis (b b+ 2b 3b 3b+ 4 5)
+		GPIO_LOOKUP_IDX("GPIO4",  U16_MAX, NULL,  4, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO5",  U16_MAX, NULL,  5, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO6",  U16_MAX, NULL,  6, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO7",  U16_MAX, NULL,  7, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO8",  U16_MAX, NULL,  8, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO9",  U16_MAX, NULL,  9, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO10", U16_MAX, NULL, 10, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO11", U16_MAX, NULL, 11, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO12", U16_MAX, NULL, 12, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO13", U16_MAX, NULL, 13, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO16", U16_MAX, NULL, 16, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO17", U16_MAX, NULL, 17, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO18", U16_MAX, NULL, 18, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO19", U16_MAX, NULL, 19, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO20", U16_MAX, NULL, 20, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO21", U16_MAX, NULL, 21, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO22", U16_MAX, NULL, 22, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO23", U16_MAX, NULL, 23, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO24", U16_MAX, NULL, 24, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO25", U16_MAX, NULL, 25, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO26", U16_MAX, NULL, 26, GPIO_ACTIVE_HIGH),
+		GPIO_LOOKUP_IDX("GPIO27", U16_MAX, NULL, 27, GPIO_ACTIVE_HIGH),
+		{ }
+	},
+};
+
+static struct gpiod_lookup_table *lookup_tables[] = {
+	&gpib_gpio_table_0,
+	&gpib_gpio_table_1,
+	NULL
+};
+
+/* struct which defines private_data for gpio driver */
+
+struct bb_priv {
+	int irq_NRFD;
+	int irq_NDAC;
+	int irq_DAV;
+	int irq_SRQ;
+	int dav_mode;	     /* dav  interrupt mode 0/1 -> edge/levels */
+	int nrfd_mode;	     /* nrfd interrupt mode 0/1 -> edge/levels */
+	int ndac_mode;	     /* ndac interrupt mode 0/1 -> edge/levels */
+	int dav_tx;	     /* keep track of DAV status while sending */
+	int dav_rx;	     /* keep track of DAV status while receiving */
+	u8 eos;              /* eos character */
+	short eos_flags;     /* eos mode */
+	short eos_check;     /* eos check required in current operation ... */
+	short eos_check_8;   /* ... with byte comparison */
+	short eos_mask_7;    /* ... with 7 bit masked character */
+	short int end;	     /* read: EOI or EOS seen; write: send EOI with last byte */
+	int request;	     /* number of bytes asked for by the current read */
+	int count;	     /* number of bytes received so far */
+	int direction;
+	int t1_delay;	     /* T1 delay in nanoseconds, see bb_t1_delay() */
+	u8 *rbuf;	     /* destination buffer of the current read */
+	u8 *wbuf;
+	int end_flag;	     /* read finished: request satisfied or end seen */
+	int r_busy;	      /* 0==idle   1==busy */
+	int w_busy;	      /* 0==idle   1==busy */
+	int write_done;	      /* set by the write irq handlers to wake bb_write() */
+	int cmd;	      /* 1 = cmd write in progress */
+	size_t w_cnt;	      /* number of bytes put on the bus so far */
+	size_t length;	      /* number of bytes of the current write */
+	u8 *w_buf;	      /* source buffer of the current write */
+	spinlock_t rw_lock;   /* protects the transfer state shared with the irq handlers */
+	int phase;	      /* progress marker, reported in debug traces */
+	int ndac_idle;	      /* NDAC interrupts taken while no write was busy */
+	int ndac_seq;	      /* NDAC interrupts taken with NDAC reading 0 */
+	int nrfd_idle;	      /* NRFD interrupts taken while no write was busy */
+	int nrfd_seq;	      /* NRFD interrupts taken with NRFD reading 0 */
+	int dav_seq;	      /* interrupts that found DAV in an unexpected state */
+	long all_irqs;	      /* DAV, NRFD and NDAC interrupts taken in total */
+	int dav_idle;	      /* DAV interrupts taken while no read was busy */
+
+	enum talker_function_state talker_state;
+	enum listener_function_state listener_state;
+};
+
+static inline long usec_diff(struct timespec64 *a, struct timespec64 *b);
+static void bb_buffer_print(struct gpib_board *board, unsigned char *buffer, size_t length,
+			    int cmd, int eoi);
+static void set_data_lines(u8 byte);
+static u8 get_data_lines(void);
+static void set_data_lines_input(void);
+static void set_data_lines_output(void);
+static inline int check_for_eos(struct bb_priv *priv, u8 byte);
+static void set_atn(struct gpib_board *board, int atn_asserted);
+
+static inline void SET_DIR_WRITE(struct bb_priv *priv);
+static inline void SET_DIR_READ(struct bb_priv *priv);
+
+#define DIR_READ 0
+#define DIR_WRITE 1
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB helper functions for bitbanging I/O");
+
+/****  global variables	 ****/
+static int debug;
+module_param(debug, int, 0644);
+
+/* Trace helper: replace anything outside printable ASCII by a blank. */
+static char printable(char x)
+{
+	/* 32..126 is the printable ASCII range */
+	return (x >= 32 && x <= 126) ? x : ' ';
+}
+
+/*
+ * READ - receive up to @length bytes into @buffer; the DAV interrupt routine
+ * stores the bytes while this function sleeps on board->wait.
+ * Returns 0, -ETIMEDOUT or -ERESTARTSYS; *bytes_read and *end are always set.
+ */
+
+static int bb_read(struct gpib_board *board, u8 *buffer, size_t length,
+		   int *end, size_t *bytes_read)
+{
+	struct bb_priv *priv = board->private_data;
+	unsigned long flags;
+	int retval = 0;
+
+	ACT_LED_ON;
+	SET_DIR_READ(priv);
+
+	dbg_printk(2, "board: %p  lock %d  length: %zu\n",
+		   board, mutex_is_locked(&board->user_mutex), length);
+
+	priv->end = 0;
+	priv->count = 0;
+	priv->rbuf = buffer;
+	if (length == 0)
+		goto read_end;
+	priv->request = length;
+	priv->eos_check = (priv->eos_flags & REOS) == 0; /* set when REOS is off */
+	priv->eos_check_8 = priv->eos_flags & BIN;	 /* over 8 bits */
+	priv->eos_mask_7 = priv->eos & 0x7f;		 /* with this 7 bit eos */
+
+	dbg_printk(3, ".........." LINFMT "\n", LINVAL);
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+	priv->dav_mode = 1; /* handler switches to edge mode on its first run */
+	priv->dav_rx = 1;
+	ENABLE_IRQ(priv->irq_DAV, IRQ_TYPE_LEVEL_LOW);
+	priv->end_flag = 0;
+	gpiod_set_value(NRFD, 1); // ready for data
+	priv->r_busy = 1;
+	priv->phase = 100;
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+
+	/* wait for the interrupt routine to finish its work */
+
+	retval = wait_event_interruptible(board->wait,
+					  (priv->end_flag || board->status & TIMO));
+
+	dbg_printk(3, "awake from wait queue: %d\n", retval);
+
+	if (retval == 0 && board->status & TIMO) {
+		retval = -ETIMEDOUT;
+		dbg_printk(1, "timeout\n");
+	} else if (retval) {
+		retval = -ERESTARTSYS;
+	}
+
+	DISABLE_IRQ(priv->irq_DAV);
+	spin_lock_irqsave(&priv->rw_lock, flags);
+	gpiod_set_value(NRFD, 0); // DIR_READ line state
+	priv->r_busy = 0;
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+
+read_end:
+	ACT_LED_OFF;
+	*bytes_read = priv->count;
+	*end = priv->end;
+	priv->r_busy = 0;
+	dbg_printk(2, "return: %d  eoi|eos: %d count: %d\n\n", retval, priv->end, priv->count);
+	return retval;
+}
+
+/*
+ * READ interrupt routine (DAV line), the receiving side of the handshake.
+ * DAV low: drop NRFD, latch the data byte and EOI, then raise NDAC.
+ * DAV high: drop NDAC, then either wake bb_read() or raise NRFD again.
+ */
+
+static irqreturn_t bb_DAV_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct bb_priv *priv = board->private_data;
+	int val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+
+	priv->all_irqs++;
+
+	if (priv->dav_mode) { /* first run after bb_read() armed a level irq */
+		ENABLE_IRQ(priv->irq_DAV, IRQ_TYPE_EDGE_BOTH);
+		priv->dav_mode = 0;
+	}
+
+	if (priv->r_busy == 0) {
+		dbg_printk(1, "interrupt while idle after %d at %d\n",
+			   priv->count, priv->phase);
+		priv->dav_idle++;
+		priv->phase = 200;
+		goto dav_exit;	/* idle */
+	}
+
+	val = gpiod_get_value(DAV);
+	if (val == priv->dav_rx) { /* same level twice: logged, then handled anyway */
+		dbg_printk(1, "out of order DAV interrupt %d/%d after %zu/%zu at %d cmd %d "
+			   LINFMT ".\n", val, priv->dav_rx, priv->w_cnt, priv->length,
+			   priv->phase, priv->cmd, LINVAL);
+		priv->dav_seq++;
+	}
+	priv->dav_rx = val;
+
+	dbg_printk(3, "> irq: %d  DAV: %d  st: %4lx dir: %d  busy: %d:%d\n",
+		   irq, val, board->status, priv->direction, priv->r_busy, priv->w_busy);
+	/* NOTE(review): the store below is not bounded by priv->request */
+	if (val == 0) {
+		gpiod_set_value(NRFD, 0); // not ready for data
+		priv->rbuf[priv->count++] = get_data_lines();
+		priv->end = !gpiod_get_value(EOI);
+		gpiod_set_value(NDAC, 1); // data accepted
+		priv->end |= check_for_eos(priv, priv->rbuf[priv->count - 1]);
+		priv->end_flag = ((priv->count >= priv->request) || priv->end);
+		priv->phase = 210;
+	} else {
+		gpiod_set_value(NDAC, 0);	// data not accepted
+		if (priv->end_flag) {
+			priv->r_busy = 0;
+			wake_up_interruptible(&board->wait);
+			priv->phase = 220;
+		} else {
+			gpiod_set_value(NRFD, 1);     // ready for data
+			priv->phase = 230;
+		}
+	}
+
+dav_exit:
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+	dbg_printk(3, "< irq: %d  count %d\n", irq, priv->count);
+	return IRQ_HANDLED;
+}
+
+/*
+ * WRITE - send @length bytes from @buffer; the NRFD/NDAC interrupt routines
+ * clock the bytes out, with EOI on the last byte when @send_eoi is set.
+ * Returns the byte count, 0 for an empty write, or a negative errno.
+ */
+
+static int bb_write(struct gpib_board *board, u8 *buffer, size_t length,
+		    int send_eoi, size_t *bytes_written)
+{
+	unsigned long flags;
+	int retval = 0;
+
+	struct bb_priv *priv = board->private_data;
+
+	ACT_LED_ON;
+
+	priv->w_cnt = 0;
+	priv->w_buf = buffer;
+	dbg_printk(2, "board %p	lock %d	 length: %zu\n",
+		   board, mutex_is_locked(&board->user_mutex), length);
+
+	if (debug > 1)
+		bb_buffer_print(board, buffer, length, priv->cmd, send_eoi);
+	priv->count = 0;
+	priv->phase = 300;
+
+	if (length == 0)
+		goto write_end;
+	priv->end = send_eoi;
+	priv->length = length;
+
+	SET_DIR_WRITE(priv);
+
+	dbg_printk(2, "Enabling interrupts - NRFD: %d   NDAC: %d\n",
+		   gpiod_get_value(NRFD), gpiod_get_value(NDAC));
+
+	if (gpiod_get_value(NRFD) && gpiod_get_value(NDAC)) { /* check for listener */
+		retval = -ENOTCONN;
+		goto write_end;
+	}
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+	priv->w_busy = 1;	   /* make the interrupt routines active */
+	priv->write_done = 0;
+	priv->nrfd_mode = 1;	   /* handlers switch to edge mode on first run */
+	priv->ndac_mode = 1;
+	priv->dav_tx = 1;
+	ENABLE_IRQ(priv->irq_NDAC, IRQ_TYPE_LEVEL_HIGH);
+	ENABLE_IRQ(priv->irq_NRFD, IRQ_TYPE_LEVEL_HIGH);
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+
+	/* wait for the interrupt routines to finish their work */
+
+	retval = wait_event_interruptible(board->wait,
+					  priv->write_done || (board->status & TIMO));
+
+	dbg_printk(3, "awake from wait queue: %d\n", retval);
+
+	if (retval == 0) {
+		if (board->status & TIMO) {
+			retval = -ETIMEDOUT;
+			dbg_printk(1, "timeout after %zu/%zu at %d " LINFMT " eoi: %d\n",
+				   priv->w_cnt, length, priv->phase, LINVAL, send_eoi);
+		} else {
+			retval = priv->w_cnt;
+		}
+	} else {
+		retval = -ERESTARTSYS;
+	}
+
+	DISABLE_IRQ(priv->irq_NRFD);
+	DISABLE_IRQ(priv->irq_NDAC);
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+	priv->w_busy = 0;
+	gpiod_set_value(DAV, 1); // DIR_WRITE line state
+	gpiod_set_value(EOI, 1); // De-assert EOI (in case)
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+
+write_end:
+	*bytes_written = priv->w_cnt;
+	ACT_LED_OFF;
+	dbg_printk(2, "sent %zu bytes\r\n\r\n", *bytes_written);
+	priv->phase = 310;
+	return retval;
+}
+
+/*
+ * WRITE interrupt routine (NRFD line): when NRFD reads 1 and DAV is idle,
+ * put the next byte on the data lines (with EOI on the last one if asked)
+ * and pull DAV low.  Out of order interrupts are only counted.
+ */
+
+static irqreturn_t bb_NRFD_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct bb_priv *priv = board->private_data;
+	unsigned long flags;
+	int nrfd;
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+
+	nrfd = gpiod_get_value(NRFD);
+	priv->all_irqs++;
+
+	dbg_printk(3, "> irq: %d  NRFD: %d   NDAC: %d	st: %4lx dir: %d  busy: %d:%d\n",
+		   irq, nrfd, gpiod_get_value(NDAC), board->status, priv->direction,
+		   priv->w_busy, priv->r_busy);
+
+	if (priv->nrfd_mode) { /* first run after bb_write() armed a level irq */
+		ENABLE_IRQ(priv->irq_NRFD, IRQ_TYPE_EDGE_RISING);
+		priv->nrfd_mode = 0;
+	}
+
+	if (priv->w_busy == 0) {
+		dbg_printk(1, "interrupt while idle after %zu/%zu at %d\n",
+			   priv->w_cnt, priv->length, priv->phase);
+		priv->nrfd_idle++;
+		goto nrfd_exit;	 /* idle */
+	}
+	if (nrfd == 0) {
+		dbg_printk(1, "out of order interrupt after %zu/%zu at %d cmd %d " LINFMT ".\n",
+			   priv->w_cnt, priv->length, priv->phase, priv->cmd, LINVAL);
+		priv->phase = 400;
+		priv->nrfd_seq++;
+		goto nrfd_exit;
+	}
+	if (!priv->dav_tx) { /* previous byte is still waiting for the NDAC handler */
+		dbg_printk(1, "DAV low after %zu/%zu cmd %d " LINFMT ". No action.\n",
+			   priv->w_cnt, priv->length, priv->cmd, LINVAL);
+		priv->dav_seq++;
+		goto nrfd_exit;
+	}
+
+	if (priv->w_cnt >= priv->length) { // test for missed NDAC end of transfer
+		dev_err(board->gpib_dev, "Unexpected NRFD exit\n");
+		priv->write_done = 1;
+		priv->w_busy = 0;
+		wake_up_interruptible(&board->wait);
+		goto nrfd_exit;
+	}
+
+	dbg_printk(3, "sending %zu\n", priv->w_cnt);
+
+	set_data_lines(priv->w_buf[priv->w_cnt++]); // put the data on the lines
+
+	if (priv->w_cnt == priv->length && priv->end) {
+		dbg_printk(3, "Asserting EOI\n");
+		gpiod_set_value(EOI, 0); // Assert EOI
+	}
+
+	gpiod_set_value(DAV, 0); // Data available
+	priv->dav_tx = 0;
+	priv->phase = 410;
+
+nrfd_exit:
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * WRITE interrupt routine (NDAC line): when NDAC reads 1 while DAV is low,
+ * release DAV for the byte just sent and, after the last byte, mark the
+ * write done and wake bb_write().  Out of order interrupts are only counted.
+ */
+
+static irqreturn_t bb_NDAC_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct bb_priv *priv = board->private_data;
+	unsigned long flags;
+	int ndac;
+
+	spin_lock_irqsave(&priv->rw_lock, flags);
+
+	ndac = gpiod_get_value(NDAC);
+	priv->all_irqs++;
+	dbg_printk(3, "> irq: %d  NRFD: %d   NDAC: %d	st: %4lx dir: %d  busy: %d:%d\n",
+		   irq, gpiod_get_value(NRFD), ndac, board->status, priv->direction,
+		   priv->w_busy, priv->r_busy);
+
+	if (priv->ndac_mode) { /* first run after bb_write() armed a level irq */
+		ENABLE_IRQ(priv->irq_NDAC, IRQ_TYPE_EDGE_RISING);
+		priv->ndac_mode = 0;
+	}
+
+	if (priv->w_busy == 0) {
+		dbg_printk(1, "interrupt while idle.\n");
+		priv->ndac_idle++;
+		goto ndac_exit;
+	}
+	if (ndac == 0) {
+		dbg_printk(1, "out of order interrupt at %zu:%d.\n", priv->w_cnt, priv->phase);
+		priv->phase = 500;
+		priv->ndac_seq++;
+		goto ndac_exit;
+	}
+	if (priv->dav_tx) { /* no byte is pending on the bus, nothing to acknowledge */
+		dbg_printk(1, "DAV high after %zu/%zu cmd %d " LINFMT ". No action.\n",
+			   priv->w_cnt, priv->length, priv->cmd, LINVAL);
+		priv->dav_seq++;
+		goto ndac_exit;
+	}
+
+	dbg_printk(3, "accepted %zu\n", priv->w_cnt - 1);
+
+	gpiod_set_value(DAV, 1); // Data not available
+	priv->dav_tx = 1;
+	priv->phase = 510;
+
+	if (priv->w_cnt >= priv->length) { // test for end of transfer
+		priv->write_done = 1;
+		priv->w_busy = 0;
+		wake_up_interruptible(&board->wait);
+	}
+
+ndac_exit:
+	spin_unlock_irqrestore(&priv->rw_lock, flags);
+	return IRQ_HANDLED;
+}
+
+/***************************************************************************
+ *									   *
+ *	interrupt routine for SRQ line					   *
+ *									   *
+ ***************************************************************************/
+
+static irqreturn_t bb_SRQ_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	const int srq_level = gpiod_get_value(SRQ);
+
+	dbg_printk(3, "> %d   st: %4lx\n", srq_level, board->status);
+
+	/* a 0 level latches SRQI in the board status; set_bit() is atomic */
+	if (srq_level == 0)
+		set_bit(SRQI_NUM, &board->status);
+
+	/* waiters are woken on every SRQ interrupt, whatever the level was */
+	wake_up_interruptible(&board->wait);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Send @length command bytes from @buffer (ATN has already been asserted by
+ * bb_take_control() and is released later by bb_go_to_stby), then update the
+ * talker/listener state of this board from the bytes that were handed over.
+ */
+static int bb_command(struct gpib_board *board, u8 *buffer,
+		      size_t length, size_t *bytes_written)
+{
+	struct bb_priv *priv = board->private_data;
+	int status;
+	int idx;
+
+	dbg_printk(2, "%p  %p\n", buffer, board->buffer);
+
+	/* flag the transfer as a command write; it is sent without EOI */
+	priv->cmd = 1;
+	status = bb_write(board, buffer, length, 0, bytes_written);
+
+	for (idx = 0; idx < length; idx++) {
+		const u8 code = buffer[idx];
+
+		if (code == UNT) {
+			priv->talker_state = talker_idle;
+		} else if (code == UNL) {
+			priv->listener_state = listener_idle;
+		} else if (code == (MTA(board->pad))) {
+			/* this board becomes the addressed talker */
+			priv->talker_state = talker_addressed;
+			priv->listener_state = listener_idle;
+		} else if (code == (MLA(board->pad))) {
+			/* this board becomes the addressed listener */
+			priv->listener_state = listener_addressed;
+			priv->talker_state = talker_idle;
+		}
+	}
+
+	priv->cmd = 0;
+	return status;
+}
+
+/***************************************************************************
+ *									   *
+ *	Buffer print with decode for debug/trace			   *
+ *									   *
+ ***************************************************************************/
+
+static char *cmd_string[32] = {
+	"",    // 0x00
+	"GTL", // 0x01
+	"",    // 0x02
+	"",    // 0x03
+	"SDC", // 0x04
+	"PPC", // 0x05
+	"",    // 0x06
+	"",    // 0x07
+	"GET", // 0x08
+	"TCT", // 0x09
+	"",    // 0x0a
+	"",    // 0x0b
+	"",    // 0x0c
+	"",    // 0x0d
+	"",    // 0x0e
+	"",    // 0x0f
+	"",    // 0x10
+	"LLO", // 0x11
+	"",    // 0x12
+	"",    // 0x13
+	"DCL", // 0x14
+	"PPU", // 0x15
+	"",    // 0x16
+	"",    // 0x17
+	"SPE", // 0x18
+	"SPD", // 0x19
+	"",    // 0x1a
+	"",    // 0x1b
+	"",    // 0x1c
+	"",    // 0x1d
+	"",    // 0x1e
+	"CFE"  // 0x1f
+};
+
+static void bb_buffer_print(struct gpib_board *board, unsigned char *buffer, size_t length,
+			    int cmd, int eoi)
+{
+	int pos;
+
+	if (!cmd) {
+		dbg_printk(2, "<data len %zu %s>\n", length, (eoi) ? "w.EOI" : " ");
+		for (pos = 0; pos < length; pos++)
+			dbg_printk(2, "%3d  0x%x->%c\n", pos, buffer[pos], printable(buffer[pos]));
+		return;
+	}
+	/* command bytes are decoded by value range, one trace line per byte */
+	dbg_printk(2, "<cmd len %zu>\n", length);
+	for (pos = 0; pos < length; pos++) {
+		if (buffer[pos] < 0x20)
+			dbg_printk(3, "0x%x=%s\n", buffer[pos], cmd_string[buffer[pos]]);
+		else if (buffer[pos] == 0x3f)
+			dbg_printk(3, "0x%x=%s\n", buffer[pos], "UNL");
+		else if (buffer[pos] == 0x5f)
+			dbg_printk(3, "0x%x=%s\n", buffer[pos], "UNT");
+		else if (buffer[pos] < 0x60)
+			dbg_printk(3, "0x%x=%s%d\n", buffer[pos],
+				   (buffer[pos] & 0x40) ? "TLK" : "LSN", buffer[pos] & 0x1F);
+		else
+			dbg_printk(3, "0x%x\n", buffer[pos]);
+	}
+}
+
+/***************************************************************************
+ *									   *
+ * STATUS Management							   *
+ *									   *
+ ***************************************************************************/
+static void set_atn(struct gpib_board *board, int atn_asserted)
+{
+	struct bb_priv *priv = board->private_data;
+
+	/* at most one of the two functions may be away from idle */
+	if (!(priv->listener_state == listener_idle || priv->talker_state == talker_idle))
+		dev_err(board->gpib_dev, "listener/talker state machine conflict\n");
+
+	if (!atn_asserted) {
+		if (priv->listener_state == listener_addressed) {
+			priv->listener_state = listener_active;
+			SET_DIR_READ(priv); // make sure holdoff is active when we unassert ATN
+		}
+		if (priv->talker_state == talker_addressed)
+			priv->talker_state = talker_active;
+	} else {
+		if (priv->listener_state == listener_active)
+			priv->listener_state = listener_addressed;
+		if (priv->talker_state == talker_active)
+			priv->talker_state = talker_addressed;
+		SET_DIR_WRITE(priv);  // need to be able to read bus NRFD/NDAC
+	}
+	gpiod_direction_output(_ATN, !atn_asserted);
+}
+
+static int bb_take_control(struct gpib_board *board, int synchronous)
+{
+	dbg_printk(2, "%d\n", synchronous);
+	set_atn(board, 1);
+	return 0;
+}
+
+static int bb_go_to_standby(struct gpib_board *board)
+{
+	dbg_printk(2, "\n");
+	set_atn(board, 0);
+	return 0;
+}
+
+static int bb_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct bb_priv *priv = board->private_data;
+
+	dbg_printk(2, "%d\n", request_control);
+	if (!request_control)
+		return -EINVAL;
+	/* only taking system control is handled; REN and IFC start at level 1 */
+	gpiod_direction_output(REN, 1); /* user space must enable REN if needed */
+	gpiod_direction_output(IFC, 1); /* user space must toggle IFC if needed */
+	if (sn7516x)
+		gpiod_direction_output(DC, 0); /* enable ATN as output on SN75161/2 */
+
+	gpiod_direction_input(SRQ);
+	/* arm the SRQ irq for falling edges */
+	ENABLE_IRQ(priv->irq_SRQ, IRQ_TYPE_EDGE_FALLING);
+
+	return 0;
+}
+
+static void bb_interface_clear(struct gpib_board *board, int assert)
+{
+	struct bb_priv *priv = board->private_data;
+
+	dbg_printk(2, "%d\n", assert);
+	if (!assert) {
+		gpiod_direction_output(IFC, 1);	/* release IFC */
+		return;
+	}
+	gpiod_direction_output(IFC, 0);		/* IFC is asserted low */
+	priv->talker_state = talker_idle;	/* both state machines restart idle */
+	priv->listener_state = listener_idle;
+	set_bit(CIC_NUM, &board->status);
+}
+
+static void bb_remote_enable(struct gpib_board *board, int enable)
+{
+	dbg_printk(2, "%d\n", enable);
+
+	/* REM in board->status follows the request; REN is driven low to enable */
+	if (enable)
+		set_bit(REM_NUM, &board->status);
+	else
+		clear_bit(REM_NUM, &board->status);
+	gpiod_direction_output(REN, !enable);
+}
+
+/* Record the EOS byte and set REOS (plus BIN on request) in eos_flags. */
+static int bb_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct bb_priv *priv = board->private_data;
+
+	dbg_printk(2, "%s\n", "EOS_en");
+
+	priv->eos = eos_byte;
+	/* any previous flag value is overwritten, not merged */
+	priv->eos_flags = compare_8_bits ? (REOS | BIN) : REOS;
+	return 0;
+}
+
+static void bb_disable_eos(struct gpib_board *board)
+{
+	struct bb_priv *priv = board->private_data;
+
+	dbg_printk(2, "\n");
+	priv->eos_flags &= ~REOS;
+}
+
+static unsigned int bb_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct bb_priv *priv = board->private_data;
+
+	board->status &= ~clear_mask;
+
+	/* SRQ and ATN are asserted low: level 0 sets the status bit */
+	if (gpiod_get_value(SRQ) == 0)
+		set_bit(SRQI_NUM, &board->status);
+	else
+		clear_bit(SRQI_NUM, &board->status);
+	if (gpiod_get_value(_ATN) == 0)
+		set_bit(ATN_NUM, &board->status);
+	else
+		clear_bit(ATN_NUM, &board->status);
+
+	/* TACS and LACS follow the talker and listener state machines */
+	if (priv->talker_state != talker_active && priv->talker_state != talker_addressed)
+		clear_bit(TACS_NUM, &board->status);
+	else
+		set_bit(TACS_NUM, &board->status);
+	if (priv->listener_state != listener_active &&
+	    priv->listener_state != listener_addressed)
+		clear_bit(LACS_NUM, &board->status);
+	else
+		set_bit(LACS_NUM, &board->status);
+
+	dbg_printk(2, "0x%lx mask 0x%x\n", board->status, clear_mask);
+	return board->status;
+}
+
+static int bb_primary_address(struct gpib_board *board, unsigned int address)
+{
+	dbg_printk(2, "%d\n", address);
+	board->pad = address;
+	return 0;
+}
+
+static int bb_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	dbg_printk(2, "%d %d\n", address, enable);
+	if (enable)
+		board->sad = address;
+	return 0;
+}
+
+static int bb_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	return -ENOENT;
+}
+
+static void bb_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+}
+
+static void bb_parallel_poll_response(struct gpib_board *board, int ist)
+{
+}
+
+static void bb_serial_poll_response(struct gpib_board *board, u8 status)
+{
+}
+
+static u8 bb_serial_poll_status(struct gpib_board *board)
+{
+	return 0; // -ENOENT;
+}
+
+static int bb_t1_delay(struct gpib_board *board,  unsigned int nano_sec)
+{
+	struct bb_priv *priv = board->private_data;
+	int delay_ns = 2000;
+
+	/* map the request onto 350, 1100 or 2000 ns; 2000 is the ceiling */
+	if (nano_sec <= 350)
+		delay_ns = 350;
+	else if (nano_sec <= 1100)
+		delay_ns = 1100;
+	priv->t1_delay = delay_ns;
+
+	dbg_printk(2, "t1 delay set to %d nanosec\n", priv->t1_delay);
+	return delay_ns;
+}
+
+static void bb_return_to_local(struct gpib_board *board)
+{
+}
+
+static int bb_line_status(const struct gpib_board *board)
+{
+	int line_status = VALID_ALL;
+
+	if (gpiod_get_value(REN) == 0)
+		line_status |= BUS_REN;
+	if (gpiod_get_value(IFC) == 0)
+		line_status |= BUS_IFC;
+	if (gpiod_get_value(NDAC) == 0)
+		line_status |= BUS_NDAC;
+	if (gpiod_get_value(NRFD) == 0)
+		line_status |= BUS_NRFD;
+	if (gpiod_get_value(DAV) == 0)
+		line_status |= BUS_DAV;
+	if (gpiod_get_value(EOI) == 0)
+		line_status |= BUS_EOI;
+	if (gpiod_get_value(_ATN) == 0)
+		line_status |= BUS_ATN;
+	if (gpiod_get_value(SRQ) == 0)
+		line_status |= BUS_SRQ;
+
+	dbg_printk(2, "status lines: %4x\n", line_status);
+
+	return line_status;
+}
+
+/***************************************************************************
+ *									   *
+ * Module Management							   *
+ *									   *
+ ***************************************************************************/
+
+static int allocate_private(struct gpib_board *board)
+{
+	/* zeroed, so every irq number, counter and flag in bb_priv starts at 0 */
+	board->private_data = kzalloc(sizeof(struct bb_priv), GFP_KERNEL);
+	/* returns 0 on success or -ENOMEM, a proper errno callers can pass on */
+	return board->private_data ? 0 : -ENOMEM;
+}
+
+static void free_private(struct gpib_board *board)
+{
+	kfree(board->private_data);
+	board->private_data = NULL;
+}
+
+static int bb_get_irq(struct gpib_board *board, char *name, struct gpio_desc *gpio, int *irq,
+		      irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags)
+{
+	if (!gpio)	/* no descriptor was obtained for this line */
+		return -1;
+	gpiod_direction_input(gpio);
+	*irq = gpiod_to_irq(gpio);
+	dbg_printk(2, "IRQ %s: %d\n", name, *irq);
+	if (*irq < 0) {
+		dev_err(board->gpib_dev, "can't get IRQ for %s\n", name);
+		*irq = 0;	/* bb_free_irq() would free_irq() any non-zero value */
+		return -1;
+	}
+	if (request_threaded_irq(*irq, handler, thread_fn, flags, name, board)) {
+		dev_err(board->gpib_dev, "can't request IRQ for %s %d\n", name, *irq);
+		*irq = 0;
+		return -1;
+	}
+	DISABLE_IRQ(*irq);	/* trigger type NONE until ENABLE_IRQ() selects one */
+	return 0;
+}
+
+static void bb_free_irq(struct gpib_board *board, int *irq, char *name)
+{
+	if (!*irq)	/* zero means no IRQ is held in this slot */
+		return;
+	free_irq(*irq, board);
+	dbg_printk(2, "IRQ %d(%s) freed\n", *irq, name);
+	*irq = 0;	/* mark the slot empty so a second call does nothing */
+}
+
+static void release_gpios(void)
+{
+	int pin;	/* index into all_descriptors[] */
+
+	for (pin = 0 ; pin < NUM_PINS ; pin++) {
+		if (!all_descriptors[pin])
+			continue;	/* slot holds no descriptor */
+		gpiod_put(all_descriptors[pin]);
+		all_descriptors[pin] = NULL;
+	}
+}
+
+static int allocate_gpios(struct gpib_board *board)
+{
+	int j;
+	int table_index = 0;
+	char name[256];
+	struct gpio_desc *desc;
+	struct gpiod_lookup_table *lookup_table;
+	/* Fill all_descriptors[] for every pin in use; returns 0, -ENOENT or -1. */
+	if (!board->gpib_dev) {
+		pr_err("NULL gpib dev for board\n");
+		return -ENOENT;
+	}
+	/* start with the first lookup table, keyed to this board's device name */
+	lookup_table = lookup_tables[table_index];
+	lookup_table->dev_id = dev_name(board->gpib_dev);
+	gpiod_add_lookup_table(lookup_table);
+	dbg_printk(1, "Allocating gpios using table index %d\n", table_index);
+
+	for (j = 0 ; j < NUM_PINS ; j++) {
+		if (gpios_vector[j] < 0)
+			continue;	/* negative entry: this pin is skipped */
+		/* name not really used in gpiod_get_index() */
+		sprintf(name, "GPIO%d", gpios_vector[j]);
+try_again:
+		dbg_printk(1, "Allocating gpio %s pin no %d\n", name, gpios_vector[j]);
+		desc = gpiod_get_index(board->gpib_dev, name, gpios_vector[j], GPIOD_IN);
+		/* on failure switch to the next lookup table and retry the same pin */
+		if (IS_ERR(desc)) {
+			gpiod_remove_lookup_table(lookup_table);
+			table_index++;
+			lookup_table = lookup_tables[table_index];
+			if (!lookup_table) {	/* no further table to try */
+				dev_err(board->gpib_dev, "Unable to obtain gpio descriptor for pin %d error %ld\n",
+					gpios_vector[j], PTR_ERR(desc));
+				goto alloc_gpios_fail;
+			}
+			dbg_printk(1, "Allocation failed, now using table_index %d\n", table_index);
+			lookup_table->dev_id = dev_name(board->gpib_dev);
+			gpiod_add_lookup_table(lookup_table);
+			goto try_again;
+		}
+		all_descriptors[j] = desc;
+	}
+	/* all pins obtained: remove the lookup table that is currently registered */
+	gpiod_remove_lookup_table(lookup_table);
+
+	return 0;
+
+alloc_gpios_fail:
+	release_gpios();	/* put back the descriptors obtained so far */
+	return -1;
+}
+
+static void bb_detach(struct gpib_board *board)
+{
+	struct bb_priv *priv = board->private_data;
+
+	dbg_printk(2, "Enter with data %p\n", board->private_data);
+	if (!board->private_data)
+		return;
+	/* free the four handshake/SRQ IRQs first, then give back the GPIO descriptors */
+	bb_free_irq(board, &priv->irq_DAV, NAME "_DAV");
+	bb_free_irq(board, &priv->irq_NRFD, NAME "_NRFD");
+	bb_free_irq(board, &priv->irq_NDAC, NAME "_NDAC");
+	bb_free_irq(board, &priv->irq_SRQ, NAME "_SRQ");
+
+	if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA */
+		gpiod_set_value(YOGA_ENABLE, 0);
+	}
+
+	release_gpios();
+
+	dbg_printk(2, "detached board: %d\n", board->minor);
+	dbg_printk(0, "NRFD: idle %d, seq %d,  NDAC: idle %d, seq %d  DAV: idle %d  seq: %d  all: %ld\n",
+		   priv->nrfd_idle, priv->nrfd_seq,
+		   priv->ndac_idle, priv->ndac_seq,
+		   priv->dav_idle, priv->dav_seq, priv->all_irqs);
+	/* priv was still read for the counters above, so it is freed last */
+	free_private(board);
+}
+
+static int bb_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct bb_priv *priv;
+	int retval = 0;
+	/* Set up state, pin map, GPIOs and IRQs for @board; returns 0, -ENOMEM or -1. */
+	dbg_printk(2, "%s\n", "Enter ...");
+
+	board->status = 0;
+
+	if (allocate_private(board))
+		return -ENOMEM;
+	priv = board->private_data;
+	priv->direction = -1;
+	priv->t1_delay = 2000;
+	priv->listener_state = listener_idle;
+	priv->talker_state = talker_idle;
+	/* per pin map: entries set to -1 are skipped by allocate_gpios(), others are remapped */
+	sn7516x = sn7516x_used;
+	if (strcmp(PINMAP_0, pin_map) == 0) {
+		if (!sn7516x) {
+			gpios_vector[&(PE) - &all_descriptors[0]] = -1;
+			gpios_vector[&(DC) - &all_descriptors[0]] = -1;
+			gpios_vector[&(TE) - &all_descriptors[0]] = -1;
+		}
+	} else if (strcmp(PINMAP_1, pin_map) == 0) {
+		if (!sn7516x) {
+			gpios_vector[&(PE) - &all_descriptors[0]] = -1;
+			gpios_vector[&(DC) - &all_descriptors[0]] = -1;
+			gpios_vector[&(TE) - &all_descriptors[0]] = -1;
+		}
+		gpios_vector[&(REN) - &all_descriptors[0]] = 0; /* 27 -> 0 REN on GPIB pin 0 */
+	} else if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA */
+		sn7516x = 0;
+		gpios_vector[&(D03) - &all_descriptors[0]] = YOGA_D03_pin_nr;
+		gpios_vector[&(D04) - &all_descriptors[0]] = YOGA_D04_pin_nr;
+		gpios_vector[&(D05) - &all_descriptors[0]] = YOGA_D05_pin_nr;
+		gpios_vector[&(D06) - &all_descriptors[0]] = YOGA_D06_pin_nr;
+		gpios_vector[&(PE)  - &all_descriptors[0]] = -1;
+		gpios_vector[&(DC)  - &all_descriptors[0]] = -1;
+	} else {
+		dev_err(board->gpib_dev, "Unrecognized pin map %s\n", pin_map);
+		goto bb_attach_fail;
+	}
+	dbg_printk(0, "Using pin map \"%s\" %s\n", pin_map, (sn7516x) ?
+		   " with SN7516x driver support" : "");
+
+	if (allocate_gpios(board))
+		goto bb_attach_fail;
+
+/*
+ * Configure SN7516X control lines.
+ * drive ATN, IFC and REN as outputs only when master
+ * i.e. system controller. In this mode can only be the CIC
+ * When not master then enable device mode ATN, IFC & REN as inputs
+ */
+	if (sn7516x) {
+		gpiod_direction_output(DC, 0);
+		gpiod_direction_output(TE, 1);
+		gpiod_direction_output(PE, 1);
+	}
+/* Set main control lines to a known state */
+	gpiod_direction_output(IFC, 1);
+	gpiod_direction_output(REN, 1);
+	gpiod_direction_output(_ATN, 1);
+
+	if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA: enable level shifters */
+		gpiod_direction_output(YOGA_ENABLE, 1);
+	}
+
+	spin_lock_init(&priv->rw_lock);
+
+	/* request DAV interrupt for read */
+	if (bb_get_irq(board, NAME "_DAV", DAV, &priv->irq_DAV, bb_DAV_interrupt, NULL,
+		       IRQF_TRIGGER_NONE))
+		goto bb_attach_fail_r;
+
+	/* request NRFD interrupt for write */
+	if (bb_get_irq(board, NAME "_NRFD", NRFD, &priv->irq_NRFD, bb_NRFD_interrupt, NULL,
+		       IRQF_TRIGGER_NONE))
+		goto bb_attach_fail_r;
+
+	/* request NDAC interrupt for write */
+	if (bb_get_irq(board, NAME "_NDAC", NDAC, &priv->irq_NDAC, bb_NDAC_interrupt, NULL,
+		       IRQF_TRIGGER_NONE))
+		goto bb_attach_fail_r;
+
+	/* request SRQ interrupt for Service Request */
+	if (bb_get_irq(board, NAME "_SRQ", SRQ, &priv->irq_SRQ, bb_SRQ_interrupt, NULL,
+		       IRQF_TRIGGER_NONE))
+		goto bb_attach_fail_r;
+
+	dbg_printk(0, "attached board %d\n", board->minor);
+	goto bb_attach_out;
+	/* NOTE(review): failure paths free neither IRQs nor private data here - confirm detach runs */
+bb_attach_fail_r:
+	release_gpios();
+bb_attach_fail:
+	retval = -1;
+bb_attach_out:
+	return retval;
+}
+
+static struct gpib_interface bb_interface = {	/* passed to gpib_register_driver() at init */
+	.name =	NAME,
+	.attach = bb_attach,
+	.detach = bb_detach,
+	.read = bb_read,
+	.write = bb_write,
+	.command = bb_command,
+	.take_control = bb_take_control,
+	.go_to_standby = bb_go_to_standby,
+	.request_system_control = bb_request_system_control,
+	.interface_clear = bb_interface_clear,
+	.remote_enable = bb_remote_enable,
+	.enable_eos = bb_enable_eos,
+	.disable_eos = bb_disable_eos,
+	.parallel_poll = bb_parallel_poll,
+	.parallel_poll_configure = bb_parallel_poll_configure,
+	.parallel_poll_response = bb_parallel_poll_response,
+	.line_status = bb_line_status,
+	.update_status = bb_update_status,
+	.primary_address = bb_primary_address,
+	.secondary_address = bb_secondary_address,
+	.serial_poll_response = bb_serial_poll_response,
+	.serial_poll_status = bb_serial_poll_status,
+	.t1_delay = bb_t1_delay,
+	.return_to_local = bb_return_to_local,	/* empty function in this driver */
+};
+
+static int __init bb_init_module(void)
+{
+	int result;
+
+	/* register bb_interface; a failure is logged and its code passed back unchanged */
+	result = gpib_register_driver(&bb_interface, THIS_MODULE);
+	if (result)
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+
+	return result;
+}
+
+static void __exit bb_exit_module(void)
+{
+	gpib_unregister_driver(&bb_interface);	/* undo the registration from bb_init_module() */
+}
+
+module_init(bb_init_module);	/* run at module load */
+module_exit(bb_exit_module);	/* run at module unload */
+
+/***************************************************************************
+ *									   *
+ * UTILITY Functions							   *
+ *									   *
+ ***************************************************************************/
+inline long usec_diff(struct timespec64 *a, struct timespec64 *b)
+{	/* a - b in microseconds; the nanosecond difference is integer-divided by 1000 */
+	return ((a->tv_sec - b->tv_sec) * 1000000 +
+		(a->tv_nsec - b->tv_nsec) / 1000);
+}
+
+static inline int check_for_eos(struct bb_priv *priv, u8 byte)
+{
+	/*
+	 * Returns 1 when @byte matches the stored end-of-string value, else 0.
+	 * NOTE(review): a non-zero eos_check switches the comparison off, which
+	 * reads inverted relative to the field name - confirm where it is set.
+	 */
+	if (priv->eos_check)
+		return 0;
+	if (priv->eos_check_8)		/* compare all eight bits */
+		return priv->eos == byte;
+	/* otherwise compare the low seven bits against eos_mask_7 */
+	return priv->eos_mask_7 == (byte & 0x7f);
+}
+
+static void set_data_lines_output(void)
+{
+	struct gpio_desc *const dio[] = { D01, D02, D03, D04, D05, D06, D07, D08 };
+	unsigned int n;
+
+	/*
+	 * Make D01..D08 outputs, in that order, each with initial value 1.
+	 */
+	for (n = 0; n < ARRAY_SIZE(dio); n++)
+		gpiod_direction_output(dio[n], 1);
+}
+
+static void set_data_lines(u8 byte)
+{
+	struct gpio_desc *const dio[] = { D01, D02, D03, D04, D05, D06, D07, D08 };
+	unsigned int bit;
+
+	/*
+	 * D01 takes bit 0 ... D08 bit 7; each line gets the inverse of its bit.
+	 */
+	for (bit = 0; bit < ARRAY_SIZE(dio); bit++)
+		gpiod_set_value(dio[bit], !(byte & (1 << bit)));
+}
+
+static u8 get_data_lines(void)
+{
+	struct gpio_desc *const dio[] = { D01, D02, D03, D04, D05, D06, D07, D08 };
+	unsigned int bit;
+	u8 raw = 0;
+
+	/*
+	 * Sample D01..D08 in order; D01 lands in bit 0 ... D08 in bit 7.
+	 * The bitwise complement of the sampled byte is returned.
+	 */
+	for (bit = 0; bit < ARRAY_SIZE(dio); bit++)
+		raw |= gpiod_get_value(dio[bit]) << bit;
+	return ~raw;
+}
+
+static void set_data_lines_input(void)
+{
+	struct gpio_desc *const dio[] = { D01, D02, D03, D04, D05, D06, D07, D08 };
+	unsigned int n;
+
+	/*
+	 * Switch D01..D08 to inputs, in that order.
+	 */
+	for (n = 0; n < ARRAY_SIZE(dio); n++)
+		gpiod_direction_input(dio[n]);
+}
+
+static inline void SET_DIR_WRITE(struct bb_priv *priv)
+{
+	if (priv->direction == DIR_WRITE)
+		return;	/* lines are already configured for writing */
+	/* NRFD/NDAC become inputs; the data lines, DAV and EOI become outputs at value 1 */
+	gpiod_direction_input(NRFD);
+	gpiod_direction_input(NDAC);
+	set_data_lines_output();
+	gpiod_direction_output(DAV, 1);
+	gpiod_direction_output(EOI, 1);
+
+	if (sn7516x) {
+		gpiod_set_value(PE, 1);	 /* set data lines to transmit on sn75160b */
+		gpiod_set_value(TE, 1);	 /* set NDAC and NRFD to receive and DAV to transmit */
+	}
+
+	priv->direction = DIR_WRITE;	/* remembered so repeated calls return early */
+}
+
+static inline void SET_DIR_READ(struct bb_priv *priv)
+{
+	if (priv->direction == DIR_READ)
+		return;	/* lines are already configured for reading */
+	/* DAV, EOI and the data lines become inputs before NRFD/NDAC are driven */
+	gpiod_direction_input(DAV);
+	gpiod_direction_input(EOI);
+
+	set_data_lines_input();
+
+	if (sn7516x) {
+		gpiod_set_value(PE, 0);	 /* set data lines to receive on sn75160b */
+		gpiod_set_value(TE, 0);	 /* set NDAC and NRFD to transmit and DAV to receive */
+	}
+
+	gpiod_direction_output(NRFD, 0); /* hold off the talker */
+	gpiod_direction_output(NDAC, 0); /* data not accepted */
+
+	priv->direction = DIR_READ;	/* remembered so repeated calls return early */
+}
diff --git a/drivers/gpib/hp_82335/Makefile b/drivers/gpib/hp_82335/Makefile
new file mode 100644
index 000000000000..305ce44ee48a
--- /dev/null
+++ b/drivers/gpib/hp_82335/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_GPIB_HP82335) += hp82335.o
+
+
diff --git a/drivers/gpib/hp_82335/hp82335.c b/drivers/gpib/hp_82335/hp82335.c
new file mode 100644
index 000000000000..d0e47ef77c87
--- /dev/null
+++ b/drivers/gpib/hp_82335/hp82335.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * copyright            : (C) 2002 by Frank Mori Hess                      *
+ ***************************************************************************/
+
+/*
+ * should enable ATN interrupts (and update board->status on occurrence),
+ * implement recovery from bus errors (if necessary)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "hp82335.h"
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/init.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for HP 82335 interface cards");
+
+static int hp82335_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static void hp82335_detach(struct gpib_board *board);
+static irqreturn_t hp82335_interrupt(int irq, void *arg);
+
+// wrappers for interface functions
+static int hp82335_read(struct gpib_board *board, u8 *buffer, size_t length,
+			int *end, size_t *bytes_read)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_read() with the embedded tms9914 state */
+	return tms9914_read(board, &hp_priv->tms9914_priv, buffer, length, end, bytes_read);
+}
+
+static int hp82335_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+			 size_t *bytes_written)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_write() with the embedded tms9914 state */
+	return tms9914_write(board, &hp_priv->tms9914_priv, buffer, length, send_eoi, bytes_written);
+}
+
+static int hp82335_command(struct gpib_board *board, u8 *buffer, size_t length,
+			   size_t *bytes_written)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_command() with the embedded tms9914 state */
+	return tms9914_command(board, &hp_priv->tms9914_priv, buffer, length, bytes_written);
+}
+
+static int hp82335_take_control(struct gpib_board *board, int synchronous)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_take_control() */
+	return tms9914_take_control(board, &hp_priv->tms9914_priv, synchronous);
+}
+
+static int hp82335_go_to_standby(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_go_to_standby() */
+	return tms9914_go_to_standby(board, &hp_priv->tms9914_priv);
+}
+
+static int hp82335_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_request_system_control() */
+	return tms9914_request_system_control(board, &hp_priv->tms9914_priv, request_control);
+}
+
+static void hp82335_interface_clear(struct gpib_board *board, int assert)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_interface_clear() */
+	tms9914_interface_clear(board, &hp_priv->tms9914_priv, assert);
+}
+
+static void hp82335_remote_enable(struct gpib_board *board, int enable)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_remote_enable() */
+	tms9914_remote_enable(board, &hp_priv->tms9914_priv, enable);
+}
+
+static int hp82335_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_enable_eos() */
+	return tms9914_enable_eos(board, &hp_priv->tms9914_priv, eos_byte, compare_8_bits);
+}
+
+static void hp82335_disable_eos(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_disable_eos() */
+	tms9914_disable_eos(board, &hp_priv->tms9914_priv);
+}
+
+static unsigned int hp82335_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_update_status() */
+	return tms9914_update_status(board, &hp_priv->tms9914_priv, clear_mask);
+}
+
+static int hp82335_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_primary_address() */
+	return tms9914_primary_address(board, &hp_priv->tms9914_priv, address);
+}
+
+static int hp82335_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_secondary_address() */
+	return tms9914_secondary_address(board, &hp_priv->tms9914_priv, address, enable);
+}
+
+static int hp82335_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_parallel_poll() */
+	return tms9914_parallel_poll(board, &hp_priv->tms9914_priv, result);
+}
+
+static void hp82335_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_parallel_poll_configure() */
+	tms9914_parallel_poll_configure(board, &hp_priv->tms9914_priv, config);
+}
+
+static void hp82335_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_parallel_poll_response() */
+	tms9914_parallel_poll_response(board, &hp_priv->tms9914_priv, ist);
+}
+
+static void hp82335_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_serial_poll_response() */
+	tms9914_serial_poll_response(board, &hp_priv->tms9914_priv, status);
+}
+
+static u8 hp82335_serial_poll_status(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_serial_poll_status() */
+	return tms9914_serial_poll_status(board, &hp_priv->tms9914_priv);
+}
+
+static int hp82335_line_status(const struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_line_status() */
+	return tms9914_line_status(board, &hp_priv->tms9914_priv);
+}
+
+static int hp82335_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_t1_delay() */
+	return tms9914_t1_delay(board, &hp_priv->tms9914_priv, nano_sec);
+}
+
+static void hp82335_return_to_local(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_return_to_local() */
+	tms9914_return_to_local(board, &hp_priv->tms9914_priv);
+}
+
+static struct gpib_interface hp82335_interface = {	// passed to gpib_register_driver() at init
+	.name = "hp82335",
+	.attach = hp82335_attach,
+	.detach = hp82335_detach,
+	.read = hp82335_read,
+	.write = hp82335_write,
+	.command = hp82335_command,
+	.request_system_control = hp82335_request_system_control,
+	.take_control = hp82335_take_control,
+	.go_to_standby = hp82335_go_to_standby,
+	.interface_clear = hp82335_interface_clear,
+	.remote_enable = hp82335_remote_enable,
+	.enable_eos = hp82335_enable_eos,
+	.disable_eos = hp82335_disable_eos,
+	.parallel_poll = hp82335_parallel_poll,
+	.parallel_poll_configure = hp82335_parallel_poll_configure,
+	.parallel_poll_response = hp82335_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX: no handler is provided for this callback
+	.line_status = hp82335_line_status,
+	.update_status = hp82335_update_status,
+	.primary_address = hp82335_primary_address,
+	.secondary_address = hp82335_secondary_address,
+	.serial_poll_response = hp82335_serial_poll_response,
+	.serial_poll_status = hp82335_serial_poll_status,
+	.t1_delay = hp82335_t1_delay,
+	.return_to_local = hp82335_return_to_local,
+};
+
+static int hp82335_allocate_private(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = kzalloc(sizeof(*hp_priv), GFP_KERNEL);	/* zero-filled */
+
+	board->private_data = hp_priv;	/* NULL when the allocation failed */
+	return hp_priv ? 0 : -1;	/* 0 on success, -1 on failure */
+}
+
+static void hp82335_free_private(struct gpib_board *board)
+{
+	kfree(board->private_data);	/* release the state from hp82335_allocate_private() */
+	board->private_data = NULL;	/* hp82335_detach() tests this pointer on entry */
+}
+
+static inline unsigned int tms9914_to_hp82335_offset(unsigned int register_num)
+{
+	return register_num + 0x1ff8;	/* tms9914 register N is addressed at offset 0x1ff8 + N */
+}
+
+static u8 hp82335_read_byte(struct tms9914_priv *priv, unsigned int register_num)
+{	/* installed as tms_priv->read_byte in hp82335_attach(); translates the register number */
+	return tms9914_iomem_read_byte(priv, tms9914_to_hp82335_offset(register_num));
+}
+
+static void hp82335_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
+{	/* installed as tms_priv->write_byte in hp82335_attach(); translates the register number */
+	tms9914_iomem_write_byte(priv, data, tms9914_to_hp82335_offset(register_num));
+}
+
+static void hp82335_clear_interrupt(struct hp82335_priv *hp_priv)
+{
+	/* a zero byte is written to HPREG_INTR_CLEAR inside the mapped window; */
+	/* called from hp82335_attach() and from the interrupt handler */
+	writeb(0, hp_priv->tms9914_priv.mmiobase + HPREG_INTR_CLEAR);
+}
+
+static int hp82335_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct hp82335_priv *hp_priv;
+	struct tms9914_priv *tms_priv;
+	int retval;
+	const unsigned long upper_iomem_base = config->ibbase + hp82335_rom_size;
+	/* Claim memory window and IRQ from @config, then bring the chip online; 0 or -errno. */
+	board->status = 0;
+	/* NOTE(review): later error returns free nothing here - confirm detach runs after failure */
+	if (hp82335_allocate_private(board))
+		return -ENOMEM;
+	hp_priv = board->private_data;
+	tms_priv = &hp_priv->tms9914_priv;
+	tms_priv->read_byte = hp82335_read_byte;
+	tms_priv->write_byte = hp82335_write_byte;
+	tms_priv->offset = 1;
+	/* only these base addresses are accepted */
+	switch (config->ibbase) {
+	case 0xc4000:
+	case 0xc8000:
+	case 0xcc000:
+	case 0xd0000:
+	case 0xd4000:
+	case 0xd8000:
+	case 0xdc000:
+	case 0xe0000:
+	case 0xe4000:
+	case 0xe8000:
+	case 0xec000:
+	case 0xf0000:
+	case 0xf4000:
+	case 0xf8000:
+	case 0xfc000:
+		break;
+	default:
+		dev_err(board->gpib_dev, "invalid base io address 0x%x\n", config->ibbase);
+		return -EINVAL;
+	}
+	if (!request_mem_region(upper_iomem_base, hp82335_upper_iomem_size, "hp82335")) {
+		dev_err(board->gpib_dev, "failed to allocate io memory region 0x%lx-0x%lx\n",
+			upper_iomem_base, upper_iomem_base + hp82335_upper_iomem_size - 1);
+		return -EBUSY;
+	}
+	hp_priv->raw_iobase = upper_iomem_base;
+	tms_priv->mmiobase = ioremap(upper_iomem_base, hp82335_upper_iomem_size);
+	if (!tms_priv->mmiobase) {	/* never touch registers or take the IRQ without a mapping */
+		dev_err(board->gpib_dev, "failed to ioremap io memory region\n");
+		return -ENOMEM;
+	}
+
+	retval = request_irq(config->ibirq, hp82335_interrupt, 0, DRV_NAME, board);
+	if (retval) {
+		dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
+		return retval;
+	}
+	hp_priv->irq = config->ibirq;
+	/* reset the chip, clear any pending interrupt, enable interrupts, then go online */
+	tms9914_board_reset(tms_priv);
+	hp82335_clear_interrupt(hp_priv);
+	writeb(INTR_ENABLE, tms_priv->mmiobase + HPREG_CCR);
+	tms9914_online(board, tms_priv);
+	return 0;
+}
+
+static void hp82335_detach(struct gpib_board *board)
+{
+	struct hp82335_priv *hp_priv = board->private_data;
+
+	if (!hp_priv)
+		goto free_priv;
+	/* release only what is recorded as set up: the IRQ, the mapping, then the region */
+	if (hp_priv->irq)
+		free_irq(hp_priv->irq, board);
+	if (hp_priv->tms9914_priv.mmiobase) {
+		writeb(0, hp_priv->tms9914_priv.mmiobase + HPREG_CCR);
+		tms9914_board_reset(&hp_priv->tms9914_priv);
+		iounmap(hp_priv->tms9914_priv.mmiobase);
+	}
+	if (hp_priv->raw_iobase)
+		release_mem_region(hp_priv->raw_iobase, hp82335_upper_iomem_size);
+free_priv:
+	hp82335_free_private(board);
+}
+
+static int __init hp82335_init_module(void)
+{
+	int result;
+
+	/* register hp82335_interface; a failure is logged and its code passed back unchanged */
+	result = gpib_register_driver(&hp82335_interface, THIS_MODULE);
+	if (result)
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+
+	return result;
+}
+
+static void __exit hp82335_exit_module(void)
+{
+	gpib_unregister_driver(&hp82335_interface);	/* undo hp82335_init_module() */
+}
+
+module_init(hp82335_init_module);	/* run at module load */
+module_exit(hp82335_exit_module);	/* run at module unload */
+
+/*
+ * GPIB interrupt service routines
+ */
+
+static irqreturn_t hp82335_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;	/* dev_id given to request_irq() in hp82335_attach() */
+	struct hp82335_priv *hp_priv = board->private_data;
+	int isr0, isr1;
+	irqreturn_t handled;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);	/* covers the reads, the clear and the call */
+	isr0 = read_byte(&hp_priv->tms9914_priv, ISR0);
+	isr1 = read_byte(&hp_priv->tms9914_priv, ISR1);
+	hp82335_clear_interrupt(hp_priv);
+	handled = tms9914_interrupt_have_status(board, &hp_priv->tms9914_priv, isr0, isr1);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return handled;
+}
+
diff --git a/drivers/gpib/hp_82335/hp82335.h b/drivers/gpib/hp_82335/hp82335.h
new file mode 100644
index 000000000000..0c252a712ec9
--- /dev/null
+++ b/drivers/gpib/hp_82335/hp82335.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess                   *
+ ***************************************************************************/
+
+#ifndef _HP82335_H
+#define _HP82335_H
+
+#include "tms9914.h"
+#include "gpibP.h"
+
+// per-board state stored in gpib_board.private_data by this driver
+struct hp82335_priv  {
+	struct tms9914_priv tms9914_priv;	// state handed to the tms9914_* helpers
+	unsigned int irq;			// IRQ number once requested; tested before free_irq()
+	unsigned long raw_iobase;		// start of the requested memory region, once claimed
+};
+
+// region sizes: the io memory window that is claimed starts hp82335_rom_size above the base
+static const int hp82335_rom_size = 0x2000;
+static const int hp82335_upper_iomem_size = 0x2000;
+
+// hp82335 register offsets (added to the mapped base address), registers that are read
+enum hp_read_regs {
+	HPREG_CSR = 0x17f8,	// presumably decoded with enum csr_bits - TODO confirm
+	HPREG_STATUS = 0x1ffc,
+};
+
+enum hp_write_regs {	// register offsets that are written
+	HPREG_INTR_CLEAR = 0x17f7,	// the driver writes 0 here to clear the interrupt
+	HPREG_CCR = HPREG_CSR,	// same offset as HPREG_CSR; written with enum ccr_bits values
+};
+
+enum ccr_bits {
+	DMA_ENABLE = (1 << 0),   /* DMA enable                  */
+	DMA_CHAN_SELECT = (1 << 1),   /* DMA channel select  0=3,1=2 */
+	INTR_ENABLE = (1 << 2),   /* interrupt enable            */
+	SYS_DISABLE = (1 << 3),   /* system controller disable   */
+};
+
+enum csr_bits {	/* presumably the bit layout of HPREG_CSR - TODO confirm */
+	SWITCH6 = (1 << 0),   /* switch 6 position           */
+	SWITCH5 = (1 << 1),   /* switch 5 position           */
+	SYS_CONTROLLER = (1 << 2),   /* system controller bit       */
+	DMA_ENABLE_STATUS = (1 << 4),   /* DMA enabled                 */
+	DMA_CHAN_STATUS = (1 << 5),   /* DMA channel   0=3,1=2       */
+	INTR_ENABLE_STATUS = (1 << 6),   /* Interrupt enable            */
+	INTR_PENDING = (1 << 7),   /* Interrupt Pending           */
+};
+
+#endif	// _HP82335_H
diff --git a/drivers/gpib/hp_82341/Makefile b/drivers/gpib/hp_82341/Makefile
new file mode 100644
index 000000000000..21367310a17e
--- /dev/null
+++ b/drivers/gpib/hp_82341/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_GPIB_HP82341) += hp_82341.o
diff --git a/drivers/gpib/hp_82341/hp_82341.c b/drivers/gpib/hp_82341/hp_82341.c
new file mode 100644
index 000000000000..1a2ad0560e14
--- /dev/null
+++ b/drivers/gpib/hp_82341/hp_82341.c
@@ -0,0 +1,907 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *     Driver for hp 82341a/b/c/d boards.                                  *
+ * Might be worth merging with Agilent 82350b driver.                      *
+ *   copyright            : (C) 2002, 2005 by Frank Mori Hess              *
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "hp_82341.h"
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/isapnp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for hp 82341a/b/c/d boards");
+
+static unsigned short read_and_clear_event_status(struct gpib_board *board);
+static void set_transfer_counter(struct hp_82341_priv *hp_priv, int count);
+static int read_transfer_counter(struct hp_82341_priv *hp_priv);
+static int hp_82341_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+			  size_t *bytes_written);
+static irqreturn_t hp_82341_interrupt(int irq, void *arg);
+
+static int hp_82341_accel_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+			       size_t *bytes_read)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+	int retval = 0;
+	unsigned short event_status;
+	int i;
+	int num_fifo_bytes;
+	// hardware doesn't support checking for end-of-string character when using fifo
+	if (tms_priv->eos_flags & REOS)
+		return tms9914_read(board, tms_priv, buffer, length, end, bytes_read);
+	// fifo path: start with the device-clear flag and the event status cleared
+	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
+
+	read_and_clear_event_status(board);
+	*end = 0;
+	*bytes_read = 0;
+	if (length == 0)
+		return 0;
+	// disable fifo for the moment
+	outb(DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
+	/*
+	 * Handle corner case of board not in holdoff and one byte has slipped in already.
+	 * Also, board sometimes has problems (spurious 1 byte reads) when read fifo is
+	 * started up with board in TACS under certain data holdoff conditions.
+	 * Doing a 1 byte tms9914-style read avoids these problems.
+	 */
+	if (/*tms_priv->holdoff_active == 0 && */length > 1) {
+		size_t num_bytes;
+
+		retval = tms9914_read(board, tms_priv, buffer, 1, end, &num_bytes);
+		*bytes_read += num_bytes;
+		if (retval < 0)
+			dev_err(board->gpib_dev, "tms9914_read failed retval=%i\n", retval);
+		if (retval < 0 || *end)
+			return retval;
+		++buffer;
+		--length;
+	}
+	tms9914_set_holdoff_mode(tms_priv, TMS9914_HOLDOFF_EOI);
+	tms9914_release_holdoff(tms_priv);
+	outb(0x00, hp_priv->iobase[3] + BUFFER_FLUSH_REG);
+	i = 0;
+	num_fifo_bytes = length - 1;	// the final byte is left for the tms9914_read() at the end
+	while (i < num_fifo_bytes && *end == 0)	{
+		int block_size;
+		int j;
+		int count;
+
+		block_size = min(num_fifo_bytes - i, hp_82341_fifo_size);
+		set_transfer_counter(hp_priv, block_size);
+		outb(ENABLE_TI_BUFFER_BIT | DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] +
+		     BUFFER_CONTROL_REG);
+		if (inb(hp_priv->iobase[0] + STREAM_STATUS_REG) & HALTED_STATUS_BIT)
+			outb(RESTART_STREAM_BIT, hp_priv->iobase[0] + STREAM_STATUS_REG);
+
+		clear_bit(READ_READY_BN, &tms_priv->state);
+		// sleep until a terminal-count or buffer-end event, a device clear, or a timeout
+		retval = wait_event_interruptible(board->wait,
+						  ((event_status =
+						    read_and_clear_event_status(board)) &
+						   (TERMINAL_COUNT_EVENT_BIT |
+						    BUFFER_END_EVENT_BIT)) ||
+						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
+						  test_bit(TIMO_NUM, &board->status));
+		if (retval)  {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		// have to disable buffer before we can read from buffer port
+		outb(DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
+		count = block_size - read_transfer_counter(hp_priv);
+		j = 0;
+		while (j < count && i < num_fifo_bytes) {
+			unsigned short data_word = inw(hp_priv->iobase[3] + BUFFER_PORT_LOW_REG);
+			// each 16-bit word holds up to two bytes: low byte first, then high byte
+			buffer[i++] = data_word & 0xff;
+			++j;
+			if (j < count && i < num_fifo_bytes) {
+				buffer[i++] = (data_word >> 8) & 0xff;
+				++j;
+			}
+		}
+		if (event_status & BUFFER_END_EVENT_BIT) {
+			clear_bit(RECEIVED_END_BN, &tms_priv->state);
+
+			*end = 1;
+			tms_priv->holdoff_active = 1;
+		}
+		if (test_bit(TIMO_NUM, &board->status))	{
+			retval = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+	}
+	*bytes_read += i;
+	buffer += i;
+	length -= i;
+	if (retval < 0)
+		return retval;
+	// read last byte if we haven't received an END yet
+	if (*end == 0) {
+		size_t num_bytes;
+		// try to make sure we holdoff after last byte read
+		retval = tms9914_read(board, tms_priv, buffer, length, end, &num_bytes);
+		*bytes_read += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+static int restart_write_fifo(struct gpib_board *board, struct hp_82341_priv *hp_priv)
+{
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+	int line_status;
+
+	/* nothing to do unless the stream status register has the halted bit set */
+	if (!(inb(hp_priv->iobase[0] + STREAM_STATUS_REG) & HALTED_STATUS_BIT))
+		return 0;
+
+	/* restart doesn't work if data holdoff is in effect: poll until BUS_NRFD is clear */
+	for (;;) {
+		line_status = tms9914_line_status(board, tms_priv);
+		if (!(line_status & BUS_NRFD)) {
+			outb(RESTART_STREAM_BIT, hp_priv->iobase[0] + STREAM_STATUS_REG);
+			return 0;
+		}
+		if (test_bit(DEV_CLEAR_BN, &tms_priv->state))
+			return -EINTR;
+		if (test_bit(TIMO_NUM, &board->status))
+			return -ETIMEDOUT;
+		if (msleep_interruptible(1))
+			return -EINTR;
+	}
+}
+
+static int hp_82341_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
+				int send_eoi, size_t *bytes_written)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+	int i, j;
+	unsigned short event_status;
+	int retval = 0;
+	int fifo_xfer_len = length;
+
+	*bytes_written = 0;
+	if (send_eoi && length > 0)	/* the last byte is sent separately, with EOI */
+		--fifo_xfer_len;
+	/* with length 0 there is no byte to carry EOI, so the fifo length stays 0 */
+	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
+	/* start with the event status cleared and the buffer disabled and flushed */
+	read_and_clear_event_status(board);
+	outb(0, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
+	outb(0x00, hp_priv->iobase[3] + BUFFER_FLUSH_REG);
+	for (i = 0; i < fifo_xfer_len;) {
+		int block_size;
+
+		block_size = min(fifo_xfer_len - i, hp_82341_fifo_size);
+		set_transfer_counter(hp_priv, block_size);
+		// load data into board's fifo
+		for (j = 0; j < block_size;) {
+			unsigned short data_word = buffer[i++];
+			++j;
+			if (j < block_size) {
+				data_word |= buffer[i++] << 8;
+				++j;
+			}
+			outw(data_word, hp_priv->iobase[3] + BUFFER_PORT_LOW_REG);
+		}
+		clear_bit(WRITE_READY_BN, &tms_priv->state);
+		outb(ENABLE_TI_BUFFER_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
+		retval = restart_write_fifo(board, hp_priv);
+		if (retval < 0)	{
+			dev_err(board->gpib_dev, "failed to restart write stream\n");
+			break;
+		}
+		retval = wait_event_interruptible(board->wait,
+						  ((event_status =
+						    read_and_clear_event_status(board)) &
+						   TERMINAL_COUNT_EVENT_BIT) ||
+						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
+						  test_bit(TIMO_NUM, &board->status));
+		outb(0, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
+		*bytes_written += block_size - read_transfer_counter(hp_priv);
+		if (retval) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (test_bit(TIMO_NUM, &board->status))	{
+			retval = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+	}
+	if (retval)
+		return retval;
+	if (send_eoi && length > 0) {	/* never read buffer[-1] for an empty request */
+		size_t num_bytes;
+
+		retval = hp_82341_write(board, buffer + fifo_xfer_len, 1, 1, &num_bytes);
+		*bytes_written += num_bytes;
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
+}
+
+static int hp_82341_attach(struct gpib_board *board, const struct gpib_board_config *config);
+
+static void hp_82341_detach(struct gpib_board *board);
+
+// wrappers for interface functions
+static int hp_82341_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+			 size_t *bytes_read)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	/* thin wrapper: forwards to tms9914_read() with the embedded tms9914 state */
+	return tms9914_read(board, &hp_priv->tms9914_priv, buffer, length, end, bytes_read);
+}
+
+/*
+ * Write wrapper: forwards the request unchanged to tms9914_write(), supplying
+ * the tms9914 state embedded in the board's private data.
+ */
+static int hp_82341_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+			  size_t *bytes_written)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_write(board, tms_priv, buffer, length, send_eoi, bytes_written);
+}
+
+/* Forwards a command-byte buffer to tms9914_command() for this board. */
+static int hp_82341_command(struct gpib_board *board, u8 *buffer, size_t length,
+			    size_t *bytes_written)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_command(board, tms_priv, buffer, length, bytes_written);
+}
+
+/* Forwards to tms9914_take_control() with this board's embedded tms9914 state. */
+static int hp_82341_take_control(struct gpib_board *board, int synchronous)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_take_control(board, tms_priv, synchronous);
+}
+
+/* Forwards to tms9914_go_to_standby() with this board's embedded tms9914 state. */
+static int hp_82341_go_to_standby(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_go_to_standby(board, tms_priv);
+}
+
+/*
+ * Request or release system control.
+ *
+ * Updates SYSTEM_CONTROLLER_BIT in the cached mode control bits, writes the
+ * cached value to MODE_CONTROL_STATUS_REG, then lets the tms9914 library
+ * handle the request.  Returns whatever tms9914_request_system_control()
+ * returns.
+ */
+static int hp_82341_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	if (!request_control)
+		hp_priv->mode_control_bits &= ~SYSTEM_CONTROLLER_BIT;
+	else
+		hp_priv->mode_control_bits |= SYSTEM_CONTROLLER_BIT;
+	outb(hp_priv->mode_control_bits, hp_priv->iobase[0] + MODE_CONTROL_STATUS_REG);
+
+	return tms9914_request_system_control(board, tms_priv, request_control);
+}
+
+/* Forwards to tms9914_interface_clear() with this board's embedded tms9914 state. */
+static void hp_82341_interface_clear(struct gpib_board *board, int assert)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_interface_clear(board, tms_priv, assert);
+}
+
+/* Forwards to tms9914_remote_enable() with this board's embedded tms9914 state. */
+static void hp_82341_remote_enable(struct gpib_board *board, int enable)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_remote_enable(board, tms_priv, enable);
+}
+
+/* Forwards to tms9914_enable_eos() with this board's embedded tms9914 state. */
+static int hp_82341_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_enable_eos(board, tms_priv, eos_byte, compare_8_bits);
+}
+
+/* Forwards to tms9914_disable_eos() with this board's embedded tms9914 state. */
+static void hp_82341_disable_eos(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_disable_eos(board, tms_priv);
+}
+
+/* Forwards to tms9914_update_status() and returns its result unchanged. */
+static unsigned int hp_82341_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_update_status(board, tms_priv, clear_mask);
+}
+
+/* Forwards to tms9914_primary_address() with this board's embedded tms9914 state. */
+static int hp_82341_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_primary_address(board, tms_priv, address);
+}
+
+/* Forwards to tms9914_secondary_address() with this board's embedded tms9914 state. */
+static int hp_82341_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_secondary_address(board, tms_priv, address, enable);
+}
+
+/* Forwards to tms9914_parallel_poll(); the poll result is stored through @result. */
+static int hp_82341_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_parallel_poll(board, tms_priv, result);
+}
+
+/* Forwards to tms9914_parallel_poll_configure() with this board's tms9914 state. */
+static void hp_82341_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_parallel_poll_configure(board, tms_priv, config);
+}
+
+/* Forwards to tms9914_parallel_poll_response() with this board's tms9914 state. */
+static void hp_82341_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_parallel_poll_response(board, tms_priv, ist);
+}
+
+/* Forwards to tms9914_serial_poll_response() with this board's tms9914 state. */
+static void hp_82341_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_serial_poll_response(board, tms_priv, status);
+}
+
+/* Forwards to tms9914_serial_poll_status() and returns its result unchanged. */
+static u8 hp_82341_serial_poll_status(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_serial_poll_status(board, tms_priv);
+}
+
+/* Forwards to tms9914_line_status() and returns its result unchanged. */
+static int hp_82341_line_status(const struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_line_status(board, tms_priv);
+}
+
+/* Forwards to tms9914_t1_delay() with this board's embedded tms9914 state. */
+static int hp_82341_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	return tms9914_t1_delay(board, tms_priv, nano_sec);
+}
+
+/* Forwards to tms9914_return_to_local() with this board's embedded tms9914 state. */
+static void hp_82341_return_to_local(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+	tms9914_return_to_local(board, tms_priv);
+}
+
+/*
+ * Interface table registered under the name "hp_82341_unaccel".
+ * Every operation, including read and write, goes through the plain
+ * tms9914 wrappers.
+ */
+static struct gpib_interface hp_82341_unaccel_interface = {
+	.name = "hp_82341_unaccel",
+
+	/* board lifetime */
+	.attach = hp_82341_attach,
+	.detach = hp_82341_detach,
+
+	/* data and command transfer */
+	.read = hp_82341_read,
+	.write = hp_82341_write,
+	.command = hp_82341_command,
+	.enable_eos = hp_82341_enable_eos,
+	.disable_eos = hp_82341_disable_eos,
+
+	/* bus control */
+	.request_system_control = hp_82341_request_system_control,
+	.take_control = hp_82341_take_control,
+	.go_to_standby = hp_82341_go_to_standby,
+	.interface_clear = hp_82341_interface_clear,
+	.remote_enable = hp_82341_remote_enable,
+	.return_to_local = hp_82341_return_to_local,
+
+	/* polling */
+	.parallel_poll = hp_82341_parallel_poll,
+	.parallel_poll_configure = hp_82341_parallel_poll_configure,
+	.parallel_poll_response = hp_82341_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.serial_poll_response = hp_82341_serial_poll_response,
+	.serial_poll_status = hp_82341_serial_poll_status,
+
+	/* status, addressing and timing */
+	.line_status = hp_82341_line_status,
+	.update_status = hp_82341_update_status,
+	.primary_address = hp_82341_primary_address,
+	.secondary_address = hp_82341_secondary_address,
+	.t1_delay = hp_82341_t1_delay,
+};
+
+/*
+ * Interface table registered under the name "hp_82341".
+ * Identical to the "hp_82341_unaccel" table except that read and write use
+ * the hp_82341_accel_* routines.  serial_poll_status is filled in here as
+ * well so that both tables expose the same set of operations.
+ */
+static struct gpib_interface hp_82341_interface = {
+	.name = "hp_82341",
+	.attach = hp_82341_attach,
+	.detach = hp_82341_detach,
+	.read = hp_82341_accel_read,
+	.write = hp_82341_accel_write,
+	.command = hp_82341_command,
+	.request_system_control = hp_82341_request_system_control,
+	.take_control = hp_82341_take_control,
+	.go_to_standby = hp_82341_go_to_standby,
+	.interface_clear = hp_82341_interface_clear,
+	.remote_enable = hp_82341_remote_enable,
+	.enable_eos = hp_82341_enable_eos,
+	.disable_eos = hp_82341_disable_eos,
+	.parallel_poll = hp_82341_parallel_poll,
+	.parallel_poll_configure = hp_82341_parallel_poll_configure,
+	.parallel_poll_response = hp_82341_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = hp_82341_line_status,
+	.update_status = hp_82341_update_status,
+	.primary_address = hp_82341_primary_address,
+	.secondary_address = hp_82341_secondary_address,
+	.serial_poll_response = hp_82341_serial_poll_response,
+	.serial_poll_status = hp_82341_serial_poll_status,
+	.t1_delay = hp_82341_t1_delay,
+	.return_to_local = hp_82341_return_to_local,
+};
+
+/*
+ * Allocate the zero-initialised per-board private structure and store it in
+ * board->private_data.  Returns 0 on success, -ENOMEM if the allocation
+ * failed (board->private_data is then NULL).
+ */
+static int hp_82341_allocate_private(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv;
+
+	hp_priv = kzalloc(sizeof(*hp_priv), GFP_KERNEL);
+	board->private_data = hp_priv;
+
+	return hp_priv ? 0 : -ENOMEM;
+}
+
+/* Release the private structure and clear the board's pointer to it. */
+static void hp_82341_free_private(struct gpib_board *board)
+{
+	void *stale = board->private_data;
+
+	board->private_data = NULL;
+	kfree(stale);	/* kfree(NULL) is a no-op, so no check is needed */
+}
+
+/* tms9914 register read callback: port-I/O read at iobase + register_num. */
+static u8 hp_82341_read_byte(struct tms9914_priv *priv, unsigned int register_num)
+{
+	const unsigned long port = priv->iobase + register_num;
+
+	return inb(port);
+}
+
+/* tms9914 register write callback: port-I/O write at iobase + register_num. */
+static void hp_82341_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
+{
+	const unsigned long port = priv->iobase + register_num;
+
+	outb(data, port);
+}
+
+/*
+ * Locate, attach and activate the ISA PnP device HWP1411.
+ *
+ * The device found (possibly NULL) is always stored in *dev.  On success the
+ * device is left attached and 0 is returned.  On failure after the attach
+ * step the device is detached again before returning.
+ *
+ * Returns -ENODEV if no device (or no card) was found, -EBUSY if the device
+ * could not be attached, -EAGAIN if activation failed and -ENOMEM if the
+ * device has no valid port 0 or irq 0.
+ */
+static int hp_82341_find_isapnp_board(struct pnp_dev **dev)
+{
+	struct pnp_dev *pnp;
+
+	pnp = pnp_find_dev(NULL, ISAPNP_VENDOR('H', 'W', 'P'),
+			   ISAPNP_FUNCTION(0x1411), NULL);
+	*dev = pnp;
+	if (!pnp || !pnp->card) {
+		pr_err("failed to find isapnp board\n");
+		return -ENODEV;
+	}
+
+	if (pnp_device_attach(pnp) < 0) {
+		pr_err("board already active, skipping\n");
+		return -EBUSY;
+	}
+
+	if (pnp_activate_dev(pnp) < 0) {
+		pnp_device_detach(pnp);
+		pr_err("failed to activate(), aborting\n");
+		return -EAGAIN;
+	}
+
+	if (!(pnp_port_valid(pnp, 0) && pnp_irq_valid(pnp, 0))) {
+		pnp_device_detach(pnp);
+		pr_err("invalid port or irq, aborting\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Report whether the Xilinx "ready" bit is set.
+ *
+ * The 82341C exposes the bit in CONFIG_CONTROL_STATUS_REG, the 82341D in the
+ * isapnp PIO data register.  Returns 1 if set, 0 if clear or if hw_version is
+ * not one of those two (an error is logged in that case).
+ */
+static int xilinx_ready(struct hp_82341_priv *hp_priv)
+{
+	switch (hp_priv->hw_version) {
+	case HW_VERSION_82341C:
+		return !!(inb(hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG) &
+			  XILINX_READY_BIT);
+	case HW_VERSION_82341D:
+		return !!(isapnp_read_byte(PIO_DATA_REG) & HP_82341D_XILINX_READY_BIT);
+	default:
+		pr_err("bug! unknown hw_version\n");
+		return 0;
+	}
+}
+
+/*
+ * Report whether the Xilinx "done" bit is set.
+ *
+ * The 82341C exposes the bit as DONE_PGL_BIT in CONFIG_CONTROL_STATUS_REG,
+ * the 82341D in the isapnp PIO data register.  Returns 1 if set, 0 if clear
+ * or if hw_version is not one of those two (an error is logged in that case).
+ */
+static int xilinx_done(struct hp_82341_priv *hp_priv)
+{
+	switch (hp_priv->hw_version) {
+	case HW_VERSION_82341C:
+		return !!(inb(hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG) &
+			  DONE_PGL_BIT);
+	case HW_VERSION_82341D:
+		return !!(isapnp_read_byte(PIO_DATA_REG) & HP_82341D_XILINX_DONE_BIT);
+	default:
+		pr_err("bug! unknown hw_version\n");
+		return 0;
+	}
+}
+
+/*
+ * Check whether @irq is acceptable for the detected hardware version.
+ *
+ * Any irq is accepted for the 82341D.  The 82341C accepts only 3, 5, 7, 9,
+ * 10, 11, 12 and 15.  Returns 1 if acceptable, 0 otherwise; every rejection
+ * is logged.
+ */
+static int irq_valid(struct hp_82341_priv *hp_priv, int irq)
+{
+	/* one bit per irq number accepted for the 82341C */
+	static const unsigned int hp_82341c_irq_mask =
+		(1U << 3) | (1U << 5) | (1U << 7) | (1U << 9) |
+		(1U << 10) | (1U << 11) | (1U << 12) | (1U << 15);
+
+	if (hp_priv->hw_version == HW_VERSION_82341D)
+		return 1;
+
+	if (hp_priv->hw_version != HW_VERSION_82341C) {
+		pr_err("bug! unknown hw_version\n");
+		return 0;
+	}
+
+	/* range check first so the shift below is always well defined */
+	if (irq >= 0 && irq <= 15 && (hp_82341c_irq_mask & (1U << irq)))
+		return 1;
+
+	pr_err("invalid irq=%i for 82341C, irq must be 3, 5, 7, 9, 10, 11, 12, or 15.\n",
+	       irq);
+	return 0;
+}
+
+/*
+ * Feed @firmware_length bytes of firmware to the Xilinx, one byte at a time.
+ *
+ * Before each byte the function polls xilinx_ready() up to 100 times with a
+ * 10-15 us sleep between polls; after the last byte it polls xilinx_done()
+ * the same way.  Returns 0 on success or -ETIMEDOUT if either poll runs out
+ * of attempts.
+ */
+static int hp_82341_load_firmware_array(struct hp_82341_priv *hp_priv,
+					const unsigned char *firmware_data,
+					unsigned int firmware_length)
+{
+	static const int timeout = 100;
+	int byte_index, tries;
+
+	for (byte_index = 0; byte_index < firmware_length; ++byte_index) {
+		tries = 0;
+		while (tries < timeout) {
+			if (need_resched())
+				schedule();
+			if (xilinx_ready(hp_priv))
+				break;
+			usleep_range(10, 15);
+			++tries;
+		}
+		if (tries == timeout) {
+			pr_err("timed out waiting for Xilinx ready.\n");
+			return -ETIMEDOUT;
+		}
+		outb(firmware_data[byte_index], hp_priv->iobase[0] + XILINX_DATA_REG);
+	}
+
+	tries = 0;
+	while (tries < timeout) {
+		if (xilinx_done(hp_priv))
+			break;
+		if (need_resched())
+			schedule();
+		usleep_range(10, 15);
+		++tries;
+	}
+	if (tries == timeout) {
+		pr_err("timed out waiting for Xilinx done.\n");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the firmware image supplied in @config and upload it.
+ *
+ * With no init data the board must already report "done"; otherwise -EINVAL
+ * is returned.  With init data, its length must match the expected firmware
+ * length for the detected hardware version (-EINVAL if not).  An unknown
+ * hw_version is only logged; the upload is still attempted.
+ *
+ * Returns 0 on success, -EINVAL as described above, or the result of
+ * hp_82341_load_firmware_array().
+ */
+static int hp_82341_load_firmware(struct hp_82341_priv *hp_priv,
+				  const struct gpib_board_config *config)
+{
+	if (!config->init_data_length) {
+		if (!xilinx_done(hp_priv)) {
+			pr_err("board needs be initialized with firmware upload.\n"
+			       "\tUse the --init-data option of gpib_config.\n");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	if (hp_priv->hw_version == HW_VERSION_82341C) {
+		if (config->init_data_length != hp_82341c_firmware_length) {
+			pr_err("bad firmware length=%i for 82341c (expected %i).\n",
+			       config->init_data_length, hp_82341c_firmware_length);
+			return -EINVAL;
+		}
+	} else if (hp_priv->hw_version == HW_VERSION_82341D) {
+		if (config->init_data_length != hp_82341d_firmware_length) {
+			pr_err("bad firmware length=%i for 82341d (expected %i).\n",
+			       config->init_data_length, hp_82341d_firmware_length);
+			return -EINVAL;
+		}
+	} else {
+		pr_err("bug! unknown hw_version\n");
+	}
+
+	return hp_82341_load_firmware_array(hp_priv, config->init_data, config->init_data_length);
+}
+
+/*
+ * Drive the Xilinx "not program" control to the level given by @assert.
+ *
+ * 82341C: sets or clears DONE_PGL_BIT in the cached config control bits and
+ * writes them to CONFIG_CONTROL_STATUS_REG.
+ * 82341D: writes HP_82341D_NOT_PROG_BIT (or 0) to the isapnp PIO data register.
+ * Any other hw_version is silently ignored.
+ */
+static void set_xilinx_not_prog(struct hp_82341_priv *hp_priv, int assert)
+{
+	if (hp_priv->hw_version == HW_VERSION_82341C) {
+		if (assert)
+			hp_priv->config_control_bits |= DONE_PGL_BIT;
+		else
+			hp_priv->config_control_bits &= ~DONE_PGL_BIT;
+		outb(hp_priv->config_control_bits, hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG);
+	} else if (hp_priv->hw_version == HW_VERSION_82341D) {
+		isapnp_write_byte(PIO_DATA_REG, assert ? HP_82341D_NOT_PROG_BIT : 0x0);
+	}
+}
+
+// clear xilinx firmware
+/*
+ * Pulse the "not program" control 1 -> 0 -> 1, sleeping 1 ms after each
+ * level change.  Returns 0 on success or -EINTR if a sleep was interrupted
+ * (the sequence is abandoned at that point).
+ */
+static int clear_xilinx(struct hp_82341_priv *hp_priv)
+{
+	int step;
+
+	for (step = 0; step < 3; ++step) {
+		/* even steps assert (1), the odd step in between deasserts (0) */
+		set_xilinx_not_prog(hp_priv, !(step & 1));
+		if (msleep_interruptible(1))
+			return -EINTR;
+	}
+
+	return 0;
+}
+
+/*
+ * Bring an HP 82341 board online.
+ *
+ * With config->ibbase == 0 the board is located through ISA PnP and treated
+ * as an 82341D; otherwise config->ibbase and config->ibirq are used directly
+ * and the board is treated as an 82341C.  The function then claims the I/O
+ * regions, resets the Xilinx and loads its firmware, installs the interrupt
+ * handler, enables the board's events/interrupts and puts the tms9914 online.
+ *
+ * Returns 0 on success or a negative errno.  Resources acquired before a
+ * failure stay recorded in the private data and are not released here
+ * (NOTE(review): presumably the caller runs hp_82341_detach() after a failed
+ * attach - confirm against the gpib core).
+ */
+static int hp_82341_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct hp_82341_priv *hp_priv;
+	struct tms9914_priv *tms_priv;
+	u32 start_addr;
+	u32 iobase;
+	int irq;
+	int i;
+	int retval;
+
+	board->status = 0;
+	if (hp_82341_allocate_private(board))
+		return -ENOMEM;
+	hp_priv = board->private_data;
+	tms_priv = &hp_priv->tms9914_priv;
+	tms_priv->read_byte = hp_82341_read_byte;
+	tms_priv->write_byte = hp_82341_write_byte;
+	tms_priv->offset = 1;
+
+	if (config->ibbase == 0) {
+		struct pnp_dev *dev;
+
+		retval = hp_82341_find_isapnp_board(&dev);
+		if (retval < 0)
+			return retval;
+		hp_priv->pnp_dev = dev;
+		iobase = pnp_port_start(dev, 0);
+		irq = pnp_irq(dev, 0);
+		hp_priv->hw_version = HW_VERSION_82341D;
+		hp_priv->io_region_offset = 0x8;
+	} else {
+		iobase = config->ibbase;
+		irq = config->ibirq;
+		hp_priv->hw_version = HW_VERSION_82341C;
+		hp_priv->io_region_offset = 0x400;
+	}
+	for (i = 0; i < hp_82341_num_io_regions; ++i) {
+		start_addr = iobase + i * hp_priv->io_region_offset;
+		if (!request_region(start_addr, hp_82341_region_iosize, DRV_NAME)) {
+			dev_err(board->gpib_dev, "failed to allocate io ports 0x%x-0x%x\n",
+				start_addr,
+				start_addr + hp_82341_region_iosize - 1);
+			return -EIO;
+		}
+		/* recorded only after a successful claim so detach releases exactly these */
+		hp_priv->iobase[i] = start_addr;
+	}
+	tms_priv->iobase = hp_priv->iobase[2];
+	if (hp_priv->hw_version == HW_VERSION_82341D) {
+		retval = isapnp_cfg_begin(hp_priv->pnp_dev->card->number,
+					  hp_priv->pnp_dev->number);
+		if (retval < 0)	{
+			dev_err(board->gpib_dev, "isapnp_cfg_begin returned error\n");
+			return retval;
+		}
+		isapnp_write_byte(PIO_DIRECTION_REG, HP_82341D_XILINX_READY_BIT |
+				  HP_82341D_XILINX_DONE_BIT);
+	}
+	/*
+	 * A successful isapnp_cfg_begin() must be balanced by isapnp_cfg_end()
+	 * on every path, so a clear_xilinx() failure only skips the firmware
+	 * upload and still falls through to the isapnp_cfg_end() call below.
+	 */
+	retval = clear_xilinx(hp_priv);
+	if (retval >= 0)
+		retval = hp_82341_load_firmware(hp_priv, config);
+	if (hp_priv->hw_version == HW_VERSION_82341D)
+		isapnp_cfg_end();
+	if (retval < 0)
+		return retval;
+	if (irq_valid(hp_priv, irq) == 0)
+		return -EINVAL;
+	if (request_irq(irq, hp_82341_interrupt, 0, DRV_NAME, board))	{
+		dev_err(board->gpib_dev, "failed to allocate IRQ %d\n", irq);
+		return -EIO;
+	}
+	/* recorded only after request_irq() succeeded; detach frees it if nonzero */
+	hp_priv->irq = irq;
+	hp_priv->config_control_bits &= ~IRQ_SELECT_MASK;
+	hp_priv->config_control_bits |= IRQ_SELECT_BITS(irq);
+	outb(hp_priv->config_control_bits, hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG);
+	hp_priv->mode_control_bits |= ENABLE_IRQ_CONFIG_BIT;
+	outb(hp_priv->mode_control_bits, hp_priv->iobase[0] + MODE_CONTROL_STATUS_REG);
+	tms9914_board_reset(tms_priv);
+	outb(ENABLE_BUFFER_END_EVENT_BIT | ENABLE_TERMINAL_COUNT_EVENT_BIT |
+	     ENABLE_TI_INTERRUPT_EVENT_BIT, hp_priv->iobase[0] +  EVENT_ENABLE_REG);
+	outb(ENABLE_BUFFER_END_INTERRUPT_BIT | ENABLE_TERMINAL_COUNT_INTERRUPT_BIT |
+	     ENABLE_TI_INTERRUPT_BIT, hp_priv->iobase[0] + INTERRUPT_ENABLE_REG);
+	// write clear event register
+	outb((TI_INTERRUPT_EVENT_BIT | POINTERS_EQUAL_EVENT_BIT |
+	      BUFFER_END_EVENT_BIT | TERMINAL_COUNT_EVENT_BIT),
+	     hp_priv->iobase[0] + EVENT_STATUS_REG);
+
+	tms9914_online(board, tms_priv);
+
+	return 0;
+}
+
+/*
+ * Undo whatever hp_82341_attach() managed to set up.
+ *
+ * Each resource is released only if the private data shows it was acquired
+ * (nonzero iobase entry, nonzero irq, non-NULL pnp_dev), so this is safe to
+ * call on a partially attached board and on a board with no private data.
+ */
+static void hp_82341_detach(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	int region;
+
+	if (!hp_priv)
+		goto free_private;
+
+	if (hp_priv->iobase[0])	{
+		struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+
+		/* disable the board's interrupts before the handler goes away */
+		outb(0, hp_priv->iobase[0] + INTERRUPT_ENABLE_REG);
+		if (tms_priv->iobase)
+			tms9914_board_reset(tms_priv);
+		if (hp_priv->irq)
+			free_irq(hp_priv->irq, board);
+	}
+
+	for (region = 0; region < hp_82341_num_io_regions; ++region) {
+		if (!hp_priv->iobase[region])
+			continue;
+		release_region(hp_priv->iobase[region], hp_82341_region_iosize);
+	}
+
+	if (hp_priv->pnp_dev)
+		pnp_device_detach(hp_priv->pnp_dev);
+
+free_private:
+	hp_82341_free_private(board);
+}
+
+#if 0
+/* unused, will be needed when the driver is turned into a pnp_driver */
+static const struct pnp_device_id hp_82341_pnp_table[] = {
+	{.id = "HWP1411"},
+	{.id = ""}
+};
+MODULE_DEVICE_TABLE(pnp, hp_82341_pnp_table);
+#endif
+
+/*
+ * Module init: register the unaccelerated interface first, then the
+ * accelerated one.  If the second registration fails the first is rolled
+ * back.  Returns 0 on success or the error from gpib_register_driver().
+ */
+static int __init hp_82341_init_module(void)
+{
+	bool unaccel_registered = false;
+	int ret;
+
+	ret = gpib_register_driver(&hp_82341_unaccel_interface, THIS_MODULE);
+	if (ret)
+		goto fail;
+	unaccel_registered = true;
+
+	ret = gpib_register_driver(&hp_82341_interface, THIS_MODULE);
+	if (ret)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_err("gpib_register_driver failed: error = %d\n", ret);
+	if (unaccel_registered)
+		gpib_unregister_driver(&hp_82341_unaccel_interface);
+	return ret;
+}
+
+/*
+ * Module exit: unregister both interfaces, in the reverse of the order in
+ * which hp_82341_init_module() registered them.
+ */
+static void __exit hp_82341_exit_module(void)
+{
+	gpib_unregister_driver(&hp_82341_interface);
+	gpib_unregister_driver(&hp_82341_unaccel_interface);
+}
+
+module_init(hp_82341_init_module);
+module_exit(hp_82341_exit_module);
+
+/*
+ * GPIB interrupt service routines
+ */
+/*
+ * Fetch the event bits accumulated by the interrupt handler and reset the
+ * accumulator to zero.  Both steps happen under board->spinlock with
+ * interrupts disabled.  Returns the bits that were pending.
+ */
+static unsigned short read_and_clear_event_status(struct gpib_board *board)
+{
+	struct hp_82341_priv *hp_priv = board->private_data;
+	unsigned short pending;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	pending = hp_priv->event_status_bits;
+	hp_priv->event_status_bits = 0;
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return pending;
+}
+
+/*
+ * Interrupt handler; @arg is the struct gpib_board passed to request_irq().
+ *
+ * Reads EVENT_STATUS_REG, writes back the write-clear event bits that are set
+ * and ORs the raw status into event_status_bits for
+ * read_and_clear_event_status().  On a TI event it also reads the tms9914
+ * ISR0/ISR1 registers and passes them to tms9914_interrupt_have_status().
+ *
+ * Returns IRQ_HANDLED if INTERRUPT_PENDING_EVENT_BIT was set, else IRQ_NONE.
+ */
+static irqreturn_t hp_82341_interrupt(int irq, void *arg)
+{
+	const int clearable_events = TI_INTERRUPT_EVENT_BIT | POINTERS_EQUAL_EVENT_BIT |
+				     BUFFER_END_EVENT_BIT | TERMINAL_COUNT_EVENT_BIT;
+	struct gpib_board *board = arg;
+	struct hp_82341_priv *hp_priv = board->private_data;
+	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
+	irqreturn_t handled;
+	unsigned long flags;
+	int events;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	events = inb(hp_priv->iobase[0] + EVENT_STATUS_REG);
+	handled = (events & INTERRUPT_PENDING_EVENT_BIT) ? IRQ_HANDLED : IRQ_NONE;
+	if (events & clearable_events) {
+		// write-clear status bits
+		outb(events & clearable_events, hp_priv->iobase[0] + EVENT_STATUS_REG);
+		hp_priv->event_status_bits |= events;
+	}
+	if (events & TI_INTERRUPT_EVENT_BIT) {
+		int isr0 = read_byte(tms_priv, ISR0);
+		int isr1 = read_byte(tms_priv, ISR1);
+
+		tms9914_interrupt_have_status(board, tms_priv, isr0, isr1);
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return handled;
+}
+
+/*
+ * Read the transfer counter and return it as a non-negative count.
+ *
+ * The low register supplies bits 0-7 and the mid register bits 8-14.  The
+ * 15-bit register value is negated (two's complement) and masked to 15 bits,
+ * the inverse of the negation done by set_transfer_counter().
+ */
+static int read_transfer_counter(struct hp_82341_priv *hp_priv)
+{
+	int raw;
+
+	/* low byte is read before the mid byte */
+	raw = inb(hp_priv->iobase[1] + TRANSFER_COUNT_LOW_REG) & 0xff;
+	raw |= (inb(hp_priv->iobase[1] + TRANSFER_COUNT_MID_REG) << 8) & 0x7f00;
+
+	/* -raw is the same value as ~(raw - 1) */
+	return -raw & 0x7fff;
+}
+
+/*
+ * Load the transfer counter with the negated @count, written low byte first,
+ * then the mid byte, then the low four bits of the third byte.
+ */
+static void set_transfer_counter(struct hp_82341_priv *hp_priv, int count)
+{
+	const unsigned long counter_base = hp_priv->iobase[1];
+	const int negated = -count;
+
+	outb(negated & 0xff, counter_base + TRANSFER_COUNT_LOW_REG);
+	outb((negated >> 8) & 0xff, counter_base + TRANSFER_COUNT_MID_REG);
+	// I don't think the hi count reg is even used, but oh well
+	outb((negated >> 16) & 0xf, counter_base + TRANSFER_COUNT_HIGH_REG);
+}
+
diff --git a/drivers/gpib/hp_82341/hp_82341.h b/drivers/gpib/hp_82341/hp_82341.h
new file mode 100644
index 000000000000..859ef2899acb
--- /dev/null
+++ b/drivers/gpib/hp_82341/hp_82341.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002, 2005 by Frank Mori Hess             *
+ ***************************************************************************/
+
+#include "tms9914.h"
+#include "gpibP.h"
+
+enum hp_82341_hardware_version {
+	HW_VERSION_UNKNOWN,
+	HW_VERSION_82341C,
+	HW_VERSION_82341D,
+};
+
+// struct which defines private_data for board
+// (allocated zero-filled by hp_82341_allocate_private(), so every field starts out as 0/NULL)
+struct hp_82341_priv {
+	// state handed to the tms9914_* library calls
+	struct tms9914_priv tms9914_priv;
+	// irq number, stored by attach only after request_irq() succeeded; detach frees it if nonzero
+	unsigned int irq;
+	// cached copy of the value last written to CONFIG_CONTROL_STATUS_REG
+	unsigned short config_control_bits;
+	// cached copy of the value last written to MODE_CONTROL_STATUS_REG
+	unsigned short mode_control_bits;
+	// event status accumulated by the interrupt handler, consumed by read_and_clear_event_status()
+	unsigned short event_status_bits;
+	// set by attach only when the board was found through isapnp; detach detaches it if non-NULL
+	struct pnp_dev *pnp_dev;
+	// start address of each claimed io region; an entry stays 0 until request_region() succeeded
+	unsigned long iobase[4];
+	// distance between the start addresses of consecutive io regions (attach: 0x8 or 0x400)
+	unsigned long io_region_offset;
+	// board variant chosen by attach; selects how the Xilinx status/control bits are accessed
+	enum hp_82341_hardware_version hw_version;
+};
+
+static const int hp_82341_region_iosize = 0x8;
+static const int hp_82341_num_io_regions = 4;
+static const int hp_82341_fifo_size = 0xffe;
+static const int hp_82341c_firmware_length = 5764;
+static const int hp_82341d_firmware_length = 5302;
+
+// hp 82341 register offsets
+enum hp_82341_region_0_registers {
+	CONFIG_CONTROL_STATUS_REG = 0x0,
+	MODE_CONTROL_STATUS_REG = 0x1,
+	MONITOR_REG = 0x2,	// after initialization
+	XILINX_DATA_REG = 0x2,	// before initialization, write only
+	INTERRUPT_ENABLE_REG = 0x3,
+	EVENT_STATUS_REG = 0x4,
+	EVENT_ENABLE_REG = 0x5,
+	STREAM_STATUS_REG = 0x7,
+};
+
+enum hp_82341_region_1_registers {
+	ID0_REG = 0x2,
+	ID1_REG = 0x3,
+	TRANSFER_COUNT_LOW_REG = 0x4,
+	TRANSFER_COUNT_MID_REG = 0x5,
+	TRANSFER_COUNT_HIGH_REG = 0x6,
+};
+
+enum hp_82341_region_3_registers {
+	BUFFER_PORT_LOW_REG = 0x0,
+	BUFFER_PORT_HIGH_REG = 0x1,
+	ID2_REG = 0x2,
+	ID3_REG = 0x3,
+	BUFFER_FLUSH_REG = 0x4,
+	BUFFER_CONTROL_REG = 0x7
+};
+
+enum config_control_status_bits {
+	IRQ_SELECT_MASK = 0x7,
+	DMA_CONFIG_MASK = 0x18,
+	ENABLE_DMA_CONFIG_BIT = 0x20,
+	XILINX_READY_BIT = 0x40,	// read only
+	DONE_PGL_BIT = 0x80
+};
+
+/*
+ * Map an irq number to the code placed in the IRQ_SELECT_MASK field.
+ * Irq numbers without an entry (including negative or > 15) yield 0x0,
+ * which is also the code for irq 9.
+ */
+static inline unsigned int IRQ_SELECT_BITS(int irq)
+{
+	/* indexed by irq number; entries not listed are zero */
+	static const unsigned char select_code[16] = {
+		[3] = 0x3,
+		[5] = 0x2,
+		[7] = 0x1,
+		[9] = 0x0,
+		[10] = 0x7,
+		[11] = 0x6,
+		[12] = 0x5,
+		[15] = 0x4,
+	};
+
+	if (irq < 0 || irq > 15)
+		return 0x0;
+	return select_code[irq];
+}
+
+enum mode_control_status_bits {
+	SLOT8_BIT = 0x1,		// read only
+	ACTIVE_CONTROLLER_BIT = 0x2,	// read only
+	ENABLE_DMA_BIT = 0x4,
+	SYSTEM_CONTROLLER_BIT = 0x8,
+	MONITOR_BIT = 0x10,
+	ENABLE_IRQ_CONFIG_BIT = 0x20,
+	ENABLE_TI_STREAM_BIT = 0x40
+};
+
+enum monitor_bits {
+	MONITOR_INTERRUPT_PENDING_BIT = 0x1,	// read only
+	MONITOR_CLEAR_HOLDOFF_BIT = 0x2,	// write only
+	MONITOR_PPOLL_BIT = 0x4,		// write clear
+	MONITOR_SRQ_BIT = 0x8,			// write clear
+	MONITOR_IFC_BIT = 0x10,			// write clear
+	MONITOR_REN_BIT = 0x20,			// write clear
+	MONITOR_END_BIT = 0x40,			// write clear
+	MONITOR_DAV_BIT = 0x80			// write clear
+};
+
+enum interrupt_enable_bits {
+	ENABLE_TI_INTERRUPT_BIT = 0x1,
+	ENABLE_POINTERS_EQUAL_INTERRUPT_BIT = 0x4,
+	ENABLE_BUFFER_END_INTERRUPT_BIT = 0x10,
+	ENABLE_TERMINAL_COUNT_INTERRUPT_BIT = 0x20,
+	ENABLE_DMA_TERMINAL_COUNT_INTERRUPT_BIT = 0x80,
+};
+
+enum event_status_bits {
+	TI_INTERRUPT_EVENT_BIT = 0x1,		// write clear
+	INTERRUPT_PENDING_EVENT_BIT = 0x2,	// read only
+	POINTERS_EQUAL_EVENT_BIT = 0x4,		// write clear
+	BUFFER_END_EVENT_BIT = 0x10,		// write clear
+	TERMINAL_COUNT_EVENT_BIT = 0x20,	// write clear
+	DMA_TERMINAL_COUNT_EVENT_BIT = 0x80,	// write clear
+};
+
+enum event_enable_bits {
+	ENABLE_TI_INTERRUPT_EVENT_BIT = 0x1,		// write clear
+	ENABLE_POINTERS_EQUAL_EVENT_BIT = 0x4,		// write clear
+	ENABLE_BUFFER_END_EVENT_BIT = 0x10,		// write clear
+	ENABLE_TERMINAL_COUNT_EVENT_BIT = 0x20,		// write clear
+	ENABLE_DMA_TERMINAL_COUNT_EVENT_BIT = 0x80,	// write clear
+};
+
+enum stream_status_bits {
+	HALTED_STATUS_BIT = 0x1,	// read
+	RESTART_STREAM_BIT = 0x1	// write
+};
+
+enum buffer_control_bits {
+	DIRECTION_GPIB_TO_HOST_BIT = 0x20,	// transfer direction (set for gpib to host)
+	ENABLE_TI_BUFFER_BIT = 0x40,		// enable fifo
+	FAST_WR_EN_BIT = 0x80,			// 350 ns t1 delay?
+};
+
+// registers accessible through isapnp chip on 82341d
+enum hp_82341d_pnp_registers {
+	PIO_DATA_REG = 0x20,		// read/write pio data lines
+	PIO_DIRECTION_REG = 0x21,	// set pio data line directions (set for input)
+};
+
+enum hp_82341d_pnp_pio_bits {
+	HP_82341D_XILINX_READY_BIT = 0x1,
+	HP_82341D_XILINX_DONE_BIT = 0x2,
+	// use register layout compatible with C and older versions instead of 32 contiguous ioports
+	HP_82341D_LEGACY_MODE_BIT = 0x4,
+	HP_82341D_NOT_PROG_BIT = 0x8,	// clear to reinitialize xilinx
+};
diff --git a/drivers/gpib/include/amcc5920.h b/drivers/gpib/include/amcc5920.h
new file mode 100644
index 000000000000..7a88bd282feb
--- /dev/null
+++ b/drivers/gpib/include/amcc5920.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *  Header for amcc5920 pci chip
+ *
+ *   copyright		  : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+// amcc5920 pci chip registers and bits
+enum amcc_registers {
+	AMCC_INTCS_REG = 0x38,
+	AMCC_PASS_THRU_REG	= 0x60,
+};
+
+enum amcc_incsr_bits {
+	AMCC_ADDON_INTR_ENABLE_BIT = 0x2000,
+	AMCC_ADDON_INTR_ACTIVE_BIT = 0x400000,
+	AMCC_INTR_ACTIVE_BIT = 0x800000,
+};
+
+static const int bits_per_region = 8;
+
+/*
+ * Translate a 1-based region number into the bit offset of that region's
+ * field inside a 32-bit word (region 1 -> bit 0, region 2 -> bit 8, ...).
+ * Returns -1 for a region whose field would not fit in 32 bits; that
+ * includes region 0.
+ */
+static inline int amcc_region_shift(unsigned int region)
+{
+	/* unsigned wrap-around makes region 0 fail this test as well */
+	if (region - 1 >= (unsigned int)(32 / bits_per_region))
+		return -1;
+	return (region - 1) * bits_per_region;
+}
+
+/*
+ * Low three bits of @num_wait_states, positioned in the field of @region.
+ * Returns 0 for an out-of-range region.
+ */
+static inline uint32_t amcc_wait_state_bits(unsigned int region, unsigned int num_wait_states)
+{
+	int shift = amcc_region_shift(region);
+
+	if (shift < 0)
+		return 0;
+	return (uint32_t)(num_wait_states & 0x7) << shift;
+}
+
+enum amcc_prefetch_bits {
+	PREFETCH_DISABLED = 0x0,
+	PREFETCH_SMALL = 0x8,
+	PREFETCH_MEDIUM = 0x10,
+	PREFETCH_LARGE = 0x18,
+};
+
+/*
+ * @prefetch setting positioned in the field of @region.
+ * Returns 0 for an out-of-range region.
+ */
+static inline uint32_t amcc_prefetch_bits(unsigned int region, enum amcc_prefetch_bits prefetch)
+{
+	int shift = amcc_region_shift(region);
+
+	if (shift < 0)
+		return 0;
+	return (uint32_t)prefetch << shift;
+}
+
+/*
+ * Bit 0x80 positioned in the field of @region.
+ * Returns 0 for an out-of-range region.
+ */
+static inline uint32_t amcc_PTADR_mode_bit(unsigned int region)
+{
+	int shift = amcc_region_shift(region);
+
+	if (shift < 0)
+		return 0;
+	/* the cast keeps the shift unsigned: region 4 sets bit 31 */
+	return (uint32_t)0x80 << shift;
+}
+
+/*
+ * Bit 0x20 positioned in the field of @region.
+ * Returns 0 for an out-of-range region.
+ */
+static inline uint32_t amcc_disable_write_fifo_bit(unsigned int region)
+{
+	int shift = amcc_region_shift(region);
+
+	if (shift < 0)
+		return 0;
+	return (uint32_t)0x20 << shift;
+}
+
diff --git a/drivers/gpib/include/amccs5933.h b/drivers/gpib/include/amccs5933.h
new file mode 100644
index 000000000000..d7f63c795096
--- /dev/null
+++ b/drivers/gpib/include/amccs5933.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ * Registers and bits for amccs5933 pci chip
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+// register offsets
+enum {
+	MBEF_REG = 0x34,	// mailbox empty/full
+	INTCSR_REG = 0x38,	// interrupt control and status
+	BMCSR_REG = 0x3c,	// bus master control and status
+};
+
+// incoming mailbox 0-3  register offsets
+/*
+ * Register offset of incoming mailbox @mailbox: 0x10 plus 4 per mailbox.
+ * static inline, so every file including this header gets a private copy
+ * and no external definition is needed or emitted.
+ */
+static inline int INCOMING_MAILBOX_REG(unsigned int mailbox)
+{
+	return (0x10 + 4 * mailbox);
+}
+
+// bit definitions
+
+// INTCSR bits
+enum {
+	OUTBOX_EMPTY_INTR_BIT = 0x10,	// enable outbox empty interrupt
+	INBOX_FULL_INTR_BIT = 0x1000,	// enable inbox full interrupt
+	INBOX_INTR_CS_BIT = 0x20000,	// read, or write clear inbox full interrupt
+	INTR_ASSERTED_BIT = 0x800000,	// read only, interrupt asserted
+};
+
+// select byte 0 to 3 of incoming mailbox
+/*
+ * Low two bits of @byte placed at bits 8-9.
+ * static inline, so no external definition is needed or emitted.
+ */
+static inline int INBOX_BYTE_BITS(unsigned int byte)
+{
+	return (byte & 0x3) << 8;
+}
+
+// select incoming mailbox 0 to 3
+/*
+ * Low two bits of @mailbox placed at bits 10-11.
+ * static inline, so no external definition is needed or emitted.
+ */
+static inline int INBOX_SELECT_BITS(unsigned int mailbox)
+{
+	return (mailbox & 0x3) << 10;
+}
+
+// select byte 0 to 3 of outgoing mailbox
+/*
+ * Low two bits of @byte placed at bits 0-1.
+ * static inline, so no external definition is needed or emitted.
+ */
+static inline int OUTBOX_BYTE_BITS(unsigned int byte)
+{
+	return (byte & 0x3);
+}
+
+// select outgoing mailbox 0 to 3
+/*
+ * Low two bits of @mailbox placed at bits 2-3.
+ * static inline, so no external definition is needed or emitted.
+ */
+static inline int OUTBOX_SELECT_BITS(unsigned int mailbox)
+{
+	return (mailbox & 0x3) << 2;
+}
+
+// BMCSR bits
+enum {
+	MBOX_FLAGS_RESET_BIT = 0x08000000,	// resets mailbox empty/full flags
+};
+
diff --git a/drivers/gpib/include/gpibP.h b/drivers/gpib/include/gpibP.h
new file mode 100644
index 000000000000..e3938ada3e0d
--- /dev/null
+++ b/drivers/gpib/include/gpibP.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright		   : (C) 2002,2003 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_P_H
+#define _GPIB_P_H
+
+#include <linux/types.h>
+
+#include "gpib_types.h"
+#include "gpib_proto.h"
+#include "gpib_cmd.h"
+#include <linux/gpib.h>
+#include <linux/gpib_ioctl.h>
+
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+
+int gpib_register_driver(struct gpib_interface *interface, struct module *mod);
+void gpib_unregister_driver(struct gpib_interface *interface);
+struct pci_dev *gpib_pci_get_device(const struct gpib_board_config *config, unsigned int vendor_id,
+				    unsigned int device_id, struct pci_dev *from);
+struct pci_dev *gpib_pci_get_subsys(const struct gpib_board_config *config, unsigned int vendor_id,
+				    unsigned int device_id, unsigned int ss_vendor,
+				    unsigned int ss_device, struct pci_dev *from);
+unsigned int num_gpib_events(const struct gpib_event_queue *queue);
+int push_gpib_event(struct gpib_board *board, short event_type);
+int pop_gpib_event(struct gpib_board *board, struct gpib_event_queue *queue, short *event_type);
+int gpib_request_pseudo_irq(struct gpib_board *board, irqreturn_t (*handler)(int, void *));
+void gpib_free_pseudo_irq(struct gpib_board *board);
+int gpib_match_device_path(struct device *dev, const char *device_path_in);
+
+extern struct gpib_board board_array[GPIB_MAX_NUM_BOARDS];
+
+extern struct list_head registered_drivers;
+
+#endif	// _GPIB_P_H
+
diff --git a/drivers/gpib/include/gpib_cmd.h b/drivers/gpib/include/gpib_cmd.h
new file mode 100644
index 000000000000..9e96a3bfa22d
--- /dev/null
+++ b/drivers/gpib/include/gpib_cmd.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _GPIB_CMD_H
+#define _GPIB_CMD_H
+
+#include <linux/types.h>
+
+/* Command byte definitions tests and functions */
+
+/* mask of bits that actually matter in a command byte */
+enum {
+	gpib_command_mask = 0x7f,
+};
+
+/* Possible GPIB command messages */
+
+enum cmd_byte {
+	GTL = 0x1,	/* go to local			*/
+	SDC = 0x4,	/* selected device clear	*/
+	PP_CONFIG = 0x5,
+	GET = 0x8,	/* group execute trigger	*/
+	TCT = 0x9,	/* take control			*/
+	LLO = 0x11,	/* local lockout		*/
+	DCL = 0x14,	/* device clear			*/
+	PPU = 0x15,	/* parallel poll unconfigure	*/
+	SPE = 0x18,	/* serial poll enable		*/
+	SPD = 0x19,	/* serial poll disable		*/
+	CFE = 0x1f,     /* configure enable */
+	LAD = 0x20,	/* value to be 'ored' in to obtain listen address */
+	UNL = 0x3F,	/* unlisten			*/
+	TAD = 0x40,	/* value to be 'ored' in to obtain talk address	  */
+	UNT = 0x5F,	/* untalk			*/
+	SAD = 0x60,	/* my secondary address (base) */
+	PPE = 0x60,	/* parallel poll enable (base)	*/
+	PPD = 0x70	/* parallel poll disable	*/
+};
+
+/* confine address to range 0 to 30. */
+/*
+ * Keep only the low five bits of @addr and fold the value 0x1f to 0, so the
+ * result is always in the range 0 to 30.
+ */
+static inline unsigned int gpib_address_restrict(u32 addr)
+{
+	const u32 low_bits = addr & 0x1f;
+
+	return low_bits == 0x1f ? 0 : low_bits;
+}
+
+/* Listen address byte: LAD ORed with the restricted (0-30) address. */
+static inline u8 MLA(u32 addr)
+{
+	return LAD | gpib_address_restrict(addr);
+}
+
+/* Talk address byte: TAD ORed with the restricted (0-30) address. */
+static inline u8 MTA(u32 addr)
+{
+	return TAD | gpib_address_restrict(addr);
+}
+
+/* Secondary address byte: SAD ORed with the low five bits of @addr (0x1f is not folded). */
+static inline u8 MSA(u32 addr)
+{
+	return SAD | (addr & 0x1f);
+}
+
+/*
+ * Compare two (primary, secondary) address pairs.  Returns 1 if the primary
+ * addresses match and the secondary addresses are either equal or both
+ * negative, 0 otherwise.
+ */
+static inline s32 gpib_address_equal(u32 pad1, s32 sad1, u32 pad2, s32 sad2)
+{
+	if (pad1 != pad2)
+		return 0;
+
+	return sad1 == sad2 || (sad1 < 0 && sad2 < 0);
+}
+
+/* True when bits 4-6 of @command equal the PPE base value (0x60). */
+static inline s32 is_PPE(u8 command)
+{
+	return (command & 0x70) == PPE;
+}
+
+/* True when bits 4-6 of @command equal the PPD value (0x70). */
+static inline s32 is_PPD(u8 command)
+{
+	return (command & 0x70) == PPD;
+}
+
+/* True when bits 4-6 of @command are all clear. */
+static inline s32 in_addressed_command_group(u8 command)
+{
+	return !(command & 0x70);
+}
+
+/* True when, of bits 4-6 of @command, only bit 4 is set. */
+static inline s32 in_universal_command_group(u8 command)
+{
+	const u8 group = command & 0x70;
+
+	return group == 0x10;
+}
+
+/* True when bits 5-6 of @command equal the LAD base value (0x20). */
+static inline s32 in_listen_address_group(u8 command)
+{
+	return (command & 0x60) == LAD;
+}
+
+/* True when bits 5-6 of @command equal the TAD base value (0x40). */
+static inline s32 in_talk_address_group(u8 command)
+{
+	return (command & 0x60) == TAD;
+}
+
+/*
+ * True when @command belongs to the addressed, universal, listen-address or
+ * talk-address group.  Those four groups together cover every value whose
+ * bits 5-6 are 00, 01 or 10, so the test reduces to "bits 5-6 are not both
+ * set".
+ */
+static inline s32 in_primary_command_group(u8 command)
+{
+	return (command & 0x60) != 0x60;
+}
+
+#endif /* _GPIB_CMD_H */
diff --git a/drivers/gpib/include/gpib_pci_ids.h b/drivers/gpib/include/gpib_pci_ids.h
new file mode 100644
index 000000000000..52dcab07a7d1
--- /dev/null
+++ b/drivers/gpib/include/gpib_pci_ids.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __GPIB_PCI_IDS_H
+#define __GPIB_PCI_IDS_H
+
+#ifndef PCI_VENDOR_ID_AMCC
+#define PCI_VENDOR_ID_AMCC	0x10e8
+#endif
+
+#ifndef PCI_VENDOR_ID_CBOARDS
+#define PCI_VENDOR_ID_CBOARDS	0x1307
+#endif
+
+#ifndef PCI_VENDOR_ID_QUANCOM
+#define PCI_VENDOR_ID_QUANCOM	0x8008
+#endif
+
+#ifndef PCI_DEVICE_ID_QUANCOM_GPIB
+#define PCI_DEVICE_ID_QUANCOM_GPIB	0x3302
+#endif
+
+#endif	// __GPIB_PCI_IDS_H
+
diff --git a/drivers/gpib/include/gpib_proto.h b/drivers/gpib/include/gpib_proto.h
new file mode 100644
index 000000000000..42e736e3b7cd
--- /dev/null
+++ b/drivers/gpib/include/gpib_proto.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef GPIB_PROTO_INCLUDED
+#define GPIB_PROTO_INCLUDED
+
+#include <linux/fs.h>
+
+int ibopen(struct inode *inode, struct file *filep);
+int ibclose(struct inode *inode, struct file *file);
+long ibioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+void os_start_timer(struct gpib_board *board, unsigned int usec_timeout);
+void os_remove_timer(struct gpib_board *board);
+void init_gpib_board(struct gpib_board *board);
+static inline unsigned long usec_to_jiffies(unsigned int usec)
+{
+	unsigned long usec_per_jiffy = 1000000 / HZ;
+
+	return 1 + (usec + usec_per_jiffy - 1) / usec_per_jiffy;
+};
+
+int serial_poll_all(struct gpib_board *board, unsigned int usec_timeout);
+void init_gpib_descriptor(struct gpib_descriptor *desc);
+int dvrsp(struct gpib_board *board, unsigned int pad, int sad,
+	  unsigned int usec_timeout, u8 *result);
+int ibcac(struct gpib_board *board, int sync, int fallback_to_async);
+int ibcmd(struct gpib_board *board, u8 *buf, size_t length, size_t *bytes_written);
+int ibgts(struct gpib_board *board);
+int ibonline(struct gpib_board *board);
+int iboffline(struct gpib_board *board);
+int iblines(const struct gpib_board *board, short *lines);
+int ibrd(struct gpib_board *board, u8 *buf, size_t length, int *end_flag, size_t *bytes_read);
+int ibrpp(struct gpib_board *board, u8 *buf);
+int ibrsv2(struct gpib_board *board, u8 status_byte, int new_reason_for_service);
+int ibrsc(struct gpib_board *board, int request_control);
+int ibsic(struct gpib_board *board, unsigned int usec_duration);
+int ibsre(struct gpib_board *board, int enable);
+int ibpad(struct gpib_board *board, unsigned int addr);
+int ibsad(struct gpib_board *board, int addr);
+int ibeos(struct gpib_board *board, int eos, int eosflags);
+int ibwait(struct gpib_board *board, int wait_mask, int clear_mask, int set_mask,
+	   int *status, unsigned long usec_timeout, struct gpib_descriptor *desc);
+int ibwrt(struct gpib_board *board, u8 *buf, size_t cnt, int send_eoi, size_t *bytes_written);
+int ibstatus(struct gpib_board *board);
+int general_ibstatus(struct gpib_board *board, const struct gpib_status_queue *device,
+		     int clear_mask, int set_mask, struct gpib_descriptor *desc);
+int io_timed_out(struct gpib_board *board);
+int ibppc(struct gpib_board *board, u8 configuration);
+
+#endif /* GPIB_PROTO_INCLUDED */
diff --git a/drivers/gpib/include/gpib_state_machines.h b/drivers/gpib/include/gpib_state_machines.h
new file mode 100644
index 000000000000..7488c00f191e
--- /dev/null
+++ b/drivers/gpib/include/gpib_state_machines.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2006 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_STATE_MACHINES_H
+#define _GPIB_STATE_MACHINES_H
+
+enum talker_function_state {
+	talker_idle,
+	talker_addressed,
+	talker_active,
+	serial_poll_active
+};
+
+enum listener_function_state {
+	listener_idle,
+	listener_addressed,
+	listener_active
+};
+
+#endif	// _GPIB_STATE_MACHINES_H
diff --git a/drivers/gpib/include/gpib_types.h b/drivers/gpib/include/gpib_types.h
new file mode 100644
index 000000000000..5a0978ae27e7
--- /dev/null
+++ b/drivers/gpib/include/gpib_types.h
@@ -0,0 +1,381 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright		   : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_TYPES_H
+#define _GPIB_TYPES_H
+
+#ifdef __KERNEL__
+#include <linux/gpib.h>
+#include <linux/atomic.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/interrupt.h>
+
+struct gpib_board;
+
+/* config parameters that are only used by driver attach functions */
+struct gpib_board_config {
+	/* firmware blob */
+	void *init_data;
+	int init_data_length;
+	/* IO base address to use for non-pnp cards (set by core, driver should make local copy) */
+	u32 ibbase;
+	void __iomem *mmibbase;
+	/* IRQ to use for non-pnp cards (set by core, driver should make local copy) */
+	unsigned int ibirq;
+	/* dma channel to use for non-pnp cards (set by core, driver should make local copy) */
+	unsigned int ibdma;
+	/*
+	 * pci bus of card, useful for distinguishing multiple identical pci cards
+	 * (negative means don't care)
+	 */
+	int pci_bus;
+	/*
+	 * pci slot of card, useful for distinguishing multiple identical pci cards
+	 * (negative means don't care)
+	 */
+	int pci_slot;
+	/* sysfs device path of hardware to attach */
+	char *device_path;
+	/* serial number of hardware to attach */
+	char *serial_number;
+};
+
+/*
+ * struct gpib_interface defines the interface
+ * between the board-specific details dealt with in the drivers
+ * and generic interface provided by gpib-common.
+ * This really should be in a different header file.
+ */
+struct gpib_interface {
+	/* name of board */
+	char *name;
+	/* attach() initializes board and allocates resources */
+	int (*attach)(struct gpib_board *board, const struct gpib_board_config *config);
+	/* detach() shuts down board and frees resources */
+	void (*detach)(struct gpib_board *board);
+	/*
+	 * read() should read at most 'length' bytes from the bus into
+	 * 'buffer'.  It should return when it fills the buffer or
+	 * encounters an END (EOI and/or EOS if appropriate).  It should set 'end'
+	 * to be nonzero if the read was terminated by an END, otherwise 'end'
+	 * should be zero.
+	 * Ultimately, this will be changed into or replaced by an asynchronous
+	 * read.  Zero return value for success, negative
+	 * return indicates error.
+	 * bytes_read returns the number of bytes read
+	 */
+	int (*read)(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+		    size_t *bytes_read);
+	/*
+	 * write() should write 'length' bytes from buffer to the bus.
+	 * If the boolean value send_eoi is nonzero, then EOI should
+	 * be sent along with the last byte.  Returns number of bytes
+	 * written or negative value on error.
+	 */
+	int (*write)(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+		     size_t *bytes_written);
+	/*
+	 * command() writes the command bytes in 'buffer' to the bus
+	 * Returns zero on success or negative value on error.
+	 */
+	int (*command)(struct gpib_board *board, u8 *buffer, size_t length,
+		       size_t *bytes_written);
+	/*
+	 * Take control (assert ATN).  If 'asyncronous' is nonzero, take
+	 * control asynchronously (assert ATN immediately without waiting
+	 * for other processes to complete first).  Should not return
+	 * until board becomes controller in charge.  Returns zero on success,
+	 * nonzero on error.
+	 */
+	int (*take_control)(struct gpib_board *board, int asyncronous);
+	/*
+	 * De-assert ATN.  Returns zero on success, nonzero on error.
+	 */
+	int (*go_to_standby)(struct gpib_board *board);
+	/* request/release control of the IFC and REN lines (system controller) */
+	int (*request_system_control)(struct gpib_board *board, int request_control);
+	/*
+	 * Asserts or de-asserts 'interface clear' (IFC) depending on
+	 * boolean value of 'assert'
+	 */
+	void (*interface_clear)(struct gpib_board *board, int assert);
+	/*
+	 * Sends remote enable command if 'enable' is nonzero, disables remote mode
+	 * if 'enable' is zero
+	 */
+	void (*remote_enable)(struct gpib_board *board, int enable);
+	/*
+	 * enable END for reads, when byte 'eos' is received.  If
+	 * 'compare_8_bits' is nonzero, then all 8 bits are compared
+	 * with the eos byte.  Otherwise only the 7 least significant
+	 * bits are compared.
+	 */
+	int (*enable_eos)(struct gpib_board *board, u8 eos, int compare_8_bits);
+	/* disable END on eos byte (END on EOI only)*/
+	void (*disable_eos)(struct gpib_board *board);
+	/* configure parallel poll */
+	void (*parallel_poll_configure)(struct gpib_board *board, u8 configuration);
+	/* conduct parallel poll */
+	int (*parallel_poll)(struct gpib_board *board, u8 *result);
+	/* set/clear ist (individual status bit) */
+	void (*parallel_poll_response)(struct gpib_board *board, int ist);
+	/* select local parallel poll configuration mode PP2 versus remote PP1 */
+	void (*local_parallel_poll_mode)(struct gpib_board *board, int local);
+	/*
+	 * Returns current status of the bus lines.  Should be set to
+	 * NULL if your board does not have the ability to query the
+	 * state of the bus lines.
+	 */
+	int (*line_status)(const struct gpib_board *board);
+	/*
+	 * updates and returns the board's current status.
+	 * The meaning of the bits are specified in gpib_user.h
+	 * in the IBSTA section.  The driver does not need to
+	 * worry about setting the CMPL, END, TIMO, or ERR bits.
+	 */
+	unsigned int (*update_status)(struct gpib_board *board, unsigned int clear_mask);
+	/*
+	 * Sets primary address 0-30 for gpib interface card.
+	 */
+	int (*primary_address)(struct gpib_board *board, unsigned int address);
+	/*
+	 * Sets and enables, or disables secondary address 0-30
+	 * for gpib interface card.
+	 */
+	int (*secondary_address)(struct gpib_board *board, unsigned int address,
+				 int enable);
+	/*
+	 * Sets the byte the board should send in response to a serial poll.
+	 * This function should also start or stop requests for service via
+	 * IEEE 488.2 reqt/reqf, based on MSS (bit 6 of the status_byte).
+	 * If the more flexible serial_poll_response2 is implemented by the
+	 * driver, then this method should be left NULL since it will not
+	 * be used.  This method can generate spurious service requests
+	 * which are allowed by IEEE 488.2, but not ideal.
+	 *
+	 * This method should implement the serial poll response method described
+	 * by IEEE 488.2 section 11.3.3.4.3 "Allowed Coupled Control of
+	 * STB, reqt, and reqf".
+	 */
+	void (*serial_poll_response)(struct gpib_board *board, u8 status_byte);
+	/*
+	 * Sets the byte the board should send in response to a serial poll.
+	 * This function should also request service via IEEE 488.2 reqt/reqf
+	 * based on MSS (bit 6 of the status_byte) and new_reason_for_service.
+	 * reqt should be set true if new_reason_for_service is true,
+	 * and reqf should be set true if MSS is false.	 This function
+	 * will never be called with MSS false and new_reason_for_service
+	 * true simultaneously, so don't worry about that case.
+	 *
+	 * This method implements the serial poll response method described
+	 * by IEEE 488.2 section 11.3.3.4.1 "Preferred Implementation".
+	 *
+	 * If this method is left NULL by the driver, then the user library
+	 * function ibrsv2 will not work.
+	 */
+	void (*serial_poll_response2)(struct gpib_board *board, u8 status_byte,
+				      int new_reason_for_service);
+	/*
+	 * returns the byte the board will send in response to a serial poll.
+	 */
+	u8 (*serial_poll_status)(struct gpib_board *board);
+	/* adjust T1 delay */
+	int (*t1_delay)(struct gpib_board *board, unsigned int nano_sec);
+	/* go to local mode */
+	void (*return_to_local)(struct gpib_board *board);
+	/* board does not support 7 bit eos comparisons */
+	unsigned no_7_bit_eos : 1;
+	/* skip check for listeners before trying to send command bytes */
+	unsigned skip_check_for_command_acceptors : 1;
+};
+
+struct gpib_event_queue {
+	struct list_head event_head;
+	spinlock_t lock; // for access to event list
+	unsigned int num_events;
+	unsigned dropped_event : 1;
+};
+
+static inline void init_event_queue(struct gpib_event_queue *queue)
+{
+	INIT_LIST_HEAD(&queue->event_head);
+	queue->num_events = 0;
+	queue->dropped_event = 0;
+	spin_lock_init(&queue->lock);
+}
+
+/* struct for supporting polling operation when irq is not available */
+struct gpib_pseudo_irq {
+	struct timer_list timer;
+	irqreturn_t (*handler)(int irq, void *arg);
+	struct gpib_board *board;
+	atomic_t active;
+};
+
+static inline void init_gpib_pseudo_irq(struct gpib_pseudo_irq *pseudo_irq)
+{
+	pseudo_irq->handler = NULL;
+	timer_setup(&pseudo_irq->timer, NULL, 0);
+	atomic_set(&pseudo_irq->active, 0);
+}
+
+/* list so we can make a linked list of drivers */
+struct gpib_interface_list {
+	struct list_head list;
+	struct gpib_interface *interface;
+	struct module *module;
+};
+
+/*
+ * One struct gpib_board is allocated for each physical board in the computer.
+ * It provides storage for variables local to each board, and interface
+ * functions for performing operations on the board
+ */
+struct gpib_board {
+	/* functions used by this board */
+	struct gpib_interface *interface;
+	/*
+	 * Pointer to module whose use count we should increment when
+	 * interface is in use
+	 */
+	struct module *provider_module;
+	/* buffer used to store read/write data for this board */
+	u8 *buffer;
+	/* length of buffer */
+	unsigned int buffer_length;
+	/*
+	 * Used to hold the board's current status (see update_status() above)
+	 */
+	unsigned long status;
+	/*
+	 * Driver should only sleep on this wait queue.	 It is special in that the
+	 * core will wake this queue and set the TIMO bit in 'status' when the
+	 * watchdog timer times out.
+	 */
+	wait_queue_head_t wait;
+	/*
+	 * Lock that only allows one process to access this board at a time.
+	 * Has to be first in any locking order, since it can be locked over
+	 * multiple ioctls.
+	 */
+	struct mutex user_mutex;
+	/*
+	 * Mutex which compensates for removal of "big kernel lock" from kernel.
+	 * Should not be held for extended waits.
+	 */
+	struct mutex big_gpib_mutex;
+	/* pid of last process to lock the board mutex */
+	pid_t locking_pid;
+	/* lock for setting locking pid */
+	spinlock_t locking_pid_spinlock;
+	/* Spin lock for dealing with races with the interrupt handler */
+	spinlock_t spinlock;
+	/* Watchdog timer to enable timeouts */
+	struct timer_list timer;
+	/* device of attached driver if any */
+	struct device *dev;
+	/* gpib_common device gpibN */
+	struct device *gpib_dev;
+	/*
+	 * 'private_data' can be used as seen fit by the driver to
+	 * store additional variables for this board
+	 */
+	void *private_data;
+	/* Number of open file descriptors using this board */
+	unsigned int use_count;
+	/* list of open devices connected to this board */
+	struct list_head device_list;
+	/* primary address */
+	unsigned int pad;
+	/* secondary address */
+	int sad;
+	/* timeout for io operations, in microseconds */
+	unsigned int usec_timeout;
+	/* board's parallel poll configuration byte */
+	u8 parallel_poll_configuration;
+	/* t1 delay we are using */
+	unsigned int t1_nano_sec;
+	/* Count that keeps track of whether board is up and running or not */
+	unsigned int online;
+	/* number of processes trying to autopoll */
+	int autospollers;
+	/* autospoll kernel thread */
+	struct task_struct *autospoll_task;
+	/* queue for recording received trigger/clear/ifc events */
+	struct gpib_event_queue event_queue;
+	/* minor number for this board's device file */
+	int minor;
+	/* struct to deal with polling mode*/
+	struct gpib_pseudo_irq pseudo_irq;
+	/* error during autopoll */
+	atomic_t stuck_srq;
+	struct gpib_board_config config;
+	/* Flag that indicates whether board is system controller of the bus */
+	unsigned master : 1;
+	/* individual status bit */
+	unsigned ist : 1;
+	/*
+	 * one means local parallel poll mode ieee 488.1 PP2 (or no parallel poll PP0),
+	 * zero means remote parallel poll configuration mode ieee 488.1 PP1
+	 */
+	unsigned local_ppoll_mode : 1;
+};
+
+/* element of event queue */
+struct gpib_event {
+	struct list_head list;
+	short event_type;
+};
+
+/*
+ * Each board has a list of gpib_status_queue to keep track of all open devices
+ * on the bus, so we know what address to poll when we get a service request
+ */
+struct gpib_status_queue {
+	/* list_head so we can make a linked list of devices */
+	struct list_head list;
+	unsigned int pad;	/* primary gpib address */
+	int sad;	/* secondary gpib address (negative means disabled) */
+	/* stores serial poll bytes for this device */
+	struct list_head status_bytes;
+	unsigned int num_status_bytes;
+	/* number of times this address is opened */
+	unsigned int reference_count;
+	/* flags loss of status byte error due to limit on size of queue */
+	unsigned dropped_byte : 1;
+};
+
+struct gpib_status_byte {
+	struct list_head list;
+	u8 poll_byte;
+};
+
+void init_gpib_status_queue(struct gpib_status_queue *device);
+
+/* Used to store device-descriptor-specific information */
+struct gpib_descriptor {
+	unsigned int pad;	/* primary gpib address */
+	int sad;	/* secondary gpib address (negative means disabled) */
+	atomic_t io_in_progress;
+	unsigned is_board : 1;
+	unsigned autopoll_enabled : 1;
+};
+
+struct gpib_file_private {
+	atomic_t holding_mutex;
+	struct gpib_descriptor *descriptors[GPIB_MAX_NUM_DESCRIPTORS];
+	/* locked while descriptors are being allocated/deallocated */
+	struct mutex descriptors_mutex;
+	unsigned got_module : 1;
+};
+
+#endif	/* __KERNEL__ */
+
+#endif	/* _GPIB_TYPES_H */
diff --git a/drivers/gpib/include/nec7210.h b/drivers/gpib/include/nec7210.h
new file mode 100644
index 000000000000..9835aa5ef4ff
--- /dev/null
+++ b/drivers/gpib/include/nec7210.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _NEC7210_H
+#define _NEC7210_H
+
+#include "gpib_state_machines.h"
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/interrupt.h>
+
+#include "gpib_types.h"
+#include "nec7210_registers.h"
+
+/* struct used to provide variables local to a nec7210 chip */
+struct nec7210_priv {
+#ifdef CONFIG_HAS_IOPORT
+	u32 iobase;
+#endif
+	void __iomem *mmiobase;
+	unsigned int offset;		// offset between successive nec7210 io addresses
+	unsigned int dma_channel;
+	u8 *dma_buffer;
+	unsigned int dma_buffer_length;	// length of dma buffer
+	dma_addr_t dma_buffer_addr;	// bus address of board->buffer for use with dma
+	// software copy of bits written to registers
+	u8 reg_bits[8];
+	u8 auxa_bits;			// bits written to auxiliary register A
+	u8 auxb_bits;			// bits written to auxiliary register B
+	// used to keep track of board's state, bit definitions given below
+	unsigned long state;
+	// lock for chips that extend the nec7210 registers by paging in alternate regs
+	spinlock_t register_page_lock;
+	// wrappers for outb, inb, readb, or writeb
+	u8 (*read_byte)(struct nec7210_priv *priv, unsigned int register_number);
+	void (*write_byte)(struct nec7210_priv *priv, u8 byte, unsigned int register_number);
+	enum nec7210_chipset type;
+	enum talker_function_state talker_state;
+	enum listener_function_state listener_state;
+	void *private;
+	unsigned srq_pending : 1;
+};
+
+static inline void init_nec7210_private(struct nec7210_priv *priv)
+{
+	memset(priv, 0, sizeof(struct nec7210_priv));
+	spin_lock_init(&priv->register_page_lock);
+}
+
+// slightly shorter way to access read_byte and write_byte
+static inline u8 read_byte(struct nec7210_priv *priv, unsigned int register_number)
+{
+	return priv->read_byte(priv, register_number);
+}
+
+static inline void write_byte(struct nec7210_priv *priv, u8 byte, unsigned int register_number)
+{
+	priv->write_byte(priv, byte, register_number);
+}
+
+// struct nec7210_priv.state bit numbers
+enum {
+	PIO_IN_PROGRESS_BN,		// pio transfer in progress
+	DMA_READ_IN_PROGRESS_BN,	// dma read transfer in progress
+	DMA_WRITE_IN_PROGRESS_BN,	// dma write transfer in progress
+	READ_READY_BN,			// board has data byte available to read
+	WRITE_READY_BN,			// board is ready to send a data byte
+	COMMAND_READY_BN,		// board is ready to send a command byte
+	RECEIVED_END_BN,		// received END
+	BUS_ERROR_BN,			// output error has occurred
+	RFD_HOLDOFF_BN,			// rfd holdoff in effect
+	DEV_CLEAR_BN,			// device clear received
+	ADR_CHANGE_BN,			// address state change occurred
+};
+
+// interface functions
+int nec7210_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		 size_t length, int *end, size_t *bytes_read);
+int nec7210_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		  size_t length, int send_eoi, size_t *bytes_written);
+int nec7210_command(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		    size_t length, size_t *bytes_written);
+int nec7210_take_control(struct gpib_board *board, struct nec7210_priv *priv, int syncronous);
+int nec7210_go_to_standby(struct gpib_board *board, struct nec7210_priv *priv);
+int nec7210_request_system_control(struct gpib_board *board,
+				   struct nec7210_priv *priv, int request_control);
+void nec7210_interface_clear(struct gpib_board *board, struct nec7210_priv *priv, int assert);
+void nec7210_remote_enable(struct gpib_board *board, struct nec7210_priv *priv, int enable);
+int nec7210_enable_eos(struct gpib_board *board, struct nec7210_priv *priv, u8 eos_bytes,
+		       int compare_8_bits);
+void nec7210_disable_eos(struct gpib_board *board, struct nec7210_priv *priv);
+unsigned int nec7210_update_status(struct gpib_board *board, struct nec7210_priv *priv,
+				   unsigned int clear_mask);
+unsigned int nec7210_update_status_nolock(struct gpib_board *board, struct nec7210_priv *priv);
+int nec7210_primary_address(const struct gpib_board *board,
+			    struct nec7210_priv *priv, unsigned int address);
+int nec7210_secondary_address(const struct gpib_board *board, struct nec7210_priv *priv,
+			      unsigned int address, int enable);
+int nec7210_parallel_poll(struct gpib_board *board, struct nec7210_priv *priv, u8 *result);
+void nec7210_serial_poll_response(struct gpib_board *board,
+				  struct nec7210_priv *priv, u8 status);
+void nec7210_parallel_poll_configure(struct gpib_board *board,
+				     struct nec7210_priv *priv, unsigned int configuration);
+void nec7210_parallel_poll_response(struct gpib_board *board,
+				    struct nec7210_priv *priv, int ist);
+u8 nec7210_serial_poll_status(struct gpib_board *board, struct nec7210_priv *priv);
+int nec7210_t1_delay(struct gpib_board *board,
+		     struct nec7210_priv *priv, unsigned int nano_sec);
+void nec7210_return_to_local(const struct gpib_board *board, struct nec7210_priv *priv);
+
+// utility functions
+void nec7210_board_reset(struct nec7210_priv *priv, const struct gpib_board *board);
+void nec7210_board_online(struct nec7210_priv *priv, const struct gpib_board *board);
+unsigned int nec7210_set_reg_bits(struct nec7210_priv *priv, unsigned int reg,
+				  unsigned int mask, unsigned int bits);
+void nec7210_set_handshake_mode(struct gpib_board *board, struct nec7210_priv *priv, int mode);
+void nec7210_release_rfd_holdoff(struct gpib_board *board, struct nec7210_priv *priv);
+u8 nec7210_read_data_in(struct gpib_board *board, struct nec7210_priv *priv, int *end);
+
+// wrappers for io functions
+u8 nec7210_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num);
+void nec7210_ioport_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num);
+u8 nec7210_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num);
+void nec7210_iomem_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num);
+u8 nec7210_locking_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num);
+void nec7210_locking_ioport_write_byte(struct nec7210_priv *priv, u8 data,
+				       unsigned int register_num);
+u8 nec7210_locking_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num);
+void nec7210_locking_iomem_write_byte(struct nec7210_priv *priv, u8 data,
+				      unsigned int register_num);
+
+// interrupt service routine
+irqreturn_t nec7210_interrupt(struct gpib_board *board, struct nec7210_priv *priv);
+irqreturn_t nec7210_interrupt_have_status(struct gpib_board *board,
+					  struct nec7210_priv *priv, int status1, int status2);
+
+#endif	//_NEC7210_H
diff --git a/drivers/gpib/include/nec7210_registers.h b/drivers/gpib/include/nec7210_registers.h
new file mode 100644
index 000000000000..067983d7a07f
--- /dev/null
+++ b/drivers/gpib/include/nec7210_registers.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _NEC7210_REGISTERS_H
+#define _NEC7210_REGISTERS_H
+
+enum nec7210_chipset {
+	NEC7210,	// The original
+	TNT4882,	// NI
+	NAT4882,	// NI
+	CB7210,		// measurement computing
+	IOT7210,	// iotech
+	IGPIB7210,	// Ines
+	TNT5004,	// NI (minor differences to TNT4882)
+};
+
+/*
+ * nec7210 register numbers (might need to be multiplied by
+ * a board-dependent offset to get the actual io address offset)
+ */
+// write registers
+enum nec7210_write_regs {
+	CDOR,	// command/data out
+	IMR1,	// interrupt mask 1
+	IMR2,	// interrupt mask 2
+	SPMR,	// serial poll mode
+	ADMR,	// address mode
+	AUXMR,	// auxiliary mode
+	ADR,	// address
+	EOSR,	// end-of-string
+
+	// nec7210 has 8 registers
+	nec7210_num_registers = 8,
+};
+
+// read registers
+enum nec7210_read_regs {
+	DIR,	// data in
+	ISR1,	// interrupt status 1
+	ISR2,	// interrupt status 2
+	SPSR,	// serial poll status
+	ADSR,	// address status
+	CPTR,	// command pass through
+	ADR0,	// address 1
+	ADR1,	// address 2
+};
+
+// bit definitions common to nec-7210 compatible registers
+
+// ISR1: interrupt status register 1
+enum isr1_bits {
+	HR_DI = (1 << 0),
+	HR_DO = (1 << 1),
+	HR_ERR = (1 << 2),
+	HR_DEC = (1 << 3),
+	HR_END = (1 << 4),
+	HR_DET = (1 << 5),
+	HR_APT = (1 << 6),
+	HR_CPT = (1 << 7),
+};
+
+// IMR1: interrupt mask register 1
+enum imr1_bits {
+	HR_DIIE = (1 << 0),
+	HR_DOIE = (1 << 1),
+	HR_ERRIE = (1 << 2),
+	HR_DECIE = (1 << 3),
+	HR_ENDIE = (1 << 4),
+	HR_DETIE = (1 << 5),
+	HR_APTIE = (1 << 6),
+	HR_CPTIE = (1 << 7),
+};
+
+// ISR2, interrupt status register 2
+enum isr2_bits {
+	HR_ADSC = (1 << 0),
+	HR_REMC = (1 << 1),
+	HR_LOKC = (1 << 2),
+	HR_CO = (1 << 3),
+	HR_REM = (1 << 4),
+	HR_LOK = (1 << 5),
+	HR_SRQI = (1 << 6),
+	HR_INT = (1 << 7),
+};
+
+// IMR2, interrupt mask register 2
+enum imr2_bits {
+	// all the bits in this register that enable interrupts
+	IMR2_ENABLE_INTR_MASK = 0x4f,
+	HR_ACIE = (1 << 0),
+	HR_REMIE = (1 << 1),
+	HR_LOKIE = (1 << 2),
+	HR_COIE = (1 << 3),
+	HR_DMAI = (1 << 4),
+	HR_DMAO = (1 << 5),
+	HR_SRQIE = (1 << 6),
+};
+
+// SPSR, serial poll status register
+enum spsr_bits {
+	HR_PEND = (1 << 6),
+};
+
+// SPMR, serial poll mode register
+enum spmr_bits {
+	HR_RSV = (1 << 6),
+};
+
+// ADSR, address status register
+enum adsr_bits {
+	HR_MJMN = (1 << 0),
+	HR_TA = (1 << 1),
+	HR_LA = (1 << 2),
+	HR_TPAS = (1 << 3),
+	HR_LPAS = (1 << 4),
+	HR_SPMS = (1 << 5),
+	HR_NATN = (1 << 6),
+	HR_CIC = (1 << 7),
+};
+
+// ADMR, address mode register
+enum admr_bits {
+	HR_ADM0 = (1 << 0),
+	HR_ADM1 = (1 << 1),
+	HR_TRM0 = (1 << 4),
+	HR_TRM1 = (1 << 5),
+	HR_TRM_EOIOE_TRIG = 0,
+	HR_TRM_CIC_TRIG = HR_TRM0,
+	HR_TRM_CIC_EOIOE = HR_TRM1,
+	HR_TRM_CIC_PE = HR_TRM0 | HR_TRM1,
+	HR_LON = (1 << 6),
+	HR_TON = (1 << 7),
+};
+
+// ADR, bits used in address0, address1 and address0/1 registers
+enum adr_bits {
+	ADDRESS_MASK = 0x1f,	/* mask to specify lower 5 bits */
+	HR_DL = (1 << 5),
+	HR_DT = (1 << 6),
+	HR_ARS = (1 << 7),
+};
+
+// ADR1, address1 register
+enum adr1_bits {
+	HR_EOI = (1 << 7),
+};
+
+// AUXMR, auxiliary mode register
+enum auxmr_bits {
+	ICR = 0x20,
+	PPR = 0x60,
+	AUXRA = 0x80,
+	AUXRB = 0xa0,
+	AUXRE = 0xc0,
+};
+
+// auxra, auxiliary register A
+enum auxra_bits {
+	HR_HANDSHAKE_MASK = 0x3,
+	HR_HLDA = 0x1,
+	HR_HLDE = 0x2,
+	HR_LCM = 0x3,	/* auxra listen continuous */
+	HR_REOS = 0x4,
+	HR_XEOS = 0x8,
+	HR_BIN = 0x10,
+};
+
+// auxrb, auxiliary register B
+enum auxrb_bits {
+	HR_CPTE = (1 << 0),
+	HR_SPEOI = (1 << 1),
+	HR_TRI = (1 << 2),
+	HR_INV = (1 << 3),
+	HR_ISS = (1 << 4),
+};
+
+enum auxre_bits {
+	HR_DAC_HLD_DCAS = 0x1,	/* perform DAC holdoff on receiving clear */
+	HR_DAC_HLD_DTAS = 0x2,	/* perform DAC holdoff on receiving trigger */
+};
+
+// parallel poll register
+enum ppr_bits {
+	HR_PPS = (1 << 3),
+	HR_PPU = (1 << 4),
+};
+
+/* 7210 Auxiliary Commands */
+enum aux_cmds {
+	AUX_PON = 0x0,	/* Immediate Execute pon                  */
+	AUX_CPPF = 0x1,	/* Clear Parallel Poll Flag               */
+	AUX_CR = 0x2,	/* Chip Reset                             */
+	AUX_FH = 0x3,	/* Finish Handshake                       */
+	AUX_TRIG = 0x4,	/* Trigger                                */
+	AUX_RTL = 0x5,	/* Return to local                        */
+	AUX_SEOI = 0x6,	/* Send EOI                               */
+	AUX_NVAL = 0x7,	/* Non-Valid Secondary Command or Address */
+	AUX_SPPF = 0x9,	/* Set Parallel Poll Flag                 */
+	AUX_VAL = 0xf,	/* Valid Secondary Command or Address     */
+	AUX_GTS = 0x10,	/* Go To Standby                          */
+	AUX_TCA = 0x11,	/* Take Control Asynchronously            */
+	AUX_TCS = 0x12,	/* Take Control Synchronously             */
+	AUX_LTN = 0x13,	/* Listen                                 */
+	AUX_DSC = 0x14,	/* Disable System Control                 */
+	AUX_CIFC = 0x16,	/* Clear IFC                              */
+	AUX_CREN = 0x17,	/* Clear REN                              */
+	AUX_TCSE = 0x1a,	/* Take Control Synchronously on End      */
+	AUX_LTNC = 0x1b,	/* Listen in Continuous Mode              */
+	AUX_LUN = 0x1c,	/* Local Unlisten                         */
+	AUX_EPP = 0x1d,	/* Execute Parallel Poll                  */
+	AUX_SIFC = 0x1e,	/* Set IFC                                */
+	AUX_SREN = 0x1f,	/* Set REN                                */
+};
+
+#endif	//_NEC7210_REGISTERS_H
diff --git a/drivers/gpib/include/plx9050.h b/drivers/gpib/include/plx9050.h
new file mode 100644
index 000000000000..c911b285a0ca
--- /dev/null
+++ b/drivers/gpib/include/plx9050.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *  Header for plx9050 pci chip
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _PLX9050_GPIB_H
+#define _PLX9050_GPIB_H
+
+// plx pci chip registers (their bits are defined in the enums that follow)
+enum {
+	PLX9050_INTCSR_REG = 0x4c,	// bits: enum plx9050_intcsr_bits
+	PLX9050_CNTRL_REG = 0x50	// bits: enum plx9050_cntrl_bits
+};
+
+enum plx9050_intcsr_bits {
+	PLX9050_LINTR1_EN_BIT = 0x1,
+	PLX9050_LINTR1_POLARITY_BIT = 0x2,
+	PLX9050_LINTR1_STATUS_BIT = 0x4,
+	PLX9050_LINTR2_EN_BIT = 0x8,
+	PLX9050_LINTR2_POLARITY_BIT = 0x10,
+	PLX9050_LINTR2_STATUS_BIT = 0x20,
+	PLX9050_PCI_INTR_EN_BIT = 0x40,
+	PLX9050_SOFT_INTR_BIT = 0x80,
+	PLX9050_LINTR1_SELECT_ENABLE_BIT = 0x100,	// 9052 extension
+	PLX9050_LINTR2_SELECT_ENABLE_BIT = 0x200,	// 9052 extension
+	PLX9050_LINTR1_EDGE_CLEAR_BIT = 0x400,		// 9052 extension
+	PLX9050_LINTR2_EDGE_CLEAR_BIT = 0x800,		// 9052 extension
+};
+
+enum plx9050_cntrl_bits {	// bits of PLX9050_CNTRL_REG
+	PLX9050_WAITO_NOT_USER0_SELECT_BIT = 0x1,
+	PLX9050_USER0_OUTPUT_BIT = 0x2,
+	PLX9050_USER0_DATA_BIT = 0x4,
+	PLX9050_LLOCK_NOT_USER1_SELECT_BIT = 0x8,
+	PLX9050_USER1_OUTPUT_BIT = 0x10,
+	PLX9050_USER1_DATA_BIT = 0x20,
+	PLX9050_CS2_NOT_USER2_SELECT_BIT = 0x40,
+	PLX9050_USER2_OUTPUT_BIT = 0x80,
+	PLX9050_USER2_DATA_BIT = 0x100,
+	PLX9050_CS3_NOT_USER3_SELECT_BIT = 0x200,
+	PLX9050_USER3_OUTPUT_BIT = 0x400,
+	PLX9050_USER3_DATA_BIT = 0x800,
+	PLX9050_PCIBAR_ENABLE_MASK = 0x3000,	// two-bit field (bits 12-13); values follow
+	PLX9050_PCIBAR_MEMORY_AND_IO_ENABLE_BITS = 0x0,
+	PLX9050_PCIBAR_MEMORY_NO_IO_ENABLE_BITS = 0x1000,
+	PLX9050_PCIBAR_IO_NO_MEMORY_ENABLE_BITS = 0x2000,
+	PLX9050_PCIBAR_MEMORY_AND_IO_TOO_ENABLE_BITS = 0x3000,
+	PLX9050_PCI_READ_MODE_BIT = 0x4000,
+	PLX9050_PCI_READ_WITH_WRITE_FLUSH_MODE_BIT = 0x8000,
+	PLX9050_PCI_READ_NO_FLUSH_MODE_BIT = 0x10000,
+	PLX9050_PCI_READ_NO_WRITE_MODE_BIT = 0x20000,
+	PLX9050_PCI_WRITE_MODE_BIT = 0x40000,
+	PLX9050_PCI_RETRY_DELAY_MASK = 0x780000, // bits 19-22; see PLX9050_PCI_RETRY_DELAY_BITS()
+	PLX9050_DIRECT_SLAVE_LOCK_ENABLE_BIT = 0x800000,
+	PLX9050_EEPROM_CLOCK_BIT = 0x1000000,
+	PLX9050_EEPROM_CHIP_SELECT_BIT = 0x2000000,
+	PLX9050_WRITE_TO_EEPROM_BIT = 0x4000000,
+	PLX9050_READ_EEPROM_DATA_BIT = 0x8000000,
+	PLX9050_EEPROM_VALID_BIT = 0x10000000,
+	PLX9050_RELOAD_CONFIG_REGISTERS_BIT = 0x20000000,
+	PLX9050_PCI_SOFTWARE_RESET_BIT = 0x40000000,
+	PLX9050_MASK_REVISION_BIT = 0x80000000	// bit 31: larger than INT_MAX for a 32-bit int
+};
+
+static inline unsigned int PLX9050_PCI_RETRY_DELAY_BITS(unsigned int clocks)
+{	// builds the retry delay field of the CNTRL register from a count of clocks
+	return ((clocks / 8) << 19) & PLX9050_PCI_RETRY_DELAY_MASK; // clocks/8; excess masked off
+}
+
+#endif	// _PLX9050_GPIB_H
diff --git a/drivers/gpib/include/quancom_pci.h b/drivers/gpib/include/quancom_pci.h
new file mode 100644
index 000000000000..cdaf0d056be9
--- /dev/null
+++ b/drivers/gpib/include/quancom_pci.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ * Quancom pci stuff
+ * copyright (C) 2005 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _QUANCOM_PCI_H
+#define _QUANCOM_PCI_H
+
+/* quancom registers */
+enum quancom_regs {
+	QUANCOM_IRQ_CONTROL_STATUS_REG = 0xfc,
+};
+
+enum quancom_irq_control_status_bits {	/* bits of QUANCOM_IRQ_CONTROL_STATUS_REG */
+	QUANCOM_IRQ_ASSERTED_BIT = 0x1, /* readable */
+	/* (any write to the register clears the interrupt) */
+	QUANCOM_IRQ_ENABLE_BIT = 0x4, /* writeable */
+};
+
+#endif	// _QUANCOM_PCI_H
diff --git a/drivers/gpib/include/tms9914.h b/drivers/gpib/include/tms9914.h
new file mode 100644
index 000000000000..e66b75e0fda8
--- /dev/null
+++ b/drivers/gpib/include/tms9914.h
@@ -0,0 +1,280 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _TMS9914_H
+#define _TMS9914_H
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include "gpib_state_machines.h"
+#include "gpib_types.h"
+
+enum tms9914_holdoff_mode {	/* argument of tms9914_set_holdoff_mode() */
+	TMS9914_HOLDOFF_NONE,	/* no holdoff */
+	TMS9914_HOLDOFF_EOI,	/* holdoff on EOI only (cf. AUX_HLDE) */
+	TMS9914_HOLDOFF_ALL,	/* holdoff on all data (cf. AUX_HLDA) */
+};
+
+/* struct used to provide variables local to a tms9914 chip */
+struct tms9914_priv {
+#ifdef CONFIG_HAS_IOPORT
+	u32 iobase;		// port io base; member exists only with CONFIG_HAS_IOPORT
+#endif
+	void __iomem *mmiobase;	// memory-mapped io base
+	unsigned int offset;	// offset between successive tms9914 io addresses
+	unsigned int dma_channel;
+	// software copy of bits written to interrupt mask registers
+	u8 imr0_bits, imr1_bits;
+	// bits written to address mode register
+	u8 admr_bits;
+	u8 auxa_bits;		// bits written to auxiliary register A
+	// used to keep track of board's state, bit numbers (*_BN) given in the enum below
+	unsigned long state;
+	u8 eos;			// eos character
+	short eos_flags;
+	u8 spoll_status;
+	enum tms9914_holdoff_mode holdoff_mode;	// one of TMS9914_HOLDOFF_*
+	unsigned int ppoll_line;
+	enum talker_function_state talker_state;
+	enum listener_function_state listener_state;
+	unsigned ppoll_sense : 1;	// this and the following members are one-bit flags
+	unsigned ppoll_enable : 1;
+	unsigned ppoll_configure_state : 1;
+	unsigned primary_listen_addressed : 1;
+	unsigned primary_talk_addressed : 1;
+	unsigned holdoff_on_end : 1;
+	unsigned holdoff_on_all : 1;
+	unsigned holdoff_active : 1;
+	// wrappers for outb, inb, readb, or writeb; invoked via read_byte()/write_byte() below
+	u8 (*read_byte)(struct tms9914_priv *priv, unsigned int register_number);
+	void (*write_byte)(struct tms9914_priv *priv, u8 byte, unsigned int
+			   register_number);
+};
+
+// slightly shorter way to access read_byte and write_byte
+static inline u8 read_byte(struct tms9914_priv *priv, unsigned int register_number)
+{	// returns whatever the priv->read_byte hook returns for register_number
+	return priv->read_byte(priv, register_number);
+}
+
+static inline void write_byte(struct tms9914_priv *priv, u8 byte, unsigned int register_number)
+{	// forwards to the priv->write_byte hook; argument order is (byte, register_number)
+	priv->write_byte(priv, byte, register_number);
+}
+
+// struct tms9914_priv.state bit numbers
+enum {
+	PIO_IN_PROGRESS_BN,		// pio transfer in progress
+	DMA_READ_IN_PROGRESS_BN,	// dma read transfer in progress
+	DMA_WRITE_IN_PROGRESS_BN,	// dma write transfer in progress
+	READ_READY_BN,			// board has data byte available to read
+	WRITE_READY_BN,			// board is ready to send a data byte
+	COMMAND_READY_BN,		// board is ready to send a command byte
+	RECEIVED_END_BN,		// received END
+	BUS_ERROR_BN,			// bus error
+	DEV_CLEAR_BN,			// device clear received
+};
+
+// interface functions
+int tms9914_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		 size_t length, int *end, size_t *bytes_read);
+int tms9914_write(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		  size_t length, int send_eoi, size_t *bytes_written);
+int tms9914_command(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		    size_t length, size_t *bytes_written);
+int tms9914_take_control(struct gpib_board *board, struct tms9914_priv *priv, int syncronous);
+/*
+ * alternate version of tms9914_take_control which works around buggy tcs
+ * implementation.
+ */
+int tms9914_take_control_workaround(struct gpib_board *board, struct tms9914_priv *priv,
+				    int syncronous);
+int tms9914_go_to_standby(struct gpib_board *board, struct tms9914_priv *priv);
+int tms9914_request_system_control(struct gpib_board *board, struct tms9914_priv *priv,
+				   int request_control);
+void tms9914_interface_clear(struct gpib_board *board, struct tms9914_priv *priv, int assert);
+void tms9914_remote_enable(struct gpib_board *board, struct tms9914_priv *priv, int enable);
+int tms9914_enable_eos(struct gpib_board *board, struct tms9914_priv *priv, u8 eos_bytes,
+		       int compare_8_bits);
+void tms9914_disable_eos(struct gpib_board *board, struct tms9914_priv *priv);
+unsigned int tms9914_update_status(struct gpib_board *board, struct tms9914_priv *priv,
+				   unsigned int clear_mask);
+int tms9914_primary_address(struct gpib_board *board,
+			    struct tms9914_priv *priv, unsigned int address);
+int tms9914_secondary_address(struct gpib_board *board, struct tms9914_priv *priv,
+			      unsigned int address, int enable);
+int tms9914_parallel_poll(struct gpib_board *board, struct tms9914_priv *priv, u8 *result);
+void tms9914_parallel_poll_configure(struct gpib_board *board,
+				     struct tms9914_priv *priv, u8 config);
+void tms9914_parallel_poll_response(struct gpib_board *board,
+				    struct tms9914_priv *priv, int ist);
+void tms9914_serial_poll_response(struct gpib_board *board,
+				  struct tms9914_priv *priv, u8 status);
+u8 tms9914_serial_poll_status(struct gpib_board *board, struct tms9914_priv *priv);
+int tms9914_line_status(const struct gpib_board *board, struct tms9914_priv *priv);
+unsigned int tms9914_t1_delay(struct gpib_board *board, struct tms9914_priv *priv,
+			      unsigned int nano_sec);
+void tms9914_return_to_local(const struct gpib_board *board, struct tms9914_priv *priv);
+
+// utility functions
+void tms9914_board_reset(struct tms9914_priv *priv);
+void tms9914_online(struct gpib_board *board, struct tms9914_priv *priv);
+void tms9914_release_holdoff(struct tms9914_priv *priv);
+void tms9914_set_holdoff_mode(struct tms9914_priv *priv, enum tms9914_holdoff_mode mode);
+
+// wrappers for io functions
+u8 tms9914_ioport_read_byte(struct tms9914_priv *priv, unsigned int register_num);
+void tms9914_ioport_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num);
+u8 tms9914_iomem_read_byte(struct tms9914_priv *priv, unsigned int register_num);
+void tms9914_iomem_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num);
+
+// interrupt service routine
+irqreturn_t tms9914_interrupt(struct gpib_board *board, struct tms9914_priv *priv);
+irqreturn_t tms9914_interrupt_have_status(struct gpib_board *board, struct tms9914_priv *priv,
+					  int status1,	int status2);
+
+// tms9914 has 8 registers
+enum {
+	ms9914_num_registers = 8,	// NOTE(review): name lacks the 't' of tms9914 - typo?
+};
+
+/*
+ * tms9914 register numbers (might need to be multiplied by
+ * a board-dependent offset to get the actual io address offset)
+ */
+// write registers (no write register is defined for number 2)
+enum {
+	IMR0 = 0,	/* interrupt mask 0          */
+	IMR1 = 1,	/* interrupt mask 1          */
+	AUXCR = 3,	/* auxiliary command         */
+	ADR = 4,	/* address register          */
+	SPMR = 5,	/* serial poll mode register */
+	PPR = 6,	/* parallel poll             */
+	CDOR = 7,	/* data out register         */
+};
+
+// read registers (no read register is defined for numbers 4 and 5)
+enum {
+	ISR0 = 0,	/* interrupt status 0        */
+	ISR1 = 1,	/* interrupt status 1        */
+	ADSR = 2,	/* address status            */
+	BSR = 3,	/* bus status                */
+	CPTR = 6,	/* command pass thru         */
+	DIR = 7,	/* data in register          */
+};
+
+// bit definitions common to tms9914 compatible registers
+
+/* ISR0   - Register bits */
+enum isr0_bits {
+	HR_MAC = (1 << 0),   /* My Address Change           */
+	HR_RLC = (1 << 1),   /* Remote/Local change         */
+	HR_SPAS = (1 << 2),   /* Serial Poll active State    */
+	HR_END = (1 << 3),   /* END (EOI or EOS)            */
+	HR_BO = (1 << 4),   /* Byte Out                    */
+	HR_BI = (1 << 5),   /* Byte In                     */
+};
+
+/* IMR0   - Register bits (same bit positions as the ISR0 bits) */
+enum imr0_bits {
+	HR_MACIE = (1 << 0),   /* mask bit for HR_MAC  */
+	HR_RLCIE = (1 << 1),   /* mask bit for HR_RLC  */
+	HR_SPASIE = (1 << 2),   /* mask bit for HR_SPAS */
+	HR_ENDIE = (1 << 3),   /* mask bit for HR_END  */
+	HR_BOIE = (1 << 4),   /* mask bit for HR_BO   */
+	HR_BIIE = (1 << 5),   /* mask bit for HR_BI   */
+};
+
+/* ISR1   - Register bits */
+enum isr1_bits {
+	HR_IFC = (1 << 0),   /* IFC asserted                */
+	HR_SRQ = (1 << 1),   /* SRQ asserted                */
+	HR_MA = (1 << 2),    /* My Address                  */
+	HR_DCAS = (1 << 3),  /* Device Clear active State   */
+	HR_APT = (1 << 4),   /* Address pass Through        */
+	HR_UNC = (1 << 5),   /* Unrecognized Command        */
+	HR_ERR = (1 << 6),   /* Data Transmission Error     */
+	HR_GET = (1 << 7),   /* Group execute Trigger       */
+};
+
+/* IMR1   - Register bits (same bit positions as the ISR1 bits) */
+enum imr1_bits {
+	HR_IFCIE = (1 << 0),   /* mask bit for HR_IFC  */
+	HR_SRQIE = (1 << 1),   /* mask bit for HR_SRQ  */
+	HR_MAIE = (1 << 2),    /* mask bit for HR_MA   */
+	HR_DCASIE = (1 << 3),  /* mask bit for HR_DCAS */
+	HR_APTIE = (1 << 4),   /* mask bit for HR_APT  */
+	HR_UNCIE = (1 << 5),   /* mask bit for HR_UNC  */
+	HR_ERRIE = (1 << 6),   /* mask bit for HR_ERR  */
+	HR_GETIE = (1 << 7),   /* mask bit for HR_GET  */
+};
+
+/* ADSR   - Register bits */
+enum adsr_bits {
+	HR_ULPA = (1 << 0),   /* Store last address LSB         */
+	HR_TA = (1 << 1),     /* Talker addressed               */
+	HR_LA = (1 << 2),     /* Listener addressed             */
+	HR_TPAS = (1 << 3),   /* talker primary address state   */
+	HR_LPAS = (1 << 4),   /* listener primary address state */
+	HR_ATN = (1 << 5),    /* ATN active                     */
+	HR_LLO = (1 << 6),    /* LLO active                     */
+	HR_REM = (1 << 7),    /* REM active                     */
+};
+
+/* ADR   - Register bits */
+enum adr_bits {
+	ADDRESS_MASK = 0x1f,	/* mask to specify lower 5 bits for ADR */
+	HR_DAT = (1 << 5),      /* disable talker */
+	HR_DAL = (1 << 6),      /* disable listener */
+	HR_EDPA = (1 << 7),     /* enable dual primary addressing */
+};
+
+enum bus_status_bits {
+	BSR_REN_BIT = 0x1,
+	BSR_IFC_BIT = 0x2,
+	BSR_SRQ_BIT = 0x4,
+	BSR_EOI_BIT = 0x8,
+	BSR_NRFD_BIT = 0x10,
+	BSR_NDAC_BIT = 0x20,
+	BSR_DAV_BIT = 0x40,
+	BSR_ATN_BIT = 0x80,
+};
+
+/*---------------------------------------------------------*/
+/* TMS 9914 Auxiliary Commands                             */
+/*---------------------------------------------------------*/
+
+enum aux_cmd_bits {
+	AUX_CS = 0x80,			/* set bit instead of clearing it, used with commands marked 'd' below */
+	AUX_CHIP_RESET = 0x0,		/* d Chip reset                   */
+	AUX_INVAL = 0x1,		/* release dac holdoff, invalid command byte */
+	AUX_VAL = (AUX_INVAL | AUX_CS),	/* release dac holdoff, valid command byte   */
+	AUX_RHDF = 0x2,			/* X Release RFD holdoff          */
+	AUX_HLDA = 0x3,			/* d holdoff on all data          */
+	AUX_HLDE = 0x4,			/* d holdoff on EOI only          */
+	AUX_NBAF = 0x5,			/* X Set new byte available false */
+	AUX_FGET = 0x6,			/* d force GET                    */
+	AUX_RTL = 0x7,			/* d return to local              */
+	AUX_SEOI = 0x8,			/* X send EOI with next byte      */
+	AUX_LON = 0x9,			/* d Listen only                  */
+	AUX_TON = 0xa,			/* d Talk only                    */
+	AUX_GTS = 0xb,			/* X goto standby                 */
+	AUX_TCA = 0xc,			/* X take control asynchronously  */
+	AUX_TCS = 0xd,			/* X take control synchronously   */
+	AUX_RPP = 0xe,			/* d Request parallel poll        */
+	AUX_SIC = 0xf,			/* d send interface clear         */
+	AUX_SRE = 0x10,			/* d send remote enable           */
+	AUX_RQC = 0x11,			/* X request control              */
+	AUX_RLC = 0x12,			/* X release control              */
+	AUX_DAI = 0x13,			/* d disable all interrupts       */
+	AUX_PTS = 0x14,			/* X pass through next secondary  */
+	AUX_STDL = 0x15,		/* d short T1 delay               */
+	AUX_SHDW = 0x16,		/* d shadow handshake             */
+	AUX_VSTDL = 0x17,		/* d very short T1 delay (smj9914 extension)   */
+	AUX_RSV2 = 0x18,		/* d request service bit 2 (smj9914 extension) */
+};
+
+#endif	//_TMS9914_H
diff --git a/drivers/gpib/include/tnt4882_registers.h b/drivers/gpib/include/tnt4882_registers.h
new file mode 100644
index 000000000000..d54c4cc61168
--- /dev/null
+++ b/drivers/gpib/include/tnt4882_registers.h
@@ -0,0 +1,192 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *    copyright		   : (C) 2002, 2004 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _TNT4882_REGISTERS_H
+#define _TNT4882_REGISTERS_H
+
+// tnt4882 register offsets (several read and write registers share one offset)
+enum {
+	ACCWR = 0x5,
+	// offset of auxiliary command register in 9914 mode
+	AUXCR = 0x6,
+	INTRT = 0x7,
+	// register number for auxiliary command register when swap bit is set (9914 mode)
+	SWAPPED_AUXCR = 0xa,
+	HSSEL = 0xd,	// handshake select register
+	CNT2 = 0x9,
+	CNT3 = 0xb,
+	CFG = 0x10,	// configuration register
+	SASR = 0x1b,
+	IMR0 = 0x1d,
+	IMR3 = 0x12,
+	CNT0 = 0x14,
+	CNT1 = 0x16,
+	KEYREG = 0x17,	// key control register (7210 mode only)
+	CSR = KEYREG,	// alias: same offset as KEYREG
+	FIFOB = 0x18,
+	FIFOA = 0x19,
+	CCR = 0x1a,	// carry cycle register
+	CMDR = 0x1c,	// command register
+	TIMER = 0x1e,	// timer register
+
+	STS1 = 0x10,	// T488 Status Register 1 (same offset as CFG)
+	STS2 = 0x1c,	// T488 Status Register 2 (same offset as CMDR)
+	ISR0 = IMR0,	// alias: same offset as IMR0
+	ISR3 = 0x1a,	// T488 Interrupt Status Register 3 (same offset as CCR)
+	BCR = 0x1f,	// bus control/status register
+	BSR = BCR,	// alias: same offset as BCR
+};
+
+enum {
+	tnt_pagein_offset = 0x11,
+};
+
+/*============================================================*/
+
+/* TURBO-488 registers bit definitions */
+
+enum bus_control_status_bits {
+	BCSR_REN_BIT = 0x1,
+	BCSR_IFC_BIT = 0x2,
+	BCSR_SRQ_BIT = 0x4,
+	BCSR_EOI_BIT = 0x8,
+	BCSR_NRFD_BIT = 0x10,
+	BCSR_NDAC_BIT = 0x20,
+	BCSR_DAV_BIT = 0x40,
+	BCSR_ATN_BIT = 0x80,
+};
+
+/* CFG -- Configuration Register (write only) */
+enum cfg_bits {
+	TNT_COMMAND = 0x80,	/* bytes are command bytes instead of data bytes
+				 * (tnt4882 one-chip and newer only?)
+				 */
+	TNT_TLCHE = (1 << 6),	/* halt transfer on imr0, imr1, or imr2 interrupt */
+	TNT_IN = (1 << 5),	/* transfer is GPIB read		 */
+	TNT_A_B = (1 << 4),	/* order to use fifos 1=fifo A first(big endian),
+				 * 0=fifo b first(little endian)
+				 */
+	TNT_CCEN = (1 << 3),	/* enable carry cycle		      */
+	TNT_TMOE = (1 << 2),	/* enable CPU bus time limit	      */
+	TNT_TIM_BYTN = (1 << 1),	/* tmot reg is: 1=125ns clocks, 0=num bytes */
+	TNT_B_16BIT = (1 << 0),	/* 1=FIFO is 16-bit register, 0=8-bit */
+};
+
+/* CMDR -- Command Register */
+enum cmdr_bits {
+	CLRSC = 0x2,	/* clear the system controller bit */
+	SETSC = 0x3,	/* set the system controller bit */
+	GO = 0x4,	/* start fifos */
+	STOP = 0x8,	/* stop fifos */
+	RESET_FIFO = 0x10,	/* reset the FIFOs		*/
+	SOFT_RESET = 0x22,	/* issue a software reset	*/
+	HARD_RESET = 0x40	/* 500x only? */
+};
+
+/* HSSEL -- handshake select register (write only) */
+enum hssel_bits {
+	TNT_ONE_CHIP_BIT = 0x1,
+	NODMA = 0x10,
+	TNT_GO2SIDS_BIT = 0x20,
+};
+
+/* IMR0 -- Interrupt Mode Register 0 */
+enum imr0_bits {
+	TNT_SYNCIE_BIT = 0x1, /* handshake sync */
+	TNT_TOIE_BIT = 0x2, /* timeout */
+	TNT_ATNIE_BIT = 0x4, /* ATN interrupt */
+	TNT_IFCIE_BIT = 0x8,	/* interface clear interrupt */
+	TNT_BTO_BIT = 0x10, /* byte timeout */
+	TNT_NLEN_BIT = 0x20,	/* treat new line as EOS char */
+	TNT_STBOIE_BIT = 0x40,	/* status byte out  */
+	TNT_IMR0_ALWAYS_BITS = 0x80,	/* always set this bit on write */
+};
+
+/* ISR0 -- Interrupt Status Register 0 */
+enum isr0_bits {
+	TNT_SYNC_BIT = 0x1,	/* handshake sync */
+	TNT_TO_BIT = 0x2,	/* timeout */
+	TNT_ATNI_BIT = 0x4,	/* ATN interrupt */
+	TNT_IFCI_BIT = 0x8,	/* interface clear interrupt */
+	TNT_EOS_BIT = 0x10,	/* end of string */
+	TNT_NL_BIT = 0x20,	/* new line receive */
+	TNT_STBO_BIT = 0x40,	/* status byte out  */
+	TNT_NBA_BIT = 0x80,	/* new byte available */
+};
+
+/* ISR3 -- Interrupt Status Register 3 (read only) */
+enum isr3_bits {
+	HR_DONE = (1 << 0),	/* transfer done */
+	HR_TLCI = (1 << 1),	/* isr0, isr1, or isr2 interrupt asserted */
+	HR_NEF = (1 << 2),	/* NOT empty fifo */
+	HR_NFF = (1 << 3),	/* NOT full fifo */
+	HR_STOP = (1 << 4),	/* fifo empty or STOP command issued */
+	HR_SRQI_CIC = (1 << 5),	/* SRQ asserted and we are CIC (500x only?)*/
+	HR_INTR = (1 << 7),	/* isr3 interrupt active */
+};
+
+enum keyreg_bits {
+	MSTD = 0x20,	/* enable 350ns T1 delay */
+};
+
+/* STS1 -- Status Register 1 (read only) */
+enum sts1_bits {
+	S_DONE = 0x80,	/* DMA done			      */
+	S_SC = 0x40,	/* is system controller		      */
+	S_IN = 0x20,	/* DMA in (to memory)		      */
+	S_DRQ = 0x10,	/* DRQ line (for diagnostics)	      */
+	S_STOP = 0x08,	/* DMA stopped			      */
+	S_NDAV = 0x04,	/* inverse of DAV		      */
+	S_HALT = 0x02,	/* status of transfer machine	      */
+	S_GSYNC = 0x01,	/* indicates if GPIB is in sync w I/O */
+};
+
+/* STS2 -- Status Register 2 */
+enum sts2_bits {
+	AFFN = (1 << 3),	/* "A full FIFO NOT"  (0=FIFO full)  */
+	AEFN = (1 << 2),	/* "A empty FIFO NOT" (0=FIFO empty) */
+	BFFN = (1 << 1),	/* "B full FIFO NOT"  (0=FIFO full)  */
+	BEFN = (1 << 0),	/* "B empty FIFO NOT" (0=FIFO empty) */
+};
+
+// Auxiliary commands
+enum tnt4882_aux_cmds {
+	AUX_9914 = 0x15,	// switch to 9914 mode
+	AUX_REQT = 0x18,
+	AUX_REQF = 0x19,
+	AUX_PAGEIN = 0x50,	// page in alternate registers
+	AUX_HLDI = 0x51,	// rfd holdoff immediately
+	AUX_CLEAR_END = 0x55,
+	AUX_7210 = 0x99,	// switch to 7210 mode
+};
+
+enum tnt4882_aux_regs {
+	AUXRG = 0x40,
+	AUXRI = 0xe0,
+};
+
+enum auxg_bits {
+	/* no talking when no listeners bit (prevents bus errors when data written at wrong time) */
+	NTNL_BIT = 0x8,
+	RPP2_BIT = 0x4,	/* set/clear local rpp message */
+	CHES_BIT = 0x1, /* clear holdoff on end select bit */
+};
+
+enum auxi_bits {
+	SISB = 0x1,	// static interrupt bits (don't clear isr1, isr2 on read)
+	PP2 = 0x4,	// ignore remote parallel poll configuration
+	USTD = 0x8,	// ultra short (1100 nanosec) T1 delay
+};
+
+enum sasr_bits {
+	ACRDY_BIT = 0x4,	/* acceptor ready state */
+	ADHS_BIT = 0x8,		/* acceptor data holdoff state */
+	ANHS2_BIT = 0x10,	/* acceptor not ready holdoff immediately state */
+	ANHS1_BIT = 0x20,	/* acceptor not ready holdoff state */
+	AEHS_BIT = 0x40,	/* acceptor end holdoff state */
+};
+
+#endif	// _TNT4882_REGISTERS_H
diff --git a/drivers/gpib/ines/Makefile b/drivers/gpib/ines/Makefile
new file mode 100644
index 000000000000..88241f15ecea
--- /dev/null
+++ b/drivers/gpib/ines/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GPIB_INES) += ines_gpib.o
+
+
diff --git a/drivers/gpib/ines/ines.h b/drivers/gpib/ines/ines.h
new file mode 100644
index 000000000000..6ad57e9a1216
--- /dev/null
+++ b/drivers/gpib/ines/ines.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *  Header for ines GPIB boards
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _INES_GPIB_H
+#define _INES_GPIB_H
+
+#include "nec7210.h"
+#include "gpibP.h"
+#include "plx9050.h"
+#include "amcc5920.h"
+#include "quancom_pci.h"
+#include <linux/interrupt.h>
+
+enum ines_pci_chip {
+	PCI_CHIP_NONE,
+	PCI_CHIP_PLX9050,
+	PCI_CHIP_AMCC5920,
+	PCI_CHIP_QUANCOM,
+	PCI_CHIP_QUICKLOGIC5030,
+};
+
+struct ines_priv {	// per-board private data (reached through board->private_data)
+	struct nec7210_priv nec7210_priv;	// handed to nec7210_*() and read/write_byte()
+	struct pci_dev *pci_device;
+	// base address for plx9052 pci chip (NOTE(review): header and enum say plx9050)
+	unsigned long plx_iobase;
+	// base address for amcc5920 pci chip
+	unsigned long amcc_iobase;
+	unsigned int irq;
+	enum ines_pci_chip pci_chip_type;
+	u8 extend_mode_bits;	// software copy of the bits written to EXTEND_MODE
+};
+
+/* inb/outb wrappers: port is nec7210_priv.iobase + register_number * nec7210_priv.offset */
+static inline unsigned int ines_inb(struct ines_priv *priv, unsigned int register_number)
+{
+	return inb(priv->nec7210_priv.iobase +
+		   register_number * priv->nec7210_priv.offset);
+}
+
+static inline void ines_outb(struct ines_priv *priv, unsigned int value,
+			     unsigned int register_number)
+{	// writes value to port nec7210_priv.iobase + register_number * nec7210_priv.offset
+	outb(value, priv->nec7210_priv.iobase +
+	     register_number * priv->nec7210_priv.offset);
+}
+
+enum ines_regs {	// register numbers passed to ines_inb()/ines_outb()
+	// read
+	FIFO_STATUS = 0x8,
+	ISR3 = 0x9,
+	ISR4 = 0xa,
+	IN_FIFO_COUNT = 0x10,
+	OUT_FIFO_COUNT = 0x11,
+	EXTEND_STATUS = 0xf,
+
+	// write
+	XDMA_CONTROL = 0x8,	// same number as FIFO_STATUS
+	IMR3 = ISR3,
+	IMR4 = ISR4,
+	IN_FIFO_WATERMARK = IN_FIFO_COUNT,
+	OUT_FIFO_WATERMARK = OUT_FIFO_COUNT,
+	EXTEND_MODE = 0xf,	// same number as EXTEND_STATUS
+
+	// read-write
+	XFER_COUNT_LOWER = 0xb,
+	XFER_COUNT_UPPER = 0xc,
+	BUS_CONTROL_MONITOR = 0x13,
+};
+
+enum isr3_imr3_bits {
+	HW_TIMEOUT_BIT = 0x1,
+	XFER_COUNT_BIT = 0x2,
+	CMD_RECEIVED_BIT = 0x4,
+	TCT_RECEIVED_BIT = 0x8,
+	IFC_ACTIVE_BIT = 0x10,
+	ATN_ACTIVE_BIT = 0x20,
+	FIFO_ERROR_BIT = 0x40,
+};
+
+enum isr4_imr4_bits {
+	IN_FIFO_WATERMARK_BIT = 0x1,
+	OUT_FIFO_WATERMARK_BIT = 0x2,
+	IN_FIFO_FULL_BIT = 0x4,
+	OUT_FIFO_EMPTY_BIT = 0x8,
+	IN_FIFO_READY_BIT = 0x10,
+	OUT_FIFO_READY_BIT = 0x20,
+	IN_FIFO_EXIT_WATERMARK_BIT = 0x40,
+	OUT_FIFO_EXIT_WATERMARK_BIT = 0x80,
+};
+
+enum extend_mode_bits {
+	TR3_TRIG_ENABLE_BIT = 0x1,	// enable generation of trigger pulse T/R3 pin
+	// clear message available status bit when chip writes byte with EOI true
+	MAV_ENABLE_BIT = 0x2,
+	EOS1_ENABLE_BIT = 0x4,		// enable eos register 1
+	EOS2_ENABLE_BIT = 0x8,		// enable eos register 2
+	EOIDIS_BIT = 0x10,		// disable EOI interrupt when doing rfd holdoff on end?
+	XFER_COUNTER_ENABLE_BIT = 0x20,
+	XFER_COUNTER_OUTPUT_BIT = 0x40,	// use counter for output, clear for input
+	// when xfer counter hits 0, assert EOI on write or RFD holdoff on read
+	LAST_BYTE_HANDLING_BIT = 0x80,
+};
+
+enum extend_status_bits {
+	OUTPUT_MESSAGE_IN_PROGRESS_BIT = 0x1,
+	SCSEL_BIT = 0x2,	// state of SCSEL pin
+	LISTEN_DISABLED = 0x4,
+	IN_FIFO_EMPTY_BIT = 0x8,
+	OUT_FIFO_FULL_BIT = 0x10,
+};
+
+// ines adds fifo enable bits to address mode register
+enum ines_admr_bits {
+	IN_FIFO_ENABLE_BIT = 0x8,
+	OUT_FIFO_ENABLE_BIT = 0x4,
+};
+
+enum xdma_control_bits {
+	DMA_OUTPUT_BIT = 0x1,		// use dma for output, clear for input
+	ENABLE_SYNC_DMA_BIT = 0x2,
+	DMA_ACCESS_EVERY_CYCLE = 0x4,	// dma accesses fifo every cycle, clear for every other cycle
+	DMA_16BIT = 0x8,		// clear for 8 bit transfers
+};
+
+enum bus_control_monitor_bits {
+	BCM_DAV_BIT = 0x1,
+	BCM_NRFD_BIT = 0x2,
+	BCM_NDAC_BIT = 0x4,
+	BCM_IFC_BIT = 0x8,
+	BCM_ATN_BIT = 0x10,
+	BCM_SRQ_BIT = 0x20,
+	BCM_REN_BIT = 0x40,
+	BCM_EOI_BIT = 0x80,
+};
+
+enum ines_aux_reg_bits {
+	INES_AUXD = 0x40,	// OR-ed with enum ines_auxd_bits values and written to AUXMR
+};
+
+enum ines_aux_cmds {
+	INES_RFD_HLD_IMMEDIATE = 0x4,
+	INES_AUX_CLR_OUT_FIFO = 0x5,
+	INES_AUX_CLR_IN_FIFO = 0x6,
+	INES_AUX_XMODE = 0xa,
+};
+
+enum ines_auxd_bits {
+	INES_FOLLOWING_T1_MASK = 0x3,	// field in bits 0-1; its values follow
+	INES_FOLLOWING_T1_500ns = 0x0,
+	INES_FOLLOWING_T1_350ns = 0x1,
+	INES_FOLLOWING_T1_250ns = 0x2,
+	INES_INITIAL_TI_MASK = 0xc,	// bits 2-3; NOTE(review): "TI" looks like a typo of "T1"
+	INES_INITIAL_T1_2000ns = 0x0,
+	INES_INITIAL_T1_1100ns = 0x4,
+	INES_INITIAL_T1_700ns = 0x8,
+	INES_T6_2us = 0x0,	// bit 4 clear
+	INES_T6_50us = 0x10,	// bit 4 set
+};
+
+#endif	// _INES_GPIB_H
diff --git a/drivers/gpib/ines/ines_gpib.c b/drivers/gpib/ines/ines_gpib.c
new file mode 100644
index 000000000000..a3cf846fd0f9
--- /dev/null
+++ b/drivers/gpib/ines/ines_gpib.c
@@ -0,0 +1,1500 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *    copyright		   : (C) 1999 Axel Dziemba (axel.dziemba@ines.de)
+ *			    (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include "ines.h"
+
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/bitops.h>
+#include <asm/dma.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "gpib_pci_ids.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for Ines iGPIB 72010");
+
+static irqreturn_t ines_interrupt(struct gpib_board *board);
+
+static int ines_line_status(const struct gpib_board *board)
+{	// returns VALID_ALL plus one BUS_* flag per bit set in BUS_CONTROL_MONITOR
+	int status = VALID_ALL;
+	int bcm_bits;
+	struct ines_priv *ines_priv;
+
+	ines_priv = board->private_data;
+
+	bcm_bits = ines_inb(ines_priv, BUS_CONTROL_MONITOR);	// register is read once
+
+	if (bcm_bits & BCM_REN_BIT)
+		status |= BUS_REN;
+	if (bcm_bits & BCM_IFC_BIT)
+		status |= BUS_IFC;
+	if (bcm_bits & BCM_SRQ_BIT)
+		status |= BUS_SRQ;
+	if (bcm_bits & BCM_EOI_BIT)
+		status |= BUS_EOI;
+	if (bcm_bits & BCM_NRFD_BIT)
+		status |= BUS_NRFD;
+	if (bcm_bits & BCM_NDAC_BIT)
+		status |= BUS_NDAC;
+	if (bcm_bits & BCM_DAV_BIT)
+		status |= BUS_DAV;
+	if (bcm_bits & BCM_ATN_BIT)
+		status |= BUS_ATN;
+
+	return status;
+}
+
+static void ines_set_xfer_counter(struct ines_priv *priv, unsigned int count)
+{	// loads count into the XFER_COUNT_UPPER/XFER_COUNT_LOWER pair (16 bits in total)
+	if (count > 0xffff) {
+		pr_err("bug! tried to set xfer counter > 0xffff\n");
+		return;	// registers are left untouched and the caller gets no error code
+	}
+	ines_outb(priv, (count >> 8) & 0xff, XFER_COUNT_UPPER);	// high byte is written first
+	ines_outb(priv, count & 0xff, XFER_COUNT_LOWER);
+}
+
+static int ines_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{	// picks the INES_FOLLOWING_T1_* setting for nano_sec; returns 250, 350 or 500
+	struct ines_priv *ines_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+	unsigned int retval;
+
+	retval = nec7210_t1_delay(board, nec_priv, nano_sec);	// result is overwritten below
+
+	if (nano_sec <= 250) {
+		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_250ns |
+			   INES_INITIAL_T1_2000ns, AUXMR);
+		retval = 250;
+	} else if (nano_sec <= 350) {
+		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_350ns |
+			   INES_INITIAL_T1_2000ns, AUXMR);
+		retval = 350;
+	} else {	// anything above 350 selects the 500ns setting
+		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_500ns |
+			   INES_INITIAL_T1_2000ns, AUXMR);
+		retval = 500;
+	}
+
+	return retval;
+}
+
+static inline unsigned short num_in_fifo_bytes(struct ines_priv *ines_priv)
+{	// current value of the IN_FIFO_COUNT register
+	return ines_inb(ines_priv, IN_FIFO_COUNT);
+}
+
+static ssize_t pio_read(struct gpib_board *board, struct ines_priv *ines_priv, u8 *buffer,
+			size_t length, size_t *nbytes)
+{	// copies up to length bytes from the input fifo to buffer; *nbytes counts them
+	ssize_t retval = 0;	// never modified: the normal exit path always returns 0
+	unsigned int num_fifo_bytes, i;
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+
+	*nbytes = 0;
+	while (*nbytes < length) {
+		if (wait_event_interruptible(board->wait,
+					     num_in_fifo_bytes(ines_priv) ||
+					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					     test_bit(TIMO_NUM, &board->status)))
+			return -ERESTARTSYS;	// the wait returned nonzero
+
+		if (test_bit(TIMO_NUM, &board->status))
+			return -ETIMEDOUT;	// *nbytes keeps the count read so far
+		if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+			return -EINTR;
+
+		num_fifo_bytes = num_in_fifo_bytes(ines_priv);
+		if (num_fifo_bytes + *nbytes > length)
+			num_fifo_bytes = length - *nbytes;	// do not write past buffer[length - 1]
+
+		for (i = 0; i < num_fifo_bytes; i++)
+			buffer[(*nbytes)++] = read_byte(nec_priv, DIR);
+		if (test_bit(RECEIVED_END_BN, &nec_priv->state) &&
+		    num_in_fifo_bytes(ines_priv) == 0)
+			break;	// END flagged and fifo drained
+		if (need_resched())
+			schedule();
+	}
+	/* make sure RECEIVED_END is in sync (skipped on the early error returns above) */
+	ines_interrupt(board);
+	return retval;
+}
+
+static int ines_accel_read(struct gpib_board *board, u8 *buffer,
+			   size_t length, int *end, size_t *bytes_read)
+{	// fifo-based read via pio_read(); *end becomes 1 if RECEIVED_END_BN was set
+	ssize_t retval = 0;
+	struct ines_priv *ines_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+	int counter_setting;
+
+	*end = 0;
+	*bytes_read = 0;
+	if (length == 0)
+		return 0;	// nothing requested: no register is touched
+
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
+
+	write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
+
+	// clear in fifo (enable bit is written as 0, then as 1)
+	nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT, 0);
+	nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT, IN_FIFO_ENABLE_BIT);
+
+	ines_priv->extend_mode_bits |= LAST_BYTE_HANDLING_BIT;
+	ines_priv->extend_mode_bits &= ~XFER_COUNTER_OUTPUT_BIT & ~XFER_COUNTER_ENABLE_BIT;
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);	// counter: input, off
+
+	counter_setting = length - num_in_fifo_bytes(ines_priv);
+	if (counter_setting > 0) {
+		ines_set_xfer_counter(ines_priv, length); // NOTE(review): not counter_setting?
+		ines_priv->extend_mode_bits |= XFER_COUNTER_ENABLE_BIT;
+		ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+
+		// holdoff on END
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
+		/* release rfd holdoff */
+		write_byte(nec_priv, AUX_FH, AUXMR);
+	}
+
+	retval = pio_read(board, ines_priv, buffer, length, bytes_read);
+	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;	// counter off again
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+	if (retval < 0)	{
+		write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
+		return retval;	// *end stays 0; *bytes_read holds what pio_read() stored
+	}
+	if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
+		*end = 1;
+
+	return retval;
+}
+
+static const int out_fifo_size = 0xff;	// fill limit used by ines_accel_write()
+
+static inline unsigned short num_out_fifo_bytes(struct ines_priv *ines_priv)
+{	// current value of the OUT_FIFO_COUNT register
+	return ines_inb(ines_priv, OUT_FIFO_COUNT);
+}
+
+static int ines_write_wait(struct gpib_board *board, struct ines_priv *ines_priv,
+			   unsigned int fifo_threshold)
+{	// sleeps until the out fifo count is below fifo_threshold or an abort bit is set
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+
+	// wait until byte is ready to be sent
+	if (wait_event_interruptible(board->wait,
+				     num_out_fifo_bytes(ines_priv) < fifo_threshold ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		return -ERESTARTSYS;	// the wait returned nonzero
+
+	if (test_bit(BUS_ERROR_BN, &nec_priv->state))	// checked first: wins over the others
+		return -EIO;
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+
+	return 0;	// fifo has room and no abort condition is flagged
+}
+
+/*
+ * FIFO-assisted ("accel") write.
+ *
+ * Resets the output FIFO, loads the transfer counter with @length and
+ * feeds @buffer into the FIFO through CDOR as room becomes available,
+ * then waits for the FIFO to drain.
+ *
+ * @board:         board to write to
+ * @buffer:        data to send
+ * @length:        number of bytes in @buffer
+ * @send_eoi:      nonzero sets LAST_BYTE_HANDLING_BIT for this transfer
+ * @bytes_written: receives the number of bytes queued minus whatever is
+ *                 still sitting in the output FIFO when the function returns
+ *
+ * Returns 0 on success or the negative error from ines_write_wait().
+ */
+static int ines_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
+			    int send_eoi, size_t *bytes_written)
+{
+	size_t count = 0;
+	size_t pending;
+	ssize_t retval = 0;
+	struct ines_priv *ines_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+	unsigned int num_bytes, i;
+
+	*bytes_written = 0;
+	// clear out fifo
+	nec7210_set_reg_bits(nec_priv, ADMR, OUT_FIFO_ENABLE_BIT, 0);
+	nec7210_set_reg_bits(nec_priv, ADMR, OUT_FIFO_ENABLE_BIT, OUT_FIFO_ENABLE_BIT);
+
+	/* counter in output mode, disabled while it is being loaded */
+	ines_priv->extend_mode_bits |= XFER_COUNTER_OUTPUT_BIT;
+	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
+	ines_priv->extend_mode_bits &= ~LAST_BYTE_HANDLING_BIT;
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+
+	ines_set_xfer_counter(ines_priv, length);
+	if (send_eoi)
+		ines_priv->extend_mode_bits |= LAST_BYTE_HANDLING_BIT;
+	ines_priv->extend_mode_bits |= XFER_COUNTER_ENABLE_BIT;
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+
+	while (count < length) {
+		retval = ines_write_wait(board, ines_priv, out_fifo_size);
+		if (retval < 0)
+			break;
+
+		/* fill whatever room the FIFO reports, capped at the bytes left */
+		num_bytes = out_fifo_size - num_out_fifo_bytes(ines_priv);
+		if (num_bytes + count > length)
+			num_bytes = length - count;
+		for (i = 0; i < num_bytes; i++)
+			write_byte(nec_priv, buffer[count++], CDOR);
+	}
+
+	// wait last byte has been sent, unless the transfer has already failed
+	if (retval >= 0)
+		retval = ines_write_wait(board, ines_priv, 1);
+
+	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+
+	/*
+	 * Only 'count' bytes were handed to the FIFO (count == length unless
+	 * the fill loop was aborted), so base the result on 'count' rather
+	 * than 'length'; otherwise an aborted transfer would report bytes
+	 * that were never queued.
+	 */
+	pending = num_out_fifo_bytes(ines_priv);
+	*bytes_written = pending < count ? count - pending : 0;
+
+	return retval;
+}
+
+/*
+ * IRQ entry point registered with request_irq(); @arg is the gpib_board.
+ *
+ * On Quancom boards the IRQ control/status register is read first and, if
+ * it reports an asserted interrupt, QUANCOM_IRQ_ENABLE_BIT is written
+ * back.  All boards then continue in ines_interrupt().
+ */
+static irqreturn_t ines_pci_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	if (ines->pci_chip_type != PCI_CHIP_QUANCOM)
+		return ines_interrupt(board);
+
+	if (inb(nec->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG) & QUANCOM_IRQ_ASSERTED_BIT)
+		outb(QUANCOM_IRQ_ENABLE_BIT, nec->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG);
+
+	return ines_interrupt(board);
+}
+
+/*
+ * Common interrupt service, run under board->spinlock.
+ *
+ * Runs the nec7210 interrupt handler, then reads ISR3 and ISR4: an IFC
+ * event is queued with push_gpib_event(), a FIFO error is logged, and
+ * sleepers on board->wait are woken for IFC, transfer-count and FIFO
+ * watermark/full/empty events.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t ines_interrupt(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+	unsigned int isr3, isr4;
+	unsigned long irq_flags;
+	int need_wake = 0;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+
+	nec7210_interrupt(board, nec);
+	isr3 = ines_inb(ines, ISR3);
+	isr4 = ines_inb(ines, ISR4);
+
+	if (isr3 & IFC_ACTIVE_BIT) {
+		push_gpib_event(board, EVENT_IFC);
+		need_wake = 1;
+	}
+
+	if (isr3 & FIFO_ERROR_BIT)
+		dev_err(board->gpib_dev, "fifo error\n");
+
+	if (isr3 & XFER_COUNT_BIT)
+		need_wake = 1;
+	else if (isr4 & (IN_FIFO_WATERMARK_BIT | IN_FIFO_FULL_BIT |
+			 OUT_FIFO_WATERMARK_BIT | OUT_FIFO_EMPTY_BIT))
+		need_wake = 1;
+
+	if (need_wake)
+		wake_up_interruptible(&board->wait);
+
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	return IRQ_HANDLED;
+}
+
+/* Forward declarations: the gpib_interface tables below reference these before they are defined */
+static int ines_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static int ines_pci_accel_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static int ines_isa_attach(struct gpib_board *board, const struct gpib_board_config *config);
+
+static void ines_pci_detach(struct gpib_board *board);
+static void ines_isa_detach(struct gpib_board *board);
+
+/* PCI vendor ID used by the QuickLogic 5030 entries of the ID tables below */
+enum ines_pci_vendor_ids {
+	PCI_VENDOR_ID_INES_QUICKLOGIC = 0x16da
+};
+
+/* PCI device IDs for the AMCC and QuickLogic 5030 entries of the ID tables below */
+enum ines_pci_device_ids {
+	PCI_DEVICE_ID_INES_GPIB_AMCC = 0x8507,
+	PCI_DEVICE_ID_INES_GPIB_QL5030 = 0x11,
+};
+
+/* PCI subsystem device ID matched for the PLX 9050 and AMCC entries of the ID tables below */
+enum ines_pci_subdevice_ids {
+	PCI_SUBDEVICE_ID_INES_GPIB = 0x1072
+};
+
+/*
+ * PCI IDs handled by this module, exported through MODULE_DEVICE_TABLE().
+ * The first three entries also match on subsystem vendor/device; the
+ * Quancom entry matches on vendor/device only.  The table is never
+ * modified, hence const.
+ */
+static const struct pci_device_id ines_pci_table[] = {
+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
+			 PCI_VENDOR_ID_PLX, PCI_SUBDEVICE_ID_INES_GPIB) },
+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_AMCC, PCI_DEVICE_ID_INES_GPIB_AMCC,
+			 PCI_VENDOR_ID_AMCC, PCI_SUBDEVICE_ID_INES_GPIB) },
+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_INES_QUICKLOGIC, PCI_DEVICE_ID_INES_GPIB_QL5030,
+			 PCI_VENDOR_ID_INES_QUICKLOGIC, PCI_DEVICE_ID_INES_GPIB_QL5030) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_QUANCOM, PCI_DEVICE_ID_QUANCOM_GPIB) },
+	{ }	/* terminating all-zero entry */
+};
+MODULE_DEVICE_TABLE(pci, ines_pci_table);
+
+/*
+ * Description of one supported PCI board variant, used by
+ * ines_common_pci_attach() to locate the device and set up the driver.
+ */
+struct ines_pci_id {
+	unsigned int vendor_id;
+	unsigned int device_id;
+	/* signed: a negative subsystem id makes the lookup ignore subsystem ids */
+	int subsystem_vendor_id;
+	int subsystem_device_id;
+	/* index of the PCI resource whose start becomes nec7210_priv.iobase */
+	unsigned int gpib_region;
+	/* copied into nec7210_priv.offset */
+	unsigned int io_offset;
+	/* selects the chip-specific setup in attach/detach */
+	enum ines_pci_chip pci_chip_type;
+};
+
+/*
+ * Supported PCI board variants, searched in order by
+ * ines_common_pci_attach().  The Quancom entry uses -1 subsystem ids, so
+ * it is looked up by vendor/device only.
+ */
+static struct ines_pci_id pci_ids[] = {
+	{.vendor_id = PCI_VENDOR_ID_PLX,
+	 .device_id = PCI_DEVICE_ID_PLX_9050,
+	 .subsystem_vendor_id = PCI_VENDOR_ID_PLX,
+	 .subsystem_device_id = PCI_SUBDEVICE_ID_INES_GPIB,
+	 .gpib_region = 2,
+	 .io_offset = 1,
+	 .pci_chip_type = PCI_CHIP_PLX9050,
+	},
+	{.vendor_id = PCI_VENDOR_ID_AMCC,
+	 .device_id = PCI_DEVICE_ID_INES_GPIB_AMCC,
+	 .subsystem_vendor_id = PCI_VENDOR_ID_AMCC,
+	 .subsystem_device_id = PCI_SUBDEVICE_ID_INES_GPIB,
+	 .gpib_region = 1,
+	 .io_offset = 1,
+	 .pci_chip_type = PCI_CHIP_AMCC5920,
+	},
+	{.vendor_id = PCI_VENDOR_ID_INES_QUICKLOGIC,
+	 .device_id = PCI_DEVICE_ID_INES_GPIB_QL5030,
+	 .subsystem_vendor_id = PCI_VENDOR_ID_INES_QUICKLOGIC,
+	 .subsystem_device_id = PCI_DEVICE_ID_INES_GPIB_QL5030,
+	 .gpib_region = 1,
+	 .io_offset = 1,
+	 .pci_chip_type = PCI_CHIP_QUICKLOGIC5030,
+	},
+	{.vendor_id = PCI_VENDOR_ID_QUANCOM,
+	 .device_id = PCI_DEVICE_ID_QUANCOM_GPIB,
+	 .subsystem_vendor_id = -1,
+	 .subsystem_device_id = -1,
+	 .gpib_region = 0,
+	 .io_offset = 4,
+	 .pci_chip_type = PCI_CHIP_QUANCOM,
+	},
+};
+
+/* Number of entries in pci_ids[] */
+static const int num_pci_chips = ARRAY_SIZE(pci_ids);
+
+// wrappers for interface functions
+/*
+ * Read through nec7210_read().  If that fails, issue
+ * INES_RFD_HLD_IMMEDIATE, record the holdoff in the nec7210 state and
+ * read one data byte with nec7210_read_data_in(), discarding its 'end'
+ * result.  Returns the nec7210_read() result unchanged.
+ */
+static int ines_read(struct gpib_board *board, u8 *buffer, size_t length,
+		     int *end, size_t *bytes_read)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+	int discarded_end;
+	ssize_t rc;
+
+	rc = nec7210_read(board, nec, buffer, length, end, bytes_read);
+	if (rc >= 0)
+		return rc;
+
+	write_byte(nec, INES_RFD_HLD_IMMEDIATE, AUXMR);
+	set_bit(RFD_HOLDOFF_BN, &nec->state);
+	nec7210_read_data_in(board, nec, &discarded_end);
+
+	return rc;
+}
+
+/* Forward to nec7210_write() with this board's embedded nec7210 state. */
+static int ines_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+		      size_t *bytes_written)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_write(board, nec, buffer, length, send_eoi, bytes_written);
+}
+
+/* Forward to nec7210_command() with this board's embedded nec7210 state. */
+static int ines_command(struct gpib_board *board, u8 *buffer, size_t length, size_t *bytes_written)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_command(board, nec, buffer, length, bytes_written);
+}
+
+/* Forward to nec7210_take_control() with this board's embedded nec7210 state. */
+static int ines_take_control(struct gpib_board *board, int synchronous)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_take_control(board, nec, synchronous);
+}
+
+/* Forward to nec7210_go_to_standby() with this board's embedded nec7210 state. */
+static int ines_go_to_standby(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_go_to_standby(board, nec);
+}
+
+/* Forward to nec7210_request_system_control() with this board's embedded nec7210 state. */
+static int ines_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_request_system_control(board, nec, request_control);
+}
+
+/* Forward to nec7210_interface_clear() with this board's embedded nec7210 state. */
+static void ines_interface_clear(struct gpib_board *board, int assert)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_interface_clear(board, nec, assert);
+}
+
+/* Forward to nec7210_remote_enable() with this board's embedded nec7210 state. */
+static void ines_remote_enable(struct gpib_board *board, int enable)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_remote_enable(board, nec, enable);
+}
+
+/* Forward to nec7210_enable_eos() with this board's embedded nec7210 state. */
+static int ines_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_enable_eos(board, nec, eos_byte, compare_8_bits);
+}
+
+/* Forward to nec7210_disable_eos() with this board's embedded nec7210 state. */
+static void ines_disable_eos(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_disable_eos(board, nec);
+}
+
+/* Forward to nec7210_update_status() with this board's embedded nec7210 state. */
+static unsigned int ines_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_update_status(board, nec, clear_mask);
+}
+
+/* Forward to nec7210_primary_address() with this board's embedded nec7210 state. */
+static int ines_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_primary_address(board, nec, address);
+}
+
+/* Forward to nec7210_secondary_address() with this board's embedded nec7210 state. */
+static int ines_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_secondary_address(board, nec, address, enable);
+}
+
+/* Forward to nec7210_parallel_poll() with this board's embedded nec7210 state. */
+static int ines_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_parallel_poll(board, nec, result);
+}
+
+/* Forward to nec7210_parallel_poll_configure() with this board's embedded nec7210 state. */
+static void ines_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_parallel_poll_configure(board, nec, config);
+}
+
+/* Forward to nec7210_parallel_poll_response() with this board's embedded nec7210 state. */
+static void ines_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_parallel_poll_response(board, nec, ist);
+}
+
+/* Forward to nec7210_serial_poll_response() with this board's embedded nec7210 state. */
+static void ines_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_serial_poll_response(board, nec, status);
+}
+
+/* Forward to nec7210_serial_poll_status() with this board's embedded nec7210 state. */
+static u8 ines_serial_poll_status(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	return nec7210_serial_poll_status(board, nec);
+}
+
+/* Forward to nec7210_return_to_local() with this board's embedded nec7210 state. */
+static void ines_return_to_local(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec = &ines->nec7210_priv;
+
+	nec7210_return_to_local(board, nec);
+}
+
+/*
+ * "ines_pci_unaccel": PCI boards attached through ines_pci_attach(), with
+ * the plain nec7210 read/write wrappers (ines_read / ines_write) instead
+ * of the FIFO-assisted ones.
+ */
+static struct gpib_interface ines_pci_unaccel_interface = {
+	.name = "ines_pci_unaccel",
+	.attach = ines_pci_attach,
+	.detach = ines_pci_detach,
+	.read = ines_read,
+	.write = ines_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+/*
+ * "ines_pci": PCI boards attached through ines_pci_accel_attach(), using
+ * the FIFO-assisted read/write paths.  Apart from the name, this table has
+ * the same contents as ines_pci_accel_interface.
+ */
+static struct gpib_interface ines_pci_interface = {
+	.name = "ines_pci",
+	.attach = ines_pci_accel_attach,
+	.detach = ines_pci_detach,
+	.read = ines_accel_read,
+	.write = ines_accel_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+/*
+ * "ines_pci_accel": PCI boards attached through ines_pci_accel_attach(),
+ * using the FIFO-assisted read/write paths (ines_accel_read /
+ * ines_accel_write).
+ */
+static struct gpib_interface ines_pci_accel_interface = {
+	.name = "ines_pci_accel",
+	.attach = ines_pci_accel_attach,
+	.detach = ines_pci_detach,
+	.read = ines_accel_read,
+	.write = ines_accel_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+/*
+ * "ines_isa": ISA boards attached through ines_isa_attach(), using the
+ * FIFO-assisted read/write paths.
+ */
+static struct gpib_interface ines_isa_interface = {
+	.name = "ines_isa",
+	.attach = ines_isa_attach,
+	.detach = ines_isa_detach,
+	.read = ines_accel_read,
+	.write = ines_accel_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+/*
+ * Allocate a zeroed struct ines_priv, store it in board->private_data and
+ * initialise the embedded nec7210 state.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails, in which case
+ * board->private_data is left NULL.
+ */
+static int ines_allocate_private(struct gpib_board *board)
+{
+	struct ines_priv *priv;
+
+	/* kzalloc replaces the former kmalloc + memset pair */
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	board->private_data = priv;
+	if (!priv)
+		return -ENOMEM;
+
+	init_nec7210_private(&priv->nec7210_priv);
+	return 0;
+}
+
+/* Free board->private_data and clear the pointer so a later detach sees NULL. */
+static void ines_free_private(struct gpib_board *board)
+{
+	kfree(board->private_data);
+	board->private_data = NULL;
+}
+
+/*
+ * Attach steps shared by all bus types: clear board->status, allocate the
+ * private data and install the I/O-port register accessors with default
+ * settings (register offset 1, IGPIB7210 chip type, no PCI bridge chip).
+ *
+ * Returns 0 on success or -ENOMEM if the private data cannot be allocated.
+ */
+static int ines_generic_attach(struct gpib_board *board)
+{
+	struct ines_priv *ines;
+	struct nec7210_priv *nec;
+
+	board->status = 0;
+
+	if (ines_allocate_private(board) != 0)
+		return -ENOMEM;
+
+	ines = board->private_data;
+	ines->pci_chip_type = PCI_CHIP_NONE;
+
+	nec = &ines->nec7210_priv;
+	nec->type = IGPIB7210;
+	nec->offset = 1;
+	nec->read_byte = nec7210_ioport_read_byte;
+	nec->write_byte = nec7210_ioport_write_byte;
+
+	return 0;
+}
+
+/*
+ * Program the ines-specific registers and bring the nec7210 online.
+ *
+ * @ines_priv: driver private data for @board
+ * @board:     board being brought online, passed to nec7210_board_online()
+ * @use_accel: nonzero sets the FIFO watermarks and unmasks the FIFO and
+ *             transfer-count interrupts; zero disables both FIFOs and
+ *             leaves only the IFC and FIFO-error interrupts unmasked
+ */
+static void ines_online(struct ines_priv *ines_priv, const struct gpib_board *board, int use_accel)
+{
+	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
+
+	/* ines doesn't seem to use internal count register */
+	write_byte(nec_priv, ICR | 0, AUXMR);
+
+	write_byte(nec_priv, INES_AUX_XMODE, AUXMR);
+	write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
+
+	/* keep the software state in step with the holdoff command just issued */
+	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+
+	write_byte(nec_priv, INES_AUXD | 0, AUXMR);
+	ines_outb(ines_priv, 0, XDMA_CONTROL);
+	ines_priv->extend_mode_bits = 0;
+	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
+	if (use_accel) {
+		/* both FIFO watermarks at 0x80 */
+		ines_outb(ines_priv, 0x80, OUT_FIFO_WATERMARK);
+		ines_outb(ines_priv, 0x80, IN_FIFO_WATERMARK);
+		ines_outb(ines_priv, IFC_ACTIVE_BIT | ATN_ACTIVE_BIT |
+			  FIFO_ERROR_BIT | XFER_COUNT_BIT, IMR3);
+		ines_outb(ines_priv, IN_FIFO_WATERMARK_BIT | IN_FIFO_FULL_BIT |
+			  OUT_FIFO_WATERMARK_BIT | OUT_FIFO_EMPTY_BIT, IMR4);
+	} else {
+		nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT | OUT_FIFO_ENABLE_BIT, 0);
+		ines_outb(ines_priv, IFC_ACTIVE_BIT | FIFO_ERROR_BIT, IMR3);
+		ines_outb(ines_priv, 0, IMR4);
+	}
+
+	nec7210_board_online(nec_priv, board);
+	/*
+	 * Clear HR_DOIE/HR_DIIE in IMR1 when the FIFOs are in use --
+	 * presumably the per-byte data-out/data-in interrupt enables; confirm.
+	 */
+	if (use_accel)
+		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE | HR_DIIE, 0);
+}
+
+/*
+ * Attach steps shared by the accelerated and unaccelerated PCI interfaces.
+ *
+ * Allocates the private data, searches pci_ids[] for a matching device
+ * (optionally restricted to config->pci_bus / config->pci_slot), enables
+ * it, claims its regions, resets the nec7210, requests the shared IRQ and
+ * enables interrupts on the PCI bridge chip.
+ *
+ * Returns 0 on success or a negative errno.  Nothing is unwound here on
+ * failure; resources acquired so far are recorded in the private data,
+ * which ines_pci_detach() inspects when releasing them.
+ */
+static int ines_common_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct ines_priv *ines_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = 0;
+	int retval;
+	struct ines_pci_id found_id;
+	unsigned int i;
+	struct pci_dev *pdev;
+
+	memset(&found_id, 0, sizeof(found_id));
+
+	retval = ines_generic_attach(board);
+	if (retval)
+		return retval;
+
+	ines_priv = board->private_data;
+	nec_priv = &ines_priv->nec7210_priv;
+
+	// find board
+	ines_priv->pci_device = NULL;
+	for (i = 0; i < num_pci_chips && !ines_priv->pci_device; i++) {
+		pdev = NULL;
+		do {
+			/* negative subsystem ids mean: match on vendor/device only */
+			if (pci_ids[i].subsystem_vendor_id >= 0 &&
+			    pci_ids[i].subsystem_device_id >= 0)
+				pdev = pci_get_subsys(pci_ids[i].vendor_id, pci_ids[i].device_id,
+						      pci_ids[i].subsystem_vendor_id,
+						      pci_ids[i].subsystem_device_id, pdev);
+			else
+				pdev = pci_get_device(pci_ids[i].vendor_id, pci_ids[i].device_id,
+						      pdev);
+			if (!pdev)
+				break;
+			/* negative pci_bus / pci_slot in the config mean "any" */
+			if (config->pci_bus >= 0 && config->pci_bus != pdev->bus->number)
+				continue;
+			if (config->pci_slot >= 0 && config->pci_slot != PCI_SLOT(pdev->devfn))
+				continue;
+			found_id = pci_ids[i];
+			ines_priv->pci_device = pdev;
+			break;
+		} while (1);
+	}
+	if (!ines_priv->pci_device) {
+		dev_err(board->gpib_dev, "could not find ines PCI board\n");
+		return -ENODEV;
+	}
+
+	retval = pci_enable_device(ines_priv->pci_device);
+	if (retval) {
+		dev_err(board->gpib_dev, "error enabling pci device\n");
+		return retval;
+	}
+
+	retval = pci_request_regions(ines_priv->pci_device, DRV_NAME);
+	if (retval)
+		return retval;
+	/* a nonzero iobase tells ines_pci_detach() that the regions are held */
+	nec_priv->iobase = pci_resource_start(ines_priv->pci_device,
+					      found_id.gpib_region);
+
+	ines_priv->pci_chip_type = found_id.pci_chip_type;
+	nec_priv->offset = found_id.io_offset;
+	switch (ines_priv->pci_chip_type) {
+	case PCI_CHIP_PLX9050:
+		ines_priv->plx_iobase = pci_resource_start(ines_priv->pci_device, 1);
+		break;
+	case PCI_CHIP_AMCC5920:
+		ines_priv->amcc_iobase = pci_resource_start(ines_priv->pci_device, 0);
+		break;
+	case PCI_CHIP_QUANCOM:
+		break;
+	case PCI_CHIP_QUICKLOGIC5030:
+		break;
+	default:
+		dev_err(board->gpib_dev, "unspecified chip type? (bug)\n");
+		nec_priv->iobase = 0;
+		pci_release_regions(ines_priv->pci_device);
+		return -EIO;
+	}
+
+	nec7210_board_reset(nec_priv, board);
+#ifdef QUANCOM_PCI
+	if (ines_priv->pci_chip_type == PCI_CHIP_QUANCOM) {
+		/* change interrupt polarity */
+		nec_priv->auxb_bits |= HR_INV;
+		ines_outb(ines_priv, nec_priv->auxb_bits, AUXMR);
+	}
+#endif
+	isr_flags |= IRQF_SHARED;
+	retval = request_irq(ines_priv->pci_device->irq, ines_pci_interrupt, isr_flags,
+			     DRV_NAME, board);
+	if (retval) {
+		dev_err(board->gpib_dev, "can't request IRQ %d\n", ines_priv->pci_device->irq);
+		return retval;
+	}
+	/* a nonzero irq tells ines_pci_detach() that the handler is installed */
+	ines_priv->irq = ines_priv->pci_device->irq;
+
+	// enable interrupts on pci chip
+	switch (ines_priv->pci_chip_type) {
+	case PCI_CHIP_PLX9050:
+		outl(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR1_POLARITY_BIT | PLX9050_PCI_INTR_EN_BIT,
+		     ines_priv->plx_iobase + PLX9050_INTCSR_REG);
+		break;
+	case PCI_CHIP_AMCC5920:
+	{
+		static const int region = 1;
+		static const int num_wait_states = 7;
+		u32 bits;
+
+		bits = amcc_prefetch_bits(region, PREFETCH_DISABLED);
+		bits |= amcc_PTADR_mode_bit(region);
+		bits |= amcc_disable_write_fifo_bit(region);
+		bits |= amcc_wait_state_bits(region, num_wait_states);
+		outl(bits, ines_priv->amcc_iobase + AMCC_PASS_THRU_REG);
+		outl(AMCC_ADDON_INTR_ENABLE_BIT, ines_priv->amcc_iobase + AMCC_INTCS_REG);
+	}
+	break;
+	case PCI_CHIP_QUANCOM:
+		outb(QUANCOM_IRQ_ENABLE_BIT, nec_priv->iobase +
+		     QUANCOM_IRQ_CONTROL_STATUS_REG);
+		break;
+	case PCI_CHIP_QUICKLOGIC5030:
+		break;
+	default:
+		dev_err(board->gpib_dev, "unspecified chip type? (bug)\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Attach hook for the unaccelerated PCI interface: run the common PCI
+ * attach, then bring the board online with the FIFOs disabled.
+ * Returns 0 or the negative result of ines_common_pci_attach().
+ */
+static int ines_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int rc = ines_common_pci_attach(board, config);
+
+	if (rc < 0)
+		return rc;
+
+	ines_online(board->private_data, board, 0);
+	return 0;
+}
+
+/*
+ * Attach hook for the accelerated PCI interfaces: run the common PCI
+ * attach, then bring the board online with FIFO support enabled.
+ * Returns 0 or the negative result of ines_common_pci_attach().
+ */
+static int ines_pci_accel_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int rc = ines_common_pci_attach(board, config);
+
+	if (rc < 0)
+		return rc;
+
+	ines_online(board->private_data, board, 1);
+	return 0;
+}
+
+/* Number of I/O ports requested (and later released) for an ISA board */
+static const int ines_isa_iosize = 0x20;
+
+/*
+ * Attach hook for ISA boards.
+ *
+ * Claims ines_isa_iosize ports at config->ibbase, resets the nec7210,
+ * installs the interrupt handler on config->ibirq and brings the board
+ * online with FIFO support enabled.
+ *
+ * Returns 0 on success, -ENOMEM if the private data cannot be allocated,
+ * -EBUSY if the port range is already taken, or the negative errno from
+ * request_irq().  Nothing is unwound here on failure; iobase and irq in
+ * the private data record what ines_isa_detach() has to release.
+ */
+static int ines_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct ines_priv *ines_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = 0;
+	int retval;
+
+	retval = ines_generic_attach(board);
+	if (retval)
+		return retval;
+
+	ines_priv = board->private_data;
+	nec_priv = &ines_priv->nec7210_priv;
+
+	if (!request_region(config->ibbase, ines_isa_iosize, DRV_NAME)) {
+		dev_err(board->gpib_dev, "ioports at 0x%x already in use\n",
+			config->ibbase);
+		return -EBUSY;
+	}
+	nec_priv->iobase = config->ibbase;
+	nec_priv->offset = 1;
+	nec7210_board_reset(nec_priv, board);
+
+	/* propagate the real error code instead of a bare -1 */
+	retval = request_irq(config->ibirq, ines_pci_interrupt, isr_flags, DRV_NAME, board);
+	if (retval) {
+		dev_err(board->gpib_dev, "failed to allocate IRQ %d\n", config->ibirq);
+		return retval;
+	}
+	ines_priv->irq = config->ibirq;
+	ines_online(ines_priv, board, 1);
+	return 0;
+}
+
+/*
+ * Detach hook for PCI boards; releases whatever the attach path acquired.
+ *
+ * Tolerates a partially attached board: each resource is released only if
+ * its marker in the private data (irq, iobase, pci_device) is set.
+ */
+static void ines_pci_detach(struct gpib_board *board)
+{
+	struct ines_priv *ines_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (ines_priv) {
+		nec_priv = &ines_priv->nec7210_priv;
+		if (ines_priv->irq) {
+			// disable interrupts
+			switch (ines_priv->pci_chip_type) {
+			/*
+			 * This case was labelled PCI_CHIP_AMCC5920, but
+			 * plx_iobase is only ever set for PLX9050 boards, so
+			 * the PLX interrupt enable was never cleared.
+			 */
+			case PCI_CHIP_PLX9050:
+				if (ines_priv->plx_iobase)
+					outl(0, ines_priv->plx_iobase + PLX9050_INTCSR_REG);
+				break;
+			case PCI_CHIP_QUANCOM:
+				if (nec_priv->iobase)
+					outb(0, nec_priv->iobase +
+					     QUANCOM_IRQ_CONTROL_STATUS_REG);
+				break;
+			default:
+				/* no bridge-level interrupt disable for other chips */
+				break;
+			}
+			free_irq(ines_priv->irq, board);
+		}
+		if (nec_priv->iobase) {
+			nec7210_board_reset(nec_priv, board);
+			pci_release_regions(ines_priv->pci_device);
+		}
+		/* drop the reference taken by pci_get_subsys()/pci_get_device() */
+		if (ines_priv->pci_device)
+			pci_dev_put(ines_priv->pci_device);
+	}
+	ines_free_private(board);
+}
+
+/*
+ * Detach hook for ISA boards: free the IRQ if one was installed, reset the
+ * chip and release the port range if one was claimed, then free the
+ * private data.  Safe to call when no private data was allocated.
+ */
+static void ines_isa_detach(struct gpib_board *board)
+{
+	struct ines_priv *ines = board->private_data;
+	struct nec7210_priv *nec;
+
+	if (!ines)
+		goto free_priv;
+
+	nec = &ines->nec7210_priv;
+	if (ines->irq)
+		free_irq(ines->irq, board);
+	if (nec->iobase) {
+		nec7210_board_reset(nec, board);
+		release_region(nec->iobase, ines_isa_iosize);
+	}
+
+free_priv:
+	ines_free_private(board);
+}
+
+/*
+ * PCI probe callback.  Does no setup and always reports success; the
+ * hardware is located and initialised in ines_common_pci_attach().
+ */
+static int ines_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	return 0;
+}
+
+/* PCI driver description: binds ines_pci_table to the no-op ines_pci_probe() */
+static struct pci_driver ines_pci_driver = {
+	.name = "ines_gpib",
+	.id_table = ines_pci_table,
+	.probe = &ines_pci_probe
+};
+
+#ifdef CONFIG_GPIB_PCMCIA
+
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+
+#include <pcmcia/cistpl.h>
+#include <pcmcia/ds.h>
+#include <pcmcia/cisreg.h>
+
+/* NOTE(review): presumably the I/O port range size for PCMCIA cards -- confirm against its users */
+static const int ines_pcmcia_iosize = 0x20;
+
+/*
+ * The event() function is this driver's Card Services event handler.
+ * It will be called by Card Services when an appropriate card status
+ * event is received.  The config() and release() entry points are
+ * used to configure or release a socket, in response to card insertion
+ * and ejection events.  They are invoked from the gpib event
+ * handler.
+ */
+
+/* Forward declarations for the PCMCIA glue and its gpib attach/detach hooks */
+static int ines_gpib_config(struct pcmcia_device  *link);
+static void ines_gpib_release(struct pcmcia_device  *link);
+static int ines_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config);
+static int ines_pcmcia_accel_attach(struct gpib_board *board,
+				    const struct gpib_board_config *config);
+static void ines_pcmcia_detach(struct gpib_board *board);
+static int ines_common_pcmcia_attach(struct gpib_board *board);
+/*
+ * The PCMCIA device most recently handed to ines_gpib_probe().
+ *
+ * Despite the skeleton-driver wording this replaces, no linked list of
+ * instances is kept: only a single pointer is stored, so a later probe
+ * overwrites the earlier one.  Each PCMCIA card still gets its own
+ * struct local_info through link->priv.
+ *
+ * NOTE(review): presumably read by the PCMCIA attach code to find the
+ * card to bind a gpib board to -- confirm against its users.
+ */
+
+static struct pcmcia_device *curr_dev;
+
+/*
+ * A dev_link_t structure has fields for most things that are needed
+ * to keep track of a socket, but there will usually be some device
+ * specific information that also needs to be kept track of.  The
+ * 'priv' pointer in a dev_link_t structure can be used to point to
+ * a device-specific private data structure, like this.
+ *
+ * A driver needs to provide a dev_node_t structure for each device
+ * on a card.  In some cases, there is only one device per card (for
+ * example, ethernet cards, modems).  In other cases, there may be
+ * many actual or logical devices (SCSI adapters, memory cards with
+ * multiple partitions).  The dev_node_t structures need to be kept
+ * in a linked list starting at the 'dev' field of a dev_link_t
+ * structure.  We allocate them in the card's private data structure,
+ * because they generally can't be allocated dynamically.
+ */
+
+/* Per-card private data, allocated in ines_gpib_probe() and stored in link->priv */
+struct local_info {
+	/* back-pointer to the owning PCMCIA device, set at probe time */
+	struct pcmcia_device	*p_dev;
+	/* gpib board using this card, or NULL; ines_gpib_remove() detaches it if set */
+	struct gpib_board		*dev;
+	/* NOTE(review): presumably the card's manufacturer and card ids -- confirm */
+	u_short manfid;
+	u_short cardid;
+};
+
+/*
+ * PCMCIA probe callback: creates an "instance" of the driver for one card.
+ *
+ * Allocates the per-card struct local_info, describes the 32-port, 8-bit
+ * I/O window the card needs and configures the socket right away through
+ * ines_gpib_config().
+ *
+ * Returns 0 on success, -ENOMEM if the private data cannot be allocated,
+ * or the error from ines_gpib_config().  On failure the private data is
+ * freed again and curr_dev is left untouched.
+ */
+static int ines_gpib_probe(struct pcmcia_device *link)
+{
+	struct local_info *info;
+	int retval;
+
+	/* Allocate space for private device-specific data */
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->p_dev = link;
+	link->priv = info;
+
+	/* The io structure describes IO port mapping */
+	link->resource[0]->end = 32;
+	link->resource[0]->flags &= ~IO_DATA_PATH_WIDTH;
+	link->resource[0]->flags |= IO_DATA_PATH_WIDTH_8;
+	link->io_lines = 5;
+
+	/* General socket configuration */
+	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
+
+	retval = ines_gpib_config(link);
+	if (retval) {
+		/*
+		 * The remove callback does not run for a device whose probe
+		 * failed, so the private data has to be released here.
+		 */
+		link->priv = NULL;
+		kfree(info);
+		return retval;
+	}
+
+	/* publish the card only once it is fully configured */
+	curr_dev = link;
+	return 0;
+}
+
+/*
+ * PCMCIA remove callback: deletes a driver "instance".
+ *
+ * Detaches the gpib board bound to this card, if any, disables the PCMCIA
+ * device and frees the per-card private data.
+ */
+static void ines_gpib_remove(struct pcmcia_device *link)
+{
+	struct local_info *info = link->priv;
+	struct gpib_board *attached = info->dev;
+
+	if (attached)
+		ines_pcmcia_detach(attached);
+
+	ines_gpib_release(link);
+	kfree(info);
+}
+
+/*
+ * Callback for pcmcia_loop_config(): try to claim the I/O resources for
+ * the configuration being offered.  @priv_data is unused.
+ */
+static int ines_gpib_config_iteration(struct pcmcia_device *link, void *priv_data)
+{
+	return pcmcia_request_io(link);
+}
+
+/*
+ * Configure the PCMCIA socket and make the card usable.  Called from
+ * ines_gpib_probe() and ines_gpib_resume().
+ *
+ * Picks a configuration whose I/O window can be claimed, maps a window of
+ * attribute memory to write the I/O window base into the card's
+ * configuration registers, then enables the device.
+ *
+ * Returns 0 on success.  On failure the device is disabled again through
+ * ines_gpib_release() and -ENODEV is returned (-ENOMEM if the attribute
+ * memory window cannot be mapped).
+ */
+static int ines_gpib_config(struct pcmcia_device *link)
+{
+	int retval;
+	void __iomem *virt;
+
+	retval = pcmcia_loop_config(link, &ines_gpib_config_iteration, NULL);
+	if (retval) {
+		dev_warn(&link->dev, "no configuration found\n");
+		ines_gpib_release(link);
+		return -ENODEV;
+	}
+
+	dev_dbg(&link->dev, "ines_cs: manufacturer: 0x%x card: 0x%x\n",
+		link->manf_id, link->card_id);
+
+	/*
+	 * for the ines card we have to setup the configuration registers in
+	 * attribute memory here
+	 */
+	link->resource[2]->flags |= WIN_MEMORY_TYPE_AM | WIN_DATA_WIDTH_8 | WIN_ENABLE;
+	link->resource[2]->end = 0x1000;
+	retval = pcmcia_request_window(link, link->resource[2], 250);
+	if (retval) {
+		dev_warn(&link->dev, "pcmcia_request_window failed\n");
+		ines_gpib_release(link);
+		return -ENODEV;
+	}
+	retval = pcmcia_map_mem_page(link, link->resource[2], 0);
+	if (retval) {
+		dev_warn(&link->dev, "pcmcia_map_mem_page failed\n");
+		ines_gpib_release(link);
+		return -ENODEV;
+	}
+	virt = ioremap(link->resource[2]->start, resource_size(link->resource[2]));
+	if (!virt) {
+		/* ioremap() can fail; writing through a NULL mapping would oops */
+		dev_warn(&link->dev, "ioremap failed\n");
+		ines_gpib_release(link);
+		return -ENOMEM;
+	}
+	writeb((link->resource[2]->start >> 2) & 0xff, virt + 0xf0); // IOWindow base
+	iounmap(virt);
+
+	/*
+	 * This actually configures the PCMCIA socket -- setting up
+	 * the I/O windows and the interrupt mapping.
+	 */
+	retval = pcmcia_enable_device(link);
+	if (retval) {
+		ines_gpib_release(link);
+		return -ENODEV;
+	}
+	return 0;
+} /* gpib_config */
+
+/*
+ * Release the PCMCIA configuration by disabling the device.  Used when a
+ * card is removed and on every failure path of ines_gpib_config().
+ * Nothing else is unregistered or freed here.
+ */
+
+static void ines_gpib_release(struct pcmcia_device *link)
+{
+	pcmcia_disable_device(link);
+} /* gpib_release */
+
+/*
+ * PCMCIA suspend callback.  Only logs an error if the device is still
+ * open; no state is saved and 0 is always returned.
+ */
+static int ines_gpib_suspend(struct pcmcia_device *link)
+{
+	if (link->open)
+		dev_err(&link->dev, "Device still open\n");
+
+	return 0;
+}
+
+/*
+ * PCMCIA resume callback: reconfigure the socket from scratch through
+ * ines_gpib_config() and return its result.  An attached gpib board is
+ * not re-initialised here.
+ */
+static int ines_gpib_resume(struct pcmcia_device *link)
+{
+	return ines_gpib_config(link);
+}
+
+/*
+ * PCMCIA manufacturer/card ids handled by this module, exported through
+ * MODULE_DEVICE_TABLE().  The table is never modified, hence const.
+ */
+static const struct pcmcia_device_id ines_pcmcia_ids[] = {
+	PCMCIA_DEVICE_MANF_CARD(0x01b4, 0x4730),
+	PCMCIA_DEVICE_NULL
+};
+MODULE_DEVICE_TABLE(pcmcia, ines_pcmcia_ids);
+
+/* PCMCIA driver description tying ines_pcmcia_ids to the probe/remove/suspend/resume callbacks */
+static struct pcmcia_driver ines_gpib_cs_driver = {
+	.owner		= THIS_MODULE,
+	.name		= "ines_gpib_cs",
+	.id_table	= ines_pcmcia_ids,
+	.probe		= ines_gpib_probe,
+	.remove		= ines_gpib_remove,
+	.suspend	= ines_gpib_suspend,
+	.resume		= ines_gpib_resume,
+};
+
+/* Unregister the PCMCIA driver. */
+static void ines_pcmcia_cleanup_module(void)
+{
+	pcmcia_unregister_driver(&ines_gpib_cs_driver);
+}
+
+/*
+ * "ines_pcmcia_unaccel": PCMCIA cards attached through
+ * ines_pcmcia_attach(), with the plain nec7210 read/write wrappers
+ * (ines_read / ines_write) instead of the FIFO-assisted ones.
+ */
+static struct gpib_interface ines_pcmcia_unaccel_interface = {
+	.name = "ines_pcmcia_unaccel",
+	.attach = ines_pcmcia_attach,
+	.detach = ines_pcmcia_detach,
+	.read = ines_read,
+	.write = ines_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+static struct gpib_interface ines_pcmcia_accel_interface = {
+	.name = "ines_pcmcia_accel",	/* PCMCIA board using the ines_accel_* transfers */
+	.attach = ines_pcmcia_accel_attach,	/* finishes with ines_online(..., 1) */
+	.detach = ines_pcmcia_detach,	/* shared by all three PCMCIA tables */
+	.read = ines_accel_read,
+	.write = ines_accel_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, /* XXX: no handler supplied */
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+static struct gpib_interface ines_pcmcia_interface = {
+	.name = "ines_pcmcia",	/* same callbacks as ines_pcmcia_accel_interface; only the name differs */
+	.attach = ines_pcmcia_accel_attach,	/* finishes with ines_online(..., 1) */
+	.detach = ines_pcmcia_detach,	/* shared by all three PCMCIA tables */
+	.read = ines_accel_read,
+	.write = ines_accel_write,
+	.command = ines_command,
+	.take_control = ines_take_control,
+	.go_to_standby = ines_go_to_standby,
+	.request_system_control = ines_request_system_control,
+	.interface_clear = ines_interface_clear,
+	.remote_enable = ines_remote_enable,
+	.enable_eos = ines_enable_eos,
+	.disable_eos = ines_disable_eos,
+	.parallel_poll = ines_parallel_poll,
+	.parallel_poll_configure = ines_parallel_poll_configure,
+	.parallel_poll_response = ines_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, /* XXX: no handler supplied */
+	.line_status = ines_line_status,
+	.update_status = ines_update_status,
+	.primary_address = ines_primary_address,
+	.secondary_address = ines_secondary_address,
+	.serial_poll_response = ines_serial_poll_response,
+	.serial_poll_status = ines_serial_poll_status,
+	.t1_delay = ines_t1_delay,
+	.return_to_local = ines_return_to_local,
+};
+
+static irqreturn_t ines_pcmcia_interrupt(int irq, void *arg)
+{
+	/* arg is the board cookie handed to request_irq(); irq is unused. */
+
+	return ines_interrupt((struct gpib_board *)arg);
+}
+
+static int ines_common_pcmcia_attach(struct gpib_board *board)
+{
+	struct ines_priv *ines_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	/*
+	 * Shared by both PCMCIA attach variants; returns 0 or a negative errno.
+	 * Whatever was claimed stays recorded for ines_pcmcia_detach() to release.
+	 */
+	if (!curr_dev) {
+		dev_err(board->gpib_dev, "no ines pcmcia cards found\n");
+		return -ENODEV;
+	}
+	retval = ines_generic_attach(board);
+	if (retval)
+		return retval;
+	ines_priv = board->private_data;
+	nec_priv = &ines_priv->nec7210_priv;
+
+	if (!request_region(curr_dev->resource[0]->start,
+			    resource_size(curr_dev->resource[0]), DRV_NAME)) {
+		dev_err(board->gpib_dev, "ioports at 0x%lx already in use\n",
+			(unsigned long)(curr_dev->resource[0]->start));
+		return -EBUSY;
+	}
+	nec_priv->iobase = curr_dev->resource[0]->start;
+	nec7210_board_reset(nec_priv, board);
+
+	retval = request_irq(curr_dev->irq, ines_pcmcia_interrupt, IRQF_SHARED,
+			     "pcmcia-gpib", board);
+	if (retval) {
+		dev_err(board->gpib_dev, "can't request IRQ %d\n", curr_dev->irq);
+		return retval;
+	}
+	ines_priv->irq = curr_dev->irq;
+	return 0;
+}
+
+static int ines_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct ines_priv *priv;
+	int err = ines_common_pcmcia_attach(board);
+
+	if (err < 0)	/* only negative results abort the attach */
+		return err;
+
+	/* Same as ines_pcmcia_accel_attach() except for the final 0 here. */
+	priv = board->private_data;
+	ines_online(priv, board, 0);
+
+	return 0;
+}
+
+static int ines_pcmcia_accel_attach(struct gpib_board *board,
+				    const struct gpib_board_config *config)
+{
+	struct ines_priv *priv;
+	int err = ines_common_pcmcia_attach(board);
+
+	if (err < 0)	/* only negative results abort the attach */
+		return err;
+
+	/* Same as ines_pcmcia_attach() except for the final 1 here. */
+	priv = board->private_data;
+	ines_online(priv, board, 1);
+
+	return 0;
+}
+
+static void ines_pcmcia_detach(struct gpib_board *board)
+{
+	struct ines_priv *priv = board->private_data;
+
+	/* Undo only what the attach path recorded; the private data goes last. */
+	if (priv && priv->irq)
+		free_irq(priv->irq, board);
+	if (priv && priv->nec7210_priv.iobase) {
+		struct nec7210_priv *nec = &priv->nec7210_priv;
+
+		nec7210_board_reset(nec, board);
+		release_region(nec->iobase, ines_pcmcia_iosize);
+	}
+	/* NOTE(review): attach claimed resource_size() bytes - confirm it equals ines_pcmcia_iosize */
+	ines_free_private(board);
+}
+
+#endif /* CONFIG_GPIB_PCMCIA */
+
+static int __init ines_init_module(void)
+{
+	int ret;
+
+	ret = pci_register_driver(&ines_pci_driver);	/* first step: nothing to unwind yet */
+	if (ret) {
+		pr_err("pci_register_driver failed: error = %d\n", ret);
+		return ret;
+	}
+	/* Every later failure jumps to the label that undoes the steps before it. */
+	ret = gpib_register_driver(&ines_pci_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci;
+	}
+
+	ret = gpib_register_driver(&ines_pci_unaccel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci_unaccel;
+	}
+
+	ret = gpib_register_driver(&ines_pci_accel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pci_accel;
+	}
+
+	ret = gpib_register_driver(&ines_isa_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_isa;
+	}
+
+#ifdef CONFIG_GPIB_PCMCIA
+	ret = gpib_register_driver(&ines_pcmcia_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia;
+	}
+
+	ret = gpib_register_driver(&ines_pcmcia_unaccel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_unaccel;
+	}
+
+	ret = gpib_register_driver(&ines_pcmcia_accel_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_accel;
+	}
+
+	ret = pcmcia_register_driver(&ines_gpib_cs_driver);
+	if (ret) {
+		pr_err("pcmcia_register_driver failed: error = %d\n", ret);
+		goto err_pcmcia_driver;
+	}
+#endif
+
+	return 0;
+	/* Unwind ladder: each label falls through to the ones below it. */
+#ifdef CONFIG_GPIB_PCMCIA
+err_pcmcia_driver:
+	gpib_unregister_driver(&ines_pcmcia_accel_interface);
+err_pcmcia_accel:
+	gpib_unregister_driver(&ines_pcmcia_unaccel_interface);
+err_pcmcia_unaccel:
+	gpib_unregister_driver(&ines_pcmcia_interface);
+err_pcmcia:
+#endif
+	gpib_unregister_driver(&ines_isa_interface);	/* only entered from the PCMCIA labels above */
+err_isa:
+	gpib_unregister_driver(&ines_pci_accel_interface);
+err_pci_accel:
+	gpib_unregister_driver(&ines_pci_unaccel_interface);
+err_pci_unaccel:
+	gpib_unregister_driver(&ines_pci_interface);
+err_pci:
+	pci_unregister_driver(&ines_pci_driver);
+
+	return ret;
+}
+
+static void __exit ines_exit_module(void)
+{
+	gpib_unregister_driver(&ines_pci_interface);	/* gpib interfaces go first */
+	gpib_unregister_driver(&ines_pci_unaccel_interface);
+	gpib_unregister_driver(&ines_pci_accel_interface);
+	gpib_unregister_driver(&ines_isa_interface);
+#ifdef CONFIG_GPIB_PCMCIA
+	gpib_unregister_driver(&ines_pcmcia_interface);
+	gpib_unregister_driver(&ines_pcmcia_unaccel_interface);
+	gpib_unregister_driver(&ines_pcmcia_accel_interface);
+	ines_pcmcia_cleanup_module();	/* unregisters ines_gpib_cs_driver */
+#endif
+	/* The PCI driver, registered first at init, is removed last. */
+	pci_unregister_driver(&ines_pci_driver);
+}
+
+module_init(ines_init_module);	/* registers the PCI driver and all gpib interfaces */
+module_exit(ines_exit_module);	/* unregisters everything ines_init_module() registered */
diff --git a/drivers/gpib/lpvo_usb_gpib/Makefile b/drivers/gpib/lpvo_usb_gpib/Makefile
new file mode 100644
index 000000000000..360553488e6d
--- /dev/null
+++ b/drivers/gpib/lpvo_usb_gpib/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_GPIB_LPVO) += lpvo_usb_gpib.o
+
diff --git a/drivers/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
new file mode 100644
index 000000000000..dd68c4843490
--- /dev/null
+++ b/drivers/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
@@ -0,0 +1,2025 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *  This code has been developed at the Department of Physics (University  *
+ *  of Florence, Italy) to support in linux-gpib the open usb-gpib adapter *
+ *  implemented at the University of Ljubljana (lpvo.fe.uni-lj.si/gpib)	   *
+ *									   *
+ *  copyright		 : (C) 2011 Marcello Carla'			   *
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define NAME KBUILD_MODNAME
+
+/* base module includes */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/sched/signal.h>
+#include <linux/usb.h>
+
+#include "gpibP.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for LPVO usb devices");
+
+/*
+ * Table of devices that work with this driver.
+ *
+ * Currently, only one device is known to be used in the
+ * lpvo_usb_gpib adapter (FTDI 0403:6001).
+ * If your adapter uses a different chip, insert a line
+ * in the following table with proper <Vendor-id>, <Product-id>.
+ *
+ * To have your chip automatically handled by the driver,
+ * update files "/usr/local/etc/modprobe.d/lpvo_usb_gpib.conf"
+ * and /usr/local/etc/udev/rules.d/99-lpvo_usb_gpib.rules.
+ *
+ */
+
+static const struct usb_device_id skel_table[] = {
+	{ USB_DEVICE(0x0403, 0x6001) },		   /* FTDI 0403:6001 */
+	{ }					   /* Terminating entry */
+};
+MODULE_DEVICE_TABLE(usb, skel_table);
+
+/*
+ *   ***  Diagnostics and Debug  ***
+ * To enable the diagnostic and debug messages either compile with DEBUG set
+ * or control via the dynamic debug mechanisms.
+ * The module parameter "debug" controls the sending of debug messages to
+ * syslog. By default it is set to 0
+ * debug = 0: only attach/detach messages are sent
+ *         1: every action is logged
+ *         2: extended logging; each single exchanged byte is documented
+ *	(about twice the log volume of [1])
+ * To switch debug level:
+ *         At module loading:  modprobe lpvo_usb_gpib debug={0,1,2}
+ *         On the fly: echo {0,1,2} > /sys/module/lpvo_usb_gpib/parameters/debug
+ */
+
+static int debug;	/* verbosity threshold tested by DIA_LOG() */
+module_param(debug, int, 0644);
+/* DIA_LOG() expands to a use of a variable named 'board', which must be in scope. */
+#define DIA_LOG(level, format, ...)			   \
+	do { if (debug >= (level))					\
+			dev_dbg(board->gpib_dev, format, ## __VA_ARGS__); } \
+	while (0)
+
+#define WQT wait_queue_entry_t	/* type of one entry queued on board->wait */
+#define WQH head	/* list-head member of board->wait */
+#define WQE entry	/* list member inside a WQT */
+
+/* standard and extended command sets of the usb-gpib adapter */
+
+#define USB_GPIB_ON	 "\nIB\n"	/* first command sent by usb_gpib_attach() */
+#define USB_GPIB_OFF	 "\nIBO\n"	/* sent by usb_gpib_detach() */
+#define USB_GPIB_IBm0	 "\nIBm0\n"   /* do not assert REN with IFC */
+#define USB_GPIB_IBm1	 "\nIBm1\n"   /* assert REN with IFC */
+#define USB_GPIB_IBCL	 "\nIBZ\n"	/* issued by usb_gpib_interface_clear() to pulse IFC */
+#define USB_GPIB_STATUS	 "\nIBS\n"	/* reply byte is decoded by usb_gpib_line_status() */
+#define USB_GPIB_READ	 "\nIB?\n"	/* multi-byte read, reply framed by DLE sequences */
+#define USB_GPIB_READ_1	 "\nIBB\n"	/* single byte read */
+#define USB_GPIB_EOI	 "\nIBe0\n"
+#define USB_GPIB_FTMO	 "\nIBf0\n"    /* disable first byte timeout */
+#define USB_GPIB_TTMOZ	 "\nIBt0\n"    /* disable byte timeout */
+
+/* incomplete commands: set_timeout() appends the value and a newline */
+
+#define USB_GPIB_BTMO	 "\nIBt"      /* set byte timeout */
+#define USB_GPIB_TTMO	 "\nIBT"      /* set total timeout */
+
+#define USB_GPIB_DEBUG_ON    "\nIBDE\xAA\n"	/* enables the extended instruction set */
+#define USB_GPIB_SET_LISTEN  "\nIBDT0\n"
+#define USB_GPIB_SET_TALK    "\nIBDT1\n"
+#define USB_GPIB_SET_LINES   "\nIBDC.\n"	/* '.' is replaced by the line byte */
+#define USB_GPIB_SET_DATA    "\nIBDM.\n"
+#define USB_GPIB_READ_LINES  "\nIBD?C\n"	/* read back before changing a line */
+#define USB_GPIB_READ_DATA   "\nIBD?M\n"	/* used for the parallel poll result */
+#define USB_GPIB_READ_BUS    "\nIBD??\n"
+
+/* command sequences */
+
+#define USB_GPIB_UNTALK "\nIBC_\n"	/* sent after a multi-byte read */
+#define USB_GPIB_UNLISTEN "\nIBC?\n"	/* sent after a write */
+
+/* special characters used by the adapter */
+
+#define DLE ('\020')	/* escape byte: <DLE><DLE> is a literal DLE in read data */
+#define STX ('\02')	/* <DLE><STX> opens a data frame */
+#define ETX ('\03')	/* <DLE><ETX> closes a data frame */
+#define ACK ('\06')	/* reply byte of an accepted command */
+#define NODATA ('\03')
+#define NODAV ('\011')
+/* bit masks handed to set_control_line() to select a line of the line byte */
+#define IB_BUS_REN  0x01
+#define IB_BUS_IFC  0x02
+#define IB_BUS_NDAC 0x04
+#define IB_BUS_NRFD 0x08
+#define IB_BUS_DAV  0x10
+#define IB_BUS_EOI  0x20
+#define IB_BUS_ATN  0x40
+#define IB_BUS_SRQ  0x80
+
+#define INBUF_SIZE 128	/* size of char_buf.inbuf and of each refill request */
+
+struct char_buf {		/* used by one_char() routine */
+	char *inbuf;		/* buffer refilled through skel_do_read() */
+	int last;		/* count returned by the most recent refill */
+	int nchar;		/* bytes of that refill not yet handed out */
+};
+
+struct usb_gpib_priv {		/* private data to the device */
+	u8 eos;			/* eos character */
+	short eos_flags;	/* eos mode */
+	int timeout;		/* last board->usec_timeout accepted by the adapter */
+	void *dev;		/* the usb device private data structure */
+};
+/* GPIB_DEV needs a variable named 'board' with non-NULL private_data in scope */
+#define GPIB_DEV (((struct usb_gpib_priv *)board->private_data)->dev)
+/* show_status() - dump the board fields at debug level 2 (no adapter access) */
+static void show_status(struct gpib_board *board)
+{
+	DIA_LOG(2, "# - buffer_length %d\n", board->buffer_length);
+	DIA_LOG(2, "# - status %lx\n", board->status);
+	DIA_LOG(2, "# - use_count %d\n", board->use_count);
+	DIA_LOG(2, "# - pad %x\n", board->pad);
+	DIA_LOG(2, "# - sad %x\n", board->sad);
+	DIA_LOG(2, "# - timeout %d\n", board->usec_timeout);
+	DIA_LOG(2, "# - ppc %d\n", board->parallel_poll_configuration);
+	DIA_LOG(2, "# - t1delay %d\n", board->t1_nano_sec);
+	DIA_LOG(2, "# - online %d\n", board->online);
+	DIA_LOG(2, "# - autopoll %d\n", board->autospollers);
+	DIA_LOG(2, "# - autopoll task %p\n", board->autospoll_task);
+	DIA_LOG(2, "# - minor %d\n", board->minor);
+	DIA_LOG(2, "# - master %d\n", board->master);
+	DIA_LOG(2, "# - ist %d\n", board->ist);	/* label matches the field printed */
+}
+
+/*
+ * GLOBAL VARIABLES: required for
+ * pairing gpib minors with usb minors.
+ * MAX_DEV is the max number of usb-gpib adapters; free
+ * to change as you like, but no more than 32 (slots are tracked in an int mask)
+ */
+
+#define MAX_DEV 8
+static struct usb_interface *lpvo_usb_interfaces[MAX_DEV];   /* registered interfaces */
+static int usb_minors[MAX_DEV];			   /* usb minors */
+static int assigned_usb_minors;		   /* mask of filled slots: bit j <-> index j */
+static struct mutex minors_lock;     /* operations on usb_minors are to be protected */
+
+/*
+ * usb-skeleton prototypes (static: the definitions are elsewhere in this file)
+ */
+
+struct usb_skel;
+static ssize_t skel_do_write(struct usb_skel *, const char *, size_t);
+static ssize_t skel_do_read(struct usb_skel *, char *, size_t);
+static int skel_do_open(struct gpib_board *, int);
+static int skel_do_release(struct gpib_board *);
+
+/*
+ *  usec_diff() - elapsed time from *b to *a in MICROsec, both being
+ *		 timespec64 values (unix time in sec and NANOsec)
+ */
+static inline int usec_diff(struct timespec64 *a, struct timespec64 *b)
+{
+	s64 usec = (a->tv_sec - b->tv_sec) * 1000000;
+
+	return usec + (a->tv_nsec - b->tv_nsec) / 1000;
+}
+
+/*
+ *  ***  these routines are specific to the usb-gpib adapter  ***
+ */
+
+/**
+ * write_loop() - Hand a byte sequence to skel_do_write()
+ *
+ * @dev:      the private device structure (passed on as struct usb_skel *)
+ * @msg:      the byte sequence.
+ * @leng:     the byte sequence length.
+ *
+ * Return: the skel_do_write() result, converted to int.
+ */
+static int write_loop(void *dev, char *msg, int leng)
+{
+	return (int)skel_do_write((struct usb_skel *)dev, msg, (size_t)leng);
+}
+
+/**
+ * send_command() - Send a byte sequence and return a single byte reply.
+ *
+ * @board:    the gpib_board_struct data area for this gpib interface
+ * @msg:      the byte sequence; need not be NUL terminated when @leng != 0
+ * @leng:     the byte sequence length; can be given as zero and is
+ *	      computed automatically, but if 'msg' contains a zero byte,
+ *	      it has to be given explicitly.
+ *
+ * Return: the reply byte (0..255), a negative errno from the transfer, or
+ *	   -EIO when the reply is not exactly one byte long.
+ */
+static int send_command(struct gpib_board *board, char *msg, int leng)
+{
+	char buffer[64];
+	int nchar;
+	int retval;
+	struct timespec64 before, after;
+
+	ktime_get_real_ts64(&before);
+	if (!leng)
+		leng = strlen(msg);
+	retval = write_loop(GPIB_DEV, msg, leng);
+	if (retval < 0)
+		return retval;
+
+	nchar = skel_do_read(GPIB_DEV, buffer, sizeof(buffer));
+	if (nchar < 0) {
+		dev_err(board->gpib_dev, " return from read: %d\n", nchar);
+		return nchar;
+	}
+	if (nchar != 1) {
+		/* bounded print: with an explicit @leng, @msg may lack a terminator */
+		dev_err(board->gpib_dev, " Irregular reply to command: %.*s\n", leng, msg);
+		return -EIO;
+	}
+	ktime_get_real_ts64(&after);
+	DIA_LOG(1, "Sent %d - done %d us.\n", leng, usec_diff(&after, &before));
+	return buffer[0] & 0xff;
+}
+
+/*
+ * set_control_line() - Set the value of a single gpib control line
+ *
+ * @board:    the gpib_board_struct data area for this gpib interface
+ * @line:     line mask
+ * @value:    line new value (0/1); 1 clears the line's bit in the adapter byte
+ *
+ * Reads the current line byte, changes the bits in @line and writes it back.
+ * Return: the adapter's reply byte (ACK on success) or a negative errno.
+ */
+static int set_control_line(struct gpib_board *board, int line, int value)
+{
+	char msg[] = USB_GPIB_SET_LINES;
+	int retval;
+	int leng = strlen(msg);
+
+	DIA_LOG(1, "setting line %x to %x\n", line, value);
+
+	retval = send_command(board, USB_GPIB_READ_LINES, 0);
+	DIA_LOG(1, "old line values: %x\n", retval);
+	if (retval < 0)		/* any errno, not just -EIO, must not be used as data */
+		return retval;
+
+	/* The '.' placeholder becomes the new line byte. */
+	msg[leng - 2] = value ? (retval & ~line) : retval | line;
+
+	/* Explicit length: the line byte may be zero, where strlen() would stop. */
+	retval = send_command(board, msg, leng);
+	DIA_LOG(1, "operation result: %x\n", retval);
+	return retval;
+}
+
+/*
+ * one_char() - read one single byte from input buffer
+ *
+ * @board:	the gpib_board_struct data area for this gpib interface
+ * @b:		the routine private data structure, refilled when empty
+ *
+ * Return: the next byte, or -EIO if the refill fails or yields no data.
+ */
+static int one_char(struct gpib_board *board, struct char_buf *b)
+{
+	struct timespec64 before, after;
+
+	if (b->nchar > 0) {
+		DIA_LOG(2, "-> %x\n", b->inbuf[b->last - b->nchar]);
+		return b->inbuf[b->last - b->nchar--];
+	}
+	ktime_get_real_ts64(&before);
+	b->nchar = skel_do_read(GPIB_DEV, b->inbuf, INBUF_SIZE);
+	ktime_get_real_ts64(&after);
+	DIA_LOG(2, "read %d bytes in %d usec\n", b->nchar, usec_diff(&after, &before));
+	if (b->nchar <= 0) {
+		b->nchar = 0;	/* a negative count would index past inbuf on later calls */
+		b->last = 0;
+		return -EIO;
+	}
+	b->last = b->nchar;
+	DIA_LOG(2, "--> %x\n", b->inbuf[b->last - b->nchar]);
+	return b->inbuf[b->last - b->nchar--];
+}
+
+/**
+ * set_timeout() - set single byte / total timeouts on the adapter
+ *
+ * @board:    the gpib_board_struct data area for this gpib interface
+ *
+ *	   To save time the adapter is reprogrammed only when the requested
+ *	   value differs from the saved one. Minimum allowed timeout
+ *	   is 30 ms (T30ms -> 8); timeout disable (TNONE -> 0) currently
+ *	   not supported.
+ */
+static void set_timeout(struct gpib_board *board)
+{
+	struct usb_gpib_priv *priv = board->private_data;
+	char command[sizeof(USB_GPIB_TTMO) + 6];
+	int ticks, reply;
+
+	if (priv->timeout == board->usec_timeout)
+		return;
+
+	ticks = (board->usec_timeout + 32767) / 32768;
+	if (ticks < 2)
+		ticks = 2;
+
+	DIA_LOG(1, "Set timeout to %d us -> %d\n", board->usec_timeout, ticks);
+	sprintf(command, "%s%d\n", USB_GPIB_BTMO, ticks > 255 ? 255 : ticks);
+	reply = send_command(board, command, 0);
+	if (reply != ACK)
+		goto failed;
+
+	if (ticks > 65535)
+		ticks = 65535;
+	sprintf(command, "%s%d\n", USB_GPIB_TTMO, ticks);
+	reply = send_command(board, command, 0);
+	if (reply != ACK)
+		goto failed;
+
+	priv->timeout = board->usec_timeout;
+	return;
+failed:
+	dev_err(board->gpib_dev, "error in timeout set: <%s>\n", command);
+}
+
+/*
+ * now the standard interface functions - attach and detach
+ */
+
+/**
+ * usb_gpib_attach() - activate the usb-gpib converter board
+ *
+ * @board:    the gpib_board_struct data area for this gpib interface
+ * @config:   firmware data, if any (from gpib_config -I <file>)
+ *
+ * The channel name is ttyUSBn, with n=0 by default. Other values for n
+ * passed with gpib_config -b <n>.
+ *
+ * In this routine I trust that when an error code is returned
+ * detach() will be called. Always.
+ * Return: 0 on success, -EIO, -ENOMEM or -ENODEV on failure.
+ */
+static int usb_gpib_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int retval, j;
+	u32 base = config->ibbase;
+	char *device_path;
+	int match;
+	struct usb_device *udev;
+
+	DIA_LOG(0, "Board %p -t %s -m %d -a %p -u %d -l %d -b %d\n",
+		board, board->interface->name, board->minor, config->device_path,
+		config->pci_bus, config->pci_slot, base);
+
+	board->private_data = NULL;  /* to be sure - we can detach before setting */
+
+	/* identify device to be attached */
+	/* udev is only borrowed while minors_lock is held: no usb_get_dev() to balance */
+	mutex_lock(&minors_lock);
+
+	if (config->device_path) {
+		/* if config->device_path given, try that first */
+		for (j = 0 ; j < MAX_DEV ; j++) {
+			if ((assigned_usb_minors & 1 << j) == 0)
+				continue;
+			udev = interface_to_usbdev(lpvo_usb_interfaces[j]);
+			device_path = kobject_get_path(&udev->dev.kobj, GFP_KERNEL);
+			match = gpib_match_device_path(&lpvo_usb_interfaces[j]->dev,
+						       config->device_path);
+			DIA_LOG(1, "dev. %d: minor %d  path: %s --> %d\n", j,
+				lpvo_usb_interfaces[j]->minor, device_path, match);
+			kfree(device_path);
+			if (match)
+				break;
+		}
+	} else if (config->pci_bus != -1 && config->pci_slot != -1) {
+		/* second: look for bus and slot */
+		for (j = 0 ; j < MAX_DEV ; j++) {
+			if ((assigned_usb_minors & 1 << j) == 0)
+				continue;
+			udev = interface_to_usbdev(lpvo_usb_interfaces[j]);
+			DIA_LOG(1, "dev. %d: bus %d -> %d  dev: %d -> %d\n", j,
+				udev->bus->busnum, config->pci_bus, udev->devnum, config->pci_slot);
+			if (config->pci_bus == udev->bus->busnum &&
+			    config->pci_slot == udev->devnum)
+				break;
+		}
+	} else {		/* last chance: usb_minor, given as ibbase */
+		for (j = 0 ; j < MAX_DEV ; j++) {
+			if (usb_minors[j] == base && assigned_usb_minors & 1 << j)
+				break;
+		}
+	}
+	mutex_unlock(&minors_lock);
+
+	if (j == MAX_DEV) {
+		dev_err(board->gpib_dev, "Requested device is not registered.\n");
+		return -EIO;
+	}
+
+	board->private_data = kzalloc(sizeof(struct usb_gpib_priv), GFP_KERNEL);
+	if (!board->private_data)
+		return -ENOMEM;
+
+	retval = skel_do_open(board, usb_minors[j]);
+
+	DIA_LOG(1, "Skel open: %d\n", retval);
+
+	if (retval) {
+		dev_err(board->gpib_dev, "skel open failed.\n");
+		kfree(board->private_data);
+		board->private_data = NULL;
+		return -ENODEV;
+	}
+
+	show_status(board);
+
+	retval = send_command(board, USB_GPIB_ON, 0);
+	DIA_LOG(1, "USB_GPIB_ON returns %x\n", retval);
+	if (retval != ACK)
+		return -EIO;
+
+	/*
+	 * We must setup debug mode because we need the extended instruction
+	 * set to cope with the Core (gpib_common) point of view
+	 */
+
+	retval = send_command(board, USB_GPIB_DEBUG_ON, 0);
+	DIA_LOG(1, "USB_GPIB_DEBUG_ON returns %x\n", retval);
+	if (retval != ACK)
+		return -EIO;
+
+	/*
+	 * We must keep REN off after an IFC because so it is
+	 * assumed by the Core
+	 */
+
+	retval = send_command(board, USB_GPIB_IBm0, 0);
+	DIA_LOG(1, "USB_GPIB_IBm0 returns %x\n", retval);
+	if (retval != ACK)
+		return -EIO;
+
+	retval = set_control_line(board, IB_BUS_REN, 0);
+	if (retval != ACK)
+		return -EIO;
+
+	retval = send_command(board, USB_GPIB_FTMO, 0);
+	DIA_LOG(1, "USB_GPIB_FTMO returns %x\n", retval);
+	if (retval != ACK)
+		return -EIO;
+
+	show_status(board);
+	DIA_LOG(0, "attached\n");
+	return 0;
+}
+
+/**
+ * usb_gpib_detach() - deactivate the usb-gpib converter board
+ *
+ * @board:    the gpib_board data area for this gpib interface
+ *
+ * Safe to call with no private data attached; frees it otherwise.
+ */
+static void usb_gpib_detach(struct gpib_board *board)
+{
+	struct usb_gpib_priv *priv;
+	int status;
+
+	show_status(board);
+	DIA_LOG(0, "detaching\n");
+	priv = board->private_data;
+	if (priv && priv->dev) {
+		write_loop(priv->dev, USB_GPIB_OFF, strlen(USB_GPIB_OFF));
+		msleep(100);
+		DIA_LOG(1, "%s", "GPIB off\n");
+		status = skel_do_release(board);
+		DIA_LOG(1, "skel release -> %d\n", status);
+	}
+	if (priv) {
+		kfree(board->private_data);
+		board->private_data = NULL;
+	}
+
+	DIA_LOG(0, "detached\n");
+}
+
+/*
+ *   Other functions follow in alphabetical order
+ */
+/* command - send @length command bytes one at a time; 0 or a negative errno */
+static int usb_gpib_command(struct gpib_board *board,
+			    u8 *buffer,
+			    size_t length,
+			    size_t *bytes_written)
+{
+	size_t i;
+	int retval;
+	char command[6] = "IBc.\n";	/* '.' is replaced by each byte in turn */
+
+	DIA_LOG(1, "enter %p\n", board);
+	set_timeout(board);
+
+	*bytes_written = 0;
+	for (i = 0 ; i < length ; i++) {
+		command[3] = buffer[i];
+		retval = send_command(board, command, 5);
+		DIA_LOG(2, "%zu ==> %x %x\n", i, buffer[i], retval);
+		if (retval != ACK)	/* a stray reply byte must not look like success */
+			return retval < 0 ? retval : -EIO;
+		++(*bytes_written);
+	}
+	return 0;
+}
+
+/**
+ * usb_gpib_disable_eos() - Disable END on eos byte (END on EOI only)
+ *
+ * @board:    the gpib_board data area for this gpib interface
+ *
+ *   With the lpvo adapter eos can only be handled via software.
+ *   Nothing can be sent to the adapter here; just record the request.
+ */
+static void usb_gpib_disable_eos(struct gpib_board *board)
+{
+	struct usb_gpib_priv *pd = board->private_data;
+
+	pd->eos_flags &= ~REOS;
+	DIA_LOG(1, "done: %x\n", pd->eos_flags);
+}
+
+/**
+ * usb_gpib_enable_eos() - Enable END for reads when eos byte is received.
+ *
+ * @board:    the gpib_board data area for this gpib interface
+ * @eos_byte: the 'eos' byte
+ * @compare_8_bits: if zero ignore eighth bit when comparing
+ *
+ * Only records the setting in the private data. Return: always 0.
+ */
+static int usb_gpib_enable_eos(struct gpib_board *board,
+			       u8 eos_byte,
+			       int compare_8_bits)
+{
+	struct usb_gpib_priv *priv = board->private_data;
+
+	DIA_LOG(1, "enter with %x\n", eos_byte);
+
+	priv->eos = eos_byte;
+	priv->eos_flags = compare_8_bits ? (REOS | BIN) : REOS;
+
+	return 0;
+}
+
+/**
+ * usb_gpib_go_to_standby() - De-assert ATN
+ *
+ * @board:    the gpib_board data area for this gpib interface
+ *
+ * Return: 0 if the adapter acknowledged, -EIO otherwise.
+ */
+static int usb_gpib_go_to_standby(struct gpib_board *board)
+{
+	int reply;
+
+	reply = set_control_line(board, IB_BUS_ATN, 0);
+	DIA_LOG(1, "done with %x\n", reply);
+
+	return reply == ACK ? 0 : -EIO;
+}
+
+/**
+ * usb_gpib_interface_clear() - Assert or de-assert IFC
+ *
+ * @board:    the gpib_board data area for this gpib interface
+ * @assert:   1: assert IFC;  0: de-assert IFC
+ *
+ *    On an assert request the lpvo IBZ command is issued, which cycles IFC
+ *    low for 100 usec; the de-assert request is then ignored.
+ */
+static void usb_gpib_interface_clear(struct gpib_board *board, int assert)
+{
+	int reply;
+
+	DIA_LOG(1, "enter with %d\n", assert);
+
+	if (!assert) {
+		DIA_LOG(1, "done with %d %d\n", assert, 0);
+		return;
+	}
+
+	reply = send_command(board, USB_GPIB_IBCL, 0);
+	set_bit(CIC_NUM, &board->status);	/* set whatever the reply was */
+
+	DIA_LOG(1, "done with %d %d\n", assert, reply);
+}
+
+/**
+ * usb_gpib_line_status() - Read the status of the bus lines.
+ *
+ *  @board:    the gpib_board data area for this gpib interface
+ *
+ *    All eight lines are reported; a cleared reply bit means "asserted".
+ */
+static int usb_gpib_line_status(const struct gpib_board *board)
+{
+	int reply;
+	int lines = VALID_ALL;   /* every line is reported as valid */
+	struct list_head *pos, *queue;
+	WQT *waiter;
+	unsigned long irqflags;
+	int pause_ms = 0;
+
+	DIA_LOG(1, "%s\n", "request");
+
+	/*
+	 * A caller that is itself queued on board->wait should not hurry
+	 * reading the status lines; make it pause a little first
+	 */
+
+	spin_lock_irqsave((spinlock_t *)&board->wait.lock, irqflags);
+	queue = (struct list_head *)&board->wait.WQH;
+	list_for_each(pos, queue) {
+		waiter = container_of(pos, WQT, WQE);
+		/* waiter->private is compared with the calling task */
+		if (waiter->private != current)
+			continue;
+		pause_ms = 20;
+		break;
+	}
+	spin_unlock_irqrestore((spinlock_t *)&board->wait.lock, irqflags);
+	if (pause_ms) {
+		DIA_LOG(1, "we are on the wait queue - sleep %d ms\n", pause_ms);
+		msleep(pause_ms);
+	}
+
+	reply = send_command((struct gpib_board *)board, USB_GPIB_STATUS, 0);
+
+	if (reply < 0) {
+		dev_err(board->gpib_dev, "line status read failed with %d\n", reply);
+		return -1;
+	}
+
+	if (!(reply & IB_BUS_REN))
+		lines |= BUS_REN;
+	if (!(reply & IB_BUS_IFC))
+		lines |= BUS_IFC;
+	if (!(reply & IB_BUS_NDAC))
+		lines |= BUS_NDAC;
+	if (!(reply & IB_BUS_NRFD))
+		lines |= BUS_NRFD;
+	if (!(reply & IB_BUS_DAV))
+		lines |= BUS_DAV;
+	if (!(reply & IB_BUS_EOI))
+		lines |= BUS_EOI;
+	if (!(reply & IB_BUS_ATN))
+		lines |= BUS_ATN;
+	if (!(reply & IB_BUS_SRQ))
+		lines |= BUS_SRQ;
+
+	DIA_LOG(1, "done with %x %x\n", reply, lines);
+
+	return lines;
+}
+
+/* parallel_poll - pulse EOI and read the data lines into *result; 0 or -EIO */
+
+static int usb_gpib_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	/*
+	 * request parallel poll asserting ATN | EOI;
+	 * we suppose ATN already asserted
+	 */
+	int retval, data;
+
+	DIA_LOG(1, "enter %p\n", board);
+
+	retval = set_control_line(board, IB_BUS_EOI, 1);
+	if (retval != ACK)
+		return -EIO;
+
+	data = send_command(board, USB_GPIB_READ_DATA, 0);
+	if (data >= 0)		/* a negative errno must not be stored as a poll byte */
+		*result = data;
+	DIA_LOG(1, "done with %x\n", data);
+
+	/* EOI is released even when the read failed */
+	retval = set_control_line(board, IB_BUS_EOI, 0);
+	if (retval != ACK || data < 0)
+		return -EIO;
+	return 0;
+}
+
+/* read - fetch up to @length bytes from the adapter; returns 0 or a negative errno */
+
+static int usb_gpib_read(struct gpib_board *board,
+			 u8 *buffer,
+			 size_t length,
+			 int *end,
+			 size_t *bytes_read)
+{
+#define MAX_READ_EXCESS 16384	/* cap on flush-loop iterations after the data loop */
+
+	struct char_buf b = {NULL, 0};
+
+	int retval;
+	char c, nc;
+	int ic;
+	struct timespec64 before, after;
+	int read_count = MAX_READ_EXCESS;
+	struct usb_gpib_priv *pd = (struct usb_gpib_priv *)board->private_data;
+
+	DIA_LOG(1, "enter %p -> %zu\n", board, length);
+
+	*bytes_read = 0;      /* by default, things go wrong */
+	*end = 0;
+
+	set_timeout(board);
+
+	/* single byte read has a special handling */
+
+	if (length == 1) {
+		char inbuf[2] = {0, 0};
+
+		/* read a single character */
+
+		ktime_get_real_ts64 (&before);
+
+		retval = write_loop(GPIB_DEV, USB_GPIB_READ_1, strlen(USB_GPIB_READ_1));
+		if (retval < 0)
+			return retval;
+
+		retval = skel_do_read(GPIB_DEV, inbuf, 1);
+		retval += skel_do_read(GPIB_DEV, inbuf + 1, 1);	/* expected to be ACK */
+
+		ktime_get_real_ts64 (&after);
+
+		DIA_LOG(1, "single read: %x %x %x in %d\n", retval,
+			inbuf[0], inbuf[1],
+			usec_diff(&after, &before));
+
+		/* two bytes expected: the data byte, then ACK */
+
+		if (retval == 2 && inbuf[1] == ACK) {
+			buffer[0] = inbuf[0];
+			*bytes_read = 1;
+			return 0;
+		}
+		if (retval < 2)
+			return -EIO;
+		else
+			return -ETIME;
+	}
+
+	/* allocate buffer for multibyte read */
+
+	b.inbuf = kmalloc(INBUF_SIZE, GFP_KERNEL);
+	if (!b.inbuf)
+		return -ENOMEM;
+
+	/* send read command and check <DLE><STX> sequence */
+
+	retval = write_loop(GPIB_DEV, USB_GPIB_READ, strlen(USB_GPIB_READ));
+	if (retval < 0)
+		goto read_return;
+
+	if (one_char(board, &b) != DLE || one_char(board, &b) != STX) {
+		dev_err(board->gpib_dev, "wrong <DLE><STX> sequence\n");
+		retval = -EIO;
+		goto read_return;
+	}
+
+	/* get data flow */
+
+	while (1) {
+		ic = one_char(board, &b);
+		if (ic == -EIO) {
+			retval = -EIO;
+			goto read_return;
+		}
+		c = ic;
+
+		if (c == DLE)
+			nc = one_char(board, &b);
+		if (c != DLE || nc == DLE) {	/* nc is read only when c == DLE */
+			/* data byte - store into buffer */
+			/* NOTE(review): eos is compared below even if REOS is not set - confirm */
+			if (*bytes_read == length)
+				break; /* data overflow */
+			if (c == DLE)
+				c = nc;
+			buffer[(*bytes_read)++] = c;
+			if (c == pd->eos) {
+				*end = 1;
+				break;
+			}
+
+		} else {
+			/* we are in the closing <DLE><ETX> sequence */
+			c = nc;
+			if (c == ETX) {
+				c = one_char(board, &b);
+				if (c == ACK) {
+					*end = 1;
+					retval = 0;
+					goto read_return;
+				} else {
+					dev_err(board->gpib_dev, "wrong end of message %x", c);
+					retval = -ETIME;
+					goto read_return;
+				}
+			} else {
+				dev_err(board->gpib_dev, "lone <DLE> in stream");
+				retval = -EIO;
+				goto read_return;
+			}
+		}
+	}
+
+	/* buffer full or eos byte stored - skip the rest of the adapter's message */
+
+	while (read_count--) {
+		if (one_char(board, &b) != DLE)
+			continue;
+		c = one_char(board, &b);
+		if (c == DLE)
+			continue;
+		if (c == ETX) {
+			c = one_char(board, &b);
+			if (c == ACK) {
+				if (MAX_READ_EXCESS - read_count > 1)
+					dev_dbg(board->gpib_dev, "small buffer - maybe some data lost");
+				retval = 0;
+				goto read_return;
+			}
+			break;
+		}
+	}
+
+	dev_err(board->gpib_dev, "no input end - board in odd state\n");
+	retval = -EIO;
+
+read_return:
+	kfree(b.inbuf);
+
+	DIA_LOG(1, "done with byte/status: %d %x %d\n",	(int)*bytes_read, retval, *end);
+	/* NOTE(review): sizeof() below also counts the literal's NUL byte - confirm intended */
+	if (retval == 0 || retval == -ETIME) {
+		if (send_command(board, USB_GPIB_UNTALK, sizeof(USB_GPIB_UNTALK)) == 0x06)
+			return retval;
+		return	-EIO;
+	}
+
+	return retval;
+}
+
+/* remote_enable - drive the REN line through set_control_line() */
+
+static void usb_gpib_remote_enable(struct gpib_board *board, int enable)
+{
+	int reply = set_control_line(board, IB_BUS_REN, !!enable);
+
+	/* a failure is only logged: this callback returns nothing */
+	if (reply != ACK)
+		dev_err(board->gpib_dev, "could not set REN line: %x\n", reply);
+
+	DIA_LOG(1, "done with %x\n", reply);
+}
+
+/* request_system_control - a request to release control is refused with -EINVAL */
+
+static int usb_gpib_request_system_control(struct gpib_board *board, int request_control)
+{
+	if (request_control) {
+		DIA_LOG(1, "done with %d -> %lx\n", request_control, board->status);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* take_control */
+/* beware: the sync flag is ignored; what is its real meaning? */
+
+static int usb_gpib_take_control(struct gpib_board *board, int sync)
+{
+	/* assert ATN; @sync is only logged, it does not affect the request */
+	int ack = set_control_line(board, IB_BUS_ATN, 1);
+
+	DIA_LOG(1, "done with %d %x\n", sync, ack);
+
+	/* anything but an ACK from the adapter is reported as an I/O error */
+	return (ack == ACK) ? 0 : -EIO;
+}
+
+/* update_status */
+
+static unsigned int usb_gpib_update_status(struct gpib_board *board,
+					   unsigned int clear_mask)
+{
+	/*
+	 * No hardware is queried here: the cached status word just gets
+	 * the bits named in @clear_mask dropped and is handed back.
+	 */
+	board->status = board->status & ~clear_mask;
+
+	DIA_LOG(1, "done with %x %lx\n", clear_mask, board->status);
+
+	return board->status;
+}
+
+/* write */
+/* beware: DLE characters are not escaped - can only send ASCII data */
+
+static int usb_gpib_write(struct gpib_board *board,
+			  u8 *buffer,
+			  size_t length,
+			  int send_eoi,
+			  size_t *bytes_written)
+{
+	/* 5 header bytes + payload + 3 trailer bytes; @send_eoi is not used */
+	const size_t frame_len = length + 8;
+	char *frame;
+	int ack;
+
+	DIA_LOG(1, "enter %p -> %zu\n", board, length);
+
+	set_timeout(board);
+
+	frame = kmalloc(frame_len, GFP_KERNEL);
+	if (!frame)
+		return -ENOMEM;
+
+	/* "\nIB" <DLE><STX> payload <DLE><ETX> "\n" - payload is copied verbatim */
+	memcpy(frame, "\nIB\020\002", 5);
+	memcpy(&frame[5], buffer, length);
+	memcpy(&frame[5 + length], "\020\003\n", 3);
+
+	ack = send_command(board, frame, frame_len);
+	kfree(frame);
+
+	DIA_LOG(1, "<%.*s> -> %x\n", (int)length, buffer, ack);
+
+	if (ack != ACK)
+		return -EPIPE;
+
+	/* the data went out; the count is reported even if UNLISTEN fails below */
+	*bytes_written = length;
+
+	if (send_command(board, USB_GPIB_UNLISTEN, sizeof(USB_GPIB_UNLISTEN)) == 0x06)
+		return length;
+
+	return -EPIPE;
+}
+
+/*
+ *  ***	 following functions not implemented yet  ***
+ */
+
+/* parallel_poll configure */
+
+static void usb_gpib_parallel_poll_configure(struct gpib_board *board,
+					     u8 configuration)
+{
+	/* not implemented: @configuration is ignored and nothing is sent */
+}
+
+/* parallel_poll_response */
+
+static void usb_gpib_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	/* not implemented: @ist is ignored and nothing is sent */
+}
+
+/* primary_address */
+
+static int  usb_gpib_primary_address(struct gpib_board *board, unsigned int address)
+{
+	/* not implemented: @address is ignored and success is always reported */
+	return 0;
+}
+
+/* return_to_local */
+
+static	void usb_gpib_return_to_local(struct gpib_board *board)
+{
+	/* not implemented: no action is taken */
+}
+
+/* secondary_address */
+
+static int usb_gpib_secondary_address(struct gpib_board *board,
+				      unsigned int address,
+				      int enable)
+{
+	/* not implemented: both arguments are ignored, success is always reported */
+	return 0;
+}
+
+/* serial_poll_response */
+
+static void usb_gpib_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	/* not implemented: @status is ignored and nothing is sent */
+}
+
+/* serial_poll_status */
+
+static u8 usb_gpib_serial_poll_status(struct gpib_board *board)
+{
+	/* not implemented: a constant 0 is returned, nothing is read back */
+	return 0;
+}
+
+/* t1_delay */
+
+static int usb_gpib_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	/* not implemented: @nano_sec is ignored and 0 is always returned */
+	return 0;
+}
+
+/*
+ *   ***  module dispatch table and init/exit functions	 ***
+ */
+
+/*
+ * Callback table handed to gpib_register_driver()/gpib_unregister_driver().
+ * Several of the entries below point at empty placeholder functions (see the
+ * "not implemented yet" section above); local_parallel_poll_mode is left NULL.
+ */
+static struct gpib_interface usb_gpib_interface = {
+	.name = NAME,
+	.attach = usb_gpib_attach,
+	.detach = usb_gpib_detach,
+	.read = usb_gpib_read,
+	.write = usb_gpib_write,
+	.command = usb_gpib_command,
+	.take_control = usb_gpib_take_control,
+	.go_to_standby = usb_gpib_go_to_standby,
+	.request_system_control = usb_gpib_request_system_control,
+	.interface_clear = usb_gpib_interface_clear,
+	.remote_enable = usb_gpib_remote_enable,
+	.enable_eos = usb_gpib_enable_eos,
+	.disable_eos = usb_gpib_disable_eos,
+	.parallel_poll = usb_gpib_parallel_poll,
+	.parallel_poll_configure = usb_gpib_parallel_poll_configure,
+	.parallel_poll_response = usb_gpib_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = usb_gpib_line_status,
+	.update_status = usb_gpib_update_status,
+	.primary_address = usb_gpib_primary_address,
+	.secondary_address = usb_gpib_secondary_address,
+	.serial_poll_response = usb_gpib_serial_poll_response,
+	.serial_poll_status = usb_gpib_serial_poll_status,
+	.t1_delay = usb_gpib_t1_delay,
+	.return_to_local = usb_gpib_return_to_local,
+	.skip_check_for_command_acceptors = 1
+};
+
+/*
+ * usb_gpib_init_module(), usb_gpib_exit_module()
+ *
+ * These functions are called every time a new device is detected
+ * and registered or is removed and unregistered.
+ * We must take note of created and destroyed usb minors to be used
+ * when usb_gpib_attach() and usb_gpib_detach() will be called on
+ * request by gpib_config.
+ */
+
+static int usb_gpib_init_module(struct usb_interface *interface)
+{
+	int slot, rv;
+
+	rv = mutex_lock_interruptible(&minors_lock);
+	if (rv < 0)
+		return rv;
+
+	if (assigned_usb_minors) {
+		/*
+		 * Other adapters are already tracked: this minor must not
+		 * be among them, otherwise the bookkeeping is inconsistent.
+		 */
+		for (slot = 0; slot < MAX_DEV; slot++) {
+			if (!(assigned_usb_minors & (1 << slot)))
+				continue;
+			if (usb_minors[slot] != interface->minor)
+				continue;
+			pr_err("CODE BUG: USB minor %d registered at %d.\n",
+			       interface->minor, slot);
+			rv = -1;
+			goto exit;
+		}
+	} else {
+		/* first adapter: this is when the gpib driver gets registered */
+		rv = gpib_register_driver(&usb_gpib_interface, THIS_MODULE);
+		if (rv) {
+			pr_err("gpib_register_driver failed: error = %d\n", rv);
+			goto exit;
+		}
+	}
+
+	/* take the first slot whose bit is still clear in the bitmap */
+	for (slot = 0; slot < MAX_DEV; slot++) {
+		int mask = 1 << slot;
+
+		if (assigned_usb_minors & mask)
+			continue;
+
+		usb_minors[slot] = interface->minor;
+		lpvo_usb_interfaces[slot] = interface;
+		assigned_usb_minors |= mask;
+		rv = 0;
+		goto exit;
+	}
+
+	pr_err("No slot available for interface %p minor %d\n", interface, interface->minor);
+	rv = -1;
+
+exit:
+	mutex_unlock(&minors_lock);
+	return rv;
+}
+
+static void usb_gpib_exit_module(int minor)
+{
+	int slot;
+
+	mutex_lock(&minors_lock);
+
+	/* locate the assigned slot that holds this minor */
+	for (slot = 0; slot < MAX_DEV; slot++) {
+		if (usb_minors[slot] == minor && (assigned_usb_minors & (1 << slot)))
+			break;
+	}
+
+	if (slot == MAX_DEV) {
+		pr_err("CODE BUG: USB minor %d not found.\n", minor);
+	} else {
+		assigned_usb_minors &= ~(1 << slot);
+		usb_minors[slot] = -1;
+		/* last adapter gone: the gpib driver is unregistered again */
+		if (assigned_usb_minors == 0)
+			gpib_unregister_driver(&usb_gpib_interface);
+	}
+
+	mutex_unlock(&minors_lock);
+}
+
+/*
+ * Default latency time (16 msec) is too long.
+ * We must use 1 msec (best); anyhow, no more than 5 msec.
+ *
+ * Defines and function taken and modified from the kernel tree
+ * (see ftdi_sio.h and ftdi_sio.c).
+ */
+
+#define FTDI_SIO_SET_LATENCY_TIMER	9 /* Set the latency timer */
+#define FTDI_SIO_SET_LATENCY_TIMER_REQUEST FTDI_SIO_SET_LATENCY_TIMER
+#define FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE 0x40
+#define WDR_TIMEOUT 5000 /* default urb timeout */
+#define WDR_SHORT_TIMEOUT 1000	/* shorter urb timeout */
+
+#define LATENCY_TIMER 1		   /* use a small latency timer: 1 ... 5 msec */
+#define LATENCY_CHANNEL 0	   /* channel selection in multichannel devices */
+static int write_latency_timer(struct usb_device *udev)
+{
+	int rv;
+
+	/* vendor control request on endpoint 0, no data stage */
+	rv = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
+			     LATENCY_TIMER, LATENCY_CHANNEL,
+			     NULL, 0, WDR_TIMEOUT);
+	if (rv >= 0)
+		return rv;
+
+	dev_err(&udev->dev, "Unable to write latency timer: %i\n", rv);
+	return rv;
+}
+
+/*****************************************************************************
+ *									     *
+ *  The following code is a modified version of the USB Skeleton driver	     *
+ *  written by Greg Kroah-Hartman and available in the kernel tree.	     *
+ *									     *
+ *  Functions skel_open() and skel_release() have been rewritten and named   *
+ *  skel_do_open() and skel_do_release() to process the attach and detach    *
+ *  requests coming from gpib_config.					     *
+ *									     *
+ *  Functions skel_read() and skel_write() have been split into a	     *
+ *  skel_do_read() and skel_do_write(), that cover the kernel stuff of read  *
+ *  and write operations, and the original skel_read() and skel_write(),     *
+ *  that handle communication with user space and call their _do_ companion. *
+ *									     *
+ *  Only the _do_ versions are used by the lpvo_usb_gpib driver; other ones  *
+ *  can be (optionally) maintained in the compilation to have direct access  *
+ *  to a gpib controller for debug and diagnostics.			     *
+ *									     *
+ *  To avoid collisions in names, devices in user space have been renamed    *
+ *  lpvo_raw1, lpvo_raw2 ....  and the usb driver has been renamed with the  *
+ *  gpib module name.							     *
+ *									     *
+ *****************************************************************************/
+
+/*
+ * USB Skeleton driver - 2.2
+ *
+ * Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com)
+ *
+ * This driver is based on the 2.6.3 version of drivers/usb/usb-skeleton.c
+ * but has been rewritten to be easier to read and use.
+ */
+
+#include <linux/errno.h>
+#include <linux/kref.h>
+#include <linux/uaccess.h>
+#include <linux/mutex.h>
+
+/* Get a minor range for your devices from the usb maintainer */
+#define USB_SKEL_MINOR_BASE	   192
+
+/*   private defines   */
+
+#define MAX_TRANSFER		    (PAGE_SIZE - 512)
+/*
+ * MAX_TRANSFER is chosen so that the VM is not stressed by
+ * allocations > PAGE_SIZE and the number of packets in a page
+ * is an integer; 512 is the largest possible packet on EHCI.
+ */
+
+#define WRITES_IN_FLIGHT	1     /* we do not want more than one pending write */
+#define USER_DEVICE 1		      /* compile for device(s) in user space */
+
+/* Structure to hold all of our device specific stuff */
+struct usb_skel {
+	struct usb_device     *udev;		     /* the usb device for this device */
+	struct usb_interface  *interface;	     /* the interface for this device */
+	struct semaphore      limit_sem;	     /* limiting the number of writes in progress */
+	struct usb_anchor     submitted;	     /* in case need to retract our submissions */
+	struct urb	      *bulk_in_urb;	     /* the urb to read data with */
+	unsigned char	      *bulk_in_buffer;	     /* the buffer to receive data */
+	size_t		      bulk_in_size;	     /* the size of the receive buffer */
+	size_t		      bulk_in_filled;	     /* number of bytes in the buffer */
+	size_t		      bulk_in_copied;	     /* already copied to user space */
+	__u8		      bulk_in_endpoint_addr;  /* the address of the bulk in endpoint */
+	__u8		      bulk_out_endpoint_addr; /* the address of the bulk out endpoint */
+	int		      errors;		     /* the last request tanked */
+	bool		      ongoing_read;	     /* a read is going on */
+	spinlock_t	      err_lock;		     /* lock for errors */
+	/* reference count; skel_delete() is the release function passed to kref_put() */
+	struct kref	      kref;
+	struct mutex	      io_mutex;		     /* synchronize I/O with disconnect */
+	wait_queue_head_t     bulk_in_wait;	     /* to wait for an ongoing read */
+};
+
+#define to_skel_dev(d) container_of(d, struct usb_skel, kref)
+
+static struct usb_driver skel_driver;
+static void skel_draw_down(struct usb_skel *dev);
+
+static void skel_delete(struct kref *kref)
+{
+	/* kref release callback: recover the owning device and free it */
+	struct usb_skel *skel = container_of(kref, struct usb_skel, kref);
+
+	usb_free_urb(skel->bulk_in_urb);
+	usb_put_dev(skel->udev);
+	kfree(skel->bulk_in_buffer);
+	kfree(skel);
+}
+
+/*
+ * skel_do_open() - to be called by usb_gpib_attach
+ */
+
+static int skel_do_open(struct gpib_board *board, int subminor)
+{
+	struct usb_interface *intf;
+	struct usb_skel *skel;
+	int err;
+
+	intf = usb_find_interface(&skel_driver, subminor);
+	if (!intf) {
+		dev_err(board->gpib_dev, "can't find device for minor %d\n", subminor);
+		return -ENODEV;
+	}
+
+	skel = usb_get_intfdata(intf);
+	if (!skel)
+		return -ENODEV;
+
+	err = usb_autopm_get_interface(intf);
+	if (err)
+		return err;
+
+	/* one more user of the device; dropped again in skel_do_release() */
+	kref_get(&skel->kref);
+
+	/* remember the device for later calls made on this board */
+	GPIB_DEV = skel;
+
+	return 0;
+}
+
+/*
+ * skel_do_release() - to be called by usb_gpib_detach
+ */
+
+static int skel_do_release(struct gpib_board *board)
+{
+	struct usb_skel *skel = GPIB_DEV;
+
+	if (!skel)
+		return -ENODEV;
+
+	/*
+	 * Balance the autopm reference taken at open time, unless the
+	 * interface pointer was already cleared by disconnect.
+	 */
+	mutex_lock(&skel->io_mutex);
+	if (skel->interface)
+		usb_autopm_put_interface(skel->interface);
+	mutex_unlock(&skel->io_mutex);
+
+	/* drop the usage count taken in skel_do_open() */
+	kref_put(&skel->kref, skel_delete);
+	return 0;
+}
+
+/*
+ * read functions
+ */
+
+static void skel_read_bulk_callback(struct urb *urb)
+{
+	struct usb_skel *skel = urb->context;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&skel->err_lock, irq_flags);
+	if (!urb->status) {
+		/* success: publish how many bytes landed in the buffer */
+		skel->bulk_in_filled = urb->actual_length;
+	} else {
+		switch (urb->status) {
+		case -ENOENT:
+		case -ECONNRESET:
+		case -ESHUTDOWN:
+			/* sync/async unlink faults are not worth a log line */
+			break;
+		default:
+			dev_err(&skel->interface->dev, "nonzero read bulk status received: %d\n",
+				urb->status);
+			break;
+		}
+		/* every non-zero status is still recorded for the reader */
+		skel->errors = urb->status;
+	}
+	skel->ongoing_read = 0;
+	spin_unlock_irqrestore(&skel->err_lock, irq_flags);
+
+	/* let a reader sleeping on bulk_in_wait re-check ongoing_read */
+	wake_up_interruptible(&skel->bulk_in_wait);
+}
+
+static int skel_do_read_io(struct usb_skel *dev, size_t count)
+{
+	size_t request_len = min(dev->bulk_in_size, count);
+	unsigned int in_pipe = usb_rcvbulkpipe(dev->udev, dev->bulk_in_endpoint_addr);
+	int rv;
+
+	/* set up the single bulk-in urb; never ask for more than the buffer holds */
+	usb_fill_bulk_urb(dev->bulk_in_urb, dev->udev, in_pipe,
+			  dev->bulk_in_buffer, request_len,
+			  skel_read_bulk_callback, dev);
+
+	/* flag the urb as busy until the completion callback clears it */
+	spin_lock_irq(&dev->err_lock);
+	dev->ongoing_read = 1;
+	spin_unlock_irq(&dev->err_lock);
+
+	/* a fresh submission means there is nothing to deliver yet */
+	dev->bulk_in_filled = 0;
+	dev->bulk_in_copied = 0;
+
+	rv = usb_submit_urb(dev->bulk_in_urb, GFP_KERNEL);
+	if (rv >= 0)
+		return rv;
+
+	dev_err(&dev->interface->dev, "failed submitting read urb, error %d\n", rv);
+
+	/* nothing is in flight after all */
+	spin_lock_irq(&dev->err_lock);
+	dev->ongoing_read = 0;
+	spin_unlock_irq(&dev->err_lock);
+
+	/* only -ENOMEM is passed through, everything else becomes -EIO */
+	return (rv == -ENOMEM) ? rv : -EIO;
+}
+
+/*
+ * skel_do_read() - read operations from lpvo_usb_gpib
+ * @dev:    device state; dev->io_mutex serialises readers
+ * @buffer: kernel buffer that receives the payload
+ * @count:  capacity of @buffer in bytes
+ *
+ * A freshly received bulk-in buffer starts with two ftdi chip status bytes;
+ * they are skipped and never counted.  A buffer holding only those two bytes
+ * makes the function start over and wait for the next one.
+ *
+ * Return: number of payload bytes stored in @buffer, 0 when no read is
+ * possible (no urb or @count == 0), or a negative errno.
+ */
+
+static ssize_t skel_do_read(struct usb_skel *dev, char *buffer, size_t count)
+{
+	int rv;
+	int status_bytes;	/* ftdi status bytes included in rv */
+	bool ongoing_io;
+
+	/* if we cannot read at all, return EOF */
+
+	if (!dev->bulk_in_urb || !count)
+		return 0;
+
+restart:  /* added to comply with ftdi timeout technique */
+
+	status_bytes = 0;
+
+	/* no concurrent readers */
+
+	rv = mutex_lock_interruptible(&dev->io_mutex);
+	if (rv < 0)
+		return rv;
+
+	if (!dev->interface) {		      /* disconnect() was called */
+		rv = -ENODEV;
+		goto exit;
+	}
+
+retry:
+	/* if IO is under way, we must not touch things */
+	spin_lock_irq(&dev->err_lock);
+	ongoing_io = dev->ongoing_read;
+	spin_unlock_irq(&dev->err_lock);
+
+	if (ongoing_io) {
+		/*
+		 * There is no struct file here, hence no O_NONBLOCK handling.
+		 * IO may take forever, hence wait in an interruptible state.
+		 */
+		rv = wait_event_interruptible(dev->bulk_in_wait, (!dev->ongoing_read));
+		if (rv < 0)
+			goto exit;
+	}
+
+	/* errors must be reported */
+	rv = dev->errors;
+	if (rv < 0) {
+		/* any error is reported once */
+		dev->errors = 0;
+		/* to preserve notifications about reset */
+		rv = (rv == -EPIPE) ? rv : -EIO;
+		/* report it */
+		goto exit;
+	}
+
+	/*
+	 * if the buffer is filled we may satisfy the read
+	 * else we need to start IO
+	 */
+
+	if (dev->bulk_in_filled) {
+		/* we had read data */
+
+		size_t available = dev->bulk_in_filled - dev->bulk_in_copied;
+		size_t chunk;
+
+		if (!available) {
+			/*
+			 * all data has been used, actual IO needs to be done;
+			 * it seems that requests for less than
+			 * dev->bulk_in_size are not accepted
+			 */
+			rv = skel_do_read_io(dev, dev->bulk_in_size);
+			if (rv < 0)
+				goto exit;
+			else
+				goto retry;
+		}
+
+		if (dev->bulk_in_copied) {
+			/*
+			 * Continuation of a partially consumed buffer: the
+			 * status bytes were already skipped by an earlier
+			 * call, so every byte copied here is payload and
+			 * status_bytes stays 0.
+			 */
+			chunk = min(available, count);
+			memcpy(buffer, dev->bulk_in_buffer + dev->bulk_in_copied, chunk);
+		} else {
+			/* fresh buffer: account for two bytes to be discarded */
+			chunk = min(available, count + 2);
+			if (chunk < 2) {
+				dev_err(&dev->udev->dev, "BAD READ - chunk: %zu\n", chunk);
+				rv = -EIO;
+				goto exit;
+			}
+
+			memcpy(buffer, dev->bulk_in_buffer + 2, chunk - 2);
+			status_bytes = 2;
+		}
+		rv = chunk;
+		dev->bulk_in_copied += chunk;
+
+		/*
+		 * No read ahead is started here; if more data is wanted it
+		 * will be asked for by the lpvo_usb_gpib layer.
+		 */
+	} else {
+		/* no data in the buffer */
+		rv = skel_do_read_io(dev, dev->bulk_in_size);
+		if (rv < 0)
+			goto exit;
+		else
+			goto retry;
+	}
+exit:
+	mutex_unlock(&dev->io_mutex);
+
+	if (rv <= 0)
+		return rv;
+
+	/* ftdi chip returns two status bytes after a latency anyhow */
+	if (rv == status_bytes)
+		goto restart;
+
+	/* only discount status bytes that were really part of this chunk */
+	return rv - status_bytes;
+}
+
+/*
+ * write functions
+ */
+
+static void skel_write_bulk_callback(struct urb *urb)
+{
+	struct usb_skel *skel = urb->context;
+	unsigned long irq_flags;
+
+	if (urb->status) {
+		switch (urb->status) {
+		case -ENOENT:
+		case -ECONNRESET:
+		case -ESHUTDOWN:
+			/* sync/async unlink faults are not worth a log line */
+			break;
+		default:
+			dev_err(&skel->interface->dev,
+				"nonzero write bulk status received: %d\n", urb->status);
+			break;
+		}
+
+		/* every non-zero status is recorded for the next caller */
+		spin_lock_irqsave(&skel->err_lock, irq_flags);
+		skel->errors = urb->status;
+		spin_unlock_irqrestore(&skel->err_lock, irq_flags);
+	}
+
+	/* give back the transfer buffer allocated for this urb */
+	usb_free_coherent(urb->dev, urb->transfer_buffer_length,
+			  urb->transfer_buffer, urb->transfer_dma);
+
+	/* the write slot is free again */
+	up(&skel->limit_sem);
+}
+
+/*
+ * skel_do_write() - write operations from lpvo_usb_gpib
+ * @dev:    device state
+ * @buffer: kernel buffer holding the data to send
+ * @count:  number of bytes available in @buffer
+ *
+ * At most MAX_TRANSFER bytes are queued per call; the urb is submitted
+ * asynchronously and completed by skel_write_bulk_callback().
+ *
+ * Return: number of bytes queued (may be less than @count), 0 when
+ * @count is 0, or a negative errno.
+ */
+
+static ssize_t skel_do_write(struct usb_skel *dev, const char *buffer, size_t count)
+{
+	int retval = 0;
+	struct urb *urb = NULL;
+	char *buf = NULL;
+	size_t writesize = min_t(size_t, count, (size_t)MAX_TRANSFER);
+
+	/* verify that we actually have some data to write */
+	if (count == 0)
+		goto exit;
+
+	/*
+	 * limit the number of URBs in flight to stop a user from using up all
+	 * RAM
+	 */
+	/* Only one URB is used, because we can't have a pending write() and go on */
+	/* There is no struct file here, so the wait is always blocking */
+
+	if (down_interruptible(&dev->limit_sem)) {
+		retval = -ERESTARTSYS;
+		goto exit;
+	}
+
+	spin_lock_irq(&dev->err_lock);
+	retval = dev->errors;
+	if (retval < 0) {
+		/* any error is reported once */
+		dev->errors = 0;
+		/* to preserve notifications about reset */
+		retval = (retval == -EPIPE) ? retval : -EIO;
+	}
+	spin_unlock_irq(&dev->err_lock);
+	if (retval < 0)
+		goto error;
+
+	/* create a urb, and a buffer for it, and copy the data to the urb */
+	urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!urb) {
+		retval = -ENOMEM;
+		goto error;
+	}
+
+	buf = usb_alloc_coherent(dev->udev, writesize, GFP_KERNEL,
+				 &urb->transfer_dma);
+	if (!buf) {
+		retval = -ENOMEM;
+		goto error;
+	}
+
+	/*
+	 * buf holds only writesize bytes: copying count bytes would run
+	 * past its end whenever count exceeds MAX_TRANSFER.
+	 */
+	memcpy(buf, buffer, writesize);
+
+	/* this lock makes sure we don't submit URBs to gone devices */
+	mutex_lock(&dev->io_mutex);
+	if (!dev->interface) {		      /* disconnect() was called */
+		mutex_unlock(&dev->io_mutex);
+		retval = -ENODEV;
+		goto error;
+	}
+
+	/* initialize the urb properly */
+	usb_fill_bulk_urb(urb, dev->udev,
+			  usb_sndbulkpipe(dev->udev, dev->bulk_out_endpoint_addr),
+			  buf, writesize, skel_write_bulk_callback, dev);
+	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+	usb_anchor_urb(urb, &dev->submitted);
+
+	/* send the data out the bulk port */
+	retval = usb_submit_urb(urb, GFP_KERNEL);
+	mutex_unlock(&dev->io_mutex);
+	if (retval) {
+		dev_err(&dev->interface->dev, "failed submitting write urb, error %d\n", retval);
+		goto error_unanchor;
+	}
+
+	/*
+	 * release our reference to this urb, the USB core will eventually free
+	 * it entirely
+	 */
+	usb_free_urb(urb);
+
+	return writesize;
+
+error_unanchor:
+	usb_unanchor_urb(urb);
+error:
+	if (urb) {
+		usb_free_coherent(dev->udev, writesize, buf, urb->transfer_dma);
+		usb_free_urb(urb);
+	}
+	/* the completion callback will not run, so release the slot here */
+	up(&dev->limit_sem);
+
+exit:
+	return retval;
+}
+
+/*
+ * services for the user space devices
+ */
+
+#if USER_DEVICE	 /* conditional compilation of user space device */
+
+static int skel_flush(struct file *file, fl_owner_t id)
+{
+	struct usb_skel *skel = file->private_data;
+	int pending;
+	int res = 0;
+
+	if (!skel)
+		return -ENODEV;
+
+	/* stop outstanding io while holding off new submissions */
+	mutex_lock(&skel->io_mutex);
+	skel_draw_down(skel);
+
+	/* fetch and clear the stored error so the next open starts clean */
+	spin_lock_irq(&skel->err_lock);
+	pending = skel->errors;
+	skel->errors = 0;
+	spin_unlock_irq(&skel->err_lock);
+
+	mutex_unlock(&skel->io_mutex);
+
+	/* -EPIPE is passed through, any other stored error becomes -EIO */
+	if (pending)
+		res = (pending == -EPIPE) ? -EPIPE : -EIO;
+
+	return res;
+}
+
+static int skel_open(struct inode *inode, struct file *file)
+{
+	int subminor = iminor(inode);
+	struct usb_interface *intf;
+	struct usb_skel *skel;
+	int err;
+
+	intf = usb_find_interface(&skel_driver, subminor);
+	if (!intf) {
+		pr_err("can't find device for minor %d\n", subminor);
+		return -ENODEV;
+	}
+
+	skel = usb_get_intfdata(intf);
+	if (!skel)
+		return -ENODEV;
+
+	err = usb_autopm_get_interface(intf);
+	if (err)
+		return err;
+
+	/* one more user of the device; dropped again in skel_release() */
+	kref_get(&skel->kref);
+
+	/* the other file operations find the device through private_data */
+	file->private_data = skel;
+
+	return 0;
+}
+
+static int skel_release(struct inode *inode, struct file *file)
+{
+	struct usb_skel *skel = file->private_data;
+
+	if (!skel)
+		return -ENODEV;
+
+	/*
+	 * Balance the autopm reference taken at open time, unless the
+	 * interface pointer was already cleared by disconnect.
+	 */
+	mutex_lock(&skel->io_mutex);
+	if (skel->interface)
+		usb_autopm_put_interface(skel->interface);
+	mutex_unlock(&skel->io_mutex);
+
+	/* drop the usage count taken in skel_open() */
+	kref_put(&skel->kref, skel_delete);
+	return 0;
+}
+
+/*
+ * user space access to read function
+ *
+ * Reads through a kernel bounce buffer filled by skel_do_read() and copies
+ * the result to @buffer.  Returns the number of bytes delivered, 0 on EOF,
+ * or a negative errno (-ENODEV, -ENOMEM, -EFAULT or whatever skel_do_read()
+ * reports).  @ppos is not used.
+ */
+
+static ssize_t skel_read(struct file *file, char __user *buffer, size_t count,
+			 loff_t *ppos)
+{
+	struct usb_skel *dev;
+	char *buf;
+	ssize_t rv;
+
+	dev = file->private_data;
+	if (!dev)
+		return -ENODEV;
+
+	/*
+	 * skel_do_read() never hands out more than one bulk-in buffer per
+	 * call, so do not let user space dictate a larger allocation.
+	 */
+	count = min(count, dev->bulk_in_size);
+
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rv = skel_do_read(dev, buf, count);
+
+	if (rv > 0 && copy_to_user(buffer, buf, rv))
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv;
+}
+
+/*
+ * user space access to write function
+ *
+ * Copies the user data into a kernel bounce buffer and hands it to
+ * skel_do_write().  Returns the number of bytes queued (at most
+ * MAX_TRANSFER per call) or a negative errno (-ENODEV, -ENOMEM, -EFAULT or
+ * whatever skel_do_write() reports).  @ppos is not used.
+ */
+
+static ssize_t skel_write(struct file *file, const char __user *user_buffer,
+			  size_t count, loff_t *ppos)
+{
+	struct usb_skel *dev;
+	char *buf;
+	ssize_t rv;
+
+	dev = file->private_data;
+	if (!dev)
+		return -ENODEV;
+
+	/*
+	 * skel_do_write() sends at most MAX_TRANSFER bytes per call, so
+	 * neither allocate nor copy more than that from user space.
+	 */
+	count = min_t(size_t, count, (size_t)MAX_TRANSFER);
+
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, user_buffer, count)) {
+		kfree(buf);
+		return -EFAULT;
+	}
+
+	rv = skel_do_write(dev, buf, count);
+	kfree(buf);
+	return rv;
+}
+#endif
+
+/*
+ * File operations of the optional raw user space device; when USER_DEVICE
+ * is 0 only .owner is set.
+ */
+static const struct file_operations skel_fops = {
+	.owner =	THIS_MODULE,
+#if USER_DEVICE
+	.read =	   skel_read,
+	.write =   skel_write,
+	.open =	   skel_open,
+	.release = skel_release,
+	.flush =   skel_flush,
+	.llseek =  noop_llseek,
+#endif
+};
+
+/*
+ * usb class driver info in order to get a minor number from the usb core,
+ * and to have the device registered with the driver core
+ */
+#if USER_DEVICE
+/* passed to usb_register_dev()/usb_deregister_dev() in probe/disconnect */
+static struct usb_class_driver skel_class = {
+	.name =		       "lpvo_raw%d",
+	.fops =		       &skel_fops,
+	.minor_base =	     USB_SKEL_MINOR_BASE,
+};
+#endif
+
+/*
+ * skel_probe() - set up the per-device state for a newly bound interface
+ *
+ * Allocates and initialises a struct usb_skel, locates the first bulk-in and
+ * bulk-out endpoints, allocates the receive buffer and urb, optionally
+ * registers the raw user space device, then programs the latency timer and
+ * records the minor via usb_gpib_init_module().
+ *
+ * Returns 0 on success or a negative errno; on failure the reference taken
+ * by kref_init() is dropped, which frees everything through skel_delete().
+ */
+static int skel_probe(struct usb_interface *interface,
+		      const struct usb_device_id *id)
+{
+	struct usb_skel *dev;
+	struct usb_endpoint_descriptor *bulk_in, *bulk_out;
+	int retval;
+	char *device_path;
+
+	/*
+	 * NOTE(review): this runs on every probe, so the mutex is
+	 * re-initialised each time a further adapter is plugged in -
+	 * confirm that this cannot happen while minors_lock is held.
+	 */
+	mutex_init(&minors_lock);   /* required for handling minor numbers table */
+
+	/* allocate memory for our device state and initialize it */
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	kref_init(&dev->kref);
+	sema_init(&dev->limit_sem, WRITES_IN_FLIGHT);
+	mutex_init(&dev->io_mutex);
+	spin_lock_init(&dev->err_lock);
+	init_usb_anchor(&dev->submitted);
+	init_waitqueue_head(&dev->bulk_in_wait);
+
+	dev->udev = usb_get_dev(interface_to_usbdev(interface));
+	dev->interface = interface;
+
+	/* set up the endpoint information */
+	/* use only the first bulk-in and bulk-out endpoints */
+	retval = usb_find_common_endpoints(interface->cur_altsetting,
+					   &bulk_in, &bulk_out, NULL, NULL);
+	if (retval) {
+		dev_err(&interface->dev,
+			"Could not find both bulk-in and bulk-out endpoints\n");
+		goto error;
+	}
+
+	/* the receive buffer is sized to one maximum packet of the endpoint */
+	dev->bulk_in_size = usb_endpoint_maxp(bulk_in);
+	dev->bulk_in_endpoint_addr = bulk_in->bEndpointAddress;
+	dev->bulk_in_buffer = kmalloc(dev->bulk_in_size, GFP_KERNEL);
+	if (!dev->bulk_in_buffer) {
+		retval = -ENOMEM;
+		goto error;
+	}
+	dev->bulk_in_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!dev->bulk_in_urb) {
+		retval = -ENOMEM;
+		goto error;
+	}
+
+	dev->bulk_out_endpoint_addr = bulk_out->bEndpointAddress;
+
+	/* save our data pointer in this interface device */
+	usb_set_intfdata(interface, dev);
+
+	/* let the world know */
+
+	device_path = kobject_get_path(&dev->udev->dev.kobj, GFP_KERNEL);
+	dev_dbg(&interface->dev, "New lpvo_usb_device -> bus: %d  dev: %d  path: %s\n",
+		dev->udev->bus->busnum, dev->udev->devnum, device_path);
+	kfree(device_path);
+
+#if USER_DEVICE
+	/* we can register the device now, as it is ready */
+	retval = usb_register_dev(interface, &skel_class);
+	if (retval) {
+		/* something prevented us from registering this driver */
+		dev_err(&interface->dev,
+			"Not able to get a minor for this device.\n");
+		usb_set_intfdata(interface, NULL);
+		goto error;
+	}
+#endif
+
+	/*
+	 * NOTE(review): the return values of the two calls below are not
+	 * checked, so probe reports success even if either of them fails.
+	 */
+	write_latency_timer(dev->udev);	    /* adjust the latency timer */
+
+	usb_gpib_init_module(interface);    /* last, init the lpvo for this minor */
+
+	return 0;
+
+error:
+	/* this frees allocated memory */
+	kref_put(&dev->kref, skel_delete);
+
+	return retval;
+}
+
+static void skel_disconnect(struct usb_interface *interface)
+{
+	struct usb_skel *skel;
+
+	/* first, deactivate the lpvo bookkeeping for this minor */
+	usb_gpib_exit_module(interface->minor);
+
+	skel = usb_get_intfdata(interface);
+	usb_set_intfdata(interface, NULL);
+
+#if USER_DEVICE
+	/* hand the minor of the raw user space device back */
+	usb_deregister_dev(interface, &skel_class);
+#endif
+
+	/* readers and writers check ->interface under io_mutex before starting io */
+	mutex_lock(&skel->io_mutex);
+	skel->interface = NULL;
+	mutex_unlock(&skel->io_mutex);
+
+	usb_kill_anchored_urbs(&skel->submitted);
+
+	/* drop the reference taken by kref_init() in probe */
+	kref_put(&skel->kref, skel_delete);
+}
+
+static void skel_draw_down(struct usb_skel *dev)
+{
+	/* give anchored urbs a chance to finish; kill them if the wait times out */
+	if (!usb_wait_anchor_empty_timeout(&dev->submitted, 1000))
+		usb_kill_anchored_urbs(&dev->submitted);
+
+	/* the bulk-in urb is stopped unconditionally */
+	usb_kill_urb(dev->bulk_in_urb);
+}
+
+static int skel_suspend(struct usb_interface *intf, pm_message_t message)
+{
+	struct usb_skel *skel = usb_get_intfdata(intf);
+
+	/* nothing to quiesce when no device data is attached */
+	if (skel)
+		skel_draw_down(skel);
+
+	return 0;
+}
+
+static int skel_resume(struct usb_interface *intf)
+{
+	/* nothing is restarted on resume; success is always reported */
+	return 0;
+}
+
+static int skel_pre_reset(struct usb_interface *intf)
+{
+	struct usb_skel *skel = usb_get_intfdata(intf);
+
+	/* io_mutex stays held on return; skel_post_reset() releases it */
+	mutex_lock(&skel->io_mutex);
+	skel_draw_down(skel);
+
+	return 0;
+}
+
+static int skel_post_reset(struct usb_interface *intf)
+{
+	struct usb_skel *skel = usb_get_intfdata(intf);
+
+	/*
+	 * No URBs are active at this point, so err_lock is not taken.
+	 * The stored -EPIPE is what the next read or write will report.
+	 */
+	skel->errors = -EPIPE;
+
+	/* releases the mutex acquired in skel_pre_reset() */
+	mutex_unlock(&skel->io_mutex);
+
+	return 0;
+}
+
+/* usb driver descriptor registered through module_usb_driver() below */
+static struct usb_driver skel_driver = {
+	.name =			NAME,
+	.probe =		skel_probe,
+	.disconnect =		skel_disconnect,
+	.suspend =		skel_suspend,
+	.resume =		skel_resume,
+	.pre_reset =		skel_pre_reset,
+	.post_reset =		skel_post_reset,
+	.id_table =		skel_table,
+	.supports_autosuspend = 1,
+};
+
+module_usb_driver(skel_driver);
diff --git a/drivers/gpib/nec7210/Makefile b/drivers/gpib/nec7210/Makefile
new file mode 100644
index 000000000000..64330f2e89d1
--- /dev/null
+++ b/drivers/gpib/nec7210/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_GPIB_NEC7210) += nec7210.o
+
+
diff --git a/drivers/gpib/nec7210/board.h b/drivers/gpib/nec7210/board.h
new file mode 100644
index 000000000000..ac3fe38ade57
--- /dev/null
+++ b/drivers/gpib/nec7210/board.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *   copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_PCIIA_BOARD_H
+#define _GPIB_PCIIA_BOARD_H
+
+#include "gpibP.h"
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+#include "nec7210.h"
+
+#endif	//_GPIB_PCIIA_BOARD_H
+
diff --git a/drivers/gpib/nec7210/nec7210.c b/drivers/gpib/nec7210/nec7210.c
new file mode 100644
index 000000000000..bbf39367f5e4
--- /dev/null
+++ b/drivers/gpib/nec7210/nec7210.c
@@ -0,0 +1,1121 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *   copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define dev_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "board.h"
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/dma.h>
+#include <linux/bitops.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB library code for NEC uPD7210");
+
+/* Write eos_byte to EOSR, set HR_REOS in auxa_bits and push auxa_bits to AUXMR. */
+int nec7210_enable_eos(struct gpib_board *board, struct nec7210_priv *priv, u8 eos_byte,
+		       int compare_8_bits)
+{
+	write_byte(priv, eos_byte, EOSR);
+	priv->auxa_bits |= HR_REOS;
+	/* HR_BIN tracks compare_8_bits: set when nonzero, cleared otherwise */
+	priv->auxa_bits = compare_8_bits ? (priv->auxa_bits | HR_BIN) :
+					   (priv->auxa_bits & ~HR_BIN);
+	write_byte(priv, priv->auxa_bits, AUXMR);
+	return 0;	/* always succeeds */
+}
+EXPORT_SYMBOL(nec7210_enable_eos);
+
+void nec7210_disable_eos(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	priv->auxa_bits &= ~HR_REOS;	/* undo the HR_REOS bit set by nec7210_enable_eos() */
+	write_byte(priv, priv->auxa_bits, AUXMR);
+}
+EXPORT_SYMBOL(nec7210_disable_eos);
+
+/* Run a parallel poll (AUX_EPP) and store the byte read from CPTR in *result. */
+int nec7210_parallel_poll(struct gpib_board *board, struct nec7210_priv *priv, u8 *result)
+{
+	clear_bit(COMMAND_READY_BN, &priv->state);
+
+	/* start the poll by issuing the auxiliary command */
+	write_byte(priv, AUX_EPP, AUXMR);
+	/*
+	 * Sleep until COMMAND_READY_BN is set again or a signal arrives.
+	 * FIXME: support timeouts
+	 */
+	if (wait_event_interruptible(board->wait, test_bit(COMMAND_READY_BN, &priv->state))) {
+		dev_dbg(board->gpib_dev, "gpib: parallel poll interrupted\n");
+		return -ERESTARTSYS;
+	}
+	*result = read_byte(priv, CPTR);
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_parallel_poll);
+
+void nec7210_parallel_poll_configure(struct gpib_board *board,
+				     struct nec7210_priv *priv, unsigned int configuration)
+{
+	write_byte(priv, PPR | configuration, AUXMR);	/* configuration is OR-ed into PPR */
+}
+EXPORT_SYMBOL(nec7210_parallel_poll_configure);
+
+/* Issue AUX_SPPF when ist is nonzero, AUX_CPPF otherwise. */
+void nec7210_parallel_poll_response(struct gpib_board *board, struct nec7210_priv *priv, int ist)
+{
+	const int aux_command = ist ? AUX_SPPF : AUX_CPPF;
+
+	write_byte(priv, aux_command, AUXMR);
+}
+EXPORT_SYMBOL(nec7210_parallel_poll_response);
+/*
+ * Write the serial poll status byte to SPMR and track whether a service
+ * request is outstanding (priv->srq_pending).
+ *
+ * This is really only adequate for chips that do a 488.2 style reqt/reqf
+ * based on bit 6 of the SPMR (see chapter 11.3.3 of 488.2). Simpler chips
+ * that set rsv directly from bit 6 need either more hardware setup to expose
+ * the 488.2 capability (for example with NI chips), or the 488.2 set srv
+ * state machine implemented in the driver (if that is even viable).
+ */
+void nec7210_serial_poll_response(struct gpib_board *board,
+				  struct nec7210_priv *priv, u8 status)
+{
+	const int requesting = (status & request_service_bit) != 0;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	priv->srq_pending = requesting;
+	if (requesting)
+		clear_bit(SPOLL_NUM, &board->status);
+	write_byte(priv, status, SPMR);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+EXPORT_SYMBOL(nec7210_serial_poll_response);
+
+u8 nec7210_serial_poll_status(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	return read_byte(priv, SPSR);	/* raw contents of the SPSR register */
+}
+EXPORT_SYMBOL(nec7210_serial_poll_status);
+
+int nec7210_primary_address(const struct gpib_board *board, struct nec7210_priv *priv,
+			    unsigned int address)
+{
+	// put primary address in address0; only the ADDRESS_MASK bits of address are written
+	write_byte(priv, address & ADDRESS_MASK, ADR);
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_primary_address);
+
+/* Program address1 (secondary address) or disable it, and select the matching address mode. */
+int nec7210_secondary_address(const struct gpib_board *board, struct nec7210_priv *priv,
+			      unsigned int address, int enable)
+{
+	/* address1 gets the masked secondary address, or HR_DT | HR_DL to disable it */
+	const unsigned int address1 = enable ? (address & ADDRESS_MASK) : (HR_DT | HR_DL);
+
+	write_byte(priv, HR_ARS | address1, ADR);
+	if (enable) {	/* go to address mode 2 */
+		priv->reg_bits[ADMR] &= ~HR_ADM0;
+		priv->reg_bits[ADMR] |= HR_ADM1;
+	} else {	/* go to address mode 1 */
+		priv->reg_bits[ADMR] |= HR_ADM0;
+		priv->reg_bits[ADMR] &= ~HR_ADM1;
+	}
+	/* push the updated cached ADMR value to the chip */
+	write_byte(priv, priv->reg_bits[ADMR], ADMR);
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_secondary_address);
+
+/*
+ * Derive priv->talker_state from the address status bits: idle without HR_TA,
+ * addressed while HR_NATN is clear, otherwise active or serial_poll_active (HR_SPMS).
+ */
+static void update_talker_state(struct nec7210_priv *priv, unsigned int address_status_bits)
+{
+	if (!(address_status_bits & HR_TA))
+		priv->talker_state = talker_idle;
+	else if (!(address_status_bits & HR_NATN))
+		priv->talker_state = talker_addressed;
+	else if (address_status_bits & HR_SPMS)
+		priv->talker_state = serial_poll_active;
+	else
+		priv->talker_state = talker_active;
+}
+
+/* Derive priv->listener_state from the HR_LA and HR_NATN address status bits. */
+static void update_listener_state(struct nec7210_priv *priv, unsigned int address_status_bits)
+{
+	/* HR_NATN set means ATN is not asserted (see nec7210_update_status_nolock) */
+	if (!(address_status_bits & HR_LA))
+		priv->listener_state = listener_idle;
+	else if (address_status_bits & HR_NATN)
+		priv->listener_state = listener_active;
+	else
+		priv->listener_state = listener_addressed;
+}
+
+/* Refresh the CIC/TACS/LACS/ATN/SPOLL bits of board->status from the chip; takes no lock. */
+unsigned int nec7210_update_status_nolock(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	int adsr, talking, listening;
+	u8 spoll_status;
+
+	if (!priv)
+		return 0;
+
+	adsr = read_byte(priv, ADSR);
+	if (adsr & HR_CIC)
+		set_bit(CIC_NUM, &board->status);
+	else
+		clear_bit(CIC_NUM, &board->status);
+	update_talker_state(priv, adsr);
+	talking = priv->talker_state == talker_active ||
+		  priv->talker_state == talker_addressed;
+	if (talking)
+		set_bit(TACS_NUM, &board->status);
+	else
+		clear_bit(TACS_NUM, &board->status);
+	update_listener_state(priv, adsr);
+	listening = priv->listener_state == listener_active ||
+		    priv->listener_state == listener_addressed;
+	if (listening)
+		set_bit(LACS_NUM, &board->status);
+	else
+		clear_bit(LACS_NUM, &board->status);
+
+	/* HR_NATN set means ATN is not asserted, so the status bit is inverted */
+	if (!(adsr & HR_NATN))
+		set_bit(ATN_NUM, &board->status);
+	else
+		clear_bit(ATN_NUM, &board->status);
+	spoll_status = nec7210_serial_poll_status(board, priv);
+	if (priv->srq_pending && !(spoll_status & request_service_bit)) {
+		priv->srq_pending = 0;
+		set_bit(SPOLL_NUM, &board->status);
+	}
+	/* we rely on the interrupt handler to set the rest of the status bits */
+	return board->status;
+}
+EXPORT_SYMBOL(nec7210_update_status_nolock);
+
+/* Clear clear_mask from board->status, then refresh it, all under board->spinlock. */
+unsigned int nec7210_update_status(struct gpib_board *board, struct nec7210_priv *priv,
+				   unsigned int clear_mask)
+{
+	unsigned int new_status;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	board->status &= ~clear_mask;
+	new_status = nec7210_update_status_nolock(board, priv);
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	return new_status;
+}
+EXPORT_SYMBOL(nec7210_update_status);
+
+/* Replace the mask-selected bits of the cached register with bits, then write it out. */
+unsigned int nec7210_set_reg_bits(struct nec7210_priv *priv, unsigned int reg,
+				  unsigned int mask, unsigned int bits)
+{
+	priv->reg_bits[reg] = (priv->reg_bits[reg] & ~mask) | (mask & bits);
+	write_byte(priv, priv->reg_bits[reg], reg);
+	return priv->reg_bits[reg];
+}
+EXPORT_SYMBOL(nec7210_set_reg_bits);
+
+/* Store the HR_HANDSHAKE_MASK bits of mode in auxa_bits; skips the write if unchanged. */
+void nec7210_set_handshake_mode(struct gpib_board *board, struct nec7210_priv *priv, int mode)
+{
+	const int wanted = mode & HR_HANDSHAKE_MASK;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	if ((priv->auxa_bits & HR_HANDSHAKE_MASK) != wanted) {
+		/* only touch AUXMR when the mode really changes */
+		priv->auxa_bits = (priv->auxa_bits & ~HR_HANDSHAKE_MASK) | wanted;
+		write_byte(priv, priv->auxa_bits, AUXMR);
+	}
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+EXPORT_SYMBOL(nec7210_set_handshake_mode);
+
+/*
+ * Read one byte from the DIR register and clear READ_READY_BN.
+ * *end is set to 1 if RECEIVED_END_BN was set (the bit is consumed), else 0.
+ */
+u8 nec7210_read_data_in(struct gpib_board *board, struct nec7210_priv *priv, int *end)
+{
+	unsigned long irq_flags;
+	u8 byte;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	byte = read_byte(priv, DIR);
+	clear_bit(READ_READY_BN, &priv->state);
+	*end = test_and_clear_bit(RECEIVED_END_BN, &priv->state) ? 1 : 0;
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	return byte;
+}
+EXPORT_SYMBOL(nec7210_read_data_in);
+
+/*
+ * Take control with AUX_TCS (syncronous != 0) or AUX_TCA, then poll ADSR
+ * up to 100 times, 1 us apart, until HR_NATN clears (ATN asserted).
+ * Returns 0 on success or -ETIMEDOUT if HR_NATN never cleared.
+ */
+int nec7210_take_control(struct gpib_board *board, struct nec7210_priv *priv, int syncronous)
+{
+	const int max_polls = 100;
+	int polls;
+
+	write_byte(priv, syncronous ? AUX_TCS : AUX_TCA, AUXMR);
+
+	/* busy wait until ATN is asserted */
+	for (polls = 0; polls < max_polls; polls++) {
+		if (!(read_byte(priv, ADSR) & HR_NATN))
+			break;
+		udelay(1);
+	}
+	if (polls == max_polls)
+		return -ETIMEDOUT;
+
+	clear_bit(WRITE_READY_BN, &priv->state);
+
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_take_control);
+
+/* Release ATN (AUX_GTS) and wait for HR_NATN; returns 0, -ERESTARTSYS or -ETIMEDOUT. */
+int nec7210_go_to_standby(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	const int max_polls = 1000;
+	int n;
+
+	write_byte(priv, AUX_GTS, AUXMR);
+
+	/* busy wait until ATN is released */
+	for (n = 0; n < max_polls; n++) {
+		if (read_byte(priv, ADSR) & HR_NATN)
+			break;
+		udelay(1);
+	}
+
+	/* busy wait ran out: fall back to sleeping one jiffy between polls */
+	if (n == max_polls) {
+		for (n = 0; n < HZ; n++) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			/* a nonzero return means the sleep ended before the jiffy elapsed */
+			if (schedule_timeout(1))
+				return -ERESTARTSYS;
+			if (read_byte(priv, ADSR) & HR_NATN)
+				break;
+		}
+		if (n == HZ)
+			return -ETIMEDOUT;
+	}
+
+	clear_bit(COMMAND_READY_BN, &priv->state);
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_go_to_standby);
+
+int nec7210_request_system_control(struct gpib_board *board, struct nec7210_priv *priv,
+				   int request_control)
+{
+	if (request_control)	/* only request_control == 0 writes to the chip */
+		return 0;
+	write_byte(priv, AUX_CREN, AUXMR);
+	write_byte(priv, AUX_CIFC, AUXMR);
+	write_byte(priv, AUX_DSC, AUXMR);
+	return 0;
+}
+EXPORT_SYMBOL(nec7210_request_system_control);
+
+/* Issue AUX_SIFC when assert is nonzero, AUX_CIFC otherwise. */
+void nec7210_interface_clear(struct gpib_board *board, struct nec7210_priv *priv, int assert)
+{
+	const int aux_command = assert ? AUX_SIFC : AUX_CIFC;
+
+	write_byte(priv, aux_command, AUXMR);
+}
+EXPORT_SYMBOL(nec7210_interface_clear);
+
+/* Issue AUX_SREN when enable is nonzero, AUX_CREN otherwise. */
+void nec7210_remote_enable(struct gpib_board *board, struct nec7210_priv *priv, int enable)
+{
+	const int aux_command = enable ? AUX_SREN : AUX_CREN;
+
+	write_byte(priv, aux_command, AUXMR);
+}
+EXPORT_SYMBOL(nec7210_remote_enable);
+
+/* Issue AUX_FH if RFD_HOLDOFF_BN is set and READ_READY_BN is clear, then clear the flag. */
+void nec7210_release_rfd_holdoff(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	if (test_bit(RFD_HOLDOFF_BN, &priv->state) && !test_bit(READ_READY_BN, &priv->state)) {
+		write_byte(priv, AUX_FH, AUXMR);
+		clear_bit(RFD_HOLDOFF_BN, &priv->state);
+	}
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+EXPORT_SYMBOL(nec7210_release_rfd_holdoff);
+
+/*
+ * Select the T1 delay: HR_TRI is set for requests of at most 500 ns, cleared
+ * otherwise.  Returns 500 or 2000 to match the setting chosen.
+ */
+int nec7210_t1_delay(struct gpib_board *board, struct nec7210_priv *priv,
+		     unsigned int nano_sec)
+{
+	const int fast = nano_sec <= 500;
+
+	if (fast)
+		priv->auxb_bits |= HR_TRI;
+	else
+		priv->auxb_bits &= ~HR_TRI;
+	write_byte(priv, priv->auxb_bits, AUXMR);
+	return fast ? 500 : 2000;
+}
+EXPORT_SYMBOL(nec7210_t1_delay);
+
+void nec7210_return_to_local(const struct gpib_board *board, struct nec7210_priv *priv)
+{
+	write_byte(priv, AUX_RTL, AUXMR);	/* single auxiliary command; no cached state changes */
+}
+EXPORT_SYMBOL(nec7210_return_to_local);
+
+/*
+ * Compare the ATN state reported by the chip (inverse of HR_NATN in ADSR)
+ * with the ATN_NUM bit cached in board->status.
+ * Returns 1 when they disagree, 0 when they match.
+ */
+static inline short nec7210_atn_has_changed(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	const short address_status = read_byte(priv, ADSR);
+	const int cached_atn = test_bit(ATN_NUM, &board->status) ? 1 : 0;
+
+	/* HR_NATN set: ATN is released, so a cached ATN bit is stale */
+	if (address_status & HR_NATN)
+		return cached_atn;
+
+	/* HR_NATN clear: ATN is asserted, so a missing cached bit is stale */
+	return !cached_atn;
+}
+
+int nec7210_command(struct gpib_board *board, struct nec7210_priv *priv, u8
+		    *buffer, size_t length, size_t *bytes_written)
+{
+	int retval = 0;
+	unsigned long flags;
+
+	*bytes_written = 0;
+	/* start from a clean BUS_ERROR_BN flag */
+	clear_bit(BUS_ERROR_BN, &priv->state);
+	/* write the command bytes to CDOR one by one, each gated on COMMAND_READY_BN */
+	while (*bytes_written < length)	{
+		if (wait_event_interruptible(board->wait,
+					     test_bit(COMMAND_READY_BN, &priv->state) ||
+					     test_bit(BUS_ERROR_BN, &priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			dev_dbg(board->gpib_dev, "command wait interrupted\n");
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (test_bit(TIMO_NUM, &board->status))
+			break;	/* -ETIMEDOUT is assigned after the loop */
+		if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
+			break;	/* NOTE(review): consumes the flag; -EIO below needs it set again */
+		spin_lock_irqsave(&board->spinlock, flags);
+		clear_bit(COMMAND_READY_BN, &priv->state);
+		write_byte(priv, buffer[*bytes_written], CDOR);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		++(*bytes_written);
+
+		if (need_resched())
+			schedule();
+	}
+	// wait for last byte to get sent
+	if (wait_event_interruptible(board->wait, test_bit(COMMAND_READY_BN, &priv->state) ||
+				     test_bit(BUS_ERROR_BN, &priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+	/* later checks win: a timeout overrides -ERESTARTSYS, a bus error overrides both */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+
+	if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
+		retval = -EIO;
+
+	return retval;
+}
+EXPORT_SYMBOL(nec7210_command);
+
+static int pio_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		    size_t length, int *end, size_t *bytes_read)
+{
+	ssize_t retval = 0;
+
+	*bytes_read = 0;
+	*end = 0;
+	/* one byte per iteration, until length bytes, END, a timeout or a device clear */
+	while (*bytes_read < length) {
+		if (wait_event_interruptible(board->wait,
+					     test_bit(READ_READY_BN, &priv->state) ||
+					     test_bit(DEV_CLEAR_BN, &priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (test_bit(READ_READY_BN, &priv->state)) {
+			if (*bytes_read == 0)	{
+				/*
+				 * We set the handshake mode here because we know
+				 * no new bytes will arrive (it has already arrived
+				 * and is awaiting being read out of the chip) while we are changing
+				 * modes.  This ensures we can reliably keep track
+				 * of the holdoff state.
+				 */
+				nec7210_set_handshake_mode(board, priv, HR_HLDA);
+			}
+			buffer[(*bytes_read)++] = nec7210_read_data_in(board, priv, end);
+			if (*end)
+				break;	/* RECEIVED_END_BN was set when this byte was read */
+		}
+		if (test_bit(TIMO_NUM, &board->status)) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+		/* more to read: release the RFD holdoff before waiting for the next byte */
+		if (*bytes_read < length)
+			nec7210_release_rfd_holdoff(board, priv);
+
+		if (need_resched())
+			schedule();
+	}
+	return retval;
+}
+
+#ifdef NEC_DMA
+static ssize_t __dma_read(struct gpib_board *board, struct nec7210_priv *priv, size_t length)
+{
+	ssize_t retval = 0;
+	size_t count = 0;
+	unsigned long flags, dma_irq_flags;
+
+	if (length == 0)
+		return 0;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	dma_irq_flags = claim_dma_lock();
+	disable_dma(priv->dma_channel);
+	/* program dma controller */
+	clear_dma_ff(priv->dma_channel);
+	set_dma_count(priv->dma_channel, length);
+	set_dma_addr(priv->dma_channel, priv->dma_buffer_addr);
+	set_dma_mode(priv->dma_channel, DMA_MODE_READ);
+	release_dma_lock(dma_irq_flags);
+
+	enable_dma(priv->dma_channel);
+
+	set_bit(DMA_READ_IN_PROGRESS_BN, &priv->state);
+	clear_bit(READ_READY_BN, &priv->state);
+
+	// enable nec7210 dma
+	nec7210_set_reg_bits(priv, IMR2, HR_DMAI, HR_DMAI);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	// sleep until the irq handler clears DMA_READ_IN_PROGRESS_BN or an abort condition is seen
+	if (wait_event_interruptible(board->wait,
+				     test_bit(DMA_READ_IN_PROGRESS_BN, &priv->state) == 0 ||
+				     test_bit(DEV_CLEAR_BN, &priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_bit(DEV_CLEAR_BN, &priv->state))
+		retval = -EINTR;
+
+	// disable nec7210 dma
+	nec7210_set_reg_bits(priv, IMR2, HR_DMAI, 0);
+
+	// record how many bytes we transferred
+	flags = claim_dma_lock();	/* flags is reused; board->spinlock was dropped above */
+	clear_dma_ff(priv->dma_channel);
+	disable_dma(priv->dma_channel);
+	count += length - get_dma_residue(priv->dma_channel);
+	release_dma_lock(flags);
+
+	return retval ? retval : count;	/* an error takes precedence over the byte count */
+}
+
+static ssize_t dma_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+			size_t length)
+{
+	size_t remain = length;
+	size_t transfer_size;
+	ssize_t retval = 0;
+
+	while (remain > 0) {
+		transfer_size = (priv->dma_buffer_length < remain) ?
+			priv->dma_buffer_length : remain;
+		retval = __dma_read(board, priv, transfer_size);
+		if (retval < 0)
+			break;
+		memcpy(buffer, priv->dma_buffer, transfer_size);
+		remain -= retval;
+		buffer += retval;
+		if (test_bit(RECEIVED_END_BN, &priv->state))
+			break;
+	}
+
+	if (retval < 0)
+		return retval;
+
+	return length - remain;
+}
+#endif
+
+/*
+ * Read up to length bytes into buffer using pio_read().  *end and
+ * *bytes_read start at 0 and are filled in by pio_read().
+ * Returns 0 (also for length == 0) or the negative error from pio_read().
+ */
+int nec7210_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		 size_t length, int *end, size_t *bytes_read)
+{
+	*end = 0;
+	*bytes_read = 0;
+	if (length == 0)
+		return 0;
+
+	clear_bit(DEV_CLEAR_BN, &priv->state); // XXX wrong
+
+	/* release any RFD holdoff still flagged before waiting for data */
+	nec7210_release_rfd_holdoff(board, priv);
+	return pio_read(board, priv, buffer, length, end, bytes_read);
+}
+EXPORT_SYMBOL(nec7210_read);
+
+static int pio_write_wait(struct gpib_board *board, struct nec7210_priv *priv,
+			  short wake_on_lacs, short wake_on_atn, short wake_on_bus_error)
+{
+	// sleep until a byte can be sent (TACS + WRITE_READY) or an abort/optional wake condition hits
+	if (wait_event_interruptible(board->wait,
+				     (test_bit(TACS_NUM, &board->status) &&
+				      test_bit(WRITE_READY_BN, &priv->state)) ||
+				     test_bit(DEV_CLEAR_BN, &priv->state) ||
+				     (wake_on_bus_error && test_bit(BUS_ERROR_BN, &priv->state)) ||
+				     (wake_on_lacs && test_bit(LACS_NUM, &board->status)) ||
+				     (wake_on_atn && test_bit(ATN_NUM, &board->status)) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		return -ERESTARTSYS;
+	/* map the wake-up reason to an errno: timeout, then device clear, then bus error */
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+
+	if (test_bit(DEV_CLEAR_BN, &priv->state))
+		return -EINTR;
+
+	if (wake_on_bus_error && test_and_clear_bit(BUS_ERROR_BN, &priv->state))
+		return -EIO;
+
+	return 0;	/* also reached when woken by the optional LACS/ATN conditions */
+}
+
+static int pio_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+		     size_t length, size_t *bytes_written)
+{
+	size_t last_count = 0;	/* value of *bytes_written before the most recent byte went out */
+	ssize_t retval = 0;
+	unsigned long flags;
+	const int max_bus_errors = (length > 1000) ? length : 1000;	/* at least 1000 */
+	int bus_error_count = 0;
+	*bytes_written = 0;
+
+	clear_bit(BUS_ERROR_BN, &priv->state);
+	/* bus errors only wake the waits below when priv->type == NEC7210 */
+	while (*bytes_written < length) {
+		if (need_resched())
+			schedule();
+
+		retval = pio_write_wait(board, priv, 0, 0, priv->type == NEC7210);
+		if (retval == -EIO) {
+			/* resend last byte on bus error */
+			*bytes_written = last_count;
+			/*
+			 * we can get unrecoverable bus errors,
+			 * so give up after a while
+			 */
+			bus_error_count++;
+			if (bus_error_count > max_bus_errors)
+				return retval;
+			continue;
+		} else {
+			if (retval < 0)
+				return retval;
+		}
+		spin_lock_irqsave(&board->spinlock, flags);
+		clear_bit(BUS_ERROR_BN, &priv->state);
+		clear_bit(WRITE_READY_BN, &priv->state);
+		last_count = *bytes_written;
+		write_byte(priv, buffer[(*bytes_written)++], CDOR);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+	}
+	retval = pio_write_wait(board, priv, 1, 1, priv->type == NEC7210);	/* also wakes on LACS/ATN */
+	return retval;
+}
+
+#ifdef NEC_DMA
+static ssize_t __dma_write(struct gpib_board *board, struct nec7210_priv *priv, dma_addr_t address,
+			   size_t length)
+{
+	unsigned long flags, dma_irq_flags;
+	int residue = 0;
+	int retval = 0;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	/* program dma controller */
+	dma_irq_flags = claim_dma_lock();
+	disable_dma(priv->dma_channel);
+	clear_dma_ff(priv->dma_channel);
+	set_dma_count(priv->dma_channel, length);
+	set_dma_addr(priv->dma_channel, address);
+	set_dma_mode(priv->dma_channel, DMA_MODE_WRITE);
+	enable_dma(priv->dma_channel);
+	release_dma_lock(dma_irq_flags);
+
+	// enable board's dma for output
+	nec7210_set_reg_bits(priv, IMR2, HR_DMAO, HR_DMAO);
+
+	clear_bit(WRITE_READY_BN, &priv->state);
+	set_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	// suspend until the message is sent (DMA_WRITE_IN_PROGRESS_BN cleared) or an abort is seen
+	if (wait_event_interruptible(board->wait,
+				     test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state) == 0 ||
+				     test_bit(BUS_ERROR_BN, &priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+	/* later checks override earlier ones; DEV_CLEAR_BN and BUS_ERROR_BN are consumed */
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+	if (test_and_clear_bit(DEV_CLEAR_BN, &priv->state))
+		retval = -EINTR;
+	if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
+		retval = -EIO;
+
+	// disable board's dma
+	nec7210_set_reg_bits(priv, IMR2, HR_DMAO, 0);
+
+	dma_irq_flags = claim_dma_lock();
+	clear_dma_ff(priv->dma_channel);
+	disable_dma(priv->dma_channel);
+	residue = get_dma_residue(priv->dma_channel);
+	release_dma_lock(dma_irq_flags);
+
+	if (residue)
+		retval = -EPIPE;	/* a nonzero residue overrides any earlier error code */
+
+	return retval ? retval : length;
+}
+
+static ssize_t dma_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
+			 size_t length)
+{
+	size_t remain = length;
+	size_t transfer_size;
+	ssize_t retval = 0;
+
+	while (remain > 0) {
+		transfer_size = (priv->dma_buffer_length < remain) ?
+			priv->dma_buffer_length : remain;
+		memcpy(priv->dma_buffer, buffer, transfer_size);
+		retval = __dma_write(board, priv, priv->dma_buffer_addr, transfer_size);
+		if (retval < 0)
+			break;
+		remain -= retval;
+		buffer += retval;
+	}
+
+	if (retval < 0)
+		return retval;
+
+	return length - remain;
+}
+#endif
+/*
+ * Write length bytes from buffer, optionally issuing AUX_SEOI before the
+ * last byte.  *bytes_written counts the bytes written by pio_write().
+ * Returns 0 on success (including length == 0) or a negative errno.
+ */
+int nec7210_write(struct gpib_board *board, struct nec7210_priv *priv,
+		  u8 *buffer, size_t length, int send_eoi,
+		  size_t *bytes_written)
+{
+	size_t num_bytes;
+	int retval = 0;
+
+	*bytes_written = 0;
+
+	clear_bit(DEV_CLEAR_BN, &priv->state); // XXX
+
+	/* nothing to send; also keeps the EOI adjustment below from wrapping length */
+	if (length == 0)
+		return 0;
+
+	if (send_eoi)
+		length--; // save the last byte for sending EOI
+
+	if (length > 0)	{
+		// isa dma transfer
+		if (0 /*priv->dma_channel*/) {
+/*
+ * dma writes are unreliable since they can't recover from bus errors
+ * (which happen when ATN is asserted in the middle of a write)
+ */
+#ifdef NEC_DMA
+			retval = dma_write(board, priv, buffer, length);
+			if (retval < 0)
+				return retval;
+			*bytes_written += retval;
+#endif
+		} else {	// PIO transfer
+			retval = pio_write(board, priv, buffer, length, &num_bytes);
+			*bytes_written += num_bytes;
+			if (retval < 0)
+				return retval;
+		}
+	}
+	if (!send_eoi)
+		return retval;
+
+	/*
+	 * Wait until the data byte can be written into the chip right after the
+	 * AUX_SEOI command.  This is really only needed for length == 1, since
+	 * otherwise the earlier pio_write() has done the wait already.
+	 */
+	retval = pio_write_wait(board, priv, 0, 0, priv->type == NEC7210);
+	if (retval < 0)
+		return retval;
+	write_byte(priv, AUX_SEOI, AUXMR);	/* send EOI with the next byte */
+	retval = pio_write(board, priv, &buffer[*bytes_written], 1, &num_bytes);
+	*bytes_written += num_bytes;
+	return retval;
+}
+EXPORT_SYMBOL(nec7210_write);
+
+/*
+ * Interrupt service routine.
+ *
+ * Reads ISR1 then ISR2 (the reads also clear the status) and passes both
+ * values to nec7210_interrupt_have_status(), whose result is returned.
+ */
+irqreturn_t nec7210_interrupt(struct gpib_board *board, struct nec7210_priv *priv)
+{
+	const int isr1 = read_byte(priv, ISR1);
+	const int isr2 = read_byte(priv, ISR2);
+
+	return nec7210_interrupt_have_status(board, priv, isr1, isr2);
+}
+EXPORT_SYMBOL(nec7210_interrupt);
+
+irqreturn_t nec7210_interrupt_have_status(struct gpib_board *board,
+					  struct nec7210_priv *priv, int status1, int status2)
+{
+#ifdef NEC_DMA
+	unsigned long dma_flags;
+#endif
+	int retval = IRQ_NONE;	/* becomes IRQ_HANDLED only in the wake-up branch at the end */
+
+	// record service request in status
+	if (status2 & HR_SRQI)
+		set_bit(SRQI_NUM, &board->status);
+
+	// change in lockout status
+	if (status2 & HR_LOKC) {
+		if (status2 & HR_LOK)
+			set_bit(LOK_NUM, &board->status);
+		else
+			clear_bit(LOK_NUM, &board->status);
+	}
+
+	// change in remote status
+	if (status2 & HR_REMC) {
+		if (status2 & HR_REM)
+			set_bit(REM_NUM, &board->status);
+		else
+			clear_bit(REM_NUM, &board->status);
+	}
+
+	// record reception of END (and flag the RFD holdoff when in HR_HLDE handshake mode)
+	if (status1 & HR_END) {
+		set_bit(RECEIVED_END_BN, &priv->state);
+		if ((priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDE)
+			set_bit(RFD_HOLDOFF_BN, &priv->state);
+	}
+
+	// incoming data in PIO mode (and flag the RFD holdoff when in HR_HLDA handshake mode)
+	if ((status1 & HR_DI)) {
+		set_bit(READ_READY_BN, &priv->state);
+		if ((priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDA)
+			set_bit(RFD_HOLDOFF_BN, &priv->state);
+	}
+#ifdef NEC_DMA
+	// check for dma read transfer complete
+	if (test_bit(DMA_READ_IN_PROGRESS_BN, &priv->state)) {
+		dma_flags = claim_dma_lock();
+		disable_dma(priv->dma_channel);
+		clear_dma_ff(priv->dma_channel);
+		if ((status1 & HR_END) || get_dma_residue(priv->dma_channel) == 0)
+			clear_bit(DMA_READ_IN_PROGRESS_BN, &priv->state);
+		else
+			enable_dma(priv->dma_channel);
+		release_dma_lock(dma_flags);
+	}
+#endif
+	if ((status1 & HR_DO)) {
+		if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state) == 0)
+			set_bit(WRITE_READY_BN, &priv->state);
+#ifdef NEC_DMA
+		if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state)) {	// write data, isa dma mode
+			// check if dma transfer is complete
+			dma_flags = claim_dma_lock();
+			disable_dma(priv->dma_channel);
+			clear_dma_ff(priv->dma_channel);
+			if (get_dma_residue(priv->dma_channel) == 0) {
+				clear_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state);
+			// XXX race? byte may still be in CDOR reg
+			} else {
+				clear_bit(WRITE_READY_BN, &priv->state);
+				enable_dma(priv->dma_channel);
+			}
+			release_dma_lock(dma_flags);
+		}
+#endif
+	}
+
+	// outgoing command can be sent
+	if (status2 & HR_CO)
+		set_bit(COMMAND_READY_BN, &priv->state);
+
+	// command pass through received: answer with AUX_NVAL
+	if (status1 & HR_CPT)
+		write_byte(priv, AUX_NVAL, AUXMR);
+
+	if (status1 & HR_ERR)
+		set_bit(BUS_ERROR_BN, &priv->state);
+
+	if (status1 & HR_DEC) {
+		unsigned short address_status_bits = read_byte(priv, ADSR);
+
+		// ignore device clear events if we are controller in charge
+		if ((address_status_bits & HR_CIC) == 0) {
+			push_gpib_event(board, EVENT_DEV_CLR);
+			set_bit(DEV_CLEAR_BN, &priv->state);
+		}
+	}
+
+	if (status1 & HR_DET)
+		push_gpib_event(board, EVENT_DEV_TRG);
+
+	// Addressing status has changed
+	if (status2 & HR_ADSC)
+		set_bit(ADR_CHANGE_BN, &priv->state);
+	/* wake sleepers only for sources enabled in IMR1/IMR2, or when ATN has changed */
+	if ((status1 & priv->reg_bits[IMR1]) ||
+	    (status2 & (priv->reg_bits[IMR2] & IMR2_ENABLE_INTR_MASK)) ||
+	    nec7210_atn_has_changed(board, priv))	{
+		nec7210_update_status_nolock(board, priv);
+		dev_dbg(board->gpib_dev, "minor %i, stat %lx, isr1 0x%x, imr1 0x%x, isr2 0x%x, imr2 0x%x\n",
+			board->minor, board->status, status1, priv->reg_bits[IMR1], status2,
+			     priv->reg_bits[IMR2]);
+		wake_up_interruptible(&board->wait); /* wake up sleeping process */
+		retval = IRQ_HANDLED;
+	}
+
+	return retval;
+}
+EXPORT_SYMBOL(nec7210_interrupt_have_status);
+
+void nec7210_board_reset(struct nec7210_priv *priv, const struct gpib_board *board)
+{
+	/* 7210 chip reset */
+	write_byte(priv, AUX_CR, AUXMR);
+
+	/* disable all interrupts */
+	priv->reg_bits[IMR1] = 0;
+	write_byte(priv, priv->reg_bits[IMR1], IMR1);
+	priv->reg_bits[IMR2] = 0;
+	write_byte(priv, priv->reg_bits[IMR2], IMR2);
+	write_byte(priv, 0, SPMR);	/* zero the SPMR register */
+
+	/* clear registers by reading */
+	read_byte(priv, CPTR);
+	read_byte(priv, ISR1);
+	read_byte(priv, ISR2);
+
+	/* parallel poll unconfigure */
+	write_byte(priv, PPR | HR_PPU, AUXMR);
+
+	priv->reg_bits[ADMR] = HR_TRM0 | HR_TRM1;	/* cached only; ADMR is not written here */
+
+	priv->auxa_bits = AUXRA | HR_HLDA;
+	write_byte(priv, priv->auxa_bits, AUXMR);
+
+	write_byte(priv, AUXRE | 0, AUXMR);
+
+	/* set INT pin to active high, enable command pass through of unknown commands */
+	priv->auxb_bits = AUXRB | HR_CPTE;
+	write_byte(priv, priv->auxb_bits, AUXMR);
+	write_byte(priv, AUXRE, AUXMR);	/* same value as the AUXRE | 0 write above */
+}
+EXPORT_SYMBOL(nec7210_board_reset);
+
+void nec7210_board_online(struct nec7210_priv *priv, const struct gpib_board *board)
+{
+	/* set GPIB address; the secondary address is enabled only when board->sad >= 0 */
+	nec7210_primary_address(board, priv, board->pad);
+	nec7210_secondary_address(board, priv, board->sad, board->sad >= 0);
+
+	/* enable interrupts */
+	priv->reg_bits[IMR1] = HR_ERRIE | HR_DECIE | HR_ENDIE |
+		HR_DETIE | HR_CPTIE | HR_DOIE | HR_DIIE;
+	priv->reg_bits[IMR2] = IMR2_ENABLE_INTR_MASK;
+	write_byte(priv, priv->reg_bits[IMR1], IMR1);
+	write_byte(priv, priv->reg_bits[IMR2], IMR2);
+
+	write_byte(priv, AUX_PON, AUXMR);	/* issued last, after addresses and interrupt masks */
+}
+EXPORT_SYMBOL(nec7210_board_online);
+
+#ifdef CONFIG_HAS_IOPORT
+/* wrappers for io: the port address is iobase + register_num * offset */
+u8 nec7210_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num)
+{
+	return inb(priv->iobase + register_num * priv->offset);
+}
+EXPORT_SYMBOL(nec7210_ioport_read_byte);
+
+void nec7210_ioport_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num)
+{
+	if (register_num == AUXMR)
+		/*
+		 * locking makes absolutely sure no one accesses the
+		 * AUXMR register faster than once per microsecond
+		 */
+		nec7210_locking_ioport_write_byte(priv, data, register_num);
+	else
+		outb(data, priv->iobase + register_num * priv->offset);
+}
+EXPORT_SYMBOL(nec7210_ioport_write_byte);
+
+/* locking variants of io wrappers, for chips that page-in registers */
+u8 nec7210_locking_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num)
+{
+	unsigned long irq_flags;
+	u8 value;
+
+	spin_lock_irqsave(&priv->register_page_lock, irq_flags);
+	value = inb(priv->iobase + register_num * priv->offset);
+	spin_unlock_irqrestore(&priv->register_page_lock, irq_flags);
+	return value;
+}
+EXPORT_SYMBOL(nec7210_locking_ioport_read_byte);
+
+void nec7210_locking_ioport_write_byte(struct nec7210_priv *priv, u8 data,
+				       unsigned int register_num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->register_page_lock, flags);
+	if (register_num == AUXMR)
+		udelay(1);	/* delay inside the lock, before every AUXMR write */
+	outb(data, priv->iobase + register_num * priv->offset);
+	spin_unlock_irqrestore(&priv->register_page_lock, flags);
+}
+EXPORT_SYMBOL(nec7210_locking_ioport_write_byte);
+#endif
+
+u8 nec7210_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num)
+{
+	return readb(priv->mmiobase + register_num * priv->offset);	/* unlocked MMIO read */
+}
+EXPORT_SYMBOL(nec7210_iomem_read_byte);
+
+void nec7210_iomem_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num)
+{
+	if (register_num == AUXMR)
+		/*
+		 * locking makes absolutely sure no one accesses the
+		 * AUXMR register faster than once per microsecond
+		 */
+		nec7210_locking_iomem_write_byte(priv, data, register_num);
+	else
+		writeb(data, priv->mmiobase + register_num * priv->offset);
+}
+EXPORT_SYMBOL(nec7210_iomem_write_byte);
+
+u8 nec7210_locking_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num)
+{
+	unsigned long irq_flags;
+	u8 value;
+
+	spin_lock_irqsave(&priv->register_page_lock, irq_flags);
+	value = readb(priv->mmiobase + register_num * priv->offset);
+	spin_unlock_irqrestore(&priv->register_page_lock, irq_flags);
+	return value;
+}
+EXPORT_SYMBOL(nec7210_locking_iomem_read_byte);
+
+void nec7210_locking_iomem_write_byte(struct nec7210_priv *priv, u8 data,
+				      unsigned int register_num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->register_page_lock, flags);
+	if (register_num == AUXMR)
+		udelay(1);	/* delay inside the lock, before every AUXMR write */
+	writeb(data, priv->mmiobase + register_num * priv->offset);
+	spin_unlock_irqrestore(&priv->register_page_lock, flags);
+}
+EXPORT_SYMBOL(nec7210_locking_iomem_write_byte);
+
+static int __init nec7210_init_module(void)
+{
+	return 0;	/* no setup is done at load time */
+}
+
+static void __exit nec7210_exit_module(void)
+{	/* empty: nothing is torn down at unload */
+}
+
+module_init(nec7210_init_module);
+module_exit(nec7210_exit_module);
diff --git a/drivers/gpib/ni_usb/Makefile b/drivers/gpib/ni_usb/Makefile
new file mode 100644
index 000000000000..469c5d16add3
--- /dev/null
+++ b/drivers/gpib/ni_usb/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_GPIB_NI_USB) += ni_usb_gpib.o
+
+
diff --git a/drivers/gpib/ni_usb/ni_usb_gpib.c b/drivers/gpib/ni_usb/ni_usb_gpib.c
new file mode 100644
index 000000000000..1f8412de9fa3
--- /dev/null
+++ b/drivers/gpib/ni_usb/ni_usb_gpib.c
@@ -0,0 +1,2678 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * driver for National Instruments usb to gpib adapters
+ *    copyright		   : (C) 2004 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "ni_usb_gpib.h"
+#include "gpibP.h"
+#include "nec7210.h"
+#include "tnt4882_registers.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for National Instruments USB devices");
+
+#define MAX_NUM_NI_USB_INTERFACES 128
+static struct usb_interface *ni_usb_driver_interfaces[MAX_NUM_NI_USB_INTERFACES];
+
+static int ni_usb_parse_status_block(const u8 *buffer, struct ni_usb_status_block *status);
+static int ni_usb_set_interrupt_monitor(struct gpib_board *board, unsigned int monitored_bits);
+static void ni_usb_stop(struct ni_usb_priv *ni_priv);
+
+static DEFINE_MUTEX(ni_usb_hotplug_lock);
+
+/*
+ * Convert a timeout given in microseconds into a millisecond timeout that
+ * can be passed to the usb transfer helpers in this file.
+ *
+ * Zero is passed through unchanged (the bulk helpers do not arm their timer
+ * for a zero timeout).  Any other value becomes 2000 + usec / 500.
+ */
+static inline unsigned long ni_usb_timeout_msecs(unsigned int usec)
+{
+	return usec ? 2000 + usec / 500 : 0;
+}
+
+/*
+ * Map a timeout in microseconds to the timeout code byte used in ni-usb-b
+ * instructions.
+ *
+ * Zero maps to 0xf0.  Otherwise the code of the smallest table entry whose
+ * limit is >= @usec is returned.  Values above 1e9 usec are logged and
+ * treated like zero (0xf0).
+ */
+static unsigned short ni_usb_timeout_code(unsigned int usec)
+{
+	static const struct {
+		unsigned int max_usec;
+		unsigned short code;
+	} timeout_codes[] = {
+		{ 10, 0xf1 },
+		{ 30, 0xf2 },
+		{ 100, 0xf3 },
+		{ 300, 0xf4 },
+		{ 1000, 0xf5 },
+		{ 3000, 0xf6 },
+		{ 10000, 0xf7 },
+		{ 30000, 0xf8 },
+		{ 100000, 0xf9 },
+		{ 300000, 0xfa },
+		{ 1000000, 0xfb },
+		{ 3000000, 0xfc },
+		{ 10000000, 0xfd },
+		{ 30000000, 0xfe },
+		{ 100000000, 0xff },
+		{ 300000000, 0x01 },
+		/*
+		 * NI driver actually uses 0xff for timeout T1000s, which is a
+		 * bug in their code.  I've verified on a usb-b that a code of
+		 * 0x2 is correct for a 1000 sec timeout.
+		 */
+		{ 1000000000, 0x02 },
+	};
+	unsigned int idx;
+
+	if (usec == 0)
+		return 0xf0;
+
+	for (idx = 0; idx < ARRAY_SIZE(timeout_codes); idx++) {
+		if (usec <= timeout_codes[idx].max_usec)
+			return timeout_codes[idx].code;
+	}
+
+	pr_err("bug? usec is greater than 1e9\n");
+	return 0xf0;
+}
+
+/*
+ * Bulk URB completion callback: signals the completion embedded in the
+ * transfer context stored in urb->context.
+ */
+static void ni_usb_bulk_complete(struct urb *urb)
+{
+	struct ni_usb_urb_ctx *ctx = urb->context;
+
+	complete(&ctx->complete);
+}
+
+/*
+ * Timer callback for the bulk_timer member of struct ni_usb_priv: flags the
+ * transfer context as timed out and signals its completion so the waiter
+ * can proceed.
+ */
+static void ni_usb_timeout_handler(struct timer_list *t)
+{
+	struct ni_usb_priv *priv = timer_container_of(priv, t, bulk_timer);
+
+	priv->context.timed_out = 1;
+	complete(&priv->context.complete);
+}
+
+/*
+ * Submit one bulk-out URB carrying @data and wait for it to finish.
+ *
+ * "Nonblocking" is used loosely: the function does wait for the transfer,
+ * it only means -EAGAIN is returned instead of waiting when another bulk
+ * URB is already outstanding (ni_priv->bulk_urb is non-NULL).
+ *
+ * @actual_data_length is set to 0 on entry and to the URB's actual_length
+ * once the transfer has finished.  A non-zero @timeout_msecs arms
+ * ni_priv->bulk_timer; with zero the timer is left alone.
+ *
+ * Returns -ENODEV without a bus interface, -EAGAIN if a bulk URB is already
+ * in flight, -ENOMEM if the URB cannot be allocated, the usb_submit_urb()
+ * error if submission fails, -ETIMEDOUT if context->timed_out was set
+ * (see ni_usb_timeout_handler()), otherwise the URB status.
+ */
+static int ni_usb_nonblocking_send_bulk_msg(struct ni_usb_priv *ni_priv, void *data,
+					    int data_length, int *actual_data_length,
+					    int timeout_msecs)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	unsigned int out_pipe;
+	struct ni_usb_urb_ctx *context = &ni_priv->context;
+
+	*actual_data_length = 0;
+	/* bulk_transfer_lock guards bus_interface and the single bulk_urb slot */
+	mutex_lock(&ni_priv->bulk_transfer_lock);
+	if (!ni_priv->bus_interface) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -ENODEV;
+	}
+	if (ni_priv->bulk_urb) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -EAGAIN;
+	}
+	ni_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!ni_priv->bulk_urb) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_pipe = usb_sndbulkpipe(usb_dev, ni_priv->bulk_out_endpoint);
+	init_completion(&context->complete);
+	context->timed_out = 0;
+	usb_fill_bulk_urb(ni_priv->bulk_urb, usb_dev, out_pipe, data, data_length,
+			  &ni_usb_bulk_complete, context);
+
+	if (timeout_msecs)
+		mod_timer(&ni_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
+
+	retval = usb_submit_urb(ni_priv->bulk_urb, GFP_KERNEL);
+	if (retval) {
+		/* undo the timer and release the URB slot before reporting the error */
+		timer_delete_sync(&ni_priv->bulk_timer);
+		usb_free_urb(ni_priv->bulk_urb);
+		ni_priv->bulk_urb = NULL;
+		dev_err(&usb_dev->dev, "failed to submit bulk out urb, retval=%i\n",
+			retval);
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return retval;
+	}
+	/* the lock is dropped while waiting; the non-NULL bulk_urb keeps others out */
+	mutex_unlock(&ni_priv->bulk_transfer_lock);
+	wait_for_completion(&context->complete);    // wait for ni_usb_bulk_complete
+	if (context->timed_out) {
+		usb_kill_urb(ni_priv->bulk_urb);
+		dev_err(&usb_dev->dev, "killed urb due to timeout\n");
+		retval = -ETIMEDOUT;
+	} else {
+		retval = ni_priv->bulk_urb->status;
+	}
+
+	timer_delete_sync(&ni_priv->bulk_timer);
+	*actual_data_length = ni_priv->bulk_urb->actual_length;
+	/* retake the lock to free the URB and reopen the slot */
+	mutex_lock(&ni_priv->bulk_transfer_lock);
+	usb_free_urb(ni_priv->bulk_urb);
+	ni_priv->bulk_urb = NULL;
+	mutex_unlock(&ni_priv->bulk_transfer_lock);
+	return retval;
+}
+
+/*
+ * Send @data on the bulk-out endpoint, retrying while another bulk transfer
+ * occupies the URB slot.
+ *
+ * ni_usb_nonblocking_send_bulk_msg() returns -EAGAIN when the slot is busy.
+ * In that case sleep for roughly a millisecond and try again.  With a
+ * non-zero @timeout_msecs every retry is charged one millisecond against the
+ * budget; with @timeout_msecs == 0 the retries continue until the slot is
+ * free.
+ *
+ * Returns the result of the last transfer attempt, or -ETIMEDOUT if the
+ * slot was still busy when the retry budget ran out.
+ */
+static int ni_usb_send_bulk_msg(struct ni_usb_priv *ni_priv, void *data, int data_length,
+				int *actual_data_length, int timeout_msecs)
+{
+	int retval;
+	int timeout_msecs_remaining = timeout_msecs;
+
+	retval = ni_usb_nonblocking_send_bulk_msg(ni_priv, data, data_length, actual_data_length,
+						  timeout_msecs_remaining);
+	while (retval == -EAGAIN && (timeout_msecs == 0 || timeout_msecs_remaining > 0)) {
+		usleep_range(1000, 1500);
+		/* the remaining budget is > 0 here, so it is never passed as "no timeout" */
+		retval = ni_usb_nonblocking_send_bulk_msg(ni_priv, data, data_length,
+							  actual_data_length,
+							  timeout_msecs_remaining);
+		if (timeout_msecs != 0)
+			--timeout_msecs_remaining;
+	}
+	/*
+	 * Only a transfer that never got started counts as a timeout.  The
+	 * outcome of the last attempt (success or a real error) must not be
+	 * replaced by -ETIMEDOUT just because that attempt happened to use up
+	 * the last millisecond of the retry budget.
+	 */
+	if (retval == -EAGAIN)
+		return -ETIMEDOUT;
+	return retval;
+}
+
+/*
+ * Submit one bulk-in URB that fills @data and wait for it to finish.
+ *
+ * "Nonblocking" is used loosely: the function does wait for the transfer,
+ * it only means -EAGAIN is returned instead of waiting when another bulk
+ * URB is already outstanding (ni_priv->bulk_urb is non-NULL).
+ *
+ * @actual_data_length is set to 0 on entry and to the URB's actual_length
+ * once the transfer has finished.  A non-zero @timeout_msecs arms
+ * ni_priv->bulk_timer.  With @interruptible set, a signal during the wait
+ * makes the function call ni_usb_stop() and then wait uninterruptibly for
+ * the response.
+ *
+ * Returns -ENODEV without a bus interface, -EAGAIN if a bulk URB is already
+ * in flight, -ENOMEM if the URB cannot be allocated, the usb_submit_urb()
+ * error if submission fails, -ETIMEDOUT if context->timed_out was set, a
+ * non-zero URB status if there is one, -ERESTARTSYS if the wait was
+ * interrupted and the URB then completed with status 0, otherwise 0.
+ */
+static int ni_usb_nonblocking_receive_bulk_msg(struct ni_usb_priv *ni_priv,
+					       void *data, int data_length,
+					       int *actual_data_length, int timeout_msecs,
+					       int interruptible)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	unsigned int in_pipe;
+	struct ni_usb_urb_ctx *context = &ni_priv->context;
+
+	*actual_data_length = 0;
+	/* bulk_transfer_lock guards bus_interface and the single bulk_urb slot */
+	mutex_lock(&ni_priv->bulk_transfer_lock);
+	if (!ni_priv->bus_interface) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -ENODEV;
+	}
+	if (ni_priv->bulk_urb) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -EAGAIN;
+	}
+	ni_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!ni_priv->bulk_urb) {
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return -ENOMEM;
+	}
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	in_pipe = usb_rcvbulkpipe(usb_dev, ni_priv->bulk_in_endpoint);
+	init_completion(&context->complete);
+	context->timed_out = 0;
+	usb_fill_bulk_urb(ni_priv->bulk_urb, usb_dev, in_pipe, data, data_length,
+			  &ni_usb_bulk_complete, context);
+
+	if (timeout_msecs)
+		mod_timer(&ni_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
+
+	retval = usb_submit_urb(ni_priv->bulk_urb, GFP_KERNEL);
+	if (retval) {
+		/* undo the timer and release the URB slot before reporting the error */
+		timer_delete_sync(&ni_priv->bulk_timer);
+		usb_free_urb(ni_priv->bulk_urb);
+		ni_priv->bulk_urb = NULL;
+		dev_err(&usb_dev->dev, "failed to submit bulk in urb, retval=%i\n", retval);
+		mutex_unlock(&ni_priv->bulk_transfer_lock);
+		return retval;
+	}
+	/* the lock is dropped while waiting; the non-NULL bulk_urb keeps others out */
+	mutex_unlock(&ni_priv->bulk_transfer_lock);
+	if (interruptible) {
+		if (wait_for_completion_interruptible(&context->complete)) {
+			/*
+			 * If we got interrupted by a signal while
+			 * waiting for the usb gpib to respond, we
+			 * should send a stop command so it will
+			 * finish up with whatever it was doing and
+			 * send its response now.
+			 */
+			ni_usb_stop(ni_priv);
+			retval = -ERESTARTSYS;
+			/*
+			 * now do an uninterruptible wait, it shouldn't take long
+			 * for the board to respond now.
+			 */
+			wait_for_completion(&context->complete);
+		}
+	} else {
+		wait_for_completion(&context->complete);
+	}
+	if (context->timed_out) {
+		usb_kill_urb(ni_priv->bulk_urb);
+		dev_err(&usb_dev->dev, "killed urb due to timeout\n");
+		retval = -ETIMEDOUT;
+	} else {
+		/* a zero URB status leaves retval alone so -ERESTARTSYS survives */
+		if (ni_priv->bulk_urb->status)
+			retval = ni_priv->bulk_urb->status;
+	}
+	timer_delete_sync(&ni_priv->bulk_timer);
+	*actual_data_length = ni_priv->bulk_urb->actual_length;
+	/* retake the lock to free the URB and reopen the slot */
+	mutex_lock(&ni_priv->bulk_transfer_lock);
+	usb_free_urb(ni_priv->bulk_urb);
+	ni_priv->bulk_urb = NULL;
+	mutex_unlock(&ni_priv->bulk_transfer_lock);
+	return retval;
+}
+
+/*
+ * Receive into @data from the bulk-in endpoint, retrying while another bulk
+ * transfer occupies the URB slot.
+ *
+ * ni_usb_nonblocking_receive_bulk_msg() returns -EAGAIN when the slot is
+ * busy.  In that case sleep for roughly a millisecond and try again.  With
+ * a non-zero @timeout_msecs every retry is charged one millisecond against
+ * the budget; with @timeout_msecs == 0 the retries continue until the slot
+ * is free.  @interruptible is passed through to the transfer.
+ *
+ * Returns the result of the last transfer attempt (which may be
+ * -ERESTARTSYS for an interrupted wait), or -ETIMEDOUT if the slot was
+ * still busy when the retry budget ran out.
+ */
+static int ni_usb_receive_bulk_msg(struct ni_usb_priv *ni_priv, void *data,
+				   int data_length, int *actual_data_length, int timeout_msecs,
+				   int interruptible)
+{
+	int retval;
+	int timeout_msecs_remaining = timeout_msecs;
+
+	retval = ni_usb_nonblocking_receive_bulk_msg(ni_priv, data, data_length,
+						     actual_data_length, timeout_msecs_remaining,
+						     interruptible);
+	while (retval == -EAGAIN && (timeout_msecs == 0 || timeout_msecs_remaining > 0)) {
+		usleep_range(1000, 1500);
+		/* the remaining budget is > 0 here, so it is never passed as "no timeout" */
+		retval = ni_usb_nonblocking_receive_bulk_msg(ni_priv, data, data_length,
+							     actual_data_length,
+							     timeout_msecs_remaining,
+							     interruptible);
+		if (timeout_msecs != 0)
+			--timeout_msecs_remaining;
+	}
+	/*
+	 * Only a transfer that never got started counts as a timeout.  Data
+	 * received by the last attempt must not be discarded behind
+	 * -ETIMEDOUT just because that attempt happened to use up the last
+	 * millisecond of the retry budget.
+	 */
+	if (retval == -EAGAIN)
+		return -ETIMEDOUT;
+	return retval;
+}
+
+/*
+ * Issue a control-in request on endpoint 0 while holding
+ * control_transfer_lock.
+ *
+ * Returns -ENODEV when no bus interface is attached, otherwise whatever
+ * usb_control_msg() returns.
+ */
+static int ni_usb_receive_control_msg(struct ni_usb_priv *ni_priv, __u8 request,
+				      __u8 requesttype, __u16 value, __u16 index,
+				      void *data, __u16 size, int timeout_msecs)
+{
+	int retval = -ENODEV;
+
+	mutex_lock(&ni_priv->control_transfer_lock);
+	if (ni_priv->bus_interface) {
+		struct usb_device *udev = interface_to_usbdev(ni_priv->bus_interface);
+
+		retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request,
+					 requesttype, value, index, data, size,
+					 timeout_msecs);
+	}
+	mutex_unlock(&ni_priv->control_transfer_lock);
+
+	return retval;
+}
+
+/*
+ * Fold a status word reported by the adapter into board->status.
+ *
+ * The bits in @clear_mask and the tracked ibsta bits are cleared, then the
+ * tracked bits present in @ni_usb_ibsta are set.  DCAS and DTAS push the
+ * device clear and device trigger events.  Bits reported as set are dropped
+ * from the monitored set.  If a monitor-mask bit is then neither monitored
+ * nor set, the interrupt monitor is re-armed; otherwise, if an unmonitored
+ * bit is set, waiters on board->wait are woken.
+ */
+static void ni_usb_soft_update_status(struct gpib_board *board, unsigned int ni_usb_ibsta,
+				      unsigned int clear_mask)
+{
+	static const unsigned int tracked_bits = SRQI | ATN | CIC | REM | LACS | TACS | LOK;
+
+	struct ni_usb_priv *priv = board->private_data;
+	struct usb_device *udev = interface_to_usbdev(priv->bus_interface);
+	unsigned int unmonitored;
+	unsigned long irq_flags;
+
+	board->status &= ~clear_mask;
+	board->status &= ~tracked_bits;
+	board->status |= ni_usb_ibsta & tracked_bits;
+
+	if (ni_usb_ibsta & DCAS)
+		push_gpib_event(board, EVENT_DEV_CLR);
+	if (ni_usb_ibsta & DTAS)
+		push_gpib_event(board, EVENT_DEV_TRG);
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	/* bits that are already set no longer count as monitored */
+	priv->monitored_ibsta_bits &= ~ni_usb_ibsta;
+	/* monitor-mask bits that are not in the monitored set */
+	unmonitored = ni_usb_ibsta_monitor_mask;
+	unmonitored &= ~priv->monitored_ibsta_bits;
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+	dev_dbg(&udev->dev, "need_monitoring_bits=0x%x\n", unmonitored);
+
+	if (unmonitored & ~ni_usb_ibsta)
+		ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
+	else if (unmonitored & ni_usb_ibsta)
+		wake_up_interruptible(&board->wait);
+
+	dev_dbg(&udev->dev, "ibsta=0x%x\n", ni_usb_ibsta);
+}
+
+/*
+ * Decode a status block from @buffer into @status.
+ *
+ * Byte 0 is the block id, bytes 1-2 hold ibsta (high byte first), byte 3
+ * is the error code and bytes 4-5 hold a 16-bit count (low byte first)
+ * that is stored negated (two's complement).
+ *
+ * Always returns 8.
+ */
+static int ni_usb_parse_status_block(const u8 *buffer, struct ni_usb_status_block *status)
+{
+	const u16 raw_count = buffer[4] | (buffer[5] << 8);
+
+	status->id = buffer[0];
+	status->ibsta = (buffer[1] << 8) | buffer[2];
+	status->error_code = buffer[3];
+	/* two's complement negation, truncated to 16 bits */
+	status->count = (u16)(~raw_count + 1);
+
+	return 8;
+}
+
+/*
+ * Hex-dump @length bytes of @raw_data to the kernel log at KERN_INFO, with
+ * no prefix, a group size of 1 and the ASCII column enabled.
+ *
+ * NOTE(review): a row size of 8 is requested; print_hex_dump() documents 16
+ * or 32 as the supported row sizes -- confirm what 8 actually produces.
+ */
+static void ni_usb_dump_raw_block(const u8 *raw_data, int length)
+{
+	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE,
+		       8, 1,
+		       raw_data, length,
+		       true);
+}
+
+/*
+ * Extract @num_results register values from a register-read response.
+ *
+ * As read by this parser, the response is a sequence of chunks, each made
+ * of one NIUSB_REGISTER_READ_DATA_START_ID byte followed by up to three
+ * result bytes.  After the chunks the index is rounded up to a multiple of
+ * four, then come a NIUSB_REGISTER_READ_DATA_END_ID byte, a count byte
+ * (checked modulo three against @num_results) and zero padding up to the
+ * next multiple of four.
+ *
+ * Format mismatches are only logged (and the raw block dumped); parsing
+ * continues regardless.  Returns the number of bytes consumed.
+ *
+ * NOTE(review): no buffer length is passed in, so nothing here bounds the
+ * reads of raw_data -- callers must supply a large enough buffer.
+ */
+static int ni_usb_parse_register_read_block(const u8 *raw_data, unsigned int *results,
+					    int num_results)
+{
+	int i = 0;
+	int j;
+	int unexpected = 0;
+	static const int results_per_chunk = 3;
+
+	/* i indexes raw_data, j indexes results */
+	for (j = 0; j < num_results;) {
+		int k;
+
+		if (raw_data[i++] != NIUSB_REGISTER_READ_DATA_START_ID) {
+			pr_err("parse error: wrong start id\n");
+			unexpected = 1;
+		}
+		for (k = 0; k < results_per_chunk && j < num_results; ++k)
+			results[j++] = raw_data[i++];
+	}
+	/* skip to the next 4-byte boundary */
+	while (i % 4)
+		i++;
+	if (raw_data[i++] != NIUSB_REGISTER_READ_DATA_END_ID) {
+		pr_err("parse error: wrong end id\n");
+		unexpected = 1;
+	}
+	if (raw_data[i++] % results_per_chunk != num_results % results_per_chunk) {
+		pr_err("parse error: wrong count=%i for NIUSB_REGISTER_READ_DATA_END\n",
+		       (int)raw_data[i - 1]);
+		unexpected = 1;
+	}
+	/* remaining bytes up to the 4-byte boundary are expected to be zero */
+	while (i % 4) {
+		if (raw_data[i++] != 0) {
+			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
+			       i - 1, (int)raw_data[i - 1]);
+			unexpected = 1;
+		}
+	}
+	if (unexpected)
+		ni_usb_dump_raw_block(raw_data, i);
+	return i;
+}
+
+/*
+ * Check the termination block at @buffer: NIUSB_TERM_ID followed by three
+ * zero bytes.  A malformed block is only logged.
+ *
+ * Returns the number of bytes examined.  That is 4 for a well-formed block
+ * and fewer when the comparison stopped at the first mismatching byte.  The
+ * return value is deliberately left as it was so that callers comparing the
+ * parsed length against the received length keep seeing a mismatch.
+ */
+static int ni_usb_parse_termination_block(const u8 *buffer)
+{
+	int i = 0;
+
+	if (buffer[i++] != NIUSB_TERM_ID ||
+	    buffer[i++] != 0x0 ||
+	    buffer[i++] != 0x0 ||
+	    buffer[i++] != 0x0) {
+		pr_err("received unexpected termination block\n");
+		pr_err(" expected: 0x%x 0x%x 0x%x 0x%x\n", NIUSB_TERM_ID, 0x0, 0x0, 0x0);
+		/*
+		 * The || chain short-circuits, so i can be less than 4 here.
+		 * Index from the start of the block rather than back from i,
+		 * which would read in front of the buffer.
+		 */
+		pr_err(" received: 0x%x 0x%x 0x%x 0x%x\n",
+		       buffer[0], buffer[1], buffer[2], buffer[3]);
+	}
+	return i;
+}
+
+/*
+ * Parse the response to a board read request.
+ *
+ * As read by this parser, the response consists of:
+ *  - zero or more data blocks.  A NIUSB_IBRD_DATA_ID block is the id byte
+ *    followed by 0xf data bytes; a NIUSB_IBRD_EXTENDED_DATA_ID block is the
+ *    id byte, a byte expected to be zero, then 0x1e data bytes;
+ *  - a status block whose id must be NIUSB_IBRD_STATUS_ID;
+ *  - one skipped byte, one byte giving the number of valid bytes in the
+ *    last data block, and two bytes expected to be zero;
+ *  - a register write status block (id NIUSB_REG_WRITE_ID), a write count
+ *    byte expected to be 2 and three bytes expected to be zero;
+ *  - a termination block.
+ *
+ * Data bytes are copied into @parsed_data up to @parsed_data_length; any
+ * excess is skipped.  @status receives the read status block and
+ * @actual_bytes_read the number of valid data bytes reported.
+ *
+ * Returns the number of bytes of @raw_data consumed, or -EIO if the status
+ * block id is wrong.  Other format mismatches are only logged.
+ *
+ * NOTE(review): no buffer length is passed in, so nothing here bounds the
+ * reads of raw_data -- confirm callers always supply enough valid data.
+ */
+static int parse_board_ibrd_readback(const u8 *raw_data, struct ni_usb_status_block *status,
+				     u8 *parsed_data, int parsed_data_length,
+				     int *actual_bytes_read)
+{
+	static const int ibrd_data_block_length = 0xf;
+	static const int ibrd_extended_data_block_length = 0x1e;
+	int data_block_length = 0;
+	int i = 0;
+	int j = 0;
+	int k;
+	int num_data_blocks = 0;
+	struct ni_usb_status_block register_write_status;
+	int unexpected = 0;
+
+	/* i indexes raw_data, j indexes parsed_data */
+	while (raw_data[i] == NIUSB_IBRD_DATA_ID || raw_data[i] == NIUSB_IBRD_EXTENDED_DATA_ID) {
+		if (raw_data[i] == NIUSB_IBRD_DATA_ID) {
+			data_block_length = ibrd_data_block_length;
+		} else if (raw_data[i] == NIUSB_IBRD_EXTENDED_DATA_ID) {
+			data_block_length = ibrd_extended_data_block_length;
+			if (raw_data[++i] !=  0)	{
+				pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
+				       i, (int)raw_data[i]);
+				unexpected = 1;
+			}
+		} else {
+			/* cannot be reached given the loop condition above */
+			pr_err("Unexpected NIUSB_IBRD ID\n");
+			return -EINVAL;
+		}
+		++i;
+		for (k = 0; k < data_block_length; k++) {
+			if (j < parsed_data_length)
+				parsed_data[j++] = raw_data[i++];
+			else
+				++i;
+		}
+		++num_data_blocks;
+	}
+	i += ni_usb_parse_status_block(&raw_data[i], status);
+	if (status->id != NIUSB_IBRD_STATUS_ID) {
+		pr_err("bug: status->id=%i, != ibrd_status_id\n", status->id);
+		return -EIO;
+	}
+	i++;
+	if (num_data_blocks) {
+		/*
+		 * all blocks but the last are counted as full; data_block_length
+		 * is the length of the last block seen
+		 */
+		*actual_bytes_read = (num_data_blocks - 1) * data_block_length + raw_data[i++];
+	} else {
+		++i;
+		*actual_bytes_read = 0;
+	}
+	if (*actual_bytes_read > j)
+		pr_err("bug: discarded data. actual_bytes_read=%i, j=%i\n", *actual_bytes_read, j);
+	for (k = 0; k < 2; k++)
+		if (raw_data[i++] != 0) {
+			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
+			       i - 1, (int)raw_data[i - 1]);
+			unexpected = 1;
+		}
+	i += ni_usb_parse_status_block(&raw_data[i], &register_write_status);
+	if (register_write_status.id != NIUSB_REG_WRITE_ID) {
+		pr_err("unexpected data: register write status id=0x%x, expected 0x%x\n",
+		       register_write_status.id, NIUSB_REG_WRITE_ID);
+		unexpected = 1;
+	}
+	if (raw_data[i++] != 2) {
+		pr_err("unexpected data: register write count=%i, expected 2\n",
+		       (int)raw_data[i - 1]);
+		unexpected = 1;
+	}
+	for (k = 0; k < 3; k++)
+		if (raw_data[i++] != 0) {
+			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
+			       i - 1, (int)raw_data[i - 1]);
+			unexpected = 1;
+		}
+	i += ni_usb_parse_termination_block(&raw_data[i]);
+	if (unexpected)
+		ni_usb_dump_raw_block(raw_data, i);
+	return i;
+}
+
+/*
+ * Parse the response to a register write: a status block followed by one
+ * byte holding the number of completed writes, padded to a 4-byte boundary.
+ *
+ * Returns the number of bytes consumed, padding included.
+ */
+static int ni_usb_parse_reg_write_status_block(const u8 *raw_data,
+					       struct ni_usb_status_block *status,
+					       int *writes_completed)
+{
+	int consumed = ni_usb_parse_status_block(raw_data, status);
+
+	*writes_completed = raw_data[consumed];
+	consumed++;
+
+	/* skip the padding up to the next multiple of four */
+	for (; consumed % 4; consumed++)
+		;
+
+	return consumed;
+}
+
+/*
+ * Write @num_writes registers in one bulk request and check the response.
+ *
+ * The request is a register-write header, one entry per element of
+ * @writes, zero padding to a 4-byte boundary and a termination block.  The
+ * send and the receive are done under addressed_transfer_lock.
+ *
+ * On success *@ibsta (if @ibsta is not NULL) receives the status word from
+ * the response.
+ *
+ * Returns 0 on success, -ENODEV without a bus interface, -ENOMEM on
+ * allocation failure, the transfer error if a bulk transfer fails, and -EIO
+ * if a transfer was short, the response id is wrong, the device reported an
+ * error code, or not all writes were completed.
+ */
+static int ni_usb_write_registers(struct ni_usb_priv *ni_priv,
+				  const struct ni_usb_register *writes, int num_writes,
+				  unsigned int *ibsta)
+{
+	struct usb_device *usb_dev;
+	int retval;
+	u8 *out_data, *in_data;
+	int out_data_length;
+	static const int in_data_length = 0x20;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	int j;
+	struct ni_usb_status_block status;
+	static const int bytes_per_write = 3;
+	int reg_writes_completed;
+
+	/* same guard as the other entry points: the interface may be gone */
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	out_data_length = num_writes * bytes_per_write + 0x10;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	i += ni_usb_bulk_register_write_header(&out_data[i], num_writes);
+	for (j = 0; j < num_writes; j++)
+		i += ni_usb_bulk_register_write(&out_data[i], writes[j]);
+	while (i % 4)	// pad with zeros to 4-byte boundary
+		out_data[i++] = 0x00;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		/* a short write without a usb error must not look like success */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		return retval;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
+	if (retval || bytes_read != 16) {
+		/* a response of the wrong size without a usb error is an I/O error */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		ni_usb_dump_raw_block(in_data, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	ni_usb_parse_reg_write_status_block(in_data, &status, &reg_writes_completed);
+	// FIXME parse extra 09 status bits and termination
+	kfree(in_data);
+	if (status.id != NIUSB_REG_WRITE_ID) {
+		dev_err(&usb_dev->dev, "parse error, id=0x%x != NIUSB_REG_WRITE_ID\n", status.id);
+		return -EIO;
+	}
+	if (status.error_code) {
+		dev_err(&usb_dev->dev, "nonzero error code 0x%x\n", status.error_code);
+		return -EIO;
+	}
+	if (reg_writes_completed != num_writes) {
+		dev_err(&usb_dev->dev, "reg_writes_completed=%i, num_writes=%i\n",
+			reg_writes_completed, num_writes);
+		return -EIO;
+	}
+	if (ibsta)
+		*ibsta = status.ibsta;
+	return 0;
+}
+
+// interface functions
+/*
+ * Read up to @length bytes from the bus into @buffer.
+ *
+ * The request sent to the adapter is: command byte 0x0a, the high byte of
+ * eos_mode, the eos character, the timeout code, the 16-bit value
+ * ~(length - 1) low byte first, two zero bytes, then a register write of
+ * AUX_HLDI and AUX_CLEAR_END to AUXMR, zero padding and a termination
+ * block.  The response is parsed by parse_board_ibrd_readback().
+ *
+ * *@bytes_read is zeroed on entry and set to the parsed length at the end;
+ * *@end is set to 1 if the END bit is set in the returned status, else 0.
+ * Neither is updated on the early error returns.
+ *
+ * Returns 0 on success, -ENODEV without a bus interface, -EINVAL if @length
+ * exceeds 0xffff, -ENOMEM on allocation failure, a transfer error, -EIO on
+ * a short write or parse mismatch, or an errno chosen from the device error
+ * code (see the switch below).  For NIUSB_ABORTED_ERROR the value from the
+ * receive (expected to be -ERESTARTSYS) is returned unchanged.
+ */
+static int ni_usb_read(struct gpib_board *board, u8 *buffer, size_t length,
+		       int *end, size_t *bytes_read)
+{
+	int retval, parse_retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	static const int out_data_length = 0x20;
+	int in_data_length;
+	int usb_bytes_written = 0, usb_bytes_read = 0;
+	int i = 0;
+	int complement_count;
+	int actual_length;
+	struct ni_usb_status_block status;
+	static const int max_read_length = 0xffff;
+	struct ni_usb_register reg;
+
+	*bytes_read = 0;
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	if (length > max_read_length)
+		return -EINVAL;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = 0x0a;
+	out_data[i++] = ni_priv->eos_mode >> 8;
+	out_data[i++] = ni_priv->eos_char;
+	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
+	/* the count is sent as ~(length - 1), low byte first */
+	complement_count = length - 1;
+	complement_count = ~complement_count;
+	out_data[i++] = complement_count & 0xff;
+	out_data[i++] = (complement_count >> 8) & 0xff;
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	/* two AUXMR writes are appended to the read request */
+	i += ni_usb_bulk_register_write_header(&out_data[i], 2);
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(AUXMR);
+	reg.value = AUX_HLDI;
+	i += ni_usb_bulk_register_write(&out_data[i], reg);
+	reg.value = AUX_CLEAR_END;
+	i += ni_usb_bulk_register_write(&out_data[i], reg);
+	while (i % 4)	// pad with zeros to 4-byte boundary
+		out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	/* the request and its response are exchanged under addressed_transfer_lock */
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &usb_bytes_written, 1000);
+	kfree(out_data);
+	if (retval || usb_bytes_written != i) {
+		/* a short write without a usb error is reported as -EIO */
+		if (retval == 0)
+			retval = -EIO;
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, usb_bytes_written=%i, i=%i\n",
+			retval, usb_bytes_written, i);
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return retval;
+	}
+
+	/* one 0x20-byte slot per 30 data bytes (rounded up) plus 0x20 for the trailer */
+	in_data_length = (length / 30 + 1) * 0x20 + 0x20;
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &usb_bytes_read,
+					 ni_usb_timeout_msecs(board->usec_timeout), 1);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	/* -ERESTARTSYS falls through: the response is still parsed below */
+	if (retval == -ERESTARTSYS) {
+	} else if (retval) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, usb_bytes_read=%i\n",
+			retval, usb_bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	parse_retval = parse_board_ibrd_readback(in_data, &status, buffer, length, &actual_length);
+	/* the parser must consume exactly the number of bytes received */
+	if (parse_retval != usb_bytes_read) {
+		if (parse_retval >= 0)
+			parse_retval = -EIO;
+		dev_err(&usb_dev->dev, "retval=%i usb_bytes_read=%i\n",
+			parse_retval, usb_bytes_read);
+		kfree(in_data);
+		return parse_retval;
+	}
+	/* a length disagreement with the status count is logged, not treated as fatal */
+	if (actual_length != length - status.count) {
+		dev_err(&usb_dev->dev, "actual_length=%i expected=%li\n",
+			actual_length, (long)(length - status.count));
+		ni_usb_dump_raw_block(in_data, usb_bytes_read);
+	}
+	kfree(in_data);
+	switch (status.error_code) {
+	case NIUSB_NO_ERROR:
+		retval = 0;
+		break;
+	case NIUSB_ABORTED_ERROR:
+		/*
+		 * this is expected if ni_usb_receive_bulk_msg got
+		 * interrupted by a signal and returned -ERESTARTSYS
+		 */
+		break;
+	case NIUSB_ATN_STATE_ERROR:
+		if (status.ibsta & DCAS) {
+			retval = -EINTR;
+		} else {
+			retval = -EIO;
+			dev_dbg(&usb_dev->dev, "read when ATN set stat: 0x%06x\n", status.ibsta);
+		}
+		break;
+	case NIUSB_ADDRESSING_ERROR:
+		retval = -EIO;
+		break;
+	case NIUSB_TIMEOUT_ERROR:
+		retval = -ETIMEDOUT;
+		break;
+	case NIUSB_EOSMODE_ERROR:
+		dev_err(&usb_dev->dev, "driver bug, we should have been able to avoid NIUSB_EOSMODE_ERROR.\n");
+		retval = -EINVAL;
+		break;
+	default:
+		dev_err(&usb_dev->dev, "unknown error code=%i\n",  status.error_code);
+		retval = -EIO;
+		break;
+	}
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	if (status.ibsta & END)
+		*end = 1;
+	else
+		*end = 0;
+	*bytes_read = actual_length;
+	return retval;
+}
+
+/*
+ * Write @length bytes from @buffer to the bus, optionally asserting EOI
+ * with the last byte.
+ *
+ * The request sent to the adapter is: command byte 0x0d, the 16-bit value
+ * ~(length - 1) low byte first, the timeout code, two zero bytes, a flag
+ * byte (0x8 when @send_eoi is set), a zero byte, the data, zero padding to
+ * a 4-byte boundary and a termination block.  The response is a 12-byte
+ * status block.
+ *
+ * *@bytes_written is zeroed on entry and, once a status block has been
+ * received, set to @length minus the count reported by the adapter.
+ *
+ * Returns 0 on success, -ENODEV without a bus interface, -EINVAL if @length
+ * exceeds 0xffff, -ENOMEM on allocation failure, a transfer error, -EIO on
+ * a short transfer, or an errno chosen from the device error code (see the
+ * switch below).  For NIUSB_ABORTED_ERROR the value from the receive
+ * (expected to be -ERESTARTSYS) is returned unchanged.
+ */
+static int ni_usb_write(struct gpib_board *board, u8 *buffer, size_t length,
+			int send_eoi, size_t *bytes_written)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	int out_data_length;
+	static const int in_data_length = 0x10;
+	int usb_bytes_written = 0, usb_bytes_read = 0;
+	int i = 0, j;
+	int complement_count;
+	struct ni_usb_status_block status;
+	static const int max_write_length = 0xffff;
+
+	/* never leave the caller's count unset on an early return */
+	*bytes_written = 0;
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	if (length > max_write_length)
+		return -EINVAL;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data_length = length + 0x10;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = 0x0d;
+	/* the count is sent as ~(length - 1), low byte first */
+	complement_count = length - 1;
+	complement_count = ~complement_count;
+	out_data[i++] = complement_count & 0xff;
+	out_data[i++] = (complement_count >> 8) & 0xff;
+	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	if (send_eoi)
+		out_data[i++] = 0x8;
+	else
+		out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	for (j = 0; j < length; j++)
+		out_data[i++] = buffer[j];
+	while (i % 4)	// pad with zeros to 4-byte boundary
+		out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	/* the request and its response are exchanged under addressed_transfer_lock */
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &usb_bytes_written,
+				      ni_usb_timeout_msecs(board->usec_timeout));
+	kfree(out_data);
+	if (retval || usb_bytes_written != i) {
+		/* a short write without a usb error must not look like success */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, usb_bytes_written=%i, i=%i\n",
+			retval, usb_bytes_written, i);
+		return retval;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &usb_bytes_read,
+					 ni_usb_timeout_msecs(board->usec_timeout), 1);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	if ((retval && retval != -ERESTARTSYS) || usb_bytes_read != 12) {
+		/* a status block of the wrong size without a usb error is an I/O error */
+		if (retval == 0)
+			retval = -EIO;
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, usb_bytes_read=%i\n",
+			retval, usb_bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	ni_usb_parse_status_block(in_data, &status);
+	kfree(in_data);
+	switch (status.error_code) {
+	case NIUSB_NO_ERROR:
+		retval = 0;
+		break;
+	case NIUSB_ABORTED_ERROR:
+		/*
+		 * this is expected if ni_usb_receive_bulk_msg got
+		 * interrupted by a signal and returned -ERESTARTSYS
+		 */
+		break;
+	case NIUSB_ADDRESSING_ERROR:
+		dev_err(&usb_dev->dev, "Addressing error retval %d error code=%i\n",
+			retval, status.error_code);
+		retval = -ENXIO;
+		break;
+	case NIUSB_NO_LISTENER_ERROR:
+		retval = -ECOMM;
+		break;
+	case NIUSB_TIMEOUT_ERROR:
+		retval = -ETIMEDOUT;
+		break;
+	default:
+		dev_err(&usb_dev->dev, "unknown error code=%i\n", status.error_code);
+		retval = -EPIPE;
+		break;
+	}
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	*bytes_written = length - status.count;
+	return retval;
+}
+
+/*
+ * Send at most 16 command bytes from @buffer to the bus.
+ *
+ * @length is clamped to 16 because the usb-b reports error 4 for longer
+ * command requests.  The request is: command byte 0x0c, the low byte of
+ * ~(length - 1), a zero byte, the timeout code, the command bytes, zero
+ * padding to a 4-byte boundary and a termination block.  The response is a
+ * 12-byte status block.
+ *
+ * *@command_bytes_written is zeroed on entry and, once a status block has
+ * been received, set to the (clamped) length minus the count reported by
+ * the adapter.
+ *
+ * Returns 0 on success or when the adapter reports NIUSB_ABORTED_ERROR,
+ * -ENODEV without a bus interface, -ENOMEM on allocation failure, a
+ * transfer error, -EIO on a short transfer, or an errno chosen from the
+ * device error code (see the switch below).
+ */
+static int ni_usb_command_chunk(struct gpib_board *board, u8 *buffer, size_t length,
+				size_t *command_bytes_written)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	int out_data_length;
+	static const int in_data_length = 0x10;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0, j;
+	unsigned int complement_count;
+	struct ni_usb_status_block status;
+	// usb-b gives error 4 if you try to send more than 16 command bytes at once
+	static const int max_command_length = 0x10;
+
+	*command_bytes_written = 0;
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	if (length > max_command_length)
+		length = max_command_length;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data_length = length + 0x10;
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = 0x0c;
+	/* the count is sent as the low byte of ~(length - 1) */
+	complement_count = length - 1;
+	complement_count = ~complement_count;
+	out_data[i++] = complement_count;
+	out_data[i++] = 0x0;
+	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
+	for (j = 0; j < length; j++)
+		out_data[i++] = buffer[j];
+	while (i % 4)	// pad with zeros to 4-byte boundary
+		out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	/* the request and its response are exchanged under addressed_transfer_lock */
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written,
+				      ni_usb_timeout_msecs(board->usec_timeout));
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		/*
+		 * A short write without a usb error must not look like success:
+		 * returning 0 with no bytes written would make ni_usb_command()
+		 * retry forever.
+		 */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		return retval;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read,
+					 ni_usb_timeout_msecs(board->usec_timeout), 1);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	if ((retval && retval != -ERESTARTSYS) || bytes_read != 12) {
+		/* same reasoning as for the short write above */
+		if (retval == 0)
+			retval = -EIO;
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	ni_usb_parse_status_block(in_data, &status);
+	kfree(in_data);
+	*command_bytes_written = length - status.count;
+	switch (status.error_code) {
+	case NIUSB_NO_ERROR:
+		break;
+	case NIUSB_ABORTED_ERROR:
+		/*
+		 * this is expected if ni_usb_receive_bulk_msg got
+		 * interrupted by a signal and returned -ERESTARTSYS
+		 */
+		break;
+	case NIUSB_NO_BUS_ERROR:
+		return -ENOTCONN;
+	case NIUSB_EOSMODE_ERROR:
+		dev_err(&usb_dev->dev, "got eosmode error. Driver bug?\n");
+		return -EIO;
+	case NIUSB_TIMEOUT_ERROR:
+		return -ETIMEDOUT;
+	default:
+		dev_err(&usb_dev->dev, "unknown error code=%i\n", status.error_code);
+		return -EIO;
+	}
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	return 0;
+}
+
+/*
+ * Send @length command bytes from @buffer, split into chunks by
+ * ni_usb_command_chunk().
+ *
+ * *@bytes_written is kept up to date with the number of bytes sent so far,
+ * including the partial count of a chunk that failed.
+ *
+ * Returns 0 once all bytes have been sent, or the negative error of the
+ * first failing chunk.
+ */
+static int ni_usb_command(struct gpib_board *board, u8 *buffer, size_t length,
+			  size_t *bytes_written)
+{
+	int retval = 0;
+
+	*bytes_written = 0;
+	while (retval >= 0 && *bytes_written < length) {
+		size_t chunk_len;
+
+		retval = ni_usb_command_chunk(board, &buffer[*bytes_written],
+					      length - *bytes_written, &chunk_len);
+		*bytes_written += chunk_len;
+	}
+
+	return retval < 0 ? retval : 0;
+}
+
+/*
+ * Send a NIUSB_IBCAC_ID request to the adapter; the second request byte is
+ * 1 when @synchronous is set and 0 otherwise.
+ *
+ * The response is a 12-byte status block whose ibsta is folded into the
+ * board status.
+ *
+ * Returns 0 on success, -ENODEV without a bus interface, -ENOMEM on
+ * allocation failure, a transfer error, -EIO on a short transfer, or
+ * -ERESTARTSYS if the wait for the response was interrupted by a signal.
+ */
+static int ni_usb_take_control(struct gpib_board *board, int synchronous)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	static const int out_data_length = 0x10;
+	static const int  in_data_length = 0x10;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	struct ni_usb_status_block status;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+	out_data[i++] = NIUSB_IBCAC_ID;
+	if (synchronous)
+		out_data[i++] = 0x1;
+	else
+		out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	/* the request and its response are exchanged under addressed_transfer_lock */
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		/* a short write without a usb error must not look like success */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		return retval;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 1);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	if ((retval && retval != -ERESTARTSYS) || bytes_read != 12) {
+		if (retval == 0)
+			retval = -EIO;
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	ni_usb_parse_status_block(in_data, &status);
+	kfree(in_data);
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	return retval;
+}
+
+/*
+ * Send a NIUSB_IBGTS_ID request to the adapter.
+ *
+ * The response is a 12-byte status block whose ibsta is folded into the
+ * board status.  A response id other than NIUSB_IBGTS_ID is only logged.
+ *
+ * Returns 0 on success, -ENODEV without a bus interface, -ENOMEM on
+ * allocation failure, a transfer error, or -EIO on a short transfer.
+ */
+static int ni_usb_go_to_standby(struct gpib_board *board)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	static const int out_data_length = 0x10;
+	static const int  in_data_length = 0x20;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	struct ni_usb_status_block status;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+
+	out_data[i++] = NIUSB_IBGTS_ID;
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+
+	/* the request and its response are exchanged under addressed_transfer_lock */
+	mutex_lock(&ni_priv->addressed_transfer_lock);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		/* a short write without a usb error must not look like success */
+		if (retval == 0)
+			retval = -EIO;
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		return retval;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	if (retval || bytes_read != 12) {
+		/* a status block of the wrong size without a usb error is an I/O error */
+		if (retval == 0)
+			retval = -EIO;
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	ni_usb_parse_status_block(in_data, &status);
+	kfree(in_data);
+	if (status.id != NIUSB_IBGTS_ID)
+		dev_err(&usb_dev->dev, "bug: status.id 0x%x != INUSB_IBGTS_ID\n", status.id);
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	return 0;
+}
+
+/*
+ * Set or clear system controller status on the TNT4882.
+ *
+ * A non-zero @request_control writes SETSC to CMDR followed by AUX_CIFC to
+ * AUXMR.  Zero writes AUX_CREN, AUX_CIFC and AUX_DSC to AUXMR, then CLRSC
+ * to CMDR, and resets the cached ren_state once the writes went through.
+ *
+ * Returns 0 on success, -ENODEV without a USB interface, or the negative
+ * value reported by ni_usb_write_registers().
+ */
+static int ni_usb_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register writes[4];
+	struct ni_usb_register *reg = writes;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	if (request_control) {
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = CMDR;
+		reg->value = SETSC;
+		reg++;
+
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = nec7210_to_tnt4882_offset(AUXMR);
+		reg->value = AUX_CIFC;
+		reg++;
+	} else {
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = nec7210_to_tnt4882_offset(AUXMR);
+		reg->value = AUX_CREN;
+		reg++;
+
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = nec7210_to_tnt4882_offset(AUXMR);
+		reg->value = AUX_CIFC;
+		reg++;
+
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = nec7210_to_tnt4882_offset(AUXMR);
+		reg->value = AUX_DSC;
+		reg++;
+
+		reg->device = NIUSB_SUBDEV_TNT4882;
+		reg->address = CMDR;
+		reg->value = CLRSC;
+		reg++;
+	}
+
+	/* reg now points one past the last entry that was filled in */
+	retval = ni_usb_write_registers(ni_priv, writes, reg - writes, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+	if (!request_control)
+		ni_priv->ren_state = 0;
+	ni_usb_soft_update_status(board, ibsta, 0);
+	return 0;
+}
+
+// FIXME maybe the interface should have a "pulse interface clear" function that can return an error?
+/*
+ * Send an NIUSB_IBSIC_ID bulk request when @assert is non-zero and merge
+ * the ibsta of the 12-byte reply into the board status.  A zero @assert is
+ * ignored.  Being a void callback, every failure is only logged (or, for
+ * allocation failures, silently dropped).
+ */
+static void ni_usb_interface_clear(struct gpib_board *board, int assert)
+{
+	static const int out_data_length = 0x10;
+	static const int  in_data_length = 0x10;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_status_block status;
+	struct usb_device *usb_dev;
+	int bytes_written = 0, bytes_read = 0;
+	u8 *out_data, *in_data;
+	int len = 0;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+// FIXME: we are going to pulse when assert is true, and ignore otherwise
+	if (!assert)
+		return;
+
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return;
+
+	/* request id followed by three zero bytes, then the terminator */
+	out_data[len++] = NIUSB_IBSIC_ID;
+	while (len < 4)
+		out_data[len++] = 0x0;
+	len += ni_usb_bulk_termination(&out_data[len]);
+
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, len, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval || bytes_written != len) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, len);
+		return;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data)
+		return;
+
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
+	if (!retval && bytes_read == 12) {
+		ni_usb_parse_status_block(in_data, &status);
+		ni_usb_soft_update_status(board, status.ibsta, 0);
+	} else {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+	}
+	kfree(in_data);
+}
+
+/*
+ * Write AUX_SREN (@enable non-zero) or AUX_CREN (@enable zero) to the
+ * TNT4882 AUXMR register and remember the requested state in ren_state.
+ * Failures are logged only; the callback has no way to report them.
+ */
+static void ni_usb_remote_enable(struct gpib_board *board, int enable)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register reg;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(AUXMR);
+	reg.value = enable ? AUX_SREN : AUX_CREN;
+
+	retval = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return;
+	}
+	/* only cache the new state once the adapter accepted the write */
+	ni_priv->ren_state = enable;
+	ni_usb_soft_update_status(board, ibsta, 0);
+}
+
+/*
+ * Record the end-of-string character and set REOS in the cached eos_mode.
+ * BIN is set when @compare_8_bits is non-zero and cleared otherwise.
+ * Nothing is sent to the adapter here.  Always returns 0.
+ */
+static int ni_usb_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+
+	ni_priv->eos_char = eos_byte;
+	if (compare_8_bits) {
+		ni_priv->eos_mode |= REOS;
+		ni_priv->eos_mode |= BIN;
+	} else {
+		ni_priv->eos_mode |= REOS;
+		ni_priv->eos_mode &= ~BIN;
+	}
+	return 0;
+}
+
+/*
+ * Forget the cached end-of-string settings.
+ *
+ * Both the mode bits and the character are zeroed: the adapter gets
+ * unhappy (returns error 4 on reads) if any of them are left set.
+ */
+static void ni_usb_disable_eos(struct gpib_board *board)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+
+	ni_priv->eos_char = 0;
+	ni_priv->eos_mode = 0;
+}
+
+/*
+ * Fetch the adapter status with an NI_USB_WAIT_REQUEST control message
+ * (value 0x200) and merge its ibsta into the board status, applying
+ * @clear_mask.
+ *
+ * Returns board->status; when the buffer allocation or the control
+ * transfer fails the board status is returned unchanged.
+ *
+ * NOTE(review): with no USB interface this returns -ENODEV through an
+ * unsigned int, which a caller would see as a bit mask -- confirm whether
+ * board->status was intended here as on the other failure paths.
+ */
+static unsigned int ni_usb_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	static const int buffer_length = 8;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_status_block status;
+	struct usb_device *usb_dev;
+	u8 *buffer;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	buffer = kmalloc(buffer_length, GFP_KERNEL);
+	if (!buffer)
+		return board->status;
+
+	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_WAIT_REQUEST,
+					    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+					    0x200, 0x0, buffer, buffer_length, 1000);
+	if (retval == buffer_length) {
+		ni_usb_parse_status_block(buffer, &status);
+		ni_usb_soft_update_status(board, status.ibsta, clear_mask);
+	} else {
+		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", retval);
+	}
+	kfree(buffer);
+	return board->status;
+}
+
+/*
+ * Tell the ni-usb to immediately stop an ongoing i/o operation by issuing
+ * an NI_USB_STOP_REQUEST control message.  The 8-byte reply is parsed but
+ * its contents are not used; a failed transfer is only logged.
+ */
+static void ni_usb_stop(struct ni_usb_priv *ni_priv)
+{
+	static const int buffer_length = 8;
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	struct ni_usb_status_block status;
+	u8 *buffer;
+	int retval;
+
+	buffer = kmalloc(buffer_length, GFP_KERNEL);
+	if (!buffer)
+		return;
+
+	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_STOP_REQUEST,
+					    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+					    0x0, 0x0, buffer, buffer_length, 1000);
+	if (retval == buffer_length)
+		ni_usb_parse_status_block(buffer, &status);
+	else
+		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", retval);
+	kfree(buffer);
+}
+
+/*
+ * Program the primary GPIB address: @address is written to the TNT4882
+ * ADR register and to address 0x0 of NIUSB_SUBDEV_UNKNOWN2.
+ *
+ * Returns 0 on success, -ENODEV without a USB interface, or the negative
+ * value reported by ni_usb_write_registers().
+ */
+static int ni_usb_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register writes[2];
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	writes[0].device = NIUSB_SUBDEV_TNT4882;
+	writes[0].address = nec7210_to_tnt4882_offset(ADR);
+	writes[0].value = address;
+
+	writes[1].device = NIUSB_SUBDEV_UNKNOWN2;
+	writes[1].address = 0x0;
+	writes[1].value = address;
+
+	retval = ni_usb_write_registers(ni_priv, writes, ARRAY_SIZE(writes), &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+	return 0;
+}
+
+/*
+ * Fill @writes with the three register writes that configure secondary
+ * addressing: ADR and ADMR on the TNT4882, then address 0x1 of
+ * NIUSB_SUBDEV_UNKNOWN2 (MSA(@address) when enabled, 0 otherwise).
+ *
+ * @writes must have room for three entries.  Returns the number of
+ * entries filled in.
+ */
+static int ni_usb_write_sad(struct ni_usb_register *writes, int address, int enable)
+{
+	unsigned int adr_bits = HR_ARS;
+	unsigned int admr_bits = HR_TRM0 | HR_TRM1;
+
+	if (enable) {
+		adr_bits |= address;
+		admr_bits |= HR_ADM1;
+	} else {
+		adr_bits |= HR_DT | HR_DL;
+		admr_bits |= HR_ADM0;
+	}
+
+	writes[0].device = NIUSB_SUBDEV_TNT4882;
+	writes[0].address = nec7210_to_tnt4882_offset(ADR);
+	writes[0].value = adr_bits;
+
+	writes[1].device = NIUSB_SUBDEV_TNT4882;
+	writes[1].address = nec7210_to_tnt4882_offset(ADMR);
+	writes[1].value = admr_bits;
+
+	writes[2].device = NIUSB_SUBDEV_UNKNOWN2;
+	writes[2].address = 0x1;
+	writes[2].value = enable ? MSA(address) : 0x0;
+
+	return 3;
+}
+
+/*
+ * Enable or disable the secondary GPIB address by sending the register
+ * sequence built by ni_usb_write_sad().
+ *
+ * Returns 0 on success, -ENODEV without a USB interface, or the negative
+ * value reported by ni_usb_write_registers().
+ */
+static int ni_usb_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register writes[3];
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int num_writes;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	num_writes = ni_usb_write_sad(writes, address, enable);
+	retval = ni_usb_write_registers(ni_priv, writes, num_writes, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+	return 0;
+}
+
+/*
+ * Send an NIUSB_IBRPP_ID bulk request and store the byte that follows the
+ * status block of the reply in @result.  The ibsta of the reply is merged
+ * into the board status.
+ *
+ * Returns the result of the receive (0, or -ERESTARTSYS which the receive
+ * path tolerates), -ENODEV without a USB interface, -ENOMEM on allocation
+ * failure, the error of a failed transfer, or -EIO when a transfer moved
+ * too few bytes.  @result is only written when the reply actually
+ * contains the byte.
+ */
+static int ni_usb_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	static const int out_data_length = 0x10;
+	static const int  in_data_length = 0x20;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	int j = 0;
+	struct ni_usb_status_block status;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+
+	out_data[i++] = NIUSB_IBRPP_ID;
+	out_data[i++] = 0xf0;	// FIXME: this should be the parallel poll timeout code
+	out_data[i++] = 0x0;
+	out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+	/*FIXME: 1000 should use parallel poll timeout (not supported yet)*/
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
+
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+			retval, bytes_written, i);
+		/* a short write without an error code must not look like success */
+		return retval ? retval : -EIO;
+	}
+	/* zeroed so that a short reply is never parsed from stale heap data */
+	in_data = kzalloc(in_data_length, GFP_KERNEL);
+	if (!in_data)
+		return -ENOMEM;
+
+	/*FIXME: should use parallel poll timeout (not supported yet)*/
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length,
+					 &bytes_read, 1000, 1);
+
+	if (retval && retval != -ERESTARTSYS)	{
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+	j += ni_usb_parse_status_block(in_data, &status);
+	/* the poll byte sits right after the status block; make sure it arrived */
+	if (bytes_read <= j) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		kfree(in_data);
+		return retval ? retval : -EIO;
+	}
+	*result = in_data[j++];
+	kfree(in_data);
+	ni_usb_soft_update_status(board, status.ibsta, 0);
+	return retval;
+}
+
+/*
+ * Write PPR | @config to the TNT4882 AUXMR register.  Failures are logged
+ * only; the callback has no way to report them.
+ */
+static void ni_usb_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register reg;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(AUXMR);
+	reg.value = PPR | config;
+
+	retval = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+}
+
+/*
+ * Write AUX_SPPF (@ist non-zero) or AUX_CPPF (@ist zero) to the TNT4882
+ * AUXMR register.  Failures are logged only.
+ */
+static void ni_usb_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register reg;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(AUXMR);
+	reg.value = ist ? AUX_SPPF : AUX_CPPF;
+
+	retval = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+}
+
+/*
+ * Write @status to the TNT4882 SPMR register.  Failures are logged only;
+ * the callback has no way to report them.
+ */
+static void ni_usb_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register reg;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(SPMR);
+	reg.value = status;
+
+	retval = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+}
+
+/* Stub: nothing is read back from the adapter, the result is always 0. */
+static u8 ni_usb_serial_poll_status(struct gpib_board *board)
+{
+	return 0;
+}
+
+/*
+ * Write AUX_RTL to the TNT4882 AUXMR register.  Failures are logged only;
+ * the callback has no way to report them.
+ */
+static void ni_usb_return_to_local(struct gpib_board *board)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register reg;
+	struct usb_device *usb_dev;
+	unsigned int ibsta;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return; // -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	reg.device = NIUSB_SUBDEV_TNT4882;
+	reg.address = nec7210_to_tnt4882_offset(AUXMR);
+	reg.value = AUX_RTL;
+
+	retval = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+}
+
+/*
+ * Read the TNT4882 BSR register and translate its bits into the BUS_*
+ * line flags, OR-ed with VALID_ALL.
+ *
+ * The function never blocks on addressed_transfer_lock (it is called
+ * during ibwait): if the lock is already held it returns -EBUSY.  The
+ * non-blocking bulk helpers are used for the transfer and an -EAGAIN from
+ * them is passed on without being logged.
+ *
+ * Returns the line status on success, or a negative value: -ENODEV,
+ * -ENOMEM, -EBUSY, the error of a failed transfer, or -EIO when the
+ * request was only partially written.
+ */
+static int ni_usb_line_status(const struct gpib_board *board)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	u8 *out_data, *in_data;
+	static const int out_data_length = 0x20;
+	static const int  in_data_length = 0x20;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	unsigned int bsr_bits;
+	int line_status = VALID_ALL;
+	// NI windows driver reads 0xd(HSSEL), 0xc (ARD0), 0x1f (BSR)
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	out_data = kmalloc(out_data_length, GFP_KERNEL);
+	if (!out_data)
+		return -ENOMEM;
+
+	/* line status gets called during ibwait */
+	retval = mutex_trylock(&ni_priv->addressed_transfer_lock);
+
+	if (retval == 0) {
+		kfree(out_data);
+		return -EBUSY;
+	}
+	i += ni_usb_bulk_register_read_header(&out_data[i], 1);
+	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_TNT4882, BSR);
+	/* pad the request to a multiple of four bytes before the terminator */
+	while (i % 4)
+		out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+	retval = ni_usb_nonblocking_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
+	kfree(out_data);
+	if (retval || bytes_written != i) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		if (retval != -EAGAIN)
+			dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
+				retval, bytes_written, i);
+		/*
+		 * A short write with retval == 0 would otherwise be returned
+		 * as a line status with no valid bits set.
+		 */
+		return retval ? retval : -EIO;
+	}
+
+	in_data = kmalloc(in_data_length, GFP_KERNEL);
+	if (!in_data) {
+		mutex_unlock(&ni_priv->addressed_transfer_lock);
+		return -ENOMEM;
+	}
+	retval = ni_usb_nonblocking_receive_bulk_msg(ni_priv, in_data, in_data_length,
+						     &bytes_read, 1000, 0);
+
+	mutex_unlock(&ni_priv->addressed_transfer_lock);
+
+	if (retval) {
+		if (retval != -EAGAIN)
+			dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+				retval, bytes_read);
+		kfree(in_data);
+		return retval;
+	}
+
+	ni_usb_parse_register_read_block(in_data, &bsr_bits, 1);
+	kfree(in_data);
+	if (bsr_bits & BCSR_REN_BIT)
+		line_status |= BUS_REN;
+	if (bsr_bits & BCSR_IFC_BIT)
+		line_status |= BUS_IFC;
+	if (bsr_bits & BCSR_SRQ_BIT)
+		line_status |= BUS_SRQ;
+	if (bsr_bits & BCSR_EOI_BIT)
+		line_status |= BUS_EOI;
+	if (bsr_bits & BCSR_NRFD_BIT)
+		line_status |= BUS_NRFD;
+	if (bsr_bits & BCSR_NDAC_BIT)
+		line_status |= BUS_NDAC;
+	if (bsr_bits & BCSR_DAV_BIT)
+		line_status |= BUS_DAV;
+	if (bsr_bits & BCSR_ATN_BIT)
+		line_status |= BUS_ATN;
+	return line_status;
+}
+
+/*
+ * Fill @reg with the three register writes that select the T1 delay and
+ * report the delay that was actually chosen in @actual_ns.
+ *
+ * The requested @nano_sec is rounded up to one of 350, 500, 1100 or 2000:
+ * USTD is added to AUXRI at <= 1100, HR_TRI to AUXRB at <= 500 and MSTD is
+ * written to KEYREG at <= 350.
+ *
+ * @reg must have room for three entries.  Returns the number of entries
+ * filled in.
+ */
+static int ni_usb_setup_t1_delay(struct ni_usb_register *reg, unsigned int nano_sec,
+				 unsigned int *actual_ns)
+{
+	reg[0].device = NIUSB_SUBDEV_TNT4882;
+	reg[0].address = nec7210_to_tnt4882_offset(AUXMR);
+	reg[0].value = (nano_sec <= 1100) ? (AUXRI | USTD | SISB) : (AUXRI | SISB);
+
+	reg[1].device = NIUSB_SUBDEV_TNT4882;
+	reg[1].address = nec7210_to_tnt4882_offset(AUXMR);
+	reg[1].value = (nano_sec <= 500) ? (AUXRB | HR_TRI) : AUXRB;
+
+	reg[2].device = NIUSB_SUBDEV_TNT4882;
+	reg[2].address = KEYREG;
+	reg[2].value = (nano_sec <= 350) ? MSTD : 0x0;
+
+	/* the tightest threshold that still covers the request wins */
+	if (nano_sec <= 350)
+		*actual_ns = 350;
+	else if (nano_sec <= 500)
+		*actual_ns = 500;
+	else if (nano_sec <= 1100)
+		*actual_ns = 1100;
+	else
+		*actual_ns = 2000;
+
+	return 3;
+}
+
+/*
+ * Program the T1 delay using the register sequence built by
+ * ni_usb_setup_t1_delay() and cache the chosen value in
+ * board->t1_nano_sec.
+ *
+ * Returns the delay actually selected (in ns) on success, -ENODEV without
+ * a USB interface, or the negative value reported by
+ * ni_usb_write_registers().
+ */
+static int ni_usb_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct ni_usb_register writes[3];
+	struct usb_device *usb_dev;
+	unsigned int actual_ns;
+	unsigned int ibsta;
+	int num_writes;
+	int retval;
+
+	if (!ni_priv->bus_interface)
+		return -ENODEV;
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+
+	num_writes = ni_usb_setup_t1_delay(writes, nano_sec, &actual_ns);
+	retval = ni_usb_write_registers(ni_priv, writes, num_writes, &ibsta);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+
+	board->t1_nano_sec = actual_ns;
+	ni_usb_soft_update_status(board, ibsta, 0);
+	return actual_ns;
+}
+
+/*
+ * Allocate the zero-initialised driver private data for @board and set up
+ * its four transfer mutexes.
+ *
+ * board->private_data is assigned the allocation result in every case, so
+ * it is NULL after a failed allocation.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int ni_usb_allocate_private(struct gpib_board *board)
+{
+	struct ni_usb_priv *ni_priv;
+
+	/* kzalloc replaces the former kmalloc + memset pair */
+	ni_priv = kzalloc(sizeof(*ni_priv), GFP_KERNEL);
+	board->private_data = ni_priv;
+	if (!ni_priv)
+		return -ENOMEM;
+	mutex_init(&ni_priv->bulk_transfer_lock);
+	mutex_init(&ni_priv->control_transfer_lock);
+	mutex_init(&ni_priv->interrupt_transfer_lock);
+	mutex_init(&ni_priv->addressed_transfer_lock);
+	return 0;
+}
+
+/* Release the interrupt URB and then the private data structure itself. */
+static void ni_usb_free_private(struct ni_usb_priv *ni_priv)
+{
+	usb_free_urb(ni_priv->interrupt_urb);
+	kfree(ni_priv);
+}
+
+/*
+ * Number of entries ni_usb_setup_init() fills in: 11 fixed writes, 3 from
+ * ni_usb_setup_t1_delay(), 5 fixed writes, 3 from ni_usb_write_sad() and
+ * 4 final fixed writes.
+ */
+#define NUM_INIT_WRITES 26
+/*
+ * Build the register write sequence that initialises the adapter from the
+ * current board settings (eos_mode, t1_nano_sec, master, pad, sad).
+ *
+ * @writes must have room for NUM_INIT_WRITES entries.
+ *
+ * Returns the number of entries filled in, or 0 if more than
+ * NUM_INIT_WRITES were produced.  That check runs after the entries have
+ * already been stored, so it reports an overrun rather than preventing it.
+ */
+static int ni_usb_setup_init(struct gpib_board *board, struct ni_usb_register *writes)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	unsigned int mask, actual_ns;
+	int i = 0;
+
+	writes[i].device = NIUSB_SUBDEV_UNKNOWN3;
+	writes[i].address = 0x10;
+	writes[i].value = 0x0;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = CMDR;
+	writes[i].value = SOFT_RESET;
+	i++;
+	writes[i].device =  NIUSB_SUBDEV_TNT4882;
+	writes[i].address =  nec7210_to_tnt4882_offset(AUXMR);
+	mask = AUXRA | HR_HLDA;
+	if (ni_priv->eos_mode & BIN)
+		mask |= HR_BIN;
+	writes[i].value = mask;
+	i++;
+	/* the same mask value is written to AUXCR as well */
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = AUXCR;
+	writes[i].value = mask;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = HSSEL;
+	writes[i].value = TNT_ONE_CHIP_BIT;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUX_CR;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = IMR0;
+	writes[i].value = TNT_IMR0_ALWAYS_BITS;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(IMR1);
+	writes[i].value = 0x0;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address =  nec7210_to_tnt4882_offset(IMR2);
+	writes[i].value = 0x0;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = IMR3;
+	writes[i].value = 0x0;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUX_HLDI;
+	i++;
+
+	/* T1 delay entries; the delay actually selected (actual_ns) is not used here */
+	i += ni_usb_setup_t1_delay(&writes[i], board->t1_nano_sec, &actual_ns);
+
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUXRG | NTNL_BIT;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = CMDR;
+	if (board->master)
+		mask = SETSC; // set system controller
+	else
+		mask = CLRSC; // clear system controller
+	writes[i].value = mask;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUX_CIFC;
+	i++;
+	/* primary address: same pair of writes as ni_usb_primary_address() */
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(ADR);
+	writes[i].value = board->pad;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_UNKNOWN2;
+	writes[i].address = 0x0;
+	writes[i].value = board->pad;
+	i++;
+
+	/* secondary addressing is enabled only when board->sad is non-negative */
+	i += ni_usb_write_sad(&writes[i], board->sad, board->sad >= 0);
+
+	writes[i].device = NIUSB_SUBDEV_UNKNOWN2;
+	writes[i].address = 0x2; // could this be a timeout ?
+	writes[i].value = 0xfd;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = 0xf; // undocumented address
+	writes[i].value = 0x11;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUX_PON;
+	i++;
+	writes[i].device = NIUSB_SUBDEV_TNT4882;
+	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[i].value = AUX_CPPF;
+	i++;
+	/* after-the-fact sanity check: the entries above have already been written */
+	if (i > NUM_INIT_WRITES) {
+		dev_err(&usb_dev->dev, "bug!, buffer overrun, i=%i\n", i);
+		return 0;
+	}
+	return i;
+}
+
+/*
+ * Initialise the adapter by sending the register sequence built by
+ * ni_usb_setup_init() and merge the resulting ibsta into the board status.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EFAULT when
+ * ni_usb_setup_init() produced no writes, or the non-zero value returned
+ * by ni_usb_write_registers().
+ */
+static int ni_usb_init(struct gpib_board *board)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	struct ni_usb_register *writes;
+	unsigned int ibsta;
+	int writes_len;
+
+	writes = kmalloc_array(NUM_INIT_WRITES, sizeof(*writes), GFP_KERNEL);
+	if (!writes)
+		return -ENOMEM;
+
+	writes_len = ni_usb_setup_init(board, writes);
+	if (!writes_len) {
+		/* the array must be released on this path too */
+		kfree(writes);
+		return -EFAULT;
+	}
+	retval = ni_usb_write_registers(ni_priv, writes, writes_len, &ibsta);
+	kfree(writes);
+	if (retval) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+	ni_usb_soft_update_status(board, ibsta, 0);
+	return 0;
+}
+
+/*
+ * Completion handler of the interrupt URB.
+ *
+ * An unlinked URB (-ECONNRESET, -ENOENT, -ESHUTDOWN) is dropped.  On
+ * success the status block in the transfer buffer is parsed, the ibsta
+ * bits it reports are removed from monitored_ibsta_bits under the board
+ * spinlock, and waiters on board->wait are woken.  In that case, and for
+ * any other error status, the URB is resubmitted with GFP_ATOMIC.
+ */
+static void ni_usb_interrupt_complete(struct urb *urb)
+{
+	struct gpib_board *board = urb->context;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	struct ni_usb_status_block status;
+	unsigned long flags;
+	int urb_status = urb->status;
+	int retval;
+
+	/* unlinked, don't resubmit */
+	if (urb_status == -ECONNRESET || urb_status == -ENOENT ||
+	    urb_status == -ESHUTDOWN)
+		return;
+
+	if (urb_status == 0) {
+		ni_usb_parse_status_block(urb->transfer_buffer, &status);
+
+		spin_lock_irqsave(&board->spinlock, flags);
+		ni_priv->monitored_ibsta_bits &= ~status.ibsta;
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		wake_up_interruptible(&board->wait);
+	}
+
+	/* success and every remaining error both end with a resubmit */
+	retval = usb_submit_urb(ni_priv->interrupt_urb, GFP_ATOMIC);
+	if (retval)
+		dev_err(&usb_dev->dev, "failed to resubmit interrupt urb\n");
+}
+
+/*
+ * Select which ibsta bits are monitored.
+ *
+ * @monitored_bits is masked with ni_usb_ibsta_monitor_mask, cached in
+ * monitored_ibsta_bits under the board spinlock and passed to the adapter
+ * as the index of an NI_USB_WAIT_REQUEST control message (value 0x300).
+ * The 8-byte reply is parsed but its contents are not used.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the negative error
+ * of the control transfer, or -EIO when the reply has the wrong length.
+ */
+static int ni_usb_set_interrupt_monitor(struct gpib_board *board, unsigned int monitored_bits)
+{
+	int retval;
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	static const int buffer_length = 8;
+	u8 *buffer;
+	struct ni_usb_status_block status;
+	unsigned long flags;
+
+	buffer = kmalloc(buffer_length, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	ni_priv->monitored_ibsta_bits = ni_usb_ibsta_monitor_mask & monitored_bits;
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_WAIT_REQUEST, USB_DIR_IN |
+					    USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+					    0x300, ni_usb_ibsta_monitor_mask & monitored_bits,
+					    buffer, buffer_length, 1000);
+	if (retval != buffer_length) {
+		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", retval);
+		kfree(buffer);
+		/* report the real errno instead of a bare -1; short reply is -EIO */
+		return retval < 0 ? retval : -EIO;
+	}
+	ni_usb_parse_status_block(buffer, &status);
+	kfree(buffer);
+	return 0;
+}
+
+/*
+ * Allocate, fill and submit the interrupt URB, with
+ * ni_usb_interrupt_complete() as completion handler and @board as its
+ * context.  Does nothing when the interrupt-in endpoint is negative.
+ *
+ * Allocation and submission run under interrupt_transfer_lock.
+ *
+ * Returns 0 on success (or when there is nothing to do), -ENODEV without
+ * a USB interface, -ENOMEM if the URB cannot be allocated, or the error
+ * from usb_submit_urb().
+ */
+static int ni_usb_setup_urbs(struct gpib_board *board)
+{
+	struct ni_usb_priv *ni_priv = board->private_data;
+	struct usb_device *usb_dev;
+	int int_pipe;
+	int retval;
+
+	if (ni_priv->interrupt_in_endpoint < 0)
+		return 0;
+
+	mutex_lock(&ni_priv->interrupt_transfer_lock);
+
+	retval = -ENODEV;
+	if (!ni_priv->bus_interface)
+		goto out_unlock;
+
+	retval = -ENOMEM;
+	ni_priv->interrupt_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!ni_priv->interrupt_urb)
+		goto out_unlock;
+
+	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	int_pipe = usb_rcvintpipe(usb_dev, ni_priv->interrupt_in_endpoint);
+	usb_fill_int_urb(ni_priv->interrupt_urb, usb_dev, int_pipe,
+			 ni_priv->interrupt_buffer, sizeof(ni_priv->interrupt_buffer),
+			 &ni_usb_interrupt_complete, board, 1);
+	retval = usb_submit_urb(ni_priv->interrupt_urb, GFP_KERNEL);
+	mutex_unlock(&ni_priv->interrupt_transfer_lock);
+
+	if (retval)
+		dev_err(&usb_dev->dev, "failed to submit first interrupt urb, retval=%i\n", retval);
+	return retval;
+
+out_unlock:
+	mutex_unlock(&ni_priv->interrupt_transfer_lock);
+	return retval;
+}
+
+/*
+ * Kill the interrupt and bulk URBs, if they exist.  Nothing is done when
+ * @ni_priv is NULL or no USB interface is attached.
+ */
+static void ni_usb_cleanup_urbs(struct ni_usb_priv *ni_priv)
+{
+	if (!ni_priv || !ni_priv->bus_interface)
+		return;
+
+	if (ni_priv->interrupt_urb)
+		usb_kill_urb(ni_priv->interrupt_urb);
+	if (ni_priv->bulk_urb)
+		usb_kill_urb(ni_priv->bulk_urb);
+}
+
+/*
+ * Read the four serial number registers of NIUSB_SUBDEV_UNKNOWN3 with a
+ * single bulk register-read request and log the assembled serial number
+ * at debug level (the low byte of register N becomes byte N).
+ *
+ * Both transfer buffers are zero-initialised: the request is sent with
+ * its full out_data_length, so the bytes behind the terminator must not
+ * carry stale heap contents, and a short reply must not be parsed from
+ * uninitialised memory.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL if the
+ * result array is too small, or the error of a failed bulk transfer.
+ */
+static int ni_usb_b_read_serial_number(struct ni_usb_priv *ni_priv)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	int retval;
+	u8 *out_data;
+	u8 *in_data;
+	static const int out_data_length = 0x20;
+	static const int  in_data_length = 0x20;
+	int bytes_written = 0, bytes_read = 0;
+	int i = 0;
+	static const int num_reads = 4;
+	unsigned int results[4];
+	int j;
+	unsigned int serial_number;
+
+	in_data = kzalloc(in_data_length, GFP_KERNEL);
+	if (!in_data)
+		return -ENOMEM;
+
+	out_data = kzalloc(out_data_length, GFP_KERNEL);
+	if (!out_data) {
+		kfree(in_data);
+		return -ENOMEM;
+	}
+	i += ni_usb_bulk_register_read_header(&out_data[i], num_reads);
+	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_1_REG);
+	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_2_REG);
+	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_3_REG);
+	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_4_REG);
+	/* pad the request to a multiple of four bytes before the terminator */
+	while (i % 4)
+		out_data[i++] = 0x0;
+	i += ni_usb_bulk_termination(&out_data[i]);
+	/* the whole buffer is sent, not just the i bytes filled in above */
+	retval = ni_usb_send_bulk_msg(ni_priv, out_data, out_data_length, &bytes_written, 1000);
+	if (retval) {
+		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%li\n",
+			retval, bytes_written, (long)out_data_length);
+		goto serial_out;
+	}
+	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
+	if (retval) {
+		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
+			retval, bytes_read);
+		ni_usb_dump_raw_block(in_data, bytes_read);
+		goto serial_out;
+	}
+	if (ARRAY_SIZE(results) < num_reads) {
+		dev_err(&usb_dev->dev, "serial number setup bug\n");
+		retval = -EINVAL;
+		goto serial_out;
+	}
+	ni_usb_parse_register_read_block(in_data, results, num_reads);
+	serial_number = 0;
+	for (j = 0; j < num_reads; ++j)
+		serial_number |= (results[j] & 0xff) << (8 * j);
+	dev_dbg(&usb_dev->dev, "board serial number is 0x%x\n", serial_number);
+	retval = 0;
+serial_out:
+	kfree(in_data);
+	kfree(out_data);
+	return retval;
+}
+
+static int ni_usb_hs_wait_for_ready(struct ni_usb_priv *ni_priv)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	static const int buffer_size = 0x10;
+	static const int timeout = 50;
+	static const int msec_sleep_duration = 100;
+	int i;	int retval;
+	int j;
+	int unexpected = 0;
+	unsigned int serial_number;
+	u8 *buffer;
+
+	buffer = kmalloc(buffer_size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_SERIAL_NUMBER_REQUEST,
+					    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+					    0x0, 0x0, buffer, buffer_size, 1000);
+	if (retval < 0) {
+		dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
+			NI_USB_SERIAL_NUMBER_REQUEST, retval);
+		goto ready_out;
+	}
+	j = 0;
+	if (buffer[j] != NI_USB_SERIAL_NUMBER_REQUEST) {
+		dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
+			j, (int)buffer[j], NI_USB_SERIAL_NUMBER_REQUEST);
+		unexpected = 1;
+	}
+	if (unexpected)
+		ni_usb_dump_raw_block(buffer, retval);
+	// NI-USB-HS+ pads the serial with 0x0 to make 16 bytes
+	if (retval != 5 && retval != 16) {
+		dev_err(&usb_dev->dev, "received unexpected number of bytes = %i, expected 5 or 16\n",
+			retval);
+		ni_usb_dump_raw_block(buffer, retval);
+	}
+	serial_number = 0;
+	serial_number |= buffer[++j];
+	serial_number |= (buffer[++j] << 8);
+	serial_number |= (buffer[++j] << 16);
+	serial_number |= (buffer[++j] << 24);
+	dev_dbg(&usb_dev->dev, "board serial number is 0x%x\n", serial_number);
+	for (i = 0; i < timeout; ++i) {
+		int ready = 0;
+
+		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_POLL_READY_REQUEST,
+						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+						    0x0, 0x0, buffer, buffer_size, 100);
+		if (retval < 0) {
+			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
+				NI_USB_POLL_READY_REQUEST, retval);
+			goto ready_out;
+		}
+		j = 0;
+		unexpected = 0;
+		if (buffer[j] != NI_USB_POLL_READY_REQUEST) { // [0]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
+				j, (int)buffer[j], NI_USB_POLL_READY_REQUEST);
+			unexpected = 1;
+		}
+		++j;
+		if (buffer[j] != 0x1 && buffer[j] != 0x0) { // [1] HS+ sends 0x0
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x1 or 0x0\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		if (buffer[++j] != 0x0) { // [2]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
+				j, (int)buffer[j], 0x0);
+			unexpected = 1;
+		}
+		++j;
+		/*
+		 * MC usb-488 (and sometimes NI-USB-HS?) sends 0x8 here; MC usb-488A sends 0x7 here
+		 * NI-USB-HS+ sends 0x0
+		 */
+		if (buffer[j] != 0x1 && buffer[j] != 0x8 && buffer[j] != 0x7 && buffer[j] != 0x0) {
+			// [3]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0, 0x1, 0x7 or 0x8\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		++j;
+		// NI-USB-HS+ sends 0 here
+		if (buffer[j] != 0x30 && buffer[j] != 0x0) { // [4]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0 or 0x30\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		++j;
+		// MC usb-488 (and sometimes NI-USB-HS?) and NI-USB-HS+ sends 0x0 here
+		if (buffer[j] != 0x1 && buffer[j] != 0x0) { // [5]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x1 or 0x0\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		if (buffer[++j] != 0x0) { // [6]
+			ready = 1;
+			// NI-USB-HS+ sends 0xf or 0x19 here
+			if (buffer[j] != 0x2 && buffer[j] != 0xe && buffer[j] != 0xf &&
+			    buffer[j] != 0x16 && buffer[j] != 0x19) {
+				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x2, 0xe, 0xf, 0x16 or 0x19\n",
+					j, (int)buffer[j]);
+				unexpected = 1;
+			}
+		}
+		if (buffer[++j] != 0x0) { // [7]
+			ready = 1;
+			// MC usb-488 sends 0x5 here; MC usb-488A sends 0x6 here
+			if (buffer[j] != 0x3 && buffer[j] != 0x5 && buffer[j] != 0x6 &&
+			    buffer[j] != 0x8)	{
+				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x3 or 0x5, 0x6 or 0x08\n",
+					j, (int)buffer[j]);
+				unexpected = 1;
+			}
+		}
+		++j;
+		if (buffer[j] != 0x0 && buffer[j] != 0x2) { // [8] MC usb-488 sends 0x2 here
+			dev_err(&usb_dev->dev, " unexpected data: buffer[%i]=0x%x, expected 0x0 or 0x2\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		++j;
+		// MC usb-488A and NI-USB-HS sends 0x3 here; NI-USB-HS+ sends 0x30 here
+		if (buffer[j] != 0x0 && buffer[j] != 0x3 && buffer[j] != 0x30) { // [9]
+			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0, 0x3 or 0x30\n",
+				j, (int)buffer[j]);
+			unexpected = 1;
+		}
+		if (buffer[++j] != 0x0) { // [10] MC usb-488 sends 0x7 here, new HS+ sends 0x59
+			ready = 1;
+			if (buffer[j] != 0x96 && buffer[j] != 0x7 && buffer[j] != 0x6e &&
+			    buffer[j] != 0x59) {
+				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x96, 0x07, 0x6e or 0x59\n",
+					j, (int)buffer[j]);
+				unexpected = 1;
+			}
+		}
+		if (unexpected)
+			ni_usb_dump_raw_block(buffer, retval);
+		if (ready)
+			break;
+		retval = msleep_interruptible(msec_sleep_duration);
+		if (retval) {
+			retval = -ERESTARTSYS;
+			goto ready_out;
+		}
+	}
+	retval = 0;
+
+ready_out:
+	kfree(buffer);
+	dev_dbg(&usb_dev->dev, "exit retval=%d\n", retval);
+	return retval;
+}
+
+/*
+ * This does some extra init for HS+ models, as observed on Windows.  One of the
+ * control requests causes the LED to stop blinking.
+ * I'm not sure what the other 2 requests do.  None of these requests are actually required
+ * for the adapter to work, maybe they do some init for the analyzer interface
+ * (which we don't use).
+ */
+static int ni_usb_hs_plus_extra_init(struct ni_usb_priv *ni_priv)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	int retval;
+	u8 *buffer;
+	static const int buffer_size = 16;
+	int transfer_size;
+
+	buffer = kmalloc(buffer_size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	do {
+		transfer_size = 16;
+
+		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_0x48_REQUEST,
+						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+						    0x0, 0x0, buffer, transfer_size, 1000);
+		if (retval < 0) {
+			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
+				NI_USB_HS_PLUS_0x48_REQUEST, retval);
+			break;
+		}
+		// expected response data: 48 f3 30 00 00 00 00 00 00 00 00 00 00 00 00 00
+		if (buffer[0] != NI_USB_HS_PLUS_0x48_REQUEST)
+			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
+				(int)buffer[0], NI_USB_HS_PLUS_0x48_REQUEST);
+
+		transfer_size = 2;
+
+		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_LED_REQUEST,
+						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
+						    0x1, 0x0, buffer, transfer_size, 1000);
+		if (retval < 0) {
+			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
+				NI_USB_HS_PLUS_LED_REQUEST, retval);
+			break;
+		}
+		// expected response data: 4b 00
+		if (buffer[0] != NI_USB_HS_PLUS_LED_REQUEST)
+			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
+				(int)buffer[0], NI_USB_HS_PLUS_LED_REQUEST);
+
+		transfer_size = 9;
+
+		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_0xf8_REQUEST,
+						    USB_DIR_IN | USB_TYPE_VENDOR |
+						    USB_RECIP_INTERFACE,
+						    0x0, 0x1, buffer, transfer_size, 1000);
+		if (retval < 0) {
+			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
+				NI_USB_HS_PLUS_0xf8_REQUEST, retval);
+			break;
+		}
+		// expected response data: f8 01 00 00 00 01 00 00 00
+		if (buffer[0] != NI_USB_HS_PLUS_0xf8_REQUEST)
+			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
+				(int)buffer[0], NI_USB_HS_PLUS_0xf8_REQUEST);
+	} while (0);
+
+	// cleanup
+	kfree(buffer);
+	return retval;
+}
+
+static inline int ni_usb_device_match(struct usb_interface *interface,
+				      const struct gpib_board_config *config)
+{
+	if (gpib_match_device_path(&interface->dev, config->device_path) == 0)
+		return 0;
+	return 1;
+}
+
+static int ni_usb_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	int retval;
+	int i, index;
+	struct ni_usb_priv *ni_priv;
+	int product_id;
+	struct usb_device *usb_dev;
+
+	mutex_lock(&ni_usb_hotplug_lock);
+	retval = ni_usb_allocate_private(board);
+	if (retval < 0)		{
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return retval;
+	}
+	ni_priv = board->private_data;
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
+		if (ni_usb_driver_interfaces[i] &&
+		    !usb_get_intfdata(ni_usb_driver_interfaces[i]) &&
+		    ni_usb_device_match(ni_usb_driver_interfaces[i], config)) {
+			ni_priv->bus_interface = ni_usb_driver_interfaces[i];
+			usb_set_intfdata(ni_usb_driver_interfaces[i], board);
+			usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+			index = i;
+			break;
+		}
+	}
+	if (i == MAX_NUM_NI_USB_INTERFACES) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		dev_err(board->gpib_dev, "No supported adapters found, have you loaded its firmware?\n");
+		return -ENODEV;
+	}
+	if (usb_reset_configuration(interface_to_usbdev(ni_priv->bus_interface)))
+		dev_err(&usb_dev->dev, "usb_reset_configuration() failed.\n");
+
+	product_id = le16_to_cpu(usb_dev->descriptor.idProduct);
+	ni_priv->product_id = product_id;
+
+	timer_setup(&ni_priv->bulk_timer, ni_usb_timeout_handler, 0);
+
+	switch (product_id) {
+	case USB_DEVICE_ID_NI_USB_B:
+		ni_priv->bulk_out_endpoint = NIUSB_B_BULK_OUT_ENDPOINT;
+		ni_priv->bulk_in_endpoint = NIUSB_B_BULK_IN_ENDPOINT;
+		ni_priv->interrupt_in_endpoint = NIUSB_B_INTERRUPT_IN_ENDPOINT;
+		ni_usb_b_read_serial_number(ni_priv);
+		break;
+	case USB_DEVICE_ID_NI_USB_HS:
+	case USB_DEVICE_ID_MC_USB_488:
+	case USB_DEVICE_ID_KUSB_488A:
+		ni_priv->bulk_out_endpoint = NIUSB_HS_BULK_OUT_ENDPOINT;
+		ni_priv->bulk_in_endpoint = NIUSB_HS_BULK_IN_ENDPOINT;
+		ni_priv->interrupt_in_endpoint = NIUSB_HS_INTERRUPT_IN_ENDPOINT;
+		retval = ni_usb_hs_wait_for_ready(ni_priv);
+		if (retval < 0) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		break;
+	case USB_DEVICE_ID_NI_USB_HS_PLUS:
+		ni_priv->bulk_out_endpoint = NIUSB_HS_PLUS_BULK_OUT_ENDPOINT;
+		ni_priv->bulk_in_endpoint = NIUSB_HS_PLUS_BULK_IN_ENDPOINT;
+		ni_priv->interrupt_in_endpoint = NIUSB_HS_PLUS_INTERRUPT_IN_ENDPOINT;
+		retval = ni_usb_hs_wait_for_ready(ni_priv);
+		if (retval < 0) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		retval = ni_usb_hs_plus_extra_init(ni_priv);
+		if (retval < 0) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		break;
+	default:
+		mutex_unlock(&ni_usb_hotplug_lock);
+		dev_err(&usb_dev->dev, "\tDriver bug: unknown endpoints for usb device id %x\n",
+			product_id);
+		return -EINVAL;
+	}
+
+	retval = ni_usb_setup_urbs(board);
+	if (retval < 0) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return retval;
+	}
+	retval = ni_usb_set_interrupt_monitor(board, 0);
+	if (retval < 0) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return retval;
+	}
+
+	board->t1_nano_sec = 500;
+
+	retval = ni_usb_init(board);
+	if (retval < 0) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return retval;
+	}
+	retval = ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
+	if (retval < 0)		{
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return retval;
+	}
+
+	mutex_unlock(&ni_usb_hotplug_lock);
+	dev_info(&usb_dev->dev,
+		 "bus %d dev num %d attached to gpib%d, intf %i\n",
+		 usb_dev->bus->busnum, usb_dev->devnum, board->minor, index);
+	return retval;
+}
+
+static int ni_usb_shutdown_hardware(struct ni_usb_priv *ni_priv)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
+	int retval;
+	struct ni_usb_register writes[2];
+	static const int writes_length = ARRAY_SIZE(writes);
+	unsigned int ibsta;
+
+	writes[0].device = NIUSB_SUBDEV_TNT4882;
+	writes[0].address = nec7210_to_tnt4882_offset(AUXMR);
+	writes[0].value = AUX_CR;
+	writes[1].device = NIUSB_SUBDEV_UNKNOWN3;
+	writes[1].address = 0x10;
+	writes[1].value = 0x0;
+	retval = ni_usb_write_registers(ni_priv, writes, writes_length, &ibsta);
+	if (retval) {
+		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
+		return retval;
+	}
+	return 0;
+}
+
+static void ni_usb_detach(struct gpib_board *board)
+{
+	struct ni_usb_priv *ni_priv;
+
+	mutex_lock(&ni_usb_hotplug_lock);
+	/*
+	 * under windows, software unplug does chip_reset nec7210 aux command,
+	 * then writes 0x0 to address 0x10 of device 3
+	 */
+	ni_priv = board->private_data;
+	if (ni_priv) {
+		if (ni_priv->bus_interface) {
+			ni_usb_set_interrupt_monitor(board, 0);
+			ni_usb_shutdown_hardware(ni_priv);
+			usb_set_intfdata(ni_priv->bus_interface, NULL);
+		}
+		mutex_lock(&ni_priv->bulk_transfer_lock);
+		mutex_lock(&ni_priv->control_transfer_lock);
+		mutex_lock(&ni_priv->interrupt_transfer_lock);
+		ni_usb_cleanup_urbs(ni_priv);
+		ni_usb_free_private(ni_priv);
+	}
+	mutex_unlock(&ni_usb_hotplug_lock);
+}
+
+static struct gpib_interface ni_usb_gpib_interface = {
+	.name = "ni_usb_b",
+	.attach = ni_usb_attach,
+	.detach = ni_usb_detach,
+	.read = ni_usb_read,
+	.write = ni_usb_write,
+	.command = ni_usb_command,
+	.take_control = ni_usb_take_control,
+	.go_to_standby = ni_usb_go_to_standby,
+	.request_system_control = ni_usb_request_system_control,
+	.interface_clear = ni_usb_interface_clear,
+	.remote_enable = ni_usb_remote_enable,
+	.enable_eos = ni_usb_enable_eos,
+	.disable_eos = ni_usb_disable_eos,
+	.parallel_poll = ni_usb_parallel_poll,
+	.parallel_poll_configure = ni_usb_parallel_poll_configure,
+	.parallel_poll_response = ni_usb_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = ni_usb_line_status,
+	.update_status = ni_usb_update_status,
+	.primary_address = ni_usb_primary_address,
+	.secondary_address = ni_usb_secondary_address,
+	.serial_poll_response = ni_usb_serial_poll_response,
+	.serial_poll_status = ni_usb_serial_poll_status,
+	.t1_delay = ni_usb_t1_delay,
+	.return_to_local = ni_usb_return_to_local,
+	.skip_check_for_command_acceptors = 1
+};
+
+// Table with the USB-devices: just now only testing IDs
+static struct usb_device_id ni_usb_driver_device_table[] = {
+	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_B)},
+	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_HS)},
+	// gpib-usb-hs+ has a second interface for the analyzer, which we ignore
+	{USB_DEVICE_INTERFACE_NUMBER(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_HS_PLUS, 0)},
+	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_KUSB_488A)},
+	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_MC_USB_488)},
+	{} /* Terminating entry */
+};
+MODULE_DEVICE_TABLE(usb, ni_usb_driver_device_table);
+
+/*
+ * Claim the first free ni_usb_driver_interfaces[] slot and log the device path.
+ * The path buffer is allocated first so -ENOMEM cannot leave a stale slot.
+ */
+static int ni_usb_driver_probe(struct usb_interface *interface,	const struct usb_device_id *id)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	static const int path_length = 1024;
+	char *path = kmalloc(path_length, GFP_KERNEL);
+	int i;
+
+	if (!path)
+		return -ENOMEM;
+	mutex_lock(&ni_usb_hotplug_lock);
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++) {
+		if (!ni_usb_driver_interfaces[i]) {
+			ni_usb_driver_interfaces[i] = interface;
+			usb_set_intfdata(interface, NULL);
+			break;
+		}
+	}
+	if (i == MAX_NUM_NI_USB_INTERFACES) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		kfree(path);
+		dev_err(&usb_dev->dev, "ni_usb_driver_interfaces[] full\n");
+		return -1;
+	}
+	usb_get_dev(usb_dev);	/* dropped again in ni_usb_driver_disconnect() */
+	usb_make_path(usb_dev, path, path_length);
+	dev_info(&usb_dev->dev, "probe succeeded for path: %s\n", path);
+	kfree(path);
+	mutex_unlock(&ni_usb_hotplug_lock);
+	return 0;
+}
+
+static void ni_usb_driver_disconnect(struct usb_interface *interface)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	int i;
+
+	mutex_lock(&ni_usb_hotplug_lock);
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
+		if (ni_usb_driver_interfaces[i] == interface)	{
+			struct gpib_board *board = usb_get_intfdata(interface);
+
+			if (board) {
+				struct ni_usb_priv *ni_priv = board->private_data;
+
+				if (ni_priv) {
+					mutex_lock(&ni_priv->bulk_transfer_lock);
+					mutex_lock(&ni_priv->control_transfer_lock);
+					mutex_lock(&ni_priv->interrupt_transfer_lock);
+					ni_usb_cleanup_urbs(ni_priv);
+					ni_priv->bus_interface = NULL;
+					mutex_unlock(&ni_priv->interrupt_transfer_lock);
+					mutex_unlock(&ni_priv->control_transfer_lock);
+					mutex_unlock(&ni_priv->bulk_transfer_lock);
+				}
+			}
+			ni_usb_driver_interfaces[i] = NULL;
+			break;
+		}
+	}
+	if (i == MAX_NUM_NI_USB_INTERFACES)
+		dev_err(&usb_dev->dev, "unable to find interface  bug?\n");
+	usb_put_dev(usb_dev);
+	mutex_unlock(&ni_usb_hotplug_lock);
+}
+
+static int ni_usb_driver_suspend(struct usb_interface *interface, pm_message_t message)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	struct gpib_board *board;
+	int i, retval;
+
+	mutex_lock(&ni_usb_hotplug_lock);
+
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
+		if (ni_usb_driver_interfaces[i] == interface) {
+			board = usb_get_intfdata(interface);
+			if (board)
+				break;
+		}
+	}
+	if (i == MAX_NUM_NI_USB_INTERFACES) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return 0;
+	}
+
+	struct ni_usb_priv *ni_priv = board->private_data;
+
+	if (ni_priv) {
+		ni_usb_set_interrupt_monitor(board, 0);
+		retval = ni_usb_shutdown_hardware(ni_priv);
+		if (retval) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		if (ni_priv->interrupt_urb) {
+			mutex_lock(&ni_priv->interrupt_transfer_lock);
+			ni_usb_cleanup_urbs(ni_priv);
+			mutex_unlock(&ni_priv->interrupt_transfer_lock);
+		}
+		dev_dbg(&usb_dev->dev,
+			"bus %d dev num %d gpib%d, interface %i suspended\n",
+			usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
+	}
+
+	mutex_unlock(&ni_usb_hotplug_lock);
+	return 0;
+}
+
+static int ni_usb_driver_resume(struct usb_interface *interface)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+
+	struct gpib_board *board;
+	int i, retval;
+
+	mutex_lock(&ni_usb_hotplug_lock);
+
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
+		if (ni_usb_driver_interfaces[i] == interface) {
+			board = usb_get_intfdata(interface);
+			if (board)
+				break;
+		}
+	}
+	if (i == MAX_NUM_NI_USB_INTERFACES) {
+		mutex_unlock(&ni_usb_hotplug_lock);
+		return 0;
+	}
+
+	struct ni_usb_priv *ni_priv = board->private_data;
+
+	if (ni_priv) {
+		if (ni_priv->interrupt_urb) {
+			mutex_lock(&ni_priv->interrupt_transfer_lock);
+			retval = usb_submit_urb(ni_priv->interrupt_urb, GFP_KERNEL);
+			if (retval) {
+				dev_err(&usb_dev->dev, "resume failed to resubmit interrupt urb, retval=%i\n",
+					retval);
+				mutex_unlock(&ni_priv->interrupt_transfer_lock);
+				mutex_unlock(&ni_usb_hotplug_lock);
+				return retval;
+			}
+			mutex_unlock(&ni_priv->interrupt_transfer_lock);
+		} else {
+			dev_err(&usb_dev->dev, "bug! resume int urb not set up\n");
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return -EINVAL;
+		}
+
+		switch (ni_priv->product_id) {
+		case USB_DEVICE_ID_NI_USB_B:
+			ni_usb_b_read_serial_number(ni_priv);
+			break;
+		case USB_DEVICE_ID_NI_USB_HS:
+		case USB_DEVICE_ID_MC_USB_488:
+		case USB_DEVICE_ID_KUSB_488A:
+			retval = ni_usb_hs_wait_for_ready(ni_priv);
+			if (retval < 0) {
+				mutex_unlock(&ni_usb_hotplug_lock);
+				return retval;
+			}
+			break;
+		case USB_DEVICE_ID_NI_USB_HS_PLUS:
+			retval = ni_usb_hs_wait_for_ready(ni_priv);
+			if (retval < 0) {
+				mutex_unlock(&ni_usb_hotplug_lock);
+				return retval;
+			}
+			retval = ni_usb_hs_plus_extra_init(ni_priv);
+			if (retval < 0) {
+				mutex_unlock(&ni_usb_hotplug_lock);
+				return retval;
+			}
+			break;
+		default:
+			mutex_unlock(&ni_usb_hotplug_lock);
+			dev_err(&usb_dev->dev, "\tDriver bug: unknown endpoints for usb device id\n");
+			return -EINVAL;
+		}
+
+		retval = ni_usb_set_interrupt_monitor(board, 0);
+		if (retval < 0) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+
+		retval = ni_usb_init(board);
+		if (retval < 0) {
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		retval = ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
+		if (retval < 0)		{
+			mutex_unlock(&ni_usb_hotplug_lock);
+			return retval;
+		}
+		if (board->master)
+			ni_usb_interface_clear(board, 1); // this is a pulsed action
+		if (ni_priv->ren_state)
+			ni_usb_remote_enable(board, 1);
+
+		dev_dbg(&usb_dev->dev,
+			"bus %d dev num %d gpib%d, interface %i resumed\n",
+			usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
+	}
+
+	mutex_unlock(&ni_usb_hotplug_lock);
+	return 0;
+}
+
+static struct usb_driver ni_usb_bus_driver = {
+	.name = DRV_NAME,
+	.probe = ni_usb_driver_probe,
+	.disconnect = ni_usb_driver_disconnect,
+	.suspend = ni_usb_driver_suspend,
+	.resume = ni_usb_driver_resume,
+	.id_table = ni_usb_driver_device_table,
+};
+
+/*
+ * Module init: clear the interface table, register the USB driver, then the
+ * GPIB interface.  Returns 0 on success or the failing call's error code.
+ */
+static int __init ni_usb_init_module(void)
+{
+	int i, ret;
+
+	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)
+		ni_usb_driver_interfaces[i] = NULL;
+
+	ret = usb_register(&ni_usb_bus_driver);
+	if (ret) {
+		pr_err("usb_register failed: error = %d\n", ret);
+		return ret;
+	}
+	ret = gpib_register_driver(&ni_usb_gpib_interface, THIS_MODULE);
+	if (ret) {
+		pr_err("gpib_register_driver failed: error = %d\n", ret);
+		usb_deregister(&ni_usb_bus_driver); /* don't leave it registered */
+	}
+	return ret;
+}
+
+static void __exit ni_usb_exit_module(void)
+{
+	gpib_unregister_driver(&ni_usb_gpib_interface);
+	usb_deregister(&ni_usb_bus_driver);
+}
+
+module_init(ni_usb_init_module);
+module_exit(ni_usb_exit_module);
diff --git a/drivers/gpib/ni_usb/ni_usb_gpib.h b/drivers/gpib/ni_usb/ni_usb_gpib.h
new file mode 100644
index 000000000000..688f5e08792f
--- /dev/null
+++ b/drivers/gpib/ni_usb/ni_usb_gpib.h
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/***************************************************************************
+ *   copyright            : (C) 2004 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _NI_USB_GPIB_H
+#define _NI_USB_GPIB_H
+
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/usb.h>
+#include <linux/timer.h>
+#include "gpibP.h"
+
+enum {
+	USB_VENDOR_ID_NI = 0x3923
+};
+
+enum {
+	USB_DEVICE_ID_NI_USB_B = 0x702a,
+	USB_DEVICE_ID_NI_USB_B_PREINIT = 0x702b,	// device id before firmware is loaded
+	USB_DEVICE_ID_NI_USB_HS = 0x709b,
+	USB_DEVICE_ID_NI_USB_HS_PLUS = 0x7618,
+	USB_DEVICE_ID_KUSB_488A = 0x725c,
+	USB_DEVICE_ID_MC_USB_488 = 0x725d
+};
+
+enum ni_usb_device {
+	NIUSB_SUBDEV_TNT4882 = 1,
+	NIUSB_SUBDEV_UNKNOWN2 = 2,
+	NIUSB_SUBDEV_UNKNOWN3 = 3,
+};
+
+enum endpoint_addresses {
+	NIUSB_B_BULK_OUT_ENDPOINT = 0x2,
+	NIUSB_B_BULK_IN_ENDPOINT = 0x2,
+	NIUSB_B_BULK_IN_ALT_ENDPOINT = 0x6,
+	NIUSB_B_INTERRUPT_IN_ENDPOINT = 0x4,
+};
+
+enum hs_enpoint_addresses {
+	NIUSB_HS_BULK_OUT_ENDPOINT = 0x2,
+	NIUSB_HS_BULK_OUT_ALT_ENDPOINT = 0x6,
+	NIUSB_HS_BULK_IN_ENDPOINT = 0x4,
+	NIUSB_HS_BULK_IN_ALT_ENDPOINT = 0x8,
+	NIUSB_HS_INTERRUPT_IN_ENDPOINT = 0x1,
+};
+
+enum hs_plus_endpoint_addresses {
+	NIUSB_HS_PLUS_BULK_OUT_ENDPOINT = 0x1,
+	NIUSB_HS_PLUS_BULK_OUT_ALT_ENDPOINT = 0x4,
+	NIUSB_HS_PLUS_BULK_IN_ENDPOINT = 0x2,
+	NIUSB_HS_PLUS_BULK_IN_ALT_ENDPOINT = 0x5,
+	NIUSB_HS_PLUS_INTERRUPT_IN_ENDPOINT = 0x3,
+};
+
+struct ni_usb_urb_ctx {
+	struct completion complete;
+	unsigned timed_out : 1;
+};
+
+// struct which defines private_data for ni_usb devices
+struct ni_usb_priv {
+	struct usb_interface *bus_interface;
+	int bulk_out_endpoint;
+	int bulk_in_endpoint;
+	int interrupt_in_endpoint;
+	u8 eos_char;
+	unsigned short eos_mode;
+	unsigned int monitored_ibsta_bits;
+	struct urb *bulk_urb;
+	struct urb *interrupt_urb;
+	u8 interrupt_buffer[0x11];
+	struct mutex addressed_transfer_lock;	// protect transfer lock
+	struct mutex bulk_transfer_lock;	// protect bulk message sends
+	struct mutex control_transfer_lock;	// protect control messages
+	struct mutex interrupt_transfer_lock;	//  protect interrupt messages
+	struct timer_list bulk_timer;
+	struct ni_usb_urb_ctx context;
+	int product_id;
+	unsigned short ren_state;
+};
+
+struct ni_usb_status_block {
+	short id;
+	unsigned short ibsta;
+	short error_code;
+	unsigned short count;
+};
+
+struct ni_usb_register {
+	enum ni_usb_device device;
+	short address;
+	unsigned short value;
+};
+
+enum ni_usb_bulk_ids {
+	NIUSB_IBCAC_ID = 0x1,
+	NIUSB_UNKNOWN3_ID = 0x3, // device level function id?
+	NIUSB_TERM_ID = 0x4,
+	NIUSB_IBGTS_ID = 0x6,
+	NIUSB_IBRPP_ID = 0x7,
+	NIUSB_REG_READ_ID = 0x8,
+	NIUSB_REG_WRITE_ID = 0x9,
+	NIUSB_IBSIC_ID = 0xf,
+	NIUSB_REGISTER_READ_DATA_START_ID = 0x34,
+	NIUSB_REGISTER_READ_DATA_END_ID = 0x35,
+	NIUSB_IBRD_DATA_ID = 0x36,
+	NIUSB_IBRD_EXTENDED_DATA_ID = 0x37,
+	NIUSB_IBRD_STATUS_ID = 0x38
+};
+
+enum ni_usb_error_codes {
+	NIUSB_NO_ERROR = 0,
+	/*
+	 * NIUSB_ABORTED_ERROR occurs when I/O is interrupted early by
+	 * doing a NI_USB_STOP_REQUEST on the control endpoint.
+	 */
+	NIUSB_ABORTED_ERROR = 1,
+	/*
+	 * NIUSB_ATN_STATE_ERROR occurs when you do a board read while
+	 * ATN is set
+	 */
+	NIUSB_ATN_STATE_ERROR = 2,
+	/*
+	 * NIUSB_ADDRESSING_ERROR occurs when you do a board
+	 * read/write as CIC but are not in LACS/TACS
+	 */
+	NIUSB_ADDRESSING_ERROR = 3,
+	/*
+	 * NIUSB_EOSMODE_ERROR occurs on reads if any eos mode or char
+	 * bits are set when REOS is not set.
+	 * Have also seen error 4 if you try to send more than 16
+	 * command bytes at once on a usb-b.
+	 */
+	NIUSB_EOSMODE_ERROR = 4,
+	/*
+	 * NIUSB_NO_BUS_ERROR occurs when you try to write a command
+	 * byte but there are no devices connected to the gpib bus
+	 */
+	NIUSB_NO_BUS_ERROR = 5,
+	/*
+	 * NIUSB_NO_LISTENER_ERROR occurs when you do a board write as
+	 * CIC with no listener
+	 */
+	NIUSB_NO_LISTENER_ERROR = 8,
+	/* get NIUSB_TIMEOUT_ERROR on board read/write timeout */
+	NIUSB_TIMEOUT_ERROR = 10,
+};
+
+enum ni_usb_control_requests {
+	NI_USB_STOP_REQUEST = 0x20,
+	NI_USB_WAIT_REQUEST = 0x21,
+	NI_USB_POLL_READY_REQUEST = 0x40,
+	NI_USB_SERIAL_NUMBER_REQUEST = 0x41,
+	NI_USB_HS_PLUS_0x48_REQUEST = 0x48,
+	NI_USB_HS_PLUS_LED_REQUEST = 0x4b,
+	NI_USB_HS_PLUS_0xf8_REQUEST = 0xf8
+};
+
+static const unsigned int ni_usb_ibsta_monitor_mask =
+	SRQI | LOK | REM | CIC | ATN | TACS | LACS | DTAS | DCAS;
+
+static inline int nec7210_to_tnt4882_offset(int offset)
+{
+	return 2 * offset;	// the tnt4882 offset is twice the nec7210 one
+}
+
+static inline int ni_usb_bulk_termination(u8 *buffer)
+{
+	int i = 0;
+
+	buffer[i++] = NIUSB_TERM_ID;
+	buffer[i++] = 0x0;
+	buffer[i++] = 0x0;
+	buffer[i++] = 0x0;
+	return i;
+}
+
+enum ni_usb_unknown3_register {
+	SERIAL_NUMBER_4_REG = 0x8,
+	SERIAL_NUMBER_3_REG = 0x9,
+	SERIAL_NUMBER_2_REG = 0xa,
+	SERIAL_NUMBER_1_REG = 0xb,
+};
+
+static inline int ni_usb_bulk_register_write_header(u8 *buffer, int num_writes)
+{
+	int i = 0;
+
+	buffer[i++] = NIUSB_REG_WRITE_ID;
+	buffer[i++] = num_writes;
+	buffer[i++] = 0x0;
+	return i;
+}
+
+static inline int ni_usb_bulk_register_write(u8 *buffer, struct ni_usb_register reg)
+{
+	int i = 0;
+
+	buffer[i++] = reg.device;
+	buffer[i++] = reg.address;
+	buffer[i++] = reg.value;
+	return i;
+}
+
+static inline int ni_usb_bulk_register_read_header(u8 *buffer, int num_reads)
+{
+	int i = 0;
+
+	buffer[i++] = NIUSB_REG_READ_ID;
+	buffer[i++] = num_reads;
+	return i;
+}
+
+static inline int ni_usb_bulk_register_read(u8 *buffer, int device, int address)
+{
+	int i = 0;
+
+	buffer[i++] = device;
+	buffer[i++] = address;
+	return i;
+}
+
+#endif	// _NI_USB_GPIB_H
diff --git a/drivers/gpib/pc2/Makefile b/drivers/gpib/pc2/Makefile
new file mode 100644
index 000000000000..481ee4296e1b
--- /dev/null
+++ b/drivers/gpib/pc2/Makefile
@@ -0,0 +1,5 @@
+
+obj-$(CONFIG_GPIB_PC2) += pc2_gpib.o
+
+
+
diff --git a/drivers/gpib/pc2/pc2_gpib.c b/drivers/gpib/pc2/pc2_gpib.c
new file mode 100644
index 000000000000..9f3943d1df66
--- /dev/null
+++ b/drivers/gpib/pc2/pc2_gpib.c
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *    copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <asm/dma.h>
+#include <linux/dma-mapping.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include "nec7210.h"
+#include "gpibP.h"
+
+// struct which defines private_data for pc2 driver
+struct pc2_priv {
+	struct nec7210_priv nec7210_priv;
+	unsigned int irq;
+	// io address that clears interrupt for pc2a (0x2f0 + irq)
+	unsigned int clear_intr_addr;
+};
+
+// pc2 uses 8 consecutive io addresses
+static const int pc2_iosize = 8;
+static const int pc2a_iosize = 8;
+static const int pc2_2a_iosize = 16;
+
+// offset between io addresses of successive nec7210 registers
+static const int pc2a_reg_offset = 0x400;
+static const int pc2_reg_offset = 1;
+
+// interrupt service routine
+static irqreturn_t pc2_interrupt(int irq, void *arg);
+static irqreturn_t pc2a_interrupt(int irq, void *arg);
+
+// pc2 specific registers and bits
+
+// interrupt clear register address
+static const int pc2a_clear_intr_iobase = 0x2f0;
+static inline unsigned int CLEAR_INTR_REG(unsigned int irq)
+{
+	return pc2a_clear_intr_iobase + irq;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for PC2/PC2a and compatible devices");
+
+/*
+ * GPIB interrupt service routines
+ */
+
+irqreturn_t pc2_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct pc2_priv *priv = board->private_data;
+	unsigned long flags;
+	irqreturn_t retval;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	retval = nec7210_interrupt(board, &priv->nec7210_priv);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return retval;
+}
+
+irqreturn_t pc2a_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+	struct pc2_priv *priv = board->private_data;
+	int status1, status2;
+	unsigned long flags;
+	irqreturn_t retval;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	// read interrupt status (also clears status)
+	status1 = read_byte(&priv->nec7210_priv, ISR1);
+	status2 = read_byte(&priv->nec7210_priv, ISR2);
+	/* clear interrupt circuit */
+	if (priv->irq)
+		outb(0xff, CLEAR_INTR_REG(priv->irq));
+	retval = nec7210_interrupt_have_status(board, &priv->nec7210_priv, status1, status2);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return retval;
+}
+
+// wrappers for interface functions
+static int pc2_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+		    size_t *bytes_read)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
+}
+
+static int pc2_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+		     size_t *bytes_written)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
+}
+
+static int pc2_command(struct gpib_board *board, u8 *buffer,
+		       size_t length, size_t *bytes_written)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
+}
+
+static int pc2_take_control(struct gpib_board *board, int synchronous)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
+}
+
+static int pc2_go_to_standby(struct gpib_board *board)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_go_to_standby(board, &priv->nec7210_priv);
+}
+
+static int pc2_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_request_system_control(board, &priv->nec7210_priv, request_control);
+}
+
+static void pc2_interface_clear(struct gpib_board *board, int assert)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
+}
+
+static void pc2_remote_enable(struct gpib_board *board, int enable)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
+}
+
+static int pc2_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
+}
+
+static void pc2_disable_eos(struct gpib_board *board)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_disable_eos(board, &priv->nec7210_priv);
+}
+
+static unsigned int pc2_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
+}
+
+static int pc2_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_primary_address(board, &priv->nec7210_priv, address);
+}
+
+static int pc2_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
+}
+
+static int pc2_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
+}
+
+static void pc2_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, config);
+}
+
+static void pc2_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
+}
+
+static void pc2_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
+}
+
+static u8 pc2_serial_poll_status(struct gpib_board *board)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
+}
+
+static int pc2_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	return nec7210_t1_delay(board, &priv->nec7210_priv, nano_sec);
+}
+
+static void pc2_return_to_local(struct gpib_board *board)
+{
+	struct pc2_priv *priv = board->private_data;
+
+	nec7210_return_to_local(board, &priv->nec7210_priv);
+}
+
+/* Allocate zeroed pc2 private data for the board; returns 0, or -1 if out of memory. */
+static int allocate_private(struct gpib_board *board)
+{
+	struct pc2_priv *priv;
+
+	board->private_data = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!board->private_data)
+		return -1;
+	priv = board->private_data;
+	init_nec7210_private(&priv->nec7210_priv);
+	return 0;
+}
+
+/*
+ * Release the board's private data and clear the pointer so that later
+ * code sees NULL.  Safe to call when nothing was allocated (kfree(NULL)).
+ */
+static void free_private(struct gpib_board *board)
+{
+	void *stale = board->private_data;
+
+	board->private_data = NULL;
+	kfree(stale);
+}
+
+/*
+ * First attach stage shared by all pcII-family boards: allocate the private
+ * data, install the port-I/O register accessors and record the chipset type.
+ * When built with PC2_DMA it also allocates a coherent DMA buffer and claims
+ * the configured ISA DMA channel.
+ *
+ * Returns 0 on success or a negative errno.  Nothing is released here on
+ * failure: the private data (and a DMA buffer allocated before request_dma()
+ * failed) stay recorded on the board.
+ */
+static int pc2_generic_attach(struct gpib_board *board, const struct gpib_board_config *config,
+			      enum nec7210_chipset chipset)
+{
+	struct pc2_priv *pc2_priv;
+	struct nec7210_priv *nec_priv;
+
+	board->status = 0;
+	if (allocate_private(board))
+		return -ENOMEM;
+	pc2_priv = board->private_data;
+	nec_priv = &pc2_priv->nec7210_priv;
+	nec_priv->read_byte = nec7210_ioport_read_byte;
+	nec_priv->write_byte = nec7210_ioport_write_byte;
+	nec_priv->type = chipset;
+
+#ifndef PC2_DMA
+	/*
+	 * board->dev hasn't been initialized, so forget about DMA until this driver
+	 * is adapted to use isa_register_driver to get a struct device*.
+	 * A configured DMA channel is only reported, not treated as fatal.
+	 */
+	if (config->ibdma)
+		dev_err(board->gpib_dev, "DMA disabled for pc2 gpib\n");
+#else
+	if (config->ibdma) {
+		int retval;
+
+		nec_priv->dma_buffer_length = 0x1000;
+		nec_priv->dma_buffer = dma_alloc_coherent(board->dev,
+							  nec_priv->dma_buffer_length,
+							  &nec_priv->dma_buffer_addr, GFP_ATOMIC);
+		if (!nec_priv->dma_buffer)
+			return -ENOMEM;
+
+		// request isa dma channel, passing its error code up on failure
+		retval = request_dma(config->ibdma, "pc2");
+		if (retval) {
+			dev_err(board->gpib_dev, "can't request DMA %d\n", config->ibdma);
+			return retval;
+		}
+		/* only recorded once owned, so detach frees it exactly when needed */
+		nec_priv->dma_channel = config->ibdma;
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * .attach handler of the "pcII" interface.
+ *
+ * Runs the generic attach stage for a NEC7210, claims the I/O port range,
+ * resets the chip, installs the (optional) hardware interrupt plus the
+ * polling pseudo-interrupt, programs the clock divider and brings the board
+ * online.
+ *
+ * Returns 0 on success or a negative errno.  Resources are recorded in the
+ * private data as they are acquired and are not rolled back here.
+ */
+static int pc2_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct pc2_priv *pc2_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	retval = pc2_generic_attach(board, config, NEC7210);
+	if (retval)
+		return retval;
+
+	pc2_priv = board->private_data;
+	nec_priv = &pc2_priv->nec7210_priv;
+	nec_priv->offset = pc2_reg_offset;
+
+	if (!request_region(config->ibbase, pc2_iosize, "pc2")) {
+		dev_err(board->gpib_dev, "ioports are already in use\n");
+		return -EBUSY;
+	}
+	/* iobase doubles as the "region is held" marker used by pc2_detach() */
+	nec_priv->iobase = config->ibbase;
+
+	nec7210_board_reset(nec_priv, board);
+
+	// install interrupt handler
+	if (config->ibirq) {
+		retval = request_irq(config->ibirq, pc2_interrupt, 0, "pc2", board);
+		if (retval) {
+			dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
+			return retval;
+		}
+	}
+	/* only recorded after a successful request so detach won't free a foreign irq */
+	pc2_priv->irq = config->ibirq;
+	/* poll so we can detect assertion of ATN */
+	if (gpib_request_pseudo_irq(board, pc2_interrupt)) {
+		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
+		return -EBUSY;
+	}
+	/* set internal counter register for 8 MHz input clock */
+	write_byte(nec_priv, ICR | 8, AUXMR);
+
+	nec7210_board_online(nec_priv, board);
+
+	return 0;
+}
+
+/*
+ * .detach handler of the "pcII" interface: release every resource that is
+ * recorded in the private data (DMA channel, pseudo-irq, irq, I/O region,
+ * DMA buffer) and finally free the private data itself.  Each release is
+ * guarded by the field that records the acquisition.
+ */
+static void pc2_detach(struct gpib_board *board)
+{
+	struct pc2_priv *priv = board->private_data;
+	struct nec7210_priv *nec;
+
+	if (!priv)
+		goto release_private;
+
+	nec = &priv->nec7210_priv;
+#ifdef PC2_DMA
+	if (nec->dma_channel)
+		free_dma(nec->dma_channel);
+#endif
+	gpib_free_pseudo_irq(board);
+	if (priv->irq)
+		free_irq(priv->irq, board);
+
+	if (nec->iobase) {
+		/* quiesce the chip while its ports are still ours */
+		nec7210_board_reset(nec, board);
+		release_region(nec->iobase, pc2_iosize);
+	}
+
+	if (nec->dma_buffer) {
+		dma_free_coherent(board->dev, nec->dma_buffer_length,
+				  nec->dma_buffer, nec->dma_buffer_addr);
+		nec->dma_buffer = NULL;
+	}
+
+release_private:
+	free_private(board);
+}
+
+/*
+ * Attach logic shared by the PCIIa-style interfaces.
+ *
+ * Validates the configured base address and interrupt level, claims
+ * @num_registers single-byte I/O ports spaced pc2a_reg_offset apart, and
+ * (if an irq is configured) the matching interrupt-clear port and the irq
+ * itself.  It then installs the polling pseudo-interrupt, resets the chip,
+ * programs the clock divider and brings the board online.
+ *
+ * Returns 0 on success or a negative errno.  Apart from the register ports
+ * (rolled back if only some could be claimed), acquired resources are
+ * recorded in the private data and not released here.
+ */
+static int pc2a_common_attach(struct gpib_board *board, const struct gpib_board_config *config,
+			      unsigned int num_registers, enum nec7210_chipset chipset)
+{
+	unsigned int i;
+	struct pc2_priv *pc2_priv;
+	struct nec7210_priv *nec_priv;
+	int retval;
+
+	retval = pc2_generic_attach(board, config, chipset);
+	if (retval)
+		return retval;
+
+	pc2_priv = board->private_data;
+	nec_priv = &pc2_priv->nec7210_priv;
+	nec_priv->offset = pc2a_reg_offset;
+
+	switch (config->ibbase) {
+	case 0x02e1:
+	case 0x22e1:
+	case 0x42e1:
+	case 0x62e1:
+		break;
+	default:
+		dev_err(board->gpib_dev, "PCIIa base range invalid, must be one of 0x[0246]2e1, but is 0x%x\n",
+			config->ibbase);
+		return -EINVAL;
+	}
+
+	if (config->ibirq) {
+		if (config->ibirq < 2 || config->ibirq > 7) {
+			dev_err(board->gpib_dev, "illegal interrupt level %i\n",
+				config->ibirq);
+			return -EINVAL;
+		}
+	} else {
+		dev_err(board->gpib_dev, "interrupt disabled, using polling mode (slow)\n");
+	}
+#ifdef CHECK_IOPORTS
+	/* NOTE(review): legacy pre-check; only compiled if CHECK_IOPORTS is defined */
+	unsigned int err = 0;
+
+	for (i = 0; i < num_registers; i++) {
+		if (check_region(config->ibbase + i * pc2a_reg_offset, 1))
+			err++;
+	}
+	if (config->ibirq && check_region(pc2a_clear_intr_iobase + config->ibirq, 1))
+		err++;
+	if (err) {
+		dev_err(board->gpib_dev, "ioports are already in use\n");
+		return -EBUSY;
+	}
+#endif
+	for (i = 0; i < num_registers; i++) {
+		if (!request_region(config->ibbase +
+					i * pc2a_reg_offset, 1, "pc2a")) {
+			dev_err(board->gpib_dev, "ioports are already in use\n");
+			/* iobase is not recorded yet, so undo the partial claim here */
+			while (i--)
+				release_region(config->ibbase +
+					i * pc2a_reg_offset, 1);
+			return -EBUSY;
+		}
+	}
+	/* from here on pc2a_common_detach() releases the register ports */
+	nec_priv->iobase = config->ibbase;
+	if (config->ibirq) {
+		if (!request_region(pc2a_clear_intr_iobase + config->ibirq, 1, "pc2a"))  {
+			dev_err(board->gpib_dev, "ioports are already in use\n");
+			return -EBUSY;
+		}
+		pc2_priv->clear_intr_addr = pc2a_clear_intr_iobase + config->ibirq;
+		retval = request_irq(config->ibirq, pc2a_interrupt, 0, "pc2a", board);
+		if (retval) {
+			dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
+			return retval;
+		}
+	}
+	/* only recorded after a successful request so detach won't free a foreign irq */
+	pc2_priv->irq = config->ibirq;
+	/* poll so we can detect assertion of ATN */
+	if (gpib_request_pseudo_irq(board, pc2_interrupt)) {
+		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
+		return -EBUSY;
+	}
+
+	// make sure interrupt is clear
+	if (pc2_priv->irq)
+		outb(0xff, CLEAR_INTR_REG(pc2_priv->irq));
+
+	nec7210_board_reset(nec_priv, board);
+
+	/* set internal counter register for 8 MHz input clock */
+	write_byte(nec_priv, ICR | 8, AUXMR);
+
+	nec7210_board_online(nec_priv, board);
+
+	return 0;
+}
+
+/*
+ * .attach handler of the "pcIIa" interface: runs pc2a_common_attach() with
+ * pc2a_iosize registers and the NEC7210 chipset type.
+ */
+static int pc2a_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	return pc2a_common_attach(board, config, pc2a_iosize, NEC7210);
+}
+
+/*
+ * .attach handler of the "pcIIa_cb7210" interface: same as pc2a_attach()
+ * except that the chipset type is CB7210.
+ */
+static int pc2a_cb7210_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	return pc2a_common_attach(board, config, pc2a_iosize, CB7210);
+}
+
+/*
+ * .attach handler of the "pcII_IIa" interface: runs pc2a_common_attach()
+ * with pc2_2a_iosize registers and the NAT4882 chipset type.
+ */
+static int pc2_2a_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	return pc2a_common_attach(board, config, pc2_2a_iosize, NAT4882);
+}
+
+/*
+ * Detach logic shared by the PCIIa-style interfaces: release whatever is
+ * recorded in the private data (DMA channel, pseudo-irq, irq, the
+ * @num_registers single-byte register ports, the interrupt-clear port and
+ * the DMA buffer), then free the private data.
+ *
+ * @num_registers must match the value passed to pc2a_common_attach().
+ */
+static void pc2a_common_detach(struct gpib_board *board, unsigned int num_registers)
+{
+	unsigned int i;	/* unsigned to match num_registers */
+	struct pc2_priv *pc2_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (pc2_priv) {
+		nec_priv = &pc2_priv->nec7210_priv;
+#ifdef PC2_DMA
+		if (nec_priv->dma_channel)
+			free_dma(nec_priv->dma_channel);
+#endif
+		gpib_free_pseudo_irq(board);
+		if (pc2_priv->irq)
+			free_irq(pc2_priv->irq, board);
+		if (nec_priv->iobase) {
+			/* quiesce the chip while its ports are still ours */
+			nec7210_board_reset(nec_priv, board);
+			for (i = 0; i < num_registers; i++)
+				release_region(nec_priv->iobase +
+					       i * pc2a_reg_offset, 1);
+		}
+		if (pc2_priv->clear_intr_addr)
+			release_region(pc2_priv->clear_intr_addr, 1);
+		if (nec_priv->dma_buffer) {
+			dma_free_coherent(board->dev, nec_priv->dma_buffer_length,
+					  nec_priv->dma_buffer,
+					  nec_priv->dma_buffer_addr);
+			nec_priv->dma_buffer = NULL;
+		}
+	}
+	free_private(board);
+}
+
+/*
+ * .detach handler of the "pcIIa" and "pcIIa_cb7210" interfaces: releases
+ * the pc2a_iosize register ports via pc2a_common_detach().
+ */
+static void pc2a_detach(struct gpib_board *board)
+{
+	pc2a_common_detach(board, pc2a_iosize);
+}
+
+/*
+ * .detach handler of the "pcII_IIa" interface: releases the pc2_2a_iosize
+ * register ports via pc2a_common_detach().
+ */
+static void pc2_2a_detach(struct gpib_board *board)
+{
+	pc2a_common_detach(board, pc2_2a_iosize);
+}
+
+/*
+ * Callback table registered under the name "pcII".
+ *
+ * .local_parallel_poll_mode (XXX) and .line_status are not provided and
+ * therefore stay NULL.
+ */
+static struct gpib_interface pc2_interface = {
+	.name = "pcII",
+	.attach = pc2_attach,
+	.detach = pc2_detach,
+	.read = pc2_read,
+	.write = pc2_write,
+	.command = pc2_command,
+	.take_control = pc2_take_control,
+	.go_to_standby = pc2_go_to_standby,
+	.request_system_control = pc2_request_system_control,
+	.interface_clear = pc2_interface_clear,
+	.remote_enable = pc2_remote_enable,
+	.enable_eos = pc2_enable_eos,
+	.disable_eos = pc2_disable_eos,
+	.parallel_poll = pc2_parallel_poll,
+	.parallel_poll_configure = pc2_parallel_poll_configure,
+	.parallel_poll_response = pc2_parallel_poll_response,
+	.update_status = pc2_update_status,
+	.primary_address = pc2_primary_address,
+	.secondary_address = pc2_secondary_address,
+	.serial_poll_response = pc2_serial_poll_response,
+	.serial_poll_status = pc2_serial_poll_status,
+	.t1_delay = pc2_t1_delay,
+	.return_to_local = pc2_return_to_local,
+};
+
+/*
+ * Callback table registered under the name "pcIIa".  It differs from the
+ * "pcII" table only in its attach/detach handlers.
+ *
+ * .local_parallel_poll_mode (XXX) and .line_status are not provided and
+ * therefore stay NULL.
+ */
+static struct gpib_interface pc2a_interface = {
+	.name = "pcIIa",
+	.attach = pc2a_attach,
+	.detach = pc2a_detach,
+	.read = pc2_read,
+	.write = pc2_write,
+	.command = pc2_command,
+	.take_control = pc2_take_control,
+	.go_to_standby = pc2_go_to_standby,
+	.request_system_control = pc2_request_system_control,
+	.interface_clear = pc2_interface_clear,
+	.remote_enable = pc2_remote_enable,
+	.enable_eos = pc2_enable_eos,
+	.disable_eos = pc2_disable_eos,
+	.parallel_poll = pc2_parallel_poll,
+	.parallel_poll_configure = pc2_parallel_poll_configure,
+	.parallel_poll_response = pc2_parallel_poll_response,
+	.update_status = pc2_update_status,
+	.primary_address = pc2_primary_address,
+	.secondary_address = pc2_secondary_address,
+	.serial_poll_response = pc2_serial_poll_response,
+	.serial_poll_status = pc2_serial_poll_status,
+	.t1_delay = pc2_t1_delay,
+	.return_to_local = pc2_return_to_local,
+};
+
+/*
+ * Callback table registered under the name "pcIIa_cb7210".  It uses the
+ * CB7210 attach handler and shares pc2a_detach() with the "pcIIa" table.
+ *
+ * .local_parallel_poll_mode (XXX) and .line_status (XXX) are not provided
+ * and therefore stay NULL.
+ */
+static struct gpib_interface pc2a_cb7210_interface = {
+	.name = "pcIIa_cb7210",
+	.attach = pc2a_cb7210_attach,
+	.detach = pc2a_detach,
+	.read = pc2_read,
+	.write = pc2_write,
+	.command = pc2_command,
+	.take_control = pc2_take_control,
+	.go_to_standby = pc2_go_to_standby,
+	.request_system_control = pc2_request_system_control,
+	.interface_clear = pc2_interface_clear,
+	.remote_enable = pc2_remote_enable,
+	.enable_eos = pc2_enable_eos,
+	.disable_eos = pc2_disable_eos,
+	.parallel_poll = pc2_parallel_poll,
+	.parallel_poll_configure = pc2_parallel_poll_configure,
+	.parallel_poll_response = pc2_parallel_poll_response,
+	.update_status = pc2_update_status,
+	.primary_address = pc2_primary_address,
+	.secondary_address = pc2_secondary_address,
+	.serial_poll_response = pc2_serial_poll_response,
+	.serial_poll_status = pc2_serial_poll_status,
+	.t1_delay = pc2_t1_delay,
+	.return_to_local = pc2_return_to_local,
+};
+
+/*
+ * Callback table registered under the name "pcII_IIa", using the
+ * pc2_2a attach/detach handlers.
+ *
+ * .local_parallel_poll_mode (XXX) and .line_status are not provided and
+ * therefore stay NULL.
+ */
+static struct gpib_interface pc2_2a_interface = {
+	.name = "pcII_IIa",
+	.attach = pc2_2a_attach,
+	.detach = pc2_2a_detach,
+	.read = pc2_read,
+	.write = pc2_write,
+	.command = pc2_command,
+	.take_control = pc2_take_control,
+	.go_to_standby = pc2_go_to_standby,
+	.request_system_control = pc2_request_system_control,
+	.interface_clear = pc2_interface_clear,
+	.remote_enable = pc2_remote_enable,
+	.enable_eos = pc2_enable_eos,
+	.disable_eos = pc2_disable_eos,
+	.parallel_poll = pc2_parallel_poll,
+	.parallel_poll_configure = pc2_parallel_poll_configure,
+	.parallel_poll_response = pc2_parallel_poll_response,
+	.update_status = pc2_update_status,
+	.primary_address = pc2_primary_address,
+	.secondary_address = pc2_secondary_address,
+	.serial_poll_response = pc2_serial_poll_response,
+	.serial_poll_status = pc2_serial_poll_status,
+	.t1_delay = pc2_t1_delay,
+	.return_to_local = pc2_return_to_local,
+};
+
+/*
+ * Module init: register the four pcII-family interfaces in order.  If one
+ * registration fails, the error is logged, the interfaces registered so far
+ * are unregistered in reverse order and the error code is returned.
+ */
+static int __init pc2_init_module(void)
+{
+	struct gpib_interface *const interfaces[] = {
+		&pc2_interface,
+		&pc2a_interface,
+		&pc2a_cb7210_interface,
+		&pc2_2a_interface,
+	};
+	unsigned int n;
+	int ret;
+
+	for (n = 0; n < ARRAY_SIZE(interfaces); n++) {
+		ret = gpib_register_driver(interfaces[n], THIS_MODULE);
+		if (ret) {
+			pr_err("gpib_register_driver failed: error = %d\n", ret);
+			goto unwind;
+		}
+	}
+
+	return 0;
+
+unwind:
+	/* n indexes the interface that failed; undo the ones before it */
+	while (n--)
+		gpib_unregister_driver(interfaces[n]);
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister all four pcII-family interfaces, in the same
+ * order in which pc2_init_module() registered them.
+ */
+static void __exit pc2_exit_module(void)
+{
+	struct gpib_interface *const interfaces[] = {
+		&pc2_interface,
+		&pc2a_interface,
+		&pc2a_cb7210_interface,
+		&pc2_2a_interface,
+	};
+	unsigned int n;
+
+	for (n = 0; n < ARRAY_SIZE(interfaces); n++)
+		gpib_unregister_driver(interfaces[n]);
+}
+
+/* Hook the register/unregister routines above into module load and unload. */
+module_init(pc2_init_module);
+module_exit(pc2_exit_module);
+
diff --git a/drivers/gpib/tms9914/Makefile b/drivers/gpib/tms9914/Makefile
new file mode 100644
index 000000000000..4705ab07f413
--- /dev/null
+++ b/drivers/gpib/tms9914/Makefile
@@ -0,0 +1,6 @@
+# tms9914 GPIB library module, built when CONFIG_GPIB_TMS9914 is set
+obj-$(CONFIG_GPIB_TMS9914) += tms9914.o
+
+
+
+
diff --git a/drivers/gpib/tms9914/tms9914.c b/drivers/gpib/tms9914/tms9914.c
new file mode 100644
index 000000000000..72a11596a35e
--- /dev/null
+++ b/drivers/gpib/tms9914/tms9914.c
@@ -0,0 +1,914 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ *   copyright		  : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/dma.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "gpibP.h"
+#include "tms9914.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB library for tms9914");
+
+static unsigned int update_status_nolock(struct gpib_board *board, struct tms9914_priv *priv);
+
+/*
+ * Issue the take-control auxiliary command (tcs when @synchronous is set,
+ * tca otherwise) and busy-wait until ADSR reports ATN.
+ *
+ * Polls up to 100 times with a 1 us delay in between.  Returns 0 once ATN
+ * is seen (WRITE_READY_BN is cleared in that case) or -ETIMEDOUT.
+ */
+int tms9914_take_control(struct gpib_board *board, struct tms9914_priv *priv, int synchronous)
+{
+	const int max_polls = 100;
+	int polls = 0;
+
+	write_byte(priv, synchronous ? AUX_TCS : AUX_TCA, AUXCR);
+
+	// busy wait until ATN is asserted
+	for (;;) {
+		if (polls == max_polls)
+			return -ETIMEDOUT;
+		if (read_byte(priv, ADSR) & HR_ATN)
+			break;
+		udelay(1);
+		polls++;
+	}
+
+	clear_bit(WRITE_READY_BN, &priv->state);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tms9914_take_control);
+
+/*
+ * Variant of tms9914_take_control() that never issues tcs.
+ *
+ * The agilent 82350B has a buggy tcs implementation that interferes with
+ * tca.  It appears to follow the controller state machine described in the
+ * TI 9900 TMS9914A data manual published in 1982, where tcs puts the
+ * controller into a CWAS state in which it waits indefinitely for ANRS and
+ * ignores tca.  A working tca matters far more than tcs, so a synchronous
+ * request is simply reported as timed out without touching the chip.
+ *
+ * It is unknown whether the real tms9914a shares the problem or only the
+ * fpga of the 82350B does.  For now only agilent_82350b uses this
+ * workaround; the other tms9914 based drivers keep calling
+ * tms9914_take_control() directly (which does issue tcs).
+ */
+int tms9914_take_control_workaround(struct gpib_board *board,
+				    struct tms9914_priv *priv, int synchronous)
+{
+	return synchronous ? -ETIMEDOUT : tms9914_take_control(board, priv, 0);
+}
+EXPORT_SYMBOL_GPL(tms9914_take_control_workaround);
+
+/*
+ * Issue the go-to-standby auxiliary command and busy-wait until ADSR no
+ * longer reports ATN.
+ *
+ * Polls up to 1000 times with a 1 us delay in between.  Returns 0 once ATN
+ * is released (COMMAND_READY_BN is cleared in that case) or -ETIMEDOUT.
+ */
+int tms9914_go_to_standby(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	const int max_polls = 1000;
+	int polls = 0;
+
+	write_byte(priv, AUX_GTS, AUXCR);
+
+	// busy wait until ATN is released
+	for (;;) {
+		if (polls == max_polls)
+			return -ETIMEDOUT;
+		if (!(read_byte(priv, ADSR) & HR_ATN))
+			break;
+		udelay(1);
+		polls++;
+	}
+
+	clear_bit(COMMAND_READY_BN, &priv->state);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tms9914_go_to_standby);
+
+/*
+ * Set (@assert != 0) or clear the chip's sic auxiliary command.  Setting
+ * it also marks the board as controller-in-charge (CIC) in board->status.
+ */
+void tms9914_interface_clear(struct gpib_board *board, struct tms9914_priv *priv, int assert)
+{
+	if (!assert) {
+		write_byte(priv, AUX_SIC, AUXCR);
+		return;
+	}
+
+	write_byte(priv, AUX_SIC | AUX_CS, AUXCR);
+	set_bit(CIC_NUM, &board->status);
+}
+EXPORT_SYMBOL_GPL(tms9914_interface_clear);
+
+/* Set (@enable != 0) or clear the chip's sre auxiliary command. */
+void tms9914_remote_enable(struct gpib_board *board, struct tms9914_priv *priv, int enable)
+{
+	int sre_cmd = AUX_SRE;
+
+	if (enable)
+		sre_cmd |= AUX_CS;
+	write_byte(priv, sre_cmd, AUXCR);
+}
+EXPORT_SYMBOL_GPL(tms9914_remote_enable);
+
+/*
+ * Request (rqc) or release (rlc) system control.  On release the CIC bit
+ * in board->status is cleared before the command is issued.  Always
+ * returns 0.
+ */
+int tms9914_request_system_control(struct gpib_board *board, struct tms9914_priv *priv,
+				   int request_control)
+{
+	if (!request_control)
+		clear_bit(CIC_NUM, &board->status);
+
+	write_byte(priv, request_control ? AUX_RQC : AUX_RLC, AUXCR);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tms9914_request_system_control);
+
+/*
+ * Select the T1 delay setting for a request of @nano_sec.
+ *
+ * Assuming a 5 MHz input clock (200 ns period): requests up to 4 periods
+ * enable both stdl and vstdl (4 cycles), requests up to 8 periods enable
+ * only stdl (8 cycles), anything longer disables both (12 cycles).
+ *
+ * Returns the selected delay in nanoseconds (cycles * 200).
+ */
+unsigned int tms9914_t1_delay(struct gpib_board *board, struct tms9914_priv *priv,
+			      unsigned int nano_sec)
+{
+	const unsigned int period_ns = 200;	// assuming 5Mhz input clock
+	unsigned int cycles = 12;
+	int stdl_cmd = AUX_STDL;
+	int vstdl_cmd = AUX_VSTDL;
+
+	if (nano_sec <= 8 * period_ns) {
+		stdl_cmd |= AUX_CS;
+		cycles = 8;
+	}
+	if (nano_sec <= 4 * period_ns) {
+		vstdl_cmd |= AUX_CS;
+		cycles = 4;
+	}
+
+	/* both settings are always written, stdl first */
+	write_byte(priv, stdl_cmd, AUXCR);
+	write_byte(priv, vstdl_cmd, AUXCR);
+
+	return cycles * period_ns;
+}
+EXPORT_SYMBOL_GPL(tms9914_t1_delay);
+
+/* Issue the rtl (return to local) auxiliary command; @board is unused. */
+void tms9914_return_to_local(const struct gpib_board *board, struct tms9914_priv *priv)
+{
+	write_byte(priv, AUX_RTL, AUXCR);
+}
+EXPORT_SYMBOL_GPL(tms9914_return_to_local);
+
+/*
+ * Program the chip's holdoff behaviour and remember it in
+ * priv->holdoff_mode.
+ *
+ * hlde is set only for TMS9914_HOLDOFF_EOI and hlda only for
+ * TMS9914_HOLDOFF_ALL; TMS9914_HOLDOFF_NONE clears both.  An unknown mode
+ * is logged and recorded without touching the chip.
+ */
+void tms9914_set_holdoff_mode(struct tms9914_priv *priv, enum tms9914_holdoff_mode mode)
+{
+	int hlde_cmd = AUX_HLDE;
+	int hlda_cmd = AUX_HLDA;
+
+	switch (mode) {
+	case TMS9914_HOLDOFF_NONE:
+		break;
+	case TMS9914_HOLDOFF_EOI:
+		hlde_cmd |= AUX_CS;
+		break;
+	case TMS9914_HOLDOFF_ALL:
+		hlda_cmd |= AUX_CS;
+		break;
+	default:
+		pr_err("bug! bad holdoff mode %i\n", mode);
+		goto record;
+	}
+
+	write_byte(priv, hlde_cmd, AUXCR);
+	write_byte(priv, hlda_cmd, AUXCR);
+
+record:
+	priv->holdoff_mode = mode;
+}
+EXPORT_SYMBOL_GPL(tms9914_set_holdoff_mode);
+
+/*
+ * Issue the rhdf (release holdoff) auxiliary command if a holdoff is
+ * recorded as active, and clear that record.  Does nothing otherwise.
+ */
+void tms9914_release_holdoff(struct tms9914_priv *priv)
+{
+	if (!priv->holdoff_active)
+		return;
+
+	write_byte(priv, AUX_RHDF, AUXCR);
+	priv->holdoff_active = 0;
+}
+EXPORT_SYMBOL_GPL(tms9914_release_holdoff);
+
+/*
+ * Enable end-of-string detection on reads.  Stores @eos_byte and sets the
+ * eos flags to REOS, plus BIN when @compare_8_bits is non-zero.  Nothing is
+ * written to the chip.  Always returns 0.
+ */
+int tms9914_enable_eos(struct gpib_board *board, struct tms9914_priv *priv, u8 eos_byte,
+		       int compare_8_bits)
+{
+	priv->eos = eos_byte;
+	priv->eos_flags = compare_8_bits ? (REOS | BIN) : REOS;
+	return 0;
+}
+EXPORT_SYMBOL(tms9914_enable_eos);
+
+/*
+ * Disable end-of-string detection on reads by clearing REOS; the stored eos
+ * byte and the other eos flags are left untouched.
+ */
+void tms9914_disable_eos(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	priv->eos_flags &= ~REOS;
+}
+EXPORT_SYMBOL(tms9914_disable_eos);
+
+/*
+ * Conduct a parallel poll: set rpp, wait 2 us, read the response from CPTR
+ * into @result, then clear rpp again.  Always returns 0.
+ */
+int tms9914_parallel_poll(struct gpib_board *board, struct tms9914_priv *priv, u8 *result)
+{
+	// execute parallel poll
+	write_byte(priv, AUX_CS | AUX_RPP, AUXCR);
+	udelay(2);
+	*result = read_byte(priv, CPTR);
+	// clear parallel poll state
+	write_byte(priv, AUX_RPP, AUXCR);
+	return 0;
+}
+EXPORT_SYMBOL(tms9914_parallel_poll);
+
+/*
+ * Load the parallel poll register.  The bit for @dio_line (1-based) is set
+ * only when polling is enabled and @ist agrees with @sense (both true or
+ * both false); in every other case the register is written as 0.
+ */
+static void set_ppoll_reg(struct tms9914_priv *priv, int enable,
+			  unsigned int dio_line, int sense, int ist)
+{
+	u8 response = 0;
+
+	if (enable && !sense == !ist)
+		response = 1 << (dio_line - 1);
+
+	write_byte(priv, response, PPR);
+}
+
+/*
+ * Decode a parallel poll configuration byte into the enable flag, the
+ * 1-based DIO line and the sense bit, store them in @priv and refresh the
+ * parallel poll register using the board's current ist value.
+ */
+void tms9914_parallel_poll_configure(struct gpib_board *board,
+				     struct tms9914_priv *priv, u8 config)
+{
+	priv->ppoll_enable = (config & PPC_DISABLE) == 0;
+	priv->ppoll_line = (config & PPC_DIO_MASK) + 1;
+	priv->ppoll_sense = (config & PPC_SENSE) != 0;
+	set_ppoll_reg(priv, priv->ppoll_enable, priv->ppoll_line, priv->ppoll_sense, board->ist);
+}
+EXPORT_SYMBOL(tms9914_parallel_poll_configure);
+
+/*
+ * Refresh the parallel poll register for a new @ist value, reusing the
+ * enable/line/sense settings previously stored in @priv.
+ */
+void tms9914_parallel_poll_response(struct gpib_board *board,
+				    struct tms9914_priv *priv, int ist)
+{
+	set_ppoll_reg(priv, priv->ppoll_enable, priv->ppoll_line, priv->ppoll_sense, ist);
+}
+EXPORT_SYMBOL(tms9914_parallel_poll_response);
+
+/*
+ * Install a new serial poll status byte.  Under the board spinlock the byte
+ * is written to SPMR and cached in priv->spoll_status, then rsv2 is set or
+ * cleared depending on whether request_service_bit is present in @status.
+ */
+void tms9914_serial_poll_response(struct gpib_board *board,
+				  struct tms9914_priv *priv, u8 status)
+{
+	unsigned long flags;
+	int rsv2_cmd = AUX_RSV2;
+
+	if (status & request_service_bit)
+		rsv2_cmd |= AUX_CS;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	write_byte(priv, status, SPMR);
+	priv->spoll_status = status;
+	write_byte(priv, rsv2_cmd, AUXCR);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+}
+EXPORT_SYMBOL(tms9914_serial_poll_response);
+
+/*
+ * Return the cached serial poll status byte, read under the board spinlock.
+ */
+u8 tms9914_serial_poll_status(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	unsigned long flags;
+	u8 snapshot;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	snapshot = priv->spoll_status;
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return snapshot;
+}
+EXPORT_SYMBOL(tms9914_serial_poll_status);
+
+/*
+ * Program the board's primary GPIB address: @address masked with
+ * ADDRESS_MASK is written to the ADR register.  Always returns 0.
+ */
+int tms9914_primary_address(struct gpib_board *board,
+			    struct tms9914_priv *priv, unsigned int address)
+{
+	// put primary address in address0
+	write_byte(priv, address & ADDRESS_MASK, ADR);
+	return 0;
+}
+EXPORT_SYMBOL(tms9914_primary_address);
+
+/*
+ * Enable or disable secondary addressing by setting or clearing HR_APTIE in
+ * the cached IMR1 bits and writing them to IMR1.  @address itself is not
+ * written to the chip here.  Always returns 0.
+ */
+int tms9914_secondary_address(struct gpib_board *board, struct tms9914_priv *priv,
+			      unsigned int address, int enable)
+{
+	priv->imr1_bits = enable ? (priv->imr1_bits | HR_APTIE)
+				 : (priv->imr1_bits & ~HR_APTIE);
+
+	write_byte(priv, priv->imr1_bits, IMR1);
+	return 0;
+}
+EXPORT_SYMBOL(tms9914_secondary_address);
+
+/*
+ * Locked front end for update_status_nolock(): refresh board->status from
+ * the chip, then clear the bits in @clear_mask, all under the board
+ * spinlock.
+ *
+ * Returns the status as it was before @clear_mask was applied.
+ */
+unsigned int tms9914_update_status(struct gpib_board *board, struct tms9914_priv *priv,
+				   unsigned int clear_mask)
+{
+	unsigned long irq_flags;
+	unsigned int fresh_status;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+	fresh_status = update_status_nolock(board, priv);
+	board->status &= ~clear_mask;
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return fresh_status;
+}
+EXPORT_SYMBOL(tms9914_update_status);
+
+/*
+ * Derive priv->talker_state from the ADSR bits: idle without HR_TA,
+ * addressed with HR_TA while ATN is asserted, active otherwise.
+ */
+static void update_talker_state(struct tms9914_priv *priv, unsigned int address_status_bits)
+{
+	if (!(address_status_bits & HR_TA)) {
+		priv->talker_state = talker_idle;
+		return;
+	}
+
+	/*
+	 * Without ATN this could also be serial_poll_active, but the tms9914
+	 * provides no way to distinguish, so we'll assume talker_active.
+	 */
+	priv->talker_state = (address_status_bits & HR_ATN) ? talker_addressed : talker_active;
+}
+
+/*
+ * Derive priv->listener_state from the ADSR bits: idle without HR_LA,
+ * addressed with HR_LA while ATN is asserted, active otherwise.
+ */
+static void update_listener_state(struct tms9914_priv *priv, unsigned int address_status_bits)
+{
+	if (!(address_status_bits & HR_LA)) {
+		priv->listener_state = listener_idle;
+		return;
+	}
+
+	priv->listener_state = (address_status_bits & HR_ATN) ? listener_addressed
+							      : listener_active;
+}
+
+/*
+ * Refresh board->status from the chip without taking the board spinlock.
+ *
+ * REM, LOK and ATN mirror the matching ADSR bits; TACS and LACS follow the
+ * talker/listener states derived from ADSR.  A pending SRQI is dropped when
+ * BSR no longer shows SRQ.  Returns the resulting board->status.
+ */
+static unsigned int update_status_nolock(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	const int adsr = read_byte(priv, ADSR);
+
+	// remote/local, lockout and ATN come straight from ADSR
+	assign_bit(REM_NUM, &board->status, (adsr & HR_REM) != 0);
+	assign_bit(LOK_NUM, &board->status, (adsr & HR_LLO) != 0);
+	assign_bit(ATN_NUM, &board->status, (adsr & HR_ATN) != 0);
+
+	// talker/listener addressed
+	update_talker_state(priv, adsr);
+	assign_bit(TACS_NUM, &board->status,
+		   priv->talker_state == talker_active ||
+		   priv->talker_state == talker_addressed);
+
+	update_listener_state(priv, adsr);
+	assign_bit(LACS_NUM, &board->status,
+		   priv->listener_state == listener_active ||
+		   priv->listener_state == listener_addressed);
+
+	// Check for SRQI - not reset elsewhere except in autospoll
+	if ((board->status & SRQI) && !(read_byte(priv, BSR) & BSR_SRQ_BIT))
+		clear_bit(SRQI_NUM, &board->status);
+
+	dev_dbg(board->gpib_dev, "status 0x%lx, state 0x%lx\n", board->status, priv->state);
+
+	return board->status;
+}
+
+/*
+ * Report the state of the bus control lines.  BSR is read once and each
+ * BSR_*_BIT that is set is translated to its BUS_* flag; the result always
+ * includes VALID_ALL.
+ */
+int tms9914_line_status(const struct gpib_board *board, struct tms9914_priv *priv)
+{
+	static const struct {
+		int bsr_bit;
+		int bus_flag;
+	} line_map[] = {
+		{ BSR_REN_BIT, BUS_REN },
+		{ BSR_IFC_BIT, BUS_IFC },
+		{ BSR_SRQ_BIT, BUS_SRQ },
+		{ BSR_EOI_BIT, BUS_EOI },
+		{ BSR_NRFD_BIT, BUS_NRFD },
+		{ BSR_NDAC_BIT, BUS_NDAC },
+		{ BSR_DAV_BIT, BUS_DAV },
+		{ BSR_ATN_BIT, BUS_ATN },
+	};
+	const int bsr_bits = read_byte(priv, BSR);
+	int status = VALID_ALL;
+	size_t n;
+
+	for (n = 0; n < ARRAY_SIZE(line_map); n++) {
+		if (bsr_bits & line_map[n].bsr_bit)
+			status |= line_map[n].bus_flag;
+	}
+
+	return status;
+}
+EXPORT_SYMBOL(tms9914_line_status);
+
+/*
+ * Return 1 if @byte matches the configured end-of-string character, else 0.
+ * Matching is off unless REOS is set; with BIN the comparison is exact,
+ * otherwise only the low seven bits are compared.
+ */
+static int check_for_eos(struct tms9914_priv *priv, u8 byte)
+{
+	static const u8 seven_bit_compare_mask = 0x7f;
+
+	if (!(priv->eos_flags & REOS))
+		return 0;
+
+	if (priv->eos_flags & BIN)
+		return priv->eos == byte;
+
+	/* equal in the low seven bits <=> the XOR has none of them set */
+	return !((priv->eos ^ byte) & seven_bit_compare_mask);
+}
+
+/*
+ * Sleep until a byte is ready to be read, a device clear was seen or the
+ * board timed out.
+ *
+ * Returns 0 when a byte is ready, -ERESTARTSYS if the sleep was
+ * interrupted, -ETIMEDOUT on timeout (checked first) and -EINTR on device
+ * clear.
+ */
+static int wait_for_read_byte(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	int interrupted;
+
+	interrupted = wait_event_interruptible(board->wait,
+					       test_bit(READ_READY_BN, &priv->state) ||
+					       test_bit(DEV_CLEAR_BN, &priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	if (interrupted)
+		return -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+
+	return test_bit(DEV_CLEAR_BN, &priv->state) ? -EINTR : 0;
+}
+
+/*
+ * Fetch one byte from the data-in register under the board spinlock.
+ *
+ * READ_READY_BN is cleared before the read.  *@end is set to 1 if an END
+ * had been recorded for this byte (the record is consumed), else to 0.
+ * holdoff_active is set according to the current holdoff mode: always for
+ * HOLDOFF_ALL, only on END for HOLDOFF_EOI, never for HOLDOFF_NONE.
+ */
+static inline u8 tms9914_read_data_in(struct gpib_board *board,
+				      struct tms9914_priv *priv, int *end)
+{
+	unsigned long flags;
+	u8 byte;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	clear_bit(READ_READY_BN, &priv->state);
+	byte = read_byte(priv, DIR);
+	*end = test_and_clear_bit(RECEIVED_END_BN, &priv->state) ? 1 : 0;
+
+	if (priv->holdoff_mode == TMS9914_HOLDOFF_ALL ||
+	    (priv->holdoff_mode == TMS9914_HOLDOFF_EOI && *end))
+		priv->holdoff_active = 1;
+	else if (priv->holdoff_mode != TMS9914_HOLDOFF_EOI &&
+		 priv->holdoff_mode != TMS9914_HOLDOFF_NONE)
+		dev_err(board->gpib_dev, "bug! bad holdoff mode %i\n", priv->holdoff_mode);
+
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	return byte;
+}
+
+/*
+ * Read up to @length bytes into @buffer by programmed I/O, stopping early
+ * when END is received or the eos character is seen (*@end is then 1).
+ *
+ * Any pending holdoff is released before waiting for each byte.
+ * *@bytes_read counts the bytes stored, also when an error cuts the
+ * transfer short.  Returns 0 or the negative errno from
+ * wait_for_read_byte().
+ */
+static int pio_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		    size_t length, int *end, size_t *bytes_read)
+{
+	int retval = 0;
+	u8 byte;
+
+	*end = 0;
+	for (*bytes_read = 0; *bytes_read < length && !*end; ++*bytes_read) {
+		tms9914_release_holdoff(priv);
+		retval = wait_for_read_byte(board, priv);
+		if (retval < 0)
+			return retval;
+
+		byte = tms9914_read_data_in(board, priv, end);
+		buffer[*bytes_read] = byte;
+		if (check_for_eos(priv, byte))
+			*end = 1;
+	}
+
+	return retval;
+}
+
+/*
+ * Read up to @length bytes from the bus into @buffer.
+ *
+ * All but the last byte are read with holdoff on END only (or on every
+ * byte when eos detection is enabled); the last byte is always read with
+ * holdoff on every byte so that the chip holds off after it.  *@end tells
+ * whether END or the eos character terminated the transfer, *@bytes_read
+ * how many bytes were stored (also on error).
+ *
+ * Returns 0 on success or a negative errno from pio_read().
+ */
+int tms9914_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		 size_t length, int *end, size_t *bytes_read)
+{
+	size_t chunk;
+	int retval;
+
+	*end = 0;
+	*bytes_read = 0;
+	if (!length)
+		return 0;
+
+	clear_bit(DEV_CLEAR_BN, &priv->state);
+
+	// transfer data (except for last byte)
+	if (length > 1) {
+		tms9914_set_holdoff_mode(priv, (priv->eos_flags & REOS) ?
+					 TMS9914_HOLDOFF_ALL : TMS9914_HOLDOFF_EOI);
+		retval = pio_read(board, priv, buffer, length - 1, end, &chunk);
+		*bytes_read += chunk;
+		if (retval < 0)
+			return retval;
+		buffer += chunk;
+		length -= chunk;
+	}
+
+	// nothing more to read once an END has been received
+	if (*end)
+		return 0;
+
+	// make sure we holdoff after last byte read
+	tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_ALL);
+	retval = pio_read(board, priv, buffer, length, end, &chunk);
+	*bytes_read += chunk;
+
+	return retval < 0 ? retval : 0;
+}
+EXPORT_SYMBOL(tms9914_read);
+
+/*
+ * Sleep until the next byte may be written, or until a bus error, a device
+ * clear or a board timeout is flagged.
+ *
+ * Returns 0 when ready to write, -ERESTARTSYS if the sleep was
+ * interrupted, otherwise -ETIMEDOUT, -EIO (bus error) or -EINTR (device
+ * clear), checked in that order.
+ */
+static int pio_write_wait(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	int interrupted;
+
+	// wait until next byte is ready to be sent
+	interrupted = wait_event_interruptible(board->wait,
+					       test_bit(WRITE_READY_BN, &priv->state) ||
+					       test_bit(BUS_ERROR_BN, &priv->state) ||
+					       test_bit(DEV_CLEAR_BN, &priv->state) ||
+					       test_bit(TIMO_NUM, &board->status));
+	if (interrupted)
+		return -ERESTARTSYS;
+
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+	if (test_bit(BUS_ERROR_BN, &priv->state))
+		return -EIO;
+
+	return test_bit(DEV_CLEAR_BN, &priv->state) ? -EINTR : 0;
+}
+
+/*
+ * Write @length bytes from @buffer to the CDOR register by programmed I/O,
+ * waiting with pio_write_wait() before each byte and once more after the
+ * last one.
+ *
+ * *@bytes_written counts the bytes handed to the chip, also when an error
+ * cuts the transfer short.  Returns @length on success or the negative
+ * errno from pio_write_wait().
+ */
+static int pio_write(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
+		     size_t length, size_t *bytes_written)
+{
+	int retval;
+	unsigned long flags;
+
+	*bytes_written = 0;
+	while (*bytes_written < length) {
+		retval = pio_write_wait(board, priv);
+		/*
+		 * Report the failure right away.  Falling through to the final
+		 * wait below could replace this error with success if the chip
+		 * turned ready in the meantime (e.g. after an interrupted
+		 * sleep), making a partial write look complete.
+		 */
+		if (retval < 0)
+			return retval;
+
+		spin_lock_irqsave(&board->spinlock, flags);
+		clear_bit(WRITE_READY_BN, &priv->state);
+		write_byte(priv, buffer[(*bytes_written)++], CDOR);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+	}
+	/* wait until the chip is ready again after the last byte */
+	retval = pio_write_wait(board, priv);
+	if (retval < 0)
+		return retval;
+
+	return length;
+}
+
+/*
+ * Write @length bytes from @buffer to the bus.  With @send_eoi the last
+ * byte is held back and sent separately after issuing the seoi auxiliary
+ * command.
+ *
+ * *@bytes_written counts the bytes handed to the chip, also on error.
+ * Returns a negative errno on failure.  On success the value is whatever
+ * the last pio_write() call returned (its byte count), so callers should
+ * test for < 0 rather than for non-zero.
+ */
+int tms9914_write(struct gpib_board *board, struct tms9914_priv *priv,
+		  u8 *buffer, size_t length, int send_eoi, size_t *bytes_written)
+{
+	size_t chunk;
+	int retval = 0;
+
+	*bytes_written = 0;
+	if (!length)
+		return 0;
+
+	clear_bit(BUS_ERROR_BN, &priv->state);
+	clear_bit(DEV_CLEAR_BN, &priv->state);
+
+	/* save the last byte for sending EOI */
+	if (send_eoi)
+		length--;
+
+	if (length > 0) {
+		// PIO transfer
+		retval = pio_write(board, priv, buffer, length, &chunk);
+		*bytes_written += chunk;
+		if (retval < 0)
+			return retval;
+	}
+
+	if (!send_eoi)
+		return retval;
+
+	/* send EOI with the final byte */
+	write_byte(priv, AUX_SEOI, AUXCR);
+	retval = pio_write(board, priv, buffer + *bytes_written, 1, &chunk);
+	*bytes_written += chunk;
+
+	return retval;
+}
+EXPORT_SYMBOL(tms9914_write);
+
+/*
+ * Track, in software, whether a command byte we are sending addresses this
+ * board as listener or talker, and switch the chip's lon/ton state to match.
+ *
+ * board->sad < 0 means no secondary address is configured; in that case the
+ * primary address alone makes us listener/talker.  Otherwise the primary
+ * address only arms primary_listen_addressed / primary_talk_addressed and
+ * the following secondary address completes the addressing.
+ *
+ * NOTE(review): the branches are tested in order, so the talk-side MSA
+ * branch is only reached while primary_listen_addressed is clear.
+ */
+static void check_my_address_state(struct gpib_board *board,
+				   struct tms9914_priv *priv, int cmd_byte)
+{
+	if (cmd_byte == MLA(board->pad)) {
+		// our own listen address
+		priv->primary_listen_addressed = 1;
+		// become active listener
+		if (board->sad < 0)
+			write_byte(priv, AUX_LON | AUX_CS, AUXCR);
+	} else if (board->sad >= 0 && priv->primary_listen_addressed &&
+		  cmd_byte == MSA(board->sad)) {
+		// become active listener
+		write_byte(priv, AUX_LON | AUX_CS, AUXCR);
+	} else if (cmd_byte != MLA(board->pad) && (cmd_byte & 0xe0) == LAD) {
+		// some other listen address: forget our primary listen addressing
+		priv->primary_listen_addressed = 0;
+	} else if (cmd_byte == UNL) {
+		// unlisten: stop listening
+		priv->primary_listen_addressed = 0;
+		write_byte(priv, AUX_LON, AUXCR);
+	} else if (cmd_byte == MTA(board->pad))	{
+		// our own talk address
+		priv->primary_talk_addressed = 1;
+		if (board->sad < 0)
+			// make active talker
+			write_byte(priv, AUX_TON | AUX_CS, AUXCR);
+	} else if (board->sad >= 0 && priv->primary_talk_addressed &&
+		   cmd_byte == MSA(board->sad)) {
+		// become active talker
+		write_byte(priv, AUX_TON | AUX_CS, AUXCR);
+	} else if (cmd_byte != MTA(board->pad) && (cmd_byte & 0xe0) == TAD) {
+		// Other Talk Address
+		priv->primary_talk_addressed = 0;
+		write_byte(priv, AUX_TON, AUXCR);
+	} else if (cmd_byte == UNT) {
+		// untalk: stop talking
+		priv->primary_talk_addressed = 0;
+		write_byte(priv, AUX_TON, AUXCR);
+	}
+}
+
+/*
+ * Send @length command bytes from @buffer, one per COMMAND_READY event,
+ * updating our own listener/talker state for each byte sent.
+ *
+ * *@bytes_written counts the bytes handed to the chip.  The send loop stops
+ * early if a wait is interrupted or the board times out; the error itself
+ * is determined by the final wait below.  Returns 0, -ERESTARTSYS if that
+ * final wait was interrupted, or -ETIMEDOUT if the board timed out (which
+ * takes precedence).
+ */
+int tms9914_command(struct gpib_board *board, struct tms9914_priv *priv,  u8 *buffer,
+		    size_t length, size_t *bytes_written)
+{
+	int retval = 0;
+	unsigned long flags;
+
+	*bytes_written = 0;
+	while (*bytes_written < length) {
+		// stop sending if interrupted; the wait after the loop sets retval
+		if (wait_event_interruptible(board->wait,
+					     test_bit(COMMAND_READY_BN,
+						      &priv->state) ||
+					     test_bit(TIMO_NUM, &board->status)))
+			break;
+		if (test_bit(TIMO_NUM, &board->status))
+			break;
+
+		spin_lock_irqsave(&board->spinlock, flags);
+		clear_bit(COMMAND_READY_BN, &priv->state);
+		write_byte(priv, buffer[*bytes_written], CDOR);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		check_my_address_state(board, priv, buffer[*bytes_written]);
+
+		++(*bytes_written);
+	}
+	// wait until last command byte is written
+	if (wait_event_interruptible(board->wait,
+				     test_bit(COMMAND_READY_BN,
+					      &priv->state) || test_bit(TIMO_NUM, &board->status)))
+		retval = -ERESTARTSYS;
+	// a timeout overrides -ERESTARTSYS
+	if (test_bit(TIMO_NUM, &board->status))
+		retval = -ETIMEDOUT;
+
+	return retval;
+}
+EXPORT_SYMBOL(tms9914_command);
+
+irqreturn_t tms9914_interrupt(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	// Latch both interrupt status registers, ISR0 first and then ISR1;
+	// reading them also clears the status, so each is read exactly once.
+	const int isr0 = read_byte(priv, ISR0);
+	const int isr1 = read_byte(priv, ISR1);
+
+	return tms9914_interrupt_have_status(board, priv, isr0, isr1);
+}
+EXPORT_SYMBOL(tms9914_interrupt);
+
+irqreturn_t tms9914_interrupt_have_status(struct gpib_board *board, struct tms9914_priv *priv,
+					  int status0, int status1)
+{
+	// record reception of END
+	if (status0 & HR_END)
+		set_bit(RECEIVED_END_BN, &priv->state);
+	// byte in: flag that incoming data is ready for the PIO reader
+	if ((status0 & HR_BI))
+		set_bit(READ_READY_BN, &priv->state);
+	if ((status0 & HR_BO))	{	// byte out: ATN in ADSR picks command vs. data readiness
+		if (read_byte(priv, ADSR) & HR_ATN)
+			set_bit(COMMAND_READY_BN, &priv->state);
+		else
+			set_bit(WRITE_READY_BN, &priv->state);
+	}
+
+	if (status0 & HR_SPAS) {
+		priv->spoll_status &= ~request_service_bit;
+		write_byte(priv, priv->spoll_status, SPMR);
+		// FIXME: set SPOLL status bit
+	}
+	// record service request in status
+	if (status1 & HR_SRQ)
+		set_bit(SRQI_NUM, &board->status);
+	// have been addressed (with secondary addressing disabled)
+	if (status1 & HR_MA)
+		// clear dac holdoff
+		write_byte(priv, AUX_VAL, AUXCR);
+	// unrecognized command received
+	if (status1 & HR_UNC) {
+		unsigned short command_byte = read_byte(priv, CPTR) & gpib_command_mask;
+
+		switch (command_byte) {
+		case PP_CONFIG:
+			priv->ppoll_configure_state = 1;
+			/*
+			 * AUX_PTS generates another UNC interrupt on the next command byte
+			 * if it is in the secondary address group (such as PPE and PPD).
+			 */
+			write_byte(priv, AUX_PTS, AUXCR);
+			write_byte(priv, AUX_VAL, AUXCR);
+			break;
+		case PPU:
+			tms9914_parallel_poll_configure(board, priv, command_byte);
+			write_byte(priv, AUX_VAL, AUXCR);
+			break;
+		default:
+			if (is_PPE(command_byte) || is_PPD(command_byte)) {
+				if (priv->ppoll_configure_state) {
+					tms9914_parallel_poll_configure(board, priv, command_byte);
+					write_byte(priv, AUX_VAL, AUXCR);
+				} else	{// bad parallel poll configure byte
+					// clear dac holdoff
+					write_byte(priv, AUX_INVAL, AUXCR);
+				}
+			} else	{
+				// clear dac holdoff
+				write_byte(priv, AUX_INVAL, AUXCR);
+			}
+			break;
+		}
+
+		if (in_primary_command_group(command_byte) && command_byte != PP_CONFIG)
+			priv->ppoll_configure_state = 0;
+	}
+
+	if (status1 & HR_ERR) {
+		dev_dbg(board->gpib_dev, "gpib bus error\n");
+		set_bit(BUS_ERROR_BN, &priv->state);
+	}
+
+	if (status1 & HR_IFC) {
+		push_gpib_event(board, EVENT_IFC);
+		clear_bit(CIC_NUM, &board->status);
+	}
+
+	if (status1 & HR_GET) {
+		push_gpib_event(board, EVENT_DEV_TRG);
+		// clear dac holdoff
+		write_byte(priv, AUX_VAL, AUXCR);
+	}
+
+	if (status1 & HR_DCAS) {
+		push_gpib_event(board, EVENT_DEV_CLR);
+		// clear dac holdoff
+		write_byte(priv, AUX_VAL, AUXCR);
+		set_bit(DEV_CLEAR_BN, &priv->state);
+	}
+
+	// secondary addressing check; NOTE(review): sad < 0 only logs, MSA(sad) is still compared
+	if (status1 & HR_APT) {
+		if (board->sad < 0)
+			dev_err(board->gpib_dev, "bug, APT interrupt without secondary addressing?\n");
+		if ((read_byte(priv, CPTR) & gpib_command_mask) == MSA(board->sad))
+			write_byte(priv, AUX_VAL, AUXCR);
+		else
+			write_byte(priv, AUX_INVAL, AUXCR);
+	}
+
+	if ((status0 & priv->imr0_bits) || (status1 & priv->imr1_bits))	{
+		dev_dbg(board->gpib_dev, "isr0 0x%x, imr0 0x%x, isr1 0x%x, imr1 0x%x\n",
+			status0, priv->imr0_bits, status1, priv->imr1_bits);
+		update_status_nolock(board, priv);
+		wake_up_interruptible(&board->wait);
+	}
+	return IRQ_HANDLED;
+}
+EXPORT_SYMBOL(tms9914_interrupt_have_status);
+
+void tms9914_board_reset(struct tms9914_priv *priv)
+{
+	/* chip reset: AUX_CHIP_RESET with AUX_CS; tms9914_online() writes it again without AUX_CS */
+	write_byte(priv, AUX_CHIP_RESET | AUX_CS, AUXCR);
+
+	/* disable all interrupts: zero both cached masks and write them to IMR0 / IMR1 */
+	priv->imr0_bits = 0;
+	write_byte(priv, priv->imr0_bits, IMR0);
+	priv->imr1_bits = 0;
+	write_byte(priv, priv->imr1_bits, IMR1);
+	write_byte(priv, AUX_DAI | AUX_CS, AUXCR);
+
+	/* clear registers by reading; the values read are discarded */
+	read_byte(priv, CPTR);
+	read_byte(priv, ISR0);
+	read_byte(priv, ISR1);
+
+	write_byte(priv, 0, SPMR);	/* zero the serial poll byte */
+
+	/* parallel poll unconfigure */
+	write_byte(priv, 0, PPR);
+	/* request for data holdoff */
+	tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_ALL);
+}
+EXPORT_SYMBOL_GPL(tms9914_board_reset);
+
+void tms9914_online(struct gpib_board *board, struct tms9914_priv *priv)
+{
+	/* set GPIB address; the last secondary-address argument is nonzero only when sad >= 0 */
+	tms9914_primary_address(board, priv, board->pad);
+	tms9914_secondary_address(board, priv, board->sad, board->sad >= 0);
+
+	/* enable tms9914 interrupts: bits are OR-ed into the cached masks, then written out */
+	priv->imr0_bits |= HR_MACIE | HR_RLCIE | HR_ENDIE | HR_BOIE | HR_BIIE |
+		HR_SPASIE;
+	priv->imr1_bits |= HR_MAIE | HR_SRQIE | HR_UNCIE | HR_ERRIE | HR_IFCIE |
+		HR_GETIE | HR_DCASIE;
+	write_byte(priv, priv->imr0_bits, IMR0);
+	write_byte(priv, priv->imr1_bits, IMR1);
+	write_byte(priv, AUX_DAI, AUXCR);	/* AUX_DAI without AUX_CS, unlike tms9914_board_reset() */
+
+	/* turn off reset state */
+	write_byte(priv, AUX_CHIP_RESET, AUXCR);
+}
+EXPORT_SYMBOL_GPL(tms9914_online);
+
+#ifdef CONFIG_HAS_IOPORT
+// wrapper for inb: reads the port at iobase + register_num * offset
+u8 tms9914_ioport_read_byte(struct tms9914_priv *priv, unsigned int register_num)
+{
+	return inb(priv->iobase + register_num * priv->offset);
+}
+EXPORT_SYMBOL_GPL(tms9914_ioport_read_byte);
+
+// wrapper for outb: writes the port at iobase + register_num * offset
+void tms9914_ioport_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
+{
+	outb(data, priv->iobase + register_num * priv->offset);
+	if (register_num == AUXCR)
+		udelay(1);	// only AUXCR writes are followed by this 1 us delay
+}
+EXPORT_SYMBOL_GPL(tms9914_ioport_write_byte);
+#endif
+
+// wrapper for readb: reads the byte at mmiobase + register_num * offset
+u8 tms9914_iomem_read_byte(struct tms9914_priv *priv, unsigned int register_num)
+{
+	return readb(priv->mmiobase + register_num * priv->offset);
+}
+EXPORT_SYMBOL_GPL(tms9914_iomem_read_byte);
+
+// wrapper for writeb: writes the byte at mmiobase + register_num * offset
+void tms9914_iomem_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
+{
+	writeb(data, priv->mmiobase + register_num * priv->offset);
+	if (register_num == AUXCR)
+		udelay(1);	// only AUXCR writes are followed by this 1 us delay
+}
+EXPORT_SYMBOL_GPL(tms9914_iomem_write_byte);
+
+static int __init tms9914_init_module(void)
+{
+	return 0;	// nothing is set up at load time; always succeeds
+}
+
+static void __exit tms9914_exit_module(void)
+{
+}	// intentionally empty: tms9914_init_module() acquires nothing
+
+module_init(tms9914_init_module);
+module_exit(tms9914_exit_module);
+
diff --git a/drivers/gpib/tnt4882/Makefile b/drivers/gpib/tnt4882/Makefile
new file mode 100644
index 000000000000..fa1687ad0d1b
--- /dev/null
+++ b/drivers/gpib/tnt4882/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_GPIB_NI_PCI_ISA) += tnt4882.o
+
+tnt4882-objs := tnt4882_gpib.o mite.o
+
+
+
diff --git a/drivers/gpib/tnt4882/mite.c b/drivers/gpib/tnt4882/mite.c
new file mode 100644
index 000000000000..847b96f411bd
--- /dev/null
+++ b/drivers/gpib/tnt4882/mite.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ *	Hardware driver for NI Mite PCI interface chip,
+ *	adapted from COMEDI
+ *
+ *	Copyright (C) 1997-8 David A. Schleef
+ *	Copyright (C) 2002 Frank Mori Hess
+ *
+ *	The PCI-MIO E series driver was originally written by
+ *	Tomasz Motylewski <...>, and ported to comedi by ds.
+ *
+ *	References for specifications:
+ *
+ *	   321747b.pdf  Register Level Programmer Manual (obsolete)
+ *	   321747c.pdf  Register Level Programmer Manual (new)
+ *	   DAQ-STC reference manual
+ *
+ *	Other possibly relevant info:
+ *
+ *	   320517c.pdf  User manual (obsolete)
+ *	   320517f.pdf  User manual (new)
+ *	   320889a.pdf  delete
+ *	   320906c.pdf  maximum signal ratings
+ *	   321066a.pdf  about 16x
+ *	   321791a.pdf  discontinuation of at-mio-16e-10 rev. c
+ *	   321808a.pdf  about at-mio-16e-10 rev P
+ *	   321837a.pdf  discontinuation of at-mio-16de-10 rev d
+ *	   321838a.pdf  about at-mio-16de-10 rev N
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include "mite.h"
+
+#define PCI_MITE_SIZE		4096
+#define PCI_DAQ_SIZE		4096
+
+struct mite_struct *mite_devices;
+
+#define TOP_OF_PAGE(x) ((x) | (~(PAGE_MASK)))
+
+void mite_init(void)
+{
+	struct pci_dev *pcidev = NULL;
+	struct mite_struct *mite;
+
+	/* One list node per NI device; pci_get_device() drops the previous reference itself. */
+	while ((pcidev = pci_get_device(PCI_VENDOR_ID_NATINST, PCI_ANY_ID, pcidev))) {
+		mite = kzalloc(sizeof(*mite), GFP_KERNEL);
+		if (!mite) {
+			pci_dev_put(pcidev);	/* leaving the walk early: drop the iterator's ref */
+			return;
+		}
+
+		mite->pcidev = pci_dev_get(pcidev);	/* the node keeps its own reference */
+		mite->next = mite_devices;
+		mite_devices = mite;
+	}
+}
+
+int mite_setup(struct mite_struct *mite)
+{
+	if (pci_enable_device(mite->pcidev)) {
+		pr_err("mite: error enabling mite.\n");
+		return -EIO;
+	}
+	pci_set_master(mite->pcidev);
+	if (pci_request_regions(mite->pcidev, "mite")) {
+		pr_err("mite: failed to request mite io regions.\n");
+		/* mite_unsetup() only disables once mite_phys_addr is set, so undo it here */
+		pci_disable_device(mite->pcidev);
+		return -EIO;
+	}
+	/* no u32 temporary: the resource start must not be truncated before ioremap() */
+	mite->mite_phys_addr = pci_resource_start(mite->pcidev, 0);
+	mite->mite_io_addr = ioremap(mite->mite_phys_addr, pci_resource_len(mite->pcidev, 0));
+	if (!mite->mite_io_addr) {
+		pr_err("mite: failed to remap mite io memory address.\n");
+		return -ENOMEM;
+	}
+	mite->daq_phys_addr = pci_resource_start(mite->pcidev, 1);
+	mite->daq_io_addr = ioremap(mite->daq_phys_addr, pci_resource_len(mite->pcidev, 1));
+	if (!mite->daq_io_addr)	{
+		pr_err("mite: failed to remap daq io memory address.\n");
+		return -ENOMEM;
+	}
+	/* program the IO device window base with the resource 1 address plus WENAB */
+	writel(mite->daq_phys_addr | WENAB, mite->mite_io_addr + MITE_IODWBSR);
+	mite->used = 1;
+	return 0;
+}
+
+void mite_cleanup(void)
+{
+	struct mite_struct *mite, *next;
+
+	for (mite = mite_devices; mite; mite = next) {
+		next = mite->next;	/* fetch the link before the node is freed */
+		pci_dev_put(mite->pcidev);	/* drops the mite_init() reference; NULL-safe */
+		kfree(mite);
+	}
+	mite_devices = NULL;	/* do not leave the list head pointing at freed nodes */
+}
+
+void mite_unsetup(struct mite_struct *mite)
+{
+	if (!mite)
+		return;	/* NULL is accepted and ignored */
+	if (mite->mite_io_addr)	{
+		iounmap(mite->mite_io_addr);
+		mite->mite_io_addr = NULL;	/* cleared so a repeated call skips the unmap */
+	}
+	if (mite->daq_io_addr) {
+		iounmap(mite->daq_io_addr);
+		mite->daq_io_addr = NULL;
+	}
+	if (mite->mite_phys_addr) {	/* assigned by mite_setup() only after the regions were requested */
+		pci_release_regions(mite->pcidev);
+		pci_disable_device(mite->pcidev);
+		mite->mite_phys_addr = 0;
+	}
+	mite->used = 0;	/* mite_setup() sets this back to 1 */
+}
diff --git a/drivers/gpib/tnt4882/mite.h b/drivers/gpib/tnt4882/mite.h
new file mode 100644
index 000000000000..a1fdba9672a0
--- /dev/null
+++ b/drivers/gpib/tnt4882/mite.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/*
+ *   Hardware driver for NI Mite PCI interface chip
+ *
+ *   Copyright (C) 1999 David A. Schleef <ds@stm.lbl.gov>
+ */
+
+#ifndef _MITE_H_
+#define _MITE_H_
+
+#include <linux/pci.h>
+
+#define PCI_VENDOR_ID_NATINST		0x1093
+
+//#define DEBUG_MITE
+
+#ifdef DEBUG_MITE
+#define MDPRINTK(format, args...) pr_debug(format, ## args)
+#else
+#define MDPRINTK(args...)
+#endif
+
+#define MITE_RING_SIZE 3000
+struct mite_dma_chain {	/* NOTE(review): looks like one DMA link descriptor; unused by mite.c */
+	u32 count;
+	u32 addr;
+	u32 next;
+};
+
+struct mite_struct {
+	struct mite_struct *next;	/* singly linked list headed by mite_devices */
+	int used;			/* 1 after mite_setup(), 0 after mite_unsetup() */
+
+	struct pci_dev *pcidev;		/* reference taken in mite_init() */
+	unsigned long mite_phys_addr;	/* start of PCI resource 0 */
+	void __iomem *mite_io_addr;	/* ioremap() of PCI resource 0 */
+	unsigned long daq_phys_addr;	/* start of PCI resource 1 */
+	void __iomem *daq_io_addr;	/* ioremap() of PCI resource 1 */
+
+	int DMA_CheckNearEnd;		/* not touched by mite.c */
+
+	struct mite_dma_chain ring[MITE_RING_SIZE];	/* embedded in every kzalloc()ed node */
+};
+
+extern struct mite_struct *mite_devices;
+
+static inline unsigned int mite_irq(struct mite_struct *mite)	/* static: no extern copy exists */
+{
+	return mite->pcidev->irq;	/* irq field of the underlying PCI device */
+}
+
+static inline unsigned int mite_device_id(struct mite_struct *mite)	/* static: no extern copy */
+{
+	return mite->pcidev->device;	/* device field of the underlying PCI device */
+}
+
+void mite_init(void);
+void mite_cleanup(void);
+int mite_setup(struct mite_struct *mite);
+void mite_unsetup(struct mite_struct *mite);
+void mite_list_devices(void);
+
+#define CHAN_OFFSET(x)			(0x100 * (x))
+
+/* DMA base for chan 0 is 0x500, chan 1 is 0x600 */
+
+#define MITE_CHOR		0x500
+#define CHOR_DMARESET			BIT(31)
+#define CHOR_SET_SEND_TC		BIT(11)
+#define CHOR_CLR_SEND_TC		BIT(10)
+#define CHOR_SET_LPAUSE			BIT(9)
+#define CHOR_CLR_LPAUSE			BIT(8)
+#define CHOR_CLRDONE			BIT(7)
+#define CHOR_CLRRB			BIT(6)
+#define CHOR_CLRLC			BIT(5)
+#define CHOR_FRESET			BIT(4)
+#define CHOR_ABORT			BIT(3)
+#define CHOR_STOP			BIT(2)
+#define CHOR_CONT			BIT(1)
+#define CHOR_START			BIT(0)
+#define CHOR_PON			(CHOR_CLR_SEND_TC | CHOR_CLR_LPAUSE)
+
+#define MITE_CHCR		0x504
+#define CHCR_SET_DMA_IE			BIT(31)
+#define CHCR_CLR_DMA_IE			BIT(30)
+#define CHCR_SET_LINKP_IE		BIT(29)
+#define CHCR_CLR_LINKP_IE		BIT(28)
+#define CHCR_SET_SAR_IE			BIT(27)
+#define CHCR_CLR_SAR_IE			BIT(26)
+#define CHCR_SET_DONE_IE		BIT(25)
+#define CHCR_CLR_DONE_IE		BIT(24)
+#define CHCR_SET_MRDY_IE		BIT(23)
+#define CHCR_CLR_MRDY_IE		BIT(22)
+#define CHCR_SET_DRDY_IE		BIT(21)
+#define CHCR_CLR_DRDY_IE		BIT(20)
+#define CHCR_SET_LC_IE			BIT(19)
+#define CHCR_CLR_LC_IE			BIT(18)
+#define CHCR_SET_CONT_RB_IE		BIT(17)
+#define CHCR_CLR_CONT_RB_IE		BIT(16)
+#define CHCR_FIFODIS			BIT(15)
+#define CHCR_FIFO_ON			0
+#define CHCR_BURSTEN			BIT(14)
+#define CHCR_NO_BURSTEN			0
+#define CHCR_NFTP(x)			((x) << 11)
+#define CHCR_NFTP0			CHCR_NFTP(0)
+#define CHCR_NFTP1			CHCR_NFTP(1)
+#define CHCR_NFTP2			CHCR_NFTP(2)
+#define CHCR_NFTP4			CHCR_NFTP(3)
+#define CHCR_NFTP8			CHCR_NFTP(4)
+#define CHCR_NFTP16			CHCR_NFTP(5)
+#define CHCR_NETP(x)			((x) << 11)
+#define CHCR_NETP0			CHCR_NETP(0)
+#define CHCR_NETP1			CHCR_NETP(1)
+#define CHCR_NETP2			CHCR_NETP(2)
+#define CHCR_NETP4			CHCR_NETP(3)
+#define CHCR_NETP8			CHCR_NETP(4)
+#define CHCR_CHEND1			BIT(5)
+#define CHCR_CHEND0			BIT(4)
+#define CHCR_DIR			BIT(3)
+#define CHCR_DEV_TO_MEM			CHCR_DIR
+#define CHCR_MEM_TO_DEV			0
+#define CHCR_NORMAL			((0) << 0)
+#define CHCR_CONTINUE			((1) << 0)
+#define CHCR_RINGBUFF			((2) << 0)
+#define CHCR_LINKSHORT			((4) << 0)
+#define CHCR_LINKLONG			((5) << 0)
+#define CHCRPON				(CHCR_CLR_DMA_IE | CHCR_CLR_LINKP_IE | CHCR_CLR_SAR_IE | \
+					 CHCR_CLR_DONE_IE | CHCR_CLR_MRDY_IE | CHCR_CLR_DRDY_IE | \
+					 CHCR_CLR_LC_IE | CHCR_CLR_CONT_RB_IE) /* all eight CLR_*_IE bits */
+
+#define MITE_TCR		0x508
+
+/* CR bits */
+#define CR_RL(x)			((x) << 21)
+#define CR_RL0				CR_RL(0)
+#define CR_RL1				CR_RL(1)
+#define CR_RL2				CR_RL(2)
+#define CR_RL4				CR_RL(3)
+#define CR_RL8				CR_RL(4)
+#define CR_RL16				CR_RL(5)
+#define CR_RL32				CR_RL(6)
+#define CR_RL64				CR_RL(7)
+#define CR_RD(x)			((x) << 19)
+#define CR_RD0				CR_RD(0)
+#define CR_RD32				CR_RD(1)
+#define CR_RD512			CR_RD(2)
+#define CR_RD8192			CR_RD(3)
+#define CR_REQS(x)			((x) << 16)
+#define CR_REQSDRQ0			CR_REQS(4)
+#define CR_REQSDRQ1			CR_REQS(5)
+#define CR_REQSDRQ2			CR_REQS(6)
+#define CR_REQSDRQ3			CR_REQS(7)
+#define CR_ASEQX(x)			((x) << 10)
+#define CR_ASEQX0			CR_ASEQX(0)
+#define	CR_ASEQDONT			CR_ASEQX0
+#define CR_ASEQXP1			CR_ASEQX(1)
+#define CR_ASEQUP			CR_ASEQXP1
+#define CR_ASEQXP2			CR_ASEQX(2)
+#define CR_ASEQDOWN			CR_ASEQXP2
+#define CR_ASEQXP4			CR_ASEQX(3)
+#define CR_ASEQXP8			CR_ASEQX(4)
+#define CR_ASEQXP16			CR_ASEQX(5)
+#define CR_ASEQXP32			CR_ASEQX(6)
+#define CR_ASEQXP64			CR_ASEQX(7)
+#define CR_ASEQXM1			CR_ASEQX(9)
+#define CR_ASEQXM2			CR_ASEQX(10)
+#define CR_ASEQXM4			CR_ASEQX(11)
+#define CR_ASEQXM8			CR_ASEQX(12)
+#define CR_ASEQXM16			CR_ASEQX(13)
+#define CR_ASEQXM32			CR_ASEQX(14)
+#define CR_ASEQXM64			CR_ASEQX(15)
+#define CR_PSIZEBYTE			BIT(8)
+#define CR_PSIZEHALF			(2 << 8)
+#define CR_PSIZEWORD			(3 << 8)
+#define CR_PORTCPU			(0 << 6)
+#define CR_PORTIO			BIT(6)
+#define CR_PORTVXI			(2 << 6)
+#define CR_PORTMXI			(3 << 6)
+#define CR_AMDEVICE			BIT(0)
+
+#define CHSR_INT			0x80000000
+#define CHSR_DONE			0x02000000
+#define CHSR_LINKC			0x00080000
+
+#define MITE_MCR		0x50c
+#define	MCRPON				0
+
+#define MITE_MAR		0x510
+
+#define MITE_DCR		0x514
+#define DCR_NORMAL			BIT(29)
+#define DCRPON				0
+
+#define MITE_DAR		0x518
+
+#define MITE_LKCR		0x51c
+
+#define MITE_LKAR		0x520
+#define MITE_LLKAR		0x524
+#define MITE_BAR		0x528
+#define MITE_BCR		0x52c
+#define MITE_SAR		0x530
+#define MITE_WSCR		0x534
+#define MITE_WSER		0x538
+#define MITE_CHSR		0x53c
+#define MITE_FCR		0x540
+
+#define MITE_FIFO		0x80
+#define MITE_FIFOEND		0xff
+
+#define MITE_AMRAM		        0x00
+#define MITE_AMDEVICE		        0x01
+#define MITE_AMHOST_A32_SINGLE	        0x09
+#define MITE_AMHOST_A24_SINGLE	        0x39
+#define MITE_AMHOST_A16_SINGLE	        0x29
+#define MITE_AMHOST_A32_BLOCK	        0x0b
+#define MITE_AMHOST_A32D64_BLOCK	0x08
+#define MITE_AMHOST_A24_BLOCK	        0x3b
+
+enum mite_registers {
+	MITE_IODWBSR = 0xc0,	// IO Device Window Base Size Register
+	MITE_CSIGR = 0x460,	// chip signature
+	MITE_IODWBSR_1 = 0xc4,	// IO Device Window Base Size Register 1 (used by 6602 boards)
+	MITE_IODWCR_1 = 0xf4
+};
+
+enum MITE_IODWBSR_bits {
+	WENAB = 0x80,		// window enable
+	WENAB_6602 = 0x8c	// window enable for 6602 boards
+};
+
+#endif
+
diff --git a/drivers/gpib/tnt4882/tnt4882_gpib.c b/drivers/gpib/tnt4882/tnt4882_gpib.c
new file mode 100644
index 000000000000..c03a976b7380
--- /dev/null
+++ b/drivers/gpib/tnt4882/tnt4882_gpib.c
@@ -0,0 +1,1838 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/***************************************************************************
+ * National Instruments boards using tnt4882 or compatible chips (at-gpib, etc).
+ *    copyright            : (C) 2001, 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
+#define DRV_NAME KBUILD_MODNAME
+
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/isapnp.h>
+
+#include "nec7210.h"
+#include "gpibP.h"
+#include "mite.h"
+#include "tnt4882_registers.h"
+
+static const int ISAPNP_VENDOR_ID_NI = ISAPNP_VENDOR('N', 'I', 'C');
+static const int ISAPNP_ID_NI_ATGPIB_TNT = 0xc601;
+enum {
+	PCI_DEVICE_ID_NI_GPIB = 0xc801,
+	PCI_DEVICE_ID_NI_GPIB_PLUS = 0xc811,
+	PCI_DEVICE_ID_NI_GPIB_PLUS2 = 0x71ad,
+	PCI_DEVICE_ID_NI_PXIGPIB = 0xc821,
+	PCI_DEVICE_ID_NI_PMCGPIB = 0xc831,
+	PCI_DEVICE_ID_NI_PCIEGPIB = 0x70cf,
+	PCI_DEVICE_ID_NI_PCIE2GPIB = 0x710e,
+// Measurement Computing PCI-488 same design as PCI-GPIB with TNT5004
+	PCI_DEVICE_ID_MC_PCI488 = 0x7259,
+	PCI_DEVICE_ID_CEC_NI_GPIB = 0x7258
+};
+
+// struct which defines private_data for tnt4882 devices (stored in board->private_data)
+struct tnt4882_priv {
+	struct nec7210_priv nec7210_priv;	// holds mmiobase, offset, type, state, register_page_lock
+	struct mite_struct *mite;
+	struct pnp_dev *pnp_dev;
+	unsigned int irq;
+	unsigned short imr0_bits;
+	unsigned short imr3_bits;	// copy of IMR3; changed under board->spinlock, then written out
+	unsigned short auxg_bits;	// bits written to auxiliary register G
+};
+
+static irqreturn_t tnt4882_internal_interrupt(struct gpib_board *board);
+
+// register offset for nec7210 compatible registers
+static const int atgpib_reg_offset = 2;
+
+// number of ioports used
+static const int atgpib_iosize = 32;
+
+/* paged io: write AUX_PAGEIN to AUXMR, wait 1 us, then access; tnt_readb() holds the page lock */
+static inline unsigned int tnt_paged_readb(struct tnt4882_priv *priv, unsigned long offset)
+{
+	iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset);
+	udelay(1);
+	return ioread8(priv->nec7210_priv.mmiobase + offset);	/* offset is not scaled here */
+}
+
+static inline void tnt_paged_writeb(struct tnt4882_priv *priv, unsigned int value,
+				    unsigned long offset)
+{
+	iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset);
+	udelay(1);	/* same AUX_PAGEIN-then-delay sequence as tnt_paged_readb() */
+	iowrite8(value, priv->nec7210_priv.mmiobase + offset);	/* offset is not scaled here */
+}
+
+/* readb/writeb wrappers: every access runs under register_page_lock with irqs disabled */
+static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long offset)
+{
+	void __iomem *address = priv->nec7210_priv.mmiobase + offset;
+	unsigned long flags;
+	unsigned short retval;
+	spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock;
+
+	spin_lock_irqsave(register_lock, flags);
+	switch (offset) {
+	case CSR:
+	case SASR:
+	case ISR0:
+	case BSR:	/* these four registers are handled per chip type */
+		switch (priv->nec7210_priv.type) {
+		case TNT4882:
+		case TNT5004:
+			retval = ioread8(address);
+			break;
+		case NAT4882:	/* read through the page-in sequence at a shifted offset */
+			retval = tnt_paged_readb(priv, offset - tnt_pagein_offset);
+			break;
+		case NEC7210:	/* reads as 0 for this chip type, same as the default case */
+			retval = 0;
+			break;
+		default:
+			retval = 0;
+			break;
+		}
+		break;
+	default:	/* every other register is a plain 8-bit read */
+		retval = ioread8(address);
+		break;
+	}
+	spin_unlock_irqrestore(register_lock, flags);
+	return retval;
+}
+
+static inline void tnt_writeb(struct tnt4882_priv *priv, unsigned short value, unsigned long offset)
+{
+	void __iomem *address = priv->nec7210_priv.mmiobase + offset;
+	unsigned long flags;
+	spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock;
+
+	spin_lock_irqsave(register_lock, flags);
+	switch (offset)	{
+	case KEYREG:
+	case IMR0:
+	case BCR:	/* these three registers are handled per chip type */
+		switch (priv->nec7210_priv.type) {
+		case TNT4882:
+		case TNT5004:
+			iowrite8(value, address);
+			break;
+		case NAT4882:	/* written through the page-in sequence at a shifted offset */
+			tnt_paged_writeb(priv, value, offset - tnt_pagein_offset);
+			break;
+		case NEC7210:	/* the write is dropped for this chip type, same as default */
+			break;
+		default:
+			break;
+		}
+		break;
+	default:	/* every other register is a plain 8-bit write */
+		iowrite8(value, address);
+		break;
+	}
+	spin_unlock_irqrestore(register_lock, flags);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GPIB driver for National Instruments boards using tnt4882 or compatible chips");
+
+// Report the GPIB control lines: each BSR bit that is set adds the matching BUS_* flag
+// on top of VALID_ALL.
+static int tnt4882_line_status(const struct gpib_board *board)
+{
+	static const struct {
+		int bsr_bit;
+		int bus_flag;
+	} line_map[] = {
+		{ BCSR_REN_BIT, BUS_REN },
+		{ BCSR_IFC_BIT, BUS_IFC },
+		{ BCSR_SRQ_BIT, BUS_SRQ },
+		{ BCSR_EOI_BIT, BUS_EOI },
+		{ BCSR_NRFD_BIT, BUS_NRFD },
+		{ BCSR_NDAC_BIT, BUS_NDAC },
+		{ BCSR_DAV_BIT, BUS_DAV },
+		{ BCSR_ATN_BIT, BUS_ATN },
+	};
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	const int bcsr_bits = tnt_readb(tnt_priv, BSR);
+	int status = VALID_ALL;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(line_map); i++) {
+		if (bcsr_bits & line_map[i].bsr_bit)
+			status |= line_map[i].bus_flag;
+	}
+
+	return status;
+}
+
+static int tnt4882_t1_delay(struct gpib_board *board, unsigned int nano_sec)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	unsigned int retval;	// NOTE(review): unsigned here, returned through an int
+
+	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
+	if (nec_priv->type == NEC7210)
+		return retval;	// plain NEC7210: KEYREG and AUXRI are left untouched
+
+	if (nano_sec <= 350) {
+		tnt_writeb(tnt_priv, MSTD, KEYREG);
+		retval = 350;
+	} else {
+		tnt_writeb(tnt_priv, 0, KEYREG);
+	}
+	if (nano_sec > 500 && nano_sec <= 1100)	{
+		write_byte(nec_priv, AUXRI | USTD, AUXMR);
+		retval = 1100;
+	} else {
+		write_byte(nec_priv, AUXRI, AUXMR);
+	}
+	return retval;	// 350, 1100, or the value nec7210_t1_delay() returned
+}
+
+/* 1 when STS2 has both AEFN and BEFN set (what drain_fifo_words() requires per read), else 0. */
+static int fifo_word_available(struct tnt4882_priv *tnt_priv)
+{
+	const int sts2 = tnt_readb(tnt_priv, STS2);
+
+	if (!(sts2 & AEFN))
+		return 0;
+
+	return (sts2 & BEFN) != 0;
+}
+
+/* 1 when STS2 has at least one of AEFN / BEFN set, else 0. */
+static int fifo_byte_available(struct tnt4882_priv *tnt_priv)
+{
+	const int fifo_flags = AEFN | BEFN;
+	int sts2;
+
+	sts2 = tnt_readb(tnt_priv, STS2);
+
+	return (sts2 & fifo_flags) != 0;
+}
+
+/* Returns the S_DONE / S_HALT bits of STS1 unchanged; callers test the result for nonzero. */
+static int fifo_xfer_done(struct tnt4882_priv *tnt_priv)
+{
+	const int done_mask = S_DONE | S_HALT;
+	int sts1;
+
+	sts1 = tnt_readb(tnt_priv, STS1);
+
+	return sts1 & done_mask;
+}
+
+/* Copy whole 16-bit FIFO words into buffer, low byte first; returns the bytes stored. */
+static int drain_fifo_words(struct tnt4882_priv *tnt_priv, u8 *buffer, int num_bytes)
+{
+	void __iomem *fifo = tnt_priv->nec7210_priv.mmiobase + FIFOB;
+	u8 *out = buffer;
+
+	while (fifo_word_available(tnt_priv) && (out - buffer) + 2 <= num_bytes) {
+		const short word = ioread16(fifo);
+
+		*out++ = word & 0xff;
+		*out++ = (word >> 8) & 0xff;
+	}
+	return out - buffer;
+}
+
+static void tnt4882_release_holdoff(struct gpib_board *board, struct tnt4882_priv *tnt_priv)
+{
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	unsigned short sasr_bits;
+
+	sasr_bits = tnt_readb(tnt_priv, SASR);
+
+	/*
+	 * tnt4882 not in one-chip mode won't always release holdoff unless we
+	 * are in the right mode when release handshake command is given
+	 */
+	if (sasr_bits & AEHS_BIT) /* holding off due to holdoff on end mode*/	{
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
+		write_byte(nec_priv, AUX_FH, AUXMR);
+	} else if (sasr_bits & ANHS1_BIT) { /* held off due to holdoff on all data mode*/
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDA);
+		write_byte(nec_priv, AUX_FH, AUXMR);
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);	/* every path ends in HR_HLDE */
+	} else { /* held off due to holdoff immediately command; same sequence as the AEHS case */
+		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
+		write_byte(nec_priv, AUX_FH, AUXMR);
+	}
+}
+
+static int tnt4882_accel_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+			      size_t *bytes_read)
+{
+	size_t count = 0;
+	ssize_t retval = 0;
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	unsigned int bits;
+	s32 hw_count;
+	unsigned long flags;
+
+	*bytes_read = 0;
+	// FIXME: really, DEV_CLEAR_BN should happen elsewhere to prevent race
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
+	clear_bit(ADR_CHANGE_BN, &nec_priv->state);
+
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_ENDIE, HR_ENDIE);
+	if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
+	else
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+	tnt_writeb(tnt_priv, nec_priv->auxa_bits | HR_HLDA, CCR);
+	bits = TNT_B_16BIT | TNT_IN | TNT_CCEN;
+	tnt_writeb(tnt_priv, bits, CFG);
+	tnt_writeb(tnt_priv, RESET_FIFO, CMDR);
+	udelay(1);
+	// load 2's complement of count into hardware counters (CNT0 gets the low byte)
+	hw_count = -length;
+	tnt_writeb(tnt_priv, hw_count & 0xff, CNT0);
+	tnt_writeb(tnt_priv, (hw_count >> 8) & 0xff, CNT1);
+	tnt_writeb(tnt_priv, (hw_count >> 16) & 0xff, CNT2);
+	tnt_writeb(tnt_priv, (hw_count >> 24) & 0xff, CNT3);
+
+	tnt4882_release_holdoff(board, tnt_priv);
+
+	tnt_writeb(tnt_priv, GO, CMDR);
+	udelay(1);
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	tnt_priv->imr3_bits |= HR_DONE | HR_NEF;
+	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	while (count + 2 <= length &&
+	       test_bit(RECEIVED_END_BN, &nec_priv->state) == 0 &&
+	       fifo_xfer_done(tnt_priv) == 0) {
+		// wait until a word is ready
+		if (wait_event_interruptible(board->wait,
+					     fifo_word_available(tnt_priv) ||
+					     fifo_xfer_done(tnt_priv) ||
+					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					     test_bit(ADR_CHANGE_BN, &nec_priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (test_bit(TIMO_NUM, &board->status))	{
+			retval = -ETIMEDOUT;
+			break;
+		}
+		if (test_bit(DEV_CLEAR_BN, &nec_priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+		if (test_bit(ADR_CHANGE_BN, &nec_priv->state)) {
+			retval = -EINTR;
+			break;
+		}
+
+		spin_lock_irqsave(&board->spinlock, flags);
+		count += drain_fifo_words(tnt_priv, &buffer[count], length - count);
+		tnt_priv->imr3_bits |= HR_NEF;
+		tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		if (need_resched())
+			schedule();
+	}
+	// wait for last byte (also entered after an error break above, where count < length)
+	if (count < length) {
+		spin_lock_irqsave(&board->spinlock, flags);
+		tnt_priv->imr3_bits |= HR_DONE | HR_NEF;
+		tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		if (wait_event_interruptible(board->wait,
+					     fifo_xfer_done(tnt_priv) ||
+					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
+					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+					     test_bit(ADR_CHANGE_BN, &nec_priv->state) ||
+					     test_bit(TIMO_NUM, &board->status))) {
+			retval = -ERESTARTSYS;
+		}
+		if (test_bit(TIMO_NUM, &board->status))
+			retval = -ETIMEDOUT;
+		if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+			retval = -EINTR;
+		if (test_bit(ADR_CHANGE_BN, &nec_priv->state))
+			retval = -EINTR;
+		count += drain_fifo_words(tnt_priv, &buffer[count], length - count);
+		if (fifo_byte_available(tnt_priv) && count < length)
+			buffer[count++] = tnt_readb(tnt_priv, FIFOB);
+	}
+	if (count < length)
+		tnt_writeb(tnt_priv, STOP, CMDR);
+	udelay(1);
+
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_ENDIE, 0);
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
+	/*
+	 * force handling of any pending interrupts (seems to be needed
+	 * to keep interrupts from getting hosed, plus for syncing
+	 * with RECEIVED_END below)
+	 */
+	tnt4882_internal_interrupt(board);
+	/* RECEIVED_END should be in sync now; *end is only ever set to 1 here, never cleared */
+	if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
+		*end = 1;
+	if (retval < 0)	{
+		// force immediate holdoff
+		write_byte(nec_priv, AUX_HLDI, AUXMR);
+
+		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+	}
+	*bytes_read = count;
+
+	return retval;
+}
+
+// 1 when STS2 has both AFFN and BFFN set, else 0;
+// generic_write() keeps pushing 16-bit words into the FIFO while this holds.
+static int fifo_space_available(struct tnt4882_priv *tnt_priv)
+{
+	const int sts2 = tnt_readb(tnt_priv, STS2);
+	const int a_ok = !!(sts2 & AFFN);
+	const int b_ok = !!(sts2 & BFFN);
+
+	return a_ok & b_ok;
+}
+
+/* Read CNT0..CNT3 (CNT0 = low byte) as one 32-bit value and return its two's complement. */
+static unsigned int tnt_transfer_count(struct tnt4882_priv *tnt_priv)
+{
+	u32 count;	/* bytes are widened to u32 before shifting, so "<< 24" cannot overflow */
+
+	count = (u32)tnt_readb(tnt_priv, CNT0) & 0xff;
+	count |= ((u32)tnt_readb(tnt_priv, CNT1) & 0xff) << 8;
+	count |= ((u32)tnt_readb(tnt_priv, CNT2) & 0xff) << 16;
+	count |= ((u32)tnt_readb(tnt_priv, CNT3) & 0xff) << 24;
+	return -count;
+}
+
+static int write_wait(struct gpib_board *board, struct tnt4882_priv *tnt_priv,
+		      int wait_for_done, int send_commands)
+{
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	if (wait_event_interruptible(board->wait,
+				     (!wait_for_done && fifo_space_available(tnt_priv)) ||
+				     fifo_xfer_done(tnt_priv) ||
+				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
+				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
+				     test_bit(TIMO_NUM, &board->status)))
+		return -ERESTARTSYS;	/* the wait itself was interrupted */
+
+	if (test_bit(TIMO_NUM, &board->status))
+		return -ETIMEDOUT;
+	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
+		return (send_commands) ? -ENOTCONN : -ECOMM;	/* the error flag is consumed */
+	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
+		return -EINTR;
+	return 0;	/* woke for FIFO space (only if !wait_for_done) or a finished transfer */
+}
+
+// Common FIFO write path for data (send_commands == 0) and command bytes (send_commands != 0).
+// Returns 0 or a negative errno; *bytes_written is length minus tnt_transfer_count().
+static int generic_write(struct gpib_board *board, u8 *buffer, size_t length,
+			 int send_eoi, int send_commands, size_t *bytes_written)
+{
+	size_t count = 0;
+	ssize_t retval = 0;
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	unsigned int bits;
+	s32 hw_count;
+	unsigned long flags;
+
+	*bytes_written = 0;
+	// FIXME: really, DEV_CLEAR_BN should happen elsewhere to prevent race
+	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
+
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_ERRIE, HR_ERRIE);
+
+	if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
+	else
+		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
+
+	tnt_writeb(tnt_priv, RESET_FIFO, CMDR);
+	udelay(1);
+
+	bits = TNT_B_16BIT;
+	if (send_eoi) {
+		bits |= TNT_CCEN;
+		if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
+			tnt_writeb(tnt_priv, AUX_SEOI, CCR);
+	}
+	if (send_commands)
+		bits |= TNT_COMMAND;
+	tnt_writeb(tnt_priv, bits, CFG);
+
+	// load 2's complement of count into hardware counters
+	hw_count = -length;
+	tnt_writeb(tnt_priv, hw_count & 0xff, CNT0);
+	tnt_writeb(tnt_priv, (hw_count >> 8) & 0xff, CNT1);
+	tnt_writeb(tnt_priv, (hw_count >> 16) & 0xff, CNT2);
+	tnt_writeb(tnt_priv, (hw_count >> 24) & 0xff, CNT3);
+
+	tnt_writeb(tnt_priv, GO, CMDR);
+	udelay(1);
+
+	spin_lock_irqsave(&board->spinlock, flags);
+	tnt_priv->imr3_bits |= HR_DONE;
+	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+	spin_unlock_irqrestore(&board->spinlock, flags);
+
+	while (count < length)	{
+		// wait until byte is ready to be sent
+		retval = write_wait(board, tnt_priv, 0, send_commands);
+		if (retval < 0)
+			break;
+		if (fifo_xfer_done(tnt_priv))
+			break;
+		spin_lock_irqsave(&board->spinlock, flags);
+		while (fifo_space_available(tnt_priv) && count < length) {
+			u16 word;
+
+			word = buffer[count++] & 0xff;
+			if (count < length)
+				word |= (buffer[count++] << 8) & 0xff00;
+			iowrite16(word, nec_priv->mmiobase + FIFOB);
+		}
+		// HR_NFF is deliberately not enabled in IMR3 here: avoids unnecessary interrupts
+		spin_unlock_irqrestore(&board->spinlock, flags);
+
+		if (need_resched())
+			schedule();
+	}
+	// wait last byte has been sent
+	if (retval == 0)
+		retval = write_wait(board, tnt_priv, 1, send_commands);
+
+	tnt_writeb(tnt_priv, STOP, CMDR);
+	udelay(1);
+
+	nec7210_set_reg_bits(nec_priv, IMR1, HR_ERRIE, 0x0);	// same mask that enabled it above
+	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0x0);
+	/*
+	 * force handling of any interrupts that happened
+	 * while they were masked (this appears to be needed)
+	 */
+	tnt4882_internal_interrupt(board);
+	*bytes_written = length - tnt_transfer_count(tnt_priv);
+	return retval;
+}
+
+/* FIFO-based data write: generic_write() with command mode switched off. */
+static int tnt4882_accel_write(struct gpib_board *board, u8 *buffer,
+			       size_t length, int send_eoi, size_t *bytes_written)
+{
+	const int send_commands = 0;
+
+	return generic_write(board, buffer, length, send_eoi, send_commands, bytes_written);
+}
+
+/* FIFO-based command write: generic_write() in command mode, without EOI. */
+static int tnt4882_command(struct gpib_board *board, u8 *buffer, size_t length,
+			   size_t *bytes_written)
+{
+	const int send_eoi = 0;
+	const int send_commands = 1;
+
+	return generic_write(board, buffer, length, send_eoi, send_commands, bytes_written);
+}
+
+/*
+ * Interrupt service body, also called directly from generic_write().
+ *
+ * Under board->spinlock it runs the nec7210 handler, reads ISR0 and ISR3,
+ * queues an EVENT_IFC when TNT_IFCI_BIT is set, drops HR_NFF/HR_NEF/HR_DONE
+ * from the cached IMR3 mask once the matching status bit has been seen and,
+ * if HR_INTR or HR_TLCI is set, writes the mask back and wakes sleepers on
+ * board->wait.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t tnt4882_internal_interrupt(struct gpib_board *board)
+{
+	struct tnt4882_priv *priv = board->private_data;
+	int isr0_bits, isr3_bits, imr3_bits;
+	unsigned long flags;
+
+	spin_lock_irqsave(&board->spinlock, flags);
+
+	nec7210_interrupt(board, &priv->nec7210_priv);
+
+	isr0_bits = tnt_readb(priv, ISR0);
+	isr3_bits = tnt_readb(priv, ISR3);
+	/* snapshot of the mask before it is updated; only used by dev_dbg() */
+	imr3_bits = priv->imr3_bits;
+
+	if (isr0_bits & TNT_IFCI_BIT)
+		push_gpib_event(board, EVENT_IFC);
+	// XXX don't need this wakeup, one below should do?
+//		wake_up_interruptible(&board->wait);
+
+	/* each of these sources is masked again once it has been seen */
+	if (isr3_bits & HR_NFF)
+		priv->imr3_bits &= ~HR_NFF;
+	if (isr3_bits & HR_NEF)
+		priv->imr3_bits &= ~HR_NEF;
+	if (isr3_bits & HR_DONE)
+		priv->imr3_bits &= ~HR_DONE;
+	if (isr3_bits & (HR_INTR | HR_TLCI)) {
+		dev_dbg(board->gpib_dev, "minor %i isr0 0x%x imr0 0x%x isr3 0x%x imr3 0x%x\n",
+			board->minor, isr0_bits, priv->imr0_bits, isr3_bits, imr3_bits);
+		tnt_writeb(priv, priv->imr3_bits, IMR3);
+		wake_up_interruptible(&board->wait);
+	}
+	spin_unlock_irqrestore(&board->spinlock, flags);
+	return IRQ_HANDLED;
+}
+
+/* IRQ entry point; @arg is the gpib_board passed to request_irq(). */
+static irqreturn_t tnt4882_interrupt(int irq, void *arg)
+{
+	struct gpib_board *board = arg;
+
+	return tnt4882_internal_interrupt(board);
+}
+
+// wrappers for interface functions
+/*
+ * Read through nec7210_read().  When that fails, force an immediate
+ * holdoff, mark it in the nec7210 state and read one pending data byte
+ * whose END indication is discarded.
+ *
+ * Returns the value returned by nec7210_read().
+ */
+static int tnt4882_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
+			size_t *bytes_read)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	int discarded_end;
+	int retval;
+
+	retval = nec7210_read(board, nec_priv, buffer, length, end, bytes_read);
+	if (retval >= 0)
+		return retval;
+
+	/* force immediate holdoff */
+	write_byte(nec_priv, AUX_HLDI, AUXMR);
+	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+	nec7210_read_data_in(board, nec_priv, &discarded_end);
+
+	return retval;
+}
+
+/* Forward a data write to nec7210_write(). */
+static int tnt4882_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
+			 size_t *bytes_written)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_write(board, nec_priv, buffer, length, send_eoi, bytes_written);
+}
+
+/* Forward a command write to nec7210_command(). */
+static int tnt4882_command_unaccel(struct gpib_board *board, u8 *buffer,
+				   size_t length, size_t *bytes_written)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_command(board, nec_priv, buffer, length, bytes_written);
+}
+
+/* Forward to nec7210_take_control(). */
+static int tnt4882_take_control(struct gpib_board *board, int synchronous)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_take_control(board, nec_priv, synchronous);
+}
+
+/* Forward to nec7210_go_to_standby(). */
+static int tnt4882_go_to_standby(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_go_to_standby(board, nec_priv);
+}
+
+/*
+ * Acquire or release system control.
+ *
+ * When acquiring, SETSC is written to CMDR before the nec7210 library is
+ * called; when releasing, CLRSC is written after the library call.
+ *
+ * Returns the value returned by nec7210_request_system_control().
+ */
+static int tnt4882_request_system_control(struct gpib_board *board, int request_control)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	int retval;
+
+	if (request_control) {
+		tnt_writeb(tnt_priv, SETSC, CMDR);
+		udelay(1);
+		return nec7210_request_system_control(board, nec_priv, request_control);
+	}
+
+	retval = nec7210_request_system_control(board, nec_priv, request_control);
+	tnt_writeb(tnt_priv, CLRSC, CMDR);
+	udelay(1);
+	return retval;
+}
+
+/* Forward to nec7210_interface_clear(). */
+static void tnt4882_interface_clear(struct gpib_board *board, int assert)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_interface_clear(board, nec_priv, assert);
+}
+
+/* Forward to nec7210_remote_enable(). */
+static void tnt4882_remote_enable(struct gpib_board *board, int enable)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_remote_enable(board, nec_priv, enable);
+}
+
+/* Forward to nec7210_enable_eos(). */
+static int tnt4882_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_enable_eos(board, nec_priv, eos_byte, compare_8_bits);
+}
+
+/* Forward to nec7210_disable_eos(). */
+static void tnt4882_disable_eos(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_disable_eos(board, nec_priv);
+}
+
+/*
+ * Refresh board->status.
+ *
+ * Under board->spinlock: clears the bits in @clear_mask, lets the nec7210
+ * library update the status, then sets or clears SRQI_NUM according to
+ * BCSR_SRQ_BIT in the BSR register.
+ *
+ * Returns board->status, read after the lock has been dropped.
+ */
+static unsigned int tnt4882_update_status(struct gpib_board *board, unsigned int clear_mask)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	unsigned long irq_flags;
+	u8 bsr;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+
+	board->status &= ~clear_mask;
+	nec7210_update_status_nolock(board, &tnt_priv->nec7210_priv);
+
+	/* set / clear SRQ state since it is not cleared by interrupt */
+	bsr = tnt_readb(tnt_priv, BSR);
+	if (bsr & BCSR_SRQ_BIT)
+		set_bit(SRQI_NUM, &board->status);
+	else
+		clear_bit(SRQI_NUM, &board->status);
+
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+
+	return board->status;
+}
+
+/* Forward to nec7210_primary_address(). */
+static int tnt4882_primary_address(struct gpib_board *board, unsigned int address)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_primary_address(board, nec_priv, address);
+}
+
+/* Forward to nec7210_secondary_address(). */
+static int tnt4882_secondary_address(struct gpib_board *board, unsigned int address, int enable)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_secondary_address(board, nec_priv, address, enable);
+}
+
+/*
+ * Conduct a parallel poll and store the response byte in @result.
+ *
+ * NEC7210 chips go through nec7210_parallel_poll().  All other chip types
+ * set RPP2_BIT in the cached auxg_bits value written to AUXMR, wait 2 us,
+ * read CPTR and clear the bit again; that path always returns 0.
+ */
+static int tnt4882_parallel_poll(struct gpib_board *board, u8 *result)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	if (nec_priv->type == NEC7210)
+		return nec7210_parallel_poll(board, nec_priv, result);
+
+	tnt_priv->auxg_bits |= RPP2_BIT;
+	write_byte(nec_priv, tnt_priv->auxg_bits, AUXMR);
+	udelay(2);	// FIXME use parallel poll timeout
+	*result = read_byte(nec_priv, CPTR);
+	tnt_priv->auxg_bits &= ~RPP2_BIT;
+	write_byte(nec_priv, tnt_priv->auxg_bits, AUXMR);
+
+	return 0;
+}
+
+/*
+ * Apply the parallel poll configuration byte @config.
+ *
+ * TNT5004 chips are configured locally through AUXMR; every other chip
+ * type is handled by nec7210_parallel_poll_configure().
+ */
+static void tnt4882_parallel_poll_configure(struct gpib_board *board, u8 config)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	if (nec_priv->type != TNT5004) {
+		nec7210_parallel_poll_configure(board, nec_priv, config);
+		return;
+	}
+
+	/* configure locally */
+	write_byte(nec_priv, AUXRI | 0x4, AUXMR);
+	/* non-zero: set response + clear sense; zero: disable ppoll (0x10) */
+	write_byte(nec_priv, PPR | (config ? config : 0x10), AUXMR);
+}
+
+/* Forward to nec7210_parallel_poll_response(). */
+static void tnt4882_parallel_poll_response(struct gpib_board *board, int ist)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_parallel_poll_response(board, nec_priv, ist);
+}
+
+/*
+ * Forward to nec7210_serial_poll_response().
+ *
+ * This is just used by the old nec7210 isa interfaces; the newer boards
+ * use tnt4882_serial_poll_response2.
+ */
+static void tnt4882_serial_poll_response(struct gpib_board *board, u8 status)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_serial_poll_response(board, nec_priv, status);
+}
+
+/*
+ * Update the serial poll status byte and the service request state.
+ *
+ * With the request-service bit set in @status and @new_reason_for_service
+ * non-zero, srq_pending is set, SPOLL_NUM is cleared in board->status and
+ * AUX_REQT is issued.  With the request-service bit clear, srq_pending is
+ * reset and AUX_REQF is issued.  In every case @status, minus the
+ * request-service bit, is then written to SPMR.  All of this happens
+ * under board->spinlock.
+ */
+static void tnt4882_serial_poll_response2(struct gpib_board *board, u8 status,
+					  int new_reason_for_service)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+	const int mss = status & request_service_bit;
+	const int reqt = mss && new_reason_for_service;
+	const int reqf = !mss;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&board->spinlock, irq_flags);
+
+	if (reqt) {
+		nec_priv->srq_pending = 1;
+		clear_bit(SPOLL_NUM, &board->status);
+		/*
+		 * It may seem like a race to issue reqt before updating
+		 * the status byte, but it is not.  The chip does not
+		 * issue the reqt until the SPMR is written to at
+		 * a later time.
+		 */
+		write_byte(nec_priv, AUX_REQT, AUXMR);
+	} else if (reqf) {
+		nec_priv->srq_pending = 0;
+		write_byte(nec_priv, AUX_REQF, AUXMR);
+	}
+
+	/*
+	 * Bit 6 of the status byte must always be zero when it is written to
+	 * the SPMR, to ensure we are using serial poll mode SP1 and not
+	 * accidentally triggering mode SP3.
+	 */
+	write_byte(nec_priv, status & ~request_service_bit, SPMR);
+
+	spin_unlock_irqrestore(&board->spinlock, irq_flags);
+}
+
+/* Forward to nec7210_serial_poll_status(). */
+static u8 tnt4882_serial_poll_status(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	return nec7210_serial_poll_status(board, nec_priv);
+}
+
+/* Forward to nec7210_return_to_local(). */
+static void tnt4882_return_to_local(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	nec7210_return_to_local(board, nec_priv);
+}
+
+/*
+ * Mask all IMR0 and IMR3 interrupt sources (cached copies and registers),
+ * read both registers back, then reset the nec7210 core.
+ */
+static void tnt4882_board_reset(struct tnt4882_priv *tnt_priv, struct gpib_board *board)
+{
+	/* clear the cached mask first, then push it to the chip */
+	tnt_priv->imr0_bits = 0;
+	tnt_writeb(tnt_priv, tnt_priv->imr0_bits, IMR0);
+
+	tnt_priv->imr3_bits = 0;
+	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+
+	/* read both registers back; the values are not used */
+	tnt_readb(tnt_priv, IMR0);
+	tnt_readb(tnt_priv, IMR3);
+
+	nec7210_board_reset(&tnt_priv->nec7210_priv, board);
+}
+
+/*
+ * Allocate the zero-initialised driver private data, store it in
+ * board->private_data and initialise the embedded nec7210 state.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails, in which case
+ * board->private_data is left NULL.
+ */
+static int tnt4882_allocate_private(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv;
+
+	/* kzalloc() replaces the former kmalloc() + memset() pair */
+	tnt_priv = kzalloc(sizeof(*tnt_priv), GFP_KERNEL);
+	board->private_data = tnt_priv;
+	if (!tnt_priv)
+		return -ENOMEM;
+
+	init_nec7210_private(&tnt_priv->nec7210_priv);
+	return 0;
+}
+
+/* Free the driver private data and clear the board's pointer to it. */
+static void tnt4882_free_private(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+
+	kfree(tnt_priv);
+	board->private_data = NULL;
+}
+
+/*
+ * Bring the chip from reset to online.
+ *
+ * Called by the attach functions once register access and the interrupt
+ * handler are set up.  The sequence is: software reset, force 7210 mode
+ * (at both the normal and the swapped AUXCR address), select one-chip mode
+ * for TNT4882/TNT5004, reset the nec7210 core, enable interrupt
+ * pass-through, force an immediate holdoff, put the nec7210 core online
+ * and finally enable the IMR0 sources.
+ */
+static void tnt4882_init(struct tnt4882_priv *tnt_priv, const struct gpib_board *board)
+{
+	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
+
+	/* Turbo488 software reset */
+	tnt_writeb(tnt_priv, SOFT_RESET, CMDR);
+	udelay(1);
+
+	// turn off one-chip mode
+	tnt_writeb(tnt_priv, NODMA, HSSEL);
+	tnt_writeb(tnt_priv, 0, ACCWR);
+	// make sure we are in 7210 mode
+	tnt_writeb(tnt_priv, AUX_7210, AUXCR);
+	udelay(1);
+	// registers might be swapped, so write it to the swapped address too
+	tnt_writeb(tnt_priv, AUX_7210, SWAPPED_AUXCR);
+	udelay(1);
+	// turn on one-chip mode
+	if (nec_priv->type == TNT4882 || nec_priv->type == TNT5004)
+		tnt_writeb(tnt_priv, NODMA | TNT_ONE_CHIP_BIT, HSSEL);
+	else
+		tnt_writeb(tnt_priv, NODMA, HSSEL);
+
+	nec7210_board_reset(nec_priv, board);
+	// read-clear isr0
+	tnt_readb(tnt_priv, ISR0);
+
+	// enable passing of nat4882 interrupts
+	tnt_priv->imr3_bits = HR_TLCI;
+	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
+
+	// enable interrupt
+	tnt_writeb(tnt_priv, 0x1, INTRT);
+
+	// force immediate holdoff
+	write_byte(&tnt_priv->nec7210_priv, AUX_HLDI, AUXMR);
+
+	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
+
+	/* auxg_bits caches the AUXRG value; the parallel poll code modifies it */
+	tnt_priv->auxg_bits = AUXRG | NTNL_BIT;
+	write_byte(&tnt_priv->nec7210_priv, tnt_priv->auxg_bits, AUXMR);
+
+	nec7210_board_online(nec_priv, board);
+	// enable interface clear interrupt for event queue
+	tnt_priv->imr0_bits = TNT_IMR0_ALWAYS_BITS | TNT_ATNI_BIT | TNT_IFCIE_BIT;
+	tnt_writeb(tnt_priv, tnt_priv->imr0_bits, IMR0);
+}
+
+/*
+ * Attach to the first unused, supported PCI board on the mite device list.
+ *
+ * A non-negative config->pci_bus and/or config->pci_slot restricts the
+ * search to that bus/slot.  On success the private data is allocated, the
+ * mite is set up, the (shared) interrupt is requested, the chip type is
+ * detected from CSR and the chip is initialised.
+ *
+ * Returns 0 on success, -ENOMEM if the private data cannot be allocated,
+ * -ENODEV if no matching board is found, or the error returned by
+ * mite_setup() / request_irq().
+ *
+ * Nothing acquired here is released on the error paths of this function -
+ * presumably the caller invokes ni_pci_detach() on failure; TODO confirm.
+ */
+static int ni_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct tnt4882_priv *tnt_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = IRQF_SHARED;
+	int retval;
+	struct mite_struct *mite;
+
+	board->status = 0;
+
+	if (tnt4882_allocate_private(board))
+		return -ENOMEM;
+	tnt_priv = board->private_data;
+	nec_priv = &tnt_priv->nec7210_priv;
+	/* default chip type; refined by the CSR probe further down */
+	nec_priv->type = TNT4882;
+	nec_priv->read_byte = nec7210_locking_iomem_read_byte;
+	nec_priv->write_byte = nec7210_locking_iomem_write_byte;
+	nec_priv->offset = atgpib_reg_offset;
+
+	if (!mite_devices)
+		return -ENODEV;
+
+	/* pick the first unused mite that matches the bus/slot filter and a known ID */
+	for (mite = mite_devices; mite; mite = mite->next) {
+		short found_board;
+
+		if (mite->used)
+			continue;
+		if (config->pci_bus >= 0 && config->pci_bus != mite->pcidev->bus->number)
+			continue;
+		if (config->pci_slot >= 0 && config->pci_slot != PCI_SLOT(mite->pcidev->devfn))
+			continue;
+		switch (mite_device_id(mite)) {
+		case PCI_DEVICE_ID_NI_GPIB:
+		case PCI_DEVICE_ID_NI_GPIB_PLUS:
+		case PCI_DEVICE_ID_NI_GPIB_PLUS2:
+		case PCI_DEVICE_ID_NI_PXIGPIB:
+		case PCI_DEVICE_ID_NI_PMCGPIB:
+		case PCI_DEVICE_ID_NI_PCIEGPIB:
+		case PCI_DEVICE_ID_NI_PCIE2GPIB:
+// support for Measurement Computing PCI-488
+		case PCI_DEVICE_ID_MC_PCI488:
+		case PCI_DEVICE_ID_CEC_NI_GPIB:
+			found_board = 1;
+			break;
+		default:
+			found_board = 0;
+			break;
+		}
+		if (found_board)
+			break;
+	}
+	/* the loop ran off the end of the list without a match */
+	if (!mite)
+		return -ENODEV;
+
+	tnt_priv->mite = mite;
+	retval = mite_setup(tnt_priv->mite);
+	if (retval < 0)
+		return retval;
+
+	nec_priv->mmiobase = tnt_priv->mite->daq_io_addr;
+
+	// get irq
+	retval = request_irq(mite_irq(tnt_priv->mite), tnt4882_interrupt, isr_flags, "ni-pci-gpib",
+			     board);
+	if (retval) {
+		dev_err(board->gpib_dev, "failed to obtain pci irq %d\n", mite_irq(tnt_priv->mite));
+		return retval;
+	}
+	/* recorded only after success so that detach knows whether to free it */
+	tnt_priv->irq = mite_irq(tnt_priv->mite);
+
+	// TNT5004 detection
+	/* any other CSR value leaves the TNT4882 default in place */
+	switch (tnt_readb(tnt_priv, CSR) & 0xf0) {
+	case 0x30:
+		nec_priv->type = TNT4882;
+		break;
+	case 0x40:
+		nec_priv->type = TNT5004;
+		break;
+	}
+	tnt4882_init(tnt_priv, board);
+
+	return 0;
+}
+
+/*
+ * Undo ni_pci_attach().  Each resource is released only if attach got far
+ * enough to record it, so this is safe after a partially failed attach or
+ * when no private data exists.
+ */
+static void ni_pci_detach(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+
+	if (tnt_priv) {
+		/* the registers can only be touched once they are mapped */
+		if (tnt_priv->nec7210_priv.mmiobase)
+			tnt4882_board_reset(tnt_priv, board);
+		if (tnt_priv->irq)
+			free_irq(tnt_priv->irq, board);
+		if (tnt_priv->mite)
+			mite_unsetup(tnt_priv->mite);
+	}
+
+	tnt4882_free_private(board);
+}
+
+/*
+ * Find, attach and activate the NI ISA PnP device identified by
+ * ISAPNP_ID_NI_ATGPIB_TNT.
+ *
+ * @dev always receives the result of the lookup, even on failure.
+ *
+ * Returns 0 on success, -ENODEV if no device (or no card) is found, -EBUSY
+ * if the device cannot be attached, -EAGAIN if it cannot be activated and
+ * -EINVAL if its first port or irq resource is not valid.  The device is
+ * detached again on the last two failures.
+ */
+static int ni_isapnp_find(struct pnp_dev **dev)
+{
+	struct pnp_dev *pnp;
+	int retval;
+
+	pnp = pnp_find_dev(NULL, ISAPNP_VENDOR_ID_NI,
+			   ISAPNP_FUNCTION(ISAPNP_ID_NI_ATGPIB_TNT), NULL);
+	*dev = pnp;
+	if (!pnp || !pnp->card)
+		return -ENODEV;
+	if (pnp_device_attach(pnp) < 0)
+		return -EBUSY;
+
+	if (pnp_activate_dev(pnp) < 0) {
+		retval = -EAGAIN;
+		goto detach;
+	}
+	if (!pnp_port_valid(pnp, 0) || !pnp_irq_valid(pnp, 0)) {
+		retval = -EINVAL;
+		goto detach;
+	}
+	return 0;
+
+detach:
+	pnp_device_detach(pnp);
+	return retval;
+}
+
+/*
+ * Common attach path for the ISA boards.
+ *
+ * @chipset selects the nec7210 chip variant recorded in the private data.
+ * When config->ibbase is zero the board is located through ISA PnP,
+ * otherwise config->ibbase and config->ibirq are used directly.
+ *
+ * Returns 0 on success, -ENOMEM if the private data cannot be allocated,
+ * -EBUSY if the I/O region cannot be reserved or mapped, or the error
+ * returned by ni_isapnp_find() / request_irq().
+ */
+static int ni_isa_attach_common(struct gpib_board *board, const struct gpib_board_config *config,
+				enum nec7210_chipset chipset)
+{
+	struct tnt4882_priv *tnt_priv;
+	struct nec7210_priv *nec_priv;
+	int isr_flags = 0;
+	u32 iobase;
+	int irq;
+	int retval;
+
+	board->status = 0;
+
+	if (tnt4882_allocate_private(board))
+		return -ENOMEM;
+	tnt_priv = board->private_data;
+	nec_priv = &tnt_priv->nec7210_priv;
+	nec_priv->type = chipset;
+	nec_priv->read_byte = nec7210_locking_ioport_read_byte;
+	nec_priv->write_byte = nec7210_locking_ioport_write_byte;
+	nec_priv->offset = atgpib_reg_offset;
+
+	// look for plug-n-play board
+	if (config->ibbase == 0) {
+		struct pnp_dev *dev;
+
+		retval = ni_isapnp_find(&dev);
+		if (retval < 0)
+			return retval;
+		tnt_priv->pnp_dev = dev;
+		iobase = pnp_port_start(dev, 0);
+		irq = pnp_irq(dev, 0);
+	} else {
+		iobase = config->ibbase;
+		irq = config->ibirq;
+	}
+	// allocate ioports
+	if (!request_region(iobase, atgpib_iosize, "atgpib"))
+		return -EBUSY;
+
+	nec_priv->mmiobase = ioport_map(iobase, atgpib_iosize);
+	if (!nec_priv->mmiobase) {
+		/* iobase is not recorded yet, so detach could not release it */
+		release_region(iobase, atgpib_iosize);
+		return -EBUSY;
+	}
+	/*
+	 * ni_isa_detach() keys its board reset and release_region() on
+	 * nec_priv->iobase, so the port base must be recorded here.
+	 */
+	nec_priv->iobase = iobase;
+
+	// get irq
+	retval = request_irq(irq, tnt4882_interrupt, isr_flags, "atgpib", board);
+	if (retval) {
+		dev_err(board->gpib_dev, "failed to request ISA irq %d\n", irq);
+		return retval;
+	}
+	/* recorded only after success so that detach knows whether to free it */
+	tnt_priv->irq = irq;
+
+	tnt4882_init(tnt_priv, board);
+
+	return 0;
+}
+
+/* ISA attach for boards carrying a TNT4882 chip. */
+static int ni_tnt_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	const enum nec7210_chipset chipset = TNT4882;
+
+	return ni_isa_attach_common(board, config, chipset);
+}
+
+/* ISA attach for boards carrying a NAT4882 chip. */
+static int ni_nat4882_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	const enum nec7210_chipset chipset = NAT4882;
+
+	return ni_isa_attach_common(board, config, chipset);
+}
+
+/* ISA attach for boards carrying a NEC7210 chip. */
+static int ni_nec_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	const enum nec7210_chipset chipset = NEC7210;
+
+	return ni_isa_attach_common(board, config, chipset);
+}
+
+/*
+ * Undo an ISA attach.  Each resource is released only if the matching
+ * field in the private data is set, so this is safe after a partially
+ * failed attach or when no private data exists.
+ *
+ * NOTE(review): the board reset and release_region() below depend on
+ * nec_priv->iobase being non-zero - confirm that the attach path records
+ * the port base there, otherwise neither of them ever runs.
+ */
+static void ni_isa_detach(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (tnt_priv) {
+		nec_priv = &tnt_priv->nec7210_priv;
+		if (nec_priv->iobase)
+			tnt4882_board_reset(tnt_priv, board);
+		if (tnt_priv->irq)
+			free_irq(tnt_priv->irq, board);
+		if (nec_priv->mmiobase)
+			ioport_unmap(nec_priv->mmiobase);
+		if (nec_priv->iobase)
+			release_region(nec_priv->iobase, atgpib_iosize);
+		if (tnt_priv->pnp_dev)
+			pnp_device_detach(tnt_priv->pnp_dev);
+	}
+	tnt4882_free_private(board);
+}
+
+/*
+ * PCI probe callback.  It does no work and always reports success; the
+ * boards are set up later, in ni_pci_attach().
+ */
+static int tnt4882_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	return 0;
+}
+
+/*
+ * "ni_pci": PCI boards, using the FIFO based (accel) read/write/command
+ * functions.  Apart from .name this table matches ni_pci_accel_interface.
+ */
+static struct gpib_interface ni_pci_interface = {
+	.name = "ni_pci",
+	.attach = ni_pci_attach,
+	.detach = ni_pci_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_pci_accel": PCI boards, using the FIFO based (accel)
+ * read/write/command functions.  Apart from .name this table matches
+ * ni_pci_interface.
+ */
+static struct gpib_interface ni_pci_accel_interface = {
+	.name = "ni_pci_accel",
+	.attach = ni_pci_attach,
+	.detach = ni_pci_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_isa": ISA boards attached as TNT4882, using the FIFO based (accel)
+ * read/write/command functions.  Apart from .name this table matches
+ * ni_isa_accel_interface.
+ */
+static struct gpib_interface ni_isa_interface = {
+	.name = "ni_isa",
+	.attach = ni_tnt_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_nat4882_isa": ISA boards attached as NAT4882, with read, write and
+ * command all routed through the nec7210 library wrappers.
+ */
+static struct gpib_interface ni_nat4882_isa_interface = {
+	.name = "ni_nat4882_isa",
+	.attach = ni_nat4882_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_read,
+	.write = tnt4882_write,
+	.command = tnt4882_command_unaccel,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_nec_isa": ISA boards attached as NEC7210, with read, write and
+ * command all routed through the nec7210 library wrappers.  This table
+ * provides no line_status callback and uses the older
+ * serial_poll_response callback instead of serial_poll_response2.
+ */
+static struct gpib_interface ni_nec_isa_interface = {
+	.name = "ni_nec_isa",
+	.attach = ni_nec_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_read,
+	.write = tnt4882_write,
+	.command = tnt4882_command_unaccel,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = NULL,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response = tnt4882_serial_poll_response,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_isa_accel": ISA boards attached as TNT4882, using the FIFO based
+ * (accel) read/write/command functions.  Apart from .name this table
+ * matches ni_isa_interface.
+ */
+static struct gpib_interface ni_isa_accel_interface = {
+	.name = "ni_isa_accel",
+	.attach = ni_tnt_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_nat4882_isa_accel": ISA boards attached as NAT4882, using the FIFO
+ * based (accel) read and write functions but the nec7210 library command
+ * function.
+ */
+static struct gpib_interface ni_nat4882_isa_accel_interface = {
+	.name = "ni_nat4882_isa_accel",
+	.attach = ni_nat4882_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command_unaccel,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response2 = tnt4882_serial_poll_response2,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * "ni_nec_isa_accel": ISA boards attached as NEC7210, using the FIFO based
+ * (accel) read and write functions but the nec7210 library command
+ * function.  This table provides no line_status callback and uses the
+ * older serial_poll_response callback instead of serial_poll_response2.
+ */
+static struct gpib_interface ni_nec_isa_accel_interface = {
+	.name = "ni_nec_isa_accel",
+	.attach = ni_nec_isa_attach,
+	.detach = ni_isa_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command_unaccel,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = NULL,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response = tnt4882_serial_poll_response,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+/*
+ * PCI IDs this module binds to; exported through MODULE_DEVICE_TABLE() and
+ * used as the id_table of tnt4882_pci_driver.  The device IDs mirror the
+ * ones accepted by the switch in ni_pci_attach().
+ *
+ * NOTE(review): the Measurement Computing PCI-488 entry is paired with the
+ * National Instruments vendor ID - confirm that this vendor ID is intended.
+ */
+static const struct pci_device_id tnt4882_pci_table[] = {
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB_PLUS)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB_PLUS2)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PXIGPIB)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PMCGPIB)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PCIEGPIB)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PCIE2GPIB)},
+	// support for Measurement Computing PCI-488
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_MC_PCI488)},
+	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_CEC_NI_GPIB)},
+	{ 0 }
+};
+MODULE_DEVICE_TABLE(pci, tnt4882_pci_table);
+
+/*
+ * PCI driver registered by tnt4882_init_module().  Its probe callback does
+ * nothing and no remove callback is provided.
+ */
+static struct pci_driver tnt4882_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = tnt4882_pci_table,
+	.probe = &tnt4882_pci_probe
+};
+
+#if 0
+/* unused, will be needed when the driver is turned into a pnp_driver */
+static const struct pnp_device_id tnt4882_pnp_table[] = {
+	{.id = "NICC601"},
+	{.id = ""}
+};
+MODULE_DEVICE_TABLE(pnp, tnt4882_pnp_table);
+#endif
+
+#ifdef CONFIG_GPIB_PCMCIA
+static struct gpib_interface ni_pcmcia_interface;
+static struct gpib_interface ni_pcmcia_accel_interface;
+static int __init init_ni_gpib_cs(void);
+static void __exit exit_ni_gpib_cs(void);
+#endif
+
+/*
+ * Module init: register the PCI driver, then every gpib interface table
+ * (ISA, NAT4882 ISA, NEC ISA and PCI, each in plain and accel flavour,
+ * plus the PCMCIA ones when CONFIG_GPIB_PCMCIA is set), and finally call
+ * mite_init().
+ *
+ * On the first failure everything registered so far is unregistered in
+ * reverse order and the error code is returned.
+ */
+static int __init tnt4882_init_module(void)
+{
+	int result;
+
+	result = pci_register_driver(&tnt4882_pci_driver);
+	if (result) {
+		pr_err("pci_register_driver failed: error = %d\n", result);
+		return result;
+	}
+
+	result = gpib_register_driver(&ni_isa_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_isa;
+	}
+
+	result = gpib_register_driver(&ni_isa_accel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_isa_accel;
+	}
+
+	result = gpib_register_driver(&ni_nat4882_isa_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_nat4882_isa;
+	}
+
+	result = gpib_register_driver(&ni_nat4882_isa_accel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_nat4882_isa_accel;
+	}
+
+	result = gpib_register_driver(&ni_nec_isa_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_nec_isa;
+	}
+
+	result = gpib_register_driver(&ni_nec_isa_accel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_nec_isa_accel;
+	}
+
+	result = gpib_register_driver(&ni_pci_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_pci;
+	}
+
+	result = gpib_register_driver(&ni_pci_accel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_pci_accel;
+	}
+
+#ifdef CONFIG_GPIB_PCMCIA
+	result = gpib_register_driver(&ni_pcmcia_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_pcmcia;
+	}
+
+	result = gpib_register_driver(&ni_pcmcia_accel_interface, THIS_MODULE);
+	if (result) {
+		pr_err("gpib_register_driver failed: error = %d\n", result);
+		goto err_pcmcia_accel;
+	}
+
+	result = init_ni_gpib_cs();
+	if (result) {
+		pr_err("pcmcia_register_driver failed: error = %d\n", result);
+		goto err_pcmcia_driver;
+	}
+#endif
+
+	mite_init();
+
+	return 0;
+
+	/*
+	 * Unwind ladder: each label is named after the registration that
+	 * failed and undoes only the registrations that preceded it.
+	 */
+#ifdef CONFIG_GPIB_PCMCIA
+err_pcmcia_driver:
+	gpib_unregister_driver(&ni_pcmcia_accel_interface);
+err_pcmcia_accel:
+	gpib_unregister_driver(&ni_pcmcia_interface);
+err_pcmcia:
+#endif
+	gpib_unregister_driver(&ni_pci_accel_interface);
+err_pci_accel:
+	gpib_unregister_driver(&ni_pci_interface);
+err_pci:
+	gpib_unregister_driver(&ni_nec_isa_accel_interface);
+err_nec_isa_accel:
+	gpib_unregister_driver(&ni_nec_isa_interface);
+err_nec_isa:
+	gpib_unregister_driver(&ni_nat4882_isa_accel_interface);
+err_nat4882_isa_accel:
+	gpib_unregister_driver(&ni_nat4882_isa_interface);
+err_nat4882_isa:
+	gpib_unregister_driver(&ni_isa_accel_interface);
+err_isa_accel:
+	gpib_unregister_driver(&ni_isa_interface);
+err_isa:
+	pci_unregister_driver(&tnt4882_pci_driver);
+
+	return result;
+}
+
+/*
+ * Module exit: unregister every gpib interface table registered by
+ * tnt4882_init_module() (and the PCMCIA driver when CONFIG_GPIB_PCMCIA is
+ * set), call mite_cleanup() and finally unregister the PCI driver.
+ */
+static void __exit tnt4882_exit_module(void)
+{
+	gpib_unregister_driver(&ni_isa_interface);
+	gpib_unregister_driver(&ni_isa_accel_interface);
+	gpib_unregister_driver(&ni_nat4882_isa_interface);
+	gpib_unregister_driver(&ni_nat4882_isa_accel_interface);
+	gpib_unregister_driver(&ni_nec_isa_interface);
+	gpib_unregister_driver(&ni_nec_isa_accel_interface);
+	gpib_unregister_driver(&ni_pci_interface);
+	gpib_unregister_driver(&ni_pci_accel_interface);
+#ifdef CONFIG_GPIB_PCMCIA
+	gpib_unregister_driver(&ni_pcmcia_interface);
+	gpib_unregister_driver(&ni_pcmcia_accel_interface);
+	exit_ni_gpib_cs();
+#endif
+
+	mite_cleanup();
+
+	pci_unregister_driver(&tnt4882_pci_driver);
+}
+
+#ifdef CONFIG_GPIB_PCMCIA
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/ptrace.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+
+#include <pcmcia/cistpl.h>
+#include <pcmcia/cisreg.h>
+#include <pcmcia/ds.h>
+
+static int ni_gpib_config(struct pcmcia_device  *link);
+static void ni_gpib_release(struct pcmcia_device *link);
+static void ni_pcmcia_detach(struct gpib_board *board);
+
+/*
+ * A linked list of "instances" of the dummy device.  Each actual
+ * PCMCIA card corresponds to one device instance, and is described
+ * by one dev_link_t structure (defined in ds.h).
+ *
+ * You may not want to use a linked list for this -- for example, the
+ * memory card driver uses an array of dev_link_t pointers, where minor
+ * device numbers are used to derive the corresponding array index.
+ *
+ * I think this dev_list is obsolete but the pointer is needed to keep
+ * the module instance for the ni_pcmcia_attach function.
+ */
+
+static struct pcmcia_device   *curr_dev;
+/* Per-card private data; ni_gpib_probe() stores it in pcmcia_device->priv. */
+struct local_info_t {
+	struct pcmcia_device	*p_dev;	/* owning card, set in ni_gpib_probe() */
+	struct gpib_board		*dev;	/* board bound by ni_pcmcia_attach() */
+	int			stop;	/* NOTE(review): appears unused */
+	struct bus_operations	*bus;	/* NOTE(review): appears unused */
+};
+
+/*
+ * ni_gpib_probe() creates an "instance" of the driver, allocating
+ * local data structures for one device and configuring the card.
+ * Returns 0 on success, -ENOMEM if the private data cannot be
+ * allocated, or the error reported by ni_gpib_config().
+ */
+static int ni_gpib_probe(struct pcmcia_device *link)
+{
+	struct local_info_t *info;
+	int retval;
+
+	/* Allocate space for private device-specific data */
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->p_dev = link;
+	link->priv = info;
+	/* Socket defaults; everything else is taken from the card's CIS. */
+	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
+
+	retval = ni_gpib_config(link);
+	if (retval) {
+		/* .remove is not called after a failed probe: clean up here. */
+		link->priv = NULL;
+		kfree(info);
+		return retval;
+	}
+	/* Publish the card to ni_pcmcia_attach() only once it is usable. */
+	curr_dev = link;
+	return 0;
+}
+
+/*
+ * ni_gpib_remove() deletes a driver "instance": a board still bound to
+ * this card is detached, the PCMCIA device is disabled and the private
+ * data is freed.  curr_dev is cleared first when it refers to this
+ * card, because ni_pcmcia_attach() dereferences it and link->priv.
+ */
+static void ni_gpib_remove(struct pcmcia_device *link)
+{
+	struct local_info_t *info = link->priv;
+
+	/* Do not leave a dangling pointer behind for ni_pcmcia_attach(). */
+	if (curr_dev == link)
+		curr_dev = NULL;
+	if (info->dev)
+		ni_pcmcia_detach(info->dev);
+	ni_gpib_release(link);
+	kfree(info);
+}
+
+static int ni_gpib_config_iteration(struct pcmcia_device *link, void *priv_data)
+{
+	/*
+	 * Callback handed to pcmcia_loop_config() by ni_gpib_config().
+	 * It requests the I/O resources for @link and passes the result
+	 * of pcmcia_request_io() through unchanged (0 means success);
+	 * priv_data is not used.
+	 */
+	return pcmcia_request_io(link);
+}
+
+/*
+ * ni_gpib_config() is scheduled to run after a CARD_INSERTION event
+ * is received, to configure the PCMCIA socket, and to make the
+ * device available to the system.
+ */
+static int ni_gpib_config(struct pcmcia_device *link)
+{
+	int err;
+
+	/* Pick a configuration from the CIS, then switch the device on. */
+	err = pcmcia_loop_config(link, ni_gpib_config_iteration, NULL);
+	if (err) {
+		dev_warn(&link->dev, "no configuration found\n");
+		goto fail;
+	}
+	err = pcmcia_enable_device(link);
+	if (err)
+		goto fail;
+	return 0;
+
+fail:
+	/* Every failure path disables the device before returning. */
+	ni_gpib_release(link);
+	return err;
+}
+
+/*
+ * ni_gpib_release() disables the PCMCIA device.  Its only action is
+ * pcmcia_disable_device(); nothing is unregistered or postponed here,
+ * whether or not the device is still open.
+ */
+static void ni_gpib_release(struct pcmcia_device *link)
+{
+	pcmcia_disable_device(link);
+} /* ni_gpib_release */
+
+static int ni_gpib_suspend(struct pcmcia_device *link)
+{
+	/*
+	 * PCMCIA suspend hook.  No state is saved and nothing is shut
+	 * down: a warning is logged when the device is still marked
+	 * open, and suspend is always reported as successful (0).
+	 */
+	if (link->open)
+		dev_warn(&link->dev, "Device still open\n");
+	return 0;
+}
+
+static int ni_gpib_resume(struct pcmcia_device *link)
+{
+	/*
+	 * PCMCIA resume hook: configure the card again by calling
+	 * ni_gpib_config() and return its result.
+	 *
+	 * NOTE(review): nothing here re-initialises a board that was
+	 * attached before suspend (the old attempt was commented
+	 * out) -- confirm whether that is required.
+	 */
+	return ni_gpib_config(link);
+}
+/* PCMCIA manufacturer/card ID pairs matched by ni_gpib_cs_driver. */
+static struct pcmcia_device_id ni_pcmcia_ids[] = {
+	PCMCIA_DEVICE_MANF_CARD(0x010b, 0x4882),
+	PCMCIA_DEVICE_MANF_CARD(0x010b, 0x0c71), // NI PCMCIA-GPIB+
+	PCMCIA_DEVICE_NULL
+};
+
+MODULE_DEVICE_TABLE(pcmcia, ni_pcmcia_ids);
+/* PCMCIA driver: ties ni_pcmcia_ids to the probe/remove/suspend/resume hooks. */
+static struct pcmcia_driver ni_gpib_cs_driver = {
+	.name           = "ni_gpib_cs",
+	.owner		= THIS_MODULE,
+	.drv = { .name = "ni_gpib_cs", },
+	.id_table	= ni_pcmcia_ids,
+	.probe		= ni_gpib_probe,
+	.remove		= ni_gpib_remove,
+	.suspend	= ni_gpib_suspend,
+	.resume		= ni_gpib_resume,
+};
+/* Register ni_gpib_cs_driver with the PCMCIA core; returns its result. */
+static int __init init_ni_gpib_cs(void)
+{
+	return pcmcia_register_driver(&ni_gpib_cs_driver);
+}
+/* Unregister ni_gpib_cs_driver; counterpart of init_ni_gpib_cs(). */
+static void __exit exit_ni_gpib_cs(void)
+{
+	pcmcia_unregister_driver(&ni_gpib_cs_driver);
+}
+
+static const int pcmcia_gpib_iosize = 32; /* size ni_pcmcia_detach() releases */
+/*
+ * Attach hook: bind @board to the card in curr_dev and claim its I/O
+ * region and IRQ.  Returns 0 or a negative errno; @config is unused.
+ * NOTE(review): most error paths do not unwind here -- presumably the
+ * caller invokes ->detach() after a failed attach; confirm.
+ */
+static int ni_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
+{
+	struct local_info_t *info;
+	struct tnt4882_priv *tnt_priv;
+	struct nec7210_priv *nec_priv;
+	struct resource *io;
+	int retval;
+
+	if (!curr_dev)
+		return -ENODEV;
+	info = curr_dev->priv;
+	info->dev = board;
+	board->status = 0;
+	if (tnt4882_allocate_private(board))
+		return -ENOMEM;
+	tnt_priv = board->private_data;
+	nec_priv = &tnt_priv->nec7210_priv;
+	nec_priv->type = TNT4882;
+	nec_priv->read_byte = nec7210_locking_ioport_read_byte;
+	nec_priv->write_byte = nec7210_locking_ioport_write_byte;
+	nec_priv->offset = atgpib_reg_offset;
+
+	io = curr_dev->resource[0];
+	if (!request_region(io->start, resource_size(io), DRV_NAME))
+		return -ENOMEM;
+	nec_priv->mmiobase = ioport_map(io->start, resource_size(io));
+	if (!nec_priv->mmiobase) {
+		/* iobase is still unset, so detach would not release this. */
+		release_region(io->start, resource_size(io));
+		return -ENOMEM;
+	}
+	/* ni_pcmcia_detach() keys the region release on iobase: record it. */
+	nec_priv->iobase = io->start;
+	retval = request_irq(curr_dev->irq, tnt4882_interrupt, IRQF_SHARED, DRV_NAME, board);
+	if (retval) {
+		dev_err(board->gpib_dev, "failed to obtain PCMCIA irq %d\n", curr_dev->irq);
+		return retval;
+	}
+	tnt_priv->irq = curr_dev->irq;
+	tnt4882_init(tnt_priv, board);
+	return 0;
+}
+/* Undo ni_pcmcia_attach(); the board is reset before its port mapping goes away. */
+static void ni_pcmcia_detach(struct gpib_board *board)
+{
+	struct tnt4882_priv *tnt_priv = board->private_data;
+	struct nec7210_priv *nec_priv;
+
+	if (tnt_priv) {
+		nec_priv = &tnt_priv->nec7210_priv;
+		if (tnt_priv->irq)
+			free_irq(tnt_priv->irq, board);
+		if (nec_priv->iobase)
+			tnt4882_board_reset(tnt_priv, board);
+		if (nec_priv->mmiobase)
+			ioport_unmap(nec_priv->mmiobase);
+		if (nec_priv->iobase)
+			release_region(nec_priv->iobase, pcmcia_gpib_iosize);
+	}
+	tnt4882_free_private(board);
+}
+/* GPIB interface "ni_pcmcia"; read/write use the tnt4882_accel_* helpers. */
+static struct gpib_interface ni_pcmcia_interface = {
+	.name = "ni_pcmcia",
+	.attach = ni_pcmcia_attach,
+	.detach = ni_pcmcia_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response = tnt4882_serial_poll_response,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+/* GPIB interface "ni_pcmcia_accel"; same callbacks as "ni_pcmcia", other name. */
+static struct gpib_interface ni_pcmcia_accel_interface = {
+	.name = "ni_pcmcia_accel",
+	.attach = ni_pcmcia_attach,
+	.detach = ni_pcmcia_detach,
+	.read = tnt4882_accel_read,
+	.write = tnt4882_accel_write,
+	.command = tnt4882_command,
+	.take_control = tnt4882_take_control,
+	.go_to_standby = tnt4882_go_to_standby,
+	.request_system_control = tnt4882_request_system_control,
+	.interface_clear = tnt4882_interface_clear,
+	.remote_enable = tnt4882_remote_enable,
+	.enable_eos = tnt4882_enable_eos,
+	.disable_eos = tnt4882_disable_eos,
+	.parallel_poll = tnt4882_parallel_poll,
+	.parallel_poll_configure = tnt4882_parallel_poll_configure,
+	.parallel_poll_response = tnt4882_parallel_poll_response,
+	.local_parallel_poll_mode = NULL, // XXX
+	.line_status = tnt4882_line_status,
+	.update_status = tnt4882_update_status,
+	.primary_address = tnt4882_primary_address,
+	.secondary_address = tnt4882_secondary_address,
+	.serial_poll_response = tnt4882_serial_poll_response,
+	.serial_poll_status = tnt4882_serial_poll_status,
+	.t1_delay = tnt4882_t1_delay,
+	.return_to_local = tnt4882_return_to_local,
+};
+
+#endif	// CONFIG_GPIB_PCMCIA
+
+module_init(tnt4882_init_module);
+module_exit(tnt4882_exit_module);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 075e775d3868..2f92cd698bef 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -48,6 +48,4 @@ source "drivers/staging/axis-fifo/Kconfig"
 
 source "drivers/staging/vme_user/Kconfig"
 
-source "drivers/staging/gpib/Kconfig"
-
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index e681e403509c..f5b8876aa536 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -13,4 +13,3 @@ obj-$(CONFIG_MOST)		+= most/
 obj-$(CONFIG_GREYBUS)		+= greybus/
 obj-$(CONFIG_BCM2835_VCHIQ)	+= vc04_services/
 obj-$(CONFIG_XIL_AXIS_FIFO)	+= axis-fifo/
-obj-$(CONFIG_GPIB)	 	+= gpib/
diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig
deleted file mode 100644
index aa01538d5beb..000000000000
--- a/drivers/staging/gpib/Kconfig
+++ /dev/null
@@ -1,255 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-menuconfig GPIB
-	tristate "Linux GPIB drivers"
-	help
-	  Enable support for GPIB cards and dongles for Linux.  GPIB
-	  is the General Purpose Interface Bus which conforms to the
-	  IEEE488 standard.
-
-	  This set of drivers can be used with the corresponding user
-	  space library that can be found on Sourceforge under linux-gpib.
-	  Select the drivers for your hardware from the list.
-
-if GPIB
-
-config GPIB_COMMON
-	tristate "GPIB core"
-	help
-
-	  Core common driver for all GPIB drivers. It provides the
-	  interface for the userland library
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called gpib_common
-
-config GPIB_AGILENT_82350B
-	tristate "Agilent 8235xx PCI(e) adapters"
-	depends on PCI
-	select GPIB_COMMON
-	select GPIB_TMS9914
-	help
-	  Enable support for HP/Agilent/Keysight boards
-	    82350A
-	    82350B
-	    82351A
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called agilent_82350b.
-
-config GPIB_AGILENT_82357A
-	tristate "Agilent 82357a/b USB dongles"
-	select GPIB_COMMON
-	depends on USB
-	help
-	  Enable support for Agilent/Keysight 82357x USB dongles.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called agilent_82357a.
-
-config GPIB_CEC_PCI
-	tristate "CEC PCI board"
-	depends on PCI
-	depends on HAS_IOPORT
-	select GPIB_COMMON
-	select GPIB_NEC7210
-	help
-	  Enable support for Capital Equipment Corporation PCI-488
-	  and Keithly KPCI-488 boards.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called cec_gpib.
-
-config GPIB_NI_PCI_ISA
-	tristate "NI PCI/ISA compatible boards"
-	depends on ISA_BUS || PCI || PCMCIA
-	depends on HAS_IOPORT
-	depends on PCMCIA || !PCMCIA
-	depends on HAS_IOPORT_MAP
-	select GPIB_COMMON
-	select GPIB_NEC7210
-	help
-	  Enable support for National Instruments boards based
-	  on TNT4882 chips:
-	     AT-GPIB (with NAT4882 chip)
-	     AT-GPIB (with NEC7210 chip)
-	     AT-GPIB/TNT
-	     PCI-GPIB
-	     PCIe-GPIB
-	     PCI-GPIB+
-	     PCM-GPIB
-	     PXI-GPIB
-	     PCMCIA-GPIB
-	     and Capital Equipment Corporation CEC-488 board.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called tnt4882.
-
-config GPIB_CB7210
-       tristate "Measurement Computing compatible boards"
-	depends on HAS_IOPORT
-	depends on ISA_BUS || PCI || PCMCIA
-	depends on PCMCIA || !PCMCIA
-       select GPIB_COMMON
-	select GPIB_NEC7210
-       help
-       Enable support for Measurement Computing (Computer Boards):
-       CPCI_GPIB, ISA-GPIB, ISA-GPIB/LC, PCI-GPIB/1M, PCI-GPIB/300K and
-       PCMCIA-GPIB
-       Quancom PCIGPIB-1 with MC cb7210 chip
-
-	  To compile this driver as a module, choose M here: the module will be
-
-config GPIB_NI_USB
-	tristate "NI USB dongles"
-	select GPIB_COMMON
-	depends on USB
-	help
-	  Enable support for National Instruments
-	       GPIB-USB-B
-	       GPIB-USB-HS
-	       GPIB-USB-HS+
-	   Keithly
-	       KUSB-488
-	       KUSB-488A
-	   Measurement Computing (Computer Boards)
-	       USB-488
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called ni_usb.
-
-config GPIB_FLUKE
-       tristate "Fluke"
-	depends on OF
-       select GPIB_COMMON
-       select GPIB_NEC7210
-       help
-         GPIB driver for Fluke based cda devices.
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called fluke_gpib
-
-config GPIB_FMH
-       tristate "FMH FPGA based devices"
-       select GPIB_COMMON
-       select GPIB_NEC7210
-       depends on !PPC
-       depends on OF && PCI
-       help
-         GPIB driver for fmhess FPGA based devices
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called fmh_gpib
-
-config GPIB_GPIO
-       tristate "RPi GPIO bitbang"
-	depends on ARCH_BCM2835 || COMPILE_TEST
-       select GPIB_COMMON
-       help
-         GPIB bitbang driver Raspberry Pi GPIO adapters
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called gpib_bitbang
-
-config GPIB_HP82335
-       tristate "HP82335/HP27209"
-	depends on ISA_BUS
-       select GPIB_COMMON
-       select GPIB_TMS9914
-       help
-         GPIB driver for HP82335 and HP27209 boards
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called hp82335
-
-
-config GPIB_HP82341
-       tristate "HP82341x"
-       select GPIB_COMMON
-       select GPIB_TMS9914
-       depends on ISA_BUS || EISA
-       help
-         GPIB driver for HP82341 A/B/C/D boards
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called hp82341
-
-config GPIB_INES
-       tristate "INES"
-	depends on PCI || ISA_BUS || PCMCIA
-	depends on PCMCIA || !PCMCIA
-	depends on HAS_IOPORT
-       select GPIB_COMMON
-       select GPIB_NEC7210
-       help
-         GPIB driver for Ines compatible boards
-	 Ines
-	    GPIB-HS-NT
-	    GPIB for Compact PCI
-	    GPIB for PCI
-	    GPIB for PCMCIA
-	    GPIB PC/104
-	 Hameg
-	    HO80-2
-	 Quancom
-	    PCIGPIB-1 based on Ines iGPIB 72010 chip
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called ines_gpib
-	  called cb7210.
-
-config GPIB_PCMCIA
-       def_bool y
-       depends on PCMCIA && (GPIB_NI_PCI_ISA || GPIB_CB7210 || GPIB_INES)
-       help
-         Enable PCMCIA/CArdbus support for National Instruments,
-	 measurement computing boards and Ines boards.
-
-config GPIB_LPVO
-       tristate "LPVO DIY USB GPIB"
-       select GPIB_COMMON
-       depends on USB
-       help
-         Enable support for LPVO Self-made usb-gpib adapter
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called lpvo_usb_gpib
-
-config GPIB_PC2
-       tristate "PC2 PC2a"
-	depends on ISA_BUS
-	depends on HAS_IOPORT
-       select GPIB_COMMON
-       select GPIB_NEC7210
-       help
-         Enable support for pc2 and pc2a compatible adapters
-	    Capital Equipment Corporation PC-488
-	    CONTEC GP-IB(PC)
-	    Hameg HO80
-	    Iotech GP488B
-	    Keithly MBC-488
-	    Measurement Computing ISA-GPIB-PCA2
-	    National Instruments PCII, PCIIa and PCII/IIa
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called pc2_gpib
-
-
-config GPIB_TMS9914
-       tristate
-       select GPIB_COMMON
-       help
-         Enable support for TMS 9914 chip.
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called tms9914
-
-config GPIB_NEC7210
-       tristate
-       select GPIB_COMMON
-       help
-         Enable support for NEC 7210 compatible chips.
-
-	 To compile this driver as a module, choose M here: the module will be
-	 called nec7210
-
-endif # GPIB
diff --git a/drivers/staging/gpib/Makefile b/drivers/staging/gpib/Makefile
deleted file mode 100644
index d0e88f5c0844..000000000000
--- a/drivers/staging/gpib/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-
-subdir-ccflags-y += -I$(src)/include -I$(src)/uapi
-
-obj-$(CONFIG_GPIB_AGILENT_82350B) += agilent_82350b/
-obj-$(CONFIG_GPIB_AGILENT_82357A) += agilent_82357a/
-obj-$(CONFIG_GPIB_CB7210) += cb7210/
-obj-$(CONFIG_GPIB_CEC_PCI) += cec/
-obj-$(CONFIG_GPIB_COMMON) += common/
-obj-$(CONFIG_GPIB_FLUKE) += eastwood/
-obj-$(CONFIG_GPIB_FMH) += fmh_gpib/
-obj-$(CONFIG_GPIB_GPIO) += gpio/
-obj-$(CONFIG_GPIB_HP82335) += hp_82335/
-obj-$(CONFIG_GPIB_HP82341) += hp_82341/
-obj-$(CONFIG_GPIB_INES) += ines/
-obj-$(CONFIG_GPIB_LPVO) += lpvo_usb_gpib/
-obj-$(CONFIG_GPIB_NEC7210) += nec7210/
-obj-$(CONFIG_GPIB_NI_USB) += ni_usb/
-obj-$(CONFIG_GPIB_PC2) += pc2/
-obj-$(CONFIG_GPIB_TMS9914) += tms9914/
-obj-$(CONFIG_GPIB_NI_PCI_ISA) += tnt4882/
diff --git a/drivers/staging/gpib/TODO b/drivers/staging/gpib/TODO
deleted file mode 100644
index ac07dd90b4ef..000000000000
--- a/drivers/staging/gpib/TODO
+++ /dev/null
@@ -1,10 +0,0 @@
-TODO:
-- checkpatch.pl fixes
-  These checks should be ignored:
-    CHECK:ALLOC_SIZEOF_STRUCT: Prefer kmalloc(sizeof(*board->private_data)...) over kmalloc(sizeof(struct xxx_priv)...)
-    ./gpio/gpib_bitbang.c:50: ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in parenthese
-  This warning will be addressed later:  WARNING:UNDOCUMENTED_DT_STRING: DT compatible string
-- resolve XXX notes where possible
-- fix FIXME notes
-- clean-up commented-out code
-- fix typos
diff --git a/drivers/staging/gpib/agilent_82350b/Makefile b/drivers/staging/gpib/agilent_82350b/Makefile
deleted file mode 100644
index f24e1e713a63..000000000000
--- a/drivers/staging/gpib/agilent_82350b/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-
-obj-$(CONFIG_GPIB_AGILENT_82350B) += agilent_82350b.o
diff --git a/drivers/staging/gpib/agilent_82350b/agilent_82350b.c b/drivers/staging/gpib/agilent_82350b/agilent_82350b.c
deleted file mode 100644
index 01a5bb43cd2d..000000000000
--- a/drivers/staging/gpib/agilent_82350b/agilent_82350b.c
+++ /dev/null
@@ -1,896 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *   copyright            : (C) 2002, 2004 by Frank Mori Hess              *
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "agilent_82350b.h"
-#include <linux/delay.h>
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/dma.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for Agilent 82350b");
-
-static int read_transfer_counter(struct agilent_82350b_priv *a_priv);
-static unsigned short read_and_clear_event_status(struct gpib_board *board);
-static void set_transfer_counter(struct agilent_82350b_priv *a_priv, int count);
-static int agilent_82350b_write(struct gpib_board *board, u8 *buffer,
-				size_t length, int send_eoi, size_t *bytes_written);
-
-static int agilent_82350b_accel_read(struct gpib_board *board, u8 *buffer,
-				     size_t length, int *end, size_t *bytes_read)
-
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &a_priv->tms9914_priv;
-	int retval = 0;
-	unsigned short event_status;
-	int i, num_fifo_bytes;
-	/* hardware doesn't support checking for end-of-string character when using fifo */
-	if (tms_priv->eos_flags & REOS)
-		return tms9914_read(board, tms_priv, buffer, length, end, bytes_read);
-
-	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
-
-	read_and_clear_event_status(board);
-	*end = 0;
-	*bytes_read = 0;
-	if (length == 0)
-		return 0;
-	/* disable fifo for the moment */
-	writeb(DIRECTION_GPIB_TO_HOST, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-	/* handle corner case of board not in holdoff and one byte might slip in early */
-	if (tms_priv->holdoff_active == 0 && length > 1) {
-		size_t num_bytes;
-
-		retval = tms9914_read(board, tms_priv, buffer, 1, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0 || *end)
-			return retval;
-		++buffer;
-		--length;
-	}
-	tms9914_set_holdoff_mode(tms_priv, TMS9914_HOLDOFF_EOI);
-	tms9914_release_holdoff(tms_priv);
-	i = 0;
-	num_fifo_bytes = length - 1;
-	/* disable BI interrupts */
-	write_byte(tms_priv, tms_priv->imr0_bits & ~HR_BIIE, IMR0);
-	while (i < num_fifo_bytes && *end == 0) {
-		int block_size;
-		int j;
-		int count;
-
-		block_size = min(num_fifo_bytes - i, agilent_82350b_fifo_size);
-		set_transfer_counter(a_priv, block_size);
-		writeb(ENABLE_TI_TO_SRAM | DIRECTION_GPIB_TO_HOST,
-		       a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-		if (agilent_82350b_fifo_is_halted(a_priv))
-			writeb(RESTART_STREAM_BIT, a_priv->gpib_base + STREAM_STATUS_REG);
-
-		clear_bit(READ_READY_BN, &tms_priv->state);
-
-		retval = wait_event_interruptible(board->wait,
-						  ((event_status =
-						    read_and_clear_event_status(board)) &
-						   (TERM_COUNT_STATUS_BIT |
-						    BUFFER_END_STATUS_BIT)) ||
-						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
-						  test_bit(TIMO_NUM, &board->status));
-		if (retval) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		count = block_size - read_transfer_counter(a_priv);
-		for (j = 0; j < count && i < num_fifo_bytes; ++j)
-			buffer[i++] = readb(a_priv->sram_base + j);
-		if (event_status & BUFFER_END_STATUS_BIT) {
-			clear_bit(RECEIVED_END_BN, &tms_priv->state);
-
-			tms_priv->holdoff_active = 1;
-			*end = 1;
-		}
-		if (test_bit(TIMO_NUM, &board->status)) {
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-	}
-	/* re-enable BI interrupts */
-	write_byte(tms_priv, tms_priv->imr0_bits, IMR0);
-	*bytes_read += i;
-	buffer += i;
-	length -= i;
-	writeb(DIRECTION_GPIB_TO_HOST, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-	if (retval < 0)
-		return retval;
-	/* read last bytes if we havn't received an END yet */
-	if (*end == 0) {
-		size_t num_bytes;
-		/* try to make sure we holdoff after last byte read */
-		retval = tms9914_read(board, tms_priv, buffer, length, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static int translate_wait_return_value(struct gpib_board *board, int retval)
-
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &a_priv->tms9914_priv;
-
-	if (retval)
-		return -ERESTARTSYS;
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &tms_priv->state))
-		return -EINTR;
-	return 0;
-}
-
-static int agilent_82350b_accel_write(struct gpib_board *board, u8 *buffer,
-				      size_t length, int send_eoi,
-				      size_t *bytes_written)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &a_priv->tms9914_priv;
-	int i, j;
-	unsigned short event_status;
-	int retval = 0;
-	int fifotransferlength = length;
-	int block_size = 0;
-	size_t num_bytes;
-
-	*bytes_written = 0;
-	if (send_eoi)
-		--fifotransferlength;
-
-	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
-
-	writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-
-	event_status = read_and_clear_event_status(board);
-
-#ifdef EXPERIMENTAL
-	/* wait for previous BO to complete if any */
-	retval = wait_event_interruptible(board->wait,
-					  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
-					  test_bit(WRITE_READY_BN, &tms_priv->state) ||
-					  test_bit(TIMO_NUM, &board->status));
-	retval = translate_wait_return_value(board, retval);
-
-	if (retval)
-		return retval;
-#endif
-
-	if (fifotransferlength > 0) {
-		retval = agilent_82350b_write(board, buffer, 1, 0, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-
-	write_byte(tms_priv, tms_priv->imr0_bits & ~HR_BOIE, IMR0);
-	for (i = 1; i < fifotransferlength;) {
-		clear_bit(WRITE_READY_BN, &tms_priv->state);
-
-		block_size = min(fifotransferlength - i, agilent_82350b_fifo_size);
-		set_transfer_counter(a_priv, block_size);
-		for (j = 0; j < block_size; ++j, ++i) {
-			/* load data into board's sram */
-			writeb(buffer[i], a_priv->sram_base + j);
-		}
-		writeb(ENABLE_TI_TO_SRAM, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-
-		if (agilent_82350b_fifo_is_halted(a_priv))
-			writeb(RESTART_STREAM_BIT, a_priv->gpib_base + STREAM_STATUS_REG);
-
-		retval = wait_event_interruptible(board->wait,
-						  ((event_status =
-						    read_and_clear_event_status(board)) &
-						   TERM_COUNT_STATUS_BIT) ||
-						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
-						  test_bit(TIMO_NUM, &board->status));
-		writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-		num_bytes = block_size - read_transfer_counter(a_priv);
-
-		*bytes_written += num_bytes;
-		retval = translate_wait_return_value(board, retval);
-		if (retval)
-			break;
-	}
-	write_byte(tms_priv, tms_priv->imr0_bits, IMR0);
-	if (retval < 0)
-		return retval;
-
-	if (send_eoi) {
-		retval = agilent_82350b_write(board, buffer + fifotransferlength, 1, send_eoi,
-					      &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static unsigned short read_and_clear_event_status(struct gpib_board *board)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	unsigned long flags;
-	unsigned short status;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	status = a_priv->event_status_bits;
-	a_priv->event_status_bits = 0;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return status;
-}
-
-static irqreturn_t agilent_82350b_interrupt(int irq, void *arg)
-
-{
-	int tms9914_status1 = 0, tms9914_status2 = 0;
-	int event_status;
-	struct gpib_board *board = arg;
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	unsigned long flags;
-	irqreturn_t retval = IRQ_NONE;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	event_status = readb(a_priv->gpib_base + EVENT_STATUS_REG);
-	if (event_status & IRQ_STATUS_BIT)
-		retval = IRQ_HANDLED;
-
-	if (event_status & TMS9914_IRQ_STATUS_BIT) {
-		tms9914_status1 = read_byte(&a_priv->tms9914_priv, ISR0);
-		tms9914_status2 = read_byte(&a_priv->tms9914_priv, ISR1);
-		tms9914_interrupt_have_status(board, &a_priv->tms9914_priv, tms9914_status1,
-					      tms9914_status2);
-	}
-	/* write-clear status bits */
-	if (event_status & (BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT)) {
-		writeb(event_status & (BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT),
-		       a_priv->gpib_base + EVENT_STATUS_REG);
-		a_priv->event_status_bits |= event_status;
-		wake_up_interruptible(&board->wait);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-static void agilent_82350b_detach(struct gpib_board *board);
-
-static int read_transfer_counter(struct agilent_82350b_priv *a_priv)
-{
-	int lo, mid, value;
-
-	lo = readb(a_priv->gpib_base + XFER_COUNT_LO_REG);
-	mid = readb(a_priv->gpib_base + XFER_COUNT_MID_REG);
-	value = (lo & 0xff) | ((mid << 8) & 0x7f00);
-	value = ~(value - 1) & 0x7fff;
-	return value;
-}
-
-static void set_transfer_counter(struct agilent_82350b_priv *a_priv, int count)
-{
-	int complement = -count;
-
-	writeb(complement & 0xff, a_priv->gpib_base + XFER_COUNT_LO_REG);
-	writeb((complement >> 8) & 0xff, a_priv->gpib_base + XFER_COUNT_MID_REG);
-	/* I don't think the hi count reg is even used, but oh well */
-	writeb((complement >> 16) & 0xf, a_priv->gpib_base + XFER_COUNT_HI_REG);
-}
-
-/* wrappers for interface functions */
-static int agilent_82350b_read(struct gpib_board *board, u8 *buffer,
-			       size_t length, int *end, size_t *bytes_read)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_read(board, &priv->tms9914_priv, buffer, length, end, bytes_read);
-}
-
-static int agilent_82350b_write(struct gpib_board *board, u8 *buffer,
-				size_t length, int send_eoi, size_t *bytes_written)
-
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_write(board, &priv->tms9914_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int agilent_82350b_command(struct gpib_board *board, u8 *buffer,
-				  size_t length, size_t *bytes_written)
-
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_command(board, &priv->tms9914_priv, buffer, length, bytes_written);
-}
-
-static int agilent_82350b_take_control(struct gpib_board *board, int synchronous)
-
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_take_control_workaround(board, &priv->tms9914_priv, synchronous);
-}
-
-static int agilent_82350b_go_to_standby(struct gpib_board *board)
-
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_go_to_standby(board, &priv->tms9914_priv);
-}
-
-static int agilent_82350b_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-
-	if (request_control) {
-		a_priv->card_mode_bits |= CM_SYSTEM_CONTROLLER_BIT;
-		if (a_priv->model != MODEL_82350A)
-			writeb(IC_SYSTEM_CONTROLLER_BIT, a_priv->gpib_base + INTERNAL_CONFIG_REG);
-	} else {
-		a_priv->card_mode_bits &= ~CM_SYSTEM_CONTROLLER_BIT;
-		if (a_priv->model != MODEL_82350A)
-			writeb(0, a_priv->gpib_base + INTERNAL_CONFIG_REG);
-	}
-	writeb(a_priv->card_mode_bits, a_priv->gpib_base + CARD_MODE_REG);
-	return tms9914_request_system_control(board, &a_priv->tms9914_priv, request_control);
-}
-
-static void agilent_82350b_interface_clear(struct gpib_board *board, int assert)
-
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_interface_clear(board, &priv->tms9914_priv, assert);
-}
-
-static void agilent_82350b_remote_enable(struct gpib_board *board, int enable)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_remote_enable(board, &priv->tms9914_priv, enable);
-}
-
-static int agilent_82350b_enable_eos(struct gpib_board *board, u8 eos_byte,
-				     int compare_8_bits)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_enable_eos(board, &priv->tms9914_priv, eos_byte, compare_8_bits);
-}
-
-static void agilent_82350b_disable_eos(struct gpib_board *board)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_disable_eos(board, &priv->tms9914_priv);
-}
-
-static unsigned int agilent_82350b_update_status(struct gpib_board *board,
-						 unsigned int clear_mask)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_update_status(board, &priv->tms9914_priv, clear_mask);
-}
-
-static int agilent_82350b_primary_address(struct gpib_board *board,
-					  unsigned int address)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_primary_address(board, &priv->tms9914_priv, address);
-}
-
-static int agilent_82350b_secondary_address(struct gpib_board *board,
-					    unsigned int address, int enable)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_secondary_address(board, &priv->tms9914_priv, address, enable);
-}
-
-static int agilent_82350b_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_parallel_poll(board, &priv->tms9914_priv, result);
-}
-
-static void agilent_82350b_parallel_poll_configure(struct gpib_board *board,
-						   u8 config)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_configure(board, &priv->tms9914_priv, config);
-}
-
-static void agilent_82350b_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_response(board, &priv->tms9914_priv, ist);
-}
-
-static void agilent_82350b_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_serial_poll_response(board, &priv->tms9914_priv, status);
-}
-
-static u8 agilent_82350b_serial_poll_status(struct gpib_board *board)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_serial_poll_status(board, &priv->tms9914_priv);
-}
-
-static int agilent_82350b_line_status(const struct gpib_board *board)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	return tms9914_line_status(board, &priv->tms9914_priv);
-}
-
-static int agilent_82350b_t1_delay(struct gpib_board *board, unsigned int nanosec)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	static const int nanosec_per_clock = 30;
-	unsigned int value;
-
-	tms9914_t1_delay(board, &a_priv->tms9914_priv, nanosec);
-
-	value = (nanosec + nanosec_per_clock - 1) / nanosec_per_clock;
-	if (value > 0xff)
-		value = 0xff;
-	writeb(value, a_priv->gpib_base + T1_DELAY_REG);
-	return value * nanosec_per_clock;
-}
-
-static void agilent_82350b_return_to_local(struct gpib_board *board)
-{
-	struct agilent_82350b_priv *priv = board->private_data;
-
-	tms9914_return_to_local(board, &priv->tms9914_priv);
-}
-
-static int agilent_82350b_allocate_private(struct gpib_board *board)
-{
-	board->private_data = kzalloc(sizeof(struct agilent_82350b_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-	return 0;
-}
-
-static void agilent_82350b_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static int init_82350a_hardware(struct gpib_board *board,
-				const struct gpib_board_config *config)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	static const unsigned int firmware_length = 5302;
-	unsigned int borg_status;
-	static const unsigned int timeout = 1000;
-	int i, j;
-	const char *firmware_data = config->init_data;
-	const unsigned int plx_cntrl_static_bits = PLX9050_WAITO_NOT_USER0_SELECT_BIT |
-		PLX9050_USER0_OUTPUT_BIT |
-		PLX9050_LLOCK_NOT_USER1_SELECT_BIT |
-		PLX9050_USER1_OUTPUT_BIT |
-		PLX9050_USER2_OUTPUT_BIT |
-		PLX9050_USER3_OUTPUT_BIT |
-		PLX9050_PCI_READ_MODE_BIT |
-		PLX9050_PCI_WRITE_MODE_BIT |
-		PLX9050_PCI_RETRY_DELAY_BITS(64) |
-		PLX9050_DIRECT_SLAVE_LOCK_ENABLE_BIT;
-
-	/* load borg data */
-	borg_status = readb(a_priv->borg_base);
-	if ((borg_status & BORG_DONE_BIT))
-		return 0;
-	/* need to programme borg */
-	if (!config->init_data || config->init_data_length != firmware_length) {
-		dev_err(board->gpib_dev, "the 82350A board requires firmware after powering on.\n");
-		return -EIO;
-	}
-	dev_dbg(board->gpib_dev, "Loading firmware...\n");
-
-	/* tickle the borg */
-	writel(plx_cntrl_static_bits | PLX9050_USER3_DATA_BIT,
-	       a_priv->plx_base + PLX9050_CNTRL_REG);
-	usleep_range(1000, 2000);
-	writel(plx_cntrl_static_bits, a_priv->plx_base + PLX9050_CNTRL_REG);
-	usleep_range(1000, 2000);
-	writel(plx_cntrl_static_bits | PLX9050_USER3_DATA_BIT,
-	       a_priv->plx_base + PLX9050_CNTRL_REG);
-	usleep_range(1000, 2000);
-
-	for (i = 0; i < config->init_data_length; ++i) {
-		for (j = 0; j < timeout && (readb(a_priv->borg_base) & BORG_READY_BIT) == 0; ++j) {
-			if (need_resched())
-				schedule();
-			usleep_range(10, 20);
-		}
-		if (j == timeout) {
-			dev_err(board->gpib_dev, "timed out loading firmware.\n");
-			return -ETIMEDOUT;
-		}
-		writeb(firmware_data[i], a_priv->gpib_base + CONFIG_DATA_REG);
-	}
-	for (j = 0; j < timeout && (readb(a_priv->borg_base) & BORG_DONE_BIT) == 0; ++j) {
-		if (need_resched())
-			schedule();
-		usleep_range(10, 20);
-	}
-	if (j == timeout) {
-		dev_err(board->gpib_dev, "timed out waiting for firmware load to complete.\n");
-		return -ETIMEDOUT;
-	}
-	dev_dbg(board->gpib_dev, " ...done.\n");
-	return 0;
-}
-
-static int test_sram(struct gpib_board *board)
-
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	unsigned int i;
-	const unsigned int sram_length = pci_resource_len(a_priv->pci_device, SRAM_82350A_REGION);
-	/* test SRAM */
-	const unsigned int byte_mask = 0xff;
-
-	for (i = 0; i < sram_length; ++i) {
-		writeb(i & byte_mask, a_priv->sram_base + i);
-		if (need_resched())
-			schedule();
-	}
-	for (i = 0; i < sram_length; ++i) {
-		unsigned int read_value = readb(a_priv->sram_base + i);
-
-		if ((i & byte_mask) != read_value) {
-			dev_err(board->gpib_dev, "SRAM test failed at %d wanted %d got %d\n",
-				i, (i & byte_mask), read_value);
-			return -EIO;
-		}
-		if (need_resched())
-			schedule();
-	}
-	dev_dbg(board->gpib_dev, "SRAM test passed 0x%x bytes checked\n", sram_length);
-	return 0;
-}
-
-static int agilent_82350b_generic_attach(struct gpib_board *board,
-					 const struct gpib_board_config *config,
-					 int use_fifos)
-
-{
-	struct agilent_82350b_priv *a_priv;
-	struct tms9914_priv *tms_priv;
-	int retval;
-
-	board->status = 0;
-
-	if (agilent_82350b_allocate_private(board))
-		return -ENOMEM;
-	a_priv = board->private_data;
-	a_priv->using_fifos = use_fifos;
-	tms_priv = &a_priv->tms9914_priv;
-	tms_priv->read_byte = tms9914_iomem_read_byte;
-	tms_priv->write_byte = tms9914_iomem_write_byte;
-	tms_priv->offset = 1;
-
-	/* find board */
-	a_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_AGILENT,
-						 PCI_DEVICE_ID_82350B, NULL);
-	if (a_priv->pci_device) {
-		a_priv->model = MODEL_82350B;
-		dev_dbg(board->gpib_dev, "Agilent 82350B board found\n");
-
-	} else	{
-		a_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_AGILENT,
-							 PCI_DEVICE_ID_82351A, NULL);
-		if (a_priv->pci_device)	{
-			a_priv->model = MODEL_82351A;
-			dev_dbg(board->gpib_dev, "Agilent 82351B board found\n");
-
-		} else {
-			a_priv->pci_device = gpib_pci_get_subsys(config, PCI_VENDOR_ID_PLX,
-								 PCI_DEVICE_ID_PLX_9050,
-								 PCI_VENDOR_ID_HP,
-								 PCI_SUBDEVICE_ID_82350A,
-								 a_priv->pci_device);
-			if (a_priv->pci_device) {
-				a_priv->model = MODEL_82350A;
-				dev_dbg(board->gpib_dev, "HP/Agilent 82350A board found\n");
-			} else {
-				dev_err(board->gpib_dev, "no 82350/82351 board found\n");
-				return -ENODEV;
-			}
-		}
-	}
-	if (pci_enable_device(a_priv->pci_device)) {
-		dev_err(board->gpib_dev, "error enabling pci device\n");
-		return -EIO;
-	}
-	if (pci_request_regions(a_priv->pci_device, DRV_NAME))
-		return -ENOMEM;
-	switch (a_priv->model) {
-	case MODEL_82350A:
-		a_priv->plx_base = ioremap(pci_resource_start(a_priv->pci_device, PLX_MEM_REGION),
-					   pci_resource_len(a_priv->pci_device, PLX_MEM_REGION));
-		dev_dbg(board->gpib_dev, "plx base address remapped to 0x%p\n", a_priv->plx_base);
-		a_priv->gpib_base = ioremap(pci_resource_start(a_priv->pci_device,
-							       GPIB_82350A_REGION),
-					    pci_resource_len(a_priv->pci_device,
-							     GPIB_82350A_REGION));
-		dev_dbg(board->gpib_dev, "chip base address remapped to 0x%p\n", a_priv->gpib_base);
-		tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG;
-		a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device,
-							       SRAM_82350A_REGION),
-					    pci_resource_len(a_priv->pci_device,
-							     SRAM_82350A_REGION));
-		dev_dbg(board->gpib_dev, "sram base address remapped to 0x%p\n", a_priv->sram_base);
-		a_priv->borg_base = ioremap(pci_resource_start(a_priv->pci_device,
-							       BORG_82350A_REGION),
-					    pci_resource_len(a_priv->pci_device,
-							     BORG_82350A_REGION));
-		dev_dbg(board->gpib_dev, "borg base address remapped to 0x%p\n", a_priv->borg_base);
-
-		retval = init_82350a_hardware(board, config);
-		if (retval < 0)
-			return retval;
-		break;
-	case MODEL_82350B:
-	case MODEL_82351A:
-		a_priv->gpib_base = ioremap(pci_resource_start(a_priv->pci_device, GPIB_REGION),
-					    pci_resource_len(a_priv->pci_device, GPIB_REGION));
-		dev_dbg(board->gpib_dev, "chip base address remapped to 0x%p\n", a_priv->gpib_base);
-		tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG;
-		a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device, SRAM_REGION),
-					    pci_resource_len(a_priv->pci_device, SRAM_REGION));
-		dev_dbg(board->gpib_dev, "sram base address remapped to 0x%p\n", a_priv->sram_base);
-		a_priv->misc_base = ioremap(pci_resource_start(a_priv->pci_device, MISC_REGION),
-					    pci_resource_len(a_priv->pci_device, MISC_REGION));
-		dev_dbg(board->gpib_dev, "misc base address remapped to 0x%p\n", a_priv->misc_base);
-		break;
-	default:
-		dev_err(board->gpib_dev, "invalid board\n");
-		return -ENODEV;
-	}
-
-	retval = test_sram(board);
-	if (retval < 0)
-		return retval;
-
-	if (request_irq(a_priv->pci_device->irq, agilent_82350b_interrupt,
-			IRQF_SHARED, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "failed to obtain irq %d\n", a_priv->pci_device->irq);
-		return -EIO;
-	}
-	a_priv->irq = a_priv->pci_device->irq;
-	dev_dbg(board->gpib_dev, " IRQ %d\n", a_priv->irq);
-
-	writeb(0, a_priv->gpib_base + SRAM_ACCESS_CONTROL_REG);
-	a_priv->card_mode_bits = ENABLE_PCI_IRQ_BIT;
-	writeb(a_priv->card_mode_bits, a_priv->gpib_base + CARD_MODE_REG);
-
-	if (a_priv->model == MODEL_82350A) {
-		/* enable PCI interrupts for 82350a */
-		writel(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR2_POLARITY_BIT |
-		       PLX9050_PCI_INTR_EN_BIT,
-		       a_priv->plx_base + PLX9050_INTCSR_REG);
-	}
-
-	if (use_fifos) {
-		writeb(ENABLE_BUFFER_END_EVENTS_BIT | ENABLE_TERM_COUNT_EVENTS_BIT,
-		       a_priv->gpib_base + EVENT_ENABLE_REG);
-		writeb(ENABLE_TERM_COUNT_INTERRUPT_BIT | ENABLE_BUFFER_END_INTERRUPT_BIT |
-		       ENABLE_TMS9914_INTERRUPTS_BIT, a_priv->gpib_base + INTERRUPT_ENABLE_REG);
-		/* write-clear event status bits */
-		writeb(BUFFER_END_STATUS_BIT | TERM_COUNT_STATUS_BIT,
-		       a_priv->gpib_base + EVENT_STATUS_REG);
-	} else {
-		writeb(0, a_priv->gpib_base + EVENT_ENABLE_REG);
-		writeb(ENABLE_TMS9914_INTERRUPTS_BIT,
-		       a_priv->gpib_base + INTERRUPT_ENABLE_REG);
-	}
-	board->t1_nano_sec = agilent_82350b_t1_delay(board, 2000);
-	tms9914_board_reset(tms_priv);
-
-	tms9914_online(board, tms_priv);
-
-	return 0;
-}
-
-static int agilent_82350b_unaccel_attach(struct gpib_board *board,
-					 const struct gpib_board_config *config)
-{
-	return agilent_82350b_generic_attach(board, config, 0);
-}
-
-static int agilent_82350b_accel_attach(struct gpib_board *board,
-				       const struct gpib_board_config *config)
-{
-	return agilent_82350b_generic_attach(board, config, 1);
-}
-
-static void agilent_82350b_detach(struct gpib_board *board)
-{
-	struct agilent_82350b_priv *a_priv = board->private_data;
-	struct tms9914_priv *tms_priv;
-
-	if (a_priv) {
-		if (a_priv->plx_base) /* disable interrupts */
-			writel(0, a_priv->plx_base + PLX9050_INTCSR_REG);
-
-		tms_priv = &a_priv->tms9914_priv;
-		if (a_priv->irq)
-			free_irq(a_priv->irq, board);
-		if (a_priv->gpib_base) {
-			tms9914_board_reset(tms_priv);
-			if (a_priv->misc_base)
-				iounmap(a_priv->misc_base);
-			if (a_priv->borg_base)
-				iounmap(a_priv->borg_base);
-			if (a_priv->sram_base)
-				iounmap(a_priv->sram_base);
-			if (a_priv->gpib_base)
-				iounmap(a_priv->gpib_base);
-			if (a_priv->plx_base)
-				iounmap(a_priv->plx_base);
-			pci_release_regions(a_priv->pci_device);
-		}
-		if (a_priv->pci_device)
-			pci_dev_put(a_priv->pci_device);
-	}
-	agilent_82350b_free_private(board);
-}
-
-static struct gpib_interface agilent_82350b_unaccel_interface = {
-	.name = "agilent_82350b_unaccel",
-	.attach = agilent_82350b_unaccel_attach,
-	.detach = agilent_82350b_detach,
-	.read = agilent_82350b_read,
-	.write = agilent_82350b_write,
-	.command = agilent_82350b_command,
-	.request_system_control = agilent_82350b_request_system_control,
-	.take_control = agilent_82350b_take_control,
-	.go_to_standby = agilent_82350b_go_to_standby,
-	.interface_clear = agilent_82350b_interface_clear,
-	.remote_enable = agilent_82350b_remote_enable,
-	.enable_eos = agilent_82350b_enable_eos,
-	.disable_eos = agilent_82350b_disable_eos,
-	.parallel_poll = agilent_82350b_parallel_poll,
-	.parallel_poll_configure = agilent_82350b_parallel_poll_configure,
-	.parallel_poll_response = agilent_82350b_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, /* XXX */
-	.line_status = agilent_82350b_line_status,
-	.update_status = agilent_82350b_update_status,
-	.primary_address = agilent_82350b_primary_address,
-	.secondary_address = agilent_82350b_secondary_address,
-	.serial_poll_response = agilent_82350b_serial_poll_response,
-	.serial_poll_status = agilent_82350b_serial_poll_status,
-	.t1_delay = agilent_82350b_t1_delay,
-	.return_to_local = agilent_82350b_return_to_local,
-};
-
-static struct gpib_interface agilent_82350b_interface = {
-	.name = "agilent_82350b",
-	.attach = agilent_82350b_accel_attach,
-	.detach = agilent_82350b_detach,
-	.read = agilent_82350b_accel_read,
-	.write = agilent_82350b_accel_write,
-	.command = agilent_82350b_command,
-	.request_system_control = agilent_82350b_request_system_control,
-	.take_control = agilent_82350b_take_control,
-	.go_to_standby = agilent_82350b_go_to_standby,
-	.interface_clear = agilent_82350b_interface_clear,
-	.remote_enable = agilent_82350b_remote_enable,
-	.enable_eos = agilent_82350b_enable_eos,
-	.disable_eos = agilent_82350b_disable_eos,
-	.parallel_poll = agilent_82350b_parallel_poll,
-	.parallel_poll_configure = agilent_82350b_parallel_poll_configure,
-	.parallel_poll_response = agilent_82350b_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, /* XXX */
-	.line_status = agilent_82350b_line_status,
-	.update_status = agilent_82350b_update_status,
-	.primary_address = agilent_82350b_primary_address,
-	.secondary_address = agilent_82350b_secondary_address,
-	.serial_poll_response = agilent_82350b_serial_poll_response,
-	.serial_poll_status = agilent_82350b_serial_poll_status,
-	.t1_delay = agilent_82350b_t1_delay,
-	.return_to_local = agilent_82350b_return_to_local,
-};
-
-static int agilent_82350b_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-
-{
-	return 0;
-}
-
-static const struct pci_device_id agilent_82350b_pci_table[] = {
-	{ PCI_VENDOR_ID_PLX,     PCI_DEVICE_ID_PLX_9050, PCI_VENDOR_ID_HP,
-	  PCI_SUBDEVICE_ID_82350A, 0, 0, 0 },
-	{ PCI_VENDOR_ID_AGILENT, PCI_DEVICE_ID_82350B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
-	{ PCI_VENDOR_ID_AGILENT, PCI_DEVICE_ID_82351A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
-	{ 0 }
-};
-MODULE_DEVICE_TABLE(pci, agilent_82350b_pci_table);
-
-static struct pci_driver agilent_82350b_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = agilent_82350b_pci_table,
-	.probe = &agilent_82350b_pci_probe
-};
-
-static int __init agilent_82350b_init_module(void)
-{
-	int result;
-
-	result = pci_register_driver(&agilent_82350b_pci_driver);
-	if (result) {
-		pr_err("pci_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	result = gpib_register_driver(&agilent_82350b_unaccel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_unaccel;
-	}
-
-	result = gpib_register_driver(&agilent_82350b_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_interface;
-	}
-
-	return 0;
-
-err_interface:
-	gpib_unregister_driver(&agilent_82350b_unaccel_interface);
-err_unaccel:
-	pci_unregister_driver(&agilent_82350b_pci_driver);
-
-	return result;
-}
-
-static void __exit agilent_82350b_exit_module(void)
-{
-	gpib_unregister_driver(&agilent_82350b_interface);
-	gpib_unregister_driver(&agilent_82350b_unaccel_interface);
-
-	pci_unregister_driver(&agilent_82350b_pci_driver);
-}
-
-module_init(agilent_82350b_init_module);
-module_exit(agilent_82350b_exit_module);
diff --git a/drivers/staging/gpib/agilent_82350b/agilent_82350b.h b/drivers/staging/gpib/agilent_82350b/agilent_82350b.h
deleted file mode 100644
index ef841957297f..000000000000
--- a/drivers/staging/gpib/agilent_82350b/agilent_82350b.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002, 2004 by Frank Mori Hess             *
- ***************************************************************************/
-
-#include "gpibP.h"
-#include "plx9050.h"
-#include "tms9914.h"
-
-enum pci_vendor_ids {
-	PCI_VENDOR_ID_AGILENT = 0x15bc,
-};
-
-enum pci_device_ids {
-	PCI_DEVICE_ID_82350B = 0x0b01,
-	PCI_DEVICE_ID_82351A = 0x1218
-};
-
-enum pci_subdevice_ids {
-	PCI_SUBDEVICE_ID_82350A = 0x10b0,
-};
-
-enum pci_regions_82350a {
-	PLX_MEM_REGION  = 0,
-	PLX_IO_REGION   = 1,
-	GPIB_82350A_REGION = 2,
-	SRAM_82350A_REGION = 3,
-	BORG_82350A_REGION = 4
-};
-
-enum pci_regions_82350b {
-	GPIB_REGION = 0,
-	SRAM_REGION = 1,
-	MISC_REGION = 2,
-};
-
-enum board_model {
-	MODEL_82350A,
-	MODEL_82350B,
-	MODEL_82351A
-};
-
-/* struct which defines private_data for board */
-struct agilent_82350b_priv {
-	struct tms9914_priv tms9914_priv;
-	struct pci_dev *pci_device;
-	void __iomem *plx_base;	/* 82350a only */
-	void __iomem *gpib_base;
-	void __iomem *sram_base;
-	void __iomem *misc_base;
-	void __iomem *borg_base;
-	int irq;
-	unsigned short card_mode_bits;
-	unsigned short event_status_bits;
-	enum board_model model;
-	bool using_fifos;
-};
-
-/* registers */
-enum agilent_82350b_gpib_registers
-
-{
-	CARD_MODE_REG = 0x1,
-	CONFIG_DATA_REG = 0x2, /* 82350A specific */
-	INTERRUPT_ENABLE_REG = 0x3,
-	EVENT_STATUS_REG = 0x4,
-	EVENT_ENABLE_REG = 0x5,
-	STREAM_STATUS_REG = 0x7,
-	DEBUG_RAM0_REG = 0x8,
-	DEBUG_RAM1_REG = 0x9,
-	DEBUG_RAM2_REG = 0xa,
-	DEBUG_RAM3_REG = 0xb,
-	XFER_COUNT_LO_REG = 0xc,
-	XFER_COUNT_MID_REG = 0xd,
-	XFER_COUNT_HI_REG = 0xe,
-	TMS9914_BASE_REG = 0x10,
-	INTERNAL_CONFIG_REG = 0x18,
-	IMR0_READ_REG = 0x19, /* read */
-	T1_DELAY_REG = 0x19, /* write */
-	IMR1_READ_REG = 0x1a,
-	ADR_READ_REG = 0x1b,
-	SPMR_READ_REG = 0x1c,
-	PPR_READ_REG = 0x1d,
-	CDOR_READ_REG = 0x1e,
-	SRAM_ACCESS_CONTROL_REG = 0x1f,
-};
-
-enum card_mode_bits
-
-{
-	ACTIVE_CONTROLLER_BIT = 0x2, /* read-only */
-	CM_SYSTEM_CONTROLLER_BIT = 0x8,
-	ENABLE_BUS_MONITOR_BIT = 0x10,
-	ENABLE_PCI_IRQ_BIT = 0x20,
-};
-
-enum interrupt_enable_bits
-
-{
-	ENABLE_TMS9914_INTERRUPTS_BIT = 0x1,
-	ENABLE_BUFFER_END_INTERRUPT_BIT = 0x10,
-	ENABLE_TERM_COUNT_INTERRUPT_BIT = 0x20,
-};
-
-enum event_enable_bits
-
-{
-	ENABLE_BUFFER_END_EVENTS_BIT = 0x10,
-	ENABLE_TERM_COUNT_EVENTS_BIT = 0x20,
-};
-
-enum event_status_bits
-
-{
-	TMS9914_IRQ_STATUS_BIT = 0x1,
-	IRQ_STATUS_BIT = 0x2,
-	BUFFER_END_STATUS_BIT = 0x10, /* write-clear */
-	TERM_COUNT_STATUS_BIT = 0x20, /* write-clear */
-};
-
-enum stream_status_bits
-
-{
-	HALTED_STATUS_BIT = 0x1, /* read */
-	RESTART_STREAM_BIT = 0x1, /* write */
-};
-
-enum internal_config_bits
-
-{
-	IC_SYSTEM_CONTROLLER_BIT = 0x80,
-};
-
-enum sram_access_control_bits
-
-{
-	DIRECTION_GPIB_TO_HOST = 0x20, /* transfer direction */
-	ENABLE_TI_TO_SRAM = 0x40, /* enable fifo */
-	ENABLE_FAST_TALKER = 0x80 /* added for 82350A (not used) */
-};
-
-enum borg_bits
-
-{
-	BORG_READY_BIT = 0x40,
-	BORG_DONE_BIT = 0x80
-};
-
-static const int agilent_82350b_fifo_size = 0x8000;
-
-static inline int agilent_82350b_fifo_is_halted(struct agilent_82350b_priv *a_priv)
-
-{
-	return readb(a_priv->gpib_base + STREAM_STATUS_REG) & HALTED_STATUS_BIT;
-}
-
diff --git a/drivers/staging/gpib/agilent_82357a/Makefile b/drivers/staging/gpib/agilent_82357a/Makefile
deleted file mode 100644
index 81a55c257a6e..000000000000
--- a/drivers/staging/gpib/agilent_82357a/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-
-obj-$(CONFIG_GPIB_AGILENT_82357A) += agilent_82357a.o
-
-
diff --git a/drivers/staging/gpib/agilent_82357a/agilent_82357a.c b/drivers/staging/gpib/agilent_82357a/agilent_82357a.c
deleted file mode 100644
index 77c8e549b208..000000000000
--- a/drivers/staging/gpib/agilent_82357a/agilent_82357a.c
+++ /dev/null
@@ -1,1691 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *	driver for Agilent 82357A/B usb to gpib adapters		   *
- *    copyright		   : (C) 2004 by Frank Mori Hess		   *
- ***************************************************************************/
-
-#define _GNU_SOURCE
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include "agilent_82357a.h"
-#include "gpibP.h"
-#include "tms9914.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for Agilent 82357A/B usb adapters");
-
-#define MAX_NUM_82357A_INTERFACES 128
-static struct usb_interface *agilent_82357a_driver_interfaces[MAX_NUM_82357A_INTERFACES];
-static DEFINE_MUTEX(agilent_82357a_hotplug_lock); // protect board insertion and removal
-
-static unsigned int agilent_82357a_update_status(struct gpib_board *board,
-						 unsigned int clear_mask);
-
-static int agilent_82357a_take_control_internal(struct gpib_board *board, int synchronous);
-
-static void agilent_82357a_bulk_complete(struct urb *urb)
-{
-	struct agilent_82357a_urb_ctx *context = urb->context;
-
-	complete(&context->complete);
-}
-
-static void agilent_82357a_timeout_handler(struct timer_list *t)
-{
-	struct agilent_82357a_priv *a_priv = timer_container_of(a_priv, t,
-								bulk_timer);
-	struct agilent_82357a_urb_ctx *context = &a_priv->context;
-
-	context->timed_out = 1;
-	complete(&context->complete);
-}
-
-static int agilent_82357a_send_bulk_msg(struct agilent_82357a_priv *a_priv, void *data,
-					int data_length, int *actual_data_length,
-					int timeout_msecs)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int out_pipe;
-	struct agilent_82357a_urb_ctx *context = &a_priv->context;
-
-	*actual_data_length = 0;
-	retval = mutex_lock_interruptible(&a_priv->bulk_alloc_lock);
-	if (retval)
-		return retval;
-	if (!a_priv->bus_interface) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -ENODEV;
-	}
-	if (a_priv->bulk_urb) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -EAGAIN;
-	}
-	a_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!a_priv->bulk_urb) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -ENOMEM;
-	}
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	out_pipe = usb_sndbulkpipe(usb_dev, a_priv->bulk_out_endpoint);
-	init_completion(&context->complete);
-	context->timed_out = 0;
-	usb_fill_bulk_urb(a_priv->bulk_urb, usb_dev, out_pipe, data, data_length,
-			  &agilent_82357a_bulk_complete, context);
-
-	if (timeout_msecs)
-		mod_timer(&a_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
-
-	retval = usb_submit_urb(a_priv->bulk_urb, GFP_KERNEL);
-	if (retval) {
-		dev_err(&usb_dev->dev, "failed to submit bulk out urb, retval=%i\n", retval);
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		goto cleanup;
-	}
-	mutex_unlock(&a_priv->bulk_alloc_lock);
-	if (wait_for_completion_interruptible(&context->complete)) {
-		retval = -ERESTARTSYS;
-		goto cleanup;
-	}
-	if (context->timed_out)	{
-		retval = -ETIMEDOUT;
-	} else {
-		retval = a_priv->bulk_urb->status;
-		*actual_data_length = a_priv->bulk_urb->actual_length;
-	}
-cleanup:
-	if (timeout_msecs) {
-		if (timer_pending(&a_priv->bulk_timer))
-			timer_delete_sync(&a_priv->bulk_timer);
-	}
-	mutex_lock(&a_priv->bulk_alloc_lock);
-	if (a_priv->bulk_urb) {
-		usb_kill_urb(a_priv->bulk_urb);
-		usb_free_urb(a_priv->bulk_urb);
-		a_priv->bulk_urb = NULL;
-	}
-	mutex_unlock(&a_priv->bulk_alloc_lock);
-	return retval;
-}
-
-static int agilent_82357a_receive_bulk_msg(struct agilent_82357a_priv *a_priv, void *data,
-					   int data_length, int *actual_data_length,
-					   int timeout_msecs)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int in_pipe;
-	struct agilent_82357a_urb_ctx *context = &a_priv->context;
-
-	*actual_data_length = 0;
-	retval = mutex_lock_interruptible(&a_priv->bulk_alloc_lock);
-	if (retval)
-		return retval;
-	if (!a_priv->bus_interface) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -ENODEV;
-	}
-	if (a_priv->bulk_urb) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -EAGAIN;
-	}
-	a_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!a_priv->bulk_urb) {
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		return -ENOMEM;
-	}
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	in_pipe = usb_rcvbulkpipe(usb_dev, AGILENT_82357_BULK_IN_ENDPOINT);
-	init_completion(&context->complete);
-	context->timed_out = 0;
-	usb_fill_bulk_urb(a_priv->bulk_urb, usb_dev, in_pipe, data, data_length,
-			  &agilent_82357a_bulk_complete, context);
-
-	if (timeout_msecs)
-		mod_timer(&a_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
-
-	retval = usb_submit_urb(a_priv->bulk_urb, GFP_KERNEL);
-	if (retval) {
-		dev_err(&usb_dev->dev, "failed to submit bulk in urb, retval=%i\n", retval);
-		mutex_unlock(&a_priv->bulk_alloc_lock);
-		goto cleanup;
-	}
-	mutex_unlock(&a_priv->bulk_alloc_lock);
-	if (wait_for_completion_interruptible(&context->complete)) {
-		retval = -ERESTARTSYS;
-		goto cleanup;
-	}
-	if (context->timed_out)	{
-		retval = -ETIMEDOUT;
-		goto cleanup;
-	}
-	retval = a_priv->bulk_urb->status;
-	*actual_data_length = a_priv->bulk_urb->actual_length;
-cleanup:
-	if (timeout_msecs)
-		timer_delete_sync(&a_priv->bulk_timer);
-
-	mutex_lock(&a_priv->bulk_alloc_lock);
-	if (a_priv->bulk_urb) {
-		usb_kill_urb(a_priv->bulk_urb);
-		usb_free_urb(a_priv->bulk_urb);
-		a_priv->bulk_urb = NULL;
-	}
-	mutex_unlock(&a_priv->bulk_alloc_lock);
-	return retval;
-}
-
-static int agilent_82357a_receive_control_msg(struct agilent_82357a_priv *a_priv, __u8 request,
-					      __u8 requesttype, __u16 value,  __u16 index,
-					      void *data, __u16 size, int timeout_msecs)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int in_pipe;
-
-	retval = mutex_lock_interruptible(&a_priv->control_alloc_lock);
-	if (retval)
-		return retval;
-	if (!a_priv->bus_interface) {
-		mutex_unlock(&a_priv->control_alloc_lock);
-		return -ENODEV;
-	}
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	in_pipe = usb_rcvctrlpipe(usb_dev, AGILENT_82357_CONTROL_ENDPOINT);
-	retval = usb_control_msg(usb_dev, in_pipe, request, requesttype, value, index, data,
-				 size, timeout_msecs);
-	mutex_unlock(&a_priv->control_alloc_lock);
-	return retval;
-}
-
-static void agilent_82357a_dump_raw_block(const u8 *raw_data, int length)
-{
-	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 8, 1, raw_data, length, true);
-}
-
-static int agilent_82357a_write_registers(struct agilent_82357a_priv *a_priv,
-					  const struct agilent_82357a_register_pairlet *writes,
-					  int num_writes)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	int retval;
-	u8 *out_data, *in_data;
-	int out_data_length, in_data_length;
-	int bytes_written, bytes_read;
-	int i = 0;
-	int j;
-	static const int bytes_per_write = 2;
-	static const int header_length = 2;
-	static const int max_writes = 31;
-
-	if (num_writes > max_writes) {
-		dev_err(&usb_dev->dev, "bug! num_writes=%i too large\n", num_writes);
-		return -EIO;
-	}
-	out_data_length = num_writes * bytes_per_write + header_length;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-
-	out_data[i++] = DATA_PIPE_CMD_WR_REGS;
-	out_data[i++] = num_writes;
-	for (j = 0; j < num_writes; j++)	{
-		out_data[i++] = writes[j].address;
-		out_data[i++] = writes[j].value;
-	}
-
-	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
-	if (retval) {
-		kfree(out_data);
-		return retval;
-	}
-	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, 1000);
-	kfree(out_data);
-	if (retval) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return retval;
-	}
-	in_data_length = 0x20;
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
-						 &bytes_read, 1000);
-	mutex_unlock(&a_priv->bulk_transfer_lock);
-
-	if (retval) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		agilent_82357a_dump_raw_block(in_data, bytes_read);
-		kfree(in_data);
-		return -EIO;
-	}
-	if (in_data[0] != (0xff & ~DATA_PIPE_CMD_WR_REGS)) {
-		dev_err(&usb_dev->dev, "bulk command=0x%x != ~DATA_PIPE_CMD_WR_REGS\n", in_data[0]);
-		return -EIO;
-	}
-	if (in_data[1])	{
-		dev_err(&usb_dev->dev, "nonzero error code 0x%x in DATA_PIPE_CMD_WR_REGS response\n",
-			in_data[1]);
-		return -EIO;
-	}
-	kfree(in_data);
-	return 0;
-}
-
-static int agilent_82357a_read_registers(struct agilent_82357a_priv *a_priv,
-					 struct agilent_82357a_register_pairlet *reads,
-					 int num_reads, int blocking)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	int retval;
-	u8 *out_data, *in_data;
-	int out_data_length, in_data_length;
-	int bytes_written, bytes_read;
-	int i = 0;
-	int j;
-	static const int header_length = 2;
-	static const int max_reads = 62;
-
-	if (num_reads > max_reads) {
-		dev_err(&usb_dev->dev, "bug! num_reads=%i too large\n", num_reads);
-		return -EIO;
-	}
-	out_data_length = num_reads + header_length;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-
-	out_data[i++] = DATA_PIPE_CMD_RD_REGS;
-	out_data[i++] = num_reads;
-	for (j = 0; j < num_reads; j++)
-		out_data[i++] = reads[j].address;
-
-	if (blocking) {
-		retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
-		if (retval) {
-			kfree(out_data);
-			return retval;
-		}
-	} else {
-		retval = mutex_trylock(&a_priv->bulk_transfer_lock);
-		if (retval == 0) {
-			kfree(out_data);
-			return -EAGAIN;
-		}
-	}
-	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, 1000);
-	kfree(out_data);
-	if (retval) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return retval;
-	}
-	in_data_length = 0x20;
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
-						 &bytes_read, 10000);
-	mutex_unlock(&a_priv->bulk_transfer_lock);
-
-	if (retval) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		agilent_82357a_dump_raw_block(in_data, bytes_read);
-		kfree(in_data);
-		return -EIO;
-	}
-	i = 0;
-	if (in_data[i++] != (0xff & ~DATA_PIPE_CMD_RD_REGS)) {
-		dev_err(&usb_dev->dev, "bulk command=0x%x != ~DATA_PIPE_CMD_RD_REGS\n",	in_data[0]);
-		return -EIO;
-	}
-	if (in_data[i++]) {
-		dev_err(&usb_dev->dev, "nonzero error code 0x%x in DATA_PIPE_CMD_RD_REGS response\n",
-			in_data[1]);
-		return -EIO;
-	}
-	for (j = 0; j < num_reads; j++)
-		reads[j].value = in_data[i++];
-	kfree(in_data);
-	return 0;
-}
-
-static int agilent_82357a_abort(struct agilent_82357a_priv *a_priv, int flush)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	int retval = 0;
-	int receive_control_retval;
-	u16 wIndex = 0;
-	u8 *status_data;
-	static const unsigned int status_data_len = 2;
-
-	status_data = kmalloc(status_data_len, GFP_KERNEL);
-	if (!status_data)
-		return -ENOMEM;
-
-	if (flush)
-		wIndex |= XA_FLUSH;
-	receive_control_retval = agilent_82357a_receive_control_msg(a_priv,
-								    agilent_82357a_control_request,
-								    USB_DIR_IN | USB_TYPE_VENDOR |
-								    USB_RECIP_DEVICE, XFER_ABORT,
-								    wIndex, status_data,
-								    status_data_len, 100);
-	if (receive_control_retval < 0)	{
-		dev_err(&usb_dev->dev, "82357a_receive_control_msg() returned %i\n",
-			receive_control_retval);
-		retval = -EIO;
-		goto cleanup;
-	}
-	if (status_data[0] != (~XFER_ABORT & 0xff)) {
-		dev_err(&usb_dev->dev, "major code=0x%x != ~XFER_ABORT\n", status_data[0]);
-		retval = -EIO;
-		goto cleanup;
-	}
-	switch (status_data[1])	{
-	case UGP_SUCCESS:
-		retval = 0;
-		break;
-	case UGP_ERR_FLUSHING:
-		if (flush) {
-			retval = 0;
-			break;
-		}
-		fallthrough;
-	case UGP_ERR_FLUSHING_ALREADY:
-	default:
-		dev_err(&usb_dev->dev, "abort returned error code=0x%x\n", status_data[1]);
-		retval = -EIO;
-		break;
-	}
-
-cleanup:
-	kfree(status_data);
-	return retval;
-}
-
-// interface functions
-int agilent_82357a_command(struct gpib_board *board, u8 *buffer, size_t length,
-			   size_t *bytes_written);
-
-static int agilent_82357a_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-			       size_t *nbytes)
-{
-	int retval;
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	int out_data_length, in_data_length;
-	int bytes_written, bytes_read;
-	int i = 0;
-	u8 trailing_flags;
-	unsigned long start_jiffies = jiffies;
-	int msec_timeout;
-
-	*nbytes = 0;
-	*end = 0;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	out_data_length = 0x9;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = DATA_PIPE_CMD_READ;
-	out_data[i++] = 0;	// primary address when ARF_NO_ADDR is not set
-	out_data[i++] = 0;	// secondary address when ARF_NO_ADDR is not set
-	out_data[i] = ARF_NO_ADDRESS | ARF_END_ON_EOI;
-	if (a_priv->eos_mode & REOS)
-		out_data[i] |= ARF_END_ON_EOS_CHAR;
-	++i;
-	out_data[i++] = length & 0xff;
-	out_data[i++] = (length >> 8) & 0xff;
-	out_data[i++] = (length >> 16) & 0xff;
-	out_data[i++] = (length >> 24) & 0xff;
-	out_data[i++] = a_priv->eos_char;
-	msec_timeout = (board->usec_timeout + 999) / 1000;
-	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
-	if (retval) {
-		kfree(out_data);
-		return retval;
-	}
-	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &bytes_written, msec_timeout);
-	kfree(out_data);
-	if (retval || bytes_written != i) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		if (retval < 0)
-			return retval;
-		return -EIO;
-	}
-	in_data_length = length + 1;
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-	if (board->usec_timeout != 0)
-		msec_timeout -= jiffies_to_msecs(jiffies - start_jiffies) - 1;
-	if (msec_timeout >= 0) {
-		retval = agilent_82357a_receive_bulk_msg(a_priv, in_data, in_data_length,
-							 &bytes_read, msec_timeout);
-	} else {
-		retval = -ETIMEDOUT;
-		bytes_read = 0;
-	}
-	if (retval == -ETIMEDOUT) {
-		int extra_bytes_read;
-		int extra_bytes_retval;
-
-		agilent_82357a_abort(a_priv, 1);
-		extra_bytes_retval = agilent_82357a_receive_bulk_msg(a_priv, in_data + bytes_read,
-								     in_data_length - bytes_read,
-								     &extra_bytes_read, 100);
-		bytes_read += extra_bytes_read;
-		if (extra_bytes_retval)	{
-			dev_err(&usb_dev->dev, "extra_bytes_retval=%i, bytes_read=%i\n",
-				extra_bytes_retval, bytes_read);
-			agilent_82357a_abort(a_priv, 0);
-		}
-	} else if (retval) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		agilent_82357a_abort(a_priv, 0);
-	}
-	mutex_unlock(&a_priv->bulk_transfer_lock);
-	if (bytes_read > length + 1) {
-		bytes_read = length + 1;
-		dev_warn(&usb_dev->dev, "bytes_read > length? truncating");
-	}
-
-	if (bytes_read >= 1) {
-		memcpy(buffer, in_data, bytes_read - 1);
-		trailing_flags = in_data[bytes_read - 1];
-		*nbytes = bytes_read - 1;
-		if (trailing_flags & (ATRF_EOI | ATRF_EOS))
-			*end = 1;
-	}
-	kfree(in_data);
-
-	/*
-	 * Fix for a bug in 9914A that does not return the contents of ADSR
-	 * when the board is in listener active state and ATN is not asserted.
-	 * Set ATN here to obtain a valid board level ibsta
-	 */
-	agilent_82357a_take_control_internal(board, 0);
-
-	// FIXME check trailing flags for error
-	return retval;
-}
-
-static ssize_t agilent_82357a_generic_write(struct gpib_board *board,
-					    u8 *buffer, size_t length,
-					    int send_commands, int send_eoi,
-					    size_t *bytes_written)
-{
-	int retval;
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data = NULL;
-	u8 *status_data = NULL;
-	int out_data_length;
-	int raw_bytes_written;
-	int i = 0, j;
-	int msec_timeout;
-	unsigned short bsr, adsr;
-	struct agilent_82357a_register_pairlet read_reg;
-
-	*bytes_written = 0;
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	out_data_length = length + 0x8;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = DATA_PIPE_CMD_WRITE;
-	out_data[i++] = 0; // primary address when AWF_NO_ADDRESS is not set
-	out_data[i++] = 0; // secondary address when AWF_NO_ADDRESS is not set
-	out_data[i] = AWF_NO_ADDRESS | AWF_NO_FAST_TALKER_FIRST_BYTE;
-	if (send_commands)
-		out_data[i] |= AWF_ATN | AWF_NO_FAST_TALKER;
-	if (send_eoi)
-		out_data[i] |= AWF_SEND_EOI;
-	++i;
-	out_data[i++] = length & 0xff;
-	out_data[i++] = (length >> 8) & 0xff;
-	out_data[i++] = (length >> 16) & 0xff;
-	out_data[i++] = (length >> 24) & 0xff;
-	for (j = 0; j < length; j++)
-		out_data[i++] = buffer[j];
-
-	clear_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags);
-
-	msec_timeout = (board->usec_timeout + 999) / 1000;
-	retval = mutex_lock_interruptible(&a_priv->bulk_transfer_lock);
-	if (retval) {
-		kfree(out_data);
-		return retval;
-	}
-	retval = agilent_82357a_send_bulk_msg(a_priv, out_data, i, &raw_bytes_written,
-					      msec_timeout);
-	kfree(out_data);
-	if (retval || raw_bytes_written != i) {
-		agilent_82357a_abort(a_priv, 0);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, raw_bytes_written=%i, i=%i\n",
-			retval, raw_bytes_written, i);
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		if (retval < 0)
-			return retval;
-		return -EIO;
-	}
-
-	retval = wait_event_interruptible(board->wait,
-					  test_bit(AIF_WRITE_COMPLETE_BN,
-						   &a_priv->interrupt_flags) ||
-					  test_bit(TIMO_NUM, &board->status));
-	if (retval) {
-		dev_dbg(&usb_dev->dev, "wait write complete interrupted\n");
-		agilent_82357a_abort(a_priv, 0);
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return -ERESTARTSYS;
-	}
-
-	if (test_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags) == 0) {
-		dev_dbg(&usb_dev->dev, "write timed out ibs %i, tmo %i\n",
-			test_bit(TIMO_NUM, &board->status), msec_timeout);
-
-		agilent_82357a_abort(a_priv, 0);
-
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-
-		read_reg.address = BSR;
-		retval = agilent_82357a_read_registers(a_priv, &read_reg, 1, 1);
-		if (retval) {
-			dev_err(&usb_dev->dev, "read_registers() returned error\n");
-			return -ETIMEDOUT;
-		}
-
-		bsr = read_reg.value;
-		dev_dbg(&usb_dev->dev, "write aborted bsr 0x%x\n", bsr);
-
-		if (send_commands) {/* check for no listeners */
-			if ((bsr & BSR_ATN_BIT) && !(bsr & (BSR_NDAC_BIT | BSR_NRFD_BIT))) {
-				dev_dbg(&usb_dev->dev, "No listener on command\n");
-				clear_bit(TIMO_NUM, &board->status);
-				return -ENOTCONN; // no listener on bus
-			}
-		} else {
-			read_reg.address = ADSR;
-			retval = agilent_82357a_read_registers(a_priv, &read_reg, 1, 1);
-			if (retval) {
-				dev_err(&usb_dev->dev, "read_registers() returned error\n");
-				return -ETIMEDOUT;
-			}
-			adsr = read_reg.value;
-			if ((adsr & HR_TA) && !(bsr & (BSR_NDAC_BIT | BSR_NRFD_BIT))) {
-				dev_dbg(&usb_dev->dev, "No listener on write\n");
-				clear_bit(TIMO_NUM, &board->status);
-				return -ECOMM;
-			}
-		}
-
-		return -ETIMEDOUT;
-	}
-
-	status_data = kmalloc(STATUS_DATA_LEN, GFP_KERNEL);
-	if (!status_data) {
-		mutex_unlock(&a_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-
-	retval = agilent_82357a_receive_control_msg(a_priv, agilent_82357a_control_request,
-						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-						    XFER_STATUS, 0, status_data, STATUS_DATA_LEN,
-						    100);
-	mutex_unlock(&a_priv->bulk_transfer_lock);
-	if (retval < 0)	{
-		dev_err(&usb_dev->dev, "receive_control_msg() returned %i\n", retval);
-		kfree(status_data);
-		return -EIO;
-	}
-	*bytes_written	= (u32)status_data[2];
-	*bytes_written |= (u32)status_data[3] << 8;
-	*bytes_written |= (u32)status_data[4] << 16;
-	*bytes_written |= (u32)status_data[5] << 24;
-
-	kfree(status_data);
-	return 0;
-}
-
-static int agilent_82357a_write(struct gpib_board *board, u8 *buffer,
-				size_t length, int send_eoi, size_t *bytes_written)
-{
-	return agilent_82357a_generic_write(board, buffer, length, 0, send_eoi, bytes_written);
-}
-
-int agilent_82357a_command(struct gpib_board *board, u8 *buffer, size_t length,
-			   size_t *bytes_written)
-{
-	return agilent_82357a_generic_write(board, buffer, length, 1, 0, bytes_written);
-}
-
-int agilent_82357a_take_control_internal(struct gpib_board *board, int synchronous)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	write.address = AUXCR;
-	if (synchronous)
-		write.value = AUX_TCS;
-	else
-		write.value = AUX_TCA;
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-
-	return retval;
-}
-
-static int agilent_82357a_take_control(struct gpib_board *board, int synchronous)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	const int timeout = 10;
-	int i;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-
-/*
- * It looks like the 9914 does not handle tcs properly.
- * See comment above tms9914_take_control_workaround() in
- * drivers/gpib/tms9914/tms9914_aux.c
- */
-	if (synchronous)
-		return -ETIMEDOUT;
-
-	agilent_82357a_take_control_internal(board, synchronous);
-	// busy wait until ATN is asserted
-	for (i = 0; i < timeout; ++i) {
-		agilent_82357a_update_status(board, 0);
-		if (test_bit(ATN_NUM, &board->status))
-			break;
-		udelay(1);
-	}
-	if (i == timeout)
-		return -ETIMEDOUT;
-	return 0;
-}
-
-static int agilent_82357a_go_to_standby(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	write.address = AUXCR;
-	write.value = AUX_GTS;
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-	return 0;
-}
-
-static int agilent_82357a_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet writes[2];
-	int retval;
-	int i = 0;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	/* 82357B needs bit to be set in 9914 AUXCR register */
-	writes[i].address = AUXCR;
-	if (request_control) {
-		writes[i].value = AUX_RQC;
-		a_priv->hw_control_bits |= SYSTEM_CONTROLLER;
-	} else {
-		return -EINVAL;
-	}
-	++i;
-	writes[i].address = HW_CONTROL;
-	writes[i].value = a_priv->hw_control_bits;
-	++i;
-	retval = agilent_82357a_write_registers(a_priv, writes, i);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-	return retval;
-}
-
-static void agilent_82357a_interface_clear(struct gpib_board *board, int assert)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return; // -ENODEV;
-
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	write.address = AUXCR;
-	write.value = AUX_SIC;
-	if (assert) {
-		write.value |= AUX_CS;
-		a_priv->is_cic = 1;
-	}
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-}
-
-static void agilent_82357a_remote_enable(struct gpib_board *board, int enable)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return; //-ENODEV;
-
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	write.address = AUXCR;
-	write.value = AUX_SRE;
-	if (enable)
-		write.value |= AUX_CS;
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-	a_priv->ren_state = enable;
-	return;// 0;
-}
-
-static int agilent_82357a_enable_eos(struct gpib_board *board, u8 eos_byte,
-				     int compare_8_bits)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	if (compare_8_bits == 0)
-		return -EOPNOTSUPP;
-
-	a_priv->eos_char = eos_byte;
-	a_priv->eos_mode = REOS | BIN;
-	return 0;
-}
-
-static void agilent_82357a_disable_eos(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-
-	a_priv->eos_mode &= ~REOS;
-}
-
-static unsigned int agilent_82357a_update_status(struct gpib_board *board,
-						 unsigned int clear_mask)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet address_status, bus_status;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	board->status &= ~clear_mask;
-	if (a_priv->is_cic)
-		set_bit(CIC_NUM, &board->status);
-	else
-		clear_bit(CIC_NUM, &board->status);
-	address_status.address = ADSR;
-	retval = agilent_82357a_read_registers(a_priv, &address_status, 1, 0);
-	if (retval) {
-		if (retval != -EAGAIN)
-			dev_err(&usb_dev->dev, "read_registers() returned error\n");
-		return board->status;
-	}
-	// check for remote/local
-	if (address_status.value & HR_REM)
-		set_bit(REM_NUM, &board->status);
-	else
-		clear_bit(REM_NUM, &board->status);
-	// check for lockout
-	if (address_status.value & HR_LLO)
-		set_bit(LOK_NUM, &board->status);
-	else
-		clear_bit(LOK_NUM, &board->status);
-	// check for ATN
-	if (address_status.value & HR_ATN)
-		set_bit(ATN_NUM, &board->status);
-	else
-		clear_bit(ATN_NUM, &board->status);
-	// check for talker/listener addressed
-	if (address_status.value & HR_TA)
-		set_bit(TACS_NUM, &board->status);
-	else
-		clear_bit(TACS_NUM, &board->status);
-	if (address_status.value & HR_LA)
-		set_bit(LACS_NUM, &board->status);
-	else
-		clear_bit(LACS_NUM, &board->status);
-
-	bus_status.address = BSR;
-	retval = agilent_82357a_read_registers(a_priv, &bus_status, 1, 0);
-	if (retval) {
-		if (retval != -EAGAIN)
-			dev_err(&usb_dev->dev, "read_registers() returned error\n");
-		return board->status;
-	}
-	if (bus_status.value & BSR_SRQ_BIT)
-		set_bit(SRQI_NUM, &board->status);
-	else
-		clear_bit(SRQI_NUM, &board->status);
-
-	return board->status;
-}
-
-static int agilent_82357a_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	// put primary address in address0
-	write.address = ADR;
-	write.value = address & ADDRESS_MASK;
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return retval;
-	}
-	return retval;
-}
-
-static int agilent_82357a_secondary_address(struct gpib_board *board,
-					    unsigned int address, int enable)
-{
-	if (enable)
-		return	-EOPNOTSUPP;
-	return 0;
-}
-
-static int agilent_82357a_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet writes[2];
-	struct agilent_82357a_register_pairlet read;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	// execute parallel poll
-	writes[0].address = AUXCR;
-	writes[0].value = AUX_CS | AUX_RPP;
-	writes[1].address = HW_CONTROL;
-	writes[1].value = a_priv->hw_control_bits & ~NOT_PARALLEL_POLL;
-	retval = agilent_82357a_write_registers(a_priv, writes, 2);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return retval;
-	}
-	udelay(2);	// silly, since usb write will take way longer
-	read.address = CPTR;
-	retval = agilent_82357a_read_registers(a_priv, &read, 1, 1);
-	if (retval) {
-		dev_err(&usb_dev->dev, "read_registers() returned error\n");
-		return retval;
-	}
-	*result = read.value;
-	// clear parallel poll state
-	writes[0].address = HW_CONTROL;
-	writes[0].value = a_priv->hw_control_bits | NOT_PARALLEL_POLL;
-	writes[1].address = AUXCR;
-	writes[1].value = AUX_RPP;
-	retval = agilent_82357a_write_registers(a_priv, writes, 2);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return retval;
-	}
-	return 0;
-}
-
-static void agilent_82357a_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	// board can only be system controller
-	return;// 0;
-}
-
-static void agilent_82357a_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	// board can only be system controller
-	return;// 0;
-}
-
-static void agilent_82357a_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	// board can only be system controller
-	return;// 0;
-}
-
-static u8 agilent_82357a_serial_poll_status(struct gpib_board *board)
-{
-	// board can only be system controller
-	return 0;
-}
-
-static void agilent_82357a_return_to_local(struct gpib_board *board)
-{
-	// board can only be system controller
-	return;// 0;
-}
-
-static int agilent_82357a_line_status(const struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet bus_status;
-	int retval;
-	int status = VALID_ALL;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	bus_status.address = BSR;
-	retval = agilent_82357a_read_registers(a_priv, &bus_status, 1, 0);
-	if (retval) {
-		if (retval != -EAGAIN)
-			dev_err(&usb_dev->dev, "read_registers() returned error\n");
-		return retval;
-	}
-	if (bus_status.value & BSR_REN_BIT)
-		status |= BUS_REN;
-	if (bus_status.value & BSR_IFC_BIT)
-		status |= BUS_IFC;
-	if (bus_status.value & BSR_SRQ_BIT)
-		status |= BUS_SRQ;
-	if (bus_status.value & BSR_EOI_BIT)
-		status |= BUS_EOI;
-	if (bus_status.value & BSR_NRFD_BIT)
-		status |= BUS_NRFD;
-	if (bus_status.value & BSR_NDAC_BIT)
-		status |= BUS_NDAC;
-	if (bus_status.value & BSR_DAV_BIT)
-		status |= BUS_DAV;
-	if (bus_status.value & BSR_ATN_BIT)
-		status |= BUS_ATN;
-	return status;
-}
-
-static unsigned short nanosec_to_fast_talker_bits(unsigned int *nanosec)
-{
-	static const int nanosec_per_bit = 21;
-	static const int max_value = 0x72;
-	static const int min_value = 0x11;
-	unsigned short bits;
-
-	bits = (*nanosec + nanosec_per_bit / 2) / nanosec_per_bit;
-	if (bits < min_value)
-		bits = min_value;
-	if (bits > max_value)
-		bits = max_value;
-	*nanosec = bits * nanosec_per_bit;
-	return bits;
-}
-
-static int agilent_82357a_t1_delay(struct gpib_board *board, unsigned int nanosec)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	struct agilent_82357a_register_pairlet write;
-	int retval;
-
-	if (!a_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	write.address = FAST_TALKER_T1;
-	write.value = nanosec_to_fast_talker_bits(&nanosec);
-	retval = agilent_82357a_write_registers(a_priv, &write, 1);
-	if (retval)
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-	return nanosec;
-}
-
-static void agilent_82357a_interrupt_complete(struct urb *urb)
-{
-	struct gpib_board *board = urb->context;
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	int retval;
-	u8 *transfer_buffer = urb->transfer_buffer;
-	unsigned long interrupt_flags;
-
-	switch (urb->status) {
-		/* success */
-	case 0:
-		break;
-		/* unlinked, don't resubmit */
-	case -ECONNRESET:
-	case -ENOENT:
-	case -ESHUTDOWN:
-		return;
-	default: /* other error, resubmit */
-		retval = usb_submit_urb(a_priv->interrupt_urb, GFP_ATOMIC);
-		if (retval)
-			dev_err(&usb_dev->dev, "failed to resubmit interrupt urb\n");
-		return;
-	}
-
-	interrupt_flags = transfer_buffer[0];
-	if (test_bit(AIF_READ_COMPLETE_BN, &interrupt_flags))
-		set_bit(AIF_READ_COMPLETE_BN, &a_priv->interrupt_flags);
-	if (test_bit(AIF_WRITE_COMPLETE_BN, &interrupt_flags))
-		set_bit(AIF_WRITE_COMPLETE_BN, &a_priv->interrupt_flags);
-	if (test_bit(AIF_SRQ_BN, &interrupt_flags))
-		set_bit(SRQI_NUM, &board->status);
-
-	wake_up_interruptible(&board->wait);
-
-	retval = usb_submit_urb(a_priv->interrupt_urb, GFP_ATOMIC);
-	if (retval)
-		dev_err(&usb_dev->dev, "failed to resubmit interrupt urb\n");
-}
-
-static int agilent_82357a_setup_urbs(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev;
-	int int_pipe;
-	int retval;
-
-	retval = mutex_lock_interruptible(&a_priv->interrupt_alloc_lock);
-	if (retval)
-		return retval;
-	if (!a_priv->bus_interface) {
-		retval = -ENODEV;
-		goto setup_exit;
-	}
-
-	a_priv->interrupt_buffer = kmalloc(INTERRUPT_BUF_LEN, GFP_KERNEL);
-	if (!a_priv->interrupt_buffer) {
-		retval = -ENOMEM;
-		goto setup_exit;
-	}
-	a_priv->interrupt_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!a_priv->interrupt_urb) {
-		retval = -ENOMEM;
-		goto setup_exit;
-	}
-	usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	int_pipe = usb_rcvintpipe(usb_dev, a_priv->interrupt_in_endpoint);
-	usb_fill_int_urb(a_priv->interrupt_urb, usb_dev, int_pipe, a_priv->interrupt_buffer,
-			 INTERRUPT_BUF_LEN, &agilent_82357a_interrupt_complete, board, 1);
-	retval = usb_submit_urb(a_priv->interrupt_urb, GFP_KERNEL);
-	if (retval) {
-		usb_free_urb(a_priv->interrupt_urb);
-		a_priv->interrupt_urb = NULL;
-		dev_err(&usb_dev->dev, "failed to submit first interrupt urb, retval=%i\n", retval);
-		goto setup_exit;
-	}
-	mutex_unlock(&a_priv->interrupt_alloc_lock);
-	return 0;
-
-setup_exit:
-	kfree(a_priv->interrupt_buffer);
-	mutex_unlock(&a_priv->interrupt_alloc_lock);
-	return retval;
-}
-
-static void agilent_82357a_cleanup_urbs(struct agilent_82357a_priv *a_priv)
-{
-	if (a_priv && a_priv->bus_interface) {
-		if (a_priv->interrupt_urb)
-			usb_kill_urb(a_priv->interrupt_urb);
-		if (a_priv->bulk_urb)
-			usb_kill_urb(a_priv->bulk_urb);
-	}
-};
-
-static void agilent_82357a_release_urbs(struct agilent_82357a_priv *a_priv)
-{
-	if (a_priv) {
-		usb_free_urb(a_priv->interrupt_urb);
-		a_priv->interrupt_urb = NULL;
-		kfree(a_priv->interrupt_buffer);
-	}
-}
-
-static int agilent_82357a_allocate_private(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv;
-
-	board->private_data = kzalloc(sizeof(struct agilent_82357a_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-	a_priv = board->private_data;
-	mutex_init(&a_priv->bulk_transfer_lock);
-	mutex_init(&a_priv->bulk_alloc_lock);
-	mutex_init(&a_priv->control_alloc_lock);
-	mutex_init(&a_priv->interrupt_alloc_lock);
-	return 0;
-}
-
-static void agilent_82357a_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-#define INIT_NUM_REG_WRITES 18
-static int agilent_82357a_init(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	struct agilent_82357a_register_pairlet hw_control;
-	struct agilent_82357a_register_pairlet writes[INIT_NUM_REG_WRITES];
-	int retval;
-	unsigned int nanosec;
-
-	writes[0].address = LED_CONTROL;
-	writes[0].value = FAIL_LED_ON;
-	writes[1].address = RESET_TO_POWERUP;
-	writes[1].value = RESET_SPACEBALL;
-	retval = agilent_82357a_write_registers(a_priv, writes, 2);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return -EIO;
-	}
-	set_current_state(TASK_INTERRUPTIBLE);
-	if (schedule_timeout(usec_to_jiffies(2000)))
-		return -ERESTARTSYS;
-	writes[0].address = AUXCR;
-	writes[0].value = AUX_NBAF;
-	writes[1].address = AUXCR;
-	writes[1].value = AUX_HLDE;
-	writes[2].address = AUXCR;
-	writes[2].value = AUX_TON;
-	writes[3].address = AUXCR;
-	writes[3].value = AUX_LON;
-	writes[4].address = AUXCR;
-	writes[4].value = AUX_RSV2;
-	writes[5].address = AUXCR;
-	writes[5].value = AUX_INVAL;
-	writes[6].address = AUXCR;
-	writes[6].value = AUX_RPP;
-	writes[7].address = AUXCR;
-	writes[7].value = AUX_STDL;
-	writes[8].address = AUXCR;
-	writes[8].value = AUX_VSTDL;
-	writes[9].address = FAST_TALKER_T1;
-	nanosec = board->t1_nano_sec;
-	writes[9].value = nanosec_to_fast_talker_bits(&nanosec);
-	board->t1_nano_sec = nanosec;
-	writes[10].address = ADR;
-	writes[10].value = board->pad & ADDRESS_MASK;
-	writes[11].address = PPR;
-	writes[11].value = 0;
-	writes[12].address = SPMR;
-	writes[12].value = 0;
-	writes[13].address = PROTOCOL_CONTROL;
-	writes[13].value = WRITE_COMPLETE_INTERRUPT_EN;
-	writes[14].address = IMR0;
-	writes[14].value = HR_BOIE | HR_BIIE;
-	writes[15].address = IMR1;
-	writes[15].value = HR_SRQIE;
-	// turn off reset state
-	writes[16].address = AUXCR;
-	writes[16].value = AUX_CHIP_RESET;
-	writes[17].address = LED_CONTROL;
-	writes[17].value = FIRMWARE_LED_CONTROL;
-	retval = agilent_82357a_write_registers(a_priv, writes, INIT_NUM_REG_WRITES);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return -EIO;
-	}
-	hw_control.address = HW_CONTROL;
-	retval = agilent_82357a_read_registers(a_priv, &hw_control, 1, 1);
-	if (retval) {
-		dev_err(&usb_dev->dev, "read_registers() returned error\n");
-		return -EIO;
-	}
-	a_priv->hw_control_bits = (hw_control.value & ~0x7) | NOT_TI_RESET | NOT_PARALLEL_POLL;
-
-	return 0;
-}
-
-static inline int agilent_82357a_device_match(struct usb_interface *interface,
-					      const struct gpib_board_config *config)
-{
-	struct usb_device * const usbdev = interface_to_usbdev(interface);
-
-	if (gpib_match_device_path(&interface->dev, config->device_path) == 0)
-		return 0;
-	if (config->serial_number &&
-	    strcmp(usbdev->serial, config->serial_number) != 0)
-		return 0;
-
-	return 1;
-}
-
-static int agilent_82357a_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	int retval;
-	int i;
-	unsigned int product_id;
-	struct agilent_82357a_priv *a_priv;
-	struct usb_device *usb_dev;
-
-	if (mutex_lock_interruptible(&agilent_82357a_hotplug_lock))
-		return -ERESTARTSYS;
-
-	retval = agilent_82357a_allocate_private(board);
-	if (retval < 0) {
-		mutex_unlock(&agilent_82357a_hotplug_lock);
-		return retval;
-	}
-	a_priv = board->private_data;
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
-		if (agilent_82357a_driver_interfaces[i] &&
-		    !usb_get_intfdata(agilent_82357a_driver_interfaces[i]) &&
-		    agilent_82357a_device_match(agilent_82357a_driver_interfaces[i], config)) {
-			a_priv->bus_interface = agilent_82357a_driver_interfaces[i];
-			usb_set_intfdata(agilent_82357a_driver_interfaces[i], board);
-			usb_dev = interface_to_usbdev(a_priv->bus_interface);
-			break;
-		}
-	}
-	if (i == MAX_NUM_82357A_INTERFACES) {
-		dev_err(board->gpib_dev,
-			"No supported adapters found, have you loaded its firmware?\n");
-		retval = -ENODEV;
-		goto attach_fail;
-	}
-	product_id = le16_to_cpu(interface_to_usbdev(a_priv->bus_interface)->descriptor.idProduct);
-	switch (product_id) {
-	case USB_DEVICE_ID_AGILENT_82357A:
-		a_priv->bulk_out_endpoint = AGILENT_82357A_BULK_OUT_ENDPOINT;
-		a_priv->interrupt_in_endpoint = AGILENT_82357A_INTERRUPT_IN_ENDPOINT;
-		break;
-	case USB_DEVICE_ID_AGILENT_82357B:
-		a_priv->bulk_out_endpoint = AGILENT_82357B_BULK_OUT_ENDPOINT;
-		a_priv->interrupt_in_endpoint = AGILENT_82357B_INTERRUPT_IN_ENDPOINT;
-		break;
-	default:
-		dev_err(&usb_dev->dev, "bug, unhandled product_id in switch?\n");
-		retval = -EIO;
-		goto attach_fail;
-	}
-
-	retval = agilent_82357a_setup_urbs(board);
-	if (retval < 0)
-		goto attach_fail;
-
-	timer_setup(&a_priv->bulk_timer, agilent_82357a_timeout_handler, 0);
-
-	board->t1_nano_sec = 800;
-
-	retval = agilent_82357a_init(board);
-
-	if (retval < 0)	{
-		agilent_82357a_cleanup_urbs(a_priv);
-		agilent_82357a_release_urbs(a_priv);
-		goto attach_fail;
-	}
-
-	dev_info(&usb_dev->dev, "bus %d dev num %d attached to gpib%d, interface %i\n",
-		 usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-	return retval;
-
-attach_fail:
-	agilent_82357a_free_private(board);
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-	return retval;
-}
-
-static int agilent_82357a_go_idle(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(a_priv->bus_interface);
-	struct agilent_82357a_register_pairlet writes[0x20];
-	int retval;
-
-	// turn on tms9914 reset state
-	writes[0].address = AUXCR;
-	writes[0].value = AUX_CS | AUX_CHIP_RESET;
-	a_priv->hw_control_bits &= ~NOT_TI_RESET;
-	writes[1].address = HW_CONTROL;
-	writes[1].value = a_priv->hw_control_bits;
-	writes[2].address = PROTOCOL_CONTROL;
-	writes[2].value = 0;
-	writes[3].address = IMR0;
-	writes[3].value = 0;
-	writes[4].address = IMR1;
-	writes[4].value = 0;
-	writes[5].address = LED_CONTROL;
-	writes[5].value = 0;
-	retval = agilent_82357a_write_registers(a_priv, writes, 6);
-	if (retval) {
-		dev_err(&usb_dev->dev, "write_registers() returned error\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-static void agilent_82357a_detach(struct gpib_board *board)
-{
-	struct agilent_82357a_priv *a_priv;
-
-	mutex_lock(&agilent_82357a_hotplug_lock);
-
-	a_priv = board->private_data;
-	if (a_priv) {
-		if (a_priv->bus_interface) {
-			agilent_82357a_go_idle(board);
-			usb_set_intfdata(a_priv->bus_interface, NULL);
-		}
-		mutex_lock(&a_priv->control_alloc_lock);
-		mutex_lock(&a_priv->bulk_alloc_lock);
-		mutex_lock(&a_priv->interrupt_alloc_lock);
-		agilent_82357a_cleanup_urbs(a_priv);
-		agilent_82357a_release_urbs(a_priv);
-		agilent_82357a_free_private(board);
-	}
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-}
-
-static struct gpib_interface agilent_82357a_gpib_interface = {
-	.name = "agilent_82357a",
-	.attach = agilent_82357a_attach,
-	.detach = agilent_82357a_detach,
-	.read = agilent_82357a_read,
-	.write = agilent_82357a_write,
-	.command = agilent_82357a_command,
-	.take_control = agilent_82357a_take_control,
-	.go_to_standby = agilent_82357a_go_to_standby,
-	.request_system_control = agilent_82357a_request_system_control,
-	.interface_clear = agilent_82357a_interface_clear,
-	.remote_enable = agilent_82357a_remote_enable,
-	.enable_eos = agilent_82357a_enable_eos,
-	.disable_eos = agilent_82357a_disable_eos,
-	.parallel_poll = agilent_82357a_parallel_poll,
-	.parallel_poll_configure = agilent_82357a_parallel_poll_configure,
-	.parallel_poll_response = agilent_82357a_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = agilent_82357a_line_status,
-	.update_status = agilent_82357a_update_status,
-	.primary_address = agilent_82357a_primary_address,
-	.secondary_address = agilent_82357a_secondary_address,
-	.serial_poll_response = agilent_82357a_serial_poll_response,
-	.serial_poll_status = agilent_82357a_serial_poll_status,
-	.t1_delay = agilent_82357a_t1_delay,
-	.return_to_local = agilent_82357a_return_to_local,
-	.no_7_bit_eos = 1,
-	.skip_check_for_command_acceptors = 1
-};
-
-// Table with the USB-devices: just now only testing IDs
-static struct usb_device_id agilent_82357a_driver_device_table[] = {
-	{USB_DEVICE(USB_VENDOR_ID_AGILENT, USB_DEVICE_ID_AGILENT_82357A)},
-	{USB_DEVICE(USB_VENDOR_ID_AGILENT, USB_DEVICE_ID_AGILENT_82357B)},
-	{} /* Terminating entry */
-};
-MODULE_DEVICE_TABLE(usb, agilent_82357a_driver_device_table);
-
-static int agilent_82357a_driver_probe(struct usb_interface *interface,
-				       const struct usb_device_id *id)
-{
-	int i;
-	char *path;
-	static const int path_length = 1024;
-	struct usb_device *usb_dev;
-
-	if (mutex_lock_interruptible(&agilent_82357a_hotplug_lock))
-		return -ERESTARTSYS;
-	usb_dev = usb_get_dev(interface_to_usbdev(interface));
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
-		if (!agilent_82357a_driver_interfaces[i]) {
-			agilent_82357a_driver_interfaces[i] = interface;
-			usb_set_intfdata(interface, NULL);
-			dev_dbg(&usb_dev->dev, "set bus interface %i to address 0x%p\n",
-				i, interface);
-			break;
-		}
-	}
-	if (i == MAX_NUM_82357A_INTERFACES) {
-		usb_put_dev(usb_dev);
-		mutex_unlock(&agilent_82357a_hotplug_lock);
-		dev_err(&usb_dev->dev, "out of space in agilent_82357a_driver_interfaces[]\n");
-		return -1;
-	}
-	path = kmalloc(path_length, GFP_KERNEL);
-	if (!path) {
-		usb_put_dev(usb_dev);
-		mutex_unlock(&agilent_82357a_hotplug_lock);
-		return -ENOMEM;
-	}
-	usb_make_path(usb_dev, path, path_length);
-	dev_info(&usb_dev->dev, "probe succeeded for path: %s\n", path);
-	kfree(path);
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-	return 0;
-}
-
-static void agilent_82357a_driver_disconnect(struct usb_interface *interface)
-{
-	int i;
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-
-	mutex_lock(&agilent_82357a_hotplug_lock);
-
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
-		if (agilent_82357a_driver_interfaces[i] == interface) {
-			struct gpib_board *board = usb_get_intfdata(interface);
-
-			if (board) {
-				struct agilent_82357a_priv *a_priv = board->private_data;
-
-				if (a_priv) {
-					mutex_lock(&a_priv->control_alloc_lock);
-					mutex_lock(&a_priv->bulk_alloc_lock);
-					mutex_lock(&a_priv->interrupt_alloc_lock);
-					agilent_82357a_cleanup_urbs(a_priv);
-					a_priv->bus_interface = NULL;
-					mutex_unlock(&a_priv->interrupt_alloc_lock);
-					mutex_unlock(&a_priv->bulk_alloc_lock);
-					mutex_unlock(&a_priv->control_alloc_lock);
-				}
-			}
-			agilent_82357a_driver_interfaces[i] = NULL;
-			break;
-		}
-	}
-	if (i == MAX_NUM_82357A_INTERFACES)
-		dev_err(&usb_dev->dev, "unable to find interface - bug?\n");
-	usb_put_dev(usb_dev);
-
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-}
-
-static int agilent_82357a_driver_suspend(struct usb_interface *interface, pm_message_t message)
-{
-	int i, retval;
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-
-	mutex_lock(&agilent_82357a_hotplug_lock);
-
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i) {
-		if (agilent_82357a_driver_interfaces[i] == interface)	{
-			struct gpib_board *board = usb_get_intfdata(interface);
-
-			if (board) {
-				struct agilent_82357a_priv *a_priv = board->private_data;
-
-				if (a_priv) {
-					agilent_82357a_abort(a_priv, 0);
-					agilent_82357a_abort(a_priv, 0);
-					retval = agilent_82357a_go_idle(board);
-					if (retval) {
-						dev_err(&usb_dev->dev, "failed to go idle, retval=%i\n",
-							retval);
-						mutex_unlock(&agilent_82357a_hotplug_lock);
-						return retval;
-					}
-					mutex_lock(&a_priv->interrupt_alloc_lock);
-					agilent_82357a_cleanup_urbs(a_priv);
-					mutex_unlock(&a_priv->interrupt_alloc_lock);
-					dev_dbg(&usb_dev->dev,
-						"bus %d dev num %d gpib %d, interface %i suspended\n",
-						usb_dev->bus->busnum, usb_dev->devnum,
-						board->minor, i);
-				}
-			}
-			break;
-		}
-	}
-
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-
-	return 0;
-}
-
-static int agilent_82357a_driver_resume(struct usb_interface *interface)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-	struct gpib_board *board;
-	int i, retval = 0;
-
-	mutex_lock(&agilent_82357a_hotplug_lock);
-
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i)	{
-		if (agilent_82357a_driver_interfaces[i] == interface) {
-			board = usb_get_intfdata(interface);
-			if (board)
-				break;
-		}
-	}
-	if (i == MAX_NUM_82357A_INTERFACES) {
-		retval = -ENOENT;
-		goto resume_exit;
-	}
-
-	struct agilent_82357a_priv *a_priv = board->private_data;
-
-	if (a_priv) {
-		if (a_priv->interrupt_urb) {
-			mutex_lock(&a_priv->interrupt_alloc_lock);
-			retval = usb_submit_urb(a_priv->interrupt_urb, GFP_KERNEL);
-			if (retval) {
-				dev_err(&usb_dev->dev, "failed to resubmit interrupt urb in resume, retval=%i\n",
-					retval);
-				mutex_unlock(&a_priv->interrupt_alloc_lock);
-				mutex_unlock(&agilent_82357a_hotplug_lock);
-				return retval;
-			}
-			mutex_unlock(&a_priv->interrupt_alloc_lock);
-		}
-		retval = agilent_82357a_init(board);
-		if (retval < 0) {
-			mutex_unlock(&agilent_82357a_hotplug_lock);
-			return retval;
-		}
-		// set/unset system controller
-		retval = agilent_82357a_request_system_control(board, board->master);
-		// toggle ifc if master
-		if (board->master) {
-			agilent_82357a_interface_clear(board, 1);
-			usleep_range(200, 250);
-			agilent_82357a_interface_clear(board, 0);
-		}
-		// assert/unassert REN
-		agilent_82357a_remote_enable(board, a_priv->ren_state);
-
-		dev_dbg(&usb_dev->dev,
-			"bus %d dev num %d gpib%d, interface %i resumed\n",
-			usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
-	}
-
-resume_exit:
-	mutex_unlock(&agilent_82357a_hotplug_lock);
-
-	return retval;
-}
-
-static struct usb_driver agilent_82357a_bus_driver = {
-	.name = DRV_NAME,
-	.probe = agilent_82357a_driver_probe,
-	.disconnect = agilent_82357a_driver_disconnect,
-	.suspend = agilent_82357a_driver_suspend,
-	.resume = agilent_82357a_driver_resume,
-	.id_table = agilent_82357a_driver_device_table,
-};
-
-static int __init agilent_82357a_init_module(void)
-{
-	int i;
-	int ret;
-
-	for (i = 0; i < MAX_NUM_82357A_INTERFACES; ++i)
-		agilent_82357a_driver_interfaces[i] = NULL;
-
-	ret = usb_register(&agilent_82357a_bus_driver);
-	if (ret) {
-		pr_err("usb_register failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&agilent_82357a_gpib_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		usb_deregister(&agilent_82357a_bus_driver);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void __exit agilent_82357a_exit_module(void)
-{
-	gpib_unregister_driver(&agilent_82357a_gpib_interface);
-	usb_deregister(&agilent_82357a_bus_driver);
-}
-
-module_init(agilent_82357a_init_module);
-module_exit(agilent_82357a_exit_module);
diff --git a/drivers/staging/gpib/agilent_82357a/agilent_82357a.h b/drivers/staging/gpib/agilent_82357a/agilent_82357a.h
deleted file mode 100644
index 33ac558e5552..000000000000
--- a/drivers/staging/gpib/agilent_82357a/agilent_82357a.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *   copyright            : (C) 2004 by Frank Mori Hess                    *
- ***************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/mutex.h>
-#include <linux/completion.h>
-#include <linux/usb.h>
-#include <linux/timer.h>
-#include <linux/compiler_attributes.h>
-#include "gpibP.h"
-#include "tms9914.h"
-
-enum usb_vendor_ids {
-	USB_VENDOR_ID_AGILENT = 0x0957
-};
-
-enum usb_device_ids {
-	USB_DEVICE_ID_AGILENT_82357A = 0x0107,
-	USB_DEVICE_ID_AGILENT_82357A_PREINIT = 0x0007,	// device id before firmware is loaded
-	USB_DEVICE_ID_AGILENT_82357B = 0x0718,		// device id before firmware is loaded
-	USB_DEVICE_ID_AGILENT_82357B_PREINIT = 0x0518,	// device id before firmware is loaded
-};
-
-enum endpoint_addresses {
-	AGILENT_82357_CONTROL_ENDPOINT = 0x0,
-	AGILENT_82357_BULK_IN_ENDPOINT = 0x2,
-	AGILENT_82357A_BULK_OUT_ENDPOINT = 0x4,
-	AGILENT_82357A_INTERRUPT_IN_ENDPOINT = 0x6,
-	AGILENT_82357B_BULK_OUT_ENDPOINT = 0x6,
-	AGILENT_82357B_INTERRUPT_IN_ENDPOINT = 0x8,
-};
-
-enum bulk_commands {
-	DATA_PIPE_CMD_WRITE = 0x1,
-	DATA_PIPE_CMD_READ = 0x3,
-	DATA_PIPE_CMD_WR_REGS = 0x4,
-	DATA_PIPE_CMD_RD_REGS = 0x5
-};
-
-enum agilent_82357a_read_flags {
-	ARF_END_ON_EOI = 0x1,
-	ARF_NO_ADDRESS = 0x2,
-	ARF_END_ON_EOS_CHAR = 0x4,
-	ARF_SPOLL = 0x8
-};
-
-enum agilent_82357a_trailing_read_flags {
-	ATRF_EOI = 0x1,
-	ATRF_ATN = 0x2,
-	ATRF_IFC = 0x4,
-	ATRF_EOS = 0x8,
-	ATRF_ABORT = 0x10,
-	ATRF_COUNT = 0x20,
-	ATRF_DEAD_BUS = 0x40,
-	ATRF_UNADDRESSED = 0x80
-};
-
-enum agilent_82357a_write_flags {
-	AWF_SEND_EOI = 0x1,
-	AWF_NO_FAST_TALKER_FIRST_BYTE = 0x2,
-	AWF_NO_FAST_TALKER = 0x4,
-	AWF_NO_ADDRESS = 0x8,
-	AWF_ATN = 0x10,
-	AWF_SEPARATE_HEADER = 0x80
-};
-
-enum agilent_82357a_interrupt_flag_bit_numbers {
-	AIF_SRQ_BN = 0,
-	AIF_WRITE_COMPLETE_BN = 1,
-	AIF_READ_COMPLETE_BN = 2,
-};
-
-enum agilent_82357_error_codes {
-	UGP_SUCCESS = 0,
-	UGP_ERR_INVALID_CMD = 1,
-	UGP_ERR_INVALID_PARAM = 2,
-	UGP_ERR_INVALID_REG = 3,
-	UGP_ERR_GPIB_READ = 4,
-	UGP_ERR_GPIB_WRITE = 5,
-	UGP_ERR_FLUSHING = 6,
-	UGP_ERR_FLUSHING_ALREADY = 7,
-	UGP_ERR_UNSUPPORTED = 8,
-	UGP_ERR_OTHER  = 9
-};
-
-enum agilent_82357_control_values {
-	XFER_ABORT = 0xa0,
-	XFER_STATUS = 0xb0,
-};
-
-enum xfer_status_bits {
-	XS_COMPLETED = 0x1,
-	XS_READ = 0x2,
-};
-
-enum xfer_status_completion_bits {
-	XSC_EOI = 0x1,
-	XSC_ATN = 0x2,
-	XSC_IFC = 0x4,
-	XSC_EOS = 0x8,
-	XSC_ABORT = 0x10,
-	XSC_COUNT = 0x20,
-	XSC_DEAD_BUS = 0x40,
-	XSC_BUS_NOT_ADDRESSED = 0x80
-};
-
-enum xfer_abort_type {
-	XA_FLUSH = 0x1
-};
-
-#define STATUS_DATA_LEN 8
-#define INTERRUPT_BUF_LEN 8
-
-struct agilent_82357a_urb_ctx {
-	struct completion complete;
-	unsigned timed_out : 1;
-};
-
-// struct which defines local data for each 82357 device
-struct agilent_82357a_priv {
-	struct usb_interface *bus_interface;
-	unsigned short eos_char;
-	unsigned short eos_mode;
-	unsigned short hw_control_bits;
-	unsigned long interrupt_flags;
-	struct urb *bulk_urb;
-	struct urb *interrupt_urb;
-	u8 *interrupt_buffer;
-	struct mutex bulk_transfer_lock;	// bulk transfer lock
-	struct mutex bulk_alloc_lock;		// bulk transfer allocation lock
-	struct mutex interrupt_alloc_lock;	// interrupt allocation lock
-	struct mutex control_alloc_lock;	// control message allocation lock
-	struct timer_list bulk_timer;
-	struct agilent_82357a_urb_ctx context;
-	unsigned int bulk_out_endpoint;
-	unsigned int interrupt_in_endpoint;
-	unsigned is_cic : 1;
-	unsigned ren_state : 1;
-};
-
-struct agilent_82357a_register_pairlet {
-	short address;
-	unsigned short value;
-};
-
-enum firmware_registers {
-	HW_CONTROL = 0xa,
-	LED_CONTROL = 0xb,
-	RESET_TO_POWERUP = 0xc,
-	PROTOCOL_CONTROL = 0xd,
-	FAST_TALKER_T1 = 0xe
-};
-
-enum hardware_control_bits {
-	NOT_TI_RESET = 0x1,
-	SYSTEM_CONTROLLER = 0x2,
-	NOT_PARALLEL_POLL = 0x4,
-	OSCILLATOR_5V_ON = 0x8,
-	OUTPUT_5V_ON = 0x20,
-	CPLD_3V_ON = 0x80,
-};
-
-enum led_control_bits {
-	FIRMWARE_LED_CONTROL = 0x1,
-	FAIL_LED_ON = 0x20,
-	READY_LED_ON = 0x40,
-	ACCESS_LED_ON = 0x80
-};
-
-enum reset_to_powerup_bits {
-	RESET_SPACEBALL = 0x1,	// wait 2 millisec after sending
-};
-
-enum protocol_control_bits {
-	WRITE_COMPLETE_INTERRUPT_EN = 0x1,
-};
-
-static const int agilent_82357a_control_request = 0x4;
-
diff --git a/drivers/staging/gpib/cb7210/Makefile b/drivers/staging/gpib/cb7210/Makefile
deleted file mode 100644
index d239ae80b415..000000000000
--- a/drivers/staging/gpib/cb7210/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GPIB_CB7210) += cb7210.o
-
-
diff --git a/drivers/staging/gpib/cb7210/cb7210.c b/drivers/staging/gpib/cb7210/cb7210.c
deleted file mode 100644
index 3e2397898a9b..000000000000
--- a/drivers/staging/gpib/cb7210/cb7210.c
+++ /dev/null
@@ -1,1598 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * Measurement Computing boards using cb7210.2 and cbi488.2 chips
- *    copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "cb7210.h"
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/dma.h>
-#include <linux/bitops.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include "gpib_pci_ids.h"
-#include "quancom_pci.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver Measurement Computing boards using cb7210.2 and cbi488.2");
-
-static int cb7210_read(struct gpib_board *board, u8 *buffer, size_t length,
-		       int *end, size_t *bytes_read);
-
-	static inline int have_fifo_word(const struct cb7210_priv *cb_priv)
-{
-	if (((cb7210_read_byte(cb_priv, HS_STATUS)) &
-	     (HS_RX_MSB_NOT_EMPTY | HS_RX_LSB_NOT_EMPTY)) ==
-	    (HS_RX_MSB_NOT_EMPTY | HS_RX_LSB_NOT_EMPTY))
-		return 1;
-	else
-		return 0;
-}
-
-static inline void input_fifo_enable(struct gpib_board *board, int enable)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	if (enable) {
-		cb_priv->in_fifo_half_full = 0;
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-
-		cb7210_write_byte(cb_priv, HS_RX_ENABLE | HS_TX_ENABLE | HS_CLR_SRQ_INT |
-				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT | cb_priv->hs_mode_bits,
-				  HS_MODE);
-
-		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-
-		cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
-
-		cb_priv->hs_mode_bits |= HS_RX_ENABLE;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-	} else {
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-
-		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, nec7210_iobase(cb_priv) +
-				  HS_MODE);
-
-		clear_bit(READ_READY_BN, &nec_priv->state);
-	}
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-static int fifo_read(struct gpib_board *board, struct cb7210_priv *cb_priv, u8 *buffer,
-		     size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	int hs_status;
-	u16 word;
-	unsigned long flags;
-
-	*bytes_read = 0;
-	if (cb_priv->fifo_iobase == 0)	{
-		dev_err(board->gpib_dev, "fifo iobase is zero!\n");
-		return -EIO;
-	}
-	*end = 0;
-	if (length <= cb7210_fifo_size)	{
-		dev_err(board->gpib_dev, " bug! fifo read length < fifo size\n");
-		return -EINVAL;
-	}
-
-	input_fifo_enable(board, 1);
-
-	while (*bytes_read + cb7210_fifo_size < length)	{
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
-
-		if (wait_event_interruptible(board->wait,
-					     (cb_priv->in_fifo_half_full &&
-					      have_fifo_word(cb_priv)) ||
-					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			retval = -ERESTARTSYS;
-			nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-			break;
-		}
-
-		spin_lock_irqsave(&board->spinlock, flags);
-
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-
-		while (have_fifo_word(cb_priv))	{
-			word = inw(cb_priv->fifo_iobase + DIR);
-			buffer[(*bytes_read)++] = word & 0xff;
-			buffer[(*bytes_read)++] = (word >> 8) & 0xff;
-		}
-
-		cb_priv->in_fifo_half_full = 0;
-
-		hs_status = cb7210_read_byte(cb_priv, HS_STATUS);
-
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state)) {
-			*end = 1;
-			break;
-		}
-		if (hs_status & HS_FIFO_FULL)
-			break;
-		if (test_bit(TIMO_NUM, &board->status))	{
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &nec_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-	}
-	hs_status = cb7210_read_byte(cb_priv, HS_STATUS);
-	if (hs_status & HS_RX_LSB_NOT_EMPTY) {
-		word = inw(cb_priv->fifo_iobase + DIR);
-		buffer[(*bytes_read)++] = word & 0xff;
-	}
-
-	input_fifo_enable(board, 0);
-
-	if (wait_event_interruptible(board->wait,
-				     test_bit(READ_READY_BN, &nec_priv->state) ||
-				     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status))) {
-		retval = -ERESTARTSYS;
-	}
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	if (test_bit(READ_READY_BN, &nec_priv->state)) {
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDA);
-		buffer[(*bytes_read)++] = nec7210_read_data_in(board, nec_priv, end);
-	}
-
-	return retval;
-}
-
-static int cb7210_accel_read(struct gpib_board *board, u8 *buffer,
-			     size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval;
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	size_t num_bytes;
-
-	*bytes_read = 0;
-	// deal with limitations of fifo
-	if (length < cb7210_fifo_size + 3 || (nec_priv->auxa_bits & HR_REOS))
-		return cb7210_read(board, buffer, length, end, bytes_read);
-	*end = 0;
-
-	nec7210_release_rfd_holdoff(board, nec_priv);
-
-	if (wait_event_interruptible(board->wait,
-				     test_bit(READ_READY_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status))) {
-		return -ERESTARTSYS;
-	}
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		return -EINTR;
-
-	nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
-	buffer[(*bytes_read)++] = nec7210_read_data_in(board, nec_priv, end);
-	if (*end)
-		return 0;
-
-	nec7210_release_rfd_holdoff(board, nec_priv);
-
-	retval = fifo_read(board, cb_priv, &buffer[*bytes_read], length - *bytes_read - 1,
-			   end, &num_bytes);
-	*bytes_read += num_bytes;
-	if (retval < 0)
-		return retval;
-	if (*end)
-		return 0;
-
-	retval = cb7210_read(board, &buffer[*bytes_read], 1, end, &num_bytes);
-	*bytes_read += num_bytes;
-	if (retval < 0)
-		return retval;
-
-	return 0;
-}
-
-static int output_fifo_empty(const struct cb7210_priv *cb_priv)
-{
-	if ((cb7210_read_byte(cb_priv, HS_STATUS) & (HS_TX_MSB_NOT_EMPTY | HS_TX_LSB_NOT_EMPTY))
-	    == 0)
-		return 1;
-	else
-		return 0;
-}
-
-static inline void output_fifo_enable(struct gpib_board *board, int enable)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	if (enable) {
-		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
-
-		cb7210_write_byte(cb_priv, HS_RX_ENABLE | HS_TX_ENABLE | HS_CLR_SRQ_INT |
-				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT | cb_priv->hs_mode_bits,
-				  HS_MODE);
-
-		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
-		cb_priv->hs_mode_bits |= HS_TX_ENABLE;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-
-		cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
-
-		clear_bit(WRITE_READY_BN, &nec_priv->state);
-
-	} else {
-		cb_priv->hs_mode_bits &= ~HS_ENABLE_MASK;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
-		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, HR_DOIE);
-	}
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-static int fifo_write(struct gpib_board *board, u8 *buffer, size_t length,
-		      size_t *bytes_written)
-{
-	size_t count = 0;
-	ssize_t retval = 0;
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned int num_bytes, i;
-	unsigned long flags;
-
-	*bytes_written = 0;
-	if (cb_priv->fifo_iobase == 0) {
-		dev_err(board->gpib_dev, "fifo iobase is zero!\n");
-		return -EINVAL;
-	}
-	if (length == 0)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
-	clear_bit(BUS_ERROR_BN, &nec_priv->state);
-
-	output_fifo_enable(board, 1);
-
-	while (count < length) {
-		// wait until byte is ready to be sent
-		if (wait_event_interruptible(board->wait,
-					     cb_priv->out_fifo_half_empty ||
-					     output_fifo_empty(cb_priv) ||
-					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		if (test_bit(TIMO_NUM, &board->status) ||
-		    test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-		    test_bit(BUS_ERROR_BN, &nec_priv->state))
-			break;
-
-		if (output_fifo_empty(cb_priv))
-			num_bytes = cb7210_fifo_size - cb7210_fifo_width;
-		else
-			num_bytes = cb7210_fifo_size / 2;
-		if (num_bytes + count > length)
-			num_bytes = length - count;
-		if (num_bytes % cb7210_fifo_width) {
-			dev_err(board->gpib_dev, " bug! fifo write with odd number of bytes\n");
-			retval = -EINVAL;
-			break;
-		}
-
-		spin_lock_irqsave(&board->spinlock, flags);
-		for (i = 0; i < num_bytes / cb7210_fifo_width; i++) {
-			u16 word;
-
-			word = buffer[count++] & 0xff;
-			word |= (buffer[count++] << 8) & 0xff00;
-			outw(word, cb_priv->fifo_iobase + CDOR);
-		}
-		cb_priv->out_fifo_half_empty = 0;
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits |
-				  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT, HS_MODE);
-		cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-	}
-	// wait last byte has been sent
-	if (wait_event_interruptible(board->wait,
-				     output_fifo_empty(cb_priv) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status))) {
-		retval = -ERESTARTSYS;
-	}
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_bit(BUS_ERROR_BN, &nec_priv->state))
-		retval = -EIO;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-
-	output_fifo_enable(board, 0);
-
-	*bytes_written = count;
-	return retval;
-}
-
-static int cb7210_accel_write(struct gpib_board *board, u8 *buffer,
-			      size_t length, int send_eoi, size_t *bytes_written)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned long fast_chunk_size, leftover;
-	int retval;
-	size_t num_bytes;
-
-	*bytes_written = 0;
-	if (length > cb7210_fifo_width)
-		fast_chunk_size = length - 1;
-	else
-		fast_chunk_size = 0;
-	fast_chunk_size -= fast_chunk_size % cb7210_fifo_width;
-	leftover = length - fast_chunk_size;
-
-	retval = fifo_write(board, buffer, fast_chunk_size, &num_bytes);
-	*bytes_written += num_bytes;
-	if (retval < 0)
-		return retval;
-
-	retval = nec7210_write(board, nec_priv, buffer + fast_chunk_size, leftover,
-			       send_eoi, &num_bytes);
-	*bytes_written += num_bytes;
-	return retval;
-}
-
-static int cb7210_line_status(const struct gpib_board *board)
-{
-	int status = VALID_ALL;
-	int bsr_bits;
-	struct cb7210_priv *cb_priv;
-
-	cb_priv = board->private_data;
-
-	bsr_bits = cb7210_paged_read_byte(cb_priv, BUS_STATUS, BUS_STATUS_PAGE);
-
-	if ((bsr_bits & BSR_REN_BIT) == 0)
-		status |= BUS_REN;
-	if ((bsr_bits & BSR_IFC_BIT) == 0)
-		status |= BUS_IFC;
-	if ((bsr_bits & BSR_SRQ_BIT) == 0)
-		status |= BUS_SRQ;
-	if ((bsr_bits & BSR_EOI_BIT) == 0)
-		status |= BUS_EOI;
-	if ((bsr_bits & BSR_NRFD_BIT) == 0)
-		status |= BUS_NRFD;
-	if ((bsr_bits & BSR_NDAC_BIT) == 0)
-		status |= BUS_NDAC;
-	if ((bsr_bits & BSR_DAV_BIT) == 0)
-		status |= BUS_DAV;
-	if ((bsr_bits & BSR_ATN_BIT) == 0)
-		status |= BUS_ATN;
-
-	return status;
-}
-
-static int cb7210_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned int retval;
-
-	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
-
-	if (nano_sec <= 350) {
-		write_byte(nec_priv, AUX_HI_SPEED, AUXMR);
-		retval = 350;
-	} else {
-		write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
-	}
-	return retval;
-}
-
-static irqreturn_t cb7210_locked_internal_interrupt(struct gpib_board *board);
-
-/*
- * GPIB interrupt service routines
- */
-
-static irqreturn_t cb_pci_interrupt(int irq, void *arg)
-{
-	int bits;
-	struct gpib_board *board = arg;
-	struct cb7210_priv *priv = board->private_data;
-
-	// first task check if this is really our interrupt in a shared irq environment
-	switch (priv->pci_chip)	{
-	case PCI_CHIP_AMCC_S5933:
-		if ((inl(priv->amcc_iobase + INTCSR_REG) &
-		     (INBOX_INTR_CS_BIT | INTR_ASSERTED_BIT)) == 0)
-			return IRQ_NONE;
-
-		// read incoming mailbox to clear mailbox full flag
-		inl(priv->amcc_iobase + INCOMING_MAILBOX_REG(3));
-		// clear amccs5933 interrupt
-		bits = INBOX_FULL_INTR_BIT | INBOX_BYTE_BITS(3) |
-			INBOX_SELECT_BITS(3) |	INBOX_INTR_CS_BIT;
-		outl(bits, priv->amcc_iobase + INTCSR_REG);
-		break;
-	case PCI_CHIP_QUANCOM:
-		if ((inb(nec7210_iobase(priv) + QUANCOM_IRQ_CONTROL_STATUS_REG) &
-		     QUANCOM_IRQ_ASSERTED_BIT))
-			outb(QUANCOM_IRQ_ENABLE_BIT, nec7210_iobase(priv) +
-			     QUANCOM_IRQ_CONTROL_STATUS_REG);
-		break;
-	default:
-		break;
-	}
-	return cb7210_locked_internal_interrupt(arg);
-}
-
-static irqreturn_t cb7210_internal_interrupt(struct gpib_board *board)
-{
-	int hs_status, status1, status2;
-	struct cb7210_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-	int clear_bits;
-
-	if ((priv->hs_mode_bits & HS_ENABLE_MASK)) {
-		status1 = 0;
-		hs_status = cb7210_read_byte(priv, HS_STATUS);
-	} else {
-		hs_status = 0;
-		status1 = read_byte(nec_priv, ISR1);
-	}
-	status2 = read_byte(nec_priv, ISR2);
-	nec7210_interrupt_have_status(board, nec_priv, status1, status2);
-
-	dev_dbg(board->gpib_dev, "status 0x%x, mode 0x%x\n", hs_status, priv->hs_mode_bits);
-
-	clear_bits = 0;
-
-	if (hs_status & HS_HALF_FULL) {
-		if (priv->hs_mode_bits & HS_TX_ENABLE)
-			priv->out_fifo_half_empty = 1;
-		else if (priv->hs_mode_bits & HS_RX_ENABLE)
-			priv->in_fifo_half_full = 1;
-		clear_bits |= HS_CLR_HF_INT;
-	}
-
-	if (hs_status & HS_SRQ_INT) {
-		set_bit(SRQI_NUM, &board->status);
-		clear_bits |= HS_CLR_SRQ_INT;
-	}
-
-	if ((hs_status & HS_EOI_INT)) {
-		clear_bits |= HS_CLR_EOI_EMPTY_INT;
-		set_bit(RECEIVED_END_BN, &nec_priv->state);
-		if ((nec_priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDE)
-			set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-	}
-
-	if ((priv->hs_mode_bits & HS_TX_ENABLE) &&
-	    (hs_status & (HS_TX_MSB_NOT_EMPTY | HS_TX_LSB_NOT_EMPTY)) == 0)
-		clear_bits |= HS_CLR_EOI_EMPTY_INT;
-
-	if (clear_bits) {
-		cb7210_write_byte(priv, priv->hs_mode_bits | clear_bits, HS_MODE);
-		cb7210_write_byte(priv, priv->hs_mode_bits, HS_MODE);
-		wake_up_interruptible(&board->wait);
-	}
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t cb7210_locked_internal_interrupt(struct gpib_board *board)
-{
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = cb7210_internal_interrupt(board);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-static irqreturn_t cb7210_interrupt(int irq, void *arg)
-{
-	return cb7210_internal_interrupt(arg);
-}
-
-static int cb_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static int cb_isa_attach(struct gpib_board *board, const struct gpib_board_config *config);
-
-static void cb_pci_detach(struct gpib_board *board);
-static void cb_isa_detach(struct gpib_board *board);
-
-// wrappers for interface functions
-static int cb7210_read(struct gpib_board *board, u8 *buffer, size_t length,
-		       int *end, size_t *bytes_read)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-}
-
-static int cb7210_write(struct gpib_board *board, u8 *buffer, size_t length,
-			int send_eoi, size_t *bytes_written)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int cb7210_command(struct gpib_board *board, u8 *buffer, size_t length,
-			  size_t *bytes_written)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int cb7210_take_control(struct gpib_board *board, int synchronous)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int cb7210_go_to_standby(struct gpib_board *board)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int cb7210_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct cb7210_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-
-	if (request_control)
-		priv->hs_mode_bits |= HS_SYS_CONTROL;
-	else
-		priv->hs_mode_bits &= ~HS_SYS_CONTROL;
-
-	cb7210_write_byte(priv, priv->hs_mode_bits, HS_MODE);
-	return nec7210_request_system_control(board, nec_priv, request_control);
-}
-
-static void cb7210_interface_clear(struct gpib_board *board, int assert)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void cb7210_remote_enable(struct gpib_board *board, int enable)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int cb7210_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void cb7210_disable_eos(struct gpib_board *board)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int cb7210_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
-}
-
-static int cb7210_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int cb7210_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int cb7210_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
-}
-
-static void cb7210_parallel_poll_configure(struct gpib_board *board, u8 configuration)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, configuration);
-}
-
-static void cb7210_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-static void cb7210_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
-}
-
-static u8 cb7210_serial_poll_status(struct gpib_board *board)
-{
-	struct cb7210_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static void cb7210_return_to_local(struct gpib_board *board)
-{
-	struct cb7210_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-
-	write_byte(nec_priv, AUX_RTL2, AUXMR);
-	udelay(1);
-	write_byte(nec_priv, AUX_RTL, AUXMR);
-}
-
-static struct gpib_interface cb_pci_unaccel_interface = {
-	.name = "cbi_pci_unaccel",
-	.attach = cb_pci_attach,
-	.detach = cb_pci_detach,
-	.read = cb7210_read,
-	.write = cb7210_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_pci_accel_interface = {
-	.name = "cbi_pci_accel",
-	.attach = cb_pci_attach,
-	.detach = cb_pci_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_pci_interface = {
-	.name = "cbi_pci",
-	.attach = cb_pci_attach,
-	.detach = cb_pci_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_isa_unaccel_interface = {
-	.name = "cbi_isa_unaccel",
-	.attach = cb_isa_attach,
-	.detach = cb_isa_detach,
-	.read = cb7210_read,
-	.write = cb7210_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_isa_interface = {
-	.name = "cbi_isa",
-	.attach = cb_isa_attach,
-	.detach = cb_isa_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_isa_accel_interface = {
-	.name = "cbi_isa_accel",
-	.attach = cb_isa_attach,
-	.detach = cb_isa_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static int cb7210_allocate_private(struct gpib_board *board)
-{
-	struct cb7210_priv *priv;
-
-	board->private_data = kmalloc(sizeof(struct cb7210_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-	priv = board->private_data;
-	memset(priv, 0, sizeof(struct cb7210_priv));
-	init_nec7210_private(&priv->nec7210_priv);
-	return 0;
-}
-
-static void cb7210_generic_detach(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-// generic part of attach functions shared by all cb7210 boards
-static int cb7210_generic_attach(struct gpib_board *board)
-{
-	struct cb7210_priv *cb_priv;
-	struct nec7210_priv *nec_priv;
-
-	board->status = 0;
-
-	if (cb7210_allocate_private(board))
-		return -ENOMEM;
-	cb_priv = board->private_data;
-	nec_priv = &cb_priv->nec7210_priv;
-	nec_priv->read_byte = nec7210_locking_ioport_read_byte;
-	nec_priv->write_byte = nec7210_locking_ioport_write_byte;
-	nec_priv->offset = cb7210_reg_offset;
-	nec_priv->type = CB7210;
-	return 0;
-}
-
-static int cb7210_init(struct cb7210_priv *cb_priv, struct gpib_board *board)
-{
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-
-	cb7210_write_byte(cb_priv, HS_RESET7210, HS_INT_LEVEL);
-	cb7210_write_byte(cb_priv, irq_bits(cb_priv->irq), HS_INT_LEVEL);
-
-	nec7210_board_reset(nec_priv, board);
-	cb7210_write_byte(cb_priv, HS_TX_ENABLE | HS_RX_ENABLE | HS_CLR_SRQ_INT |
-			  HS_CLR_EOI_EMPTY_INT | HS_CLR_HF_INT, HS_MODE);
-
-	cb_priv->hs_mode_bits = HS_HF_INT_EN;
-	cb7210_write_byte(cb_priv, cb_priv->hs_mode_bits, HS_MODE);
-
-	write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
-	/*
-	 * set clock register for maximum (20 MHz) driving frequency
-	 * ICR should be set to clock in megahertz (1-15) and to zero
-	 * for clocks faster than 15 MHz (max 20MHz)
-	 */
-	write_byte(nec_priv, ICR | 0, AUXMR);
-
-	if (cb_priv->pci_chip == PCI_CHIP_QUANCOM) {
-		/* change interrupt polarity */
-		nec_priv->auxb_bits |= HR_INV;
-		write_byte(nec_priv, nec_priv->auxb_bits, AUXMR);
-	}
-	nec7210_board_online(nec_priv, board);
-
-	/* poll so we can detect assertion of ATN */
-	if (gpib_request_pseudo_irq(board, cb_pci_interrupt)) {
-		pr_err("failed to allocate pseudo_irq\n");
-		return -1;
-	}
-	return 0;
-}
-
-static int cb_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct cb7210_priv *cb_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	int bits;
-	int retval;
-
-	retval = cb7210_generic_attach(board);
-	if (retval)
-		return retval;
-
-	cb_priv = board->private_data;
-	nec_priv = &cb_priv->nec7210_priv;
-
-	cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_CBOARDS,
-						  PCI_DEVICE_ID_CBOARDS_PCI_GPIB, NULL);
-	if (cb_priv->pci_device)
-		cb_priv->pci_chip = PCI_CHIP_AMCC_S5933;
-	if (!cb_priv->pci_device) {
-		cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_CBOARDS,
-							  PCI_DEVICE_ID_CBOARDS_CPCI_GPIB, NULL);
-		if (cb_priv->pci_device)
-			cb_priv->pci_chip = PCI_CHIP_AMCC_S5933;
-	}
-	if (!cb_priv->pci_device) {
-		cb_priv->pci_device = gpib_pci_get_device(config, PCI_VENDOR_ID_QUANCOM,
-							  PCI_DEVICE_ID_QUANCOM_GPIB, NULL);
-		if (cb_priv->pci_device) {
-			cb_priv->pci_chip = PCI_CHIP_QUANCOM;
-			nec_priv->offset = 4;
-		}
-	}
-	if (!cb_priv->pci_device) {
-		dev_err(board->gpib_dev, "no supported boards found.\n");
-		return -ENODEV;
-	}
-
-	if (pci_enable_device(cb_priv->pci_device)) {
-		dev_err(board->gpib_dev, "error enabling pci device\n");
-		return -EIO;
-	}
-
-	if (pci_request_regions(cb_priv->pci_device, DRV_NAME))
-		return -EBUSY;
-	switch (cb_priv->pci_chip) {
-	case PCI_CHIP_AMCC_S5933:
-		cb_priv->amcc_iobase = pci_resource_start(cb_priv->pci_device, 0);
-		nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 1);
-		cb_priv->fifo_iobase = pci_resource_start(cb_priv->pci_device, 2);
-		break;
-	case PCI_CHIP_QUANCOM:
-		nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 0);
-		cb_priv->fifo_iobase = nec_priv->iobase;
-		break;
-	default:
-		dev_err(board->gpib_dev, "bug! unhandled pci_chip=%i\n", cb_priv->pci_chip);
-		return -EIO;
-	}
-	isr_flags |= IRQF_SHARED;
-	if (request_irq(cb_priv->pci_device->irq, cb_pci_interrupt, isr_flags, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "can't request IRQ %d\n",
-			cb_priv->pci_device->irq);
-		return -EBUSY;
-	}
-	cb_priv->irq = cb_priv->pci_device->irq;
-
-	switch (cb_priv->pci_chip) {
-	case PCI_CHIP_AMCC_S5933:
-		// make sure mailbox flags are clear
-		inl(cb_priv->amcc_iobase + INCOMING_MAILBOX_REG(3));
-		// enable interrupts on amccs5933 chip
-		bits = INBOX_FULL_INTR_BIT | INBOX_BYTE_BITS(3) | INBOX_SELECT_BITS(3) |
-			INBOX_INTR_CS_BIT;
-		outl(bits, cb_priv->amcc_iobase + INTCSR_REG);
-		break;
-	default:
-		break;
-	}
-	return cb7210_init(cb_priv, board);
-}
-
-static void cb_pci_detach(struct gpib_board *board)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (cb_priv) {
-		gpib_free_pseudo_irq(board);
-		nec_priv = &cb_priv->nec7210_priv;
-		if (cb_priv->irq) {
-			// disable amcc interrupts
-			outl(0, cb_priv->amcc_iobase + INTCSR_REG);
-			free_irq(cb_priv->irq, board);
-		}
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			pci_release_regions(cb_priv->pci_device);
-		}
-		if (cb_priv->pci_device)
-			pci_dev_put(cb_priv->pci_device);
-	}
-	cb7210_generic_detach(board);
-}
-
-static int cb_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	int isr_flags = 0;
-	struct cb7210_priv *cb_priv;
-	struct nec7210_priv *nec_priv;
-	unsigned int bits;
-	int retval;
-
-	retval = cb7210_generic_attach(board);
-	if (retval)
-		return retval;
-	cb_priv = board->private_data;
-	nec_priv = &cb_priv->nec7210_priv;
-	if (!request_region(config->ibbase, cb7210_iosize, DRV_NAME)) {
-		dev_err(board->gpib_dev, "ioports starting at 0x%x are already in use\n",
-			config->ibbase);
-		return -EBUSY;
-	}
-	nec_priv->iobase = config->ibbase;
-	cb_priv->fifo_iobase = nec7210_iobase(cb_priv);
-
-	bits = irq_bits(config->ibirq);
-	if (bits == 0)
-		dev_err(board->gpib_dev, "board incapable of using irq %i, try 2-5, 7, 10, or 11\n",
-			config->ibirq);
-
-	// install interrupt handler
-	if (request_irq(config->ibirq, cb7210_interrupt, isr_flags, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "failed to obtain IRQ %d\n", config->ibirq);
-		return -EBUSY;
-	}
-	cb_priv->irq = config->ibirq;
-
-	return cb7210_init(cb_priv, board);
-}
-
-static void cb_isa_detach(struct gpib_board *board)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (cb_priv) {
-		gpib_free_pseudo_irq(board);
-		nec_priv = &cb_priv->nec7210_priv;
-		if (cb_priv->irq)
-			free_irq(cb_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			release_region(nec7210_iobase(cb_priv), cb7210_iosize);
-		}
-	}
-	cb7210_generic_detach(board);
-}
-
-static int cb7210_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return 0;
-}
-
-static const struct pci_device_id cb7210_pci_table[] = {
-	{PCI_VENDOR_ID_CBOARDS, PCI_DEVICE_ID_CBOARDS_PCI_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
-	{PCI_VENDOR_ID_CBOARDS, PCI_DEVICE_ID_CBOARDS_CPCI_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
-	{PCI_VENDOR_ID_QUANCOM, PCI_DEVICE_ID_QUANCOM_GPIB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
-	{ 0 }
-};
-MODULE_DEVICE_TABLE(pci, cb7210_pci_table);
-
-static struct pci_driver cb7210_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = cb7210_pci_table,
-	.probe = &cb7210_pci_probe
-};
-
-/***************************************************************************
- *  Support for computer boards pcmcia-gpib card
- *
- *  Based on gpib PCMCIA client driver written by Claus Schroeter
- *  (clausi@chemie.fu-berlin.de), which was adapted from the
- *  pcmcia skeleton example (presumably David Hinds)
- ***************************************************************************/
-
-#ifdef CONFIG_GPIB_PCMCIA
-
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/timer.h>
-#include <linux/io.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/ds.h>
-
-/*
- * The event() function is this driver's Card Services event handler.
- * It will be called by Card Services when an appropriate card status
- * event is received.  The config() and release() entry points are
- * used to configure or release a socket, in response to card insertion
- * and ejection events.	 They are invoked from the gpib event
- * handler.
- */
-
-static int cb_gpib_config(struct pcmcia_device	*link);
-static void cb_gpib_release(struct pcmcia_device  *link);
-static int cb_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static void cb_pcmcia_detach(struct gpib_board *board);
-
-/*
- *  A linked list of "instances" of the gpib device.  Each actual
- *  PCMCIA card corresponds to one device instance, and is described
- *  by one dev_link_t structure (defined in ds.h).
- *
- *  You may not want to use a linked list for this -- for example, the
- *  memory card driver uses an array of dev_link_t pointers, where minor
- *  device numbers are used to derive the corresponding array index.
- */
-
-static	struct pcmcia_device  *curr_dev;
-
-/*
- *  A dev_link_t structure has fields for most things that are needed
- *  to keep track of a socket, but there will usually be some device
- *  specific information that also needs to be kept track of.  The
- *  'priv' pointer in a dev_link_t structure can be used to point to
- *  a device-specific private data structure, like this.
- *
- *  A driver needs to provide a dev_node_t structure for each device
- *  on a card.	In some cases, there is only one device per card (for
- *  example, ethernet cards, modems).  In other cases, there may be
- *  many actual or logical devices (SCSI adapters, memory cards with
- *  multiple partitions).  The dev_node_t structures need to be kept
- *  in a linked list starting at the 'dev' field of a dev_link_t
- *  structure.	We allocate them in the card's private data structure,
- * because they generally can't be allocated dynamically.
- */
-
-struct local_info {
-	struct pcmcia_device	*p_dev;
-	struct gpib_board		*dev;
-};
-
-/*
- *  gpib_attach() creates an "instance" of the driver, allocating
- *  local data structures for one device.  The device is registered
- *  with Card Services.
- *
- *  The dev_link structure is initialized, but we don't actually
- *  configure the card at this point -- we wait until we receive a
- *  card insertion event.
- */
-
-static int cb_gpib_probe(struct pcmcia_device *link)
-{
-	struct local_info *info;
-	int ret;
-
-	/* Allocate space for private device-specific data */
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	info->p_dev = link;
-	link->priv = info;
-
-	/* The io structure describes IO port mapping */
-	link->resource[0]->end = 16;
-	link->resource[0]->flags &= ~IO_DATA_PATH_WIDTH;
-	link->resource[0]->flags |= IO_DATA_PATH_WIDTH_AUTO;
-	link->resource[1]->end = 16;
-	link->resource[1]->flags &= ~IO_DATA_PATH_WIDTH;
-	link->resource[1]->flags |= IO_DATA_PATH_WIDTH_16;
-	link->io_lines = 10;
-
-	/* General socket configuration */
-	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
-	link->config_index = 1;
-	link->config_regs = PRESENT_OPTION;
-
-	/* Register with Card Services */
-	curr_dev = link;
-	ret = cb_gpib_config(link);
-	if (ret)
-		goto free_info;
-
-	return 0;
-
-free_info:
-	kfree(info);
-	return ret;
-}
-
-/*
- *   This deletes a driver "instance".  The device is de-registered
- *   with Card Services.  If it has been released, all local data
- *   structures are freed.  Otherwise, the structures will be freed
- *   when the device is released.
- */
-
-static void cb_gpib_remove(struct pcmcia_device *link)
-{
-	struct local_info *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (info->dev)
-		cb_pcmcia_detach(info->dev);
-	cb_gpib_release(link);
-
-	//free_netdev(dev);
-	kfree(info);
-}
-
-static int cb_gpib_config_iteration(struct pcmcia_device *link, void *priv_data)
-{
-	return pcmcia_request_io(link);
-}
-
-/*
- *   gpib_config() is scheduled to run after a CARD_INSERTION event
- *   is received, to configure the PCMCIA socket, and to make the
- *   ethernet device available to the system.
- */
-
-static int cb_gpib_config(struct pcmcia_device  *link)
-{
-	int retval;
-
-	retval = pcmcia_loop_config(link, &cb_gpib_config_iteration, NULL);
-	if (retval) {
-		dev_warn(&link->dev, "no configuration found\n");
-		cb_gpib_release(link);
-		return -ENODEV;
-	}
-
-	/*
-	 *  This actually configures the PCMCIA socket -- setting up
-	 *  the I/O windows and the interrupt mapping.
-	 */
-	retval = pcmcia_enable_device(link);
-	if (retval) {
-		dev_warn(&link->dev, "pcmcia_enable_device failed\n");
-		cb_gpib_release(link);
-		return -ENODEV;
-	}
-
-	return 0;
-} /* gpib_config */
-
-/*
- * After a card is removed, gpib_release() will unregister the net
- * device, and release the PCMCIA configuration.  If the device is
- * still open, this will be postponed until it is closed.
- */
-
-static void cb_gpib_release(struct pcmcia_device *link)
-{
-	pcmcia_disable_device(link);
-}
-
-static int cb_gpib_suspend(struct pcmcia_device *link)
-{
-	//struct local_info *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (link->open)
-		dev_warn(&link->dev, "Device still open\n");
-	//netif_device_detach(dev);
-
-	return 0;
-}
-
-static int cb_gpib_resume(struct pcmcia_device *link)
-{
-	//struct local_info *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	/*if (link->open) {
-	 *	ni_gpib_probe(dev);	/ really?
-	 *	//netif_device_attach(dev);
-	 *
-	 */
-	return cb_gpib_config(link);
-}
-
-/*====================================================================*/
-
-static struct pcmcia_device_id cb_pcmcia_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x01c5, 0x0005),
-	PCMCIA_DEVICE_NULL
-};
-MODULE_DEVICE_TABLE(pcmcia, cb_pcmcia_ids);
-
-static struct pcmcia_driver cb_gpib_cs_driver = {
-	.name           = "cb_gpib_cs",
-	.owner		= THIS_MODULE,
-	.id_table	= cb_pcmcia_ids,
-	.probe		= cb_gpib_probe,
-	.remove		= cb_gpib_remove,
-	.suspend	= cb_gpib_suspend,
-	.resume		= cb_gpib_resume,
-};
-
-static void cb_pcmcia_cleanup_module(void)
-{
-	pcmcia_unregister_driver(&cb_gpib_cs_driver);
-}
-
-static struct gpib_interface cb_pcmcia_unaccel_interface = {
-	.name = "cbi_pcmcia_unaccel",
-	.attach = cb_pcmcia_attach,
-	.detach = cb_pcmcia_detach,
-	.read = cb7210_read,
-	.write = cb7210_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_pcmcia_interface = {
-	.name = "cbi_pcmcia",
-	.attach = cb_pcmcia_attach,
-	.detach = cb_pcmcia_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static struct gpib_interface cb_pcmcia_accel_interface = {
-	.name = "cbi_pcmcia_accel",
-	.attach = cb_pcmcia_attach,
-	.detach = cb_pcmcia_detach,
-	.read = cb7210_accel_read,
-	.write = cb7210_accel_write,
-	.command = cb7210_command,
-	.take_control = cb7210_take_control,
-	.go_to_standby = cb7210_go_to_standby,
-	.request_system_control = cb7210_request_system_control,
-	.interface_clear = cb7210_interface_clear,
-	.remote_enable = cb7210_remote_enable,
-	.enable_eos = cb7210_enable_eos,
-	.disable_eos = cb7210_disable_eos,
-	.parallel_poll = cb7210_parallel_poll,
-	.parallel_poll_configure = cb7210_parallel_poll_configure,
-	.parallel_poll_response = cb7210_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = cb7210_line_status,
-	.update_status = cb7210_update_status,
-	.primary_address = cb7210_primary_address,
-	.secondary_address = cb7210_secondary_address,
-	.serial_poll_response = cb7210_serial_poll_response,
-	.serial_poll_status = cb7210_serial_poll_status,
-	.t1_delay = cb7210_t1_delay,
-	.return_to_local = cb7210_return_to_local,
-};
-
-static int cb_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct cb7210_priv *cb_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-
-	if (!curr_dev) {
-		dev_err(board->gpib_dev, "no cb pcmcia cards found\n");
-		return -ENODEV;
-	}
-
-	retval = cb7210_generic_attach(board);
-	if (retval)
-		return retval;
-
-	cb_priv = board->private_data;
-	nec_priv = &cb_priv->nec7210_priv;
-
-	if (!request_region(curr_dev->resource[0]->start, resource_size(curr_dev->resource[0]),
-			    DRV_NAME))	{
-		dev_err(board->gpib_dev, "ioports starting at 0x%lx are already in use\n",
-			(unsigned long)curr_dev->resource[0]->start);
-		return -EBUSY;
-	}
-	nec_priv->iobase = curr_dev->resource[0]->start;
-	cb_priv->fifo_iobase = curr_dev->resource[0]->start;
-
-	if (request_irq(curr_dev->irq, cb7210_interrupt, IRQF_SHARED, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "failed to request IRQ %d\n", curr_dev->irq);
-		return -EBUSY;
-	}
-	cb_priv->irq = curr_dev->irq;
-
-	return cb7210_init(cb_priv, board);
-}
-
-static void cb_pcmcia_detach(struct gpib_board *board)
-{
-	struct cb7210_priv *cb_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (cb_priv) {
-		nec_priv = &cb_priv->nec7210_priv;
-		gpib_free_pseudo_irq(board);
-		if (cb_priv->irq)
-			free_irq(cb_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			release_region(nec7210_iobase(cb_priv), cb7210_iosize);
-		}
-	}
-	cb7210_generic_detach(board);
-}
-
-#endif /* CONFIG_GPIB_PCMCIA */
-
-static int __init cb7210_init_module(void)
-{
-	int ret;
-
-	ret = pci_register_driver(&cb7210_pci_driver);
-	if (ret) {
-		pr_err("pci_register_driver failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&cb_pci_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci;
-	}
-
-	ret = gpib_register_driver(&cb_isa_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_isa;
-	}
-
-	ret = gpib_register_driver(&cb_pci_accel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci_accel;
-	}
-
-	ret = gpib_register_driver(&cb_pci_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci_unaccel;
-	}
-
-	ret = gpib_register_driver(&cb_isa_accel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_isa_accel;
-	}
-
-	ret = gpib_register_driver(&cb_isa_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_isa_unaccel;
-	}
-
-#ifdef CONFIG_GPIB_PCMCIA
-	ret = gpib_register_driver(&cb_pcmcia_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia;
-	}
-
-	ret = gpib_register_driver(&cb_pcmcia_accel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_accel;
-	}
-
-	ret = gpib_register_driver(&cb_pcmcia_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_unaccel;
-	}
-
-	ret = pcmcia_register_driver(&cb_gpib_cs_driver);
-	if (ret) {
-		pr_err("pcmcia_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_driver;
-	}
-#endif
-
-	return 0;
-
-#ifdef CONFIG_GPIB_PCMCIA
-err_pcmcia_driver:
-	gpib_unregister_driver(&cb_pcmcia_unaccel_interface);
-err_pcmcia_unaccel:
-	gpib_unregister_driver(&cb_pcmcia_accel_interface);
-err_pcmcia_accel:
-	gpib_unregister_driver(&cb_pcmcia_interface);
-err_pcmcia:
-#endif
-	gpib_unregister_driver(&cb_isa_unaccel_interface);
-err_isa_unaccel:
-	gpib_unregister_driver(&cb_isa_accel_interface);
-err_isa_accel:
-	gpib_unregister_driver(&cb_pci_unaccel_interface);
-err_pci_unaccel:
-	gpib_unregister_driver(&cb_pci_accel_interface);
-err_pci_accel:
-	gpib_unregister_driver(&cb_isa_interface);
-err_isa:
-	gpib_unregister_driver(&cb_pci_interface);
-err_pci:
-	pci_unregister_driver(&cb7210_pci_driver);
-
-	return ret;
-}
-
-static void __exit cb7210_exit_module(void)
-{
-	gpib_unregister_driver(&cb_pci_interface);
-	gpib_unregister_driver(&cb_isa_interface);
-	gpib_unregister_driver(&cb_pci_accel_interface);
-	gpib_unregister_driver(&cb_pci_unaccel_interface);
-	gpib_unregister_driver(&cb_isa_accel_interface);
-	gpib_unregister_driver(&cb_isa_unaccel_interface);
-#ifdef CONFIG_GPIB_PCMCIA
-	gpib_unregister_driver(&cb_pcmcia_interface);
-	gpib_unregister_driver(&cb_pcmcia_accel_interface);
-	gpib_unregister_driver(&cb_pcmcia_unaccel_interface);
-	cb_pcmcia_cleanup_module();
-#endif
-
-	pci_unregister_driver(&cb7210_pci_driver);
-}
-
-module_init(cb7210_init_module);
-module_exit(cb7210_exit_module);
diff --git a/drivers/staging/gpib/cb7210/cb7210.h b/drivers/staging/gpib/cb7210/cb7210.h
deleted file mode 100644
index ddc841ff87ae..000000000000
--- a/drivers/staging/gpib/cb7210/cb7210.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#include "nec7210.h"
-#include "gpibP.h"
-#include "amccs5933.h"
-
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-
-enum {
-	PCI_DEVICE_ID_CBOARDS_PCI_GPIB = 0x6,
-	PCI_DEVICE_ID_CBOARDS_CPCI_GPIB = 0xe,
-};
-
-enum pci_chip {
-	PCI_CHIP_NONE = 0,
-	PCI_CHIP_AMCC_S5933,
-	PCI_CHIP_QUANCOM
-};
-
-// struct which defines private_data for cb7210 boards
-struct cb7210_priv {
-	struct nec7210_priv nec7210_priv;
-	struct pci_dev *pci_device;
-	// base address of amccs5933 pci chip
-	unsigned long amcc_iobase;
-	unsigned long fifo_iobase;
-	unsigned int irq;
-	enum pci_chip pci_chip;
-	u8 hs_mode_bits;
-	unsigned out_fifo_half_empty : 1;
-	unsigned in_fifo_half_full : 1;
-};
-
-// pci-gpib register offset
-static const int cb7210_reg_offset = 1;
-
-// uses 10 ioports
-static const int cb7210_iosize = 10;
-
-// fifo size in bytes
-static const int cb7210_fifo_size = 2048;
-static const int cb7210_fifo_width = 2;
-
-// cb7210 specific registers and bits
-enum cb7210_regs {
-	BUS_STATUS = 0x7,
-};
-
-enum cb7210_page_in {
-	BUS_STATUS_PAGE = 1,
-};
-
-enum hs_regs {
-	// write registers
-	HS_MODE = 0x8,	/* HS_MODE register */
-	HS_INT_LEVEL = 0x9,	/* HS_INT_LEVEL register */
-	// read registers
-	HS_STATUS = 0x8,	/* HS_STATUS register */
-};
-
-static inline u32 nec7210_iobase(const struct cb7210_priv *cb_priv)
-{
-	return cb_priv->nec7210_priv.iobase;
-}
-
-static inline int cb7210_page_in_bits(unsigned int page)
-{
-	return 0x50 | (page & 0xf);
-}
-
-static inline u8 cb7210_paged_read_byte(struct cb7210_priv *cb_priv,
-					unsigned int register_num, unsigned int page)
-{
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	u8 retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
-	outb(cb7210_page_in_bits(page), nec7210_iobase(cb_priv) + AUXMR * nec_priv->offset);
-	udelay(1);
-	retval = inb(nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
-	return retval;
-}
-
-// don't use for register_num < 8, since it doesn't lock
-static inline u8 cb7210_read_byte(const struct cb7210_priv *cb_priv,
-				  enum hs_regs register_num)
-{
-	const struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	u8 retval;
-
-	retval = inb(nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
-	return retval;
-}
-
-static inline void cb7210_paged_write_byte(struct cb7210_priv *cb_priv, u8 data,
-					   unsigned int register_num, unsigned int page)
-{
-	struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
-	outb(cb7210_page_in_bits(page), nec7210_iobase(cb_priv) + AUXMR * nec_priv->offset);
-	udelay(1);
-	outb(data, nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
-}
-
-// don't use for register_num < 8, since it doesn't lock
-static inline void cb7210_write_byte(const struct cb7210_priv *cb_priv, u8 data,
-				     enum hs_regs register_num)
-{
-	const struct nec7210_priv *nec_priv = &cb_priv->nec7210_priv;
-
-	outb(data, nec7210_iobase(cb_priv) + register_num * nec_priv->offset);
-}
-
-enum bus_status_bits {
-	BSR_ATN_BIT = 0x1,
-	BSR_EOI_BIT = 0x2,
-	BSR_SRQ_BIT = 0x4,
-	BSR_IFC_BIT = 0x8,
-	BSR_REN_BIT = 0x10,
-	BSR_DAV_BIT = 0x20,
-	BSR_NRFD_BIT = 0x40,
-	BSR_NDAC_BIT = 0x80,
-};
-
-/* CBI 488.2 HS control */
-
-/*
- * when both bit 0 and 1 are set, it
- *   1 clears the transmit state machine to an initial condition
- *   2 clears any residual interrupts left latched on cbi488.2
- *   3 resets all control bits in HS_MODE to zero
- *   4 enables TX empty interrupts
- * when both bit 0 and 1 are zero, then the high speed mode is disabled
- */
-enum hs_mode_bits {
-	HS_ENABLE_MASK = 0x3,
-	HS_TX_ENABLE = (1 << 0),
-	HS_RX_ENABLE = (1 << 1),
-	HS_HF_INT_EN = (1 << 3),
-	HS_CLR_SRQ_INT = (1 << 4),
-	HS_CLR_EOI_EMPTY_INT = (1 << 5),
-	HS_CLR_HF_INT = (1 << 6),
-	HS_SYS_CONTROL = (1 << 7),
-};
-
-/* CBI 488.2 status */
-enum hs_status_bits {
-	HS_FIFO_FULL = (1 << 0),
-	HS_HALF_FULL = (1 << 1),
-	HS_SRQ_INT = (1 << 2),
-	HS_EOI_INT = (1 << 3),
-	HS_TX_MSB_NOT_EMPTY = (1 << 4),
-	HS_RX_MSB_NOT_EMPTY = (1 << 5),
-	HS_TX_LSB_NOT_EMPTY = (1 << 6),
-	HS_RX_LSB_NOT_EMPTY = (1 << 7),
-};
-
-/* CBI488.2 hs_int_level register */
-enum hs_int_level_bits {
-	HS_RESET7210 = (1 << 7),
-};
-
-static inline unsigned int irq_bits(unsigned int irq)
-{
-	switch (irq) {
-	case 2:
-	case 3:
-	case 4:
-	case 5:
-		return irq - 1;
-	case 7:
-		return 0x5;
-	case 10:
-		return 0x6;
-	case 11:
-		return 0x7;
-	default:
-		return 0;
-	}
-}
-
-enum cb7210_aux_cmds {
-/*
- * AUX_RTL2 is an undocumented aux command which causes cb7210 to assert
- * (and keep asserted) local rtl message.  This is used in conjunction
- * with the (stupid) cb7210 implementation
- * of the normal nec7210 AUX_RTL aux command, which
- * causes the rtl message to toggle between on and off.
- */
-	AUX_RTL2 = 0xd,
-	AUX_LO_SPEED = 0x40,
-	AUX_HI_SPEED = 0x41,
-};
diff --git a/drivers/staging/gpib/cec/Makefile b/drivers/staging/gpib/cec/Makefile
deleted file mode 100644
index b7141e23d4e0..000000000000
--- a/drivers/staging/gpib/cec/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-
-obj-$(CONFIG_GPIB_CEC_PCI) += cec_gpib.o
-
diff --git a/drivers/staging/gpib/cec/cec.h b/drivers/staging/gpib/cec/cec.h
deleted file mode 100644
index 3ce2869c7429..000000000000
--- a/drivers/staging/gpib/cec/cec.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#include "nec7210.h"
-#include "gpibP.h"
-#include "plx9050.h"
-
-struct cec_priv  {
-	struct nec7210_priv nec7210_priv;
-	struct pci_dev *pci_device;
-	// base address for plx9052 pci chip
-	unsigned long plx_iobase;
-	unsigned int irq;
-};
-
-// offset between consecutive nec7210 registers
-static const int cec_reg_offset = 1;
diff --git a/drivers/staging/gpib/cec/cec_gpib.c b/drivers/staging/gpib/cec/cec_gpib.c
deleted file mode 100644
index dbf9b95baabc..000000000000
--- a/drivers/staging/gpib/cec/cec_gpib.c
+++ /dev/null
@@ -1,393 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *   copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "cec.h"
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/bitops.h>
-#include <asm/dma.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for CEC PCI and PCMCIA boards");
-
-/*
- * GPIB interrupt service routines
- */
-
-static irqreturn_t cec_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct cec_priv *priv = board->private_data;
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = nec7210_interrupt(board, &priv->nec7210_priv);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-#define CEC_VENDOR_ID 0x12fc
-#define CEC_DEV_ID    0x5cec
-#define CEC_SUBID 0x9050
-
-static int cec_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
-
-static void cec_pci_detach(struct gpib_board *board);
-
-// wrappers for interface functions
-static int cec_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-		    size_t *bytes_read)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-}
-
-static int cec_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-		     size_t *bytes_written)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int cec_command(struct gpib_board *board, u8 *buffer,
-		       size_t length, size_t *bytes_written)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int cec_take_control(struct gpib_board *board, int synchronous)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int cec_go_to_standby(struct gpib_board *board)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int cec_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_request_system_control(board, &priv->nec7210_priv, request_control);
-}
-
-static void cec_interface_clear(struct gpib_board *board, int assert)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void cec_remote_enable(struct gpib_board *board, int enable)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int cec_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void cec_disable_eos(struct gpib_board *board)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int cec_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
-}
-
-static int cec_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int cec_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int cec_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
-}
-
-static void cec_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, config);
-}
-
-static void cec_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-static void cec_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
-}
-
-static u8 cec_serial_poll_status(struct gpib_board *board)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static int cec_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct cec_priv *priv = board->private_data;
-
-	return nec7210_t1_delay(board, &priv->nec7210_priv, nano_sec);
-}
-
-static void cec_return_to_local(struct gpib_board *board)
-{
-	struct cec_priv *priv = board->private_data;
-
-	nec7210_return_to_local(board, &priv->nec7210_priv);
-}
-
-static struct gpib_interface cec_pci_interface = {
-	.name = "cec_pci",
-	.attach = cec_pci_attach,
-	.detach = cec_pci_detach,
-	.read = cec_read,
-	.write = cec_write,
-	.command = cec_command,
-	.take_control = cec_take_control,
-	.go_to_standby = cec_go_to_standby,
-	.request_system_control = cec_request_system_control,
-	.interface_clear = cec_interface_clear,
-	.remote_enable = cec_remote_enable,
-	.enable_eos = cec_enable_eos,
-	.disable_eos = cec_disable_eos,
-	.parallel_poll = cec_parallel_poll,
-	.parallel_poll_configure = cec_parallel_poll_configure,
-	.parallel_poll_response = cec_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = NULL,	// XXX
-	.update_status = cec_update_status,
-	.primary_address = cec_primary_address,
-	.secondary_address = cec_secondary_address,
-	.serial_poll_response = cec_serial_poll_response,
-	.serial_poll_status = cec_serial_poll_status,
-	.t1_delay = cec_t1_delay,
-	.return_to_local = cec_return_to_local,
-};
-
-static int cec_allocate_private(struct gpib_board *board)
-{
-	struct cec_priv *priv;
-
-	board->private_data = kmalloc(sizeof(struct cec_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	priv = board->private_data;
-	memset(priv, 0, sizeof(struct cec_priv));
-	init_nec7210_private(&priv->nec7210_priv);
-	return 0;
-}
-
-static void cec_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static int cec_generic_attach(struct gpib_board *board)
-{
-	struct cec_priv *cec_priv;
-	struct nec7210_priv *nec_priv;
-
-	board->status = 0;
-
-	if (cec_allocate_private(board))
-		return -ENOMEM;
-	cec_priv = board->private_data;
-	nec_priv = &cec_priv->nec7210_priv;
-	nec_priv->read_byte = nec7210_ioport_read_byte;
-	nec_priv->write_byte = nec7210_ioport_write_byte;
-	nec_priv->offset = cec_reg_offset;
-	nec_priv->type = NEC7210;	// guess
-	return 0;
-}
-
-static void cec_init(struct cec_priv *cec_priv, const struct gpib_board *board)
-{
-	struct nec7210_priv *nec_priv = &cec_priv->nec7210_priv;
-
-	nec7210_board_reset(nec_priv, board);
-
-	/* set internal counter register for 8 MHz input clock */
-	write_byte(nec_priv, ICR | 8, AUXMR);
-
-	nec7210_board_online(nec_priv, board);
-}
-
-static int cec_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct cec_priv *cec_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	int retval;
-
-	retval = cec_generic_attach(board);
-	if (retval)
-		return retval;
-
-	cec_priv = board->private_data;
-	nec_priv = &cec_priv->nec7210_priv;
-
-	// find board
-	cec_priv->pci_device = NULL;
-	while ((cec_priv->pci_device =
-		gpib_pci_get_device(config, CEC_VENDOR_ID,
-				    CEC_DEV_ID, cec_priv->pci_device)))	{
-		// check for board with plx9050 controller
-		if (cec_priv->pci_device->subsystem_device == CEC_SUBID)
-			break;
-	}
-	if (!cec_priv->pci_device) {
-		dev_err(board->gpib_dev, "no cec PCI board found\n");
-		return -ENODEV;
-	}
-
-	if (pci_enable_device(cec_priv->pci_device)) {
-		dev_err(board->gpib_dev, "error enabling pci device\n");
-		return -EIO;
-	}
-
-	if (pci_request_regions(cec_priv->pci_device, "cec-gpib"))
-		return -EBUSY;
-
-	cec_priv->plx_iobase = pci_resource_start(cec_priv->pci_device, 1);
-	nec_priv->iobase = pci_resource_start(cec_priv->pci_device, 3);
-
-	isr_flags |= IRQF_SHARED;
-	if (request_irq(cec_priv->pci_device->irq, cec_interrupt, isr_flags, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "failed to obtain IRQ %d\n", cec_priv->pci_device->irq);
-		return -EBUSY;
-	}
-	cec_priv->irq = cec_priv->pci_device->irq;
-	if (gpib_request_pseudo_irq(board, cec_interrupt)) {
-		dev_err(board->gpib_dev, "failed to allocate pseudo irq\n");
-		return -1;
-	}
-	cec_init(cec_priv, board);
-
-	// enable interrupts on plx chip
-	outl(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR1_POLARITY_BIT | PLX9050_PCI_INTR_EN_BIT,
-	     cec_priv->plx_iobase + PLX9050_INTCSR_REG);
-
-	return 0;
-}
-
-static void cec_pci_detach(struct gpib_board *board)
-{
-	struct cec_priv *cec_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (cec_priv) {
-		nec_priv = &cec_priv->nec7210_priv;
-		gpib_free_pseudo_irq(board);
-		if (cec_priv->irq) {
-			// disable plx9050 interrupts
-			outl(0, cec_priv->plx_iobase + PLX9050_INTCSR_REG);
-			free_irq(cec_priv->irq, board);
-		}
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			pci_release_regions(cec_priv->pci_device);
-		}
-		if (cec_priv->pci_device)
-			pci_dev_put(cec_priv->pci_device);
-	}
-	cec_free_private(board);
-}
-
-static int cec_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return 0;
-}
-
-static const struct pci_device_id cec_pci_table[] = {
-	{CEC_VENDOR_ID, CEC_DEV_ID, PCI_ANY_ID, CEC_SUBID, 0, 0, 0 },
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, cec_pci_table);
-
-static struct pci_driver cec_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = cec_pci_table,
-	.probe = &cec_pci_probe
-};
-
-static int __init cec_init_module(void)
-{
-	int result;
-
-	result = pci_register_driver(&cec_pci_driver);
-	if (result) {
-		pr_err("pci_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	result = gpib_register_driver(&cec_pci_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	return 0;
-}
-
-static void cec_exit_module(void)
-{
-	gpib_unregister_driver(&cec_pci_interface);
-
-	pci_unregister_driver(&cec_pci_driver);
-}
-
-module_init(cec_init_module);
-module_exit(cec_exit_module);
diff --git a/drivers/staging/gpib/common/Makefile b/drivers/staging/gpib/common/Makefile
deleted file mode 100644
index 460586edb574..000000000000
--- a/drivers/staging/gpib/common/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-obj-$(CONFIG_GPIB_COMMON) += gpib_common.o
-
-gpib_common-objs := gpib_os.o iblib.o
-
-
diff --git a/drivers/staging/gpib/common/gpib_os.c b/drivers/staging/gpib/common/gpib_os.c
deleted file mode 100644
index 9dbbac8b8436..000000000000
--- a/drivers/staging/gpib/common/gpib_os.c
+++ /dev/null
@@ -1,2271 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *    copyright            : (C) 2001, 2004 by Frank Mori Hess
- ***************************************************************************
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-
-#include "ibsys.h"
-#include <linux/module.h>
-#include <linux/wait.h>
-#include <linux/list.h>
-#include <linux/fs.h>
-#include <linux/pci.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/vmalloc.h>
-#include <linux/fcntl.h>
-#include <linux/kmod.h>
-#include <linux/uaccess.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB base support");
-MODULE_ALIAS_CHARDEV_MAJOR(GPIB_CODE);
-
-static int board_type_ioctl(struct gpib_file_private *file_priv,
-			    struct gpib_board *board, unsigned long arg);
-static int read_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-		      unsigned long arg);
-static int write_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-		       unsigned long arg);
-static int command_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-			 unsigned long arg);
-static int open_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg);
-static int close_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg);
-static int serial_poll_ioctl(struct gpib_board *board, unsigned long arg);
-static int wait_ioctl(struct gpib_file_private *file_priv,
-		      struct gpib_board *board, unsigned long arg);
-static int parallel_poll_ioctl(struct gpib_board *board, unsigned long arg);
-static int online_ioctl(struct gpib_board *board, unsigned long arg);
-static int remote_enable_ioctl(struct gpib_board *board, unsigned long arg);
-static int take_control_ioctl(struct gpib_board *board, unsigned long arg);
-static int line_status_ioctl(struct gpib_board *board, unsigned long arg);
-static int pad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		     unsigned long arg);
-static int sad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		     unsigned long arg);
-static int eos_ioctl(struct gpib_board *board, unsigned long arg);
-static int request_service_ioctl(struct gpib_board *board, unsigned long arg);
-static int request_service2_ioctl(struct gpib_board *board, unsigned long arg);
-static int iobase_ioctl(struct gpib_board_config *config, unsigned long arg);
-static int irq_ioctl(struct gpib_board_config *config, unsigned long arg);
-static int dma_ioctl(struct gpib_board_config *config, unsigned long arg);
-static int autospoll_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-			   unsigned long arg);
-static int mutex_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		       unsigned long arg);
-static int timeout_ioctl(struct gpib_board *board, unsigned long arg);
-static int status_bytes_ioctl(struct gpib_board *board, unsigned long arg);
-static int board_info_ioctl(const struct gpib_board *board, unsigned long arg);
-static int ppc_ioctl(struct gpib_board *board, unsigned long arg);
-static int set_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg);
-static int get_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg);
-static int query_board_rsv_ioctl(struct gpib_board *board, unsigned long arg);
-static int interface_clear_ioctl(struct gpib_board *board, unsigned long arg);
-static int select_pci_ioctl(struct gpib_board_config *config, unsigned long arg);
-static int select_device_path_ioctl(struct gpib_board_config *config, unsigned long arg);
-static int event_ioctl(struct gpib_board *board, unsigned long arg);
-static int request_system_control_ioctl(struct gpib_board *board, unsigned long arg);
-static int t1_delay_ioctl(struct gpib_board *board, unsigned long arg);
-
-static int cleanup_open_devices(struct gpib_file_private *file_priv, struct gpib_board *board);
-
-static int pop_gpib_event_nolock(struct gpib_board *board,
-				 struct gpib_event_queue *queue, short *event_type);
-
-/*
- * Timer functions
- */
-
-/* Watchdog timeout routine */
-
-static void watchdog_timeout(struct timer_list *t)
-{
-	struct gpib_board *board = timer_container_of(board, t, timer);
-
-	set_bit(TIMO_NUM, &board->status);
-	wake_up_interruptible(&board->wait);
-}
-
-/* install timer interrupt handler */
-void os_start_timer(struct gpib_board *board, unsigned int usec_timeout)
-/* Starts the timeout task  */
-{
-	if (timer_pending(&board->timer)) {
-		dev_err(board->gpib_dev, "bug! timer already running?\n");
-		return;
-	}
-	clear_bit(TIMO_NUM, &board->status);
-
-	if (usec_timeout > 0) {
-		board->timer.function = watchdog_timeout;
-		/* set number of ticks */
-		mod_timer(&board->timer, jiffies + usec_to_jiffies(usec_timeout));
-	}
-}
-
-void os_remove_timer(struct gpib_board *board)
-/* Removes the timeout task */
-{
-	if (timer_pending(&board->timer))
-		timer_delete_sync(&board->timer);
-}
-
-int io_timed_out(struct gpib_board *board)
-{
-	if (test_bit(TIMO_NUM, &board->status))
-		return 1;
-	return 0;
-}
-
-/*
- * this is a function instead of a constant because of Suse
- * defining HZ to be a function call to get_hz()
- */
-static inline int pseudo_irq_period(void)
-{
-	return (HZ + 99) / 100;
-}
-
-static void pseudo_irq_handler(struct timer_list *t)
-{
-	struct gpib_pseudo_irq *pseudo_irq = timer_container_of(pseudo_irq, t,
-								timer);
-
-	if (pseudo_irq->handler)
-		pseudo_irq->handler(0, pseudo_irq->board);
-	else
-		pr_err("gpib: bug! pseudo_irq.handler is NULL\n");
-
-	if (atomic_read(&pseudo_irq->active))
-		mod_timer(&pseudo_irq->timer, jiffies + pseudo_irq_period());
-}
-
-int gpib_request_pseudo_irq(struct gpib_board *board, irqreturn_t (*handler)(int, void *))
-{
-	if (timer_pending(&board->pseudo_irq.timer) || board->pseudo_irq.handler) {
-		dev_err(board->gpib_dev, "only one pseudo interrupt per board allowed\n");
-		return -1;
-	}
-
-	board->pseudo_irq.handler = handler;
-	board->pseudo_irq.timer.function = pseudo_irq_handler;
-	board->pseudo_irq.board = board;
-
-	atomic_set(&board->pseudo_irq.active, 1);
-
-	mod_timer(&board->pseudo_irq.timer, jiffies + pseudo_irq_period());
-
-	return 0;
-}
-EXPORT_SYMBOL(gpib_request_pseudo_irq);
-
-void gpib_free_pseudo_irq(struct gpib_board *board)
-{
-	atomic_set(&board->pseudo_irq.active, 0);
-
-	timer_delete_sync(&board->pseudo_irq.timer);
-	board->pseudo_irq.handler = NULL;
-}
-EXPORT_SYMBOL(gpib_free_pseudo_irq);
-
-static const unsigned int serial_timeout = 1000000;
-
-unsigned int num_status_bytes(const struct gpib_status_queue *dev)
-{
-	if (!dev)
-		return 0;
-	return dev->num_status_bytes;
-}
-
-// push status byte onto back of status byte fifo
-int push_status_byte(struct gpib_board *board, struct gpib_status_queue *device, u8 poll_byte)
-{
-	struct list_head *head = &device->status_bytes;
-	struct gpib_status_byte *status;
-	static const unsigned int max_num_status_bytes = 1024;
-	int retval;
-
-	if (num_status_bytes(device) >= max_num_status_bytes) {
-		u8 lost_byte;
-
-		device->dropped_byte = 1;
-		retval = pop_status_byte(board, device, &lost_byte);
-		if (retval < 0)
-			return retval;
-	}
-
-	status = kmalloc(sizeof(*status), GFP_KERNEL);
-	if (!status)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&status->list);
-	status->poll_byte = poll_byte;
-
-	list_add_tail(&status->list, head);
-
-	device->num_status_bytes++;
-
-	dev_dbg(board->gpib_dev, "pushed status byte 0x%x, %i in queue\n",
-		(int)poll_byte, num_status_bytes(device));
-
-	return 0;
-}
-
-// pop status byte from front of status byte fifo
-int pop_status_byte(struct gpib_board *board, struct gpib_status_queue *device, u8 *poll_byte)
-{
-	struct list_head *head = &device->status_bytes;
-	struct list_head *front = head->next;
-	struct gpib_status_byte *status;
-
-	if (num_status_bytes(device) == 0)
-		return -EIO;
-
-	if (front == head)
-		return -EIO;
-
-	if (device->dropped_byte) {
-		device->dropped_byte = 0;
-		return -EPIPE;
-	}
-
-	status = list_entry(front, struct gpib_status_byte, list);
-	*poll_byte = status->poll_byte;
-
-	list_del(front);
-	kfree(status);
-
-	device->num_status_bytes--;
-
-	dev_dbg(board->gpib_dev, "popped status byte 0x%x, %i in queue\n",
-		(int)*poll_byte, num_status_bytes(device));
-
-	return 0;
-}
-
-struct gpib_status_queue *get_gpib_status_queue(struct gpib_board *board, unsigned int pad, int sad)
-{
-	struct gpib_status_queue *device;
-	struct list_head *list_ptr;
-	const struct list_head *head = &board->device_list;
-
-	for (list_ptr = head->next; list_ptr != head; list_ptr = list_ptr->next) {
-		device = list_entry(list_ptr, struct gpib_status_queue, list);
-		if (gpib_address_equal(device->pad, device->sad, pad, sad))
-			return device;
-	}
-
-	return NULL;
-}
-
-int get_serial_poll_byte(struct gpib_board *board, unsigned int pad, int sad,
-			 unsigned int usec_timeout, u8 *poll_byte)
-{
-	struct gpib_status_queue *device;
-
-	device = get_gpib_status_queue(board, pad, sad);
-	if (num_status_bytes(device))
-		return pop_status_byte(board, device, poll_byte);
-	else
-		return dvrsp(board, pad, sad, usec_timeout, poll_byte);
-}
-
-int autopoll_all_devices(struct gpib_board *board)
-{
-	int retval;
-
-	if (mutex_lock_interruptible(&board->user_mutex))
-		return -ERESTARTSYS;
-	if (mutex_lock_interruptible(&board->big_gpib_mutex)) {
-		mutex_unlock(&board->user_mutex);
-		return -ERESTARTSYS;
-	}
-
-	dev_dbg(board->gpib_dev, "autopoll has board lock\n");
-
-	retval = serial_poll_all(board, serial_timeout);
-	if (retval < 0)	{
-		mutex_unlock(&board->big_gpib_mutex);
-		mutex_unlock(&board->user_mutex);
-		return retval;
-	}
-
-	dev_dbg(board->gpib_dev, "complete\n");
-	/*
-	 * need to wake wait queue in case someone is
-	 * waiting on RQS
-	 */
-	wake_up_interruptible(&board->wait);
-	mutex_unlock(&board->big_gpib_mutex);
-	mutex_unlock(&board->user_mutex);
-
-	return retval;
-}
-
-static int setup_serial_poll(struct gpib_board *board, unsigned int usec_timeout)
-{
-	u8 cmd_string[8];
-	int i;
-	size_t bytes_written;
-	int ret;
-
-	os_start_timer(board, usec_timeout);
-	ret = ibcac(board, 1, 1);
-	if (ret < 0) {
-		os_remove_timer(board);
-		return ret;
-	}
-
-	i = 0;
-	cmd_string[i++] = UNL;
-	cmd_string[i++] = MLA(board->pad);	/* controller's listen address */
-	if (board->sad >= 0)
-		cmd_string[i++] = MSA(board->sad);
-	cmd_string[i++] = SPE;	// serial poll enable
-
-	ret = board->interface->command(board, cmd_string, i, &bytes_written);
-	if (ret < 0 || bytes_written < i) {
-		dev_dbg(board->gpib_dev, "failed to setup serial poll\n");
-		os_remove_timer(board);
-		return -EIO;
-	}
-	os_remove_timer(board);
-
-	return 0;
-}
-
-static int read_serial_poll_byte(struct gpib_board *board, unsigned int pad,
-				 int sad, unsigned int usec_timeout, u8 *result)
-{
-	u8 cmd_string[8];
-	int end_flag;
-	int ret;
-	int i;
-	size_t nbytes;
-
-	dev_dbg(board->gpib_dev, "entering  pad=%i sad=%i\n", pad, sad);
-
-	os_start_timer(board, usec_timeout);
-	ret = ibcac(board, 1, 1);
-	if (ret < 0) {
-		os_remove_timer(board);
-		return ret;
-	}
-
-	i = 0;
-	// send talk address
-	cmd_string[i++] = MTA(pad);
-	if (sad >= 0)
-		cmd_string[i++] = MSA(sad);
-
-	ret = board->interface->command(board, cmd_string, i, &nbytes);
-	if (ret < 0 || nbytes < i) {
-		dev_err(board->gpib_dev, "failed to setup serial poll\n");
-		os_remove_timer(board);
-		return -EIO;
-	}
-
-	ibgts(board);
-
-	// read poll result
-	ret = board->interface->read(board, result, 1, &end_flag, &nbytes);
-	if (ret < 0 || nbytes < 1) {
-		dev_err(board->gpib_dev, "serial poll failed\n");
-		os_remove_timer(board);
-		return -EIO;
-	}
-	os_remove_timer(board);
-
-	return 0;
-}
-
-static int cleanup_serial_poll(struct gpib_board *board, unsigned int usec_timeout)
-{
-	u8 cmd_string[8];
-	int ret;
-	size_t bytes_written;
-
-	os_start_timer(board, usec_timeout);
-	ret = ibcac(board, 1, 1);
-	if (ret < 0) {
-		os_remove_timer(board);
-		return ret;
-	}
-
-	cmd_string[0] = SPD;	/* disable serial poll bytes */
-	cmd_string[1] = UNT;
-	ret = board->interface->command(board, cmd_string, 2, &bytes_written);
-	if (ret < 0 || bytes_written < 2) {
-		dev_err(board->gpib_dev, "failed to disable serial poll\n");
-		os_remove_timer(board);
-		return -EIO;
-	}
-	os_remove_timer(board);
-
-	return 0;
-}
-
-static int serial_poll_single(struct gpib_board *board, unsigned int pad, int sad,
-			      unsigned int usec_timeout, u8 *result)
-{
-	int retval, cleanup_retval;
-
-	retval = setup_serial_poll(board, usec_timeout);
-	if (retval < 0)
-		return retval;
-	retval = read_serial_poll_byte(board, pad, sad, usec_timeout, result);
-	cleanup_retval = cleanup_serial_poll(board, usec_timeout);
-	if (retval < 0)
-		return retval;
-	if (cleanup_retval < 0)
-		return retval;
-
-	return 0;
-}
-
-int serial_poll_all(struct gpib_board *board, unsigned int usec_timeout)
-{
-	int retval = 0;
-	struct list_head *cur;
-	const struct list_head *head = NULL;
-	struct gpib_status_queue *device;
-	u8 result;
-	unsigned int num_bytes = 0;
-
-	head = &board->device_list;
-	if (head->next == head)
-		return 0;
-
-	retval = setup_serial_poll(board, usec_timeout);
-	if (retval < 0)
-		return retval;
-
-	for (cur = head->next; cur != head; cur = cur->next) {
-		device = list_entry(cur, struct gpib_status_queue, list);
-		retval = read_serial_poll_byte(board,
-					       device->pad, device->sad, usec_timeout, &result);
-		if (retval < 0)
-			continue;
-		if (result & request_service_bit) {
-			retval = push_status_byte(board, device, result);
-			if (retval < 0)
-				continue;
-			num_bytes++;
-		}
-	}
-
-	retval = cleanup_serial_poll(board, usec_timeout);
-	if (retval < 0)
-		return retval;
-
-	return num_bytes;
-}
-
-/*
- * DVRSP
- * This function performs a serial poll of the device with primary
- * address pad and secondary address sad. If the device has no
- * secondary address, pass a negative number in for this argument.  At the
- * end of a successful serial poll the response is returned in result.
- * SPD and UNT are sent at the completion of the poll.
- */
-
-int dvrsp(struct gpib_board *board, unsigned int pad, int sad,
-	  unsigned int usec_timeout, u8 *result)
-{
-	int status = ibstatus(board);
-	int retval;
-
-	if ((status & CIC) == 0) {
-		dev_err(board->gpib_dev, "not CIC during serial poll\n");
-		return -1;
-	}
-
-	if (pad > MAX_GPIB_PRIMARY_ADDRESS || sad > MAX_GPIB_SECONDARY_ADDRESS || sad < -1) {
-		dev_err(board->gpib_dev, "bad address for serial poll");
-		return -1;
-	}
-
-	retval = serial_poll_single(board, pad, sad, usec_timeout, result);
-	if (io_timed_out(board))
-		retval = -ETIMEDOUT;
-
-	return retval;
-}
-
-static struct gpib_descriptor *handle_to_descriptor(const struct gpib_file_private *file_priv,
-						    int handle)
-{
-	if (handle < 0 || handle >= GPIB_MAX_NUM_DESCRIPTORS) {
-		pr_err("gpib: invalid handle %i\n", handle);
-		return NULL;
-	}
-
-	return file_priv->descriptors[handle];
-}
-
-static int init_gpib_file_private(struct gpib_file_private *priv)
-{
-	memset(priv, 0, sizeof(*priv));
-	atomic_set(&priv->holding_mutex, 0);
-	priv->descriptors[0] = kmalloc(sizeof(struct gpib_descriptor), GFP_KERNEL);
-	if (!priv->descriptors[0]) {
-		pr_err("gpib: failed to allocate default board descriptor\n");
-		return -ENOMEM;
-	}
-	init_gpib_descriptor(priv->descriptors[0]);
-	priv->descriptors[0]->is_board = 1;
-	mutex_init(&priv->descriptors_mutex);
-	return 0;
-}
-
-int ibopen(struct inode *inode, struct file *filep)
-{
-	unsigned int minor = iminor(inode);
-	struct gpib_board *board;
-	struct gpib_file_private *priv;
-
-	if (minor >= GPIB_MAX_NUM_BOARDS) {
-		pr_err("gpib: invalid minor number of device file\n");
-		return -ENXIO;
-	}
-
-	board = &board_array[minor];
-
-	filep->private_data = kmalloc(sizeof(struct gpib_file_private), GFP_KERNEL);
-	if (!filep->private_data)
-		return -ENOMEM;
-
-	priv = filep->private_data;
-	init_gpib_file_private((struct gpib_file_private *)filep->private_data);
-
-	if (board->use_count == 0) {
-		int retval;
-
-		retval = request_module("gpib%i", minor);
-		if (retval)
-			dev_dbg(board->gpib_dev, "request module returned %i\n", retval);
-	}
-	if (board->interface) {
-		if (!try_module_get(board->provider_module)) {
-			dev_err(board->gpib_dev, "try_module_get() failed\n");
-			return -EIO;
-		}
-		board->use_count++;
-		priv->got_module = 1;
-	}
-	return 0;
-}
-
-int ibclose(struct inode *inode, struct file *filep)
-{
-	unsigned int minor = iminor(inode);
-	struct gpib_board *board;
-	struct gpib_file_private *priv = filep->private_data;
-	struct gpib_descriptor *desc;
-
-	if (minor >= GPIB_MAX_NUM_BOARDS) {
-		pr_err("gpib: invalid minor number of device file\n");
-		return -ENODEV;
-	}
-
-	board = &board_array[minor];
-
-	if (priv) {
-		desc = handle_to_descriptor(priv, 0);
-		if (desc) {
-			if (desc->autopoll_enabled) {
-				dev_dbg(board->gpib_dev, "decrementing autospollers\n");
-				if (board->autospollers > 0)
-					board->autospollers--;
-				else
-					dev_err(board->gpib_dev,
-						"Attempt to decrement zero autospollers\n");
-			}
-		} else {
-			dev_err(board->gpib_dev, "Unexpected null gpib_descriptor\n");
-		}
-
-		cleanup_open_devices(priv, board);
-
-		if (atomic_read(&priv->holding_mutex))
-			mutex_unlock(&board->user_mutex);
-
-		if (priv->got_module && board->use_count) {
-			module_put(board->provider_module);
-			--board->use_count;
-		}
-
-		kfree(filep->private_data);
-		filep->private_data = NULL;
-	}
-
-	return 0;
-}
-
-long ibioctl(struct file *filep, unsigned int cmd, unsigned long arg)
-{
-	unsigned int minor = iminor(file_inode(filep));
-	struct gpib_board *board;
-	struct gpib_file_private *file_priv = filep->private_data;
-	long retval = -ENOTTY;
-
-	if (minor >= GPIB_MAX_NUM_BOARDS) {
-		pr_err("gpib: invalid minor number of device file\n");
-		return -ENODEV;
-	}
-	board = &board_array[minor];
-
-	if (mutex_lock_interruptible(&board->big_gpib_mutex))
-		return -ERESTARTSYS;
-
-	dev_dbg(board->gpib_dev, "ioctl %d, interface=%s, use=%d, onl=%d\n",
-		cmd & 0xff,
-		board->interface ? board->interface->name : "",
-		board->use_count,
-		board->online);
-
-	switch (cmd) {
-	case CFCBOARDTYPE:
-		retval = board_type_ioctl(file_priv, board, arg);
-		goto done;
-	case IBONL:
-		retval = online_ioctl(board, arg);
-		goto done;
-	default:
-		break;
-	}
-	if (!board->interface) {
-		dev_err(board->gpib_dev, "no gpib board configured\n");
-		retval = -ENODEV;
-		goto done;
-	}
-	if (file_priv->got_module == 0)	{
-		if (!try_module_get(board->provider_module)) {
-			dev_err(board->gpib_dev, "try_module_get() failed\n");
-			retval = -EIO;
-			goto done;
-		}
-		file_priv->got_module = 1;
-		board->use_count++;
-	}
-	switch (cmd) {
-	case CFCBASE:
-		retval = iobase_ioctl(&board->config, arg);
-		goto done;
-	case CFCIRQ:
-		retval = irq_ioctl(&board->config, arg);
-		goto done;
-	case CFCDMA:
-		retval = dma_ioctl(&board->config, arg);
-		goto done;
-	case IBAUTOSPOLL:
-		retval = autospoll_ioctl(board, file_priv, arg);
-		goto done;
-	case IBBOARD_INFO:
-		retval = board_info_ioctl(board, arg);
-		goto done;
-	case IBMUTEX:
-		/*
-		 * Need to unlock board->big_gpib_mutex before potentially locking board->user_mutex
-		 * to maintain consistent locking order
-		 */
-		mutex_unlock(&board->big_gpib_mutex);
-		return mutex_ioctl(board, file_priv, arg);
-	case IBPAD:
-		retval = pad_ioctl(board, file_priv, arg);
-		goto done;
-	case IBSAD:
-		retval = sad_ioctl(board, file_priv, arg);
-		goto done;
-	case IBSELECT_PCI:
-		retval = select_pci_ioctl(&board->config, arg);
-		goto done;
-	case IBSELECT_DEVICE_PATH:
-		retval = select_device_path_ioctl(&board->config, arg);
-		goto done;
-	default:
-		break;
-	}
-
-	if (!board->online) {
-		retval = -EINVAL;
-		goto done;
-	}
-
-	switch (cmd) {
-	case IBEVENT:
-		retval = event_ioctl(board, arg);
-		goto done;
-	case IBCLOSEDEV:
-		retval = close_dev_ioctl(filep, board, arg);
-		goto done;
-	case IBOPENDEV:
-		retval = open_dev_ioctl(filep, board, arg);
-		goto done;
-	case IBSPOLL_BYTES:
-		retval = status_bytes_ioctl(board, arg);
-		goto done;
-	case IBWAIT:
-		retval = wait_ioctl(file_priv, board, arg);
-		if (retval == -ERESTARTSYS)
-			return retval;
-		goto done;
-	case IBLINES:
-		retval = line_status_ioctl(board, arg);
-		goto done;
-	case IBLOC:
-		board->interface->return_to_local(board);
-		retval = 0;
-		goto done;
-	default:
-		break;
-	}
-
-	spin_lock(&board->locking_pid_spinlock);
-	if (current->pid != board->locking_pid)	{
-		spin_unlock(&board->locking_pid_spinlock);
-		retval = -EPERM;
-		goto done;
-	}
-	spin_unlock(&board->locking_pid_spinlock);
-
-	switch (cmd) {
-	case IB_T1_DELAY:
-		retval = t1_delay_ioctl(board, arg);
-		goto done;
-	case IBCAC:
-		retval = take_control_ioctl(board, arg);
-		goto done;
-	case IBCMD:
-		/*
-		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
-		 * before we call them.
-		 */
-		mutex_unlock(&board->big_gpib_mutex);
-		return command_ioctl(file_priv, board, arg);
-	case IBEOS:
-		retval = eos_ioctl(board, arg);
-		goto done;
-	case IBGTS:
-		retval = ibgts(board);
-		goto done;
-	case IBPPC:
-		retval = ppc_ioctl(board, arg);
-		goto done;
-	case IBPP2_SET:
-		retval = set_local_ppoll_mode_ioctl(board, arg);
-		goto done;
-	case IBPP2_GET:
-		retval = get_local_ppoll_mode_ioctl(board, arg);
-		goto done;
-	case IBQUERY_BOARD_RSV:
-		retval = query_board_rsv_ioctl(board, arg);
-		goto done;
-	case IBRD:
-		/*
-		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
-		 * before we call them.
-		 */
-		mutex_unlock(&board->big_gpib_mutex);
-		return read_ioctl(file_priv, board, arg);
-	case IBRPP:
-		retval = parallel_poll_ioctl(board, arg);
-		goto done;
-	case IBRSC:
-		retval = request_system_control_ioctl(board, arg);
-		goto done;
-	case IBRSP:
-		retval = serial_poll_ioctl(board, arg);
-		goto done;
-	case IBRSV:
-		retval = request_service_ioctl(board, arg);
-		goto done;
-	case IBRSV2:
-		retval = request_service2_ioctl(board, arg);
-		goto done;
-	case IBSIC:
-		retval = interface_clear_ioctl(board, arg);
-		goto done;
-	case IBSRE:
-		retval = remote_enable_ioctl(board, arg);
-		goto done;
-	case IBTMO:
-		retval = timeout_ioctl(board, arg);
-		goto done;
-	case IBWRT:
-		/*
-		 * IO ioctls can take a long time, we need to unlock board->big_gpib_mutex
-		 * before we call them.
-		 */
-		mutex_unlock(&board->big_gpib_mutex);
-		return write_ioctl(file_priv, board, arg);
-	default:
-		retval = -ENOTTY;
-		goto done;
-	}
-
-done:
-	mutex_unlock(&board->big_gpib_mutex);
-	dev_dbg(board->gpib_dev, "ioctl done status = 0x%lx\n", board->status);
-	return retval;
-}
-
-static int board_type_ioctl(struct gpib_file_private *file_priv,
-			    struct gpib_board *board, unsigned long arg)
-{
-	struct list_head *list_ptr;
-	struct gpib_board_type_ioctl cmd;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (board->online)
-		return -EBUSY;
-
-	retval = copy_from_user(&cmd, (void __user *)arg,
-				sizeof(struct gpib_board_type_ioctl));
-	if (retval)
-		return -EFAULT;
-
-	for (list_ptr = registered_drivers.next; list_ptr != &registered_drivers;
-	     list_ptr = list_ptr->next) {
-		struct gpib_interface_list *entry;
-
-		entry = list_entry(list_ptr, struct gpib_interface_list, list);
-		if (strcmp(entry->interface->name, cmd.name) == 0) {
-			int i;
-			int had_module = file_priv->got_module;
-
-			if (board->use_count) {
-				for (i = 0; i < board->use_count; ++i)
-					module_put(board->provider_module);
-				board->interface = NULL;
-				file_priv->got_module = 0;
-			}
-			board->interface = entry->interface;
-			board->provider_module = entry->module;
-			for (i = 0; i < board->use_count; ++i) {
-				if (!try_module_get(entry->module)) {
-					board->use_count = i;
-					return -EIO;
-				}
-			}
-			if (had_module == 0) {
-				if (!try_module_get(entry->module))
-					return -EIO;
-				++board->use_count;
-			}
-			file_priv->got_module = 1;
-			return 0;
-		}
-	}
-
-	return -EINVAL;
-}
-
-static int read_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-		      unsigned long arg)
-{
-	struct gpib_read_write_ioctl read_cmd;
-	u8 __user *userbuf;
-	unsigned long remain;
-	int end_flag = 0;
-	int retval;
-	ssize_t read_ret = 0;
-	struct gpib_descriptor *desc;
-	size_t nbytes;
-
-	retval = copy_from_user(&read_cmd, (void __user *)arg, sizeof(read_cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (read_cmd.completed_transfer_count > read_cmd.requested_transfer_count)
-		return -EINVAL;
-
-	desc = handle_to_descriptor(file_priv, read_cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	if (WARN_ON_ONCE(sizeof(userbuf) > sizeof(read_cmd.buffer_ptr)))
-		return -EFAULT;
-
-	userbuf = (u8 __user *)(unsigned long)read_cmd.buffer_ptr;
-	userbuf += read_cmd.completed_transfer_count;
-
-	remain = read_cmd.requested_transfer_count - read_cmd.completed_transfer_count;
-
-	/* Check write access to buffer */
-	if (!access_ok(userbuf, remain))
-		return -EFAULT;
-
-	atomic_set(&desc->io_in_progress, 1);
-
-	/* Read buffer loads till we fill the user supplied buffer */
-	while (remain > 0 && end_flag == 0) {
-		nbytes = 0;
-		read_ret = ibrd(board, board->buffer, (board->buffer_length < remain) ?
-				board->buffer_length : remain, &end_flag, &nbytes);
-		if (nbytes == 0)
-			break;
-		retval = copy_to_user(userbuf, board->buffer, nbytes);
-		if (retval) {
-			retval = -EFAULT;
-			break;
-		}
-		remain -= nbytes;
-		userbuf += nbytes;
-		if (read_ret < 0)
-			break;
-	}
-	read_cmd.completed_transfer_count = read_cmd.requested_transfer_count - remain;
-	read_cmd.end = end_flag;
-	/*
-	 * suppress errors (for example due to timeout or interruption by device clear)
-	 * if all bytes got sent.  This prevents races that can occur in the various drivers
-	 * if a device receives a device clear immediately after a transfer completes and
-	 * the driver code wasn't careful enough to handle that case.
-	 */
-	if (remain == 0 || end_flag)
-		read_ret = 0;
-	if (retval == 0)
-		retval = copy_to_user((void __user *)arg, &read_cmd, sizeof(read_cmd));
-
-	atomic_set(&desc->io_in_progress, 0);
-
-	wake_up_interruptible(&board->wait);
-	if (retval)
-		return -EFAULT;
-
-	return read_ret;
-}
-
-static int command_ioctl(struct gpib_file_private *file_priv,
-			 struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_read_write_ioctl cmd;
-	u8 __user *userbuf;
-	unsigned long remain;
-	int retval;
-	int fault = 0;
-	struct gpib_descriptor *desc;
-	size_t bytes_written;
-	int no_clear_io_in_prog;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (cmd.completed_transfer_count > cmd.requested_transfer_count)
-		return -EINVAL;
-
-	desc = handle_to_descriptor(file_priv, cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	userbuf = (u8 __user *)(unsigned long)cmd.buffer_ptr;
-	userbuf += cmd.completed_transfer_count;
-
-	no_clear_io_in_prog = cmd.end;
-	cmd.end = 0;
-
-	remain = cmd.requested_transfer_count - cmd.completed_transfer_count;
-
-	/* Check read access to buffer */
-	if (!access_ok(userbuf, remain))
-		return -EFAULT;
-
-	/*
-	 * Write buffer loads till we empty the user supplied buffer.
-	 * Call drivers at least once, even if remain is zero, in
-	 * order to allow them to insure previous commands were
-	 * completely finished, in the case of a restarted ioctl.
-	 */
-
-	atomic_set(&desc->io_in_progress, 1);
-
-	do {
-		fault = copy_from_user(board->buffer, userbuf, (board->buffer_length < remain) ?
-				       board->buffer_length : remain);
-		if (fault) {
-			retval = -EFAULT;
-			bytes_written = 0;
-		} else {
-			retval = ibcmd(board, board->buffer, (board->buffer_length < remain) ?
-				       board->buffer_length : remain, &bytes_written);
-		}
-		remain -= bytes_written;
-		userbuf += bytes_written;
-		if (retval < 0) {
-			atomic_set(&desc->io_in_progress, 0);
-
-			wake_up_interruptible(&board->wait);
-			break;
-		}
-	} while (remain > 0);
-
-	cmd.completed_transfer_count = cmd.requested_transfer_count - remain;
-
-	if (fault == 0)
-		fault = copy_to_user((void __user *)arg, &cmd, sizeof(cmd));
-
-	/*
-	 * no_clear_io_in_prog (cmd.end) is true when io_in_progress should
-	 * not be set to zero because the cmd in progress is the address setup
-	 * operation for an async read or write. This causes CMPL not to be set
-	 * in general_ibstatus until the async read or write completes.
-	 */
-	if (!no_clear_io_in_prog || fault)
-		atomic_set(&desc->io_in_progress, 0);
-
-	wake_up_interruptible(&board->wait);
-	if (fault)
-		return -EFAULT;
-
-	return retval;
-}
-
-static int write_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-		       unsigned long arg)
-{
-	struct gpib_read_write_ioctl write_cmd;
-	u8 __user *userbuf;
-	unsigned long remain;
-	int retval = 0;
-	int fault;
-	struct gpib_descriptor *desc;
-
-	fault = copy_from_user(&write_cmd, (void __user *)arg, sizeof(write_cmd));
-	if (fault)
-		return -EFAULT;
-
-	if (write_cmd.completed_transfer_count > write_cmd.requested_transfer_count)
-		return -EINVAL;
-
-	desc = handle_to_descriptor(file_priv, write_cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	userbuf = (u8 __user *)(unsigned long)write_cmd.buffer_ptr;
-	userbuf += write_cmd.completed_transfer_count;
-
-	remain = write_cmd.requested_transfer_count - write_cmd.completed_transfer_count;
-
-	/* Check read access to buffer */
-	if (!access_ok(userbuf, remain))
-		return -EFAULT;
-
-	atomic_set(&desc->io_in_progress, 1);
-
-	/* Write buffer loads till we empty the user supplied buffer */
-	while (remain > 0) {
-		int send_eoi;
-		size_t bytes_written = 0;
-
-		send_eoi = remain <= board->buffer_length && write_cmd.end;
-		fault = copy_from_user(board->buffer, userbuf, (board->buffer_length < remain) ?
-				       board->buffer_length : remain);
-		if (fault) {
-			retval = -EFAULT;
-			break;
-		}
-		retval = ibwrt(board, board->buffer, (board->buffer_length < remain) ?
-			       board->buffer_length : remain, send_eoi, &bytes_written);
-		remain -= bytes_written;
-		userbuf += bytes_written;
-		if (retval < 0)
-			break;
-	}
-	write_cmd.completed_transfer_count = write_cmd.requested_transfer_count - remain;
-	/*
-	 * suppress errors (for example due to timeout or interruption by device clear)
-	 * if all bytes got sent.  This prevents races that can occur in the various drivers
-	 * if a device receives a device clear immediately after a transfer completes and
-	 * the driver code wasn't careful enough to handle that case.
-	 */
-	if (remain == 0)
-		retval = 0;
-	if (fault == 0)
-		fault = copy_to_user((void __user *)arg, &write_cmd, sizeof(write_cmd));
-
-	atomic_set(&desc->io_in_progress, 0);
-
-	wake_up_interruptible(&board->wait);
-	if (fault)
-		return -EFAULT;
-
-	return retval;
-}
-
-static int status_bytes_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_status_queue *device;
-	struct gpib_spoll_bytes_ioctl cmd;
-	int retval;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	device = get_gpib_status_queue(board, cmd.pad, cmd.sad);
-	if (!device)
-		cmd.num_bytes = 0;
-	else
-		cmd.num_bytes = num_status_bytes(device);
-
-	retval = copy_to_user((void __user *)arg, &cmd, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int increment_open_device_count(struct gpib_board *board, struct list_head *head,
-				       unsigned int pad, int sad)
-{
-	struct list_head *list_ptr;
-	struct gpib_status_queue *device;
-
-	/*
-	 * first see if address has already been opened, then increment
-	 * open count
-	 */
-	for (list_ptr = head->next; list_ptr != head; list_ptr = list_ptr->next) {
-		device = list_entry(list_ptr, struct gpib_status_queue, list);
-		if (gpib_address_equal(device->pad, device->sad, pad, sad)) {
-			dev_dbg(board->gpib_dev, "incrementing open count for pad %i, sad %i\n",
-				device->pad, device->sad);
-			device->reference_count++;
-			return 0;
-		}
-	}
-
-	/* otherwise we need to allocate a new struct gpib_status_queue */
-	device = kmalloc(sizeof(struct gpib_status_queue), GFP_ATOMIC);
-	if (!device)
-		return -ENOMEM;
-	init_gpib_status_queue(device);
-	device->pad = pad;
-	device->sad = sad;
-	device->reference_count = 1;
-
-	list_add(&device->list, head);
-
-	dev_dbg(board->gpib_dev, "opened pad %i, sad %i\n", device->pad, device->sad);
-
-	return 0;
-}
-
-static int subtract_open_device_count(struct gpib_board *board, struct list_head *head,
-				      unsigned int pad, int sad, unsigned int count)
-{
-	struct gpib_status_queue *device;
-	struct list_head *list_ptr;
-
-	for (list_ptr = head->next; list_ptr != head; list_ptr = list_ptr->next) {
-		device = list_entry(list_ptr, struct gpib_status_queue, list);
-		if (gpib_address_equal(device->pad, device->sad, pad, sad)) {
-			dev_dbg(board->gpib_dev, "decrementing open count for pad %i, sad %i\n",
-				device->pad, device->sad);
-			if (count > device->reference_count) {
-				dev_err(board->gpib_dev, "bug! in %s()\n", __func__);
-				return -EINVAL;
-			}
-			device->reference_count -= count;
-			if (device->reference_count == 0) {
-				dev_dbg(board->gpib_dev, "closing pad %i, sad %i\n",
-					device->pad, device->sad);
-				list_del(list_ptr);
-				kfree(device);
-			}
-			return 0;
-		}
-	}
-	dev_err(board->gpib_dev, "bug! tried to close address that was never opened!\n");
-	return -EINVAL;
-}
-
-static inline int decrement_open_device_count(struct gpib_board *board, struct list_head *head,
-					      unsigned int pad, int sad)
-{
-	return subtract_open_device_count(board, head, pad, sad, 1);
-}
-
-static int cleanup_open_devices(struct gpib_file_private *file_priv, struct gpib_board *board)
-{
-	int retval = 0;
-	int i;
-
-	for (i = 0; i < GPIB_MAX_NUM_DESCRIPTORS; i++) {
-		struct gpib_descriptor *desc;
-
-		desc = file_priv->descriptors[i];
-		if (!desc)
-			continue;
-
-		if (desc->is_board == 0) {
-			retval = decrement_open_device_count(board, &board->device_list, desc->pad,
-							     desc->sad);
-			if (retval < 0)
-				return retval;
-		}
-		kfree(desc);
-		file_priv->descriptors[i] = NULL;
-	}
-
-	return 0;
-}
-
-static int open_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_open_dev_ioctl open_dev_cmd;
-	int retval;
-	struct gpib_file_private *file_priv = filep->private_data;
-	int i;
-
-	retval = copy_from_user(&open_dev_cmd, (void __user *)arg, sizeof(open_dev_cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (mutex_lock_interruptible(&file_priv->descriptors_mutex))
-		return -ERESTARTSYS;
-	for (i = 0; i < GPIB_MAX_NUM_DESCRIPTORS; i++)
-		if (!file_priv->descriptors[i])
-			break;
-	if (i == GPIB_MAX_NUM_DESCRIPTORS) {
-		mutex_unlock(&file_priv->descriptors_mutex);
-		return -ERANGE;
-	}
-	file_priv->descriptors[i] = kmalloc(sizeof(struct gpib_descriptor), GFP_KERNEL);
-	if (!file_priv->descriptors[i]) {
-		mutex_unlock(&file_priv->descriptors_mutex);
-		return -ENOMEM;
-	}
-	init_gpib_descriptor(file_priv->descriptors[i]);
-
-	file_priv->descriptors[i]->pad = open_dev_cmd.pad;
-	file_priv->descriptors[i]->sad = open_dev_cmd.sad;
-	file_priv->descriptors[i]->is_board = open_dev_cmd.is_board;
-	mutex_unlock(&file_priv->descriptors_mutex);
-
-	retval = increment_open_device_count(board, &board->device_list, open_dev_cmd.pad,
-					     open_dev_cmd.sad);
-	if (retval < 0)
-		return retval;
-
-	/*
-	 * clear stuck srq state, since we may be able to find service request on
-	 * the new device
-	 */
-	atomic_set(&board->stuck_srq, 0);
-
-	open_dev_cmd.handle = i;
-	retval = copy_to_user((void __user *)arg, &open_dev_cmd, sizeof(open_dev_cmd));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int close_dev_ioctl(struct file *filep, struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_close_dev_ioctl cmd;
-	struct gpib_file_private *file_priv = filep->private_data;
-	int retval;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (cmd.handle >= GPIB_MAX_NUM_DESCRIPTORS)
-		return -EINVAL;
-	if (!file_priv->descriptors[cmd.handle])
-		return -EINVAL;
-
-	retval = decrement_open_device_count(board, &board->device_list,
-					     file_priv->descriptors[cmd.handle]->pad,
-					     file_priv->descriptors[cmd.handle]->sad);
-	if (retval < 0)
-		return retval;
-
-	kfree(file_priv->descriptors[cmd.handle]);
-	file_priv->descriptors[cmd.handle] = NULL;
-
-	return 0;
-}
-
-static int serial_poll_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_serial_poll_ioctl serial_cmd;
-	int retval;
-
-	retval = copy_from_user(&serial_cmd, (void __user *)arg, sizeof(serial_cmd));
-	if (retval)
-		return -EFAULT;
-
-	retval = get_serial_poll_byte(board, serial_cmd.pad, serial_cmd.sad, board->usec_timeout,
-				      &serial_cmd.status_byte);
-	if (retval < 0)
-		return retval;
-
-	retval = copy_to_user((void __user *)arg, &serial_cmd, sizeof(serial_cmd));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int wait_ioctl(struct gpib_file_private *file_priv, struct gpib_board *board,
-		      unsigned long arg)
-{
-	struct gpib_wait_ioctl wait_cmd;
-	int retval;
-	struct gpib_descriptor *desc;
-
-	retval = copy_from_user(&wait_cmd, (void __user *)arg, sizeof(wait_cmd));
-	if (retval)
-		return -EFAULT;
-
-	desc = handle_to_descriptor(file_priv, wait_cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	retval = ibwait(board, wait_cmd.wait_mask, wait_cmd.clear_mask,
-			wait_cmd.set_mask, &wait_cmd.ibsta, wait_cmd.usec_timeout, desc);
-	if (retval < 0)
-		return retval;
-
-	retval = copy_to_user((void __user *)arg, &wait_cmd, sizeof(wait_cmd));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int parallel_poll_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	u8 poll_byte;
-	int retval;
-
-	retval = ibrpp(board, &poll_byte);
-	if (retval < 0)
-		return retval;
-
-	retval = copy_to_user((void __user *)arg, &poll_byte, sizeof(poll_byte));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int online_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_online_ioctl online_cmd;
-	int retval;
-	void __user *init_data = NULL;
-
-	board->config.init_data = NULL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	retval = copy_from_user(&online_cmd, (void __user *)arg, sizeof(online_cmd));
-	if (retval)
-		return -EFAULT;
-	if (online_cmd.init_data_length > 0) {
-		board->config.init_data = vmalloc(online_cmd.init_data_length);
-		if (!board->config.init_data)
-			return -ENOMEM;
-		if (WARN_ON_ONCE(sizeof(init_data) > sizeof(online_cmd.init_data_ptr)))
-			return -EFAULT;
-		init_data = (void __user *)(unsigned long)(online_cmd.init_data_ptr);
-		retval = copy_from_user(board->config.init_data, init_data,
-					online_cmd.init_data_length);
-		if (retval) {
-			vfree(board->config.init_data);
-			return -EFAULT;
-		}
-		board->config.init_data_length = online_cmd.init_data_length;
-	} else {
-		board->config.init_data = NULL;
-		board->config.init_data_length = 0;
-	}
-	if (online_cmd.online)
-		retval = ibonline(board);
-	else
-		retval = iboffline(board);
-	if (board->config.init_data) {
-		vfree(board->config.init_data);
-		board->config.init_data = NULL;
-		board->config.init_data_length = 0;
-	}
-	return retval;
-}
-
-static int remote_enable_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	int enable;
-	int retval;
-
-	retval = copy_from_user(&enable, (void __user *)arg, sizeof(enable));
-	if (retval)
-		return -EFAULT;
-
-	return ibsre(board, enable);
-}
-
-static int take_control_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	int synchronous;
-	int retval;
-
-	retval = copy_from_user(&synchronous, (void __user *)arg, sizeof(synchronous));
-	if (retval)
-		return -EFAULT;
-
-	return ibcac(board, synchronous, 1);
-}
-
-static int line_status_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	short lines;
-	int retval;
-
-	retval = iblines(board, &lines);
-	if (retval < 0)
-		return retval;
-
-	retval = copy_to_user((void __user *)arg, &lines, sizeof(lines));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int pad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		     unsigned long arg)
-{
-	struct gpib_pad_ioctl cmd;
-	int retval;
-	struct gpib_descriptor *desc;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	desc = handle_to_descriptor(file_priv, cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	if (desc->is_board) {
-		retval = ibpad(board, cmd.pad);
-		if (retval < 0)
-			return retval;
-	} else {
-		retval = decrement_open_device_count(board, &board->device_list, desc->pad,
-						     desc->sad);
-		if (retval < 0)
-			return retval;
-
-		desc->pad = cmd.pad;
-
-		retval = increment_open_device_count(board, &board->device_list, desc->pad,
-						     desc->sad);
-		if (retval < 0)
-			return retval;
-	}
-
-	return 0;
-}
-
-static int sad_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		     unsigned long arg)
-{
-	struct gpib_sad_ioctl cmd;
-	int retval;
-	struct gpib_descriptor *desc;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	desc = handle_to_descriptor(file_priv, cmd.handle);
-	if (!desc)
-		return -EINVAL;
-
-	if (desc->is_board) {
-		retval = ibsad(board, cmd.sad);
-		if (retval < 0)
-			return retval;
-	} else {
-		retval = decrement_open_device_count(board, &board->device_list, desc->pad,
-						     desc->sad);
-		if (retval < 0)
-			return retval;
-
-		desc->sad = cmd.sad;
-
-		retval = increment_open_device_count(board, &board->device_list, desc->pad,
-						     desc->sad);
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static int eos_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_eos_ioctl eos_cmd;
-	int retval;
-
-	retval = copy_from_user(&eos_cmd, (void __user *)arg, sizeof(eos_cmd));
-	if (retval)
-		return -EFAULT;
-
-	return ibeos(board, eos_cmd.eos, eos_cmd.eos_flags);
-}
-
-static int request_service_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	u8 status_byte;
-	int retval;
-
-	retval = copy_from_user(&status_byte, (void __user *)arg, sizeof(status_byte));
-	if (retval)
-		return -EFAULT;
-
-	return ibrsv2(board, status_byte, status_byte & request_service_bit);
-}
-
-static int request_service2_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_request_service2 request_service2_cmd;
-	int retval;
-
-	retval = copy_from_user(&request_service2_cmd, (void __user *)arg,
-				sizeof(struct gpib_request_service2));
-	if (retval)
-		return -EFAULT;
-
-	return ibrsv2(board, request_service2_cmd.status_byte,
-		      request_service2_cmd.new_reason_for_service);
-}
-
-static int iobase_ioctl(struct gpib_board_config *config, unsigned long arg)
-{
-	u64 base_addr;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	retval = copy_from_user(&base_addr, (void __user *)arg, sizeof(base_addr));
-	if (retval)
-		return -EFAULT;
-
-	if (WARN_ON_ONCE(sizeof(void *) > sizeof(base_addr)))
-		return -EFAULT;
-	config->ibbase = base_addr;
-
-	return 0;
-}
-
-static int irq_ioctl(struct gpib_board_config *config, unsigned long arg)
-{
-	unsigned int irq;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	retval = copy_from_user(&irq, (void __user *)arg, sizeof(irq));
-	if (retval)
-		return -EFAULT;
-
-	config->ibirq = irq;
-
-	return 0;
-}
-
-static int dma_ioctl(struct gpib_board_config *config, unsigned long arg)
-{
-	unsigned int dma_channel;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	retval = copy_from_user(&dma_channel, (void __user *)arg, sizeof(dma_channel));
-	if (retval)
-		return -EFAULT;
-
-	config->ibdma = dma_channel;
-
-	return 0;
-}
-
-static int autospoll_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-			   unsigned long arg)
-{
-	short enable;
-	int retval;
-	struct gpib_descriptor *desc;
-
-	retval = copy_from_user(&enable, (void __user *)arg, sizeof(enable));
-	if (retval)
-		return -EFAULT;
-
-	desc = handle_to_descriptor(file_priv, 0); /* board handle is 0 */
-
-	if (enable) {
-		if (!desc->autopoll_enabled) {
-			board->autospollers++;
-			desc->autopoll_enabled = 1;
-		}
-		retval = 0;
-	} else {
-		if (desc->autopoll_enabled) {
-			desc->autopoll_enabled = 0;
-			if (board->autospollers > 0) {
-				board->autospollers--;
-				retval = 0;
-			} else {
-				dev_err(board->gpib_dev,
-					"tried to set number of autospollers negative\n");
-				retval = -EINVAL;
-			}
-		} else {
-			dev_err(board->gpib_dev, "autopoll disable requested before enable\n");
-			retval = -EINVAL;
-		}
-	}
-	return retval;
-}
-
-static int mutex_ioctl(struct gpib_board *board, struct gpib_file_private *file_priv,
-		       unsigned long arg)
-{
-	int retval, lock_mutex;
-
-	retval = copy_from_user(&lock_mutex, (void __user *)arg, sizeof(lock_mutex));
-	if (retval)
-		return -EFAULT;
-
-	if (lock_mutex)	{
-		retval = mutex_lock_interruptible(&board->user_mutex);
-		if (retval)
-			return -ERESTARTSYS;
-
-		spin_lock(&board->locking_pid_spinlock);
-		board->locking_pid = current->pid;
-		spin_unlock(&board->locking_pid_spinlock);
-
-		atomic_set(&file_priv->holding_mutex, 1);
-
-		dev_dbg(board->gpib_dev, "locked board mutex\n");
-	} else {
-		spin_lock(&board->locking_pid_spinlock);
-		if (current->pid != board->locking_pid) {
-			dev_err(board->gpib_dev, "bug! pid %i tried to release mutex held by pid %i\n",
-				current->pid, board->locking_pid);
-			spin_unlock(&board->locking_pid_spinlock);
-			return -EPERM;
-		}
-		board->locking_pid = 0;
-		spin_unlock(&board->locking_pid_spinlock);
-
-		atomic_set(&file_priv->holding_mutex, 0);
-
-		mutex_unlock(&board->user_mutex);
-		dev_dbg(board->gpib_dev, "unlocked board mutex\n");
-	}
-	return 0;
-}
-
-static int timeout_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	unsigned int timeout;
-	int retval;
-
-	retval = copy_from_user(&timeout, (void __user *)arg, sizeof(timeout));
-	if (retval)
-		return -EFAULT;
-
-	board->usec_timeout = timeout;
-	dev_dbg(board->gpib_dev, "timeout set to %i usec\n", timeout);
-
-	return 0;
-}
-
-static int ppc_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_ppoll_config_ioctl cmd;
-	int retval;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (cmd.set_ist) {
-		board->ist = 1;
-		board->interface->parallel_poll_response(board, board->ist);
-	} else if (cmd.clear_ist) {
-		board->ist = 0;
-		board->interface->parallel_poll_response(board, board->ist);
-	}
-
-	if (cmd.config)	{
-		retval = ibppc(board, cmd.config);
-		if (retval < 0)
-			return retval;
-	}
-
-	return 0;
-}
-
-static int set_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	short cmd;
-	int retval;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	if (!board->interface->local_parallel_poll_mode)
-		return -ENOENT;
-	board->local_ppoll_mode = cmd != 0;
-	board->interface->local_parallel_poll_mode(board, board->local_ppoll_mode);
-
-	return 0;
-}
-
-static int get_local_ppoll_mode_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	short cmd;
-	int retval;
-
-	cmd = board->local_ppoll_mode;
-	retval = copy_to_user((void __user *)arg, &cmd, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int query_board_rsv_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	int status;
-	int retval;
-
-	status = board->interface->serial_poll_status(board);
-
-	retval = copy_to_user((void __user *)arg, &status, sizeof(status));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int board_info_ioctl(const struct gpib_board *board, unsigned long arg)
-{
-	struct gpib_board_info_ioctl info = { };
-	int retval;
-
-	info.pad = board->pad;
-	info.sad = board->sad;
-	info.parallel_poll_configuration = board->parallel_poll_configuration;
-	info.is_system_controller = board->master;
-	if (board->autospollers)
-		info.autopolling = 1;
-	else
-		info.autopolling = 0;
-	info.t1_delay = board->t1_nano_sec;
-	info.ist = board->ist;
-	info.no_7_bit_eos = board->interface->no_7_bit_eos;
-	retval = copy_to_user((void __user *)arg, &info, sizeof(info));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int interface_clear_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	unsigned int usec_duration;
-	int retval;
-
-	retval = copy_from_user(&usec_duration, (void __user *)arg, sizeof(usec_duration));
-	if (retval)
-		return -EFAULT;
-
-	return ibsic(board, usec_duration);
-}
-
-static int select_pci_ioctl(struct gpib_board_config *config, unsigned long arg)
-{
-	struct gpib_select_pci_ioctl selection;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	retval = copy_from_user(&selection, (void __user *)arg, sizeof(selection));
-	if (retval)
-		return -EFAULT;
-
-	config->pci_bus = selection.pci_bus;
-	config->pci_slot = selection.pci_slot;
-
-	return 0;
-}
-
-static int select_device_path_ioctl(struct gpib_board_config *config, unsigned long arg)
-{
-	struct gpib_select_device_path_ioctl *selection;
-	int retval;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	selection = vmalloc(sizeof(struct gpib_select_device_path_ioctl));
-	if (!selection)
-		return -ENOMEM;
-
-	retval = copy_from_user(selection, (void __user *)arg,
-				sizeof(struct gpib_select_device_path_ioctl));
-	if (retval) {
-		vfree(selection);
-		return -EFAULT;
-	}
-
-	selection->device_path[sizeof(selection->device_path) - 1] = '\0';
-	kfree(config->device_path);
-	config->device_path = NULL;
-	if (strlen(selection->device_path) > 0)
-		config->device_path = kstrdup(selection->device_path, GFP_KERNEL);
-
-	vfree(selection);
-	return 0;
-}
-
-unsigned int num_gpib_events(const struct gpib_event_queue *queue)
-{
-	return queue->num_events;
-}
-
-static int push_gpib_event_nolock(struct gpib_board *board, short event_type)
-{
-	struct gpib_event_queue *queue = &board->event_queue;
-	struct list_head *head = &queue->event_head;
-	struct gpib_event *event;
-	static const unsigned int max_num_events = 1024;
-	int retval;
-
-	if (num_gpib_events(queue) >= max_num_events) {
-		short lost_event;
-
-		queue->dropped_event = 1;
-		retval = pop_gpib_event_nolock(board, queue, &lost_event);
-		if (retval < 0)
-			return retval;
-	}
-
-	event = kmalloc(sizeof(struct gpib_event), GFP_ATOMIC);
-	if (!event) {
-		queue->dropped_event = 1;
-		dev_err(board->gpib_dev, "failed to allocate memory for event\n");
-		return -ENOMEM;
-	}
-
-	INIT_LIST_HEAD(&event->list);
-	event->event_type = event_type;
-
-	list_add_tail(&event->list, head);
-
-	queue->num_events++;
-
-	dev_dbg(board->gpib_dev, "pushed event %i, %i in queue\n",
-		(int)event_type, num_gpib_events(queue));
-
-	return 0;
-}
-
-// push event onto back of event queue
-int push_gpib_event(struct gpib_board *board, short event_type)
-{
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&board->event_queue.lock, flags);
-	retval = push_gpib_event_nolock(board, event_type);
-	spin_unlock_irqrestore(&board->event_queue.lock, flags);
-
-	if (event_type == EVENT_DEV_TRG)
-		board->status |= DTAS;
-	if (event_type == EVENT_DEV_CLR)
-		board->status |= DCAS;
-
-	return retval;
-}
-EXPORT_SYMBOL(push_gpib_event);
-
-static int pop_gpib_event_nolock(struct gpib_board *board,
-				 struct gpib_event_queue *queue, short *event_type)
-{
-	struct list_head *head = &queue->event_head;
-	struct list_head *front = head->next;
-	struct gpib_event *event;
-
-	if (num_gpib_events(queue) == 0) {
-		*event_type = EVENT_NONE;
-		return 0;
-	}
-
-	if (front == head)
-		return -EIO;
-
-	if (queue->dropped_event) {
-		queue->dropped_event = 0;
-		return -EPIPE;
-	}
-
-	event = list_entry(front, struct gpib_event, list);
-	*event_type = event->event_type;
-
-	list_del(front);
-	kfree(event);
-
-	queue->num_events--;
-
-	dev_dbg(board->gpib_dev, "popped event %i, %i in queue\n",
-		(int)*event_type, num_gpib_events(queue));
-
-	return 0;
-}
-
-// pop event from front of event queue
-int pop_gpib_event(struct gpib_board *board, struct gpib_event_queue *queue, short *event_type)
-{
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&queue->lock, flags);
-	retval = pop_gpib_event_nolock(board, queue, event_type);
-	spin_unlock_irqrestore(&queue->lock, flags);
-	return retval;
-}
-
-static int event_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	short user_event;
-	int retval;
-	short event;
-
-	retval = pop_gpib_event(board, &board->event_queue, &event);
-	if (retval < 0)
-		return retval;
-
-	user_event = event;
-
-	retval = copy_to_user((void __user *)arg, &user_event, sizeof(user_event));
-	if (retval)
-		return -EFAULT;
-
-	return 0;
-}
-
-static int request_system_control_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	int request_control;
-	int retval;
-
-	retval = copy_from_user(&request_control, (void __user *)arg, sizeof(request_control));
-	if (retval)
-		return -EFAULT;
-
-	return ibrsc(board, request_control);
-}
-
-static int t1_delay_ioctl(struct gpib_board *board, unsigned long arg)
-{
-	unsigned int cmd;
-	unsigned int delay;
-	int retval;
-
-	if (!board->interface->t1_delay)
-		return -ENOENT;
-
-	retval = copy_from_user(&cmd, (void __user *)arg, sizeof(cmd));
-	if (retval)
-		return -EFAULT;
-
-	delay = cmd;
-
-	retval = board->interface->t1_delay(board, delay);
-	if (retval < 0)
-		return retval;
-
-	board->t1_nano_sec = retval;
-	return 0;
-}
-
-static const struct file_operations ib_fops = {
-	.owner = THIS_MODULE,
-	.llseek = NULL,
-	.unlocked_ioctl = &ibioctl,
-	.compat_ioctl = &ibioctl,
-	.open = &ibopen,
-	.release = &ibclose,
-};
-
-struct gpib_board board_array[GPIB_MAX_NUM_BOARDS];
-
-LIST_HEAD(registered_drivers);
-
-void init_gpib_descriptor(struct gpib_descriptor *desc)
-{
-	desc->pad = 0;
-	desc->sad = -1;
-	desc->is_board = 0;
-	desc->autopoll_enabled = 0;
-	atomic_set(&desc->io_in_progress, 0);
-}
-
-int gpib_register_driver(struct gpib_interface *interface, struct module *provider_module)
-{
-	struct gpib_interface_list *entry;
-
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	entry->interface = interface;
-	entry->module = provider_module;
-	list_add(&entry->list, &registered_drivers);
-
-	return 0;
-}
-EXPORT_SYMBOL(gpib_register_driver);
-
-void gpib_unregister_driver(struct gpib_interface *interface)
-{
-	int i;
-	struct list_head *list_ptr;
-
-	for (i = 0; i < GPIB_MAX_NUM_BOARDS; i++) {
-		struct gpib_board *board = &board_array[i];
-
-		if (board->interface == interface) {
-			if (board->use_count > 0)
-				pr_warn("gpib: Warning: deregistered interface %s in use\n",
-					interface->name);
-			iboffline(board);
-			board->interface = NULL;
-		}
-	}
-	for (list_ptr = registered_drivers.next; list_ptr != &registered_drivers;) {
-		struct gpib_interface_list *entry;
-
-		entry = list_entry(list_ptr, struct gpib_interface_list, list);
-		list_ptr = list_ptr->next;
-		if (entry->interface == interface) {
-			list_del(&entry->list);
-			kfree(entry);
-		}
-	}
-}
-EXPORT_SYMBOL(gpib_unregister_driver);
-
-static void init_gpib_board_config(struct gpib_board_config *config)
-{
-	memset(config, 0, sizeof(struct gpib_board_config));
-	config->pci_bus = -1;
-	config->pci_slot = -1;
-}
-
-void init_gpib_board(struct gpib_board *board)
-{
-	board->interface = NULL;
-	board->provider_module = NULL;
-	board->buffer = NULL;
-	board->buffer_length = 0;
-	board->status = 0;
-	init_waitqueue_head(&board->wait);
-	mutex_init(&board->user_mutex);
-	mutex_init(&board->big_gpib_mutex);
-	board->locking_pid = 0;
-	spin_lock_init(&board->locking_pid_spinlock);
-	spin_lock_init(&board->spinlock);
-	timer_setup(&board->timer, NULL, 0);
-	board->dev = NULL;
-	board->gpib_dev = NULL;
-	init_gpib_board_config(&board->config);
-	board->private_data = NULL;
-	board->use_count = 0;
-	INIT_LIST_HEAD(&board->device_list);
-	board->pad = 0;
-	board->sad = -1;
-	board->usec_timeout = 3000000;
-	board->parallel_poll_configuration = 0;
-	board->online = 0;
-	board->autospollers = 0;
-	board->autospoll_task = NULL;
-	init_event_queue(&board->event_queue);
-	board->minor = -1;
-	init_gpib_pseudo_irq(&board->pseudo_irq);
-	board->master = 1;
-	atomic_set(&board->stuck_srq, 0);
-	board->local_ppoll_mode = 0;
-}
-
-int gpib_allocate_board(struct gpib_board *board)
-{
-	if (!board->buffer) {
-		board->buffer_length = 0x4000;
-		board->buffer = vmalloc(board->buffer_length);
-		if (!board->buffer) {
-			board->buffer_length = 0;
-			return -ENOMEM;
-		}
-	}
-	return 0;
-}
-
-void gpib_deallocate_board(struct gpib_board *board)
-{
-	short dummy;
-
-	if (board->buffer) {
-		vfree(board->buffer);
-		board->buffer = NULL;
-		board->buffer_length = 0;
-	}
-	while (num_gpib_events(&board->event_queue))
-		pop_gpib_event(board, &board->event_queue, &dummy);
-}
-
-static void init_board_array(struct gpib_board *board_array, unsigned int length)
-{
-	int i;
-
-	for (i = 0; i < length; i++) {
-		init_gpib_board(&board_array[i]);
-		board_array[i].minor = i;
-	}
-}
-
-void init_gpib_status_queue(struct gpib_status_queue *device)
-{
-	INIT_LIST_HEAD(&device->list);
-	INIT_LIST_HEAD(&device->status_bytes);
-	device->num_status_bytes = 0;
-	device->reference_count = 0;
-	device->dropped_byte = 0;
-}
-
-static struct class *gpib_class;
-
-static int __init gpib_common_init_module(void)
-{
-	int i;
-
-	pr_info("GPIB core driver\n");
-	init_board_array(board_array, GPIB_MAX_NUM_BOARDS);
-	if (register_chrdev(GPIB_CODE, "gpib", &ib_fops)) {
-		pr_err("gpib: can't get major %d\n", GPIB_CODE);
-		return -EIO;
-	}
-	gpib_class = class_create("gpib_common");
-	if (IS_ERR(gpib_class)) {
-		pr_err("gpib: failed to create gpib class\n");
-		unregister_chrdev(GPIB_CODE, "gpib");
-		return PTR_ERR(gpib_class);
-	}
-	for (i = 0; i < GPIB_MAX_NUM_BOARDS; ++i)
-		board_array[i].gpib_dev = device_create(gpib_class, NULL,
-							MKDEV(GPIB_CODE, i), NULL, "gpib%i", i);
-
-	return 0;
-}
-
-static void __exit gpib_common_exit_module(void)
-{
-	int i;
-
-	for (i = 0; i < GPIB_MAX_NUM_BOARDS; ++i)
-		device_destroy(gpib_class, MKDEV(GPIB_CODE, i));
-
-	class_destroy(gpib_class);
-	unregister_chrdev(GPIB_CODE, "gpib");
-}
-
-int gpib_match_device_path(struct device *dev, const char *device_path_in)
-{
-	if (device_path_in) {
-		char *device_path;
-
-		device_path = kobject_get_path(&dev->kobj, GFP_KERNEL);
-		if (!device_path) {
-			dev_err(dev, "kobject_get_path returned NULL.");
-			return 0;
-		}
-		if (strcmp(device_path_in, device_path) != 0) {
-			kfree(device_path);
-			return 0;
-		}
-		kfree(device_path);
-	}
-	return 1;
-}
-EXPORT_SYMBOL(gpib_match_device_path);
-
-struct pci_dev *gpib_pci_get_device(const struct gpib_board_config *config, unsigned int vendor_id,
-				    unsigned int device_id, struct pci_dev *from)
-{
-	struct pci_dev *pci_device = from;
-
-	while ((pci_device = pci_get_device(vendor_id, device_id, pci_device)))	{
-		if (config->pci_bus >= 0 && config->pci_bus != pci_device->bus->number)
-			continue;
-		if (config->pci_slot >= 0 && config->pci_slot !=
-		    PCI_SLOT(pci_device->devfn))
-			continue;
-		if (gpib_match_device_path(&pci_device->dev, config->device_path) == 0)
-			continue;
-		return pci_device;
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(gpib_pci_get_device);
-
-struct pci_dev *gpib_pci_get_subsys(const struct gpib_board_config *config, unsigned int vendor_id,
-				    unsigned int device_id, unsigned int ss_vendor,
-				    unsigned int ss_device,
-				    struct pci_dev *from)
-{
-	struct pci_dev *pci_device = from;
-
-	while ((pci_device = pci_get_subsys(vendor_id, device_id,
-					    ss_vendor, ss_device, pci_device))) {
-		if (config->pci_bus >= 0 && config->pci_bus != pci_device->bus->number)
-			continue;
-		if (config->pci_slot >= 0 && config->pci_slot !=
-		    PCI_SLOT(pci_device->devfn))
-			continue;
-		if (gpib_match_device_path(&pci_device->dev, config->device_path) == 0)
-			continue;
-		return pci_device;
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(gpib_pci_get_subsys);
-
-module_init(gpib_common_init_module);
-module_exit(gpib_common_exit_module);
-
diff --git a/drivers/staging/gpib/common/iblib.c b/drivers/staging/gpib/common/iblib.c
deleted file mode 100644
index 7cbb6a467177..000000000000
--- a/drivers/staging/gpib/common/iblib.c
+++ /dev/null
@@ -1,717 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *    copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define dev_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "ibsys.h"
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <linux/vmalloc.h>
-
-/*
- * IBCAC
- * Return to the controller active state from the
- * controller standby state, i.e., turn ATN on.  Note
- * that in order to enter the controller active state
- * from the controller idle state, ibsic must be called.
- * If sync is non-zero, attempt to take control synchronously.
- * If fallback_to_async is non-zero, try to take control asynchronously
- * if synchronous attempt fails.
- */
-int ibcac(struct gpib_board *board, int sync, int fallback_to_async)
-{
-	int status = ibstatus(board);
-	int retval;
-
-	if ((status & CIC) == 0)
-		return -EINVAL;
-
-	if (status & ATN)
-		return 0;
-
-	if (sync && (status & LACS) == 0)
-		/*
-		 * tcs (take control synchronously) can only possibly work when
-		 * controller is listener.  Error code also needs to be -ETIMEDOUT
-		 * or it will giveout without doing fallback.
-		 */
-		retval = -ETIMEDOUT;
-	else
-		retval = board->interface->take_control(board, sync);
-
-	if (retval < 0 && fallback_to_async) {
-		if (sync && retval == -ETIMEDOUT)
-			retval = board->interface->take_control(board, 0);
-	}
-	board->interface->update_status(board, 0);
-
-	return retval;
-}
-
-/*
- * After ATN is asserted, it should cause any connected devices
- * to start listening for command bytes and leave acceptor idle state.
- * So if ATN is asserted and neither NDAC or NRFD are asserted,
- * then there are no devices and ibcmd should error out immediately.
- * Some gpib hardware sees itself asserting NDAC/NRFD when it
- * is controller in charge, in which case this check will
- * do nothing useful (but shouldn't cause any harm either).
- * Drivers that don't need this check (ni_usb for example) may
- * set the skip_check_for_command_acceptors flag in their
- * gpib_interface_struct to avoid useless overhead.
- */
-static int check_for_command_acceptors(struct gpib_board *board)
-{
-	int lines;
-
-	if (board->interface->skip_check_for_command_acceptors)
-		return 0;
-	if (!board->interface->line_status)
-		return 0;
-
-	udelay(2); // allow time for devices to respond to ATN if it was just asserted
-
-	lines = board->interface->line_status(board);
-	if (lines < 0)
-		return lines;
-
-	if ((lines & VALID_NRFD) && (lines & VALID_NDAC))	{
-		if ((lines & BUS_NRFD) == 0 && (lines & BUS_NDAC) == 0)
-			return -ENOTCONN;
-	}
-
-	return 0;
-}
-
-/*
- * IBCMD
- * Write cnt command bytes from buf to the GPIB.  The
- * command operation terminates only on I/O complete.
- *
- * NOTE:
- *      1.  Prior to beginning the command, the interface is
- *          placed in the controller active state.
- *      2.  Before calling ibcmd for the first time, ibsic
- *          must be called to initialize the GPIB and enable
- *          the interface to leave the controller idle state.
- */
-int ibcmd(struct gpib_board *board, u8 *buf, size_t length, size_t *bytes_written)
-{
-	ssize_t ret = 0;
-	int status;
-
-	*bytes_written = 0;
-
-	status = ibstatus(board);
-
-	if ((status & CIC) == 0)
-		return -EINVAL;
-
-	os_start_timer(board, board->usec_timeout);
-
-	ret = ibcac(board, 1, 1);
-	if (ret == 0) {
-		ret = check_for_command_acceptors(board);
-		if (ret == 0)
-			ret = board->interface->command(board, buf, length, bytes_written);
-	}
-
-	os_remove_timer(board);
-
-	if (io_timed_out(board))
-		ret = -ETIMEDOUT;
-
-	return ret;
-}
-
-/*
- * IBGTS
- * Go to the controller standby state from the controller
- * active state, i.e., turn ATN off.
- */
-
-int ibgts(struct gpib_board *board)
-{
-	int status = ibstatus(board);
-	int retval;
-
-	if ((status & CIC) == 0)
-		return -EINVAL;
-
-	retval = board->interface->go_to_standby(board);    /* go to standby */
-
-	board->interface->update_status(board, 0);
-
-	return retval;
-}
-
-static int autospoll_wait_should_wake_up(struct gpib_board *board)
-{
-	int retval;
-
-	mutex_lock(&board->big_gpib_mutex);
-
-	retval = board->master && board->autospollers > 0 &&
-		!atomic_read(&board->stuck_srq) &&
-		test_and_clear_bit(SRQI_NUM, &board->status);
-
-	mutex_unlock(&board->big_gpib_mutex);
-	return retval;
-}
-
-static int autospoll_thread(void *board_void)
-{
-	struct gpib_board *board = board_void;
-	int retval = 0;
-
-	dev_dbg(board->gpib_dev, "entering autospoll thread\n");
-
-	while (1) {
-		wait_event_interruptible(board->wait,
-					 kthread_should_stop() ||
-					 autospoll_wait_should_wake_up(board));
-		dev_dbg(board->gpib_dev, "autospoll wait satisfied\n");
-		if (kthread_should_stop())
-			break;
-
-		mutex_lock(&board->big_gpib_mutex);
-		/* make sure we are still good after we have lock */
-		if (board->autospollers <= 0 || board->master == 0) {
-			mutex_unlock(&board->big_gpib_mutex);
-			continue;
-		}
-		mutex_unlock(&board->big_gpib_mutex);
-
-		if (try_module_get(board->provider_module)) {
-			retval = autopoll_all_devices(board);
-			module_put(board->provider_module);
-		} else {
-			dev_err(board->gpib_dev, "try_module_get() failed!\n");
-		}
-		if (retval <= 0) {
-			dev_err(board->gpib_dev, "stuck SRQ\n");
-
-			atomic_set(&board->stuck_srq, 1);	// XXX could be better
-			set_bit(SRQI_NUM, &board->status);
-		}
-	}
-	return retval;
-}
-
-int ibonline(struct gpib_board *board)
-{
-	int retval;
-
-	if (board->online)
-		return -EBUSY;
-	if (!board->interface)
-		return -ENODEV;
-	retval = gpib_allocate_board(board);
-	if (retval < 0)
-		return retval;
-
-	board->dev = NULL;
-	board->local_ppoll_mode = 0;
-	retval = board->interface->attach(board, &board->config);
-	if (retval < 0) {
-		board->interface->detach(board);
-		return retval;
-	}
-	/*
-	 * nios2nommu on 2.6.11 uclinux kernel has weird problems
-	 * with autospoll thread causing huge slowdowns
-	 */
-#ifndef CONFIG_NIOS2
-	board->autospoll_task = kthread_run(&autospoll_thread, board,
-					    "gpib%d_autospoll_kthread", board->minor);
-	retval = IS_ERR(board->autospoll_task);
-	if (retval) {
-		dev_err(board->gpib_dev, "failed to create autospoll thread\n");
-		board->interface->detach(board);
-		return retval;
-	}
-#endif
-	board->online = 1;
-	dev_dbg(board->gpib_dev, "board online\n");
-
-	return 0;
-}
-
-/* XXX need to make sure board is generally not in use (grab board lock?) */
-int iboffline(struct gpib_board *board)
-{
-	int retval;
-
-	if (board->online == 0)
-		return 0;
-	if (!board->interface)
-		return -ENODEV;
-
-	if (board->autospoll_task && !IS_ERR(board->autospoll_task)) {
-		retval = kthread_stop(board->autospoll_task);
-		if (retval)
-			dev_err(board->gpib_dev, "kthread_stop returned %i\n", retval);
-		board->autospoll_task = NULL;
-	}
-
-	board->interface->detach(board);
-	gpib_deallocate_board(board);
-	board->online = 0;
-	dev_dbg(board->gpib_dev, "board offline\n");
-
-	return 0;
-}
-
-/*
- * IBLINES
- * Poll the GPIB control lines and return their status in buf.
- *
- *      LSB (bits 0-7)  -  VALID lines mask (lines that can be monitored).
- * Next LSB (bits 8-15) - STATUS lines mask (lines that are currently set).
- *
- */
-int iblines(const struct gpib_board *board, short *lines)
-{
-	int retval;
-
-	*lines = 0;
-	if (!board->interface->line_status)
-		return 0;
-	retval = board->interface->line_status(board);
-	if (retval < 0)
-		return retval;
-	*lines = retval;
-	return 0;
-}
-
-/*
- * IBRD
- * Read up to 'length' bytes of data from the GPIB into buf.  End
- * on detection of END (EOI and or EOS) and set 'end_flag'.
- *
- * NOTE:
- *      1.  The interface is placed in the controller standby
- *          state prior to beginning the read.
- *      2.  Prior to calling ibrd, the intended devices as well
- *          as the interface board itself must be addressed by
- *          calling ibcmd.
- */
-
-int ibrd(struct gpib_board *board, u8 *buf, size_t length, int *end_flag, size_t *nbytes)
-{
-	ssize_t ret = 0;
-	int retval;
-	size_t bytes_read;
-
-	*nbytes = 0;
-	*end_flag = 0;
-	if (length == 0)
-		return 0;
-
-	if (board->master) {
-		retval = ibgts(board);
-		if (retval < 0)
-			return retval;
-	}
-	/*
-	 * XXX resetting timer here could cause timeouts take longer than they should,
-	 * since read_ioctl calls this
-	 * function in a loop, there is probably a similar problem with writes/commands
-	 */
-	os_start_timer(board, board->usec_timeout);
-
-	do {
-		ret = board->interface->read(board, buf, length - *nbytes, end_flag, &bytes_read);
-		if (ret < 0)
-			goto ibrd_out;
-
-		buf += bytes_read;
-		*nbytes += bytes_read;
-		if (need_resched())
-			schedule();
-	} while (ret == 0 && *nbytes > 0 && *nbytes < length && *end_flag == 0);
-ibrd_out:
-	os_remove_timer(board);
-
-	return ret;
-}
-
-/*
- * IBRPP
- * Conduct a parallel poll and return the byte in buf.
- *
- * NOTE:
- *	1.  Prior to conducting the poll the interface is placed
- *	    in the controller active state.
- */
-int ibrpp(struct gpib_board *board, u8 *result)
-{
-	int retval = 0;
-
-	os_start_timer(board, board->usec_timeout);
-	retval = ibcac(board, 1, 1);
-	if (retval)
-		return -1;
-
-	retval =  board->interface->parallel_poll(board, result);
-
-	os_remove_timer(board);
-	return retval;
-}
-
-int ibppc(struct gpib_board *board, u8 configuration)
-{
-	configuration &= 0x1f;
-	board->interface->parallel_poll_configure(board, configuration);
-	board->parallel_poll_configuration = configuration;
-
-	return 0;
-}
-
-int ibrsv2(struct gpib_board *board, u8 status_byte, int new_reason_for_service)
-{
-	int board_status = ibstatus(board);
-	const unsigned int MSS = status_byte & request_service_bit;
-
-	if ((board_status & CIC))
-		return -EINVAL;
-
-	if (MSS == 0 && new_reason_for_service)
-		return -EINVAL;
-
-	if (board->interface->serial_poll_response2)	{
-		board->interface->serial_poll_response2(board, status_byte, new_reason_for_service);
-		// fall back on simpler serial_poll_response if the behavior would be the same
-	} else if (board->interface->serial_poll_response &&
-		   (MSS == 0 || (MSS && new_reason_for_service))) {
-		board->interface->serial_poll_response(board, status_byte);
-	} else {
-		return -EOPNOTSUPP;
-	}
-
-	return 0;
-}
-
-/*
- * IBSIC
- * Send IFC for at least 100 microseconds.
- *
- * NOTE:
- *	1.  Ibsic must be called prior to the first call to
- *	    ibcmd in order to initialize the bus and enable the
- *	    interface to leave the controller idle state.
- */
-int ibsic(struct gpib_board *board, unsigned int usec_duration)
-{
-	if (board->master == 0)
-		return -EINVAL;
-
-	if (usec_duration < 100)
-		usec_duration = 100;
-	if (usec_duration > 1000)
-		usec_duration = 1000;
-
-	dev_dbg(board->gpib_dev, "sending interface clear, delay = %ius\n", usec_duration);
-	board->interface->interface_clear(board, 1);
-	udelay(usec_duration);
-	board->interface->interface_clear(board, 0);
-
-	return 0;
-}
-
-int ibrsc(struct gpib_board *board, int request_control)
-{
-	int retval;
-
-	if (!board->interface->request_system_control)
-		return -EPERM;
-
-	retval = board->interface->request_system_control(board, request_control);
-
-	if (retval)
-		return retval;
-
-	board->master = request_control != 0;
-
-	return  0;
-}
-
-/*
- * IBSRE
- * Send REN true if v is non-zero or false if v is zero.
- */
-int ibsre(struct gpib_board *board, int enable)
-{
-	if (board->master == 0)
-		return -EINVAL;
-
-	board->interface->remote_enable(board, enable);	/* set or clear REN */
-	if (!enable)
-		usleep_range(100, 150);
-
-	return 0;
-}
-
-/*
- * IBPAD
- * change the GPIB address of the interface board.  The address
- * must be 0 through 30.  ibonl resets the address to PAD.
- */
-int ibpad(struct gpib_board *board, unsigned int addr)
-{
-	if (addr > MAX_GPIB_PRIMARY_ADDRESS)
-		return -EINVAL;
-
-	board->pad = addr;
-	if (board->online)
-		board->interface->primary_address(board, board->pad);
-	dev_dbg(board->gpib_dev, "set primary addr to %i\n", board->pad);
-	return 0;
-}
-
-/*
- * IBSAD
- * change the secondary GPIB address of the interface board.
- * The address must be 0 through 30, or negative disables.  ibonl resets the
- * address to SAD.
- */
-int ibsad(struct gpib_board *board, int addr)
-{
-	if (addr > MAX_GPIB_SECONDARY_ADDRESS)
-		return -EINVAL;
-	board->sad = addr;
-	if (board->online) {
-		if (board->sad >= 0)
-			board->interface->secondary_address(board, board->sad, 1);
-		else
-			board->interface->secondary_address(board, 0, 0);
-	}
-	dev_dbg(board->gpib_dev, "set secondary addr to %i\n", board->sad);
-
-	return 0;
-}
-
-/*
- * IBEOS
- * Set the end-of-string modes for I/O operations to v.
- *
- */
-int ibeos(struct gpib_board *board, int eos, int eosflags)
-{
-	int retval;
-
-	if (eosflags & ~EOS_MASK)
-		return -EINVAL;
-	if (eosflags & REOS) {
-		retval = board->interface->enable_eos(board, eos, eosflags & BIN);
-	} else {
-		board->interface->disable_eos(board);
-		retval = 0;
-	}
-	return retval;
-}
-
-int ibstatus(struct gpib_board *board)
-{
-	return general_ibstatus(board, NULL, 0, 0, NULL);
-}
-
-int general_ibstatus(struct gpib_board *board, const struct gpib_status_queue *device,
-		     int clear_mask, int set_mask, struct gpib_descriptor *desc)
-{
-	int status = 0;
-	short line_status;
-
-	if (board->private_data) {
-		status = board->interface->update_status(board, clear_mask);
-		/*
-		 * XXX should probably stop having drivers use TIMO bit in
-		 * board->status to avoid confusion
-		 */
-		status &= ~TIMO;
-		/* get real SRQI status if we can */
-		if (iblines(board, &line_status) == 0) {
-			if ((line_status & VALID_SRQ)) {
-				if ((line_status & BUS_SRQ))
-					status |= SRQI;
-				else
-					status &= ~SRQI;
-			}
-		}
-	}
-	if (device)
-		if (num_status_bytes(device))
-			status |= RQS;
-
-	if (desc) {
-		if (set_mask & CMPL)
-			atomic_set(&desc->io_in_progress, 0);
-		else if (clear_mask & CMPL)
-			atomic_set(&desc->io_in_progress, 1);
-
-		if (atomic_read(&desc->io_in_progress))
-			status &= ~CMPL;
-		else
-			status |= CMPL;
-	}
-	if (num_gpib_events(&board->event_queue))
-		status |= EVENT;
-	else
-		status &= ~EVENT;
-
-	return status;
-}
-
-struct wait_info {
-	struct gpib_board *board;
-	struct timer_list timer;
-	int timed_out;
-	unsigned long usec_timeout;
-};
-
-static void wait_timeout(struct timer_list *t)
-{
-	struct wait_info *winfo = timer_container_of(winfo, t, timer);
-
-	winfo->timed_out = 1;
-	wake_up_interruptible(&winfo->board->wait);
-}
-
-static void init_wait_info(struct wait_info *winfo)
-{
-	winfo->board = NULL;
-	winfo->timed_out = 0;
-	timer_setup_on_stack(&winfo->timer, wait_timeout, 0);
-}
-
-static int wait_satisfied(struct wait_info *winfo, struct gpib_status_queue *status_queue,
-			  int wait_mask, int *status, struct gpib_descriptor *desc)
-{
-	struct gpib_board *board = winfo->board;
-	int temp_status;
-
-	if (mutex_lock_interruptible(&board->big_gpib_mutex))
-		return -ERESTARTSYS;
-
-	temp_status = general_ibstatus(board, status_queue, 0, 0, desc);
-
-	mutex_unlock(&board->big_gpib_mutex);
-
-	if (winfo->timed_out)
-		temp_status |= TIMO;
-	else
-		temp_status &= ~TIMO;
-	if (wait_mask & temp_status) {
-		*status = temp_status;
-		return 1;
-	}
-// XXX does wait for END work?
-	return 0;
-}
-
-/* install timer interrupt handler */
-static void start_wait_timer(struct wait_info *winfo)
-/* Starts the timeout task  */
-{
-	winfo->timed_out = 0;
-
-	if (winfo->usec_timeout > 0)
-		mod_timer(&winfo->timer, jiffies + usec_to_jiffies(winfo->usec_timeout));
-}
-
-static void remove_wait_timer(struct wait_info *winfo)
-{
-	timer_delete_sync(&winfo->timer);
-	timer_destroy_on_stack(&winfo->timer);
-}
-
-/*
- * IBWAIT
- * Check or wait for a GPIB event to occur.  The mask argument
- * is a bit vector corresponding to the status bit vector.  It
- * has a bit set for each condition which can terminate the wait
- * If the mask is 0 then
- * no condition is waited for.
- */
-int ibwait(struct gpib_board *board, int wait_mask, int clear_mask, int set_mask,
-	   int *status, unsigned long usec_timeout, struct gpib_descriptor *desc)
-{
-	int retval = 0;
-	struct gpib_status_queue *status_queue;
-	struct wait_info winfo;
-
-	if (desc->is_board)
-		status_queue = NULL;
-	else
-		status_queue = get_gpib_status_queue(board, desc->pad, desc->sad);
-
-	if (wait_mask == 0) {
-		*status = general_ibstatus(board, status_queue, clear_mask, set_mask, desc);
-		return 0;
-	}
-
-	mutex_unlock(&board->big_gpib_mutex);
-
-	init_wait_info(&winfo);
-	winfo.board = board;
-	winfo.usec_timeout = usec_timeout;
-	start_wait_timer(&winfo);
-
-	if (wait_event_interruptible(board->wait, wait_satisfied(&winfo, status_queue,
-								 wait_mask, status, desc))) {
-		dev_dbg(board->gpib_dev, "wait interrupted\n");
-		retval = -ERESTARTSYS;
-	}
-	remove_wait_timer(&winfo);
-
-	if (retval)
-		return retval;
-	if (mutex_lock_interruptible(&board->big_gpib_mutex))
-		return -ERESTARTSYS;
-
-	/* make sure we only clear status bits that we are reporting */
-	if (*status & clear_mask || set_mask)
-		general_ibstatus(board, status_queue, *status & clear_mask, set_mask, NULL);
-
-	return 0;
-}
-
-/*
- * IBWRT
- * Write cnt bytes of data from buf to the GPIB.  The write
- * operation terminates only on I/O complete.
- *
- * NOTE:
- *      1.  Prior to beginning the write, the interface is
- *          placed in the controller standby state.
- *      2.  Prior to calling ibwrt, the intended devices as
- *          well as the interface board itself must be
- *          addressed by calling ibcmd.
- */
-int ibwrt(struct gpib_board *board, u8 *buf, size_t cnt, int send_eoi, size_t *bytes_written)
-{
-	int ret = 0;
-	int retval;
-
-	if (cnt == 0)
-		return 0;
-
-	if (board->master) {
-		retval = ibgts(board);
-		if (retval < 0)
-			return retval;
-	}
-	os_start_timer(board, board->usec_timeout);
-	ret = board->interface->write(board, buf, cnt, send_eoi, bytes_written);
-
-	if (io_timed_out(board))
-		ret = -ETIMEDOUT;
-
-	os_remove_timer(board);
-
-	return ret;
-}
-
diff --git a/drivers/staging/gpib/common/ibsys.h b/drivers/staging/gpib/common/ibsys.h
deleted file mode 100644
index e5a148f513a8..000000000000
--- a/drivers/staging/gpib/common/ibsys.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include "gpibP.h"
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/timer.h>
-
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <asm/irq.h>
-#include <asm/dma.h>
-
-/* Upper bounds for GPIB primary and secondary addresses (per the constant names). */
-#define MAX_GPIB_PRIMARY_ADDRESS 30
-#define MAX_GPIB_SECONDARY_ADDRESS 31
-
-/* Board allocation helpers; defined elsewhere in the GPIB core. */
-int gpib_allocate_board(struct gpib_board *board);
-void gpib_deallocate_board(struct gpib_board *board);
-
-/*
- * Status-byte queue and serial-poll helpers.  Only the prototypes are
- * visible here; see the definitions for exact semantics and return codes.
- */
-unsigned int num_status_bytes(const struct gpib_status_queue *dev);
-int push_status_byte(struct gpib_board *board, struct gpib_status_queue *device,
-		     u8 poll_byte);
-int pop_status_byte(struct gpib_board *board, struct gpib_status_queue *device,
-		    u8 *poll_byte);
-struct gpib_status_queue *get_gpib_status_queue(struct gpib_board *board,
-						unsigned int pad, int sad);
-int get_serial_poll_byte(struct gpib_board *board, unsigned int pad, int sad,
-			 unsigned int usec_timeout, u8 *poll_byte);
-int autopoll_all_devices(struct gpib_board *board);
diff --git a/drivers/staging/gpib/eastwood/Makefile b/drivers/staging/gpib/eastwood/Makefile
deleted file mode 100644
index 384825195f77..000000000000
--- a/drivers/staging/gpib/eastwood/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-
-# Build fluke_gpib.o according to the CONFIG_GPIB_FLUKE setting.
-obj-$(CONFIG_GPIB_FLUKE) += fluke_gpib.o
-
diff --git a/drivers/staging/gpib/eastwood/fluke_gpib.c b/drivers/staging/gpib/eastwood/fluke_gpib.c
deleted file mode 100644
index 3ae848e3f738..000000000000
--- a/drivers/staging/gpib/eastwood/fluke_gpib.c
+++ /dev/null
@@ -1,1180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * GPIB Driver for Fluke cda devices.  Basically, it's a driver for a (bugfixed)
- * cb7210 connected to channel 0 of a pl330 dma controller.
- *    Author: Frank Mori Hess <fmh6jj@gmail.com>
- *   copyright: (C) 2006, 2010, 2015 Fluke Corporation
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "fluke_gpib.h"
-
-#include "gpibP.h"
-#include <linux/dma-mapping.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/mod_devicetable.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB Driver for Fluke cda devices");
-
-static int fluke_attach_holdoff_all(struct gpib_board *board,
-				    const struct gpib_board_config *config);
-static int fluke_attach_holdoff_end(struct gpib_board *board,
-				    const struct gpib_board_config *config);
-static void fluke_detach(struct gpib_board *board);
-static int fluke_config_dma(struct gpib_board *board, int output);
-static irqreturn_t fluke_gpib_internal_interrupt(struct gpib_board *board);
-
-static struct platform_device *fluke_gpib_pdev;
-
-/*
- * Read one chip register while holding register_page_lock with interrupts
- * disabled, so the unlocked accessor is never entered concurrently.
- */
-static u8 fluke_locking_read_byte(struct nec7210_priv *nec_priv, unsigned int register_number)
-{
-	unsigned long irq_flags;
-	u8 value;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, irq_flags);
-	value = fluke_read_byte_nolock(nec_priv, register_number);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, irq_flags);
-
-	return value;
-}
-
-/*
- * Write one chip register while holding register_page_lock with interrupts
- * disabled; counterpart of fluke_locking_read_byte().
- */
-static void fluke_locking_write_byte(struct nec7210_priv *nec_priv, u8 byte,
-				     unsigned int register_number)
-{
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, irq_flags);
-	fluke_write_byte_nolock(nec_priv, byte, register_number);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, irq_flags);
-}
-
-// wrappers for interface functions
-
-/* Forward a read to nec7210_read() using the nec7210 state embedded in the board's private data. */
-static int fluke_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-		      size_t *bytes_read)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_read(board, nec, buffer, length, end, bytes_read);
-}
-
-/* Forward a write to nec7210_write() using the board's embedded nec7210 state. */
-static int fluke_write(struct gpib_board *board, u8 *buffer, size_t length,
-		       int send_eoi, size_t *bytes_written)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_write(board, nec, buffer, length, send_eoi, bytes_written);
-}
-
-/* Forward command bytes to nec7210_command() using the board's embedded nec7210 state. */
-static int fluke_command(struct gpib_board *board, u8 *buffer,
-			 size_t length, size_t *bytes_written)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_command(board, nec, buffer, length, bytes_written);
-}
-
-/* Thin wrapper: delegates to nec7210_take_control(). */
-static int fluke_take_control(struct gpib_board *board, int synchronous)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_take_control(board, nec, synchronous);
-}
-
-/* Thin wrapper: delegates to nec7210_go_to_standby(). */
-static int fluke_go_to_standby(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_go_to_standby(board, nec);
-}
-
-/* Thin wrapper: delegates to nec7210_request_system_control(). */
-static int fluke_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct fluke_priv *fluke = board->private_data;
-
-	return nec7210_request_system_control(board, &fluke->nec7210_priv, request_control);
-}
-
-/* Thin wrapper: delegates to nec7210_interface_clear(). */
-static void fluke_interface_clear(struct gpib_board *board, int assert)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_interface_clear(board, nec, assert);
-}
-
-/* Thin wrapper: delegates to nec7210_remote_enable(). */
-static void fluke_remote_enable(struct gpib_board *board, int enable)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_remote_enable(board, nec, enable);
-}
-
-/* Thin wrapper: delegates to nec7210_enable_eos(). */
-static int fluke_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_enable_eos(board, nec, eos_byte, compare_8_bits);
-}
-
-/* Thin wrapper: delegates to nec7210_disable_eos(). */
-static void fluke_disable_eos(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_disable_eos(board, nec);
-}
-
-/* Thin wrapper: delegates to nec7210_update_status() and returns its result. */
-static unsigned int fluke_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_update_status(board, nec, clear_mask);
-}
-
-/* Thin wrapper: delegates to nec7210_primary_address(). */
-static int fluke_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_primary_address(board, nec, address);
-}
-
-/* Thin wrapper: delegates to nec7210_secondary_address(). */
-static int fluke_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_secondary_address(board, nec, address, enable);
-}
-
-/* Thin wrapper: delegates to nec7210_parallel_poll(). */
-static int fluke_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_parallel_poll(board, nec, result);
-}
-
-/* Thin wrapper: delegates to nec7210_parallel_poll_configure(). */
-static void fluke_parallel_poll_configure(struct gpib_board *board, u8 configuration)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_parallel_poll_configure(board, nec, configuration);
-}
-
-/* Thin wrapper: delegates to nec7210_parallel_poll_response(). */
-static void fluke_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_parallel_poll_response(board, nec, ist);
-}
-
-/* Thin wrapper: delegates to nec7210_serial_poll_response(). */
-static void fluke_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	nec7210_serial_poll_response(board, nec, status);
-}
-
-/* Thin wrapper: delegates to nec7210_serial_poll_status() and returns its result. */
-static u8 fluke_serial_poll_status(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	return nec7210_serial_poll_status(board, nec);
-}
-
-/*
- * Issue the return-to-local sequence: AUX_RTL2 is written to AUXMR,
- * followed one microsecond later by AUX_RTL.
- */
-static void fluke_return_to_local(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-
-	write_byte(nec, AUX_RTL2, AUXMR);
-	udelay(1);
-	write_byte(nec, AUX_RTL, AUXMR);
-}
-
-/*
- * Report the state of the GPIB control lines.  The bus status register is
- * read once; every BSR_*_BIT that reads back as zero is reported as the
- * matching BUS_* flag, on top of VALID_ALL.
- */
-static int fluke_line_status(const struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	const int bsr = fluke_paged_read_byte(fluke, BUS_STATUS, BUS_STATUS_PAGE);
-	int lines = VALID_ALL;
-
-	lines |= (bsr & BSR_REN_BIT) ? 0 : BUS_REN;
-	lines |= (bsr & BSR_IFC_BIT) ? 0 : BUS_IFC;
-	lines |= (bsr & BSR_SRQ_BIT) ? 0 : BUS_SRQ;
-	lines |= (bsr & BSR_EOI_BIT) ? 0 : BUS_EOI;
-	lines |= (bsr & BSR_NRFD_BIT) ? 0 : BUS_NRFD;
-	lines |= (bsr & BSR_NDAC_BIT) ? 0 : BUS_NDAC;
-	lines |= (bsr & BSR_DAV_BIT) ? 0 : BUS_DAV;
-	lines |= (bsr & BSR_ATN_BIT) ? 0 : BUS_ATN;
-
-	return lines;
-}
-
-/*
- * Configure the T1 delay through nec7210_t1_delay(), then select the chip's
- * speed mode: requests of 350 ns or less switch to AUX_HI_SPEED and report
- * 350; anything longer selects AUX_LO_SPEED and reports the nec7210 result.
- */
-static int fluke_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	unsigned int delay = nec7210_t1_delay(board, nec, nano_sec);
-
-	if (nano_sec > 350) {
-		write_byte(nec, AUX_LO_SPEED, AUXMR);
-		return delay;
-	}
-
-	write_byte(nec, AUX_HI_SPEED, AUXMR);
-	return 350;
-}
-
-/*
- * Return nonzero when the board status has LACS_NUM set or the nec7210
- * state has READ_READY_BN set.  Both bits are sampled under board->spinlock.
- */
-static int lacs_or_read_ready(struct gpib_board *board)
-{
-	const struct fluke_priv *fluke = board->private_data;
-	const struct nec7210_priv *nec = &fluke->nec7210_priv;
-	unsigned long irq_flags;
-	int ready;
-
-	spin_lock_irqsave(&board->spinlock, irq_flags);
-	ready = test_bit(LACS_NUM, &board->status) ||
-		test_bit(READ_READY_BN, &nec->state);
-	spin_unlock_irqrestore(&board->spinlock, irq_flags);
-
-	return ready;
-}
-
-/*
- * Sleep until a read can do something useful.  Not essential; it only
- * keeps the RFD holdoff from being released pointlessly.
- *
- * Returns 0 on success, -ERESTARTSYS if the sleep was interrupted,
- * -ETIMEDOUT if TIMO_NUM is set, and -EINTR if a device clear was pending
- * (the DEV_CLEAR_BN flag is consumed).  Later checks override earlier ones.
- */
-static int wait_for_read(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	int rc;
-
-	rc = wait_event_interruptible(board->wait,
-				      lacs_or_read_ready(board) ||
-				      test_bit(DEV_CLEAR_BN, &nec->state) ||
-				      test_bit(TIMO_NUM, &board->status));
-	rc = rc ? -ERESTARTSYS : 0;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		rc = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec->state))
-		rc = -EINTR;
-
-	return rc;
-}
-
-/*
- * Check whether the SH state machine is in SGNS.  The state is sampled
- * twice because there is a very small chance of catching it passing through
- * SGNS on its way from SIDS to SDYS when a byte is already available to the
- * handshake state machine.  The case of interest is the handshake being
- * stuck in SGNS because the chip has no byte available, which means a dma
- * transfer will get at least one byte into the chip.  That matters because
- * we want to be confident, before sending a "send eoi" auxiliary command,
- * that the associated data byte can also be put in the chip before any
- * potential timeout.
- *
- * Returns 1 only if both samples read SGNS, otherwise 0.
- */
-static int source_handshake_is_sgns(struct fluke_priv *e_priv)
-{
-	int samples_left = 2;
-
-	while (samples_left--) {
-		unsigned int state = fluke_paged_read_byte(e_priv, STATE1_REG, STATE1_PAGE) &
-			SOURCE_HANDSHAKE_MASK;
-
-		if (state != SOURCE_HANDSHAKE_SGNS_BITS)
-			return 0;
-	}
-
-	return 1;
-}
-
-/*
- * Sample the source handshake state once and return nonzero when it is
- * either SGNS or SIDS.
- */
-static int source_handshake_is_sids_or_sgns(struct fluke_priv *e_priv)
-{
-	unsigned int state = fluke_paged_read_byte(e_priv, STATE1_REG, STATE1_PAGE) &
-		SOURCE_HANDSHAKE_MASK;
-
-	if (state == SOURCE_HANDSHAKE_SGNS_BITS)
-		return 1;
-
-	return state == SOURCE_HANDSHAKE_SIDS_BITS;
-}
-
-/*
- * Sleep until the gpib chip is ready to accept a data out byte, i.e. the
- * board is in TACS and the source handshake is in SGNS (where it is
- * probably waiting for a byte to be written to it).
- *
- * Returns 0 on success, -ERESTARTSYS if interrupted, -ETIMEDOUT if
- * TIMO_NUM is set, and -EINTR if a device clear was pending (the flag is
- * consumed).  Later checks override earlier ones.
- */
-static int wait_for_data_out_ready(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	int rc;
-
-	rc = wait_event_interruptible(board->wait,
-				      (test_bit(TACS_NUM, &board->status) &&
-				       source_handshake_is_sgns(fluke)) ||
-				      test_bit(DEV_CLEAR_BN, &nec->state) ||
-				      test_bit(TIMO_NUM, &board->status));
-	rc = rc ? -ERESTARTSYS : 0;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		rc = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec->state))
-		rc = -EINTR;
-
-	return rc;
-}
-
-/*
- * Sleep until the source handshake reaches SIDS or SGNS, a device clear
- * arrives, or the board times out.
- *
- * Returns 0 on success, -ERESTARTSYS if interrupted, -ETIMEDOUT if
- * TIMO_NUM is set, and -EINTR if a device clear was pending (the flag is
- * consumed).  Later checks override earlier ones.
- */
-static int wait_for_sids_or_sgns(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	int rc;
-
-	rc = wait_event_interruptible(board->wait,
-				      source_handshake_is_sids_or_sgns(fluke) ||
-				      test_bit(DEV_CLEAR_BN, &nec->state) ||
-				      test_bit(TIMO_NUM, &board->status));
-	rc = rc ? -ERESTARTSYS : 0;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		rc = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec->state))
-		rc = -EINTR;
-
-	return rc;
-}
-
-/*
- * DMA completion callback; arg is the gpib_board.  Under board->spinlock it
- * re-enables the HR_DOIE and HR_DIIE bits in IMR1, wakes any waiter on
- * board->wait, runs the interrupt handler body once, and then clears both
- * DMA-in-progress flags.
- */
-static void fluke_dma_callback(void *arg)
-{
-	struct gpib_board *board = arg;
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&board->spinlock, irq_flags);
-
-	nec7210_set_reg_bits(nec, IMR1, HR_DOIE | HR_DIIE, HR_DOIE | HR_DIIE);
-	wake_up_interruptible(&board->wait);
-
-	fluke_gpib_internal_interrupt(board);
-	clear_bit(DMA_WRITE_IN_PROGRESS_BN, &nec->state);
-	clear_bit(DMA_READ_IN_PROGRESS_BN, &nec->state);
-
-	spin_unlock_irqrestore(&board->spinlock, irq_flags);
-}
-
-/*
- * Send up to dma_buffer_size bytes to the chip with a single dma transfer.
- *
- * The data is copied into the driver's bounce buffer, mapped for the
- * device, and pushed by the dma engine while this thread sleeps until the
- * write transfer counter reaches @length or an error/timeout flag is set.
- *
- * @board:         board to write to
- * @buffer:        source data
- * @length:        number of bytes; must not exceed e_priv->dma_buffer_size
- * @bytes_written: set to the count read back from the write transfer counter
- *
- * Returns 0 on success or a negative errno: -EFAULT for an oversized request
- * or an implausible byte count, -ENOMEM if the buffer cannot be mapped or no
- * descriptor is available, -ERESTARTSYS / -ETIMEDOUT / -EINTR / -EIO for the
- * corresponding wait outcomes.  The dma mapping is released on every path
- * after it has been created.
- */
-static int fluke_dma_write(struct gpib_board *board, u8 *buffer, size_t length,
-			   size_t *bytes_written)
-{
-	struct fluke_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned long flags;
-	int retval = 0;
-	dma_addr_t address;
-	struct dma_async_tx_descriptor *tx_desc;
-
-	*bytes_written = 0;
-
-	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
-		return -EFAULT;
-	dmaengine_terminate_all(e_priv->dma_channel);
-	// write-clear counter
-	writel(0x0, e_priv->write_transfer_counter);
-
-	memcpy(e_priv->dma_buffer, buffer, length);
-	address = dma_map_single(board->dev, e_priv->dma_buffer,
-				 length, DMA_TO_DEVICE);
-	/* a failed mapping must not be handed to the dma engine or unmapped */
-	if (dma_mapping_error(board->dev, address)) {
-		dev_err(board->gpib_dev, "failed to map dma buffer\n");
-		return -ENOMEM;
-	}
-	/* program dma controller */
-	retval = fluke_config_dma(board, 1);
-	if (retval)
-		goto cleanup;
-
-	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, address, length, DMA_MEM_TO_DEV,
-					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	if (!tx_desc) {
-		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
-		retval = -ENOMEM;
-		goto cleanup;
-	}
-	tx_desc->callback = fluke_dma_callback;
-	tx_desc->callback_param = board;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
-	dmaengine_submit(tx_desc);
-	dma_async_issue_pending(e_priv->dma_channel);
-
-	clear_bit(WRITE_READY_BN, &nec_priv->state);
-	set_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	// suspend until message is sent
-	if (wait_event_interruptible(board->wait,
-				     ((readl(e_priv->write_transfer_counter) &
-				       write_transfer_counter_mask) == length) ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status))) {
-		retval = -ERESTARTSYS;
-	}
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
-		retval = -EIO;
-	// disable board's dma
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
-
-	dmaengine_terminate_all(e_priv->dma_channel);
-	// make sure fluke_dma_callback got called
-	if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state))
-		fluke_dma_callback(board);
-
-	/*
-	 * if everything went fine, try to wait until last byte is actually
-	 * transmitted across gpib (but don't try _too_ hard)
-	 */
-	if (retval == 0)
-		retval = wait_for_sids_or_sgns(board);
-
-	*bytes_written = readl(e_priv->write_transfer_counter) & write_transfer_counter_mask;
-	/* fall through to cleanup so the mapping is released even on this error */
-	if (WARN_ON_ONCE(*bytes_written > length))
-		retval = -EFAULT;
-
-cleanup:
-	dma_unmap_single(board->dev, address, length, DMA_TO_DEVICE);
-	return retval;
-}
-
-/*
- * DMA-accelerated write.  The payload is sent in chunks of at most
- * dma_buffer_size bytes via fluke_dma_write().  When @send_eoi is set, the
- * final byte is held back and sent on its own after AUX_SEOI has been
- * issued.
- *
- * @bytes_written is zeroed before anything else, so it is valid on every
- * return path, and accumulates the byte counts reported by each transfer.
- *
- * Returns 0 on success, -ENXIO when no dma channel is available, -EFAULT on
- * an internal accounting mismatch, or the negative errno of the failing
- * wait/transfer.
- */
-static int fluke_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
-			     int send_eoi, size_t *bytes_written)
-{
-	struct fluke_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	size_t remainder = length;
-	size_t transfer_size;
-	ssize_t retval = 0;
-	size_t dma_remainder = remainder;
-
-	/* report zero bytes on every early exit, including the no-channel error */
-	*bytes_written = 0;
-
-	if (!e_priv->dma_channel) {
-		dev_err(board->gpib_dev, "No dma channel available, cannot do accel write.\n");
-		return -ENXIO;
-	}
-
-	if (length < 1)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
-
-	/* the byte that carries EOI is sent separately below */
-	if (send_eoi)
-		--dma_remainder;
-
-	while (dma_remainder > 0) {
-		size_t num_bytes;
-
-		retval = wait_for_data_out_ready(board);
-		if (retval < 0)
-			break;
-
-		transfer_size = (e_priv->dma_buffer_size < dma_remainder) ?
-			e_priv->dma_buffer_size : dma_remainder;
-		retval = fluke_dma_write(board, buffer, transfer_size, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			break;
-		dma_remainder -= num_bytes;
-		remainder -= num_bytes;
-		buffer += num_bytes;
-		if (need_resched())
-			schedule();
-	}
-	if (retval < 0)
-		return retval;
-	// handle sending of last byte with eoi
-	if (send_eoi) {
-		size_t num_bytes;
-
-		if (WARN_ON_ONCE(remainder != 1))
-			return -EFAULT;
-
-		/*
-		 * wait until we are sure we will be able to write the data byte
-		 * into the chip before we send AUX_SEOI.  This prevents a timeout
-		 * scenario where we send AUX_SEOI but then timeout without getting
-		 * any bytes into the gpib chip.  This will result in the first byte
-		 * of the next write having a spurious EOI set on the first byte.
-		 */
-		retval = wait_for_data_out_ready(board);
-		if (retval < 0)
-			return retval;
-
-		write_byte(nec_priv, AUX_SEOI, AUXMR);
-		retval = fluke_dma_write(board, buffer, remainder, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-/*
- * Pause the channel and return the residue reported for @cookie, or the
- * negative error from dmaengine_pause().
- *
- * The hardware doesn't support resume, so don't call this unless the dma
- * transfer is done.
- */
-static int fluke_get_dma_residue(struct dma_chan *chan, dma_cookie_t cookie)
-{
-	struct dma_tx_state tx_state;
-	int err = dmaengine_pause(chan);
-
-	if (err < 0) {
-		pr_err("dma pause failed?\n");
-		return err;
-	}
-
-	dmaengine_tx_status(chan, cookie, &tx_state);
-	return tx_state.residue;
-}
-
-/*
- * Receive up to dma_buffer_size bytes from the chip with a single dma
- * transfer into the driver's bounce buffer, then copy what arrived into
- * @buffer.
- *
- * @board:      board to read from
- * @buffer:     destination for the received bytes
- * @length:     maximum number of bytes; must not exceed dma_buffer_size
- * @end:        set to 1 when an END was received with the last byte
- *              transferred and no byte is still waiting in the chip
- * @bytes_read: set to the number of bytes copied into @buffer
- *
- * Returns 0 on success or a negative errno: -EFAULT for an oversized request
- * or an implausible residue, -ENOMEM if the buffer cannot be mapped, -EIO
- * if no descriptor is available, -ERESTARTSYS / -ETIMEDOUT / -EINTR for
- * the corresponding wait outcomes.  On every path after the mapping has
- * been created it is released against board->dev, and once the transfer
- * has been submitted the channel is terminated before returning.
- */
-static int fluke_dma_read(struct gpib_board *board, u8 *buffer,
-			  size_t length, int *end, size_t *bytes_read)
-{
-	struct fluke_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-	unsigned long flags;
-	int residue;
-	dma_addr_t bus_address;
-	struct dma_async_tx_descriptor *tx_desc;
-	dma_cookie_t dma_cookie;
-	int i;
-	static const int timeout = 10;
-
-	*bytes_read = 0;
-	*end = 0;
-	if (length == 0)
-		return 0;
-	/* same bound the write path enforces on the shared bounce buffer */
-	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
-		return -EFAULT;
-
-	bus_address = dma_map_single(board->dev, e_priv->dma_buffer,
-				     length, DMA_FROM_DEVICE);
-	/* a failed mapping must not be handed to the dma engine or unmapped */
-	if (dma_mapping_error(board->dev, bus_address)) {
-		dev_err(board->gpib_dev, "failed to map dma buffer\n");
-		return -ENOMEM;
-	}
-
-	/* program dma controller */
-	retval = fluke_config_dma(board, 0);
-	if (retval) {
-		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-		return retval;
-	}
-	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel,
-					      bus_address, length, DMA_DEV_TO_MEM,
-					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	if (!tx_desc) {
-		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
-		/* unmap against the device the buffer was mapped for */
-		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-		return -EIO;
-	}
-	tx_desc->callback = fluke_dma_callback;
-	tx_desc->callback_param = board;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	// enable nec7210 dma
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
-
-	dma_cookie = dmaengine_submit(tx_desc);
-	dma_async_issue_pending(e_priv->dma_channel);
-
-	set_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
-	clear_bit(READ_READY_BN, &nec_priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	// wait for data to transfer
-	if (wait_event_interruptible(board->wait,
-				     test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state) == 0 ||
-				     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status))) {
-		retval = -ERESTARTSYS;
-	}
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-
-	/*
-	 * If we woke up because of end, wait until the dma transfer has pulled
-	 * the data byte associated with the end before we cancel the dma transfer.
-	 */
-	if (test_bit(RECEIVED_END_BN, &nec_priv->state)) {
-		for (i = 0; i < timeout; ++i) {
-			if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state) == 0)
-				break;
-			if ((read_byte(nec_priv, ADR0) & DATA_IN_STATUS) == 0)
-				break;
-			usleep_range(10, 15);
-		}
-		if (i == timeout)
-			pr_warn("fluke_gpib: timeout waiting for dma to transfer end data byte.\n");
-	}
-
-	// stop the dma transfer
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-	/*
-	 * delay a little just to make sure any bytes in dma controller's fifo get
-	 * written to memory before we disable it
-	 */
-	usleep_range(10, 15);
-	residue = fluke_get_dma_residue(e_priv->dma_channel, dma_cookie);
-
-	/*
-	 * Tear the transfer down before validating the residue, so that an
-	 * implausible value cannot leave the channel running or the buffer
-	 * mapped.
-	 */
-	dmaengine_terminate_all(e_priv->dma_channel);
-	// make sure fluke_dma_callback got called
-	if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state))
-		fluke_dma_callback(board);
-
-	dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-
-	if (WARN_ON_ONCE(residue < 0 || (size_t)residue > length))
-		return -EFAULT;
-	*bytes_read += length - residue;
-	memcpy(buffer, e_priv->dma_buffer, *bytes_read);
-
-	/*
-	 * If we got an end interrupt, figure out if it was
-	 * associated with the last byte we dma'd or with a
-	 * byte still sitting on the cb7210.
-	 */
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (test_bit(READ_READY_BN, &nec_priv->state) == 0) {
-		/*
-		 * There is no byte sitting on the cb7210.  If we
-		 * saw an end interrupt, we need to deal with it now
-		 */
-		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
-			*end = 1;
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-
-/*
- * DMA-accelerated read.  After waiting for the chip to have something to
- * deliver and releasing the RFD holdoff, data is pulled in chunks of at most
- * dma_buffer_size bytes via fluke_dma_read() until @length bytes have
- * arrived, END is seen, or a transfer fails.
- *
- * @end and @bytes_read are zeroed first; @bytes_read accumulates the count
- * reported by every chunk, including a failing one.  Returns the status of
- * the last transfer (0 or a negative errno).
- */
-static int fluke_accel_read(struct gpib_board *board, u8 *buffer, size_t length,
-			    int *end, size_t *bytes_read)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	size_t left = length;
-	int rc;
-
-	*end = 0;
-	*bytes_read = 0;
-
-	smp_mb__before_atomic();
-	clear_bit(DEV_CLEAR_BN, &nec->state); // XXX FIXME
-	smp_mb__after_atomic();
-
-	rc = wait_for_read(board);
-	if (rc < 0)
-		return rc;
-
-	nec7210_release_rfd_holdoff(board, nec);
-
-	while (left > 0) {
-		size_t chunk = left;
-		size_t got;
-
-		if (chunk > fluke->dma_buffer_size)
-			chunk = fluke->dma_buffer_size;
-
-		rc = fluke_dma_read(board, buffer, chunk, end, &got);
-		left -= got;
-		buffer += got;
-		*bytes_read += got;
-
-		/* END takes precedence over an error from the same chunk */
-		if (*end)
-			break;
-		if (rc < 0)
-			return rc;
-		if (need_resched())
-			schedule();
-	}
-
-	return rc;
-}
-
-/* Interface variant that uses the plain nec7210 paths for both reads and writes. */
-static struct gpib_interface fluke_unaccel_interface = {
-	.name = "fluke_unaccel",
-	/* board lifecycle */
-	.attach = fluke_attach_holdoff_all,
-	.detach = fluke_detach,
-	/* data and command transfer */
-	.read = fluke_read,
-	.write = fluke_write,
-	.command = fluke_command,
-	/* bus control */
-	.take_control = fluke_take_control,
-	.go_to_standby = fluke_go_to_standby,
-	.request_system_control = fluke_request_system_control,
-	.interface_clear = fluke_interface_clear,
-	.remote_enable = fluke_remote_enable,
-	.return_to_local = fluke_return_to_local,
-	/* end-of-string handling */
-	.enable_eos = fluke_enable_eos,
-	.disable_eos = fluke_disable_eos,
-	/* polling */
-	.parallel_poll = fluke_parallel_poll,
-	.parallel_poll_configure = fluke_parallel_poll_configure,
-	.parallel_poll_response = fluke_parallel_poll_response,
-	.serial_poll_response = fluke_serial_poll_response,
-	.serial_poll_status = fluke_serial_poll_status,
-	/* status, addressing and timing */
-	.line_status = fluke_line_status,
-	.update_status = fluke_update_status,
-	.primary_address = fluke_primary_address,
-	.secondary_address = fluke_secondary_address,
-	.t1_delay = fluke_t1_delay,
-};
-
-/*
- * fluke_hybrid uses dma for writes but not for reads.  It was added to
- * deal with occasional corruption of bytes seen when doing dma reads.
- * From looking at the cb7210 vhdl, I believe the corruption is due to a
- * hardware bug triggered by the cpu reading a cb7210 register just as
- * the dma controller is also doing a read.
- */
-
-static struct gpib_interface fluke_hybrid_interface = {
-	.name = "fluke_hybrid",
-	/* board lifecycle */
-	.attach = fluke_attach_holdoff_all,
-	.detach = fluke_detach,
-	/* data and command transfer */
-	.read = fluke_read,
-	.write = fluke_accel_write,
-	.command = fluke_command,
-	/* bus control */
-	.take_control = fluke_take_control,
-	.go_to_standby = fluke_go_to_standby,
-	.request_system_control = fluke_request_system_control,
-	.interface_clear = fluke_interface_clear,
-	.remote_enable = fluke_remote_enable,
-	.return_to_local = fluke_return_to_local,
-	/* end-of-string handling */
-	.enable_eos = fluke_enable_eos,
-	.disable_eos = fluke_disable_eos,
-	/* polling */
-	.parallel_poll = fluke_parallel_poll,
-	.parallel_poll_configure = fluke_parallel_poll_configure,
-	.parallel_poll_response = fluke_parallel_poll_response,
-	.serial_poll_response = fluke_serial_poll_response,
-	.serial_poll_status = fluke_serial_poll_status,
-	/* status, addressing and timing */
-	.line_status = fluke_line_status,
-	.update_status = fluke_update_status,
-	.primary_address = fluke_primary_address,
-	.secondary_address = fluke_secondary_address,
-	.t1_delay = fluke_t1_delay,
-};
-
-/* Interface variant that uses the dma-accelerated paths for both reads and writes. */
-static struct gpib_interface fluke_interface = {
-	.name = "fluke",
-	/* board lifecycle */
-	.attach = fluke_attach_holdoff_end,
-	.detach = fluke_detach,
-	/* data and command transfer */
-	.read = fluke_accel_read,
-	.write = fluke_accel_write,
-	.command = fluke_command,
-	/* bus control */
-	.take_control = fluke_take_control,
-	.go_to_standby = fluke_go_to_standby,
-	.request_system_control = fluke_request_system_control,
-	.interface_clear = fluke_interface_clear,
-	.remote_enable = fluke_remote_enable,
-	.return_to_local = fluke_return_to_local,
-	/* end-of-string handling */
-	.enable_eos = fluke_enable_eos,
-	.disable_eos = fluke_disable_eos,
-	/* polling */
-	.parallel_poll = fluke_parallel_poll,
-	.parallel_poll_configure = fluke_parallel_poll_configure,
-	.parallel_poll_response = fluke_parallel_poll_response,
-	.serial_poll_response = fluke_serial_poll_response,
-	.serial_poll_status = fluke_serial_poll_status,
-	/* status, addressing and timing */
-	.line_status = fluke_line_status,
-	.update_status = fluke_update_status,
-	.primary_address = fluke_primary_address,
-	.secondary_address = fluke_secondary_address,
-	.t1_delay = fluke_t1_delay,
-};
-
-/*
- * Interrupt handler body.  Both callers in this file invoke it with
- * board->spinlock held.
- *
- * Latches READ_READY when the chip reports data in, reads the three
- * interrupt status registers, raises EVENT_IFC when FLUKE_IFCI_BIT is set,
- * hands ISR1/ISR2 to the nec7210 core, and finally re-evaluates READ_READY
- * against the RFD holdoff state.  Waiters on board->wait are woken when
- * anything was handled.  Returns IRQ_HANDLED or IRQ_NONE.
- */
-irqreturn_t fluke_gpib_internal_interrupt(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-	struct nec7210_priv *nec = &fluke->nec7210_priv;
-	int isr0, isr1, isr2;
-	int handled = IRQ_NONE;
-
-	if (read_byte(nec, ADR0) & DATA_IN_STATUS)
-		set_bit(READ_READY_BN, &nec->state);
-
-	isr0 = fluke_paged_read_byte(fluke, ISR0_IMR0, ISR0_IMR0_PAGE);
-	isr1 = read_byte(nec, ISR1);
-	isr2 = read_byte(nec, ISR2);
-
-	if (isr0 & FLUKE_IFCI_BIT) {
-		push_gpib_event(board, EVENT_IFC);
-		handled = IRQ_HANDLED;
-	}
-
-	if (nec7210_interrupt_have_status(board, nec, isr1, isr2) == IRQ_HANDLED)
-		handled = IRQ_HANDLED;
-
-	/* with data pending, READ_READY follows the RFD holdoff state */
-	if (read_byte(nec, ADR0) & DATA_IN_STATUS) {
-		if (test_bit(RFD_HOLDOFF_BN, &nec->state))
-			set_bit(READ_READY_BN, &nec->state);
-		else
-			clear_bit(READ_READY_BN, &nec->state);
-	}
-
-	if (handled == IRQ_HANDLED)
-		wake_up_interruptible(&board->wait);
-
-	return handled;
-}
-
-/*
- * Interrupt entry point; arg is the gpib_board.  Runs the handler body
- * with board->spinlock held and returns its result.
- */
-static irqreturn_t fluke_gpib_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	unsigned long irq_flags;
-	irqreturn_t result;
-
-	spin_lock_irqsave(&board->spinlock, irq_flags);
-	result = fluke_gpib_internal_interrupt(board);
-	spin_unlock_irqrestore(&board->spinlock, irq_flags);
-
-	return result;
-}
-
-/*
- * Allocate and initialise the driver's private data, including the dma
- * bounce buffer (0x7ff bytes).
- *
- * On success board->private_data points at the new, zero-initialised
- * structure and 0 is returned.  On failure nothing stays allocated,
- * board->private_data is NULL, and -ENOMEM is returned.
- */
-static int fluke_allocate_private(struct gpib_board *board)
-{
-	struct fluke_priv *priv;
-
-	board->private_data = NULL;
-
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
-
-	init_nec7210_private(&priv->nec7210_priv);
-	priv->dma_buffer_size = 0x7ff;
-	priv->dma_buffer = kmalloc(priv->dma_buffer_size, GFP_KERNEL);
-	if (!priv->dma_buffer) {
-		/* don't leave a half-built private struct attached to the board */
-		kfree(priv);
-		return -ENOMEM;
-	}
-
-	board->private_data = priv;
-	return 0;
-}
-
-/*
- * Free the dma bounce buffer and the private data, if any, and clear
- * board->private_data.  Safe to call when nothing was allocated.
- */
-static void fluke_generic_detach(struct gpib_board *board)
-{
-	struct fluke_priv *fluke = board->private_data;
-
-	if (!fluke)
-		return;
-
-	kfree(fluke->dma_buffer);
-	kfree(fluke);
-	board->private_data = NULL;
-}
-
-/*
- * Generic part of the attach functions: clears the board status, allocates
- * the private data, and points the nec7210 layer at the locking register
- * accessors, the register offset, and the CB7210 chip type.
- *
- * Returns 0 on success or the negative error from fluke_allocate_private().
- */
-static int fluke_generic_attach(struct gpib_board *board)
-{
-	struct fluke_priv *fluke;
-	struct nec7210_priv *nec;
-	int err;
-
-	board->status = 0;
-
-	err = fluke_allocate_private(board);
-	if (err < 0)
-		return err;
-
-	fluke = board->private_data;
-	nec = &fluke->nec7210_priv;
-
-	nec->read_byte = fluke_locking_read_byte;
-	nec->write_byte = fluke_locking_write_byte;
-	nec->offset = fluke_reg_offset;
-	nec->type = CB7210;
-
-	return 0;
-}
-
-/*
- * Configure the dma channel for a transfer to (@output != 0) or from
- * (@output == 0) the gpib dma port: single-byte bus width, burst length 1,
- * with the device acting as flow controller.
- *
- * The configuration starts out fully zeroed so that members this driver
- * does not set are never passed to the dma engine as stack garbage.
- *
- * Returns the result of dmaengine_slave_config().
- */
-static int fluke_config_dma(struct gpib_board *board, int output)
-{
-	struct fluke_priv *e_priv = board->private_data;
-	struct dma_slave_config config = {
-		.src_maxburst = 1,
-		.dst_maxburst = 1,
-		.src_addr_width = 1,
-		.dst_addr_width = 1,
-		.device_fc = true,
-	};
-
-	if (output) {
-		config.direction = DMA_MEM_TO_DEV;
-		config.dst_addr = e_priv->dma_port_res->start;
-	} else {
-		config.direction = DMA_DEV_TO_MEM;
-		config.src_addr = e_priv->dma_port_res->start;
-	}
-	return dmaengine_slave_config(e_priv->dma_channel, &config);
-}
-
-/*
- * Bring the chip online: reset it, select low speed, program the clock
- * register, set the requested handshake mode, put the board online, hook
- * up the pseudo irq, and enable the IFC interrupt.
- *
- * Returns 0 on success or -EINVAL if the pseudo irq cannot be obtained.
- */
-static int fluke_init(struct fluke_priv *e_priv, struct gpib_board *board, int handshake_mode)
-{
-	struct nec7210_priv *nec = &e_priv->nec7210_priv;
-
-	nec7210_board_reset(nec, board);
-	write_byte(nec, AUX_LO_SPEED, AUXMR);
-
-	/*
-	 * Clock register for the driving frequency: ICR takes the clock in
-	 * megahertz (1-15), or zero for clocks faster than 15 MHz (max 20MHz).
-	 */
-	write_byte(nec, ICR | 10, AUXMR);
-
-	nec7210_set_handshake_mode(board, nec, handshake_mode);
-	nec7210_board_online(nec, board);
-
-	/* poll so we can detect ATN changes */
-	if (gpib_request_pseudo_irq(board, fluke_gpib_interrupt)) {
-		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
-		return -EINVAL;
-	}
-
-	fluke_paged_write_byte(e_priv, FLUKE_IFCIE_BIT, ISR0_IMR0, ISR0_IMR0_PAGE);
-
-	return 0;
-}
-
-/*
- * Filter callback for dma_request_channel(): picks the pl330 dma channel
- * that has been hardwired to the gpib controller, which is channel 0.
- * @filter_param is unused.
- */
-static bool gpib_dma_channel_filter(struct dma_chan *chan, void *filter_param)
-{
-	const bool wired_to_gpib = (chan->chan_id == 0);
-
-	return wired_to_gpib;
-}
-
-static int fluke_attach_impl(struct gpib_board *board, const struct gpib_board_config *config,
-			     unsigned int handshake_mode)
-{
-	struct fluke_priv *e_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	int retval;
-	int irq;
-	struct resource *res;
-	dma_cap_mask_t dma_cap;
-
-	if (!fluke_gpib_pdev) {
-		dev_err(board->gpib_dev, "No fluke device was found, attach failed.\n");
-		return -ENODEV;
-	}
-
-	retval = fluke_generic_attach(board);
-	if (retval)
-		return retval;
-
-	e_priv = board->private_data;
-	nec_priv = &e_priv->nec7210_priv;
-	nec_priv->offset = fluke_reg_offset;
-	board->dev = &fluke_gpib_pdev->dev;
-
-	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource\n");
-		return -ENODEV;
-	}
-
-	if (request_mem_region(res->start,
-			       resource_size(res),
-			       fluke_gpib_pdev->name) == NULL) {
-		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
-		return -ENXIO;
-	}
-	e_priv->gpib_iomem_res = res;
-
-	nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start,
-				     resource_size(e_priv->gpib_iomem_res));
-	if (!nec_priv->mmiobase) {
-		dev_err(&fluke_gpib_pdev->dev, "Could not map I/O memory\n");
-		return -ENOMEM;
-	}
-
-	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 1);
-	if (!res) {
-		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource for gpib dma port\n");
-		return -ENODEV;
-	}
-	if (request_mem_region(res->start,
-			       resource_size(res),
-			       fluke_gpib_pdev->name) == NULL) {
-		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
-		return -ENXIO;
-	}
-	e_priv->dma_port_res = res;
-
-	res = platform_get_resource(fluke_gpib_pdev, IORESOURCE_MEM, 2);
-	if (!res) {
-		dev_err(&fluke_gpib_pdev->dev, "Unable to locate mmio resource for write transfer counter\n");
-		return -ENODEV;
-	}
-
-	if (request_mem_region(res->start,
-			       resource_size(res),
-			       fluke_gpib_pdev->name) == NULL) {
-		dev_err(&fluke_gpib_pdev->dev, "cannot claim registers\n");
-		return -ENXIO;
-	}
-	e_priv->write_transfer_counter_res = res;
-
-	e_priv->write_transfer_counter = ioremap(e_priv->write_transfer_counter_res->start,
-						 resource_size(e_priv->write_transfer_counter_res));
-	if (!e_priv->write_transfer_counter) {
-		dev_err(&fluke_gpib_pdev->dev, "Could not map I/O memory\n");
-		return -ENOMEM;
-	}
-
-	irq = platform_get_irq(fluke_gpib_pdev, 0);
-	if (irq < 0)
-		return -EBUSY;
-	retval = request_irq(irq, fluke_gpib_interrupt, isr_flags, fluke_gpib_pdev->name, board);
-	if (retval) {
-		dev_err(&fluke_gpib_pdev->dev,
-			"cannot register interrupt handler err=%d\n",
-			retval);
-		return retval;
-	}
-	e_priv->irq = irq;
-
-	dma_cap_zero(dma_cap);
-	dma_cap_set(DMA_SLAVE, dma_cap);
-	e_priv->dma_channel = dma_request_channel(dma_cap, gpib_dma_channel_filter, NULL);
-	if (!e_priv->dma_channel) {
-		dev_err(board->gpib_dev, "failed to allocate a dma channel.\n");
-		/*
-		 * we don't error out here because unaccel interface will still
-		 * work without dma
-		 */
-	}
-
-	return fluke_init(e_priv, board, handshake_mode);
-}
-
-int fluke_attach_holdoff_all(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return fluke_attach_impl(board, config, HR_HLDA);
-}
-
-int fluke_attach_holdoff_end(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return fluke_attach_impl(board, config, HR_HLDE);
-}
-
-void fluke_detach(struct gpib_board *board)
-{
-	struct fluke_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (e_priv) {
-		if (e_priv->dma_channel)
-			dma_release_channel(e_priv->dma_channel);
-		gpib_free_pseudo_irq(board);
-		nec_priv = &e_priv->nec7210_priv;
-
-		if (nec_priv->mmiobase) {
-			fluke_paged_write_byte(e_priv, 0, ISR0_IMR0, ISR0_IMR0_PAGE);
-			nec7210_board_reset(nec_priv, board);
-		}
-		if (e_priv->irq)
-			free_irq(e_priv->irq, board);
-		if (e_priv->write_transfer_counter_res) {
-			release_mem_region(e_priv->write_transfer_counter_res->start,
-					   resource_size(e_priv->write_transfer_counter_res));
-		}
-		if (e_priv->dma_port_res) {
-			release_mem_region(e_priv->dma_port_res->start,
-					   resource_size(e_priv->dma_port_res));
-		}
-		if (e_priv->gpib_iomem_res)
-			release_mem_region(e_priv->gpib_iomem_res->start,
-					   resource_size(e_priv->gpib_iomem_res));
-	}
-	fluke_generic_detach(board);
-}
-
-static int fluke_gpib_probe(struct platform_device *pdev)
-{
-	fluke_gpib_pdev = pdev;
-	return 0;
-}
-
-static const struct of_device_id fluke_gpib_of_match[] = {
-	{ .compatible = "flk,fgpib-4.0"},
-	{ {0} }
-};
-MODULE_DEVICE_TABLE(of, fluke_gpib_of_match);
-
-static struct platform_driver fluke_gpib_platform_driver = {
-	.driver = {
-		.name = DRV_NAME,
-		.of_match_table = fluke_gpib_of_match,
-	},
-	.probe = &fluke_gpib_probe
-};
-
-static int __init fluke_init_module(void)
-{
-	int result;
-
-	result = platform_driver_register(&fluke_gpib_platform_driver);
-	if (result) {
-		pr_err("platform_driver_register failed: error = %d\n", result);
-		return result;
-	}
-
-	result = gpib_register_driver(&fluke_unaccel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_unaccel;
-	}
-
-	result = gpib_register_driver(&fluke_hybrid_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_hybrid;
-	}
-
-	result = gpib_register_driver(&fluke_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_interface;
-	}
-
-	return 0;
-
-err_interface:
-	gpib_unregister_driver(&fluke_hybrid_interface);
-err_hybrid:
-	gpib_unregister_driver(&fluke_unaccel_interface);
-err_unaccel:
-	platform_driver_unregister(&fluke_gpib_platform_driver);
-
-	return result;
-}
-
-static void __exit fluke_exit_module(void)
-{
-	gpib_unregister_driver(&fluke_unaccel_interface);
-	gpib_unregister_driver(&fluke_hybrid_interface);
-	gpib_unregister_driver(&fluke_interface);
-	platform_driver_unregister(&fluke_gpib_platform_driver);
-}
-
-module_init(fluke_init_module);
-module_exit(fluke_exit_module);
diff --git a/drivers/staging/gpib/eastwood/fluke_gpib.h b/drivers/staging/gpib/eastwood/fluke_gpib.h
deleted file mode 100644
index 493c200d0bbf..000000000000
--- a/drivers/staging/gpib/eastwood/fluke_gpib.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *   Author: Frank Mori Hess <fmh6jj@gmail.com>
- *   copyright: (C) 2006, 2010, 2015 Fluke Corporation
- ***************************************************************************/
-
-#include <linux/compiler.h>
-#include <linux/dmaengine.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include "nec7210.h"
-
-struct fluke_priv {
-	struct nec7210_priv nec7210_priv;
-	struct resource *gpib_iomem_res;
-	struct resource *write_transfer_counter_res;
-	struct resource *dma_port_res;
-	int irq;
-	struct dma_chan *dma_channel;
-	u8 *dma_buffer;
-	int dma_buffer_size;
-	void __iomem *write_transfer_counter;
-};
-
-// cb7210 specific registers and bits
-enum cb7210_regs {
-	STATE1_REG = 0x4,
-	ISR0_IMR0 = 0x6,
-	BUS_STATUS = 0x7
-};
-
-enum cb7210_page_in {
-	ISR0_IMR0_PAGE = 1,
-	BUS_STATUS_PAGE = 1,
-	STATE1_PAGE = 1
-};
-
-/* IMR0 -- Interrupt Mode Register 0 */
-enum imr0_bits {
-	FLUKE_IFCIE_BIT = 0x8,	/* interface clear interrupt */
-};
-
-/* ISR0 -- Interrupt Status Register 0 */
-enum isr0_bits {
-	FLUKE_IFCI_BIT = 0x8,	/* interface clear interrupt */
-};
-
-enum state1_bits {
-	SOURCE_HANDSHAKE_SIDS_BITS = 0x0, /* source idle state */
-	SOURCE_HANDSHAKE_SGNS_BITS = 0x1, /* source generate state */
-	SOURCE_HANDSHAKE_SDYS_BITS = 0x2, /* source delay state */
-	SOURCE_HANDSHAKE_STRS_BITS = 0x5, /* source transfer state */
-	SOURCE_HANDSHAKE_MASK = 0x7
-};
-
-/*
- * we customized the cb7210 vhdl to give the "data in" status
- * on the unused bit 7 of the address0 register.
- */
-enum cb7210_address0 {
-	DATA_IN_STATUS = 0x80
-};
-
-static inline int cb7210_page_in_bits(unsigned int page)
-{
-	return 0x50 | (page & 0xf);
-}
-
-// don't use without locking nec_priv->register_page_lock
-static inline u8 fluke_read_byte_nolock(struct nec7210_priv *nec_priv,
-					int register_num)
-{
-	u8 retval;
-
-	retval = readl(nec_priv->mmiobase + register_num * nec_priv->offset);
-	return retval;
-}
-
-// don't use without locking nec_priv->register_page_lock
-static inline void fluke_write_byte_nolock(struct nec7210_priv *nec_priv, u8 data,
-					   int register_num)
-{
-	writel(data, nec_priv->mmiobase + register_num * nec_priv->offset);
-}
-
-static inline u8 fluke_paged_read_byte(struct fluke_priv *e_priv,
-				       unsigned int register_num, unsigned int page)
-{
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	u8 retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
-	fluke_write_byte_nolock(nec_priv, cb7210_page_in_bits(page), AUXMR);
-	udelay(1);
-	/* chip auto clears the page after a read */
-	retval = fluke_read_byte_nolock(nec_priv, register_num);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
-	return retval;
-}
-
-static inline void fluke_paged_write_byte(struct fluke_priv *e_priv, u8 data,
-					  unsigned int register_num, unsigned int page)
-{
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nec_priv->register_page_lock, flags);
-	fluke_write_byte_nolock(nec_priv, cb7210_page_in_bits(page), AUXMR);
-	udelay(1);
-	fluke_write_byte_nolock(nec_priv, data, register_num);
-	spin_unlock_irqrestore(&nec_priv->register_page_lock, flags);
-}
-
-enum bus_status_bits {
-	BSR_ATN_BIT = 0x1,
-	BSR_EOI_BIT = 0x2,
-	BSR_SRQ_BIT = 0x4,
-	BSR_IFC_BIT = 0x8,
-	BSR_REN_BIT = 0x10,
-	BSR_DAV_BIT = 0x20,
-	BSR_NRFD_BIT = 0x40,
-	BSR_NDAC_BIT = 0x80,
-};
-
-enum cb7210_aux_cmds {
-/*
- * AUX_RTL2 is an undocumented aux command which causes cb7210 to assert
- * (and keep asserted) local rtl message.  This is used in conjunction
- * with the (stupid) cb7210 implementation
- * of the normal nec7210 AUX_RTL aux command, which
- * causes the rtl message to toggle between on and off.
- */
-	AUX_RTL2 = 0xd,
-	AUX_NBAF = 0xe,	// new byte available false (also clears seoi)
-	AUX_LO_SPEED = 0x40,
-	AUX_HI_SPEED = 0x41,
-};
-
-enum {
-	fluke_reg_offset = 4,
-	fluke_num_regs = 8,
-	write_transfer_counter_mask = 0x7ff,
-};
diff --git a/drivers/staging/gpib/fmh_gpib/Makefile b/drivers/staging/gpib/fmh_gpib/Makefile
deleted file mode 100644
index cc4d9e7cd5cd..000000000000
--- a/drivers/staging/gpib/fmh_gpib/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-
-obj-$(CONFIG_GPIB_FMH) += fmh_gpib.o
diff --git a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c b/drivers/staging/gpib/fmh_gpib/fmh_gpib.c
deleted file mode 100644
index f7bfb4a8e553..000000000000
--- a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c
+++ /dev/null
@@ -1,1754 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * GPIB Driver for fmh_gpib_core, see
- * https://github.com/fmhess/fmh_gpib_core
- *
- * More specifically, it is a driver for the hardware arrangement described by
- *  src/examples/fmh_gpib_top.vhd in the fmh_gpib_core repository.
- *
- * Author: Frank Mori Hess <fmh6jj@gmail.com>
- * Copyright: (C) 2006, 2010, 2015 Fluke Corporation
- *	(C) 2017 Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "fmh_gpib.h"
-
-#include "gpibP.h"
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB Driver for fmh_gpib_core");
-MODULE_AUTHOR("Frank Mori Hess <fmh6jj@gmail.com>");
-
-static irqreturn_t fmh_gpib_interrupt(int irq, void *arg);
-static int fmh_gpib_attach_holdoff_all(struct gpib_board *board,
-				       const struct gpib_board_config *config);
-static int fmh_gpib_attach_holdoff_end(struct gpib_board *board,
-				       const struct gpib_board_config *config);
-static void fmh_gpib_detach(struct gpib_board *board);
-static int fmh_gpib_pci_attach_holdoff_all(struct gpib_board *board,
-					   const struct gpib_board_config *config);
-static int fmh_gpib_pci_attach_holdoff_end(struct gpib_board *board,
-					   const struct gpib_board_config *config);
-static void fmh_gpib_pci_detach(struct gpib_board *board);
-static int fmh_gpib_config_dma(struct gpib_board *board, int output);
-static irqreturn_t fmh_gpib_internal_interrupt(struct gpib_board *board);
-static struct platform_driver fmh_gpib_platform_driver;
-static struct pci_driver fmh_gpib_pci_driver;
-
-// wrappers for interface functions
-static int fmh_gpib_read(struct gpib_board *board, u8 *buffer, size_t length,
-			 int *end, size_t *bytes_read)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-}
-
-static int fmh_gpib_write(struct gpib_board *board, u8 *buffer, size_t length,
-			  int send_eoi, size_t *bytes_written)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int fmh_gpib_command(struct gpib_board *board, u8 *buffer, size_t length,
-			    size_t *bytes_written)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int fmh_gpib_take_control(struct gpib_board *board, int synchronous)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int fmh_gpib_go_to_standby(struct gpib_board *board)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int fmh_gpib_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct fmh_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-
-	return nec7210_request_system_control(board, nec_priv, request_control);
-}
-
-static void fmh_gpib_interface_clear(struct gpib_board *board, int assert)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void fmh_gpib_remote_enable(struct gpib_board *board, int enable)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int fmh_gpib_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void fmh_gpib_disable_eos(struct gpib_board *board)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int fmh_gpib_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
-}
-
-static int fmh_gpib_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int fmh_gpib_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int fmh_gpib_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
-}
-
-static void fmh_gpib_parallel_poll_configure(struct gpib_board *board, u8 configuration)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, configuration);
-}
-
-static void fmh_gpib_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-static void fmh_gpib_local_parallel_poll_mode(struct gpib_board *board, int local)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	if (local) {
-		write_byte(&priv->nec7210_priv, AUX_I_REG | LOCAL_PPOLL_MODE_BIT, AUXMR);
-	} else	{
-		/*
-		 * For fmh_gpib_core, remote parallel poll config mode is unaffected by the
-		 * state of the disable bit of the parallel poll register (unlike the tnt4882).
-		 * So, we don't need to worry about that.
-		 */
-		write_byte(&priv->nec7210_priv, AUX_I_REG | 0x0, AUXMR);
-	}
-}
-
-static void fmh_gpib_serial_poll_response2(struct gpib_board *board, u8 status,
-					   int new_reason_for_service)
-{
-	struct fmh_priv *priv = board->private_data;
-	unsigned long flags;
-	const int MSS = status & request_service_bit;
-	const int reqt = MSS && new_reason_for_service;
-	const int reqf = MSS == 0;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (reqt) {
-		priv->nec7210_priv.srq_pending = 1;
-		clear_bit(SPOLL_NUM, &board->status);
-	} else if (reqf) {
-		priv->nec7210_priv.srq_pending = 0;
-	}
-
-	if (reqt) {
-		/*
-		 * It may seem like a race to issue reqt before updating
-		 * the status byte, but it is not.  The chip does not
-		 * issue the reqt until the SPMR is written to at
-		 * a later time.
-		 */
-		write_byte(&priv->nec7210_priv, AUX_REQT, AUXMR);
-	} else if (reqf) {
-		write_byte(&priv->nec7210_priv, AUX_REQF, AUXMR);
-	}
-	/*
-	 * We need to always zero bit 6 of the status byte before writing it to
-	 * the SPMR to insure we are using
-	 * serial poll mode SP1, and not accidentally triggering mode SP3.
-	 */
-	write_byte(&priv->nec7210_priv, status & ~request_service_bit, SPMR);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-static u8 fmh_gpib_serial_poll_status(struct gpib_board *board)
-{
-	struct fmh_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static void fmh_gpib_return_to_local(struct gpib_board *board)
-{
-	struct fmh_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-
-	write_byte(nec_priv, AUX_RTL2, AUXMR);
-	udelay(1);
-	write_byte(nec_priv, AUX_RTL, AUXMR);
-}
-
-static int fmh_gpib_line_status(const struct gpib_board *board)
-{
-	int status = VALID_ALL;
-	int bsr_bits;
-	struct fmh_priv *e_priv;
-	struct nec7210_priv *nec_priv;
-
-	e_priv = board->private_data;
-	nec_priv = &e_priv->nec7210_priv;
-
-	bsr_bits = read_byte(nec_priv, BUS_STATUS_REG);
-
-	if ((bsr_bits & BSR_REN_BIT) == 0)
-		status |= BUS_REN;
-	if ((bsr_bits & BSR_IFC_BIT) == 0)
-		status |= BUS_IFC;
-	if ((bsr_bits & BSR_SRQ_BIT) == 0)
-		status |= BUS_SRQ;
-	if ((bsr_bits & BSR_EOI_BIT) == 0)
-		status |= BUS_EOI;
-	if ((bsr_bits & BSR_NRFD_BIT) == 0)
-		status |= BUS_NRFD;
-	if ((bsr_bits & BSR_NDAC_BIT) == 0)
-		status |= BUS_NDAC;
-	if ((bsr_bits & BSR_DAV_BIT) == 0)
-		status |= BUS_DAV;
-	if ((bsr_bits & BSR_ATN_BIT) == 0)
-		status |= BUS_ATN;
-
-	return status;
-}
-
-static int fmh_gpib_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned int retval;
-
-	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
-
-	if (nano_sec <= 350) {
-		write_byte(nec_priv, AUX_HI_SPEED, AUXMR);
-		retval = 350;
-	} else {
-		write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
-	}
-	return retval;
-}
-
-static int lacs_or_read_ready(struct gpib_board *board)
-{
-	const struct fmh_priv *e_priv = board->private_data;
-	const struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = test_bit(LACS_NUM, &board->status) ||
-		test_bit(READ_READY_BN, &nec_priv->state);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-
-static int wait_for_read(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-
-	if (wait_event_interruptible(board->wait,
-				     lacs_or_read_ready(board) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	return retval;
-}
-
-static int wait_for_rx_fifo_half_full_or_end(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-
-	if (wait_event_interruptible(board->wait,
-				     (fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) &
-				      RX_FIFO_HALF_FULL) ||
-				     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	return retval;
-}
-
-/*
- * Wait until the gpib chip is ready to accept a data out byte.
- */
-static int wait_for_data_out_ready(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-
-	if (wait_event_interruptible(board->wait,
-				     (test_bit(TACS_NUM, &board->status) &&
-				      (read_byte(nec_priv, EXT_STATUS_1_REG) &
-				       DATA_OUT_STATUS_BIT)) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-
-	return retval;
-}
-
-static void fmh_gpib_dma_callback(void *arg)
-{
-	struct gpib_board *board = arg;
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE | HR_DIIE, HR_DOIE | HR_DIIE);
-	wake_up_interruptible(&board->wait);
-
-	fmh_gpib_internal_interrupt(board);
-
-	clear_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
-	clear_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-/*
- * returns true when all the bytes of a write have been transferred to
- * the chip and successfully transferred out over the gpib bus.
- */
-static int fmh_gpib_all_bytes_are_sent(struct fmh_priv *e_priv)
-{
-	if (fifos_read(e_priv, FIFO_XFER_COUNTER_REG) & fifo_xfer_counter_mask)
-		return 0;
-
-	if ((read_byte(&e_priv->nec7210_priv, EXT_STATUS_1_REG) & DATA_OUT_STATUS_BIT) == 0)
-		return 0;
-
-	return 1;
-}
-
-static int fmh_gpib_dma_write(struct gpib_board *board, u8 *buffer, size_t length,
-			      size_t *bytes_written)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned long flags;
-	int retval = 0;
-	dma_addr_t address;
-	struct dma_async_tx_descriptor *tx_desc;
-
-	*bytes_written = 0;
-	if (WARN_ON_ONCE(length > e_priv->dma_buffer_size))
-		return -EFAULT;
-	dmaengine_terminate_all(e_priv->dma_channel);
-	memcpy(e_priv->dma_buffer, buffer, length);
-	address = dma_map_single(board->dev, e_priv->dma_buffer, length, DMA_TO_DEVICE);
-	if (dma_mapping_error(board->dev,  address))
-		dev_err(board->gpib_dev, "dma mapping error in dma write!\n");
-	/* program dma controller */
-	retval = fmh_gpib_config_dma(board, 1);
-	if (retval)
-		goto cleanup;
-
-	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, address, length, DMA_MEM_TO_DEV,
-					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	if (!tx_desc) {
-		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
-		retval = -ENOMEM;
-		goto cleanup;
-	}
-	tx_desc->callback = fmh_gpib_dma_callback;
-	tx_desc->callback_param = board;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
-	fifos_write(e_priv, TX_FIFO_DMA_REQUEST_ENABLE | TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
-
-	dmaengine_submit(tx_desc);
-	dma_async_issue_pending(e_priv->dma_channel);
-	clear_bit(WRITE_READY_BN, &nec_priv->state);
-	set_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	// suspend until message is sent
-	if (wait_event_interruptible(board->wait,
-				     fmh_gpib_all_bytes_are_sent(e_priv) ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
-		retval = -EIO;
-	// disable board's dma
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
-	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-
-	dmaengine_terminate_all(e_priv->dma_channel);
-	// make sure fmh_gpib_dma_callback got called
-	if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &nec_priv->state))
-		fmh_gpib_dma_callback(board);
-
-	*bytes_written = length - (fifos_read(e_priv, FIFO_XFER_COUNTER_REG) &
-				   fifo_xfer_counter_mask);
-	if (WARN_ON_ONCE(*bytes_written > length))
-		return -EFAULT;
-cleanup:
-	dma_unmap_single(board->dev, address, length, DMA_TO_DEVICE);
-	return retval;
-}
-
-static int fmh_gpib_accel_write(struct gpib_board *board, u8 *buffer,
-				size_t length, int send_eoi, size_t *bytes_written)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	size_t remainder = length;
-	size_t transfer_size;
-	ssize_t retval = 0;
-	size_t dma_remainder = remainder;
-
-	if (!e_priv->dma_channel) {
-		dev_err(board->gpib_dev, "No dma channel available, cannot do accel write.");
-		return -ENXIO;
-	}
-
-	*bytes_written = 0;
-	if (length < 1)
-		return 0;
-
-	smp_mb__before_atomic();
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
-	smp_mb__after_atomic();
-
-	if (send_eoi)
-		--dma_remainder;
-
-	while (dma_remainder > 0) {
-		size_t num_bytes;
-
-		retval = wait_for_data_out_ready(board);
-		if (retval < 0)
-			break;
-
-		transfer_size = (e_priv->dma_buffer_size < dma_remainder) ?
-			e_priv->dma_buffer_size : dma_remainder;
-		retval = fmh_gpib_dma_write(board, buffer, transfer_size, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			break;
-		dma_remainder -= num_bytes;
-		remainder -= num_bytes;
-		buffer += num_bytes;
-		if (need_resched())
-			schedule();
-	}
-	if (retval < 0)
-		return retval;
-	// handle sending of last byte with eoi
-	if (send_eoi) {
-		size_t num_bytes;
-
-		if (WARN_ON_ONCE(remainder != 1))
-			return -EFAULT;
-
-		/*
-		 * wait until we are sure we will be able to write the data byte
-		 * into the chip before we send AUX_SEOI.  This prevents a timeout
-		 * scenario where we send AUX_SEOI but then timeout without getting
-		 * any bytes into the gpib chip.  This will result in the first byte
-		 * of the next write having a spurious EOI set on the first byte.
-		 */
-		retval = wait_for_data_out_ready(board);
-		if (retval < 0)
-			return retval;
-
-		write_byte(nec_priv, AUX_SEOI, AUXMR);
-		retval = fmh_gpib_dma_write(board, buffer, remainder, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-		remainder -= num_bytes;
-	}
-	return 0;
-}
-
-static int fmh_gpib_get_dma_residue(struct dma_chan *chan, dma_cookie_t cookie)
-{
-	struct dma_tx_state state;
-	int result;
-
-	result = dmaengine_pause(chan);
-	if (result < 0)	{
-		pr_err("dma pause failed?\n");
-		return result;
-	}
-	dmaengine_tx_status(chan, cookie, &state);
-	/*
-	 * dma330 hardware doesn't support resume, so dont call this
-	 * method unless the dma transfer is done.
-	 */
-	return state.residue;
-}
-
-static int wait_for_tx_fifo_half_empty(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-
-	if (wait_event_interruptible(board->wait,
-				     (test_bit(TACS_NUM, &board->status) &&
-				      (fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) &
-				       TX_FIFO_HALF_EMPTY)) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-
-	return retval;
-}
-
-/*
- * supports writing a chunk of data whose length must fit into the hardware'd xfer counter,
- * called in a loop by fmh_gpib_fifo_write()
- */
-static int fmh_gpib_fifo_write_countable(struct gpib_board *board, u8 *buffer,
-					 size_t length, int send_eoi, size_t *bytes_written)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-	unsigned int remainder;
-
-	*bytes_written = 0;
-	if (WARN_ON_ONCE(length > fifo_xfer_counter_mask))
-		return -EFAULT;
-
-	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
-	fifos_write(e_priv, TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
-
-	remainder = length;
-	while (remainder > 0) {
-		int i;
-
-		fifos_write(e_priv, TX_FIFO_HALF_EMPTY_INTERRUPT_ENABLE, FIFO_CONTROL_STATUS_REG);
-		retval = wait_for_tx_fifo_half_empty(board);
-		if (retval < 0)
-			goto cleanup;
-
-		for (i = 0; i < fmh_gpib_half_fifo_size(e_priv) && remainder > 0; ++i) {
-			unsigned int data_value = *buffer;
-
-			if (send_eoi && remainder == 1)
-				data_value |= FIFO_DATA_EOI_FLAG;
-			fifos_write(e_priv, data_value, FIFO_DATA_REG);
-			++buffer;
-			--remainder;
-		}
-	}
-
-	// suspend until last byte is sent
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, HR_DOIE);
-	if (wait_event_interruptible(board->wait,
-				     fmh_gpib_all_bytes_are_sent(e_priv) ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
-		retval = -EIO;
-
-cleanup:
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
-	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-
-	*bytes_written = length - (fifos_read(e_priv, FIFO_XFER_COUNTER_REG) &
-				   fifo_xfer_counter_mask);
-	if (WARN_ON_ONCE(*bytes_written > length))
-		return -EFAULT;
-
-	return retval;
-}
-
-static int fmh_gpib_fifo_write(struct gpib_board *board, u8 *buffer, size_t length,
-			       int send_eoi, size_t *bytes_written)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	size_t remainder = length;
-	size_t transfer_size;
-	ssize_t retval = 0;
-
-	*bytes_written = 0;
-	if (length < 1)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
-
-	while (remainder > 0) {
-		size_t num_bytes;
-		int last_pass;
-
-		retval = wait_for_data_out_ready(board);
-		if (retval < 0)
-			break;
-
-		if (fifo_xfer_counter_mask < remainder)	{
-			// round transfer size to a multiple of half fifo size
-			transfer_size = (fifo_xfer_counter_mask /
-					 fmh_gpib_half_fifo_size(e_priv)) *
-				fmh_gpib_half_fifo_size(e_priv);
-			last_pass = 0;
-		} else {
-			transfer_size = remainder;
-			last_pass = 1;
-		}
-		retval = fmh_gpib_fifo_write_countable(board, buffer, transfer_size,
-						       last_pass && send_eoi, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			break;
-		remainder -= num_bytes;
-		buffer += num_bytes;
-		if (need_resched())
-			schedule();
-	}
-
-	return retval;
-}
-
-static int fmh_gpib_dma_read(struct gpib_board *board, u8 *buffer,
-			     size_t length, int *end, size_t *bytes_read)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-	unsigned long flags;
-	int residue;
-	int wait_retval;
-	dma_addr_t bus_address;
-	struct dma_async_tx_descriptor *tx_desc;
-	dma_cookie_t dma_cookie;
-
-	*bytes_read = 0;
-	*end = 0;
-	if (length == 0)
-		return 0;
-
-	bus_address = dma_map_single(board->dev, e_priv->dma_buffer,
-				     length, DMA_FROM_DEVICE);
-	if (dma_mapping_error(board->dev, bus_address))
-		dev_err(board->gpib_dev, "dma mapping error in dma read!");
-
-	/* program dma controller */
-	retval = fmh_gpib_config_dma(board, 0);
-	if (retval) {
-		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-		return retval;
-	}
-	tx_desc = dmaengine_prep_slave_single(e_priv->dma_channel, bus_address,
-					      length, DMA_DEV_TO_MEM,
-					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	if (!tx_desc)  {
-		dev_err(board->gpib_dev, "failed to allocate dma transmit descriptor\n");
-		dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-		return -EIO;
-	}
-	tx_desc->callback = fmh_gpib_dma_callback;
-	tx_desc->callback_param = board;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	// enable nec7210 dma
-	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
-	fifos_write(e_priv, RX_FIFO_DMA_REQUEST_ENABLE | RX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
-
-	dma_cookie = dmaengine_submit(tx_desc);
-	dma_async_issue_pending(e_priv->dma_channel);
-
-	set_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	// wait for data to transfer
-	wait_retval = wait_event_interruptible(board->wait,
-					       test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state)
-					       == 0 ||
-					       test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-					       test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					       test_bit(TIMO_NUM, &board->status));
-	if (wait_retval)
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		retval = -EINTR;
-	// stop the dma transfer
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-	/*
-	 * give time for pl330 to transfer any in-flight data, since
-	 * pl330 will throw it away when dmaengine_pause is called.
-	 */
-	usleep_range(10, 15);
-	residue = fmh_gpib_get_dma_residue(e_priv->dma_channel, dma_cookie);
-	if (WARN_ON_ONCE(residue > length || residue < 0))
-		return -EFAULT;
-	*bytes_read += length - residue;
-	dmaengine_terminate_all(e_priv->dma_channel);
-	// make sure fmh_gpib_dma_callback got called
-	if (test_bit(DMA_READ_IN_PROGRESS_BN, &nec_priv->state))
-		fmh_gpib_dma_callback(board);
-
-	dma_unmap_single(board->dev, bus_address, length, DMA_FROM_DEVICE);
-	memcpy(buffer, e_priv->dma_buffer, *bytes_read);
-
-	/* Manually read any dregs out of fifo. */
-	while ((fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) & RX_FIFO_EMPTY) == 0) {
-		if ((*bytes_read) >= length) {
-			dev_err(board->dev, "unexpected extra bytes in rx fifo, discarding!  bytes_read=%d length=%d residue=%d\n",
-				(int)(*bytes_read), (int)length, (int)residue);
-			break;
-		}
-		buffer[(*bytes_read)++] = fifos_read(e_priv, FIFO_DATA_REG) & fifo_data_mask;
-	}
-
-	/*
-	 * If we got an end interrupt, figure out if it was
-	 * associated with the last byte we dma'd or with a
-	 * byte still sitting on the cb7210.
-	 */
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (*bytes_read > 0 && test_bit(READ_READY_BN, &nec_priv->state) == 0) {
-		/*
-		 * If there is no byte sitting on the cb7210 and we
-		 * saw an end, we need to deal with it now
-		 */
-		if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
-			*end = 1;
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-
-static void fmh_gpib_release_rfd_holdoff(struct gpib_board *board, struct fmh_priv *e_priv)
-{
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned int ext_status_1;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	ext_status_1 = read_byte(nec_priv, EXT_STATUS_1_REG);
-
-	/*
-	 * if there is an end byte sitting on the chip, don't release
-	 * holdoff.  We want it left set after we read out the end
-	 * byte.
-	 */
-	if ((ext_status_1 & (DATA_IN_STATUS_BIT | END_STATUS_BIT)) !=
-	    (DATA_IN_STATUS_BIT | END_STATUS_BIT))	{
-		if (ext_status_1 & RFD_HOLDOFF_STATUS_BIT)
-			write_byte(nec_priv, AUX_FH, AUXMR);
-
-		/*
-		 * Check if an end byte raced in before we executed the AUX_FH command.
-		 * If it did, we want to make sure the rfd holdoff is in effect.  The end
-		 * byte can arrive since
-		 * AUX_RFD_HOLDOFF_ASAP doesn't immediately force the acceptor handshake
-		 * to leave ACRS.
-		 */
-		if ((read_byte(nec_priv, EXT_STATUS_1_REG) &
-		     (RFD_HOLDOFF_STATUS_BIT | DATA_IN_STATUS_BIT | END_STATUS_BIT)) ==
-		    (DATA_IN_STATUS_BIT | END_STATUS_BIT)) {
-			write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
-			set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-		} else {
-			clear_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-		}
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-static int fmh_gpib_accel_read(struct gpib_board *board, u8 *buffer, size_t length,
-			       int *end, size_t *bytes_read)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	size_t remain = length;
-	size_t transfer_size;
-	int retval = 0;
-	size_t dma_nbytes;
-	unsigned long flags;
-
-	smp_mb__before_atomic();
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
-	smp_mb__after_atomic();
-	*end = 0;
-	*bytes_read = 0;
-
-	retval = wait_for_read(board);
-	if (retval < 0)
-		return retval;
-
-	fmh_gpib_release_rfd_holdoff(board, e_priv);
-	while (remain > 0) {
-		transfer_size = (e_priv->dma_buffer_size < remain) ?
-			e_priv->dma_buffer_size : remain;
-		retval = fmh_gpib_dma_read(board, buffer, transfer_size, end, &dma_nbytes);
-		remain -= dma_nbytes;
-		buffer += dma_nbytes;
-		*bytes_read += dma_nbytes;
-		if (*end)
-			break;
-		if (retval < 0)
-			break;
-		if (need_resched())
-			schedule();
-	}
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (test_bit(RFD_HOLDOFF_BN, &nec_priv->state) == 0) {
-		write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-
-/*
- * Read a chunk of data whose length is within the limits of the hardware's
- * xfer counter.  Called in a loop from fmh_gpib_fifo_read().
- */
-static int fmh_gpib_fifo_read_countable(struct gpib_board *board, u8 *buffer,
-					size_t length, int *end, size_t *bytes_read)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	int retval = 0;
-
-	*bytes_read = 0;
-	*end = 0;
-	if (length == 0)
-		return 0;
-
-	fifos_write(e_priv, length & fifo_xfer_counter_mask, FIFO_XFER_COUNTER_REG);
-	fifos_write(e_priv, RX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
-
-	while (*bytes_read < length && *end == 0) {
-		int i;
-
-		fifos_write(e_priv, RX_FIFO_HALF_FULL_INTERRUPT_ENABLE, FIFO_CONTROL_STATUS_REG);
-		retval = wait_for_rx_fifo_half_full_or_end(board);
-		if (retval < 0)
-			goto cleanup;
-
-		for (i = 0; i < fmh_gpib_half_fifo_size(e_priv) && *end == 0; ++i) {
-			unsigned int data_value;
-
-			data_value = fifos_read(e_priv, FIFO_DATA_REG);
-			buffer[(*bytes_read)++] = data_value & fifo_data_mask;
-			if (data_value & FIFO_DATA_EOI_FLAG)
-				*end = 1;
-		}
-	}
-
-cleanup:
-	// stop the transfer
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-	fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-
-	/* Manually read any dregs out of fifo. */
-	while ((fifos_read(e_priv, FIFO_CONTROL_STATUS_REG) & RX_FIFO_EMPTY) == 0) {
-		unsigned int data_value;
-
-		if ((*bytes_read) >= length) {
-			dev_err(board->dev, "unexpected extra bytes in rx fifo, discarding!  bytes_read=%d length=%d\n",
-				(int)(*bytes_read), (int)length);
-			break;
-		}
-		data_value = fifos_read(e_priv, FIFO_DATA_REG);
-		buffer[(*bytes_read)++] = data_value & fifo_data_mask;
-		if (data_value & FIFO_DATA_EOI_FLAG)
-			*end = 1;
-	}
-
-	return retval;
-}
-
-static int fmh_gpib_fifo_read(struct gpib_board *board, u8 *buffer, size_t length,
-			      int *end, size_t *bytes_read)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	size_t remain = length;
-	size_t transfer_size;
-	int retval = 0;
-	size_t nbytes;
-	unsigned long flags;
-
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state); // XXX FIXME
-	*end = 0;
-	*bytes_read = 0;
-
-	/*
-	 * Do a little prep with data in interrupt so that following wait_for_read()
-	 * will wake up if a data byte is received.
-	 */
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_DIIE, HR_DIIE);
-	fmh_gpib_interrupt(0, board);
-
-	retval = wait_for_read(board);
-	if (retval < 0)
-		return retval;
-
-	fmh_gpib_release_rfd_holdoff(board, e_priv);
-	while (remain > 0) {
-		if (fifo_xfer_counter_mask < remain) {
-			// round transfer size to a multiple of half fifo size
-			transfer_size = (fifo_xfer_counter_mask /
-					 fmh_gpib_half_fifo_size(e_priv)) *
-				fmh_gpib_half_fifo_size(e_priv);
-		} else {
-			transfer_size = remain;
-		}
-		retval = fmh_gpib_fifo_read_countable(board, buffer, transfer_size, end, &nbytes);
-		remain -= nbytes;
-		buffer += nbytes;
-		*bytes_read += nbytes;
-		if (*end)
-			break;
-		if (retval < 0)
-			break;
-		if (need_resched())
-			schedule();
-	}
-
-	if (*end == 0)	{
-		spin_lock_irqsave(&board->spinlock, flags);
-		write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-	}
-
-	return retval;
-}
-
-static struct gpib_interface fmh_gpib_unaccel_interface = {
-	.name = "fmh_gpib_unaccel",
-	.attach = fmh_gpib_attach_holdoff_all,
-	.detach = fmh_gpib_detach,
-	.read = fmh_gpib_read,
-	.write = fmh_gpib_write,
-	.command = fmh_gpib_command,
-	.take_control = fmh_gpib_take_control,
-	.go_to_standby = fmh_gpib_go_to_standby,
-	.request_system_control = fmh_gpib_request_system_control,
-	.interface_clear = fmh_gpib_interface_clear,
-	.remote_enable = fmh_gpib_remote_enable,
-	.enable_eos = fmh_gpib_enable_eos,
-	.disable_eos = fmh_gpib_disable_eos,
-	.parallel_poll = fmh_gpib_parallel_poll,
-	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
-	.parallel_poll_response = fmh_gpib_parallel_poll_response,
-	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
-	.line_status = fmh_gpib_line_status,
-	.update_status = fmh_gpib_update_status,
-	.primary_address = fmh_gpib_primary_address,
-	.secondary_address = fmh_gpib_secondary_address,
-	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
-	.serial_poll_status = fmh_gpib_serial_poll_status,
-	.t1_delay = fmh_gpib_t1_delay,
-	.return_to_local = fmh_gpib_return_to_local,
-};
-
-static struct gpib_interface fmh_gpib_interface = {
-	.name = "fmh_gpib",
-	.attach = fmh_gpib_attach_holdoff_end,
-	.detach = fmh_gpib_detach,
-	.read = fmh_gpib_accel_read,
-	.write = fmh_gpib_accel_write,
-	.command = fmh_gpib_command,
-	.take_control = fmh_gpib_take_control,
-	.go_to_standby = fmh_gpib_go_to_standby,
-	.request_system_control = fmh_gpib_request_system_control,
-	.interface_clear = fmh_gpib_interface_clear,
-	.remote_enable = fmh_gpib_remote_enable,
-	.enable_eos = fmh_gpib_enable_eos,
-	.disable_eos = fmh_gpib_disable_eos,
-	.parallel_poll = fmh_gpib_parallel_poll,
-	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
-	.parallel_poll_response = fmh_gpib_parallel_poll_response,
-	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
-	.line_status = fmh_gpib_line_status,
-	.update_status = fmh_gpib_update_status,
-	.primary_address = fmh_gpib_primary_address,
-	.secondary_address = fmh_gpib_secondary_address,
-	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
-	.serial_poll_status = fmh_gpib_serial_poll_status,
-	.t1_delay = fmh_gpib_t1_delay,
-	.return_to_local = fmh_gpib_return_to_local,
-};
-
-static struct gpib_interface fmh_gpib_pci_interface = {
-	.name = "fmh_gpib_pci",
-	.attach = fmh_gpib_pci_attach_holdoff_end,
-	.detach = fmh_gpib_pci_detach,
-	.read = fmh_gpib_fifo_read,
-	.write = fmh_gpib_fifo_write,
-	.command = fmh_gpib_command,
-	.take_control = fmh_gpib_take_control,
-	.go_to_standby = fmh_gpib_go_to_standby,
-	.request_system_control = fmh_gpib_request_system_control,
-	.interface_clear = fmh_gpib_interface_clear,
-	.remote_enable = fmh_gpib_remote_enable,
-	.enable_eos = fmh_gpib_enable_eos,
-	.disable_eos = fmh_gpib_disable_eos,
-	.parallel_poll = fmh_gpib_parallel_poll,
-	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
-	.parallel_poll_response = fmh_gpib_parallel_poll_response,
-	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
-	.line_status = fmh_gpib_line_status,
-	.update_status = fmh_gpib_update_status,
-	.primary_address = fmh_gpib_primary_address,
-	.secondary_address = fmh_gpib_secondary_address,
-	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
-	.serial_poll_status = fmh_gpib_serial_poll_status,
-	.t1_delay = fmh_gpib_t1_delay,
-	.return_to_local = fmh_gpib_return_to_local,
-};
-
-static struct gpib_interface fmh_gpib_pci_unaccel_interface = {
-	.name = "fmh_gpib_pci_unaccel",
-	.attach = fmh_gpib_pci_attach_holdoff_all,
-	.detach = fmh_gpib_pci_detach,
-	.read = fmh_gpib_read,
-	.write = fmh_gpib_write,
-	.command = fmh_gpib_command,
-	.take_control = fmh_gpib_take_control,
-	.go_to_standby = fmh_gpib_go_to_standby,
-	.request_system_control = fmh_gpib_request_system_control,
-	.interface_clear = fmh_gpib_interface_clear,
-	.remote_enable = fmh_gpib_remote_enable,
-	.enable_eos = fmh_gpib_enable_eos,
-	.disable_eos = fmh_gpib_disable_eos,
-	.parallel_poll = fmh_gpib_parallel_poll,
-	.parallel_poll_configure = fmh_gpib_parallel_poll_configure,
-	.parallel_poll_response = fmh_gpib_parallel_poll_response,
-	.local_parallel_poll_mode = fmh_gpib_local_parallel_poll_mode,
-	.line_status = fmh_gpib_line_status,
-	.update_status = fmh_gpib_update_status,
-	.primary_address = fmh_gpib_primary_address,
-	.secondary_address = fmh_gpib_secondary_address,
-	.serial_poll_response2 = fmh_gpib_serial_poll_response2,
-	.serial_poll_status = fmh_gpib_serial_poll_status,
-	.t1_delay = fmh_gpib_t1_delay,
-	.return_to_local = fmh_gpib_return_to_local,
-};
-
-irqreturn_t fmh_gpib_internal_interrupt(struct gpib_board *board)
-{
-	unsigned int status0, status1, status2, ext_status_1, fifo_status;
-	struct fmh_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-	int retval = IRQ_NONE;
-
-	status0 = read_byte(nec_priv, ISR0_IMR0_REG);
-	status1 = read_byte(nec_priv, ISR1);
-	status2 = read_byte(nec_priv, ISR2);
-	fifo_status = fifos_read(priv, FIFO_CONTROL_STATUS_REG);
-
-	if (status0 & IFC_INTERRUPT_BIT) {
-		push_gpib_event(board, EVENT_IFC);
-		retval = IRQ_HANDLED;
-	}
-
-	if (nec7210_interrupt_have_status(board, nec_priv, status1, status2) == IRQ_HANDLED)
-		retval = IRQ_HANDLED;
-
-	ext_status_1 = read_byte(nec_priv, EXT_STATUS_1_REG);
-
-	if (ext_status_1 & DATA_IN_STATUS_BIT)
-		set_bit(READ_READY_BN, &nec_priv->state);
-	else
-		clear_bit(READ_READY_BN, &nec_priv->state);
-
-	if (ext_status_1 & DATA_OUT_STATUS_BIT)
-		set_bit(WRITE_READY_BN, &nec_priv->state);
-	else
-		clear_bit(WRITE_READY_BN, &nec_priv->state);
-
-	if (ext_status_1 & COMMAND_OUT_STATUS_BIT)
-		set_bit(COMMAND_READY_BN, &nec_priv->state);
-	else
-		clear_bit(COMMAND_READY_BN, &nec_priv->state);
-
-	if (ext_status_1 & RFD_HOLDOFF_STATUS_BIT)
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-	else
-		clear_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-
-	if (ext_status_1 & END_STATUS_BIT) {
-		/*
-		 * only set RECEIVED_END while there is still a data
-		 * byte sitting in the chip, to avoid spuriously
-		 * setting it multiple times after it has been cleared
-		 * during a read.
-		 */
-		if (ext_status_1 & DATA_IN_STATUS_BIT)
-			set_bit(RECEIVED_END_BN, &nec_priv->state);
-	} else {
-		clear_bit(RECEIVED_END_BN, &nec_priv->state);
-	}
-
-	if ((fifo_status & TX_FIFO_HALF_EMPTY_INTERRUPT_IS_ENABLED) &&
-	    (fifo_status & TX_FIFO_HALF_EMPTY)) {
-		/*
-		 * We really only want to clear the
-		 * TX_FIFO_HALF_EMPTY_INTERRUPT_ENABLE bit in the
-		 * FIFO_CONTROL_STATUS_REG.  Since we are not being
-		 * careful, this also has a side effect of disabling
-		 * DMA requests and the RX fifo interrupt.  That is
-		 * fine though, since they should never be in use at
-		 * the same time as the TX fifo interrupt.
-		 */
-		fifos_write(priv, 0x0, FIFO_CONTROL_STATUS_REG);
-		retval = IRQ_HANDLED;
-	}
-
-	if ((fifo_status & RX_FIFO_HALF_FULL_INTERRUPT_IS_ENABLED) &&
-	    (fifo_status & RX_FIFO_HALF_FULL)) {
-		/*
-		 * We really only want to clear the
-		 * RX_FIFO_HALF_FULL_INTERRUPT_ENABLE bit in the
-		 * FIFO_CONTROL_STATUS_REG.  Since we are not being
-		 * careful, this also has a side effect of disabling
-		 * DMA requests and the TX fifo interrupt.  That is
-		 * fine though, since they should never be in use at
-		 * the same time as the RX fifo interrupt.
-		 */
-		fifos_write(priv, 0x0, FIFO_CONTROL_STATUS_REG);
-		retval = IRQ_HANDLED;
-	}
-
-	if (retval == IRQ_HANDLED)
-		wake_up_interruptible(&board->wait);
-
-	return retval;
-}
-
-irqreturn_t fmh_gpib_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = fmh_gpib_internal_interrupt(board);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-static int fmh_gpib_allocate_private(struct gpib_board *board)
-{
-	struct fmh_priv *priv;
-
-	board->private_data = kmalloc(sizeof(struct fmh_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-	priv = board->private_data;
-	memset(priv, 0, sizeof(struct fmh_priv));
-	init_nec7210_private(&priv->nec7210_priv);
-	priv->dma_buffer_size = 0x800;
-	priv->dma_buffer = kmalloc(priv->dma_buffer_size, GFP_KERNEL);
-	if (!priv->dma_buffer)
-		return -ENOMEM;
-	return 0;
-}
-
-static void fmh_gpib_generic_detach(struct gpib_board *board)
-{
-	if (board->private_data) {
-		struct fmh_priv *e_priv = board->private_data;
-
-		kfree(e_priv->dma_buffer);
-		kfree(board->private_data);
-		board->private_data = NULL;
-	}
-	if (board->dev)
-		dev_set_drvdata(board->dev, NULL);
-}
-
-// generic part of attach functions
-static int fmh_gpib_generic_attach(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-
-	board->status = 0;
-
-	retval = fmh_gpib_allocate_private(board);
-	if (retval < 0)
-		return retval;
-	e_priv = board->private_data;
-	nec_priv = &e_priv->nec7210_priv;
-	nec_priv->read_byte = gpib_cs_read_byte;
-	nec_priv->write_byte = gpib_cs_write_byte;
-	nec_priv->offset = 1;
-	nec_priv->type = CB7210;
-	return 0;
-}
-
-static int fmh_gpib_config_dma(struct gpib_board *board, int output)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct dma_slave_config config;
-
-	config.device_fc = true;
-
-	if (e_priv->dma_burst_length < 1) {
-		config.src_maxburst = 1;
-		config.dst_maxburst = 1;
-	} else {
-		config.src_maxburst = e_priv->dma_burst_length;
-		config.dst_maxburst = e_priv->dma_burst_length;
-	}
-
-	config.src_addr_width = 1;
-	config.dst_addr_width = 1;
-
-	if (output) {
-		config.direction = DMA_MEM_TO_DEV;
-		config.src_addr = 0;
-		config.dst_addr = e_priv->dma_port_res->start + FIFO_DATA_REG * fifo_reg_offset;
-	} else {
-		config.direction = DMA_DEV_TO_MEM;
-		config.src_addr = e_priv->dma_port_res->start + FIFO_DATA_REG * fifo_reg_offset;
-		config.dst_addr = 0;
-	}
-	return dmaengine_slave_config(e_priv->dma_channel, &config);
-}
-
-static int fmh_gpib_init(struct fmh_priv *e_priv, struct gpib_board *board, int handshake_mode)
-{
-	struct nec7210_priv *nec_priv = &e_priv->nec7210_priv;
-	unsigned long flags;
-	unsigned int fifo_status_bits;
-
-	fifos_write(e_priv, RX_FIFO_CLEAR | TX_FIFO_CLEAR, FIFO_CONTROL_STATUS_REG);
-
-	nec7210_board_reset(nec_priv, board);
-	write_byte(nec_priv, AUX_LO_SPEED, AUXMR);
-	nec7210_set_handshake_mode(board, nec_priv, handshake_mode);
-
-	/* Hueristically check if hardware supports fifo half full/empty interrupts */
-	fifo_status_bits = fifos_read(e_priv, FIFO_CONTROL_STATUS_REG);
-	e_priv->supports_fifo_interrupts = (fifo_status_bits & TX_FIFO_EMPTY) &&
-		(fifo_status_bits & TX_FIFO_HALF_EMPTY);
-
-	nec7210_board_online(nec_priv, board);
-
-	write_byte(nec_priv, IFC_INTERRUPT_ENABLE_BIT | ATN_INTERRUPT_ENABLE_BIT, ISR0_IMR0_REG);
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	write_byte(nec_priv, AUX_RFD_HOLDOFF_ASAP, AUXMR);
-	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return 0;
-}
-
-/* Match callback for driver_find_device */
-static int fmh_gpib_device_match(struct device *dev, const void *data)
-{
-	const struct gpib_board_config *config = data;
-
-	if (dev_get_drvdata(dev))
-		return 0;
-
-	if (gpib_match_device_path(dev, config->device_path) == 0)
-		return 0;
-
-	// driver doesn't support selection by serial number
-	if (config->serial_number)
-		return 0;
-
-	dev_dbg(dev, "matched: %s\n", of_node_full_name(dev_of_node((dev))));
-	return 1;
-}
-
-static int fmh_gpib_attach_impl(struct gpib_board *board, const struct gpib_board_config *config,
-				unsigned int handshake_mode, int acquire_dma)
-{
-	struct fmh_priv *e_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-	int irq;
-	struct resource *res;
-	struct platform_device *pdev;
-
-	board->dev = driver_find_device(&fmh_gpib_platform_driver.driver,
-					NULL, (const void *)config, &fmh_gpib_device_match);
-	if (!board->dev)	{
-		dev_err(board->gpib_dev, "No matching fmh_gpib_core device was found, attach failed.");
-		return -ENODEV;
-	}
-	// currently only used to mark the device as already attached
-	dev_set_drvdata(board->dev, board);
-	pdev = to_platform_device(board->dev);
-
-	retval = fmh_gpib_generic_attach(board);
-	if (retval)
-		return retval;
-
-	e_priv = board->private_data;
-	nec_priv = &e_priv->nec7210_priv;
-
-	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "gpib_control_status");
-	if (!res) {
-		dev_err(board->dev, "Unable to locate mmio resource\n");
-		return -ENODEV;
-	}
-
-	if (request_mem_region(res->start,
-			       resource_size(res),
-			       pdev->name) == NULL) {
-		dev_err(board->dev, "cannot claim registers\n");
-		return -ENXIO;
-	}
-	e_priv->gpib_iomem_res = res;
-
-	nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start,
-				     resource_size(e_priv->gpib_iomem_res));
-	if (!nec_priv->mmiobase) {
-		dev_err(board->dev, "Could not map I/O memory\n");
-		return -ENOMEM;
-	}
-	dev_dbg(board->dev, "iobase %pr remapped to %p\n",
-		e_priv->gpib_iomem_res, nec_priv->mmiobase);
-
-	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dma_fifos");
-	if (!res) {
-		dev_err(board->dev, "Unable to locate mmio resource for gpib dma port\n");
-		return -ENODEV;
-	}
-	if (request_mem_region(res->start,
-			       resource_size(res),
-			       pdev->name) == NULL) {
-		dev_err(board->dev, "cannot claim registers\n");
-		return -ENXIO;
-	}
-	e_priv->dma_port_res = res;
-	e_priv->fifo_base = ioremap(e_priv->dma_port_res->start,
-				    resource_size(e_priv->dma_port_res));
-	if (!e_priv->fifo_base) {
-		dev_err(board->dev, "Could not map I/O memory for fifos\n");
-		return -ENOMEM;
-	}
-	dev_dbg(board->dev, "dma fifos 0x%lx remapped to %p, length=%ld\n",
-		(unsigned long)e_priv->dma_port_res->start, e_priv->fifo_base,
-		(unsigned long)resource_size(e_priv->dma_port_res));
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return -EBUSY;
-	retval = request_irq(irq, fmh_gpib_interrupt, IRQF_SHARED, pdev->name, board);
-	if (retval) {
-		dev_err(board->dev,
-			"cannot register interrupt handler err=%d\n",
-			retval);
-		return retval;
-	}
-	e_priv->irq = irq;
-
-	if (acquire_dma) {
-		e_priv->dma_channel = dma_request_slave_channel(board->dev, "rxtx");
-		if (!e_priv->dma_channel) {
-			dev_err(board->dev, "failed to acquire dma channel \"rxtx\".\n");
-			return -EIO;
-		}
-	}
-	/*
-	 * in the future we might want to know the half-fifo size
-	 * (dma_burst_length) even when not using dma, so go ahead an
-	 * initialize it unconditionally.
-	 */
-	e_priv->dma_burst_length = fifos_read(e_priv, FIFO_MAX_BURST_LENGTH_REG) &
-		fifo_max_burst_length_mask;
-
-	return fmh_gpib_init(e_priv, board, handshake_mode);
-}
-
-int fmh_gpib_attach_holdoff_all(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return fmh_gpib_attach_impl(board, config, HR_HLDA, 0);
-}
-
-int fmh_gpib_attach_holdoff_end(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return fmh_gpib_attach_impl(board, config, HR_HLDE, 1);
-}
-
-void fmh_gpib_detach(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (e_priv) {
-		if (e_priv->dma_channel)
-			dma_release_channel(e_priv->dma_channel);
-		nec_priv = &e_priv->nec7210_priv;
-
-		if (e_priv->irq)
-			free_irq(e_priv->irq, board);
-		if (e_priv->fifo_base)
-			fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-		if (nec_priv->mmiobase) {
-			write_byte(nec_priv, 0, ISR0_IMR0_REG);
-			nec7210_board_reset(nec_priv, board);
-		}
-		if (e_priv->fifo_base)
-			iounmap(e_priv->fifo_base);
-		if (nec_priv->mmiobase)
-			iounmap(nec_priv->mmiobase);
-		if (e_priv->dma_port_res) {
-			release_mem_region(e_priv->dma_port_res->start,
-					   resource_size(e_priv->dma_port_res));
-		}
-		if (e_priv->gpib_iomem_res)
-			release_mem_region(e_priv->gpib_iomem_res->start,
-					   resource_size(e_priv->gpib_iomem_res));
-	}
-	fmh_gpib_generic_detach(board);
-
-	if (board->dev) {
-		put_device(board->dev);
-		board->dev = NULL;
-	}
-}
-
-static int fmh_gpib_pci_attach_impl(struct gpib_board *board,
-				    const struct gpib_board_config *config,
-				    unsigned int handshake_mode)
-{
-	struct fmh_priv *e_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-	struct pci_dev *pci_device;
-
-	retval = fmh_gpib_generic_attach(board);
-	if (retval)
-		return retval;
-
-	e_priv = board->private_data;
-	nec_priv = &e_priv->nec7210_priv;
-
-	// find board
-	pci_device = gpib_pci_get_device(config, BOGUS_PCI_VENDOR_ID_FLUKE,
-					 BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER, NULL);
-	if (!pci_device)	{
-		dev_err(board->gpib_dev, "No matching fmh_gpib_core pci device was found, attach failed.");
-		return -ENODEV;
-	}
-	board->dev = &pci_device->dev;
-
-	// bladerunner prototype has offset of 4 between gpib control/status registers
-	nec_priv->offset = 4;
-
-	if (pci_enable_device(pci_device)) {
-		dev_err(board->dev, "error enabling pci device\n");
-		return -EIO;
-	}
-	if (pci_request_regions(pci_device, KBUILD_MODNAME)) {
-		dev_err(board->dev, "pci_request_regions failed\n");
-		return -EIO;
-	}
-	e_priv->gpib_iomem_res = &pci_device->resource[gpib_control_status_pci_resource_index];
-	e_priv->dma_port_res =	&pci_device->resource[gpib_fifo_pci_resource_index];
-
-	nec_priv->mmiobase = ioremap(pci_resource_start(pci_device,
-							gpib_control_status_pci_resource_index),
-				     pci_resource_len(pci_device,
-						      gpib_control_status_pci_resource_index));
-	dev_dbg(board->dev, "base address for gpib control/status registers remapped to 0x%p\n",
-		nec_priv->mmiobase);
-
-	if (e_priv->dma_port_res->flags & IORESOURCE_MEM) {
-		e_priv->fifo_base = ioremap(pci_resource_start(pci_device,
-							       gpib_fifo_pci_resource_index),
-					    pci_resource_len(pci_device,
-							     gpib_fifo_pci_resource_index));
-		dev_dbg(board->dev, "base address for gpib fifo registers remapped to 0x%p\n",
-			e_priv->fifo_base);
-	} else {
-		e_priv->fifo_base = NULL;
-		dev_dbg(board->dev, "hardware has no gpib fifo registers.\n");
-	}
-
-	if (pci_device->irq) {
-		retval = request_irq(pci_device->irq, fmh_gpib_interrupt, IRQF_SHARED,
-				     KBUILD_MODNAME, board);
-		if (retval) {
-			dev_err(board->dev, "cannot register interrupt handler err=%d\n", retval);
-			return retval;
-		}
-	}
-	e_priv->irq = pci_device->irq;
-
-	e_priv->dma_burst_length = fifos_read(e_priv, FIFO_MAX_BURST_LENGTH_REG) &
-		fifo_max_burst_length_mask;
-
-	return fmh_gpib_init(e_priv, board, handshake_mode);
-}
-
-int fmh_gpib_pci_attach_holdoff_all(struct gpib_board *board,
-				    const struct gpib_board_config *config)
-{
-	return fmh_gpib_pci_attach_impl(board, config, HR_HLDA);
-}
-
-int fmh_gpib_pci_attach_holdoff_end(struct gpib_board *board,
-				    const struct gpib_board_config *config)
-{
-	int retval;
-	struct fmh_priv *e_priv;
-
-	retval = fmh_gpib_pci_attach_impl(board, config, HR_HLDE);
-	e_priv = board->private_data;
-	if (retval == 0 && e_priv && e_priv->supports_fifo_interrupts == 0) {
-		dev_err(board->gpib_dev, "your fmh_gpib_core does not appear to support fifo interrupts.  Try the fmh_gpib_pci_unaccel board type instead.");
-		return -EIO;
-	}
-	return retval;
-}
-
-void fmh_gpib_pci_detach(struct gpib_board *board)
-{
-	struct fmh_priv *e_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (e_priv)	{
-		nec_priv = &e_priv->nec7210_priv;
-
-		if (e_priv->irq)
-			free_irq(e_priv->irq, board);
-		if (e_priv->fifo_base)
-			fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG);
-		if (nec_priv->mmiobase) {
-			write_byte(nec_priv, 0, ISR0_IMR0_REG);
-			nec7210_board_reset(nec_priv, board);
-		}
-		if (e_priv->fifo_base)
-			iounmap(e_priv->fifo_base);
-		if (nec_priv->mmiobase)
-			iounmap(nec_priv->mmiobase);
-		if (e_priv->dma_port_res || e_priv->gpib_iomem_res)
-			pci_release_regions(to_pci_dev(board->dev));
-		if (board->dev)
-			pci_dev_put(to_pci_dev(board->dev));
-	}
-	fmh_gpib_generic_detach(board);
-}
-
-static int fmh_gpib_platform_probe(struct platform_device *pdev)
-{
-	return 0;
-}
-
-static const struct of_device_id fmh_gpib_of_match[] = {
-	{ .compatible = "fmhess,fmh_gpib_core"},
-	{ {0} }
-};
-MODULE_DEVICE_TABLE(of, fmh_gpib_of_match);
-
-static struct platform_driver fmh_gpib_platform_driver = {
-	.driver = {
-		.name = DRV_NAME,
-		.of_match_table = fmh_gpib_of_match,
-	},
-	.probe = &fmh_gpib_platform_probe
-};
-
-static int fmh_gpib_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return 0;
-}
-
-static const struct pci_device_id fmh_gpib_pci_match[] = {
-	{ BOGUS_PCI_VENDOR_ID_FLUKE, BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER, 0, 0, 0 },
-	{ 0 }
-};
-MODULE_DEVICE_TABLE(pci, fmh_gpib_pci_match);
-
-static struct pci_driver fmh_gpib_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = fmh_gpib_pci_match,
-	.probe = &fmh_gpib_pci_probe
-};
-
-static int __init fmh_gpib_init_module(void)
-{
-	int result;
-
-	result = platform_driver_register(&fmh_gpib_platform_driver);
-	if (result) {
-		pr_err("platform_driver_register failed: error = %d\n", result);
-		return result;
-	}
-
-	result = pci_register_driver(&fmh_gpib_pci_driver);
-	if (result) {
-		pr_err("pci_register_driver failed: error = %d\n", result);
-		goto err_pci_driver;
-	}
-
-	result = gpib_register_driver(&fmh_gpib_unaccel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_unaccel;
-	}
-
-	result = gpib_register_driver(&fmh_gpib_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_interface;
-	}
-
-	result = gpib_register_driver(&fmh_gpib_pci_unaccel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pci_unaccel;
-	}
-
-	result = gpib_register_driver(&fmh_gpib_pci_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pci;
-	}
-
-	return 0;
-
-err_pci:
-	gpib_unregister_driver(&fmh_gpib_pci_unaccel_interface);
-err_pci_unaccel:
-	gpib_unregister_driver(&fmh_gpib_interface);
-err_interface:
-	gpib_unregister_driver(&fmh_gpib_unaccel_interface);
-err_unaccel:
-	pci_unregister_driver(&fmh_gpib_pci_driver);
-err_pci_driver:
-	platform_driver_unregister(&fmh_gpib_platform_driver);
-
-	return result;
-}
-
-static void __exit fmh_gpib_exit_module(void)
-{
-	gpib_unregister_driver(&fmh_gpib_pci_interface);
-	gpib_unregister_driver(&fmh_gpib_pci_unaccel_interface);
-	gpib_unregister_driver(&fmh_gpib_unaccel_interface);
-	gpib_unregister_driver(&fmh_gpib_interface);
-
-	pci_unregister_driver(&fmh_gpib_pci_driver);
-	platform_driver_unregister(&fmh_gpib_platform_driver);
-}
-
-module_init(fmh_gpib_init_module);
-module_exit(fmh_gpib_exit_module);
diff --git a/drivers/staging/gpib/fmh_gpib/fmh_gpib.h b/drivers/staging/gpib/fmh_gpib/fmh_gpib.h
deleted file mode 100644
index e7602d7e1401..000000000000
--- a/drivers/staging/gpib/fmh_gpib/fmh_gpib.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    Author: Frank Mori Hess <fmh6jj@gmail.com>
- *   Copyright: (C) 2006, 2010, 2015 Fluke Corporation
- *	(C) 2017 Frank Mori Hess
- ***************************************************************************/
-
-#include <linux/dmaengine.h>
-#include <linux/ioport.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include "nec7210.h"
-
-static const int fifo_reg_offset = 2;
-
-static const int gpib_control_status_pci_resource_index;
-static const int gpib_fifo_pci_resource_index = 1;
-
-/* We don't have a real pci vendor/device id, the following will need to be
- * patched to match prototype hardware.
- */
-#define BOGUS_PCI_VENDOR_ID_FLUKE 0xffff
-#define BOGUS_PCI_DEVICE_ID_FLUKE_BLADERUNNER 0x0
-
-struct fmh_priv {
-	struct nec7210_priv nec7210_priv;
-	struct resource *gpib_iomem_res;
-	struct resource *write_transfer_counter_res;
-	struct resource *dma_port_res;
-	int irq;
-	struct dma_chan *dma_channel;
-	u8 *dma_buffer;
-	int dma_buffer_size;
-	int dma_burst_length;
-	void __iomem *fifo_base;
-	unsigned supports_fifo_interrupts : 1;
-};
-
-static inline int fmh_gpib_half_fifo_size(struct fmh_priv *priv)
-{
-	return priv->dma_burst_length;
-}
-
-// registers beyond the nec7210 register set
-enum fmh_gpib_regs {
-	EXT_STATUS_1_REG = 0x9,
-	STATE1_REG = 0xc,
-	ISR0_IMR0_REG = 0xe,
-	BUS_STATUS_REG = 0xf
-};
-
-/* IMR0 -- Interrupt Mode Register 0 */
-enum imr0_bits {
-	ATN_INTERRUPT_ENABLE_BIT = 0x4,
-	IFC_INTERRUPT_ENABLE_BIT = 0x8
-};
-
-/* ISR0 -- Interrupt Status Register 0 */
-enum isr0_bits {
-	ATN_INTERRUPT_BIT = 0x4,
-	IFC_INTERRUPT_BIT = 0x8
-};
-
-enum state1_bits {
-	SOURCE_HANDSHAKE_SIDS_BITS = 0x0, /* source idle state */
-	SOURCE_HANDSHAKE_SGNS_BITS = 0x1, /* source generate state */
-	SOURCE_HANDSHAKE_SDYS_BITS = 0x2, /* source delay state */
-	SOURCE_HANDSHAKE_STRS_BITS = 0x5, /* source transfer state */
-	SOURCE_HANDSHAKE_MASK = 0x7
-};
-
-enum fmh_gpib_auxmr_bits {
-	AUX_I_REG = 0xe0,
-};
-
-enum aux_reg_i_bits {
-	LOCAL_PPOLL_MODE_BIT = 0x4
-};
-
-enum ext_status_1_bits {
-	DATA_IN_STATUS_BIT = 0x01,
-	DATA_OUT_STATUS_BIT = 0x02,
-	COMMAND_OUT_STATUS_BIT = 0x04,
-	RFD_HOLDOFF_STATUS_BIT = 0x08,
-	END_STATUS_BIT = 0x10
-};
-
-/* dma fifo reg and bits */
-enum dma_fifo_regs {
-	FIFO_DATA_REG = 0x0,
-	FIFO_CONTROL_STATUS_REG = 0x1,
-	FIFO_XFER_COUNTER_REG = 0x2,
-	FIFO_MAX_BURST_LENGTH_REG = 0x3
-};
-
-enum fifo_data_bits {
-	FIFO_DATA_EOI_FLAG = 0x100
-};
-
-enum fifo_control_bits {
-	TX_FIFO_DMA_REQUEST_ENABLE = 0x0001,
-	TX_FIFO_CLEAR = 0x0002,
-	TX_FIFO_HALF_EMPTY_INTERRUPT_ENABLE = 0x0008,
-	RX_FIFO_DMA_REQUEST_ENABLE = 0x0100,
-	RX_FIFO_CLEAR = 0x0200,
-	RX_FIFO_HALF_FULL_INTERRUPT_ENABLE = 0x0800
-};
-
-enum fifo_status_bits {
-	TX_FIFO_EMPTY = 0x0001,
-	TX_FIFO_FULL = 0x0002,
-	TX_FIFO_HALF_EMPTY = 0x0004,
-	TX_FIFO_HALF_EMPTY_INTERRUPT_IS_ENABLED = 0x0008,
-	TX_FIFO_DMA_REQUEST_IS_ENABLED = 0x0010,
-	RX_FIFO_EMPTY = 0x0100,
-	RX_FIFO_FULL = 0x0200,
-	RX_FIFO_HALF_FULL = 0x0400,
-	RX_FIFO_HALF_FULL_INTERRUPT_IS_ENABLED = 0x0800,
-	RX_FIFO_DMA_REQUEST_IS_ENABLED = 0x1000
-};
-
-static const unsigned int fifo_data_mask = 0x00ff;
-static const unsigned int fifo_xfer_counter_mask = 0x0fff;
-static const unsigned int fifo_max_burst_length_mask = 0x00ff;
-
-static inline u8 gpib_cs_read_byte(struct nec7210_priv *nec_priv,
-				   unsigned int register_num)
-{
-	return readb(nec_priv->mmiobase + register_num * nec_priv->offset);
-}
-
-static inline void gpib_cs_write_byte(struct nec7210_priv *nec_priv, u8 data,
-				      unsigned int register_num)
-{
-	writeb(data, nec_priv->mmiobase + register_num * nec_priv->offset);
-}
-
-static inline uint16_t fifos_read(struct fmh_priv *fmh_priv, int register_num)
-{
-	if (!fmh_priv->fifo_base)
-		return 0;
-	return readw(fmh_priv->fifo_base + register_num * fifo_reg_offset);
-}
-
-static inline void fifos_write(struct fmh_priv *fmh_priv, uint16_t data, int register_num)
-{
-	if (!fmh_priv->fifo_base)
-		return;
-	writew(data, fmh_priv->fifo_base + register_num * fifo_reg_offset);
-}
-
-enum bus_status_bits {
-	BSR_ATN_BIT = 0x01,
-	BSR_EOI_BIT = 0x02,
-	BSR_SRQ_BIT = 0x04,
-	BSR_IFC_BIT = 0x08,
-	BSR_REN_BIT = 0x10,
-	BSR_DAV_BIT = 0x20,
-	BSR_NRFD_BIT = 0x40,
-	BSR_NDAC_BIT = 0x80,
-};
-
-enum fmh_gpib_aux_cmds {
-	/* AUX_RTL2 is an auxiliary command which causes the cb7210 to assert
-	 * (and keep asserted) the local rtl message.  This is used in conjunction
-	 * with the normal nec7210 AUX_RTL command, which
-	 * pulses the rtl message, having the effect of clearing rtl if it was left
-	 * asserted by AUX_RTL2.
-	 */
-	AUX_RTL2 = 0x0d,
-	AUX_RFD_HOLDOFF_ASAP = 0x15,
-	AUX_REQT = 0x18,
-	AUX_REQF = 0x19,
-	AUX_LO_SPEED = 0x40,
-	AUX_HI_SPEED = 0x41
-};
diff --git a/drivers/staging/gpib/gpio/Makefile b/drivers/staging/gpib/gpio/Makefile
deleted file mode 100644
index 00ea52abdda7..000000000000
--- a/drivers/staging/gpib/gpio/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-
-obj-$(CONFIG_GPIB_GPIO) += gpib_bitbang.o
-
-
diff --git a/drivers/staging/gpib/gpio/gpib_bitbang.c b/drivers/staging/gpib/gpio/gpib_bitbang.c
deleted file mode 100644
index 374cd61355e9..000000000000
--- a/drivers/staging/gpib/gpio/gpib_bitbang.c
+++ /dev/null
@@ -1,1469 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*************************************************************************
- *  This code has been developed at the Institute of Sensor and Actuator  *
- *  Systems (Technical University of Vienna, Austria) to enable the GPIO  *
- *  lines (e.g. of a raspberry pi) to function as a GPIO master device	  *
- *									  *
- *  authors		 : Thomas Klima					  *
- *			   Marcello Carla'				  *
- *			   Dave Penkler					  *
- *									  *
- *  copyright		 : (C) 2016 Thomas Klima			  *
- *									  *
- *************************************************************************/
-
-/*
- * limitations:
- *	works only on RPi
- *	cannot function as non-CIC system controller with SN7516x because
- *	SN75161B cannot simultaneously make ATN input with IFC and REN as
- *	outputs.
- * not implemented:
- *	parallel poll
- *	return2local
- *	device support (non master operation)
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define NAME KBUILD_MODNAME
-
-#define ENABLE_IRQ(IRQ, TYPE) irq_set_irq_type(IRQ, TYPE)
-#define DISABLE_IRQ(IRQ) irq_set_irq_type(IRQ, IRQ_TYPE_NONE)
-
-/*
- * Debug print levels:
- *  0 = load/unload info and errors that make the driver fail;
- *  1 = + warnings for unforeseen events that may break the current
- *	 operation and lead to a timeout, but do not affect the
- *       driver integrity (mainly unexpected interrupts);
- *  2 = + trace of function calls;
- *  3 = + trace of protocol codes;
- *  4 = + trace of interrupt operation.
- */
-#define dbg_printk(level, frm, ...)					\
-	do { if (debug >= (level))					\
-			dev_dbg(board->gpib_dev, frm, ## __VA_ARGS__); } \
-	while (0)
-
-#define LINVAL gpiod_get_value(DAV),		\
-		gpiod_get_value(NRFD),		\
-		gpiod_get_value(NDAC),		\
-		gpiod_get_value(SRQ)
-#define LINFMT "DAV: %d	 NRFD:%d  NDAC: %d SRQ: %d"
-
-#include "gpibP.h"
-#include "gpib_state_machines.h"
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/gpio/consumer.h>
-#include <linux/gpio/driver.h>
-#include <linux/gpio/machine.h>
-#include <linux/gpio.h>
-#include <linux/irq.h>
-
-static int sn7516x_used = 1, sn7516x;
-module_param(sn7516x_used, int, 0660);
-
-#define PINMAP_0 "elektronomikon"
-#define PINMAP_1 "gpib4pi-1.1"
-#define PINMAP_2 "yoga"
-static char *pin_map = PINMAP_0;
-module_param(pin_map, charp, 0660);
-MODULE_PARM_DESC(pin_map, " valid values: " PINMAP_0 " " PINMAP_1 " " PINMAP_2);
-
-/**********************************************
- *  Signal pairing and pin wiring between the *
- *  Raspberry-Pi connector and the GPIB bus   *
- *					      *
- *		 signal		  pin wiring  *
- *	      GPIB  Pi-gpio	GPIB  ->  RPi *
- **********************************************
- */
-enum lines_t {
-	D01_pin_nr =  20,     /*   1  ->  38  */
-	D02_pin_nr =  26,     /*   2  ->  37  */
-	D03_pin_nr =  16,     /*   3  ->  36  */
-	D04_pin_nr =  19,     /*   4  ->  35  */
-	D05_pin_nr =  13,     /*  13  ->  33  */
-	D06_pin_nr =  12,     /*  14  ->  32  */
-	D07_pin_nr =   6,     /*  15  ->  31  */
-	D08_pin_nr =   5,     /*  16  ->  29  */
-	EOI_pin_nr =   9,     /*   5  ->  21  */
-	DAV_pin_nr =  10,     /*   6  ->  19  */
-	NRFD_pin_nr = 24,     /*   7  ->  18  */
-	NDAC_pin_nr = 23,     /*   8  ->  16  */
-	IFC_pin_nr =  22,     /*   9  ->  15  */
-	SRQ_pin_nr =  11,     /*  10  ->  23  */
-	_ATN_pin_nr = 25,     /*  11  ->  22  */
-	REN_pin_nr =  27,     /*  17  ->  13  */
-/*
- *  GROUND PINS
- *    12,18,19,20,21,22,23,24  => 14,20,25,30,34,39
- */
-
-/*
- *  These lines are used to control the external
- *  SN75160/161 driver chips when used.
- *  When not used there is reduced fan out;
- *  currently tested with up to 4 devices.
- */
-
-/*		 Pi GPIO	RPI   75161B 75160B   Description       */
-	PE_pin_nr =    7,    /*	 26  ->	  nc	 11   Pullup Enable     */
-	DC_pin_nr =    8,    /*	 24  ->	  12	 nc   Direction control */
-	TE_pin_nr =   18,    /*	 12  ->	   2	  1   Talk Enable       */
-	ACT_LED_pin_nr = 4,  /*	  7  ->	 LED  */
-
-/* YOGA adapter uses different pinout to ease layout */
-	YOGA_D03_pin_nr =  13,
-	YOGA_D04_pin_nr =  12,
-	YOGA_D05_pin_nr =  21,
-	YOGA_D06_pin_nr =  19,
-};
-
-/*
- * GPIO descriptors and pins - WARNING: STRICTLY KEEP ITEMS ORDER
- */
-
-#define GPIB_PINS 16
-#define SN7516X_PINS 4
-#define NUM_PINS (GPIB_PINS + SN7516X_PINS)
-
-#define ACT_LED_ON do {						\
-		if (ACT_LED)					\
-			gpiod_direction_output(ACT_LED, 1);	\
-	} while (0)
-#define ACT_LED_OFF do {					\
-		if (ACT_LED)					\
-			gpiod_direction_output(ACT_LED, 0);	\
-	} while (0)
-
-static struct gpio_desc *all_descriptors[GPIB_PINS + SN7516X_PINS];
-
-#define D01 all_descriptors[0]
-#define D02 all_descriptors[1]
-#define D03 all_descriptors[2]
-#define D04 all_descriptors[3]
-#define D05 all_descriptors[4]
-#define D06 all_descriptors[5]
-#define D07 all_descriptors[6]
-#define D08 all_descriptors[7]
-
-#define EOI all_descriptors[8]
-#define NRFD all_descriptors[9]
-#define IFC all_descriptors[10]
-#define _ATN all_descriptors[11]
-#define REN all_descriptors[12]
-#define DAV all_descriptors[13]
-#define NDAC all_descriptors[14]
-#define SRQ all_descriptors[15]
-
-#define PE all_descriptors[16]
-#define DC all_descriptors[17]
-#define TE all_descriptors[18]
-#define ACT_LED all_descriptors[19]
-
-/* YOGA adapter uses a global enable for the buffer chips, re-using the TE pin */
-#define YOGA_ENABLE TE
-
-static int gpios_vector[] = {
-	D01_pin_nr,
-	D02_pin_nr,
-	D03_pin_nr,
-	D04_pin_nr,
-	D05_pin_nr,
-	D06_pin_nr,
-	D07_pin_nr,
-	D08_pin_nr,
-
-	EOI_pin_nr,
-	NRFD_pin_nr,
-	IFC_pin_nr,
-	_ATN_pin_nr,
-	REN_pin_nr,
-	DAV_pin_nr,
-	NDAC_pin_nr,
-	SRQ_pin_nr,
-
-	PE_pin_nr,
-	DC_pin_nr,
-	TE_pin_nr,
-	ACT_LED_pin_nr
-};
-
-/* Lookup table for general GPIOs */
-
-static struct gpiod_lookup_table gpib_gpio_table_1 = {
-	// for bcm2835/6
-	.dev_id = "",	 // device id of board device
-	.table = {
-		GPIO_LOOKUP_IDX("GPIO_GCLK",  U16_MAX, NULL,  4, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO5",	  U16_MAX, NULL,  5, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO6",	  U16_MAX, NULL,  6, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("SPI_CE1_N",  U16_MAX, NULL,  7, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("SPI_CE0_N",  U16_MAX, NULL,  8, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("SPI_MISO",	  U16_MAX, NULL,  9, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("SPI_MOSI",	  U16_MAX, NULL, 10, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("SPI_SCLK",	  U16_MAX, NULL, 11, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO12",	  U16_MAX, NULL, 12, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO13",	  U16_MAX, NULL, 13, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO16",	  U16_MAX, NULL, 16, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO17",	  U16_MAX, NULL, 17, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO18",	  U16_MAX, NULL, 18, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO19",	  U16_MAX, NULL, 19, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO20",	  U16_MAX, NULL, 20, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO21",	  U16_MAX, NULL, 21, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO22",	  U16_MAX, NULL, 22, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO23",	  U16_MAX, NULL, 23, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO24",	  U16_MAX, NULL, 24, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO25",	  U16_MAX, NULL, 25, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO26",	  U16_MAX, NULL, 26, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO27",	  U16_MAX, NULL, 27, GPIO_ACTIVE_HIGH),
-		{ }
-	},
-};
-
-static struct gpiod_lookup_table gpib_gpio_table_0 = {
-	.dev_id = "",	 // device id of board device
-	.table = {
-		// for bcm27xx based pis (b b+ 2b 3b 3b+ 4 5)
-		GPIO_LOOKUP_IDX("GPIO4",  U16_MAX, NULL,  4, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO5",  U16_MAX, NULL,  5, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO6",  U16_MAX, NULL,  6, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO7",  U16_MAX, NULL,  7, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO8",  U16_MAX, NULL,  8, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO9",  U16_MAX, NULL,  9, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO10", U16_MAX, NULL, 10, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO11", U16_MAX, NULL, 11, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO12", U16_MAX, NULL, 12, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO13", U16_MAX, NULL, 13, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO16", U16_MAX, NULL, 16, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO17", U16_MAX, NULL, 17, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO18", U16_MAX, NULL, 18, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO19", U16_MAX, NULL, 19, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO20", U16_MAX, NULL, 20, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO21", U16_MAX, NULL, 21, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO22", U16_MAX, NULL, 22, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO23", U16_MAX, NULL, 23, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO24", U16_MAX, NULL, 24, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO25", U16_MAX, NULL, 25, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO26", U16_MAX, NULL, 26, GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP_IDX("GPIO27", U16_MAX, NULL, 27, GPIO_ACTIVE_HIGH),
-		{ }
-	},
-};
-
-static struct gpiod_lookup_table *lookup_tables[] = {
-	&gpib_gpio_table_0,
-	&gpib_gpio_table_1,
-	NULL
-};
-
-/* struct which defines private_data for gpio driver */
-
-struct bb_priv {
-	int irq_NRFD;
-	int irq_NDAC;
-	int irq_DAV;
-	int irq_SRQ;
-	int dav_mode;	     /* dav  interrupt mode 0/1 -> edge/levels */
-	int nrfd_mode;	     /* nrfd interrupt mode 0/1 -> edge/levels */
-	int ndac_mode;	     /* nrfd interrupt mode 0/1 -> edge/levels */
-	int dav_tx;	     /* keep trace of DAV status while sending */
-	int dav_rx;	     /* keep trace of DAV status while receiving */
-	u8 eos;              /* eos character */
-	short eos_flags;     /* eos mode */
-	short eos_check;     /* eos check required in current operation ... */
-	short eos_check_8;   /* ... with byte comparison */
-	short eos_mask_7;    /* ... with 7 bit masked character */
-	short int end;
-	int request;
-	int count;
-	int direction;
-	int t1_delay;
-	u8 *rbuf;
-	u8 *wbuf;
-	int end_flag;
-	int r_busy;	      /* 0==idle   1==busy */
-	int w_busy;
-	int write_done;
-	int cmd;	      /* 1 = cmd write in progress */
-	size_t w_cnt;
-	size_t length;
-	u8 *w_buf;
-	spinlock_t rw_lock;   /* protect mods to rw_lock */
-	int phase;
-	int ndac_idle;
-	int ndac_seq;
-	int nrfd_idle;
-	int nrfd_seq;
-	int dav_seq;
-	long all_irqs;
-	int dav_idle;
-
-	enum talker_function_state talker_state;
-	enum listener_function_state listener_state;
-};
-
-static inline long usec_diff(struct timespec64 *a, struct timespec64 *b);
-static void bb_buffer_print(struct gpib_board *board, unsigned char *buffer, size_t length,
-			    int cmd, int eoi);
-static void set_data_lines(u8 byte);
-static u8 get_data_lines(void);
-static void set_data_lines_input(void);
-static void set_data_lines_output(void);
-static inline int check_for_eos(struct bb_priv *priv, u8 byte);
-static void set_atn(struct gpib_board *board, int atn_asserted);
-
-static inline void SET_DIR_WRITE(struct bb_priv *priv);
-static inline void SET_DIR_READ(struct bb_priv *priv);
-
-#define DIR_READ 0
-#define DIR_WRITE 1
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB helper functions for bitbanging I/O");
-
-/****  global variables	 ****/
-static int debug;
-module_param(debug, int, 0644);
-
-static char printable(char x)
-{
-	if (x < 32 || x > 126)
-		return ' ';
-	return x;
-}
-
-/***************************************************************************
- *									   *
- * READ									   *
- *									   *
- ***************************************************************************/
-
-static int bb_read(struct gpib_board *board, u8 *buffer, size_t length,
-		   int *end, size_t *bytes_read)
-{
-	struct bb_priv *priv = board->private_data;
-	unsigned long flags;
-	int retval = 0;
-
-	ACT_LED_ON;
-	SET_DIR_READ(priv);
-
-	dbg_printk(2, "board: %p  lock %d  length: %zu\n",
-		   board, mutex_is_locked(&board->user_mutex), length);
-
-	priv->end = 0;
-	priv->count = 0;
-	priv->rbuf = buffer;
-	if (length == 0)
-		goto read_end;
-	priv->request = length;
-	priv->eos_check = (priv->eos_flags & REOS) == 0; /* do eos check */
-	priv->eos_check_8 = priv->eos_flags & BIN;	 /* over 8 bits */
-	priv->eos_mask_7 = priv->eos & 0x7f;		 /* with this 7 bit eos */
-
-	dbg_printk(3, ".........." LINFMT "\n", LINVAL);
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-	priv->dav_mode = 1;
-	priv->dav_rx = 1;
-	ENABLE_IRQ(priv->irq_DAV, IRQ_TYPE_LEVEL_LOW);
-	priv->end_flag = 0;
-	gpiod_set_value(NRFD, 1); // ready for data
-	priv->r_busy = 1;
-	priv->phase = 100;
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-
-	/* wait for the interrupt routines finish their work */
-
-	retval = wait_event_interruptible(board->wait,
-					  (priv->end_flag || board->status & TIMO));
-
-	dbg_printk(3, "awake from wait queue: %d\n", retval);
-
-	if (retval == 0 && board->status & TIMO) {
-		retval = -ETIMEDOUT;
-		dbg_printk(1, "timeout\n");
-	} else if (retval) {
-		retval = -ERESTARTSYS;
-	}
-
-	DISABLE_IRQ(priv->irq_DAV);
-	spin_lock_irqsave(&priv->rw_lock, flags);
-	gpiod_set_value(NRFD, 0); // DIR_READ line state
-	priv->r_busy = 0;
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-
-read_end:
-	ACT_LED_OFF;
-	*bytes_read = priv->count;
-	*end = priv->end;
-	priv->r_busy = 0;
-	dbg_printk(2, "return: %d  eoi|eos: %d count: %d\n\n", retval, priv->end, priv->count);
-	return retval;
-}
-
-/***************************************************************************
- *									   *
- *	READ interrupt routine (DAV line)				   *
- *									   *
- ***************************************************************************/
-
-static irqreturn_t bb_DAV_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct bb_priv *priv = board->private_data;
-	int val;
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-
-	priv->all_irqs++;
-
-	if (priv->dav_mode) {
-		ENABLE_IRQ(priv->irq_DAV, IRQ_TYPE_EDGE_BOTH);
-		priv->dav_mode = 0;
-	}
-
-	if (priv->r_busy == 0) {
-		dbg_printk(1, "interrupt while idle after %d at %d\n",
-			   priv->count, priv->phase);
-		priv->dav_idle++;
-		priv->phase = 200;
-		goto dav_exit;	/* idle */
-	}
-
-	val = gpiod_get_value(DAV);
-	if (val == priv->dav_rx) {
-		dbg_printk(1, "out of order DAV interrupt %d/%d after %zu/%zu at %d cmd %d "
-			   LINFMT ".\n", val, priv->dav_rx, priv->w_cnt, priv->length,
-			   priv->phase, priv->cmd, LINVAL);
-		priv->dav_seq++;
-	}
-	priv->dav_rx = val;
-
-	dbg_printk(3, "> irq: %d  DAV: %d  st: %4lx dir: %d  busy: %d:%d\n",
-		   irq, val, board->status, priv->direction, priv->r_busy, priv->w_busy);
-
-	if (val == 0) {
-		gpiod_set_value(NRFD, 0); // not ready for data
-		priv->rbuf[priv->count++] = get_data_lines();
-		priv->end = !gpiod_get_value(EOI);
-		gpiod_set_value(NDAC, 1); // data accepted
-		priv->end |= check_for_eos(priv, priv->rbuf[priv->count - 1]);
-		priv->end_flag = ((priv->count >= priv->request) || priv->end);
-		priv->phase = 210;
-	} else {
-		gpiod_set_value(NDAC, 0);	// data not accepted
-		if (priv->end_flag) {
-			priv->r_busy = 0;
-			wake_up_interruptible(&board->wait);
-			priv->phase = 220;
-		} else {
-			gpiod_set_value(NRFD, 1);     // ready for data
-			priv->phase = 230;
-		}
-	}
-
-dav_exit:
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-	dbg_printk(3, "< irq: %d  count %d\n", irq, priv->count);
-	return IRQ_HANDLED;
-}
-
-/***************************************************************************
- *									   *
- * WRITE								   *
- *									   *
- ***************************************************************************/
-
-static int bb_write(struct gpib_board *board, u8 *buffer, size_t length,
-		    int send_eoi, size_t *bytes_written)
-{
-	unsigned long flags;
-	int retval = 0;
-
-	struct bb_priv *priv = board->private_data;
-
-	ACT_LED_ON;
-
-	priv->w_cnt = 0;
-	priv->w_buf = buffer;
-	dbg_printk(2, "board %p	lock %d	 length: %zu\n",
-		   board, mutex_is_locked(&board->user_mutex), length);
-
-	if (debug > 1)
-		bb_buffer_print(board, buffer, length, priv->cmd, send_eoi);
-	priv->count = 0;
-	priv->phase = 300;
-
-	if (length == 0)
-		goto write_end;
-	priv->end = send_eoi;
-	priv->length = length;
-
-	SET_DIR_WRITE(priv);
-
-	dbg_printk(2, "Enabling interrupts - NRFD: %d   NDAC: %d\n",
-		   gpiod_get_value(NRFD), gpiod_get_value(NDAC));
-
-	if (gpiod_get_value(NRFD) && gpiod_get_value(NDAC)) { /* check for listener */
-		retval = -ENOTCONN;
-		goto write_end;
-	}
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-	priv->w_busy = 1;	   /* make the interrupt routines active */
-	priv->write_done = 0;
-	priv->nrfd_mode = 1;
-	priv->ndac_mode = 1;
-	priv->dav_tx = 1;
-	ENABLE_IRQ(priv->irq_NDAC, IRQ_TYPE_LEVEL_HIGH);
-	ENABLE_IRQ(priv->irq_NRFD, IRQ_TYPE_LEVEL_HIGH);
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-
-	/* wait for the interrupt routines finish their work */
-
-	retval = wait_event_interruptible(board->wait,
-					  priv->write_done || (board->status & TIMO));
-
-	dbg_printk(3, "awake from wait queue: %d\n", retval);
-
-	if (retval == 0) {
-		if (board->status & TIMO) {
-			retval = -ETIMEDOUT;
-			dbg_printk(1, "timeout after %zu/%zu at %d " LINFMT " eoi: %d\n",
-				   priv->w_cnt, length, priv->phase, LINVAL, send_eoi);
-		} else {
-			retval = priv->w_cnt;
-		}
-	} else {
-		retval = -ERESTARTSYS;
-	}
-
-	DISABLE_IRQ(priv->irq_NRFD);
-	DISABLE_IRQ(priv->irq_NDAC);
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-	priv->w_busy = 0;
-	gpiod_set_value(DAV, 1); // DIR_WRITE line state
-	gpiod_set_value(EOI, 1); // De-assert EOI (in case)
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-
-write_end:
-	*bytes_written = priv->w_cnt;
-	ACT_LED_OFF;
-	dbg_printk(2, "sent %zu bytes\r\n\r\n", *bytes_written);
-	priv->phase = 310;
-	return retval;
-}
-
-/***************************************************************************
- *									   *
- *	WRITE interrupt routine (NRFD line)				   *
- *									   *
- ***************************************************************************/
-
-static irqreturn_t bb_NRFD_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct bb_priv *priv = board->private_data;
-	unsigned long flags;
-	int nrfd;
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-
-	nrfd = gpiod_get_value(NRFD);
-	priv->all_irqs++;
-
-	dbg_printk(3, "> irq: %d  NRFD: %d   NDAC: %d	st: %4lx dir: %d  busy: %d:%d\n",
-		   irq, nrfd, gpiod_get_value(NDAC), board->status, priv->direction,
-		   priv->w_busy, priv->r_busy);
-
-	if (priv->nrfd_mode) {
-		ENABLE_IRQ(priv->irq_NRFD, IRQ_TYPE_EDGE_RISING);
-		priv->nrfd_mode = 0;
-	}
-
-	if (priv->w_busy == 0) {
-		dbg_printk(1, "interrupt while idle after %zu/%zu at %d\n",
-			   priv->w_cnt, priv->length, priv->phase);
-		priv->nrfd_idle++;
-		goto nrfd_exit;	 /* idle */
-	}
-	if (nrfd == 0) {
-		dbg_printk(1, "out of order interrupt after %zu/%zu at %d cmd %d " LINFMT ".\n",
-			   priv->w_cnt, priv->length, priv->phase, priv->cmd, LINVAL);
-		priv->phase = 400;
-		priv->nrfd_seq++;
-		goto nrfd_exit;
-	}
-	if (!priv->dav_tx) {
-		dbg_printk(1, "DAV low after %zu/%zu cmd %d " LINFMT ". No action.\n",
-			   priv->w_cnt, priv->length, priv->cmd, LINVAL);
-		priv->dav_seq++;
-		goto nrfd_exit;
-	}
-
-	if (priv->w_cnt >= priv->length) { // test for missed NDAC end of transfer
-		dev_err(board->gpib_dev, "Unexpected NRFD exit\n");
-		priv->write_done = 1;
-		priv->w_busy = 0;
-		wake_up_interruptible(&board->wait);
-		goto nrfd_exit;
-	}
-
-	dbg_printk(3, "sending %zu\n", priv->w_cnt);
-
-	set_data_lines(priv->w_buf[priv->w_cnt++]); // put the data on the lines
-
-	if (priv->w_cnt == priv->length && priv->end) {
-		dbg_printk(3, "Asserting EOI\n");
-		gpiod_set_value(EOI, 0); // Assert EOI
-	}
-
-	gpiod_set_value(DAV, 0); // Data available
-	priv->dav_tx = 0;
-	priv->phase = 410;
-
-nrfd_exit:
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-
-	return IRQ_HANDLED;
-}
-
-/***************************************************************************
- *									   *
- *	WRITE interrupt routine (NDAC line)				   *
- *									   *
- ***************************************************************************/
-
-static irqreturn_t bb_NDAC_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct bb_priv *priv = board->private_data;
-	unsigned long flags;
-	int ndac;
-
-	spin_lock_irqsave(&priv->rw_lock, flags);
-
-	ndac = gpiod_get_value(NDAC);
-	priv->all_irqs++;
-	dbg_printk(3, "> irq: %d  NRFD: %d   NDAC: %d	st: %4lx dir: %d  busy: %d:%d\n",
-		   irq, gpiod_get_value(NRFD), ndac, board->status, priv->direction,
-		   priv->w_busy, priv->r_busy);
-
-	if (priv->ndac_mode) {
-		ENABLE_IRQ(priv->irq_NDAC, IRQ_TYPE_EDGE_RISING);
-		priv->ndac_mode = 0;
-	}
-
-	if (priv->w_busy == 0) {
-		dbg_printk(1, "interrupt while idle.\n");
-		priv->ndac_idle++;
-		goto ndac_exit;
-	}
-	if (ndac == 0) {
-		dbg_printk(1, "out of order interrupt at %zu:%d.\n", priv->w_cnt, priv->phase);
-		priv->phase = 500;
-		priv->ndac_seq++;
-		goto ndac_exit;
-	}
-	if (priv->dav_tx) {
-		dbg_printk(1, "DAV high after %zu/%zu cmd %d " LINFMT ". No action.\n",
-			   priv->w_cnt, priv->length, priv->cmd, LINVAL);
-		priv->dav_seq++;
-		goto ndac_exit;
-	}
-
-	dbg_printk(3, "accepted %zu\n", priv->w_cnt - 1);
-
-	gpiod_set_value(DAV, 1); // Data not available
-	priv->dav_tx = 1;
-	priv->phase = 510;
-
-	if (priv->w_cnt >= priv->length) { // test for end of transfer
-		priv->write_done = 1;
-		priv->w_busy = 0;
-		wake_up_interruptible(&board->wait);
-	}
-
-ndac_exit:
-	spin_unlock_irqrestore(&priv->rw_lock, flags);
-	return IRQ_HANDLED;
-}
-
-/***************************************************************************
- *									   *
- *	interrupt routine for SRQ line					   *
- *									   *
- ***************************************************************************/
-
-static irqreturn_t bb_SRQ_interrupt(int irq, void *arg)
-{
-	struct gpib_board  *board = arg;
-
-	int val = gpiod_get_value(SRQ);
-
-	dbg_printk(3, "> %d   st: %4lx\n", val, board->status);
-
-	if (!val)
-		set_bit(SRQI_NUM, &board->status);  /* set_bit() is atomic */
-
-	wake_up_interruptible(&board->wait);
-
-	return IRQ_HANDLED;
-}
-
-static int bb_command(struct gpib_board *board, u8 *buffer,
-		      size_t length, size_t *bytes_written)
-{
-	int ret;
-	struct bb_priv *priv = board->private_data;
-	int i;
-
-	dbg_printk(2, "%p  %p\n", buffer, board->buffer);
-
-	/* the _ATN line has already been asserted by bb_take_control() */
-
-	priv->cmd = 1;
-
-	ret = bb_write(board, buffer, length, 0, bytes_written); // no eoi
-
-	for (i = 0; i < length; i++) {
-		if (buffer[i] == UNT) {
-			priv->talker_state = talker_idle;
-		} else {
-			if (buffer[i] == UNL) {
-				priv->listener_state = listener_idle;
-			} else {
-				if (buffer[i] == (MTA(board->pad))) {
-					priv->talker_state = talker_addressed;
-					priv->listener_state = listener_idle;
-				} else if (buffer[i] == (MLA(board->pad))) {
-					priv->listener_state = listener_addressed;
-					priv->talker_state = talker_idle;
-				}
-			}
-		}
-	}
-
-	/* the _ATN line will be released by bb_go_to_stby */
-
-	priv->cmd = 0;
-
-	return ret;
-}
-
-/***************************************************************************
- *									   *
- *	Buffer print with decode for debug/trace			   *
- *									   *
- ***************************************************************************/
-
-static char *cmd_string[32] = {
-	"",    // 0x00
-	"GTL", // 0x01
-	"",    // 0x02
-	"",    // 0x03
-	"SDC", // 0x04
-	"PPC", // 0x05
-	"",    // 0x06
-	"",    // 0x07
-	"GET", // 0x08
-	"TCT", // 0x09
-	"",    // 0x0a
-	"",    // 0x0b
-	"",    // 0x0c
-	"",    // 0x0d
-	"",    // 0x0e
-	"",    // 0x0f
-	"",    // 0x10
-	"LLO", // 0x11
-	"",    // 0x12
-	"",    // 0x13
-	"DCL", // 0x14
-	"PPU", // 0x15
-	"",    // 0x16
-	"",    // 0x17
-	"SPE", // 0x18
-	"SPD", // 0x19
-	"",    // 0x1a
-	"",    // 0x1b
-	"",    // 0x1c
-	"",    // 0x1d
-	"",    // 0x1e
-	"CFE"  // 0x1f
-};
-
-static void bb_buffer_print(struct gpib_board *board, unsigned char *buffer, size_t length,
-			    int cmd, int eoi)
-{
-	int i;
-
-	if (cmd) {
-		dbg_printk(2, "<cmd len %zu>\n", length);
-		for (i = 0; i < length; i++) {
-			if (buffer[i] < 0x20) {
-				dbg_printk(3, "0x%x=%s\n", buffer[i], cmd_string[buffer[i]]);
-			} else if (buffer[i] == 0x3f) {
-				dbg_printk(3, "0x%x=%s\n", buffer[i], "UNL");
-			} else if (buffer[i] == 0x5f) {
-				dbg_printk(3, "0x%x=%s\n", buffer[i], "UNT");
-			} else	if (buffer[i] < 0x60) {
-				dbg_printk(3, "0x%x=%s%d\n", buffer[i],
-					   (buffer[i] & 0x40) ? "TLK" : "LSN", buffer[i] & 0x1F);
-			} else {
-				dbg_printk(3, "0x%x\n", buffer[i]);
-			}
-		}
-	} else {
-		dbg_printk(2, "<data len %zu %s>\n", length, (eoi) ? "w.EOI" : " ");
-		for (i = 0; i < length; i++)
-			dbg_printk(2, "%3d  0x%x->%c\n", i, buffer[i], printable(buffer[i]));
-	}
-}
-
-/***************************************************************************
- *									   *
- * STATUS Management							   *
- *									   *
- ***************************************************************************/
-static void set_atn(struct gpib_board *board, int atn_asserted)
-{
-	struct bb_priv *priv = board->private_data;
-
-	if (priv->listener_state != listener_idle &&
-	    priv->talker_state != talker_idle) {
-		dev_err(board->gpib_dev, "listener/talker state machine conflict\n");
-	}
-	if (atn_asserted) {
-		if (priv->listener_state == listener_active)
-			priv->listener_state = listener_addressed;
-		if (priv->talker_state == talker_active)
-			priv->talker_state = talker_addressed;
-		SET_DIR_WRITE(priv);  // need to be able to read bus NRFD/NDAC
-	} else {
-		if (priv->listener_state == listener_addressed) {
-			priv->listener_state = listener_active;
-			SET_DIR_READ(priv); // make sure holdoff is active when we unassert ATN
-		}
-		if (priv->talker_state == talker_addressed)
-			priv->talker_state = talker_active;
-	}
-	gpiod_direction_output(_ATN, !atn_asserted);
-}
-
-static int bb_take_control(struct gpib_board *board, int synchronous)
-{
-	dbg_printk(2, "%d\n", synchronous);
-	set_atn(board, 1);
-	return 0;
-}
-
-static int bb_go_to_standby(struct gpib_board *board)
-{
-	dbg_printk(2, "\n");
-	set_atn(board, 0);
-	return 0;
-}
-
-static int bb_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct bb_priv *priv = board->private_data;
-
-	dbg_printk(2, "%d\n", request_control);
-	if (!request_control)
-		return -EINVAL;
-
-	gpiod_direction_output(REN, 1); /* user space must enable REN if needed */
-	gpiod_direction_output(IFC, 1); /* user space must toggle IFC if needed */
-	if (sn7516x)
-		gpiod_direction_output(DC, 0); /* enable ATN as output on SN75161/2 */
-
-	gpiod_direction_input(SRQ);
-
-	ENABLE_IRQ(priv->irq_SRQ, IRQ_TYPE_EDGE_FALLING);
-
-	return 0;
-}
-
-static void bb_interface_clear(struct gpib_board *board, int assert)
-{
-	struct bb_priv *priv = board->private_data;
-
-	dbg_printk(2, "%d\n", assert);
-	if (assert) {
-		gpiod_direction_output(IFC, 0);
-		priv->talker_state = talker_idle;
-		priv->listener_state = listener_idle;
-		set_bit(CIC_NUM, &board->status);
-	} else {
-		gpiod_direction_output(IFC, 1);
-	}
-}
-
-static void bb_remote_enable(struct gpib_board *board, int enable)
-{
-	dbg_printk(2, "%d\n", enable);
-	if (enable) {
-		set_bit(REM_NUM, &board->status);
-		gpiod_direction_output(REN, 0);
-	} else {
-		clear_bit(REM_NUM, &board->status);
-		gpiod_direction_output(REN, 1);
-	}
-}
-
-static int bb_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct bb_priv *priv = board->private_data;
-
-	dbg_printk(2, "%s\n", "EOS_en");
-	priv->eos = eos_byte;
-	priv->eos_flags = REOS;
-	if (compare_8_bits)
-		priv->eos_flags |= BIN;
-
-	return 0;
-}
-
-static void bb_disable_eos(struct gpib_board *board)
-{
-	struct bb_priv *priv = board->private_data;
-
-	dbg_printk(2, "\n");
-	priv->eos_flags &= ~REOS;
-}
-
-static unsigned int bb_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct bb_priv *priv = board->private_data;
-
-	board->status &= ~clear_mask;
-
-	if (gpiod_get_value(SRQ))	       /* SRQ asserted low */
-		clear_bit(SRQI_NUM, &board->status);
-	else
-		set_bit(SRQI_NUM, &board->status);
-	if (gpiod_get_value(_ATN))			/* ATN asserted low */
-		clear_bit(ATN_NUM, &board->status);
-	else
-		set_bit(ATN_NUM, &board->status);
-	if (priv->talker_state == talker_active ||
-	    priv->talker_state == talker_addressed)
-		set_bit(TACS_NUM, &board->status);
-	else
-		clear_bit(TACS_NUM, &board->status);
-
-	if (priv->listener_state == listener_active ||
-	    priv->listener_state == listener_addressed)
-		set_bit(LACS_NUM, &board->status);
-	else
-		clear_bit(LACS_NUM, &board->status);
-
-	dbg_printk(2, "0x%lx mask 0x%x\n", board->status, clear_mask);
-
-	return board->status;
-}
-
-static int bb_primary_address(struct gpib_board *board, unsigned int address)
-{
-	dbg_printk(2, "%d\n", address);
-	board->pad = address;
-	return 0;
-}
-
-static int bb_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	dbg_printk(2, "%d %d\n", address, enable);
-	if (enable)
-		board->sad = address;
-	return 0;
-}
-
-static int bb_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	return -ENOENT;
-}
-
-static void bb_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-}
-
-static void bb_parallel_poll_response(struct gpib_board *board, int ist)
-{
-}
-
-static void bb_serial_poll_response(struct gpib_board *board, u8 status)
-{
-}
-
-static u8 bb_serial_poll_status(struct gpib_board *board)
-{
-	return 0; // -ENOENT;
-}
-
-static int bb_t1_delay(struct gpib_board *board,  unsigned int nano_sec)
-{
-	struct bb_priv *priv = board->private_data;
-
-	if (nano_sec <= 350)
-		priv->t1_delay = 350;
-	else if (nano_sec <= 1100)
-		priv->t1_delay = 1100;
-	else
-		priv->t1_delay = 2000;
-
-	dbg_printk(2, "t1 delay set to %d nanosec\n", priv->t1_delay);
-
-	return priv->t1_delay;
-}
-
-static void bb_return_to_local(struct gpib_board *board)
-{
-}
-
-static int bb_line_status(const struct gpib_board *board)
-{
-	int line_status = VALID_ALL;
-
-	if (gpiod_get_value(REN) == 0)
-		line_status |= BUS_REN;
-	if (gpiod_get_value(IFC) == 0)
-		line_status |= BUS_IFC;
-	if (gpiod_get_value(NDAC) == 0)
-		line_status |= BUS_NDAC;
-	if (gpiod_get_value(NRFD) == 0)
-		line_status |= BUS_NRFD;
-	if (gpiod_get_value(DAV) == 0)
-		line_status |= BUS_DAV;
-	if (gpiod_get_value(EOI) == 0)
-		line_status |= BUS_EOI;
-	if (gpiod_get_value(_ATN) == 0)
-		line_status |= BUS_ATN;
-	if (gpiod_get_value(SRQ) == 0)
-		line_status |= BUS_SRQ;
-
-	dbg_printk(2, "status lines: %4x\n", line_status);
-
-	return line_status;
-}
-
-/***************************************************************************
- *									   *
- * Module Management							   *
- *									   *
- ***************************************************************************/
-
-static int allocate_private(struct gpib_board *board)
-{
-	board->private_data = kzalloc(sizeof(struct bb_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	return 0;
-}
-
-static void free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static int bb_get_irq(struct gpib_board *board, char *name,
-		      struct gpio_desc *gpio, int *irq,
-		      irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags)
-{
-	if (!gpio)
-		return -1;
-	gpiod_direction_input(gpio);
-	*irq = gpiod_to_irq(gpio);
-	dbg_printk(2, "IRQ %s: %d\n", name, *irq);
-	if (*irq < 0) {
-		dev_err(board->gpib_dev, "can't get IRQ for %s\n", name);
-		return -1;
-	}
-	if (request_threaded_irq(*irq, handler, thread_fn, flags, name, board)) {
-		dev_err(board->gpib_dev, "can't request IRQ for %s %d\n", name, *irq);
-		*irq = 0;
-		return -1;
-	}
-	DISABLE_IRQ(*irq);
-	return 0;
-}
-
-static void bb_free_irq(struct gpib_board *board, int *irq, char *name)
-{
-	if (*irq) {
-		free_irq(*irq, board);
-		dbg_printk(2, "IRQ %d(%s) freed\n", *irq, name);
-		*irq = 0;
-	}
-}
-
-static void release_gpios(void)
-{
-	int j;
-
-	for (j = 0 ; j < NUM_PINS ; j++) {
-		if (all_descriptors[j]) {
-			gpiod_put(all_descriptors[j]);
-			all_descriptors[j] = NULL;
-		}
-	}
-}
-
-static int allocate_gpios(struct gpib_board *board)
-{
-	int j;
-	int table_index = 0;
-	char name[256];
-	struct gpio_desc *desc;
-	struct gpiod_lookup_table *lookup_table;
-
-	if (!board->gpib_dev) {
-		pr_err("NULL gpib dev for board\n");
-		return -ENOENT;
-	}
-
-	lookup_table = lookup_tables[table_index];
-	lookup_table->dev_id = dev_name(board->gpib_dev);
-	gpiod_add_lookup_table(lookup_table);
-	dbg_printk(1, "Allocating gpios using table index %d\n", table_index);
-
-	for (j = 0 ; j < NUM_PINS ; j++) {
-		if (gpios_vector[j] < 0)
-			continue;
-		/* name not really used in gpiod_get_index() */
-		sprintf(name, "GPIO%d", gpios_vector[j]);
-try_again:
-		dbg_printk(1, "Allocating gpio %s pin no %d\n", name, gpios_vector[j]);
-		desc = gpiod_get_index(board->gpib_dev, name, gpios_vector[j], GPIOD_IN);
-
-		if (IS_ERR(desc)) {
-			gpiod_remove_lookup_table(lookup_table);
-			table_index++;
-			lookup_table = lookup_tables[table_index];
-			if (!lookup_table) {
-				dev_err(board->gpib_dev, "Unable to obtain gpio descriptor for pin %d error %ld\n",
-					gpios_vector[j], PTR_ERR(desc));
-				goto alloc_gpios_fail;
-			}
-			dbg_printk(1, "Allocation failed, now using table_index %d\n", table_index);
-			lookup_table->dev_id = dev_name(board->gpib_dev);
-			gpiod_add_lookup_table(lookup_table);
-			goto try_again;
-		}
-		all_descriptors[j] = desc;
-	}
-
-	gpiod_remove_lookup_table(lookup_table);
-
-	return 0;
-
-alloc_gpios_fail:
-	release_gpios();
-	return -1;
-}
-
-static void bb_detach(struct gpib_board *board)
-{
-	struct bb_priv *priv = board->private_data;
-
-	dbg_printk(2, "Enter with data %p\n", board->private_data);
-	if (!board->private_data)
-		return;
-
-	bb_free_irq(board, &priv->irq_DAV, NAME "_DAV");
-	bb_free_irq(board, &priv->irq_NRFD, NAME "_NRFD");
-	bb_free_irq(board, &priv->irq_NDAC, NAME "_NDAC");
-	bb_free_irq(board, &priv->irq_SRQ, NAME "_SRQ");
-
-	if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA */
-		gpiod_set_value(YOGA_ENABLE, 0);
-	}
-
-	release_gpios();
-
-	dbg_printk(2, "detached board: %d\n", board->minor);
-	dbg_printk(0, "NRFD: idle %d, seq %d,  NDAC: idle %d, seq %d  DAV: idle %d  seq: %d  all: %ld",
-		   priv->nrfd_idle, priv->nrfd_seq,
-		   priv->ndac_idle, priv->ndac_seq,
-		   priv->dav_idle, priv->dav_seq, priv->all_irqs);
-
-	free_private(board);
-}
-
-static int bb_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct bb_priv *priv;
-	int retval = 0;
-
-	dbg_printk(2, "%s\n", "Enter ...");
-
-	board->status = 0;
-
-	if (allocate_private(board))
-		return -ENOMEM;
-	priv = board->private_data;
-	priv->direction = -1;
-	priv->t1_delay = 2000;
-	priv->listener_state = listener_idle;
-	priv->talker_state = talker_idle;
-
-	sn7516x = sn7516x_used;
-	if (strcmp(PINMAP_0, pin_map) == 0) {
-		if (!sn7516x) {
-			gpios_vector[&(PE) - &all_descriptors[0]] = -1;
-			gpios_vector[&(DC) - &all_descriptors[0]] = -1;
-			gpios_vector[&(TE) - &all_descriptors[0]] = -1;
-		}
-	} else if (strcmp(PINMAP_1, pin_map) == 0) {
-		if (!sn7516x) {
-			gpios_vector[&(PE) - &all_descriptors[0]] = -1;
-			gpios_vector[&(DC) - &all_descriptors[0]] = -1;
-			gpios_vector[&(TE) - &all_descriptors[0]] = -1;
-		}
-		gpios_vector[&(REN) - &all_descriptors[0]] = 0; /* 27 -> 0 REN on GPIB pin 0 */
-	} else if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA */
-		sn7516x = 0;
-		gpios_vector[&(D03) - &all_descriptors[0]] = YOGA_D03_pin_nr;
-		gpios_vector[&(D04) - &all_descriptors[0]] = YOGA_D04_pin_nr;
-		gpios_vector[&(D05) - &all_descriptors[0]] = YOGA_D05_pin_nr;
-		gpios_vector[&(D06) - &all_descriptors[0]] = YOGA_D06_pin_nr;
-		gpios_vector[&(PE)  - &all_descriptors[0]] = -1;
-		gpios_vector[&(DC)  - &all_descriptors[0]] = -1;
-	} else {
-		dev_err(board->gpib_dev, "Unrecognized pin map %s\n", pin_map);
-		goto bb_attach_fail;
-	}
-	dbg_printk(0, "Using pin map \"%s\" %s\n", pin_map, (sn7516x) ?
-		   " with SN7516x driver support" : "");
-
-	if (allocate_gpios(board))
-		goto bb_attach_fail;
-
-/*
- * Configure SN7516X control lines.
- * drive ATN, IFC and REN as outputs only when master
- * i.e. system controller. In this mode can only be the CIC
- * When not master then enable device mode ATN, IFC & REN as inputs
- */
-	if (sn7516x) {
-		gpiod_direction_output(DC, 0);
-		gpiod_direction_output(TE, 1);
-		gpiod_direction_output(PE, 1);
-	}
-/* Set main control lines to a known state */
-	gpiod_direction_output(IFC, 1);
-	gpiod_direction_output(REN, 1);
-	gpiod_direction_output(_ATN, 1);
-
-	if (strcmp(PINMAP_2, pin_map) == 0) { /* YOGA: enable level shifters */
-		gpiod_direction_output(YOGA_ENABLE, 1);
-	}
-
-	spin_lock_init(&priv->rw_lock);
-
-	/* request DAV interrupt for read */
-	if (bb_get_irq(board, NAME "_DAV", DAV, &priv->irq_DAV, bb_DAV_interrupt, NULL,
-		       IRQF_TRIGGER_NONE))
-		goto bb_attach_fail_r;
-
-	/* request NRFD interrupt for write */
-	if (bb_get_irq(board, NAME "_NRFD", NRFD, &priv->irq_NRFD, bb_NRFD_interrupt, NULL,
-		       IRQF_TRIGGER_NONE))
-		goto bb_attach_fail_r;
-
-	/* request NDAC interrupt for write */
-	if (bb_get_irq(board, NAME "_NDAC", NDAC, &priv->irq_NDAC, bb_NDAC_interrupt, NULL,
-		       IRQF_TRIGGER_NONE))
-		goto bb_attach_fail_r;
-
-	/* request SRQ interrupt for Service Request */
-	if (bb_get_irq(board, NAME "_SRQ", SRQ, &priv->irq_SRQ, bb_SRQ_interrupt, NULL,
-		       IRQF_TRIGGER_NONE))
-		goto bb_attach_fail_r;
-
-	dbg_printk(0, "attached board %d\n", board->minor);
-	goto bb_attach_out;
-
-bb_attach_fail_r:
-	release_gpios();
-bb_attach_fail:
-	retval = -1;
-bb_attach_out:
-	return retval;
-}
-
-static struct gpib_interface bb_interface = {
-	.name =	NAME,
-	.attach = bb_attach,
-	.detach = bb_detach,
-	.read = bb_read,
-	.write = bb_write,
-	.command = bb_command,
-	.take_control = bb_take_control,
-	.go_to_standby = bb_go_to_standby,
-	.request_system_control = bb_request_system_control,
-	.interface_clear = bb_interface_clear,
-	.remote_enable = bb_remote_enable,
-	.enable_eos = bb_enable_eos,
-	.disable_eos = bb_disable_eos,
-	.parallel_poll = bb_parallel_poll,
-	.parallel_poll_configure = bb_parallel_poll_configure,
-	.parallel_poll_response = bb_parallel_poll_response,
-	.line_status = bb_line_status,
-	.update_status = bb_update_status,
-	.primary_address = bb_primary_address,
-	.secondary_address = bb_secondary_address,
-	.serial_poll_response = bb_serial_poll_response,
-	.serial_poll_status = bb_serial_poll_status,
-	.t1_delay = bb_t1_delay,
-	.return_to_local = bb_return_to_local,
-};
-
-static int __init bb_init_module(void)
-{
-	int result = gpib_register_driver(&bb_interface, THIS_MODULE);
-
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	return 0;
-}
-
-static void __exit bb_exit_module(void)
-{
-	gpib_unregister_driver(&bb_interface);
-}
-
-module_init(bb_init_module);
-module_exit(bb_exit_module);
-
-/***************************************************************************
- *									   *
- * UTILITY Functions							   *
- *									   *
- ***************************************************************************/
-inline long usec_diff(struct timespec64 *a, struct timespec64 *b)
-{
-	return ((a->tv_sec - b->tv_sec) * 1000000 +
-		(a->tv_nsec - b->tv_nsec) / 1000);
-}
-
-static inline int check_for_eos(struct bb_priv *priv, u8 byte)
-{
-	if (priv->eos_check)
-		return 0;
-
-	if (priv->eos_check_8) {
-		if (priv->eos == byte)
-			return 1;
-	} else {
-		if (priv->eos_mask_7 == (byte & 0x7f))
-			return 1;
-	}
-	return 0;
-}
-
-static void set_data_lines_output(void)
-{
-	gpiod_direction_output(D01, 1);
-	gpiod_direction_output(D02, 1);
-	gpiod_direction_output(D03, 1);
-	gpiod_direction_output(D04, 1);
-	gpiod_direction_output(D05, 1);
-	gpiod_direction_output(D06, 1);
-	gpiod_direction_output(D07, 1);
-	gpiod_direction_output(D08, 1);
-}
-
-static void set_data_lines(u8 byte)
-{
-	gpiod_set_value(D01, !(byte & 0x01));
-	gpiod_set_value(D02, !(byte & 0x02));
-	gpiod_set_value(D03, !(byte & 0x04));
-	gpiod_set_value(D04, !(byte & 0x08));
-	gpiod_set_value(D05, !(byte & 0x10));
-	gpiod_set_value(D06, !(byte & 0x20));
-	gpiod_set_value(D07, !(byte & 0x40));
-	gpiod_set_value(D08, !(byte & 0x80));
-}
-
-static u8 get_data_lines(void)
-{
-	u8 ret;
-
-	ret = gpiod_get_value(D01);
-	ret |= gpiod_get_value(D02) << 1;
-	ret |= gpiod_get_value(D03) << 2;
-	ret |= gpiod_get_value(D04) << 3;
-	ret |= gpiod_get_value(D05) << 4;
-	ret |= gpiod_get_value(D06) << 5;
-	ret |= gpiod_get_value(D07) << 6;
-	ret |= gpiod_get_value(D08) << 7;
-	return ~ret;
-}
-
-static void set_data_lines_input(void)
-{
-	gpiod_direction_input(D01);
-	gpiod_direction_input(D02);
-	gpiod_direction_input(D03);
-	gpiod_direction_input(D04);
-	gpiod_direction_input(D05);
-	gpiod_direction_input(D06);
-	gpiod_direction_input(D07);
-	gpiod_direction_input(D08);
-}
-
-static inline void SET_DIR_WRITE(struct bb_priv *priv)
-{
-	if (priv->direction == DIR_WRITE)
-		return;
-
-	gpiod_direction_input(NRFD);
-	gpiod_direction_input(NDAC);
-	set_data_lines_output();
-	gpiod_direction_output(DAV, 1);
-	gpiod_direction_output(EOI, 1);
-
-	if (sn7516x) {
-		gpiod_set_value(PE, 1);	 /* set data lines to transmit on sn75160b */
-		gpiod_set_value(TE, 1);	 /* set NDAC and NRFD to receive and DAV to transmit */
-	}
-
-	priv->direction = DIR_WRITE;
-}
-
-static inline void SET_DIR_READ(struct bb_priv *priv)
-{
-	if (priv->direction == DIR_READ)
-		return;
-
-	gpiod_direction_input(DAV);
-	gpiod_direction_input(EOI);
-
-	set_data_lines_input();
-
-	if (sn7516x) {
-		gpiod_set_value(PE, 0);	 /* set data lines to receive on sn75160b */
-		gpiod_set_value(TE, 0);	 /* set NDAC and NRFD to transmit and DAV to receive */
-	}
-
-	gpiod_direction_output(NRFD, 0); /* hold off the talker */
-	gpiod_direction_output(NDAC, 0); /* data not accepted */
-
-	priv->direction = DIR_READ;
-}
diff --git a/drivers/staging/gpib/hp_82335/Makefile b/drivers/staging/gpib/hp_82335/Makefile
deleted file mode 100644
index 305ce44ee48a..000000000000
--- a/drivers/staging/gpib/hp_82335/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-
-obj-$(CONFIG_GPIB_HP82335) += hp82335.o
-
-
diff --git a/drivers/staging/gpib/hp_82335/hp82335.c b/drivers/staging/gpib/hp_82335/hp82335.c
deleted file mode 100644
index d0e47ef77c87..000000000000
--- a/drivers/staging/gpib/hp_82335/hp82335.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * copyright            : (C) 2002 by Frank Mori Hess                      *
- ***************************************************************************/
-
-/*
- * should enable ATN interrupts (and update board->status on occurrence),
- * implement recovery from bus errors (if necessary)
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "hp82335.h"
-#include <linux/io.h>
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/init.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for HP 82335 interface cards");
-
-static int hp82335_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static void hp82335_detach(struct gpib_board *board);
-static irqreturn_t hp82335_interrupt(int irq, void *arg);
-
-// wrappers for interface functions
-static int hp82335_read(struct gpib_board *board, u8 *buffer, size_t length,
-			int *end, size_t *bytes_read)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_read(board, &priv->tms9914_priv, buffer, length, end, bytes_read);
-}
-
-static int hp82335_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-			 size_t *bytes_written)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_write(board, &priv->tms9914_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int hp82335_command(struct gpib_board *board, u8 *buffer, size_t length,
-			   size_t *bytes_written)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_command(board, &priv->tms9914_priv, buffer, length, bytes_written);
-}
-
-static int hp82335_take_control(struct gpib_board *board, int synchronous)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_take_control(board, &priv->tms9914_priv, synchronous);
-}
-
-static int hp82335_go_to_standby(struct gpib_board *board)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_go_to_standby(board, &priv->tms9914_priv);
-}
-
-static int hp82335_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_request_system_control(board, &priv->tms9914_priv, request_control);
-}
-
-static void hp82335_interface_clear(struct gpib_board *board, int assert)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_interface_clear(board, &priv->tms9914_priv, assert);
-}
-
-static void hp82335_remote_enable(struct gpib_board *board, int enable)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_remote_enable(board, &priv->tms9914_priv, enable);
-}
-
-static int hp82335_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_enable_eos(board, &priv->tms9914_priv, eos_byte, compare_8_bits);
-}
-
-static void hp82335_disable_eos(struct gpib_board *board)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_disable_eos(board, &priv->tms9914_priv);
-}
-
-static unsigned int hp82335_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_update_status(board, &priv->tms9914_priv, clear_mask);
-}
-
-static int hp82335_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_primary_address(board, &priv->tms9914_priv, address);
-}
-
-static int hp82335_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_secondary_address(board, &priv->tms9914_priv, address, enable);
-}
-
-static int hp82335_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_parallel_poll(board, &priv->tms9914_priv, result);
-}
-
-static void hp82335_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_configure(board, &priv->tms9914_priv, config);
-}
-
-static void hp82335_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_response(board, &priv->tms9914_priv, ist);
-}
-
-static void hp82335_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_serial_poll_response(board, &priv->tms9914_priv, status);
-}
-
-static u8 hp82335_serial_poll_status(struct gpib_board *board)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_serial_poll_status(board, &priv->tms9914_priv);
-}
-
-static int hp82335_line_status(const struct gpib_board *board)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_line_status(board, &priv->tms9914_priv);
-}
-
-static int hp82335_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	return tms9914_t1_delay(board, &priv->tms9914_priv, nano_sec);
-}
-
-static void hp82335_return_to_local(struct gpib_board *board)
-{
-	struct hp82335_priv *priv = board->private_data;
-
-	tms9914_return_to_local(board, &priv->tms9914_priv);
-}
-
-static struct gpib_interface hp82335_interface = {
-	.name = "hp82335",
-	.attach = hp82335_attach,
-	.detach = hp82335_detach,
-	.read = hp82335_read,
-	.write = hp82335_write,
-	.command = hp82335_command,
-	.request_system_control = hp82335_request_system_control,
-	.take_control = hp82335_take_control,
-	.go_to_standby = hp82335_go_to_standby,
-	.interface_clear = hp82335_interface_clear,
-	.remote_enable = hp82335_remote_enable,
-	.enable_eos = hp82335_enable_eos,
-	.disable_eos = hp82335_disable_eos,
-	.parallel_poll = hp82335_parallel_poll,
-	.parallel_poll_configure = hp82335_parallel_poll_configure,
-	.parallel_poll_response = hp82335_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = hp82335_line_status,
-	.update_status = hp82335_update_status,
-	.primary_address = hp82335_primary_address,
-	.secondary_address = hp82335_secondary_address,
-	.serial_poll_response = hp82335_serial_poll_response,
-	.serial_poll_status = hp82335_serial_poll_status,
-	.t1_delay = hp82335_t1_delay,
-	.return_to_local = hp82335_return_to_local,
-};
-
-static int hp82335_allocate_private(struct gpib_board *board)
-{
-	board->private_data = kzalloc(sizeof(struct hp82335_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	return 0;
-}
-
-static void hp82335_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static inline unsigned int tms9914_to_hp82335_offset(unsigned int register_num)
-{
-	return 0x1ff8 + register_num;
-}
-
-static u8 hp82335_read_byte(struct tms9914_priv *priv, unsigned int register_num)
-{
-	return tms9914_iomem_read_byte(priv, tms9914_to_hp82335_offset(register_num));
-}
-
-static void hp82335_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
-{
-	tms9914_iomem_write_byte(priv, data, tms9914_to_hp82335_offset(register_num));
-}
-
-static void hp82335_clear_interrupt(struct hp82335_priv *hp_priv)
-{
-	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
-
-	writeb(0, tms_priv->mmiobase + HPREG_INTR_CLEAR);
-}
-
-static int hp82335_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct hp82335_priv *hp_priv;
-	struct tms9914_priv *tms_priv;
-	int retval;
-	const unsigned long upper_iomem_base = config->ibbase + hp82335_rom_size;
-
-	board->status = 0;
-
-	if (hp82335_allocate_private(board))
-		return -ENOMEM;
-	hp_priv = board->private_data;
-	tms_priv = &hp_priv->tms9914_priv;
-	tms_priv->read_byte = hp82335_read_byte;
-	tms_priv->write_byte = hp82335_write_byte;
-	tms_priv->offset = 1;
-
-	switch (config->ibbase) {
-	case 0xc4000:
-	case 0xc8000:
-	case 0xcc000:
-	case 0xd0000:
-	case 0xd4000:
-	case 0xd8000:
-	case 0xdc000:
-	case 0xe0000:
-	case 0xe4000:
-	case 0xe8000:
-	case 0xec000:
-	case 0xf0000:
-	case 0xf4000:
-	case 0xf8000:
-	case 0xfc000:
-		break;
-	default:
-		dev_err(board->gpib_dev, "invalid base io address 0x%x\n", config->ibbase);
-		return -EINVAL;
-	}
-	if (!request_mem_region(upper_iomem_base, hp82335_upper_iomem_size, "hp82335")) {
-		dev_err(board->gpib_dev, "failed to allocate io memory region 0x%lx-0x%lx\n",
-			upper_iomem_base, upper_iomem_base + hp82335_upper_iomem_size - 1);
-		return -EBUSY;
-	}
-	hp_priv->raw_iobase = upper_iomem_base;
-	tms_priv->mmiobase = ioremap(upper_iomem_base, hp82335_upper_iomem_size);
-
-	retval = request_irq(config->ibirq, hp82335_interrupt, 0, DRV_NAME, board);
-	if (retval) {
-		dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
-		return retval;
-	}
-	hp_priv->irq = config->ibirq;
-
-	tms9914_board_reset(tms_priv);
-
-	hp82335_clear_interrupt(hp_priv);
-
-	writeb(INTR_ENABLE, tms_priv->mmiobase + HPREG_CCR);
-
-	tms9914_online(board, tms_priv);
-
-	return 0;
-}
-
-static void hp82335_detach(struct gpib_board *board)
-{
-	struct hp82335_priv *hp_priv = board->private_data;
-	struct tms9914_priv *tms_priv;
-
-	if (hp_priv) {
-		tms_priv = &hp_priv->tms9914_priv;
-		if (hp_priv->irq)
-			free_irq(hp_priv->irq, board);
-		if (tms_priv->mmiobase) {
-			writeb(0, tms_priv->mmiobase + HPREG_CCR);
-			tms9914_board_reset(tms_priv);
-			iounmap(tms_priv->mmiobase);
-		}
-		if (hp_priv->raw_iobase)
-			release_mem_region(hp_priv->raw_iobase, hp82335_upper_iomem_size);
-	}
-	hp82335_free_private(board);
-}
-
-static int __init hp82335_init_module(void)
-{
-	int result = gpib_register_driver(&hp82335_interface, THIS_MODULE);
-
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	return 0;
-}
-
-static void __exit hp82335_exit_module(void)
-{
-	gpib_unregister_driver(&hp82335_interface);
-}
-
-module_init(hp82335_init_module);
-module_exit(hp82335_exit_module);
-
-/*
- * GPIB interrupt service routines
- */
-
-static irqreturn_t hp82335_interrupt(int irq, void *arg)
-{
-	int status1, status2;
-	struct gpib_board *board = arg;
-	struct hp82335_priv *priv = board->private_data;
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	status1 = read_byte(&priv->tms9914_priv, ISR0);
-	status2 = read_byte(&priv->tms9914_priv, ISR1);
-	hp82335_clear_interrupt(priv);
-	retval = tms9914_interrupt_have_status(board, &priv->tms9914_priv, status1, status2);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
diff --git a/drivers/staging/gpib/hp_82335/hp82335.h b/drivers/staging/gpib/hp_82335/hp82335.h
deleted file mode 100644
index 0c252a712ec9..000000000000
--- a/drivers/staging/gpib/hp_82335/hp82335.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess                   *
- ***************************************************************************/
-
-#ifndef _HP82335_H
-#define _HP82335_H
-
-#include "tms9914.h"
-#include "gpibP.h"
-
-// struct which defines private_data for board
-struct hp82335_priv  {
-	struct tms9914_priv tms9914_priv;
-	unsigned int irq;
-	unsigned long raw_iobase;
-};
-
-// size of io memory region used
-static const int hp82335_rom_size = 0x2000;
-static const int hp82335_upper_iomem_size = 0x2000;
-
-// hp82335 register offsets
-enum hp_read_regs {
-	HPREG_CSR = 0x17f8,
-	HPREG_STATUS = 0x1ffc,
-};
-
-enum hp_write_regs {
-	HPREG_INTR_CLEAR = 0x17f7,
-	HPREG_CCR = HPREG_CSR,
-};
-
-enum ccr_bits {
-	DMA_ENABLE = (1 << 0),   /* DMA enable                  */
-	DMA_CHAN_SELECT = (1 << 1),   /* DMA channel select  O=3,1=2 */
-	INTR_ENABLE = (1 << 2),   /* interrupt enable            */
-	SYS_DISABLE = (1 << 3),   /* system controller disable   */
-};
-
-enum csr_bits {
-	SWITCH6 = (1 << 0),   /* switch 6 position           */
-	SWITCH5 = (1 << 1),   /* switch 5 position           */
-	SYS_CONTROLLER = (1 << 2),   /* system controller bit       */
-	DMA_ENABLE_STATUS = (1 << 4),   /* DMA enabled                 */
-	DMA_CHAN_STATUS = (1 << 5),   /* DMA channel   0=3,1=2       */
-	INTR_ENABLE_STATUS = (1 << 6),   /* Interrupt enable            */
-	INTR_PENDING = (1 << 7),   /* Interrupt Pending           */
-};
-
-#endif	// _HP82335_H
diff --git a/drivers/staging/gpib/hp_82341/Makefile b/drivers/staging/gpib/hp_82341/Makefile
deleted file mode 100644
index 21367310a17e..000000000000
--- a/drivers/staging/gpib/hp_82341/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-
-obj-$(CONFIG_GPIB_HP82341) += hp_82341.o
diff --git a/drivers/staging/gpib/hp_82341/hp_82341.c b/drivers/staging/gpib/hp_82341/hp_82341.c
deleted file mode 100644
index 1a2ad0560e14..000000000000
--- a/drivers/staging/gpib/hp_82341/hp_82341.c
+++ /dev/null
@@ -1,907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *     Driver for hp 82341a/b/c/d boards.                                  *
- * Might be worth merging with Agilent 82350b driver.                      *
- *   copyright            : (C) 2002, 2005 by Frank Mori Hess              *
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "hp_82341.h"
-#include <linux/delay.h>
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/isapnp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for hp 82341a/b/c/d boards");
-
-static unsigned short read_and_clear_event_status(struct gpib_board *board);
-static void set_transfer_counter(struct hp_82341_priv *hp_priv, int count);
-static int read_transfer_counter(struct hp_82341_priv *hp_priv);
-static int hp_82341_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-			  size_t *bytes_written);
-static irqreturn_t hp_82341_interrupt(int irq, void *arg);
-
-static int hp_82341_accel_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-			       size_t *bytes_read)
-{
-	struct hp_82341_priv *hp_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
-	int retval = 0;
-	unsigned short event_status;
-	int i;
-	int num_fifo_bytes;
-	// hardware doesn't support checking for end-of-string character when using fifo
-	if (tms_priv->eos_flags & REOS)
-		return tms9914_read(board, tms_priv, buffer, length, end, bytes_read);
-
-	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
-
-	read_and_clear_event_status(board);
-	*end = 0;
-	*bytes_read = 0;
-	if (length == 0)
-		return 0;
-	// disable fifo for the moment
-	outb(DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
-	/*
-	 * Handle corner case of board not in holdoff and one byte has slipped in already.
-	 * Also, board sometimes has problems (spurious 1 byte reads) when read fifo is
-	 * started up with board in TACS under certain data holdoff conditions.
-	 * Doing a 1 byte tms9914-style read avoids these problems.
-	 */
-	if (/*tms_priv->holdoff_active == 0 && */length > 1) {
-		size_t num_bytes;
-
-		retval = tms9914_read(board, tms_priv, buffer, 1, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0)
-			dev_err(board->gpib_dev, "tms9914_read failed retval=%i\n", retval);
-		if (retval < 0 || *end)
-			return retval;
-		++buffer;
-		--length;
-	}
-	tms9914_set_holdoff_mode(tms_priv, TMS9914_HOLDOFF_EOI);
-	tms9914_release_holdoff(tms_priv);
-	outb(0x00, hp_priv->iobase[3] + BUFFER_FLUSH_REG);
-	i = 0;
-	num_fifo_bytes = length - 1;
-	while (i < num_fifo_bytes && *end == 0)	{
-		int block_size;
-		int j;
-		int count;
-
-		block_size = min(num_fifo_bytes - i, hp_82341_fifo_size);
-		set_transfer_counter(hp_priv, block_size);
-		outb(ENABLE_TI_BUFFER_BIT | DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] +
-		     BUFFER_CONTROL_REG);
-		if (inb(hp_priv->iobase[0] + STREAM_STATUS_REG) & HALTED_STATUS_BIT)
-			outb(RESTART_STREAM_BIT, hp_priv->iobase[0] + STREAM_STATUS_REG);
-
-		clear_bit(READ_READY_BN, &tms_priv->state);
-
-		retval = wait_event_interruptible(board->wait,
-						  ((event_status =
-						    read_and_clear_event_status(board)) &
-						   (TERMINAL_COUNT_EVENT_BIT |
-						    BUFFER_END_EVENT_BIT)) ||
-						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
-						  test_bit(TIMO_NUM, &board->status));
-		if (retval)  {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		// have to disable buffer before we can read from buffer port
-		outb(DIRECTION_GPIB_TO_HOST_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
-		count = block_size - read_transfer_counter(hp_priv);
-		j = 0;
-		while (j < count && i < num_fifo_bytes) {
-			unsigned short data_word = inw(hp_priv->iobase[3] + BUFFER_PORT_LOW_REG);
-
-			buffer[i++] = data_word & 0xff;
-			++j;
-			if (j < count && i < num_fifo_bytes) {
-				buffer[i++] = (data_word >> 8) & 0xff;
-				++j;
-			}
-		}
-		if (event_status & BUFFER_END_EVENT_BIT) {
-			clear_bit(RECEIVED_END_BN, &tms_priv->state);
-
-			*end = 1;
-			tms_priv->holdoff_active = 1;
-		}
-		if (test_bit(TIMO_NUM, &board->status))	{
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-	}
-	*bytes_read += i;
-	buffer += i;
-	length -= i;
-	if (retval < 0)
-		return retval;
-	// read last byte if we havn't received an END yet
-	if (*end == 0) {
-		size_t num_bytes;
-		// try to make sure we holdoff after last byte read
-		retval = tms9914_read(board, tms_priv, buffer, length, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static int restart_write_fifo(struct gpib_board *board, struct hp_82341_priv *hp_priv)
-{
-	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
-
-	if ((inb(hp_priv->iobase[0] + STREAM_STATUS_REG) & HALTED_STATUS_BIT) == 0)
-		return 0;
-	while (1) {
-		int status;
-
-		// restart doesn't work if data holdoff is in effect
-		status = tms9914_line_status(board, tms_priv);
-		if ((status & BUS_NRFD) == 0) {
-			outb(RESTART_STREAM_BIT, hp_priv->iobase[0] + STREAM_STATUS_REG);
-			return 0;
-		}
-		if (test_bit(DEV_CLEAR_BN, &tms_priv->state))
-			return -EINTR;
-		if (test_bit(TIMO_NUM, &board->status))
-			return -ETIMEDOUT;
-		if (msleep_interruptible(1))
-			return -EINTR;
-	}
-	return 0;
-}
-
-static int hp_82341_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
-				int send_eoi, size_t *bytes_written)
-{
-	struct hp_82341_priv *hp_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
-	int i, j;
-	unsigned short event_status;
-	int retval = 0;
-	int fifo_xfer_len = length;
-
-	*bytes_written = 0;
-	if (send_eoi)
-		--fifo_xfer_len;
-
-	clear_bit(DEV_CLEAR_BN, &tms_priv->state);
-
-	read_and_clear_event_status(board);
-	outb(0, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
-	outb(0x00, hp_priv->iobase[3] + BUFFER_FLUSH_REG);
-	for (i = 0; i < fifo_xfer_len;) {
-		int block_size;
-
-		block_size = min(fifo_xfer_len - i, hp_82341_fifo_size);
-		set_transfer_counter(hp_priv, block_size);
-		// load data into board's fifo
-		for (j = 0; j < block_size;) {
-			unsigned short data_word = buffer[i++];
-			++j;
-			if (j < block_size) {
-				data_word |= buffer[i++] << 8;
-				++j;
-			}
-			outw(data_word, hp_priv->iobase[3] + BUFFER_PORT_LOW_REG);
-		}
-		clear_bit(WRITE_READY_BN, &tms_priv->state);
-		outb(ENABLE_TI_BUFFER_BIT, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
-		retval = restart_write_fifo(board, hp_priv);
-		if (retval < 0)	{
-			dev_err(board->gpib_dev, "failed to restart write stream\n");
-			break;
-		}
-		retval = wait_event_interruptible(board->wait,
-						  ((event_status =
-						    read_and_clear_event_status(board)) &
-						   TERMINAL_COUNT_EVENT_BIT) ||
-						  test_bit(DEV_CLEAR_BN, &tms_priv->state) ||
-						  test_bit(TIMO_NUM, &board->status));
-		outb(0, hp_priv->iobase[3] + BUFFER_CONTROL_REG);
-		*bytes_written += block_size - read_transfer_counter(hp_priv);
-		if (retval) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		if (test_bit(TIMO_NUM, &board->status))	{
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &tms_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-	}
-	if (retval)
-		return retval;
-	if (send_eoi) {
-		size_t num_bytes;
-
-		retval = hp_82341_write(board, buffer + fifo_xfer_len, 1, 1, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static int hp_82341_attach(struct gpib_board *board, const struct gpib_board_config *config);
-
-static void hp_82341_detach(struct gpib_board *board);
-
-// wrappers for interface functions
-static int hp_82341_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-			 size_t *bytes_read)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_read(board, &priv->tms9914_priv, buffer, length, end, bytes_read);
-}
-
-static int hp_82341_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-			  size_t *bytes_written)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_write(board, &priv->tms9914_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int hp_82341_command(struct gpib_board *board, u8 *buffer, size_t length,
-			    size_t *bytes_written)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_command(board, &priv->tms9914_priv, buffer, length, bytes_written);
-}
-
-static int hp_82341_take_control(struct gpib_board *board, int synchronous)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_take_control(board, &priv->tms9914_priv, synchronous);
-}
-
-static int hp_82341_go_to_standby(struct gpib_board *board)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_go_to_standby(board, &priv->tms9914_priv);
-}
-
-static int hp_82341_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	if (request_control)
-		priv->mode_control_bits |= SYSTEM_CONTROLLER_BIT;
-	else
-		priv->mode_control_bits &= ~SYSTEM_CONTROLLER_BIT;
-	outb(priv->mode_control_bits, priv->iobase[0] + MODE_CONTROL_STATUS_REG);
-	return tms9914_request_system_control(board, &priv->tms9914_priv, request_control);
-}
-
-static void hp_82341_interface_clear(struct gpib_board *board, int assert)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_interface_clear(board, &priv->tms9914_priv, assert);
-}
-
-static void hp_82341_remote_enable(struct gpib_board *board, int enable)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_remote_enable(board, &priv->tms9914_priv, enable);
-}
-
-static int hp_82341_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_enable_eos(board, &priv->tms9914_priv, eos_byte, compare_8_bits);
-}
-
-static void hp_82341_disable_eos(struct gpib_board *board)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_disable_eos(board, &priv->tms9914_priv);
-}
-
-static unsigned int hp_82341_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_update_status(board, &priv->tms9914_priv, clear_mask);
-}
-
-static int hp_82341_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_primary_address(board, &priv->tms9914_priv, address);
-}
-
-static int hp_82341_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_secondary_address(board, &priv->tms9914_priv, address, enable);
-}
-
-static int hp_82341_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_parallel_poll(board, &priv->tms9914_priv, result);
-}
-
-static void hp_82341_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_configure(board, &priv->tms9914_priv, config);
-}
-
-static void hp_82341_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_parallel_poll_response(board, &priv->tms9914_priv, ist);
-}
-
-static void hp_82341_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_serial_poll_response(board, &priv->tms9914_priv, status);
-}
-
-static u8 hp_82341_serial_poll_status(struct gpib_board *board)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_serial_poll_status(board, &priv->tms9914_priv);
-}
-
-static int hp_82341_line_status(const struct gpib_board *board)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_line_status(board, &priv->tms9914_priv);
-}
-
-static int hp_82341_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	return tms9914_t1_delay(board, &priv->tms9914_priv, nano_sec);
-}
-
-static void hp_82341_return_to_local(struct gpib_board *board)
-{
-	struct hp_82341_priv *priv = board->private_data;
-
-	tms9914_return_to_local(board, &priv->tms9914_priv);
-}
-
-static struct gpib_interface hp_82341_unaccel_interface = {
-	.name = "hp_82341_unaccel",
-	.attach = hp_82341_attach,
-	.detach = hp_82341_detach,
-	.read = hp_82341_read,
-	.write = hp_82341_write,
-	.command = hp_82341_command,
-	.request_system_control = hp_82341_request_system_control,
-	.take_control = hp_82341_take_control,
-	.go_to_standby = hp_82341_go_to_standby,
-	.interface_clear = hp_82341_interface_clear,
-	.remote_enable = hp_82341_remote_enable,
-	.enable_eos = hp_82341_enable_eos,
-	.disable_eos = hp_82341_disable_eos,
-	.parallel_poll = hp_82341_parallel_poll,
-	.parallel_poll_configure = hp_82341_parallel_poll_configure,
-	.parallel_poll_response = hp_82341_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = hp_82341_line_status,
-	.update_status = hp_82341_update_status,
-	.primary_address = hp_82341_primary_address,
-	.secondary_address = hp_82341_secondary_address,
-	.serial_poll_response = hp_82341_serial_poll_response,
-	.serial_poll_status = hp_82341_serial_poll_status,
-	.t1_delay = hp_82341_t1_delay,
-	.return_to_local = hp_82341_return_to_local,
-};
-
-static struct gpib_interface hp_82341_interface = {
-	.name = "hp_82341",
-	.attach = hp_82341_attach,
-	.detach = hp_82341_detach,
-	.read = hp_82341_accel_read,
-	.write = hp_82341_accel_write,
-	.command = hp_82341_command,
-	.request_system_control = hp_82341_request_system_control,
-	.take_control = hp_82341_take_control,
-	.go_to_standby = hp_82341_go_to_standby,
-	.interface_clear = hp_82341_interface_clear,
-	.remote_enable = hp_82341_remote_enable,
-	.enable_eos = hp_82341_enable_eos,
-	.disable_eos = hp_82341_disable_eos,
-	.parallel_poll = hp_82341_parallel_poll,
-	.parallel_poll_configure = hp_82341_parallel_poll_configure,
-	.parallel_poll_response = hp_82341_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = hp_82341_line_status,
-	.update_status = hp_82341_update_status,
-	.primary_address = hp_82341_primary_address,
-	.secondary_address = hp_82341_secondary_address,
-	.serial_poll_response = hp_82341_serial_poll_response,
-	.t1_delay = hp_82341_t1_delay,
-	.return_to_local = hp_82341_return_to_local,
-};
-
-static int hp_82341_allocate_private(struct gpib_board *board)
-{
-	board->private_data = kzalloc(sizeof(struct hp_82341_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-	return 0;
-}
-
-static void hp_82341_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static u8 hp_82341_read_byte(struct tms9914_priv *priv, unsigned int register_num)
-{
-	return inb(priv->iobase + register_num);
-}
-
-static void hp_82341_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
-{
-	outb(data, priv->iobase + register_num);
-}
-
-static int hp_82341_find_isapnp_board(struct pnp_dev **dev)
-{
-	*dev = pnp_find_dev(NULL, ISAPNP_VENDOR('H', 'W', 'P'),
-			    ISAPNP_FUNCTION(0x1411), NULL);
-	if (!*dev || !(*dev)->card) {
-		pr_err("failed to find isapnp board\n");
-		return -ENODEV;
-	}
-	if (pnp_device_attach(*dev) < 0) {
-		pr_err("board already active, skipping\n");
-		return -EBUSY;
-	}
-	if (pnp_activate_dev(*dev) < 0) {
-		pnp_device_detach(*dev);
-		pr_err("failed to activate(), aborting\n");
-		return -EAGAIN;
-	}
-	if (!pnp_port_valid(*dev, 0) || !pnp_irq_valid(*dev, 0)) {
-		pnp_device_detach(*dev);
-		pr_err("invalid port or irq, aborting\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static int xilinx_ready(struct hp_82341_priv *hp_priv)
-{
-	switch (hp_priv->hw_version) {
-	case HW_VERSION_82341C:
-		if (inb(hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG) & XILINX_READY_BIT)
-			return 1;
-		else
-			return 0;
-		break;
-	case HW_VERSION_82341D:
-		if (isapnp_read_byte(PIO_DATA_REG) & HP_82341D_XILINX_READY_BIT)
-			return 1;
-		else
-			return 0;
-	default:
-		pr_err("bug! unknown hw_version\n");
-		break;
-	}
-	return 0;
-}
-
-static int xilinx_done(struct hp_82341_priv *hp_priv)
-{
-	switch (hp_priv->hw_version) {
-	case HW_VERSION_82341C:
-		if (inb(hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG) & DONE_PGL_BIT)
-			return 1;
-		else
-			return 0;
-	case HW_VERSION_82341D:
-		if (isapnp_read_byte(PIO_DATA_REG) & HP_82341D_XILINX_DONE_BIT)
-			return 1;
-		else
-			return 0;
-	default:
-		pr_err("bug! unknown hw_version\n");
-		break;
-	}
-	return 0;
-}
-
-static int irq_valid(struct hp_82341_priv *hp_priv, int irq)
-{
-	switch (hp_priv->hw_version) {
-	case HW_VERSION_82341C:
-		switch (irq) {
-		case 3:
-		case 5:
-		case 7:
-		case 9:
-		case 10:
-		case 11:
-		case 12:
-		case 15:
-			return 1;
-		default:
-			pr_err("invalid irq=%i for 82341C, irq must be 3, 5, 7, 9, 10, 11, 12, or 15.\n",
-			       irq);
-			return 0;
-		}
-		break;
-	case HW_VERSION_82341D:
-		return 1;
-	default:
-		pr_err("bug! unknown hw_version\n");
-		break;
-	}
-	return 0;
-}
-
-static int hp_82341_load_firmware_array(struct hp_82341_priv *hp_priv,
-					const unsigned char *firmware_data,
-					unsigned int firmware_length)
-{
-	int i, j;
-	static const int timeout = 100;
-
-	for (i = 0; i < firmware_length; ++i) {
-		for (j = 0; j < timeout; ++j) {
-			if (need_resched())
-				schedule();
-			if (xilinx_ready(hp_priv))
-				break;
-			usleep_range(10, 15);
-		}
-		if (j == timeout) {
-			pr_err("timed out waiting for Xilinx ready.\n");
-			return -ETIMEDOUT;
-		}
-		outb(firmware_data[i], hp_priv->iobase[0] + XILINX_DATA_REG);
-	}
-	for (j = 0; j < timeout; ++j) {
-		if (xilinx_done(hp_priv))
-			break;
-		if (need_resched())
-			schedule();
-		usleep_range(10, 15);
-	}
-	if (j == timeout) {
-		pr_err("timed out waiting for Xilinx done.\n");
-		return -ETIMEDOUT;
-	}
-	return 0;
-}
-
-static int hp_82341_load_firmware(struct hp_82341_priv *hp_priv,
-				  const struct gpib_board_config *config)
-{
-	if (config->init_data_length == 0) {
-		if (xilinx_done(hp_priv))
-			return 0;
-		pr_err("board needs be initialized with firmware upload.\n"
-		       "\tUse the --init-data option of gpib_config.\n");
-		return -EINVAL;
-	}
-	switch (hp_priv->hw_version) {
-	case HW_VERSION_82341C:
-		if (config->init_data_length != hp_82341c_firmware_length) {
-			pr_err("bad firmware length=%i for 82341c (expected %i).\n",
-			       config->init_data_length, hp_82341c_firmware_length);
-			return -EINVAL;
-		}
-		break;
-	case HW_VERSION_82341D:
-		if (config->init_data_length != hp_82341d_firmware_length) {
-			pr_err("bad firmware length=%i for 82341d (expected %i).\n",
-			       config->init_data_length, hp_82341d_firmware_length);
-			return -EINVAL;
-		}
-		break;
-	default:
-		pr_err("bug! unknown hw_version\n");
-		break;
-	}
-	return hp_82341_load_firmware_array(hp_priv, config->init_data, config->init_data_length);
-}
-
-static void set_xilinx_not_prog(struct hp_82341_priv *hp_priv, int assert)
-{
-	switch (hp_priv->hw_version) {
-	case HW_VERSION_82341C:
-		if (assert)
-			hp_priv->config_control_bits |= DONE_PGL_BIT;
-		else
-			hp_priv->config_control_bits &= ~DONE_PGL_BIT;
-		outb(hp_priv->config_control_bits, hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG);
-		break;
-	case HW_VERSION_82341D:
-		if (assert)
-			isapnp_write_byte(PIO_DATA_REG, HP_82341D_NOT_PROG_BIT);
-		else
-			isapnp_write_byte(PIO_DATA_REG, 0x0);
-		break;
-	default:
-		break;
-	}
-}
-
-// clear xilinx firmware
-static int clear_xilinx(struct hp_82341_priv *hp_priv)
-{
-	set_xilinx_not_prog(hp_priv, 1);
-	if (msleep_interruptible(1))
-		return -EINTR;
-	set_xilinx_not_prog(hp_priv, 0);
-	if (msleep_interruptible(1))
-		return -EINTR;
-	set_xilinx_not_prog(hp_priv, 1);
-	if (msleep_interruptible(1))
-		return -EINTR;
-	return 0;
-}
-
-static int hp_82341_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct hp_82341_priv *hp_priv;
-	struct tms9914_priv *tms_priv;
-	u32 start_addr;
-	u32 iobase;
-	int irq;
-	int i;
-	int retval;
-
-	board->status = 0;
-	if (hp_82341_allocate_private(board))
-		return -ENOMEM;
-	hp_priv = board->private_data;
-	tms_priv = &hp_priv->tms9914_priv;
-	tms_priv->read_byte = hp_82341_read_byte;
-	tms_priv->write_byte = hp_82341_write_byte;
-	tms_priv->offset = 1;
-
-	if (config->ibbase == 0) {
-		struct pnp_dev *dev;
-		int retval = hp_82341_find_isapnp_board(&dev);
-
-		if (retval < 0)
-			return retval;
-		hp_priv->pnp_dev = dev;
-		iobase = pnp_port_start(dev, 0);
-		irq = pnp_irq(dev, 0);
-		hp_priv->hw_version = HW_VERSION_82341D;
-		hp_priv->io_region_offset = 0x8;
-	} else {
-		iobase = config->ibbase;
-		irq = config->ibirq;
-		hp_priv->hw_version = HW_VERSION_82341C;
-		hp_priv->io_region_offset = 0x400;
-	}
-	for (i = 0; i < hp_82341_num_io_regions; ++i) {
-		start_addr = iobase + i * hp_priv->io_region_offset;
-		if (!request_region(start_addr, hp_82341_region_iosize, DRV_NAME)) {
-			dev_err(board->gpib_dev, "failed to allocate io ports 0x%x-0x%x\n",
-				start_addr,
-				start_addr + hp_82341_region_iosize - 1);
-			return -EIO;
-		}
-		hp_priv->iobase[i] = start_addr;
-	}
-	tms_priv->iobase = hp_priv->iobase[2];
-	if (hp_priv->hw_version == HW_VERSION_82341D) {
-		retval = isapnp_cfg_begin(hp_priv->pnp_dev->card->number,
-					  hp_priv->pnp_dev->number);
-		if (retval < 0)	{
-			dev_err(board->gpib_dev, "isapnp_cfg_begin returned error\n");
-			return retval;
-		}
-		isapnp_write_byte(PIO_DIRECTION_REG, HP_82341D_XILINX_READY_BIT |
-				  HP_82341D_XILINX_DONE_BIT);
-	}
-	retval = clear_xilinx(hp_priv);
-	if (retval < 0)
-		return retval;
-	retval = hp_82341_load_firmware(hp_priv, config);
-	if (hp_priv->hw_version == HW_VERSION_82341D)
-		isapnp_cfg_end();
-	if (retval < 0)
-		return retval;
-	if (irq_valid(hp_priv, irq) == 0)
-		return -EINVAL;
-	if (request_irq(irq, hp_82341_interrupt, 0, DRV_NAME, board))	{
-		dev_err(board->gpib_dev, "failed to allocate IRQ %d\n", irq);
-		return -EIO;
-	}
-	hp_priv->irq = irq;
-	hp_priv->config_control_bits &= ~IRQ_SELECT_MASK;
-	hp_priv->config_control_bits |= IRQ_SELECT_BITS(irq);
-	outb(hp_priv->config_control_bits, hp_priv->iobase[0] + CONFIG_CONTROL_STATUS_REG);
-	hp_priv->mode_control_bits |= ENABLE_IRQ_CONFIG_BIT;
-	outb(hp_priv->mode_control_bits, hp_priv->iobase[0] + MODE_CONTROL_STATUS_REG);
-	tms9914_board_reset(tms_priv);
-	outb(ENABLE_BUFFER_END_EVENT_BIT | ENABLE_TERMINAL_COUNT_EVENT_BIT |
-	     ENABLE_TI_INTERRUPT_EVENT_BIT, hp_priv->iobase[0] +  EVENT_ENABLE_REG);
-	outb(ENABLE_BUFFER_END_INTERRUPT_BIT | ENABLE_TERMINAL_COUNT_INTERRUPT_BIT |
-	     ENABLE_TI_INTERRUPT_BIT, hp_priv->iobase[0] + INTERRUPT_ENABLE_REG);
-	// write clear event register
-	outb((TI_INTERRUPT_EVENT_BIT | POINTERS_EQUAL_EVENT_BIT |
-	      BUFFER_END_EVENT_BIT | TERMINAL_COUNT_EVENT_BIT),
-	     hp_priv->iobase[0] + EVENT_STATUS_REG);
-
-	tms9914_online(board, tms_priv);
-
-	return 0;
-}
-
-static void hp_82341_detach(struct gpib_board *board)
-{
-	struct hp_82341_priv *hp_priv = board->private_data;
-	struct tms9914_priv *tms_priv;
-	int i;
-
-	if (hp_priv) {
-		tms_priv = &hp_priv->tms9914_priv;
-		if (hp_priv->iobase[0])	{
-			outb(0, hp_priv->iobase[0] + INTERRUPT_ENABLE_REG);
-			if (tms_priv->iobase)
-				tms9914_board_reset(tms_priv);
-			if (hp_priv->irq)
-				free_irq(hp_priv->irq, board);
-		}
-		for (i = 0; i < hp_82341_num_io_regions; ++i) {
-			if (hp_priv->iobase[i])
-				release_region(hp_priv->iobase[i], hp_82341_region_iosize);
-		}
-		if (hp_priv->pnp_dev)
-			pnp_device_detach(hp_priv->pnp_dev);
-	}
-	hp_82341_free_private(board);
-}
-
-#if 0
-/* unused, will be needed when the driver is turned into a pnp_driver */
-static const struct pnp_device_id hp_82341_pnp_table[] = {
-	{.id = "HWP1411"},
-	{.id = ""}
-};
-MODULE_DEVICE_TABLE(pnp, hp_82341_pnp_table);
-#endif
-
-static int __init hp_82341_init_module(void)
-{
-	int ret;
-
-	ret = gpib_register_driver(&hp_82341_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&hp_82341_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		gpib_unregister_driver(&hp_82341_unaccel_interface);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void __exit hp_82341_exit_module(void)
-{
-	gpib_unregister_driver(&hp_82341_interface);
-	gpib_unregister_driver(&hp_82341_unaccel_interface);
-}
-
-module_init(hp_82341_init_module);
-module_exit(hp_82341_exit_module);
-
-/*
- * GPIB interrupt service routines
- */
-static unsigned short read_and_clear_event_status(struct gpib_board *board)
-{
-	struct hp_82341_priv *hp_priv = board->private_data;
-	unsigned long flags;
-	unsigned short status;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	status = hp_priv->event_status_bits;
-	hp_priv->event_status_bits = 0;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return status;
-}
-
-static irqreturn_t hp_82341_interrupt(int irq, void *arg)
-{
-	int status1, status2;
-	struct gpib_board *board = arg;
-	struct hp_82341_priv *hp_priv = board->private_data;
-	struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv;
-	unsigned long flags;
-	irqreturn_t retval = IRQ_NONE;
-	int event_status;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	event_status = inb(hp_priv->iobase[0] + EVENT_STATUS_REG);
-	if (event_status & INTERRUPT_PENDING_EVENT_BIT)
-		retval = IRQ_HANDLED;
-	// write-clear status bits
-	if (event_status & (TI_INTERRUPT_EVENT_BIT | POINTERS_EQUAL_EVENT_BIT |
-			    BUFFER_END_EVENT_BIT | TERMINAL_COUNT_EVENT_BIT)) {
-		outb(event_status & (TI_INTERRUPT_EVENT_BIT | POINTERS_EQUAL_EVENT_BIT |
-				     BUFFER_END_EVENT_BIT | TERMINAL_COUNT_EVENT_BIT),
-		     hp_priv->iobase[0] + EVENT_STATUS_REG);
-		hp_priv->event_status_bits |= event_status;
-	}
-	if (event_status & TI_INTERRUPT_EVENT_BIT) {
-		status1 = read_byte(tms_priv, ISR0);
-		status2 = read_byte(tms_priv, ISR1);
-		tms9914_interrupt_have_status(board, tms_priv, status1, status2);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-static int read_transfer_counter(struct hp_82341_priv *hp_priv)
-{
-	int lo, mid, value;
-
-	lo = inb(hp_priv->iobase[1] + TRANSFER_COUNT_LOW_REG);
-	mid = inb(hp_priv->iobase[1] + TRANSFER_COUNT_MID_REG);
-	value = (lo & 0xff) | ((mid << 8) & 0x7f00);
-	value = ~(value - 1) & 0x7fff;
-	return value;
-}
-
-static void set_transfer_counter(struct hp_82341_priv *hp_priv, int count)
-{
-	int complement = -count;
-
-	outb(complement & 0xff, hp_priv->iobase[1] + TRANSFER_COUNT_LOW_REG);
-	outb((complement >> 8) & 0xff, hp_priv->iobase[1] + TRANSFER_COUNT_MID_REG);
-	// I don't think the hi count reg is even used, but oh well
-	outb((complement >> 16) & 0xf, hp_priv->iobase[1] + TRANSFER_COUNT_HIGH_REG);
-}
-
diff --git a/drivers/staging/gpib/hp_82341/hp_82341.h b/drivers/staging/gpib/hp_82341/hp_82341.h
deleted file mode 100644
index 859ef2899acb..000000000000
--- a/drivers/staging/gpib/hp_82341/hp_82341.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002, 2005 by Frank Mori Hess             *
- ***************************************************************************/
-
-#include "tms9914.h"
-#include "gpibP.h"
-
-enum hp_82341_hardware_version {
-	HW_VERSION_UNKNOWN,
-	HW_VERSION_82341C,
-	HW_VERSION_82341D,
-};
-
-// struct which defines private_data for board
-struct hp_82341_priv {
-	struct tms9914_priv tms9914_priv;
-	unsigned int irq;
-	unsigned short config_control_bits;
-	unsigned short mode_control_bits;
-	unsigned short event_status_bits;
-	struct pnp_dev *pnp_dev;
-	unsigned long iobase[4];
-	unsigned long io_region_offset;
-	enum hp_82341_hardware_version hw_version;
-};
-
-static const int hp_82341_region_iosize = 0x8;
-static const int hp_82341_num_io_regions = 4;
-static const int hp_82341_fifo_size = 0xffe;
-static const int hp_82341c_firmware_length = 5764;
-static const int hp_82341d_firmware_length = 5302;
-
-// hp 82341 register offsets
-enum hp_82341_region_0_registers {
-	CONFIG_CONTROL_STATUS_REG = 0x0,
-	MODE_CONTROL_STATUS_REG = 0x1,
-	MONITOR_REG = 0x2,	// after initialization
-	XILINX_DATA_REG = 0x2,	// before initialization, write only
-	INTERRUPT_ENABLE_REG = 0x3,
-	EVENT_STATUS_REG = 0x4,
-	EVENT_ENABLE_REG = 0x5,
-	STREAM_STATUS_REG = 0x7,
-};
-
-enum hp_82341_region_1_registers {
-	ID0_REG = 0x2,
-	ID1_REG = 0x3,
-	TRANSFER_COUNT_LOW_REG = 0x4,
-	TRANSFER_COUNT_MID_REG = 0x5,
-	TRANSFER_COUNT_HIGH_REG = 0x6,
-};
-
-enum hp_82341_region_3_registers {
-	BUFFER_PORT_LOW_REG = 0x0,
-	BUFFER_PORT_HIGH_REG = 0x1,
-	ID2_REG = 0x2,
-	ID3_REG = 0x3,
-	BUFFER_FLUSH_REG = 0x4,
-	BUFFER_CONTROL_REG = 0x7
-};
-
-enum config_control_status_bits {
-	IRQ_SELECT_MASK = 0x7,
-	DMA_CONFIG_MASK = 0x18,
-	ENABLE_DMA_CONFIG_BIT = 0x20,
-	XILINX_READY_BIT = 0x40,	// read only
-	DONE_PGL_BIT = 0x80
-};
-
-static inline unsigned int IRQ_SELECT_BITS(int irq)
-{
-	switch (irq) {
-	case 3:
-		return 0x3;
-	case 5:
-		return 0x2;
-	case 7:
-		return 0x1;
-	case 9:
-		return 0x0;
-	case 10:
-		return 0x7;
-	case 11:
-		return 0x6;
-	case 12:
-		return 0x5;
-	case 15:
-		return 0x4;
-	default:
-		return 0x0;
-	}
-};
-
-enum mode_control_status_bits {
-	SLOT8_BIT = 0x1,		// read only
-	ACTIVE_CONTROLLER_BIT = 0x2,	// read only
-	ENABLE_DMA_BIT = 0x4,
-	SYSTEM_CONTROLLER_BIT = 0x8,
-	MONITOR_BIT = 0x10,
-	ENABLE_IRQ_CONFIG_BIT = 0x20,
-	ENABLE_TI_STREAM_BIT = 0x40
-};
-
-enum monitor_bits {
-	MONITOR_INTERRUPT_PENDING_BIT = 0x1,	// read only
-	MONITOR_CLEAR_HOLDOFF_BIT = 0x2,	// write only
-	MONITOR_PPOLL_BIT = 0x4,		// write clear
-	MONITOR_SRQ_BIT = 0x8,			// write clear
-	MONITOR_IFC_BIT = 0x10,			// write clear
-	MONITOR_REN_BIT = 0x20,			// write clear
-	MONITOR_END_BIT = 0x40,			// write clear
-	MONITOR_DAV_BIT = 0x80			// write clear
-};
-
-enum interrupt_enable_bits {
-	ENABLE_TI_INTERRUPT_BIT = 0x1,
-	ENABLE_POINTERS_EQUAL_INTERRUPT_BIT = 0x4,
-	ENABLE_BUFFER_END_INTERRUPT_BIT = 0x10,
-	ENABLE_TERMINAL_COUNT_INTERRUPT_BIT = 0x20,
-	ENABLE_DMA_TERMINAL_COUNT_INTERRUPT_BIT = 0x80,
-};
-
-enum event_status_bits {
-	TI_INTERRUPT_EVENT_BIT = 0x1,		// write clear
-	INTERRUPT_PENDING_EVENT_BIT = 0x2,	// read only
-	POINTERS_EQUAL_EVENT_BIT = 0x4,		// write clear
-	BUFFER_END_EVENT_BIT = 0x10,		// write clear
-	TERMINAL_COUNT_EVENT_BIT = 0x20,	// write clear
-	DMA_TERMINAL_COUNT_EVENT_BIT = 0x80,	// write clear
-};
-
-enum event_enable_bits {
-	ENABLE_TI_INTERRUPT_EVENT_BIT = 0x1,		// write clear
-	ENABLE_POINTERS_EQUAL_EVENT_BIT = 0x4,		// write clear
-	ENABLE_BUFFER_END_EVENT_BIT = 0x10,		// write clear
-	ENABLE_TERMINAL_COUNT_EVENT_BIT = 0x20,		// write clear
-	ENABLE_DMA_TERMINAL_COUNT_EVENT_BIT = 0x80,	// write clear
-};
-
-enum stream_status_bits {
-	HALTED_STATUS_BIT = 0x1,	// read
-	RESTART_STREAM_BIT = 0x1	// write
-};
-
-enum buffer_control_bits {
-	DIRECTION_GPIB_TO_HOST_BIT = 0x20,	// transfer direction (set for gpib to host)
-	ENABLE_TI_BUFFER_BIT = 0x40,		// enable fifo
-	FAST_WR_EN_BIT = 0x80,			// 350 ns t1 delay?
-};
-
-// registers accessible through isapnp chip on 82341d
-enum hp_82341d_pnp_registers {
-	PIO_DATA_REG = 0x20,		// read/write pio data lines
-	PIO_DIRECTION_REG = 0x21,	// set pio data line directions (set for input)
-};
-
-enum hp_82341d_pnp_pio_bits {
-	HP_82341D_XILINX_READY_BIT = 0x1,
-	HP_82341D_XILINX_DONE_BIT = 0x2,
-	// use register layout compatible with C and older versions instead of 32 contiguous ioports
-	HP_82341D_LEGACY_MODE_BIT = 0x4,
-	HP_82341D_NOT_PROG_BIT = 0x8,	// clear to reinitialize xilinx
-};
diff --git a/drivers/staging/gpib/include/amcc5920.h b/drivers/staging/gpib/include/amcc5920.h
deleted file mode 100644
index 7a88bd282feb..000000000000
--- a/drivers/staging/gpib/include/amcc5920.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *  Header for amcc5920 pci chip
- *
- *   copyright		  : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-// plx pci chip registers and bits
-enum amcc_registers {
-	AMCC_INTCS_REG = 0x38,
-	AMCC_PASS_THRU_REG	= 0x60,
-};
-
-enum amcc_incsr_bits {
-	AMCC_ADDON_INTR_ENABLE_BIT = 0x2000,
-	AMCC_ADDON_INTR_ACTIVE_BIT = 0x400000,
-	AMCC_INTR_ACTIVE_BIT = 0x800000,
-};
-
-static const int bits_per_region = 8;
-
-static inline uint32_t amcc_wait_state_bits(unsigned int region, unsigned int num_wait_states)
-{
-	return (num_wait_states & 0x7) << (--region * bits_per_region);
-};
-
-enum amcc_prefetch_bits {
-	PREFETCH_DISABLED = 0x0,
-	PREFETCH_SMALL = 0x8,
-	PREFETCH_MEDIUM = 0x10,
-	PREFETCH_LARGE = 0x18,
-};
-
-static inline uint32_t amcc_prefetch_bits(unsigned int region, enum amcc_prefetch_bits prefetch)
-{
-	return prefetch << (--region * bits_per_region);
-};
-
-static inline uint32_t amcc_PTADR_mode_bit(unsigned int region)
-{
-	return 0x80 << (--region * bits_per_region);
-};
-
-static inline uint32_t amcc_disable_write_fifo_bit(unsigned int region)
-{
-	return 0x20 << (--region * bits_per_region);
-};
-
diff --git a/drivers/staging/gpib/include/amccs5933.h b/drivers/staging/gpib/include/amccs5933.h
deleted file mode 100644
index d7f63c795096..000000000000
--- a/drivers/staging/gpib/include/amccs5933.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- * Registers and bits for amccs5933 pci chip
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-// register offsets
-enum {
-	MBEF_REG = 0x34,	// mailbux empty/full
-	INTCSR_REG = 0x38,	// interrupt control and status
-	BMCSR_REG = 0x3c,	// bus master control and status
-};
-
-// incoming mailbox 0-3  register offsets
-extern inline int INCOMING_MAILBOX_REG(unsigned int mailbox)
-{
-	return (0x10 + 4 * mailbox);
-};
-
-// bit definitions
-
-// INTCSR bits
-enum {
-	OUTBOX_EMPTY_INTR_BIT = 0x10,	// enable outbox empty interrupt
-	INBOX_FULL_INTR_BIT = 0x1000,	// enable inbox full interrupt
-	INBOX_INTR_CS_BIT = 0x20000,	// read, or write clear inbox full interrupt
-	INTR_ASSERTED_BIT = 0x800000,	// read only, interrupt asserted
-};
-
-// select byte 0 to 3 of incoming mailbox
-extern inline int INBOX_BYTE_BITS(unsigned int byte)
-{
-	return (byte & 0x3) << 8;
-};
-
-// select incoming mailbox 0 to 3
-extern inline int INBOX_SELECT_BITS(unsigned int mailbox)
-{
-	return (mailbox & 0x3) << 10;
-};
-
-// select byte 0 to 3 of outgoing mailbox
-extern inline int OUTBOX_BYTE_BITS(unsigned int byte)
-{
-	return (byte & 0x3);
-};
-
-// select outgoing mailbox 0 to 3
-extern inline int OUTBOX_SELECT_BITS(unsigned int mailbox)
-{
-	return (mailbox & 0x3) << 2;
-};
-
-// BMCSR bits
-enum {
-	MBOX_FLAGS_RESET_BIT = 0x08000000,	// resets mailbox empty/full flags
-};
-
diff --git a/drivers/staging/gpib/include/gpibP.h b/drivers/staging/gpib/include/gpibP.h
deleted file mode 100644
index 1b27f37e0ba0..000000000000
--- a/drivers/staging/gpib/include/gpibP.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright		   : (C) 2002,2003 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_P_H
-#define _GPIB_P_H
-
-#include <linux/types.h>
-
-#include "gpib_types.h"
-#include "gpib_proto.h"
-#include "gpib_cmd.h"
-#include "gpib.h"
-#include "gpib_ioctl.h"
-
-#include <linux/fs.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-int gpib_register_driver(struct gpib_interface *interface, struct module *mod);
-void gpib_unregister_driver(struct gpib_interface *interface);
-struct pci_dev *gpib_pci_get_device(const struct gpib_board_config *config, unsigned int vendor_id,
-				    unsigned int device_id, struct pci_dev *from);
-struct pci_dev *gpib_pci_get_subsys(const struct gpib_board_config *config, unsigned int vendor_id,
-				    unsigned int device_id, unsigned int ss_vendor,
-				    unsigned int ss_device, struct pci_dev *from);
-unsigned int num_gpib_events(const struct gpib_event_queue *queue);
-int push_gpib_event(struct gpib_board *board, short event_type);
-int pop_gpib_event(struct gpib_board *board, struct gpib_event_queue *queue, short *event_type);
-int gpib_request_pseudo_irq(struct gpib_board *board, irqreturn_t (*handler)(int, void *));
-void gpib_free_pseudo_irq(struct gpib_board *board);
-int gpib_match_device_path(struct device *dev, const char *device_path_in);
-
-extern struct gpib_board board_array[GPIB_MAX_NUM_BOARDS];
-
-extern struct list_head registered_drivers;
-
-#endif	// _GPIB_P_H
-
diff --git a/drivers/staging/gpib/include/gpib_cmd.h b/drivers/staging/gpib/include/gpib_cmd.h
deleted file mode 100644
index 9e96a3bfa22d..000000000000
--- a/drivers/staging/gpib/include/gpib_cmd.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _GPIB_CMD_H
-#define _GPIB_CMD_H
-
-#include <linux/types.h>
-
-/* Command byte definitions tests and functions */
-
-/* mask of bits that actually matter in a command byte */
-enum {
-	gpib_command_mask = 0x7f,
-};
-
-/* Possible GPIB command messages */
-
-enum cmd_byte {
-	GTL = 0x1,	/* go to local			*/
-	SDC = 0x4,	/* selected device clear	*/
-	PP_CONFIG = 0x5,
-	GET = 0x8,	/* group execute trigger	*/
-	TCT = 0x9,	/* take control			*/
-	LLO = 0x11,	/* local lockout		*/
-	DCL = 0x14,	/* device clear			*/
-	PPU = 0x15,	/* parallel poll unconfigure	*/
-	SPE = 0x18,	/* serial poll enable		*/
-	SPD = 0x19,	/* serial poll disable		*/
-	CFE = 0x1f,     /* configure enable */
-	LAD = 0x20,	/* value to be 'ored' in to obtain listen address */
-	UNL = 0x3F,	/* unlisten			*/
-	TAD = 0x40,	/* value to be 'ored' in to obtain talk address	  */
-	UNT = 0x5F,	/* untalk			*/
-	SAD = 0x60,	/* my secondary address (base) */
-	PPE = 0x60,	/* parallel poll enable (base)	*/
-	PPD = 0x70	/* parallel poll disable	*/
-};
-
-/* confine address to range 0 to 30. */
-static inline unsigned int gpib_address_restrict(u32 addr)
-{
-	addr &= 0x1f;
-	if (addr == 0x1f)
-		addr = 0;
-	return addr;
-}
-
-static inline u8 MLA(u32 addr)
-{
-	return gpib_address_restrict(addr) | LAD;
-}
-
-static inline u8 MTA(u32 addr)
-{
-	return gpib_address_restrict(addr) | TAD;
-}
-
-static inline u8 MSA(u32 addr)
-{
-	return (addr & 0x1f) | SAD;
-}
-
-static inline s32 gpib_address_equal(u32 pad1, s32 sad1, u32 pad2, s32 sad2)
-{
-	if (pad1 == pad2) {
-		if (sad1 == sad2)
-			return 1;
-		if (sad1 < 0 && sad2 < 0)
-			return 1;
-	}
-
-	return 0;
-}
-
-static inline s32 is_PPE(u8 command)
-{
-	return (command & 0x70) == 0x60;
-}
-
-static inline s32 is_PPD(u8 command)
-{
-	return (command & 0x70) == 0x70;
-}
-
-static inline s32 in_addressed_command_group(u8 command)
-{
-	return (command & 0x70) == 0x0;
-}
-
-static inline s32 in_universal_command_group(u8 command)
-{
-	return (command & 0x70) == 0x10;
-}
-
-static inline s32 in_listen_address_group(u8 command)
-{
-	return (command & 0x60) == 0x20;
-}
-
-static inline s32 in_talk_address_group(u8 command)
-{
-	return (command & 0x60) == 0x40;
-}
-
-static inline s32 in_primary_command_group(u8 command)
-{
-	return in_addressed_command_group(command) ||
-		in_universal_command_group(command) ||
-		in_listen_address_group(command) ||
-		in_talk_address_group(command);
-}
-
-#endif /* _GPIB_CMD_H */
diff --git a/drivers/staging/gpib/include/gpib_pci_ids.h b/drivers/staging/gpib/include/gpib_pci_ids.h
deleted file mode 100644
index 52dcab07a7d1..000000000000
--- a/drivers/staging/gpib/include/gpib_pci_ids.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __GPIB_PCI_IDS_H
-#define __GPIB_PCI_IDS_H
-
-#ifndef PCI_VENDOR_ID_AMCC
-#define PCI_VENDOR_ID_AMCC	0x10e8
-#endif
-
-#ifndef PCI_VENDOR_ID_CBOARDS
-#define PCI_VENDOR_ID_CBOARDS	0x1307
-#endif
-
-#ifndef PCI_VENDOR_ID_QUANCOM
-#define PCI_VENDOR_ID_QUANCOM	0x8008
-#endif
-
-#ifndef PCI_DEVICE_ID_QUANCOM_GPIB
-#define PCI_DEVICE_ID_QUANCOM_GPIB	0x3302
-#endif
-
-#endif	// __GPIB_PCI_IDS_H
-
diff --git a/drivers/staging/gpib/include/gpib_proto.h b/drivers/staging/gpib/include/gpib_proto.h
deleted file mode 100644
index 42e736e3b7cd..000000000000
--- a/drivers/staging/gpib/include/gpib_proto.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef GPIB_PROTO_INCLUDED
-#define GPIB_PROTO_INCLUDED
-
-#include <linux/fs.h>
-
-int ibopen(struct inode *inode, struct file *filep);
-int ibclose(struct inode *inode, struct file *file);
-long ibioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-void os_start_timer(struct gpib_board *board, unsigned int usec_timeout);
-void os_remove_timer(struct gpib_board *board);
-void init_gpib_board(struct gpib_board *board);
-static inline unsigned long usec_to_jiffies(unsigned int usec)
-{
-	unsigned long usec_per_jiffy = 1000000 / HZ;
-
-	return 1 + (usec + usec_per_jiffy - 1) / usec_per_jiffy;
-};
-
-int serial_poll_all(struct gpib_board *board, unsigned int usec_timeout);
-void init_gpib_descriptor(struct gpib_descriptor *desc);
-int dvrsp(struct gpib_board *board, unsigned int pad, int sad,
-	  unsigned int usec_timeout, u8 *result);
-int ibcac(struct gpib_board *board, int sync, int fallback_to_async);
-int ibcmd(struct gpib_board *board, u8 *buf, size_t length, size_t *bytes_written);
-int ibgts(struct gpib_board *board);
-int ibonline(struct gpib_board *board);
-int iboffline(struct gpib_board *board);
-int iblines(const struct gpib_board *board, short *lines);
-int ibrd(struct gpib_board *board, u8 *buf, size_t length, int *end_flag, size_t *bytes_read);
-int ibrpp(struct gpib_board *board, u8 *buf);
-int ibrsv2(struct gpib_board *board, u8 status_byte, int new_reason_for_service);
-int ibrsc(struct gpib_board *board, int request_control);
-int ibsic(struct gpib_board *board, unsigned int usec_duration);
-int ibsre(struct gpib_board *board, int enable);
-int ibpad(struct gpib_board *board, unsigned int addr);
-int ibsad(struct gpib_board *board, int addr);
-int ibeos(struct gpib_board *board, int eos, int eosflags);
-int ibwait(struct gpib_board *board, int wait_mask, int clear_mask, int set_mask,
-	   int *status, unsigned long usec_timeout, struct gpib_descriptor *desc);
-int ibwrt(struct gpib_board *board, u8 *buf, size_t cnt, int send_eoi, size_t *bytes_written);
-int ibstatus(struct gpib_board *board);
-int general_ibstatus(struct gpib_board *board, const struct gpib_status_queue *device,
-		     int clear_mask, int set_mask, struct gpib_descriptor *desc);
-int io_timed_out(struct gpib_board *board);
-int ibppc(struct gpib_board *board, u8 configuration);
-
-#endif /* GPIB_PROTO_INCLUDED */
diff --git a/drivers/staging/gpib/include/gpib_state_machines.h b/drivers/staging/gpib/include/gpib_state_machines.h
deleted file mode 100644
index 7488c00f191e..000000000000
--- a/drivers/staging/gpib/include/gpib_state_machines.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2006 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_STATE_MACHINES_H
-#define _GPIB_STATE_MACHINES_H
-
-enum talker_function_state {
-	talker_idle,
-	talker_addressed,
-	talker_active,
-	serial_poll_active
-};
-
-enum listener_function_state {
-	listener_idle,
-	listener_addressed,
-	listener_active
-};
-
-#endif	// _GPIB_STATE_MACHINES_H
diff --git a/drivers/staging/gpib/include/gpib_types.h b/drivers/staging/gpib/include/gpib_types.h
deleted file mode 100644
index 998abb379749..000000000000
--- a/drivers/staging/gpib/include/gpib_types.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright		   : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_TYPES_H
-#define _GPIB_TYPES_H
-
-#ifdef __KERNEL__
-#include "gpib.h"
-#include <linux/atomic.h>
-#include <linux/device.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-
-struct gpib_board;
-
-/* config parameters that are only used by driver attach functions */
-struct gpib_board_config {
-	/* firmware blob */
-	void *init_data;
-	int init_data_length;
-	/* IO base address to use for non-pnp cards (set by core, driver should make local copy) */
-	u32 ibbase;
-	void __iomem *mmibbase;
-	/* IRQ to use for non-pnp cards (set by core, driver should make local copy) */
-	unsigned int ibirq;
-	/* dma channel to use for non-pnp cards (set by core, driver should make local copy) */
-	unsigned int ibdma;
-	/*
-	 * pci bus of card, useful for distinguishing multiple identical pci cards
-	 * (negative means don't care)
-	 */
-	int pci_bus;
-	/*
-	 * pci slot of card, useful for distinguishing multiple identical pci cards
-	 * (negative means don't care)
-	 */
-	int pci_slot;
-	/* sysfs device path of hardware to attach */
-	char *device_path;
-	/* serial number of hardware to attach */
-	char *serial_number;
-};
-
-/*
- * struct gpib_interface defines the interface
- * between the board-specific details dealt with in the drivers
- * and generic interface provided by gpib-common.
- * This really should be in a different header file.
- */
-struct gpib_interface {
-	/* name of board */
-	char *name;
-	/* attach() initializes board and allocates resources */
-	int (*attach)(struct gpib_board *board, const struct gpib_board_config *config);
-	/* detach() shuts down board and frees resources */
-	void (*detach)(struct gpib_board *board);
-	/*
-	 * read() should read at most 'length' bytes from the bus into
-	 * 'buffer'.  It should return when it fills the buffer or
-	 * encounters an END (EOI and or EOS if appropriate).  It should set 'end'
-	 * to be nonzero if the read was terminated by an END, otherwise 'end'
-	 * should be zero.
-	 * Ultimately, this will be changed into or replaced by an asynchronous
-	 * read.  Zero return value for success, negative
-	 * return indicates error.
-	 * nbytes returns number of bytes read
-	 */
-	int (*read)(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-		    size_t *bytes_read);
-	/*
-	 * write() should write 'length' bytes from buffer to the bus.
-	 * If the boolean value send_eoi is nonzero, then EOI should
-	 * be sent along with the last byte.  Returns number of bytes
-	 * written or negative value on error.
-	 */
-	int (*write)(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-		     size_t *bytes_written);
-	/*
-	 * command() writes the command bytes in 'buffer' to the bus
-	 * Returns zero on success or negative value on error.
-	 */
-	int (*command)(struct gpib_board *board, u8 *buffer, size_t length,
-		       size_t *bytes_written);
-	/*
-	 * Take control (assert ATN).  If 'asyncronous' is nonzero, take
-	 * control asyncronously (assert ATN immediately without waiting
-	 * for other processes to complete first).  Should not return
-	 * until board becomes controller in charge.  Returns zero no success,
-	 * nonzero on error.
-	 */
-	int (*take_control)(struct gpib_board *board, int asyncronous);
-	/*
-	 * De-assert ATN.  Returns zero on success, nonzer on error.
-	 */
-	int (*go_to_standby)(struct gpib_board *board);
-	/* request/release control of the IFC and REN lines (system controller) */
-	int (*request_system_control)(struct gpib_board *board, int request_control);
-	/*
-	 * Asserts or de-asserts 'interface clear' (IFC) depending on
-	 * boolean value of 'assert'
-	 */
-	void (*interface_clear)(struct gpib_board *board, int assert);
-	/*
-	 * Sends remote enable command if 'enable' is nonzero, disables remote mode
-	 * if 'enable' is zero
-	 */
-	void (*remote_enable)(struct gpib_board *board, int enable);
-	/*
-	 * enable END for reads, when byte 'eos' is received.  If
-	 * 'compare_8_bits' is nonzero, then all 8 bits are compared
-	 * with the eos bytes.	Otherwise only the 7 least significant
-	 * bits are compared.
-	 */
-	int (*enable_eos)(struct gpib_board *board, u8 eos, int compare_8_bits);
-	/* disable END on eos byte (END on EOI only)*/
-	void (*disable_eos)(struct gpib_board *board);
-	/* configure parallel poll */
-	void (*parallel_poll_configure)(struct gpib_board *board, u8 configuration);
-	/* conduct parallel poll */
-	int (*parallel_poll)(struct gpib_board *board, u8 *result);
-	/* set/clear ist (individual status bit) */
-	void (*parallel_poll_response)(struct gpib_board *board, int ist);
-	/* select local parallel poll configuration mode PP2 versus remote PP1 */
-	void (*local_parallel_poll_mode)(struct gpib_board *board, int local);
-	/*
-	 * Returns current status of the bus lines.  Should be set to
-	 * NULL if your board does not have the ability to query the
-	 * state of the bus lines.
-	 */
-	int (*line_status)(const struct gpib_board *board);
-	/*
-	 * updates and returns the board's current status.
-	 * The meaning of the bits are specified in gpib_user.h
-	 * in the IBSTA section.  The driver does not need to
-	 * worry about setting the CMPL, END, TIMO, or ERR bits.
-	 */
-	unsigned int (*update_status)(struct gpib_board *board, unsigned int clear_mask);
-	/*
-	 * Sets primary address 0-30 for gpib interface card.
-	 */
-	int (*primary_address)(struct gpib_board *board, unsigned int address);
-	/*
-	 * Sets and enables, or disables secondary address 0-30
-	 * for gpib interface card.
-	 */
-	int (*secondary_address)(struct gpib_board *board, unsigned int address,
-				 int enable);
-	/*
-	 * Sets the byte the board should send in response to a serial poll.
-	 * This function should also start or stop requests for service via
-	 * IEEE 488.2 reqt/reqf, based on MSS (bit 6 of the status_byte).
-	 * If the more flexible serial_poll_response2 is implemented by the
-	 * driver, then this method should be left NULL since it will not
-	 * be used.  This method can generate spurious service requests
-	 * which are allowed by IEEE 488.2, but not ideal.
-	 *
-	 * This method should implement the serial poll response method described
-	 * by IEEE 488.2 section 11.3.3.4.3 "Allowed Coupled Control of
-	 * STB, reqt, and reqf".
-	 */
-	void (*serial_poll_response)(struct gpib_board *board, u8 status_byte);
-	/*
-	 * Sets the byte the board should send in response to a serial poll.
-	 * This function should also request service via IEEE 488.2 reqt/reqf
-	 * based on MSS (bit 6 of the status_byte) and new_reason_for_service.
-	 * reqt should be set true if new_reason_for_service is true,
-	 * and reqf should be set true if MSS is false.	 This function
-	 * will never be called with MSS false and new_reason_for_service
-	 * true simultaneously, so don't worry about that case.
-	 *
-	 * This method implements the serial poll response method described
-	 * by IEEE 488.2 section 11.3.3.4.1 "Preferred Implementation".
-	 *
-	 * If this method is left NULL by the driver, then the user library
-	 * function ibrsv2 will not work.
-	 */
-	void (*serial_poll_response2)(struct gpib_board *board, u8 status_byte,
-				      int new_reason_for_service);
-	/*
-	 * returns the byte the board will send in response to a serial poll.
-	 */
-	u8 (*serial_poll_status)(struct gpib_board *board);
-	/* adjust T1 delay */
-	int (*t1_delay)(struct gpib_board *board, unsigned int nano_sec);
-	/* go to local mode */
-	void (*return_to_local)(struct gpib_board *board);
-	/* board does not support 7 bit eos comparisons */
-	unsigned no_7_bit_eos : 1;
-	/* skip check for listeners before trying to send command bytes */
-	unsigned skip_check_for_command_acceptors : 1;
-};
-
-struct gpib_event_queue {
-	struct list_head event_head;
-	spinlock_t lock; // for access to event list
-	unsigned int num_events;
-	unsigned dropped_event : 1;
-};
-
-static inline void init_event_queue(struct gpib_event_queue *queue)
-{
-	INIT_LIST_HEAD(&queue->event_head);
-	queue->num_events = 0;
-	queue->dropped_event = 0;
-	spin_lock_init(&queue->lock);
-}
-
-/* struct for supporting polling operation when irq is not available */
-struct gpib_pseudo_irq {
-	struct timer_list timer;
-	irqreturn_t (*handler)(int irq, void *arg);
-	struct gpib_board *board;
-	atomic_t active;
-};
-
-static inline void init_gpib_pseudo_irq(struct gpib_pseudo_irq *pseudo_irq)
-{
-	pseudo_irq->handler = NULL;
-	timer_setup(&pseudo_irq->timer, NULL, 0);
-	atomic_set(&pseudo_irq->active, 0);
-}
-
-/* list so we can make a linked list of drivers */
-struct gpib_interface_list {
-	struct list_head list;
-	struct gpib_interface *interface;
-	struct module *module;
-};
-
-/*
- * One struct gpib_board is allocated for each physical board in the computer.
- * It provides storage for variables local to each board, and interface
- * functions for performing operations on the board
- */
-struct gpib_board {
-	/* functions used by this board */
-	struct gpib_interface *interface;
-	/*
-	 * Pointer to module whose use count we should increment when
-	 * interface is in use
-	 */
-	struct module *provider_module;
-	/* buffer used to store read/write data for this board */
-	u8 *buffer;
-	/* length of buffer */
-	unsigned int buffer_length;
-	/*
-	 * Used to hold the board's current status (see update_status() above)
-	 */
-	unsigned long status;
-	/*
-	 * Driver should only sleep on this wait queue.	 It is special in that the
-	 * core will wake this queue and set the TIMO bit in 'status' when the
-	 * watchdog timer times out.
-	 */
-	wait_queue_head_t wait;
-	/*
-	 * Lock that only allows one process to access this board at a time.
-	 * Has to be first in any locking order, since it can be locked over
-	 * multiple ioctls.
-	 */
-	struct mutex user_mutex;
-	/*
-	 * Mutex which compensates for removal of "big kernel lock" from kernel.
-	 * Should not be held for extended waits.
-	 */
-	struct mutex big_gpib_mutex;
-	/* pid of last process to lock the board mutex */
-	pid_t locking_pid;
-	/* lock for setting locking pid */
-	spinlock_t locking_pid_spinlock;
-	/* Spin lock for dealing with races with the interrupt handler */
-	spinlock_t spinlock;
-	/* Watchdog timer to enable timeouts */
-	struct timer_list timer;
-	/* device of attached driver if any */
-	struct device *dev;
-	/* gpib_common device gpibN */
-	struct device *gpib_dev;
-	/*
-	 * 'private_data' can be used as seen fit by the driver to
-	 * store additional variables for this board
-	 */
-	void *private_data;
-	/* Number of open file descriptors using this board */
-	unsigned int use_count;
-	/* list of open devices connected to this board */
-	struct list_head device_list;
-	/* primary address */
-	unsigned int pad;
-	/* secondary address */
-	int sad;
-	/* timeout for io operations, in microseconds */
-	unsigned int usec_timeout;
-	/* board's parallel poll configuration byte */
-	u8 parallel_poll_configuration;
-	/* t1 delay we are using */
-	unsigned int t1_nano_sec;
-	/* Count that keeps track of whether board is up and running or not */
-	unsigned int online;
-	/* number of processes trying to autopoll */
-	int autospollers;
-	/* autospoll kernel thread */
-	struct task_struct *autospoll_task;
-	/* queue for recording received trigger/clear/ifc events */
-	struct gpib_event_queue event_queue;
-	/* minor number for this board's device file */
-	int minor;
-	/* struct to deal with polling mode*/
-	struct gpib_pseudo_irq pseudo_irq;
-	/* error dong autopoll */
-	atomic_t stuck_srq;
-	struct gpib_board_config config;
-	/* Flag that indicates whether board is system controller of the bus */
-	unsigned master : 1;
-	/* individual status bit */
-	unsigned ist : 1;
-	/*
-	 * one means local parallel poll mode ieee 488.1 PP2 (or no parallel poll PP0),
-	 * zero means remote parallel poll configuration mode ieee 488.1 PP1
-	 */
-	unsigned local_ppoll_mode : 1;
-};
-
-/* element of event queue */
-struct gpib_event {
-	struct list_head list;
-	short event_type;
-};
-
-/*
- * Each board has a list of gpib_status_queue to keep track of all open devices
- * on the bus, so we know what address to poll when we get a service request
- */
-struct gpib_status_queue {
-	/* list_head so we can make a linked list of devices */
-	struct list_head list;
-	unsigned int pad;	/* primary gpib address */
-	int sad;	/* secondary gpib address (negative means disabled) */
-	/* stores serial poll bytes for this device */
-	struct list_head status_bytes;
-	unsigned int num_status_bytes;
-	/* number of times this address is opened */
-	unsigned int reference_count;
-	/* flags loss of status byte error due to limit on size of queue */
-	unsigned dropped_byte : 1;
-};
-
-struct gpib_status_byte {
-	struct list_head list;
-	u8 poll_byte;
-};
-
-void init_gpib_status_queue(struct gpib_status_queue *device);
-
-/* Used to store device-descriptor-specific information */
-struct gpib_descriptor {
-	unsigned int pad;	/* primary gpib address */
-	int sad;	/* secondary gpib address (negative means disabled) */
-	atomic_t io_in_progress;
-	unsigned is_board : 1;
-	unsigned autopoll_enabled : 1;
-};
-
-struct gpib_file_private {
-	atomic_t holding_mutex;
-	struct gpib_descriptor *descriptors[GPIB_MAX_NUM_DESCRIPTORS];
-	/* locked while descriptors are being allocated/deallocated */
-	struct mutex descriptors_mutex;
-	unsigned got_module : 1;
-};
-
-#endif	/* __KERNEL__ */
-
-#endif	/* _GPIB_TYPES_H */
diff --git a/drivers/staging/gpib/include/nec7210.h b/drivers/staging/gpib/include/nec7210.h
deleted file mode 100644
index 9835aa5ef4ff..000000000000
--- a/drivers/staging/gpib/include/nec7210.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _NEC7210_H
-#define _NEC7210_H
-
-#include "gpib_state_machines.h"
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/interrupt.h>
-
-#include "gpib_types.h"
-#include "nec7210_registers.h"
-
-/* struct used to provide variables local to a nec7210 chip */
-struct nec7210_priv {
-#ifdef CONFIG_HAS_IOPORT
-	u32 iobase;
-#endif
-	void __iomem *mmiobase;
-	unsigned int offset;		// offset between successive nec7210 io addresses
-	unsigned int dma_channel;
-	u8 *dma_buffer;
-	unsigned int dma_buffer_length;	// length of dma buffer
-	dma_addr_t dma_buffer_addr;	// bus address of board->buffer for use with dma
-	// software copy of bits written to registers
-	u8 reg_bits[8];
-	u8 auxa_bits;			// bits written to auxiliary register A
-	u8 auxb_bits;			// bits written to auxiliary register B
-	// used to keep track of board's state, bit definitions given below
-	unsigned long state;
-	// lock for chips that extend the nec7210 registers by paging in alternate regs
-	spinlock_t register_page_lock;
-	// wrappers for outb, inb, readb, or writeb
-	u8 (*read_byte)(struct nec7210_priv *priv, unsigned int register_number);
-	void (*write_byte)(struct nec7210_priv *priv, u8 byte, unsigned int register_number);
-	enum nec7210_chipset type;
-	enum talker_function_state talker_state;
-	enum listener_function_state listener_state;
-	void *private;
-	unsigned srq_pending : 1;
-};
-
-static inline void init_nec7210_private(struct nec7210_priv *priv)
-{
-	memset(priv, 0, sizeof(struct nec7210_priv));
-	spin_lock_init(&priv->register_page_lock);
-}
-
-// slightly shorter way to access read_byte and write_byte
-static inline u8 read_byte(struct nec7210_priv *priv, unsigned int register_number)
-{
-	return priv->read_byte(priv, register_number);
-}
-
-static inline void write_byte(struct nec7210_priv *priv, u8 byte, unsigned int register_number)
-{
-	priv->write_byte(priv, byte, register_number);
-}
-
-// struct nec7210_priv.state bit numbers
-enum {
-	PIO_IN_PROGRESS_BN,		// pio transfer in progress
-	DMA_READ_IN_PROGRESS_BN,	// dma read transfer in progress
-	DMA_WRITE_IN_PROGRESS_BN,	// dma write transfer in progress
-	READ_READY_BN,			// board has data byte available to read
-	WRITE_READY_BN,			// board is ready to send a data byte
-	COMMAND_READY_BN,		// board is ready to send a command byte
-	RECEIVED_END_BN,		// received END
-	BUS_ERROR_BN,			// output error has occurred
-	RFD_HOLDOFF_BN,			// rfd holdoff in effect
-	DEV_CLEAR_BN,			// device clear received
-	ADR_CHANGE_BN,			// address state change occurred
-};
-
-// interface functions
-int nec7210_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		 size_t length, int *end, size_t *bytes_read);
-int nec7210_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		  size_t length, int send_eoi, size_t *bytes_written);
-int nec7210_command(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		    size_t length, size_t *bytes_written);
-int nec7210_take_control(struct gpib_board *board, struct nec7210_priv *priv, int syncronous);
-int nec7210_go_to_standby(struct gpib_board *board, struct nec7210_priv *priv);
-int nec7210_request_system_control(struct gpib_board *board,
-				   struct nec7210_priv *priv, int request_control);
-void nec7210_interface_clear(struct gpib_board *board, struct nec7210_priv *priv, int assert);
-void nec7210_remote_enable(struct gpib_board *board, struct nec7210_priv *priv, int enable);
-int nec7210_enable_eos(struct gpib_board *board, struct nec7210_priv *priv, u8 eos_bytes,
-		       int compare_8_bits);
-void nec7210_disable_eos(struct gpib_board *board, struct nec7210_priv *priv);
-unsigned int nec7210_update_status(struct gpib_board *board, struct nec7210_priv *priv,
-				   unsigned int clear_mask);
-unsigned int nec7210_update_status_nolock(struct gpib_board *board, struct nec7210_priv *priv);
-int nec7210_primary_address(const struct gpib_board *board,
-			    struct nec7210_priv *priv, unsigned int address);
-int nec7210_secondary_address(const struct gpib_board *board, struct nec7210_priv *priv,
-			      unsigned int address, int enable);
-int nec7210_parallel_poll(struct gpib_board *board, struct nec7210_priv *priv, u8 *result);
-void nec7210_serial_poll_response(struct gpib_board *board,
-				  struct nec7210_priv *priv, u8 status);
-void nec7210_parallel_poll_configure(struct gpib_board *board,
-				     struct nec7210_priv *priv, unsigned int configuration);
-void nec7210_parallel_poll_response(struct gpib_board *board,
-				    struct nec7210_priv *priv, int ist);
-u8 nec7210_serial_poll_status(struct gpib_board *board, struct nec7210_priv *priv);
-int nec7210_t1_delay(struct gpib_board *board,
-		     struct nec7210_priv *priv, unsigned int nano_sec);
-void nec7210_return_to_local(const struct gpib_board *board, struct nec7210_priv *priv);
-
-// utility functions
-void nec7210_board_reset(struct nec7210_priv *priv, const struct gpib_board *board);
-void nec7210_board_online(struct nec7210_priv *priv, const struct gpib_board *board);
-unsigned int nec7210_set_reg_bits(struct nec7210_priv *priv, unsigned int reg,
-				  unsigned int mask, unsigned int bits);
-void nec7210_set_handshake_mode(struct gpib_board *board, struct nec7210_priv *priv, int mode);
-void nec7210_release_rfd_holdoff(struct gpib_board *board, struct nec7210_priv *priv);
-u8 nec7210_read_data_in(struct gpib_board *board, struct nec7210_priv *priv, int *end);
-
-// wrappers for io functions
-u8 nec7210_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num);
-void nec7210_ioport_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num);
-u8 nec7210_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num);
-void nec7210_iomem_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num);
-u8 nec7210_locking_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num);
-void nec7210_locking_ioport_write_byte(struct nec7210_priv *priv, u8 data,
-				       unsigned int register_num);
-u8 nec7210_locking_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num);
-void nec7210_locking_iomem_write_byte(struct nec7210_priv *priv, u8 data,
-				      unsigned int register_num);
-
-// interrupt service routine
-irqreturn_t nec7210_interrupt(struct gpib_board *board, struct nec7210_priv *priv);
-irqreturn_t nec7210_interrupt_have_status(struct gpib_board *board,
-					  struct nec7210_priv *priv, int status1, int status2);
-
-#endif	//_NEC7210_H
diff --git a/drivers/staging/gpib/include/nec7210_registers.h b/drivers/staging/gpib/include/nec7210_registers.h
deleted file mode 100644
index 067983d7a07f..000000000000
--- a/drivers/staging/gpib/include/nec7210_registers.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _NEC7210_REGISTERS_H
-#define _NEC7210_REGISTERS_H
-
-enum nec7210_chipset {
-	NEC7210,	// The original
-	TNT4882,	// NI
-	NAT4882,	// NI
-	CB7210,		// measurement computing
-	IOT7210,	// iotech
-	IGPIB7210,	// Ines
-	TNT5004,	// NI (minor differences to TNT4882)
-};
-
-/*
- * nec7210 register numbers (might need to be multiplied by
- * a board-dependent offset to get actually io address offset)
- */
-// write registers
-enum nec7210_write_regs {
-	CDOR,	// command/data out
-	IMR1,	// interrupt mask 1
-	IMR2,	// interrupt mask 2
-	SPMR,	// serial poll mode
-	ADMR,	// address mode
-	AUXMR,	// auxiliary mode
-	ADR,	// address
-	EOSR,	// end-of-string
-
-	// nec7210 has 8 registers
-	nec7210_num_registers = 8,
-};
-
-// read registers
-enum nec7210_read_regs {
-	DIR,	// data in
-	ISR1,	// interrupt status 1
-	ISR2,	// interrupt status 2
-	SPSR,	// serial poll status
-	ADSR,	// address status
-	CPTR,	// command pass though
-	ADR0,	// address 1
-	ADR1,	// address 2
-};
-
-// bit definitions common to nec-7210 compatible registers
-
-// ISR1: interrupt status register 1
-enum isr1_bits {
-	HR_DI = (1 << 0),
-	HR_DO = (1 << 1),
-	HR_ERR = (1 << 2),
-	HR_DEC = (1 << 3),
-	HR_END = (1 << 4),
-	HR_DET = (1 << 5),
-	HR_APT = (1 << 6),
-	HR_CPT = (1 << 7),
-};
-
-// IMR1: interrupt mask register 1
-enum imr1_bits {
-	HR_DIIE = (1 << 0),
-	HR_DOIE = (1 << 1),
-	HR_ERRIE = (1 << 2),
-	HR_DECIE = (1 << 3),
-	HR_ENDIE = (1 << 4),
-	HR_DETIE = (1 << 5),
-	HR_APTIE = (1 << 6),
-	HR_CPTIE = (1 << 7),
-};
-
-// ISR2, interrupt status register 2
-enum isr2_bits {
-	HR_ADSC = (1 << 0),
-	HR_REMC = (1 << 1),
-	HR_LOKC = (1 << 2),
-	HR_CO = (1 << 3),
-	HR_REM = (1 << 4),
-	HR_LOK = (1 << 5),
-	HR_SRQI = (1 << 6),
-	HR_INT = (1 << 7),
-};
-
-// IMR2, interrupt mask register 2
-enum imr2_bits {
-	// all the bits in this register that enable interrupts
-	IMR2_ENABLE_INTR_MASK = 0x4f,
-	HR_ACIE = (1 << 0),
-	HR_REMIE = (1 << 1),
-	HR_LOKIE = (1 << 2),
-	HR_COIE = (1 << 3),
-	HR_DMAI = (1 << 4),
-	HR_DMAO = (1 << 5),
-	HR_SRQIE = (1 << 6),
-};
-
-// SPSR, serial poll status register
-enum spsr_bits {
-	HR_PEND = (1 << 6),
-};
-
-// SPMR, serial poll mode register
-enum spmr_bits {
-	HR_RSV = (1 << 6),
-};
-
-// ADSR, address status register
-enum adsr_bits {
-	HR_MJMN = (1 << 0),
-	HR_TA = (1 << 1),
-	HR_LA = (1 << 2),
-	HR_TPAS = (1 << 3),
-	HR_LPAS = (1 << 4),
-	HR_SPMS = (1 << 5),
-	HR_NATN = (1 << 6),
-	HR_CIC = (1 << 7),
-};
-
-// ADMR, address mode register
-enum admr_bits {
-	HR_ADM0 = (1 << 0),
-	HR_ADM1 = (1 << 1),
-	HR_TRM0 = (1 << 4),
-	HR_TRM1 = (1 << 5),
-	HR_TRM_EOIOE_TRIG = 0,
-	HR_TRM_CIC_TRIG = HR_TRM0,
-	HR_TRM_CIC_EOIOE = HR_TRM1,
-	HR_TRM_CIC_PE = HR_TRM0 | HR_TRM1,
-	HR_LON = (1 << 6),
-	HR_TON = (1 << 7),
-};
-
-// ADR, bits used in address0, address1 and address0/1 registers
-enum adr_bits {
-	ADDRESS_MASK = 0x1f,	/* mask to specify lower 5 bits */
-	HR_DL = (1 << 5),
-	HR_DT = (1 << 6),
-	HR_ARS = (1 << 7),
-};
-
-// ADR1, address1 register
-enum adr1_bits {
-	HR_EOI = (1 << 7),
-};
-
-// AUXMR, auxiliary mode register
-enum auxmr_bits {
-	ICR = 0x20,
-	PPR = 0x60,
-	AUXRA = 0x80,
-	AUXRB = 0xa0,
-	AUXRE = 0xc0,
-};
-
-// auxra, auxiliary register A
-enum auxra_bits {
-	HR_HANDSHAKE_MASK = 0x3,
-	HR_HLDA = 0x1,
-	HR_HLDE = 0x2,
-	HR_LCM = 0x3,	/* auxra listen continuous */
-	HR_REOS = 0x4,
-	HR_XEOS = 0x8,
-	HR_BIN = 0x10,
-};
-
-// auxrb, auxiliary register B
-enum auxrb_bits {
-	HR_CPTE = (1 << 0),
-	HR_SPEOI = (1 << 1),
-	HR_TRI = (1 << 2),
-	HR_INV = (1 << 3),
-	HR_ISS = (1 << 4),
-};
-
-enum auxre_bits {
-	HR_DAC_HLD_DCAS = 0x1,	/* perform DAC holdoff on receiving clear */
-	HR_DAC_HLD_DTAS = 0x2,	/* perform DAC holdoff on receiving trigger */
-};
-
-// parallel poll register
-enum ppr_bits {
-	HR_PPS = (1 << 3),
-	HR_PPU = (1 << 4),
-};
-
-/* 7210 Auxiliary Commands */
-enum aux_cmds {
-	AUX_PON = 0x0,	/* Immediate Execute pon                  */
-	AUX_CPPF = 0x1,	/* Clear Parallel Poll Flag               */
-	AUX_CR = 0x2,	/* Chip Reset                             */
-	AUX_FH = 0x3,	/* Finish Handshake                       */
-	AUX_TRIG = 0x4,	/* Trigger                                */
-	AUX_RTL = 0x5,	/* Return to local                        */
-	AUX_SEOI = 0x6,	/* Send EOI                               */
-	AUX_NVAL = 0x7,	/* Non-Valid Secondary Command or Address */
-	AUX_SPPF = 0x9,	/* Set Parallel Poll Flag                 */
-	AUX_VAL = 0xf,	/* Valid Secondary Command or Address     */
-	AUX_GTS = 0x10,	/* Go To Standby                          */
-	AUX_TCA = 0x11,	/* Take Control Asynchronously            */
-	AUX_TCS = 0x12,	/* Take Control Synchronously             */
-	AUX_LTN = 0x13,	/* Listen                                 */
-	AUX_DSC = 0x14,	/* Disable System Control                 */
-	AUX_CIFC = 0x16,	/* Clear IFC                              */
-	AUX_CREN = 0x17,	/* Clear REN                              */
-	AUX_TCSE = 0x1a,	/* Take Control Synchronously on End      */
-	AUX_LTNC = 0x1b,	/* Listen in Continuous Mode              */
-	AUX_LUN = 0x1c,	/* Local Unlisten                         */
-	AUX_EPP = 0x1d,	/* Execute Parallel Poll                  */
-	AUX_SIFC = 0x1e,	/* Set IFC                                */
-	AUX_SREN = 0x1f,	/* Set REN                                */
-};
-
-#endif	//_NEC7210_REGISTERS_H
diff --git a/drivers/staging/gpib/include/plx9050.h b/drivers/staging/gpib/include/plx9050.h
deleted file mode 100644
index c911b285a0ca..000000000000
--- a/drivers/staging/gpib/include/plx9050.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *  Header for plx9050 pci chip
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _PLX9050_GPIB_H
-#define _PLX9050_GPIB_H
-
-// plx pci chip registers and bits
-enum {
-	PLX9050_INTCSR_REG = 0x4c,
-	PLX9050_CNTRL_REG = 0x50
-};
-
-enum plx9050_intcsr_bits {
-	PLX9050_LINTR1_EN_BIT = 0x1,
-	PLX9050_LINTR1_POLARITY_BIT = 0x2,
-	PLX9050_LINTR1_STATUS_BIT = 0x4,
-	PLX9050_LINTR2_EN_BIT = 0x8,
-	PLX9050_LINTR2_POLARITY_BIT = 0x10,
-	PLX9050_LINTR2_STATUS_BIT = 0x20,
-	PLX9050_PCI_INTR_EN_BIT = 0x40,
-	PLX9050_SOFT_INTR_BIT = 0x80,
-	PLX9050_LINTR1_SELECT_ENABLE_BIT = 0x100,	// 9052 extension
-	PLX9050_LINTR2_SELECT_ENABLE_BIT = 0x200,	// 9052 extension
-	PLX9050_LINTR1_EDGE_CLEAR_BIT = 0x400,		// 9052 extension
-	PLX9050_LINTR2_EDGE_CLEAR_BIT = 0x800,		// 9052 extension
-};
-
-enum plx9050_cntrl_bits {
-	PLX9050_WAITO_NOT_USER0_SELECT_BIT = 0x1,
-	PLX9050_USER0_OUTPUT_BIT = 0x2,
-	PLX9050_USER0_DATA_BIT = 0x4,
-	PLX9050_LLOCK_NOT_USER1_SELECT_BIT = 0x8,
-	PLX9050_USER1_OUTPUT_BIT = 0x10,
-	PLX9050_USER1_DATA_BIT = 0x20,
-	PLX9050_CS2_NOT_USER2_SELECT_BIT = 0x40,
-	PLX9050_USER2_OUTPUT_BIT = 0x80,
-	PLX9050_USER2_DATA_BIT = 0x100,
-	PLX9050_CS3_NOT_USER3_SELECT_BIT = 0x200,
-	PLX9050_USER3_OUTPUT_BIT = 0x400,
-	PLX9050_USER3_DATA_BIT = 0x800,
-	PLX9050_PCIBAR_ENABLE_MASK = 0x3000,
-	PLX9050_PCIBAR_MEMORY_AND_IO_ENABLE_BITS = 0x0,
-	PLX9050_PCIBAR_MEMORY_NO_IO_ENABLE_BITS = 0x1000,
-	PLX9050_PCIBAR_IO_NO_MEMORY_ENABLE_BITS = 0x2000,
-	PLX9050_PCIBAR_MEMORY_AND_IO_TOO_ENABLE_BITS = 0x3000,
-	PLX9050_PCI_READ_MODE_BIT = 0x4000,
-	PLX9050_PCI_READ_WITH_WRITE_FLUSH_MODE_BIT = 0x8000,
-	PLX9050_PCI_READ_NO_FLUSH_MODE_BIT = 0x10000,
-	PLX9050_PCI_READ_NO_WRITE_MODE_BIT = 0x20000,
-	PLX9050_PCI_WRITE_MODE_BIT = 0x40000,
-	PLX9050_PCI_RETRY_DELAY_MASK = 0x780000,
-	PLX9050_DIRECT_SLAVE_LOCK_ENABLE_BIT = 0x800000,
-	PLX9050_EEPROM_CLOCK_BIT = 0x1000000,
-	PLX9050_EEPROM_CHIP_SELECT_BIT = 0x2000000,
-	PLX9050_WRITE_TO_EEPROM_BIT = 0x4000000,
-	PLX9050_READ_EEPROM_DATA_BIT = 0x8000000,
-	PLX9050_EEPROM_VALID_BIT = 0x10000000,
-	PLX9050_RELOAD_CONFIG_REGISTERS_BIT = 0x20000000,
-	PLX9050_PCI_SOFTWARE_RESET_BIT = 0x40000000,
-	PLX9050_MASK_REVISION_BIT = 0x80000000
-};
-
-static inline unsigned int PLX9050_PCI_RETRY_DELAY_BITS(unsigned int clocks)
-{
-	return ((clocks / 8) << 19) & PLX9050_PCI_RETRY_DELAY_MASK;
-}
-
-#endif	// _PLX9050_GPIB_H
diff --git a/drivers/staging/gpib/include/quancom_pci.h b/drivers/staging/gpib/include/quancom_pci.h
deleted file mode 100644
index cdaf0d056be9..000000000000
--- a/drivers/staging/gpib/include/quancom_pci.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- * Quancom pci stuff
- * copyright (C) 2005 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _QUANCOM_PCI_H
-#define _QUANCOM_PCI_H
-
-/* quancom registers */
-enum quancom_regs {
-	QUANCOM_IRQ_CONTROL_STATUS_REG = 0xfc,
-};
-
-enum quancom_irq_control_status_bits {
-	QUANCOM_IRQ_ASSERTED_BIT = 0x1, /* readable */
-	/* (any write to the register clears the interrupt)*/
-	QUANCOM_IRQ_ENABLE_BIT = 0x4, /* writeable */
-};
-
-#endif	// _QUANCOM_PCI_H
diff --git a/drivers/staging/gpib/include/tms9914.h b/drivers/staging/gpib/include/tms9914.h
deleted file mode 100644
index e66b75e0fda8..000000000000
--- a/drivers/staging/gpib/include/tms9914.h
+++ /dev/null
@@ -1,280 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _TMS9914_H
-#define _TMS9914_H
-
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include "gpib_state_machines.h"
-#include "gpib_types.h"
-
-enum tms9914_holdoff_mode {
-	TMS9914_HOLDOFF_NONE,
-	TMS9914_HOLDOFF_EOI,
-	TMS9914_HOLDOFF_ALL,
-};
-
-/* struct used to provide variables local to a tms9914 chip */
-struct tms9914_priv {
-#ifdef CONFIG_HAS_IOPORT
-	u32 iobase;
-#endif
-	void __iomem *mmiobase;
-	unsigned int offset;	// offset between successive tms9914 io addresses
-	unsigned int dma_channel;
-	// software copy of bits written to interrupt mask registers
-	u8 imr0_bits, imr1_bits;
-	// bits written to address mode register
-	u8 admr_bits;
-	u8 auxa_bits;		// bits written to auxiliary register A
-	// used to keep track of board's state, bit definitions given below
-	unsigned long state;
-	u8 eos;			// eos character
-	short eos_flags;
-	u8 spoll_status;
-	enum tms9914_holdoff_mode holdoff_mode;
-	unsigned int ppoll_line;
-	enum talker_function_state talker_state;
-	enum listener_function_state listener_state;
-	unsigned ppoll_sense : 1;
-	unsigned ppoll_enable : 1;
-	unsigned ppoll_configure_state : 1;
-	unsigned primary_listen_addressed : 1;
-	unsigned primary_talk_addressed : 1;
-	unsigned holdoff_on_end : 1;
-	unsigned holdoff_on_all : 1;
-	unsigned holdoff_active : 1;
-	// wrappers for outb, inb, readb, or writeb
-	u8 (*read_byte)(struct tms9914_priv *priv, unsigned int register_number);
-	void (*write_byte)(struct tms9914_priv *priv, u8 byte, unsigned int
-			   register_number);
-};
-
-// slightly shorter way to access read_byte and write_byte
-static inline u8 read_byte(struct tms9914_priv *priv, unsigned int register_number)
-{
-	return priv->read_byte(priv, register_number);
-}
-
-static inline void write_byte(struct tms9914_priv *priv, u8 byte, unsigned int register_number)
-{
-	priv->write_byte(priv, byte, register_number);
-}
-
-// struct tms9914_priv.state bit numbers
-enum {
-	PIO_IN_PROGRESS_BN,		// pio transfer in progress
-	DMA_READ_IN_PROGRESS_BN,	// dma read transfer in progress
-	DMA_WRITE_IN_PROGRESS_BN,	// dma write transfer in progress
-	READ_READY_BN,			// board has data byte available to read
-	WRITE_READY_BN,			// board is ready to send a data byte
-	COMMAND_READY_BN,		// board is ready to send a command byte
-	RECEIVED_END_BN,		// received END
-	BUS_ERROR_BN,			// bus error
-	DEV_CLEAR_BN,			// device clear received
-};
-
-// interface functions
-int tms9914_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		 size_t length, int *end, size_t *bytes_read);
-int tms9914_write(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		  size_t length, int send_eoi, size_t *bytes_written);
-int tms9914_command(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		    size_t length, size_t *bytes_written);
-int tms9914_take_control(struct gpib_board *board, struct tms9914_priv *priv, int syncronous);
-/*
- * alternate version of tms9914_take_control which works around buggy tcs
- * implementation.
- */
-int tms9914_take_control_workaround(struct gpib_board *board, struct tms9914_priv *priv,
-				    int syncronous);
-int tms9914_go_to_standby(struct gpib_board *board, struct tms9914_priv *priv);
-int tms9914_request_system_control(struct gpib_board *board, struct tms9914_priv *priv,
-				   int request_control);
-void tms9914_interface_clear(struct gpib_board *board, struct tms9914_priv *priv, int assert);
-void tms9914_remote_enable(struct gpib_board *board, struct tms9914_priv *priv, int enable);
-int tms9914_enable_eos(struct gpib_board *board, struct tms9914_priv *priv, u8 eos_bytes,
-		       int compare_8_bits);
-void tms9914_disable_eos(struct gpib_board *board, struct tms9914_priv *priv);
-unsigned int tms9914_update_status(struct gpib_board *board, struct tms9914_priv *priv,
-				   unsigned int clear_mask);
-int tms9914_primary_address(struct gpib_board *board,
-			    struct tms9914_priv *priv, unsigned int address);
-int tms9914_secondary_address(struct gpib_board *board, struct tms9914_priv *priv,
-			      unsigned int address, int enable);
-int tms9914_parallel_poll(struct gpib_board *board, struct tms9914_priv *priv, u8 *result);
-void tms9914_parallel_poll_configure(struct gpib_board *board,
-				     struct tms9914_priv *priv, u8 config);
-void tms9914_parallel_poll_response(struct gpib_board *board,
-				    struct tms9914_priv *priv, int ist);
-void tms9914_serial_poll_response(struct gpib_board *board,
-				  struct tms9914_priv *priv, u8 status);
-u8 tms9914_serial_poll_status(struct gpib_board *board, struct tms9914_priv *priv);
-int tms9914_line_status(const struct gpib_board *board, struct tms9914_priv *priv);
-unsigned int tms9914_t1_delay(struct gpib_board *board, struct tms9914_priv *priv,
-			      unsigned int nano_sec);
-void tms9914_return_to_local(const struct gpib_board *board, struct tms9914_priv *priv);
-
-// utility functions
-void tms9914_board_reset(struct tms9914_priv *priv);
-void tms9914_online(struct gpib_board *board, struct tms9914_priv *priv);
-void tms9914_release_holdoff(struct tms9914_priv *priv);
-void tms9914_set_holdoff_mode(struct tms9914_priv *priv, enum tms9914_holdoff_mode mode);
-
-// wrappers for io functions
-u8 tms9914_ioport_read_byte(struct tms9914_priv *priv, unsigned int register_num);
-void tms9914_ioport_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num);
-u8 tms9914_iomem_read_byte(struct tms9914_priv *priv, unsigned int register_num);
-void tms9914_iomem_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num);
-
-// interrupt service routine
-irqreturn_t tms9914_interrupt(struct gpib_board *board, struct tms9914_priv *priv);
-irqreturn_t tms9914_interrupt_have_status(struct gpib_board *board, struct tms9914_priv *priv,
-					  int status1,	int status2);
-
-// tms9914 has 8 registers
-enum {
-	ms9914_num_registers = 8,
-};
-
-/*
- * tms9914 register numbers (might need to be multiplied by
- * a board-dependent offset to get actually io address offset)
- */
-// write registers
-enum {
-	IMR0 = 0,	/* interrupt mask 0          */
-	IMR1 = 1,	/* interrupt mask 1          */
-	AUXCR = 3,	/* auxiliary command         */
-	ADR = 4,	/* address register	     */
-	SPMR = 5,	/* serial poll mode register */
-	PPR = 6,	/* parallel poll             */
-	CDOR = 7,	/* data out register         */
-};
-
-// read registers
-enum {
-	ISR0 = 0,	/* interrupt status 0	     */
-	ISR1 = 1,	/* interrupt status 1	     */
-	ADSR = 2,	/* address status	     */
-	BSR = 3,	/* bus status		     */
-	CPTR = 6,	/* command pass thru	     */
-	DIR = 7,	/* data in register          */
-};
-
-// bit definitions common to tms9914 compatible registers
-
-/* ISR0   - Register bits */
-enum isr0_bits {
-	HR_MAC = (1 << 0),   /* My Address Change           */
-	HR_RLC = (1 << 1),   /* Remote/Local change         */
-	HR_SPAS = (1 << 2),   /* Serial Poll active State    */
-	HR_END = (1 << 3),   /* END (EOI or EOS)            */
-	HR_BO = (1 << 4),   /* Byte Out                    */
-	HR_BI = (1 << 5),   /* Byte In                     */
-};
-
-/* IMR0   - Register bits */
-enum imr0_bits {
-	HR_MACIE = (1 << 0),   /*        */
-	HR_RLCIE = (1 << 1),   /*        */
-	HR_SPASIE = (1 << 2),   /*        */
-	HR_ENDIE = (1 << 3),   /*        */
-	HR_BOIE = (1 << 4),   /*        */
-	HR_BIIE = (1 << 5),   /*        */
-};
-
-/* ISR1   - Register bits */
-enum isr1_bits {
-	HR_IFC = (1 << 0),   /* IFC asserted                */
-	HR_SRQ = (1 << 1),   /* SRQ asserted                */
-	HR_MA = (1 << 2),    /* My Address                  */
-	HR_DCAS = (1 << 3),  /* Device Clear active State   */
-	HR_APT = (1 << 4),   /* Address pass Through        */
-	HR_UNC = (1 << 5),   /* Unrecognized Command        */
-	HR_ERR = (1 << 6),   /* Data Transmission Error     */
-	HR_GET = (1 << 7),   /* Group execute Trigger       */
-};
-
-/* IMR1   - Register bits */
-enum imr1_bits {
-	HR_IFCIE = (1 << 0),   /*        */
-	HR_SRQIE = (1 << 1),   /*        */
-	HR_MAIE = (1 << 2),    /*        */
-	HR_DCASIE = (1 << 3),  /*        */
-	HR_APTIE = (1 << 4),   /*        */
-	HR_UNCIE = (1 << 5),   /*        */
-	HR_ERRIE = (1 << 6),   /*        */
-	HR_GETIE = (1 << 7),   /*        */
-};
-
-/* ADSR   - Register bits */
-enum adsr_bits {
-	HR_ULPA = (1 << 0),   /* Store last address LSB       */
-	HR_TA = (1 << 1),     /* Talker Adressed              */
-	HR_LA = (1 << 2),     /* Listener adressed            */
-	HR_TPAS = (1 << 3),   /* talker primary address state */
-	HR_LPAS = (1 << 4),   /* listener    "                */
-	HR_ATN = (1 << 5),    /* ATN active                   */
-	HR_LLO = (1 << 6),    /* LLO active                   */
-	HR_REM = (1 << 7),    /* REM active                   */
-};
-
-/* ADR   - Register bits */
-enum adr_bits {
-	ADDRESS_MASK = 0x1f,	/* mask to specify lower 5 bits for ADR */
-	HR_DAT = (1 << 5),      /* disable talker */
-	HR_DAL = (1 << 6),      /* disable listener */
-	HR_EDPA = (1 << 7),     /* enable dual primary addressing */
-};
-
-enum bus_status_bits {
-	BSR_REN_BIT = 0x1,
-	BSR_IFC_BIT = 0x2,
-	BSR_SRQ_BIT = 0x4,
-	BSR_EOI_BIT = 0x8,
-	BSR_NRFD_BIT = 0x10,
-	BSR_NDAC_BIT = 0x20,
-	BSR_DAV_BIT = 0x40,
-	BSR_ATN_BIT = 0x80,
-};
-
-/*---------------------------------------------------------*/
-/* TMS 9914 Auxiliary Commands                             */
-/*---------------------------------------------------------*/
-
-enum aux_cmd_bits {
-	AUX_CS = 0x80,			/* set bit instead of clearing it, used with commands marked 'd' below */
-	AUX_CHIP_RESET = 0x0,		/* d Chip reset                   */
-	AUX_INVAL = 0x1,		/* release dac holdoff, invalid command byte */
-	AUX_VAL = (AUX_INVAL | AUX_CS),	/* release dac holdoff, valid command byte   */
-	AUX_RHDF = 0x2,			/* X Release RFD holdoff          */
-	AUX_HLDA = 0x3,			/* d holdoff on all data          */
-	AUX_HLDE = 0x4,			/* d holdoff on EOI only          */
-	AUX_NBAF = 0x5,			/* X Set new byte available false */
-	AUX_FGET = 0x6,			/* d force GET                    */
-	AUX_RTL = 0x7,			/* d return to local              */
-	AUX_SEOI = 0x8,			/* X send EOI with next byte      */
-	AUX_LON = 0x9,			/* d Listen only                  */
-	AUX_TON = 0xa,			/* d Talk only                    */
-	AUX_GTS = 0xb,			/* X goto standby                 */
-	AUX_TCA = 0xc,			/* X take control asynchronously  */
-	AUX_TCS = 0xd,			/* X take    "     synchronously  */
-	AUX_RPP = 0xe,			/* d Request parallel poll        */
-	AUX_SIC = 0xf,			/* d send interface clear         */
-	AUX_SRE = 0x10,			/* d send remote enable           */
-	AUX_RQC = 0x11,			/* X request control              */
-	AUX_RLC = 0x12,			/* X release control              */
-	AUX_DAI = 0x13,			/* d disable all interrupts       */
-	AUX_PTS = 0x14,			/* X pass through next secondary  */
-	AUX_STDL = 0x15,		/* d short T1 delay		  */
-	AUX_SHDW = 0x16,		/* d shadow handshake             */
-	AUX_VSTDL = 0x17,		/* d very short T1 delay (smj9914 extension)   */
-	AUX_RSV2 = 0x18,		/* d request service bit 2 (smj9914 extension) */
-};
-
-#endif	//_TMS9914_H
diff --git a/drivers/staging/gpib/include/tnt4882_registers.h b/drivers/staging/gpib/include/tnt4882_registers.h
deleted file mode 100644
index d54c4cc61168..000000000000
--- a/drivers/staging/gpib/include/tnt4882_registers.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *    copyright		   : (C) 2002, 2004 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _TNT4882_REGISTERS_H
-#define _TNT4882_REGISTERS_H
-
-// tnt4882 register offsets
-enum {
-	ACCWR = 0x5,
-	// offset of auxiliary command register in 9914 mode
-	AUXCR = 0x6,
-	INTRT = 0x7,
-	// register number for auxiliary command register when swap bit is set (9914 mode)
-	SWAPPED_AUXCR = 0xa,
-	HSSEL = 0xd,	// handshake select register
-	CNT2 = 0x9,
-	CNT3 = 0xb,
-	CFG = 0x10,
-	SASR = 0x1b,
-	IMR0 = 0x1d,
-	IMR3 = 0x12,
-	CNT0 = 0x14,
-	CNT1 = 0x16,
-	KEYREG = 0x17,	// key control register (7210 mode only)
-	CSR = KEYREG,
-	FIFOB = 0x18,
-	FIFOA = 0x19,
-	CCR = 0x1a,	// carry cycle register
-	CMDR = 0x1c,	// command register
-	TIMER = 0x1e,	// timer register
-
-	STS1 = 0x10,	// T488 Status Register 1
-	STS2 = 0x1c,	// T488 Status Register 2
-	ISR0 = IMR0,
-	ISR3 = 0x1a,	// T488 Interrupt Status Register 3
-	BCR = 0x1f,	// bus control/status register
-	BSR = BCR,
-};
-
-enum {
-	tnt_pagein_offset = 0x11,
-};
-
-/*============================================================*/
-
-/* TURBO-488 registers bit definitions */
-
-enum bus_control_status_bits {
-	BCSR_REN_BIT = 0x1,
-	BCSR_IFC_BIT = 0x2,
-	BCSR_SRQ_BIT = 0x4,
-	BCSR_EOI_BIT = 0x8,
-	BCSR_NRFD_BIT = 0x10,
-	BCSR_NDAC_BIT = 0x20,
-	BCSR_DAV_BIT = 0x40,
-	BCSR_ATN_BIT = 0x80,
-};
-
-/* CFG -- Configuration Register (write only) */
-enum cfg_bits {
-	TNT_COMMAND = 0x80,	/* bytes are command bytes instead of data bytes
-				 * (tnt4882 one-chip and newer only?)
-				 */
-	TNT_TLCHE = (1 << 6),	/* halt transfer on imr0, imr1, or imr2 interrupt */
-	TNT_IN = (1 << 5),	/* transfer is GPIB read		 */
-	TNT_A_B = (1 << 4),	/* order to use fifos 1=fifo A first(big endian),
-				 * 0=fifo b first(little endian)
-				 */
-	TNT_CCEN = (1 << 3),	/* enable carry cycle		      */
-	TNT_TMOE = (1 << 2),	/* enable CPU bus time limit	      */
-	TNT_TIM_BYTN = (1 << 1),	/* tmot reg is: 1=125ns clocks, 0=num bytes */
-	TNT_B_16BIT = (1 << 0),	/* 1=FIFO is 16-bit register, 0=8-bit */
-};
-
-/* CMDR -- Command Register */
-enum cmdr_bits {
-	CLRSC = 0x2,	/* clear the system controller bit */
-	SETSC = 0x3,	/* set the system controller bit */
-	GO = 0x4,	/* start fifos */
-	STOP = 0x8,	/* stop fifos */
-	RESET_FIFO = 0x10,	/* reset the FIFOs		*/
-	SOFT_RESET = 0x22,	/* issue a software reset	*/
-	HARD_RESET = 0x40	/* 500x only? */
-};
-
-/* HSSEL -- handshake select register (write only) */
-enum hssel_bits {
-	TNT_ONE_CHIP_BIT = 0x1,
-	NODMA = 0x10,
-	TNT_GO2SIDS_BIT = 0x20,
-};
-
-/* IMR0 -- Interrupt Mode Register 0 */
-enum imr0_bits {
-	TNT_SYNCIE_BIT = 0x1, /* handshake sync */
-	TNT_TOIE_BIT = 0x2, /* timeout */
-	TNT_ATNIE_BIT = 0x4, /* ATN interrupt */
-	TNT_IFCIE_BIT = 0x8,	/* interface clear interrupt */
-	TNT_BTO_BIT = 0x10, /* byte timeout */
-	TNT_NLEN_BIT = 0x20,	/* treat new line as EOS char */
-	TNT_STBOIE_BIT = 0x40,	/* status byte out  */
-	TNT_IMR0_ALWAYS_BITS = 0x80,	/* always set this bit on write */
-};
-
-/* ISR0 -- Interrupt Status Register 0 */
-enum isr0_bits {
-	TNT_SYNC_BIT = 0x1,	/* handshake sync */
-	TNT_TO_BIT = 0x2,	/* timeout */
-	TNT_ATNI_BIT = 0x4,	/* ATN interrupt */
-	TNT_IFCI_BIT = 0x8,	/* interface clear interrupt */
-	TNT_EOS_BIT = 0x10,	/* end of string */
-	TNT_NL_BIT = 0x20,	/* new line receive */
-	TNT_STBO_BIT = 0x40,	/* status byte out  */
-	TNT_NBA_BIT = 0x80,	/* new byte available */
-};
-
-/* ISR3 -- Interrupt Status Register 3 (read only) */
-enum isr3_bits {
-	HR_DONE = (1 << 0),	/* transfer done */
-	HR_TLCI = (1 << 1),	/* isr0, isr1, or isr2 interrupt asserted */
-	HR_NEF = (1 << 2),	/* NOT empty fifo */
-	HR_NFF = (1 << 3),	/* NOT full fifo */
-	HR_STOP = (1 << 4),	/* fifo empty or STOP command issued */
-	HR_SRQI_CIC = (1 << 5),	/* SRQ asserted and we are CIC (500x only?)*/
-	HR_INTR = (1 << 7),	/* isr3 interrupt active */
-};
-
-enum keyreg_bits {
-	MSTD = 0x20,	/* enable 350ns T1 delay */
-};
-
-/* STS1 -- Status Register 1 (read only) */
-enum sts1_bits {
-	S_DONE = 0x80,	/* DMA done			      */
-	S_SC = 0x40,	/* is system controller		      */
-	S_IN = 0x20,	/* DMA in (to memory)		      */
-	S_DRQ = 0x10,	/* DRQ line (for diagnostics)	      */
-	S_STOP = 0x08,	/* DMA stopped			      */
-	S_NDAV = 0x04,	/* inverse of DAV		      */
-	S_HALT = 0x02,	/* status of transfer machine	      */
-	S_GSYNC = 0x01,	/* indicates if GPIB is in sync w I/O */
-};
-
-/* STS2 -- Status Register 2 */
-enum sts2_bits {
-	AFFN = (1 << 3),	/* "A full FIFO NOT"  (0=FIFO full)  */
-	AEFN = (1 << 2),	/* "A empty FIFO NOT" (0=FIFO empty) */
-	BFFN = (1 << 1),	/* "B full FIFO NOT"  (0=FIFO full)  */
-	BEFN = (1 << 0),	/* "B empty FIFO NOT" (0=FIFO empty) */
-};
-
-// Auxiliary commands
-enum tnt4882_aux_cmds {
-	AUX_9914 = 0x15,	// switch to 9914 mode
-	AUX_REQT = 0x18,
-	AUX_REQF = 0x19,
-	AUX_PAGEIN = 0x50,	// page in alternate registers
-	AUX_HLDI = 0x51,	// rfd holdoff immediately
-	AUX_CLEAR_END = 0x55,
-	AUX_7210 = 0x99,	// switch to 7210 mode
-};
-
-enum tnt4882_aux_regs {
-	AUXRG = 0x40,
-	AUXRI = 0xe0,
-};
-
-enum auxg_bits {
- /* no talking when no listeners bit (prevents bus errors when data written at wrong time) */
-	NTNL_BIT = 0x8,
-	RPP2_BIT = 0x4,	/* set/clear local rpp message */
-	CHES_BIT = 0x1, /*clear holdoff on end select bit*/
-};
-
-enum auxi_bits {
-	SISB = 0x1,	// static interrupt bits (don't clear isr1, isr2 on read)
-	PP2 = 0x4,	// ignore remote parallel poll configuration
-	USTD = 0x8,	// ultra short (1100 nanosec) T1 delay
-};
-
-enum sasr_bits {
-	ACRDY_BIT = 0x4,	/* acceptor ready state */
-	ADHS_BIT = 0x8,		/* acceptor data holdoff state */
-	ANHS2_BIT = 0x10,	/* acceptor not ready holdoff immediately state */
-	ANHS1_BIT = 0x20,	/* acceptor not ready holdoff state */
-	AEHS_BIT = 0x40,	/* acceptor end holdoff state */
-};
-
-#endif	// _TNT4882_REGISTERS_H
diff --git a/drivers/staging/gpib/ines/Makefile b/drivers/staging/gpib/ines/Makefile
deleted file mode 100644
index 88241f15ecea..000000000000
--- a/drivers/staging/gpib/ines/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GPIB_INES) += ines_gpib.o
-
-
diff --git a/drivers/staging/gpib/ines/ines.h b/drivers/staging/gpib/ines/ines.h
deleted file mode 100644
index 6ad57e9a1216..000000000000
--- a/drivers/staging/gpib/ines/ines.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *  Header for ines GPIB boards
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _INES_GPIB_H
-#define _INES_GPIB_H
-
-#include "nec7210.h"
-#include "gpibP.h"
-#include "plx9050.h"
-#include "amcc5920.h"
-#include "quancom_pci.h"
-#include <linux/interrupt.h>
-
-enum ines_pci_chip {
-	PCI_CHIP_NONE,
-	PCI_CHIP_PLX9050,
-	PCI_CHIP_AMCC5920,
-	PCI_CHIP_QUANCOM,
-	PCI_CHIP_QUICKLOGIC5030,
-};
-
-struct ines_priv {
-	struct nec7210_priv nec7210_priv;
-	struct pci_dev *pci_device;
-	// base address for plx9052 pci chip
-	unsigned long plx_iobase;
-	// base address for amcc5920 pci chip
-	unsigned long amcc_iobase;
-	unsigned int irq;
-	enum ines_pci_chip pci_chip_type;
-	u8 extend_mode_bits;
-};
-
-/* inb/outb wrappers */
-static inline unsigned int ines_inb(struct ines_priv *priv, unsigned int register_number)
-{
-	return inb(priv->nec7210_priv.iobase +
-		   register_number * priv->nec7210_priv.offset);
-}
-
-static inline void ines_outb(struct ines_priv *priv, unsigned int value,
-			     unsigned int register_number)
-{
-	outb(value, priv->nec7210_priv.iobase +
-	     register_number * priv->nec7210_priv.offset);
-}
-
-enum ines_regs {
-	// read
-	FIFO_STATUS = 0x8,
-	ISR3 = 0x9,
-	ISR4 = 0xa,
-	IN_FIFO_COUNT = 0x10,
-	OUT_FIFO_COUNT = 0x11,
-	EXTEND_STATUS = 0xf,
-
-	// write
-	XDMA_CONTROL = 0x8,
-	IMR3 = ISR3,
-	IMR4 = ISR4,
-	IN_FIFO_WATERMARK = IN_FIFO_COUNT,
-	OUT_FIFO_WATERMARK = OUT_FIFO_COUNT,
-	EXTEND_MODE = 0xf,
-
-	// read-write
-	XFER_COUNT_LOWER = 0xb,
-	XFER_COUNT_UPPER = 0xc,
-	BUS_CONTROL_MONITOR = 0x13,
-};
-
-enum isr3_imr3_bits {
-	HW_TIMEOUT_BIT = 0x1,
-	XFER_COUNT_BIT = 0x2,
-	CMD_RECEIVED_BIT = 0x4,
-	TCT_RECEIVED_BIT = 0x8,
-	IFC_ACTIVE_BIT = 0x10,
-	ATN_ACTIVE_BIT = 0x20,
-	FIFO_ERROR_BIT = 0x40,
-};
-
-enum isr4_imr4_bits {
-	IN_FIFO_WATERMARK_BIT = 0x1,
-	OUT_FIFO_WATERMARK_BIT = 0x2,
-	IN_FIFO_FULL_BIT = 0x4,
-	OUT_FIFO_EMPTY_BIT = 0x8,
-	IN_FIFO_READY_BIT = 0x10,
-	OUT_FIFO_READY_BIT = 0x20,
-	IN_FIFO_EXIT_WATERMARK_BIT = 0x40,
-	OUT_FIFO_EXIT_WATERMARK_BIT = 0x80,
-};
-
-enum extend_mode_bits {
-	TR3_TRIG_ENABLE_BIT = 0x1,	// enable generation of trigger pulse T/R3 pin
-	// clear message available status bit when chip writes byte with EOI true
-	MAV_ENABLE_BIT = 0x2,
-	EOS1_ENABLE_BIT = 0x4,		// enable eos register 1
-	EOS2_ENABLE_BIT = 0x8,		// enable eos register 2
-	EOIDIS_BIT = 0x10,		// disable EOI interrupt when doing rfd holdoff on end?
-	XFER_COUNTER_ENABLE_BIT = 0x20,
-	XFER_COUNTER_OUTPUT_BIT = 0x40,	// use counter for output, clear for input
-	// when xfer counter hits 0, assert EOI on write or RFD holdoff on read
-	LAST_BYTE_HANDLING_BIT = 0x80,
-};
-
-enum extend_status_bits {
-	OUTPUT_MESSAGE_IN_PROGRESS_BIT = 0x1,
-	SCSEL_BIT = 0x2,	// statue of SCSEL pin
-	LISTEN_DISABLED = 0x4,
-	IN_FIFO_EMPTY_BIT = 0x8,
-	OUT_FIFO_FULL_BIT = 0x10,
-};
-
-// ines adds fifo enable bits to address mode register
-enum ines_admr_bits {
-	IN_FIFO_ENABLE_BIT = 0x8,
-	OUT_FIFO_ENABLE_BIT = 0x4,
-};
-
-enum xdma_control_bits {
-	DMA_OUTPUT_BIT = 0x1,		// use dma for output, clear for input
-	ENABLE_SYNC_DMA_BIT = 0x2,
-	DMA_ACCESS_EVERY_CYCLE = 0x4,	// dma accesses fifo every cycle, clear for every other cycle
-	DMA_16BIT = 0x8,		// clear for 8 bit transfers
-};
-
-enum bus_control_monitor_bits {
-	BCM_DAV_BIT = 0x1,
-	BCM_NRFD_BIT = 0x2,
-	BCM_NDAC_BIT = 0x4,
-	BCM_IFC_BIT = 0x8,
-	BCM_ATN_BIT = 0x10,
-	BCM_SRQ_BIT = 0x20,
-	BCM_REN_BIT = 0x40,
-	BCM_EOI_BIT = 0x80,
-};
-
-enum ines_aux_reg_bits {
-	INES_AUXD = 0x40,
-};
-
-enum ines_aux_cmds {
-	INES_RFD_HLD_IMMEDIATE = 0x4,
-	INES_AUX_CLR_OUT_FIFO = 0x5,
-	INES_AUX_CLR_IN_FIFO = 0x6,
-	INES_AUX_XMODE = 0xa,
-};
-
-enum ines_auxd_bits {
-	INES_FOLLOWING_T1_MASK = 0x3,
-	INES_FOLLOWING_T1_500ns = 0x0,
-	INES_FOLLOWING_T1_350ns = 0x1,
-	INES_FOLLOWING_T1_250ns = 0x2,
-	INES_INITIAL_TI_MASK = 0xc,
-	INES_INITIAL_T1_2000ns = 0x0,
-	INES_INITIAL_T1_1100ns = 0x4,
-	INES_INITIAL_T1_700ns = 0x8,
-	INES_T6_2us = 0x0,
-	INES_T6_50us = 0x10,
-};
-
-#endif	// _INES_GPIB_H
diff --git a/drivers/staging/gpib/ines/ines_gpib.c b/drivers/staging/gpib/ines/ines_gpib.c
deleted file mode 100644
index a3cf846fd0f9..000000000000
--- a/drivers/staging/gpib/ines/ines_gpib.c
+++ /dev/null
@@ -1,1500 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *    copyright		   : (C) 1999 Axel Dziemba (axel.dziemba@ines.de)
- *			    (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include "ines.h"
-
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/bitops.h>
-#include <asm/dma.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include "gpib_pci_ids.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for Ines iGPIB 72010");
-
-static irqreturn_t ines_interrupt(struct gpib_board *board);
-
-static int ines_line_status(const struct gpib_board *board)
-{
-	int status = VALID_ALL;
-	int bcm_bits;
-	struct ines_priv *ines_priv;
-
-	ines_priv = board->private_data;
-
-	bcm_bits = ines_inb(ines_priv, BUS_CONTROL_MONITOR);
-
-	if (bcm_bits & BCM_REN_BIT)
-		status |= BUS_REN;
-	if (bcm_bits & BCM_IFC_BIT)
-		status |= BUS_IFC;
-	if (bcm_bits & BCM_SRQ_BIT)
-		status |= BUS_SRQ;
-	if (bcm_bits & BCM_EOI_BIT)
-		status |= BUS_EOI;
-	if (bcm_bits & BCM_NRFD_BIT)
-		status |= BUS_NRFD;
-	if (bcm_bits & BCM_NDAC_BIT)
-		status |= BUS_NDAC;
-	if (bcm_bits & BCM_DAV_BIT)
-		status |= BUS_DAV;
-	if (bcm_bits & BCM_ATN_BIT)
-		status |= BUS_ATN;
-
-	return status;
-}
-
-static void ines_set_xfer_counter(struct ines_priv *priv, unsigned int count)
-{
-	if (count > 0xffff) {
-		pr_err("bug! tried to set xfer counter > 0xffff\n");
-		return;
-	}
-	ines_outb(priv, (count >> 8) & 0xff, XFER_COUNT_UPPER);
-	ines_outb(priv, count & 0xff, XFER_COUNT_LOWER);
-}
-
-static int ines_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-	unsigned int retval;
-
-	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
-
-	if (nano_sec <= 250) {
-		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_250ns |
-			   INES_INITIAL_T1_2000ns, AUXMR);
-		retval = 250;
-	} else if (nano_sec <= 350) {
-		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_350ns |
-			   INES_INITIAL_T1_2000ns, AUXMR);
-		retval = 350;
-	} else {
-		write_byte(nec_priv, INES_AUXD | INES_FOLLOWING_T1_500ns |
-			   INES_INITIAL_T1_2000ns, AUXMR);
-		retval = 500;
-	}
-
-	return retval;
-}
-
-static inline unsigned short num_in_fifo_bytes(struct ines_priv *ines_priv)
-{
-	return ines_inb(ines_priv, IN_FIFO_COUNT);
-}
-
-static ssize_t pio_read(struct gpib_board *board, struct ines_priv *ines_priv, u8 *buffer,
-			size_t length, size_t *nbytes)
-{
-	ssize_t retval = 0;
-	unsigned int num_fifo_bytes, i;
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-
-	*nbytes = 0;
-	while (*nbytes < length) {
-		if (wait_event_interruptible(board->wait,
-					     num_in_fifo_bytes(ines_priv) ||
-					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					     test_bit(TIMO_NUM, &board->status)))
-			return -ERESTARTSYS;
-
-		if (test_bit(TIMO_NUM, &board->status))
-			return -ETIMEDOUT;
-		if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-			return -EINTR;
-
-		num_fifo_bytes = num_in_fifo_bytes(ines_priv);
-		if (num_fifo_bytes + *nbytes > length)
-			num_fifo_bytes = length - *nbytes;
-
-		for (i = 0; i < num_fifo_bytes; i++)
-			buffer[(*nbytes)++] = read_byte(nec_priv, DIR);
-		if (test_bit(RECEIVED_END_BN, &nec_priv->state) &&
-		    num_in_fifo_bytes(ines_priv) == 0)
-			break;
-		if (need_resched())
-			schedule();
-	}
-	/* make sure RECEIVED_END is in sync */
-	ines_interrupt(board);
-	return retval;
-}
-
-static int ines_accel_read(struct gpib_board *board, u8 *buffer,
-			   size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-	int counter_setting;
-
-	*end = 0;
-	*bytes_read = 0;
-	if (length == 0)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
-
-	write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
-
-	// clear in fifo
-	nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT, 0);
-	nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT, IN_FIFO_ENABLE_BIT);
-
-	ines_priv->extend_mode_bits |= LAST_BYTE_HANDLING_BIT;
-	ines_priv->extend_mode_bits &= ~XFER_COUNTER_OUTPUT_BIT & ~XFER_COUNTER_ENABLE_BIT;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-
-	counter_setting = length - num_in_fifo_bytes(ines_priv);
-	if (counter_setting > 0) {
-		ines_set_xfer_counter(ines_priv, length);
-		ines_priv->extend_mode_bits |= XFER_COUNTER_ENABLE_BIT;
-		ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-
-		// holdoff on END
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
-		/* release rfd holdoff */
-		write_byte(nec_priv, AUX_FH, AUXMR);
-	}
-
-	retval = pio_read(board, ines_priv, buffer, length, bytes_read);
-	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-	if (retval < 0)	{
-		write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
-		return retval;
-	}
-	if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
-		*end = 1;
-
-	return retval;
-}
-
-static const int out_fifo_size = 0xff;
-
-static inline unsigned short num_out_fifo_bytes(struct ines_priv *ines_priv)
-{
-	return ines_inb(ines_priv, OUT_FIFO_COUNT);
-}
-
-static int ines_write_wait(struct gpib_board *board, struct ines_priv *ines_priv,
-			   unsigned int fifo_threshold)
-{
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-
-	// wait until byte is ready to be sent
-	if (wait_event_interruptible(board->wait,
-				     num_out_fifo_bytes(ines_priv) < fifo_threshold ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		return -ERESTARTSYS;
-
-	if (test_bit(BUS_ERROR_BN, &nec_priv->state))
-		return -EIO;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		return -EINTR;
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-
-	return 0;
-}
-
-static int ines_accel_write(struct gpib_board *board, u8 *buffer, size_t length,
-			    int send_eoi, size_t *bytes_written)
-{
-	size_t count = 0;
-	ssize_t retval = 0;
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-	unsigned int num_bytes, i;
-
-	*bytes_written = 0;
-	// clear out fifo
-	nec7210_set_reg_bits(nec_priv, ADMR, OUT_FIFO_ENABLE_BIT, 0);
-	nec7210_set_reg_bits(nec_priv, ADMR, OUT_FIFO_ENABLE_BIT, OUT_FIFO_ENABLE_BIT);
-
-	ines_priv->extend_mode_bits |= XFER_COUNTER_OUTPUT_BIT;
-	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
-	ines_priv->extend_mode_bits &= ~LAST_BYTE_HANDLING_BIT;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-
-	ines_set_xfer_counter(ines_priv, length);
-	if (send_eoi)
-		ines_priv->extend_mode_bits |= LAST_BYTE_HANDLING_BIT;
-	ines_priv->extend_mode_bits |= XFER_COUNTER_ENABLE_BIT;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-
-	while (count < length) {
-		retval = ines_write_wait(board, ines_priv, out_fifo_size);
-		if (retval < 0)
-			break;
-
-		num_bytes = out_fifo_size - num_out_fifo_bytes(ines_priv);
-		if (num_bytes + count > length)
-			num_bytes = length - count;
-		for (i = 0; i < num_bytes; i++)
-			write_byte(nec_priv, buffer[count++], CDOR);
-	}
-	if (retval < 0)	{
-		ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
-		ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-		*bytes_written = length - num_out_fifo_bytes(ines_priv);
-		return retval;
-	}
-	// wait last byte has been sent
-	retval = ines_write_wait(board, ines_priv, 1);
-	ines_priv->extend_mode_bits &= ~XFER_COUNTER_ENABLE_BIT;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-	*bytes_written = length - num_out_fifo_bytes(ines_priv);
-
-	return retval;
-}
-
-static irqreturn_t ines_pci_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct ines_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-
-	if (priv->pci_chip_type == PCI_CHIP_QUANCOM) {
-		if ((inb(nec_priv->iobase +
-			 QUANCOM_IRQ_CONTROL_STATUS_REG) &
-		     QUANCOM_IRQ_ASSERTED_BIT))
-			outb(QUANCOM_IRQ_ENABLE_BIT, nec_priv->iobase +
-			     QUANCOM_IRQ_CONTROL_STATUS_REG);
-	}
-
-	return ines_interrupt(board);
-}
-
-static irqreturn_t ines_interrupt(struct gpib_board *board)
-{
-	struct ines_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-	unsigned int isr3_bits, isr4_bits;
-	unsigned long flags;
-	int wake = 0;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	nec7210_interrupt(board, nec_priv);
-	isr3_bits = ines_inb(priv, ISR3);
-	isr4_bits = ines_inb(priv, ISR4);
-	if (isr3_bits & IFC_ACTIVE_BIT)	{
-		push_gpib_event(board, EVENT_IFC);
-		wake++;
-	}
-	if (isr3_bits & FIFO_ERROR_BIT)
-		dev_err(board->gpib_dev, "fifo error\n");
-	if (isr3_bits & XFER_COUNT_BIT)
-		wake++;
-
-	if (isr4_bits & (IN_FIFO_WATERMARK_BIT | IN_FIFO_FULL_BIT | OUT_FIFO_WATERMARK_BIT |
-			 OUT_FIFO_EMPTY_BIT))
-		wake++;
-
-	if (wake)
-		wake_up_interruptible(&board->wait);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return IRQ_HANDLED;
-}
-
-static int ines_pci_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static int ines_pci_accel_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static int ines_isa_attach(struct gpib_board *board, const struct gpib_board_config *config);
-
-static void ines_pci_detach(struct gpib_board *board);
-static void ines_isa_detach(struct gpib_board *board);
-
-enum ines_pci_vendor_ids {
-	PCI_VENDOR_ID_INES_QUICKLOGIC = 0x16da
-};
-
-enum ines_pci_device_ids {
-	PCI_DEVICE_ID_INES_GPIB_AMCC = 0x8507,
-	PCI_DEVICE_ID_INES_GPIB_QL5030 = 0x11,
-};
-
-enum ines_pci_subdevice_ids {
-	PCI_SUBDEVICE_ID_INES_GPIB = 0x1072
-};
-
-static struct pci_device_id ines_pci_table[] = {
-	{PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050, PCI_VENDOR_ID_PLX,
-	 PCI_SUBDEVICE_ID_INES_GPIB, 0, 0, 0},
-	{PCI_VENDOR_ID_AMCC, PCI_DEVICE_ID_INES_GPIB_AMCC, PCI_VENDOR_ID_AMCC,
-	 PCI_SUBDEVICE_ID_INES_GPIB, 0, 0, 0},
-	{PCI_VENDOR_ID_INES_QUICKLOGIC, PCI_DEVICE_ID_INES_GPIB_QL5030,
-	 PCI_VENDOR_ID_INES_QUICKLOGIC, PCI_DEVICE_ID_INES_GPIB_QL5030, 0, 0, 0},
-	{PCI_DEVICE(PCI_VENDOR_ID_QUANCOM, PCI_DEVICE_ID_QUANCOM_GPIB)},
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, ines_pci_table);
-
-struct ines_pci_id {
-	unsigned int vendor_id;
-	unsigned int device_id;
-	int subsystem_vendor_id;
-	int subsystem_device_id;
-	unsigned int gpib_region;
-	unsigned int io_offset;
-	enum ines_pci_chip pci_chip_type;
-};
-
-static struct ines_pci_id pci_ids[] = {
-	{.vendor_id = PCI_VENDOR_ID_PLX,
-	 .device_id = PCI_DEVICE_ID_PLX_9050,
-	 .subsystem_vendor_id = PCI_VENDOR_ID_PLX,
-	 .subsystem_device_id = PCI_SUBDEVICE_ID_INES_GPIB,
-	 .gpib_region = 2,
-	 .io_offset = 1,
-	 .pci_chip_type = PCI_CHIP_PLX9050,
-	},
-	{.vendor_id = PCI_VENDOR_ID_AMCC,
-	 .device_id = PCI_DEVICE_ID_INES_GPIB_AMCC,
-	 .subsystem_vendor_id = PCI_VENDOR_ID_AMCC,
-	 .subsystem_device_id = PCI_SUBDEVICE_ID_INES_GPIB,
-	 .gpib_region = 1,
-	 .io_offset = 1,
-	 .pci_chip_type = PCI_CHIP_AMCC5920,
-	},
-	{.vendor_id = PCI_VENDOR_ID_INES_QUICKLOGIC,
-	 .device_id = PCI_DEVICE_ID_INES_GPIB_QL5030,
-	 .subsystem_vendor_id = PCI_VENDOR_ID_INES_QUICKLOGIC,
-	 .subsystem_device_id = PCI_DEVICE_ID_INES_GPIB_QL5030,
-	 .gpib_region = 1,
-	 .io_offset = 1,
-	 .pci_chip_type = PCI_CHIP_QUICKLOGIC5030,
-	},
-	{.vendor_id = PCI_VENDOR_ID_QUANCOM,
-	 .device_id = PCI_DEVICE_ID_QUANCOM_GPIB,
-	 .subsystem_vendor_id = -1,
-	 .subsystem_device_id = -1,
-	 .gpib_region = 0,
-	 .io_offset = 4,
-	 .pci_chip_type = PCI_CHIP_QUANCOM,
-	},
-};
-
-static const int num_pci_chips = ARRAY_SIZE(pci_ids);
-
-// wrappers for interface functions
-static int ines_read(struct gpib_board *board, u8 *buffer, size_t length,
-		     int *end, size_t *bytes_read)
-{
-	struct ines_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-	ssize_t retval;
-	int dummy;
-
-	retval = nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-	if (retval < 0)	{
-		write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
-
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-
-		nec7210_read_data_in(board, nec_priv, &dummy);
-	}
-	return retval;
-}
-
-static int ines_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-		      size_t *bytes_written)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int ines_command(struct gpib_board *board, u8 *buffer, size_t length, size_t *bytes_written)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int ines_take_control(struct gpib_board *board, int synchronous)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int ines_go_to_standby(struct gpib_board *board)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int ines_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_request_system_control(board, &priv->nec7210_priv, request_control);
-}
-
-static void ines_interface_clear(struct gpib_board *board, int assert)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void ines_remote_enable(struct gpib_board *board, int enable)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int ines_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void ines_disable_eos(struct gpib_board *board)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int ines_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
-}
-
-static int ines_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int ines_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int ines_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
-}
-
-static void ines_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, config);
-}
-
-static void ines_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-static void ines_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
-}
-
-static u8 ines_serial_poll_status(struct gpib_board *board)
-{
-	struct ines_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static void ines_return_to_local(struct gpib_board *board)
-{
-	struct ines_priv *priv = board->private_data;
-
-	nec7210_return_to_local(board, &priv->nec7210_priv);
-}
-
-static struct gpib_interface ines_pci_unaccel_interface = {
-	.name = "ines_pci_unaccel",
-	.attach = ines_pci_attach,
-	.detach = ines_pci_detach,
-	.read = ines_read,
-	.write = ines_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static struct gpib_interface ines_pci_interface = {
-	.name = "ines_pci",
-	.attach = ines_pci_accel_attach,
-	.detach = ines_pci_detach,
-	.read = ines_accel_read,
-	.write = ines_accel_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static struct gpib_interface ines_pci_accel_interface = {
-	.name = "ines_pci_accel",
-	.attach = ines_pci_accel_attach,
-	.detach = ines_pci_detach,
-	.read = ines_accel_read,
-	.write = ines_accel_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static struct gpib_interface ines_isa_interface = {
-	.name = "ines_isa",
-	.attach = ines_isa_attach,
-	.detach = ines_isa_detach,
-	.read = ines_accel_read,
-	.write = ines_accel_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static int ines_allocate_private(struct gpib_board *board)
-{
-	struct ines_priv *priv;
-
-	board->private_data = kmalloc(sizeof(struct ines_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	priv = board->private_data;
-	memset(priv, 0, sizeof(struct ines_priv));
-	init_nec7210_private(&priv->nec7210_priv);
-	return 0;
-}
-
-static void ines_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static int ines_generic_attach(struct gpib_board *board)
-{
-	struct ines_priv *ines_priv;
-	struct nec7210_priv *nec_priv;
-
-	board->status = 0;
-
-	if (ines_allocate_private(board))
-		return -ENOMEM;
-	ines_priv = board->private_data;
-	nec_priv = &ines_priv->nec7210_priv;
-	nec_priv->read_byte = nec7210_ioport_read_byte;
-	nec_priv->write_byte = nec7210_ioport_write_byte;
-	nec_priv->offset = 1;
-	nec_priv->type = IGPIB7210;
-	ines_priv->pci_chip_type = PCI_CHIP_NONE;
-
-	return 0;
-}
-
-static void ines_online(struct ines_priv *ines_priv, const struct gpib_board *board, int use_accel)
-{
-	struct nec7210_priv *nec_priv = &ines_priv->nec7210_priv;
-
-	/* ines doesn't seem to use internal count register */
-	write_byte(nec_priv, ICR | 0, AUXMR);
-
-	write_byte(nec_priv, INES_AUX_XMODE, AUXMR);
-	write_byte(nec_priv, INES_RFD_HLD_IMMEDIATE, AUXMR);
-
-	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-
-	write_byte(nec_priv, INES_AUXD | 0, AUXMR);
-	ines_outb(ines_priv, 0, XDMA_CONTROL);
-	ines_priv->extend_mode_bits = 0;
-	ines_outb(ines_priv, ines_priv->extend_mode_bits, EXTEND_MODE);
-	if (use_accel) {
-		ines_outb(ines_priv, 0x80, OUT_FIFO_WATERMARK);
-		ines_outb(ines_priv, 0x80, IN_FIFO_WATERMARK);
-		ines_outb(ines_priv, IFC_ACTIVE_BIT | ATN_ACTIVE_BIT |
-			  FIFO_ERROR_BIT | XFER_COUNT_BIT, IMR3);
-		ines_outb(ines_priv, IN_FIFO_WATERMARK_BIT | IN_FIFO_FULL_BIT |
-			  OUT_FIFO_WATERMARK_BIT | OUT_FIFO_EMPTY_BIT, IMR4);
-	} else {
-		nec7210_set_reg_bits(nec_priv, ADMR, IN_FIFO_ENABLE_BIT | OUT_FIFO_ENABLE_BIT, 0);
-		ines_outb(ines_priv, IFC_ACTIVE_BIT | FIFO_ERROR_BIT, IMR3);
-		ines_outb(ines_priv, 0, IMR4);
-	}
-
-	nec7210_board_online(nec_priv, board);
-	if (use_accel)
-		nec7210_set_reg_bits(nec_priv, IMR1, HR_DOIE | HR_DIIE, 0);
-}
-
-static int ines_common_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	int retval;
-	struct ines_pci_id found_id;
-	unsigned int i;
-	struct pci_dev *pdev;
-
-	memset(&found_id, 0, sizeof(found_id));
-
-	retval = ines_generic_attach(board);
-	if (retval)
-		return retval;
-
-	ines_priv = board->private_data;
-	nec_priv = &ines_priv->nec7210_priv;
-
-	// find board
-	ines_priv->pci_device = NULL;
-	for (i = 0; i < num_pci_chips && !ines_priv->pci_device; i++) {
-		pdev = NULL;
-		do {
-			if (pci_ids[i].subsystem_vendor_id >= 0 &&
-			    pci_ids[i].subsystem_device_id >= 0)
-				pdev = pci_get_subsys(pci_ids[i].vendor_id, pci_ids[i].device_id,
-						      pci_ids[i].subsystem_vendor_id,
-						      pci_ids[i].subsystem_device_id, pdev);
-			else
-				pdev = pci_get_device(pci_ids[i].vendor_id, pci_ids[i].device_id,
-						      pdev);
-			if (!pdev)
-				break;
-			if (config->pci_bus >= 0 && config->pci_bus != pdev->bus->number)
-				continue;
-			if (config->pci_slot >= 0 && config->pci_slot != PCI_SLOT(pdev->devfn))
-				continue;
-			found_id = pci_ids[i];
-			ines_priv->pci_device = pdev;
-			break;
-		} while (1);
-	}
-	if (!ines_priv->pci_device) {
-		dev_err(board->gpib_dev, "could not find ines PCI board\n");
-		return -1;
-	}
-
-	if (pci_enable_device(ines_priv->pci_device)) {
-		dev_err(board->gpib_dev, "error enabling pci device\n");
-		return -1;
-	}
-
-	if (pci_request_regions(ines_priv->pci_device, DRV_NAME))
-		return -1;
-	nec_priv->iobase = pci_resource_start(ines_priv->pci_device,
-					      found_id.gpib_region);
-
-	ines_priv->pci_chip_type = found_id.pci_chip_type;
-	nec_priv->offset = found_id.io_offset;
-	switch (ines_priv->pci_chip_type) {
-	case PCI_CHIP_PLX9050:
-		ines_priv->plx_iobase = pci_resource_start(ines_priv->pci_device, 1);
-		break;
-	case PCI_CHIP_AMCC5920:
-		ines_priv->amcc_iobase = pci_resource_start(ines_priv->pci_device, 0);
-		break;
-	case PCI_CHIP_QUANCOM:
-		break;
-	case PCI_CHIP_QUICKLOGIC5030:
-		break;
-	default:
-		dev_err(board->gpib_dev, "unspecified chip type? (bug)\n");
-		nec_priv->iobase = 0;
-		pci_release_regions(ines_priv->pci_device);
-		return -1;
-	}
-
-	nec7210_board_reset(nec_priv, board);
-#ifdef QUANCOM_PCI
-	if (ines_priv->pci_chip_type == PCI_CHIP_QUANCOM) {
-		/* change interrupt polarity */
-		nec_priv->auxb_bits |= HR_INV;
-		ines_outb(ines_priv, nec_priv->auxb_bits, AUXMR);
-	}
-#endif
-	isr_flags |= IRQF_SHARED;
-	if (request_irq(ines_priv->pci_device->irq, ines_pci_interrupt, isr_flags,
-			DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "can't request IRQ %d\n", ines_priv->pci_device->irq);
-		return -1;
-	}
-	ines_priv->irq = ines_priv->pci_device->irq;
-
-	// enable interrupts on pci chip
-	switch (ines_priv->pci_chip_type) {
-	case PCI_CHIP_PLX9050:
-		outl(PLX9050_LINTR1_EN_BIT | PLX9050_LINTR1_POLARITY_BIT | PLX9050_PCI_INTR_EN_BIT,
-		     ines_priv->plx_iobase + PLX9050_INTCSR_REG);
-		break;
-	case PCI_CHIP_AMCC5920:
-	{
-		static const int region = 1;
-		static const int num_wait_states = 7;
-		u32 bits;
-
-		bits = amcc_prefetch_bits(region, PREFETCH_DISABLED);
-		bits |= amcc_PTADR_mode_bit(region);
-		bits |= amcc_disable_write_fifo_bit(region);
-		bits |= amcc_wait_state_bits(region, num_wait_states);
-		outl(bits, ines_priv->amcc_iobase + AMCC_PASS_THRU_REG);
-		outl(AMCC_ADDON_INTR_ENABLE_BIT, ines_priv->amcc_iobase + AMCC_INTCS_REG);
-	}
-	break;
-	case PCI_CHIP_QUANCOM:
-		outb(QUANCOM_IRQ_ENABLE_BIT, nec_priv->iobase +
-		     QUANCOM_IRQ_CONTROL_STATUS_REG);
-		break;
-	case PCI_CHIP_QUICKLOGIC5030:
-		break;
-	default:
-		dev_err(board->gpib_dev, "unspecified chip type? (bug)\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-static int ines_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	int retval;
-
-	retval = ines_common_pci_attach(board, config);
-	if (retval < 0)
-		return retval;
-
-	ines_priv = board->private_data;
-	ines_online(ines_priv, board, 0);
-
-	return 0;
-}
-
-static int ines_pci_accel_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	int retval;
-
-	retval = ines_common_pci_attach(board, config);
-	if (retval < 0)
-		return retval;
-
-	ines_priv = board->private_data;
-	ines_online(ines_priv, board, 1);
-
-	return 0;
-}
-
-static const int ines_isa_iosize = 0x20;
-
-static int ines_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	int retval;
-
-	retval = ines_generic_attach(board);
-	if (retval)
-		return retval;
-
-	ines_priv = board->private_data;
-	nec_priv = &ines_priv->nec7210_priv;
-
-	if (!request_region(config->ibbase, ines_isa_iosize, DRV_NAME)) {
-		dev_err(board->gpib_dev, "ioports at 0x%x already in use\n",
-			config->ibbase);
-		return -EBUSY;
-	}
-	nec_priv->iobase = config->ibbase;
-	nec_priv->offset = 1;
-	nec7210_board_reset(nec_priv, board);
-	if (request_irq(config->ibirq, ines_pci_interrupt, isr_flags, DRV_NAME, board)) {
-		dev_err(board->gpib_dev, "failed to allocate IRQ %d\n", config->ibirq);
-		return -1;
-	}
-	ines_priv->irq = config->ibirq;
-	ines_online(ines_priv, board, 1);
-	return 0;
-}
-
-static void ines_pci_detach(struct gpib_board *board)
-{
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (ines_priv) {
-		nec_priv = &ines_priv->nec7210_priv;
-		if (ines_priv->irq) {
-			// disable interrupts
-			switch (ines_priv->pci_chip_type) {
-			case PCI_CHIP_AMCC5920:
-				if (ines_priv->plx_iobase)
-					outl(0, ines_priv->plx_iobase + PLX9050_INTCSR_REG);
-				break;
-			case PCI_CHIP_QUANCOM:
-				if (nec_priv->iobase)
-					outb(0, nec_priv->iobase +
-					     QUANCOM_IRQ_CONTROL_STATUS_REG);
-				break;
-			default:
-				break;
-			}
-			free_irq(ines_priv->irq, board);
-		}
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			pci_release_regions(ines_priv->pci_device);
-		}
-		if (ines_priv->pci_device)
-			pci_dev_put(ines_priv->pci_device);
-	}
-	ines_free_private(board);
-}
-
-static void ines_isa_detach(struct gpib_board *board)
-{
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (ines_priv) {
-		nec_priv = &ines_priv->nec7210_priv;
-		if (ines_priv->irq)
-			free_irq(ines_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			release_region(nec_priv->iobase, ines_isa_iosize);
-		}
-	}
-	ines_free_private(board);
-}
-
-static int ines_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return 0;
-}
-
-static struct pci_driver ines_pci_driver = {
-	.name = "ines_gpib",
-	.id_table = ines_pci_table,
-	.probe = &ines_pci_probe
-};
-
-#ifdef CONFIG_GPIB_PCMCIA
-
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/timer.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/ds.h>
-#include <pcmcia/cisreg.h>
-
-static const int ines_pcmcia_iosize = 0x20;
-
-/*
- * The event() function is this driver's Card Services event handler.
- * It will be called by Card Services when an appropriate card status
- * event is received.  The config() and release() entry points are
- * used to configure or release a socket, in response to card insertion
- * and ejection events.  They are invoked from the gpib event
- * handler.
- */
-
-static int ines_gpib_config(struct pcmcia_device  *link);
-static void ines_gpib_release(struct pcmcia_device  *link);
-static int ines_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config);
-static int ines_pcmcia_accel_attach(struct gpib_board *board,
-				    const struct gpib_board_config *config);
-static void ines_pcmcia_detach(struct gpib_board *board);
-static int ines_common_pcmcia_attach(struct gpib_board *board);
-/*
- * A linked list of "instances" of the gpib device.  Each actual
- * PCMCIA card corresponds to one device instance, and is described
- * by one dev_link_t structure (defined in ds.h).
- *
- * You may not want to use a linked list for this -- for example, the
- * memory card driver uses an array of dev_link_t pointers, where minor
- * device numbers are used to derive the corresponding array index.
- */
-
-static struct pcmcia_device *curr_dev;
-
-/*
- * A dev_link_t structure has fields for most things that are needed
- * to keep track of a socket, but there will usually be some device
- * specific information that also needs to be kept track of.  The
- * 'priv' pointer in a dev_link_t structure can be used to point to
- * a device-specific private data structure, like this.
- *
- * A driver needs to provide a dev_node_t structure for each device
- * on a card.	In some cases, there is only one device per card (for
- * example, ethernet cards, modems).  In other cases, there may be
- * many actual or logical devices (SCSI adapters, memory cards with
- * multiple partitions).  The dev_node_t structures need to be kept
- * in a linked list starting at the 'dev' field of a dev_link_t
- * structure.	We allocate them in the card's private data structure,
- * because they generally can't be allocated dynamically.
- */
-
-struct local_info {
-	struct pcmcia_device	*p_dev;
-	struct gpib_board		*dev;
-	u_short manfid;
-	u_short cardid;
-};
-
-/*
- * gpib_attach() creates an "instance" of the driver, allocating
- * local data structures for one device.  The device is registered
- * with Card Services.
- *
- * The dev_link structure is initialized, but we don't actually
- * configure the card at this point -- we wait until we receive a
- * card insertion event.
- */
-static int ines_gpib_probe(struct pcmcia_device *link)
-{
-	struct local_info *info;
-
-//	int ret, i;
-
-	/* Allocate space for private device-specific data */
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	info->p_dev = link;
-	link->priv = info;
-
-	/* The io structure describes IO port mapping */
-	link->resource[0]->end = 32;
-	link->resource[0]->flags &= ~IO_DATA_PATH_WIDTH;
-	link->resource[0]->flags |= IO_DATA_PATH_WIDTH_8;
-	link->io_lines = 5;
-
-	/* General socket configuration */
-	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
-
-	/* Register with Card Services */
-	curr_dev = link;
-	return ines_gpib_config(link);
-}
-
-/*
- * This deletes a driver "instance".	The device is de-registered
- * with Card Services.  If it has been released, all local data
- * structures are freed.  Otherwise, the structures will be freed
- * when the device is released.
- */
-static void ines_gpib_remove(struct pcmcia_device *link)
-{
-	struct local_info *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (info->dev)
-		ines_pcmcia_detach(info->dev);
-	ines_gpib_release(link);
-
-	//free_netdev(dev);
-	kfree(info);
-}
-
-static int ines_gpib_config_iteration(struct pcmcia_device *link, void *priv_data)
-{
-	return pcmcia_request_io(link);
-}
-
-/*
- * gpib_config() is scheduled to run after a CARD_INSERTION event
- * is received, to configure the PCMCIA socket, and to make the
- * device available to the system.
- */
-static int ines_gpib_config(struct pcmcia_device *link)
-{
-	int retval;
-	void __iomem *virt;
-
-	retval = pcmcia_loop_config(link, &ines_gpib_config_iteration, NULL);
-	if (retval) {
-		dev_warn(&link->dev, "no configuration found\n");
-		ines_gpib_release(link);
-		return -ENODEV;
-	}
-
-	dev_dbg(&link->dev, "ines_cs: manufacturer: 0x%x card: 0x%x\n",
-		link->manf_id, link->card_id);
-
-	/*
-	 * for the ines card we have to setup the configuration registers in
-	 * attribute memory here
-	 */
-	link->resource[2]->flags |= WIN_MEMORY_TYPE_AM | WIN_DATA_WIDTH_8 | WIN_ENABLE;
-	link->resource[2]->end = 0x1000;
-	retval = pcmcia_request_window(link, link->resource[2], 250);
-	if (retval) {
-		dev_warn(&link->dev, "pcmcia_request_window failed\n");
-		ines_gpib_release(link);
-		return -ENODEV;
-	}
-	retval = pcmcia_map_mem_page(link, link->resource[2], 0);
-	if (retval) {
-		dev_warn(&link->dev, "pcmcia_map_mem_page failed\n");
-		ines_gpib_release(link);
-		return -ENODEV;
-	}
-	virt = ioremap(link->resource[2]->start, resource_size(link->resource[2]));
-	writeb((link->resource[2]->start >> 2) & 0xff, virt + 0xf0); // IOWindow base
-	iounmap(virt);
-
-	/*
-	 * This actually configures the PCMCIA socket -- setting up
-	 * the I/O windows and the interrupt mapping.
-	 */
-	retval = pcmcia_enable_device(link);
-	if (retval) {
-		ines_gpib_release(link);
-		return -ENODEV;
-	}
-	return 0;
-} /* gpib_config */
-
-/*
- * After a card is removed, gpib_release() will unregister the net
- * device, and release the PCMCIA configuration.  If the device is
- * still open, this will be postponed until it is closed.
- */
-
-static void ines_gpib_release(struct pcmcia_device *link)
-{
-	pcmcia_disable_device(link);
-} /* gpib_release */
-
-static int ines_gpib_suspend(struct pcmcia_device *link)
-{
-	//struct local_info *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (link->open)
-		dev_err(&link->dev, "Device still open\n");
-	//netif_device_detach(dev);
-
-	return 0;
-}
-
-static int ines_gpib_resume(struct pcmcia_device *link)
-{
-	//struct local_info_t *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	/*if (link->open) {
-	 *	ni_gpib_probe(dev);	/ really?
-	 *	//netif_device_attach(dev);
-	 *}
-	 */
-	return ines_gpib_config(link);
-}
-
-static struct pcmcia_device_id ines_pcmcia_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x01b4, 0x4730),
-	PCMCIA_DEVICE_NULL
-};
-MODULE_DEVICE_TABLE(pcmcia, ines_pcmcia_ids);
-
-static struct pcmcia_driver ines_gpib_cs_driver = {
-	.owner		= THIS_MODULE,
-	.name		= "ines_gpib_cs",
-	.id_table	= ines_pcmcia_ids,
-	.probe		= ines_gpib_probe,
-	.remove		= ines_gpib_remove,
-	.suspend	= ines_gpib_suspend,
-	.resume		= ines_gpib_resume,
-};
-
-static void ines_pcmcia_cleanup_module(void)
-{
-	pcmcia_unregister_driver(&ines_gpib_cs_driver);
-}
-
-static struct gpib_interface ines_pcmcia_unaccel_interface = {
-	.name = "ines_pcmcia_unaccel",
-	.attach = ines_pcmcia_attach,
-	.detach = ines_pcmcia_detach,
-	.read = ines_read,
-	.write = ines_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static struct gpib_interface ines_pcmcia_accel_interface = {
-	.name = "ines_pcmcia_accel",
-	.attach = ines_pcmcia_accel_attach,
-	.detach = ines_pcmcia_detach,
-	.read = ines_accel_read,
-	.write = ines_accel_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static struct gpib_interface ines_pcmcia_interface = {
-	.name = "ines_pcmcia",
-	.attach = ines_pcmcia_accel_attach,
-	.detach = ines_pcmcia_detach,
-	.read = ines_accel_read,
-	.write = ines_accel_write,
-	.command = ines_command,
-	.take_control = ines_take_control,
-	.go_to_standby = ines_go_to_standby,
-	.request_system_control = ines_request_system_control,
-	.interface_clear = ines_interface_clear,
-	.remote_enable = ines_remote_enable,
-	.enable_eos = ines_enable_eos,
-	.disable_eos = ines_disable_eos,
-	.parallel_poll = ines_parallel_poll,
-	.parallel_poll_configure = ines_parallel_poll_configure,
-	.parallel_poll_response = ines_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ines_line_status,
-	.update_status = ines_update_status,
-	.primary_address = ines_primary_address,
-	.secondary_address = ines_secondary_address,
-	.serial_poll_response = ines_serial_poll_response,
-	.serial_poll_status = ines_serial_poll_status,
-	.t1_delay = ines_t1_delay,
-	.return_to_local = ines_return_to_local,
-};
-
-static irqreturn_t ines_pcmcia_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-
-	return ines_interrupt(board);
-}
-
-static int ines_common_pcmcia_attach(struct gpib_board *board)
-{
-	struct ines_priv *ines_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-
-	if (!curr_dev) {
-		dev_err(board->gpib_dev, "no ines pcmcia cards found\n");
-		return -1;
-	}
-
-	retval = ines_generic_attach(board);
-	if (retval)
-		return retval;
-
-	ines_priv = board->private_data;
-	nec_priv = &ines_priv->nec7210_priv;
-
-	if (!request_region(curr_dev->resource[0]->start,
-			    resource_size(curr_dev->resource[0]), DRV_NAME)) {
-		dev_err(board->gpib_dev, "ioports at 0x%lx already in use\n",
-			(unsigned long)(curr_dev->resource[0]->start));
-		return -1;
-	}
-
-	nec_priv->iobase = curr_dev->resource[0]->start;
-
-	nec7210_board_reset(nec_priv, board);
-
-	if (request_irq(curr_dev->irq, ines_pcmcia_interrupt, IRQF_SHARED,
-			"pcmcia-gpib", board))	{
-		dev_err(board->gpib_dev, "can't request IRQ %d\n", curr_dev->irq);
-		return -1;
-	}
-	ines_priv->irq = curr_dev->irq;
-
-	return 0;
-}
-
-static int ines_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	int retval;
-
-	retval = ines_common_pcmcia_attach(board);
-	if (retval < 0)
-		return retval;
-
-	ines_priv = board->private_data;
-	ines_online(ines_priv, board, 0);
-
-	return 0;
-}
-
-static int ines_pcmcia_accel_attach(struct gpib_board *board,
-				    const struct gpib_board_config *config)
-{
-	struct ines_priv *ines_priv;
-	int retval;
-
-	retval = ines_common_pcmcia_attach(board);
-	if (retval < 0)
-		return retval;
-
-	ines_priv = board->private_data;
-	ines_online(ines_priv, board, 1);
-
-	return 0;
-}
-
-static void ines_pcmcia_detach(struct gpib_board *board)
-{
-	struct ines_priv *ines_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (ines_priv) {
-		nec_priv = &ines_priv->nec7210_priv;
-		if (ines_priv->irq)
-			free_irq(ines_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			release_region(nec_priv->iobase, ines_pcmcia_iosize);
-		}
-	}
-	ines_free_private(board);
-}
-
-#endif /* CONFIG_GPIB_PCMCIA */
-
-static int __init ines_init_module(void)
-{
-	int ret;
-
-	ret = pci_register_driver(&ines_pci_driver);
-	if (ret) {
-		pr_err("pci_register_driver failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&ines_pci_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci;
-	}
-
-	ret = gpib_register_driver(&ines_pci_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci_unaccel;
-	}
-
-	ret = gpib_register_driver(&ines_pci_accel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pci_accel;
-	}
-
-	ret = gpib_register_driver(&ines_isa_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_isa;
-	}
-
-#ifdef CONFIG_GPIB_PCMCIA
-	ret = gpib_register_driver(&ines_pcmcia_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia;
-	}
-
-	ret = gpib_register_driver(&ines_pcmcia_unaccel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_unaccel;
-	}
-
-	ret = gpib_register_driver(&ines_pcmcia_accel_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_accel;
-	}
-
-	ret = pcmcia_register_driver(&ines_gpib_cs_driver);
-	if (ret) {
-		pr_err("pcmcia_register_driver failed: error = %d\n", ret);
-		goto err_pcmcia_driver;
-	}
-#endif
-
-	return 0;
-
-#ifdef CONFIG_GPIB_PCMCIA
-err_pcmcia_driver:
-	gpib_unregister_driver(&ines_pcmcia_accel_interface);
-err_pcmcia_accel:
-	gpib_unregister_driver(&ines_pcmcia_unaccel_interface);
-err_pcmcia_unaccel:
-	gpib_unregister_driver(&ines_pcmcia_interface);
-err_pcmcia:
-#endif
-	gpib_unregister_driver(&ines_isa_interface);
-err_isa:
-	gpib_unregister_driver(&ines_pci_accel_interface);
-err_pci_accel:
-	gpib_unregister_driver(&ines_pci_unaccel_interface);
-err_pci_unaccel:
-	gpib_unregister_driver(&ines_pci_interface);
-err_pci:
-	pci_unregister_driver(&ines_pci_driver);
-
-	return ret;
-}
-
-static void __exit ines_exit_module(void)
-{
-	gpib_unregister_driver(&ines_pci_interface);
-	gpib_unregister_driver(&ines_pci_unaccel_interface);
-	gpib_unregister_driver(&ines_pci_accel_interface);
-	gpib_unregister_driver(&ines_isa_interface);
-#ifdef CONFIG_GPIB_PCMCIA
-	gpib_unregister_driver(&ines_pcmcia_interface);
-	gpib_unregister_driver(&ines_pcmcia_unaccel_interface);
-	gpib_unregister_driver(&ines_pcmcia_accel_interface);
-	ines_pcmcia_cleanup_module();
-#endif
-
-	pci_unregister_driver(&ines_pci_driver);
-}
-
-module_init(ines_init_module);
-module_exit(ines_exit_module);
diff --git a/drivers/staging/gpib/lpvo_usb_gpib/Makefile b/drivers/staging/gpib/lpvo_usb_gpib/Makefile
deleted file mode 100644
index 360553488e6d..000000000000
--- a/drivers/staging/gpib/lpvo_usb_gpib/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-
-obj-$(CONFIG_GPIB_LPVO) += lpvo_usb_gpib.o
-
diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
deleted file mode 100644
index dd68c4843490..000000000000
--- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c
+++ /dev/null
@@ -1,2025 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *  This code has been developed at the Department of Physics (University  *
- *  of Florence, Italy) to support in linux-gpib the open usb-gpib adapter *
- *  implemented at the University of Ljubljana (lpvo.fe.uni-lj.si/gpib)	   *
- *									   *
- *  copyright		 : (C) 2011 Marcello Carla'			   *
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define NAME KBUILD_MODNAME
-
-/* base module includes */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/tty.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/file.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/sched/signal.h>
-#include <linux/usb.h>
-
-#include "gpibP.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for LPVO usb devices");
-
-/*
- * Table of devices that work with this driver.
- *
- * Currently, only one device is known to be used in the
- * lpvo_usb_gpib adapter (FTDI 0403:6001).
- * If your adapter uses a different chip, insert a line
- * in the following table with proper <Vendor-id>, <Product-id>.
- *
- * To have your chip automatically handled by the driver,
- * update files "/usr/local/etc/modprobe.d/lpvo_usb_gpib.conf"
- * and /usr/local/etc/udev/rules.d/99-lpvo_usb_gpib.rules.
- *
- */
-
-static const struct usb_device_id skel_table[] = {
-	{ USB_DEVICE(0x0403, 0x6001) },
-	{ }					   /* Terminating entry */
-};
-MODULE_DEVICE_TABLE(usb, skel_table);
-
-/*
- *   ***  Diagnostics and Debug  ***
- * To enable the diagnostic and debug messages either compile with DEBUG set
- * or control via the dynamic debug mechanisms.
- * The module parameter "debug" controls the sending of debug messages to
- * syslog. By default it is set to 0
- * debug = 0: only attach/detach messages are sent
- *         1: every action is logged
- *         2: extended logging; each single exchanged byte is documented
- *	(about twice the log volume of [1])
- * To switch debug level:
- *         At module loading:  modprobe lpvo_usb_gpib debug={0,1,2}
- *         On the fly: echo {0,1,2} > /sys/modules/lpvo_usb_gpib/parameters/debug
- */
-
-static int debug;
-module_param(debug, int, 0644);
-
-#define DIA_LOG(level, format, ...)			   \
-	do { if (debug >= (level))					\
-			dev_dbg(board->gpib_dev, format, ## __VA_ARGS__); } \
-	while (0)
-
-#define WQT wait_queue_entry_t
-#define WQH head
-#define WQE entry
-
-/* standard and extended command sets of the usb-gpib adapter */
-
-#define USB_GPIB_ON	 "\nIB\n"
-#define USB_GPIB_OFF	 "\nIBO\n"
-#define USB_GPIB_IBm0	 "\nIBm0\n"   /* do not assert REN with IFC */
-#define USB_GPIB_IBm1	 "\nIBm1\n"   /* assert REN with IFC */
-#define USB_GPIB_IBCL	 "\nIBZ\n"
-#define USB_GPIB_STATUS	 "\nIBS\n"
-#define USB_GPIB_READ	 "\nIB?\n"
-#define USB_GPIB_READ_1	 "\nIBB\n"
-#define USB_GPIB_EOI	 "\nIBe0\n"
-#define USB_GPIB_FTMO	 "\nIBf0\n"    /* disable first byte timeout */
-#define USB_GPIB_TTMOZ	 "\nIBt0\n"    /* disable byte timeout */
-
-/* incomplete commands */
-
-#define USB_GPIB_BTMO	 "\nIBt"      /* set byte timeout */
-#define USB_GPIB_TTMO	 "\nIBT"      /* set total timeout */
-
-#define USB_GPIB_DEBUG_ON    "\nIBDE\xAA\n"
-#define USB_GPIB_SET_LISTEN  "\nIBDT0\n"
-#define USB_GPIB_SET_TALK    "\nIBDT1\n"
-#define USB_GPIB_SET_LINES   "\nIBDC.\n"
-#define USB_GPIB_SET_DATA    "\nIBDM.\n"
-#define USB_GPIB_READ_LINES  "\nIBD?C\n"
-#define USB_GPIB_READ_DATA   "\nIBD?M\n"
-#define USB_GPIB_READ_BUS    "\nIBD??\n"
-
-/* command sequences */
-
-#define USB_GPIB_UNTALK "\nIBC_\n"
-#define USB_GPIB_UNLISTEN "\nIBC?\n"
-
-/* special characters used by the adapter */
-
-#define DLE ('\020')
-#define STX ('\02')
-#define ETX ('\03')
-#define ACK ('\06')
-#define NODATA ('\03')
-#define NODAV ('\011')
-
-#define IB_BUS_REN  0x01
-#define IB_BUS_IFC  0x02
-#define IB_BUS_NDAC 0x04
-#define IB_BUS_NRFD 0x08
-#define IB_BUS_DAV  0x10
-#define IB_BUS_EOI  0x20
-#define IB_BUS_ATN  0x40
-#define IB_BUS_SRQ  0x80
-
-#define INBUF_SIZE 128
-
-struct char_buf {		/* used by one_char() routine */
-	char *inbuf;
-	int last;
-	int nchar;
-};
-
-struct usb_gpib_priv {		/* private data to the device */
-	u8 eos;			/* eos character */
-	short eos_flags;	/* eos mode */
-	int timeout;		/* current value for timeout */
-	void *dev;		/* the usb device private data structure */
-};
-
-#define GPIB_DEV (((struct usb_gpib_priv *)board->private_data)->dev)
-
-static void show_status(struct gpib_board *board)
-{
-	DIA_LOG(2, "# - buffer_length %d\n", board->buffer_length);
-	DIA_LOG(2, "# - status %lx\n", board->status);
-	DIA_LOG(2, "# - use_count %d\n", board->use_count);
-	DIA_LOG(2, "# - pad %x\n", board->pad);
-	DIA_LOG(2, "# - sad %x\n", board->sad);
-	DIA_LOG(2, "# - timeout %d\n", board->usec_timeout);
-	DIA_LOG(2, "# - ppc %d\n", board->parallel_poll_configuration);
-	DIA_LOG(2, "# - t1delay %d\n", board->t1_nano_sec);
-	DIA_LOG(2, "# - online %d\n", board->online);
-	DIA_LOG(2, "# - autopoll %d\n", board->autospollers);
-	DIA_LOG(2, "# - autopoll task %p\n", board->autospoll_task);
-	DIA_LOG(2, "# - minor %d\n", board->minor);
-	DIA_LOG(2, "# - master %d\n", board->master);
-	DIA_LOG(2, "# - list %d\n", board->ist);
-}
-
-/*
- * GLOBAL VARIABLES: required for
- * pairing among gpib minor and usb minor.
- * MAX_DEV is the max number of usb-gpib adapters; free
- * to change as you like, but no more than 32
- */
-
-#define MAX_DEV 8
-static struct usb_interface *lpvo_usb_interfaces[MAX_DEV];   /* registered interfaces */
-static int usb_minors[MAX_DEV];			   /* usb minors */
-static int assigned_usb_minors;		   /* mask of filled slots */
-static struct mutex minors_lock;     /* operations on usb_minors are to be protected */
-
-/*
- * usb-skeleton prototypes
- */
-
-struct usb_skel;
-static ssize_t skel_do_write(struct usb_skel *, const char *, size_t);
-static ssize_t skel_do_read(struct usb_skel *, char *, size_t);
-static int skel_do_open(struct gpib_board *, int);
-static int skel_do_release(struct gpib_board *);
-
-/*
- *  usec_diff : take difference in MICROsec between two 'timespec'
- *		 (unix time in sec and NANOsec)
- */
-
-static inline int usec_diff(struct timespec64 *a, struct timespec64 *b)
-{
-	return ((a->tv_sec - b->tv_sec) * 1000000 +
-		(a->tv_nsec - b->tv_nsec) / 1000);
-}
-
-/*
- *  ***  these routines are specific to the usb-gpib adapter  ***
- */
-
-/**
- * write_loop() - Send a byte sequence to the adapter
- *
- * @dev:      the private device structure
- * @msg:      the byte sequence.
- * @leng:     the byte sequence length.
- *
- */
-
-static int write_loop(void *dev, char *msg, int leng)
-{
-	return skel_do_write(dev, msg, leng);
-}
-
-/**
- * send_command() - Send a byte sequence and return a single byte reply.
- *
- * @board:    the gpib_board_struct data area for this gpib interface
- * @msg:      the byte sequence.
- * @leng:     the byte sequence length; can be given as zero and is
- *	      computed automatically, but if 'msg' contains a zero byte,
- *	      it has to be given explicitly.
- */
-
-static int send_command(struct gpib_board *board, char *msg, int leng)
-{
-	char buffer[64];
-	int nchar;
-	int retval;
-	struct timespec64 before, after;
-
-	ktime_get_real_ts64 (&before);
-
-	if (!leng)
-		leng = strlen(msg);
-	retval = write_loop(GPIB_DEV, msg, leng);
-	if (retval < 0)
-		return retval;
-
-	nchar = skel_do_read(GPIB_DEV, buffer, 64);
-
-	if (nchar < 0) {
-		dev_err(board->gpib_dev, " return from read: %d\n", nchar);
-		return nchar;
-	} else if (nchar != 1) {
-		dev_err(board->gpib_dev, " Irregular reply to command: %s\n", msg);
-		return -EIO;
-	}
-	ktime_get_real_ts64 (&after);
-
-	DIA_LOG(1, "Sent %d - done %d us.\n", leng, usec_diff(&after, &before));
-
-	return buffer[0] & 0xff;
-}
-
-/*
- * set_control_line() - Set the value of a single gpib control line
- *
- * @board:    the gpib_board_struct data area for this gpib interface
- * @line:     line mask
- * @value:    line new value (0/1)
- */
-
-static int set_control_line(struct gpib_board *board, int line, int value)
-{
-	char msg[] = USB_GPIB_SET_LINES;
-	int retval;
-	int leng = strlen(msg);
-
-	DIA_LOG(1, "setting line %x to %x\n", line, value);
-
-	retval = send_command(board, USB_GPIB_READ_LINES, 0);
-
-	DIA_LOG(1, "old line values: %x\n", retval);
-
-	if (retval == -EIO)
-		return retval;
-
-	msg[leng - 2] = value ? (retval & ~line) : retval | line;
-
-	retval = send_command(board, msg, 0);
-
-	DIA_LOG(1, "operation result: %x\n", retval);
-
-	return retval;
-}
-
-/*
- * one_char() - read one single byte from input buffer
- *
- * @board:	the gpib_board_struct data area for this gpib interface
- * @char_buf:	the routine private data structure
- */
-
-static int one_char(struct gpib_board *board, struct char_buf *b)
-{
-	struct timespec64 before, after;
-
-	if (b->nchar) {
-		DIA_LOG(2, "-> %x\n", b->inbuf[b->last - b->nchar]);
-		return b->inbuf[b->last - b->nchar--];
-	}
-	ktime_get_real_ts64 (&before);
-	b->nchar = skel_do_read(GPIB_DEV, b->inbuf, INBUF_SIZE);
-	b->last = b->nchar;
-	ktime_get_real_ts64 (&after);
-
-	DIA_LOG(2, "read %d bytes in %d usec\n",
-		b->nchar, usec_diff(&after, &before));
-
-	if (b->nchar > 0) {
-		DIA_LOG(2, "--> %x\n", b->inbuf[b->last - b->nchar]);
-		return b->inbuf[b->last - b->nchar--];
-	}
-	return -EIO;
-}
-
-/**
- * set_timeout() - set single byte / total timeouts on the adapter
- *
- * @board:    the gpib_board_struct data area for this gpib interface
- *
- *	   For sake of speed, the operation is performed only if it
- *	   modifies the current (saved) value. Minimum allowed timeout
- *	   is 30 ms (T30ms -> 8); timeout disable (TNONE -> 0) currently
- *	   not supported.
- */
-
-static void set_timeout(struct gpib_board *board)
-{
-	int n, val;
-	char command[sizeof(USB_GPIB_TTMO) + 6];
-	struct usb_gpib_priv *data = board->private_data;
-
-	if (data->timeout == board->usec_timeout)
-		return;
-
-	n = (board->usec_timeout + 32767) / 32768;
-	if (n < 2)
-		n = 2;
-
-	DIA_LOG(1, "Set timeout to %d us -> %d\n", board->usec_timeout, n);
-
-	sprintf(command, "%s%d\n", USB_GPIB_BTMO, n > 255 ? 255 : n);
-	val = send_command(board, command, 0);
-
-	if (val == ACK) {
-		if (n > 65535)
-			n = 65535;
-		sprintf(command, "%s%d\n", USB_GPIB_TTMO, n);
-		val = send_command(board, command, 0);
-	}
-
-	if (val != ACK)
-		dev_err(board->gpib_dev, "error in timeout set: <%s>\n", command);
-	else
-		data->timeout = board->usec_timeout;
-}
-
-/*
- * now the standard interface functions - attach and detach
- */
-
-/**
- * usb_gpib_attach() - activate the usb-gpib converter board
- *
- * @board:    the gpib_board_struct data area for this gpib interface
- * @config:   firmware data, if any (from gpib_config -I <file>)
- *
- * The channel name is ttyUSBn, with n=0 by default. Other values for n
- * passed with gpib_config -b <n>.
- *
- * In this routine I trust that when an error code is returned
- * detach() will be called. Always.
- */
-
-static int usb_gpib_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	int retval, j;
-	u32 base = config->ibbase;
-	char *device_path;
-	int match;
-	struct usb_device *udev;
-
-	DIA_LOG(0, "Board %p -t %s -m %d -a %p -u %d -l %d -b %d\n",
-		board, board->interface->name, board->minor, config->device_path,
-		config->pci_bus, config->pci_slot, base);
-
-	board->private_data = NULL;  /* to be sure - we can detach before setting */
-
-	/* identify device to be attached */
-
-	mutex_lock(&minors_lock);
-
-	if (config->device_path) {
-		/* if config->device_path given, try that first */
-		for (j = 0 ; j < MAX_DEV ; j++) {
-			if ((assigned_usb_minors & 1 << j) == 0)
-				continue;
-			udev =	usb_get_dev(interface_to_usbdev(lpvo_usb_interfaces[j]));
-			device_path = kobject_get_path(&udev->dev.kobj, GFP_KERNEL);
-			match = gpib_match_device_path(&lpvo_usb_interfaces[j]->dev,
-						       config->device_path);
-			DIA_LOG(1, "dev. %d: minor %d  path: %s --> %d\n", j,
-				lpvo_usb_interfaces[j]->minor, device_path, match);
-			kfree(device_path);
-			if (match)
-				break;
-		}
-	} else if (config->pci_bus != -1 && config->pci_slot != -1) {
-		/* second: look for bus and slot */
-		for (j = 0 ; j < MAX_DEV ; j++) {
-			if ((assigned_usb_minors & 1 << j) == 0)
-				continue;
-			udev =	usb_get_dev(interface_to_usbdev(lpvo_usb_interfaces[j]));
-			DIA_LOG(1, "dev. %d: bus %d -> %d  dev: %d -> %d\n", j,
-				udev->bus->busnum, config->pci_bus, udev->devnum, config->pci_slot);
-			if (config->pci_bus == udev->bus->busnum &&
-			    config->pci_slot == udev->devnum)
-				break;
-		}
-	} else {		/* last chance: usb_minor, given as ibbase */
-		for (j = 0 ; j < MAX_DEV ; j++) {
-			if (usb_minors[j] == base && assigned_usb_minors & 1 << j)
-				break;
-		}
-	}
-	mutex_unlock(&minors_lock);
-
-	if (j == MAX_DEV) {
-		dev_err(board->gpib_dev, "Requested device is not registered.\n");
-		return -EIO;
-	}
-
-	board->private_data = kzalloc(sizeof(struct usb_gpib_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -ENOMEM;
-
-	retval = skel_do_open(board, usb_minors[j]);
-
-	DIA_LOG(1, "Skel open: %d\n", retval);
-
-	if (retval) {
-		dev_err(board->gpib_dev, "skel open failed.\n");
-		kfree(board->private_data);
-		board->private_data = NULL;
-		return -ENODEV;
-	}
-
-	show_status(board);
-
-	retval = send_command(board, USB_GPIB_ON, 0);
-	DIA_LOG(1, "USB_GPIB_ON returns %x\n", retval);
-	if (retval != ACK)
-		return -EIO;
-
-	/*
-	 * We must setup debug mode because we need the extended instruction
-	 * set to cope with the Core (gpib_common) point of view
-	 */
-
-	retval = send_command(board, USB_GPIB_DEBUG_ON, 0);
-	DIA_LOG(1, "USB_GPIB_DEBUG_ON returns %x\n", retval);
-	if (retval != ACK)
-		return -EIO;
-
-	/*
-	 * We must keep REN off after an IFC because so it is
-	 * assumed by the Core
-	 */
-
-	retval = send_command(board, USB_GPIB_IBm0, 0);
-	DIA_LOG(1, "USB_GPIB_IBm0 returns %x\n", retval);
-	if (retval != ACK)
-		return -EIO;
-
-	retval = set_control_line(board, IB_BUS_REN, 0);
-	if (retval != ACK)
-		return -EIO;
-
-	retval = send_command(board, USB_GPIB_FTMO, 0);
-	DIA_LOG(1, "USB_GPIB_FTMO returns %x\n", retval);
-	if (retval != ACK)
-		return -EIO;
-
-	show_status(board);
-	DIA_LOG(0, "attached\n");
-	return 0;
-}
-
-/**
- * usb_gpib_detach() - deactivate the usb-gpib converter board
- *
- * @board:    the gpib_board data area for this gpib interface
- *
- */
-
-static void usb_gpib_detach(struct gpib_board *board)
-{
-	int retval;
-
-	show_status(board);
-
-	DIA_LOG(0, "detaching\n");
-
-	if (board->private_data) {
-		if (GPIB_DEV) {
-			write_loop(GPIB_DEV, USB_GPIB_OFF, strlen(USB_GPIB_OFF));
-			msleep(100);
-			DIA_LOG(1, "%s", "GPIB off\n");
-			retval = skel_do_release(board);
-			DIA_LOG(1, "skel release -> %d\n", retval);
-		}
-		kfree(board->private_data);
-		board->private_data = NULL;
-	}
-
-	DIA_LOG(0, "detached\n");
-}
-
-/*
- *   Other functions follow in alphabetical order
- */
-/* command */
-static int usb_gpib_command(struct gpib_board *board,
-			    u8 *buffer,
-			    size_t length,
-			    size_t *bytes_written)
-{
-	int i, retval;
-	char command[6] = "IBc.\n";
-
-	DIA_LOG(1, "enter %p\n", board);
-
-	set_timeout(board);
-
-	*bytes_written = 0;
-	for (i = 0 ; i < length ; i++) {
-		command[3] = buffer[i];
-		retval = send_command(board, command, 5);
-		DIA_LOG(2, "%d ==> %x %x\n", i, buffer[i], retval);
-		if (retval != 0x06)
-			return retval;
-		++(*bytes_written);
-	}
-	return 0;
-}
-
-/**
- * usb_gpib_disable_eos() - Disable END on eos byte (END on EOI only)
- *
- * @board:    the gpib_board data area for this gpib interface
- *
- *   With the lpvo adapter eos can only be handled via software.
- *   Cannot do nothing here, but remember for future use.
- */
-
-static void usb_gpib_disable_eos(struct gpib_board *board)
-{
-	((struct usb_gpib_priv *)board->private_data)->eos_flags &= ~REOS;
-	DIA_LOG(1, "done: %x\n",
-		((struct usb_gpib_priv *)board->private_data)->eos_flags);
-}
-
-/**
- * usb_gpib_enable_eos() - Enable END for reads when eos byte is received.
- *
- * @board:    the gpib_board data area for this gpib interface
- * @eos_byte: the 'eos' byte
- * @compare_8_bits: if zero ignore eigthth bit when comparing
- *
- */
-
-static int usb_gpib_enable_eos(struct gpib_board *board,
-			       u8 eos_byte,
-			       int compare_8_bits)
-{
-	struct usb_gpib_priv *pd = (struct usb_gpib_priv *)board->private_data;
-
-	DIA_LOG(1, "enter with %x\n", eos_byte);
-	pd->eos = eos_byte;
-	pd->eos_flags = REOS;
-	if (compare_8_bits)
-		pd->eos_flags |= BIN;
-	return 0;
-}
-
-/**
- * usb_gpib_go_to_standby() - De-assert ATN
- *
- * @board:    the gpib_board data area for this gpib interface
- */
-
-static int usb_gpib_go_to_standby(struct gpib_board *board)
-{
-	int retval = set_control_line(board, IB_BUS_ATN, 0);
-
-	DIA_LOG(1, "done with %x\n", retval);
-
-	if (retval == ACK)
-		return 0;
-	return -EIO;
-}
-
-/**
- * usb_gpib_interface_clear() - Assert or de-assert IFC
- *
- * @board:    the gpib_board data area for this gpib interface
- * @assert:   1: assert IFC;  0: de-assert IFC
- *
- *    Currently on the assert request we issue the lpvo IBZ
- *    command that cycles IFC low for 100 usec, then we ignore
- *    the de-assert request.
- */
-
-static void usb_gpib_interface_clear(struct gpib_board *board, int assert)
-{
-	int retval = 0;
-
-	DIA_LOG(1, "enter with %d\n", assert);
-
-	if (assert) {
-		retval = send_command(board, USB_GPIB_IBCL, 0);
-
-		set_bit(CIC_NUM, &board->status);
-	}
-
-	DIA_LOG(1, "done with %d %d\n", assert, retval);
-}
-
-/**
- * usb_gpib_line_status() - Read the status of the bus lines.
- *
- *  @board:    the gpib_board data area for this gpib interface
- *
- *    We can read all lines.
- */
-static int usb_gpib_line_status(const struct gpib_board *board)
-{
-	int buffer;
-	int line_status = VALID_ALL;   /* all lines will be read */
-	struct list_head *p, *q;
-	WQT *item;
-	unsigned long flags;
-	int sleep = 0;
-
-	DIA_LOG(1, "%s\n", "request");
-
-	/*
-	 * if we are on the wait queue (board->wait), do not hurry
-	 * reading status line; instead, pause a little
-	 */
-
-	spin_lock_irqsave((spinlock_t *)&board->wait.lock, flags);
-	q = (struct list_head *)&board->wait.WQH;
-	list_for_each(p, q) {
-		item = container_of(p, WQT, WQE);
-		if (item->private == current) {
-			sleep = 20;
-			break;
-		}
-		/* pid is: ((struct task_struct *) item->private)->pid); */
-	}
-	spin_unlock_irqrestore((spinlock_t *)&board->wait.lock, flags);
-	if (sleep) {
-		DIA_LOG(1, "we are on the wait queue - sleep %d ms\n", sleep);
-		msleep(sleep);
-	}
-
-	buffer = send_command((struct gpib_board *)board, USB_GPIB_STATUS, 0);
-
-	if (buffer < 0) {
-		dev_err(board->gpib_dev, "line status read failed with %d\n", buffer);
-		return -1;
-	}
-
-	if ((buffer & 0x01) == 0)
-		line_status |= BUS_REN;
-	if ((buffer & 0x02) == 0)
-		line_status |= BUS_IFC;
-	if ((buffer & 0x04) == 0)
-		line_status |= BUS_NDAC;
-	if ((buffer & 0x08) == 0)
-		line_status |= BUS_NRFD;
-	if ((buffer & 0x10) == 0)
-		line_status |= BUS_DAV;
-	if ((buffer & 0x20) == 0)
-		line_status |= BUS_EOI;
-	if ((buffer & 0x40) == 0)
-		line_status |= BUS_ATN;
-	if ((buffer & 0x80) == 0)
-		line_status |= BUS_SRQ;
-
-	DIA_LOG(1, "done with %x %x\n", buffer, line_status);
-
-	return line_status;
-}
-
-/* parallel_poll */
-
-static int usb_gpib_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	/*
-	 * request parallel poll asserting ATN | EOI;
-	 * we suppose ATN already asserted
-	 */
-
-	int retval;
-
-	DIA_LOG(1, "enter %p\n", board);
-
-	retval = set_control_line(board, IB_BUS_EOI, 1);
-	if (retval != ACK)
-		return -EIO;
-
-	*result = send_command(board, USB_GPIB_READ_DATA, 0);
-
-	DIA_LOG(1, "done with %x\n", *result);
-
-	retval = set_control_line(board, IB_BUS_EOI, 0);
-	if (retval != 0x06)
-		return -EIO;
-
-	return 0;
-}
-
-/* read */
-
-static int usb_gpib_read(struct gpib_board *board,
-			 u8 *buffer,
-			 size_t length,
-			 int *end,
-			 size_t *bytes_read)
-{
-#define MAX_READ_EXCESS 16384
-
-	struct char_buf b = {NULL, 0};
-
-	int retval;
-	char c, nc;
-	int ic;
-	struct timespec64 before, after;
-	int read_count = MAX_READ_EXCESS;
-	struct usb_gpib_priv *pd = (struct usb_gpib_priv *)board->private_data;
-
-	DIA_LOG(1, "enter %p -> %zu\n", board, length);
-
-	*bytes_read = 0;      /* by default, things go wrong */
-	*end = 0;
-
-	set_timeout(board);
-
-	/* single byte read has a special handling */
-
-	if (length == 1) {
-		char inbuf[2] = {0, 0};
-
-		/* read a single character */
-
-		ktime_get_real_ts64 (&before);
-
-		retval = write_loop(GPIB_DEV, USB_GPIB_READ_1, strlen(USB_GPIB_READ_1));
-		if (retval < 0)
-			return retval;
-
-		retval = skel_do_read(GPIB_DEV, inbuf, 1);
-		retval += skel_do_read(GPIB_DEV, inbuf + 1, 1);
-
-		ktime_get_real_ts64 (&after);
-
-		DIA_LOG(1, "single read: %x %x %x in %d\n", retval,
-			inbuf[0], inbuf[1],
-			usec_diff(&after, &before));
-
-		/* good char / last char? */
-
-		if (retval == 2 && inbuf[1] == ACK) {
-			buffer[0] = inbuf[0];
-			*bytes_read = 1;
-			return 0;
-		}
-		if (retval < 2)
-			return -EIO;
-		else
-			return -ETIME;
-	}
-
-	/* allocate buffer for multibyte read */
-
-	b.inbuf = kmalloc(INBUF_SIZE, GFP_KERNEL);
-	if (!b.inbuf)
-		return -ENOMEM;
-
-	/* send read command and check <DLE><STX> sequence */
-
-	retval = write_loop(GPIB_DEV, USB_GPIB_READ, strlen(USB_GPIB_READ));
-	if (retval < 0)
-		goto read_return;
-
-	if (one_char(board, &b) != DLE || one_char(board, &b) != STX) {
-		dev_err(board->gpib_dev, "wrong <DLE><STX> sequence\n");
-		retval = -EIO;
-		goto read_return;
-	}
-
-	/* get data flow */
-
-	while (1) {
-		ic = one_char(board, &b);
-		if (ic == -EIO) {
-			retval = -EIO;
-			goto read_return;
-		}
-		c = ic;
-
-		if (c == DLE)
-			nc = one_char(board, &b);
-		if (c != DLE || nc == DLE) {
-			/* data byte - store into buffer */
-
-			if (*bytes_read == length)
-				break; /* data overflow */
-			if (c == DLE)
-				c = nc;
-			buffer[(*bytes_read)++] = c;
-			if (c == pd->eos) {
-				*end = 1;
-				break;
-			}
-
-		} else {
-			/* we are in the closing <DLE><ETX> sequence */
-			c = nc;
-			if (c == ETX) {
-				c = one_char(board, &b);
-				if (c == ACK) {
-					*end = 1;
-					retval = 0;
-					goto read_return;
-				} else {
-					dev_err(board->gpib_dev, "wrong end of message %x", c);
-					retval = -ETIME;
-					goto read_return;
-				}
-			} else {
-				dev_err(board->gpib_dev, "lone <DLE> in stream");
-				retval = -EIO;
-				goto read_return;
-			}
-		}
-	}
-
-	/* we had a data overflow - flush excess data */
-
-	while (read_count--) {
-		if (one_char(board, &b) != DLE)
-			continue;
-		c = one_char(board, &b);
-		if (c == DLE)
-			continue;
-		if (c == ETX) {
-			c = one_char(board, &b);
-			if (c == ACK) {
-				if (MAX_READ_EXCESS - read_count > 1)
-					dev_dbg(board->gpib_dev, "small buffer - maybe some data lost");
-				retval = 0;
-				goto read_return;
-			}
-			break;
-		}
-	}
-
-	dev_err(board->gpib_dev, "no input end - board in odd state\n");
-	retval = -EIO;
-
-read_return:
-	kfree(b.inbuf);
-
-	DIA_LOG(1, "done with byte/status: %d %x %d\n",	(int)*bytes_read, retval, *end);
-
-	if (retval == 0 || retval == -ETIME) {
-		if (send_command(board, USB_GPIB_UNTALK, sizeof(USB_GPIB_UNTALK)) == 0x06)
-			return retval;
-		return	-EIO;
-	}
-
-	return retval;
-}
-
-/* remote_enable */
-
-static void usb_gpib_remote_enable(struct gpib_board *board, int enable)
-{
-	int retval;
-
-	retval = set_control_line(board, IB_BUS_REN, enable ? 1 : 0);
-	if (retval != ACK)
-		dev_err(board->gpib_dev, "could not set REN line: %x\n", retval);
-
-	DIA_LOG(1, "done with %x\n", retval);
-}
-
-/* request_system_control */
-
-static int usb_gpib_request_system_control(struct gpib_board *board, int request_control)
-{
-	if (!request_control)
-		return -EINVAL;
-
-	DIA_LOG(1, "done with %d -> %lx\n", request_control, board->status);
-	return 0;
-}
-
-/* take_control */
-/* beware: the sync flag is ignored; what is its real meaning? */
-
-static int usb_gpib_take_control(struct gpib_board *board, int sync)
-{
-	int retval;
-
-	retval = set_control_line(board, IB_BUS_ATN, 1);
-
-	DIA_LOG(1, "done with %d %x\n", sync, retval);
-
-	if (retval == ACK)
-		return 0;
-	return -EIO;
-}
-
-/* update_status */
-
-static unsigned int usb_gpib_update_status(struct gpib_board *board,
-					   unsigned int clear_mask)
-{
-	/* There is nothing we can do here, I guess */
-
-	board->status &= ~clear_mask;
-
-	DIA_LOG(1, "done with %x %lx\n", clear_mask, board->status);
-
-	return board->status;
-}
-
-/* write */
-/* beware: DLE characters are not escaped - can only send ASCII data */
-
-static int usb_gpib_write(struct gpib_board *board,
-			  u8 *buffer,
-			  size_t length,
-			  int send_eoi,
-			  size_t *bytes_written)
-{
-	int retval;
-	char *msg;
-
-	DIA_LOG(1, "enter %p -> %zu\n", board, length);
-
-	set_timeout(board);
-
-	msg = kmalloc(length + 8, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	memcpy(msg, "\nIB\020\002", 5);
-	memcpy(msg + 5, buffer, length);
-	memcpy(msg + 5 + length, "\020\003\n", 3);
-
-	retval = send_command(board, msg, length + 8);
-	kfree(msg);
-
-	DIA_LOG(1, "<%.*s> -> %x\n", (int)length, buffer, retval);
-
-	if (retval != ACK)
-		return -EPIPE;
-
-	*bytes_written = length;
-
-	if (send_command(board, USB_GPIB_UNLISTEN, sizeof(USB_GPIB_UNLISTEN)) != 0x06)
-		return -EPIPE;
-
-	return length;
-}
-
-/*
- *  ***	 following functions not implemented yet  ***
- */
-
-/* parallel_poll configure */
-
-static void usb_gpib_parallel_poll_configure(struct gpib_board *board,
-					     u8 configuration)
-{
-}
-
-/* parallel_poll_response */
-
-static void usb_gpib_parallel_poll_response(struct gpib_board *board, int ist)
-{
-}
-
-/* primary_address */
-
-static int  usb_gpib_primary_address(struct gpib_board *board, unsigned int address)
-{
-	return 0;
-}
-
-/* return_to_local */
-
-static	void usb_gpib_return_to_local(struct gpib_board *board)
-{
-}
-
-/* secondary_address */
-
-static int usb_gpib_secondary_address(struct gpib_board *board,
-				      unsigned int address,
-				      int enable)
-{
-	return 0;
-}
-
-/* serial_poll_response */
-
-static void usb_gpib_serial_poll_response(struct gpib_board *board, u8 status)
-{
-}
-
-/* serial_poll_status */
-
-static u8 usb_gpib_serial_poll_status(struct gpib_board *board)
-{
-	return 0;
-}
-
-/* t1_delay */
-
-static int usb_gpib_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	return 0;
-}
-
-/*
- *   ***  module dispatch table and init/exit functions	 ***
- */
-
-static struct gpib_interface usb_gpib_interface = {
-	.name = NAME,
-	.attach = usb_gpib_attach,
-	.detach = usb_gpib_detach,
-	.read = usb_gpib_read,
-	.write = usb_gpib_write,
-	.command = usb_gpib_command,
-	.take_control = usb_gpib_take_control,
-	.go_to_standby = usb_gpib_go_to_standby,
-	.request_system_control = usb_gpib_request_system_control,
-	.interface_clear = usb_gpib_interface_clear,
-	.remote_enable = usb_gpib_remote_enable,
-	.enable_eos = usb_gpib_enable_eos,
-	.disable_eos = usb_gpib_disable_eos,
-	.parallel_poll = usb_gpib_parallel_poll,
-	.parallel_poll_configure = usb_gpib_parallel_poll_configure,
-	.parallel_poll_response = usb_gpib_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = usb_gpib_line_status,
-	.update_status = usb_gpib_update_status,
-	.primary_address = usb_gpib_primary_address,
-	.secondary_address = usb_gpib_secondary_address,
-	.serial_poll_response = usb_gpib_serial_poll_response,
-	.serial_poll_status = usb_gpib_serial_poll_status,
-	.t1_delay = usb_gpib_t1_delay,
-	.return_to_local = usb_gpib_return_to_local,
-	.skip_check_for_command_acceptors = 1
-};
-
-/*
- * usb_gpib_init_module(), usb_gpib_exit_module()
- *
- * This functions are called every time a new device is detected
- * and registered or is removed and unregistered.
- * We must take note of created and destroyed usb minors to be used
- * when usb_gpib_attach() and usb_gpib_detach() will be called on
- * request by gpib_config.
- */
-
-static int usb_gpib_init_module(struct usb_interface *interface)
-{
-	int j, mask, rv;
-
-	rv = mutex_lock_interruptible(&minors_lock);
-	if (rv < 0)
-		return rv;
-
-	if (!assigned_usb_minors) {
-		rv = gpib_register_driver(&usb_gpib_interface, THIS_MODULE);
-		if (rv) {
-			pr_err("gpib_register_driver failed: error = %d\n", rv);
-			goto exit;
-		}
-	} else {
-		/*
-		 * check if minor is already registered - maybe useless, but if
-		 * it happens the code is inconsistent somewhere
-		 */
-
-		for (j = 0 ; j < MAX_DEV ; j++) {
-			if (usb_minors[j] == interface->minor && assigned_usb_minors & 1 << j) {
-				pr_err("CODE BUG: USB minor %d registered at %d.\n",
-				       interface->minor, j);
-				rv = -1;
-				goto exit;
-			}
-		}
-	}
-
-	/* find a free slot */
-
-	for (j = 0 ; j < MAX_DEV ; j++) {
-		mask = 1 << j;
-		if ((assigned_usb_minors & mask) == 0) {
-			usb_minors[j] = interface->minor;
-			lpvo_usb_interfaces[j] = interface;
-			assigned_usb_minors |= mask;
-			rv = 0;
-			goto exit;
-		}
-	}
-	pr_err("No slot available for interface %p minor %d\n", interface, interface->minor);
-	rv = -1;
-
-exit:
-	mutex_unlock(&minors_lock);
-	return rv;
-}
-
-static void usb_gpib_exit_module(int minor)
-{
-	int j;
-
-	mutex_lock(&minors_lock);
-	for (j = 0 ; j < MAX_DEV ; j++) {
-		if (usb_minors[j] == minor && assigned_usb_minors & 1 << j) {
-			assigned_usb_minors &= ~(1 << j);
-			usb_minors[j] = -1;
-			if (assigned_usb_minors == 0)
-				gpib_unregister_driver(&usb_gpib_interface);
-			goto exit;
-		}
-	}
-	pr_err("CODE BUG: USB minor %d not found.\n", minor);
-
-exit:
-	mutex_unlock(&minors_lock);
-}
-
-/*
- * Default latency time (16 msec) is too long.
- * We must use 1 msec (best); anyhow, no more than 5 msec.
- *
- * Defines and function taken and modified from the kernel tree
- * (see ftdi_sio.h and ftdi_sio.c).
- */
-
-#define FTDI_SIO_SET_LATENCY_TIMER	9 /* Set the latency timer */
-#define FTDI_SIO_SET_LATENCY_TIMER_REQUEST FTDI_SIO_SET_LATENCY_TIMER
-#define FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE 0x40
-#define WDR_TIMEOUT 5000 /* default urb timeout */
-#define WDR_SHORT_TIMEOUT 1000	/* shorter urb timeout */
-
-#define LATENCY_TIMER 1		   /* use a small latency timer: 1 ... 5 msec */
-#define LATENCY_CHANNEL 0	   /* channel selection in multichannel devices */
-static int write_latency_timer(struct usb_device *udev)
-{
-	int rv = usb_control_msg(udev,
-				 usb_sndctrlpipe(udev, 0),
-				 FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
-				 FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
-				 LATENCY_TIMER, LATENCY_CHANNEL,
-				 NULL, 0, WDR_TIMEOUT);
-	if (rv < 0)
-		dev_err(&udev->dev, "Unable to write latency timer: %i\n", rv);
-	return rv;
-}
-
-/*****************************************************************************
- *									     *
- *  The following code is a modified version of the USB Skeleton driver	     *
- *  written by Greg Kroah-Hartman and available in the kernel tree.	     *
- *									     *
- *  Functions skel_open() and skel_release() have been rewritten and named   *
- *  skel_do_open() and skel_do_release() to process the attach and detach    *
- *  requests coming from gpib_config.					     *
- *									     *
- *  Functions skel_read() and skel_write() have been split into a	     *
- *  skel_do_read() and skel_do_write(), that cover the kernel stuff of read  *
- *  and write operations, and the original skel_read() and skel_write(),     *
- *  that handle communication with user space and call their _do_ companion. *
- *									     *
- *  Only the _do_ versions are used by the lpvo_usb_gpib driver; other ones  *
- *  can be (optionally) maintained in the compilation to have direct access  *
- *  to a gpib controller for debug and diagnostics.			     *
- *									     *
- *  To avoid collisions in names, devices in user space have been renamed    *
- *  lpvo_raw1, lpvo_raw2 ....  and the usb driver has been renamed with the  *
- *  gpib module name.							     *
- *									     *
- *****************************************************************************/
-
-/*
- * USB Skeleton driver - 2.2
- *
- * Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com)
- *
- * This driver is based on the 2.6.3 version of drivers/usb/usb-skeleton.c
- * but has been rewritten to be easier to read and use.
- */
-
-#include <linux/errno.h>
-#include <linux/kref.h>
-#include <linux/uaccess.h>
-#include <linux/mutex.h>
-
-/* Get a minor range for your devices from the usb maintainer */
-#define USB_SKEL_MINOR_BASE	   192
-
-/*   private defines   */
-
-#define MAX_TRANSFER		    (PAGE_SIZE - 512)
-/*
- * MAX_TRANSFER is chosen so that the VM is not stressed by
- * allocations > PAGE_SIZE and the number of packets in a page
- * is an integer 512 is the largest possible packet on EHCI
- */
-
-#define WRITES_IN_FLIGHT	1     /* we do not want more than one pending write */
-#define USER_DEVICE 1		      /* compile for device(s) in user space */
-
-/* Structure to hold all of our device specific stuff */
-struct usb_skel {
-	struct usb_device     *udev;		     /* the usb device for this device */
-	struct usb_interface  *interface;	     /* the interface for this device */
-	struct semaphore      limit_sem;	     /* limiting the number of writes in progress */
-	struct usb_anchor     submitted;	     /* in case need to retract our submissions */
-	struct urb	      *bulk_in_urb;	     /* the urb to read data with */
-	unsigned char	      *bulk_in_buffer;	     /* the buffer to receive data */
-	size_t		      bulk_in_size;	     /* the size of the receive buffer */
-	size_t		      bulk_in_filled;	     /* number of bytes in the buffer */
-	size_t		      bulk_in_copied;	     /* already copied to user space */
-	__u8		      bulk_in_endpoint_addr;  /* the address of the bulk in endpoint */
-	__u8		      bulk_out_endpoint_addr; /* the address of the bulk out endpoint */
-	int		      errors;		     /* the last request tanked */
-	bool		      ongoing_read;	     /* a read is going on */
-	spinlock_t	      err_lock;		     /* lock for errors */
-	struct kref	      kref;
-	struct mutex	      io_mutex;		     /* synchronize I/O with disconnect */
-	wait_queue_head_t     bulk_in_wait;	     /* to wait for an ongoing read */
-};
-
-#define to_skel_dev(d) container_of(d, struct usb_skel, kref)
-
-static struct usb_driver skel_driver;
-static void skel_draw_down(struct usb_skel *dev);
-
-static void skel_delete(struct kref *kref)
-{
-	struct usb_skel *dev = to_skel_dev(kref);
-
-	usb_free_urb(dev->bulk_in_urb);
-	usb_put_dev(dev->udev);
-	kfree(dev->bulk_in_buffer);
-	kfree(dev);
-}
-
-/*
- * skel_do_open() - to be called by usb_gpib_attach
- */
-
-static int skel_do_open(struct gpib_board *board, int subminor)
-{
-	struct usb_skel *dev;
-	struct usb_interface *interface;
-	int retval = 0;
-
-	interface = usb_find_interface(&skel_driver, subminor);
-	if (!interface) {
-		dev_err(board->gpib_dev, "can't find device for minor %d\n", subminor);
-		retval = -ENODEV;
-		goto exit;
-	}
-
-	dev = usb_get_intfdata(interface);
-	if (!dev) {
-		retval = -ENODEV;
-		goto exit;
-	}
-
-	retval = usb_autopm_get_interface(interface);
-	if (retval)
-		goto exit;
-
-	/* increment our usage count for the device */
-	kref_get(&dev->kref);
-
-	/* save our object in the file's private structure */
-	GPIB_DEV = dev;
-
-exit:
-	return retval;
-}
-
-/*
- * skel_do_release() - to be called by usb_gpib_detach
- */
-
-static int skel_do_release(struct gpib_board *board)
-{
-	struct usb_skel *dev;
-
-	dev = GPIB_DEV;
-	if (!dev)
-		return -ENODEV;
-
-	/* allow the device to be autosuspended */
-	mutex_lock(&dev->io_mutex);
-	if (dev->interface)
-		usb_autopm_put_interface(dev->interface);
-	mutex_unlock(&dev->io_mutex);
-
-	/* decrement the count on our device */
-	kref_put(&dev->kref, skel_delete);
-	return 0;
-}
-
-/*
- * read functions
- */
-
-static void skel_read_bulk_callback(struct urb *urb)
-{
-	struct usb_skel *dev;
-	unsigned long flags;
-
-	dev = urb->context;
-
-	spin_lock_irqsave(&dev->err_lock, flags);
-	/* sync/async unlink faults aren't errors */
-	if (urb->status) {
-		if (!(urb->status == -ENOENT ||
-		      urb->status == -ECONNRESET ||
-		      urb->status == -ESHUTDOWN))
-			dev_err(&dev->interface->dev, "nonzero read bulk status received: %d\n",
-				urb->status);
-
-		dev->errors = urb->status;
-	} else {
-		dev->bulk_in_filled = urb->actual_length;
-	}
-	dev->ongoing_read = 0;
-	spin_unlock_irqrestore(&dev->err_lock, flags);
-
-	wake_up_interruptible(&dev->bulk_in_wait);
-}
-
-static int skel_do_read_io(struct usb_skel *dev, size_t count)
-{
-	int rv;
-
-	/* prepare a read */
-	usb_fill_bulk_urb(dev->bulk_in_urb,
-			  dev->udev,
-			  usb_rcvbulkpipe(dev->udev,
-					  dev->bulk_in_endpoint_addr),
-			  dev->bulk_in_buffer,
-			  min(dev->bulk_in_size, count),
-			  skel_read_bulk_callback,
-			  dev);
-	/* tell everybody to leave the URB alone */
-	spin_lock_irq(&dev->err_lock);
-	dev->ongoing_read = 1;
-	spin_unlock_irq(&dev->err_lock);
-
-	/* submit bulk in urb, which means no data to deliver */
-	dev->bulk_in_filled = 0;
-	dev->bulk_in_copied = 0;
-
-	/* do it */
-	rv = usb_submit_urb(dev->bulk_in_urb, GFP_KERNEL);
-	if (rv < 0) {
-		dev_err(&dev->interface->dev, "failed submitting read urb, error %d\n", rv);
-		rv = (rv == -ENOMEM) ? rv : -EIO;
-		spin_lock_irq(&dev->err_lock);
-		dev->ongoing_read = 0;
-		spin_unlock_irq(&dev->err_lock);
-	}
-
-	return rv;
-}
-
-/*
- * skel_do_read() - read operations from lpvo_usb_gpib
- */
-
-static ssize_t skel_do_read(struct usb_skel *dev, char *buffer, size_t count)
-{
-	int rv;
-	bool ongoing_io;
-
-	/* if we cannot read at all, return EOF */
-
-	if (!dev->bulk_in_urb || !count)
-		return 0;
-
-restart:  /* added to comply with ftdi timeout technique */
-
-	/* no concurrent readers */
-
-	rv = mutex_lock_interruptible(&dev->io_mutex);
-	if (rv < 0)
-		return rv;
-
-	if (!dev->interface) {		      /* disconnect() was called */
-		rv = -ENODEV;
-		goto exit;
-	}
-
-retry:
-	/* if IO is under way, we must not touch things */
-	spin_lock_irq(&dev->err_lock);
-	ongoing_io = dev->ongoing_read;
-	spin_unlock_irq(&dev->err_lock);
-
-	if (ongoing_io) {
-//		  /* nonblocking IO shall not wait */
-//		  /* no file, no O_NONBLOCK; maybe provide when from user space */
-//		  if (file->f_flags & O_NONBLOCK) {
-//			  rv = -EAGAIN;
-//			  goto exit;
-//		  }
-
-		/*
-		 * IO may take forever
-		 * hence wait in an interruptible state
-		 */
-		rv = wait_event_interruptible(dev->bulk_in_wait, (!dev->ongoing_read));
-		if (rv < 0)
-			goto exit;
-	}
-
-	/* errors must be reported */
-	rv = dev->errors;
-	if (rv < 0) {
-		/* any error is reported once */
-		dev->errors = 0;
-		/* to preserve notifications about reset */
-		rv = (rv == -EPIPE) ? rv : -EIO;
-		/* report it */
-		goto exit;
-	}
-
-	/*
-	 * if the buffer is filled we may satisfy the read
-	 * else we need to start IO
-	 */
-
-	if (dev->bulk_in_filled) {
-		/* we had read data */
-
-		size_t available = dev->bulk_in_filled - dev->bulk_in_copied;
-//		  size_t chunk = min(available, count);	 /* compute chunk later */
-		size_t chunk;
-
-		if (!available) {
-			/*
-			 * all data has been used
-			 * actual IO needs to be done
-			 */
-			/*
-			 * it seems that requests for less than dev->bulk_in_size
-			 *  are not accepted
-			 */
-			rv = skel_do_read_io(dev, dev->bulk_in_size);
-			if (rv < 0)
-				goto exit;
-			else
-				goto retry;
-		}
-
-		/*
-		 * data is available - chunk tells us how much shall be copied
-		 */
-
-		/*
-		 * Condition dev->bulk_in_copied > 0 maybe will never happen. In case,
-		 * signal the event and copy using the original procedure, i.e., copy
-		 * first two bytes also
-		 */
-
-		if (dev->bulk_in_copied) {
-			chunk = min(available, count);
-			memcpy(buffer, dev->bulk_in_buffer + dev->bulk_in_copied, chunk);
-			rv = chunk;
-			dev->bulk_in_copied += chunk;
-
-			/* copy discarding first two bytes that contain ftdi chip status */
-
-		} else {
-			/* account for two bytes to be discarded */
-			chunk = min(available, count + 2);
-			if (chunk < 2) {
-				dev_err(&dev->udev->dev, "BAD READ - chunk: %zu\n", chunk);
-				rv = -EIO;
-				goto exit;
-			}
-
-			memcpy(buffer, dev->bulk_in_buffer + 2, chunk - 2);
-			rv = chunk;
-			dev->bulk_in_copied += chunk;
-		}
-
-		/*
-		 * if we are asked for more than we have,
-		 * we start IO but don't wait
-		 *
-		 * No, no read ahead allowed; if the case, more data will be
-		 * asked for by the lpvo_usb_gpib layer.
-		 */
-//		  if (available < count)
-//			  skel_do_read_io(dev, dev->bulk_in_size);
-	} else {
-		/* no data in the buffer */
-		rv = skel_do_read_io(dev, dev->bulk_in_size);
-		if (rv < 0)
-			goto exit;
-		else
-			goto retry;
-	}
-exit:
-	mutex_unlock(&dev->io_mutex);
-	if (rv == 2)
-		goto restart;	/* ftdi chip returns two status bytes after a latency anyhow */
-
-	if (rv > 0)
-		return rv - 2;	/* account for 2 discarded bytes in a valid buffer */
-	return rv;
-}
-
-/*
- * write functions
- */
-
-static void skel_write_bulk_callback(struct urb *urb)
-{
-	struct usb_skel *dev;
-	unsigned long flags;
-
-	dev = urb->context;
-
-	/* sync/async unlink faults aren't errors */
-	if (urb->status) {
-		if (!(urb->status == -ENOENT ||
-		      urb->status == -ECONNRESET ||
-		      urb->status == -ESHUTDOWN))
-			dev_err(&dev->interface->dev,
-				"nonzero write bulk status received: %d\n", urb->status);
-
-		spin_lock_irqsave(&dev->err_lock, flags);
-		dev->errors = urb->status;
-		spin_unlock_irqrestore(&dev->err_lock, flags);
-	}
-
-	/* free up our allocated buffer */
-	usb_free_coherent(urb->dev, urb->transfer_buffer_length,
-			  urb->transfer_buffer, urb->transfer_dma);
-	up(&dev->limit_sem);
-}
-
-/*
- * skel_do_write() - write operations from lpvo_usb_gpib
- */
-
-static ssize_t skel_do_write(struct usb_skel *dev, const char *buffer, size_t count)
-{
-	int retval = 0;
-	struct urb *urb = NULL;
-	char *buf = NULL;
-	size_t writesize = min_t(size_t, count, (size_t)MAX_TRANSFER);
-
-	/* verify that we actually have some data to write */
-	if (count == 0)
-		goto exit;
-
-	/*
-	 * limit the number of URBs in flight to stop a user from using up all
-	 * RAM
-	 */
-	/* Only one URB is used, because we can't have a pending write() and go on */
-
-//	  if (!(file->f_flags & O_NONBLOCK)) {	/* no NONBLOCK provided */
-	if (down_interruptible(&dev->limit_sem)) {
-		retval = -ERESTARTSYS;
-		goto exit;
-	}
-//	  } else {
-//		  if (down_trylock(&dev->limit_sem)) {
-//			  retval = -EAGAIN;
-//			  goto exit;
-//		  }
-//	  }
-
-	spin_lock_irq(&dev->err_lock);
-	retval = dev->errors;
-	if (retval < 0) {
-		/* any error is reported once */
-		dev->errors = 0;
-		/* to preserve notifications about reset */
-		retval = (retval == -EPIPE) ? retval : -EIO;
-	}
-	spin_unlock_irq(&dev->err_lock);
-	if (retval < 0)
-		goto error;
-
-	/* create a urb, and a buffer for it, and copy the data to the urb */
-	urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!urb) {
-		retval = -ENOMEM;
-		goto error;
-	}
-
-	buf = usb_alloc_coherent(dev->udev, writesize, GFP_KERNEL,
-				 &urb->transfer_dma);
-	if (!buf) {
-		retval = -ENOMEM;
-		goto error;
-	}
-
-	memcpy(buf, buffer, count);
-
-	/* this lock makes sure we don't submit URBs to gone devices */
-	mutex_lock(&dev->io_mutex);
-	if (!dev->interface) {		      /* disconnect() was called */
-		mutex_unlock(&dev->io_mutex);
-		retval = -ENODEV;
-		goto error;
-	}
-
-	/* initialize the urb properly */
-	usb_fill_bulk_urb(urb, dev->udev,
-			  usb_sndbulkpipe(dev->udev, dev->bulk_out_endpoint_addr),
-			  buf, writesize, skel_write_bulk_callback, dev);
-	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
-	usb_anchor_urb(urb, &dev->submitted);
-
-	/* send the data out the bulk port */
-	retval = usb_submit_urb(urb, GFP_KERNEL);
-	mutex_unlock(&dev->io_mutex);
-	if (retval) {
-		dev_err(&dev->interface->dev, "failed submitting write urb, error %d\n", retval);
-		goto error_unanchor;
-	}
-
-	/*
-	 * release our reference to this urb, the USB core will eventually free
-	 * it entirely
-	 */
-	usb_free_urb(urb);
-
-	return writesize;
-
-error_unanchor:
-	usb_unanchor_urb(urb);
-error:
-	if (urb) {
-		usb_free_coherent(dev->udev, writesize, buf, urb->transfer_dma);
-		usb_free_urb(urb);
-	}
-	up(&dev->limit_sem);
-
-exit:
-	return retval;
-}
-
-/*
- * services for the user space devices
- */
-
-#if USER_DEVICE	 /* conditional compilation of user space device */
-
-static int skel_flush(struct file *file, fl_owner_t id)
-{
-	struct usb_skel *dev;
-	int res;
-
-	dev = file->private_data;
-	if (!dev)
-		return -ENODEV;
-
-	/* wait for io to stop */
-	mutex_lock(&dev->io_mutex);
-	skel_draw_down(dev);
-
-	/* read out errors, leave subsequent opens a clean slate */
-	spin_lock_irq(&dev->err_lock);
-	res = dev->errors ? (dev->errors == -EPIPE ? -EPIPE : -EIO) : 0;
-	dev->errors = 0;
-	spin_unlock_irq(&dev->err_lock);
-
-	mutex_unlock(&dev->io_mutex);
-
-	return res;
-}
-
-static int skel_open(struct inode *inode, struct file *file)
-{
-	struct usb_skel *dev;
-	struct usb_interface *interface;
-	int subminor;
-	int retval = 0;
-
-	subminor = iminor(inode);
-
-	interface = usb_find_interface(&skel_driver, subminor);
-	if (!interface) {
-		pr_err("can't find device for minor %d\n", subminor);
-		retval = -ENODEV;
-		goto exit;
-	}
-
-	dev = usb_get_intfdata(interface);
-	if (!dev) {
-		retval = -ENODEV;
-		goto exit;
-	}
-
-	retval = usb_autopm_get_interface(interface);
-	if (retval)
-		goto exit;
-
-	/* increment our usage count for the device */
-	kref_get(&dev->kref);
-
-	/* save our object in the file's private structure */
-	file->private_data = dev;
-
-exit:
-	return retval;
-}
-
-static int skel_release(struct inode *inode, struct file *file)
-{
-	struct usb_skel *dev;
-
-	dev = file->private_data;
-	if (!dev)
-		return -ENODEV;
-
-	/* allow the device to be autosuspended */
-	mutex_lock(&dev->io_mutex);
-	if (dev->interface)
-		usb_autopm_put_interface(dev->interface);
-	mutex_unlock(&dev->io_mutex);
-
-	/* decrement the count on our device */
-	kref_put(&dev->kref, skel_delete);
-	return 0;
-}
-
-/*
- * user space access to read function
- */
-
-static ssize_t skel_read(struct file *file, char __user *buffer, size_t count,
-			 loff_t *ppos)
-{
-	struct usb_skel *dev;
-	char *buf;
-	ssize_t rv;
-
-	dev = file->private_data;
-
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	rv = skel_do_read(dev, buf, count);
-
-	if (rv > 0) {
-		if (copy_to_user(buffer, buf, rv)) {
-			kfree(buf);
-			return -EFAULT;
-		}
-	}
-	kfree(buf);
-	return rv;
-}
-
-/*
- * user space access to write function
- */
-
-static ssize_t skel_write(struct file *file, const char __user *user_buffer,
-			  size_t count, loff_t *ppos)
-{
-	struct usb_skel *dev;
-	char *buf;
-	ssize_t rv;
-
-	dev = file->private_data;
-
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	if (copy_from_user(buf, user_buffer, count)) {
-		kfree(buf);
-		return -EFAULT;
-	}
-
-	rv = skel_do_write(dev, buf, count);
-	kfree(buf);
-	return rv;
-}
-#endif
-
-static const struct file_operations skel_fops = {
-	.owner =	THIS_MODULE,
-#if USER_DEVICE
-	.read =	   skel_read,
-	.write =   skel_write,
-	.open =	   skel_open,
-	.release = skel_release,
-	.flush =   skel_flush,
-	.llseek =  noop_llseek,
-#endif
-};
-
-/*
- * usb class driver info in order to get a minor number from the usb core,
- * and to have the device registered with the driver core
- */
-#if USER_DEVICE
-static struct usb_class_driver skel_class = {
-	.name =		       "lpvo_raw%d",
-	.fops =		       &skel_fops,
-	.minor_base =	     USB_SKEL_MINOR_BASE,
-};
-#endif
-
-static int skel_probe(struct usb_interface *interface,
-		      const struct usb_device_id *id)
-{
-	struct usb_skel *dev;
-	struct usb_endpoint_descriptor *bulk_in, *bulk_out;
-	int retval;
-	char *device_path;
-
-	mutex_init(&minors_lock);   /* required for handling minor numbers table */
-
-	/* allocate memory for our device state and initialize it */
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
-
-	kref_init(&dev->kref);
-	sema_init(&dev->limit_sem, WRITES_IN_FLIGHT);
-	mutex_init(&dev->io_mutex);
-	spin_lock_init(&dev->err_lock);
-	init_usb_anchor(&dev->submitted);
-	init_waitqueue_head(&dev->bulk_in_wait);
-
-	dev->udev = usb_get_dev(interface_to_usbdev(interface));
-	dev->interface = interface;
-
-	/* set up the endpoint information */
-	/* use only the first bulk-in and bulk-out endpoints */
-	retval = usb_find_common_endpoints(interface->cur_altsetting,
-					   &bulk_in, &bulk_out, NULL, NULL);
-	if (retval) {
-		dev_err(&interface->dev,
-			"Could not find both bulk-in and bulk-out endpoints\n");
-		goto error;
-	}
-
-	dev->bulk_in_size = usb_endpoint_maxp(bulk_in);
-	dev->bulk_in_endpoint_addr = bulk_in->bEndpointAddress;
-	dev->bulk_in_buffer = kmalloc(dev->bulk_in_size, GFP_KERNEL);
-	if (!dev->bulk_in_buffer) {
-		retval = -ENOMEM;
-		goto error;
-	}
-	dev->bulk_in_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!dev->bulk_in_urb) {
-		retval = -ENOMEM;
-		goto error;
-	}
-
-	dev->bulk_out_endpoint_addr = bulk_out->bEndpointAddress;
-
-	/* save our data pointer in this interface device */
-	usb_set_intfdata(interface, dev);
-
-	/* let the world know */
-
-	device_path = kobject_get_path(&dev->udev->dev.kobj, GFP_KERNEL);
-	dev_dbg(&interface->dev, "New lpvo_usb_device -> bus: %d  dev: %d  path: %s\n",
-		dev->udev->bus->busnum, dev->udev->devnum, device_path);
-	kfree(device_path);
-
-#if USER_DEVICE
-	/* we can register the device now, as it is ready */
-	retval = usb_register_dev(interface, &skel_class);
-	if (retval) {
-		/* something prevented us from registering this driver */
-		dev_err(&interface->dev,
-			"Not able to get a minor for this device.\n");
-		usb_set_intfdata(interface, NULL);
-		goto error;
-	}
-#endif
-
-	write_latency_timer(dev->udev);	    /* adjust the latency timer */
-
-	usb_gpib_init_module(interface);    /* last, init the lpvo for this minor */
-
-	return 0;
-
-error:
-	/* this frees allocated memory */
-	kref_put(&dev->kref, skel_delete);
-
-	return retval;
-}
-
-static void skel_disconnect(struct usb_interface *interface)
-{
-	struct usb_skel *dev;
-	int minor = interface->minor;
-
-	usb_gpib_exit_module(minor);	  /* first, disactivate the lpvo */
-
-	dev = usb_get_intfdata(interface);
-	usb_set_intfdata(interface, NULL);
-
-#if USER_DEVICE
-	/* give back our minor */
-	usb_deregister_dev(interface, &skel_class);
-#endif
-
-	/* prevent more I/O from starting */
-	mutex_lock(&dev->io_mutex);
-	dev->interface = NULL;
-	mutex_unlock(&dev->io_mutex);
-
-	usb_kill_anchored_urbs(&dev->submitted);
-
-	/* decrement our usage count */
-	kref_put(&dev->kref, skel_delete);
-}
-
-static void skel_draw_down(struct usb_skel *dev)
-{
-	int time;
-
-	time = usb_wait_anchor_empty_timeout(&dev->submitted, 1000);
-	if (!time)
-		usb_kill_anchored_urbs(&dev->submitted);
-	usb_kill_urb(dev->bulk_in_urb);
-}
-
-static int skel_suspend(struct usb_interface *intf, pm_message_t message)
-{
-	struct usb_skel *dev = usb_get_intfdata(intf);
-
-	if (!dev)
-		return 0;
-	skel_draw_down(dev);
-	return 0;
-}
-
-static int skel_resume(struct usb_interface *intf)
-{
-	return 0;
-}
-
-static int skel_pre_reset(struct usb_interface *intf)
-{
-	struct usb_skel *dev = usb_get_intfdata(intf);
-
-	mutex_lock(&dev->io_mutex);
-	skel_draw_down(dev);
-
-	return 0;
-}
-
-static int skel_post_reset(struct usb_interface *intf)
-{
-	struct usb_skel *dev = usb_get_intfdata(intf);
-
-	/* we are sure no URBs are active - no locking needed */
-	dev->errors = -EPIPE;
-	mutex_unlock(&dev->io_mutex);
-
-	return 0;
-}
-
-static struct usb_driver skel_driver = {
-	.name =			NAME,
-	.probe =		skel_probe,
-	.disconnect =		skel_disconnect,
-	.suspend =		skel_suspend,
-	.resume =		skel_resume,
-	.pre_reset =		skel_pre_reset,
-	.post_reset =		skel_post_reset,
-	.id_table =		skel_table,
-	.supports_autosuspend = 1,
-};
-
-module_usb_driver(skel_driver);
diff --git a/drivers/staging/gpib/nec7210/Makefile b/drivers/staging/gpib/nec7210/Makefile
deleted file mode 100644
index 64330f2e89d1..000000000000
--- a/drivers/staging/gpib/nec7210/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-
-obj-$(CONFIG_GPIB_NEC7210) += nec7210.o
-
-
diff --git a/drivers/staging/gpib/nec7210/board.h b/drivers/staging/gpib/nec7210/board.h
deleted file mode 100644
index ac3fe38ade57..000000000000
--- a/drivers/staging/gpib/nec7210/board.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *   copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_PCIIA_BOARD_H
-#define _GPIB_PCIIA_BOARD_H
-
-#include "gpibP.h"
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
-
-#include "nec7210.h"
-
-#endif	//_GPIB_PCIIA_BOARD_H
-
diff --git a/drivers/staging/gpib/nec7210/nec7210.c b/drivers/staging/gpib/nec7210/nec7210.c
deleted file mode 100644
index bbf39367f5e4..000000000000
--- a/drivers/staging/gpib/nec7210/nec7210.c
+++ /dev/null
@@ -1,1121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *   copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define dev_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "board.h"
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/dma.h>
-#include <linux/bitops.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB library code for NEC uPD7210");
-
-int nec7210_enable_eos(struct gpib_board *board, struct nec7210_priv *priv, u8 eos_byte,
-		       int compare_8_bits)
-{
-	write_byte(priv, eos_byte, EOSR);
-	priv->auxa_bits |= HR_REOS;
-	if (compare_8_bits)
-		priv->auxa_bits |= HR_BIN;
-	else
-		priv->auxa_bits &= ~HR_BIN;
-	write_byte(priv, priv->auxa_bits, AUXMR);
-	return 0;
-}
-EXPORT_SYMBOL(nec7210_enable_eos);
-
-void nec7210_disable_eos(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	priv->auxa_bits &= ~HR_REOS;
-	write_byte(priv, priv->auxa_bits, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_disable_eos);
-
-int nec7210_parallel_poll(struct gpib_board *board, struct nec7210_priv *priv, u8 *result)
-{
-	int ret;
-
-	clear_bit(COMMAND_READY_BN, &priv->state);
-
-	// execute parallel poll
-	write_byte(priv, AUX_EPP, AUXMR);
-	// wait for result FIXME: support timeouts
-	ret = wait_event_interruptible(board->wait, test_bit(COMMAND_READY_BN, &priv->state));
-	if (ret) {
-		dev_dbg(board->gpib_dev, "gpib: parallel poll interrupted\n");
-		return -ERESTARTSYS;
-	}
-	*result = read_byte(priv, CPTR);
-
-	return 0;
-}
-EXPORT_SYMBOL(nec7210_parallel_poll);
-
-void nec7210_parallel_poll_configure(struct gpib_board *board,
-				     struct nec7210_priv *priv, unsigned int configuration)
-{
-	write_byte(priv, PPR | configuration, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_parallel_poll_configure);
-
-void nec7210_parallel_poll_response(struct gpib_board *board, struct nec7210_priv *priv, int ist)
-{
-	if (ist)
-		write_byte(priv, AUX_SPPF, AUXMR);
-	else
-		write_byte(priv, AUX_CPPF, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_parallel_poll_response);
-/*
- * This is really only adequate for chips that do a 488.2 style reqt/reqf
- * based on bit 6 of the SPMR (see chapter 11.3.3 of 488.2). For simpler chips that simply
- * set rsv directly based on bit 6, we either need to do more hardware setup to expose
- * the 488.2 capability (for example with NI chips), or we need to implement the
- * 488.2 set srv state machine in the driver (if that is even viable).
- */
-void nec7210_serial_poll_response(struct gpib_board *board,
-				  struct nec7210_priv *priv, u8 status)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (status & request_service_bit) {
-		priv->srq_pending = 1;
-		clear_bit(SPOLL_NUM, &board->status);
-
-	} else {
-		priv->srq_pending = 0;
-	}
-	write_byte(priv, status, SPMR);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-EXPORT_SYMBOL(nec7210_serial_poll_response);
-
-u8 nec7210_serial_poll_status(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	return read_byte(priv, SPSR);
-}
-EXPORT_SYMBOL(nec7210_serial_poll_status);
-
-int nec7210_primary_address(const struct gpib_board *board, struct nec7210_priv *priv,
-			    unsigned int address)
-{
-	// put primary address in address0
-	write_byte(priv, address & ADDRESS_MASK, ADR);
-	return 0;
-}
-EXPORT_SYMBOL(nec7210_primary_address);
-
-int nec7210_secondary_address(const struct gpib_board *board, struct nec7210_priv *priv,
-			      unsigned int address, int enable)
-{
-	if (enable) {
-		// put secondary address in address1
-		write_byte(priv, HR_ARS | (address & ADDRESS_MASK), ADR);
-		// go to address mode 2
-		priv->reg_bits[ADMR] &= ~HR_ADM0;
-		priv->reg_bits[ADMR] |= HR_ADM1;
-	} else {
-		// disable address1 register
-		write_byte(priv, HR_ARS | HR_DT | HR_DL, ADR);
-		// go to address mode 1
-		priv->reg_bits[ADMR] |= HR_ADM0;
-		priv->reg_bits[ADMR] &= ~HR_ADM1;
-	}
-	write_byte(priv, priv->reg_bits[ADMR], ADMR);
-	return 0;
-}
-EXPORT_SYMBOL(nec7210_secondary_address);
-
-static void update_talker_state(struct nec7210_priv *priv, unsigned int address_status_bits)
-{
-	if ((address_status_bits & HR_TA)) {
-		if ((address_status_bits & HR_NATN)) {
-			if (address_status_bits & HR_SPMS)
-				priv->talker_state = serial_poll_active;
-			else
-				priv->talker_state = talker_active;
-		} else {
-			priv->talker_state = talker_addressed;
-		}
-	} else {
-		priv->talker_state = talker_idle;
-	}
-}
-
-static void update_listener_state(struct nec7210_priv *priv, unsigned int address_status_bits)
-{
-	if (address_status_bits & HR_LA) {
-		if ((address_status_bits & HR_NATN))
-			priv->listener_state = listener_active;
-		else
-			priv->listener_state = listener_addressed;
-	} else {
-		priv->listener_state = listener_idle;
-	}
-}
-
-unsigned int nec7210_update_status_nolock(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	int address_status_bits;
-	u8 spoll_status;
-
-	if (!priv)
-		return 0;
-
-	address_status_bits = read_byte(priv, ADSR);
-	if (address_status_bits & HR_CIC)
-		set_bit(CIC_NUM, &board->status);
-	else
-		clear_bit(CIC_NUM, &board->status);
-	// check for talker/listener addressed
-	update_talker_state(priv, address_status_bits);
-	if (priv->talker_state == talker_active || priv->talker_state == talker_addressed)
-		set_bit(TACS_NUM, &board->status);
-	else
-		clear_bit(TACS_NUM, &board->status);
-	update_listener_state(priv, address_status_bits);
-	if (priv->listener_state == listener_active ||
-	    priv->listener_state == listener_addressed)
-		set_bit(LACS_NUM, &board->status);
-	else
-		clear_bit(LACS_NUM, &board->status);
-	if (address_status_bits & HR_NATN)
-		clear_bit(ATN_NUM, &board->status);
-	else
-		set_bit(ATN_NUM, &board->status);
-	spoll_status = nec7210_serial_poll_status(board, priv);
-	if (priv->srq_pending && (spoll_status & request_service_bit) == 0) {
-		priv->srq_pending = 0;
-		set_bit(SPOLL_NUM, &board->status);
-	}
-
-	/*
-	 * we rely on the interrupt handler to set the
-	 * rest of the status bits
-	 */
-
-	return board->status;
-}
-EXPORT_SYMBOL(nec7210_update_status_nolock);
-
-unsigned int nec7210_update_status(struct gpib_board *board, struct nec7210_priv *priv,
-				   unsigned int clear_mask)
-{
-	unsigned long flags;
-	unsigned int retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	board->status &= ~clear_mask;
-	retval = nec7210_update_status_nolock(board, priv);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_update_status);
-
-unsigned int nec7210_set_reg_bits(struct nec7210_priv *priv, unsigned int reg,
-				  unsigned int mask, unsigned int bits)
-{
-	priv->reg_bits[reg] &= ~mask;
-	priv->reg_bits[reg] |= mask & bits;
-	write_byte(priv, priv->reg_bits[reg], reg);
-	return priv->reg_bits[reg];
-}
-EXPORT_SYMBOL(nec7210_set_reg_bits);
-
-void nec7210_set_handshake_mode(struct gpib_board *board, struct nec7210_priv *priv, int mode)
-{
-	unsigned long flags;
-
-	mode &= HR_HANDSHAKE_MASK;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if ((priv->auxa_bits & HR_HANDSHAKE_MASK) != mode) {
-		priv->auxa_bits &= ~HR_HANDSHAKE_MASK;
-		priv->auxa_bits |= mode;
-		write_byte(priv, priv->auxa_bits, AUXMR);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-EXPORT_SYMBOL(nec7210_set_handshake_mode);
-
-u8 nec7210_read_data_in(struct gpib_board *board, struct nec7210_priv *priv, int *end)
-{
-	unsigned long flags;
-	u8 data;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	data = read_byte(priv, DIR);
-	clear_bit(READ_READY_BN, &priv->state);
-	if (test_and_clear_bit(RECEIVED_END_BN, &priv->state))
-		*end = 1;
-	else
-		*end = 0;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return data;
-}
-EXPORT_SYMBOL(nec7210_read_data_in);
-
-int nec7210_take_control(struct gpib_board *board, struct nec7210_priv *priv, int syncronous)
-{
-	int i;
-	const int timeout = 100;
-	int retval = 0;
-	unsigned int adsr_bits = 0;
-
-	if (syncronous)
-		write_byte(priv, AUX_TCS, AUXMR);
-	else
-		write_byte(priv, AUX_TCA, AUXMR);
-	// busy wait until ATN is asserted
-	for (i = 0; i < timeout; i++) {
-		adsr_bits = read_byte(priv, ADSR);
-		if ((adsr_bits & HR_NATN) == 0)
-			break;
-		udelay(1);
-	}
-	if (i == timeout)
-		return -ETIMEDOUT;
-
-	clear_bit(WRITE_READY_BN, &priv->state);
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_take_control);
-
-int nec7210_go_to_standby(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	int i;
-	const int timeout = 1000;
-	unsigned int adsr_bits = 0;
-	int retval = 0;
-
-	write_byte(priv, AUX_GTS, AUXMR);
-	// busy wait until ATN is released
-	for (i = 0; i < timeout; i++) {
-		adsr_bits = read_byte(priv, ADSR);
-		if (adsr_bits & HR_NATN)
-			break;
-		udelay(1);
-	}
-	// if busy wait has failed, try sleeping
-	if (i == timeout) {
-		for (i = 0; i < HZ; i++) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (schedule_timeout(1))
-				return -ERESTARTSYS;
-			adsr_bits = read_byte(priv, ADSR);
-			if (adsr_bits & HR_NATN)
-				break;
-		}
-		if (i == HZ)
-			return -ETIMEDOUT;
-	}
-
-	clear_bit(COMMAND_READY_BN, &priv->state);
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_go_to_standby);
-
-int nec7210_request_system_control(struct gpib_board *board, struct nec7210_priv *priv,
-				   int request_control)
-{
-	if (request_control == 0) {
-		write_byte(priv, AUX_CREN, AUXMR);
-		write_byte(priv, AUX_CIFC, AUXMR);
-		write_byte(priv, AUX_DSC, AUXMR);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(nec7210_request_system_control);
-
-void nec7210_interface_clear(struct gpib_board *board, struct nec7210_priv *priv, int assert)
-{
-	if (assert)
-		write_byte(priv, AUX_SIFC, AUXMR);
-	else
-		write_byte(priv, AUX_CIFC, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_interface_clear);
-
-void nec7210_remote_enable(struct gpib_board *board, struct nec7210_priv *priv, int enable)
-{
-	if (enable)
-		write_byte(priv, AUX_SREN, AUXMR);
-	else
-		write_byte(priv, AUX_CREN, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_remote_enable);
-
-void nec7210_release_rfd_holdoff(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (test_bit(RFD_HOLDOFF_BN, &priv->state) &&
-	    test_bit(READ_READY_BN, &priv->state) == 0) {
-		write_byte(priv, AUX_FH, AUXMR);
-		clear_bit(RFD_HOLDOFF_BN, &priv->state);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-EXPORT_SYMBOL(nec7210_release_rfd_holdoff);
-
-int nec7210_t1_delay(struct gpib_board *board, struct nec7210_priv *priv,
-		     unsigned int nano_sec)
-{
-	unsigned int retval;
-
-	if (nano_sec <= 500) {
-		priv->auxb_bits |= HR_TRI;
-		retval = 500;
-	} else {
-		priv->auxb_bits &= ~HR_TRI;
-		retval = 2000;
-	}
-	write_byte(priv, priv->auxb_bits, AUXMR);
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_t1_delay);
-
-void nec7210_return_to_local(const struct gpib_board *board, struct nec7210_priv *priv)
-{
-	write_byte(priv, AUX_RTL, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_return_to_local);
-
-static inline short nec7210_atn_has_changed(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	short address_status_bits = read_byte(priv, ADSR);
-
-	if (address_status_bits & HR_NATN) {
-		if (test_bit(ATN_NUM, &board->status))
-			return 1;
-		else
-			return 0;
-	} else	{
-		if (test_bit(ATN_NUM, &board->status))
-			return 0;
-		else
-			return 1;
-	}
-	return -1;
-}
-
-int nec7210_command(struct gpib_board *board, struct nec7210_priv *priv, u8
-		    *buffer, size_t length, size_t *bytes_written)
-{
-	int retval = 0;
-	unsigned long flags;
-
-	*bytes_written = 0;
-
-	clear_bit(BUS_ERROR_BN, &priv->state);
-
-	while (*bytes_written < length)	{
-		if (wait_event_interruptible(board->wait,
-					     test_bit(COMMAND_READY_BN, &priv->state) ||
-					     test_bit(BUS_ERROR_BN, &priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			dev_dbg(board->gpib_dev, "command wait interrupted\n");
-			retval = -ERESTARTSYS;
-			break;
-		}
-		if (test_bit(TIMO_NUM, &board->status))
-			break;
-		if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
-			break;
-		spin_lock_irqsave(&board->spinlock, flags);
-		clear_bit(COMMAND_READY_BN, &priv->state);
-		write_byte(priv, buffer[*bytes_written], CDOR);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		++(*bytes_written);
-
-		if (need_resched())
-			schedule();
-	}
-	// wait for last byte to get sent
-	if (wait_event_interruptible(board->wait, test_bit(COMMAND_READY_BN, &priv->state) ||
-				     test_bit(BUS_ERROR_BN, &priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-
-	if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
-		retval = -EIO;
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_command);
-
-static int pio_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		    size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-
-	*bytes_read = 0;
-	*end = 0;
-
-	while (*bytes_read < length) {
-		if (wait_event_interruptible(board->wait,
-					     test_bit(READ_READY_BN, &priv->state) ||
-					     test_bit(DEV_CLEAR_BN, &priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		if (test_bit(READ_READY_BN, &priv->state)) {
-			if (*bytes_read == 0)	{
-				/*
-				 * We set the handshake mode here because we know
-				 * no new bytes will arrive (it has already arrived
-				 * and is awaiting being read out of the chip) while we are changing
-				 * modes.  This ensures we can reliably keep track
-				 * of the holdoff state.
-				 */
-				nec7210_set_handshake_mode(board, priv, HR_HLDA);
-			}
-			buffer[(*bytes_read)++] = nec7210_read_data_in(board, priv, end);
-			if (*end)
-				break;
-		}
-		if (test_bit(TIMO_NUM, &board->status)) {
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-
-		if (*bytes_read < length)
-			nec7210_release_rfd_holdoff(board, priv);
-
-		if (need_resched())
-			schedule();
-	}
-	return retval;
-}
-
-#ifdef NEC_DMA
-static ssize_t __dma_read(struct gpib_board *board, struct nec7210_priv *priv, size_t length)
-{
-	ssize_t retval = 0;
-	size_t count = 0;
-	unsigned long flags, dma_irq_flags;
-
-	if (length == 0)
-		return 0;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	dma_irq_flags = claim_dma_lock();
-	disable_dma(priv->dma_channel);
-	/* program dma controller */
-	clear_dma_ff(priv->dma_channel);
-	set_dma_count(priv->dma_channel, length);
-	set_dma_addr(priv->dma_channel, priv->dma_buffer_addr);
-	set_dma_mode(priv->dma_channel, DMA_MODE_READ);
-	release_dma_lock(dma_irq_flags);
-
-	enable_dma(priv->dma_channel);
-
-	set_bit(DMA_READ_IN_PROGRESS_BN, &priv->state);
-	clear_bit(READ_READY_BN, &priv->state);
-
-	// enable nec7210 dma
-	nec7210_set_reg_bits(priv, IMR2, HR_DMAI, HR_DMAI);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	// wait for data to transfer
-	if (wait_event_interruptible(board->wait,
-				     test_bit(DMA_READ_IN_PROGRESS_BN, &priv->state) == 0 ||
-				     test_bit(DEV_CLEAR_BN, &priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_bit(DEV_CLEAR_BN, &priv->state))
-		retval = -EINTR;
-
-	// disable nec7210 dma
-	nec7210_set_reg_bits(priv, IMR2, HR_DMAI, 0);
-
-	// record how many bytes we transferred
-	flags = claim_dma_lock();
-	clear_dma_ff(priv->dma_channel);
-	disable_dma(priv->dma_channel);
-	count += length - get_dma_residue(priv->dma_channel);
-	release_dma_lock(flags);
-
-	return retval ? retval : count;
-}
-
-static ssize_t dma_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-			size_t length)
-{
-	size_t remain = length;
-	size_t transfer_size;
-	ssize_t retval = 0;
-
-	while (remain > 0) {
-		transfer_size = (priv->dma_buffer_length < remain) ?
-			priv->dma_buffer_length : remain;
-		retval = __dma_read(board, priv, transfer_size);
-		if (retval < 0)
-			break;
-		memcpy(buffer, priv->dma_buffer, transfer_size);
-		remain -= retval;
-		buffer += retval;
-		if (test_bit(RECEIVED_END_BN, &priv->state))
-			break;
-	}
-
-	if (retval < 0)
-		return retval;
-
-	return length - remain;
-}
-#endif
-
-int nec7210_read(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		 size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-
-	*end = 0;
-	*bytes_read = 0;
-
-	if (length == 0)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &priv->state); // XXX wrong
-
-	nec7210_release_rfd_holdoff(board, priv);
-
-	retval = pio_read(board, priv, buffer, length, end, bytes_read);
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_read);
-
-static int pio_write_wait(struct gpib_board *board, struct nec7210_priv *priv,
-			  short wake_on_lacs, short wake_on_atn, short wake_on_bus_error)
-{
-	// wait until byte is ready to be sent
-	if (wait_event_interruptible(board->wait,
-				     (test_bit(TACS_NUM, &board->status) &&
-				      test_bit(WRITE_READY_BN, &priv->state)) ||
-				     test_bit(DEV_CLEAR_BN, &priv->state) ||
-				     (wake_on_bus_error && test_bit(BUS_ERROR_BN, &priv->state)) ||
-				     (wake_on_lacs && test_bit(LACS_NUM, &board->status)) ||
-				     (wake_on_atn && test_bit(ATN_NUM, &board->status)) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		return -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-
-	if (test_bit(DEV_CLEAR_BN, &priv->state))
-		return -EINTR;
-
-	if (wake_on_bus_error && test_and_clear_bit(BUS_ERROR_BN, &priv->state))
-		return -EIO;
-
-	return 0;
-}
-
-static int pio_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-		     size_t length, size_t *bytes_written)
-{
-	size_t last_count = 0;
-	ssize_t retval = 0;
-	unsigned long flags;
-	const int max_bus_errors = (length > 1000) ? length : 1000;
-	int bus_error_count = 0;
-	*bytes_written = 0;
-
-	clear_bit(BUS_ERROR_BN, &priv->state);
-
-	while (*bytes_written < length) {
-		if (need_resched())
-			schedule();
-
-		retval = pio_write_wait(board, priv, 0, 0, priv->type == NEC7210);
-		if (retval == -EIO) {
-			/* resend last byte on bus error */
-			*bytes_written = last_count;
-			/*
-			 * we can get unrecoverable bus errors,
-			 * so give up after a while
-			 */
-			bus_error_count++;
-			if (bus_error_count > max_bus_errors)
-				return retval;
-			continue;
-		} else {
-			if (retval < 0)
-				return retval;
-		}
-		spin_lock_irqsave(&board->spinlock, flags);
-		clear_bit(BUS_ERROR_BN, &priv->state);
-		clear_bit(WRITE_READY_BN, &priv->state);
-		last_count = *bytes_written;
-		write_byte(priv, buffer[(*bytes_written)++], CDOR);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-	}
-	retval = pio_write_wait(board, priv, 1, 1, priv->type == NEC7210);
-	return retval;
-}
-
-#ifdef NEC_DMA
-static ssize_t __dma_write(struct gpib_board *board, struct nec7210_priv *priv, dma_addr_t address,
-			   size_t length)
-{
-	unsigned long flags, dma_irq_flags;
-	int residue = 0;
-	int retval = 0;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	/* program dma controller */
-	dma_irq_flags = claim_dma_lock();
-	disable_dma(priv->dma_channel);
-	clear_dma_ff(priv->dma_channel);
-	set_dma_count(priv->dma_channel, length);
-	set_dma_addr(priv->dma_channel, address);
-	set_dma_mode(priv->dma_channel, DMA_MODE_WRITE);
-	enable_dma(priv->dma_channel);
-	release_dma_lock(dma_irq_flags);
-
-	// enable board's dma for output
-	nec7210_set_reg_bits(priv, IMR2, HR_DMAO, HR_DMAO);
-
-	clear_bit(WRITE_READY_BN, &priv->state);
-	set_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state);
-
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	// suspend until message is sent
-	if (wait_event_interruptible(board->wait,
-				     test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state) == 0 ||
-				     test_bit(BUS_ERROR_BN, &priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-	if (test_and_clear_bit(DEV_CLEAR_BN, &priv->state))
-		retval = -EINTR;
-	if (test_and_clear_bit(BUS_ERROR_BN, &priv->state))
-		retval = -EIO;
-
-	// disable board's dma
-	nec7210_set_reg_bits(priv, IMR2, HR_DMAO, 0);
-
-	dma_irq_flags = claim_dma_lock();
-	clear_dma_ff(priv->dma_channel);
-	disable_dma(priv->dma_channel);
-	residue = get_dma_residue(priv->dma_channel);
-	release_dma_lock(dma_irq_flags);
-
-	if (residue)
-		retval = -EPIPE;
-
-	return retval ? retval : length;
-}
-
-static ssize_t dma_write(struct gpib_board *board, struct nec7210_priv *priv, u8 *buffer,
-			 size_t length)
-{
-	size_t remain = length;
-	size_t transfer_size;
-	ssize_t retval = 0;
-
-	while (remain > 0) {
-		transfer_size = (priv->dma_buffer_length < remain) ?
-			priv->dma_buffer_length : remain;
-		memcpy(priv->dma_buffer, buffer, transfer_size);
-		retval = __dma_write(board, priv, priv->dma_buffer_addr, transfer_size);
-		if (retval < 0)
-			break;
-		remain -= retval;
-		buffer += retval;
-	}
-
-	if (retval < 0)
-		return retval;
-
-	return length - remain;
-}
-#endif
-int nec7210_write(struct gpib_board *board, struct nec7210_priv *priv,
-		  u8 *buffer, size_t length, int send_eoi,
-		  size_t *bytes_written)
-{
-	int retval = 0;
-
-	*bytes_written = 0;
-
-	clear_bit(DEV_CLEAR_BN, &priv->state); // XXX
-
-	if (send_eoi)
-		length-- ; // save the last byte for sending EOI
-
-	if (length > 0)	{
-		// isa dma transfer
-		if (0 /*priv->dma_channel*/) {
-/*
- * dma writes are unreliable since they can't recover from bus errors
- * (which happen when ATN is asserted in the middle of a write)
- */
-#ifdef NEC_DMA
-			retval = dma_write(board, priv, buffer, length);
-			if (retval < 0)
-				return retval;
-			count += retval;
-#endif
-		} else {	// PIO transfer
-			size_t num_bytes;
-
-			retval = pio_write(board, priv, buffer, length, &num_bytes);
-
-			*bytes_written += num_bytes;
-			if (retval < 0)
-				return retval;
-		}
-	}
-	if (send_eoi) {
-		size_t num_bytes;
-
-		/*
-		 * We need to wait to make sure we will immediately be able to write the data byte
-		 * into the chip before sending the associated AUX_SEOI command.  This is really
-		 * only needed for length==1 since otherwise the earlier calls to pio_write
-		 * will have dont the wait already.
-		 */
-		retval = pio_write_wait(board, priv, 0, 0, priv->type == NEC7210);
-		if (retval < 0)
-			return retval;
-		/*send EOI */
-		write_byte(priv, AUX_SEOI, AUXMR);
-
-		retval = pio_write(board, priv, &buffer[*bytes_written], 1, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_write);
-
-/*
- * interrupt service routine
- */
-irqreturn_t nec7210_interrupt(struct gpib_board *board, struct nec7210_priv *priv)
-{
-	int status1, status2;
-
-	// read interrupt status (also clears status)
-	status1 = read_byte(priv, ISR1);
-	status2 = read_byte(priv, ISR2);
-
-	return nec7210_interrupt_have_status(board, priv, status1, status2);
-}
-EXPORT_SYMBOL(nec7210_interrupt);
-
-irqreturn_t nec7210_interrupt_have_status(struct gpib_board *board,
-					  struct nec7210_priv *priv, int status1, int status2)
-{
-#ifdef NEC_DMA
-	unsigned long dma_flags;
-#endif
-	int retval = IRQ_NONE;
-
-	// record service request in status
-	if (status2 & HR_SRQI)
-		set_bit(SRQI_NUM, &board->status);
-
-	// change in lockout status
-	if (status2 & HR_LOKC) {
-		if (status2 & HR_LOK)
-			set_bit(LOK_NUM, &board->status);
-		else
-			clear_bit(LOK_NUM, &board->status);
-	}
-
-	// change in remote status
-	if (status2 & HR_REMC) {
-		if (status2 & HR_REM)
-			set_bit(REM_NUM, &board->status);
-		else
-			clear_bit(REM_NUM, &board->status);
-	}
-
-	// record reception of END
-	if (status1 & HR_END) {
-		set_bit(RECEIVED_END_BN, &priv->state);
-		if ((priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDE)
-			set_bit(RFD_HOLDOFF_BN, &priv->state);
-	}
-
-	// get incoming data in PIO mode
-	if ((status1 & HR_DI)) {
-		set_bit(READ_READY_BN, &priv->state);
-		if ((priv->auxa_bits & HR_HANDSHAKE_MASK) == HR_HLDA)
-			set_bit(RFD_HOLDOFF_BN, &priv->state);
-	}
-#ifdef NEC_DMA
-	// check for dma read transfer complete
-	if (test_bit(DMA_READ_IN_PROGRESS_BN, &priv->state)) {
-		dma_flags = claim_dma_lock();
-		disable_dma(priv->dma_channel);
-		clear_dma_ff(priv->dma_channel);
-		if ((status1 & HR_END) || get_dma_residue(priv->dma_channel) == 0)
-			clear_bit(DMA_READ_IN_PROGRESS_BN, &priv->state);
-		else
-			enable_dma(priv->dma_channel);
-		release_dma_lock(dma_flags);
-	}
-#endif
-	if ((status1 & HR_DO)) {
-		if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state) == 0)
-			set_bit(WRITE_READY_BN, &priv->state);
-#ifdef NEC_DMA
-		if (test_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state)) {	// write data, isa dma mode
-			// check if dma transfer is complete
-			dma_flags = claim_dma_lock();
-			disable_dma(priv->dma_channel);
-			clear_dma_ff(priv->dma_channel);
-			if (get_dma_residue(priv->dma_channel) == 0) {
-				clear_bit(DMA_WRITE_IN_PROGRESS_BN, &priv->state);
-			// XXX race? byte may still be in CDOR reg
-			} else {
-				clear_bit(WRITE_READY_BN, &priv->state);
-				enable_dma(priv->dma_channel);
-			}
-			release_dma_lock(dma_flags);
-		}
-#endif
-	}
-
-	// outgoing command can be sent
-	if (status2 & HR_CO)
-		set_bit(COMMAND_READY_BN, &priv->state);
-
-	// command pass through received
-	if (status1 & HR_CPT)
-		write_byte(priv, AUX_NVAL, AUXMR);
-
-	if (status1 & HR_ERR)
-		set_bit(BUS_ERROR_BN, &priv->state);
-
-	if (status1 & HR_DEC) {
-		unsigned short address_status_bits = read_byte(priv, ADSR);
-
-		// ignore device clear events if we are controller in charge
-		if ((address_status_bits & HR_CIC) == 0) {
-			push_gpib_event(board, EVENT_DEV_CLR);
-			set_bit(DEV_CLEAR_BN, &priv->state);
-		}
-	}
-
-	if (status1 & HR_DET)
-		push_gpib_event(board, EVENT_DEV_TRG);
-
-	// Addressing status has changed
-	if (status2 & HR_ADSC)
-		set_bit(ADR_CHANGE_BN, &priv->state);
-
-	if ((status1 & priv->reg_bits[IMR1]) ||
-	    (status2 & (priv->reg_bits[IMR2] & IMR2_ENABLE_INTR_MASK)) ||
-	    nec7210_atn_has_changed(board, priv))	{
-		nec7210_update_status_nolock(board, priv);
-		dev_dbg(board->gpib_dev, "minor %i, stat %lx, isr1 0x%x, imr1 0x%x, isr2 0x%x, imr2 0x%x\n",
-			board->minor, board->status, status1, priv->reg_bits[IMR1], status2,
-			     priv->reg_bits[IMR2]);
-		wake_up_interruptible(&board->wait); /* wake up sleeping process */
-		retval = IRQ_HANDLED;
-	}
-
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_interrupt_have_status);
-
-void nec7210_board_reset(struct nec7210_priv *priv, const struct gpib_board *board)
-{
-	/* 7210 chip reset */
-	write_byte(priv, AUX_CR, AUXMR);
-
-	/* disable all interrupts */
-	priv->reg_bits[IMR1] = 0;
-	write_byte(priv, priv->reg_bits[IMR1], IMR1);
-	priv->reg_bits[IMR2] = 0;
-	write_byte(priv, priv->reg_bits[IMR2], IMR2);
-	write_byte(priv, 0, SPMR);
-
-	/* clear registers by reading */
-	read_byte(priv, CPTR);
-	read_byte(priv, ISR1);
-	read_byte(priv, ISR2);
-
-	/* parallel poll unconfigure */
-	write_byte(priv, PPR | HR_PPU, AUXMR);
-
-	priv->reg_bits[ADMR] = HR_TRM0 | HR_TRM1;
-
-	priv->auxa_bits = AUXRA | HR_HLDA;
-	write_byte(priv, priv->auxa_bits, AUXMR);
-
-	write_byte(priv, AUXRE | 0, AUXMR);
-
-	/* set INT pin to active high, enable command pass through of unknown commands */
-	priv->auxb_bits = AUXRB | HR_CPTE;
-	write_byte(priv, priv->auxb_bits, AUXMR);
-	write_byte(priv, AUXRE, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_board_reset);
-
-void nec7210_board_online(struct nec7210_priv *priv, const struct gpib_board *board)
-{
-	/* set GPIB address */
-	nec7210_primary_address(board, priv, board->pad);
-	nec7210_secondary_address(board, priv, board->sad, board->sad >= 0);
-
-	/* enable interrupts */
-	priv->reg_bits[IMR1] = HR_ERRIE | HR_DECIE | HR_ENDIE |
-		HR_DETIE | HR_CPTIE | HR_DOIE | HR_DIIE;
-	priv->reg_bits[IMR2] = IMR2_ENABLE_INTR_MASK;
-	write_byte(priv, priv->reg_bits[IMR1], IMR1);
-	write_byte(priv, priv->reg_bits[IMR2], IMR2);
-
-	write_byte(priv, AUX_PON, AUXMR);
-}
-EXPORT_SYMBOL(nec7210_board_online);
-
-#ifdef CONFIG_HAS_IOPORT
-/* wrappers for io */
-u8 nec7210_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num)
-{
-	return inb(priv->iobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL(nec7210_ioport_read_byte);
-
-void nec7210_ioport_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num)
-{
-	if (register_num == AUXMR)
-		/*
-		 * locking makes absolutely sure noone accesses the
-		 * AUXMR register faster than once per microsecond
-		 */
-		nec7210_locking_ioport_write_byte(priv, data, register_num);
-	else
-		outb(data, priv->iobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL(nec7210_ioport_write_byte);
-
-/* locking variants of io wrappers, for chips that page-in registers */
-u8 nec7210_locking_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num)
-{
-	u8 retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->register_page_lock, flags);
-	retval = inb(priv->iobase + register_num * priv->offset);
-	spin_unlock_irqrestore(&priv->register_page_lock, flags);
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_locking_ioport_read_byte);
-
-void nec7210_locking_ioport_write_byte(struct nec7210_priv *priv, u8 data,
-				       unsigned int register_num)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->register_page_lock, flags);
-	if (register_num == AUXMR)
-		udelay(1);
-	outb(data, priv->iobase + register_num * priv->offset);
-	spin_unlock_irqrestore(&priv->register_page_lock, flags);
-}
-EXPORT_SYMBOL(nec7210_locking_ioport_write_byte);
-#endif
-
-u8 nec7210_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num)
-{
-	return readb(priv->mmiobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL(nec7210_iomem_read_byte);
-
-void nec7210_iomem_write_byte(struct nec7210_priv *priv, u8 data, unsigned int register_num)
-{
-	if (register_num == AUXMR)
-		/*
-		 * locking makes absolutely sure noone accesses the
-		 * AUXMR register faster than once per microsecond
-		 */
-		nec7210_locking_iomem_write_byte(priv, data, register_num);
-	else
-		writeb(data, priv->mmiobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL(nec7210_iomem_write_byte);
-
-u8 nec7210_locking_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num)
-{
-	u8 retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->register_page_lock, flags);
-	retval = readb(priv->mmiobase + register_num * priv->offset);
-	spin_unlock_irqrestore(&priv->register_page_lock, flags);
-	return retval;
-}
-EXPORT_SYMBOL(nec7210_locking_iomem_read_byte);
-
-void nec7210_locking_iomem_write_byte(struct nec7210_priv *priv, u8 data,
-				      unsigned int register_num)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->register_page_lock, flags);
-	if (register_num == AUXMR)
-		udelay(1);
-	writeb(data, priv->mmiobase + register_num * priv->offset);
-	spin_unlock_irqrestore(&priv->register_page_lock, flags);
-}
-EXPORT_SYMBOL(nec7210_locking_iomem_write_byte);
-
-static int __init nec7210_init_module(void)
-{
-	return 0;
-}
-
-static void __exit nec7210_exit_module(void)
-{
-}
-
-module_init(nec7210_init_module);
-module_exit(nec7210_exit_module);
diff --git a/drivers/staging/gpib/ni_usb/Makefile b/drivers/staging/gpib/ni_usb/Makefile
deleted file mode 100644
index 469c5d16add3..000000000000
--- a/drivers/staging/gpib/ni_usb/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-
-obj-$(CONFIG_GPIB_NI_USB) += ni_usb_gpib.o
-
-
diff --git a/drivers/staging/gpib/ni_usb/ni_usb_gpib.c b/drivers/staging/gpib/ni_usb/ni_usb_gpib.c
deleted file mode 100644
index 1f8412de9fa3..000000000000
--- a/drivers/staging/gpib/ni_usb/ni_usb_gpib.c
+++ /dev/null
@@ -1,2678 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * driver for National Instruments usb to gpib adapters
- *    copyright		   : (C) 2004 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include "ni_usb_gpib.h"
-#include "gpibP.h"
-#include "nec7210.h"
-#include "tnt4882_registers.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for National Instruments USB devices");
-
-#define MAX_NUM_NI_USB_INTERFACES 128
-static struct usb_interface *ni_usb_driver_interfaces[MAX_NUM_NI_USB_INTERFACES];
-
-static int ni_usb_parse_status_block(const u8 *buffer, struct ni_usb_status_block *status);
-static int ni_usb_set_interrupt_monitor(struct gpib_board *board, unsigned int monitored_bits);
-static void ni_usb_stop(struct ni_usb_priv *ni_priv);
-
-static DEFINE_MUTEX(ni_usb_hotplug_lock);
-
-// calculates a reasonable timeout in that can be passed to usb functions
-static inline unsigned long ni_usb_timeout_msecs(unsigned int usec)
-{
-	if (usec == 0)
-		return 0;
-	return 2000 + usec / 500;
-};
-
-// returns timeout code byte for use in ni-usb-b instructions
-static unsigned short ni_usb_timeout_code(unsigned int usec)
-{
-	if (usec == 0)
-		return 0xf0;
-	else if (usec <= 10)
-		return 0xf1;
-	else if (usec <= 30)
-		return 0xf2;
-	else if (usec <= 100)
-		return 0xf3;
-	else if (usec <= 300)
-		return 0xf4;
-	else if (usec <= 1000)
-		return 0xf5;
-	else if (usec <= 3000)
-		return 0xf6;
-	else if (usec <= 10000)
-		return 0xf7;
-	else if (usec <= 30000)
-		return 0xf8;
-	else if (usec <= 100000)
-		return 0xf9;
-	else if (usec <= 300000)
-		return 0xfa;
-	else if (usec <= 1000000)
-		return 0xfb;
-	else if (usec <= 3000000)
-		return 0xfc;
-	else if (usec <= 10000000)
-		return 0xfd;
-	else if (usec <= 30000000)
-		return 0xfe;
-	else if (usec <= 100000000)
-		return 0xff;
-	else if	 (usec <= 300000000)
-		return 0x01;
-	/*
-	 * NI driver actually uses 0xff for timeout T1000s, which is a bug in their code.
-	 * I've verified on a usb-b that a code of 0x2 is correct for a 1000 sec timeout
-	 */
-	else if (usec <= 1000000000)
-		return 0x02;
-	pr_err("bug? usec is greater than 1e9\n");
-	return 0xf0;
-}
-
-static void ni_usb_bulk_complete(struct urb *urb)
-{
-	struct ni_usb_urb_ctx *context = urb->context;
-
-	complete(&context->complete);
-}
-
-static void ni_usb_timeout_handler(struct timer_list *t)
-{
-	struct ni_usb_priv *ni_priv = timer_container_of(ni_priv, t,
-							 bulk_timer);
-	struct ni_usb_urb_ctx *context = &ni_priv->context;
-
-	context->timed_out = 1;
-	complete(&context->complete);
-};
-
-// I'm using nonblocking loosely here, it only means -EAGAIN can be returned in certain cases
-static int ni_usb_nonblocking_send_bulk_msg(struct ni_usb_priv *ni_priv, void *data,
-					    int data_length, int *actual_data_length,
-					    int timeout_msecs)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int out_pipe;
-	struct ni_usb_urb_ctx *context = &ni_priv->context;
-
-	*actual_data_length = 0;
-	mutex_lock(&ni_priv->bulk_transfer_lock);
-	if (!ni_priv->bus_interface) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -ENODEV;
-	}
-	if (ni_priv->bulk_urb) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -EAGAIN;
-	}
-	ni_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!ni_priv->bulk_urb) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_pipe = usb_sndbulkpipe(usb_dev, ni_priv->bulk_out_endpoint);
-	init_completion(&context->complete);
-	context->timed_out = 0;
-	usb_fill_bulk_urb(ni_priv->bulk_urb, usb_dev, out_pipe, data, data_length,
-			  &ni_usb_bulk_complete, context);
-
-	if (timeout_msecs)
-		mod_timer(&ni_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
-
-	retval = usb_submit_urb(ni_priv->bulk_urb, GFP_KERNEL);
-	if (retval) {
-		timer_delete_sync(&ni_priv->bulk_timer);
-		usb_free_urb(ni_priv->bulk_urb);
-		ni_priv->bulk_urb = NULL;
-		dev_err(&usb_dev->dev, "failed to submit bulk out urb, retval=%i\n",
-			retval);
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return retval;
-	}
-	mutex_unlock(&ni_priv->bulk_transfer_lock);
-	wait_for_completion(&context->complete);    // wait for ni_usb_bulk_complete
-	if (context->timed_out) {
-		usb_kill_urb(ni_priv->bulk_urb);
-		dev_err(&usb_dev->dev, "killed urb due to timeout\n");
-		retval = -ETIMEDOUT;
-	} else {
-		retval = ni_priv->bulk_urb->status;
-	}
-
-	timer_delete_sync(&ni_priv->bulk_timer);
-	*actual_data_length = ni_priv->bulk_urb->actual_length;
-	mutex_lock(&ni_priv->bulk_transfer_lock);
-	usb_free_urb(ni_priv->bulk_urb);
-	ni_priv->bulk_urb = NULL;
-	mutex_unlock(&ni_priv->bulk_transfer_lock);
-	return retval;
-}
-
-static int ni_usb_send_bulk_msg(struct ni_usb_priv *ni_priv, void *data, int data_length,
-				int *actual_data_length, int timeout_msecs)
-{
-	int retval;
-	int timeout_msecs_remaining = timeout_msecs;
-
-	retval = ni_usb_nonblocking_send_bulk_msg(ni_priv, data, data_length, actual_data_length,
-						  timeout_msecs_remaining);
-	while (retval == -EAGAIN && (timeout_msecs == 0 || timeout_msecs_remaining > 0)) {
-		usleep_range(1000, 1500);
-		retval = ni_usb_nonblocking_send_bulk_msg(ni_priv, data, data_length,
-							  actual_data_length,
-							  timeout_msecs_remaining);
-		if (timeout_msecs != 0)
-			--timeout_msecs_remaining;
-	}
-	if (timeout_msecs != 0 && timeout_msecs_remaining <= 0)
-		return -ETIMEDOUT;
-	return retval;
-}
-
-// I'm using nonblocking loosely here, it only means -EAGAIN can be returned in certain cases
-static int ni_usb_nonblocking_receive_bulk_msg(struct ni_usb_priv *ni_priv,
-					       void *data, int data_length,
-					       int *actual_data_length, int timeout_msecs,
-					       int interruptible)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int in_pipe;
-	struct ni_usb_urb_ctx *context = &ni_priv->context;
-
-	*actual_data_length = 0;
-	mutex_lock(&ni_priv->bulk_transfer_lock);
-	if (!ni_priv->bus_interface) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -ENODEV;
-	}
-	if (ni_priv->bulk_urb) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -EAGAIN;
-	}
-	ni_priv->bulk_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!ni_priv->bulk_urb) {
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return -ENOMEM;
-	}
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	in_pipe = usb_rcvbulkpipe(usb_dev, ni_priv->bulk_in_endpoint);
-	init_completion(&context->complete);
-	context->timed_out = 0;
-	usb_fill_bulk_urb(ni_priv->bulk_urb, usb_dev, in_pipe, data, data_length,
-			  &ni_usb_bulk_complete, context);
-
-	if (timeout_msecs)
-		mod_timer(&ni_priv->bulk_timer, jiffies + msecs_to_jiffies(timeout_msecs));
-
-	retval = usb_submit_urb(ni_priv->bulk_urb, GFP_KERNEL);
-	if (retval) {
-		timer_delete_sync(&ni_priv->bulk_timer);
-		usb_free_urb(ni_priv->bulk_urb);
-		ni_priv->bulk_urb = NULL;
-		dev_err(&usb_dev->dev, "failed to submit bulk in urb, retval=%i\n", retval);
-		mutex_unlock(&ni_priv->bulk_transfer_lock);
-		return retval;
-	}
-	mutex_unlock(&ni_priv->bulk_transfer_lock);
-	if (interruptible) {
-		if (wait_for_completion_interruptible(&context->complete)) {
-			/*
-			 * If we got interrupted by a signal while
-			 * waiting for the usb gpib to respond, we
-			 * should send a stop command so it will
-			 * finish up with whatever it was doing and
-			 * send its response now.
-			 */
-			ni_usb_stop(ni_priv);
-			retval = -ERESTARTSYS;
-			/*
-			 * now do an uninterruptible wait, it shouldn't take long
-			 * for the board to respond now.
-			 */
-			wait_for_completion(&context->complete);
-		}
-	} else {
-		wait_for_completion(&context->complete);
-	}
-	if (context->timed_out) {
-		usb_kill_urb(ni_priv->bulk_urb);
-		dev_err(&usb_dev->dev, "killed urb due to timeout\n");
-		retval = -ETIMEDOUT;
-	} else {
-		if (ni_priv->bulk_urb->status)
-			retval = ni_priv->bulk_urb->status;
-	}
-	timer_delete_sync(&ni_priv->bulk_timer);
-	*actual_data_length = ni_priv->bulk_urb->actual_length;
-	mutex_lock(&ni_priv->bulk_transfer_lock);
-	usb_free_urb(ni_priv->bulk_urb);
-	ni_priv->bulk_urb = NULL;
-	mutex_unlock(&ni_priv->bulk_transfer_lock);
-	return retval;
-}
-
-static int ni_usb_receive_bulk_msg(struct ni_usb_priv *ni_priv, void *data,
-				   int data_length, int *actual_data_length, int timeout_msecs,
-				   int interruptible)
-{
-	int retval;
-	int timeout_msecs_remaining = timeout_msecs;
-
-	retval = ni_usb_nonblocking_receive_bulk_msg(ni_priv, data, data_length,
-						     actual_data_length, timeout_msecs_remaining,
-						     interruptible);
-	while (retval == -EAGAIN && (timeout_msecs == 0 || timeout_msecs_remaining > 0)) {
-		usleep_range(1000, 1500);
-		retval = ni_usb_nonblocking_receive_bulk_msg(ni_priv, data, data_length,
-							     actual_data_length,
-							     timeout_msecs_remaining,
-							     interruptible);
-		if (timeout_msecs != 0)
-			--timeout_msecs_remaining;
-	}
-	if (timeout_msecs && timeout_msecs_remaining <= 0)
-		return -ETIMEDOUT;
-	return retval;
-}
-
-static int ni_usb_receive_control_msg(struct ni_usb_priv *ni_priv, __u8 request,
-				      __u8 requesttype, __u16 value, __u16 index,
-				      void *data, __u16 size, int timeout_msecs)
-{
-	struct usb_device *usb_dev;
-	int retval;
-	unsigned int in_pipe;
-
-	mutex_lock(&ni_priv->control_transfer_lock);
-	if (!ni_priv->bus_interface) {
-		mutex_unlock(&ni_priv->control_transfer_lock);
-		return -ENODEV;
-	}
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	in_pipe = usb_rcvctrlpipe(usb_dev, 0);
-	retval = usb_control_msg(usb_dev, in_pipe, request, requesttype, value, index, data,
-				 size, timeout_msecs);
-	mutex_unlock(&ni_priv->control_transfer_lock);
-	return retval;
-}
-
-static void ni_usb_soft_update_status(struct gpib_board *board, unsigned int ni_usb_ibsta,
-				      unsigned int clear_mask)
-{
-	static const unsigned int ni_usb_ibsta_mask = SRQI | ATN | CIC | REM | LACS | TACS | LOK;
-
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	unsigned int need_monitoring_bits = ni_usb_ibsta_monitor_mask;
-	unsigned long flags;
-
-	board->status &= ~clear_mask;
-	board->status &= ~ni_usb_ibsta_mask;
-	board->status |= ni_usb_ibsta & ni_usb_ibsta_mask;
-	if (ni_usb_ibsta & DCAS)
-		push_gpib_event(board, EVENT_DEV_CLR);
-	if (ni_usb_ibsta & DTAS)
-		push_gpib_event(board, EVENT_DEV_TRG);
-
-	spin_lock_irqsave(&board->spinlock, flags);
-/* remove set status bits from monitored set why ?***/
-	ni_priv->monitored_ibsta_bits &= ~ni_usb_ibsta;
-	need_monitoring_bits &= ~ni_priv->monitored_ibsta_bits; /* mm - monitored set */
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	dev_dbg(&usb_dev->dev, "need_monitoring_bits=0x%x\n", need_monitoring_bits);
-
-	if (need_monitoring_bits & ~ni_usb_ibsta)
-		ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
-	else if (need_monitoring_bits & ni_usb_ibsta)
-		wake_up_interruptible(&board->wait);
-
-	dev_dbg(&usb_dev->dev, "ibsta=0x%x\n", ni_usb_ibsta);
-}
-
-static int ni_usb_parse_status_block(const u8 *buffer, struct ni_usb_status_block *status)
-{
-	u16 count;
-
-	status->id = buffer[0];
-	status->ibsta = (buffer[1] << 8) | buffer[2];
-	status->error_code = buffer[3];
-	count = buffer[4] | (buffer[5] << 8);
-	count = ~count;
-	count++;
-	status->count = count;
-	return 8;
-};
-
-static void ni_usb_dump_raw_block(const u8 *raw_data, int length)
-{
-	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 8, 1, raw_data, length, true);
-}
-
-static int ni_usb_parse_register_read_block(const u8 *raw_data, unsigned int *results,
-					    int num_results)
-{
-	int i = 0;
-	int j;
-	int unexpected = 0;
-	static const int results_per_chunk = 3;
-
-	for (j = 0; j < num_results;) {
-		int k;
-
-		if (raw_data[i++] != NIUSB_REGISTER_READ_DATA_START_ID) {
-			pr_err("parse error: wrong start id\n");
-			unexpected = 1;
-		}
-		for (k = 0; k < results_per_chunk && j < num_results; ++k)
-			results[j++] = raw_data[i++];
-	}
-	while (i % 4)
-		i++;
-	if (raw_data[i++] != NIUSB_REGISTER_READ_DATA_END_ID) {
-		pr_err("parse error: wrong end id\n");
-		unexpected = 1;
-	}
-	if (raw_data[i++] % results_per_chunk != num_results % results_per_chunk) {
-		pr_err("parse error: wrong count=%i for NIUSB_REGISTER_READ_DATA_END\n",
-		       (int)raw_data[i - 1]);
-		unexpected = 1;
-	}
-	while (i % 4) {
-		if (raw_data[i++] != 0) {
-			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
-			       i - 1, (int)raw_data[i - 1]);
-			unexpected = 1;
-		}
-	}
-	if (unexpected)
-		ni_usb_dump_raw_block(raw_data, i);
-	return i;
-}
-
-static int ni_usb_parse_termination_block(const u8 *buffer)
-{
-	int i = 0;
-
-	if (buffer[i++] != NIUSB_TERM_ID ||
-	    buffer[i++] != 0x0 ||
-	    buffer[i++] != 0x0 ||
-	    buffer[i++] != 0x0) {
-		pr_err("received unexpected termination block\n");
-		pr_err(" expected: 0x%x 0x%x 0x%x 0x%x\n", NIUSB_TERM_ID, 0x0, 0x0, 0x0);
-		pr_err(" received: 0x%x 0x%x 0x%x 0x%x\n",
-		       buffer[i - 4], buffer[i - 3], buffer[i - 2], buffer[i - 1]);
-	}
-	return i;
-};
-
-static int parse_board_ibrd_readback(const u8 *raw_data, struct ni_usb_status_block *status,
-				     u8 *parsed_data, int parsed_data_length,
-				     int *actual_bytes_read)
-{
-	static const int ibrd_data_block_length = 0xf;
-	static const int ibrd_extended_data_block_length = 0x1e;
-	int data_block_length = 0;
-	int i = 0;
-	int j = 0;
-	int k;
-	int num_data_blocks = 0;
-	struct ni_usb_status_block register_write_status;
-	int unexpected = 0;
-
-	while (raw_data[i] == NIUSB_IBRD_DATA_ID || raw_data[i] == NIUSB_IBRD_EXTENDED_DATA_ID) {
-		if (raw_data[i] == NIUSB_IBRD_DATA_ID) {
-			data_block_length = ibrd_data_block_length;
-		} else if (raw_data[i] == NIUSB_IBRD_EXTENDED_DATA_ID) {
-			data_block_length = ibrd_extended_data_block_length;
-			if (raw_data[++i] !=  0)	{
-				pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
-				       i, (int)raw_data[i]);
-				unexpected = 1;
-			}
-		} else {
-			pr_err("Unexpected NIUSB_IBRD ID\n");
-			return -EINVAL;
-		}
-		++i;
-		for (k = 0; k < data_block_length; k++) {
-			if (j < parsed_data_length)
-				parsed_data[j++] = raw_data[i++];
-			else
-				++i;
-		}
-		++num_data_blocks;
-	}
-	i += ni_usb_parse_status_block(&raw_data[i], status);
-	if (status->id != NIUSB_IBRD_STATUS_ID) {
-		pr_err("bug: status->id=%i, != ibrd_status_id\n", status->id);
-		return -EIO;
-	}
-	i++;
-	if (num_data_blocks) {
-		*actual_bytes_read = (num_data_blocks - 1) * data_block_length + raw_data[i++];
-	} else {
-		++i;
-		*actual_bytes_read = 0;
-	}
-	if (*actual_bytes_read > j)
-		pr_err("bug: discarded data. actual_bytes_read=%i, j=%i\n", *actual_bytes_read, j);
-	for (k = 0; k < 2; k++)
-		if (raw_data[i++] != 0) {
-			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
-			       i - 1, (int)raw_data[i - 1]);
-			unexpected = 1;
-		}
-	i += ni_usb_parse_status_block(&raw_data[i], &register_write_status);
-	if (register_write_status.id != NIUSB_REG_WRITE_ID) {
-		pr_err("unexpected data: register write status id=0x%x, expected 0x%x\n",
-		       register_write_status.id, NIUSB_REG_WRITE_ID);
-		unexpected = 1;
-	}
-	if (raw_data[i++] != 2) {
-		pr_err("unexpected data: register write count=%i, expected 2\n",
-		       (int)raw_data[i - 1]);
-		unexpected = 1;
-	}
-	for (k = 0; k < 3; k++)
-		if (raw_data[i++] != 0) {
-			pr_err("unexpected data: raw_data[%i]=0x%x, expected 0\n",
-			       i - 1, (int)raw_data[i - 1]);
-			unexpected = 1;
-		}
-	i += ni_usb_parse_termination_block(&raw_data[i]);
-	if (unexpected)
-		ni_usb_dump_raw_block(raw_data, i);
-	return i;
-}
-
-static	int ni_usb_parse_reg_write_status_block(const u8 *raw_data,
-						struct ni_usb_status_block *status,
-						int *writes_completed)
-{
-	int i = 0;
-
-	i += ni_usb_parse_status_block(raw_data, status);
-	*writes_completed = raw_data[i++];
-	while (i % 4)
-		i++;
-	return i;
-}
-
-static int ni_usb_write_registers(struct ni_usb_priv *ni_priv,
-				  const struct ni_usb_register *writes, int num_writes,
-				  unsigned int *ibsta)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	int retval;
-	u8 *out_data, *in_data;
-	int out_data_length;
-	static const int in_data_length = 0x20;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0;
-	int j;
-	struct ni_usb_status_block status;
-	static const int bytes_per_write = 3;
-	int reg_writes_completed;
-
-	out_data_length = num_writes * bytes_per_write + 0x10;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	i += ni_usb_bulk_register_write_header(&out_data[i], num_writes);
-	for (j = 0; j < num_writes; j++)
-		i += ni_usb_bulk_register_write(&out_data[i], writes[j]);
-	while (i % 4)
-		out_data[i++] = 0x00;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
-	kfree(out_data);
-	if (retval) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		return retval;
-	}
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
-	if (retval || bytes_read != 16) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		ni_usb_dump_raw_block(in_data, bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	ni_usb_parse_reg_write_status_block(in_data, &status, &reg_writes_completed);
-	// FIXME parse extra 09 status bits and termination
-	kfree(in_data);
-	if (status.id != NIUSB_REG_WRITE_ID) {
-		dev_err(&usb_dev->dev, "parse error, id=0x%x != NIUSB_REG_WRITE_ID\n", status.id);
-		return -EIO;
-	}
-	if (status.error_code) {
-		dev_err(&usb_dev->dev, "nonzero error code 0x%x\n", status.error_code);
-		return -EIO;
-	}
-	if (reg_writes_completed != num_writes) {
-		dev_err(&usb_dev->dev, "reg_writes_completed=%i, num_writes=%i\n",
-			reg_writes_completed, num_writes);
-		return -EIO;
-	}
-	if (ibsta)
-		*ibsta = status.ibsta;
-	return 0;
-}
-
-// interface functions
-static int ni_usb_read(struct gpib_board *board, u8 *buffer, size_t length,
-		       int *end, size_t *bytes_read)
-{
-	int retval, parse_retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	static const int out_data_length = 0x20;
-	int in_data_length;
-	int usb_bytes_written = 0, usb_bytes_read = 0;
-	int i = 0;
-	int complement_count;
-	int actual_length;
-	struct ni_usb_status_block status;
-	static const int max_read_length = 0xffff;
-	struct ni_usb_register reg;
-
-	*bytes_read = 0;
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	if (length > max_read_length)
-		return -EINVAL;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = 0x0a;
-	out_data[i++] = ni_priv->eos_mode >> 8;
-	out_data[i++] = ni_priv->eos_char;
-	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
-	complement_count = length - 1;
-	complement_count = ~complement_count;
-	out_data[i++] = complement_count & 0xff;
-	out_data[i++] = (complement_count >> 8) & 0xff;
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	i += ni_usb_bulk_register_write_header(&out_data[i], 2);
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(AUXMR);
-	reg.value = AUX_HLDI;
-	i += ni_usb_bulk_register_write(&out_data[i], reg);
-	reg.value = AUX_CLEAR_END;
-	i += ni_usb_bulk_register_write(&out_data[i], reg);
-	while (i % 4)	// pad with zeros to 4-byte boundary
-		out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &usb_bytes_written, 1000);
-	kfree(out_data);
-	if (retval || usb_bytes_written != i) {
-		if (retval == 0)
-			retval = -EIO;
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, usb_bytes_written=%i, i=%i\n",
-			retval, usb_bytes_written, i);
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return retval;
-	}
-
-	in_data_length = (length / 30 + 1) * 0x20 + 0x20;
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &usb_bytes_read,
-					 ni_usb_timeout_msecs(board->usec_timeout), 1);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if (retval == -ERESTARTSYS) {
-	} else if (retval) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, usb_bytes_read=%i\n",
-			retval, usb_bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	parse_retval = parse_board_ibrd_readback(in_data, &status, buffer, length, &actual_length);
-	if (parse_retval != usb_bytes_read) {
-		if (parse_retval >= 0)
-			parse_retval = -EIO;
-		dev_err(&usb_dev->dev, "retval=%i usb_bytes_read=%i\n",
-			parse_retval, usb_bytes_read);
-		kfree(in_data);
-		return parse_retval;
-	}
-	if (actual_length != length - status.count) {
-		dev_err(&usb_dev->dev, "actual_length=%i expected=%li\n",
-			actual_length, (long)(length - status.count));
-		ni_usb_dump_raw_block(in_data, usb_bytes_read);
-	}
-	kfree(in_data);
-	switch (status.error_code) {
-	case NIUSB_NO_ERROR:
-		retval = 0;
-		break;
-	case NIUSB_ABORTED_ERROR:
-		/*
-		 * this is expected if ni_usb_receive_bulk_msg got
-		 * interrupted by a signal and returned -ERESTARTSYS
-		 */
-		break;
-	case NIUSB_ATN_STATE_ERROR:
-		if (status.ibsta & DCAS) {
-			retval = -EINTR;
-		} else {
-			retval = -EIO;
-			dev_dbg(&usb_dev->dev, "read when ATN set stat: 0x%06x\n", status.ibsta);
-		}
-		break;
-	case NIUSB_ADDRESSING_ERROR:
-		retval = -EIO;
-		break;
-	case NIUSB_TIMEOUT_ERROR:
-		retval = -ETIMEDOUT;
-		break;
-	case NIUSB_EOSMODE_ERROR:
-		dev_err(&usb_dev->dev, "driver bug, we should have been able to avoid NIUSB_EOSMODE_ERROR.\n");
-		retval = -EINVAL;
-		break;
-	default:
-		dev_err(&usb_dev->dev, "unknown error code=%i\n",  status.error_code);
-		retval = -EIO;
-		break;
-	}
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	if (status.ibsta & END)
-		*end = 1;
-	else
-		*end = 0;
-	*bytes_read = actual_length;
-	return retval;
-}
-
-static int ni_usb_write(struct gpib_board *board, u8 *buffer, size_t length,
-			int send_eoi, size_t *bytes_written)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	int out_data_length;
-	static const int in_data_length = 0x10;
-	int usb_bytes_written = 0, usb_bytes_read = 0;
-	int i = 0, j;
-	int complement_count;
-	struct ni_usb_status_block status;
-	static const int max_write_length = 0xffff;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	if (length > max_write_length)
-		return -EINVAL;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data_length = length + 0x10;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = 0x0d;
-	complement_count = length - 1;
-	complement_count = ~complement_count;
-	out_data[i++] = complement_count & 0xff;
-	out_data[i++] = (complement_count >> 8) & 0xff;
-	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	if (send_eoi)
-		out_data[i++] = 0x8;
-	else
-		out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	for (j = 0; j < length; j++)
-		out_data[i++] = buffer[j];
-	while (i % 4)	// pad with zeros to 4-byte boundary
-		out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &usb_bytes_written,
-				      ni_usb_timeout_msecs(board->usec_timeout));
-	kfree(out_data);
-	if (retval || usb_bytes_written != i)	{
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, usb_bytes_written=%i, i=%i\n",
-			retval, usb_bytes_written, i);
-		return retval;
-	}
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &usb_bytes_read,
-					 ni_usb_timeout_msecs(board->usec_timeout), 1);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if ((retval && retval != -ERESTARTSYS) || usb_bytes_read != 12) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, usb_bytes_read=%i\n",
-			retval, usb_bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	ni_usb_parse_status_block(in_data, &status);
-	kfree(in_data);
-	switch	(status.error_code) {
-	case NIUSB_NO_ERROR:
-		retval = 0;
-		break;
-	case NIUSB_ABORTED_ERROR:
-		/*
-		 * this is expected if ni_usb_receive_bulk_msg got
-		 * interrupted by a signal and returned -ERESTARTSYS
-		 */
-		break;
-	case NIUSB_ADDRESSING_ERROR:
-		dev_err(&usb_dev->dev, "Addressing error retval %d error code=%i\n",
-			retval, status.error_code);
-		retval = -ENXIO;
-		break;
-	case NIUSB_NO_LISTENER_ERROR:
-		retval = -ECOMM;
-		break;
-	case NIUSB_TIMEOUT_ERROR:
-		retval = -ETIMEDOUT;
-		break;
-	default:
-		dev_err(&usb_dev->dev, "unknown error code=%i\n", status.error_code);
-		retval = -EPIPE;
-		break;
-	}
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	*bytes_written = length - status.count;
-	return retval;
-}
-
-static int ni_usb_command_chunk(struct gpib_board *board, u8 *buffer, size_t length,
-				size_t *command_bytes_written)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	int out_data_length;
-	static const int in_data_length = 0x10;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0, j;
-	unsigned int complement_count;
-	struct ni_usb_status_block status;
-	// usb-b gives error 4 if you try to send more than 16 command bytes at once
-	static const int max_command_length = 0x10;
-
-	*command_bytes_written = 0;
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	if (length > max_command_length)
-		length = max_command_length;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data_length = length + 0x10;
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = 0x0c;
-	complement_count = length - 1;
-	complement_count = ~complement_count;
-	out_data[i++] = complement_count;
-	out_data[i++] = 0x0;
-	out_data[i++] = ni_usb_timeout_code(board->usec_timeout);
-	for (j = 0; j < length; j++)
-		out_data[i++] = buffer[j];
-	while (i % 4)	// pad with zeros to 4-byte boundary
-		out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written,
-				      ni_usb_timeout_msecs(board->usec_timeout));
-	kfree(out_data);
-	if (retval || bytes_written != i) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		return retval;
-	}
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read,
-					 ni_usb_timeout_msecs(board->usec_timeout), 1);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if ((retval && retval != -ERESTARTSYS) || bytes_read != 12) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	ni_usb_parse_status_block(in_data, &status);
-	kfree(in_data);
-	*command_bytes_written = length - status.count;
-	switch (status.error_code) {
-	case NIUSB_NO_ERROR:
-		break;
-	case NIUSB_ABORTED_ERROR:
-		/*
-		 * this is expected if ni_usb_receive_bulk_msg got
-		 * interrupted by a signal and returned -ERESTARTSYS
-		 */
-		break;
-	case NIUSB_NO_BUS_ERROR:
-		return -ENOTCONN;
-	case NIUSB_EOSMODE_ERROR:
-		dev_err(&usb_dev->dev, "got eosmode error. Driver bug?\n");
-		return -EIO;
-	case NIUSB_TIMEOUT_ERROR:
-		return -ETIMEDOUT;
-	default:
-		dev_err(&usb_dev->dev, "unknown error code=%i\n", status.error_code);
-		return -EIO;
-	}
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	return 0;
-}
-
-static int ni_usb_command(struct gpib_board *board, u8 *buffer, size_t length,
-			  size_t *bytes_written)
-{
-	size_t count;
-	int retval;
-
-	*bytes_written = 0;
-	while (*bytes_written < length) {
-		retval = ni_usb_command_chunk(board, buffer + *bytes_written,
-					      length - *bytes_written, &count);
-		*bytes_written += count;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-static int ni_usb_take_control(struct gpib_board *board, int synchronous)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	static const int out_data_length = 0x10;
-	static const int  in_data_length = 0x10;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0;
-	struct ni_usb_status_block status;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-	out_data[i++] = NIUSB_IBCAC_ID;
-	if (synchronous)
-		out_data[i++] = 0x1;
-	else
-		out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
-	kfree(out_data);
-	if (retval || bytes_written != i) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		return retval;
-	}
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 1);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if ((retval && retval != -ERESTARTSYS) || bytes_read != 12) {
-		if (retval == 0)
-			retval = -EIO;
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	ni_usb_parse_status_block(in_data, &status);
-	kfree(in_data);
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	return retval;
-}
-
-/*
- * Send the NIUSB_IBGTS_ID request to the adapter and fold the ibsta value
- * from its reply into the board status.
- *
- * The request/reply exchange is serialised with addressed_transfer_lock.
- *
- * Returns 0 on success, -ENODEV when the USB interface is gone, -ENOMEM on
- * allocation failure, -EIO on a short write or a reply that is not 12 bytes
- * long, or the error code reported by the bulk transfer helpers.
- */
-static int ni_usb_go_to_standby(struct gpib_board *board)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	static const int out_data_length = 0x10;
-	static const int  in_data_length = 0x20;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0;
-	struct ni_usb_status_block status;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-
-	out_data[i++] = NIUSB_IBGTS_ID;
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-
-	mutex_lock(&ni_priv->addressed_transfer_lock);
-
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
-	kfree(out_data);
-	if (retval || bytes_written != i) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		/* a short write with retval == 0 must not be reported as success */
-		return retval ? retval : -EIO;
-	}
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if (retval || bytes_read != 12) {
-		/* a reply of the wrong length is an I/O error even if the transfer succeeded */
-		if (retval == 0)
-			retval = -EIO;
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	ni_usb_parse_status_block(in_data, &status);
-	kfree(in_data);
-	if (status.id != NIUSB_IBGTS_ID)
-		dev_err(&usb_dev->dev, "bug: status.id 0x%x != INUSB_IBGTS_ID\n", status.id);
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	return 0;
-}
-
-/*
- * Request or release system control.
- *
- * When @request_control is non-zero two registers are written (SETSC to
- * CMDR, then AUX_CIFC to AUXMR).  Otherwise four are written (AUX_CREN,
- * AUX_CIFC and AUX_DSC to AUXMR, then CLRSC to CMDR) and the cached
- * ren_state is cleared.  The ibsta returned by the write is merged into the
- * board status.
- *
- * Returns 0 on success, -ENODEV without a USB interface, or the negative
- * value returned by ni_usb_write_registers().
- */
-static int ni_usb_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register writes[4];
-	struct ni_usb_register *reg = writes;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	if (request_control) {
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = CMDR;
-		reg->value = SETSC;
-		reg++;
-
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = nec7210_to_tnt4882_offset(AUXMR);
-		reg->value = AUX_CIFC;
-		reg++;
-	} else {
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = nec7210_to_tnt4882_offset(AUXMR);
-		reg->value = AUX_CREN;
-		reg++;
-
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = nec7210_to_tnt4882_offset(AUXMR);
-		reg->value = AUX_CIFC;
-		reg++;
-
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = nec7210_to_tnt4882_offset(AUXMR);
-		reg->value = AUX_DSC;
-		reg++;
-
-		reg->device = NIUSB_SUBDEV_TNT4882;
-		reg->address = CMDR;
-		reg->value = CLRSC;
-		reg++;
-	}
-
-	/* reg now points one past the last entry that was filled in */
-	rc = ni_usb_write_registers(ni_priv, writes, (int)(reg - writes), &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return rc;
-	}
-	if (!request_control)
-		ni_priv->ren_state = 0;
-	ni_usb_soft_update_status(board, ibsta, 0);
-	return 0;
-}
-
-/*
- * Interface-clear callback.  A NIUSB_IBSIC_ID request is sent to the adapter
- * only when @assert is non-zero; a zero @assert is ignored.  The callback
- * returns void, so failures are logged (or silently dropped on allocation
- * failure) rather than reported to the caller.
- */
-// FIXME maybe the interface should have a "pulse interface clear" function that can return an error?
-static void ni_usb_interface_clear(struct gpib_board *board, int assert)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	static const int request_size = 0x10;
-	static const int reply_size = 0x10;
-	struct ni_usb_status_block status;
-	struct usb_device *usb_dev;
-	u8 *request, *reply;
-	int sent = 0, received = 0;
-	int len = 0;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	// FIXME: we are going to pulse when assert is true, and ignore otherwise
-	if (!assert)
-		return;
-
-	request = kmalloc(request_size, GFP_KERNEL);
-	if (!request)
-		return;
-	request[len++] = NIUSB_IBSIC_ID;
-	request[len++] = 0x0;
-	request[len++] = 0x0;
-	request[len++] = 0x0;
-	len += ni_usb_bulk_termination(&request[len]);
-
-	rc = ni_usb_send_bulk_msg(ni_priv, request, len, &sent, 1000);
-	kfree(request);
-	if (rc || sent != len) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			rc, sent, len);
-		return;
-	}
-
-	reply = kmalloc(reply_size, GFP_KERNEL);
-	if (!reply)
-		return;
-
-	rc = ni_usb_receive_bulk_msg(ni_priv, reply, reply_size, &received, 1000, 0);
-	if (rc || received != 12) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			rc, received);
-		kfree(reply);
-		return;
-	}
-
-	/* the reply is a status block; propagate its ibsta to the board */
-	ni_usb_parse_status_block(reply, &status);
-	kfree(reply);
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-}
-
-/*
- * Remote-enable callback: write AUX_SREN (when @enable is non-zero) or
- * AUX_CREN (otherwise) to the TNT4882 AUXMR register, record @enable in
- * ren_state and merge the returned ibsta into the board status.
- *
- * Returns nothing; a missing USB interface is ignored and a failed register
- * write is only logged.
- */
-static void ni_usb_remote_enable(struct gpib_board *board, int enable)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register reg;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(AUXMR);
-	reg.value = enable ? AUX_SREN : AUX_CREN;
-
-	rc = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return;
-	}
-	ni_priv->ren_state = enable;
-	ni_usb_soft_update_status(board, ibsta, 0);
-}
-
-/*
- * Record the end-of-string settings in the private data: store @eos_byte,
- * set REOS in eos_mode, and set or clear BIN depending on @compare_8_bits.
- * Nothing is sent to the adapter here.  Always returns 0.
- */
-static int ni_usb_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct ni_usb_priv *priv = board->private_data;
-
-	priv->eos_char = eos_byte;
-	if (compare_8_bits)
-		priv->eos_mode |= REOS | BIN;
-	else
-		priv->eos_mode = (priv->eos_mode | REOS) & ~BIN;
-	return 0;
-}
-
-/*
- * Forget the end-of-string settings held in the private data.
- *
- * Both the mode bits and the character are zeroed: the adapter gets unhappy
- * (returns error 4 on reads) unless all of them are cleared.
- */
-static void ni_usb_disable_eos(struct gpib_board *board)
-{
-	struct ni_usb_priv *priv = board->private_data;
-
-	priv->eos_char = 0;
-	priv->eos_mode = 0;
-}
-
-/*
- * Ask the adapter for its current status (NI_USB_WAIT_REQUEST control
- * transfer with value 0x200), merge the returned ibsta into the board status
- * through ni_usb_soft_update_status() using @clear_mask, and return
- * board->status.
- *
- * The return value is an unsigned status word, so an errno cannot be
- * reported through it: on every failure (USB interface gone, allocation
- * failure, transfer of unexpected length) the currently cached
- * board->status is returned unchanged.
- */
-static unsigned int ni_usb_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	static const int buffer_length = 8;
-	u8 *buffer;
-	struct ni_usb_status_block status;
-
-	/* returning -ENODEV here would be read by callers as a bogus bit mask */
-	if (!ni_priv->bus_interface)
-		return board->status;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	buffer = kmalloc(buffer_length, GFP_KERNEL);
-	if (!buffer)
-		return board->status;
-
-	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_WAIT_REQUEST, USB_DIR_IN |
-					    USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-					    0x200, 0x0, buffer, buffer_length, 1000);
-	if (retval != buffer_length) {
-		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", retval);
-		kfree(buffer);
-		return board->status;
-	}
-	ni_usb_parse_status_block(buffer, &status);
-	kfree(buffer);
-	ni_usb_soft_update_status(board, status.ibsta, clear_mask);
-	return board->status;
-}
-
-/*
- * Tell the ni-usb adapter to immediately stop an ongoing i/o operation by
- * issuing an NI_USB_STOP_REQUEST control transfer.  The 8-byte reply is
- * parsed as a status block but its contents are not used.  A reply of any
- * other length is logged.
- */
-static void ni_usb_stop(struct ni_usb_priv *ni_priv)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	static const int reply_size = 8;
-	struct ni_usb_status_block status;
-	u8 *reply;
-	int rc;
-
-	reply = kmalloc(reply_size, GFP_KERNEL);
-	if (!reply)
-		return;
-
-	rc = ni_usb_receive_control_msg(ni_priv, NI_USB_STOP_REQUEST,
-					USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-					0x0, 0x0, reply, reply_size, 1000);
-	if (rc == reply_size)
-		ni_usb_parse_status_block(reply, &status);
-	else
-		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", rc);
-	kfree(reply);
-}
-
-/*
- * Program the primary address: @address is written both to the TNT4882 ADR
- * register and to register 0x0 of NIUSB_SUBDEV_UNKNOWN2, and the ibsta
- * returned by the write is merged into the board status.
- *
- * Returns 0 on success, -ENODEV without a USB interface, or the negative
- * value returned by ni_usb_write_registers().
- */
-static int ni_usb_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register writes[2];
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	writes[0].device = NIUSB_SUBDEV_TNT4882;
-	writes[0].address = nec7210_to_tnt4882_offset(ADR);
-	writes[0].value = address;
-
-	writes[1].device = NIUSB_SUBDEV_UNKNOWN2;
-	writes[1].address = 0x0;
-	writes[1].value = address;
-
-	rc = ni_usb_write_registers(ni_priv, writes, 2, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return rc;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-	return 0;
-}
-
-/*
- * Fill @writes with the three register writes that configure secondary
- * addressing and return the number of entries written (always 3).
- *
- * With @enable set, @address is OR-ed into the ADR value, HR_ADM1 is
- * selected in ADMR and MSA(@address) goes to register 0x1 of
- * NIUSB_SUBDEV_UNKNOWN2.  Otherwise HR_DT | HR_DL is OR-ed into the ADR
- * value, HR_ADM0 is selected and 0 is written to that register.
- *
- * The caller must provide room for at least three entries.
- */
-static int ni_usb_write_sad(struct ni_usb_register *writes, int address, int enable)
-{
-	unsigned int adr_bits = HR_ARS;
-	unsigned int admr_bits = HR_TRM0 | HR_TRM1;
-
-	if (enable) {
-		adr_bits |= address;
-		admr_bits |= HR_ADM1;
-	} else {
-		adr_bits |= HR_DT | HR_DL;
-		admr_bits |= HR_ADM0;
-	}
-
-	writes[0].device = NIUSB_SUBDEV_TNT4882;
-	writes[0].address = nec7210_to_tnt4882_offset(ADR);
-	writes[0].value = adr_bits;
-
-	writes[1].device = NIUSB_SUBDEV_TNT4882;
-	writes[1].address = nec7210_to_tnt4882_offset(ADMR);
-	writes[1].value = admr_bits;
-
-	writes[2].device = NIUSB_SUBDEV_UNKNOWN2;
-	writes[2].address = 0x1;
-	writes[2].value = enable ? MSA(address) : 0x0;
-
-	return 3;
-}
-
-/*
- * Configure secondary addressing on the adapter using the register sequence
- * built by ni_usb_write_sad(), then merge the returned ibsta into the board
- * status.
- *
- * Returns 0 on success, -ENODEV without a USB interface, or the negative
- * value returned by ni_usb_write_registers().
- */
-static int ni_usb_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register writes[3];
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int count;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	count = ni_usb_write_sad(writes, address, enable);
-	rc = ni_usb_write_registers(ni_priv, writes, count, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return rc;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-	return 0;
-}
-
-/*
- * Run a parallel poll: send an NIUSB_IBRPP_ID request, read the reply, parse
- * its leading status block and store the byte that follows it in @result.
- * The ibsta from the status block is merged into the board status.
- *
- * Returns 0 on success, -ENODEV without a USB interface, -ENOMEM on
- * allocation failure, -EIO on a short write, or the error code from the bulk
- * transfer helpers.  -ERESTARTSYS from the receive is passed back to the
- * caller after the (possibly partial) reply has still been processed.
- */
-static int ni_usb_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	u8 *out_data, *in_data;
-	static const int out_data_length = 0x10;
-	static const int  in_data_length = 0x20;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0;
-	int j = 0;
-	struct ni_usb_status_block status;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	out_data = kmalloc(out_data_length, GFP_KERNEL);
-	if (!out_data)
-		return -ENOMEM;
-
-	out_data[i++] = NIUSB_IBRPP_ID;
-	out_data[i++] = 0xf0;	// FIXME: this should be the parallel poll timeout code
-	out_data[i++] = 0x0;
-	out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-	/*FIXME: 1000 should use parallel poll timeout (not supported yet)*/
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, i, &bytes_written, 1000);
-
-	kfree(out_data);
-	if (retval || bytes_written != i) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-			retval, bytes_written, i);
-		/* a short write must not return 0 while leaving *result unset */
-		return retval ? retval : -EIO;
-	}
-	/*
-	 * Zeroed on purpose: the reply is parsed without a length check (and
-	 * even after -ERESTARTSYS), so never expose uninitialised heap data.
-	 */
-	in_data = kzalloc(in_data_length, GFP_KERNEL);
-	if (!in_data)
-		return -ENOMEM;
-
-	/*FIXME: should use parallel poll timeout (not supported yet)*/
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length,
-					 &bytes_read, 1000, 1);
-
-	if (retval && retval != -ERESTARTSYS)	{
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		kfree(in_data);
-		return retval;
-	}
-	/* the poll result byte immediately follows the status block */
-	j += ni_usb_parse_status_block(in_data, &status);
-	*result = in_data[j++];
-	kfree(in_data);
-	ni_usb_soft_update_status(board, status.ibsta, 0);
-	return retval;
-}
-
-/*
- * Write PPR | @config to the TNT4882 AUXMR register and merge the returned
- * ibsta into the board status.  Returns nothing: a missing USB interface is
- * ignored and a failed register write is only logged.
- */
-static void ni_usb_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register reg;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(AUXMR);
-	reg.value = PPR | config;
-
-	rc = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-}
-
-/*
- * Write AUX_SPPF (when @ist is non-zero) or AUX_CPPF (otherwise) to the
- * TNT4882 AUXMR register and merge the returned ibsta into the board
- * status.  Returns nothing: a missing USB interface is ignored and a failed
- * register write is only logged.
- */
-static void ni_usb_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register reg;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(AUXMR);
-	reg.value = ist ? AUX_SPPF : AUX_CPPF;
-
-	rc = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-}
-
-/*
- * Write @status to the TNT4882 SPMR register and merge the returned ibsta
- * into the board status.  Returns nothing: a missing USB interface is
- * ignored and a failed register write is only logged.
- */
-static void ni_usb_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register reg;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(SPMR);
-	reg.value = status;
-
-	rc = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-}
-
-/*
- * Serial poll status callback.  This driver does not query the adapter
- * here; @board is unused and the function unconditionally reports 0.
- */
-static u8 ni_usb_serial_poll_status(struct gpib_board *board)
-{
-	return 0;
-}
-
-/*
- * Write AUX_RTL to the TNT4882 AUXMR register and merge the returned ibsta
- * into the board status.  Returns nothing: a missing USB interface is
- * ignored and a failed register write is only logged.
- */
-static void ni_usb_return_to_local(struct gpib_board *board)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register reg;
-	struct usb_device *usb_dev;
-	unsigned int ibsta;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return; // -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	reg.device = NIUSB_SUBDEV_TNT4882;
-	reg.address = nec7210_to_tnt4882_offset(AUXMR);
-	reg.value = AUX_RTL;
-
-	rc = ni_usb_write_registers(ni_priv, &reg, 1, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-}
-
-/*
- * Read the TNT4882 BSR register through a non-blocking bulk exchange and
- * translate its BCSR_*_BIT flags into BUS_* flags on top of VALID_ALL.
- *
- * addressed_transfer_lock is only tried, never waited for, because this
- * function gets called during ibwait; -EBUSY is returned when it is held.
- *
- * Returns the line status bit mask on success, -ENODEV without a USB
- * interface, -ENOMEM on allocation failure, -EBUSY as described above, or
- * the value returned by the bulk transfer helpers (-EAGAIN is not logged).
- */
-static int ni_usb_line_status(const struct gpib_board *board)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	static const int request_size = 0x20;
-	static const int reply_size = 0x20;
-	struct usb_device *usb_dev;
-	u8 *request, *reply;
-	int sent = 0, received = 0;
-	unsigned int bsr;
-	int lines = VALID_ALL;
-	int len = 0;
-	int rc;
-	// NI windows driver reads 0xd(HSSEL), 0xc (ARD0), 0x1f (BSR)
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	request = kmalloc(request_size, GFP_KERNEL);
-	if (!request)
-		return -ENOMEM;
-
-	/* line status gets called during ibwait */
-	if (!mutex_trylock(&ni_priv->addressed_transfer_lock)) {
-		kfree(request);
-		return -EBUSY;
-	}
-
-	len += ni_usb_bulk_register_read_header(&request[len], 1);
-	len += ni_usb_bulk_register_read(&request[len], NIUSB_SUBDEV_TNT4882, BSR);
-	/* zero-pad the request to a multiple of four bytes */
-	for (; len % 4; len++)
-		request[len] = 0x0;
-	len += ni_usb_bulk_termination(&request[len]);
-
-	rc = ni_usb_nonblocking_send_bulk_msg(ni_priv, request, len, &sent, 1000);
-	kfree(request);
-	if (rc || sent != len) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		if (rc != -EAGAIN)
-			dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%i\n",
-				rc, sent, len);
-		return rc;
-	}
-
-	reply = kmalloc(reply_size, GFP_KERNEL);
-	if (!reply) {
-		mutex_unlock(&ni_priv->addressed_transfer_lock);
-		return -ENOMEM;
-	}
-	rc = ni_usb_nonblocking_receive_bulk_msg(ni_priv, reply, reply_size,
-						 &received, 1000, 0);
-
-	mutex_unlock(&ni_priv->addressed_transfer_lock);
-
-	if (rc) {
-		if (rc != -EAGAIN)
-			dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-				rc, received);
-		kfree(reply);
-		return rc;
-	}
-
-	ni_usb_parse_register_read_block(reply, &bsr, 1);
-	kfree(reply);
-
-	if (bsr & BCSR_REN_BIT)
-		lines |= BUS_REN;
-	if (bsr & BCSR_IFC_BIT)
-		lines |= BUS_IFC;
-	if (bsr & BCSR_SRQ_BIT)
-		lines |= BUS_SRQ;
-	if (bsr & BCSR_EOI_BIT)
-		lines |= BUS_EOI;
-	if (bsr & BCSR_NRFD_BIT)
-		lines |= BUS_NRFD;
-	if (bsr & BCSR_NDAC_BIT)
-		lines |= BUS_NDAC;
-	if (bsr & BCSR_DAV_BIT)
-		lines |= BUS_DAV;
-	if (bsr & BCSR_ATN_BIT)
-		lines |= BUS_ATN;
-	return lines;
-}
-
-/*
- * Fill @reg with the three register writes that select the T1 delay and
- * return the number of entries written (always 3).
- *
- * @nano_sec is the requested delay.  *@actual_ns receives the delay that
- * was selected: 350, 500 or 1100 for requests at or below those thresholds,
- * 2000 otherwise.
- *
- * The caller must provide room for at least three entries.
- */
-static int ni_usb_setup_t1_delay(struct ni_usb_register *reg, unsigned int nano_sec,
-				 unsigned int *actual_ns)
-{
-	*actual_ns = 2000;
-
-	reg[0].device = NIUSB_SUBDEV_TNT4882;
-	reg[0].address = nec7210_to_tnt4882_offset(AUXMR);
-	if (nano_sec > 1100) {
-		reg[0].value = AUXRI | SISB;
-	} else {
-		reg[0].value = AUXRI | USTD | SISB;
-		*actual_ns = 1100;
-	}
-
-	reg[1].device = NIUSB_SUBDEV_TNT4882;
-	reg[1].address = nec7210_to_tnt4882_offset(AUXMR);
-	if (nano_sec > 500) {
-		reg[1].value = AUXRB;
-	} else {
-		reg[1].value = AUXRB | HR_TRI;
-		*actual_ns = 500;
-	}
-
-	reg[2].device = NIUSB_SUBDEV_TNT4882;
-	reg[2].address = KEYREG;
-	if (nano_sec > 350) {
-		reg[2].value = 0x0;
-	} else {
-		reg[2].value = MSTD;
-		*actual_ns = 350;
-	}
-
-	return 3;
-}
-
-/*
- * Program the T1 delay on the adapter using the register sequence built by
- * ni_usb_setup_t1_delay(), store the selected delay in board->t1_nano_sec
- * and merge the returned ibsta into the board status.
- *
- * Returns the delay actually selected (in the units of @nano_sec) on
- * success, -ENODEV without a USB interface, or the negative value returned
- * by ni_usb_write_registers().
- */
-static int ni_usb_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct ni_usb_register writes[3];
-	struct usb_device *usb_dev;
-	unsigned int selected_ns;
-	unsigned int ibsta;
-	int count;
-	int rc;
-
-	if (!ni_priv->bus_interface)
-		return -ENODEV;
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-
-	count = ni_usb_setup_t1_delay(writes, nano_sec, &selected_ns);
-	rc = ni_usb_write_registers(ni_priv, writes, count, &ibsta);
-	if (rc < 0) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", rc);
-		return rc;
-	}
-	board->t1_nano_sec = selected_ns;
-	ni_usb_soft_update_status(board, ibsta, 0);
-	return selected_ns;
-}
-
-/*
- * Allocate the zero-initialised per-board private data, attach it to
- * board->private_data and initialise its four transfer mutexes.
- *
- * Returns 0 on success or -ENOMEM when the allocation fails (in which case
- * board->private_data is NULL).
- */
-static int ni_usb_allocate_private(struct gpib_board *board)
-{
-	struct ni_usb_priv *ni_priv;
-
-	/* kzalloc replaces the former kmalloc + memset pair */
-	ni_priv = kzalloc(sizeof(*ni_priv), GFP_KERNEL);
-	board->private_data = ni_priv;
-	if (!ni_priv)
-		return -ENOMEM;
-	mutex_init(&ni_priv->bulk_transfer_lock);
-	mutex_init(&ni_priv->control_transfer_lock);
-	mutex_init(&ni_priv->interrupt_transfer_lock);
-	mutex_init(&ni_priv->addressed_transfer_lock);
-	return 0;
-}
-
-/*
- * Release the private data: drop the interrupt URB stored in it, then free
- * the structure itself.  @ni_priv must not be used afterwards.
- */
-static void ni_usb_free_private(struct ni_usb_priv *ni_priv)
-{
-	struct urb *interrupt_urb = ni_priv->interrupt_urb;
-
-	usb_free_urb(interrupt_urb);
-	kfree(ni_priv);
-}
-
-/*
- * Capacity, in entries, of the register array handed to ni_usb_setup_init().
- * The sequence below produces 11 + 3 (T1 delay) + 5 + 3 (secondary address)
- * + 4 = 26 entries, i.e. exactly this many.
- */
-#define NUM_INIT_WRITES 26
-/*
- * Fill @writes with the register sequence used to initialise the adapter
- * from the settings in @board (t1_nano_sec, master, pad, sad) and the cached
- * eos_mode.
- *
- * @writes must have room for NUM_INIT_WRITES entries.
- *
- * Returns the number of entries filled in, or 0 if more than
- * NUM_INIT_WRITES entries were produced.  NOTE(review): that check runs only
- * after all entries have been stored, so it reports an overrun rather than
- * preventing one.
- */
-static int ni_usb_setup_init(struct gpib_board *board, struct ni_usb_register *writes)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	unsigned int mask, actual_ns;
-	int i = 0;
-
-	writes[i].device = NIUSB_SUBDEV_UNKNOWN3;
-	writes[i].address = 0x10;
-	writes[i].value = 0x0;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = CMDR;
-	writes[i].value = SOFT_RESET;
-	i++;
-	writes[i].device =  NIUSB_SUBDEV_TNT4882;
-	writes[i].address =  nec7210_to_tnt4882_offset(AUXMR);
-	mask = AUXRA | HR_HLDA;
-	if (ni_priv->eos_mode & BIN)
-		mask |= HR_BIN;
-	writes[i].value = mask;
-	i++;
-	/* NOTE(review): AUXCR receives the same mask value as the AUXMR write above */
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = AUXCR;
-	writes[i].value = mask;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = HSSEL;
-	writes[i].value = TNT_ONE_CHIP_BIT;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUX_CR;
-	i++;
-	/* interrupt mask registers: IMR0 gets its always-on bits, IMR1-3 are zeroed */
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = IMR0;
-	writes[i].value = TNT_IMR0_ALWAYS_BITS;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(IMR1);
-	writes[i].value = 0x0;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address =  nec7210_to_tnt4882_offset(IMR2);
-	writes[i].value = 0x0;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = IMR3;
-	writes[i].value = 0x0;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUX_HLDI;
-	i++;
-
-	/* appends the T1 delay entries; the selected delay (actual_ns) is not used here */
-	i += ni_usb_setup_t1_delay(&writes[i], board->t1_nano_sec, &actual_ns);
-
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUXRG | NTNL_BIT;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = CMDR;
-	if (board->master)
-		mask = SETSC; // set system controller
-	else
-		mask = CLRSC; // clear system controller
-	writes[i].value = mask;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUX_CIFC;
-	i++;
-	/* primary address: same pair of writes as ni_usb_primary_address() */
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(ADR);
-	writes[i].value = board->pad;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_UNKNOWN2;
-	writes[i].address = 0x0;
-	writes[i].value = board->pad;
-	i++;
-
-	/* secondary addressing is enabled only when board->sad is non-negative */
-	i += ni_usb_write_sad(&writes[i], board->sad, board->sad >= 0);
-
-	writes[i].device = NIUSB_SUBDEV_UNKNOWN2;
-	writes[i].address = 0x2; // could this be a timeout ?
-	writes[i].value = 0xfd;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = 0xf; // undocumented address
-	writes[i].value = 0x11;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUX_PON;
-	i++;
-	writes[i].device = NIUSB_SUBDEV_TNT4882;
-	writes[i].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[i].value = AUX_CPPF;
-	i++;
-	/* sanity check against NUM_INIT_WRITES; 0 tells the caller the sequence is unusable */
-	if (i > NUM_INIT_WRITES) {
-		dev_err(&usb_dev->dev, "bug!, buffer overrun, i=%i\n", i);
-		return 0;
-	}
-	return i;
-}
-
-/*
- * Initialise the adapter: build the register sequence with
- * ni_usb_setup_init(), write it in one ni_usb_write_registers() call and
- * merge the returned ibsta into the board status.
- *
- * Returns 0 on success, -ENOMEM on allocation failure, -EFAULT when
- * ni_usb_setup_init() reports an unusable sequence (returns 0), or the
- * non-zero value returned by ni_usb_write_registers().
- */
-static int ni_usb_init(struct gpib_board *board)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	struct ni_usb_register *writes;
-	unsigned int ibsta;
-	int writes_len;
-
-	writes = kmalloc_array(NUM_INIT_WRITES, sizeof(*writes), GFP_KERNEL);
-	if (!writes)
-		return -ENOMEM;
-
-	writes_len = ni_usb_setup_init(board, writes);
-	if (!writes_len) {
-		/* this path used to return without freeing the array */
-		kfree(writes);
-		return -EFAULT;
-	}
-	retval = ni_usb_write_registers(ni_priv, writes, writes_len, &ibsta);
-	kfree(writes);
-	if (retval) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
-		return retval;
-	}
-	ni_usb_soft_update_status(board, ibsta, 0);
-	return 0;
-}
-
-/*
- * Completion handler for the interrupt URB.
- *
- * If the URB was unlinked (-ECONNRESET, -ENOENT, -ESHUTDOWN) nothing is done
- * and it is not resubmitted.  On success the transfer buffer is parsed as a
- * status block, the reported ibsta bits are removed from
- * monitored_ibsta_bits under board->spinlock and waiters on board->wait are
- * woken.  On success and on any other error the URB is resubmitted with
- * GFP_ATOMIC.
- */
-static void ni_usb_interrupt_complete(struct urb *urb)
-{
-	struct gpib_board *board = urb->context;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	struct ni_usb_status_block block;
-	unsigned long irq_flags;
-	int urb_status = urb->status;
-
-	/* unlinked, don't resubmit */
-	if (urb_status == -ECONNRESET || urb_status == -ENOENT ||
-	    urb_status == -ESHUTDOWN)
-		return;
-
-	if (urb_status == 0) {
-		ni_usb_parse_status_block(urb->transfer_buffer, &block);
-
-		spin_lock_irqsave(&board->spinlock, irq_flags);
-		ni_priv->monitored_ibsta_bits &= ~block.ibsta;
-		spin_unlock_irqrestore(&board->spinlock, irq_flags);
-
-		wake_up_interruptible(&board->wait);
-	}
-
-	/* success and "other error" both end by resubmitting the urb */
-	if (usb_submit_urb(ni_priv->interrupt_urb, GFP_ATOMIC))
-		dev_err(&usb_dev->dev, "failed to resubmit interrupt urb\n");
-}
-
-/*
- * Select which ibsta bits the adapter should monitor.
- *
- * @monitored_bits is masked with ni_usb_ibsta_monitor_mask; the result is
- * stored in monitored_ibsta_bits under board->spinlock and sent to the
- * adapter as the index of an NI_USB_WAIT_REQUEST control transfer (value
- * 0x300).  The 8-byte reply is parsed as a status block but not used.
- *
- * Returns 0 on success, -ENOMEM on allocation failure, the negative error
- * from the control transfer, or -EIO when the reply has an unexpected
- * length.
- */
-static int ni_usb_set_interrupt_monitor(struct gpib_board *board, unsigned int monitored_bits)
-{
-	int retval;
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	static const int buffer_length = 8;
-	const unsigned int mask = ni_usb_ibsta_monitor_mask & monitored_bits;
-	u8 *buffer;
-	struct ni_usb_status_block status;
-	unsigned long flags;
-
-	buffer = kmalloc(buffer_length, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	ni_priv->monitored_ibsta_bits = mask;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_WAIT_REQUEST, USB_DIR_IN |
-					    USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-					    0x300, mask,
-					    buffer, buffer_length, 1000);
-	if (retval != buffer_length) {
-		dev_err(&usb_dev->dev, "usb_control_msg returned %i\n", retval);
-		kfree(buffer);
-		/* report a real errno instead of a bare -1 */
-		return retval < 0 ? retval : -EIO;
-	}
-	ni_usb_parse_status_block(buffer, &status);
-	kfree(buffer);
-	return 0;
-}
-
-/*
- * Allocate, fill and submit the interrupt URB for @board.
- *
- * Nothing is done (and 0 is returned) when interrupt_in_endpoint is
- * negative.  Allocation and submission happen under
- * interrupt_transfer_lock.
- *
- * Returns 0 on success, -ENODEV without a USB interface, -ENOMEM when the
- * URB cannot be allocated, or the error returned by usb_submit_urb().
- */
-static int ni_usb_setup_urbs(struct gpib_board *board)
-{
-	struct ni_usb_priv *ni_priv = board->private_data;
-	struct usb_device *usb_dev;
-	int pipe;
-	int rc;
-
-	if (ni_priv->interrupt_in_endpoint < 0)
-		return 0;
-
-	mutex_lock(&ni_priv->interrupt_transfer_lock);
-
-	if (!ni_priv->bus_interface) {
-		rc = -ENODEV;
-		goto unlock;
-	}
-
-	ni_priv->interrupt_urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!ni_priv->interrupt_urb) {
-		rc = -ENOMEM;
-		goto unlock;
-	}
-
-	usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	pipe = usb_rcvintpipe(usb_dev, ni_priv->interrupt_in_endpoint);
-	usb_fill_int_urb(ni_priv->interrupt_urb, usb_dev, pipe,
-			 ni_priv->interrupt_buffer, sizeof(ni_priv->interrupt_buffer),
-			 &ni_usb_interrupt_complete, board, 1);
-	rc = usb_submit_urb(ni_priv->interrupt_urb, GFP_KERNEL);
-	mutex_unlock(&ni_priv->interrupt_transfer_lock);
-
-	/* logged after the lock has been dropped */
-	if (rc)
-		dev_err(&usb_dev->dev, "failed to submit first interrupt urb, retval=%i\n", rc);
-	return rc;
-
-unlock:
-	mutex_unlock(&ni_priv->interrupt_transfer_lock);
-	return rc;
-}
-
-/*
- * Kill the interrupt and bulk URBs, if they exist.  Does nothing when
- * @ni_priv is NULL or no USB interface is attached.
- */
-static void ni_usb_cleanup_urbs(struct ni_usb_priv *ni_priv)
-{
-	if (!ni_priv || !ni_priv->bus_interface)
-		return;
-
-	if (ni_priv->interrupt_urb)
-		usb_kill_urb(ni_priv->interrupt_urb);
-	if (ni_priv->bulk_urb)
-		usb_kill_urb(ni_priv->bulk_urb);
-}
-
-/*
- * Read the four serial number registers (SERIAL_NUMBER_1_REG ..
- * SERIAL_NUMBER_4_REG of NIUSB_SUBDEV_UNKNOWN3) with one bulk register-read
- * request, combine the low byte of each result (first register in the least
- * significant byte) and report the value with dev_dbg().
- *
- * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL if the
- * results array is smaller than the number of reads, or the error returned
- * by the bulk transfer helpers.
- */
-static int ni_usb_b_read_serial_number(struct ni_usb_priv *ni_priv)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	int retval;
-	u8 *out_data;
-	u8 *in_data;
-	static const int out_data_length = 0x20;
-	static const int  in_data_length = 0x20;
-	int bytes_written = 0, bytes_read = 0;
-	int i = 0;
-	static const int num_reads = 4;
-	unsigned int results[4];
-	int j;
-	unsigned int serial_number;
-
-	in_data = kmalloc(in_data_length, GFP_KERNEL);
-	if (!in_data)
-		return -ENOMEM;
-
-	/*
-	 * The whole out_data_length buffer is transmitted below, not just the
-	 * i bytes that get filled in, so it must be zeroed to avoid sending
-	 * uninitialised heap contents to the device.
-	 */
-	out_data = kzalloc(out_data_length, GFP_KERNEL);
-	if (!out_data) {
-		kfree(in_data);
-		return -ENOMEM;
-	}
-	i += ni_usb_bulk_register_read_header(&out_data[i], num_reads);
-	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_1_REG);
-	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_2_REG);
-	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_3_REG);
-	i += ni_usb_bulk_register_read(&out_data[i], NIUSB_SUBDEV_UNKNOWN3, SERIAL_NUMBER_4_REG);
-	/* zero-pad the request to a multiple of four bytes */
-	while (i % 4)
-		out_data[i++] = 0x0;
-	i += ni_usb_bulk_termination(&out_data[i]);
-	retval = ni_usb_send_bulk_msg(ni_priv, out_data, out_data_length, &bytes_written, 1000);
-	if (retval) {
-		dev_err(&usb_dev->dev, "send_bulk_msg returned %i, bytes_written=%i, i=%li\n",
-			retval, bytes_written, (long)out_data_length);
-		goto serial_out;
-	}
-	retval = ni_usb_receive_bulk_msg(ni_priv, in_data, in_data_length, &bytes_read, 1000, 0);
-	if (retval) {
-		dev_err(&usb_dev->dev, "receive_bulk_msg returned %i, bytes_read=%i\n",
-			retval, bytes_read);
-		ni_usb_dump_raw_block(in_data, bytes_read);
-		goto serial_out;
-	}
-	if (ARRAY_SIZE(results) < num_reads) {
-		dev_err(&usb_dev->dev, "serial number setup bug\n");
-		retval = -EINVAL;
-		goto serial_out;
-	}
-	ni_usb_parse_register_read_block(in_data, results, num_reads);
-	serial_number = 0;
-	for (j = 0; j < num_reads; ++j)
-		serial_number |= (results[j] & 0xff) << (8 * j);
-	dev_dbg(&usb_dev->dev, "board serial number is 0x%x\n", serial_number);
-	retval = 0;
-serial_out:
-	kfree(in_data);
-	kfree(out_data);
-	return retval;
-}
-
-static int ni_usb_hs_wait_for_ready(struct ni_usb_priv *ni_priv)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	static const int buffer_size = 0x10;
-	static const int timeout = 50;
-	static const int msec_sleep_duration = 100;
-	int i;	int retval;
-	int j;
-	int unexpected = 0;
-	unsigned int serial_number;
-	u8 *buffer;
-
-	buffer = kmalloc(buffer_size, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	retval = ni_usb_receive_control_msg(ni_priv, NI_USB_SERIAL_NUMBER_REQUEST,
-					    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-					    0x0, 0x0, buffer, buffer_size, 1000);
-	if (retval < 0) {
-		dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
-			NI_USB_SERIAL_NUMBER_REQUEST, retval);
-		goto ready_out;
-	}
-	j = 0;
-	if (buffer[j] != NI_USB_SERIAL_NUMBER_REQUEST) {
-		dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
-			j, (int)buffer[j], NI_USB_SERIAL_NUMBER_REQUEST);
-		unexpected = 1;
-	}
-	if (unexpected)
-		ni_usb_dump_raw_block(buffer, retval);
-	// NI-USB-HS+ pads the serial with 0x0 to make 16 bytes
-	if (retval != 5 && retval != 16) {
-		dev_err(&usb_dev->dev, "received unexpected number of bytes = %i, expected 5 or 16\n",
-			retval);
-		ni_usb_dump_raw_block(buffer, retval);
-	}
-	serial_number = 0;
-	serial_number |= buffer[++j];
-	serial_number |= (buffer[++j] << 8);
-	serial_number |= (buffer[++j] << 16);
-	serial_number |= (buffer[++j] << 24);
-	dev_dbg(&usb_dev->dev, "board serial number is 0x%x\n", serial_number);
-	for (i = 0; i < timeout; ++i) {
-		int ready = 0;
-
-		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_POLL_READY_REQUEST,
-						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-						    0x0, 0x0, buffer, buffer_size, 100);
-		if (retval < 0) {
-			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
-				NI_USB_POLL_READY_REQUEST, retval);
-			goto ready_out;
-		}
-		j = 0;
-		unexpected = 0;
-		if (buffer[j] != NI_USB_POLL_READY_REQUEST) { // [0]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
-				j, (int)buffer[j], NI_USB_POLL_READY_REQUEST);
-			unexpected = 1;
-		}
-		++j;
-		if (buffer[j] != 0x1 && buffer[j] != 0x0) { // [1] HS+ sends 0x0
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x1 or 0x0\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		if (buffer[++j] != 0x0) { // [2]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x%x\n",
-				j, (int)buffer[j], 0x0);
-			unexpected = 1;
-		}
-		++j;
-		/*
-		 * MC usb-488 (and sometimes NI-USB-HS?) sends 0x8 here; MC usb-488A sends 0x7 here
-		 * NI-USB-HS+ sends 0x0
-		 */
-		if (buffer[j] != 0x1 && buffer[j] != 0x8 && buffer[j] != 0x7 && buffer[j] != 0x0) {
-			// [3]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0, 0x1, 0x7 or 0x8\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		++j;
-		// NI-USB-HS+ sends 0 here
-		if (buffer[j] != 0x30 && buffer[j] != 0x0) { // [4]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0 or 0x30\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		++j;
-		// MC usb-488 (and sometimes NI-USB-HS?) and NI-USB-HS+ sends 0x0 here
-		if (buffer[j] != 0x1 && buffer[j] != 0x0) { // [5]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x1 or 0x0\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		if (buffer[++j] != 0x0) { // [6]
-			ready = 1;
-			// NI-USB-HS+ sends 0xf or 0x19 here
-			if (buffer[j] != 0x2 && buffer[j] != 0xe && buffer[j] != 0xf &&
-			    buffer[j] != 0x16 && buffer[j] != 0x19) {
-				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x2, 0xe, 0xf, 0x16 or 0x19\n",
-					j, (int)buffer[j]);
-				unexpected = 1;
-			}
-		}
-		if (buffer[++j] != 0x0) { // [7]
-			ready = 1;
-			// MC usb-488 sends 0x5 here; MC usb-488A sends 0x6 here
-			if (buffer[j] != 0x3 && buffer[j] != 0x5 && buffer[j] != 0x6 &&
-			    buffer[j] != 0x8)	{
-				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x3 or 0x5, 0x6 or 0x08\n",
-					j, (int)buffer[j]);
-				unexpected = 1;
-			}
-		}
-		++j;
-		if (buffer[j] != 0x0 && buffer[j] != 0x2) { // [8] MC usb-488 sends 0x2 here
-			dev_err(&usb_dev->dev, " unexpected data: buffer[%i]=0x%x, expected 0x0 or 0x2\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		++j;
-		// MC usb-488A and NI-USB-HS sends 0x3 here; NI-USB-HS+ sends 0x30 here
-		if (buffer[j] != 0x0 && buffer[j] != 0x3 && buffer[j] != 0x30) { // [9]
-			dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x0, 0x3 or 0x30\n",
-				j, (int)buffer[j]);
-			unexpected = 1;
-		}
-		if (buffer[++j] != 0x0) { // [10] MC usb-488 sends 0x7 here, new HS+ sends 0x59
-			ready = 1;
-			if (buffer[j] != 0x96 && buffer[j] != 0x7 && buffer[j] != 0x6e &&
-			    buffer[j] != 0x59) {
-				dev_err(&usb_dev->dev, "unexpected data: buffer[%i]=0x%x, expected 0x96, 0x07, 0x6e or 0x59\n",
-					j, (int)buffer[j]);
-				unexpected = 1;
-			}
-		}
-		if (unexpected)
-			ni_usb_dump_raw_block(buffer, retval);
-		if (ready)
-			break;
-		retval = msleep_interruptible(msec_sleep_duration);
-		if (retval) {
-			retval = -ERESTARTSYS;
-			goto ready_out;
-		}
-	}
-	retval = 0;
-
-ready_out:
-	kfree(buffer);
-	dev_dbg(&usb_dev->dev, "exit retval=%d\n", retval);
-	return retval;
-}
-
-/*
- * This does some extra init for HS+ models, as observed on Windows.  One of the
- * control requests causes the LED to stop blinking.
- * I'm not sure what the other 2 requests do.  None of these requests are actually required
- * for the adapter to work, maybe they do some init for the analyzer interface
- * (which we don't use).
- */
-static int ni_usb_hs_plus_extra_init(struct ni_usb_priv *ni_priv)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	int retval;
-	u8 *buffer;
-	static const int buffer_size = 16;
-	int transfer_size;
-
-	buffer = kmalloc(buffer_size, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-	do {
-		transfer_size = 16;
-
-		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_0x48_REQUEST,
-						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-						    0x0, 0x0, buffer, transfer_size, 1000);
-		if (retval < 0) {
-			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
-				NI_USB_HS_PLUS_0x48_REQUEST, retval);
-			break;
-		}
-		// expected response data: 48 f3 30 00 00 00 00 00 00 00 00 00 00 00 00 00
-		if (buffer[0] != NI_USB_HS_PLUS_0x48_REQUEST)
-			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
-				(int)buffer[0], NI_USB_HS_PLUS_0x48_REQUEST);
-
-		transfer_size = 2;
-
-		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_LED_REQUEST,
-						    USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
-						    0x1, 0x0, buffer, transfer_size, 1000);
-		if (retval < 0) {
-			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
-				NI_USB_HS_PLUS_LED_REQUEST, retval);
-			break;
-		}
-		// expected response data: 4b 00
-		if (buffer[0] != NI_USB_HS_PLUS_LED_REQUEST)
-			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
-				(int)buffer[0], NI_USB_HS_PLUS_LED_REQUEST);
-
-		transfer_size = 9;
-
-		retval = ni_usb_receive_control_msg(ni_priv, NI_USB_HS_PLUS_0xf8_REQUEST,
-						    USB_DIR_IN | USB_TYPE_VENDOR |
-						    USB_RECIP_INTERFACE,
-						    0x0, 0x1, buffer, transfer_size, 1000);
-		if (retval < 0) {
-			dev_err(&usb_dev->dev, "usb_control_msg request 0x%x returned %i\n",
-				NI_USB_HS_PLUS_0xf8_REQUEST, retval);
-			break;
-		}
-		// expected response data: f8 01 00 00 00 01 00 00 00
-		if (buffer[0] != NI_USB_HS_PLUS_0xf8_REQUEST)
-			dev_err(&usb_dev->dev, "unexpected data: buffer[0]=0x%x, expected 0x%x\n",
-				(int)buffer[0], NI_USB_HS_PLUS_0xf8_REQUEST);
-	} while (0);
-
-	// cleanup
-	kfree(buffer);
-	return retval;
-}
-
-static inline int ni_usb_device_match(struct usb_interface *interface,
-				      const struct gpib_board_config *config)
-{
-	if (gpib_match_device_path(&interface->dev, config->device_path) == 0)
-		return 0;
-	return 1;
-}
-
-static int ni_usb_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	int retval;
-	int i, index;
-	struct ni_usb_priv *ni_priv;
-	int product_id;
-	struct usb_device *usb_dev;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-	retval = ni_usb_allocate_private(board);
-	if (retval < 0)		{
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return retval;
-	}
-	ni_priv = board->private_data;
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
-		if (ni_usb_driver_interfaces[i] &&
-		    !usb_get_intfdata(ni_usb_driver_interfaces[i]) &&
-		    ni_usb_device_match(ni_usb_driver_interfaces[i], config)) {
-			ni_priv->bus_interface = ni_usb_driver_interfaces[i];
-			usb_set_intfdata(ni_usb_driver_interfaces[i], board);
-			usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-			index = i;
-			break;
-		}
-	}
-	if (i == MAX_NUM_NI_USB_INTERFACES) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		dev_err(board->gpib_dev, "No supported adapters found, have you loaded its firmware?\n");
-		return -ENODEV;
-	}
-	if (usb_reset_configuration(interface_to_usbdev(ni_priv->bus_interface)))
-		dev_err(&usb_dev->dev, "usb_reset_configuration() failed.\n");
-
-	product_id = le16_to_cpu(usb_dev->descriptor.idProduct);
-	ni_priv->product_id = product_id;
-
-	timer_setup(&ni_priv->bulk_timer, ni_usb_timeout_handler, 0);
-
-	switch (product_id) {
-	case USB_DEVICE_ID_NI_USB_B:
-		ni_priv->bulk_out_endpoint = NIUSB_B_BULK_OUT_ENDPOINT;
-		ni_priv->bulk_in_endpoint = NIUSB_B_BULK_IN_ENDPOINT;
-		ni_priv->interrupt_in_endpoint = NIUSB_B_INTERRUPT_IN_ENDPOINT;
-		ni_usb_b_read_serial_number(ni_priv);
-		break;
-	case USB_DEVICE_ID_NI_USB_HS:
-	case USB_DEVICE_ID_MC_USB_488:
-	case USB_DEVICE_ID_KUSB_488A:
-		ni_priv->bulk_out_endpoint = NIUSB_HS_BULK_OUT_ENDPOINT;
-		ni_priv->bulk_in_endpoint = NIUSB_HS_BULK_IN_ENDPOINT;
-		ni_priv->interrupt_in_endpoint = NIUSB_HS_INTERRUPT_IN_ENDPOINT;
-		retval = ni_usb_hs_wait_for_ready(ni_priv);
-		if (retval < 0) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		break;
-	case USB_DEVICE_ID_NI_USB_HS_PLUS:
-		ni_priv->bulk_out_endpoint = NIUSB_HS_PLUS_BULK_OUT_ENDPOINT;
-		ni_priv->bulk_in_endpoint = NIUSB_HS_PLUS_BULK_IN_ENDPOINT;
-		ni_priv->interrupt_in_endpoint = NIUSB_HS_PLUS_INTERRUPT_IN_ENDPOINT;
-		retval = ni_usb_hs_wait_for_ready(ni_priv);
-		if (retval < 0) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		retval = ni_usb_hs_plus_extra_init(ni_priv);
-		if (retval < 0) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		break;
-	default:
-		mutex_unlock(&ni_usb_hotplug_lock);
-		dev_err(&usb_dev->dev, "\tDriver bug: unknown endpoints for usb device id %x\n",
-			product_id);
-		return -EINVAL;
-	}
-
-	retval = ni_usb_setup_urbs(board);
-	if (retval < 0) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return retval;
-	}
-	retval = ni_usb_set_interrupt_monitor(board, 0);
-	if (retval < 0) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return retval;
-	}
-
-	board->t1_nano_sec = 500;
-
-	retval = ni_usb_init(board);
-	if (retval < 0) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return retval;
-	}
-	retval = ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
-	if (retval < 0)		{
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return retval;
-	}
-
-	mutex_unlock(&ni_usb_hotplug_lock);
-	dev_info(&usb_dev->dev,
-		 "bus %d dev num %d attached to gpib%d, intf %i\n",
-		 usb_dev->bus->busnum, usb_dev->devnum, board->minor, index);
-	return retval;
-}
-
-static int ni_usb_shutdown_hardware(struct ni_usb_priv *ni_priv)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(ni_priv->bus_interface);
-	int retval;
-	struct ni_usb_register writes[2];
-	static const int writes_length = ARRAY_SIZE(writes);
-	unsigned int ibsta;
-
-	writes[0].device = NIUSB_SUBDEV_TNT4882;
-	writes[0].address = nec7210_to_tnt4882_offset(AUXMR);
-	writes[0].value = AUX_CR;
-	writes[1].device = NIUSB_SUBDEV_UNKNOWN3;
-	writes[1].address = 0x10;
-	writes[1].value = 0x0;
-	retval = ni_usb_write_registers(ni_priv, writes, writes_length, &ibsta);
-	if (retval) {
-		dev_err(&usb_dev->dev, "register write failed, retval=%i\n", retval);
-		return retval;
-	}
-	return 0;
-}
-
-static void ni_usb_detach(struct gpib_board *board)
-{
-	struct ni_usb_priv *ni_priv;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-	/*
-	 * under windows, software unplug does chip_reset nec7210 aux command,
-	 * then writes 0x0 to address 0x10 of device 3
-	 */
-	ni_priv = board->private_data;
-	if (ni_priv) {
-		if (ni_priv->bus_interface) {
-			ni_usb_set_interrupt_monitor(board, 0);
-			ni_usb_shutdown_hardware(ni_priv);
-			usb_set_intfdata(ni_priv->bus_interface, NULL);
-		}
-		mutex_lock(&ni_priv->bulk_transfer_lock);
-		mutex_lock(&ni_priv->control_transfer_lock);
-		mutex_lock(&ni_priv->interrupt_transfer_lock);
-		ni_usb_cleanup_urbs(ni_priv);
-		ni_usb_free_private(ni_priv);
-	}
-	mutex_unlock(&ni_usb_hotplug_lock);
-}
-
-static struct gpib_interface ni_usb_gpib_interface = {
-	.name = "ni_usb_b",
-	.attach = ni_usb_attach,
-	.detach = ni_usb_detach,
-	.read = ni_usb_read,
-	.write = ni_usb_write,
-	.command = ni_usb_command,
-	.take_control = ni_usb_take_control,
-	.go_to_standby = ni_usb_go_to_standby,
-	.request_system_control = ni_usb_request_system_control,
-	.interface_clear = ni_usb_interface_clear,
-	.remote_enable = ni_usb_remote_enable,
-	.enable_eos = ni_usb_enable_eos,
-	.disable_eos = ni_usb_disable_eos,
-	.parallel_poll = ni_usb_parallel_poll,
-	.parallel_poll_configure = ni_usb_parallel_poll_configure,
-	.parallel_poll_response = ni_usb_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = ni_usb_line_status,
-	.update_status = ni_usb_update_status,
-	.primary_address = ni_usb_primary_address,
-	.secondary_address = ni_usb_secondary_address,
-	.serial_poll_response = ni_usb_serial_poll_response,
-	.serial_poll_status = ni_usb_serial_poll_status,
-	.t1_delay = ni_usb_t1_delay,
-	.return_to_local = ni_usb_return_to_local,
-	.skip_check_for_command_acceptors = 1
-};
-
-// Table with the USB-devices: just now only testing IDs
-static struct usb_device_id ni_usb_driver_device_table[] = {
-	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_B)},
-	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_HS)},
-	// gpib-usb-hs+ has a second interface for the analyzer, which we ignore
-	{USB_DEVICE_INTERFACE_NUMBER(USB_VENDOR_ID_NI, USB_DEVICE_ID_NI_USB_HS_PLUS, 0)},
-	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_KUSB_488A)},
-	{USB_DEVICE(USB_VENDOR_ID_NI, USB_DEVICE_ID_MC_USB_488)},
-	{} /* Terminating entry */
-};
-MODULE_DEVICE_TABLE(usb, ni_usb_driver_device_table);
-
-static int ni_usb_driver_probe(struct usb_interface *interface,	const struct usb_device_id *id)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-	int i;
-	char *path;
-	static const int path_length = 1024;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-	usb_get_dev(usb_dev);
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++) {
-		if (!ni_usb_driver_interfaces[i]) {
-			ni_usb_driver_interfaces[i] = interface;
-			usb_set_intfdata(interface, NULL);
-			break;
-		}
-	}
-	if (i == MAX_NUM_NI_USB_INTERFACES) {
-		usb_put_dev(usb_dev);
-		mutex_unlock(&ni_usb_hotplug_lock);
-		dev_err(&usb_dev->dev, "ni_usb_driver_interfaces[] full\n");
-		return -1;
-	}
-	path = kmalloc(path_length, GFP_KERNEL);
-	if (!path) {
-		usb_put_dev(usb_dev);
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return -ENOMEM;
-	}
-	usb_make_path(usb_dev, path, path_length);
-	dev_info(&usb_dev->dev, "probe succeeded for path: %s\n", path);
-	kfree(path);
-	mutex_unlock(&ni_usb_hotplug_lock);
-	return 0;
-}
-
-static void ni_usb_driver_disconnect(struct usb_interface *interface)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-	int i;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
-		if (ni_usb_driver_interfaces[i] == interface)	{
-			struct gpib_board *board = usb_get_intfdata(interface);
-
-			if (board) {
-				struct ni_usb_priv *ni_priv = board->private_data;
-
-				if (ni_priv) {
-					mutex_lock(&ni_priv->bulk_transfer_lock);
-					mutex_lock(&ni_priv->control_transfer_lock);
-					mutex_lock(&ni_priv->interrupt_transfer_lock);
-					ni_usb_cleanup_urbs(ni_priv);
-					ni_priv->bus_interface = NULL;
-					mutex_unlock(&ni_priv->interrupt_transfer_lock);
-					mutex_unlock(&ni_priv->control_transfer_lock);
-					mutex_unlock(&ni_priv->bulk_transfer_lock);
-				}
-			}
-			ni_usb_driver_interfaces[i] = NULL;
-			break;
-		}
-	}
-	if (i == MAX_NUM_NI_USB_INTERFACES)
-		dev_err(&usb_dev->dev, "unable to find interface  bug?\n");
-	usb_put_dev(usb_dev);
-	mutex_unlock(&ni_usb_hotplug_lock);
-}
-
-static int ni_usb_driver_suspend(struct usb_interface *interface, pm_message_t message)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-	struct gpib_board *board;
-	int i, retval;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
-		if (ni_usb_driver_interfaces[i] == interface) {
-			board = usb_get_intfdata(interface);
-			if (board)
-				break;
-		}
-	}
-	if (i == MAX_NUM_NI_USB_INTERFACES) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return 0;
-	}
-
-	struct ni_usb_priv *ni_priv = board->private_data;
-
-	if (ni_priv) {
-		ni_usb_set_interrupt_monitor(board, 0);
-		retval = ni_usb_shutdown_hardware(ni_priv);
-		if (retval) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		if (ni_priv->interrupt_urb) {
-			mutex_lock(&ni_priv->interrupt_transfer_lock);
-			ni_usb_cleanup_urbs(ni_priv);
-			mutex_unlock(&ni_priv->interrupt_transfer_lock);
-		}
-		dev_dbg(&usb_dev->dev,
-			"bus %d dev num %d gpib%d, interface %i suspended\n",
-			usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
-	}
-
-	mutex_unlock(&ni_usb_hotplug_lock);
-	return 0;
-}
-
-static int ni_usb_driver_resume(struct usb_interface *interface)
-{
-	struct usb_device *usb_dev = interface_to_usbdev(interface);
-
-	struct gpib_board *board;
-	int i, retval;
-
-	mutex_lock(&ni_usb_hotplug_lock);
-
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)	{
-		if (ni_usb_driver_interfaces[i] == interface) {
-			board = usb_get_intfdata(interface);
-			if (board)
-				break;
-		}
-	}
-	if (i == MAX_NUM_NI_USB_INTERFACES) {
-		mutex_unlock(&ni_usb_hotplug_lock);
-		return 0;
-	}
-
-	struct ni_usb_priv *ni_priv = board->private_data;
-
-	if (ni_priv) {
-		if (ni_priv->interrupt_urb) {
-			mutex_lock(&ni_priv->interrupt_transfer_lock);
-			retval = usb_submit_urb(ni_priv->interrupt_urb, GFP_KERNEL);
-			if (retval) {
-				dev_err(&usb_dev->dev, "resume failed to resubmit interrupt urb, retval=%i\n",
-					retval);
-				mutex_unlock(&ni_priv->interrupt_transfer_lock);
-				mutex_unlock(&ni_usb_hotplug_lock);
-				return retval;
-			}
-			mutex_unlock(&ni_priv->interrupt_transfer_lock);
-		} else {
-			dev_err(&usb_dev->dev, "bug! resume int urb not set up\n");
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return -EINVAL;
-		}
-
-		switch (ni_priv->product_id) {
-		case USB_DEVICE_ID_NI_USB_B:
-			ni_usb_b_read_serial_number(ni_priv);
-			break;
-		case USB_DEVICE_ID_NI_USB_HS:
-		case USB_DEVICE_ID_MC_USB_488:
-		case USB_DEVICE_ID_KUSB_488A:
-			retval = ni_usb_hs_wait_for_ready(ni_priv);
-			if (retval < 0) {
-				mutex_unlock(&ni_usb_hotplug_lock);
-				return retval;
-			}
-			break;
-		case USB_DEVICE_ID_NI_USB_HS_PLUS:
-			retval = ni_usb_hs_wait_for_ready(ni_priv);
-			if (retval < 0) {
-				mutex_unlock(&ni_usb_hotplug_lock);
-				return retval;
-			}
-			retval = ni_usb_hs_plus_extra_init(ni_priv);
-			if (retval < 0) {
-				mutex_unlock(&ni_usb_hotplug_lock);
-				return retval;
-			}
-			break;
-		default:
-			mutex_unlock(&ni_usb_hotplug_lock);
-			dev_err(&usb_dev->dev, "\tDriver bug: unknown endpoints for usb device id\n");
-			return -EINVAL;
-		}
-
-		retval = ni_usb_set_interrupt_monitor(board, 0);
-		if (retval < 0) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-
-		retval = ni_usb_init(board);
-		if (retval < 0) {
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		retval = ni_usb_set_interrupt_monitor(board, ni_usb_ibsta_monitor_mask);
-		if (retval < 0)		{
-			mutex_unlock(&ni_usb_hotplug_lock);
-			return retval;
-		}
-		if (board->master)
-			ni_usb_interface_clear(board, 1); // this is a pulsed action
-		if (ni_priv->ren_state)
-			ni_usb_remote_enable(board, 1);
-
-		dev_dbg(&usb_dev->dev,
-			"bus %d dev num %d gpib%d, interface %i resumed\n",
-			usb_dev->bus->busnum, usb_dev->devnum, board->minor, i);
-	}
-
-	mutex_unlock(&ni_usb_hotplug_lock);
-	return 0;
-}
-
-static struct usb_driver ni_usb_bus_driver = {
-	.name = DRV_NAME,
-	.probe = ni_usb_driver_probe,
-	.disconnect = ni_usb_driver_disconnect,
-	.suspend = ni_usb_driver_suspend,
-	.resume = ni_usb_driver_resume,
-	.id_table = ni_usb_driver_device_table,
-};
-
-static int __init ni_usb_init_module(void)
-{
-	int i;
-	int ret;
-
-	for (i = 0; i < MAX_NUM_NI_USB_INTERFACES; i++)
-		ni_usb_driver_interfaces[i] = NULL;
-
-	ret = usb_register(&ni_usb_bus_driver);
-	if (ret) {
-		pr_err("usb_register failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&ni_usb_gpib_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void __exit ni_usb_exit_module(void)
-{
-	gpib_unregister_driver(&ni_usb_gpib_interface);
-	usb_deregister(&ni_usb_bus_driver);
-}
-
-module_init(ni_usb_init_module);
-module_exit(ni_usb_exit_module);
diff --git a/drivers/staging/gpib/ni_usb/ni_usb_gpib.h b/drivers/staging/gpib/ni_usb/ni_usb_gpib.h
deleted file mode 100644
index 688f5e08792f..000000000000
--- a/drivers/staging/gpib/ni_usb/ni_usb_gpib.h
+++ /dev/null
@@ -1,226 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/***************************************************************************
- *   copyright            : (C) 2004 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _NI_USB_GPIB_H
-#define _NI_USB_GPIB_H
-
-#include <linux/mutex.h>
-#include <linux/semaphore.h>
-#include <linux/usb.h>
-#include <linux/timer.h>
-#include "gpibP.h"
-
-enum {
-	USB_VENDOR_ID_NI = 0x3923
-};
-
-enum {
-	USB_DEVICE_ID_NI_USB_B = 0x702a,
-	USB_DEVICE_ID_NI_USB_B_PREINIT = 0x702b,	// device id before firmware is loaded
-	USB_DEVICE_ID_NI_USB_HS = 0x709b,
-	USB_DEVICE_ID_NI_USB_HS_PLUS = 0x7618,
-	USB_DEVICE_ID_KUSB_488A = 0x725c,
-	USB_DEVICE_ID_MC_USB_488 = 0x725d
-};
-
-enum ni_usb_device {
-	NIUSB_SUBDEV_TNT4882 = 1,
-	NIUSB_SUBDEV_UNKNOWN2 = 2,
-	NIUSB_SUBDEV_UNKNOWN3 = 3,
-};
-
-enum endpoint_addresses {
-	NIUSB_B_BULK_OUT_ENDPOINT = 0x2,
-	NIUSB_B_BULK_IN_ENDPOINT = 0x2,
-	NIUSB_B_BULK_IN_ALT_ENDPOINT = 0x6,
-	NIUSB_B_INTERRUPT_IN_ENDPOINT = 0x4,
-};
-
-enum hs_enpoint_addresses {
-	NIUSB_HS_BULK_OUT_ENDPOINT = 0x2,
-	NIUSB_HS_BULK_OUT_ALT_ENDPOINT = 0x6,
-	NIUSB_HS_BULK_IN_ENDPOINT = 0x4,
-	NIUSB_HS_BULK_IN_ALT_ENDPOINT = 0x8,
-	NIUSB_HS_INTERRUPT_IN_ENDPOINT = 0x1,
-};
-
-enum hs_plus_endpoint_addresses {
-	NIUSB_HS_PLUS_BULK_OUT_ENDPOINT = 0x1,
-	NIUSB_HS_PLUS_BULK_OUT_ALT_ENDPOINT = 0x4,
-	NIUSB_HS_PLUS_BULK_IN_ENDPOINT = 0x2,
-	NIUSB_HS_PLUS_BULK_IN_ALT_ENDPOINT = 0x5,
-	NIUSB_HS_PLUS_INTERRUPT_IN_ENDPOINT = 0x3,
-};
-
-struct ni_usb_urb_ctx {
-	struct completion complete;
-	unsigned timed_out : 1;
-};
-
-// struct which defines private_data for ni_usb devices
-struct ni_usb_priv {
-	struct usb_interface *bus_interface;
-	int bulk_out_endpoint;
-	int bulk_in_endpoint;
-	int interrupt_in_endpoint;
-	u8 eos_char;
-	unsigned short eos_mode;
-	unsigned int monitored_ibsta_bits;
-	struct urb *bulk_urb;
-	struct urb *interrupt_urb;
-	u8 interrupt_buffer[0x11];
-	struct mutex addressed_transfer_lock;	// protect transfer lock
-	struct mutex bulk_transfer_lock;	// protect bulk message sends
-	struct mutex control_transfer_lock;	// protect control messages
-	struct mutex interrupt_transfer_lock;	//  protect interrupt messages
-	struct timer_list bulk_timer;
-	struct ni_usb_urb_ctx context;
-	int product_id;
-	unsigned short ren_state;
-};
-
-struct ni_usb_status_block {
-	short id;
-	unsigned short ibsta;
-	short error_code;
-	unsigned short count;
-};
-
-struct ni_usb_register {
-	enum ni_usb_device device;
-	short address;
-	unsigned short value;
-};
-
-enum ni_usb_bulk_ids {
-	NIUSB_IBCAC_ID = 0x1,
-	NIUSB_UNKNOWN3_ID = 0x3, // device level function id?
-	NIUSB_TERM_ID = 0x4,
-	NIUSB_IBGTS_ID = 0x6,
-	NIUSB_IBRPP_ID = 0x7,
-	NIUSB_REG_READ_ID = 0x8,
-	NIUSB_REG_WRITE_ID = 0x9,
-	NIUSB_IBSIC_ID = 0xf,
-	NIUSB_REGISTER_READ_DATA_START_ID = 0x34,
-	NIUSB_REGISTER_READ_DATA_END_ID = 0x35,
-	NIUSB_IBRD_DATA_ID = 0x36,
-	NIUSB_IBRD_EXTENDED_DATA_ID = 0x37,
-	NIUSB_IBRD_STATUS_ID = 0x38
-};
-
-enum ni_usb_error_codes {
-	NIUSB_NO_ERROR = 0,
-	/*
-	 * NIUSB_ABORTED_ERROR occurs when I/O is interrupted early by
-	 * doing a NI_USB_STOP_REQUEST on the control endpoint.
-	 */
-	NIUSB_ABORTED_ERROR = 1,
-	/*
-	 * NIUSB_READ_ATN_ERROR occurs when you do a board read while
-	 * ATN is set
-	 */
-	NIUSB_ATN_STATE_ERROR = 2,
-	/*
-	 * NIUSB_ADDRESSING_ERROR occurs when you do a board
-	 * read/write as CIC but are not in LACS/TACS
-	 */
-	NIUSB_ADDRESSING_ERROR = 3,
-	/*
-	 * NIUSB_EOSMODE_ERROR occurs on reads if any eos mode or char
-	 * bits are set when REOS is not set.
-	 * Have also seen error 4 if you try to send more than 16
-	 * command bytes at once on a usb-b.
-	 */
-	NIUSB_EOSMODE_ERROR = 4,
-	/*
-	 * NIUSB_NO_BUS_ERROR occurs when you try to write a command
-	 * byte but there are no devices connected to the gpib bus
-	 */
-	NIUSB_NO_BUS_ERROR = 5,
-	/*
-	 * NIUSB_NO_LISTENER_ERROR occurs when you do a board write as
-	 * CIC with no listener
-	 */
-	NIUSB_NO_LISTENER_ERROR = 8,
-	/* get NIUSB_TIMEOUT_ERROR on board read/write timeout */
-	NIUSB_TIMEOUT_ERROR = 10,
-};
-
-enum ni_usb_control_requests {
-	NI_USB_STOP_REQUEST = 0x20,
-	NI_USB_WAIT_REQUEST = 0x21,
-	NI_USB_POLL_READY_REQUEST = 0x40,
-	NI_USB_SERIAL_NUMBER_REQUEST = 0x41,
-	NI_USB_HS_PLUS_0x48_REQUEST = 0x48,
-	NI_USB_HS_PLUS_LED_REQUEST = 0x4b,
-	NI_USB_HS_PLUS_0xf8_REQUEST = 0xf8
-};
-
-static const unsigned int ni_usb_ibsta_monitor_mask =
-	SRQI | LOK | REM | CIC | ATN | TACS | LACS | DTAS | DCAS;
-
-static inline int nec7210_to_tnt4882_offset(int offset)
-{
-	return 2 * offset;
-};
-
-static inline int ni_usb_bulk_termination(u8 *buffer)
-{
-	int i = 0;
-
-	buffer[i++] = NIUSB_TERM_ID;
-	buffer[i++] = 0x0;
-	buffer[i++] = 0x0;
-	buffer[i++] = 0x0;
-	return i;
-}
-
-enum ni_usb_unknown3_register {
-	SERIAL_NUMBER_4_REG = 0x8,
-	SERIAL_NUMBER_3_REG = 0x9,
-	SERIAL_NUMBER_2_REG = 0xa,
-	SERIAL_NUMBER_1_REG = 0xb,
-};
-
-static inline int ni_usb_bulk_register_write_header(u8 *buffer, int num_writes)
-{
-	int i = 0;
-
-	buffer[i++] = NIUSB_REG_WRITE_ID;
-	buffer[i++] = num_writes;
-	buffer[i++] = 0x0;
-	return i;
-}
-
-static inline int ni_usb_bulk_register_write(u8 *buffer, struct ni_usb_register reg)
-{
-	int i = 0;
-
-	buffer[i++] = reg.device;
-	buffer[i++] = reg.address;
-	buffer[i++] = reg.value;
-	return i;
-}
-
-static inline int ni_usb_bulk_register_read_header(u8 *buffer, int num_reads)
-{
-	int i = 0;
-
-	buffer[i++] = NIUSB_REG_READ_ID;
-	buffer[i++] = num_reads;
-	return i;
-}
-
-static inline int ni_usb_bulk_register_read(u8 *buffer, int device, int address)
-{
-	int i = 0;
-
-	buffer[i++] = device;
-	buffer[i++] = address;
-	return i;
-}
-
-#endif	// _NI_USB_GPIB_H
diff --git a/drivers/staging/gpib/pc2/Makefile b/drivers/staging/gpib/pc2/Makefile
deleted file mode 100644
index 481ee4296e1b..000000000000
--- a/drivers/staging/gpib/pc2/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-
-obj-$(CONFIG_GPIB_PC2) += pc2_gpib.o
-
-
-
diff --git a/drivers/staging/gpib/pc2/pc2_gpib.c b/drivers/staging/gpib/pc2/pc2_gpib.c
deleted file mode 100644
index 9f3943d1df66..000000000000
--- a/drivers/staging/gpib/pc2/pc2_gpib.c
+++ /dev/null
@@ -1,684 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *    copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <asm/dma.h>
-#include <linux/dma-mapping.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include "nec7210.h"
-#include "gpibP.h"
-
-// struct which defines private_data for pc2 driver
-struct pc2_priv {
-	struct nec7210_priv nec7210_priv;
-	unsigned int irq;
-	// io address that clears interrupt for pc2a (0x2f0 + irq)
-	unsigned int clear_intr_addr;
-};
-
-// pc2 uses 8 consecutive io addresses
-static const int pc2_iosize = 8;
-static const int pc2a_iosize = 8;
-static const int pc2_2a_iosize = 16;
-
-// offset between io addresses of successive nec7210 registers
-static const int pc2a_reg_offset = 0x400;
-static const int pc2_reg_offset = 1;
-
-// interrupt service routine
-static irqreturn_t pc2_interrupt(int irq, void *arg);
-static irqreturn_t pc2a_interrupt(int irq, void *arg);
-
-// pc2 specific registers and bits
-
-// interrupt clear register address
-static const int pc2a_clear_intr_iobase = 0x2f0;
-static inline unsigned int CLEAR_INTR_REG(unsigned int irq)
-{
-	return pc2a_clear_intr_iobase + irq;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for PC2/PC2a and compatible devices");
-
-/*
- * GPIB interrupt service routines
- */
-
-irqreturn_t pc2_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct pc2_priv *priv = board->private_data;
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = nec7210_interrupt(board, &priv->nec7210_priv);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-irqreturn_t pc2a_interrupt(int irq, void *arg)
-{
-	struct gpib_board *board = arg;
-	struct pc2_priv *priv = board->private_data;
-	int status1, status2;
-	unsigned long flags;
-	irqreturn_t retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	// read interrupt status (also clears status)
-	status1 = read_byte(&priv->nec7210_priv, ISR1);
-	status2 = read_byte(&priv->nec7210_priv, ISR2);
-	/* clear interrupt circuit */
-	if (priv->irq)
-		outb(0xff, CLEAR_INTR_REG(priv->irq));
-	retval = nec7210_interrupt_have_status(board, &priv->nec7210_priv, status1, status2);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return retval;
-}
-
-// wrappers for interface functions
-static int pc2_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-		    size_t *bytes_read)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-}
-
-static int pc2_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-		     size_t *bytes_written)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int pc2_command(struct gpib_board *board, u8 *buffer,
-		       size_t length, size_t *bytes_written)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int pc2_take_control(struct gpib_board *board, int synchronous)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int pc2_go_to_standby(struct gpib_board *board)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int pc2_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_request_system_control(board, &priv->nec7210_priv, request_control);
-}
-
-static void pc2_interface_clear(struct gpib_board *board, int assert)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void pc2_remote_enable(struct gpib_board *board, int enable)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int pc2_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void pc2_disable_eos(struct gpib_board *board)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int pc2_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_update_status(board, &priv->nec7210_priv, clear_mask);
-}
-
-static int pc2_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int pc2_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int pc2_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_parallel_poll(board, &priv->nec7210_priv, result);
-}
-
-static void pc2_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_configure(board, &priv->nec7210_priv, config);
-}
-
-static void pc2_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-static void pc2_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
-}
-
-static u8 pc2_serial_poll_status(struct gpib_board *board)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static int pc2_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	return nec7210_t1_delay(board, &priv->nec7210_priv, nano_sec);
-}
-
-static void pc2_return_to_local(struct gpib_board *board)
-{
-	struct pc2_priv *priv = board->private_data;
-
-	nec7210_return_to_local(board, &priv->nec7210_priv);
-}
-
-static int allocate_private(struct gpib_board *board)
-{
-	struct pc2_priv *priv;
-
-	board->private_data = kmalloc(sizeof(struct pc2_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	priv = board->private_data;
-	memset(priv, 0, sizeof(struct pc2_priv));
-	init_nec7210_private(&priv->nec7210_priv);
-	return 0;
-}
-
-static void free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static int pc2_generic_attach(struct gpib_board *board, const struct gpib_board_config *config,
-			      enum nec7210_chipset chipset)
-{
-	struct pc2_priv *pc2_priv;
-	struct nec7210_priv *nec_priv;
-
-	board->status = 0;
-	if (allocate_private(board))
-		return -ENOMEM;
-	pc2_priv = board->private_data;
-	nec_priv = &pc2_priv->nec7210_priv;
-	nec_priv->read_byte = nec7210_ioport_read_byte;
-	nec_priv->write_byte = nec7210_ioport_write_byte;
-	nec_priv->type = chipset;
-
-#ifndef PC2_DMA
-	/*
-	 * board->dev hasn't been initialized, so forget about DMA until this driver
-	 * is adapted to use isa_register_driver.
-	 */
-	if (config->ibdma)
-	// driver needs to be adapted to use isa_register_driver to get a struct device*
-		dev_err(board->gpib_dev, "DMA disabled for pc2 gpib");
-#else
-	if (config->ibdma) {
-		nec_priv->dma_buffer_length = 0x1000;
-		nec_priv->dma_buffer = dma_alloc_coherent(board->dev,
-							  nec_priv->dma_buffer_length, &
-							  nec_priv->dma_buffer_addr, GFP_ATOMIC);
-		if (!nec_priv->dma_buffer)
-			return -ENOMEM;
-
-		// request isa dma channel
-		if (request_dma(config->ibdma, "pc2")) {
-			dev_err(board->gpib_dev, "can't request DMA %d\n", config->ibdma);
-			return -1;
-		}
-		nec_priv->dma_channel = config->ibdma;
-	}
-#endif
-
-	return 0;
-}
-
-static int pc2_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	int isr_flags = 0;
-	struct pc2_priv *pc2_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-
-	retval = pc2_generic_attach(board, config, NEC7210);
-	if (retval)
-		return retval;
-
-	pc2_priv = board->private_data;
-	nec_priv = &pc2_priv->nec7210_priv;
-	nec_priv->offset = pc2_reg_offset;
-
-	if (!request_region(config->ibbase, pc2_iosize, "pc2")) {
-		dev_err(board->gpib_dev, "ioports are already in use\n");
-		return -EBUSY;
-	}
-	nec_priv->iobase = config->ibbase;
-
-	nec7210_board_reset(nec_priv, board);
-
-	// install interrupt handler
-	if (config->ibirq) {
-		if (request_irq(config->ibirq, pc2_interrupt, isr_flags, "pc2", board))	{
-			dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
-			return -EBUSY;
-		}
-	}
-	pc2_priv->irq = config->ibirq;
-	/* poll so we can detect assertion of ATN */
-	if (gpib_request_pseudo_irq(board, pc2_interrupt)) {
-		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
-		return -1;
-	}
-	/* set internal counter register for 8 MHz input clock */
-	write_byte(nec_priv, ICR | 8, AUXMR);
-
-	nec7210_board_online(nec_priv, board);
-
-	return 0;
-}
-
-static void pc2_detach(struct gpib_board *board)
-{
-	struct pc2_priv *pc2_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (pc2_priv) {
-		nec_priv = &pc2_priv->nec7210_priv;
-#ifdef PC2_DMA
-		if (nec_priv->dma_channel)
-			free_dma(nec_priv->dma_channel);
-#endif
-		gpib_free_pseudo_irq(board);
-		if (pc2_priv->irq)
-			free_irq(pc2_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			release_region(nec_priv->iobase, pc2_iosize);
-		}
-		if (nec_priv->dma_buffer) {
-			dma_free_coherent(board->dev, nec_priv->dma_buffer_length,
-					  nec_priv->dma_buffer, nec_priv->dma_buffer_addr);
-			nec_priv->dma_buffer = NULL;
-		}
-	}
-	free_private(board);
-}
-
-static int pc2a_common_attach(struct gpib_board *board, const struct gpib_board_config *config,
-			      unsigned int num_registers, enum nec7210_chipset chipset)
-{
-	unsigned int i, j;
-	struct pc2_priv *pc2_priv;
-	struct nec7210_priv *nec_priv;
-	int retval;
-
-	retval = pc2_generic_attach(board, config, chipset);
-	if (retval)
-		return retval;
-
-	pc2_priv = board->private_data;
-	nec_priv = &pc2_priv->nec7210_priv;
-	nec_priv->offset = pc2a_reg_offset;
-
-	switch (config->ibbase) {
-	case 0x02e1:
-	case 0x22e1:
-	case 0x42e1:
-	case 0x62e1:
-		break;
-	default:
-		dev_err(board->gpib_dev, "PCIIa base range invalid, must be one of 0x[0246]2e1, but is 0x%x\n",
-			config->ibbase);
-		return -1;
-	}
-
-	if (config->ibirq) {
-		if (config->ibirq < 2 || config->ibirq > 7) {
-			dev_err(board->gpib_dev, "illegal interrupt level %i\n",
-				config->ibirq);
-			return -1;
-		}
-	} else	{
-		dev_err(board->gpib_dev, "interrupt disabled, using polling mode (slow)\n");
-	}
-#ifdef CHECK_IOPORTS
-	unsigned int err = 0;
-
-	for (i = 0; i < num_registers; i++) {
-		if (check_region(config->ibbase + i * pc2a_reg_offset, 1))
-			err++;
-	}
-	if (config->ibirq && check_region(pc2a_clear_intr_iobase + config->ibirq, 1))
-		err++;
-	if (err) {
-		dev_err(board->gpib_dev, "ioports are already in use");
-		return -EBUSY;
-	}
-#endif
-	for (i = 0; i < num_registers; i++) {
-		if (!request_region(config->ibbase +
-					i * pc2a_reg_offset, 1, "pc2a")) {
-			dev_err(board->gpib_dev, "ioports are already in use");
-			for (j = 0; j < i; j++)
-				release_region(config->ibbase +
-					j * pc2a_reg_offset, 1);
-			return -EBUSY;
-		}
-	}
-	nec_priv->iobase = config->ibbase;
-	if (config->ibirq) {
-		if (!request_region(pc2a_clear_intr_iobase + config->ibirq, 1, "pc2a"))  {
-			dev_err(board->gpib_dev, "ioports are already in use");
-			return -1;
-		}
-		pc2_priv->clear_intr_addr = pc2a_clear_intr_iobase + config->ibirq;
-		if (request_irq(config->ibirq, pc2a_interrupt, 0, "pc2a", board)) {
-			dev_err(board->gpib_dev, "can't request IRQ %d\n", config->ibirq);
-			return -EBUSY;
-		}
-	}
-	pc2_priv->irq = config->ibirq;
-	/* poll so we can detect assertion of ATN */
-	if (gpib_request_pseudo_irq(board, pc2_interrupt)) {
-		dev_err(board->gpib_dev, "failed to allocate pseudo_irq\n");
-		return -1;
-	}
-
-	// make sure interrupt is clear
-	if (pc2_priv->irq)
-		outb(0xff, CLEAR_INTR_REG(pc2_priv->irq));
-
-	nec7210_board_reset(nec_priv, board);
-
-	/* set internal counter register for 8 MHz input clock */
-	write_byte(nec_priv, ICR | 8, AUXMR);
-
-	nec7210_board_online(nec_priv, board);
-
-	return 0;
-}
-
-static int pc2a_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return pc2a_common_attach(board, config, pc2a_iosize, NEC7210);
-}
-
-static int pc2a_cb7210_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return pc2a_common_attach(board, config, pc2a_iosize, CB7210);
-}
-
-static int pc2_2a_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return pc2a_common_attach(board, config, pc2_2a_iosize, NAT4882);
-}
-
-static void pc2a_common_detach(struct gpib_board *board, unsigned int num_registers)
-{
-	int i;
-	struct pc2_priv *pc2_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (pc2_priv) {
-		nec_priv = &pc2_priv->nec7210_priv;
-#ifdef PC2_DMA
-		if (nec_priv->dma_channel)
-			free_dma(nec_priv->dma_channel);
-#endif
-		gpib_free_pseudo_irq(board);
-		if (pc2_priv->irq)
-			free_irq(pc2_priv->irq, board);
-		if (nec_priv->iobase) {
-			nec7210_board_reset(nec_priv, board);
-			for (i = 0; i < num_registers; i++)
-				release_region(nec_priv->iobase +
-					       i * pc2a_reg_offset, 1);
-		}
-		if (pc2_priv->clear_intr_addr)
-			release_region(pc2_priv->clear_intr_addr, 1);
-		if (nec_priv->dma_buffer) {
-			dma_free_coherent(board->dev, nec_priv->dma_buffer_length,
-					  nec_priv->dma_buffer,
-					  nec_priv->dma_buffer_addr);
-			nec_priv->dma_buffer = NULL;
-		}
-	}
-	free_private(board);
-}
-
-static void pc2a_detach(struct gpib_board *board)
-{
-	pc2a_common_detach(board, pc2a_iosize);
-}
-
-static void pc2_2a_detach(struct gpib_board *board)
-{
-	pc2a_common_detach(board, pc2_2a_iosize);
-}
-
-static struct gpib_interface pc2_interface = {
-	.name =	"pcII",
-	.attach =	pc2_attach,
-	.detach =	pc2_detach,
-	.read =	pc2_read,
-	.write =	pc2_write,
-	.command =	pc2_command,
-	.take_control =	pc2_take_control,
-	.go_to_standby =	pc2_go_to_standby,
-	.request_system_control =	pc2_request_system_control,
-	.interface_clear =	pc2_interface_clear,
-	.remote_enable =	pc2_remote_enable,
-	.enable_eos =	pc2_enable_eos,
-	.disable_eos =	pc2_disable_eos,
-	.parallel_poll =	pc2_parallel_poll,
-	.parallel_poll_configure =	pc2_parallel_poll_configure,
-	.parallel_poll_response =	pc2_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status =	NULL,
-	.update_status =	pc2_update_status,
-	.primary_address =	pc2_primary_address,
-	.secondary_address =	pc2_secondary_address,
-	.serial_poll_response =	pc2_serial_poll_response,
-	.serial_poll_status =	pc2_serial_poll_status,
-	.t1_delay = pc2_t1_delay,
-	.return_to_local = pc2_return_to_local,
-};
-
-static struct gpib_interface pc2a_interface = {
-	.name =	"pcIIa",
-	.attach =	pc2a_attach,
-	.detach =	pc2a_detach,
-	.read =	pc2_read,
-	.write =	pc2_write,
-	.command =	pc2_command,
-	.take_control =	pc2_take_control,
-	.go_to_standby =	pc2_go_to_standby,
-	.request_system_control =	pc2_request_system_control,
-	.interface_clear =	pc2_interface_clear,
-	.remote_enable =	pc2_remote_enable,
-	.enable_eos =	pc2_enable_eos,
-	.disable_eos =	pc2_disable_eos,
-	.parallel_poll =	pc2_parallel_poll,
-	.parallel_poll_configure =	pc2_parallel_poll_configure,
-	.parallel_poll_response =	pc2_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status =	NULL,
-	.update_status =	pc2_update_status,
-	.primary_address =	pc2_primary_address,
-	.secondary_address =	pc2_secondary_address,
-	.serial_poll_response =	pc2_serial_poll_response,
-	.serial_poll_status =	pc2_serial_poll_status,
-	.t1_delay = pc2_t1_delay,
-	.return_to_local = pc2_return_to_local,
-};
-
-static struct gpib_interface pc2a_cb7210_interface = {
-	.name =	"pcIIa_cb7210",
-	.attach =	pc2a_cb7210_attach,
-	.detach =	pc2a_detach,
-	.read =	pc2_read,
-	.write =	pc2_write,
-	.command =	pc2_command,
-	.take_control =	pc2_take_control,
-	.go_to_standby =	pc2_go_to_standby,
-	.request_system_control =	pc2_request_system_control,
-	.interface_clear =	pc2_interface_clear,
-	.remote_enable =	pc2_remote_enable,
-	.enable_eos =	pc2_enable_eos,
-	.disable_eos =	pc2_disable_eos,
-	.parallel_poll =	pc2_parallel_poll,
-	.parallel_poll_configure =	pc2_parallel_poll_configure,
-	.parallel_poll_response =	pc2_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status =	NULL, // XXX
-	.update_status =	pc2_update_status,
-	.primary_address =	pc2_primary_address,
-	.secondary_address =	pc2_secondary_address,
-	.serial_poll_response =	pc2_serial_poll_response,
-	.serial_poll_status =	pc2_serial_poll_status,
-	.t1_delay = pc2_t1_delay,
-	.return_to_local = pc2_return_to_local,
-};
-
-static struct gpib_interface pc2_2a_interface = {
-	.name =	"pcII_IIa",
-	.attach =	pc2_2a_attach,
-	.detach =	pc2_2a_detach,
-	.read =	pc2_read,
-	.write =	pc2_write,
-	.command =	pc2_command,
-	.take_control =	pc2_take_control,
-	.go_to_standby =	pc2_go_to_standby,
-	.request_system_control =	pc2_request_system_control,
-	.interface_clear =	pc2_interface_clear,
-	.remote_enable =	pc2_remote_enable,
-	.enable_eos =	pc2_enable_eos,
-	.disable_eos =	pc2_disable_eos,
-	.parallel_poll =	pc2_parallel_poll,
-	.parallel_poll_configure =	pc2_parallel_poll_configure,
-	.parallel_poll_response =	pc2_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status =	NULL,
-	.update_status =	pc2_update_status,
-	.primary_address =	pc2_primary_address,
-	.secondary_address =	pc2_secondary_address,
-	.serial_poll_response =	pc2_serial_poll_response,
-	.serial_poll_status =	pc2_serial_poll_status,
-	.t1_delay = pc2_t1_delay,
-	.return_to_local = pc2_return_to_local,
-};
-
-static int __init pc2_init_module(void)
-{
-	int ret;
-
-	ret = gpib_register_driver(&pc2_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		return ret;
-	}
-
-	ret = gpib_register_driver(&pc2a_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pc2a;
-	}
-
-	ret = gpib_register_driver(&pc2a_cb7210_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_cb7210;
-	}
-
-	ret = gpib_register_driver(&pc2_2a_interface, THIS_MODULE);
-	if (ret) {
-		pr_err("gpib_register_driver failed: error = %d\n", ret);
-		goto err_pc2_2a;
-	}
-
-	return 0;
-
-err_pc2_2a:
-	gpib_unregister_driver(&pc2a_cb7210_interface);
-err_cb7210:
-	gpib_unregister_driver(&pc2a_interface);
-err_pc2a:
-	gpib_unregister_driver(&pc2_interface);
-
-	return ret;
-}
-
-static void __exit pc2_exit_module(void)
-{
-	gpib_unregister_driver(&pc2_interface);
-	gpib_unregister_driver(&pc2a_interface);
-	gpib_unregister_driver(&pc2a_cb7210_interface);
-	gpib_unregister_driver(&pc2_2a_interface);
-}
-
-module_init(pc2_init_module);
-module_exit(pc2_exit_module);
-
diff --git a/drivers/staging/gpib/tms9914/Makefile b/drivers/staging/gpib/tms9914/Makefile
deleted file mode 100644
index 4705ab07f413..000000000000
--- a/drivers/staging/gpib/tms9914/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-
-obj-$(CONFIG_GPIB_TMS9914) += tms9914.o
-
-
-
-
diff --git a/drivers/staging/gpib/tms9914/tms9914.c b/drivers/staging/gpib/tms9914/tms9914.c
deleted file mode 100644
index 72a11596a35e..000000000000
--- a/drivers/staging/gpib/tms9914/tms9914.c
+++ /dev/null
@@ -1,914 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- *   copyright		  : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/dma.h>
-#include <linux/io.h>
-#include <linux/bitops.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-
-#include "gpibP.h"
-#include "tms9914.h"
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB library for tms9914");
-
-static unsigned int update_status_nolock(struct gpib_board *board, struct tms9914_priv *priv);
-
-int tms9914_take_control(struct gpib_board *board, struct tms9914_priv *priv, int synchronous)
-{
-	int i;
-	const int timeout = 100;
-
-	if (synchronous)
-		write_byte(priv, AUX_TCS, AUXCR);
-	else
-		write_byte(priv, AUX_TCA, AUXCR);
-	// busy wait until ATN is asserted
-	for (i = 0; i < timeout; i++) {
-		if ((read_byte(priv, ADSR) & HR_ATN))
-			break;
-		udelay(1);
-	}
-	if (i == timeout)
-		return -ETIMEDOUT;
-
-	clear_bit(WRITE_READY_BN, &priv->state);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(tms9914_take_control);
-
-/*
- * The agilent 82350B has a buggy implementation of tcs which interferes with the
- * operation of tca.  It appears to be based on the controller state machine
- * described in the TI 9900 TMS9914A data manual published in 1982.  This
- * manual describes tcs as putting the controller into a CWAS
- * state where it waits indefinitely for ANRS and ignores tca.	Since a
- * functioning tca is far more important than tcs, we work around the
- * problem by never issuing tcs.
- *
- * I don't know if this problem exists in the real tms9914a or just in the fpga
- * of the 82350B.  For now, only the agilent_82350b uses this workaround.
- * The rest of the tms9914 based drivers still use tms9914_take_control
- * directly (which does issue tcs).
- */
-int tms9914_take_control_workaround(struct gpib_board *board,
-				    struct tms9914_priv *priv, int synchronous)
-{
-	if (synchronous)
-		return -ETIMEDOUT;
-	return tms9914_take_control(board, priv, synchronous);
-}
-EXPORT_SYMBOL_GPL(tms9914_take_control_workaround);
-
-int tms9914_go_to_standby(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	int i;
-	const int timeout = 1000;
-
-	write_byte(priv, AUX_GTS, AUXCR);
-	// busy wait until ATN is released
-	for (i = 0; i < timeout; i++) {
-		if ((read_byte(priv, ADSR) & HR_ATN) == 0)
-			break;
-		udelay(1);
-	}
-	if (i == timeout)
-		return -ETIMEDOUT;
-
-	clear_bit(COMMAND_READY_BN, &priv->state);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(tms9914_go_to_standby);
-
-void tms9914_interface_clear(struct gpib_board *board, struct tms9914_priv *priv, int assert)
-{
-	if (assert) {
-		write_byte(priv, AUX_SIC | AUX_CS, AUXCR);
-
-		set_bit(CIC_NUM, &board->status);
-	} else {
-		write_byte(priv, AUX_SIC, AUXCR);
-	}
-}
-EXPORT_SYMBOL_GPL(tms9914_interface_clear);
-
-void tms9914_remote_enable(struct gpib_board *board, struct tms9914_priv *priv, int enable)
-{
-	if (enable)
-		write_byte(priv, AUX_SRE | AUX_CS, AUXCR);
-	else
-		write_byte(priv, AUX_SRE, AUXCR);
-}
-EXPORT_SYMBOL_GPL(tms9914_remote_enable);
-
-int tms9914_request_system_control(struct gpib_board *board, struct tms9914_priv *priv,
-				   int request_control)
-{
-	if (request_control) {
-		write_byte(priv, AUX_RQC, AUXCR);
-	} else {
-		clear_bit(CIC_NUM, &board->status);
-		write_byte(priv, AUX_RLC, AUXCR);
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(tms9914_request_system_control);
-
-unsigned int tms9914_t1_delay(struct gpib_board *board, struct tms9914_priv *priv,
-			      unsigned int nano_sec)
-{
-	static const int clock_period = 200;	// assuming 5Mhz input clock
-	int num_cycles;
-
-	num_cycles = 12;
-
-	if (nano_sec <= 8 * clock_period) {
-		write_byte(priv, AUX_STDL | AUX_CS, AUXCR);
-		num_cycles = 8;
-	} else {
-		write_byte(priv, AUX_STDL, AUXCR);
-	}
-
-	if (nano_sec <= 4 * clock_period) {
-		write_byte(priv, AUX_VSTDL | AUX_CS, AUXCR);
-		num_cycles = 4;
-	} else {
-		write_byte(priv, AUX_VSTDL, AUXCR);
-	}
-
-	return num_cycles * clock_period;
-}
-EXPORT_SYMBOL_GPL(tms9914_t1_delay);
-
-void tms9914_return_to_local(const struct gpib_board *board, struct tms9914_priv *priv)
-{
-	write_byte(priv, AUX_RTL, AUXCR);
-}
-EXPORT_SYMBOL_GPL(tms9914_return_to_local);
-
-void tms9914_set_holdoff_mode(struct tms9914_priv *priv, enum tms9914_holdoff_mode mode)
-{
-	switch (mode) {
-	case TMS9914_HOLDOFF_NONE:
-		write_byte(priv, AUX_HLDE, AUXCR);
-		write_byte(priv, AUX_HLDA, AUXCR);
-		break;
-	case TMS9914_HOLDOFF_EOI:
-		write_byte(priv, AUX_HLDE | AUX_CS, AUXCR);
-		write_byte(priv, AUX_HLDA, AUXCR);
-		break;
-	case TMS9914_HOLDOFF_ALL:
-		write_byte(priv, AUX_HLDE, AUXCR);
-		write_byte(priv, AUX_HLDA | AUX_CS, AUXCR);
-		break;
-	default:
-		pr_err("bug! bad holdoff mode %i\n", mode);
-		break;
-	}
-	priv->holdoff_mode = mode;
-}
-EXPORT_SYMBOL_GPL(tms9914_set_holdoff_mode);
-
-void tms9914_release_holdoff(struct tms9914_priv *priv)
-{
-	if (priv->holdoff_active) {
-		write_byte(priv, AUX_RHDF, AUXCR);
-		priv->holdoff_active = 0;
-	}
-}
-EXPORT_SYMBOL_GPL(tms9914_release_holdoff);
-
-int tms9914_enable_eos(struct gpib_board *board, struct tms9914_priv *priv, u8 eos_byte,
-		       int compare_8_bits)
-{
-	priv->eos = eos_byte;
-	priv->eos_flags = REOS;
-	if (compare_8_bits)
-		priv->eos_flags |= BIN;
-	return 0;
-}
-EXPORT_SYMBOL(tms9914_enable_eos);
-
-void tms9914_disable_eos(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	priv->eos_flags &= ~REOS;
-}
-EXPORT_SYMBOL(tms9914_disable_eos);
-
-int tms9914_parallel_poll(struct gpib_board *board, struct tms9914_priv *priv, u8 *result)
-{
-	// execute parallel poll
-	write_byte(priv, AUX_CS | AUX_RPP, AUXCR);
-	udelay(2);
-	*result = read_byte(priv, CPTR);
-	// clear parallel poll state
-	write_byte(priv, AUX_RPP, AUXCR);
-	return 0;
-}
-EXPORT_SYMBOL(tms9914_parallel_poll);
-
-static void set_ppoll_reg(struct tms9914_priv *priv, int enable,
-			  unsigned int dio_line, int sense, int ist)
-{
-	u8 dio_byte;
-
-	if (enable && ((sense && ist) || (!sense && !ist))) {
-		dio_byte = 1 << (dio_line - 1);
-		write_byte(priv, dio_byte, PPR);
-	} else {
-		write_byte(priv, 0, PPR);
-	}
-}
-
-void tms9914_parallel_poll_configure(struct gpib_board *board,
-				     struct tms9914_priv *priv, u8 config)
-{
-	priv->ppoll_enable = (config & PPC_DISABLE) == 0;
-	priv->ppoll_line = (config & PPC_DIO_MASK) + 1;
-	priv->ppoll_sense = (config & PPC_SENSE) != 0;
-	set_ppoll_reg(priv, priv->ppoll_enable, priv->ppoll_line, priv->ppoll_sense, board->ist);
-}
-EXPORT_SYMBOL(tms9914_parallel_poll_configure);
-
-void tms9914_parallel_poll_response(struct gpib_board *board,
-				    struct tms9914_priv *priv, int ist)
-{
-	set_ppoll_reg(priv, priv->ppoll_enable, priv->ppoll_line, priv->ppoll_sense, ist);
-}
-EXPORT_SYMBOL(tms9914_parallel_poll_response);
-
-void tms9914_serial_poll_response(struct gpib_board *board,
-				  struct tms9914_priv *priv, u8 status)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	write_byte(priv, status, SPMR);
-	priv->spoll_status = status;
-	if (status & request_service_bit)
-		write_byte(priv, AUX_RSV2 | AUX_CS, AUXCR);
-	else
-		write_byte(priv, AUX_RSV2, AUXCR);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-EXPORT_SYMBOL(tms9914_serial_poll_response);
-
-u8 tms9914_serial_poll_status(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	u8 status;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	status = priv->spoll_status;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return status;
-}
-EXPORT_SYMBOL(tms9914_serial_poll_status);
-
-int tms9914_primary_address(struct gpib_board *board,
-			    struct tms9914_priv *priv, unsigned int address)
-{
-	// put primary address in address0
-	write_byte(priv, address & ADDRESS_MASK, ADR);
-	return 0;
-}
-EXPORT_SYMBOL(tms9914_primary_address);
-
-int tms9914_secondary_address(struct gpib_board *board, struct tms9914_priv *priv,
-			      unsigned int address, int enable)
-{
-	if (enable)
-		priv->imr1_bits |= HR_APTIE;
-	else
-		priv->imr1_bits &= ~HR_APTIE;
-
-	write_byte(priv, priv->imr1_bits, IMR1);
-	return 0;
-}
-EXPORT_SYMBOL(tms9914_secondary_address);
-
-unsigned int tms9914_update_status(struct gpib_board *board, struct tms9914_priv *priv,
-				   unsigned int clear_mask)
-{
-	unsigned long flags;
-	unsigned int retval;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	retval = update_status_nolock(board, priv);
-	board->status &= ~clear_mask;
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return retval;
-}
-EXPORT_SYMBOL(tms9914_update_status);
-
-static void update_talker_state(struct tms9914_priv *priv, unsigned int address_status_bits)
-{
-	if (address_status_bits & HR_TA)	{
-		if (address_status_bits & HR_ATN)
-			priv->talker_state = talker_addressed;
-		else
-			/*
-			 * this could also be serial_poll_active, but the tms9914 provides no
-			 * way to distinguish, so we'll assume talker_active
-			 */
-			priv->talker_state = talker_active;
-	} else {
-		priv->talker_state = talker_idle;
-	}
-}
-
-static void update_listener_state(struct tms9914_priv *priv, unsigned int address_status_bits)
-{
-	if (address_status_bits & HR_LA)	{
-		if (address_status_bits & HR_ATN)
-			priv->listener_state = listener_addressed;
-		else
-			priv->listener_state = listener_active;
-	} else {
-		priv->listener_state = listener_idle;
-	}
-}
-
-static unsigned int update_status_nolock(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	int address_status;
-	int bsr_bits;
-
-	address_status = read_byte(priv, ADSR);
-
-	// check for remote/local
-	if (address_status & HR_REM)
-		set_bit(REM_NUM, &board->status);
-	else
-		clear_bit(REM_NUM, &board->status);
-	// check for lockout
-	if (address_status & HR_LLO)
-		set_bit(LOK_NUM, &board->status);
-	else
-		clear_bit(LOK_NUM, &board->status);
-	// check for ATN
-	if (address_status & HR_ATN)
-		set_bit(ATN_NUM, &board->status);
-	else
-		clear_bit(ATN_NUM, &board->status);
-	// check for talker/listener addressed
-	update_talker_state(priv, address_status);
-	if (priv->talker_state == talker_active || priv->talker_state == talker_addressed)
-		set_bit(TACS_NUM, &board->status);
-	else
-		clear_bit(TACS_NUM, &board->status);
-
-	update_listener_state(priv, address_status);
-	if (priv->listener_state == listener_active || priv->listener_state == listener_addressed)
-		set_bit(LACS_NUM, &board->status);
-	else
-		clear_bit(LACS_NUM, &board->status);
-	// Check for SRQI - not reset elsewhere except in autospoll
-	if (board->status & SRQI) {
-		bsr_bits = read_byte(priv, BSR);
-		if (!(bsr_bits & BSR_SRQ_BIT))
-			clear_bit(SRQI_NUM, &board->status);
-	}
-
-	dev_dbg(board->gpib_dev, "status 0x%lx, state 0x%lx\n", board->status, priv->state);
-
-	return board->status;
-}
-
-int tms9914_line_status(const struct gpib_board *board, struct tms9914_priv *priv)
-{
-	int bsr_bits;
-	int status = VALID_ALL;
-
-	bsr_bits = read_byte(priv, BSR);
-
-	if (bsr_bits & BSR_REN_BIT)
-		status |= BUS_REN;
-	if (bsr_bits & BSR_IFC_BIT)
-		status |= BUS_IFC;
-	if (bsr_bits & BSR_SRQ_BIT)
-		status |= BUS_SRQ;
-	if (bsr_bits & BSR_EOI_BIT)
-		status |= BUS_EOI;
-	if (bsr_bits & BSR_NRFD_BIT)
-		status |= BUS_NRFD;
-	if (bsr_bits & BSR_NDAC_BIT)
-		status |= BUS_NDAC;
-	if (bsr_bits & BSR_DAV_BIT)
-		status |= BUS_DAV;
-	if (bsr_bits & BSR_ATN_BIT)
-		status |= BUS_ATN;
-
-	return status;
-}
-EXPORT_SYMBOL(tms9914_line_status);
-
-static int check_for_eos(struct tms9914_priv *priv, u8 byte)
-{
-	static const u8 seven_bit_compare_mask = 0x7f;
-
-	if ((priv->eos_flags & REOS) == 0)
-		return 0;
-
-	if (priv->eos_flags & BIN) {
-		if (priv->eos == byte)
-			return 1;
-	} else	{
-		if ((priv->eos & seven_bit_compare_mask) == (byte & seven_bit_compare_mask))
-			return 1;
-	}
-	return 0;
-}
-
-static int wait_for_read_byte(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	if (wait_event_interruptible(board->wait,
-				     test_bit(READ_READY_BN, &priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		return -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-
-	if (test_bit(DEV_CLEAR_BN, &priv->state))
-		return -EINTR;
-	return 0;
-}
-
-static inline u8 tms9914_read_data_in(struct gpib_board *board,
-				      struct tms9914_priv *priv, int *end)
-{
-	unsigned long flags;
-	u8 data;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	clear_bit(READ_READY_BN, &priv->state);
-	data = read_byte(priv, DIR);
-	if (test_and_clear_bit(RECEIVED_END_BN, &priv->state))
-		*end = 1;
-	else
-		*end = 0;
-	switch (priv->holdoff_mode) {
-	case TMS9914_HOLDOFF_EOI:
-		if (*end)
-			priv->holdoff_active = 1;
-		break;
-	case TMS9914_HOLDOFF_ALL:
-		priv->holdoff_active = 1;
-		break;
-	case TMS9914_HOLDOFF_NONE:
-		break;
-	default:
-		dev_err(board->gpib_dev, "bug! bad holdoff mode %i\n", priv->holdoff_mode);
-		break;
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	return data;
-}
-
-static int pio_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		    size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-
-	*bytes_read = 0;
-	*end = 0;
-	while (*bytes_read < length && *end == 0) {
-		tms9914_release_holdoff(priv);
-		retval = wait_for_read_byte(board, priv);
-		if (retval < 0)
-			return retval;
-		buffer[(*bytes_read)++] = tms9914_read_data_in(board, priv, end);
-
-		if (check_for_eos(priv, buffer[*bytes_read - 1]))
-			*end = 1;
-	}
-
-	return retval;
-}
-
-int tms9914_read(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		 size_t length, int *end, size_t *bytes_read)
-{
-	ssize_t retval = 0;
-	size_t num_bytes;
-
-	*end = 0;
-	*bytes_read = 0;
-	if (length == 0)
-		return 0;
-
-	clear_bit(DEV_CLEAR_BN, &priv->state);
-
-	// transfer data (except for last byte)
-	if (length > 1)	{
-		if (priv->eos_flags & REOS)
-			tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_ALL);
-		else
-			tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_EOI);
-		// PIO transfer
-		retval = pio_read(board, priv, buffer, length - 1, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0)
-			return retval;
-		buffer += num_bytes;
-		length -= num_bytes;
-	}
-	// read last bytes if we haven't received an END yet
-	if (*end == 0) {
-		// make sure we holdoff after last byte read
-		tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_ALL);
-		retval = pio_read(board, priv, buffer, length, end, &num_bytes);
-		*bytes_read += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(tms9914_read);
-
-static int pio_write_wait(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	// wait until next byte is ready to be sent
-	if (wait_event_interruptible(board->wait,
-				     test_bit(WRITE_READY_BN, &priv->state) ||
-				     test_bit(BUS_ERROR_BN, &priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		return -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-	if (test_bit(BUS_ERROR_BN, &priv->state))
-		return -EIO;
-	if (test_bit(DEV_CLEAR_BN, &priv->state))
-		return -EINTR;
-
-	return 0;
-}
-
-static int pio_write(struct gpib_board *board, struct tms9914_priv *priv, u8 *buffer,
-		     size_t length, size_t *bytes_written)
-{
-	ssize_t retval = 0;
-	unsigned long flags;
-
-	*bytes_written = 0;
-	while (*bytes_written < length) {
-		retval = pio_write_wait(board, priv);
-		if (retval < 0)
-			break;
-
-		spin_lock_irqsave(&board->spinlock, flags);
-		clear_bit(WRITE_READY_BN, &priv->state);
-		write_byte(priv, buffer[(*bytes_written)++], CDOR);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-	}
-	retval = pio_write_wait(board, priv);
-	if (retval < 0)
-		return retval;
-
-	return length;
-}
-
-int tms9914_write(struct gpib_board *board, struct tms9914_priv *priv,
-		  u8 *buffer, size_t length, int send_eoi, size_t *bytes_written)
-{
-	ssize_t retval = 0;
-
-	*bytes_written = 0;
-	if (length == 0)
-		return 0;
-
-	clear_bit(BUS_ERROR_BN, &priv->state);
-	clear_bit(DEV_CLEAR_BN, &priv->state);
-
-	if (send_eoi)
-		length-- ; /* save the last byte for sending EOI */
-
-	if (length > 0)	{
-		size_t num_bytes;
-		// PIO transfer
-		retval = pio_write(board, priv, buffer, length, &num_bytes);
-		*bytes_written += num_bytes;
-		if (retval < 0)
-			return retval;
-	}
-	if (send_eoi) {
-		size_t num_bytes;
-		/*send EOI */
-		write_byte(priv, AUX_SEOI, AUXCR);
-
-		retval = pio_write(board, priv, &buffer[*bytes_written], 1, &num_bytes);
-		*bytes_written += num_bytes;
-	}
-	return retval;
-}
-EXPORT_SYMBOL(tms9914_write);
-
-static void check_my_address_state(struct gpib_board *board,
-				   struct tms9914_priv *priv, int cmd_byte)
-{
-	if (cmd_byte == MLA(board->pad)) {
-		priv->primary_listen_addressed = 1;
-		// become active listener
-		if (board->sad < 0)
-			write_byte(priv, AUX_LON | AUX_CS, AUXCR);
-	} else if (board->sad >= 0 && priv->primary_listen_addressed &&
-		  cmd_byte == MSA(board->sad)) {
-		// become active listener
-		write_byte(priv, AUX_LON | AUX_CS, AUXCR);
-	} else if (cmd_byte != MLA(board->pad) && (cmd_byte & 0xe0) == LAD) {
-		priv->primary_listen_addressed = 0;
-	} else if (cmd_byte == UNL) {
-		priv->primary_listen_addressed = 0;
-		write_byte(priv, AUX_LON, AUXCR);
-	} else if (cmd_byte == MTA(board->pad))	{
-		priv->primary_talk_addressed = 1;
-		if (board->sad < 0)
-			// make active talker
-			write_byte(priv, AUX_TON | AUX_CS, AUXCR);
-	} else if (board->sad >= 0 && priv->primary_talk_addressed &&
-		   cmd_byte == MSA(board->sad)) {
-		// become active talker
-		write_byte(priv, AUX_TON | AUX_CS, AUXCR);
-	} else if (cmd_byte != MTA(board->pad) && (cmd_byte & 0xe0) == TAD) {
-		// Other Talk Address
-		priv->primary_talk_addressed = 0;
-		write_byte(priv, AUX_TON, AUXCR);
-	} else if (cmd_byte == UNT) {
-		priv->primary_talk_addressed = 0;
-		write_byte(priv, AUX_TON, AUXCR);
-	}
-}
-
-int tms9914_command(struct gpib_board *board, struct tms9914_priv *priv,  u8 *buffer,
-		    size_t length, size_t *bytes_written)
-{
-	int retval = 0;
-	unsigned long flags;
-
-	*bytes_written = 0;
-	while (*bytes_written < length) {
-		if (wait_event_interruptible(board->wait,
-					     test_bit(COMMAND_READY_BN,
-						      &priv->state) ||
-					     test_bit(TIMO_NUM, &board->status)))
-			break;
-		if (test_bit(TIMO_NUM, &board->status))
-			break;
-
-		spin_lock_irqsave(&board->spinlock, flags);
-		clear_bit(COMMAND_READY_BN, &priv->state);
-		write_byte(priv, buffer[*bytes_written], CDOR);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		check_my_address_state(board, priv, buffer[*bytes_written]);
-
-		++(*bytes_written);
-	}
-	// wait until last command byte is written
-	if (wait_event_interruptible(board->wait,
-				     test_bit(COMMAND_READY_BN,
-					      &priv->state) || test_bit(TIMO_NUM, &board->status)))
-		retval = -ERESTARTSYS;
-	if (test_bit(TIMO_NUM, &board->status))
-		retval = -ETIMEDOUT;
-
-	return retval;
-}
-EXPORT_SYMBOL(tms9914_command);
-
-irqreturn_t tms9914_interrupt(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	int status0, status1;
-
-	// read interrupt status (also clears status)
-	status0 = read_byte(priv, ISR0);
-	status1 = read_byte(priv, ISR1);
-	return tms9914_interrupt_have_status(board, priv, status0, status1);
-}
-EXPORT_SYMBOL(tms9914_interrupt);
-
-irqreturn_t tms9914_interrupt_have_status(struct gpib_board *board, struct tms9914_priv *priv,
-					  int status0, int status1)
-{
-	// record reception of END
-	if (status0 & HR_END)
-		set_bit(RECEIVED_END_BN, &priv->state);
-	// get incoming data in PIO mode
-	if ((status0 & HR_BI))
-		set_bit(READ_READY_BN, &priv->state);
-	if ((status0 & HR_BO))	{
-		if (read_byte(priv, ADSR) & HR_ATN)
-			set_bit(COMMAND_READY_BN, &priv->state);
-		else
-			set_bit(WRITE_READY_BN, &priv->state);
-	}
-
-	if (status0 & HR_SPAS) {
-		priv->spoll_status &= ~request_service_bit;
-		write_byte(priv, priv->spoll_status, SPMR);
-		// FIXME: set SPOLL status bit
-	}
-	// record service request in status
-	if (status1 & HR_SRQ)
-		set_bit(SRQI_NUM, &board->status);
-	// have been addressed (with secondary addressing disabled)
-	if (status1 & HR_MA)
-		// clear dac holdoff
-		write_byte(priv, AUX_VAL, AUXCR);
-	// unrecognized command received
-	if (status1 & HR_UNC) {
-		unsigned short command_byte = read_byte(priv, CPTR) & gpib_command_mask;
-
-		switch (command_byte) {
-		case PP_CONFIG:
-			priv->ppoll_configure_state = 1;
-			/*
-			 * AUX_PTS generates another UNC interrupt on the next command byte
-			 * if it is in the secondary address group (such as PPE and PPD).
-			 */
-			write_byte(priv, AUX_PTS, AUXCR);
-			write_byte(priv, AUX_VAL, AUXCR);
-			break;
-		case PPU:
-			tms9914_parallel_poll_configure(board, priv, command_byte);
-			write_byte(priv, AUX_VAL, AUXCR);
-			break;
-		default:
-			if (is_PPE(command_byte) || is_PPD(command_byte)) {
-				if (priv->ppoll_configure_state) {
-					tms9914_parallel_poll_configure(board, priv, command_byte);
-					write_byte(priv, AUX_VAL, AUXCR);
-				} else	{// bad parallel poll configure byte
-					// clear dac holdoff
-					write_byte(priv, AUX_INVAL, AUXCR);
-				}
-			} else	{
-				// clear dac holdoff
-				write_byte(priv, AUX_INVAL, AUXCR);
-			}
-			break;
-		}
-
-		if (in_primary_command_group(command_byte) && command_byte != PP_CONFIG)
-			priv->ppoll_configure_state = 0;
-	}
-
-	if (status1 & HR_ERR) {
-		dev_dbg(board->gpib_dev, "gpib bus error\n");
-		set_bit(BUS_ERROR_BN, &priv->state);
-	}
-
-	if (status1 & HR_IFC) {
-		push_gpib_event(board, EVENT_IFC);
-		clear_bit(CIC_NUM, &board->status);
-	}
-
-	if (status1 & HR_GET) {
-		push_gpib_event(board, EVENT_DEV_TRG);
-		// clear dac holdoff
-		write_byte(priv, AUX_VAL, AUXCR);
-	}
-
-	if (status1 & HR_DCAS) {
-		push_gpib_event(board, EVENT_DEV_CLR);
-		// clear dac holdoff
-		write_byte(priv, AUX_VAL, AUXCR);
-		set_bit(DEV_CLEAR_BN, &priv->state);
-	}
-
-	// check for being addressed with secondary addressing
-	if (status1 & HR_APT) {
-		if (board->sad < 0)
-			dev_err(board->gpib_dev, "bug, APT interrupt without secondary addressing?\n");
-		if ((read_byte(priv, CPTR) & gpib_command_mask) == MSA(board->sad))
-			write_byte(priv, AUX_VAL, AUXCR);
-		else
-			write_byte(priv, AUX_INVAL, AUXCR);
-	}
-
-	if ((status0 & priv->imr0_bits) || (status1 & priv->imr1_bits))	{
-		dev_dbg(board->gpib_dev, "isr0 0x%x, imr0 0x%x, isr1 0x%x, imr1 0x%x\n",
-			status0, priv->imr0_bits, status1, priv->imr1_bits);
-		update_status_nolock(board, priv);
-		wake_up_interruptible(&board->wait);
-	}
-	return IRQ_HANDLED;
-}
-EXPORT_SYMBOL(tms9914_interrupt_have_status);
-
-void tms9914_board_reset(struct tms9914_priv *priv)
-{
-	/* chip reset */
-	write_byte(priv, AUX_CHIP_RESET | AUX_CS, AUXCR);
-
-	/* disable all interrupts */
-	priv->imr0_bits = 0;
-	write_byte(priv, priv->imr0_bits, IMR0);
-	priv->imr1_bits = 0;
-	write_byte(priv, priv->imr1_bits, IMR1);
-	write_byte(priv, AUX_DAI | AUX_CS, AUXCR);
-
-	/* clear registers by reading */
-	read_byte(priv, CPTR);
-	read_byte(priv, ISR0);
-	read_byte(priv, ISR1);
-
-	write_byte(priv, 0, SPMR);
-
-	/* parallel poll unconfigure */
-	write_byte(priv, 0, PPR);
-	/* request for data holdoff */
-	tms9914_set_holdoff_mode(priv, TMS9914_HOLDOFF_ALL);
-}
-EXPORT_SYMBOL_GPL(tms9914_board_reset);
-
-void tms9914_online(struct gpib_board *board, struct tms9914_priv *priv)
-{
-	/* set GPIB address */
-	tms9914_primary_address(board, priv, board->pad);
-	tms9914_secondary_address(board, priv, board->sad, board->sad >= 0);
-
-	/* enable tms9914 interrupts */
-	priv->imr0_bits |= HR_MACIE | HR_RLCIE | HR_ENDIE | HR_BOIE | HR_BIIE |
-		HR_SPASIE;
-	priv->imr1_bits |= HR_MAIE | HR_SRQIE | HR_UNCIE | HR_ERRIE | HR_IFCIE |
-		HR_GETIE | HR_DCASIE;
-	write_byte(priv, priv->imr0_bits, IMR0);
-	write_byte(priv, priv->imr1_bits, IMR1);
-	write_byte(priv, AUX_DAI, AUXCR);
-
-	/* turn off reset state */
-	write_byte(priv, AUX_CHIP_RESET, AUXCR);
-}
-EXPORT_SYMBOL_GPL(tms9914_online);
-
-#ifdef CONFIG_HAS_IOPORT
-// wrapper for inb
-u8 tms9914_ioport_read_byte(struct tms9914_priv *priv, unsigned int register_num)
-{
-	return inb(priv->iobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL_GPL(tms9914_ioport_read_byte);
-
-// wrapper for outb
-void tms9914_ioport_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
-{
-	outb(data, priv->iobase + register_num * priv->offset);
-	if (register_num == AUXCR)
-		udelay(1);
-}
-EXPORT_SYMBOL_GPL(tms9914_ioport_write_byte);
-#endif
-
-// wrapper for readb
-u8 tms9914_iomem_read_byte(struct tms9914_priv *priv, unsigned int register_num)
-{
-	return readb(priv->mmiobase + register_num * priv->offset);
-}
-EXPORT_SYMBOL_GPL(tms9914_iomem_read_byte);
-
-// wrapper for writeb
-void tms9914_iomem_write_byte(struct tms9914_priv *priv, u8 data, unsigned int register_num)
-{
-	writeb(data, priv->mmiobase + register_num * priv->offset);
-	if (register_num == AUXCR)
-		udelay(1);
-}
-EXPORT_SYMBOL_GPL(tms9914_iomem_write_byte);
-
-static int __init tms9914_init_module(void)
-{
-	return 0;
-}
-
-static void __exit tms9914_exit_module(void)
-{
-}
-
-module_init(tms9914_init_module);
-module_exit(tms9914_exit_module);
-
diff --git a/drivers/staging/gpib/tnt4882/Makefile b/drivers/staging/gpib/tnt4882/Makefile
deleted file mode 100644
index fa1687ad0d1b..000000000000
--- a/drivers/staging/gpib/tnt4882/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-obj-$(CONFIG_GPIB_NI_PCI_ISA) += tnt4882.o
-
-tnt4882-objs := tnt4882_gpib.o mite.o
-
-
-
diff --git a/drivers/staging/gpib/tnt4882/mite.c b/drivers/staging/gpib/tnt4882/mite.c
deleted file mode 100644
index 847b96f411bd..000000000000
--- a/drivers/staging/gpib/tnt4882/mite.c
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-/*
- *	Hardware driver for NI Mite PCI interface chip,
- *	adapted from COMEDI
- *
- *	Copyright (C) 1997-8 David A. Schleef
- *	Copyright (C) 2002 Frank Mori Hess
- *
- *	The PCI-MIO E series driver was originally written by
- *	Tomasz Motylewski <...>, and ported to comedi by ds.
- *
- *	References for specifications:
- *
- *	   321747b.pdf  Register Level Programmer Manual (obsolete)
- *	   321747c.pdf  Register Level Programmer Manual (new)
- *	   DAQ-STC reference manual
- *
- *	Other possibly relevant info:
- *
- *	   320517c.pdf  User manual (obsolete)
- *	   320517f.pdf  User manual (new)
- *	   320889a.pdf  delete
- *	   320906c.pdf  maximum signal ratings
- *	   321066a.pdf  about 16x
- *	   321791a.pdf  discontinuation of at-mio-16e-10 rev. c
- *	   321808a.pdf  about at-mio-16e-10 rev P
- *	   321837a.pdf  discontinuation of at-mio-16de-10 rev d
- *	   321838a.pdf  about at-mio-16de-10 rev N
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-
-#include "mite.h"
-
-#define PCI_MITE_SIZE		4096
-#define PCI_DAQ_SIZE		4096
-
-struct mite_struct *mite_devices;
-
-#define TOP_OF_PAGE(x) ((x) | (~(PAGE_MASK)))
-
-void mite_init(void)
-{
-	struct pci_dev *pcidev;
-	struct mite_struct *mite;
-
-	for (pcidev = pci_get_device(PCI_VENDOR_ID_NATINST, PCI_ANY_ID, NULL);
-		pcidev;
-		pcidev = pci_get_device(PCI_VENDOR_ID_NATINST, PCI_ANY_ID, pcidev)) {
-		mite = kzalloc(sizeof(*mite), GFP_KERNEL);
-		if (!mite)
-			return;
-
-		mite->pcidev = pcidev;
-		pci_dev_get(mite->pcidev);
-		mite->next = mite_devices;
-		mite_devices = mite;
-	}
-}
-
-int mite_setup(struct mite_struct *mite)
-{
-	u32 addr;
-
-	if (pci_enable_device(mite->pcidev)) {
-		pr_err("mite: error enabling mite.\n");
-		return -EIO;
-	}
-	pci_set_master(mite->pcidev);
-	if (pci_request_regions(mite->pcidev, "mite")) {
-		pr_err("mite: failed to request mite io regions.\n");
-		return -EIO;
-	}
-	addr = pci_resource_start(mite->pcidev, 0);
-	mite->mite_phys_addr = addr;
-	mite->mite_io_addr = ioremap(addr, pci_resource_len(mite->pcidev, 0));
-	if (!mite->mite_io_addr) {
-		pr_err("mite: failed to remap mite io memory address.\n");
-		return -ENOMEM;
-	}
-	addr = pci_resource_start(mite->pcidev, 1);
-	mite->daq_phys_addr = addr;
-	mite->daq_io_addr = ioremap(mite->daq_phys_addr, pci_resource_len(mite->pcidev, 1));
-	if (!mite->daq_io_addr)	{
-		pr_err("mite: failed to remap daq io memory address.\n");
-		return -ENOMEM;
-	}
-	writel(mite->daq_phys_addr | WENAB, mite->mite_io_addr + MITE_IODWBSR);
-	mite->used = 1;
-	return 0;
-}
-
-void mite_cleanup(void)
-{
-	struct mite_struct *mite, *next;
-
-	for (mite = mite_devices; mite; mite = next) {
-		next = mite->next;
-		if (mite->pcidev)
-			pci_dev_put(mite->pcidev);
-		kfree(mite);
-	}
-}
-
-void mite_unsetup(struct mite_struct *mite)
-{
-	if (!mite)
-		return;
-	if (mite->mite_io_addr)	{
-		iounmap(mite->mite_io_addr);
-		mite->mite_io_addr = NULL;
-	}
-	if (mite->daq_io_addr) {
-		iounmap(mite->daq_io_addr);
-		mite->daq_io_addr = NULL;
-	}
-	if (mite->mite_phys_addr) {
-		pci_release_regions(mite->pcidev);
-		pci_disable_device(mite->pcidev);
-		mite->mite_phys_addr = 0;
-	}
-	mite->used = 0;
-}
diff --git a/drivers/staging/gpib/tnt4882/mite.h b/drivers/staging/gpib/tnt4882/mite.h
deleted file mode 100644
index a1fdba9672a0..000000000000
--- a/drivers/staging/gpib/tnt4882/mite.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-
-/*
- *   Hardware driver for NI Mite PCI interface chip
- *
- *   Copyright (C) 1999 David A. Schleef <ds@stm.lbl.gov>
- */
-
-#ifndef _MITE_H_
-#define _MITE_H_
-
-#include <linux/pci.h>
-
-#define PCI_VENDOR_ID_NATINST		0x1093
-
-//#define DEBUG_MITE
-
-#ifdef DEBUG_MITE
-#define MDPRINTK(format, args...) pr_debug(format, ## args)
-#else
-#define MDPRINTK(args...)
-#endif
-
-#define MITE_RING_SIZE 3000
-struct mite_dma_chain {
-	u32 count;
-	u32 addr;
-	u32 next;
-};
-
-struct mite_struct {
-	struct mite_struct *next;
-	int used;
-
-	struct pci_dev *pcidev;
-	unsigned long mite_phys_addr;
-	void __iomem *mite_io_addr;
-	unsigned long daq_phys_addr;
-	void __iomem *daq_io_addr;
-
-	int DMA_CheckNearEnd;
-
-	struct mite_dma_chain ring[MITE_RING_SIZE];
-};
-
-extern struct mite_struct *mite_devices;
-
-extern inline unsigned int mite_irq(struct mite_struct *mite)
-{
-	return mite->pcidev->irq;
-};
-
-extern inline unsigned int mite_device_id(struct mite_struct *mite)
-{
-	return mite->pcidev->device;
-};
-
-void mite_init(void);
-void mite_cleanup(void);
-int mite_setup(struct mite_struct *mite);
-void mite_unsetup(struct mite_struct *mite);
-void mite_list_devices(void);
-
-#define CHAN_OFFSET(x)			(0x100 * (x))
-
-/* DMA base for chan 0 is 0x500, chan 1 is 0x600 */
-
-#define MITE_CHOR		0x500
-#define CHOR_DMARESET			BIT(31)
-#define CHOR_SET_SEND_TC		BIT(11)
-#define CHOR_CLR_SEND_TC		BIT(10)
-#define CHOR_SET_LPAUSE			BIT(9)
-#define CHOR_CLR_LPAUSE			BIT(8)
-#define CHOR_CLRDONE			BIT(7)
-#define CHOR_CLRRB			BIT(6)
-#define CHOR_CLRLC			BIT(5)
-#define CHOR_FRESET			BIT(4)
-#define CHOR_ABORT			BIT(3)
-#define CHOR_STOP			BIT(2)
-#define CHOR_CONT			BIT(1)
-#define CHOR_START			BIT(0)
-#define CHOR_PON			(CHOR_CLR_SEND_TC | CHOR_CLR_LPAUSE)
-
-#define MITE_CHCR		0x504
-#define CHCR_SET_DMA_IE			BIT(31)
-#define CHCR_CLR_DMA_IE			BIT(30)
-#define CHCR_SET_LINKP_IE		BIT(29)
-#define CHCR_CLR_LINKP_IE		BIT(28)
-#define CHCR_SET_SAR_IE			BIT(27)
-#define CHCR_CLR_SAR_IE			BIT(26)
-#define CHCR_SET_DONE_IE		BIT(25)
-#define CHCR_CLR_DONE_IE		BIT(24)
-#define CHCR_SET_MRDY_IE		BIT(23)
-#define CHCR_CLR_MRDY_IE		BIT(22)
-#define CHCR_SET_DRDY_IE		BIT(21)
-#define CHCR_CLR_DRDY_IE		BIT(20)
-#define CHCR_SET_LC_IE			BIT(19)
-#define CHCR_CLR_LC_IE			BIT(18)
-#define CHCR_SET_CONT_RB_IE		BIT(17)
-#define CHCR_CLR_CONT_RB_IE		BIT(16)
-#define CHCR_FIFODIS			BIT(15)
-#define CHCR_FIFO_ON			0
-#define CHCR_BURSTEN			BIT(14)
-#define CHCR_NO_BURSTEN			0
-#define CHCR_NFTP(x)			((x) << 11)
-#define CHCR_NFTP0			CHCR_NFTP(0)
-#define CHCR_NFTP1			CHCR_NFTP(1)
-#define CHCR_NFTP2			CHCR_NFTP(2)
-#define CHCR_NFTP4			CHCR_NFTP(3)
-#define CHCR_NFTP8			CHCR_NFTP(4)
-#define CHCR_NFTP16			CHCR_NFTP(5)
-#define CHCR_NETP(x)			((x) << 11)
-#define CHCR_NETP0			CHCR_NETP(0)
-#define CHCR_NETP1			CHCR_NETP(1)
-#define CHCR_NETP2			CHCR_NETP(2)
-#define CHCR_NETP4			CHCR_NETP(3)
-#define CHCR_NETP8			CHCR_NETP(4)
-#define CHCR_CHEND1			BIT(5)
-#define CHCR_CHEND0			BIT(4)
-#define CHCR_DIR			BIT(3)
-#define CHCR_DEV_TO_MEM			CHCR_DIR
-#define CHCR_MEM_TO_DEV			0
-#define CHCR_NORMAL			((0) << 0)
-#define CHCR_CONTINUE			((1) << 0)
-#define CHCR_RINGBUFF			((2) << 0)
-#define CHCR_LINKSHORT			((4) << 0)
-#define CHCR_LINKLONG			((5) << 0)
-#define CHCRPON				(CHCR_CLR_DMA_IE | CHCR_CLR_LINKP_IE | CHCR_CLR_SAR_IE | \
-					 CHCR_CLR_DONE_IE | CHCR_CLR_MRDY_IE | CHCR_CLR_DRDY_IE | \
-					 CHCR_CLR_LC_IE | CHCR_CLR_CONT_IE)
-
-#define MITE_TCR		0x508
-
-/* CR bits */
-#define CR_RL(x)			((x) << 21)
-#define CR_RL0				CR_RL(0)
-#define CR_RL1				CR_RL(1)
-#define CR_RL2				CR_RL(2)
-#define CR_RL4				CR_RL(3)
-#define CR_RL8				CR_RL(4)
-#define CR_RL16				CR_RL(5)
-#define CR_RL32				CR_RL(6)
-#define CR_RL64				CR_RL(7)
-#define CR_RD(x)			((x) << 19)
-#define CR_RD0				CR_RD(0)
-#define CR_RD32				CR_RD(1)
-#define CR_RD512			CR_RD(2)
-#define CR_RD8192			CR_RD(3)
-#define CR_REQS(x)			((x) << 16)
-#define CR_REQSDRQ0			CR_REQS(4)
-#define CR_REQSDRQ1			CR_REQS(5)
-#define CR_REQSDRQ2			CR_REQS(6)
-#define CR_REQSDRQ3			CR_REQS(7)
-#define CR_ASEQX(x)			((x) << 10)
-#define CR_ASEQX0			CR_ASEQX(0)
-#define	CR_ASEQDONT			CR_ASEQX0
-#define CR_ASEQXP1			CR_ASEQX(1)
-#define CR_ASEQUP			CR_ASEQXP1
-#define CR_ASEQXP2			CR_ASEQX(2)
-#define CR_ASEQDOWN			CR_ASEQXP2
-#define CR_ASEQXP4			CR_ASEQX(3)
-#define CR_ASEQXP8			CR_ASEQX(4)
-#define CR_ASEQXP16			CR_ASEQX(5)
-#define CR_ASEQXP32			CR_ASEQX(6)
-#define CR_ASEQXP64			CR_ASEQX(7)
-#define CR_ASEQXM1			CR_ASEQX(9)
-#define CR_ASEQXM2			CR_ASEQX(10)
-#define CR_ASEQXM4			CR_ASEQX(11)
-#define CR_ASEQXM8			CR_ASEQX(12)
-#define CR_ASEQXM16			CR_ASEQX(13)
-#define CR_ASEQXM32			CR_ASEQX(14)
-#define CR_ASEQXM64			CR_ASEQX(15)
-#define CR_PSIZEBYTE			BIT(8)
-#define CR_PSIZEHALF			(2 << 8)
-#define CR_PSIZEWORD			(3 << 8)
-#define CR_PORTCPU			(0 << 6)
-#define CR_PORTIO			BIT(6)
-#define CR_PORTVXI			(2 << 6)
-#define CR_PORTMXI			(3 << 6)
-#define CR_AMDEVICE			BIT(0)
-
-#define CHSR_INT			0x80000000
-#define CHSR_DONE			0x02000000
-#define CHSR_LINKC			0x00080000
-
-#define MITE_MCR		0x50c
-#define	MCRPON				0
-
-#define MITE_MAR		0x510
-
-#define MITE_DCR		0x514
-#define DCR_NORMAL			BIT(29)
-#define DCRPON				0
-
-#define MITE_DAR		0x518
-
-#define MITE_LKCR		0x51c
-
-#define MITE_LKAR		0x520
-#define MITE_LLKAR		0x524
-#define MITE_BAR		0x528
-#define MITE_BCR		0x52c
-#define MITE_SAR		0x530
-#define MITE_WSCR		0x534
-#define MITE_WSER		0x538
-#define MITE_CHSR		0x53c
-#define MITE_FCR		0x540
-
-#define MITE_FIFO		0x80
-#define MITE_FIFOEND		0xff
-
-#define MITE_AMRAM		        0x00
-#define MITE_AMDEVICE		        0x01
-#define MITE_AMHOST_A32_SINGLE	        0x09
-#define MITE_AMHOST_A24_SINGLE	        0x39
-#define MITE_AMHOST_A16_SINGLE	        0x29
-#define MITE_AMHOST_A32_BLOCK	        0x0b
-#define MITE_AMHOST_A32D64_BLOCK	0x08
-#define MITE_AMHOST_A24_BLOCK	        0x3b
-
-enum mite_registers {
-	MITE_IODWBSR = 0xc0,	// IO Device Window Base Size Register
-	MITE_CSIGR = 0x460,	// chip signature
-	MITE_IODWBSR_1 = 0xc4,	// IO Device Window Base Size Register 1 (used by 6602 boards)
-	MITE_IODWCR_1 = 0xf4
-};
-
-enum MITE_IODWBSR_bits {
-	WENAB = 0x80,		// window enable
-	WENAB_6602 = 0x8c	// window enable for 6602 boards
-};
-
-#endif
-
diff --git a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c b/drivers/staging/gpib/tnt4882/tnt4882_gpib.c
deleted file mode 100644
index c03a976b7380..000000000000
--- a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c
+++ /dev/null
@@ -1,1838 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/***************************************************************************
- * National Instruments boards using tnt4882 or compatible chips (at-gpib, etc).
- *    copyright            : (C) 2001, 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
-#define DRV_NAME KBUILD_MODNAME
-
-#include <linux/ioport.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/isapnp.h>
-
-#include "nec7210.h"
-#include "gpibP.h"
-#include "mite.h"
-#include "tnt4882_registers.h"
-
-static const int ISAPNP_VENDOR_ID_NI = ISAPNP_VENDOR('N', 'I', 'C');
-static const int ISAPNP_ID_NI_ATGPIB_TNT = 0xc601;
-enum {
-	PCI_DEVICE_ID_NI_GPIB = 0xc801,
-	PCI_DEVICE_ID_NI_GPIB_PLUS = 0xc811,
-	PCI_DEVICE_ID_NI_GPIB_PLUS2 = 0x71ad,
-	PCI_DEVICE_ID_NI_PXIGPIB = 0xc821,
-	PCI_DEVICE_ID_NI_PMCGPIB = 0xc831,
-	PCI_DEVICE_ID_NI_PCIEGPIB = 0x70cf,
-	PCI_DEVICE_ID_NI_PCIE2GPIB = 0x710e,
-// Measurement Computing PCI-488 same design as PCI-GPIB with TNT5004
-	PCI_DEVICE_ID_MC_PCI488 = 0x7259,
-	PCI_DEVICE_ID_CEC_NI_GPIB = 0x7258
-};
-
-// struct which defines private_data for tnt4882 devices
-struct tnt4882_priv {
-	struct nec7210_priv nec7210_priv;
-	struct mite_struct *mite;
-	struct pnp_dev *pnp_dev;
-	unsigned int irq;
-	unsigned short imr0_bits;
-	unsigned short imr3_bits;
-	unsigned short auxg_bits;	// bits written to auxiliary register G
-};
-
-static irqreturn_t tnt4882_internal_interrupt(struct gpib_board *board);
-
-// register offset for nec7210 compatible registers
-static const int atgpib_reg_offset = 2;
-
-// number of ioports used
-static const int atgpib_iosize = 32;
-
-/* paged io */
-static inline unsigned int tnt_paged_readb(struct tnt4882_priv *priv, unsigned long offset)
-{
-	iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset);
-	udelay(1);
-	return ioread8(priv->nec7210_priv.mmiobase + offset);
-}
-
-static inline void tnt_paged_writeb(struct tnt4882_priv *priv, unsigned int value,
-				    unsigned long offset)
-{
-	iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset);
-	udelay(1);
-	iowrite8(value, priv->nec7210_priv.mmiobase + offset);
-}
-
-/* readb/writeb wrappers */
-static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long offset)
-{
-	void __iomem *address = priv->nec7210_priv.mmiobase + offset;
-	unsigned long flags;
-	unsigned short retval;
-	spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock;
-
-	spin_lock_irqsave(register_lock, flags);
-	switch (offset) {
-	case CSR:
-	case SASR:
-	case ISR0:
-	case BSR:
-		switch (priv->nec7210_priv.type) {
-		case TNT4882:
-		case TNT5004:
-			retval = ioread8(address);
-			break;
-		case NAT4882:
-			retval = tnt_paged_readb(priv, offset - tnt_pagein_offset);
-			break;
-		case NEC7210:
-			retval = 0;
-			break;
-		default:
-			retval = 0;
-			break;
-		}
-		break;
-	default:
-		retval = ioread8(address);
-		break;
-	}
-	spin_unlock_irqrestore(register_lock, flags);
-	return retval;
-}
-
-static inline void tnt_writeb(struct tnt4882_priv *priv, unsigned short value, unsigned long offset)
-{
-	void __iomem *address = priv->nec7210_priv.mmiobase + offset;
-	unsigned long flags;
-	spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock;
-
-	spin_lock_irqsave(register_lock, flags);
-	switch (offset)	{
-	case KEYREG:
-	case IMR0:
-	case BCR:
-		switch (priv->nec7210_priv.type) {
-		case TNT4882:
-		case TNT5004:
-			iowrite8(value, address);
-			break;
-		case NAT4882:
-			tnt_paged_writeb(priv, value, offset - tnt_pagein_offset);
-			break;
-		case NEC7210:
-			break;
-		default:
-			break;
-		}
-		break;
-	default:
-		iowrite8(value, address);
-		break;
-	}
-	spin_unlock_irqrestore(register_lock, flags);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GPIB driver for National Instruments boards using tnt4882 or compatible chips");
-
-static int tnt4882_line_status(const struct gpib_board *board)
-{
-	int status = VALID_ALL;
-	int bcsr_bits;
-	struct tnt4882_priv *tnt_priv;
-
-	tnt_priv = board->private_data;
-
-	bcsr_bits = tnt_readb(tnt_priv, BSR);
-
-	if (bcsr_bits & BCSR_REN_BIT)
-		status |= BUS_REN;
-	if (bcsr_bits & BCSR_IFC_BIT)
-		status |= BUS_IFC;
-	if (bcsr_bits & BCSR_SRQ_BIT)
-		status |= BUS_SRQ;
-	if (bcsr_bits & BCSR_EOI_BIT)
-		status |= BUS_EOI;
-	if (bcsr_bits & BCSR_NRFD_BIT)
-		status |= BUS_NRFD;
-	if (bcsr_bits & BCSR_NDAC_BIT)
-		status |= BUS_NDAC;
-	if (bcsr_bits & BCSR_DAV_BIT)
-		status |= BUS_DAV;
-	if (bcsr_bits & BCSR_ATN_BIT)
-		status |= BUS_ATN;
-
-	return status;
-}
-
-static int tnt4882_t1_delay(struct gpib_board *board, unsigned int nano_sec)
-{
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-	unsigned int retval;
-
-	retval = nec7210_t1_delay(board, nec_priv, nano_sec);
-	if (nec_priv->type == NEC7210)
-		return retval;
-
-	if (nano_sec <= 350) {
-		tnt_writeb(tnt_priv, MSTD, KEYREG);
-		retval = 350;
-	} else {
-		tnt_writeb(tnt_priv, 0, KEYREG);
-	}
-	if (nano_sec > 500 && nano_sec <= 1100)	{
-		write_byte(nec_priv, AUXRI | USTD, AUXMR);
-		retval = 1100;
-	} else {
-		write_byte(nec_priv, AUXRI, AUXMR);
-	}
-	return retval;
-}
-
-static int fifo_word_available(struct tnt4882_priv *tnt_priv)
-{
-	int status2;
-	int retval;
-
-	status2 = tnt_readb(tnt_priv, STS2);
-	retval = (status2 & AEFN) && (status2 & BEFN);
-
-	return retval;
-}
-
-static int fifo_byte_available(struct tnt4882_priv *tnt_priv)
-{
-	int status2;
-	int retval;
-
-	status2 = tnt_readb(tnt_priv, STS2);
-	retval = (status2 & AEFN) || (status2 & BEFN);
-
-	return retval;
-}
-
-static int fifo_xfer_done(struct tnt4882_priv *tnt_priv)
-{
-	int status1;
-	int retval;
-
-	status1 = tnt_readb(tnt_priv, STS1);
-	retval = status1 & (S_DONE | S_HALT);
-
-	return retval;
-}
-
-static int drain_fifo_words(struct tnt4882_priv *tnt_priv, u8 *buffer, int num_bytes)
-{
-	int count = 0;
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-
-	while (fifo_word_available(tnt_priv) && count + 2 <= num_bytes)	{
-		short word;
-
-		word = ioread16(nec_priv->mmiobase + FIFOB);
-		buffer[count++] = word & 0xff;
-		buffer[count++] = (word >> 8) & 0xff;
-	}
-	return count;
-}
-
-static void tnt4882_release_holdoff(struct gpib_board *board, struct tnt4882_priv *tnt_priv)
-{
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-	unsigned short sasr_bits;
-
-	sasr_bits = tnt_readb(tnt_priv, SASR);
-
-	/*
-	 * tnt4882 not in one-chip mode won't always release holdoff unless we
-	 * are in the right mode when release handshake command is given
-	 */
-	if (sasr_bits & AEHS_BIT) /* holding off due to holdoff on end mode*/	{
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
-		write_byte(nec_priv, AUX_FH, AUXMR);
-	} else if (sasr_bits & ANHS1_BIT) { /* held off due to holdoff on all data mode*/
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDA);
-		write_byte(nec_priv, AUX_FH, AUXMR);
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
-	} else { /* held off due to holdoff immediately command */
-		nec7210_set_handshake_mode(board, nec_priv, HR_HLDE);
-		write_byte(nec_priv, AUX_FH, AUXMR);
-	}
-}
-
-static int tnt4882_accel_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-			      size_t *bytes_read)
-{
-	size_t count = 0;
-	ssize_t retval = 0;
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-	unsigned int bits;
-	s32 hw_count;
-	unsigned long flags;
-
-	*bytes_read = 0;
-	// FIXME: really, DEV_CLEAR_BN should happen elsewhere to prevent race
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
-	clear_bit(ADR_CHANGE_BN, &nec_priv->state);
-
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_ENDIE, HR_ENDIE);
-	if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, HR_DMAI);
-	else
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-	tnt_writeb(tnt_priv, nec_priv->auxa_bits | HR_HLDA, CCR);
-	bits = TNT_B_16BIT | TNT_IN | TNT_CCEN;
-	tnt_writeb(tnt_priv, bits, CFG);
-	tnt_writeb(tnt_priv, RESET_FIFO, CMDR);
-	udelay(1);
-	// load 2's complement of count into hardware counters
-	hw_count = -length;
-	tnt_writeb(tnt_priv, hw_count & 0xff, CNT0);
-	tnt_writeb(tnt_priv, (hw_count >> 8) & 0xff, CNT1);
-	tnt_writeb(tnt_priv, (hw_count >> 16) & 0xff, CNT2);
-	tnt_writeb(tnt_priv, (hw_count >> 24) & 0xff, CNT3);
-
-	tnt4882_release_holdoff(board, tnt_priv);
-
-	tnt_writeb(tnt_priv, GO, CMDR);
-	udelay(1);
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	tnt_priv->imr3_bits |= HR_DONE | HR_NEF;
-	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	while (count + 2 <= length &&
-	       test_bit(RECEIVED_END_BN, &nec_priv->state) == 0 &&
-	       fifo_xfer_done(tnt_priv) == 0) {
-		// wait until a word is ready
-		if (wait_event_interruptible(board->wait,
-					     fifo_word_available(tnt_priv) ||
-					     fifo_xfer_done(tnt_priv) ||
-					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					     test_bit(ADR_CHANGE_BN, &nec_priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		if (test_bit(TIMO_NUM, &board->status))	{
-			retval = -ETIMEDOUT;
-			break;
-		}
-		if (test_bit(DEV_CLEAR_BN, &nec_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-		if (test_bit(ADR_CHANGE_BN, &nec_priv->state)) {
-			retval = -EINTR;
-			break;
-		}
-
-		spin_lock_irqsave(&board->spinlock, flags);
-		count += drain_fifo_words(tnt_priv, &buffer[count], length - count);
-		tnt_priv->imr3_bits |= HR_NEF;
-		tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		if (need_resched())
-			schedule();
-	}
-	// wait for last byte
-	if (count < length) {
-		spin_lock_irqsave(&board->spinlock, flags);
-		tnt_priv->imr3_bits |= HR_DONE | HR_NEF;
-		tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		if (wait_event_interruptible(board->wait,
-					     fifo_xfer_done(tnt_priv) ||
-					     test_bit(RECEIVED_END_BN, &nec_priv->state) ||
-					     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-					     test_bit(ADR_CHANGE_BN, &nec_priv->state) ||
-					     test_bit(TIMO_NUM, &board->status))) {
-			retval = -ERESTARTSYS;
-		}
-		if (test_bit(TIMO_NUM, &board->status))
-			retval = -ETIMEDOUT;
-		if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-			retval = -EINTR;
-		if (test_bit(ADR_CHANGE_BN, &nec_priv->state))
-			retval = -EINTR;
-		count += drain_fifo_words(tnt_priv, &buffer[count], length - count);
-		if (fifo_byte_available(tnt_priv) && count < length)
-			buffer[count++] = tnt_readb(tnt_priv, FIFOB);
-	}
-	if (count < length)
-		tnt_writeb(tnt_priv, STOP, CMDR);
-	udelay(1);
-
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_ENDIE, 0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAI, 0);
-	/*
-	 * force handling of any pending interrupts (seems to be needed
-	 * to keep interrupts from getting hosed, plus for syncing
-	 * with RECEIVED_END below)
-	 */
-	tnt4882_internal_interrupt(board);
-	/* RECEIVED_END should be in sync now */
-	if (test_and_clear_bit(RECEIVED_END_BN, &nec_priv->state))
-		*end = 1;
-	if (retval < 0)	{
-		// force immediate holdoff
-		write_byte(nec_priv, AUX_HLDI, AUXMR);
-
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-	}
-	*bytes_read = count;
-
-	return retval;
-}
-
-static int fifo_space_available(struct tnt4882_priv *tnt_priv)
-{
-	int status2;
-	int retval;
-
-	status2 = tnt_readb(tnt_priv, STS2);
-	retval = (status2 & AFFN) && (status2 & BFFN);
-
-	return retval;
-}
-
-static unsigned int tnt_transfer_count(struct tnt4882_priv *tnt_priv)
-{
-	unsigned int count = 0;
-
-	count |= tnt_readb(tnt_priv, CNT0) & 0xff;
-	count |= (tnt_readb(tnt_priv, CNT1) << 8) & 0xff00;
-	count |= (tnt_readb(tnt_priv, CNT2) << 16) & 0xff0000;
-	count |= (tnt_readb(tnt_priv, CNT3) << 24) & 0xff000000;
-	// return two's complement
-	return -count;
-};
-
-static int write_wait(struct gpib_board *board, struct tnt4882_priv *tnt_priv,
-		      int wait_for_done, int send_commands)
-{
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-
-	if (wait_event_interruptible(board->wait,
-				     (!wait_for_done && fifo_space_available(tnt_priv)) ||
-				     fifo_xfer_done(tnt_priv) ||
-				     test_bit(BUS_ERROR_BN, &nec_priv->state) ||
-				     test_bit(DEV_CLEAR_BN, &nec_priv->state) ||
-				     test_bit(TIMO_NUM, &board->status)))
-		return -ERESTARTSYS;
-
-	if (test_bit(TIMO_NUM, &board->status))
-		return -ETIMEDOUT;
-	if (test_and_clear_bit(BUS_ERROR_BN, &nec_priv->state))
-		return (send_commands) ? -ENOTCONN : -ECOMM;
-	if (test_bit(DEV_CLEAR_BN, &nec_priv->state))
-		return -EINTR;
-	return 0;
-}
-
-static int generic_write(struct gpib_board *board, u8 *buffer, size_t length,
-			 int send_eoi, int send_commands, size_t *bytes_written)
-{
-	size_t count = 0;
-	ssize_t retval = 0;
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-	unsigned int bits;
-	s32 hw_count;
-	unsigned long flags;
-
-	*bytes_written = 0;
-	// FIXME: really, DEV_CLEAR_BN should happen elsewhere to prevent race
-	clear_bit(DEV_CLEAR_BN, &nec_priv->state);
-
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_ERRIE, HR_ERRIE);
-
-	if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, HR_DMAO);
-	else
-		nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0);
-
-	tnt_writeb(tnt_priv, RESET_FIFO, CMDR);
-	udelay(1);
-
-	bits = TNT_B_16BIT;
-	if (send_eoi) {
-		bits |= TNT_CCEN;
-		if (nec_priv->type != TNT4882 && nec_priv->type != TNT5004)
-			tnt_writeb(tnt_priv, AUX_SEOI, CCR);
-	}
-	if (send_commands)
-		bits |= TNT_COMMAND;
-	tnt_writeb(tnt_priv, bits, CFG);
-
-	// load 2's complement of count into hardware counters
-	hw_count = -length;
-	tnt_writeb(tnt_priv, hw_count & 0xff, CNT0);
-	tnt_writeb(tnt_priv, (hw_count >> 8) & 0xff, CNT1);
-	tnt_writeb(tnt_priv, (hw_count >> 16) & 0xff, CNT2);
-	tnt_writeb(tnt_priv, (hw_count >> 24) & 0xff, CNT3);
-
-	tnt_writeb(tnt_priv, GO, CMDR);
-	udelay(1);
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	tnt_priv->imr3_bits |= HR_DONE;
-	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-
-	while (count < length)	{
-		// wait until byte is ready to be sent
-		retval = write_wait(board, tnt_priv, 0, send_commands);
-		if (retval < 0)
-			break;
-		if (fifo_xfer_done(tnt_priv))
-			break;
-		spin_lock_irqsave(&board->spinlock, flags);
-		while (fifo_space_available(tnt_priv) && count < length) {
-			u16 word;
-
-			word = buffer[count++] & 0xff;
-			if (count < length)
-				word |= (buffer[count++] << 8) & 0xff00;
-			iowrite16(word, nec_priv->mmiobase + FIFOB);
-		}
-//  avoid unnecessary HR_NFF interrupts
-//		tnt_priv->imr3_bits |= HR_NFF;
-//		tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-		spin_unlock_irqrestore(&board->spinlock, flags);
-
-		if (need_resched())
-			schedule();
-	}
-	// wait last byte has been sent
-	if (retval == 0)
-		retval = write_wait(board, tnt_priv, 1, send_commands);
-
-	tnt_writeb(tnt_priv, STOP, CMDR);
-	udelay(1);
-
-	nec7210_set_reg_bits(nec_priv, IMR1, HR_ERR, 0x0);
-	nec7210_set_reg_bits(nec_priv, IMR2, HR_DMAO, 0x0);
-	/*
-	 * force handling of any interrupts that happened
-	 * while they were masked (this appears to be needed)
-	 */
-	tnt4882_internal_interrupt(board);
-	*bytes_written = length - tnt_transfer_count(tnt_priv);
-	return retval;
-}
-
-static int tnt4882_accel_write(struct gpib_board *board, u8 *buffer,
-			       size_t length, int send_eoi, size_t *bytes_written)
-{
-	return generic_write(board, buffer, length, send_eoi, 0, bytes_written);
-}
-
-static int tnt4882_command(struct gpib_board *board, u8 *buffer, size_t length,
-			   size_t *bytes_written)
-{
-	return generic_write(board, buffer, length, 0, 1, bytes_written);
-}
-
-static irqreturn_t tnt4882_internal_interrupt(struct gpib_board *board)
-{
-	struct tnt4882_priv *priv = board->private_data;
-	int isr0_bits, isr3_bits, imr3_bits;
-	unsigned long flags;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-
-	nec7210_interrupt(board, &priv->nec7210_priv);
-
-	isr0_bits = tnt_readb(priv, ISR0);
-	isr3_bits = tnt_readb(priv, ISR3);
-	imr3_bits = priv->imr3_bits;
-
-	if (isr0_bits & TNT_IFCI_BIT)
-		push_gpib_event(board, EVENT_IFC);
-	// XXX don't need this wakeup, one below should do?
-//		wake_up_interruptible(&board->wait);
-
-	if (isr3_bits & HR_NFF)
-		priv->imr3_bits &= ~HR_NFF;
-	if (isr3_bits & HR_NEF)
-		priv->imr3_bits &= ~HR_NEF;
-	if (isr3_bits & HR_DONE)
-		priv->imr3_bits &= ~HR_DONE;
-	if (isr3_bits & (HR_INTR | HR_TLCI)) {
-		dev_dbg(board->gpib_dev, "minor %i isr0 0x%x imr0 0x%x isr3 0x%x imr3 0x%x\n",
-			board->minor, isr0_bits, priv->imr0_bits, isr3_bits, imr3_bits);
-		tnt_writeb(priv, priv->imr3_bits, IMR3);
-		wake_up_interruptible(&board->wait);
-	}
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t tnt4882_interrupt(int irq, void *arg)
-{
-	return tnt4882_internal_interrupt(arg);
-}
-
-// wrappers for interface functions
-static int tnt4882_read(struct gpib_board *board, u8 *buffer, size_t length, int *end,
-			size_t *bytes_read)
-{
-	struct tnt4882_priv *priv = board->private_data;
-	struct nec7210_priv *nec_priv = &priv->nec7210_priv;
-	int retval;
-	int dummy;
-
-	retval = nec7210_read(board, &priv->nec7210_priv, buffer, length, end, bytes_read);
-
-	if (retval < 0)	{	// force immediate holdoff
-		write_byte(nec_priv, AUX_HLDI, AUXMR);
-
-		set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-
-		nec7210_read_data_in(board, nec_priv, &dummy);
-	}
-	return retval;
-}
-
-static int tnt4882_write(struct gpib_board *board, u8 *buffer, size_t length, int send_eoi,
-			 size_t *bytes_written)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_write(board, &priv->nec7210_priv, buffer, length, send_eoi, bytes_written);
-}
-
-static int tnt4882_command_unaccel(struct gpib_board *board, u8 *buffer,
-				   size_t length, size_t *bytes_written)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_command(board, &priv->nec7210_priv, buffer, length, bytes_written);
-}
-
-static int tnt4882_take_control(struct gpib_board *board, int synchronous)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_take_control(board, &priv->nec7210_priv, synchronous);
-}
-
-static int tnt4882_go_to_standby(struct gpib_board *board)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_go_to_standby(board, &priv->nec7210_priv);
-}
-
-static int tnt4882_request_system_control(struct gpib_board *board, int request_control)
-{
-	struct tnt4882_priv *priv = board->private_data;
-	int retval;
-
-	if (request_control) {
-		tnt_writeb(priv, SETSC, CMDR);
-		udelay(1);
-	}
-	retval = nec7210_request_system_control(board, &priv->nec7210_priv, request_control);
-	if (!request_control) {
-		tnt_writeb(priv, CLRSC, CMDR);
-		udelay(1);
-	}
-	return retval;
-}
-
-static void tnt4882_interface_clear(struct gpib_board *board, int assert)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_interface_clear(board, &priv->nec7210_priv, assert);
-}
-
-static void tnt4882_remote_enable(struct gpib_board *board, int enable)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_remote_enable(board, &priv->nec7210_priv, enable);
-}
-
-static int tnt4882_enable_eos(struct gpib_board *board, u8 eos_byte, int compare_8_bits)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_enable_eos(board, &priv->nec7210_priv, eos_byte, compare_8_bits);
-}
-
-static void tnt4882_disable_eos(struct gpib_board *board)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_disable_eos(board, &priv->nec7210_priv);
-}
-
-static unsigned int tnt4882_update_status(struct gpib_board *board, unsigned int clear_mask)
-{
-	unsigned long flags;
-	u8 line_status;
-	struct tnt4882_priv *priv = board->private_data;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	board->status &= ~clear_mask;
-	nec7210_update_status_nolock(board, &priv->nec7210_priv);
-	/* set / clear SRQ state since it is not cleared by interrupt */
-	line_status = tnt_readb(priv, BSR);
-	if (line_status & BCSR_SRQ_BIT)
-		set_bit(SRQI_NUM, &board->status);
-	else
-		clear_bit(SRQI_NUM, &board->status);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-	return board->status;
-}
-
-static int tnt4882_primary_address(struct gpib_board *board, unsigned int address)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_primary_address(board, &priv->nec7210_priv, address);
-}
-
-static int tnt4882_secondary_address(struct gpib_board *board, unsigned int address, int enable)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_secondary_address(board, &priv->nec7210_priv, address, enable);
-}
-
-static int tnt4882_parallel_poll(struct gpib_board *board, u8 *result)
-{
-	struct tnt4882_priv *tnt_priv = board->private_data;
-
-	if (tnt_priv->nec7210_priv.type != NEC7210) {
-		tnt_priv->auxg_bits |= RPP2_BIT;
-		write_byte(&tnt_priv->nec7210_priv, tnt_priv->auxg_bits, AUXMR);
-		udelay(2);	// FIXME use parallel poll timeout
-		*result = read_byte(&tnt_priv->nec7210_priv, CPTR);
-		tnt_priv->auxg_bits &= ~RPP2_BIT;
-		write_byte(&tnt_priv->nec7210_priv, tnt_priv->auxg_bits, AUXMR);
-		return 0;
-	} else {
-		return nec7210_parallel_poll(board, &tnt_priv->nec7210_priv, result);
-	}
-}
-
-static void tnt4882_parallel_poll_configure(struct gpib_board *board, u8 config)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	if (priv->nec7210_priv.type == TNT5004) {
-		/* configure locally */
-		write_byte(&priv->nec7210_priv, AUXRI | 0x4, AUXMR);
-		if (config)
-			/* set response + clear sense */
-			write_byte(&priv->nec7210_priv, PPR | config, AUXMR);
-		else
-			/* disable ppoll */
-			write_byte(&priv->nec7210_priv, PPR | 0x10, AUXMR);
-	} else {
-		nec7210_parallel_poll_configure(board, &priv->nec7210_priv, config);
-	}
-}
-
-static void tnt4882_parallel_poll_response(struct gpib_board *board, int ist)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_parallel_poll_response(board, &priv->nec7210_priv, ist);
-}
-
-/*
- * this is just used by the old nec7210 isa interfaces, the newer
- * boards use tnt4882_serial_poll_response2
- */
-static void tnt4882_serial_poll_response(struct gpib_board *board, u8 status)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_serial_poll_response(board, &priv->nec7210_priv, status);
-}
-
-static void tnt4882_serial_poll_response2(struct gpib_board *board, u8 status,
-					  int new_reason_for_service)
-{
-	struct tnt4882_priv *priv = board->private_data;
-	unsigned long flags;
-	const int MSS = status & request_service_bit;
-	const int reqt = MSS && new_reason_for_service;
-	const int reqf = MSS == 0;
-
-	spin_lock_irqsave(&board->spinlock, flags);
-	if (reqt) {
-		priv->nec7210_priv.srq_pending = 1;
-		clear_bit(SPOLL_NUM, &board->status);
-	} else {
-		if (reqf)
-			priv->nec7210_priv.srq_pending = 0;
-	}
-	if (reqt)
-		/*
-		 * It may seem like a race to issue reqt before updating
-		 * the status byte, but it is not.  The chip does not
-		 * issue the reqt until the SPMR is written to at
-		 * a later time.
-		 */
-		write_byte(&priv->nec7210_priv, AUX_REQT, AUXMR);
-	else if (reqf)
-		write_byte(&priv->nec7210_priv, AUX_REQF, AUXMR);
-	/*
-	 * We need to always zero bit 6 of the status byte before writing it to
-	 * the SPMR to insure we are using
-	 * serial poll mode SP1, and not accidentally triggering mode SP3.
-	 */
-	write_byte(&priv->nec7210_priv, status & ~request_service_bit, SPMR);
-	spin_unlock_irqrestore(&board->spinlock, flags);
-}
-
-static u8 tnt4882_serial_poll_status(struct gpib_board *board)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	return nec7210_serial_poll_status(board, &priv->nec7210_priv);
-}
-
-static void tnt4882_return_to_local(struct gpib_board *board)
-{
-	struct tnt4882_priv *priv = board->private_data;
-
-	nec7210_return_to_local(board, &priv->nec7210_priv);
-}
-
-static void tnt4882_board_reset(struct tnt4882_priv *tnt_priv, struct gpib_board *board)
-{
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-
-	tnt_priv->imr0_bits = 0;
-	tnt_writeb(tnt_priv, tnt_priv->imr0_bits, IMR0);
-	tnt_priv->imr3_bits = 0;
-	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-	tnt_readb(tnt_priv, IMR0);
-	tnt_readb(tnt_priv, IMR3);
-	nec7210_board_reset(nec_priv, board);
-}
-
-static int tnt4882_allocate_private(struct gpib_board *board)
-{
-	struct tnt4882_priv *tnt_priv;
-
-	board->private_data = kmalloc(sizeof(struct tnt4882_priv), GFP_KERNEL);
-	if (!board->private_data)
-		return -1;
-	tnt_priv = board->private_data;
-	memset(tnt_priv, 0, sizeof(struct tnt4882_priv));
-	init_nec7210_private(&tnt_priv->nec7210_priv);
-	return 0;
-}
-
-static void tnt4882_free_private(struct gpib_board *board)
-{
-	kfree(board->private_data);
-	board->private_data = NULL;
-}
-
-static void tnt4882_init(struct tnt4882_priv *tnt_priv, const struct gpib_board *board)
-{
-	struct nec7210_priv *nec_priv = &tnt_priv->nec7210_priv;
-
-	/* Turbo488 software reset */
-	tnt_writeb(tnt_priv, SOFT_RESET, CMDR);
-	udelay(1);
-
-	// turn off one-chip mode
-	tnt_writeb(tnt_priv, NODMA, HSSEL);
-	tnt_writeb(tnt_priv, 0, ACCWR);
-	// make sure we are in 7210 mode
-	tnt_writeb(tnt_priv, AUX_7210, AUXCR);
-	udelay(1);
-	// registers might be swapped, so write it to the swapped address too
-	tnt_writeb(tnt_priv, AUX_7210, SWAPPED_AUXCR);
-	udelay(1);
-	// turn on one-chip mode
-	if (nec_priv->type == TNT4882 || nec_priv->type == TNT5004)
-		tnt_writeb(tnt_priv, NODMA | TNT_ONE_CHIP_BIT, HSSEL);
-	else
-		tnt_writeb(tnt_priv, NODMA, HSSEL);
-
-	nec7210_board_reset(nec_priv, board);
-	// read-clear isr0
-	tnt_readb(tnt_priv, ISR0);
-
-	// enable passing of nat4882 interrupts
-	tnt_priv->imr3_bits = HR_TLCI;
-	tnt_writeb(tnt_priv, tnt_priv->imr3_bits, IMR3);
-
-	// enable interrupt
-	tnt_writeb(tnt_priv, 0x1, INTRT);
-
-	// force immediate holdoff
-	write_byte(&tnt_priv->nec7210_priv, AUX_HLDI, AUXMR);
-
-	set_bit(RFD_HOLDOFF_BN, &nec_priv->state);
-
-	tnt_priv->auxg_bits = AUXRG | NTNL_BIT;
-	write_byte(&tnt_priv->nec7210_priv, tnt_priv->auxg_bits, AUXMR);
-
-	nec7210_board_online(nec_priv, board);
-	// enable interface clear interrupt for event queue
-	tnt_priv->imr0_bits = TNT_IMR0_ALWAYS_BITS | TNT_ATNI_BIT | TNT_IFCIE_BIT;
-	tnt_writeb(tnt_priv, tnt_priv->imr0_bits, IMR0);
-}
-
-static int ni_pci_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct tnt4882_priv *tnt_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = IRQF_SHARED;
-	int retval;
-	struct mite_struct *mite;
-
-	board->status = 0;
-
-	if (tnt4882_allocate_private(board))
-		return -ENOMEM;
-	tnt_priv = board->private_data;
-	nec_priv = &tnt_priv->nec7210_priv;
-	nec_priv->type = TNT4882;
-	nec_priv->read_byte = nec7210_locking_iomem_read_byte;
-	nec_priv->write_byte = nec7210_locking_iomem_write_byte;
-	nec_priv->offset = atgpib_reg_offset;
-
-	if (!mite_devices)
-		return -ENODEV;
-
-	for (mite = mite_devices; mite; mite = mite->next) {
-		short found_board;
-
-		if (mite->used)
-			continue;
-		if (config->pci_bus >= 0 && config->pci_bus != mite->pcidev->bus->number)
-			continue;
-		if (config->pci_slot >= 0 && config->pci_slot != PCI_SLOT(mite->pcidev->devfn))
-			continue;
-		switch (mite_device_id(mite)) {
-		case PCI_DEVICE_ID_NI_GPIB:
-		case PCI_DEVICE_ID_NI_GPIB_PLUS:
-		case PCI_DEVICE_ID_NI_GPIB_PLUS2:
-		case PCI_DEVICE_ID_NI_PXIGPIB:
-		case PCI_DEVICE_ID_NI_PMCGPIB:
-		case PCI_DEVICE_ID_NI_PCIEGPIB:
-		case PCI_DEVICE_ID_NI_PCIE2GPIB:
-// support for Measurement Computing PCI-488
-		case PCI_DEVICE_ID_MC_PCI488:
-		case PCI_DEVICE_ID_CEC_NI_GPIB:
-			found_board = 1;
-			break;
-		default:
-			found_board = 0;
-			break;
-		}
-		if (found_board)
-			break;
-	}
-	if (!mite)
-		return -ENODEV;
-
-	tnt_priv->mite = mite;
-	retval = mite_setup(tnt_priv->mite);
-	if (retval < 0)
-		return retval;
-
-	nec_priv->mmiobase = tnt_priv->mite->daq_io_addr;
-
-	// get irq
-	retval = request_irq(mite_irq(tnt_priv->mite), tnt4882_interrupt, isr_flags, "ni-pci-gpib",
-			     board);
-	if (retval) {
-		dev_err(board->gpib_dev, "failed to obtain pci irq %d\n", mite_irq(tnt_priv->mite));
-		return retval;
-	}
-	tnt_priv->irq = mite_irq(tnt_priv->mite);
-
-	// TNT5004 detection
-	switch (tnt_readb(tnt_priv, CSR) & 0xf0) {
-	case 0x30:
-		nec_priv->type = TNT4882;
-		break;
-	case 0x40:
-		nec_priv->type = TNT5004;
-		break;
-	}
-	tnt4882_init(tnt_priv, board);
-
-	return 0;
-}
-
-static void ni_pci_detach(struct gpib_board *board)
-{
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (tnt_priv) {
-		nec_priv = &tnt_priv->nec7210_priv;
-
-		if (nec_priv->mmiobase)
-			tnt4882_board_reset(tnt_priv, board);
-		if (tnt_priv->irq)
-			free_irq(tnt_priv->irq, board);
-		if (tnt_priv->mite)
-			mite_unsetup(tnt_priv->mite);
-	}
-	tnt4882_free_private(board);
-}
-
-static int ni_isapnp_find(struct pnp_dev **dev)
-{
-	*dev = pnp_find_dev(NULL, ISAPNP_VENDOR_ID_NI,
-			    ISAPNP_FUNCTION(ISAPNP_ID_NI_ATGPIB_TNT), NULL);
-	if (!*dev || !(*dev)->card)
-		return -ENODEV;
-	if (pnp_device_attach(*dev) < 0)
-		return -EBUSY;
-	if (pnp_activate_dev(*dev) < 0)	{
-		pnp_device_detach(*dev);
-		return -EAGAIN;
-	}
-	if (!pnp_port_valid(*dev, 0) || !pnp_irq_valid(*dev, 0)) {
-		pnp_device_detach(*dev);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int ni_isa_attach_common(struct gpib_board *board, const struct gpib_board_config *config,
-				enum nec7210_chipset chipset)
-{
-	struct tnt4882_priv *tnt_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = 0;
-	u32 iobase;
-	int irq;
-	int retval;
-
-	board->status = 0;
-
-	if (tnt4882_allocate_private(board))
-		return -ENOMEM;
-	tnt_priv = board->private_data;
-	nec_priv = &tnt_priv->nec7210_priv;
-	nec_priv->type = chipset;
-	nec_priv->read_byte = nec7210_locking_ioport_read_byte;
-	nec_priv->write_byte = nec7210_locking_ioport_write_byte;
-	nec_priv->offset = atgpib_reg_offset;
-
-	// look for plug-n-play board
-	if (config->ibbase == 0) {
-		struct pnp_dev *dev;
-
-		retval = ni_isapnp_find(&dev);
-		if (retval < 0)
-			return retval;
-		tnt_priv->pnp_dev = dev;
-		iobase = pnp_port_start(dev, 0);
-		irq = pnp_irq(dev, 0);
-	} else {
-		iobase = config->ibbase;
-		irq = config->ibirq;
-	}
-	// allocate ioports
-	if (!request_region(iobase, atgpib_iosize, "atgpib"))
-		return -EBUSY;
-
-	nec_priv->mmiobase = ioport_map(iobase, atgpib_iosize);
-	if (!nec_priv->mmiobase)
-		return -EBUSY;
-
-	// get irq
-	retval = request_irq(irq, tnt4882_interrupt, isr_flags, "atgpib", board);
-	if (retval) {
-		dev_err(board->gpib_dev, "failed to request ISA irq %d\n", irq);
-		return retval;
-	}
-	tnt_priv->irq = irq;
-
-	tnt4882_init(tnt_priv, board);
-
-	return 0;
-}
-
-static int ni_tnt_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return ni_isa_attach_common(board, config, TNT4882);
-}
-
-static int ni_nat4882_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return ni_isa_attach_common(board, config, NAT4882);
-}
-
-static int ni_nec_isa_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	return ni_isa_attach_common(board, config, NEC7210);
-}
-
-static void ni_isa_detach(struct gpib_board *board)
-{
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (tnt_priv) {
-		nec_priv = &tnt_priv->nec7210_priv;
-		if (nec_priv->iobase)
-			tnt4882_board_reset(tnt_priv, board);
-		if (tnt_priv->irq)
-			free_irq(tnt_priv->irq, board);
-		if (nec_priv->mmiobase)
-			ioport_unmap(nec_priv->mmiobase);
-		if (nec_priv->iobase)
-			release_region(nec_priv->iobase, atgpib_iosize);
-		if (tnt_priv->pnp_dev)
-			pnp_device_detach(tnt_priv->pnp_dev);
-	}
-	tnt4882_free_private(board);
-}
-
-static int tnt4882_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return 0;
-}
-
-static struct gpib_interface ni_pci_interface = {
-	.name = "ni_pci",
-	.attach = ni_pci_attach,
-	.detach = ni_pci_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_pci_accel_interface = {
-	.name = "ni_pci_accel",
-	.attach = ni_pci_attach,
-	.detach = ni_pci_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_isa_interface = {
-	.name = "ni_isa",
-	.attach = ni_tnt_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_nat4882_isa_interface = {
-	.name = "ni_nat4882_isa",
-	.attach = ni_nat4882_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_read,
-	.write = tnt4882_write,
-	.command = tnt4882_command_unaccel,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_nec_isa_interface = {
-	.name = "ni_nec_isa",
-	.attach = ni_nec_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_read,
-	.write = tnt4882_write,
-	.command = tnt4882_command_unaccel,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = NULL,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response = tnt4882_serial_poll_response,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_isa_accel_interface = {
-	.name = "ni_isa_accel",
-	.attach = ni_tnt_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_nat4882_isa_accel_interface = {
-	.name = "ni_nat4882_isa_accel",
-	.attach = ni_nat4882_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command_unaccel,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response2 = tnt4882_serial_poll_response2,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_nec_isa_accel_interface = {
-	.name = "ni_nec_isa_accel",
-	.attach = ni_nec_isa_attach,
-	.detach = ni_isa_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command_unaccel,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = NULL,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response = tnt4882_serial_poll_response,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static const struct pci_device_id tnt4882_pci_table[] = {
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB_PLUS)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_GPIB_PLUS2)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PXIGPIB)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PMCGPIB)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PCIEGPIB)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_NI_PCIE2GPIB)},
-	// support for Measurement Computing PCI-488
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_MC_PCI488)},
-	{PCI_DEVICE(PCI_VENDOR_ID_NATINST, PCI_DEVICE_ID_CEC_NI_GPIB)},
-	{ 0 }
-};
-MODULE_DEVICE_TABLE(pci, tnt4882_pci_table);
-
-static struct pci_driver tnt4882_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = tnt4882_pci_table,
-	.probe = &tnt4882_pci_probe
-};
-
-#if 0
-/* unused, will be needed when the driver is turned into a pnp_driver */
-static const struct pnp_device_id tnt4882_pnp_table[] = {
-	{.id = "NICC601"},
-	{.id = ""}
-};
-MODULE_DEVICE_TABLE(pnp, tnt4882_pnp_table);
-#endif
-
-#ifdef CONFIG_GPIB_PCMCIA
-static struct gpib_interface ni_pcmcia_interface;
-static struct gpib_interface ni_pcmcia_accel_interface;
-static int __init init_ni_gpib_cs(void);
-static void __exit exit_ni_gpib_cs(void);
-#endif
-
-static int __init tnt4882_init_module(void)
-{
-	int result;
-
-	result = pci_register_driver(&tnt4882_pci_driver);
-	if (result) {
-		pr_err("pci_register_driver failed: error = %d\n", result);
-		return result;
-	}
-
-	result = gpib_register_driver(&ni_isa_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_isa;
-	}
-
-	result = gpib_register_driver(&ni_isa_accel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_isa_accel;
-	}
-
-	result = gpib_register_driver(&ni_nat4882_isa_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_nat4882_isa;
-	}
-
-	result = gpib_register_driver(&ni_nat4882_isa_accel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_nat4882_isa_accel;
-	}
-
-	result = gpib_register_driver(&ni_nec_isa_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_nec_isa;
-	}
-
-	result = gpib_register_driver(&ni_nec_isa_accel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_nec_isa_accel;
-	}
-
-	result = gpib_register_driver(&ni_pci_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pci;
-	}
-
-	result = gpib_register_driver(&ni_pci_accel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pci_accel;
-	}
-
-#ifdef CONFIG_GPIB_PCMCIA
-	result = gpib_register_driver(&ni_pcmcia_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pcmcia;
-	}
-
-	result = gpib_register_driver(&ni_pcmcia_accel_interface, THIS_MODULE);
-	if (result) {
-		pr_err("gpib_register_driver failed: error = %d\n", result);
-		goto err_pcmcia_accel;
-	}
-
-	result = init_ni_gpib_cs();
-	if (result) {
-		pr_err("pcmcia_register_driver failed: error = %d\n", result);
-		goto err_pcmcia_driver;
-	}
-#endif
-
-	mite_init();
-
-	return 0;
-
-#ifdef CONFIG_GPIB_PCMCIA
-err_pcmcia_driver:
-	gpib_unregister_driver(&ni_pcmcia_accel_interface);
-err_pcmcia_accel:
-	gpib_unregister_driver(&ni_pcmcia_interface);
-err_pcmcia:
-#endif
-	gpib_unregister_driver(&ni_pci_accel_interface);
-err_pci_accel:
-	gpib_unregister_driver(&ni_pci_interface);
-err_pci:
-	gpib_unregister_driver(&ni_nec_isa_accel_interface);
-err_nec_isa_accel:
-	gpib_unregister_driver(&ni_nec_isa_interface);
-err_nec_isa:
-	gpib_unregister_driver(&ni_nat4882_isa_accel_interface);
-err_nat4882_isa_accel:
-	gpib_unregister_driver(&ni_nat4882_isa_interface);
-err_nat4882_isa:
-	gpib_unregister_driver(&ni_isa_accel_interface);
-err_isa_accel:
-	gpib_unregister_driver(&ni_isa_interface);
-err_isa:
-	pci_unregister_driver(&tnt4882_pci_driver);
-
-	return result;
-}
-
-static void __exit tnt4882_exit_module(void)
-{
-	gpib_unregister_driver(&ni_isa_interface);
-	gpib_unregister_driver(&ni_isa_accel_interface);
-	gpib_unregister_driver(&ni_nat4882_isa_interface);
-	gpib_unregister_driver(&ni_nat4882_isa_accel_interface);
-	gpib_unregister_driver(&ni_nec_isa_interface);
-	gpib_unregister_driver(&ni_nec_isa_accel_interface);
-	gpib_unregister_driver(&ni_pci_interface);
-	gpib_unregister_driver(&ni_pci_accel_interface);
-#ifdef CONFIG_GPIB_PCMCIA
-	gpib_unregister_driver(&ni_pcmcia_interface);
-	gpib_unregister_driver(&ni_pcmcia_accel_interface);
-	exit_ni_gpib_cs();
-#endif
-
-	mite_cleanup();
-
-	pci_unregister_driver(&tnt4882_pci_driver);
-}
-
-#ifdef CONFIG_GPIB_PCMCIA
-
-#include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/ptrace.h>
-#include <linux/timer.h>
-#include <linux/io.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/cisreg.h>
-#include <pcmcia/ds.h>
-
-static int ni_gpib_config(struct pcmcia_device  *link);
-static void ni_gpib_release(struct pcmcia_device *link);
-static void ni_pcmcia_detach(struct gpib_board *board);
-
-/*
- * A linked list of "instances" of the dummy device.  Each actual
- * PCMCIA card corresponds to one device instance, and is described
- * by one dev_link_t structure (defined in ds.h).
- *
- * You may not want to use a linked list for this -- for example, the
- * memory card driver uses an array of dev_link_t pointers, where minor
- * device numbers are used to derive the corresponding array index.
- *
- * I think this dev_list is obsolete but the pointer is needed to keep
- * the module instance for the ni_pcmcia_attach function.
- */
-
-static struct pcmcia_device   *curr_dev;
-
-struct local_info_t {
-	struct pcmcia_device	*p_dev;
-	struct gpib_board		*dev;
-	int			stop;
-	struct bus_operations	*bus;
-};
-
-/*
- * ni_gpib_probe() creates an "instance" of the driver, allocating
- * local data structures for one device.  The device is registered
- * with Card Services.
- */
-
-static int ni_gpib_probe(struct pcmcia_device *link)
-{
-	struct local_info_t *info;
-	//struct struct gpib_board *dev;
-
-	/* Allocate space for private device-specific data */
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	info->p_dev = link;
-	link->priv = info;
-
-	/*
-	 * General socket configuration defaults can go here.  In this
-	 * client, we assume very little, and rely on the CIS for almost
-	 * everything.  In most clients, many details (i.e., number, sizes,
-	 * and attributes of IO windows) are fixed by the nature of the
-	 * device, and can be hard-wired here.
-	 */
-	link->config_flags = CONF_ENABLE_IRQ | CONF_AUTO_SET_IO;
-
-	/* Register with Card Services */
-	curr_dev = link;
-	return ni_gpib_config(link);
-}
-
-/*
- * This deletes a driver "instance".  The device is de-registered
- * with Card Services.  If it has been released, all local data
- * structures are freed.  Otherwise, the structures will be freed
- * when the device is released.
- */
-static void ni_gpib_remove(struct pcmcia_device *link)
-{
-	struct local_info_t *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (info->dev)
-		ni_pcmcia_detach(info->dev);
-	ni_gpib_release(link);
-
-	//free_netdev(dev);
-	kfree(info);
-}
-
-static int ni_gpib_config_iteration(struct pcmcia_device *link,	void *priv_data)
-{
-	int retval;
-
-	retval = pcmcia_request_io(link);
-	if (retval != 0)
-		return retval;
-
-	return 0;
-}
-
-/*
- * ni_gpib_config() is scheduled to run after a CARD_INSERTION event
- * is received, to configure the PCMCIA socket, and to make the
- * device available to the system.
- */
-static int ni_gpib_config(struct pcmcia_device *link)
-{
-	//struct local_info_t *info = link->priv;
-	//struct gpib_board *dev = info->dev;
-	int last_ret;
-
-	last_ret = pcmcia_loop_config(link, &ni_gpib_config_iteration, NULL);
-	if (last_ret) {
-		dev_warn(&link->dev, "no configuration found\n");
-		ni_gpib_release(link);
-		return last_ret;
-	}
-
-	last_ret = pcmcia_enable_device(link);
-	if (last_ret) {
-		ni_gpib_release(link);
-		return last_ret;
-	}
-	return 0;
-} /* ni_gpib_config */
-
-/*
- * After a card is removed, ni_gpib_release() will unregister the
- * device, and release the PCMCIA configuration.  If the device is
- * still open, this will be postponed until it is closed.
- */
-static void ni_gpib_release(struct pcmcia_device *link)
-{
-	pcmcia_disable_device(link);
-} /* ni_gpib_release */
-
-static int ni_gpib_suspend(struct pcmcia_device *link)
-{
-	//struct local_info_t *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	if (link->open)
-		dev_warn(&link->dev, "Device still open\n");
-	//netif_device_detach(dev);
-
-	return 0;
-}
-
-static int ni_gpib_resume(struct pcmcia_device *link)
-{
-	//struct local_info_t *info = link->priv;
-	//struct struct gpib_board *dev = info->dev;
-
-	/*if (link->open) {
-	 *	ni_gpib_probe(dev);	/ really?
-	 *	//netif_device_attach(dev);
-	 *}
-	 */
-	return ni_gpib_config(link);
-}
-
-static struct pcmcia_device_id ni_pcmcia_ids[] = {
-	PCMCIA_DEVICE_MANF_CARD(0x010b, 0x4882),
-	PCMCIA_DEVICE_MANF_CARD(0x010b, 0x0c71), // NI PCMCIA-GPIB+
-	PCMCIA_DEVICE_NULL
-};
-
-MODULE_DEVICE_TABLE(pcmcia, ni_pcmcia_ids);
-
-static struct pcmcia_driver ni_gpib_cs_driver = {
-	.name           = "ni_gpib_cs",
-	.owner		= THIS_MODULE,
-	.drv = { .name = "ni_gpib_cs", },
-	.id_table	= ni_pcmcia_ids,
-	.probe		= ni_gpib_probe,
-	.remove		= ni_gpib_remove,
-	.suspend	= ni_gpib_suspend,
-	.resume		= ni_gpib_resume,
-};
-
-static int __init init_ni_gpib_cs(void)
-{
-	return pcmcia_register_driver(&ni_gpib_cs_driver);
-}
-
-static void __exit exit_ni_gpib_cs(void)
-{
-	pcmcia_unregister_driver(&ni_gpib_cs_driver);
-}
-
-static const int pcmcia_gpib_iosize = 32;
-
-static int ni_pcmcia_attach(struct gpib_board *board, const struct gpib_board_config *config)
-{
-	struct local_info_t *info;
-	struct tnt4882_priv *tnt_priv;
-	struct nec7210_priv *nec_priv;
-	int isr_flags = IRQF_SHARED;
-	int retval;
-
-	if (!curr_dev)
-		return -ENODEV;
-
-	info = curr_dev->priv;
-	info->dev = board;
-
-	board->status = 0;
-
-	if (tnt4882_allocate_private(board))
-		return -ENOMEM;
-
-	tnt_priv = board->private_data;
-	nec_priv = &tnt_priv->nec7210_priv;
-	nec_priv->type = TNT4882;
-	nec_priv->read_byte = nec7210_locking_ioport_read_byte;
-	nec_priv->write_byte = nec7210_locking_ioport_write_byte;
-	nec_priv->offset = atgpib_reg_offset;
-
-	if (!request_region(curr_dev->resource[0]->start, resource_size(curr_dev->resource[0]),
-			    DRV_NAME))
-		return -ENOMEM;
-
-	nec_priv->mmiobase = ioport_map(curr_dev->resource[0]->start,
-					resource_size(curr_dev->resource[0]));
-	if (!nec_priv->mmiobase)
-		return -ENOMEM;
-
-	// get irq
-	retval = request_irq(curr_dev->irq, tnt4882_interrupt, isr_flags, DRV_NAME, board);
-	if (retval) {
-		dev_err(board->gpib_dev, "failed to obtain PCMCIA irq %d\n", curr_dev->irq);
-		return retval;
-	}
-	tnt_priv->irq = curr_dev->irq;
-
-	tnt4882_init(tnt_priv, board);
-
-	return 0;
-}
-
-static void ni_pcmcia_detach(struct gpib_board *board)
-{
-	struct tnt4882_priv *tnt_priv = board->private_data;
-	struct nec7210_priv *nec_priv;
-
-	if (tnt_priv) {
-		nec_priv = &tnt_priv->nec7210_priv;
-		if (tnt_priv->irq)
-			free_irq(tnt_priv->irq, board);
-		if (nec_priv->mmiobase)
-			ioport_unmap(nec_priv->mmiobase);
-		if (nec_priv->iobase) {
-			tnt4882_board_reset(tnt_priv, board);
-			release_region(nec_priv->iobase, pcmcia_gpib_iosize);
-		}
-	}
-	tnt4882_free_private(board);
-}
-
-static struct gpib_interface ni_pcmcia_interface = {
-	.name = "ni_pcmcia",
-	.attach = ni_pcmcia_attach,
-	.detach = ni_pcmcia_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response = tnt4882_serial_poll_response,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-static struct gpib_interface ni_pcmcia_accel_interface = {
-	.name = "ni_pcmcia_accel",
-	.attach = ni_pcmcia_attach,
-	.detach = ni_pcmcia_detach,
-	.read = tnt4882_accel_read,
-	.write = tnt4882_accel_write,
-	.command = tnt4882_command,
-	.take_control = tnt4882_take_control,
-	.go_to_standby = tnt4882_go_to_standby,
-	.request_system_control = tnt4882_request_system_control,
-	.interface_clear = tnt4882_interface_clear,
-	.remote_enable = tnt4882_remote_enable,
-	.enable_eos = tnt4882_enable_eos,
-	.disable_eos = tnt4882_disable_eos,
-	.parallel_poll = tnt4882_parallel_poll,
-	.parallel_poll_configure = tnt4882_parallel_poll_configure,
-	.parallel_poll_response = tnt4882_parallel_poll_response,
-	.local_parallel_poll_mode = NULL, // XXX
-	.line_status = tnt4882_line_status,
-	.update_status = tnt4882_update_status,
-	.primary_address = tnt4882_primary_address,
-	.secondary_address = tnt4882_secondary_address,
-	.serial_poll_response = tnt4882_serial_poll_response,
-	.serial_poll_status = tnt4882_serial_poll_status,
-	.t1_delay = tnt4882_t1_delay,
-	.return_to_local = tnt4882_return_to_local,
-};
-
-#endif	// CONFIG_GPIB_PCMCIA
-
-module_init(tnt4882_init_module);
-module_exit(tnt4882_exit_module);
diff --git a/drivers/staging/gpib/uapi/gpib.h b/drivers/staging/gpib/uapi/gpib.h
deleted file mode 100644
index 2a7f5eeb9777..000000000000
--- a/drivers/staging/gpib/uapi/gpib.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-
-/***************************************************************************
- *    copyright		   : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_H
-#define _GPIB_H
-
-#define GPIB_MAX_NUM_BOARDS 16
-#define GPIB_MAX_NUM_DESCRIPTORS 0x1000
-
-enum ibsta_bit_numbers {
-	DCAS_NUM = 0,
-	DTAS_NUM = 1,
-	LACS_NUM = 2,
-	TACS_NUM = 3,
-	ATN_NUM = 4,
-	CIC_NUM = 5,
-	REM_NUM = 6,
-	LOK_NUM = 7,
-	CMPL_NUM = 8,
-	EVENT_NUM = 9,
-	SPOLL_NUM = 10,
-	RQS_NUM = 11,
-	SRQI_NUM = 12,
-	END_NUM = 13,
-	TIMO_NUM = 14,
-	ERR_NUM = 15
-};
-
-/* IBSTA status bits (returned by all functions) */
-enum ibsta_bits {
-	DCAS = (1 << DCAS_NUM),	/* device clear state */
-	DTAS = (1 << DTAS_NUM),	/* device trigger state */
-	LACS = (1 <<  LACS_NUM),	/* GPIB interface is addressed as Listener */
-	TACS = (1 <<  TACS_NUM),	/* GPIB interface is addressed as Talker */
-	ATN = (1 <<  ATN_NUM),	/* Attention is asserted */
-	CIC = (1 <<  CIC_NUM),	/* GPIB interface is Controller-in-Charge */
-	REM = (1 << REM_NUM),	/* remote state */
-	LOK = (1 << LOK_NUM),	/* lockout state */
-	CMPL = (1 <<  CMPL_NUM),	/* I/O is complete  */
-	EVENT = (1 << EVENT_NUM),	/* DCAS, DTAS, or IFC has occurred */
-	SPOLL = (1 << SPOLL_NUM),	/* board serial polled by busmaster */
-	RQS = (1 <<  RQS_NUM),	/* Device requesting service  */
-	SRQI = (1 << SRQI_NUM),	/* SRQ is asserted */
-	END = (1 << END_NUM),	/* EOI or EOS encountered */
-	TIMO = (1 << TIMO_NUM),	/* Time limit on I/O or wait function exceeded */
-	ERR = (1 << ERR_NUM),	/* Function call terminated on error */
-
-	device_status_mask = ERR | TIMO | END | CMPL | RQS,
-	board_status_mask = ERR | TIMO | END | CMPL | SPOLL |
-		EVENT | LOK | REM | CIC | ATN | TACS | LACS | DTAS | DCAS | SRQI,
-};
-
-/* End-of-string (EOS) modes for use with ibeos */
-
-enum eos_flags {
-	EOS_MASK = 0x1c00,
-	REOS = 0x0400,		/* Terminate reads on EOS	*/
-	XEOS = 0x800,	/* assert EOI when EOS char is sent */
-	BIN = 0x1000		/* Do 8-bit compare on EOS	*/
-};
-
-/* GPIB Bus Control Lines bit vector */
-enum bus_control_line {
-	VALID_DAV = 0x01,
-	VALID_NDAC = 0x02,
-	VALID_NRFD = 0x04,
-	VALID_IFC = 0x08,
-	VALID_REN = 0x10,
-	VALID_SRQ = 0x20,
-	VALID_ATN = 0x40,
-	VALID_EOI = 0x80,
-	VALID_ALL = 0xff,
-	BUS_DAV = 0x0100,		/* DAV	line status bit */
-	BUS_NDAC = 0x0200,		/* NDAC line status bit */
-	BUS_NRFD = 0x0400,		/* NRFD line status bit */
-	BUS_IFC = 0x0800,		/* IFC	line status bit */
-	BUS_REN = 0x1000,		/* REN	line status bit */
-	BUS_SRQ = 0x2000,		/* SRQ	line status bit */
-	BUS_ATN = 0x4000,		/* ATN	line status bit */
-	BUS_EOI = 0x8000		/* EOI	line status bit */
-};
-
-enum ppe_bits {
-	PPC_DISABLE = 0x10,
-	PPC_SENSE = 0x8,	/* parallel poll sense bit	*/
-	PPC_DIO_MASK = 0x7
-};
-
-enum {
-	request_service_bit = 0x40,
-};
-
-enum gpib_events {
-	EVENT_NONE = 0,
-	EVENT_DEV_TRG = 1,
-	EVENT_DEV_CLR = 2,
-	EVENT_IFC = 3
-};
-
-#endif	/* _GPIB_H */
-
diff --git a/drivers/staging/gpib/uapi/gpib_ioctl.h b/drivers/staging/gpib/uapi/gpib_ioctl.h
deleted file mode 100644
index d544d8e4362c..000000000000
--- a/drivers/staging/gpib/uapi/gpib_ioctl.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-
-/***************************************************************************
- *    copyright            : (C) 2002 by Frank Mori Hess
- ***************************************************************************/
-
-#ifndef _GPIB_IOCTL_H
-#define _GPIB_IOCTL_H
-
-#include <asm/ioctl.h>
-#include <linux/types.h>
-
-#define GPIB_CODE 160
-
-struct gpib_board_type_ioctl {
-	char name[100];
-};
-
-/* argument for read/write/command ioctls */
-struct gpib_read_write_ioctl {
-	__u64 buffer_ptr;
-	__u32 requested_transfer_count;
-	__u32 completed_transfer_count;
-	__s32 end; /* end flag return for reads, end io suppression request for cmd*/
-	__s32 handle;
-};
-
-struct gpib_open_dev_ioctl {
-	__u32 handle;
-	__u32 pad;
-	__s32 sad;
-	__u32 is_board;
-};
-
-struct gpib_close_dev_ioctl {
-	__u32 handle;
-};
-
-struct gpib_serial_poll_ioctl {
-	__u32 pad;
-	__s32 sad;
-	__u8 status_byte;
-	__u8 padding[3];   /* align to 32 bit boundary */
-};
-
-struct gpib_eos_ioctl {
-	__s32 eos;
-	__s32 eos_flags;
-};
-
-struct gpib_wait_ioctl {
-	__s32 handle;
-	__s32 wait_mask;
-	__s32 clear_mask;
-	__s32 set_mask;
-	__s32 ibsta;
-	__s32 pad;
-	__s32 sad;
-	__u32 usec_timeout;
-};
-
-struct gpib_online_ioctl {
-	__u64 init_data_ptr;
-	__s32 init_data_length;
-	__s32 online;
-};
-
-struct gpib_spoll_bytes_ioctl {
-	__u32 num_bytes;
-	__u32 pad;
-	__s32 sad;
-};
-
-struct gpib_board_info_ioctl {
-	__u32 pad;
-	__s32 sad;
-	__s32 parallel_poll_configuration;
-	__s32 autopolling;
-	__s32 is_system_controller;
-	__u32 t1_delay;
-	unsigned ist : 1;
-	unsigned no_7_bit_eos : 1;
-	unsigned padding :30; /* align to 32 bit boundary */
-};
-
-struct gpib_select_pci_ioctl {
-	__s32 pci_bus;
-	__s32 pci_slot;
-};
-
-struct gpib_ppoll_config_ioctl {
-	__u8 config;
-	unsigned set_ist : 1;
-	unsigned clear_ist : 1;
-	unsigned padding :22; /* align to 32 bit boundary */
-};
-
-struct gpib_pad_ioctl {
-	__u32 handle;
-	__u32 pad;
-};
-
-struct gpib_sad_ioctl {
-	__u32 handle;
-	__s32 sad;
-};
-
-/* select a piece of hardware to attach by its sysfs device path */
-struct gpib_select_device_path_ioctl {
-	char device_path[0x1000];
-};
-
-/* update status byte and request service */
-struct gpib_request_service2 {
-	__u8 status_byte;
-	__u8 padding[3]; /* align to 32 bit boundary */
-	__s32 new_reason_for_service;
-};
-
-/* Standard functions. */
-enum gpib_ioctl {
-	IBRD = _IOWR(GPIB_CODE, 100, struct gpib_read_write_ioctl),
-	IBWRT = _IOWR(GPIB_CODE, 101, struct gpib_read_write_ioctl),
-	IBCMD = _IOWR(GPIB_CODE, 102, struct gpib_read_write_ioctl),
-	IBOPENDEV = _IOWR(GPIB_CODE, 3, struct gpib_open_dev_ioctl),
-	IBCLOSEDEV = _IOW(GPIB_CODE, 4, struct gpib_close_dev_ioctl),
-	IBWAIT = _IOWR(GPIB_CODE, 5, struct gpib_wait_ioctl),
-	IBRPP = _IOWR(GPIB_CODE, 6, __u8),
-
-	IBSIC = _IOW(GPIB_CODE, 9, __u32),
-	IBSRE = _IOW(GPIB_CODE, 10, __s32),
-	IBGTS = _IO(GPIB_CODE, 11),
-	IBCAC = _IOW(GPIB_CODE, 12, __s32),
-	IBLINES = _IOR(GPIB_CODE, 14, __s16),
-	IBPAD = _IOW(GPIB_CODE, 15, struct gpib_pad_ioctl),
-	IBSAD = _IOW(GPIB_CODE, 16, struct gpib_sad_ioctl),
-	IBTMO = _IOW(GPIB_CODE, 17, __u32),
-	IBRSP = _IOWR(GPIB_CODE, 18, struct gpib_serial_poll_ioctl),
-	IBEOS = _IOW(GPIB_CODE, 19, struct gpib_eos_ioctl),
-	IBRSV = _IOW(GPIB_CODE, 20, __u8),
-	CFCBASE = _IOW(GPIB_CODE, 21, __u64),
-	CFCIRQ = _IOW(GPIB_CODE, 22, __u32),
-	CFCDMA = _IOW(GPIB_CODE, 23, __u32),
-	CFCBOARDTYPE = _IOW(GPIB_CODE, 24, struct gpib_board_type_ioctl),
-
-	IBMUTEX = _IOW(GPIB_CODE, 26, __s32),
-	IBSPOLL_BYTES = _IOWR(GPIB_CODE, 27, struct gpib_spoll_bytes_ioctl),
-	IBPPC = _IOW(GPIB_CODE, 28, struct gpib_ppoll_config_ioctl),
-	IBBOARD_INFO = _IOR(GPIB_CODE, 29, struct gpib_board_info_ioctl),
-
-	IBQUERY_BOARD_RSV = _IOR(GPIB_CODE, 31, __s32),
-	IBSELECT_PCI = _IOWR(GPIB_CODE, 32, struct gpib_select_pci_ioctl),
-	IBEVENT = _IOR(GPIB_CODE, 33, __s16),
-	IBRSC = _IOW(GPIB_CODE, 34, __s32),
-	IB_T1_DELAY = _IOW(GPIB_CODE, 35, __u32),
-	IBLOC = _IO(GPIB_CODE, 36),
-
-	IBAUTOSPOLL = _IOW(GPIB_CODE, 38, __s16),
-	IBONL = _IOW(GPIB_CODE, 39, struct gpib_online_ioctl),
-	IBPP2_SET = _IOW(GPIB_CODE, 40, __s16),
-	IBPP2_GET = _IOR(GPIB_CODE, 41, __s16),
-	IBSELECT_DEVICE_PATH = _IOW(GPIB_CODE, 43, struct gpib_select_device_path_ioctl),
-	/* 44 was IBSELECT_SERIAL_NUMBER */
-	IBRSV2 = _IOW(GPIB_CODE, 45, struct gpib_request_service2)
-};
-
-#endif	/* _GPIB_IOCTL_H */
diff --git a/include/uapi/linux/gpib.h b/include/uapi/linux/gpib.h
new file mode 100644
index 000000000000..2a7f5eeb9777
--- /dev/null
+++ b/include/uapi/linux/gpib.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/***************************************************************************
+ *    copyright		   : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_H
+#define _GPIB_H
+
+#define GPIB_MAX_NUM_BOARDS 16
+#define GPIB_MAX_NUM_DESCRIPTORS 0x1000
+
+enum ibsta_bit_numbers {
+	DCAS_NUM = 0,
+	DTAS_NUM = 1,
+	LACS_NUM = 2,
+	TACS_NUM = 3,
+	ATN_NUM = 4,
+	CIC_NUM = 5,
+	REM_NUM = 6,
+	LOK_NUM = 7,
+	CMPL_NUM = 8,
+	EVENT_NUM = 9,
+	SPOLL_NUM = 10,
+	RQS_NUM = 11,
+	SRQI_NUM = 12,
+	END_NUM = 13,
+	TIMO_NUM = 14,
+	ERR_NUM = 15
+};
+
+/* IBSTA status bits (returned by all functions) */
+enum ibsta_bits {
+	DCAS = (1 << DCAS_NUM),	/* device clear state */
+	DTAS = (1 << DTAS_NUM),	/* device trigger state */
+	LACS = (1 <<  LACS_NUM),	/* GPIB interface is addressed as Listener */
+	TACS = (1 <<  TACS_NUM),	/* GPIB interface is addressed as Talker */
+	ATN = (1 <<  ATN_NUM),	/* Attention is asserted */
+	CIC = (1 <<  CIC_NUM),	/* GPIB interface is Controller-in-Charge */
+	REM = (1 << REM_NUM),	/* remote state */
+	LOK = (1 << LOK_NUM),	/* lockout state */
+	CMPL = (1 <<  CMPL_NUM),	/* I/O is complete  */
+	EVENT = (1 << EVENT_NUM),	/* DCAS, DTAS, or IFC has occurred */
+	SPOLL = (1 << SPOLL_NUM),	/* board serial polled by busmaster */
+	RQS = (1 <<  RQS_NUM),	/* Device requesting service  */
+	SRQI = (1 << SRQI_NUM),	/* SRQ is asserted */
+	END = (1 << END_NUM),	/* EOI or EOS encountered */
+	TIMO = (1 << TIMO_NUM),	/* Time limit on I/O or wait function exceeded */
+	ERR = (1 << ERR_NUM),	/* Function call terminated on error */
+
+	device_status_mask = ERR | TIMO | END | CMPL | RQS,
+	board_status_mask = ERR | TIMO | END | CMPL | SPOLL |
+		EVENT | LOK | REM | CIC | ATN | TACS | LACS | DTAS | DCAS | SRQI,
+};
+
+/* End-of-string (EOS) modes for use with ibeos */
+
+enum eos_flags {
+	EOS_MASK = 0x1c00,
+	REOS = 0x0400,		/* Terminate reads on EOS	*/
+	XEOS = 0x800,	/* assert EOI when EOS char is sent */
+	BIN = 0x1000		/* Do 8-bit compare on EOS	*/
+};
+
+/* GPIB Bus Control Lines bit vector */
+enum bus_control_line {
+	VALID_DAV = 0x01,
+	VALID_NDAC = 0x02,
+	VALID_NRFD = 0x04,
+	VALID_IFC = 0x08,
+	VALID_REN = 0x10,
+	VALID_SRQ = 0x20,
+	VALID_ATN = 0x40,
+	VALID_EOI = 0x80,
+	VALID_ALL = 0xff,
+	BUS_DAV = 0x0100,		/* DAV	line status bit */
+	BUS_NDAC = 0x0200,		/* NDAC line status bit */
+	BUS_NRFD = 0x0400,		/* NRFD line status bit */
+	BUS_IFC = 0x0800,		/* IFC	line status bit */
+	BUS_REN = 0x1000,		/* REN	line status bit */
+	BUS_SRQ = 0x2000,		/* SRQ	line status bit */
+	BUS_ATN = 0x4000,		/* ATN	line status bit */
+	BUS_EOI = 0x8000		/* EOI	line status bit */
+};
+
+enum ppe_bits {
+	PPC_DISABLE = 0x10,
+	PPC_SENSE = 0x8,	/* parallel poll sense bit	*/
+	PPC_DIO_MASK = 0x7
+};
+
+enum {
+	request_service_bit = 0x40,
+};
+
+enum gpib_events {
+	EVENT_NONE = 0,
+	EVENT_DEV_TRG = 1,
+	EVENT_DEV_CLR = 2,
+	EVENT_IFC = 3
+};
+
+#endif	/* _GPIB_H */
+
diff --git a/include/uapi/linux/gpib_ioctl.h b/include/uapi/linux/gpib_ioctl.h
new file mode 100644
index 000000000000..d544d8e4362c
--- /dev/null
+++ b/include/uapi/linux/gpib_ioctl.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/***************************************************************************
+ *    copyright            : (C) 2002 by Frank Mori Hess
+ ***************************************************************************/
+
+#ifndef _GPIB_IOCTL_H
+#define _GPIB_IOCTL_H
+
+#include <asm/ioctl.h>
+#include <linux/types.h>
+
+#define GPIB_CODE 160
+
+struct gpib_board_type_ioctl {
+	char name[100];
+};
+
+/* argument for read/write/command ioctls */
+struct gpib_read_write_ioctl {
+	__u64 buffer_ptr;
+	__u32 requested_transfer_count;
+	__u32 completed_transfer_count;
+	__s32 end; /* end flag return for reads, end io suppression request for cmd*/
+	__s32 handle;
+};
+
+struct gpib_open_dev_ioctl {
+	__u32 handle;
+	__u32 pad;
+	__s32 sad;
+	__u32 is_board;
+};
+
+struct gpib_close_dev_ioctl {
+	__u32 handle;
+};
+
+struct gpib_serial_poll_ioctl {
+	__u32 pad;
+	__s32 sad;
+	__u8 status_byte;
+	__u8 padding[3];   /* align to 32 bit boundary */
+};
+
+struct gpib_eos_ioctl {
+	__s32 eos;
+	__s32 eos_flags;
+};
+
+struct gpib_wait_ioctl {
+	__s32 handle;
+	__s32 wait_mask;
+	__s32 clear_mask;
+	__s32 set_mask;
+	__s32 ibsta;
+	__s32 pad;
+	__s32 sad;
+	__u32 usec_timeout;
+};
+
+struct gpib_online_ioctl {
+	__u64 init_data_ptr;
+	__s32 init_data_length;
+	__s32 online;
+};
+
+struct gpib_spoll_bytes_ioctl {
+	__u32 num_bytes;
+	__u32 pad;
+	__s32 sad;
+};
+
+struct gpib_board_info_ioctl {
+	__u32 pad;
+	__s32 sad;
+	__s32 parallel_poll_configuration;
+	__s32 autopolling;
+	__s32 is_system_controller;
+	__u32 t1_delay;
+	unsigned ist : 1;
+	unsigned no_7_bit_eos : 1;
+	unsigned padding :30; /* align to 32 bit boundary */
+};
+
+struct gpib_select_pci_ioctl {
+	__s32 pci_bus;
+	__s32 pci_slot;
+};
+
+struct gpib_ppoll_config_ioctl {
+	__u8 config;
+	unsigned set_ist : 1;
+	unsigned clear_ist : 1;
+	unsigned padding :22; /* align to 32 bit boundary */
+};
+
+struct gpib_pad_ioctl {
+	__u32 handle;
+	__u32 pad;
+};
+
+struct gpib_sad_ioctl {
+	__u32 handle;
+	__s32 sad;
+};
+
+/* select a piece of hardware to attach by its sysfs device path */
+struct gpib_select_device_path_ioctl {
+	char device_path[0x1000];
+};
+
+/* update status byte and request service */
+struct gpib_request_service2 {
+	__u8 status_byte;
+	__u8 padding[3]; /* align to 32 bit boundary */
+	__s32 new_reason_for_service;
+};
+
+/* Standard functions. */
+enum gpib_ioctl {
+	IBRD = _IOWR(GPIB_CODE, 100, struct gpib_read_write_ioctl),
+	IBWRT = _IOWR(GPIB_CODE, 101, struct gpib_read_write_ioctl),
+	IBCMD = _IOWR(GPIB_CODE, 102, struct gpib_read_write_ioctl),
+	IBOPENDEV = _IOWR(GPIB_CODE, 3, struct gpib_open_dev_ioctl),
+	IBCLOSEDEV = _IOW(GPIB_CODE, 4, struct gpib_close_dev_ioctl),
+	IBWAIT = _IOWR(GPIB_CODE, 5, struct gpib_wait_ioctl),
+	IBRPP = _IOWR(GPIB_CODE, 6, __u8),
+
+	IBSIC = _IOW(GPIB_CODE, 9, __u32),
+	IBSRE = _IOW(GPIB_CODE, 10, __s32),
+	IBGTS = _IO(GPIB_CODE, 11),
+	IBCAC = _IOW(GPIB_CODE, 12, __s32),
+	IBLINES = _IOR(GPIB_CODE, 14, __s16),
+	IBPAD = _IOW(GPIB_CODE, 15, struct gpib_pad_ioctl),
+	IBSAD = _IOW(GPIB_CODE, 16, struct gpib_sad_ioctl),
+	IBTMO = _IOW(GPIB_CODE, 17, __u32),
+	IBRSP = _IOWR(GPIB_CODE, 18, struct gpib_serial_poll_ioctl),
+	IBEOS = _IOW(GPIB_CODE, 19, struct gpib_eos_ioctl),
+	IBRSV = _IOW(GPIB_CODE, 20, __u8),
+	CFCBASE = _IOW(GPIB_CODE, 21, __u64),
+	CFCIRQ = _IOW(GPIB_CODE, 22, __u32),
+	CFCDMA = _IOW(GPIB_CODE, 23, __u32),
+	CFCBOARDTYPE = _IOW(GPIB_CODE, 24, struct gpib_board_type_ioctl),
+
+	IBMUTEX = _IOW(GPIB_CODE, 26, __s32),
+	IBSPOLL_BYTES = _IOWR(GPIB_CODE, 27, struct gpib_spoll_bytes_ioctl),
+	IBPPC = _IOW(GPIB_CODE, 28, struct gpib_ppoll_config_ioctl),
+	IBBOARD_INFO = _IOR(GPIB_CODE, 29, struct gpib_board_info_ioctl),
+
+	IBQUERY_BOARD_RSV = _IOR(GPIB_CODE, 31, __s32),
+	IBSELECT_PCI = _IOWR(GPIB_CODE, 32, struct gpib_select_pci_ioctl),
+	IBEVENT = _IOR(GPIB_CODE, 33, __s16),
+	IBRSC = _IOW(GPIB_CODE, 34, __s32),
+	IB_T1_DELAY = _IOW(GPIB_CODE, 35, __u32),
+	IBLOC = _IO(GPIB_CODE, 36),
+
+	IBAUTOSPOLL = _IOW(GPIB_CODE, 38, __s16),
+	IBONL = _IOW(GPIB_CODE, 39, struct gpib_online_ioctl),
+	IBPP2_SET = _IOW(GPIB_CODE, 40, __s16),
+	IBPP2_GET = _IOR(GPIB_CODE, 41, __s16),
+	IBSELECT_DEVICE_PATH = _IOW(GPIB_CODE, 43, struct gpib_select_device_path_ioctl),
+	/* 44 was IBSELECT_SERIAL_NUMBER */
+	IBRSV2 = _IOW(GPIB_CODE, 45, struct gpib_request_service2)
+};
+
+#endif	/* _GPIB_IOCTL_H */
-- 
cgit v1.2.3


From 25e4e3565d45f567f78089f38822fa64abee5230 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 18 Nov 2025 20:36:29 +0800
Subject: ftrace: Introduce FTRACE_OPS_FL_JMP

Currently, the "nop" is replaced with a "call" instruction when a
function is hooked by ftrace. However, the "call" can sometimes break
the RSB and introduce extra overhead. Therefore, introduce the flag
FTRACE_OPS_FL_JMP, which indicates that the ftrace_ops should be invoked
with a "jmp" instead of a "call". For now, it is only used by the direct
call case.

When a direct ftrace_ops is marked with FTRACE_OPS_FL_JMP, the least
significant bit of ops->direct_call is set to 1. This lets
ftrace_call_replace() tell whether it should use a "jmp" for the callback.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20251118123639.688444-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/ftrace.h | 33 +++++++++++++++++++++++++++++++++
 kernel/trace/Kconfig   | 12 ++++++++++++
 kernel/trace/ftrace.c  | 17 ++++++++++++++++-
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 07f8c309e432..015dd1049bea 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -359,6 +359,7 @@ enum {
 	FTRACE_OPS_FL_DIRECT			= BIT(17),
 	FTRACE_OPS_FL_SUBOP			= BIT(18),
 	FTRACE_OPS_FL_GRAPH			= BIT(19),
+	FTRACE_OPS_FL_JMP			= BIT(20),
 };
 
 #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
@@ -577,6 +578,38 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
 						 unsigned long addr) { }
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+static inline bool ftrace_is_jmp(unsigned long addr)
+{
+	return addr & 1;
+}
+
+static inline unsigned long ftrace_jmp_set(unsigned long addr)
+{
+	return addr | 1UL;
+}
+
+static inline unsigned long ftrace_jmp_get(unsigned long addr)
+{
+	return addr & ~1UL;
+}
+#else
+static inline bool ftrace_is_jmp(unsigned long addr)
+{
+	return false;
+}
+
+static inline unsigned long ftrace_jmp_set(unsigned long addr)
+{
+	return addr;
+}
+
+static inline unsigned long ftrace_jmp_get(unsigned long addr)
+{
+	return addr;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_JMP */
+
 #ifdef CONFIG_STACK_TRACER
 
 int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d2c79da81e4f..4661b9e606e0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
 	  If the architecture generates __patchable_function_entries sections
 	  but does not want them included in the ftrace locations.
 
+config HAVE_DYNAMIC_FTRACE_WITH_JMP
+	bool
+	help
+	  If the architecture supports to replace the __fentry__ with a
+	  "jmp" instruction.
+
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS
 	depends on DYNAMIC_FTRACE
 	depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
+config DYNAMIC_FTRACE_WITH_JMP
+	def_bool y
+	depends on DYNAMIC_FTRACE
+	depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
+
 config FPROBE
 	bool "Kernel Function Probe (fprobe)"
 	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 59cfacb8a5bb..bbb37c0f8c6c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5951,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
 	for (i = 0; i < size; i++) {
 		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
 			del = __ftrace_lookup_ip(direct_functions, entry->ip);
-			if (del && del->direct == addr) {
+			if (del && ftrace_jmp_get(del->direct) ==
+				   ftrace_jmp_get(addr)) {
 				remove_hash_entry(direct_functions, del);
 				kfree(del);
 			}
@@ -6016,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 	if (ftrace_hash_empty(hash))
 		return -EINVAL;
 
+	/* This is a "raw" address, and this should never happen. */
+	if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+		return -EINVAL;
+
 	mutex_lock(&direct_mutex);
 
+	if (ops->flags & FTRACE_OPS_FL_JMP)
+		addr = ftrace_jmp_set(addr);
+
 	/* Make sure requested entries are not already registered.. */
 	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
@@ -6138,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 
 	lockdep_assert_held_once(&direct_mutex);
 
+	/* This is a "raw" address, and this should never happen. */
+	if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+		return -EINVAL;
+
+	if (ops->flags & FTRACE_OPS_FL_JMP)
+		addr = ftrace_jmp_set(addr);
+
 	/* Enable the tmp_ops to have the same functions as the direct ops */
 	ftrace_ops_init(&tmp_ops);
 	tmp_ops.func_hash = ops->func_hash;
-- 
cgit v1.2.3


From 373f2f44c300815c5f170e89560ac361c0053dfe Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 18 Nov 2025 20:36:32 +0800
Subject: bpf,x86: adjust the "jmp" mode for bpf trampoline

In the origin call case, if BPF_TRAMP_F_SKIP_FRAME is not set, it means
that the trampoline is entered with a "jmp" rather than a "call".

Introduce the function bpf_trampoline_use_jmp() to check if the trampoline
is in "jmp" mode.

Adjust the x86_64 trampoline for the "jmp" mode. The main adjustment is
for the case where parameters are passed on the stack: the stack
alignment logic changes in the "jmp" mode because no "rip" is pushed,
and the location of the parameters on the stack changes as well.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20251118123639.688444-5-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 16 +++++++++++-----
 include/linux/bpf.h         | 12 ++++++++++++
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 808d4343f6cf..632a83381c2d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2847,9 +2847,10 @@ static int get_nr_used_regs(const struct btf_func_model *m)
 }
 
 static void save_args(const struct btf_func_model *m, u8 **prog,
-		      int stack_size, bool for_call_origin)
+		      int stack_size, bool for_call_origin, u32 flags)
 {
 	int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0;
+	bool use_jmp = bpf_trampoline_use_jmp(flags);
 	int i, j;
 
 	/* Store function arguments to stack.
@@ -2890,7 +2891,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog,
 			 */
 			for (j = 0; j < arg_regs; j++) {
 				emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
-					 nr_stack_slots * 8 + 0x18);
+					 nr_stack_slots * 8 + 16 + (!use_jmp) * 8);
 				emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
 					 -stack_size);
 
@@ -3284,7 +3285,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		 * should be 16-byte aligned. Following code depend on
 		 * that stack_size is already 8-byte aligned.
 		 */
-		stack_size += (stack_size % 16) ? 0 : 8;
+		if (bpf_trampoline_use_jmp(flags)) {
+			/* no rip in the "jmp" case */
+			stack_size += (stack_size % 16) ? 8 : 0;
+		} else {
+			stack_size += (stack_size % 16) ? 0 : 8;
+		}
 	}
 
 	arg_stack_off = stack_size;
@@ -3344,7 +3350,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
 	}
 
-	save_args(m, &prog, regs_off, false);
+	save_args(m, &prog, regs_off, false, flags);
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
 		/* arg1: mov rdi, im */
@@ -3377,7 +3383,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
 		restore_regs(m, &prog, regs_off);
-		save_args(m, &prog, arg_stack_off, true);
+		save_args(m, &prog, arg_stack_off, true, flags);
 
 		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
 			/* Before calling the original function, load the
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 30fb40421405..2f79afe81482 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1264,6 +1264,18 @@ typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start,
 bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog);
 bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog);
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+static inline bool bpf_trampoline_use_jmp(u64 flags)
+{
+	return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME);
+}
+#else
+static inline bool bpf_trampoline_use_jmp(u64 flags)
+{
+	return false;
+}
+#endif
+
 struct bpf_ksym {
 	unsigned long		 start;
 	unsigned long		 end;
-- 
cgit v1.2.3


From ae4a3160d19cd16b874737ebc1798c7bc2fe3c9e Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 18 Nov 2025 20:36:33 +0800
Subject: bpf: specify the old and new poke_type for bpf_arch_text_poke

In the original logic, bpf_arch_text_poke() assumes that the old and new
instructions have the same opcode. However, they can have different
opcodes if we want to replace a "call" insn with a "jmp" insn.

Therefore, add the new function parameter "old_t" along with "new_t",
which are used to indicate the old and new poke types. Meanwhile, adjust
the implementation of bpf_arch_text_poke() for all the archs.

"BPF_MOD_NOP" is added to make the code more readable. In
bpf_arch_text_poke(), we still check whether the new and old addresses
are NULL to determine if a nop insn should be used, which I think is safer.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20251118123639.688444-6-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c   | 14 +++++++-------
 arch/loongarch/net/bpf_jit.c    |  9 ++++++---
 arch/powerpc/net/bpf_jit_comp.c | 10 ++++++----
 arch/riscv/net/bpf_jit_comp64.c |  9 ++++++---
 arch/s390/net/bpf_jit_comp.c    |  7 ++++---
 arch/x86/net/bpf_jit_comp.c     | 37 +++++++++++++++++++++----------------
 include/linux/bpf.h             |  6 ++++--
 kernel/bpf/core.c               |  5 +++--
 kernel/bpf/trampoline.c         | 20 ++++++++++++++------
 9 files changed, 71 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 4cfb549f2b43..929123a5431a 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2934,8 +2934,9 @@ static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
  * The dummy_tramp is used to prevent another CPU from jumping to unknown
  * locations during the patching process, making the patching process easier.
  */
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	int ret;
 	u32 old_insn;
@@ -2979,14 +2980,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 		    !poking_bpf_entry))
 		return -EINVAL;
 
-	if (poke_type == BPF_MOD_CALL)
-		branch_type = AARCH64_INSN_BRANCH_LINK;
-	else
-		branch_type = AARCH64_INSN_BRANCH_NOLINK;
-
+	branch_type = old_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
+					      AARCH64_INSN_BRANCH_NOLINK;
 	if (gen_branch_or_nop(branch_type, ip, old_addr, plt, &old_insn) < 0)
 		return -EFAULT;
 
+	branch_type = new_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
+					      AARCH64_INSN_BRANCH_NOLINK;
 	if (gen_branch_or_nop(branch_type, ip, new_addr, plt, &new_insn) < 0)
 		return -EFAULT;
 
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index cbe53d0b7fb0..2e7dacbbef5c 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -1284,11 +1284,12 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 	return ret ? ERR_PTR(-EINVAL) : dst;
 }
 
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	int ret;
-	bool is_call = (poke_type == BPF_MOD_CALL);
+	bool is_call;
 	u32 old_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
 	u32 new_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
 
@@ -1298,6 +1299,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	if (!is_bpf_text_address((unsigned long)ip))
 		return -ENOTSUPP;
 
+	is_call = old_t == BPF_MOD_CALL;
 	ret = emit_jump_or_nops(old_addr, ip, old_insns, is_call);
 	if (ret)
 		return ret;
@@ -1305,6 +1307,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	if (memcmp(ip, old_insns, LOONGARCH_LONG_JUMP_NBYTES))
 		return -EFAULT;
 
+	is_call = new_t == BPF_MOD_CALL;
 	ret = emit_jump_or_nops(new_addr, ip, new_insns, is_call);
 	if (ret)
 		return ret;
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 88ad5ba7b87f..5e976730b2f5 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -1107,8 +1107,9 @@ static void do_isync(void *info __maybe_unused)
  * execute isync (or some CSI) so that they don't go back into the
  * trampoline again.
  */
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	unsigned long bpf_func, bpf_func_end, size, offset;
 	ppc_inst_t old_inst, new_inst;
@@ -1119,7 +1120,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 		return -EOPNOTSUPP;
 
 	bpf_func = (unsigned long)ip;
-	branch_flags = poke_type == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
 
 	/* We currently only support poking bpf programs */
 	if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) {
@@ -1132,7 +1132,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	 * an unconditional branch instruction at im->ip_after_call
 	 */
 	if (offset) {
-		if (poke_type != BPF_MOD_JUMP) {
+		if (old_t == BPF_MOD_CALL || new_t == BPF_MOD_CALL) {
 			pr_err("%s (0x%lx): calls are not supported in bpf prog body\n", __func__,
 			       bpf_func);
 			return -EOPNOTSUPP;
@@ -1166,6 +1166,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	}
 
 	old_inst = ppc_inst(PPC_RAW_NOP());
+	branch_flags = old_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
 	if (old_addr) {
 		if (is_offset_in_branch_range(ip - old_addr))
 			create_branch(&old_inst, ip, (unsigned long)old_addr, branch_flags);
@@ -1174,6 +1175,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 				      branch_flags);
 	}
 	new_inst = ppc_inst(PPC_RAW_NOP());
+	branch_flags = new_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
 	if (new_addr) {
 		if (is_offset_in_branch_range(ip - new_addr))
 			create_branch(&new_inst, ip, (unsigned long)new_addr, branch_flags);
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 21c70ae3296b..5f9457e910e8 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -852,17 +852,19 @@ static int gen_jump_or_nops(void *target, void *ip, u32 *insns, bool is_call)
 	return emit_jump_and_link(is_call ? RV_REG_T0 : RV_REG_ZERO, rvoff, false, &ctx);
 }
 
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	u32 old_insns[RV_FENTRY_NINSNS], new_insns[RV_FENTRY_NINSNS];
-	bool is_call = poke_type == BPF_MOD_CALL;
+	bool is_call;
 	int ret;
 
 	if (!is_kernel_text((unsigned long)ip) &&
 	    !is_bpf_text_address((unsigned long)ip))
 		return -ENOTSUPP;
 
+	is_call = old_t == BPF_MOD_CALL;
 	ret = gen_jump_or_nops(old_addr, ip, old_insns, is_call);
 	if (ret)
 		return ret;
@@ -870,6 +872,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	if (memcmp(ip, old_insns, RV_FENTRY_NBYTES))
 		return -EFAULT;
 
+	is_call = new_t == BPF_MOD_CALL;
 	ret = gen_jump_or_nops(new_addr, ip, new_insns, is_call);
 	if (ret)
 		return ret;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index cf461d76e9da..a2072cabba76 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2413,8 +2413,9 @@ bool bpf_jit_supports_far_kfunc_call(void)
 	return true;
 }
 
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	struct bpf_plt expected_plt, current_plt, new_plt, *plt;
 	struct {
@@ -2431,7 +2432,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0)))
 		return -EINVAL;
 
-	if (t == BPF_MOD_JUMP &&
+	if ((new_t == BPF_MOD_JUMP || old_t == BPF_MOD_JUMP) &&
 	    insn.disp == ((char *)new_addr - (char *)ip) >> 1) {
 		/*
 		 * The branch already points to the destination,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 632a83381c2d..b69dc7194e2c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -597,7 +597,8 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
 	return emit_patch(pprog, func, ip, 0xE9);
 }
 
-static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+				enum bpf_text_poke_type new_t,
 				void *old_addr, void *new_addr)
 {
 	const u8 *nop_insn = x86_nops[5];
@@ -607,9 +608,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	int ret;
 
 	memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
-	if (old_addr) {
+	if (old_t != BPF_MOD_NOP && old_addr) {
 		prog = old_insn;
-		ret = t == BPF_MOD_CALL ?
+		ret = old_t == BPF_MOD_CALL ?
 		      emit_call(&prog, old_addr, ip) :
 		      emit_jump(&prog, old_addr, ip);
 		if (ret)
@@ -617,9 +618,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	}
 
 	memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
-	if (new_addr) {
+	if (new_t != BPF_MOD_NOP && new_addr) {
 		prog = new_insn;
-		ret = t == BPF_MOD_CALL ?
+		ret = new_t == BPF_MOD_CALL ?
 		      emit_call(&prog, new_addr, ip) :
 		      emit_jump(&prog, new_addr, ip);
 		if (ret)
@@ -640,8 +641,9 @@ out:
 	return ret;
 }
 
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-		       void *old_addr, void *new_addr)
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr)
 {
 	if (!is_kernel_text((long)ip) &&
 	    !is_bpf_text_address((long)ip))
@@ -655,7 +657,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	if (is_endbr(ip))
 		ip += ENDBR_INSN_SIZE;
 
-	return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
+	return __bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
 }
 
 #define EMIT_LFENCE()	EMIT3(0x0F, 0xAE, 0xE8)
@@ -897,12 +899,13 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 		target = array->ptrs[poke->tail_call.key];
 		if (target) {
 			ret = __bpf_arch_text_poke(poke->tailcall_target,
-						   BPF_MOD_JUMP, NULL,
+						   BPF_MOD_NOP, BPF_MOD_JUMP,
+						   NULL,
 						   (u8 *)target->bpf_func +
 						   poke->adj_off);
 			BUG_ON(ret < 0);
 			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
-						   BPF_MOD_JUMP,
+						   BPF_MOD_JUMP, BPF_MOD_NOP,
 						   (u8 *)poke->tailcall_target +
 						   X86_PATCH_SIZE, NULL);
 			BUG_ON(ret < 0);
@@ -3985,6 +3988,7 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 			       struct bpf_prog *new, struct bpf_prog *old)
 {
 	u8 *old_addr, *new_addr, *old_bypass_addr;
+	enum bpf_text_poke_type t;
 	int ret;
 
 	old_bypass_addr = old ? NULL : poke->bypass_addr;
@@ -3997,21 +4001,22 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 	 * the kallsyms check.
 	 */
 	if (new) {
+		t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
 		ret = __bpf_arch_text_poke(poke->tailcall_target,
-					   BPF_MOD_JUMP,
+					   t, BPF_MOD_JUMP,
 					   old_addr, new_addr);
 		BUG_ON(ret < 0);
 		if (!old) {
 			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
-						   BPF_MOD_JUMP,
+						   BPF_MOD_JUMP, BPF_MOD_NOP,
 						   poke->bypass_addr,
 						   NULL);
 			BUG_ON(ret < 0);
 		}
 	} else {
+		t = old_bypass_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
 		ret = __bpf_arch_text_poke(poke->tailcall_bypass,
-					   BPF_MOD_JUMP,
-					   old_bypass_addr,
+					   t, BPF_MOD_JUMP, old_bypass_addr,
 					   poke->bypass_addr);
 		BUG_ON(ret < 0);
 		/* let other CPUs finish the execution of program
@@ -4020,9 +4025,9 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 		 */
 		if (!ret)
 			synchronize_rcu();
+		t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
 		ret = __bpf_arch_text_poke(poke->tailcall_target,
-					   BPF_MOD_JUMP,
-					   old_addr, NULL);
+					   t, BPF_MOD_NOP, old_addr, NULL);
 		BUG_ON(ret < 0);
 	}
 }
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2f79afe81482..a9b788c7b4aa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3710,12 +3710,14 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 #endif /* CONFIG_INET */
 
 enum bpf_text_poke_type {
+	BPF_MOD_NOP,
 	BPF_MOD_CALL,
 	BPF_MOD_JUMP,
 };
 
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-		       void *addr1, void *addr2);
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+		       enum bpf_text_poke_type new_t, void *old_addr,
+		       void *new_addr);
 
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 			       struct bpf_prog *new, struct bpf_prog *old);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ef4448f18aad..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3150,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
-int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-			      void *addr1, void *addr2)
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+			      enum bpf_text_poke_type new_t, void *old_addr,
+			      void *new_addr)
 {
 	return -ENOTSUPP;
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 04104397c432..0230ad19533e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -183,7 +183,8 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	if (tr->func.ftrace_managed)
 		ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
 	else
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, BPF_MOD_NOP,
+					 old_addr, NULL);
 
 	return ret;
 }
@@ -200,7 +201,10 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
 		else
 			ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
 	} else {
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+		ret = bpf_arch_text_poke(ip,
+					 old_addr ? BPF_MOD_CALL : BPF_MOD_NOP,
+					 new_addr ? BPF_MOD_CALL : BPF_MOD_NOP,
+					 old_addr, new_addr);
 	}
 	return ret;
 }
@@ -225,7 +229,8 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 			return ret;
 		ret = register_ftrace_direct(tr->fops, (long)new_addr);
 	} else {
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+		ret = bpf_arch_text_poke(ip, BPF_MOD_NOP, BPF_MOD_CALL,
+					 NULL, new_addr);
 	}
 
 	return ret;
@@ -336,8 +341,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 	 * call_rcu_tasks() is not necessary.
 	 */
 	if (im->ip_after_call) {
-		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
-					     NULL, im->ip_epilogue);
+		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+					     BPF_MOD_JUMP, NULL,
+					     im->ip_epilogue);
 		WARN_ON(err);
 		if (IS_ENABLED(CONFIG_TASKS_RCU))
 			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
@@ -570,7 +576,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 		if (err)
 			return err;
 		tr->extension_prog = link->link.prog;
-		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+					  BPF_MOD_JUMP, NULL,
 					  link->link.prog->bpf_func);
 	}
 	if (cnt >= BPF_MAX_TRAMP_LINKS)
@@ -618,6 +625,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 	if (kind == BPF_TRAMP_REPLACE) {
 		WARN_ON_ONCE(!tr->extension_prog);
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+					 BPF_MOD_NOP,
 					 tr->extension_prog->bpf_func, NULL);
 		tr->extension_prog = NULL;
 		guard(mutex)(&tgt_prog->aux->ext_mutex);
-- 
cgit v1.2.3


From 7584edf15892e29190b2145294cc1680aa142586 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 24 Nov 2025 17:15:35 +0000
Subject: firmware: cs_dsp: Store control length as 32-bit

The architectures supported by this driver have a maximum of 32-bits
of address, so we don't need more than 32-bits to store the length of
control data. Change the length in struct cs_dsp_coeff_ctl to an
unsigned int instead of a size_t. Also make a corresponding trivial
change to wm_adsp.c to prevent a compiler warning.

Tested on x86_64 builds this saves at least 4 bytes per control
(another 4 bytes might be saved if the compiler was inserting padding
to align the size_t).

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251124171536.78962-1-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 2 +-
 include/linux/firmware/cirrus/cs_dsp.h | 2 +-
 sound/soc/codecs/wm_adsp.c             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 9acdcd75928a..36a5aefa16e7 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -477,7 +477,7 @@ static int cs_dsp_debugfs_read_controls_show(struct seq_file *s, void *ignored)
 
 	list_for_each_entry(ctl, &dsp->ctl_list, list) {
 		cs_dsp_coeff_base_reg(ctl, &reg, 0);
-		seq_printf(s, "%22.*s: %#8zx %s:%08x %#8x %s %#8x %#4x %c%c%c%c %s %s\n",
+		seq_printf(s, "%22.*s: %#8x %s:%08x %#8x %s %#8x %#4x %c%c%c%c %s %s\n",
 			   ctl->subname_len, ctl->subname, ctl->len,
 			   cs_dsp_mem_region_name(ctl->alg_region.type),
 			   ctl->offset, reg, ctl->fw_name, ctl->alg_region.alg, ctl->type,
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 69959032f8f5..0ec1cdc5585d 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -102,7 +102,7 @@ struct cs_dsp_coeff_ctl {
 	const char *subname;
 	unsigned int subname_len;
 	unsigned int offset;
-	size_t len;
+	unsigned int len;
 	unsigned int type;
 	unsigned int flags;
 	unsigned int set:1;
diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
index 172dcdd7dbca..17cec79245d4 100644
--- a/sound/soc/codecs/wm_adsp.c
+++ b/sound/soc/codecs/wm_adsp.c
@@ -1561,7 +1561,7 @@ static int wm_adsp_buffer_parse_coeff(struct cs_dsp_coeff_ctl *cs_ctl)
 
 	for (i = 0; i < 5; ++i) {
 		ret = cs_dsp_coeff_read_ctrl(cs_ctl, 0, &coeff_v1,
-					     min(cs_ctl->len, sizeof(coeff_v1)));
+					     min((size_t)cs_ctl->len, sizeof(coeff_v1)));
 		if (ret < 0)
 			return ret;
 
-- 
cgit v1.2.3


From 2a6c045640c38a407a39cd40c3c4d8dd2fd89aa8 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 6 Nov 2025 14:34:00 +0100
Subject: bitfield: Add less-checking __FIELD_{GET,PREP}()

The BUILD_BUG_ON_MSG() check against "~0ull" works only with "unsigned
(long) long" _mask types.  For constant masks, that condition is usually
met, as GENMASK() yields an UL value.  The few places where the
constant mask is stored in an intermediate variable were fixed by
changing the variable type to u64 (see e.g. [1] and [2]).

However, for non-constant masks, smaller unsigned types should be valid,
too, but currently lead to "result of comparison of constant
18446744073709551615 with expression of type ... is always
false"-warnings with clang and W=1.

Hence refactor the __BF_FIELD_CHECK() helper, and factor out
__FIELD_{GET,PREP}().  The latter lack the single problematic check, but
are otherwise identical to FIELD_{GET,PREP}(), and are intended to be
used in the fully non-const variants later.

[1] commit 5c667d5a5a3ec166 ("clk: sp7021: Adjust width of _m in
    HWM_FIELD_PREP()")
[2] commit cfd6fb45cfaf46fa ("crypto: ccree - avoid out-of-range
    warnings from clang")

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://git.kernel.org/torvalds/c/5c667d5a5a3ec166 [1]
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/bitfield.h | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 5355f8f806a9..bf8e0ae4b5b4 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -60,7 +60,7 @@
 
 #define __bf_cast_unsigned(type, x)	((__unsigned_scalar_typeof(type))(x))
 
-#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx)			\
+#define __BF_FIELD_CHECK_MASK(_mask, _val, _pfx)			\
 	({								\
 		BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask),		\
 				 _pfx "mask is not constant");		\
@@ -69,13 +69,33 @@
 				 ~((_mask) >> __bf_shf(_mask)) &	\
 					(0 + (_val)) : 0,		\
 				 _pfx "value too large for the field"); \
-		BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) >	\
-				 __bf_cast_unsigned(_reg, ~0ull),	\
-				 _pfx "type of reg too small for mask"); \
 		__BUILD_BUG_ON_NOT_POWER_OF_2((_mask) +			\
 					      (1ULL << __bf_shf(_mask))); \
 	})
 
+#define __BF_FIELD_CHECK_REG(mask, reg, pfx)				\
+	BUILD_BUG_ON_MSG(__bf_cast_unsigned(mask, mask) >		\
+			 __bf_cast_unsigned(reg, ~0ull),		\
+			 pfx "type of reg too small for mask")
+
+#define __BF_FIELD_CHECK(mask, reg, val, pfx)				\
+	({								\
+		__BF_FIELD_CHECK_MASK(mask, val, pfx);			\
+		__BF_FIELD_CHECK_REG(mask, reg, pfx);			\
+	})
+
+#define __FIELD_PREP(mask, val, pfx)					\
+	({								\
+		__BF_FIELD_CHECK_MASK(mask, val, pfx);			\
+		((typeof(mask))(val) << __bf_shf(mask)) & (mask);	\
+	})
+
+#define __FIELD_GET(mask, reg, pfx)					\
+	({								\
+		__BF_FIELD_CHECK_MASK(mask, 0U, pfx);			\
+		(typeof(mask))(((reg) & (mask)) >> __bf_shf(mask));	\
+	})
+
 /**
  * FIELD_MAX() - produce the maximum value representable by a field
  * @_mask: shifted mask defining the field's length and position
@@ -112,8 +132,8 @@
  */
 #define FIELD_PREP(_mask, _val)						\
 	({								\
-		__BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: ");	\
-		((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask);	\
+		__BF_FIELD_CHECK_REG(_mask, 0ULL, "FIELD_PREP: ");	\
+		__FIELD_PREP(_mask, _val, "FIELD_PREP: ");		\
 	})
 
 #define __BF_CHECK_POW2(n)	BUILD_BUG_ON_ZERO(((n) & ((n) - 1)) != 0)
@@ -152,8 +172,8 @@
  */
 #define FIELD_GET(_mask, _reg)						\
 	({								\
-		__BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: ");	\
-		(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));	\
+		__BF_FIELD_CHECK_REG(_mask, _reg, "FIELD_GET: ");	\
+		__FIELD_GET(_mask, _reg, "FIELD_GET: ");		\
 	})
 
 /**
-- 
cgit v1.2.3


From c1c6ab80b25c8db1e2ef5ae3ac8075d2c242ae13 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 6 Nov 2025 14:34:01 +0100
Subject: bitfield: Add non-constant field_{prep,get}() helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing FIELD_{GET,PREP}() macros are limited to compile-time
constants.  However, it is very common to prepare or extract bitfield
elements where the bitfield mask is not a compile-time constant.

To avoid this limitation, the AT91 clock driver and several other
drivers already have their own non-const field_{prep,get}() macros.
Make them available for general use by adding them to
<linux/bitfield.h>, and improve them slightly:
  1. Avoid evaluating macro parameters more than once,
  2. Replace "ffs() - 1" by "__ffs()",
  3. Support 64-bit use on 32-bit architectures,
  4. Wire field_{get,prep}() to FIELD_{GET,PREP}() when mask is
     actually constant.

This is deliberately not merged into the existing FIELD_{GET,PREP}()
macros, as people expressed the desire to keep stricter variants for
increased safety, or for performance critical paths.

Yury: use __mask within new macros.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Crt Mori <cmo@melexis.com>
Acked-by: Nuno Sá <nuno.sa@analog.com>
Acked-by: Richard Genoud <richard.genoud@bootlin.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Reviewed-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/bitfield.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index bf8e0ae4b5b4..126dc5b380af 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -17,6 +17,7 @@
  * FIELD_{GET,PREP} macros take as first parameter shifted mask
  * from which they extract the base mask and shift amount.
  * Mask must be a compilation time constant.
+ * field_{get,prep} are variants that take a non-const mask.
  *
  * Example:
  *
@@ -240,4 +241,62 @@ __MAKE_OP(64)
 #undef __MAKE_OP
 #undef ____MAKE_OP
 
+#define __field_prep(mask, val)						\
+	({								\
+		__auto_type __mask = (mask);				\
+		typeof(__mask) __val = (val);				\
+		unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ?	\
+				       __ffs(__mask) : __ffs64(__mask);	\
+		(__val << __shift) & __mask;				\
+	})
+
+#define __field_get(mask, reg)						\
+	({								\
+		__auto_type __mask = (mask);				\
+		typeof(__mask) __reg =  (reg);				\
+		unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ?	\
+				       __ffs(__mask) : __ffs64(__mask);	\
+		(__reg & __mask) >> __shift;				\
+	})
+
+/**
+ * field_prep() - prepare a bitfield element
+ * @mask: shifted mask defining the field's length and position, must be
+ *        non-zero
+ * @val:  value to put in the field
+ *
+ * Return: field value masked and shifted to its final destination
+ *
+ * field_prep() masks and shifts up the value.  The result should be
+ * combined with other fields of the bitfield using logical OR.
+ * Unlike FIELD_PREP(), @mask is not limited to a compile-time constant.
+ * Typical usage patterns are a value stored in a table, or calculated by
+ * shifting a constant by a variable number of bits.
+ * If you want to ensure that @mask is a compile-time constant, please use
+ * FIELD_PREP() directly instead.
+ */
+#define field_prep(mask, val)						\
+	(__builtin_constant_p(mask) ? __FIELD_PREP(mask, val, "field_prep: ") \
+				    : __field_prep(mask, val))
+
+/**
+ * field_get() - extract a bitfield element
+ * @mask: shifted mask defining the field's length and position, must be
+ *        non-zero
+ * @reg:  value of entire bitfield
+ *
+ * Return: extracted field value
+ *
+ * field_get() extracts the field specified by @mask from the
+ * bitfield passed in as @reg by masking and shifting it down.
+ * Unlike FIELD_GET(), @mask is not limited to a compile-time constant.
+ * Typical usage patterns are a value stored in a table, or calculated by
+ * shifting a constant by a variable number of bits.
+ * If you want to ensure that @mask is a compile-time constant, please use
+ * FIELD_GET() directly instead.
+ */
+#define field_get(mask, reg)						\
+	(__builtin_constant_p(mask) ? __FIELD_GET(mask, reg, "field_get: ") \
+				    : __field_get(mask, reg))
+
 #endif
-- 
cgit v1.2.3


From 4f1b701f24bea0900e349aa1c860db24ba0150aa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 10 Nov 2025 10:24:09 +0100
Subject: x86/bug: Use BUG_FORMAT for DEBUG_BUGVERBOSE_DETAILED

Since we have an explicit format string, use it for the condition string
instead of frobbing it in the file string.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.097401406@infradead.org
---
 arch/x86/include/asm/bug.h | 10 ++++++++--
 include/asm-generic/bug.h  |  8 +++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 5e63f9952a0c..50b802169a4b 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -74,13 +74,19 @@
 	".popsection\n"							\
 	extra
 
+#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
+#define WARN_CONDITION_STR(cond_str) cond_str
+#else
+#define WARN_CONDITION_STR(cond_str) NULL
+#endif
+
 #define _BUG_FLAGS(cond_str, ins, flags, extra)				\
 do {									\
 	asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c[fmt]", "%c[file]",	\
 					   "%c[line]", "%c[fl]",	\
 					   "%c[size]", extra)		\
-		   : : [fmt] "i" (NULL),				\
-		       [file] "i" (WARN_CONDITION_STR(cond_str) __FILE__), \
+		   : : [fmt] "i" (WARN_CONDITION_STR(cond_str)),	\
+		       [file] "i" (__FILE__),				\
 		       [line] "i" (__LINE__),				\
 		       [fl] "i" (flags),				\
 		       [size] "i" (sizeof(struct bug_entry)));		\
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 21d2c8f88d49..e512071216be 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -18,11 +18,13 @@
 #define BUG_GET_TAINT(bug)	((bug)->flags >> 8)
 #endif
 
+#ifndef WARN_CONDITION_STR
 #ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
-# define WARN_CONDITION_STR(cond_str) cond_str
+# define WARN_CONDITION_STR(cond_str) "[" cond_str "] "
 #else
 # define WARN_CONDITION_STR(cond_str)
 #endif
+#endif /* WARN_CONDITION_STR */
 
 #ifndef __ASSEMBLY__
 #include <linux/panic.h>
@@ -107,7 +109,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #define WARN_ON(condition) ({						\
 	int __ret_warn_on = !!(condition);				\
 	if (unlikely(__ret_warn_on))					\
-		__WARN_FLAGS("["#condition"] ",				\
+		__WARN_FLAGS(#condition,				\
 			     BUGFLAG_TAINT(TAINT_WARN));		\
 	unlikely(__ret_warn_on);					\
 })
@@ -117,7 +119,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #define WARN_ON_ONCE(condition) ({					\
 	int __ret_warn_on = !!(condition);				\
 	if (unlikely(__ret_warn_on))					\
-		__WARN_FLAGS("["#condition"] ",				\
+		__WARN_FLAGS(#condition,				\
 			     BUGFLAG_ONCE |				\
 			     BUGFLAG_TAINT(TAINT_WARN));		\
 	unlikely(__ret_warn_on);					\
-- 
cgit v1.2.3


From 11bb4944f014d756f35261f5afcb346901ef1efa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Jun 2025 15:08:30 +0200
Subject: x86/bug: Implement WARN_ONCE()

Implement WARN_ONCE like WARN using BUGFLAG_ONCE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.339309119@infradead.org
---
 arch/x86/include/asm/bug.h | 9 +++++++++
 include/asm-generic/bug.h  | 2 ++
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index b5474603460b..87199e6633f9 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -178,6 +178,15 @@ do {									\
 #define __WARN_printf(taint, fmt, arg...) \
 	__WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg)
 
+#define WARN_ONCE(cond, format, arg...) ({				\
+	int __ret_warn_on = !!(cond);					\
+	if (unlikely(__ret_warn_on)) {					\
+		__WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\
+				format, ## arg);			\
+	}								\
+	__ret_warn_on;							\
+})
+
 #endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
 
 #include <asm-generic/bug.h>
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index e512071216be..09e8eccee8ed 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -180,8 +180,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 	DO_ONCE_LITE_IF(condition, WARN_ON, 1)
 #endif
 
+#ifndef WARN_ONCE
 #define WARN_ONCE(condition, format...)				\
 	DO_ONCE_LITE_IF(condition, WARN, 1, format)
+#endif
 
 #define WARN_TAINT_ONCE(condition, taint, format...)		\
 	DO_ONCE_LITE_IF(condition, WARN_TAINT, 1, taint, format)
-- 
cgit v1.2.3


From 645b9ad2dc6b2d6d31e2944bd7f680f3f9d827ea Mon Sep 17 00:00:00 2001
From: Kriish Sharma <kriish.sharma2006@gmail.com>
Date: Tue, 18 Nov 2025 18:48:28 +0000
Subject: string: Add missing kernel-doc return descriptions

While running kernel-doc validation on linux-next, warnings were emitted
for functions in include/linux/string.h due to missing return value
documentation:

    Warning: include/linux/string.h:375 No description found for return value of 'kbasename'
    Warning: include/linux/string.h:560 No description found for return value of 'strstarts'

This patch adds the missing return value descriptions for both functions
and clears the related kernel-doc warnings.

Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Link: https://patch.msgid.link/20251118184828.2621595-1-kriish.sharma2006@gmail.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index fdd3442c6bcb..434b152df66a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -371,6 +371,10 @@ static inline void memzero_explicit(void *s, size_t count)
  * kbasename - return the last part of a pathname.
  *
  * @path: path to extract the filename from.
+ *
+ * Returns:
+ * Pointer to the filename portion inside @path. If no '/' exists,
+ * returns @path unchanged.
  */
 static inline const char *kbasename(const char *path)
 {
@@ -556,6 +560,9 @@ static __always_inline size_t str_has_prefix(const char *str, const char *prefix
  * strstarts - does @str start with @prefix?
  * @str: string to examine
  * @prefix: prefix to look for.
+ *
+ * Returns:
+ * True if @str begins with @prefix. False in all other cases.
  */
 static inline bool strstarts(const char *str, const char *prefix)
 {
-- 
cgit v1.2.3


From 6b1ac78dd0f29fe66421c460c12ec15e45af38c3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 13 Oct 2025 10:22:04 +1030
Subject: btrfs: implement shutdown ioctl

The shutdown ioctl should follow the XFS one, which uses magic number 'X'
and ioctl number 125, with a uint32 as flags.

For now btrfs doesn't distinguish between the DEFAULT and LOGFLUSH flags
(just like f2fs): both will freeze the fs first (which implies committing
the current transaction), set the SHUTDOWN flag and finally thaw the fs.

For NOLOGFLUSH flag, the freeze/thaw part is skipped thus the current
transaction is aborted.

The new shutdown ioctl is hidden behind experimental features for more
testing.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Tested-by: Anand Jain <asj@kernel.org>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c           | 41 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |  9 +++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 803556ec0e18..127b5d8303a8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5223,6 +5223,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
 	return 0;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg)
+{
+	int ret = 0;
+	u32 flags;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(flags, (u32 __user *)arg))
+		return -EFAULT;
+
+	if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST)
+		return -EINVAL;
+
+	if (btrfs_is_shutdown(fs_info))
+		return 0;
+
+	switch (flags) {
+	case BTRFS_SHUTDOWN_FLAGS_LOGFLUSH:
+	case BTRFS_SHUTDOWN_FLAGS_DEFAULT:
+		ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+		if (ret)
+			return ret;
+		btrfs_force_shutdown(fs_info);
+		ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+		if (ret)
+			return ret;
+		break;
+	case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH:
+		btrfs_force_shutdown(fs_info);
+		break;
+	}
+	return ret;
+}
+#endif
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -5378,6 +5415,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 #endif
 	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
 		return btrfs_ioctl_subvol_sync(fs_info, argp);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	case BTRFS_IOC_SHUTDOWN:
+		return btrfs_ioctl_shutdown(fs_info, arg);
+#endif
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 8e710bbb688e..e8fd92789423 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -1099,6 +1099,12 @@ enum btrfs_err_code {
 	BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 };
 
+/* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */
+#define BTRFS_SHUTDOWN_FLAGS_DEFAULT			0x0
+#define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH			0x1
+#define BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH			0x2
+#define BTRFS_SHUTDOWN_FLAGS_LAST			0x3
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -1220,6 +1226,9 @@ enum btrfs_err_code {
 #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
 					struct btrfs_ioctl_subvol_wait)
 
+/* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */
+#define BTRFS_IOC_SHUTDOWN	_IOR('X', 125, __u32)
+
 #ifdef __cplusplus
 }
 #endif
-- 
cgit v1.2.3


From 4bd68e475300bc97b33a7f1ef9bd112970018789 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 24 Nov 2025 21:39:59 +0100
Subject: cpumask: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated in the comment
at the top of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 arch/x86/include/asm/cpumask.h |  2 ++
 include/linux/cpumask.h        | 10 ++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 70f6b60ad67b..9df9e9cde670 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_CPUMASK_H
 #define _ASM_X86_CPUMASK_H
 #ifndef __ASSEMBLER__
+
+#include <linux/compiler.h>
 #include <linux/cpumask.h>
 
 extern void setup_cpu_local_masks(void);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff8f41ab7ce6..df89eedc6e91 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -7,14 +7,16 @@
  * set of CPUs in a system, one bit position per CPU number.  In general,
  * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
-#include <linux/cleanup.h>
-#include <linux/kernel.h>
+#include <linux/atomic.h>
 #include <linux/bitmap.h>
+#include <linux/cleanup.h>
 #include <linux/cpumask_types.h>
-#include <linux/atomic.h>
-#include <linux/bug.h>
 #include <linux/gfp_types.h>
 #include <linux/numa.h>
+#include <linux/threads.h>
+#include <linux/types.h>
+
+#include <asm/bug.h>
 
 /**
  * cpumask_pr_args - printf args to output a cpumask
-- 
cgit v1.2.3


From 8cb4ecec5e366b7dbbf200629a22624ad2340af5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:24:51 +0000
Subject: irqchip/gic: Add missing GICH_HCR control bits

The GICH_HCR description is missing a bunch of control bits that
control the maintenance interrupt. Add them.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-2-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 include/linux/irqchip/arm-gic.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 2223f95079ce..d45fa19f9e47 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -86,7 +86,13 @@
 
 #define GICH_HCR_EN			(1 << 0)
 #define GICH_HCR_UIE			(1 << 1)
+#define GICH_HCR_LRENPIE		(1 << 2)
 #define GICH_HCR_NPIE			(1 << 3)
+#define GICH_HCR_VGrp0EIE		(1 << 4)
+#define GICH_HCR_VGrp0DIE		(1 << 5)
+#define GICH_HCR_VGrp1EIE		(1 << 6)
+#define GICH_HCR_VGrp1DIE		(1 << 7)
+#define GICH_HCR_EOICOUNT		GENMASK(31, 27)
 
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
-- 
cgit v1.2.3


From fa8f11e8e18383d234c77ba08d347aed7883d39a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:24:52 +0000
Subject: irqchip/gic: Expose CPU interface VA to KVM

Future changes will require KVM to be able to perform deactivations
by writing to the physical CPU interface. Add the corresponding
VA to the kvm_info structure, and let KVM stash it.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-3-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-v2.c         | 1 +
 drivers/irqchip/irq-gic.c             | 3 +++
 include/kvm/arm_vgic.h                | 3 +++
 include/linux/irqchip/arm-vgic-info.h | 2 ++
 4 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 381673f03c39..441efef80d60 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -385,6 +385,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
 
 	kvm_vgic_global_state.can_emulate_gicv2 = true;
 	kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+	kvm_vgic_global_state.gicc_base = info->gicc_base;
 	kvm_vgic_global_state.type = VGIC_V2;
 	kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
 
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 1269ab8eb726..ec70c84e9f91 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1459,6 +1459,8 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 	if (ret)
 		return;
 
+	gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base;
+
 	if (static_branch_likely(&supports_deactivate_key))
 		vgic_set_kvm_info(&gic_v2_kvm_info);
 }
@@ -1620,6 +1622,7 @@ static void __init gic_acpi_setup_kvm_info(void)
 		return;
 
 	gic_v2_kvm_info.maint_irq = irq;
+	gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base;
 
 	vgic_set_kvm_info(&gic_v2_kvm_info);
 }
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 7a0b972eb1b1..577723f5599b 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -59,6 +59,9 @@ struct vgic_global {
 	/* virtual control interface mapping, HYP VA */
 	void __iomem		*vctrl_hyp;
 
+	/* Physical CPU interface, kernel VA */
+	void __iomem		*gicc_base;
+
 	/* Number of implemented list registers */
 	int			nr_lr;
 
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
index a470a73a805a..67d9d960273b 100644
--- a/include/linux/irqchip/arm-vgic-info.h
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -24,6 +24,8 @@ struct gic_kvm_info {
 	enum gic_type	type;
 	/* Virtual CPU interface */
 	struct resource vcpu;
+	/* GICv2 GICC VA */
+	void __iomem	*gicc_base;
 	/* Interrupt number */
 	unsigned int	maint_irq;
 	/* No interrupt mask, no need to use the above field */
-- 
cgit v1.2.3


From a4413a7c31cfca49d3f4830cf8a45edf4a713f63 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:24:57 +0000
Subject: KVM: arm64: Repack struct vgic_irq fields

struct vgic_irq has grown over the years, in a rather bad way.
Repack it using bitfields for the individual flags, and move
things around a bit so that it is a bit smaller.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-8-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-v4.c |  5 ++++-
 include/kvm/arm_vgic.h        | 20 ++++++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 548aec9d5a72..09c3e9eb23f8 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
 		struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
 		struct irq_desc *desc;
 		unsigned long flags;
+		bool pending;
 		int ret;
 
 		raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
 		irq->hw = false;
 		ret = irq_get_irqchip_state(irq->host_irq,
 					    IRQCHIP_STATE_PENDING,
-					    &irq->pending_latch);
+					    &pending);
 		WARN_ON(ret);
 
+		irq->pending_latch = pending;
+
 		desc = irq_to_desc(irq->host_irq);
 		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
 	unlock:
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 577723f5599b..e84a1bc5cf17 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -123,6 +123,7 @@ struct irq_ops {
 
 struct vgic_irq {
 	raw_spinlock_t irq_lock;	/* Protects the content of the struct */
+	u32 intid;			/* Guest visible INTID */
 	struct rcu_head rcu;
 	struct list_head ap_list;
 
@@ -137,17 +138,17 @@ struct vgic_irq {
 					 * affinity reg (v3).
 					 */
 
-	u32 intid;			/* Guest visible INTID */
-	bool line_level;		/* Level only */
-	bool pending_latch;		/* The pending latch state used to calculate
-					 * the pending state for both level
-					 * and edge triggered IRQs. */
-	bool active;
-	bool pending_release;		/* Used for LPIs only, unreferenced IRQ
+	bool pending_release:1;		/* Used for LPIs only, unreferenced IRQ
 					 * pending a release */
 
-	bool enabled;
-	bool hw;			/* Tied to HW IRQ */
+	bool pending_latch:1;		/* The pending latch state used to calculate
+					 * the pending state for both level
+					 * and edge triggered IRQs. */
+	enum vgic_irq_config config:1;	/* Level or edge */
+	bool line_level:1;		/* Level only */
+	bool enabled:1;
+	bool active:1;
+	bool hw:1;			/* Tied to HW IRQ */
 	refcount_t refcount;		/* Used for LPIs */
 	u32 hwintid;			/* HW INTID number */
 	unsigned int host_irq;		/* linux irq corresponding to hwintid */
@@ -159,7 +160,6 @@ struct vgic_irq {
 	u8 active_source;		/* GICv2 SGIs only */
 	u8 priority;
 	u8 group;			/* 0 == group 0, 1 == group 1 */
-	enum vgic_irq_config config;	/* Level or edge */
 
 	struct irq_ops *ops;
 
-- 
cgit v1.2.3


From 879a7fd4fd64656d953f887e6a18e13e0b9a9f8f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:24:58 +0000
Subject: KVM: arm64: Add tracking of vgic_irq being present in a LR

We currently cannot identify whether an interrupt is queued into
a LR. It wasn't needed until now, but that's about to change.

Add yet another flag to track that state.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-9-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-v2.c | 6 ++++++
 arch/arm64/kvm/vgic/vgic-v3.c | 6 ++++++
 include/kvm/arm_vgic.h        | 1 +
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 441efef80d60..74efacba38d4 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -101,6 +101,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 		/* Handle resampling for mapped interrupts if required */
 		vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
 
+		irq->on_lr = false;
+
 		raw_spin_unlock(&irq->irq_lock);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -124,6 +126,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	u32 val = irq->intid;
 	bool allow_pending = true;
 
+	WARN_ON(irq->on_lr);
+
 	if (irq->active) {
 		val |= GICH_LR_ACTIVE_BIT;
 		if (vgic_irq_is_sgi(irq->intid))
@@ -194,6 +198,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	/* The GICv2 LR only holds five bits of priority. */
 	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
 
+	irq->on_lr = true;
+
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 1b6c3071ec80..e3f4b27e0225 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -97,6 +97,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 		/* Handle resampling for mapped interrupts if required */
 		vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
 
+		irq->on_lr = false;
+
 		raw_spin_unlock(&irq->irq_lock);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -111,6 +113,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	u64 val = irq->intid;
 	bool allow_pending = true, is_v2_sgi;
 
+	WARN_ON(irq->on_lr);
+
 	is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
 		     model == KVM_DEV_TYPE_ARM_VGIC_V2);
 
@@ -185,6 +189,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
 
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+
+	irq->on_lr = true;
 }
 
 void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index e84a1bc5cf17..ec349c5a4a8b 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -149,6 +149,7 @@ struct vgic_irq {
 	bool enabled:1;
 	bool active:1;
 	bool hw:1;			/* Tied to HW IRQ */
+	bool on_lr:1;			/* Present in a CPU LR */
 	refcount_t refcount;		/* Used for LPIs */
 	u32 hwintid;			/* HW INTID number */
 	unsigned int host_irq;		/* linux irq corresponding to hwintid */
-- 
cgit v1.2.3


From cd4f6ee99b28f10692c2444c8dc0bab77357a25e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:25:18 +0000
Subject: KVM: arm64: GICv3: Handle deactivation via ICV_DIR_EL1 traps

Deactivation via ICV_DIR_EL1 is both relatively straightforward
(we have the interrupt that needs deactivation) and really awkward.

The main issue is that the interrupt may either be in an LR on
another CPU, or outside of any LR.

In the former case, we process the deactivation as if it was
a write to GICD_CACTIVERn, which is already implemented as a big
hammer IPI'ing all vcpus. In the latter case, we just perform
a normal deactivation, similar to what we do for EOImode==0.

Another annoying aspect is that we need to tell the CPU owning
the interrupt that its ap_list needs laundering. We use a brand new
vcpu request to that effect.

Note that this doesn't address deactivation via the GICV MMIO view,
which will be taken care of in a later change.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-29-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/kvm/arm.c              |  4 ++
 arch/arm64/kvm/hyp/vgic-v3-sr.c   |  3 ++
 arch/arm64/kvm/sys_regs.c         | 19 ++++++++-
 arch/arm64/kvm/vgic/vgic-v3.c     | 85 +++++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic.c        | 11 +++++
 arch/arm64/kvm/vgic/vgic.h        |  1 +
 include/kvm/arm_vgic.h            |  1 +
 8 files changed, 123 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 64302c438355..7501a2ee4dd4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -54,6 +54,7 @@
 #define KVM_REQ_NESTED_S2_UNMAP		KVM_ARCH_REQ(8)
 #define KVM_REQ_GUEST_HYP_IRQ_PENDING	KVM_ARCH_REQ(9)
 #define KVM_REQ_MAP_L1_VNCR_EL2		KVM_ARCH_REQ(10)
+#define KVM_REQ_VGIC_PROCESS_UPDATE	KVM_ARCH_REQ(11)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 				     KVM_DIRTY_LOG_INITIALLY_SET)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 733195ef183e..fe13f9777f9c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1041,6 +1041,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 		 */
 		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
 
+		/* Process interrupts deactivated through a trap */
+		if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu))
+			kvm_vgic_process_async_update(vcpu);
+
 		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
 			kvm_update_stolen_time(vcpu);
 
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index cafbb41b4c33..f2f585455144 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -1247,6 +1247,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
 	case SYS_ICC_DIR_EL1:
 		if (unlikely(is_read))
 			return 0;
+		/* Full exit if required to handle overflow deactivation... */
+		if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR)
+			return 0;
 		fn = __vgic_v3_write_dir;
 		break;
 	case SYS_ICC_RPR_EL1:
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e67eb39ddc11..1b69d6e2d720 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static bool access_gic_dir(struct kvm_vcpu *vcpu,
+			   struct sys_reg_params *p,
+			   const struct sys_reg_desc *r)
+{
+	if (!kvm_has_gicv3(vcpu->kvm))
+		return undef_access(vcpu, p, r);
+
+	if (!p->is_write)
+		return undef_access(vcpu, p, r);
+
+	vgic_v3_deactivate(vcpu, p->regval);
+
+	return true;
+}
+
 static bool trap_raz_wi(struct kvm_vcpu *vcpu,
 			struct sys_reg_params *p,
 			const struct sys_reg_desc *r)
@@ -3370,7 +3385,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
 	{ SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
 	{ SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
-	{ SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+	{ SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
 	{ SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
 	{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
 	{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
@@ -4495,7 +4510,7 @@ static const struct sys_reg_desc cp15_regs[] = {
 	{ CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
 	{ CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
 	{ CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
-	{ CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+	{ CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
 	{ CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
 	{ CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
 	{ CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index d4f27f451c8f..d83edf02d072 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -12,6 +12,7 @@
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_asm.h>
 
+#include "vgic-mmio.h"
 #include "vgic.h"
 
 static bool group0_trap;
@@ -171,6 +172,90 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 	cpuif->used_lrs = 0;
 }
 
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+	struct kvm_vcpu *target_vcpu = NULL;
+	struct vgic_irq *irq;
+	unsigned long flags;
+	bool mmio = false;
+	u64 lr = 0;
+
+	/*
+	 * We only deal with DIR when EOIMode==1, and only for SGI,
+	 * PPI or SPI.
+	 */
+	if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) ||
+	    val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)
+		return;
+
+	/* Make sure we're in the same context as LR handling */
+	local_irq_save(flags);
+
+	irq = vgic_get_vcpu_irq(vcpu, val);
+	if (WARN_ON_ONCE(!irq))
+		goto out;
+
+	/*
+	 * EOIMode=1: we must rely on traps to handle deactivate of
+	 * overflowing interrupts, as there is no ordering guarantee and
+	 * EOIcount isn't being incremented. Priority drop will have taken
+	 * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs.
+	 *
+	 * Three possibities:
+	 *
+	 * - The irq is not queued on any CPU, and there is nothing to
+	 *   do,
+	 *
+	 * - Or the irq is in an LR, meaning that its state is not
+	 *   directly observable. Treat it bluntly by making it as if
+	 *   this was a write to GICD_ICACTIVER, which will force an
+	 *   exit on all vcpus. If it hurts, don't do that.
+	 *
+	 * - Or the irq is active, but not in an LR, and we can
+	 *   directly deactivate it by building a pseudo-LR, fold it,
+	 *   and queue a request to prune the resulting ap_list,
+	 */
+	scoped_guard(raw_spinlock, &irq->irq_lock) {
+		target_vcpu = irq->vcpu;
+
+		/* Not on any ap_list? */
+		if (!target_vcpu)
+			goto put;
+
+		/*
+		 * Urgh. We're deactivating something that we cannot
+		 * observe yet... Big hammer time.
+		 */
+		if (irq->on_lr) {
+			mmio = true;
+			goto put;
+		}
+
+		/* (with a Dalek voice) DEACTIVATE!!!! */
+		lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
+	}
+
+	if (lr & ICH_LR_HW)
+		vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+
+	vgic_v3_fold_lr(vcpu, lr);
+
+put:
+	vgic_put_irq(vcpu->kvm, irq);
+
+out:
+	local_irq_restore(flags);
+
+	if (mmio)
+		vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+	/* Force the ap_list to be pruned */
+	if (target_vcpu)
+		kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
+}
+
 /* Requires the irq to be locked already */
 static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
 {
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index abe01c9c6b36..cbba6c2988d1 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -990,6 +990,17 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	vgic_prune_ap_list(vcpu);
 }
 
+/* Sync interrupts that were deactivated through a DIR trap */
+void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	/* Make sure we're in the same context as LR handling */
+	local_irq_save(flags);
+	vgic_prune_ap_list(vcpu);
+	local_irq_restore(flags);
+}
+
 static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
 {
 	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 037efb620082..01ff6d4aa9da 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -318,6 +318,7 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq)
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val);
 void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
 void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ec349c5a4a8b..b798546755a3 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -421,6 +421,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid);
+void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu);
 
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1);
 
-- 
cgit v1.2.3


From 1c3b3cadcd69f7415e8b3b1b1e81459e0e8c9f33 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:25:21 +0000
Subject: KVM: arm64: GICv3: Add SPI tracking to handle asymmetric deactivation

SPIs are especially annoying, as they can be activated on a CPU and
deactivated on another. Which means that when an SPI is in flight
anywhere, all CPUs need to have their TDIR trap bit set.

This translates into broadcasting an IPI across all CPUs to make sure
they set their trap bit. The number of in-flight SPIs is kept in
an atomic variable so that CPUs can turn the trap bit off as soon
as possible.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-32-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c |  1 +
 arch/arm64/kvm/vgic/vgic-v3.c   | 21 +++++++++++++++------
 arch/arm64/kvm/vgic/vgic.c      | 25 +++++++++++++++++++++++--
 include/kvm/arm_vgic.h          |  3 +++
 4 files changed, 42 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 6d5e5d708f23..52de99c0f01c 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -188,6 +188,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
 	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
 	int i;
 
+	dist->active_spis = (atomic_t)ATOMIC_INIT(0);
 	dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
 	if (!dist->spis)
 		return  -ENOMEM;
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 09f86bf6fe7b..55847fbad4d0 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -47,10 +47,17 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu,
 		ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE;
 
 	/*
+	 * Dealing with EOImode=1 is a massive source of headache. Not
+	 * only do we need to track that we have active interrupts
+	 * outside of the LRs and force DIR to be trapped, we also
+	 * need to deal with SPIs that can be deactivated on another
+	 * CPU.
+	 *
 	 * Note that we set the trap irrespective of EOIMode, as that
 	 * can change behind our back without any warning...
 	 */
-	if (irqs_active_outside_lrs(als))
+	if (irqs_active_outside_lrs(als)		     ||
+	    atomic_read(&vcpu->kvm->arch.vgic.active_spis))
 		cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR;
 }
 
@@ -78,11 +85,6 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val)
 	if (!irq)	/* An LPI could have been unmapped. */
 		return;
 
-	/* Notify fds when the guest EOI'ed a level-triggered IRQ */
-	if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-		kvm_notify_acked_irq(vcpu->kvm, 0,
-				     intid - VGIC_NR_PRIVATE_IRQS);
-
 	scoped_guard(raw_spinlock, &irq->irq_lock) {
 		/* Always preserve the active bit for !LPIs, note deactivation */
 		if (irq->intid >= VGIC_MIN_LPI)
@@ -117,6 +119,13 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val)
 		irq->on_lr = false;
 	}
 
+	/* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */
+	if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) {
+		kvm_notify_acked_irq(vcpu->kvm, 0,
+				     intid - VGIC_NR_PRIVATE_IRQS);
+		atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis);
+	}
+
 	vgic_put_irq(vcpu->kvm, irq);
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index cbba6c2988d1..83969c18ef03 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -367,6 +367,17 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
 	return false;
 }
 
+static bool vgic_model_needs_bcst_kick(struct kvm *kvm)
+{
+	/*
+	 * A GICv3 (or GICv3-like) system exposing a GICv3 to the
+	 * guest needs a broadcast kick to set TDIR globally, even if
+	 * the bit doesn't really exist (we still need to check for
+	 * the shadow bit in the DIR emulation fast-path).
+	 */
+	return (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+}
+
 /*
  * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
  * Do the queuing if necessary, taking the right locks in the right order.
@@ -379,6 +390,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 			   unsigned long flags) __releases(&irq->irq_lock)
 {
 	struct kvm_vcpu *vcpu;
+	bool bcast;
 
 	lockdep_assert_held(&irq->irq_lock);
 
@@ -453,11 +465,20 @@ retry:
 	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
 	irq->vcpu = vcpu;
 
+	/* A new SPI may result in deactivation trapping on all vcpus */
+	bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) &&
+		 vgic_valid_spi(vcpu->kvm, irq->intid) &&
+		 atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0);
+
 	raw_spin_unlock(&irq->irq_lock);
 	raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
 
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
+	if (!bcast) {
+		kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+		kvm_vcpu_kick(vcpu);
+	} else {
+		kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING);
+	}
 
 	return true;
 }
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index b798546755a3..6a4d3d205596 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -263,6 +263,9 @@ struct vgic_dist {
 	/* The GIC maintenance IRQ for nested hypervisors. */
 	u32			mi_intid;
 
+	/* Track the number of in-flight active SPIs */
+	atomic_t		active_spis;
+
 	/* base addresses in guest physical address space: */
 	gpa_t			vgic_dist_base;		/* distributor */
 	union {
-- 
cgit v1.2.3


From 255de897e7fb918a34845167c572b5bf8e1d9d79 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Nov 2025 17:25:28 +0000
Subject: KVM: arm64: GICv2: Handle deactivation via GICV_DIR traps

Add the plumbing of GICv2 interrupt deactivation via GICV_DIR.
This requires adding a new device so that we can easily decode
the DIR address.

The deactivation itself is very similar to the GICv3 version.

Tested-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/20251120172540.2267180-39-maz@kernel.org
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-mmio-v2.c | 24 +++++++++++
 arch/arm64/kvm/vgic/vgic-mmio.h    |  1 +
 arch/arm64/kvm/vgic/vgic-v2.c      | 85 ++++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic.h         |  1 +
 include/kvm/arm_vgic.h             |  1 +
 5 files changed, 112 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index f25fccb1f8e6..406845b3117c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
 	vgic_set_vmcr(vcpu, &vmcr);
 }
 
+static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_deactivate(vcpu, val);
+	else
+		vgic_v3_deactivate(vcpu, val);
+}
+
 static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
 					gpa_t addr, unsigned int len)
 {
@@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
 		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
 		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE,
+		vgic_mmio_read_raz, vgic_mmio_write_dir,
+		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi,
+		4, VGIC_ACCESS_32bit),
 };
 
 unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
@@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
 	return SZ_4K;
 }
 
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev)
+{
+	dev->regions = vgic_v2_cpu_registers;
+	dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+
+	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+	return KVM_VGIC_V2_CPU_SIZE;
+}
+
 int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	const struct vgic_register_region *region;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 5b490a4dfa5e..50dc80220b0f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
 				    const u32 val);
 
 unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index bbd4d003fde8..bc52d44a573d 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -9,6 +9,7 @@
 #include <kvm/arm_vgic.h>
 #include <asm/kvm_mmu.h>
 
+#include "vgic-mmio.h"
 #include "vgic.h"
 
 static inline void vgic_v2_write_lr(int lr, u32 val)
@@ -147,6 +148,79 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 	cpuif->used_lrs = 0;
 }
 
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+	struct kvm_vcpu *target_vcpu = NULL;
+	bool mmio = false;
+	struct vgic_irq *irq;
+	unsigned long flags;
+	u64 lr = 0;
+	u8 cpuid;
+
+	/* Snapshot CPUID, and remove it from the INTID */
+	cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+	val &= ~GENMASK_ULL(12, 10);
+
+	/* We only deal with DIR when EOIMode==1 */
+	if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK))
+		return;
+
+	/* Make sure we're in the same context as LR handling */
+	local_irq_save(flags);
+
+	irq = vgic_get_vcpu_irq(vcpu, val);
+	if (WARN_ON_ONCE(!irq))
+		goto out;
+
+	/* See the corresponding v3 code for the rationale */
+	scoped_guard(raw_spinlock, &irq->irq_lock) {
+		target_vcpu = irq->vcpu;
+
+		/* Not on any ap_list? */
+		if (!target_vcpu)
+			goto put;
+
+		/*
+		 * Urgh. We're deactivating something that we cannot
+		 * observe yet... Big hammer time.
+		 */
+		if (irq->on_lr) {
+			mmio = true;
+			goto put;
+		}
+
+		/* SGI: check that the cpuid matches */
+		if (val < VGIC_NR_SGIS && irq->active_source != cpuid) {
+			target_vcpu = NULL;
+			goto put;
+		}
+
+		/* (with a Dalek voice) DEACTIVATE!!!! */
+		lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+	}
+
+	if (lr & GICH_LR_HW)
+		writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+			       kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+
+	vgic_v2_fold_lr(vcpu, lr);
+
+put:
+	vgic_put_irq(vcpu->kvm, irq);
+
+out:
+	local_irq_restore(flags);
+
+	if (mmio)
+		vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+	/* Force the ap_list to be pruned */
+	if (target_vcpu)
+		kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
+}
+
 static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
 {
 	u32 val = irq->intid;
@@ -346,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
 int vgic_v2_map_resources(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
+	unsigned int len;
 	int ret = 0;
 
 	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
@@ -369,6 +444,16 @@ int vgic_v2_map_resources(struct kvm *kvm)
 		return ret;
 	}
 
+	len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev);
+	dist->cpuif_iodev.base_addr = dist->vgic_cpu_base;
+	dist->cpuif_iodev.iodev_type = IODEV_CPUIF;
+	dist->cpuif_iodev.redist_vcpu = NULL;
+
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base,
+				      len, &dist->cpuif_iodev.dev);
+	if (ret)
+		return ret;
+
 	if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
 		ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
 					    kvm_vgic_global_state.vcpu_base,
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index e93bdb485f07..5f0fc96b4dc2 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -277,6 +277,7 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
 
 void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val);
 void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
 int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 6a4d3d205596..b261fb3968d0 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -287,6 +287,7 @@ struct vgic_dist {
 	struct vgic_irq		*spis;
 
 	struct vgic_io_device	dist_iodev;
+	struct vgic_io_device	cpuif_iodev;
 
 	bool			has_its;
 	bool			table_write_in_progress;
-- 
cgit v1.2.3


From d245f9b4ab806733a77e51a218ca7b8bc3135cd9 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:56:52 +1000
Subject: mm/zone_device: support large zone device private folios
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm: support device-private THP", v7.

This patch series introduces support for Transparent Huge Page (THP)
migration in zone device-private memory.  The implementation enables
efficient migration of large folios between system memory and
device-private memory.

Background

Current zone device-private memory implementation only supports PAGE_SIZE
granularity, leading to:
- Increased TLB pressure
- Inefficient migration between CPU and device memory

This series extends the existing zone device-private infrastructure to
support THP, leading to:
- Reduced page table overhead
- Improved memory bandwidth utilization
- Seamless fallback to base pages when needed

In my local testing (using lib/test_hmm) and a throughput test, the series
shows a 350% improvement in data transfer throughput and an 80% improvement
in latency.

These patches build on the earlier posts by Ralph Campbell [1]

Two new flags are added in vma_migration to select and mark compound
pages.  migrate_vma_setup(), migrate_vma_pages() and
migrate_vma_finalize() support migration of these pages when
MIGRATE_VMA_SELECT_COMPOUND is passed in as arguments.

The series also adds zone device awareness to (m)THP pages along with
fault handling of large zone device private pages.  page vma walk and the
rmap code is also zone device aware.  Support has also been added for
folios that might need to be split in the middle of migration (when the
src and dst do not agree on MIGRATE_PFN_COMPOUND), that occurs when src
side of the migration can migrate large pages, but the destination has not
been able to allocate large pages.  The code supported and used
folio_split() when migrating THP pages, this is used when
MIGRATE_VMA_SELECT_COMPOUND is not passed as an argument to
migrate_vma_setup().

The test infrastructure lib/test_hmm.c has been enhanced to support THP
migration.  A new ioctl to emulate failure of large page allocations has
been added to test the folio split code path.  hmm-tests.c has new test
cases for huge page migration and to test the folio split path.  A new
throughput test has been added as well.

The nouveau dmem code has been enhanced to use the new THP migration
capability.

mTHP support:

The patches hard-code HPAGE_PMD_NR in a few places, but the code has been
kept generic to support various order sizes.  With additional refactoring
of the code, support for different order sizes should be possible.

The future plan is to post enhancements to support mTHP with a rough
design as follows:

1. Add the notion of allowable thp orders to the HMM based test driver
2. For non PMD based THP paths in migrate_device.c, check to see if
   a suitable order is found and supported by the driver
3. Iterate across orders to check the highest supported order for migration
4. Migrate and finalize

The mTHP patches can be built on top of this series, the key design
elements that need to be worked out are infrastructure and driver support
for multiple ordered pages and their migration.

HMM support for large folios was added in 10b9feee2d0d ("mm/hmm:
populate PFNs from PMD swap entry").


This patch (of 16)

Add routines to support allocation of large order zone device folios and
helper functions for zone device folios, to check if a folio is device
private and helpers for setting zone device data.

When large folios are used, the existing page_free() callback in pgmap is
called when the folio is freed; this is true for both PAGE_SIZE and higher
order pages.

Zone device private large folios do not support deferred split and scan
like normal THP folios.

Link: https://lkml.kernel.org/r/20251001065707.920170-1-balbirs@nvidia.com
Link: https://lkml.kernel.org/r/20251001065707.920170-2-balbirs@nvidia.com
Link: https://lore.kernel.org/linux-mm/20201106005147.20113-1-rcampbell@nvidia.com/ [1]
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c       |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  2 +-
 drivers/gpu/drm/drm_pagemap.c            |  2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c   |  2 +-
 include/linux/memremap.h                 | 10 +++++++++-
 lib/test_hmm.c                           |  2 +-
 mm/memremap.c                            | 26 +++++++++++++++-----------
 mm/rmap.c                                |  6 +++++-
 8 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 03f8c34fa0a2..91f763410673 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -723,7 +723,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 
 	dpage = pfn_to_page(uvmem_pfn);
 	dpage->zone_device_data = pvt;
-	zone_device_page_init(dpage);
+	zone_device_page_init(dpage, 0);
 	return dpage;
 out_clear:
 	spin_lock(&kvmppc_uvmem_bitmap_lock);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 59a5a3fea65d..f6198e66dc5a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -218,7 +218,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
 	page = pfn_to_page(pfn);
 	svm_range_bo_ref(prange->svm_bo);
 	page->zone_device_data = prange->svm_bo;
-	zone_device_page_init(page);
+	zone_device_page_init(page, 0);
 }
 
 static void
diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
index 22c44807e3fe..46a8edb279dc 100644
--- a/drivers/gpu/drm/drm_pagemap.c
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -196,7 +196,7 @@ static void drm_pagemap_get_devmem_page(struct page *page,
 					struct drm_pagemap_zdd *zdd)
 {
 	page->zone_device_data = drm_pagemap_zdd_get(zdd);
-	zone_device_page_init(page);
+	zone_device_page_init(page, 0);
 }
 
 /**
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index ca4932a150e3..53cc1926b9da 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -318,7 +318,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
 			return NULL;
 	}
 
-	zone_device_page_init(page);
+	zone_device_page_init(page, 0);
 	return page;
 }
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e5951ba12a28..d2487a19cba2 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -206,7 +206,7 @@ static inline bool is_fsdax_page(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void zone_device_page_init(struct page *page);
+void zone_device_page_init(struct page *page, unsigned int order);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -215,6 +215,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn);
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long memremap_compat_align(void);
+
+static inline void zone_device_folio_init(struct folio *folio, unsigned int order)
+{
+	zone_device_page_init(&folio->page, order);
+	if (order)
+		folio_set_large_rmappable(folio);
+}
+
 #else
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 83e3d8208a54..24d82121cde8 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -627,7 +627,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 			goto error;
 	}
 
-	zone_device_page_init(dpage);
+	zone_device_page_init(dpage, 0);
 	dpage->zone_device_data = rpage;
 	return dpage;
 
diff --git a/mm/memremap.c b/mm/memremap.c
index 46cb1b0b6f72..e45dfb568710 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -416,20 +416,19 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
 void free_zone_device_folio(struct folio *folio)
 {
 	struct dev_pagemap *pgmap = folio->pgmap;
+	unsigned long nr = folio_nr_pages(folio);
+	int i;
 
 	if (WARN_ON_ONCE(!pgmap))
 		return;
 
 	mem_cgroup_uncharge(folio);
 
-	/*
-	 * Note: we don't expect anonymous compound pages yet. Once supported
-	 * and we could PTE-map them similar to THP, we'd have to clear
-	 * PG_anon_exclusive on all tail pages.
-	 */
 	if (folio_test_anon(folio)) {
-		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-		__ClearPageAnonExclusive(folio_page(folio, 0));
+		for (i = 0; i < nr; i++)
+			__ClearPageAnonExclusive(folio_page(folio, i));
+	} else {
+		VM_WARN_ON_ONCE(folio_test_large(folio));
 	}
 
 	/*
@@ -456,8 +455,8 @@ void free_zone_device_folio(struct folio *folio)
 	case MEMORY_DEVICE_COHERENT:
 		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
 			break;
-		pgmap->ops->page_free(folio_page(folio, 0));
-		put_dev_pagemap(pgmap);
+		pgmap->ops->page_free(&folio->page);
+		percpu_ref_put_many(&folio->pgmap->ref, nr);
 		break;
 
 	case MEMORY_DEVICE_GENERIC:
@@ -480,14 +479,19 @@ void free_zone_device_folio(struct folio *folio)
 	}
 }
 
-void zone_device_page_init(struct page *page)
+void zone_device_page_init(struct page *page, unsigned int order)
 {
+	VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES);
+
 	/*
 	 * Drivers shouldn't be allocating pages after calling
 	 * memunmap_pages().
 	 */
-	WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
+	WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order));
 	set_page_count(page, 1);
 	lock_page(page);
+
+	if (order)
+		prep_compound_page(page, order);
 }
 EXPORT_SYMBOL_GPL(zone_device_page_init);
diff --git a/mm/rmap.c b/mm/rmap.c
index 3c3cf3efa5f6..eaed5dfbb9b7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1733,9 +1733,13 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 	 * the folio is unmapped and at least one page is still mapped.
 	 *
 	 * Check partially_mapped first to ensure it is a large folio.
+	 *
+	 * Device private folios do not support deferred splitting and
+	 * shrinker based scanning of the folios to free.
 	 */
 	if (partially_mapped && folio_test_anon(folio) &&
-	    !folio_test_partially_mapped(folio))
+	    !folio_test_partially_mapped(folio) &&
+	    !folio_is_device_private(folio))
 		deferred_split_folio(folio, true);
 
 	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
-- 
cgit v1.2.3


From 3a5a06554566fcc9f7de7327cfc365ed384d396c Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:56:53 +1000
Subject: mm/zone_device: rename page_free callback to folio_free
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change page_free to folio_free to make the folio support for
zone device-private more consistent. The PCI P2PDMA callback
has also been updated and changed to folio_free() as a result.

For drivers that do not support folios (yet), the folio is
converted back into page via &folio->page and the page is used
as is, in the current callback implementation.

Link: https://lkml.kernel.org/r/20251001065707.920170-3-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/memory-model.rst        |  2 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c       |  5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  5 +++--
 drivers/gpu/drm/drm_pagemap.c            | 10 +++++-----
 drivers/gpu/drm/nouveau/nouveau_dmem.c   |  5 +++--
 drivers/pci/p2pdma.c                     |  5 +++--
 include/linux/memremap.h                 |  6 +++---
 lib/test_hmm.c                           |  5 +++--
 mm/memremap.c                            | 16 ++++++++--------
 9 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst
index 5f3eafbbc520..7957122039e8 100644
--- a/Documentation/mm/memory-model.rst
+++ b/Documentation/mm/memory-model.rst
@@ -165,7 +165,7 @@ The users of `ZONE_DEVICE` are:
 * pmem: Map platform persistent memory to be used as a direct-I/O target
   via DAX mappings.
 
-* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
+* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->folio_free()`
   event callbacks to allow a device-driver to coordinate memory management
   events related to device-memory, typically GPU memory. See
   Documentation/mm/hmm.rst.
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 91f763410673..e5000bef90f2 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1014,8 +1014,9 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
  * to a normal PFN during H_SVM_PAGE_OUT.
  * Gets called with kvm->arch.uvmem_lock held.
  */
-static void kvmppc_uvmem_page_free(struct page *page)
+static void kvmppc_uvmem_folio_free(struct folio *folio)
 {
+	struct page *page = &folio->page;
 	unsigned long pfn = page_to_pfn(page) -
 			(kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
 	struct kvmppc_uvmem_page_pvt *pvt;
@@ -1034,7 +1035,7 @@ static void kvmppc_uvmem_page_free(struct page *page)
 }
 
 static const struct dev_pagemap_ops kvmppc_uvmem_ops = {
-	.page_free = kvmppc_uvmem_page_free,
+	.folio_free = kvmppc_uvmem_folio_free,
 	.migrate_to_ram	= kvmppc_uvmem_migrate_to_ram,
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index f6198e66dc5a..6f1617436f4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -568,8 +568,9 @@ out:
 	return r < 0 ? r : 0;
 }
 
-static void svm_migrate_page_free(struct page *page)
+static void svm_migrate_folio_free(struct folio *folio)
 {
+	struct page *page = &folio->page;
 	struct svm_range_bo *svm_bo = page->zone_device_data;
 
 	if (svm_bo) {
@@ -1009,7 +1010,7 @@ out_mmput:
 }
 
 static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
-	.page_free		= svm_migrate_page_free,
+	.folio_free		= svm_migrate_folio_free,
 	.migrate_to_ram		= svm_migrate_to_ram,
 };
 
diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
index 46a8edb279dc..37d7cfbbb3e8 100644
--- a/drivers/gpu/drm/drm_pagemap.c
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -752,15 +752,15 @@ err_out:
 }
 
 /**
- * drm_pagemap_page_free() - Put GPU SVM zone device data associated with a page
- * @page: Pointer to the page
+ * drm_pagemap_folio_free() - Put GPU SVM zone device data associated with a folio
+ * @folio: Pointer to the folio
  *
  * This function is a callback used to put the GPU SVM zone device data
  * associated with a page when it is being released.
  */
-static void drm_pagemap_page_free(struct page *page)
+static void drm_pagemap_folio_free(struct folio *folio)
 {
-	drm_pagemap_zdd_put(page->zone_device_data);
+	drm_pagemap_zdd_put(folio->page.zone_device_data);
 }
 
 /**
@@ -788,7 +788,7 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf)
 }
 
 static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = {
-	.page_free = drm_pagemap_page_free,
+	.folio_free = drm_pagemap_folio_free,
 	.migrate_to_ram = drm_pagemap_migrate_to_ram,
 };
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 53cc1926b9da..d34288ebe7d2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -108,8 +108,9 @@ unsigned long nouveau_dmem_page_addr(struct page *page)
 	return chunk->bo->offset + off;
 }
 
-static void nouveau_dmem_page_free(struct page *page)
+static void nouveau_dmem_folio_free(struct folio *folio)
 {
+	struct page *page = &folio->page;
 	struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page);
 	struct nouveau_dmem *dmem = chunk->drm->dmem;
 
@@ -220,7 +221,7 @@ done:
 }
 
 static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
-	.page_free		= nouveau_dmem_page_free,
+	.folio_free		= nouveau_dmem_folio_free,
 	.migrate_to_ram		= nouveau_dmem_migrate_to_ram,
 };
 
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 78e108e47254..ee74b75d3e1f 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -200,8 +200,9 @@ static const struct attribute_group p2pmem_group = {
 	.name = "p2pmem",
 };
 
-static void p2pdma_page_free(struct page *page)
+static void p2pdma_folio_free(struct folio *folio)
 {
+	struct page *page = &folio->page;
 	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
 	/* safe to dereference while a reference is held to the percpu ref */
 	struct pci_p2pdma *p2pdma =
@@ -214,7 +215,7 @@ static void p2pdma_page_free(struct page *page)
 }
 
 static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
-	.page_free = p2pdma_page_free,
+	.folio_free = p2pdma_folio_free,
 };
 
 static void pci_p2pdma_release(void *data)
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index d2487a19cba2..cd28d1666801 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -77,11 +77,11 @@ enum memory_type {
 
 struct dev_pagemap_ops {
 	/*
-	 * Called once the page refcount reaches 0.  The reference count will be
+	 * Called once the folio refcount reaches 0.  The reference count will be
 	 * reset to one by the core code after the method is called to prepare
-	 * for handing out the page again.
+	 * for handing out the folio again.
 	 */
-	void (*page_free)(struct page *page);
+	void (*folio_free)(struct folio *folio);
 
 	/*
 	 * Used for private (un-addressable) device memory only.  Must migrate
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 24d82121cde8..9dbf265d1036 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1374,8 +1374,9 @@ static const struct file_operations dmirror_fops = {
 	.owner		= THIS_MODULE,
 };
 
-static void dmirror_devmem_free(struct page *page)
+static void dmirror_devmem_free(struct folio *folio)
 {
+	struct page *page = &folio->page;
 	struct page *rpage = BACKING_PAGE(page);
 	struct dmirror_device *mdevice;
 
@@ -1438,7 +1439,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 }
 
 static const struct dev_pagemap_ops dmirror_devmem_ops = {
-	.page_free	= dmirror_devmem_free,
+	.folio_free	= dmirror_devmem_free,
 	.migrate_to_ram	= dmirror_devmem_fault,
 };
 
diff --git a/mm/memremap.c b/mm/memremap.c
index e45dfb568710..4c2e0d68eb27 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -289,8 +289,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			WARN(1, "Missing migrate_to_ram method\n");
 			return ERR_PTR(-EINVAL);
 		}
-		if (!pgmap->ops->page_free) {
-			WARN(1, "Missing page_free method\n");
+		if (!pgmap->ops->folio_free) {
+			WARN(1, "Missing folio_free method\n");
 			return ERR_PTR(-EINVAL);
 		}
 		if (!pgmap->owner) {
@@ -299,8 +299,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 		}
 		break;
 	case MEMORY_DEVICE_COHERENT:
-		if (!pgmap->ops->page_free) {
-			WARN(1, "Missing page_free method\n");
+		if (!pgmap->ops->folio_free) {
+			WARN(1, "Missing folio_free method\n");
 			return ERR_PTR(-EINVAL);
 		}
 		if (!pgmap->owner) {
@@ -453,9 +453,9 @@ void free_zone_device_folio(struct folio *folio)
 	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
 	case MEMORY_DEVICE_COHERENT:
-		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
 			break;
-		pgmap->ops->page_free(&folio->page);
+		pgmap->ops->folio_free(folio);
 		percpu_ref_put_many(&folio->pgmap->ref, nr);
 		break;
 
@@ -472,9 +472,9 @@ void free_zone_device_folio(struct folio *folio)
 		break;
 
 	case MEMORY_DEVICE_PCI_P2PDMA:
-		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
 			break;
-		pgmap->ops->page_free(folio_page(folio, 0));
+		pgmap->ops->folio_free(folio);
 		break;
 	}
 }
-- 
cgit v1.2.3


From 368076f52ebeecd33e10a9f80905d7508b6b6149 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:56:54 +1000
Subject: mm/huge_memory: add device-private THP support to PMD operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend core huge page management functions to handle device-private THP
entries.  This enables proper handling of large device-private folios in
fundamental MM operations.

The following functions have been updated:

- copy_huge_pmd(): Handle device-private entries during fork/clone
- zap_huge_pmd(): Properly free device-private THP during munmap
- change_huge_pmd(): Support protection changes on device-private THP
- ___pte_offset_map(): Add device-private entry awareness

Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h | 32 ++++++++++++++++++++++++++++
 mm/huge_memory.c        | 56 +++++++++++++++++++++++++++++++++++++++++--------
 mm/pgtable-generic.c    |  2 +-
 3 files changed, 80 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..2687928a8146 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 }
 #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+/**
+ * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
+ * @pmd: The PMD to check
+ *
+ * Checks whether the PMD is a swap-format (non-present) entry whose swap entry
+ * is a device private entry. Such PMDs are used to map device-private THPs and
+ * need special handling during various memory management operations such as
+ * fork, munmap and protection changes.
+ *
+ * Return: 1 if PMD contains device private entry, 0 otherwise
+ */
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
 static inline int non_swap_entry(swp_entry_t entry)
 {
 	return swp_type(entry) >= MAX_SWAPFILES;
 }
 
+static inline int is_pmd_non_present_folio_entry(pmd_t pmd)
+{
+	return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd);
+}
+
 #endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3ae16b4a82de..19f0ee7373ae 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1704,17 +1704,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(is_swap_pmd(pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
 
-		VM_BUG_ON(!is_pmd_migration_entry(pmd));
-		if (!is_readable_migration_entry(entry)) {
-			entry = make_readable_migration_entry(
-							swp_offset(entry));
+		VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd));
+
+		if (is_writable_migration_entry(entry) ||
+		    is_readable_exclusive_migration_entry(entry)) {
+			entry = make_readable_migration_entry(swp_offset(entry));
 			pmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*src_pmd))
 				pmd = pmd_swp_mksoft_dirty(pmd);
 			if (pmd_swp_uffd_wp(*src_pmd))
 				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
+		} else if (is_device_private_entry(entry)) {
+			/*
+			 * For device private entries, since there are no
+			 * read exclusive entries, writable = !readable
+			 */
+			if (is_writable_device_private_entry(entry)) {
+				entry = make_readable_device_private_entry(swp_offset(entry));
+				pmd = swp_entry_to_pmd(entry);
+
+				if (pmd_swp_soft_dirty(*src_pmd))
+					pmd = pmd_swp_mksoft_dirty(pmd);
+				if (pmd_swp_uffd_wp(*src_pmd))
+					pmd = pmd_swp_mkuffd_wp(pmd);
+				set_pmd_at(src_mm, addr, src_pmd, pmd);
+			}
+
+			src_folio = pfn_swap_entry_folio(entry);
+			VM_WARN_ON(!folio_test_large(src_folio));
+
+			folio_get(src_folio);
+			/*
+			 * folio_try_dup_anon_rmap_pmd does not fail for
+			 * device private entries.
+			 */
+			folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
+							dst_vma, src_vma);
 		}
+
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -2212,15 +2240,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			folio_remove_rmap_pmd(folio, page, vma);
 			WARN_ON_ONCE(folio_mapcount(folio) < 0);
 			VM_BUG_ON_PAGE(!PageHead(page), page);
-		} else if (thp_migration_supported()) {
+		} else if (is_pmd_non_present_folio_entry(orig_pmd)) {
 			swp_entry_t entry;
 
-			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
 			folio = pfn_swap_entry_folio(entry);
 			flush_needed = 0;
-		} else
-			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+			if (!thp_migration_supported())
+				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+		}
 
 		if (folio_test_anon(folio)) {
 			zap_deposited_table(tlb->mm, pmd);
@@ -2240,6 +2269,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				folio_mark_accessed(folio);
 		}
 
+		if (folio_is_device_private(folio)) {
+			folio_remove_rmap_pmd(folio, &folio->page, vma);
+			WARN_ON_ONCE(folio_mapcount(folio) < 0);
+			folio_put(folio);
+		}
+
 		spin_unlock(ptl);
 		if (flush_needed)
 			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2368,7 +2403,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
-		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+		VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd));
 		if (is_writable_migration_entry(entry)) {
 			/*
 			 * A protection check is difficult so
@@ -2381,6 +2416,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
+		} else if (is_writable_device_private_entry(entry)) {
+			entry = make_readable_device_private_entry(swp_offset(entry));
+			newpmd = swp_entry_to_pmd(entry);
 		} else {
 			newpmd = *pmd;
 		}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e46f0cf2159c..d3aec7a9926a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -292,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 
 	if (pmdvalp)
 		*pmdvalp = pmdval;
-	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
 		goto nomap;
 	if (unlikely(pmd_trans_huge(pmdval)))
 		goto nomap;
-- 
cgit v1.2.3


From a30b48bf1b244f11bf9b6d20cdccfe0c2264130c Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:56:58 +1000
Subject: mm/migrate_device: implement THP migration of zone device pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during
migrate_vma_setup() and MIGRATE_PFN_COMPOUND will mark device pages
being migrated as compound pages during device pfn migration.

migrate_device code paths go through the collect, setup and finalize
phases of migration.

The entries in src and dst arrays passed to these functions still remain
at a PAGE_SIZE granularity.  When a compound page is passed, the first
entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set
(MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries
(HPAGE_PMD_NR - 1) are filled with 0's.  This representation allows for
the compound page to be split into smaller page sizes.

migrate_vma_collect_hole() and migrate_vma_collect_pmd() are now THP page
aware.  Two new helper functions migrate_vma_collect_huge_pmd() and
migrate_vma_insert_huge_pmd_page() have been added.

migrate_vma_collect_huge_pmd() can collect THP pages, but if for some
reason this fails, there is fallback support to split the folio and
migrate it.

migrate_vma_insert_huge_pmd_page() closely follows the logic of
migrate_vma_insert_page().

Support for splitting pages as needed for migration will follow in later
patches in this series.

Link: https://lkml.kernel.org/r/20251001065707.920170-8-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |   2 +
 mm/migrate_device.c     | 469 +++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 408 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 1f0ac122c3bf..41b4cc05a450 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node)
 #define MIGRATE_PFN_VALID	(1UL << 0)
 #define MIGRATE_PFN_MIGRATE	(1UL << 1)
 #define MIGRATE_PFN_WRITE	(1UL << 3)
+#define MIGRATE_PFN_COMPOUND	(1UL << 4)
 #define MIGRATE_PFN_SHIFT	6
 
 static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
@@ -143,6 +144,7 @@ enum migrate_vma_direction {
 	MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
 	MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
 	MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
+	MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e6bcd6dc5129..a0a315f3572a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -14,6 +14,7 @@
 #include <linux/pagewalk.h>
 #include <linux/rmap.h>
 #include <linux/swapops.h>
+#include <linux/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
 	if (!vma_is_anonymous(walk->vma))
 		return migrate_vma_collect_skip(start, end, walk);
 
+	if (thp_migration_supported() &&
+		(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+		(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+		 IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+						MIGRATE_PFN_COMPOUND;
+		migrate->dst[migrate->npages] = 0;
+		migrate->npages++;
+		migrate->cpages++;
+
+		/*
+		 * Collect the remaining entries as holes, in case we
+		 * need to split later
+		 */
+		return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+	}
+
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
 		migrate->dst[migrate->npages] = 0;
@@ -103,57 +121,151 @@ static int migrate_vma_split_folio(struct folio *folio,
 	return 0;
 }
 
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
-				   unsigned long start,
-				   unsigned long end,
-				   struct mm_walk *walk)
+/** migrate_vma_collect_huge_pmd - collect THP pages without splitting the
+ * folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ * @fault_folio: folio associated with the fault if any
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD granularity
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+					unsigned long end, struct mm_walk *walk,
+					struct folio *fault_folio)
 {
+	struct mm_struct *mm = walk->mm;
+	struct folio *folio;
 	struct migrate_vma *migrate = walk->private;
-	struct folio *fault_folio = migrate->fault_page ?
-		page_folio(migrate->fault_page) : NULL;
-	struct vm_area_struct *vma = walk->vma;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long addr = start, unmapped = 0;
 	spinlock_t *ptl;
-	pte_t *ptep;
+	swp_entry_t entry;
+	int ret;
+	unsigned long write = 0;
 
-again:
-	if (pmd_none(*pmdp))
+	ptl = pmd_lock(mm, pmdp);
+	if (pmd_none(*pmdp)) {
+		spin_unlock(ptl);
 		return migrate_vma_collect_hole(start, end, -1, walk);
+	}
 
 	if (pmd_trans_huge(*pmdp)) {
-		struct folio *folio;
-
-		ptl = pmd_lock(mm, pmdp);
-		if (unlikely(!pmd_trans_huge(*pmdp))) {
+		if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
 			spin_unlock(ptl);
-			goto again;
+			return migrate_vma_collect_skip(start, end, walk);
 		}
 
 		folio = pmd_folio(*pmdp);
 		if (is_huge_zero_folio(folio)) {
 			spin_unlock(ptl);
-			split_huge_pmd(vma, pmdp, addr);
-		} else {
-			int ret;
+			return migrate_vma_collect_hole(start, end, -1, walk);
+		}
+		if (pmd_write(*pmdp))
+			write = MIGRATE_PFN_WRITE;
+	} else if (!pmd_present(*pmdp)) {
+		entry = pmd_to_swp_entry(*pmdp);
+		folio = pfn_swap_entry_folio(entry);
+
+		if (!is_device_private_entry(entry) ||
+			!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+			(folio->pgmap->owner != migrate->pgmap_owner)) {
+			spin_unlock(ptl);
+			return migrate_vma_collect_skip(start, end, walk);
+		}
 
-			folio_get(folio);
+		if (is_migration_entry(entry)) {
+			migration_entry_wait_on_locked(entry, ptl);
 			spin_unlock(ptl);
-			/* FIXME: we don't expect THP for fault_folio */
-			if (WARN_ON_ONCE(fault_folio == folio))
-				return migrate_vma_collect_skip(start, end,
-								walk);
-			if (unlikely(!folio_trylock(folio)))
-				return migrate_vma_collect_skip(start, end,
-								walk);
-			ret = split_folio(folio);
-			if (fault_folio != folio)
-				folio_unlock(folio);
-			folio_put(folio);
-			if (ret)
-				return migrate_vma_collect_skip(start, end,
-								walk);
+			return -EAGAIN;
+		}
+
+		if (is_writable_device_private_entry(entry))
+			write = MIGRATE_PFN_WRITE;
+	} else {
+		spin_unlock(ptl);
+		return -EAGAIN;
+	}
+
+	folio_get(folio);
+	if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+		spin_unlock(ptl);
+		folio_put(folio);
+		return migrate_vma_collect_skip(start, end, walk);
+	}
+
+	if (thp_migration_supported() &&
+		(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+		(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+		 IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+		struct page_vma_mapped_walk pvmw = {
+			.ptl = ptl,
+			.address = start,
+			.pmd = pmdp,
+			.vma = walk->vma,
+		};
+
+		unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+		migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+						| MIGRATE_PFN_MIGRATE
+						| MIGRATE_PFN_COMPOUND;
+		migrate->dst[migrate->npages++] = 0;
+		migrate->cpages++;
+		ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+		if (ret) {
+			migrate->npages--;
+			migrate->cpages--;
+			migrate->src[migrate->npages] = 0;
+			migrate->dst[migrate->npages] = 0;
+			goto fallback;
 		}
+		migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+		spin_unlock(ptl);
+		return 0;
+	}
+
+fallback:
+	spin_unlock(ptl);
+	if (!folio_test_large(folio))
+		goto done;
+	ret = split_folio(folio);
+	if (fault_folio != folio)
+		folio_unlock(folio);
+	folio_put(folio);
+	if (ret)
+		return migrate_vma_collect_skip(start, end, walk);
+	if (pmd_none(pmdp_get_lockless(pmdp)))
+		return migrate_vma_collect_hole(start, end, -1, walk);
+
+done:
+	return -ENOENT;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+				   unsigned long start,
+				   unsigned long end,
+				   struct mm_walk *walk)
+{
+	struct migrate_vma *migrate = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start, unmapped = 0;
+	spinlock_t *ptl;
+	struct folio *fault_folio = migrate->fault_page ?
+		page_folio(migrate->fault_page) : NULL;
+	pte_t *ptep;
+
+again:
+	if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+		int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
+
+		if (ret == -EAGAIN)
+			goto again;
+		if (ret == 0)
+			return 0;
 	}
 
 	ptep = pte_offset_map_lock(mm, pmdp, start, &ptl);
@@ -243,8 +355,7 @@ again:
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
 
-		/* FIXME support THP */
-		if (!page || !page->mapping || PageTransCompound(page)) {
+		if (!page || !page->mapping) {
 			mpfn = 0;
 			goto next;
 		}
@@ -415,14 +526,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
 	 */
 	int extra = 1 + (page == fault_page);
 
-	/*
-	 * FIXME support THP (transparent huge page), it is bit more complex to
-	 * check them than regular pages, because they can be mapped with a pmd
-	 * or with a pte (split pte mapping).
-	 */
-	if (folio_test_large(folio))
-		return false;
-
 	/* Page from ZONE_DEVICE have one extra reference */
 	if (folio_is_zone_device(folio))
 		extra++;
@@ -453,17 +556,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 
 	lru_add_drain();
 
-	for (i = 0; i < npages; i++) {
+	for (i = 0; i < npages; ) {
 		struct page *page = migrate_pfn_to_page(src_pfns[i]);
 		struct folio *folio;
+		unsigned int nr = 1;
 
 		if (!page) {
 			if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
 				unmapped++;
-			continue;
+			goto next;
 		}
 
 		folio =	page_folio(page);
+		nr = folio_nr_pages(folio);
+
+		if (nr > 1)
+			src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
+
 		/* ZONE_DEVICE folios are not on LRU */
 		if (!folio_is_zone_device(folio)) {
 			if (!folio_test_lru(folio) && allow_drain) {
@@ -475,7 +585,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 			if (!folio_isolate_lru(folio)) {
 				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
 				restore++;
-				continue;
+				goto next;
 			}
 
 			/* Drop the reference we took in collect */
@@ -494,10 +604,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 
 			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
 			restore++;
-			continue;
+			goto next;
 		}
 
 		unmapped++;
+next:
+		i += nr;
 	}
 
 	for (i = 0; i < npages && restore; i++) {
@@ -643,6 +755,160 @@ int migrate_vma_setup(struct migrate_vma *args)
 }
 EXPORT_SYMBOL(migrate_vma_setup);
 
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
+ * at @addr. folio is already allocated as a part of the migration process with
+ * large page.
+ *
+ * @page needs to be initialized and setup after it's allocated. The code bits
+ * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does
+ * not support THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the folio will be inserted
+ * @page: page to be inserted at @addr
+ * @src: src pfn which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+					 unsigned long addr,
+					 struct page *page,
+					 unsigned long *src,
+					 pmd_t *pmdp)
+{
+	struct vm_area_struct *vma = migrate->vma;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	struct folio *folio = page_folio(page);
+	int ret;
+	vm_fault_t csa_ret;
+	spinlock_t *ptl;
+	pgtable_t pgtable;
+	pmd_t entry;
+	bool flush = false;
+	unsigned long i;
+
+	VM_WARN_ON_FOLIO(!folio, folio);
+	VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+	if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+		return -EINVAL;
+
+	ret = anon_vma_prepare(vma);
+	if (ret)
+		return ret;
+
+	folio_set_order(folio, HPAGE_PMD_ORDER);
+	folio_set_large_rmappable(folio);
+
+	if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+		ret = -ENOMEM;
+		goto abort;
+	}
+
+	__folio_mark_uptodate(folio);
+
+	pgtable = pte_alloc_one(vma->vm_mm);
+	if (unlikely(!pgtable))
+		goto abort;
+
+	if (folio_is_device_private(folio)) {
+		swp_entry_t swp_entry;
+
+		if (vma->vm_flags & VM_WRITE)
+			swp_entry = make_writable_device_private_entry(
+						page_to_pfn(page));
+		else
+			swp_entry = make_readable_device_private_entry(
+						page_to_pfn(page));
+		entry = swp_entry_to_pmd(swp_entry);
+	} else {
+		if (folio_is_zone_device(folio) &&
+		    !folio_is_device_coherent(folio)) {
+			goto abort;
+		}
+		entry = folio_mk_pmd(folio, vma->vm_page_prot);
+		if (vma->vm_flags & VM_WRITE)
+			entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+	}
+
+	ptl = pmd_lock(vma->vm_mm, pmdp);
+	csa_ret = check_stable_address_space(vma->vm_mm);
+	if (csa_ret)
+		goto abort;
+
+	/*
+	 * Check for userfaultfd but do not deliver the fault. Instead,
+	 * just back off.
+	 */
+	if (userfaultfd_missing(vma))
+		goto unlock_abort;
+
+	if (!pmd_none(*pmdp)) {
+		if (!is_huge_zero_pmd(*pmdp))
+			goto unlock_abort;
+		flush = true;
+	} else if (!pmd_none(*pmdp))
+		goto unlock_abort;
+
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+	if (!folio_is_zone_device(folio))
+		folio_add_lru_vma(folio, vma);
+	folio_get(folio);
+
+	if (flush) {
+		pte_free(vma->vm_mm, pgtable);
+		flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE);
+		pmdp_invalidate(vma, addr, pmdp);
+	} else {
+		pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+		mm_inc_nr_ptes(vma->vm_mm);
+	}
+	set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+	update_mmu_cache_pmd(vma, addr, pmdp);
+
+	spin_unlock(ptl);
+
+	count_vm_event(THP_FAULT_ALLOC);
+	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+	return 0;
+
+unlock_abort:
+	spin_unlock(ptl);
+abort:
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		src[i] &= ~MIGRATE_PFN_MIGRATE;
+	return 0;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+					 unsigned long addr,
+					 struct page *page,
+					 unsigned long *src,
+					 pmd_t *pmdp)
+{
+	return 0;
+}
+#endif
+
+static unsigned long migrate_vma_nr_pages(unsigned long *src)
+{
+	unsigned long nr = 1;
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+	if (*src & MIGRATE_PFN_COMPOUND)
+		nr = HPAGE_PMD_NR;
+#else
+	if (*src & MIGRATE_PFN_COMPOUND)
+		VM_WARN_ON_ONCE(true);
+#endif
+	return nr;
+}
+
 /*
  * This code closely matches the code in:
  *   __handle_mm_fault()
@@ -653,9 +919,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
  */
 static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long addr,
-				    struct page *page,
+				    unsigned long *dst,
 				    unsigned long *src)
 {
+	struct page *page = migrate_pfn_to_page(*dst);
 	struct folio *folio = page_folio(page);
 	struct vm_area_struct *vma = migrate->vma;
 	struct mm_struct *mm = vma->vm_mm;
@@ -683,8 +950,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	pmdp = pmd_alloc(mm, pudp, addr);
 	if (!pmdp)
 		goto abort;
-	if (pmd_trans_huge(*pmdp))
-		goto abort;
+
+	if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+		int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+								src, pmdp);
+		if (ret)
+			goto abort;
+		return;
+	}
+
+	if (!pmd_none(*pmdp)) {
+		if (pmd_trans_huge(*pmdp)) {
+			if (!is_huge_zero_pmd(*pmdp))
+				goto abort;
+			split_huge_pmd(vma, pmdp, addr);
+		} else if (pmd_leaf(*pmdp))
+			goto abort;
+	}
+
 	if (pte_alloc(mm, pmdp))
 		goto abort;
 	if (unlikely(anon_vma_prepare(vma)))
@@ -775,23 +1058,24 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 	unsigned long i;
 	bool notified = false;
 
-	for (i = 0; i < npages; i++) {
+	for (i = 0; i < npages; ) {
 		struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
 		struct page *page = migrate_pfn_to_page(src_pfns[i]);
 		struct address_space *mapping;
 		struct folio *newfolio, *folio;
 		int r, extra_cnt = 0;
+		unsigned long nr = 1;
 
 		if (!newpage) {
 			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
-			continue;
+			goto next;
 		}
 
 		if (!page) {
 			unsigned long addr;
 
 			if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
-				continue;
+				goto next;
 
 			/*
 			 * The only time there is no vma is when called from
@@ -809,15 +1093,47 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 					migrate->pgmap_owner);
 				mmu_notifier_invalidate_range_start(&range);
 			}
-			migrate_vma_insert_page(migrate, addr, newpage,
+
+			if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+				(!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+				nr = migrate_vma_nr_pages(&src_pfns[i]);
+				src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+				goto next;
+			}
+
+			migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
 						&src_pfns[i]);
-			continue;
+			goto next;
 		}
 
 		newfolio = page_folio(newpage);
 		folio = page_folio(page);
 		mapping = folio_mapping(folio);
 
+		/*
+		 * If THP migration is enabled, check if both src and dst
+		 * can migrate large pages
+		 */
+		if (thp_migration_supported()) {
+			if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+				(src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+				!(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+				if (!migrate) {
+					src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+							 MIGRATE_PFN_COMPOUND);
+					goto next;
+				}
+				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+			} else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+				(dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+				!(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+			}
+		}
+
+
 		if (folio_is_device_private(newfolio) ||
 		    folio_is_device_coherent(newfolio)) {
 			if (mapping) {
@@ -830,7 +1146,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 				if (!folio_test_anon(folio) ||
 				    !folio_free_swap(folio)) {
 					src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
-					continue;
+					goto next;
 				}
 			}
 		} else if (folio_is_zone_device(newfolio)) {
@@ -838,7 +1154,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 			 * Other types of ZONE_DEVICE page are not supported.
 			 */
 			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
-			continue;
+			goto next;
 		}
 
 		BUG_ON(folio_test_writeback(folio));
@@ -850,6 +1166,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
 		else
 			folio_migrate_flags(newfolio, folio);
+next:
+		i += nr;
 	}
 
 	if (notified)
@@ -1011,10 +1329,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn)
 int migrate_device_range(unsigned long *src_pfns, unsigned long start,
 			unsigned long npages)
 {
-	unsigned long i, pfn;
+	unsigned long i, j, pfn;
+
+	for (pfn = start, i = 0; i < npages; pfn++, i++) {
+		struct page *page = pfn_to_page(pfn);
+		struct folio *folio = page_folio(page);
+		unsigned int nr = 1;
 
-	for (pfn = start, i = 0; i < npages; pfn++, i++)
 		src_pfns[i] = migrate_device_pfn_lock(pfn);
+		nr = folio_nr_pages(folio);
+		if (nr > 1) {
+			src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+			for (j = 1; j < nr; j++)
+				src_pfns[i+j] = 0;
+			i += j - 1;
+			pfn += j - 1;
+		}
+	}
 
 	migrate_device_unmap(src_pfns, npages, NULL);
 
@@ -1032,10 +1363,22 @@ EXPORT_SYMBOL(migrate_device_range);
  */
 int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
 {
-	unsigned long i;
+	unsigned long i, j;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = pfn_to_page(src_pfns[i]);
+		struct folio *folio = page_folio(page);
+		unsigned int nr = 1;
 
-	for (i = 0; i < npages; i++)
 		src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
+		nr = folio_nr_pages(folio);
+		if (nr > 1) {
+			src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+			for (j = 1; j < nr; j++)
+				src_pfns[i+j] = 0;
+			i += j - 1;
+		}
+	}
 
 	migrate_device_unmap(src_pfns, npages, NULL);
 
-- 
cgit v1.2.3


From 4964099163d0524a769d039ffa886bb4515136d0 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:56:59 +1000
Subject: mm/memory/fault: add THP fault handling for zone device private pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement CPU fault handling for zone device THP entries through
do_huge_pmd_device_private(), enabling transparent migration of
device-private large pages back to system memory on CPU access.

When the CPU accesses a zone device THP entry, the fault handler calls the
device driver's migrate_to_ram() callback to migrate the entire large page
back to system memory.

Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  7 +++++++
 mm/huge_memory.c        | 38 ++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  5 +++--
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fee4cf7fa300..82408c90b396 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -481,6 +481,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
+vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
+
 extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
 
@@ -662,6 +664,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
+static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
+{
+	return 0;
+}
+
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
 	return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 23db562cde07..ded707a50af8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1288,6 +1288,57 @@ release:
 
 }
 
+/*
+ * Handle a CPU fault on a PMD that holds a device-private swap entry.
+ *
+ * A fault taken under the per-VMA lock is dropped and retried.  Otherwise
+ * the PMD is re-checked under its page table lock and, if the folio behind
+ * the entry can be locked without waiting, the owning pgmap's
+ * ->migrate_to_ram() callback is invoked with a folio reference held.
+ *
+ * Returns VM_FAULT_RETRY for per-VMA-lock faults, the callback's result when
+ * it ran, and 0 when the PMD changed or the folio lock was contended.
+ */
+vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct folio *folio;
+	struct page *page;
+	spinlock_t *ptl;
+	vm_fault_t ret;
+
+	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+		vma_end_read(vma);
+		return VM_FAULT_RETRY;
+	}
+
+	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+
+	/* The entry changed since vmf->orig_pmd was sampled; nothing to do. */
+	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
+		goto out_unlock;
+
+	page = pfn_swap_entry_to_page(pmd_to_swp_entry(vmf->orig_pmd));
+	folio = page_folio(page);
+	vmf->page = page;
+	vmf->pte = NULL;
+
+	/* Only a trylock is attempted: the PMD lock is still held here. */
+	if (!folio_trylock(folio))
+		goto out_unlock;
+
+	folio_get(folio);
+	spin_unlock(ptl);
+	ret = page_pgmap(page)->ops->migrate_to_ram(vmf);
+	folio_unlock(folio);
+	folio_put(folio);
+	return ret;
+
+out_unlock:
+	spin_unlock(ptl);
+	return 0;
+}
+
 /*
  * always: directly stall for all thp allocations
  * defer: wake kswapd and fail if not immediately available
diff --git a/mm/memory.c b/mm/memory.c
index 27bc457b32c2..732414852570 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6345,8 +6345,9 @@ retry_pud:
 		vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
 
 		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
-			VM_BUG_ON(thp_migration_supported() &&
-					  !is_pmd_migration_entry(vmf.orig_pmd));
+			if (is_pmd_device_private_entry(vmf.orig_pmd))
+				return do_huge_pmd_device_private(&vmf);
+
 			if (is_pmd_migration_entry(vmf.orig_pmd))
 				pmd_migration_entry_wait(mm, vmf.pmd);
 			return 0;
-- 
cgit v1.2.3


From 775465fd26a325359887f9c3129444fcc76c6298 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:57:00 +1000
Subject: lib/test_hmm: add zone device private THP test infrastructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enhance the hmm test driver (lib/test_hmm) with support for THP pages.

A new pool of free folios (free_folios) has been added to the dmirror
device; a folio is taken from it when a THP zone device private page is
requested.

Add compound page awareness to the allocation function during normal
migration and fault based migration.  These routines now also copy
folio_nr_pages() pages when moving data between system memory and device
memory.

args.src and args.dst used to hold migration entries are now dynamically
allocated (as they need to hold HPAGE_PMD_NR entries or more).

Split and migrate support will be added in future patches in this series.

Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h |  12 ++
 lib/test_hmm.c           | 368 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 304 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index cd28d1666801..7df4dd037b69 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -177,6 +177,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio)
 		folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline void *folio_zone_device_data(const struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio);
+	return folio->page.zone_device_data;
+}
+
+static inline void folio_set_zone_device_data(struct folio *folio, void *data)
+{
+	VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio);
+	folio->page.zone_device_data = data;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 9dbf265d1036..32d402e80bcc 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -119,6 +119,7 @@ struct dmirror_device {
 	unsigned long		calloc;
 	unsigned long		cfree;
 	struct page		*free_pages;
+	struct folio		*free_folios;
 	spinlock_t		lock;		/* protects the above */
 };
 
@@ -492,7 +493,7 @@ fini:
 }
 
 static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
-				   struct page **ppage)
+				  struct page **ppage, bool is_large)
 {
 	struct dmirror_chunk *devmem;
 	struct resource *res = NULL;
@@ -572,20 +573,45 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
 		pfn_first, pfn_last);
 
 	spin_lock(&mdevice->lock);
-	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
+	for (pfn = pfn_first; pfn < pfn_last; ) {
 		struct page *page = pfn_to_page(pfn);
 
+		if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR)
+			&& (pfn + HPAGE_PMD_NR <= pfn_last)) {
+			page->zone_device_data = mdevice->free_folios;
+			mdevice->free_folios = page_folio(page);
+			pfn += HPAGE_PMD_NR;
+			continue;
+		}
+
 		page->zone_device_data = mdevice->free_pages;
 		mdevice->free_pages = page;
+		pfn++;
 	}
+
+	ret = 0;
 	if (ppage) {
-		*ppage = mdevice->free_pages;
-		mdevice->free_pages = (*ppage)->zone_device_data;
-		mdevice->calloc++;
+		if (is_large) {
+			if (!mdevice->free_folios) {
+				ret = -ENOMEM;
+				goto err_unlock;
+			}
+			*ppage = folio_page(mdevice->free_folios, 0);
+			mdevice->free_folios = (*ppage)->zone_device_data;
+			mdevice->calloc += HPAGE_PMD_NR;
+		} else if (mdevice->free_pages) {
+			*ppage = mdevice->free_pages;
+			mdevice->free_pages = (*ppage)->zone_device_data;
+			mdevice->calloc++;
+		} else {
+			ret = -ENOMEM;
+			goto err_unlock;
+		}
 	}
+err_unlock:
 	spin_unlock(&mdevice->lock);
 
-	return 0;
+	return ret;
 
 err_release:
 	mutex_unlock(&mdevice->devmem_lock);
@@ -598,10 +624,13 @@ err_devmem:
 	return ret;
 }
 
-static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
+static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror,
+					      bool is_large)
 {
 	struct page *dpage = NULL;
 	struct page *rpage = NULL;
+	unsigned int order = is_large ? HPAGE_PMD_ORDER : 0;
+	struct dmirror_device *mdevice = dmirror->mdevice;
 
 	/*
 	 * For ZONE_DEVICE private type, this is a fake device so we allocate
@@ -610,49 +639,55 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 	 * data and ignore rpage.
 	 */
 	if (dmirror_is_private_zone(mdevice)) {
-		rpage = alloc_page(GFP_HIGHUSER);
+		rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0);
 		if (!rpage)
 			return NULL;
 	}
 	spin_lock(&mdevice->lock);
 
-	if (mdevice->free_pages) {
+	if (is_large && mdevice->free_folios) {
+		dpage = folio_page(mdevice->free_folios, 0);
+		mdevice->free_folios = dpage->zone_device_data;
+		mdevice->calloc += 1 << order;
+		spin_unlock(&mdevice->lock);
+	} else if (!is_large && mdevice->free_pages) {
 		dpage = mdevice->free_pages;
 		mdevice->free_pages = dpage->zone_device_data;
 		mdevice->calloc++;
 		spin_unlock(&mdevice->lock);
 	} else {
 		spin_unlock(&mdevice->lock);
-		if (dmirror_allocate_chunk(mdevice, &dpage))
+		if (dmirror_allocate_chunk(mdevice, &dpage, is_large))
 			goto error;
 	}
 
-	zone_device_page_init(dpage, 0);
+	zone_device_folio_init(page_folio(dpage), order);
 	dpage->zone_device_data = rpage;
 	return dpage;
 
 error:
 	if (rpage)
-		__free_page(rpage);
+		__free_pages(rpage, order);
 	return NULL;
 }
 
 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
 					   struct dmirror *dmirror)
 {
-	struct dmirror_device *mdevice = dmirror->mdevice;
 	const unsigned long *src = args->src;
 	unsigned long *dst = args->dst;
 	unsigned long addr;
 
-	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
-						   src++, dst++) {
+	for (addr = args->start; addr < args->end; ) {
 		struct page *spage;
 		struct page *dpage;
 		struct page *rpage;
+		bool is_large = *src & MIGRATE_PFN_COMPOUND;
+		int write = (*src & MIGRATE_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
+		unsigned long nr = 1;
 
 		if (!(*src & MIGRATE_PFN_MIGRATE))
-			continue;
+			goto next;
 
 		/*
 		 * Note that spage might be NULL which is OK since it is an
@@ -662,17 +697,45 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
 		if (WARN(spage && is_zone_device_page(spage),
 		     "page already in device spage pfn: 0x%lx\n",
 		     page_to_pfn(spage)))
+			goto next;
+
+		dpage = dmirror_devmem_alloc_page(dmirror, is_large);
+		if (!dpage) {
+			struct folio *folio;
+			unsigned long i;
+			unsigned long spfn = *src >> MIGRATE_PFN_SHIFT;
+			struct page *src_page;
+
+			if (!is_large)
+				goto next;
+
+			if (!spage && is_large) {
+				nr = HPAGE_PMD_NR;
+			} else {
+				folio = page_folio(spage);
+				nr = folio_nr_pages(folio);
+			}
+
+			for (i = 0; i < nr && addr < args->end; i++) {
+				dpage = dmirror_devmem_alloc_page(dmirror, false);
+				rpage = BACKING_PAGE(dpage);
+				rpage->zone_device_data = dmirror;
+
+				*dst = migrate_pfn(page_to_pfn(dpage)) | write;
+				src_page = pfn_to_page(spfn + i);
+
+				if (spage)
+					copy_highpage(rpage, src_page);
+				else
+					clear_highpage(rpage);
+				src++;
+				dst++;
+				addr += PAGE_SIZE;
+			}
 			continue;
-
-		dpage = dmirror_devmem_alloc_page(mdevice);
-		if (!dpage)
-			continue;
+		}
 
 		rpage = BACKING_PAGE(dpage);
-		if (spage)
-			copy_highpage(rpage, spage);
-		else
-			clear_highpage(rpage);
 
 		/*
 		 * Normally, a device would use the page->zone_device_data to
@@ -684,10 +747,42 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
 
 		pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
 			 page_to_pfn(spage), page_to_pfn(dpage));
-		*dst = migrate_pfn(page_to_pfn(dpage));
-		if ((*src & MIGRATE_PFN_WRITE) ||
-		    (!spage && args->vma->vm_flags & VM_WRITE))
-			*dst |= MIGRATE_PFN_WRITE;
+
+		*dst = migrate_pfn(page_to_pfn(dpage)) | write;
+
+		if (is_large) {
+			int i;
+			struct folio *folio = page_folio(dpage);
+			*dst |= MIGRATE_PFN_COMPOUND;
+
+			if (folio_test_large(folio)) {
+				for (i = 0; i < folio_nr_pages(folio); i++) {
+					struct page *dst_page =
+						pfn_to_page(page_to_pfn(rpage) + i);
+					struct page *src_page =
+						pfn_to_page(page_to_pfn(spage) + i);
+
+					if (spage)
+						copy_highpage(dst_page, src_page);
+					else
+						clear_highpage(dst_page);
+					src++;
+					dst++;
+					addr += PAGE_SIZE;
+				}
+				continue;
+			}
+		}
+
+		if (spage)
+			copy_highpage(rpage, spage);
+		else
+			clear_highpage(rpage);
+
+next:
+		src++;
+		dst++;
+		addr += PAGE_SIZE;
 	}
 }
 
@@ -734,14 +829,17 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
 	const unsigned long *src = args->src;
 	const unsigned long *dst = args->dst;
 	unsigned long pfn;
+	const unsigned long start_pfn = start >> PAGE_SHIFT;
+	const unsigned long end_pfn = end >> PAGE_SHIFT;
 
 	/* Map the migrated pages into the device's page tables. */
 	mutex_lock(&dmirror->mutex);
 
-	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
-								src++, dst++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) {
 		struct page *dpage;
 		void *entry;
+		int nr, i;
+		struct page *rpage;
 
 		if (!(*src & MIGRATE_PFN_MIGRATE))
 			continue;
@@ -750,13 +848,25 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
 		if (!dpage)
 			continue;
 
-		entry = BACKING_PAGE(dpage);
-		if (*dst & MIGRATE_PFN_WRITE)
-			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
-		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
-		if (xa_is_err(entry)) {
-			mutex_unlock(&dmirror->mutex);
-			return xa_err(entry);
+		if (*dst & MIGRATE_PFN_COMPOUND)
+			nr = folio_nr_pages(page_folio(dpage));
+		else
+			nr = 1;
+
+		WARN_ON_ONCE(end_pfn < start_pfn + nr);
+
+		rpage = BACKING_PAGE(dpage);
+		VM_WARN_ON(folio_nr_pages(page_folio(rpage)) != nr);
+
+		for (i = 0; i < nr; i++) {
+			entry = folio_page(page_folio(rpage), i);
+			if (*dst & MIGRATE_PFN_WRITE)
+				entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
+			entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC);
+			if (xa_is_err(entry)) {
+				mutex_unlock(&dmirror->mutex);
+				return xa_err(entry);
+			}
 		}
 	}
 
@@ -829,31 +939,70 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
 	unsigned long start = args->start;
 	unsigned long end = args->end;
 	unsigned long addr;
+	unsigned int order = 0;
+	int i;
 
-	for (addr = start; addr < end; addr += PAGE_SIZE,
-				       src++, dst++) {
+	for (addr = start; addr < end; ) {
 		struct page *dpage, *spage;
 
 		spage = migrate_pfn_to_page(*src);
-		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
-			continue;
+		if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) {
+			/* Skip one entry; keep addr in step with src/dst */
+			order = 0;
+			goto next;
+		}
 
 		if (WARN_ON(!is_device_private_page(spage) &&
-			    !is_device_coherent_page(spage)))
-			continue;
+			    !is_device_coherent_page(spage))) {
+			/* Same single-entry skip as above */
+			order = 0;
+			goto next;
+		}
+
 		spage = BACKING_PAGE(spage);
-		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
-		if (!dpage)
-			continue;
-		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
-			 page_to_pfn(spage), page_to_pfn(dpage));
+		order = folio_order(page_folio(spage));
 
+		if (order)
+			dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE,
+						order, args->vma, addr), 0);
+		else
+			dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+
+		/* Try with smaller pages if large allocation fails */
+		if (!dpage && order) {
+			dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+			order = 0;
+		}
+		/* Covers a failed order-0 request as well as a failed fallback */
+		if (!dpage)
+			return VM_FAULT_OOM;
+
+		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
+				page_to_pfn(spage), page_to_pfn(dpage));
 		lock_page(dpage);
 		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
 		copy_highpage(dpage, spage);
 		*dst = migrate_pfn(page_to_pfn(dpage));
 		if (*src & MIGRATE_PFN_WRITE)
 			*dst |= MIGRATE_PFN_WRITE;
+		if (order)
+			*dst |= MIGRATE_PFN_COMPOUND;
+
+		for (i = 0; i < (1 << order); i++) {
+			struct page *src_page;
+			struct page *dst_page;
+
+			src_page = pfn_to_page(page_to_pfn(spage) + i);
+			dst_page = pfn_to_page(page_to_pfn(dpage) + i);
+
+			/* Drop the mirror entry of every page being copied */
+			xa_erase(&dmirror->pt, (addr >> PAGE_SHIFT) + i);
+			copy_highpage(dst_page, src_page);
+		}
+next:
+		addr += PAGE_SIZE << order;
+		src += 1 << order;
+		dst += 1 << order;
 	}
 	return 0;
 }
@@ -879,11 +1024,14 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
 	unsigned long size = cmd->npages << PAGE_SHIFT;
 	struct mm_struct *mm = dmirror->notifier.mm;
 	struct vm_area_struct *vma;
-	unsigned long src_pfns[32] = { 0 };
-	unsigned long dst_pfns[32] = { 0 };
 	struct migrate_vma args = { 0 };
 	unsigned long next;
 	int ret;
+	unsigned long *src_pfns;
+	unsigned long *dst_pfns;
+
+	src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
+	dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
 
 	start = cmd->addr;
 	end = start + size;
@@ -902,7 +1050,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
 			ret = -EINVAL;
 			goto out;
 		}
-		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+		next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT));
 		if (next > vma->vm_end)
 			next = vma->vm_end;
 
@@ -912,7 +1060,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
 		args.start = addr;
 		args.end = next;
 		args.pgmap_owner = dmirror->mdevice;
-		args.flags = dmirror_select_device(dmirror);
+		args.flags = dmirror_select_device(dmirror) | MIGRATE_VMA_SELECT_COMPOUND;
 
 		ret = migrate_vma_setup(&args);
 		if (ret)
@@ -928,6 +1076,8 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
 out:
 	mmap_read_unlock(mm);
 	mmput(mm);
+	kvfree(src_pfns);
+	kvfree(dst_pfns);
 
 	return ret;
 }
@@ -939,12 +1089,12 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 	unsigned long size = cmd->npages << PAGE_SHIFT;
 	struct mm_struct *mm = dmirror->notifier.mm;
 	struct vm_area_struct *vma;
-	unsigned long src_pfns[32] = { 0 };
-	unsigned long dst_pfns[32] = { 0 };
 	struct dmirror_bounce bounce;
 	struct migrate_vma args = { 0 };
 	unsigned long next;
 	int ret;
+	unsigned long *src_pfns = NULL;
+	unsigned long *dst_pfns = NULL;
 
 	start = cmd->addr;
 	end = start + size;
@@ -955,6 +1105,18 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 	if (!mmget_not_zero(mm))
 		return -EINVAL;
 
+	ret = -ENOMEM;
+	src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns),
+			  GFP_KERNEL | __GFP_NOFAIL);
+	if (!src_pfns)
+		goto free_mem;
+
+	dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns),
+			  GFP_KERNEL | __GFP_NOFAIL);
+	if (!dst_pfns)
+		goto free_mem;
+
+	ret = 0;
 	mmap_read_lock(mm);
 	for (addr = start; addr < end; addr = next) {
 		vma = vma_lookup(mm, addr);
@@ -962,7 +1124,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 			ret = -EINVAL;
 			goto out;
 		}
-		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+		next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT));
 		if (next > vma->vm_end)
 			next = vma->vm_end;
 
@@ -972,7 +1134,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 		args.start = addr;
 		args.end = next;
 		args.pgmap_owner = dmirror->mdevice;
-		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
+		args.flags = MIGRATE_VMA_SELECT_SYSTEM |
+				MIGRATE_VMA_SELECT_COMPOUND;
 		ret = migrate_vma_setup(&args);
 		if (ret)
 			goto out;
@@ -992,7 +1155,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 	 */
 	ret = dmirror_bounce_init(&bounce, start, size);
 	if (ret)
-		return ret;
+		goto free_mem;
 	mutex_lock(&dmirror->mutex);
 	ret = dmirror_do_read(dmirror, start, end, &bounce);
 	mutex_unlock(&dmirror->mutex);
@@ -1003,11 +1166,15 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 	}
 	cmd->cpages = bounce.cpages;
 	dmirror_bounce_fini(&bounce);
-	return ret;
+	goto free_mem;
 
 out:
 	mmap_read_unlock(mm);
 	mmput(mm);
+free_mem:
+	/* Both arrays come from kvcalloc(), so release them with kvfree() */
+	kvfree(src_pfns);
+	kvfree(dst_pfns);
 	return ret;
 }
 
@@ -1200,6 +1366,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
 	unsigned long i;
 	unsigned long *src_pfns;
 	unsigned long *dst_pfns;
+	unsigned int order = 0;
 
 	src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
 	dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
@@ -1215,13 +1382,25 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
 		if (WARN_ON(!is_device_private_page(spage) &&
 			    !is_device_coherent_page(spage)))
 			continue;
+
+		order = folio_order(page_folio(spage));
 		spage = BACKING_PAGE(spage);
-		dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+		if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
+			dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE,
+					      order), 0);
+		} else {
+			dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+			order = 0;
+		}
+
+		/* TODO Support splitting here */
 		lock_page(dpage);
-		copy_highpage(dpage, spage);
 		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
 		if (src_pfns[i] & MIGRATE_PFN_WRITE)
 			dst_pfns[i] |= MIGRATE_PFN_WRITE;
+		if (order)
+			dst_pfns[i] |= MIGRATE_PFN_COMPOUND;
+		folio_copy(page_folio(dpage), page_folio(spage));
 	}
 	migrate_device_pages(src_pfns, dst_pfns, npages);
 	migrate_device_finalize(src_pfns, dst_pfns, npages);
@@ -1234,7 +1413,12 @@ static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
 {
 	struct dmirror_device *mdevice = devmem->mdevice;
 	struct page *page;
+	struct folio *folio;
+
 
+	for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio))
+		if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem)
+			mdevice->free_folios = folio_zone_device_data(folio);
 	for (page = mdevice->free_pages; page; page = page->zone_device_data)
 		if (dmirror_page_to_chunk(page) == devmem)
 			mdevice->free_pages = page->zone_device_data;
@@ -1265,6 +1449,7 @@ static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
 		mdevice->devmem_count = 0;
 		mdevice->devmem_capacity = 0;
 		mdevice->free_pages = NULL;
+		mdevice->free_folios = NULL;
 		kfree(mdevice->devmem_chunks);
 		mdevice->devmem_chunks = NULL;
 	}
@@ -1379,18 +1564,30 @@ static void dmirror_devmem_free(struct folio *folio)
 	struct page *page = &folio->page;
 	struct page *rpage = BACKING_PAGE(page);
 	struct dmirror_device *mdevice;
+	struct folio *rfolio = page_folio(rpage);
+	unsigned int order = folio_order(rfolio);
 
-	if (rpage != page)
-		__free_page(rpage);
+	if (rpage != page) {
+		if (order)
+			__free_pages(rpage, order);
+		else
+			__free_page(rpage);
+		rpage = NULL;
+	}
 
 	mdevice = dmirror_page_to_device(page);
 	spin_lock(&mdevice->lock);
 
 	/* Return page to our allocator if not freeing the chunk */
 	if (!dmirror_page_to_chunk(page)->remove) {
-		mdevice->cfree++;
-		page->zone_device_data = mdevice->free_pages;
-		mdevice->free_pages = page;
+		mdevice->cfree += 1 << order;
+		if (order) {
+			page->zone_device_data = mdevice->free_folios;
+			mdevice->free_folios = page_folio(page);
+		} else {
+			page->zone_device_data = mdevice->free_pages;
+			mdevice->free_pages = page;
+		}
 	}
 	spin_unlock(&mdevice->lock);
 }
@@ -1398,36 +1595,55 @@ static void dmirror_devmem_free(struct folio *folio)
 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 {
 	struct migrate_vma args = { 0 };
-	unsigned long src_pfns = 0;
-	unsigned long dst_pfns = 0;
 	struct page *rpage;
 	struct dmirror *dmirror;
-	vm_fault_t ret;
+	vm_fault_t ret = 0;
+	unsigned int order, nr;
 
 	/*
 	 * Normally, a device would use the page->zone_device_data to point to
 	 * the mirror but here we use it to hold the page for the simulated
 	 * device memory and that page holds the pointer to the mirror.
 	 */
-	rpage = vmf->page->zone_device_data;
+	rpage = folio_zone_device_data(page_folio(vmf->page));
 	dmirror = rpage->zone_device_data;
 
 	/* FIXME demonstrate how we can adjust migrate range */
+	order = folio_order(page_folio(vmf->page));
+	nr = 1 << order;
+
+	/*
+	 * Consider a per-cpu cache of src and dst pfns, but with
+	 * large number of cpus that might not scale well.
+	 */
+	args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order));
 	args.vma = vmf->vma;
-	args.start = vmf->address;
-	args.end = args.start + PAGE_SIZE;
-	args.src = &src_pfns;
-	args.dst = &dst_pfns;
+	args.end = args.start + (PAGE_SIZE << order);
+
+	nr = (args.end - args.start) >> PAGE_SHIFT;
+	args.src = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL);
+	args.dst = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL);
 	args.pgmap_owner = dmirror->mdevice;
 	args.flags = dmirror_select_device(dmirror);
 	args.fault_page = vmf->page;
 
+	if (!args.src || !args.dst) {
+		ret = VM_FAULT_OOM;
+		goto err;
+	}
+
+	if (order)
+		args.flags |= MIGRATE_VMA_SELECT_COMPOUND;
+
-	if (migrate_vma_setup(&args))
-		return VM_FAULT_SIGBUS;
+	if (migrate_vma_setup(&args)) {
+		/* Leave through err so args.src and args.dst are freed */
+		ret = VM_FAULT_SIGBUS;
+		goto err;
+	}
 
 	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
 	if (ret)
-		return ret;
+		goto err;
 	migrate_vma_pages(&args);
 	/*
 	 * No device finalize step is needed since
@@ -1435,7 +1648,10 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 	 * invalidated the device page table.
 	 */
 	migrate_vma_finalize(&args);
-	return 0;
+err:
+	kfree(args.src);
+	kfree(args.dst);
+	return ret;
 }
 
 static const struct dev_pagemap_ops dmirror_devmem_ops = {
@@ -1466,7 +1682,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
 		return ret;
 
 	/* Build a list of free ZONE_DEVICE struct pages */
-	return dmirror_allocate_chunk(mdevice, NULL);
+	return dmirror_allocate_chunk(mdevice, NULL, false);
 }
 
 static void dmirror_device_remove(struct dmirror_device *mdevice)
-- 
cgit v1.2.3


From 56ef398996435a0021569b86293d376649f12540 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:57:01 +1000
Subject: mm/memremap: add driver callback support for folio splitting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a zone device page is split (via huge pmd folio split), the driver
callback for folio_split is invoked to let the device driver know that the
folio has been split into folios of a smaller order.

For drivers that do not provide this callback, fall back to a default
implementation that copies the pgmap and mapping fields to the split folios.

Update the HMM test driver to handle the split.

Link: https://lkml.kernel.org/r/20251001065707.920170-11-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h | 29 +++++++++++++++++++++++++++++
 lib/test_hmm.c           | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7df4dd037b69..aca2b16d6889 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -100,6 +100,13 @@ struct dev_pagemap_ops {
 	 */
 	int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
 			      unsigned long nr_pages, int mf_flags);
+
+	/*
+	 * Used for private (un-addressable) device memory only.
+	 * This callback is used when a folio is split into
+	 * a smaller folio
+	 */
+	void (*folio_split)(struct folio *head, struct folio *tail);
 };
 
 #define PGMAP_ALTMAP_VALID	(1 << 0)
@@ -235,6 +242,35 @@ static inline void zone_device_folio_init(struct folio *folio, unsigned int orde
 		folio_set_large_rmappable(folio);
 }
 
+/*
+ * Propagate a device-private folio split to the newly created folio.
+ *
+ * Does nothing unless @original_folio is device private.  A driver that
+ * supplies ->folio_split() is called with both folios exactly as given,
+ * including a NULL @new_folio.  Otherwise a non-NULL @new_folio inherits
+ * the pgmap and page mapping of @original_folio.
+ */
+static inline void zone_device_private_split_cb(struct folio *original_folio,
+						struct folio *new_folio)
+{
+	struct dev_pagemap *pgmap;
+
+	if (!folio_is_device_private(original_folio))
+		return;
+
+	pgmap = original_folio->pgmap;
+	if (pgmap->ops->folio_split) {
+		pgmap->ops->folio_split(original_folio, new_folio);
+		return;
+	}
+
+	if (!new_folio)
+		return;
+
+	new_folio->pgmap = pgmap;
+	new_folio->page.mapping = original_folio->page.mapping;
+}
+
 #else
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
@@ -268,6 +292,11 @@ static inline unsigned long memremap_compat_align(void)
 {
 	return PAGE_SIZE;
 }
+
+static inline void zone_device_private_split_cb(struct folio *original_folio,
+						struct folio *new_folio)
+{
+}
 #endif /* CONFIG_ZONE_DEVICE */
 
 static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 32d402e80bcc..46fa9e200db8 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1654,9 +1654,44 @@ err:
 	return ret;
 }
 
+static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail)
+{
+	struct page *rpage = BACKING_PAGE(folio_page(head, 0));
+	struct page *rpage_tail;
+	struct folio *rfolio;
+	unsigned long offset = 0;
+
+	if (!rpage) {
+		tail->page.zone_device_data = NULL;
+		return;
+	}
+
+	rfolio = page_folio(rpage);
+
+	if (tail == NULL) {
+		folio_reset_order(rfolio);
+		rfolio->mapping = NULL;
+		folio_set_count(rfolio, 1);
+		return;
+	}
+
+	offset = folio_pfn(tail) - folio_pfn(head);
+
+	rpage_tail = folio_page(rfolio, offset);
+	tail->page.zone_device_data = rpage_tail;
+	rpage_tail->zone_device_data = rpage->zone_device_data;
+	clear_compound_head(rpage_tail);
+	rpage_tail->mapping = NULL;
+
+	folio_page(tail, 0)->mapping = folio_page(head, 0)->mapping;
+	tail->pgmap = head->pgmap;
+	folio_set_count(page_folio(rpage_tail), 1);
+}
+
 static const struct dev_pagemap_ops dmirror_devmem_ops = {
 	.folio_free	= dmirror_devmem_free,
 	.migrate_to_ram	= dmirror_devmem_fault,
+	.folio_split	= dmirror_devmem_folio_split,
 };
 
 static int dmirror_device_init(struct dmirror_device *mdevice, int id)
-- 
cgit v1.2.3


From 4265d67e405a41562634279ca1ededf79fdadcd7 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Wed, 1 Oct 2025 16:57:02 +1000
Subject: mm/migrate_device: add THP splitting during migration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement migrate_vma_split_pages() to handle THP splitting during the
migration process when destination cannot allocate compound pages.

This addresses the common scenario where migrate_vma_setup() succeeds with
MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate
large pages during the migration phase.

Key changes:
- migrate_vma_split_pages(): Split already-isolated pages during migration
- Enhanced folio_split() and __split_unmapped_folio() with isolated
  parameter to avoid redundant unmap/remap operations

This provides a fallback mechanism to ensure migration succeeds even when
large page allocation fails at the destination.

[matthew.brost@intel.com: add THP splitting during migration]
  Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com
Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 11 +++++--
 lib/test_hmm.c          |  9 +++++
 mm/huge_memory.c        | 46 ++++++++++++++------------
 mm/migrate_device.c     | 87 ++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 119 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 82408c90b396..ed99e6bd31ac 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -365,8 +365,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 		vm_flags_t vm_flags);
 
 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-		unsigned int new_order);
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+		unsigned int new_order, bool unmapped);
 int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
 bool uniform_split_supported(struct folio *folio, unsigned int new_order,
@@ -375,6 +375,13 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
 		bool warns);
 int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
 		struct list_head *list);
+
+static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+		unsigned int new_order)
+{
+	return __split_huge_page_to_list_to_order(page, list, new_order, false);
+}
+
 /*
  * try_folio_split_to_order - try to split a @folio at @page to @new_order using
  * non uniform split.
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 46fa9e200db8..df429670633e 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1612,6 +1612,15 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 	order = folio_order(page_folio(vmf->page));
 	nr = 1 << order;
 
+	/*
+	 * When folios are partially mapped, we can't rely on the folio
+	 * order of vmf->page as the folio might not be fully split yet
+	 */
+	if (vmf->pte) {
+		order = 0;
+		nr = 1;
+	}
+
 	/*
 	 * Consider a per-cpu cache of src and dst pfns, but with
 	 * large number of cpus that might not scale well.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ded707a50af8..81e511f1ed26 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3452,15 +3452,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
 		new_folio->mapping = folio->mapping;
 		new_folio->index = folio->index + i;
 
-		/*
-		 * page->private should not be set in tail pages. Fix up and warn once
-		 * if private is unexpectedly set.
-		 */
-		if (unlikely(new_folio->private)) {
-			VM_WARN_ON_ONCE_PAGE(true, new_head);
-			new_folio->private = NULL;
-		}
-
 		if (folio_test_swapcache(folio))
 			new_folio->swap.val = folio->swap.val + i;
 
@@ -3661,6 +3652,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
  * @lock_at: a page within @folio to be left locked to caller
  * @list: after-split folios will be put on it if non NULL
  * @uniform_split: perform uniform split or not (non-uniform split)
+ * @unmapped: The pages are already unmapped, they are migration entries.
  *
  * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
  * It is in charge of checking whether the split is supported or not and
@@ -3676,7 +3668,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
  */
 static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
-		struct list_head *list, bool uniform_split)
+		struct list_head *list, bool uniform_split, bool unmapped)
 {
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
@@ -3736,13 +3728,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		 * is taken to serialise against parallel split or collapse
 		 * operations.
 		 */
-		anon_vma = folio_get_anon_vma(folio);
-		if (!anon_vma) {
-			ret = -EBUSY;
-			goto out;
+		if (!unmapped) {
+			anon_vma = folio_get_anon_vma(folio);
+			if (!anon_vma) {
+				ret = -EBUSY;
+				goto out;
+			}
+			anon_vma_lock_write(anon_vma);
 		}
 		mapping = NULL;
-		anon_vma_lock_write(anon_vma);
 	} else {
 		unsigned int min_order;
 		gfp_t gfp;
@@ -3795,7 +3789,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		goto out_unlock;
 	}
 
-	unmap_folio(folio);
+	if (!unmapped)
+		unmap_folio(folio);
 
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
@@ -3882,10 +3877,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 
 			next = folio_next(new_folio);
 
+			zone_device_private_split_cb(folio, new_folio);
+
 			expected_refs = folio_expected_ref_count(new_folio) + 1;
 			folio_ref_unfreeze(new_folio, expected_refs);
 
-			lru_add_split_folio(folio, new_folio, lruvec, list);
+			if (!unmapped)
+				lru_add_split_folio(folio, new_folio, lruvec, list);
 
 			/*
 			 * Anonymous folio with swap cache.
@@ -3916,6 +3914,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 			__filemap_remove_folio(new_folio, NULL);
 			folio_put_refs(new_folio, nr_pages);
 		}
+
+		zone_device_private_split_cb(folio, NULL);
 		/*
 		 * Unfreeze @folio only after all page cache entries, which
 		 * used to point to it, have been updated with new folios.
@@ -3939,6 +3939,9 @@ fail:
 
 	local_irq_enable();
 
+	if (unmapped)
+		return ret;
+
 	if (nr_shmem_dropped)
 		shmem_uncharge(mapping->host, nr_shmem_dropped);
 
@@ -4029,12 +4032,13 @@ out:
  * Returns -EINVAL when trying to split to an order that is incompatible
  * with the folio. Splitting to order 0 is compatible with all folios.
  */
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-				     unsigned int new_order)
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+				     unsigned int new_order, bool unmapped)
 {
 	struct folio *folio = page_folio(page);
 
-	return __folio_split(folio, new_order, &folio->page, page, list, true);
+	return __folio_split(folio, new_order, &folio->page, page, list, true,
+				unmapped);
 }
 
 /*
@@ -4063,7 +4067,7 @@ int folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct list_head *list)
 {
 	return __folio_split(folio, new_order, split_at, &folio->page, list,
-			false);
+			false, false);
 }
 
 int min_order_for_split(struct folio *folio)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a0a315f3572a..ab373fd38961 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -309,6 +309,25 @@ again:
 			    pgmap->owner != migrate->pgmap_owner)
 				goto next;
 
+			folio = page_folio(page);
+			if (folio_test_large(folio)) {
+				int ret;
+
+				arch_leave_lazy_mmu_mode();
+				pte_unmap_unlock(ptep, ptl);
+				ret = migrate_vma_split_folio(folio,
+							  migrate->fault_page);
+
+				if (ret) {
+					if (unmapped)
+						flush_tlb_range(walk->vma, start, end);
+
+					return migrate_vma_collect_skip(addr, end, walk);
+				}
+
+				goto again;
+			}
+
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 					MIGRATE_PFN_MIGRATE;
 			if (is_writable_device_private_entry(entry))
@@ -885,6 +904,29 @@ abort:
 		src[i] &= ~MIGRATE_PFN_MIGRATE;
 	return 0;
 }
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+					    unsigned long idx, unsigned long addr,
+					    struct folio *folio)
+{
+	unsigned long i;
+	unsigned long pfn;
+	unsigned long flags;
+	int ret = 0;
+
+	folio_get(folio);
+	split_huge_pmd_address(migrate->vma, addr, true);
+	ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL,
+							0, true);
+	if (ret)
+		return ret;
+	migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
+	flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1);
+	pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT;
+	for (i = 1; i < HPAGE_PMD_NR; i++)
+		migrate->src[i+idx] = migrate_pfn(pfn + i) | flags;
+	return ret;
+}
 #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
 static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
 					 unsigned long addr,
@@ -894,6 +936,13 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
 {
 	return 0;
 }
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+					    unsigned long idx, unsigned long addr,
+					    struct folio *folio)
+{
+	return 0;
+}
 #endif
 
 static unsigned long migrate_vma_nr_pages(unsigned long *src)
@@ -1055,8 +1104,9 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 				struct migrate_vma *migrate)
 {
 	struct mmu_notifier_range range;
-	unsigned long i;
+	unsigned long i, j;
 	bool notified = false;
+	unsigned long addr;
 
 	for (i = 0; i < npages; ) {
 		struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
@@ -1098,12 +1148,16 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 				(!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
 				nr = migrate_vma_nr_pages(&src_pfns[i]);
 				src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
-				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
-				goto next;
+			} else {
+				nr = 1;
 			}
 
-			migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
-						&src_pfns[i]);
+			for (j = 0; j < nr && i + j < npages; j++) {
+				src_pfns[i+j] |= MIGRATE_PFN_MIGRATE;
+				migrate_vma_insert_page(migrate,
+					addr + j * PAGE_SIZE,
+					&dst_pfns[i+j], &src_pfns[i+j]);
+			}
 			goto next;
 		}
 
@@ -1125,7 +1179,13 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 							 MIGRATE_PFN_COMPOUND);
 					goto next;
 				}
-				src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+				nr = 1 << folio_order(folio);
+				addr = migrate->start + i * PAGE_SIZE;
+				if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) {
+					src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+							 MIGRATE_PFN_COMPOUND);
+					goto next;
+				}
 			} else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
 				(dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
 				!(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
@@ -1161,11 +1221,16 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 
 		if (migrate && migrate->fault_page == page)
 			extra_cnt = 1;
-		r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
-		if (r)
-			src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
-		else
-			folio_migrate_flags(newfolio, folio);
+		for (j = 0; j < nr && i + j < npages; j++) {
+			folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
+			newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
+
+			r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
+			if (r)
+				src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
+			else
+				folio_migrate_flags(newfolio, folio);
+		}
 next:
 		i += nr;
 	}
-- 
cgit v1.2.3


From ac7756771a34f19c9a757eb86efe028e51f57b23 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 8 Oct 2025 09:54:53 +0000
Subject: mm/khugepaged: unify pmd folio installation with map_anon_folio_pmd()

Currently we install pmd folio with map_anon_folio_pmd() in
__do_huge_pmd_anonymous_page() and do_huge_zero_wp_pmd(), while in
collapse_huge_page() it is done with identical code except for the
statistics adjustment.

Unify the process with map_anon_folio_pmd() to install pmd folio.  Split
it to map_anon_folio_pmd_pf() and map_anon_folio_pmd_nopf() to be used in
page fault or not respectively.

No functional change is intended.

[akpm@linux-foundation.org: remove unneeded map_anon_folio_pmd_nopf() stub, per Wei & David]
Link: https://lkml.kernel.org/r/20251008095453.18772-3-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  2 ++
 mm/huge_memory.c        | 14 ++++++++++----
 mm/khugepaged.c         |  9 +--------
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ed99e6bd31ac..396d9e3d1d46 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -533,6 +533,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze);
 bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 			   pmd_t *pmdp, struct folio *folio);
+void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
+		struct vm_area_struct *vma, unsigned long haddr);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a2a2fda2bff8..05bf419513ad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1218,7 +1218,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	return folio;
 }
 
-static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
 		struct vm_area_struct *vma, unsigned long haddr)
 {
 	pmd_t entry;
@@ -1229,11 +1229,17 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
 	folio_add_lru_vma(folio, vma);
 	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
 	update_mmu_cache_pmd(vma, haddr, pmd);
+	deferred_split_folio(folio, false);
+}
+
+static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd,
+		struct vm_area_struct *vma, unsigned long haddr)
+{
+	map_anon_folio_pmd_nopf(folio, pmd, vma, haddr);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	count_vm_event(THP_FAULT_ALLOC);
 	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
 	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
-	deferred_split_folio(folio, false);
 }
 
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -1272,7 +1278,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 			return ret;
 		}
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-		map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+		map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 	}
@@ -1944,7 +1950,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
 	if (ret)
 		goto release;
 	(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
-	map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+	map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
 	goto unlock;
 release:
 	folio_put(folio);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1b5c2e942df9..af1c162c9a94 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1226,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	__folio_mark_uptodate(folio);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
-	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
-	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
-	folio_add_lru_vma(folio, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
-	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache_pmd(vma, address, pmd);
-	deferred_split_folio(folio, false);
+	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
 	spin_unlock(pmd_ptl);
 
 	folio = NULL;
-- 
cgit v1.2.3


From a7ef12c64fd991c0f42b2e1bf0c4f09068575864 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Fri, 31 Oct 2025 12:19:59 -0400
Subject: mm/huge_memory: add split_huge_page_to_order()

Patch series "Optimize folio split in memory failure", v5.

This patchset optimizes folio split operations in memory failure code by
always splitting a folio to min_order_for_split() to minimize unusable
pages, even if min_order_for_split() is non zero and memory failure code
would take the failed path eventually for a successfully split folio.

This means that, instead of making the entire original folio unusable,
memory failure code would only make its after-split folio, which has order
of min_order_for_split() and contains HWPoison page, unusable.

For soft offline case, since the original folio is still accessible, no
split is performed if the folio cannot be split to order-0 to prevent
potential performance loss.

In addition, add split_huge_page_to_order() to improve code readability
and fix kernel-doc comment format for folio_split() and other related
functions.

Background
==========

This patchset is a follow-up of "[PATCH v3] mm/huge_memory: do not change
split_huge_page*() target order silently."[1] and "[PATCH v4]
mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0
order"[2], since both are separated out as hotfixes.  It improves how
memory failure code handles large block size (LBS) folios with
min_order_for_split() > 0.  By splitting a large folio containing HW
poisoned pages to min_order_for_split(), the after-split folios without HW
poisoned pages could be freed for reuse.  To achieve this, folio split
code needs to set has_hwpoisoned on after-split folios containing HW
poisoned pages and it is done in the hotfix in [2].

This patchset includes:
1. A patch that adds split_huge_page_to_order(),
2. Patch 2 and Patch 3 of "[PATCH v2 0/3] Do not change split folio target
   order"[3],


This patch (of 3):

When the caller does not supply a list to
split_huge_page_to_list_to_order(), use split_huge_page_to_order()
instead.

Link: https://lkml.kernel.org/r/20251031162001.670503-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20251031162001.670503-2-ziy@nvidia.com
Link: https://lore.kernel.org/all/20251017013630.139907-1-ziy@nvidia.com/ [1]
Link: https://lore.kernel.org/all/20251023030521.473097-1-ziy@nvidia.com/ [2]
Link: https://lore.kernel.org/all/20251016033452.125479-1-ziy@nvidia.com/ [3]
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 396d9e3d1d46..a06924cf4065 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -381,6 +381,10 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct lis
 {
 	return __split_huge_page_to_list_to_order(page, list, new_order, false);
 }
+static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
+{
+	return split_huge_page_to_list_to_order(page, NULL, new_order);
+}
 
 /*
  * try_folio_split_to_order - try to split a @folio at @page to @new_order using
@@ -400,8 +404,7 @@ static inline int try_folio_split_to_order(struct folio *folio,
 		struct page *page, unsigned int new_order)
 {
 	if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
-		return split_huge_page_to_list_to_order(&folio->page, NULL,
-				new_order);
+		return split_huge_page_to_order(&folio->page, new_order);
 	return folio_split(folio, new_order, page, NULL);
 }
 static inline int split_huge_page(struct page *page)
@@ -587,6 +590,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	VM_WARN_ON_ONCE_PAGE(1, page);
 	return -EINVAL;
 }
+static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
+{
+	VM_WARN_ON_ONCE_PAGE(1, page);
+	return -EINVAL;
+}
 static inline int split_huge_page(struct page *page)
 {
 	VM_WARN_ON_ONCE_PAGE(1, page);
-- 
cgit v1.2.3


From 50d0598cf2c9d33e1f08c3b1a357752ea8a9b94a Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Fri, 31 Oct 2025 12:20:01 -0400
Subject: mm/huge_memory: fix kernel-doc comments for folio_split() and related

try_folio_split_to_order(), folio_split(), __folio_split(), and
__split_unmapped_folio() do not have correct kernel-doc comment format.
Fix them.

[ziy@nvidia.com: kernel-doc fixup]
  Link: https://lkml.kernel.org/r/BE7AC5F3-9E64-4923-861D-C2C4E0CB91EB@nvidia.com
[ziy@nvidia.com: add newline to fix an error and a warning from docutils]
  Link: https://lkml.kernel.org/r/040B38C0-23C6-4AEA-B069-69AE6DAA828B@nvidia.com
Link: https://lkml.kernel.org/r/20251031162001.670503-4-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 10 ++++++----
 mm/huge_memory.c        | 52 ++++++++++++++++++++++++++-----------------------
 2 files changed, 34 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a06924cf4065..9f7f7d772fe5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -386,9 +386,9 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o
 	return split_huge_page_to_list_to_order(page, NULL, new_order);
 }
 
-/*
- * try_folio_split_to_order - try to split a @folio at @page to @new_order using
- * non uniform split.
+/**
+ * try_folio_split_to_order() - try to split a @folio at @page to @new_order
+ * using non uniform split.
  * @folio: folio to be split
  * @page: split to @new_order at the given page
  * @new_order: the target split order
@@ -398,7 +398,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o
  * folios are put back to LRU list. Use min_order_for_split() to get the lower
  * bound of @new_order.
  *
- * Return: 0: split is successful, otherwise split failed.
+ * Return: 0 - split is successful, otherwise split failed.
  */
 static inline int try_folio_split_to_order(struct folio *folio,
 		struct page *page, unsigned int new_order)
@@ -483,6 +483,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 /**
  * folio_test_pmd_mappable - Can we map this folio with a PMD?
  * @folio: The folio to test
+ *
+ * Return: true - @folio can be mapped, false - @folio cannot be mapped.
  */
 static inline bool folio_test_pmd_mappable(struct folio *folio)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 30d6afc79016..3d87127c02cf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3493,8 +3493,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
 		ClearPageCompound(&folio->page);
 }
 
-/*
- * It splits an unmapped @folio to lower order smaller folios in two ways.
+/**
+ * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in
+ * two ways: uniform split or non-uniform split.
  * @folio: the to-be-split folio
  * @new_order: the smallest order of the after split folios (since buddy
  *             allocator like split generates folios with orders from @folio's
@@ -3511,26 +3512,27 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
  *    uniform_split is true.
  * 2. buddy allocator like (non-uniform) split: the given @folio is split into
  *    half and one of the half (containing the given page) is split into half
- *    until the given @page's order becomes @new_order. This is done when
+ *    until the given @folio's order becomes @new_order. This is done when
  *    uniform_split is false.
  *
  * The high level flow for these two methods are:
- * 1. uniform split: a single __split_folio_to_order() is called to split the
- *    @folio into @new_order, then we traverse all the resulting folios one by
- *    one in PFN ascending order and perform stats, unfreeze, adding to list,
- *    and file mapping index operations.
- * 2. non-uniform split: in general, folio_order - @new_order calls to
- *    __split_folio_to_order() are made in a for loop to split the @folio
- *    to one lower order at a time. The resulting small folios are processed
- *    like what is done during the traversal in 1, except the one containing
- *    @page, which is split in next for loop.
+ *
+ * 1. uniform split: @xas is split with no expectation of failure and a single
+ *    __split_folio_to_order() is called to split the @folio into @new_order
+ *    along with stats update.
+ * 2. non-uniform split: folio_order - @new_order calls to
+ *    __split_folio_to_order() are expected to be made in a for loop to split
+ *    the @folio to one lower order at a time. The folio containing @split_at
+ *    is split in each iteration. @xas is split into half in each iteration and
+ *    can fail. A failed @xas split leaves split folios as is without merging
+ *    them back.
  *
  * After splitting, the caller's folio reference will be transferred to the
- * folio containing @page. The caller needs to unlock and/or free after-split
- * folios if necessary.
+ * folio containing @split_at. The caller needs to unlock and/or free
+ * after-split folios if necessary.
  *
- * For !uniform_split, when -ENOMEM is returned, the original folio might be
- * split. The caller needs to check the input folio.
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
  */
 static int __split_unmapped_folio(struct folio *folio, int new_order,
 		struct page *split_at, struct xa_state *xas,
@@ -3650,8 +3652,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
 	return true;
 }
 
-/*
- * __folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * __folio_split() - split a folio at @split_at to a @new_order folio
  * @folio: folio to split
  * @new_order: the order of the new folio
  * @split_at: a page within the new folio
@@ -3669,7 +3671,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
  * 1. for uniform split, @lock_at points to one of @folio's subpages;
  * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
  *
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
  * split but not to @new_order, the caller needs to check)
  */
 static int __folio_split(struct folio *folio, unsigned int new_order,
@@ -4047,14 +4049,13 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
 				unmapped);
 }
 
-/*
- * folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * folio_split() - split a folio at @split_at to a @new_order folio
  * @folio: folio to split
  * @new_order: the order of the new folio
  * @split_at: a page within the new folio
- *
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
- * split but not to @new_order, the caller needs to check)
+ * @list: after-split folios are added to @list if not null, otherwise to LRU
+ *        list
  *
  * It has the same prerequisites and returns as
  * split_huge_page_to_list_to_order().
@@ -4068,6 +4069,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
  * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
  *
  * After split, folio is left locked for caller.
+ *
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
  */
 int folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct list_head *list)
-- 
cgit v1.2.3


From c467061fbb6eb483d59f546c145b2ff2249455e4 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Thu, 6 Nov 2025 03:41:54 +0000
Subject: mm/huge_memory: introduce enum split_type for clarity

Patch series "mm/huge_memory: Define split_type and consolidate split
support checks", v3.

This two-patch series focuses on improving code clarity and removing
redundancy in the huge memory handling logic related to folio splitting.

The series is based on an original proposal to merge two nearly
identical functions that check folio split support[1].  During this
process, we found an opportunity to improve readability by explicitly
defining the split types.

Patch 1: define split_type and use it
Patch 2: merge uniform_split_supported() and non_uniform_split_supported()


This patch (of 2):

We currently handle two distinct types of large folio splitting:
  * uniform split
  * non-uniform split

Differentiating between these types using a simple boolean variable is not
obvious and can harm code readability.

This commit introduces enum split_type to explicitly define these two
types.  Replacing the existing boolean variable with this enumeration
significantly improves code clarity and expressiveness when dealing with
folio splitting logic.

No functional change is expected.

[akpm@linux-foundation.org: tweak layout, per David]
Link: https://lkml.kernel.org/r/20251106034155.21398-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251106034155.21398-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  5 +++++
 mm/huge_memory.c        | 30 +++++++++++++++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9f7f7d772fe5..b74708dc5b5f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -364,6 +364,11 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 		unsigned long len, unsigned long pgoff, unsigned long flags,
 		vm_flags_t vm_flags);
 
+enum split_type {
+	SPLIT_TYPE_UNIFORM,
+	SPLIT_TYPE_NON_UNIFORM,
+};
+
 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order, bool unmapped);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d87127c02cf..4118f330c55e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3504,16 +3504,16 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
  *            will be split until its order becomes @new_order.
  * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
  * @mapping: @folio->mapping
- * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ * @split_type: if the split is uniform or not (buddy allocator like split)
  *
  *
  * 1. uniform split: the given @folio into multiple @new_order small folios,
  *    where all small folios have the same order. This is done when
- *    uniform_split is true.
+ *    split_type is SPLIT_TYPE_UNIFORM.
  * 2. buddy allocator like (non-uniform) split: the given @folio is split into
  *    half and one of the half (containing the given page) is split into half
  *    until the given @folio's order becomes @new_order. This is done when
- *    uniform_split is false.
+ *    split_type is SPLIT_TYPE_NON_UNIFORM.
  *
  * The high level flow for these two methods are:
  *
@@ -3536,11 +3536,11 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
  */
 static int __split_unmapped_folio(struct folio *folio, int new_order,
 		struct page *split_at, struct xa_state *xas,
-		struct address_space *mapping, bool uniform_split)
+		struct address_space *mapping, enum split_type split_type)
 {
 	const bool is_anon = folio_test_anon(folio);
 	int old_order = folio_order(folio);
-	int start_order = uniform_split ? new_order : old_order - 1;
+	int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1;
 	int split_order;
 
 	/*
@@ -3562,7 +3562,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
 			 * irq is disabled to allocate enough memory, whereas
 			 * non-uniform split can handle ENOMEM.
 			 */
-			if (uniform_split)
+			if (split_type == SPLIT_TYPE_UNIFORM)
 				xas_split(xas, folio, old_order);
 			else {
 				xas_set_order(xas, folio->index, split_order);
@@ -3659,7 +3659,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
  * @split_at: a page within the new folio
  * @lock_at: a page within @folio to be left locked to caller
  * @list: after-split folios will be put on it if non NULL
- * @uniform_split: perform uniform split or not (non-uniform split)
+ * @split_type: perform uniform split or not (non-uniform split)
  * @unmapped: The pages are already unmapped, they are migration entries.
  *
  * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
@@ -3676,7 +3676,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
  */
 static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
-		struct list_head *list, bool uniform_split, bool unmapped)
+		struct list_head *list, enum split_type split_type, bool unmapped)
 {
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
@@ -3711,10 +3711,10 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	if (new_order >= old_order)
 		return -EINVAL;
 
-	if (uniform_split && !uniform_split_supported(folio, new_order, true))
+	if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true))
 		return -EINVAL;
 
-	if (!uniform_split &&
+	if (split_type == SPLIT_TYPE_NON_UNIFORM &&
 	    !non_uniform_split_supported(folio, new_order, true))
 		return -EINVAL;
 
@@ -3764,7 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 			goto out;
 		}
 
-		if (uniform_split) {
+		if (split_type == SPLIT_TYPE_UNIFORM) {
 			xas_set_order(&xas, folio->index, new_order);
 			xas_split_alloc(&xas, folio, old_order, gfp);
 			if (xas_error(&xas)) {
@@ -3869,7 +3869,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		lruvec = folio_lruvec_lock(folio);
 
 		ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
-					     mapping, uniform_split);
+					     mapping, split_type);
 
 		/*
 		 * Unfreeze after-split folios and put them back to the right
@@ -4045,8 +4045,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
 {
 	struct folio *folio = page_folio(page);
 
-	return __folio_split(folio, new_order, &folio->page, page, list, true,
-				unmapped);
+	return __folio_split(folio, new_order, &folio->page, page, list,
+			     SPLIT_TYPE_UNIFORM, unmapped);
 }
 
 /**
@@ -4077,7 +4077,7 @@ int folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct list_head *list)
 {
 	return __folio_split(folio, new_order, split_at, &folio->page, list,
-			false, false);
+			     SPLIT_TYPE_NON_UNIFORM, false);
 }
 
 int min_order_for_split(struct folio *folio)
-- 
cgit v1.2.3


From 8a0e4bdddd1c998b894d879a1d22f1e745606215 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Thu, 6 Nov 2025 03:41:55 +0000
Subject: mm/huge_memory: merge uniform_split_supported() and
 non_uniform_split_supported()

uniform_split_supported() and non_uniform_split_supported() share
significantly similar logic.

The only functional difference is that uniform_split_supported() includes
an additional check on the requested @new_order.

The reason for this check comes from the following two aspects:

  * some file systems and the swap cache only support order-0 folios
  * the behavioral difference between uniform/non-uniform split

The behavioral difference between uniform split and non-uniform:

  * uniform split splits folio directly to @new_order
  * non-uniform split creates after-split folios with orders from
    folio_order(folio) - 1 to new_order.

This means that for a non-uniform split, or a uniform split to a non-zero
@new_order, we should check the file system and swap cache respectively.

This commit unifies the logic and merges the two functions into a single
combined helper, removing redundant code and simplifying the split
support checking mechanism.

Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com
Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages")
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  8 +++---
 mm/huge_memory.c        | 71 +++++++++++++++++++++----------------------------
 2 files changed, 33 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b74708dc5b5f..19d4a5f52ca2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -374,10 +374,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
 		unsigned int new_order, bool unmapped);
 int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
-bool uniform_split_supported(struct folio *folio, unsigned int new_order,
-		bool warns);
-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
-		bool warns);
+bool folio_split_supported(struct folio *folio, unsigned int new_order,
+		enum split_type split_type, bool warns);
 int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
 		struct list_head *list);
 
@@ -408,7 +406,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o
 static inline int try_folio_split_to_order(struct folio *folio,
 		struct page *page, unsigned int new_order)
 {
-	if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
+	if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false))
 		return split_huge_page_to_order(&folio->page, new_order);
 	return folio_split(folio, new_order, page, NULL);
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4118f330c55e..d79a4bb363de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3593,8 +3593,8 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
 	return 0;
 }
 
-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
-		bool warns)
+bool folio_split_supported(struct folio *folio, unsigned int new_order,
+		enum split_type split_type, bool warns)
 {
 	if (folio_test_anon(folio)) {
 		/* order-1 is not supported for anonymous THP. */
@@ -3602,48 +3602,41 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
 				"Cannot split to order-1 folio");
 		if (new_order == 1)
 			return false;
-	} else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
-	    !mapping_large_folio_support(folio->mapping)) {
-		/*
-		 * No split if the file system does not support large folio.
-		 * Note that we might still have THPs in such mappings due to
-		 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
-		 * does not actually support large folios properly.
-		 */
-		VM_WARN_ONCE(warns,
-			"Cannot split file folio to non-0 order");
-		return false;
-	}
-
-	/* Only swapping a whole PMD-mapped folio is supported */
-	if (folio_test_swapcache(folio)) {
-		VM_WARN_ONCE(warns,
-			"Cannot split swapcache folio to non-0 order");
-		return false;
-	}
-
-	return true;
-}
-
-/* See comments in non_uniform_split_supported() */
-bool uniform_split_supported(struct folio *folio, unsigned int new_order,
-		bool warns)
-{
-	if (folio_test_anon(folio)) {
-		VM_WARN_ONCE(warns && new_order == 1,
-				"Cannot split to order-1 folio");
-		if (new_order == 1)
-			return false;
-	} else  if (new_order) {
+	} else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) {
 		if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
 		    !mapping_large_folio_support(folio->mapping)) {
+			/*
+			 * We can always split a folio down to a single page
+			 * (new_order == 0) uniformly.
+			 *
+			 * For any other scenario
+			 *   a) uniform split targeting a large folio
+			 *      (new_order > 0)
+			 *   b) any non-uniform split
+			 * we must confirm that the file system supports large
+			 * folios.
+			 *
+			 * Note that we might still have THPs in such
+			 * mappings, which is created from khugepaged when
+			 * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that
+			 * case, the mapping does not actually support large
+			 * folios properly.
+			 */
 			VM_WARN_ONCE(warns,
 				"Cannot split file folio to non-0 order");
 			return false;
 		}
 	}
 
-	if (new_order && folio_test_swapcache(folio)) {
+	/*
+	 * swapcache folio could only be split to order 0
+	 *
+	 * non-uniform split creates after-split folios with orders from
+	 * folio_order(folio) - 1 to new_order, making it not suitable for any
+	 * swapcache folio split. Only uniform split to order-0 can be used
+	 * here.
+	 */
+	if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) {
 		VM_WARN_ONCE(warns,
 			"Cannot split swapcache folio to non-0 order");
 		return false;
@@ -3711,11 +3704,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	if (new_order >= old_order)
 		return -EINVAL;
 
-	if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true))
-		return -EINVAL;
-
-	if (split_type == SPLIT_TYPE_NON_UNIFORM &&
-	    !non_uniform_split_supported(folio, new_order, true))
+	if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true))
 		return -EINVAL;
 
 	is_hzp = is_huge_zero_folio(folio);
-- 
cgit v1.2.3


From c093cf451094a9a03c4d4929bc30122a53038b7b Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:19 +0000
Subject: mm: correctly handle UFFD PTE markers

Patch series "mm: remove is_swap_[pte, pmd]() + non-swap entries,
introduce leaf entries", v3.

There's an established convention in the kernel that we treat leaf page
tables (so far at the PTE, PMD level) as containing 'swap entries' should
they be neither empty (i.e.  p**_none() evaluating true) nor present (i.e.
p**_present() evaluating true).

However, at the same time we also have helper predicates - is_swap_pte(),
is_swap_pmd() - which are inconsistently used.

This is problematic, as it is logical to assume that should somebody wish
to operate upon a page table swap entry they should first check to see if
it is in fact one.

It also implies that perhaps, in future, we might introduce a non-present,
none page table entry that is not a swap entry.

This series resolves this issue by systematically eliminating all use of
the is_swap_pte() and is_swap_pmd() predicates so we retain only the
convention that should a leaf page table entry be neither none nor present
it is a swap entry.

We also have the further issue that 'swap entry' is unfortunately a really
rather overloaded term and in fact refers to both entries for swap and for
other information such as migration entries, page table markers, and
device private entries.

We therefore have the rather 'unique' concept of a 'non-swap' swap entry.

This series therefore introduces the concept of 'software leaf entries',
of type softleaf_t, to eliminate this confusion.

A software leaf entry in this sense is any page table entry which is
non-present, and represented by the softleaf_t type.  That is - page table
leaf entries which are software-controlled by the kernel.

This includes 'none' or empty entries, which are simply represented by a
zero leaf entry value.

In order to maintain compatibility as we transition the kernel to this new
type, we simply typedef swp_entry_t to softleaf_t.

We introduce a number of predicates and helpers to interact with software
leaf entries in include/linux/leafops.h which, as it imports swapops.h,
can be treated as a drop-in replacement for swapops.h wherever leaf entry
helpers are used.

Since softleaf_from_[pte, pmd]() treats present entries as if they were
empty/none leaf entries, this allows for a great deal of simplification of
code throughout the code base, which this series utilises a great deal.

We additionally change from swap entry to software leaf entry handling
where it makes sense to and eliminate functions from swapops.h where
software leaf entries obviate the need for the functions.


This patch (of 16):

PTE markers were previously only concerned with UFFD-specific logic - that
is, PTE entries with the UFFD WP marker set or those marked via
UFFDIO_POISON.

However since the introduction of guard markers in commit 7c53dfbdb024
("mm: add PTE_MARKER_GUARD PTE marker"), this has no longer been the case.

Issues have been avoided as guard regions are not permitted in conjunction
with UFFD, but it still leaves very confusing logic in place, most notably
the misleading and poorly named pte_none_mostly() and
huge_pte_none_mostly().

This predicate returns true for PTE entries that ought to be treated as
none, but only in certain circumstances, and on the assumption we are
dealing with H/W poison markers or UFFD WP markers.

This patch removes these functions and makes each invocation of these
functions instead explicitly check what it needs to check.

As part of this effort it introduces is_uffd_pte_marker() to explicitly
determine if a marker in fact is used as part of UFFD or not.

In the HMM logic we note that the only time we would need to check for a
fault is in the case of a UFFD WP marker, otherwise we simply encounter a
fault error (VM_FAULT_HWPOISON for H/W poisoned marker, VM_FAULT_SIGSEGV
for a guard marker), so only check for the UFFD WP case.

While we're here we also refactor code to make it easier to understand.

[akpm@linux-foundation.org: fix comment typo, per Mike]
Link: https://lkml.kernel.org/r/cover.1762812360.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/c38625fd9a1c1f1cf64ae8a248858e45b3dcdf11.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              | 93 ++++++++++++++++++++++++++-----------------
 include/asm-generic/hugetlb.h |  8 ----
 include/linux/swapops.h       | 18 ---------
 include/linux/userfaultfd_k.h | 21 ++++++++++
 mm/hmm.c                      |  7 +++-
 mm/hugetlb.c                  | 47 +++++++++++-----------
 mm/mincore.c                  | 17 ++++++--
 mm/userfaultfd.c              | 27 ++++++++-----
 8 files changed, 138 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 54c6cc7fe9c6..94c4d68f0818 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	pte_t *ptep, pte;
-	bool ret = true;
 
 	assert_fault_locked(vmf);
 
 	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
 	if (!ptep)
-		goto out;
+		return true;
 
-	ret = false;
 	pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
 
 	/*
 	 * Lockless access: we're in a wait_event so it's ok if it
-	 * changes under us.  PTE markers should be handled the same as none
-	 * ptes here.
+	 * changes under us.
+	 */
+
+	/* Entry is still missing, wait for userspace to resolve the fault. */
+	if (huge_pte_none(pte))
+		return true;
+	/* UFFD PTE markers require userspace to resolve the fault. */
+	if (is_uffd_pte_marker(pte))
+		return true;
+	/*
+	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+	 * resolve the fault.
 	 */
-	if (huge_pte_none_mostly(pte))
-		ret = true;
 	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
-		ret = true;
-out:
-	return ret;
+		return true;
+
+	return false;
 }
 #else
 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 					      struct vm_fault *vmf,
 					      unsigned long reason)
 {
-	return false;	/* should never get here */
+	/* Should never get here. */
+	VM_WARN_ON_ONCE(1);
+	return false;
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
 /*
- * Verify the pagetables are still not ok after having reigstered into
+ * Verify the pagetables are still not ok after having registered into
  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
  * userfault that has already been resolved, if userfaultfd_read_iter and
  * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
@@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
 	pte_t ptent;
-	bool ret = true;
+	bool ret;
 
 	assert_fault_locked(vmf);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		goto out;
+		return true;
 	p4d = p4d_offset(pgd, address);
 	if (!p4d_present(*p4d))
-		goto out;
+		return true;
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
-		goto out;
+		return true;
 	pmd = pmd_offset(pud, address);
 again:
 	_pmd = pmdp_get_lockless(pmd);
 	if (pmd_none(_pmd))
-		goto out;
+		return true;
 
-	ret = false;
+	/*
+	 * A race could arise which would result in a softleaf entry such as
+	 * migration entry unexpectedly being present in the PMD, so explicitly
+	 * check for this and bail out if so.
+	 */
 	if (!pmd_present(_pmd))
-		goto out;
+		return false;
 
-	if (pmd_trans_huge(_pmd)) {
-		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
-			ret = true;
-		goto out;
-	}
+	if (pmd_trans_huge(_pmd))
+		return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
 
 	pte = pte_offset_map(pmd, address);
-	if (!pte) {
-		ret = true;
+	if (!pte)
 		goto again;
-	}
+
 	/*
 	 * Lockless access: we're in a wait_event so it's ok if it
-	 * changes under us.  PTE markers should be handled the same as none
-	 * ptes here.
+	 * changes under us.
 	 */
 	ptent = ptep_get(pte);
-	if (pte_none_mostly(ptent))
-		ret = true;
+
+	ret = true;
+	/* Entry is still missing, wait for userspace to resolve the fault. */
+	if (pte_none(ptent))
+		goto out;
+	/* UFFD PTE markers require userspace to resolve the fault. */
+	if (is_uffd_pte_marker(ptent))
+		goto out;
+	/*
+	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+	 * resolve the fault.
+	 */
 	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
-		ret = true;
-	pte_unmap(pte);
+		goto out;
 
+	ret = false;
 out:
+	pte_unmap(pte);
 	return ret;
 }
 
@@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	set_current_state(blocking_state);
 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
-	if (!is_vm_hugetlb_page(vma))
-		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
-	else
+	if (is_vm_hugetlb_page(vma)) {
 		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
-	if (is_vm_hugetlb_page(vma))
 		hugetlb_vma_unlock_read(vma);
+	} else {
+		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
+	}
+
 	release_fault_lock(vmf);
 
 	if (likely(must_wait && !READ_ONCE(ctx->released))) {
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index dcb8727f2b82..e1a2e1b7c8e7 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -97,14 +97,6 @@ static inline int huge_pte_none(pte_t pte)
 }
 #endif
 
-/* Please refer to comments above pte_none_mostly() for the usage */
-#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
-static inline int huge_pte_none_mostly(pte_t pte)
-{
-	return huge_pte_none(pte) || is_pte_marker(pte);
-}
-#endif
-
 #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 		unsigned long addr, pte_t *ptep)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 2687928a8146..d1f665935cfc 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -469,24 +469,6 @@ static inline int is_guard_swp_entry(swp_entry_t entry)
 		(pte_marker_get(entry) & PTE_MARKER_GUARD);
 }
 
-/*
- * This is a special version to check pte_none() just to cover the case when
- * the pte is a pte marker.  It existed because in many cases the pte marker
- * should be seen as a none pte; it's just that we have stored some information
- * onto the none pte so it becomes not-none any more.
- *
- * It should be used when the pte is file-backed, ram-based and backing
- * userspace pages, like shmem.  It is not needed upon pgtables that do not
- * support pte markers at all.  For example, it's not needed on anonymous
- * memory, kernel-only memory (including when the system is during-boot),
- * non-ram based generic file-system.  It's fine to be used even there, but the
- * extra pte marker check will be pure overhead.
- */
-static inline int pte_none_mostly(pte_t pte)
-{
-	return pte_none(pte) || is_pte_marker(pte);
-}
-
 static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 {
 	struct page *p = pfn_to_page(swp_offset_pfn(entry));
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c0e716aec26a..da0b4fcc566f 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -479,4 +479,25 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte)
 	return false;
 }
 
+
+static inline bool is_uffd_pte_marker(pte_t pte)
+{
+	swp_entry_t entry;
+
+	if (pte_present(pte))
+		return false;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_pte_marker_entry(entry))
+		return false;
+
+	/* UFFD WP, poisoned swap entries are UFFD handled. */
+	if (pte_marker_entry_uffd_wp(entry))
+		return true;
+	if (is_poisoned_swp_entry(entry))
+		return true;
+
+	return false;
+}
+
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index a56081d67ad6..387a38bbaf6a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -244,7 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	uint64_t pfn_req_flags = *hmm_pfn;
 	uint64_t new_pfn_flags = 0;
 
-	if (pte_none_mostly(pte)) {
+	/*
+	 * Any other marker than a UFFD WP marker will result in a fault error
+	 * that will be correctly handled, so we need only check for UFFD WP
+	 * here.
+	 */
+	if (pte_none(pte) || pte_marker_uffd_wp(pte)) {
 		required_fault =
 			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
 		if (required_fault)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 106e61f6e12c..96c991f54f7a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6037,29 +6037,28 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
-	if (huge_pte_none_mostly(vmf.orig_pte)) {
-		if (is_pte_marker(vmf.orig_pte)) {
-			pte_marker marker =
-				pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
-
-			if (marker & PTE_MARKER_POISONED) {
-				ret = VM_FAULT_HWPOISON_LARGE |
-				      VM_FAULT_SET_HINDEX(hstate_index(h));
-				goto out_mutex;
-			} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
-				/* This isn't supported in hugetlb. */
-				ret = VM_FAULT_SIGSEGV;
-				goto out_mutex;
-			}
-		}
-
+	if (huge_pte_none(vmf.orig_pte))
 		/*
-		 * Other PTE markers should be handled the same way as none PTE.
-		 *
 		 * hugetlb_no_page will drop vma lock and hugetlb fault
 		 * mutex internally, which make us return immediately.
 		 */
 		return hugetlb_no_page(mapping, &vmf);
+
+	if (is_pte_marker(vmf.orig_pte)) {
+		const pte_marker marker =
+			pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
+
+		if (marker & PTE_MARKER_POISONED) {
+			ret = VM_FAULT_HWPOISON_LARGE |
+				VM_FAULT_SET_HINDEX(hstate_index(h));
+			goto out_mutex;
+		} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+			/* This isn't supported in hugetlb. */
+			ret = VM_FAULT_SIGSEGV;
+			goto out_mutex;
+		}
+
+		return hugetlb_no_page(mapping, &vmf);
 	}
 
 	ret = 0;
@@ -6228,6 +6227,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	int ret = -ENOMEM;
 	struct folio *folio;
 	bool folio_in_pagecache = false;
+	pte_t dst_ptep;
 
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
 		ptl = huge_pte_lock(h, dst_mm, dst_pte);
@@ -6367,13 +6367,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	if (folio_test_hwpoison(folio))
 		goto out_release_unlock;
 
+	ret = -EEXIST;
+
+	dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
 	/*
-	 * We allow to overwrite a pte marker: consider when both MISSING|WP
-	 * registered, we firstly wr-protect a none pte which has no page cache
-	 * page backing it, then access the page.
+	 * See comment about UFFD marker overwriting in
+	 * mfill_atomic_install_pte().
 	 */
-	ret = -EEXIST;
-	if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
+	if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep))
 		goto out_release_unlock;
 
 	if (folio_in_pagecache)
diff --git a/mm/mincore.c b/mm/mincore.c
index 8ec4719370e1..fb80becd6119 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -32,11 +32,22 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+
 	/*
 	 * Hugepages under user process are always in RAM and never
 	 * swapped out, but theoretically it needs to be checked.
 	 */
-	present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte));
+	if (!pte) {
+		present = 0;
+	} else {
+		const pte_t ptep = huge_ptep_get(walk->mm, addr, pte);
+
+		if (huge_pte_none(ptep) || is_pte_marker(ptep))
+			present = 0;
+		else
+			present = 1;
+	}
+
 	for (; addr != end; vec++, addr += PAGE_SIZE)
 		*vec = present;
 	walk->private = vec;
@@ -175,8 +186,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		pte_t pte = ptep_get(ptep);
 
 		step = 1;
-		/* We need to do cache lookup too for pte markers */
-		if (pte_none_mostly(pte))
+		/* We need to do cache lookup too for markers */
+		if (pte_none(pte) || is_pte_marker(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
 						 vma, vec);
 		else if (pte_present(pte)) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 00122f42718c..cc4ce205bbec 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	spinlock_t *ptl;
 	struct folio *folio = page_folio(page);
 	bool page_in_cache = folio_mapping(folio);
+	pte_t dst_ptep;
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
 	_dst_pte = pte_mkdirty(_dst_pte);
@@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	}
 
 	ret = -EEXIST;
+
+	dst_ptep = ptep_get(dst_pte);
+
 	/*
-	 * We allow to overwrite a pte marker: consider when both MISSING|WP
-	 * registered, we firstly wr-protect a none pte which has no page cache
-	 * page backing it, then access the page.
+	 * We are allowed to overwrite a UFFD pte marker: consider when both
+	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
+	 * page cache page backing it, then access the page.
 	 */
-	if (!pte_none_mostly(ptep_get(dst_pte)))
+	if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep))
 		goto out_unlock;
 
 	if (page_in_cache) {
@@ -583,12 +587,15 @@ retry:
 			goto out_unlock;
 		}
 
-		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
-		    !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
-			err = -EEXIST;
-			hugetlb_vma_unlock_read(dst_vma);
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			goto out_unlock;
+		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+			if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) {
+				err = -EEXIST;
+				hugetlb_vma_unlock_read(dst_vma);
+				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+				goto out_unlock;
+			}
 		}
 
 		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
-- 
cgit v1.2.3


From 68aa2fdbf57f769e552f472ddb762aba028a207e Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:20 +0000
Subject: mm: introduce leaf entry type and use to simplify leaf entry logic

The kernel maintains leaf page table entries which contain either:

 - Nothing ('none' entries)
 - Present entries*
 - Everything else that will cause a fault which the kernel handles

* Present entries are either entries the hardware can navigate without page
  fault or special cases like NUMA hint protnone or PMD with cleared
  present bit which contain hardware-valid entries modulo the present bit.

In the 'everything else' group we include swap entries, but we also
include a number of other things such as migration entries, device private
entries and marker entries.

Unfortunately this 'everything else' group expresses everything through a
swp_entry_t type, and these entries are referred to as swap entries even
though they may well not contain a...  swap entry.

This is compounded by the rather mind-boggling concept of a non-swap swap
entry (checked via non_swap_entry()) and the means by which we twist and
turn to satisfy this.

This patch lays the foundation for reducing this confusion.

We refer to 'everything else' as a 'software-defined leaf entry' or
'softleaf' for short.  And in fact we scoop up the 'none' entries into
this concept also so we are left with:

- Present entries.
- Softleaf entries (which may be empty).

This allows for radical simplification across the board - one can simply
convert any leaf page table entry to a leaf entry via softleaf_from_pte().

If the entry is present, we return an empty leaf entry, so it is assumed
the caller is aware that they must differentiate between the two
categories of page table entries, checking for the former via
pte_present().

As a result, we can eliminate a number of places where we would otherwise
need to use predicates to see if we can proceed with leaf page table entry
conversion and instead just go ahead and do it unconditionally.

We do so where we can, adjusting surrounding logic as necessary to
integrate the new softleaf_t logic as far as seems reasonable at this
stage.

We typedef swp_entry_t to softleaf_t for the time being until the
conversion can be complete, meaning everything remains compatible
regardless of which type is used.  We will eventually remove swp_entry_t
when the conversion is complete.

We introduce a new header file to keep things clear - leafops.h - this
imports swapops.h so can directly replace swapops imports without issue, and
we do so in all the files that require it.

Additionally, add new leafops.h file to core mm maintainers entry.

Link: https://lkml.kernel.org/r/c879383aac77d96a03e4d38f7daba893cd35fc76.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                   |   1 +
 fs/proc/task_mmu.c            |  26 +--
 fs/userfaultfd.c              |   6 +-
 include/linux/leafops.h       | 387 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_inline.h     |   6 +-
 include/linux/mm_types.h      |  25 +++
 include/linux/swapops.h       |  28 ---
 include/linux/userfaultfd_k.h |  51 +-----
 mm/hmm.c                      |   2 +-
 mm/hugetlb.c                  |  37 ++--
 mm/madvise.c                  |  16 +-
 mm/memory.c                   |  41 ++---
 mm/mincore.c                  |   6 +-
 mm/mprotect.c                 |   6 +-
 mm/mremap.c                   |   4 +-
 mm/page_vma_mapped.c          |  11 +-
 mm/shmem.c                    |   7 +-
 mm/userfaultfd.c              |   6 +-
 18 files changed, 502 insertions(+), 164 deletions(-)
 create mode 100644 include/linux/leafops.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6168d3aebdc1..5ca4caf73021 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16263,6 +16263,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
 F:	include/linux/gfp.h
 F:	include/linux/gfp_types.h
 F:	include/linux/highmem.h
+F:	include/linux/leafops.h
 F:	include/linux/memory.h
 F:	include/linux/mm.h
 F:	include/linux/mm_*.h
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index db16ed91c269..5a1e897b0973 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,7 +14,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/sched/mm.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
@@ -1231,11 +1231,11 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 	if (pte_present(ptent)) {
 		folio = page_folio(pte_page(ptent));
 		present = true;
-	} else if (is_swap_pte(ptent)) {
-		swp_entry_t swpent = pte_to_swp_entry(ptent);
+	} else {
+		const softleaf_t entry = softleaf_from_pte(ptent);
 
-		if (is_pfn_swap_entry(swpent))
-			folio = pfn_swap_entry_folio(swpent);
+		if (softleaf_has_pfn(entry))
+			folio = softleaf_to_folio(entry);
 	}
 
 	if (folio) {
@@ -1956,9 +1956,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		flags |= PM_SWAP;
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
-		if (pte_marker_entry_uffd_wp(entry))
+		if (softleaf_is_uffd_wp_marker(entry))
 			flags |= PM_UFFD_WP;
-		if (is_guard_swp_entry(entry))
+		if (softleaf_is_guard_marker(entry))
 			flags |=  PM_GUARD_REGION;
 	}
 
@@ -2331,18 +2331,18 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 		if (pte_soft_dirty(pte))
 			categories |= PAGE_IS_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
-		swp_entry_t swp;
+		softleaf_t entry;
 
 		categories |= PAGE_IS_SWAPPED;
 		if (!pte_swp_uffd_wp_any(pte))
 			categories |= PAGE_IS_WRITTEN;
 
-		swp = pte_to_swp_entry(pte);
-		if (is_guard_swp_entry(swp))
+		entry = softleaf_from_pte(pte);
+		if (softleaf_is_guard_marker(entry))
 			categories |= PAGE_IS_GUARD;
 		else if ((p->masks_of_interest & PAGE_IS_FILE) &&
-			 is_pfn_swap_entry(swp) &&
-			 !folio_test_anon(pfn_swap_entry_folio(swp)))
+			 softleaf_has_pfn(entry) &&
+			 !folio_test_anon(softleaf_to_folio(entry)))
 			categories |= PAGE_IS_FILE;
 
 		if (pte_swp_soft_dirty(pte))
@@ -2467,7 +2467,7 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
 {
 	unsigned long psize;
 
-	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+	if (is_hugetlb_entry_hwpoisoned(ptent) || pte_is_marker(ptent))
 		return;
 
 	psize = huge_page_size(hstate_vma(vma));
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 94c4d68f0818..3f539aabc3b3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,7 +29,7 @@
 #include <linux/ioctl.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/miscdevice.h>
 #include <linux/uio.h>
 
@@ -251,7 +251,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 	if (huge_pte_none(pte))
 		return true;
 	/* UFFD PTE markers require userspace to resolve the fault. */
-	if (is_uffd_pte_marker(pte))
+	if (pte_is_uffd_marker(pte))
 		return true;
 	/*
 	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
@@ -337,7 +337,7 @@ again:
 	if (pte_none(ptent))
 		goto out;
 	/* UFFD PTE markers require userspace to resolve the fault. */
-	if (is_uffd_pte_marker(ptent))
+	if (pte_is_uffd_marker(ptent))
 		goto out;
 	/*
 	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
new file mode 100644
index 000000000000..cff9d94fd5d1
--- /dev/null
+++ b/include/linux/leafops.h
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Describes operations that can be performed on software-defined page table
+ * leaf entries. These are abstracted from the hardware page table entries
+ * themselves by the softleaf_t type, see mm_types.h.
+ */
+#ifndef _LINUX_LEAFOPS_H
+#define _LINUX_LEAFOPS_H
+
+#include <linux/mm_types.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+
+#ifdef CONFIG_MMU
+
+/* Temporary until swp_entry_t eliminated. */
+#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT
+
+enum softleaf_type {
+	/* Fundamental types. */
+	SOFTLEAF_NONE,
+	SOFTLEAF_SWAP,
+	/* Migration types. */
+	SOFTLEAF_MIGRATION_READ,
+	SOFTLEAF_MIGRATION_READ_EXCLUSIVE,
+	SOFTLEAF_MIGRATION_WRITE,
+	/* Device types. */
+	SOFTLEAF_DEVICE_PRIVATE_READ,
+	SOFTLEAF_DEVICE_PRIVATE_WRITE,
+	SOFTLEAF_DEVICE_EXCLUSIVE,
+	/* H/W poison types. */
+	SOFTLEAF_HWPOISON,
+	/* Marker types. */
+	SOFTLEAF_MARKER,
+};
+
+/**
+ * softleaf_mk_none() - Create an empty ('none') leaf entry.
+ * Returns: empty leaf entry.
+ */
+static inline softleaf_t softleaf_mk_none(void)
+{
+	return ((softleaf_t) { 0 });
+}
+
+/**
+ * softleaf_from_pte() - Obtain a leaf entry from a PTE entry.
+ * @pte: PTE entry.
+ *
+ * If @pte is present (therefore not a leaf entry) the function returns an empty
+ * leaf entry. Otherwise, it returns a leaf entry.
+ *
+ * Returns: Leaf entry.
+ */
+static inline softleaf_t softleaf_from_pte(pte_t pte)
+{
+	if (pte_present(pte) || pte_none(pte))
+		return softleaf_mk_none();
+
+	/* Temporary until swp_entry_t eliminated. */
+	return pte_to_swp_entry(pte);
+}
+
+/**
+ * softleaf_is_none() - Is the leaf entry empty?
+ * @entry: Leaf entry.
+ *
+ * Empty entries are typically the result of a 'none' page table leaf entry
+ * being converted to a leaf entry.
+ *
+ * Returns: true if the entry is empty, false otherwise.
+ */
+static inline bool softleaf_is_none(softleaf_t entry)
+{
+	return entry.val == 0;
+}
+
+/**
+ * softleaf_type() - Identify the type of leaf entry.
+ * @entry: Leaf entry.
+ *
+ * Returns: the leaf entry type associated with @entry.
+ */
+static inline enum softleaf_type softleaf_type(softleaf_t entry)
+{
+	unsigned int type_num;
+
+	if (softleaf_is_none(entry))
+		return SOFTLEAF_NONE;
+
+	type_num = entry.val >> LEAF_TYPE_SHIFT;
+
+	if (type_num < MAX_SWAPFILES)
+		return SOFTLEAF_SWAP;
+
+	switch (type_num) {
+#ifdef CONFIG_MIGRATION
+	case SWP_MIGRATION_READ:
+		return SOFTLEAF_MIGRATION_READ;
+	case SWP_MIGRATION_READ_EXCLUSIVE:
+		return SOFTLEAF_MIGRATION_READ_EXCLUSIVE;
+	case SWP_MIGRATION_WRITE:
+		return SOFTLEAF_MIGRATION_WRITE;
+#endif
+#ifdef CONFIG_DEVICE_PRIVATE
+	case SWP_DEVICE_WRITE:
+		return SOFTLEAF_DEVICE_PRIVATE_WRITE;
+	case SWP_DEVICE_READ:
+		return SOFTLEAF_DEVICE_PRIVATE_READ;
+	case SWP_DEVICE_EXCLUSIVE:
+		return SOFTLEAF_DEVICE_EXCLUSIVE;
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	case SWP_HWPOISON:
+		return SOFTLEAF_HWPOISON;
+#endif
+	case SWP_PTE_MARKER:
+		return SOFTLEAF_MARKER;
+	}
+
+	/* Unknown entry type. */
+	VM_WARN_ON_ONCE(1);
+	return SOFTLEAF_NONE;
+}
+
+/**
+ * softleaf_is_swap() - Is this leaf entry a swap entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a swap entry, otherwise false.
+ */
+static inline bool softleaf_is_swap(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_SWAP;
+}
+
+/**
+ * softleaf_is_migration() - Is this leaf entry a migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a migration entry, otherwise false.
+ */
+static inline bool softleaf_is_migration(softleaf_t entry)
+{
+	switch (softleaf_type(entry)) {
+	case SOFTLEAF_MIGRATION_READ:
+	case SOFTLEAF_MIGRATION_READ_EXCLUSIVE:
+	case SOFTLEAF_MIGRATION_WRITE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * softleaf_is_device_private() - Is this leaf entry a device private entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device private entry, otherwise false.
+ */
+static inline bool softleaf_is_device_private(softleaf_t entry)
+{
+	switch (softleaf_type(entry)) {
+	case SOFTLEAF_DEVICE_PRIVATE_WRITE:
+	case SOFTLEAF_DEVICE_PRIVATE_READ:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device exclusive entry, otherwise false.
+ */
+static inline bool softleaf_is_device_exclusive(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE;
+}
+
+/**
+ * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a hardware poison entry, otherwise false.
+ */
+static inline bool softleaf_is_hwpoison(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_HWPOISON;
+}
+
+/**
+ * softleaf_is_marker() - Is this leaf entry a marker?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a marker entry, otherwise false.
+ */
+static inline bool softleaf_is_marker(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_MARKER;
+}
+
+/**
+ * softleaf_to_marker() - Obtain marker associated with leaf entry.
+ * @entry: Leaf entry, softleaf_is_marker(@entry) must return true.
+ *
+ * Returns: Marker associated with the leaf entry.
+ */
+static inline pte_marker softleaf_to_marker(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_is_marker(entry));
+
+	return swp_offset(entry) & PTE_MARKER_MASK;
+}
+
+/**
+ * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number?
+ * @entry: Leaf entry.
+ *
+ * A pfn swap entry is a special type of swap entry that always has a pfn stored
+ * in the swap offset. They can either be used to represent unaddressable device
+ * memory, to restrict access to a page undergoing migration or to represent a
+ * pfn which has been hwpoisoned and unmapped.
+ *
+ * Returns: true if the leaf entry encodes a PFN, otherwise false.
+ */
+static inline bool softleaf_has_pfn(softleaf_t entry)
+{
+	/* Make sure the swp offset can always store the needed fields. */
+	BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+
+	if (softleaf_is_migration(entry))
+		return true;
+	if (softleaf_is_device_private(entry))
+		return true;
+	if (softleaf_is_device_exclusive(entry))
+		return true;
+	if (softleaf_is_hwpoison(entry))
+		return true;
+
+	return false;
+}
+
+/**
+ * softleaf_to_pfn() - Obtain PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: The PFN associated with the leaf entry.
+ */
+static inline unsigned long softleaf_to_pfn(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+
+	/* Temporary until swp_entry_t eliminated. */
+	return swp_offset_pfn(entry);
+}
+
+/**
+ * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: Pointer to the struct page associated with the leaf entry's PFN.
+ */
+static inline struct page *softleaf_to_page(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+
+	/* Temporary until swp_entry_t eliminated. */
+	return pfn_swap_entry_to_page(entry);
+}
+
+/**
+ * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: Pointer to the struct folio associated with the leaf entry's PFN.
+ */
+static inline struct folio *softleaf_to_folio(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+
+	/* Temporary until swp_entry_t eliminated. */
+	return pfn_swap_entry_folio(entry);
+}
+
+/**
+ * softleaf_is_poison_marker() - Is this leaf entry a poison marker?
+ * @entry: Leaf entry.
+ *
+ * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific.
+ *
+ * Returns: true if the leaf entry is a poison marker, otherwise false.
+ */
+static inline bool softleaf_is_poison_marker(softleaf_t entry)
+{
+	if (!softleaf_is_marker(entry))
+		return false;
+
+	return softleaf_to_marker(entry) & PTE_MARKER_POISONED;
+}
+
+/**
+ * softleaf_is_guard_marker() - Is this leaf entry a guard region marker?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a guard marker, otherwise false.
+ */
+static inline bool softleaf_is_guard_marker(softleaf_t entry)
+{
+	if (!softleaf_is_marker(entry))
+		return false;
+
+	return softleaf_to_marker(entry) & PTE_MARKER_GUARD;
+}
+
+/**
+ * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfaultfd write protect
+ * marker?
+ * @entry: Leaf entry.
+ *
+ * Userfaultfd-specific.
+ *
+ * Returns: true if the leaf entry is a UFFD WP marker, otherwise false.
+ */
+static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry)
+{
+	if (!softleaf_is_marker(entry))
+		return false;
+
+	return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP;
+}
+
+/**
+ * pte_is_marker() - Does the PTE entry encode a marker leaf entry?
+ * @pte: PTE entry.
+ *
+ * Returns: true if this PTE is a marker leaf entry, otherwise false.
+ */
+static inline bool pte_is_marker(pte_t pte)
+{
+	return softleaf_is_marker(softleaf_from_pte(pte));
+}
+
+/**
+ * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write
+ * protect marker leaf entry?
+ * @pte: PTE entry.
+ *
+ * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false.
+ */
+static inline bool pte_is_uffd_wp_marker(pte_t pte)
+{
+	const softleaf_t entry = softleaf_from_pte(pte);
+
+	return softleaf_is_uffd_wp_marker(entry);
+}
+
+/**
+ * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker
+ * leaf entry?
+ * @pte: PTE entry.
+ *
+ * It's useful to be able to determine which leaf entries encode UFFD-specific
+ * markers so we can handle these correctly.
+ *
+ * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false.
+ */
+static inline bool pte_is_uffd_marker(pte_t pte)
+{
+	const softleaf_t entry = softleaf_from_pte(pte);
+
+	if (!softleaf_is_marker(entry))
+		return false;
+
+	/* UFFD WP, poisoned swap entries are UFFD-handled. */
+	if (softleaf_is_uffd_wp_marker(entry))
+		return true;
+	if (softleaf_is_poison_marker(entry))
+		return true;
+
+	return false;
+}
+
+#endif  /* CONFIG_MMU */
+#endif  /* _LINUX_LEAFOPS_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f6a2b2d20016..ca7a18351797 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -8,7 +8,7 @@
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 /**
  * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
@@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm)
  * The caller should insert a new pte created with make_pte_marker().
  */
 static inline pte_marker copy_pte_marker(
-		swp_entry_t entry, struct vm_area_struct *dst_vma)
+		softleaf_t entry, struct vm_area_struct *dst_vma)
 {
-	pte_marker srcm = pte_marker_get(entry);
+	const pte_marker srcm = softleaf_to_marker(entry);
 	/* Always copy error entries. */
 	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5021047485a9..4f66a3206a63 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -285,6 +285,31 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
+/**
+ * typedef softleaf_t - Describes a page table software leaf entry, abstracted
+ * from its architecture-specific encoding.
+ *
+ * Page table leaf entries are those which do not reference any descendent page
+ * tables but rather either reference a data page, are an empty (or 'none'
+ * entry), or contain a non-present entry.
+ *
+ * If referencing another page table or a data page then the page table entry is
+ * pertinent to hardware - that is it tells the hardware how to decode the page
+ * table entry.
+ *
+ * Otherwise it is a software-defined leaf page table entry, which this type
+ * describes. See leafops.h and specifically @softleaf_type for a list of all
+ * possible kinds of software leaf entry.
+ *
+ * A softleaf_t entry is abstracted from the hardware page table entry, so is
+ * not architecture-specific.
+ *
+ * NOTE: While we transition from the confusing swp_entry_t type used for this
+ *       purpose, we simply alias this type. This will be removed once the
+ *       transition is complete.
+ */
+typedef swp_entry_t softleaf_t;
+
 #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
 /* We have some extra room after the refcount in tail pages. */
 #define NR_PAGES_IN_LARGE_FOLIO
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d1f665935cfc..0a4b3f51ecf5 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -426,21 +426,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
 	return swp_entry(SWP_PTE_MARKER, marker);
 }
 
-static inline bool is_pte_marker_entry(swp_entry_t entry)
-{
-	return swp_type(entry) == SWP_PTE_MARKER;
-}
-
-static inline pte_marker pte_marker_get(swp_entry_t entry)
-{
-	return swp_offset(entry) & PTE_MARKER_MASK;
-}
-
-static inline bool is_pte_marker(pte_t pte)
-{
-	return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
-}
-
 static inline pte_t make_pte_marker(pte_marker marker)
 {
 	return swp_entry_to_pte(make_pte_marker_entry(marker));
@@ -451,24 +436,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void)
 	return make_pte_marker_entry(PTE_MARKER_POISONED);
 }
 
-static inline int is_poisoned_swp_entry(swp_entry_t entry)
-{
-	return is_pte_marker_entry(entry) &&
-	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
-
-}
-
 static inline swp_entry_t make_guard_swp_entry(void)
 {
 	return make_pte_marker_entry(PTE_MARKER_GUARD);
 }
 
-static inline int is_guard_swp_entry(swp_entry_t entry)
-{
-	return is_pte_marker_entry(entry) &&
-		(pte_marker_get(entry) & PTE_MARKER_GUARD);
-}
-
 static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 {
 	struct page *p = pfn_to_page(swp_offset_pfn(entry));
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index da0b4fcc566f..983c860a00f1 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -16,7 +16,7 @@
 #include <linux/fcntl.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <asm-generic/pgtable_uffd.h>
 #include <linux/hugetlb_inline.h>
 
@@ -434,32 +434,6 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
 	return userfaultfd_wp_unpopulated(vma);
 }
 
-static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
-{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
-	return is_pte_marker_entry(entry) &&
-	    (pte_marker_get(entry) & PTE_MARKER_UFFD_WP);
-#else
-	return false;
-#endif
-}
-
-static inline bool pte_marker_uffd_wp(pte_t pte)
-{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
-	swp_entry_t entry;
-
-	if (!is_swap_pte(pte))
-		return false;
-
-	entry = pte_to_swp_entry(pte);
-
-	return pte_marker_entry_uffd_wp(entry);
-#else
-	return false;
-#endif
-}
-
 /*
  * Returns true if this is a swap pte and was uffd-wp wr-protected in either
  * forms (pte marker or a normal swap pte), false otherwise.
@@ -473,31 +447,10 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte)
 	if (pte_swp_uffd_wp(pte))
 		return true;
 
-	if (pte_marker_uffd_wp(pte))
+	if (pte_is_uffd_wp_marker(pte))
 		return true;
 #endif
 	return false;
 }
 
-
-static inline bool is_uffd_pte_marker(pte_t pte)
-{
-	swp_entry_t entry;
-
-	if (pte_present(pte))
-		return false;
-
-	entry = pte_to_swp_entry(pte);
-	if (!is_pte_marker_entry(entry))
-		return false;
-
-	/* UFFD WP, poisoned swap entries are UFFD handled. */
-	if (pte_marker_entry_uffd_wp(entry))
-		return true;
-	if (is_poisoned_swp_entry(entry))
-		return true;
-
-	return false;
-}
-
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index 387a38bbaf6a..e350d0cc9d41 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -249,7 +249,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	 * that will be correctly handled, so we need only check for UFFD WP
 	 * here.
 	 */
-	if (pte_none(pte) || pte_marker_uffd_wp(pte)) {
+	if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) {
 		required_fault =
 			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
 		if (required_fault)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 96c991f54f7a..12853cdefc9b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -26,7 +26,7 @@
 #include <linux/string_choices.h>
 #include <linux/string_helpers.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/jhash.h>
 #include <linux/numa.h>
 #include <linux/llist.h>
@@ -4956,17 +4956,17 @@ again:
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
-			swp_entry_t swp_entry = pte_to_swp_entry(entry);
+			softleaf_t softleaf = softleaf_from_pte(entry);
 			bool uffd_wp = pte_swp_uffd_wp(entry);
 
-			if (!is_readable_migration_entry(swp_entry) && cow) {
+			if (!is_readable_migration_entry(softleaf) && cow) {
 				/*
 				 * COW mappings require pages in both
 				 * parent and child to be set to read.
 				 */
-				swp_entry = make_readable_migration_entry(
-							swp_offset(swp_entry));
-				entry = swp_entry_to_pte(swp_entry);
+				softleaf = make_readable_migration_entry(
+							swp_offset(softleaf));
+				entry = swp_entry_to_pte(softleaf);
 				if (userfaultfd_wp(src_vma) && uffd_wp)
 					entry = pte_swp_mkuffd_wp(entry);
 				set_huge_pte_at(src, addr, src_pte, entry, sz);
@@ -4974,9 +4974,9 @@ again:
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
-		} else if (unlikely(is_pte_marker(entry))) {
-			pte_marker marker = copy_pte_marker(
-				pte_to_swp_entry(entry), dst_vma);
+		} else if (unlikely(pte_is_marker(entry))) {
+			const softleaf_t softleaf = softleaf_from_pte(entry);
+			const pte_marker marker = copy_pte_marker(softleaf, dst_vma);
 
 			if (marker)
 				set_huge_pte_at(dst, addr, dst_pte,
@@ -5092,7 +5092,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 
 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
 
-	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+	if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
 		huge_pte_clear(mm, new_addr, dst_pte, sz);
 	else {
 		if (need_clear_uffd_wp) {
@@ -5911,7 +5911,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 	 * If this pte was previously wr-protected, keep it wr-protected even
 	 * if populated.
 	 */
-	if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
+	if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte)))
 		new_pte = huge_pte_mkuffd_wp(new_pte);
 	set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
 
@@ -6044,9 +6044,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		return hugetlb_no_page(mapping, &vmf);
 
-	if (is_pte_marker(vmf.orig_pte)) {
+	if (pte_is_marker(vmf.orig_pte)) {
 		const pte_marker marker =
-			pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
+			softleaf_to_marker(softleaf_from_pte(vmf.orig_pte));
 
 		if (marker & PTE_MARKER_POISONED) {
 			ret = VM_FAULT_HWPOISON_LARGE |
@@ -6374,7 +6374,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	 * See comment about UFFD marker overwriting in
 	 * mfill_atomic_install_pte().
 	 */
-	if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep))
+	if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
 		goto out_release_unlock;
 
 	if (folio_in_pagecache)
@@ -6495,8 +6495,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 			/* Nothing to do. */
 		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
-			struct folio *folio = pfn_swap_entry_folio(entry);
+			softleaf_t entry = softleaf_from_pte(pte);
+
+			struct folio *folio = softleaf_to_folio(entry);
 			pte_t newpte = pte;
 
 			if (is_writable_migration_entry(entry)) {
@@ -6516,14 +6517,14 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 				newpte = pte_swp_clear_uffd_wp(newpte);
 			if (!pte_same(pte, newpte))
 				set_huge_pte_at(mm, address, ptep, newpte, psize);
-		} else if (unlikely(is_pte_marker(pte))) {
+		} else if (unlikely(pte_is_marker(pte))) {
 			/*
 			 * Do nothing on a poison marker; page is
 			 * corrupted, permissions do not apply. Here
 			 * pte_marker_uffd_wp()==true implies !poison
 			 * because they're mutual exclusive.
 			 */
-			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
+			if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
 				/* Safe to modify directly (non-present->none). */
 				huge_pte_clear(mm, address, ptep, psize);
 		} else if (!huge_pte_none(pte)) {
diff --git a/mm/madvise.c b/mm/madvise.c
index 2a165e9beb5b..c8381b954235 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -29,7 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagewalk.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
 
@@ -690,17 +690,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		 * (page allocation + zeroing).
 		 */
 		if (!pte_present(ptent)) {
-			swp_entry_t entry;
+			softleaf_t entry = softleaf_from_pte(ptent);
 
-			entry = pte_to_swp_entry(ptent);
-			if (!non_swap_entry(entry)) {
+			if (softleaf_is_swap(entry)) {
 				max_nr = (end - addr) / PAGE_SIZE;
 				nr = swap_pte_batch(pte, max_nr, ptent);
 				nr_swap -= nr;
 				free_swap_and_cache_nr(entry, nr);
 				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
-			} else if (is_hwpoison_entry(entry) ||
-				   is_poisoned_swp_entry(entry)) {
+			} else if (softleaf_is_hwpoison(entry) ||
+				   softleaf_is_poison_marker(entry)) {
 				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 			}
 			continue;
@@ -1071,8 +1070,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
 
 static bool is_guard_pte_marker(pte_t ptent)
 {
-	return is_swap_pte(ptent) &&
-	       is_guard_swp_entry(pte_to_swp_entry(ptent));
+	const softleaf_t entry = softleaf_from_pte(ptent);
+
+	return softleaf_is_guard_marker(entry);
 }
 
 static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index 732414852570..0caf8c5c8c68 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,7 +60,7 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
 #include <linux/migrate.h>
@@ -109,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
 	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
 		return false;
 
-	return pte_marker_uffd_wp(vmf->orig_pte);
+	return pte_is_uffd_wp_marker(vmf->orig_pte);
 }
 
 /*
@@ -927,10 +927,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 {
 	vm_flags_t vm_flags = dst_vma->vm_flags;
 	pte_t orig_pte = ptep_get(src_pte);
+	softleaf_t entry = softleaf_from_pte(orig_pte);
 	pte_t pte = orig_pte;
 	struct folio *folio;
 	struct page *page;
-	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 
 	if (likely(!non_swap_entry(entry))) {
 		if (swap_duplicate(entry) < 0)
@@ -1016,7 +1016,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
 			return -EBUSY;
 		return -ENOENT;
-	} else if (is_pte_marker_entry(entry)) {
+	} else if (softleaf_is_marker(entry)) {
 		pte_marker marker = copy_pte_marker(entry, dst_vma);
 
 		if (marker)
@@ -1711,14 +1711,14 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		unsigned int max_nr, unsigned long addr,
 		struct zap_details *details, int *rss, bool *any_skipped)
 {
-	swp_entry_t entry;
+	softleaf_t entry;
 	int nr = 1;
 
 	*any_skipped = true;
-	entry = pte_to_swp_entry(ptent);
-	if (is_device_private_entry(entry) ||
-		is_device_exclusive_entry(entry)) {
-		struct page *page = pfn_swap_entry_to_page(entry);
+	entry = softleaf_from_pte(ptent);
+	if (softleaf_is_device_private(entry) ||
+	    softleaf_is_device_exclusive(entry)) {
+		struct page *page = softleaf_to_page(entry);
 		struct folio *folio = page_folio(page);
 
 		if (unlikely(!should_zap_folio(details, folio)))
@@ -1733,7 +1733,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		rss[mm_counter(folio)]--;
 		folio_remove_rmap_pte(folio, page, vma);
 		folio_put(folio);
-	} else if (!non_swap_entry(entry)) {
+	} else if (softleaf_is_swap(entry)) {
 		/* Genuine swap entries, hence a private anon pages */
 		if (!should_zap_cows(details))
 			return 1;
@@ -1741,20 +1741,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		nr = swap_pte_batch(pte, max_nr, ptent);
 		rss[MM_SWAPENTS] -= nr;
 		free_swap_and_cache_nr(entry, nr);
-	} else if (is_migration_entry(entry)) {
-		struct folio *folio = pfn_swap_entry_folio(entry);
+	} else if (softleaf_is_migration(entry)) {
+		struct folio *folio = softleaf_to_folio(entry);
 
 		if (!should_zap_folio(details, folio))
 			return 1;
 		rss[mm_counter(folio)]--;
-	} else if (pte_marker_entry_uffd_wp(entry)) {
+	} else if (softleaf_is_uffd_wp_marker(entry)) {
 		/*
 		 * For anon: always drop the marker; for file: only
 		 * drop the marker if explicitly requested.
 		 */
 		if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
 			return 1;
-	} else if (is_guard_swp_entry(entry)) {
+	} else if (softleaf_is_guard_marker(entry)) {
 		/*
 		 * Ordinary zapping should not remove guard PTE
 		 * markers. Only do so if we should remove PTE markers
@@ -1762,7 +1762,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		 */
 		if (!zap_drop_markers(details))
 			return 1;
-	} else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+	} else if (softleaf_is_hwpoison(entry) ||
+		   softleaf_is_poison_marker(entry)) {
 		if (!should_zap_cows(details))
 			return 1;
 	} else {
@@ -4380,7 +4381,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 	 *
 	 * This should also cover the case where e.g. the pte changed
 	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
-	 * So is_pte_marker() check is not enough to safely drop the pte.
+	 * So pte_is_marker() check is not enough to safely drop the pte.
 	 */
 	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
 		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
@@ -4414,8 +4415,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
 
 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 {
-	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
-	unsigned long marker = pte_marker_get(entry);
+	const softleaf_t entry = softleaf_from_pte(vmf->orig_pte);
+	const pte_marker marker = softleaf_to_marker(entry);
 
 	/*
 	 * PTE markers should never be empty.  If anything weird happened,
@@ -4432,7 +4433,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	if (marker & PTE_MARKER_GUARD)
 		return VM_FAULT_SIGSEGV;
 
-	if (pte_marker_entry_uffd_wp(entry))
+	if (softleaf_is_uffd_wp_marker(entry))
 		return pte_marker_handle_uffd_wp(vmf);
 
 	/* This is an unknown pte marker */
@@ -4680,7 +4681,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			}
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
-		} else if (is_pte_marker_entry(entry)) {
+		} else if (softleaf_is_marker(entry)) {
 			ret = handle_pte_marker(vmf);
 		} else {
 			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
diff --git a/mm/mincore.c b/mm/mincore.c
index fb80becd6119..b3682488a65d 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,7 +14,7 @@
 #include <linux/mman.h>
 #include <linux/syscalls.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pgtable.h>
@@ -42,7 +42,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 	} else {
 		const pte_t ptep = huge_ptep_get(walk->mm, addr, pte);
 
-		if (huge_pte_none(ptep) || is_pte_marker(ptep))
+		if (huge_pte_none(ptep) || pte_is_marker(ptep))
 			present = 0;
 		else
 			present = 1;
@@ -187,7 +187,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 		step = 1;
 		/* We need to do cache lookup too for markers */
-		if (pte_none(pte) || is_pte_marker(pte))
+		if (pte_none(pte) || pte_is_marker(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
 						 vma, vec);
 		else if (pte_present(pte)) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index db93d3bb1a5e..918a64cc6033 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -326,14 +326,14 @@ static long change_pte_range(struct mmu_gather *tlb,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_uffd_wp(oldpte))
 					newpte = pte_swp_mkuffd_wp(newpte);
-			} else if (is_pte_marker_entry(entry)) {
+			} else if (softleaf_is_marker(entry)) {
 				/*
 				 * Ignore error swap entries unconditionally,
 				 * because any access should sigbus/sigsegv
 				 * anyway.
 				 */
-				if (is_poisoned_swp_entry(entry) ||
-				    is_guard_swp_entry(entry))
+				if (softleaf_is_poison_marker(entry) ||
+				    softleaf_is_guard_marker(entry))
 					continue;
 				/*
 				 * If this is uffd-wp pte marker and we'd like
diff --git a/mm/mremap.c b/mm/mremap.c
index 8ad06cf50783..7c21b2ad13f6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -17,7 +17,7 @@
 #include <linux/swap.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -288,7 +288,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 		pte = move_pte(pte, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
 
-		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
 			pte_clear(mm, new_addr, new_ptep);
 		else {
 			if (need_clear_uffd_wp) {
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 137ce27ff68c..be20468fb5a9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -3,7 +3,7 @@
 #include <linux/rmap.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 #include "internal.h"
 
@@ -107,15 +107,12 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
 	pte_t ptent = ptep_get(pvmw->pte);
 
 	if (pvmw->flags & PVMW_MIGRATION) {
-		swp_entry_t entry;
-		if (!is_swap_pte(ptent))
-			return false;
-		entry = pte_to_swp_entry(ptent);
+		const softleaf_t entry = softleaf_from_pte(ptent);
 
-		if (!is_migration_entry(entry))
+		if (!softleaf_is_migration(entry))
 			return false;
 
-		pfn = swp_offset_pfn(entry);
+		pfn = softleaf_to_pfn(entry);
 	} else if (is_swap_pte(ptent)) {
 		swp_entry_t entry;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 6580f3cd24bb..395ca58ac4a5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
 #include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
@@ -2286,7 +2286,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	swp_entry_t swap, index_entry;
+	swp_entry_t swap;
+	softleaf_t index_entry;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
 	bool skip_swapcache = false;
@@ -2298,7 +2299,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	swap = index_entry;
 	*foliop = NULL;
 
-	if (is_poisoned_swp_entry(index_entry))
+	if (softleaf_is_poison_marker(index_entry))
 		return -EIO;
 
 	si = get_swap_device(index_entry);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index cc4ce205bbec..055ec1050776 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
@@ -208,7 +208,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
 	 * page cache page backing it, then access the page.
 	 */
-	if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep))
+	if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
 		goto out_unlock;
 
 	if (page_in_cache) {
@@ -590,7 +590,7 @@ retry:
 		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
 			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
 
-			if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) {
+			if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
 				err = -EEXIST;
 				hugetlb_vma_unlock_read(dst_vma);
 				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-- 
cgit v1.2.3


From fb888710e26a8a8a37dc0f8ed09a3c908c63eb71 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:21 +0000
Subject: mm: avoid unnecessary uses of is_swap_pte()

There's an established convention in the kernel that we treat PTEs as
containing swap entries (and the unfortunately named non-swap swap
entries) should they be neither empty (i.e.  pte_none() evaluating true)
nor present (i.e.  pte_present() evaluating true).

However, there is some inconsistency in how this is applied, as we also
have the is_swap_pte() helper which explicitly performs this check:

	/* check whether a pte points to a swap entry */
	static inline int is_swap_pte(pte_t pte)
	{
		return !pte_none(pte) && !pte_present(pte);
	}

As this represents a predicate, it's logical to assume that this predicate
must first be checked in order to establish that a PTE entry can correctly
be manipulated as a swap/non-swap entry.

But instead, we far more often utilise the established convention of
checking pte_none() / pte_present() before operating on entries as if they
were swap/non-swap.

This patch works towards correcting this inconsistency by removing all
uses of is_swap_pte() where we are already in a position where we perform
pte_none()/pte_present() checks anyway or otherwise it is clearly logical
to do so.

We also take advantage of the fact that pte_swp_uffd_wp() is only set on
swap entries.

Additionally, update comments referring to is_swap_pte() and
non_swap_entry().

No functional change intended.

Link: https://lkml.kernel.org/r/17fd6d7f46a846517fd455fadd640af47fcd7c55.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c            | 49 ++++++++++++++++++++++++++++++-------------
 include/linux/userfaultfd_k.h |  3 +--
 mm/hugetlb.c                  |  6 +++---
 mm/internal.h                 |  6 ++----
 mm/khugepaged.c               | 29 ++++++++++++-------------
 mm/migrate.c                  |  2 +-
 mm/mprotect.c                 | 43 ++++++++++++++++++-------------------
 mm/mremap.c                   |  7 +++++--
 mm/page_table_check.c         | 13 +++++++-----
 mm/page_vma_mapped.c          | 31 +++++++++++++--------------
 10 files changed, 104 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5a1e897b0973..bf48fedaf128 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1017,7 +1017,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		young = pte_young(ptent);
 		dirty = pte_dirty(ptent);
 		present = true;
-	} else if (is_swap_pte(ptent)) {
+	} else if (pte_none(ptent)) {
+		smaps_pte_hole_lookup(addr, walk);
+	} else {
 		swp_entry_t swpent = pte_to_swp_entry(ptent);
 
 		if (!non_swap_entry(swpent)) {
@@ -1038,9 +1040,6 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 				present = true;
 			page = pfn_swap_entry_to_page(swpent);
 		}
-	} else {
-		smaps_pte_hole_lookup(addr, walk);
-		return;
 	}
 
 	if (!page)
@@ -1612,6 +1611,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	 */
 	pte_t ptent = ptep_get(pte);
 
+	if (pte_none(ptent))
+		return;
+
 	if (pte_present(ptent)) {
 		pte_t old_pte;
 
@@ -1621,7 +1623,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
-	} else if (is_swap_pte(ptent)) {
+	} else {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
@@ -1924,6 +1926,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 	struct page *page = NULL;
 	struct folio *folio;
 
+	if (pte_none(pte))
+		goto out;
+
 	if (pte_present(pte)) {
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
@@ -1933,8 +1938,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 			flags |= PM_SOFT_DIRTY;
 		if (pte_uffd_wp(pte))
 			flags |= PM_UFFD_WP;
-	} else if (is_swap_pte(pte)) {
+	} else {
 		swp_entry_t entry;
+
 		if (pte_swp_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 		if (pte_swp_uffd_wp(pte))
@@ -1942,6 +1948,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		entry = pte_to_swp_entry(pte);
 		if (pm->show_pfn) {
 			pgoff_t offset;
+
 			/*
 			 * For PFN swap offsets, keeping the offset field
 			 * to be PFN only to be compatible with old smaps.
@@ -1970,6 +1977,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		    __folio_page_mapped_exclusively(folio, page))
 			flags |= PM_MMAP_EXCLUSIVE;
 	}
+
+out:
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
 
@@ -2311,12 +2320,16 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 					   struct vm_area_struct *vma,
 					   unsigned long addr, pte_t pte)
 {
-	unsigned long categories = 0;
+	unsigned long categories;
+
+	if (pte_none(pte))
+		return 0;
 
 	if (pte_present(pte)) {
 		struct page *page;
 
-		categories |= PAGE_IS_PRESENT;
+		categories = PAGE_IS_PRESENT;
+
 		if (!pte_uffd_wp(pte))
 			categories |= PAGE_IS_WRITTEN;
 
@@ -2330,10 +2343,11 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 			categories |= PAGE_IS_PFNZERO;
 		if (pte_soft_dirty(pte))
 			categories |= PAGE_IS_SOFT_DIRTY;
-	} else if (is_swap_pte(pte)) {
+	} else {
 		softleaf_t entry;
 
-		categories |= PAGE_IS_SWAPPED;
+		categories = PAGE_IS_SWAPPED;
+
 		if (!pte_swp_uffd_wp_any(pte))
 			categories |= PAGE_IS_WRITTEN;
 
@@ -2361,12 +2375,12 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma,
 		old_pte = ptep_modify_prot_start(vma, addr, pte);
 		ptent = pte_mkuffd_wp(old_pte);
 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
-	} else if (is_swap_pte(ptent)) {
-		ptent = pte_swp_mkuffd_wp(ptent);
-		set_pte_at(vma->vm_mm, addr, pte, ptent);
-	} else {
+	} else if (pte_none(ptent)) {
 		set_pte_at(vma->vm_mm, addr, pte,
 			   make_pte_marker(PTE_MARKER_UFFD_WP));
+	} else {
+		ptent = pte_swp_mkuffd_wp(ptent);
+		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
 }
 
@@ -2435,6 +2449,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
 {
 	unsigned long categories = PAGE_IS_HUGE;
 
+	if (pte_none(pte))
+		return categories;
+
 	/*
 	 * According to pagemap_hugetlb_range(), file-backed HugeTLB
 	 * page cannot be swapped. So PAGE_IS_FILE is not checked for
@@ -2442,6 +2459,7 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
 	 */
 	if (pte_present(pte)) {
 		categories |= PAGE_IS_PRESENT;
+
 		if (!huge_pte_uffd_wp(pte))
 			categories |= PAGE_IS_WRITTEN;
 		if (!PageAnon(pte_page(pte)))
@@ -2450,8 +2468,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
 			categories |= PAGE_IS_PFNZERO;
 		if (pte_soft_dirty(pte))
 			categories |= PAGE_IS_SOFT_DIRTY;
-	} else if (is_swap_pte(pte)) {
+	} else {
 		categories |= PAGE_IS_SWAPPED;
+
 		if (!pte_swp_uffd_wp_any(pte))
 			categories |= PAGE_IS_WRITTEN;
 		if (pte_swp_soft_dirty(pte))
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 983c860a00f1..96b089dff4ef 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -441,9 +441,8 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
 static inline bool pte_swp_uffd_wp_any(pte_t pte)
 {
 #ifdef CONFIG_PTE_MARKER_UFFD_WP
-	if (!is_swap_pte(pte))
+	if (pte_present(pte))
 		return false;
-
 	if (pte_swp_uffd_wp(pte))
 		return true;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12853cdefc9b..59d91c36770c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5092,13 +5092,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 
 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
 
-	if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
+	if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) {
 		huge_pte_clear(mm, new_addr, dst_pte, sz);
-	else {
+	} else {
 		if (need_clear_uffd_wp) {
 			if (pte_present(pte))
 				pte = huge_pte_clear_uffd_wp(pte);
-			else if (is_swap_pte(pte))
+			else
 				pte = pte_swp_clear_uffd_wp(pte);
 		}
 		set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
diff --git a/mm/internal.h b/mm/internal.h
index 2bad3971813b..a9b38cadb192 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -325,8 +325,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
 /**
  * pte_move_swp_offset - Move the swap entry offset field of a swap pte
  *	 forward or backward by delta
- * @pte: The initial pte state; is_swap_pte(pte) must be true and
- *	 non_swap_entry() must be false.
+ * @pte: The initial pte state; must be a swap entry
  * @delta: The direction and the offset we are moving; forward if delta
  *	 is positive; backward if delta is negative
  *
@@ -352,8 +351,7 @@ static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
 
 /**
  * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
- * @pte: The initial pte state; is_swap_pte(pte) must be true and
- *	 non_swap_entry() must be false.
+ * @pte: The initial pte state; must be a swap entry.
  *
  * Increments the swap offset, while maintaining all other fields, including
  * swap type, and any swp pte bits. The resulting pte is returned.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index af1c162c9a94..d7e71c2e2571 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1019,7 +1019,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		}
 
 		vmf.orig_pte = ptep_get_lockless(pte);
-		if (!is_swap_pte(vmf.orig_pte))
+		if (pte_none(vmf.orig_pte) ||
+		    pte_present(vmf.orig_pte))
 			continue;
 
 		vmf.pte = pte;
@@ -1276,7 +1277,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
-		if (is_swap_pte(pteval)) {
+		if (pte_none_or_zero(pteval)) {
+			++none_or_zero;
+			if (!userfaultfd_armed(vma) &&
+			    (!cc->is_khugepaged ||
+			     none_or_zero <= khugepaged_max_ptes_none)) {
+				continue;
+			} else {
+				result = SCAN_EXCEED_NONE_PTE;
+				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				goto out_unmap;
+			}
+		}
+		if (!pte_present(pteval)) {
 			++unmapped;
 			if (!cc->is_khugepaged ||
 			    unmapped <= khugepaged_max_ptes_swap) {
@@ -1296,18 +1309,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				goto out_unmap;
 			}
 		}
-		if (pte_none_or_zero(pteval)) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
-				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
-				goto out_unmap;
-			}
-		}
 		if (pte_uffd_wp(pteval)) {
 			/*
 			 * Don't collapse the page if any of the small
diff --git a/mm/migrate.c b/mm/migrate.c
index d8f6cd14cdb7..847c1ec17628 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -492,7 +492,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	pte = ptep_get(ptep);
 	pte_unmap(ptep);
 
-	if (!is_swap_pte(pte))
+	if (pte_none(pte) || pte_present(pte))
 		goto out;
 
 	entry = pte_to_swp_entry(pte);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 918a64cc6033..aa555dfbdfc5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -297,7 +297,26 @@ static long change_pte_range(struct mmu_gather *tlb,
 				prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
 					nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
 			pages += nr_ptes;
-		} else if (is_swap_pte(oldpte)) {
+		} else if (pte_none(oldpte)) {
+			/*
+			 * Nobody plays with any none ptes besides
+			 * userfaultfd when applying the protections.
+			 */
+			if (likely(!uffd_wp))
+				continue;
+
+			if (userfaultfd_wp_use_markers(vma)) {
+				/*
+				 * For file-backed mem, we need to be able to
+				 * wr-protect a none pte, because even if the
+				 * pte is none, the page/swap cache could
+				 * exist.  Doing that by install a marker.
+				 */
+				set_pte_at(vma->vm_mm, addr, pte,
+					   make_pte_marker(PTE_MARKER_UFFD_WP));
+				pages++;
+			}
+		} else  {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 			pte_t newpte;
 
@@ -358,28 +377,6 @@ static long change_pte_range(struct mmu_gather *tlb,
 				set_pte_at(vma->vm_mm, addr, pte, newpte);
 				pages++;
 			}
-		} else {
-			/* It must be an none page, or what else?.. */
-			WARN_ON_ONCE(!pte_none(oldpte));
-
-			/*
-			 * Nobody plays with any none ptes besides
-			 * userfaultfd when applying the protections.
-			 */
-			if (likely(!uffd_wp))
-				continue;
-
-			if (userfaultfd_wp_use_markers(vma)) {
-				/*
-				 * For file-backed mem, we need to be able to
-				 * wr-protect a none pte, because even if the
-				 * pte is none, the page/swap cache could
-				 * exist.  Doing that by install a marker.
-				 */
-				set_pte_at(vma->vm_mm, addr, pte,
-					   make_pte_marker(PTE_MARKER_UFFD_WP));
-				pages++;
-			}
 		}
 	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c21b2ad13f6..62b6827abacf 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -158,6 +158,9 @@ static void drop_rmap_locks(struct vm_area_struct *vma)
 
 static pte_t move_soft_dirty_pte(pte_t pte)
 {
+	if (pte_none(pte))
+		return pte;
+
 	/*
 	 * Set soft dirty bit so we can notice
 	 * in userspace the ptes were moved.
@@ -165,7 +168,7 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 #ifdef CONFIG_MEM_SOFT_DIRTY
 	if (pte_present(pte))
 		pte = pte_mksoft_dirty(pte);
-	else if (is_swap_pte(pte))
+	else
 		pte = pte_swp_mksoft_dirty(pte);
 #endif
 	return pte;
@@ -294,7 +297,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 			if (need_clear_uffd_wp) {
 				if (pte_present(pte))
 					pte = pte_clear_uffd_wp(pte);
-				else if (is_swap_pte(pte))
+				else
 					pte = pte_swp_clear_uffd_wp(pte);
 			}
 			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 4eeca782b888..43f75d2f7c36 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -185,12 +185,15 @@ static inline bool swap_cached_writable(swp_entry_t entry)
 	       is_writable_migration_entry(entry);
 }
 
-static inline void page_table_check_pte_flags(pte_t pte)
+static void page_table_check_pte_flags(pte_t pte)
 {
-	if (pte_present(pte) && pte_uffd_wp(pte))
-		WARN_ON_ONCE(pte_write(pte));
-	else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
-		WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
+	if (pte_present(pte)) {
+		WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte));
+	} else if (pte_swp_uffd_wp(pte)) {
+		const swp_entry_t entry = pte_to_swp_entry(pte);
+
+		WARN_ON_ONCE(swap_cached_writable(entry));
+	}
 }
 
 void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index be20468fb5a9..a4e23818f37f 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -16,6 +16,7 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
 static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
 		    spinlock_t **ptlp)
 {
+	bool is_migration;
 	pte_t ptent;
 
 	if (pvmw->flags & PVMW_SYNC) {
@@ -26,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
 		return !!pvmw->pte;
 	}
 
+	is_migration = pvmw->flags & PVMW_MIGRATION;
 again:
 	/*
 	 * It is important to return the ptl corresponding to pte,
@@ -41,11 +43,14 @@ again:
 
 	ptent = ptep_get(pvmw->pte);
 
-	if (pvmw->flags & PVMW_MIGRATION) {
-		if (!is_swap_pte(ptent))
+	if (pte_none(ptent)) {
+		return false;
+	} else if (pte_present(ptent)) {
+		if (is_migration)
 			return false;
-	} else if (is_swap_pte(ptent)) {
+	} else if (!is_migration) {
 		swp_entry_t entry;
+
 		/*
 		 * Handle un-addressable ZONE_DEVICE memory.
 		 *
@@ -66,8 +71,6 @@ again:
 		if (!is_device_private_entry(entry) &&
 		    !is_device_exclusive_entry(entry))
 			return false;
-	} else if (!pte_present(ptent)) {
-		return false;
 	}
 	spin_lock(*ptlp);
 	if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) {
@@ -113,21 +116,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
 			return false;
 
 		pfn = softleaf_to_pfn(entry);
-	} else if (is_swap_pte(ptent)) {
-		swp_entry_t entry;
+	} else if (pte_present(ptent)) {
+		pfn = pte_pfn(ptent);
+	} else {
+		const softleaf_t entry = softleaf_from_pte(ptent);
 
 		/* Handle un-addressable ZONE_DEVICE memory */
-		entry = pte_to_swp_entry(ptent);
-		if (!is_device_private_entry(entry) &&
-		    !is_device_exclusive_entry(entry))
-			return false;
-
-		pfn = swp_offset_pfn(entry);
-	} else {
-		if (!pte_present(ptent))
+		if (!softleaf_is_device_private(entry) &&
+		    !softleaf_is_device_exclusive(entry))
 			return false;
 
-		pfn = pte_pfn(ptent);
+		pfn = softleaf_to_pfn(entry);
 	}
 
 	if ((pfn + pte_nr - 1) < pvmw->pfn)
-- 
cgit v1.2.3


From fb410d8b89e89ef61b18326f07c477f563b631f6 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:23 +0000
Subject: mm: use leaf entries in debug pgtable + remove is_swap_pte()

Remove invocations of is_swap_pte() in mm/debug_vm_pgtable.c and use
softleaf_from_pte() and softleaf_is_swap() as necessary to replace this
usage.

We update the test code to use a 'true' swap entry throughout so we are
guaranteed this is not a non-swap entry, so all asserts continue to
operate correctly.

With this change in place, we no longer use is_swap_pte() anywhere, so
remove it.

Link: https://lkml.kernel.org/r/222f352e7a99191b4bdfa77e835f2fc0dd83fa72.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h |  6 ------
 mm/debug_vm_pgtable.c   | 39 ++++++++++++++++++++++++---------------
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 0a4b3f51ecf5..a66ac4f2105c 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -120,12 +120,6 @@ static inline unsigned long swp_offset_pfn(swp_entry_t entry)
 	return swp_offset(entry) & SWP_PFN_MASK;
 }
 
-/* check whether a pte points to a swap entry */
-static inline int is_swap_pte(pte_t pte)
-{
-	return !pte_none(pte) && !pte_present(pte);
-}
-
 /*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
  * arch-independent swp_entry_t.
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 055e0e025b42..fff311830959 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -25,7 +25,7 @@
 #include <linux/random.h>
 #include <linux/spinlock.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
 #include <linux/io.h>
@@ -714,14 +714,16 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
 static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pte_t pte;
+	softleaf_t entry;
 
 	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
 		return;
 
 	pr_debug("Validating PTE swap soft dirty\n");
 	pte = swp_entry_to_pte(args->swp_entry);
-	WARN_ON(!is_swap_pte(pte));
+	entry = softleaf_from_pte(pte);
 
+	WARN_ON(!softleaf_is_swap(entry));
 	WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
 	WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
 }
@@ -768,40 +770,47 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 {
-	swp_entry_t entry, entry2;
+	swp_entry_t entry;
+	softleaf_t softleaf;
 	pte_t pte;
 
 	pr_debug("Validating PTE swap exclusive\n");
 	entry = args->swp_entry;
 
 	pte = swp_entry_to_pte(entry);
+	softleaf = softleaf_from_pte(pte);
+
 	WARN_ON(pte_swp_exclusive(pte));
-	WARN_ON(!is_swap_pte(pte));
-	entry2 = pte_to_swp_entry(pte);
-	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+	WARN_ON(!softleaf_is_swap(softleaf));
+	WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
 
 	pte = pte_swp_mkexclusive(pte);
+	softleaf = softleaf_from_pte(pte);
+
 	WARN_ON(!pte_swp_exclusive(pte));
-	WARN_ON(!is_swap_pte(pte));
+	WARN_ON(!softleaf_is_swap(softleaf));
 	WARN_ON(pte_swp_soft_dirty(pte));
-	entry2 = pte_to_swp_entry(pte);
-	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+	WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
 
 	pte = pte_swp_clear_exclusive(pte);
+	softleaf = softleaf_from_pte(pte);
+
 	WARN_ON(pte_swp_exclusive(pte));
-	WARN_ON(!is_swap_pte(pte));
-	entry2 = pte_to_swp_entry(pte);
-	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+	WARN_ON(!softleaf_is_swap(softleaf));
+	WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
 }
 
 static void __init pte_swap_tests(struct pgtable_debug_args *args)
 {
 	swp_entry_t arch_entry;
+	softleaf_t entry;
 	pte_t pte1, pte2;
 
 	pr_debug("Validating PTE swap\n");
 	pte1 = swp_entry_to_pte(args->swp_entry);
-	WARN_ON(!is_swap_pte(pte1));
+	entry = softleaf_from_pte(pte1);
+
+	WARN_ON(!softleaf_is_swap(entry));
 
 	arch_entry = __pte_to_swp_entry(pte1);
 	pte2 = __swp_entry_to_pte(arch_entry);
@@ -1218,8 +1227,8 @@ static int __init init_args(struct pgtable_debug_args *args)
 
 	/* See generic_max_swapfile_size(): probe the maximum offset */
 	max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
-	/* Create a swp entry with all possible bits set */
-	args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+	/* Create a swp entry with all possible bits set while still being swap. */
+	args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset);
 
 	/*
 	 * Allocate (huge) pages because some of the tests need to access
-- 
cgit v1.2.3


From aa62204cb680d8ff32497181fc9e0dac4956f7e5 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:25 +0000
Subject: mm: avoid unnecessary use of is_swap_pmd()

PMD 'non-swap' swap entries are currently used for PMD-level migration
entries and device private entries.

To add to the confusion in this terminology we use is_swap_pmd() in an
inconsistent way similar to how is_swap_pte() was being used - sometimes
adopting the convention that !pmd_none(), !pmd_present() implies PMD 'swap'
entry, sometimes not.

This patch handles the low-hanging fruit of cases where we can simply
substitute other predicates for is_swap_pmd().

No functional change intended.

Link: https://lkml.kernel.org/r/8a1704b36a009c18032d5bea4cb68e71448fbbe5.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      | 15 ++++++++++++---
 include/linux/swapops.h | 16 ++++++++++++++--
 mm/huge_memory.c        |  4 +++-
 mm/memory.c             | 50 +++++++++++++++++++++++++++----------------------
 mm/page_table_check.c   | 12 ++++++++----
 5 files changed, 65 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8c35ea48a93e..1bedf7fa5e79 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1059,10 +1059,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	bool present = false;
 	struct folio *folio;
 
+	if (pmd_none(*pmd))
+		return;
 	if (pmd_present(*pmd)) {
 		page = vm_normal_page_pmd(vma, addr, *pmd);
 		present = true;
-	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+	} else if (unlikely(thp_migration_supported())) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
 		if (is_pfn_swap_entry(entry))
@@ -2000,6 +2002,9 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
 
+	if (pmd_none(pmd))
+		goto populate_pagemap;
+
 	if (pmd_present(pmd)) {
 		page = pmd_page(pmd);
 
@@ -2010,7 +2015,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 			flags |= PM_UFFD_WP;
 		if (pm->show_pfn)
 			frame = pmd_pfn(pmd) + idx;
-	} else if (thp_migration_supported() && is_swap_pmd(pmd)) {
+	} else if (thp_migration_supported()) {
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
 		unsigned long offset;
 
@@ -2037,6 +2042,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 			flags |= PM_FILE;
 	}
 
+populate_pagemap:
 	for (; addr != end; addr += PAGE_SIZE, idx++) {
 		u64 cur_flags = flags;
 		pagemap_entry_t pme;
@@ -2399,6 +2405,9 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 {
 	unsigned long categories = PAGE_IS_HUGE;
 
+	if (pmd_none(pmd))
+		return categories;
+
 	if (pmd_present(pmd)) {
 		struct page *page;
 
@@ -2416,7 +2425,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 			categories |= PAGE_IS_PFNZERO;
 		if (pmd_soft_dirty(pmd))
 			categories |= PAGE_IS_SOFT_DIRTY;
-	} else if (is_swap_pmd(pmd)) {
+	} else {
 		swp_entry_t swp;
 
 		categories |= PAGE_IS_SWAPPED;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index a66ac4f2105c..3e8dd6ea94dd 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -509,7 +509,13 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 
 static inline int is_pmd_migration_entry(pmd_t pmd)
 {
-	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
+	swp_entry_t entry;
+
+	if (pmd_present(pmd))
+		return 0;
+
+	entry = pmd_to_swp_entry(pmd);
+	return is_migration_entry(entry);
 }
 #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
@@ -557,7 +563,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
  */
 static inline int is_pmd_device_private_entry(pmd_t pmd)
 {
-	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+	swp_entry_t entry;
+
+	if (pmd_present(pmd))
+		return 0;
+
+	entry = pmd_to_swp_entry(pmd);
+	return is_device_private_entry(entry);
 }
 
 #else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79a4bb363de..b88b4b866cb3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2354,9 +2354,11 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
 
 static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
 {
+	if (pmd_none(pmd))
+		return pmd;
 	if (pmd_present(pmd))
 		pmd = pmd_clear_uffd_wp(pmd);
-	else if (is_swap_pmd(pmd))
+	else
 		pmd = pmd_swp_clear_uffd_wp(pmd);
 
 	return pmd;
diff --git a/mm/memory.c b/mm/memory.c
index 0caf8c5c8c68..76c17feff88b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1376,6 +1376,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		next = pmd_addr_end(addr, end);
 		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
 			int err;
+
 			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
 			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
 					    addr, dst_vma, src_vma);
@@ -6340,35 +6341,40 @@ retry_pud:
 	if (pmd_none(*vmf.pmd) &&
 	    thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
 		ret = create_huge_pmd(&vmf);
-		if (!(ret & VM_FAULT_FALLBACK))
+		if (ret & VM_FAULT_FALLBACK)
+			goto fallback;
+		else
 			return ret;
-	} else {
-		vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+	}
 
-		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
-			if (is_pmd_device_private_entry(vmf.orig_pmd))
-				return do_huge_pmd_device_private(&vmf);
+	vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+	if (pmd_none(vmf.orig_pmd))
+		goto fallback;
 
-			if (is_pmd_migration_entry(vmf.orig_pmd))
-				pmd_migration_entry_wait(mm, vmf.pmd);
-			return 0;
-		}
-		if (pmd_trans_huge(vmf.orig_pmd)) {
-			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
-				return do_huge_pmd_numa_page(&vmf);
+	if (unlikely(!pmd_present(vmf.orig_pmd))) {
+		if (is_pmd_device_private_entry(vmf.orig_pmd))
+			return do_huge_pmd_device_private(&vmf);
 
-			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-			    !pmd_write(vmf.orig_pmd)) {
-				ret = wp_huge_pmd(&vmf);
-				if (!(ret & VM_FAULT_FALLBACK))
-					return ret;
-			} else {
-				huge_pmd_set_accessed(&vmf);
-				return 0;
-			}
+		if (is_pmd_migration_entry(vmf.orig_pmd))
+			pmd_migration_entry_wait(mm, vmf.pmd);
+		return 0;
+	}
+	if (pmd_trans_huge(vmf.orig_pmd)) {
+		if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+			return do_huge_pmd_numa_page(&vmf);
+
+		if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+		    !pmd_write(vmf.orig_pmd)) {
+			ret = wp_huge_pmd(&vmf);
+			if (!(ret & VM_FAULT_FALLBACK))
+				return ret;
+		} else {
+			huge_pmd_set_accessed(&vmf);
+			return 0;
 		}
 	}
 
+fallback:
 	return handle_pte_fault(&vmf);
 }
 
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 43f75d2f7c36..f5f25e120f69 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -215,10 +215,14 @@ EXPORT_SYMBOL(__page_table_check_ptes_set);
 
 static inline void page_table_check_pmd_flags(pmd_t pmd)
 {
-	if (pmd_present(pmd) && pmd_uffd_wp(pmd))
-		WARN_ON_ONCE(pmd_write(pmd));
-	else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
-		WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
+	if (pmd_present(pmd)) {
+		if (pmd_uffd_wp(pmd))
+			WARN_ON_ONCE(pmd_write(pmd));
+	} else if (pmd_swp_uffd_wp(pmd)) {
+		swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+		WARN_ON_ONCE(swap_cached_writable(entry));
+	}
 }
 
 void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
-- 
cgit v1.2.3


From 0ac881efe16468503e8c1e7d8a7210b75f027ce3 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:28 +0000
Subject: mm: replace pmd_to_swp_entry() with softleaf_from_pmd()

Introduce softleaf_from_pmd() to do the equivalent operation for PMDs that
softleaf_from_pte() fulfils, and cascade changes through code base
accordingly, introducing helpers as necessary.

We are then able to eliminate pmd_to_swp_entry(),
is_pmd_migration_entry(), is_pmd_device_private_entry() and
is_pmd_non_present_folio_entry().

This further establishes the use of leaf operations throughout the code
base and further establishes the foundations for eliminating
is_swap_pmd().

No functional change intended.

[lorenzo.stoakes@oracle.com: check writable, not readable/writable, per Vlastimil]
  Link: https://lkml.kernel.org/r/cd97b6ec-00f9-45a4-9ae0-8f009c212a94@lucifer.local
Link: https://lkml.kernel.org/r/3fb431699639ded8fdc63d2210aa77a38c8891f1.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  27 +++---
 include/linux/leafops.h | 218 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/migrate.h |   2 +-
 include/linux/swapops.h | 100 ----------------------
 mm/damon/ops-common.c   |   6 +-
 mm/filemap.c            |   6 +-
 mm/hmm.c                |  16 ++--
 mm/huge_memory.c        |  98 +++++++++++-----------
 mm/khugepaged.c         |   4 +-
 mm/madvise.c            |   2 +-
 mm/memory.c             |   4 +-
 mm/mempolicy.c          |   4 +-
 mm/migrate.c            |  20 ++---
 mm/migrate_device.c     |  14 ++--
 mm/page_table_check.c   |  16 ++--
 mm/page_vma_mapped.c    |  15 ++--
 mm/pagewalk.c           |   8 +-
 mm/rmap.c               |   4 +-
 18 files changed, 339 insertions(+), 225 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1bedf7fa5e79..898df952b6bc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1065,10 +1065,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		page = vm_normal_page_pmd(vma, addr, *pmd);
 		present = true;
 	} else if (unlikely(thp_migration_supported())) {
-		swp_entry_t entry = pmd_to_swp_entry(*pmd);
+		const softleaf_t entry = softleaf_from_pmd(*pmd);
 
-		if (is_pfn_swap_entry(entry))
-			page = pfn_swap_entry_to_page(entry);
+		if (softleaf_has_pfn(entry))
+			page = softleaf_to_page(entry);
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -1655,7 +1655,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		pmd = pmd_clear_soft_dirty(pmd);
 
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
-	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+	} else if (pmd_is_migration_entry(pmd)) {
 		pmd = pmd_swp_clear_soft_dirty(pmd);
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 	}
@@ -2016,12 +2016,12 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 		if (pm->show_pfn)
 			frame = pmd_pfn(pmd) + idx;
 	} else if (thp_migration_supported()) {
-		swp_entry_t entry = pmd_to_swp_entry(pmd);
+		const softleaf_t entry = softleaf_from_pmd(pmd);
 		unsigned long offset;
 
 		if (pm->show_pfn) {
-			if (is_pfn_swap_entry(entry))
-				offset = swp_offset_pfn(entry) + idx;
+			if (softleaf_has_pfn(entry))
+				offset = softleaf_to_pfn(entry) + idx;
 			else
 				offset = swp_offset(entry) + idx;
 			frame = swp_type(entry) |
@@ -2032,7 +2032,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 			flags |= PM_SOFT_DIRTY;
 		if (pmd_swp_uffd_wp(pmd))
 			flags |= PM_UFFD_WP;
-		VM_WARN_ON_ONCE(!is_pmd_migration_entry(pmd));
+		VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
 		page = pfn_swap_entry_to_page(entry);
 	}
 
@@ -2426,8 +2426,6 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 		if (pmd_soft_dirty(pmd))
 			categories |= PAGE_IS_SOFT_DIRTY;
 	} else {
-		swp_entry_t swp;
-
 		categories |= PAGE_IS_SWAPPED;
 		if (!pmd_swp_uffd_wp(pmd))
 			categories |= PAGE_IS_WRITTEN;
@@ -2435,9 +2433,10 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 			categories |= PAGE_IS_SOFT_DIRTY;
 
 		if (p->masks_of_interest & PAGE_IS_FILE) {
-			swp = pmd_to_swp_entry(pmd);
-			if (is_pfn_swap_entry(swp) &&
-			    !folio_test_anon(pfn_swap_entry_folio(swp)))
+			const softleaf_t entry = softleaf_from_pmd(pmd);
+
+			if (softleaf_has_pfn(entry) &&
+			    !folio_test_anon(softleaf_to_folio(entry)))
 				categories |= PAGE_IS_FILE;
 		}
 	}
@@ -2454,7 +2453,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma,
 		old = pmdp_invalidate_ad(vma, addr, pmdp);
 		pmd = pmd_mkuffd_wp(old);
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
-	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+	} else if (pmd_is_migration_entry(pmd)) {
 		pmd = pmd_swp_mkuffd_wp(pmd);
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 	}
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index cff9d94fd5d1..f5ea9b0385ff 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -61,6 +61,57 @@ static inline softleaf_t softleaf_from_pte(pte_t pte)
 	return pte_to_swp_entry(pte);
 }
 
+/**
+ * softleaf_to_pte() - Obtain a PTE entry from a leaf entry.
+ * @entry: Leaf entry.
+ *
+ * This generates an architecture-specific PTE entry that can be utilised to
+ * encode the metadata the leaf entry encodes.
+ *
+ * Returns: Architecture-specific PTE entry encoding leaf entry.
+ */
+static inline pte_t softleaf_to_pte(softleaf_t entry)
+{
+	/* Temporary until swp_entry_t eliminated. */
+	return swp_entry_to_pte(entry);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry.
+ * @pmd: PMD entry.
+ *
+ * If @pmd is present or none (therefore not a leaf entry) the function returns
+ * an empty leaf entry. Otherwise, it returns a leaf entry.
+ *
+ * Returns: Leaf entry.
+ */
+static inline softleaf_t softleaf_from_pmd(pmd_t pmd)
+{
+	softleaf_t arch_entry;
+
+	if (pmd_present(pmd) || pmd_none(pmd))
+		return softleaf_mk_none();
+
+	if (pmd_swp_soft_dirty(pmd))
+		pmd = pmd_swp_clear_soft_dirty(pmd);
+	if (pmd_swp_uffd_wp(pmd))
+		pmd = pmd_swp_clear_uffd_wp(pmd);
+	arch_entry = __pmd_to_swp_entry(pmd);
+
+	/* Temporary until swp_entry_t eliminated. */
+	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+}
+
+#else
+
+static inline softleaf_t softleaf_from_pmd(pmd_t pmd)
+{
+	return softleaf_mk_none();
+}
+
+#endif
+
 /**
  * softleaf_is_none() - Is the leaf entry empty?
  * @entry: Leaf entry.
@@ -134,6 +185,43 @@ static inline bool softleaf_is_swap(softleaf_t entry)
 	return softleaf_type(entry) == SOFTLEAF_SWAP;
 }
 
+/**
+ * softleaf_is_migration_write() - Is this leaf entry a writable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a writable migration entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_migration_write(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE;
+}
+
+/**
+ * softleaf_is_migration_read() - Is this leaf entry a readable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a readable migration entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_migration_read(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ;
+}
+
+/**
+ * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive
+ * readable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is an exclusive readable migration entry,
+ * otherwise false.
+ */
+static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE;
+}
+
 /**
  * softleaf_is_migration() - Is this leaf entry a migration entry?
  * @entry: Leaf entry.
@@ -152,6 +240,19 @@ static inline bool softleaf_is_migration(softleaf_t entry)
 	}
 }
 
+/**
+ * softleaf_is_device_private_write() - Is this leaf entry a device private
+ * writable entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device private writable entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_device_private_write(softleaf_t entry)
+{
+	return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE;
+}
+
 /**
  * softleaf_is_device_private() - Is this leaf entry a device private entry?
  * @entry: Leaf entry.
@@ -170,10 +271,10 @@ static inline bool softleaf_is_device_private(softleaf_t entry)
 }
 
 /**
- * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry?
+ * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry?
  * @entry: Leaf entry.
  *
- * Returns: true if the leaf entry is a device exclusive entry, otherwise false.
+ * Returns: true if the leaf entry is a device-exclusive entry, otherwise false.
  */
 static inline bool softleaf_is_device_exclusive(softleaf_t entry)
 {
@@ -332,6 +433,61 @@ static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry)
 	return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP;
 }
 
+#ifdef CONFIG_MIGRATION
+
+/**
+ * softleaf_is_migration_young() - Does this migration entry contain an accessed
+ * bit?
+ * @entry: Leaf entry.
+ *
+ * If the architecture can support storing A/D bits in migration entries, this
+ * determines whether the accessed (or 'young') bit was set on the migrated page
+ * table entry.
+ *
+ * Returns: true if the entry contains an accessed bit, otherwise false.
+ */
+static inline bool softleaf_is_migration_young(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_is_migration(entry));
+
+	if (migration_entry_supports_ad())
+		return swp_offset(entry) & SWP_MIG_YOUNG;
+	/* Keep the old behavior of aging page after migration */
+	return false;
+}
+
+/**
+ * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit?
+ * @entry: Leaf entry.
+ *
+ * If the architecture can support storing A/D bits in migration entries, this
+ * determines whether the dirty bit was set on the migrated page table entry.
+ *
+ * Returns: true if the entry contains a dirty bit, otherwise false.
+ */
+static inline bool softleaf_is_migration_dirty(softleaf_t entry)
+{
+	VM_WARN_ON_ONCE(!softleaf_is_migration(entry));
+
+	if (migration_entry_supports_ad())
+		return swp_offset(entry) & SWP_MIG_DIRTY;
+	/* Keep the old behavior of clean page after migration */
+	return false;
+}
+
+#else /* CONFIG_MIGRATION */
+
+static inline bool softleaf_is_migration_young(softleaf_t entry)
+{
+	return false;
+}
+
+static inline bool softleaf_is_migration_dirty(softleaf_t entry)
+{
+	return false;
+}
+#endif /* CONFIG_MIGRATION */
+
 /**
  * pte_is_marker() - Does the PTE entry encode a marker leaf entry?
  * @pte: PTE entry.
@@ -383,5 +539,63 @@ static inline bool pte_is_uffd_marker(pte_t pte)
 	return false;
 }
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+/**
+ * pmd_is_device_private_entry() - Check if PMD contains a device private swap
+ * entry.
+ * @pmd: The PMD to check.
+ *
+ * Returns true if the PMD contains a swap entry that represents a device private
+ * page mapping. This is used for zone device private pages that have been
+ * swapped out but still need special handling during various memory management
+ * operations.
+ *
+ * Returns: true if the PMD contains a device private entry, otherwise false.
+ */
+static inline bool pmd_is_device_private_entry(pmd_t pmd)
+{
+	return softleaf_is_device_private(softleaf_from_pmd(pmd));
+}
+
+#else  /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline bool pmd_is_device_private_entry(pmd_t pmd)
+{
+	return false;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+/**
+ * pmd_is_migration_entry() - Does this PMD entry encode a migration entry?
+ * @pmd: PMD entry.
+ *
+ * Returns: true if the PMD encodes a migration entry, otherwise false.
+ */
+static inline bool pmd_is_migration_entry(pmd_t pmd)
+{
+	return softleaf_is_migration(softleaf_from_pmd(pmd));
+}
+
+/**
+ * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry?
+ * @pmd: PMD entry.
+ *
+ * PMD leaf entries are valid only if they are device private or migration
+ * entries. This function asserts that a PMD leaf entry is valid in this
+ * respect.
+ *
+ * Returns: true if the PMD entry is a valid leaf entry, otherwise false.
+ */
+static inline bool pmd_is_valid_softleaf(pmd_t pmd)
+{
+	const softleaf_t entry = softleaf_from_pmd(pmd);
+
+	/* Only device private and migration entries are valid for a PMD. */
+	return softleaf_is_device_private(entry) ||
+		softleaf_is_migration(entry);
+}
+
 #endif  /* CONFIG_MMU */
 #endif  /* _LINUX_LEAFOPS_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 41b4cc05a450..26ca00c325d9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
 
 int migrate_huge_page_move_mapping(struct address_space *mapping,
 		struct folio *dst, struct folio *src);
-void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 		__releases(ptl);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 3e8dd6ea94dd..f1277647262d 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -283,14 +283,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
 	return entry;
 }
 
-static inline bool is_migration_entry_young(swp_entry_t entry)
-{
-	if (migration_entry_supports_ad())
-		return swp_offset(entry) & SWP_MIG_YOUNG;
-	/* Keep the old behavior of aging page after migration */
-	return false;
-}
-
 static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
 {
 	if (migration_entry_supports_ad())
@@ -299,14 +291,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
 	return entry;
 }
 
-static inline bool is_migration_entry_dirty(swp_entry_t entry)
-{
-	if (migration_entry_supports_ad())
-		return swp_offset(entry) & SWP_MIG_DIRTY;
-	/* Keep the old behavior of clean page after migration */
-	return false;
-}
-
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
 extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte);
@@ -349,20 +333,11 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
 	return entry;
 }
 
-static inline bool is_migration_entry_young(swp_entry_t entry)
-{
-	return false;
-}
-
 static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
 {
 	return entry;
 }
 
-static inline bool is_migration_entry_dirty(swp_entry_t entry)
-{
-	return false;
-}
 #endif	/* CONFIG_MIGRATION */
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -487,18 +462,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 
 extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
 
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
-	swp_entry_t arch_entry;
-
-	if (pmd_swp_soft_dirty(pmd))
-		pmd = pmd_swp_clear_soft_dirty(pmd);
-	if (pmd_swp_uffd_wp(pmd))
-		pmd = pmd_swp_clear_uffd_wp(pmd);
-	arch_entry = __pmd_to_swp_entry(pmd);
-	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
-}
-
 static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 {
 	swp_entry_t arch_entry;
@@ -507,23 +470,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 	return __swp_entry_to_pmd(arch_entry);
 }
 
-static inline int is_pmd_migration_entry(pmd_t pmd)
-{
-	swp_entry_t entry;
-
-	if (pmd_present(pmd))
-		return 0;
-
-	entry = pmd_to_swp_entry(pmd);
-	return is_migration_entry(entry);
-}
 #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
-		struct page *page)
-{
-	BUILD_BUG();
-}
-
 static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 		struct page *new)
 {
@@ -532,64 +479,17 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 
 static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
 
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
-	return swp_entry(0, 0);
-}
-
 static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 {
 	return __pmd(0);
 }
 
-static inline int is_pmd_migration_entry(pmd_t pmd)
-{
-	return 0;
-}
 #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
-
-/**
- * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
- * @pmd: The PMD to check
- *
- * Returns true if the PMD contains a swap entry that represents a device private
- * page mapping. This is used for zone device private pages that have been
- * swapped out but still need special handling during various memory management
- * operations.
- *
- * Return: 1 if PMD contains device private entry, 0 otherwise
- */
-static inline int is_pmd_device_private_entry(pmd_t pmd)
-{
-	swp_entry_t entry;
-
-	if (pmd_present(pmd))
-		return 0;
-
-	entry = pmd_to_swp_entry(pmd);
-	return is_device_private_entry(entry);
-}
-
-#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
-
-static inline int is_pmd_device_private_entry(pmd_t pmd)
-{
-	return 0;
-}
-
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
-
 static inline int non_swap_entry(swp_entry_t entry)
 {
 	return swp_type(entry) >= MAX_SWAPFILES;
 }
 
-static inline int is_pmd_non_present_folio_entry(pmd_t pmd)
-{
-	return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd);
-}
-
 #endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 971df8a16ba4..a218d9922234 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -11,7 +11,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 #include "../internal.h"
 #include "ops-common.h"
@@ -51,7 +51,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr
 	if (likely(pte_present(pteval)))
 		pfn = pte_pfn(pteval);
 	else
-		pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+		pfn = softleaf_to_pfn(softleaf_from_pte(pteval));
 
 	folio = damon_get_folio(pfn);
 	if (!folio)
@@ -83,7 +83,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr
 	if (likely(pmd_present(pmdval)))
 		pfn = pmd_pfn(pmdval);
 	else
-		pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));
+		pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));
 
 	folio = damon_get_folio(pfn);
 	if (!folio)
diff --git a/mm/filemap.c b/mm/filemap.c
index f0c36df1def7..02355aa46324 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,7 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -1402,7 +1402,7 @@ repeat:
  * This follows the same logic as folio_wait_bit_common() so see the comments
  * there.
  */
-void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 	__releases(ptl)
 {
 	struct wait_page_queue wait_page;
@@ -1411,7 +1411,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
 	unsigned long pflags;
 	bool in_thrashing;
 	wait_queue_head_t *q;
-	struct folio *folio = pfn_swap_entry_folio(entry);
+	struct folio *folio = softleaf_to_folio(entry);
 
 	q = folio_waitqueue(folio);
 	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
diff --git a/mm/hmm.c b/mm/hmm.c
index e350d0cc9d41..e9735a9b6102 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -18,7 +18,7 @@
 #include <linux/sched.h>
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/hugetlb.h>
 #include <linux/memremap.h>
 #include <linux/sched/mm.h>
@@ -339,19 +339,19 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned long npages = (end - start) >> PAGE_SHIFT;
+	const softleaf_t entry = softleaf_from_pmd(pmd);
 	unsigned long addr = start;
-	swp_entry_t entry = pmd_to_swp_entry(pmd);
 	unsigned int required_fault;
 
-	if (is_device_private_entry(entry) &&
-	    pfn_swap_entry_folio(entry)->pgmap->owner ==
+	if (softleaf_is_device_private(entry) &&
+	    softleaf_to_folio(entry)->pgmap->owner ==
 	    range->dev_private_owner) {
 		unsigned long cpu_flags = HMM_PFN_VALID |
 			hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
-		unsigned long pfn = swp_offset_pfn(entry);
+		unsigned long pfn = softleaf_to_pfn(entry);
 		unsigned long i;
 
-		if (is_writable_device_private_entry(entry))
+		if (softleaf_is_device_private_write(entry))
 			cpu_flags |= HMM_PFN_WRITE;
 
 		/*
@@ -370,7 +370,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
 	required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
 					      npages, 0);
 	if (required_fault) {
-		if (is_device_private_entry(entry))
+		if (softleaf_is_device_private(entry))
 			return hmm_vma_fault(addr, end, required_fault, walk);
 		else
 			return -EFAULT;
@@ -412,7 +412,7 @@ again:
 	if (pmd_none(pmd))
 		return hmm_vma_walk_hole(start, end, -1, walk);
 
-	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+	if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
 		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
 			hmm_vma_walk->last = addr;
 			pmd_migration_entry_wait(walk->mm, pmdp);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0fdb3be39e31..9aa933723355 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1299,7 +1299,7 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret = 0;
 	spinlock_t *ptl;
-	swp_entry_t swp_entry;
+	softleaf_t entry;
 	struct page *page;
 	struct folio *folio;
 
@@ -1314,8 +1314,8 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
 		return 0;
 	}
 
-	swp_entry = pmd_to_swp_entry(vmf->orig_pmd);
-	page = pfn_swap_entry_to_page(swp_entry);
+	entry = softleaf_from_pmd(vmf->orig_pmd);
+	page = softleaf_to_page(entry);
 	folio = page_folio(page);
 	vmf->page = page;
 	vmf->pte = NULL;
@@ -1705,13 +1705,13 @@ static void copy_huge_non_present_pmd(
 		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		pmd_t pmd, pgtable_t pgtable)
 {
-	swp_entry_t entry = pmd_to_swp_entry(pmd);
+	softleaf_t entry = softleaf_from_pmd(pmd);
 	struct folio *src_folio;
 
-	VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd));
+	VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd));
 
-	if (is_writable_migration_entry(entry) ||
-	    is_readable_exclusive_migration_entry(entry)) {
+	if (softleaf_is_migration_write(entry) ||
+	    softleaf_is_migration_read_exclusive(entry)) {
 		entry = make_readable_migration_entry(swp_offset(entry));
 		pmd = swp_entry_to_pmd(entry);
 		if (pmd_swp_soft_dirty(*src_pmd))
@@ -1719,12 +1719,12 @@ static void copy_huge_non_present_pmd(
 		if (pmd_swp_uffd_wp(*src_pmd))
 			pmd = pmd_swp_mkuffd_wp(pmd);
 		set_pmd_at(src_mm, addr, src_pmd, pmd);
-	} else if (is_device_private_entry(entry)) {
+	} else if (softleaf_is_device_private(entry)) {
 		/*
 		 * For device private entries, since there are no
 		 * read exclusive entries, writable = !readable
 		 */
-		if (is_writable_device_private_entry(entry)) {
+		if (softleaf_is_device_private_write(entry)) {
 			entry = make_readable_device_private_entry(swp_offset(entry));
 			pmd = swp_entry_to_pmd(entry);
 
@@ -1735,7 +1735,7 @@ static void copy_huge_non_present_pmd(
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 
-		src_folio = pfn_swap_entry_folio(entry);
+		src_folio = softleaf_to_folio(entry);
 		VM_WARN_ON(!folio_test_large(src_folio));
 
 		folio_get(src_folio);
@@ -2195,7 +2195,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 	if (unlikely(!pmd_present(orig_pmd))) {
 		VM_BUG_ON(thp_migration_supported() &&
-				  !is_pmd_migration_entry(orig_pmd));
+				  !pmd_is_migration_entry(orig_pmd));
 		goto out;
 	}
 
@@ -2293,11 +2293,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			folio_remove_rmap_pmd(folio, page, vma);
 			WARN_ON_ONCE(folio_mapcount(folio) < 0);
 			VM_BUG_ON_PAGE(!PageHead(page), page);
-		} else if (is_pmd_non_present_folio_entry(orig_pmd)) {
-			swp_entry_t entry;
+		} else if (pmd_is_valid_softleaf(orig_pmd)) {
+			const softleaf_t entry = softleaf_from_pmd(orig_pmd);
 
-			entry = pmd_to_swp_entry(orig_pmd);
-			folio = pfn_swap_entry_folio(entry);
+			folio = softleaf_to_folio(entry);
 			flush_needed = 0;
 
 			if (!thp_migration_supported())
@@ -2353,7 +2352,7 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
 {
 #ifdef CONFIG_MEM_SOFT_DIRTY
-	if (unlikely(is_pmd_migration_entry(pmd)))
+	if (unlikely(pmd_is_migration_entry(pmd)))
 		pmd = pmd_swp_mksoft_dirty(pmd);
 	else if (pmd_present(pmd))
 		pmd = pmd_mksoft_dirty(pmd);
@@ -2428,12 +2427,12 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
 		unsigned long addr, pmd_t *pmd, bool uffd_wp,
 		bool uffd_wp_resolve)
 {
-	swp_entry_t entry = pmd_to_swp_entry(*pmd);
-	struct folio *folio = pfn_swap_entry_folio(entry);
+	softleaf_t entry = softleaf_from_pmd(*pmd);
+	const struct folio *folio = softleaf_to_folio(entry);
 	pmd_t newpmd;
 
-	VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd));
-	if (is_writable_migration_entry(entry)) {
+	VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
+	if (softleaf_is_migration_write(entry)) {
 		/*
 		 * A protection check is difficult so
 		 * just be safe and disable write
@@ -2445,7 +2444,7 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
 		newpmd = swp_entry_to_pmd(entry);
 		if (pmd_swp_soft_dirty(*pmd))
 			newpmd = pmd_swp_mksoft_dirty(newpmd);
-	} else if (is_writable_device_private_entry(entry)) {
+	} else if (softleaf_is_device_private_write(entry)) {
 		entry = make_readable_device_private_entry(swp_offset(entry));
 		newpmd = swp_entry_to_pmd(entry);
 	} else {
@@ -2643,7 +2642,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 
 	if (!pmd_trans_huge(src_pmdval)) {
 		spin_unlock(src_ptl);
-		if (is_pmd_migration_entry(src_pmdval)) {
+		if (pmd_is_migration_entry(src_pmdval)) {
 			pmd_migration_entry_wait(mm, &src_pmdval);
 			return -EAGAIN;
 		}
@@ -2908,13 +2907,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	unsigned long addr;
 	pte_t *pte;
 	int i;
-	swp_entry_t entry;
 
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
 
-	VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
+	VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd));
 
 	count_vm_event(THP_SPLIT_PMD);
 
@@ -2928,11 +2926,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			zap_deposited_table(mm, pmd);
 		if (!vma_is_dax(vma) && vma_is_special_huge(vma))
 			return;
-		if (unlikely(is_pmd_migration_entry(old_pmd))) {
-			swp_entry_t entry;
+		if (unlikely(pmd_is_migration_entry(old_pmd))) {
+			const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
 
-			entry = pmd_to_swp_entry(old_pmd);
-			folio = pfn_swap_entry_folio(entry);
+			folio = softleaf_to_folio(old_entry);
 		} else if (is_huge_zero_pmd(old_pmd)) {
 			return;
 		} else {
@@ -2962,31 +2959,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
+	if (pmd_is_migration_entry(*pmd)) {
+		softleaf_t entry;
 
-	if (is_pmd_migration_entry(*pmd)) {
 		old_pmd = *pmd;
-		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_swap_entry_to_page(entry);
+		entry = softleaf_from_pmd(old_pmd);
+		page = softleaf_to_page(entry);
 		folio = page_folio(page);
 
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 
-		write = is_writable_migration_entry(entry);
+		write = softleaf_is_migration_write(entry);
 		if (PageAnon(page))
-			anon_exclusive = is_readable_exclusive_migration_entry(entry);
-		young = is_migration_entry_young(entry);
-		dirty = is_migration_entry_dirty(entry);
-	} else if (is_pmd_device_private_entry(*pmd)) {
+			anon_exclusive = softleaf_is_migration_read_exclusive(entry);
+		young = softleaf_is_migration_young(entry);
+		dirty = softleaf_is_migration_dirty(entry);
+	} else if (pmd_is_device_private_entry(*pmd)) {
+		softleaf_t entry;
+
 		old_pmd = *pmd;
-		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_swap_entry_to_page(entry);
+		entry = softleaf_from_pmd(old_pmd);
+		page = softleaf_to_page(entry);
 		folio = page_folio(page);
 
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 
-		write = is_writable_device_private_entry(entry);
+		write = softleaf_is_device_private_write(entry);
 		anon_exclusive = PageAnonExclusive(page);
 
 		/*
@@ -3090,7 +3090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	 * Note that NUMA hinting access restrictions are not transferred to
 	 * avoid any possibility of altering permissions across VMAs.
 	 */
-	if (freeze || is_pmd_migration_entry(old_pmd)) {
+	if (freeze || pmd_is_migration_entry(old_pmd)) {
 		pte_t entry;
 		swp_entry_t swp_entry;
 
@@ -3116,7 +3116,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
 			set_pte_at(mm, addr, pte + i, entry);
 		}
-	} else if (is_pmd_device_private_entry(old_pmd)) {
+	} else if (pmd_is_device_private_entry(old_pmd)) {
 		pte_t entry;
 		swp_entry_t swp_entry;
 
@@ -3166,7 +3166,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap(pte);
 
-	if (!is_pmd_migration_entry(*pmd))
+	if (!pmd_is_migration_entry(*pmd))
 		folio_remove_rmap_pmd(folio, page, vma);
 	if (freeze)
 		put_page(page);
@@ -3179,7 +3179,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze)
 {
 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
-	if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
+	if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
 		__split_huge_pmd_locked(vma, pmd, address, freeze);
 }
 
@@ -4749,25 +4749,25 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	unsigned long address = pvmw->address;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 	pmd_t pmde;
-	swp_entry_t entry;
+	softleaf_t entry;
 
 	if (!(pvmw->pmd && !pvmw->pte))
 		return;
 
-	entry = pmd_to_swp_entry(*pvmw->pmd);
+	entry = softleaf_from_pmd(*pvmw->pmd);
 	folio_get(folio);
 	pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
 
 	if (pmd_swp_soft_dirty(*pvmw->pmd))
 		pmde = pmd_mksoft_dirty(pmde);
-	if (is_writable_migration_entry(entry))
+	if (softleaf_is_migration_write(entry))
 		pmde = pmd_mkwrite(pmde, vma);
 	if (pmd_swp_uffd_wp(*pvmw->pmd))
 		pmde = pmd_mkuffd_wp(pmde);
-	if (!is_migration_entry_young(entry))
+	if (!softleaf_is_migration_young(entry))
 		pmde = pmd_mkold(pmde);
 	/* NOTE: this may contain setting soft-dirty on some archs */
-	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+	if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
 		pmde = pmd_mkdirty(pmde);
 
 	if (folio_is_device_private(folio)) {
@@ -4790,7 +4790,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	if (folio_test_anon(folio)) {
 		rmap_t rmap_flags = RMAP_NONE;
 
-		if (!is_readable_migration_entry(entry))
+		if (!softleaf_is_migration_read(entry))
 			rmap_flags |= RMAP_EXCLUSIVE;
 
 		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d7e71c2e2571..7e8cb181d5bd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,7 +17,7 @@
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
 #include <linux/rcupdate_wait.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/shmem_fs.h>
 #include <linux/dax.h>
 #include <linux/ksm.h>
@@ -941,7 +941,7 @@ static inline int check_pmd_state(pmd_t *pmd)
 	 * collapse it. Migration success or failure will eventually end
 	 * up with a present PMD mapping a folio again.
 	 */
-	if (is_pmd_migration_entry(pmde))
+	if (pmd_is_migration_entry(pmde))
 		return SCAN_PMD_MAPPED;
 	if (!pmd_present(pmde))
 		return SCAN_PMD_NULL;
diff --git a/mm/madvise.c b/mm/madvise.c
index 2d7dd7901bae..5979a4a39738 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -390,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 
 		if (unlikely(!pmd_present(orig_pmd))) {
 			VM_BUG_ON(thp_migration_supported() &&
-					!is_pmd_migration_entry(orig_pmd));
+					!pmd_is_migration_entry(orig_pmd));
 			goto huge_unlock;
 		}
 
diff --git a/mm/memory.c b/mm/memory.c
index 76c17feff88b..9d0d527e95a8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6352,10 +6352,10 @@ retry_pud:
 		goto fallback;
 
 	if (unlikely(!pmd_present(vmf.orig_pmd))) {
-		if (is_pmd_device_private_entry(vmf.orig_pmd))
+		if (pmd_is_device_private_entry(vmf.orig_pmd))
 			return do_huge_pmd_device_private(&vmf);
 
-		if (is_pmd_migration_entry(vmf.orig_pmd))
+		if (pmd_is_migration_entry(vmf.orig_pmd))
 			pmd_migration_entry_wait(mm, vmf.pmd);
 		return 0;
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7ae3f5e2dee6..01c3b98f87a6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -110,7 +110,7 @@
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/gcd.h>
 
 #include <asm/tlbflush.h>
@@ -647,7 +647,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 	struct folio *folio;
 	struct queue_pages *qp = walk->private;
 
-	if (unlikely(is_pmd_migration_entry(*pmd))) {
+	if (unlikely(pmd_is_migration_entry(*pmd))) {
 		qp->nr_failed++;
 		return;
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index 847c1ec17628..ca4ec170a89b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -16,7 +16,7 @@
 #include <linux/migrate.h>
 #include <linux/export.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
@@ -353,7 +353,7 @@ static bool remove_migration_pte(struct folio *folio,
 		rmap_t rmap_flags = RMAP_NONE;
 		pte_t old_pte;
 		pte_t pte;
-		swp_entry_t entry;
+		softleaf_t entry;
 		struct page *new;
 		unsigned long idx = 0;
 
@@ -379,22 +379,22 @@ static bool remove_migration_pte(struct folio *folio,
 		folio_get(folio);
 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
 
-		entry = pte_to_swp_entry(old_pte);
-		if (!is_migration_entry_young(entry))
+		entry = softleaf_from_pte(old_pte);
+		if (!softleaf_is_migration_young(entry))
 			pte = pte_mkold(pte);
-		if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+		if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
 			pte = pte_mkdirty(pte);
 		if (pte_swp_soft_dirty(old_pte))
 			pte = pte_mksoft_dirty(pte);
 		else
 			pte = pte_clear_soft_dirty(pte);
 
-		if (is_writable_migration_entry(entry))
+		if (softleaf_is_migration_write(entry))
 			pte = pte_mkwrite(pte, vma);
 		else if (pte_swp_uffd_wp(old_pte))
 			pte = pte_mkuffd_wp(pte);
 
-		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+		if (folio_test_anon(folio) && !softleaf_is_migration_read(entry))
 			rmap_flags |= RMAP_EXCLUSIVE;
 
 		if (unlikely(is_device_private_page(new))) {
@@ -404,7 +404,7 @@ static bool remove_migration_pte(struct folio *folio,
 			else
 				entry = make_readable_device_private_entry(
 							page_to_pfn(new));
-			pte = swp_entry_to_pte(entry);
+			pte = softleaf_to_pte(entry);
 			if (pte_swp_soft_dirty(old_pte))
 				pte = pte_swp_mksoft_dirty(pte);
 			if (pte_swp_uffd_wp(old_pte))
@@ -543,9 +543,9 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmd);
-	if (!is_pmd_migration_entry(*pmd))
+	if (!pmd_is_migration_entry(*pmd))
 		goto unlock;
-	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
+	migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
 	return;
 unlock:
 	spin_unlock(ptl);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index ab373fd38961..592b4561507c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -13,7 +13,7 @@
 #include <linux/oom.h>
 #include <linux/pagewalk.h>
 #include <linux/rmap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -141,7 +141,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
 	struct folio *folio;
 	struct migrate_vma *migrate = walk->private;
 	spinlock_t *ptl;
-	swp_entry_t entry;
 	int ret;
 	unsigned long write = 0;
 
@@ -165,23 +164,24 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
 		if (pmd_write(*pmdp))
 			write = MIGRATE_PFN_WRITE;
 	} else if (!pmd_present(*pmdp)) {
-		entry = pmd_to_swp_entry(*pmdp);
-		folio = pfn_swap_entry_folio(entry);
+		const softleaf_t entry = softleaf_from_pmd(*pmdp);
 
-		if (!is_device_private_entry(entry) ||
+		folio = softleaf_to_folio(entry);
+
+		if (!softleaf_is_device_private(entry) ||
 			!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
 			(folio->pgmap->owner != migrate->pgmap_owner)) {
 			spin_unlock(ptl);
 			return migrate_vma_collect_skip(start, end, walk);
 		}
 
-		if (is_migration_entry(entry)) {
+		if (softleaf_is_migration(entry)) {
 			migration_entry_wait_on_locked(entry, ptl);
 			spin_unlock(ptl);
 			return -EAGAIN;
 		}
 
-		if (is_writable_device_private_entry(entry))
+		if (softleaf_is_device_private_write(entry))
 			write = MIGRATE_PFN_WRITE;
 	} else {
 		spin_unlock(ptl);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index f5f25e120f69..741884645ab0 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/page_table_check.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"page_table_check: " fmt
@@ -179,10 +179,10 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
 EXPORT_SYMBOL(__page_table_check_pud_clear);
 
 /* Whether the swap entry cached writable information */
-static inline bool swap_cached_writable(swp_entry_t entry)
+static inline bool softleaf_cached_writable(softleaf_t entry)
 {
-	return is_writable_device_private_entry(entry) ||
-	       is_writable_migration_entry(entry);
+	return softleaf_is_device_private_write(entry) ||
+		softleaf_is_migration_write(entry);
 }
 
 static void page_table_check_pte_flags(pte_t pte)
@@ -190,9 +190,9 @@ static void page_table_check_pte_flags(pte_t pte)
 	if (pte_present(pte)) {
 		WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte));
 	} else if (pte_swp_uffd_wp(pte)) {
-		const swp_entry_t entry = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
-		WARN_ON_ONCE(swap_cached_writable(entry));
+		WARN_ON_ONCE(softleaf_cached_writable(entry));
 	}
 }
 
@@ -219,9 +219,9 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
 		if (pmd_uffd_wp(pmd))
 			WARN_ON_ONCE(pmd_write(pmd));
 	} else if (pmd_swp_uffd_wp(pmd)) {
-		swp_entry_t entry = pmd_to_swp_entry(pmd);
+		const softleaf_t entry = softleaf_from_pmd(pmd);
 
-		WARN_ON_ONCE(swap_cached_writable(entry));
+		WARN_ON_ONCE(softleaf_cached_writable(entry));
 	}
 }
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4e23818f37f..8137d2366722 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -242,18 +242,19 @@ restart:
 		 */
 		pmde = pmdp_get_lockless(pvmw->pmd);
 
-		if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+		if (pmd_trans_huge(pmde) || pmd_is_migration_entry(pmde)) {
 			pvmw->ptl = pmd_lock(mm, pvmw->pmd);
 			pmde = *pvmw->pmd;
 			if (!pmd_present(pmde)) {
-				swp_entry_t entry;
+				softleaf_t entry;
 
 				if (!thp_migration_supported() ||
 				    !(pvmw->flags & PVMW_MIGRATION))
 					return not_found(pvmw);
-				entry = pmd_to_swp_entry(pmde);
-				if (!is_migration_entry(entry) ||
-				    !check_pmd(swp_offset_pfn(entry), pvmw))
+				entry = softleaf_from_pmd(pmde);
+
+				if (!softleaf_is_migration(entry) ||
+				    !check_pmd(softleaf_to_pfn(entry), pvmw))
 					return not_found(pvmw);
 				return true;
 			}
@@ -273,9 +274,9 @@ restart:
 			 * cannot return prematurely, while zap_huge_pmd() has
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
-			swp_entry_t entry = pmd_to_swp_entry(pmde);
+			const softleaf_t entry = softleaf_from_pmd(pmde);
 
-			if (is_device_private_entry(entry)) {
+			if (softleaf_is_device_private(entry)) {
 				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
 				return true;
 			}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8a29b7237bc6..378c774795fc 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -5,7 +5,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mmu_context.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 #include <asm/tlbflush.h>
 
@@ -973,10 +973,10 @@ pmd_table:
 				goto found;
 			}
 		} else if ((flags & FW_MIGRATION) &&
-			   is_pmd_migration_entry(pmd)) {
-			swp_entry_t entry = pmd_to_swp_entry(pmd);
+			   pmd_is_migration_entry(pmd)) {
+			const softleaf_t entry = softleaf_from_pmd(pmd);
 
-			page = pfn_swap_entry_to_page(entry);
+			page = softleaf_to_page(entry);
 			expose_page = false;
 			goto found;
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index 1954c538a991..775710115a41 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -57,7 +57,7 @@
 #include <linux/sched/task.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/ksm.h>
@@ -2341,7 +2341,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			if (likely(pmd_present(pmdval)))
 				pfn = pmd_pfn(pmdval);
 			else
-				pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));
+				pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));
 
 			subpage = folio_page(folio, pfn - folio_pfn(folio));
 
-- 
cgit v1.2.3


From 15eabc898dc58c9e97eb9ddd56dc6b893e7d0d0e Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:29 +0000
Subject: mm: introduce pmd_is_huge() and use where appropriate

The leaf entry PMD case is confusing as only migration entries and device
private entries are valid at PMD level, not true swap entries.

We repeatedly perform checks of the form is_swap_pmd() || pmd_trans_huge()
which is itself confusing - it implies that leaf entries at PMD level
exist and are different from huge entries.

Address this confusion by introducing pmd_is_huge() which checks for either
case.  Sadly due to header dependency issues (huge_mm.h is included very
early on in headers and cannot really rely on much else) we cannot use
pmd_is_valid_softleaf() here.

However since these are the only valid, handled cases the function is
still achieving what it intends to do.

We then replace all instances of is_swap_pmd() || pmd_trans_huge() with
pmd_is_huge() invocations and adjust logic accordingly to accommodate
this.

No functional change intended.

Link: https://lkml.kernel.org/r/00f79db3b15293cac8f7040a48d69c52d00117e4.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 39 +++++++++++++++++++++++++++++++++++----
 include/linux/swapops.h |  6 ++++++
 mm/huge_memory.c        |  3 ++-
 mm/memory.c             |  4 ++--
 mm/mprotect.c           |  2 +-
 mm/mremap.c             |  2 +-
 6 files changed, 47 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 19d4a5f52ca2..5ab240d61dcc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -419,10 +419,36 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped);
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze);
 
+/**
+ * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry?
+ * @pmd: The PMD to check.
+ *
+ * A huge PMD entry is a non-empty entry which is present and marked huge or a
+ * software leaf entry. This check can be performed without the appropriate
+ * locks held, in which case the condition should be rechecked after they are
+ * acquired.
+ *
+ * Returns: true if this PMD is huge, false otherwise.
+ */
+static inline bool pmd_is_huge(pmd_t pmd)
+{
+	if (pmd_present(pmd)) {
+		return pmd_trans_huge(pmd);
+	} else if (!pmd_none(pmd)) {
+		/*
+		 * Non-present PMDs must be valid huge non-present entries. We
+		 * cannot assert that here due to header dependency issues.
+		 */
+		return true;
+	}
+
+	return false;
+}
+
 #define split_huge_pmd(__vma, __pmd, __address)				\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
-		if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd))	\
+		if (pmd_is_huge(*____pmd))				\
 			__split_huge_pmd(__vma, __pmd, __address,	\
 					 false);			\
 	}  while (0)
@@ -469,10 +495,10 @@ static inline int is_swap_pmd(pmd_t pmd)
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
 {
-	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))
+	if (pmd_is_huge(*pmd))
 		return __pmd_trans_huge_lock(pmd, vma);
-	else
-		return NULL;
+
+	return NULL;
 }
 static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 		struct vm_area_struct *vma)
@@ -743,6 +769,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void)
 {
 	return NULL;
 }
+
+static inline bool pmd_is_huge(pmd_t pmd)
+{
+	return false;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index f1277647262d..41cfc6d59054 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -471,6 +471,12 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 }
 
 #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+		struct page *page)
+{
+	BUILD_BUG();
+}
+
 static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 		struct page *new)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9aa933723355..71dc6e41f0c8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2735,8 +2735,9 @@ unlock_ptls:
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
 	spinlock_t *ptl;
+
 	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
+	if (likely(pmd_is_huge(*pmd)))
 		return ptl;
 	spin_unlock(ptl);
 	return NULL;
diff --git a/mm/memory.c b/mm/memory.c
index 9d0d527e95a8..95dac6a1cbc4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1374,7 +1374,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	src_pmd = pmd_offset(src_pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
+		if (pmd_is_huge(*src_pmd)) {
 			int err;
 
 			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
@@ -1917,7 +1917,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
+		if (pmd_is_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE)
 				__split_huge_pmd(vma, pmd, addr, false);
 			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index aa555dfbdfc5..f910cbf41442 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -474,7 +474,7 @@ again:
 			goto next;
 
 		_pmd = pmdp_get_lockless(pmd);
-		if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
+		if (pmd_is_huge(_pmd)) {
 			if ((next - addr != HPAGE_PMD_SIZE) ||
 			    pgtable_split_needed(vma, cp_flags)) {
 				__split_huge_pmd(vma, pmd, addr, false);
diff --git a/mm/mremap.c b/mm/mremap.c
index 62b6827abacf..fdb0485ede74 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -850,7 +850,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
 		if (!new_pmd)
 			break;
 again:
-		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+		if (pmd_is_huge(*old_pmd)) {
 			if (extent == HPAGE_PMD_SIZE &&
 			    move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
 				continue;
-- 
cgit v1.2.3


From c0a80c2ce68d3a04daa52497fbf524ffb3a376e0 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:30 +0000
Subject: mm: remove remaining is_swap_pmd() users and is_swap_pmd()

Update copy_huge_pmd() and change_huge_pmd() to use
pmd_is_valid_softleaf() - as this checks for the only valid non-present
huge PMD states.

Also update mm/debug_vm_pgtable.c to explicitly test for a valid leaf PMD
entry (which it was not before, which was incorrect), and have it test
against pmd_is_huge() and pmd_is_valid_softleaf() rather than
is_swap_pmd().

With these changes done there are no further users of is_swap_pmd(), so
remove it.

Link: https://lkml.kernel.org/r/1628b00b00c8498bbd2c20b82117ee87845fb738.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  9 ---------
 mm/debug_vm_pgtable.c   | 25 +++++++++++++++----------
 mm/huge_memory.c        |  5 +++--
 3 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5ab240d61dcc..525624c285a6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -486,11 +486,6 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);
 
-static inline int is_swap_pmd(pmd_t pmd)
-{
-	return !pmd_none(pmd) && !pmd_present(pmd);
-}
-
 /* mmap_lock must be held on entry */
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
@@ -692,10 +687,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 struct vm_area_struct *next)
 {
 }
-static inline int is_swap_pmd(pmd_t pmd)
-{
-	return 0;
-}
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
 {
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index fff311830959..608d1011ce03 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -74,6 +74,7 @@ struct pgtable_debug_args {
 	unsigned long		fixed_pte_pfn;
 
 	swp_entry_t		swp_entry;
+	swp_entry_t		leaf_entry;
 };
 
 static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
@@ -745,7 +746,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
 	WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
 }
 
-static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
+static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pmd_t pmd;
 
@@ -757,15 +758,16 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
 		return;
 
 	pr_debug("Validating PMD swap soft dirty\n");
-	pmd = swp_entry_to_pmd(args->swp_entry);
-	WARN_ON(!is_swap_pmd(pmd));
+	pmd = swp_entry_to_pmd(args->leaf_entry);
+	WARN_ON(!pmd_is_huge(pmd));
+	WARN_ON(!pmd_is_valid_softleaf(pmd));
 
 	WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
 	WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
 }
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
 static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
@@ -818,7 +820,7 @@ static void __init pte_swap_tests(struct pgtable_debug_args *args)
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(struct pgtable_debug_args *args)
+static void __init pmd_softleaf_tests(struct pgtable_debug_args *args)
 {
 	swp_entry_t arch_entry;
 	pmd_t pmd1, pmd2;
@@ -827,15 +829,16 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args)
 		return;
 
 	pr_debug("Validating PMD swap\n");
-	pmd1 = swp_entry_to_pmd(args->swp_entry);
-	WARN_ON(!is_swap_pmd(pmd1));
+	pmd1 = swp_entry_to_pmd(args->leaf_entry);
+	WARN_ON(!pmd_is_huge(pmd1));
+	WARN_ON(!pmd_is_valid_softleaf(pmd1));
 
 	arch_entry = __pmd_to_swp_entry(pmd1);
 	pmd2 = __swp_entry_to_pmd(arch_entry);
 	WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1)));
 }
 #else  /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
 static void __init swap_migration_tests(struct pgtable_debug_args *args)
@@ -1229,6 +1232,8 @@ static int __init init_args(struct pgtable_debug_args *args)
 	max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
 	/* Create a swp entry with all possible bits set while still being swap. */
 	args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset);
+	/* Create a non-present migration entry. */
+	args->leaf_entry = make_writable_migration_entry(~0UL);
 
 	/*
 	 * Allocate (huge) pages because some of the tests need to access
@@ -1318,12 +1323,12 @@ static int __init debug_vm_pgtable(void)
 	pte_soft_dirty_tests(&args);
 	pmd_soft_dirty_tests(&args);
 	pte_swap_soft_dirty_tests(&args);
-	pmd_swap_soft_dirty_tests(&args);
+	pmd_leaf_soft_dirty_tests(&args);
 
 	pte_swap_exclusive_tests(&args);
 
 	pte_swap_tests(&args);
-	pmd_swap_tests(&args);
+	pmd_softleaf_tests(&args);
 
 	swap_migration_tests(&args);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 71dc6e41f0c8..e38b0d5e3102 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1800,7 +1800,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() && is_swap_pmd(pmd))) {
+	if (unlikely(thp_migration_supported() &&
+		     pmd_is_valid_softleaf(pmd))) {
 		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
 					  dst_vma, src_vma, pmd, pgtable);
 		ret = 0;
@@ -2487,7 +2488,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (!ptl)
 		return 0;
 
-	if (thp_migration_supported() && is_swap_pmd(*pmd)) {
+	if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
 		change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
 					    uffd_wp_resolve);
 		goto unlock;
-- 
cgit v1.2.3


From 9ff30bb9ab40b34908eefd661f12f99aa00d04c3 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:31 +0000
Subject: mm: remove non_swap_entry() and use softleaf helpers instead

There is simply no need for the hugely confusing concept of 'non-swap'
swap entries now that we have the concept of softleaf entries and relevant
softleaf_xxx() helpers.

Adjust all callers to use these instead and remove non_swap_entry()
altogether.

No functional change intended.

Link: https://lkml.kernel.org/r/2562093f37f4a9cffea0447058014485eb50aaaf.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/gmap_helpers.c | 20 ++++++++++----------
 arch/s390/mm/pgtable.c      | 12 ++++++------
 fs/proc/task_mmu.c          | 12 ++++++------
 include/linux/swapops.h     |  5 -----
 mm/filemap.c                |  2 +-
 mm/hmm.c                    | 16 ++++++++--------
 mm/madvise.c                |  2 +-
 mm/memory.c                 | 36 ++++++++++++++++++------------------
 mm/mincore.c                |  2 +-
 mm/userfaultfd.c            | 24 ++++++++++++------------
 10 files changed, 63 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index d4c3c36855e2..549f14ad08af 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -11,27 +11,27 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/pagewalk.h>
 #include <linux/ksm.h>
 #include <asm/gmap_helpers.h>
 #include <asm/pgtable.h>
 
 /**
- * ptep_zap_swap_entry() - discard a swap entry.
+ * ptep_zap_softleaf_entry() - discard a software leaf entry.
  * @mm: the mm
- * @entry: the swap entry that needs to be zapped
+ * @entry: the software leaf entry that needs to be zapped
  *
- * Discards the given swap entry. If the swap entry was an actual swap
- * entry (and not a migration entry, for example), the actual swapped
+ * Discards the given software leaf entry. If the leaf entry was an actual
+ * swap entry (and not a migration entry, for example), the actual swapped
  * page is also discarded from swap.
  */
-static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
 {
-	if (!non_swap_entry(entry))
+	if (softleaf_is_swap(entry))
 		dec_mm_counter(mm, MM_SWAPENTS);
-	else if (is_migration_entry(entry))
-		dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
+	else if (softleaf_is_migration(entry))
+		dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
 	free_swap_and_cache(entry);
 }
 
@@ -66,7 +66,7 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
 		preempt_disable();
 		pgste = pgste_get_lock(ptep);
 
-		ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
+		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
 		pte_clear(mm, vmaddr, ptep);
 
 		pgste_set_unlock(ptep, pgste);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 0fde20bbc50b..d670bfb47d9b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -16,7 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/sysctl.h>
 #include <linux/ksm.h>
 #include <linux/mman.h>
@@ -683,12 +683,12 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
 	pgste_set_unlock(ptep, pgste);
 }
 
-static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
 {
-	if (!non_swap_entry(entry))
+	if (softleaf_is_swap(entry))
 		dec_mm_counter(mm, MM_SWAPENTS);
-	else if (is_migration_entry(entry)) {
-		struct folio *folio = pfn_swap_entry_folio(entry);
+	else if (softleaf_is_migration(entry)) {
+		struct folio *folio = softleaf_to_folio(entry);
 
 		dec_mm_counter(mm, mm_counter(folio));
 	}
@@ -710,7 +710,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 	if (!reset && pte_swap(pte) &&
 	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
 	     (pgstev & _PGSTE_GPS_ZERO))) {
-		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
+		ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte));
 		pte_clear(mm, addr, ptep);
 	}
 	if (reset)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 898df952b6bc..1f49c81b3591 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1020,13 +1020,13 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	} else if (pte_none(ptent)) {
 		smaps_pte_hole_lookup(addr, walk);
 	} else {
-		swp_entry_t swpent = pte_to_swp_entry(ptent);
+		const softleaf_t entry = softleaf_from_pte(ptent);
 
-		if (!non_swap_entry(swpent)) {
+		if (softleaf_is_swap(entry)) {
 			int mapcount;
 
 			mss->swap += PAGE_SIZE;
-			mapcount = swp_swapcount(swpent);
+			mapcount = swp_swapcount(entry);
 			if (mapcount >= 2) {
 				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
 
@@ -1035,10 +1035,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_pfn_swap_entry(swpent)) {
-			if (is_device_private_entry(swpent))
+		} else if (softleaf_has_pfn(entry)) {
+			if (softleaf_is_device_private(entry))
 				present = true;
-			page = pfn_swap_entry_to_page(swpent);
+			page = softleaf_to_page(entry);
 		}
 	}
 
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 41cfc6d59054..c8e6f927da48 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -492,10 +492,5 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
 
 #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
-static inline int non_swap_entry(swp_entry_t entry)
-{
-	return swp_type(entry) >= MAX_SWAPFILES;
-}
-
 #endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 02355aa46324..07634b7d9934 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4567,7 +4567,7 @@ static void filemap_cachestat(struct address_space *mapping,
 				swp_entry_t swp = radix_to_swp_entry(folio);
 
 				/* swapin error results in poisoned entry */
-				if (non_swap_entry(swp))
+				if (!softleaf_is_swap(swp))
 					goto resched;
 
 				/*
diff --git a/mm/hmm.c b/mm/hmm.c
index e9735a9b6102..0158f2d1e027 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -258,17 +258,17 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	}
 
 	if (!pte_present(pte)) {
-		swp_entry_t entry = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
 		/*
 		 * Don't fault in device private pages owned by the caller,
 		 * just report the PFN.
 		 */
-		if (is_device_private_entry(entry) &&
-		    page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
+		if (softleaf_is_device_private(entry) &&
+		    page_pgmap(softleaf_to_page(entry))->owner ==
 		    range->dev_private_owner) {
 			cpu_flags = HMM_PFN_VALID;
-			if (is_writable_device_private_entry(entry))
+			if (softleaf_is_device_private_write(entry))
 				cpu_flags |= HMM_PFN_WRITE;
 			new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
 			goto out;
@@ -279,16 +279,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		if (!required_fault)
 			goto out;
 
-		if (!non_swap_entry(entry))
+		if (softleaf_is_swap(entry))
 			goto fault;
 
-		if (is_device_private_entry(entry))
+		if (softleaf_is_device_private(entry))
 			goto fault;
 
-		if (is_device_exclusive_entry(entry))
+		if (softleaf_is_device_exclusive(entry))
 			goto fault;
 
-		if (is_migration_entry(entry)) {
+		if (softleaf_is_migration(entry)) {
 			pte_unmap(ptep);
 			hmm_vma_walk->last = addr;
 			migration_entry_wait(walk->mm, pmdp, addr);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5979a4a39738..d8bc51e1bea7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -249,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
 			continue;
 		entry = radix_to_swp_entry(folio);
 		/* There might be swapin error entries in shmem mapping. */
-		if (non_swap_entry(entry))
+		if (!softleaf_is_swap(entry))
 			continue;
 
 		addr = vma->vm_start +
diff --git a/mm/memory.c b/mm/memory.c
index 95dac6a1cbc4..a3f001a47ecf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -932,7 +932,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	struct folio *folio;
 	struct page *page;
 
-	if (likely(!non_swap_entry(entry))) {
+	if (likely(softleaf_is_swap(entry))) {
 		if (swap_duplicate(entry) < 0)
 			return -EIO;
 
@@ -950,12 +950,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 		rss[MM_SWAPENTS]++;
-	} else if (is_migration_entry(entry)) {
-		folio = pfn_swap_entry_folio(entry);
+	} else if (softleaf_is_migration(entry)) {
+		folio = softleaf_to_folio(entry);
 
 		rss[mm_counter(folio)]++;
 
-		if (!is_readable_migration_entry(entry) &&
+		if (!softleaf_is_migration_read(entry) &&
 				is_cow_mapping(vm_flags)) {
 			/*
 			 * COW mappings require pages in both parent and child
@@ -964,15 +964,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			 */
 			entry = make_readable_migration_entry(
 							swp_offset(entry));
-			pte = swp_entry_to_pte(entry);
+			pte = softleaf_to_pte(entry);
 			if (pte_swp_soft_dirty(orig_pte))
 				pte = pte_swp_mksoft_dirty(pte);
 			if (pte_swp_uffd_wp(orig_pte))
 				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
-	} else if (is_device_private_entry(entry)) {
-		page = pfn_swap_entry_to_page(entry);
+	} else if (softleaf_is_device_private(entry)) {
+		page = softleaf_to_page(entry);
 		folio = page_folio(page);
 
 		/*
@@ -996,7 +996,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * when a device driver is involved (you cannot easily
 		 * save and restore device driver state).
 		 */
-		if (is_writable_device_private_entry(entry) &&
+		if (softleaf_is_device_private_write(entry) &&
 		    is_cow_mapping(vm_flags)) {
 			entry = make_readable_device_private_entry(
 							swp_offset(entry));
@@ -1005,7 +1005,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
-	} else if (is_device_exclusive_entry(entry)) {
+	} else if (softleaf_is_device_exclusive(entry)) {
 		/*
 		 * Make device exclusive entries present by restoring the
 		 * original entry then copying as for a present pte. Device
@@ -4625,7 +4625,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	rmap_t rmap_flags = RMAP_NONE;
 	bool need_clear_cache = false;
 	bool exclusive = false;
-	swp_entry_t entry;
+	softleaf_t entry;
 	pte_t pte;
 	vm_fault_t ret = 0;
 	void *shadow = NULL;
@@ -4637,15 +4637,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (!pte_unmap_same(vmf))
 		goto out;
 
-	entry = pte_to_swp_entry(vmf->orig_pte);
-	if (unlikely(non_swap_entry(entry))) {
-		if (is_migration_entry(entry)) {
+	entry = softleaf_from_pte(vmf->orig_pte);
+	if (unlikely(!softleaf_is_swap(entry))) {
+		if (softleaf_is_migration(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
-		} else if (is_device_exclusive_entry(entry)) {
-			vmf->page = pfn_swap_entry_to_page(entry);
+		} else if (softleaf_is_device_exclusive(entry)) {
+			vmf->page = softleaf_to_page(entry);
 			ret = remove_device_exclusive_entry(vmf);
-		} else if (is_device_private_entry(entry)) {
+		} else if (softleaf_is_device_private(entry)) {
 			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
 				/*
 				 * migrate_to_ram is not yet ready to operate
@@ -4656,7 +4656,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 				goto out;
 			}
 
-			vmf->page = pfn_swap_entry_to_page(entry);
+			vmf->page = softleaf_to_page(entry);
 			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 					vmf->address, &vmf->ptl);
 			if (unlikely(!vmf->pte ||
@@ -4680,7 +4680,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			} else {
 				pte_unmap_unlock(vmf->pte, vmf->ptl);
 			}
-		} else if (is_hwpoison_entry(entry)) {
+		} else if (softleaf_is_hwpoison(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else if (softleaf_is_marker(entry)) {
 			ret = handle_pte_marker(vmf);
diff --git a/mm/mincore.c b/mm/mincore.c
index b3682488a65d..9a908d8bb706 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -74,7 +74,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
 	 * absent. Page table may contain migration or hwpoison
 	 * entries which are always uptodate.
 	 */
-	if (non_swap_entry(entry))
+	if (!softleaf_is_swap(entry))
 		return !shmem;
 
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 055ec1050776..bd1f74a7a5ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1256,7 +1256,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 			    unsigned long dst_addr, unsigned long src_addr,
 			    unsigned long len, __u64 mode)
 {
-	swp_entry_t entry;
 	struct swap_info_struct *si = NULL;
 	pte_t orig_src_pte, orig_dst_pte;
 	pte_t src_folio_pte;
@@ -1430,19 +1429,20 @@ retry:
 					orig_dst_pte, orig_src_pte, dst_pmd,
 					dst_pmdval, dst_ptl, src_ptl, &src_folio,
 					len);
-	} else {
+	} else { /* !pte_present() */
 		struct folio *folio = NULL;
+		const softleaf_t entry = softleaf_from_pte(orig_src_pte);
 
-		entry = pte_to_swp_entry(orig_src_pte);
-		if (non_swap_entry(entry)) {
-			if (is_migration_entry(entry)) {
-				pte_unmap(src_pte);
-				pte_unmap(dst_pte);
-				src_pte = dst_pte = NULL;
-				migration_entry_wait(mm, src_pmd, src_addr);
-				ret = -EAGAIN;
-			} else
-				ret = -EFAULT;
+		if (softleaf_is_migration(entry)) {
+			pte_unmap(src_pte);
+			pte_unmap(dst_pte);
+			src_pte = dst_pte = NULL;
+			migration_entry_wait(mm, src_pmd, src_addr);
+
+			ret = -EAGAIN;
+			goto out;
+		} else if (!softleaf_is_swap(entry)) {
+			ret = -EFAULT;
 			goto out;
 		}
 
-- 
cgit v1.2.3


From 03bfbc3ad6e496fb576ca9ace08211943232fdf9 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:32 +0000
Subject: mm: remove is_hugetlb_entry_[migration, hwpoisoned]()

We do not need explicit helper functions for these; they add a level of
confusion and indirection when we can simply use software leaf entry logic
here instead and spell out the special huge_pte_none() case we must
consider.

No functional change intended.

Link: https://lkml.kernel.org/r/0e92d6924d3de88cd014ce1c53e20edc08fc152e.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      | 19 ++++++-----
 include/linux/hugetlb.h |  2 --
 mm/hugetlb.c            | 91 ++++++++++++++++++++-----------------------------
 mm/mempolicy.c          | 17 +++++----
 mm/migrate.c            | 15 +++++---
 5 files changed, 69 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1f49c81b3591..92ada14eabc0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2500,22 +2500,23 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
 				  unsigned long addr, pte_t *ptep,
 				  pte_t ptent)
 {
-	unsigned long psize;
+	const unsigned long psize = huge_page_size(hstate_vma(vma));
+	softleaf_t entry;
 
-	if (is_hugetlb_entry_hwpoisoned(ptent) || pte_is_marker(ptent))
-		return;
+	if (huge_pte_none(ptent))
+		set_huge_pte_at(vma->vm_mm, addr, ptep,
+				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
 
-	psize = huge_page_size(hstate_vma(vma));
+	entry = softleaf_from_pte(ptent);
+	if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
+		return;
 
-	if (is_hugetlb_entry_migration(ptent))
+	if (softleaf_is_migration(entry))
 		set_huge_pte_at(vma->vm_mm, addr, ptep,
 				pte_swp_mkuffd_wp(ptent), psize);
-	else if (!huge_pte_none(ptent))
+	else
 		huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
 					     huge_pte_mkuffd_wp(ptent));
-	else
-		set_huge_pte_at(vma->vm_mm, addr, ptep,
-				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2387513d6ae5..457d48ac7bcd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -274,8 +274,6 @@ void hugetlb_vma_lock_release(struct kref *kref);
 long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot,
 		unsigned long cp_flags);
-bool is_hugetlb_entry_migration(pte_t pte);
-bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59d91c36770c..311c5d601310 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4846,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
 		set_huge_ptep_writable(vma, address, ptep);
 }
 
-bool is_hugetlb_entry_migration(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return false;
-	swp = pte_to_swp_entry(pte);
-	if (is_migration_entry(swp))
-		return true;
-	else
-		return false;
-}
-
-bool is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return false;
-	swp = pte_to_swp_entry(pte);
-	if (is_hwpoison_entry(swp))
-		return true;
-	else
-		return false;
-}
-
 static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
 		      struct folio *new_folio, pte_t old, unsigned long sz)
@@ -4900,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long npages = pages_per_huge_page(h);
 	struct mmu_notifier_range range;
 	unsigned long last_addr_mask;
+	softleaf_t softleaf;
 	int ret = 0;
 
 	if (cow) {
@@ -4947,16 +4922,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
 again:
 		if (huge_pte_none(entry)) {
-			/*
-			 * Skip if src entry none.
-			 */
-			;
-		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+			/* Skip if src entry none. */
+			goto next;
+		}
+
+		softleaf = softleaf_from_pte(entry);
+		if (unlikely(softleaf_is_hwpoison(softleaf))) {
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
-		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
-			softleaf_t softleaf = softleaf_from_pte(entry);
+		} else if (unlikely(softleaf_is_migration(softleaf))) {
 			bool uffd_wp = pte_swp_uffd_wp(entry);
 
 			if (!is_readable_migration_entry(softleaf) && cow) {
@@ -4975,7 +4950,6 @@ again:
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 		} else if (unlikely(pte_is_marker(entry))) {
-			const softleaf_t softleaf = softleaf_from_pte(entry);
 			const pte_marker marker = copy_pte_marker(softleaf, dst_vma);
 
 			if (marker)
@@ -5033,9 +5007,7 @@ again:
 				}
 				hugetlb_install_folio(dst_vma, dst_pte, addr,
 						      new_folio, src_pte_old, sz);
-				spin_unlock(src_ptl);
-				spin_unlock(dst_ptl);
-				continue;
+				goto next;
 			}
 
 			if (cow) {
@@ -5056,6 +5028,8 @@ again:
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 			hugetlb_count_add(npages, dst);
 		}
+
+next:
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 	}
@@ -6064,8 +6038,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ret = 0;
 
 	/* Not present, either a migration or a hwpoisoned entry */
-	if (!pte_present(vmf.orig_pte)) {
-		if (is_hugetlb_entry_migration(vmf.orig_pte)) {
+	if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) {
+		const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte);
+
+		if (softleaf_is_migration(softleaf)) {
 			/*
 			 * Release the hugetlb fault lock now, but retain
 			 * the vma lock, because it is needed to guard the
@@ -6076,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			migration_entry_wait_huge(vma, vmf.address, vmf.pte);
 			return 0;
-		} else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
+		}
+		if (softleaf_is_hwpoison(softleaf)) {
 			ret = VM_FAULT_HWPOISON_LARGE |
 			    VM_FAULT_SET_HINDEX(hstate_index(h));
+		}
+
 		goto out_mutex;
 	}
 
@@ -6460,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
+		softleaf_t entry;
 		spinlock_t *ptl;
+
 		ptep = hugetlb_walk(vma, address, psize);
 		if (!ptep) {
 			if (!uffd_wp) {
@@ -6492,15 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			continue;
 		}
 		pte = huge_ptep_get(mm, address, ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
-			/* Nothing to do. */
-		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
-			softleaf_t entry = softleaf_from_pte(pte);
+		if (huge_pte_none(pte)) {
+			if (unlikely(uffd_wp))
+				/* Safe to modify directly (none->non-present). */
+				set_huge_pte_at(mm, address, ptep,
+						make_pte_marker(PTE_MARKER_UFFD_WP),
+						psize);
+			goto next;
+		}
 
+		entry = softleaf_from_pte(pte);
+		if (unlikely(softleaf_is_hwpoison(entry))) {
+			/* Nothing to do. */
+		} else if (unlikely(softleaf_is_migration(entry))) {
 			struct folio *folio = softleaf_to_folio(entry);
 			pte_t newpte = pte;
 
-			if (is_writable_migration_entry(entry)) {
+			if (softleaf_is_migration_write(entry)) {
 				if (folio_test_anon(folio))
 					entry = make_readable_exclusive_migration_entry(
 								swp_offset(entry));
@@ -6527,7 +6516,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
 				/* Safe to modify directly (non-present->none). */
 				huge_pte_clear(mm, address, ptep, psize);
-		} else if (!huge_pte_none(pte)) {
+		} else {
 			pte_t old_pte;
 			unsigned int shift = huge_page_shift(hstate_vma(vma));
 
@@ -6540,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 				pte = huge_pte_clear_uffd_wp(pte);
 			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
 			pages++;
-		} else {
-			/* None pte */
-			if (unlikely(uffd_wp))
-				/* Safe to modify directly (none->non-present). */
-				set_huge_pte_at(mm, address, ptep,
-						make_pte_marker(PTE_MARKER_UFFD_WP),
-						psize);
 		}
-		spin_unlock(ptl);
 
+next:
+		spin_unlock(ptl);
 		cond_resched();
 	}
 	/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01c3b98f87a6..dee95d5ecfd4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -768,16 +768,21 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
 	unsigned long flags = qp->flags;
 	struct folio *folio;
 	spinlock_t *ptl;
-	pte_t entry;
+	pte_t ptep;
 
 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
-	entry = huge_ptep_get(walk->mm, addr, pte);
-	if (!pte_present(entry)) {
-		if (unlikely(is_hugetlb_entry_migration(entry)))
-			qp->nr_failed++;
+	ptep = huge_ptep_get(walk->mm, addr, pte);
+	if (!pte_present(ptep)) {
+		if (!huge_pte_none(ptep)) {
+			const softleaf_t entry = softleaf_from_pte(ptep);
+
+			if (unlikely(softleaf_is_migration(entry)))
+				qp->nr_failed++;
+		}
+
 		goto unlock;
 	}
-	folio = pfn_folio(pte_pfn(entry));
+	folio = pfn_folio(pte_pfn(ptep));
 	if (!queue_folio_required(folio, qp))
 		goto unlock;
 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
diff --git a/mm/migrate.c b/mm/migrate.c
index ca4ec170a89b..5edfd0b2f63d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -515,16 +515,18 @@ out:
 void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
+	softleaf_t entry;
 	pte_t pte;
 
 	hugetlb_vma_assert_locked(vma);
 	spin_lock(ptl);
 	pte = huge_ptep_get(vma->vm_mm, addr, ptep);
 
-	if (unlikely(!is_hugetlb_entry_migration(pte))) {
-		spin_unlock(ptl);
-		hugetlb_vma_unlock_read(vma);
-	} else {
+	if (huge_pte_none(pte))
+		goto fail;
+
+	entry = softleaf_from_pte(pte);
+	if (softleaf_is_migration(entry)) {
 		/*
 		 * If migration entry existed, safe to release vma lock
 		 * here because the pgtable page won't be freed without the
@@ -533,7 +535,12 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
 		 */
 		hugetlb_vma_unlock_read(vma);
 		migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
+		return;
 	}
+
+fail:
+	spin_unlock(ptl);
+	hugetlb_vma_unlock_read(vma);
 }
 #endif
 
-- 
cgit v1.2.3


From 93976a20345b4aff1ac7598ec1223d65ca33d49c Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:33 +0000
Subject: mm: eliminate further swapops predicates

Having converted so much of the code base to software leaf entries, we can
mop up some remaining cases.

We replace is_pfn_swap_entry(), pfn_swap_entry_to_page(),
is_writable_device_private_entry(), is_device_exclusive_entry(),
is_migration_entry(), is_writable_migration_entry(),
is_readable_migration_entry(), swp_offset_pfn() and pfn_swap_entry_folio()
with softleaf equivalents.

No functional change intended.

Link: https://lkml.kernel.org/r/956bc9c031604811c0070d2f4bf2f1373f230213.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  14 +++---
 include/linux/leafops.h |  25 +++++++---
 include/linux/swapops.h | 121 +-----------------------------------------------
 mm/debug_vm_pgtable.c   |  20 ++++----
 mm/hmm.c                |   2 +-
 mm/hugetlb.c            |   2 +-
 mm/ksm.c                |   6 +--
 mm/memory-failure.c     |   6 +--
 mm/memory.c             |   3 +-
 mm/mempolicy.c          |   4 +-
 mm/migrate.c            |   6 +--
 mm/migrate_device.c     |  10 ++--
 mm/mprotect.c           |   8 ++--
 mm/page_vma_mapped.c    |   8 ++--
 mm/pagewalk.c           |   7 ++-
 mm/rmap.c               |   9 ++--
 16 files changed, 75 insertions(+), 176 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 92ada14eabc0..41b062ce6ad8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1941,13 +1941,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		if (pte_uffd_wp(pte))
 			flags |= PM_UFFD_WP;
 	} else {
-		swp_entry_t entry;
+		softleaf_t entry;
 
 		if (pte_swp_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 		if (pte_swp_uffd_wp(pte))
 			flags |= PM_UFFD_WP;
-		entry = pte_to_swp_entry(pte);
+		entry = softleaf_from_pte(pte);
 		if (pm->show_pfn) {
 			pgoff_t offset;
 
@@ -1955,16 +1955,16 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 			 * For PFN swap offsets, keeping the offset field
 			 * to be PFN only to be compatible with old smaps.
 			 */
-			if (is_pfn_swap_entry(entry))
-				offset = swp_offset_pfn(entry);
+			if (softleaf_has_pfn(entry))
+				offset = softleaf_to_pfn(entry);
 			else
 				offset = swp_offset(entry);
 			frame = swp_type(entry) |
 			    (offset << MAX_SWAPFILES_SHIFT);
 		}
 		flags |= PM_SWAP;
-		if (is_pfn_swap_entry(entry))
-			page = pfn_swap_entry_to_page(entry);
+		if (softleaf_has_pfn(entry))
+			page = softleaf_to_page(entry);
 		if (softleaf_is_uffd_wp_marker(entry))
 			flags |= PM_UFFD_WP;
 		if (softleaf_is_guard_marker(entry))
@@ -2033,7 +2033,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
 		if (pmd_swp_uffd_wp(pmd))
 			flags |= PM_UFFD_WP;
 		VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
-		page = pfn_swap_entry_to_page(entry);
+		page = softleaf_to_page(entry);
 	}
 
 	if (page) {
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index f5ea9b0385ff..d282fab866a1 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -355,7 +355,7 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry)
 	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
 
 	/* Temporary until swp_entry_t eliminated. */
-	return swp_offset_pfn(entry);
+	return swp_offset(entry) & SWP_PFN_MASK;
 }
 
 /**
@@ -366,10 +366,16 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry)
  */
 static inline struct page *softleaf_to_page(softleaf_t entry)
 {
+	struct page *page = pfn_to_page(softleaf_to_pfn(entry));
+
 	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding page is locked
+	 */
+	VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page));
 
-	/* Temporary until swp_entry_t eliminated. */
-	return pfn_swap_entry_to_page(entry);
+	return page;
 }
 
 /**
@@ -380,10 +386,17 @@ static inline struct page *softleaf_to_page(softleaf_t entry)
  */
 static inline struct folio *softleaf_to_folio(softleaf_t entry)
 {
-	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+	struct folio *folio = pfn_folio(softleaf_to_pfn(entry));
 
-	/* Temporary until swp_entry_t eliminated. */
-	return pfn_swap_entry_folio(entry);
+	VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding folio is locked.
+	 */
+	VM_WARN_ON_ONCE(softleaf_is_migration(entry) &&
+			!folio_test_locked(folio));
+
+	return folio;
 }
 
 /**
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c8e6f927da48..3d02b288c15e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -28,7 +28,7 @@
 #define SWP_OFFSET_MASK	((1UL << SWP_TYPE_SHIFT) - 1)
 
 /*
- * Definitions only for PFN swap entries (see is_pfn_swap_entry()).  To
+ * Definitions only for PFN swap entries (see softleaf_has_pfn()).  To
  * store PFN, we only need SWP_PFN_BITS bits.  Each of the pfn swap entries
  * can use the extra bits to store other information besides PFN.
  */
@@ -66,8 +66,6 @@
 #define SWP_MIG_YOUNG			BIT(SWP_MIG_YOUNG_BIT)
 #define SWP_MIG_DIRTY			BIT(SWP_MIG_DIRTY_BIT)
 
-static inline bool is_pfn_swap_entry(swp_entry_t entry);
-
 /* Clear all flags but only keep swp_entry_t related information */
 static inline pte_t pte_swp_clear_flags(pte_t pte)
 {
@@ -109,17 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 	return entry.val & SWP_OFFSET_MASK;
 }
 
-/*
- * This should only be called upon a pfn swap entry to get the PFN stored
- * in the swap entry.  Please refers to is_pfn_swap_entry() for definition
- * of pfn swap entry.
- */
-static inline unsigned long swp_offset_pfn(swp_entry_t entry)
-{
-	VM_BUG_ON(!is_pfn_swap_entry(entry));
-	return swp_offset(entry) & SWP_PFN_MASK;
-}
-
 /*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
  * arch-independent swp_entry_t.
@@ -169,27 +156,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
 	return swp_entry(SWP_DEVICE_WRITE, offset);
 }
 
-static inline bool is_device_private_entry(swp_entry_t entry)
-{
-	int type = swp_type(entry);
-	return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
-}
-
-static inline bool is_writable_device_private_entry(swp_entry_t entry)
-{
-	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
-}
-
 static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
 {
 	return swp_entry(SWP_DEVICE_EXCLUSIVE, offset);
 }
 
-static inline bool is_device_exclusive_entry(swp_entry_t entry)
-{
-	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE;
-}
-
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
@@ -201,50 +172,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
 	return swp_entry(0, 0);
 }
 
-static inline bool is_device_private_entry(swp_entry_t entry)
-{
-	return false;
-}
-
-static inline bool is_writable_device_private_entry(swp_entry_t entry)
-{
-	return false;
-}
-
 static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
 {
 	return swp_entry(0, 0);
 }
 
-static inline bool is_device_exclusive_entry(swp_entry_t entry)
-{
-	return false;
-}
-
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
-static inline int is_migration_entry(swp_entry_t entry)
-{
-	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
-			swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE ||
-			swp_type(entry) == SWP_MIGRATION_WRITE);
-}
-
-static inline int is_writable_migration_entry(swp_entry_t entry)
-{
-	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
-}
-
-static inline int is_readable_migration_entry(swp_entry_t entry)
-{
-	return unlikely(swp_type(entry) == SWP_MIGRATION_READ);
-}
-
-static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
-{
-	return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE);
-}
 
 static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
 {
@@ -310,23 +245,10 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
 	return swp_entry(0, 0);
 }
 
-static inline int is_migration_entry(swp_entry_t swp)
-{
-	return 0;
-}
-
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address) { }
 static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
 					     unsigned long addr, pte_t *pte) { }
-static inline int is_writable_migration_entry(swp_entry_t entry)
-{
-	return 0;
-}
-static inline int is_readable_migration_entry(swp_entry_t entry)
-{
-	return 0;
-}
 
 static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
 {
@@ -410,47 +332,6 @@ static inline swp_entry_t make_guard_swp_entry(void)
 	return make_pte_marker_entry(PTE_MARKER_GUARD);
 }
 
-static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
-{
-	struct page *p = pfn_to_page(swp_offset_pfn(entry));
-
-	/*
-	 * Any use of migration entries may only occur while the
-	 * corresponding page is locked
-	 */
-	BUG_ON(is_migration_entry(entry) && !PageLocked(p));
-
-	return p;
-}
-
-static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
-{
-	struct folio *folio = pfn_folio(swp_offset_pfn(entry));
-
-	/*
-	 * Any use of migration entries may only occur while the
-	 * corresponding folio is locked
-	 */
-	BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
-
-	return folio;
-}
-
-/*
- * A pfn swap entry is a special type of swap entry that always has a pfn stored
- * in the swap offset. They can either be used to represent unaddressable device
- * memory, to restrict access to a page undergoing migration or to represent a
- * pfn which has been hwpoisoned and unmapped.
- */
-static inline bool is_pfn_swap_entry(swp_entry_t entry)
-{
-	/* Make sure the swp offset can always store the needed fields */
-	BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
-
-	return is_migration_entry(entry) || is_device_private_entry(entry) ||
-	       is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
-}
-
 struct page_vma_mapped_walk;
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 608d1011ce03..64db85a80558 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -844,7 +844,7 @@ static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { }
 static void __init swap_migration_tests(struct pgtable_debug_args *args)
 {
 	struct page *page;
-	swp_entry_t swp;
+	softleaf_t entry;
 
 	if (!IS_ENABLED(CONFIG_MIGRATION))
 		return;
@@ -867,17 +867,17 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
 	 * be locked, otherwise it stumbles upon a BUG_ON().
 	 */
 	__SetPageLocked(page);
-	swp = make_writable_migration_entry(page_to_pfn(page));
-	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(!is_writable_migration_entry(swp));
+	entry = make_writable_migration_entry(page_to_pfn(page));
+	WARN_ON(!softleaf_is_migration(entry));
+	WARN_ON(!softleaf_is_migration_write(entry));
 
-	swp = make_readable_migration_entry(swp_offset(swp));
-	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(is_writable_migration_entry(swp));
+	entry = make_readable_migration_entry(swp_offset(entry));
+	WARN_ON(!softleaf_is_migration(entry));
+	WARN_ON(softleaf_is_migration_write(entry));
 
-	swp = make_readable_migration_entry(page_to_pfn(page));
-	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(is_writable_migration_entry(swp));
+	entry = make_readable_migration_entry(page_to_pfn(page));
+	WARN_ON(!softleaf_is_migration(entry));
+	WARN_ON(softleaf_is_migration_write(entry));
 	__ClearPageLocked(page);
 }
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 0158f2d1e027..3912d92a2b9a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -270,7 +270,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			cpu_flags = HMM_PFN_VALID;
 			if (softleaf_is_device_private_write(entry))
 				cpu_flags |= HMM_PFN_WRITE;
-			new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+			new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags;
 			goto out;
 		}
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 311c5d601310..9e7815b4f058 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4934,7 +4934,7 @@ again:
 		} else if (unlikely(softleaf_is_migration(softleaf))) {
 			bool uffd_wp = pte_swp_uffd_wp(entry);
 
-			if (!is_readable_migration_entry(softleaf) && cow) {
+			if (!softleaf_is_migration_read(softleaf) && cow) {
 				/*
 				 * COW mappings require pages in both
 				 * parent and child to be set to read.
diff --git a/mm/ksm.c b/mm/ksm.c
index f9a1a3658ead..cfc182255c7b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -632,14 +632,14 @@ static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long en
 		if (pte_present(pte)) {
 			folio = vm_normal_folio(walk->vma, addr, pte);
 		} else if (!pte_none(pte)) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
+			const softleaf_t entry = softleaf_from_pte(pte);
 
 			/*
 			 * As KSM pages remain KSM pages until freed, no need to wait
 			 * here for migration to end.
 			 */
-			if (is_migration_entry(entry))
-				folio = pfn_swap_entry_folio(entry);
+			if (softleaf_is_migration(entry))
+				folio = softleaf_to_folio(entry);
 		}
 		/* return 1 if the page is an normal ksm page or KSM-placed zero page */
 		found = (folio && folio_test_ksm(folio)) ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f7fb9bf287a..71652cfedcdf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -693,10 +693,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
 	if (pte_present(pte)) {
 		pfn = pte_pfn(pte);
 	} else {
-		swp_entry_t swp = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
-		if (is_hwpoison_entry(swp))
-			pfn = swp_offset_pfn(swp);
+		if (softleaf_is_hwpoison(entry))
+			pfn = softleaf_to_pfn(entry);
 	}
 
 	if (!pfn || pfn != poisoned_pfn)
diff --git a/mm/memory.c b/mm/memory.c
index a3f001a47ecf..525da4479228 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -902,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
 static int try_restore_exclusive_pte(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *ptep, pte_t orig_pte)
 {
-	struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+	const softleaf_t entry = softleaf_from_pte(orig_pte);
+	struct page *page = softleaf_to_page(entry);
 	struct folio *folio = page_folio(page);
 
 	if (folio_trylock(folio)) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dee95d5ecfd4..acb9bf89f619 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -705,7 +705,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 		if (pte_none(ptent))
 			continue;
 		if (!pte_present(ptent)) {
-			if (is_migration_entry(pte_to_swp_entry(ptent)))
+			const softleaf_t entry = softleaf_from_pte(ptent);
+
+			if (softleaf_is_migration(entry))
 				qp->nr_failed++;
 			continue;
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index 5edfd0b2f63d..c39dfea1a925 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -483,7 +483,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	spinlock_t *ptl;
 	pte_t *ptep;
 	pte_t pte;
-	swp_entry_t entry;
+	softleaf_t entry;
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
@@ -495,8 +495,8 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	if (pte_none(pte) || pte_present(pte))
 		goto out;
 
-	entry = pte_to_swp_entry(pte);
-	if (!is_migration_entry(entry))
+	entry = softleaf_from_pte(pte);
+	if (!softleaf_is_migration(entry))
 		goto out;
 
 	migration_entry_wait_on_locked(entry, ptl);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 592b4561507c..b1ce6e3478d6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -279,7 +279,7 @@ again:
 		unsigned long mpfn = 0, pfn;
 		struct folio *folio;
 		struct page *page;
-		swp_entry_t entry;
+		softleaf_t entry;
 		pte_t pte;
 
 		pte = ptep_get(ptep);
@@ -298,11 +298,11 @@ again:
 			 * page table entry. Other special swap entries are not
 			 * migratable, and we ignore regular swapped page.
 			 */
-			entry = pte_to_swp_entry(pte);
-			if (!is_device_private_entry(entry))
+			entry = softleaf_from_pte(pte);
+			if (!softleaf_is_device_private(entry))
 				goto next;
 
-			page = pfn_swap_entry_to_page(entry);
+			page = softleaf_to_page(entry);
 			pgmap = page_pgmap(page);
 			if (!(migrate->flags &
 				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
@@ -330,7 +330,7 @@ again:
 
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 					MIGRATE_PFN_MIGRATE;
-			if (is_writable_device_private_entry(entry))
+			if (softleaf_is_device_private_write(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
 			pfn = pte_pfn(pte);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f910cbf41442..283889e4f1ce 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -317,11 +317,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 				pages++;
 			}
 		} else  {
-			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			softleaf_t entry = softleaf_from_pte(oldpte);
 			pte_t newpte;
 
-			if (is_writable_migration_entry(entry)) {
-				struct folio *folio = pfn_swap_entry_folio(entry);
+			if (softleaf_is_migration_write(entry)) {
+				const struct folio *folio = softleaf_to_folio(entry);
 
 				/*
 				 * A protection check is difficult so
@@ -335,7 +335,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-			} else if (is_writable_device_private_entry(entry)) {
+			} else if (softleaf_is_device_private_write(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_nonpresent_pte() for explanation.
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8137d2366722..b38a1d00c971 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -49,7 +49,7 @@ again:
 		if (is_migration)
 			return false;
 	} else if (!is_migration) {
-		swp_entry_t entry;
+		softleaf_t entry;
 
 		/*
 		 * Handle un-addressable ZONE_DEVICE memory.
@@ -67,9 +67,9 @@ again:
 		 * For more details on device private memory see HMM
 		 * (include/linux/hmm.h or mm/hmm.c).
 		 */
-		entry = pte_to_swp_entry(ptent);
-		if (!is_device_private_entry(entry) &&
-		    !is_device_exclusive_entry(entry))
+		entry = softleaf_from_pte(ptent);
+		if (!softleaf_is_device_private(entry) &&
+		    !softleaf_is_device_exclusive(entry))
 			return false;
 	}
 	spin_lock(*ptlp);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 378c774795fc..90cc346a6ecf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1007,11 +1007,10 @@ pte_table:
 			goto found;
 		}
 	} else if (!pte_none(pte)) {
-		swp_entry_t entry = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
-		if ((flags & FW_MIGRATION) &&
-		    is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
+		if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) {
+			page = softleaf_to_page(entry);
 			expose_page = false;
 			goto found;
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index 775710115a41..345466ad396b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1969,7 +1969,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		if (likely(pte_present(pteval))) {
 			pfn = pte_pfn(pteval);
 		} else {
-			pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+			pfn = softleaf_to_pfn(pte_to_swp_entry(pteval));
 			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 		}
 
@@ -2368,7 +2368,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		if (likely(pte_present(pteval))) {
 			pfn = pte_pfn(pteval);
 		} else {
-			pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+			pfn = softleaf_to_pfn(pte_to_swp_entry(pteval));
 			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 		}
 
@@ -2453,8 +2453,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				folio_mark_dirty(folio);
 			writable = pte_write(pteval);
 		} else {
+			const softleaf_t entry = softleaf_from_pte(pteval);
+
 			pte_clear(mm, address, pvmw.pte);
-			writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
+
+			writable = softleaf_is_device_private_write(entry);
 		}
 
 		VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
-- 
cgit v1.2.3


From a3a3e215c9c140c08760d4d96ba4e8bc485d0f14 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 10 Nov 2025 22:21:34 +0000
Subject: mm: replace remaining pte_to_swp_entry() with softleaf_from_pte()

There are straggler invocations of pte_to_swp_entry() lying around,
replace all of these with the software leaf entry equivalent -
softleaf_from_pte().

With those removed, eliminate pte_to_swp_entry() altogether.

No functional change intended.

Link: https://lkml.kernel.org/r/d8ee5ccefe4c42d7c4fe1a2e46f285ac40421cd3.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/leafops.h |  7 ++++++-
 include/linux/swapops.h | 13 -------------
 mm/debug_vm_pgtable.c   |  2 +-
 mm/internal.h           |  7 +++++--
 mm/memory-failure.c     |  2 +-
 mm/memory.c             | 16 ++++++++--------
 mm/migrate.c            |  2 +-
 mm/mincore.c            |  4 +++-
 mm/rmap.c               |  8 ++++++--
 mm/swapfile.c           | 13 +++++++++++--
 10 files changed, 42 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index d282fab866a1..cfafe7a5e7b1 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -54,11 +54,16 @@ static inline softleaf_t softleaf_mk_none(void)
  */
 static inline softleaf_t softleaf_from_pte(pte_t pte)
 {
+	softleaf_t arch_entry;
+
 	if (pte_present(pte) || pte_none(pte))
 		return softleaf_mk_none();
 
+	pte = pte_swp_clear_flags(pte);
+	arch_entry = __pte_to_swp_entry(pte);
+
 	/* Temporary until swp_entry_t eliminated. */
-	return pte_to_swp_entry(pte);
+	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
 
 /**
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 3d02b288c15e..8cfc966eae48 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -107,19 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 	return entry.val & SWP_OFFSET_MASK;
 }
 
-/*
- * Convert the arch-dependent pte representation of a swp_entry_t into an
- * arch-independent swp_entry_t.
- */
-static inline swp_entry_t pte_to_swp_entry(pte_t pte)
-{
-	swp_entry_t arch_entry;
-
-	pte = pte_swp_clear_flags(pte);
-	arch_entry = __pte_to_swp_entry(pte);
-	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
-}
-
 /*
  * Convert the arch-independent representation of a swp_entry_t into the
  * arch-dependent pte representation.
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 64db85a80558..1eae87dbef73 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1229,7 +1229,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 	init_fixed_pfns(args);
 
 	/* See generic_max_swapfile_size(): probe the maximum offset */
-	max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+	max_swap_offset = swp_offset(softleaf_from_pte(softleaf_to_pte(swp_entry(0, ~0UL))));
 	/* Create a swp entry with all possible bits set while still being swap. */
 	args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset);
 	/* Create a non-present migration entry. */
diff --git a/mm/internal.h b/mm/internal.h
index 2ed041e6ebc3..929bc4a5dd98 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -334,7 +334,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
  */
 static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
 {
-	swp_entry_t entry = pte_to_swp_entry(pte);
+	const softleaf_t entry = softleaf_from_pte(pte);
 	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
 						   (swp_offset(entry) + delta)));
 
@@ -389,11 +389,14 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
 
 	cgroup_id = lookup_swap_cgroup_id(entry);
 	while (ptep < end_ptep) {
+		softleaf_t entry;
+
 		pte = ptep_get(ptep);
 
 		if (!pte_same(pte, expected_pte))
 			break;
-		if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
+		entry = softleaf_from_pte(pte);
+		if (lookup_swap_cgroup_id(entry) != cgroup_id)
 			break;
 		expected_pte = pte_next_swp_offset(expected_pte);
 		ptep++;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 71652cfedcdf..7f908ad795ad 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -51,7 +51,7 @@
 #include <linux/backing-dev.h>
 #include <linux/migrate.h>
 #include <linux/slab.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
diff --git a/mm/memory.c b/mm/memory.c
index 525da4479228..50b93b45b174 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1218,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress, max_nr, ret = 0;
 	int rss[NR_MM_COUNTERS];
-	swp_entry_t entry = (swp_entry_t){0};
+	softleaf_t entry = softleaf_mk_none();
 	struct folio *prealloc = NULL;
 	int nr;
 
@@ -1282,7 +1282,7 @@ again:
 						  dst_vma, src_vma,
 						  addr, rss);
 			if (ret == -EIO) {
-				entry = pte_to_swp_entry(ptep_get(src_pte));
+				entry = softleaf_from_pte(ptep_get(src_pte));
 				break;
 			} else if (ret == -EBUSY) {
 				break;
@@ -4446,13 +4446,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio;
-	swp_entry_t entry;
+	softleaf_t entry;
 
 	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
 	if (!folio)
 		return NULL;
 
-	entry = pte_to_swp_entry(vmf->orig_pte);
+	entry = softleaf_from_pte(vmf->orig_pte);
 	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
 					   GFP_KERNEL, entry)) {
 		folio_put(folio);
@@ -4470,7 +4470,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 {
 	unsigned long addr;
-	swp_entry_t entry;
+	softleaf_t entry;
 	int idx;
 	pte_t pte;
 
@@ -4480,7 +4480,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 
 	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
 		return false;
-	entry = pte_to_swp_entry(pte);
+	entry = softleaf_from_pte(pte);
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
 
@@ -4526,7 +4526,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
-	swp_entry_t entry;
+	softleaf_t entry;
 	spinlock_t *ptl;
 	pte_t *pte;
 	gfp_t gfp;
@@ -4547,7 +4547,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (!zswap_never_enabled())
 		goto fallback;
 
-	entry = pte_to_swp_entry(vmf->orig_pte);
+	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
 	 * and suitable for swapping THP.
diff --git a/mm/migrate.c b/mm/migrate.c
index c39dfea1a925..b2ad78bf85d5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -534,7 +534,7 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
 		 * lock release in migration_entry_wait_on_locked().
 		 */
 		hugetlb_vma_unlock_read(vma);
-		migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
+		migration_entry_wait_on_locked(entry, ptl);
 		return;
 	}
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 9a908d8bb706..e5d13eea9234 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -202,7 +202,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			for (i = 0; i < step; i++)
 				vec[i] = 1;
 		} else { /* pte is a swap entry */
-			*vec = mincore_swap(pte_to_swp_entry(pte), false);
+			const softleaf_t entry = softleaf_from_pte(pte);
+
+			*vec = mincore_swap(entry, false);
 		}
 		vec += step;
 	}
diff --git a/mm/rmap.c b/mm/rmap.c
index 345466ad396b..d871f2eb821c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1969,7 +1969,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		if (likely(pte_present(pteval))) {
 			pfn = pte_pfn(pteval);
 		} else {
-			pfn = softleaf_to_pfn(pte_to_swp_entry(pteval));
+			const softleaf_t entry = softleaf_from_pte(pteval);
+
+			pfn = softleaf_to_pfn(entry);
 			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 		}
 
@@ -2368,7 +2370,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		if (likely(pte_present(pteval))) {
 			pfn = pte_pfn(pteval);
 		} else {
-			pfn = softleaf_to_pfn(pte_to_swp_entry(pteval));
+			const softleaf_t entry = softleaf_from_pte(pteval);
+
+			pfn = softleaf_to_pfn(entry);
 			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 		}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8c7f14061f5b..94e0f0c54168 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3202,8 +3202,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
  */
 unsigned long generic_max_swapfile_size(void)
 {
-	return swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+	swp_entry_t entry = swp_entry(0, ~0UL);
+	const pte_t pte = softleaf_to_pte(entry);
+
+	/*
+	 * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
+	 * we need to do this manually.
+	 */
+	entry = __pte_to_swp_entry(pte);
+	entry = swp_entry(__swp_type(entry), __swp_offset(entry));
+
+	return swp_offset(entry) + 1;
 }
 
 /* Can be overridden by an architecture for additional checks. */
-- 
cgit v1.2.3


From ad7c7f4576a5977b4ec4ac5dd090ab3f81ca7c6f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Mon, 10 Nov 2025 16:17:56 +0800
Subject: mm: thp: introduce folio_split_queue_lock and its variants

In future memcg removal, the binding between a folio and a memcg may
change, making the split lock within the memcg unstable when held.

A new approach is required to reparent the split queue to its parent.
This patch starts introducing a unified way to acquire the split lock for
future work.

It's a code-only refactoring with no functional changes.

Link: https://lkml.kernel.org/r/a31a90bcac04dc754f775e87ae3205be3170b571.1762762324.git.zhengqi.arch@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  10 ++++
 mm/huge_memory.c           | 119 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 94 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 966f7c1a0128..b0c6a4635c67 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1647,6 +1647,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 void reparent_shrinker_deferred(struct mem_cgroup *memcg);
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return shrinker->id;
+}
 #else
 #define mem_cgroup_sockets_enabled 0
 
@@ -1678,6 +1683,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
 				    int nid, int shrinker_id)
 {
 }
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return -1;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9758171d49c9..bfcb5d895f67 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1077,28 +1077,86 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
+static struct deferred_split *split_queue_node(int nid)
+{
+	struct pglist_data *pgdata = NODE_DATA(nid);
+
+	return &pgdata->deferred_split_queue;
+}
+
 #ifdef CONFIG_MEMCG
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
 {
-	struct mem_cgroup *memcg = folio_memcg(folio);
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+	if (mem_cgroup_disabled())
+		return NULL;
+	if (split_queue_node(folio_nid(folio)) == queue)
+		return NULL;
+	return container_of(queue, struct mem_cgroup, deferred_split_queue);
+}
 
-	if (memcg)
-		return &memcg->deferred_split_queue;
-	else
-		return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
 }
 #else
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
 {
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+	return NULL;
+}
 
-	return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+	return split_queue_node(nid);
 }
 #endif
 
+static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
+{
+	struct deferred_split *queue;
+
+	queue = memcg_split_queue(nid, memcg);
+	spin_lock(&queue->split_queue_lock);
+
+	return queue;
+}
+
+static struct deferred_split *
+split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
+{
+	struct deferred_split *queue;
+
+	queue = memcg_split_queue(nid, memcg);
+	spin_lock_irqsave(&queue->split_queue_lock, *flags);
+
+	return queue;
+}
+
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+	return split_queue_lock(folio_nid(folio), folio_memcg(folio));
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+	return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+	spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+						 unsigned long flags)
+{
+	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -3690,7 +3748,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
 		struct list_head *list, enum split_type split_type, bool unmapped)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
+	struct deferred_split *ds_queue;
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
 	struct folio *end_folio = folio_next(folio);
 	bool is_anon = folio_test_anon(folio);
@@ -3824,7 +3882,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock(&ds_queue->split_queue_lock);
+	ds_queue = folio_split_queue_lock(folio);
 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
 		struct swap_cluster_info *ci = NULL;
 		struct lruvec *lruvec;
@@ -3846,7 +3904,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 			 */
 			list_del_init(&folio->_deferred_list);
 		}
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -3946,7 +4004,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		if (ci)
 			swap_cluster_unlock(ci);
 	} else {
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 		ret = -EAGAIN;
 	}
 fail:
@@ -4129,8 +4187,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 	WARN_ON_ONCE(folio_ref_count(folio));
 	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-	ds_queue = get_deferred_split_queue(folio);
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		if (folio_test_partially_mapped(folio)) {
@@ -4141,7 +4198,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 		list_del_init(&folio->_deferred_list);
 		unqueued = true;
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	return unqueued;	/* useful for debug warnings */
 }
@@ -4149,10 +4206,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-#ifdef CONFIG_MEMCG
-	struct mem_cgroup *memcg = folio_memcg(folio);
-#endif
+	struct deferred_split *ds_queue;
 	unsigned long flags;
 
 	/*
@@ -4175,7 +4229,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	if (folio_test_swapcache(folio))
 		return;
 
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
 			folio_set_partially_mapped(folio);
@@ -4190,15 +4244,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
 	}
 	if (list_empty(&folio->_deferred_list)) {
+		struct mem_cgroup *memcg;
+
+		memcg = folio_split_queue_memcg(folio, ds_queue);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
 		if (memcg)
 			set_shrinker_bit(memcg, folio_nid(folio),
-					 deferred_split_shrinker->id);
-#endif
+					 shrinker_id(deferred_split_shrinker));
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4244,19 +4299,13 @@ static bool thp_underused(struct folio *folio)
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+	struct deferred_split *ds_queue;
 	unsigned long flags;
 	LIST_HEAD(list);
 	struct folio *folio, *next, *prev = NULL;
 	int split = 0, removed = 0;
 
-#ifdef CONFIG_MEMCG
-	if (sc->memcg)
-		ds_queue = &sc->memcg->deferred_split_queue;
-#endif
-
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
 	/* Take pin on all head pages to avoid freeing them under us */
 	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
 							_deferred_list) {
@@ -4275,7 +4324,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		if (!--sc->nr_to_scan)
 			break;
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
 		bool did_split = false;
-- 
cgit v1.2.3


From 46156dba32cb68537d36877a97d672227f3e8134 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 10 Nov 2025 16:17:58 +0800
Subject: mm: thp: reparent the split queue during memcg offline

Similar to list_lru, the split queue is relatively independent and does
not need to be reparented along with objcg and LRU folios (holding objcg
lock and lru lock).  So let's apply a mechanism similar to list_lru's to
reparent the split queue separately when memcg is offline.

This is also a preparation for reparenting LRU folios.

Link: https://lkml.kernel.org/r/8703f907c4d1f7e8a2ef2bfed3036a84fa53028b.1762762324.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    |  4 ++++
 include/linux/memcontrol.h | 11 +++++++++++
 mm/huge_memory.c           | 44 ++++++++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c            |  1 +
 4 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 525624c285a6..e2e91aa1a042 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -415,6 +415,9 @@ static inline int split_huge_page(struct page *page)
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
 void deferred_split_folio(struct folio *folio, bool partially_mapped);
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg);
+#endif
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze);
@@ -647,6 +650,7 @@ static inline int try_folio_split_to_order(struct folio *folio,
 }
 
 static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
+static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b0c6a4635c67..cc6db20d7dca 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1775,6 +1775,12 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
 
 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
+
+static inline bool memcg_is_dying(struct mem_cgroup *memcg)
+{
+	return memcg ? css_is_dying(&memcg->css) : false;
+}
+
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1845,6 +1851,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
 static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
 {
 }
+
+static inline bool memcg_is_dying(struct mem_cgroup *memcg)
+{
+	return false;
+}
 #endif /* CONFIG_MEMCG */
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 13684e5376e8..d17d3810a882 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1118,8 +1118,19 @@ static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg
 {
 	struct deferred_split *queue;
 
+retry:
 	queue = memcg_split_queue(nid, memcg);
 	spin_lock(&queue->split_queue_lock);
+	/*
+	 * There is a period between setting memcg to dying and reparenting
+	 * deferred split queue, and during this period the THPs in the deferred
+	 * split queue will be hidden from the shrinker side.
+	 */
+	if (unlikely(memcg_is_dying(memcg))) {
+		spin_unlock(&queue->split_queue_lock);
+		memcg = parent_mem_cgroup(memcg);
+		goto retry;
+	}
 
 	return queue;
 }
@@ -1129,8 +1140,14 @@ split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags
 {
 	struct deferred_split *queue;
 
+retry:
 	queue = memcg_split_queue(nid, memcg);
 	spin_lock_irqsave(&queue->split_queue_lock, *flags);
+	if (unlikely(memcg_is_dying(memcg))) {
+		spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+		memcg = parent_mem_cgroup(memcg);
+		goto retry;
+	}
 
 	return queue;
 }
@@ -4389,6 +4406,33 @@ next:
 	return split;
 }
 
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
+	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
+	int nid;
+
+	spin_lock_irq(&ds_queue->split_queue_lock);
+	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
+
+	if (!ds_queue->split_queue_len)
+		goto unlock;
+
+	list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
+	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
+	ds_queue->split_queue_len = 0;
+
+	for_each_node(nid)
+		set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
+
+unlock:
+	spin_unlock(&parent_ds_queue->split_queue_lock);
+	spin_unlock_irq(&ds_queue->split_queue_lock);
+}
+#endif
+
 #ifdef CONFIG_DEBUG_FS
 static void split_huge_pages_all(void)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bfc986da3289..623446821b00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3920,6 +3920,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	zswap_memcg_offline_cleanup(memcg);
 
 	memcg_offline_kmem(memcg);
+	reparent_deferred_split_queue(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
 	lru_gen_offline_memcg(memcg);
-- 
cgit v1.2.3


From 9e014077083753461938312d565e4ac7119570d1 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Fri, 14 Nov 2025 03:00:28 +0000
Subject: mm/khugepaged: unify SCAN_PMD_NONE and SCAN_PMD_NULL into
 SCAN_NO_PTE_TABLE

The current hugepage collapse scan results include two separate values,
SCAN_PMD_NONE and SCAN_PMD_NULL, which are handled identically by the
consuming code.

To reduce confusion and improve long-term maintenance, this commit merges
these two functionally equivalent states into a single, clearer
identifier: SCAN_NO_PTE_TABLE

Link: https://lkml.kernel.org/r/20251114030028.7035-4-richard.weiyang@gmail.com
Suggested-by: "David Hildenbrand (Red Hat)" <david@kernel.org>
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Nico Pache <npache@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |  3 +--
 mm/khugepaged.c                    | 23 ++++++++++-------------
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index dd94d14a2427..4cde53b45a85 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -10,8 +10,7 @@
 #define SCAN_STATUS							\
 	EM( SCAN_FAIL,			"failed")			\
 	EM( SCAN_SUCCEED,		"succeeded")			\
-	EM( SCAN_PMD_NULL,		"pmd_null")			\
-	EM( SCAN_PMD_NONE,		"pmd_none")			\
+	EM( SCAN_NO_PTE_TABLE,		"no_pte_table")			\
 	EM( SCAN_PMD_MAPPED,		"page_pmd_mapped")		\
 	EM( SCAN_EXCEED_NONE_PTE,	"exceed_none_pte")		\
 	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2ee5048b764e..40f9d5939aa5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -30,8 +30,7 @@
 enum scan_result {
 	SCAN_FAIL,
 	SCAN_SUCCEED,
-	SCAN_PMD_NULL,
-	SCAN_PMD_NONE,
+	SCAN_NO_PTE_TABLE,
 	SCAN_PMD_MAPPED,
 	SCAN_EXCEED_NONE_PTE,
 	SCAN_EXCEED_SWAP_PTE,
@@ -934,7 +933,7 @@ static inline int check_pmd_state(pmd_t *pmd)
 	pmd_t pmde = pmdp_get_lockless(pmd);
 
 	if (pmd_none(pmde))
-		return SCAN_PMD_NONE;
+		return SCAN_NO_PTE_TABLE;
 
 	/*
 	 * The folio may be under migration when khugepaged is trying to
@@ -944,11 +943,11 @@ static inline int check_pmd_state(pmd_t *pmd)
 	if (pmd_is_migration_entry(pmde))
 		return SCAN_PMD_MAPPED;
 	if (!pmd_present(pmde))
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 	if (pmd_trans_huge(pmde))
 		return SCAN_PMD_MAPPED;
 	if (pmd_bad(pmde))
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 	return SCAN_SUCCEED;
 }
 
@@ -958,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
 {
 	*pmd = mm_find_pmd(mm, address);
 	if (!*pmd)
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 
 	return check_pmd_state(*pmd);
 }
@@ -1013,7 +1012,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 			pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
 			if (!pte) {
 				mmap_read_unlock(mm);
-				result = SCAN_PMD_NULL;
+				result = SCAN_NO_PTE_TABLE;
 				goto out;
 			}
 		}
@@ -1187,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 						      &compound_pagelist);
 		spin_unlock(pte_ptl);
 	} else {
-		result = SCAN_PMD_NULL;
+		result = SCAN_NO_PTE_TABLE;
 	}
 
 	if (unlikely(result != SCAN_SUCCEED)) {
@@ -1270,7 +1269,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	nodes_clear(cc->alloc_nmask);
 	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
 	if (!pte) {
-		result = SCAN_PMD_NULL;
+		result = SCAN_NO_PTE_TABLE;
 		goto out;
 	}
 
@@ -1544,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	switch (result) {
 	case SCAN_SUCCEED:
 		break;
-	case SCAN_PMD_NULL:
-	case SCAN_PMD_NONE:
+	case SCAN_NO_PTE_TABLE:
 		/*
 		 * All pte entries have been removed and pmd cleared.
 		 * Skip all the pte checks and just update the pmd mapping.
@@ -2832,8 +2830,7 @@ handle_result:
 			mmap_read_unlock(mm);
 			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
-		case SCAN_PMD_NULL:
-		case SCAN_PMD_NONE:
+		case SCAN_NO_PTE_TABLE:
 		case SCAN_PTE_NON_PRESENT:
 		case SCAN_PTE_UFFD_WP:
 		case SCAN_LACK_REFERENCED_PAGE:
-- 
cgit v1.2.3


From cab812d9c9642ec11b8961b7ea994f4bd0826159 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs@nvidia.com>
Date: Fri, 14 Nov 2025 12:22:28 +1100
Subject: mm/huge_memory.c: introduce folio_split_unmapped
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unmapped was added as a parameter to __folio_split() and related call
sites to support splitting of folios already in the midst of a migration.
This special case arose for device private folio migration since during
migration there could be a disconnect between source and destination on
the folio size.

Introduce folio_split_unmapped() to handle this special case.  Also
refactor code and add __folio_freeze_and_split_unmapped() helper that is
common to both __folio_split() and folio_split_unmapped().

This in turn removes the special casing introduced by the unmapped
parameter in __folio_split().

[balbirs@nvidia.com: v2]
  Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com
[balbirs@nvidia.com: fix clang-20 build]
  Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com
[akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir]
Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Suggested-by: Zi Yan <ziy@nvidia.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h  |   5 +-
 include/linux/shmem_fs.h |   6 +-
 mm/huge_memory.c         | 348 +++++++++++++++++++++++++++--------------------
 mm/migrate_device.c      |   3 +-
 4 files changed, 211 insertions(+), 151 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e2e91aa1a042..1d439de1ca2c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -371,7 +371,8 @@ enum split_type {
 
 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-		unsigned int new_order, bool unmapped);
+		unsigned int new_order);
+int folio_split_unmapped(struct folio *folio, unsigned int new_order);
 int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
 bool folio_split_supported(struct folio *folio, unsigned int new_order,
@@ -382,7 +383,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
 static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order)
 {
-	return __split_huge_page_to_list_to_order(page, list, new_order, false);
+	return __split_huge_page_to_list_to_order(page, list, new_order);
 }
 static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
 {
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 5b368f9549d6..d02270072a34 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -136,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void)
 
 #ifdef CONFIG_SHMEM
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
+extern void shmem_uncharge(struct inode *inode, long pages);
 #else
 static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
 {
 	return 0;
 }
+
+static inline void shmem_uncharge(struct inode *inode, long pages)
+{
+}
 #endif
 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 						pgoff_t start, pgoff_t end);
@@ -194,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
 }
 
 extern bool shmem_charge(struct inode *inode, long pages);
-extern void shmem_uncharge(struct inode *inode, long pages);
 
 #ifdef CONFIG_USERFAULTFD
 #ifdef CONFIG_SHMEM
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d17d3810a882..53a8d380eab2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3739,6 +3739,152 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
 	return true;
 }
 
+static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
+					     struct page *split_at, struct xa_state *xas,
+					     struct address_space *mapping, bool do_lru,
+					     struct list_head *list, enum split_type split_type,
+					     pgoff_t end, int *nr_shmem_dropped, int extra_pins)
+{
+	struct folio *end_folio = folio_next(folio);
+	struct folio *new_folio, *next;
+	int old_order = folio_order(folio);
+	int ret = 0;
+	struct deferred_split *ds_queue;
+
+	VM_WARN_ON_ONCE(!mapping && end);
+	/* Prevent deferred_split_scan() touching ->_refcount */
+	ds_queue = folio_split_queue_lock(folio);
+	if (folio_ref_freeze(folio, 1 + extra_pins)) {
+		struct swap_cluster_info *ci = NULL;
+		struct lruvec *lruvec;
+		int expected_refs;
+
+		if (old_order > 1) {
+			if (!list_empty(&folio->_deferred_list)) {
+				ds_queue->split_queue_len--;
+				/*
+				 * Reinitialize page_deferred_list after removing the
+				 * page from the split_queue, otherwise a subsequent
+				 * split will see list corruption when checking the
+				 * page_deferred_list.
+				 */
+				list_del_init(&folio->_deferred_list);
+			}
+			if (folio_test_partially_mapped(folio)) {
+				folio_clear_partially_mapped(folio);
+				mod_mthp_stat(old_order,
+					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+			}
+		}
+		split_queue_unlock(ds_queue);
+		if (mapping) {
+			int nr = folio_nr_pages(folio);
+
+			if (folio_test_pmd_mappable(folio) &&
+			    new_order < HPAGE_PMD_ORDER) {
+				if (folio_test_swapbacked(folio)) {
+					__lruvec_stat_mod_folio(folio,
+							NR_SHMEM_THPS, -nr);
+				} else {
+					__lruvec_stat_mod_folio(folio,
+							NR_FILE_THPS, -nr);
+					filemap_nr_thps_dec(mapping);
+				}
+			}
+		}
+
+		if (folio_test_swapcache(folio)) {
+			if (mapping) {
+				VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+				return -EINVAL;
+			}
+
+			ci = swap_cluster_get_and_lock(folio);
+		}
+
+		/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+		if (do_lru)
+			lruvec = folio_lruvec_lock(folio);
+
+		ret = __split_unmapped_folio(folio, new_order, split_at, xas,
+					     mapping, split_type);
+
+		/*
+		 * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozon until page cache
+		 * entries are updated with all the other after-split folios
+		 * to prevent others seeing stale page cache entries.
+		 * As a result, new_folio starts from the next folio of
+		 * @folio.
+		 */
+		for (new_folio = folio_next(folio); new_folio != end_folio;
+		     new_folio = next) {
+			unsigned long nr_pages = folio_nr_pages(new_folio);
+
+			next = folio_next(new_folio);
+
+			zone_device_private_split_cb(folio, new_folio);
+
+			expected_refs = folio_expected_ref_count(new_folio) + 1;
+			folio_ref_unfreeze(new_folio, expected_refs);
+
+			if (do_lru)
+				lru_add_split_folio(folio, new_folio, lruvec, list);
+
+			/*
+			 * Anonymous folio with swap cache.
+			 * NOTE: shmem in swap cache is not supported yet.
+			 */
+			if (ci) {
+				__swap_cache_replace_folio(ci, folio, new_folio);
+				continue;
+			}
+
+			/* Anonymous folio without swap cache */
+			if (!mapping)
+				continue;
+
+			/* Add the new folio to the page cache. */
+			if (new_folio->index < end) {
+				__xa_store(&mapping->i_pages, new_folio->index,
+					   new_folio, 0);
+				continue;
+			}
+
+			VM_WARN_ON_ONCE(!nr_shmem_dropped);
+			/* Drop folio beyond EOF: ->index >= end */
+			if (shmem_mapping(mapping) && nr_shmem_dropped)
+				*nr_shmem_dropped += nr_pages;
+			else if (folio_test_clear_dirty(new_folio))
+				folio_account_cleaned(
+					new_folio, inode_to_wb(mapping->host));
+			__filemap_remove_folio(new_folio, NULL);
+			folio_put_refs(new_folio, nr_pages);
+		}
+
+		zone_device_private_split_cb(folio, NULL);
+		/*
+		 * Unfreeze @folio only after all page cache entries, which
+		 * used to point to it, have been updated with new folios.
+		 * Otherwise, a parallel folio_try_get() can grab @folio
+		 * and its caller can see stale page cache entries.
+		 */
+		expected_refs = folio_expected_ref_count(folio) + 1;
+		folio_ref_unfreeze(folio, expected_refs);
+
+		if (do_lru)
+			unlock_page_lruvec(lruvec);
+
+		if (ci)
+			swap_cluster_unlock(ci);
+	} else {
+		split_queue_unlock(ds_queue);
+		return -EAGAIN;
+	}
+
+	return ret;
+}
+
 /**
  * __folio_split() - split a folio at @split_at to a @new_order folio
  * @folio: folio to split
@@ -3747,7 +3893,6 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
  * @lock_at: a page within @folio to be left locked to caller
  * @list: after-split folios will be put on it if non NULL
  * @split_type: perform uniform split or not (non-uniform split)
- * @unmapped: The pages are already unmapped, they are migration entries.
  *
  * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
  * It is in charge of checking whether the split is supported or not and
@@ -3763,9 +3908,8 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
  */
 static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
-		struct list_head *list, enum split_type split_type, bool unmapped)
+		struct list_head *list, enum split_type split_type)
 {
-	struct deferred_split *ds_queue;
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
 	struct folio *end_folio = folio_next(folio);
 	bool is_anon = folio_test_anon(folio);
@@ -3776,7 +3920,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	int nr_shmem_dropped = 0;
 	int remap_flags = 0;
 	int extra_pins, ret;
-	pgoff_t end;
+	pgoff_t end = 0;
 	bool is_hzp;
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -3819,14 +3963,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		 * is taken to serialise against parallel split or collapse
 		 * operations.
 		 */
-		if (!unmapped) {
-			anon_vma = folio_get_anon_vma(folio);
-			if (!anon_vma) {
-				ret = -EBUSY;
-				goto out;
-			}
-			anon_vma_lock_write(anon_vma);
+		anon_vma = folio_get_anon_vma(folio);
+		if (!anon_vma) {
+			ret = -EBUSY;
+			goto out;
 		}
+		anon_vma_lock_write(anon_vma);
 		mapping = NULL;
 	} else {
 		unsigned int min_order;
@@ -3880,8 +4022,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		goto out_unlock;
 	}
 
-	if (!unmapped)
-		unmap_folio(folio);
+	unmap_folio(folio);
 
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
@@ -3898,142 +4039,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		}
 	}
 
-	/* Prevent deferred_split_scan() touching ->_refcount */
-	ds_queue = folio_split_queue_lock(folio);
-	if (folio_ref_freeze(folio, 1 + extra_pins)) {
-		struct swap_cluster_info *ci = NULL;
-		struct lruvec *lruvec;
-		int expected_refs;
-
-		if (old_order > 1) {
-			if (!list_empty(&folio->_deferred_list)) {
-				ds_queue->split_queue_len--;
-				/*
-				 * Reinitialize page_deferred_list after removing the
-				 * page from the split_queue, otherwise a subsequent
-				 * split will see list corruption when checking the
-				 * page_deferred_list.
-				 */
-				list_del_init(&folio->_deferred_list);
-			}
-			if (folio_test_partially_mapped(folio)) {
-				folio_clear_partially_mapped(folio);
-				mod_mthp_stat(old_order,
-					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
-			}
-		}
-		split_queue_unlock(ds_queue);
-		if (mapping) {
-			int nr = folio_nr_pages(folio);
-
-			if (folio_test_pmd_mappable(folio) &&
-			    new_order < HPAGE_PMD_ORDER) {
-				if (folio_test_swapbacked(folio)) {
-					__lruvec_stat_mod_folio(folio,
-							NR_SHMEM_THPS, -nr);
-				} else {
-					__lruvec_stat_mod_folio(folio,
-							NR_FILE_THPS, -nr);
-					filemap_nr_thps_dec(mapping);
-				}
-			}
-		}
-
-		if (folio_test_swapcache(folio)) {
-			if (mapping) {
-				VM_WARN_ON_ONCE_FOLIO(mapping, folio);
-				ret = -EINVAL;
-				goto fail;
-			}
-
-			ci = swap_cluster_get_and_lock(folio);
-		}
-
-		/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
-		lruvec = folio_lruvec_lock(folio);
-
-		ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
-					     mapping, split_type);
-
-		/*
-		 * Unfreeze after-split folios and put them back to the right
-		 * list. @folio should be kept frozon until page cache
-		 * entries are updated with all the other after-split folios
-		 * to prevent others seeing stale page cache entries.
-		 * As a result, new_folio starts from the next folio of
-		 * @folio.
-		 */
-		for (new_folio = folio_next(folio); new_folio != end_folio;
-		     new_folio = next) {
-			unsigned long nr_pages = folio_nr_pages(new_folio);
-
-			next = folio_next(new_folio);
-
-			zone_device_private_split_cb(folio, new_folio);
-
-			expected_refs = folio_expected_ref_count(new_folio) + 1;
-			folio_ref_unfreeze(new_folio, expected_refs);
-
-			if (!unmapped)
-				lru_add_split_folio(folio, new_folio, lruvec, list);
-
-			/*
-			 * Anonymous folio with swap cache.
-			 * NOTE: shmem in swap cache is not supported yet.
-			 */
-			if (ci) {
-				__swap_cache_replace_folio(ci, folio, new_folio);
-				continue;
-			}
-
-			/* Anonymous folio without swap cache */
-			if (!mapping)
-				continue;
-
-			/* Add the new folio to the page cache. */
-			if (new_folio->index < end) {
-				__xa_store(&mapping->i_pages, new_folio->index,
-					   new_folio, 0);
-				continue;
-			}
-
-			/* Drop folio beyond EOF: ->index >= end */
-			if (shmem_mapping(mapping))
-				nr_shmem_dropped += nr_pages;
-			else if (folio_test_clear_dirty(new_folio))
-				folio_account_cleaned(
-					new_folio, inode_to_wb(mapping->host));
-			__filemap_remove_folio(new_folio, NULL);
-			folio_put_refs(new_folio, nr_pages);
-		}
-
-		zone_device_private_split_cb(folio, NULL);
-		/*
-		 * Unfreeze @folio only after all page cache entries, which
-		 * used to point to it, have been updated with new folios.
-		 * Otherwise, a parallel folio_try_get() can grab @folio
-		 * and its caller can see stale page cache entries.
-		 */
-		expected_refs = folio_expected_ref_count(folio) + 1;
-		folio_ref_unfreeze(folio, expected_refs);
-
-		unlock_page_lruvec(lruvec);
-
-		if (ci)
-			swap_cluster_unlock(ci);
-	} else {
-		split_queue_unlock(ds_queue);
-		ret = -EAGAIN;
-	}
+	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
+						true, list, split_type, end, &nr_shmem_dropped,
+						extra_pins);
 fail:
 	if (mapping)
 		xas_unlock(&xas);
 
 	local_irq_enable();
 
-	if (unmapped)
-		return ret;
-
 	if (nr_shmem_dropped)
 		shmem_uncharge(mapping->host, nr_shmem_dropped);
 
@@ -4077,6 +4091,48 @@ out:
 	return ret;
 }
 
+/**
+ * folio_split_unmapped() - split a large anon folio that is already unmapped
+ * @folio: folio to split
+ * @new_order: the order of folios after split
+ *
+ * This function is a helper for splitting folios that have already been
+ * unmapped. The use case is that the device or the CPU can refuse to migrate
+ * THP pages in the middle of migration, due to allocation issues on either
+ * side.
+ *
+ * anon_vma_lock is not required to be held, mmap_read_lock() or
+ * mmap_write_lock() should be held. @folio is expected to be locked by the
+ * caller. device-private and non device-private folios are supported along
+ * with folios that are in the swapcache. @folio should also be unmapped and
+ * isolated from LRU (if applicable)
+ *
+ * Upon return, the folio is not remapped, split folios are not added to LRU,
+ * free_folio_and_swap_cache() is not called, and new folios remain locked.
+ *
+ * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to
+ *         insufficient reference count or extra pins).
+ */
+int folio_split_unmapped(struct folio *folio, unsigned int new_order)
+{
+	int extra_pins, ret = 0;
+
+	VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
+
+	if (!can_split_folio(folio, 1, &extra_pins))
+		return -EAGAIN;
+
+	local_irq_disable();
+	ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
+						NULL, false, NULL, SPLIT_TYPE_UNIFORM,
+						0, NULL, extra_pins);
+	local_irq_enable();
+	return ret;
+}
+
 /*
  * This function splits a large folio into smaller folios of order @new_order.
  * @page can point to any page of the large folio to split. The split operation
@@ -4125,12 +4181,12 @@ out:
  * with the folio. Splitting to order 0 is compatible with all folios.
  */
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-				     unsigned int new_order, bool unmapped)
+				     unsigned int new_order)
 {
 	struct folio *folio = page_folio(page);
 
 	return __folio_split(folio, new_order, &folio->page, page, list,
-			     SPLIT_TYPE_UNIFORM, unmapped);
+			     SPLIT_TYPE_UNIFORM);
 }
 
 /**
@@ -4161,7 +4217,7 @@ int folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct list_head *list)
 {
 	return __folio_split(folio, new_order, split_at, &folio->page, list,
-			     SPLIT_TYPE_NON_UNIFORM, false);
+			     SPLIT_TYPE_NON_UNIFORM);
 }
 
 int min_order_for_split(struct folio *folio)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index b1ce6e3478d6..23379663b1e1 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -916,8 +916,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
 
 	folio_get(folio);
 	split_huge_pmd_address(migrate->vma, addr, true);
-	ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL,
-							0, true);
+	ret = folio_split_unmapped(folio, 0);
 	if (ret)
 		return ret;
 	migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
-- 
cgit v1.2.3


From 7e44d00a13ca5691caf4f7c46541ee60bf75b208 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Mon, 10 Nov 2025 15:20:05 -0800
Subject: memcg: use mod_node_page_state to update stats

Patch series "memcg: cleanup the memcg stats interfaces".

The memcg stats are safe against irq (and nmi) context and thus do not
require disabling irqs.  However, for some stats which are also maintained
at node level, the code uses an irq-unsafe interface, thus requiring the
users to still disable irqs or use interfaces which explicitly disable
irqs.  Let's move memcg code to use the irq-safe node level stats function
which is already optimized for architectures with HAVE_CMPXCHG_LOCAL (all
major ones), so there will not be any performance penalty for its usage.


This patch (of 4):

The memcg stats are safe against irq (and nmi) context and thus do not
require disabling irqs.  However, some code paths for memcg stats also
update the node level stats and use an irq-unsafe interface, and thus
require the users to disable irqs.  However, node level stats, on
architectures with HAVE_CMPXCHG_LOCAL (all major ones), have an interface
which does not require irq disabling.  Let's move memcg stats code to
start using that interface for node level stats.

Link: https://lkml.kernel.org/r/20251110232008.1352063-1-shakeel.butt@linux.dev
Link: https://lkml.kernel.org/r/20251110232008.1352063-2-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 include/linux/vmstat.h     | 4 ++--
 mm/memcontrol.c            | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc6db20d7dca..1085d0460e66 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 {
 	struct page *page = virt_to_head_page(p);
 
-	__mod_node_page_state(page_pgdat(page), idx, val);
+	mod_node_page_state(page_pgdat(page), idx, val);
 }
 
 static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c287998908bf..11a37aaa4dd9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page,
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
 				      enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
@@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 static inline void __lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(folio_pgdat(folio), idx, val);
+	mod_node_page_state(folio_pgdat(folio), idx, val);
 }
 
 static inline void lruvec_stat_mod_folio(struct folio *folio,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 623446821b00..7e6407b8bfb7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -770,7 +770,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val)
 {
 	/* Update node */
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 
 	/* Update memcg and lruvec */
 	if (!mem_cgroup_disabled())
@@ -789,7 +789,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
-		__mod_node_page_state(pgdat, idx, val);
+		mod_node_page_state(pgdat, idx, val);
 		return;
 	}
 
@@ -815,7 +815,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 	 * vmstats to keep it correct for the root memcg.
 	 */
 	if (!memcg) {
-		__mod_node_page_state(pgdat, idx, val);
+		mod_node_page_state(pgdat, idx, val);
 	} else {
 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
 		__mod_lruvec_state(lruvec, idx, val);
-- 
cgit v1.2.3


From 469241fe7657dbec9e2948287ab7412955d8b73a Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Mon, 10 Nov 2025 15:20:06 -0800
Subject: memcg: remove __mod_lruvec_kmem_state

__mod_lruvec_kmem_state() is already safe against irqs, so there is no
need to have a separate interface (i.e.  mod_lruvec_kmem_state) which
wraps calls to it with irq disabling and reenabling.  Let's rename
__mod_lruvec_kmem_state() to mod_lruvec_kmem_state().

Link: https://lkml.kernel.org/r/20251110232008.1352063-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 28 +++++-----------------------
 mm/memcontrol.c            |  2 +-
 mm/workingset.c            |  2 +-
 3 files changed, 7 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1085d0460e66..d35390f9892a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -957,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
 
-void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
-
-static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
-					 int val)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_lruvec_kmem_state(p, idx, val);
-	local_irq_restore(flags);
-}
+void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
 
 void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			unsigned long count);
@@ -1403,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 {
 }
 
-static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
-					   int val)
-{
-	struct page *page = virt_to_head_page(p);
-
-	mod_node_page_state(page_pgdat(page), idx, val);
-}
-
 static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 					 int val)
 {
@@ -1470,14 +1452,14 @@ struct slabobj_ext {
 #endif
 } __aligned(8);
 
-static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
+static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
-	__mod_lruvec_kmem_state(p, idx, 1);
+	mod_lruvec_kmem_state(p, idx, 1);
 }
 
-static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
+static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
-	__mod_lruvec_kmem_state(p, idx, -1);
+	mod_lruvec_kmem_state(p, idx, -1);
 }
 
 static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e6407b8bfb7..ae154f51931e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -799,7 +799,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 }
 EXPORT_SYMBOL(__lruvec_stat_mod_folio);
 
-void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
+void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 {
 	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 	struct mem_cgroup *memcg;
diff --git a/mm/workingset.c b/mm/workingset.c
index 68a76a91111f..6ff30369b758 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->count != node->nr_values))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
-	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+	inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
 
 out_invalid:
 	xa_unlock_irq(&mapping->i_pages);
-- 
cgit v1.2.3


From 5b3eb779a20cf30d74bb346d2a1e525bc9072685 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Mon, 10 Nov 2025 15:20:07 -0800
Subject: memcg: remove __mod_lruvec_state

__mod_lruvec_state() is already safe against irqs, so there is no need to
have a separate interface (i.e.  mod_lruvec_state) which wraps calls to it
with irq disabling and reenabling.  Let's rename __mod_lruvec_state() to
mod_lruvec_state().

Link: https://lkml.kernel.org/r/20251110232008.1352063-4-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h |  2 +-
 include/linux/vmstat.h    | 18 +-----------------
 mm/memcontrol.c           |  8 ++++----
 mm/migrate.c              | 20 ++++++++++----------
 mm/vmscan.c               |  4 ++--
 5 files changed, 18 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ca7a18351797..b58f34c4fe92 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 	lockdep_assert_held(&lruvec->lru_lock);
 	WARN_ON_ONCE(nr_pages != (int)nr_pages);
 
-	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+	mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
 }
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 11a37aaa4dd9..4eb7753e6e5c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -520,19 +520,9 @@ static inline const char *vm_event_name(enum vm_event_item item)
 
 #ifdef CONFIG_MEMCG
 
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val);
 
-static inline void mod_lruvec_state(struct lruvec *lruvec,
-				    enum node_stat_item idx, int val)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_lruvec_state(lruvec, idx, val);
-	local_irq_restore(flags);
-}
-
 void __lruvec_stat_mod_folio(struct folio *folio,
 			     enum node_stat_item idx, int val);
 
@@ -554,12 +544,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #else
 
-static inline void __mod_lruvec_state(struct lruvec *lruvec,
-				      enum node_stat_item idx, int val)
-{
-	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-}
-
 static inline void mod_lruvec_state(struct lruvec *lruvec,
 				    enum node_stat_item idx, int val)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae154f51931e..9a659f16af77 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -757,7 +757,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
 }
 
 /**
- * __mod_lruvec_state - update lruvec memory statistics
+ * mod_lruvec_state - update lruvec memory statistics
  * @lruvec: the lruvec
  * @idx: the stat item
  * @val: delta to add to the counter, can be negative
@@ -766,7 +766,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
  * function updates the all three counters that are affected by a
  * change of state at this level: per-node, per-cgroup, per-lruvec.
  */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val)
 {
 	/* Update node */
@@ -794,7 +794,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 	}
 
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-	__mod_lruvec_state(lruvec, idx, val);
+	mod_lruvec_state(lruvec, idx, val);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(__lruvec_stat_mod_folio);
@@ -818,7 +818,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 		mod_node_page_state(pgdat, idx, val);
 	} else {
 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
-		__mod_lruvec_state(lruvec, idx, val);
+		mod_lruvec_state(lruvec, idx, val);
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index b2ad78bf85d5..5169f9717f60 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -675,27 +675,27 @@ static int __folio_migrate_mapping(struct address_space *mapping,
 		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
 		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
 
-		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
-		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
+		mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+		mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
 		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
-			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
-			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+			mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+			mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
 
 			if (folio_test_pmd_mappable(folio)) {
-				__mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
-				__mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+				mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+				mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
 			}
 		}
 #ifdef CONFIG_SWAP
 		if (folio_test_swapcache(folio)) {
-			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
-			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+			mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+			mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
 		}
 #endif
 		if (dirty && mapping_can_writeback(mapping)) {
-			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+			mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
-			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+			mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
 			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
 		}
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 51ffd32e6e01..720772baf2a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2018,7 +2018,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
 
-	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+	mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
 					stat.nr_demoted);
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
@@ -4744,7 +4744,7 @@ retry:
 		reset_batch_size(walk);
 	}
 
-	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+	mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
 					stat.nr_demoted);
 
 	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
-- 
cgit v1.2.3


From c1bd09994c4d5b897571671bed16581335e93242 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Mon, 10 Nov 2025 15:20:08 -0800
Subject: memcg: remove __lruvec_stat_mod_folio

__lruvec_stat_mod_folio() is already safe against irqs, so there is no
need to have a separate interface (i.e.  lruvec_stat_mod_folio) which
wraps calls to it with irq disabling and reenabling.  Let's rename
__lruvec_stat_mod_folio() to lruvec_stat_mod_folio().

Link: https://lkml.kernel.org/r/20251110232008.1352063-5-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 30 +-----------------------------
 mm/filemap.c           | 20 ++++++++++----------
 mm/huge_memory.c       |  4 ++--
 mm/khugepaged.c        |  8 ++++----
 mm/memcontrol.c        |  4 ++--
 mm/page-writeback.c    |  2 +-
 mm/rmap.c              |  4 ++--
 mm/shmem.c             |  6 +++---
 8 files changed, 25 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 4eb7753e6e5c..3398a345bda8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -523,19 +523,9 @@ static inline const char *vm_event_name(enum vm_event_item item)
 void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val);
 
-void __lruvec_stat_mod_folio(struct folio *folio,
+void lruvec_stat_mod_folio(struct folio *folio,
 			     enum node_stat_item idx, int val);
 
-static inline void lruvec_stat_mod_folio(struct folio *folio,
-					 enum node_stat_item idx, int val)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__lruvec_stat_mod_folio(folio, idx, val);
-	local_irq_restore(flags);
-}
-
 static inline void mod_lruvec_page_state(struct page *page,
 					 enum node_stat_item idx, int val)
 {
@@ -550,12 +540,6 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 }
 
-static inline void __lruvec_stat_mod_folio(struct folio *folio,
-					 enum node_stat_item idx, int val)
-{
-	mod_node_page_state(folio_pgdat(folio), idx, val);
-}
-
 static inline void lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
@@ -570,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __lruvec_stat_add_folio(struct folio *folio,
-					   enum node_stat_item idx)
-{
-	__lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
-}
-
-static inline void __lruvec_stat_sub_folio(struct folio *folio,
-					   enum node_stat_item idx)
-{
-	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
-}
-
 static inline void lruvec_stat_add_folio(struct folio *folio,
 					 enum node_stat_item idx)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 07634b7d9934..7d15a9c216ef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -182,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping,
 
 	nr = folio_nr_pages(folio);
 
-	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
 	if (folio_test_swapbacked(folio)) {
-		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+		lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
 		if (folio_test_pmd_mappable(folio))
-			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+			lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
 	} else if (folio_test_pmd_mappable(folio)) {
-		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+		lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
 	}
 	if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
@@ -844,13 +844,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
 	old->mapping = NULL;
 	/* hugetlb pages do not participate in page cache accounting. */
 	if (!folio_test_hugetlb(old))
-		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+		lruvec_stat_sub_folio(old, NR_FILE_PAGES);
 	if (!folio_test_hugetlb(new))
-		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
+		lruvec_stat_add_folio(new, NR_FILE_PAGES);
 	if (folio_test_swapbacked(old))
-		__lruvec_stat_sub_folio(old, NR_SHMEM);
+		lruvec_stat_sub_folio(old, NR_SHMEM);
 	if (folio_test_swapbacked(new))
-		__lruvec_stat_add_folio(new, NR_SHMEM);
+		lruvec_stat_add_folio(new, NR_SHMEM);
 	xas_unlock_irq(&xas);
 	if (free_folio)
 		free_folio(old);
@@ -933,9 +933,9 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 
 		/* hugetlb pages do not participate in page cache accounting */
 		if (!huge) {
-			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+			lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
 			if (folio_test_pmd_mappable(folio))
-				__lruvec_stat_mod_folio(folio,
+				lruvec_stat_mod_folio(folio,
 						NR_FILE_THPS, nr);
 		}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 53a8d380eab2..7af3e037d891 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3783,10 +3783,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 			if (folio_test_pmd_mappable(folio) &&
 			    new_order < HPAGE_PMD_ORDER) {
 				if (folio_test_swapbacked(folio)) {
-					__lruvec_stat_mod_folio(folio,
+					lruvec_stat_mod_folio(folio,
 							NR_SHMEM_THPS, -nr);
 				} else {
-					__lruvec_stat_mod_folio(folio,
+					lruvec_stat_mod_folio(folio,
 							NR_FILE_THPS, -nr);
 					filemap_nr_thps_dec(mapping);
 				}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 40f9d5939aa5..89c33ef7aac3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2195,14 +2195,14 @@ immap_locked:
 	}
 
 	if (is_shmem)
-		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
+		lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
 	else
-		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
+		lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
 
 	if (nr_none) {
-		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
+		lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
 		/* nr_none is always 0 for non-shmem. */
-		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
+		lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
 	}
 
 	/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a659f16af77..9b07db2cb232 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -777,7 +777,7 @@ void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 		mod_memcg_lruvec_state(lruvec, idx, val);
 }
 
-void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
+void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 			     int val)
 {
 	struct mem_cgroup *memcg;
@@ -797,7 +797,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 	mod_lruvec_state(lruvec, idx, val);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(__lruvec_stat_mod_folio);
+EXPORT_SYMBOL(lruvec_stat_mod_folio);
 
 void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 757bc4d3b5b5..d6b339cc876d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2658,7 +2658,7 @@ static void folio_account_dirtied(struct folio *folio,
 		inode_attach_wb(inode, folio);
 		wb = inode_to_wb(inode);
 
-		__lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+		lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
 		__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
 		__node_stat_mod_folio(folio, NR_DIRTIED, nr);
 		wb_stat_mod(wb, WB_RECLAIMABLE, nr);
diff --git a/mm/rmap.c b/mm/rmap.c
index d871f2eb821c..f955f02d570e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1212,12 +1212,12 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
 
 	if (nr) {
 		idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
-		__lruvec_stat_mod_folio(folio, idx, nr);
+		lruvec_stat_mod_folio(folio, idx, nr);
 	}
 	if (nr_pmdmapped) {
 		if (folio_test_anon(folio)) {
 			idx = NR_ANON_THPS;
-			__lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+			lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
 		} else {
 			/* NR_*_PMDMAPPED are not maintained per-memcg */
 			idx = folio_test_swapbacked(folio) ?
diff --git a/mm/shmem.c b/mm/shmem.c
index fc835b3e4914..ad18172ff831 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -871,9 +871,9 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
 static void shmem_update_stats(struct folio *folio, int nr_pages)
 {
 	if (folio_test_pmd_mappable(folio))
-		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
-	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
-	__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+		lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+	lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
 }
 
 /*
-- 
cgit v1.2.3


From 277a1ae3879a82a15a2e2d6741e38e31ea6487ee Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Date: Thu, 13 Nov 2025 15:28:01 +0800
Subject: mm: softdirty: add pgtable_supports_soft_dirty()

Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15.

This patchset adds support for the Svrsw60t59b [1] extension, which is now
ratified, and also adds soft-dirty and userfaultfd write-protect tracking
for RISC-V.

Patches 1 and 2 add macros that allow architectures to define their own
checks for whether the soft-dirty / uffd_wp PTE bits are available; for
RISC-V, this means checking whether the Svrsw60t59b extension is supported
on the device the kernel is running on.  Patches 1-2 also remove "ifdef
CONFIG_MEM_SOFT_DIRTY", "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef
CONFIG_PTE_MARKER_UFFD_WP" in favor of these checks.  If the checks are not
overridden by the architecture, no change in behavior is expected.

This patchset has been tested with kselftest mm suite in which soft-dirty,
madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run and pass, and
no regressions are observed in any of the other tests.


This patch (of 6):

Some platforms can customize the PTE/PMD entry soft-dirty bit, making it
unavailable even if the architecture provides the resource.

Add an API for which architectures can provide their own implementations,
to detect whether the soft-dirty bit is available on the device the kernel
is running on.

This patch removes "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of
pgtable_supports_soft_dirty() checks, which default to
IS_ENABLED(CONFIG_MEM_SOFT_DIRTY).  If not overridden by the architecture,
no change in behavior is expected.

We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(),
so we will never run into VM_SOFTDIRTY checks.

[lorenzo.stoakes@oracle.com: fix VMA selftests]
  Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local
Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn
Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn
Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1]
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor@kernel.org>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c               | 15 ++++++---------
 include/linux/mm.h               |  3 +++
 include/linux/pgtable.h          | 12 ++++++++++++
 mm/debug_vm_pgtable.c            | 10 +++++-----
 mm/huge_memory.c                 | 13 +++++++------
 mm/internal.h                    |  2 +-
 mm/mmap.c                        |  6 ++++--
 mm/mremap.c                      | 13 +++++++------
 mm/userfaultfd.c                 | 10 ++++------
 mm/vma.c                         |  6 ++++--
 mm/vma_exec.c                    |  5 ++++-
 tools/testing/vma/vma_internal.h |  2 ++
 12 files changed, 59 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 41b062ce6ad8..2b4ab5718ab5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1584,8 +1584,6 @@ struct clear_refs_private {
 	enum clear_refs_types type;
 };
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-
 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
 	struct folio *folio;
@@ -1605,6 +1603,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
 static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *pte)
 {
+	if (!pgtable_supports_soft_dirty())
+		return;
 	/*
 	 * The soft-dirty tracker uses #PF-s to catch writes
 	 * to pages, so write-protect the pte as well. See the
@@ -1630,19 +1630,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
 }
-#else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
-{
-}
-#endif
 
-#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old, pmd = *pmdp;
 
+	if (!pgtable_supports_soft_dirty())
+		return;
+
 	if (pmd_present(pmd)) {
 		/* See comment in change_huge_pmd() */
 		old = pmdp_invalidate(vma, addr, pmdp);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf660d5b6e97..75f894c3f521 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -859,6 +859,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 static inline void vm_flags_init(struct vm_area_struct *vma,
 				 vm_flags_t flags)
 {
+	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
 	ACCESS_PRIVATE(vma, __vm_flags) = flags;
 }
 
@@ -870,6 +871,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 static inline void vm_flags_reset(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
+	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
 	vma_assert_write_locked(vma);
 	vm_flags_init(vma, flags);
 }
@@ -891,6 +893,7 @@ static inline void vm_flags_set(struct vm_area_struct *vma,
 static inline void vm_flags_clear(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
+	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
 	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
 }
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 32e8457ad535..b13b6f42be3c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1553,6 +1553,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 #define arch_start_context_switch(prev)	do {} while (0)
 #endif
 
+/*
+ * Some platforms can customize the PTE soft-dirty bit making it unavailable
+ * even if the architecture provides the resource.
+ * Adding this API allows architectures to add their own checks for the
+ * devices on which the kernel is running.
+ * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY
+ * is part of this macro.
+ */
+#ifndef pgtable_supports_soft_dirty
+#define pgtable_supports_soft_dirty()	IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)
+#endif
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
 static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1eae87dbef73..ae9b9310d96f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -704,7 +704,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
 
-	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+	if (!pgtable_supports_soft_dirty())
 		return;
 
 	pr_debug("Validating PTE soft dirty\n");
@@ -717,7 +717,7 @@ static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
 	pte_t pte;
 	softleaf_t entry;
 
-	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+	if (!pgtable_supports_soft_dirty())
 		return;
 
 	pr_debug("Validating PTE swap soft dirty\n");
@@ -734,7 +734,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pmd_t pmd;
 
-	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+	if (!pgtable_supports_soft_dirty())
 		return;
 
 	if (!has_transparent_hugepage())
@@ -750,8 +750,8 @@ static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pmd_t pmd;
 
-	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
-		!IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+	if (!pgtable_supports_soft_dirty() ||
+	    !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
 		return;
 
 	if (!has_transparent_hugepage())
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7af3e037d891..041b554c7115 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2427,12 +2427,13 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
 
 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
 {
-#ifdef CONFIG_MEM_SOFT_DIRTY
-	if (unlikely(pmd_is_migration_entry(pmd)))
-		pmd = pmd_swp_mksoft_dirty(pmd);
-	else if (pmd_present(pmd))
-		pmd = pmd_mksoft_dirty(pmd);
-#endif
+	if (pgtable_supports_soft_dirty()) {
+		if (unlikely(pmd_is_migration_entry(pmd)))
+			pmd = pmd_swp_mksoft_dirty(pmd);
+		else if (pmd_present(pmd))
+			pmd = pmd_mksoft_dirty(pmd);
+	}
+
 	return pmd;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 929bc4a5dd98..04c307ee33ae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1554,7 +1554,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
 	 * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
 	 * will be constantly true.
 	 */
-	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+	if (!pgtable_supports_soft_dirty())
 		return false;
 
 	/*
diff --git a/mm/mmap.c b/mm/mmap.c
index dc51680824ec..4bdb9ffa9e25 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1448,8 +1448,10 @@ static struct vm_area_struct *__install_special_mapping(
 		return ERR_PTR(-ENOMEM);
 
 	vma_set_range(vma, addr, addr + len, 0);
-	vm_flags_init(vma, (vm_flags | mm->def_flags |
-		      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
+	vm_flags |= mm->def_flags | VM_DONTEXPAND;
+	if (pgtable_supports_soft_dirty())
+		vm_flags |= VM_SOFTDIRTY;
+	vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK);
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	vma->vm_ops = ops;
diff --git a/mm/mremap.c b/mm/mremap.c
index fdb0485ede74..672264807db6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -165,12 +165,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 	 * Set soft dirty bit so we can notice
 	 * in userspace the ptes were moved.
 	 */
-#ifdef CONFIG_MEM_SOFT_DIRTY
-	if (pte_present(pte))
-		pte = pte_mksoft_dirty(pte);
-	else
-		pte = pte_swp_mksoft_dirty(pte);
-#endif
+	if (pgtable_supports_soft_dirty()) {
+		if (pte_present(pte))
+			pte = pte_mksoft_dirty(pte);
+		else
+			pte = pte_swp_mksoft_dirty(pte);
+	}
+
 	return pte;
 }
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bd1f74a7a5ac..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1119,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm,
 
 		orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
 		/* Set soft dirty bit so userspace can notice the pte was moved */
-#ifdef CONFIG_MEM_SOFT_DIRTY
-		orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
-#endif
+		if (pgtable_supports_soft_dirty())
+			orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
 		if (pte_dirty(orig_src_pte))
 			orig_dst_pte = pte_mkdirty(orig_dst_pte);
 		orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
@@ -1208,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
 	}
 
 	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
-#ifdef CONFIG_MEM_SOFT_DIRTY
-	orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
-#endif
+	if (pgtable_supports_soft_dirty())
+		orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
 	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
 	double_pt_unlock(dst_ptl, src_ptl);
 
diff --git a/mm/vma.c b/mm/vma.c
index 4e21c988054d..fc90befd162f 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2559,7 +2559,8 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
 	 * then new mapped in-place (which must be aimed as
 	 * a completely new data area).
 	 */
-	vm_flags_set(vma, VM_SOFTDIRTY);
+	if (pgtable_supports_soft_dirty())
+		vm_flags_set(vma, VM_SOFTDIRTY);
 
 	vma_set_page_prot(vma);
 }
@@ -2864,7 +2865,8 @@ out:
 	mm->data_vm += len >> PAGE_SHIFT;
 	if (vm_flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
-	vm_flags_set(vma, VM_SOFTDIRTY);
+	if (pgtable_supports_soft_dirty())
+		vm_flags_set(vma, VM_SOFTDIRTY);
 	return 0;
 
 mas_store_fail:
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
index 922ee51747a6..8134e1afca68 100644
--- a/mm/vma_exec.c
+++ b/mm/vma_exec.c
@@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
 int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
 			  unsigned long *top_mem_p)
 {
+	unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	int err;
 	struct vm_area_struct *vma = vm_area_alloc(mm);
 
@@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
 	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+	if (pgtable_supports_soft_dirty())
+		flags |= VM_SOFTDIRTY;
+	vm_flags_init(vma, flags);
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	err = insert_vm_struct(mm, vma);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 81b501f51948..be99056c5d56 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -212,6 +212,8 @@ typedef __bitwise unsigned int vm_fault_t;
 
 #define ASSERT_EXCLUSIVE_WRITER(x)
 
+#define pgtable_supports_soft_dirty() 1
+
 /**
  * swap - swap values of @a and @b
  * @a: first value
-- 
cgit v1.2.3


From f59c0924d61aa2a2bb85936a593140f327112787 Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Date: Thu, 13 Nov 2025 15:28:02 +0800
Subject: mm: userfaultfd: add pgtable_supports_uffd_wp()

Some platforms can customize the PTE/PMD entry uffd-wp bit making it
unavailable even if the architecture provides the resource.  This patch
adds a macro API pgtable_supports_uffd_wp() that allows architectures to
define their specific implementations to check whether the uffd-wp bit is
available on the device on which the kernel is running.

This patch also removes "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and
"ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of pgtable_supports_uffd_wp()
and uffd_supports_wp_marker() checks respectively.  These default to
IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) and
"IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) &&
IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP)" if not overridden by the
architecture, so no change in behavior is expected.

Link: https://lkml.kernel.org/r/20251113072806.795029-3-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c                   | 22 ++++++------
 include/asm-generic/pgtable_uffd.h | 17 ++++++++++
 include/linux/mm_inline.h          |  8 +++--
 include/linux/userfaultfd_k.h      | 69 ++++++++++++++++++++++----------------
 mm/memory.c                        |  6 ++--
 5 files changed, 78 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 3f539aabc3b3..5a0d19dec7ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1289,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
-		goto out;
-#endif
+		if (!pgtable_supports_uffd_wp())
+			goto out;
+
 		vm_flags |= VM_UFFD_WP;
 	}
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
@@ -1999,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	uffdio_api.features &=
 		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
 #endif
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
-	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-#endif
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
-	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
-	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
-	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
-#endif
+	if (!pgtable_supports_uffd_wp())
+		uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+
+	if (!uffd_supports_wp_marker()) {
+		uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+		uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+		uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
+	}
 
 	ret = -EINVAL;
 	if (features & ~uffdio_api.features)
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
index 828966d4c281..0d85791efdf7 100644
--- a/include/asm-generic/pgtable_uffd.h
+++ b/include/asm-generic/pgtable_uffd.h
@@ -1,6 +1,23 @@
 #ifndef _ASM_GENERIC_PGTABLE_UFFD_H
 #define _ASM_GENERIC_PGTABLE_UFFD_H
 
+/*
+ * Some platforms can customize the uffd-wp bit, making it unavailable
+ * even if the architecture provides the resource.
+ * Adding this API allows architectures to add their own checks for the
+ * devices on which the kernel is running.
+ * Note: When overriding it, please make sure the
+ * CONFIG_HAVE_ARCH_USERFAULTFD_WP is part of this macro.
+ */
+#ifndef pgtable_supports_uffd_wp
+#define pgtable_supports_uffd_wp()	IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP)
+#endif
+
+static inline bool uffd_supports_wp_marker(void)
+{
+	return pgtable_supports_uffd_wp() && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP);
+}
+
 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
 static __always_inline int pte_uffd_wp(pte_t pte)
 {
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index b58f34c4fe92..fa2d6ba811b5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker(
 
 	return dstm;
 }
-#endif
 
 /*
  * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
@@ -571,9 +570,11 @@ static inline bool
 pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
 			      pte_t *pte, pte_t pteval)
 {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
 	bool arm_uffd_pte = false;
 
+	if (!uffd_supports_wp_marker())
+		return false;
+
 	/* The current status of the pte should be "cleared" before calling */
 	WARN_ON_ONCE(!pte_none(ptep_get(pte)));
 
@@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
 			   make_pte_marker(PTE_MARKER_UFFD_WP));
 		return true;
 	}
-#endif
+
 	return false;
 }
 
@@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma)
 
 	return true;
 }
+#endif
 
 /**
  * num_pages_contiguous() - determine the number of contiguous pages
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 96b089dff4ef..fd5f42765497 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	if (wp_async && (vm_flags == VM_UFFD_WP))
 		return true;
 
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
 	/*
 	 * If user requested uffd-wp but not enabled pte markers for
 	 * uffd-wp, then shmem & hugetlbfs are not supported but only
 	 * anonymous.
 	 */
-	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
+	if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
+	    !vma_is_anonymous(vma))
 		return false;
-#endif
 
 	/* By default, allow any of anon|shmem|hugetlb */
 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
@@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
 void userfaultfd_release_all(struct mm_struct *mm,
 			     struct userfaultfd_ctx *ctx);
 
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+	/* Only wr-protect mode uses pte markers */
+	if (!userfaultfd_wp(vma))
+		return false;
+
+	/* File-based uffd-wp always need markers */
+	if (!vma_is_anonymous(vma))
+		return true;
+
+	/*
+	 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED
+	 * enabled (to apply markers on zero pages).
+	 */
+	return userfaultfd_wp_unpopulated(vma);
+}
+
+/*
+ * Returns true if this is a swap pte and was uffd-wp wr-protected in either
+ * forms (pte marker or a normal swap pte), false otherwise.
+ */
+static inline bool pte_swp_uffd_wp_any(pte_t pte)
+{
+	if (!uffd_supports_wp_marker())
+		return false;
+
+	if (pte_present(pte))
+		return false;
+
+	if (pte_swp_uffd_wp(pte))
+		return true;
+
+	if (pte_is_uffd_wp_marker(pte))
+		return true;
+
+	return false;
+}
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -415,23 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 	return false;
 }
 
-#endif /* CONFIG_USERFAULTFD */
-
 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
 {
-	/* Only wr-protect mode uses pte markers */
-	if (!userfaultfd_wp(vma))
-		return false;
-
-	/* File-based uffd-wp always need markers */
-	if (!vma_is_anonymous(vma))
-		return true;
-
-	/*
-	 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED
-	 * enabled (to apply markers on zero pages).
-	 */
-	return userfaultfd_wp_unpopulated(vma);
+	return false;
 }
 
 /*
@@ -440,16 +462,7 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
  */
 static inline bool pte_swp_uffd_wp_any(pte_t pte)
 {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
-	if (pte_present(pte))
-		return false;
-	if (pte_swp_uffd_wp(pte))
-		return true;
-
-	if (pte_is_uffd_wp_marker(pte))
-		return true;
-#endif
 	return false;
 }
-
+#endif /* CONFIG_USERFAULTFD */
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/mm/memory.c b/mm/memory.c
index 50b93b45b174..6675e87eb7dd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1590,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 {
 	bool was_installed = false;
 
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
+	if (!uffd_supports_wp_marker())
+		return false;
+
 	/* Zap on anonymous always means dropping everything */
 	if (vma_is_anonymous(vma))
 		return false;
@@ -1607,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 		pte++;
 		addr += PAGE_SIZE;
 	}
-#endif
+
 	return was_installed;
 }
 
-- 
cgit v1.2.3


From 31807483d3952059d395c2a73b1fa9625db9b366 Mon Sep 17 00:00:00 2001
From: Xie Yuanbin <xieyuanbin1@huawei.com>
Date: Wed, 19 Nov 2025 17:59:43 +0800
Subject: mm/memory-failure: remove the selection of RAS

commit 97f0b13452198290799f ("tracing: add trace event for
memory-failure") introduces the selection of RAS in memory-failure.  This
commit is just a tracing feature; in reality, there is no dependency
between memory-failure and RAS.  RAS increases the size of the bzImage
image by 8k, which is very valuable for embedded devices.

Move the memory-failure tracing code from ras_event.h to
memory-failure.h and remove the selection of RAS.

Link: https://lkml.kernel.org/r/20251119095943.67125-1-xieyuanbin1@huawei.com
Signed-off-by: Xie Yuanbin <xieyuanbin1@huawei.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                           |  1 +
 include/ras/ras_event.h               | 87 -------------------------------
 include/trace/events/memory-failure.h | 98 +++++++++++++++++++++++++++++++++++
 mm/Kconfig                            |  1 -
 mm/memory-failure.c                   |  5 +-
 5 files changed, 103 insertions(+), 89 deletions(-)
 create mode 100644 include/trace/events/memory-failure.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5ca4caf73021..302c57deffac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11560,6 +11560,7 @@ R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	include/linux/memory-failure.h
+F:	include/trace/events/memory-failure.h
 F:	mm/hwpoison-inject.c
 F:	mm/memory-failure.c
 
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index fecfeb7c8be7..1e5e87020eef 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -12,7 +12,6 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/cper.h>
-#include <linux/mm.h>
 
 /*
  * MCE Extended Error Log trace event
@@ -339,92 +338,6 @@ TRACE_EVENT(aer_event,
 			"Not available")
 );
 #endif /* CONFIG_PCIEAER */
-
-/*
- * memory-failure recovery action result event
- *
- * unsigned long pfn -	Page Frame Number of the corrupted page
- * int type	-	Page types of the corrupted page
- * int result	-	Result of recovery action
- */
-
-#ifdef CONFIG_MEMORY_FAILURE
-#define MF_ACTION_RESULT	\
-	EM ( MF_IGNORED, "Ignored" )	\
-	EM ( MF_FAILED,  "Failed" )	\
-	EM ( MF_DELAYED, "Delayed" )	\
-	EMe ( MF_RECOVERED, "Recovered" )
-
-#define MF_PAGE_TYPE		\
-	EM ( MF_MSG_KERNEL, "reserved kernel page" )			\
-	EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" )	\
-	EM ( MF_MSG_HUGE, "huge page" )					\
-	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
-	EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" )			\
-	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
-	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
-	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
-	EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" )	\
-	EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" )	\
-	EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" )	\
-	EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" )	\
-	EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" )			\
-	EM ( MF_MSG_CLEAN_LRU, "clean LRU page" )			\
-	EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" )	\
-	EM ( MF_MSG_BUDDY, "free buddy page" )				\
-	EM ( MF_MSG_DAX, "dax page" )					\
-	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )			\
-	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )		\
-	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
-	EMe ( MF_MSG_UNKNOWN, "unknown page" )
-
-/*
- * First define the enums in MM_ACTION_RESULT to be exported to userspace
- * via TRACE_DEFINE_ENUM().
- */
-#undef EM
-#undef EMe
-#define EM(a, b) TRACE_DEFINE_ENUM(a);
-#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
-
-MF_ACTION_RESULT
-MF_PAGE_TYPE
-
-/*
- * Now redefine the EM() and EMe() macros to map the enums to the strings
- * that will be printed in the output.
- */
-#undef EM
-#undef EMe
-#define EM(a, b)		{ a, b },
-#define EMe(a, b)	{ a, b }
-
-TRACE_EVENT(memory_failure_event,
-	TP_PROTO(unsigned long pfn,
-		 int type,
-		 int result),
-
-	TP_ARGS(pfn, type, result),
-
-	TP_STRUCT__entry(
-		__field(unsigned long, pfn)
-		__field(int, type)
-		__field(int, result)
-	),
-
-	TP_fast_assign(
-		__entry->pfn	= pfn;
-		__entry->type	= type;
-		__entry->result	= result;
-	),
-
-	TP_printk("pfn %#lx: recovery action for %s: %s",
-		__entry->pfn,
-		__print_symbolic(__entry->type, MF_PAGE_TYPE),
-		__print_symbolic(__entry->result, MF_ACTION_RESULT)
-	)
-);
-#endif /* CONFIG_MEMORY_FAILURE */
 #endif /* _TRACE_HW_EVENT_MC_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/memory-failure.h b/include/trace/events/memory-failure.h
new file mode 100644
index 000000000000..aa57cc8f896b
--- /dev/null
+++ b/include/trace/events/memory-failure.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM memory_failure
+#define TRACE_INCLUDE_FILE memory-failure
+
+#if !defined(_TRACE_MEMORY_FAILURE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MEMORY_FAILURE_H
+
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+
+/*
+ * memory-failure recovery action result event
+ *
+ * unsigned long pfn -	Page Frame Number of the corrupted page
+ * int type	-	Page types of the corrupted page
+ * int result	-	Result of recovery action
+ */
+
+#define MF_ACTION_RESULT	\
+	EM ( MF_IGNORED, "Ignored" )	\
+	EM ( MF_FAILED,  "Failed" )	\
+	EM ( MF_DELAYED, "Delayed" )	\
+	EMe ( MF_RECOVERED, "Recovered" )
+
+#define MF_PAGE_TYPE		\
+	EM ( MF_MSG_KERNEL, "reserved kernel page" )			\
+	EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" )	\
+	EM ( MF_MSG_HUGE, "huge page" )					\
+	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
+	EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" )			\
+	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
+	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
+	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
+	EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" )	\
+	EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" )	\
+	EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" )	\
+	EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" )	\
+	EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" )			\
+	EM ( MF_MSG_CLEAN_LRU, "clean LRU page" )			\
+	EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" )	\
+	EM ( MF_MSG_BUDDY, "free buddy page" )				\
+	EM ( MF_MSG_DAX, "dax page" )					\
+	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )			\
+	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )		\
+	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
+	EMe ( MF_MSG_UNKNOWN, "unknown page" )
+
+/*
+ * First define the enums in MF_ACTION_RESULT to be exported to userspace
+ * via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+MF_ACTION_RESULT
+MF_PAGE_TYPE
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)		{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(memory_failure_event,
+	TP_PROTO(unsigned long pfn,
+		 int type,
+		 int result),
+
+	TP_ARGS(pfn, type, result),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, pfn)
+		__field(int, type)
+		__field(int, result)
+	),
+
+	TP_fast_assign(
+		__entry->pfn	= pfn;
+		__entry->type	= type;
+		__entry->result	= result;
+	),
+
+	TP_printk("pfn %#lx: recovery action for %s: %s",
+		__entry->pfn,
+		__print_symbolic(__entry->type, MF_PAGE_TYPE),
+		__print_symbolic(__entry->result, MF_ACTION_RESULT)
+	)
+);
+#endif /* _TRACE_MEMORY_FAILURE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/Kconfig b/mm/Kconfig
index d548976d0e0a..bd0ea5454af8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -740,7 +740,6 @@ config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
-	select RAS
 	select INTERVAL_TREE
 	help
 	  Enables code to recover from some memory failures on systems
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7f908ad795ad..fbc5a01260c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,9 +61,12 @@
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
 #include <linux/sysctl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/memory-failure.h>
+
 #include "swap.h"
 #include "internal.h"
-#include "ras/ras_event.h"
 
 static int sysctl_memory_failure_early_kill __read_mostly;
 
-- 
cgit v1.2.3


From 348ced3da52b3161f5ceec8868e81973ce48e11d Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry@gourry.net>
Date: Fri, 21 Nov 2025 14:48:59 -0500
Subject: hugetlb: add __read_mostly to sysctl_hugetlb_shm_group

sysctl bits are mostly-read values.

Link: https://lkml.kernel.org/r/20251121194859.265259-2-gourry@gourry.net
Signed-off-by: Gregory Price <gourry@gourry.net>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 457d48ac7bcd..019a1c5281e4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -171,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
 
 struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
 
-extern int sysctl_hugetlb_shm_group;
+extern int sysctl_hugetlb_shm_group __read_mostly;
 extern struct list_head huge_boot_pages[MAX_NUMNODES];
 
 void hugetlb_bootmem_alloc(void);
-- 
cgit v1.2.3


From 48f014356698a3525959a9eb343dc67b5a5c6842 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 24 Nov 2025 17:37:40 +0200
Subject: PCI: Validate pci_rebar_size_supported() input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to Dan Carpenter, smatch detects an issue with the size parameter
given to pci_rebar_size_supported():

  drivers/pci/rebar.c:142 pci_rebar_size_supported()
  error: undefined (user controlled) shift '(((1))) << size'

The problem is this call tree, which uses the 'size' from the user to shift
in BIT() without validating it:

  __resource_resize_store         # takes 'buf' from user sysfs write
    kstrtoul(buf, 0, &size)       # converts to unsigned long
    pci_resize_resource           # truncates to int
      pci_rebar_size_supported    # BIT(size) without validation

There could be similar problems also with pci_resize_resource() parameter
values coming from drivers.

Add 'size' validation to pci_rebar_size_supported().

There seems to be no SZ_128T prior to this so add one to be able to specify
the largest size supported by the kernel (PCIe r7.0 spec already defines
sizes even beyond 128TB but kernel does not yet support them).

The issue looks older than the introduction of pci_rebar_size_supported()
by bb1fabd0d94e ("PCI: Add pci_rebar_size_supported() helper").

It would also be nice to convert 'size' to unsigned everywhere, maybe even
to u8, but that is left as further work.

Fixes: 8bb705e3e79d ("PCI: Add pci_resize_resource() for resizing BARs")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/r/aSA1WiRG3RuhqZMY@stanley.mountain/
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: commit log, add report URL]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20251124153740.2995-1-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/rebar.c   | 3 +++
 include/linux/sizes.h | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
index 7f6dece19138..ecdebdeb2dff 100644
--- a/drivers/pci/rebar.c
+++ b/drivers/pci/rebar.c
@@ -139,6 +139,9 @@ bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size)
 {
 	u64 sizes = pci_rebar_get_possible_sizes(pdev, bar);
 
+	if (size < 0 || size > ilog2(SZ_128T) - ilog2(PCI_REBAR_MIN_SIZE))
+		return false;
+
 	return BIT(size) & sizes;
 }
 EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index 49039494076f..f1f1a055b047 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -67,5 +67,6 @@
 #define SZ_16T				_AC(0x100000000000, ULL)
 #define SZ_32T				_AC(0x200000000000, ULL)
 #define SZ_64T				_AC(0x400000000000, ULL)
+#define SZ_128T				_AC(0x800000000000, ULL)
 
 #endif /* __LINUX_SIZES_H__ */
-- 
cgit v1.2.3


From 4fe5a00ec70717a7f1002d8913ec6143582b3c8e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 15:41:00 +0000
Subject: net: sched: fix TCF_LAYER_TRANSPORT handling in tcf_get_base_ptr()

syzbot reported that tcf_get_base_ptr() can be called while transport
header is not set [1].

Instead of returning a dangling pointer, return NULL.

Fix tcf_get_base_ptr() callers to handle this NULL value.

[1]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 skb_transport_header include/linux/skbuff.h:3071 [inline]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 tcf_get_base_ptr include/net/pkt_cls.h:539 [inline]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 em_nbyte_match+0x2d8/0x3f0 net/sched/em_nbyte.c:43
Modules linked in:
CPU: 1 UID: 0 PID: 6019 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Call Trace:
 <TASK>
  tcf_em_match net/sched/ematch.c:494 [inline]
  __tcf_em_tree_match+0x1ac/0x770 net/sched/ematch.c:520
  tcf_em_tree_match include/net/pkt_cls.h:512 [inline]
  basic_classify+0x115/0x2d0 net/sched/cls_basic.c:50
  tc_classify include/net/tc_wrapper.h:197 [inline]
  __tcf_classify net/sched/cls_api.c:1764 [inline]
  tcf_classify+0x4cf/0x1140 net/sched/cls_api.c:1860
  multiq_classify net/sched/sch_multiq.c:39 [inline]
  multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66
  dev_qdisc_enqueue+0x4e/0x260 net/core/dev.c:4118
  __dev_xmit_skb net/core/dev.c:4214 [inline]
  __dev_queue_xmit+0xe83/0x3b50 net/core/dev.c:4729
  packet_snd net/packet/af_packet.c:3076 [inline]
  packet_sendmsg+0x3e33/0x5080 net/packet/af_packet.c:3108
  sock_sendmsg_nosec net/socket.c:727 [inline]
  __sock_sendmsg+0x21c/0x270 net/socket.c:742
  ____sys_sendmsg+0x505/0x830 net/socket.c:2630

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: syzbot+f3a497f02c389d86ef16@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6920855a.a70a0220.2ea503.0058.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251121154100.1616228-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_cls.h |  2 ++
 net/sched/em_cmp.c    |  5 ++++-
 net/sched/em_nbyte.c  |  2 ++
 net/sched/em_text.c   | 11 +++++++++--
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c64fd896b1f9..99ac747b7906 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
 		case TCF_LAYER_NETWORK:
 			return skb_network_header(skb);
 		case TCF_LAYER_TRANSPORT:
+			if (!skb_transport_header_was_set(skb))
+				break;
 			return skb_transport_header(skb);
 	}
 
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index 64b637f18bc7..48c1bce74f49 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -22,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
 			struct tcf_pkt_info *info)
 {
 	struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
-	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer);
 	u32 val = 0;
 
+	if (!ptr)
+		return 0;
+	ptr += cmp->off;
 	if (!tcf_valid_offset(skb, ptr, cmp->align))
 		return 0;
 
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 4f9f21a05d5e..c65ffa5fff94 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
 	struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
 	unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
 
+	if (!ptr)
+		return 0;
 	ptr += nbyte->hdr.off;
 
 	if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 6b3d0af72c39..692e2be1793e 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
 			 struct tcf_pkt_info *info)
 {
 	struct text_match *tm = EM_TEXT_PRIV(m);
+	unsigned char *ptr;
 	int from, to;
 
-	from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+	ptr = tcf_get_base_ptr(skb, tm->from_layer);
+	if (!ptr)
+		return 0;
+	from = ptr - skb->data;
 	from += tm->from_offset;
 
-	to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+	ptr = tcf_get_base_ptr(skb, tm->to_layer);
+	if (!ptr)
+		return 0;
+	to = ptr - skb->data;
 	to += tm->to_offset;
 
 	return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
-- 
cgit v1.2.3


From 075b19c211dfeea5f27075293ddf8795b78c9bd9 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Nov 2025 18:02:00 +0100
Subject: net: factor-out __sk_charge() helper

Move the code that charges a newly accepted socket to memcg out of
__inet_accept(). MPTCP will soon use it on a per-subflow basis, in
different contexts.

No functional changes intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Geliang Tang <geliang@kernel.org>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-1-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h |  2 ++
 net/core/sock.c    | 18 ++++++++++++++++++
 net/ipv4/af_inet.c | 17 +----------------
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index a5f36ea9d46f..38d48cfe0741 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1631,6 +1631,8 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 	sk_mem_reclaim(sk);
 }
 
+void __sk_charge(struct sock *sk, gfp_t gfp);
+
 #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
 static inline void sk_owner_set(struct sock *sk, struct module *owner)
 {
diff --git a/net/core/sock.c b/net/core/sock.c
index 3b74fc71f51c..b26a6cdc9bcd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3448,6 +3448,24 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+void __sk_charge(struct sock *sk, gfp_t gfp)
+{
+	int amt;
+
+	gfp |= __GFP_NOFAIL;
+	if (mem_cgroup_from_sk(sk)) {
+		/* The socket has not been accepted yet, no need
+		 * to look at sk->sk_wmem_queued.
+		 */
+		amt = sk_mem_pages(sk->sk_forward_alloc +
+				   atomic_read(&sk->sk_rmem_alloc));
+		if (amt)
+			mem_cgroup_sk_charge(sk, amt, gfp);
+	}
+
+	kmem_cache_charge(sk, gfp);
+}
+
 int sk_set_peek_off(struct sock *sk, int val)
 {
 	WRITE_ONCE(sk->sk_peek_off, val);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a31b94ce8968..08d811f11896 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -756,23 +756,8 @@ EXPORT_SYMBOL(inet_stream_connect);
 void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
 {
 	if (mem_cgroup_sockets_enabled) {
-		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
-
 		mem_cgroup_sk_alloc(newsk);
-
-		if (mem_cgroup_from_sk(newsk)) {
-			int amt;
-
-			/* The socket has not been accepted yet, no need
-			 * to look at newsk->sk_wmem_queued.
-			 */
-			amt = sk_mem_pages(newsk->sk_forward_alloc +
-					   atomic_read(&newsk->sk_rmem_alloc));
-			if (amt)
-				mem_cgroup_sk_charge(newsk, amt, gfp);
-		}
-
-		kmem_cache_charge(newsk, gfp);
+		__sk_charge(newsk, GFP_KERNEL);
 	}
 
 	sock_rps_record_flow(newsk);
-- 
cgit v1.2.3


From 73029e73ccd07b64905f441d4f474a9bb91e7027 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 24 Nov 2025 18:27:30 -0800
Subject: x86/cc: Fix enum spelling to fix kernel-doc warnings

Make the enum name in kernel-doc match the code to prevent kernel-doc warnings:

  Warning: include/linux/cc_platform.h:106 Enum value
   'CC_ATTR_GUEST_SEV_SNP' not described in enum 'cc_attr'
  Warning: include/linux/cc_platform.h:106 Excess enum value
   '%CC_ATTR_SEV_SNP' description in 'cc_attr'

Fixes: f742b90e61bb ("x86/mm: Extend cc_attr to include AMD SEV-SNP")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251125022730.3163679-1-rdunlap@infradead.org
---
 include/linux/cc_platform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
index 7fcec025c5e0..559353ad64ac 100644
--- a/include/linux/cc_platform.h
+++ b/include/linux/cc_platform.h
@@ -74,7 +74,7 @@ enum cc_attr {
 	CC_ATTR_GUEST_UNROLL_STRING_IO,
 
 	/**
-	 * @CC_ATTR_SEV_SNP: Guest SNP is active.
+	 * @CC_ATTR_GUEST_SEV_SNP: Guest SNP is active.
 	 *
 	 * The platform/OS is running as a guest/virtual machine and actively
 	 * using AMD SEV-SNP features.
-- 
cgit v1.2.3


From 37d369fa97cc0774ea4eab726d16bcb5fbe3a104 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 23 Nov 2025 22:05:15 +0000
Subject: fs: Add uoff_t

In a recent commit, I inadvertently changed a comparison from being an
unsigned comparison (on 64-bit systems) to being a signed comparison
(which it had always been on 32-bit systems).  This led to a sporadic
fstests failure.

To make sure this comparison is always unsigned, introduce a new type,
uoff_t, which is the unsigned version of loff_t.  Generally file sizes
are restricted to being a signed integer, but in these two places it is
convenient to pass -1 to indicate "up to the end of the file".

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251123220518.1447261-1-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/mm.h                     | 8 ++++----
 include/linux/shmem_fs.h               | 2 +-
 include/linux/types.h                  | 1 +
 include/uapi/asm-generic/posix_types.h | 1 +
 mm/shmem.c                             | 6 +++---
 mm/truncate.c                          | 2 +-
 6 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..2a36d1bcf491 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3495,10 +3495,10 @@ struct vm_unmapped_area_info {
 extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
 
 /* truncate.c */
-extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_inode_pages_range(struct address_space *,
-				       loff_t lstart, loff_t lend);
-extern void truncate_inode_pages_final(struct address_space *);
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+		uoff_t lend);
+void truncate_inode_pages_final(struct address_space *mapping);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..774efe592a9a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
 int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 		struct list_head *folio_list);
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end);
 int shmem_unuse(unsigned int type);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/types.h b/include/linux/types.h
index 6dfdb8e8e4c3..d4437e9c452c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -50,6 +50,7 @@ typedef __kernel_old_gid_t	old_gid_t;
 
 #if defined(__GNUC__)
 typedef __kernel_loff_t		loff_t;
+typedef __kernel_uoff_t		uoff_t;
 #endif
 
 /*
diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h
index b5f7594eee7a..0a90ad92dbf3 100644
--- a/include/uapi/asm-generic/posix_types.h
+++ b/include/uapi/asm-generic/posix_types.h
@@ -86,6 +86,7 @@ typedef struct {
  */
 typedef __kernel_long_t	__kernel_off_t;
 typedef long long	__kernel_loff_t;
+typedef unsigned long long	__kernel_uoff_t;
 typedef __kernel_long_t	__kernel_old_time_t;
 #ifndef __KERNEL__
 typedef __kernel_long_t	__kernel_time_t;
diff --git a/mm/shmem.c b/mm/shmem.c
index c819cecf1ed9..43b41a42c463 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1076,7 +1076,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
 								 bool unfalloc)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -1227,7 +1227,7 @@ whole_folios:
 	shmem_recalc_inode(inode, 0, -nr_swaps_freed);
 }
 
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
 {
 	shmem_undo_range(inode, lstart, lend, false);
 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -5776,7 +5776,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 }
 #endif
 
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
 {
 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index a3d673533e32..fbe848fdc391 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -339,7 +339,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
  * page aligned properly.
  */
 void truncate_inode_pages_range(struct address_space *mapping,
-				loff_t lstart, loff_t lend)
+				loff_t lstart, uoff_t lend)
 {
 	pgoff_t		start;		/* inclusive */
 	pgoff_t		end;		/* exclusive */
-- 
cgit v1.2.3


From 54ca9e913e22e364292a484783efc4fcdb6fdc51 Mon Sep 17 00:00:00 2001
From: Askar Safin <safinaskar@gmail.com>
Date: Thu, 20 Nov 2025 19:51:40 +0000
Subject: include/linux/fs.h: trivial fix: regualr -> regular

Trivial fix.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
Link: https://patch.msgid.link/20251120195140.571608-1-safinaskar@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ff69734b9fde..e02700b4e36b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3102,7 +3102,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
  * file_start_write - get write access to a superblock for regular file io
  * @file: the file we want to write to
  *
- * This is a variant of sb_start_write() which is a noop on non-regualr file.
+ * This is a variant of sb_start_write() which is a noop on non-regular file.
  * Should be matched with a call to file_end_write().
  */
 static inline void file_start_write(struct file *file)
-- 
cgit v1.2.3


From f9f85149994dbb9db43202ae8fabf68940c0ac0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Nov 2025 18:06:26 +0100
Subject: fs, iomap: remove IOCB_DIO_CALLER_COMP

This was added by commit 099ada2c8726 ("io_uring/rw: add write support
for IOCB_DIO_CALLER_COMP") and disabled a little later by commit
838b35bb6a89 ("io_uring/rw: disable IOCB_DIO_CALLER_COMP") because it
didn't work.  Remove all the related code that sat unused for 2 years.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-2-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst |  4 --
 fs/backing-file.c                              |  6 ---
 fs/iomap/direct-io.c                           | 56 +-------------------------
 include/linux/fs.h                             | 43 +++++---------------
 io_uring/rw.c                                  | 16 +-------
 5 files changed, 13 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 64f4baf5750e..da982ca7e413 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -490,10 +490,6 @@ These ``struct kiocb`` flags are significant for direct I/O with iomap:
    Only meaningful for asynchronous I/O, and only if the entire I/O can
    be issued as a single ``struct bio``.
 
- * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's
-   process context.
-   See ``linux/fs.h`` for more details.
-
 Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and
 ``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
 function for the file.
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 15a7f8031084..2a86bb6fcd13 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -227,12 +227,6 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 	    !(file->f_mode & FMODE_CAN_ODIRECT))
 		return -EINVAL;
 
-	/*
-	 * Stacked filesystems don't support deferred completions, don't copy
-	 * this property in case it is set by the issuer.
-	 */
-	flags &= ~IOCB_DIO_CALLER_COMP;
-
 	old_cred = override_creds(ctx->cred);
 	if (is_sync_kiocb(iocb)) {
 		rwf_t rwf = iocb_to_rw_flags(flags);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 8b2f9fb89eb3..7659db85083a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -16,8 +16,7 @@
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
-#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
-#define IOMAP_DIO_CALLER_COMP	(1U << 26)
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 26)
 #define IOMAP_DIO_INLINE_COMP	(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
 #define IOMAP_DIO_NEED_SYNC	(1U << 29)
@@ -140,11 +139,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
-static ssize_t iomap_dio_deferred_complete(void *data)
-{
-	return iomap_dio_complete(data);
-}
-
 static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -182,29 +176,6 @@ static void iomap_dio_done(struct iomap_dio *dio)
 	} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
 		WRITE_ONCE(iocb->private, NULL);
 		iomap_dio_complete_work(&dio->aio.work);
-	} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
-		/*
-		 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
-		 * schedule our completion that way to avoid an async punt to a
-		 * workqueue.
-		 */
-		/* only polled IO cares about private cleared */
-		iocb->private = dio;
-		iocb->dio_complete = iomap_dio_deferred_complete;
-
-		/*
-		 * Invoke ->ki_complete() directly. We've assigned our
-		 * dio_complete callback handler, and since the issuer set
-		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
-		 * notice ->dio_complete being set and will defer calling that
-		 * handler until it can be done from a safe task context.
-		 *
-		 * Note that the 'res' being passed in here is not important
-		 * for this case. The actual completion value of the request
-		 * will be gotten from dio_complete when that is run by the
-		 * issuer.
-		 */
-		iocb->ki_complete(iocb, 0);
 	} else {
 		struct inode *inode = file_inode(iocb->ki_filp);
 
@@ -261,7 +232,6 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 			dio->flags |= IOMAP_DIO_INLINE_COMP;
 			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
 		}
-		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
 		iomap_dio_done(dio);
 	}
 
@@ -380,19 +350,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 
 		if (!(bio_opf & REQ_FUA))
 			dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
-
-		/*
-		 * We can only do deferred completion for pure overwrites that
-		 * don't require additional I/O at completion time.
-		 *
-		 * This rules out writes that need zeroing or extent conversion,
-		 * extend the file size, or issue metadata I/O or cache flushes
-		 * during completion processing.
-		 */
-		if (need_zeroout || (pos >= i_size_read(inode)) ||
-		    ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
-		     !(bio_opf & REQ_FUA)))
-			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
 	} else {
 		bio_opf |= REQ_OP_READ;
 	}
@@ -413,7 +370,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	 * ones we set for inline and deferred completions. If none of those
 	 * are available for this IO, clear the polled flag.
 	 */
-	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
+	if (!(dio->flags & IOMAP_DIO_INLINE_COMP))
 		dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
 	if (need_zeroout) {
@@ -669,15 +626,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomi.flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 
-		/*
-		 * Flag as supporting deferred completions, if the issuer
-		 * groks it. This can avoid a workqueue punt for writes.
-		 * We may later clear this flag if we need to do other IO
-		 * as part of this IO completion.
-		 */
-		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
-			dio->flags |= IOMAP_DIO_CALLER_COMP;
-
 		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
 			ret = -EAGAIN;
 			if (iomi.pos >= dio->i_size ||
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..e210d2d8af53 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -367,23 +367,9 @@ struct readahead_control;
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
-/*
- * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
- * iocb completion can be passed back to the owner for execution from a safe
- * context rather than needing to be punted through a workqueue. If this
- * flag is set, the bio completion handling may set iocb->dio_complete to a
- * handler function and iocb->private to context information for that handler.
- * The issuer should call the handler with that context information from task
- * context to complete the processing of the iocb. Note that while this
- * provides a task context for the dio_complete() callback, it should only be
- * used on the completion side for non-IO generating completions. It's fine to
- * call blocking functions from this callback, but they should not wait for
- * unrelated IO (like cache flushing, new IO generation, etc).
- */
-#define IOCB_DIO_CALLER_COMP	(1 << 22)
 /* kiocb is a read or write operation submitted by fs/aio.c. */
-#define IOCB_AIO_RW		(1 << 23)
-#define IOCB_HAS_METADATA	(1 << 24)
+#define IOCB_AIO_RW		(1 << 22)
+#define IOCB_HAS_METADATA	(1 << 23)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
@@ -400,7 +386,6 @@ struct readahead_control;
 	{ IOCB_WAITQ,		"WAITQ" }, \
 	{ IOCB_NOIO,		"NOIO" }, \
 	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
-	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }, \
 	{ IOCB_AIO_RW,		"AIO_RW" }, \
 	{ IOCB_HAS_METADATA,	"AIO_HAS_METADATA" }
 
@@ -412,23 +397,13 @@ struct kiocb {
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
 	u8			ki_write_stream;
-	union {
-		/*
-		 * Only used for async buffered reads, where it denotes the
-		 * page waitqueue associated with completing the read. Valid
-		 * IFF IOCB_WAITQ is set.
-		 */
-		struct wait_page_queue	*ki_waitq;
-		/*
-		 * Can be used for O_DIRECT IO, where the completion handling
-		 * is punted back to the issuer of the IO. May only be set
-		 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
-		 * must then check for presence of this handler when ki_complete
-		 * is invoked. The data passed in to this handler must be
-		 * assigned to ->private when dio_complete is assigned.
-		 */
-		ssize_t (*dio_complete)(void *data);
-	};
+
+	/*
+	 * Only used for async buffered reads, where it denotes the page
+	 * waitqueue associated with completing the read.
+	 * Valid IFF IOCB_WAITQ is set.
+	 */
+	struct wait_page_queue	*ki_waitq;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 08882648d569..4d0ab8f50d14 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -277,7 +277,6 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	} else {
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
-	rw->kiocb.dio_complete = NULL;
 	rw->kiocb.ki_flags = 0;
 	rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
 
@@ -566,15 +565,6 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
 
 void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
-	struct kiocb *kiocb = &rw->kiocb;
-
-	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
-		long res = kiocb->dio_complete(rw->kiocb.private);
-
-		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
-	}
-
 	io_req_io_end(req);
 
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
@@ -589,10 +579,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
 	struct io_kiocb *req = cmd_to_io_kiocb(rw);
 
-	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
-		__io_complete_rw_common(req, res);
-		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
-	}
+	__io_complete_rw_common(req, res);
+	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
 	req->io_task_work.func = io_req_rw_complete;
 	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
 }
-- 
cgit v1.2.3


From 24d4da5c2565313c2ad3c43449937a9351a64407 Mon Sep 17 00:00:00 2001
From: Ria Thomas <ria.thomas@morsemicro.com>
Date: Mon, 24 Nov 2025 18:26:37 +0530
Subject: wifi: ieee80211: correct FILS status codes

The FILS status codes are set to 108/109, but the IEEE 802.11-2020
spec defines them as 112/113. Update the enum so it matches the
specification and keeps the kernel consistent with standard values.

Fixes: a3caf7440ded ("cfg80211: Add support for FILS shared key authentication offload")
Signed-off-by: Ria Thomas <ria.thomas@morsemicro.com>
Reviewed-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
Link: https://patch.msgid.link/20251124125637.3936154-1-ria.thomas@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d55d8ea3a8be..96439de55f07 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1493,8 +1493,8 @@ enum ieee80211_statuscode {
 	WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99,
 	WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103,
 	/* 802.11ai */
-	WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 108,
-	WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109,
+	WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112,
+	WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113,
 	WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126,
 	WLAN_STATUS_SAE_PK = 127,
 	WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133,
-- 
cgit v1.2.3


From cba1ba11c1bae87de9c2e13d342bfbd6a3c1cf63 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Tue, 25 Nov 2025 13:59:26 +1100
Subject: wifi: cfg80211: include s1g_primary_2mhz when comparing chandefs

When comparing chandefs, ensure we include s1g_primary_2mhz.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251125025927.245280-3-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3d3ed1932262..899f267b7cf9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -974,7 +974,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
 		chandef1->center_freq1 == chandef2->center_freq1 &&
 		chandef1->freq1_offset == chandef2->freq1_offset &&
 		chandef1->center_freq2 == chandef2->center_freq2 &&
-		chandef1->punctured == chandef2->punctured);
+		chandef1->punctured == chandef2->punctured &&
+		chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz);
 }
 
 /**
-- 
cgit v1.2.3


From a27628f4363435beac84b55c749c41a005054d30 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 11 Oct 2025 00:17:36 +0200
Subject: fs: rework I_NEW handling to operate without fences

In the inode hash code grab the state while ->i_lock is held. If found
to be set, synchronize the sleep once more with the lock held.

In the real world the flag is not set most of the time.

Apart from being simpler to reason about, it comes with a minor speed up
as now clearing the flag does not require the smp_mb() fence.

While here rename wait_on_inode() to wait_on_new_inode() to line it up
with __wait_on_freeing_inode().

Christian Brauner <brauner@kernel.org> says:

As per the discussion in [1] I folded in the diff sent in [2].

Link: https://lore.kernel.org/69238e4d.a70a0220.d98e3.006e.GAE@google.com [1]
Link: https://lore.kernel.org/c2kpawomkbvtahjm7y5mposbhckb7wxthi3iqy5yr22ggpucrm@ufvxwy233qxo [2]
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251010221737.1403539-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/afs/dir.c       |  4 +--
 fs/dcache.c        | 10 ------
 fs/gfs2/glock.c    |  2 +-
 fs/inode.c         | 98 +++++++++++++++++++++++++++++++++---------------------
 include/linux/fs.h | 12 ++-----
 5 files changed, 66 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 89d36e3e5c79..f4e9e12373ac 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
 	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct inode *inode = NULL, *ti;
 	afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
-	bool supports_ibulk;
+	bool supports_ibulk, isnew;
 	long ret;
 	int i;
 
@@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
 			 * callback counters.
 			 */
 			ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
-					     afs_ilookup5_test_by_fid, &vp->fid);
+					     afs_ilookup5_test_by_fid, &vp->fid, &isnew);
 			if (!IS_ERR_OR_NULL(ti)) {
 				vnode = AFS_FS_I(ti);
 				vp->dv_before = vnode->status.data_version;
diff --git a/fs/dcache.c b/fs/dcache.c
index 78ffa7b7e824..25131f105a60 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1981,17 +1981,7 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	spin_lock(&inode->i_lock);
 	__d_instantiate(entry, inode);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
-	/*
-	 * Pairs with smp_rmb in wait_on_inode().
-	 */
-	smp_wmb();
 	inode_state_clear(inode, I_NEW | I_CREATING);
-	/*
-	 * Pairs with the barrier in prepare_to_wait_event() to make sure
-	 * ___wait_var_event() either sees the bit cleared or
-	 * waitqueue_active() check in wake_up_var() sees the waiter.
-	 */
-	smp_mb();
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b677c0e6b9ab..c9712235e7a0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
 		ip = NULL;
 	spin_unlock(&gl->gl_lockref.lock);
 	if (ip) {
-		wait_on_inode(&ip->i_inode);
+		wait_on_new_inode(&ip->i_inode);
 		if (is_bad_inode(&ip->i_inode)) {
 			iput(&ip->i_inode);
 			ip = NULL;
diff --git a/fs/inode.c b/fs/inode.c
index 3153d725859c..80298f048117 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -558,6 +558,32 @@ struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
 }
 EXPORT_SYMBOL(inode_bit_waitqueue);
 
+void wait_on_new_inode(struct inode *inode)
+{
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
+
+	spin_lock(&inode->i_lock);
+	if (!(inode_state_read(inode) & I_NEW)) {
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+	for (;;) {
+		prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (!(inode_state_read(inode) & I_NEW))
+			break;
+		spin_unlock(&inode->i_lock);
+		schedule();
+		spin_lock(&inode->i_lock);
+	}
+	finish_wait(wq_head, &wqe.wq_entry);
+	WARN_ON(inode_state_read(inode) & I_NEW);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(wait_on_new_inode);
+
 /*
  * Add inode to LRU if needed (inode is unused and clean).
  *
@@ -1008,7 +1034,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
 static struct inode *find_inode(struct super_block *sb,
 				struct hlist_head *head,
 				int (*test)(struct inode *, void *),
-				void *data, bool is_inode_hash_locked)
+				void *data, bool is_inode_hash_locked,
+				bool *isnew)
 {
 	struct inode *inode = NULL;
 
@@ -1035,6 +1062,7 @@ repeat:
 			return ERR_PTR(-ESTALE);
 		}
 		__iget(inode);
+		*isnew = !!(inode_state_read(inode) & I_NEW);
 		spin_unlock(&inode->i_lock);
 		rcu_read_unlock();
 		return inode;
@@ -1049,7 +1077,7 @@ repeat:
  */
 static struct inode *find_inode_fast(struct super_block *sb,
 				struct hlist_head *head, unsigned long ino,
-				bool is_inode_hash_locked)
+				bool is_inode_hash_locked, bool *isnew)
 {
 	struct inode *inode = NULL;
 
@@ -1076,6 +1104,7 @@ repeat:
 			return ERR_PTR(-ESTALE);
 		}
 		__iget(inode);
+		*isnew = !!(inode_state_read(inode) & I_NEW);
 		spin_unlock(&inode->i_lock);
 		rcu_read_unlock();
 		return inode;
@@ -1181,17 +1210,7 @@ void unlock_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
-	/*
-	 * Pairs with smp_rmb in wait_on_inode().
-	 */
-	smp_wmb();
 	inode_state_clear(inode, I_NEW | I_CREATING);
-	/*
-	 * Pairs with the barrier in prepare_to_wait_event() to make sure
-	 * ___wait_var_event() either sees the bit cleared or
-	 * waitqueue_active() check in wake_up_var() sees the waiter.
-	 */
-	smp_mb();
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
@@ -1202,17 +1221,7 @@ void discard_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
-	/*
-	 * Pairs with smp_rmb in wait_on_inode().
-	 */
-	smp_wmb();
 	inode_state_clear(inode, I_NEW);
-	/*
-	 * Pairs with the barrier in prepare_to_wait_event() to make sure
-	 * ___wait_var_event() either sees the bit cleared or
-	 * waitqueue_active() check in wake_up_var() sees the waiter.
-	 */
-	smp_mb();
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 	iput(inode);
@@ -1268,6 +1277,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
  * @test:	callback used for comparisons between inodes
  * @set:	callback used to initialize a new struct inode
  * @data:	opaque data pointer to pass to @test and @set
+ * @isnew:	pointer to a bool which will indicate whether I_NEW is set
  *
  * Search for the inode specified by @hashval and @data in the inode cache,
  * and if present return it with an increased reference count. This is a
@@ -1286,12 +1296,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
 {
 	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
 	struct inode *old;
+	bool isnew;
 
 	might_sleep();
 
 again:
 	spin_lock(&inode_hash_lock);
-	old = find_inode(inode->i_sb, head, test, data, true);
+	old = find_inode(inode->i_sb, head, test, data, true, &isnew);
 	if (unlikely(old)) {
 		/*
 		 * Uhhuh, somebody else created the same inode under us.
@@ -1300,7 +1311,8 @@ again:
 		spin_unlock(&inode_hash_lock);
 		if (IS_ERR(old))
 			return NULL;
-		wait_on_inode(old);
+		if (unlikely(isnew))
+			wait_on_new_inode(old);
 		if (unlikely(inode_unhashed(old))) {
 			iput(old);
 			goto again;
@@ -1391,15 +1403,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode, *new;
+	bool isnew;
 
 	might_sleep();
 
 again:
-	inode = find_inode(sb, head, test, data, false);
+	inode = find_inode(sb, head, test, data, false, &isnew);
 	if (inode) {
 		if (IS_ERR(inode))
 			return NULL;
-		wait_on_inode(inode);
+		if (unlikely(isnew))
+			wait_on_new_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
 			goto again;
@@ -1434,15 +1448,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 	struct inode *inode;
+	bool isnew;
 
 	might_sleep();
 
 again:
-	inode = find_inode_fast(sb, head, ino, false);
+	inode = find_inode_fast(sb, head, ino, false, &isnew);
 	if (inode) {
 		if (IS_ERR(inode))
 			return NULL;
-		wait_on_inode(inode);
+		if (unlikely(isnew))
+			wait_on_new_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
 			goto again;
@@ -1456,7 +1472,7 @@ again:
 
 		spin_lock(&inode_hash_lock);
 		/* We released the lock, so.. */
-		old = find_inode_fast(sb, head, ino, true);
+		old = find_inode_fast(sb, head, ino, true, &isnew);
 		if (!old) {
 			inode->i_ino = ino;
 			spin_lock(&inode->i_lock);
@@ -1482,7 +1498,8 @@ again:
 		if (IS_ERR(old))
 			return NULL;
 		inode = old;
-		wait_on_inode(inode);
+		if (unlikely(isnew))
+			wait_on_new_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
 			goto again;
@@ -1586,13 +1603,13 @@ EXPORT_SYMBOL(igrab);
  * Note2: @test is called with the inode_hash_lock held, so can't sleep.
  */
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
-		int (*test)(struct inode *, void *), void *data)
+		int (*test)(struct inode *, void *), void *data, bool *isnew)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
 	spin_lock(&inode_hash_lock);
-	inode = find_inode(sb, head, test, data, true);
+	inode = find_inode(sb, head, test, data, true, isnew);
 	spin_unlock(&inode_hash_lock);
 
 	return IS_ERR(inode) ? NULL : inode;
@@ -1620,13 +1637,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
 	struct inode *inode;
+	bool isnew;
 
 	might_sleep();
 
 again:
-	inode = ilookup5_nowait(sb, hashval, test, data);
+	inode = ilookup5_nowait(sb, hashval, test, data, &isnew);
 	if (inode) {
-		wait_on_inode(inode);
+		if (unlikely(isnew))
+			wait_on_new_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
 			goto again;
@@ -1648,16 +1667,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 	struct inode *inode;
+	bool isnew;
 
 	might_sleep();
 
 again:
-	inode = find_inode_fast(sb, head, ino, false);
+	inode = find_inode_fast(sb, head, ino, false, &isnew);
 
 	if (inode) {
 		if (IS_ERR(inode))
 			return NULL;
-		wait_on_inode(inode);
+		if (unlikely(isnew))
+			wait_on_new_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
 			goto again;
@@ -1800,6 +1821,7 @@ int insert_inode_locked(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	bool isnew;
 
 	might_sleep();
 
@@ -1832,9 +1854,11 @@ int insert_inode_locked(struct inode *inode)
 			return -EBUSY;
 		}
 		__iget(old);
+		isnew = !!(inode_state_read(old) & I_NEW);
 		spin_unlock(&old->i_lock);
 		spin_unlock(&inode_hash_lock);
-		wait_on_inode(old);
+		if (isnew)
+			wait_on_new_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
 			return -EBUSY;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21c73df3ce75..a813abdcf218 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1030,15 +1030,7 @@ static inline void inode_fake_hash(struct inode *inode)
 	hlist_add_fake(&inode->i_hash);
 }
 
-static inline void wait_on_inode(struct inode *inode)
-{
-	wait_var_event(inode_state_wait_address(inode, __I_NEW),
-		       !(inode_state_read_once(inode) & I_NEW));
-	/*
-	 * Pairs with routines clearing I_NEW.
-	 */
-	smp_rmb();
-}
+void wait_on_new_inode(struct inode *inode);
 
 /*
  * inode->i_rwsem nesting subclasses for the lock validator:
@@ -3417,7 +3409,7 @@ extern void d_mark_dontcache(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
 		unsigned long hashval, int (*test)(struct inode *, void *),
-		void *data);
+		void *data, bool *isnew);
 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data);
 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
-- 
cgit v1.2.3


From 4c6b40877b4dc83f61a762a3a35a09dcf744b585 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 29 Oct 2025 14:14:28 +0100
Subject: fs: cosmetic fixes to lru handling

1. inode_bit_waitqueue() was somehow placed between __inode_add_lru() and
   inode_add_lru(). move it up
2. assert ->i_lock is held in __inode_add_lru instead of just claiming it is
   needed
3. s/__inode_add_lru/__inode_lru_list_add/ for consistency with itself
   (inode_lru_list_del()) and similar routines for sb and io list
   management
4. push list presence check into inode_lru_list_del(), just like sb and
   io list

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251029131428.654761-2-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c  |  2 +-
 fs/inode.c         | 50 ++++++++++++++++++++++++++------------------------
 include/linux/fs.h |  2 +-
 mm/filemap.c       |  4 ++--
 mm/truncate.c      |  6 +++---
 mm/vmscan.c        |  2 +-
 mm/workingset.c    |  2 +-
 7 files changed, 35 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f784d8b09b04..c00b72e2d339 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1452,7 +1452,7 @@ static void inode_sync_complete(struct inode *inode)
 
 	inode_state_clear(inode, I_SYNC);
 	/* If inode is clean an unused, put it into LRU now... */
-	inode_add_lru(inode);
+	inode_lru_list_add(inode);
 	/* Called with inode->i_lock which ensures memory ordering. */
 	inode_wake_up_bit(inode, __I_SYNC);
 }
diff --git a/fs/inode.c b/fs/inode.c
index 80298f048117..7229a56732c6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -530,23 +530,6 @@ void ihold(struct inode *inode)
 }
 EXPORT_SYMBOL(ihold);
 
-static void __inode_add_lru(struct inode *inode, bool rotate)
-{
-	if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
-		return;
-	if (icount_read(inode))
-		return;
-	if (!(inode->i_sb->s_flags & SB_ACTIVE))
-		return;
-	if (!mapping_shrinkable(&inode->i_data))
-		return;
-
-	if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
-		this_cpu_inc(nr_unused);
-	else if (rotate)
-		inode_state_set(inode, I_REFERENCED);
-}
-
 struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
 					    struct inode *inode, u32 bit)
 {
@@ -584,18 +567,38 @@ void wait_on_new_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(wait_on_new_inode);
 
+static void __inode_lru_list_add(struct inode *inode, bool rotate)
+{
+	lockdep_assert_held(&inode->i_lock);
+
+	if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+		return;
+	if (icount_read(inode))
+		return;
+	if (!(inode->i_sb->s_flags & SB_ACTIVE))
+		return;
+	if (!mapping_shrinkable(&inode->i_data))
+		return;
+
+	if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
+		this_cpu_inc(nr_unused);
+	else if (rotate)
+		inode_state_set(inode, I_REFERENCED);
+}
+
 /*
  * Add inode to LRU if needed (inode is unused and clean).
- *
- * Needs inode->i_lock held.
  */
-void inode_add_lru(struct inode *inode)
+void inode_lru_list_add(struct inode *inode)
 {
-	__inode_add_lru(inode, false);
+	__inode_lru_list_add(inode, false);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
+	if (list_empty(&inode->i_lru))
+		return;
+
 	if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
 }
@@ -1920,7 +1923,7 @@ static void iput_final(struct inode *inode)
 	if (!drop &&
 	    !(inode_state_read(inode) & I_DONTCACHE) &&
 	    (sb->s_flags & SB_ACTIVE)) {
-		__inode_add_lru(inode, true);
+		__inode_lru_list_add(inode, true);
 		spin_unlock(&inode->i_lock);
 		return;
 	}
@@ -1944,8 +1947,7 @@ static void iput_final(struct inode *inode)
 		inode_state_replace(inode, I_WILL_FREE, I_FREEING);
 	}
 
-	if (!list_empty(&inode->i_lru))
-		inode_lru_list_del(inode);
+	inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
 
 	evict(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a813abdcf218..33129cda3a99 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3502,7 +3502,7 @@ static inline void remove_inode_hash(struct inode *inode)
 }
 
 extern void inode_sb_list_add(struct inode *inode);
-extern void inode_add_lru(struct inode *inode);
+extern void inode_lru_list_add(struct inode *inode);
 
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..add5228a7d97 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -256,7 +256,7 @@ void filemap_remove_folio(struct folio *folio)
 	__filemap_remove_folio(folio, NULL);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 
 	filemap_free_folio(mapping, folio);
@@ -335,7 +335,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
 	page_cache_delete_batch(mapping, fbatch);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 
 	for (i = 0; i < folio_batch_count(fbatch); i++)
diff --git a/mm/truncate.c b/mm/truncate.c
index 91eb92a5ce4f..ad9c0fa29d94 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping,
 
 	xas_unlock_irq(&xas);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 }
 
@@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
 
 	xas_unlock_irq(&xas);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 out:
 	folio_batch_remove_exceptionals(fbatch);
@@ -622,7 +622,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 	__filemap_remove_folio(folio, NULL);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 
 	filemap_free_folio(mapping, folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2fc8b626d3d..bb4a96c7b682 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -811,7 +811,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
 		__filemap_remove_folio(folio, shadow);
 		xa_unlock_irq(&mapping->i_pages);
 		if (mapping_shrinkable(mapping))
-			inode_add_lru(mapping->host);
+			inode_lru_list_add(mapping->host);
 		spin_unlock(&mapping->host->i_lock);
 
 		if (free_folio)
diff --git a/mm/workingset.c b/mm/workingset.c
index 68a76a91111f..d32dc2e02a61 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -755,7 +755,7 @@ out_invalid:
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping->host != NULL) {
 		if (mapping_shrinkable(mapping))
-			inode_add_lru(mapping->host);
+			inode_lru_list_add(mapping->host);
 		spin_unlock(&mapping->host->i_lock);
 	}
 	ret = LRU_REMOVED_RETRY;
-- 
cgit v1.2.3


From 1c6a92a5a5de7ebf94526dee7068926e6d5b1b01 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 24 Nov 2025 18:28:34 -0800
Subject: wifi: nl80211: vendor-cmd: intel: fix a blank kernel-doc line warning

Delete an empty line to prevent a kernel-doc warning:

Warning: ../include/uapi/linux/nl80211-vnd-intel.h:86 bad line:

Fixes: 3d2a2544eae9 ("nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251125022834.3171742-1-rdunlap@infradead.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211-vnd-intel.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h
index 4ed7d0b24512..79ccc9401d50 100644
--- a/include/uapi/linux/nl80211-vnd-intel.h
+++ b/include/uapi/linux/nl80211-vnd-intel.h
@@ -84,7 +84,6 @@ enum iwl_vendor_auth_akm_mode {
  *
  * @NUM_IWL_MVM_VENDOR_ATTR: number of vendor attributes
  * @MAX_IWL_MVM_VENDOR_ATTR: highest vendor attribute number
-
  */
 enum iwl_mvm_vendor_attr {
 	__IWL_MVM_VENDOR_ATTR_INVALID				= 0x00,
-- 
cgit v1.2.3


From 194832dcb13b0d02fce0df887235b7e6d1ef0121 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 18 Nov 2025 11:04:04 +0100
Subject: string: use __attribute__((nonnull())) in strends()

The arguments of strends() must not be NULL so annotate the function
with the nonnull attribute.

Suggested-by: Kees Cook <kees@kernel.org>
Link: https://lore.kernel.org/r/20251118-strends-follow-up-v1-2-d3f8ef750f59@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/string.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 69e9256592f8..0266dbdaa4cd 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -570,7 +570,8 @@ static inline bool strstarts(const char *str, const char *prefix)
  * Returns:
  * True if @str ends with @suffix. False in all other cases.
  */
-static inline bool strends(const char *str, const char *suffix)
+static inline bool __attribute__((nonnull(1, 2)))
+strends(const char *str, const char *suffix)
 {
 	unsigned int str_len = strlen(str), suffix_len = strlen(suffix);
 
-- 
cgit v1.2.3


From 155f8d4ef0b78afbf25b1449bbd654fd1327cc7a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 30 Oct 2025 11:01:37 +0000
Subject: ACPI: GTDT: Get rid of acpi_arch_timer_mem_init()

Since 0f67b56d84b4c ("clocksource/drivers/arm_arch_timer_mmio: Switch
over to standalone driver"), acpi_arch_timer_mem_init() is unused.

Remove it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/gtdt.c | 34 ----------------------------------
 include/linux/acpi.h      |  1 -
 2 files changed, 35 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index fd995a1d3d24..2f100c1fa2d4 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -303,40 +303,6 @@ error:
 	return -EINVAL;
 }
 
-/**
- * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table.
- * @timer_mem:	The pointer to the array of struct arch_timer_mem for returning
- *		the result of parsing. The element number of this array should
- *		be platform_timer_count(the total number of platform timers).
- * @timer_count: It points to a integer variable which is used for storing the
- *		number of GT blocks we have parsed.
- *
- * Return: 0 if success, -EINVAL/-ENODEV if error.
- */
-int __init acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem,
-				    int *timer_count)
-{
-	int ret;
-	void *platform_timer;
-
-	*timer_count = 0;
-	for_each_platform_timer(platform_timer) {
-		if (is_timer_block(platform_timer)) {
-			ret = gtdt_parse_timer_block(platform_timer, timer_mem);
-			if (ret)
-				return ret;
-			timer_mem++;
-			(*timer_count)++;
-		}
-	}
-
-	if (*timer_count)
-		pr_info("found %d memory-mapped timer block(s).\n",
-			*timer_count);
-
-	return 0;
-}
-
 /*
  * Initialize a SBSA generic Watchdog platform device info from GTDT
  */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..22b377c3a319 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -755,7 +755,6 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb);
 int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
 int acpi_gtdt_map_ppi(int type);
 bool acpi_gtdt_c3stop(int type);
-int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
 #endif
 
 #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
-- 
cgit v1.2.3


From 2d45db63260c6ae3cf007361e04a1c41bd265084 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 10 Nov 2025 22:09:16 -0800
Subject: backlight: lp855x: Fix lp855x.h kernel-doc warnings

Add a missing struct short description and a missing leading " *" to
lp855x.h to avoid kernel-doc warnings:

Warning: include/linux/platform_data/lp855x.h:126 missing initial short
 description on line:
 * struct lp855x_platform_data
Warning: include/linux/platform_data/lp855x.h:131 bad line:
   Only valid when mode is PWM_BASED.

Fixes: 7be865ab8634 ("backlight: new backlight driver for LP855x devices")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Daniel Thompson (RISCstar) <danielt@kernel.org>
Link: https://patch.msgid.link/20251111060916.1995920-1-rdunlap@infradead.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/platform_data/lp855x.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/lp855x.h b/include/linux/platform_data/lp855x.h
index ab222dd05bbc..3b4a891acefe 100644
--- a/include/linux/platform_data/lp855x.h
+++ b/include/linux/platform_data/lp855x.h
@@ -124,12 +124,12 @@ struct lp855x_rom_data {
 };
 
 /**
- * struct lp855x_platform_data
+ * struct lp855x_platform_data - lp855 platform-specific data
  * @name : Backlight driver name. If it is not defined, default name is set.
  * @device_control : value of DEVICE CONTROL register
  * @initial_brightness : initial value of backlight brightness
  * @period_ns : platform specific pwm period value. unit is nano.
-		Only valid when mode is PWM_BASED.
+ *		Only valid when mode is PWM_BASED.
  * @size_program : total size of lp855x_rom_data
  * @rom_data : list of new eeprom/eprom registers
  */
-- 
cgit v1.2.3


From 34fa09c698d626b09f7824fe2c520a0a21a072b9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Nov 2025 14:50:25 +0100
Subject: Revert "ACPI: processor: Do not expose global variable
 acpi_idle_driver"

Revert commit 559f2eacc8a2 ("ACPI: processor: Do not expose global variable
acpi_idle_driver") because it depends on a problematic one.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_driver.c | 3 ++-
 drivers/acpi/processor_idle.c   | 9 +--------
 include/acpi/processor.h        | 1 +
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 5d824435b26b..de17c1412678 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -166,7 +166,8 @@ static int __acpi_processor_start(struct acpi_device *device)
 	if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS))
 		dev_dbg(&device->dev, "CPPC data invalid or not present\n");
 
-	acpi_processor_power_init(pr);
+	if (cpuidle_get_driver() == &acpi_idle_driver)
+		acpi_processor_power_init(pr);
 
 	acpi_pss_perf_init(pr);
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 22b051b94a86..698d14c19587 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -51,7 +51,7 @@ module_param(latency_factor, uint, 0644);
 
 static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
 
-static struct cpuidle_driver acpi_idle_driver = {
+struct cpuidle_driver acpi_idle_driver = {
 	.name =		"acpi_idle",
 	.owner =	THIS_MODULE,
 };
@@ -1404,13 +1404,6 @@ void acpi_processor_power_init(struct acpi_processor *pr)
 {
 	struct cpuidle_device *dev;
 
-	/*
-	 * The code below only works if the current cpuidle driver is the ACPI
-	 * idle driver.
-	 */
-	if (cpuidle_get_driver() != &acpi_idle_driver)
-		return;
-
 	if (disabled_by_idle_boot_param())
 		return;
 
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 7146a8e9e9c2..24fdaa3c2899 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -417,6 +417,7 @@ static inline void acpi_processor_throttling_init(void) {}
 #endif	/* CONFIG_ACPI_CPU_FREQ_PSS */
 
 /* in processor_idle.c */
+extern struct cpuidle_driver acpi_idle_driver;
 #ifdef CONFIG_ACPI_PROCESSOR_IDLE
 void acpi_processor_power_init(struct acpi_processor *pr);
 void acpi_processor_power_exit(struct acpi_processor *pr);
-- 
cgit v1.2.3


From 66e600a26ee7d845d9434c3d60cef4bbf7dd3eb4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Nov 2025 14:53:33 +0100
Subject: Revert "ACPI: processor: idle: Redefine two functions as void"

Revert commit fbd401e95e56 ("ACPI: processor: idle: Redefine two
functions as void") because it depends on a problematic one.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_idle.c | 41 ++++++++++++++++++++++-------------------
 include/acpi/processor.h      |  4 ++--
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 698d14c19587..5dacf41d7cc0 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1400,45 +1400,47 @@ void acpi_processor_unregister_idle_driver(void)
 	cpuidle_unregister_driver(&acpi_idle_driver);
 }
 
-void acpi_processor_power_init(struct acpi_processor *pr)
+int acpi_processor_power_init(struct acpi_processor *pr)
 {
+	int retval;
 	struct cpuidle_device *dev;
 
 	if (disabled_by_idle_boot_param())
-		return;
+		return 0;
 
 	acpi_processor_cstate_first_run_checks();
 
 	if (!acpi_processor_get_power_info(pr))
 		pr->flags.power_setup_done = 1;
 
-	if (!pr->flags.power)
-		return;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return;
+	if (pr->flags.power) {
+		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			return -ENOMEM;
+		per_cpu(acpi_cpuidle_device, pr->id) = dev;
 
-	per_cpu(acpi_cpuidle_device, pr->id) = dev;
+		acpi_processor_setup_cpuidle_dev(pr, dev);
 
-	acpi_processor_setup_cpuidle_dev(pr, dev);
+		/* Register per-cpu cpuidle_device. Cpuidle driver
+		 * must already be registered before registering device
+		 */
+		retval = cpuidle_register_device(dev);
+		if (retval) {
 
-	/*
-	 * Register a cpuidle device for this CPU.  The cpuidle driver using
-	 * this device is expected to be registered.
-	 */
-	if (cpuidle_register_device(dev)) {
-		per_cpu(acpi_cpuidle_device, pr->id) = NULL;
-		kfree(dev);
+			per_cpu(acpi_cpuidle_device, pr->id) = NULL;
+			kfree(dev);
+			return retval;
+		}
 	}
+	return 0;
 }
 
-void acpi_processor_power_exit(struct acpi_processor *pr)
+int acpi_processor_power_exit(struct acpi_processor *pr)
 {
 	struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id);
 
 	if (disabled_by_idle_boot_param())
-		return;
+		return 0;
 
 	if (pr->flags.power) {
 		cpuidle_unregister_device(dev);
@@ -1446,6 +1448,7 @@ void acpi_processor_power_exit(struct acpi_processor *pr)
 	}
 
 	pr->flags.power_setup_done = 0;
+	return 0;
 }
 
 MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE");
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 24fdaa3c2899..6ee4a69412de 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -419,8 +419,8 @@ static inline void acpi_processor_throttling_init(void) {}
 /* in processor_idle.c */
 extern struct cpuidle_driver acpi_idle_driver;
 #ifdef CONFIG_ACPI_PROCESSOR_IDLE
-void acpi_processor_power_init(struct acpi_processor *pr);
-void acpi_processor_power_exit(struct acpi_processor *pr);
+int acpi_processor_power_init(struct acpi_processor *pr);
+int acpi_processor_power_exit(struct acpi_processor *pr);
 int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
 int acpi_processor_hotplug(struct acpi_processor *pr);
 void acpi_processor_register_idle_driver(void);
-- 
cgit v1.2.3


From e6889323c2184c700428dd4b90a1c2c06b8ae51f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Nov 2025 15:03:24 +0100
Subject: Revert "ACPI: processor: idle: Rearrange declarations in header file"

Revert commit bdf780fbcef5 ("ACPI: processor: idle: Rearrange declarations
in header file") because it depends on a problematic one.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/processor.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 6ee4a69412de..2976a6d0c54f 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -425,8 +425,6 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
 int acpi_processor_hotplug(struct acpi_processor *pr);
 void acpi_processor_register_idle_driver(void);
 void acpi_processor_unregister_idle_driver(void);
-int acpi_processor_ffh_lpi_probe(unsigned int cpu);
-int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
 #endif /* CONFIG_ACPI_PROCESSOR_IDLE */
 
 /* in processor_thermal.c */
@@ -449,6 +447,11 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy)
 }
 #endif	/* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_ACPI_PROCESSOR_IDLE
+extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
+extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
+#endif
+
 void acpi_processor_init_invariance_cppc(void);
 
 #endif
-- 
cgit v1.2.3


From 1a8b3501821b608383f7c7aa0f24e2006681e2b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Nov 2025 15:05:01 +0100
Subject: Revert "ACPI: processor: Remove unused empty stubs of some functions"

Revert commit 5020d05b3476 ("ACPI: processor: Remove unused empty stubs
of some functions") because it depends on a problematic one.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/processor.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 2976a6d0c54f..ff864c1cee3a 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -425,6 +425,26 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
 int acpi_processor_hotplug(struct acpi_processor *pr);
 void acpi_processor_register_idle_driver(void);
 void acpi_processor_unregister_idle_driver(void);
+#else
+static inline int acpi_processor_power_init(struct acpi_processor *pr)
+{
+	return -ENODEV;
+}
+
+static inline int acpi_processor_power_exit(struct acpi_processor *pr)
+{
+	return -ENODEV;
+}
+
+static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr)
+{
+	return -ENODEV;
+}
+
+static inline int acpi_processor_hotplug(struct acpi_processor *pr)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_ACPI_PROCESSOR_IDLE */
 
 /* in processor_thermal.c */
-- 
cgit v1.2.3


From 43ff36c4a5a574ee83b4b0d3f3d74f09a3a8c2d3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Nov 2025 15:06:12 +0100
Subject: Revert "ACPI: processor: idle: Optimize ACPI idle driver
 registration"

Revert commit 7a8c994cbb2d ("ACPI: processor: idle: Optimize ACPI idle
driver registration") because it is reported to introduce a cpuidle
regression leading to a kernel crash on a platform using the ACPI idle
driver.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Closes: https://lore.kernel.org/lkml/20251124200019.GIaSS5U9HhsWBotrQZ@fat_crate.local/
---
 drivers/acpi/processor_driver.c |  3 --
 drivers/acpi/processor_idle.c   | 65 +++++++++++++++--------------------------
 include/acpi/processor.h        |  2 --
 3 files changed, 23 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index de17c1412678..7644de24d2fa 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -263,8 +263,6 @@ static int __init acpi_processor_driver_init(void)
 	if (result < 0)
 		return result;
 
-	acpi_processor_register_idle_driver();
-
 	result = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
 				   "acpi/cpu-drv:online",
 				   acpi_soft_cpu_online, NULL);
@@ -303,7 +301,6 @@ static void __exit acpi_processor_driver_exit(void)
 
 	cpuhp_remove_state_nocalls(hp_online);
 	cpuhp_remove_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD);
-	acpi_processor_unregister_idle_driver();
 	driver_unregister(&acpi_processor_driver);
 }
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5dacf41d7cc0..4166090db642 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1357,48 +1357,7 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr)
 	return 0;
 }
 
-void acpi_processor_register_idle_driver(void)
-{
-	struct acpi_processor *pr;
-	int ret = -ENODEV;
-	int cpu;
-
-	/*
-	 * Acpi idle driver is used by all possible CPUs.
-	 * Install the idle handler by the processor power info of one in them.
-	 * Note that we use previously set idle handler will be used on
-	 * platforms that only support C1.
-	 */
-	for_each_cpu(cpu, (struct cpumask *)cpu_possible_mask) {
-		pr = per_cpu(processors, cpu);
-		if (!pr)
-			continue;
-
-		ret = acpi_processor_get_power_info(pr);
-		if (!ret) {
-			pr->flags.power_setup_done = 1;
-			acpi_processor_setup_cpuidle_states(pr);
-			break;
-		}
-	}
-
-	if (ret) {
-		pr_debug("No ACPI power information from any CPUs.\n");
-		return;
-	}
-
-	ret = cpuidle_register_driver(&acpi_idle_driver);
-	if (ret) {
-		pr_debug("register %s failed.\n", acpi_idle_driver.name);
-		return;
-	}
-	pr_debug("%s registered with cpuidle.\n", acpi_idle_driver.name);
-}
-
-void acpi_processor_unregister_idle_driver(void)
-{
-	cpuidle_unregister_driver(&acpi_idle_driver);
-}
+static int acpi_processor_registered;
 
 int acpi_processor_power_init(struct acpi_processor *pr)
 {
@@ -1413,7 +1372,22 @@ int acpi_processor_power_init(struct acpi_processor *pr)
 	if (!acpi_processor_get_power_info(pr))
 		pr->flags.power_setup_done = 1;
 
+	/*
+	 * Install the idle handler if processor power management is supported.
+	 * Note that we use previously set idle handler will be used on
+	 * platforms that only support C1.
+	 */
 	if (pr->flags.power) {
+		/* Register acpi_idle_driver if not already registered */
+		if (!acpi_processor_registered) {
+			acpi_processor_setup_cpuidle_states(pr);
+			retval = cpuidle_register_driver(&acpi_idle_driver);
+			if (retval)
+				return retval;
+			pr_debug("%s registered with cpuidle\n",
+				 acpi_idle_driver.name);
+		}
+
 		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 		if (!dev)
 			return -ENOMEM;
@@ -1426,11 +1400,14 @@ int acpi_processor_power_init(struct acpi_processor *pr)
 		 */
 		retval = cpuidle_register_device(dev);
 		if (retval) {
+			if (acpi_processor_registered == 0)
+				cpuidle_unregister_driver(&acpi_idle_driver);
 
 			per_cpu(acpi_cpuidle_device, pr->id) = NULL;
 			kfree(dev);
 			return retval;
 		}
+		acpi_processor_registered++;
 	}
 	return 0;
 }
@@ -1444,6 +1421,10 @@ int acpi_processor_power_exit(struct acpi_processor *pr)
 
 	if (pr->flags.power) {
 		cpuidle_unregister_device(dev);
+		acpi_processor_registered--;
+		if (acpi_processor_registered == 0)
+			cpuidle_unregister_driver(&acpi_idle_driver);
+
 		kfree(dev);
 	}
 
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index ff864c1cee3a..d0eccbd920e5 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -423,8 +423,6 @@ int acpi_processor_power_init(struct acpi_processor *pr);
 int acpi_processor_power_exit(struct acpi_processor *pr);
 int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
 int acpi_processor_hotplug(struct acpi_processor *pr);
-void acpi_processor_register_idle_driver(void);
-void acpi_processor_unregister_idle_driver(void);
 #else
 static inline int acpi_processor_power_init(struct acpi_processor *pr)
 {
-- 
cgit v1.2.3


From b2a38f6df9dab0b05858746edcbe2403f8f4e4ec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:43 +0000
Subject: net_sched: make room for (struct qdisc_skb_cb)->pkt_segs

Add a new u16 field, next to pkt_len : pkt_segs

This will cache shinfo->gso_segs to speed up qdisc dequeue().

Move slave_dev_queue_mapping at the end of qdisc_skb_cb,
and move three bits from tc_skb_cb :
- post_ct
- post_ct_snat
- post_ct_dnat

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-2-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h | 18 +++++++++---------
 net/core/dev.c            |  2 +-
 net/sched/act_ct.c        |  8 ++++----
 net/sched/cls_api.c       |  6 +++---
 net/sched/cls_flower.c    |  2 +-
 5 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 94966692ccdf..9cd8b5d4b236 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -429,13 +429,16 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	struct {
-		unsigned int		pkt_len;
-		u16			slave_dev_queue_mapping;
-		u16			tc_classid;
-	};
+	unsigned int		pkt_len;
+	u16			pkt_segs;
+	u16			tc_classid;
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
+
+	u16			slave_dev_queue_mapping;
+	u8			post_ct:1;
+	u8			post_ct_snat:1;
+	u8			post_ct_dnat:1;
 };
 
 typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
@@ -1064,11 +1067,8 @@ struct tc_skb_cb {
 	struct qdisc_skb_cb qdisc_cb;
 	u32 drop_reason;
 
-	u16 zone; /* Only valid if post_ct = true */
+	u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */
 	u16 mru;
-	u8 post_ct:1;
-	u8 post_ct_snat:1;
-	u8 post_ct_dnat:1;
 };
 
 static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index 69515edd17bc..46ce6c610780 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 		return ret;
 
 	tc_skb_cb(skb)->mru = 0;
-	tc_skb_cb(skb)->post_ct = false;
+	qdisc_skb_cb(skb)->post_ct = false;
 	tcf_set_drop_reason(skb, *drop_reason);
 
 	mini_qdisc_bstats_cpu_update(miniq, skb);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6749a4a9a9cd..2b6ac7069dc1 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
 		return err & NF_VERDICT_MASK;
 
 	if (action & BIT(NF_NAT_MANIP_SRC))
-		tc_skb_cb(skb)->post_ct_snat = 1;
+		qdisc_skb_cb(skb)->post_ct_snat = 1;
 	if (action & BIT(NF_NAT_MANIP_DST))
-		tc_skb_cb(skb)->post_ct_dnat = 1;
+		qdisc_skb_cb(skb)->post_ct_dnat = 1;
 
 	return err;
 #else
@@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	tcf_action_update_bstats(&c->common, skb);
 
 	if (clear) {
-		tc_skb_cb(skb)->post_ct = false;
+		qdisc_skb_cb(skb)->post_ct = false;
 		ct = nf_ct_get(skb, &ctinfo);
 		if (ct) {
 			nf_ct_put(ct);
@@ -1097,7 +1097,7 @@ do_nat:
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-	tc_skb_cb(skb)->post_ct = true;
+	qdisc_skb_cb(skb)->post_ct = true;
 	tc_skb_cb(skb)->zone = p->zone;
 out_clear:
 	if (defrag)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f751cd5eeac8..ebca4b926dcf 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb,
 			}
 			ext->chain = last_executed_chain;
 			ext->mru = cb->mru;
-			ext->post_ct = cb->post_ct;
-			ext->post_ct_snat = cb->post_ct_snat;
-			ext->post_ct_dnat = cb->post_ct_dnat;
+			ext->post_ct = qdisc_skb_cb(skb)->post_ct;
+			ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat;
+			ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat;
 			ext->zone = cb->zone;
 		}
 	}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 099ff6a3e1f5..7669371c1354 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb,
 				  struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	bool post_ct = tc_skb_cb(skb)->post_ct;
+	bool post_ct = qdisc_skb_cb(skb)->post_ct;
 	u16 zone = tc_skb_cb(skb)->zone;
 	struct fl_flow_key skb_key;
 	struct fl_flow_mask *mask;
-- 
cgit v1.2.3


From 2773cb0b3120eb5c4b66d949eb99853d5bae1221 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:47 +0000
Subject: net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update()

Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.

This gives a 5 % improvement in a TX intensive workload.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-6-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h | 13 ++++++++++---
 net/sched/sch_cake.c      |  1 +
 net/sched/sch_dualpi2.c   |  1 +
 net/sched/sch_netem.c     |  1 +
 net/sched/sch_qfq.c       |  2 +-
 net/sched/sch_taprio.c    |  1 +
 net/sched/sch_tbf.c       |  1 +
 7 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9cd8b5d4b236..cdf7a58ebcf5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->pkt_len;
 }
 
+static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
+{
+	u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
+
+	DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
+			(skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
+	return pkt_segs;
+}
+
 /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
 enum net_xmit_qdisc_t {
 	__NET_XMIT_STOLEN = 0x00010000,
@@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
 static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
 				 const struct sk_buff *skb)
 {
-	_bstats_update(bstats,
-		       qdisc_pkt_len(skb),
-		       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+	_bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb));
 }
 
 static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 9213129f0de1..a20880034aa5 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		skb_list_walk_safe(segs, segs, nskb) {
 			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			qdisc_skb_cb(segs)->pkt_segs = 1;
 			cobalt_set_enqueue_time(segs, now);
 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
 									  segs);
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 4b975feb52b1..6d7e6389758d 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
 			 */
 			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+			qdisc_skb_cb(nskb)->pkt_segs = 1;
 			dualpi2_skb_cb(nskb)->classified =
 				dualpi2_skb_cb(skb)->classified;
 			dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eafc316ae319..32a5f3304046 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 	struct sk_buff *segs;
 	netdev_features_t features = netif_skb_features(skb);
 
+	qdisc_skb_cb(skb)->pkt_segs = 1;
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs)) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 2255355e51d3..d920f57dc6d7 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 	}
 
-	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+	gso_segs = qdisc_pkt_segs(skb);
 	err = qdisc_enqueue(skb, cl->qdisc, to_free);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 39b735386996..300d577b3286 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
 	skb_list_walk_safe(segs, segs, nskb) {
 		skb_mark_not_on_list(segs);
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
+		qdisc_skb_cb(segs)->pkt_segs = 1;
 		slen += segs->len;
 
 		/* FIXME: we should be segmenting to a smaller size
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 4c977f049670..f2340164f579 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 		skb_mark_not_on_list(segs);
 		seg_len = segs->len;
 		qdisc_skb_cb(segs)->pkt_len = seg_len;
+		qdisc_skb_cb(segs)->pkt_segs = 1;
 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
 		if (ret != NET_XMIT_SUCCESS) {
 			if (net_xmit_drop_count(ret))
-- 
cgit v1.2.3


From ad50d5a3fc20327e133e2db849c6e67fc49650e6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:49 +0000
Subject: net_sched: add Qdisc_read_mostly and Qdisc_write groups

It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in
fast path by reducing this to a single dirtied cache line.

In current layout, we change only four/six fields in the first cache line:
 - q.spinlock
 - q.qlen
 - bstats.bytes
 - bstats.packets
 - some Qdisc also change q.next/q.prev

In the second cache line we change in the fast path:
 - running
 - state
 - qstats.backlog

        /* --- cacheline 2 boundary (128 bytes) --- */
        struct sk_buff_head        gso_skb __attribute__((__aligned__(64))); /*  0x80  0x18 */
        struct qdisc_skb_head      q;                    /*  0x98  0x18 */
        struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xb0  0x10 */

        /* --- cacheline 3 boundary (192 bytes) --- */
        struct gnet_stats_queue    qstats;               /*  0xc0  0x14 */
        bool                       running;              /*  0xd4   0x1 */

        /* XXX 3 bytes hole, try to pack */

        unsigned long              state;                /*  0xd8   0x8 */
        struct Qdisc *             next_sched;           /*  0xe0   0x8 */
        struct sk_buff_head        skb_bad_txq;          /*  0xe8  0x18 */
        /* --- cacheline 4 boundary (256 bytes) --- */

Reorganize things to have a first cache line mostly read,
then a mostly written one.

This gives a ~3% increase of performance under tx stress.

Note that there is an additional hole because @qstats now spans over a third cache line.

	/* --- cacheline 2 boundary (128 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /*  0x80     0 */
	struct sk_buff_head        gso_skb;              /*  0x80  0x18 */
	struct Qdisc *             next_sched;           /*  0x98   0x8 */
	struct sk_buff_head        skb_bad_txq;          /*  0xa0  0x18 */
	__u8                       __cacheline_group_end__Qdisc_read_mostly[0]; /*  0xb8     0 */

	/* XXX 8 bytes hole, try to pack */

	/* --- cacheline 3 boundary (192 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /*  0xc0     0 */
	struct qdisc_skb_head      q;                    /*  0xc0  0x18 */
	unsigned long              state;                /*  0xd8   0x8 */
	struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xe0  0x10 */
	bool                       running;              /*  0xf0   0x1 */

	/* XXX 3 bytes hole, try to pack */

	struct gnet_stats_queue    qstats;               /*  0xf4  0x14 */
	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	__u8                       __cacheline_group_end__Qdisc_write[0]; /* 0x108     0 */

	/* XXX 56 bytes hole, try to pack */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cdf7a58ebcf5..79501499dafb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -103,17 +103,24 @@ struct Qdisc {
 	int			pad;
 	refcount_t		refcnt;
 
-	/*
-	 * For performance sake on SMP, we put highly modified fields at the end
-	 */
-	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
-	struct qdisc_skb_head	q;
-	struct gnet_stats_basic_sync bstats;
-	struct gnet_stats_queue	qstats;
-	bool			running; /* must be written under qdisc spinlock */
-	unsigned long		state;
-	struct Qdisc            *next_sched;
-	struct sk_buff_head	skb_bad_txq;
+	/* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+	__cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+		struct sk_buff_head	gso_skb;
+		struct Qdisc		*next_sched;
+		struct sk_buff_head	skb_bad_txq;
+	__cacheline_group_end(Qdisc_read_mostly);
+
+	/* Fields dirtied in dequeue() fast path. */
+	__cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+		struct qdisc_skb_head	q;
+		unsigned long		state;
+		struct gnet_stats_basic_sync bstats;
+		bool			running; /* must be written under qdisc spinlock */
+
+		/* Note : we only change qstats.backlog in fast path. */
+		struct gnet_stats_queue	qstats;
+	__cacheline_group_end(Qdisc_write);
+
 
 	atomic_long_t		defer_count ____cacheline_aligned_in_smp;
 	struct llist_head	defer_list;
-- 
cgit v1.2.3


From 0170d7f47c8bb0311bc802bad52245c045f151fe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:54 +0000
Subject: net_sched: add tcf_kfree_skb_list() helper

Using kfree_skb_list_reason() to free list of skbs from qdisc
operations seems wrong as each skb might have a different drop reason.

Cleanup __dev_xmit_skb() to call tcf_kfree_skb_list() once
in preparation of the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-13-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h | 11 +++++++++++
 net/core/dev.c            | 15 +++++----------
 2 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 79501499dafb..b8092d0378a0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1105,6 +1105,17 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb,
 	tc_skb_cb(skb)->drop_reason = reason;
 }
 
+static inline void tcf_kfree_skb_list(struct sk_buff *skb)
+{
+	while (unlikely(skb)) {
+		struct sk_buff *next = skb->next;
+
+		prefetch(next);
+		kfree_skb_reason(skb, tcf_get_drop_reason(skb));
+		skb = next;
+	}
+}
+
 /* Instead of calling kfree_skb() while root qdisc lock is held,
  * queue the skb for future freeing at end of __dev_xmit_skb()
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index 10042139dbb0..e865cdb9b696 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4162,7 +4162,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				__qdisc_run(q);
 				qdisc_run_end(q);
 
-				goto no_lock_out;
+				goto free_skbs;
 			}
 
 			qdisc_bstats_cpu_update(q, skb);
@@ -4176,12 +4176,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 		qdisc_run(q);
-
-no_lock_out:
-		if (unlikely(to_free))
-			kfree_skb_list_reason(to_free,
-					      tcf_get_drop_reason(to_free));
-		return rc;
+		goto free_skbs;
 	}
 
 	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
@@ -4257,9 +4252,9 @@ no_lock_out:
 	}
 unlock:
 	spin_unlock(root_lock);
-	if (unlikely(to_free))
-		kfree_skb_list_reason(to_free,
-				      tcf_get_drop_reason(to_free));
+
+free_skbs:
+	tcf_kfree_skb_list(to_free);
 	return rc;
 }
 
-- 
cgit v1.2.3


From 191ff13e42a7b7824fec5b2ed84fd6481356754d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:55 +0000
Subject: net_sched: add qdisc_dequeue_drop() helper

Some qdisc like cake, codel, fq_codel might drop packets
in their dequeue() method.

This is currently problematic because dequeue() runs with
the qdisc spinlock held. Freeing skbs can be extremely expensive.

Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS
so that these qdiscs can opt-in to defer the skb frees
after the qdisc spinlock is released.

TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs
with an extra cache line miss.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/pkt_sched.h   |  5 +++--
 include/net/sch_generic.h | 30 +++++++++++++++++++++++++++---
 net/core/dev.c            | 22 +++++++++++++---------
 3 files changed, 43 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 4678db45832a..e703c507d0da 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 void __qdisc_run(struct Qdisc *q);
 
-static inline void qdisc_run(struct Qdisc *q)
+static inline struct sk_buff *qdisc_run(struct Qdisc *q)
 {
 	if (qdisc_run_begin(q)) {
 		__qdisc_run(q);
-		qdisc_run_end(q);
+		return qdisc_run_end(q);
 	}
+	return NULL;
 }
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index b8092d0378a0..c3a7268b567e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -88,6 +88,8 @@ struct Qdisc {
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
 #define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
 #define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
+#define TCQ_F_DEQUEUE_DROPS	0x400 /* ->dequeue() can drop packets in q->to_free */
+
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
@@ -119,6 +121,8 @@ struct Qdisc {
 
 		/* Note : we only change qstats.backlog in fast path. */
 		struct gnet_stats_queue	qstats;
+
+		struct sk_buff		*to_free;
 	__cacheline_group_end(Qdisc_write);
 
 
@@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 	return true;
 }
 
-static inline void qdisc_run_end(struct Qdisc *qdisc)
+static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
 {
+	struct sk_buff *to_free = NULL;
+
 	if (qdisc->flags & TCQ_F_NOLOCK) {
 		spin_unlock(&qdisc->seqlock);
 
@@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 		if (unlikely(test_bit(__QDISC_STATE_MISSED,
 				      &qdisc->state)))
 			__netif_schedule(qdisc);
-	} else {
-		WRITE_ONCE(qdisc->running, false);
+		return NULL;
+	}
+
+	if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
+		to_free = qdisc->to_free;
+		if (to_free)
+			qdisc->to_free = NULL;
 	}
+	WRITE_ONCE(qdisc->running, false);
+	return to_free;
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb)
 	}
 }
 
+static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
+				      enum skb_drop_reason reason)
+{
+	DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
+	DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK);
+
+	tcf_set_drop_reason(skb, reason);
+	skb->next = q->to_free;
+	q->to_free = skb;
+}
+
 /* Instead of calling kfree_skb() while root qdisc lock is held,
  * queue the skb for future freeing at end of __dev_xmit_skb()
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index e865cdb9b696..9094c0fb8c68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
 {
-	struct sk_buff *next, *to_free = NULL;
+	struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
 	spinlock_t *root_lock = qdisc_lock(q);
 	struct llist_node *ll_list, *first_n;
 	unsigned long defer_count = 0;
@@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			if (unlikely(!nolock_qdisc_is_empty(q))) {
 				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 				__qdisc_run(q);
-				qdisc_run_end(q);
+				to_free2 = qdisc_run_end(q);
 
 				goto free_skbs;
 			}
@@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			    !nolock_qdisc_is_empty(q))
 				__qdisc_run(q);
 
-			qdisc_run_end(q);
-			return NET_XMIT_SUCCESS;
+			to_free2 = qdisc_run_end(q);
+			rc = NET_XMIT_SUCCESS;
+			goto free_skbs;
 		}
 
 		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		qdisc_run(q);
+		to_free2 = qdisc_run(q);
 		goto free_skbs;
 	}
 
@@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		qdisc_bstats_update(q, skb);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
 			__qdisc_run(q);
-		qdisc_run_end(q);
+		to_free2 = qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		int count = 0;
@@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 			count++;
 		}
-		qdisc_run(q);
+		to_free2 = qdisc_run(q);
 		if (count != 1)
 			rc = NET_XMIT_SUCCESS;
 	}
@@ -4255,6 +4256,7 @@ unlock:
 
 free_skbs:
 	tcf_kfree_skb_list(to_free);
+	tcf_kfree_skb_list(to_free2);
 	return rc;
 }
 
@@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void)
 		rcu_read_lock();
 
 		while (head) {
-			struct Qdisc *q = head;
 			spinlock_t *root_lock = NULL;
+			struct sk_buff *to_free;
+			struct Qdisc *q = head;
 
 			head = head->next_sched;
 
@@ -5775,9 +5778,10 @@ static __latent_entropy void net_tx_action(void)
 			}
 
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
-			qdisc_run(q);
+			to_free = qdisc_run(q);
 			if (root_lock)
 				spin_unlock(root_lock);
+			tcf_kfree_skb_list(to_free);
 		}
 
 		rcu_read_unlock();
-- 
cgit v1.2.3


From 96ce2aeb15bd8672ab47abe547e2a1f8ba3886ff Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 21 Nov 2025 11:50:58 -0400
Subject: vfio/pci: Add vfio_pci_dma_buf_iommufd_map()

This function is used to establish the "private interconnect" between the
VFIO DMABUF exporter and the iommufd DMABUF importer. This is intended to
be a temporary API until the core DMABUF interface is improved to natively
support a private interconnect and revocable negotiation.

This function should only be called by iommufd when trying to map a
DMABUF. For now iommufd will only support VFIO DMABUFs.

The following improvements are needed in the DMABUF API to generically
support more exporters with iommufd/kvm type importers that cannot use the
DMA API:

 1) Revoke semantics. VFIO needs to be able to prevent access to the MMIO
    during FLR, and so it will use dma_buf_move_notify() to prevent
    access. iommufd does not support fault handling so it cannot
    implement the full move_notify. Instead if revoke is negotiated the
    exporter promises not to use move_notify() unless the importer can
    experience failures. iommufd will unmap the dmabuf from the iommu page
    tables while it is revoked.

 2) Private interconnect negotiation. iommufd will only be able to map
    a "private interconnect" that provides a phys_addr_t and a
    struct p2pdma_provider * to describe the memory. It cannot use a DMA
    mapped scatterlist since it is directly calling iommu_map().

 3) NULL device during dma_buf_dynamic_attach(). Since iommufd doesn't use
    the DMA API it doesn't have a DMAable struct device to pass here.

Link: https://patch.msgid.link/r/1-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Acked-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/pci/vfio_pci_dmabuf.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/vfio_pci_core.h      |  4 ++++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 6698f540bdac..d4d0f7d08c53 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -82,6 +82,40 @@ static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
 	.release = vfio_pci_dma_buf_release,
 };
 
+/*
+ * This is a temporary "private interconnect" between VFIO DMABUF and iommufd.
+ * It allows the two co-operating drivers to exchange the physical address of
+ * the BAR. This is to be replaced with a formal DMABUF system for negotiated
+ * interconnect types.
+ *
+ * If this function succeeds the following are true:
+ *  - There is one physical range and it is pointing to MMIO
+ *  - When move_notify is called it means revoke, not move, vfio_dma_buf_map
+ *    will fail if it is currently revoked
+ */
+int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+				 struct dma_buf_phys_vec *phys)
+{
+	struct vfio_pci_dma_buf *priv;
+
+	dma_resv_assert_held(attachment->dmabuf->resv);
+
+	if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops)
+		return -EOPNOTSUPP;
+
+	priv = attachment->dmabuf->priv;
+	if (priv->revoked)
+		return -ENODEV;
+
+	/* More than one range to iommufd will require proper DMABUF support */
+	if (priv->nr_ranges != 1)
+		return -EOPNOTSUPP;
+
+	*phys = priv->phys_vec[0];
+	return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd");
+
 int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
 				struct vfio_region_dma_range *dma_ranges,
 				size_t nr_ranges, phys_addr_t start,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index c9466ba323fa..6a3074f2cf1c 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -28,6 +28,7 @@ struct vfio_pci_core_device;
 struct vfio_pci_region;
 struct p2pdma_provider;
 struct dma_buf_phys_vec;
+struct dma_buf_attachment;
 
 struct vfio_pci_regops {
 	ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
@@ -203,4 +204,7 @@ VFIO_IOREAD_DECLARATION(32)
 VFIO_IOREAD_DECLARATION(64)
 #endif
 
+int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+				 struct dma_buf_phys_vec *phys);
+
 #endif /* VFIO_PCI_CORE_H */
-- 
cgit v1.2.3


From a4e6512a79d8486dccf3e8b066e5d6bd5ff95446 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 25 Nov 2025 12:26:42 +0100
Subject: PM: QoS: Introduce a CPU system wakeup QoS limit

Some platforms support multiple low power states for CPUs that can be used
when entering system-wide suspend. Currently we are always selecting the
deepest possible state for the CPUs, which can break the system wakeup
latency constraint that may be required for a use case.

Let's take the first step towards addressing this problem, by introducing
an interface for user space, that allows us to specify the CPU system
wakeup QoS limit. Subsequent changes will start taking into account the new
QoS limit.

Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Kevin Hilman (TI) <khilman@baylibre.com>
Tested-by: Kevin Hilman (TI) <khilman@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/20251125112650.329269-2-ulf.hansson@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_qos.h |   9 +++++
 kernel/power/Kconfig   |  11 +++++
 kernel/power/qos.c     | 106 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+)

(limited to 'include')

diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 4a69d4af3ff8..6cea4455f867 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
 static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
 #endif
 
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+s32 cpu_wakeup_latency_qos_limit(void);
+#else
+static inline s32 cpu_wakeup_latency_qos_limit(void)
+{
+	return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+}
+#endif
+
 #ifdef CONFIG_PM
 enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
 enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 54a623680019..05337f437cca 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC
 	depends on PM_WAKELOCKS
 	default y
 
+config PM_QOS_CPU_SYSTEM_WAKEUP
+	bool "User space interface for CPU system wakeup QoS"
+	depends on CPU_IDLE
+	help
+	  Enable this to allow user space via the cpu_wakeup_latency file to
+	  specify a CPU system wakeup latency limit.
+
+	  This may be particularly useful for platforms supporting multiple low
+	  power states for CPUs during system-wide suspend and s2idle in
+	  particular.
+
 config PM
 	bool "Device power management core functionality"
 	help
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 4244b069442e..f7d8064e9adc 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = {
 	.fops = &cpu_latency_qos_fops,
 };
 
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+/* The CPU system wakeup latency QoS. */
+static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
+	.list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
+	.target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+	.default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+	.no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+	.type = PM_QOS_MIN,
+};
+
+/**
+ * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
+ *
+ * Returns the current CPU system wakeup latency QoS limit that may have been
+ * requested by user space.
+ */
+s32 cpu_wakeup_latency_qos_limit(void)
+{
+	return pm_qos_read_value(&cpu_wakeup_latency_constraints);
+}
+
+static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
+{
+	struct pm_qos_request *req;
+
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	req->qos = &cpu_wakeup_latency_constraints;
+	pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
+			     PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+	filp->private_data = req;
+
+	return 0;
+}
+
+static int cpu_wakeup_latency_qos_release(struct inode *inode,
+					  struct file *filp)
+{
+	struct pm_qos_request *req = filp->private_data;
+
+	filp->private_data = NULL;
+	pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
+			     PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+	kfree(req);
+
+	return 0;
+}
+
+static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf,
+					   size_t count, loff_t *f_pos)
+{
+	s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints);
+
+	return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
+static ssize_t cpu_wakeup_latency_qos_write(struct file *filp,
+					    const char __user *buf,
+					    size_t count, loff_t *f_pos)
+{
+	struct pm_qos_request *req = filp->private_data;
+	s32 value;
+
+	if (count == sizeof(s32)) {
+		if (copy_from_user(&value, buf, sizeof(s32)))
+			return -EFAULT;
+	} else {
+		int ret;
+
+		ret = kstrtos32_from_user(buf, count, 16, &value);
+		if (ret)
+			return ret;
+	}
+
+	if (value < 0)
+		return -EINVAL;
+
+	pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value);
+
+	return count;
+}
+
+static const struct file_operations cpu_wakeup_latency_qos_fops = {
+	.open = cpu_wakeup_latency_qos_open,
+	.release = cpu_wakeup_latency_qos_release,
+	.read = cpu_wakeup_latency_qos_read,
+	.write = cpu_wakeup_latency_qos_write,
+	.llseek = noop_llseek,
+};
+
+static struct miscdevice cpu_wakeup_latency_qos_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "cpu_wakeup_latency",
+	.fops = &cpu_wakeup_latency_qos_fops,
+};
+#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */
+
 static int __init cpu_latency_qos_init(void)
 {
 	int ret;
@@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void)
 		pr_err("%s: %s setup failed\n", __func__,
 		       cpu_latency_qos_miscdev.name);
 
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+	ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
+	if (ret < 0)
+		pr_err("%s: %s setup failed\n", __func__,
+		       cpu_wakeup_latency_qos_miscdev.name);
+#endif
+
 	return ret;
 }
 late_initcall(cpu_latency_qos_init);
-- 
cgit v1.2.3


From 8e7de6dc420979f4e4443807b71dcc8b72d8c4a9 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 25 Nov 2025 12:26:43 +0100
Subject: pmdomain: Respect the CPU system wakeup QoS limit for s2idle

A CPU system wakeup QoS limit may have been requested by user space. To
avoid breaking this constraint when entering a low power state during
s2idle through genpd, let's extend the corresponding genpd governor for
CPUs. More precisely, during s2idle let the genpd governor select a
suitable domain idle state, by taking into account the QoS limit.

Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Kevin Hilman (TI) <khilman@baylibre.com>
Tested-by: Kevin Hilman (TI) <khilman@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/20251125112650.329269-3-ulf.hansson@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pmdomain/core.c     | 10 ++++++++--
 drivers/pmdomain/governor.c | 27 +++++++++++++++++++++++++++
 include/linux/pm_domain.h   |  1 +
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 61c2277c9ce3..4fd546ef0448 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -1425,8 +1425,14 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
 			return;
 	}
 
-	/* Choose the deepest state when suspending */
-	genpd->state_idx = genpd->state_count - 1;
+	if (genpd->gov && genpd->gov->system_power_down_ok) {
+		if (!genpd->gov->system_power_down_ok(&genpd->domain))
+			return;
+	} else {
+		/* Default to the deepest state. */
+		genpd->state_idx = genpd->state_count - 1;
+	}
+
 	if (_genpd_power_off(genpd, false)) {
 		genpd->states[genpd->state_idx].rejected++;
 		return;
diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c
index 39359811a930..bd1b9d66d4a5 100644
--- a/drivers/pmdomain/governor.c
+++ b/drivers/pmdomain/governor.c
@@ -415,9 +415,36 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd)
 	return false;
 }
 
+static bool cpu_system_power_down_ok(struct dev_pm_domain *pd)
+{
+	s64 constraint_ns = cpu_wakeup_latency_qos_limit() * NSEC_PER_USEC;
+	struct generic_pm_domain *genpd = pd_to_genpd(pd);
+	int state_idx = genpd->state_count - 1;
+
+	if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) {
+		genpd->state_idx = state_idx;
+		return true;
+	}
+
+	/* Find the deepest state for the latency constraint. */
+	while (state_idx >= 0) {
+		s64 latency_ns = genpd->states[state_idx].power_off_latency_ns +
+				 genpd->states[state_idx].power_on_latency_ns;
+
+		if (latency_ns <= constraint_ns) {
+			genpd->state_idx = state_idx;
+			return true;
+		}
+		state_idx--;
+	}
+
+	return false;
+}
+
 struct dev_power_governor pm_domain_cpu_gov = {
 	.suspend_ok = default_suspend_ok,
 	.power_down_ok = cpu_power_down_ok,
+	.system_power_down_ok = cpu_system_power_down_ok,
 };
 #endif
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f67a2cb7d781..93ba0143ca47 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -153,6 +153,7 @@ enum genpd_sync_state {
 };
 
 struct dev_power_governor {
+	bool (*system_power_down_ok)(struct dev_pm_domain *domain);
 	bool (*power_down_ok)(struct dev_pm_domain *domain);
 	bool (*suspend_ok)(struct device *dev);
 };
-- 
cgit v1.2.3


From 99b42445f4a4aaff75eca24dfc9e6e376292dd48 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 25 Nov 2025 12:26:45 +0100
Subject: sched: idle: Respect the CPU system wakeup QoS limit for s2idle

A CPU system wakeup QoS limit may have been requested by user space. To
avoid breaking this constraint when entering a low power state during
s2idle, let's start to take into account the QoS limit.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Kevin Hilman (TI) <khilman@baylibre.com>
Tested-by: Kevin Hilman (TI) <khilman@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/20251125112650.329269-5-ulf.hansson@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c | 12 +++++++-----
 include/linux/cpuidle.h   |  6 ++++--
 kernel/sched/idle.c       | 12 +++++++-----
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 56132e843c99..c7876e9e024f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -184,20 +184,22 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
  * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle.
  * @drv: cpuidle driver for the given CPU.
  * @dev: cpuidle device for the given CPU.
+ * @latency_limit_ns: Idle state exit latency limit
  *
  * If there are states with the ->enter_s2idle callback, find the deepest of
  * them and enter it with frozen tick.
  */
-int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+			 u64 latency_limit_ns)
 {
 	int index;
 
 	/*
-	 * Find the deepest state with ->enter_s2idle present, which guarantees
-	 * that interrupts won't be enabled when it exits and allows the tick to
-	 * be frozen safely.
+	 * Find the deepest state with ->enter_s2idle present that meets the
+	 * specified latency limit, which guarantees that interrupts won't be
+	 * enabled when it exits and allows the tick to be frozen safely.
 	 */
-	index = find_deepest_state(drv, dev, U64_MAX, 0, true);
+	index = find_deepest_state(drv, dev, latency_limit_ns, 0, true);
 	if (index > 0) {
 		enter_s2idle_proper(drv, dev, index);
 		local_irq_enable();
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index a9ee4fe55dcf..4073690504a7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 				      struct cpuidle_device *dev,
 				      u64 latency_limit_ns);
 extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv,
-				struct cpuidle_device *dev);
+				struct cpuidle_device *dev,
+				u64 latency_limit_ns);
 extern void cpuidle_use_deepest_state(u64 latency_limit_ns);
 #else
 static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
@@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 					     u64 latency_limit_ns)
 {return -ENODEV; }
 static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv,
-				       struct cpuidle_device *dev)
+				       struct cpuidle_device *dev,
+				       u64 latency_limit_ns)
 {return -ENODEV; }
 static inline void cpuidle_use_deepest_state(u64 latency_limit_ns)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089d4f09..c1c3d0166610 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void)
 }
 
 static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
-			       struct cpuidle_device *dev)
+			       struct cpuidle_device *dev,
+			       u64 max_latency_ns)
 {
 	if (current_clr_polling_and_test())
 		return -EBUSY;
 
-	return cpuidle_enter_s2idle(drv, dev);
+	return cpuidle_enter_s2idle(drv, dev, max_latency_ns);
 }
 
 static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -205,12 +206,13 @@ static void cpuidle_idle_call(void)
 		u64 max_latency_ns;
 
 		if (idle_should_enter_s2idle()) {
+			max_latency_ns = cpu_wakeup_latency_qos_limit() *
+					 NSEC_PER_USEC;
 
-			entered_state = call_cpuidle_s2idle(drv, dev);
+			entered_state = call_cpuidle_s2idle(drv, dev,
+							    max_latency_ns);
 			if (entered_state > 0)
 				goto exit_idle;
-
-			max_latency_ns = U64_MAX;
 		} else {
 			max_latency_ns = dev->forced_idle_latency_limit_ns;
 		}
-- 
cgit v1.2.3


From 35a5c37cb9f1f947dff18e7cfc75a8cfcfd557ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:01 +0100
Subject: cpumask: Cache num_possible_cpus()

Reevaluating num_possible_cpus() over and over does not make sense. That
becomes a constant after init as cpu_possible_mask is marked ro_after_init.

Cache the value during initialization and provide that for consumption.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://patch.msgid.link/20251119172549.578653738@linutronix.de
---
 include/linux/cpumask.h | 10 ++++++++--
 kernel/cpu.c            | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index feba06eb0a42..66694ee8d86e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask;
 #define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
+extern unsigned int __num_possible_cpus;
 
 extern cpumask_t cpus_booted_once_mask;
 
@@ -1152,13 +1153,13 @@ void init_cpu_possible(const struct cpumask *src);
 #define __assign_cpu(cpu, mask, val)	\
 	__assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))
 
-#define set_cpu_possible(cpu, possible)	assign_cpu((cpu), &__cpu_possible_mask, (possible))
 #define set_cpu_enabled(cpu, enabled)	assign_cpu((cpu), &__cpu_enabled_mask, (enabled))
 #define set_cpu_present(cpu, present)	assign_cpu((cpu), &__cpu_present_mask, (present))
 #define set_cpu_active(cpu, active)	assign_cpu((cpu), &__cpu_active_mask, (active))
 #define set_cpu_dying(cpu, dying)	assign_cpu((cpu), &__cpu_dying_mask, (dying))
 
 void set_cpu_online(unsigned int cpu, bool online);
+void set_cpu_possible(unsigned int cpu, bool possible);
 
 /**
  * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
@@ -1211,7 +1212,12 @@ static __always_inline unsigned int num_online_cpus(void)
 {
 	return raw_atomic_read(&__num_online_cpus);
 }
-#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
+
+static __always_inline unsigned int num_possible_cpus(void)
+{
+	return __num_possible_cpus;
+}
+
 #define num_enabled_cpus()	cpumask_weight(cpu_enabled_mask)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
 #define num_active_cpus()	cpumask_weight(cpu_active_mask)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index db9f6c539b28..b674fdf96208 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits);
 #ifdef CONFIG_INIT_ALL_POSSIBLE
 struct cpumask __cpu_possible_mask __ro_after_init
 	= {CPU_BITS_ALL};
+unsigned int __num_possible_cpus __ro_after_init = NR_CPUS;
 #else
 struct cpumask __cpu_possible_mask __ro_after_init;
+unsigned int __num_possible_cpus __ro_after_init;
 #endif
 EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
 
 struct cpumask __cpu_online_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_online_mask);
@@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src)
 void init_cpu_possible(const struct cpumask *src)
 {
 	cpumask_copy(&__cpu_possible_mask, src);
+	__num_possible_cpus = cpumask_weight(&__cpu_possible_mask);
 }
 
 void set_cpu_online(unsigned int cpu, bool online)
@@ -3139,6 +3143,21 @@ void set_cpu_online(unsigned int cpu, bool online)
 	}
 }
 
+/*
+ * This should be marked __init, but there is a boatload of call sites
+ * which need to be fixed up to do so. Sigh...
+ */
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+	if (possible) {
+		if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+			__num_possible_cpus++;
+	} else {
+		if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+			__num_possible_cpus--;
+	}
+}
+
 /*
  * Activate the first processor.
  */
-- 
cgit v1.2.3


From 539115f08cf850b9fdc6526b31da0839ff6c1631 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:03 +0100
Subject: sched/mmcid: Convert mm CID mask to a bitmap

This is truly a bitmap and just conveniently uses a cpumask because the
maximum size of the bitmap is nr_cpu_ids.

But that prevents searching for a zero bit in a limited range, which is
helpful for providing an efficient mechanism to consolidate the CID space
when the number of users decreases.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Link: https://patch.msgid.link/20251119172549.642866767@linutronix.de
---
 include/linux/mm_types.h | 9 +++++----
 kernel/sched/core.c      | 2 +-
 kernel/sched/sched.h     | 6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 67a7bdf772f7..bafb81b33922 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1342,13 +1342,13 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
 }
 
 /* Accessor for struct mm_struct's cidmask. */
-static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+static inline unsigned long *mm_cidmask(struct mm_struct *mm)
 {
 	unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
 
 	/* Skip mm_cpus_allowed */
 	cid_bitmap += cpumask_size();
-	return (struct cpumask *)cid_bitmap;
+	return (unsigned long *)cid_bitmap;
 }
 
 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
@@ -1363,7 +1363,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
 	raw_spin_lock_init(&mm->mm_cid.lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
-	cpumask_clear(mm_cidmask(mm));
+	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
 }
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
@@ -1384,7 +1384,8 @@ static inline void mm_destroy_cid(struct mm_struct *mm)
 
 static inline unsigned int mm_cid_size(void)
 {
-	return 2 * cpumask_size();	/* mm_cpus_allowed(), mm_cidmask(). */
+	/* mm_cpus_allowed(), mm_cidmask(). */
+	return cpumask_size() + bitmap_size(num_possible_cpus());
 }
 
 #else /* CONFIG_SCHED_MM_CID */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f6bbfa1f5c15..9a114b6f6a6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10402,7 +10402,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 	guard(preempt)();
 	t->mm_cid.active = 0;
 	if (t->mm_cid.cid != MM_CID_UNSET) {
-		cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm));
+		clear_bit(t->mm_cid.cid, mm_cidmask(mm));
 		t->mm_cid.cid = MM_CID_UNSET;
 	}
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a17f04f075e1..31f2e431db5e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3559,7 +3559,7 @@ static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigne
 
 	if (cid >= max_cids)
 		return false;
-	if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
+	if (test_and_set_bit(cid, mm_cidmask(mm)))
 		return false;
 	t->mm_cid.cid = t->mm_cid.last_cid = cid;
 	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
@@ -3582,7 +3582,7 @@ static inline bool mm_cid_get(struct task_struct *t)
 		return true;
 
 	/* Try the first zero bit in the cidmask. */
-	return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids);
+	return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids);
 }
 
 static inline void mm_cid_select(struct task_struct *t)
@@ -3603,7 +3603,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
 {
 	if (prev->mm_cid.active) {
 		if (prev->mm_cid.cid != MM_CID_UNSET)
-			cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm));
+			clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
 		prev->mm_cid.cid = MM_CID_UNSET;
 	}
 
-- 
cgit v1.2.3


From 2b1642b881088bbf73fcb1147c474a198ec46729 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:05 +0100
Subject: signal: Move MMCID exit out of sighand lock

There is no need anymore to keep this under sighand lock as the current
code and the upcoming replacement are not depending on the exit state of a
task anymore.

That allows using a mutex in the exit path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.706439391@linutronix.de
---
 include/linux/sched.h | 4 ++--
 kernel/exit.c         | 1 +
 kernel/sched/core.c   | 4 ++--
 kernel/signal.c       | 2 --
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 64f080d6ed6e..c411ae021bc5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2298,7 +2298,7 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
 void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
 void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
 static inline int task_mm_cid(struct task_struct *t)
 {
 	return t->mm_cid.cid;
@@ -2307,7 +2307,7 @@ static inline int task_mm_cid(struct task_struct *t)
 static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
 static inline int task_mm_cid(struct task_struct *t)
 {
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..324616f690b7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -910,6 +910,7 @@ void __noreturn do_exit(long code)
 	user_events_exit(tsk);
 
 	io_uring_files_cancel();
+	sched_mm_cid_exit(tsk);
 	exit_signals(tsk);  /* sets PF_EXITING */
 
 	seccomp_filter_release(tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a114b6f6a6f..3fdf90a7074d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10392,7 +10392,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
 }
 
-void sched_mm_cid_exit_signals(struct task_struct *t)
+void sched_mm_cid_exit(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
 
@@ -10410,7 +10410,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 /* Deactivate MM CID allocation across execve() */
 void sched_mm_cid_before_execve(struct task_struct *t)
 {
-	sched_mm_cid_exit_signals(t);
+	sched_mm_cid_exit(t);
 }
 
 /* Reactivate MM CID after successful execve() */
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..e42b8bd6922f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk)
 	cgroup_threadgroup_change_begin(tsk);
 
 	if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
-		sched_mm_cid_exit_signals(tsk);
 		tsk->flags |= PF_EXITING;
 		cgroup_threadgroup_change_end(tsk);
 		return;
@@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk)
 	 * From now this task is not visible for group-wide signals,
 	 * see wants_signal(), do_signal_stop().
 	 */
-	sched_mm_cid_exit_signals(tsk);
 	tsk->flags |= PF_EXITING;
 
 	cgroup_threadgroup_change_end(tsk);
-- 
cgit v1.2.3


From bf070520e398679cd582b3c3e44107bf22c143ba Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:07 +0100
Subject: sched/mmcid: Move initialization out of line

It's getting bigger soon, so just move it out of line to the rest of the
code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.769636491@linutronix.de
---
 include/linux/mm_types.h | 15 +--------------
 kernel/sched/core.c      | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bafb81b33922..3b7d05e7169c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1351,20 +1351,7 @@ static inline unsigned long *mm_cidmask(struct mm_struct *mm)
 	return (unsigned long *)cid_bitmap;
 }
 
-static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
-{
-	int i;
-
-	for_each_possible_cpu(i) {
-		struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i);
-
-		pcpu->cid = MM_CID_UNSET;
-	}
-	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
-	raw_spin_lock_init(&mm->mm_cid.lock);
-	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
-	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
-}
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p);
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3fdf90a7074d..34b6c31eca3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10431,6 +10431,20 @@ void sched_mm_cid_fork(struct task_struct *t)
 	WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
 	t->mm_cid.active = 1;
 }
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+	struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+
+	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+	raw_spin_lock_init(&mm->mm_cid.lock);
+	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
+}
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
 #endif /* !CONFIG_SCHED_MM_CID */
-- 
cgit v1.2.3


From b0c3d51b54f8a4f4c809432d210c0c983d5cd97e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:09 +0100
Subject: sched/mmcid: Provide precomputed maximal value

Reading mm::mm_users and mm::mm_cid::nr_cpus_allowed every time to compute
the maximal CID value is just wasteful as that value is only changing on
fork(), exit() and eventually when the affinity changes.

So it can be easily precomputed at those points and provided in mm::mm_cid
for consumption in the hot path.

But there is an issue with using mm::mm_users for accounting because that
does not necessarily reflect the number of user space tasks as other kernel
code can take temporary references on the MM which skew the picture.

Solve that by adding a users counter to struct mm_mm_cid, which is modified
by fork() and exit() and used for precomputing under mm_mm_cid::lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.832764634@linutronix.de
---
 include/linux/rseq_types.h |  6 +++++
 kernel/fork.c              |  1 +
 kernel/sched/core.c        | 59 +++++++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h       |  3 +--
 4 files changed, 50 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index d7e8071b626a..0fab369999b6 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -117,14 +117,20 @@ struct mm_cid_pcpu {
 /**
  * struct mm_mm_cid - Storage for per MM CID data
  * @pcpu:		Per CPU storage for CIDs associated to a CPU
+ * @max_cids:		The exclusive maximum CID value for allocation and convergence
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
  *			is growth only.
+ * @users:		The number of tasks sharing this MM. Separate from mm::mm_users
+ *			as that is modified by mmget()/mmput() by other entities which
+ *			do not actually share the MM.
  * @lock:		Spinlock to protect all fields except @pcpu. It also protects
  *			the MM cid cpumask and the MM cidmask bitmap.
  */
 struct mm_mm_cid {
 	struct mm_cid_pcpu	__percpu *pcpu;
+	unsigned int		max_cids;
 	unsigned int		nr_cpus_allowed;
+	unsigned int		users;
 	raw_spinlock_t		lock;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
diff --git a/kernel/fork.c b/kernel/fork.c
index 74bc7c9f1bb3..6c23219e1169 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2455,6 +2455,7 @@ bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
+		sched_mm_cid_exit(p);
 		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34b6c31eca3a..f9295c42da22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4485,7 +4485,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
 	init_numa_balancing(clone_flags, p);
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
-	init_sched_mm_cid(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -10371,15 +10370,27 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
 
 #ifdef CONFIG_SCHED_MM_CID
 /*
- * When a task exits, the MM CID held by the task is not longer required as
- * the task cannot return to user space.
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
  */
+static void mm_update_max_cids(struct mm_struct *mm)
+{
+	struct mm_mm_cid *mc = &mm->mm_cid;
+	unsigned int max_cids;
+
+	lockdep_assert_held(&mm->mm_cid.lock);
+
+	/* Calculate the new maximum constraint */
+	max_cids = min(mc->nr_cpus_allowed, mc->users);
+	WRITE_ONCE(mc->max_cids, max_cids);
+}
+
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
 {
 	struct cpumask *mm_allowed;
 	unsigned int weight;
 
-	if (!mm)
+	if (!mm || !READ_ONCE(mm->mm_cid.users))
 		return;
 
 	/*
@@ -10389,9 +10400,30 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 	guard(raw_spinlock)(&mm->mm_cid.lock);
 	mm_allowed = mm_cpus_allowed(mm);
 	weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+	if (weight == mm->mm_cid.nr_cpus_allowed)
+		return;
 	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
+	mm_update_max_cids(mm);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+
+	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+
+	guard(raw_spinlock)(&mm->mm_cid.lock);
+	t->mm_cid.active = 1;
+	mm->mm_cid.users++;
+	/* Preset last_cid for mm_cid_select() */
+	t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
+	mm_update_max_cids(mm);
 }
 
+/*
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
+ */
 void sched_mm_cid_exit(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
@@ -10399,12 +10431,14 @@ void sched_mm_cid_exit(struct task_struct *t)
 	if (!mm || !t->mm_cid.active)
 		return;
 
-	guard(preempt)();
+	guard(raw_spinlock)(&mm->mm_cid.lock);
 	t->mm_cid.active = 0;
+	mm->mm_cid.users--;
 	if (t->mm_cid.cid != MM_CID_UNSET) {
 		clear_bit(t->mm_cid.cid, mm_cidmask(mm));
 		t->mm_cid.cid = MM_CID_UNSET;
 	}
+	mm_update_max_cids(mm);
 }
 
 /* Deactivate MM CID allocation across execve() */
@@ -10416,22 +10450,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 /* Reactivate MM CID after successful execve() */
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
-	struct mm_struct *mm = t->mm;
-
-	if (!mm)
-		return;
-
+	sched_mm_cid_fork(t);
 	guard(preempt)();
-	t->mm_cid.active = 1;
 	mm_cid_select(t);
 }
 
-void sched_mm_cid_fork(struct task_struct *t)
-{
-	WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
-	t->mm_cid.active = 1;
-}
-
 void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
 	struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
@@ -10440,7 +10463,9 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	for_each_possible_cpu(cpu)
 		per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
 
+	mm->mm_cid.max_cids = 0;
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+	mm->mm_cid.users = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31f2e431db5e..d539fb269957 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3571,7 +3571,7 @@ static inline bool mm_cid_get(struct task_struct *t)
 	struct mm_struct *mm = t->mm;
 	unsigned int max_cids;
 
-	max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
+	max_cids = READ_ONCE(mm->mm_cid.max_cids);
 
 	/* Try to reuse the last CID of this task */
 	if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
@@ -3614,7 +3614,6 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void init_sched_mm_cid(struct task_struct *t) { }
 static inline void mm_cid_select(struct task_struct *t) { }
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
-- 
cgit v1.2.3


From 51dd92c71a38647803478fb81e1812286a8998b1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:11 +0100
Subject: sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex

Prepare for the new CID management scheme, which puts the CID ownership
transition into the fork() and exit() slow path, by serializing
sched_mm_cid_fork()/exit() with a mutex, so task list and CPU mask walks can
be done in interruptible and preemptible code.

The contention on it is not worse than on other concurrency controls in the
fork()/exit() machinery.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.895826703@linutronix.de
---
 include/linux/rseq_types.h |  2 ++
 kernel/sched/core.c        | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 0fab369999b6..574aba6fe97c 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -125,6 +125,7 @@ struct mm_cid_pcpu {
  *			do not actually share the MM.
  * @lock:		Spinlock to protect all fields except @pcpu. It also protects
  *			the MM cid cpumask and the MM cidmask bitmap.
+ * @mutex:		Mutex to serialize forks and exits related to this mm
  */
 struct mm_mm_cid {
 	struct mm_cid_pcpu	__percpu *pcpu;
@@ -132,6 +133,7 @@ struct mm_mm_cid {
 	unsigned int		nr_cpus_allowed;
 	unsigned int		users;
 	raw_spinlock_t		lock;
+	struct mutex		mutex;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9295c42da22..01903cf03ab2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10369,6 +10369,25 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
 }
 
 #ifdef CONFIG_SCHED_MM_CID
+/*
+ * Concurrency IDentifier management
+ *
+ * Serialization rules:
+ *
+ * mm::mm_cid::mutex:	Serializes fork() and exit() and therefore
+ *			protects mm::mm_cid::users.
+ *
+ * mm::mm_cid::lock:	Serializes mm_update_max_cids() and
+ *			mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ *			and runqueue lock.
+ *
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
+ *
+ * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue
+ * lock.
+ */
+
 /*
  * Update the CID range properties when the constraints change. Invoked via
  * fork(), exit() and affinity changes
@@ -10412,6 +10431,7 @@ void sched_mm_cid_fork(struct task_struct *t)
 
 	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
 
+	guard(mutex)(&mm->mm_cid.mutex);
 	guard(raw_spinlock)(&mm->mm_cid.lock);
 	t->mm_cid.active = 1;
 	mm->mm_cid.users++;
@@ -10431,6 +10451,7 @@ void sched_mm_cid_exit(struct task_struct *t)
 	if (!mm || !t->mm_cid.active)
 		return;
 
+	guard(mutex)(&mm->mm_cid.mutex);
 	guard(raw_spinlock)(&mm->mm_cid.lock);
 	t->mm_cid.active = 0;
 	mm->mm_cid.users--;
@@ -10467,6 +10488,7 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
 	mm->mm_cid.users = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
+	mutex_init(&mm->mm_cid.mutex);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
 }
-- 
cgit v1.2.3


From 23343b6b09acb4bf97f34ed60e135000ca57ede1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:12 +0100
Subject: sched/mmcid: Introduce per task/CPU ownership infrastructure

The MM CID management has two fundamental requirements:

  1) It has to guarantee that at no given point in time the same CID is
     used by concurrent tasks in userspace.

  2) The CID space must not exceed the number of possible CPUs in a
     system. While most allocators (glibc, tcmalloc, jemalloc) do not care
     about that, there seems to be at least librseq depending on it.

The CID space compaction itself is not a functional correctness
requirement; it is only a useful optimization mechanism to reduce the
memory footprint in unused user space pools.

The optimal CID space is:

    min(nr_tasks, nr_cpus_allowed);

Where @nr_tasks is the number of actual user space threads associated with
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth only, as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.

That means that as long as the number of tasks is lower than or equal to
the number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed, it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.

For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.

The current upstream implementation adds overhead into task migration to
keep the CID with the task. It also has to do the CID space consolidation
work from a task work in the exit to user space path. As that work is
assigned to a random task related to a MM this can inflict unwanted exit
latencies.

This can be done differently by implementing a strict CID ownership
mechanism. Either the CIDs are owned by the tasks or by the CPUs. The
latter provides less locality when tasks are heavily migrating, but there
is no justification to optimize for overcommit scenarios and thereby
penalizing everyone else.

Provide the basic infrastructure to implement this:

  - Change the UNSET marker to BIT(31) from ~0U
  - Add the ONCPU marker as BIT(30)
  - Add the TRANSIT marker as BIT(29)

That allows checking for ownership trivially and provides a simple check
for UNSET as well. The TRANSIT marker is required to prevent CID space
exhaustion when switching from per CPU to per task mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251119172549.960252358@linutronix.de
---
 include/linux/rseq_types.h |  4 +++-
 include/linux/sched.h      |  6 ++---
 kernel/sched/core.c        | 10 ++++++++
 kernel/sched/sched.h       | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 574aba6fe97c..87854effe1ad 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -92,7 +92,9 @@ struct rseq_data { };
 
 #ifdef CONFIG_SCHED_MM_CID
 
-#define MM_CID_UNSET	(~0U)
+#define MM_CID_UNSET	BIT(31)
+#define MM_CID_ONCPU	BIT(30)
+#define MM_CID_TRANSIT	BIT(29)
 
 /**
  * struct sched_mm_cid - Storage for per task MM CID data
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c411ae021bc5..9eec409745f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2299,16 +2299,16 @@ void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
 void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
+static __always_inline int task_mm_cid(struct task_struct *t)
 {
-	return t->mm_cid.cid;
+	return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
 }
 #else
 static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_fork(struct task_struct *t) { }
 static inline void sched_mm_cid_exit(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
+static __always_inline int task_mm_cid(struct task_struct *t)
 {
 	/*
 	 * Use the processor id as a fall-back when the mm cid feature is
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 01903cf03ab2..55bb9c9ae32c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10386,6 +10386,16 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  *
  * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue
  * lock.
+ *
+ * CID ownership:
+ *
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out.  Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
  */
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d539fb269957..4b49284504fb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3540,6 +3540,65 @@ extern void sched_dynamic_update(int mode);
 extern const char *preempt_modes[];
 
 #ifdef CONFIG_SCHED_MM_CID
+
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+	return cid & MM_CID_ONCPU;
+}
+
+static __always_inline bool cid_in_transit(unsigned int cid)
+{
+	return cid & MM_CID_TRANSIT;
+}
+
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+	return cid & ~MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
+{
+	return cid | MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cid_to_transit_cid(unsigned int cid)
+{
+	return cid | MM_CID_TRANSIT;
+}
+
+static __always_inline unsigned int cid_from_transit_cid(unsigned int cid)
+{
+	return cid & ~MM_CID_TRANSIT;
+}
+
+static __always_inline bool cid_on_task(unsigned int cid)
+{
+	/* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */
+	return cid < MM_CID_TRANSIT;
+}
+
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+	clear_bit(cid, mm_cidmask(mm));
+}
+
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
+{
+	unsigned int cid = t->mm_cid.cid;
+
+	t->mm_cid.cid = MM_CID_UNSET;
+	if (cid_on_task(cid))
+		mm_drop_cid(t->mm, cid);
+}
+
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
+{
+	/* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+	pcp->cid = cpu_cid_to_cid(pcp->cid);
+	mm_drop_cid(mm, pcp->cid);
+}
+
+/* Active implementation */
 static inline void init_sched_mm_cid(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-- 
cgit v1.2.3


From 9a723ed7facff6955da8d64cc9de7066038036c1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:14 +0100
Subject: sched/mmcid: Provide new scheduler CID mechanism

The MM CID management has two fundamental requirements:

  1) It has to guarantee that at no given point in time the same CID is
     used by concurrent tasks in userspace.

  2) The CID space must not exceed the number of possible CPUs in a
     system. While most allocators (glibc, tcmalloc, jemalloc) do not
     care about that, there seems to be at least some LTTng library
     depending on it.

The CID space compaction itself is not a functional correctness
requirement; it is only a useful optimization mechanism to reduce the
memory footprint in unused user space pools.

The optimal CID space is:

    min(nr_tasks, nr_cpus_allowed);

Where @nr_tasks is the number of actual user space threads associated with
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth only, as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.

That means that as long as the number of tasks is lower than or equal to
the number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed, it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.

For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.

The current upstream implementation adds overhead into task migration to
keep the CID with the task. It also has to do the CID space consolidation
work from a task work in the exit to user space path. As that work is
assigned to a random task related to a MM this can inflict unwanted exit
latencies.

Implement the context switch parts of a strict ownership mechanism to
address this.

This removes most of the work from the task which schedules out. Only
during the transition from per CPU to per task ownership is it required to
drop the CID when leaving the CPU to prevent CID space exhaustion. Other
than that, scheduling out is just a single check and branch.

The task which schedules in has to check whether:

    1) The ownership mode changed
    2) The CID is within the optimal CID space

In stable situations this results in zero work. The only short disruption
is when ownership mode changes or when the associated CID is not in the
optimal CID space. The latter only happens when tasks exit and therefore
the optimal CID space shrinks.

That mechanism is strictly optimized for the common case where no change
happens. The only case where it actually causes a temporary one time spike
is on mode changes when and only when a lot of tasks related to a MM
schedule exactly at the same time and have eventually to compete on
allocating a CID from the bitmap.

In the sysbench test case which triggered the spinlock contention in the
initial CID code, __schedule() drops significantly in perf top on a 128
Core (256 threads) machine when running sysbench with 255 threads, which
fits into the task mode limit of 256 together with the parent thread:

  Upstream  rseq/perf branch  +CID rework
  0.42%     0.37%             0.32%          [k] __schedule

Increasing the number of threads to 256, which puts the test process into
per CPU mode, looks about the same.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.023984859@linutronix.de
---
 include/linux/rseq.h       |   8 +--
 include/linux/rseq_types.h |  18 ++++--
 kernel/sched/core.c        |   2 +
 kernel/sched/sched.h       | 150 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 168 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bf8a6bf315f3..4c0e8bdd2dd9 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -73,13 +73,13 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
 }
 
 /*
- * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
- * update.
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
  *
  * This does not raise TIF_NOTIFY_RESUME as that happens in
  * rseq_sched_switch_event().
  */
-static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
 {
 	t->rseq.event.ids_changed = true;
 }
@@ -168,7 +168,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
-static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
 static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
 static inline void rseq_force_update(void) { }
 static inline void rseq_virt_userspace_exit(void) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 87854effe1ad..66b1482e1146 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -119,23 +119,31 @@ struct mm_cid_pcpu {
 /**
  * struct mm_mm_cid - Storage for per MM CID data
  * @pcpu:		Per CPU storage for CIDs associated to a CPU
+ * @percpu:		Set, when CIDs are in per CPU mode
+ * @transit:		Set to MM_CID_TRANSIT during a mode change transition phase
  * @max_cids:		The exclusive maximum CID value for allocation and convergence
+ * @lock:		Spinlock to protect all fields except @pcpu. It also protects
+ *			the MM cid cpumask and the MM cidmask bitmap.
+ * @mutex:		Mutex to serialize forks and exits related to this mm
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
  *			is growth only.
  * @users:		The number of tasks sharing this MM. Separate from mm::mm_users
  *			as that is modified by mmget()/mm_put() by other entities which
  *			do not actually share the MM.
- * @lock:		Spinlock to protect all fields except @pcpu. It also protects
- *			the MM cid cpumask and the MM cidmask bitmap.
- * @mutex:		Mutex to serialize forks and exits related to this mm
  */
 struct mm_mm_cid {
+	/* Hotpath read mostly members */
 	struct mm_cid_pcpu	__percpu *pcpu;
+	unsigned int		percpu;
+	unsigned int		transit;
 	unsigned int		max_cids;
-	unsigned int		nr_cpus_allowed;
-	unsigned int		users;
+
 	raw_spinlock_t		lock;
 	struct mutex		mutex;
+
+	/* Low frequency modified */
+	unsigned int		nr_cpus_allowed;
+	unsigned int		users;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55bb9c9ae32c..659ae56b459f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10495,6 +10495,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 		per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
 
 	mm->mm_cid.max_cids = 0;
+	mm->mm_cid.percpu = 0;
+	mm->mm_cid.transit = 0;
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
 	mm->mm_cid.users = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4b49284504fb..82c7978d548e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2209,7 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	smp_wmb();
 	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
 	p->wake_cpu = cpu;
-	rseq_sched_set_task_cpu(p, cpu);
+	rseq_sched_set_ids_changed(p);
 #endif /* CONFIG_SMP */
 }
 
@@ -3598,6 +3598,153 @@ static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_c
 	mm_drop_cid(mm, pcp->cid);
 }
 
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+	unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+
+	if (cid >= max_cids)
+		return MM_CID_UNSET;
+	if (test_and_set_bit(cid, mm_cidmask(mm)))
+		return MM_CID_UNSET;
+	return cid;
+}
+
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
+{
+	unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
+
+	while (cid == MM_CID_UNSET) {
+		cpu_relax();
+		cid = __mm_get_cid(mm, num_possible_cpus());
+	}
+	return cid;
+}
+
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+					   unsigned int max_cids)
+{
+	unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
+
+	/* Is it in the optimal CID space? */
+	if (likely(cid < max_cids))
+		return orig_cid;
+
+	/* Try to find one in the optimal space. Otherwise keep the provided. */
+	new_cid = __mm_get_cid(mm, max_cids);
+	if (new_cid != MM_CID_UNSET) {
+		mm_drop_cid(mm, cid);
+		/* Preserve the ONCPU mode of the original CID */
+		return new_cid | (orig_cid & MM_CID_ONCPU);
+	}
+	return orig_cid;
+}
+
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+	if (t->mm_cid.cid != cid) {
+		t->mm_cid.cid = cid;
+		rseq_sched_set_ids_changed(t);
+	}
+}
+
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+}
+
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+{
+	unsigned int max_cids, tcid = t->mm_cid.cid;
+	struct mm_struct *mm = t->mm;
+
+	max_cids = READ_ONCE(mm->mm_cid.max_cids);
+	/* Optimize for the common case where both have the ONCPU bit set */
+	if (likely(cid_on_cpu(cpu_cid & tcid))) {
+		if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+			mm_cid_update_task_cid(t, cpu_cid);
+			return;
+		}
+		/* Try to converge into the optimal CID space */
+		cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+	} else {
+		/* Hand over or drop the task owned CID */
+		if (cid_on_task(tcid)) {
+			if (cid_on_cpu(cpu_cid))
+				mm_unset_cid_on_task(t);
+			else
+				cpu_cid = cid_to_cpu_cid(tcid);
+		}
+		/* Still nothing, allocate a new one */
+		if (!cid_on_cpu(cpu_cid))
+			cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+	}
+	mm_cid_update_pcpu_cid(mm, cpu_cid);
+	mm_cid_update_task_cid(t, cpu_cid);
+}
+
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+{
+	unsigned int max_cids, tcid = t->mm_cid.cid;
+	struct mm_struct *mm = t->mm;
+
+	max_cids = READ_ONCE(mm->mm_cid.max_cids);
+	/* Optimize for the common case, where both have the ONCPU bit clear */
+	if (likely(cid_on_task(tcid | cpu_cid))) {
+		if (likely(tcid < max_cids)) {
+			mm_cid_update_pcpu_cid(mm, tcid);
+			return;
+		}
+		/* Try to converge into the optimal CID space */
+		tcid = mm_cid_converge(mm, tcid, max_cids);
+	} else {
+		/* Hand over or drop the CPU owned CID */
+		if (cid_on_cpu(cpu_cid)) {
+			if (cid_on_task(tcid))
+				mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+			else
+				tcid = cpu_cid_to_cid(cpu_cid);
+		}
+		/* Still nothing, allocate a new one */
+		if (!cid_on_task(tcid))
+			tcid = mm_get_cid(mm);
+		/* Set the transition mode flag if required */
+		tcid |= READ_ONCE(mm->mm_cid.transit);
+	}
+	mm_cid_update_pcpu_cid(mm, tcid);
+	mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+	struct mm_struct *mm = next->mm;
+	unsigned int cpu_cid;
+
+	if (!next->mm_cid.active)
+		return;
+
+	cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+	if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+		mm_cid_from_task(next, cpu_cid);
+	else
+		mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+	/* During mode transitions CIDs are temporary and need to be dropped */
+	if (likely(!cid_in_transit(prev->mm_cid.cid)))
+		return;
+
+	mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+	prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+	mm_cid_schedout(prev);
+	mm_cid_schedin(next);
+}
+
 /* Active implementation */
 static inline void init_sched_mm_cid(struct task_struct *t)
 {
@@ -3675,6 +3822,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
 #else /* !CONFIG_SCHED_MM_CID: */
 static inline void mm_cid_select(struct task_struct *t) { }
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
-- 
cgit v1.2.3


From fbd0e71dc370af73f6b316e4de9eed273dd90340 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:16 +0100
Subject: sched/mmcid: Provide CID ownership mode fixup functions

CIDs are either owned by tasks or by CPUs. The ownership mode depends on
the number of tasks related to a MM and the number of CPUs on which these
tasks are theoretically allowed to run. Theoretically, because that
number is the superset of CPU affinities of all tasks, which only grows and
never shrinks.

Switching to per CPU mode happens when the user count becomes greater than
the maximum number of CIDs, which is calculated by:

	opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
	max_cids = min(1.25 * opt_cids, nr_cpu_ids);

The +25% allowance is useful for tight CPU masks in scenarios where only a
few threads are created and destroyed, to avoid frequent mode switches.
This allowance shrinks, though, the closer opt_cids gets to nr_cpu_ids,
which is the (unfortunate) hard ABI limit.

At the point of switching to per CPU mode the new user is not yet visible
in the system, so the task which initiated the fork() runs the fixup
function: mm_cid_fixup_tasks_to_cpus() walks the thread list and either
transfers each task's owned CID to the CPU the task runs on or drops it
into the CID pool if a task is not on a CPU at that point in time. Tasks
which schedule in before the task walk reaches them do the handover in
mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes, it's
guaranteed that no task related to that MM owns a CID anymore.

Switching back to task mode happens when the user count goes below the
threshold which was recorded on the per CPU mode switch:

	pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2);

This threshold is updated when an affinity change increases the number of
allowed CPUs for the MM, which might cause a switch back to per task mode.

If the switch back was initiated by an exiting task, then that task runs
the fixup function. If it was initiated by an affinity change, then it's
run either in the deferred update function in the context of a workqueue,
by a task which forks a new one, or by a task which exits, whichever
happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible CPUs
and either transfers each CPU owned CID to a related task which runs on
that CPU or drops it into the pool. Tasks which schedule in on a CPU which
the walk did not cover yet do the handover themselves.

This transition from CPU to per task ownership happens in two phases:

 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task
    CID and denotes that the CID is only temporarily owned by the
    task. When it schedules out the task drops the CID back into the
    pool if this bit is set.

 2) The initiating context walks the per CPU space and after completion
    clears mm:mm_cid.transit. After that point the CIDs are strictly
    task owned again.

This two phase transition is required to prevent CID space exhaustion
during the transition as a direct transfer of ownership would fail if
two tasks are scheduled in on the same CPU before the fixup freed per
CPU CIDs.

When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
related to that MM is owned by a CPU anymore.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.088189028@linutronix.de
---
 include/linux/rseq_types.h |   7 +-
 kernel/sched/core.c        | 278 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 259 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 66b1482e1146..a3a4f3f10862 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -122,14 +122,15 @@ struct mm_cid_pcpu {
  * @percpu:		Set, when CIDs are in per CPU mode
  * @transit:		Set to MM_CID_TRANSIT during a mode change transition phase
  * @max_cids:		The exclusive maximum CID value for allocation and convergence
- * @lock:		Spinlock to protect all fields except @pcpu. It also protects
- *			the MM cid cpumask and the MM cidmask bitmap.
+ * @lock:		Spinlock to protect against affinity setting which can't take @mutex
  * @mutex:		Mutex to serialize forks and exits related to this mm
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
  *			is growth only.
  * @users:		The number of tasks sharing this MM. Separate from mm::mm_users
  *			as that is modified by mmget()/mm_put() by other entities which
  *			do not actually share the MM.
+ * @pcpu_thrs:		Threshold for switching back from per CPU mode
+ * @update_deferred:	A deferred switch back to per task mode is pending.
  */
 struct mm_mm_cid {
 	/* Hotpath read mostly members */
@@ -144,6 +145,8 @@ struct mm_mm_cid {
 	/* Low frequency modified */
 	unsigned int		nr_cpus_allowed;
 	unsigned int		users;
+	unsigned int		pcpu_thrs;
+	unsigned int		update_deferred;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 659ae56b459f..eb0d59df8acc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10396,43 +10396,270 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  * task needs to drop the CID into the pool when scheduling out.  Both bits
  * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
  * actually handed over to user space in the RSEQ memory.
+ *
+ * Mode switching:
+ *
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
+ *
+ *	opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ *	max_cids = min(1.25 * opt_cids, num_possible_cpus());
+ *
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed to avoid frequent mode
+ * switches. Though this allowance shrinks, the closer opt_cids becomes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
+ *
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either transfers each task's owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
+ *
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
+ *
+ *	pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
+ *
+ * This threshold is updated when an affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
+ *
+ * If the switch back was initiated by an exiting task, then that task runs
+ * the fixup function. If it was initiated by an affinity change, then it's
+ * run either in the deferred update function in context of a workqueue or
+ * by a task which forks a new one or by a task which exits. Whatever
+ * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible
+ * CPUs and either transfers the CPU owned CIDs to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themselves.
+ *
+ * This transition from CPU to per task ownership happens in two phases:
+ *
+ *  1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task
+ *     CID and denotes that the CID is only temporarily owned by the
+ *     task. When it schedules out the task drops the CID back into the
+ *     pool if this bit is set.
+ *
+ *  2) The initiating context walks the per CPU space and after completion
+ *     clears mm:mm_cid.transit. So after that point the CIDs are strictly
+ *     task owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
+ *
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
  */
 
 /*
  * Update the CID range properties when the constraints change. Invoked via
  * fork(), exit() and affinity changes
  */
-static void mm_update_max_cids(struct mm_struct *mm)
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
+{
+	unsigned int opt_cids, max_cids;
+
+	/* Calculate the new optimal constraint */
+	opt_cids = min(mc->nr_cpus_allowed, mc->users);
+
+	/* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+	max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+	WRITE_ONCE(mc->max_cids, max_cids);
+}
+
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+	unsigned int opt_cids;
+
+	opt_cids = min(mc->nr_cpus_allowed, mc->users);
+	/* Has to be at least 1 because 0 indicates PCPU mode off */
+	return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
+}
+
+static bool mm_update_max_cids(struct mm_struct *mm)
 {
 	struct mm_mm_cid *mc = &mm->mm_cid;
-	unsigned int max_cids;
 
 	lockdep_assert_held(&mm->mm_cid.lock);
 
-	/* Calculate the new maximum constraint */
-	max_cids = min(mc->nr_cpus_allowed, mc->users);
-	WRITE_ONCE(mc->max_cids, max_cids);
+	/* Clear deferred mode switch flag. A change is handled by the caller */
+	mc->update_deferred = false;
+	__mm_update_max_cids(mc);
+
+	/* Check whether owner mode must be changed */
+	if (!mc->percpu) {
+		/* Enable per CPU mode when the number of users is above max_cids */
+		if (mc->users > mc->max_cids)
+			mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+	} else {
+		/* Switch back to per task if user count under threshold */
+		if (mc->users < mc->pcpu_thrs)
+			mc->pcpu_thrs = 0;
+	}
+
+	/* Mode change required? */
+	if (!!mc->percpu == !!mc->pcpu_thrs)
+		return false;
+	/* When switching back to per TASK mode, set the transition flag */
+	if (!mc->pcpu_thrs)
+		WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+	WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+	return true;
 }
 
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
 {
 	struct cpumask *mm_allowed;
+	struct mm_mm_cid *mc;
 	unsigned int weight;
 
 	if (!mm || !READ_ONCE(mm->mm_cid.users))
 		return;
-
 	/*
 	 * mm::mm_cid::mm_cpus_allowed is the superset of each threads
 	 * allowed CPUs mask which means it can only grow.
 	 */
-	guard(raw_spinlock)(&mm->mm_cid.lock);
+	mc = &mm->mm_cid;
+	guard(raw_spinlock)(&mc->lock);
 	mm_allowed = mm_cpus_allowed(mm);
 	weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
-	if (weight == mm->mm_cid.nr_cpus_allowed)
+	if (weight == mc->nr_cpus_allowed)
+		return;
+
+	WRITE_ONCE(mc->nr_cpus_allowed, weight);
+	__mm_update_max_cids(mc);
+	if (!mc->percpu)
 		return;
-	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
-	mm_update_max_cids(mm);
+
+	/* Adjust the threshold to the wider set */
+	mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+
+	/* Scheduling of deferred mode switch goes here */
+}
+
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+	if (cid_on_cpu(t->mm_cid.cid)) {
+		unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
+
+		t->mm_cid.cid = cid_to_transit_cid(cid);
+		pcp->cid = t->mm_cid.cid;
+	}
+}
+
+static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+	unsigned int cpu;
+
+	/* Walk the CPUs and fixup all stale CIDs */
+	for_each_possible_cpu(cpu) {
+		struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+		struct rq *rq = cpu_rq(cpu);
+
+		/* Remote access to mm::mm_cid::pcpu requires rq_lock */
+		guard(rq_lock_irq)(rq);
+		/* Is the CID still owned by the CPU? */
+		if (cid_on_cpu(pcp->cid)) {
+			/*
+			 * If rq->curr has @mm, transfer it with the
+			 * transition bit set. Otherwise drop it.
+			 */
+			if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+				mm_cid_transit_to_task(rq->curr, pcp);
+			else
+				mm_drop_cid_on_cpu(mm, pcp);
+
+		} else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+			unsigned int cid = rq->curr->mm_cid.cid;
+
+			/* Ensure it has the transition bit set */
+			if (!cid_in_transit(cid)) {
+				cid = cid_to_transit_cid(cid);
+				rq->curr->mm_cid.cid = cid;
+				pcp->cid = cid;
+			}
+		}
+	}
+	/* Clear the transition bit */
+	WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+	if (cid_on_task(t->mm_cid.cid)) {
+		t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+		pcp->cid = t->mm_cid.cid;
+	}
+}
+
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+{
+	/* Remote access to mm::mm_cid::pcpu requires rq_lock */
+	guard(task_rq_lock)(t);
+	/* If the task is not active it is not in the users count */
+	if (!t->mm_cid.active)
+		return false;
+	if (cid_on_task(t->mm_cid.cid)) {
+		/* If running on the CPU, transfer the CID, otherwise drop it */
+		if (task_rq(t)->curr == t)
+			mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+		else
+			mm_unset_cid_on_task(t);
+	}
+	return true;
+}
+
+static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct task_struct *p, *t;
+	unsigned int users;
+
+	/*
+	 * This can obviously race with a concurrent affinity change, which
+	 * increases the number of allowed CPUs for this mm, but that does
+	 * not affect the mode and only changes the CID constraints. A
+	 * possible switch back to per task mode happens either in the
+	 * deferred handler function or in the next fork()/exit().
+	 *
+	 * The caller has already transferred. The newly incoming task is
+	 * already accounted for, but not yet visible.
+	 */
+	users = mm->mm_cid.users - 2;
+	if (!users)
+		return;
+
+	guard(rcu)();
+	for_other_threads(current, t) {
+		if (mm_cid_fixup_task_to_cpu(t, mm))
+			users--;
+	}
+
+	if (!users)
+		return;
+
+	/* Happens only for CLONE_VM processes. */
+	for_each_process_thread(p, t) {
+		if (t == current || t->mm != mm)
+			continue;
+		if (mm_cid_fixup_task_to_cpu(t, mm)) {
+			if (--users == 0)
+				return;
+		}
+	}
+}
+
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
+{
+	t->mm_cid.active = 1;
+	mm->mm_cid.users++;
+	return mm_update_max_cids(mm);
 }
 
 void sched_mm_cid_fork(struct task_struct *t)
@@ -10442,12 +10669,19 @@ void sched_mm_cid_fork(struct task_struct *t)
 	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
 
 	guard(mutex)(&mm->mm_cid.mutex);
-	guard(raw_spinlock)(&mm->mm_cid.lock);
-	t->mm_cid.active = 1;
-	mm->mm_cid.users++;
-	/* Preset last_cid for mm_cid_select() */
-	t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
-	mm_update_max_cids(mm);
+	scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
+		sched_mm_cid_add_user(t, mm);
+		/* Preset last_cid for mm_cid_select() */
+		t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+	}
+}
+
+static bool sched_mm_cid_remove_user(struct task_struct *t)
+{
+	t->mm_cid.active = 0;
+	mm_unset_cid_on_task(t);
+	t->mm->mm_cid.users--;
+	return mm_update_max_cids(t->mm);
 }
 
 /*
@@ -10462,14 +10696,8 @@ void sched_mm_cid_exit(struct task_struct *t)
 		return;
 
 	guard(mutex)(&mm->mm_cid.mutex);
-	guard(raw_spinlock)(&mm->mm_cid.lock);
-	t->mm_cid.active = 0;
-	mm->mm_cid.users--;
-	if (t->mm_cid.cid != MM_CID_UNSET) {
-		clear_bit(t->mm_cid.cid, mm_cidmask(mm));
-		t->mm_cid.cid = MM_CID_UNSET;
-	}
-	mm_update_max_cids(mm);
+	scoped_guard(raw_spinlock, &mm->mm_cid.lock)
+		sched_mm_cid_remove_user(t);
 }
 
 /* Deactivate MM CID allocation across execve() */
@@ -10499,6 +10727,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	mm->mm_cid.transit = 0;
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
 	mm->mm_cid.users = 0;
+	mm->mm_cid.pcpu_thrs = 0;
+	mm->mm_cid.update_deferred = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
 	mutex_init(&mm->mm_cid.mutex);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
-- 
cgit v1.2.3


From c809f081fe400cb1b9898f4791c0d33146315161 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:18 +0100
Subject: irqwork: Move data struct to a types header

... to avoid header recursion hell.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.152813625@linutronix.de
---
 include/linux/irq_work.h       |  9 ++-------
 include/linux/irq_work_types.h | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/irq_work_types.h

(limited to 'include')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 136f2980cba3..c5afd053ae32 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,8 +2,9 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
-#include <linux/smp_types.h>
+#include <linux/irq_work_types.h>
 #include <linux/rcuwait.h>
+#include <linux/smp_types.h>
 
 /*
  * An entry can be in one of four states:
@@ -14,12 +15,6 @@
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
-struct irq_work {
-	struct __call_single_node node;
-	void (*func)(struct irq_work *);
-	struct rcuwait irqwait;
-};
-
 #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
 	.node = { .u_flags = (_flags), },			\
 	.func = (_func),					\
diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h
new file mode 100644
index 000000000000..73abec5bb06e
--- /dev/null
+++ b/include/linux/irq_work_types.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQ_WORK_TYPES_H
+#define _LINUX_IRQ_WORK_TYPES_H
+
+#include <linux/smp_types.h>
+#include <linux/types.h>
+
+struct irq_work {
+	struct __call_single_node	node;
+	void				(*func)(struct irq_work *);
+	struct rcuwait			irqwait;
+};
+
+#endif
-- 
cgit v1.2.3


From 9da6ccbcea3de1fa704202e3346fe6c0226bfc18 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:20 +0100
Subject: sched/mmcid: Implement deferred mode change

When affinity changes cause an increase of the number of CPUs allowed for
tasks which are related to a MM, that might result in a situation where
the ownership mode can go back from per CPU mode to per task mode.

As affinity changes happen with runqueue lock held there is no way to do
the actual mode change and required fixup right there.

Add the infrastructure to defer it to a workqueue. The scheduled work can
race with a fork() or exit(). Whatever happens first takes care of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de
---
 include/linux/rseq_types.h |  8 +++++++
 kernel/sched/core.c        | 58 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index a3a4f3f10862..81fbb8885e8d 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -2,7 +2,9 @@
 #ifndef _LINUX_RSEQ_TYPES_H
 #define _LINUX_RSEQ_TYPES_H
 
+#include <linux/irq_work_types.h>
 #include <linux/types.h>
+#include <linux/workqueue_types.h>
 
 #ifdef CONFIG_RSEQ
 struct rseq;
@@ -122,6 +124,8 @@ struct mm_cid_pcpu {
  * @percpu:		Set, when CIDs are in per CPU mode
  * @transit:		Set to MM_CID_TRANSIT during a mode change transition phase
  * @max_cids:		The exclusive maximum CID value for allocation and convergence
+ * @irq_work:		irq_work to handle the affinity mode change case
+ * @work:		Regular work to handle the affinity mode change case
  * @lock:		Spinlock to protect against affinity setting which can't take @mutex
  * @mutex:		Mutex to serialize forks and exits related to this mm
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
@@ -139,6 +143,10 @@ struct mm_mm_cid {
 	unsigned int		transit;
 	unsigned int		max_cids;
 
+	/* Rarely used. Moves @lock and @mutex into the second cacheline */
+	struct irq_work		irq_work;
+	struct work_struct	work;
+
 	raw_spinlock_t		lock;
 	struct mutex		mutex;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb0d59df8acc..cbb543a6efda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10539,8 +10539,17 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 
 	/* Adjust the threshold to the wider set */
 	mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+	/* Switch back to per task mode? */
+	if (mc->users >= mc->pcpu_thrs)
+		return;
+
+	/* Don't queue twice */
+	if (mc->update_deferred)
+		return;
 
-	/* Scheduling of deferred mode switch goes here */
+	/* Queue the irq work, which schedules the real work */
+	mc->update_deferred = true;
+	irq_work_queue(&mc->irq_work);
 }
 
 static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10553,7 +10562,7 @@ static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_p
 	}
 }
 
-static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
 {
 	unsigned int cpu;
 
@@ -10714,14 +10723,47 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 	mm_cid_select(t);
 }
 
-void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+static void mm_cid_work_fn(struct work_struct *work)
 {
-	struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
-	int cpu;
+	struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+	/* Make it compile, but not functional yet */
+	if (!IS_ENABLED(CONFIG_NEW_MM_CID))
+		return;
+
+	guard(mutex)(&mm->mm_cid.mutex);
+	/* Did the last user task exit already? */
+	if (!mm->mm_cid.users)
+		return;
+
+	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+		/* Have fork() or exit() handled it already? */
+		if (!mm->mm_cid.update_deferred)
+			return;
+		/* This clears mm_cid::update_deferred */
+		if (!mm_update_max_cids(mm))
+			return;
+		/* Affinity changes can only switch back to task mode */
+		if (WARN_ON_ONCE(mm->mm_cid.percpu))
+			return;
+	}
+	mm_cid_fixup_cpus_to_tasks(mm);
+}
+
+static void mm_cid_irq_work(struct irq_work *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
 
+	/*
+	 * Needs to be unconditional because mm_cid::lock cannot be held
+	 * when scheduling work as mm_update_cpus_allowed() nests inside
+	 * rq::lock and schedule_work() might end up in wakeup...
+	 */
+	schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
 	mm->mm_cid.max_cids = 0;
 	mm->mm_cid.percpu = 0;
 	mm->mm_cid.transit = 0;
@@ -10731,6 +10773,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	mm->mm_cid.update_deferred = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
 	mutex_init(&mm->mm_cid.mutex);
+	mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+	INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
 }
-- 
cgit v1.2.3


From 653fda7ae73d8033dedb65537acac0c2c287dc3f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Nov 2025 18:27:22 +0100
Subject: sched/mmcid: Switch over to the new mechanism

Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.

The common case is that there is no mode change required, which makes
fork() and exit() just update the user count and the constraints.

In case that a new user would exceed the CID space limit the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode, when an affinity change increased the
number of allowed CPUs enough.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de
---
 include/linux/rseq.h       |  19 --------
 include/linux/rseq_types.h |   8 ++--
 kernel/fork.c              |   1 -
 kernel/sched/core.c        | 115 ++++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h       |  76 ------------------------------
 5 files changed, 103 insertions(+), 116 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 4c0e8bdd2dd9..2266f4dc77b6 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
 	t->rseq.event.ids_changed = true;
 }
 
-/*
- * Invoked from switch_mm_cid() in context switch when the task gets a MM
- * CID assigned.
- *
- * This does not raise TIF_NOTIFY_RESUME as that happens in
- * rseq_sched_switch_event().
- */
-static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
-{
-	/*
-	 * Requires a comparison as the switch_mm_cid() code does not
-	 * provide a conditional for it readily. So avoid excessive updates
-	 * when nothing changes.
-	 */
-	if (t->rseq.ids.mm_cid != cid)
-		t->rseq.event.ids_changed = true;
-}
-
 /* Enforce a full update after RSEQ registration and when execve() failed */
 static inline void rseq_force_update(void)
 {
@@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
 static inline void rseq_force_update(void) { }
 static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 81fbb8885e8d..332dc14b81c9 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -101,18 +101,18 @@ struct rseq_data { };
 /**
  * struct sched_mm_cid - Storage for per task MM CID data
  * @active:	MM CID is active for the task
- * @cid:	The CID associated to the task
- * @last_cid:	The last CID associated to the task
+ * @cid:	The CID associated to the task either permanently or
+ *		borrowed from the CPU
  */
 struct sched_mm_cid {
 	unsigned int		active;
 	unsigned int		cid;
-	unsigned int		last_cid;
 };
 
 /**
  * struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid:	The CID associated to the CPU
+ * @cid:	The CID associated to the CPU either permanently or
+ *		while a task with a CID is running
  */
 struct mm_cid_pcpu {
 	unsigned int	cid;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c23219e1169..8475958e029b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_SCHED_MM_CID
 	tsk->mm_cid.cid = MM_CID_UNSET;
-	tsk->mm_cid.last_cid = MM_CID_UNSET;
 	tsk->mm_cid.active = 0;
 #endif
 	return tsk;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cbb543a6efda..62235f1dc04e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5307,7 +5307,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		}
 	}
 
-	switch_mm_cid(prev, next);
+	mm_cid_switch_to(prev, next);
 
 	/*
 	 * Tell rseq that the task was scheduled in. Must be after
@@ -10624,7 +10624,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
 	return true;
 }
 
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
 {
 	struct mm_struct *mm = current->mm;
 	struct task_struct *p, *t;
@@ -10674,25 +10674,81 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
 void sched_mm_cid_fork(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
+	bool percpu;
 
 	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
 
 	guard(mutex)(&mm->mm_cid.mutex);
-	scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
-		sched_mm_cid_add_user(t, mm);
-		/* Preset last_cid for mm_cid_select() */
-		t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+		struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+		/* First user ? */
+		if (!mm->mm_cid.users) {
+			sched_mm_cid_add_user(t, mm);
+			t->mm_cid.cid = mm_get_cid(mm);
+			/* Required for execve() */
+			pcp->cid = t->mm_cid.cid;
+			return;
+		}
+
+		if (!sched_mm_cid_add_user(t, mm)) {
+			if (!mm->mm_cid.percpu)
+				t->mm_cid.cid = mm_get_cid(mm);
+			return;
+		}
+
+		/* Handle the mode change and transfer current's CID */
+		percpu = !!mm->mm_cid.percpu;
+		if (!percpu)
+			mm_cid_transit_to_task(current, pcp);
+		else
+			mm_cid_transfer_to_cpu(current, pcp);
+	}
+
+	if (percpu) {
+		mm_cid_fixup_tasks_to_cpus();
+	} else {
+		mm_cid_fixup_cpus_to_tasks(mm);
+		t->mm_cid.cid = mm_get_cid(mm);
 	}
 }
 
 static bool sched_mm_cid_remove_user(struct task_struct *t)
 {
 	t->mm_cid.active = 0;
-	mm_unset_cid_on_task(t);
+	scoped_guard(preempt) {
+		/* Clear the transition bit */
+		t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+		mm_unset_cid_on_task(t);
+	}
 	t->mm->mm_cid.users--;
 	return mm_update_max_cids(t->mm);
 }
 
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+
+	if (!sched_mm_cid_remove_user(t))
+		return false;
+	/*
+	 * Contrary to fork() this only deals with a switch back to per
+	 * task mode either because the above decreased users or an
+	 * affinity change increased the number of allowed CPUs and the
+	 * deferred fixup did not run yet.
+	 */
+	if (WARN_ON_ONCE(mm->mm_cid.percpu))
+		return false;
+	/*
+	 * A failed fork(2) cleanup never gets here, so @current must have
+	 * the same MM as @t. That's true for exit() and the failed
+	 * pthread_create() cleanup case.
+	 */
+	if (WARN_ON_ONCE(current->mm != mm))
+		return false;
+	return true;
+}
+
 /*
  * When a task exits, the MM CID held by the task is not longer required as
  * the task cannot return to user space.
@@ -10703,10 +10759,43 @@ void sched_mm_cid_exit(struct task_struct *t)
 
 	if (!mm || !t->mm_cid.active)
 		return;
+	/*
+	 * Ensure that only one instance is doing MM CID operations within
+	 * a MM. The common case is uncontended. The rare fixup case adds
+	 * some overhead.
+	 */
+	scoped_guard(mutex, &mm->mm_cid.mutex) {
+		/* mm_cid::mutex is sufficient to protect mm_cid::users */
+		if (likely(mm->mm_cid.users > 1)) {
+			scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+				if (!__sched_mm_cid_exit(t))
+					return;
+				/* Mode change required. Transfer current's CID */
+				mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+			}
+			mm_cid_fixup_cpus_to_tasks(mm);
+			return;
+		}
+		/* Last user */
+		scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+			/* Required across execve() */
+			if (t == current)
+				mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+			/* Ignore mode change. There is nothing to do. */
+			sched_mm_cid_remove_user(t);
+		}
+	}
 
-	guard(mutex)(&mm->mm_cid.mutex);
-	scoped_guard(raw_spinlock, &mm->mm_cid.lock)
-		sched_mm_cid_remove_user(t);
+	/*
+	 * As this is the last user (execve(), process exit or failed
+	 * fork(2)) there is no concurrency anymore.
+	 *
+	 * Synchronize possibly pending work to ensure that there are no
+	 * dangling references left. @mm->mm_cid.users is zero so nothing
+	 * can queue this work anymore.
+	 */
+	irq_work_sync(&mm->mm_cid.irq_work);
+	cancel_work_sync(&mm->mm_cid.work);
 }
 
 /* Deactivate MM CID allocation across execve() */
@@ -10719,18 +10808,12 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	sched_mm_cid_fork(t);
-	guard(preempt)();
-	mm_cid_select(t);
 }
 
 static void mm_cid_work_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
 
-	/* Make it compile, but not functional yet */
-	if (!IS_ENABLED(CONFIG_NEW_MM_CID))
-		return;
-
 	guard(mutex)(&mm->mm_cid.mutex);
 	/* Did the last user task exit already? */
 	if (!mm->mm_cid.users)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 82c7978d548e..f9d0515db130 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3745,83 +3745,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
 	mm_cid_schedin(next);
 }
 
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
-	struct mm_struct *mm = t->mm;
-	unsigned int max_cid;
-
-	if (!mm)
-		return;
-
-	/* Preset last_mm_cid */
-	max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
-	t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
-	struct mm_struct *mm = t->mm;
-
-	if (cid >= max_cids)
-		return false;
-	if (test_and_set_bit(cid, mm_cidmask(mm)))
-		return false;
-	t->mm_cid.cid = t->mm_cid.last_cid = cid;
-	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
-	return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
-	struct mm_struct *mm = t->mm;
-	unsigned int max_cids;
-
-	max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
-	/* Try to reuse the last CID of this task */
-	if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
-		return true;
-
-	/* Try to reuse the last CID of this mm on this CPU */
-	if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
-		return true;
-
-	/* Try the first zero bit in the cidmask. */
-	return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
-	/*
-	 * mm_cid_get() can fail when the maximum CID, which is determined
-	 * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
-	 * That's a transient failure as there cannot be more tasks
-	 * concurrently on a CPU (or about to be scheduled in) than that.
-	 */
-	for (;;) {
-		if (mm_cid_get(t))
-			break;
-	}
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
-	if (prev->mm_cid.active) {
-		if (prev->mm_cid.cid != MM_CID_UNSET)
-			clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
-		prev->mm_cid.cid = MM_CID_UNSET;
-	}
-
-	if (next->mm_cid.active) {
-		mm_cid_select(next);
-		rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
-	}
-}
-
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
-- 
cgit v1.2.3


From ec95cd103c3a1e2567927014e4a710416cde3e52 Mon Sep 17 00:00:00 2001
From: Viacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 25 Nov 2025 15:13:27 -0800
Subject: hfs/hfsplus: move on-disk layout declarations into hfs_common.h

Currently, HFS declares on-disk layout's metadata structures
in fs/hfs/hfs.h and HFS+ declares it in fs/hfsplus/hfsplus_raw.h.
However, HFS and HFS+ on-disk layouts have some similarity and
overlapping in declarations. As a result, fs/hfs/hfs.h and
fs/hfsplus/hfsplus_raw.h contain multiple duplicated declarations.
Moreover, both HFS and HFS+ drivers contain completely similar
implemented functionality in multiple places.

This patch is moving the on-disk layout declarations from
fs/hfs/hfs.h and fs/hfsplus/hfsplus_raw.h into
include/linux/hfs_common.h with the goal to exclude
the duplication in declarations. Also, this patch prepares
the basis for creating a hfslib that can aggregate common
functionality without necessity to duplicate the same code
in HFS and HFS+ drivers.

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
---
 fs/hfs/btree.h             |  42 ---
 fs/hfs/hfs.h               | 269 +------------------
 fs/hfs/hfs_fs.h            |   1 -
 fs/hfsplus/hfsplus_fs.h    |   1 -
 fs/hfsplus/hfsplus_raw.h   | 394 +---------------------------
 fs/hfsplus/xattr.c         |  22 +-
 include/linux/hfs_common.h | 633 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 645 insertions(+), 717 deletions(-)

(limited to 'include')

diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index 97f88035b224..99be858b2446 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -129,45 +129,3 @@ extern int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd);
 extern int hfs_brec_find(struct hfs_find_data *fd);
 extern int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len);
 extern int hfs_brec_goto(struct hfs_find_data *fd, int cnt);
-
-
-struct hfs_bnode_desc {
-	__be32 next;		/* (V) Number of the next node at this level */
-	__be32 prev;		/* (V) Number of the prev node at this level */
-	u8 type;		/* (F) The type of node */
-	u8 height;		/* (F) The level of this node (leaves=1) */
-	__be16 num_recs;	/* (V) The number of records in this node */
-	u16 reserved;
-} __packed;
-
-#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
-#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
-#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
-#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
-
-struct hfs_btree_header_rec {
-	__be16 depth;		/* (V) The number of levels in this B-tree */
-	__be32 root;		/* (V) The node number of the root node */
-	__be32 leaf_count;	/* (V) The number of leaf records */
-	__be32 leaf_head;	/* (V) The number of the first leaf node */
-	__be32 leaf_tail;	/* (V) The number of the last leaf node */
-	__be16 node_size;	/* (F) The number of bytes in a node (=512) */
-	__be16 max_key_len;	/* (F) The length of a key in an index node */
-	__be32 node_count;	/* (V) The total number of nodes */
-	__be32 free_nodes;	/* (V) The number of unused nodes */
-	u16 reserved1;
-	__be32 clump_size;	/* (F) clump size. not usually used. */
-	u8 btree_type;		/* (F) BTree type */
-	u8 reserved2;
-	__be32 attributes;	/* (F) attributes */
-	u32 reserved3[16];
-} __packed;
-
-#define BTREE_ATTR_BADCLOSE	0x00000001	/* b-tree not closed properly. not
-						   used by hfsplus. */
-#define HFS_TREE_BIGKEYS	0x00000002	/* key length is u16 instead of u8.
-						   used by hfsplus. */
-#define HFS_TREE_VARIDXKEYS	0x00000004	/* variable key length instead of
-						   max key length. use din catalog
-						   b-tree but not in extents
-						   b-tree (hfsplus). */
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index 6f194d0768b6..3f2293ff6fdd 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -9,274 +9,7 @@
 #ifndef _HFS_H
 #define _HFS_H
 
-/* offsets to various blocks */
-#define HFS_DD_BLK		0 /* Driver Descriptor block */
-#define HFS_PMAP_BLK		1 /* First block of partition map */
-#define HFS_MDB_BLK		2 /* Block (w/i partition) of MDB */
-
-/* magic numbers for various disk blocks */
-#define HFS_DRVR_DESC_MAGIC	0x4552 /* "ER": driver descriptor map */
-#define HFS_OLD_PMAP_MAGIC	0x5453 /* "TS": old-type partition map */
-#define HFS_NEW_PMAP_MAGIC	0x504D /* "PM": new-type partition map */
-#define HFS_SUPER_MAGIC		0x4244 /* "BD": HFS MDB (super block) */
-#define HFS_MFS_SUPER_MAGIC	0xD2D7 /* MFS MDB (super block) */
-
-/* various FIXED size parameters */
-#define HFS_SECTOR_SIZE		512    /* size of an HFS sector */
-#define HFS_SECTOR_SIZE_BITS	9      /* log_2(HFS_SECTOR_SIZE) */
-#define HFS_NAMELEN		31     /* maximum length of an HFS filename */
-#define HFS_MAX_NAMELEN		128
-#define HFS_MAX_VALENCE		32767U
-
-/* Meanings of the drAtrb field of the MDB,
- * Reference: _Inside Macintosh: Files_ p. 2-61
- */
-#define HFS_SB_ATTRIB_HLOCK	(1 << 7)
-#define HFS_SB_ATTRIB_UNMNT	(1 << 8)
-#define HFS_SB_ATTRIB_SPARED	(1 << 9)
-#define HFS_SB_ATTRIB_INCNSTNT	(1 << 11)
-#define HFS_SB_ATTRIB_SLOCK	(1 << 15)
-
-/* Some special File ID numbers */
-#define HFS_POR_CNID		1	/* Parent Of the Root */
-#define HFS_ROOT_CNID		2	/* ROOT directory */
-#define HFS_EXT_CNID		3	/* EXTents B-tree */
-#define HFS_CAT_CNID		4	/* CATalog B-tree */
-#define HFS_BAD_CNID		5	/* BAD blocks file */
-#define HFS_ALLOC_CNID		6	/* ALLOCation file (HFS+) */
-#define HFS_START_CNID		7	/* STARTup file (HFS+) */
-#define HFS_ATTR_CNID		8	/* ATTRibutes file (HFS+) */
-#define HFS_EXCH_CNID		15	/* ExchangeFiles temp id */
-#define HFS_FIRSTUSER_CNID	16
-
-/* values for hfs_cat_rec.cdrType */
-#define HFS_CDR_DIR    0x01    /* folder (directory) */
-#define HFS_CDR_FIL    0x02    /* file */
-#define HFS_CDR_THD    0x03    /* folder (directory) thread */
-#define HFS_CDR_FTH    0x04    /* file thread */
-
-/* legal values for hfs_ext_key.FkType and hfs_file.fork */
-#define HFS_FK_DATA	0x00
-#define HFS_FK_RSRC	0xFF
-
-/* bits in hfs_fil_entry.Flags */
-#define HFS_FIL_LOCK	0x01  /* locked */
-#define HFS_FIL_THD	0x02  /* file thread */
-#define HFS_FIL_DOPEN   0x04  /* data fork open */
-#define HFS_FIL_ROPEN   0x08  /* resource fork open */
-#define HFS_FIL_DIR     0x10  /* directory (always clear) */
-#define HFS_FIL_NOCOPY  0x40  /* copy-protected file */
-#define HFS_FIL_USED	0x80  /* open */
-
-/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */
-#define HFS_DIR_LOCK        0x01  /* locked */
-#define HFS_DIR_THD         0x02  /* directory thread */
-#define HFS_DIR_INEXPFOLDER 0x04  /* in a shared area */
-#define HFS_DIR_MOUNTED     0x08  /* mounted */
-#define HFS_DIR_DIR         0x10  /* directory (always set) */
-#define HFS_DIR_EXPFOLDER   0x20  /* share point */
-
-/* bits hfs_finfo.fdFlags */
-#define HFS_FLG_INITED		0x0100
-#define HFS_FLG_LOCKED		0x1000
-#define HFS_FLG_INVISIBLE	0x4000
-
-/*======== HFS structures as they appear on the disk ========*/
-
-/* Pascal-style string of up to 31 characters */
-struct hfs_name {
-	u8 len;
-	u8 name[HFS_NAMELEN];
-} __packed;
-
-struct hfs_point {
-	__be16 v;
-	__be16 h;
-} __packed;
-
-struct hfs_rect {
-	__be16 top;
-	__be16 left;
-	__be16 bottom;
-	__be16 right;
-} __packed;
-
-struct hfs_finfo {
-	__be32 fdType;
-	__be32 fdCreator;
-	__be16 fdFlags;
-	struct hfs_point fdLocation;
-	__be16 fdFldr;
-} __packed;
-
-struct hfs_fxinfo {
-	__be16 fdIconID;
-	u8 fdUnused[8];
-	__be16 fdComment;
-	__be32 fdPutAway;
-} __packed;
-
-struct hfs_dinfo {
-	struct hfs_rect frRect;
-	__be16 frFlags;
-	struct hfs_point frLocation;
-	__be16 frView;
-} __packed;
-
-struct hfs_dxinfo {
-	struct hfs_point frScroll;
-	__be32 frOpenChain;
-	__be16 frUnused;
-	__be16 frComment;
-	__be32 frPutAway;
-} __packed;
-
-union hfs_finder_info {
-	struct {
-		struct hfs_finfo finfo;
-		struct hfs_fxinfo fxinfo;
-	} file;
-	struct {
-		struct hfs_dinfo dinfo;
-		struct hfs_dxinfo dxinfo;
-	} dir;
-} __packed;
-
-/* Cast to a pointer to a generic bkey */
-#define	HFS_BKEY(X)	(((void)((X)->KeyLen)), ((struct hfs_bkey *)(X)))
-
-/* The key used in the catalog b-tree: */
-struct hfs_cat_key {
-	u8 key_len;		/* number of bytes in the key */
-	u8 reserved;		/* padding */
-	__be32 ParID;		/* CNID of the parent dir */
-	struct hfs_name	CName;	/* The filename of the entry */
-} __packed;
-
-/* The key used in the extents b-tree: */
-struct hfs_ext_key {
-	u8 key_len;		/* number of bytes in the key */
-	u8 FkType;		/* HFS_FK_{DATA,RSRC} */
-	__be32 FNum;		/* The File ID of the file */
-	__be16 FABN;		/* allocation blocks number*/
-} __packed;
-
-typedef union hfs_btree_key {
-	u8 key_len;			/* number of bytes in the key */
-	struct hfs_cat_key cat;
-	struct hfs_ext_key ext;
-} hfs_btree_key;
-
-#define HFS_MAX_CAT_KEYLEN	(sizeof(struct hfs_cat_key) - sizeof(u8))
-#define HFS_MAX_EXT_KEYLEN	(sizeof(struct hfs_ext_key) - sizeof(u8))
-
-typedef union hfs_btree_key btree_key;
-
-struct hfs_extent {
-	__be16 block;
-	__be16 count;
-};
-typedef struct hfs_extent hfs_extent_rec[3];
-
-/* The catalog record for a file */
-struct hfs_cat_file {
-	s8 type;			/* The type of entry */
-	u8 reserved;
-	u8 Flags;			/* Flags such as read-only */
-	s8 Typ;				/* file version number = 0 */
-	struct hfs_finfo UsrWds;	/* data used by the Finder */
-	__be32 FlNum;			/* The CNID */
-	__be16 StBlk;			/* obsolete */
-	__be32 LgLen;			/* The logical EOF of the data fork*/
-	__be32 PyLen;			/* The physical EOF of the data fork */
-	__be16 RStBlk;			/* obsolete */
-	__be32 RLgLen;			/* The logical EOF of the rsrc fork */
-	__be32 RPyLen;			/* The physical EOF of the rsrc fork */
-	__be32 CrDat;			/* The creation date */
-	__be32 MdDat;			/* The modified date */
-	__be32 BkDat;			/* The last backup date */
-	struct hfs_fxinfo FndrInfo;	/* more data for the Finder */
-	__be16 ClpSize;			/* number of bytes to allocate
-					   when extending files */
-	hfs_extent_rec ExtRec;		/* first extent record
-					   for the data fork */
-	hfs_extent_rec RExtRec;		/* first extent record
-					   for the resource fork */
-	u32 Resrv;			/* reserved by Apple */
-} __packed;
-
-/* the catalog record for a directory */
-struct hfs_cat_dir {
-	s8 type;			/* The type of entry */
-	u8 reserved;
-	__be16 Flags;			/* flags */
-	__be16 Val;			/* Valence: number of files and
-					   dirs in the directory */
-	__be32 DirID;			/* The CNID */
-	__be32 CrDat;			/* The creation date */
-	__be32 MdDat;			/* The modification date */
-	__be32 BkDat;			/* The last backup date */
-	struct hfs_dinfo UsrInfo;	/* data used by the Finder */
-	struct hfs_dxinfo FndrInfo;	/* more data used by Finder */
-	u8 Resrv[16];			/* reserved by Apple */
-} __packed;
-
-/* the catalog record for a thread */
-struct hfs_cat_thread {
-	s8 type;			/* The type of entry */
-	u8 reserved[9];			/* reserved by Apple */
-	__be32 ParID;			/* CNID of parent directory */
-	struct hfs_name CName;		/* The name of this entry */
-}  __packed;
-
-/* A catalog tree record */
-typedef union hfs_cat_rec {
-	s8 type;			/* The type of entry */
-	struct hfs_cat_file file;
-	struct hfs_cat_dir dir;
-	struct hfs_cat_thread thread;
-} hfs_cat_rec;
-
-struct hfs_mdb {
-	__be16 drSigWord;		/* Signature word indicating fs type */
-	__be32 drCrDate;		/* fs creation date/time */
-	__be32 drLsMod;			/* fs modification date/time */
-	__be16 drAtrb;			/* fs attributes */
-	__be16 drNmFls;			/* number of files in root directory */
-	__be16 drVBMSt;			/* location (in 512-byte blocks)
-					   of the volume bitmap */
-	__be16 drAllocPtr;		/* location (in allocation blocks)
-					   to begin next allocation search */
-	__be16 drNmAlBlks;		/* number of allocation blocks */
-	__be32 drAlBlkSiz;		/* bytes in an allocation block */
-	__be32 drClpSiz;		/* clumpsize, the number of bytes to
-					   allocate when extending a file */
-	__be16 drAlBlSt;		/* location (in 512-byte blocks)
-					   of the first allocation block */
-	__be32 drNxtCNID;		/* CNID to assign to the next
-					   file or directory created */
-	__be16 drFreeBks;		/* number of free allocation blocks */
-	u8 drVN[28];			/* the volume label */
-	__be32 drVolBkUp;		/* fs backup date/time */
-	__be16 drVSeqNum;		/* backup sequence number */
-	__be32 drWrCnt;			/* fs write count */
-	__be32 drXTClpSiz;		/* clumpsize for the extents B-tree */
-	__be32 drCTClpSiz;		/* clumpsize for the catalog B-tree */
-	__be16 drNmRtDirs;		/* number of directories in
-					   the root directory */
-	__be32 drFilCnt;		/* number of files in the fs */
-	__be32 drDirCnt;		/* number of directories in the fs */
-	u8 drFndrInfo[32];		/* data used by the Finder */
-	__be16 drEmbedSigWord;		/* embedded volume signature */
-	__be32 drEmbedExtent;		/* starting block number (xdrStABN)
-					   and number of allocation blocks
-					   (xdrNumABlks) occupied by embedded
-					   volume */
-	__be32 drXTFlSize;		/* bytes in the extents B-tree */
-	hfs_extent_rec drXTExtRec;	/* extents B-tree's first 3 extents */
-	__be32 drCTFlSize;		/* bytes in the catalog B-tree */
-	hfs_extent_rec drCTExtRec;	/* catalog B-tree's first 3 extents */
-} __packed;
+#include <linux/hfs_common.h>
 
 /*======== Data structures kept in memory ========*/
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 38854df4c1b4..e94dbc04a1e4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -18,7 +18,6 @@
 
 #include <asm/byteorder.h>
 #include <linux/uaccess.h>
-#include <linux/hfs_common.h>
 
 #include "hfs.h"
 
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 56dcd80a62ba..45fe3a12ecba 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -16,7 +16,6 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/fs_context.h>
-#include <linux/hfs_common.h>
 #include "hfsplus_raw.h"
 
 /* Runtime config options */
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 68b4240c6191..83b5dbde924b 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -15,398 +15,6 @@
 #define _LINUX_HFSPLUS_RAW_H
 
 #include <linux/types.h>
-
-/* Some constants */
-#define HFSPLUS_SECTOR_SIZE        512
-#define HFSPLUS_SECTOR_SHIFT         9
-#define HFSPLUS_VOLHEAD_SECTOR       2
-#define HFSPLUS_VOLHEAD_SIG     0x482b
-#define HFSPLUS_VOLHEAD_SIGX    0x4858
-#define HFSPLUS_SUPER_MAGIC     0x482b
-#define HFSPLUS_MIN_VERSION          4
-#define HFSPLUS_CURRENT_VERSION      5
-
-#define HFSP_WRAP_MAGIC         0x4244
-#define HFSP_WRAP_ATTRIB_SLOCK  0x8000
-#define HFSP_WRAP_ATTRIB_SPARED 0x0200
-
-#define HFSP_WRAPOFF_SIG          0x00
-#define HFSP_WRAPOFF_ATTRIB       0x0A
-#define HFSP_WRAPOFF_ABLKSIZE     0x14
-#define HFSP_WRAPOFF_ABLKSTART    0x1C
-#define HFSP_WRAPOFF_EMBEDSIG     0x7C
-#define HFSP_WRAPOFF_EMBEDEXT     0x7E
-
-#define HFSP_HIDDENDIR_NAME \
-	"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
-
-#define HFSP_HARDLINK_TYPE	0x686c6e6b	/* 'hlnk' */
-#define HFSP_HFSPLUS_CREATOR	0x6866732b	/* 'hfs+' */
-
-#define HFSP_SYMLINK_TYPE	0x736c6e6b	/* 'slnk' */
-#define HFSP_SYMLINK_CREATOR	0x72686170	/* 'rhap' */
-
-#define HFSP_MOUNT_VERSION	0x482b4c78	/* 'H+Lx' */
-
-/* Structures used on disk */
-
-typedef __be32 hfsplus_cnid;
-typedef __be16 hfsplus_unichr;
-
-#define HFSPLUS_MAX_STRLEN 255
-#define HFSPLUS_ATTR_MAX_STRLEN 127
-
-/* A "string" as used in filenames, etc. */
-struct hfsplus_unistr {
-	__be16 length;
-	hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];
-} __packed;
-
-/*
- * A "string" is used in attributes file
- * for name of extended attribute
- */
-struct hfsplus_attr_unistr {
-	__be16 length;
-	hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN];
-} __packed;
-
-/* POSIX permissions */
-struct hfsplus_perm {
-	__be32 owner;
-	__be32 group;
-	u8  rootflags;
-	u8  userflags;
-	__be16 mode;
-	__be32 dev;
-} __packed;
-
-#define HFSPLUS_FLG_NODUMP	0x01
-#define HFSPLUS_FLG_IMMUTABLE	0x02
-#define HFSPLUS_FLG_APPEND	0x04
-
-/* A single contiguous area of a file */
-struct hfsplus_extent {
-	__be32 start_block;
-	__be32 block_count;
-} __packed;
-typedef struct hfsplus_extent hfsplus_extent_rec[8];
-
-/* Information for a "Fork" in a file */
-struct hfsplus_fork_raw {
-	__be64 total_size;
-	__be32 clump_size;
-	__be32 total_blocks;
-	hfsplus_extent_rec extents;
-} __packed;
-
-/* HFS+ Volume Header */
-struct hfsplus_vh {
-	__be16 signature;
-	__be16 version;
-	__be32 attributes;
-	__be32 last_mount_vers;
-	u32 reserved;
-
-	__be32 create_date;
-	__be32 modify_date;
-	__be32 backup_date;
-	__be32 checked_date;
-
-	__be32 file_count;
-	__be32 folder_count;
-
-	__be32 blocksize;
-	__be32 total_blocks;
-	__be32 free_blocks;
-
-	__be32 next_alloc;
-	__be32 rsrc_clump_sz;
-	__be32 data_clump_sz;
-	hfsplus_cnid next_cnid;
-
-	__be32 write_count;
-	__be64 encodings_bmp;
-
-	u32 finder_info[8];
-
-	struct hfsplus_fork_raw alloc_file;
-	struct hfsplus_fork_raw ext_file;
-	struct hfsplus_fork_raw cat_file;
-	struct hfsplus_fork_raw attr_file;
-	struct hfsplus_fork_raw start_file;
-} __packed;
-
-/* HFS+ volume attributes */
-#define HFSPLUS_VOL_UNMNT		(1 << 8)
-#define HFSPLUS_VOL_SPARE_BLK		(1 << 9)
-#define HFSPLUS_VOL_NOCACHE		(1 << 10)
-#define HFSPLUS_VOL_INCNSTNT		(1 << 11)
-#define HFSPLUS_VOL_NODEID_REUSED	(1 << 12)
-#define HFSPLUS_VOL_JOURNALED		(1 << 13)
-#define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
-#define HFSPLUS_VOL_UNUSED_NODE_FIX	(1 << 31)
-
-/* HFS+ BTree node descriptor */
-struct hfs_bnode_desc {
-	__be32 next;
-	__be32 prev;
-	s8 type;
-	u8 height;
-	__be16 num_recs;
-	u16 reserved;
-} __packed;
-
-/* HFS+ BTree node types */
-#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
-#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
-#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
-#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
-
-/* HFS+ BTree header */
-struct hfs_btree_header_rec {
-	__be16 depth;
-	__be32 root;
-	__be32 leaf_count;
-	__be32 leaf_head;
-	__be32 leaf_tail;
-	__be16 node_size;
-	__be16 max_key_len;
-	__be32 node_count;
-	__be32 free_nodes;
-	u16 reserved1;
-	__be32 clump_size;
-	u8 btree_type;
-	u8 key_type;
-	__be32 attributes;
-	u32 reserved3[16];
-} __packed;
-
-/* BTree attributes */
-#define HFS_TREE_BIGKEYS	2
-#define HFS_TREE_VARIDXKEYS	4
-
-/* HFS+ BTree misc info */
-#define HFSPLUS_TREE_HEAD 0
-#define HFSPLUS_NODE_MXSZ 32768
-#define HFSPLUS_ATTR_TREE_NODE_SIZE		8192
-#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT	3
-#define HFSPLUS_BTREE_HDR_USER_BYTES		128
-
-/* Some special File ID numbers (stolen from hfs.h) */
-#define HFSPLUS_POR_CNID		1	/* Parent Of the Root */
-#define HFSPLUS_ROOT_CNID		2	/* ROOT directory */
-#define HFSPLUS_EXT_CNID		3	/* EXTents B-tree */
-#define HFSPLUS_CAT_CNID		4	/* CATalog B-tree */
-#define HFSPLUS_BAD_CNID		5	/* BAD blocks file */
-#define HFSPLUS_ALLOC_CNID		6	/* ALLOCation file */
-#define HFSPLUS_START_CNID		7	/* STARTup file */
-#define HFSPLUS_ATTR_CNID		8	/* ATTRibutes file */
-#define HFSPLUS_EXCH_CNID		15	/* ExchangeFiles temp id */
-#define HFSPLUS_FIRSTUSER_CNID		16	/* first available user id */
-
-/* btree key type */
-#define HFSPLUS_KEY_CASEFOLDING		0xCF	/* case-insensitive */
-#define HFSPLUS_KEY_BINARY		0xBC	/* case-sensitive */
-
-/* HFS+ catalog entry key */
-struct hfsplus_cat_key {
-	__be16 key_len;
-	hfsplus_cnid parent;
-	struct hfsplus_unistr name;
-} __packed;
-
-#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
-
-/* Structs from hfs.h */
-struct hfsp_point {
-	__be16 v;
-	__be16 h;
-} __packed;
-
-struct hfsp_rect {
-	__be16 top;
-	__be16 left;
-	__be16 bottom;
-	__be16 right;
-} __packed;
-
-
-/* HFS directory info (stolen from hfs.h */
-struct DInfo {
-	struct hfsp_rect frRect;
-	__be16 frFlags;
-	struct hfsp_point frLocation;
-	__be16 frView;
-} __packed;
-
-struct DXInfo {
-	struct hfsp_point frScroll;
-	__be32 frOpenChain;
-	__be16 frUnused;
-	__be16 frComment;
-	__be32 frPutAway;
-} __packed;
-
-/* HFS+ folder data (part of an hfsplus_cat_entry) */
-struct hfsplus_cat_folder {
-	__be16 type;
-	__be16 flags;
-	__be32 valence;
-	hfsplus_cnid id;
-	__be32 create_date;
-	__be32 content_mod_date;
-	__be32 attribute_mod_date;
-	__be32 access_date;
-	__be32 backup_date;
-	struct hfsplus_perm permissions;
-	struct_group_attr(info, __packed,
-		struct DInfo user_info;
-		struct DXInfo finder_info;
-	);
-	__be32 text_encoding;
-	__be32 subfolders;	/* Subfolder count in HFSX. Reserved in HFS+. */
-} __packed;
-
-/* HFS file info (stolen from hfs.h) */
-struct FInfo {
-	__be32 fdType;
-	__be32 fdCreator;
-	__be16 fdFlags;
-	struct hfsp_point fdLocation;
-	__be16 fdFldr;
-} __packed;
-
-struct FXInfo {
-	__be16 fdIconID;
-	u8 fdUnused[8];
-	__be16 fdComment;
-	__be32 fdPutAway;
-} __packed;
-
-/* HFS+ file data (part of a cat_entry) */
-struct hfsplus_cat_file {
-	__be16 type;
-	__be16 flags;
-	u32 reserved1;
-	hfsplus_cnid id;
-	__be32 create_date;
-	__be32 content_mod_date;
-	__be32 attribute_mod_date;
-	__be32 access_date;
-	__be32 backup_date;
-	struct hfsplus_perm permissions;
-	struct_group_attr(info, __packed,
-		struct FInfo user_info;
-		struct FXInfo finder_info;
-	);
-	__be32 text_encoding;
-	u32 reserved2;
-
-	struct hfsplus_fork_raw data_fork;
-	struct hfsplus_fork_raw rsrc_fork;
-} __packed;
-
-/* File and folder flag bits */
-#define HFSPLUS_FILE_LOCKED		0x0001
-#define HFSPLUS_FILE_THREAD_EXISTS	0x0002
-#define HFSPLUS_XATTR_EXISTS		0x0004
-#define HFSPLUS_ACL_EXISTS		0x0008
-#define HFSPLUS_HAS_FOLDER_COUNT	0x0010	/* Folder has subfolder count
-						 * (HFSX only) */
-
-/* HFS+ catalog thread (part of a cat_entry) */
-struct hfsplus_cat_thread {
-	__be16 type;
-	s16 reserved;
-	hfsplus_cnid parentID;
-	struct hfsplus_unistr nodeName;
-} __packed;
-
-#define HFSPLUS_MIN_THREAD_SZ 10
-
-/* A data record in the catalog tree */
-typedef union {
-	__be16 type;
-	struct hfsplus_cat_folder folder;
-	struct hfsplus_cat_file file;
-	struct hfsplus_cat_thread thread;
-} __packed hfsplus_cat_entry;
-
-/* HFS+ catalog entry type */
-#define HFSPLUS_FOLDER         0x0001
-#define HFSPLUS_FILE           0x0002
-#define HFSPLUS_FOLDER_THREAD  0x0003
-#define HFSPLUS_FILE_THREAD    0x0004
-
-/* HFS+ extents tree key */
-struct hfsplus_ext_key {
-	__be16 key_len;
-	u8 fork_type;
-	u8 pad;
-	hfsplus_cnid cnid;
-	__be32 start_block;
-} __packed;
-
-#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
-
-#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo"
-#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security"
-
-#define HFSPLUS_ATTR_INLINE_DATA 0x10
-#define HFSPLUS_ATTR_FORK_DATA   0x20
-#define HFSPLUS_ATTR_EXTENTS     0x30
-
-/* HFS+ attributes tree key */
-struct hfsplus_attr_key {
-	__be16 key_len;
-	__be16 pad;
-	hfsplus_cnid cnid;
-	__be32 start_block;
-	struct hfsplus_attr_unistr key_name;
-} __packed;
-
-#define HFSPLUS_ATTR_KEYLEN	sizeof(struct hfsplus_attr_key)
-
-/* HFS+ fork data attribute */
-struct hfsplus_attr_fork_data {
-	__be32 record_type;
-	__be32 reserved;
-	struct hfsplus_fork_raw the_fork;
-} __packed;
-
-/* HFS+ extension attribute */
-struct hfsplus_attr_extents {
-	__be32 record_type;
-	__be32 reserved;
-	struct hfsplus_extent extents;
-} __packed;
-
-#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802
-
-/* HFS+ attribute inline data */
-struct hfsplus_attr_inline_data {
-	__be32 record_type;
-	__be32 reserved1;
-	u8 reserved2[6];
-	__be16 length;
-	u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE];
-} __packed;
-
-/* A data record in the attributes tree */
-typedef union {
-	__be32 record_type;
-	struct hfsplus_attr_fork_data fork_data;
-	struct hfsplus_attr_extents extents;
-	struct hfsplus_attr_inline_data inline_data;
-} __packed hfsplus_attr_entry;
-
-/* HFS+ generic BTree key */
-typedef union {
-	__be16 key_len;
-	struct hfsplus_cat_key cat;
-	struct hfsplus_ext_key ext;
-	struct hfsplus_attr_key attr;
-} __packed hfsplus_btree_key;
+#include <linux/hfs_common.h>
 
 #endif
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index ece4d29c0ab9..da95a9de9a65 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -265,10 +265,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
 	struct hfs_find_data cat_fd;
 	hfsplus_cat_entry entry;
 	u16 cat_entry_flags, cat_entry_type;
-	u16 folder_finderinfo_len = sizeof(struct DInfo) +
-					sizeof(struct DXInfo);
-	u16 file_finderinfo_len = sizeof(struct FInfo) +
-					sizeof(struct FXInfo);
+	u16 folder_finderinfo_len = sizeof(DInfo) + sizeof(DXInfo);
+	u16 file_finderinfo_len = sizeof(FInfo) + sizeof(FXInfo);
 
 	if ((!S_ISREG(inode->i_mode) &&
 			!S_ISDIR(inode->i_mode)) ||
@@ -444,11 +442,11 @@ static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
 	ssize_t res = 0;
 	struct hfs_find_data fd;
 	u16 entry_type;
-	u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
-	u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+	u16 folder_rec_len = sizeof(DInfo) + sizeof(DXInfo);
+	u16 file_rec_len = sizeof(FInfo) + sizeof(FXInfo);
 	u16 record_len = max(folder_rec_len, file_rec_len);
-	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
-	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+	u8 folder_finder_info[sizeof(DInfo) + sizeof(DXInfo)];
+	u8 file_finder_info[sizeof(FInfo) + sizeof(FXInfo)];
 
 	if (size >= record_len) {
 		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
@@ -612,8 +610,8 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
 	struct inode *inode = d_inode(dentry);
 	struct hfs_find_data fd;
 	u16 entry_type;
-	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
-	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+	u8 folder_finder_info[sizeof(DInfo) + sizeof(DXInfo)];
+	u8 file_finder_info[sizeof(FInfo) + sizeof(FXInfo)];
 	unsigned long len, found_bit;
 	int xattr_name_len, symbols_count;
 
@@ -629,14 +627,14 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
 
 	entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
 	if (entry_type == HFSPLUS_FOLDER) {
-		len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+		len = sizeof(DInfo) + sizeof(DXInfo);
 		hfs_bnode_read(fd.bnode, folder_finder_info,
 				fd.entryoffset +
 				offsetof(struct hfsplus_cat_folder, user_info),
 				len);
 		found_bit = find_first_bit((void *)folder_finder_info, len*8);
 	} else if (entry_type == HFSPLUS_FILE) {
-		len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+		len = sizeof(FInfo) + sizeof(FXInfo);
 		hfs_bnode_read(fd.bnode, file_finder_info,
 				fd.entryoffset +
 				offsetof(struct hfsplus_cat_file, user_info),
diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h
index 8838ca2f3d08..dadb5e0aa8a3 100644
--- a/include/linux/hfs_common.h
+++ b/include/linux/hfs_common.h
@@ -17,4 +17,637 @@
 	pr_debug("pid %d:%s:%d %s(): " fmt,					\
 		 current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__)	\
 
+/*
+ * Format of structures on disk
+ * Information taken from Apple Technote #1150 (HFS Plus Volume Format)
+ */
+
+/* offsets to various blocks */
+#define HFS_DD_BLK			0	/* Driver Descriptor block */
+#define HFS_PMAP_BLK			1	/* First block of partition map */
+#define HFS_MDB_BLK			2	/* Block (w/i partition) of MDB */
+
+/* magic numbers for various disk blocks */
+#define HFS_DRVR_DESC_MAGIC		0x4552	/* "ER": driver descriptor map */
+#define HFS_OLD_PMAP_MAGIC		0x5453	/* "TS": old-type partition map */
+#define HFS_NEW_PMAP_MAGIC		0x504D	/* "PM": new-type partition map */
+#define HFS_SUPER_MAGIC			0x4244	/* "BD": HFS MDB (super block) */
+#define HFS_MFS_SUPER_MAGIC		0xD2D7	/* MFS MDB (super block) */
+
+#define HFSPLUS_VOLHEAD_SIG		0x482b
+#define HFSPLUS_VOLHEAD_SIGX		0x4858
+#define HFSPLUS_SUPER_MAGIC		0x482b
+
+#define HFSP_WRAP_MAGIC			0x4244
+#define HFSP_WRAP_ATTRIB_SLOCK		0x8000
+#define HFSP_WRAP_ATTRIB_SPARED		0x0200
+
+#define HFSP_WRAPOFF_SIG		0x00
+#define HFSP_WRAPOFF_ATTRIB		0x0A
+#define HFSP_WRAPOFF_ABLKSIZE		0x14
+#define HFSP_WRAPOFF_ABLKSTART		0x1C
+#define HFSP_WRAPOFF_EMBEDSIG		0x7C
+#define HFSP_WRAPOFF_EMBEDEXT		0x7E
+
+#define HFSP_HARDLINK_TYPE		0x686c6e6b	/* 'hlnk' */
+#define HFSP_HFSPLUS_CREATOR		0x6866732b	/* 'hfs+' */
+
+#define HFSP_SYMLINK_TYPE		0x736c6e6b	/* 'slnk' */
+#define HFSP_SYMLINK_CREATOR		0x72686170	/* 'rhap' */
+
+#define HFSP_MOUNT_VERSION		0x482b4c78	/* 'H+Lx' */
+
+#define HFSP_HIDDENDIR_NAME \
+	"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
+
+/* various FIXED size parameters */
+#define HFS_SECTOR_SIZE			512	/* size of an HFS sector */
+#define HFS_SECTOR_SIZE_BITS		9	/* log_2(HFS_SECTOR_SIZE) */
+#define HFS_MAX_VALENCE			32767U
+
+#define HFSPLUS_SECTOR_SIZE		HFS_SECTOR_SIZE
+#define HFSPLUS_SECTOR_SHIFT		HFS_SECTOR_SIZE_BITS
+#define HFSPLUS_VOLHEAD_SECTOR		2
+#define HFSPLUS_MIN_VERSION		4
+#define HFSPLUS_CURRENT_VERSION		5
+
+#define HFS_NAMELEN			31	/* maximum length of an HFS filename */
+#define HFS_MAX_NAMELEN			128
+
+#define HFSPLUS_MAX_STRLEN		255
+#define HFSPLUS_ATTR_MAX_STRLEN		127
+
+/* Meanings of the drAtrb field of the MDB,
+ * Reference: _Inside Macintosh: Files_ p. 2-61
+ */
+#define HFS_SB_ATTRIB_HLOCK	(1 << 7)
+#define HFS_SB_ATTRIB_UNMNT	(1 << 8)
+#define HFS_SB_ATTRIB_SPARED	(1 << 9)
+#define HFS_SB_ATTRIB_INCNSTNT	(1 << 11)
+#define HFS_SB_ATTRIB_SLOCK	(1 << 15)
+
+/* values for hfs_cat_rec.cdrType */
+#define HFS_CDR_DIR		0x01	/* folder (directory) */
+#define HFS_CDR_FIL		0x02	/* file */
+#define HFS_CDR_THD		0x03	/* folder (directory) thread */
+#define HFS_CDR_FTH		0x04	/* file thread */
+
+/* legal values for hfs_ext_key.FkType and hfs_file.fork */
+#define HFS_FK_DATA		0x00
+#define HFS_FK_RSRC		0xFF
+
+/* bits in hfs_fil_entry.Flags */
+#define HFS_FIL_LOCK		0x01	/* locked */
+#define HFS_FIL_THD		0x02	/* file thread */
+#define HFS_FIL_DOPEN		0x04	/* data fork open */
+#define HFS_FIL_ROPEN		0x08	/* resource fork open */
+#define HFS_FIL_DIR		0x10	/* directory (always clear) */
+#define HFS_FIL_NOCOPY		0x40	/* copy-protected file */
+#define HFS_FIL_USED		0x80	/* open */
+
+/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */
+#define HFS_DIR_LOCK		0x01	/* locked */
+#define HFS_DIR_THD		0x02	/* directory thread */
+#define HFS_DIR_INEXPFOLDER	0x04	/* in a shared area */
+#define HFS_DIR_MOUNTED		0x08	/* mounted */
+#define HFS_DIR_DIR		0x10	/* directory (always set) */
+#define HFS_DIR_EXPFOLDER	0x20	/* share point */
+
+/* bits in hfs_finfo.fdFlags */
+#define HFS_FLG_INITED		0x0100
+#define HFS_FLG_LOCKED		0x1000
+#define HFS_FLG_INVISIBLE	0x4000
+
+/* Some special File ID numbers */
+#define HFS_POR_CNID		1	/* Parent Of the Root */
+#define HFSPLUS_POR_CNID	HFS_POR_CNID
+#define HFS_ROOT_CNID		2	/* ROOT directory */
+#define HFSPLUS_ROOT_CNID	HFS_ROOT_CNID
+#define HFS_EXT_CNID		3	/* EXTents B-tree */
+#define HFSPLUS_EXT_CNID	HFS_EXT_CNID
+#define HFS_CAT_CNID		4	/* CATalog B-tree */
+#define HFSPLUS_CAT_CNID	HFS_CAT_CNID
+#define HFS_BAD_CNID		5	/* BAD blocks file */
+#define HFSPLUS_BAD_CNID	HFS_BAD_CNID
+#define HFS_ALLOC_CNID		6	/* ALLOCation file (HFS+) */
+#define HFSPLUS_ALLOC_CNID	HFS_ALLOC_CNID
+#define HFS_START_CNID		7	/* STARTup file (HFS+) */
+#define HFSPLUS_START_CNID	HFS_START_CNID
+#define HFS_ATTR_CNID		8	/* ATTRibutes file (HFS+) */
+#define HFSPLUS_ATTR_CNID	HFS_ATTR_CNID
+#define HFS_EXCH_CNID		15	/* ExchangeFiles temp id */
+#define HFSPLUS_EXCH_CNID	HFS_EXCH_CNID
+#define HFS_FIRSTUSER_CNID	16	/* first available user id */
+#define HFSPLUS_FIRSTUSER_CNID	HFS_FIRSTUSER_CNID
+
+/*======== HFS/HFS+ structures as they appear on the disk ========*/
+
+typedef __be32 hfsplus_cnid;
+typedef __be16 hfsplus_unichr;
+
+/* Pascal-style string of up to 31 characters */
+struct hfs_name {
+	u8 len;
+	u8 name[HFS_NAMELEN];
+} __packed;
+
+/* A "string" as used in filenames, etc. */
+struct hfsplus_unistr {
+	__be16 length;
+	hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];
+} __packed;
+
+/*
+ * A "string" is used in attributes file
+ * for name of extended attribute
+ */
+struct hfsplus_attr_unistr {
+	__be16 length;
+	hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN];
+} __packed;
+
+struct hfs_extent {
+	__be16 block;
+	__be16 count;
+};
+typedef struct hfs_extent hfs_extent_rec[3];
+
+/* A single contiguous area of a file */
+struct hfsplus_extent {
+	__be32 start_block;
+	__be32 block_count;
+} __packed;
+typedef struct hfsplus_extent hfsplus_extent_rec[8];
+
+/* Information for a "Fork" in a file */
+struct hfsplus_fork_raw {
+	__be64 total_size;
+	__be32 clump_size;
+	__be32 total_blocks;
+	hfsplus_extent_rec extents;
+} __packed;
+
+struct hfs_mdb {
+	__be16 drSigWord;		/* Signature word indicating fs type */
+	__be32 drCrDate;		/* fs creation date/time */
+	__be32 drLsMod;			/* fs modification date/time */
+	__be16 drAtrb;			/* fs attributes */
+	__be16 drNmFls;			/* number of files in root directory */
+	__be16 drVBMSt;			/* location (in 512-byte blocks)
+					   of the volume bitmap */
+	__be16 drAllocPtr;		/* location (in allocation blocks)
+					   to begin next allocation search */
+	__be16 drNmAlBlks;		/* number of allocation blocks */
+	__be32 drAlBlkSiz;		/* bytes in an allocation block */
+	__be32 drClpSiz;		/* clumpsize, the number of bytes to
+					   allocate when extending a file */
+	__be16 drAlBlSt;		/* location (in 512-byte blocks)
+					   of the first allocation block */
+	__be32 drNxtCNID;		/* CNID to assign to the next
+					   file or directory created */
+	__be16 drFreeBks;		/* number of free allocation blocks */
+	u8 drVN[28];			/* the volume label */
+	__be32 drVolBkUp;		/* fs backup date/time */
+	__be16 drVSeqNum;		/* backup sequence number */
+	__be32 drWrCnt;			/* fs write count */
+	__be32 drXTClpSiz;		/* clumpsize for the extents B-tree */
+	__be32 drCTClpSiz;		/* clumpsize for the catalog B-tree */
+	__be16 drNmRtDirs;		/* number of directories in
+					   the root directory */
+	__be32 drFilCnt;		/* number of files in the fs */
+	__be32 drDirCnt;		/* number of directories in the fs */
+	u8 drFndrInfo[32];		/* data used by the Finder */
+	__be16 drEmbedSigWord;		/* embedded volume signature */
+	__be32 drEmbedExtent;		/* starting block number (xdrStABN)
+					   and number of allocation blocks
+					   (xdrNumABlks) occupied by embedded
+					   volume */
+	__be32 drXTFlSize;		/* bytes in the extents B-tree */
+	hfs_extent_rec drXTExtRec;	/* extents B-tree's first 3 extents */
+	__be32 drCTFlSize;		/* bytes in the catalog B-tree */
+	hfs_extent_rec drCTExtRec;	/* catalog B-tree's first 3 extents */
+} __packed;
+
+/* HFS+ Volume Header */
+struct hfsplus_vh {
+	__be16 signature;
+	__be16 version;
+	__be32 attributes;
+	__be32 last_mount_vers;
+	u32 reserved;
+
+	__be32 create_date;
+	__be32 modify_date;
+	__be32 backup_date;
+	__be32 checked_date;
+
+	__be32 file_count;
+	__be32 folder_count;
+
+	__be32 blocksize;
+	__be32 total_blocks;
+	__be32 free_blocks;
+
+	__be32 next_alloc;
+	__be32 rsrc_clump_sz;
+	__be32 data_clump_sz;
+	hfsplus_cnid next_cnid;
+
+	__be32 write_count;
+	__be64 encodings_bmp;
+
+	u32 finder_info[8];
+
+	struct hfsplus_fork_raw alloc_file;
+	struct hfsplus_fork_raw ext_file;
+	struct hfsplus_fork_raw cat_file;
+	struct hfsplus_fork_raw attr_file;
+	struct hfsplus_fork_raw start_file;
+} __packed;
+
+/* HFS+ volume attributes */
+#define HFSPLUS_VOL_UNMNT		(1 << 8)
+#define HFSPLUS_VOL_SPARE_BLK		(1 << 9)
+#define HFSPLUS_VOL_NOCACHE		(1 << 10)
+#define HFSPLUS_VOL_INCNSTNT		(1 << 11)
+#define HFSPLUS_VOL_NODEID_REUSED	(1 << 12)
+#define HFSPLUS_VOL_JOURNALED		(1 << 13)
+#define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
+#define HFSPLUS_VOL_UNUSED_NODE_FIX	(1 << 31)
+
+struct hfs_point {
+	__be16 v;
+	__be16 h;
+} __packed;
+
+typedef struct hfs_point hfsp_point;
+
+struct hfs_rect {
+	__be16 top;
+	__be16 left;
+	__be16 bottom;
+	__be16 right;
+} __packed;
+
+typedef struct hfs_rect hfsp_rect;
+
+struct hfs_finfo {
+	__be32 fdType;
+	__be32 fdCreator;
+	__be16 fdFlags;
+	struct hfs_point fdLocation;
+	__be16 fdFldr;
+} __packed;
+
+typedef struct hfs_finfo FInfo;
+
+struct hfs_fxinfo {
+	__be16 fdIconID;
+	u8 fdUnused[8];
+	__be16 fdComment;
+	__be32 fdPutAway;
+} __packed;
+
+typedef struct hfs_fxinfo FXInfo;
+
+struct hfs_dinfo {
+	struct hfs_rect frRect;
+	__be16 frFlags;
+	struct hfs_point frLocation;
+	__be16 frView;
+} __packed;
+
+typedef struct hfs_dinfo DInfo;
+
+struct hfs_dxinfo {
+	struct hfs_point frScroll;
+	__be32 frOpenChain;
+	__be16 frUnused;
+	__be16 frComment;
+	__be32 frPutAway;
+} __packed;
+
+typedef struct hfs_dxinfo DXInfo;
+
+union hfs_finder_info {
+	struct {
+		struct hfs_finfo finfo;
+		struct hfs_fxinfo fxinfo;
+	} file;
+	struct {
+		struct hfs_dinfo dinfo;
+		struct hfs_dxinfo dxinfo;
+	} dir;
+} __packed;
+
+/* The key used in the catalog b-tree: */
+struct hfs_cat_key {
+	u8 key_len;		/* number of bytes in the key */
+	u8 reserved;		/* padding */
+	__be32 ParID;		/* CNID of the parent dir */
+	struct hfs_name	CName;	/* The filename of the entry */
+} __packed;
+
+/* HFS+ catalog entry key */
+struct hfsplus_cat_key {
+	__be16 key_len;
+	hfsplus_cnid parent;
+	struct hfsplus_unistr name;
+} __packed;
+
+#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
+
+/* The key used in the extents b-tree: */
+struct hfs_ext_key {
+	u8 key_len;		/* number of bytes in the key */
+	u8 FkType;		/* HFS_FK_{DATA,RSRC} */
+	__be32 FNum;		/* The File ID of the file */
+	__be16 FABN;		/* allocation blocks number*/
+} __packed;
+
+/* HFS+ extents tree key */
+struct hfsplus_ext_key {
+	__be16 key_len;
+	u8 fork_type;
+	u8 pad;
+	hfsplus_cnid cnid;
+	__be32 start_block;
+} __packed;
+
+#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
+
+typedef union hfs_btree_key {
+	u8 key_len;			/* number of bytes in the key */
+	struct hfs_cat_key cat;
+	struct hfs_ext_key ext;
+} hfs_btree_key;
+
+#define HFS_MAX_CAT_KEYLEN	(sizeof(struct hfs_cat_key) - sizeof(u8))
+#define HFS_MAX_EXT_KEYLEN	(sizeof(struct hfs_ext_key) - sizeof(u8))
+
+typedef union hfs_btree_key btree_key;
+
+/* The catalog record for a file */
+struct hfs_cat_file {
+	s8 type;			/* The type of entry */
+	u8 reserved;
+	u8 Flags;			/* Flags such as read-only */
+	s8 Typ;				/* file version number = 0 */
+	struct hfs_finfo UsrWds;	/* data used by the Finder */
+	__be32 FlNum;			/* The CNID */
+	__be16 StBlk;			/* obsolete */
+	__be32 LgLen;			/* The logical EOF of the data fork*/
+	__be32 PyLen;			/* The physical EOF of the data fork */
+	__be16 RStBlk;			/* obsolete */
+	__be32 RLgLen;			/* The logical EOF of the rsrc fork */
+	__be32 RPyLen;			/* The physical EOF of the rsrc fork */
+	__be32 CrDat;			/* The creation date */
+	__be32 MdDat;			/* The modified date */
+	__be32 BkDat;			/* The last backup date */
+	struct hfs_fxinfo FndrInfo;	/* more data for the Finder */
+	__be16 ClpSize;			/* number of bytes to allocate
+					   when extending files */
+	hfs_extent_rec ExtRec;		/* first extent record
+					   for the data fork */
+	hfs_extent_rec RExtRec;		/* first extent record
+					   for the resource fork */
+	u32 Resrv;			/* reserved by Apple */
+} __packed;
+
+/* the catalog record for a directory */
+struct hfs_cat_dir {
+	s8 type;			/* The type of entry */
+	u8 reserved;
+	__be16 Flags;			/* flags */
+	__be16 Val;			/* Valence: number of files and
+					   dirs in the directory */
+	__be32 DirID;			/* The CNID */
+	__be32 CrDat;			/* The creation date */
+	__be32 MdDat;			/* The modification date */
+	__be32 BkDat;			/* The last backup date */
+	struct hfs_dinfo UsrInfo;	/* data used by the Finder */
+	struct hfs_dxinfo FndrInfo;	/* more data used by Finder */
+	u8 Resrv[16];			/* reserved by Apple */
+} __packed;
+
+/* the catalog record for a thread */
+struct hfs_cat_thread {
+	s8 type;			/* The type of entry */
+	u8 reserved[9];			/* reserved by Apple */
+	__be32 ParID;			/* CNID of parent directory */
+	struct hfs_name CName;		/* The name of this entry */
+}  __packed;
+
+/* A catalog tree record */
+typedef union hfs_cat_rec {
+	s8 type;			/* The type of entry */
+	struct hfs_cat_file file;
+	struct hfs_cat_dir dir;
+	struct hfs_cat_thread thread;
+} hfs_cat_rec;
+
+/* POSIX permissions */
+struct hfsplus_perm {
+	__be32 owner;
+	__be32 group;
+	u8  rootflags;
+	u8  userflags;
+	__be16 mode;
+	__be32 dev;
+} __packed;
+
+#define HFSPLUS_FLG_NODUMP	0x01
+#define HFSPLUS_FLG_IMMUTABLE	0x02
+#define HFSPLUS_FLG_APPEND	0x04
+
+/* HFS/HFS+ BTree node descriptor */
+struct hfs_bnode_desc {
+	__be32 next;		/* (V) Number of the next node at this level */
+	__be32 prev;		/* (V) Number of the prev node at this level */
+	u8 type;		/* (F) The type of node */
+	u8 height;		/* (F) The level of this node (leaves=1) */
+	__be16 num_recs;	/* (V) The number of records in this node */
+	u16 reserved;
+} __packed;
+
+/* HFS/HFS+ BTree node types */
+#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
+#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
+#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
+
+/* HFS/HFS+ BTree header */
+struct hfs_btree_header_rec {
+	__be16 depth;		/* (V) The number of levels in this B-tree */
+	__be32 root;		/* (V) The node number of the root node */
+	__be32 leaf_count;	/* (V) The number of leaf records */
+	__be32 leaf_head;	/* (V) The number of the first leaf node */
+	__be32 leaf_tail;	/* (V) The number of the last leaf node */
+	__be16 node_size;	/* (F) The number of bytes in a node (=512) */
+	__be16 max_key_len;	/* (F) The length of a key in an index node */
+	__be32 node_count;	/* (V) The total number of nodes */
+	__be32 free_nodes;	/* (V) The number of unused nodes */
+	u16 reserved1;
+	__be32 clump_size;	/* (F) clump size. not usually used. */
+	u8 btree_type;		/* (F) BTree type */
+	u8 key_type;
+	__be32 attributes;	/* (F) attributes */
+	u32 reserved3[16];
+} __packed;
+
+/* BTree attributes */
+#define BTREE_ATTR_BADCLOSE	0x00000001	/* b-tree not closed properly. not
+						   used by hfsplus. */
+#define HFS_TREE_BIGKEYS	0x00000002	/* key length is u16 instead of u8.
+						   used by hfsplus. */
+#define HFS_TREE_VARIDXKEYS	0x00000004	/* variable key length instead of
+						   max key length. used in catalog
+						   b-tree but not in extents
+						   b-tree (hfsplus). */
+
+/* HFS+ BTree misc info */
+#define HFSPLUS_TREE_HEAD			0
+#define HFSPLUS_NODE_MXSZ			32768
+#define HFSPLUS_ATTR_TREE_NODE_SIZE		8192
+#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT	3
+#define HFSPLUS_BTREE_HDR_USER_BYTES		128
+
+/* btree key type */
+#define HFSPLUS_KEY_CASEFOLDING		0xCF	/* case-insensitive */
+#define HFSPLUS_KEY_BINARY		0xBC	/* case-sensitive */
+
+/* HFS+ folder data (part of an hfsplus_cat_entry) */
+struct hfsplus_cat_folder {
+	__be16 type;
+	__be16 flags;
+	__be32 valence;
+	hfsplus_cnid id;
+	__be32 create_date;
+	__be32 content_mod_date;
+	__be32 attribute_mod_date;
+	__be32 access_date;
+	__be32 backup_date;
+	struct hfsplus_perm permissions;
+	struct_group_attr(info, __packed,
+		DInfo user_info;
+		DXInfo finder_info;
+	);
+	__be32 text_encoding;
+	__be32 subfolders;	/* Subfolder count in HFSX. Reserved in HFS+. */
+} __packed;
+
+/* HFS+ file data (part of a cat_entry) */
+struct hfsplus_cat_file {
+	__be16 type;
+	__be16 flags;
+	u32 reserved1;
+	hfsplus_cnid id;
+	__be32 create_date;
+	__be32 content_mod_date;
+	__be32 attribute_mod_date;
+	__be32 access_date;
+	__be32 backup_date;
+	struct hfsplus_perm permissions;
+	struct_group_attr(info, __packed,
+		FInfo user_info;
+		FXInfo finder_info;
+	);
+	__be32 text_encoding;
+	u32 reserved2;
+
+	struct hfsplus_fork_raw data_fork;
+	struct hfsplus_fork_raw rsrc_fork;
+} __packed;
+
+/* File and folder flag bits */
+#define HFSPLUS_FILE_LOCKED		0x0001
+#define HFSPLUS_FILE_THREAD_EXISTS	0x0002
+#define HFSPLUS_XATTR_EXISTS		0x0004
+#define HFSPLUS_ACL_EXISTS		0x0008
+#define HFSPLUS_HAS_FOLDER_COUNT	0x0010	/* Folder has subfolder count
+						 * (HFSX only) */
+
+/* HFS+ catalog thread (part of a cat_entry) */
+struct hfsplus_cat_thread {
+	__be16 type;
+	s16 reserved;
+	hfsplus_cnid parentID;
+	struct hfsplus_unistr nodeName;
+} __packed;
+
+#define HFSPLUS_MIN_THREAD_SZ		10
+
+/* A data record in the catalog tree */
+typedef union {
+	__be16 type;
+	struct hfsplus_cat_folder folder;
+	struct hfsplus_cat_file file;
+	struct hfsplus_cat_thread thread;
+} __packed hfsplus_cat_entry;
+
+/* HFS+ catalog entry type */
+#define HFSPLUS_FOLDER		0x0001
+#define HFSPLUS_FILE		0x0002
+#define HFSPLUS_FOLDER_THREAD	0x0003
+#define HFSPLUS_FILE_THREAD	0x0004
+
+#define HFSPLUS_XATTR_FINDER_INFO_NAME	"com.apple.FinderInfo"
+#define HFSPLUS_XATTR_ACL_NAME		"com.apple.system.Security"
+
+#define HFSPLUS_ATTR_INLINE_DATA	0x10
+#define HFSPLUS_ATTR_FORK_DATA		0x20
+#define HFSPLUS_ATTR_EXTENTS		0x30
+
+/* HFS+ attributes tree key */
+struct hfsplus_attr_key {
+	__be16 key_len;
+	__be16 pad;
+	hfsplus_cnid cnid;
+	__be32 start_block;
+	struct hfsplus_attr_unistr key_name;
+} __packed;
+
+#define HFSPLUS_ATTR_KEYLEN	sizeof(struct hfsplus_attr_key)
+
+/* HFS+ fork data attribute */
+struct hfsplus_attr_fork_data {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_fork_raw the_fork;
+} __packed;
+
+/* HFS+ extension attribute */
+struct hfsplus_attr_extents {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_extent extents;
+} __packed;
+
+#define HFSPLUS_MAX_INLINE_DATA_SIZE	3802
+
+/* HFS+ attribute inline data */
+struct hfsplus_attr_inline_data {
+	__be32 record_type;
+	__be32 reserved1;
+	u8 reserved2[6];
+	__be16 length;
+	u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE];
+} __packed;
+
+/* A data record in the attributes tree */
+typedef union {
+	__be32 record_type;
+	struct hfsplus_attr_fork_data fork_data;
+	struct hfsplus_attr_extents extents;
+	struct hfsplus_attr_inline_data inline_data;
+} __packed hfsplus_attr_entry;
+
+/* HFS+ generic BTree key */
+typedef union {
+	__be16 key_len;
+	struct hfsplus_cat_key cat;
+	struct hfsplus_ext_key ext;
+	struct hfsplus_attr_key attr;
+} __packed hfsplus_btree_key;
+
 #endif /* _HFS_COMMON_H_ */
-- 
cgit v1.2.3


From 8f6ddc0587606c4be7ffcbdb20a4a99647e0c362 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Tue, 25 Nov 2025 22:58:50 +0800
Subject: bpf: Introduce internal bpf_map_check_op_flags helper function

It is to unify map flags checking for lookup_elem, update_elem,
lookup_batch and update_batch APIs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20251125145857.98134-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 11 +++++++++++
 kernel/bpf/syscall.c | 34 +++++++++++-----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a9b788c7b4aa..6498be4c44f8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3829,4 +3829,15 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
 }
 #endif
 
+static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags)
+{
+	if (flags & ~allowed_flags)
+		return -EINVAL;
+
+	if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+		return -EINVAL;
+
+	return 0;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 792623a7c90b..cef8963d69f9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1725,9 +1725,6 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 		return -EINVAL;
 
-	if (attr->flags & ~BPF_F_LOCK)
-		return -EINVAL;
-
 	CLASS(fd, f)(attr->map_fd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -1735,9 +1732,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
 		return -EPERM;
 
-	if ((attr->flags & BPF_F_LOCK) &&
-	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
-		return -EINVAL;
+	err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+	if (err)
+		return err;
 
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key))
@@ -1800,11 +1797,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto err_put;
 	}
 
-	if ((attr->flags & BPF_F_LOCK) &&
-	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
-		err = -EINVAL;
+	err = bpf_map_check_op_flags(map, attr->flags, ~0);
+	if (err)
 		goto err_put;
-	}
 
 	key = ___bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
@@ -2008,13 +2003,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	void *key, *value;
 	int err = 0;
 
-	if (attr->batch.elem_flags & ~BPF_F_LOCK)
-		return -EINVAL;
-
-	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
-	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
-		return -EINVAL;
-	}
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	if (err)
+		return err;
 
 	value_size = bpf_map_value_size(map);
 
@@ -2071,12 +2062,9 @@ int generic_map_lookup_batch(struct bpf_map *map,
 	u32 value_size, cp, max_count;
 	int err;
 
-	if (attr->batch.elem_flags & ~BPF_F_LOCK)
-		return -EINVAL;
-
-	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
-	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
-		return -EINVAL;
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	if (err)
+		return err;
 
 	value_size = bpf_map_value_size(map);
 
-- 
cgit v1.2.3


From 68e83f3472667aac18d577587102f4bf77d0bd06 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Thu, 20 Nov 2025 17:44:27 +0000
Subject: tools: ynl-gen: add regeneration comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a comment on regeneration to the generated files.

The comment is placed after the YNL-GEN line[1], so as not to interfere
with ynl-regen.sh's detection logic.

[1] and after the optional YNL-ARG line.

Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/android/binder_netlink.c               | 1 +
 drivers/android/binder_netlink.h               | 1 +
 drivers/dpll/dpll_nl.c                         | 1 +
 drivers/dpll/dpll_nl.h                         | 1 +
 drivers/net/ovpn/netlink-gen.c                 | 1 +
 drivers/net/ovpn/netlink-gen.h                 | 1 +
 drivers/net/team/team_nl.c                     | 1 +
 drivers/net/team/team_nl.h                     | 1 +
 fs/lockd/netlink.c                             | 1 +
 fs/lockd/netlink.h                             | 1 +
 fs/nfsd/netlink.c                              | 1 +
 fs/nfsd/netlink.h                              | 1 +
 include/uapi/linux/android/binder_netlink.h    | 1 +
 include/uapi/linux/dpll.h                      | 1 +
 include/uapi/linux/ethtool_netlink_generated.h | 1 +
 include/uapi/linux/fou.h                       | 1 +
 include/uapi/linux/handshake.h                 | 1 +
 include/uapi/linux/if_team.h                   | 1 +
 include/uapi/linux/lockd_netlink.h             | 1 +
 include/uapi/linux/mptcp_pm.h                  | 1 +
 include/uapi/linux/net_shaper.h                | 1 +
 include/uapi/linux/netdev.h                    | 1 +
 include/uapi/linux/nfsd_netlink.h              | 1 +
 include/uapi/linux/ovpn.h                      | 1 +
 include/uapi/linux/psp.h                       | 1 +
 net/core/netdev-genl-gen.c                     | 1 +
 net/core/netdev-genl-gen.h                     | 1 +
 net/devlink/netlink_gen.c                      | 1 +
 net/devlink/netlink_gen.h                      | 1 +
 net/handshake/genl.c                           | 1 +
 net/handshake/genl.h                           | 1 +
 net/ipv4/fou_nl.c                              | 1 +
 net/ipv4/fou_nl.h                              | 1 +
 net/mptcp/mptcp_pm_gen.c                       | 1 +
 net/mptcp/mptcp_pm_gen.h                       | 1 +
 net/psp/psp-nl-gen.c                           | 1 +
 net/psp/psp-nl-gen.h                           | 1 +
 net/shaper/shaper_nl_gen.c                     | 1 +
 net/shaper/shaper_nl_gen.h                     | 1 +
 tools/include/uapi/linux/netdev.h              | 1 +
 tools/net/ynl/pyynl/ynl_gen_c.py               | 1 +
 41 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/android/binder_netlink.c b/drivers/android/binder_netlink.c
index d05397a50ca6..81e8432b5904 100644
--- a/drivers/android/binder_netlink.c
+++ b/drivers/android/binder_netlink.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/binder.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/drivers/android/binder_netlink.h b/drivers/android/binder_netlink.h
index 882c7a6b537e..57399942a5e3 100644
--- a/drivers/android/binder_netlink.h
+++ b/drivers/android/binder_netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/binder.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_BINDER_GEN_H
 #define _LINUX_BINDER_GEN_H
diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c
index 3c6d570babf8..36d11ff195df 100644
--- a/drivers/dpll/dpll_nl.c
+++ b/drivers/dpll/dpll_nl.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/dpll.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h
index 3da10cfe9a6e..7419679b6977 100644
--- a/drivers/dpll/dpll_nl.h
+++ b/drivers/dpll/dpll_nl.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/dpll.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_DPLL_GEN_H
 #define _LINUX_DPLL_GEN_H
diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c
index 14298188c5f1..ecbe9dcf4f7d 100644
--- a/drivers/net/ovpn/netlink-gen.c
+++ b/drivers/net/ovpn/netlink-gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/ovpn.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h
index 220b5b2fdd4f..b2301580770f 100644
--- a/drivers/net/ovpn/netlink-gen.h
+++ b/drivers/net/ovpn/netlink-gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/ovpn.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_OVPN_GEN_H
 #define _LINUX_OVPN_GEN_H
diff --git a/drivers/net/team/team_nl.c b/drivers/net/team/team_nl.c
index 208424ab78f5..6db21725f9cc 100644
--- a/drivers/net/team/team_nl.c
+++ b/drivers/net/team/team_nl.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/team.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/drivers/net/team/team_nl.h b/drivers/net/team/team_nl.h
index c9ec1b22ac4d..74816b193475 100644
--- a/drivers/net/team/team_nl.h
+++ b/drivers/net/team/team_nl.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/team.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_TEAM_GEN_H
 #define _LINUX_TEAM_GEN_H
diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c
index 6e00b02cad90..880c42b4f8c3 100644
--- a/fs/lockd/netlink.c
+++ b/fs/lockd/netlink.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/lockd.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h
index 1920543a7955..d8408f077dd8 100644
--- a/fs/lockd/netlink.h
+++ b/fs/lockd/netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/lockd.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_LOCKD_GEN_H
 #define _LINUX_LOCKD_GEN_H
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index ca54aa583530..ac51a44e1065 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/nfsd.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index 8eb903f24c41..478117ff6b8c 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/nfsd.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_NFSD_GEN_H
 #define _LINUX_NFSD_GEN_H
diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h
index b218f96d6668..bf69833c9a19 100644
--- a/include/uapi/linux/android/binder_netlink.h
+++ b/include/uapi/linux/android/binder_netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/binder.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H
 #define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 69d35570ac4f..b7ff9c44f9aa 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/dpll.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_DPLL_H
 #define _UAPI_LINUX_DPLL_H
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index b71b175df46d..556a0c834df5 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/ethtool.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H
 #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H
diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h
index b5cd3e7b3775..bb6bef74d2d1 100644
--- a/include/uapi/linux/fou.h
+++ b/include/uapi/linux/fou.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_FOU_H
 #define _UAPI_LINUX_FOU_H
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
index 662e7de46c54..d7e40f594888 100644
--- a/include/uapi/linux/handshake.h
+++ b/include/uapi/linux/handshake.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/handshake.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_HANDSHAKE_H
 #define _UAPI_LINUX_HANDSHAKE_H
diff --git a/include/uapi/linux/if_team.h b/include/uapi/linux/if_team.h
index a5c06243a435..f4cd839ae725 100644
--- a/include/uapi/linux/if_team.h
+++ b/include/uapi/linux/if_team.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/team.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_IF_TEAM_H
 #define _UAPI_LINUX_IF_TEAM_H
diff --git a/include/uapi/linux/lockd_netlink.h b/include/uapi/linux/lockd_netlink.h
index 21c65aec3bc6..2d766a0fa6ea 100644
--- a/include/uapi/linux/lockd_netlink.h
+++ b/include/uapi/linux/lockd_netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/lockd.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_LOCKD_NETLINK_H
 #define _UAPI_LINUX_LOCKD_NETLINK_H
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
index bf44a5cf5b5a..c97d060ee90b 100644
--- a/include/uapi/linux/mptcp_pm.h
+++ b/include/uapi/linux/mptcp_pm.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/mptcp_pm.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_MPTCP_PM_H
 #define _UAPI_LINUX_MPTCP_PM_H
diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h
index d8834b59f7d7..3dd22c2930d9 100644
--- a/include/uapi/linux/net_shaper.h
+++ b/include/uapi/linux/net_shaper.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/net_shaper.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_NET_SHAPER_H
 #define _UAPI_LINUX_NET_SHAPER_H
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 048c8de1a130..e0b579a1df4f 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_NETDEV_H
 #define _UAPI_LINUX_NETDEV_H
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index 887cbd12b695..e157e2009ea8 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/nfsd.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_NFSD_NETLINK_H
 #define _UAPI_LINUX_NFSD_NETLINK_H
diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h
index 680d1522dc87..959b41def61f 100644
--- a/include/uapi/linux/ovpn.h
+++ b/include/uapi/linux/ovpn.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/ovpn.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_OVPN_H
 #define _UAPI_LINUX_OVPN_H
diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h
index d8449c043ba1..a3a336488dc3 100644
--- a/include/uapi/linux/psp.h
+++ b/include/uapi/linux/psp.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/psp.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_PSP_H
 #define _UAPI_LINUX_PSP_H
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ff20435c45d2..ba673e81716f 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index cf3fad74511f..cffc08517a41 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_NETDEV_GEN_H
 #define _LINUX_NETDEV_GEN_H
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index 580985025f49..f4c61c2b4f22 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/devlink.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
index 09cc6f264ccf..2817d53a0eba 100644
--- a/net/devlink/netlink_gen.h
+++ b/net/devlink/netlink_gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/devlink.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_DEVLINK_GEN_H
 #define _LINUX_DEVLINK_GEN_H
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
index f55d14d7b726..870612609491 100644
--- a/net/handshake/genl.c
+++ b/net/handshake/genl.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/handshake.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
index ae72a596f6cc..8d3e18672daf 100644
--- a/net/handshake/genl.h
+++ b/net/handshake/genl.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/handshake.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_HANDSHAKE_GEN_H
 #define _LINUX_HANDSHAKE_GEN_H
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 506260b4a4dc..7a99639204b1 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
index 63a6c4ed803d..438342dc8507 100644
--- a/net/ipv4/fou_nl.h
+++ b/net/ipv4/fou_nl.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_FOU_GEN_H
 #define _LINUX_FOU_GEN_H
diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c
index dcffd847af33..c180930a8e0e 100644
--- a/net/mptcp/mptcp_pm_gen.c
+++ b/net/mptcp/mptcp_pm_gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/mptcp_pm.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h
index e24258f6f819..b9280bcb43f5 100644
--- a/net/mptcp/mptcp_pm_gen.h
+++ b/net/mptcp/mptcp_pm_gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/mptcp_pm.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_MPTCP_PM_GEN_H
 #define _LINUX_MPTCP_PM_GEN_H
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
index 73f8b06d66f0..22a48d0fa378 100644
--- a/net/psp/psp-nl-gen.c
+++ b/net/psp/psp-nl-gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/psp.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
index 5bc3b5d5a53e..599c5f1c82f2 100644
--- a/net/psp/psp-nl-gen.h
+++ b/net/psp/psp-nl-gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/psp.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_PSP_GEN_H
 #define _LINUX_PSP_GEN_H
diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c
index 204c8ae8c7b1..e8cccc4c1180 100644
--- a/net/shaper/shaper_nl_gen.c
+++ b/net/shaper/shaper_nl_gen.c
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/net_shaper.yaml */
 /* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h
index cb7f9026fc23..ec41c90431a4 100644
--- a/net/shaper/shaper_nl_gen.h
+++ b/net/shaper/shaper_nl_gen.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/net_shaper.yaml */
 /* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _LINUX_NET_SHAPER_GEN_H
 #define _LINUX_NET_SHAPER_GEN_H
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 048c8de1a130..e0b579a1df4f 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -2,6 +2,7 @@
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
 #ifndef _UAPI_LINUX_NETDEV_H
 #define _UAPI_LINUX_NETDEV_H
diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 0dd589e502cb..b517d0c605ad 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -3442,6 +3442,7 @@ def main():
         if args.fn_prefix:
             line += f' --function-prefix {args.fn_prefix}'
         cw.p(f'/* YNL-ARG{line} */')
+    cw.p('/* To regenerate run: tools/net/ynl/ynl-regen.sh */')
     cw.nl()
 
     if args.mode == 'uapi':
-- 
cgit v1.2.3


From 3a6e8fd0bf4042c572dc52e634878b9aca02970d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Nov 2025 17:50:10 +0000
Subject: tcp: rename icsk_timeout() to tcp_timeout_expires()

In preparation for the introduction of sk->tcp_timeout_timer,
rename the icsk_timeout() helper and change its argument to a plain
'const struct sock *sk'.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h | 5 ++---
 net/ipv4/inet_diag.c               | 4 ++--
 net/ipv4/tcp_ipv4.c                | 4 ++--
 net/ipv4/tcp_timer.c               | 6 +++---
 net/ipv6/tcp_ipv6.c                | 4 ++--
 net/mptcp/protocol.c               | 2 +-
 6 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index fd40af2221b9..765c2149d678 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -184,10 +184,9 @@ static inline void inet_csk_delack_init(struct sock *sk)
 	memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
 }
 
-static inline unsigned long
-icsk_timeout(const struct inet_connection_sock *icsk)
+static inline unsigned long tcp_timeout_expires(const struct sock *sk)
 {
-	return READ_ONCE(icsk->icsk_retransmit_timer.expires);
+	return READ_ONCE(inet_csk(sk)->icsk_retransmit_timer.expires);
 }
 
 static inline unsigned long
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index f0b6c5a411a2..9f63c09439a0 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -287,12 +287,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		r->idiag_timer = 1;
 		r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
 		r->idiag_expires =
-			jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
+			jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
 		r->idiag_timer = 4;
 		r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
 		r->idiag_expires =
-			jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
+			jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
 	} else if (timer_pending(&sk->sk_timer)) {
 		r->idiag_timer = 2;
 		r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e0bb8d9e2d9c..7b8af2c8d03a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2869,10 +2869,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
-		timer_expires	= icsk_timeout(icsk);
+		timer_expires	= tcp_timeout_expires(sk);
 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
-		timer_expires	= icsk_timeout(icsk);
+		timer_expires	= tcp_timeout_expires(sk);
 	} else if (timer_pending(&sk->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sk->sk_timer.expires;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0672c3d8f4f1..afbd901e610e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -510,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
 	 * and tp->rcv_tstamp might very well have been written recently.
 	 * rcv_delta can thus be negative.
 	 */
-	rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp;
+	rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
 	if (rcv_delta <= timeout)
 		return false;
 
@@ -697,9 +697,9 @@ void tcp_write_timer_handler(struct sock *sk)
 	    !icsk->icsk_pending)
 		return;
 
-	if (time_after(icsk_timeout(icsk), jiffies)) {
+	if (time_after(tcp_timeout_expires(sk), jiffies)) {
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
-			       icsk_timeout(icsk));
+			       tcp_timeout_expires(sk));
 		return;
 	}
 	tcp_mstamp_refresh(tcp_sk(sk));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 08113f430124..33c76c3a6da7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2163,10 +2163,10 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
-		timer_expires	= icsk_timeout(icsk);
+		timer_expires	= tcp_timeout_expires(sp);
 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
-		timer_expires	= icsk_timeout(icsk);
+		timer_expires	= tcp_timeout_expires(sp);
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e4ccc57b6f57..4288b6a53b6e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -519,7 +519,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl
 	const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 	return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
-	       icsk_timeout(inet_csk(ssk)) - jiffies : 0;
+	       tcp_timeout_expires(ssk) - jiffies : 0;
 }
 
 static void mptcp_set_timeout(struct sock *sk)
-- 
cgit v1.2.3


From 27e8257a86516682e2ec5d7543a8909c37ae8b00 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Nov 2025 17:50:11 +0000
Subject: net: move sk_dst_pending_confirm and sk_pacing_status to sock_read_tx
 group

These two fields are mostly read in the TCP tx path, so move them
to a more appropriate group for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 4 ++--
 net/core/sock.c    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 38d48cfe0741..a89aa97151f5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -481,8 +481,6 @@ struct sock {
 		struct rb_root	tcp_rtx_queue;
 	};
 	struct sk_buff_head	sk_write_queue;
-	u32			sk_dst_pending_confirm;
-	u32			sk_pacing_status; /* see enum sk_pacing */
 	struct page_frag	sk_frag;
 	struct timer_list	sk_timer;
 
@@ -493,6 +491,8 @@ struct sock {
 	__cacheline_group_end(sock_write_tx);
 
 	__cacheline_group_begin(sock_read_tx);
+	u32			sk_dst_pending_confirm;
+	u32			sk_pacing_status; /* see enum sk_pacing */
 	unsigned long		sk_max_pacing_rate;
 	long			sk_sndtimeo;
 	u32			sk_priority;
diff --git a/net/core/sock.c b/net/core/sock.c
index b26a6cdc9bcd..45c98bf524b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4519,14 +4519,14 @@ static int __init sock_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
 
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
-- 
cgit v1.2.3


From 08dfe370239e53494453cee1e2ded2cdaa1efd12 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Nov 2025 17:50:12 +0000
Subject: tcp: introduce icsk->icsk_keepalive_timer

sk->sk_timer has been used for TCP keepalives.

Keepalive timers are not in the fast path; we want to use sk->sk_timer
storage for retransmit timers, for better cache locality.

Create icsk->icsk_keepalive_timer and change keepalive
code to no longer use sk->sk_timer.

Added space is reclaimed in the following patch.

This includes changes to MPTCP, which was also using sk_timer.

Alias icsk->mptcp_tout_timer and icsk->icsk_keepalive_timer
for the sake of inet_sk_diag_fill().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/inet_connection_sock.rst        |  1 +
 include/net/inet_connection_sock.h                            | 11 +++++++++--
 net/ipv4/inet_connection_sock.c                               |  6 +++---
 net/ipv4/inet_diag.c                                          |  4 ++--
 net/ipv4/tcp_ipv4.c                                           |  4 ++--
 net/ipv4/tcp_timer.c                                          |  9 +++++----
 net/ipv6/tcp_ipv6.c                                           |  4 ++--
 net/mptcp/protocol.c                                          | 10 ++++++----
 net/mptcp/protocol.h                                          |  2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c             |  4 ++--
 tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c             |  4 ++--
 11 files changed, 35 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/inet_connection_sock.rst b/Documentation/networking/net_cachelines/inet_connection_sock.rst
index 8fae85ebb773..4f65de2def8c 100644
--- a/Documentation/networking/net_cachelines/inet_connection_sock.rst
+++ b/Documentation/networking/net_cachelines/inet_connection_sock.rst
@@ -14,6 +14,7 @@ struct inet_bind_bucket             icsk_bind_hash         read_mostly
 struct inet_bind2_bucket            icsk_bind2_hash        read_mostly                             tcp_set_state,inet_put_port
 struct timer_list                   icsk_retransmit_timer  read_write                              inet_csk_reset_xmit_timer,tcp_connect
 struct timer_list                   icsk_delack_timer      read_mostly                             inet_csk_reset_xmit_timer,tcp_connect
+struct timer_list                   icsk_keepalive_timer
 u32                                 icsk_rto               read_write                              tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
 u32                                 icsk_rto_min
 u32                                 icsk_rto_max           read_mostly                             tcp_reset_xmit_timer
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 765c2149d678..e0d90b996348 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -57,6 +57,9 @@ struct inet_connection_sock_af_ops {
  * @icsk_bind_hash:	   Bind node
  * @icsk_bind2_hash:	   Bind node in the bhash2 table
  * @icsk_retransmit_timer: Resend (no ack)
+ * @icsk_delack_timer:     Delayed ACK timer
+ * @icsk_keepalive_timer:  Keepalive timer
+ * @mptcp_tout_timer: mptcp timer
  * @icsk_rto:		   Retransmit timeout
  * @icsk_pmtu_cookie	   Last pmtu seen by socket
  * @icsk_ca_ops		   Pluggable congestion control hook
@@ -81,8 +84,12 @@ struct inet_connection_sock {
 	struct request_sock_queue icsk_accept_queue;
 	struct inet_bind_bucket	  *icsk_bind_hash;
 	struct inet_bind2_bucket  *icsk_bind2_hash;
- 	struct timer_list	  icsk_retransmit_timer;
- 	struct timer_list	  icsk_delack_timer;
+	struct timer_list	  icsk_retransmit_timer;
+	struct timer_list	  icsk_delack_timer;
+	union {
+		struct timer_list icsk_keepalive_timer;
+		struct timer_list mptcp_tout_timer;
+	};
 	__u32			  icsk_rto;
 	__u32                     icsk_rto_min;
 	u32			  icsk_rto_max;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b4eae731c9ba..4fc09f9bf25d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -739,7 +739,7 @@ void inet_csk_init_xmit_timers(struct sock *sk,
 
 	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
 	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
-	timer_setup(&sk->sk_timer, keepalive_handler, 0);
+	timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
 
@@ -752,7 +752,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
 
 	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
-	sk_stop_timer(sk, &sk->sk_timer);
+	sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
 }
 
 void inet_csk_clear_xmit_timers_sync(struct sock *sk)
@@ -767,7 +767,7 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk)
 
 	sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer);
 	sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
-	sk_stop_timer_sync(sk, &sk->sk_timer);
+	sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
 }
 
 struct dst_entry *inet_csk_route_req(const struct sock *sk,
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 9f63c09439a0..3f5b1418a610 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -293,11 +293,11 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
 		r->idiag_expires =
 			jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
-	} else if (timer_pending(&sk->sk_timer)) {
+	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		r->idiag_timer = 2;
 		r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
 		r->idiag_expires =
-			jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
+			jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies);
 	}
 
 	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7b8af2c8d03a..f8a9596e8f4d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2873,9 +2873,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= tcp_timeout_expires(sk);
-	} else if (timer_pending(&sk->sk_timer)) {
+	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active	= 2;
-		timer_expires	= sk->sk_timer.expires;
+		timer_expires	= icsk->icsk_keepalive_timer.expires;
 	} else {
 		timer_active	= 0;
 		timer_expires = jiffies;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index afbd901e610e..d2678dfd8118 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -755,12 +755,12 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
 
 void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
 {
-	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+	sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
 }
 
 static void tcp_delete_keepalive_timer(struct sock *sk)
 {
-	sk_stop_timer(sk, &sk->sk_timer);
+	sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
 }
 
 void tcp_set_keepalive(struct sock *sk, int val)
@@ -777,8 +777,9 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
 
 static void tcp_keepalive_timer(struct timer_list *t)
 {
-	struct sock *sk = timer_container_of(sk, t, sk_timer);
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_connection_sock *icsk =
+		timer_container_of(icsk, t, icsk_keepalive_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 elapsed;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33c76c3a6da7..280fe5978559 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2167,9 +2167,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= tcp_timeout_expires(sp);
-	} else if (timer_pending(&sp->sk_timer)) {
+	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active	= 2;
-		timer_expires	= sp->sk_timer.expires;
+		timer_expires	= icsk->icsk_keepalive_timer.expires;
 	} else {
 		timer_active	= 0;
 		timer_expires = jiffies;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4288b6a53b6e..89a5f63921e6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2374,7 +2374,9 @@ static void mptcp_retransmit_timer(struct timer_list *t)
 
 static void mptcp_tout_timer(struct timer_list *t)
 {
-	struct sock *sk = timer_container_of(sk, t, sk_timer);
+	struct inet_connection_sock *icsk =
+		timer_container_of(icsk, t, mptcp_tout_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
 
 	mptcp_schedule_work(sk);
 	sock_put(sk);
@@ -2828,7 +2830,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
 	 */
 	timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
 
-	sk_reset_timer(sk, &sk->sk_timer, timeout);
+	sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout);
 }
 
 static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
@@ -2974,7 +2976,7 @@ static void __mptcp_init_sock(struct sock *sk)
 
 	/* re-use the csk retrans timer for MPTCP-level retrans */
 	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
-	timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
+	timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0);
 }
 
 static void mptcp_ca_reset(struct sock *sk)
@@ -3176,7 +3178,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	might_sleep();
 
 	mptcp_stop_rtx_timer(sk);
-	sk_stop_timer(sk, &sk->sk_timer);
+	sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
 	msk->pm.status = 0;
 	mptcp_release_sched(msk);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 313da78e2b75..9c0d17876b22 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -892,7 +892,7 @@ static inline void mptcp_stop_tout_timer(struct sock *sk)
 	if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
 		return;
 
-	sk_stop_timer(sk, &sk->sk_timer);
+	sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
 	inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
index 164640db3a29..685811326a04 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -103,9 +103,9 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp,
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active = 4;
 		timer_expires = icsk->icsk_retransmit_timer.expires;
-	} else if (timer_pending(&sp->sk_timer)) {
+	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active = 2;
-		timer_expires = sp->sk_timer.expires;
+		timer_expires = icsk->icsk_keepalive_timer.expires;
 	} else {
 		timer_active = 0;
 		timer_expires = bpf_jiffies64();
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
index 591c703f5032..0f4a92712751 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -103,9 +103,9 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp,
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active = 4;
 		timer_expires = icsk->icsk_retransmit_timer.expires;
-	} else if (timer_pending(&sp->sk_timer)) {
+	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active = 2;
-		timer_expires = sp->sk_timer.expires;
+		timer_expires = icsk->icsk_keepalive_timer.expires;
 	} else {
 		timer_active = 0;
 		timer_expires = bpf_jiffies64();
-- 
cgit v1.2.3


From 9a5e5334adc039fa652aa071ea95b18db0bc1f43 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Nov 2025 17:50:13 +0000
Subject: tcp: remove icsk->icsk_retransmit_timer

Now that sk->sk_timer is no longer used by TCP keepalive, we can use
its storage for TCP and MPTCP retransmit timers for better
cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../networking/net_cachelines/inet_connection_sock.rst    |  1 -
 include/net/inet_connection_sock.h                        |  8 +++-----
 include/net/sock.h                                        |  9 +++++++--
 net/ipv4/inet_connection_sock.c                           |  6 +++---
 net/ipv4/tcp_timer.c                                      |  8 +++-----
 net/mptcp/protocol.c                                      | 15 +++++----------
 tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c         |  4 ++--
 tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c         |  4 ++--
 8 files changed, 25 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/inet_connection_sock.rst b/Documentation/networking/net_cachelines/inet_connection_sock.rst
index 4f65de2def8c..cc2000f55c29 100644
--- a/Documentation/networking/net_cachelines/inet_connection_sock.rst
+++ b/Documentation/networking/net_cachelines/inet_connection_sock.rst
@@ -12,7 +12,6 @@ struct inet_sock                    icsk_inet              read_mostly         r
 struct request_sock_queue           icsk_accept_queue
 struct inet_bind_bucket             icsk_bind_hash         read_mostly                             tcp_set_state
 struct inet_bind2_bucket            icsk_bind2_hash        read_mostly                             tcp_set_state,inet_put_port
-struct timer_list                   icsk_retransmit_timer  read_write                              inet_csk_reset_xmit_timer,tcp_connect
 struct timer_list                   icsk_delack_timer      read_mostly                             inet_csk_reset_xmit_timer,tcp_connect
 struct timer_list                   icsk_keepalive_timer
 u32                                 icsk_rto               read_write                              tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e0d90b996348..ecb362025c4e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -56,7 +56,6 @@ struct inet_connection_sock_af_ops {
  * @icsk_accept_queue:	   FIFO of established children
  * @icsk_bind_hash:	   Bind node
  * @icsk_bind2_hash:	   Bind node in the bhash2 table
- * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_delack_timer:     Delayed ACK timer
  * @icsk_keepalive_timer:  Keepalive timer
  * @mptcp_tout_timer: mptcp timer
@@ -84,7 +83,6 @@ struct inet_connection_sock {
 	struct request_sock_queue icsk_accept_queue;
 	struct inet_bind_bucket	  *icsk_bind_hash;
 	struct inet_bind2_bucket  *icsk_bind2_hash;
-	struct timer_list	  icsk_retransmit_timer;
 	struct timer_list	  icsk_delack_timer;
 	union {
 		struct timer_list icsk_keepalive_timer;
@@ -193,7 +191,7 @@ static inline void inet_csk_delack_init(struct sock *sk)
 
 static inline unsigned long tcp_timeout_expires(const struct sock *sk)
 {
-	return READ_ONCE(inet_csk(sk)->icsk_retransmit_timer.expires);
+	return READ_ONCE(sk->tcp_retransmit_timer.expires);
 }
 
 static inline unsigned long
@@ -209,7 +207,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
 		smp_store_release(&icsk->icsk_pending, 0);
 #ifdef INET_CSK_CLEAR_TIMERS
-		sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+		sk_stop_timer(sk, &sk->tcp_retransmit_timer);
 #endif
 	} else if (what == ICSK_TIME_DACK) {
 		smp_store_release(&icsk->icsk_ack.pending, 0);
@@ -241,7 +239,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
 	    what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
 		smp_store_release(&icsk->icsk_pending, what);
-		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when);
+		sk_reset_timer(sk, &sk->tcp_retransmit_timer, when);
 	} else if (what == ICSK_TIME_DACK) {
 		smp_store_release(&icsk->icsk_ack.pending,
 				  icsk->icsk_ack.pending | ICSK_ACK_TIMER);
diff --git a/include/net/sock.h b/include/net/sock.h
index a89aa97151f5..02253c6a578b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -305,6 +305,8 @@ struct sk_filter;
   *	@sk_txrehash: enable TX hash rethink
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
+  *	@tcp_retransmit_timer: tcp retransmit timer
+  *	@mptcp_retransmit_timer: mptcp retransmit timer
   *	@sk_stamp: time stamp of last packet received
   *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
   *	@sk_tsflags: SO_TIMESTAMPING flags
@@ -482,8 +484,11 @@ struct sock {
 	};
 	struct sk_buff_head	sk_write_queue;
 	struct page_frag	sk_frag;
-	struct timer_list	sk_timer;
-
+	union {
+		struct timer_list	sk_timer;
+		struct timer_list	tcp_retransmit_timer;
+		struct timer_list	mptcp_retransmit_timer;
+	};
 	unsigned long		sk_pacing_rate; /* bytes per second */
 	atomic_t		sk_zckey;
 	atomic_t		sk_tskey;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4fc09f9bf25d..97d57c52b9ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -737,7 +737,7 @@ void inet_csk_init_xmit_timers(struct sock *sk,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+	timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0);
 	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
 	timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
@@ -750,7 +750,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
 	smp_store_release(&icsk->icsk_pending, 0);
 	smp_store_release(&icsk->icsk_ack.pending, 0);
 
-	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &sk->tcp_retransmit_timer);
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
 	sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
 }
@@ -765,7 +765,7 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk)
 	smp_store_release(&icsk->icsk_pending, 0);
 	smp_store_release(&icsk->icsk_ack.pending, 0);
 
-	sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer);
 	sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
 	sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
 }
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d2678dfd8118..160080c9021d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -698,7 +698,7 @@ void tcp_write_timer_handler(struct sock *sk)
 		return;
 
 	if (time_after(tcp_timeout_expires(sk), jiffies)) {
-		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+		sk_reset_timer(sk, &sk->tcp_retransmit_timer,
 			       tcp_timeout_expires(sk));
 		return;
 	}
@@ -725,12 +725,10 @@ void tcp_write_timer_handler(struct sock *sk)
 
 static void tcp_write_timer(struct timer_list *t)
 {
-	struct inet_connection_sock *icsk =
-			timer_container_of(icsk, t, icsk_retransmit_timer);
-	struct sock *sk = &icsk->icsk_inet.sk;
+	struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);
 
 	/* Avoid locking the socket when there is no pending event. */
-	if (!smp_load_acquire(&icsk->icsk_pending))
+	if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
 		goto out;
 
 	bh_lock_sock(sk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 89a5f63921e6..bb7d634cf312 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -411,9 +411,7 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
 
 static void mptcp_stop_rtx_timer(struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &sk->mptcp_retransmit_timer);
 	mptcp_sk(sk)->timer_ival = 0;
 }
 
@@ -954,12 +952,11 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list
 
 static bool mptcp_rtx_timer_pending(struct sock *sk)
 {
-	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+	return timer_pending(&sk->mptcp_retransmit_timer);
 }
 
 static void mptcp_reset_rtx_timer(struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
 	unsigned long tout;
 
 	/* prevent rescheduling on close */
@@ -967,7 +964,7 @@ static void mptcp_reset_rtx_timer(struct sock *sk)
 		return;
 
 	tout = mptcp_sk(sk)->timer_ival;
-	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+	sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout);
 }
 
 bool mptcp_schedule_work(struct sock *sk)
@@ -2354,9 +2351,7 @@ out_err:
 
 static void mptcp_retransmit_timer(struct timer_list *t)
 {
-	struct inet_connection_sock *icsk = timer_container_of(icsk, t,
-							       icsk_retransmit_timer);
-	struct sock *sk = &icsk->icsk_inet.sk;
+	struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer);
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
 	bh_lock_sock(sk);
@@ -2975,7 +2970,7 @@ static void __mptcp_init_sock(struct sock *sk)
 	spin_lock_init(&msk->fallback_lock);
 
 	/* re-use the csk retrans timer for MPTCP-level retrans */
-	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+	timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0);
 	timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0);
 }
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
index 685811326a04..b1e509b231cd 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -99,10 +99,10 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp,
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active = 1;
-		timer_expires = icsk->icsk_retransmit_timer.expires;
+		timer_expires = sp->tcp_retransmit_timer.expires;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active = 4;
-		timer_expires = icsk->icsk_retransmit_timer.expires;
+		timer_expires = sp->tcp_retransmit_timer.expires;
 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active = 2;
 		timer_expires = icsk->icsk_keepalive_timer.expires;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
index 0f4a92712751..dbc7166aee91 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -99,10 +99,10 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp,
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active = 1;
-		timer_expires = icsk->icsk_retransmit_timer.expires;
+		timer_expires = sp->tcp_retransmit_timer.expires;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active = 4;
-		timer_expires = icsk->icsk_retransmit_timer.expires;
+		timer_expires = sp->tcp_retransmit_timer.expires;
 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
 		timer_active = 2;
 		timer_expires = icsk->icsk_keepalive_timer.expires;
-- 
cgit v1.2.3


From 585a4f22c4f9d85e32d42be65e67c232e82e5b3a Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:03 +0100
Subject: can: bittiming: apply NL_SET_ERR_MSG() to can_calc_bittiming()

When CONFIG_CAN_CALC_BITTIMING is disabled, the can_calc_bittiming()
function cannot be used and the user needs to provide all the
bittiming parameters.

Currently, can_calc_bittiming() prints an error message to the kernel
log. Instead use NL_SET_ERR_MSG() to make it return the error message
through the netlink interface so that the user can directly see it.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-2-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index d30816dd93c7..3926c78b2222 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -141,7 +141,7 @@ static inline int
 can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt,
 		   const struct can_bittiming_const *btc, struct netlink_ext_ack *extack)
 {
-	netdev_err(dev, "bit-timing calculation not available\n");
+	NL_SET_ERR_MSG(extack, "bit-timing calculation not available\n");
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From d037d05c2e32792a6fa572b0aa3c92a8ac78589d Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:04 +0100
Subject: can: dev: can_dev_dropped_skb: drop CAN FD skbs if FD is off

Currently, the CAN FD skb validation logic is based on the MTU: the
interface is deemed FD capable if and only if its MTU is greater than
or equal to CANFD_MTU.

This logic is showing its limits with the introduction of CAN XL. For
example, consider the two scenarios below:

  1. An interface configured with CAN FD on and CAN XL on

  2. An interface configured with CAN FD off and CAN XL on

In those two scenarios, the interfaces would have the same MTU:

  CANXL_MTU

making it impossible to differentiate which one has CAN FD turned on
and which one has it off.

Because of the limitation, the only non-UAPI-breaking workaround is to
do the check at the device level using the can_priv->ctrlmode flags.
Unfortunately, the virtual interfaces (vcan, vxcan), which do not have
a can_priv, are left behind.

Add a check on the CAN_CTRLMODE_FD flag in can_dev_dropped_skb() and
drop FD frames whenever the feature is turned off.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-3-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index bd7410b5d8a6..a7a39a6101d9 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -103,12 +103,20 @@ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *s
 	if (priv->ctrlmode & CAN_CTRLMODE_LISTENONLY) {
 		netdev_info_once(dev,
 				 "interface in listen only mode, dropping skb\n");
-		kfree_skb(skb);
-		dev->stats.tx_dropped++;
-		return true;
+		goto invalid_skb;
+	}
+
+	if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) {
+		netdev_info_once(dev, "CAN FD is disabled, dropping skb\n");
+		goto invalid_skb;
 	}
 
 	return can_dropped_invalid_skb(dev, skb);
+
+invalid_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return true;
 }
 
 void can_setup(struct net_device *dev);
-- 
cgit v1.2.3


From 60f511f443e552ef5b5cd79ec2b881f4323e19c9 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:05 +0100
Subject: can: netlink: add CAN_CTRLMODE_RESTRICTED

ISO 11898-1:2024 adds a new restricted operation mode. This mode is
added as a mandatory feature for nodes which support CAN XL and is
retrofitted as optional for legacy nodes (i.e. the ones which only
support Classical CAN and CAN FD).

The restricted operation mode is nearly the same as the listen only
mode: the node can not send data frames or remote frames and can not
send dominant bits if an error occurs. The only exception is that the
node shall still send the acknowledgment bit. A second niche exception
is that the node may still send a data frame containing a time
reference message if the node is a primary time provider, but because
the time provider feature is not yet implemented in the kernel, this
second exception is not relevant to us at the moment.

Add the CAN_CTRLMODE_RESTRICTED control mode flag and update the
can_dev_dropped_skb() helper function accordingly.

Finally, bail out if both CAN_CTRLMODE_LISTENONLY and
CAN_CTRLMODE_RESTRICTED are provided.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-4-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c        |  2 ++
 drivers/net/can/dev/netlink.c    |  7 ++++++
 include/linux/can/dev.h          | 50 +++++++++++++++++++++-------------------
 include/uapi/linux/can/netlink.h |  1 +
 4 files changed, 36 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index b392483e4499..b6980d32e5b4 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -115,6 +115,8 @@ const char *can_get_ctrlmode_str(u32 ctrlmode)
 		return "TDC-AUTO";
 	case CAN_CTRLMODE_TDC_MANUAL:
 		return "TDC-MANUAL";
+	case CAN_CTRLMODE_RESTRICTED:
+		return "RESTRICTED";
 	default:
 		return "<unknown>";
 	}
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 6f83b87d54fc..87e731527dd7 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -188,6 +188,13 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
 
 		flags = cm->flags & cm->mask;
+
+		if ((flags & CAN_CTRLMODE_LISTENONLY) &&
+		    (flags & CAN_CTRLMODE_RESTRICTED)) {
+			NL_SET_ERR_MSG(extack,
+				       "LISTEN-ONLY and RESTRICTED modes are mutually exclusive");
+			return -EOPNOTSUPP;
+		}
 	}
 
 	err = can_validate_bittiming(data, extack, IFLA_CAN_BITTIMING);
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index a7a39a6101d9..ab11c0e9111b 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -95,30 +95,6 @@ static inline bool can_is_canxl_dev_mtu(unsigned int mtu)
 	return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU);
 }
 
-/* drop skb if it does not contain a valid CAN frame for sending */
-static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb)
-{
-	struct can_priv *priv = netdev_priv(dev);
-
-	if (priv->ctrlmode & CAN_CTRLMODE_LISTENONLY) {
-		netdev_info_once(dev,
-				 "interface in listen only mode, dropping skb\n");
-		goto invalid_skb;
-	}
-
-	if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) {
-		netdev_info_once(dev, "CAN FD is disabled, dropping skb\n");
-		goto invalid_skb;
-	}
-
-	return can_dropped_invalid_skb(dev, skb);
-
-invalid_skb:
-	kfree_skb(skb);
-	dev->stats.tx_dropped++;
-	return true;
-}
-
 void can_setup(struct net_device *dev);
 
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
@@ -154,6 +130,32 @@ void can_bus_off(struct net_device *dev);
 const char *can_get_state_str(const enum can_state state);
 const char *can_get_ctrlmode_str(u32 ctrlmode);
 
+/* drop skb if it does not contain a valid CAN frame for sending */
+static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	u32 silent_mode = priv->ctrlmode & (CAN_CTRLMODE_LISTENONLY |
+					    CAN_CTRLMODE_RESTRICTED);
+
+	if (silent_mode) {
+		netdev_info_once(dev, "interface in %s mode, dropping skb\n",
+				 can_get_ctrlmode_str(silent_mode));
+		goto invalid_skb;
+	}
+
+	if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) {
+		netdev_info_once(dev, "CAN FD is disabled, dropping skb\n");
+		goto invalid_skb;
+	}
+
+	return can_dropped_invalid_skb(dev, skb);
+
+invalid_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return true;
+}
+
 void can_state_get_by_berr_counter(const struct net_device *dev,
 				   const struct can_berr_counter *bec,
 				   enum can_state *tx_state,
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index ef62f56eaaef..fafd1cce4798 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -103,6 +103,7 @@ struct can_ctrlmode {
 #define CAN_CTRLMODE_CC_LEN8_DLC	0x100	/* Classic CAN DLC option */
 #define CAN_CTRLMODE_TDC_AUTO		0x200	/* FD transceiver automatically calculates TDCV */
 #define CAN_CTRLMODE_TDC_MANUAL		0x400	/* FD TDCV is manually set up by user */
+#define CAN_CTRLMODE_RESTRICTED		0x800	/* Restricted operation mode */
 
 /*
  * CAN device statistics
-- 
cgit v1.2.3


From e63281614747c73f25b708c75bc696c4e76f5588 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:06 +0100
Subject: can: netlink: add initial CAN XL support

CAN XL uses bittiming parameters different from Classical CAN and CAN
FD. Thus, all the data bittiming parameters, including TDC, need to be
duplicated for CAN XL.

Add the CAN XL netlink interface for all the features which are common
with CAN FD. Any new CAN XL specific features are added later on.

The first time CAN XL is activated, the MTU is set by default to
CANXL_MAX_MTU. The user may then configure a custom MTU within the
CANXL_MIN_MTU to CANXL_MAX_MTU range, in which case, the custom MTU
value will be kept as long as CAN XL remains active.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-5-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c        | 14 +++++++-
 drivers/net/can/dev/netlink.c    | 76 +++++++++++++++++++++++++++++++---------
 include/linux/can/bittiming.h    |  6 ++--
 include/linux/can/dev.h          |  7 +++-
 include/uapi/linux/can/netlink.h |  7 ++++
 5 files changed, 90 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index b6980d32e5b4..bdec2c52c8ec 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -117,6 +117,12 @@ const char *can_get_ctrlmode_str(u32 ctrlmode)
 		return "TDC-MANUAL";
 	case CAN_CTRLMODE_RESTRICTED:
 		return "RESTRICTED";
+	case CAN_CTRLMODE_XL:
+		return "XL";
+	case CAN_CTRLMODE_XL_TDC_AUTO:
+		return "XL-TDC-AUTO";
+	case CAN_CTRLMODE_XL_TDC_MANUAL:
+		return "XL-TDC-MANUAL";
 	default:
 		return "<unknown>";
 	}
@@ -350,7 +356,13 @@ void can_set_default_mtu(struct net_device *dev)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
-	if (priv->ctrlmode & CAN_CTRLMODE_FD) {
+	if (priv->ctrlmode & CAN_CTRLMODE_XL) {
+		if (can_is_canxl_dev_mtu(dev->mtu))
+			return;
+		dev->mtu = CANXL_MTU;
+		dev->min_mtu = CANXL_MIN_MTU;
+		dev->max_mtu = CANXL_MAX_MTU;
+	} else if (priv->ctrlmode & CAN_CTRLMODE_FD) {
 		dev->mtu = CANFD_MTU;
 		dev->min_mtu = CANFD_MTU;
 		dev->max_mtu = CANFD_MTU;
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 87e731527dd7..fdd1fa7cf93a 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -2,7 +2,7 @@
 /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
  * Copyright (C) 2006 Andrey Volkov, Varma Electronics
  * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
- * Copyright (C) 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (C) 2021-2025 Vincent Mailhol <mailhol@kernel.org>
  */
 
 #include <linux/can/dev.h>
@@ -22,6 +22,9 @@ static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
 	[IFLA_CAN_TERMINATION] = { .type = NLA_U16 },
 	[IFLA_CAN_TDC] = { .type = NLA_NESTED },
 	[IFLA_CAN_CTRLMODE_EXT] = { .type = NLA_NESTED },
+	[IFLA_CAN_XL_DATA_BITTIMING] = { .len = sizeof(struct can_bittiming) },
+	[IFLA_CAN_XL_DATA_BITTIMING_CONST] = { .len = sizeof(struct can_bittiming_const) },
+	[IFLA_CAN_XL_TDC] = { .type = NLA_NESTED },
 };
 
 static const struct nla_policy can_tdc_policy[IFLA_CAN_TDC_MAX + 1] = {
@@ -70,7 +73,7 @@ static int can_validate_tdc(struct nlattr *data_tdc,
 		return -EOPNOTSUPP;
 	}
 
-	/* If one of the CAN_CTRLMODE_TDC_* flag is set then TDC
+	/* If one of the CAN_CTRLMODE_{,XL}_TDC_* flags is set then TDC
 	 * must be set and vice-versa
 	 */
 	if ((tdc_auto || tdc_manual) && !data_tdc) {
@@ -82,8 +85,8 @@ static int can_validate_tdc(struct nlattr *data_tdc,
 		return -EOPNOTSUPP;
 	}
 
-	/* If providing TDC parameters, at least TDCO is needed. TDCV
-	 * is needed if and only if CAN_CTRLMODE_TDC_MANUAL is set
+	/* If providing TDC parameters, at least TDCO is needed. TDCV is
+	 * needed if and only if CAN_CTRLMODE_{,XL}_TDC_MANUAL is set
 	 */
 	if (data_tdc) {
 		struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1];
@@ -126,10 +129,10 @@ static int can_validate_databittiming(struct nlattr *data[],
 	bool is_on;
 	int err;
 
-	/* Make sure that valid CAN FD configurations always consist of
+	/* Make sure that valid CAN FD/XL configurations always consist of
 	 * - nominal/arbitration bittiming
 	 * - data bittiming
-	 * - control mode with CAN_CTRLMODE_FD set
+	 * - control mode with CAN_CTRLMODE_{FD,XL} set
 	 * - TDC parameters are coherent (details in can_validate_tdc())
 	 */
 
@@ -139,7 +142,10 @@ static int can_validate_databittiming(struct nlattr *data[],
 		is_on = flags & CAN_CTRLMODE_FD;
 		type = "FD";
 	} else {
-		return -EOPNOTSUPP; /* Place holder for CAN XL */
+		data_tdc = data[IFLA_CAN_XL_TDC];
+		tdc_flags = flags & CAN_CTRLMODE_XL_TDC_MASK;
+		is_on = flags & CAN_CTRLMODE_XL;
+		type = "XL";
 	}
 
 	if (is_on) {
@@ -206,6 +212,11 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 	if (err)
 		return err;
 
+	err = can_validate_databittiming(data, extack,
+					 IFLA_CAN_XL_DATA_BITTIMING, flags);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -251,18 +262,26 @@ static int can_ctrlmode_changelink(struct net_device *dev,
 	/* If a top dependency flag is provided, reset all its dependencies */
 	if (cm->mask & CAN_CTRLMODE_FD)
 		priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
+	if (cm->mask & CAN_CTRLMODE_XL)
+		priv->ctrlmode &= ~(CAN_CTRLMODE_XL_TDC_MASK);
 
 	/* clear bits to be modified and copy the flag values */
 	priv->ctrlmode &= ~cm->mask;
 	priv->ctrlmode |= maskedflags;
 
-	/* Wipe potential leftovers from previous CAN FD config */
+	/* Wipe potential leftovers from previous CAN FD/XL config */
 	if (!(priv->ctrlmode & CAN_CTRLMODE_FD)) {
 		memset(&priv->fd.data_bittiming, 0,
 		       sizeof(priv->fd.data_bittiming));
 		priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
 		memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc));
 	}
+	if (!(priv->ctrlmode & CAN_CTRLMODE_XL)) {
+		memset(&priv->xl.data_bittiming, 0,
+		       sizeof(priv->fd.data_bittiming));
+		priv->ctrlmode &= ~CAN_CTRLMODE_XL_TDC_MASK;
+		memset(&priv->xl.tdc, 0, sizeof(priv->xl.tdc));
+	}
 
 	can_set_default_mtu(dev);
 
@@ -337,7 +356,10 @@ static int can_dbt_changelink(struct net_device *dev, struct nlattr *data[],
 		dbt_params = &priv->fd;
 		tdc_mask = CAN_CTRLMODE_FD_TDC_MASK;
 	} else {
-		return -EOPNOTSUPP; /* Place holder for CAN XL */
+		data_bittiming = data[IFLA_CAN_XL_DATA_BITTIMING];
+		data_tdc = data[IFLA_CAN_XL_TDC];
+		dbt_params = &priv->xl;
+		tdc_mask = CAN_CTRLMODE_XL_TDC_MASK;
 	}
 
 	if (!data_bittiming)
@@ -388,7 +410,7 @@ static int can_dbt_changelink(struct net_device *dev, struct nlattr *data[],
 		 */
 		can_calc_tdco(&dbt_params->tdc, dbt_params->tdc_const, &dbt,
 			      tdc_mask, &priv->ctrlmode, priv->ctrlmode_supported);
-	} /* else: both CAN_CTRLMODE_TDC_{AUTO,MANUAL} are explicitly
+	} /* else: both CAN_CTRLMODE_{,XL}_TDC_{AUTO,MANUAL} are explicitly
 	   * turned off. TDC is disabled: do nothing
 	   */
 
@@ -493,6 +515,11 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err)
 		return err;
 
+	/* CAN XL */
+	err = can_dbt_changelink(dev, data, false, extack);
+	if (err)
+		return err;
+
 	if (data[IFLA_CAN_TERMINATION]) {
 		const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]);
 		const unsigned int num_term = priv->termination_const_cnt;
@@ -560,14 +587,14 @@ static size_t can_data_bittiming_get_size(struct data_bittiming_params *dbt_para
 {
 	size_t size = 0;
 
-	if (dbt_params->data_bittiming.bitrate)		/* IFLA_CAN_DATA_BITTIMING */
+	if (dbt_params->data_bittiming.bitrate)		/* IFLA_CAN_{,XL}_DATA_BITTIMING */
 		size += nla_total_size(sizeof(dbt_params->data_bittiming));
-	if (dbt_params->data_bittiming_const)		/* IFLA_CAN_DATA_BITTIMING_CONST */
+	if (dbt_params->data_bittiming_const)		/* IFLA_CAN_{,XL}_DATA_BITTIMING_CONST */
 		size += nla_total_size(sizeof(*dbt_params->data_bittiming_const));
-	if (dbt_params->data_bitrate_const)		/* IFLA_CAN_DATA_BITRATE_CONST */
+	if (dbt_params->data_bitrate_const)		/* IFLA_CAN_{,XL}_DATA_BITRATE_CONST */
 		size += nla_total_size(sizeof(*dbt_params->data_bitrate_const) *
 				       dbt_params->data_bitrate_const_cnt);
-	size += can_tdc_get_size(dbt_params, tdc_flags);/* IFLA_CAN_TDC */
+	size += can_tdc_get_size(dbt_params, tdc_flags);/* IFLA_CAN_{,XL}_TDC */
 
 	return size;
 }
@@ -607,6 +634,9 @@ static size_t can_get_size(const struct net_device *dev)
 	size += can_data_bittiming_get_size(&priv->fd,
 					    priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK);
 
+	size += can_data_bittiming_get_size(&priv->xl,
+					    priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MASK);
+
 	return size;
 }
 
@@ -651,7 +681,9 @@ static int can_tdc_fill_info(struct sk_buff *skb, const struct net_device *dev,
 		tdc_is_enabled = can_fd_tdc_is_enabled(priv);
 		tdc_manual = priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL;
 	} else {
-		return -EOPNOTSUPP; /* Place holder for CAN XL */
+		dbt_params = &priv->xl;
+		tdc_is_enabled = can_xl_tdc_is_enabled(priv);
+		tdc_manual = priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MANUAL;
 	}
 	tdc_const = dbt_params->tdc_const;
 	tdc = &dbt_params->tdc;
@@ -773,7 +805,19 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	    can_tdc_fill_info(skb, dev, IFLA_CAN_TDC) ||
 
-	    can_ctrlmode_ext_fill_info(skb, priv)
+	    can_ctrlmode_ext_fill_info(skb, priv) ||
+
+	    can_bittiming_fill_info(skb, IFLA_CAN_XL_DATA_BITTIMING,
+				    &priv->xl.data_bittiming) ||
+
+	    can_bittiming_const_fill_info(skb, IFLA_CAN_XL_DATA_BITTIMING_CONST,
+					  priv->xl.data_bittiming_const) ||
+
+	    can_bitrate_const_fill_info(skb, IFLA_CAN_XL_DATA_BITRATE_CONST,
+					priv->xl.data_bitrate_const,
+					priv->xl.data_bitrate_const_cnt) ||
+
+	    can_tdc_fill_info(skb, dev, IFLA_CAN_XL_TDC)
 	    )
 
 		return -EMSGSIZE;
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 3926c78b2222..b6cd2476ffd7 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -16,10 +16,12 @@
 
 #define CAN_CTRLMODE_FD_TDC_MASK				\
 	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL)
+#define CAN_CTRLMODE_XL_TDC_MASK				\
+	(CAN_CTRLMODE_XL_TDC_AUTO | CAN_CTRLMODE_XL_TDC_MANUAL)
 #define CAN_CTRLMODE_TDC_AUTO_MASK				\
-	(CAN_CTRLMODE_TDC_AUTO)
+	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_XL_TDC_AUTO)
 #define CAN_CTRLMODE_TDC_MANUAL_MASK				\
-	(CAN_CTRLMODE_TDC_MANUAL)
+	(CAN_CTRLMODE_TDC_MANUAL | CAN_CTRLMODE_XL_TDC_MANUAL)
 
 /*
  * struct can_tdc - CAN FD Transmission Delay Compensation parameters
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index ab11c0e9111b..f15879bd818d 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -47,7 +47,7 @@ struct can_priv {
 
 	const struct can_bittiming_const *bittiming_const;
 	struct can_bittiming bittiming;
-	struct data_bittiming_params fd;
+	struct data_bittiming_params fd, xl;
 	unsigned int bitrate_const_cnt;
 	const u32 *bitrate_const;
 	u32 bitrate_max;
@@ -85,6 +85,11 @@ static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv)
 	return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK);
 }
 
+static inline bool can_xl_tdc_is_enabled(const struct can_priv *priv)
+{
+	return !!(priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MASK);
+}
+
 static inline u32 can_get_static_ctrlmode(struct can_priv *priv)
 {
 	return priv->ctrlmode & ~priv->ctrlmode_supported;
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index fafd1cce4798..c2c96c5978a8 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -104,6 +104,9 @@ struct can_ctrlmode {
 #define CAN_CTRLMODE_TDC_AUTO		0x200	/* FD transceiver automatically calculates TDCV */
 #define CAN_CTRLMODE_TDC_MANUAL		0x400	/* FD TDCV is manually set up by user */
 #define CAN_CTRLMODE_RESTRICTED		0x800	/* Restricted operation mode */
+#define CAN_CTRLMODE_XL			0x1000	/* CAN XL mode */
+#define CAN_CTRLMODE_XL_TDC_AUTO	0x2000	/* XL transceiver automatically calculates TDCV */
+#define CAN_CTRLMODE_XL_TDC_MANUAL	0x4000	/* XL TDCV is manually set up by user */
 
 /*
  * CAN device statistics
@@ -139,6 +142,10 @@ enum {
 	IFLA_CAN_BITRATE_MAX,
 	IFLA_CAN_TDC, /* FD */
 	IFLA_CAN_CTRLMODE_EXT,
+	IFLA_CAN_XL_DATA_BITTIMING,
+	IFLA_CAN_XL_DATA_BITTIMING_CONST,
+	IFLA_CAN_XL_DATA_BITRATE_CONST,
+	IFLA_CAN_XL_TDC,
 
 	/* add new constants above here */
 	__IFLA_CAN_MAX,
-- 
cgit v1.2.3


From 233134af208689c2d5d40896f5740473a74e3cb2 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:07 +0100
Subject: can: netlink: add CAN_CTRLMODE_XL_TMS flag

The Transceiver Mode Switching (TMS) indicates whether the CAN XL
controller shall use the PWM or NRZ encoding during the data phase.

The term "transceiver mode switching" is used in both ISO 11898-1 and
CiA 612-2 (although only the latter one uses the abbreviation TMS). We
adopt the same naming convention here for consistency.

Add the CAN_CTRLMODE_XL_TMS flag to the list of the CAN control modes.

Add can_validate_xl_flags() to check the coherency of the TMS flag.
That function will be reused in upcoming changes to validate the other
CAN XL flags.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-6-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c        |  2 ++
 drivers/net/can/dev/netlink.c    | 48 +++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/can/netlink.h |  1 +
 3 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index bdec2c52c8ec..091f30e94c61 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -123,6 +123,8 @@ const char *can_get_ctrlmode_str(u32 ctrlmode)
 		return "XL-TDC-AUTO";
 	case CAN_CTRLMODE_XL_TDC_MANUAL:
 		return "XL-TDC-MANUAL";
+	case CAN_CTRLMODE_XL_TMS:
+		return "TMS";
 	default:
 		return "<unknown>";
 	}
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index fdd1fa7cf93a..b2c24439abba 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -181,6 +181,32 @@ static int can_validate_databittiming(struct nlattr *data[],
 	return 0;
 }
 
+static int can_validate_xl_flags(struct netlink_ext_ack *extack,
+				 u32 masked_flags, u32 mask)
+{
+	if (masked_flags & CAN_CTRLMODE_XL) {
+		if (masked_flags & CAN_CTRLMODE_XL_TMS) {
+			const u32 tms_conflicts_mask = CAN_CTRLMODE_FD |
+				CAN_CTRLMODE_XL_TDC_MASK;
+			u32 tms_conflicts = masked_flags & tms_conflicts_mask;
+
+			if (tms_conflicts) {
+				NL_SET_ERR_MSG_FMT(extack,
+						   "TMS and %s are mutually exclusive",
+						   can_get_ctrlmode_str(tms_conflicts));
+				return -EOPNOTSUPP;
+			}
+		}
+	} else {
+		if (mask & CAN_CTRLMODE_XL_TMS) {
+			NL_SET_ERR_MSG(extack, "TMS requires CAN XL");
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
+}
+
 static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
 {
@@ -201,6 +227,10 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 				       "LISTEN-ONLY and RESTRICTED modes are mutually exclusive");
 			return -EOPNOTSUPP;
 		}
+
+		err = can_validate_xl_flags(extack, flags, cm->mask);
+		if (err)
+			return err;
 	}
 
 	err = can_validate_bittiming(data, extack, IFLA_CAN_BITTIMING);
@@ -226,7 +256,7 @@ static int can_ctrlmode_changelink(struct net_device *dev,
 {
 	struct can_priv *priv = netdev_priv(dev);
 	struct can_ctrlmode *cm;
-	u32 ctrlstatic, maskedflags, notsupp, ctrlstatic_missing;
+	u32 ctrlstatic, maskedflags, deactivated, notsupp, ctrlstatic_missing;
 
 	if (!data[IFLA_CAN_CTRLMODE])
 		return 0;
@@ -238,6 +268,7 @@ static int can_ctrlmode_changelink(struct net_device *dev,
 	cm = nla_data(data[IFLA_CAN_CTRLMODE]);
 	ctrlstatic = can_get_static_ctrlmode(priv);
 	maskedflags = cm->flags & cm->mask;
+	deactivated = ~cm->flags & cm->mask;
 	notsupp = maskedflags & ~(priv->ctrlmode_supported | ctrlstatic);
 	ctrlstatic_missing = (maskedflags & ctrlstatic) ^ ctrlstatic;
 
@@ -259,11 +290,21 @@ static int can_ctrlmode_changelink(struct net_device *dev,
 		return -EOPNOTSUPP;
 	}
 
+	/* If FD was active and is not turned off, check for XL conflicts */
+	if (priv->ctrlmode & CAN_CTRLMODE_FD & ~deactivated) {
+		if (maskedflags & CAN_CTRLMODE_XL_TMS) {
+			NL_SET_ERR_MSG(extack,
+				       "TMS can not be activated while CAN FD is on");
+			return -EOPNOTSUPP;
+		}
+	}
+
 	/* If a top dependency flag is provided, reset all its dependencies */
 	if (cm->mask & CAN_CTRLMODE_FD)
 		priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
 	if (cm->mask & CAN_CTRLMODE_XL)
-		priv->ctrlmode &= ~(CAN_CTRLMODE_XL_TDC_MASK);
+		priv->ctrlmode &= ~(CAN_CTRLMODE_XL_TDC_MASK |
+				    CAN_CTRLMODE_XL_TMS);
 
 	/* clear bits to be modified and copy the flag values */
 	priv->ctrlmode &= ~cm->mask;
@@ -395,7 +436,8 @@ static int can_dbt_changelink(struct net_device *dev, struct nlattr *data[],
 	if (data[IFLA_CAN_CTRLMODE]) {
 		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
 
-		need_tdc_calc = !(cm->mask & tdc_mask);
+		if (fd || !(priv->ctrlmode & CAN_CTRLMODE_XL_TMS))
+			need_tdc_calc = !(cm->mask & tdc_mask);
 	}
 	if (data_tdc) {
 		/* TDC parameters are provided: use them */
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index c2c96c5978a8..ebafb091d80f 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -107,6 +107,7 @@ struct can_ctrlmode {
 #define CAN_CTRLMODE_XL			0x1000	/* CAN XL mode */
 #define CAN_CTRLMODE_XL_TDC_AUTO	0x2000	/* XL transceiver automatically calculates TDCV */
 #define CAN_CTRLMODE_XL_TDC_MANUAL	0x4000	/* XL TDCV is manually set up by user */
+#define CAN_CTRLMODE_XL_TMS		0x8000	/* Transceiver Mode Switching */
 
 /*
  * CAN device statistics
-- 
cgit v1.2.3


From 6df01533e535d21cac779ff35cc25c43304035c3 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Wed, 26 Nov 2025 11:16:08 +0100
Subject: can: dev: can_dev_dropped_skb: drop CC/FD frames in CANXL-only mode

The error-signalling (ES) is a mandatory functionality for CAN CC and
CAN FD to report CAN frame format violations by sending an error-frame
signal on the bus.

A so-called 'mixed-mode' is intended to have (XL-tolerant) CAN FD nodes
and CAN XL nodes on one CAN segment, where the FD-controllers can talk
CC/FD and the XL-controllers can talk CC/FD/XL. This mixed-mode
utilizes the error-signalling for sending CC/FD/XL frames.

The CANXL-only mode disables the error-signalling in the CAN XL
controller. This mode does not allow CC/FD frames to be sent but
additionally offers a CAN XL transceiver mode switching (TMS).

Configured with CAN_CTRLMODE_FD and CAN_CTRLMODE_XL this leads to:

FD=0 XL=0 CC-only mode         (ES=1)
FD=1 XL=0 FD/CC mixed-mode     (ES=1)
FD=1 XL=1 XL/FD/CC mixed-mode  (ES=1)
FD=0 XL=1 XL-only mode         (ES=0, TMS optional)

The helper function can_dev_in_xl_only_mode() determines the required
value to disable error signalling in the CAN XL controller.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-7-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index f15879bd818d..52c8be5c160e 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -135,6 +135,19 @@ void can_bus_off(struct net_device *dev);
 const char *can_get_state_str(const enum can_state state);
 const char *can_get_ctrlmode_str(u32 ctrlmode);
 
+static inline bool can_dev_in_xl_only_mode(struct can_priv *priv)
+{
+	const u32 mixed_mode = CAN_CTRLMODE_FD | CAN_CTRLMODE_XL;
+
+	/* When CAN XL is enabled but FD is disabled we are running in
+	 * the so-called 'CANXL-only mode' where the error signalling is
+	 * disabled. This helper function determines the required value
+	 * to disable error signalling in the CAN XL controller.
+	 * The so-called CC/FD/XL 'mixed mode' requires error signalling.
+	 */
+	return ((priv->ctrlmode & mixed_mode) == CAN_CTRLMODE_XL);
+}
+
 /* drop skb if it does not contain a valid CAN frame for sending */
 static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb)
 {
@@ -153,6 +166,12 @@ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *s
 		goto invalid_skb;
 	}
 
+	if (can_dev_in_xl_only_mode(priv) && !can_is_canxl_skb(skb)) {
+		netdev_info_once(dev,
+				 "Error signaling is disabled, dropping skb\n");
+		goto invalid_skb;
+	}
+
 	return can_dropped_invalid_skb(dev, skb);
 
 invalid_skb:
-- 
cgit v1.2.3


From f6ccc2b293ba27e9171c63e456d9cba664fa2337 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:09 +0100
Subject: can: bittiming: add PWM parameters

In CAN XL, higher data bit rates require the CAN transceiver to switch
its operation mode to use Pulse-Width Modulation (PWM) transmission
mode instead of the classic dominant/recessive transmission mode.

The PWM parameters are:

  - PWMS: pulse width modulation short phase
  - PWML: pulse width modulation long phase
  - PWMO: pulse width modulation offset

CiA 612-2 specifies PWMS and PWML to be at least 1 (arguably, PWML
shall be at least 2 to respect the PWMS < PWML rule). PWMO's minimum
is expected to always be zero. It is added more for consistency than
anything else.

Add struct can_pwm_const so that the different devices can provide
their minimum and maximum values.

When TMS is on, the runtime PWMS, PWML and PWMO are needed (either
calculated or provided by the user): add struct can_pwm to store
these.

TDC and PWM can not be used at the same time (TDC can only be used
when TMS is off and PWM only when TMS is on). struct can_pwm is thus
put together with struct can_tdc inside a union to save some space.

The netlink logic will be added in an upcoming change.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-8-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index b6cd2476ffd7..967d76689c4f 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
- * Copyright (c) 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2021-2025 Vincent Mailhol <mailhol@kernel.org>
  */
 
 #ifndef _CAN_BITTIMING_H
@@ -120,11 +120,48 @@ struct can_tdc_const {
 	u32 tdcf_max;
 };
 
+/*
+ * struct can_pwm - CAN Pulse-Width Modulation (PWM) parameters
+ *
+ * @pwms: pulse width modulation short phase
+ * @pwml: pulse width modulation long phase
+ * @pwmo: pulse width modulation offset
+ */
+struct can_pwm {
+	u32 pwms;
+	u32 pwml;
+	u32 pwmo;
+};
+
+/*
+ * struct can_pwm_const - CAN hardware-dependent constants for Pulse-Width
+ *	Modulation (PWM)
+ *
+ * @pwms_min: PWM short phase minimum value. Must be at least 1.
+ * @pwms_max: PWM short phase maximum value
+ * @pwml_min: PWM long phase minimum value. Must be at least 1.
+ * @pwml_max: PWM long phase maximum value
+ * @pwmo_min: PWM offset phase minimum value
+ * @pwmo_max: PWM offset phase maximum value
+ */
+struct can_pwm_const {
+	u32 pwms_min;
+	u32 pwms_max;
+	u32 pwml_min;
+	u32 pwml_max;
+	u32 pwmo_min;
+	u32 pwmo_max;
+};
+
 struct data_bittiming_params {
 	const struct can_bittiming_const *data_bittiming_const;
 	struct can_bittiming data_bittiming;
 	const struct can_tdc_const *tdc_const;
-	struct can_tdc tdc;
+	const struct can_pwm_const *pwm_const;
+	union {
+		struct can_tdc tdc;
+		struct can_pwm pwm;
+	};
 	const u32 *data_bitrate_const;
 	unsigned int data_bitrate_const_cnt;
 	int (*do_set_data_bittiming)(struct net_device *dev);
-- 
cgit v1.2.3


From 8e2a2885a2a6217190065d1aae98fe88a670cc28 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:10 +0100
Subject: can: bittiming: add PWM validation

Add can_validate_pwm_bittiming() to validate the values pwms, pwml and
pwmo. Error messages are added to each of the checks to inform the user
on what went wrong. Refer to those error messages to understand the
validation logic.

The boundary values CAN_PWM_DECODE_NS (the transceiver minimum
decoding margin) and CAN_PWM_NS_MAX (the maximum PWM symbol duration)
are hardcoded for the moment. Note that a transceiver capable of
bitrates higher than 20 Mbps may be able to handle a CAN_PWM_DECODE_NS
below 5 ns. If such transceivers become commercially available, this
code could be revisited to make this parameter configurable. For now,
leave it static.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-9-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/bittiming.c | 63 +++++++++++++++++++++++++++++++++++++++++
 include/linux/can/bittiming.h   | 22 ++++++++++++++
 2 files changed, 85 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c
index 0b93900b1dfa..8f82418230ce 100644
--- a/drivers/net/can/dev/bittiming.c
+++ b/drivers/net/can/dev/bittiming.c
@@ -2,6 +2,7 @@
 /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
  * Copyright (C) 2006 Andrey Volkov, Varma Electronics
  * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
+ * Copyright (c) 2025 Vincent Mailhol <mailhol@kernel.org>
  */
 
 #include <linux/can/dev.h>
@@ -151,3 +152,65 @@ int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt,
 
 	return -EINVAL;
 }
+
+int can_validate_pwm_bittiming(const struct net_device *dev,
+			       const struct can_pwm *pwm,
+			       struct netlink_ext_ack *extack)
+{
+	const struct can_priv *priv = netdev_priv(dev);
+	u32 xl_bit_time_tqmin = can_bit_time_tqmin(&priv->xl.data_bittiming);
+	u32 nom_bit_time_tqmin = can_bit_time_tqmin(&priv->bittiming);
+	u32 pwms_ns = can_tqmin_to_ns(pwm->pwms, priv->clock.freq);
+	u32 pwml_ns = can_tqmin_to_ns(pwm->pwml, priv->clock.freq);
+
+	if (pwms_ns + pwml_ns > CAN_PWM_NS_MAX) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "The PWM symbol duration: %u ns may not exceed %u ns",
+				   pwms_ns + pwml_ns, CAN_PWM_NS_MAX);
+		return -EINVAL;
+	}
+
+	if (pwms_ns < CAN_PWM_DECODE_NS) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "PWMS: %u ns shall be at least %u ns",
+				   pwms_ns, CAN_PWM_DECODE_NS);
+		return -EINVAL;
+	}
+
+	if (pwm->pwms >= pwm->pwml) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "PWMS: %u tqmin shall be smaller than PWML: %u tqmin",
+				   pwm->pwms, pwm->pwml);
+		return -EINVAL;
+	}
+
+	if (pwml_ns - pwms_ns < 2 * CAN_PWM_DECODE_NS) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "At least %u ns shall separate PWMS: %u ns from PWML: %u ns",
+				   2 * CAN_PWM_DECODE_NS, pwms_ns, pwml_ns);
+		return -EINVAL;
+	}
+
+	if (xl_bit_time_tqmin % (pwm->pwms + pwm->pwml) != 0) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "PWM duration: %u tqmin does not divide XL's bit time: %u tqmin",
+				   pwm->pwms + pwm->pwml, xl_bit_time_tqmin);
+		return -EINVAL;
+	}
+
+	if (pwm->pwmo >= pwm->pwms + pwm->pwml) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "PWMO: %u tqmin shall be smaller than PWMS + PWML: %u tqmin",
+				   pwm->pwmo, pwm->pwms + pwm->pwml);
+		return -EINVAL;
+	}
+
+	if (nom_bit_time_tqmin % (pwm->pwms + pwm->pwml) != pwm->pwmo) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "Can not assemble nominal bit time: %u tqmin out of PWMS + PWML and PWMO",
+				   nom_bit_time_tqmin);
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 967d76689c4f..2504fafc72e4 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -87,6 +87,11 @@ struct can_tdc {
 	u32 tdcf;
 };
 
+/* The transceiver decoding margin corresponds to t_Decode in ISO 11898-2 */
+#define CAN_PWM_DECODE_NS 5
+/* Maximum PWM symbol duration. Corresponds to t_SymbolNom_MAX - t_Decode */
+#define CAN_PWM_NS_MAX (205 - CAN_PWM_DECODE_NS)
+
 /*
  * struct can_tdc_const - CAN hardware-dependent constant for
  *	Transmission Delay Compensation
@@ -203,6 +208,10 @@ int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt,
 		      const unsigned int bitrate_const_cnt,
 		      struct netlink_ext_ack *extack);
 
+int can_validate_pwm_bittiming(const struct net_device *dev,
+			       const struct can_pwm *pwm,
+			       struct netlink_ext_ack *extack);
+
 /*
  * can_get_relative_tdco() - TDCO relative to the sample point
  *
@@ -245,4 +254,17 @@ static inline unsigned int can_bit_time(const struct can_bittiming *bt)
 	return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2;
 }
 
+/* Duration of one bit in minimum time quantum */
+static inline unsigned int can_bit_time_tqmin(const struct can_bittiming *bt)
+{
+	return can_bit_time(bt) * bt->brp;
+}
+
+/* Convert a duration from minimum time quanta to nanoseconds */
+static inline u32 can_tqmin_to_ns(u32 tqmin, u32 clock_freq)
+{
+	return DIV_U64_ROUND_CLOSEST(mul_u32_u32(tqmin, NSEC_PER_SEC),
+				     clock_freq);
+}
+
 #endif /* !_CAN_BITTIMING_H */
-- 
cgit v1.2.3


From 9892339cf0348730e82383d4de9d9387b9d63925 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:11 +0100
Subject: can: calc_bittiming: add PWM calculation

Perform the PWM calculation according to CiA recommendations.

Note that for data bitrates of 5 Mbit/s or greater, the data bit time
is at most CAN_PWM_NS_MAX (which is defined as 200 nanoseconds);
consequently, the result of the division:

  DIV_ROUND_UP(xl_ns, CAN_PWM_NS_MAX)

is one and thus the for loop automatically stops on the first
iteration giving a single PWM symbol per bit as expected. Because of
that, there is no actual need for a separate conditional branch for
when the databitrate is greater than 5 MBPS.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-10-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/calc_bittiming.c | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/can/bittiming.h        | 10 ++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/net/can/dev/calc_bittiming.c b/drivers/net/can/dev/calc_bittiming.c
index 394d6974f481..268ec6fa7c49 100644
--- a/drivers/net/can/dev/calc_bittiming.c
+++ b/drivers/net/can/dev/calc_bittiming.c
@@ -2,6 +2,7 @@
 /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
  * Copyright (C) 2006 Andrey Volkov, Varma Electronics
  * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
+ * Copyright (C) 2021-2025 Vincent Mailhol <mailhol@kernel.org>
  */
 
 #include <linux/units.h>
@@ -198,3 +199,38 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
 		*ctrlmode |= tdc_auto;
 	}
 }
+
+int can_calc_pwm(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	const struct can_pwm_const *pwm_const = priv->xl.pwm_const;
+	struct can_pwm *pwm = &priv->xl.pwm;
+	u32 xl_tqmin = can_bit_time_tqmin(&priv->xl.data_bittiming);
+	u32 xl_ns = can_tqmin_to_ns(xl_tqmin, priv->clock.freq);
+	u32 nom_tqmin = can_bit_time_tqmin(&priv->bittiming);
+	int pwm_per_bit_max = xl_tqmin / (pwm_const->pwms_min + pwm_const->pwml_min);
+	int pwm_per_bit;
+	u32 pwm_tqmin;
+
+	/* For a data bitrate of 5 Mbit/s or greater, xl_ns <= CAN_PWM_NS_MAX
+	 * giving us a pwm_per_bit of 1 and the loop immediately breaks
+	 */
+	for (pwm_per_bit = DIV_ROUND_UP(xl_ns, CAN_PWM_NS_MAX);
+	     pwm_per_bit <= pwm_per_bit_max; pwm_per_bit++)
+		if (xl_tqmin % pwm_per_bit == 0)
+			break;
+
+	if (pwm_per_bit > pwm_per_bit_max) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "Can not divide the XL data phase's bit time: %u tqmin into multiple PWM symbols",
+				   xl_tqmin);
+		return -EINVAL;
+	}
+
+	pwm_tqmin = xl_tqmin / pwm_per_bit;
+	pwm->pwms = DIV_ROUND_UP_POW2(pwm_tqmin, 4);
+	pwm->pwml = pwm_tqmin - pwm->pwms;
+	pwm->pwmo = nom_tqmin % pwm_tqmin;
+
+	return 0;
+}
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 2504fafc72e4..726d909e87ce 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -180,6 +180,8 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt,
 void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
 		   const struct can_bittiming *dbt,
 		   u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported);
+
+int can_calc_pwm(struct net_device *dev, struct netlink_ext_ack *extack);
 #else /* !CONFIG_CAN_CALC_BITTIMING */
 static inline int
 can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt,
@@ -195,6 +197,14 @@ can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
 	      u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported)
 {
 }
+
+static inline int
+can_calc_pwm(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack,
+		       "bit-timing calculation not available: manually provide PWML and PWMS");
+	return -EINVAL;
+}
 #endif /* CONFIG_CAN_CALC_BITTIMING */
 
 void can_sjw_set_default(struct can_bittiming *bt);
-- 
cgit v1.2.3


From 46552323fa6779beb1ea558254dfd56021174c93 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Wed, 26 Nov 2025 11:16:12 +0100
Subject: can: netlink: add PWM netlink interface

When the TMS is switched on, the node uses PWM (Pulse Width
Modulation) during the data phase instead of the classic NRZ (Non
Return to Zero) encoding.

PWM is configured by three parameters:

  - PWMS: Pulse Width Modulation Short phase
  - PWML: Pulse Width Modulation Long phase
  - PWMO: Pulse Width Modulation Offset time

For each of these parameters, define three IFLA symbols:

  - IFLA_CAN_PWM_PWM*_MIN: the minimum allowed value.
  - IFLA_CAN_PWM_PWM*_MAX: the maximum allowed value.
  - IFLA_CAN_PWM_PWM*: the runtime value.

This results in a total of nine IFLA symbols which are all nested in a
parent IFLA_CAN_XL_PWM symbol.

IFLA_CAN_PWM_PWM*_MIN and IFLA_CAN_PWM_PWM*_MAX define the range of
allowed values and will match the value statically configured by the
device in struct can_pwm_const.

IFLA_CAN_PWM_PWM* match the runtime values stored in struct can_pwm.
Those parameters may only be configured when the tms mode is on. If
the PWMS, PWML and PWMO parameters are provided, check that all the
needed parameters are present using can_validate_pwm(), then check
their value using can_validate_pwm_bittiming(). PWMO defaults to zero
if omitted. Otherwise, if CAN_CTRLMODE_XL_TMS is true but none of the
PWM parameters are provided, calculate them using can_calc_pwm().

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-11-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/netlink.c    | 192 ++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/can/netlink.h |  25 +++++
 2 files changed, 215 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index b2c24439abba..d6b0e686fb11 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -25,6 +25,7 @@ static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
 	[IFLA_CAN_XL_DATA_BITTIMING] = { .len = sizeof(struct can_bittiming) },
 	[IFLA_CAN_XL_DATA_BITTIMING_CONST] = { .len = sizeof(struct can_bittiming_const) },
 	[IFLA_CAN_XL_TDC] = { .type = NLA_NESTED },
+	[IFLA_CAN_XL_PWM] = { .type = NLA_NESTED },
 };
 
 static const struct nla_policy can_tdc_policy[IFLA_CAN_TDC_MAX + 1] = {
@@ -39,6 +40,18 @@ static const struct nla_policy can_tdc_policy[IFLA_CAN_TDC_MAX + 1] = {
 	[IFLA_CAN_TDC_TDCF] = { .type = NLA_U32 },
 };
 
+static const struct nla_policy can_pwm_policy[IFLA_CAN_PWM_MAX + 1] = {
+	[IFLA_CAN_PWM_PWMS_MIN] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWMS_MAX] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWML_MIN] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWML_MAX] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWMO_MIN] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWMO_MAX] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWMS] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWML] = { .type = NLA_U32 },
+	[IFLA_CAN_PWM_PWMO] = { .type = NLA_U32 },
+};
+
 static int can_validate_bittiming(struct nlattr *data[],
 				  struct netlink_ext_ack *extack,
 				  int ifla_can_bittiming)
@@ -119,6 +132,40 @@ static int can_validate_tdc(struct nlattr *data_tdc,
 	return 0;
 }
 
+static int can_validate_pwm(struct nlattr *data[],
+			    struct netlink_ext_ack *extack, u32 flags)
+{
+	struct nlattr *tb_pwm[IFLA_CAN_PWM_MAX + 1];
+	int err;
+
+	if (!data[IFLA_CAN_XL_PWM])
+		return 0;
+
+	if (!(flags & CAN_CTRLMODE_XL_TMS)) {
+		NL_SET_ERR_MSG(extack, "PWM requires TMS");
+		return -EOPNOTSUPP;
+	}
+
+	err = nla_parse_nested(tb_pwm, IFLA_CAN_PWM_MAX, data[IFLA_CAN_XL_PWM],
+			       can_pwm_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb_pwm[IFLA_CAN_PWM_PWMS] != !tb_pwm[IFLA_CAN_PWM_PWML]) {
+		NL_SET_ERR_MSG(extack,
+			       "Provide either both PWMS and PWML, or none for automatic calculation");
+		return -EOPNOTSUPP;
+	}
+
+	if (tb_pwm[IFLA_CAN_PWM_PWMO] &&
+	    (!tb_pwm[IFLA_CAN_PWM_PWMS] || !tb_pwm[IFLA_CAN_PWM_PWML])) {
+		NL_SET_ERR_MSG(extack, "PWMO requires both PWMS and PWML");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int can_validate_databittiming(struct nlattr *data[],
 				      struct netlink_ext_ack *extack,
 				      int ifla_can_data_bittiming, u32 flags)
@@ -247,6 +294,10 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 	if (err)
 		return err;
 
+	err = can_validate_pwm(data, extack, flags);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -322,6 +373,7 @@ static int can_ctrlmode_changelink(struct net_device *dev,
 		       sizeof(priv->fd.data_bittiming));
 		priv->ctrlmode &= ~CAN_CTRLMODE_XL_TDC_MASK;
 		memset(&priv->xl.tdc, 0, sizeof(priv->xl.tdc));
+		memset(&priv->xl.pwm, 0, sizeof(priv->xl.pwm));
 	}
 
 	can_set_default_mtu(dev);
@@ -468,6 +520,76 @@ static int can_dbt_changelink(struct net_device *dev, struct nlattr *data[],
 	return 0;
 }
 
+static int can_pwm_changelink(struct net_device *dev,
+			      const struct nlattr *pwm_nla,
+			      struct netlink_ext_ack *extack)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	const struct can_pwm_const *pwm_const = priv->xl.pwm_const;
+	struct nlattr *tb_pwm[IFLA_CAN_PWM_MAX + 1];
+	struct can_pwm pwm = { 0 };
+	int err;
+
+	if (!(priv->ctrlmode & CAN_CTRLMODE_XL_TMS))
+		return 0;
+
+	if (!pwm_const) {
+		NL_SET_ERR_MSG(extack, "The device does not support PWM");
+		return -EOPNOTSUPP;
+	}
+
+	if (!pwm_nla)
+		return can_calc_pwm(dev, extack);
+
+	err = nla_parse_nested(tb_pwm, IFLA_CAN_PWM_MAX, pwm_nla,
+			       can_pwm_policy, extack);
+	if (err)
+		return err;
+
+	if (tb_pwm[IFLA_CAN_PWM_PWMS]) {
+		pwm.pwms = nla_get_u32(tb_pwm[IFLA_CAN_PWM_PWMS]);
+		if (pwm.pwms < pwm_const->pwms_min ||
+		    pwm.pwms > pwm_const->pwms_max) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PWMS: %u tqmin is out of range: %u...%u",
+					   pwm.pwms, pwm_const->pwms_min,
+					   pwm_const->pwms_max);
+			return -EINVAL;
+		}
+	}
+
+	if (tb_pwm[IFLA_CAN_PWM_PWML]) {
+		pwm.pwml = nla_get_u32(tb_pwm[IFLA_CAN_PWM_PWML]);
+		if (pwm.pwml < pwm_const->pwml_min ||
+		    pwm.pwml > pwm_const->pwml_max) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PWML: %u tqmin is out of range: %u...%u",
+					   pwm.pwml, pwm_const->pwml_min,
+					   pwm_const->pwml_max);
+			return -EINVAL;
+		}
+	}
+
+	if (tb_pwm[IFLA_CAN_PWM_PWMO]) {
+		pwm.pwmo = nla_get_u32(tb_pwm[IFLA_CAN_PWM_PWMO]);
+		if (pwm.pwmo < pwm_const->pwmo_min ||
+		    pwm.pwmo > pwm_const->pwmo_max) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PWMO: %u tqmin is out of range: %u...%u",
+					   pwm.pwmo, pwm_const->pwmo_min,
+					   pwm_const->pwmo_max);
+			return -EINVAL;
+		}
+	}
+
+	err = can_validate_pwm_bittiming(dev, &pwm, extack);
+	if (err)
+		return err;
+
+	priv->xl.pwm = pwm;
+	return 0;
+}
+
 static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 			  struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
@@ -559,6 +681,9 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 
 	/* CAN XL */
 	err = can_dbt_changelink(dev, data, false, extack);
+	if (err)
+		return err;
+	err = can_pwm_changelink(dev, data[IFLA_CAN_XL_PWM], extack);
 	if (err)
 		return err;
 
@@ -647,6 +772,30 @@ static size_t can_ctrlmode_ext_get_size(void)
 		nla_total_size(sizeof(u32));	/* IFLA_CAN_CTRLMODE_SUPPORTED */
 }
 
+static size_t can_pwm_get_size(const struct can_pwm_const *pwm_const,
+			       bool pwm_on)
+{
+	size_t size;
+
+	/* As in can_pwm_fill_info(): constants are sent even if TMS is off */
+	if (!pwm_const)
+		return 0;
+	size = nla_total_size(0);			/* nest IFLA_CAN_XL_PWM */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWMS_MIN */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWMS_MAX */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWML_MIN */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWML_MAX */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWMO_MIN */
+	size += nla_total_size(sizeof(u32));		/* IFLA_CAN_PWM_PWMO_MAX */
+	/* The runtime values are only dumped when TMS is on */
+	if (pwm_on) {
+		size += nla_total_size(sizeof(u32));	/* IFLA_CAN_PWM_PWMS */
+		size += nla_total_size(sizeof(u32));	/* IFLA_CAN_PWM_PWML */
+		size += nla_total_size(sizeof(u32));	/* IFLA_CAN_PWM_PWMO */
+	}
+	return size;
+}
+
 static size_t can_get_size(const struct net_device *dev)
 {
 	struct can_priv *priv = netdev_priv(dev);
@@ -678,6 +827,8 @@ static size_t can_get_size(const struct net_device *dev)
 
 	size += can_data_bittiming_get_size(&priv->xl,
 					    priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MASK);
+	size += can_pwm_get_size(priv->xl.pwm_const,		/* IFLA_CAN_XL_PWM */
+				 priv->ctrlmode & CAN_CTRLMODE_XL_TMS);
 
 	return size;
 }
@@ -776,6 +927,42 @@ err_cancel:
 	return -EMSGSIZE;
 }
 
+static int can_pwm_fill_info(struct sk_buff *skb, const struct can_priv *priv)
+{
+	const struct can_pwm_const *pwm_const = priv->xl.pwm_const;
+	const struct can_pwm *pwm = &priv->xl.pwm;
+	struct nlattr *nest;
+
+	if (!pwm_const)
+		return 0;
+
+	nest = nla_nest_start(skb, IFLA_CAN_XL_PWM);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, IFLA_CAN_PWM_PWMS_MIN, pwm_const->pwms_min) ||
+	    nla_put_u32(skb, IFLA_CAN_PWM_PWMS_MAX, pwm_const->pwms_max) ||
+	    nla_put_u32(skb, IFLA_CAN_PWM_PWML_MIN, pwm_const->pwml_min) ||
+	    nla_put_u32(skb, IFLA_CAN_PWM_PWML_MAX, pwm_const->pwml_max) ||
+	    nla_put_u32(skb, IFLA_CAN_PWM_PWMO_MIN, pwm_const->pwmo_min) ||
+	    nla_put_u32(skb, IFLA_CAN_PWM_PWMO_MAX, pwm_const->pwmo_max))
+		goto err_cancel;
+
+	if (priv->ctrlmode & CAN_CTRLMODE_XL_TMS) {
+		if (nla_put_u32(skb, IFLA_CAN_PWM_PWMS, pwm->pwms) ||
+		    nla_put_u32(skb, IFLA_CAN_PWM_PWML, pwm->pwml) ||
+		    nla_put_u32(skb, IFLA_CAN_PWM_PWMO, pwm->pwmo))
+			goto err_cancel;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int can_ctrlmode_ext_fill_info(struct sk_buff *skb,
 				      const struct can_priv *priv)
 {
@@ -859,9 +1046,10 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 					priv->xl.data_bitrate_const,
 					priv->xl.data_bitrate_const_cnt) ||
 
-	    can_tdc_fill_info(skb, dev, IFLA_CAN_XL_TDC)
-	    )
+	    can_tdc_fill_info(skb, dev, IFLA_CAN_XL_TDC) ||
 
+	    can_pwm_fill_info(skb, priv)
+	    )
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index ebafb091d80f..c30d16746159 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -5,6 +5,7 @@
  * Definitions for the CAN netlink interface
  *
  * Copyright (c) 2009 Wolfgang Grandegger <wg@grandegger.com>
+ * Copyright (c) 2021-2025 Vincent Mailhol <mailhol@kernel.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the version 2 of the GNU General Public License
@@ -147,6 +148,7 @@ enum {
 	IFLA_CAN_XL_DATA_BITTIMING_CONST,
 	IFLA_CAN_XL_DATA_BITRATE_CONST,
 	IFLA_CAN_XL_TDC,
+	IFLA_CAN_XL_PWM,
 
 	/* add new constants above here */
 	__IFLA_CAN_MAX,
@@ -188,6 +190,29 @@ enum {
 	IFLA_CAN_CTRLMODE_MAX = __IFLA_CAN_CTRLMODE - 1
 };
 
+/*
+ * CAN FD/XL Pulse-Width Modulation (PWM)
+ *
+ * Please refer to struct can_pwm_const and can_pwm in
+ * include/linux/can/bittiming.h for further details.
+ */
+enum {
+	IFLA_CAN_PWM_UNSPEC,
+	IFLA_CAN_PWM_PWMS_MIN,	/* u32 */
+	IFLA_CAN_PWM_PWMS_MAX,	/* u32 */
+	IFLA_CAN_PWM_PWML_MIN,	/* u32 */
+	IFLA_CAN_PWM_PWML_MAX,	/* u32 */
+	IFLA_CAN_PWM_PWMO_MIN,	/* u32 */
+	IFLA_CAN_PWM_PWMO_MAX,	/* u32 */
+	IFLA_CAN_PWM_PWMS,	/* u32 */
+	IFLA_CAN_PWM_PWML,	/* u32 */
+	IFLA_CAN_PWM_PWMO,	/* u32 */
+
+	/* add new constants above here */
+	__IFLA_CAN_PWM,
+	IFLA_CAN_PWM_MAX = __IFLA_CAN_PWM - 1
+};
+
 /* u16 termination range: 1..65535 Ohms */
 #define CAN_TERMINATION_DISABLED 0
 
-- 
cgit v1.2.3


From 4e1da516debbe6a573ffa0392e2809d180d0575c Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Thu, 23 Oct 2025 14:28:18 +0100
Subject: comedi: Add reference counting for Comedi command handling

For interrupts from badly behaved hardware (as emulated by Syzbot), it
is possible for the Comedi core functions that manage the progress of
asynchronous data acquisition to be called from driver ISRs while no
asynchronous command has been set up, which can cause problems such as
invalid pointer dereferencing or dividing by zero.

To help protect against that, introduce new functions to maintain a
reference counter for asynchronous commands that are being set up.
`comedi_get_is_subdevice_running(s)` will check if a command has been
set up on a subdevice and is still marked as running, and if so will
increment the reference counter and return `true`, otherwise it will
return `false` without modifying the reference counter.
`comedi_put_is_subdevice_running(s)` will decrement the reference
counter and set a completion event when decremented to 0.

Change the `do_cmd_ioctl()` function (responsible for setting up the
asynchronous command) to reinitialize the completion event and set the
reference counter to 1 before it marks the subdevice as running.  Change
the `do_become_nonbusy()` function (responsible for destroying a
completed command) to call `comedi_put_is_subdevice_running(s)` and wait
for the completion event after marking the subdevice as not running.

Because the subdevice normally gets marked as not running before the
call to `do_become_nonbusy()` (and may also be called when the Comedi
device is being detached from the low-level driver), add a new flag
`COMEDI_SRF_BUSY` to the set of subdevice run-flags that indicates that
an asynchronous command was set up and will need to be destroyed.  This
flag is set by `do_cmd_ioctl()` and cleared and checked by
`do_become_nonbusy()`.

Subsequent patches will change the Comedi core functions that are called
from low-level drivers for asynchronous command handling to make use of
the `comedi_get_is_subdevice_running()` and
`comedi_put_is_subdevice_running()` functions, and will modify the ISRs
of some of these low-level drivers if they dereference the subdevice's
`async` pointer directly.

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://patch.msgid.link/20251023133001.8439-2-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/comedi_fops.c     | 78 +++++++++++++++++++++++++++++++++++-----
 drivers/comedi/drivers.c         |  1 +
 include/linux/comedi/comedidev.h |  7 ++++
 3 files changed, 78 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 40d8b29797ae..8253e4e8232b 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -38,6 +38,7 @@
  * COMEDI_SRF_ERROR:		indicates an COMEDI_CB_ERROR event has occurred
  *				since the last command was started
  * COMEDI_SRF_RUNNING:		command is running
+ * COMEDI_SRF_BUSY:		command was started and subdevice still busy
  * COMEDI_SRF_FREE_SPRIV:	free s->private on detach
  *
  * COMEDI_SRF_BUSY_MASK:	runflags that indicate the subdevice is "busy"
@@ -45,9 +46,11 @@
 #define COMEDI_SRF_RT		BIT(1)
 #define COMEDI_SRF_ERROR	BIT(2)
 #define COMEDI_SRF_RUNNING	BIT(27)
+#define COMEDI_SRF_BUSY		BIT(28)
 #define COMEDI_SRF_FREE_SPRIV	BIT(31)
 
-#define COMEDI_SRF_BUSY_MASK	(COMEDI_SRF_ERROR | COMEDI_SRF_RUNNING)
+#define COMEDI_SRF_BUSY_MASK	\
+	(COMEDI_SRF_ERROR | COMEDI_SRF_RUNNING | COMEDI_SRF_BUSY)
 
 /**
  * struct comedi_file - Per-file private data for COMEDI device
@@ -665,6 +668,11 @@ static bool comedi_is_runflags_in_error(unsigned int runflags)
 	return runflags & COMEDI_SRF_ERROR;
 }
 
+static bool comedi_is_runflags_busy(unsigned int runflags)
+{
+	return runflags & COMEDI_SRF_BUSY;
+}
+
 /**
  * comedi_is_subdevice_running() - Check if async command running on subdevice
  * @s: COMEDI subdevice.
@@ -687,6 +695,46 @@ static bool __comedi_is_subdevice_running(struct comedi_subdevice *s)
 	return comedi_is_runflags_running(runflags);
 }
 
+/**
+ * comedi_get_is_subdevice_running() - Get if async command running on subdevice
+ * @s: COMEDI subdevice.
+ *
+ * If an asynchronous COMEDI command is running on the subdevice, increment
+ * a reference counter.  If the function return value indicates that a
+ * command is running, then the details of the command will not be destroyed
+ * before a matching call to comedi_put_is_subdevice_running().
+ *
+ * Return: %true if an asynchronous COMEDI command is active on the
+ * subdevice, else %false.
+ */
+bool comedi_get_is_subdevice_running(struct comedi_subdevice *s)
+{
+	unsigned long flags;
+	bool running;
+
+	spin_lock_irqsave(&s->spin_lock, flags);
+	running = __comedi_is_subdevice_running(s);
+	if (running)
+		refcount_inc(&s->async->run_active);
+	spin_unlock_irqrestore(&s->spin_lock, flags);
+	return running;
+}
+EXPORT_SYMBOL_GPL(comedi_get_is_subdevice_running);
+
+/**
+ * comedi_put_is_subdevice_running() - Put if async command running on subdevice
+ * @s: COMEDI subdevice.
+ *
+ * Decrements the reference counter that was incremented when
+ * comedi_get_is_subdevice_running() returned %true.
+ */
+void comedi_put_is_subdevice_running(struct comedi_subdevice *s)
+{
+	if (refcount_dec_and_test(&s->async->run_active))
+		complete_all(&s->async->run_complete);
+}
+EXPORT_SYMBOL_GPL(comedi_put_is_subdevice_running);
+
 bool comedi_can_auto_free_spriv(struct comedi_subdevice *s)
 {
 	unsigned int runflags = __comedi_get_subdevice_runflags(s);
@@ -736,20 +784,28 @@ static void do_become_nonbusy(struct comedi_device *dev,
 			      struct comedi_subdevice *s)
 {
 	struct comedi_async *async = s->async;
+	unsigned int runflags;
+	unsigned long flags;
 
 	lockdep_assert_held(&dev->mutex);
-	comedi_update_subdevice_runflags(s, COMEDI_SRF_RUNNING, 0);
-	if (async) {
+	spin_lock_irqsave(&s->spin_lock, flags);
+	runflags = __comedi_get_subdevice_runflags(s);
+	__comedi_clear_subdevice_runflags(s, COMEDI_SRF_RUNNING |
+					     COMEDI_SRF_BUSY);
+	spin_unlock_irqrestore(&s->spin_lock, flags);
+	if (comedi_is_runflags_busy(runflags)) {
+		/*
+		 * "Run active" counter was set to 1 when setting up the
+		 * command.  Decrement it and wait for it to become 0.
+		 */
+		comedi_put_is_subdevice_running(s);
+		wait_for_completion(&async->run_complete);
 		comedi_buf_reset(s);
 		async->inttrig = NULL;
 		kfree(async->cmd.chanlist);
 		async->cmd.chanlist = NULL;
 		s->busy = NULL;
 		wake_up_interruptible_all(&async->wait_head);
-	} else {
-		dev_err(dev->class_dev,
-			"BUG: (?) %s called with async=NULL\n", __func__);
-		s->busy = NULL;
 	}
 }
 
@@ -1860,8 +1916,14 @@ static int do_cmd_ioctl(struct comedi_device *dev,
 	if (async->cmd.flags & CMDF_WAKE_EOS)
 		async->cb_mask |= COMEDI_CB_EOS;
 
+	/*
+	 * Set the "run active" counter with an initial count of 1 that will
+	 * complete the "safe to reset" event when it is decremented to 0.
+	 */
+	refcount_set(&s->async->run_active, 1);
+	reinit_completion(&s->async->run_complete);
 	comedi_update_subdevice_runflags(s, COMEDI_SRF_BUSY_MASK,
-					 COMEDI_SRF_RUNNING);
+					 COMEDI_SRF_RUNNING | COMEDI_SRF_BUSY);
 
 	/*
 	 * Set s->busy _after_ setting COMEDI_SRF_RUNNING flag to avoid
diff --git a/drivers/comedi/drivers.c b/drivers/comedi/drivers.c
index c9ebaadc5e82..fd6e6cbe47ad 100644
--- a/drivers/comedi/drivers.c
+++ b/drivers/comedi/drivers.c
@@ -677,6 +677,7 @@ static int __comedi_device_postconfig_async(struct comedi_device *dev,
 		return -ENOMEM;
 
 	init_waitqueue_head(&async->wait_head);
+	init_completion(&async->run_complete);
 	s->async = async;
 
 	async->max_bufsize = comedi_default_buf_maxsize_kb * 1024;
diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h
index 4cb0400ad616..35fdc41845ce 100644
--- a/include/linux/comedi/comedidev.h
+++ b/include/linux/comedi/comedidev.h
@@ -15,6 +15,7 @@
 #include <linux/spinlock_types.h>
 #include <linux/rwsem.h>
 #include <linux/kref.h>
+#include <linux/completion.h>
 #include <linux/comedi.h>
 
 #define COMEDI_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
@@ -272,6 +273,8 @@ struct comedi_buf_map {
  * @events: Bit-vector of events that have occurred.
  * @cmd: Details of comedi command in progress.
  * @wait_head: Task wait queue for file reader or writer.
+ * @run_complete: "run complete" completion event.
+ * @run_active: "run active" reference counter.
  * @cb_mask: Bit-vector of events that should wake waiting tasks.
  * @inttrig: Software trigger function for command, or NULL.
  *
@@ -357,6 +360,8 @@ struct comedi_async {
 	unsigned int events;
 	struct comedi_cmd cmd;
 	wait_queue_head_t wait_head;
+	struct completion run_complete;
+	refcount_t run_active;
 	unsigned int cb_mask;
 	int (*inttrig)(struct comedi_device *dev, struct comedi_subdevice *s,
 		       unsigned int x);
@@ -584,6 +589,8 @@ struct comedi_device *comedi_dev_get_from_minor(unsigned int minor);
 int comedi_dev_put(struct comedi_device *dev);
 
 bool comedi_is_subdevice_running(struct comedi_subdevice *s);
+bool comedi_get_is_subdevice_running(struct comedi_subdevice *s);
+void comedi_put_is_subdevice_running(struct comedi_subdevice *s);
 
 void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size);
 void comedi_set_spriv_auto_free(struct comedi_subdevice *s);
-- 
cgit v1.2.3


From d1b3b9c70e11cb4f40b4e41a4dc1503b9a3c0109 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Mon, 27 Oct 2025 15:25:02 +0000
Subject: comedi: kcomedilib: Add loop checking variants of open and close

Add `comedi_open_from(path, from)` and `comedi_close_from(dev, from)` as
variants of the existing `comedi_open(path)` and `comedi_close(dev)`.
The additional `from` parameter is a minor device number that tells the
function that the COMEDI device is being opened or closed from another
COMEDI device if the value is in the range [0,
`COMEDI_NUM_BOARD_MINORS`-1].  In that case the function will refuse to
open the device if it would lead to a chain of devices opening each
other.  (It will also impose a limit on the number of simultaneous opens
from one device to another because we need to count those.)

The new functions are intended to be used by the "comedi_bond" driver,
which is the only driver that uses the existing `comedi_open()` and
`comedi_close()` functions.  The new functions will be used to avoid
some possible deadlock situations.

Replace the existing, exported `comedi_open()` and `comedi_close()`
functions with inline wrapper functions that call the newly exported
`comedi_open_from()` and `comedi_close_from()` functions.

Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://patch.msgid.link/20251027153748.4569-2-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/kcomedilib/kcomedilib_main.c | 120 ++++++++++++++++++++++++++--
 include/linux/comedi/comedilib.h            |  34 +++++++-
 2 files changed, 147 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/comedi/kcomedilib/kcomedilib_main.c b/drivers/comedi/kcomedilib/kcomedilib_main.c
index 43fbe1a63b14..baa9eaaf97d4 100644
--- a/drivers/comedi/kcomedilib/kcomedilib_main.c
+++ b/drivers/comedi/kcomedilib/kcomedilib_main.c
@@ -15,6 +15,7 @@
 #include <linux/fcntl.h>
 #include <linux/mm.h>
 #include <linux/io.h>
+#include <linux/bitmap.h>
 
 #include <linux/comedi.h>
 #include <linux/comedi/comedidev.h>
@@ -24,7 +25,104 @@ MODULE_AUTHOR("David Schleef <ds@schleef.org>");
 MODULE_DESCRIPTION("Comedi kernel library");
 MODULE_LICENSE("GPL");
 
-struct comedi_device *comedi_open(const char *filename)
+static DEFINE_MUTEX(kcomedilib_to_from_lock);
+
+/*
+ * Row index is the "to" node, column index is the "from" node, element value
+ * is the number of links from the "from" node to the "to" node.
+ */
+static unsigned char
+	kcomedilib_to_from[COMEDI_NUM_BOARD_MINORS][COMEDI_NUM_BOARD_MINORS];
+
+static bool kcomedilib_set_link_from_to(unsigned int from, unsigned int to)
+{
+	DECLARE_BITMAP(destinations[2], COMEDI_NUM_BOARD_MINORS);
+	unsigned int cur = 0;
+	bool okay = true;
+
+	/*
+	 * Allow "from" node to be out of range (no loop checking),
+	 * but require "to" node to be in range.
+	 */
+	if (to >= COMEDI_NUM_BOARD_MINORS)
+		return false;
+	if (from >= COMEDI_NUM_BOARD_MINORS)
+		return true;
+
+	/*
+	 * Check that kcomedilib_to_from[to][from] can be made non-zero
+	 * without creating a loop.
+	 *
+	 * Termination of the loop-testing code relies on the assumption that
+	 * kcomedilib_to_from[][] does not contain any loops.
+	 *
+	 * Start with a set destinations set containing "from" as the only
+	 * element and work backwards looking for loops.
+	 */
+	bitmap_zero(destinations[cur], COMEDI_NUM_BOARD_MINORS);
+	set_bit(from, destinations[cur]);
+	mutex_lock(&kcomedilib_to_from_lock);
+	do {
+		unsigned int next = 1 - cur;
+		unsigned int t = 0;
+
+		if (test_bit(to, destinations[cur])) {
+			/* Loop detected. */
+			okay = false;
+			break;
+		}
+		/* Create next set of destinations. */
+		bitmap_zero(destinations[next], COMEDI_NUM_BOARD_MINORS);
+		while ((t = find_next_bit(destinations[cur],
+					  COMEDI_NUM_BOARD_MINORS,
+					  t)) < COMEDI_NUM_BOARD_MINORS) {
+			unsigned int f;
+
+			for (f = 0; f < COMEDI_NUM_BOARD_MINORS; f++) {
+				if (kcomedilib_to_from[t][f])
+					set_bit(f, destinations[next]);
+			}
+			t++;
+		}
+		cur = next;
+	} while (!bitmap_empty(destinations[cur], COMEDI_NUM_BOARD_MINORS));
+	if (okay) {
+		/* Allow a maximum of 255 links from "from" to "to". */
+		if (kcomedilib_to_from[to][from] < 255)
+			kcomedilib_to_from[to][from]++;
+		else
+			okay = false;
+	}
+	mutex_unlock(&kcomedilib_to_from_lock);
+	return okay;
+}
+
+static void kcomedilib_clear_link_from_to(unsigned int from, unsigned int to)
+{
+	if (to < COMEDI_NUM_BOARD_MINORS && from < COMEDI_NUM_BOARD_MINORS) {
+		mutex_lock(&kcomedilib_to_from_lock);
+		if (kcomedilib_to_from[to][from])
+			kcomedilib_to_from[to][from]--;
+		mutex_unlock(&kcomedilib_to_from_lock);
+	}
+}
+
+/**
+ * comedi_open_from() - Open a COMEDI device from the kernel with loop checks
+ * @filename: Fake pathname of the form "/dev/comediN".
+ * @from: Device number it is being opened from (if in range).
+ *
+ * Converts @filename to a COMEDI device number and "opens" it if it exists
+ * and is attached to a low-level COMEDI driver.
+ *
+ * If @from is in range, refuse to open the device if doing so would form a
+ * loop of devices opening each other.  There is also a limit of 255 on the
+ * number of concurrent opens from one device to another.
+ *
+ * Return: A pointer to the COMEDI device on success.
+ * Return %NULL on failure.
+ */
+struct comedi_device *comedi_open_from(const char *filename, int from)
 {
 	struct comedi_device *dev, *retval = NULL;
 	unsigned int minor;
@@ -43,7 +141,7 @@ struct comedi_device *comedi_open(const char *filename)
 		return NULL;
 
 	down_read(&dev->attach_lock);
-	if (dev->attached)
+	if (dev->attached && kcomedilib_set_link_from_to(from, minor))
 		retval = dev;
 	else
 		retval = NULL;
@@ -54,14 +152,26 @@ struct comedi_device *comedi_open(const char *filename)
 
 	return retval;
 }
-EXPORT_SYMBOL_GPL(comedi_open);
+EXPORT_SYMBOL_GPL(comedi_open_from);
 
-int comedi_close(struct comedi_device *dev)
+/**
+ * comedi_close_from() - Close a COMEDI device from the kernel with loop checks
+ * @dev: COMEDI device.
+ * @from: Device number it was opened from (if in range).
+ *
+ * Closes a COMEDI device previously opened by comedi_open_from().
+ *
+ * If @from is in range, it should match the one used by comedi_open_from().
+ *
+ * Returns: 0
+ */
+int comedi_close_from(struct comedi_device *dev, int from)
 {
+	kcomedilib_clear_link_from_to(from, dev->minor);
 	comedi_dev_put(dev);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(comedi_close);
+EXPORT_SYMBOL_GPL(comedi_close_from);
 
 static int comedi_do_insn(struct comedi_device *dev,
 			  struct comedi_insn *insn,
diff --git a/include/linux/comedi/comedilib.h b/include/linux/comedi/comedilib.h
index 0223c9cd9215..1f2b22b383cc 100644
--- a/include/linux/comedi/comedilib.h
+++ b/include/linux/comedi/comedilib.h
@@ -10,8 +10,38 @@
 #ifndef _LINUX_COMEDILIB_H
 #define _LINUX_COMEDILIB_H
 
-struct comedi_device *comedi_open(const char *path);
-int comedi_close(struct comedi_device *dev);
+struct comedi_device *comedi_open_from(const char *path, int from);
+
+/**
+ * comedi_open() - Open a COMEDI device from the kernel
+ * @filename: Fake pathname of the form "/dev/comediN".
+ *
+ * Converts @filename to a COMEDI device number and "opens" it if it exists
+ * and is attached to a low-level COMEDI driver.
+ *
+ * Return: A pointer to the COMEDI device on success.
+ * Return %NULL on failure.
+ */
+static inline struct comedi_device *comedi_open(const char *path)
+{
+	return comedi_open_from(path, -1);
+}
+
+int comedi_close_from(struct comedi_device *dev, int from);
+
+/**
+ * comedi_close() - Close a COMEDI device from the kernel
+ * @dev: COMEDI device.
+ *
+ * Closes a COMEDI device previously opened by comedi_open().
+ *
+ * Returns: 0
+ */
+static inline int comedi_close(struct comedi_device *dev)
+{
+	return comedi_close_from(dev, -1);
+}
+
 int comedi_dio_get_config(struct comedi_device *dev, unsigned int subdev,
 			  unsigned int chan, unsigned int *io);
 int comedi_dio_config(struct comedi_device *dev, unsigned int subdev,
-- 
cgit v1.2.3


From f0fdaa4ad55b7c6e46a5ccb9102bc9a96cad360f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 27 Oct 2025 21:04:09 -0700
Subject: virt: acrn: split acrn_mmio_dev_res out of acrn_mmiodev

Add struct acrn_mmio_dev_res before struct acrn_mmiodev.
The former is used in the latter and breaking them up provides
better kernel-doc documentation for the struct members.

Suggested-by: Fei Li <fei1.li@intel.com>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Fei Li <fei1.li@intel.com>
Link: https://patch.msgid.link/20251028040409.868254-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/acrn.h | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 7b714c1902eb..79e7855a8c42 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -418,26 +418,32 @@ struct acrn_pcidev {
 };
 
 /**
- * struct acrn_mmiodev - Info for assigning or de-assigning a MMIO device
- * @name:			Name of the MMIO device.
- * @res[].user_vm_pa:		Physical address of User VM of the MMIO region
- *				for the MMIO device.
- * @res[].service_vm_pa:	Physical address of Service VM of the MMIO
- *				region for the MMIO device.
- * @res[].size:			Size of the MMIO region for the MMIO device.
- * @res[].mem_type:		Memory type of the MMIO region for the MMIO
- *				device.
+ * struct acrn_mmio_dev_res - MMIO device resource description
+ * @user_vm_pa:		Physical address of User VM of the MMIO region
+ *			for the MMIO device.
+ * @service_vm_pa:	Physical address of Service VM of the MMIO
+ *			region for the MMIO device.
+ * @size:		Size of the MMIO region for the MMIO device.
+ * @mem_type:		Memory type of the MMIO region for the MMIO
+ *			device.
+ */
+struct acrn_mmio_dev_res {
+	__u64	user_vm_pa;
+	__u64	service_vm_pa;
+	__u64	size;
+	__u64	mem_type;
+};
+
+/**
+ * struct acrn_mmiodev - Info for assigning or de-assigning an MMIO device
+ * @name:	Name of the MMIO device.
+ * @res:	Array of MMIO device descriptions
  *
  * This structure will be passed to hypervisor directly.
  */
 struct acrn_mmiodev {
 	__u8	name[8];
-	struct {
-		__u64	user_vm_pa;
-		__u64	service_vm_pa;
-		__u64	size;
-		__u64	mem_type;
-	} res[ACRN_MMIODEV_RES_NUM];
+	struct acrn_mmio_dev_res res[ACRN_MMIODEV_RES_NUM];
 };
 
 /**
-- 
cgit v1.2.3


From f85d90dd8d0efbc75e79698e147c6e682df22e1a Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 29 Oct 2025 09:12:10 +0100
Subject: sysfs: attribute_group: allow registration of const attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to constify instances of struct attribute it has to be
possible to add them to struct attribute_group. The current type of the
attrs member however is not compatible with that. Introduce a union that
allows registration of both const and non-const attributes to enable a
piecewise transition. As both union member types are compatible no
logic needs to be adapted.

Technically it is now possible to register a const struct attribute and
receive it as mutable pointer in the callbacks. This is a soundness
issue.  But this same soundness issue already exists today in
sysfs_create_file(). Also the struct definition and callback
implementation are always closely linked and are meant to be moved to
const in lockstep.

Similar to commit 906c508afdca ("sysfs: attribute_group: allow
registration of const bin_attribute")

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-1-ea7d745acff4@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 9a25a2911652..e34d6af96abb 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -105,7 +105,10 @@ struct attribute_group {
 	size_t			(*bin_size)(struct kobject *,
 					    const struct bin_attribute *,
 					    int);
-	struct attribute	**attrs;
+	union {
+		struct attribute	**attrs;
+		const struct attribute	*const *attrs_const;
+	};
 	const struct bin_attribute	*const *bin_attrs;
 };
 
-- 
cgit v1.2.3


From 964c93b1eef37e3bbe0edb37346c076217d71fe7 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 29 Oct 2025 09:12:11 +0100
Subject: sysfs: transparently handle const pointers in ATTRIBUTE_GROUPS()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To ease the constification process of 'struct attribute', transparently
handle the const pointers in ATTRIBUTE_GROUPS(). A cast is used instead
of assigning to .attrs_const as it keeps the macro smaller. As both
members are aliased to each other the result is identical.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-2-ea7d745acff4@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e34d6af96abb..92f82cee5f11 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -287,7 +287,12 @@ static const struct attribute_group *_name##_groups[] = {	\
 
 #define ATTRIBUTE_GROUPS(_name)					\
 static const struct attribute_group _name##_group = {		\
-	.attrs = _name##_attrs,					\
+	.attrs = _Generic(_name##_attrs,			\
+			  struct attribute **:			\
+				_name##_attrs,			\
+			  const struct attribute *const *:	\
+				(void *)_name##_attrs		\
+	),							\
 };								\
 __ATTRIBUTE_GROUPS(_name)
 
-- 
cgit v1.2.3


From 02ac5335a55111d87a7a618355261b4407ed0f7f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 29 Oct 2025 09:12:12 +0100
Subject: sysfs: introduce __SYSFS_FUNCTION_ALTERNATIVE()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the constification phase of 'struct attribute' various callback
struct members will need to exist in both const and non-const variants.
Keeping both members in a union avoids memory and CPU overhead but will
be detected and trapped by Control Flow Integrity (CFI). By deciding
between a struct and a union depending on whether CFI is enabled, most
configurations can avoid this overhead. Code using these callbacks will
still need to be updated to handle both members explicitly.
In the union case the compiler will recognize that testing for one union
member is enough and optimize away the code for the other one.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-3-ea7d745acff4@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 92f82cee5f11..9cef5bf24ba7 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -58,6 +58,12 @@ do {							\
 #define sysfs_attr_init(attr) do {} while (0)
 #endif
 
+#ifdef CONFIG_CFI
+#define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) struct { MEMBERS }
+#else
+#define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) union { MEMBERS }
+#endif
+
 /**
  * struct attribute_group - data structure used to declare an attribute group.
  * @name:	Optional: Attribute group name
-- 
cgit v1.2.3


From 7dd9fdb4939b972c1d0523e94fb3f70789653f0c Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 29 Oct 2025 09:12:13 +0100
Subject: sysfs: attribute_group: enable const variants of is_visible()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When constifying instances of struct attribute, for consistency the
corresponding .is_visible() callback should be adapted, too.
Introduce a temporary transition mechanism until all callbacks are
converted.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-4-ea7d745acff4@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/group.c      | 10 ++++++++--
 include/linux/sysfs.h |  8 ++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index e142bac4f9f8..e1e639f515a0 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -36,6 +36,9 @@ static umode_t __first_visible(const struct attribute_group *grp, struct kobject
 	if (grp->attrs && grp->attrs[0] && grp->is_visible)
 		return grp->is_visible(kobj, grp->attrs[0], 0);
 
+	if (grp->attrs && grp->attrs[0] && grp->is_visible_const)
+		return grp->is_visible_const(kobj, grp->attrs[0], 0);
+
 	if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible)
 		return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0);
 
@@ -61,8 +64,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 			 */
 			if (update)
 				kernfs_remove_by_name(parent, (*attr)->name);
-			if (grp->is_visible) {
-				mode = grp->is_visible(kobj, *attr, i);
+			if (grp->is_visible || grp->is_visible_const) {
+				if (grp->is_visible)
+					mode = grp->is_visible(kobj, *attr, i);
+				else
+					mode = grp->is_visible_const(kobj, *attr, i);
 				mode &= ~SYSFS_GROUP_INVISIBLE;
 				if (!mode)
 					continue;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 9cef5bf24ba7..592886ed6ca9 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -104,8 +104,12 @@ do {							\
  */
 struct attribute_group {
 	const char		*name;
-	umode_t			(*is_visible)(struct kobject *,
-					      struct attribute *, int);
+	__SYSFS_FUNCTION_ALTERNATIVE(
+		umode_t			(*is_visible)(struct kobject *,
+						      struct attribute *, int);
+		umode_t			(*is_visible_const)(struct kobject *,
+							    const struct attribute *, int);
+	);
 	umode_t			(*is_bin_visible)(struct kobject *,
 						  const struct bin_attribute *, int);
 	size_t			(*bin_size)(struct kobject *,
-- 
cgit v1.2.3


From 71464949b1f5f8b8599d057fea525a2a520f84d8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 29 Oct 2025 09:12:16 +0100
Subject: sysfs: simplify attribute definition macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define the macros in terms of each other.
This makes them easier to understand and also will make it easier to
implement the transition machinery for 'const struct attribute'.

__ATTR_RO_MODE() can't be implemented in terms of __ATTR() as not all
attributes have a .store callback. The same issue theoretically exists
for __ATTR_WO(), but practically that does not occur today.

Reorder __ATTR_RO() below __ATTR_RO_MODE() to keep the order of the
macro definition consistent with respect to each other.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-7-ea7d745acff4@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 592886ed6ca9..c33a96b7391a 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -251,28 +251,20 @@ struct attribute_group {
 	.store	= _store,						\
 }
 
-#define __ATTR_RO(_name) {						\
-	.attr	= { .name = __stringify(_name), .mode = 0444 },		\
-	.show	= _name##_show,						\
-}
-
 #define __ATTR_RO_MODE(_name, _mode) {					\
 	.attr	= { .name = __stringify(_name),				\
 		    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },		\
 	.show	= _name##_show,						\
 }
 
-#define __ATTR_RW_MODE(_name, _mode) {					\
-	.attr	= { .name = __stringify(_name),				\
-		    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },		\
-	.show	= _name##_show,						\
-	.store	= _name##_store,					\
-}
+#define __ATTR_RO(_name)						\
+	__ATTR_RO_MODE(_name, 0444)
 
-#define __ATTR_WO(_name) {						\
-	.attr	= { .name = __stringify(_name), .mode = 0200 },		\
-	.store	= _name##_store,					\
-}
+#define __ATTR_RW_MODE(_name, _mode)					\
+	__ATTR(_name, _mode, _name##_show, _name##_store)
+
+#define __ATTR_WO(_name)						\
+	__ATTR(_name, 0200, NULL, _name##_store)
 
 #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)
 
-- 
cgit v1.2.3


From d3d25f430cadc59d42965f54f54a8c0050931860 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Thu, 6 Nov 2025 10:58:38 +0530
Subject: mod_devicetable: Bump auxiliary_device_id name size

We have an upcoming driver named "intel_ehl_pse_io". This creates an
auxiliary child device for its GPIO sub-functionality, which matches
against "intel_ehl_pse_io.gpio-elkhartlake" and overshoots the current
maximum limit of 32 bytes for auxiliary device id string. Bump the size
to 40 bytes to satisfy such cases.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20251106052838.433673-1-raag.jadav@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mod_devicetable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 6077972e8b45..24eb5a88a5c5 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -867,7 +867,7 @@ struct mhi_device_id {
 	kernel_ulong_t driver_data;
 };
 
-#define AUXILIARY_NAME_SIZE 32
+#define AUXILIARY_NAME_SIZE 40
 #define AUXILIARY_MODULE_PREFIX "auxiliary:"
 
 struct auxiliary_device_id {
-- 
cgit v1.2.3


From d6f4941f1b4f3e701e422dfbfee024264294f91f Mon Sep 17 00:00:00 2001
From: Benedek Kupper <kupper.benedek@gmail.com>
Date: Tue, 7 Oct 2025 22:35:44 +0200
Subject: drivers: hid: renegotiate resolution multipliers with device after
 reset

The scroll resolution multipliers are set in the context of
hidinput_connect(), which is only called at probe time: when the host
changes the value on the device with a SET_REPORT(FEATURE), and the device
accepts it, these multipliers are stored on the host side, and used to
calculate the final scroll event values sent to userspace.

After a USB suspend, the resume operation on many hubs and chipsets
involves a USB reset signal as well. A reset on the device side clears all
previous state information, including the value of the multiplier report.
This reset is not handled by the multiplier handling logic, so what ends up
happening is the host is still expecting high-resolution scroll events,
but the device is reset to default resolution, making the effective,
user-perceived scroll speed incredibly slow.

The solution is to renegotiate the multiplier selection after each reset.

This is not the only bug related to the high-resolution scrolling
implementation in the kernel (the other one is
https://bugzilla.kernel.org/show_bug.cgi?id=220144), but for this one
there is no device-side workaround, leading to poor user experience with
our product:
https://github.com/UltimateHackingKeyboard/firmware/issues/1155
https://github.com/UltimateHackingKeyboard/firmware/issues/1261
https://github.com/UltimateHackingKeyboard/firmware/pull/1355
This patch was tested by an affected user and has been reported to
fix the issue (see discussion in 1355).

Signed-off-by: Benedek Kupper <kupper.benedek@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/hid-generic.c | 9 +++++++++
 drivers/hid/hid-input.c   | 7 +++++++
 include/linux/hid.h       | 1 +
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/hid-generic.c b/drivers/hid/hid-generic.c
index 9e04c6d0fcc8..c2de916747de 100644
--- a/drivers/hid/hid-generic.c
+++ b/drivers/hid/hid-generic.c
@@ -70,6 +70,14 @@ static int hid_generic_probe(struct hid_device *hdev,
 	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
 }
 
+static int hid_generic_reset_resume(struct hid_device *hdev)
+{
+	if (hdev->claimed & HID_CLAIMED_INPUT)
+		hidinput_reset_resume(hdev);
+
+	return 0;
+}
+
 static const struct hid_device_id hid_table[] = {
 	{ HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY, HID_ANY_ID, HID_ANY_ID) },
 	{ }
@@ -81,6 +89,7 @@ static struct hid_driver hid_generic = {
 	.id_table = hid_table,
 	.match = hid_generic_match,
 	.probe = hid_generic_probe,
+	.reset_resume = hid_generic_reset_resume,
 };
 module_hid_driver(hid_generic);
 
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 2bbb645c2ff4..9f899ee83f0b 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -2400,6 +2400,13 @@ void hidinput_disconnect(struct hid_device *hid)
 }
 EXPORT_SYMBOL_GPL(hidinput_disconnect);
 
+void hidinput_reset_resume(struct hid_device *hid)
+{
+	/* renegotiate host-device shared state after reset */
+	hidinput_change_resolution_multipliers(hid);
+}
+EXPORT_SYMBOL_GPL(hidinput_reset_resume);
+
 #ifdef CONFIG_HID_KUNIT_TEST
 #include "hid-input-test.c"
 #endif
diff --git a/include/linux/hid.h b/include/linux/hid.h
index a4ddb94e3ee5..dce862cafbbd 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -984,6 +984,7 @@ extern void hidinput_hid_event(struct hid_device *, struct hid_field *, struct h
 extern void hidinput_report_event(struct hid_device *hid, struct hid_report *report);
 extern int hidinput_connect(struct hid_device *hid, unsigned int force);
 extern void hidinput_disconnect(struct hid_device *);
+void hidinput_reset_resume(struct hid_device *hid);
 
 struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type,
 				 unsigned int application, unsigned int usage);
-- 
cgit v1.2.3


From 81c45c62dc3eefd83af8eb8df10e45705e8e3a47 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Mon, 3 Nov 2025 09:27:55 -0800
Subject: iommu/arm-smmu-v3-iommufd: Allow attaching nested domain for GBPA
 cases

A vDEVICE has been a hard requirement for attaching a nested domain to the
device. This makes sense when installing a guest STE, since a vSID must be
present and given to the kernel during the vDEVICE allocation.

But when CR0.SMMUEN is disabled, the VM doesn't really need a vSID to program
the vSMMU behavior as GBPA will take effect, in which case the vSTE in the
nested domain could have carried the bypass or abort configuration in the
GBPA register. Thus, having such a hard requirement doesn't work well for GBPA.

Skip vmaster allocation in arm_smmu_attach_prepare_vmaster() for an abort
or bypass vSTE. Note that a device attached this way won't report vevents.

Update the uAPI doc accordingly.

Link: https://patch.msgid.link/r/20251103172755.2026145-1-nicolinc@nvidia.com
Tested-by: Shameer Kolothum <skolothumtho@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Tested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 13 ++++++++++++-
 include/uapi/linux/iommufd.h                        | 10 ++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 8cd8929bbfdf..e5fbbdbdea24 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -99,6 +99,8 @@ static void arm_smmu_make_nested_domain_ste(
 int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
 				    struct arm_smmu_nested_domain *nested_domain)
 {
+	unsigned int cfg =
+		FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
 	struct arm_smmu_vmaster *vmaster;
 	unsigned long vsid;
 	int ret;
@@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
 
 	ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core,
 					 state->master->dev, &vsid);
-	if (ret)
+	/*
+	 * Attaching to a translate nested domain must allocate a vDEVICE prior,
+	 * as CD/ATS invalidations and vevents require a vSID to work properly.
+	 * An abort/bypass domain is allowed to attach w/o vmaster for GBPA case.
+	 */
+	if (ret) {
+		if (cfg == STRTAB_STE_0_CFG_ABORT ||
+		    cfg == STRTAB_STE_0_CFG_BYPASS)
+			return 0;
 		return ret;
+	}
 
 	vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
 	if (!vmaster)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index c218c89e0e2e..2c41920b641d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -450,6 +450,16 @@ struct iommu_hwpt_vtd_s1 {
  * nested domain will translate the same as the nesting parent. The S1 will
  * install a Context Descriptor Table pointing at userspace memory translated
  * by the nesting parent.
+ *
+ * It's suggested to allocate a vDEVICE object carrying vSID and then re-attach
+ * the nested domain, as soon as the vSID is available in the VMM level:
+ *
+ * - when Cfg=translate, a vDEVICE must be allocated prior to attaching to the
+ *   allocated nested domain, as CD/ATS invalidations and vevents need a vSID.
+ * - when Cfg=bypass/abort, a vDEVICE is not enforced during the nested domain
+ *   attachment, to support a GBPA case where VM sets CR0.SMMUEN=0. However, if
+ *   VM sets CR0.SMMUEN=1 while missing a vDEVICE object, kernel would fail to
+ *   report events to the VM. E.g. F_TRANSLATION when guest STE.Cfg=abort.
  */
 struct iommu_hwpt_arm_smmuv3 {
 	__aligned_le64 ste[2];
-- 
cgit v1.2.3


From f83ac7544fbf7ba3f77c122e16ab5319f75bbdfd Mon Sep 17 00:00:00 2001
From: pengdonglin <pengdonglin@xiaomi.com>
Date: Tue, 25 Nov 2025 17:34:25 +0800
Subject: function_graph: Enable funcgraph-args and funcgraph-retaddr to work
 simultaneously

Currently, the funcgraph-args and funcgraph-retaddr features are
mutually exclusive. This patch resolves this limitation by allowing
funcgraph-retaddr to have an args array.

To verify the change, use perf to trace vfs_write with both options
enabled:

Before:
 # perf ftrace -G vfs_write --graph-opts args,retaddr
   ......
   down_read() { /* <-n_tty_write+0xa3/0x540 */
     __cond_resched(); /* <-down_read+0x12/0x160 */
     preempt_count_add(); /* <-down_read+0x3b/0x160 */
     preempt_count_sub(); /* <-down_read+0x8b/0x160 */
   }

After:
 # perf ftrace -G vfs_write --graph-opts args,retaddr
   ......
   down_read(sem=0xffff8880100bea78) { /* <-n_tty_write+0xa3/0x540 */
     __cond_resched(); /* <-down_read+0x12/0x160 */
     preempt_count_add(val=1); /* <-down_read+0x3b/0x160 */
     preempt_count_sub(val=1); /* <-down_read+0x8b/0x160 */
   }

Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Xiaoqin Zhang <zhangxiaoqin@xiaomi.com>
Link: https://patch.msgid.link/20251125093425.2563849-1-dolinux.peng@gmail.com
Signed-off-by: pengdonglin <pengdonglin@xiaomi.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h               |  7 +---
 kernel/trace/trace.h                 | 24 +++++++++++-
 kernel/trace/trace_entries.h         | 15 ++++----
 kernel/trace/trace_functions_graph.c | 71 ++++++++++++++++++++++++------------
 4 files changed, 80 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7ded7df6e9b5..6ca9c6229d93 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1126,17 +1126,14 @@ static inline void ftrace_init(void) { }
  */
 struct ftrace_graph_ent {
 	unsigned long func; /* Current function */
-	int depth;
+	unsigned long depth;
 } __packed;
 
 /*
  * Structure that defines an entry function trace with retaddr.
- * It's already packed but the attribute "packed" is needed
- * to remove extra padding at the end.
  */
 struct fgraph_retaddr_ent {
-	unsigned long func; /* Current function */
-	int depth;
+	struct ftrace_graph_ent ent;
 	unsigned long retaddr;  /* Return address */
 } __packed;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 666f9a2c189d..c2b61bcd912f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -964,7 +964,8 @@ extern int __trace_graph_entry(struct trace_array *tr,
 extern int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr);
+				unsigned long retaddr,
+				struct ftrace_regs *fregs);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned int trace_ctx,
@@ -2276,4 +2277,25 @@ static inline int rv_init_interface(void)
  */
 #define FTRACE_TRAMPOLINE_MARKER  ((unsigned long) INT_MAX)
 
+/*
+ * This is used to get the address of the args array based on
+ * the type of the entry.
+ */
+#define FGRAPH_ENTRY_ARGS(e)						\
+	({								\
+		unsigned long *_args;					\
+		struct ftrace_graph_ent_entry *_e = e;			\
+									\
+		if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&	\
+			e->ent.type == TRACE_GRAPH_RETADDR_ENT) {	\
+			struct fgraph_retaddr_ent_entry *_re;		\
+									\
+			_re = (typeof(_re))_e;				\
+			_args = _re->args;				\
+		} else {						\
+			_args = _e->args;				\
+		}							\
+		_args;							\
+	})
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index de294ae2c5c5..f6a8d29c0d76 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 	F_STRUCT(
 		__field_struct(	struct ftrace_graph_ent,	graph_ent	)
 		__field_packed(	unsigned long,	graph_ent,	func		)
-		__field_packed(	unsigned int,	graph_ent,	depth		)
+		__field_packed(	unsigned long,	graph_ent,	depth		)
 		__dynamic_array(unsigned long,	args				)
 	),
 
-	F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth)
+	F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
 );
 
 #ifdef CONFIG_FUNCTION_GRAPH_RETADDR
@@ -95,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
 	TRACE_GRAPH_RETADDR_ENT,
 
 	F_STRUCT(
-		__field_struct(	struct fgraph_retaddr_ent,	graph_ent	)
-		__field_packed(	unsigned long,	graph_ent,	func		)
-		__field_packed(	unsigned int,	graph_ent,	depth		)
-		__field_packed(	unsigned long,	graph_ent,	retaddr		)
+		__field_struct(	struct fgraph_retaddr_ent,	graph_rent	)
+		__field_packed(	unsigned long,	graph_rent.ent,	func		)
+		__field_packed(	unsigned long,	graph_rent.ent,	depth		)
+		__field_packed(	unsigned long,	graph_rent,	retaddr		)
+		__dynamic_array(unsigned long,	args				)
 	),
 
-	F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth,
+	F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth,
 		(void *)__entry->retaddr)
 );
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d0513cfcd936..17c75cf2348e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -36,14 +36,19 @@ struct fgraph_ent_args {
 	unsigned long			args[FTRACE_REGS_MAX_ARGS];
 };
 
+struct fgraph_retaddr_ent_args {
+	struct fgraph_retaddr_ent_entry	ent;
+	/* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+	unsigned long			args[FTRACE_REGS_MAX_ARGS];
+};
+
 struct fgraph_data {
 	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
 	union {
 		struct fgraph_ent_args		ent;
-		/* TODO allow retaddr to have args */
-		struct fgraph_retaddr_ent_entry	rent;
+		struct fgraph_retaddr_ent_args	rent;
 	};
 	struct ftrace_graph_ret_entry	ret;
 	int				failed;
@@ -160,20 +165,32 @@ int __trace_graph_entry(struct trace_array *tr,
 int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr)
+				unsigned long retaddr,
+				struct ftrace_regs *fregs)
 {
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct fgraph_retaddr_ent_entry *entry;
+	int size;
+
+	/* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+	size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
-					  sizeof(*entry), trace_ctx);
+					  size, trace_ctx);
 	if (!event)
 		return 0;
 	entry	= ring_buffer_event_data(event);
-	entry->graph_ent.func = trace->func;
-	entry->graph_ent.depth = trace->depth;
-	entry->graph_ent.retaddr = retaddr;
+	entry->graph_rent.ent = *trace;
+	entry->graph_rent.retaddr = retaddr;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+	if (fregs) {
+		for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+			entry->args[i] = ftrace_regs_get_argument(fregs, i);
+	}
+#endif
+
 	trace_buffer_unlock_commit_nostack(buffer, event);
 
 	return 1;
@@ -182,7 +199,8 @@ int __trace_graph_retaddr_entry(struct trace_array *tr,
 int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr)
+				unsigned long retaddr,
+				struct ftrace_regs *fregs)
 {
 	return 1;
 }
@@ -267,7 +285,8 @@ static int graph_entry(struct ftrace_graph_ent *trace,
 	if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
 	    tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) {
 		unsigned long retaddr = ftrace_graph_top_ret_addr(current);
-		ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+		ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+						  retaddr, fregs);
 	} else {
 		ret = __graph_entry(tr, trace, trace_ctx, fregs);
 	}
@@ -654,13 +673,9 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * Save current and next entries for later reference
 			 * if the output fails.
 			 */
-			if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
-				data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
-			} else {
-				int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+			int size = min_t(int, sizeof(data->rent), iter->ent_size);
 
-				memcpy(&data->ent, curr, size);
-			}
+			memcpy(&data->rent, curr, size);
 			/*
 			 * If the next event is not a return type, then
 			 * we only care about what type it is. Otherwise we can
@@ -838,7 +853,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
 		trace_seq_puts(s, " /*");
 
 	trace_seq_puts(s, " <-");
-	seq_print_ip_sym_offset(s, entry->graph_ent.retaddr, trace_flags);
+	seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags);
 
 	if (comment)
 		trace_seq_puts(s, " */");
@@ -984,7 +999,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 		trace_seq_printf(s, "%ps", (void *)ret_func);
 
 		if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
-			print_function_args(s, entry->args, ret_func);
+			print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func);
 			trace_seq_putc(s, ';');
 		} else
 			trace_seq_puts(s, "();");
@@ -1036,7 +1051,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
 	args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
 
 	if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
-		print_function_args(s, entry->args, func);
+		print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func);
 	else
 		trace_seq_puts(s, "()");
 
@@ -1218,11 +1233,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	/*
 	 * print_graph_entry() may consume the current event,
 	 * thus @field may become invalid, so we need to save it.
-	 * sizeof(struct ftrace_graph_ent_entry) is very small,
-	 * it can be safely saved at the stack.
+	 * This function is shared by ftrace_graph_ent_entry and
+	 * fgraph_retaddr_ent_entry, the size of the latter one
+	 * is larger, but it is very small and can be safely saved
+	 * at the stack.
 	 */
 	struct ftrace_graph_ent_entry *entry;
-	u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+	struct fgraph_retaddr_ent_entry *rentry;
+	u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
 
 	/* The ent_size is expected to be as big as the entry */
 	if (iter->ent_size > sizeof(save_buf))
@@ -1451,12 +1469,17 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 	}
 #ifdef CONFIG_FUNCTION_GRAPH_RETADDR
 	case TRACE_GRAPH_RETADDR_ENT: {
-		struct fgraph_retaddr_ent_entry saved;
+		/*
+		 * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have
+		 * similar functions and memory layouts. The only difference
+		 * is that the latter one has an extra retaddr member, so
+		 * they can share most of the logic.
+		 */
 		struct fgraph_retaddr_ent_entry *rfield;
 
 		trace_assign_type(rfield, entry);
-		saved = *rfield;
-		return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+		return print_graph_entry((struct ftrace_graph_ent_entry *)rfield,
+					  s, iter, flags);
 	}
 #endif
 	case TRACE_GRAPH_RET: {
-- 
cgit v1.2.3


From 4677e78800bbde62a9edce0eb3b40c775ec55e0d Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Tue, 25 Nov 2025 16:17:59 -0500
Subject: socket: Unify getsockname and getpeername implementation

They are already implemented by the same getname hook at the protocol
level.  Bring the unification one level up to reduce code duplication
in preparation for supporting these as io_uring operations.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  4 +---
 net/compat.c           |  4 ++--
 net/socket.c           | 55 ++++++++++++--------------------------------------
 3 files changed, 16 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3b262487ec06..937fe331ff1e 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -454,9 +454,7 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
 extern int __sys_listen(int fd, int backlog);
 extern int __sys_listen_socket(struct socket *sock, int backlog);
 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
-			     int __user *usockaddr_len);
-extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
-			     int __user *usockaddr_len);
+			     int __user *usockaddr_len, int peer);
 extern int __sys_socketpair(int family, int type, int protocol,
 			    int __user *usockvec);
 extern int __sys_shutdown_sock(struct socket *sock, int how);
diff --git a/net/compat.c b/net/compat.c
index 485db8ee9b28..2c9bd0edac99 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -460,10 +460,10 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 		ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
 		break;
 	case SYS_GETSOCKNAME:
-		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
+		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
 		break;
 	case SYS_GETPEERNAME:
-		ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
+		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1);
 		break;
 	case SYS_SOCKETPAIR:
 		ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
diff --git a/net/socket.c b/net/socket.c
index e8892b218708..208d92ccf0fb 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2128,12 +2128,11 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
 }
 
 /*
- *	Get the local address ('name') of a socket object. Move the obtained
- *	name to user space.
+ *	Get the remote or local address ('name') of a socket object. Move the
+ *	obtained name to user space.
  */
-
 int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
-		      int __user *usockaddr_len)
+		      int __user *usockaddr_len, int peer)
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
@@ -2146,11 +2145,14 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	err = security_socket_getsockname(sock);
+	if (peer)
+		err = security_socket_getpeername(sock);
+	else
+		err = security_socket_getsockname(sock);
 	if (err)
 		return err;
 
-	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0);
+	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer);
 	if (err < 0)
 		return err;
 
@@ -2161,44 +2163,13 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
 SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
 		int __user *, usockaddr_len)
 {
-	return __sys_getsockname(fd, usockaddr, usockaddr_len);
-}
-
-/*
- *	Get the remote address ('name') of a socket object. Move the obtained
- *	name to user space.
- */
-
-int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
-		      int __user *usockaddr_len)
-{
-	struct socket *sock;
-	struct sockaddr_storage address;
-	CLASS(fd, f)(fd);
-	int err;
-
-	if (fd_empty(f))
-		return -EBADF;
-	sock = sock_from_file(fd_file(f));
-	if (unlikely(!sock))
-		return -ENOTSOCK;
-
-	err = security_socket_getpeername(sock);
-	if (err)
-		return err;
-
-	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1);
-	if (err < 0)
-		return err;
-
-	/* "err" is actually length in this case */
-	return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
+	return __sys_getsockname(fd, usockaddr, usockaddr_len, 0);
 }
 
 SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
 		int __user *, usockaddr_len)
 {
-	return __sys_getpeername(fd, usockaddr, usockaddr_len);
+	return __sys_getsockname(fd, usockaddr, usockaddr_len, 1);
 }
 
 /*
@@ -3162,12 +3133,12 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	case SYS_GETSOCKNAME:
 		err =
 		    __sys_getsockname(a0, (struct sockaddr __user *)a1,
-				      (int __user *)a[2]);
+				      (int __user *)a[2], 0);
 		break;
 	case SYS_GETPEERNAME:
 		err =
-		    __sys_getpeername(a0, (struct sockaddr __user *)a1,
-				      (int __user *)a[2]);
+		    __sys_getsockname(a0, (struct sockaddr __user *)a1,
+				      (int __user *)a[2], 1);
 		break;
 	case SYS_SOCKETPAIR:
 		err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
-- 
cgit v1.2.3


From d73c1677087391379441c0bb444c7fb4238fc6e7 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Tue, 25 Nov 2025 16:18:00 -0500
Subject: socket: Split out a getsockname helper for io_uring

Similar to getsockopt, split out a helper to check security and issue
the operation from the main handler that can be used by io_uring.

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  2 ++
 net/socket.c           | 36 ++++++++++++++++++++----------------
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 937fe331ff1e..8d580074ddea 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -453,6 +453,8 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
 			 int addrlen);
 extern int __sys_listen(int fd, int backlog);
 extern int __sys_listen_socket(struct socket *sock, int backlog);
+extern int do_getsockname(struct socket *sock, int peer,
+			  struct sockaddr __user *usockaddr, int __user *usockaddr_len);
 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
 			     int __user *usockaddr_len, int peer);
 extern int __sys_socketpair(int family, int type, int protocol,
diff --git a/net/socket.c b/net/socket.c
index 208d92ccf0fb..89bac0a17e5a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2127,39 +2127,43 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
 	return __sys_connect(fd, uservaddr, addrlen);
 }
 
-/*
- *	Get the remote or local address ('name') of a socket object. Move the
- *	obtained name to user space.
- */
-int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
-		      int __user *usockaddr_len, int peer)
+int do_getsockname(struct socket *sock, int peer,
+		   struct sockaddr __user *usockaddr, int __user *usockaddr_len)
 {
-	struct socket *sock;
 	struct sockaddr_storage address;
-	CLASS(fd, f)(fd);
 	int err;
 
-	if (fd_empty(f))
-		return -EBADF;
-	sock = sock_from_file(fd_file(f));
-	if (unlikely(!sock))
-		return -ENOTSOCK;
-
 	if (peer)
 		err = security_socket_getpeername(sock);
 	else
 		err = security_socket_getsockname(sock);
 	if (err)
 		return err;
-
 	err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer);
 	if (err < 0)
 		return err;
-
 	/* "err" is actually length in this case */
 	return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
 }
 
+/*
+ *	Get the remote or local address ('name') of a socket object. Move the
+ *	obtained name to user space.
+ */
+int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
+		      int __user *usockaddr_len, int peer)
+{
+	struct socket *sock;
+	CLASS(fd, f)(fd);
+
+	if (fd_empty(f))
+		return -EBADF;
+	sock = sock_from_file(fd_file(f));
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+	return do_getsockname(sock, peer, usockaddr, usockaddr_len);
+}
+
 SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
 		int __user *, usockaddr_len)
 {
-- 
cgit v1.2.3


From 5d24321e4c159088604512d7a5c5cf634d23e01a Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Tue, 25 Nov 2025 16:18:01 -0500
Subject: io_uring: Introduce getsockname io_uring cmd

Introduce a socket-specific io_uring_cmd to support
getsockname/getpeername via io_uring.  I made this an io_uring_cmd
instead of a new operation to avoid polluting the command namespace with
what is exclusively a socket operation.  In addition, since we don't
need to conform to existing interfaces, this merges
getsockname/getpeername into a single operation, since the implementation
is pretty much the same.

This has been frequently requested, for instance at [1] and more
recently in the project Discord channel. The main use-case is to support
fixed socket file descriptors.

[1] https://github.com/axboe/liburing/issues/1356

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/cmd_net.c            | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index deb772222b6d..b5b23c0d5283 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1009,6 +1009,7 @@ enum io_uring_socket_op {
 	SOCKET_URING_OP_GETSOCKOPT,
 	SOCKET_URING_OP_SETSOCKOPT,
 	SOCKET_URING_OP_TX_TIMESTAMP,
+	SOCKET_URING_OP_GETSOCKNAME,
 };
 
 /*
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index 27a09aa4c9d0..5d11caf5509c 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -132,6 +132,26 @@ static int io_uring_cmd_timestamp(struct socket *sock,
 	return -EAGAIN;
 }
 
+static int io_uring_cmd_getsockname(struct socket *sock,
+				    struct io_uring_cmd *cmd,
+				    unsigned int issue_flags)
+{
+	const struct io_uring_sqe *sqe = cmd->sqe;
+	struct sockaddr __user *uaddr;
+	unsigned int peer;
+	int __user *ulen;
+
+	if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags)
+		return -EINVAL;
+
+	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	ulen = u64_to_user_ptr(sqe->addr3);
+	peer = READ_ONCE(sqe->optlen);
+	if (peer > 1)
+		return -EINVAL;
+	return do_getsockname(sock, peer, uaddr, ulen);
+}
+
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	struct socket *sock = cmd->file->private_data;
@@ -159,6 +179,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
 	case SOCKET_URING_OP_TX_TIMESTAMP:
 		return io_uring_cmd_timestamp(sock, cmd, issue_flags);
+	case SOCKET_URING_OP_GETSOCKNAME:
+		return io_uring_cmd_getsockname(sock, cmd, issue_flags);
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From dac092195b6a35bc7c9f11e2884cfecb1b25e20c Mon Sep 17 00:00:00 2001
From: Yang Erkun <yangerkun@huawei.com>
Date: Wed, 12 Nov 2025 16:45:36 +0800
Subject: ext4: rename EXT4_GET_BLOCKS_PRE_IO

This flag has been generalized to split an unwritten extent when we do
dio or dioread_nolock writeback, or to avoid merging new extents which were
created by an extent split. Update some related comments too.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
Message-ID: <20251112084538.1658232-2-yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h              | 21 +++++++++++++++------
 fs/ext4/extents.c           | 16 ++++++++--------
 fs/ext4/inode.c             |  2 +-
 include/trace/events/ext4.h |  2 +-
 4 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9f127aedbaee..9df4f3ddfe42 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -695,13 +695,22 @@ enum {
 	/* Caller is from the delayed allocation writeout path
 	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
-	/* caller is from the direct IO path, request to creation of an
-	unwritten extents if not allocated, split the unwritten
-	extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_PRE_IO			0x0008
-#define EXT4_GET_BLOCKS_CONVERT			0x0010
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+	/*
+	 * This means that we cannot merge newly allocated extents, and if we
+	 * found an unwritten extent, we need to split it.
+	 */
+#define EXT4_GET_BLOCKS_SPLIT_NOMERGE		0x0008
+	/*
+	 * Caller is from the dio or dioread_nolock buffered IO, request to
+	 * create an unwritten extent if it does not exist or split the
+	 * found unwritten extent. Also do not merge the newly created
+	 * unwritten extent, io end will convert unwritten to written,
+	 * and try to merge the written extent.
+	 */
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
 					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+	/* Convert unwritten extent to initialized. */
+#define EXT4_GET_BLOCKS_CONVERT			0x0010
 	/* Eventual metadata allocation (due to growing extent tree)
 	 * should not fail, so try to use reserved blocks for that.*/
 #define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c7d219e6c6d8..b9be5d8320de 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -333,7 +333,7 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
 			   int nofail)
 {
 	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
-	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 
 	if (nofail)
 		flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
@@ -2002,7 +2002,7 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	}
 
 	/* try to insert block into found extent and return */
-	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
+	if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) {
 
 		/*
 		 * Try to see whether we should rather test the extent on
@@ -2181,7 +2181,7 @@ has_space:
 
 merge:
 	/* try to merge extents */
-	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+	if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
 		ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 	/* time to correct all indexes above */
@@ -3224,7 +3224,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 		else
 			ext4_ext_mark_initialized(ex);
 
-		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+		if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
 			ext4_ext_try_to_merge(handle, inode, path, ex);
 
 		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3368,7 +3368,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
 
 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
-		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 		if (unwritten)
 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
 				       EXT4_EXT_MARK_UNWRIT2;
@@ -3739,7 +3739,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
 			      EXT4_EXT_MAY_ZEROOUT : 0;
 		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
 	}
-	flags |= EXT4_GET_BLOCKS_PRE_IO;
+	flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags,
 				 allocated);
 }
@@ -3911,7 +3911,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 						*allocated, newblock);
 
 	/* get_block() before submitting IO, split the extent */
-	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+	if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
 		path = ext4_split_convert_extents(handle, inode, map, path,
 				flags | EXT4_GET_BLOCKS_CONVERT, allocated);
 		if (IS_ERR(path))
@@ -5618,7 +5618,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 			path = ext4_split_extent_at(handle, inode, path,
 					start_lblk, split_flag,
 					EXT4_EX_NOCACHE |
-					EXT4_GET_BLOCKS_PRE_IO |
+					EXT4_GET_BLOCKS_SPLIT_NOMERGE |
 					EXT4_GET_BLOCKS_METADATA_NOFAIL);
 		}
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 32d9f0b36c33..3883793425cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -653,7 +653,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
 	 * If the extent has been zeroed out, we don't need to update
 	 * extent status tree.
 	 */
-	if (flags & EXT4_GET_BLOCKS_PRE_IO &&
+	if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE &&
 	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
 		if (ext4_es_is_written(&es))
 			return retval;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index a05bdd48e16e..fd76d14c2776 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -39,7 +39,7 @@ struct partial_cluster;
 	{ EXT4_GET_BLOCKS_CREATE,		"CREATE" },		\
 	{ EXT4_GET_BLOCKS_UNWRIT_EXT,		"UNWRIT" },		\
 	{ EXT4_GET_BLOCKS_DELALLOC_RESERVE,	"DELALLOC" },		\
-	{ EXT4_GET_BLOCKS_PRE_IO,		"PRE_IO" },		\
+	{ EXT4_GET_BLOCKS_SPLIT_NOMERGE,	"SPLIT_NOMERGE" },	\
 	{ EXT4_GET_BLOCKS_CONVERT,		"CONVERT" },		\
 	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
 	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
-- 
cgit v1.2.3


From 85f5491d9c6e9662653c8e6e7b70637b98537ecc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 3 Nov 2025 21:34:01 +0100
Subject: libceph: drop started parameter of __ceph_open_session()

With the previous commit revamping the timeout handling, started isn't
used anymore.  It could be taken into account by adjusting the initial
value of the timeout, but there is little point as both callers capture
the timestamp shortly before calling __ceph_open_session() -- the only
thing of note that happens in the interim is taking client->mount_mutex
and that isn't expected to take multiple seconds.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
---
 fs/ceph/super.c              | 2 +-
 include/linux/ceph/libceph.h | 3 +--
 net/ceph/ceph_common.c       | 5 ++---
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ad0cf177e75a..f6bf24b5c683 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 		const char *path = fsc->mount_options->server_path ?
 				     fsc->mount_options->server_path + 1 : "";
 
-		err = __ceph_open_session(fsc->client, started);
+		err = __ceph_open_session(fsc->client);
 		if (err < 0)
 			goto out;
 
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 733e7f93db66..63e0e2aa1ce9 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
 u64 ceph_client_gid(struct ceph_client *client);
 extern void ceph_destroy_client(struct ceph_client *client);
 extern void ceph_reset_client_addr(struct ceph_client *client);
-extern int __ceph_open_session(struct ceph_client *client,
-			       unsigned long started);
+extern int __ceph_open_session(struct ceph_client *client);
 extern int ceph_open_session(struct ceph_client *client);
 int ceph_wait_for_latest_osdmap(struct ceph_client *client,
 				unsigned long timeout);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 285e981730e5..e734e57be083 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -788,7 +788,7 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-int __ceph_open_session(struct ceph_client *client, unsigned long started)
+int __ceph_open_session(struct ceph_client *client)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	long timeout = ceph_timeout_jiffies(client->options->mount_timeout);
@@ -844,12 +844,11 @@ EXPORT_SYMBOL(__ceph_open_session);
 int ceph_open_session(struct ceph_client *client)
 {
 	int ret;
-	unsigned long started = jiffies;  /* note the start time */
 
 	dout("open_session start\n");
 	mutex_lock(&client->mount_mutex);
 
-	ret = __ceph_open_session(client, started);
+	ret = __ceph_open_session(client);
 
 	mutex_unlock(&client->mount_mutex);
 	return ret;
-- 
cgit v1.2.3


From 6aac2aa2dfae38b60f22c3dfe4103ceefbe2d761 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Mon, 24 Nov 2025 18:11:45 +0000
Subject: phy: rename hwtstamp callback to hwtstamp_set

PHY devices have a hwtstamp callback which actually performs a set operation.
Rename it to better reflect the action.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/ti/netcp_ethss.c |  2 +-
 drivers/net/phy/bcm-phy-ptp.c         |  8 ++++----
 drivers/net/phy/dp83640.c             |  8 ++++----
 drivers/net/phy/micrel.c              | 16 ++++++++--------
 drivers/net/phy/microchip_rds_ptp.c   |  8 ++++----
 drivers/net/phy/mscc/mscc_ptp.c       |  8 ++++----
 drivers/net/phy/nxp-c45-tja11xx.c     |  8 ++++----
 drivers/net/phy/phy.c                 | 11 +++++++----
 drivers/ptp/ptp_ines.c                |  8 ++++----
 include/linux/mii_timestamper.h       |  8 ++++----
 include/linux/phy.h                   |  4 ++--
 11 files changed, 46 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 4f6cc6cd1f03..8f46e9be76b1 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2657,7 +2657,7 @@ static int gbe_hwtstamp_set(void *intf_priv, struct kernel_hwtstamp_config *cfg,
 
 	phy = gbe_intf->slave->phy;
 	if (phy_has_hwtstamp(phy))
-		return phy->mii_ts->hwtstamp(phy->mii_ts, cfg, extack);
+		return phy->mii_ts->hwtstamp_set(phy->mii_ts, cfg, extack);
 
 	switch (cfg->tx_type) {
 	case HWTSTAMP_TX_OFF:
diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c
index d3501f8487d9..6815e844a62e 100644
--- a/drivers/net/phy/bcm-phy-ptp.c
+++ b/drivers/net/phy/bcm-phy-ptp.c
@@ -780,9 +780,9 @@ out:
 	kfree_skb(skb);
 }
 
-static int bcm_ptp_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *cfg,
-			    struct netlink_ext_ack *extack)
+static int bcm_ptp_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *cfg,
+				struct netlink_ext_ack *extack)
 {
 	struct bcm_ptp_private *priv = mii2priv(mii_ts);
 	u16 mode, ctrl;
@@ -898,7 +898,7 @@ static void bcm_ptp_init(struct bcm_ptp_private *priv)
 
 	priv->mii_ts.rxtstamp = bcm_ptp_rxtstamp;
 	priv->mii_ts.txtstamp = bcm_ptp_txtstamp;
-	priv->mii_ts.hwtstamp = bcm_ptp_hwtstamp;
+	priv->mii_ts.hwtstamp_set = bcm_ptp_hwtstamp_set;
 	priv->mii_ts.ts_info = bcm_ptp_ts_info;
 
 	priv->phydev->mii_ts = &priv->mii_ts;
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 74396453f5bb..f733a8b72d40 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1176,9 +1176,9 @@ static irqreturn_t dp83640_handle_interrupt(struct phy_device *phydev)
 	return IRQ_HANDLED;
 }
 
-static int dp83640_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *cfg,
-			    struct netlink_ext_ack *extack)
+static int dp83640_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *cfg,
+				struct netlink_ext_ack *extack)
 {
 	struct dp83640_private *dp83640 =
 		container_of(mii_ts, struct dp83640_private, mii_ts);
@@ -1407,7 +1407,7 @@ static int dp83640_probe(struct phy_device *phydev)
 	dp83640->phydev = phydev;
 	dp83640->mii_ts.rxtstamp = dp83640_rxtstamp;
 	dp83640->mii_ts.txtstamp = dp83640_txtstamp;
-	dp83640->mii_ts.hwtstamp = dp83640_hwtstamp;
+	dp83640->mii_ts.hwtstamp_set = dp83640_hwtstamp_set;
 	dp83640->mii_ts.ts_info  = dp83640_ts_info;
 
 	INIT_DELAYED_WORK(&dp83640->ts_work, rx_timestamp_work);
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 5d90ccc20df7..05de68b9f719 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -3147,9 +3147,9 @@ static void lan8814_flush_fifo(struct phy_device *phydev, bool egress)
 	lanphy_read_page_reg(phydev, LAN8814_PAGE_PORT_REGS, PTP_TSU_INT_STS);
 }
 
-static int lan8814_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *config,
-			    struct netlink_ext_ack *extack)
+static int lan8814_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *config,
+				struct netlink_ext_ack *extack)
 {
 	struct kszphy_ptp_priv *ptp_priv =
 			  container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
@@ -4389,7 +4389,7 @@ static void lan8814_ptp_init(struct phy_device *phydev)
 
 	ptp_priv->mii_ts.rxtstamp = lan8814_rxtstamp;
 	ptp_priv->mii_ts.txtstamp = lan8814_txtstamp;
-	ptp_priv->mii_ts.hwtstamp = lan8814_hwtstamp;
+	ptp_priv->mii_ts.hwtstamp_set = lan8814_hwtstamp_set;
 	ptp_priv->mii_ts.ts_info  = lan8814_ts_info;
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
@@ -5042,9 +5042,9 @@ static void lan8841_ptp_enable_processing(struct kszphy_ptp_priv *ptp_priv,
 #define LAN8841_PTP_TX_TIMESTAMP_EN		443
 #define LAN8841_PTP_TX_MOD			445
 
-static int lan8841_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *config,
-			    struct netlink_ext_ack *extack)
+static int lan8841_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *config,
+				struct netlink_ext_ack *extack)
 {
 	struct kszphy_ptp_priv *ptp_priv = container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
 	struct phy_device *phydev = ptp_priv->phydev;
@@ -5924,7 +5924,7 @@ static int lan8841_probe(struct phy_device *phydev)
 
 	ptp_priv->mii_ts.rxtstamp = lan8841_rxtstamp;
 	ptp_priv->mii_ts.txtstamp = lan8814_txtstamp;
-	ptp_priv->mii_ts.hwtstamp = lan8841_hwtstamp;
+	ptp_priv->mii_ts.hwtstamp_set = lan8841_hwtstamp_set;
 	ptp_priv->mii_ts.ts_info = lan8841_ts_info;
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
diff --git a/drivers/net/phy/microchip_rds_ptp.c b/drivers/net/phy/microchip_rds_ptp.c
index e6514ce04c29..4c6326b0ceaf 100644
--- a/drivers/net/phy/microchip_rds_ptp.c
+++ b/drivers/net/phy/microchip_rds_ptp.c
@@ -476,9 +476,9 @@ static bool mchp_rds_ptp_rxtstamp(struct mii_timestamper *mii_ts,
 	return true;
 }
 
-static int mchp_rds_ptp_hwtstamp(struct mii_timestamper *mii_ts,
-				 struct kernel_hwtstamp_config *config,
-				 struct netlink_ext_ack *extack)
+static int mchp_rds_ptp_hwtstamp_set(struct mii_timestamper *mii_ts,
+				     struct kernel_hwtstamp_config *config,
+				     struct netlink_ext_ack *extack)
 {
 	struct mchp_rds_ptp_clock *clock =
 				container_of(mii_ts, struct mchp_rds_ptp_clock,
@@ -1281,7 +1281,7 @@ struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd,
 
 	clock->mii_ts.rxtstamp = mchp_rds_ptp_rxtstamp;
 	clock->mii_ts.txtstamp = mchp_rds_ptp_txtstamp;
-	clock->mii_ts.hwtstamp = mchp_rds_ptp_hwtstamp;
+	clock->mii_ts.hwtstamp_set = mchp_rds_ptp_hwtstamp_set;
 	clock->mii_ts.ts_info = mchp_rds_ptp_ts_info;
 
 	phydev->mii_ts = &clock->mii_ts;
diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c
index d692df7d975c..dc06614222f6 100644
--- a/drivers/net/phy/mscc/mscc_ptp.c
+++ b/drivers/net/phy/mscc/mscc_ptp.c
@@ -1051,9 +1051,9 @@ static void vsc85xx_ts_reset_fifo(struct phy_device *phydev)
 			     val);
 }
 
-static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *cfg,
-			    struct netlink_ext_ack *extack)
+static int vsc85xx_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *cfg,
+				struct netlink_ext_ack *extack)
 {
 	struct vsc8531_private *vsc8531 =
 		container_of(mii_ts, struct vsc8531_private, mii_ts);
@@ -1611,7 +1611,7 @@ int vsc8584_ptp_probe(struct phy_device *phydev)
 
 	vsc8531->mii_ts.rxtstamp = vsc85xx_rxtstamp;
 	vsc8531->mii_ts.txtstamp = vsc85xx_txtstamp;
-	vsc8531->mii_ts.hwtstamp = vsc85xx_hwtstamp;
+	vsc8531->mii_ts.hwtstamp_set = vsc85xx_hwtstamp_set;
 	vsc8531->mii_ts.ts_info  = vsc85xx_ts_info;
 	phydev->mii_ts = &vsc8531->mii_ts;
 
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 87adb6508017..13a8fac223a9 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -1012,9 +1012,9 @@ static bool nxp_c45_rxtstamp(struct mii_timestamper *mii_ts,
 	return true;
 }
 
-static int nxp_c45_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct kernel_hwtstamp_config *cfg,
-			    struct netlink_ext_ack *extack)
+static int nxp_c45_hwtstamp_set(struct mii_timestamper *mii_ts,
+				struct kernel_hwtstamp_config *cfg,
+				struct netlink_ext_ack *extack)
 {
 	struct nxp_c45_phy *priv = container_of(mii_ts, struct nxp_c45_phy,
 						mii_ts);
@@ -1749,7 +1749,7 @@ static int nxp_c45_probe(struct phy_device *phydev)
 	    IS_ENABLED(CONFIG_NETWORK_PHY_TIMESTAMPING)) {
 		priv->mii_ts.rxtstamp = nxp_c45_rxtstamp;
 		priv->mii_ts.txtstamp = nxp_c45_txtstamp;
-		priv->mii_ts.hwtstamp = nxp_c45_hwtstamp;
+		priv->mii_ts.hwtstamp_set = nxp_c45_hwtstamp_set;
 		priv->mii_ts.ts_info = nxp_c45_ts_info;
 		phydev->mii_ts = &priv->mii_ts;
 		ret = nxp_c45_init_ptp_clock(priv);
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 02da4a203ddd..350bc23c1fdb 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -405,12 +405,14 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	case SIOCSHWTSTAMP:
-		if (phydev->mii_ts && phydev->mii_ts->hwtstamp) {
+		if (phydev->mii_ts && phydev->mii_ts->hwtstamp_set) {
 			if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
 				return -EFAULT;
 
 			hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
-			ret = phydev->mii_ts->hwtstamp(phydev->mii_ts, &kernel_cfg, &extack);
+			ret = phydev->mii_ts->hwtstamp_set(phydev->mii_ts,
+							   &kernel_cfg,
+							   &extack);
 			if (ret)
 				return ret;
 
@@ -493,8 +495,9 @@ int __phy_hwtstamp_set(struct phy_device *phydev,
 	if (!phydev)
 		return -ENODEV;
 
-	if (phydev->mii_ts && phydev->mii_ts->hwtstamp)
-		return phydev->mii_ts->hwtstamp(phydev->mii_ts, config, extack);
+	if (phydev->mii_ts && phydev->mii_ts->hwtstamp_set)
+		return phydev->mii_ts->hwtstamp_set(phydev->mii_ts, config,
+						    extack);
 
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c
index 68f1f7fdaa9d..56c798e77f20 100644
--- a/drivers/ptp/ptp_ines.c
+++ b/drivers/ptp/ptp_ines.c
@@ -328,9 +328,9 @@ static u64 ines_find_txts(struct ines_port *port, struct sk_buff *skb)
 	return ns;
 }
 
-static int ines_hwtstamp(struct mii_timestamper *mii_ts,
-			 struct kernel_hwtstamp_config *cfg,
-			 struct netlink_ext_ack *extack)
+static int ines_hwtstamp_set(struct mii_timestamper *mii_ts,
+			     struct kernel_hwtstamp_config *cfg,
+			     struct netlink_ext_ack *extack)
 {
 	struct ines_port *port = container_of(mii_ts, struct ines_port, mii_ts);
 	u32 cm_one_step = 0, port_conf, ts_stat_rx, ts_stat_tx;
@@ -709,7 +709,7 @@ static struct mii_timestamper *ines_ptp_probe_channel(struct device *device,
 	}
 	port->mii_ts.rxtstamp = ines_rxtstamp;
 	port->mii_ts.txtstamp = ines_txtstamp;
-	port->mii_ts.hwtstamp = ines_hwtstamp;
+	port->mii_ts.hwtstamp_set = ines_hwtstamp_set;
 	port->mii_ts.link_state = ines_link_state;
 	port->mii_ts.ts_info = ines_ts_info;
 
diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h
index 995db62570f9..08863c0e9ea3 100644
--- a/include/linux/mii_timestamper.h
+++ b/include/linux/mii_timestamper.h
@@ -27,7 +27,7 @@ struct phy_device;
  *		as soon as a timestamp becomes available. One of the PTP_CLASS_
  *		values is passed in 'type'.
  *
- * @hwtstamp:	Handles SIOCSHWTSTAMP ioctl for hardware time stamping.
+ * @hwtstamp_set: Handles SIOCSHWTSTAMP ioctl for hardware time stamping.
  *
  * @link_state: Allows the device to respond to changes in the link
  *		state.  The caller invokes this function while holding
@@ -51,9 +51,9 @@ struct mii_timestamper {
 	void (*txtstamp)(struct mii_timestamper *mii_ts,
 			 struct sk_buff *skb, int type);
 
-	int  (*hwtstamp)(struct mii_timestamper *mii_ts,
-			 struct kernel_hwtstamp_config *kernel_config,
-			 struct netlink_ext_ack *extack);
+	int  (*hwtstamp_set)(struct mii_timestamper *mii_ts,
+			     struct kernel_hwtstamp_config *kernel_config,
+			     struct netlink_ext_ack *extack);
 
 	void (*link_state)(struct mii_timestamper *mii_ts,
 			   struct phy_device *phydev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 65b0c3ca6a2b..059a104223c4 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1915,7 +1915,7 @@ static inline bool phy_polling_mode(struct phy_device *phydev)
  */
 static inline bool phy_has_hwtstamp(struct phy_device *phydev)
 {
-	return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp;
+	return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp_set;
 }
 
 /**
@@ -1950,7 +1950,7 @@ static inline int phy_hwtstamp(struct phy_device *phydev,
 			       struct kernel_hwtstamp_config *cfg,
 			       struct netlink_ext_ack *extack)
 {
-	return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack);
+	return phydev->mii_ts->hwtstamp_set(phydev->mii_ts, cfg, extack);
 }
 
 static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb,
-- 
cgit v1.2.3


From f467777efbfb8034d813b601b961b25f777b3d37 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Mon, 24 Nov 2025 18:11:46 +0000
Subject: phy: add hwtstamp_get callback to phy drivers

PHY devices lacked a hwtstamp_get callback even though most of them
track configuration info. Introduce a new callback in
mii_timestamper.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy.c           | 3 +++
 include/linux/mii_timestamper.h | 5 +++++
 net/core/dev_ioctl.c            | 9 +++++----
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 350bc23c1fdb..13dd1691886d 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -478,6 +478,9 @@ int __phy_hwtstamp_get(struct phy_device *phydev,
 	if (!phydev)
 		return -ENODEV;
 
+	if (phydev->mii_ts && phydev->mii_ts->hwtstamp_get)
+		return phydev->mii_ts->hwtstamp_get(phydev->mii_ts, config);
+
 	return -EOPNOTSUPP;
 }
 
diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h
index 08863c0e9ea3..3102c425c8e0 100644
--- a/include/linux/mii_timestamper.h
+++ b/include/linux/mii_timestamper.h
@@ -29,6 +29,8 @@ struct phy_device;
  *
  * @hwtstamp_set: Handles SIOCSHWTSTAMP ioctl for hardware time stamping.
  *
+ * @hwtstamp_get: Handles SIOCGHWTSTAMP ioctl for hardware time stamping.
+ *
  * @link_state: Allows the device to respond to changes in the link
  *		state.  The caller invokes this function while holding
  *		the phy_device mutex.
@@ -55,6 +57,9 @@ struct mii_timestamper {
 			     struct kernel_hwtstamp_config *kernel_config,
 			     struct netlink_ext_ack *extack);
 
+	int  (*hwtstamp_get)(struct mii_timestamper *mii_ts,
+			     struct kernel_hwtstamp_config *kernel_config);
+
 	void (*link_state)(struct mii_timestamper *mii_ts,
 			   struct phy_device *phydev);
 
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 0720ccc14df9..53a53357cfef 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -249,10 +249,11 @@ int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
  *
  * Helper for calling the default hardware provider timestamping.
  *
- * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
- * there only exists a phydev->mii_ts->hwtstamp() method. So this will return
- * -EOPNOTSUPP for phylib for now, which is still more accurate than letting
- * the netdev handle the GET request.
+ * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but
+ * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this
+ * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not
+ * implemented for now, which is still more accurate than letting the netdev
+ * handle the GET request.
  */
 int dev_get_hwtstamp_phylib(struct net_device *dev,
 			    struct kernel_hwtstamp_config *cfg)
-- 
cgit v1.2.3


From 4a93adcbd201aad5ba607810cfe1b19d44e5d171 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Wed, 12 Nov 2025 11:28:46 +0100
Subject: of: Add wrappers to match root node with OF device ID tables

Several drivers duplicate the same code for getting a reference to the
root node, matching it against a 'struct of_device_id' table and getting
the match data out of the table entry.

There is an of_machine_compatible_match() wrapper, but it takes an array
of strings, which is not suitable for many drivers since they want the
driver data associated with each compatible.

Add two wrappers, similar to existing of_device_get_match_data():
1. of_machine_device_match() doing only matching against 'struct
   of_device_id' and returning bool.
2. of_machine_get_match_data() doing the matching and returning
   associated driver data for found compatible.

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Tested-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://patch.msgid.link/20251112-b4-of-match-matchine-data-v2-1-d46b72003fd6@linaro.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/base.c  | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h | 13 +++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 7043acd971a0..0b65039ece53 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -434,6 +434,53 @@ bool of_machine_compatible_match(const char *const *compats)
 }
 EXPORT_SYMBOL(of_machine_compatible_match);
 
+/**
+ * of_machine_device_match - Test root of device tree against an of_device_id array
+ * @matches:	NULL terminated array of of_device_id match structures to search in
+ *
+ * Returns true if the root node has any of the given compatible values in its
+ * compatible property.
+ */
+bool of_machine_device_match(const struct of_device_id *matches)
+{
+	struct device_node *root;
+	const struct of_device_id *match = NULL;
+
+	root = of_find_node_by_path("/");
+	if (root) {
+		match = of_match_node(matches, root);
+		of_node_put(root);
+	}
+
+	return match != NULL;
+}
+EXPORT_SYMBOL(of_machine_device_match);
+
+/**
+ * of_machine_get_match_data - Get the match data of the of_device_id entry matching the root of device tree
+ * @matches:	NULL terminated array of of_device_id match structures to search in
+ *
+ * Returns data associated with matched entry or NULL
+ */
+const void *of_machine_get_match_data(const struct of_device_id *matches)
+{
+	const struct of_device_id *match;
+	struct device_node *root;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return NULL;
+
+	match = of_match_node(matches, root);
+	of_node_put(root);
+
+	if (!match)
+		return NULL;
+
+	return match->data;
+}
+EXPORT_SYMBOL(of_machine_get_match_data);
+
 static bool __of_device_is_status(const struct device_node *device,
 				  const char * const*strings)
 {
diff --git a/include/linux/of.h b/include/linux/of.h
index 121a288ca92d..01bb3affcd49 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -407,6 +407,8 @@ extern int of_alias_get_id(const struct device_node *np, const char *stem);
 extern int of_alias_get_highest_id(const char *stem);
 
 bool of_machine_compatible_match(const char *const *compats);
+bool of_machine_device_match(const struct of_device_id *matches);
+const void *of_machine_get_match_data(const struct of_device_id *matches);
 
 /**
  * of_machine_is_compatible - Test root of device tree for a given compatible value
@@ -855,6 +857,17 @@ static inline bool of_machine_compatible_match(const char *const *compats)
 	return false;
 }
 
+static inline bool of_machine_device_match(const struct of_device_id *matches)
+{
+	return false;
+}
+
+static inline const void *
+of_machine_get_match_data(const struct of_device_id *matches)
+{
+	return NULL;
+}
+
 static inline bool of_console_check(const struct device_node *dn, const char *name, int index)
 {
 	return false;
-- 
cgit v1.2.3


From 1cd1c472343b06d6d32038636ce51bfa2251e3cf Mon Sep 17 00:00:00 2001
From: Jon Kohler <jon@nutanix.com>
Date: Tue, 25 Nov 2025 15:27:53 -0700
Subject: virtio-net: avoid unnecessary checksum calculation on guest RX

Commit a2fb4bc4e2a6 ("net: implement virtio helpers to handle UDP
GSO tunneling.") inadvertently altered checksum offload behavior
for guests not using UDP GSO tunneling.

Before, tun_put_user called tun_vnet_hdr_from_skb, which passed
has_data_valid = true to virtio_net_hdr_from_skb.

After, tun_put_user began calling tun_vnet_hdr_tnl_from_skb instead,
which passes has_data_valid = false into both call sites.

This caused virtio hdr flags to not include VIRTIO_NET_HDR_F_DATA_VALID
for SKBs where skb->ip_summed == CHECKSUM_UNNECESSARY. As a result,
guests are forced to recalculate checksums unnecessarily.

Restore the previous behavior by ensuring has_data_valid = true is
passed in the !tnl_gso_type case, but only from tun side, as
virtio_net_hdr_tnl_from_skb() is used also by the virtio_net driver,
which in turn must not use VIRTIO_NET_HDR_F_DATA_VALID on tx.

cc: stable@vger.kernel.org
Fixes: a2fb4bc4e2a6 ("net: implement virtio helpers to handle UDP GSO tunneling.")
Signed-off-by: Jon Kohler <jon@nutanix.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20251125222754.1737443-1-jon@nutanix.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tun_vnet.h     | 2 +-
 drivers/net/virtio_net.c   | 3 ++-
 include/linux/virtio_net.h | 7 ++++---
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index 81662328b2c7..a5f93b6c4482 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -244,7 +244,7 @@ tun_vnet_hdr_tnl_from_skb(unsigned int flags,
 
 	if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload,
 					tun_vnet_is_little_endian(flags),
-					vlan_hlen)) {
+					vlan_hlen, true)) {
 		struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr;
 		struct skb_shared_info *sinfo = skb_shinfo(skb);
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0369dda5ed60..8e04adb57f52 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3339,7 +3339,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
 		hdr = &skb_vnet_common_hdr(skb)->tnl_hdr;
 
 	if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl,
-					virtio_is_little_endian(vi->vdev), 0))
+					virtio_is_little_endian(vi->vdev), 0,
+					false))
 		return -EPROTO;
 
 	if (vi->mergeable_rx_bufs)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index b673c31569f3..75dabb763c65 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -384,7 +384,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
 			    struct virtio_net_hdr_v1_hash_tunnel *vhdr,
 			    bool tnl_hdr_negotiated,
 			    bool little_endian,
-			    int vlan_hlen)
+			    int vlan_hlen,
+			    bool has_data_valid)
 {
 	struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr;
 	unsigned int inner_nh, outer_th;
@@ -394,8 +395,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
 	tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL |
 						    SKB_GSO_UDP_TUNNEL_CSUM);
 	if (!tnl_gso_type)
-		return virtio_net_hdr_from_skb(skb, hdr, little_endian, false,
-					       vlan_hlen);
+		return virtio_net_hdr_from_skb(skb, hdr, little_endian,
+					       has_data_valid, vlan_hlen);
 
 	/* Tunnel support not negotiated but skb ask for it. */
 	if (!tnl_hdr_negotiated)
-- 
cgit v1.2.3


From 361173f95ae4b726ebbbf0bd594274f5576c4abc Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:31 -0500
Subject: virtio: fix typo in virtio_device_ready() comment

"coherenct" -> "coherent"

Fixes: 8b4ec69d7e09 ("virtio: harden vring IRQ")
Message-Id: <db286e9a65449347f6584e68c9960fd5ded2b4b0.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 16001e9f9b39..1ea5baa62141 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -362,7 +362,7 @@ void virtio_device_ready(struct virtio_device *dev)
 	 * specific set_status() method.
 	 *
 	 * A well behaved device will only notify a virtqueue after
-	 * DRIVER_OK, this means the device should "see" the coherenct
+	 * DRIVER_OK, this means the device should "see" the coherent
 	 * memory write that set vq->broken as false which is done by
 	 * the driver when it sees DRIVER_OK, then the following
 	 * driver's vring_interrupt() will see vq->broken as false so
-- 
cgit v1.2.3


From 7831791e77a1cd29528d4dc336ce14466aef5ba6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:34 -0500
Subject: virtio: fix whitespace in virtio_config_ops

The finalize_features documentation uses a tab between words.
Use space instead.

Fixes: d16c0cd27331 ("docs: driver-api: virtio: virtio on Linux")
Message-Id: <39d7685c82848dc6a876d175e33a1407f6ab3fc1.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 1ea5baa62141..dbc7eff1f101 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -86,7 +86,7 @@ struct virtqueue_info {
  *	vdev: the virtio_device
  *	This sends the driver feature bits to the device: it can change
  *	the dev->feature bits if it wants.
- *	Note that despite the name this	can be called any number of
+ *	Note that despite the name this can be called any number of
  *	times.
  *	Returns 0 on success or error status
  * @bus_name: return the bus name associated with the device (optional)
-- 
cgit v1.2.3


From 63598fba55ab9d384818fed48dc04006cecf7be4 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:36 -0500
Subject: virtio: fix grammar in virtio_queue_info docs

Fix grammar in the description of @ctx

Fixes: c502eb85c34e ("virtio: introduce virtio_queue_info struct and find_vqs_info() config op")
Message-Id: <a5cf2b92573200bdb1c1927e559d3930d61a4af2.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index dbc7eff1f101..78cf4119f567 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -24,7 +24,7 @@ typedef void vq_callback_t(struct virtqueue *);
  *        a virtqueue unused by the driver.
  * @callback: A callback to invoke on a used buffer notification.
  *            NULL for a virtqueue that does not need a callback.
- * @ctx: A flag to indicate to maintain an extra context per virtqueue.
+ * @ctx: whether to maintain an extra context per virtqueue.
  */
 struct virtqueue_info {
 	const char *name;
-- 
cgit v1.2.3


From c15f42e09178d2849744ccf064200f5e7f71e688 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:38 -0500
Subject: virtio: fix grammar in virtio_map_ops docs

Fix grammar issues in the virtio_map_ops docs:
- missing article before "transport"
- "implements" -> "implement" to match subject

Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core")
Message-Id: <3f7bcae5a984f14b72e67e82572b110acb06fa7e.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 78cf4119f567..6660132258d4 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -141,8 +141,8 @@ struct virtio_config_ops {
 
 /**
  * struct virtio_map_ops - operations for mapping buffer for a virtio device
- * Note: For transport that has its own mapping logic it must
- * implements all of the operations
+ * Note: For a transport that has its own mapping logic it must
+ * implement all of the operations
  * @map_page: map a buffer to the device
  *      map: metadata for performing mapping
  *      page: the page that will be mapped by the device
-- 
cgit v1.2.3


From 5e88a5a97d113619b674ebfdd1d2065f2edd10eb Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:41 -0500
Subject: virtio: standardize Returns documentation style

Remove colons after "Returns" in virtio_map_ops function
documentation - both to avoid triggering an htmldoc warning
and for consistency with virtio_config_ops.

This affects map_page, alloc, need_sync, and max_mapping_size.

Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core")
Message-Id: <c262893fa21f4b1265147ef864574a9bd173348f.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 6660132258d4..e231147ff92d 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -150,7 +150,7 @@ struct virtio_config_ops {
  *      size: the buffer size
  *      dir: mapping direction
  *      attrs: mapping attributes
- *      Returns: the mapped address
+ *      Returns the mapped address
  * @unmap_page: unmap a buffer from the device
  *      map: device specific mapping map
  *      map_handle: the mapped address
@@ -172,7 +172,7 @@ struct virtio_config_ops {
  *      size: the size of the buffer
  *      map_handle: the mapping address to sync
  *      gfp: allocation flag (GFP_XXX)
- *      Returns: virtual address of the allocated buffer
+ *      Returns virtual address of the allocated buffer
  * @free: free a coherent buffer mapping
  *      map: metadata for performing mapping
  *      size: the size of the buffer
@@ -182,13 +182,13 @@ struct virtio_config_ops {
  * @need_sync: if the buffer needs synchronization
  *      map: metadata for performing mapping
  *      map_handle: the mapped address
- *      Returns: whether the buffer needs synchronization
+ *      Returns whether the buffer needs synchronization
  * @mapping_error: if the mapping address is error
  *      map: metadata for performing mapping
  *      map_handle: the mapped address
  * @max_mapping_size: get the maximum buffer size that can be mapped
  *      map: metadata for performing mapping
- *      Returns: the maximum buffer size that can be mapped
+ *      Returns the maximum buffer size that can be mapped
  */
 struct virtio_map_ops {
 	dma_addr_t (*map_page)(union virtio_map map, struct page *page,
-- 
cgit v1.2.3


From 43236d8bbafff94b423afecc4a692dd90602d426 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:43 -0500
Subject: virtio: fix virtqueue_set_affinity() docs

Rewrite the comment for better grammar and clarity.

Fixes: 75a0a52be3c2 ("virtio: introduce an API to set affinity for a virtqueue")
Message-Id: <e317e91bd43b070e5eaec0ebbe60c5749d02e2dd.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index e231147ff92d..1a019a1f168d 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -384,7 +384,7 @@ const char *virtio_bus_name(struct virtio_device *vdev)
  * @vq: the virtqueue
  * @cpu_mask: the cpu mask
  *
- * Pay attention the function are best-effort: the affinity hint may not be set
+ * Note that this function is best-effort: the affinity hint may not be set
  * due to config support, irq type and sharing.
  *
  */
-- 
cgit v1.2.3


From deb55fc994e3dc38f139c0147c15fc2a9db27086 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Nov 2025 04:34:49 -0500
Subject: virtio: fix map ops comment

@free frees the map handle; it does not sync it. Fix the doc to match.

Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core")
Message-Id: <f6ff1c7aff8401900bf362007d7fb52dfdb6a15b.1763026134.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 1a019a1f168d..a1af2676bbe6 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -177,7 +177,7 @@ struct virtio_config_ops {
  *      map: metadata for performing mapping
  *      size: the size of the buffer
  *      vaddr: virtual address of the buffer
- *      map_handle: the mapping address to sync
+ *      map_handle: the mapping address that needs to be freed
  *      attrs: unmapping attributes
  * @need_sync: if the buffer needs synchronization
  *      map: metadata for performing mapping
-- 
cgit v1.2.3


From 9513f25056b22100ddffe24898c587873b0d022c Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 21 Oct 2025 10:56:57 -0400
Subject: virtio: clean up features qword/dword terms

virtio pci uses word to mean "16 bits". mmio uses it to mean
"32 bits".

To avoid confusion, let's avoid the term in core virtio
altogether. Just say U64 to mean "64 bit".

Fixes: e7d4c1c5a546 ("virtio: introduce extended features")
Cc: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-ID: <ad53b7b6be87fc524f45abaeca0bb05fb3633397.1764225384.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c                    | 12 ++++++------
 drivers/virtio/virtio.c                | 12 ++++++------
 drivers/virtio/virtio_debug.c          | 10 +++++-----
 drivers/virtio/virtio_pci_modern_dev.c |  6 +++---
 include/linux/virtio.h                 |  2 +-
 include/linux/virtio_config.h          |  2 +-
 include/linux/virtio_features.h        | 29 +++++++++++++++--------------
 include/linux/virtio_pci_modern.h      |  8 ++++----
 8 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..d057ea55f5ad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -69,7 +69,7 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 
 #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
 
-static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
+static const u64 vhost_net_features[VIRTIO_FEATURES_U64S] = {
 	VHOST_FEATURES |
 	(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 	(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
@@ -1720,7 +1720,7 @@ out:
 static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 			    unsigned long arg)
 {
-	u64 all_features[VIRTIO_FEATURES_DWORDS];
+	u64 all_features[VIRTIO_FEATURES_U64S];
 	struct vhost_net *n = f->private_data;
 	void __user *argp = (void __user *)arg;
 	u64 __user *featurep = argp;
@@ -1752,7 +1752,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 
 		/* Copy the net features, up to the user-provided buffer size */
 		argp += sizeof(u64);
-		copied = min(count, VIRTIO_FEATURES_DWORDS);
+		copied = min(count, (u64)VIRTIO_FEATURES_U64S);
 		if (copy_to_user(argp, vhost_net_features,
 				 copied * sizeof(u64)))
 			return -EFAULT;
@@ -1767,13 +1767,13 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 
 		virtio_features_zero(all_features);
 		argp += sizeof(u64);
-		copied = min(count, VIRTIO_FEATURES_DWORDS);
+		copied = min(count, (u64)VIRTIO_FEATURES_U64S);
 		if (copy_from_user(all_features, argp, copied * sizeof(u64)))
 			return -EFAULT;
 
 		/*
 		 * Any feature specified by user-space above
-		 * VIRTIO_FEATURES_MAX is not supported by definition.
+		 * VIRTIO_FEATURES_BITS is not supported by definition.
 		 */
 		for (i = copied; i < count; ++i) {
 			if (copy_from_user(&features, featurep + 1 + i,
@@ -1783,7 +1783,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 				return -EOPNOTSUPP;
 		}
 
-		for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+		for (i = 0; i < VIRTIO_FEATURES_U64S; i++)
 			if (all_features[i] & ~vhost_net_features[i])
 				return -EOPNOTSUPP;
 
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index a09eb4d62f82..5bdc6b82b30b 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -53,7 +53,7 @@ static ssize_t features_show(struct device *_d,
 
 	/* We actually represent this as a bitstring, as it could be
 	 * arbitrary length in future. */
-	for (i = 0; i < VIRTIO_FEATURES_MAX; i++)
+	for (i = 0; i < VIRTIO_FEATURES_BITS; i++)
 		len += sysfs_emit_at(buf, len, "%c",
 			       __virtio_test_bit(dev, i) ? '1' : '0');
 	len += sysfs_emit_at(buf, len, "\n");
@@ -272,8 +272,8 @@ static int virtio_dev_probe(struct device *_d)
 	int err, i;
 	struct virtio_device *dev = dev_to_virtio(_d);
 	struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
-	u64 device_features[VIRTIO_FEATURES_DWORDS];
-	u64 driver_features[VIRTIO_FEATURES_DWORDS];
+	u64 device_features[VIRTIO_FEATURES_U64S];
+	u64 driver_features[VIRTIO_FEATURES_U64S];
 	u64 driver_features_legacy;
 
 	/* We have a driver! */
@@ -286,7 +286,7 @@ static int virtio_dev_probe(struct device *_d)
 	virtio_features_zero(driver_features);
 	for (i = 0; i < drv->feature_table_size; i++) {
 		unsigned int f = drv->feature_table[i];
-		if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX))
+		if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_BITS))
 			virtio_features_set_bit(driver_features, f);
 	}
 
@@ -303,7 +303,7 @@ static int virtio_dev_probe(struct device *_d)
 	}
 
 	if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) {
-		for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+		for (i = 0; i < VIRTIO_FEATURES_U64S; ++i)
 			dev->features_array[i] = driver_features[i] &
 						 device_features[i];
 	} else {
@@ -325,7 +325,7 @@ static int virtio_dev_probe(struct device *_d)
 		goto err;
 
 	if (drv->validate) {
-		u64 features[VIRTIO_FEATURES_DWORDS];
+		u64 features[VIRTIO_FEATURES_U64S];
 
 		virtio_features_copy(features, dev->features_array);
 		err = drv->validate(dev);
diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c
index d58713ddf2e5..ccf1955a1183 100644
--- a/drivers/virtio/virtio_debug.c
+++ b/drivers/virtio/virtio_debug.c
@@ -8,12 +8,12 @@ static struct dentry *virtio_debugfs_dir;
 
 static int virtio_debug_device_features_show(struct seq_file *s, void *data)
 {
-	u64 device_features[VIRTIO_FEATURES_DWORDS];
+	u64 device_features[VIRTIO_FEATURES_U64S];
 	struct virtio_device *dev = s->private;
 	unsigned int i;
 
 	virtio_get_features(dev, device_features);
-	for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+	for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
 		if (virtio_features_test_bit(device_features, i))
 			seq_printf(s, "%u\n", i);
 	}
@@ -26,7 +26,7 @@ static int virtio_debug_filter_features_show(struct seq_file *s, void *data)
 	struct virtio_device *dev = s->private;
 	unsigned int i;
 
-	for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+	for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
 		if (virtio_features_test_bit(dev->debugfs_filter_features, i))
 			seq_printf(s, "%u\n", i);
 	}
@@ -50,7 +50,7 @@ static int virtio_debug_filter_feature_add(void *data, u64 val)
 {
 	struct virtio_device *dev = data;
 
-	if (val >= VIRTIO_FEATURES_MAX)
+	if (val >= VIRTIO_FEATURES_BITS)
 		return -EINVAL;
 
 	virtio_features_set_bit(dev->debugfs_filter_features, val);
@@ -64,7 +64,7 @@ static int virtio_debug_filter_feature_del(void *data, u64 val)
 {
 	struct virtio_device *dev = data;
 
-	if (val >= VIRTIO_FEATURES_MAX)
+	if (val >= VIRTIO_FEATURES_BITS)
 		return -EINVAL;
 
 	virtio_features_clear_bit(dev->debugfs_filter_features, val);
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 9e503b7a58d8..413a8c353463 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -401,7 +401,7 @@ void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev,
 	int i;
 
 	virtio_features_zero(features);
-	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+	for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
 		u64 cur;
 
 		vp_iowrite32(i, &cfg->device_feature_select);
@@ -427,7 +427,7 @@ vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev,
 	int i;
 
 	virtio_features_zero(features);
-	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+	for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
 		u64 cur;
 
 		vp_iowrite32(i, &cfg->guest_feature_select);
@@ -448,7 +448,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
 	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
 	int i;
 
-	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+	for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
 		u32 cur = features[i >> 1] >> (32 * (i & 1));
 
 		vp_iowrite32(i, &cfg->guest_feature_select);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 96c66126c074..132a474e5914 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -177,7 +177,7 @@ struct virtio_device {
 	union virtio_map vmap;
 #ifdef CONFIG_VIRTIO_DEBUG
 	struct dentry *debugfs_dir;
-	u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS];
+	u64 debugfs_filter_features[VIRTIO_FEATURES_U64S];
 #endif
 };
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index a1af2676bbe6..69f84ea85d71 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -80,7 +80,7 @@ struct virtqueue_info {
  *	Returns the first 64 feature bits.
  * @get_extended_features:
  *      vdev: the virtio_device
- *      Returns the first VIRTIO_FEATURES_MAX feature bits (all we currently
+ *      Returns the first VIRTIO_FEATURES_BITS feature bits (all we currently
  *      need).
  * @finalize_features: confirm what device features we'll be using.
  *	vdev: the virtio_device
diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h
index f748f2f87de8..ea2ad8717882 100644
--- a/include/linux/virtio_features.h
+++ b/include/linux/virtio_features.h
@@ -4,15 +4,16 @@
 
 #include <linux/bits.h>
 
-#define VIRTIO_FEATURES_DWORDS	2
-#define VIRTIO_FEATURES_MAX	(VIRTIO_FEATURES_DWORDS * 64)
-#define VIRTIO_FEATURES_WORDS	(VIRTIO_FEATURES_DWORDS * 2)
+#define VIRTIO_FEATURES_U64S	2
+#define VIRTIO_FEATURES_BITS	(VIRTIO_FEATURES_U64S * 64)
+
 #define VIRTIO_BIT(b)		BIT_ULL((b) & 0x3f)
-#define VIRTIO_DWORD(b)		((b) >> 6)
+#define VIRTIO_U64(b)		((b) >> 6)
+
 #define VIRTIO_DECLARE_FEATURES(name)			\
 	union {						\
 		u64 name;				\
-		u64 name##_array[VIRTIO_FEATURES_DWORDS];\
+		u64 name##_array[VIRTIO_FEATURES_U64S];\
 	}
 
 static inline bool virtio_features_chk_bit(unsigned int bit)
@@ -22,9 +23,9 @@ static inline bool virtio_features_chk_bit(unsigned int bit)
 		 * Don't care returning the correct value: the build
 		 * will fail before any bad features access
 		 */
-		BUILD_BUG_ON(bit >= VIRTIO_FEATURES_MAX);
+		BUILD_BUG_ON(bit >= VIRTIO_FEATURES_BITS);
 	} else {
-		if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_MAX))
+		if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_BITS))
 			return false;
 	}
 	return true;
@@ -34,26 +35,26 @@ static inline bool virtio_features_test_bit(const u64 *features,
 					    unsigned int bit)
 {
 	return virtio_features_chk_bit(bit) &&
-	       !!(features[VIRTIO_DWORD(bit)] & VIRTIO_BIT(bit));
+	       !!(features[VIRTIO_U64(bit)] & VIRTIO_BIT(bit));
 }
 
 static inline void virtio_features_set_bit(u64 *features,
 					   unsigned int bit)
 {
 	if (virtio_features_chk_bit(bit))
-		features[VIRTIO_DWORD(bit)] |= VIRTIO_BIT(bit);
+		features[VIRTIO_U64(bit)] |= VIRTIO_BIT(bit);
 }
 
 static inline void virtio_features_clear_bit(u64 *features,
 					     unsigned int bit)
 {
 	if (virtio_features_chk_bit(bit))
-		features[VIRTIO_DWORD(bit)] &= ~VIRTIO_BIT(bit);
+		features[VIRTIO_U64(bit)] &= ~VIRTIO_BIT(bit);
 }
 
 static inline void virtio_features_zero(u64 *features)
 {
-	memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_DWORDS);
+	memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_U64S);
 }
 
 static inline void virtio_features_from_u64(u64 *features, u64 from)
@@ -66,7 +67,7 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2)
 {
 	int i;
 
-	for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+	for (i = 0; i < VIRTIO_FEATURES_U64S; ++i)
 		if (f1[i] != f2[i])
 			return false;
 	return true;
@@ -74,14 +75,14 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2)
 
 static inline void virtio_features_copy(u64 *to, const u64 *from)
 {
-	memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_DWORDS);
+	memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_U64S);
 }
 
 static inline void virtio_features_andnot(u64 *to, const u64 *f1, const u64 *f2)
 {
 	int i;
 
-	for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+	for (i = 0; i < VIRTIO_FEATURES_U64S; i++)
 		to[i] = f1[i] & ~f2[i];
 }
 
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 48bc12d1045b..9a3f2fc53bd6 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -107,7 +107,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
 static inline u64
 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
 {
-	u64 features_array[VIRTIO_FEATURES_DWORDS];
+	u64 features_array[VIRTIO_FEATURES_U64S];
 
 	vp_modern_get_extended_features(mdev, features_array);
 	return features_array[0];
@@ -116,11 +116,11 @@ vp_modern_get_features(struct virtio_pci_modern_device *mdev)
 static inline u64
 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
 {
-	u64 features_array[VIRTIO_FEATURES_DWORDS];
+	u64 features_array[VIRTIO_FEATURES_U64S];
 	int i;
 
 	vp_modern_get_driver_extended_features(mdev, features_array);
-	for (i = 1; i < VIRTIO_FEATURES_DWORDS; ++i)
+	for (i = 1; i < VIRTIO_FEATURES_U64S; ++i)
 		WARN_ON_ONCE(features_array[i]);
 	return features_array[0];
 }
@@ -128,7 +128,7 @@ vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
 static inline void
 vp_modern_set_features(struct virtio_pci_modern_device *mdev, u64 features)
 {
-	u64 features_array[VIRTIO_FEATURES_DWORDS];
+	u64 features_array[VIRTIO_FEATURES_U64S];
 
 	virtio_features_from_u64(features_array, features);
 	vp_modern_set_extended_features(mdev, features_array);
-- 
cgit v1.2.3


From e6c43c95009035a63091cd49736886f883127510 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Fri, 21 Nov 2025 08:39:55 -0800
Subject: net: phy: Add MDIO_PMA_CTRL1_SPEED for 2.5G and 5G to reflect PMA
 values

The 2.5G and 5G values are not consistent between the PCS CTRL1 and PMA
CTRL1 values. In order to avoid confusion between the two I am updating the
values to include "PMA" in the name similar to values used in similar
places.

To avoid breaking UAPI I have retained the original macros and just defined
them as the new PMA based defines.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374319569.959489.6610469879021800710.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy-c45.c |  8 ++++----
 include/uapi/linux/mdio.h | 12 ++++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index e8e5be4684ab..f5e23b53994f 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -148,12 +148,12 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev)
 		ctrl2 |= MDIO_PMA_CTRL2_1000BT;
 		break;
 	case SPEED_2500:
-		ctrl1 |= MDIO_CTRL1_SPEED2_5G;
+		ctrl1 |= MDIO_PMA_CTRL1_SPEED2_5G;
 		/* Assume 2.5Gbase-T */
 		ctrl2 |= MDIO_PMA_CTRL2_2_5GBT;
 		break;
 	case SPEED_5000:
-		ctrl1 |= MDIO_CTRL1_SPEED5G;
+		ctrl1 |= MDIO_PMA_CTRL1_SPEED5G;
 		/* Assume 5Gbase-T */
 		ctrl2 |= MDIO_PMA_CTRL2_5GBT;
 		break;
@@ -618,10 +618,10 @@ int genphy_c45_read_pma(struct phy_device *phydev)
 	case MDIO_PMA_CTRL1_SPEED1000:
 		phydev->speed = SPEED_1000;
 		break;
-	case MDIO_CTRL1_SPEED2_5G:
+	case MDIO_PMA_CTRL1_SPEED2_5G:
 		phydev->speed = SPEED_2500;
 		break;
-	case MDIO_CTRL1_SPEED5G:
+	case MDIO_PMA_CTRL1_SPEED5G:
 		phydev->speed = SPEED_5000;
 		break;
 	case MDIO_CTRL1_SPEED10G:
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 6975f182b22c..9ee6eeae64b8 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -116,10 +116,18 @@
 #define MDIO_CTRL1_SPEED10G		(MDIO_CTRL1_SPEEDSELEXT | 0x00)
 /* 10PASS-TS/2BASE-TL */
 #define MDIO_CTRL1_SPEED10P2B		(MDIO_CTRL1_SPEEDSELEXT | 0x04)
+/* Note: the MDIO_CTRL1_SPEED_XXX values for everything past 10PASS-TS/2BASE-TL
+ * do not match between the PCS and PMA values. Any additions past this point
+ * should be PMA or PCS specific. The following 2 defines are workarounds for
+ * values added before this was caught. They should be considered deprecated.
+ */
+#define MDIO_CTRL1_SPEED2_5G		MDIO_PMA_CTRL1_SPEED2_5G
+#define MDIO_CTRL1_SPEED5G		MDIO_PMA_CTRL1_SPEED5G
 /* 2.5 Gb/s */
-#define MDIO_CTRL1_SPEED2_5G		(MDIO_CTRL1_SPEEDSELEXT | 0x18)
+#define MDIO_PMA_CTRL1_SPEED2_5G	(MDIO_CTRL1_SPEEDSELEXT | 0x18)
 /* 5 Gb/s */
-#define MDIO_CTRL1_SPEED5G		(MDIO_CTRL1_SPEEDSELEXT | 0x1c)
+#define MDIO_PMA_CTRL1_SPEED5G		(MDIO_CTRL1_SPEEDSELEXT | 0x1c)
+
 
 /* Status register 1. */
 #define MDIO_STAT1_LPOWERABLE		0x0002	/* Low-power ability */
-- 
cgit v1.2.3


From 7622d55276932bfeb947b7b6cbf7ea0aa41feeb8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Fri, 21 Nov 2025 08:40:02 -0800
Subject: net: pcs: xpcs: Add support for 25G, 50G, and 100G interfaces

With this change we are adding support for 25G, 50G, and 100G interface
types to the XPCS driver. This had supposedly been enabled with the
addition of XLGMII but I don't see any capability for configuration there
so I suspect it may need to be refactored in the future.

With this change we can enable the XPCS driver with the selected interface
and it should be able to detect link, speed, and report the link status to
the phylink interface.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374320248.959489.11649590675011158859.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/pcs/pcs-xpcs.c | 105 +++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/mdio.h  |   6 +++
 2 files changed, 107 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 3d1bd5aac093..9fb9d4fd2a5b 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -37,6 +37,16 @@ static const int xpcs_10gkr_features[] = {
 	__ETHTOOL_LINK_MODE_MASK_NBITS,
 };
 
+static const int xpcs_25gbaser_features[] = {
+	ETHTOOL_LINK_MODE_MII_BIT,
+	ETHTOOL_LINK_MODE_Pause_BIT,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_25000baseCR_Full_BIT,
+	ETHTOOL_LINK_MODE_25000baseKR_Full_BIT,
+	ETHTOOL_LINK_MODE_25000baseSR_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
 static const int xpcs_xlgmii_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -67,6 +77,40 @@ static const int xpcs_xlgmii_features[] = {
 	__ETHTOOL_LINK_MODE_MASK_NBITS,
 };
 
+static const int xpcs_50gbaser_features[] = {
+	ETHTOOL_LINK_MODE_MII_BIT,
+	ETHTOOL_LINK_MODE_Pause_BIT,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_50000baseKR_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseSR_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseCR_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseDR_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
+static const int xpcs_50gbaser2_features[] = {
+	ETHTOOL_LINK_MODE_MII_BIT,
+	ETHTOOL_LINK_MODE_Pause_BIT,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT,
+	ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
+static const int xpcs_100gbasep_features[] = {
+	ETHTOOL_LINK_MODE_MII_BIT,
+	ETHTOOL_LINK_MODE_Pause_BIT,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT,
+	ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT,
+	ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT,
+	ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT,
+	ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
 static const int xpcs_10gbaser_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -523,9 +567,38 @@ static int xpcs_get_max_xlgmii_speed(struct dw_xpcs *xpcs,
 	return speed;
 }
 
-static void xpcs_resolve_pma(struct dw_xpcs *xpcs,
-			     struct phylink_link_state *state)
+static int xpcs_c45_read_pcs_speed(struct dw_xpcs *xpcs,
+				   struct phylink_link_state *state)
 {
+	int pcs_ctrl1;
+
+	pcs_ctrl1 = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_CTRL1);
+	if (pcs_ctrl1 < 0)
+		return pcs_ctrl1;
+
+	switch (pcs_ctrl1 & MDIO_CTRL1_SPEEDSEL) {
+	case MDIO_PCS_CTRL1_SPEED25G:
+		state->speed = SPEED_25000;
+		break;
+	case MDIO_PCS_CTRL1_SPEED50G:
+		state->speed =  SPEED_50000;
+		break;
+	case MDIO_PCS_CTRL1_SPEED100G:
+		state->speed = SPEED_100000;
+		break;
+	default:
+		state->speed = SPEED_UNKNOWN;
+		break;
+	}
+
+	return 0;
+}
+
+static int xpcs_resolve_pma(struct dw_xpcs *xpcs,
+			    struct phylink_link_state *state)
+{
+	int err = 0;
+
 	state->pause = MLO_PAUSE_TX | MLO_PAUSE_RX;
 	state->duplex = DUPLEX_FULL;
 
@@ -536,10 +609,18 @@ static void xpcs_resolve_pma(struct dw_xpcs *xpcs,
 	case PHY_INTERFACE_MODE_XLGMII:
 		state->speed = xpcs_get_max_xlgmii_speed(xpcs, state);
 		break;
+	case PHY_INTERFACE_MODE_100GBASEP:
+	case PHY_INTERFACE_MODE_LAUI:
+	case PHY_INTERFACE_MODE_50GBASER:
+	case PHY_INTERFACE_MODE_25GBASER:
+		err = xpcs_c45_read_pcs_speed(xpcs, state);
+		break;
 	default:
 		state->speed = SPEED_UNKNOWN;
 		break;
 	}
+
+	return err;
 }
 
 static int xpcs_validate(struct phylink_pcs *pcs, unsigned long *supported,
@@ -945,10 +1026,10 @@ static int xpcs_get_state_c73(struct dw_xpcs *xpcs,
 
 		phylink_resolve_c73(state);
 	} else {
-		xpcs_resolve_pma(xpcs, state);
+		ret = xpcs_resolve_pma(xpcs, state);
 	}
 
-	return 0;
+	return ret;
 }
 
 static int xpcs_get_state_c37_sgmii(struct dw_xpcs *xpcs,
@@ -1312,10 +1393,26 @@ static const struct dw_xpcs_compat synopsys_xpcs_compat[] = {
 		.interface = PHY_INTERFACE_MODE_10GKR,
 		.supported = xpcs_10gkr_features,
 		.an_mode = DW_AN_C73,
+	}, {
+		.interface = PHY_INTERFACE_MODE_25GBASER,
+		.supported = xpcs_25gbaser_features,
+		.an_mode = DW_AN_C73,
 	}, {
 		.interface = PHY_INTERFACE_MODE_XLGMII,
 		.supported = xpcs_xlgmii_features,
 		.an_mode = DW_AN_C73,
+	}, {
+		.interface = PHY_INTERFACE_MODE_50GBASER,
+		.supported = xpcs_50gbaser_features,
+		.an_mode = DW_AN_C73,
+	}, {
+		.interface = PHY_INTERFACE_MODE_LAUI,
+		.supported = xpcs_50gbaser2_features,
+		.an_mode = DW_AN_C73,
+	}, {
+		.interface = PHY_INTERFACE_MODE_100GBASEP,
+		.supported = xpcs_100gbasep_features,
+		.an_mode = DW_AN_C73,
 	}, {
 		.interface = PHY_INTERFACE_MODE_10GBASER,
 		.supported = xpcs_10gbaser_features,
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 9ee6eeae64b8..f23cab33e586 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -123,6 +123,12 @@
  */
 #define MDIO_CTRL1_SPEED2_5G		MDIO_PMA_CTRL1_SPEED2_5G
 #define MDIO_CTRL1_SPEED5G		MDIO_PMA_CTRL1_SPEED5G
+/* 100 Gb/s */
+#define MDIO_PCS_CTRL1_SPEED100G	(MDIO_CTRL1_SPEEDSELEXT | 0x10)
+/* 25 Gb/s */
+#define MDIO_PCS_CTRL1_SPEED25G		(MDIO_CTRL1_SPEEDSELEXT | 0x14)
+/* 50 Gb/s */
+#define MDIO_PCS_CTRL1_SPEED50G		(MDIO_CTRL1_SPEEDSELEXT | 0x18)
 /* 2.5 Gb/s */
 #define MDIO_PMA_CTRL1_SPEED2_5G	(MDIO_CTRL1_SPEEDSELEXT | 0x18)
 /* 5 Gb/s */
-- 
cgit v1.2.3


From 39e138173ae7641e952b456d2de7ad2ac03e8d88 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Fri, 21 Nov 2025 08:40:09 -0800
Subject: net: pcs: xpcs: Fix PMA identifier handling in XPCS

The XPCS driver was mangling the PMA identifier, as the original code
appears to have been focused on just capturing the OUI. Rather than store a
mangled ID, it is better to work with the actual PMA ID and just mask out
the fields that don't apply. Shifting and reordering the fields gains
nothing: you still don't get the original OUI for the NIC without
bit-swapping the values, as per the definition of the layout in IEEE
802.3-2022 22.2.4.3.1.

By laying it out as it was in the hardware it is also less likely for us to
have an unintentional collision as the enum values will occupy the revision
number area while the OUI occupies the upper 22 bits.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374320920.959489.17267159479370601070.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/pcs/pcs-xpcs.c   | 9 ++++-----
 include/linux/pcs/pcs-xpcs.h | 2 +-
 include/uapi/linux/mdio.h    | 5 +++++
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 9fb9d4fd2a5b..a94a7cb93664 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -1365,17 +1365,16 @@ static int xpcs_read_ids(struct dw_xpcs *xpcs)
 	if (ret < 0)
 		return ret;
 
-	id = ret;
+	id = ret << 16;
 
 	ret = xpcs_read(xpcs, MDIO_MMD_PMAPMD, MDIO_DEVID2);
 	if (ret < 0)
 		return ret;
 
-	/* Note the inverted dword order and masked out Model/Revision numbers
-	 * with respect to what is done with the PCS ID...
+	/* For now we only record the OUI for the PMAPMD, we may want to
+	 * add the model number at some point in the future.
 	 */
-	ret = (ret >> 10) & 0x3F;
-	id |= ret << 16;
+	id |= ret & MDIO_DEVID2_OUI;
 
 	/* Set the PMA ID if it hasn't been pre-initialized */
 	if (xpcs->info.pma == DW_XPCS_PMA_ID_NATIVE)
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index e40f554ff717..4cf6bd611e5a 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -38,7 +38,7 @@ enum dw_xpcs_pma_id {
 	DW_XPCS_PMA_GEN4_6G_ID,
 	DW_XPCS_PMA_GEN5_10G_ID,
 	DW_XPCS_PMA_GEN5_12G_ID,
-	WX_TXGBE_XPCS_PMA_10G_ID = 0x0018fc80,
+	WX_TXGBE_XPCS_PMA_10G_ID = 0xfc806000,
 };
 
 struct dw_xpcs_info {
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index f23cab33e586..8d769f100de6 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -147,6 +147,11 @@
 #define MDIO_AN_STAT1_PAGE		0x0040	/* Page received */
 #define MDIO_AN_STAT1_XNP		0x0080	/* Extended next page status */
 
+/* Device Identifier 2 */
+#define MDIO_DEVID2_OUI			0xfc00	/* OUI Portion of PHY ID */
+#define MDIO_DEVID2_MODEL_NUM		0x03f0	/* Manufacturer's Model Number */
+#define MDIO_DEVID2_REV_NUM		0x000f	/* Revision Number */
+
 /* Speed register. */
 #define MDIO_SPEED_10G			0x0001	/* 10G capable */
 #define MDIO_PMA_SPEED_2B		0x0002	/* 2BASE-TL capable */
-- 
cgit v1.2.3


From 3f29dd34f75a09ee7f8333305618edb44617d835 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Fri, 21 Nov 2025 08:40:16 -0800
Subject: net: pcs: xpcs: Add support for FBNIC 25G, 50G, 100G PMD

The fbnic driver is planning to make use of the XPCS driver to enable
support for PCS and better integration with phylink. To do this, though, we
will need to enable several workarounds, because the PMD interface for
fbnic is likely to be unique: it is a mix of two different vendor products
with a unique wrapper around the IP.

I have generated a PHY identifier based on IEEE 802.3-2022 22.2.4.3.1 using
an OUI belonging to Meta Platforms and used with our NICs. Using this we
will provide it as the PMD ID via the SW based MDIO interface so that
the fbnic device can be identified and necessary workarounds enabled in the
XPCS driver.

As an initial workaround this change adds an exception so that soft_reset
is not set when the driver is initially bound to the PCS.

In addition I have added logic to integrate the PMD Rx signal detect state
into the link state for the PCS. With this we can avoid reporting link up
too soon while the FBNIC PMD is still in the training state, and thereby
avoid link flaps.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374321695.959489.6648161125012056619.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/pcs/pcs-xpcs.c   | 24 ++++++++++++++++++++++--
 include/linux/pcs/pcs-xpcs.h |  2 ++
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index a94a7cb93664..9679f2b35a44 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -597,7 +597,26 @@ static int xpcs_c45_read_pcs_speed(struct dw_xpcs *xpcs,
 static int xpcs_resolve_pma(struct dw_xpcs *xpcs,
 			    struct phylink_link_state *state)
 {
-	int err = 0;
+	int pmd_rxdet, err = 0;
+
+	/* The Meta Platforms FBNIC PMD will go into a training state for
+	 * about 4 seconds when the link first comes up. During this time the
+	 * PCS link will bounce. To avoid reporting link up too soon we include
+	 * the PMD state provided by the driver.
+	 */
+	if (xpcs->info.pma == MP_FBNIC_XPCS_PMA_100G_ID) {
+		pmd_rxdet = xpcs_read(xpcs, MDIO_MMD_PMAPMD, MDIO_PMA_RXDET);
+		if (pmd_rxdet < 0) {
+			state->link = false;
+			return pmd_rxdet;
+		}
+
+		/* Verify Rx lanes are trained before reporting link up */
+		if (!(pmd_rxdet & MDIO_PMD_RXDET_GLOBAL)) {
+			state->link = false;
+			return 0;
+		}
+	}
 
 	state->pause = MLO_PAUSE_TX | MLO_PAUSE_RX;
 	state->duplex = DUPLEX_FULL;
@@ -1591,7 +1610,8 @@ static struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev)
 
 	xpcs_get_interfaces(xpcs, xpcs->pcs.supported_interfaces);
 
-	if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID)
+	if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID ||
+	    xpcs->info.pma == MP_FBNIC_XPCS_PMA_100G_ID)
 		xpcs->pcs.poll = false;
 	else
 		xpcs->need_reset = true;
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 4cf6bd611e5a..36073f7b6bb4 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -39,6 +39,8 @@ enum dw_xpcs_pma_id {
 	DW_XPCS_PMA_GEN5_10G_ID,
 	DW_XPCS_PMA_GEN5_12G_ID,
 	WX_TXGBE_XPCS_PMA_10G_ID = 0xfc806000,
+	/* Meta Platforms OUI 88:25:08, model 0, revision 0 */
+	MP_FBNIC_XPCS_PMA_100G_ID = 0x46904000,
 };
 
 struct dw_xpcs_info {
-- 
cgit v1.2.3


From f2f36500a63b73a8be90127322ad740253cf89c0 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 25 Oct 2025 13:15:37 +0200
Subject: configfs: Constify ct_group_ops in struct config_item_type

Make 'ct_group_ops' const in struct config_item_type.
This allows constification of many structures which hold some function
pointers.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/6b720cf407e8a6d30f35beb72e031b2553d1ab7e.1761390472.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
 fs/configfs/dir.c        | 2 +-
 include/linux/configfs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 81f4f06bc87e..4bcd14b3434c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -598,7 +598,7 @@ static void detach_attrs(struct config_item * item)
 static int populate_attrs(struct config_item *item)
 {
 	const struct config_item_type *t = item->ci_type;
-	struct configfs_group_operations *ops;
+	const struct configfs_group_operations *ops;
 	struct configfs_attribute *attr;
 	struct configfs_bin_attribute *bin_attr;
 	int error = 0;
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 698520b1bfdb..31a7d7124460 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -65,7 +65,7 @@ extern void config_item_put(struct config_item *);
 struct config_item_type {
 	struct module				*ct_owner;
 	struct configfs_item_operations		*ct_item_ops;
-	struct configfs_group_operations	*ct_group_ops;
+	const struct configfs_group_operations	*ct_group_ops;
 	struct configfs_attribute		**ct_attrs;
 	struct configfs_bin_attribute		**ct_bin_attrs;
 };
-- 
cgit v1.2.3


From f7f78098690d60a03b47942ac7d73ea17b42239e Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 25 Oct 2025 13:15:38 +0200
Subject: configfs: Constify ct_item_ops in struct config_item_type

Make 'ct_item_ops' const in struct config_item_type.
This allows constification of many structures which hold some function
pointers.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/f43cb57418a7f59e883be8eedc7d6abe802a2094.1761390472.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
 fs/configfs/file.c       | 2 +-
 include/linux/configfs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 0ad32150611e..affe4742bbb5 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -30,7 +30,7 @@ struct configfs_buffer {
 	size_t			count;
 	loff_t			pos;
 	char			* page;
-	struct configfs_item_operations	* ops;
+	const struct configfs_item_operations	*ops;
 	struct mutex		mutex;
 	int			needs_read_fill;
 	bool			read_in_progress;
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 31a7d7124460..ef65c75beeaa 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -64,7 +64,7 @@ extern void config_item_put(struct config_item *);
 
 struct config_item_type {
 	struct module				*ct_owner;
-	struct configfs_item_operations		*ct_item_ops;
+	const struct configfs_item_operations	*ct_item_ops;
 	const struct configfs_group_operations	*ct_group_ops;
 	struct configfs_attribute		**ct_attrs;
 	struct configfs_bin_attribute		**ct_bin_attrs;
-- 
cgit v1.2.3


From d3f52f53a56278ce5ffeafa3cc6cfb3ecef770fe Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 5 Nov 2025 12:32:14 -0800
Subject: srcu: Create an SRCU-fast-updown API

This commit creates an SRCU-fast-updown API, including
DEFINE_SRCU_FAST_UPDOWN(), DEFINE_STATIC_SRCU_FAST_UPDOWN(),
__init_srcu_struct_fast_updown(), init_srcu_struct_fast_updown(),
srcu_read_lock_fast_updown(), srcu_read_unlock_fast_updown(),
__srcu_read_lock_fast_updown(), and __srcu_read_unlock_fast_updown().

These are initially identical to their SRCU-fast counterparts, but both
SRCU-fast and SRCU-fast-updown will be optimized in different directions
by later commits. SRCU-fast will lack any sort of srcu_down_read() and
srcu_up_read() APIs, which will enable extremely efficient NMI safety.
For its part, SRCU-fast-updown will not be NMI safe, which will enable
reasonably efficient implementations of srcu_down_read_fast() and
srcu_up_read_fast().

This API fork happens to meet two different future use cases.

* SRCU-fast will become the reimplementation basis for RCU-TASK-TRACE
  for consolidation. Since RCU-TASK-TRACE must be NMI safe, SRCU-fast
  must be as well.

* SRCU-fast-updown will be needed for uretprobes code in order to get
  rid of the read-side memory barriers while still allowing entering the
  reader at task level while exiting it in a timer handler.

This commit also adds rcutorture tests for the new APIs.  This
(annoyingly) needs to be in the same commit for bisectability.  With this
commit, the 0x8 value tests SRCU-fast-updown.  However, most SRCU-fast
testing will be via the RCU Tasks Trace wrappers.

[ paulmck: Apply s/0x8/0x4/ missing change per Boqun Feng feedback. ]
[ paulmck: Apply Akira Yokosawa feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <bpf@vger.kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/srcu.h     | 77 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/srcutiny.h | 16 ++++++++++
 include/linux/srcutree.h | 55 ++++++++++++++++++++++++++++++++--
 kernel/rcu/rcutorture.c  | 12 ++++----
 kernel/rcu/srcutree.c    | 39 +++++++++++++++++++++---
 5 files changed, 183 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 1dd6812aabe7..344ad51c8f6c 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -28,6 +28,8 @@ struct srcu_struct;
 int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key);
 #ifndef CONFIG_TINY_SRCU
 int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key);
+int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name,
+				   struct lock_class_key *key);
 #endif // #ifndef CONFIG_TINY_SRCU
 
 #define init_srcu_struct(ssp) \
@@ -44,12 +46,20 @@ int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lo
 	__init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \
 })
 
+#define init_srcu_struct_fast_updown(ssp) \
+({ \
+	static struct lock_class_key __srcu_key; \
+	\
+	__init_srcu_struct_fast_updown((ssp), #ssp, &__srcu_key); \
+})
+
 #define __SRCU_DEP_MAP_INIT(srcu_name)	.dep_map = { .name = #srcu_name },
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 int init_srcu_struct(struct srcu_struct *ssp);
 #ifndef CONFIG_TINY_SRCU
 int init_srcu_struct_fast(struct srcu_struct *ssp);
+int init_srcu_struct_fast_updown(struct srcu_struct *ssp);
 #endif // #ifndef CONFIG_TINY_SRCU
 
 #define __SRCU_DEP_MAP_INIT(srcu_name)
@@ -305,6 +315,46 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *
 	return retval;
 }
 
+/**
+ * srcu_read_lock_fast_updown - register a new reader for an SRCU-fast-updown structure.
+ * @ssp: srcu_struct in which to register the new reader.
+ *
+ * Enter an SRCU read-side critical section, but for a light-weight
+ * smp_mb()-free reader.  See srcu_read_lock() for more information.
+ * This function is compatible with srcu_down_read_fast(), but is not
+ * NMI-safe.
+ *
+ * For srcu_read_lock_fast_updown() to be used on an srcu_struct
+ * structure, that structure must have been defined using either
+ * DEFINE_SRCU_FAST_UPDOWN() or DEFINE_STATIC_SRCU_FAST_UPDOWN() on the one
+ * hand or initialized with init_srcu_struct_fast_updown() on the other.
+ * Such an srcu_struct structure cannot be passed to any non-fast-updown
+ * variant of srcu_read_{,un}lock() or srcu_{down,up}_read().  In kernels
+ * built with CONFIG_PROVE_RCU=y, __srcu_check_read_flavor() will complain
+ * bitterly if you ignore this restriction.
+ *
+ * Grace-period auto-expediting is disabled for SRCU-fast-updown
+ * srcu_struct structures because SRCU-fast-updown expedited grace periods
+ * invoke synchronize_rcu_expedited(), IPIs and all.  If you need expedited
+ * SRCU-fast-updown grace periods, use synchronize_srcu_expedited().
+ *
+ * The srcu_read_lock_fast_updown() function can be invoked only from
+ * those contexts where RCU is watching, that is, from contexts where
+ * it would be legal to invoke rcu_read_lock().  Otherwise, lockdep will
+ * complain.
+ */
+static inline struct srcu_ctr __percpu *srcu_read_lock_fast_updown(struct srcu_struct *ssp)
+__acquires(ssp)
+{
+	struct srcu_ctr __percpu *retval;
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast_updown().");
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN);
+	retval = __srcu_read_lock_fast_updown(ssp);
+	rcu_try_lock_acquire(&ssp->dep_map);
+	return retval;
+}
+
 /*
  * Used by tracing, cannot be traced and cannot call lockdep.
  * See srcu_read_lock_fast() for more information.
@@ -335,8 +385,8 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *
 {
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast().");
-	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
-	return __srcu_read_lock_fast(ssp);
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN);
+	return __srcu_read_lock_fast_updown(ssp);
 }
 
 /**
@@ -432,6 +482,23 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast().");
 }
 
+/**
+ * srcu_read_unlock_fast_updown - unregister an old reader from an SRCU-fast-updown structure.
+ * @ssp: srcu_struct in which to unregister the old reader.
+ * @scp: return value from corresponding srcu_read_lock_fast_updown().
+ *
+ * Exit an SRCU-fast-updown read-side critical section.
+ */
+static inline void
+srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases(ssp)
+{
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN);
+	srcu_lock_release(&ssp->dep_map);
+	__srcu_read_unlock_fast_updown(ssp, scp);
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "RCU must be watching srcu_read_unlock_fast_updown().");
+}
+
 /*
  * Used by tracing, cannot be traced and cannot call lockdep.
  * See srcu_read_unlock_fast() for more information.
@@ -455,9 +522,9 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __
 	__releases(ssp)
 {
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
-	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
-	__srcu_read_unlock_fast(ssp, scp);
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast().");
+	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN);
+	__srcu_read_unlock_fast_updown(ssp, scp);
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast_updown().");
 }
 
 /**
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 1ecc3393fb26..e0698024667a 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -50,13 +50,18 @@ void srcu_drive_gp(struct work_struct *wp);
 #define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name)
 #define DEFINE_STATIC_SRCU_FAST(name) \
 	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name)
+#define DEFINE_SRCU_FAST_UPDOWN(name) DEFINE_SRCU(name)
+#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \
+	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name)
 
 // Dummy structure for srcu_notifier_head.
 struct srcu_usage { };
 #define __SRCU_USAGE_INIT(name) { }
 #define __init_srcu_struct_fast __init_srcu_struct
+#define __init_srcu_struct_fast_updown __init_srcu_struct
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
 #define init_srcu_struct_fast init_srcu_struct
+#define init_srcu_struct_fast_updown init_srcu_struct
 #endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC
 
 void synchronize_srcu(struct srcu_struct *ssp);
@@ -100,6 +105,17 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_
 	__srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp));
 }
 
+static inline struct srcu_ctr __percpu *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
+{
+	return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp));
+}
+
+static inline
+void __srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+{
+	__srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp));
+}
+
 static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
 {
 	synchronize_srcu(ssp);
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 6080a9094618..d6f978b50472 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -199,8 +199,15 @@ struct srcu_struct {
  *
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  *
- * DEFINE_SRCU_FAST() creates an srcu_struct and associated structures
- * whose readers must be of the SRCU-fast variety.
+ * DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() create an srcu_struct
+ * and associated structures whose readers must be of the SRCU-fast variety.
+ * DEFINE_SRCU_FAST_UPDOWN() and DEFINE_STATIC_SRCU_FAST_UPDOWN() create
+ * an srcu_struct and associated structures whose readers must be of the
+ * SRCU-fast-updown variety.  The key point (aside from error checking) with
+ * both varieties is that the grace periods must use synchronize_rcu()
+ * instead of smp_mb(), and given that the first (for example)
+ * srcu_read_lock_fast() might race with the first synchronize_srcu(),
+ * this difference must be specified at initialization time.
  */
 #ifdef MODULE
 # define __DEFINE_SRCU(name, fast, is_static)							\
@@ -221,6 +228,10 @@ struct srcu_struct {
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, 0, static)
 #define DEFINE_SRCU_FAST(name)		__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */)
 #define DEFINE_STATIC_SRCU_FAST(name)	__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static)
+#define DEFINE_SRCU_FAST_UPDOWN(name)	__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, \
+						      /* not static */)
+#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \
+					__DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, static)
 
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
 void synchronize_srcu_expedited(struct srcu_struct *ssp);
@@ -305,6 +316,46 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
 		atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks));  // Z, and implicit RCU reader.
 }
 
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.  Returns a pointer that must be passed to the matching
+ * srcu_read_unlock_fast_updown().  This type of reader is compatible
+ * with srcu_down_read_fast() and srcu_up_read_fast().
+ *
+ * See the __srcu_read_lock_fast() comment for more details.
+ */
+static inline
+struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
+{
+	struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
+
+	if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+		this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader.
+	else
+		atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks));  // Y, and implicit RCU reader.
+	barrier(); /* Avoid leaking the critical section. */
+	return scp;
+}
+
+/*
+ * Removes the count for the old reader from the appropriate
+ * per-CPU element of the srcu_struct.  Note that this may well be a
+ * different CPU than that which was incremented by the corresponding
+ * srcu_read_lock_fast(), but it must be within the same task.
+ *
+ * Please see the __srcu_read_lock_fast() function's header comment for
+ * information on implicit RCU readers and NMI safety.
+ */
+static inline void notrace
+__srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+{
+	barrier();  /* Avoid leaking the critical section. */
+	if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+		this_cpu_inc(scp->srcu_unlocks.counter);  // Z, and implicit RCU reader.
+	else
+		atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks));  // Z, and implicit RCU reader.
+}
+
 void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 
 // Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8973cae0a3ef..ad9fdb996a1c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -693,6 +693,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
 
 DEFINE_STATIC_SRCU(srcu_ctl);
 DEFINE_STATIC_SRCU_FAST(srcu_ctlf);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud);
 static struct srcu_struct srcu_ctld;
 static struct srcu_struct *srcu_ctlp = &srcu_ctl;
 static struct rcu_torture_ops srcud_ops;
@@ -703,7 +704,7 @@ static void srcu_torture_init(void)
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
 		srcu_ctlp = &srcu_ctlf;
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
-		srcu_ctlp = &srcu_ctlf;
+		srcu_ctlp = &srcu_ctlfud;
 }
 
 static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
@@ -736,7 +737,7 @@ static int srcu_torture_read_lock(void)
 		ret += idx << 2;
 	}
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
-		scp = srcu_read_lock_fast(srcu_ctlp);
+		scp = srcu_read_lock_fast_updown(srcu_ctlp);
 		idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
 		WARN_ON_ONCE(idx & ~0x1);
 		ret += idx << 3;
@@ -767,9 +768,10 @@ static void srcu_torture_read_unlock(int idx)
 {
 	WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
-		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+		srcu_read_unlock_fast_updown(srcu_ctlp,
+					     __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
-		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 2));
+		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2));
 	if (reader_flavor & SRCU_READ_FLAVOR_NMI)
 		srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
 	if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -919,7 +921,7 @@ static void srcud_torture_init(void)
 {
 	rcu_sync_torture_init();
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
-		WARN_ON(init_srcu_struct_fast(&srcu_ctld));
+		WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld));
 	else if (reader_flavor & SRCU_READ_FLAVOR_FAST)
 		WARN_ON(init_srcu_struct_fast(&srcu_ctld));
 	else
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 2f8aa280911e..ea3f128de06f 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -309,13 +309,24 @@ int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lo
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct_fast);
 
+int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name,
+				   struct lock_class_key *key)
+{
+	ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+	return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown);
+
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /**
  * init_srcu_struct - initialize a sleep-RCU structure
  * @ssp: structure to initialize.
  *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends.  It is necessary
+ * to invoke this on a given srcu_struct before passing that srcu_struct
  * to any other function.  Each srcu_struct represents a separate domain
  * of SRCU protection.
  */
@@ -330,9 +341,11 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure
  * @ssp: structure to initialize.
  *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function.  Each srcu_struct represents a separate domain
- * of SRCU protection.
+ * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock_fast() and friends.  It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
  */
 int init_srcu_struct_fast(struct srcu_struct *ssp)
 {
@@ -341,6 +354,24 @@ int init_srcu_struct_fast(struct srcu_struct *ssp)
 }
 EXPORT_SYMBOL_GPL(init_srcu_struct_fast);
 
+/**
+ * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and
+ * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct
+ * structures that are to be passed to srcu_read_lock_fast_updown(),
+ * srcu_down_read_fast(), and friends.  It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast_updown(struct srcu_struct *ssp)
+{
+	ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+	return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown);
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
-- 
cgit v1.2.3


From 6ca07a9b63ff4ac24931a21086542cd7092ad74f Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Wed, 1 Oct 2025 15:46:36 +0200
Subject: sysctl: Replace void pointer with const pointer to ctl_table

* Replace void* data in the converter functions with a const struct
  ctl_table* table as it was only forwarding values from
  ctl_table->extra{1,2}.
* Remove the void* data in the do_proc_* functions as they already had a
  pointer to the ctl_table.
* Remove min/max structures do_proc_do{uint,int}vec_minmax_conv_param;
  the min/max values get passed directly in ctl_table.
* Keep min/max initialization in extra{1,2} in proc_dou8vec_minmax.
* The do_proc_douintvec was adjusted outside sysctl.c as it is exported
  to fs/pipe.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 fs/pipe.c              |   6 +-
 include/linux/sysctl.h |   5 +-
 kernel/sysctl.c        | 180 ++++++++++++++++++-------------------------------
 3 files changed, 71 insertions(+), 120 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index 42fead1efe52..9411d4fc2f43 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1482,8 +1482,8 @@ static struct file_system_type pipe_fs_type = {
 
 #ifdef CONFIG_SYSCTL
 static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
-					unsigned int *valp,
-					int write, void *data)
+					unsigned int *valp, int write,
+					const struct ctl_table *table)
 {
 	if (write) {
 		unsigned int val;
@@ -1505,7 +1505,7 @@ static int proc_dopipe_max_size(const struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_dopipe_max_size_conv, NULL);
+				 do_proc_dopipe_max_size_conv);
 }
 
 static const struct ctl_table fs_pipe_sysctls[] = {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 28c4a997fd21..436191e569da 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -235,9 +235,8 @@ bool sysctl_is_alias(char *param);
 int do_proc_douintvec(const struct ctl_table *table, int write,
 		      void *buffer, size_t *lenp, loff_t *ppos,
 		      int (*conv)(unsigned long *lvalp,
-				  unsigned int *valp,
-				  int write, void *data),
-		      void *data);
+				  unsigned int *valp, int write,
+				  const struct ctl_table *table));
 
 extern int unaligned_enabled;
 extern int no_unaligned_warning;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb6196e3fa99..f0a691ffb290 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -355,8 +355,8 @@ static void proc_put_char(void **buf, size_t *size, char c)
 }
 
 static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
-				 int *valp,
-				 int write, void *data)
+				 int *valp, int write,
+				 const struct ctl_table *table)
 {
 	if (write) {
 		if (*negp) {
@@ -382,8 +382,8 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 }
 
 static int do_proc_douintvec_conv(unsigned long *lvalp,
-				  unsigned int *valp,
-				  int write, void *data)
+				  unsigned int *valp, int write,
+				  const struct ctl_table *table)
 {
 	if (write) {
 		if (*lvalp > UINT_MAX)
@@ -402,8 +402,7 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
 		  int write, void *buffer,
 		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
-			      int write, void *data),
-		  void *data)
+			      int write, const struct ctl_table *table))
 {
 	int *i, vleft, first = 1, err = 0;
 	size_t left;
@@ -444,12 +443,12 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
 					     sizeof(proc_wspace_sep), NULL);
 			if (err)
 				break;
-			if (conv(&neg, &lval, i, 1, data)) {
+			if (conv(&neg, &lval, i, 1, table)) {
 				err = -EINVAL;
 				break;
 			}
 		} else {
-			if (conv(&neg, &lval, i, 0, data)) {
+			if (conv(&neg, &lval, i, 0, table)) {
 				err = -EINVAL;
 				break;
 			}
@@ -474,11 +473,10 @@ out:
 static int do_proc_dointvec(const struct ctl_table *table, int write,
 		  void *buffer, size_t *lenp, loff_t *ppos,
 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
-			      int write, void *data),
-		  void *data)
+			      int write, const struct ctl_table *table))
 {
 	return __do_proc_dointvec(table->data, table, write,
-			buffer, lenp, ppos, conv, data);
+			buffer, lenp, ppos, conv);
 }
 
 static int do_proc_douintvec_w(unsigned int *tbl_data,
@@ -486,9 +484,8 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
 			       void *buffer,
 			       size_t *lenp, loff_t *ppos,
 			       int (*conv)(unsigned long *lvalp,
-					   unsigned int *valp,
-					   int write, void *data),
-			       void *data)
+					   unsigned int *valp, int write,
+					   const struct ctl_table *table))
 {
 	unsigned long lval;
 	int err = 0;
@@ -518,7 +515,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
 		goto out_free;
 	}
 
-	if (conv(&lval, tbl_data, 1, data)) {
+	if (conv(&lval, tbl_data, 1, table)) {
 		err = -EINVAL;
 		goto out_free;
 	}
@@ -538,12 +535,12 @@ bail_early:
 	return err;
 }
 
-static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+static int do_proc_douintvec_r(unsigned int *tbl_data,
+			       const struct ctl_table *table, void *buffer,
 			       size_t *lenp, loff_t *ppos,
 			       int (*conv)(unsigned long *lvalp,
-					   unsigned int *valp,
-					   int write, void *data),
-			       void *data)
+					   unsigned int *valp, int write,
+					   const struct ctl_table *table))
 {
 	unsigned long lval;
 	int err = 0;
@@ -551,7 +548,7 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
 
 	left = *lenp;
 
-	if (conv(&lval, tbl_data, 0, data)) {
+	if (conv(&lval, tbl_data, 0, table)) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -573,9 +570,8 @@ static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table,
 			       int write, void *buffer,
 			       size_t *lenp, loff_t *ppos,
 			       int (*conv)(unsigned long *lvalp,
-					   unsigned int *valp,
-					   int write, void *data),
-			       void *data)
+					   unsigned int *valp, int write,
+					   const struct ctl_table *table))
 {
 	unsigned int *i, vleft;
 
@@ -601,19 +597,18 @@ static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table,
 
 	if (write)
 		return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
-					   conv, data);
-	return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+					   conv);
+	return do_proc_douintvec_r(i, table, buffer, lenp, ppos, conv);
 }
 
 int do_proc_douintvec(const struct ctl_table *table, int write,
 		      void *buffer, size_t *lenp, loff_t *ppos,
 		      int (*conv)(unsigned long *lvalp,
-				  unsigned int *valp,
-				  int write, void *data),
-		      void *data)
+				  unsigned int *valp, int write,
+				  const struct ctl_table *table))
 {
-	return __do_proc_douintvec(table->data, table, write,
-				   buffer, lenp, ppos, conv, data);
+	return __do_proc_douintvec(table->data, table, write, buffer, lenp,
+				   ppos, conv);
 }
 
 /**
@@ -672,7 +667,7 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer,
 int proc_dointvec(const struct ctl_table *table, int write, void *buffer,
 		  size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL);
 }
 
 /**
@@ -692,42 +687,28 @@ int proc_douintvec(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_douintvec_conv, NULL);
+				 do_proc_douintvec_conv);
 }
 
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
-	int *min;
-	int *max;
-};
-
 static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
-					int *valp,
-					int write, void *data)
+					int *valp, int write,
+					const struct ctl_table *table)
 {
-	int tmp, ret;
-	struct do_proc_dointvec_minmax_conv_param *param = data;
+	int tmp, ret, *min, *max;
 	/*
 	 * If writing, first do so via a temporary local int so we can
 	 * bounds-check it before touching *valp.
 	 */
 	int *ip = write ? &tmp : valp;
 
-	ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+	ret = do_proc_dointvec_conv(negp, lvalp, ip, write, table);
 	if (ret)
 		return ret;
 
 	if (write) {
-		if ((param->min && *param->min > tmp) ||
-		    (param->max && *param->max < tmp))
+		min = (int *) table->extra1;
+		max = (int *) table->extra2;
+		if ((min && *min > tmp) || (max && *max < tmp))
 			return -EINVAL;
 		WRITE_ONCE(*valp, tmp);
 	}
@@ -754,45 +735,27 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
 int proc_dointvec_minmax(const struct ctl_table *table, int write,
 		  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct do_proc_dointvec_minmax_conv_param param = {
-		.min = (int *) table->extra1,
-		.max = (int *) table->extra2,
-	};
 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
-				do_proc_dointvec_minmax_conv, &param);
+				do_proc_dointvec_minmax_conv);
 }
 
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
-	unsigned int *min;
-	unsigned int *max;
-};
-
 static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
-					 unsigned int *valp,
-					 int write, void *data)
+					 unsigned int *valp, int write,
+					 const struct ctl_table *table)
 {
 	int ret;
-	unsigned int tmp;
-	struct do_proc_douintvec_minmax_conv_param *param = data;
+	unsigned int tmp, *min, *max;
 	/* write via temporary local uint for bounds-checking */
 	unsigned int *up = write ? &tmp : valp;
 
-	ret = do_proc_douintvec_conv(lvalp, up, write, data);
+	ret = do_proc_douintvec_conv(lvalp, up, write, table);
 	if (ret)
 		return ret;
 
 	if (write) {
-		if ((param->min && *param->min > tmp) ||
-		    (param->max && *param->max < tmp))
+		min = (unsigned int *) table->extra1;
+		max = (unsigned int *) table->extra2;
+		if ((min && *min > tmp) || (max && *max < tmp))
 			return -ERANGE;
 
 		WRITE_ONCE(*valp, tmp);
@@ -823,12 +786,8 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
 int proc_douintvec_minmax(const struct ctl_table *table, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct do_proc_douintvec_minmax_conv_param param = {
-		.min = (unsigned int *) table->extra1,
-		.max = (unsigned int *) table->extra2,
-	};
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_douintvec_minmax_conv, &param);
+				 do_proc_douintvec_minmax_conv);
 }
 
 /**
@@ -854,28 +813,24 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int write,
 	struct ctl_table tmp;
 	unsigned int min = 0, max = 255U, val;
 	u8 *data = table->data;
-	struct do_proc_douintvec_minmax_conv_param param = {
-		.min = &min,
-		.max = &max,
-	};
 	int res;
 
 	/* Do not support arrays yet. */
 	if (table->maxlen != sizeof(u8))
 		return -EINVAL;
 
-	if (table->extra1)
-		min = *(unsigned int *) table->extra1;
-	if (table->extra2)
-		max = *(unsigned int *) table->extra2;
-
 	tmp = *table;
 
 	tmp.maxlen = sizeof(val);
 	tmp.data = &val;
+	if (!tmp.extra1)
+		tmp.extra1 = (unsigned int *) &min;
+	if (!tmp.extra2)
+		tmp.extra2 = (unsigned int *) &max;
+
 	val = READ_ONCE(*data);
 	res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
-				do_proc_douintvec_minmax_conv, &param);
+				do_proc_douintvec_minmax_conv);
 	if (res)
 		return res;
 	if (write)
@@ -1014,8 +969,8 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
 
 
 static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
-					 int *valp,
-					 int write, void *data)
+					 int *valp, int write,
+					 const struct ctl_table *table)
 {
 	if (write) {
 		if (*lvalp > INT_MAX / HZ)
@@ -1040,8 +995,8 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
 }
 
 static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
-						int *valp,
-						int write, void *data)
+						int *valp, int write,
+						const struct ctl_table *table)
 {
 	if (write) {
 		if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
@@ -1063,8 +1018,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
 }
 
 static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
-					    int *valp,
-					    int write, void *data)
+					    int *valp, int write,
+					    const struct ctl_table *table)
 {
 	if (write) {
 		unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
@@ -1088,23 +1043,24 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 }
 
 static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
-						int *valp, int write, void *data)
+						int *valp, int write,
+						const struct ctl_table *table)
 {
-	int tmp, ret;
-	struct do_proc_dointvec_minmax_conv_param *param = data;
+	int tmp, ret, *min, *max;
 	/*
 	 * If writing, first do so via a temporary local int so we can
 	 * bounds-check it before touching *valp.
 	 */
 	int *ip = write ? &tmp : valp;
 
-	ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data);
+	ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, table);
 	if (ret)
 		return ret;
 
 	if (write) {
-		if ((param->min && *param->min > tmp) ||
-				(param->max && *param->max < tmp))
+		min = (int *) table->extra1;
+		max = (int *) table->extra2;
+		if ((min && *min > tmp) || (max && *max < tmp))
 			return -EINVAL;
 		*valp = tmp;
 	}
@@ -1130,18 +1086,14 @@ int proc_dointvec_jiffies(const struct ctl_table *table, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
     return do_proc_dointvec(table,write,buffer,lenp,ppos,
-		    	    do_proc_dointvec_jiffies_conv,NULL);
+			    do_proc_dointvec_jiffies_conv);
 }
 
 int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct do_proc_dointvec_minmax_conv_param param = {
-		.min = (int *) table->extra1,
-		.max = (int *) table->extra2,
-	};
 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
-			do_proc_dointvec_ms_jiffies_minmax_conv, &param);
+			do_proc_dointvec_ms_jiffies_minmax_conv);
 }
 
 /**
@@ -1163,7 +1115,7 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
 				 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
-				do_proc_dointvec_userhz_jiffies_conv, NULL);
+				do_proc_dointvec_userhz_jiffies_conv);
 }
 
 /**
@@ -1185,7 +1137,7 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf
 		size_t *lenp, loff_t *ppos)
 {
 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
-				do_proc_dointvec_ms_jiffies_conv, NULL);
+				do_proc_dointvec_ms_jiffies_conv);
 }
 
 /**
-- 
cgit v1.2.3


From c5b4c183f7aeb46cd27ddea9dab776655b8d7034 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Wed, 8 Oct 2025 16:12:37 +0200
Subject: sysctl: Allow custom converters from outside sysctl

The new non-static proc_dointvec_conv forwards a custom converter
function to do_proc_dointvec from outside the sysctl scope. Rename the
do_proc_dointvec call points so any future changes to proc_dointvec_conv
are propagated in sysctl.c. This is a preparation commit that allows the
integer jiffies converter functions to move out of kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h |  4 ++++
 kernel/sysctl.c        | 32 ++++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 436191e569da..a48273757c99 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -68,6 +68,10 @@ int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_dobool(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
+		       size_t *lenp, loff_t *ppos,
+		       int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+				   int dir, const struct ctl_table *table));
 int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 00595b84beac..833ed04f52dc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1005,6 +1005,14 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 					 lenp, ppos, HZ, 1000l);
 }
 
+int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
+		       size_t *lenp, loff_t *ppos,
+		       int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+				   int dir, const struct ctl_table *table))
+{
+	return do_proc_dointvec(table, dir, buffer, lenp, ppos, conv);
+}
+
 /**
  * proc_dointvec_jiffies - read a vector of integers as seconds
  * @table: the sysctl table
@@ -1023,15 +1031,15 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, dir, buffer, lenp, ppos,
-				do_proc_int_conv_jiffies);
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_jiffies);
 }
 
 int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, dir, buffer, lenp, ppos,
-			do_proc_int_conv_ms_jiffies_minmax);
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_ms_jiffies_minmax);
 }
 
 /**
@@ -1054,8 +1062,8 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
 {
 	if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
 		return -EINVAL;
-	return do_proc_dointvec(table, dir, buffer, lenp, ppos,
-				do_proc_int_conv_userhz_jiffies);
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_userhz_jiffies);
 }
 
 /**
@@ -1076,8 +1084,8 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
 int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, dir, buffer, lenp, ppos,
-				do_proc_int_conv_ms_jiffies);
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_ms_jiffies);
 }
 
 /**
@@ -1307,6 +1315,14 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 	return -ENOSYS;
 }
 
+int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
+		       size_t *lenp, loff_t *ppos,
+		       int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+				   int dir, const struct ctl_table *table))
+{
+	return -ENOSYS;
+}
+
 int proc_do_large_bitmap(const struct ctl_table *table, int dir,
 			 void *buffer, size_t *lenp, loff_t *ppos)
 {
-- 
cgit v1.2.3


From e2e5dac304fdf991fb974510db4565db04ef1335 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 14 Oct 2025 12:42:01 +0200
Subject: sysctl: Move INT converter macros to sysctl header

Move direction macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}) and the
integer converter macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}_INT_CONV,
SYSCTL_INT_CONV_CUSTOM) into include/linux/sysctl.h. This is a
preparation commit to enable jiffies converter creation outside
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c        | 75 --------------------------------------------------
 2 files changed, 75 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a48273757c99..a0ca9496119a 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -59,6 +59,81 @@ extern const int sysctl_vals[];
 #define SYSCTL_LONG_ONE		((void *)&sysctl_long_vals[1])
 #define SYSCTL_LONG_MAX		((void *)&sysctl_long_vals[2])
 
+/**
+ *
+ * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1)
+ * in the file_operations struct at proc/proc_sysctl.c. Its value means
+ * one of two things for sysctl:
+ * 1. SYSCTL_USER_TO_KERN(dir) Writing to an internal kernel variable from user
+ *                             space (dir > 0)
+ * 2. SYSCTL_KERN_TO_USER(dir) Writing to a user space buffer from a kernel
+ *                             variable (dir == 0).
+ */
+#define SYSCTL_USER_TO_KERN(dir) (!!(dir))
+#define SYSCTL_KERN_TO_USER(dir) (!dir)
+
+#define SYSCTL_USER_TO_KERN_INT_CONV(name, u_ptr_op)		\
+int sysctl_user_to_kern_int_conv##name(const bool *negp,	\
+				       const unsigned long *u_ptr,\
+				       int *k_ptr)		\
+{								\
+	unsigned long u = u_ptr_op(*u_ptr);			\
+	if (*negp) {						\
+		if (u > (unsigned long) INT_MAX + 1)		\
+			return -EINVAL;				\
+		WRITE_ONCE(*k_ptr, -u);				\
+	} else {						\
+		if (u > (unsigned long) INT_MAX)		\
+			return -EINVAL;				\
+		WRITE_ONCE(*k_ptr, u);				\
+	}							\
+	return 0;						\
+}
+
+#define SYSCTL_KERN_TO_USER_INT_CONV(name, k_ptr_op)		\
+int sysctl_kern_to_user_int_conv##name(bool *negp,		\
+				       unsigned long *u_ptr,	\
+				       const int *k_ptr)	\
+{								\
+	int val = READ_ONCE(*k_ptr);				\
+	if (val < 0) {						\
+		*negp = true;					\
+		*u_ptr = -k_ptr_op((unsigned long)val);		\
+	} else {						\
+		*negp = false;					\
+		*u_ptr = k_ptr_op((unsigned long)val);		\
+	}							\
+	return 0;						\
+}
+
+/**
+ * To range check on a converted value, use a temp k_ptr
+ * When checking range, value should be within (tbl->extra1, tbl->extra2)
+ */
+#define SYSCTL_INT_CONV_CUSTOM(name, user_to_kern, kern_to_user,	\
+			       k_ptr_range_check)			\
+int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\
+			   int dir, const struct ctl_table *tbl)	\
+{									\
+	if (SYSCTL_KERN_TO_USER(dir))					\
+		return kern_to_user(negp, u_ptr, k_ptr);		\
+									\
+	if (k_ptr_range_check) {					\
+		int tmp_k, ret;						\
+		if (!tbl)						\
+			return -EINVAL;					\
+		ret = user_to_kern(negp, u_ptr, &tmp_k);		\
+		if (ret)						\
+			return ret;					\
+		if ((tbl->extra1 && *(int *)tbl->extra1 > tmp_k) ||	\
+		    (tbl->extra2 && *(int *)tbl->extra2 < tmp_k))	\
+			return -EINVAL;					\
+		WRITE_ONCE(*k_ptr, tmp_k);				\
+	} else								\
+		return user_to_kern(negp, u_ptr, k_ptr);		\
+	return 0;							\
+}
+
 extern const unsigned long sysctl_long_vals[];
 
 typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 833ed04f52dc..0a33d92904de 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,19 +30,6 @@ EXPORT_SYMBOL(sysctl_vals);
 const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
 EXPORT_SYMBOL_GPL(sysctl_long_vals);
 
-/**
- *
- * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1)
- * in the file_operations struct at proc/proc_sysctl.c. Its value means
- * one of two things for sysctl:
- * 1. SYSCTL_USER_TO_KERN(dir) Writing to an internal kernel variable from user
- *                             space (dir > 0)
- * 2. SYSCTL_KERN_TO_USER(dir) Writing to a user space buffer from a kernel
- *                             variable (dir == 0).
- */
-#define SYSCTL_USER_TO_KERN(dir) (!!(dir))
-#define SYSCTL_KERN_TO_USER(dir) (!dir)
-
 #if defined(CONFIG_SYSCTL)
 
 /* Constants used for minimum and maximum */
@@ -368,68 +355,6 @@ static void proc_put_char(void **buf, size_t *size, char c)
 	}
 }
 
-#define SYSCTL_USER_TO_KERN_INT_CONV(name, u_ptr_op)		\
-int sysctl_user_to_kern_int_conv##name(const bool *negp,	\
-				       const unsigned long *u_ptr,\
-				       int *k_ptr)		\
-{								\
-	unsigned long u = u_ptr_op(*u_ptr);			\
-	if (*negp) {						\
-		if (u > (unsigned long) INT_MAX + 1)		\
-			return -EINVAL;				\
-		WRITE_ONCE(*k_ptr, -u);				\
-	} else {						\
-		if (u > (unsigned long) INT_MAX)		\
-			return -EINVAL;				\
-		WRITE_ONCE(*k_ptr, u);				\
-	}							\
-	return 0;						\
-}
-
-#define SYSCTL_KERN_TO_USER_INT_CONV(name, k_ptr_op)		\
-int sysctl_kern_to_user_int_conv##name(bool *negp,		\
-				       unsigned long *u_ptr,	\
-				       const int *k_ptr)	\
-{								\
-	int val = READ_ONCE(*k_ptr);				\
-	if (val < 0) {						\
-		*negp = true;					\
-		*u_ptr = -k_ptr_op((unsigned long)val);		\
-	} else {						\
-		*negp = false;					\
-		*u_ptr = k_ptr_op((unsigned long)val);		\
-	}							\
-	return 0;						\
-}
-
-/**
- * To range check on a converted value, use a temp k_ptr
- * When checking range, value should be within (tbl->extra1, tbl->extra2)
- */
-#define SYSCTL_INT_CONV_CUSTOM(name, user_to_kern, kern_to_user,	\
-			       k_ptr_range_check)			\
-int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\
-			   int dir, const struct ctl_table *tbl)	\
-{									\
-	if (SYSCTL_KERN_TO_USER(dir))					\
-		return kern_to_user(negp, u_ptr, k_ptr);		\
-									\
-	if (k_ptr_range_check) {					\
-		int tmp_k, ret;						\
-		if (!tbl)						\
-			return -EINVAL;					\
-		ret = user_to_kern(negp, u_ptr, &tmp_k);		\
-		if (ret)						\
-			return ret;					\
-		if ((tbl->extra1 && *(int *)tbl->extra1 > tmp_k) ||	\
-		    (tbl->extra2 && *(int *)tbl->extra2 < tmp_k))	\
-			return -EINVAL;					\
-		WRITE_ONCE(*k_ptr, tmp_k);				\
-	} else								\
-		return user_to_kern(negp, u_ptr, k_ptr);		\
-	return 0;							\
-}
-
 #define SYSCTL_CONV_IDENTITY(val) val
 #define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
 #define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
-- 
cgit v1.2.3


From 24a08eefddb33c7a259975e932c434b85f70d684 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Thu, 16 Oct 2025 10:38:45 +0200
Subject: sysctl: Move UINT converter macros to sysctl header

Move SYSCTL_USER_TO_KERN_UINT_CONV and SYSCTL_UINT_CONV_CUSTOM macros to
include/linux/sysctl.h. No need to embed sysctl_kern_to_user_uint_conv
in a macro as it will not need a custom kernel pointer operation. This
is a preparation commit to enable jiffies converter creation outside
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c        | 41 ++---------------------------------------
 2 files changed, 42 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a0ca9496119a..fa78136617ad 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -134,6 +134,45 @@ int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\
 	return 0;							\
 }
 
+#define SYSCTL_USER_TO_KERN_UINT_CONV(name, u_ptr_op)		\
+int sysctl_user_to_kern_uint_conv##name(const unsigned long *u_ptr,\
+					unsigned int *k_ptr)	\
+{								\
+	unsigned long u = u_ptr_op(*u_ptr);			\
+	if (u > UINT_MAX)					\
+		return -EINVAL;					\
+	WRITE_ONCE(*k_ptr, u);					\
+	return 0;						\
+}
+
+#define SYSCTL_UINT_CONV_CUSTOM(name, user_to_kern, kern_to_user,	\
+				k_ptr_range_check)			\
+int do_proc_uint_conv##name(unsigned long *u_ptr, unsigned int *k_ptr,	\
+			   int dir, const struct ctl_table *tbl)	\
+{									\
+	if (SYSCTL_KERN_TO_USER(dir))					\
+		return kern_to_user(u_ptr, k_ptr);			\
+									\
+	if (k_ptr_range_check) {					\
+		unsigned int tmp_k;					\
+		int ret;						\
+		if (!tbl)						\
+			return -EINVAL;					\
+		ret = user_to_kern(u_ptr, &tmp_k);			\
+		if (ret)						\
+			return ret;					\
+		if ((tbl->extra1 &&					\
+		     *(unsigned int *)tbl->extra1 > tmp_k) ||		\
+		    (tbl->extra2 &&					\
+		     *(unsigned int *)tbl->extra2 < tmp_k))		\
+			return -ERANGE;					\
+		WRITE_ONCE(*k_ptr, tmp_k);				\
+	} else								\
+		return user_to_kern(u_ptr, k_ptr);			\
+	return 0;							\
+}
+
+
 extern const unsigned long sysctl_long_vals[];
 
 typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer,
@@ -166,6 +205,7 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *
 int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_do_static_key(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
+int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr, const unsigned int *k_ptr);
 
 /*
  * Register a set of sysctl names by calling register_sysctl
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0a33d92904de..5a79622ad1cd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,54 +387,17 @@ static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
 			      sysctl_user_to_kern_int_conv_ms,
 			      sysctl_kern_to_user_int_conv_ms, true)
 
-#define SYSCTL_USER_TO_KERN_UINT_CONV(name, u_ptr_op)		\
-int sysctl_user_to_kern_uint_conv##name(const unsigned long *u_ptr,\
-					unsigned int *k_ptr)	\
-{								\
-	unsigned long u = u_ptr_op(*u_ptr);			\
-	if (u > UINT_MAX)					\
-		return -EINVAL;					\
-	WRITE_ONCE(*k_ptr, u);					\
-	return 0;						\
-}
 
 static SYSCTL_USER_TO_KERN_UINT_CONV(, SYSCTL_CONV_IDENTITY)
 
-static int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr,
-					 const unsigned int *k_ptr)
+int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr,
+				  const unsigned int *k_ptr)
 {
 	unsigned int val = READ_ONCE(*k_ptr);
 	*u_ptr = (unsigned long)val;
 	return 0;
 }
 
-#define SYSCTL_UINT_CONV_CUSTOM(name, user_to_kern, kern_to_user,	\
-				k_ptr_range_check)			\
-int do_proc_uint_conv##name(unsigned long *u_ptr, unsigned int *k_ptr,	\
-			   int dir, const struct ctl_table *tbl)	\
-{									\
-	if (SYSCTL_KERN_TO_USER(dir))					\
-		return kern_to_user(u_ptr, k_ptr);			\
-									\
-	if (k_ptr_range_check) {					\
-		unsigned int tmp_k;					\
-		int ret;						\
-		if (!tbl)						\
-			return -EINVAL;					\
-		ret = user_to_kern(u_ptr, &tmp_k);			\
-		if (ret)						\
-			return ret;					\
-		if ((tbl->extra1 &&					\
-		     *(unsigned int *)tbl->extra1 > tmp_k) ||		\
-		    (tbl->extra2 &&					\
-		     *(unsigned int *)tbl->extra2 < tmp_k))		\
-			return -ERANGE;					\
-		WRITE_ONCE(*k_ptr, tmp_k);				\
-	} else								\
-		return user_to_kern(u_ptr, k_ptr);			\
-	return 0;							\
-}
-
 static SYSCTL_UINT_CONV_CUSTOM(, sysctl_user_to_kern_uint_conv,
 			       sysctl_kern_to_user_uint_conv, false)
 static SYSCTL_UINT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_uint_conv,
-- 
cgit v1.2.3


From 54932988c4230925d2bf0023509ac2fee59a089a Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 14 Oct 2025 13:04:16 +0200
Subject: sysctl: Move jiffies converters to kernel/time/jiffies.c

Move integer jiffies converters (proc_dointvec{_,_ms_,_userhz_}jiffies
and proc_dointvec_ms_jiffies_minmax) to kernel/time/jiffies.c. Error
stubs for when CONFIG_PROC_SYSCTL is not defined are not reproduced
because all the jiffies converters go through proc_dointvec_conv which
is already stubbed. This is part of the greater effort to move sysctl
logic out of kernel/sysctl.c thereby reducing merge conflicts in
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/jiffies.h |  10 ++++
 include/linux/sysctl.h  |   7 ---
 kernel/sysctl.c         | 124 ------------------------------------------------
 kernel/time/jiffies.c   | 100 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 131 deletions(-)

(limited to 'include')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 0d1927da8055..72d589a8a0d6 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -611,4 +611,14 @@ extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
 
+struct ctl_table;
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer,
+			  size_t *lenp, loff_t *ppos);
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+				    void *buffer, size_t *lenp, loff_t *ppos);
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+				 void *buffer, size_t *lenp, loff_t *ppos);
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+			     size_t *lenp, loff_t *ppos);
+
 #endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index fa78136617ad..db4020f6933b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -192,13 +192,6 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer
 		size_t *lenp, loff_t *ppos);
 int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer,
 			size_t *lenp, loff_t *ppos);
-int proc_dointvec_jiffies(const struct ctl_table *, int, void *, size_t *, loff_t *);
-int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos);
-int proc_dointvec_userhz_jiffies(const struct ctl_table *, int, void *, size_t *,
-		loff_t *);
-int proc_dointvec_ms_jiffies(const struct ctl_table *, int, void *, size_t *,
-		loff_t *);
 int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *,
 		size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5a79622ad1cd..bcbf69c10426 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -356,36 +356,14 @@ static void proc_put_char(void **buf, size_t *size, char c)
 }
 
 #define SYSCTL_CONV_IDENTITY(val) val
-#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
-#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
 
 static SYSCTL_USER_TO_KERN_INT_CONV(, SYSCTL_CONV_IDENTITY)
 static SYSCTL_KERN_TO_USER_INT_CONV(, SYSCTL_CONV_IDENTITY)
 
-static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ)
-static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ)
-
-static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies)
-static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t)
-
-static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies)
-static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs)
-
 static SYSCTL_INT_CONV_CUSTOM(, sysctl_user_to_kern_int_conv,
 			      sysctl_kern_to_user_int_conv, false)
-static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz,
-			      sysctl_kern_to_user_int_conv_hz, false)
-static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies,
-			      sysctl_user_to_kern_int_conv_userhz,
-			      sysctl_kern_to_user_int_conv_userhz, false)
-static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms,
-			      sysctl_kern_to_user_int_conv_ms, false)
-
 static SYSCTL_INT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_int_conv,
 			      sysctl_kern_to_user_int_conv, true)
-static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
-			      sysctl_user_to_kern_int_conv_ms,
-			      sysctl_kern_to_user_int_conv_ms, true)
 
 
 static SYSCTL_USER_TO_KERN_UINT_CONV(, SYSCTL_CONV_IDENTITY)
@@ -901,81 +879,6 @@ int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
 	return do_proc_dointvec(table, dir, buffer, lenp, ppos, conv);
 }
 
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @dir: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
-			  void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
-				  do_proc_int_conv_jiffies);
-}
-
-int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
-			  void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
-				  do_proc_int_conv_ms_jiffies_minmax);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @dir: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
-				 void *buffer, size_t *lenp, loff_t *ppos)
-{
-	if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
-		return -EINVAL;
-	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
-				  do_proc_int_conv_userhz_jiffies);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @dir: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
-				  do_proc_int_conv_ms_jiffies);
-}
-
 /**
  * proc_do_large_bitmap - read/write from/to a large bitmap
  * @table: the sysctl table
@@ -1167,30 +1070,6 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int dir,
 	return -ENOSYS;
 }
 
-int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
-		    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
-				    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
-		    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir,
-			     void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
 int proc_doulongvec_minmax(const struct ctl_table *table, int dir,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -1310,11 +1189,8 @@ int __init sysctl_init_bases(void)
 EXPORT_SYMBOL(proc_dobool);
 EXPORT_SYMBOL(proc_dointvec);
 EXPORT_SYMBOL(proc_douintvec);
-EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
-EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
 EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 34eeacac2253..8e845a68382c 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -99,3 +99,103 @@ void __init register_refined_jiffies(long cycles_per_second)
 
 	__clocksource_register(&refined_jiffies);
 }
+
+#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
+#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
+
+static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ)
+static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ)
+static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t)
+static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs)
+
+static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz,
+			      sysctl_kern_to_user_int_conv_hz, false)
+static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies,
+			      sysctl_user_to_kern_int_conv_userhz,
+			      sysctl_kern_to_user_int_conv_userhz, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms,
+			      sysctl_kern_to_user_int_conv_ms, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
+			      sysctl_user_to_kern_int_conv_ms,
+			      sysctl_kern_to_user_int_conv_ms, true)
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
+			  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
+		return -EINVAL;
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_userhz_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_ms_jiffies);
+}
+
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+			  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+				  do_proc_int_conv_ms_jiffies_minmax);
+}
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
-- 
cgit v1.2.3


From 4639faaa607f3bed85f2cdde686a88453c99ef06 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 14 Oct 2025 13:35:42 +0200
Subject: sysctl: Move proc_doulongvec_ms_jiffies_minmax to
 kernel/time/jiffies.c

Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c. Create
a non-static wrapper function proc_doulongvec_minmax_conv that
forwards the custom convmul and convdiv argument values to the internal
do_proc_doulongvec_minmax. Remove unused linux/times.h include from
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/jiffies.h |  2 ++
 include/linux/sysctl.h  |  5 +++--
 kernel/sysctl.c         | 41 ++++++++++++-----------------------------
 kernel/time/jiffies.c   | 27 ++++++++++++++++++++++++++-
 4 files changed, 43 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 72d589a8a0d6..fdef2c155c27 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -620,5 +620,7 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
 				 void *buffer, size_t *lenp, loff_t *ppos);
 int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
 			     size_t *lenp, loff_t *ppos);
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+				      void *buffer, size_t *lenp, loff_t *ppos);
 
 #endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index db4020f6933b..30f6a184d3f4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -193,8 +193,9 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer
 int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer,
 			size_t *lenp, loff_t *ppos);
 int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *);
-int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *,
-		size_t *, loff_t *);
+int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir,
+				void *buffer, size_t *lenp, loff_t *ppos,
+				unsigned long convmul, unsigned long convdiv);
 int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_do_static_key(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bcbf69c10426..998400323ae9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -13,7 +13,6 @@
 #include <linux/highuid.h>
 #include <linux/writeback.h>
 #include <linux/initrd.h>
-#include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/syscalls.h>
 #include <linux/capability.h>
@@ -825,6 +824,14 @@ out:
 	return err;
 }
 
+int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir,
+				void *buffer, size_t *lenp, loff_t *ppos,
+				unsigned long convmul, unsigned long convdiv)
+{
+	return do_proc_doulongvec_minmax(table, dir, buffer, lenp, ppos,
+					 convmul, convdiv);
+}
+
 /**
  * proc_doulongvec_minmax - read a vector of long integers with min/max values
  * @table: the sysctl table
@@ -844,31 +851,7 @@ out:
 int proc_doulongvec_minmax(const struct ctl_table *table, int dir,
 			   void *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_doulongvec_minmax(table, dir, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @dir: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
-				      void *buffer, size_t *lenp, loff_t *ppos)
-{
-	return do_proc_doulongvec_minmax(table, dir, buffer,
-					 lenp, ppos, HZ, 1000l);
+	return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, 1l, 1l);
 }
 
 int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
@@ -1076,8 +1059,9 @@ int proc_doulongvec_minmax(const struct ctl_table *table, int dir,
 	return -ENOSYS;
 }
 
-int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
-				      void *buffer, size_t *lenp, loff_t *ppos)
+int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir,
+				void *buffer, size_t *lenp, loff_t *ppos,
+				unsigned long convmul, unsigned long convdiv)
 {
 	return -ENOSYS;
 }
@@ -1193,5 +1177,4 @@ EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
 EXPORT_SYMBOL(proc_dostring);
 EXPORT_SYMBOL(proc_doulongvec_minmax);
-EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 EXPORT_SYMBOL(proc_do_large_bitmap);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 8e845a68382c..d31a6d40d38d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -190,6 +190,7 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffe
 	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
 				  do_proc_int_conv_ms_jiffies);
 }
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 
 int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 			  void *buffer, size_t *lenp, loff_t *ppos)
@@ -197,5 +198,29 @@ int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
 	return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
 				  do_proc_int_conv_ms_jiffies_minmax);
 }
-EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+				      void *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos,
+					   HZ, 1000l);
+}
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 
-- 
cgit v1.2.3


From 30baaeb685bce0b7dfd3c5a55f22b1076c21f7b2 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 14 Oct 2025 14:21:03 +0200
Subject: sysctl: Create pipe-max-size converter using sysctl UINT macros

Create a converter for the pipe-max-size proc_handler using the
SYSCTL_UINT_CONV_CUSTOM macro. Move the SYSCTL_CONV_IDENTITY macro to the
sysctl header to make it available for pipe size validation. Keep returning
-EINVAL when (val == 0) by using a range-checking converter and setting
the minimal valid value (extra1) to SYSCTL_ONE. Keep round_pipe_size by
passing it as the operation for SYSCTL_USER_TO_KERN_UINT_CONV.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 fs/pipe.c              | 26 ++++++--------------------
 include/linux/sysctl.h |  1 +
 kernel/sysctl.c        |  2 --
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index 9411d4fc2f43..f1b3d1154ad2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1481,31 +1481,16 @@ static struct file_system_type pipe_fs_type = {
 };
 
 #ifdef CONFIG_SYSCTL
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
-					unsigned int *valp, int write,
-					const struct ctl_table *table)
-{
-	if (write) {
-		unsigned int val;
-
-		val = round_pipe_size(*lvalp);
-		if (val == 0)
-			return -EINVAL;
-
-		*valp = val;
-	} else {
-		unsigned int val = *valp;
-		*lvalp = (unsigned long) val;
-	}
-
-	return 0;
-}
+static SYSCTL_USER_TO_KERN_UINT_CONV(_pipe_maxsz, round_pipe_size)
+static SYSCTL_UINT_CONV_CUSTOM(_pipe_maxsz,
+			       sysctl_user_to_kern_uint_conv_pipe_maxsz,
+			       sysctl_kern_to_user_uint_conv, true)
 
 static int proc_dopipe_max_size(const struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_dopipe_max_size_conv);
+				 do_proc_uint_conv_pipe_maxsz);
 }
 
 static const struct ctl_table fs_pipe_sysctls[] = {
@@ -1515,6 +1500,7 @@ static const struct ctl_table fs_pipe_sysctls[] = {
 		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
 		.proc_handler	= proc_dopipe_max_size,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "pipe-user-pages-hard",
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 30f6a184d3f4..4c88514a7d1a 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -59,6 +59,7 @@ extern const int sysctl_vals[];
 #define SYSCTL_LONG_ONE		((void *)&sysctl_long_vals[1])
 #define SYSCTL_LONG_MAX		((void *)&sysctl_long_vals[2])
 
+#define SYSCTL_CONV_IDENTITY(val) (val)
 /**
  *
  * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 998400323ae9..d09c6602a115 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -354,8 +354,6 @@ static void proc_put_char(void **buf, size_t *size, char c)
 	}
 }
 
-#define SYSCTL_CONV_IDENTITY(val) val
-
 static SYSCTL_USER_TO_KERN_INT_CONV(, SYSCTL_CONV_IDENTITY)
 static SYSCTL_KERN_TO_USER_INT_CONV(, SYSCTL_CONV_IDENTITY)
 
-- 
cgit v1.2.3


From 564195c1a33c8fc631cd3d306e350b0e3d3e9555 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Thu, 16 Oct 2025 11:04:23 +0200
Subject: sysctl: Wrap do_proc_douintvec with the public function
 proc_douintvec_conv

Make do_proc_douintvec static and export proc_douintvec_conv wrapper
function for external use. This is to keep with the design in sysctl.c.
Update fs/pipe.c to use the new public API.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 fs/pipe.c              |  4 ++--
 include/linux/sysctl.h | 13 +++++++------
 kernel/sysctl.c        | 18 ++++++++++++++----
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index f1b3d1154ad2..0acca73617e9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1489,8 +1489,8 @@ static SYSCTL_UINT_CONV_CUSTOM(_pipe_maxsz,
 static int proc_dopipe_max_size(const struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_uint_conv_pipe_maxsz);
+	return proc_douintvec_conv(table, write, buffer, lenp, ppos,
+				   do_proc_uint_conv_pipe_maxsz);
 }
 
 static const struct ctl_table fs_pipe_sysctls[] = {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4c88514a7d1a..288fe0055cd5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -183,14 +183,20 @@ int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_dobool(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer,
+			 size_t *lenp, loff_t *ppos);
 int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
 		       size_t *lenp, loff_t *ppos,
 		       int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
 				   int dir, const struct ctl_table *table));
 int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *);
-int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
+int proc_douintvec_conv(const struct ctl_table *table, int write, void *buffer,
+			size_t *lenp, loff_t *ppos,
+			int (*conv)(unsigned long *lvalp, unsigned int *valp,
+				    int write, const struct ctl_table *table));
+
 int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer,
 			size_t *lenp, loff_t *ppos);
 int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *);
@@ -346,11 +352,6 @@ extern struct ctl_table_header *register_sysctl_mount_point(const char *path);
 
 void do_sysctl_args(void);
 bool sysctl_is_alias(char *param);
-int do_proc_douintvec(const struct ctl_table *table, int write,
-		      void *buffer, size_t *lenp, loff_t *ppos,
-		      int (*conv)(unsigned long *lvalp,
-				  unsigned int *valp, int write,
-				  const struct ctl_table *table));
 
 extern int unaligned_enabled;
 extern int no_unaligned_warning;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d09c6602a115..2cd767b9680e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -535,10 +535,11 @@ out:
 	return err;
 }
 
-int do_proc_douintvec(const struct ctl_table *table, int dir, void *buffer,
-		      size_t *lenp, loff_t *ppos,
-		      int (*conv)(unsigned long *u_ptr, unsigned int *k_ptr,
-				  int dir, const struct ctl_table *table))
+static int do_proc_douintvec(const struct ctl_table *table, int dir,
+			     void *buffer, size_t *lenp, loff_t *ppos,
+			      int (*conv)(unsigned long *u_ptr,
+					  unsigned int *k_ptr, int dir,
+					  const struct ctl_table *table))
 {
 	unsigned int vleft;
 
@@ -567,6 +568,15 @@ int do_proc_douintvec(const struct ctl_table *table, int dir, void *buffer,
 	return do_proc_douintvec_r(table, buffer, lenp, ppos, conv);
 }
 
+int proc_douintvec_conv(const struct ctl_table *table, int dir, void *buffer,
+			size_t *lenp, loff_t *ppos,
+			int (*conv)(unsigned long *u_ptr, unsigned int *k_ptr,
+				    int dir, const struct ctl_table *table))
+{
+	return do_proc_douintvec(table, dir, buffer, lenp, ppos, conv);
+}
+
+
 /**
  * proc_dobool - read/write a bool
  * @table: the sysctl table
-- 
cgit v1.2.3


From 5fee9edf791a50182382fae23f30690c93e16cec Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 27 Nov 2025 16:34:20 +0000
Subject: ASoC: SDCA: Align mute controls to ALSA expectations

Currently mute controls will be called "FU xx Mute Switch" (note
the switch is added programmatically outside the coverage of this
patch) and the accompanying volume control would be called "FU xx
Channel Volume".  These names are taken from the SDCA specification,
however, this does not mesh well with the ALSA naming system. ALSA
generally expects enables rather than mutes and expects that mutes
and volumes have matching names.

Update the names and invert the mute controls to make them the more
standard "FU XX Channel Switch". This does slightly deviate from
the SDCA specification, but it makes the rest of the Linux ecosystem
a lot happier.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251127163426.2500633-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h | 2 +-
 sound/soc/sdca/sdca_asoc.c    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index c97861508a15..2564fad33fd4 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -611,7 +611,7 @@ enum sdca_entity0_controls {
 #define SDCA_CTL_NDAI_PACKETTYPE_NAME			"NDAI Packet Type"
 #define SDCA_CTL_MIXER_NAME				"Mixer"
 #define SDCA_CTL_SELECTOR_NAME				"Selector"
-#define SDCA_CTL_MUTE_NAME				"Mute"
+#define SDCA_CTL_MUTE_NAME				"Channel"
 #define SDCA_CTL_CHANNEL_VOLUME_NAME			"Channel Volume"
 #define SDCA_CTL_AGC_NAME				"AGC"
 #define SDCA_CTL_BASS_BOOST_NAME			"Bass Boost"
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 4e3f193c75ef..0e21ed109172 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -886,6 +886,9 @@ static int populate_control(struct device *dev,
 	mc->min = 0;
 	mc->max = clamp((0x1ull << control->nbits) - 1, 0, type_max(mc->max));
 
+	if (SDCA_CTL_TYPE(entity->type, control->sel) == SDCA_CTL_TYPE_S(FU, MUTE))
+		mc->invert = true;
+
 	(*kctl)->name = control_name;
 	(*kctl)->private_value = (unsigned long)mc;
 	(*kctl)->iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-- 
cgit v1.2.3


From 48fa77af2f4a55ab961520f2a0e50560dc0baca8 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 27 Nov 2025 16:34:21 +0000
Subject: ASoC: SDCA: Add terminal type into input/output widget name

There have been some complaints around the UCM files for SDCA
devices that the control system is quite hard to follow. This is
definitely true: without the specification handy, the naming can be
a little cryptic. However, as most of the information is parsed
from DisCo, there are some limits to what the driver can safely do
to improve this.

However, one area that can be improved is the non-streaming
input/output terminals. These have a field (enum sdca_terminal_type)
that describes the usage of that terminal. These types can be
appended to the entity name to give the users a better clue as
to the purpose. For example, "OT 43" would now become "OT 43
Headphone". This would follow through into the jack controls which
would change from "OT 43 Jack" to "OT 43 Headphone Jack", making the
purpose much more obvious to the user.

This provides slightly more readable controls without relying on
implicit knowledge that individual parts might not conform to.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251127163426.2500633-3-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   |  2 ++
 sound/soc/sdca/sdca_asoc.c      | 48 ++---------------------------------
 sound/soc/sdca/sdca_functions.c | 56 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 59 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 2564fad33fd4..6e9391b3816c 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1456,6 +1456,8 @@ int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 			struct sdca_function_desc *desc,
 			struct sdca_function_data *function);
 
+const char *sdca_find_terminal_name(enum sdca_terminal_type type);
+
 struct sdca_control *sdca_selector_find_control(struct device *dev,
 						struct sdca_entity *entity,
 						const int sel);
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 0e21ed109172..2d328bbb95b9 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -115,50 +115,6 @@ int sdca_asoc_count_component(struct device *dev, struct sdca_function_data *fun
 }
 EXPORT_SYMBOL_NS(sdca_asoc_count_component, "SND_SOC_SDCA");
 
-static const char *get_terminal_name(enum sdca_terminal_type type)
-{
-	switch (type) {
-	case SDCA_TERM_TYPE_LINEIN_STEREO:
-		return SDCA_TERM_TYPE_LINEIN_STEREO_NAME;
-	case SDCA_TERM_TYPE_LINEIN_FRONT_LR:
-		return SDCA_TERM_TYPE_LINEIN_FRONT_LR_NAME;
-	case SDCA_TERM_TYPE_LINEIN_CENTER_LFE:
-		return SDCA_TERM_TYPE_LINEIN_CENTER_LFE_NAME;
-	case SDCA_TERM_TYPE_LINEIN_SURROUND_LR:
-		return SDCA_TERM_TYPE_LINEIN_SURROUND_LR_NAME;
-	case SDCA_TERM_TYPE_LINEIN_REAR_LR:
-		return SDCA_TERM_TYPE_LINEIN_REAR_LR_NAME;
-	case SDCA_TERM_TYPE_LINEOUT_STEREO:
-		return SDCA_TERM_TYPE_LINEOUT_STEREO_NAME;
-	case SDCA_TERM_TYPE_LINEOUT_FRONT_LR:
-		return SDCA_TERM_TYPE_LINEOUT_FRONT_LR_NAME;
-	case SDCA_TERM_TYPE_LINEOUT_CENTER_LFE:
-		return SDCA_TERM_TYPE_LINEOUT_CENTER_LFE_NAME;
-	case SDCA_TERM_TYPE_LINEOUT_SURROUND_LR:
-		return SDCA_TERM_TYPE_LINEOUT_SURROUND_LR_NAME;
-	case SDCA_TERM_TYPE_LINEOUT_REAR_LR:
-		return SDCA_TERM_TYPE_LINEOUT_REAR_LR_NAME;
-	case SDCA_TERM_TYPE_MIC_JACK:
-		return SDCA_TERM_TYPE_MIC_JACK_NAME;
-	case SDCA_TERM_TYPE_STEREO_JACK:
-		return SDCA_TERM_TYPE_STEREO_JACK_NAME;
-	case SDCA_TERM_TYPE_FRONT_LR_JACK:
-		return SDCA_TERM_TYPE_FRONT_LR_JACK_NAME;
-	case SDCA_TERM_TYPE_CENTER_LFE_JACK:
-		return SDCA_TERM_TYPE_CENTER_LFE_JACK_NAME;
-	case SDCA_TERM_TYPE_SURROUND_LR_JACK:
-		return SDCA_TERM_TYPE_SURROUND_LR_JACK_NAME;
-	case SDCA_TERM_TYPE_REAR_LR_JACK:
-		return SDCA_TERM_TYPE_REAR_LR_JACK_NAME;
-	case SDCA_TERM_TYPE_HEADPHONE_JACK:
-		return SDCA_TERM_TYPE_HEADPHONE_JACK_NAME;
-	case SDCA_TERM_TYPE_HEADSET_JACK:
-		return SDCA_TERM_TYPE_HEADSET_JACK_NAME;
-	default:
-		return NULL;
-	}
-}
-
 static int entity_early_parse_ge(struct device *dev,
 				 struct sdca_function_data *function,
 				 struct sdca_entity *entity)
@@ -217,7 +173,7 @@ static int entity_early_parse_ge(struct device *dev,
 		type = sdca_range(range, SDCA_SELECTED_MODE_TERM_TYPE, i);
 
 		values[i + 3] = sdca_range(range, SDCA_SELECTED_MODE_INDEX, i);
-		texts[i + 3] = get_terminal_name(type);
+		texts[i + 3] = sdca_find_terminal_name(type);
 		if (!texts[i + 3]) {
 			dev_err(dev, "%s: unrecognised terminal type: %#x\n",
 				entity->label, type);
@@ -499,7 +455,7 @@ static int entity_parse_su_device(struct device *dev,
 				return -EINVAL;
 			}
 
-			add_route(route, entity->label, get_terminal_name(term),
+			add_route(route, entity->label, sdca_find_terminal_name(term),
 				  entity->sources[affected->val - 1]->label);
 		}
 	}
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index e4b3efeb30f0..5a1f120487ef 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/property.h>
 #include <linux/soundwire/sdw.h>
+#include <linux/string.h>
 #include <linux/types.h>
 #include <sound/sdca.h>
 #include <sound/sdca_function.h>
@@ -1120,6 +1121,14 @@ static int find_sdca_entity_iot(struct device *dev,
 	terminal->type = tmp;
 	terminal->is_dataport = find_sdca_iot_dataport(terminal);
 
+	if (!terminal->is_dataport) {
+		const char *type_name = sdca_find_terminal_name(terminal->type);
+
+		if (type_name)
+			entity->label = devm_kasprintf(dev, GFP_KERNEL, "%s %s",
+						       entity->label, type_name);
+	}
+
 	ret = fwnode_property_read_u32(entity_node,
 				       "mipi-sdca-terminal-reference-number", &tmp);
 	if (!ret)
@@ -1565,7 +1574,7 @@ static struct sdca_entity *find_sdca_entity_by_label(struct sdca_function_data *
 	for (i = 0; i < function->num_entities; i++) {
 		struct sdca_entity *entity = &function->entities[i];
 
-		if (!strcmp(entity->label, entity_label))
+		if (!strncmp(entity->label, entity_label, strlen(entity_label)))
 			return entity;
 	}
 
@@ -2156,6 +2165,51 @@ int sdca_parse_function(struct device *dev, struct sdw_slave *sdw,
 }
 EXPORT_SYMBOL_NS(sdca_parse_function, "SND_SOC_SDCA");
 
+const char *sdca_find_terminal_name(enum sdca_terminal_type type)
+{
+	switch (type) {
+	case SDCA_TERM_TYPE_LINEIN_STEREO:
+		return SDCA_TERM_TYPE_LINEIN_STEREO_NAME;
+	case SDCA_TERM_TYPE_LINEIN_FRONT_LR:
+		return SDCA_TERM_TYPE_LINEIN_FRONT_LR_NAME;
+	case SDCA_TERM_TYPE_LINEIN_CENTER_LFE:
+		return SDCA_TERM_TYPE_LINEIN_CENTER_LFE_NAME;
+	case SDCA_TERM_TYPE_LINEIN_SURROUND_LR:
+		return SDCA_TERM_TYPE_LINEIN_SURROUND_LR_NAME;
+	case SDCA_TERM_TYPE_LINEIN_REAR_LR:
+		return SDCA_TERM_TYPE_LINEIN_REAR_LR_NAME;
+	case SDCA_TERM_TYPE_LINEOUT_STEREO:
+		return SDCA_TERM_TYPE_LINEOUT_STEREO_NAME;
+	case SDCA_TERM_TYPE_LINEOUT_FRONT_LR:
+		return SDCA_TERM_TYPE_LINEOUT_FRONT_LR_NAME;
+	case SDCA_TERM_TYPE_LINEOUT_CENTER_LFE:
+		return SDCA_TERM_TYPE_LINEOUT_CENTER_LFE_NAME;
+	case SDCA_TERM_TYPE_LINEOUT_SURROUND_LR:
+		return SDCA_TERM_TYPE_LINEOUT_SURROUND_LR_NAME;
+	case SDCA_TERM_TYPE_LINEOUT_REAR_LR:
+		return SDCA_TERM_TYPE_LINEOUT_REAR_LR_NAME;
+	case SDCA_TERM_TYPE_MIC_JACK:
+		return SDCA_TERM_TYPE_MIC_JACK_NAME;
+	case SDCA_TERM_TYPE_STEREO_JACK:
+		return SDCA_TERM_TYPE_STEREO_JACK_NAME;
+	case SDCA_TERM_TYPE_FRONT_LR_JACK:
+		return SDCA_TERM_TYPE_FRONT_LR_JACK_NAME;
+	case SDCA_TERM_TYPE_CENTER_LFE_JACK:
+		return SDCA_TERM_TYPE_CENTER_LFE_JACK_NAME;
+	case SDCA_TERM_TYPE_SURROUND_LR_JACK:
+		return SDCA_TERM_TYPE_SURROUND_LR_JACK_NAME;
+	case SDCA_TERM_TYPE_REAR_LR_JACK:
+		return SDCA_TERM_TYPE_REAR_LR_JACK_NAME;
+	case SDCA_TERM_TYPE_HEADPHONE_JACK:
+		return SDCA_TERM_TYPE_HEADPHONE_JACK_NAME;
+	case SDCA_TERM_TYPE_HEADSET_JACK:
+		return SDCA_TERM_TYPE_HEADSET_JACK_NAME;
+	default:
+		return NULL;
+	}
+}
+EXPORT_SYMBOL_NS(sdca_find_terminal_name, "SND_SOC_SDCA");
+
 struct sdca_control *sdca_selector_find_control(struct device *dev,
 						struct sdca_entity *entity,
 						const int sel)
-- 
cgit v1.2.3


From 2ae4659533d8e2b5e06e8f570e2b4b7b88ae0716 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 27 Nov 2025 16:34:23 +0000
Subject: ASoC: sdw_utils: Move codec_name to dai info

As SDCA devices will support each DAI link on a different child device,
move the codec name from codec_info to each dai_info. This allows the
appropriate function device to be bound to each DAI link.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251127163426.2500633-5-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h       |  4 ++--
 sound/soc/sdw_utils/soc_sdw_utils.c | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index 76c64c5245d4..714e207d4c01 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -45,6 +45,7 @@ struct asoc_sdw_codec_info;
 
 struct asoc_sdw_dai_info {
 	const bool direction[2]; /* playback & capture support */
+	const char *codec_name;
 	const char *dai_name;
 	const char *component_name;
 	const int dai_type;
@@ -67,7 +68,6 @@ struct asoc_sdw_dai_info {
 struct asoc_sdw_codec_info {
 	const int part_id;
 	const int version_id;
-	const char *codec_name;
 	const char *name_prefix;
 	int amp_num;
 	const u8 acpi_id[ACPI_ID_LEN];
@@ -131,7 +131,7 @@ int asoc_sdw_hw_free(struct snd_pcm_substream *substream);
 void asoc_sdw_shutdown(struct snd_pcm_substream *substream);
 
 const char *asoc_sdw_get_codec_name(struct device *dev,
-				    const struct asoc_sdw_codec_info *codec_info,
+				    const struct asoc_sdw_dai_info *dai_info,
 				    const struct snd_soc_acpi_link_adr *adr_link,
 				    int adr_index);
 
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index f2f1954adf22..f31213e00a16 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -656,12 +656,12 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 	{
 		.part_id = 0x4243,
 		.name_prefix = "cs42l43",
-		.codec_name = "cs42l43-codec",
 		.count_sidecar = asoc_sdw_bridge_cs35l56_count_sidecar,
 		.add_sidecar = asoc_sdw_bridge_cs35l56_add_sidecar,
 		.dais = {
 			{
 				.direction = {true, false},
+				.codec_name = "cs42l43-codec",
 				.dai_name = "cs42l43-dp5",
 				.dai_type = SOC_SDW_DAI_TYPE_JACK,
 				.dailink = {SOC_SDW_JACK_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
@@ -673,6 +673,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			},
 			{
 				.direction = {false, true},
+				.codec_name = "cs42l43-codec",
 				.dai_name = "cs42l43-dp1",
 				.dai_type = SOC_SDW_DAI_TYPE_MIC,
 				.dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_DMIC_DAI_ID},
@@ -684,12 +685,14 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			},
 			{
 				.direction = {false, true},
+				.codec_name = "cs42l43-codec",
 				.dai_name = "cs42l43-dp2",
 				.dai_type = SOC_SDW_DAI_TYPE_JACK,
 				.dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_JACK_IN_DAI_ID},
 			},
 			{
 				.direction = {true, false},
+				.codec_name = "cs42l43-codec",
 				.dai_name = "cs42l43-dp6",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
@@ -1094,7 +1097,6 @@ static bool asoc_sdw_is_unique_device(const struct snd_soc_acpi_link_adr *adr_li
 }
 
 static const char *_asoc_sdw_get_codec_name(struct device *dev,
-					    const struct asoc_sdw_codec_info *codec_info,
 					    const struct snd_soc_acpi_link_adr *adr_link,
 					    int adr_index)
 {
@@ -1116,14 +1118,14 @@ static const char *_asoc_sdw_get_codec_name(struct device *dev,
 }
 
 const char *asoc_sdw_get_codec_name(struct device *dev,
-				    const struct asoc_sdw_codec_info *codec_info,
+				    const struct asoc_sdw_dai_info *dai_info,
 				    const struct snd_soc_acpi_link_adr *adr_link,
 				    int adr_index)
 {
-	if (codec_info->codec_name)
-		return devm_kstrdup(dev, codec_info->codec_name, GFP_KERNEL);
+	if (dai_info->codec_name)
+		return devm_kstrdup(dev, dai_info->codec_name, GFP_KERNEL);
 
-	return _asoc_sdw_get_codec_name(dev, codec_info, adr_link, adr_index);
+	return _asoc_sdw_get_codec_name(dev, adr_link, adr_index);
 }
 EXPORT_SYMBOL_NS(asoc_sdw_get_codec_name, "SND_SOC_SDW_UTILS");
 
@@ -1354,8 +1356,7 @@ static int is_sdca_endpoint_present(struct device *dev,
 	}
 	kfree(dlc);
 
-	sdw_codec_name = _asoc_sdw_get_codec_name(dev, codec_info,
-						  adr_link, adr_index);
+	sdw_codec_name = _asoc_sdw_get_codec_name(dev, adr_link, adr_index);
 	if (!sdw_codec_name)
 		return -ENOMEM;
 
@@ -1529,7 +1530,7 @@ int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
 				num_link_dailinks += !!list_empty(&soc_dai->endpoints);
 				list_add_tail(&soc_end->list, &soc_dai->endpoints);
 
-				codec_name = asoc_sdw_get_codec_name(dev, codec_info,
+				codec_name = asoc_sdw_get_codec_name(dev, dai_info,
 								     adr_link, i);
 				if (!codec_name)
 					return -ENOMEM;
-- 
cgit v1.2.3


From c66297d09e1a5813eb743bae8cda4e115b8a5c56 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 27 Nov 2025 16:34:24 +0000
Subject: ASoC: intel: sof_sdw: Add ability to have auxiliary devices

Currently the sof_sdw machine driver assumes that all devices involved
in the sound card are connected through a DAI link. However for SDCA
devices we still want the HID (Human Interface Device, used for jack
buttons) to be part of the sound card, but it contains no DAI links.

Add support into the machine driver to specify a list of auxiliary
devices to be merged into the card.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251127163426.2500633-6-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h           | 11 ++++++++++-
 sound/soc/amd/acp/acp-sdw-legacy-mach.c | 12 ++++++++++--
 sound/soc/amd/acp/acp-sdw-sof-mach.c    | 12 ++++++++++--
 sound/soc/intel/boards/sof_sdw.c        | 14 ++++++++++++--
 sound/soc/sdw_utils/soc_sdw_utils.c     | 23 ++++++++++++++++++++---
 5 files changed, 62 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index 714e207d4c01..48719fde308c 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -13,6 +13,7 @@
 #include <sound/soc-acpi.h>
 
 #define SOC_SDW_MAX_DAI_NUM             8
+#define SOC_SDW_MAX_AUX_NUM		2
 #define SOC_SDW_MAX_NO_PROPS		2
 #define SOC_SDW_JACK_JDSRC(quirk)	((quirk) & GENMASK(3, 0))
 
@@ -65,6 +66,10 @@ struct asoc_sdw_dai_info {
 	bool quirk_exclude;
 };
 
+struct asoc_sdw_aux_info {
+	const char *codec_name;
+};
+
 struct asoc_sdw_codec_info {
 	const int part_id;
 	const int version_id;
@@ -75,6 +80,8 @@ struct asoc_sdw_codec_info {
 	const struct snd_soc_ops *ops;
 	struct asoc_sdw_dai_info dais[SOC_SDW_MAX_DAI_NUM];
 	const int dai_num;
+	struct asoc_sdw_aux_info auxs[SOC_SDW_MAX_AUX_NUM];
+	const int aux_num;
 
 	int (*codec_card_late_probe)(struct snd_soc_card *card);
 
@@ -165,13 +172,15 @@ int asoc_sdw_init_simple_dai_link(struct device *dev, struct snd_soc_dai_link *d
 				  int no_pcm, int (*init)(struct snd_soc_pcm_runtime *rtd),
 				  const struct snd_soc_ops *ops);
 
-int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *num_ends);
+int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card,
+				 int *num_devs, int *num_ends, int *num_aux);
 
 struct asoc_sdw_dailink *asoc_sdw_find_dailink(struct asoc_sdw_dailink *dailinks,
 					       const struct snd_soc_acpi_endpoint *new);
 int asoc_sdw_get_dai_type(u32 type);
 
 int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
+				 struct snd_soc_aux_dev *soc_aux,
 				 struct asoc_sdw_dailink *soc_dais,
 				 struct asoc_sdw_endpoint *soc_ends,
 				 int *num_devs);
diff --git a/sound/soc/amd/acp/acp-sdw-legacy-mach.c b/sound/soc/amd/acp/acp-sdw-legacy-mach.c
index f1f43eeb6037..fae94b9edd5a 100644
--- a/sound/soc/amd/acp/acp-sdw-legacy-mach.c
+++ b/sound/soc/amd/acp/acp-sdw-legacy-mach.c
@@ -360,16 +360,18 @@ static int soc_card_dai_links_create(struct snd_soc_card *card)
 	struct snd_soc_acpi_mach_params *mach_params = &mach->mach_params;
 	struct asoc_sdw_endpoint *soc_ends __free(kfree) = NULL;
 	struct asoc_sdw_dailink *soc_dais __free(kfree) = NULL;
+	struct snd_soc_aux_dev *soc_aux;
 	struct snd_soc_codec_conf *codec_conf;
 	struct snd_soc_dai_link *dai_links;
 	int num_devs = 0;
 	int num_ends = 0;
+	int num_aux = 0;
 	int num_confs;
 	int num_links;
 	int be_id = 0;
 	int ret;
 
-	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends);
+	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends, &num_aux);
 	if (ret < 0) {
 		dev_err(dev, "failed to count devices/endpoints: %d\n", ret);
 		return ret;
@@ -387,7 +389,11 @@ static int soc_card_dai_links_create(struct snd_soc_card *card)
 	if (!soc_ends)
 		return -ENOMEM;
 
-	ret = asoc_sdw_parse_sdw_endpoints(card, soc_dais, soc_ends, &num_confs);
+	soc_aux = devm_kcalloc(dev, num_aux, sizeof(*soc_aux), GFP_KERNEL);
+	if (!soc_aux)
+		return -ENOMEM;
+
+	ret = asoc_sdw_parse_sdw_endpoints(card, soc_aux, soc_dais, soc_ends, &num_confs);
 	if (ret < 0)
 		return ret;
 
@@ -413,6 +419,8 @@ static int soc_card_dai_links_create(struct snd_soc_card *card)
 	card->num_configs = num_confs;
 	card->dai_link = dai_links;
 	card->num_links = num_links;
+	card->aux_dev = soc_aux;
+	card->num_aux_devs = num_aux;
 
 	/* SDW */
 	if (sdw_be_num) {
diff --git a/sound/soc/amd/acp/acp-sdw-sof-mach.c b/sound/soc/amd/acp/acp-sdw-sof-mach.c
index d055582a3bf1..5677ae63fca9 100644
--- a/sound/soc/amd/acp/acp-sdw-sof-mach.c
+++ b/sound/soc/amd/acp/acp-sdw-sof-mach.c
@@ -272,15 +272,17 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	struct snd_soc_acpi_mach_params *mach_params = &mach->mach_params;
 	struct asoc_sdw_endpoint *sof_ends __free(kfree) = NULL;
 	struct asoc_sdw_dailink *sof_dais __free(kfree) = NULL;
+	struct snd_soc_aux_dev *sof_aux;
 	struct snd_soc_codec_conf *codec_conf;
 	struct snd_soc_dai_link *dai_links;
 	int num_devs = 0;
 	int num_ends = 0;
+	int num_aux = 0;
 	int num_links;
 	int be_id = 0;
 	int ret;
 
-	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends);
+	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends, &num_aux);
 	if (ret < 0) {
 		dev_err(dev, "failed to count devices/endpoints: %d\n", ret);
 		return ret;
@@ -296,7 +298,11 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	if (!sof_ends)
 		return -ENOMEM;
 
-	ret = asoc_sdw_parse_sdw_endpoints(card, sof_dais, sof_ends, &num_devs);
+	sof_aux = devm_kcalloc(dev, num_aux, sizeof(*sof_aux), GFP_KERNEL);
+	if (!sof_aux)
+		return -ENOMEM;
+
+	ret = asoc_sdw_parse_sdw_endpoints(card, sof_aux, sof_dais, sof_ends, &num_devs);
 	if (ret < 0)
 		return ret;
 
@@ -322,6 +328,8 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	card->num_configs = num_devs;
 	card->dai_link = dai_links;
 	card->num_links = num_links;
+	card->aux_dev = sof_aux;
+	card->num_aux_devs = num_aux;
 
 	/* SDW */
 	if (sdw_be_num) {
diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c
index 0c6677d66ec7..2c1001148d54 100644
--- a/sound/soc/intel/boards/sof_sdw.c
+++ b/sound/soc/intel/boards/sof_sdw.c
@@ -1189,8 +1189,10 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	struct asoc_sdw_codec_info *ssp_info;
 	struct asoc_sdw_endpoint *sof_ends;
 	struct asoc_sdw_dailink *sof_dais;
+	struct snd_soc_aux_dev *sof_aux;
 	int num_devs = 0;
 	int num_ends = 0;
+	int num_aux = 0;
 	int num_confs;
 	struct snd_soc_dai_link *dai_links;
 	int num_links;
@@ -1199,7 +1201,7 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	unsigned long ssp_mask;
 	int ret;
 
-	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends);
+	ret = asoc_sdw_count_sdw_endpoints(card, &num_devs, &num_ends, &num_aux);
 	if (ret < 0) {
 		dev_err(dev, "failed to count devices/endpoints: %d\n", ret);
 		return ret;
@@ -1223,7 +1225,13 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 		goto err_dai;
 	}
 
-	ret = asoc_sdw_parse_sdw_endpoints(card, sof_dais, sof_ends, &num_confs);
+	sof_aux = devm_kcalloc(dev, num_aux, sizeof(*sof_aux), GFP_KERNEL);
+	if (!sof_aux) {
+		ret = -ENOMEM;
+		goto err_dai;
+	}
+
+	ret = asoc_sdw_parse_sdw_endpoints(card, sof_aux, sof_dais, sof_ends, &num_confs);
 	if (ret < 0)
 		goto err_end;
 
@@ -1289,6 +1297,8 @@ static int sof_card_dai_links_create(struct snd_soc_card *card)
 	card->num_configs = num_confs;
 	card->dai_link = dai_links;
 	card->num_links = num_links;
+	card->aux_dev = sof_aux;
+	card->num_aux_devs = num_aux;
 
 	/* SDW */
 	if (sdw_be_num) {
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index f31213e00a16..f57e02bea5b4 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -1252,7 +1252,8 @@ int asoc_sdw_init_simple_dai_link(struct device *dev, struct snd_soc_dai_link *d
 }
 EXPORT_SYMBOL_NS(asoc_sdw_init_simple_dai_link, "SND_SOC_SDW_UTILS");
 
-int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *num_ends)
+int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card,
+				 int *num_devs, int *num_ends, int *num_aux)
 {
 	struct device *dev = card->dev;
 	struct snd_soc_acpi_mach *mach = dev_get_platdata(dev);
@@ -1263,8 +1264,18 @@ int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *
 	for (adr_link = mach_params->links; adr_link->num_adr; adr_link++) {
 		*num_devs += adr_link->num_adr;
 
-		for (i = 0; i < adr_link->num_adr; i++)
-			*num_ends += adr_link->adr_d[i].num_endpoints;
+		for (i = 0; i < adr_link->num_adr; i++) {
+			const struct snd_soc_acpi_adr_device *adr_dev = &adr_link->adr_d[i];
+			struct asoc_sdw_codec_info *codec_info;
+
+			*num_ends += adr_dev->num_endpoints;
+
+			codec_info = asoc_sdw_find_codec_info_part(adr_dev->adr);
+			if (!codec_info)
+				return -EINVAL;
+
+			*num_aux += codec_info->aux_num;
+		}
 	}
 
 	dev_dbg(dev, "Found %d devices with %d endpoints\n", *num_devs, *num_ends);
@@ -1402,6 +1413,7 @@ put_device:
 }
 
 int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
+				 struct snd_soc_aux_dev *soc_aux,
 				 struct asoc_sdw_dailink *soc_dais,
 				 struct asoc_sdw_endpoint *soc_ends,
 				 int *num_devs)
@@ -1441,6 +1453,11 @@ int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
 			if (!codec_info)
 				return -EINVAL;
 
+			for (j = 0; j < codec_info->aux_num; j++) {
+				soc_aux->dlc.name = codec_info->auxs[j].codec_name;
+				soc_aux++;
+			}
+
 			ctx->ignore_internal_dmic |= codec_info->ignore_internal_dmic;
 
 			if (codec_info->count_sidecar && codec_info->add_sidecar) {
-- 
cgit v1.2.3


From 3f6b562f2107ab2467908fa1543e1a6ea8442bd1 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Thu, 27 Nov 2025 16:34:25 +0000
Subject: ASoC: sdw_utils: Add cs42l45 support functions

Add the helper functions into the machine driver for the cs42l45.
These will register a jack for jack detection and add things into
the components string if they are needed.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20251127163426.2500633-7-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h         |  2 +
 sound/soc/sdw_utils/Makefile          |  1 +
 sound/soc/sdw_utils/soc_sdw_cs42l45.c | 80 +++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 sound/soc/sdw_utils/soc_sdw_cs42l45.c

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index 48719fde308c..227347c8f0b3 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -257,6 +257,8 @@ int asoc_sdw_cs42l42_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_da
 int asoc_sdw_cs42l43_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
 int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
 int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
+int asoc_sdw_cs42l45_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
+int asoc_sdw_cs42l45_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
 int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
 int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai);
 /* TI */
diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile
index a87c53e1a2c1..e8bd5ffb1a6a 100644
--- a/sound/soc/sdw_utils/Makefile
+++ b/sound/soc/sdw_utils/Makefile
@@ -5,6 +5,7 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \
 		       soc_sdw_rt_amp.o soc_sdw_rt_mf_sdca.o		\
 		       soc_sdw_bridge_cs35l56.o 			\
 		       soc_sdw_cs42l42.o soc_sdw_cs42l43.o 		\
+		       soc_sdw_cs42l45.o				\
 		       soc_sdw_cs_amp.o					\
 		       soc_sdw_maxim.o \
 		       soc_sdw_ti_amp.o
diff --git a/sound/soc/sdw_utils/soc_sdw_cs42l45.c b/sound/soc/sdw_utils/soc_sdw_cs42l45.c
new file mode 100644
index 000000000000..647923d9669f
--- /dev/null
+++ b/sound/soc/sdw_utils/soc_sdw_cs42l45.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Based on sof_sdw_rt5682.c
+// This file incorporates work covered by the following copyright notice:
+// Copyright (c) 2023 Intel Corporation
+// Copyright (c) 2024 Advanced Micro Devices, Inc.
+
+/*
+ *  soc_sdw_cs42l45 - Helpers to handle CS42L45 from generic machine driver
+ */
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <sound/jack.h>
+#include <sound/soc.h>
+#include <sound/soc-card.h>
+#include <sound/soc-component.h>
+#include <sound/soc-dai.h>
+#include <sound/soc_sdw_utils.h>
+
+static struct snd_soc_jack_pin soc_jack_pins[] = {
+	{
+		.pin    = "cs42l45 OT 43 Headphone",
+		.mask   = SND_JACK_HEADPHONE,
+	},
+	{
+		.pin    = "cs42l45 OT 45 Headset",
+		.mask   = SND_JACK_HEADPHONE,
+	},
+	{
+		.pin    = "cs42l45 IT 31 Microphone",
+		.mask   = SND_JACK_MICROPHONE,
+	},
+	{
+		.pin    = "cs42l45 IT 33 Headset",
+		.mask   = SND_JACK_MICROPHONE,
+	},
+};
+
+int asoc_sdw_cs42l45_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai)
+{
+	struct snd_soc_card *card = rtd->card;
+	struct snd_soc_component *component = snd_soc_rtd_to_codec(rtd, 0)->component;
+	struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card);
+	struct snd_soc_jack *jack = &ctx->sdw_headset;
+	int ret;
+
+	card->components = devm_kasprintf(card->dev, GFP_KERNEL, "%s hs:cs42l45",
+					  card->components);
+	if (!card->components)
+		return -ENOMEM;
+
+	ret = snd_soc_card_jack_new_pins(card, "Jack", SND_JACK_MECHANICAL |
+					 SND_JACK_HEADSET | SND_JACK_LINEOUT, jack,
+					 soc_jack_pins, ARRAY_SIZE(soc_jack_pins));
+	if (ret) {
+		dev_err(card->dev, "Failed to create jack: %d\n", ret);
+		return ret;
+	}
+
+	ret = snd_soc_component_set_jack(component, jack, NULL);
+	if (ret) {
+		dev_err(card->dev, "Failed to register jack: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(asoc_sdw_cs42l45_hs_rtd_init, "SND_SOC_SDW_UTILS");
+
+int asoc_sdw_cs42l45_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai)
+{
+	struct snd_soc_card *card = rtd->card;
+
+	card->components = devm_kasprintf(card->dev, GFP_KERNEL, "%s mic:cs42l45-dmic",
+					  card->components);
+	if (!card->components)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(asoc_sdw_cs42l45_dmic_rtd_init, "SND_SOC_SDW_UTILS");
-- 
cgit v1.2.3


From 8c8e3df3d2f51e9a3f6f1a1112adf250f7652d42 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Mon, 27 Oct 2025 23:33:00 +0100
Subject: keys: Fix grammar and formatting in 'struct key_type' comments

s/it/if/ and s/revokation/revocation/, capitalize "clear", and add a
period after the sentence. Fix the comment formatting.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/key-type.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index 5caf3ce82373..bb97bd3e5af4 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -107,11 +107,14 @@ struct key_type {
 	 */
 	int (*match_preparse)(struct key_match_data *match_data);
 
-	/* Free preparsed match data (optional).  This should be supplied it
-	 * ->match_preparse() is supplied. */
+	/*
+	 * Free preparsed match data (optional).  This should be supplied if
+	 * ->match_preparse() is supplied.
+	 */
 	void (*match_free)(struct key_match_data *match_data);
 
-	/* clear some of the data from a key on revokation (optional)
+	/*
+	 * Clear some of the data from a key on revocation (optional).
 	 * - the key's semaphore will be write-locked by the caller
 	 */
 	void (*revoke)(struct key *key);
-- 
cgit v1.2.3


From c2d2dad24503d7e2eb7cba354fcc73f95fa78d7a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 14 Nov 2025 14:06:45 +0000
Subject: rbtree: inline rb_first()

Patch series "rbtree: inline rb_first() and rb_last()".

Inline these two small helpers, heavily used in TCP and FQ packet scheduler,
and in many other places.

This reduces kernel text size, and brings a 1.5% improvement on a network
TCP stress test.


This patch (of 2):

This is a very small function, inlining it saves cpu cycles by reducing
register pressure and removing call/ret overhead.

It also reduces vmlinux text size by 744 bytes on a typical x86_64 build.

Before:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34812525	22177365	5685248	62675138	3bc58c2	vmlinux

After:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34811781	22177365	5685248	62674394	3bc55da	vmlinux

[ojeda@kernel.org: fix rust build]
  Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org
Link: https://lkml.kernel.org/r/20251114140646.3817319-1-edumazet@google.com
Link: https://lkml.kernel.org/r/20251114140646.3817319-2-edumazet@google.com
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree.h | 16 +++++++++++++++-
 lib/rbtree.c           | 16 ----------------
 rust/helpers/rbtree.c  |  5 +++++
 3 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 8d2ba3749866..484554900f7d 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -43,7 +43,21 @@ extern void rb_erase(struct rb_node *, struct rb_root *);
 /* Find logical next and previous nodes in a tree */
 extern struct rb_node *rb_next(const struct rb_node *);
 extern struct rb_node *rb_prev(const struct rb_node *);
-extern struct rb_node *rb_first(const struct rb_root *);
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+static inline struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
 extern struct rb_node *rb_last(const struct rb_root *);
 
 /* Postorder iteration - always visit the parent after its children */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 5114eda6309c..b946eb4b759d 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -460,22 +460,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 }
 EXPORT_SYMBOL(__rb_insert_augmented);
 
-/*
- * This function returns the first node (in sort order) of the tree.
- */
-struct rb_node *rb_first(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_left)
-		n = n->rb_left;
-	return n;
-}
-EXPORT_SYMBOL(rb_first);
-
 struct rb_node *rb_last(const struct rb_root *root)
 {
 	struct rb_node	*n;
diff --git a/rust/helpers/rbtree.c b/rust/helpers/rbtree.c
index 6d404b84a9b5..d0e452234632 100644
--- a/rust/helpers/rbtree.c
+++ b/rust/helpers/rbtree.c
@@ -7,3 +7,8 @@ void rust_helper_rb_link_node(struct rb_node *node, struct rb_node *parent,
 {
 	rb_link_node(node, parent, rb_link);
 }
+
+struct rb_node *rust_helper_rb_first(const struct rb_root *root)
+{
+	return rb_first(root);
+}
-- 
cgit v1.2.3


From 94984bfed58ca129f7e259ce09973ed0b3f540a8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 14 Nov 2025 14:06:46 +0000
Subject: rbtree: inline rb_last()

This is a very small function, inlining it saves cpu cycles in TCP by
reducing register pressure and removing call/ret overhead.

It also reduces vmlinux text size by 122 bytes on a typical x86_64 build.

Before:

size vmlinux
   text    data     bss     dec     hex filename
34811781        22177365        5685248 62674394        3bc55da vmlinux

After:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34811659	22177365	5685248	62674272	3bc5560	vmlinux

[ojeda@kernel.org: fix rust build]
  Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org
Link: https://lkml.kernel.org/r/20251114140646.3817319-3-edumazet@google.com
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree.h | 16 +++++++++++++++-
 lib/rbtree.c           | 13 -------------
 rust/helpers/rbtree.c  |  5 +++++
 3 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 484554900f7d..4091e978aef2 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -58,7 +58,21 @@ static inline struct rb_node *rb_first(const struct rb_root *root)
 		n = n->rb_left;
 	return n;
 }
-extern struct rb_node *rb_last(const struct rb_root *);
+
+/*
+ * This function returns the last node (in sort order) of the tree.
+ */
+static inline struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
 
 /* Postorder iteration - always visit the parent after its children */
 extern struct rb_node *rb_first_postorder(const struct rb_root *);
diff --git a/lib/rbtree.c b/lib/rbtree.c
index b946eb4b759d..18d42bcf4ec9 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -460,19 +460,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 }
 EXPORT_SYMBOL(__rb_insert_augmented);
 
-struct rb_node *rb_last(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_right)
-		n = n->rb_right;
-	return n;
-}
-EXPORT_SYMBOL(rb_last);
-
 struct rb_node *rb_next(const struct rb_node *node)
 {
 	struct rb_node *parent;
diff --git a/rust/helpers/rbtree.c b/rust/helpers/rbtree.c
index d0e452234632..2a0eabbb4160 100644
--- a/rust/helpers/rbtree.c
+++ b/rust/helpers/rbtree.c
@@ -12,3 +12,8 @@ struct rb_node *rust_helper_rb_first(const struct rb_root *root)
 {
 	return rb_first(root);
 }
+
+struct rb_node *rust_helper_rb_last(const struct rb_root *root)
+{
+	return rb_last(root);
+}
-- 
cgit v1.2.3


From 70f9133096c833922c3b63461480248cefa7bb0f Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sat, 1 Nov 2025 10:23:18 -0400
Subject: kho: drop notifiers

The KHO framework uses a notifier chain as the mechanism for clients to
participate in the finalization process.  While this works for a single,
central state machine, it is too restrictive for kernel-internal
components like pstore/reserve_mem or IMA.  These components need a
simpler, direct way to register their state for preservation (e.g., during
their initcall) without being part of a complex, shutdown-time notifier
sequence.  The notifier model forces all participants into a single
finalization flow and makes direct preservation from an arbitrary context
difficult.  This patch refactors the client participation model by
removing the notifier chain and introducing a direct API for managing FDT
subtrees.

The core kho_finalize() and kho_abort() state machine remains, but clients
now register their data with KHO beforehand.

Link: https://lkml.kernel.org/r/20251101142325.1326536-3-pasha.tatashin@soleen.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h   |  28 ++-----
 kernel/kexec_handover.c          | 166 +++++++++++++++++++++------------------
 kernel/kexec_handover_debugfs.c  |  17 ++--
 kernel/kexec_handover_internal.h |   5 +-
 lib/test_kho.c                   |  35 +--------
 mm/memblock.c                    |  62 +++------------
 6 files changed, 125 insertions(+), 188 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 25042c1d8d54..0d860d793b66 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -10,14 +10,7 @@ struct kho_scratch {
 	phys_addr_t size;
 };
 
-/* KHO Notifier index */
-enum kho_event {
-	KEXEC_KHO_FINALIZE = 0,
-	KEXEC_KHO_ABORT = 1,
-};
-
 struct folio;
-struct notifier_block;
 struct page;
 
 #define DECLARE_KHOSER_PTR(name, type) \
@@ -37,8 +30,6 @@ struct page;
 		(typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \
 	})
 
-struct kho_serialization;
-
 struct kho_vmalloc_chunk;
 struct kho_vmalloc {
 	DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
@@ -57,12 +48,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
 struct folio *kho_restore_folio(phys_addr_t phys);
 struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages);
 void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
-int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
+int kho_add_subtree(const char *name, void *fdt);
+void kho_remove_subtree(void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
-int register_kho_notifier(struct notifier_block *nb);
-int unregister_kho_notifier(struct notifier_block *nb);
-
 void kho_memory_init(void);
 
 void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys,
@@ -110,23 +99,16 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
 	return NULL;
 }
 
-static inline int kho_add_subtree(struct kho_serialization *ser,
-				  const char *name, void *fdt)
+static inline int kho_add_subtree(const char *name, void *fdt)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+static inline void kho_remove_subtree(void *fdt)
 {
-	return -EOPNOTSUPP;
 }
 
-static inline int register_kho_notifier(struct notifier_block *nb)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int unregister_kho_notifier(struct notifier_block *nb)
+static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index befa6ceab574..3dd917bfedcc 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -16,7 +16,6 @@
 #include <linux/libfdt.h>
 #include <linux/list.h>
 #include <linux/memblock.h>
-#include <linux/notifier.h>
 #include <linux/page-isolation.h>
 #include <linux/vmalloc.h>
 
@@ -103,29 +102,34 @@ struct kho_mem_track {
 
 struct khoser_mem_chunk;
 
-struct kho_serialization {
-	struct page *fdt;
-	struct kho_mem_track track;
-	/* First chunk of serialized preserved memory map */
-	struct khoser_mem_chunk *preserved_mem_map;
+struct kho_sub_fdt {
+	struct list_head l;
+	const char *name;
+	void *fdt;
 };
 
 struct kho_out {
-	struct blocking_notifier_head chain_head;
-	struct mutex lock; /* protects KHO FDT finalization */
-	struct kho_serialization ser;
+	void *fdt;
 	bool finalized;
+	struct mutex lock; /* protects KHO FDT finalization */
+
+	struct list_head sub_fdts;
+	struct mutex fdts_lock;
+
+	struct kho_mem_track track;
+	/* First chunk of serialized preserved memory map */
+	struct khoser_mem_chunk *preserved_mem_map;
+
 	struct kho_debugfs dbg;
 };
 
 static struct kho_out kho_out = {
-	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
 	.lock = __MUTEX_INITIALIZER(kho_out.lock),
-	.ser = {
-		.track = {
-			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
-		},
+	.track = {
+		.orders = XARRAY_INIT(kho_out.track.orders, 0),
 	},
+	.sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts),
+	.fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock),
 	.finalized = false,
 };
 
@@ -369,7 +373,7 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
 	}
 }
 
-static int kho_mem_serialize(struct kho_serialization *ser)
+static int kho_mem_serialize(struct kho_out *kho_out)
 {
 	struct khoser_mem_chunk *first_chunk = NULL;
 	struct khoser_mem_chunk *chunk = NULL;
@@ -377,7 +381,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
 	unsigned long order;
 	int err = -ENOMEM;
 
-	xa_for_each(&ser->track.orders, order, physxa) {
+	xa_for_each(&kho_out->track.orders, order, physxa) {
 		struct kho_mem_phys_bits *bits;
 		unsigned long phys;
 
@@ -409,7 +413,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
 		}
 	}
 
-	ser->preserved_mem_map = first_chunk;
+	kho_out->preserved_mem_map = first_chunk;
 
 	return 0;
 
@@ -670,7 +674,6 @@ err_disable_kho:
 
 /**
  * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
- * @ser: serialization control object passed by KHO notifiers.
  * @name: name of the sub tree.
  * @fdt: the sub tree blob.
  *
@@ -684,34 +687,41 @@ err_disable_kho:
  *
  * Return: 0 on success, error code on failure
  */
-int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
+int kho_add_subtree(const char *name, void *fdt)
 {
-	int err = 0;
-	u64 phys = (u64)virt_to_phys(fdt);
-	void *root = page_to_virt(ser->fdt);
+	struct kho_sub_fdt *sub_fdt;
 
-	err |= fdt_begin_node(root, name);
-	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
-	err |= fdt_end_node(root);
+	sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL);
+	if (!sub_fdt)
+		return -ENOMEM;
 
-	if (err)
-		return err;
+	INIT_LIST_HEAD(&sub_fdt->l);
+	sub_fdt->name = name;
+	sub_fdt->fdt = fdt;
 
-	return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false);
-}
-EXPORT_SYMBOL_GPL(kho_add_subtree);
+	guard(mutex)(&kho_out.fdts_lock);
+	list_add_tail(&sub_fdt->l, &kho_out.sub_fdts);
+	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
 
-int register_kho_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(register_kho_notifier);
+EXPORT_SYMBOL_GPL(kho_add_subtree);
 
-int unregister_kho_notifier(struct notifier_block *nb)
+void kho_remove_subtree(void *fdt)
 {
-	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
+	struct kho_sub_fdt *sub_fdt;
+
+	guard(mutex)(&kho_out.fdts_lock);
+	list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) {
+		if (sub_fdt->fdt == fdt) {
+			list_del(&sub_fdt->l);
+			kfree(sub_fdt);
+			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+			break;
+		}
+	}
 }
-EXPORT_SYMBOL_GPL(unregister_kho_notifier);
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
 
 /**
  * kho_preserve_folio - preserve a folio across kexec.
@@ -726,7 +736,7 @@ int kho_preserve_folio(struct folio *folio)
 {
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
-	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_mem_track *track = &kho_out.track;
 
 	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
 		return -EINVAL;
@@ -747,7 +757,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
  */
 int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
-	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_mem_track *track = &kho_out.track;
 	const unsigned long start_pfn = page_to_pfn(page);
 	const unsigned long end_pfn = start_pfn + nr_pages;
 	unsigned long pfn = start_pfn;
@@ -849,7 +859,7 @@ err_free:
 static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
 					 unsigned short order)
 {
-	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_mem_track *track = &kho_out.track;
 	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
 
 	__kho_unpreserve(track, pfn, pfn + 1);
@@ -1031,11 +1041,11 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
 
 static int __kho_abort(void)
 {
-	int err;
+	int err = 0;
 	unsigned long order;
 	struct kho_mem_phys *physxa;
 
-	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
+	xa_for_each(&kho_out.track.orders, order, physxa) {
 		struct kho_mem_phys_bits *bits;
 		unsigned long phys;
 
@@ -1045,17 +1055,13 @@ static int __kho_abort(void)
 		xa_destroy(&physxa->phys_bits);
 		kfree(physxa);
 	}
-	xa_destroy(&kho_out.ser.track.orders);
+	xa_destroy(&kho_out.track.orders);
 
-	if (kho_out.ser.preserved_mem_map) {
-		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
-		kho_out.ser.preserved_mem_map = NULL;
+	if (kho_out.preserved_mem_map) {
+		kho_mem_ser_free(kho_out.preserved_mem_map);
+		kho_out.preserved_mem_map = NULL;
 	}
 
-	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
-					   NULL);
-	err = notifier_to_errno(err);
-
 	if (err)
 		pr_err("Failed to abort KHO finalization: %d\n", err);
 
@@ -1078,7 +1084,8 @@ int kho_abort(void)
 		return ret;
 
 	kho_out.finalized = false;
-	kho_debugfs_cleanup(&kho_out.dbg);
+
+	kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt);
 
 	return 0;
 }
@@ -1087,41 +1094,46 @@ static int __kho_finalize(void)
 {
 	int err = 0;
 	u64 *preserved_mem_map;
-	void *fdt = page_to_virt(kho_out.ser.fdt);
+	void *root = kho_out.fdt;
+	struct kho_sub_fdt *fdt;
 
-	err |= fdt_create(fdt, PAGE_SIZE);
-	err |= fdt_finish_reservemap(fdt);
-	err |= fdt_begin_node(fdt, "");
-	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
+	err |= fdt_create(root, PAGE_SIZE);
+	err |= fdt_finish_reservemap(root);
+	err |= fdt_begin_node(root, "");
+	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
 	/**
 	 * Reserve the preserved-memory-map property in the root FDT, so
 	 * that all property definitions will precede subnodes created by
 	 * KHO callers.
 	 */
-	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
+	err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP,
 					sizeof(*preserved_mem_map),
 					(void **)&preserved_mem_map);
 	if (err)
 		goto abort;
 
-	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
+	err = kho_preserve_folio(virt_to_folio(kho_out.fdt));
 	if (err)
 		goto abort;
 
-	err = blocking_notifier_call_chain(&kho_out.chain_head,
-					   KEXEC_KHO_FINALIZE, &kho_out.ser);
-	err = notifier_to_errno(err);
+	err = kho_mem_serialize(&kho_out);
 	if (err)
 		goto abort;
 
-	err = kho_mem_serialize(&kho_out.ser);
-	if (err)
-		goto abort;
+	*preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map);
+
+	mutex_lock(&kho_out.fdts_lock);
+	list_for_each_entry(fdt, &kho_out.sub_fdts, l) {
+		phys_addr_t phys = virt_to_phys(fdt->fdt);
 
-	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
+		err |= fdt_begin_node(root, fdt->name);
+		err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
+		err |= fdt_end_node(root);
+	}
+	mutex_unlock(&kho_out.fdts_lock);
 
-	err |= fdt_end_node(fdt);
-	err |= fdt_finish(fdt);
+	err |= fdt_end_node(root);
+	err |= fdt_finish(root);
 
 abort:
 	if (err) {
@@ -1149,8 +1161,10 @@ int kho_finalize(void)
 
 	kho_out.finalized = true;
 
-	return kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
-				   page_to_virt(kho_out.ser.fdt), true);
+	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
+					 kho_out.fdt, true));
+
+	return 0;
 }
 
 bool kho_finalized(void)
@@ -1233,15 +1247,17 @@ static __init int kho_init(void)
 {
 	int err = 0;
 	const void *fdt = kho_get_fdt();
+	struct page *fdt_page;
 
 	if (!kho_enable)
 		return 0;
 
-	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
-	if (!kho_out.ser.fdt) {
+	fdt_page = alloc_page(GFP_KERNEL);
+	if (!fdt_page) {
 		err = -ENOMEM;
 		goto err_free_scratch;
 	}
+	kho_out.fdt = page_to_virt(fdt_page);
 
 	err = kho_debugfs_init();
 	if (err)
@@ -1269,8 +1285,8 @@ static __init int kho_init(void)
 	return 0;
 
 err_free_fdt:
-	put_page(kho_out.ser.fdt);
-	kho_out.ser.fdt = NULL;
+	put_page(fdt_page);
+	kho_out.fdt = NULL;
 err_free_scratch:
 	for (int i = 0; i < kho_scratch_cnt; i++) {
 		void *start = __va(kho_scratch[i].addr);
@@ -1281,7 +1297,7 @@ err_free_scratch:
 	kho_enable = false;
 	return err;
 }
-late_initcall(kho_init);
+fs_initcall(kho_init);
 
 static void __init kho_release_scratch(void)
 {
@@ -1417,7 +1433,7 @@ int kho_fill_kimage(struct kimage *image)
 	if (!kho_out.finalized)
 		return 0;
 
-	image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+	image->kho.fdt = virt_to_phys(kho_out.fdt);
 
 	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
 	scratch = (struct kexec_buf){
diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c
index a91b279f1b23..46e9e6c0791f 100644
--- a/kernel/kexec_handover_debugfs.c
+++ b/kernel/kexec_handover_debugfs.c
@@ -61,14 +61,17 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
 	return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
 }
 
-void kho_debugfs_cleanup(struct kho_debugfs *dbg)
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
 {
-	struct fdt_debugfs *ff, *tmp;
-
-	list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) {
-		debugfs_remove(ff->file);
-		list_del(&ff->list);
-		kfree(ff);
+	struct fdt_debugfs *ff;
+
+	list_for_each_entry(ff, &dbg->fdt_list, list) {
+		if (ff->wrapper.data == fdt) {
+			debugfs_remove(ff->file);
+			list_del(&ff->list);
+			kfree(ff);
+			break;
+		}
 	}
 }
 
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h
index 217b8b25a542..52ed73659fe6 100644
--- a/kernel/kexec_handover_internal.h
+++ b/kernel/kexec_handover_internal.h
@@ -32,7 +32,7 @@ void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
 int kho_out_debugfs_init(struct kho_debugfs *dbg);
 int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
 			const void *fdt, bool root);
-void kho_debugfs_cleanup(struct kho_debugfs *dbg);
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
 #else
 static inline int kho_debugfs_init(void) { return 0; }
 static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
@@ -40,7 +40,8 @@ static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
 static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
 static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
 				      const void *fdt, bool root) { return 0; }
-static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {}
+static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
+					  void *fdt) { }
 #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
 
 #ifdef CONFIG_KEXEC_HANDOVER_DEBUG
diff --git a/lib/test_kho.c b/lib/test_kho.c
index fff018e5548d..27618c5b4796 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -39,33 +39,6 @@ struct kho_test_state {
 
 static struct kho_test_state kho_test_state;
 
-static int kho_test_notifier(struct notifier_block *self, unsigned long cmd,
-			     void *v)
-{
-	struct kho_test_state *state = &kho_test_state;
-	struct kho_serialization *ser = v;
-	int err = 0;
-
-	switch (cmd) {
-	case KEXEC_KHO_ABORT:
-		return NOTIFY_DONE;
-	case KEXEC_KHO_FINALIZE:
-		/* Handled below */
-		break;
-	default:
-		return NOTIFY_BAD;
-	}
-
-	err |= kho_preserve_folio(state->fdt);
-	err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt));
-
-	return err ? NOTIFY_BAD : NOTIFY_DONE;
-}
-
-static struct notifier_block kho_test_nb = {
-	.notifier_call = kho_test_notifier,
-};
-
 static int kho_test_save_data(struct kho_test_state *state, void *fdt)
 {
 	phys_addr_t *folios_info __free(kvfree) = NULL;
@@ -120,6 +93,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state)
 
 	fdt = folio_address(state->fdt);
 
+	err |= kho_preserve_folio(state->fdt);
 	err |= fdt_create(fdt, fdt_size);
 	err |= fdt_finish_reservemap(fdt);
 
@@ -131,6 +105,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state)
 
 	err |= fdt_finish(fdt);
 
+	err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt));
 	if (err)
 		folio_put(state->fdt);
 
@@ -203,10 +178,6 @@ static int kho_test_save(void)
 	if (err)
 		goto err_free_folios;
 
-	err = register_kho_notifier(&kho_test_nb);
-	if (err)
-		goto err_free_fdt;
-
 	return 0;
 
 err_free_fdt:
@@ -329,7 +300,7 @@ static void kho_test_cleanup(void)
 
 static void __exit kho_test_exit(void)
 {
-	unregister_kho_notifier(&kho_test_nb);
+	kho_remove_subtree(folio_address(kho_test_state.fdt));
 	kho_test_cleanup();
 }
 module_exit(kho_test_exit);
diff --git a/mm/memblock.c b/mm/memblock.c
index e23e16618e9b..e3bef9b35d63 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2444,53 +2444,18 @@ int reserve_mem_release_by_name(const char *name)
 #define MEMBLOCK_KHO_FDT "memblock"
 #define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
 #define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
-static struct page *kho_fdt;
-
-static int reserve_mem_kho_finalize(struct kho_serialization *ser)
-{
-	int err = 0, i;
-
-	for (i = 0; i < reserved_mem_count; i++) {
-		struct reserve_mem_table *map = &reserved_mem_table[i];
-		struct page *page = phys_to_page(map->start);
-		unsigned int nr_pages = map->size >> PAGE_SHIFT;
-
-		err |= kho_preserve_pages(page, nr_pages);
-	}
-
-	err |= kho_preserve_folio(page_folio(kho_fdt));
-	err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
-
-	return notifier_from_errno(err);
-}
-
-static int reserve_mem_kho_notifier(struct notifier_block *self,
-				    unsigned long cmd, void *v)
-{
-	switch (cmd) {
-	case KEXEC_KHO_FINALIZE:
-		return reserve_mem_kho_finalize((struct kho_serialization *)v);
-	case KEXEC_KHO_ABORT:
-		return NOTIFY_DONE;
-	default:
-		return NOTIFY_BAD;
-	}
-}
-
-static struct notifier_block reserve_mem_kho_nb = {
-	.notifier_call = reserve_mem_kho_notifier,
-};
 
 static int __init prepare_kho_fdt(void)
 {
 	int err = 0, i;
+	struct page *fdt_page;
 	void *fdt;
 
-	kho_fdt = alloc_page(GFP_KERNEL);
-	if (!kho_fdt)
+	fdt_page = alloc_page(GFP_KERNEL);
+	if (!fdt_page)
 		return -ENOMEM;
 
-	fdt = page_to_virt(kho_fdt);
+	fdt = page_to_virt(fdt_page);
 
 	err |= fdt_create(fdt, PAGE_SIZE);
 	err |= fdt_finish_reservemap(fdt);
@@ -2499,7 +2464,10 @@ static int __init prepare_kho_fdt(void)
 	err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
 	for (i = 0; i < reserved_mem_count; i++) {
 		struct reserve_mem_table *map = &reserved_mem_table[i];
+		struct page *page = phys_to_page(map->start);
+		unsigned int nr_pages = map->size >> PAGE_SHIFT;
 
+		err |= kho_preserve_pages(page, nr_pages);
 		err |= fdt_begin_node(fdt, map->name);
 		err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
 		err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
@@ -2507,13 +2475,16 @@ static int __init prepare_kho_fdt(void)
 		err |= fdt_end_node(fdt);
 	}
 	err |= fdt_end_node(fdt);
-
 	err |= fdt_finish(fdt);
 
+	err |= kho_preserve_folio(page_folio(fdt_page));
+
+	if (!err)
+		err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);
+
 	if (err) {
 		pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
-		put_page(kho_fdt);
-		kho_fdt = NULL;
+		put_page(fdt_page);
 	}
 
 	return err;
@@ -2529,13 +2500,6 @@ static int __init reserve_mem_init(void)
 	err = prepare_kho_fdt();
 	if (err)
 		return err;
-
-	err = register_kho_notifier(&reserve_mem_kho_nb);
-	if (err) {
-		put_page(kho_fdt);
-		kho_fdt = NULL;
-	}
-
 	return err;
 }
 late_initcall(reserve_mem_init);
-- 
cgit v1.2.3


From 36f8f7ef7fd2f238922e9d217e86c69838319d8c Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Sat, 1 Nov 2025 10:23:19 -0400
Subject: kho: add interfaces to unpreserve folios, page ranges, and vmalloc

Allow users of KHO to cancel a previous preservation by adding the
necessary interfaces to unpreserve folios, page ranges, and vmalloc areas.

Link: https://lkml.kernel.org/r/20251101142325.1326536-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h |  18 +++++++
 kernel/kexec_handover.c        | 104 +++++++++++++++++++++++++++++++++++------
 2 files changed, 109 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 0d860d793b66..80ece4232617 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -43,8 +43,11 @@ bool kho_is_enabled(void);
 bool is_kho_boot(void);
 
 int kho_preserve_folio(struct folio *folio);
+int kho_unpreserve_folio(struct folio *folio);
 int kho_preserve_pages(struct page *page, unsigned int nr_pages);
+int kho_unpreserve_pages(struct page *page, unsigned int nr_pages);
 int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
+int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation);
 struct folio *kho_restore_folio(phys_addr_t phys);
 struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages);
 void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
@@ -72,17 +75,32 @@ static inline int kho_preserve_folio(struct folio *folio)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_unpreserve_folio(struct folio *folio)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int kho_preserve_vmalloc(void *ptr,
 				       struct kho_vmalloc *preservation)
 {
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 3dd917bfedcc..4e033f96637d 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -157,26 +157,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
 	return no_free_ptr(elm);
 }
 
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
-			     unsigned long end_pfn)
+static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
+				   unsigned int order)
 {
 	struct kho_mem_phys_bits *bits;
 	struct kho_mem_phys *physxa;
+	const unsigned long pfn_high = pfn >> order;
 
-	while (pfn < end_pfn) {
-		const unsigned int order =
-			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
-		const unsigned long pfn_high = pfn >> order;
+	physxa = xa_load(&track->orders, order);
+	if (WARN_ON_ONCE(!physxa))
+		return;
 
-		physxa = xa_load(&track->orders, order);
-		if (WARN_ON_ONCE(!physxa))
-			return;
+	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+	if (WARN_ON_ONCE(!bits))
+		return;
 
-		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
-		if (WARN_ON_ONCE(!bits))
-			return;
+	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+}
+
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+			     unsigned long end_pfn)
+{
+	unsigned int order;
+
+	while (pfn < end_pfn) {
+		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
 
-		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+		__kho_unpreserve_order(track, pfn, order);
 
 		pfn += 1 << order;
 	}
@@ -745,6 +752,30 @@ int kho_preserve_folio(struct folio *folio)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
+/**
+ * kho_unpreserve_folio - unpreserve a folio.
+ * @folio: folio to unpreserve.
+ *
+ * Instructs KHO to unpreserve a folio that was preserved by
+ * kho_preserve_folio() before. The provided @folio (pfn and order)
+ * must exactly match a previously preserved folio.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_unpreserve_folio(struct folio *folio)
+{
+	const unsigned long pfn = folio_pfn(folio);
+	const unsigned int order = folio_order(folio);
+	struct kho_mem_track *track = &kho_out.track;
+
+	if (kho_out.finalized)
+		return -EBUSY;
+
+	__kho_unpreserve_order(track, pfn, order);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
 /**
  * kho_preserve_pages - preserve contiguous pages across kexec
  * @page: first page in the list.
@@ -789,6 +820,33 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_pages);
 
+/**
+ * kho_unpreserve_pages - unpreserve contiguous pages.
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
+ * This must be called with the same @page and @nr_pages as the corresponding
+ * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
+ * preserved blocks is not supported.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+	struct kho_mem_track *track = &kho_out.track;
+	const unsigned long start_pfn = page_to_pfn(page);
+	const unsigned long end_pfn = start_pfn + nr_pages;
+
+	if (kho_out.finalized)
+		return -EBUSY;
+
+	__kho_unpreserve(track, start_pfn, end_pfn);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
+
 struct kho_vmalloc_hdr {
 	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
 };
@@ -950,6 +1008,26 @@ err_free:
 }
 EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
 
+/**
+ * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
+ * @preservation: preservation metadata returned by kho_preserve_vmalloc()
+ *
+ * Instructs KHO to unpreserve the area in vmalloc address space that was
+ * previously preserved with kho_preserve_vmalloc().
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+	if (kho_out.finalized)
+		return -EBUSY;
+
+	kho_vmalloc_free_chunks(preservation);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
+
 /**
  * kho_restore_vmalloc - recreates and populates an area in vmalloc address
  * space from the preserved memory.
-- 
cgit v1.2.3


From 4c205677af2726bd3b51c02ab6a5a2b411efed09 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 14 Nov 2025 13:59:52 -0500
Subject: kho: introduce high-level memory allocation API

Currently, clients of KHO must manually allocate memory (e.g., via
alloc_pages), calculate the page order, and explicitly call
kho_preserve_folio().  Similarly, cleanup requires separate calls to
unpreserve and free the memory.

Introduce a high-level API to streamline this common pattern:

- kho_alloc_preserve(size): Allocates physically contiguous, zeroed
  memory and immediately marks it for preservation.
- kho_unpreserve_free(ptr): Unpreserves and frees the memory
  in the current kernel.
- kho_restore_free(ptr): Restores the struct page state of
  preserved memory in the new kernel and immediately frees it to the
  page allocator.

[pasha.tatashin@soleen.com: build fixes]
  Link: https://lkml.kernel.org/r/CA+CK2bBgXDhrHwTVgxrw7YTQ-0=LgW0t66CwPCgG=C85ftz4zw@mail.gmail.com
Link: https://lkml.kernel.org/r/20251114190002.3311679-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h     | 22 +++++++---
 kernel/liveupdate/kexec_handover.c | 87 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 80ece4232617..dde952227b88 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -2,8 +2,9 @@
 #ifndef LINUX_KEXEC_HANDOVER_H
 #define LINUX_KEXEC_HANDOVER_H
 
-#include <linux/types.h>
+#include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/types.h>
 
 struct kho_scratch {
 	phys_addr_t addr;
@@ -48,6 +49,9 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages);
 int kho_unpreserve_pages(struct page *page, unsigned int nr_pages);
 int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
 int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation);
+void *kho_alloc_preserve(size_t size);
+void kho_unpreserve_free(void *mem);
+void kho_restore_free(void *mem);
 struct folio *kho_restore_folio(phys_addr_t phys);
 struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages);
 void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
@@ -101,6 +105,14 @@ static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
 	return -EOPNOTSUPP;
 }
 
+static inline void *kho_alloc_preserve(size_t size)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void kho_unpreserve_free(void *mem) { }
+static inline void kho_restore_free(void *mem) { }
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
@@ -122,18 +134,14 @@ static inline int kho_add_subtree(const char *name, void *fdt)
 	return -EOPNOTSUPP;
 }
 
-static inline void kho_remove_subtree(void *fdt)
-{
-}
+static inline void kho_remove_subtree(void *fdt) { }
 
 static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline void kho_memory_init(void)
-{
-}
+static inline void kho_memory_init(void) { }
 
 static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
 				phys_addr_t scratch_phys, u64 scratch_len)
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index bc7f046a1313..5c5c9c46fe92 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
  * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
  * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
  */
 
 #define pr_fmt(fmt) "KHO: " fmt
@@ -1117,6 +1118,92 @@ err_free_pages_array:
 }
 EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
 
+/**
+ * kho_alloc_preserve - Allocate, zero, and preserve memory.
+ * @size: The number of bytes to allocate.
+ *
+ * Allocates a physically contiguous block of zeroed pages that is large
+ * enough to hold @size bytes. The allocated memory is then registered with
+ * KHO for preservation across a kexec.
+ *
+ * Note: The actual allocated size will be rounded up to the nearest
+ * power-of-two page boundary.
+ *
+ * @return A virtual pointer to the allocated and preserved memory on success,
+ * or an ERR_PTR() encoded error on failure.
+ */
+void *kho_alloc_preserve(size_t size)
+{
+	struct folio *folio;
+	int order, ret;
+
+	if (!size)
+		return ERR_PTR(-EINVAL);
+
+	order = get_order(size);
+	if (order > MAX_PAGE_ORDER)
+		return ERR_PTR(-E2BIG);
+
+	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+	if (!folio)
+		return ERR_PTR(-ENOMEM);
+
+	ret = kho_preserve_folio(folio);
+	if (ret) {
+		folio_put(folio);
+		return ERR_PTR(ret);
+	}
+
+	return folio_address(folio);
+}
+EXPORT_SYMBOL_GPL(kho_alloc_preserve);
+
+/**
+ * kho_unpreserve_free - Unpreserve and free memory.
+ * @mem:  Pointer to the memory allocated by kho_alloc_preserve().
+ *
+ * Unregisters the memory from KHO preservation and frees the underlying
+ * pages back to the system. This function should be called to clean up
+ * memory allocated with kho_alloc_preserve().
+ */
+void kho_unpreserve_free(void *mem)
+{
+	struct folio *folio;
+
+	if (!mem)
+		return;
+
+	folio = virt_to_folio(mem);
+	WARN_ON_ONCE(kho_unpreserve_folio(folio));
+	folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_free);
+
+/**
+ * kho_restore_free - Restore and free memory after kexec.
+ * @mem:  Pointer to the memory (in the new kernel's address space)
+ * that was allocated by the old kernel.
+ *
+ * This function is intended to be called in the new kernel (post-kexec)
+ * to take ownership of and free a memory region that was preserved by the
+ * old kernel using kho_alloc_preserve().
+ *
+ * It first restores the pages from KHO (using their physical address)
+ * and then frees the pages back to the new kernel's page allocator.
+ */
+void kho_restore_free(void *mem)
+{
+	struct folio *folio;
+
+	if (!mem)
+		return;
+
+	folio = kho_restore_folio(__pa(mem));
+	if (!WARN_ON(!folio))
+		folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_restore_free);
+
 static void __kho_abort(void)
 {
 	if (kho_out.preserved_mem_map) {
-- 
cgit v1.2.3


From de51999e687c70a41997124b43291f84324c7924 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 14 Nov 2025 14:00:01 -0500
Subject: kho: allow memory preservation state updates after finalization

Currently, kho_preserve_* and kho_unpreserve_* return -EBUSY if KHO is
finalized.  This enforces a rigid "freeze" on the KHO memory state.

With the introduction of re-entrant finalization, this restriction is no
longer necessary.  Users should be allowed to modify the preservation set
(e.g., adding new pages or freeing old ones) even after an initial
finalization.

The intended workflow for updates is now:
1. Modify state (preserve/unpreserve).
2. Call kho_finalize() again to refresh the serialized metadata.

Remove the kho_out.finalized checks to enable this dynamic behavior.

This also allows converting the kho_unpreserve_* functions to void, as they
no longer return any error.

Link: https://lkml.kernel.org/r/20251114190002.3311679-13-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec_handover.h     | 21 +++++----------
 kernel/liveupdate/kexec_handover.c | 55 +++++++++-----------------------------
 2 files changed, 19 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index dde952227b88..5f7b9de97e8d 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -44,11 +44,11 @@ bool kho_is_enabled(void);
 bool is_kho_boot(void);
 
 int kho_preserve_folio(struct folio *folio);
-int kho_unpreserve_folio(struct folio *folio);
+void kho_unpreserve_folio(struct folio *folio);
 int kho_preserve_pages(struct page *page, unsigned int nr_pages);
-int kho_unpreserve_pages(struct page *page, unsigned int nr_pages);
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages);
 int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
-int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation);
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation);
 void *kho_alloc_preserve(size_t size);
 void kho_unpreserve_free(void *mem);
 void kho_restore_free(void *mem);
@@ -79,20 +79,14 @@ static inline int kho_preserve_folio(struct folio *folio)
 	return -EOPNOTSUPP;
 }
 
-static inline int kho_unpreserve_folio(struct folio *folio)
-{
-	return -EOPNOTSUPP;
-}
+static inline void kho_unpreserve_folio(struct folio *folio) { }
 
 static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
-{
-	return -EOPNOTSUPP;
-}
+static inline void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) { }
 
 static inline int kho_preserve_vmalloc(void *ptr,
 				       struct kho_vmalloc *preservation)
@@ -100,10 +94,7 @@ static inline int kho_preserve_vmalloc(void *ptr,
 	return -EOPNOTSUPP;
 }
 
-static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
-{
-	return -EOPNOTSUPP;
-}
+static inline void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) { }
 
 static inline void *kho_alloc_preserve(size_t size)
 {
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 4596e67de832..a7f876ece445 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -185,10 +185,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
 	const unsigned long pfn_high = pfn >> order;
 
 	might_sleep();
-
-	if (kho_out.finalized)
-		return -EBUSY;
-
 	physxa = xa_load(&track->orders, order);
 	if (!physxa) {
 		int err;
@@ -807,20 +803,14 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
  * Instructs KHO to unpreserve a folio that was preserved by
  * kho_preserve_folio() before. The provided @folio (pfn and order)
  * must exactly match a previously preserved folio.
- *
- * Return: 0 on success, error code on failure
  */
-int kho_unpreserve_folio(struct folio *folio)
+void kho_unpreserve_folio(struct folio *folio)
 {
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
 	struct kho_mem_track *track = &kho_out.track;
 
-	if (kho_out.finalized)
-		return -EBUSY;
-
 	__kho_unpreserve_order(track, pfn, order);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
 
@@ -877,21 +867,14 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages);
  * This must be called with the same @page and @nr_pages as the corresponding
  * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
  * preserved blocks is not supported.
- *
- * Return: 0 on success, error code on failure
  */
-int kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
 {
 	struct kho_mem_track *track = &kho_out.track;
 	const unsigned long start_pfn = page_to_pfn(page);
 	const unsigned long end_pfn = start_pfn + nr_pages;
 
-	if (kho_out.finalized)
-		return -EBUSY;
-
 	__kho_unpreserve(track, start_pfn, end_pfn);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
 
@@ -976,20 +959,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
 	}
 }
 
-static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
-{
-	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
-
-	while (chunk) {
-		struct kho_vmalloc_chunk *tmp = chunk;
-
-		kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
-
-		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
-		free_page((unsigned long)tmp);
-	}
-}
-
 /**
  * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
  * @ptr: pointer to the area in vmalloc address space
@@ -1051,7 +1020,7 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
 	return 0;
 
 err_free:
-	kho_vmalloc_free_chunks(preservation);
+	kho_unpreserve_vmalloc(preservation);
 	return err;
 }
 EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
@@ -1062,17 +1031,19 @@ EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
  *
  * Instructs KHO to unpreserve the area in vmalloc address space that was
  * previously preserved with kho_preserve_vmalloc().
- *
- * Return: 0 on success, error code on failure
  */
-int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
 {
-	if (kho_out.finalized)
-		return -EBUSY;
+	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
 
-	kho_vmalloc_free_chunks(preservation);
+	while (chunk) {
+		struct kho_vmalloc_chunk *tmp = chunk;
 
-	return 0;
+		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
+
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		free_page((unsigned long)tmp);
+	}
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
 
@@ -1221,7 +1192,7 @@ void kho_unpreserve_free(void *mem)
 		return;
 
 	folio = virt_to_folio(mem);
-	WARN_ON_ONCE(kho_unpreserve_folio(folio));
+	kho_unpreserve_folio(folio);
 	folio_put(folio);
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_free);
-- 
cgit v1.2.3


From 9e2fd062fa1713a33380cc97ef324d086dd45ba5 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:31 -0500
Subject: liveupdate: luo_core: Live Update Orchestrator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Live Update Orchestrator", v8.

This series introduces the Live Update Orchestrator, a kernel subsystem
designed to facilitate live kernel updates using a kexec-based reboot.
This capability is critical for cloud environments, allowing hypervisors
to be updated with minimal downtime for running virtual machines.  LUO
achieves this by preserving the state of selected resources, such as
memory, devices and their dependencies, across the kernel transition.

As a key feature, this series includes support for preserving memfd file
descriptors, which allows critical in-memory data, such as guest RAM or
any other large memory region, to be maintained in RAM across the kexec
reboot.

Other series that build on LUO are the VFIO [1], IOMMU [2], and PCI [3]
preservation series.

Github repo of this series [4].

The core of LUO is a framework for managing the lifecycle of preserved
resources through a userspace-driven interface. Key features include:

- Session Management
  Userspace agent (i.e. luod [5]) creates named sessions, each
  represented by a file descriptor (via centralized agent that controls
  /dev/liveupdate). The lifecycle of all preserved resources within a
  session is tied to this FD, ensuring automatic kernel cleanup if the
  controlling userspace agent crashes or exits unexpectedly.

- File Preservation
  A handler-based framework allows specific file types (demonstrated
  here with memfd) to be preserved. Handlers manage the serialization,
  restoration, and lifecycle of their specific file types.

- File-Lifecycle-Bound State
  A new mechanism for managing shared global state whose lifecycle is
  tied to the preservation of one or more files. This is crucial for
  subsystems like IOMMU or HugeTLB, where multiple file descriptors may
  depend on a single, shared underlying resource that must be preserved
  only once.

- KHO Integration
  LUO drives the Kexec Handover framework programmatically to pass its
  serialized metadata to the next kernel. The LUO state is finalized and
  added to the kexec image just before the reboot is triggered. In the
  future this step will also be removed once stateless KHO is
  merged [6].

- Userspace Interface
  Control is provided via ioctl commands on /dev/liveupdate for creating
  and retrieving sessions, as well as on session file descriptors for
  managing individual files.

- Testing
  The series includes a set of selftests, including userspace API
  validation, kexec-based lifecycle tests for various session and file
  scenarios, and a new in-kernel test module to validate the FLB logic.


Introduce LUO, a mechanism intended to facilitate kernel updates while
keeping designated devices operational across the transition (e.g., via
kexec).  The primary use case is updating hypervisors with minimal
disruption to running virtual machines.  For the userspace side of a
hypervisor update there is copyless migration; LUO is for updating the kernel.

This initial patch lays the groundwork for the LUO subsystem.

Further functionality, including the implementation of state transition
logic, integration with KHO, and hooks for subsystems and file
descriptors, will be added in subsequent patches.

Create a character device at /dev/liveupdate.

A new uAPI header, <uapi/linux/liveupdate.h>, will define the necessary
structures.  The magic number for IOCTL is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.

Link: https://lkml.kernel.org/r/20251125165850.3389713-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20251125165850.3389713-2-pasha.tatashin@soleen.com
Link: https://lore.kernel.org/all/20251018000713.677779-1-vipinsh@google.com/ [1]
Link: https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@google.com [2]
Link: https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel.org [3]
Link: https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v8 [4]
Link: https://tinyurl.com/luoddesign [5]
Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [6]
Link: https://lore.kernel.org/all/20251115233409.768044-1-pasha.tatashin@soleen.com [7]
Link: https://github.com/soleen/linux/blob/luo/v8b03/diff.v7.v8 [8]
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 include/linux/liveupdate.h                         |  35 +++++++
 include/uapi/linux/liveupdate.h                    |  46 +++++++++
 kernel/liveupdate/Kconfig                          |  21 ++++
 kernel/liveupdate/Makefile                         |   5 +
 kernel/liveupdate/luo_core.c                       | 111 +++++++++++++++++++++
 6 files changed, 220 insertions(+)
 create mode 100644 include/linux/liveupdate.h
 create mode 100644 include/uapi/linux/liveupdate.h
 create mode 100644 kernel/liveupdate/luo_core.c

(limited to 'include')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 7c527a01d1cf..7232b3544cec 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -385,6 +385,8 @@ Code  Seq#    Include File                                             Comments
 0xB8  01-02  uapi/misc/mrvl_cn10k_dpi.h                                Marvell CN10K DPI driver
 0xB8  all    uapi/linux/mshv.h                                         Microsoft Hyper-V /dev/mshv driver
                                                                        <mailto:linux-hyperv@vger.kernel.org>
+0xBA  00-0F  uapi/linux/liveupdate.h                                   Pasha Tatashin
+                                                                       <mailto:pasha.tatashin@soleen.com>
 0xC0  00-0F  linux/usb/iowarrior.h
 0xCA  00-0F  uapi/misc/cxl.h                                           Dead since 6.15
 0xCA  10-2F  uapi/misc/ocxl.h
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
new file mode 100644
index 000000000000..c6a1d6bd90cb
--- /dev/null
+++ b/include/linux/liveupdate.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#ifndef _LINUX_LIVEUPDATE_H
+#define _LINUX_LIVEUPDATE_H
+
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_LIVEUPDATE
+
+/* Return true if live update orchestrator is enabled */
+bool liveupdate_enabled(void);
+
+/* Called during kexec to tell LUO that the system has entered reboot */
+int liveupdate_reboot(void);
+
+#else /* CONFIG_LIVEUPDATE */
+
+static inline bool liveupdate_enabled(void)
+{
+	return false;
+}
+
+static inline int liveupdate_reboot(void)
+{
+	return 0;
+}
+
+#endif /* CONFIG_LIVEUPDATE */
+#endif /* _LINUX_LIVEUPDATE_H */
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
new file mode 100644
index 000000000000..df34c1642c4d
--- /dev/null
+++ b/include/uapi/linux/liveupdate.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * Userspace interface for /dev/liveupdate
+ * Live Update Orchestrator
+ *
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _UAPI_LIVEUPDATE_H
+#define _UAPI_LIVEUPDATE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ *  - ENOTTY: The IOCTL number itself is not supported at all
+ *  - E2BIG: The IOCTL number is supported, but the provided structure has
+ *    non-zero in a part the kernel does not understand.
+ *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ *    understood, however a known field has a value the kernel does not
+ *    understand or support.
+ *  - EINVAL: Everything about the IOCTL was understood, but a field is not
+ *    correct.
+ *  - ENOENT: A provided token does not exist.
+ *  - ENOMEM: Out of memory.
+ *  - EOVERFLOW: Mathematics overflowed.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define LIVEUPDATE_IOCTL_TYPE		0xBA
+
+#endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index a973a54447de..9b2515f31afb 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -51,4 +51,25 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT
 	  The default behavior can still be overridden at boot time by
 	  passing 'kho=off'.
 
+config LIVEUPDATE
+	bool "Live Update Orchestrator"
+	depends on KEXEC_HANDOVER
+	help
+	  Enable the Live Update Orchestrator. Live Update is a mechanism,
+	  typically based on kexec, that allows the kernel to be updated
+	  while keeping selected devices operational across the transition.
+	  These devices are intended to be reclaimed by the new kernel and
+	  re-attached to their original workload without requiring a device
+	  reset.
+
+	  The ability to hand over a device from the current kernel to the
+	  next one depends on specific support within device drivers and
+	  related kernel subsystems.
+
+	  This feature primarily targets virtual machine hosts to quickly update
+	  the kernel hypervisor with minimal disruption to the running virtual
+	  machines.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index f52ce1ebcf86..08954c1770c4 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -1,5 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
 
+luo-y :=								\
+		luo_core.o
+
 obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS)	+= kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE)		+= luo.o
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..30ad8836360b
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kobject.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+
+static struct {
+	bool enabled;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+	return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+	return 0;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * @return true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+	return luo_global.enabled;
+}
+
+struct luo_device_state {
+	struct miscdevice miscdev;
+};
+
+static const struct file_operations luo_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static struct luo_device_state luo_dev = {
+	.miscdev = {
+		.minor = MISC_DYNAMIC_MINOR,
+		.name  = "liveupdate",
+		.fops  = &luo_fops,
+	},
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+	if (!liveupdate_enabled())
+		return 0;
+
+	return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
-- 
cgit v1.2.3


From 1aece821004f67f46ef4db7199bbeca87cf22bdd Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:32 -0500
Subject: liveupdate: luo_core: integrate with KHO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integrate the LUO with the KHO framework to enable passing LUO state
across a kexec reboot.

This patch implements the lifecycle integration with KHO:

1. Incoming State: During early boot (`early_initcall`), LUO checks if
   KHO is active. If so, it retrieves the "LUO" subtree, verifies the
   "luo-v1" compatibility string, and reads the `liveupdate-number` to
   track the update count.

2. Outgoing State: During late initialization (`late_initcall`), LUO
   allocates a new FDT for the next kernel, populates it with the basic
   header (compatible string and incremented update number), and
   registers it with KHO (`kho_add_subtree`).

3. Finalization: The `liveupdate_reboot()` notifier is updated to invoke
   `kho_finalize()`. This ensures that all memory segments marked for
   preservation are properly serialized before the kexec jump.

Link: https://lkml.kernel.org/r/20251125165850.3389713-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kho/abi/luo.h      |  58 +++++++++++++++
 kernel/liveupdate/luo_core.c     | 154 ++++++++++++++++++++++++++++++++++++++-
 kernel/liveupdate/luo_internal.h |  22 ++++++
 3 files changed, 233 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kho/abi/luo.h
 create mode 100644 kernel/liveupdate/luo_internal.h

(limited to 'include')

diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h
new file mode 100644
index 000000000000..2099b51929e5
--- /dev/null
+++ b/include/linux/kho/abi/luo.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator ABI
+ *
+ * This header defines the stable Application Binary Interface used by the
+ * Live Update Orchestrator to pass state from a pre-update kernel to a
+ * post-update kernel. The ABI is built upon the Kexec HandOver framework
+ * and uses a Flattened Device Tree to describe the preserved data.
+ *
+ * This interface is a contract. Any modification to the FDT structure, node
+ * properties, compatible strings, or the layout of the `__packed` serialization
+ * structures defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the relevant `_COMPATIBLE` string to
+ * prevent a new kernel from misinterpreting data from an old kernel.
+ *
+ * Changes are allowed provided the compatibility version is incremented;
+ * however, backward/forward compatibility is only guaranteed for kernels
+ * supporting the same ABI version.
+ *
+ * FDT Structure Overview:
+ *   The entire LUO state is encapsulated within a single KHO entry named "LUO".
+ *   This entry contains an FDT with the following layout:
+ *
+ *   .. code-block:: none
+ *
+ *     / {
+ *         compatible = "luo-v1";
+ *         liveupdate-number = <...>;
+ *     };
+ *
+ * Main LUO Node (/):
+ *
+ *   - compatible: "luo-v1"
+ *     Identifies the overall LUO ABI version.
+ *   - liveupdate-number: u64
+ *     A counter tracking the number of successful live updates performed.
+ */
+
+#ifndef _LINUX_KHO_ABI_LUO_H
+#define _LINUX_KHO_ABI_LUO_H
+
+/*
+ * The LUO FDT hooks all LUO state for sessions, fds, etc.
+ * In the root it also carries "liveupdate-number" 64-bit property that
+ * corresponds to the number of live-updates performed on this machine.
+ */
+#define LUO_FDT_SIZE		PAGE_SIZE
+#define LUO_FDT_KHO_ENTRY_NAME	"LUO"
+#define LUO_FDT_COMPATIBLE	"luo-v1"
+#define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
+
+#endif /* _LINUX_KHO_ABI_LUO_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 30ad8836360b..9f9fe9a81b29 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -41,12 +41,26 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
 #include <linux/kobject.h>
+#include <linux/libfdt.h>
 #include <linux/liveupdate.h>
 #include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
 
 static struct {
 	bool enabled;
+	void *fdt_out;
+	void *fdt_in;
+	u64 liveupdate_num;
 } luo_global;
 
 static int __init early_liveupdate_param(char *buf)
@@ -55,6 +69,129 @@ static int __init early_liveupdate_param(char *buf)
 }
 early_param("liveupdate", early_liveupdate_param);
 
+static int __init luo_early_startup(void)
+{
+	phys_addr_t fdt_phys;
+	int err, ln_size;
+	const void *ptr;
+
+	if (!kho_is_enabled()) {
+		if (liveupdate_enabled())
+			pr_warn("Disabling liveupdate because KHO is disabled\n");
+		luo_global.enabled = false;
+		return 0;
+	}
+
+	/* Retrieve LUO subtree, and verify its format. */
+	err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+	if (err) {
+		if (err != -ENOENT) {
+			pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+			       LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+			return err;
+		}
+
+		return 0;
+	}
+
+	luo_global.fdt_in = phys_to_virt(fdt_phys);
+	err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+					LUO_FDT_COMPATIBLE);
+	if (err) {
+		pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+		       LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+		return -EINVAL;
+	}
+
+	ln_size = 0;
+	ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+			  &ln_size);
+	if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+		pr_err("Unable to get live update number '%s' [%d]\n",
+		       LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+		return -EINVAL;
+	}
+
+	luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+	pr_info("Retrieved live update data, liveupdate number: %lld\n",
+		luo_global.liveupdate_num);
+
+	return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+	int err;
+
+	err = luo_early_startup();
+	if (err) {
+		luo_global.enabled = false;
+		luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+				 ERR_PTR(err));
+	}
+
+	return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+	const u64 ln = luo_global.liveupdate_num + 1;
+	void *fdt_out;
+	int err;
+
+	fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+	if (IS_ERR(fdt_out)) {
+		pr_err("failed to allocate/preserve FDT memory\n");
+		return PTR_ERR(fdt_out);
+	}
+
+	err = fdt_create(fdt_out, LUO_FDT_SIZE);
+	err |= fdt_finish_reservemap(fdt_out);
+	err |= fdt_begin_node(fdt_out, "");
+	err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+	err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+	err |= fdt_end_node(fdt_out);
+	err |= fdt_finish(fdt_out);
+	if (err)
+		goto exit_free;
+
+	err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+	if (err)
+		goto exit_free;
+	luo_global.fdt_out = fdt_out;
+
+	return 0;
+
+exit_free:
+	kho_unpreserve_free(fdt_out);
+	pr_err("failed to prepare LUO FDT: %d\n", err);
+
+	return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = luo_fdt_setup();
+	if (err)
+		luo_global.enabled = false;
+
+	return err;
+}
+late_initcall(luo_late_startup);
+
 /* Public Functions */
 
 /**
@@ -69,7 +206,22 @@ early_param("liveupdate", early_liveupdate_param);
  */
 int liveupdate_reboot(void)
 {
-	return 0;
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = kho_finalize();
+	if (err) {
+		pr_err("kho_finalize failed %d\n", err);
+		/*
+		 * kho_finalize() may return libfdt errors; to avoid passing
+		 * unknown errors to userspace, change this to -EAGAIN.
+		 */
+		err = -EAGAIN;
+	}
+
+	return err;
 }
 
 /**
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..8612687b2000
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+
+/*
+ * Handles a deserialization failure: devices and memory are in an
+ * unpredictable state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+#endif /* _LINUX_LUO_INTERNAL_H */
-- 
cgit v1.2.3


From 0153094d03df5a2e834a19c59b255649a258ae46 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:34 -0500
Subject: liveupdate: luo_session: add sessions support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the concept of "Live Update Sessions" within the LUO framework.
LUO sessions provide a mechanism to group and manage `struct file *`
instances (representing file descriptors) that need to be preserved across
a kexec-based live update.

Each session is identified by a unique name and acts as a container for
file objects whose state is critical to a userspace workload, such as a
virtual machine or a high-performance database, aiming to maintain their
functionality across a kernel transition.

This groundwork establishes the framework for preserving file-backed state
across kernel updates, with the actual file data preservation mechanisms
to be implemented in subsequent patches.

[dan.carpenter@linaro.org: fix use after free in luo_session_deserialize()]
  Link: https://lkml.kernel.org/r/c5dd637d7eed3a3be48c5e9fedb881596a3b1f5a.1764163896.git.dan.carpenter@linaro.org
Link: https://lkml.kernel.org/r/20251125165850.3389713-5-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kho/abi/luo.h      |  71 ++++++
 include/uapi/linux/liveupdate.h  |   3 +
 kernel/liveupdate/Makefile       |   3 +-
 kernel/liveupdate/luo_core.c     |   9 +
 kernel/liveupdate/luo_internal.h |  29 +++
 kernel/liveupdate/luo_session.c  | 463 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 577 insertions(+), 1 deletion(-)
 create mode 100644 kernel/liveupdate/luo_session.c

(limited to 'include')

diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h
index 2099b51929e5..bf1ab2910959 100644
--- a/include/linux/kho/abi/luo.h
+++ b/include/linux/kho/abi/luo.h
@@ -32,6 +32,11 @@
  *     / {
  *         compatible = "luo-v1";
  *         liveupdate-number = <...>;
+ *
+ *         luo-session {
+ *             compatible = "luo-session-v1";
+ *             luo-session-header = <phys_addr_of_session_header_ser>;
+ *         };
  *     };
  *
  * Main LUO Node (/):
@@ -40,11 +45,37 @@
  *     Identifies the overall LUO ABI version.
  *   - liveupdate-number: u64
  *     A counter tracking the number of successful live updates performed.
+ *
+ * Session Node (luo-session):
+ *   This node describes all preserved user-space sessions.
+ *
+ *   - compatible: "luo-session-v1"
+ *     Identifies the session ABI version.
+ *   - luo-session-header: u64
+ *     The physical address of a `struct luo_session_header_ser`. This structure
+ *     is the header for a contiguous block of memory containing an array of
+ *     `struct luo_session_ser`, one for each preserved session.
+ *
+ * Serialization Structures:
+ *   The FDT properties point to memory regions containing arrays of simple,
+ *   `__packed` structures. These structures contain the actual preserved state.
+ *
+ *   - struct luo_session_header_ser:
+ *     Header for the session array. Contains the number of
+ *     `struct luo_session_ser` entries that follow it in the preserved
+ *     memory block.
+ *
+ *   - struct luo_session_ser:
+ *     Metadata for a single session, including its name and a physical pointer
+ *     to another preserved memory block containing an array of
+ *     `struct luo_file_ser` for all files in that session.
  */
 
 #ifndef _LINUX_KHO_ABI_LUO_H
 #define _LINUX_KHO_ABI_LUO_H
 
+#include <uapi/linux/liveupdate.h>
+
 /*
  * The LUO FDT hooks all LUO state for sessions, fds, etc.
  * In the root it also carries "liveupdate-number" 64-bit property that
@@ -55,4 +86,44 @@
 #define LUO_FDT_COMPATIBLE	"luo-v1"
 #define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
 
+/*
+ * LUO FDT session node
+ * LUO_FDT_SESSION_HEADER:  is a u64 physical address of struct
+ *                          luo_session_header_ser
+ */
+#define LUO_FDT_SESSION_NODE_NAME	"luo-session"
+#define LUO_FDT_SESSION_COMPATIBLE	"luo-session-v1"
+#define LUO_FDT_SESSION_HEADER		"luo-session-header"
+
+/**
+ * struct luo_session_header_ser - Header for the serialized session data block.
+ * @count: The number of `struct luo_session_ser` entries that immediately
+ *         follow this header in the memory block.
+ *
+ * This structure is located at the beginning of a contiguous block of
+ * physical memory preserved across the kexec. It provides the necessary
+ * metadata to interpret the array of session entries that follow.
+ *
+ * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated.
+ */
+struct luo_session_header_ser {
+	u64 count;
+} __packed;
+
+/**
+ * struct luo_session_ser - Represents the serialized metadata for a LUO session.
+ * @name:         The unique name of the session, provided by the userspace at
+ *                the time of session creation.
+ *
+ * This structure is used to package session-specific metadata for transfer
+ * between kernels via Kexec Handover. An array of these structures (one per
+ * session) is created and passed to the new kernel, allowing it to reconstruct
+ * the session context.
+ *
+ * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated.
+ */
+struct luo_session_ser {
+	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+} __packed;
+
 #endif /* _LINUX_KHO_ABI_LUO_H */
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
index df34c1642c4d..40578ae19668 100644
--- a/include/uapi/linux/liveupdate.h
+++ b/include/uapi/linux/liveupdate.h
@@ -43,4 +43,7 @@
 /* The ioctl type, documented in ioctl-number.rst */
 #define LIVEUPDATE_IOCTL_TYPE		0xBA
 
+/* The maximum length of session name including null termination */
+#define LIVEUPDATE_SESSION_NAME_LENGTH 64
+
 #endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index 08954c1770c4..6af93caa58cf 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
 luo-y :=								\
-		luo_core.o
+		luo_core.o						\
+		luo_session.o
 
 obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 9f9fe9a81b29..a0f7788cd003 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -118,6 +118,10 @@ static int __init luo_early_startup(void)
 	pr_info("Retrieved live update data, liveupdate number: %lld\n",
 		luo_global.liveupdate_num);
 
+	err = luo_session_setup_incoming(luo_global.fdt_in);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -154,6 +158,7 @@ static int __init luo_fdt_setup(void)
 	err |= fdt_begin_node(fdt_out, "");
 	err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
 	err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+	err |= luo_session_setup_outgoing(fdt_out);
 	err |= fdt_end_node(fdt_out);
 	err |= fdt_finish(fdt_out);
 	if (err)
@@ -211,6 +216,10 @@ int liveupdate_reboot(void)
 	if (!liveupdate_enabled())
 		return 0;
 
+	err = luo_session_serialize();
+	if (err)
+		return err;
+
 	err = kho_finalize();
 	if (err) {
 		pr_err("kho_finalize failed %d\n", err);
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 8612687b2000..05ae91695ec6 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -19,4 +19,33 @@
  */
 #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
 
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name:       A unique name for this session, used for identification and
+ *              retrieval.
+ * @ser:        Pointer to the serialized data for this session.
+ * @list:       A list_head member used to link this session into a global list
+ *              of either outgoing (to be preserved) or incoming (restored from
+ *              previous kernel) sessions.
+ * @retrieved:  A boolean flag indicating whether this session has been
+ *              retrieved by a consumer in the new kernel.
+ * @mutex:      protects fields in the luo_session.
+ */
+struct luo_session {
+	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+	struct luo_session_ser *ser;
+	struct list_head list;
+	bool retrieved;
+	struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
 #endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..3a031446d3a4
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ *   which is used for both creation in the current kernel and retrieval in the
+ *   next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ *   ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ *   a live update is triggered via kexec, an array of `struct luo_session_ser`
+ *   is populated and placed in a preserved memory region. An FDT node is also
+ *   created, containing the physical address of the header that precedes this
+ *   array and records the count of sessions.
+ *
+ * Session Lifecycle:
+ *
+ * 1.  Creation: A userspace agent calls `luo_session_create()` to create a
+ *     new, empty session and receives a file descriptor for it.
+ *
+ * 2.  Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ *     made, `luo_session_serialize()` is called. It iterates through all
+ *     active sessions and writes their metadata into a memory area preserved
+ *     by KHO.
+ *
+ * 3.  Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ *     runs, reading the serialized data and creating a list of `struct
+ *     luo_session` objects representing the preserved sessions.
+ *
+ * 4.  Retrieval: A userspace agent in the new kernel can then call
+ *     `luo_session_retrieve()` with a session name to get a new file
+ *     descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages, give space for 1023 sessions (64-byte entries, 8-byte header) */
+#define LUO_SESSION_PGCNT	16ul
+#define LUO_SESSION_MAX		(((LUO_SESSION_PGCNT << PAGE_SHIFT) -	\
+		sizeof(struct luo_session_header_ser)) /		\
+		sizeof(struct luo_session_ser))
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count:      The number of sessions currently tracked in the @list.
+ * @list:       The head of the linked list of `struct luo_session` instances.
+ * @rwsem:      A read-write semaphore providing synchronized access to the
+ *              session list and other fields in this structure.
+ * @header_ser: The header data of serialization array.
+ * @ser:        The serialized session data (an array of
+ *              `struct luo_session_ser`).
+ * @active:     Set to true when first initialized. If previous kernel did not
+ *              send session data, active stays false for incoming.
+ */
+struct luo_session_header {
+	long count;
+	struct list_head list;
+	struct rw_semaphore rwsem;
+	struct luo_session_header_ser *header_ser;
+	struct luo_session_ser *ser;
+	bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming:     The sessions passed from the previous kernel.
+ * @outgoing:     The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+	struct luo_session_header incoming;
+	struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+	.incoming = {
+		.list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+		.rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+	},
+	.outgoing = {
+		.list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+		.rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+	},
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+	struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+	if (!session)
+		return ERR_PTR(-ENOMEM);
+
+	strscpy(session->name, name, sizeof(session->name));
+	INIT_LIST_HEAD(&session->list);
+	mutex_init(&session->mutex);
+
+	return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+	mutex_destroy(&session->mutex);
+	kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+			      struct luo_session *session)
+{
+	struct luo_session *it;
+
+	guard(rwsem_write)(&sh->rwsem);
+
+	/*
+	 * For outgoing we should make sure there is room in serialization array
+	 * for new session.
+	 */
+	if (sh == &luo_session_global.outgoing) {
+		if (sh->count == LUO_SESSION_MAX)
+			return -ENOMEM;
+	}
+
+	/*
+	 * For small number of sessions this loop won't hurt performance
+	 * but if we ever start using a lot of sessions, this might
+	 * become a bottleneck during deserialization time, as it would
+	 * cause O(n*n) complexity.
+	 */
+	list_for_each_entry(it, &sh->list, list) {
+		if (!strncmp(it->name, session->name, sizeof(it->name)))
+			return -EEXIST;
+	}
+	list_add_tail(&session->list, &sh->list);
+	sh->count++;
+
+	return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+			       struct luo_session *session)
+{
+	guard(rwsem_write)(&sh->rwsem);
+	list_del(&session->list);
+	sh->count--;
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+	struct luo_session *session = filep->private_data;
+	struct luo_session_header *sh;
+
+	/* If retrieved is set, it means this session is from incoming list */
+	if (session->retrieved)
+		sh = &luo_session_global.incoming;
+	else
+		sh = &luo_session_global.outgoing;
+
+	luo_session_remove(sh, session);
+	luo_session_free(session);
+
+	return 0;
+}
+
+static const struct file_operations luo_session_fops = {
+	.owner = THIS_MODULE,
+	.release = luo_session_release,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+	char name_buf[128];
+	struct file *file;
+
+	lockdep_assert_held(&session->mutex);
+	snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+	file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	*filep = file;
+
+	return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+	struct luo_session *session;
+	int err;
+
+	session = luo_session_alloc(name);
+	if (IS_ERR(session))
+		return PTR_ERR(session);
+
+	err = luo_session_insert(&luo_session_global.outgoing, session);
+	if (err)
+		goto err_free;
+
+	scoped_guard(mutex, &session->mutex)
+		err = luo_session_getfile(session, filep);
+	if (err)
+		goto err_remove;
+
+	return 0;
+
+err_remove:
+	luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+	luo_session_free(session);
+
+	return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+	struct luo_session_header *sh = &luo_session_global.incoming;
+	struct luo_session *session = NULL;
+	struct luo_session *it;
+	int err;
+
+	scoped_guard(rwsem_read, &sh->rwsem) {
+		list_for_each_entry(it, &sh->list, list) {
+			if (!strncmp(it->name, name, sizeof(it->name))) {
+				session = it;
+				break;
+			}
+		}
+	}
+
+	if (!session)
+		return -ENOENT;
+
+	guard(mutex)(&session->mutex);
+	if (session->retrieved)
+		return -EINVAL;
+
+	err = luo_session_getfile(session, filep);
+	if (!err)
+		session->retrieved = true;
+
+	return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+	struct luo_session_header_ser *header_ser;
+	u64 header_ser_pa;
+	int err;
+
+	header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+	if (IS_ERR(header_ser))
+		return PTR_ERR(header_ser);
+	header_ser_pa = virt_to_phys(header_ser);
+
+	err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+	err |= fdt_property_string(fdt_out, "compatible",
+				   LUO_FDT_SESSION_COMPATIBLE);
+	err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+			    sizeof(header_ser_pa));
+	err |= fdt_end_node(fdt_out);
+
+	if (err)
+		goto err_unpreserve;
+
+	luo_session_global.outgoing.header_ser = header_ser;
+	luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+	luo_session_global.outgoing.active = true;
+
+	return 0;
+
+err_unpreserve:
+	kho_unpreserve_free(header_ser);
+	return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+	struct luo_session_header_ser *header_ser;
+	int err, header_size, offset;
+	u64 header_ser_pa;
+	const void *ptr;
+
+	offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+	if (offset < 0) {
+		pr_err("Unable to get session node: [%s]\n",
+		       LUO_FDT_SESSION_NODE_NAME);
+		return -EINVAL;
+	}
+
+	err = fdt_node_check_compatible(fdt_in, offset,
+					LUO_FDT_SESSION_COMPATIBLE);
+	if (err) {
+		pr_err("Session node incompatible [%s]\n",
+		       LUO_FDT_SESSION_COMPATIBLE);
+		return -EINVAL;
+	}
+
+	header_size = 0;
+	ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+	if (!ptr || header_size != sizeof(u64)) {
+		pr_err("Unable to get session header '%s' [%d]\n",
+		       LUO_FDT_SESSION_HEADER, header_size);
+		return -EINVAL;
+	}
+
+	header_ser_pa = get_unaligned((u64 *)ptr);
+	header_ser = phys_to_virt(header_ser_pa);
+
+	luo_session_global.incoming.header_ser = header_ser;
+	luo_session_global.incoming.ser = (void *)(header_ser + 1);
+	luo_session_global.incoming.active = true;
+
+	return 0;
+}
+
+int luo_session_deserialize(void)
+{
+	struct luo_session_header *sh = &luo_session_global.incoming;
+	static bool is_deserialized;
+	static int err;
+
+	/* If has been deserialized, always return the same error code */
+	if (is_deserialized)
+		return err;
+
+	is_deserialized = true;
+	if (!sh->active)
+		return 0;
+
+	/*
+	 * Note on error handling:
+	 *
+	 * If deserialization fails (e.g., allocation failure or corrupt data),
+	 * we intentionally skip cleanup of sessions that were already restored.
+	 *
+	 * A partial failure leaves the preserved state inconsistent.
+	 * Implementing a safe "undo" to unwind complex dependencies (sessions,
+	 * files, hardware state) is error-prone and provides little value, as
+	 * the system is effectively in a broken state.
+	 *
+	 * We treat these resources as leaked. The expected recovery path is for
+	 * userspace to detect the failure and trigger a reboot, which will
+	 * reliably reset devices and reclaim memory.
+	 */
+	for (int i = 0; i < sh->header_ser->count; i++) {
+		struct luo_session *session;
+
+		session = luo_session_alloc(sh->ser[i].name);
+		if (IS_ERR(session)) {
+			pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+				sh->ser[i].name, session);
+			return PTR_ERR(session);
+		}
+
+		err = luo_session_insert(sh, session);
+		if (err) {
+			pr_warn("Failed to insert session [%s] %pe\n",
+				session->name, ERR_PTR(err));
+			luo_session_free(session);
+			return err;
+		}
+	}
+
+	kho_restore_free(sh->header_ser);
+	sh->header_ser = NULL;
+	sh->ser = NULL;
+
+	return 0;
+}
+
+int luo_session_serialize(void)
+{
+	struct luo_session_header *sh = &luo_session_global.outgoing;
+	struct luo_session *session;
+	int i = 0;
+
+	guard(rwsem_write)(&sh->rwsem);
+	list_for_each_entry(session, &sh->list, list) {
+		strscpy(sh->ser[i].name, session->name,
+			sizeof(sh->ser[i].name));
+		i++;
+	}
+	sh->header_ser->count = sh->count;
+
+	return 0;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both incoming and outgoing session lists.
+ * It then validates no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ *
+ * Return:
+ * true  - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released internally.
+ */
+bool luo_session_quiesce(void)
+{
+	down_write(&luo_session_global.incoming.rwsem);
+	down_write(&luo_session_global.outgoing.rwsem);
+
+	if (luo_session_global.incoming.count ||
+	    luo_session_global.outgoing.count) {
+		up_write(&luo_session_global.outgoing.rwsem);
+		up_write(&luo_session_global.incoming.rwsem);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+	up_write(&luo_session_global.outgoing.rwsem);
+	up_write(&luo_session_global.incoming.rwsem);
+}
-- 
cgit v1.2.3


From 81cd25d263a182b3dcdc8af3b92e4b8e4db336de Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:35 -0500
Subject: liveupdate: luo_core: add user interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the user-space interface for the Live Update Orchestrator via
ioctl commands, enabling external control over the live update process and
management of preserved resources.

The idea is that there is going to be a single userspace agent driving the
live update, therefore, only a single process can ever hold this device
opened at a time.

The following ioctl commands are introduced:

LIVEUPDATE_IOCTL_CREATE_SESSION
Provides a way for userspace to create a named session for grouping file
descriptors that need to be preserved. It returns a new file descriptor
representing the session.

LIVEUPDATE_IOCTL_RETRIEVE_SESSION
Allows the userspace agent in the new kernel to reclaim a preserved
session by its name, receiving a new file descriptor to manage the
restored resources.

Link: https://lkml.kernel.org/r/20251125165850.3389713-6-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/liveupdate.h  |  64 ++++++++++++++
 kernel/liveupdate/luo_core.c     | 178 +++++++++++++++++++++++++++++++++++++++
 kernel/liveupdate/luo_internal.h |  21 +++++
 3 files changed, 263 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
index 40578ae19668..1183cf984b5f 100644
--- a/include/uapi/linux/liveupdate.h
+++ b/include/uapi/linux/liveupdate.h
@@ -46,4 +46,68 @@
 /* The maximum length of session name including null termination */
 #define LIVEUPDATE_SESSION_NAME_LENGTH 64
 
+/* The /dev/liveupdate ioctl commands */
+enum {
+	LIVEUPDATE_CMD_BASE = 0x00,
+	LIVEUPDATE_CMD_CREATE_SESSION = LIVEUPDATE_CMD_BASE,
+	LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01,
+};
+
+/**
+ * struct liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION)
+ * @size:	Input; sizeof(struct liveupdate_ioctl_create_session)
+ * @fd:		Output; The new file descriptor for the created session.
+ * @name:	Input; A null-terminated string for the session name, max
+ *		length %LIVEUPDATE_SESSION_NAME_LENGTH including termination
+ *		character.
+ *
+ * Creates a new live update session for managing preserved resources.
+ * This ioctl can only be called on the main /dev/liveupdate device.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+struct liveupdate_ioctl_create_session {
+	__u32		size;
+	__s32		fd;
+	__u8		name[LIVEUPDATE_SESSION_NAME_LENGTH];
+};
+
+#define LIVEUPDATE_IOCTL_CREATE_SESSION					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_CREATE_SESSION)
+
+/**
+ * struct liveupdate_ioctl_retrieve_session - ioctl(LIVEUPDATE_IOCTL_RETRIEVE_SESSION)
+ * @size:    Input; sizeof(struct liveupdate_ioctl_retrieve_session)
+ * @fd:      Output; The new file descriptor for the retrieved session.
+ * @name:    Input; A null-terminated string identifying the session to retrieve.
+ *           The name must exactly match the name used when the session was
+ *           created in the previous kernel.
+ *
+ * Retrieves a handle (a new file descriptor) for a preserved session by its
+ * name. This is the primary mechanism for a userspace agent to regain control
+ * of its preserved resources after a live update.
+ *
+ * The userspace application provides the null-terminated `name` of a session
+ * it created before the live update. If a preserved session with a matching
+ * name is found, the kernel instantiates it and returns a new file descriptor
+ * in the `fd` field. This new session FD can then be used for all file-specific
+ * operations, such as restoring individual file descriptors with
+ * LIVEUPDATE_SESSION_RETRIEVE_FD.
+ *
+ * It is the responsibility of the userspace application to know the names of
+ * the sessions it needs to retrieve. If no session with the given name is
+ * found, the ioctl will fail with -ENOENT.
+ *
+ * This ioctl can only be called on the main /dev/liveupdate device when the
+ * system is in the LIVEUPDATE_STATE_UPDATED state.
+ */
+struct liveupdate_ioctl_retrieve_session {
+	__u32		size;
+	__s32		fd;
+	__u8		name[LIVEUPDATE_SESSION_NAME_LENGTH];
+};
+
+#define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION)
+
 #endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index a0f7788cd003..f7ecaf7740d1 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -41,7 +41,13 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/kexec_handover.h>
 #include <linux/kho/abi/luo.h>
 #include <linux/kobject.h>
@@ -246,12 +252,183 @@ bool liveupdate_enabled(void)
 	return luo_global.enabled;
 }
 
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
+
 struct luo_device_state {
 	struct miscdevice miscdev;
+	atomic_t in_use;
 };
 
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+	struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+	struct file *file;
+	int err;
+
+	argp->fd = get_unused_fd_flags(O_CLOEXEC);
+	if (argp->fd < 0)
+		return argp->fd;
+
+	err = luo_session_create(argp->name, &file);
+	if (err)
+		goto err_put_fd;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		goto err_put_file;
+
+	fd_install(argp->fd, file);
+
+	return 0;
+
+err_put_file:
+	fput(file);
+err_put_fd:
+	put_unused_fd(argp->fd);
+
+	return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+	struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+	struct file *file;
+	int err;
+
+	argp->fd = get_unused_fd_flags(O_CLOEXEC);
+	if (argp->fd < 0)
+		return argp->fd;
+
+	err = luo_session_retrieve(argp->name, &file);
+	if (err < 0)
+		goto err_put_fd;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		goto err_put_file;
+
+	fd_install(argp->fd, file);
+
+	return 0;
+
+err_put_file:
+	fput(file);
+err_put_fd:
+	put_unused_fd(argp->fd);
+
+	return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+	struct luo_device_state *ldev = container_of(filep->private_data,
+						     struct luo_device_state,
+						     miscdev);
+
+	if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+		return -EBUSY;
+
+	/* Always return -EIO to user if deserialization fails */
+	if (luo_session_deserialize()) {
+		atomic_set(&ldev->in_use, 0);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+	struct luo_device_state *ldev = container_of(filep->private_data,
+						     struct luo_device_state,
+						     miscdev);
+	atomic_set(&ldev->in_use, 0);
+
+	return 0;
+}
+
+union ucmd_buffer {
+	struct liveupdate_ioctl_create_session create;
+	struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+	unsigned int size;
+	unsigned int min_size;
+	unsigned int ioctl_num;
+	int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
+	[_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = {                            \
+		.size = sizeof(_struct) +                                      \
+			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
+					  sizeof(_struct)),                    \
+		.min_size = offsetofend(_struct, _last),                       \
+		.ioctl_num = _ioctl,                                           \
+		.execute = _fn,                                                \
+	}
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+	IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+		 struct liveupdate_ioctl_create_session, name),
+	IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+		 struct liveupdate_ioctl_retrieve_session, name),
+};
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	const struct luo_ioctl_op *op;
+	struct luo_ucmd ucmd = {};
+	union ucmd_buffer buf;
+	unsigned int nr;
+	int err;
+
+	nr = _IOC_NR(cmd);
+	if (nr < LIVEUPDATE_CMD_BASE ||
+	    (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+		return -EINVAL;
+	}
+
+	ucmd.ubuffer = (void __user *)arg;
+	err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+	if (err)
+		return err;
+
+	op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+	if (op->ioctl_num != cmd)
+		return -ENOIOCTLCMD;
+	if (ucmd.user_size < op->min_size)
+		return -EINVAL;
+
+	ucmd.cmd = &buf;
+	err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+				    ucmd.user_size);
+	if (err)
+		return err;
+
+	return op->execute(&ucmd);
+}
+
 static const struct file_operations luo_fops = {
 	.owner		= THIS_MODULE,
+	.open		= luo_open,
+	.release	= luo_release,
+	.unlocked_ioctl	= luo_ioctl,
 };
 
 static struct luo_device_state luo_dev = {
@@ -260,6 +437,7 @@ static struct luo_device_state luo_dev = {
 		.name  = "liveupdate",
 		.fops  = &luo_fops,
 	},
+	.in_use = ATOMIC_INIT(0),
 };
 
 static int __init liveupdate_ioctl_init(void)
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 05ae91695ec6..1292ac47eef8 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -9,6 +9,27 @@
 #define _LINUX_LUO_INTERNAL_H
 
 #include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+	void __user *ubuffer;
+	u32 user_size;
+	void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+				   size_t kernel_cmd_size)
+{
+	/*
+	 * Copy the minimum of what the user provided and what we actually
+	 * have.
+	 */
+	if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+			 min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+		return -EFAULT;
+	}
+	return 0;
+}
 
 /*
  * Handles a deserialization failure: devices and memory is in unpredictable
-- 
cgit v1.2.3


From 7c722a7f44e0c1f9714084152226bc7bd644b7e3 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:36 -0500
Subject: liveupdate: luo_file: implement file systems callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements the core mechanism for managing preserved files
throughout the live update lifecycle.  It provides the logic to invoke the
file handler callbacks (preserve, unpreserve, freeze, unfreeze, retrieve,
and finish) at the appropriate stages.

During the reboot phase, luo_file_freeze() serializes the final metadata
for each file (handler compatible string, token, and data handle) into a
memory region preserved by KHO.  In the new kernel, luo_file_deserialize()
reconstructs the in-memory file list from this data, preparing the session
for retrieval.

Link: https://lkml.kernel.org/r/20251125165850.3389713-7-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kho/abi/luo.h      |  39 +-
 include/linux/liveupdate.h       |  98 +++++
 kernel/liveupdate/Makefile       |   1 +
 kernel/liveupdate/luo_file.c     | 880 +++++++++++++++++++++++++++++++++++++++
 kernel/liveupdate/luo_internal.h |  38 ++
 5 files changed, 1055 insertions(+), 1 deletion(-)
 create mode 100644 kernel/liveupdate/luo_file.c

(limited to 'include')

diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h
index bf1ab2910959..bb099c92e469 100644
--- a/include/linux/kho/abi/luo.h
+++ b/include/linux/kho/abi/luo.h
@@ -69,6 +69,11 @@
  *     Metadata for a single session, including its name and a physical pointer
  *     to another preserved memory block containing an array of
  *     `struct luo_file_ser` for all files in that session.
+ *
+ *   - struct luo_file_ser:
+ *     Metadata for a single preserved file. Contains the `compatible` string to
+ *     find the correct handler in the new kernel, a user-provided `token` for
+ *     identification, and an opaque `data` handle for the handler to use.
  */
 
 #ifndef _LINUX_KHO_ABI_LUO_H
@@ -86,13 +91,43 @@
 #define LUO_FDT_COMPATIBLE	"luo-v1"
 #define LUO_FDT_LIVEUPDATE_NUM	"liveupdate-number"
 
+#define LIVEUPDATE_HNDL_COMPAT_LENGTH	48
+
+/**
+ * struct luo_file_ser - Represents a single serialized preserved file.
+ * @compatible:  File handler compatible string.
+ * @data:        Private data
+ * @token:       User provided token for this file
+ *
+ * If this structure is modified, LUO_FDT_SESSION_COMPATIBLE must be updated.
+ */
+struct luo_file_ser {
+	char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH];
+	u64 data;
+	u64 token;
+} __packed;
+
+/**
+ * struct luo_file_set_ser - Represents the serialized metadata for file set
+ * @files:   The physical address of a contiguous memory block that holds
+ *           the serialized state of files (array of luo_file_ser) in this file
+ *           set.
+ * @count:   The total number of files that were part of this session during
+ *           serialization. Used for iteration and validation during
+ *           restoration.
+ */
+struct luo_file_set_ser {
+	u64 files;
+	u64 count;
+} __packed;
+
 /*
  * LUO FDT session node
  * LUO_FDT_SESSION_HEADER:  is a u64 physical address of struct
  *                          luo_session_header_ser
  */
 #define LUO_FDT_SESSION_NODE_NAME	"luo-session"
-#define LUO_FDT_SESSION_COMPATIBLE	"luo-session-v1"
+#define LUO_FDT_SESSION_COMPATIBLE	"luo-session-v2"
 #define LUO_FDT_SESSION_HEADER		"luo-session-header"
 
 /**
@@ -114,6 +149,7 @@ struct luo_session_header_ser {
  * struct luo_session_ser - Represents the serialized metadata for a LUO session.
  * @name:         The unique name of the session, provided by the userspace at
  *                the time of session creation.
+ * @file_set_ser: Serialized files belonging to this session.
  *
  * This structure is used to package session-specific metadata for transfer
  * between kernels via Kexec Handover. An array of these structures (one per
@@ -124,6 +160,7 @@ struct luo_session_header_ser {
  */
 struct luo_session_ser {
 	char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+	struct luo_file_set_ser file_set_ser;
 } __packed;
 
 #endif /* _LINUX_KHO_ABI_LUO_H */
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index c6a1d6bd90cb..122ad8f16ff9 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -8,8 +8,93 @@
 #define _LINUX_LIVEUPDATE_H
 
 #include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/kho/abi/luo.h>
 #include <linux/list.h>
 #include <linux/types.h>
+#include <uapi/linux/liveupdate.h>
+
+struct liveupdate_file_handler;
+struct file;
+
+/**
+ * struct liveupdate_file_op_args - Arguments for file operation callbacks.
+ * @handler:          The file handler being called.
+ * @retrieved:        The retrieve status for the 'can_finish / finish'
+ *                    operation.
+ * @file:             The file object. For retrieve: [OUT] The callback sets
+ *                    this to the new file. For other ops: [IN] The caller sets
+ *                    this to the file being operated on.
+ * @serialized_data:  The opaque u64 handle; preserve/freeze may update
+ *                    this field.
+ *
+ * This structure bundles all parameters for the file operation callbacks.
+ * The 'serialized_data' and 'file' fields are used for both input and output.
+ */
+struct liveupdate_file_op_args {
+	struct liveupdate_file_handler *handler;
+	bool retrieved;
+	struct file *file;
+	u64 serialized_data;
+};
+
+/**
+ * struct liveupdate_file_ops - Callbacks for live-updatable files.
+ * @can_preserve: Required. Lightweight check to see if this handler is
+ *                compatible with the given file.
+ * @preserve:     Required. Performs state-saving for the file.
+ * @unpreserve:   Required. Cleans up any resources allocated by @preserve.
+ * @freeze:       Optional. Final actions just before kernel transition.
+ * @unfreeze:     Optional. Undo freeze operations.
+ * @retrieve:     Required. Restores the file in the new kernel.
+ * @can_finish:   Optional. Check if this FD can finish, i.e. all restoration
+ *                pre-requirements for this FD are satisfied. Called prior to
+ *                finish, in order to do successful finish calls for all
+ *                resources in the session.
+ * @finish:       Required. Final cleanup in the new kernel.
+ * @owner:        Module reference
+ *
+ * All operations (except can_preserve) receive a pointer to a
+ * 'struct liveupdate_file_op_args' containing the necessary context.
+ */
+struct liveupdate_file_ops {
+	bool (*can_preserve)(struct liveupdate_file_handler *handler,
+			     struct file *file);
+	int (*preserve)(struct liveupdate_file_op_args *args);
+	void (*unpreserve)(struct liveupdate_file_op_args *args);
+	int (*freeze)(struct liveupdate_file_op_args *args);
+	void (*unfreeze)(struct liveupdate_file_op_args *args);
+	int (*retrieve)(struct liveupdate_file_op_args *args);
+	bool (*can_finish)(struct liveupdate_file_op_args *args);
+	void (*finish)(struct liveupdate_file_op_args *args);
+	struct module *owner;
+};
+
+/**
+ * struct liveupdate_file_handler - Represents a handler for a live-updatable file type.
+ * @ops:                Callback functions
+ * @compatible:         The compatibility string (e.g., "memfd-v1", "vfiofd-v1")
+ *                      that uniquely identifies the file type this handler
+ *                      supports. This is matched against the compatible string
+ *                      associated with individual &struct file instances.
+ *
+ * Modules that want to support live update for specific file types should
+ * register an instance of this structure. LUO uses this registration to
+ * determine if a given file can be preserved and to find the appropriate
+ * operations to manage its state across the update.
+ */
+struct liveupdate_file_handler {
+	const struct liveupdate_file_ops *ops;
+	const char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH];
+
+	/* private: */
+
+	/*
+	 * Used for linking this handler instance into a global list of
+	 * registered file handlers.
+	 */
+	struct list_head __private list;
+};
 
 #ifdef CONFIG_LIVEUPDATE
 
@@ -19,6 +104,9 @@ bool liveupdate_enabled(void);
 /* Called during kexec to tell LUO that entered into reboot */
 int liveupdate_reboot(void);
 
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh);
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
+
 #else /* CONFIG_LIVEUPDATE */
 
 static inline bool liveupdate_enabled(void)
@@ -31,5 +119,15 @@ static inline int liveupdate_reboot(void)
 	return 0;
 }
 
+static inline int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_LIVEUPDATE */
 #endif /* _LINUX_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index 6af93caa58cf..7cad2eece32d 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -2,6 +2,7 @@
 
 luo-y :=								\
 		luo_core.o						\
+		luo_file.o						\
 		luo_session.o
 
 obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
new file mode 100644
index 000000000000..e9727cb1275a
--- /dev/null
+++ b/kernel/liveupdate/luo_file.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Descriptors
+ *
+ * LUO provides the infrastructure to preserve specific, stateful file
+ * descriptors across a kexec-based live update. The primary goal is to allow
+ * workloads, such as virtual machines using vfio, memfd, or iommufd, to
+ * retain access to their essential resources without interruption.
+ *
+ * The framework is built around a callback-based handler model and a well-
+ * defined lifecycle for each preserved file.
+ *
+ * Handler Registration:
+ * Kernel modules responsible for a specific file type (e.g., memfd, vfio)
+ * register a &struct liveupdate_file_handler. This handler provides a set of
+ * callbacks that LUO invokes at different stages of the update process, most
+ * notably:
+ *
+ *   - can_preserve(): A lightweight check to determine if the handler is
+ *     compatible with a given 'struct file'.
+ *   - preserve(): The heavyweight operation that saves the file's state and
+ *     returns an opaque u64 handle. This is typically performed while the
+ *     workload is still active to minimize the downtime during the
+ *     actual reboot transition.
+ *   - unpreserve(): Cleans up any resources allocated by .preserve(), called
+ *     if the preservation process is aborted before the reboot (i.e. session is
+ *     closed).
+ *   - freeze(): A final pre-reboot opportunity to prepare the state for kexec.
+ *     We are already in reboot syscall, and therefore userspace cannot mutate
+ *     the file anymore.
+ *   - unfreeze(): Undoes the actions of .freeze(), called if the live update
+ *     is aborted after the freeze phase.
+ *   - retrieve(): Reconstructs the file in the new kernel from the preserved
+ *     handle.
+ *   - finish(): Performs final check and cleanup in the new kernel. After
+ *     successful finish call, LUO gives up ownership of this file.
+ *
+ * File Preservation Lifecycle happy path:
+ *
+ * 1. Preserve (Normal Operation): A userspace agent preserves files one by one
+ *    via an ioctl. For each file, luo_preserve_file() finds a compatible
+ *    handler, calls its .preserve() operation, and creates an internal &struct
+ *    luo_file to track the live state.
+ *
+ * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called.
+ *    It iterates through all preserved files, calls their respective .freeze()
+ *    operation, and serializes their final metadata (compatible string, token,
+ *    and data handle) into a contiguous memory block for KHO.
+ *
+ * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets
+ *    deserialized (which is when /dev/liveupdate is first opened). It reads the
+ *    serialized data from the KHO memory region and reconstructs the in-memory
+ *    list of &struct luo_file instances for the new kernel, linking them to
+ *    their corresponding handlers.
+ *
+ * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now
+ *    restore file descriptors by providing a token. luo_retrieve_file()
+ *    searches for the matching token, calls the handler's .retrieve() op to
+ *    re-create the 'struct file', and returns a new FD. Files can be
+ *    retrieved in ANY order.
+ *
+ * 5. Finish (New Kernel - Cleanup): Once a session retrieval is complete,
+ *    luo_file_finish() is called. It iterates through all files, invokes their
+ *    .finish() operations for final cleanup, and releases all associated kernel
+ *    resources.
+ *
+ * File Preservation Lifecycle unhappy paths:
+ *
+ * 1. Abort Before Reboot: If the userspace agent aborts the live update
+ *    process before calling reboot (e.g., by closing the session file
+ *    descriptor), the session's release handler calls
+ *    luo_file_unpreserve_files(). This invokes the .unpreserve() callback on
+ *    all preserved files, ensuring all allocated resources are cleaned up and
+ *    returning the system to a clean state.
+ *
+ * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze()
+ *    op fails, the .unfreeze() op is invoked on all previously *successful*
+ *    freezes to roll back their state. The reboot() syscall then returns an
+ *    error to userspace, canceling the live update.
+ *
+ * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails,
+ *    the luo_file_finish() operation is aborted. LUO retains ownership of
+ *    all files within that session, including those that were not yet
+ *    processed. The userspace agent can attempt to call the finish operation
+ *    again later. If the issue cannot be resolved, these resources will be held
+ *    by LUO until the next live update cycle, at which point they will be
+ *    discarded.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "luo_internal.h"
+
+static LIST_HEAD(luo_file_handler_list);
+
+/* 2 4K pages, give space for 128 files per file_set */
+#define LUO_FILE_PGCNT		2ul
+#define LUO_FILE_MAX							\
+	((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
+
+/**
+ * struct luo_file - Represents a single preserved file instance.
+ * @fh:            Pointer to the &struct liveupdate_file_handler that manages
+ *                 this type of file.
+ * @file:          Pointer to the kernel's &struct file that is being preserved.
+ *                 This is NULL in the new kernel until the file is successfully
+ *                 retrieved.
+ * @serialized_data: The opaque u64 handle to the serialized state of the file.
+ *                 This handle is passed back to the handler's .freeze(),
+ *                 .retrieve(), and .finish() callbacks, allowing it to track
+ *                 and update its serialized state across phases.
+ * @retrieved:     A flag indicating whether a user/kernel in the new kernel has
+ *                 successfully called retrieve() on this file. This prevents
+ *                 multiple retrieval attempts.
+ * @mutex:         A mutex that protects the fields of this specific instance
+ *                 (e.g., @retrieved, @file), ensuring that operations like
+ *                 retrieving or finishing a file are atomic.
+ * @list:          The list_head linking this instance into its parent
+ *                 file_set's list of preserved files.
+ * @token:         The user-provided unique token used to identify this file.
+ *
+ * This structure is the core in-kernel representation of a single file being
+ * managed through a live update. An instance is created by luo_preserve_file()
+ * to link a 'struct file' to its corresponding handler, a user-provided token,
+ * and the serialized state handle returned by the handler's .preserve()
+ * operation.
+ *
+ * These instances are tracked in a per-file_set list. The @serialized_data
+ * field, which holds a handle to the file's serialized state, may be updated
+ * during the .freeze() callback before being serialized for the next kernel.
+ * After reboot, these structures are recreated by luo_file_deserialize() and
+ * are finally cleaned up by luo_file_finish().
+ */
+struct luo_file {
+	struct liveupdate_file_handler *fh;
+	struct file *file;
+	u64 serialized_data;
+	bool retrieved;
+	struct mutex mutex;
+	struct list_head list;
+	u64 token;
+};
+
+static int luo_alloc_files_mem(struct luo_file_set *file_set)
+{
+	size_t size;
+	void *mem;
+
+	if (file_set->files)
+		return 0;
+
+	WARN_ON_ONCE(file_set->count);
+
+	size = LUO_FILE_PGCNT << PAGE_SHIFT;
+	mem = kho_alloc_preserve(size);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
+
+	file_set->files = mem;
+
+	return 0;
+}
+
+static void luo_free_files_mem(struct luo_file_set *file_set)
+{
+	/* If file_set has files, no need to free preservation memory */
+	if (file_set->count)
+		return;
+
+	if (!file_set->files)
+		return;
+
+	kho_unpreserve_free(file_set->files);
+	file_set->files = NULL;
+}
+
+static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
+{
+	struct luo_file *iter;
+
+	list_for_each_entry(iter, &file_set->files_list, list) {
+		if (iter->token == token)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * luo_preserve_file - Initiate the preservation of a file descriptor.
+ * @file_set: The file_set to which the preserved file will be added.
+ * @token:    A unique, user-provided identifier for the file.
+ * @fd:       The file descriptor to be preserved.
+ *
+ * This function orchestrates the first phase of preserving a file. Upon entry,
+ * it takes a reference to the 'struct file' via fget(), effectively making LUO
+ * a co-owner of the file. This reference is held until the file is either
+ * unpreserved or successfully finished in the next kernel, preventing the file
+ * from being prematurely destroyed.
+ *
+ * This function orchestrates the first phase of preserving a file. It performs
+ * the following steps:
+ *
+ * 1. Validates that the @token is not already in use within the file_set.
+ * 2. Ensures the file_set's memory for files serialization is allocated
+ *    (allocates if needed).
+ * 3. Iterates through registered handlers, calling can_preserve() to find one
+ *    compatible with the given @fd.
+ * 4. Calls the handler's .preserve() operation, which saves the file's state
+ *    and returns an opaque private data handle.
+ * 5. Adds the new instance to the file_set's internal list.
+ *
+ * On success, LUO takes a reference to the 'struct file' and considers it
+ * under its management until it is unpreserved or finished.
+ *
+ * In case of any failure, all intermediate allocations (file reference, memory
+ * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
+ *
+ * Context: Can be called from an ioctl handler during normal system operation.
+ * Return: 0 on success. Returns a negative errno on failure:
+ *         -EEXIST if the token is already used.
+ *         -EBADF if the file descriptor is invalid.
+ *         -ENOSPC if the file_set is full.
+ *         -ENOENT if no compatible handler is found.
+ *         -ENOMEM on memory allocation failure.
+ *         Other errors might be returned by .preserve().
+ */
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+{
+	struct liveupdate_file_op_args args = {0};
+	struct liveupdate_file_handler *fh;
+	struct luo_file *luo_file;
+	struct file *file;
+	int err;
+
+	if (luo_token_is_used(file_set, token))
+		return -EEXIST;
+
+	if (file_set->count == LUO_FILE_MAX)
+		return -ENOSPC;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	err = luo_alloc_files_mem(file_set);
+	if (err)
+		goto err_fput;
+
+	err = -ENOENT;
+	luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+		if (fh->ops->can_preserve(fh, file)) {
+			err = 0;
+			break;
+		}
+	}
+
+	/* err is still -ENOENT if no handler was found */
+	if (err)
+		goto err_free_files_mem;
+
+	luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+	if (!luo_file) {
+		err = -ENOMEM;
+		goto err_free_files_mem;
+	}
+
+	luo_file->file = file;
+	luo_file->fh = fh;
+	luo_file->token = token;
+	mutex_init(&luo_file->mutex);
+
+	args.handler = fh;
+	args.file = file;
+	err = fh->ops->preserve(&args);
+	if (err)
+		goto err_kfree;
+
+	luo_file->serialized_data = args.serialized_data;
+	list_add_tail(&luo_file->list, &file_set->files_list);
+	file_set->count++;
+
+	return 0;
+
+err_kfree:
+	mutex_destroy(&luo_file->mutex);
+	kfree(luo_file);
+err_free_files_mem:
+	luo_free_files_mem(file_set);
+err_fput:
+	fput(file);
+
+	return err;
+}
+
+/**
+ * luo_file_unpreserve_files - Unpreserves all files from a file_set.
+ * @file_set: The files to be cleaned up.
+ *
+ * This function serves as the primary cleanup path for a file_set. It is
+ * invoked when the userspace agent closes the file_set's file descriptor.
+ *
+ * For each file, it performs the following cleanup actions:
+ *   1. Calls the handler's .unpreserve() callback to allow the handler to
+ *      release any resources it allocated.
+ *   2. Removes the file from the file_set's internal tracking list.
+ *   3. Releases the reference to the 'struct file' that was taken by
+ *      luo_preserve_file() via fput(), returning ownership.
+ *   4. Frees the memory associated with the internal 'struct luo_file'.
+ *
+ * After all individual files are unpreserved, it frees the contiguous memory
+ * block that was allocated to hold their serialization data.
+ */
+void luo_file_unpreserve_files(struct luo_file_set *file_set)
+{
+	struct luo_file *luo_file;
+
+	while (!list_empty(&file_set->files_list)) {
+		struct liveupdate_file_op_args args = {0};
+
+		luo_file = list_last_entry(&file_set->files_list,
+					   struct luo_file, list);
+
+		args.handler = luo_file->fh;
+		args.file = luo_file->file;
+		args.serialized_data = luo_file->serialized_data;
+		luo_file->fh->ops->unpreserve(&args);
+
+		list_del(&luo_file->list);
+		file_set->count--;
+
+		fput(luo_file->file);
+		mutex_destroy(&luo_file->mutex);
+		kfree(luo_file);
+	}
+
+	luo_free_files_mem(file_set);
+}
+
+static int luo_file_freeze_one(struct luo_file_set *file_set,
+			       struct luo_file *luo_file)
+{
+	int err = 0;
+
+	guard(mutex)(&luo_file->mutex);
+
+	if (luo_file->fh->ops->freeze) {
+		struct liveupdate_file_op_args args = {0};
+
+		args.handler = luo_file->fh;
+		args.file = luo_file->file;
+		args.serialized_data = luo_file->serialized_data;
+
+		err = luo_file->fh->ops->freeze(&args);
+		if (!err)
+			luo_file->serialized_data = args.serialized_data;
+	}
+
+	return err;
+}
+
+static void luo_file_unfreeze_one(struct luo_file_set *file_set,
+				  struct luo_file *luo_file)
+{
+	guard(mutex)(&luo_file->mutex);
+
+	if (luo_file->fh->ops->unfreeze) {
+		struct liveupdate_file_op_args args = {0};
+
+		args.handler = luo_file->fh;
+		args.file = luo_file->file;
+		args.serialized_data = luo_file->serialized_data;
+
+		luo_file->fh->ops->unfreeze(&args);
+	}
+
+	/* Keep ->serialized_data: .unpreserve() still needs the .preserve() handle */
+}
+
+static void __luo_file_unfreeze(struct luo_file_set *file_set,
+				struct luo_file *failed_entry)
+{
+	struct list_head *files_list = &file_set->files_list;
+	struct luo_file *luo_file;
+
+	list_for_each_entry(luo_file, files_list, list) {
+		if (luo_file == failed_entry)
+			break;
+
+		luo_file_unfreeze_one(file_set, luo_file);
+	}
+
+	memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT);
+}
+
+/**
+ * luo_file_freeze - Freezes all preserved files and serializes their metadata.
+ * @file_set:     The file_set whose files are to be frozen.
+ * @file_set_ser: Where to put the serialized file_set.
+ *
+ * This function is called from the reboot() syscall path, just before the
+ * kernel transitions to the new image via kexec. Its purpose is to perform the
+ * final preparation and serialization of all preserved files in the file_set.
+ *
+ * It iterates through each preserved file in FIFO order (the order of
+ * preservation) and performs two main actions:
+ *
+ * 1. Freezes the File: It calls the handler's .freeze() callback for each
+ *    file. This gives the handler a final opportunity to quiesce the device or
+ *    prepare its state for the upcoming reboot. The handler may update its
+ *    private data handle during this step.
+ *
+ * 2. Serializes Metadata: After a successful freeze, it copies the final file
+ *    metadata—the handler's compatible string, the user token, and the final
+ *    private data handle—into the pre-allocated contiguous memory buffer
+ *    (file_set->files) that will be handed over to the next kernel via KHO.
+ *
+ * Error Handling (Rollback):
+ * This function is atomic. If any handler's .freeze() operation fails, the
+ * entire live update is aborted. The __luo_file_unfreeze() helper is
+ * immediately called to invoke the .unfreeze() op on all files that were
+ * successfully frozen before the point of failure, rolling them back to a
+ * running state. The function then returns an error, causing the reboot()
+ * syscall to fail.
+ *
+ * Context: Called only from the liveupdate_reboot() path.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_freeze(struct luo_file_set *file_set,
+		    struct luo_file_set_ser *file_set_ser)
+{
+	struct luo_file_ser *file_ser = file_set->files;
+	struct luo_file *luo_file;
+	int err;
+	int i;
+
+	if (!file_set->count)
+		return 0;
+
+	if (WARN_ON(!file_ser))
+		return -EINVAL;
+
+	i = 0;
+	list_for_each_entry(luo_file, &file_set->files_list, list) {
+		err = luo_file_freeze_one(file_set, luo_file);
+		if (err < 0) {
+			pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n",
+				luo_file->token, luo_file->fh->compatible,
+				ERR_PTR(err));
+			goto err_unfreeze;
+		}
+
+		strscpy(file_ser[i].compatible, luo_file->fh->compatible,
+			sizeof(file_ser[i].compatible));
+		file_ser[i].data = luo_file->serialized_data;
+		file_ser[i].token = luo_file->token;
+		i++;
+	}
+
+	file_set_ser->count = file_set->count;
+	if (file_set->files)
+		file_set_ser->files = virt_to_phys(file_set->files);
+
+	return 0;
+
+err_unfreeze:
+	__luo_file_unfreeze(file_set, luo_file);
+
+	return err;
+}
+
+/**
+ * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization
+ * @file_set:     The file_set whose files are to be unfrozen.
+ * @file_set_ser: Serialized file_set.
+ *
+ * This function rolls back the state of all files in a file_set after the
+ * freeze phase has begun but must be aborted. It is the counterpart to
+ * luo_file_freeze().
+ *
+ * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
+ * signals the helper to iterate through all files in the file_set and call
+ * their respective .unfreeze() handler callbacks.
+ *
+ * Context: This is called when the live update is aborted during
+ *          the reboot() syscall, after luo_file_freeze() has been called.
+ */
+void luo_file_unfreeze(struct luo_file_set *file_set,
+		       struct luo_file_set_ser *file_set_ser)
+{
+	if (!file_set->count)
+		return;
+
+	__luo_file_unfreeze(file_set, NULL);
+	memset(file_set_ser, 0, sizeof(*file_set_ser));
+}
+
+/**
+ * luo_retrieve_file - Restores a preserved file from a file_set by its token.
+ * @file_set: The file_set from which to retrieve the file.
+ * @token:    The unique token identifying the file to be restored.
+ * @filep:    Output parameter; on success, this is populated with a pointer
+ *            to the newly retrieved 'struct file'.
+ *
+ * This function is the primary mechanism for recreating a file in the new
+ * kernel after a live update. It searches the file_set's list of deserialized
+ * files for an entry matching the provided @token.
+ *
+ * The operation is idempotent: if a file has already been successfully
+ * retrieved, this function will simply return a pointer to the existing
+ * 'struct file' and report success without re-executing the retrieve
+ * operation. This is handled by checking the 'retrieved' flag under a lock.
+ *
+ * File retrieval can happen in any order; it is not bound by the order of
+ * preservation.
+ *
+ * Context: Can be called from an ioctl or other in-kernel code in the new
+ *          kernel.
+ * Return: 0 on success. Returns a negative errno on failure:
+ *         -ENOENT if no file with the matching token is found.
+ *         Any error code returned by the handler's .retrieve() op.
+ */
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+		      struct file **filep)
+{
+	struct liveupdate_file_op_args args = {0};
+	struct luo_file *luo_file;
+	int err;
+
+	list_for_each_entry(luo_file, &file_set->files_list, list) {
+		if (luo_file->token == token)
+			break;
+	}
+
+	/*
+	 * With no match the cursor is not a real entry; do not dereference it.
+	 */
+	if (list_entry_is_head(luo_file, &file_set->files_list, list))
+		return -ENOENT;
+
+	guard(mutex)(&luo_file->mutex);
+	if (luo_file->retrieved) {
+		/*
+		 * Someone is asking for this file again, so get a reference
+		 * for them.
+		 */
+		get_file(luo_file->file);
+		*filep = luo_file->file;
+		return 0;
+	}
+
+	args.handler = luo_file->fh;
+	args.serialized_data = luo_file->serialized_data;
+	err = luo_file->fh->ops->retrieve(&args);
+	if (!err) {
+		luo_file->file = args.file;
+
+		/* Get reference so we can keep this file in LUO until finish */
+		get_file(luo_file->file);
+		*filep = luo_file->file;
+		luo_file->retrieved = true;
+	}
+
+	return err;
+}
+
+static int luo_file_can_finish_one(struct luo_file_set *file_set,
+				   struct luo_file *luo_file)
+{
+	bool can_finish = true;
+
+	guard(mutex)(&luo_file->mutex);
+
+	if (luo_file->fh->ops->can_finish) {
+		struct liveupdate_file_op_args args = {0};
+
+		args.handler = luo_file->fh;
+		args.file = luo_file->file;
+		args.serialized_data = luo_file->serialized_data;
+		args.retrieved = luo_file->retrieved;
+		can_finish = luo_file->fh->ops->can_finish(&args);
+	}
+
+	return can_finish ? 0 : -EBUSY;
+}
+
+static void luo_file_finish_one(struct luo_file_set *file_set,
+				struct luo_file *luo_file)
+{
+	struct liveupdate_file_op_args args = {0};
+
+	guard(mutex)(&luo_file->mutex);
+
+	args.handler = luo_file->fh;
+	args.file = luo_file->file;
+	args.serialized_data = luo_file->serialized_data;
+	args.retrieved = luo_file->retrieved;
+
+	luo_file->fh->ops->finish(&args);
+}
+
+/**
+ * luo_file_finish - Completes the lifecycle for all files in a file_set.
+ * @file_set: The file_set to be finalized.
+ *
+ * This function orchestrates the final teardown of a live update file_set in
+ * the new kernel. It should be called after all necessary files have been
+ * retrieved and the userspace agent is ready to release the preserved state.
+ *
+ * The function iterates through all tracked files. For each file, it performs
+ * the following sequence of cleanup actions:
+ *
+ * 1. Calls can_finish() (via luo_file_can_finish_one) on every file in the
+ *    file_set, telling the handler whether the file was retrieved. Only if
+ *    all of them return true does it continue to finish.
+ * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
+ *    allow for final resource cleanup within the handler.
+ * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
+ *    is the counterpart to the get_file() call in luo_retrieve_file().
+ * 4. Removes the 'struct luo_file' from the file_set's internal list.
+ * 5. Frees the memory for the 'struct luo_file' instance itself.
+ *
+ * After successfully finishing all individual files, it frees the
+ * contiguous memory block that was used to transfer the serialized metadata
+ * from the previous kernel.
+ *
+ * Error Handling (Atomic Failure):
+ * This operation is atomic. If any handler's .can_finish() op fails, the entire
+ * function aborts immediately and returns an error.
+ *
+ * Context: Can be called from an ioctl handler in the new kernel.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_finish(struct luo_file_set *file_set)
+{
+	struct list_head *files_list = &file_set->files_list;
+	struct luo_file *luo_file;
+	int err;
+
+	if (!file_set->count)
+		return 0;
+
+	list_for_each_entry(luo_file, files_list, list) {
+		err = luo_file_can_finish_one(file_set, luo_file);
+		if (err)
+			return err;
+	}
+
+	while (!list_empty(&file_set->files_list)) {
+		luo_file = list_last_entry(&file_set->files_list,
+					   struct luo_file, list);
+
+		luo_file_finish_one(file_set, luo_file);
+
+		if (luo_file->file)
+			fput(luo_file->file);
+		list_del(&luo_file->list);
+		file_set->count--;
+		mutex_destroy(&luo_file->mutex);
+		kfree(luo_file);
+	}
+
+	if (file_set->files) {
+		kho_restore_free(file_set->files);
+		file_set->files = NULL;
+	}
+
+	return 0;
+}
+
+/**
+ * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
+ * @file_set:     The incoming file_set to fill with deserialized data.
+ * @file_set_ser: Serialized KHO file_set data from the previous kernel.
+ *
+ * This function is called during the early boot process of the new kernel. It
+ * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
+ * provided by the previous kernel, and transforms it back into a live,
+ * in-memory linked list of 'struct luo_file' instances.
+ *
+ * For each serialized entry, it performs the following steps:
+ *   1. Reads the 'compatible' string.
+ *   2. Searches the global list of registered file handlers for one that
+ *      matches the compatible string.
+ *   3. Allocates a new 'struct luo_file'.
+ *   4. Populates the new structure with the deserialized data (token, private
+ *      data handle) and links it to the found handler. The 'file' pointer is
+ *      initialized to NULL, as the file has not been retrieved yet.
+ *   5. Adds the new 'struct luo_file' to the file_set's files_list.
+ *
+ * This prepares the file_set for userspace, which can later call
+ * luo_retrieve_file() to restore the actual file descriptors.
+ *
+ * Context: Called from session deserialization.
+ */
+int luo_file_deserialize(struct luo_file_set *file_set,
+			 struct luo_file_set_ser *file_set_ser)
+{
+	struct luo_file_ser *file_ser;
+	u64 i;
+
+	if (!file_set_ser->files) {
+		WARN_ON(file_set_ser->count);
+		return 0;
+	}
+
+	file_set->count = file_set_ser->count;
+	file_set->files = phys_to_virt(file_set_ser->files);
+
+	/*
+	 * Note on error handling:
+	 *
+	 * If deserialization fails (e.g., allocation failure or corrupt data),
+	 * we intentionally skip cleanup of files that were already restored.
+	 *
+	 * A partial failure leaves the preserved state inconsistent.
+	 * Implementing a safe "undo" to unwind complex dependencies (sessions,
+	 * files, hardware state) is error-prone and provides little value, as
+	 * the system is effectively in a broken state.
+	 *
+	 * We treat these resources as leaked. The expected recovery path is for
+	 * userspace to detect the failure and trigger a reboot, which will
+	 * reliably reset devices and reclaim memory.
+	 */
+	file_ser = file_set->files;
+	for (i = 0; i < file_set->count; i++) {
+		struct liveupdate_file_handler *fh;
+		bool handler_found = false;
+		struct luo_file *luo_file;
+
+		luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+			if (!strcmp(fh->compatible, file_ser[i].compatible)) {
+				handler_found = true;
+				break;
+			}
+		}
+
+		if (!handler_found) {
+			pr_warn("No registered handler for compatible '%s'\n",
+				file_ser[i].compatible);
+			return -ENOENT;
+		}
+
+		luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+		if (!luo_file)
+			return -ENOMEM;
+
+		luo_file->fh = fh;
+		luo_file->file = NULL;
+		luo_file->serialized_data = file_ser[i].data;
+		luo_file->token = file_ser[i].token;
+		luo_file->retrieved = false;
+		mutex_init(&luo_file->mutex);
+		list_add_tail(&luo_file->list, &file_set->files_list);
+	}
+
+	return 0;
+}
+
+void luo_file_set_init(struct luo_file_set *file_set)
+{
+	INIT_LIST_HEAD(&file_set->files_list);
+}
+
+void luo_file_set_destroy(struct luo_file_set *file_set)
+{
+	WARN_ON(file_set->count);
+	WARN_ON(!list_empty(&file_set->files_list));
+}
+
+/**
+ * liveupdate_register_file_handler - Register a file handler with LUO.
+ * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
+ * The caller must initialize this structure, including a unique
+ * 'compatible' string and a valid 'ops' callback table. This function adds the
+ * handler to the global list of supported file handlers.
+ *
+ * Context: Typically called during module initialization for file types that
+ * support live update preservation.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+	struct liveupdate_file_handler *fh_iter;
+	int err;
+
+	if (!liveupdate_enabled())
+		return -EOPNOTSUPP;
+
+	/* Sanity check that all required callbacks are set */
+	if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve ||
+	    !fh->ops->finish || !fh->ops->can_preserve) {
+		return -EINVAL;
+	}
+
+	/*
+	 * Ensure the system is quiescent (no active sessions).
+	 * This prevents registering new handlers while sessions are active or
+	 * while deserialization is in progress.
+	 */
+	if (!luo_session_quiesce())
+		return -EBUSY;
+
+	/* Check for duplicate compatible strings */
+	luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+		if (!strcmp(fh_iter->compatible, fh->compatible)) {
+			pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
+			       fh->compatible);
+			err = -EEXIST;
+			goto err_resume;
+		}
+	}
+
+	/* Pin the module implementing the handler */
+	if (!try_module_get(fh->ops->owner)) {
+		err = -EAGAIN;
+		goto err_resume;
+	}
+
+	INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
+	list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
+	luo_session_resume();
+
+	return 0;
+
+err_resume:
+	luo_session_resume();
+	return err;
+}
+
+/**
+ * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
+ * @fh: The file handler to unregister
+ *
+ * Unregisters the file handler from the liveupdate core. This function
+ * reverses the operations of liveupdate_register_file_handler().
+ *
+ * It ensures safe removal by checking that:
+ * No live update session is currently in progress.
+ *
+ * On success, the module reference taken at registration time is dropped.
+ *
+ * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
+ * update is in progress, can't quiesce live update.
+ */
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+	if (!liveupdate_enabled())
+		return -EOPNOTSUPP;
+
+	if (!luo_session_quiesce())
+		return -EBUSY;
+
+	list_del(&ACCESS_PRIVATE(fh, list));
+	module_put(fh->ops->owner);
+	luo_session_resume();
+
+	return 0;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 1292ac47eef8..c8973b543d1d 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -40,6 +40,28 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
  */
 #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
 
+/* Mimics list_for_each_entry() but for private list head entries */
+#define luo_list_for_each_private(pos, head, member)				\
+	for (struct list_head *__iter = (head)->next;				\
+	     __iter != (head) &&						\
+	     ({ pos = container_of(__iter, typeof(*(pos)), member); 1; });	\
+	     __iter = __iter->next)
+
+/**
+ * struct luo_file_set - A set of files that belong to the same sessions.
+ * @files_list: An ordered list of files associated with this session, it is
+ *              ordered by preservation time.
+ * @files:      The physically contiguous memory block that holds the serialized
+ *              state of files.
+ * @count:      A counter tracking the number of files currently stored in the
+ *              @files_list for this session.
+ */
+struct luo_file_set {
+	struct list_head files_list;
+	struct luo_file_ser *files;
+	long count;
+};
+
 /**
  * struct luo_session - Represents an active or incoming Live Update session.
  * @name:       A unique name for this session, used for identification and
@@ -50,6 +72,7 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
  *              previous kernel) sessions.
  * @retrieved:  A boolean flag indicating whether this session has been
  *              retrieved by a consumer in the new kernel.
+ * @file_set:   A set of files that belong to this session.
  * @mutex:      protects fields in the luo_session.
  */
 struct luo_session {
@@ -57,6 +80,7 @@ struct luo_session {
 	struct luo_session_ser *ser;
 	struct list_head list;
 	bool retrieved;
+	struct luo_file_set file_set;
 	struct mutex mutex;
 };
 
@@ -69,4 +93,18 @@ int luo_session_deserialize(void);
 bool luo_session_quiesce(void);
 void luo_session_resume(void);
 
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+void luo_file_unpreserve_files(struct luo_file_set *file_set);
+int luo_file_freeze(struct luo_file_set *file_set,
+		    struct luo_file_set_ser *file_set_ser);
+void luo_file_unfreeze(struct luo_file_set *file_set,
+		       struct luo_file_set_ser *file_set_ser);
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+		      struct file **filep);
+int luo_file_finish(struct luo_file_set *file_set);
+int luo_file_deserialize(struct luo_file_set *file_set,
+			 struct luo_file_set_ser *file_set_ser);
+void luo_file_set_init(struct luo_file_set *file_set);
+void luo_file_set_destroy(struct luo_file_set *file_set);
+
 #endif /* _LINUX_LUO_INTERNAL_H */
-- 
cgit v1.2.3


From 16cec0d265219f14a7fcebcc43aeb69205adba56 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 25 Nov 2025 11:58:37 -0500
Subject: liveupdate: luo_session: add ioctls for file preservation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the userspace interface and internal logic required to manage
the lifecycle of file descriptors within a session.  Previously, a session
was merely a container; this change makes it a functional management unit.

The following capabilities are added:

A new set of ioctl commands is added, which operate on the file
descriptor returned by CREATE_SESSION. This allows userspace to:
- LIVEUPDATE_SESSION_PRESERVE_FD: Add a file descriptor to a session
  to be preserved across the live update.
- LIVEUPDATE_SESSION_RETRIEVE_FD: Retrieve a preserved file in the
  new kernel using its unique token.
- LIVEUPDATE_SESSION_FINISH: finish session

The session's .release handler is enhanced to be state-aware.  When a
session's file descriptor is closed, it correctly unpreserves the session
based on its current state before freeing all associated file resources.

Link: https://lkml.kernel.org/r/20251125165850.3389713-8-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/liveupdate.h | 103 ++++++++++++++++++++++
 kernel/liveupdate/luo_session.c | 187 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 288 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
index 1183cf984b5f..30bc66ee9436 100644
--- a/include/uapi/linux/liveupdate.h
+++ b/include/uapi/linux/liveupdate.h
@@ -53,6 +53,14 @@ enum {
 	LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01,
 };
 
+/* ioctl commands for session file descriptors */
+enum {
+	LIVEUPDATE_CMD_SESSION_BASE = 0x40,
+	LIVEUPDATE_CMD_SESSION_PRESERVE_FD = LIVEUPDATE_CMD_SESSION_BASE,
+	LIVEUPDATE_CMD_SESSION_RETRIEVE_FD = 0x41,
+	LIVEUPDATE_CMD_SESSION_FINISH = 0x42,
+};
+
 /**
  * struct liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION)
  * @size:	Input; sizeof(struct liveupdate_ioctl_create_session)
@@ -110,4 +118,99 @@ struct liveupdate_ioctl_retrieve_session {
 #define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \
 	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION)
 
+/* Session specific IOCTLs */
+
+/**
+ * struct liveupdate_session_preserve_fd - ioctl(LIVEUPDATE_SESSION_PRESERVE_FD)
+ * @size:  Input; sizeof(struct liveupdate_session_preserve_fd)
+ * @fd:    Input; The user-space file descriptor to be preserved.
+ * @token: Input; An opaque, unique token for preserved resource.
+ *
+ * Holds parameters for preserving a file descriptor.
+ *
+ * User sets the @fd field identifying the file descriptor to preserve
+ * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
+ * and its dependencies are supported for preservation. If validation passes,
+ * the kernel marks the FD internally and *initiates the process* of preparing
+ * its state for saving. The actual snapshotting of the state typically occurs
+ * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
+ * some finalization might occur during freeze.
+ * On successful validation and initiation, the kernel associates the @token
+ * supplied by the user with the resource being preserved.
+ * This token identifies the FD targeted for preservation and is required for
+ * the subsequent %LIVEUPDATE_SESSION_RETRIEVE_FD call after the live update.
+ *
+ * Return: 0 on success (validation passed, preservation initiated), negative
+ * error code on failure (e.g., unsupported FD type, dependency issue,
+ * validation failed).
+ */
+struct liveupdate_session_preserve_fd {
+	__u32		size;
+	__s32		fd;
+	__aligned_u64	token;
+};
+
+#define LIVEUPDATE_SESSION_PRESERVE_FD					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_PRESERVE_FD)
+
+/**
+ * struct liveupdate_session_retrieve_fd - ioctl(LIVEUPDATE_SESSION_RETRIEVE_FD)
+ * @size:  Input; sizeof(struct liveupdate_session_retrieve_fd)
+ * @fd:    Output; The new file descriptor representing the fully restored
+ *         kernel resource.
+ * @token: Input; An opaque token that was used to preserve the resource.
+ *
+ * Retrieve a previously preserved file descriptor.
+ *
+ * User sets the @token field to the value that was passed to a successful
+ * %LIVEUPDATE_SESSION_PRESERVE_FD call before the live update. On success,
+ * the kernel restores the state (saved during the PREPARE/FREEZE phases)
+ * associated with the token and populates the @fd field with a new file
+ * descriptor referencing the restored resource in the current (new) kernel.
+ * This operation must be performed *before* signaling completion via
+ * %LIVEUPDATE_SESSION_FINISH.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., invalid token).
+ */
+struct liveupdate_session_retrieve_fd {
+	__u32		size;
+	__s32		fd;
+	__aligned_u64	token;
+};
+
+#define LIVEUPDATE_SESSION_RETRIEVE_FD					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_RETRIEVE_FD)
+
+/**
+ * struct liveupdate_session_finish - ioctl(LIVEUPDATE_SESSION_FINISH)
+ * @size:     Input; sizeof(struct liveupdate_session_finish)
+ * @reserved: Input; Must be zero. Reserved for future use.
+ *
+ * Signals the completion of the restoration process for a retrieved session.
+ * This is the final operation that should be performed on a session file
+ * descriptor after a live update.
+ *
+ * This ioctl must be called once all required file descriptors for the session
+ * have been successfully retrieved (using %LIVEUPDATE_SESSION_RETRIEVE_FD) and
+ * are fully restored from the userspace and kernel perspective.
+ *
+ * Upon success, the kernel releases its ownership of the preserved resources
+ * associated with this session. This allows internal resources to be freed,
+ * typically by decrementing reference counts on the underlying preserved
+ * objects.
+ *
+ * If this operation fails, the resources remain preserved in memory. Userspace
+ * may attempt to call finish again. The resources will otherwise be reset
+ * during the next live update cycle.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+struct liveupdate_session_finish {
+	__u32		size;
+	__u32		reserved;
+};
+
+#define LIVEUPDATE_SESSION_FINISH					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_FINISH)
+
 #endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index 3a031446d3a4..dbdbc3bd7929 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -125,6 +125,8 @@ static struct luo_session *luo_session_alloc(const char *name)
 		return ERR_PTR(-ENOMEM);
 
 	strscpy(session->name, name, sizeof(session->name));
+	INIT_LIST_HEAD(&session->file_set.files_list);
+	luo_file_set_init(&session->file_set);
 	INIT_LIST_HEAD(&session->list);
 	mutex_init(&session->mutex);
 
@@ -133,6 +135,7 @@ static struct luo_session *luo_session_alloc(const char *name)
 
 static void luo_session_free(struct luo_session *session)
 {
+	luo_file_set_destroy(&session->file_set);
 	mutex_destroy(&session->mutex);
 	kfree(session);
 }
@@ -177,16 +180,46 @@ static void luo_session_remove(struct luo_session_header *sh,
 	sh->count--;
 }
 
+static int luo_session_finish_one(struct luo_session *session)
+{
+	guard(mutex)(&session->mutex);
+	return luo_file_finish(&session->file_set);
+}
+
+static void luo_session_unfreeze_one(struct luo_session *session,
+				     struct luo_session_ser *ser)
+{
+	guard(mutex)(&session->mutex);
+	luo_file_unfreeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_freeze_one(struct luo_session *session,
+				  struct luo_session_ser *ser)
+{
+	guard(mutex)(&session->mutex);
+	return luo_file_freeze(&session->file_set, &ser->file_set_ser);
+}
+
 static int luo_session_release(struct inode *inodep, struct file *filep)
 {
 	struct luo_session *session = filep->private_data;
 	struct luo_session_header *sh;
 
 	/* If retrieved is set, it means this session is from incoming list */
-	if (session->retrieved)
+	if (session->retrieved) {
+		int err = luo_session_finish_one(session);
+
+		if (err) {
+			pr_warn("Unable to finish session [%s] on release\n",
+				session->name);
+			return err;
+		}
 		sh = &luo_session_global.incoming;
-	else
+	} else {
+		scoped_guard(mutex, &session->mutex)
+			luo_file_unpreserve_files(&session->file_set);
 		sh = &luo_session_global.outgoing;
+	}
 
 	luo_session_remove(sh, session);
 	luo_session_free(session);
@@ -194,9 +227,140 @@ static int luo_session_release(struct inode *inodep, struct file *filep)
 	return 0;
 }
 
+static int luo_session_preserve_fd(struct luo_session *session,
+				   struct luo_ucmd *ucmd)
+{
+	struct liveupdate_session_preserve_fd *argp = ucmd->cmd;
+	int err;
+
+	guard(mutex)(&session->mutex);
+	err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+	if (err)
+		return err;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		pr_warn("The file was successfully preserved, but response to user failed\n");
+
+	return err;
+}
+
+static int luo_session_retrieve_fd(struct luo_session *session,
+				   struct luo_ucmd *ucmd)
+{
+	struct liveupdate_session_retrieve_fd *argp = ucmd->cmd;
+	struct file *file;
+	int err;
+
+	argp->fd = get_unused_fd_flags(O_CLOEXEC);
+	if (argp->fd < 0)
+		return argp->fd;
+
+	guard(mutex)(&session->mutex);
+	err = luo_retrieve_file(&session->file_set, argp->token, &file);
+	if (err < 0)
+		goto  err_put_fd;
+
+	err = luo_ucmd_respond(ucmd, sizeof(*argp));
+	if (err)
+		goto err_put_file;
+
+	fd_install(argp->fd, file);
+
+	return 0;
+
+err_put_file:
+	fput(file);
+err_put_fd:
+	put_unused_fd(argp->fd);
+
+	return err;
+}
+
+static int luo_session_finish(struct luo_session *session,
+			      struct luo_ucmd *ucmd)
+{
+	struct liveupdate_session_finish *argp = ucmd->cmd;
+	int err = luo_session_finish_one(session);
+
+	if (err)
+		return err;
+
+	return luo_ucmd_respond(ucmd, sizeof(*argp));
+}
+
+union ucmd_buffer {
+	struct liveupdate_session_finish finish;
+	struct liveupdate_session_preserve_fd preserve;
+	struct liveupdate_session_retrieve_fd retrieve;
+};
+
+struct luo_ioctl_op {
+	unsigned int size;
+	unsigned int min_size;
+	unsigned int ioctl_num;
+	int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
+	[_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = {                    \
+		.size = sizeof(_struct) +                                      \
+			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
+					  sizeof(_struct)),                    \
+		.min_size = offsetofend(_struct, _last),                       \
+		.ioctl_num = _ioctl,                                           \
+		.execute = _fn,                                                \
+	}
+
+static const struct luo_ioctl_op luo_session_ioctl_ops[] = {
+	IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish,
+		 struct liveupdate_session_finish, reserved),
+	IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd,
+		 struct liveupdate_session_preserve_fd, token),
+	IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd,
+		 struct liveupdate_session_retrieve_fd, token),
+};
+
+static long luo_session_ioctl(struct file *filep, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct luo_session *session = filep->private_data;
+	const struct luo_ioctl_op *op;
+	struct luo_ucmd ucmd = {};
+	union ucmd_buffer buf;
+	unsigned int nr;
+	int ret;
+
+	nr = _IOC_NR(cmd);
+	if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >=
+	    ARRAY_SIZE(luo_session_ioctl_ops)) {
+		return -EINVAL;
+	}
+
+	ucmd.ubuffer = (void __user *)arg;
+	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+	if (ret)
+		return ret;
+
+	op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE];
+	if (op->ioctl_num != cmd)
+		return -ENOIOCTLCMD;
+	if (ucmd.user_size < op->min_size)
+		return -EINVAL;
+
+	ucmd.cmd = &buf;
+	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+				    ucmd.user_size);
+	if (ret)
+		return ret;
+
+	return op->execute(session, &ucmd);
+}
+
 static const struct file_operations luo_session_fops = {
 	.owner = THIS_MODULE,
 	.release = luo_session_release,
+	.unlocked_ioctl = luo_session_ioctl,
 };
 
 /* Create a "struct file" for session */
@@ -392,6 +556,11 @@ int luo_session_deserialize(void)
 			luo_session_free(session);
 			return err;
 		}
+
+		scoped_guard(mutex, &session->mutex) {
+			luo_file_deserialize(&session->file_set,
+					     &sh->ser[i].file_set_ser);
+		}
 	}
 
 	kho_restore_free(sh->header_ser);
@@ -406,9 +575,14 @@ int luo_session_serialize(void)
 	struct luo_session_header *sh = &luo_session_global.outgoing;
 	struct luo_session *session;
 	int i = 0;
+	int err;
 
 	guard(rwsem_write)(&sh->rwsem);
 	list_for_each_entry(session, &sh->list, list) {
+		err = luo_session_freeze_one(session, &sh->ser[i]);
+		if (err)
+			goto err_undo;
+
 		strscpy(sh->ser[i].name, session->name,
 			sizeof(sh->ser[i].name));
 		i++;
@@ -416,6 +590,15 @@ int luo_session_serialize(void)
 	sh->header_ser->count = sh->count;
 
 	return 0;
+
+err_undo:
+	list_for_each_entry_continue_reverse(session, &sh->list, list) {
+		i--;
+		luo_session_unfreeze_one(session, &sh->ser[i]);
+		memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name));
+	}
+
+	return err;
 }
 
 /**
-- 
cgit v1.2.3


From 6ff1610ced5689c9af4c28a1798e04b74128a703 Mon Sep 17 00:00:00 2001
From: Pratyush Yadav <ptyadav@amazon.de>
Date: Tue, 25 Nov 2025 11:58:40 -0500
Subject: mm: shmem: use SHMEM_F_* flags instead of VM_* flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

shmem_inode_info::flags can have the VM flags VM_NORESERVE and VM_LOCKED.
These are used to suppress pre-accounting or to lock the pages in the
inode respectively.  Using the VM flags directly makes it difficult to add
shmem-specific flags that are unrelated to VM behavior since one would
need to find a VM flag not used by shmem and re-purpose it.

Introduce SHMEM_F_NORESERVE and SHMEM_F_LOCKED which represent the same
information, but their bits are independent of the VM flags.  Callers can
still pass VM_NORESERVE to shmem_get_inode(), but it gets transformed to
the shmem-specific flag internally.

No functional changes intended.

Link: https://lkml.kernel.org/r/20251125165850.3389713-11-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h |  6 ++++++
 mm/shmem.c               | 28 +++++++++++++++-------------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..650874b400b5 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -10,6 +10,7 @@
 #include <linux/xattr.h>
 #include <linux/fs_parser.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/bits.h>
 
 struct swap_iocb;
 
@@ -19,6 +20,11 @@ struct swap_iocb;
 #define SHMEM_MAXQUOTAS 2
 #endif
 
+/* Suppress pre-accounting of the entire object size. */
+#define SHMEM_F_NORESERVE	BIT(0)
+/* Disallow swapping. */
+#define SHMEM_F_LOCKED		BIT(1)
+
 struct shmem_inode_info {
 	spinlock_t		lock;
 	unsigned int		seals;		/* shmem seals */
diff --git a/mm/shmem.c b/mm/shmem.c
index 58701d14dd96..1d5036dec08a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -175,20 +175,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  */
 static inline int shmem_acct_size(unsigned long flags, loff_t size)
 {
-	return (flags & VM_NORESERVE) ?
+	return (flags & SHMEM_F_NORESERVE) ?
 		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 }
 
 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 {
-	if (!(flags & VM_NORESERVE))
+	if (!(flags & SHMEM_F_NORESERVE))
 		vm_unacct_memory(VM_ACCT(size));
 }
 
 static inline int shmem_reacct_size(unsigned long flags,
 		loff_t oldsize, loff_t newsize)
 {
-	if (!(flags & VM_NORESERVE)) {
+	if (!(flags & SHMEM_F_NORESERVE)) {
 		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 			return security_vm_enough_memory_mm(current->mm,
 					VM_ACCT(newsize) - VM_ACCT(oldsize));
@@ -206,7 +206,7 @@ static inline int shmem_reacct_size(unsigned long flags,
  */
 static inline int shmem_acct_blocks(unsigned long flags, long pages)
 {
-	if (!(flags & VM_NORESERVE))
+	if (!(flags & SHMEM_F_NORESERVE))
 		return 0;
 
 	return security_vm_enough_memory_mm(current->mm,
@@ -215,7 +215,7 @@ static inline int shmem_acct_blocks(unsigned long flags, long pages)
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 {
-	if (flags & VM_NORESERVE)
+	if (flags & SHMEM_F_NORESERVE)
 		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 }
 
@@ -1551,7 +1551,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 	int nr_pages;
 	bool split = false;
 
-	if ((info->flags & VM_LOCKED) || sbinfo->noswap)
+	if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap)
 		goto redirty;
 
 	if (!total_swap_pages)
@@ -2910,15 +2910,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 	 * ipc_lock_object() when called from shmctl_do_lock(),
 	 * no serialization needed when called from shm_destroy().
 	 */
-	if (lock && !(info->flags & VM_LOCKED)) {
+	if (lock && !(info->flags & SHMEM_F_LOCKED)) {
 		if (!user_shm_lock(inode->i_size, ucounts))
 			goto out_nomem;
-		info->flags |= VM_LOCKED;
+		info->flags |= SHMEM_F_LOCKED;
 		mapping_set_unevictable(file->f_mapping);
 	}
-	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+	if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
 		user_shm_unlock(inode->i_size, ucounts);
-		info->flags &= ~VM_LOCKED;
+		info->flags &= ~SHMEM_F_LOCKED;
 		mapping_clear_unevictable(file->f_mapping);
 	}
 	retval = 0;
@@ -3062,7 +3062,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
 	spin_lock_init(&info->lock);
 	atomic_set(&info->stop_eviction, 0);
 	info->seals = F_SEAL_SEAL;
-	info->flags = flags & VM_NORESERVE;
+	info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
 	info->i_crtime = inode_get_mtime(inode);
 	info->fsflags = (dir == NULL) ? 0 :
 		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
@@ -5804,8 +5804,10 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
 /* common code */
 
 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
-			loff_t size, unsigned long flags, unsigned int i_flags)
+				       loff_t size, unsigned long vm_flags,
+				       unsigned int i_flags)
 {
+	unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
 	struct inode *inode;
 	struct file *res;
 
@@ -5822,7 +5824,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
 		return ERR_PTR(-ENOMEM);
 
 	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
-				S_IFREG | S_IRWXUGO, 0, flags);
+				S_IFREG | S_IRWXUGO, 0, vm_flags);
 	if (IS_ERR(inode)) {
 		shmem_unacct_size(flags, size);
 		return ERR_CAST(inode);
-- 
cgit v1.2.3


From e165e2a2577b048664be09c074a10304290055f0 Mon Sep 17 00:00:00 2001
From: Pratyush Yadav <ptyadav@amazon.de>
Date: Tue, 25 Nov 2025 11:58:41 -0500
Subject: mm: shmem: allow freezing inode mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To prepare a shmem inode for live update, its index -> folio mappings must
be serialized.  Once the mappings are serialized, they cannot change since
it would cause the serialized data to become inconsistent.  This can be
done by pinning the folios to avoid migration, and by making sure no
folios can be added to or removed from the inode.

While mechanisms to pin folios already exist, the only way to stop folios
from being added or removed is the grow and shrink file seals.  But file
seals come with their own semantics, one of which is that they can't be
removed.  This doesn't work with liveupdate since it can be cancelled or
error out, which would need the seals to be removed and the file's normal
functionality to be restored.

Introduce SHMEM_F_MAPPING_FROZEN to indicate this instead.  It is internal
to shmem and is not directly exposed to userspace.  It functions similar
to F_SEAL_GROW | F_SEAL_SHRINK, but additionally disallows hole punching,
and can be removed.

Link: https://lkml.kernel.org/r/20251125165850.3389713-12-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 17 +++++++++++++++++
 mm/shmem.c               | 11 +++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 650874b400b5..d34a64eafe60 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -24,6 +24,14 @@ struct swap_iocb;
 #define SHMEM_F_NORESERVE	BIT(0)
 /* Disallow swapping. */
 #define SHMEM_F_LOCKED		BIT(1)
+/*
+ * Disallow growing, shrinking, or hole punching in the inode. Combined with
+ * folio pinning, makes sure the inode's mapping stays fixed.
+ *
+ * In some ways similar to F_SEAL_GROW | F_SEAL_SHRINK, but can be removed and
+ * isn't directly visible to userspace.
+ */
+#define SHMEM_F_MAPPING_FROZEN	BIT(2)
 
 struct shmem_inode_info {
 	spinlock_t		lock;
@@ -186,6 +194,15 @@ static inline bool shmem_file(struct file *file)
 	return shmem_mapping(file->f_mapping);
 }
 
+/* Must be called with inode lock taken exclusive. */
+static inline void shmem_freeze(struct inode *inode, bool freeze)
+{
+	if (freeze)
+		SHMEM_I(inode)->flags |= SHMEM_F_MAPPING_FROZEN;
+	else
+		SHMEM_I(inode)->flags &= ~SHMEM_F_MAPPING_FROZEN;
+}
+
 /*
  * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages
  * beyond i_size's notion of EOF, which fallocate has committed to reserving:
diff --git a/mm/shmem.c b/mm/shmem.c
index 1d5036dec08a..786573479360 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1297,6 +1297,8 @@ static int shmem_setattr(struct mnt_idmap *idmap,
 			return -EPERM;
 
 		if (newsize != oldsize) {
+			if (info->flags & SHMEM_F_MAPPING_FROZEN)
+				return -EPERM;
 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 					oldsize, newsize);
 			if (error)
@@ -3289,6 +3291,10 @@ shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 			return -EPERM;
 	}
 
+	if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
+		     pos + len > inode->i_size))
+		return -EPERM;
+
 	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
 	if (ret)
 		return ret;
@@ -3662,6 +3668,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 
 	inode_lock(inode);
 
+	if (info->flags & SHMEM_F_MAPPING_FROZEN) {
+		error = -EPERM;
+		goto out;
+	}
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		struct address_space *mapping = file->f_mapping;
 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
-- 
cgit v1.2.3


From 8def18633e8df54a05cf7d323d0df24c21b320d6 Mon Sep 17 00:00:00 2001
From: Pratyush Yadav <ptyadav@amazon.de>
Date: Tue, 25 Nov 2025 11:58:43 -0500
Subject: liveupdate: luo_file: add private argument to store runtime state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently file handlers only get the serialized_data field to store their
state.  This field has a pointer to the serialized state of the file, and
it becomes a part of LUO file's serialized state.

File handlers can also need some runtime state to track information that
shouldn't make it in the serialized data.

One such example is a vmalloc pointer.  While kho_preserve_vmalloc()
preserves the memory backing a vmalloc allocation, it does not store the
original vmap pointer, since that has no use being passed to the next
kernel.  The pointer is needed to free the memory in case the file is
unpreserved.

Provide a private field in struct luo_file and pass it to all the
callbacks.  The field can be set by preserve, and must be freed by
unpreserve.

Link: https://lkml.kernel.org/r/20251125165850.3389713-14-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/liveupdate.h   | 5 +++++
 kernel/liveupdate/luo_file.c | 9 +++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 122ad8f16ff9..a7f6ee5b6771 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -27,6 +27,10 @@ struct file;
  *                    this to the file being operated on.
  * @serialized_data:  The opaque u64 handle, preserve/prepare/freeze may update
  *                    this field.
+ * @private_data:     Private data for the file used to hold runtime state that
+ *                    is not preserved. Set by the handler's .preserve()
+ *                    callback, and must be freed in the handler's
+ *                    .unpreserve() callback.
  *
  * This structure bundles all parameters for the file operation callbacks.
  * The 'data' and 'file' fields are used for both input and output.
@@ -36,6 +40,7 @@ struct liveupdate_file_op_args {
 	bool retrieved;
 	struct file *file;
 	u64 serialized_data;
+	void *private_data;
 };
 
 /**
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index e9727cb1275a..ddff87917b21 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -129,6 +129,10 @@ static LIST_HEAD(luo_file_handler_list);
  *                 This handle is passed back to the handler's .freeze(),
  *                 .retrieve(), and .finish() callbacks, allowing it to track
  *                 and update its serialized state across phases.
+ * @private_data:  Pointer to the private data for the file used to hold runtime
+ *                 state that is not preserved. Set by the handler's .preserve()
+ *                 callback, and must be freed in the handler's .unpreserve()
+ *                 callback.
  * @retrieved:     A flag indicating whether a user/kernel in the new kernel has
  *                 successfully called retrieve() on this file. This prevents
  *                 multiple retrieval attempts.
@@ -155,6 +159,7 @@ struct luo_file {
 	struct liveupdate_file_handler *fh;
 	struct file *file;
 	u64 serialized_data;
+	void *private_data;
 	bool retrieved;
 	struct mutex mutex;
 	struct list_head list;
@@ -298,6 +303,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
 		goto err_kfree;
 
 	luo_file->serialized_data = args.serialized_data;
+	luo_file->private_data = args.private_data;
 	list_add_tail(&luo_file->list, &file_set->files_list);
 	file_set->count++;
 
@@ -344,6 +350,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
 		args.handler = luo_file->fh;
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
+		args.private_data = luo_file->private_data;
 		luo_file->fh->ops->unpreserve(&args);
 
 		list_del(&luo_file->list);
@@ -370,6 +377,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
 		args.handler = luo_file->fh;
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
+		args.private_data = luo_file->private_data;
 
 		err = luo_file->fh->ops->freeze(&args);
 		if (!err)
@@ -390,6 +398,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
 		args.handler = luo_file->fh;
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
+		args.private_data = luo_file->private_data;
 
 		luo_file->fh->ops->unfreeze(&args);
 	}
-- 
cgit v1.2.3


From b3749f174d686627f702234e64bad976dc432dbc Mon Sep 17 00:00:00 2001
From: Pratyush Yadav <ptyadav@amazon.de>
Date: Tue, 25 Nov 2025 11:58:44 -0500
Subject: mm: memfd_luo: allow preserving memfd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ability to preserve a memfd allows userspace to use KHO and LUO to
transfer its memory contents to the next kernel.  This is useful in many
ways.  For one, it can be used with IOMMUFD as the backing store for IOMMU
page tables.  Preserving IOMMUFD is essential for performing a hypervisor
live update with passthrough devices.  memfd support provides the first
building block for making that possible.

For another, for applications with a large amount of memory that takes time
to reconstruct, reboots to consume kernel upgrades can be very expensive.
memfd with LUO gives those applications reboot-persistent memory that they
can use to quickly save and reconstruct that state.

While memfd is backed by either hugetlbfs or shmem, currently only support
on shmem is added.  To be more precise, support for anonymous shmem files
is added.

The handover to the next kernel is not transparent.  Not all properties of
the file are preserved; only its memory contents, position, and size are.
The recreated file gets the UID and GID of the task doing the restore, and
the task's cgroup gets charged with the memory.

Once preserved, the file cannot grow or shrink, and all its pages are
pinned to avoid migrations and swapping.  The file can still be read from
or written to.

Use vmalloc to get the buffer to hold the folios, and preserve it using
kho_preserve_vmalloc().  This doesn't have the size limit.

Link: https://lkml.kernel.org/r/20251125165850.3389713-15-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                   |   2 +
 include/linux/kho/abi/memfd.h |  77 +++++++
 mm/Makefile                   |   1 +
 mm/memfd_luo.c                | 516 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 596 insertions(+)
 create mode 100644 include/linux/kho/abi/memfd.h
 create mode 100644 mm/memfd_luo.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 868d3d23fdea..425c46bba764 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14469,6 +14469,7 @@ F:	tools/testing/selftests/livepatch/
 LIVE UPDATE
 M:	Pasha Tatashin <pasha.tatashin@soleen.com>
 M:	Mike Rapoport <rppt@kernel.org>
+R:	Pratyush Yadav <pratyush@kernel.org>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	Documentation/core-api/liveupdate.rst
@@ -14477,6 +14478,7 @@ F:	include/linux/liveupdate.h
 F:	include/linux/liveupdate/
 F:	include/uapi/linux/liveupdate.h
 F:	kernel/liveupdate/
+F:	mm/memfd_luo.c
 
 LLC (802.2)
 L:	netdev@vger.kernel.org
diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h
new file mode 100644
index 000000000000..da7d063474a1
--- /dev/null
+++ b/include/linux/kho/abi/memfd.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+#ifndef _LINUX_KHO_ABI_MEMFD_H
+#define _LINUX_KHO_ABI_MEMFD_H
+
+#include <linux/types.h>
+#include <linux/kexec_handover.h>
+
+/**
+ * DOC: memfd Live Update ABI
+ *
+ * This header defines the ABI for preserving the state of a memfd across a
+ * kexec reboot using the LUO.
+ *
+ * The state is serialized into a packed structure `struct memfd_luo_ser`
+ * which is handed over to the next kernel via the KHO mechanism.
+ *
+ * This interface is a contract. Any modification to the structure layout
+ * constitutes a breaking change. Such changes require incrementing the
+ * version number in the MEMFD_LUO_FH_COMPATIBLE string.
+ */
+
+/**
+ * MEMFD_LUO_FOLIO_DIRTY - The folio is dirty.
+ *
+ * This flag indicates the folio contains data from user. A non-dirty folio is
+ * one that was allocated (say using fallocate(2)) but not written to.
+ */
+#define MEMFD_LUO_FOLIO_DIRTY		BIT(0)
+
+/**
+ * MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
+ *
+ * An up-to-date folio has been zeroed out. shmem zeroes out folios on first
+ * use. This flag tracks which folios need zeroing.
+ */
+#define MEMFD_LUO_FOLIO_UPTODATE	BIT(1)
+
+/**
+ * struct memfd_luo_folio_ser - Serialized state of a single folio.
+ * @pfn:       The page frame number of the folio.
+ * @flags:     Flags to describe the state of the folio.
+ * @index:     The page offset (pgoff_t) of the folio within the original file.
+ */
+struct memfd_luo_folio_ser {
+	u64 pfn:52;
+	u64 flags:12;
+	u64 index;
+} __packed;
+
+/**
+ * struct memfd_luo_ser - Main serialization structure for a memfd.
+ * @pos:       The file's current position (f_pos).
+ * @size:      The total size of the file in bytes (i_size).
+ * @nr_folios: Number of folios in the folios array.
+ * @folios:    KHO vmalloc descriptor pointing to the array of
+ *             struct memfd_luo_folio_ser.
+ */
+struct memfd_luo_ser {
+	u64 pos;
+	u64 size;
+	u64 nr_folios;
+	struct kho_vmalloc folios;
+} __packed;
+
+/* The compatibility string for memfd file handler */
+#define MEMFD_LUO_FH_COMPATIBLE	"memfd-v1"
+
+#endif /* _LINUX_KHO_ABI_MEMFD_H */
diff --git a/mm/Makefile b/mm/Makefile
index 21abb3353550..7738ec416f00 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o
 obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
 ifdef CONFIG_SWAP
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..4f6ba63b4310
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+/**
+ * DOC: Memfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Memory file descriptors (memfd) can be preserved over a kexec using the Live
+ * Update Orchestrator (LUO) file preservation. This allows userspace to
+ * transfer its memory contents to the next kernel after a kexec.
+ *
+ * The preservation is not intended to be transparent. Only select properties of
+ * the file are preserved. All others are reset to default. The preserved
+ * properties are described below.
+ *
+ * .. note::
+ *    The LUO API is not stabilized yet, so the preserved properties of a memfd
+ *    are also not stable and are subject to backwards incompatible changes.
+ *
+ * .. note::
+ *    Currently a memfd backed by Hugetlb is not supported. Memfds created
+ *    with ``MFD_HUGETLB`` will be rejected.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the memfd are preserved across kexec:
+ *
+ * File Contents
+ *   All data stored in the file is preserved.
+ *
+ * File Size
+ *   The size of the file is preserved. Holes in the file are filled by
+ *   allocating pages for them during preservation.
+ *
+ * File Position
+ *   The current file position is preserved, allowing applications to continue
+ *   reading/writing from their last position.
+ *
+ * File Status Flags
+ *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
+ *   is maintained.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * All properties which are not preserved must be assumed to be reset to
+ * default. This section describes some of those properties which may be more of
+ * note.
+ *
+ * ``FD_CLOEXEC`` flag
+ *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
+ *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
+ *   again after restore via ``fcntl()``.
+ *
+ * Seals
+ *   File seals are not preserved. The file is unsealed on restore and if
+ *   needed, must be sealed again via ``fcntl()``.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/memfd.h>
+#include <linux/liveupdate.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static int memfd_luo_preserve_folios(struct file *file,
+				     struct kho_vmalloc *kho_vmalloc,
+				     struct memfd_luo_folio_ser **out_folios_ser,
+				     u64 *nr_foliosp)
+{
+	struct inode *inode = file_inode(file);
+	struct memfd_luo_folio_ser *folios_ser;
+	unsigned int max_folios;
+	long i, size, nr_pinned;
+	struct folio **folios;
+	int err = -EINVAL;
+	pgoff_t offset;
+	u64 nr_folios;
+
+	size = i_size_read(inode);
+	/*
+	 * If the file has zero size, then the folios and nr_folios properties
+	 * are not set.
+	 */
+	if (!size) {
+		*nr_foliosp = 0;
+		*out_folios_ser = NULL;
+		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
+		return 0;
+	}
+
+	/*
+	 * Guess the number of folios based on inode size. Real number might end
+	 * up being smaller if there are higher order folios.
+	 */
+	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+	if (!folios)
+		return -ENOMEM;
+
+	/*
+	 * Pin the folios so they don't move around behind our back. This also
+	 * ensures none of the folios are in CMA -- which ensures they don't
+	 * fall in KHO scratch memory. It also moves swapped out folios back to
+	 * memory.
+	 *
+	 * A side effect of doing this is that it allocates a folio for all
+	 * indices in the file. This might waste memory on sparse memfds. If
+	 * that is really a problem in the future, we can have a
+	 * memfd_pin_folios() variant that does not allocate a page on empty
+	 * slots.
+	 */
+	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+				     &offset);
+	if (nr_pinned < 0) {
+		err = nr_pinned;
+		pr_err("failed to pin folios: %d\n", err);
+		goto err_free_folios;
+	}
+	nr_folios = nr_pinned;
+
+	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
+	if (!folios_ser) {
+		err = -ENOMEM;
+		goto err_unpin;
+	}
+
+	for (i = 0; i < nr_folios; i++) {
+		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio = folios[i];
+		unsigned int flags = 0;
+
+		err = kho_preserve_folio(folio);
+		if (err)
+			goto err_unpreserve;
+
+		if (folio_test_dirty(folio))
+			flags |= MEMFD_LUO_FOLIO_DIRTY;
+		if (folio_test_uptodate(folio))
+			flags |= MEMFD_LUO_FOLIO_UPTODATE;
+
+		pfolio->pfn = folio_pfn(folio);
+		pfolio->flags = flags;
+		pfolio->index = folio->index;
+	}
+
+	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
+	if (err)
+		goto err_unpreserve;
+
+	kvfree(folios);
+	*nr_foliosp = nr_folios;
+	*out_folios_ser = folios_ser;
+
+	/*
+	 * Note: folios_ser is purposely not freed here. It is preserved
+	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
+	 * that is passed via private_data.
+	 */
+	return 0;
+
+err_unpreserve:
+	for (i = i - 1; i >= 0; i--)
+		kho_unpreserve_folio(folios[i]);
+	vfree(folios_ser);
+err_unpin:
+	unpin_folios(folios, nr_folios);
+err_free_folios:
+	kvfree(folios);
+
+	return err;
+}
+
+static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
+					struct memfd_luo_folio_ser *folios_ser,
+					u64 nr_folios)
+{
+	long i;
+
+	if (!nr_folios)
+		return;
+
+	kho_unpreserve_vmalloc(kho_vmalloc);
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio;
+
+		if (!pfolio->pfn)
+			continue;
+
+		folio = pfn_folio(pfolio->pfn);
+
+		kho_unpreserve_folio(folio);
+		unpin_folio(folio);
+	}
+
+	vfree(folios_ser);
+}
+
+static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
+{
+	struct inode *inode = file_inode(args->file);
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+	u64 nr_folios;
+	int err = 0;
+
+	inode_lock(inode);
+	shmem_freeze(inode, true);
+
+	/* Allocate the main serialization structure in preserved memory */
+	ser = kho_alloc_preserve(sizeof(*ser));
+	if (IS_ERR(ser)) {
+		err = PTR_ERR(ser);
+		goto err_unlock;
+	}
+
+	ser->pos = args->file->f_pos;
+	ser->size = i_size_read(inode);
+
+	err = memfd_luo_preserve_folios(args->file, &ser->folios,
+					&folios_ser, &nr_folios);
+	if (err)
+		goto err_free_ser;
+
+	ser->nr_folios = nr_folios;
+	inode_unlock(inode);
+
+	args->private_data = folios_ser;
+	args->serialized_data = virt_to_phys(ser);
+
+	return 0;
+
+err_free_ser:
+	kho_unpreserve_free(ser);
+err_unlock:
+	shmem_freeze(inode, false);
+	inode_unlock(inode);
+	return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_ser *ser;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return -EINVAL;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	/*
+	 * The pos might have changed since prepare. Everything else stays the
+	 * same.
+	 */
+	ser->pos = args->file->f_pos;
+
+	return 0;
+}
+
+static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct inode *inode = file_inode(args->file);
+	struct memfd_luo_ser *ser;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return;
+
+	inode_lock(inode);
+	shmem_freeze(inode, false);
+
+	ser = phys_to_virt(args->serialized_data);
+
+	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
+				    ser->nr_folios);
+
+	kho_unpreserve_free(ser);
+	inode_unlock(inode);
+}
+
+static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
+				     u64 nr_folios)
+{
+	u64 i;
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio;
+		phys_addr_t phys;
+
+		if (!pfolio->pfn)
+			continue;
+
+		phys = PFN_PHYS(pfolio->pfn);
+		folio = kho_restore_folio(phys);
+		if (!folio) {
+			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
+					    phys);
+			continue;
+		}
+
+		folio_put(folio);
+	}
+}
+
+static void memfd_luo_finish(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+
+	if (args->retrieved)
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return;
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (!folios_ser)
+			goto out;
+
+		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
+		vfree(folios_ser);
+	}
+
+out:
+	kho_restore_free(ser);
+}
+
+static int memfd_luo_retrieve_folios(struct file *file,
+				     struct memfd_luo_folio_ser *folios_ser,
+				     u64 nr_folios)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	struct folio *folio;
+	int err = -EIO;
+	long i;
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		phys_addr_t phys;
+		u64 index;
+		int flags;
+
+		if (!pfolio->pfn)
+			continue;
+
+		phys = PFN_PHYS(pfolio->pfn);
+		folio = kho_restore_folio(phys);
+		if (!folio) {
+			pr_err("Unable to restore folio at physical address: %llx\n",
+			       phys);
+			goto put_folios;
+		}
+		index = pfolio->index;
+		flags = pfolio->flags;
+
+		/* Set up the folio for insertion. */
+		__folio_set_locked(folio);
+		__folio_set_swapbacked(folio);
+
+		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+		if (err) {
+			pr_err("shmem: failed to charge folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
+					      mapping_gfp_mask(mapping));
+		if (err) {
+			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
+			folio_mark_uptodate(folio);
+		if (flags & MEMFD_LUO_FOLIO_DIRTY)
+			folio_mark_dirty(folio);
+
+		err = shmem_inode_acct_blocks(inode, 1);
+		if (err) {
+			pr_err("shmem: failed to account folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		shmem_recalc_inode(inode, 1, 0);
+		folio_add_lru(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	return 0;
+
+unlock_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+put_folios:
+	/*
+	 * Note: don't free the folios already added to the file. They will be
+	 * freed when the file is freed. Free the ones not added yet here.
+	 */
+	for (long j = i + 1; j < nr_folios; j++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+
+		folio = kho_restore_folio(pfolio->pfn);
+		if (folio)
+			folio_put(folio);
+	}
+
+	return err;
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+	struct file *file;
+	int err;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return -EINVAL;
+
+	file = shmem_file_setup("", 0, VM_NORESERVE);
+
+	if (IS_ERR(file)) {
+		pr_err("failed to setup file: %pe\n", file);
+		return PTR_ERR(file);
+	}
+
+	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
+	file->f_inode->i_size = ser->size;
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (!folios_ser) {
+			err = -EINVAL;
+			goto put_file;
+		}
+
+		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
+		vfree(folios_ser);
+		if (err)
+			goto put_file;
+	}
+
+	args->file = file;
+	kho_restore_free(ser);
+
+	return 0;
+
+put_file:
+	fput(file);
+
+	return err;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+				   struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+	.freeze = memfd_luo_freeze,
+	.finish = memfd_luo_finish,
+	.retrieve = memfd_luo_retrieve,
+	.preserve = memfd_luo_preserve,
+	.unpreserve = memfd_luo_unpreserve,
+	.can_preserve = memfd_luo_can_preserve,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+	.ops = &memfd_luo_file_ops,
+	.compatible = MEMFD_LUO_FH_COMPATIBLE,
+};
+
+static int __init memfd_luo_init(void)
+{
+	int err = liveupdate_register_file_handler(&memfd_luo_handler);
+
+	if (err && err != -EOPNOTSUPP) {
+		pr_err("Could not register luo filesystem handler: %pe\n",
+		       ERR_PTR(err));
+
+		return err;
+	}
+
+	return 0;
+}
+late_initcall(memfd_luo_init);
-- 
cgit v1.2.3


From 3fa805c37dd4d3e72ae5c58800f3f46ab3ca1f70 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 10 Oct 2025 03:36:50 -0700
Subject: vmcoreinfo: track and log recoverable hardware errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a generic infrastructure for tracking recoverable hardware
errors (HW errors that are visible to the OS but do not cause a panic)
and record them for vmcore consumption.  This aids post-mortem crash
analysis tools by preserving a count and timestamp for the last occurrence
of such errors.  On the other side, correctable errors, which the OS
typically remains unaware of because the underlying hardware handles them
transparently, are less relevant for crash dump and therefore are NOT
tracked in this infrastructure.

Add centralized logging for sources of recoverable hardware errors based
on the subsystem it has been notified.

hwerror_data is write-only at kernel runtime, and it is meant to be read
from vmcore using tools like crash/drgn.  For example, this is what it
looks like when opening the crashdump from drgn.

	>>> prog['hwerror_data']
	(struct hwerror_info[1]){
		{
			.count = (int)844,
			.timestamp = (time64_t)1752852018,
		},
		...

This helps fleet operators quickly triage whether a crash may be
influenced by hardware recoverable errors (which execute an uncommon code
path in the kernel), especially when recoverable errors occurred shortly
before a panic, such as the bug fixed by commit ee62ce7a1d90 ("page_pool:
Track DMA-mapped pages and unmap them when destroying the pool")

This is not intended to replace full hardware diagnostics but provides a
fast way to correlate hardware events with kernel panics quickly.

Rare machine check exceptions—like those indicated by mce_flags.p5 or
mce_flags.winchip—are not accounted for in this method, as they fall
outside the intended usage scope for this feature's user base.

[leitao@debian.org: add hw-recoverable-errors to toctree]
  Link: https://lkml.kernel.org/r/20251127-vmcoreinfo_fix-v1-1-26f5b1c43da9@debian.org
Link: https://lkml.kernel.org/r/20251010-vmcore_hw_error-v5-1-636ede3efe44@debian.org
Signed-off-by: Breno Leitao <leitao@debian.org>
Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>	[APEI]
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Bob Moore <robert.moore@intel.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Konrad Rzessutek Wilk <konrad.wilk@oracle.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/driver-api/hw-recoverable-errors.rst | 60 ++++++++++++++++++++++
 Documentation/driver-api/index.rst                 |  1 +
 arch/x86/kernel/cpu/mce/core.c                     |  4 ++
 drivers/acpi/apei/ghes.c                           | 36 +++++++++++++
 drivers/pci/pcie/aer.c                             |  2 +
 include/linux/vmcore_info.h                        |  8 +++
 include/uapi/linux/vmcore.h                        |  9 ++++
 kernel/vmcore_info.c                               | 17 ++++++
 8 files changed, 137 insertions(+)
 create mode 100644 Documentation/driver-api/hw-recoverable-errors.rst

(limited to 'include')

diff --git a/Documentation/driver-api/hw-recoverable-errors.rst b/Documentation/driver-api/hw-recoverable-errors.rst
new file mode 100644
index 000000000000..fc526c3454bd
--- /dev/null
+++ b/Documentation/driver-api/hw-recoverable-errors.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+Recoverable Hardware Error Tracking in vmcoreinfo
+=================================================
+
+Overview
+--------
+
+This feature provides a generic infrastructure within the Linux kernel to track
+and log recoverable hardware errors. These are recoverable hardware errors
+visible to the kernel that might not cause immediate panics but may influence
+system health, mainly because new code paths will be executed in the kernel.
+
+By recording counts and timestamps of recoverable errors into the vmcoreinfo
+crash dump notes, this infrastructure aids post-mortem crash analysis tools in
+correlating hardware events with kernel failures. This enables faster triage
+and better understanding of root causes, especially in large-scale cloud
+environments where hardware issues are common.
+
+Benefits
+--------
+
+- Facilitates correlation of hardware recoverable errors with kernel panics or
+  unusual code paths that lead to system crashes.
+- Provides operators and cloud providers quick insights, improving reliability
+  and reducing troubleshooting time.
+- Complements existing full hardware diagnostics without replacing them.
+
+Data Exposure and Consumption
+-----------------------------
+
+- The tracked error data consists of per-error-type counts and timestamps of
+  last occurrence.
+- This data is stored in the `hwerror_data` array, categorized by error source
+  types like CPU, memory, PCI, CXL, and others.
+- It is exposed via vmcoreinfo crash dump notes and can be read using tools
+  like `crash`, `drgn`, or other kernel crash analysis utilities.
+- There is no way to read this data other than from crash dumps.
+- These errors are divided by area, which includes CPU, Memory, PCI, CXL and
+  others.
+
+Typical usage example (in drgn REPL):
+
+.. code-block:: python
+
+    >>> prog['hwerror_data']
+    (struct hwerror_info[HWERR_RECOV_MAX]){
+        {
+            .count = (int)844,
+            .timestamp = (time64_t)1752852018,
+        },
+        ...
+    }
+
+Enabling
+--------
+
+- This feature is enabled when CONFIG_VMCORE_INFO is set.
+
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 3e2a270bd828..a35705b44799 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -96,6 +96,7 @@ Subsystem-specific APIs
    gpio/index
    hsi
    hte/index
+   hw-recoverable-errors
    i2c
    iio/index
    infiniband
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..08adbf4cd6ed 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -45,6 +45,7 @@
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
 #include <asm/cpu_device_id.h>
@@ -1700,6 +1701,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
 	}
 
 out:
+	/* Given it didn't panic, mark it as recoverable */
+	hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
 	instrumentation_end();
 
 clear:
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 97ee19f2cae0..92b0e3c391b2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/uuid.h>
 #include <linux/ras.h>
 #include <linux/task_work.h>
+#include <linux/vmcore_info.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -867,6 +868,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
 
+static void ghes_log_hwerr(int sev, guid_t *sec_type)
+{
+	if (sev != CPER_SEV_RECOVERABLE)
+		return;
+
+	if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+		hwerr_log_error_type(HWERR_RECOV_CPU);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+		hwerr_log_error_type(HWERR_RECOV_CXL);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PCIE) ||
+	    guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) {
+		hwerr_log_error_type(HWERR_RECOV_PCI);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
+		hwerr_log_error_type(HWERR_RECOV_MEMORY);
+		return;
+	}
+
+	hwerr_log_error_type(HWERR_RECOV_OTHERS);
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -888,6 +923,7 @@ static void ghes_do_proc(struct ghes *ghes,
 		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
 			fru_text = gdata->fru_text;
 
+		ghes_log_hwerr(sev, sec_type);
 		if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 0b5ed4722ac3..e0bcaa896803 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -30,6 +30,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
+#include <linux/vmcore_info.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
@@ -765,6 +766,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
 		break;
 	case AER_NONFATAL:
 		aer_info->dev_total_nonfatal_errs++;
+		hwerr_log_error_type(HWERR_RECOV_PCI);
 		counter = &aer_info->dev_nonfatal_errs[0];
 		max = AER_MAX_TYPEOF_UNCOR_ERRS;
 		break;
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 37e003ae5262..e71518caacdf 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -5,6 +5,7 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#include <uapi/linux/vmcore.h>
 
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
 #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
@@ -77,4 +78,11 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+#ifdef CONFIG_VMCORE_INFO
+void hwerr_log_error_type(enum hwerr_error_type src);
+#else
+static inline void hwerr_log_error_type(enum hwerr_error_type src) {};
+#endif
+
 #endif /* LINUX_VMCORE_INFO_H */
diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h
index 3e9da91866ff..2ba89fafa518 100644
--- a/include/uapi/linux/vmcore.h
+++ b/include/uapi/linux/vmcore.h
@@ -15,4 +15,13 @@ struct vmcoredd_header {
 	__u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
 };
 
+enum hwerr_error_type {
+	HWERR_RECOV_CPU,
+	HWERR_RECOV_MEMORY,
+	HWERR_RECOV_PCI,
+	HWERR_RECOV_CXL,
+	HWERR_RECOV_OTHERS,
+	HWERR_RECOV_MAX,
+};
+
 #endif /* _UAPI_VMCORE_H */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f8..fe9bf8db1922 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+struct hwerr_info {
+	atomic_t count;
+	time64_t timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
@@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+	if (src < 0 || src >= HWERR_RECOV_MAX)
+		return;
+
+	atomic_inc(&hwerr_data[src].count);
+	WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
-- 
cgit v1.2.3


From 93d7a7ed07342f5e3da2d250cfd67f899d0b5318 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Oct 2025 12:32:23 +0200
Subject: netfilter: flowtable: move path discovery infrastructure to its own
 file

This file contains the path discovery that is run from the forward chain
for the packet offloading the flow into the flowtable. This consists
of a series of calls to dev_fill_forward_path() for each device stack.

More topologies may be supported in the future, so move this code to its
own file to separate it from the nftables flow_offload expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |   6 +
 net/netfilter/Makefile                |   1 +
 net/netfilter/nf_flow_table_path.c    | 274 ++++++++++++++++++++++++++++++++++
 net/netfilter/nft_flow_offload.c      | 259 --------------------------------
 4 files changed, 281 insertions(+), 259 deletions(-)
 create mode 100644 net/netfilter/nf_flow_table_path.c

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index c003cd194fa2..e9f72d2558e9 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -222,6 +222,12 @@ struct nf_flow_route {
 struct flow_offload *flow_offload_alloc(struct nf_conn *ct);
 void flow_offload_free(struct flow_offload *flow);
 
+struct nft_flowtable;
+struct nft_pktinfo;
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+		   struct nf_flow_route *route, enum ip_conntrack_dir dir,
+		   struct nft_flowtable *ft);
+
 static inline int
 nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
 			     flow_setup_cb_t *cb, void *cb_priv)
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e43e20f529f8..6bfc250e474f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
 nf_flow_table-objs		:= nf_flow_table_core.o nf_flow_table_ip.o \
+				   nf_flow_table_path.o \
 				   nf_flow_table_offload.o nf_flow_table_xdp.o
 nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
 ifeq ($(CONFIG_NF_FLOW_TABLE),m)
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
new file mode 100644
index 000000000000..e525e3745651
--- /dev/null
+++ b/net/netfilter/nf_flow_table_path.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h>
+#include <net/inet_dscp.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+	if (dst_xfrm(dst))
+		return FLOW_OFFLOAD_XMIT_XFRM;
+
+	return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nft_default_forward_path(struct nf_flow_route *route,
+				     struct dst_entry *dst_cache,
+				     enum ip_conntrack_dir dir)
+{
+	route->tuple[!dir].in.ifindex	= dst_cache->dev->ifindex;
+	route->tuple[dir].dst		= dst_cache;
+	route->tuple[dir].xmit_type	= nft_xmit_type(dst_cache);
+}
+
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+	if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+	    dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+		return false;
+
+	return true;
+}
+
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+				     const struct dst_entry *dst_cache,
+				     const struct nf_conn *ct,
+				     enum ip_conntrack_dir dir, u8 *ha,
+				     struct net_device_path_stack *stack)
+{
+	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+	struct net_device *dev = dst_cache->dev;
+	struct neighbour *n;
+	u8 nud_state;
+
+	if (!nft_is_valid_ether_device(dev))
+		goto out;
+
+	n = dst_neigh_lookup(dst_cache, daddr);
+	if (!n)
+		return -1;
+
+	read_lock_bh(&n->lock);
+	nud_state = n->nud_state;
+	ether_addr_copy(ha, n->ha);
+	read_unlock_bh(&n->lock);
+	neigh_release(n);
+
+	if (!(nud_state & NUD_VALID))
+		return -1;
+
+out:
+	return dev_fill_forward_path(dev, ha, stack);
+}
+
+struct nft_forward_info {
+	const struct net_device *indev;
+	const struct net_device *outdev;
+	const struct net_device *hw_outdev;
+	struct id {
+		__u16	id;
+		__be16	proto;
+	} encap[NF_FLOW_TABLE_ENCAP_MAX];
+	u8 num_encaps;
+	u8 ingress_vlans;
+	u8 h_source[ETH_ALEN];
+	u8 h_dest[ETH_ALEN];
+	enum flow_offload_xmit_type xmit_type;
+};
+
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+			      struct nft_forward_info *info,
+			      unsigned char *ha, struct nf_flowtable *flowtable)
+{
+	const struct net_device_path *path;
+	int i;
+
+	memcpy(info->h_dest, ha, ETH_ALEN);
+
+	for (i = 0; i < stack->num_paths; i++) {
+		path = &stack->path[i];
+		switch (path->type) {
+		case DEV_PATH_ETHERNET:
+		case DEV_PATH_DSA:
+		case DEV_PATH_VLAN:
+		case DEV_PATH_PPPOE:
+			info->indev = path->dev;
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			if (path->type == DEV_PATH_ETHERNET)
+				break;
+			if (path->type == DEV_PATH_DSA) {
+				i = stack->num_paths;
+				break;
+			}
+
+			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
+			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+				info->indev = NULL;
+				break;
+			}
+			if (!info->outdev)
+				info->outdev = path->dev;
+			info->encap[info->num_encaps].id = path->encap.id;
+			info->encap[info->num_encaps].proto = path->encap.proto;
+			info->num_encaps++;
+			if (path->type == DEV_PATH_PPPOE)
+				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+			break;
+		case DEV_PATH_BRIDGE:
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			switch (path->bridge.vlan_mode) {
+			case DEV_PATH_BR_VLAN_UNTAG_HW:
+				info->ingress_vlans |= BIT(info->num_encaps - 1);
+				break;
+			case DEV_PATH_BR_VLAN_TAG:
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+					info->indev = NULL;
+					break;
+				}
+				info->encap[info->num_encaps].id = path->bridge.vlan_id;
+				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+				info->num_encaps++;
+				break;
+			case DEV_PATH_BR_VLAN_UNTAG:
+				if (WARN_ON_ONCE(info->num_encaps-- == 0)) {
+					info->indev = NULL;
+					break;
+				}
+				break;
+			case DEV_PATH_BR_VLAN_KEEP:
+				break;
+			}
+			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+			break;
+		default:
+			info->indev = NULL;
+			break;
+		}
+	}
+	if (!info->outdev)
+		info->outdev = info->indev;
+
+	info->hw_outdev = info->indev;
+
+	if (nf_flowtable_hw_offload(flowtable) &&
+	    nft_is_valid_ether_device(info->indev))
+		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+				   struct nft_flowtable *ft)
+{
+	struct nft_hook *hook;
+	bool found = false;
+
+	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+		if (!nft_hook_find_ops_rcu(hook, dev))
+			continue;
+
+		found = true;
+		break;
+	}
+
+	return found;
+}
+
+static void nft_dev_forward_path(struct nf_flow_route *route,
+				 const struct nf_conn *ct,
+				 enum ip_conntrack_dir dir,
+				 struct nft_flowtable *ft)
+{
+	const struct dst_entry *dst = route->tuple[dir].dst;
+	struct net_device_path_stack stack;
+	struct nft_forward_info info = {};
+	unsigned char ha[ETH_ALEN];
+	int i;
+
+	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+		nft_dev_path_info(&stack, &info, ha, &ft->data);
+
+	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
+		return;
+
+	route->tuple[!dir].in.ifindex = info.indev->ifindex;
+	for (i = 0; i < info.num_encaps; i++) {
+		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+	}
+	route->tuple[!dir].in.num_encaps = info.num_encaps;
+	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+
+	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+		route->tuple[dir].out.ifindex = info.outdev->ifindex;
+		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
+		route->tuple[dir].xmit_type = info.xmit_type;
+	}
+}
+
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+		   struct nf_flow_route *route, enum ip_conntrack_dir dir,
+		   struct nft_flowtable *ft)
+{
+	struct dst_entry *this_dst = skb_dst(pkt->skb);
+	struct dst_entry *other_dst = NULL;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+		fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+		fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+		fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+		fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+		fl.u.ip4.flowi4_mark = pkt->skb->mark;
+		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		break;
+	case NFPROTO_IPV6:
+		fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+		fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
+		fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+		fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+		fl.u.ip6.flowi6_mark = pkt->skb->mark;
+		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+		break;
+	}
+
+	if (!dst_hold_safe(this_dst))
+		return -ENOENT;
+
+	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+	if (!other_dst) {
+		dst_release(this_dst);
+		return -ENOENT;
+	}
+
+	nft_default_forward_path(route, this_dst, dir);
+	nft_default_forward_path(route, other_dst, !dir);
+
+	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
+	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+		nft_dev_forward_path(route, ct, dir, ft);
+		nft_dev_forward_path(route, ct, !dir, ft);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_flow_route);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index e95e5f59a3d6..b8f76c9057fd 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -20,265 +20,6 @@ struct nft_flow_offload {
 	struct nft_flowtable	*flowtable;
 };
 
-static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
-{
-	if (dst_xfrm(dst))
-		return FLOW_OFFLOAD_XMIT_XFRM;
-
-	return FLOW_OFFLOAD_XMIT_NEIGH;
-}
-
-static void nft_default_forward_path(struct nf_flow_route *route,
-				     struct dst_entry *dst_cache,
-				     enum ip_conntrack_dir dir)
-{
-	route->tuple[!dir].in.ifindex	= dst_cache->dev->ifindex;
-	route->tuple[dir].dst		= dst_cache;
-	route->tuple[dir].xmit_type	= nft_xmit_type(dst_cache);
-}
-
-static bool nft_is_valid_ether_device(const struct net_device *dev)
-{
-	if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
-	    dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
-		return false;
-
-	return true;
-}
-
-static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
-				     const struct dst_entry *dst_cache,
-				     const struct nf_conn *ct,
-				     enum ip_conntrack_dir dir, u8 *ha,
-				     struct net_device_path_stack *stack)
-{
-	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
-	struct net_device *dev = dst_cache->dev;
-	struct neighbour *n;
-	u8 nud_state;
-
-	if (!nft_is_valid_ether_device(dev))
-		goto out;
-
-	n = dst_neigh_lookup(dst_cache, daddr);
-	if (!n)
-		return -1;
-
-	read_lock_bh(&n->lock);
-	nud_state = n->nud_state;
-	ether_addr_copy(ha, n->ha);
-	read_unlock_bh(&n->lock);
-	neigh_release(n);
-
-	if (!(nud_state & NUD_VALID))
-		return -1;
-
-out:
-	return dev_fill_forward_path(dev, ha, stack);
-}
-
-struct nft_forward_info {
-	const struct net_device *indev;
-	const struct net_device *outdev;
-	const struct net_device *hw_outdev;
-	struct id {
-		__u16	id;
-		__be16	proto;
-	} encap[NF_FLOW_TABLE_ENCAP_MAX];
-	u8 num_encaps;
-	u8 ingress_vlans;
-	u8 h_source[ETH_ALEN];
-	u8 h_dest[ETH_ALEN];
-	enum flow_offload_xmit_type xmit_type;
-};
-
-static void nft_dev_path_info(const struct net_device_path_stack *stack,
-			      struct nft_forward_info *info,
-			      unsigned char *ha, struct nf_flowtable *flowtable)
-{
-	const struct net_device_path *path;
-	int i;
-
-	memcpy(info->h_dest, ha, ETH_ALEN);
-
-	for (i = 0; i < stack->num_paths; i++) {
-		path = &stack->path[i];
-		switch (path->type) {
-		case DEV_PATH_ETHERNET:
-		case DEV_PATH_DSA:
-		case DEV_PATH_VLAN:
-		case DEV_PATH_PPPOE:
-			info->indev = path->dev;
-			if (is_zero_ether_addr(info->h_source))
-				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
-
-			if (path->type == DEV_PATH_ETHERNET)
-				break;
-			if (path->type == DEV_PATH_DSA) {
-				i = stack->num_paths;
-				break;
-			}
-
-			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
-			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
-				info->indev = NULL;
-				break;
-			}
-			if (!info->outdev)
-				info->outdev = path->dev;
-			info->encap[info->num_encaps].id = path->encap.id;
-			info->encap[info->num_encaps].proto = path->encap.proto;
-			info->num_encaps++;
-			if (path->type == DEV_PATH_PPPOE)
-				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
-			break;
-		case DEV_PATH_BRIDGE:
-			if (is_zero_ether_addr(info->h_source))
-				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
-
-			switch (path->bridge.vlan_mode) {
-			case DEV_PATH_BR_VLAN_UNTAG_HW:
-				info->ingress_vlans |= BIT(info->num_encaps - 1);
-				break;
-			case DEV_PATH_BR_VLAN_TAG:
-				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
-					info->indev = NULL;
-					break;
-				}
-				info->encap[info->num_encaps].id = path->bridge.vlan_id;
-				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
-				info->num_encaps++;
-				break;
-			case DEV_PATH_BR_VLAN_UNTAG:
-				if (WARN_ON_ONCE(info->num_encaps-- == 0)) {
-					info->indev = NULL;
-					break;
-				}
-				break;
-			case DEV_PATH_BR_VLAN_KEEP:
-				break;
-			}
-			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
-			break;
-		default:
-			info->indev = NULL;
-			break;
-		}
-	}
-	if (!info->outdev)
-		info->outdev = info->indev;
-
-	info->hw_outdev = info->indev;
-
-	if (nf_flowtable_hw_offload(flowtable) &&
-	    nft_is_valid_ether_device(info->indev))
-		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
-}
-
-static bool nft_flowtable_find_dev(const struct net_device *dev,
-				   struct nft_flowtable *ft)
-{
-	struct nft_hook *hook;
-	bool found = false;
-
-	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
-		if (!nft_hook_find_ops_rcu(hook, dev))
-			continue;
-
-		found = true;
-		break;
-	}
-
-	return found;
-}
-
-static void nft_dev_forward_path(struct nf_flow_route *route,
-				 const struct nf_conn *ct,
-				 enum ip_conntrack_dir dir,
-				 struct nft_flowtable *ft)
-{
-	const struct dst_entry *dst = route->tuple[dir].dst;
-	struct net_device_path_stack stack;
-	struct nft_forward_info info = {};
-	unsigned char ha[ETH_ALEN];
-	int i;
-
-	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
-		nft_dev_path_info(&stack, &info, ha, &ft->data);
-
-	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
-		return;
-
-	route->tuple[!dir].in.ifindex = info.indev->ifindex;
-	for (i = 0; i < info.num_encaps; i++) {
-		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
-		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
-	}
-	route->tuple[!dir].in.num_encaps = info.num_encaps;
-	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
-
-	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
-		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
-		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
-		route->tuple[dir].out.ifindex = info.outdev->ifindex;
-		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
-		route->tuple[dir].xmit_type = info.xmit_type;
-	}
-}
-
-static int nft_flow_route(const struct nft_pktinfo *pkt,
-			  const struct nf_conn *ct,
-			  struct nf_flow_route *route,
-			  enum ip_conntrack_dir dir,
-			  struct nft_flowtable *ft)
-{
-	struct dst_entry *this_dst = skb_dst(pkt->skb);
-	struct dst_entry *other_dst = NULL;
-	struct flowi fl;
-
-	memset(&fl, 0, sizeof(fl));
-	switch (nft_pf(pkt)) {
-	case NFPROTO_IPV4:
-		fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
-		fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
-		fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
-		fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
-		fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
-		fl.u.ip4.flowi4_mark = pkt->skb->mark;
-		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
-		break;
-	case NFPROTO_IPV6:
-		fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
-		fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
-		fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
-		fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
-		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
-		fl.u.ip6.flowi6_mark = pkt->skb->mark;
-		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
-		break;
-	}
-
-	if (!dst_hold_safe(this_dst))
-		return -ENOENT;
-
-	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
-	if (!other_dst) {
-		dst_release(this_dst);
-		return -ENOENT;
-	}
-
-	nft_default_forward_path(route, this_dst, dir);
-	nft_default_forward_path(route, other_dst, !dir);
-
-	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
-	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
-		nft_dev_forward_path(route, ct, dir, ft);
-		nft_dev_forward_path(route, ct, !dir, ft);
-	}
-
-	return 0;
-}
-
 static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
 {
 	if (skb_sec_path(skb))
-- 
cgit v1.2.3


From b5964aac51e0c286a50e68225e0dfcf11fb554cb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Oct 2025 12:32:35 +0200
Subject: netfilter: flowtable: consolidate xmit path

Use dev_queue_xmit() for the XMIT_NEIGH case. Store the interface index
of the real device behind the vlan/pppoe device; this introduces an
extra lookup for the real device in the xmit path because rt->dst.dev
provides the vlan/pppoe device.

XMIT_NEIGH now looks more similar to XMIT_DIRECT but the check for stale
dst and the neighbour lookup still remain in place which is convenient
to deal with network topology changes.

Note that nft_flow_route() needs to relax the check for _XMIT_NEIGH so
the existing basic xfrm offload (which only works in one direction) does
not break.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 net/netfilter/nf_flow_table_core.c    |  1 +
 net/netfilter/nf_flow_table_ip.c      | 87 +++++++++++++++++++++--------------
 net/netfilter/nf_flow_table_path.c    |  7 ++-
 4 files changed, 57 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index e9f72d2558e9..7c330caae52b 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -141,6 +141,7 @@ struct flow_offload_tuple {
 	union {
 		struct {
 			struct dst_entry *dst_cache;
+			u32		ifidx;
 			u32		dst_cookie;
 		};
 		struct {
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 9441ac3d8c1a..98d7b3708602 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -132,6 +132,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 		break;
 	case FLOW_OFFLOAD_XMIT_XFRM:
 	case FLOW_OFFLOAD_XMIT_NEIGH:
+		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
 		flow_tuple->dst_cache = dst;
 		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
 		break;
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 8cd4cf7ae211..eb4f6a11e779 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -333,19 +333,18 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
 	}
 }
 
+struct nf_flow_xmit {
+	const void		*dest;
+	const void		*source;
+	struct net_device	*outdev;
+};
+
 static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
-				       const struct flow_offload_tuple_rhash *tuplehash,
-				       unsigned short type)
+				       struct nf_flow_xmit *xmit)
 {
-	struct net_device *outdev;
-
-	outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
-	if (!outdev)
-		return NF_DROP;
-
-	skb->dev = outdev;
-	dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
-			tuplehash->tuple.out.h_source, skb->len);
+	skb->dev = xmit->outdev;
+	dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
+			xmit->dest, xmit->source, skb->len);
 	dev_queue_xmit(skb);
 
 	return NF_STOLEN;
@@ -424,10 +423,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	struct nf_flowtable_ctx ctx = {
 		.in	= state->in,
 	};
+	struct nf_flow_xmit xmit = {};
 	struct flow_offload *flow;
-	struct net_device *outdev;
+	struct neighbour *neigh;
 	struct rtable *rt;
-	__be32 nexthop;
 	int ret;
 
 	tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb);
@@ -454,25 +453,34 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	switch (tuplehash->tuple.xmit_type) {
 	case FLOW_OFFLOAD_XMIT_NEIGH:
 		rt = dst_rtable(tuplehash->tuple.dst_cache);
-		outdev = rt->dst.dev;
-		skb->dev = outdev;
-		nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+		xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+		if (!xmit.outdev) {
+			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr));
+		if (IS_ERR(neigh)) {
+			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		xmit.dest = neigh->ha;
 		skb_dst_set_noref(skb, &rt->dst);
-		neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-		ret = NF_STOLEN;
 		break;
 	case FLOW_OFFLOAD_XMIT_DIRECT:
-		ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
-		if (ret == NF_DROP)
+		xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+		if (!xmit.outdev) {
 			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		xmit.dest = tuplehash->tuple.out.h_dest;
+		xmit.source = tuplehash->tuple.out.h_source;
 		break;
 	default:
 		WARN_ON_ONCE(1);
-		ret = NF_DROP;
-		break;
+		return NF_DROP;
 	}
 
-	return ret;
+	return nf_flow_queue_xmit(state->net, skb, &xmit);
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
 
@@ -719,9 +727,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	struct nf_flowtable_ctx ctx = {
 		.in	= state->in,
 	};
-	const struct in6_addr *nexthop;
+	struct nf_flow_xmit xmit = {};
 	struct flow_offload *flow;
-	struct net_device *outdev;
+	struct neighbour *neigh;
 	struct rt6_info *rt;
 	int ret;
 
@@ -749,24 +757,33 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	switch (tuplehash->tuple.xmit_type) {
 	case FLOW_OFFLOAD_XMIT_NEIGH:
 		rt = dst_rt6_info(tuplehash->tuple.dst_cache);
-		outdev = rt->dst.dev;
-		skb->dev = outdev;
-		nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+		xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+		if (!xmit.outdev) {
+			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6));
+		if (IS_ERR(neigh)) {
+			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		xmit.dest = neigh->ha;
 		skb_dst_set_noref(skb, &rt->dst);
-		neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-		ret = NF_STOLEN;
 		break;
 	case FLOW_OFFLOAD_XMIT_DIRECT:
-		ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
-		if (ret == NF_DROP)
+		xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+		if (!xmit.outdev) {
 			flow_offload_teardown(flow);
+			return NF_DROP;
+		}
+		xmit.dest = tuplehash->tuple.out.h_dest;
+		xmit.source = tuplehash->tuple.out.h_source;
 		break;
 	default:
 		WARN_ON_ONCE(1);
-		ret = NF_DROP;
-		break;
+		return NF_DROP;
 	}
 
-	return ret;
+	return nf_flow_queue_xmit(state->net, skb, &xmit);
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index e525e3745651..e0c69fea2e0c 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -211,11 +211,11 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
 	}
 	route->tuple[!dir].in.num_encaps = info.num_encaps;
 	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+	route->tuple[dir].out.ifindex = info.outdev->ifindex;
 
 	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
 		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
 		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
-		route->tuple[dir].out.ifindex = info.outdev->ifindex;
 		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
 		route->tuple[dir].xmit_type = info.xmit_type;
 	}
@@ -263,11 +263,10 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
 	nft_default_forward_path(route, this_dst, dir);
 	nft_default_forward_path(route, other_dst, !dir);
 
-	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
-	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH)
 		nft_dev_forward_path(route, ct, dir, ft);
+	if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
 		nft_dev_forward_path(route, ct, !dir, ft);
-	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 030feea3097c41ed268c81240e5c334d9977b1c4 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Oct 2025 12:50:07 +0200
Subject: netfilter: flowtable: remove hw_ifidx

hw_ifidx was originally introduced to store the real netdevice as a
requirement for the hardware offload support in:

 73f97025a972 ("netfilter: nft_flow_offload: use direct xmit if hardware offload is enabled")

Since ("netfilter: flowtable: consolidate xmit path"), ifidx and
hw_ifidx both point to the real device in the xmit path, so remove
hw_ifidx.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 1 -
 net/netfilter/nf_flow_table_core.c    | 1 -
 net/netfilter/nf_flow_table_offload.c | 2 +-
 net/netfilter/nf_flow_table_path.c    | 3 ---
 4 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 7c330caae52b..f7306276ece7 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -146,7 +146,6 @@ struct flow_offload_tuple {
 		};
 		struct {
 			u32		ifidx;
-			u32		hw_ifidx;
 			u8		h_source[ETH_ALEN];
 			u8		h_dest[ETH_ALEN];
 		} out;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 98d7b3708602..6c6a5165f993 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -127,7 +127,6 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
 		       ETH_ALEN);
 		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
-		flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
 		dst_release(dst);
 		break;
 	case FLOW_OFFLOAD_XMIT_XFRM:
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index e06bc36f49fe..d8f7bfd60ac6 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net,
 	switch (this_tuple->xmit_type) {
 	case FLOW_OFFLOAD_XMIT_DIRECT:
 		this_tuple = &flow->tuplehash[dir].tuple;
-		ifindex = this_tuple->out.hw_ifidx;
+		ifindex = this_tuple->out.ifidx;
 		break;
 	case FLOW_OFFLOAD_XMIT_NEIGH:
 		other_tuple = &flow->tuplehash[!dir].tuple;
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index c51e310bb2ab..eb9b33a1873a 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -75,7 +75,6 @@ out:
 struct nft_forward_info {
 	const struct net_device *indev;
 	const struct net_device *outdev;
-	const struct net_device *hw_outdev;
 	struct id {
 		__u16	id;
 		__be16	proto;
@@ -159,7 +158,6 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 		}
 	}
 	info->outdev = info->indev;
-	info->hw_outdev = info->indev;
 
 	if (nf_flowtable_hw_offload(flowtable) &&
 	    nft_is_valid_ether_device(info->indev))
@@ -212,7 +210,6 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
 	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
 		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
 		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
-		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
 		route->tuple[dir].xmit_type = info.xmit_type;
 	}
 }
-- 
cgit v1.2.3


From ab427db17885814069bae891834f20842f0ac3a4 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 7 Nov 2025 12:14:46 +0100
Subject: netfilter: flowtable: Add IPIP rx sw acceleration

Introduce sw acceleration for the rx path of IPIP tunnels, relying on
the netfilter flowtable infrastructure. Subsequent patches will add sw
acceleration for the tx path of IPIP tunnels.
This series introduces basic infrastructure to accelerate other tunnel
types (e.g. IP6IP6).
IPIP rx sw acceleration can be tested by running the following scenario,
where the traffic is forwarded between two NICs (eth0 and eth1) and an
IPIP tunnel is used to access a remote site (using eth1 as the underlay
device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.0.2/24 scope global eth0
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 scope global eth1
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ipip 192.168.1.1 peer 192.168.1.2
    inet 192.168.100.1/24 scope global tun0
       valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
        flowtable ft {
                hook ingress priority filter
                devices = { eth0, eth1 }
        }

        chain forward {
                type filter hook forward priority filter; policy accept;
                meta l4proto { tcp, udp } flow add @ft
        }
}

Reproducing the scenario described above using veths I got the following
results:
- TCP stream received from the IPIP tunnel:
  - net-next: (baseline)		~ 71Gbps
  - net-next + IPIP flowtable support:	~101Gbps

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h             | 13 +++++++
 include/net/netfilter/nf_flow_table.h | 18 +++++++++
 net/ipv4/ipip.c                       | 25 +++++++++++++
 net/netfilter/nf_flow_table_core.c    |  3 ++
 net/netfilter/nf_flow_table_ip.c      | 69 ++++++++++++++++++++++++++++++++---
 net/netfilter/nf_flow_table_path.c    | 38 +++++++++++++++----
 6 files changed, 153 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e808071dbb7d..bf99fe8622da 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -877,6 +877,7 @@ enum net_device_path_type {
 	DEV_PATH_PPPOE,
 	DEV_PATH_DSA,
 	DEV_PATH_MTK_WDMA,
+	DEV_PATH_TUN,
 };
 
 struct net_device_path {
@@ -888,6 +889,18 @@ struct net_device_path {
 			__be16		proto;
 			u8		h_dest[ETH_ALEN];
 		} encap;
+		struct {
+			union {
+				struct in_addr	src_v4;
+				struct in6_addr	src_v6;
+			};
+			union {
+				struct in_addr	dst_v4;
+				struct in6_addr	dst_v6;
+			};
+
+			u8	l3_proto;
+		} tun;
 		struct {
 			enum {
 				DEV_PATH_BR_VLAN_KEEP,
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index f7306276ece7..b09c11c048d5 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -107,6 +107,19 @@ enum flow_offload_xmit_type {
 
 #define NF_FLOW_TABLE_ENCAP_MAX		2
 
+struct flow_offload_tunnel {
+	union {
+		struct in_addr	src_v4;
+		struct in6_addr	src_v6;
+	};
+	union {
+		struct in_addr	dst_v4;
+		struct in6_addr	dst_v6;
+	};
+
+	u8	l3_proto;
+};
+
 struct flow_offload_tuple {
 	union {
 		struct in_addr		src_v4;
@@ -130,12 +143,15 @@ struct flow_offload_tuple {
 		__be16			proto;
 	} encap[NF_FLOW_TABLE_ENCAP_MAX];
 
+	struct flow_offload_tunnel	tun;
+
 	/* All members above are keys for lookups, see flow_offload_hash(). */
 	struct { }			__hash;
 
 	u8				dir:2,
 					xmit_type:3,
 					encap_num:2,
+					tun_num:2,
 					in_vlan_ingress:2;
 	u16				mtu;
 	union {
@@ -206,7 +222,9 @@ struct nf_flow_route {
 				u16		id;
 				__be16		proto;
 			} encap[NF_FLOW_TABLE_ENCAP_MAX];
+			struct flow_offload_tunnel tun;
 			u8			num_encaps:2,
+						num_tuns:2,
 						ingress_vlans:2;
 		} in;
 		struct {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3e03af073a1c..ff95b1b9908e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 	return ip_tunnel_ctl(dev, p, cmd);
 }
 
+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
+				  struct net_device_path *path)
+{
+	struct ip_tunnel *tunnel = netdev_priv(ctx->dev);
+	const struct iphdr *tiph = &tunnel->parms.iph;
+	struct rtable *rt;
+
+	rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0,
+			     RT_SCOPE_UNIVERSE);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	path->type = DEV_PATH_TUN;
+	path->tun.src_v4.s_addr = tiph->saddr;
+	path->tun.dst_v4.s_addr = tiph->daddr;
+	path->tun.l3_proto = IPPROTO_IPIP;
+	path->dev = ctx->dev;
+
+	ctx->dev = rt->dst.dev;
+	ip_rt_put(rt);
+
+	return 0;
+}
+
 static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_init       = ipip_tunnel_init,
 	.ndo_uninit     = ip_tunnel_uninit,
@@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip_tunnel_get_iflink,
 	.ndo_tunnel_ctl	= ipip_tunnel_ctl,
+	.ndo_fill_forward_path = ipip_fill_forward_path,
 };
 
 #define IPIP_FEATURES (NETIF_F_SG |		\
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 6c6a5165f993..06e8251a6644 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -118,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 			flow_tuple->in_vlan_ingress |= BIT(j);
 		j++;
 	}
+
+	flow_tuple->tun = route->tuple[dir].in.tun;
 	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
 
 	switch (route->tuple[dir].xmit_type) {
 	case FLOW_OFFLOAD_XMIT_DIRECT:
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 083ceb64ac17..d6a1f0cda189 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -145,8 +145,11 @@ static bool ip_has_options(unsigned int thoff)
 static void nf_flow_tuple_encap(struct sk_buff *skb,
 				struct flow_offload_tuple *tuple)
 {
+	__be16 inner_proto = skb->protocol;
 	struct vlan_ethhdr *veth;
 	struct pppoe_hdr *phdr;
+	struct iphdr *iph;
+	u16 offset = 0;
 	int i = 0;
 
 	if (skb_vlan_tag_present(skb)) {
@@ -159,13 +162,26 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
 		veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 		tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
 		tuple->encap[i].proto = skb->protocol;
+		inner_proto = veth->h_vlan_encapsulated_proto;
+		offset += VLAN_HLEN;
 		break;
 	case htons(ETH_P_PPP_SES):
 		phdr = (struct pppoe_hdr *)skb_network_header(skb);
 		tuple->encap[i].id = ntohs(phdr->sid);
 		tuple->encap[i].proto = skb->protocol;
+		inner_proto = *((__be16 *)(phdr + 1));
+		offset += PPPOE_SES_HLEN;
 		break;
 	}
+
+	if (inner_proto == htons(ETH_P_IP)) {
+		iph = (struct iphdr *)(skb_network_header(skb) + offset);
+		if (iph->protocol == IPPROTO_IPIP) {
+			tuple->tun.dst_v4.s_addr = iph->daddr;
+			tuple->tun.src_v4.s_addr = iph->saddr;
+			tuple->tun.l3_proto = IPPROTO_IPIP;
+		}
+	}
 }
 
 struct nf_flowtable_ctx {
@@ -277,11 +293,46 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
+static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+{
+	struct iphdr *iph;
+	u16 size;
+
+	if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+		return false;
+
+	iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+	size = iph->ihl << 2;
+
+	if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
+		return false;
+
+	if (iph->ttl <= 1)
+		return false;
+
+	if (iph->protocol == IPPROTO_IPIP)
+		*psize += size;
+
+	return true;
+}
+
+static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+{
+	struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+
+	if (iph->protocol != IPPROTO_IPIP)
+		return;
+
+	skb_pull(skb, iph->ihl << 2);
+	skb_reset_network_header(skb);
+}
+
 static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
 				       u32 *offset)
 {
+	__be16 inner_proto = skb->protocol;
 	struct vlan_ethhdr *veth;
-	__be16 inner_proto;
+	bool ret = false;
 
 	switch (skb->protocol) {
 	case htons(ETH_P_8021Q):
@@ -291,19 +342,23 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
 		veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 		if (veth->h_vlan_encapsulated_proto == proto) {
 			*offset += VLAN_HLEN;
-			return true;
+			inner_proto = proto;
+			ret = true;
 		}
 		break;
 	case htons(ETH_P_PPP_SES):
 		if (nf_flow_pppoe_proto(skb, &inner_proto) &&
 		    inner_proto == proto) {
 			*offset += PPPOE_SES_HLEN;
-			return true;
+			ret = true;
 		}
 		break;
 	}
 
-	return false;
+	if (inner_proto == htons(ETH_P_IP))
+		ret = nf_flow_ip4_tunnel_proto(skb, offset);
+
+	return ret;
 }
 
 static void nf_flow_encap_pop(struct sk_buff *skb,
@@ -331,6 +386,9 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
 			break;
 		}
 	}
+
+	if (skb->protocol == htons(ETH_P_IP))
+		nf_flow_ip4_tunnel_pop(skb);
 }
 
 struct nf_flow_xmit {
@@ -356,8 +414,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
 {
 	struct flow_offload_tuple tuple = {};
 
-	if (skb->protocol != htons(ETH_P_IP) &&
-	    !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+	if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
 		return NULL;
 
 	if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index eb9b33a1873a..73717cc32df5 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -80,6 +80,8 @@ struct nft_forward_info {
 		__be16	proto;
 	} encap[NF_FLOW_TABLE_ENCAP_MAX];
 	u8 num_encaps;
+	struct flow_offload_tunnel tun;
+	u8 num_tuns;
 	u8 ingress_vlans;
 	u8 h_source[ETH_ALEN];
 	u8 h_dest[ETH_ALEN];
@@ -102,6 +104,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 		case DEV_PATH_DSA:
 		case DEV_PATH_VLAN:
 		case DEV_PATH_PPPOE:
+		case DEV_PATH_TUN:
 			info->indev = path->dev;
 			if (is_zero_ether_addr(info->h_source))
 				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
@@ -113,14 +116,27 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 				break;
 			}
 
-			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
-			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
-				info->indev = NULL;
-				break;
+			/* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
+			if (path->type == DEV_PATH_TUN) {
+				if (info->num_tuns) {
+					info->indev = NULL;
+					break;
+				}
+				info->tun.src_v6 = path->tun.src_v6;
+				info->tun.dst_v6 = path->tun.dst_v6;
+				info->tun.l3_proto = path->tun.l3_proto;
+				info->num_tuns++;
+			} else {
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+					info->indev = NULL;
+					break;
+				}
+				info->encap[info->num_encaps].id =
+					path->encap.id;
+				info->encap[info->num_encaps].proto =
+					path->encap.proto;
+				info->num_encaps++;
 			}
-			info->encap[info->num_encaps].id = path->encap.id;
-			info->encap[info->num_encaps].proto = path->encap.proto;
-			info->num_encaps++;
 			if (path->type == DEV_PATH_PPPOE)
 				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
 			break;
@@ -203,6 +219,14 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
 		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
 		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
 	}
+
+	if (info.num_tuns) {
+		route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
+		route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
+		route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
+		route->tuple[!dir].in.num_tuns = info.num_tuns;
+	}
+
 	route->tuple[!dir].in.num_encaps = info.num_encaps;
 	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
 	route->tuple[dir].out.ifindex = info.outdev->ifindex;
-- 
cgit v1.2.3


From be102eb6a0e7c03db00e50540622f4e43b2d2844 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <fmancera@suse.de>
Date: Fri, 21 Nov 2025 01:14:30 +0100
Subject: netfilter: nf_conncount: rework API to use sk_buff directly

When using the nf_conncount infrastructure for non-confirmed
connections, the same connection may be tracked twice due to an
optimization introduced in
commit d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC").

To fix this, introduce a new conncount API that receives an sk_buff
directly. It fetches the tuple, the zone and the corresponding ct from
the sk_buff. The new API provides skb-based versions of both existing
conncount entry points: nf_conncount_count_skb() and
nf_conncount_add_skb(). In addition, remove the old API and adjust all
users to use the new one.

This way, for each sk_buff struct it is possible to check if there is a
ct present and already confirmed. If so, skip the add operation.

Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC")
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h |  17 ++-
 net/netfilter/nf_conncount.c               | 177 +++++++++++++++++++----------
 net/netfilter/nft_connlimit.c              |  21 +---
 net/netfilter/xt_connlimit.c               |  14 +--
 net/openvswitch/conntrack.c                |  16 +--
 5 files changed, 142 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 1b58b5b91ff6..52a06de41aa0 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -18,15 +18,14 @@ struct nf_conncount_list {
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen);
 void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data);
 
-unsigned int nf_conncount_count(struct net *net,
-				struct nf_conncount_data *data,
-				const u32 *key,
-				const struct nf_conntrack_tuple *tuple,
-				const struct nf_conntrack_zone *zone);
-
-int nf_conncount_add(struct net *net, struct nf_conncount_list *list,
-		     const struct nf_conntrack_tuple *tuple,
-		     const struct nf_conntrack_zone *zone);
+unsigned int nf_conncount_count_skb(struct net *net,
+				    const struct sk_buff *skb,
+				    u16 l3num,
+				    struct nf_conncount_data *data,
+				    const u32 *key);
+
+int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb,
+			 u16 l3num, struct nf_conncount_list *list);
 
 void nf_conncount_list_init(struct nf_conncount_list *list);
 
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 913ede2f57f9..0ffc5ff78a71 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -122,15 +122,65 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
 	return ERR_PTR(-EAGAIN);
 }
 
+static bool get_ct_or_tuple_from_skb(struct net *net,
+				     const struct sk_buff *skb,
+				     u16 l3num,
+				     struct nf_conn **ct,
+				     struct nf_conntrack_tuple *tuple,
+				     const struct nf_conntrack_zone **zone,
+				     bool *refcounted)
+{
+	const struct nf_conntrack_tuple_hash *h;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *found_ct;
+
+	found_ct = nf_ct_get(skb, &ctinfo);
+	if (found_ct && !nf_ct_is_template(found_ct)) {
+		*tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		*zone = nf_ct_zone(found_ct);
+		*ct = found_ct;
+		return true;
+	}
+
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple))
+		return false;
+
+	if (found_ct)
+		*zone = nf_ct_zone(found_ct);
+
+	h = nf_conntrack_find_get(net, *zone, tuple);
+	if (!h)
+		return true;
+
+	found_ct = nf_ct_tuplehash_to_ctrack(h);
+	*refcounted = true;
+	*ct = found_ct;
+
+	return true;
+}
+
 static int __nf_conncount_add(struct net *net,
-			      struct nf_conncount_list *list,
-			      const struct nf_conntrack_tuple *tuple,
-			      const struct nf_conntrack_zone *zone)
+			      const struct sk_buff *skb,
+			      u16 l3num,
+			      struct nf_conncount_list *list)
 {
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn, *conn_n;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct = NULL;
 	struct nf_conn *found_ct;
 	unsigned int collect = 0;
+	bool refcounted = false;
+
+	if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted))
+		return -ENOENT;
+
+	if (ct && nf_ct_is_confirmed(ct)) {
+		if (refcounted)
+			nf_ct_put(ct);
+		return 0;
+	}
 
 	if ((u32)jiffies == list->last_gc)
 		goto add_new_node;
@@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net,
 		if (IS_ERR(found)) {
 			/* Not found, but might be about to be confirmed */
 			if (PTR_ERR(found) == -EAGAIN) {
-				if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+				if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
 				    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
 				    nf_ct_zone_id(zone, zone->dir))
-					return 0; /* already exists */
+					goto out_put; /* already exists */
 			} else {
 				collect++;
 			}
@@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net,
 
 		found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-		if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+		if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
 		    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
 			/*
 			 * We should not see tuples twice unless someone hooks
@@ -165,7 +215,7 @@ static int __nf_conncount_add(struct net *net,
 			 * Attempt to avoid a re-add in this case.
 			 */
 			nf_ct_put(found_ct);
-			return 0;
+			goto out_put;
 		} else if (already_closed(found_ct)) {
 			/*
 			 * we do not care about connections which are
@@ -188,31 +238,35 @@ add_new_node:
 	if (conn == NULL)
 		return -ENOMEM;
 
-	conn->tuple = *tuple;
+	conn->tuple = tuple;
 	conn->zone = *zone;
 	conn->cpu = raw_smp_processor_id();
 	conn->jiffies32 = (u32)jiffies;
 	list_add_tail(&conn->node, &list->head);
 	list->count++;
 	list->last_gc = (u32)jiffies;
+
+out_put:
+	if (refcounted)
+		nf_ct_put(ct);
 	return 0;
 }
 
-int nf_conncount_add(struct net *net,
-		     struct nf_conncount_list *list,
-		     const struct nf_conntrack_tuple *tuple,
-		     const struct nf_conntrack_zone *zone)
+int nf_conncount_add_skb(struct net *net,
+			 const struct sk_buff *skb,
+			 u16 l3num,
+			 struct nf_conncount_list *list)
 {
 	int ret;
 
 	/* check the saved connections */
 	spin_lock_bh(&list->list_lock);
-	ret = __nf_conncount_add(net, list, tuple, zone);
+	ret = __nf_conncount_add(net, skb, l3num, list);
 	spin_unlock_bh(&list->list_lock);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_conncount_add);
+EXPORT_SYMBOL_GPL(nf_conncount_add_skb);
 
 void nf_conncount_list_init(struct nf_conncount_list *list)
 {
@@ -309,19 +363,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
 
 static unsigned int
 insert_tree(struct net *net,
+	    const struct sk_buff *skb,
+	    u16 l3num,
 	    struct nf_conncount_data *data,
 	    struct rb_root *root,
 	    unsigned int hash,
-	    const u32 *key,
-	    const struct nf_conntrack_tuple *tuple,
-	    const struct nf_conntrack_zone *zone)
+	    const u32 *key)
 {
 	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+	bool do_gc = true, refcounted = false;
+	unsigned int count = 0, gc_count = 0;
 	struct rb_node **rbnode, *parent;
-	struct nf_conncount_rb *rbconn;
+	struct nf_conntrack_tuple tuple;
 	struct nf_conncount_tuple *conn;
-	unsigned int count = 0, gc_count = 0;
-	bool do_gc = true;
+	struct nf_conncount_rb *rbconn;
+	struct nf_conn *ct = NULL;
 
 	spin_lock_bh(&nf_conncount_locks[hash]);
 restart:
@@ -340,7 +397,7 @@ restart:
 		} else {
 			int ret;
 
-			ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
+			ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list);
 			if (ret)
 				count = 0; /* hotdrop */
 			else
@@ -364,30 +421,35 @@ restart:
 		goto restart;
 	}
 
-	/* expected case: match, insert new node */
-	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
-	if (rbconn == NULL)
-		goto out_unlock;
+	if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) {
+		/* expected case: match, insert new node */
+		rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+		if (rbconn == NULL)
+			goto out_unlock;
 
-	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL) {
-		kmem_cache_free(conncount_rb_cachep, rbconn);
-		goto out_unlock;
-	}
+		conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+		if (conn == NULL) {
+			kmem_cache_free(conncount_rb_cachep, rbconn);
+			goto out_unlock;
+		}
 
-	conn->tuple = *tuple;
-	conn->zone = *zone;
-	conn->cpu = raw_smp_processor_id();
-	conn->jiffies32 = (u32)jiffies;
-	memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
+		conn->tuple = tuple;
+		conn->zone = *zone;
+		conn->cpu = raw_smp_processor_id();
+		conn->jiffies32 = (u32)jiffies;
+		memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
+
+		nf_conncount_list_init(&rbconn->list);
+		list_add(&conn->node, &rbconn->list.head);
+		count = 1;
+		rbconn->list.count = count;
 
-	nf_conncount_list_init(&rbconn->list);
-	list_add(&conn->node, &rbconn->list.head);
-	count = 1;
-	rbconn->list.count = count;
+		rb_link_node_rcu(&rbconn->node, parent, rbnode);
+		rb_insert_color(&rbconn->node, root);
 
-	rb_link_node_rcu(&rbconn->node, parent, rbnode);
-	rb_insert_color(&rbconn->node, root);
+		if (refcounted)
+			nf_ct_put(ct);
+	}
 out_unlock:
 	spin_unlock_bh(&nf_conncount_locks[hash]);
 	return count;
@@ -395,10 +457,10 @@ out_unlock:
 
 static unsigned int
 count_tree(struct net *net,
+	   const struct sk_buff *skb,
+	   u16 l3num,
 	   struct nf_conncount_data *data,
-	   const u32 *key,
-	   const struct nf_conntrack_tuple *tuple,
-	   const struct nf_conntrack_zone *zone)
+	   const u32 *key)
 {
 	struct rb_root *root;
 	struct rb_node *parent;
@@ -422,7 +484,7 @@ count_tree(struct net *net,
 		} else {
 			int ret;
 
-			if (!tuple) {
+			if (!skb) {
 				nf_conncount_gc_list(net, &rbconn->list);
 				return rbconn->list.count;
 			}
@@ -437,7 +499,7 @@ count_tree(struct net *net,
 			}
 
 			/* same source network -> be counted! */
-			ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
+			ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
 			spin_unlock_bh(&rbconn->list.list_lock);
 			if (ret)
 				return 0; /* hotdrop */
@@ -446,10 +508,10 @@ count_tree(struct net *net,
 		}
 	}
 
-	if (!tuple)
+	if (!skb)
 		return 0;
 
-	return insert_tree(net, data, root, hash, key, tuple, zone);
+	return insert_tree(net, skb, l3num, data, root, hash, key);
 }
 
 static void tree_gc_worker(struct work_struct *work)
@@ -511,18 +573,19 @@ next:
 }
 
 /* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- * Call with RCU read lock.
+ * If 'skb' is not null, insert the corresponding tuple into the accounting
+ * data structure. Call with RCU read lock.
  */
-unsigned int nf_conncount_count(struct net *net,
-				struct nf_conncount_data *data,
-				const u32 *key,
-				const struct nf_conntrack_tuple *tuple,
-				const struct nf_conntrack_zone *zone)
+unsigned int nf_conncount_count_skb(struct net *net,
+				    const struct sk_buff *skb,
+				    u16 l3num,
+				    struct nf_conncount_data *data,
+				    const u32 *key)
 {
-	return count_tree(net, data, key, tuple, zone);
+	return count_tree(net, skb, l3num, data, key);
+
 }
-EXPORT_SYMBOL_GPL(nf_conncount_count);
+EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
 
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
 {
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index fc35a11cdca2..5df7134131d2 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -24,26 +24,11 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 					 const struct nft_pktinfo *pkt,
 					 const struct nft_set_ext *ext)
 {
-	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
-	const struct nf_conntrack_tuple *tuple_ptr;
-	struct nf_conntrack_tuple tuple;
-	enum ip_conntrack_info ctinfo;
-	const struct nf_conn *ct;
 	unsigned int count;
+	int err;
 
-	tuple_ptr = &tuple;
-
-	ct = nf_ct_get(pkt->skb, &ctinfo);
-	if (ct != NULL) {
-		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-		zone = nf_ct_zone(ct);
-	} else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
-				      nft_pf(pkt), nft_net(pkt), &tuple)) {
-		regs->verdict.code = NF_DROP;
-		return;
-	}
-
-	if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) {
+	err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list);
+	if (err) {
 		regs->verdict.code = NF_DROP;
 		return;
 	}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 0189f8b6b0bd..848287ab79cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct net *net = xt_net(par);
 	const struct xt_connlimit_info *info = par->matchinfo;
-	struct nf_conntrack_tuple tuple;
-	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
@@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	u32 key[5];
 
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct != NULL) {
-		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	if (ct)
 		zone = nf_ct_zone(ct);
-	} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
-				      xt_family(par), net, &tuple)) {
-		goto hotdrop;
-	}
 
 	if (xt_family(par) == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		key[1] = zone->id;
 	}
 
-	connections = nf_conncount_count(net, info->data, key, tuple_ptr,
-					 zone);
+	connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key);
 	if (connections == 0)
-		/* kmalloc failed, drop it entirely */
+		/* kmalloc failed or tuple couldn't be found, drop it entirely */
 		goto hotdrop;
 
 	return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e573e9221302..a0811e1fba65 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -928,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
 }
 
 static int ovs_ct_check_limit(struct net *net,
-			      const struct ovs_conntrack_info *info,
-			      const struct nf_conntrack_tuple *tuple)
+			      const struct sk_buff *skb,
+			      const struct ovs_conntrack_info *info)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
@@ -942,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net,
 	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
 		return 0;
 
-	connections = nf_conncount_count(net, ct_limit_info->data,
-					 &conncount_key, tuple, &info->zone);
+	connections = nf_conncount_count_skb(net, skb, info->family,
+					     ct_limit_info->data,
+					     &conncount_key);
 	if (connections > per_zone_limit)
 		return -ENOMEM;
 
@@ -972,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 #if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
 	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
 		if (!nf_ct_is_confirmed(ct)) {
-			err = ovs_ct_check_limit(net, info,
-				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+			err = ovs_ct_check_limit(net, skb, info);
 			if (err) {
 				net_warn_ratelimited("openvswitch: zone: %u "
 					"exceeds conntrack limit\n",
@@ -1770,8 +1770,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net,
 	zone_limit.limit = limit;
 	nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
 
-	zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL,
-					      &ct_zone);
+	zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data,
+						  &conncount_key);
 	return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
 }
 
-- 
cgit v1.2.3


From c4f0ab06e1e0c1331e6febd03538a7f621f15134 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 1 Nov 2025 12:20:50 -0700
Subject: netfilter: ip6t_srh: fix UAPI kernel-doc comments format

Fix the kernel-doc format for struct members to be "@member" instead of
"@ member" to avoid kernel-doc warnings.

Warning: ip6t_srh.h:60 struct member 'next_hdr' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'hdr_len' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'segs_left' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'last_entry' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'tag' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'mt_flags' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'mt_invflags' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:93 struct member 'next_hdr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'hdr_len' not described in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'segs_left' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'last_entry' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'tag' not described in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'psid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'nsid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'lsid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'psid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'nsid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'lsid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'mt_flags' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'mt_invflags' not described
 in 'ip6t_srh1'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 40 ++++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
index 54ed83360dac..80c66c8ece82 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
@@ -41,13 +41,13 @@
 
 /**
  *      struct ip6t_srh - SRH match options
- *      @ next_hdr: Next header field of SRH
- *      @ hdr_len: Extension header length field of SRH
- *      @ segs_left: Segments left field of SRH
- *      @ last_entry: Last entry field of SRH
- *      @ tag: Tag field of SRH
- *      @ mt_flags: match options
- *      @ mt_invflags: Invert the sense of match options
+ *      @next_hdr: Next header field of SRH
+ *      @hdr_len: Extension header length field of SRH
+ *      @segs_left: Segments left field of SRH
+ *      @last_entry: Last entry field of SRH
+ *      @tag: Tag field of SRH
+ *      @mt_flags: match options
+ *      @mt_invflags: Invert the sense of match options
  */
 
 struct ip6t_srh {
@@ -62,19 +62,19 @@ struct ip6t_srh {
 
 /**
  *      struct ip6t_srh1 - SRH match options (revision 1)
- *      @ next_hdr: Next header field of SRH
- *      @ hdr_len: Extension header length field of SRH
- *      @ segs_left: Segments left field of SRH
- *      @ last_entry: Last entry field of SRH
- *      @ tag: Tag field of SRH
- *      @ psid_addr: Address of previous SID in SRH SID list
- *      @ nsid_addr: Address of NEXT SID in SRH SID list
- *      @ lsid_addr: Address of LAST SID in SRH SID list
- *      @ psid_msk: Mask of previous SID in SRH SID list
- *      @ nsid_msk: Mask of next SID in SRH SID list
- *      @ lsid_msk: MAsk of last SID in SRH SID list
- *      @ mt_flags: match options
- *      @ mt_invflags: Invert the sense of match options
+ *      @next_hdr: Next header field of SRH
+ *      @hdr_len: Extension header length field of SRH
+ *      @segs_left: Segments left field of SRH
+ *      @last_entry: Last entry field of SRH
+ *      @tag: Tag field of SRH
+ *      @psid_addr: Address of previous SID in SRH SID list
+ *      @nsid_addr: Address of NEXT SID in SRH SID list
+ *      @lsid_addr: Address of LAST SID in SRH SID list
+ *      @psid_msk: Mask of previous SID in SRH SID list
+ *      @nsid_msk: Mask of next SID in SRH SID list
+ *      @lsid_msk: MAsk of last SID in SRH SID list
+ *      @mt_flags: match options
+ *      @mt_invflags: Invert the sense of match options
  */
 
 struct ip6t_srh1 {
-- 
cgit v1.2.3


From d3a439e55c193b930e0007967cf8d7a29890449b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 1 Nov 2025 12:20:38 -0700
Subject: netfilter: nf_tables: improve UAPI kernel-doc comments

In include/uapi/linux/netfilter/nf_tables.h,
correct the kernel-doc comments for mistyped enum names and enum values to
avoid these kernel-doc warnings and improve the documentation:

nf_tables.h:896: warning: Enum value 'NFT_EXTHDR_OP_TCPOPT' not described
 in enum 'nft_exthdr_op'
nf_tables.h:896: warning: Excess enum value 'NFT_EXTHDR_OP_TCP' description
 in 'nft_exthdr_op'

nf_tables.h:1210: warning: expecting prototype for enum
 nft_flow_attributes. Prototype was for enum nft_offload_attributes instead

nf_tables.h:1428: warning: expecting prototype for enum nft_reject_code.
 Prototype was for enum nft_reject_inet_code instead

(add beginning '@' to each enum value description:)
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_FAMILY' not described
 in enum 'nft_tproxy_attributes'
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_ADDR' not described
 in enum 'nft_tproxy_attributes'
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_PORT' not described
 in enum 'nft_tproxy_attributes'

nf_tables.h:1796: warning: expecting prototype for enum
 nft_device_attributes. Prototype was for enum
 nft_devices_attributes instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 7c0c915f0306..45c71f7d21c2 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -881,7 +881,7 @@ enum nft_exthdr_flags {
  * enum nft_exthdr_op - nf_tables match options
  *
  * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers
- * @NFT_EXTHDR_OP_TCP: match against tcp options
+ * @NFT_EXTHDR_OP_TCPOPT: match against tcp options
  * @NFT_EXTHDR_OP_IPV4: match against ipv4 options
  * @NFT_EXTHDR_OP_SCTP: match against sctp chunks
  * @NFT_EXTHDR_OP_DCCP: match against dccp otions
@@ -1200,7 +1200,7 @@ enum nft_ct_attributes {
 #define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
 
 /**
- * enum nft_flow_attributes - ct offload expression attributes
+ * enum nft_offload_attributes - ct offload expression attributes
  * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING)
  */
 enum nft_offload_attributes {
@@ -1410,7 +1410,7 @@ enum nft_reject_types {
 };
 
 /**
- * enum nft_reject_code - Generic reject codes for IPv4/IPv6
+ * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6
  *
  * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable
  * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable
@@ -1480,9 +1480,9 @@ enum nft_nat_attributes {
 /**
  * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes
  *
- * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers)
- * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers)
- * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers)
+ * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers)
+ * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers)
+ * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers)
  */
 enum nft_tproxy_attributes {
 	NFTA_TPROXY_UNSPEC,
@@ -1783,7 +1783,7 @@ enum nft_synproxy_attributes {
 #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1)
 
 /**
- * enum nft_device_attributes - nf_tables device netlink attributes
+ * enum nft_devices_attributes - nf_tables device netlink attributes
  *
  * @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
  * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING)
-- 
cgit v1.2.3


From f6ed9c5d3190cf18382ee75e0420602101f53586 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2025 14:52:49 -0500
Subject: overflow: Introduce struct_offset() to get offset of member

The trace_marker_raw file in tracefs takes a buffer from user space that
contains an id as well as a raw data string which is usually a binary
structure. The structure used has the following:

	struct raw_data_entry {
		struct trace_entry	ent;
		unsigned int		id;
		char			buf[];
	};

Since the passed in "cnt" variable is both the size of buf as well as the
size of id, the code to allocate the location on the ring buffer had:

   size = struct_size(entry, buf, cnt - sizeof(entry->id));

Which is quite ugly and hard to understand. Instead, add a helper macro
called struct_offset() which then changes the above to a simple and easy
to understand:

   size = struct_offset(entry, id) + cnt;

This will likely come in handy for other use cases too.

Link: https://lore.kernel.org/all/CAHk-=whYZVoEdfO1PmtbirPdBMTV9Nxt9f09CK0k6S+HJD3Zmg@mail.gmail.com/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://patch.msgid.link/20251126145249.05b1770a@gandalf.local.home
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/overflow.h | 12 ++++++++++++
 kernel/trace/trace.c     |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 725f95f7e416..736f633b2d5f 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -458,6 +458,18 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 #define struct_size_t(type, member, count)					\
 	struct_size((type *)NULL, member, count)
 
+/**
+ * struct_offset() - Calculate the offset of a member within a struct
+ * @p: Pointer to the struct
+ * @member: Name of the member to get the offset of
+ *
+ * Calculates the offset of a particular @member of the structure pointed
+ * to by @p.
+ *
+ * Return: number of bytes to the location of @member.
+ */
+#define struct_offset(p, member) (offsetof(typeof(*(p)), member))
+
 /**
  * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
  * Enables caller macro to pass arbitrary trailing expressions
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73f8b79f1b0c..3d433a426e5f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7642,7 +7642,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
 	size_t size;
 
 	/* cnt includes both the entry->id and the data behind it. */
-	size = struct_size(entry, buf, cnt - sizeof(entry->id));
+	size = struct_offset(entry, id) + cnt;
 
 	buffer = tr->array_buffer.buffer;
 
-- 
cgit v1.2.3


From df59bb5b9af3fc24d957261e9f80f0c0dec151a4 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Wed, 26 Nov 2025 13:36:46 +0900
Subject: netmem, devmem, tcp: access pp fields through @desc in net_iov

Convert all the legacy code directly accessing the pp fields in net_iov
to access them through @desc in net_iov.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 net/core/devmem.c      | 6 +++---
 net/ipv4/tcp.c         | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff90281ddf90..86737076101d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3778,8 +3778,8 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
 					    enum dma_data_direction dir)
 {
 	if (skb_frag_is_net_iov(frag)) {
-		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
-		       frag->offset;
+		return netmem_to_net_iov(frag->netmem)->desc.dma_addr +
+		       offset + frag->offset;
 	}
 	return dma_map_page(dev, skb_frag_page(frag),
 			    skb_frag_off(frag) + offset, size, dir);
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 1d04754bc756..ec4217d6c0b4 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -97,9 +97,9 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
 	index = offset / PAGE_SIZE;
 	niov = &owner->area.niovs[index];
 
-	niov->pp_magic = 0;
-	niov->pp = NULL;
-	atomic_long_set(&niov->pp_ref_count, 0);
+	niov->desc.pp_magic = 0;
+	niov->desc.pp = NULL;
+	atomic_long_set(&niov->desc.pp_ref_count, 0);
 
 	return niov;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dee578aad690..f035440c475a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2587,7 +2587,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 				if (err)
 					goto out;
 
-				atomic_long_inc(&niov->pp_ref_count);
+				atomic_long_inc(&niov->desc.pp_ref_count);
 				tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
 
 				sent += copy;
-- 
cgit v1.2.3


From 6557cae0a2a1952645e5df50e1d6eb7267ea2131 Mon Sep 17 00:00:00 2001
From: Peter Enderborg <peterend@axis.com>
Date: Wed, 26 Nov 2025 14:54:06 +0100
Subject: if_ether.h: Clarify ethertype validity for gsw1xx dsa

This 0x88C3 is registered to Infineon Technologies Corporate Research ST
and is used by MaxLinear.
Infineon made a spin-off called Lantiq.
Lantiq was acquired by Intel.
MaxLinear acquired Intel's Connected Home division.

The product FAQ from MaxLinear describes its history from the F24S.
The driver for the gsw1xx is based on Lantiq, showing its similarities.

Ref https://standards-oui.ieee.org/ethertype/eth.txt

Signed-off-by: Peter Enderborg <Peter.Enderborg@axis.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_ether.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 2c93b7b731c8..df9d44a11540 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -92,7 +92,9 @@
 #define ETH_P_ETHERCAT	0x88A4		/* EtherCAT			*/
 #define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
-#define ETH_P_MXLGSW	0x88C3		/* MaxLinear GSW DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_MXLGSW	0x88C3		/* Infineon Technologies Corporate Research ST
+					 * Used by MaxLinear GSW DSA
+					 */
 #define ETH_P_PREAUTH	0x88C7		/* 802.11 Preauthentication */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
 #define ETH_P_LLDP	0x88CC		/* Link Layer Discovery Protocol */
-- 
cgit v1.2.3


From d856f9d27885c499d96ab7fe506083346ccf145d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 27 Nov 2025 19:54:07 -0400
Subject: iommupt/vtd: Allow VT-d to have a larger table top than the vasz
 requires

VT-d second stage HW specifies both the maximum IOVA and the supported
table walk starting points. Weirdly there is HW that only supports a 4
level walk but has a maximum IOVA that only needs 3.

The current code miscalculates this and creates a wrongly sized page table
which ultimately fails the compatibility check for number of levels.

This is fixed by allowing the page table to be created with both a vasz
and top_level input. The vasz will set the aperture for the domain while
the top_level will set the page table geometry.

Add top_level to vtdss and correct the logic in VT-d to generate the right
top_level and vasz from mgaw and sagaw.

Fixes: d373449d8e97 ("iommu/vt-d: Use the generic iommu page table")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Closes: https://lore.kernel.org/r/8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Tested-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/generic_pt/fmt/vtdss.h | 19 ++++++-------------
 drivers/iommu/generic_pt/iommu_pt.h  | 14 ++++++++++++++
 drivers/iommu/intel/iommu.c          | 20 +++++++++++++-------
 include/linux/generic_pt/iommu.h     |  2 ++
 4 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
index 50ffed9d0e50..f5f8981edde7 100644
--- a/drivers/iommu/generic_pt/fmt/vtdss.h
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -248,18 +248,11 @@ static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
 					  const struct pt_iommu_vtdss_cfg *cfg)
 {
 	struct pt_vtdss *table = &iommu_table->vtdss_pt;
-	unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
 
-	if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
-		return -EOPNOTSUPP;
-	else if (vasz_lg2 > 48)
-		pt_top_set_level(&table->common, 4);
-	else if (vasz_lg2 > 39)
-		pt_top_set_level(&table->common, 3);
-	else if (vasz_lg2 > 30)
-		pt_top_set_level(&table->common, 2);
-	else
+	if (cfg->top_level > 4 || cfg->top_level < 2)
 		return -EOPNOTSUPP;
+
+	pt_top_set_level(&table->common, cfg->top_level);
 	return 0;
 }
 #define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
@@ -282,9 +275,9 @@ vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
 
 #if defined(GENERIC_PT_KUNIT)
 static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
-	[0] = { .common.hw_max_vasz_lg2 = 39 },
-	[1] = { .common.hw_max_vasz_lg2 = 48 },
-	[2] = { .common.hw_max_vasz_lg2 = 57 },
+	[0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2},
+	[1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3},
+	[2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4},
 };
 #define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
 enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 032d04ec7b56..97aeda1ad01c 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -1128,6 +1128,20 @@ static int pt_init_common(struct pt_common *common)
 		     PT_FORCE_ENABLED_FEATURES))
 		return -EOPNOTSUPP;
 
+	/*
+	 * Check if the top level of the page table is too small to hold the
+	 * specified maxvasz.
+	 */
+	if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+	    top_range.top_level != PT_MAX_TOP_LEVEL) {
+		struct pt_state pts = { .range = &top_range,
+					.level = top_range.top_level };
+
+		if (common->max_vasz_lg2 >
+		    pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
+			return -EOPNOTSUPP;
+	}
+
 	if (common->max_oasz_lg2 == 0)
 		common->max_oasz_lg2 = pt_max_oa_lg2(common);
 	else
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7b3016491ca5..f117349d67db 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2858,22 +2858,28 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
 	return &dmar_domain->domain;
 }
 
-static int compute_vasz_lg2_ss(struct intel_iommu *iommu)
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+					unsigned int *top_level)
 {
 	unsigned int sagaw = cap_sagaw(iommu->cap);
 	unsigned int mgaw = cap_mgaw(iommu->cap);
 
 	/*
 	 * Find the largest table size that both the mgaw and sagaw support.
-	 * This sets both the number of table levels and the valid range of
-	 * IOVA.
+	 * This sets the valid range of IOVA and the top starting level.
+	 * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+	 * 3 levels.
 	 */
-	if (mgaw >= 48 && (sagaw & BIT(3)))
+	if (mgaw > 48 && sagaw >= BIT(3)) {
+		*top_level = 4;
 		return min(57, mgaw);
-	else if (mgaw >= 39 && (sagaw & BIT(2)))
+	} else if (mgaw > 39 && sagaw >= BIT(2)) {
+		*top_level = 3 + ffs(sagaw >> 3);
 		return min(48, mgaw);
-	else if (mgaw >= 30 && (sagaw & BIT(1)))
+	} else if (mgaw > 30 && sagaw >= BIT(1)) {
+		*top_level = 2 + ffs(sagaw >> 2);
 		return min(39, mgaw);
+	}
 	return 0;
 }
 
@@ -2910,7 +2916,7 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
 
-	cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu);
+	cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
 	cfg.common.hw_max_oasz_lg2 = 52;
 	cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
 
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index cfe05a77f86b..c134132ed10f 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -264,6 +264,8 @@ IOMMU_PROTOTYPES(amdv1_mock);
 
 struct pt_iommu_vtdss_cfg {
 	struct pt_iommu_cfg common;
+	/* 4 is a 57 bit 5 level table */
+	unsigned int top_level;
 };
 
 struct pt_iommu_vtdss_hw_info {
-- 
cgit v1.2.3


From 1eb0ae6fbd544619c50b4a4d96ccb4676cac03cb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 27 Nov 2025 19:54:08 -0400
Subject: iommupt/vtd: Support mgaw's less than a 4 level walk for first stage

If the IOVA is limited to less than 48 the page table will be constructed
with a 3 level configuration which is unsupported by hardware.

Like the second stage, the caller needs to pass in both the top_level and
the vasz to specify a table that has more levels than required to hold the
IOVA range.

Fixes: 6cbc09b7719e ("iommu/vt-d: Restore previous domain::aperture_end calculation")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Closes: https://lore.kernel.org/r/8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Tested-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/amd/iommu.c             |  7 +++++--
 drivers/iommu/generic_pt/fmt/x86_64.h | 17 +++++++---------
 drivers/iommu/intel/iommu.c           | 38 ++++++++++++++++++++++-------------
 include/linux/generic_pt/iommu.h      |  2 ++
 4 files changed, 38 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 48bca4dc8eb6..273951b4501c 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2708,10 +2708,13 @@ static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
 	 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
 	 * set which creates a table that is compatible in both modes.
 	 */
-	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
 		cfg.common.hw_max_vasz_lg2 = 56;
-	else
+		cfg.top_level = 4;
+	} else {
 		cfg.common.hw_max_vasz_lg2 = 47;
+		cfg.top_level = 3;
+	}
 	cfg.common.hw_max_oasz_lg2 = 52;
 	domain->domain.ops = &amdv2_ops;
 
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
index 507abf2c934c..210748d9d6e8 100644
--- a/drivers/iommu/generic_pt/fmt/x86_64.h
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -241,13 +241,10 @@ x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
 {
 	struct pt_x86_64 *table = &iommu_table->x86_64_pt;
 
-	if (cfg->common.hw_max_vasz_lg2 < 31 ||
-	    cfg->common.hw_max_vasz_lg2 > 57)
-		return -EINVAL;
+	if (cfg->top_level < 3 || cfg->top_level > 4)
+		return -EOPNOTSUPP;
 
-	/* Top of 2, 3, 4 */
-	pt_top_set_level(&table->common,
-			 (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2);
+	pt_top_set_level(&table->common, cfg->top_level);
 
 	table->common.max_oasz_lg2 =
 		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
@@ -269,12 +266,12 @@ x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
 #if defined(GENERIC_PT_KUNIT)
 static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
 	[0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
-		.common.hw_max_vasz_lg2 = 48 },
+		.common.hw_max_vasz_lg2 = 48, .top_level = 3 },
 	[1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
-		.common.hw_max_vasz_lg2 = 57 },
+		.common.hw_max_vasz_lg2 = 57, .top_level = 4 },
 	/* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
-	[2] = { .common.hw_max_vasz_lg2 = 47 },
-	[3] = { .common.hw_max_vasz_lg2 = 56 },
+	[2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
+	[3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4},
 };
 #define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
 enum { KUNIT_FMT_FEATURES =  BIT(PT_FEAT_SIGN_EXTEND)};
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index f117349d67db..4e888867e85c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2794,6 +2794,28 @@ static struct dmar_domain *paging_domain_alloc(void)
 	return domain;
 }
 
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+					unsigned int *top_level)
+{
+	unsigned int mgaw = cap_mgaw(iommu->cap);
+
+	/*
+	 * Spec 3.6 First-Stage Translation:
+	 *
+	 * Software must limit addresses to less than the minimum of MGAW
+	 * and the lower canonical address width implied by FSPM (i.e.,
+	 * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
+	 */
+	if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+		*top_level = 4;
+		return min(57, mgaw);
+	}
+
+	/* Four level is always supported */
+	*top_level = 3;
+	return min(48, mgaw);
+}
+
 static struct iommu_domain *
 intel_iommu_domain_alloc_first_stage(struct device *dev,
 				     struct intel_iommu *iommu, u32 flags)
@@ -2813,20 +2835,8 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
 
-	if (cap_fl5lp_support(iommu->cap))
-		cfg.common.hw_max_vasz_lg2 = 57;
-	else
-		cfg.common.hw_max_vasz_lg2 = 48;
-
-	/*
-	 * Spec 3.6 First-Stage Translation:
-	 *
-	 * Software must limit addresses to less than the minimum of MGAW
-	 * and the lower canonical address width implied by FSPM (i.e.,
-	 * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
-	 */
-	cfg.common.hw_max_vasz_lg2 = min(cap_mgaw(iommu->cap),
-					 cfg.common.hw_max_vasz_lg2);
+	cfg.common.hw_max_vasz_lg2 =
+		compute_vasz_lg2_fs(iommu, &cfg.top_level);
 	cfg.common.hw_max_oasz_lg2 = 52;
 	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
 			      BIT(PT_FEAT_FLUSH_RANGE);
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index c134132ed10f..9eefbb74efd0 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -277,6 +277,8 @@ IOMMU_FORMAT(vtdss, vtdss_pt);
 
 struct pt_iommu_x86_64_cfg {
 	struct pt_iommu_cfg common;
+	/* 4 is a 57 bit 5 level table */
+	unsigned int top_level;
 };
 
 struct pt_iommu_x86_64_hw_info {
-- 
cgit v1.2.3


From 4be9e04ebf75a5c4478c1c6295e2122e5dc98f5f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 28 Nov 2025 10:55:09 +0100
Subject: vfs: add needed headers for new struct delegation definition

The definition of struct delegation uses stdint.h integer types. Add the
necessary headers to ensure that always works.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 008fac15e573..5e277fd955aa 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -4,6 +4,11 @@
 
 #include <asm/fcntl.h>
 #include <linux/openat2.h>
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
 
 #define F_SETLEASE	(F_LINUX_SPECIFIC_BASE + 0)
 #define F_GETLEASE	(F_LINUX_SPECIFIC_BASE + 1)
-- 
cgit v1.2.3


From 5aefbf5b68794870ccec126cd68bbfd1ee09283a Mon Sep 17 00:00:00 2001
From: "Derek J. Clark" <derekjohn.clark@gmail.com>
Date: Thu, 27 Nov 2025 07:16:03 -0800
Subject: acpi: platform_profile - Add max-power profile option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some devices, namely Lenovo Legion devices, have an "extreme" mode where
power draw is at the maximum limit of the cooling hardware. Add a new
"max-power" platform profile to properly reflect this operating mode.

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Signed-off-by: Derek J. Clark <derekjohn.clark@gmail.com>
Reviewed-by: Armin Wolf <W_Armin@gmx.de>
Reviewed-by: Mark Pearson <mpearson-lenovo@squebb.ca>
Link: https://patch.msgid.link/20251127151605.1018026-2-derekjohn.clark@gmail.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-class-platform-profile | 2 ++
 drivers/acpi/platform_profile.c                        | 7 +++++--
 include/linux/platform_profile.h                       | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-platform-profile b/Documentation/ABI/testing/sysfs-class-platform-profile
index dc72adfb830a..fcab26894ec3 100644
--- a/Documentation/ABI/testing/sysfs-class-platform-profile
+++ b/Documentation/ABI/testing/sysfs-class-platform-profile
@@ -23,6 +23,8 @@ Description:	This file contains a space-separated list of profiles supported
 					power consumption with a slight bias
 					towards performance
 		performance		High performance operation
+		max-power		Higher performance operation that may exceed
+					internal battery draw limits when on AC power
 		custom			Driver defined custom profile
 		====================	========================================
 
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index b43f4459a4f6..ea04a8c69215 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -37,6 +37,7 @@ static const char * const profile_names[] = {
 	[PLATFORM_PROFILE_BALANCED] = "balanced",
 	[PLATFORM_PROFILE_BALANCED_PERFORMANCE] = "balanced-performance",
 	[PLATFORM_PROFILE_PERFORMANCE] = "performance",
+	[PLATFORM_PROFILE_MAX_POWER] = "max-power",
 	[PLATFORM_PROFILE_CUSTOM] = "custom",
 };
 static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST);
@@ -506,7 +507,8 @@ int platform_profile_cycle(void)
 		if (err)
 			return err;
 
-		if (profile == PLATFORM_PROFILE_CUSTOM ||
+		if (profile == PLATFORM_PROFILE_MAX_POWER ||
+		    profile == PLATFORM_PROFILE_CUSTOM ||
 		    profile == PLATFORM_PROFILE_LAST)
 			return -EINVAL;
 
@@ -515,7 +517,8 @@ int platform_profile_cycle(void)
 		if (err)
 			return err;
 
-		/* never iterate into a custom if all drivers supported it */
+		/* never iterate into a custom or max power if all drivers supported it */
+		clear_bit(PLATFORM_PROFILE_MAX_POWER, data.aggregate);
 		clear_bit(PLATFORM_PROFILE_CUSTOM, data.aggregate);
 
 		next = find_next_bit_wrap(data.aggregate,
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index a299225ab92e..855b28340e95 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -24,6 +24,7 @@ enum platform_profile_option {
 	PLATFORM_PROFILE_BALANCED,
 	PLATFORM_PROFILE_BALANCED_PERFORMANCE,
 	PLATFORM_PROFILE_PERFORMANCE,
+	PLATFORM_PROFILE_MAX_POWER,
 	PLATFORM_PROFILE_CUSTOM,
 	PLATFORM_PROFILE_LAST, /*must always be last */
 };
-- 
cgit v1.2.3


From 011703a9acd76edc7c85d80dbccb6e50dba53aad Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 23 Nov 2025 17:33:19 +0100
Subject: file: add FD_{ADD,PREPARE}()

I've been playing with this to allow for moderately flexible usage of
the get_unused_fd_flags() + create file + fd_install() pattern that's
used quite extensively.

How callers allocate files is really heterogeneous so it's not really
convenient to fold them into a single class. It's possible to split them
into subclasses like for anon inodes. I think that's not necessarily
nice either.

My take is to add two primitives:
(1) FD_ADD() for the simple cases where a file is installed:

    fd = FD_ADD(O_CLOEXEC, open_file(some, args));
    if (fd >= 0)
            kvm_get_kvm(vcpu->kvm);
    return fd;

(2) FD_PREPARE() that captures all the cases where access to fd or file
    or additional work before publishing the fd is needed:

    FD_PREPARE(fdf, open_flag, file_open_handle(&path, open_flag));
    if (fdf.err)
            return fdf.err;

    if (copy_to_user(/* something something */))
            return -EFAULT;

    return fd_publish(fdf);

I've converted all of the easy cases over to it and it gets rid of an
awful lot of convoluted cleanup logic.

It's centered around struct fd_prepare. FD_PREPARE() encapsulates all of
the allocation and cleanup logic and must be followed by a call to
fd_publish() which associates the fd with the file and installs it into
the caller's fd table. If fd_publish() isn't called both are deallocated.

It mandates a specific order, namely that first we allocate the fd and
then instantiate the file. But that shouldn't be a problem since nearly
everyone I've converted uses this exact pattern anyway.

There's a bunch of additional cases where it would be easy to convert
them to this pattern. For example, the whole sync file stuff in dma
currently retains the containing structure of the file instead of the
file itself even though it's only used to allocate files. Changing that
would make it fall into the FD_PREPARE() pattern easily. I've not done
that work yet.

There's room for extending this in a way that we'd have subclasses for
some particularly often-used patterns but as I said I'm not even sure
that's worth it.

Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-1-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cleanup.h |   7 +++
 include/linux/file.h    | 126 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..361104bcfe92 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val)
  * CLASS(name, var)(args...):
  *	declare the variable @var as an instance of the named class
  *
+ * CLASS_INIT(name, var, init_expr):
+ *	declare the variable @var as an instance of the named class with
+ *	custom initialization expression.
+ *
  * Ex.
  *
  * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
@@ -290,6 +294,9 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 	class_##_name##_t var __cleanup(class_##_name##_destructor) =	\
 		class_##_name##_constructor
 
+#define CLASS_INIT(_name, _var, _init_expr)                             \
+        class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr)
+
 #define scoped_class(_name, var, args)                          \
 	for (CLASS(_name, var)(args);                           \
 	     __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \
diff --git a/include/linux/file.h b/include/linux/file.h
index af1768d934a0..cf389fde9bc2 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -127,4 +127,130 @@ extern void __fput_sync(struct file *);
 
 extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
 
+/*
+ * fd_prepare: Combined fd + file allocation cleanup class.
+ * @err: Error code to indicate if allocation succeeded.
+ * @__fd: Allocated fd (may not be accessed directly)
+ * @__file: Allocated struct file pointer (may not be accessed directly)
+ *
+ * Allocates an fd and a file together. On error paths, automatically cleans
+ * up whichever resource was successfully allocated. Allows flexible file
+ * allocation with different functions per usage.
+ *
+ * Do not use directly.
+ */
+struct fd_prepare {
+	s32 err;
+	s32 __fd; /* do not access directly */
+	struct file *__file; /* do not access directly */
+};
+
+/* Typedef for fd_prepare cleanup guards. */
+typedef struct fd_prepare class_fd_prepare_t;
+
+/*
+ * Accessors for fd_prepare class members.
+ * _Generic() is used for zero-cost type safety.
+ */
+#define fd_prepare_fd(_fdf) \
+	(_Generic((_fdf), struct fd_prepare: (_fdf).__fd))
+
+#define fd_prepare_file(_fdf) \
+	(_Generic((_fdf), struct fd_prepare: (_fdf).__file))
+
+/* Do not use directly. */
+static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf)
+{
+	if (unlikely(fdf->err)) {
+		if (likely(fdf->__fd >= 0))
+			put_unused_fd(fdf->__fd);
+		if (unlikely(!IS_ERR_OR_NULL(fdf->__file)))
+			fput(fdf->__file);
+	}
+}
+
+/* Do not use directly. */
+static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf)
+{
+	if (unlikely(fdf->err))
+		return fdf->err;
+	if (unlikely(fdf->__fd < 0))
+		return fdf->__fd;
+	if (unlikely(IS_ERR(fdf->__file)))
+		return PTR_ERR(fdf->__file);
+	if (unlikely(!fdf->__file))
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * __FD_PREPARE_INIT - Helper to initialize fd_prepare class.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: expression that returns struct file *
+ *
+ * Returns a struct fd_prepare with fd, file, and err set.
+ * If fd allocation fails, fd will be negative and err will be set. If
+ * fd succeeds but file_init_expr fails, file will be ERR_PTR and err
+ * will be set. The err field is the single source of truth for error
+ * checking.
+ */
+#define __FD_PREPARE_INIT(_fd_flags, _file_owned)                 \
+	({                                                        \
+		struct fd_prepare fdf = {                         \
+			.__fd = get_unused_fd_flags((_fd_flags)), \
+		};                                                \
+		if (likely(fdf.__fd >= 0))                        \
+			fdf.__file = (_file_owned);               \
+		fdf.err = ACQUIRE_ERR(fd_prepare, &fdf);          \
+		fdf;                                              \
+	})
+
+/*
+ * FD_PREPARE - Macro to declare and initialize an fd_prepare variable.
+ *
+ * Declares and initializes an fd_prepare variable with automatic
+ * cleanup. No separate scope required - cleanup happens when variable
+ * goes out of scope.
+ *
+ * @_fdf: name of struct fd_prepare variable to define
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of (can be expression)
+ */
+#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \
+	CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned))
+
+/*
+ * fd_publish - Publish prepared fd and file to the fd table.
+ * @_fdf: struct fd_prepare variable
+ */
+#define fd_publish(_fdf)                                       \
+	({                                                     \
+		struct fd_prepare *fdp = &(_fdf);              \
+		VFS_WARN_ON_ONCE(fdp->err);                    \
+		VFS_WARN_ON_ONCE(fdp->__fd < 0);               \
+		VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \
+		fd_install(fdp->__fd, fdp->__file);            \
+		fdp->__fd;                                     \
+	})
+
+/* Do not use directly. */
+#define __FD_ADD(_fdf, _fd_flags, _file_owned)            \
+	({                                                \
+		FD_PREPARE(_fdf, _fd_flags, _file_owned); \
+		s32 ret = _fdf.err;                       \
+		if (likely(!ret))                         \
+			ret = fd_publish(_fdf);           \
+		ret;                                      \
+	})
+
+/*
+ * FD_ADD - Allocate and install an fd and file in one step.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of
+ *
+ * Returns the allocated fd number, or negative error code on failure.
+ */
+#define FD_ADD(_fd_flags, _file_owned) \
+	__FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned)
+
 #endif /* __LINUX_FILE_H */
-- 
cgit v1.2.3


From 816c9cac35185aff33da1eb73cc974349623eb3a Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Fri, 28 Nov 2025 11:25:20 +0000
Subject: ASoC: cs35l56: Log a message if firmware is missing

If the amp is still reporting FIRMWARE_MISSING after cs35l56_patch()
has completed it is helpful to log a warning.

After a complete firmware download the FIRMWARE_MISSING flag will be
clear. If this isn't the case, the driver should log a message to
report this.

The amp can produce basic audio output without firmware, as a fallback,
so this wasn't originally logged as a warning condition because the amp
is still in an operational state - just not with full functionality.
However, it was not at all obvious to an end user that anything is
unusual.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20251128112520.40067-1-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  1 +
 sound/soc/codecs/cs35l56-shared.c | 17 +++++++++++++++++
 sound/soc/codecs/cs35l56.c        |  3 +++
 3 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 883f6a7e50aa..5928af539c46 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -409,6 +409,7 @@ int cs35l56_cal_set_status_get(struct cs35l56_base *cs35l56_base,
 			       struct snd_ctl_elem_value *uvalue);
 int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 			     bool *fw_missing, unsigned int *fw_version);
+void cs35l56_warn_if_firmware_missing(struct cs35l56_base *cs35l56_base);
 void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp);
 int cs35l56_hw_init(struct cs35l56_base *cs35l56_base);
 int cs35l56_get_speaker_id(struct cs35l56_base *cs35l56_base);
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index 7424e1353062..60100c8f8c95 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -1337,6 +1337,23 @@ int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base,
 }
 EXPORT_SYMBOL_NS_GPL(cs35l56_read_prot_status, "SND_SOC_CS35L56_SHARED");
 
+void cs35l56_warn_if_firmware_missing(struct cs35l56_base *cs35l56_base)
+{
+	unsigned int firmware_version;
+	bool firmware_missing;
+	int ret;
+
+	ret = cs35l56_read_prot_status(cs35l56_base, &firmware_missing, &firmware_version);
+	if (ret)
+		return;
+
+	if (!firmware_missing)
+		return;
+
+	dev_warn(cs35l56_base->dev, "FIRMWARE_MISSING\n");
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_warn_if_firmware_missing, "SND_SOC_CS35L56_SHARED");
+
 void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp)
 {
 	__be32 pid, sid, tid;
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index d8a3babce0f6..55b4d0d55712 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -823,6 +823,9 @@ static void cs35l56_patch(struct cs35l56_private *cs35l56, bool firmware_missing
 		goto err_unlock;
 	}
 
+	/* Check if the firmware is still reported missing */
+	cs35l56_warn_if_firmware_missing(&cs35l56->base);
+
 	regmap_clear_bits(cs35l56->base.regmap,
 			  cs35l56->base.fw_reg->prot_sts,
 			  CS35L56_FIRMWARE_MISSING);
-- 
cgit v1.2.3


From f01c0f7ee59fce16e5bae92a2d388a8a6fdf3f0f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 27 Nov 2025 22:27:39 -0800
Subject: gpio: regmap: fix kernel-doc notation

Add a ':' to the end of struct member names to prevent kernel-doc
warnings:

Warning: include/linux/gpio/regmap.h:108 struct member 'regmap_irq_line'
 not described in 'gpio_regmap_config'
Warning: include/linux/gpio/regmap.h:108 struct member 'regmap_irq_flags'
 not described in 'gpio_regmap_config'

Fixes: 553b75d4bfe9 ("gpio: regmap: Allow to allocate regmap-irq device")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Michael Walle <mwalle@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20251128062739.845403-1-rdunlap@infradead.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/regmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index 87983a5f3681..12d154732ca9 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -50,8 +50,8 @@ struct regmap;
  * @regmap_irq_chip:	(Optional) Pointer on an regmap_irq_chip structure. If
  *			set, a regmap-irq device will be created and the IRQ
  *			domain will be set accordingly.
- * @regmap_irq_line	(Optional) The IRQ the device uses to signal interrupts.
- * @regmap_irq_flags	(Optional) The IRQF_ flags to use for the interrupt.
+ * @regmap_irq_line:	(Optional) The IRQ the device uses to signal interrupts.
+ * @regmap_irq_flags:	(Optional) The IRQF_ flags to use for the interrupt.
  *
  * The ->reg_mask_xlate translates a given base address and GPIO offset to
  * register and mask pair. The base address is one of the given register
-- 
cgit v1.2.3


From a195c7ccfb7a21b8118139835e25936ec8722596 Mon Sep 17 00:00:00 2001
From: Jason-JH Lin <jason-jh.lin@mediatek.com>
Date: Thu, 23 Oct 2025 01:16:30 +0800
Subject: mailbox: mtk-cmdq: Refine DMA address handling for the command buffer

GCE can only fetch the command buffer address from a 32-bit register.
Some SoCs support a 35-bit command buffer address for GCE, which
requires a right shift of 3 bits before setting the address into
the 32-bit register. A comment has been added to the header of
cmdq_get_shift_pa() to explain this requirement.

To prevent the GCE command buffer address from being DMA mapped beyond
its supported bit range, the DMA bit mask for the device is set during
initialization.

Additionally, to ensure the correct shift is applied when setting or
reading the register that stores the GCE command buffer address,
new APIs, cmdq_convert_gce_addr() and cmdq_revert_gce_addr(), have
been introduced for consistent operations on this register.

The variable type for the command buffer address has been standardized
to dma_addr_t to prevent handling issues caused by type mismatches.

Fixes: 0858fde496f8 ("mailbox: cmdq: variablize address shift in platform")
Signed-off-by: Jason-JH Lin <jason-jh.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/mtk-cmdq-mailbox.c       | 45 ++++++++++++++++++++++----------
 include/linux/mailbox/mtk-cmdq-mailbox.h | 10 +++++++
 2 files changed, 41 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 654a60f63756..5791f80f995a 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -92,6 +92,18 @@ struct gce_plat {
 	u32 gce_num;
 };
 
+static inline u32 cmdq_convert_gce_addr(dma_addr_t addr, const struct gce_plat *pdata)
+{
+	/* Convert DMA addr (PA or IOVA) to GCE readable addr */
+	return addr >> pdata->shift;
+}
+
+static inline dma_addr_t cmdq_revert_gce_addr(u32 addr, const struct gce_plat *pdata)
+{
+	/* Revert GCE readable addr to DMA addr (PA or IOVA) */
+	return (dma_addr_t)addr << pdata->shift;
+}
+
 u8 cmdq_get_shift_pa(struct mbox_chan *chan)
 {
 	struct cmdq *cmdq = container_of(chan->mbox, struct cmdq, mbox);
@@ -188,13 +200,12 @@ static void cmdq_task_insert_into_thread(struct cmdq_task *task)
 	struct cmdq_task *prev_task = list_last_entry(
 			&thread->task_busy_list, typeof(*task), list_entry);
 	u64 *prev_task_base = prev_task->pkt->va_base;
+	u32 gce_addr = cmdq_convert_gce_addr(task->pa_base, task->cmdq->pdata);
 
 	/* let previous task jump to this task */
 	dma_sync_single_for_cpu(dev, prev_task->pa_base,
 				prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
-	prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] =
-		(u64)CMDQ_JUMP_BY_PA << 32 |
-		(task->pa_base >> task->cmdq->pdata->shift);
+	prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] = (u64)CMDQ_JUMP_BY_PA << 32 | gce_addr;
 	dma_sync_single_for_device(dev, prev_task->pa_base,
 				   prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
 
@@ -237,7 +248,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq,
 				    struct cmdq_thread *thread)
 {
 	struct cmdq_task *task, *tmp, *curr_task = NULL;
-	u32 curr_pa, irq_flag, task_end_pa;
+	u32 irq_flag, gce_addr;
+	dma_addr_t curr_pa, task_end_pa;
 	bool err;
 
 	irq_flag = readl(thread->base + CMDQ_THR_IRQ_STATUS);
@@ -259,7 +271,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq,
 	else
 		return;
 
-	curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) << cmdq->pdata->shift;
+	gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR);
+	curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
 
 	list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
 				 list_entry) {
@@ -378,7 +391,8 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data)
 	struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv;
 	struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev);
 	struct cmdq_task *task;
-	unsigned long curr_pa, end_pa;
+	u32 gce_addr;
+	dma_addr_t curr_pa, end_pa;
 
 	/* Client should not flush new tasks if suspended. */
 	WARN_ON(cmdq->suspended);
@@ -402,20 +416,20 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data)
 		 */
 		WARN_ON(cmdq_thread_reset(cmdq, thread) < 0);
 
-		writel(task->pa_base >> cmdq->pdata->shift,
-		       thread->base + CMDQ_THR_CURR_ADDR);
-		writel((task->pa_base + pkt->cmd_buf_size) >> cmdq->pdata->shift,
-		       thread->base + CMDQ_THR_END_ADDR);
+		gce_addr = cmdq_convert_gce_addr(task->pa_base, cmdq->pdata);
+		writel(gce_addr, thread->base + CMDQ_THR_CURR_ADDR);
+		gce_addr = cmdq_convert_gce_addr(task->pa_base + pkt->cmd_buf_size, cmdq->pdata);
+		writel(gce_addr, thread->base + CMDQ_THR_END_ADDR);
 
 		writel(thread->priority, thread->base + CMDQ_THR_PRIORITY);
 		writel(CMDQ_THR_IRQ_EN, thread->base + CMDQ_THR_IRQ_ENABLE);
 		writel(CMDQ_THR_ENABLED, thread->base + CMDQ_THR_ENABLE_TASK);
 	} else {
 		WARN_ON(cmdq_thread_suspend(cmdq, thread) < 0);
-		curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) <<
-			cmdq->pdata->shift;
-		end_pa = readl(thread->base + CMDQ_THR_END_ADDR) <<
-			cmdq->pdata->shift;
+		gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR);
+		curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
+		gce_addr = readl(thread->base + CMDQ_THR_END_ADDR);
+		end_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
 		/* check boundary */
 		if (curr_pa == end_pa - CMDQ_INST_SIZE ||
 		    curr_pa == end_pa) {
@@ -646,6 +660,9 @@ static int cmdq_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
+	dma_set_coherent_mask(dev,
+			      DMA_BIT_MASK(sizeof(u32) * BITS_PER_BYTE + cmdq->pdata->shift));
+
 	cmdq->mbox.dev = dev;
 	cmdq->mbox.chans = devm_kcalloc(dev, cmdq->pdata->thread_nr,
 					sizeof(*cmdq->mbox.chans), GFP_KERNEL);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 4c1a91b07de3..e1555e06e7e5 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -77,6 +77,16 @@ struct cmdq_pkt {
 	size_t			buf_size; /* real buffer size */
 };
 
+/**
+ * cmdq_get_shift_pa() - get the shift bits of physical address
+ * @chan: mailbox channel
+ *
+ * GCE can only fetch the command buffer address from a 32-bit register.
+ * Some SOCs support more than 32-bit command buffer address for GCE, which
+ * requires some shift bits to make the address fit into the 32-bit register.
+ *
+ * Return: the shift bits of physical address
+ */
 u8 cmdq_get_shift_pa(struct mbox_chan *chan);
 
 #endif /* __MTK_CMDQ_MAILBOX_H__ */
-- 
cgit v1.2.3


From d0c98769ee7d5db8d699a270690639cde1766cd4 Mon Sep 17 00:00:00 2001
From: Fengnan Chang <fengnanchang@gmail.com>
Date: Fri, 28 Nov 2025 16:53:13 +0800
Subject: blk-mq: use array manage hctx map instead of xarray

After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
an xarray instead of an array to store hctxs. In poll mode, however, each
call to blk_mq_poll() needs xa_load() to find the corresponding hctx, which
introduces some overhead. In my test, xa_load() may cost 3.8% CPU.

This patch reverts the previous change, which eliminates the overhead of
xa_load() and can result in a 3% performance improvement.

Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c     |  2 +-
 block/blk-mq.c         | 58 ++++++++++++++++++++++++++++++++------------------
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  3 ++-
 include/linux/blkdev.h |  2 +-
 5 files changed, 42 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5b664dbdf655..33946cdb5716 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -499,7 +499,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
 	int srcu_idx;
 
 	/*
-	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
+	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
 	 * while the queue is frozen. So we can use q_usage_counter to avoid
 	 * racing with it.
 	 */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2650c97a75e..1ef81110eb8a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -730,7 +730,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	 * If not tell the caller that it should skip this queue.
 	 */
 	ret = -EXDEV;
-	data.hctx = xa_load(&q->hctx_table, hctx_idx);
+	data.hctx = q->queue_hw_ctx[hctx_idx];
 	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
 	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -3946,8 +3946,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 			blk_free_flush_queue_callback);
 	hctx->fq = NULL;
 
-	xa_erase(&q->hctx_table, hctx_idx);
-
 	spin_lock(&q->unused_hctx_lock);
 	list_add(&hctx->hctx_list, &q->unused_hctx_list);
 	spin_unlock(&q->unused_hctx_lock);
@@ -3989,14 +3987,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 				hctx->numa_node))
 		goto exit_hctx;
 
-	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
-		goto exit_flush_rq;
-
 	return 0;
 
- exit_flush_rq:
-	if (set->ops->exit_request)
-		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
  exit_hctx:
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
@@ -4385,7 +4377,7 @@ void blk_mq_release(struct request_queue *q)
 		kobject_put(&hctx->kobj);
 	}
 
-	xa_destroy(&q->hctx_table);
+	kfree(q->queue_hw_ctx);
 
 	/*
 	 * release .mq_kobj and sw queue's kobject now because
@@ -4529,26 +4521,44 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 				     struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i, j;
+	int i, j, end;
+	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+
+	if (q->nr_hw_queues < set->nr_hw_queues) {
+		struct blk_mq_hw_ctx **new_hctxs;
+
+		new_hctxs = kcalloc_node(set->nr_hw_queues,
+				       sizeof(*new_hctxs), GFP_KERNEL,
+				       set->numa_node);
+		if (!new_hctxs)
+			return;
+		if (hctxs)
+			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
+			       sizeof(*hctxs));
+		q->queue_hw_ctx = new_hctxs;
+		kfree(hctxs);
+		hctxs = new_hctxs;
+	}
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int old_node;
 		int node = blk_mq_get_hctx_node(set, i);
-		struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
+		struct blk_mq_hw_ctx *old_hctx = hctxs[i];
 
 		if (old_hctx) {
 			old_node = old_hctx->numa_node;
 			blk_mq_exit_hctx(q, set, old_hctx, i);
 		}
 
-		if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+		hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node);
+		if (!hctxs[i]) {
 			if (!old_hctx)
 				break;
 			pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
 					node, old_node);
-			hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
-			WARN_ON_ONCE(!hctx);
+			hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i,
+					old_node);
+			WARN_ON_ONCE(!hctxs[i]);
 		}
 	}
 	/*
@@ -4557,13 +4567,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	 */
 	if (i != set->nr_hw_queues) {
 		j = q->nr_hw_queues;
+		end = i;
 	} else {
 		j = i;
+		end = q->nr_hw_queues;
 		q->nr_hw_queues = set->nr_hw_queues;
 	}
 
-	xa_for_each_start(&q->hctx_table, j, hctx, j)
-		blk_mq_exit_hctx(q, set, hctx, j);
+	for (; j < end; j++) {
+		struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+		if (hctx) {
+			blk_mq_exit_hctx(q, set, hctx, j);
+			hctxs[j] = NULL;
+		}
+	}
 }
 
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
@@ -4599,8 +4617,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	INIT_LIST_HEAD(&q->unused_hctx_list);
 	spin_lock_init(&q->unused_hctx_lock);
 
-	xa_init(&q->hctx_table);
-
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
 		goto err_hctxs;
@@ -5187,7 +5203,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
 {
 	if (!blk_mq_can_poll(q))
 		return 0;
-	return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
+	return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags);
 }
 
 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c4fccdeb5441..80a3f0c2bce7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -84,7 +84,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
 							  enum hctx_type type,
 							  unsigned int cpu)
 {
-	return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
+	return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
 }
 
 static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b54506b3b76d..9208ff90ae16 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1016,7 +1016,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 }
 
 #define queue_for_each_hw_ctx(q, hctx, i)				\
-	xa_for_each(&(q)->hctx_table, (i), (hctx))
+	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
+	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cb4ba09959ee..6195f89648db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -503,7 +503,7 @@ struct request_queue {
 
 	/* hw dispatch queues */
 	unsigned int		nr_hw_queues;
-	struct xarray		hctx_table;
+	struct blk_mq_hw_ctx	**queue_hw_ctx;
 
 	struct percpu_ref	q_usage_counter;
 	struct lock_class_key	io_lock_cls_key;
-- 
cgit v1.2.3


From 89e1fb7ceffd898505ad7fa57acec0585bfaa2cc Mon Sep 17 00:00:00 2001
From: Fengnan Chang <fengnanchang@gmail.com>
Date: Fri, 28 Nov 2025 16:53:14 +0800
Subject: blk-mq: fix potential uaf for 'queue_hw_ctx'

This just applies Kuai's patch in [1] with minor changes.

blk_mq_realloc_hw_ctxs() will free the 'queue_hw_ctx' (e.g. update
submit_queues through configfs for null_blk), while it might still be
used from another context (e.g. switch elevator to none):

t1					t2
elevator_switch
 blk_mq_unquiesce_queue
  blk_mq_run_hw_queues
   queue_for_each_hw_ctx
    // assembly code for hctx = (q)->queue_hw_ctx[i]
    mov    0x48(%rbp),%rdx -> read old queue_hw_ctx

					__blk_mq_update_nr_hw_queues
					 blk_mq_realloc_hw_ctxs
					  hctxs = q->queue_hw_ctx
					  q->queue_hw_ctx = new_hctxs
					  kfree(hctxs)
    movslq %ebx,%rax
    mov    (%rdx,%rax,8),%rdi ->uaf

This problem was found by code review, and I confirmed that the concurrent
scenario does exist (specifically 'q->queue_hw_ctx' can be changed during
blk_mq_run_hw_queues()); however, the uaf problem hasn't been reproduced yet
without hacking the kernel.

Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
problem by protecting 'queue_hw_ctx' through rcu where it can be accessed
without grabbing 'q_usage_counter'.

[1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  7 ++++++-
 include/linux/blk-mq.h | 13 ++++++++++++-
 include/linux/blkdev.h |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1ef81110eb8a..4e96bb246247 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4535,7 +4535,12 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		if (hctxs)
 			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
 			       sizeof(*hctxs));
-		q->queue_hw_ctx = new_hctxs;
+		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+		/*
+		 * Make sure reading the old queue_hw_ctx from other
+		 * context concurrently won't trigger uaf.
+		 */
+		synchronize_rcu_expedited();
 		kfree(hctxs);
 		hctxs = new_hctxs;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9208ff90ae16..eb7254b3dddd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1015,9 +1015,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return rq + 1;
 }
 
+static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	rcu_read_lock();
+	hctx = rcu_dereference(q->queue_hw_ctx)[id];
+	rcu_read_unlock();
+
+	return hctx;
+}
+
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
-	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
+	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6195f89648db..72e34acd439c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -503,7 +503,7 @@ struct request_queue {
 
 	/* hw dispatch queues */
 	unsigned int		nr_hw_queues;
-	struct blk_mq_hw_ctx	**queue_hw_ctx;
+	struct blk_mq_hw_ctx * __rcu *queue_hw_ctx;
 
 	struct percpu_ref	q_usage_counter;
 	struct lock_class_key	io_lock_cls_key;
-- 
cgit v1.2.3


From 9574b21e952256d4fa3c8797c94482a240992d18 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 21 Nov 2025 09:58:23 +0800
Subject: kfifo: add kfifo_alloc_node() helper for NUMA awareness

Add __kfifo_alloc_node() by refactoring and reusing __kfifo_alloc(),
and define kfifo_alloc_node() macro to support NUMA-aware memory
allocation.

The new __kfifo_alloc_node() function accepts a NUMA node parameter
and uses kmalloc_array_node() instead of kmalloc_array() for
node-specific allocation. The existing __kfifo_alloc() now calls
__kfifo_alloc_node() with NUMA_NO_NODE to maintain backward
compatibility.

This enables users to allocate kfifo buffers on specific NUMA nodes,
which is important for performance in NUMA systems where the kfifo
will be primarily accessed by threads running on specific nodes.

Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/kfifo.h | 34 ++++++++++++++++++++++++++++++++--
 lib/kfifo.c           |  8 ++++----
 2 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index fd743d4c4b4b..8b81ac74829c 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -369,6 +369,30 @@ __kfifo_int_must_check_helper( \
 }) \
 )
 
+/**
+ * kfifo_alloc_node - dynamically allocates a new fifo buffer on a NUMA node
+ * @fifo: pointer to the fifo
+ * @size: the number of elements in the fifo, this must be a power of 2
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @node: NUMA node to allocate memory on
+ *
+ * This macro dynamically allocates a new fifo buffer with NUMA node awareness.
+ *
+ * The number of elements will be rounded-up to a power of 2.
+ * The fifo will be released with kfifo_free().
+ * Return 0 if no error, otherwise an error code.
+ */
+#define kfifo_alloc_node(fifo, size, gfp_mask, node) \
+__kfifo_int_must_check_helper( \
+({ \
+	typeof((fifo) + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__is_kfifo_ptr(__tmp) ? \
+	__kfifo_alloc_node(__kfifo, size, sizeof(*__tmp->type), gfp_mask, node) : \
+	-EINVAL; \
+}) \
+)
+
 /**
  * kfifo_free - frees the fifo
  * @fifo: the fifo to be freed
@@ -899,8 +923,14 @@ __kfifo_uint_must_check_helper( \
 )
 
 
-extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
-	size_t esize, gfp_t gfp_mask);
+extern int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size,
+	size_t esize, gfp_t gfp_mask, int node);
+
+static inline int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+				size_t esize, gfp_t gfp_mask)
+{
+	return __kfifo_alloc_node(fifo, size, esize, gfp_mask, NUMA_NO_NODE);
+}
 
 extern void __kfifo_free(struct __kfifo *fifo);
 
diff --git a/lib/kfifo.c b/lib/kfifo.c
index a8b2eed90599..525e66f8294c 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -22,8 +22,8 @@ static inline unsigned int kfifo_unused(struct __kfifo *fifo)
 	return (fifo->mask + 1) - (fifo->in - fifo->out);
 }
 
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
-		size_t esize, gfp_t gfp_mask)
+int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size,
+		size_t esize, gfp_t gfp_mask, int node)
 {
 	/*
 	 * round up to the next power of 2, since our 'let the indices
@@ -41,7 +41,7 @@ int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
 		return -EINVAL;
 	}
 
-	fifo->data = kmalloc_array(esize, size, gfp_mask);
+	fifo->data = kmalloc_array_node(esize, size, gfp_mask, node);
 
 	if (!fifo->data) {
 		fifo->mask = 0;
@@ -51,7 +51,7 @@ int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
 
 	return 0;
 }
-EXPORT_SYMBOL(__kfifo_alloc);
+EXPORT_SYMBOL(__kfifo_alloc_node);
 
 void __kfifo_free(struct __kfifo *fifo)
 {
-- 
cgit v1.2.3


From 418de94e7593081c29066555bf9059f1f7dd9d79 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 27 Nov 2025 22:57:54 -0800
Subject: sbitmap: fix all kernel-doc warnings

Modify kernel-doc comments in sbitmap.h to prevent warnings:

Warning: include/linux/sbitmap.h:84 struct member 'alloc_hint' not
 described in 'sbitmap'
Warning: include/linux/sbitmap.h:151 struct member 'ws_active' not
 described in 'sbitmap_queue'
Warning: include/linux/sbitmap.h:552 No description found for
 return value of 'sbq_wait_ptr'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index ffb9907c7070..cc7ad189caa5 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -75,7 +75,7 @@ struct sbitmap {
 	 */
 	struct sbitmap_word *map;
 
-	/*
+	/**
 	 * @alloc_hint: Cache of last successfully allocated or freed bit.
 	 *
 	 * This is per-cpu, which allows multiple users to stick to different
@@ -128,7 +128,7 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
-	/*
+	/**
 	 * @ws_active: count of currently active ws waitqueues
 	 */
 	atomic_t ws_active;
@@ -547,6 +547,8 @@ static inline void sbq_index_atomic_inc(atomic_t *index)
  * sbitmap_queue.
  * @sbq: Bitmap queue to wait on.
  * @wait_index: A counter per "user" of @sbq.
+ *
+ * Return: Next wait queue to be used
  */
 static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
 						  atomic_t *wait_index)
-- 
cgit v1.2.3


From 98693e0897f754e3f51ce6626ed5f785f625ba2b Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@nvidia.com>
Date: Mon, 24 Nov 2025 15:36:22 -0700
Subject: vfio/pci: Use RCU for error/request triggers to avoid circular
 locking

Thanks to a device generating an ACS violation during bus reset,
lockdep reported the following circular locking issue:

CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock
CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem
CPU2: AER: holds pci_bus_sem, acquires igate

This results in a potential 3-way deadlock.

Remove the pci_bus_sem->igate leg of the triangle by using RCU
to peek at the eventfd rather than locking it with igate.

Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup")
Signed-off-by: Alex Williamson <alex.williamson@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/vfio_pci_core.c  | 68 +++++++++++++++++++++++++++------------
 drivers/vfio/pci/vfio_pci_intrs.c | 52 +++++++++++++++++++-----------
 drivers/vfio/pci/vfio_pci_priv.h  |  4 +++
 include/linux/vfio_pci_core.h     | 10 ++++--
 4 files changed, 93 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 79a1a50a4ef7..2b01bfbce3ea 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -42,6 +42,40 @@ static bool nointxmask;
 static bool disable_vga;
 static bool disable_idle_d3;
 
+static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
+{
+	struct vfio_pci_eventfd *eventfd =
+		container_of(rcu, struct vfio_pci_eventfd, rcu);
+
+	eventfd_ctx_put(eventfd->ctx);
+	kfree(eventfd);
+}
+
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_eventfd __rcu **peventfd,
+				    struct eventfd_ctx *ctx)
+{
+	struct vfio_pci_eventfd *new = NULL;
+	struct vfio_pci_eventfd *old;
+
+	lockdep_assert_held(&vdev->igate);
+
+	if (ctx) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+		if (!new)
+			return -ENOMEM;
+
+		new->ctx = ctx;
+	}
+
+	old = rcu_replace_pointer(*peventfd, new,
+				  lockdep_is_held(&vdev->igate));
+	if (old)
+		call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free);
+
+	return 0;
+}
+
 /* List of PF's that vfio_pci_core_sriov_configure() has been called on */
 static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
 static LIST_HEAD(vfio_pci_sriov_pfs);
@@ -697,14 +731,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 	vfio_pci_dma_buf_cleanup(vdev);
 
 	mutex_lock(&vdev->igate);
-	if (vdev->err_trigger) {
-		eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = NULL;
-	}
-	if (vdev->req_trigger) {
-		eventfd_ctx_put(vdev->req_trigger);
-		vdev->req_trigger = NULL;
-	}
+	vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
+	vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
 	mutex_unlock(&vdev->igate);
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
@@ -1784,21 +1812,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
 	struct vfio_pci_core_device *vdev =
 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
 	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_eventfd *eventfd;
 
-	mutex_lock(&vdev->igate);
-
-	if (vdev->req_trigger) {
+	rcu_read_lock();
+	eventfd = rcu_dereference(vdev->req_trigger);
+	if (eventfd) {
 		if (!(count % 10))
 			pci_notice_ratelimited(pdev,
 				"Relaying device request to user (#%u)\n",
 				count);
-		eventfd_signal(vdev->req_trigger);
+		eventfd_signal(eventfd->ctx);
 	} else if (count == 0) {
 		pci_warn(pdev,
 			"No device request channel registered, blocked until released by user\n");
 	}
-
-	mutex_unlock(&vdev->igate);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_request);
 
@@ -2216,13 +2244,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state)
 {
 	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+	struct vfio_pci_eventfd *eventfd;
 
-	mutex_lock(&vdev->igate);
-
-	if (vdev->err_trigger)
-		eventfd_signal(vdev->err_trigger);
-
-	mutex_unlock(&vdev->igate);
+	rcu_read_lock();
+	eventfd = rcu_dereference(vdev->err_trigger);
+	if (eventfd)
+		eventfd_signal(eventfd->ctx);
+	rcu_read_unlock();
 
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 30d3e921cb0d..c76e753b3cec 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
 	return 0;
 }
 
-static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
+					   struct vfio_pci_eventfd __rcu **peventfd,
 					   unsigned int count, uint32_t flags,
 					   void *data)
 {
 	/* DATA_NONE/DATA_BOOL enables loopback testing */
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		if (*ctx) {
-			if (count) {
-				eventfd_signal(*ctx);
-			} else {
-				eventfd_ctx_put(*ctx);
-				*ctx = NULL;
-			}
+		struct vfio_pci_eventfd *eventfd;
+
+		eventfd = rcu_dereference_protected(*peventfd,
+						lockdep_is_held(&vdev->igate));
+
+		if (!eventfd)
+			return -EINVAL;
+
+		if (count) {
+			eventfd_signal(eventfd->ctx);
 			return 0;
 		}
+
+		return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 		uint8_t trigger;
 
@@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
 			return -EINVAL;
 
 		trigger = *(uint8_t *)data;
-		if (trigger && *ctx)
-			eventfd_signal(*ctx);
+
+		if (trigger) {
+			struct vfio_pci_eventfd *eventfd =
+					rcu_dereference_protected(*peventfd,
+					lockdep_is_held(&vdev->igate));
+
+			if (eventfd)
+				eventfd_signal(eventfd->ctx);
+		}
 
 		return 0;
 	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
 
 		fd = *(int32_t *)data;
 		if (fd == -1) {
-			if (*ctx)
-				eventfd_ctx_put(*ctx);
-			*ctx = NULL;
+			return vfio_pci_eventfd_replace_locked(vdev,
+							       peventfd, NULL);
 		} else if (fd >= 0) {
 			struct eventfd_ctx *efdctx;
+			int ret;
 
 			efdctx = eventfd_ctx_fdget(fd);
 			if (IS_ERR(efdctx))
 				return PTR_ERR(efdctx);
 
-			if (*ctx)
-				eventfd_ctx_put(*ctx);
+			ret = vfio_pci_eventfd_replace_locked(vdev,
+							      peventfd, efdctx);
+			if (ret)
+				eventfd_ctx_put(efdctx);
 
-			*ctx = efdctx;
+			return ret;
 		}
-		return 0;
 	}
 
 	return -EINVAL;
@@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
 	if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
 		return -EINVAL;
 
-	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+	return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
 					       count, flags, data);
 }
 
@@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
 	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
 		return -EINVAL;
 
-	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+	return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
 					       count, flags, data);
 }
 
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 28a405f8b97c..6681389518a7 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd {
 bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
 void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
 
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_eventfd __rcu **peventfd,
+				    struct eventfd_ctx *ctx);
+
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 88fd2fd895d0..a1eddd55dab8 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/vfio.h>
 #include <linux/irqbypass.h>
+#include <linux/rcupdate.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/notifier.h>
@@ -29,6 +30,11 @@ struct vfio_pci_region;
 struct p2pdma_provider;
 struct dma_buf_phys_vec;
 
+struct vfio_pci_eventfd {
+	struct eventfd_ctx	*ctx;
+	struct rcu_head		rcu;
+};
+
 struct vfio_pci_regops {
 	ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
 		      size_t count, loff_t *ppos, bool iswrite);
@@ -124,8 +130,8 @@ struct vfio_pci_core_device {
 	struct pci_saved_state	*pci_saved_state;
 	struct pci_saved_state	*pm_save;
 	int			ioeventfds_nr;
-	struct eventfd_ctx	*err_trigger;
-	struct eventfd_ctx	*req_trigger;
+	struct vfio_pci_eventfd __rcu *err_trigger;
+	struct vfio_pci_eventfd __rcu *req_trigger;
 	struct eventfd_ctx	*pm_wake_eventfd_ctx;
 	struct list_head	dummy_resources_list;
 	struct mutex		ioeventfds_lock;
-- 
cgit v1.2.3


From 9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Thu, 27 Nov 2025 17:06:27 +0000
Subject: vfio: refactor vfio_pci_mmap_huge_fault function

Refactor vfio_pci_mmap_huge_fault to take out the implementation
to map the VMA to the PTE/PMD/PUD as a separate function.

Export the new function to be used by nvgrace-gpu module.

Move the alignment check code, which verifies that the pfn and VMA VA
are aligned to the page order, to the header file and make it inline.

No functional change is intended.

Cc: Shameer Kolothum <skolothumtho@nvidia.com>
Cc: Alex Williamson <alex@shazbot.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Shameer Kolothum <skolothumtho@nvidia.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251127170632.3477-2-ankita@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/vfio_pci_core.c | 54 ++++++++++++++++++++--------------------
 include/linux/vfio_pci_core.h    | 13 ++++++++++
 2 files changed, 40 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 2b01bfbce3ea..9bb700d25ccb 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1652,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
-static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
-					   unsigned int order)
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+				   struct vm_fault *vmf,
+				   unsigned long pfn,
+				   unsigned int order)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	struct vfio_pci_core_device *vdev = vma->vm_private_data;
-	unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
-	unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
-	unsigned long pfn = vma_to_pfn(vma) + pgoff;
-	vm_fault_t ret = VM_FAULT_SIGBUS;
-
-	if (order && (addr < vma->vm_start ||
-		      addr + (PAGE_SIZE << order) > vma->vm_end ||
-		      pfn & ((1 << order) - 1))) {
-		ret = VM_FAULT_FALLBACK;
-		goto out;
-	}
-
-	down_read(&vdev->memory_lock);
+	lockdep_assert_held_read(&vdev->memory_lock);
 
 	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
-		goto out_unlock;
+		return VM_FAULT_SIGBUS;
 
 	switch (order) {
 	case 0:
-		ret = vmf_insert_pfn(vma, vmf->address, pfn);
-		break;
+		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 	case PMD_ORDER:
-		ret = vmf_insert_pfn_pmd(vmf, pfn, false);
-		break;
+		return vmf_insert_pfn_pmd(vmf, pfn, false);
 #endif
 #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
 	case PUD_ORDER:
-		ret = vmf_insert_pfn_pud(vmf, pfn, false);
+		return vmf_insert_pfn_pud(vmf, pfn, false);
 		break;
 #endif
 	default:
-		ret = VM_FAULT_FALLBACK;
+		return VM_FAULT_FALLBACK;
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn);
+
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+					   unsigned int order)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+	unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	unsigned long pfn = vma_to_pfn(vma) + pgoff;
+	vm_fault_t ret = VM_FAULT_FALLBACK;
+
+	if (is_aligned_for_order(vma, addr, pfn, order)) {
+		scoped_guard(rwsem_read, &vdev->memory_lock)
+			ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
 	}
 
-out_unlock:
-	up_read(&vdev->memory_lock);
-out:
 	dev_dbg_ratelimited(&vdev->pdev->dev,
 			   "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
 			    __func__, order,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index a1eddd55dab8..5569488ec4dc 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -170,6 +170,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
 		size_t count, loff_t *ppos);
 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
 		size_t count, loff_t *ppos);
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+				   struct vm_fault *vmf, unsigned long pfn,
+				   unsigned int order);
 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
@@ -212,4 +215,14 @@ VFIO_IOREAD_DECLARATION(32)
 VFIO_IOREAD_DECLARATION(64)
 #endif
 
+static inline bool is_aligned_for_order(struct vm_area_struct *vma,
+					unsigned long addr,
+					unsigned long pfn,
+					unsigned int order)
+{
+	return !(order && (addr < vma->vm_start ||
+			   addr + (PAGE_SIZE << order) > vma->vm_end ||
+			   !IS_ALIGNED(pfn, 1 << order)));
+}
+
 #endif /* VFIO_PCI_CORE_H */
-- 
cgit v1.2.3


From a23b10608d420346e5af7eda6c46726a61572469 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Thu, 27 Nov 2025 17:06:32 +0000
Subject: vfio/nvgrace-gpu: wait for the GPU mem to be ready

Speculative prefetches from CPU to GPU memory until the GPU is
ready after reset can cause harmless corrected RAS events to
be logged on Grace systems. It is thus preferred that the
mapping not be re-established until the GPU is ready post reset.

The GPU readiness can be checked through BAR0 registers similar
to the checking at the time of device probe.

It can take several seconds for the GPU to be ready. So it is
desirable that the time overlaps as much of the VM startup as
possible to reduce impact on the VM bootup time. The GPU
readiness state is thus checked on the first fault/huge_fault
request or read/write access which amortizes the GPU readiness
time.

The first fault and read/write check the GPU state when the
reset_done flag - which denotes whether the GPU has just been
reset - is set. The memory_lock is taken across map/access to
avoid races with GPU reset.

Also check if the memory is enabled, before waiting for GPU
to be ready. Otherwise the readiness check would block for 30s.

Lastly, add PM handling wrapping around read/write access.

Cc: Shameer Kolothum <skolothumtho@nvidia.com>
Cc: Alex Williamson <alex@shazbot.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Vikram Sethi <vsethi@nvidia.com>
Reviewed-by: Shameer Kolothum <skolothumtho@nvidia.com>
Suggested-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251127170632.3477-7-ankita@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 103 +++++++++++++++++++++++++++++++-----
 drivers/vfio/pci/vfio_pci_config.c  |   1 +
 drivers/vfio/pci/vfio_pci_priv.h    |   1 -
 include/linux/vfio_pci_core.h       |   1 +
 4 files changed, 93 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index df360e12b299..84d142a47ec6 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -8,6 +8,7 @@
 #include <linux/delay.h>
 #include <linux/jiffies.h>
 #include <linux/pci-p2pdma.h>
+#include <linux/pm_runtime.h>
 
 /*
  * The device memory usable to the workloads running in the VM is cached
@@ -105,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
 		mutex_init(&nvdev->remap_lock);
 	}
 
+	/*
+	 * GPU readiness is checked by reading the BAR0 registers.
+	 *
+	 * ioremap BAR0 to ensure that the BAR0 mapping is present before
+	 * register reads on first fault before establishing any GPU
+	 * memory mapping.
+	 */
+	ret = vfio_pci_core_setup_barmap(vdev, 0);
+	if (ret) {
+		vfio_pci_core_disable(vdev);
+		return ret;
+	}
+
 	vfio_pci_core_finish_enable(vdev);
 
 	return 0;
@@ -147,6 +161,34 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
 	return -ETIME;
 }
 
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
+	int ret;
+
+	lockdep_assert_held_read(&vdev->memory_lock);
+
+	if (!nvdev->reset_done)
+		return 0;
+
+	if (!__vfio_pci_memory_enabled(vdev))
+		return -EIO;
+
+	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+	if (ret)
+		return ret;
+
+	nvdev->reset_done = false;
+
+	return 0;
+}
+
 static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
 				   unsigned long addr)
 {
@@ -176,8 +218,13 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
 
 	if (is_aligned_for_order(vma, addr, pfn, order)) {
-		scoped_guard(rwsem_read, &vdev->memory_lock)
+		scoped_guard(rwsem_read, &vdev->memory_lock) {
+			if (vdev->pm_runtime_engaged ||
+			    nvgrace_gpu_check_device_ready(nvdev))
+				return VM_FAULT_SIGBUS;
+
 			ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+		}
 	}
 
 	dev_dbg_ratelimited(&vdev->pdev->dev,
@@ -533,6 +580,7 @@ static ssize_t
 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 		     char __user *buf, size_t count, loff_t *ppos)
 {
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
 	struct mem_region *memregion;
@@ -559,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	else
 		mem_count = min(count, memregion->memlength - (size_t)offset);
 
-	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
-	if (ret)
-		return ret;
+	scoped_guard(rwsem_read, &vdev->memory_lock) {
+		ret = nvgrace_gpu_check_device_ready(nvdev);
+		if (ret)
+			return ret;
+
+		ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+		if (ret)
+			return ret;
+	}
 
 	/*
 	 * Only the device memory present on the hardware is mapped, which may
@@ -586,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev,
 	struct nvgrace_gpu_pci_core_device *nvdev =
 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
 			     core_device.vdev);
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
+	int ret;
 
-	if (nvgrace_gpu_memregion(index, nvdev))
-		return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+	if (nvgrace_gpu_memregion(index, nvdev)) {
+		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+			return -EIO;
+		ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+		pm_runtime_put(&vdev->pdev->dev);
+		return ret;
+	}
 
 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
 		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
@@ -650,6 +711,7 @@ static ssize_t
 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 		      size_t count, loff_t *ppos, const char __user *buf)
 {
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
 	struct mem_region *memregion;
@@ -679,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	 */
 	mem_count = min(count, memregion->memlength - (size_t)offset);
 
-	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
-	if (ret)
-		return ret;
+	scoped_guard(rwsem_read, &vdev->memory_lock) {
+		ret = nvgrace_gpu_check_device_ready(nvdev);
+		if (ret)
+			return ret;
+
+		ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+		if (ret)
+			return ret;
+	}
 
 exitfn:
 	*ppos += count;
@@ -695,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
 	struct nvgrace_gpu_pci_core_device *nvdev =
 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
 			     core_device.vdev);
+	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	int ret;
 
-	if (nvgrace_gpu_memregion(index, nvdev))
-		return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+	if (nvgrace_gpu_memregion(index, nvdev)) {
+		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+			return -EIO;
+		ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+		pm_runtime_put(&vdev->pdev->dev);
+		return ret;
+	}
 
 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
 		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
@@ -1074,7 +1149,11 @@ MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
  * faults and read/writes accesses to prevent potential RAS events logging.
  *
  * First fault or access after a reset needs to poll device readiness,
- * flag that a reset has occurred.
+ * flag that a reset has occurred. The readiness test is done by holding
+ * the memory_lock read lock and we expect all vfio-pci initiated resets to
+ * hold the memory_lock write lock to avoid races. However, .reset_done
+ * extends beyond the scope of vfio-pci initiated resets therefore we
+ * cannot assert this behavior and use lockdep_assert_held_write.
  */
 static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
 {
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 1f6008eabf23..dc4e510e6e1b 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
 	return pdev->current_state < PCI_D3hot &&
 	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
 }
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);
 
 /*
  * Restore the *real* BARs after we detect a FLR or backdoor reset.
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 6681389518a7..27ac280f00b9 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -64,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev);
 int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
 			     pci_power_t state);
 
-bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 5569488ec4dc..336a0e58b443 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -188,6 +188,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
 			       size_t x_end, bool iswrite);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
 bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
 					 loff_t reg_start, size_t reg_cnt,
 					 loff_t *buf_offset,
-- 
cgit v1.2.3


From 256a21743d911f94ce92fe28f793cd586f3860b2 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Thu, 6 Nov 2025 12:36:00 -0500
Subject: i3c: Add HDR API support

Rename struct i3c_priv_xfer to struct i3c_xfer, since private xfer in the
I3C spec refers only to SDR transfers. Ref: i3c spec ver1.2, section 3,
Technical Overview.

i3c_xfer will be used for both SDR and HDR.

Rename enum i3c_hdr_mode to i3c_xfer_mode. The existing HDR values must match
the CCC GETCAP1 bit positions. Use 31 as the SDR transfer mode.

Add i3c_device_do_xfers() with an xfer mode argument, while keeping
i3c_device_do_priv_xfers() as a wrapper that calls i3c_device_do_xfers()
with I3C_SDR for backward compatibility.

Introduce a 'cmd' field in struct i3c_xfer as an anonymous union with
'rnw', since HDR mode uses read/write commands instead of the SDR address
bit.

Add .i3c_xfers() callback for master controllers. If not implemented, fall
back to SDR with .priv_xfers(). The .priv_xfers() API can be removed once
all controllers switch to .i3c_xfers().

Add 'mode_mask' bitmask to advertise controller capability.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251106-i3c_ddr-v11-1-33a6a66ed095@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/device.c       | 27 ++++++++++++++++++++-------
 drivers/i3c/internals.h    |  6 +++---
 drivers/i3c/master.c       | 19 ++++++++++++++-----
 include/linux/i3c/device.h | 40 +++++++++++++++++++++++++++++-----------
 include/linux/i3c/master.h |  4 ++++
 5 files changed, 70 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/i3c/device.c b/drivers/i3c/device.c
index 2396545763ff..8a156f5ad692 100644
--- a/drivers/i3c/device.c
+++ b/drivers/i3c/device.c
@@ -15,12 +15,12 @@
 #include "internals.h"
 
 /**
- * i3c_device_do_priv_xfers() - do I3C SDR private transfers directed to a
- *				specific device
+ * i3c_device_do_xfers() - do I3C transfers directed to a specific device
  *
  * @dev: device with which the transfers should be done
  * @xfers: array of transfers
  * @nxfers: number of transfers
+ * @mode: transfer mode
  *
  * Initiate one or several private SDR transfers with @dev.
  *
@@ -33,9 +33,8 @@
  *   'xfers' some time later. See I3C spec ver 1.1.1 09-Jun-2021. Section:
  *   5.1.2.2.3.
  */
-int i3c_device_do_priv_xfers(struct i3c_device *dev,
-			     struct i3c_priv_xfer *xfers,
-			     int nxfers)
+int i3c_device_do_xfers(struct i3c_device *dev, struct i3c_xfer *xfers,
+			int nxfers, enum i3c_xfer_mode mode)
 {
 	int ret, i;
 
@@ -48,12 +47,12 @@ int i3c_device_do_priv_xfers(struct i3c_device *dev,
 	}
 
 	i3c_bus_normaluse_lock(dev->bus);
-	ret = i3c_dev_do_priv_xfers_locked(dev->desc, xfers, nxfers);
+	ret = i3c_dev_do_xfers_locked(dev->desc, xfers, nxfers, mode);
 	i3c_bus_normaluse_unlock(dev->bus);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(i3c_device_do_priv_xfers);
+EXPORT_SYMBOL_GPL(i3c_device_do_xfers);
 
 /**
  * i3c_device_do_setdasa() - do I3C dynamic address assignement with
@@ -260,6 +259,20 @@ i3c_device_match_id(struct i3c_device *i3cdev,
 }
 EXPORT_SYMBOL_GPL(i3c_device_match_id);
 
+/**
+ * i3c_device_get_supported_xfer_mode - Returns the supported transfer mode by
+ *					connected master controller.
+ * @dev: I3C device
+ *
+ * Return: a bit mask of the supported transfer modes; bit positions are
+ *	   defined by enum i3c_xfer_mode
+ */
+u32 i3c_device_get_supported_xfer_mode(struct i3c_device *dev)
+{
+	return i3c_dev_get_master(dev->desc)->this->info.hdr_cap | BIT(I3C_SDR);
+}
+EXPORT_SYMBOL_GPL(i3c_device_get_supported_xfer_mode);
+
 /**
  * i3c_driver_register_with_owner() - register an I3C device driver
  *
diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h
index 79ceaa5f5afd..f609e5098137 100644
--- a/drivers/i3c/internals.h
+++ b/drivers/i3c/internals.h
@@ -15,9 +15,9 @@ void i3c_bus_normaluse_lock(struct i3c_bus *bus);
 void i3c_bus_normaluse_unlock(struct i3c_bus *bus);
 
 int i3c_dev_setdasa_locked(struct i3c_dev_desc *dev);
-int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev,
-				 struct i3c_priv_xfer *xfers,
-				 int nxfers);
+int i3c_dev_do_xfers_locked(struct i3c_dev_desc *dev,
+			    struct i3c_xfer *xfers,
+			    int nxfers, enum i3c_xfer_mode mode);
 int i3c_dev_disable_ibi_locked(struct i3c_dev_desc *dev);
 int i3c_dev_enable_ibi_locked(struct i3c_dev_desc *dev);
 int i3c_dev_request_ibi_locked(struct i3c_dev_desc *dev,
diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 823661a81f5e..f88f7e19203a 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -2819,10 +2819,14 @@ EXPORT_SYMBOL_GPL(i3c_generic_ibi_recycle_slot);
 
 static int i3c_master_check_ops(const struct i3c_master_controller_ops *ops)
 {
-	if (!ops || !ops->bus_init || !ops->priv_xfers ||
+	if (!ops || !ops->bus_init ||
 	    !ops->send_ccc_cmd || !ops->do_daa || !ops->i2c_xfers)
 		return -EINVAL;
 
+	/* Must provide one of priv_xfers (SDR only) or i3c_xfers (all modes) */
+	if (!ops->priv_xfers && !ops->i3c_xfers)
+		return -EINVAL;
+
 	if (ops->request_ibi &&
 	    (!ops->enable_ibi || !ops->disable_ibi || !ops->free_ibi ||
 	     !ops->recycle_ibi_slot))
@@ -3012,9 +3016,8 @@ int i3c_dev_setdasa_locked(struct i3c_dev_desc *dev)
 						dev->boardinfo->init_dyn_addr);
 }
 
-int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev,
-				 struct i3c_priv_xfer *xfers,
-				 int nxfers)
+int i3c_dev_do_xfers_locked(struct i3c_dev_desc *dev, struct i3c_xfer *xfers,
+			    int nxfers, enum i3c_xfer_mode mode)
 {
 	struct i3c_master_controller *master;
 
@@ -3025,9 +3028,15 @@ int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev,
 	if (!master || !xfers)
 		return -EINVAL;
 
-	if (!master->ops->priv_xfers)
+	if (mode != I3C_SDR && !(master->this->info.hdr_cap & BIT(mode)))
 		return -EOPNOTSUPP;
 
+	if (master->ops->i3c_xfers)
+		return master->ops->i3c_xfers(dev, xfers, nxfers, mode);
+
+	if (mode != I3C_SDR)
+		return -EINVAL;
+
 	return master->ops->priv_xfers(dev, xfers, nxfers);
 }
 
diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 7f136de4b73e..7f7738041f38 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -39,20 +39,25 @@ enum i3c_error_code {
 };
 
 /**
- * enum i3c_hdr_mode - HDR mode ids
+ * enum i3c_xfer_mode - I3C xfer mode ids
  * @I3C_HDR_DDR: DDR mode
  * @I3C_HDR_TSP: TSP mode
  * @I3C_HDR_TSL: TSL mode
+ * @I3C_SDR: SDR mode (NOT HDR mode)
  */
-enum i3c_hdr_mode {
-	I3C_HDR_DDR,
-	I3C_HDR_TSP,
-	I3C_HDR_TSL,
+enum i3c_xfer_mode {
+	/* The below 3 value (I3C_HDR*) must match GETCAP1 Byte bit position */
+	I3C_HDR_DDR = 0,
+	I3C_HDR_TSP = 1,
+	I3C_HDR_TSL = 2,
+	/* Use for default SDR transfer mode */
+	I3C_SDR = 31,
 };
 
 /**
- * struct i3c_priv_xfer - I3C SDR private transfer
+ * struct i3c_xfer - I3C data transfer
  * @rnw: encodes the transfer direction. true for a read, false for a write
+ * @cmd: Read/Write command in HDR mode, read: 0x80 - 0xff, write: 0x00 - 0x7f
  * @len: transfer length in bytes of the transfer
  * @actual_len: actual length in bytes are transferred by the controller
  * @data: input/output buffer
@@ -60,8 +65,11 @@ enum i3c_hdr_mode {
  * @data.out: output buffer. Must point to a DMA-able buffer
  * @err: I3C error code
  */
-struct i3c_priv_xfer {
-	u8 rnw;
+struct i3c_xfer {
+	union {
+		u8 rnw;
+		u8 cmd;
+	};
 	u16 len;
 	u16 actual_len;
 	union {
@@ -71,6 +79,9 @@ struct i3c_priv_xfer {
 	enum i3c_error_code err;
 };
 
+/* keep back compatible */
+#define i3c_priv_xfer i3c_xfer
+
 /**
  * enum i3c_dcr - I3C DCR values
  * @I3C_DCR_GENERIC_DEVICE: generic I3C device
@@ -297,9 +308,15 @@ static __always_inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv,
 		      i3c_i2c_driver_unregister,	\
 		      __i2cdrv)
 
-int i3c_device_do_priv_xfers(struct i3c_device *dev,
-			     struct i3c_priv_xfer *xfers,
-			     int nxfers);
+int i3c_device_do_xfers(struct i3c_device *dev, struct i3c_xfer *xfers,
+			int nxfers, enum i3c_xfer_mode mode);
+
+static inline int i3c_device_do_priv_xfers(struct i3c_device *dev,
+					   struct i3c_priv_xfer *xfers,
+					   int nxfers)
+{
+	return i3c_device_do_xfers(dev, xfers, nxfers, I3C_SDR);
+}
 
 int i3c_device_do_setdasa(struct i3c_device *dev);
 
@@ -341,5 +358,6 @@ int i3c_device_request_ibi(struct i3c_device *dev,
 void i3c_device_free_ibi(struct i3c_device *dev);
 int i3c_device_enable_ibi(struct i3c_device *dev);
 int i3c_device_disable_ibi(struct i3c_device *dev);
+u32 i3c_device_get_supported_xfer_mode(struct i3c_device *dev);
 
 #endif /* I3C_DEV_H */
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index c52a82dd79a6..d0d5b3a9049f 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -474,9 +474,13 @@ struct i3c_master_controller_ops {
 				 const struct i3c_ccc_cmd *cmd);
 	int (*send_ccc_cmd)(struct i3c_master_controller *master,
 			    struct i3c_ccc_cmd *cmd);
+	/* Deprecated, please use i3c_xfers() */
 	int (*priv_xfers)(struct i3c_dev_desc *dev,
 			  struct i3c_priv_xfer *xfers,
 			  int nxfers);
+	int (*i3c_xfers)(struct i3c_dev_desc *dev,
+			 struct i3c_xfer *xfers,
+			 int nxfers, enum i3c_xfer_mode mode);
 	int (*attach_i2c_dev)(struct i2c_dev_desc *dev);
 	void (*detach_i2c_dev)(struct i2c_dev_desc *dev);
 	int (*i2c_xfers)(struct i2c_dev_desc *dev,
-- 
cgit v1.2.3


From 9280b6ebbf08e53734d34f3bb325c37cddc1422d Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Thu, 6 Nov 2025 12:36:01 -0500
Subject: i3c: Switch to use new i3c_xfer from i3c_priv_xfer

Switch to i3c_xfer instead of i3c_priv_xfer because the framework has been
updated to support HDR mode. i3c_priv_xfer is now an alias of i3c_xfer.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251106-i3c_ddr-v11-2-33a6a66ed095@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 7f7738041f38..ae0662d9d77e 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -27,7 +27,7 @@
  * These are the standard error codes as defined by the I3C specification.
  * When -EIO is returned by the i3c_device_do_priv_xfers() or
  * i3c_device_send_hdr_cmds() one can check the error code in
- * &struct_i3c_priv_xfer.err or &struct i3c_hdr_cmd.err to get a better idea of
+ * &struct i3c_xfer.err or &struct i3c_hdr_cmd.err to get a better idea of
  * what went wrong.
  *
  */
@@ -312,7 +312,7 @@ int i3c_device_do_xfers(struct i3c_device *dev, struct i3c_xfer *xfers,
 			int nxfers, enum i3c_xfer_mode mode);
 
 static inline int i3c_device_do_priv_xfers(struct i3c_device *dev,
-					   struct i3c_priv_xfer *xfers,
+					   struct i3c_xfer *xfers,
 					   int nxfers)
 {
 	return i3c_device_do_xfers(dev, xfers, nxfers, I3C_SDR);
-- 
cgit v1.2.3


From cb2dc6d2869a4fb7ef8d792a81a74bc6f0958a72 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Sat, 29 Nov 2025 10:05:00 +0100
Subject: can: Kconfig: select CAN driver infrastructure by default

The CAN bus support enabled with CONFIG_CAN provides a socket-based
access to CAN interfaces. With the introduction of the latest CAN protocol,
CAN XL, more configuration status information needs to be exposed to the
network layer than was formerly provided by standard Linux network drivers.

This requires the CAN driver infrastructure to be selected by default.
As the CAN network layer can only operate on CAN interfaces anyway all
distributions and common default configs enable at least one CAN driver.

So selecting CONFIG_CAN_DEV when CONFIG_CAN is selected by the user has
no effect on established configurations but solves potential build issues
when CONFIG_CAN[_XXX]=y is set together with CONFIG_CAN_DEV=m.

Fixes: 1a620a723853 ("can: raw: instantly reject unsupported CAN frames")
Reported-by: Vincent Mailhol <mailhol@kernel.org>
Closes: https://lore.kernel.org/all/CAMZ6RqL_nGszwoLPXn1Li8op-ox4k3Hs6p=Hw6+w0W=DTtobPw@mail.gmail.com/
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511280531.YnWW2Rxc-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511280842.djCQ0N0O-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511282325.uVQFRTkA-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511291520.guIE1QHj-lkp@intel.com/
Suggested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251129090500.17484-1-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 7 +++++++
 net/can/Kconfig         | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 52c8be5c160e..f6416a56e95d 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -111,7 +111,14 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 void free_candev(struct net_device *dev);
 
 /* a candev safe wrapper around netdev_priv */
+#if IS_ENABLED(CONFIG_CAN_NETLINK)
 struct can_priv *safe_candev_priv(struct net_device *dev);
+#else
+static inline struct can_priv *safe_candev_priv(struct net_device *dev)
+{
+	return NULL;
+}
+#endif
 
 int open_candev(struct net_device *dev);
 void close_candev(struct net_device *dev);
diff --git a/net/can/Kconfig b/net/can/Kconfig
index af64a6f76458..e4ccf731a24c 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -5,6 +5,7 @@
 
 menuconfig CAN
 	tristate "CAN bus subsystem support"
+	select CAN_DEV
 	help
 	  Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
 	  communications protocol. Development of the CAN bus started in
-- 
cgit v1.2.3


From 414690746d2da0dc9a931f8c02d83e5834141251 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 24 Nov 2025 18:28:08 -0800
Subject: i2c: i2c.h: fix a bad kernel-doc line

Change an empty line into a blank kernel-doc line to prevent
a kernel-doc warning:

Warning: ../include/uapi/linux/i2c.h:38 bad line:

Fixes: bfb3939c51d5 ("i2c: refactor documentation of struct i2c_msg")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/uapi/linux/i2c.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/i2c.h b/include/uapi/linux/i2c.h
index a2db2a56c8b0..2a226657d9f8 100644
--- a/include/uapi/linux/i2c.h
+++ b/include/uapi/linux/i2c.h
@@ -36,7 +36,7 @@
  *
  *   Only if I2C_FUNC_NOSTART is set:
  *   %I2C_M_NOSTART: skip repeated start sequence
-
+ *
  *   Only if I2C_FUNC_PROTOCOL_MANGLING is set:
  *   %I2C_M_NO_RD_ACK: in a read message, master ACK/NACK bit is skipped
  *   %I2C_M_IGNORE_NAK: treat NACK from client as ACK
-- 
cgit v1.2.3


From beb7021a6003d9c6a463fffca0d6311efb8e0e66 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 28 Nov 2025 23:27:57 +0000
Subject: rqspinlock: Enclose lock/unlock within lock entry acquisitions

Ritesh reported that timeouts occurred frequently for rqspinlock despite
reentrancy on the same lock on the same CPU in [0]. This patch closes
one of the races leading to this behavior, and reduces the frequency of
timeouts.

We currently have a tiny window between the fast-path cmpxchg and the
grabbing of the lock entry where an NMI could land, attempt the same
lock that was just acquired, and end up timing out. This is not ideal.
Instead, move the lock entry acquisition from the fast path to before
the cmpxchg, and remove the grabbing of the lock entry in the slow path,
assuming it was already taken by the fast path. The TAS fallback is
invoked directly without being preceded by the typical fast path,
therefore we must continue to grab the deadlock detection entry in that
case.

Case on lock leading to missed AA:

cmpxchg lock A
<NMI>
... rqspinlock acquisition of A
... timeout
</NMI>
grab_held_lock_entry(A)

There is a similar case when unlocking the lock. If the NMI lands
between the WRITE_ONCE and smp_store_release, it is possible that we end
up in a situation where the NMI fails to diagnose the AA condition,
leading to a timeout.

Case on unlock leading to missed AA:

WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
<NMI>
... rqspinlock acquisition of A
... timeout
</NMI>
smp_store_release(A->locked, 0)

The patch changes the order on unlock to smp_store_release() succeeded
by WRITE_ONCE() of NULL. This avoids the missed AA detection described
above, but may lead to a false positive if the NMI lands between these
two statements, which is acceptable (and preferred over a timeout).

The original intention of the reverse order on unlock was to prevent the
following possible misdiagnosis of an ABBA scenario:

grab entry A
lock A
grab entry B
lock B
unlock B
   smp_store_release(B->locked, 0)
							grab entry B
							lock B
							grab entry A
							lock A
							! <detect ABBA>
   WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)

If the store release were after the WRITE_ONCE, the other CPU would
not observe B in the table of the CPU unlocking the lock B.  However,
since the threads are obviously participating in an ABBA deadlock, it
is no longer appealing to use the order above since it may lead to a
250 ms timeout due to missed AA detection.

  [0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com

Fixes: 0d80e7f951be ("rqspinlock: Choose trylock fallback for NMI waiters")
Reported-by: Ritesh Oedayrajsingh Varma <ritesh@superluminal.eu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/asm-generic/rqspinlock.h | 60 +++++++++++++++++++++-------------------
 kernel/bpf/rqspinlock.c          | 15 ++++------
 2 files changed, 38 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..0f2dcbbfee2f 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -129,8 +129,8 @@ dec:
 	 * <error> for lock B
 	 * release_held_lock_entry
 	 *
-	 * try_cmpxchg_acquire for lock A
 	 * grab_held_lock_entry
+	 * try_cmpxchg_acquire for lock A
 	 *
 	 * Lack of any ordering means reordering may occur such that dec, inc
 	 * are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ dec:
 	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
 	 * diagnosis).
 	 *
-	 * In case of unlock, we will always do a release on the lock word after
-	 * releasing the entry, ensuring that other CPUs cannot hold the lock
-	 * (and make conclusions about deadlocks) until the entry has been
-	 * cleared on the local CPU, preventing any anomalies. Reordering is
-	 * still possible there, but a remote CPU cannot observe a lock in our
-	 * table which it is already holding, since visibility entails our
-	 * release store for the said lock has not retired.
+	 * The case of unlock is treated differently due to NMI reentrancy, see
+	 * comments in res_spin_unlock.
 	 *
 	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
 	 * reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
 {
 	int val = 0;
 
-	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
-		grab_held_lock_entry(lock);
+	/*
+	 * Grab the deadlock detection entry before doing the cmpxchg, so that
+	 * reentrancy due to NMIs between the succeeding cmpxchg and creation of
+	 * held lock entry can correctly detect an acquisition attempt in the
+	 * interrupted context.
+	 *
+	 * cmpxchg lock A
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * grab_held_lock_entry(A)
+	 */
+	grab_held_lock_entry(lock);
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 		return 0;
-	}
 	return resilient_queued_spin_lock_slowpath(lock, val);
 }
 
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 {
 	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 
-	if (unlikely(rqh->cnt > RES_NR_HELD))
-		goto unlock;
-	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
-unlock:
 	/*
-	 * Release barrier, ensures correct ordering. See release_held_lock_entry
-	 * for details.  Perform release store instead of queued_spin_unlock,
-	 * since we use this function for test-and-set fallback as well. When we
-	 * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+	 * Release barrier, ensures correct ordering. Perform release store
+	 * instead of queued_spin_unlock, since we use this function for the TAS
+	 * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
+	 * the full 4-byte lockword.
 	 *
-	 * Like release_held_lock_entry, we can do the release before the dec.
-	 * We simply care about not seeing the 'lock' in our table from a remote
-	 * CPU once the lock has been released, which doesn't rely on the dec.
+	 * Perform the smp_store_release before clearing the lock entry so that
+	 * NMIs landing in the unlock path can correctly detect AA issues. The
+	 * opposite order shown below may lead to missed AA checks:
 	 *
-	 * Unlike smp_wmb(), release is not a two way fence, hence it is
-	 * possible for a inc to move up and reorder with our clearing of the
-	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
-	 * the remote CPU needs to hold this lock, which won't be released until
-	 * the store below is done, which would ensure the entry is overwritten
-	 * to NULL, etc.
+	 * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * smp_store_release(A->locked, 0)
 	 */
 	smp_store_release(&lock->locked, 0);
+	if (likely(rqh->cnt <= RES_NR_HELD))
+		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
 	this_cpu_dec(rqspinlock_held_locks.cnt);
 }
 
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 3cc23d79a9fc..878d641719da 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -275,6 +275,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
 	int val, ret = 0;
 
 	RES_INIT_TIMEOUT(ts);
+	/*
+	 * The fast path is not invoked for the TAS fallback, so we must grab
+	 * the deadlock detection entry here.
+	 */
 	grab_held_lock_entry(lock);
 
 	/*
@@ -397,10 +401,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 		goto queue;
 	}
 
-	/*
-	 * Grab an entry in the held locks array, to enable deadlock detection.
-	 */
-	grab_held_lock_entry(lock);
+	/* Deadlock detection entry already held after failing fast path. */
 
 	/*
 	 * We're pending, wait for the owner to go away.
@@ -448,11 +449,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 */
 queue:
 	lockevent_inc(lock_slowpath);
-	/*
-	 * Grab deadlock detection entry for the queue path.
-	 */
-	grab_held_lock_entry(lock);
-
+	/* Deadlock detection entry already held after failing fast path. */
 	node = this_cpu_ptr(&rqnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
-- 
cgit v1.2.3


From 2b6a3f061f11372af79b862d6184d43193ae927f Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 25 Nov 2025 10:00:59 +0000
Subject: mm: declare VMA flags by bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "initial work on making VMA flags a bitmap", v3.

We are in the rather silly situation that we are running out of VMA flags
as they are currently limited to a system word in size.

This leads to absurd situations where we limit features to 64-bit
architectures only because we simply do not have the ability to add a flag
for 32-bit ones.

This is very constraining and leads to hacks or, in the worst case, simply
an inability to implement features we want for entirely arbitrary reasons.

This also of course gives us something of a Y2K type situation in mm where
we might eventually exhaust all of the VMA flags even on 64-bit systems.

This series lays the groundwork for getting away from this limitation by
establishing VMA flags as a bitmap whose size we can increase in future
beyond 64 bits if required.

This is necessarily a highly iterative process given the extensive use of
VMA flags throughout the kernel, so we start by performing basic steps.

Firstly, we declare VMA flags by bit number rather than by value,
retaining the VM_xxx fields but in terms of these newly introduced
VMA_xxx_BIT fields.

While we are here, we use sparse annotations to ensure that, when dealing
with VMA bit number parameters, we cannot be passed values which are not
declared as such - providing some useful type safety.

We then introduce an opaque VMA flag type, much like the opaque mm_struct
flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags
field"), which we establish in union with vma->vm_flags (but still set at
system word size meaning there is no functional or data type size change).

We update the vm_flags_xxx() helpers to use this new bitmap, introducing
sensible helpers to do so.

This series lays the foundation for further work to expand the use of
bitmap VMA flags and eventually eliminate these arbitrary restrictions.


This patch (of 4):

In order to lay the groundwork for VMA flags being a bitmap rather than a
system word in size, we need to be able to consistently refer to VMA flags
by bit number rather than value.

Take this opportunity to do so in an enum, which is additionally useful for
tooling to extract metadata from.

This additionally makes it very clear which bits are being used for what
at a glance.

We use the VMA_ prefix for the bit values as it is logical to do so since
these reference VMAs.  We consistently suffix with _BIT to make it clear
what the values refer to.

We declare bit values even when the flags that use them would not be
enabled by config options as this is simply clearer and clearly defines
what bit numbers are used for what, at no additional cost.

We declare a sparse-bitwise type vma_flag_t which ensures that users can't
pass around invalid VMA flags by accident and prepares for future work
towards VMA flags being a bitmap where we want to ensure bit values are
type safe.

To make life easier, we declare some macro helpers - DECLARE_VMA_BIT()
allows us to avoid duplication in the enum bit number declarations (and
maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to
assist with declaration of flags.

Unfortunately we can't declare both in the enum, as we run into issues with
logic in the kernel requiring that flags are preprocessor definitions, and
additionally we cannot have a macro which declares another macro so we
must define each flag macro directly.

Additionally, update the VMA userland testing vma_internal.h header to
include these changes.

We also have to fix the parameters to the vma_flag_*_atomic() functions
since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will
complain otherwise.

We have to update some rather silly if-deffery found in fs/proc/task_mmu.c
which would otherwise break.

Finally, we update the rust binding helper as now it cannot auto-detect
the flags at all.

Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c               |   4 +-
 include/linux/mm.h               | 379 ++++++++++++++++++++++-----------------
 mm/khugepaged.c                  |   2 +-
 mm/madvise.c                     |   2 +-
 rust/bindgen_parameters          |  25 +++
 rust/bindings/bindings_helper.h  |  25 +++
 tools/testing/vma/vma_internal.h | 304 ++++++++++++++++++++++++++-----
 7 files changed, 524 insertions(+), 217 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2b4ab5718ab5..d00ac179d973 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1183,10 +1183,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT0)]	= "",
 		[ilog2(VM_PKEY_BIT1)]	= "",
 		[ilog2(VM_PKEY_BIT2)]	= "",
-#if VM_PKEY_BIT3
+#if CONFIG_ARCH_PKEY_BITS > 3
 		[ilog2(VM_PKEY_BIT3)]	= "",
 #endif
-#if VM_PKEY_BIT4
+#if CONFIG_ARCH_PKEY_BITS > 4
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75f894c3f521..a2f38fb68840 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -271,185 +271,239 @@ extern struct rw_semaphore nommu_region_sem;
 extern unsigned int kobjsize(const void *objp);
 #endif
 
-#define VM_MAYBE_GUARD_BIT 11
-
 /*
  * vm_flags in vm_area_struct, see mm_types.h.
  * When changing, update also include/trace/events/mmflags.h
  */
-#define VM_NONE		0x00000000
 
-#define VM_READ		0x00000001	/* currently active flags */
-#define VM_WRITE	0x00000002
-#define VM_EXEC		0x00000004
-#define VM_SHARED	0x00000008
+#define VM_NONE		0x00000000
 
-/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
-#define VM_MAYWRITE	0x00000020
-#define VM_MAYEXEC	0x00000040
-#define VM_MAYSHARE	0x00000080
+/**
+ * typedef vma_flag_t - specifies an individual VMA flag by bit number.
+ *
+ * This value is made type safe by sparse to avoid passing invalid flag values
+ * around.
+ */
+typedef int __bitwise vma_flag_t;
 
-#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
+#define DECLARE_VMA_BIT(name, bitnum) \
+	VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+	VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT)
+enum {
+	DECLARE_VMA_BIT(READ, 0),
+	DECLARE_VMA_BIT(WRITE, 1),
+	DECLARE_VMA_BIT(EXEC, 2),
+	DECLARE_VMA_BIT(SHARED, 3),
+	/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+	DECLARE_VMA_BIT(MAYREAD, 4),	/* limits for mprotect() etc. */
+	DECLARE_VMA_BIT(MAYWRITE, 5),
+	DECLARE_VMA_BIT(MAYEXEC, 6),
+	DECLARE_VMA_BIT(MAYSHARE, 7),
+	DECLARE_VMA_BIT(GROWSDOWN, 8),	/* general info on the segment */
 #ifdef CONFIG_MMU
-#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
-#else /* CONFIG_MMU */
-#define VM_MAYOVERLAY	0x00000200	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
-#define VM_UFFD_MISSING	0
+	DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+	DECLARE_VMA_BIT(MAYOVERLAY, 9),
 #endif /* CONFIG_MMU */
-#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
-#define VM_MAYBE_GUARD	BIT(VM_MAYBE_GUARD_BIT)	/* The VMA maybe contains guard regions. */
-#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */
-
-#define VM_LOCKED	0x00002000
-#define VM_IO           0x00004000	/* Memory mapped I/O or similar */
-
-					/* Used by sys_madvise() */
-#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
-#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
-#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
-#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
-#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
-#define VM_SYNC		0x00800000	/* Synchronous page faults */
-#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
-#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
-#define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
-
+	/* Page-ranges managed without "struct page", just pure PFN */
+	DECLARE_VMA_BIT(PFNMAP, 10),
+	DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+	DECLARE_VMA_BIT(UFFD_WP, 12),	/* wrprotect pages tracking */
+	DECLARE_VMA_BIT(LOCKED, 13),
+	DECLARE_VMA_BIT(IO, 14),	/* Memory mapped I/O or similar */
+	DECLARE_VMA_BIT(SEQ_READ, 15),	/* App will access data sequentially */
+	DECLARE_VMA_BIT(RAND_READ, 16),	/* App will not benefit from clustered reads */
+	DECLARE_VMA_BIT(DONTCOPY, 17),	/* Do not copy this vma on fork */
+	DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+	DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+	DECLARE_VMA_BIT(ACCOUNT, 20),	/* Is a VM accounted object */
+	DECLARE_VMA_BIT(NORESERVE, 21),	/* should the VM suppress accounting */
+	DECLARE_VMA_BIT(HUGETLB, 22),	/* Huge TLB Page VM */
+	DECLARE_VMA_BIT(SYNC, 23),	/* Synchronous page faults */
+	DECLARE_VMA_BIT(ARCH_1, 24),	/* Architecture-specific flag */
+	DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+	DECLARE_VMA_BIT(DONTDUMP, 26),	/* Do not include in the core dump */
+	DECLARE_VMA_BIT(SOFTDIRTY, 27),	/* NOT soft dirty clean area */
+	DECLARE_VMA_BIT(MIXEDMAP, 28),	/* Can contain struct page and pure PFN pages */
+	DECLARE_VMA_BIT(HUGEPAGE, 29),	/* MADV_HUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(MERGEABLE, 31),	/* KSM may merge identical pages */
+	/* These bits are reused, we define specific uses below. */
+	DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+	DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+	DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+	DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+	DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+	DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+	DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+	/*
+	 * This flag is used to connect VFIO to arch specific KVM code. It
+	 * indicates that the memory under this VMA is safe for use with any
+	 * non-cachable memory type inside KVM. Some VFIO devices, on some
+	 * platforms, are thought to be unsafe and can cause machine crashes
+	 * if KVM does not lock down the memory type.
+	 */
+	DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+	DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+	DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+	DECLARE_VMA_BIT(UFFD_MINOR, 41),
+	DECLARE_VMA_BIT(SEALED, 42),
+	/* Flags that reuse flags above. */
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+	/*
+	 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+	 * support core mm.
+	 *
+	 * These VMAs will get a single end guard page. This helps userspace
+	 * protect itself from attacks. A single page is enough for current
+	 * shadow stack archs (x86). See the comments near alloc_shstk() in
+	 * arch/x86/kernel/shstk.c for more details on the guard size.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+	/*
+	 * arm64's Guarded Control Stack implements similar functionality and
+	 * has similar constraints to shadow stacks.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+	DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1),		/* Strong Access Ordering (powerpc) */
+	DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1),		/* parisc */
+	DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1),	/* sparc64 */
+	DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1),	/* sparc64, arm64 */
+	DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1),	/* !CONFIG_MMU */
+	DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+	DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+#undef DECLARE_VMA_BIT
+#undef DECLARE_VMA_BIT_ALIAS
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ		INIT_VM_FLAG(READ)
+#define VM_WRITE	INIT_VM_FLAG(WRITE)
+#define VM_EXEC		INIT_VM_FLAG(EXEC)
+#define VM_SHARED	INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD	INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE	INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC	INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE	INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN	INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING	INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING	VM_NONE
+#define VM_MAYOVERLAY	INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP	INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD	INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP	INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED	INIT_VM_FLAG(LOCKED)
+#define VM_IO		INIT_VM_FLAG(IO)
+#define VM_SEQ_READ	INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ	INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY	INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND	INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT	INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT	INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE	INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB	INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC		INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1	INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK	INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP	INIT_VM_FLAG(DONTDUMP)
 #ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
+#define VM_SOFTDIRTY	INIT_VM_FLAG(SOFTDIRTY)
 #else
-# define VM_SOFTDIRTY	0
+#define VM_SOFTDIRTY	VM_NONE
+#endif
+#define VM_MIXEDMAP	INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE	INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK	INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY	VM_NONE
 #endif
-
-#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE	BIT(31)		/* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
-#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
-#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
-
 #ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0  VM_HIGH_ARCH_0
-# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
-# define VM_PKEY_BIT2  VM_HIGH_ARCH_2
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
 #if CONFIG_ARCH_PKEY_BITS > 3
-# define VM_PKEY_BIT3  VM_HIGH_ARCH_3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
 #else
-# define VM_PKEY_BIT3  0
-#endif
+#define VM_PKEY_BIT3  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
 #if CONFIG_ARCH_PKEY_BITS > 4
-# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
 #else
-# define VM_PKEY_BIT4  0
-#endif
+#define VM_PKEY_BIT4  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
 #endif /* CONFIG_ARCH_HAS_PKEYS */
-
-#ifdef CONFIG_X86_USER_SHADOW_STACK
-/*
- * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
- * support core mm.
- *
- * These VMAs will get a single end guard page. This helps userspace protect
- * itself from attacks. A single page is enough for current shadow stack archs
- * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
- * for more details on the guard size.
- */
-# define VM_SHADOW_STACK	VM_HIGH_ARCH_5
-#endif
-
-#if defined(CONFIG_ARM64_GCS)
-/*
- * arm64's Guarded Control Stack implements similar functionality and
- * has similar constraints to shadow stacks.
- */
-# define VM_SHADOW_STACK	VM_HIGH_ARCH_6
-#endif
-
-#ifndef VM_SHADOW_STACK
-# define VM_SHADOW_STACK	VM_NONE
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK	INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK	VM_NONE
 #endif
-
 #if defined(CONFIG_PPC64)
-# define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
+#define VM_SAO		INIT_VM_FLAG(SAO)
 #elif defined(CONFIG_PARISC)
-# define VM_GROWSUP	VM_ARCH_1
+#define VM_GROWSUP	INIT_VM_FLAG(GROWSUP)
 #elif defined(CONFIG_SPARC64)
-# define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
-# define VM_ARCH_CLEAR	VM_SPARC_ADI
+#define VM_SPARC_ADI	INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
 #elif defined(CONFIG_ARM64)
-# define VM_ARM64_BTI	VM_ARCH_1	/* BTI guarded page, a.k.a. GP bit */
-# define VM_ARCH_CLEAR	VM_ARM64_BTI
+#define VM_ARM64_BTI	INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
 #elif !defined(CONFIG_MMU)
-# define VM_MAPPED_COPY	VM_ARCH_1	/* T if mapped copy of data (nommu mmap) */
-#endif
-
-#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE		VM_HIGH_ARCH_4	/* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED	VM_HIGH_ARCH_5	/* Tagged memory permitted */
-#else
-# define VM_MTE		VM_NONE
-# define VM_MTE_ALLOWED	VM_NONE
+#define VM_MAPPED_COPY	INIT_VM_FLAG(MAPPED_COPY)
 #endif
-
 #ifndef VM_GROWSUP
-# define VM_GROWSUP	VM_NONE
+#define VM_GROWSUP	VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE		INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED	INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE		VM_NONE
+#define VM_MTE_ALLOWED	VM_NONE
 #endif
-
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT	41
-# define VM_UFFD_MINOR		BIT(VM_UFFD_MINOR_BIT)	/* UFFD minor faults */
-#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-# define VM_UFFD_MINOR		VM_NONE
-#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-
-/*
- * This flag is used to connect VFIO to arch specific KVM code. It
- * indicates that the memory under this VMA is safe for use with any
- * non-cachable memory type inside KVM. Some VFIO devices, on some
- * platforms, are thought to be unsafe and can cause machine crashes
- * if KVM does not lock down the memory type.
- */
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED_BIT	39
-#define VM_ALLOW_ANY_UNCACHED		BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#define VM_UFFD_MINOR	INIT_VM_FLAG(UFFD_MINOR)
 #else
-#define VM_ALLOW_ANY_UNCACHED		VM_NONE
+#define VM_UFFD_MINOR	VM_NONE
 #endif
-
 #ifdef CONFIG_64BIT
-#define VM_DROPPABLE_BIT	40
-#define VM_DROPPABLE		BIT(VM_DROPPABLE_BIT)
-#elif defined(CONFIG_PPC32)
-#define VM_DROPPABLE		VM_ARCH_1
+#define VM_ALLOW_ANY_UNCACHED	INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED		INIT_VM_FLAG(SEALED)
 #else
-#define VM_DROPPABLE		VM_NONE
+#define VM_ALLOW_ANY_UNCACHED	VM_NONE
+#define VM_SEALED		VM_NONE
 #endif
-
-#ifdef CONFIG_64BIT
-#define VM_SEALED_BIT	42
-#define VM_SEALED	BIT(VM_SEALED_BIT)
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE		INIT_VM_FLAG(DROPPABLE)
 #else
-#define VM_SEALED	VM_NONE
+#define VM_DROPPABLE		VM_NONE
 #endif
 
 /* Bits set in the VMA until the stack is in its final location */
@@ -475,12 +529,10 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
 
-#ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK	VM_GROWSUP
-#define VM_STACK_EARLY	VM_GROWSDOWN
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP	VM_SEALED
 #else
-#define VM_STACK	VM_GROWSDOWN
-#define VM_STACK_EARLY	0
+#define VM_SEALED_SYSMAP	VM_NONE
 #endif
 
 #define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
@@ -488,7 +540,6 @@ extern unsigned int kobjsize(const void *objp);
 /* VMA basic access permission flags */
 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
 
-
 /*
  * Special vmas that are non-mergable, non-mlock()able.
  */
@@ -523,7 +574,7 @@ extern unsigned int kobjsize(const void *objp);
 
 /* Arch-specific flags to clear when updating VM flags on protection change */
 #ifndef VM_ARCH_CLEAR
-# define VM_ARCH_CLEAR	VM_NONE
+#define VM_ARCH_CLEAR	VM_NONE
 #endif
 #define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
 
@@ -920,9 +971,9 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
 }
 
 static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
-				       int bit)
+					   vma_flag_t bit)
 {
-	const vm_flags_t mask = BIT(bit);
+	const vm_flags_t mask = BIT((__force int)bit);
 
 	/* Only specific flags are permitted */
 	if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
@@ -935,14 +986,15 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
  * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
  * valid flags are allowed to do this.
  */
-static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit)
+static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
+				       vma_flag_t bit)
 {
 	/* mmap read lock/VMA read lock must be held. */
 	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
 		vma_assert_locked(vma);
 
 	if (__vma_flag_atomic_valid(vma, bit))
-		set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags));
+		set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags));
 }
 
 /*
@@ -952,10 +1004,11 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit)
  * This is necessarily racey, so callers must ensure that serialisation is
  * achieved through some other means, or that races are permissible.
  */
-static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit)
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma,
+					vma_flag_t bit)
 {
 	if (__vma_flag_atomic_valid(vma, bit))
-		return test_bit(bit, &vma->vm_flags);
+		return test_bit((__force int)bit, &vma->vm_flags);
 
 	return false;
 }
@@ -4517,16 +4570,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
 int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
 int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 
-
-/*
- * mseal of userspace process's system mappings.
- */
-#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
-#define VM_SEALED_SYSMAP	VM_SEALED
-#else
-#define VM_SEALED_SYSMAP	VM_NONE
-#endif
-
 /*
  * DMA mapping IDs for page_pool
  *
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 89c33ef7aac3..97d1b2824386 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1740,7 +1740,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
 	 * obtained on guard region installation after the flag is set, so this
 	 * check being performed under this lock excludes races.
 	 */
-	if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT))
+	if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
 		return false;
 
 	return true;
diff --git a/mm/madvise.c b/mm/madvise.c
index d8bc51e1bea7..b617b1be0f53 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1142,7 +1142,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
 	 * acquire an mmap/VMA write lock to read it. All remaining readers may
 	 * or may not see the flag set, but we don't care.
 	 */
-	vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT);
+	vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
 
 	/*
 	 * If anonymous and we are establishing page tables the VMA ought to
diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters
index e13c6f9dd17b..fd2fd1c3cb9a 100644
--- a/rust/bindgen_parameters
+++ b/rust/bindgen_parameters
@@ -35,6 +35,31 @@
 # recognized, block generation of the non-helper constants.
 --blocklist-item ARCH_SLAB_MINALIGN
 --blocklist-item ARCH_KMALLOC_MINALIGN
+--blocklist-item VM_MERGEABLE
+--blocklist-item VM_READ
+--blocklist-item VM_WRITE
+--blocklist-item VM_EXEC
+--blocklist-item VM_SHARED
+--blocklist-item VM_MAYREAD
+--blocklist-item VM_MAYWRITE
+--blocklist-item VM_MAYEXEC
+--blocklist-item VM_MAYSHARE
+--blocklist-item VM_PFNMAP
+--blocklist-item VM_IO
+--blocklist-item VM_DONTCOPY
+--blocklist-item VM_DONTEXPAND
+--blocklist-item VM_LOCKONFAULT
+--blocklist-item VM_ACCOUNT
+--blocklist-item VM_NORESERVE
+--blocklist-item VM_HUGETLB
+--blocklist-item VM_SYNC
+--blocklist-item VM_ARCH_1
+--blocklist-item VM_WIPEONFORK
+--blocklist-item VM_DONTDUMP
+--blocklist-item VM_SOFTDIRTY
+--blocklist-item VM_MIXEDMAP
+--blocklist-item VM_HUGEPAGE
+--blocklist-item VM_NOHUGEPAGE
 
 # Structs should implement `Zeroable` when all of their fields do.
 --with-derive-custom-struct .*=MaybeZeroable
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 2e43c66635a2..4c327db01ca0 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -108,7 +108,32 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT;
 
 const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC;
 const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1;
+
 const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE;
+const vm_flags_t RUST_CONST_HELPER_VM_READ = VM_READ;
+const vm_flags_t RUST_CONST_HELPER_VM_WRITE = VM_WRITE;
+const vm_flags_t RUST_CONST_HELPER_VM_EXEC = VM_EXEC;
+const vm_flags_t RUST_CONST_HELPER_VM_SHARED = VM_SHARED;
+const vm_flags_t RUST_CONST_HELPER_VM_MAYREAD = VM_MAYREAD;
+const vm_flags_t RUST_CONST_HELPER_VM_MAYWRITE = VM_MAYWRITE;
+const vm_flags_t RUST_CONST_HELPER_VM_MAYEXEC = VM_MAYEXEC;
+const vm_flags_t RUST_CONST_HELPER_VM_MAYSHARE = VM_MAYSHARE;
+const vm_flags_t RUST_CONST_HELPER_VM_PFNMAP = VM_PFNMAP;
+const vm_flags_t RUST_CONST_HELPER_VM_IO = VM_IO;
+const vm_flags_t RUST_CONST_HELPER_VM_DONTCOPY = VM_DONTCOPY;
+const vm_flags_t RUST_CONST_HELPER_VM_DONTEXPAND = VM_DONTEXPAND;
+const vm_flags_t RUST_CONST_HELPER_VM_LOCKONFAULT = VM_LOCKONFAULT;
+const vm_flags_t RUST_CONST_HELPER_VM_ACCOUNT = VM_ACCOUNT;
+const vm_flags_t RUST_CONST_HELPER_VM_NORESERVE = VM_NORESERVE;
+const vm_flags_t RUST_CONST_HELPER_VM_HUGETLB = VM_HUGETLB;
+const vm_flags_t RUST_CONST_HELPER_VM_SYNC = VM_SYNC;
+const vm_flags_t RUST_CONST_HELPER_VM_ARCH_1 = VM_ARCH_1;
+const vm_flags_t RUST_CONST_HELPER_VM_WIPEONFORK = VM_WIPEONFORK;
+const vm_flags_t RUST_CONST_HELPER_VM_DONTDUMP = VM_DONTDUMP;
+const vm_flags_t RUST_CONST_HELPER_VM_SOFTDIRTY = VM_SOFTDIRTY;
+const vm_flags_t RUST_CONST_HELPER_VM_MIXEDMAP = VM_MIXEDMAP;
+const vm_flags_t RUST_CONST_HELPER_VM_HUGEPAGE = VM_HUGEPAGE;
+const vm_flags_t RUST_CONST_HELPER_VM_NOHUGEPAGE = VM_NOHUGEPAGE;
 
 #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST)
 #include "../../drivers/android/binder/rust_binder.h"
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 8c2ac301a00e..b7e8fc9ccdf4 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -46,42 +46,271 @@ extern unsigned long dac_mmap_min_addr;
 
 #define MMF_HAS_MDWE	28
 
+/*
+ * vm_flags in vm_area_struct, see mm_types.h.
+ * When changing, update also include/trace/events/mmflags.h
+ */
+
 #define VM_NONE		0x00000000
-#define VM_READ		0x00000001
-#define VM_WRITE	0x00000002
-#define VM_EXEC		0x00000004
-#define VM_SHARED	0x00000008
-#define VM_MAYREAD	0x00000010
-#define VM_MAYWRITE	0x00000020
-#define VM_MAYEXEC	0x00000040
-#define VM_GROWSDOWN	0x00000100
-#define VM_PFNMAP	0x00000400
-#define VM_MAYBE_GUARD	0x00000800
-#define VM_LOCKED	0x00002000
-#define VM_IO           0x00004000
-#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
-#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
-#define VM_DONTEXPAND	0x00040000
-#define VM_LOCKONFAULT	0x00080000
-#define VM_ACCOUNT	0x00100000
-#define VM_NORESERVE	0x00200000
-#define VM_MIXEDMAP	0x10000000
-#define VM_STACK	VM_GROWSDOWN
-#define VM_SHADOW_STACK	VM_NONE
-#define VM_SOFTDIRTY	0
-#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
-#define VM_GROWSUP	VM_NONE
 
-#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/**
+ * typedef vma_flag_t - specifies an individual VMA flag by bit number.
+ *
+ * This value is made type safe by sparse to avoid passing invalid flag values
+ * around.
+ */
+typedef int __bitwise vma_flag_t;
 
+#define DECLARE_VMA_BIT(name, bitnum) \
+	VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+	VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT
+enum {
+	DECLARE_VMA_BIT(READ, 0),
+	DECLARE_VMA_BIT(WRITE, 1),
+	DECLARE_VMA_BIT(EXEC, 2),
+	DECLARE_VMA_BIT(SHARED, 3),
+	/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+	DECLARE_VMA_BIT(MAYREAD, 4),	/* limits for mprotect() etc. */
+	DECLARE_VMA_BIT(MAYWRITE, 5),
+	DECLARE_VMA_BIT(MAYEXEC, 6),
+	DECLARE_VMA_BIT(MAYSHARE, 7),
+	DECLARE_VMA_BIT(GROWSDOWN, 8),	/* general info on the segment */
+#ifdef CONFIG_MMU
+	DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+	DECLARE_VMA_BIT(MAYOVERLAY, 9),
+#endif /* CONFIG_MMU */
+	/* Page-ranges managed without "struct page", just pure PFN */
+	DECLARE_VMA_BIT(PFNMAP, 10),
+	DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+	DECLARE_VMA_BIT(UFFD_WP, 12),	/* wrprotect pages tracking */
+	DECLARE_VMA_BIT(LOCKED, 13),
+	DECLARE_VMA_BIT(IO, 14),	/* Memory mapped I/O or similar */
+	DECLARE_VMA_BIT(SEQ_READ, 15),	/* App will access data sequentially */
+	DECLARE_VMA_BIT(RAND_READ, 16),	/* App will not benefit from clustered reads */
+	DECLARE_VMA_BIT(DONTCOPY, 17),	/* Do not copy this vma on fork */
+	DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+	DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+	DECLARE_VMA_BIT(ACCOUNT, 20),	/* Is a VM accounted object */
+	DECLARE_VMA_BIT(NORESERVE, 21),	/* should the VM suppress accounting */
+	DECLARE_VMA_BIT(HUGETLB, 22),	/* Huge TLB Page VM */
+	DECLARE_VMA_BIT(SYNC, 23),	/* Synchronous page faults */
+	DECLARE_VMA_BIT(ARCH_1, 24),	/* Architecture-specific flag */
+	DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+	DECLARE_VMA_BIT(DONTDUMP, 26),	/* Do not include in the core dump */
+	DECLARE_VMA_BIT(SOFTDIRTY, 27),	/* NOT soft dirty clean area */
+	DECLARE_VMA_BIT(MIXEDMAP, 28),	/* Can contain struct page and pure PFN pages */
+	DECLARE_VMA_BIT(HUGEPAGE, 29),	/* MADV_HUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(MERGEABLE, 31),	/* KSM may merge identical pages */
+	/* These bits are reused, we define specific uses below. */
+	DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+	DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+	DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+	DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+	DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+	DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+	DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+	/*
+	 * This flag is used to connect VFIO to arch specific KVM code. It
+	 * indicates that the memory under this VMA is safe for use with any
+	 * non-cachable memory type inside KVM. Some VFIO devices, on some
+	 * platforms, are thought to be unsafe and can cause machine crashes
+	 * if KVM does not lock down the memory type.
+	 */
+	DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+	DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+	DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+	DECLARE_VMA_BIT(UFFD_MINOR, 41),
+	DECLARE_VMA_BIT(SEALED, 42),
+	/* Flags that reuse flags above. */
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+	/*
+	 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+	 * support core mm.
+	 *
+	 * These VMAs will get a single end guard page. This helps userspace
+	 * protect itself from attacks. A single page is enough for current
+	 * shadow stack archs (x86). See the comments near alloc_shstk() in
+	 * arch/x86/kernel/shstk.c for more details on the guard size.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+	/*
+	 * arm64's Guarded Control Stack implements similar functionality and
+	 * has similar constraints to shadow stacks.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+	DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1),		/* Strong Access Ordering (powerpc) */
+	DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1),		/* parisc */
+	DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1),	/* sparc64 */
+	DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1),	/* sparc64, arm64 */
+	DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1),	/* !CONFIG_MMU */
+	DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
 #ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK	VM_GROWSUP
-#define VM_STACK_EARLY	VM_GROWSDOWN
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+	DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
 #else
-#define VM_STACK	VM_GROWSDOWN
-#define VM_STACK_EARLY	0
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
 #endif
+};
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ		INIT_VM_FLAG(READ)
+#define VM_WRITE	INIT_VM_FLAG(WRITE)
+#define VM_EXEC		INIT_VM_FLAG(EXEC)
+#define VM_SHARED	INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD	INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE	INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC	INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE	INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN	INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING	INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING	VM_NONE
+#define VM_MAYOVERLAY	INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP	INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD	INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP	INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED	INIT_VM_FLAG(LOCKED)
+#define VM_IO		INIT_VM_FLAG(IO)
+#define VM_SEQ_READ	INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ	INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY	INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND	INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT	INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT	INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE	INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB	INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC		INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1	INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK	INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP	INIT_VM_FLAG(DONTDUMP)
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define VM_SOFTDIRTY	INIT_VM_FLAG(SOFTDIRTY)
+#else
+#define VM_SOFTDIRTY	VM_NONE
+#endif
+#define VM_MIXEDMAP	INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE	INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK	INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY	VM_NONE
+#endif
+#ifdef CONFIG_ARCH_HAS_PKEYS
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
+#if CONFIG_ARCH_PKEY_BITS > 3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
+#else
+#define VM_PKEY_BIT3  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
+#if CONFIG_ARCH_PKEY_BITS > 4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
+#else
+#define VM_PKEY_BIT4  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK	INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK	VM_NONE
+#endif
+#if defined(CONFIG_PPC64)
+#define VM_SAO		INIT_VM_FLAG(SAO)
+#elif defined(CONFIG_PARISC)
+#define VM_GROWSUP	INIT_VM_FLAG(GROWSUP)
+#elif defined(CONFIG_SPARC64)
+#define VM_SPARC_ADI	INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
+#elif defined(CONFIG_ARM64)
+#define VM_ARM64_BTI	INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
+#elif !defined(CONFIG_MMU)
+#define VM_MAPPED_COPY	INIT_VM_FLAG(MAPPED_COPY)
+#endif
+#ifndef VM_GROWSUP
+#define VM_GROWSUP	VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE		INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED	INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE		VM_NONE
+#define VM_MTE_ALLOWED	VM_NONE
+#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+#define VM_UFFD_MINOR	INIT_VM_FLAG(UFFD_MINOR)
+#else
+#define VM_UFFD_MINOR	VM_NONE
+#endif
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED	INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED		INIT_VM_FLAG(SEALED)
+#else
+#define VM_ALLOW_ANY_UNCACHED	VM_NONE
+#define VM_SEALED		VM_NONE
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE		INIT_VM_FLAG(DROPPABLE)
+#else
+#define VM_DROPPABLE		VM_NONE
+#endif
+
+/* Bits set in the VMA until the stack is in its final location */
+#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC	(VM_READ | VM_WRITE | VM_MAYREAD | \
+				 VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC	(VM_READ | VM_WRITE | VM_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS		/* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
+#endif
+
+#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
+#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+#endif
+
+#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
+
+#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+/*
+ * Special vmas that are non-mergable, non-mlock()able.
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
 #define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
 #define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
@@ -97,26 +326,11 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
 				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
-#define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_TSK_EXEC
-
-#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-
-#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
-#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
-#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
-
 #define RLIMIT_STACK		3	/* max stack size */
 #define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
 
 #define CAP_IPC_LOCK         14
 
-#ifdef CONFIG_64BIT
-#define VM_SEALED_BIT	42
-#define VM_SEALED	BIT(VM_SEALED_BIT)
-#else
-#define VM_SEALED	VM_NONE
-#endif
-
 /*
  * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
  * possesses it but the other does not, the merged VMA should nonetheless have
-- 
cgit v1.2.3


From 58eac97a8ba0bcfc5dffb347e40ea3006347ff38 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 25 Nov 2025 10:01:00 +0000
Subject: mm: simplify and rename mm flags function for clarity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The __mm_flags_set_word() function is slightly ambiguous - we use 'set' to
refer to setting individual bits (such as in mm_flags_set()) but here we
use it to refer to overwriting the value altogether.

Rename it to __mm_flags_overwrite_word() to eliminate this ambiguity.

We additionally simplify the functions, eliminating unnecessary
bitmap_xxx() operations (the compiler would have optimised these out but
it's worth being as clear as we can be here).

Link: https://lkml.kernel.org/r/8f0bc556e1b90eca8ea5eba41f8d5d3f9cd7c98a.1764064557.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 14 +++++---------
 kernel/fork.c            |  4 ++--
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4f66a3206a63..3550672e0f9e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1314,15 +1314,13 @@ struct mm_struct {
 	unsigned long cpu_bitmap[];
 };
 
-/* Set the first system word of mm flags, non-atomically. */
-static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
+/* Copy value to the first system word of mm flags, non-atomically. */
+static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value)
 {
-	unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
-
-	bitmap_copy(bitmap, &value, BITS_PER_LONG);
+	*ACCESS_PRIVATE(&mm->flags, __mm_flags) = value;
 }
 
-/* Obtain a read-only view of the bitmap. */
+/* Obtain a read-only view of the mm flags bitmap. */
 static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
 {
 	return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
@@ -1331,9 +1329,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct
 /* Read the first system word of mm flags, non-atomically. */
 static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
 {
-	const unsigned long *bitmap = __mm_flags_get_bitmap(mm);
-
-	return bitmap_read(bitmap, 0, BITS_PER_LONG);
+	return *__mm_flags_get_bitmap(mm);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index dd0bb5fe4305..5e3309a2332c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,10 +1061,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (current->mm) {
 		unsigned long flags = __mm_flags_get_word(current->mm);
 
-		__mm_flags_set_word(mm, mmf_init_legacy_flags(flags));
+		__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
 		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
 	} else {
-		__mm_flags_set_word(mm, default_dump_filter);
+		__mm_flags_overwrite_word(mm, default_dump_filter);
 		mm->def_flags = 0;
 	}
 
-- 
cgit v1.2.3


From 9ea35a25d51b13013b724943a177a7aaf4bfed71 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 25 Nov 2025 10:01:02 +0000
Subject: mm: introduce VMA flags bitmap type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is useful to transition to using a bitmap for VMA flags so we can avoid
running out of flags, especially for 32-bit kernels which are constrained
to 32 flags, necessitating some features to be limited to 64-bit kernels
only.

By doing so, we remove any constraint on the number of VMA flags moving
forwards no matter the platform and can decide in future to extend beyond
64 if required.

We start by declaring an opaque type, vma_flags_t (which resembles
mm_struct flags of type mm_flags_t), setting it to precisely the same size
as vm_flags_t, and place it in union with vm_flags in the VMA declaration.

We additionally update struct vm_area_desc equivalently placing the new
opaque type in union with vm_flags.

This change therefore does not impact the size of struct vm_area_struct or
struct vm_area_desc.

In order for the change to be iterative and to avoid impacting
performance, we designate VM_xxx declared bitmap flag values as those
which must exist in the first system word of the VMA flags bitmap.

We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(),
vma_flags_overwrite_word_once(), vma_flags_set_word() and
vma_flags_clear_word() in order to allow us to update the existing
vm_flags_*() functions to utilise these helpers.

This is a stepping stone towards converting users to the VMA flags bitmap
and behaves precisely as before.

By doing this, we can eliminate the existing private vma->__vm_flags field
in the vma->vm_flags union and replace it with the newly introduced opaque
type vma_flags_t, which we call flags so we refer to the new bitmap field as
vma->flags.

We update vma_flag_[test, set]_atomic() to account for the change also.

We adapt vm_flags_reset_once() to only clear those bits above the first
system word providing write-once semantics to the first system word (which
it is presumed the caller requires - and in all current use cases this is
so).

As we currently only specify that the VMA flags bitmap size is equal to
BITS_PER_LONG number of bits, this is a noop, but is defensive in
preparation for a future change that increases this.

We additionally update the VMA userland test declarations to implement the
same changes there.

Finally, we update the rust code to reference vma->vm_flags on update
rather than vma->__vm_flags which has been removed.  This is safe for now,
albeit it is implicitly performing a const cast.

Once we introduce flag helpers we can improve this more.

No functional change intended.

Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               |  24 +++++--
 include/linux/mm_types.h         |  64 ++++++++++++++++-
 rust/kernel/mm/virt.rs           |   2 +-
 tools/testing/vma/vma_internal.h | 150 +++++++++++++++++++++++++++++++--------
 4 files changed, 202 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2f38fb68840..2887d3b34d3e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -911,7 +911,8 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 				 vm_flags_t flags)
 {
 	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
-	ACCESS_PRIVATE(vma, __vm_flags) = flags;
+	vma_flags_clear_all(&vma->flags);
+	vma_flags_overwrite_word(&vma->flags, flags);
 }
 
 /*
@@ -931,14 +932,25 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma,
 				       vm_flags_t flags)
 {
 	vma_assert_write_locked(vma);
-	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
+	/*
+	 * If VMA flags exist beyond the first system word, also clear these. It
+	 * is assumed the write once behaviour is required only for the first
+	 * system word.
+	 */
+	if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
+		unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
+		bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG);
+	}
+
+	vma_flags_overwrite_word_once(&vma->flags, flags);
 }
 
 static inline void vm_flags_set(struct vm_area_struct *vma,
 				vm_flags_t flags)
 {
 	vma_start_write(vma);
-	ACCESS_PRIVATE(vma, __vm_flags) |= flags;
+	vma_flags_set_word(&vma->flags, flags);
 }
 
 static inline void vm_flags_clear(struct vm_area_struct *vma,
@@ -946,7 +958,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma,
 {
 	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
 	vma_start_write(vma);
-	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
+	vma_flags_clear_word(&vma->flags, flags);
 }
 
 /*
@@ -989,12 +1001,14 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
 static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
 				       vma_flag_t bit)
 {
+	unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
 	/* mmap read lock/VMA read lock must be held. */
 	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
 		vma_assert_locked(vma);
 
 	if (__vma_flag_atomic_valid(vma, bit))
-		set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags));
+		set_bit((__force int)bit, bitmap);
 }
 
 /*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3550672e0f9e..b71625378ce3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -848,6 +848,15 @@ struct mmap_action {
 	bool hide_from_rmap_until_complete :1;
 };
 
+/*
+ * Opaque type representing current VMA (vm_area_struct) flag state. Must be
+ * accessed via vma_flags_xxx() helper functions.
+ */
+#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+typedef struct {
+	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
+} __private vma_flags_t;
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -865,7 +874,10 @@ struct vm_area_desc {
 	/* Mutable fields. Populated with initial state. */
 	pgoff_t pgoff;
 	struct file *vm_file;
-	vm_flags_t vm_flags;
+	union {
+		vm_flags_t vm_flags;
+		vma_flags_t vma_flags;
+	};
 	pgprot_t page_prot;
 
 	/* Write-only fields. */
@@ -910,10 +922,12 @@ struct vm_area_struct {
 	/*
 	 * Flags, see mm.h.
 	 * To modify use vm_flags_{init|reset|set|clear|mod} functions.
+	 * Preferably, use vma_flags_xxx() functions.
 	 */
 	union {
+		/* Temporary while VMA flags are being converted. */
 		const vm_flags_t vm_flags;
-		vm_flags_t __private __vm_flags;
+		vma_flags_t flags;
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
@@ -994,6 +1008,52 @@ struct vm_area_struct {
 #endif
 } __randomize_layout;
 
+/* Clears all bits in the VMA flags bitmap, non-atomically. */
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+	*ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	WRITE_ONCE(*bitmap, value);
+}
+
+/* Update the first system word of VMA flags setting bits, non-atomically. */
+static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap |= value;
+}
+
+/* Update the first system word of VMA flags clearing bits, non-atomically. */
+static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap &= ~value;
+}
+
 #ifdef CONFIG_NUMA
 #define vma_policy(vma) ((vma)->vm_policy)
 #else
diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs
index a1bfa4e19293..da21d65ccd20 100644
--- a/rust/kernel/mm/virt.rs
+++ b/rust/kernel/mm/virt.rs
@@ -250,7 +250,7 @@ impl VmaNew {
         // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet
         // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel.
         // The caller promises that this does not set the flags to an invalid value.
-        unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags };
+        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags = flags };
     }
 
     /// Set the `VM_MIXEDMAP` flag on this vma.
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index b7e8fc9ccdf4..9f0a9f5ed0fe 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -524,6 +524,15 @@ typedef struct {
 	__private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
 } mm_flags_t;
 
+/*
+ * Opaque type representing current VMA (vm_area_struct) flag state. Must be
+ * accessed via vma_flags_xxx() helper functions.
+ */
+#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+typedef struct {
+	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
+} __private vma_flags_t;
+
 struct mm_struct {
 	struct maple_tree mm_mt;
 	int map_count;			/* number of VMAs */
@@ -608,7 +617,10 @@ struct vm_area_desc {
 	/* Mutable fields. Populated with initial state. */
 	pgoff_t pgoff;
 	struct file *vm_file;
-	vm_flags_t vm_flags;
+	union {
+		vm_flags_t vm_flags;
+		vma_flags_t vma_flags;
+	};
 	pgprot_t page_prot;
 
 	/* Write-only fields. */
@@ -654,7 +666,7 @@ struct vm_area_struct {
 	 */
 	union {
 		const vm_flags_t vm_flags;
-		vm_flags_t __private __vm_flags;
+		vma_flags_t flags;
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
@@ -1368,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
 	return true;
 }
 
-static inline void vm_flags_init(struct vm_area_struct *vma,
-				 vm_flags_t flags)
-{
-	vma->__vm_flags = flags;
-}
-
-static inline void vm_flags_set(struct vm_area_struct *vma,
-				vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma->__vm_flags |= flags;
-}
-
-static inline void vm_flags_clear(struct vm_area_struct *vma,
-				  vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma->__vm_flags &= ~flags;
-}
-
 static inline int shmem_zero_setup(struct vm_area_struct *vma)
 {
 	return 0;
@@ -1544,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
 {
 }
 
-# define ACCESS_PRIVATE(p, member) ((p)->member)
+#define ACCESS_PRIVATE(p, member) ((p)->member)
+
+#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
+
+static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
+{
+	unsigned int len = bitmap_size(nbits);
+
+	if (small_const_nbits(nbits))
+		*dst = 0;
+	else
+		memset(dst, 0, len);
+}
 
 static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
 {
 	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
 }
 
+/* Clears all bits in the VMA flags bitmap, non-atomically. */
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+	*ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	WRITE_ONCE(*bitmap, value);
+}
+
+/* Update the first system word of VMA flags setting bits, non-atomically. */
+static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap |= value;
+}
+
+/* Update the first system word of VMA flags clearing bits, non-atomically. */
+static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap &= ~value;
+}
+
+
+/* Use when VMA is not part of the VMA tree and needs no locking */
+static inline void vm_flags_init(struct vm_area_struct *vma,
+				 vm_flags_t flags)
+{
+	vma_flags_clear_all(&vma->flags);
+	vma_flags_overwrite_word(&vma->flags, flags);
+}
+
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
+ * it should be locked explicitly beforehand.
+ */
+static inline void vm_flags_reset(struct vm_area_struct *vma,
+				  vm_flags_t flags)
+{
+	vma_assert_write_locked(vma);
+	vm_flags_init(vma, flags);
+}
+
+static inline void vm_flags_reset_once(struct vm_area_struct *vma,
+				       vm_flags_t flags)
+{
+	vma_assert_write_locked(vma);
+	/*
+	 * The user should only be interested in avoiding reordering of
+	 * assignment to the first word.
+	 */
+	vma_flags_clear_all(&vma->flags);
+	vma_flags_overwrite_word_once(&vma->flags, flags);
+}
+
+static inline void vm_flags_set(struct vm_area_struct *vma,
+				vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma_flags_set_word(&vma->flags, flags);
+}
+
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+				  vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma_flags_clear_word(&vma->flags, flags);
+}
+
 /*
  * Denies creating a writable executable mapping or gaining executable permissions.
  *
@@ -1763,11 +1860,4 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
 	return 0;
 }
 
-static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags)
-{
-	vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags);
-
-	*dst = flags;
-}
-
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From f3b566d726357df591602f195a9379494f005225 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Wed, 26 Nov 2025 02:04:35 +0000
Subject: memcg: remove inc/dec_lruvec_kmem_state helpers

The dec_lruvec_kmem_state helper is unused by any caller and can be safely
removed.  Meanwhile, the inc_lruvec_kmem_state helper is only referenced
by shadow_lru_isolate, so retaining these two helpers is unnecessary.  This
patch removes both helper functions to eliminate redundant code.

Link: https://lkml.kernel.org/r/20251126020435.1511637-1-chenridong@huaweicloud.com
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lu Jialin <lujialin4@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ----------
 mm/workingset.c            |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d35390f9892a..0651865a4564 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1452,16 +1452,6 @@ struct slabobj_ext {
 #endif
 } __aligned(8);
 
-static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
-	mod_lruvec_kmem_state(p, idx, 1);
-}
-
-static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
-	mod_lruvec_kmem_state(p, idx, -1);
-}
-
 static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/workingset.c b/mm/workingset.c
index 6ff30369b758..1399d6da75a2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->count != node->nr_values))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
-	inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
 
 out_invalid:
 	xa_unlock_irq(&mapping->i_pages);
-- 
cgit v1.2.3


From 127fa2ae9e2b1f9b9d876dfaa39fe3640cec5764 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Sat, 18 Oct 2025 20:41:36 +0300
Subject: KEYS: trusted: Replace a redundant instance of tpm2_hash_map

'trusted_tpm2' duplicates 'tpm2_hash_map' originally part of the TPM
driver, which is suboptimal.

Implement and export `tpm2_find_hash_alg()` in the driver, and substitute
the redundant code in 'trusted_tpm2' with a call to the new function.

Reviewed-by: Jonathan McDowell <noodles@meta.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm2-cmd.c               | 14 +++++++++++++-
 include/linux/tpm.h                       |  1 +
 security/keys/trusted-keys/trusted_tpm2.c | 23 ++++-------------------
 3 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 7d77f6fbc152..5532e53a2dd3 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -18,7 +18,7 @@ static bool disable_pcr_integrity;
 module_param(disable_pcr_integrity, bool, 0444);
 MODULE_PARM_DESC(disable_pcr_integrity, "Disable integrity protection of TPM2_PCR_Extend");
 
-static struct tpm2_hash tpm2_hash_map[] = {
+struct tpm2_hash tpm2_hash_map[] = {
 	{HASH_ALGO_SHA1, TPM_ALG_SHA1},
 	{HASH_ALGO_SHA256, TPM_ALG_SHA256},
 	{HASH_ALGO_SHA384, TPM_ALG_SHA384},
@@ -26,6 +26,18 @@ static struct tpm2_hash tpm2_hash_map[] = {
 	{HASH_ALGO_SM3_256, TPM_ALG_SM3_256},
 };
 
+int tpm2_find_hash_alg(unsigned int crypto_id)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tpm2_hash_map); i++)
+		if (crypto_id == tpm2_hash_map[i].crypto_id)
+			return tpm2_hash_map[i].tpm_id;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(tpm2_find_hash_alg);
+
 int tpm2_get_timeouts(struct tpm_chip *chip)
 {
 	chip->timeout_a = msecs_to_jiffies(TPM2_TIMEOUT_A);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index dc0338a783f3..b15360ff78d7 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -473,6 +473,7 @@ extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max);
 extern struct tpm_chip *tpm_default_chip(void);
 void tpm2_flush_context(struct tpm_chip *chip, u32 handle);
+int tpm2_find_hash_alg(unsigned int crypto_id);
 
 static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
 {
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index 024be262702f..edd7b9d7e4dc 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -18,14 +18,6 @@
 
 #include "tpm2key.asn1.h"
 
-static struct tpm2_hash tpm2_hash_map[] = {
-	{HASH_ALGO_SHA1, TPM_ALG_SHA1},
-	{HASH_ALGO_SHA256, TPM_ALG_SHA256},
-	{HASH_ALGO_SHA384, TPM_ALG_SHA384},
-	{HASH_ALGO_SHA512, TPM_ALG_SHA512},
-	{HASH_ALGO_SM3_256, TPM_ALG_SM3_256},
-};
-
 static u32 tpm2key_oid[] = { 2, 23, 133, 10, 1, 5 };
 
 static int tpm2_key_encode(struct trusted_key_payload *payload,
@@ -244,20 +236,13 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 	off_t offset = TPM_HEADER_SIZE;
 	struct tpm_buf buf, sized;
 	int blob_len = 0;
-	u32 hash;
+	int hash;
 	u32 flags;
-	int i;
 	int rc;
 
-	for (i = 0; i < ARRAY_SIZE(tpm2_hash_map); i++) {
-		if (options->hash == tpm2_hash_map[i].crypto_id) {
-			hash = tpm2_hash_map[i].tpm_id;
-			break;
-		}
-	}
-
-	if (i == ARRAY_SIZE(tpm2_hash_map))
-		return -EINVAL;
+	hash = tpm2_find_hash_alg(options->hash);
+	if (hash < 0)
+		return hash;
 
 	if (!options->keyhandle)
 		return -EINVAL;
-- 
cgit v1.2.3


From 2b092175f5e301cdaa935093edfef2be9defb6df Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 28 Nov 2025 16:06:41 -0500
Subject: NFS: Fix inheritance of the block sizes when automounting

Only inherit the block sizes that were actually specified as mount
parameters for the parent mount.

Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           | 21 +++++++++++++++++----
 fs/nfs/internal.h         |  1 -
 fs/nfs/namespace.c        |  5 ++++-
 fs/nfs/nfs4client.c       | 18 ++++++++++++++----
 fs/nfs/super.c            | 10 +++-------
 include/linux/nfs_fs_sb.h |  5 +++++
 6 files changed, 43 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 54699299d5b1..2aaea9c98c2c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -784,10 +784,18 @@ static int nfs_init_server(struct nfs_server *server,
 		server->fattr_valid = NFS_ATTR_FATTR_V4;
 	}
 
-	if (ctx->rsize)
+	if (ctx->bsize) {
+		server->bsize = ctx->bsize;
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_BSIZE;
+	}
+	if (ctx->rsize) {
 		server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto);
-	if (ctx->wsize)
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_RSIZE;
+	}
+	if (ctx->wsize) {
 		server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto);
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_WSIZE;
+	}
 
 	server->acregmin = ctx->acregmin * HZ;
 	server->acregmax = ctx->acregmax * HZ;
@@ -977,8 +985,13 @@ EXPORT_SYMBOL_GPL(nfs_probe_server);
 void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
 {
 	target->flags = source->flags;
-	target->rsize = source->rsize;
-	target->wsize = source->wsize;
+	target->automount_inherit = source->automount_inherit;
+	if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_BSIZE)
+		target->bsize = source->bsize;
+	if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_RSIZE)
+		target->rsize = source->rsize;
+	if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_WSIZE)
+		target->wsize = source->wsize;
 	target->acregmin = source->acregmin;
 	target->acregmax = source->acregmax;
 	target->acdirmin = source->acdirmin;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ffd382aa31ac..2e596244799f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,7 +152,6 @@ struct nfs_fs_context {
 		struct super_block	*sb;
 		struct dentry		*dentry;
 		struct nfs_fattr	*fattr;
-		unsigned int		inherited_bsize;
 	} clone_data;
 };
 
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index dca055676c4f..9e4d94f41fc6 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -190,6 +190,10 @@ struct vfsmount *nfs_d_automount(struct path *path)
 	ctx->nfs_mod		= client->cl_nfs_mod;
 	get_nfs_version(ctx->nfs_mod);
 
+	/* Inherit block sizes if they were specified as mount parameters */
+	if (server->automount_inherit & NFS_AUTOMOUNT_INHERIT_BSIZE)
+		ctx->bsize = server->bsize;
+
 	ret = client->rpc_ops->submount(fc, server);
 	if (ret < 0) {
 		mnt = ERR_PTR(ret);
@@ -289,7 +293,6 @@ int nfs_do_submount(struct fs_context *fc)
 		return -ENOMEM;
 
 	ctx->internal		= true;
-	ctx->clone_data.inherited_bsize = ctx->clone_data.sb->s_blocksize_bits;
 
 	p = nfs_devname(dentry, buffer, 4096);
 	if (IS_ERR(p)) {
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4e972f85d0ca..96bccefbe2cb 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1179,10 +1179,20 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
 	if (error < 0)
 		return error;
 
-	if (ctx->rsize)
-		server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto);
-	if (ctx->wsize)
-		server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto);
+	if (ctx->bsize) {
+		server->bsize = ctx->bsize;
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_BSIZE;
+	}
+	if (ctx->rsize) {
+		server->rsize =
+			nfs_io_size(ctx->rsize, server->nfs_client->cl_proto);
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_RSIZE;
+	}
+	if (ctx->wsize) {
+		server->wsize =
+			nfs_io_size(ctx->wsize, server->nfs_client->cl_proto);
+		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_WSIZE;
+	}
 
 	server->acregmin = ctx->acregmin * HZ;
 	server->acregmax = ctx->acregmax * HZ;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 66413133b43e..57d372db03b9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1091,8 +1091,9 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
 	sb->s_blocksize = 0;
 	sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr;
 	sb->s_op = server->nfs_client->cl_nfs_mod->sops;
-	if (ctx->bsize)
-		sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
+	if (server->bsize)
+		sb->s_blocksize =
+			nfs_block_size(server->bsize, &sb->s_blocksize_bits);
 
 	switch (server->nfs_client->rpc_ops->version) {
 	case 2:
@@ -1338,13 +1339,8 @@ int nfs_get_tree_common(struct fs_context *fc)
 	}
 
 	if (!s->s_root) {
-		unsigned bsize = ctx->clone_data.inherited_bsize;
 		/* initial superblock/root creation */
 		nfs_fill_super(s, ctx);
-		if (bsize) {
-			s->s_blocksize_bits = bsize;
-			s->s_blocksize = 1U << bsize;
-		}
 		error = nfs_get_cache_cookie(s, ctx);
 		if (error < 0)
 			goto error_splat_super;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4ba04de6b1ca..c58b870f31ee 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -172,6 +172,11 @@ struct nfs_server {
 #define NFS_MOUNT_FORCE_RDIRPLUS	0x20000000
 #define NFS_MOUNT_NETUNREACH_FATAL	0x40000000
 
+	unsigned int		automount_inherit; /* Properties inherited by automount */
+#define NFS_AUTOMOUNT_INHERIT_BSIZE	0x0001
+#define NFS_AUTOMOUNT_INHERIT_RSIZE	0x0002
+#define NFS_AUTOMOUNT_INHERIT_WSIZE	0x0004
+
 	unsigned int		caps;		/* server capabilities */
 	__u64			fattr_valid;	/* Valid attributes */
 	unsigned int		rsize;		/* read size */
-- 
cgit v1.2.3


From 205dd7a5d6ad6f4c8e8fcd3c3b95a7c0e7067fee Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 18 Nov 2025 18:06:31 -0500
Subject: virtio_pci: drop kernel.h

virtio UAPI headers really have no business pulling in kernel.h
Replace it with const.h which seems to be what's needed
for __KERNEL_DIV_ROUND_UP.

Fixes: 7c1ae151e812 ("virtio_pci: Introduce device parts access commands")
Cc: Yishai Hadas <yishaih@nvidia.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Message-ID: <7a73b6c6af67e13b86633cd7bf11ad56b5d9809b.1763535341.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index c691ac210ce2..e732e3456e27 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -40,7 +40,7 @@
 #define _LINUX_VIRTIO_PCI_H
 
 #include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/const.h>
 
 #ifndef VIRTIO_PCI_NO_LEGACY
 
-- 
cgit v1.2.3


From 51d7a054521de7085783a9a1ba15c3530863409a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 5 Nov 2025 15:23:50 +0100
Subject: locking/mutex: Redo __mutex_init() to reduce generated code size

mutex_init() invokes __mutex_init() providing the name of the lock and
a pointer to the lock class. With LOCKDEP enabled this information is
useful but without LOCKDEP it is not used at all. Passing the pointer
information of the lock class might be considered negligible but the
name of the lock is passed as well and the string is stored. This
information is wasting storage.

Split __mutex_init() into a _generic() variant doing the initialisation
of the lock and a _lockdep() version which does _generic() plus the
lockdep bits. Restrict the lockdep version to lockdep enabled builds
allowing the compiler to remove the unused parameter.

This results in the following size reduction:

        text     data       bss        dec  filename
  | 30237599  8161430   1176624   39575653  vmlinux.defconfig
  | 30233269  8149142   1176560   39558971  vmlinux.defconfig.patched
     -4.2KiB   -12KiB

  | 32455099  8471098  12934684   53860881  vmlinux.defconfig.lockdep
  | 32455100  8471098  12934684   53860882  vmlinux.defconfig.patched.lockdep

  | 27152407  7191822   2068040   36412269  vmlinux.defconfig.preempt_rt
  | 27145937  7183630   2067976   36397543  vmlinux.defconfig.patched.preempt_rt
     -6.3KiB    -8KiB

  | 29382020  7505742  13784608   50672370  vmlinux.defconfig.preempt_rt.lockdep
  | 29376229  7505742  13784544   50666515  vmlinux.defconfig.patched.preempt_rt.lockdep
     -5.6KiB

[peterz: folded fix from boqun]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20251125145425.68319-1-boqun.feng@gmail.com
Link: https://patch.msgid.link/20251105142350.Tfeevs2N@linutronix.de
---
 include/linux/mutex.h        | 45 ++++++++++++++++++++++++++++++++++----------
 kernel/locking/mutex-debug.c | 10 +---------
 kernel/locking/mutex.c       | 28 +++++++++++++++++++++------
 kernel/locking/mutex.h       |  5 ++---
 kernel/locking/rtmutex_api.c | 19 +++++++++++++++----
 5 files changed, 75 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 847b81ca6436..bf535f0118bb 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -86,8 +86,23 @@ do {									\
 #define DEFINE_MUTEX(mutexname) \
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
-extern void __mutex_init(struct mutex *lock, const char *name,
-			 struct lock_class_key *key);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+				struct lock_class_key *key)
+{
+	mutex_init_lockep(lock, name, key);
+}
+#else
+extern void mutex_init_generic(struct mutex *lock);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+				struct lock_class_key *key)
+{
+	mutex_init_generic(lock);
+}
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 /**
  * mutex_is_locked - is the mutex locked
@@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock);
 #define DEFINE_MUTEX(mutexname)						\
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
-extern void __mutex_rt_init(struct mutex *lock, const char *name,
-			    struct lock_class_key *key);
-
 #define mutex_is_locked(l)	rt_mutex_base_is_locked(&(l)->rtmutex)
 
-#define __mutex_init(mutex, name, key)			\
-do {							\
-	rt_mutex_base_init(&(mutex)->rtmutex);		\
-	__mutex_rt_init((mutex), name, key);		\
-} while (0)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name,
+			     struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+				struct lock_class_key *key)
+{
+	mutex_rt_init_lockdep(lock, name, key);
+}
 
+#else
+extern void mutex_rt_init_generic(struct mutex *mutex);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+				struct lock_class_key *key)
+{
+	mutex_rt_init_generic(lock);
+}
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 #endif /* CONFIG_PREEMPT_RT */
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 949103fd8e9b..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock)
 	}
 }
 
-void debug_mutex_init(struct mutex *lock, const char *name,
-		      struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
 {
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	/*
-	 * Make sure we are not reinitializing a held lock:
-	 */
-	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
-	lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
-#endif
 	lock->magic = lock;
 }
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index de7d6702cd96..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -43,8 +43,7 @@
 # define MUTEX_WARN_ON(cond)
 #endif
 
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
 {
 	atomic_long_set(&lock->owner, 0);
 	raw_spin_lock_init(&lock->wait_lock);
@@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	osq_lock_init(&lock->osq);
 #endif
-
-	debug_mutex_init(lock, name, key);
+	debug_mutex_init(lock);
 }
-EXPORT_SYMBOL(__mutex_init);
 
 static inline struct task_struct *__owner_task(unsigned long owner)
 {
@@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
  * There is nothing that would stop spreading the lockdep annotations outwards
  * except more code.
  */
+void mutex_init_generic(struct mutex *lock)
+{
+	__mutex_init_generic(lock);
+}
+EXPORT_SYMBOL(mutex_init_generic);
 
 /*
  * Optimistic trylock that only works in the uncontended case. Make sure to
@@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
 
 	return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
 }
-#endif
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+	__mutex_init_generic(lock);
+
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+	lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_init_lockep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
 {
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 2e8080a9bee3..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock,
 extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 				      struct task_struct *task);
 extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
-			     struct lock_class_key *key);
+extern void debug_mutex_init(struct mutex *lock);
 #else /* CONFIG_DEBUG_MUTEXES */
 # define debug_mutex_lock_common(lock, waiter)		do { } while (0)
 # define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)
@@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
 # define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0)
 # define debug_mutex_remove_waiter(lock, waiter, ti)	do { } while (0)
 # define debug_mutex_unlock(lock)			do { } while (0)
-# define debug_mutex_init(lock, name, key)		do { } while (0)
+# define debug_mutex_init(lock)				do { } while (0)
 #endif /* !CONFIG_DEBUG_MUTEXES */
 #endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index bafd5af98eae..59dbd29cb219 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task)
 
 #ifdef CONFIG_PREEMPT_RT
 /* Mutexes */
-void __mutex_rt_init(struct mutex *mutex, const char *name,
-		     struct lock_class_key *key)
+static void __mutex_rt_init_generic(struct mutex *mutex)
 {
+	rt_mutex_base_init(&mutex->rtmutex);
 	debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
-	lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
 }
-EXPORT_SYMBOL(__mutex_rt_init);
 
 static __always_inline int __mutex_lock_common(struct mutex *lock,
 					       unsigned int state,
@@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+	__mutex_rt_init_generic(mutex);
+	lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
 void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 {
 	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
@@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock,
 EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
 #else /* CONFIG_DEBUG_LOCK_ALLOC */
 
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+	__mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
 void __sched mutex_lock(struct mutex *lock)
 {
 	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
-- 
cgit v1.2.3


From 719e357fc09c63238956eb7cd546627f9e050640 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol@kernel.org>
Date: Thu, 27 Nov 2025 15:41:40 +0100
Subject: locking/local_lock: s/l/__l/ and s/tl/__tl/ to reduce the risk of
 shadowing

The Linux kernel coding style advises to avoid common variable
names in function-like macros to reduce the risk of namespace
collisions.

Throughout local_lock_internal.h, several macros use the rather common
variable names 'l' and 'tl'. This already resulted in an actual
collision: the __local_lock_acquire() function-like macro is currently
shadowing the parameter 'l' of the:

  class_##_name##_t class_##_name##_constructor(_type *l)

function factory from <linux/cleanup.h>.

Rename the variable 'l' to '__l' and the variable 'tl' to '__tl'
throughout the file to fix the current namespace collision and
to prevent future ones.

[ bigeasy: Rebase, update all l and tl instances in macros ]

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://patch.msgid.link/20251127144140.215722-3-bigeasy@linutronix.de
---
 include/linux/local_lock_internal.h | 62 ++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index a4dc479157b5..8f82b4eb542f 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -99,18 +99,18 @@ do {								\
 
 #define __local_lock_acquire(lock)					\
 	do {								\
-		local_trylock_t *tl;					\
-		local_lock_t *l;					\
+		local_trylock_t *__tl;					\
+		local_lock_t *__l;					\
 									\
-		l = (local_lock_t *)(lock);				\
-		tl = (local_trylock_t *)l;				\
+		__l = (local_lock_t *)(lock);				\
+		__tl = (local_trylock_t *)__l;				\
 		_Generic((lock),					\
 			local_trylock_t *: ({				\
-				lockdep_assert(tl->acquired == 0);	\
-				WRITE_ONCE(tl->acquired, 1);		\
+				lockdep_assert(__tl->acquired == 0);	\
+				WRITE_ONCE(__tl->acquired, 1);		\
 			}),						\
 			local_lock_t *: (void)0);			\
-		local_lock_acquire(l);					\
+		local_lock_acquire(__l);				\
 	} while (0)
 
 #define __local_lock(lock)					\
@@ -133,36 +133,36 @@ do {								\
 
 #define __local_trylock(lock)					\
 	({							\
-		local_trylock_t *tl;				\
+		local_trylock_t *__tl;				\
 								\
 		preempt_disable();				\
-		tl = (lock);					\
-		if (READ_ONCE(tl->acquired)) {			\
+		__tl = (lock);					\
+		if (READ_ONCE(__tl->acquired)) {		\
 			preempt_enable();			\
-			tl = NULL;				\
+			__tl = NULL;				\
 		} else {					\
-			WRITE_ONCE(tl->acquired, 1);		\
+			WRITE_ONCE(__tl->acquired, 1);		\
 			local_trylock_acquire(			\
-				(local_lock_t *)tl);		\
+				(local_lock_t *)__tl);		\
 		}						\
-		!!tl;						\
+		!!__tl;						\
 	})
 
 #define __local_trylock_irqsave(lock, flags)			\
 	({							\
-		local_trylock_t *tl;				\
+		local_trylock_t *__tl;				\
 								\
 		local_irq_save(flags);				\
-		tl = (lock);					\
-		if (READ_ONCE(tl->acquired)) {			\
+		__tl = (lock);					\
+		if (READ_ONCE(__tl->acquired)) {		\
 			local_irq_restore(flags);		\
-			tl = NULL;				\
+			__tl = NULL;				\
 		} else {					\
-			WRITE_ONCE(tl->acquired, 1);		\
+			WRITE_ONCE(__tl->acquired, 1);		\
 			local_trylock_acquire(			\
-				(local_lock_t *)tl);		\
+				(local_lock_t *)__tl);		\
 		}						\
-		!!tl;						\
+		!!__tl;						\
 	})
 
 /* preemption or migration must be disabled before calling __local_lock_is_locked */
@@ -170,16 +170,16 @@ do {								\
 
 #define __local_lock_release(lock)					\
 	do {								\
-		local_trylock_t *tl;					\
-		local_lock_t *l;					\
+		local_trylock_t *__tl;					\
+		local_lock_t *__l;					\
 									\
-		l = (local_lock_t *)(lock);				\
-		tl = (local_trylock_t *)l;				\
-		local_lock_release(l);					\
+		__l = (local_lock_t *)(lock);				\
+		__tl = (local_trylock_t *)__l;				\
+		local_lock_release(__l);				\
 		_Generic((lock),					\
 			local_trylock_t *: ({				\
-				lockdep_assert(tl->acquired == 1);	\
-				WRITE_ONCE(tl->acquired, 0);		\
+				lockdep_assert(__tl->acquired == 1);	\
+				WRITE_ONCE(__tl->acquired, 0);		\
 			}),						\
 			local_lock_t *: (void)0);			\
 	} while (0)
@@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t;
 #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
 #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
 
-#define __local_lock_init(l)					\
+#define __local_lock_init(__l)					\
 	do {							\
-		local_spin_lock_init((l));			\
+		local_spin_lock_init((__l));			\
 	} while (0)
 
-#define __local_trylock_init(l)			__local_lock_init(l)
+#define __local_trylock_init(__l)			__local_lock_init(__l)
 
 #define __local_lock(__lock)					\
 	do {							\
-- 
cgit v1.2.3


From 43decb6b628eb033a1b6188e5018773c0d38be1d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 27 Nov 2025 22:59:25 -0800
Subject: locking/local_lock: Fix all kernel-doc warnings

Modify kernel-doc comments in local_lock.h to prevent warnings:

  Warning: include/linux/local_lock.h:9 function parameter 'lock' not described in 'local_lock_init'
  Warning: include/linux/local_lock.h:56 function parameter 'lock' not described in 'local_trylock_init'
  Warning: include/linux/local_lock.h:56 expecting prototype for local_lock_init(). Prototype was for local_trylock_init() instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251128065925.917917-1-rdunlap@infradead.org
---
 include/linux/local_lock.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 0d91d060e3e9..b0e6ab329b00 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -6,6 +6,7 @@
 
 /**
  * local_lock_init - Runtime initialize a lock instance
+ * @lock:	The lock variable
  */
 #define local_lock_init(lock)		__local_lock_init(lock)
 
@@ -52,7 +53,8 @@
 	__local_unlock_irqrestore(this_cpu_ptr(lock), flags)
 
 /**
- * local_lock_init - Runtime initialize a lock instance
+ * local_trylock_init - Runtime initialize a lock instance
+ * @lock:	The lock variable
  */
 #define local_trylock_init(lock)	__local_trylock_init(lock)
 
-- 
cgit v1.2.3


From bd45d46ffc8fa96e8ee9fa078cef53e0c1221ff4 Mon Sep 17 00:00:00 2001
From: Michał Winiarski <michal.winiarski@intel.com>
Date: Thu, 27 Nov 2025 10:39:33 +0100
Subject: drm/xe/pf: Export helpers for VFIO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Device specific VFIO driver variant for Xe will implement VF migration.
Export everything that's needed for migration ops.

Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patch.msgid.link/20251127093934.1462188-4-michal.winiarski@intel.com
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
(cherry picked from commit 17f22465c5a5573724c942ca7147b4024631ef87)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/Makefile        |   4 ++
 drivers/gpu/drm/xe/xe_sriov_vfio.c |  80 +++++++++++++++++++++
 include/drm/intel/xe_sriov_vfio.h  | 143 +++++++++++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+)
 create mode 100644 drivers/gpu/drm/xe/xe_sriov_vfio.c
 create mode 100644 include/drm/intel/xe_sriov_vfio.h

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index e4b273b025d2..a1f86a59d687 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -184,6 +184,10 @@ xe-$(CONFIG_PCI_IOV) += \
 	xe_sriov_pf_sysfs.o \
 	xe_tile_sriov_pf_debugfs.o
 
+ifeq ($(CONFIG_PCI_IOV),y)
+	xe-$(CONFIG_XE_VFIO_PCI) += xe_sriov_vfio.o
+endif
+
 # include helpers for tests even when XE is built-in
 ifdef CONFIG_DRM_XE_KUNIT_TEST
 xe-y += tests/xe_kunit_helpers.o
diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c
new file mode 100644
index 000000000000..e9a7615bb5c5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <drm/intel/xe_sriov_vfio.h>
+#include <linux/cleanup.h>
+
+#include "xe_pci.h"
+#include "xe_pm.h"
+#include "xe_sriov_pf_control.h"
+#include "xe_sriov_pf_helpers.h"
+#include "xe_sriov_pf_migration.h"
+
+struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev)
+{
+	return xe_pci_to_pf_device(pdev);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci");
+
+bool xe_sriov_vfio_migration_supported(struct xe_device *xe)
+{
+	if (!IS_SRIOV_PF(xe))
+		return false;
+
+	return xe_sriov_pf_migration_supported(xe);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci");
+
+#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl)			\
+_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid)		\
+{										\
+	if (!IS_SRIOV_PF(xe))							\
+		return -EPERM;							\
+	if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))			\
+		return -EINVAL;							\
+										\
+	guard(xe_pm_runtime_noresume)(xe);					\
+										\
+	return xe_sriov_pf_##_impl(xe, vfid);					\
+}										\
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci")
+
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size);
+
+ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
+				char __user *buf, size_t len)
+{
+	if (!IS_SRIOV_PF(xe))
+		return -EPERM;
+	if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
+		return -EINVAL;
+
+	guard(xe_pm_runtime_noresume)(xe);
+
+	return xe_sriov_pf_migration_read(xe, vfid, buf, len);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci");
+
+ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
+				 const char __user *buf, size_t len)
+{
+	if (!IS_SRIOV_PF(xe))
+		return -EPERM;
+	if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
+		return -EINVAL;
+
+	guard(xe_pm_runtime_noresume)(xe);
+
+	return xe_sriov_pf_migration_write(xe, vfid, buf, len);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci");
diff --git a/include/drm/intel/xe_sriov_vfio.h b/include/drm/intel/xe_sriov_vfio.h
new file mode 100644
index 000000000000..e9814e8149fd
--- /dev/null
+++ b/include/drm/intel/xe_sriov_vfio.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_SRIOV_VFIO_H_
+#define _XE_SRIOV_VFIO_H_
+
+#include <linux/types.h>
+
+struct pci_dev;
+struct xe_device;
+
+/**
+ * xe_sriov_vfio_get_pf() - Get PF &xe_device.
+ * @pdev: the VF &pci_dev device
+ *
+ * Return: pointer to PF &xe_device, NULL otherwise.
+ */
+struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev);
+
+/**
+ * xe_sriov_vfio_migration_supported() - Check if migration is supported.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ *
+ * Return: true if migration is supported, false otherwise.
+ */
+bool xe_sriov_vfio_migration_supported(struct xe_device *xe);
+
+/**
+ * xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * This function will wait until VF FLR is processed by PF on all tiles (or
+ * until timeout occurs).
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_suspend_device() - Suspend VF.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * This function will pause VF on all tiles/GTs.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_resume_device() - Resume VF.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * This function will resume VF on all tiles.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_error() - Move VF device to error state.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Reset is needed to move it out of error state.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid);
+
+/**
+ * xe_sriov_vfio_data_read() - Read migration data from the VF device.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ * @buf: start address of userspace buffer
+ * @len: requested read size from userspace
+ *
+ * Return: number of bytes that has been successfully read,
+ *	   0 if no more migration data is available, -errno on failure.
+ */
+ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
+				char __user *buf, size_t len);
+/**
+ * xe_sriov_vfio_data_write() - Write migration data to the VF device.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ * @buf: start address of userspace buffer
+ * @len: requested write size from userspace
+ *
+ * Return: number of bytes that has been successfully written, -errno on failure.
+ */
+ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
+				 const char __user *buf, size_t len);
+/**
+ * xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: migration data size in bytes or a negative error code on failure.
+ */
+ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid);
+
+#endif
-- 
cgit v1.2.3


From 611cf41ef6ac8301d23daadd8e78b013db0c5071 Mon Sep 17 00:00:00 2001
From: Yongxin Liu <yongxin.liu@windriver.com>
Date: Fri, 28 Nov 2025 18:24:38 +0800
Subject: platform/x86: intel_pmc_ipc: fix ACPI buffer memory leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The intel_pmc_ipc() function uses ACPI_ALLOCATE_BUFFER to allocate memory
for the ACPI evaluation result but never frees it, causing a 192-byte
memory leak on each call.

This leak is triggered during network interface initialization when the
stmmac driver calls intel_mac_finish() -> intel_pmc_ipc().

  unreferenced object 0xffff96a848d6ea80 (size 192):
    comm "dhcpcd", pid 541, jiffies 4294684345
    hex dump (first 32 bytes):
      04 00 00 00 05 00 00 00 98 ea d6 48 a8 96 ff ff  ...........H....
      00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
    backtrace (crc b1564374):
      kmemleak_alloc+0x2d/0x40
      __kmalloc_noprof+0x2fa/0x730
      acpi_ut_initialize_buffer+0x83/0xc0
      acpi_evaluate_object+0x29a/0x2f0
      intel_pmc_ipc+0xfd/0x170
      intel_mac_finish+0x168/0x230
      stmmac_mac_finish+0x3d/0x50
      phylink_major_config+0x22b/0x5b0
      phylink_mac_initial_config.constprop.0+0xf1/0x1b0
      phylink_start+0x8e/0x210
      __stmmac_open+0x12c/0x2b0
      stmmac_open+0x23c/0x380
      __dev_open+0x11d/0x2c0
      __dev_change_flags+0x1d2/0x250
      netif_change_flags+0x2b/0x70
      dev_change_flags+0x40/0xb0

Add __free(kfree) for ACPI object to properly release the allocated buffer.

Cc: stable@vger.kernel.org
Fixes: 7e2f7e25f6ff ("arch: x86: add IPC mailbox accessor function and add SoC register access")
Signed-off-by: Yongxin Liu <yongxin.liu@windriver.com>
Link: https://patch.msgid.link/20251128102437.3412891-2-yongxin.liu@windriver.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/platform_data/x86/intel_pmc_ipc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/x86/intel_pmc_ipc.h b/include/linux/platform_data/x86/intel_pmc_ipc.h
index 1d34435b7001..85ea381e4a27 100644
--- a/include/linux/platform_data/x86/intel_pmc_ipc.h
+++ b/include/linux/platform_data/x86/intel_pmc_ipc.h
@@ -9,6 +9,7 @@
 #ifndef INTEL_PMC_IPC_H
 #define INTEL_PMC_IPC_H
 #include <linux/acpi.h>
+#include <linux/cleanup.h>
 
 #define IPC_SOC_REGISTER_ACCESS			0xAA
 #define IPC_SOC_SUB_CMD_READ			0x00
@@ -48,7 +49,6 @@ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf
 		{.type = ACPI_TYPE_INTEGER,},
 	};
 	struct acpi_object_list arg_list = { PMC_IPCS_PARAM_COUNT, params };
-	union acpi_object *obj;
 	int status;
 
 	if (!ipc_cmd || !rbuf)
@@ -72,7 +72,7 @@ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf
 	if (ACPI_FAILURE(status))
 		return -ENODEV;
 
-	obj = buffer.pointer;
+	union acpi_object *obj __free(kfree) = buffer.pointer;
 
 	if (obj && obj->type == ACPI_TYPE_PACKAGE &&
 	    obj->package.count == VALID_IPC_RESPONSE) {
-- 
cgit v1.2.3


From 33b2835f0b7e2a458473b0e3a23b54b92108b6b0 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 2 Sep 2025 11:11:40 -0400
Subject: Bluetooth: HCI: Add initial support for PAST

This adds PAST related commands (HCI_OP_LE_PAST,
HCI_OP_LE_PAST_SET_INFO and HCI_OP_LE_PAST_PARAMS) and events
(HCI_EV_LE_PAST_RECEIVED) along with handling of PAST sender and
receiver feature bits, including new MGMT settings
(MGMT_SETTING_PAST_SENDER and MGMT_SETTING_PAST_RECEIVER) which
userspace can use to determine if PAST is supported by the
controller.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      | 54 +++++++++++++++++++++++++++
 include/net/bluetooth/hci_core.h | 12 ++++++
 include/net/bluetooth/mgmt.h     |  2 +
 net/bluetooth/hci_event.c        | 79 +++++++++++++++++++++++++++++++++++-----
 net/bluetooth/hci_sync.c         |  3 ++
 net/bluetooth/iso.c              | 25 +++++++++++++
 net/bluetooth/mgmt.c             | 12 ++++++
 7 files changed, 177 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index cb4c02d00759..d883ad233ebc 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -647,6 +647,8 @@ enum {
 #define HCI_LE_EXT_ADV			0x10
 #define HCI_LE_PERIODIC_ADV		0x20
 #define HCI_LE_CHAN_SEL_ALG2		0x40
+#define HCI_LE_PAST_SENDER		0x01
+#define HCI_LE_PAST_RECEIVER		0x02
 #define HCI_LE_CIS_CENTRAL		0x10
 #define HCI_LE_CIS_PERIPHERAL		0x20
 #define HCI_LE_ISO_BROADCASTER		0x40
@@ -2068,6 +2070,44 @@ struct hci_cp_le_set_privacy_mode {
 	__u8  mode;
 } __packed;
 
+#define HCI_OP_LE_PAST			0x205a
+struct hci_cp_le_past {
+	__le16 handle;
+	__le16 service_data;
+	__le16 sync_handle;
+} __packed;
+
+struct hci_rp_le_past {
+	__u8   status;
+	__le16 handle;
+} __packed;
+
+#define HCI_OP_LE_PAST_SET_INFO		0x205b
+struct hci_cp_le_past_set_info {
+	__le16 handle;
+	__le16 service_data;
+	__u8   adv_handle;
+} __packed;
+
+struct hci_rp_le_past_set_info {
+	__u8   status;
+	__le16 handle;
+} __packed;
+
+#define HCI_OP_LE_PAST_PARAMS		0x205c
+struct hci_cp_le_past_params {
+	__le16  handle;
+	__u8    mode;
+	__le16  skip;
+	__le16  sync_timeout;
+	__u8    cte_type;
+} __packed;
+
+struct hci_rp_le_past_params {
+	__u8   status;
+	__le16 handle;
+} __packed;
+
 #define HCI_OP_LE_READ_BUFFER_SIZE_V2	0x2060
 struct hci_rp_le_read_buffer_size_v2 {
 	__u8    status;
@@ -2800,6 +2840,20 @@ struct hci_evt_le_ext_adv_set_term {
 	__u8	num_evts;
 } __packed;
 
+#define HCI_EV_LE_PAST_RECEIVED		0x18
+struct hci_ev_le_past_received {
+	__u8   status;
+	__le16 handle;
+	__le16 service_data;
+	__le16 sync_handle;
+	__u8   sid;
+	__u8   bdaddr_type;
+	bdaddr_t  bdaddr;
+	__u8   phy;
+	__le16 interval;
+	__u8   clock_accuracy;
+} __packed;
+
 #define HCI_EVT_LE_CIS_ESTABLISHED	0x19
 struct hci_evt_le_cis_established {
 	__u8  status;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 0cb87687837f..1bd12c303e25 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -2053,6 +2053,18 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define sync_recv_capable(dev) \
 	((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
 #define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev))
+#define past_sender_capable(dev) \
+	((dev)->le_features[3] & HCI_LE_PAST_SENDER)
+#define past_receiver_capable(dev) \
+	((dev)->le_features[3] & HCI_LE_PAST_RECEIVER)
+#define past_capable(dev) \
+	(past_sender_capable(dev) || past_receiver_capable(dev))
+#define past_sender_enabled(dev) \
+	(le_enabled(dev) && past_sender_capable(dev))
+#define past_receiver_enabled(dev) \
+	(le_enabled(dev) && past_receiver_capable(dev))
+#define past_enabled(dev) \
+	(past_sender_enabled(dev) || past_receiver_enabled(dev))
 
 #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
 	(!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG)))
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index f5be96f08b9d..8234915854b6 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -119,6 +119,8 @@ struct mgmt_rp_read_index_list {
 #define MGMT_SETTING_ISO_BROADCASTER	BIT(20)
 #define MGMT_SETTING_ISO_SYNC_RECEIVER	BIT(21)
 #define MGMT_SETTING_LL_PRIVACY		BIT(22)
+#define MGMT_SETTING_PAST_SENDER	BIT(23)
+#define MGMT_SETTING_PAST_RECEIVER	BIT(24)
 
 #define MGMT_OP_READ_INFO		0x0004
 #define MGMT_READ_INFO_SIZE		0
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 3838b90343d9..af34c9938509 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5936,6 +5936,71 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
+{
+	struct hci_cp_le_pa_term_sync cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = handle;
+
+	return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
+}
+
+static void hci_le_past_received_evt(struct hci_dev *hdev, void *data,
+				     struct sk_buff *skb)
+{
+	struct hci_ev_le_past_received *ev = data;
+	int mask = hdev->link_mode;
+	__u8 flags = 0;
+	struct hci_conn *pa_sync, *conn;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+	hci_dev_lock(hdev);
+
+	hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+	conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+	if (!conn) {
+		bt_dev_err(hdev,
+			   "Unable to find connection for dst %pMR sid 0x%2.2x",
+			   &ev->bdaddr, ev->sid);
+		goto unlock;
+	}
+
+	conn->sync_handle = le16_to_cpu(ev->sync_handle);
+	conn->sid = HCI_SID_INVALID;
+
+	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
+				      &flags);
+	if (!(mask & HCI_LM_ACCEPT)) {
+		hci_le_pa_term_sync(hdev, ev->sync_handle);
+		goto unlock;
+	}
+
+	if (!(flags & HCI_PROTO_DEFER))
+		goto unlock;
+
+	/* Add connection to indicate PA sync event */
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
+				     HCI_ROLE_SLAVE);
+
+	if (IS_ERR(pa_sync))
+		goto unlock;
+
+	pa_sync->sync_handle = le16_to_cpu(ev->sync_handle);
+
+	if (ev->status) {
+		set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+		/* Notify iso layer */
+		hci_connect_cfm(pa_sync, ev->status);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
 static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data,
 					    struct sk_buff *skb)
 {
@@ -6412,16 +6477,6 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
 	hci_dev_unlock(hdev);
 }
 
-static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
-{
-	struct hci_cp_le_pa_term_sync cp;
-
-	memset(&cp, 0, sizeof(cp));
-	cp.handle = handle;
-
-	return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
-}
-
 static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
 					   struct sk_buff *skb)
 {
@@ -7206,6 +7261,10 @@ static const struct hci_le_ev {
 	/* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
 	HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
 		  sizeof(struct hci_evt_le_ext_adv_set_term)),
+	/* [0x18 = HCI_EV_LE_PAST_RECEIVED] */
+	HCI_LE_EV(HCI_EV_LE_PAST_RECEIVED,
+		  hci_le_past_received_evt,
+		  sizeof(struct hci_ev_le_past_received)),
 	/* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */
 	HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt,
 		  sizeof(struct hci_evt_le_cis_established)),
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 6e76798ec786..54ce93236551 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4393,6 +4393,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
 	if (ext_adv_capable(hdev))
 		events[2] |= 0x02;	/* LE Advertising Set Terminated */
 
+	if (past_receiver_capable(hdev))
+		events[2] |= 0x80;	/* LE PAST Received */
+
 	if (cis_capable(hdev)) {
 		events[3] |= 0x01;	/* LE CIS Established */
 		if (cis_peripheral_capable(hdev))
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 5859ec1c04dd..d0a79f601e8d 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -80,6 +80,7 @@ static struct bt_iso_qos default_qos;
 static bool check_ucast_qos(struct bt_iso_qos *qos);
 static bool check_bcast_qos(struct bt_iso_qos *qos);
 static bool iso_match_sid(struct sock *sk, void *data);
+static bool iso_match_sid_past(struct sock *sk, void *data);
 static bool iso_match_sync_handle(struct sock *sk, void *data);
 static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data);
 static void iso_sock_disconn(struct sock *sk);
@@ -2090,6 +2091,16 @@ static bool iso_match_sid(struct sock *sk, void *data)
 	return ev->sid == iso_pi(sk)->bc_sid;
 }
 
+static bool iso_match_sid_past(struct sock *sk, void *data)
+{
+	struct hci_ev_le_past_received *ev = data;
+
+	if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+		return true;
+
+	return ev->sid == iso_pi(sk)->bc_sid;
+}
+
 static bool iso_match_sync_handle(struct sock *sk, void *data)
 {
 	struct hci_evt_le_big_info_adv_report *ev = data;
@@ -2109,6 +2120,7 @@ static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data)
 int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 {
 	struct hci_ev_le_pa_sync_established *ev1;
+	struct hci_ev_le_past_received *ev1a;
 	struct hci_evt_le_big_info_adv_report *ev2;
 	struct hci_ev_le_per_adv_report *ev3;
 	struct sock *sk;
@@ -2122,6 +2134,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 	 * SID to listen to and once sync is established its handle needs to
 	 * be stored in iso_pi(sk)->sync_handle so it can be matched once
 	 * receiving the BIG Info.
+	 * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1.
 	 * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a
 	 * a BIG Info it attempts to check if there any listening socket with
 	 * the same sync_handle and if it does then attempt to create a sync.
@@ -2141,6 +2154,18 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 		goto done;
 	}
 
+	ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED);
+	if (ev1a) {
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sid_past, ev1a);
+		if (sk && !ev1a->status) {
+			iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle);
+			iso_pi(sk)->bc_sid = ev1a->sid;
+		}
+
+		goto done;
+	}
+
 	ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
 	if (ev2) {
 		/* Check if BIGInfo report has already been handled */
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 262bf984d2aa..7dbd3d57e66c 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -852,6 +852,12 @@ static u32 get_supported_settings(struct hci_dev *hdev)
 	if (ll_privacy_capable(hdev))
 		settings |= MGMT_SETTING_LL_PRIVACY;
 
+	if (past_sender_capable(hdev))
+		settings |= MGMT_SETTING_PAST_SENDER;
+
+	if (past_receiver_capable(hdev))
+		settings |= MGMT_SETTING_PAST_RECEIVER;
+
 	settings |= MGMT_SETTING_PHY_CONFIGURATION;
 
 	return settings;
@@ -937,6 +943,12 @@ static u32 get_current_settings(struct hci_dev *hdev)
 	if (ll_privacy_enabled(hdev))
 		settings |= MGMT_SETTING_LL_PRIVACY;
 
+	if (past_sender_enabled(hdev))
+		settings |= MGMT_SETTING_PAST_SENDER;
+
+	if (past_receiver_enabled(hdev))
+		settings |= MGMT_SETTING_PAST_RECEIVER;
+
 	return settings;
 }
 
-- 
cgit v1.2.3


From c530569adc19b5f0c62955de41f067bad34e3fe0 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 2 Sep 2025 11:14:28 -0400
Subject: Bluetooth: hci_core: Introduce HCI_CONN_FLAG_PAST

This introduces a new device flag so userspace can indicate if it
wants to enable PAST Receiver for a specific device.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_sync.c         | 55 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 1bd12c303e25..8c2235444808 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -166,6 +166,7 @@ enum hci_conn_flags {
 	HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0),
 	HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1),
 	HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2),
+	HCI_CONN_FLAG_PAST = BIT(3),
 };
 typedef u8 hci_conn_flags_t;
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 54ce93236551..ba6f13e9235c 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4324,6 +4324,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
 	if (ll_privacy_capable(hdev))
 		hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION;
 
+	/* Mark PAST if supported */
+	if (past_capable(hdev))
+		hdev->conn_flags |= HCI_CONN_FLAG_PAST;
+
 	/* If the controller supports Extended Scanner Filter
 	 * Policies, enable the corresponding event.
 	 */
@@ -7024,10 +7028,41 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static int hci_le_past_params_sync(struct hci_dev *hdev, struct hci_conn *conn,
+				   struct hci_conn *acl, struct bt_iso_qos *qos)
+{
+	struct hci_cp_le_past_params cp;
+	int err;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(acl->handle);
+	/* An HCI_LE_Periodic_Advertising_Sync_Transfer_Received event is sent
+	 * to the Host. HCI_LE_Periodic_Advertising_Report events will be
+	 * enabled with duplicate filtering enabled.
+	 */
+	cp.mode = 0x03;
+	cp.skip = cpu_to_le16(qos->bcast.skip);
+	cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+	cp.cte_type = qos->bcast.sync_cte_type;
+
+	/* HCI_LE_PAST_PARAMS command returns a command complete event so it
+	 * cannot wait for HCI_EV_LE_PAST_RECEIVED.
+	 */
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_PARAMS,
+				    sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	/* Wait for HCI_EV_LE_PAST_RECEIVED event */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+					HCI_EV_LE_PAST_RECEIVED,
+					conn->conn_timeout, NULL);
+}
+
 static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
 {
 	struct hci_cp_le_pa_create_sync cp;
-	struct hci_conn *conn = data;
+	struct hci_conn *conn = data, *le;
 	struct bt_iso_qos *qos = &conn->iso_qos;
 	int err;
 
@@ -7059,6 +7094,24 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
 
 	hci_update_passive_scan_sync(hdev);
 
+	/* Check if PAST is possible:
+	 *
+	 * 1. Check if an ACL connection with the destination address exists
+	 * 2. Check that HCI_CONN_FLAG_PAST has been set, which indicates that
+	 *    the user really intended to use PAST.
+	 */
+	le = hci_conn_hash_lookup_le(hdev, &conn->dst, conn->dst_type);
+	if (le) {
+		struct hci_conn_params *params;
+
+		params = hci_conn_params_lookup(hdev, &le->dst, le->dst_type);
+		if (params && params->flags & HCI_CONN_FLAG_PAST) {
+			err = hci_le_past_params_sync(hdev, conn, le, qos);
+			if (!err)
+				goto done;
+		}
+	}
+
 	/* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update
 	 * it.
 	 */
-- 
cgit v1.2.3


From d3413703d5f8b7d1e6f514f9440ed5da1bc30796 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 5 Sep 2025 11:34:44 -0400
Subject: Bluetooth: ISO: Add support to bind to trigger PAST

This makes it possible to bind to a different destination address
after being connected (BT_CONNECTED, BT_CONNECT2), which then triggers
the PAST Sender procedure to transfer the PA Sync to the destination
address.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 include/net/bluetooth/hci_sync.h |  1 +
 net/bluetooth/hci_conn.c         | 12 +++++
 net/bluetooth/hci_sync.c         | 92 +++++++++++++++++++++++++++++++++++++
 net/bluetooth/iso.c              | 98 +++++++++++++++++++++++++++++++++-------
 5 files changed, 187 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 8c2235444808..1f74722f3f4d 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1602,6 +1602,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
 			      struct bt_iso_qos *qos,
 			      __u8 base_len, __u8 *base, u16 timeout);
+int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type);
 struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos,
 				 u16 timeout);
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index e352a4e0ef8d..3133f40fa9f9 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -188,3 +188,4 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
 
 int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn);
 int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn);
+int hci_past_sync(struct hci_conn *conn, struct hci_conn *le);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 6fc0692abf05..4f9dc1435ccc 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2245,6 +2245,18 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
 	return conn;
 }
 
+int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type)
+{
+	struct hci_conn *le;
+
+	/* Lookup existing LE connection to rebind to */
+	le = hci_conn_hash_lookup_le(conn->hdev, dst, dst_type);
+	if (!le)
+		return -EINVAL;
+
+	return hci_past_sync(conn, le);
+}
+
 static void bis_mark_per_adv(struct hci_conn *conn, void *data)
 {
 	struct iso_list_data *d = data;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index ba6f13e9235c..65f2701beb49 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -7228,3 +7228,95 @@ int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn)
 	return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn,
 				       create_big_complete);
 }
+
+struct past_data {
+	struct hci_conn *conn;
+	struct hci_conn *le;
+};
+
+static void past_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct past_data *past = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	kfree(past);
+}
+
+static int hci_le_past_set_info_sync(struct hci_dev *hdev, void *data)
+{
+	struct past_data *past = data;
+	struct hci_cp_le_past_set_info cp;
+
+	hci_dev_lock(hdev);
+
+	if (!hci_conn_valid(hdev, past->conn) ||
+	    !hci_conn_valid(hdev, past->le)) {
+		hci_dev_unlock(hdev);
+		return -ECANCELED;
+	}
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(past->le->handle);
+	cp.adv_handle = past->conn->iso_qos.bcast.bis;
+
+	hci_dev_unlock(hdev);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_SET_INFO,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_past_sync(struct hci_dev *hdev, void *data)
+{
+	struct past_data *past = data;
+	struct hci_cp_le_past cp;
+
+	hci_dev_lock(hdev);
+
+	if (!hci_conn_valid(hdev, past->conn) ||
+	    !hci_conn_valid(hdev, past->le)) {
+		hci_dev_unlock(hdev);
+		return -ECANCELED;
+	}
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(past->le->handle);
+	cp.sync_handle = cpu_to_le16(past->conn->sync_handle);
+
+	hci_dev_unlock(hdev);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_past_sync(struct hci_conn *conn, struct hci_conn *le)
+{
+	struct past_data *data;
+	int err;
+
+	if (conn->type != BIS_LINK && conn->type != PA_LINK)
+		return -EINVAL;
+
+	if (!past_sender_capable(conn->hdev))
+		return -EOPNOTSUPP;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->conn = conn;
+	data->le = le;
+
+	if (conn->role == HCI_ROLE_MASTER)
+		err = hci_cmd_sync_queue_once(conn->hdev,
+					      hci_le_past_set_info_sync, data,
+					      past_complete);
+	else
+		err = hci_cmd_sync_queue_once(conn->hdev, hci_le_past_sync,
+					      data, past_complete);
+
+	if (err)
+		kfree(data);
+
+	return err;
+}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index d0a79f601e8d..85fa9363b897 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -987,20 +987,14 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr,
 	return 0;
 }
 
-static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa,
+/* Must be called on the locked socket. */
+static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa,
 			       int addr_len)
 {
 	int err = 0;
 
-	if (sk->sk_type != SOCK_SEQPACKET) {
-		err = -EINVAL;
-		goto done;
-	}
-
-	if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) {
-		err = -EINVAL;
-		goto done;
-	}
+	if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags))
+		return -EBADFD;
 
 	if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) {
 		err = -EINVAL;
@@ -1023,6 +1017,77 @@ done:
 	return err;
 }
 
+static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn)
+{
+	struct hci_dev *hdev = NULL;
+
+	iso_conn_lock(conn);
+	if (conn->hcon)
+		hdev = hci_dev_hold(conn->hcon->hdev);
+	iso_conn_unlock(conn);
+
+	return hdev;
+}
+
+/* Must be called on the locked socket. */
+static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa,
+			      int addr_len)
+{
+	struct hci_dev *hdev;
+	struct hci_conn *bis;
+	int err;
+
+	if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn)
+		return -EINVAL;
+
+	/* Check if it is really a Broadcast address being requested */
+	if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc))
+		return -EINVAL;
+
+	/* Check if the address hasn't changed then perhaps only the number of
+	 * bis has changed.
+	 */
+	if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) ||
+	    !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY))
+		return iso_sock_rebind_bis(sk, sa, addr_len);
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type))
+		return -EINVAL;
+
+	hdev = iso_conn_get_hdev(iso_pi(sk)->conn);
+	if (!hdev)
+		return -EINVAL;
+
+	bis = iso_pi(sk)->conn->hcon;
+
+	/* Release the socket before lookups since that requires hci_dev_lock
+	 * which shall not be acquired while holding sock_lock for proper
+	 * ordering.
+	 */
+	release_sock(sk);
+	hci_dev_lock(bis->hdev);
+	lock_sock(sk);
+
+	if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) {
+		/* raced with iso_conn_del() or iso_disconn_sock() */
+		err = -ENOTCONN;
+		goto unlock;
+	}
+
+	BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr,
+	       sa->iso_bc->bc_bdaddr_type);
+
+	err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr,
+			   le_addr_type(sa->iso_bc->bc_bdaddr_type));
+
+unlock:
+	hci_dev_unlock(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
 static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 			 int addr_len)
 {
@@ -1038,13 +1103,12 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
 
 	lock_sock(sk);
 
-	/* Allow the user to bind a PA sync socket to a number
-	 * of BISes to sync to.
-	 */
-	if ((sk->sk_state == BT_CONNECT2 ||
-	     sk->sk_state == BT_CONNECTED) &&
-	    test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
-		err = iso_sock_bind_pa_sk(sk, sa, addr_len);
+	if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) &&
+	    addr_len > sizeof(*sa)) {
+		/* Allow the user to rebind to a different address using
+		 * PAST procedures.
+		 */
+		err = iso_sock_rebind_bc(sk, sa, addr_len);
 		goto done;
 	}
 
-- 
cgit v1.2.3


From 14b06c3a88f7031d64fbce197fad1d400e507663 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 26 Sep 2025 15:56:43 -0400
Subject: Bluetooth: HCI: Always use the identity address when initializing a
 connection

This makes sure hci_conn is initialized with the identity address if
a matching IRK exists, which avoids having to do it in multiple
places, some of which appear to be missing it (e.g. CIS, BIS and PA).

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  4 ++--
 net/bluetooth/hci_conn.c         | 43 +++++++++++++++++++++++++---------------
 net/bluetooth/hci_event.c        | 20 +++++++++----------
 net/bluetooth/hci_sync.c         |  2 +-
 4 files changed, 40 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 1f74722f3f4d..858b58206e80 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1571,9 +1571,9 @@ int hci_le_create_cis_pending(struct hci_dev *hdev);
 int hci_conn_check_create_cis(struct hci_conn *conn);
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
-			      u8 role, u16 handle);
+			      u8 dst_type, u8 role, u16 handle);
 struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
-				    bdaddr_t *dst, u8 role);
+				    bdaddr_t *dst, u8 dst_type, u8 role);
 void hci_conn_del(struct hci_conn *conn);
 void hci_conn_hash_flush(struct hci_dev *hdev);
 
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 4f9dc1435ccc..c3f7828bf9d5 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -922,10 +922,12 @@ static int hci_conn_hash_alloc_unset(struct hci_dev *hdev)
 			       U16_MAX, GFP_ATOMIC);
 }
 
-static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
+static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type,
+				       bdaddr_t *dst, u8 dst_type,
 				       u8 role, u16 handle)
 {
 	struct hci_conn *conn;
+	struct smp_irk *irk = NULL;
 
 	switch (type) {
 	case ACL_LINK:
@@ -937,12 +939,14 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 	case PA_LINK:
 		if (!hdev->iso_mtu)
 			return ERR_PTR(-ECONNREFUSED);
+		irk = hci_get_irk(hdev, dst, dst_type);
 		break;
 	case LE_LINK:
 		if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
 			return ERR_PTR(-ECONNREFUSED);
 		if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU)
 			return ERR_PTR(-ECONNREFUSED);
+		irk = hci_get_irk(hdev, dst, dst_type);
 		break;
 	case SCO_LINK:
 	case ESCO_LINK:
@@ -960,7 +964,15 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 	if (!conn)
 		return ERR_PTR(-ENOMEM);
 
-	bacpy(&conn->dst, dst);
+	/* If an IRK exists use its identity address */
+	if (!irk) {
+		bacpy(&conn->dst, dst);
+		conn->dst_type = dst_type;
+	} else {
+		bacpy(&conn->dst, &irk->bdaddr);
+		conn->dst_type = irk->addr_type;
+	}
+
 	bacpy(&conn->src, &hdev->bdaddr);
 	conn->handle = handle;
 	conn->hdev  = hdev;
@@ -1059,7 +1071,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 }
 
 struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
-				    bdaddr_t *dst, u8 role)
+				    bdaddr_t *dst, u8 dst_type, u8 role)
 {
 	int handle;
 
@@ -1069,16 +1081,16 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
 	if (unlikely(handle < 0))
 		return ERR_PTR(-ECONNREFUSED);
 
-	return __hci_conn_add(hdev, type, dst, role, handle);
+	return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
 }
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
-			      u8 role, u16 handle)
+			      u8 dst_type, u8 role, u16 handle)
 {
 	if (handle > HCI_CONN_HANDLE_MAX)
 		return ERR_PTR(-EINVAL);
 
-	return __hci_conn_add(hdev, type, dst, role, handle);
+	return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
 }
 
 static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
@@ -1410,14 +1422,13 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 	if (conn) {
 		bacpy(&conn->dst, dst);
 	} else {
-		conn = hci_conn_add_unset(hdev, LE_LINK, dst, role);
+		conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, role);
 		if (IS_ERR(conn))
 			return conn;
 		hci_conn_hold(conn);
 		conn->pending_sec_level = sec_level;
 	}
 
-	conn->dst_type = dst_type;
 	conn->sec_level = BT_SECURITY_LOW;
 	conn->conn_timeout = conn_timeout;
 	conn->le_adv_phy = phy;
@@ -1587,7 +1598,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 		     memcmp(conn->le_per_adv_data, base, base_len)))
 		return ERR_PTR(-EADDRINUSE);
 
-	conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER);
+	conn = hci_conn_add_unset(hdev, BIS_LINK, dst, 0, HCI_ROLE_MASTER);
 	if (IS_ERR(conn))
 		return conn;
 
@@ -1633,7 +1644,8 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 
 	BT_DBG("requesting refresh of dst_addr");
 
-	conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
+	conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type,
+				  HCI_ROLE_MASTER);
 	if (IS_ERR(conn))
 		return conn;
 
@@ -1644,7 +1656,6 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 
 	conn->state = BT_CONNECT;
 	set_bit(HCI_CONN_SCANNING, &conn->flags);
-	conn->dst_type = dst_type;
 	conn->sec_level = BT_SECURITY_LOW;
 	conn->pending_sec_level = sec_level;
 	conn->conn_timeout = conn_timeout;
@@ -1681,7 +1692,8 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 
 	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
 	if (!acl) {
-		acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
+		acl = hci_conn_add_unset(hdev, ACL_LINK, dst, 0,
+					 HCI_ROLE_MASTER);
 		if (IS_ERR(acl))
 			return acl;
 	}
@@ -1750,7 +1762,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 
 	sco = hci_conn_hash_lookup_ba(hdev, type, dst);
 	if (!sco) {
-		sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER);
+		sco = hci_conn_add_unset(hdev, type, dst, 0, HCI_ROLE_MASTER);
 		if (IS_ERR(sco)) {
 			hci_conn_drop(acl);
 			return sco;
@@ -1942,7 +1954,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
 				       qos->ucast.cis);
 	if (!cis) {
-		cis = hci_conn_add_unset(hdev, CIS_LINK, dst,
+		cis = hci_conn_add_unset(hdev, CIS_LINK, dst, dst_type,
 					 HCI_ROLE_MASTER);
 		if (IS_ERR(cis))
 			return cis;
@@ -2133,12 +2145,11 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
 
 	bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid);
 
-	conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE);
+	conn = hci_conn_add_unset(hdev, PA_LINK, dst, dst_type, HCI_ROLE_SLAVE);
 	if (IS_ERR(conn))
 		return conn;
 
 	conn->iso_qos = *qos;
-	conn->dst_type = dst_type;
 	conn->sid = sid;
 	conn->state = BT_LISTEN;
 	conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index af34c9938509..7c4ca14f13e5 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2267,7 +2267,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 	} else {
 		if (!conn) {
 			conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr,
-						  HCI_ROLE_MASTER);
+						  0, HCI_ROLE_MASTER);
 			if (IS_ERR(conn))
 				bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 		}
@@ -3123,7 +3123,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 						      &ev->bdaddr,
 						      BDADDR_BREDR)) {
 			conn = hci_conn_add_unset(hdev, ev->link_type,
-						  &ev->bdaddr, HCI_ROLE_SLAVE);
+						  &ev->bdaddr, 0,
+						  HCI_ROLE_SLAVE);
 			if (IS_ERR(conn)) {
 				bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 				goto unlock;
@@ -3299,7 +3300,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data,
 	conn = hci_conn_hash_lookup_ba(hdev, ev->link_type,
 			&ev->bdaddr);
 	if (!conn) {
-		conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr,
+		conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, 0,
 					  HCI_ROLE_SLAVE);
 		if (IS_ERR(conn)) {
 			bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
@@ -5670,14 +5671,13 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 		if (status)
 			goto unlock;
 
-		conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role);
+		conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, bdaddr_type,
+					  role);
 		if (IS_ERR(conn)) {
 			bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 			goto unlock;
 		}
 
-		conn->dst_type = bdaddr_type;
-
 		/* If we didn't have a hci_conn object previously
 		 * but we're in central role this must be something
 		 * initiated using an accept list. Since accept list based
@@ -5982,7 +5982,7 @@ static void hci_le_past_received_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 
 	/* Add connection to indicate PA sync event */
-	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
 				     HCI_ROLE_SLAVE);
 
 	if (IS_ERR(pa_sync))
@@ -6515,7 +6515,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 
 	/* Add connection to indicate PA sync event */
-	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
 				     HCI_ROLE_SLAVE);
 
 	if (IS_ERR(pa_sync))
@@ -6956,7 +6956,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
 
 	cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
 	if (!cis) {
-		cis = hci_conn_add(hdev, CIS_LINK, &acl->dst,
+		cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, acl->dst_type,
 				   HCI_ROLE_SLAVE, cis_handle);
 		if (IS_ERR(cis)) {
 			hci_le_reject_cis(hdev, ev->cis_handle);
@@ -7073,7 +7073,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 				bt_dev_dbg(hdev, "ignore too large handle %u", handle);
 				continue;
 			}
-			bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY,
+			bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, 0,
 					   HCI_ROLE_SLAVE, handle);
 			if (IS_ERR(bis))
 				continue;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 65f2701beb49..a36d2414a3ca 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -7013,7 +7013,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
 		goto unlock;
 
 	/* Add connection to indicate PA sync error */
-	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
 				     HCI_ROLE_SLAVE);
 
 	if (IS_ERR(pa_sync))
-- 
cgit v1.2.3


From a106e50be74b0896583f4d010a69f9806e4194f4 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 14 Nov 2025 09:29:28 -0500
Subject: Bluetooth: HCI: Add support for LL Extended Feature Set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for the LL Extended Feature Set introduced in Bluetooth
Core Specification 6.0, which adds the following:

Commands:

 - HCI_LE_Read_All_Local_Supported_Features(0x2087)(Feature:47,1)
 - HCI_LE_Read_All_Remote_Features(0x2088)(Feature:47,2)

Events:

 - HCI_LE_Read_All_Remote_Features_Complete(0x2b)(Mask bit:42)

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  23 +++++++
 include/net/bluetooth/hci_core.h |   5 +-
 include/net/bluetooth/hci_sync.h |   2 +
 net/bluetooth/hci_event.c        | 125 ++++++++++++++++++++++++++++++---------
 net/bluetooth/hci_sync.c         | 102 +++++++++++++++++++++++++++++++-
 5 files changed, 227 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index d883ad233ebc..a27cd3626b87 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -653,6 +653,7 @@ enum {
 #define HCI_LE_CIS_PERIPHERAL		0x20
 #define HCI_LE_ISO_BROADCASTER		0x40
 #define HCI_LE_ISO_SYNC_RECEIVER	0x80
+#define HCI_LE_LL_EXT_FEATURE		0x80
 
 /* Connection modes */
 #define HCI_CM_ACTIVE	0x0000
@@ -2255,6 +2256,19 @@ struct hci_cp_le_set_host_feature {
 	__u8     bit_value;
 } __packed;
 
+#define HCI_OP_LE_READ_ALL_LOCAL_FEATURES	0x2087
+struct hci_rp_le_read_all_local_features {
+	__u8    status;
+	__u8    page;
+	__u8    features[248];
+} __packed;
+
+#define HCI_OP_LE_READ_ALL_REMOTE_FEATURES	0x2088
+struct hci_cp_le_read_all_remote_features {
+	__le16	 handle;
+	__u8	 pages;
+} __packed;
+
 /* ---- HCI Events ---- */
 struct hci_ev_status {
 	__u8    status;
@@ -2937,6 +2951,15 @@ struct hci_evt_le_big_info_adv_report {
 	__u8    encryption;
 } __packed;
 
+#define HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE 0x2b
+struct hci_evt_le_read_all_remote_features_complete {
+	__u8    status;
+	__le16  handle;
+	__u8    max_pages;
+	__u8    valid_pages;
+	__u8    features[248];
+} __packed;
+
 #define HCI_EV_VENDOR			0xff
 
 /* Internal events generated by Bluetooth stack */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 858b58206e80..4263e71a23ef 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -378,7 +378,7 @@ struct hci_dev {
 	__u8		minor_class;
 	__u8		max_page;
 	__u8		features[HCI_MAX_PAGES][8];
-	__u8		le_features[8];
+	__u8		le_features[248];
 	__u8		le_accept_list_size;
 	__u8		le_resolv_list_size;
 	__u8		le_num_of_adv_sets;
@@ -702,6 +702,7 @@ struct hci_conn {
 	__u8		attempt;
 	__u8		dev_class[3];
 	__u8		features[HCI_MAX_PAGES][8];
+	__u8		le_features[248];
 	__u16		pkt_type;
 	__u16		link_policy;
 	__u8		key_type;
@@ -2067,6 +2068,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 	(le_enabled(dev) && past_receiver_capable(dev))
 #define past_enabled(dev) \
 	(past_sender_enabled(dev) || past_receiver_enabled(dev))
+#define ll_ext_feature_capable(dev) \
+	((dev)->le_features[7] & HCI_LE_LL_EXT_FEATURE)
 
 #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
 	(!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG)))
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 3133f40fa9f9..56076bbc981d 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -189,3 +189,5 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
 int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn);
 int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn);
 int hci_past_sync(struct hci_conn *conn, struct hci_conn *le);
+
+int hci_le_read_remote_features(struct hci_conn *conn);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 7c4ca14f13e5..a9868f17ef40 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2886,12 +2886,8 @@ static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
 	hci_dev_lock(hdev);
 
 	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
-	if (conn) {
-		if (conn->state == BT_CONFIG) {
-			hci_connect_cfm(conn, status);
-			hci_conn_drop(conn);
-		}
-	}
+	if (conn && conn->state == BT_CONFIG)
+		hci_connect_cfm(conn, status);
 
 	hci_dev_unlock(hdev);
 }
@@ -3915,11 +3911,49 @@ unlock:
 	return rp->status;
 }
 
+static u8 hci_cc_le_read_all_local_features(struct hci_dev *hdev, void *data,
+					    struct sk_buff *skb)
+{
+	struct hci_rp_le_read_all_local_features *rp = data;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+	if (rp->status)
+		return rp->status;
+
+	memcpy(hdev->le_features, rp->features, 248);
+
+	return rp->status;
+}
+
 static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status)
 {
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 }
 
+static void hci_cs_le_read_all_remote_features(struct hci_dev *hdev, u8 status)
+{
+	struct hci_cp_le_read_remote_features *cp;
+	struct hci_conn *conn;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn && conn->state == BT_CONFIG)
+		hci_connect_cfm(conn, status);
+
+	hci_dev_unlock(hdev);
+}
+
 static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data,
 				   struct sk_buff *skb)
 {
@@ -4171,6 +4205,9 @@ static const struct hci_cc {
 		  sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE),
 	HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path,
 	       sizeof(struct hci_rp_le_setup_iso_path)),
+	HCI_CC(HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+	       hci_cc_le_read_all_local_features,
+	       sizeof(struct hci_rp_le_read_all_local_features)),
 };
 
 static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc,
@@ -4325,6 +4362,8 @@ static const struct hci_cs {
 	HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
 	HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
 	HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big),
+	HCI_CS(HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+	       hci_cs_le_read_all_remote_features),
 };
 
 static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
@@ -5645,6 +5684,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	struct hci_conn *conn;
 	struct smp_irk *irk;
 	u8 addr_type;
+	int err;
 
 	hci_dev_lock(hdev);
 
@@ -5775,26 +5815,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	hci_debugfs_create_conn(conn);
 	hci_conn_add_sysfs(conn);
 
-	/* The remote features procedure is defined for central
-	 * role only. So only in case of an initiated connection
-	 * request the remote features.
-	 *
-	 * If the local controller supports peripheral-initiated features
-	 * exchange, then requesting the remote features in peripheral
-	 * role is possible. Otherwise just transition into the
-	 * connected state without requesting the remote features.
-	 */
-	if (conn->out ||
-	    (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) {
-		struct hci_cp_le_read_remote_features cp;
-
-		cp.handle = __cpu_to_le16(conn->handle);
-
-		hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
-			     sizeof(cp), &cp);
-
-		hci_conn_hold(conn);
-	} else {
+	err = hci_le_read_remote_features(conn);
+	if (err) {
 		conn->state = BT_CONNECTED;
 		hci_connect_cfm(conn, status);
 	}
@@ -6608,7 +6630,6 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
 
 			conn->state = BT_CONNECTED;
 			hci_connect_cfm(conn, status);
-			hci_conn_drop(conn);
 		}
 	}
 
@@ -7186,6 +7207,50 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev,
+						void *data, struct sk_buff *skb)
+{
+	struct hci_evt_le_read_all_remote_features_complete *ev = data;
+	struct hci_conn *conn;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!conn)
+		goto unlock;
+
+	if (!ev->status)
+		memcpy(conn->le_features, ev->features, 248);
+
+	if (conn->state == BT_CONFIG) {
+		__u8 status;
+
+		/* If the local controller supports peripheral-initiated
+		 * features exchange, but the remote controller does
+		 * not, then it is possible that the error code 0x1a
+		 * for unsupported remote feature gets returned.
+		 *
+		 * In this specific case, allow the connection to
+		 * transition into connected state and mark it as
+		 * successful.
+		 */
+		if (!conn->out &&
+		    ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
+		    (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+			status = 0x00;
+		else
+			status = ev->status;
+
+		conn->state = BT_CONNECTED;
+		hci_connect_cfm(conn, status);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
 #define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \
 [_op] = { \
 	.func = _func, \
@@ -7291,6 +7356,12 @@ static const struct hci_le_ev {
 		     hci_le_big_info_adv_report_evt,
 		     sizeof(struct hci_evt_le_big_info_adv_report),
 		     HCI_MAX_EVENT_SIZE),
+	/* [0x2b = HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE] */
+	HCI_LE_EV_VL(HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+		     hci_le_read_all_remote_features_evt,
+		     sizeof(struct
+			    hci_evt_le_read_all_remote_features_complete),
+		     HCI_MAX_EVENT_SIZE),
 };
 
 static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index a36d2414a3ca..a9f5b1a68356 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4011,8 +4011,19 @@ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev)
 /* Read LE Local Supported Features */
 static int hci_le_read_local_features_sync(struct hci_dev *hdev)
 {
-	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES,
-				     0, NULL, HCI_CMD_TIMEOUT);
+	int err;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES,
+				    0, NULL, HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(2))
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return err;
 }
 
 /* Read LE Supported States */
@@ -7320,3 +7331,90 @@ int hci_past_sync(struct hci_conn *conn, struct hci_conn *le)
 
 	return err;
 }
+
+static void le_read_features_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err == -ECANCELED)
+		return;
+
+	hci_conn_drop(conn);
+}
+
+static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev,
+						void *data)
+{
+	struct hci_conn *conn = data;
+	struct hci_cp_le_read_all_remote_features cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(conn->handle);
+	cp.pages = 10; /* Attempt to read all pages */
+
+	/* Wait for HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE event otherwise
+	 * hci_conn_drop may run prematurely causing a disconnection.
+	 */
+	return __hci_cmd_sync_status_sk(hdev,
+					HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+					sizeof(cp), &cp,
+					HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn = data;
+	struct hci_cp_le_read_remote_features cp;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	/* Check if LL Extended Feature Set is supported and
+	 * HCI_OP_LE_READ_ALL_REMOTE_FEATURES is supported then use that to read
+	 * all features.
+	 */
+	if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(3))
+		return hci_le_read_all_remote_features_sync(hdev, data);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(conn->handle);
+
+	/* Wait for HCI_EV_LE_REMOTE_FEAT_COMPLETE event otherwise
+	 * hci_conn_drop may run prematurely causing a disconnection.
+	 */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
+					sizeof(cp), &cp,
+					HCI_EV_LE_REMOTE_FEAT_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+}
+
+int hci_le_read_remote_features(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	int err;
+
+	/* The remote features procedure is defined for central
+	 * role only. So only in case of an initiated connection
+	 * request the remote features.
+	 *
+	 * If the local controller supports peripheral-initiated features
+	 * exchange, then requesting the remote features in peripheral
+	 * role is possible. Otherwise just transition into the
+	 * connected state without requesting the remote features.
+	 */
+	if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+		err = hci_cmd_sync_queue_once(hdev,
+					      hci_le_read_remote_features_sync,
+					      hci_conn_hold(conn),
+					      le_read_features_complete);
+	else
+		err = -EOPNOTSUPP;
+
+	return err;
+}
-- 
cgit v1.2.3


From 9bf66036d686b9a67000ba22bd94be13a4ea79ac Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Wed, 26 Nov 2025 13:45:52 -0800
Subject: net: mana: Handle hardware recovery events when probing the device

When MANA is being probed, it's possible that hardware is in recovery
mode and the device may get GDMA_EQE_HWC_RESET_REQUEST over HWC in the
middle of the probe. Detect such a condition and go through the recovery
service procedure.

Signed-off-by: Long Li <longli@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1764193552-9712-1-git-send-email-longli@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 176 +++++++++++++++++++++---
 include/net/mana/gdma.h                         |  12 +-
 2 files changed, 170 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8fd70b34807a..efb4e412ec7e 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -15,6 +15,20 @@
 
 struct dentry *mana_debugfs_root;
 
+struct mana_dev_recovery {
+	struct list_head list;
+	struct pci_dev *pdev;
+	enum gdma_eqe_type type;
+};
+
+static struct mana_dev_recovery_work {
+	struct list_head dev_list;
+	struct delayed_work work;
+
+	/* Lock for dev_list above */
+	spinlock_t lock;
+} mana_dev_recovery_work;
+
 static u32 mana_gd_r32(struct gdma_context *g, u64 offset)
 {
 	return readl(g->bar0_va + offset);
@@ -387,6 +401,25 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
 
 #define MANA_SERVICE_PERIOD 10
 
+static void mana_serv_rescan(struct pci_dev *pdev)
+{
+	struct pci_bus *parent;
+
+	pci_lock_rescan_remove();
+
+	parent = pdev->bus;
+	if (!parent) {
+		dev_err(&pdev->dev, "MANA service: no parent bus\n");
+		goto out;
+	}
+
+	pci_stop_and_remove_bus_device(pdev);
+	pci_rescan_bus(parent);
+
+out:
+	pci_unlock_rescan_remove();
+}
+
 static void mana_serv_fpga(struct pci_dev *pdev)
 {
 	struct pci_bus *bus, *parent;
@@ -419,9 +452,12 @@ static void mana_serv_reset(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct hw_channel_context *hwc;
+	int ret;
 
 	if (!gc) {
-		dev_err(&pdev->dev, "MANA service: no GC\n");
+		/* Perform PCI rescan on device if GC is not set up */
+		dev_err(&pdev->dev, "MANA service: GC not setup, rescanning\n");
+		mana_serv_rescan(pdev);
 		return;
 	}
 
@@ -440,9 +476,18 @@ static void mana_serv_reset(struct pci_dev *pdev)
 
 	msleep(MANA_SERVICE_PERIOD * 1000);
 
-	mana_gd_resume(pdev);
+	ret = mana_gd_resume(pdev);
+	if (ret == -ETIMEDOUT || ret == -EPROTO) {
+		/* Perform PCI rescan on device if we failed on HWC */
+		dev_err(&pdev->dev, "MANA service: resume failed, rescanning\n");
+		mana_serv_rescan(pdev);
+		goto out;
+	}
 
-	dev_info(&pdev->dev, "MANA reset cycle completed\n");
+	if (ret)
+		dev_info(&pdev->dev, "MANA reset cycle failed err %d\n", ret);
+	else
+		dev_info(&pdev->dev, "MANA reset cycle completed\n");
 
 out:
 	gc->in_service = false;
@@ -454,18 +499,9 @@ struct mana_serv_work {
 	enum gdma_eqe_type type;
 };
 
-static void mana_serv_func(struct work_struct *w)
+static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
 {
-	struct mana_serv_work *mns_wk;
-	struct pci_dev *pdev;
-
-	mns_wk = container_of(w, struct mana_serv_work, serv_work);
-	pdev = mns_wk->pdev;
-
-	if (!pdev)
-		goto out;
-
-	switch (mns_wk->type) {
+	switch (type) {
 	case GDMA_EQE_HWC_FPGA_RECONFIG:
 		mana_serv_fpga(pdev);
 		break;
@@ -475,12 +511,48 @@ static void mana_serv_func(struct work_struct *w)
 		break;
 
 	default:
-		dev_err(&pdev->dev, "MANA service: unknown type %d\n",
-			mns_wk->type);
+		dev_err(&pdev->dev, "MANA service: unknown type %d\n", type);
 		break;
 	}
+}
+
+static void mana_recovery_delayed_func(struct work_struct *w)
+{
+	struct mana_dev_recovery_work *work;
+	struct mana_dev_recovery *dev;
+	unsigned long flags;
+
+	work = container_of(w, struct mana_dev_recovery_work, work.work);
+
+	spin_lock_irqsave(&work->lock, flags);
+
+	while (!list_empty(&work->dev_list)) {
+		dev = list_first_entry(&work->dev_list,
+				       struct mana_dev_recovery, list);
+		list_del(&dev->list);
+		spin_unlock_irqrestore(&work->lock, flags);
+
+		mana_do_service(dev->type, dev->pdev);
+		pci_dev_put(dev->pdev);
+		kfree(dev);
+
+		spin_lock_irqsave(&work->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&work->lock, flags);
+}
+
+static void mana_serv_func(struct work_struct *w)
+{
+	struct mana_serv_work *mns_wk;
+	struct pci_dev *pdev;
+
+	mns_wk = container_of(w, struct mana_serv_work, serv_work);
+	pdev = mns_wk->pdev;
+
+	if (pdev)
+		mana_do_service(mns_wk->type, pdev);
 
-out:
 	pci_dev_put(pdev);
 	kfree(mns_wk);
 	module_put(THIS_MODULE);
@@ -541,6 +613,17 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 	case GDMA_EQE_HWC_RESET_REQUEST:
 		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
 
+		if (!test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) {
+			/*
+			 * Device is in probe and we received a hardware reset
+			 * event, the probe function will detect that the flag
+			 * has changed and perform service procedure.
+			 */
+			dev_info(gc->dev,
+				 "Service is to be processed in probe\n");
+			break;
+		}
+
 		if (gc->in_service) {
 			dev_info(gc->dev, "Already in service\n");
 			break;
@@ -1938,8 +2021,19 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto cleanup_mana;
 
+	/*
+	 * If a hardware reset event has occurred over HWC during probe,
+	 * rollback and perform hardware reset procedure.
+	 */
+	if (test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) {
+		err = -EPROTO;
+		goto cleanup_mana_rdma;
+	}
+
 	return 0;
 
+cleanup_mana_rdma:
+	mana_rdma_remove(&gc->mana_ib);
 cleanup_mana:
 	mana_remove(&gc->mana, false);
 cleanup_gd:
@@ -1963,6 +2057,35 @@ release_region:
 disable_dev:
 	pci_disable_device(pdev);
 	dev_err(&pdev->dev, "gdma probe failed: err = %d\n", err);
+
+	/*
+	 * Hardware could be in recovery mode and the HWC returns TIMEDOUT or
+	 * EPROTO from mana_gd_setup(), mana_probe() or mana_rdma_probe(), or
+	 * we received a hardware reset event over HWC interrupt. In this case,
+	 * perform the device recovery procedure after MANA_SERVICE_PERIOD
+	 * seconds.
+	 */
+	if (err == -ETIMEDOUT || err == -EPROTO) {
+		struct mana_dev_recovery *dev;
+		unsigned long flags;
+
+		dev_info(&pdev->dev, "Start MANA recovery mode\n");
+
+		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			return err;
+
+		dev->pdev = pci_dev_get(pdev);
+		dev->type = GDMA_EQE_HWC_RESET_REQUEST;
+
+		spin_lock_irqsave(&mana_dev_recovery_work.lock, flags);
+		list_add_tail(&dev->list, &mana_dev_recovery_work.dev_list);
+		spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags);
+
+		schedule_delayed_work(&mana_dev_recovery_work.work,
+				      secs_to_jiffies(MANA_SERVICE_PERIOD));
+	}
+
 	return err;
 }
 
@@ -2067,6 +2190,10 @@ static int __init mana_driver_init(void)
 {
 	int err;
 
+	INIT_LIST_HEAD(&mana_dev_recovery_work.dev_list);
+	spin_lock_init(&mana_dev_recovery_work.lock);
+	INIT_DELAYED_WORK(&mana_dev_recovery_work.work, mana_recovery_delayed_func);
+
 	mana_debugfs_root = debugfs_create_dir("mana", NULL);
 
 	err = pci_register_driver(&mana_driver);
@@ -2080,6 +2207,21 @@ static int __init mana_driver_init(void)
 
 static void __exit mana_driver_exit(void)
 {
+	struct mana_dev_recovery *dev;
+	unsigned long flags;
+
+	disable_delayed_work_sync(&mana_dev_recovery_work.work);
+
+	spin_lock_irqsave(&mana_dev_recovery_work.lock, flags);
+	while (!list_empty(&mana_dev_recovery_work.dev_list)) {
+		dev = list_first_entry(&mana_dev_recovery_work.dev_list,
+				       struct mana_dev_recovery, list);
+		list_del(&dev->list);
+		pci_dev_put(dev->pdev);
+		kfree(dev);
+	}
+	spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags);
+
 	pci_unregister_driver(&mana_driver);
 
 	debugfs_remove(mana_debugfs_root);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index a4cf307859f8..eaa27483f99b 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -382,6 +382,10 @@ struct gdma_irq_context {
 	char name[MANA_IRQ_NAME_SZ];
 };
 
+enum gdma_context_flags {
+	GC_PROBE_SUCCEEDED	= 0,
+};
+
 struct gdma_context {
 	struct device		*dev;
 	struct dentry		*mana_pci_debugfs;
@@ -430,6 +434,8 @@ struct gdma_context {
 	u64 pf_cap_flags1;
 
 	struct workqueue_struct *service_wq;
+
+	unsigned long		flags;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -600,6 +606,9 @@ enum {
 /* Driver can send HWC periodically to query stats */
 #define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21)
 
+/* Driver can handle hardware recovery events during probe */
+#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
@@ -611,7 +620,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
 	 GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \
 	 GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \
-	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE)
+	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
+	 GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
cgit v1.2.3


From 5e1bf5ae5e3ba3588b474669ba05f5d202003d84 Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Date: Mon, 1 Dec 2025 08:53:45 +0530
Subject: net: phy: phy-c45: add SQI and SQI+ support for OATC14 10Base-T1S
 PHYs

Add support for reading Signal Quality Indicator (SQI) and enhanced SQI+
from OATC14 10Base-T1S PHYs.

- Introduce MDIO register definitions for DCQ_SQI and DCQ_SQIPLUS.
- Add `genphy_c45_oatc14_get_sqi_max()` to return the maximum supported
  SQI/SQI+ level.
- Add `genphy_c45_oatc14_get_sqi()` to return the current SQI or SQI+
  value.
- Update `include/linux/phy.h` to expose the new APIs.

SQI+ capability is read from the Advanced Diagnostic Features Capability
register (ADFCAP). If SQI+ is supported, the driver calculates the value
from the MSBs of the DCQ_SQIPLUS register; otherwise, it falls back to
basic SQI (0-7 levels). This enables ethtool to report the SQI value for
OATC14 10Base-T1S PHYs.

Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features
Specification ref:
https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251201032346.6699-2-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mdio-open-alliance.h |  13 ++++
 drivers/net/phy/phy-c45.c            | 137 +++++++++++++++++++++++++++++++++++
 include/linux/phy.h                  |  29 ++++++++
 3 files changed, 179 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/mdio-open-alliance.h b/drivers/net/phy/mdio-open-alliance.h
index 6850a3f0b31e..449d0fb67093 100644
--- a/drivers/net/phy/mdio-open-alliance.h
+++ b/drivers/net/phy/mdio-open-alliance.h
@@ -56,6 +56,8 @@
 /* Advanced Diagnostic Features Capability Register*/
 #define MDIO_OATC14_ADFCAP		0xcc00
 #define OATC14_ADFCAP_HDD_CAPABILITY	GENMASK(10, 8)
+#define OATC14_ADFCAP_SQIPLUS_CAPABILITY	GENMASK(4, 1)
+#define OATC14_ADFCAP_SQI_CAPABILITY	BIT(0)
 
 /* Harness Defect Detection Register */
 #define MDIO_OATC14_HDD			0xcc01
@@ -65,6 +67,17 @@
 #define OATC14_HDD_VALID		BIT(2)
 #define OATC14_HDD_SHORT_OPEN_STATUS	GENMASK(1, 0)
 
+/* Dynamic Channel Quality SQI Register */
+#define MDIO_OATC14_DCQ_SQI		0xcc03
+#define OATC14_DCQ_SQI_VALUE		GENMASK(2, 0)
+
+/* Dynamic Channel Quality SQI Plus Register */
+#define MDIO_OATC14_DCQ_SQIPLUS		0xcc04
+#define OATC14_DCQ_SQIPLUS_VALUE	GENMASK(7, 0)
+
+/* SQI is reported using 3 bits, giving 8 levels (0-7) */
+#define OATC14_SQI_MAX_LEVEL		7
+
 /* Bus Short/Open Status:
  * 0 0 - no fault; everything is ok. (Default)
  * 0 1 - detected as an open or missing termination(s)
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index f5e23b53994f..d48aa7231b37 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -1695,3 +1695,140 @@ int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev)
 				OATC14_HDD_START_CONTROL);
 }
 EXPORT_SYMBOL(genphy_c45_oatc14_cable_test_start);
+
+/**
+ * oatc14_update_sqi_capability - Read and update OATC14 10Base-T1S PHY SQI/SQI+
+ *                                capability
+ * @phydev: Pointer to the PHY device structure
+ *
+ * This helper reads the OATC14 ADFCAP capability register to determine whether
+ * the PHY supports SQI or SQI+ reporting.
+ *
+ * SQI+ capability is detected first. The SQI+ field indicates the number of
+ * valid MSBs (3–8), corresponding to 8–256 SQI+ levels. When present, the
+ * function stores the number of SQI+ bits and computes the maximum SQI+ value
+ * as (2^bits - 1).
+ *
+ * If SQI+ is not supported, the function checks for basic SQI capability,
+ * which provides 0–7 SQI levels.
+ *
+ * On success, the capability information is stored in
+ * @phydev->oatc14_sqi_capability and marked as updated.
+ *
+ * Return:
+ * * 0        - capability successfully read and stored
+ * * -EOPNOTSUPP - SQI/SQI+ not supported by this PHY
+ * * Negative errno on read failure
+ */
+static int oatc14_update_sqi_capability(struct phy_device *phydev)
+{
+	u8 bits;
+	int ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_ADFCAP);
+	if (ret < 0)
+		return ret;
+
+	/* Check for SQI+ capability
+	 * 0 - SQI+ is not supported
+	 * (3-8) bits for (8-256) SQI+ levels supported
+	 */
+	bits = FIELD_GET(OATC14_ADFCAP_SQIPLUS_CAPABILITY, ret);
+	if (bits) {
+		phydev->oatc14_sqi_capability.sqiplus_bits = bits;
+		/* Max sqi+ level supported: (2 ^ bits) - 1 */
+		phydev->oatc14_sqi_capability.sqi_max = BIT(bits) - 1;
+		goto update_done;
+	}
+
+	/* Check for SQI capability
+	 * 0 - SQI is not supported
+	 * 1 - SQI is supported (0-7 levels)
+	 */
+	if (ret & OATC14_ADFCAP_SQI_CAPABILITY) {
+		phydev->oatc14_sqi_capability.sqi_max = OATC14_SQI_MAX_LEVEL;
+		goto update_done;
+	}
+
+	return -EOPNOTSUPP;
+
+update_done:
+	phydev->oatc14_sqi_capability.updated = true;
+	return 0;
+}
+
+/**
+ * genphy_c45_oatc14_get_sqi_max - Get maximum supported SQI or SQI+ level of
+ *				   OATC14 10Base-T1S PHY
+ * @phydev: pointer to the PHY device structure
+ *
+ * This function returns the maximum supported Signal Quality Indicator (SQI) or
+ * SQI+ level. The SQI capability is updated on first invocation if it has not
+ * already been updated.
+ *
+ * Return:
+ * * Maximum SQI/SQI+ level supported
+ * * Negative errno on capability read failure
+ */
+int genphy_c45_oatc14_get_sqi_max(struct phy_device *phydev)
+{
+	int ret;
+
+	if (!phydev->oatc14_sqi_capability.updated) {
+		ret = oatc14_update_sqi_capability(phydev);
+		if (ret)
+			return ret;
+	}
+
+	return phydev->oatc14_sqi_capability.sqi_max;
+}
+EXPORT_SYMBOL(genphy_c45_oatc14_get_sqi_max);
+
+/**
+ * genphy_c45_oatc14_get_sqi - Get Signal Quality Indicator (SQI) from an OATC14
+ *			       10Base-T1S PHY
+ * @phydev: pointer to the PHY device structure
+ *
+ * This function reads the SQI+ or SQI value from an OATC14-compatible
+ * 10Base-T1S PHY. If SQI+ capability is supported, the function returns the
+ * extended SQI+ value; otherwise, it returns the basic SQI value. The SQI
+ * capability is updated on first invocation if it has not already been updated.
+ *
+ * Return:
+ * * SQI/SQI+ value on success
+ * * Negative errno on read failure
+ */
+int genphy_c45_oatc14_get_sqi(struct phy_device *phydev)
+{
+	u8 shift;
+	int ret;
+
+	if (!phydev->oatc14_sqi_capability.updated) {
+		ret = oatc14_update_sqi_capability(phydev);
+		if (ret)
+			return ret;
+	}
+
+	/* Calculate and return SQI+ value if supported */
+	if (phydev->oatc14_sqi_capability.sqiplus_bits) {
+		ret = phy_read_mmd(phydev, MDIO_MMD_VEND2,
+				   MDIO_OATC14_DCQ_SQIPLUS);
+		if (ret < 0)
+			return ret;
+
+		/* SQI+ uses N MSBs out of 8 bits, left-aligned with padding 1's
+		 * Calculate the right-shift needed to isolate the N bits.
+		 */
+		shift = 8 - phydev->oatc14_sqi_capability.sqiplus_bits;
+
+		return (ret & OATC14_DCQ_SQIPLUS_VALUE) >> shift;
+	}
+
+	/* Read and return SQI value if SQI+ capability is not supported */
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_DCQ_SQI);
+	if (ret < 0)
+		return ret;
+
+	return ret & OATC14_DCQ_SQI_VALUE;
+}
+EXPORT_SYMBOL(genphy_c45_oatc14_get_sqi);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 059a104223c4..fbbe028cc4b7 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -530,6 +530,30 @@ struct phy_c45_device_ids {
 struct macsec_context;
 struct macsec_ops;
 
+/**
+ * struct phy_oatc14_sqi_capability - SQI capability information for OATC14
+ *                                    10Base-T1S PHY
+ * @updated: Indicates whether the SQI capability fields have been updated.
+ * @sqi_max: Maximum supported Signal Quality Indicator (SQI) level reported by
+ *           the PHY.
+ * @sqiplus_bits: Bits for SQI+ levels supported by the PHY.
+ *                0 - SQI+ is not supported
+ *                3 - SQI+ is supported, using 3 bits (8 levels)
+ *                4 - SQI+ is supported, using 4 bits (16 levels)
+ *                5 - SQI+ is supported, using 5 bits (32 levels)
+ *                6 - SQI+ is supported, using 6 bits (64 levels)
+ *                7 - SQI+ is supported, using 7 bits (128 levels)
+ *                8 - SQI+ is supported, using 8 bits (256 levels)
+ *
+ * This structure is used by the OATC14 10Base-T1S PHY driver to store the SQI
+ * and SQI+ capability information retrieved from the PHY.
+ */
+struct phy_oatc14_sqi_capability {
+	bool updated;
+	int sqi_max;
+	u8 sqiplus_bits;
+};
+
 /**
  * struct phy_device - An instance of a PHY
  *
@@ -626,6 +650,7 @@ struct macsec_ops;
  * @link_down_events: Number of times link was lost
  * @shared: Pointer to private data shared by phys in one package
  * @priv: Pointer to driver private data
+ * @oatc14_sqi_capability: SQI capability information for OATC14 10Base-T1S PHY
  *
  * interrupts currently only supports enabled or disabled,
  * but could be changed in the future to support enabling
@@ -772,6 +797,8 @@ struct phy_device {
 	/* MACsec management functions */
 	const struct macsec_ops *macsec_ops;
 #endif
+
+	struct phy_oatc14_sqi_capability oatc14_sqi_capability;
 };
 
 /* Generic phy_device::dev_flags */
@@ -2257,6 +2284,8 @@ int genphy_c45_an_config_eee_aneg(struct phy_device *phydev);
 int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev);
 int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev,
 					    bool *finished);
+int genphy_c45_oatc14_get_sqi_max(struct phy_device *phydev);
+int genphy_c45_oatc14_get_sqi(struct phy_device *phydev);
 
 /* The gen10g_* functions are the old Clause 45 stub */
 int gen10g_config_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3


From a0244e76213980f3b9bb5d40b0b6705fcf24230d Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Sun, 30 Nov 2025 15:16:44 +0200
Subject: net: hsr: create an API to get hsr port type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the introduction of HSR_PT_INTERLINK in commit 5055cccfc2d1 ("net:
hsr: Provide RedBox support (HSR-SAN)"), we see that different port
types require different settings for hardware offload, which was not the
case before when we only had HSR_PT_SLAVE_A and HSR_PT_SLAVE_B. But
there is currently no way to know which port is which type, so create
the hsr_get_port_type() API function and export it.

When hsr_get_port_type() is called from the device driver, the port must
be found in the HSR port list. An important use case is for this
function to work from offloading drivers' NETDEV_CHANGEUPPER handler,
which is triggered by hsr_portdev_setup() -> netdev_master_upper_dev_link().
Therefore, we need to move the addition of the hsr_port to the HSR port
list prior to calling hsr_portdev_setup(). This makes the error
restoration path also more similar to hsr_del_port(), where
kfree_rcu(port) is already used.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Lukasz Majewski <lukma@denx.de>
Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Łukasz Majewski <lukma@nabladev.com>
Link: https://patch.msgid.link/20251130131657.65080-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_hsr.h |  9 +++++++++
 net/hsr/hsr_device.c   | 20 ++++++++++++++++++++
 net/hsr/hsr_slave.c    |  7 ++++---
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h
index d7941fd88032..f4cf2dd36d19 100644
--- a/include/linux/if_hsr.h
+++ b/include/linux/if_hsr.h
@@ -43,6 +43,8 @@ extern bool is_hsr_master(struct net_device *dev);
 extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver);
 struct net_device *hsr_get_port_ndev(struct net_device *ndev,
 				     enum hsr_port_type pt);
+int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev,
+		      enum hsr_port_type *type);
 #else
 static inline bool is_hsr_master(struct net_device *dev)
 {
@@ -59,6 +61,13 @@ static inline struct net_device *hsr_get_port_ndev(struct net_device *ndev,
 {
 	return ERR_PTR(-EINVAL);
 }
+
+static inline int hsr_get_port_type(struct net_device *hsr_dev,
+				    struct net_device *dev,
+				    enum hsr_port_type *type)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_HSR */
 
 #endif /*_LINUX_IF_HSR_H_*/
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 492cbc78ab75..d1bfc49b5f01 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -690,6 +690,26 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev,
 }
 EXPORT_SYMBOL(hsr_get_port_ndev);
 
+int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev,
+		      enum hsr_port_type *type)
+{
+	struct hsr_priv *hsr = netdev_priv(hsr_dev);
+	struct hsr_port *port;
+
+	rcu_read_lock();
+	hsr_for_each_port(hsr, port) {
+		if (port->dev == dev) {
+			*type = port->type;
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(hsr_get_port_type);
+
 /* Default multicast address for HSR Supervision frames */
 static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
 	0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 8177ac6c2d26..afe06ba00ea4 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -207,14 +207,14 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
 	port->type = type;
 	ether_addr_copy(port->original_macaddress, dev->dev_addr);
 
+	list_add_tail_rcu(&port->port_list, &hsr->ports);
+
 	if (type != HSR_PT_MASTER) {
 		res = hsr_portdev_setup(hsr, dev, port, extack);
 		if (res)
 			goto fail_dev_setup;
 	}
 
-	list_add_tail_rcu(&port->port_list, &hsr->ports);
-
 	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
 	netdev_update_features(master->dev);
 	dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
@@ -222,7 +222,8 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
 	return 0;
 
 fail_dev_setup:
-	kfree(port);
+	list_del_rcu(&port->port_list);
+	kfree_rcu(port, rcu);
 	return res;
 }
 
-- 
cgit v1.2.3


From 0e75bfe340bf05d1586eaf02942438573bda69e3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 30 Nov 2025 15:16:47 +0200
Subject: net: dsa: add simple HSR offload helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out that HSR offloads are so fine-grained that many DSA
switches can do a small part even though they weren't specifically
designed for the protocols supported by that driver (HSR and PRP).

Specifically NETIF_F_HW_HSR_DUP - it is simple packet duplication on
transmit, towards all (aka 2) ports members of the HSR device.

For many DSA switches, we know how to duplicate a packet, even though we
never typically use that feature. The transmit port mask from the
tagging protocol can have multiple bits set, and the switch should send
the packet once to every port with a bit set from that mask.

Nonetheless, not all tagging protocols are like this, and sometimes the
port is a single numeric value rather than a bit mask. For that reason,
and also because switches can sometimes change tagging protocols for
different ones, we need to make HSR offload helpers opt-in.

For devices that can do nothing else HSR-specific, we introduce
dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave(). These
functions monitor when two user ports of the same switch are part of the
same HSR device, and when that condition is true, they toggle the
NETIF_F_HW_HSR_DUP feature flag of both net devices.

Normally only dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave()
are needed. The dsa_port_simple_hsr_validate() helper is just to see
what kind of configuration could be offloadable using the generic
helpers. This is used by switch drivers which are not currently using
the right tagging protocol to offload this HSR ring, but could in
principle offload it after changing the tagger.

Suggested-by: David Yang <mmyangfl@gmail.com>
Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Cc: "Chester A. Unal" <chester.a.unal@arinc9.com>
Cc: "Clément Léger" <clement.leger@bootlin.com>
Cc: Daniel Golle <daniel@makrotopia.org>
Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: George McCollister <george.mccollister@gmail.com>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Jonas Gorski <jonas.gorski@gmail.com>
Cc: Kurt Kanzenbach <kurt@linutronix.de>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Sean Wang <sean.wang@mediatek.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-6-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |  9 ++++++++
 net/dsa/dsa.c     | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e40cdc12f7f3..cced1a866757 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1322,6 +1322,15 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 				 const struct switchdev_obj_port_mdb *mdb,
 				 struct dsa_db db);
 
+int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port,
+				 struct net_device *hsr,
+				 struct netlink_ext_ack *extack);
+int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port,
+			     struct net_device *hsr,
+			     struct netlink_ext_ack *extack);
+int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port,
+			      struct net_device *hsr);
+
 /* Keep inline for faster access in hot path */
 static inline bool netdev_uses_dsa(const struct net_device *dev)
 {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 5b01a0e43ebe..a20efabe778f 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/if_hsr.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
@@ -1766,6 +1767,70 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
 
+/* Helpers for switches without specific HSR offloads, but which can implement
+ * NETIF_F_HW_HSR_DUP because their tagger uses dsa_xmit_port_mask()
+ */
+int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port,
+				 struct net_device *hsr,
+				 struct netlink_ext_ack *extack)
+{
+	enum hsr_port_type type;
+	int err;
+
+	err = hsr_get_port_type(hsr, dsa_to_port(ds, port)->user, &type);
+	if (err)
+		return err;
+
+	if (type != HSR_PT_SLAVE_A && type != HSR_PT_SLAVE_B) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only HSR slave ports can be offloaded");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_validate);
+
+int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port,
+			     struct net_device *hsr,
+			     struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+	int err;
+
+	err = dsa_port_simple_hsr_validate(ds, port, hsr, extack);
+	if (err)
+		return err;
+
+	dsa_hsr_foreach_port(other_dp, ds, hsr) {
+		if (other_dp != dp) {
+			dp->user->features |= NETIF_F_HW_HSR_DUP;
+			other_dp->user->features |= NETIF_F_HW_HSR_DUP;
+			break;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_join);
+
+int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port,
+			      struct net_device *hsr)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+
+	dsa_hsr_foreach_port(other_dp, ds, hsr) {
+		if (other_dp != dp) {
+			dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+			other_dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+			break;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_leave);
+
 static const struct dsa_stubs __dsa_stubs = {
 	.conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate,
 };
-- 
cgit v1.2.3


From 6b0f4ca079dbe6ae4aa57e529d67c7dc00d63577 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Wed, 26 Nov 2025 17:35:37 +0000
Subject: wireguard: netlink: add YNL specification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a near[1] complete YNL specification for WireGuard,
documenting the protocol in a machine-readable format, rather than
comments in wireguard.h, and eases usage from C and non-C programming
languages alike.

The generated C library will be featured in a later patch, so in
this patch I will use the in-kernel python client for examples.

This makes the documentation in the UAPI header redundant, it is
therefore removed. The in-line documentation in the spec is based
on the existing comment in wireguard.h, and once released it will
be available in the kernel documentation at:
  https://docs.kernel.org/netlink/specs/wireguard.html
  (until then run: make htmldocs)

Generate wireguard.rst from this spec:
$ make -C tools/net/ynl/generated/ wireguard.rst

Query wireguard interface through pyynl:
$ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \
                                    --dump get-device \
                                    --json '{"ifindex":3}'
[{'fwmark': 0,
  'ifindex': 3,
  'ifname': 'wg-test',
  'listen-port': 54318,
  'peers': [{0: {'allowedips': [{0: {'cidr-mask': 0,
                                     'family': 2,
                                     'ipaddr': '0.0.0.0'}},
                                {0: {'cidr-mask': 0,
                                     'family': 10,
                                     'ipaddr': '::'}}],
                 'endpoint': b'[...]',
                 'last-handshake-time': {'nsec': 42, 'sec': 42},
                 'persistent-keepalive-interval': 42,
                 'preshared-key': '[...]',
                 'protocol-version': 1,
                 'public-key': '[...]',
                 'rx-bytes': 42,
                 'tx-bytes': 42}}],
  'private-key': '[...]',
  'public-key': '[...]'}]

Add another allowed IP prefix:
$ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \
  --do set-device --json '{"ifindex":3,"peers":[
    {"public-key":"6a df b1 83 a4 ..","allowedips":[
      {"cidr-mask":0,"family":10,"ipaddr":"::"}]}]}'

[1] As can be seen above, the "endpoint" is only dumped as binary data,
    as it can't be fully described in YNL. It's either a struct
    sockaddr_in or struct sockaddr_in6 depending on the attribute length.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 Documentation/netlink/specs/wireguard.yaml | 298 +++++++++++++++++++++++++++++
 MAINTAINERS                                |   1 +
 include/uapi/linux/wireguard.h             | 129 -------------
 3 files changed, 299 insertions(+), 129 deletions(-)
 create mode 100644 Documentation/netlink/specs/wireguard.yaml

(limited to 'include')

diff --git a/Documentation/netlink/specs/wireguard.yaml b/Documentation/netlink/specs/wireguard.yaml
new file mode 100644
index 000000000000..30479fc6bb69
--- /dev/null
+++ b/Documentation/netlink/specs/wireguard.yaml
@@ -0,0 +1,298 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+---
+name: wireguard
+protocol: genetlink-legacy
+
+doc: |
+  **Netlink protocol to control WireGuard network devices.**
+
+  The below enums and macros are for interfacing with WireGuard, using generic
+  netlink, with family ``WG_GENL_NAME`` and version ``WG_GENL_VERSION``. It
+  defines two commands: get and set. Note that while they share many common
+  attributes, these two commands actually accept a slightly different set of
+  inputs and outputs. These differences are noted under the individual
+  attributes.
+c-family-name: wg-genl-name
+c-version-name: wg-genl-version
+max-by-define: true
+
+definitions:
+  -
+    name-prefix: wg-
+    name: key-len
+    type: const
+    value: 32
+  -
+    name: --kernel-timespec
+    type: struct
+    header: linux/time_types.h
+    members:
+      -
+        name: sec
+        type: u64
+        doc: Number of seconds, since UNIX epoch.
+      -
+        name: nsec
+        type: u64
+        doc: Number of nanoseconds, after the second began.
+  -
+    name: wgdevice-flags
+    name-prefix: wgdevice-f-
+    enum-name: wgdevice-flag
+    type: flags
+    entries:
+      - replace-peers
+  -
+    name: wgpeer-flags
+    name-prefix: wgpeer-f-
+    enum-name: wgpeer-flag
+    type: flags
+    entries:
+      - remove-me
+      - replace-allowedips
+      - update-only
+  -
+    name: wgallowedip-flags
+    name-prefix: wgallowedip-f-
+    enum-name: wgallowedip-flag
+    type: flags
+    entries:
+      - remove-me
+
+attribute-sets:
+  -
+    name: wgdevice
+    enum-name: wgdevice-attribute
+    name-prefix: wgdevice-a-
+    attr-cnt-name: --wgdevice-a-last
+    attributes:
+      -
+        name: unspec
+        type: unused
+        value: 0
+      -
+        name: ifindex
+        type: u32
+      -
+        name: ifname
+        type: string
+        checks:
+          max-len: 15
+      -
+        name: private-key
+        type: binary
+        doc: Set to all zeros to remove.
+        display-hint: hex
+        checks:
+          exact-len: wg-key-len
+      -
+        name: public-key
+        type: binary
+        display-hint: hex
+        checks:
+          exact-len: wg-key-len
+      -
+        name: flags
+        type: u32
+        doc: |
+          ``0`` or ``WGDEVICE_F_REPLACE_PEERS`` if all current peers should be
+          removed prior to adding the list below.
+        enum: wgdevice-flags
+      -
+        name: listen-port
+        type: u16
+        doc: Set as ``0`` to choose randomly.
+      -
+        name: fwmark
+        type: u32
+        doc: Set as ``0`` to disable.
+      -
+        name: peers
+        type: indexed-array
+        sub-type: nest
+        nested-attributes: wgpeer
+        doc: |
+          The index/type parameter is unused on ``SET_DEVICE`` operations and is
+          zero on ``GET_DEVICE`` operations.
+  -
+    name: wgpeer
+    enum-name: wgpeer-attribute
+    name-prefix: wgpeer-a-
+    attr-cnt-name: --wgpeer-a-last
+    attributes:
+      -
+        name: unspec
+        type: unused
+        value: 0
+      -
+        name: public-key
+        type: binary
+        display-hint: hex
+        checks:
+          exact-len: wg-key-len
+      -
+        name: preshared-key
+        type: binary
+        doc: Set as all zeros to remove.
+        display-hint: hex
+        checks:
+          exact-len: wg-key-len
+      -
+        name: flags
+        type: u32
+        doc: |
+          ``0`` and/or ``WGPEER_F_REMOVE_ME`` if the specified peer should not
+          exist at the end of the operation, rather than added/updated and/or
+          ``WGPEER_F_REPLACE_ALLOWEDIPS`` if all current allowed IPs of this
+          peer should be removed prior to adding the list below and/or
+          ``WGPEER_F_UPDATE_ONLY`` if the peer should only be set if it already
+          exists.
+        enum: wgpeer-flags
+      -
+        name: endpoint
+        type: binary
+        doc: struct sockaddr_in or struct sockaddr_in6
+        checks:
+          min-len: 16
+      -
+        name: persistent-keepalive-interval
+        type: u16
+        doc: Set as ``0`` to disable.
+      -
+        name: last-handshake-time
+        type: binary
+        struct: --kernel-timespec
+        checks:
+          exact-len: 16
+      -
+        name: rx-bytes
+        type: u64
+      -
+        name: tx-bytes
+        type: u64
+      -
+        name: allowedips
+        type: indexed-array
+        sub-type: nest
+        nested-attributes: wgallowedip
+        doc: |
+          The index/type parameter is unused on ``SET_DEVICE`` operations and is
+          zero on ``GET_DEVICE`` operations.
+      -
+        name: protocol-version
+        type: u32
+        doc: |
+          Should not be set or used at all by most users of this API, as the
+          most recent protocol will be used when this is unset. Otherwise,
+          must be set to ``1``.
+  -
+    name: wgallowedip
+    enum-name: wgallowedip-attribute
+    name-prefix: wgallowedip-a-
+    attr-cnt-name: --wgallowedip-a-last
+    attributes:
+      -
+        name: unspec
+        type: unused
+        value: 0
+      -
+        name: family
+        type: u16
+        doc: IP family, either ``AF_INET`` or ``AF_INET6``.
+      -
+        name: ipaddr
+        type: binary
+        doc: Either ``struct in_addr`` or ``struct in6_addr``.
+        display-hint: ipv4-or-v6
+        checks:
+          min-len: 4
+      -
+        name: cidr-mask
+        type: u8
+      -
+        name: flags
+        type: u32
+        doc: |
+          ``WGALLOWEDIP_F_REMOVE_ME`` if the specified IP should be removed;
+          otherwise, this IP will be added if it is not already present.
+        enum: wgallowedip-flags
+
+operations:
+  enum-name: wg-cmd
+  name-prefix: wg-cmd-
+  list:
+    -
+      name: get-device
+      value: 0
+      doc: |
+        Retrieve WireGuard device
+        ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        The command should be called with one but not both of:
+
+        - ``WGDEVICE_A_IFINDEX``
+        - ``WGDEVICE_A_IFNAME``
+
+        The kernel will then return several messages (``NLM_F_MULTI``). It is
+        possible that all of the allowed IPs of a single peer will not fit
+        within a single netlink message. In that case, the same peer will be
+        written in the following message, except it will only contain
+        ``WGPEER_A_PUBLIC_KEY`` and ``WGPEER_A_ALLOWEDIPS``. This may occur
+        several times in a row for the same peer. It is then up to the receiver
+        to coalesce adjacent peers. Likewise, it is possible that all peers will
+        not fit within a single message. So, subsequent peers will be sent in
+        following messages, except those will only contain ``WGDEVICE_A_IFNAME``
+        and ``WGDEVICE_A_PEERS``. It is then up to the receiver to coalesce
+        these messages to form the complete list of peers.
+
+        Since this is an ``NLM_F_DUMP`` command, the final message will always
+        be ``NLMSG_DONE``, even if an error occurs. However, this ``NLMSG_DONE``
+        message contains an integer error code. It is either zero or a negative
+        error code corresponding to the errno.
+      attribute-set: wgdevice
+      flags: [uns-admin-perm]
+
+      dump:
+        pre: wg-get-device-start
+        post: wg-get-device-done
+        request:
+          attributes:
+            - ifindex
+            - ifname
+        reply: &all-attrs
+          attributes:
+            - ifindex
+            - ifname
+            - private-key
+            - public-key
+            - flags
+            - listen-port
+            - fwmark
+            - peers
+    -
+      name: set-device
+      value: 1
+      doc: |
+        Set WireGuard device
+        ~~~~~~~~~~~~~~~~~~~~
+
+        This command should be called with a wgdevice set, containing one but
+        not both of ``WGDEVICE_A_IFINDEX`` and ``WGDEVICE_A_IFNAME``.
+
+        It is possible that the amount of configuration data exceeds that of the
+        maximum message length accepted by the kernel. In that case, several
+        messages should be sent one after another, with each successive one
+        filling in information not contained in the prior. Note that if
+        ``WGDEVICE_F_REPLACE_PEERS`` is specified in the first message, it
+        probably should not be specified in fragments that come after, so that
+        the list of peers is only cleared the first time but appended after.
+        Likewise for peers, if ``WGPEER_F_REPLACE_ALLOWEDIPS`` is specified in
+        the first message of a peer, it likely should not be specified in
+        subsequent fragments.
+
+        If an error occurs, ``NLMSG_ERROR`` will reply containing an errno.
+      attribute-set: wgdevice
+      flags: [uns-admin-perm]
+
+      do:
+        request: *all-attrs
diff --git a/MAINTAINERS b/MAINTAINERS
index 09932ab7e0e8..8b44a380642c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27673,6 +27673,7 @@ M:	Jason A. Donenfeld <Jason@zx2c4.com>
 L:	wireguard@lists.zx2c4.com
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	Documentation/netlink/specs/wireguard.yaml
 F:	drivers/net/wireguard/
 F:	tools/testing/selftests/wireguard/
 
diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h
index 8c26391196d5..dee4401e0b5d 100644
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -1,135 +1,6 @@
 /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
 /*
  * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Documentation
- * =============
- *
- * The below enums and macros are for interfacing with WireGuard, using generic
- * netlink, with family WG_GENL_NAME and version WG_GENL_VERSION. It defines two
- * methods: get and set. Note that while they share many common attributes,
- * these two functions actually accept a slightly different set of inputs and
- * outputs.
- *
- * WG_CMD_GET_DEVICE
- * -----------------
- *
- * May only be called via NLM_F_REQUEST | NLM_F_DUMP. The command should contain
- * one but not both of:
- *
- *    WGDEVICE_A_IFINDEX: NLA_U32
- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
- *
- * The kernel will then return several messages (NLM_F_MULTI) containing the
- * following tree of nested items:
- *
- *    WGDEVICE_A_IFINDEX: NLA_U32
- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
- *    WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
- *    WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
- *    WGDEVICE_A_LISTEN_PORT: NLA_U16
- *    WGDEVICE_A_FWMARK: NLA_U32
- *    WGDEVICE_A_PEERS: NLA_NESTED
- *        0: NLA_NESTED
- *            WGPEER_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
- *            WGPEER_A_PRESHARED_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
- *            WGPEER_A_ENDPOINT: NLA_MIN_LEN(struct sockaddr), struct sockaddr_in or struct sockaddr_in6
- *            WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16
- *            WGPEER_A_LAST_HANDSHAKE_TIME: NLA_EXACT_LEN, struct __kernel_timespec
- *            WGPEER_A_RX_BYTES: NLA_U64
- *            WGPEER_A_TX_BYTES: NLA_U64
- *            WGPEER_A_ALLOWEDIPS: NLA_NESTED
- *                0: NLA_NESTED
- *                    WGALLOWEDIP_A_FAMILY: NLA_U16
- *                    WGALLOWEDIP_A_IPADDR: NLA_MIN_LEN(struct in_addr), struct in_addr or struct in6_addr
- *                    WGALLOWEDIP_A_CIDR_MASK: NLA_U8
- *                0: NLA_NESTED
- *                    ...
- *                0: NLA_NESTED
- *                    ...
- *                ...
- *            WGPEER_A_PROTOCOL_VERSION: NLA_U32
- *        0: NLA_NESTED
- *            ...
- *        ...
- *
- * It is possible that all of the allowed IPs of a single peer will not
- * fit within a single netlink message. In that case, the same peer will
- * be written in the following message, except it will only contain
- * WGPEER_A_PUBLIC_KEY and WGPEER_A_ALLOWEDIPS. This may occur several
- * times in a row for the same peer. It is then up to the receiver to
- * coalesce adjacent peers. Likewise, it is possible that all peers will
- * not fit within a single message. So, subsequent peers will be sent
- * in following messages, except those will only contain WGDEVICE_A_IFNAME
- * and WGDEVICE_A_PEERS. It is then up to the receiver to coalesce these
- * messages to form the complete list of peers.
- *
- * Since this is an NLA_F_DUMP command, the final message will always be
- * NLMSG_DONE, even if an error occurs. However, this NLMSG_DONE message
- * contains an integer error code. It is either zero or a negative error
- * code corresponding to the errno.
- *
- * WG_CMD_SET_DEVICE
- * -----------------
- *
- * May only be called via NLM_F_REQUEST. The command should contain the
- * following tree of nested items, containing one but not both of
- * WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME:
- *
- *    WGDEVICE_A_IFINDEX: NLA_U32
- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
- *    WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current
- *                      peers should be removed prior to adding the list below.
- *    WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove
- *    WGDEVICE_A_LISTEN_PORT: NLA_U16, 0 to choose randomly
- *    WGDEVICE_A_FWMARK: NLA_U32, 0 to disable
- *    WGDEVICE_A_PEERS: NLA_NESTED
- *        0: NLA_NESTED
- *            WGPEER_A_PUBLIC_KEY: len WG_KEY_LEN
- *            WGPEER_A_FLAGS: NLA_U32, 0 and/or WGPEER_F_REMOVE_ME if the
- *                            specified peer should not exist at the end of the
- *                            operation, rather than added/updated and/or
- *                            WGPEER_F_REPLACE_ALLOWEDIPS if all current allowed
- *                            IPs of this peer should be removed prior to adding
- *                            the list below and/or WGPEER_F_UPDATE_ONLY if the
- *                            peer should only be set if it already exists.
- *            WGPEER_A_PRESHARED_KEY: len WG_KEY_LEN, all zeros to remove
- *            WGPEER_A_ENDPOINT: struct sockaddr_in or struct sockaddr_in6
- *            WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16, 0 to disable
- *            WGPEER_A_ALLOWEDIPS: NLA_NESTED
- *                0: NLA_NESTED
- *                    WGALLOWEDIP_A_FAMILY: NLA_U16
- *                    WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr
- *                    WGALLOWEDIP_A_CIDR_MASK: NLA_U8
- *                    WGALLOWEDIP_A_FLAGS: NLA_U32, WGALLOWEDIP_F_REMOVE_ME if
- *                                         the specified IP should be removed;
- *                                         otherwise, this IP will be added if
- *                                         it is not already present.
- *                0: NLA_NESTED
- *                    ...
- *                0: NLA_NESTED
- *                    ...
- *                ...
- *            WGPEER_A_PROTOCOL_VERSION: NLA_U32, should not be set or used at
- *                                       all by most users of this API, as the
- *                                       most recent protocol will be used when
- *                                       this is unset. Otherwise, must be set
- *                                       to 1.
- *        0: NLA_NESTED
- *            ...
- *        ...
- *
- * It is possible that the amount of configuration data exceeds that of
- * the maximum message length accepted by the kernel. In that case, several
- * messages should be sent one after another, with each successive one
- * filling in information not contained in the prior. Note that if
- * WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably
- * should not be specified in fragments that come after, so that the list
- * of peers is only cleared the first time but appended after. Likewise for
- * peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message
- * of a peer, it likely should not be specified in subsequent fragments.
- *
- * If an error occurs, NLMSG_ERROR will reply containing an errno.
  */
 
 #ifndef _WG_UAPI_WIREGUARD_H
-- 
cgit v1.2.3


From b5c5a82bf5cb96e14a6627ef21be962052a0c6d8 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Wed, 26 Nov 2025 17:35:38 +0000
Subject: wireguard: uapi: move enum wg_cmd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch moves enum wg_cmd to the end of the file, where ynl-gen
would generate it.

This is an incremental step towards adopting an UAPI header generated
by ynl-gen. This is split out to keep the patches readable.

This is a trivial patch with no behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/uapi/linux/wireguard.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h
index dee4401e0b5d..3ebfffd61269 100644
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -11,13 +11,6 @@
 
 #define WG_KEY_LEN 32
 
-enum wg_cmd {
-	WG_CMD_GET_DEVICE,
-	WG_CMD_SET_DEVICE,
-	__WG_CMD_MAX
-};
-#define WG_CMD_MAX (__WG_CMD_MAX - 1)
-
 enum wgdevice_flag {
 	WGDEVICE_F_REPLACE_PEERS = 1U << 0,
 	__WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS
@@ -73,4 +66,12 @@ enum wgallowedip_attribute {
 };
 #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1)
 
+enum wg_cmd {
+	WG_CMD_GET_DEVICE,
+	WG_CMD_SET_DEVICE,
+
+	__WG_CMD_MAX
+};
+#define WG_CMD_MAX (__WG_CMD_MAX - 1)
+
 #endif /* _WG_UAPI_WIREGUARD_H */
-- 
cgit v1.2.3


From 8d974872ab29eeb93a5b0b698007257d8be07968 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Wed, 26 Nov 2025 17:35:39 +0000
Subject: wireguard: uapi: move flag enums
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the wg*_flag enums, so they are defined above the attribute set
enums, where ynl-gen would place them.

This is an incremental step towards adopting an UAPI header generated
by ynl-gen. This is split out to keep the patches readable.

This is a trivial patch with no behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/uapi/linux/wireguard.h | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h
index 3ebfffd61269..a2815f4f2910 100644
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -15,6 +15,20 @@ enum wgdevice_flag {
 	WGDEVICE_F_REPLACE_PEERS = 1U << 0,
 	__WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS
 };
+
+enum wgpeer_flag {
+	WGPEER_F_REMOVE_ME = 1U << 0,
+	WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1,
+	WGPEER_F_UPDATE_ONLY = 1U << 2,
+	__WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS |
+			 WGPEER_F_UPDATE_ONLY
+};
+
+enum wgallowedip_flag {
+	WGALLOWEDIP_F_REMOVE_ME = 1U << 0,
+	__WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME
+};
+
 enum wgdevice_attribute {
 	WGDEVICE_A_UNSPEC,
 	WGDEVICE_A_IFINDEX,
@@ -29,13 +43,6 @@ enum wgdevice_attribute {
 };
 #define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1)
 
-enum wgpeer_flag {
-	WGPEER_F_REMOVE_ME = 1U << 0,
-	WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1,
-	WGPEER_F_UPDATE_ONLY = 1U << 2,
-	__WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS |
-			 WGPEER_F_UPDATE_ONLY
-};
 enum wgpeer_attribute {
 	WGPEER_A_UNSPEC,
 	WGPEER_A_PUBLIC_KEY,
@@ -52,10 +59,6 @@ enum wgpeer_attribute {
 };
 #define WGPEER_A_MAX (__WGPEER_A_LAST - 1)
 
-enum wgallowedip_flag {
-	WGALLOWEDIP_F_REMOVE_ME = 1U << 0,
-	__WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME
-};
 enum wgallowedip_attribute {
 	WGALLOWEDIP_A_UNSPEC,
 	WGALLOWEDIP_A_FAMILY,
-- 
cgit v1.2.3


From 88cedad45ba14097e06d2c9f6578688097a94691 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Wed, 26 Nov 2025 17:35:40 +0000
Subject: wireguard: uapi: generate header with ynl-gen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use ynl-gen to generate the UAPI header for WireGuard.

The cosmetic changes in this patch confirm that the spec is aligned
with the implementation. Using the generated version ensures
that they stay in sync.

Changes in the generated header:
* Trivial header guard rename.
* Trivial white space changes.
* Trivial comment changes.
* Precompute bitflags in ynl-gen (see [1]).
* Drop __*_F_ALL constants (see [1]).

[1] https://lore.kernel.org/r/20251014123201.6ecfd146@kernel.org/

No behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/netlink.c |  6 +++---
 include/uapi/linux/wireguard.h  | 38 +++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c
index c2d0576e96f5..0ce0bda8c1ce 100644
--- a/drivers/net/wireguard/netlink.c
+++ b/drivers/net/wireguard/netlink.c
@@ -26,7 +26,7 @@ static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = {
 	[WGDEVICE_A_IFNAME]		= { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
 	[WGDEVICE_A_PRIVATE_KEY]	= NLA_POLICY_EXACT_LEN(WG_KEY_LEN),
 	[WGDEVICE_A_PUBLIC_KEY]		= NLA_POLICY_EXACT_LEN(WG_KEY_LEN),
-	[WGDEVICE_A_FLAGS]		= NLA_POLICY_MASK(NLA_U32, __WGDEVICE_F_ALL),
+	[WGDEVICE_A_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0x1),
 	[WGDEVICE_A_LISTEN_PORT]	= { .type = NLA_U16 },
 	[WGDEVICE_A_FWMARK]		= { .type = NLA_U32 },
 	[WGDEVICE_A_PEERS]		= NLA_POLICY_NESTED_ARRAY(peer_policy),
@@ -35,7 +35,7 @@ static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = {
 static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = {
 	[WGPEER_A_PUBLIC_KEY]				= NLA_POLICY_EXACT_LEN(WG_KEY_LEN),
 	[WGPEER_A_PRESHARED_KEY]			= NLA_POLICY_EXACT_LEN(WG_KEY_LEN),
-	[WGPEER_A_FLAGS]				= NLA_POLICY_MASK(NLA_U32, __WGPEER_F_ALL),
+	[WGPEER_A_FLAGS]				= NLA_POLICY_MASK(NLA_U32, 0x7),
 	[WGPEER_A_ENDPOINT]				= NLA_POLICY_MIN_LEN(sizeof(struct sockaddr)),
 	[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]	= { .type = NLA_U16 },
 	[WGPEER_A_LAST_HANDSHAKE_TIME]			= NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)),
@@ -49,7 +49,7 @@ static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = {
 	[WGALLOWEDIP_A_FAMILY]		= { .type = NLA_U16 },
 	[WGALLOWEDIP_A_IPADDR]		= NLA_POLICY_MIN_LEN(sizeof(struct in_addr)),
 	[WGALLOWEDIP_A_CIDR_MASK]	= { .type = NLA_U8 },
-	[WGALLOWEDIP_A_FLAGS]		= NLA_POLICY_MASK(NLA_U32, __WGALLOWEDIP_F_ALL),
+	[WGALLOWEDIP_A_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0x1),
 };
 
 static struct wg_device *lookup_interface(struct nlattr **attrs,
diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h
index a2815f4f2910..a100b9715b08 100644
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -1,32 +1,29 @@
-/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/wireguard.yaml */
+/* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
-#ifndef _WG_UAPI_WIREGUARD_H
-#define _WG_UAPI_WIREGUARD_H
+#ifndef _UAPI_LINUX_WIREGUARD_H
+#define _UAPI_LINUX_WIREGUARD_H
 
-#define WG_GENL_NAME "wireguard"
-#define WG_GENL_VERSION 1
+#define WG_GENL_NAME	"wireguard"
+#define WG_GENL_VERSION	1
 
-#define WG_KEY_LEN 32
+#define WG_KEY_LEN	32
 
 enum wgdevice_flag {
-	WGDEVICE_F_REPLACE_PEERS = 1U << 0,
-	__WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS
+	WGDEVICE_F_REPLACE_PEERS = 1,
 };
 
 enum wgpeer_flag {
-	WGPEER_F_REMOVE_ME = 1U << 0,
-	WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1,
-	WGPEER_F_UPDATE_ONLY = 1U << 2,
-	__WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS |
-			 WGPEER_F_UPDATE_ONLY
+	WGPEER_F_REMOVE_ME = 1,
+	WGPEER_F_REPLACE_ALLOWEDIPS = 2,
+	WGPEER_F_UPDATE_ONLY = 4,
 };
 
 enum wgallowedip_flag {
-	WGALLOWEDIP_F_REMOVE_ME = 1U << 0,
-	__WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME
+	WGALLOWEDIP_F_REMOVE_ME = 1,
 };
 
 enum wgdevice_attribute {
@@ -39,6 +36,7 @@ enum wgdevice_attribute {
 	WGDEVICE_A_LISTEN_PORT,
 	WGDEVICE_A_FWMARK,
 	WGDEVICE_A_PEERS,
+
 	__WGDEVICE_A_LAST
 };
 #define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1)
@@ -55,6 +53,7 @@ enum wgpeer_attribute {
 	WGPEER_A_TX_BYTES,
 	WGPEER_A_ALLOWEDIPS,
 	WGPEER_A_PROTOCOL_VERSION,
+
 	__WGPEER_A_LAST
 };
 #define WGPEER_A_MAX (__WGPEER_A_LAST - 1)
@@ -65,6 +64,7 @@ enum wgallowedip_attribute {
 	WGALLOWEDIP_A_IPADDR,
 	WGALLOWEDIP_A_CIDR_MASK,
 	WGALLOWEDIP_A_FLAGS,
+
 	__WGALLOWEDIP_A_LAST
 };
 #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1)
@@ -77,4 +77,4 @@ enum wg_cmd {
 };
 #define WG_CMD_MAX (__WG_CMD_MAX - 1)
 
-#endif /* _WG_UAPI_WIREGUARD_H */
+#endif /* _UAPI_LINUX_WIREGUARD_H */
-- 
cgit v1.2.3


From a42b71d49945aac0b943987cbdec1d1c805caab3 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Mon, 1 Dec 2025 13:35:03 +0100
Subject: ata: libata: Move quirk flags to their own enum
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The anonymous enum in include/linux/libata.h that is used to store
various global constants can currently be backed by type int.
(It contains both negative and positive constants.)

__ATA_QUIRK_MAX is currently 31.
The quirk flags in the various global constants enum are defined as
"1U << quirk_flag_bit".

Thus if we simply add an additional quirk, the quirk flag will be 1 << 31,
which is a value that is too large to be represented by a signed int.
The various global constants enum will therefore be backed by type
long.

This will lead to error prints like e.g.:
ata_port_err(ap, "EH pending after %d tries, giving up\n",
	     ATA_EH_MAX_TRIES);

now failing to build, with build error:
error: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Werror=format=]

This is because all constants in the various global constants enum now
have to be printed as a long, as that is now the backing type of the enum.

Since the compiler will use the smallest possible backing type for an
enum, it is good practice to not mix unrelated things in a single enum.

Move the quirk flags to a separate enum, so that we don't need to change
the printf specifier for all other constants in the "various global
constants" enum when adding an additional quirk.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 74 ++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 21de0935775d..9aa0541dc62d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -85,6 +85,44 @@ enum ata_quirks {
 	__ATA_QUIRK_MAX,
 };
 
+/*
+ * Quirk flags: may be set by libata or controller drivers on drives.
+ * Some quirks may be drive/controller pair dependent.
+ */
+enum {
+	ATA_QUIRK_DIAGNOSTIC		= (1U << __ATA_QUIRK_DIAGNOSTIC),
+	ATA_QUIRK_NODMA			= (1U << __ATA_QUIRK_NODMA),
+	ATA_QUIRK_NONCQ			= (1U << __ATA_QUIRK_NONCQ),
+	ATA_QUIRK_MAX_SEC_128		= (1U << __ATA_QUIRK_MAX_SEC_128),
+	ATA_QUIRK_BROKEN_HPA		= (1U << __ATA_QUIRK_BROKEN_HPA),
+	ATA_QUIRK_DISABLE		= (1U << __ATA_QUIRK_DISABLE),
+	ATA_QUIRK_HPA_SIZE		= (1U << __ATA_QUIRK_HPA_SIZE),
+	ATA_QUIRK_IVB			= (1U << __ATA_QUIRK_IVB),
+	ATA_QUIRK_STUCK_ERR		= (1U << __ATA_QUIRK_STUCK_ERR),
+	ATA_QUIRK_BRIDGE_OK		= (1U << __ATA_QUIRK_BRIDGE_OK),
+	ATA_QUIRK_ATAPI_MOD16_DMA	= (1U << __ATA_QUIRK_ATAPI_MOD16_DMA),
+	ATA_QUIRK_FIRMWARE_WARN		= (1U << __ATA_QUIRK_FIRMWARE_WARN),
+	ATA_QUIRK_1_5_GBPS		= (1U << __ATA_QUIRK_1_5_GBPS),
+	ATA_QUIRK_NOSETXFER		= (1U << __ATA_QUIRK_NOSETXFER),
+	ATA_QUIRK_BROKEN_FPDMA_AA	= (1U << __ATA_QUIRK_BROKEN_FPDMA_AA),
+	ATA_QUIRK_DUMP_ID		= (1U << __ATA_QUIRK_DUMP_ID),
+	ATA_QUIRK_MAX_SEC_LBA48		= (1U << __ATA_QUIRK_MAX_SEC_LBA48),
+	ATA_QUIRK_ATAPI_DMADIR		= (1U << __ATA_QUIRK_ATAPI_DMADIR),
+	ATA_QUIRK_NO_NCQ_TRIM		= (1U << __ATA_QUIRK_NO_NCQ_TRIM),
+	ATA_QUIRK_NOLPM			= (1U << __ATA_QUIRK_NOLPM),
+	ATA_QUIRK_WD_BROKEN_LPM		= (1U << __ATA_QUIRK_WD_BROKEN_LPM),
+	ATA_QUIRK_ZERO_AFTER_TRIM	= (1U << __ATA_QUIRK_ZERO_AFTER_TRIM),
+	ATA_QUIRK_NO_DMA_LOG		= (1U << __ATA_QUIRK_NO_DMA_LOG),
+	ATA_QUIRK_NOTRIM		= (1U << __ATA_QUIRK_NOTRIM),
+	ATA_QUIRK_MAX_SEC_1024		= (1U << __ATA_QUIRK_MAX_SEC_1024),
+	ATA_QUIRK_MAX_TRIM_128M		= (1U << __ATA_QUIRK_MAX_TRIM_128M),
+	ATA_QUIRK_NO_NCQ_ON_ATI		= (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
+	ATA_QUIRK_NO_LPM_ON_ATI		= (1U << __ATA_QUIRK_NO_LPM_ON_ATI),
+	ATA_QUIRK_NO_ID_DEV_LOG		= (1U << __ATA_QUIRK_NO_ID_DEV_LOG),
+	ATA_QUIRK_NO_LOG_DIR		= (1U << __ATA_QUIRK_NO_LOG_DIR),
+	ATA_QUIRK_NO_FUA		= (1U << __ATA_QUIRK_NO_FUA),
+};
+
 enum {
 	/* various global constants */
 	LIBATA_MAX_PRD		= ATA_MAX_PRD / 2,
@@ -390,42 +428,6 @@ enum {
 	 */
 	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8,
 
-	/*
-	 * Quirk flags: may be set by libata or controller drivers on drives.
-	 * Some quirks may be drive/controller pair dependent.
-	 */
-	ATA_QUIRK_DIAGNOSTIC		= (1U << __ATA_QUIRK_DIAGNOSTIC),
-	ATA_QUIRK_NODMA			= (1U << __ATA_QUIRK_NODMA),
-	ATA_QUIRK_NONCQ			= (1U << __ATA_QUIRK_NONCQ),
-	ATA_QUIRK_MAX_SEC_128		= (1U << __ATA_QUIRK_MAX_SEC_128),
-	ATA_QUIRK_BROKEN_HPA		= (1U << __ATA_QUIRK_BROKEN_HPA),
-	ATA_QUIRK_DISABLE		= (1U << __ATA_QUIRK_DISABLE),
-	ATA_QUIRK_HPA_SIZE		= (1U << __ATA_QUIRK_HPA_SIZE),
-	ATA_QUIRK_IVB			= (1U << __ATA_QUIRK_IVB),
-	ATA_QUIRK_STUCK_ERR		= (1U << __ATA_QUIRK_STUCK_ERR),
-	ATA_QUIRK_BRIDGE_OK		= (1U << __ATA_QUIRK_BRIDGE_OK),
-	ATA_QUIRK_ATAPI_MOD16_DMA	= (1U << __ATA_QUIRK_ATAPI_MOD16_DMA),
-	ATA_QUIRK_FIRMWARE_WARN		= (1U << __ATA_QUIRK_FIRMWARE_WARN),
-	ATA_QUIRK_1_5_GBPS		= (1U << __ATA_QUIRK_1_5_GBPS),
-	ATA_QUIRK_NOSETXFER		= (1U << __ATA_QUIRK_NOSETXFER),
-	ATA_QUIRK_BROKEN_FPDMA_AA	= (1U << __ATA_QUIRK_BROKEN_FPDMA_AA),
-	ATA_QUIRK_DUMP_ID		= (1U << __ATA_QUIRK_DUMP_ID),
-	ATA_QUIRK_MAX_SEC_LBA48		= (1U << __ATA_QUIRK_MAX_SEC_LBA48),
-	ATA_QUIRK_ATAPI_DMADIR		= (1U << __ATA_QUIRK_ATAPI_DMADIR),
-	ATA_QUIRK_NO_NCQ_TRIM		= (1U << __ATA_QUIRK_NO_NCQ_TRIM),
-	ATA_QUIRK_NOLPM			= (1U << __ATA_QUIRK_NOLPM),
-	ATA_QUIRK_WD_BROKEN_LPM		= (1U << __ATA_QUIRK_WD_BROKEN_LPM),
-	ATA_QUIRK_ZERO_AFTER_TRIM	= (1U << __ATA_QUIRK_ZERO_AFTER_TRIM),
-	ATA_QUIRK_NO_DMA_LOG		= (1U << __ATA_QUIRK_NO_DMA_LOG),
-	ATA_QUIRK_NOTRIM		= (1U << __ATA_QUIRK_NOTRIM),
-	ATA_QUIRK_MAX_SEC_1024		= (1U << __ATA_QUIRK_MAX_SEC_1024),
-	ATA_QUIRK_MAX_TRIM_128M		= (1U << __ATA_QUIRK_MAX_TRIM_128M),
-	ATA_QUIRK_NO_NCQ_ON_ATI		= (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
-	ATA_QUIRK_NO_LPM_ON_ATI		= (1U << __ATA_QUIRK_NO_LPM_ON_ATI),
-	ATA_QUIRK_NO_ID_DEV_LOG		= (1U << __ATA_QUIRK_NO_ID_DEV_LOG),
-	ATA_QUIRK_NO_LOG_DIR		= (1U << __ATA_QUIRK_NO_LOG_DIR),
-	ATA_QUIRK_NO_FUA		= (1U << __ATA_QUIRK_NO_FUA),
-
 	/* User visible DMA mask for DMA control. DO NOT renumber. */
 	ATA_DMA_MASK_ATA	= (1 << 0),	/* DMA on ATA Disk */
 	ATA_DMA_MASK_ATAPI	= (1 << 1),	/* DMA on ATAPI */
-- 
cgit v1.2.3


From 2e983271363108b3813b38754eb96d9b1cb252bb Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Mon, 1 Dec 2025 13:35:04 +0100
Subject: ata: libata-core: Quirk DELLBOSS VD max_sectors

Commit 9b8b84879d4a ("block: Increase BLK_DEF_MAX_SECTORS_CAP") increased
the default max_sectors_kb from 1280 KiB to 4096 KiB.

DELLBOSS VD with FW rev MV.R00-0 times out when sending I/Os of size
4096 KiB.

Enable ATA_QUIRK_MAX_SEC_8191, which caps max_sectors at 8191, for this
device, since any I/O with more sectors than that leads to I/O timeouts.

With this, the DELLBOSS VD SATA controller is usable again.

Cc: stable+noautosel@kernel.org # depends on Move quirk flags to their own enum
Fixes: 9b8b84879d4a ("block: Increase BLK_DEF_MAX_SECTORS_CAP")
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-core.c | 11 +++++++++++
 include/linux/ata.h       |  1 +
 include/linux/libata.h    |  2 ++
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 2c737e63a0b9..08cc43b6bf46 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3139,6 +3139,10 @@ int ata_dev_configure(struct ata_device *dev)
 		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024,
 					 dev->max_sectors);
 
+	if (dev->quirks & ATA_QUIRK_MAX_SEC_8191)
+		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_8191,
+					 dev->max_sectors);
+
 	if (dev->quirks & ATA_QUIRK_MAX_SEC_LBA48)
 		dev->max_sectors = ATA_MAX_SECTORS_LBA48;
 
@@ -3991,6 +3995,7 @@ static const char * const ata_quirk_names[] = {
 	[__ATA_QUIRK_NO_DMA_LOG]	= "nodmalog",
 	[__ATA_QUIRK_NOTRIM]		= "notrim",
 	[__ATA_QUIRK_MAX_SEC_1024]	= "maxsec1024",
+	[__ATA_QUIRK_MAX_SEC_8191]	= "maxsec8191",
 	[__ATA_QUIRK_MAX_TRIM_128M]	= "maxtrim128m",
 	[__ATA_QUIRK_NO_NCQ_ON_ATI]	= "noncqonati",
 	[__ATA_QUIRK_NO_LPM_ON_ATI]	= "nolpmonati",
@@ -4097,6 +4102,12 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = {
 	{ "LITEON CX1-JB*-HP",	NULL,		ATA_QUIRK_MAX_SEC_1024 },
 	{ "LITEON EP1-*",	NULL,		ATA_QUIRK_MAX_SEC_1024 },
 
+	/*
+	 * These devices time out with higher max sects.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=220693
+	 */
+	{ "DELLBOSS VD",	"MV.R00-0",	ATA_QUIRK_MAX_SEC_8191 },
+
 	/* Devices we expect to fail diagnostics */
 
 	/* Devices where NCQ should be avoided */
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 792e10a09787..1786e7b1165f 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -29,6 +29,7 @@ enum {
 	ATA_MAX_SECTORS_128	= 128,
 	ATA_MAX_SECTORS		= 256,
 	ATA_MAX_SECTORS_1024    = 1024,
+	ATA_MAX_SECTORS_8191    = 8191,
 	ATA_MAX_SECTORS_LBA48	= 65535,/* avoid count to be 0000h */
 	ATA_MAX_SECTORS_TAPE	= 65535,
 	ATA_MAX_TRIM_RNUM	= 64,	/* 512-byte payload / (6-byte LBA + 2-byte range per entry) */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9aa0541dc62d..abdc7b6f176c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -75,6 +75,7 @@ enum ata_quirks {
 	__ATA_QUIRK_NO_DMA_LOG,		/* Do not use DMA for log read */
 	__ATA_QUIRK_NOTRIM,		/* Do not use TRIM */
 	__ATA_QUIRK_MAX_SEC_1024,	/* Limit max sects to 1024 */
+	__ATA_QUIRK_MAX_SEC_8191,	/* Limit max sects to 8191 */
 	__ATA_QUIRK_MAX_TRIM_128M,	/* Limit max trim size to 128M */
 	__ATA_QUIRK_NO_NCQ_ON_ATI,	/* Disable NCQ on ATI chipset */
 	__ATA_QUIRK_NO_LPM_ON_ATI,	/* Disable LPM on ATI chipset */
@@ -115,6 +116,7 @@ enum {
 	ATA_QUIRK_NO_DMA_LOG		= (1U << __ATA_QUIRK_NO_DMA_LOG),
 	ATA_QUIRK_NOTRIM		= (1U << __ATA_QUIRK_NOTRIM),
 	ATA_QUIRK_MAX_SEC_1024		= (1U << __ATA_QUIRK_MAX_SEC_1024),
+	ATA_QUIRK_MAX_SEC_8191		= (1U << __ATA_QUIRK_MAX_SEC_8191),
 	ATA_QUIRK_MAX_TRIM_128M		= (1U << __ATA_QUIRK_MAX_TRIM_128M),
 	ATA_QUIRK_NO_NCQ_ON_ATI		= (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
 	ATA_QUIRK_NO_LPM_ON_ATI		= (1U << __ATA_QUIRK_NO_LPM_ON_ATI),
-- 
cgit v1.2.3


From 4b011b538f2b90d07580ff778e28954a4a6520eb Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Tue, 2 Dec 2025 16:38:02 +0100
Subject: i3c: fix I3C_SDR bit number

0x31 is decimal 49, which is not a valid bit number in a 32-bit integer.
Switch to the intended decimal 31.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512020956.Dnz8A2H0-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202512021613.97jVprvJ-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202512021644.lp8ZMSx5-lkp@intel.com/
Link: https://patch.msgid.link/20251202153804.2640623-1-alexandre.belloni@bootlin.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index ae0662d9d77e..9fcb6410a584 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -51,7 +51,7 @@ enum i3c_xfer_mode {
 	I3C_HDR_TSP = 1,
 	I3C_HDR_TSL = 2,
 	/* Use for default SDR transfer mode */
-	I3C_SDR = 0x31,
+	I3C_SDR = 31,
 };
 
 /**
-- 
cgit v1.2.3


From e01a8baf60af43f6f87a5850dee29cf31377ec25 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Tue, 2 Dec 2025 16:38:03 +0100
Subject: i3c: document i3c_xfers

i3c_xfers was left undocumented, document it.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://patch.msgid.link/20251202153804.2640623-2-alexandre.belloni@bootlin.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index d0d5b3a9049f..2fd850f4678b 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -418,7 +418,11 @@ struct i3c_bus {
  * @send_ccc_cmd: send a CCC command
  *		  This method is mandatory.
  * @priv_xfers: do one or several private I3C SDR transfers
- *		This method is mandatory.
+ *		This method is mandatory when i3c_xfers is not implemented. It
+ *		is deprecated.
+ * @i3c_xfers: do one or several I3C SDR or HDR transfers
+ *	       This method is mandatory when priv_xfers is not implemented but
+ *	       should be implemented instead of priv_xfers.
  * @attach_i2c_dev: called every time an I2C device is attached to the bus.
  *		    This is a good place to attach master controller specific
  *		    data to I2C devices.
-- 
cgit v1.2.3


From bbaacdc339d4bde2690b659dc090af7c20a1937e Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Thu, 13 Nov 2025 16:06:18 +0100
Subject: rv: Fix compilation if !CONFIG_RV_REACTORS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kernel test robot spotted a compilation error if reactors are
disabled.

Fix the error by keeping the LTL monitor variable always static.

Cc: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251113150618.185479-2-gmonaco@redhat.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511131948.vxi5mdjU-lkp@intel.com/
Fixes: 4f739ed19d22 ("rv: Pass va_list to reactors")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/rv/ltl_monitor.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index 00c42b36f961..eff60cd61106 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -17,12 +17,7 @@
 #endif
 
 #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME)
-
-#ifdef CONFIG_RV_REACTORS
 static struct rv_monitor RV_MONITOR_NAME;
-#else
-extern struct rv_monitor RV_MONITOR_NAME;
-#endif
 
 static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT;
 
-- 
cgit v1.2.3


From b08ee4d666f216a6f9e7194a9b335147d4717f33 Mon Sep 17 00:00:00 2001
From: Neilay Kharwadkar <neilaykharwadkar@gmail.com>
Date: Sun, 16 Nov 2025 19:20:29 +0000
Subject: lib/fonts: Add Terminus 10x18 console font

Add a compile-in option for Terminus 10x18 bitmap console font
to improve readability on modern laptop displays.

On modern 13-16 inch laptop displays with high pixel density,
common scaled resolutions like 1280x800 and 1440x900 are widely
used.

At these resolutions, VGA 8x16 is too small and difficult to
read for extended periods, while Terminus 16x32 is too large,
providing only 25-28 rows. The existing 10x18 font has poor
readability.

Terminus 10x18 provides improved readability with its clean,
fixed-width design while maintaining practical row counts
(44-50 rows).

This provides a comfortable and readable built-in font for early boot
messages, kernel panics, or whenever userspace is unavailable.

The font was converted from standard Terminus ter-i18b.psf using
psftools and formatted to match kernel font conventions.

This patch is non-intrusive: no options are enabled by default,
so most users won't notice a thing.

Signed-off-by: Neilay Kharwadkar <neilaykharwadkar@gmail.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/font.h      |    4 +-
 lib/fonts/Kconfig         |   12 +
 lib/fonts/Makefile        |    1 +
 lib/fonts/font_ter10x18.c | 5143 +++++++++++++++++++++++++++++++++++++++++++++
 lib/fonts/fonts.c         |    3 +
 5 files changed, 5162 insertions(+), 1 deletion(-)
 create mode 100644 lib/fonts/font_ter10x18.c

(limited to 'include')

diff --git a/include/linux/font.h b/include/linux/font.h
index 81caffd51bb4..fd8625cd76b2 100644
--- a/include/linux/font.h
+++ b/include/linux/font.h
@@ -35,6 +35,7 @@ struct font_desc {
 #define FONT6x10_IDX	10
 #define TER16x32_IDX	11
 #define FONT6x8_IDX	12
+#define TER10x18_IDX	13
 
 extern const struct font_desc	font_vga_8x8,
 			font_vga_8x16,
@@ -48,7 +49,8 @@ extern const struct font_desc	font_vga_8x8,
 			font_mini_4x6,
 			font_6x10,
 			font_ter_16x32,
-			font_6x8;
+			font_6x8,
+			font_ter_10x18;
 
 /* Find a font with a specific name */
 
diff --git a/lib/fonts/Kconfig b/lib/fonts/Kconfig
index ae59b5b4e225..7d03823e46dc 100644
--- a/lib/fonts/Kconfig
+++ b/lib/fonts/Kconfig
@@ -112,6 +112,17 @@ config FONT_SUN12x22
 	  big letters (like the letters used in the SPARC PROM). If the
 	  standard font is unreadable for you, say Y, otherwise say N.
 
+config FONT_TER10x18
+	bool "Terminus 10x18 font (not supported by all drivers)"
+	depends on FRAMEBUFFER_CONSOLE || DRM_PANIC
+	depends on !SPARC && FONTS || SPARC
+	help
+	  Terminus Font is a clean, fixed width bitmap font, designed
+	  for long (8 and more hours per day) work with computers.
+	  This is the high resolution version made for use with 13-16" laptops.
+	  It fits between the normal 8x16 font and Terminus 16x32.
+	  If other fonts are unreadable for you, say Y, otherwise say N.
+
 config FONT_TER16x32
 	bool "Terminus 16x32 font (not supported by all drivers)"
 	depends on FRAMEBUFFER_CONSOLE || DRM_PANIC
@@ -140,6 +151,7 @@ config FONT_AUTOSELECT
 	depends on !FONT_SUN8x16
 	depends on !FONT_SUN12x22
 	depends on !FONT_10x18
+	depends on !FONT_TER10x18
 	depends on !FONT_TER16x32
 	depends on !FONT_6x8
 	select FONT_8x16
diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile
index e16f68492174..30a85a4292fa 100644
--- a/lib/fonts/Makefile
+++ b/lib/fonts/Makefile
@@ -14,6 +14,7 @@ font-objs-$(CONFIG_FONT_PEARL_8x8) += font_pearl_8x8.o
 font-objs-$(CONFIG_FONT_ACORN_8x8) += font_acorn_8x8.o
 font-objs-$(CONFIG_FONT_MINI_4x6)  += font_mini_4x6.o
 font-objs-$(CONFIG_FONT_6x10)      += font_6x10.o
+font-objs-$(CONFIG_FONT_TER10x18)  += font_ter10x18.o
 font-objs-$(CONFIG_FONT_TER16x32)  += font_ter16x32.o
 font-objs-$(CONFIG_FONT_6x8)       += font_6x8.o
 
diff --git a/lib/fonts/font_ter10x18.c b/lib/fonts/font_ter10x18.c
new file mode 100644
index 000000000000..80356e9d56c7
--- /dev/null
+++ b/lib/fonts/font_ter10x18.c
@@ -0,0 +1,5143 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/font.h>
+#include <linux/module.h>
+
+#define FONTDATAMAX 9216
+
+static const struct font_data fontdata_ter10x18 = {
+	{ 0, 0, FONTDATAMAX, 0 }, {
+	/* 0 0x00 '^@' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 1 0x01 '^A' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x80, 0x40, /* #--------# */
+	0x80, 0x40, /* #--------# */
+	0xb3, 0x40, /* #-##--##-# */
+	0xb3, 0x40, /* #-##--##-# */
+	0x80, 0x40, /* #--------# */
+	0x80, 0x40, /* #--------# */
+	0xbf, 0x40, /* #-######-# */
+	0x9e, 0x40, /* #--####--# */
+	0x80, 0x40, /* #--------# */
+	0x80, 0x40, /* #--------# */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 2 0x02 '^B' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xcc, 0xc0, /* ##--##--## */
+	0xcc, 0xc0, /* ##--##--## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xc0, 0xc0, /* ##------## */
+	0xe1, 0xc0, /* ###----### */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 3 0x03 '^C' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x73, 0x80, /* -###--###- */
+	0xf3, 0xc0, /* ####--#### */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x7f, 0x80, /* -########- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 4 0x04 '^D' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x7f, 0x80, /* -########- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x7f, 0x80, /* -########- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 5 0x05 '^E' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x6d, 0x80, /* -##-##-##- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x6d, 0x80, /* -##-##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 6 0x06 '^F' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x7f, 0x80, /* -########- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x6d, 0x80, /* -##-##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 7 0x07 '^G' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 8 0x08 '^H' */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xf3, 0xc0, /* ####--#### */
+	0xe1, 0xc0, /* ###----### */
+	0xe1, 0xc0, /* ###----### */
+	0xf3, 0xc0, /* ####--#### */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+
+	/* 9 0x09 '^I' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x21, 0x00, /* --#----#-- */
+	0x21, 0x00, /* --#----#-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 10 0x0a '^J' */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xe1, 0xc0, /* ###----### */
+	0xcc, 0xc0, /* ##--##--## */
+	0xde, 0xc0, /* ##-####-## */
+	0xde, 0xc0, /* ##-####-## */
+	0xcc, 0xc0, /* ##--##--## */
+	0xe1, 0xc0, /* ###----### */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+
+	/* 11 0x0b '^K' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0f, 0x80, /* ----#####- */
+	0x03, 0x80, /* ------###- */
+	0x06, 0x80, /* -----##-#- */
+	0x0c, 0x80, /* ----##--#- */
+	0x3e, 0x00, /* --#####--- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x3e, 0x00, /* --#####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 12 0x0c '^L' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x7f, 0x80, /* -########- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 13 0x0d '^M' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x80, /* --#######- */
+	0x31, 0x80, /* --##---##- */
+	0x31, 0x80, /* --##---##- */
+	0x3f, 0x80, /* --#######- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0xf0, 0x00, /* ####------ */
+	0xe0, 0x00, /* ###------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 14 0x0e '^N' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x80, /* -##---###- */
+	0xe3, 0x00, /* ###---##-- */
+	0xc0, 0x00, /* ##-------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 15 0x0f '^O' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0xcc, 0xc0, /* ##--##--## */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0xf3, 0xc0, /* ####--#### */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0xcc, 0xc0, /* ##--##--## */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 16 0x10 '^P' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xc0, 0x00, /* ##-------- */
+	0xf0, 0x00, /* ####------ */
+	0xfc, 0x00, /* ######---- */
+	0xff, 0x00, /* ########-- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0x00, /* ########-- */
+	0xfc, 0x00, /* ######---- */
+	0xf0, 0x00, /* ####------ */
+	0xc0, 0x00, /* ##-------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 17 0x11 '^Q' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0xc0, /* --------## */
+	0x03, 0xc0, /* ------#### */
+	0x0f, 0xc0, /* ----###### */
+	0x3f, 0xc0, /* --######## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x3f, 0xc0, /* --######## */
+	0x0f, 0xc0, /* ----###### */
+	0x03, 0xc0, /* ------#### */
+	0x00, 0xc0, /* --------## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 18 0x12 '^R' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 19 0x13 '^S' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 20 0x14 '^T' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x80, /* --#######- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3d, 0x80, /* --####-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 21 0x15 '^U' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x30, 0x00, /* --##------ */
+	0x1c, 0x00, /* ---###---- */
+	0x36, 0x00, /* --##-##--- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1b, 0x00, /* ---##-##-- */
+	0x0e, 0x00, /* ----###--- */
+	0x03, 0x00, /* ------##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 22 0x16 '^V' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 23 0x17 '^W' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 24 0x18 '^X' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 25 0x19 '^Y' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 26 0x1a '^Z' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x04, 0x00, /* -----#---- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x04, 0x00, /* -----#---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 27 0x1b '^[' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x10, 0x00, /* ---#------ */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0xff, 0x80, /* #########- */
+	0xff, 0x80, /* #########- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x10, 0x00, /* ---#------ */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 28 0x1c '^\' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 29 0x1d '^]' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x12, 0x00, /* ---#--#--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x12, 0x00, /* ---#--#--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 30 0x1e '^^' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x7f, 0x80, /* -########- */
+	0x7f, 0x80, /* -########- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 31 0x1f '^_' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x7f, 0x80, /* -########- */
+	0x7f, 0x80, /* -########- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x1e, 0x00, /* ---####--- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 32 0x20 ' ' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 33 0x21 '!' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 34 0x22 '"' */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 35 0x23 '#' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x7f, 0x80, /* -########- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x7f, 0x80, /* -########- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 36 0x24 '$' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x3f, 0x00, /* --######-- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+
+	/* 37 0x25 '%' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x73, 0x00, /* -###--##-- */
+	0x53, 0x00, /* -#-#--##-- */
+	0x76, 0x00, /* -###-##--- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x37, 0x00, /* --##-###-- */
+	0x65, 0x00, /* -##--#-#-- */
+	0x67, 0x00, /* -##--###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 38 0x26 '&' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3c, 0x00, /* --####---- */
+	0x66, 0x00, /* -##--##--- */
+	0x66, 0x00, /* -##--##--- */
+	0x66, 0x00, /* -##--##--- */
+	0x3c, 0x00, /* --####---- */
+	0x39, 0x80, /* --###--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0xc7, 0x00, /* ##---###-- */
+	0xc3, 0x00, /* ##----##-- */
+	0xc3, 0x00, /* ##----##-- */
+	0x67, 0x80, /* -##--####- */
+	0x3d, 0x80, /* --####-##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 39 0x27 ''' */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 40 0x28 '(' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 41 0x29 ')' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 42 0x2a '*' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x63, 0x00, /* -##---##-- */
+	0x36, 0x00, /* --##-##--- */
+	0x1c, 0x00, /* ---###---- */
+	0xff, 0x80, /* #########- */
+	0x1c, 0x00, /* ---###---- */
+	0x36, 0x00, /* --##-##--- */
+	0x63, 0x00, /* -##---##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 43 0x2b '+' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x7f, 0x80, /* -########- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 44 0x2c ',' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 45 0x2d '-' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 46 0x2e '.' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 47 0x2f '/' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 48 0x30 '0' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x80, /* -##---###- */
+	0x67, 0x80, /* -##--####- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x79, 0x80, /* -####--##- */
+	0x71, 0x80, /* -###---##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 49 0x31 '1' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1c, 0x00, /* ---###---- */
+	0x3c, 0x00, /* --####---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 50 0x32 '2' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 51 0x33 '3' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x1f, 0x00, /* ---#####-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 52 0x34 '4' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x80, /* ------###- */
+	0x07, 0x80, /* -----####- */
+	0x0d, 0x80, /* ----##-##- */
+	0x19, 0x80, /* ---##--##- */
+	0x31, 0x80, /* --##---##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 53 0x35 '5' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x00, /* -#######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 54 0x36 '6' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1f, 0x00, /* ---#####-- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 55 0x37 '7' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 56 0x38 '8' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 57 0x39 '9' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x3e, 0x00, /* --#####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 58 0x3a ':' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 59 0x3b ';' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 60 0x3c '<' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 61 0x3d '=' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 62 0x3e '>' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 63 0x3f '?' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 64 0x40 '@' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xcf, 0x80, /* ##--#####- */
+	0xd9, 0x80, /* ##-##--##- */
+	0xd9, 0x80, /* ##-##--##- */
+	0xd9, 0x80, /* ##-##--##- */
+	0xd9, 0x80, /* ##-##--##- */
+	0xcf, 0x80, /* ##--#####- */
+	0xc0, 0x00, /* ##-------- */
+	0xc0, 0x00, /* ##-------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 65 0x41 'A' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 66 0x42 'B' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 67 0x43 'C' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 68 0x44 'D' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7e, 0x00, /* -######--- */
+	0x63, 0x00, /* -##---##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x00, /* -##---##-- */
+	0x7e, 0x00, /* -######--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 69 0x45 'E' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7e, 0x00, /* -######--- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 70 0x46 'F' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7e, 0x00, /* -######--- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 71 0x47 'G' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x67, 0x80, /* -##--####- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 72 0x48 'H' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 73 0x49 'I' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 74 0x4a 'J' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x80, /* -----####- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x3e, 0x00, /* --#####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 75 0x4b 'K' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x00, /* -##---##-- */
+	0x66, 0x00, /* -##--##--- */
+	0x6c, 0x00, /* -##-##---- */
+	0x78, 0x00, /* -####----- */
+	0x78, 0x00, /* -####----- */
+	0x6c, 0x00, /* -##-##---- */
+	0x66, 0x00, /* -##--##--- */
+	0x63, 0x00, /* -##---##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 76 0x4c 'L' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 77 0x4d 'M' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x80, 0x80, /* #-------#- */
+	0xc1, 0x80, /* ##-----##- */
+	0xe3, 0x80, /* ###---###- */
+	0xf7, 0x80, /* ####-####- */
+	0xdd, 0x80, /* ##-###-##- */
+	0xc9, 0x80, /* ##--#--##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 78 0x4e 'N' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x71, 0x80, /* -###---##- */
+	0x79, 0x80, /* -####--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x67, 0x80, /* -##--####- */
+	0x63, 0x80, /* -##---###- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 79 0x4f 'O' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 80 0x50 'P' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 81 0x51 'Q' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x67, 0x80, /* -##--####- */
+	0x3f, 0x00, /* --######-- */
+	0x03, 0x00, /* ------##-- */
+	0x01, 0x80, /* -------##- */
+	0x00, 0x00, /* ---------- */
+
+	/* 82 0x52 'R' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x78, 0x00, /* -####----- */
+	0x6c, 0x00, /* -##-##---- */
+	0x66, 0x00, /* -##--##--- */
+	0x63, 0x00, /* -##---##-- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 83 0x53 'S' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 84 0x54 'T' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 85 0x55 'U' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 86 0x56 'V' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 87 0x57 'W' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc9, 0x80, /* ##--#--##- */
+	0xdd, 0x80, /* ##-###-##- */
+	0xf7, 0x80, /* ####-####- */
+	0xe3, 0x80, /* ###---###- */
+	0xc1, 0x80, /* ##-----##- */
+	0x80, 0x80, /* #-------#- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 88 0x58 'X' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 89 0x59 'Y' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 90 0x5a 'Z' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 91 0x5b '[' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 92 0x5c '\' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 93 0x5d ']' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 94 0x5e '^' */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 95 0x5f '_' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+
+	/* 96 0x60 '`' */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 97 0x61 'a' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 98 0x62 'b' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 99 0x63 'c' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 100 0x64 'd' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 101 0x65 'e' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 102 0x66 'f' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x80, /* -----####- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 103 0x67 'g' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x00, /* --######-- */
+
+	/* 104 0x68 'h' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 105 0x69 'i' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 106 0x6a 'j' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x00, /* -----###-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+
+	/* 107 0x6b 'k' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x00, /* -##---##-- */
+	0x66, 0x00, /* -##--##--- */
+	0x6c, 0x00, /* -##-##---- */
+	0x78, 0x00, /* -####----- */
+	0x6c, 0x00, /* -##-##---- */
+	0x66, 0x00, /* -##--##--- */
+	0x63, 0x00, /* -##---##-- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 108 0x6c 'l' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 109 0x6d 'm' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 110 0x6e 'n' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 111 0x6f 'o' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 112 0x70 'p' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+
+	/* 113 0x71 'q' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+
+	/* 114 0x72 'r' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x6f, 0x80, /* -##-#####- */
+	0x78, 0x00, /* -####----- */
+	0x70, 0x00, /* -###------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 115 0x73 's' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 116 0x74 't' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x7e, 0x00, /* -######--- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x18, 0x00, /* ---##----- */
+	0x0f, 0x00, /* ----####-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 117 0x75 'u' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 118 0x76 'v' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 119 0x77 'w' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 120 0x78 'x' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 121 0x79 'y' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x00, /* --######-- */
+
+	/* 122 0x7a 'z' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 123 0x7b '{' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x00, /* -----###-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x38, 0x00, /* --###----- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x07, 0x00, /* -----###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 124 0x7c '|' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 125 0x7d '}' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x38, 0x00, /* --###----- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x07, 0x00, /* -----###-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x38, 0x00, /* --###----- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 126 0x7e '~' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x39, 0x80, /* --###--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x67, 0x00, /* -##--###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 127 0x7f '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x08, 0x00, /* ----#----- */
+	0x1c, 0x00, /* ---###---- */
+	0x36, 0x00, /* --##-##--- */
+	0x63, 0x00, /* -##---##-- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xc1, 0x80, /* ##-----##- */
+	0xff, 0x80, /* #########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 128 0x80 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+
+	/* 129 0x81 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 130 0x82 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 131 0x83 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 132 0x84 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 133 0x85 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 134 0x86 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 135 0x87 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+
+	/* 136 0x88 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 137 0x89 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 138 0x8a '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 139 0x8b '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 140 0x8c '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 141 0x8d '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 142 0x8e '' */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 143 0x8f '' */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 144 0x90 '' */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7e, 0x00, /* -######--- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 145 0x91 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7b, 0x80, /* -####-###- */
+	0x0c, 0xc0, /* ----##--## */
+	0x0c, 0xc0, /* ----##--## */
+	0x7c, 0xc0, /* -#####--## */
+	0xcf, 0xc0, /* ##--###### */
+	0xcc, 0x00, /* ##--##---- */
+	0xcc, 0x00, /* ##--##---- */
+	0xcc, 0xc0, /* ##--##--## */
+	0x77, 0x80, /* -###-####- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 146 0x92 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0xc0, /* -######### */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xff, 0xc0, /* ########## */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc7, 0xc0, /* ##---##### */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 147 0x93 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 148 0x94 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 149 0x95 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 150 0x96 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 151 0x97 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 152 0x98 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x00, /* --######-- */
+
+	/* 153 0x99 '' */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 154 0x9a '' */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 155 0x9b '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+
+	/* 156 0x9c '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x7e, 0x00, /* -######--- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x31, 0x80, /* --##---##- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 157 0x9d '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x1e, 0x00, /* ---####--- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 158 0x9e '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xf8, 0x00, /* #####----- */
+	0xcc, 0x00, /* ##--##---- */
+	0xcc, 0x00, /* ##--##---- */
+	0xcc, 0x00, /* ##--##---- */
+	0xcc, 0x00, /* ##--##---- */
+	0xfb, 0x00, /* #####-##-- */
+	0xc3, 0x00, /* ##----##-- */
+	0xc7, 0x80, /* ##---####- */
+	0xc3, 0x00, /* ##----##-- */
+	0xc3, 0x00, /* ##----##-- */
+	0xc3, 0x00, /* ##----##-- */
+	0xc1, 0x80, /* ##-----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 159 0x9f '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x00, /* -----###-- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x38, 0x00, /* --###----- */
+
+	/* 160 0xa0 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x3f, 0x80, /* --#######- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 161 0xa1 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 162 0xa2 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 163 0xa3 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x80, /* --#######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 164 0xa4 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3b, 0x80, /* --###-###- */
+	0x6e, 0x00, /* -##-###--- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 165 0xa5 '' */
+	0x3b, 0x80, /* --###-###- */
+	0x6e, 0x00, /* -##-###--- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x71, 0x80, /* -###---##- */
+	0x79, 0x80, /* -####--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x67, 0x80, /* -##--####- */
+	0x63, 0x80, /* -##---###- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 166 0xa6 '' */
+	0x00, 0x00, /* ---------- */
+	0x3e, 0x00, /* --#####--- */
+	0x03, 0x00, /* ------##-- */
+	0x3f, 0x00, /* --######-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 167 0xa7 '' */
+	0x00, 0x00, /* ---------- */
+	0x3e, 0x00, /* --#####--- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x3e, 0x00, /* --#####--- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x00, /* -#######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 168 0xa8 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 169 0xa9 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 170 0xaa '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 171 0xab '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x30, 0x00, /* --##------ */
+	0x70, 0x00, /* -###------ */
+	0x30, 0x80, /* --##----#- */
+	0x31, 0x80, /* --##---##- */
+	0x33, 0x00, /* --##--##-- */
+	0x36, 0x00, /* --##-##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x37, 0x00, /* --##-###-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0xc1, 0x80, /* ##-----##- */
+	0x83, 0x00, /* #-----##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0f, 0x80, /* ----#####- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 172 0xac '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x30, 0x00, /* --##------ */
+	0x70, 0x00, /* -###------ */
+	0x30, 0x80, /* --##----#- */
+	0x31, 0x80, /* --##---##- */
+	0x33, 0x00, /* --##--##-- */
+	0x36, 0x00, /* --##-##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x19, 0x80, /* ---##--##- */
+	0x33, 0x80, /* --##--###- */
+	0x67, 0x80, /* -##--####- */
+	0xcd, 0x80, /* ##--##-##- */
+	0x8f, 0x80, /* #---#####- */
+	0x01, 0x80, /* -------##- */
+	0x01, 0x80, /* -------##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 173 0xad '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 174 0xae '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0xc0, /* ----##--## */
+	0x19, 0x80, /* ---##--##- */
+	0x33, 0x00, /* --##--##-- */
+	0x66, 0x00, /* -##--##--- */
+	0xcc, 0x00, /* ##--##---- */
+	0x66, 0x00, /* -##--##--- */
+	0x33, 0x00, /* --##--##-- */
+	0x19, 0x80, /* ---##--##- */
+	0x0c, 0xc0, /* ----##--## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 175 0xaf '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xcc, 0x00, /* ##--##---- */
+	0x66, 0x00, /* -##--##--- */
+	0x33, 0x00, /* --##--##-- */
+	0x19, 0x80, /* ---##--##- */
+	0x0c, 0xc0, /* ----##--## */
+	0x19, 0x80, /* ---##--##- */
+	0x33, 0x00, /* --##--##-- */
+	0x66, 0x00, /* -##--##--- */
+	0xcc, 0x00, /* ##--##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 176 0xb0 '' */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x00, 0x00, /* ---------- */
+
+	/* 177 0xb1 '' */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0x55, 0x40, /* -#-#-#-#-# */
+
+	/* 178 0xb2 '' */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+	0xff, 0xc0, /* ########## */
+	0xaa, 0x80, /* #-#-#-#-#- */
+
+	/* 179 0xb3 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 180 0xb4 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 181 0xb5 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 182 0xb6 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 183 0xb7 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xfe, 0x00, /* #######--- */
+	0xfe, 0x00, /* #######--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 184 0xb8 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 185 0xb9 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0x06, 0x00, /* -----##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 186 0xba '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 187 0xbb '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xfe, 0x00, /* #######--- */
+	0xfe, 0x00, /* #######--- */
+	0x06, 0x00, /* -----##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 188 0xbc '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0xf6, 0x00, /* ####-##--- */
+	0x06, 0x00, /* -----##--- */
+	0xfe, 0x00, /* #######--- */
+	0xfe, 0x00, /* #######--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 189 0xbd '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xfe, 0x00, /* #######--- */
+	0xfe, 0x00, /* #######--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 190 0xbe '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 191 0xbf '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 192 0xc0 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 193 0xc1 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 194 0xc2 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 195 0xc3 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 196 0xc4 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 197 0xc5 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 198 0xc6 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 199 0xc7 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x37, 0xc0, /* --##-##### */
+	0x37, 0xc0, /* --##-##### */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 200 0xc8 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x37, 0xc0, /* --##-##### */
+	0x37, 0xc0, /* --##-##### */
+	0x30, 0x00, /* --##------ */
+	0x3f, 0xc0, /* --######## */
+	0x3f, 0xc0, /* --######## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 201 0xc9 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0xc0, /* --######## */
+	0x3f, 0xc0, /* --######## */
+	0x30, 0x00, /* --##------ */
+	0x37, 0xc0, /* --##-##### */
+	0x37, 0xc0, /* --##-##### */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 202 0xca '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xf7, 0xc0, /* ####-##### */
+	0xf7, 0xc0, /* ####-##### */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 203 0xcb '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0xf7, 0xc0, /* ####-##### */
+	0xf7, 0xc0, /* ####-##### */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 204 0xcc '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x37, 0xc0, /* --##-##### */
+	0x37, 0xc0, /* --##-##### */
+	0x30, 0x00, /* --##------ */
+	0x37, 0xc0, /* --##-##### */
+	0x37, 0xc0, /* --##-##### */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 205 0xcd '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 206 0xce '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xf7, 0xc0, /* ####-##### */
+	0xf7, 0xc0, /* ####-##### */
+	0x00, 0x00, /* ---------- */
+	0xf7, 0xc0, /* ####-##### */
+	0xf7, 0xc0, /* ####-##### */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 207 0xcf '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 208 0xd0 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 209 0xd1 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 210 0xd2 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 211 0xd3 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x3f, 0xc0, /* --######## */
+	0x3f, 0xc0, /* --######## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 212 0xd4 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 213 0xd5 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 214 0xd6 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0xc0, /* --######## */
+	0x3f, 0xc0, /* --######## */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 215 0xd7 '' */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+	0x36, 0x00, /* --##-##--- */
+
+	/* 216 0xd8 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x0c, 0x00, /* ----##---- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 217 0xd9 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0xfc, 0x00, /* ######---- */
+	0xfc, 0x00, /* ######---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 218 0xda '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0f, 0xc0, /* ----###### */
+	0x0f, 0xc0, /* ----###### */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 219 0xdb '' */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+
+	/* 220 0xdc '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+
+	/* 221 0xdd '' */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+	0xf8, 0x00, /* #####----- */
+
+	/* 222 0xde '' */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+	0x07, 0xc0, /* -----##### */
+
+	/* 223 0xdf '' */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0xff, 0xc0, /* ########## */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 224 0xe0 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7d, 0x80, /* -#####-##- */
+	0xc7, 0x00, /* ##---###-- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc6, 0x00, /* ##---##--- */
+	0xc7, 0x00, /* ##---###-- */
+	0x7d, 0x80, /* -#####-##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 225 0xe1 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3e, 0x00, /* --#####--- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x62, 0x00, /* -##---#--- */
+	0x7f, 0x00, /* -#######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x7f, 0x00, /* -#######-- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+
+	/* 226 0xe2 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 227 0xe3 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 228 0xe4 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 229 0xe5 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1f, 0xc0, /* ---####### */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 230 0xe6 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x63, 0x80, /* -##---###- */
+	0x7d, 0x80, /* -#####-##- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+
+	/* 231 0xe7 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x07, 0x00, /* -----###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 232 0xe8 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 233 0xe9 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 234 0xea '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x73, 0x80, /* -###--###- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 235 0xeb '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 236 0xec '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 237 0xed '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x01, 0x80, /* -------##- */
+	0x03, 0x00, /* ------##-- */
+	0x3f, 0x00, /* --######-- */
+	0x67, 0x80, /* -##--####- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x79, 0x80, /* -####--##- */
+	0x3f, 0x00, /* --######-- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 238 0xee '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1f, 0x80, /* ---######- */
+	0x30, 0x00, /* --##------ */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x7f, 0x80, /* -########- */
+	0x60, 0x00, /* -##------- */
+	0x60, 0x00, /* -##------- */
+	0x30, 0x00, /* --##------ */
+	0x1f, 0x80, /* ---######- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 239 0xef '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x61, 0x80, /* -##----##- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 240 0xf0 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 241 0xf1 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x7f, 0x80, /* -########- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 242 0xf2 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 243 0xf3 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x03, 0x00, /* ------##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x30, 0x00, /* --##------ */
+	0x30, 0x00, /* --##------ */
+	0x18, 0x00, /* ---##----- */
+	0x0c, 0x00, /* ----##---- */
+	0x06, 0x00, /* -----##--- */
+	0x03, 0x00, /* ------##-- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 244 0xf4 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x07, 0x00, /* -----###-- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0d, 0x80, /* ----##-##- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+
+	/* 245 0xf5 '' */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x6c, 0x00, /* -##-##---- */
+	0x38, 0x00, /* --###----- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 246 0xf6 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x7f, 0x80, /* -########- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 247 0xf7 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x39, 0x80, /* --###--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x67, 0x00, /* -##--###-- */
+	0x00, 0x00, /* ---------- */
+	0x39, 0x80, /* --###--##- */
+	0x6d, 0x80, /* -##-##-##- */
+	0x67, 0x00, /* -##--###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 248 0xf8 '' */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1e, 0x00, /* ---####--- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 249 0xf9 '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x1c, 0x00, /* ---###---- */
+	0x1c, 0x00, /* ---###---- */
+	0x1c, 0x00, /* ---###---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 250 0xfa '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x0c, 0x00, /* ----##---- */
+	0x0c, 0x00, /* ----##---- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 251 0xfb '' */
+	0x00, 0x00, /* ---------- */
+	0x03, 0x80, /* ------###- */
+	0x03, 0x80, /* ------###- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x03, 0x00, /* ------##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x63, 0x00, /* -##---##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x1b, 0x00, /* ---##-##-- */
+	0x0f, 0x00, /* ----####-- */
+	0x07, 0x00, /* -----###-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 252 0xfc '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3e, 0x00, /* --#####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 253 0xfd '' */
+	0x00, 0x00, /* ---------- */
+	0x1e, 0x00, /* ---####--- */
+	0x33, 0x00, /* --##--##-- */
+	0x33, 0x00, /* --##--##-- */
+	0x06, 0x00, /* -----##--- */
+	0x0c, 0x00, /* ----##---- */
+	0x18, 0x00, /* ---##----- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 254 0xfe '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x3f, 0x00, /* --######-- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+
+	/* 255 0xff '' */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+	0x00, 0x00, /* ---------- */
+} };
+
+
+const struct font_desc font_ter_10x18 = {
+	.idx	= TER10x18_IDX,
+	.name	= "TER10x18",
+	.width	= 10,
+	.height = 18,
+	.charcount = 256,
+	.data	= fontdata_ter10x18.data,
+#ifdef __sparc__
+	.pref	= 5,
+#else
+	.pref	= -1,
+#endif
+};
diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c
index 47e34950b665..a7f118b30171 100644
--- a/lib/fonts/fonts.c
+++ b/lib/fonts/fonts.c
@@ -54,6 +54,9 @@ static const struct font_desc *fonts[] = {
 #ifdef CONFIG_FONT_6x10
 	&font_6x10,
 #endif
+#ifdef CONFIG_FONT_TER10x18
+	&font_ter_10x18,
+#endif
 #ifdef CONFIG_FONT_TER16x32
 	&font_ter_16x32,
 #endif
-- 
cgit v1.2.3


From 8a5dd102e48752f8c4144f051eccc602774f1a93 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@amd.com>
Date: Tue, 2 Dec 2025 13:44:46 +1100
Subject: ccp: Make snp_reclaim_pages and __sev_do_cmd_locked public

The snp_reclaim_pages() helper reclaims pages in the FW state. SEV-TIO
and the TMPM driver (a hardware engine which smashes IOMMU PDEs among
other things) will use it to reclaim memory when cleaning up.

Share and export snp_reclaim_pages().

Most of the SEV-TIO code uses sev_do_cmd(), which locks the sev_cmd_mutex
and is already exported. But the SNP init code (which also sets up SEV-TIO)
executes under the sev_cmd_mutex lock, so the SEV-TIO code has to use
the __sev_do_cmd_locked() helper. This one, though, does not need to be
exported/shared globally, as SEV-TIO is still a part of the CCP driver.

Share __sev_do_cmd_locked() via the CCP internal header.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-2-aik@amd.com
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/crypto/ccp/sev-dev.c | 11 +++--------
 drivers/crypto/ccp/sev-dev.h |  2 ++
 include/linux/psp-sev.h      |  6 ++++++
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 0d13d47c164b..9e0c16b36f9c 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -387,13 +387,7 @@ static int sev_write_init_ex_file_if_required(int cmd_id)
 	return sev_write_init_ex_file();
 }
 
-/*
- * snp_reclaim_pages() needs __sev_do_cmd_locked(), and __sev_do_cmd_locked()
- * needs snp_reclaim_pages(), so a forward declaration is needed.
- */
-static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret);
-
-static int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked)
+int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked)
 {
 	int ret, err, i;
 
@@ -427,6 +421,7 @@ cleanup:
 	snp_leak_pages(__phys_to_pfn(paddr), npages - i);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(snp_reclaim_pages);
 
 static int rmp_mark_pages_firmware(unsigned long paddr, unsigned int npages, bool locked)
 {
@@ -857,7 +852,7 @@ static int snp_reclaim_cmd_buf(int cmd, void *cmd_buf)
 	return 0;
 }
 
-static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
+int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 {
 	struct cmd_buf_desc desc_list[CMD_BUF_DESC_MAX] = {0};
 	struct psp_device *psp = psp_master;
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index ac03bd0848f7..b9029506383f 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -66,6 +66,8 @@ struct sev_device {
 int sev_dev_init(struct psp_device *psp);
 void sev_dev_destroy(struct psp_device *psp);
 
+int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret);
+
 void sev_pci_init(void);
 void sev_pci_exit(void);
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index e0dbcb4b4fd9..34a25209f909 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -992,6 +992,7 @@ int sev_do_cmd(int cmd, void *data, int *psp_ret);
 
 void *psp_copy_user_blob(u64 uaddr, u32 len);
 void *snp_alloc_firmware_page(gfp_t mask);
+int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked);
 void snp_free_firmware_page(void *addr);
 void sev_platform_shutdown(void);
 bool sev_is_snp_ciphertext_hiding_supported(void);
@@ -1027,6 +1028,11 @@ static inline void *snp_alloc_firmware_page(gfp_t mask)
 	return NULL;
 }
 
+static inline int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked)
+{
+	return -ENODEV;
+}
+
 static inline void snp_free_firmware_page(void *addr) { }
 
 static inline void sev_platform_shutdown(void) { }
-- 
cgit v1.2.3


From c3859de858aa7ae0d0a5ca21e8ee9792f2f256b6 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@amd.com>
Date: Tue, 2 Dec 2025 13:44:47 +1100
Subject: psp-sev: Assign numbers to all status codes and add new

Make the definitions explicit. Add some more new codes.

The following patches will be using SPDM_REQUEST and
EXPAND_BUFFER_LENGTH_REQUEST; the others are useful for PSP FW
diagnostics.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-3-aik@amd.com
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/uapi/linux/psp-sev.h | 66 +++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index c2fd324623c4..2b5b042eb73b 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -47,32 +47,32 @@ typedef enum {
 	 * with possible values from the specification.
 	 */
 	SEV_RET_NO_FW_CALL = -1,
-	SEV_RET_SUCCESS = 0,
-	SEV_RET_INVALID_PLATFORM_STATE,
-	SEV_RET_INVALID_GUEST_STATE,
-	SEV_RET_INAVLID_CONFIG,
+	SEV_RET_SUCCESS                    = 0,
+	SEV_RET_INVALID_PLATFORM_STATE     = 0x0001,
+	SEV_RET_INVALID_GUEST_STATE        = 0x0002,
+	SEV_RET_INAVLID_CONFIG             = 0x0003,
 	SEV_RET_INVALID_CONFIG = SEV_RET_INAVLID_CONFIG,
-	SEV_RET_INVALID_LEN,
-	SEV_RET_ALREADY_OWNED,
-	SEV_RET_INVALID_CERTIFICATE,
-	SEV_RET_POLICY_FAILURE,
-	SEV_RET_INACTIVE,
-	SEV_RET_INVALID_ADDRESS,
-	SEV_RET_BAD_SIGNATURE,
-	SEV_RET_BAD_MEASUREMENT,
-	SEV_RET_ASID_OWNED,
-	SEV_RET_INVALID_ASID,
-	SEV_RET_WBINVD_REQUIRED,
-	SEV_RET_DFFLUSH_REQUIRED,
-	SEV_RET_INVALID_GUEST,
-	SEV_RET_INVALID_COMMAND,
-	SEV_RET_ACTIVE,
-	SEV_RET_HWSEV_RET_PLATFORM,
-	SEV_RET_HWSEV_RET_UNSAFE,
-	SEV_RET_UNSUPPORTED,
-	SEV_RET_INVALID_PARAM,
-	SEV_RET_RESOURCE_LIMIT,
-	SEV_RET_SECURE_DATA_INVALID,
+	SEV_RET_INVALID_LEN                = 0x0004,
+	SEV_RET_ALREADY_OWNED              = 0x0005,
+	SEV_RET_INVALID_CERTIFICATE        = 0x0006,
+	SEV_RET_POLICY_FAILURE             = 0x0007,
+	SEV_RET_INACTIVE                   = 0x0008,
+	SEV_RET_INVALID_ADDRESS            = 0x0009,
+	SEV_RET_BAD_SIGNATURE              = 0x000A,
+	SEV_RET_BAD_MEASUREMENT            = 0x000B,
+	SEV_RET_ASID_OWNED                 = 0x000C,
+	SEV_RET_INVALID_ASID               = 0x000D,
+	SEV_RET_WBINVD_REQUIRED            = 0x000E,
+	SEV_RET_DFFLUSH_REQUIRED           = 0x000F,
+	SEV_RET_INVALID_GUEST              = 0x0010,
+	SEV_RET_INVALID_COMMAND            = 0x0011,
+	SEV_RET_ACTIVE                     = 0x0012,
+	SEV_RET_HWSEV_RET_PLATFORM         = 0x0013,
+	SEV_RET_HWSEV_RET_UNSAFE           = 0x0014,
+	SEV_RET_UNSUPPORTED                = 0x0015,
+	SEV_RET_INVALID_PARAM              = 0x0016,
+	SEV_RET_RESOURCE_LIMIT             = 0x0017,
+	SEV_RET_SECURE_DATA_INVALID        = 0x0018,
 	SEV_RET_INVALID_PAGE_SIZE          = 0x0019,
 	SEV_RET_INVALID_PAGE_STATE         = 0x001A,
 	SEV_RET_INVALID_MDATA_ENTRY        = 0x001B,
@@ -87,6 +87,22 @@ typedef enum {
 	SEV_RET_RESTORE_REQUIRED           = 0x0025,
 	SEV_RET_RMP_INITIALIZATION_FAILED  = 0x0026,
 	SEV_RET_INVALID_KEY                = 0x0027,
+	SEV_RET_SHUTDOWN_INCOMPLETE        = 0x0028,
+	SEV_RET_INCORRECT_BUFFER_LENGTH	   = 0x0030,
+	SEV_RET_EXPAND_BUFFER_LENGTH_REQUEST = 0x0031,
+	SEV_RET_SPDM_REQUEST               = 0x0032,
+	SEV_RET_SPDM_ERROR                 = 0x0033,
+	SEV_RET_SEV_STATUS_ERR_IN_DEV_CONN = 0x0035,
+	SEV_RET_SEV_STATUS_INVALID_DEV_CTX = 0x0036,
+	SEV_RET_SEV_STATUS_INVALID_TDI_CTX = 0x0037,
+	SEV_RET_SEV_STATUS_INVALID_TDI     = 0x0038,
+	SEV_RET_SEV_STATUS_RECLAIM_REQUIRED = 0x0039,
+	SEV_RET_IN_USE                     = 0x003A,
+	SEV_RET_SEV_STATUS_INVALID_DEV_STATE = 0x003B,
+	SEV_RET_SEV_STATUS_INVALID_TDI_STATE = 0x003C,
+	SEV_RET_SEV_STATUS_DEV_CERT_CHANGED = 0x003D,
+	SEV_RET_SEV_STATUS_RESYNC_REQ      = 0x003E,
+	SEV_RET_SEV_STATUS_RESPONSE_TOO_LARGE = 0x003F,
 	SEV_RET_MAX,
 } sev_ret_code;
 
-- 
cgit v1.2.3


From eeb934137debfbe98be61a27756a605edf492ed3 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@amd.com>
Date: Tue, 2 Dec 2025 13:44:48 +1100
Subject: iommu/amd: Report SEV-TIO support

The SEV-TIO switch in the AMD BIOS is reported to the OS via
the IOMMU Extended Feature 2 register (EFR2), bit 1.

Add a helper to parse the bit and report the feature presence.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-4-aik@amd.com
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/iommu/amd/amd_iommu_types.h | 1 +
 drivers/iommu/amd/init.c            | 9 +++++++++
 include/linux/amd-iommu.h           | 2 ++
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a698a2e7ce2a..a2f72c53d3cc 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -107,6 +107,7 @@
 
 
 /* Extended Feature 2 Bits */
+#define FEATURE_SEVSNPIO_SUP	BIT_ULL(1)
 #define FEATURE_SNPAVICSUP	GENMASK_ULL(7, 5)
 #define FEATURE_SNPAVICSUP_GAM(x) \
 	(FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1)
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..ba95467ba492 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -2252,6 +2252,9 @@ static void print_iommu_info(void)
 		if (check_feature(FEATURE_SNP))
 			pr_cont(" SNP");
 
+		if (check_feature2(FEATURE_SEVSNPIO_SUP))
+			pr_cont(" SEV-TIO");
+
 		pr_cont("\n");
 	}
 
@@ -4015,4 +4018,10 @@ int amd_iommu_snp_disable(void)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(amd_iommu_snp_disable);
+
+bool amd_iommu_sev_tio_supported(void)
+{
+	return check_feature2(FEATURE_SEVSNPIO_SUP);
+}
+EXPORT_SYMBOL_GPL(amd_iommu_sev_tio_supported);
 #endif
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 8cced632ecd0..0f64f09d1f34 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -18,10 +18,12 @@ struct task_struct;
 struct pci_dev;
 
 extern void amd_iommu_detect(void);
+extern bool amd_iommu_sev_tio_supported(void);
 
 #else /* CONFIG_AMD_IOMMU */
 
 static inline void amd_iommu_detect(void) { }
+static inline bool amd_iommu_sev_tio_supported(void) { return false; }
 
 #endif /* CONFIG_AMD_IOMMU */
 
-- 
cgit v1.2.3


From 4be423572da1f4c11f45168e3fafda870ddac9f8 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@amd.com>
Date: Tue, 2 Dec 2025 13:44:49 +1100
Subject: crypto/ccp: Implement SEV-TIO PCIe IDE (phase1)

Implement the SEV-TIO (Trusted I/O) firmware interface for PCIe TDISP
(TEE Device Interface Security Protocol). This enables secure communication
between trusted domains and PCIe devices through the PSP (Platform
Security Processor).

The implementation includes:
- Device Security Manager (DSM) operations for establishing secure links
- SPDM (Security Protocol and Data Model) over DOE (Data Object Exchange)
- IDE (Integrity and Data Encryption) stream management for secure PCIe

This module bridges the SEV firmware stack with the generic PCIe TSM
framework.

This is phase1 as described in Documentation/driver-api/pci/tsm.rst.

On AMD SEV, the AMD PSP firmware acts as TSM (manages the security/trust).
The CCP driver provides the interface to it and registers in the TSM
subsystem.

Detect the PSP support (reported via FEATURE_INFO + SNP_PLATFORM_STATUS)
and enable SEV-TIO in the SNP_INIT_EX call if the hardware supports TIO.

Implement SEV TIO PSP command wrappers in sev-dev-tio.c and store
the data in the SEV-TIO-specific structs.

Implement TSM hooks and IDE setup in sev-dev-tsm.c.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/692f506bb80c9_261c11004@dwillia2-mobl4.notmuch
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/crypto/ccp/Kconfig       |   1 +
 drivers/crypto/ccp/Makefile      |   4 +
 drivers/crypto/ccp/sev-dev-tio.c | 864 +++++++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev-tio.h | 123 ++++++
 drivers/crypto/ccp/sev-dev-tsm.c | 405 ++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c     |  55 ++-
 drivers/crypto/ccp/sev-dev.h     |   9 +
 include/linux/psp-sev.h          |  11 +-
 8 files changed, 1469 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/ccp/sev-dev-tio.c
 create mode 100644 drivers/crypto/ccp/sev-dev-tio.h
 create mode 100644 drivers/crypto/ccp/sev-dev-tsm.c

(limited to 'include')

diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig
index f394e45e11ab..e2b127f0986b 100644
--- a/drivers/crypto/ccp/Kconfig
+++ b/drivers/crypto/ccp/Kconfig
@@ -39,6 +39,7 @@ config CRYPTO_DEV_SP_PSP
 	bool "Platform Security Processor (PSP) device"
 	default y
 	depends on CRYPTO_DEV_CCP_DD && X86_64 && AMD_IOMMU
+	select PCI_TSM
 	help
 	 Provide support for the AMD Platform Security Processor (PSP).
 	 The PSP is a dedicated processor that provides support for key
diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile
index a9626b30044a..0424e08561ef 100644
--- a/drivers/crypto/ccp/Makefile
+++ b/drivers/crypto/ccp/Makefile
@@ -16,6 +16,10 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \
                                    hsti.o \
                                    sfs.o
 
+ifeq ($(CONFIG_PCI_TSM),y)
+ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += sev-dev-tsm.o sev-dev-tio.o
+endif
+
 obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o
 ccp-crypto-objs := ccp-crypto-main.o \
 		   ccp-crypto-aes.o \
diff --git a/drivers/crypto/ccp/sev-dev-tio.c b/drivers/crypto/ccp/sev-dev-tio.c
new file mode 100644
index 000000000000..9a98f98c20a7
--- /dev/null
+++ b/drivers/crypto/ccp/sev-dev-tio.c
@@ -0,0 +1,864 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+// Interface to PSP for CCP/SEV-TIO/SNP-VM
+
+#include <linux/pci.h>
+#include <linux/tsm.h>
+#include <linux/psp.h>
+#include <linux/vmalloc.h>
+#include <linux/bitfield.h>
+#include <linux/pci-doe.h>
+#include <asm/sev-common.h>
+#include <asm/sev.h>
+#include <asm/page.h>
+#include "sev-dev.h"
+#include "sev-dev-tio.h"
+
+#define to_tio_status(dev_data)	\
+		(container_of((dev_data), struct tio_dsm, data)->sev->tio_status)
+
+#define SLA_PAGE_TYPE_DATA	0
+#define SLA_PAGE_TYPE_SCATTER	1
+#define SLA_PAGE_SIZE_4K	0
+#define SLA_PAGE_SIZE_2M	1
+#define SLA_SZ(s)		((s).page_size == SLA_PAGE_SIZE_2M ? SZ_2M : SZ_4K)
+#define SLA_SCATTER_LEN(s)	(SLA_SZ(s) / sizeof(struct sla_addr_t))
+#define SLA_EOL			((struct sla_addr_t) { .pfn = ((1UL << 40) - 1) })
+#define SLA_NULL		((struct sla_addr_t) { 0 })
+#define IS_SLA_NULL(s)		((s).sla == SLA_NULL.sla)
+#define IS_SLA_EOL(s)		((s).sla == SLA_EOL.sla)
+
+static phys_addr_t sla_to_pa(struct sla_addr_t sla)
+{
+	u64 pfn = sla.pfn;
+	u64 pa = pfn << PAGE_SHIFT;
+
+	return pa;
+}
+
+static void *sla_to_va(struct sla_addr_t sla)
+{
+	void *va = __va(__sme_clr(sla_to_pa(sla)));
+
+	return va;
+}
+
+#define sla_to_pfn(sla)		(__pa(sla_to_va(sla)) >> PAGE_SHIFT)
+#define sla_to_page(sla)	virt_to_page(sla_to_va(sla))
+
+static struct sla_addr_t make_sla(struct page *pg, bool stp)
+{
+	u64 pa = __sme_set(page_to_phys(pg));
+	struct sla_addr_t ret = {
+		.pfn = pa >> PAGE_SHIFT,
+		.page_size = SLA_PAGE_SIZE_4K, /* Do not do SLA_PAGE_SIZE_2M ATM */
+		.page_type = stp ? SLA_PAGE_TYPE_SCATTER : SLA_PAGE_TYPE_DATA
+	};
+
+	return ret;
+}
+
+/* the BUFFER Structure */
+#define SLA_BUFFER_FLAG_ENCRYPTION	BIT(0)
+
+/*
+ * struct sla_buffer_hdr - Scatter list address buffer header
+ *
+ * @capacity_sz: Total capacity of the buffer in bytes
+ * @payload_sz: Size of buffer payload in bytes, must be multiple of 32B
+ * @flags: Buffer flags (SLA_BUFFER_FLAG_ENCRYPTION: buffer is encrypted)
+ * @iv: Initialization vector used for encryption
+ * @authtag: Authentication tag for encrypted buffer
+ */
+struct sla_buffer_hdr {
+	u32 capacity_sz;
+	u32 payload_sz; /* The size of BUFFER_PAYLOAD in bytes. Must be multiple of 32B */
+	u32 flags;
+	u8 reserved1[4];
+	u8 iv[16];	/* IV used for the encryption of this buffer */
+	u8 authtag[16]; /* Authentication tag for this buffer */
+	u8 reserved2[16];
+} __packed;
+
+enum spdm_data_type_t {
+	DOBJ_DATA_TYPE_SPDM = 0x1,
+	DOBJ_DATA_TYPE_SECURE_SPDM = 0x2,
+};
+
+struct spdm_dobj_hdr_req {
+	struct spdm_dobj_hdr hdr; /* hdr.id == SPDM_DOBJ_ID_REQ */
+	u8 data_type; /* spdm_data_type_t */
+	u8 reserved2[5];
+} __packed;
+
+struct spdm_dobj_hdr_resp {
+	struct spdm_dobj_hdr hdr; /* hdr.id == SPDM_DOBJ_ID_RESP */
+	u8 data_type; /* spdm_data_type_t */
+	u8 reserved2[5];
+} __packed;
+
+/* Defined in sev-dev-tio.h so sev-dev-tsm.c can read types of blobs */
+struct spdm_dobj_hdr_cert;
+struct spdm_dobj_hdr_meas;
+struct spdm_dobj_hdr_report;
+
+/* Used in all SPDM-aware TIO commands */
+struct spdm_ctrl {
+	struct sla_addr_t req;
+	struct sla_addr_t resp;
+	struct sla_addr_t scratch;
+	struct sla_addr_t output;
+} __packed;
+
+static size_t sla_dobj_id_to_size(u8 id)
+{
+	size_t n;
+
+	BUILD_BUG_ON(sizeof(struct spdm_dobj_hdr_resp) != 0x10);
+	switch (id) {
+	case SPDM_DOBJ_ID_REQ:
+		n = sizeof(struct spdm_dobj_hdr_req);
+		break;
+	case SPDM_DOBJ_ID_RESP:
+		n = sizeof(struct spdm_dobj_hdr_resp);
+		break;
+	default:
+		WARN_ON(1);
+		n = 0;
+		break;
+	}
+
+	return n;
+}
+
+#define SPDM_DOBJ_HDR_SIZE(hdr)		sla_dobj_id_to_size((hdr)->id)
+#define SPDM_DOBJ_DATA(hdr)		((u8 *)(hdr) + SPDM_DOBJ_HDR_SIZE(hdr))
+#define SPDM_DOBJ_LEN(hdr)		((hdr)->length - SPDM_DOBJ_HDR_SIZE(hdr))
+
+#define sla_to_dobj_resp_hdr(buf)	((struct spdm_dobj_hdr_resp *) \
+					sla_to_dobj_hdr_check((buf), SPDM_DOBJ_ID_RESP))
+#define sla_to_dobj_req_hdr(buf)	((struct spdm_dobj_hdr_req *) \
+					sla_to_dobj_hdr_check((buf), SPDM_DOBJ_ID_REQ))
+
+static struct spdm_dobj_hdr *sla_to_dobj_hdr(struct sla_buffer_hdr *buf)
+{
+	if (!buf)
+		return NULL;
+
+	return (struct spdm_dobj_hdr *) &buf[1];
+}
+
+static struct spdm_dobj_hdr *sla_to_dobj_hdr_check(struct sla_buffer_hdr *buf, u32 check_dobjid)
+{
+	struct spdm_dobj_hdr *hdr = sla_to_dobj_hdr(buf);
+
+	if (WARN_ON_ONCE(!hdr))
+		return NULL;
+
+	if (hdr->id != check_dobjid) {
+		pr_err("! ERROR: expected %d, found %d\n", check_dobjid, hdr->id);
+		return NULL;
+	}
+
+	return hdr;
+}
+
+static void *sla_to_data(struct sla_buffer_hdr *buf, u32 dobjid)
+{
+	struct spdm_dobj_hdr *hdr = sla_to_dobj_hdr(buf);
+
+	if (WARN_ON_ONCE(dobjid != SPDM_DOBJ_ID_REQ && dobjid != SPDM_DOBJ_ID_RESP))
+		return NULL;
+
+	if (!hdr)
+		return NULL;
+
+	return (u8 *) hdr + sla_dobj_id_to_size(dobjid);
+}
+
+/*
+ * struct sev_data_tio_status - SEV_CMD_TIO_STATUS command
+ *
+ * @length: Length of this command buffer in bytes
+ * @status_paddr: System physical address of the TIO_STATUS structure
+ */
+struct sev_data_tio_status {
+	u32 length;
+	u8 reserved[4];
+	u64 status_paddr;
+} __packed;
+
+/* TIO_INIT */
+struct sev_data_tio_init {
+	u32 length;
+	u8 reserved[12];
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_create - TIO_DEV_CREATE command
+ *
+ * @length: Length in bytes of this command buffer
+ * @dev_ctx_sla: Scatter list address pointing to a buffer to be used as a device context buffer
+ * @device_id: PCIe Routing Identifier of the device to connect to
+ * @root_port_id: PCIe Routing Identifier of the root port of the device
+ * @segment_id: PCIe Segment Identifier of the device to connect to
+ */
+struct sev_data_tio_dev_create {
+	u32 length;
+	u8 reserved1[4];
+	struct sla_addr_t dev_ctx_sla;
+	u16 device_id;
+	u16 root_port_id;
+	u8 segment_id;
+	u8 reserved2[11];
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_connect - TIO_DEV_CONNECT command
+ *
+ * @length: Length in bytes of this command buffer
+ * @spdm_ctrl: SPDM control structure defined in Section 5.1
+ * @dev_ctx_sla: Scatter list address of the device context buffer
+ * @tc_mask: Bitmask of the traffic classes to initialize for SEV-TIO usage.
+ *           Setting the kth bit of the TC_MASK to 1 indicates that the traffic
+ *           class k will be initialized
+ * @cert_slot: Slot number of the certificate requested for constructing the SPDM session
+ * @ide_stream_id: IDE stream IDs to be associated with this device.
+ *                 Valid only if corresponding bit in TC_MASK is set
+ */
+struct sev_data_tio_dev_connect {
+	u32 length;
+	u8 reserved1[4];
+	struct spdm_ctrl spdm_ctrl;
+	u8 reserved2[8];
+	struct sla_addr_t dev_ctx_sla;
+	u8 tc_mask;
+	u8 cert_slot;
+	u8 reserved3[6];
+	u8 ide_stream_id[8];
+	u8 reserved4[8];
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_disconnect - TIO_DEV_DISCONNECT command
+ *
+ * @length: Length in bytes of this command buffer
+ * @flags: Command flags (TIO_DEV_DISCONNECT_FLAG_FORCE: force disconnect)
+ * @spdm_ctrl: SPDM control structure defined in Section 5.1
+ * @dev_ctx_sla: Scatter list address of the device context buffer
+ */
+#define TIO_DEV_DISCONNECT_FLAG_FORCE	BIT(0)
+
+struct sev_data_tio_dev_disconnect {
+	u32 length;
+	u32 flags;
+	struct spdm_ctrl spdm_ctrl;
+	struct sla_addr_t dev_ctx_sla;
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_meas - TIO_DEV_MEASUREMENTS command
+ *
+ * @length: Length in bytes of this command buffer
+ * @flags: Command flags (TIO_DEV_MEAS_FLAG_RAW_BITSTREAM: request raw measurements)
+ * @spdm_ctrl: SPDM control structure defined in Section 5.1
+ * @dev_ctx_sla: Scatter list address of the device context buffer
+ * @meas_nonce: Nonce for measurement freshness verification
+ */
+#define TIO_DEV_MEAS_FLAG_RAW_BITSTREAM	BIT(0)
+
+struct sev_data_tio_dev_meas {
+	u32 length;
+	u32 flags;
+	struct spdm_ctrl spdm_ctrl;
+	struct sla_addr_t dev_ctx_sla;
+	u8 meas_nonce[32];
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_certs - TIO_DEV_CERTIFICATES command
+ *
+ * @length: Length in bytes of this command buffer
+ * @spdm_ctrl: SPDM control structure defined in Section 5.1
+ * @dev_ctx_sla: Scatter list address of the device context buffer
+ */
+struct sev_data_tio_dev_certs {
+	u32 length;
+	u8 reserved[4];
+	struct spdm_ctrl spdm_ctrl;
+	struct sla_addr_t dev_ctx_sla;
+} __packed;
+
+/*
+ * struct sev_data_tio_dev_reclaim - TIO_DEV_RECLAIM command
+ *
+ * @length: Length in bytes of this command buffer
+ * @dev_ctx_sla: Scatter list address of the device context buffer
+ *
+ * This command reclaims resources associated with a device context.
+ */
+struct sev_data_tio_dev_reclaim {
+	u32 length;
+	u8 reserved[4];
+	struct sla_addr_t dev_ctx_sla;
+} __packed;
+
+static struct sla_buffer_hdr *sla_buffer_map(struct sla_addr_t sla)
+{
+	struct sla_buffer_hdr *buf;
+
+	BUILD_BUG_ON(sizeof(struct sla_buffer_hdr) != 0x40);
+	if (IS_SLA_NULL(sla))
+		return NULL;
+
+	if (sla.page_type == SLA_PAGE_TYPE_SCATTER) {
+		struct sla_addr_t *scatter = sla_to_va(sla);
+		unsigned int i, npages = 0;
+
+		for (i = 0; i < SLA_SCATTER_LEN(sla); ++i) {
+			if (WARN_ON_ONCE(SLA_SZ(scatter[i]) > SZ_4K))
+				return NULL;
+
+			if (WARN_ON_ONCE(scatter[i].page_type == SLA_PAGE_TYPE_SCATTER))
+				return NULL;
+
+			if (IS_SLA_EOL(scatter[i])) {
+				npages = i;
+				break;
+			}
+		}
+		if (WARN_ON_ONCE(!npages))
+			return NULL;
+
+		struct page **pp = kmalloc_array(npages, sizeof(pp[0]), GFP_KERNEL);
+
+		if (!pp)
+			return NULL;
+
+		for (i = 0; i < npages; ++i)
+			pp[i] = sla_to_page(scatter[i]);
+
+		buf = vm_map_ram(pp, npages, 0);
+		kfree(pp);
+	} else {
+		struct page *pg = sla_to_page(sla);
+
+		buf = vm_map_ram(&pg, 1, 0);
+	}
+
+	return buf;
+}
+
+static void sla_buffer_unmap(struct sla_addr_t sla, struct sla_buffer_hdr *buf)
+{
+	if (!buf)
+		return;
+
+	if (sla.page_type == SLA_PAGE_TYPE_SCATTER) {
+		struct sla_addr_t *scatter = sla_to_va(sla);
+		unsigned int i, npages = 0;
+
+		for (i = 0; i < SLA_SCATTER_LEN(sla); ++i) {
+			if (IS_SLA_EOL(scatter[i])) {
+				npages = i;
+				break;
+			}
+		}
+		if (!npages)
+			return;
+
+		vm_unmap_ram(buf, npages);
+	} else {
+		vm_unmap_ram(buf, 1);
+	}
+}
+
+static void dobj_response_init(struct sla_buffer_hdr *buf)
+{
+	struct spdm_dobj_hdr *dobj = sla_to_dobj_hdr(buf);
+
+	dobj->id = SPDM_DOBJ_ID_RESP;
+	dobj->version.major = 0x1;
+	dobj->version.minor = 0;
+	dobj->length = 0;
+	buf->payload_sz = sla_dobj_id_to_size(dobj->id) + dobj->length;
+}
+
+/* Free an SLA buffer of @len bytes; @firmware_state pages are reclaimed first */
+static void sla_free(struct sla_addr_t sla, size_t len, bool firmware_state)
+{
+	unsigned int npages = PAGE_ALIGN(len) >> PAGE_SHIFT;
+	struct sla_addr_t *scatter = NULL;
+	int ret = 0, i;
+
+	if (IS_SLA_NULL(sla))
+		return;
+
+	/* Walk the list for non-firmware buffers too or their data pages leak */
+	if (sla.page_type == SLA_PAGE_TYPE_SCATTER)
+		scatter = sla_to_va(sla);
+
+	if (firmware_state && scatter) {
+		/* A zero entry ends a list which sla_alloc() failed to complete */
+		for (i = 0; i < npages && !ret; ++i) {
+			if (IS_SLA_EOL(scatter[i]) || IS_SLA_NULL(scatter[i]))
+				break;
+			ret = snp_reclaim_pages(sla_to_pa(scatter[i]), 1, false);
+		}
+	} else if (firmware_state) {
+		ret = snp_reclaim_pages(sla_to_pa(sla), 1, false);
+	}
+
+	/* Reclaim failed: return without freeing anything */
+	if (WARN_ON(ret))
+		return;
+
+	if (scatter) {
+		for (i = 0; i < npages; ++i) {
+			if (IS_SLA_EOL(scatter[i]) || IS_SLA_NULL(scatter[i]))
+				break;
+			free_page((unsigned long)sla_to_va(scatter[i]));
+		}
+	}
+
+	free_page((unsigned long)sla_to_va(sla));
+}
+
+/* Allocate a zeroed SLA buffer of @len bytes, returns SLA_NULL on failure */
+static struct sla_addr_t sla_alloc(size_t len, bool firmware_state)
+{
+	unsigned long i, npages = PAGE_ALIGN(len) >> PAGE_SHIFT;
+	struct sla_addr_t *scatter = NULL;
+	struct sla_addr_t ret = SLA_NULL;
+	struct sla_buffer_hdr *buf;
+	struct page *pg;
+
+	if (npages == 0)
+		return ret;
+
+	/* The single scatter page holds @npages entries plus the SLA_EOL terminator */
+	if (WARN_ON_ONCE(npages >= PAGE_SIZE / sizeof(struct sla_addr_t)))
+		return ret;
+	BUILD_BUG_ON(PAGE_SIZE < SZ_4K);
+
+	if (npages > 1) {
+		pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!pg)
+			return SLA_NULL;
+
+		ret = make_sla(pg, true);
+		scatter = page_to_virt(pg);
+		for (i = 0; i < npages; ++i) {
+			pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+			if (!pg)
+				goto no_reclaim_exit;
+
+			scatter[i] = make_sla(pg, false);
+		}
+		scatter[i] = SLA_EOL;
+	} else {
+		pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!pg)
+			return SLA_NULL;
+
+		ret = make_sla(pg, false);
+	}
+
+	buf = sla_buffer_map(ret);
+	if (!buf)
+		goto no_reclaim_exit;
+
+	buf->capacity_sz = (npages << PAGE_SHIFT);
+	sla_buffer_unmap(ret, buf);
+
+	if (firmware_state) {
+		if (scatter) {
+			for (i = 0; i < npages; ++i) {
+				if (rmp_make_private(sla_to_pfn(scatter[i]), 0,
+						     PG_LEVEL_4K, 0, true))
+					goto free_exit;
+			}
+		} else if (rmp_make_private(sla_to_pfn(ret), 0, PG_LEVEL_4K, 0, true)) {
+			goto no_reclaim_exit;
+		}
+	}
+
+	return ret;
+
+no_reclaim_exit:
+	firmware_state = false;
+free_exit:
+	sla_free(ret, len, firmware_state);
+	return SLA_NULL;
+}
+
+/* Expands a firmware owned buffer: returns 0 if expanded, 1 if not needed or -errno */
+static int sla_expand(struct sla_addr_t *sla, size_t *len)
+{
+	struct sla_buffer_hdr *oldbuf = sla_buffer_map(*sla), *newbuf;
+	struct sla_addr_t oldsla = *sla, newsla;
+	size_t oldlen = *len, newlen;
+	int ret = 1;
+
+	if (!oldbuf)
+		return -EFAULT;
+
+	/* Equal sizes: this buffer does not require expansion, must be another buffer */
+	newlen = oldbuf->capacity_sz;
+	if (newlen == oldlen)
+		goto unmap_old_exit;
+
+	pr_notice("Expanding BUFFER from %zu to %zu bytes\n", oldlen, newlen);
+
+	/* Mapping or freeing SLA_NULL is a no-op so one error path covers both steps */
+	newsla = sla_alloc(newlen, true);
+	newbuf = sla_buffer_map(newsla);
+	if (!newbuf) {
+		ret = IS_SLA_NULL(newsla) ? -ENOMEM : -EFAULT;
+		sla_free(newsla, newlen, true);
+		goto unmap_old_exit;
+	}
+
+	memcpy(newbuf, oldbuf, oldlen);
+	sla_buffer_unmap(newsla, newbuf);
+	*sla = newsla;
+	*len = newlen;
+	ret = 0;
+unmap_old_exit:
+	sla_buffer_unmap(oldsla, oldbuf);
+	if (!ret)
+		sla_free(oldsla, oldlen, true);
+	return ret;
+}
+
+/* Runs @cmd; returns 0, -errno or a PCI DOE feature (> 0) when an SPDM exchange is due */
+static int sev_tio_do_cmd(int cmd, void *data, size_t data_len, int *psp_ret,
+			  struct tsm_dsm_tio *dev_data)
+{
+	int rc;
+
+	*psp_ret = 0;
+	rc = sev_do_cmd(cmd, data, psp_ret);
+	if (WARN_ON(!rc && *psp_ret == SEV_RET_SPDM_REQUEST))
+		return -EIO;
+
+	if (rc == 0 && *psp_ret == SEV_RET_EXPAND_BUFFER_LENGTH_REQUEST) {
+		int rc1, rc2;
+
+		rc1 = sla_expand(&dev_data->output, &dev_data->output_len);
+		if (rc1 < 0)
+			return rc1;
+		rc2 = sla_expand(&dev_data->scratch, &dev_data->scratch_len);
+		if (rc2 < 0)
+			return rc2;
+
+		/* sla_expand() returns 1 if unchanged: neither buffer grew, this is wrong */
+		if (rc1 && rc2)
+			return -EFAULT;
+		/* NOTE(review): @data may still carry the pre-expansion SLAs - confirm */
+		*psp_ret = 0;
+		rc = sev_do_cmd(cmd, data, psp_ret);
+	}
+
+	if ((rc == 0 || rc == -EIO) && *psp_ret == SEV_RET_SPDM_REQUEST) {
+		struct spdm_dobj_hdr_resp *resp_hdr;
+		struct spdm_dobj_hdr_req *req_hdr;
+		struct sev_tio_status *tio_status = to_tio_status(dev_data);
+		size_t resp_len = tio_status->spdm_req_size_max -
+			(sla_dobj_id_to_size(SPDM_DOBJ_ID_RESP) + sizeof(struct sla_buffer_hdr));
+
+		if (!dev_data->cmd) {
+			if (WARN_ON_ONCE(!data_len || (data_len != *(u32 *) data)))
+				return -EINVAL;
+			if (WARN_ON(data_len > sizeof(dev_data->cmd_data)))
+				return -EFAULT;
+			memcpy(dev_data->cmd_data, data, data_len);
+			memset(&dev_data->cmd_data[data_len], 0xFF,
+			       sizeof(dev_data->cmd_data) - data_len);
+			dev_data->cmd = cmd;
+		}
+
+		req_hdr = sla_to_dobj_req_hdr(dev_data->reqbuf);
+		resp_hdr = sla_to_dobj_resp_hdr(dev_data->respbuf);
+		if (!req_hdr || !resp_hdr)
+			return -EFAULT;
+		switch (req_hdr->data_type) {
+		case DOBJ_DATA_TYPE_SPDM:
+			rc = PCI_DOE_FEATURE_CMA;
+			break;
+		case DOBJ_DATA_TYPE_SECURE_SPDM:
+			rc = PCI_DOE_FEATURE_SSESSION;
+			break;
+		default:
+			return -EINVAL;
+		}
+		resp_hdr->data_type = req_hdr->data_type;
+		dev_data->spdm.req_len = SPDM_DOBJ_LEN(&req_hdr->hdr);
+		dev_data->spdm.rsp_len = resp_len;
+	} else if (dev_data && dev_data->cmd) {
+		/* For either error or success just stop the bouncing */
+		memset(dev_data->cmd_data, 0, sizeof(dev_data->cmd_data));
+		dev_data->cmd = 0;
+	}
+
+	return rc;
+}
+
+/* Re-issue the pending command once the caller stored the SPDM response length */
+int sev_tio_continue(struct tsm_dsm_tio *dev_data)
+{
+	struct spdm_dobj_hdr_resp *resp_hdr;
+	int ret;
+
+	if (!dev_data || !dev_data->cmd)
+		return -EINVAL;
+
+	resp_hdr = sla_to_dobj_resp_hdr(dev_data->respbuf);
+	if (!resp_hdr)
+		return -EFAULT;
+
+	resp_hdr->hdr.length = ALIGN(sla_dobj_id_to_size(SPDM_DOBJ_ID_RESP) +
+				     dev_data->spdm.rsp_len, 32);
+	dev_data->respbuf->payload_sz = resp_hdr->hdr.length;
+
+	ret = sev_tio_do_cmd(dev_data->cmd, dev_data->cmd_data, 0,
+			     &dev_data->psp_ret, dev_data);
+	if (ret)
+		return ret;
+	return dev_data->psp_ret == SEV_RET_SUCCESS ? 0 : -EINVAL;
+}
+
+static void spdm_ctrl_init(struct spdm_ctrl *ctrl, struct tsm_dsm_tio *dev_data)
+{
+	ctrl->req = dev_data->req;
+	ctrl->resp = dev_data->resp;
+	ctrl->scratch = dev_data->scratch;
+	ctrl->output = dev_data->output;
+}
+
+/* Unmap, free and forget all SPDM buffers so that a repeated call frees nothing */
+static void spdm_ctrl_free(struct tsm_dsm_tio *dev_data)
+{
+	struct sev_tio_status *tio_status = to_tio_status(dev_data);
+	struct tsm_spdm *spdm = &dev_data->spdm;
+
+	sla_buffer_unmap(dev_data->resp, dev_data->respbuf);
+	sla_buffer_unmap(dev_data->req, dev_data->reqbuf);
+	spdm->rsp = NULL;
+	spdm->req = NULL;
+	dev_data->respbuf = NULL;
+	dev_data->reqbuf = NULL;
+
+	/* Use the sizes the buffers were allocated (or expanded) with to cover all pages */
+	sla_free(dev_data->req, tio_status->spdm_req_size_max, true);
+	sla_free(dev_data->resp, tio_status->spdm_req_size_max, false);
+	sla_free(dev_data->scratch, dev_data->scratch_len, true);
+	sla_free(dev_data->output, dev_data->output_len, true);
+	dev_data->req = SLA_NULL;
+	dev_data->resp = SLA_NULL;
+	dev_data->scratch = SLA_NULL;
+	dev_data->output = SLA_NULL;
+}
+
+/* Allocate and map the SPDM request, response, scratch and output buffers */
+static int spdm_ctrl_alloc(struct tsm_dsm_tio *dev_data)
+{
+	struct sev_tio_status *tio_status = to_tio_status(dev_data);
+	struct tsm_spdm *spdm = &dev_data->spdm;
+	int ret = -EFAULT;
+
+	/* @resp is the only buffer which is not moved to the firmware state */
+	dev_data->req = sla_alloc(tio_status->spdm_req_size_max, true);
+	dev_data->resp = sla_alloc(tio_status->spdm_req_size_max, false);
+	dev_data->scratch_len = tio_status->spdm_scratch_size_max;
+	dev_data->scratch = sla_alloc(dev_data->scratch_len, true);
+	dev_data->output_len = tio_status->spdm_out_size_max;
+	dev_data->output = sla_alloc(dev_data->output_len, true);
+
+	/* spdm_ctrl_init() copies all four SLAs into the command, check them all */
+	if (IS_SLA_NULL(dev_data->req) || IS_SLA_NULL(dev_data->resp) ||
+	    IS_SLA_NULL(dev_data->scratch) || IS_SLA_NULL(dev_data->output)) {
+		ret = -ENOMEM;
+		goto free_spdm_exit;
+	}
+
+	dev_data->reqbuf = sla_buffer_map(dev_data->req);
+	dev_data->respbuf = sla_buffer_map(dev_data->resp);
+	if (!dev_data->reqbuf || !dev_data->respbuf)
+		goto free_spdm_exit;
+
+	spdm->req = sla_to_data(dev_data->reqbuf, SPDM_DOBJ_ID_REQ);
+	spdm->rsp = sla_to_data(dev_data->respbuf, SPDM_DOBJ_ID_RESP);
+	if (!spdm->req || !spdm->rsp)
+		goto free_spdm_exit;
+
+	dobj_response_init(dev_data->respbuf);
+
+	return 0;
+
+free_spdm_exit:
+	/* Handles buffers which were never allocated or mapped */
+	spdm_ctrl_free(dev_data);
+	return ret;
+}
+
+int sev_tio_init_locked(void *tio_status_page)
+{
+	struct sev_tio_status *tio_status = tio_status_page;
+	struct sev_data_tio_status data_status = {
+		.length = sizeof(data_status),
+	};
+	int ret, psp_ret;
+
+	data_status.status_paddr = __psp_pa(tio_status_page);
+	ret = __sev_do_cmd_locked(SEV_CMD_TIO_STATUS, &data_status, &psp_ret);
+	if (ret)
+		return ret;
+
+	if (tio_status->length < offsetofend(struct sev_tio_status, tdictx_size) ||
+	    tio_status->reserved)
+		return -EFAULT;
+
+	if (!tio_status->tio_en && !tio_status->tio_init_done)
+		return -ENOENT;
+
+	if (tio_status->tio_init_done)
+		return -EBUSY;
+
+	struct sev_data_tio_init ti = { .length = sizeof(ti) };
+
+	ret = __sev_do_cmd_locked(SEV_CMD_TIO_INIT, &ti, &psp_ret);
+	if (ret)
+		return ret;
+
+	ret = __sev_do_cmd_locked(SEV_CMD_TIO_STATUS, &data_status, &psp_ret);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/* Allocate the device context and the data page, then issue TIO_DEV_CREATE */
+int sev_tio_dev_create(struct tsm_dsm_tio *dev_data, u16 device_id,
+		       u16 root_port_id, u8 segment_id)
+{
+	struct sev_tio_status *tio_status = to_tio_status(dev_data);
+	struct sev_data_tio_dev_create create = {
+		.length = sizeof(create),
+		.device_id = device_id,
+		.root_port_id = root_port_id,
+		.segment_id = segment_id,
+	};
+	int ret = -ENOMEM;
+	void *data_pg;
+
+	dev_data->dev_ctx = sla_alloc(tio_status->devctx_size, true);
+	if (IS_SLA_NULL(dev_data->dev_ctx))
+		return ret;
+
+	data_pg = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+	if (!data_pg)
+		goto free_ctx_exit;
+
+	create.dev_ctx_sla = dev_data->dev_ctx;
+	ret = sev_do_cmd(SEV_CMD_TIO_DEV_CREATE, &create, &dev_data->psp_ret);
+	if (ret)
+		goto free_data_pg_exit;
+
+	dev_data->data_pg = data_pg;
+	return 0;
+
+free_data_pg_exit:
+	snp_free_firmware_page(data_pg);
+free_ctx_exit:
+	/* Free via @dev_data: @create holds no SLA yet if the data page was not allocated */
+	sla_free(dev_data->dev_ctx, tio_status->devctx_size, true);
+	dev_data->dev_ctx = SLA_NULL;
+	return ret;
+}
+
+int sev_tio_dev_reclaim(struct tsm_dsm_tio *dev_data)
+{
+	struct sev_tio_status *tio_status = to_tio_status(dev_data);
+	struct sev_data_tio_dev_reclaim r = {
+		.length = sizeof(r),
+		.dev_ctx_sla = dev_data->dev_ctx,
+	};
+	int ret;
+
+	if (dev_data->data_pg) {
+		snp_free_firmware_page(dev_data->data_pg);
+		dev_data->data_pg = NULL;
+	}
+
+	if (IS_SLA_NULL(dev_data->dev_ctx))
+		return 0;
+
+	ret = sev_do_cmd(SEV_CMD_TIO_DEV_RECLAIM, &r, &dev_data->psp_ret);
+
+	sla_free(dev_data->dev_ctx, tio_status->devctx_size, true);
+	dev_data->dev_ctx = SLA_NULL;
+
+	spdm_ctrl_free(dev_data);
+
+	return ret;
+}
+
+int sev_tio_dev_connect(struct tsm_dsm_tio *dev_data, u8 tc_mask, u8 ids[8], u8 cert_slot)
+{
+	struct sev_data_tio_dev_connect connect = {
+		.length = sizeof(connect),
+		.tc_mask = tc_mask,
+		.cert_slot = cert_slot,
+		.dev_ctx_sla = dev_data->dev_ctx,
+		.ide_stream_id = {
+			ids[0], ids[1], ids[2], ids[3],
+			ids[4], ids[5], ids[6], ids[7]
+		},
+	};
+	int ret;
+
+	if (WARN_ON(IS_SLA_NULL(dev_data->dev_ctx)))
+		return -EFAULT;
+	if (!(tc_mask & 1))
+		return -EINVAL;
+
+	ret = spdm_ctrl_alloc(dev_data);
+	if (ret)
+		return ret;
+
+	spdm_ctrl_init(&connect.spdm_ctrl, dev_data);
+
+	return sev_tio_do_cmd(SEV_CMD_TIO_DEV_CONNECT, &connect, sizeof(connect),
+			      &dev_data->psp_ret, dev_data);
+}
+
+int sev_tio_dev_disconnect(struct tsm_dsm_tio *dev_data, bool force)
+{
+	struct sev_data_tio_dev_disconnect dc = {
+		.length = sizeof(dc),
+		.dev_ctx_sla = dev_data->dev_ctx,
+		.flags = force ? TIO_DEV_DISCONNECT_FLAG_FORCE : 0,
+	};
+
+	if (WARN_ON_ONCE(IS_SLA_NULL(dev_data->dev_ctx)))
+		return -EFAULT;
+
+	spdm_ctrl_init(&dc.spdm_ctrl, dev_data);
+
+	return sev_tio_do_cmd(SEV_CMD_TIO_DEV_DISCONNECT, &dc, sizeof(dc),
+			      &dev_data->psp_ret, dev_data);
+}
+
+int sev_tio_cmd_buffer_len(int cmd)
+{
+	switch (cmd) {
+	case SEV_CMD_TIO_STATUS:		return sizeof(struct sev_data_tio_status);
+	case SEV_CMD_TIO_INIT:			return sizeof(struct sev_data_tio_init);
+	case SEV_CMD_TIO_DEV_CREATE:		return sizeof(struct sev_data_tio_dev_create);
+	case SEV_CMD_TIO_DEV_RECLAIM:		return sizeof(struct sev_data_tio_dev_reclaim);
+	case SEV_CMD_TIO_DEV_CONNECT:		return sizeof(struct sev_data_tio_dev_connect);
+	case SEV_CMD_TIO_DEV_DISCONNECT:	return sizeof(struct sev_data_tio_dev_disconnect);
+	default:				return 0;
+	}
+}
diff --git a/drivers/crypto/ccp/sev-dev-tio.h b/drivers/crypto/ccp/sev-dev-tio.h
new file mode 100644
index 000000000000..67512b3dbc53
--- /dev/null
+++ b/drivers/crypto/ccp/sev-dev-tio.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __PSP_SEV_TIO_H__
+#define __PSP_SEV_TIO_H__
+
+#include <linux/pci-tsm.h>
+#include <linux/pci-ide.h>
+#include <linux/tsm.h>
+#include <uapi/linux/psp-sev.h>
+
+struct sla_addr_t {
+	union {
+		u64 sla;
+		struct {
+			u64 page_type	:1,
+			    page_size	:1,
+			    reserved1	:10,
+			    pfn		:40,
+			    reserved2	:12;
+		};
+	};
+} __packed;
+
+#define SEV_TIO_MAX_COMMAND_LENGTH	128
+
+/* SPDM control structure for DOE */
+struct tsm_spdm {
+	unsigned long req_len;
+	void *req;
+	unsigned long rsp_len;
+	void *rsp;
+};
+
+/* Describes TIO device */
+struct tsm_dsm_tio {
+	u8 cert_slot;
+	struct sla_addr_t dev_ctx;
+	struct sla_addr_t req;
+	struct sla_addr_t resp;
+	struct sla_addr_t scratch;
+	struct sla_addr_t output;
+	size_t output_len;
+	size_t scratch_len;
+	struct tsm_spdm spdm;
+	struct sla_buffer_hdr *reqbuf; /* vmap'ed @req for DOE */
+	struct sla_buffer_hdr *respbuf; /* vmap'ed @resp for DOE */
+
+	int cmd;
+	int psp_ret;
+	u8 cmd_data[SEV_TIO_MAX_COMMAND_LENGTH];
+	void *data_pg; /* Data page for DEV_STATUS/TDI_STATUS/TDI_INFO/ASID_FENCE */
+
+#define TIO_IDE_MAX_TC	8
+	struct pci_ide *ide[TIO_IDE_MAX_TC];
+};
+
+/* Describes TSM structure for PF0 pointed by pci_dev->tsm */
+struct tio_dsm {
+	struct pci_tsm_pf0 tsm;
+	struct tsm_dsm_tio data;
+	struct sev_device *sev;
+};
+
+/* Data object IDs */
+#define SPDM_DOBJ_ID_NONE		0
+#define SPDM_DOBJ_ID_REQ		1
+#define SPDM_DOBJ_ID_RESP		2
+
+struct spdm_dobj_hdr {
+	u32 id;     /* Data object type identifier */
+	u32 length; /* Length of the data object, INCLUDING THIS HEADER */
+	struct { /* Version of the data object structure */
+		u8 minor;
+		u8 major;
+	} version;
+} __packed;
+
+/**
+ * struct sev_tio_status - TIO_STATUS command's status_paddr buffer
+ *
+ * @length: Length of this structure in bytes
+ * @tio_en: Indicates that SNP_INIT_EX initialized the RMP for SEV-TIO
+ * @tio_init_done: Indicates TIO_INIT has been invoked
+ * @spdm_req_size_min: Minimum SPDM request buffer size in bytes
+ * @spdm_req_size_max: Maximum SPDM request buffer size in bytes
+ * @spdm_scratch_size_min: Minimum SPDM scratch buffer size in bytes
+ * @spdm_scratch_size_max: Maximum SPDM scratch buffer size in bytes
+ * @spdm_out_size_min: Minimum SPDM output buffer size in bytes
+ * @spdm_out_size_max: Maximum for the SPDM output buffer size in bytes
+ * @spdm_rsp_size_min: Minimum SPDM response buffer size in bytes
+ * @spdm_rsp_size_max: Maximum SPDM response buffer size in bytes
+ * @devctx_size: Size of a device context buffer in bytes
+ * @tdictx_size: Size of a TDI context buffer in bytes
+ * @tio_crypto_alg: TIO crypto algorithms supported
+ */
+struct sev_tio_status {
+	u32 length;
+	u32 tio_en	  :1,
+	    tio_init_done :1,
+	    reserved	  :30;
+	u32 spdm_req_size_min;
+	u32 spdm_req_size_max;
+	u32 spdm_scratch_size_min;
+	u32 spdm_scratch_size_max;
+	u32 spdm_out_size_min;
+	u32 spdm_out_size_max;
+	u32 spdm_rsp_size_min;
+	u32 spdm_rsp_size_max;
+	u32 devctx_size;
+	u32 tdictx_size;
+	u32 tio_crypto_alg;
+	u8 reserved2[12];
+} __packed;
+
+int sev_tio_init_locked(void *tio_status_page);
+int sev_tio_continue(struct tsm_dsm_tio *dev_data);
+
+int sev_tio_dev_create(struct tsm_dsm_tio *dev_data, u16 device_id, u16 root_port_id,
+		       u8 segment_id);
+int sev_tio_dev_connect(struct tsm_dsm_tio *dev_data, u8 tc_mask, u8 ids[8], u8 cert_slot);
+int sev_tio_dev_disconnect(struct tsm_dsm_tio *dev_data, bool force);
+int sev_tio_dev_reclaim(struct tsm_dsm_tio *dev_data);
+
+#endif	/* __PSP_SEV_TIO_H__ */
diff --git a/drivers/crypto/ccp/sev-dev-tsm.c b/drivers/crypto/ccp/sev-dev-tsm.c
new file mode 100644
index 000000000000..ea29cd5d0ff9
--- /dev/null
+++ b/drivers/crypto/ccp/sev-dev-tsm.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+// Interface to CCP/SEV-TIO for generic PCIe TDISP module
+
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/tsm.h>
+#include <linux/iommu.h>
+#include <linux/pci-doe.h>
+#include <linux/bitfield.h>
+#include <linux/module.h>
+
+#include <asm/sev-common.h>
+#include <asm/sev.h>
+
+#include "psp-dev.h"
+#include "sev-dev.h"
+#include "sev-dev-tio.h"
+
+MODULE_IMPORT_NS("PCI_IDE");
+
+#define TIO_DEFAULT_NR_IDE_STREAMS	1
+
+static uint nr_ide_streams = TIO_DEFAULT_NR_IDE_STREAMS;
+module_param_named(ide_nr, nr_ide_streams, uint, 0644);
+MODULE_PARM_DESC(ide_nr, "Set the maximum number of IDE streams per PHB");
+
+/*
+ * Pointer-chasing helpers: starting from a struct device, follow its driver
+ * data to the sp_device, then to the psp_device and finally the sev_device.
+ */
+#define dev_to_sp(dev)		((struct sp_device *)dev_get_drvdata(dev))
+#define dev_to_psp(dev)		((struct psp_device *)(dev_to_sp(dev)->psp_data))
+#define dev_to_sev(dev)		((struct sev_device *)(dev_to_psp(dev)->sev_data))
+/* Same lookup, starting from the parent device of a TSM device */
+#define tsm_dev_to_sev(tsmdev)	dev_to_sev((tsmdev)->dev.parent)
+
+/* Recover the enclosing struct tio_dsm from the pci_tsm pointer held by @pdev */
+#define pdev_to_tio_dsm(pdev)	(container_of((pdev)->tsm, struct tio_dsm, tsm.base_tsm))
+
+/*
+ * Finish a SEV-TIO command that may need SPDM traffic relayed over DOE.
+ *
+ * @ret is the result of the command that was just issued:
+ *   < 0  - error, handed straight back;
+ *   == 0 - no SPDM requested, the recorded PSP status decides the outcome;
+ *   > 0  - DOE feature to send the pending SPDM request to.
+ *
+ * While the value names the CMA or SSESSION feature, the request is sent with
+ * pci_doe() and the command is resumed with sev_tio_continue().
+ */
+static int sev_tio_spdm_cmd(struct tio_dsm *dsm, int ret)
+{
+	struct tsm_dsm_tio *tio = &dsm->data;
+	struct tsm_spdm *xfer = &tio->spdm;
+
+	if (ret < 0)
+		return ret;
+
+	/* Check the main command handler response before entering the loop */
+	if (!ret)
+		return tio->psp_ret != SEV_RET_SUCCESS ? -EINVAL : 0;
+
+	/* ret > 0 means "SPDM requested" */
+	for (;;) {
+		if (ret != PCI_DOE_FEATURE_CMA && ret != PCI_DOE_FEATURE_SSESSION)
+			return ret;
+
+		ret = pci_doe(dsm->tsm.doe_mb, PCI_VENDOR_ID_PCI_SIG, ret,
+			      xfer->req, xfer->req_len, xfer->rsp, xfer->rsp_len);
+		if (ret < 0)
+			return ret;
+
+		WARN_ON_ONCE(ret == 0); /* The response should never be empty */
+		xfer->rsp_len = ret;
+		ret = sev_tio_continue(tio);
+	}
+}
+
+/*
+ * Enable @ide on the root port first and then on the endpoint.  If the
+ * endpoint side fails, the root port side is disabled again.
+ */
+static int stream_enable(struct pci_ide *ide)
+{
+	struct pci_dev *root = pcie_find_root_port(ide->pdev);
+	int rc = pci_ide_stream_enable(root, ide);
+
+	if (rc)
+		return rc;
+
+	rc = pci_ide_stream_enable(ide->pdev, ide);
+	if (!rc)
+		return 0;
+
+	pci_ide_stream_disable(root, ide);
+	return rc;
+}
+
+/*
+ * Enable every allocated stream in @ide, stopping at the first failure.
+ * Returns 0 or the error from stream_enable().
+ */
+static int streams_enable(struct pci_ide **ide)
+{
+	for (int tc = 0; tc < TIO_IDE_MAX_TC; ++tc) {
+		int rc;
+
+		if (!ide[tc])
+			continue;
+
+		rc = stream_enable(ide[tc]);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/* Disable @ide on the endpoint first, then on its root port */
+static void stream_disable(struct pci_ide *ide)
+{
+	struct pci_dev *endpoint = ide->pdev;
+	struct pci_dev *root = pcie_find_root_port(endpoint);
+
+	pci_ide_stream_disable(endpoint, ide);
+	pci_ide_stream_disable(root, ide);
+}
+
+/* Disable every allocated stream in @ide; empty slots are skipped */
+static void streams_disable(struct pci_ide **ide)
+{
+	for (int tc = 0; tc < TIO_IDE_MAX_TC; ++tc) {
+		if (!ide[tc])
+			continue;
+
+		stream_disable(ide[tc]);
+	}
+}
+
+/*
+ * Program @ide on both ends of the link.
+ *
+ * The requester ID range of both partners is opened to 0..0xffff, the
+ * ide_cfg/ide_tee_limit settings are forced on the endpoint and the root
+ * port, and the stream is then set up on the endpoint followed by the root
+ * port.
+ */
+static void stream_setup(struct pci_ide *ide)
+{
+	struct pci_dev *rp = pcie_find_root_port(ide->pdev);
+
+	ide->partner[PCI_IDE_EP].rid_start = 0;
+	ide->partner[PCI_IDE_EP].rid_end = 0xffff;
+	ide->partner[PCI_IDE_RP].rid_start = 0;
+	ide->partner[PCI_IDE_RP].rid_end = 0xffff;
+
+	ide->pdev->ide_cfg = 0;
+	ide->pdev->ide_tee_limit = 1;
+	rp->ide_cfg = 1;
+	rp->ide_tee_limit = 0;
+
+	/* Terminate the message so it is emitted as a complete log record */
+	pci_warn(ide->pdev, "Forcing CFG/TEE for %s\n", pci_name(rp));
+	pci_ide_stream_setup(ide->pdev, ide);
+	pci_ide_stream_setup(rp, ide);
+}
+
+/*
+ * Set up every allocated stream in @ide and fill @ids with the stream ID of
+ * each traffic class (0xFF for a class without a stream).  The first stream
+ * found is flagged as the default stream.
+ *
+ * Returns a mask with one bit set per traffic class that has a stream.
+ */
+static u8 streams_setup(struct pci_ide **ide, u8 *ids)
+{
+	bool have_default = false;
+	u8 mask = 0;
+
+	for (int tc = 0; tc < TIO_IDE_MAX_TC; ++tc) {
+		struct pci_ide *stream = ide[tc];
+
+		if (stream) {
+			mask |= BIT(tc);
+			ids[tc] = stream->stream_id;
+
+			if (!have_default) {
+				pci_ide_to_settings(stream->pdev, stream)->default_stream = 1;
+				have_default = true;
+			}
+
+			stream_setup(stream);
+		} else {
+			ids[tc] = 0xFF;
+		}
+	}
+
+	return mask;
+}
+
+/*
+ * Register every allocated stream in @ide.
+ *
+ * If one registration fails, the streams registered by this call so far are
+ * unregistered again, so the caller never has to unwind a partial result.
+ *
+ * Returns 0 on success or the error from pci_ide_stream_register().
+ */
+static int streams_register(struct pci_ide **ide)
+{
+	int ret = 0, i;
+
+	for (i = 0; i < TIO_IDE_MAX_TC; ++i) {
+		if (!ide[i])
+			continue;
+
+		ret = pci_ide_stream_register(ide[i]);
+		if (ret)
+			goto unwind;
+	}
+
+	return 0;
+
+unwind:
+	/* ide[i] is the one that failed; only the earlier slots need undoing */
+	while (--i >= 0)
+		if (ide[i])
+			pci_ide_stream_unregister(ide[i]);
+
+	return ret;
+}
+
+/* Unregister every allocated stream in @ide; empty slots are skipped */
+static void streams_unregister(struct pci_ide **ide)
+{
+	for (int tc = 0; tc < TIO_IDE_MAX_TC; ++tc) {
+		if (!ide[tc])
+			continue;
+
+		pci_ide_stream_unregister(ide[tc]);
+	}
+}
+
+/* Tear down @ide on the endpoint first, then on its root port */
+static void stream_teardown(struct pci_ide *ide)
+{
+	struct pci_dev *endpoint = ide->pdev;
+	struct pci_dev *root = pcie_find_root_port(endpoint);
+
+	pci_ide_stream_teardown(endpoint, ide);
+	pci_ide_stream_teardown(root, ide);
+}
+
+/*
+ * Tear down and free every allocated stream in @ide, clearing each slot so
+ * it can be allocated again.
+ */
+static void streams_teardown(struct pci_ide **ide)
+{
+	for (int tc = 0; tc < TIO_IDE_MAX_TC; ++tc) {
+		struct pci_ide *stream = ide[tc];
+
+		if (!stream)
+			continue;
+
+		stream_teardown(stream);
+		pci_ide_stream_free(stream);
+		ide[tc] = NULL;
+	}
+}
+
+/*
+ * Allocate an IDE stream for traffic class @tc of @pdev and store it in
+ * @ide[@tc].
+ *
+ * Returns 0 on success, -EBUSY if the slot is already occupied, or -EFAULT
+ * if pci_ide_stream_alloc() fails.
+ */
+static int stream_alloc(struct pci_dev *pdev, struct pci_ide **ide,
+			unsigned int tc)
+{
+	struct pci_dev *rp = pcie_find_root_port(pdev);
+	struct pci_ide *ide1;
+
+	if (ide[tc]) {
+		/* @tc is unsigned, and the record needs its trailing newline */
+		pci_err(pdev, "Stream for class=%u already registered\n", tc);
+		return -EBUSY;
+	}
+
+	/* FIXME: find a better way */
+	if (nr_ide_streams != TIO_DEFAULT_NR_IDE_STREAMS)
+		pci_notice(pdev, "Enable non-default %u streams\n", nr_ide_streams);
+	pci_ide_set_nr_streams(to_pci_host_bridge(rp->bus->bridge), nr_ide_streams);
+
+	ide1 = pci_ide_stream_alloc(pdev);
+	if (!ide1)
+		return -EFAULT;
+
+	/* Blindly assign streamid=0 to TC=0, and so on */
+	ide1->stream_id = tc;
+
+	ide[tc] = ide1;
+
+	return 0;
+}
+
+/*
+ * Allocate and construct the per-device TSM context for @pdev.
+ *
+ * Returns the embedded pci_tsm on success, or NULL if the allocation or
+ * pci_tsm_pf0_constructor() fails; the __free(kfree) annotation releases the
+ * allocation on the NULL paths.
+ */
+static struct pci_tsm *tio_pf0_probe(struct pci_dev *pdev, struct sev_device *sev)
+{
+	struct tio_dsm *dsm __free(kfree) = kzalloc(sizeof(*dsm), GFP_KERNEL);
+
+	if (!dsm || pci_tsm_pf0_constructor(pdev, &dsm->tsm, sev->tsmdev))
+		return NULL;
+
+	dsm->sev = sev;
+	pci_dbg(pdev, "TSM enabled\n");
+
+	/* Ownership leaves this function, so suppress the automatic kfree() */
+	return &no_free_ptr(dsm)->tsm.base_tsm;
+}
+
+/*
+ * pci_tsm_ops.probe callback: create a TSM context for @pdev.
+ *
+ * Only devices for which is_pci_tsm_pf0() is true are handled; NULL is
+ * returned for any other device, and also when tio_pf0_probe() fails.
+ */
+static struct pci_tsm *dsm_probe(struct tsm_dev *tsmdev, struct pci_dev *pdev)
+{
+	struct sev_device *sev = tsm_dev_to_sev(tsmdev);
+
+	if (is_pci_tsm_pf0(pdev))
+		return tio_pf0_probe(pdev, sev);
+
+	/* Pointer return type: say NULL, not integer 0 */
+	return NULL;
+}
+
+/*
+ * pci_tsm_ops.remove callback: destroy and free the context created by
+ * tio_pf0_probe().  Nothing is released for devices that are not PF0.
+ */
+static void dsm_remove(struct pci_tsm *tsm)
+{
+	struct pci_dev *pdev = tsm->pdev;
+	struct tio_dsm *dsm;
+
+	pci_dbg(pdev, "TSM disabled\n");
+
+	if (!is_pci_tsm_pf0(pdev))
+		return;
+
+	dsm = container_of(tsm, struct tio_dsm, tsm.base_tsm);
+	pci_tsm_pf0_destructor(&dsm->tsm);
+	kfree(dsm);
+}
+
+/*
+ * Create the firmware device context for @dsm's PCI device.
+ *
+ * The root port ID is the port number field of the root port's Link
+ * Capabilities register.  Returns -ENODEV if that register cannot be read,
+ * otherwise the result of sev_tio_dev_create().
+ */
+static int dsm_create(struct tio_dsm *dsm)
+{
+	struct pci_dev *pdev = dsm->tsm.base_tsm.pdev;
+	struct pci_dev *rp = pcie_find_root_port(pdev);
+	u16 device_id, root_port_id;
+	u32 lnkcap = 0;
+	u8 segment_id;
+
+	if (pci_read_config_dword(rp, pci_pcie_cap(rp) + PCI_EXP_LNKCAP, &lnkcap))
+		return -ENODEV;
+
+	root_port_id = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap);
+	device_id = pci_dev_id(pdev);
+	segment_id = pdev->bus ? pci_domain_nr(pdev->bus) : 0;
+
+	return sev_tio_dev_create(&dsm->data, device_id, root_port_id, segment_id);
+}
+
+/*
+ * pci_tsm_ops.connect callback.
+ *
+ * Sequence: verify the DOE mailbox, allocate the stream for traffic class 0,
+ * create the firmware device context, set the streams up, run the connect
+ * command (relaying SPDM as needed), then enable and register the streams.
+ *
+ * Returns 0 on success or a negative error code; on failure the steps taken
+ * so far are undone through the labels at the bottom.
+ */
+static int dsm_connect(struct pci_dev *pdev)
+{
+	struct tio_dsm *dsm = pdev_to_tio_dsm(pdev);
+	struct tsm_dsm_tio *dev_data = &dsm->data;
+	u8 ids[TIO_IDE_MAX_TC];
+	u8 tc_mask;
+	int ret;
+
+	/* The mailbox already in use must also be the one offering SSESSION */
+	if (pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_PCI_SIG,
+				 PCI_DOE_FEATURE_SSESSION) != dsm->tsm.doe_mb) {
+		pci_err(pdev, "CMA DOE MB must support SSESSION\n");
+		return -EFAULT;
+	}
+
+	/* Only traffic class 0 gets a stream here */
+	ret = stream_alloc(pdev, dev_data->ide, 0);
+	if (ret)
+		return ret;
+
+	ret = dsm_create(dsm);
+	if (ret)
+		goto ide_free_exit;
+
+	tc_mask = streams_setup(dev_data->ide, ids);
+
+	/* The connect result feeds the SPDM relay, which yields the final status */
+	ret = sev_tio_dev_connect(dev_data, tc_mask, ids, dev_data->cert_slot);
+	ret = sev_tio_spdm_cmd(dsm, ret);
+	if (ret)
+		goto free_exit;
+
+	/* NOTE(review): the return value of streams_enable() is not checked */
+	streams_enable(dev_data->ide);
+
+	ret = streams_register(dev_data->ide);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+
+free_exit:
+	/* Reached both before and after the streams were enabled */
+	sev_tio_dev_reclaim(dev_data);
+
+	streams_disable(dev_data->ide);
+ide_free_exit:
+
+	streams_teardown(dev_data->ide);
+
+	return ret;
+}
+
+/*
+ * pci_tsm_ops.disconnect callback.
+ *
+ * The disconnect command is forced straight away when system_state lies
+ * between SYSTEM_HALT and SYSTEM_RESTART.  Otherwise a normal disconnect is
+ * tried first and retried with force if it fails.  The device context is
+ * then reclaimed and the streams are disabled, unregistered and torn down.
+ */
+static void dsm_disconnect(struct pci_dev *pdev)
+{
+	struct tio_dsm *dsm = pdev_to_tio_dsm(pdev);
+	struct tsm_dsm_tio *tio = &dsm->data;
+	bool force = system_state >= SYSTEM_HALT && system_state <= SYSTEM_RESTART;
+	int rc;
+
+	rc = sev_tio_spdm_cmd(dsm, sev_tio_dev_disconnect(tio, force));
+	if (rc && !force)
+		sev_tio_spdm_cmd(dsm, sev_tio_dev_disconnect(tio, true));
+
+	sev_tio_dev_reclaim(tio);
+
+	streams_disable(tio->ide);
+	streams_unregister(tio->ide);
+	streams_teardown(tio->ide);
+}
+
+/* Callback table passed to tsm_register() by sev_tsm_init_locked() */
+static struct pci_tsm_ops sev_tsm_ops = {
+	.probe = dsm_probe,
+	.remove = dsm_remove,
+	.connect = dsm_connect,
+	.disconnect = dsm_disconnect,
+};
+
+/*
+ * Initialize SEV-TIO and register the TSM device.
+ *
+ * @sev: SEV device that receives the TSM device and the status snapshot
+ * @tio_status_page: buffer handed to sev_tio_init_locked(); its contents are
+ *		     copied into a kzalloc()ed struct sev_tio_status
+ *
+ * On success sev->tsmdev and sev->tio_status are set.  On any failure the
+ * snapshot is freed, an error is logged and @sev is left untouched.
+ */
+void sev_tsm_init_locked(struct sev_device *sev, void *tio_status_page)
+{
+	struct sev_tio_status *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	struct tsm_dev *tsmdev;
+	int ret;
+
+	WARN_ON(sev->tio_status);
+
+	if (!t)
+		return;
+
+	ret = sev_tio_init_locked(tio_status_page);
+	if (ret) {
+		pr_warn("SEV-TIO STATUS failed with %d\n", ret);
+		goto error_exit;
+	}
+
+	/* Snapshot the status now so a later failure still reports real values */
+	memcpy(t, tio_status_page, sizeof(*t));
+
+	tsmdev = tsm_register(sev->dev, &sev_tsm_ops);
+	if (IS_ERR(tsmdev)) {
+		/* Without this the error message would print ret=0 */
+		ret = PTR_ERR(tsmdev);
+		goto error_exit;
+	}
+
+	pr_notice("SEV-TIO status: EN=%d INIT_DONE=%d rq=%d..%d rs=%d..%d "
+		  "scr=%d..%d out=%d..%d dev=%d tdi=%d algos=%x\n",
+		  t->tio_en, t->tio_init_done,
+		  t->spdm_req_size_min, t->spdm_req_size_max,
+		  t->spdm_rsp_size_min, t->spdm_rsp_size_max,
+		  t->spdm_scratch_size_min, t->spdm_scratch_size_max,
+		  t->spdm_out_size_min, t->spdm_out_size_max,
+		  t->devctx_size, t->tdictx_size,
+		  t->tio_crypto_alg);
+
+	sev->tsmdev = tsmdev;
+	sev->tio_status = t;
+
+	return;
+
+error_exit:
+	/* Log before freeing: the message reads fields of @t */
+	pr_err("Failed to enable SEV-TIO: ret=%d en=%d initdone=%d SEV=%d\n",
+	       ret, t->tio_en, t->tio_init_done, boot_cpu_has(X86_FEATURE_SEV));
+	kfree(t);
+}
+
+/* Unregister the TSM device of @sev, if any, and clear the pointer */
+void sev_tsm_uninit(struct sev_device *sev)
+{
+	if (!sev->tsmdev)
+		return;
+
+	tsm_unregister(sev->tsmdev);
+	sev->tsmdev = NULL;
+}
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 9e0c16b36f9c..8a74a08553a5 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -75,6 +75,14 @@ static bool psp_init_on_probe = true;
 module_param(psp_init_on_probe, bool, 0444);
 MODULE_PARM_DESC(psp_init_on_probe, "  if true, the PSP will be initialized on module init. Else the PSP will be initialized on the first command requiring it");
 
+#if IS_ENABLED(CONFIG_PCI_TSM)
+static bool sev_tio_enabled = true;
+module_param_named(tio, sev_tio_enabled, bool, 0444);
+MODULE_PARM_DESC(tio, "Enables TIO in SNP_INIT_EX");
+#else
+static const bool sev_tio_enabled = false;
+#endif
+
 MODULE_FIRMWARE("amd/amd_sev_fam17h_model0xh.sbin"); /* 1st gen EPYC */
 MODULE_FIRMWARE("amd/amd_sev_fam17h_model3xh.sbin"); /* 2nd gen EPYC */
 MODULE_FIRMWARE("amd/amd_sev_fam19h_model0xh.sbin"); /* 3rd gen EPYC */
@@ -251,7 +259,7 @@ static int sev_cmd_buffer_len(int cmd)
 	case SEV_CMD_SNP_COMMIT:		return sizeof(struct sev_data_snp_commit);
 	case SEV_CMD_SNP_FEATURE_INFO:		return sizeof(struct sev_data_snp_feature_info);
 	case SEV_CMD_SNP_VLEK_LOAD:		return sizeof(struct sev_user_data_snp_vlek_load);
-	default:				return 0;
+	default:				return sev_tio_cmd_buffer_len(cmd);
 	}
 
 	return 0;
@@ -1394,6 +1402,8 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 	 *
 	 */
 	if (sev_version_greater_or_equal(SNP_MIN_API_MAJOR, 52)) {
+		bool tio_supp = !!(sev->snp_feat_info_0.ebx & SNP_SEV_TIO_SUPPORTED);
+
 		/*
 		 * Firmware checks that the pages containing the ranges enumerated
 		 * in the RANGES structure are either in the default page state or in the
@@ -1434,6 +1444,17 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 		data.init_rmp = 1;
 		data.list_paddr_en = 1;
 		data.list_paddr = __psp_pa(snp_range_list);
+
+		data.tio_en = tio_supp && sev_tio_enabled && amd_iommu_sev_tio_supported();
+
+		/*
+		 * When psp_init_on_probe is disabled, the userspace calling
+		 * SEV ioctl can inadvertently shut down SNP and SEV-TIO causing
+		 * unexpected state loss.
+		 */
+		if (data.tio_en && !psp_init_on_probe)
+			dev_warn(sev->dev, "SEV-TIO is incompatible with psp_init_on_probe=0\n");
+
 		cmd = SEV_CMD_SNP_INIT_EX;
 	} else {
 		cmd = SEV_CMD_SNP_INIT;
@@ -1471,7 +1492,8 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 
 	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
 	sev->snp_initialized = true;
-	dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
+	dev_dbg(sev->dev, "SEV-SNP firmware initialized, SEV-TIO is %s\n",
+		data.tio_en ? "enabled" : "disabled");
 
 	dev_info(sev->dev, "SEV-SNP API:%d.%d build:%d\n", sev->api_major,
 		 sev->api_minor, sev->build);
@@ -1479,6 +1501,23 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 	atomic_notifier_chain_register(&panic_notifier_list,
 				       &snp_panic_notifier);
 
+	if (data.tio_en) {
+		/*
+		 * This executes with the sev_cmd_mutex held so down the stack
+		 * snp_reclaim_pages(locked=false) might be needed (which is extremely
+		 * unlikely) but will cause a deadlock.
+		 * Instead of exporting __snp_alloc_firmware_pages(), allocate a page
+		 * for this one call here.
+		 *
+		 * Test the struct page pointer itself: page_address() applied to a
+		 * failed (NULL) allocation is not guaranteed to yield NULL.
+		 */
+		struct page *tio_page = __snp_alloc_firmware_pages(
+			GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0, true);
+
+		if (tio_page) {
+			sev_tsm_init_locked(sev, page_address(tio_page));
+			__snp_free_firmware_pages(tio_page, 0, true);
+		}
+	}
+
 	sev_es_tmr_size = SNP_TMR_SIZE;
 
 	return 0;
@@ -2758,8 +2797,20 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic)
 
 static void sev_firmware_shutdown(struct sev_device *sev)
 {
+	/*
+	 * Calling without sev_cmd_mutex held as TSM will likely try disconnecting
+	 * IDE and this ends up calling sev_do_cmd() which locks sev_cmd_mutex.
+	 */
+	if (sev->tio_status)
+		sev_tsm_uninit(sev);
+
 	mutex_lock(&sev_cmd_mutex);
+
 	__sev_firmware_shutdown(sev, false);
+
+	kfree(sev->tio_status);
+	sev->tio_status = NULL;
+
 	mutex_unlock(&sev_cmd_mutex);
 }
 
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index b9029506383f..b1cd556bbbf6 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -34,6 +34,8 @@ struct sev_misc_dev {
 	struct miscdevice misc;
 };
 
+struct sev_tio_status;
+
 struct sev_device {
 	struct device *dev;
 	struct psp_device *psp;
@@ -61,6 +63,9 @@ struct sev_device {
 
 	struct sev_user_data_snp_status snp_plat_status;
 	struct snp_feature_info snp_feat_info_0;
+
+	struct tsm_dev *tsmdev;
+	struct sev_tio_status *tio_status;
 };
 
 int sev_dev_init(struct psp_device *psp);
@@ -74,4 +79,8 @@ void sev_pci_exit(void);
 struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages);
 void snp_free_hv_fixed_pages(struct page *page);
 
+void sev_tsm_init_locked(struct sev_device *sev, void *tio_status_page);
+void sev_tsm_uninit(struct sev_device *sev);
+int sev_tio_cmd_buffer_len(int cmd);
+
 #endif /* __SEV_DEV_H */
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 34a25209f909..cce864dbf281 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -109,6 +109,13 @@ enum sev_cmd {
 	SEV_CMD_SNP_VLEK_LOAD		= 0x0CD,
 	SEV_CMD_SNP_FEATURE_INFO	= 0x0CE,
 
+	/* SEV-TIO commands */
+	SEV_CMD_TIO_STATUS		= 0x0D0,
+	SEV_CMD_TIO_INIT		= 0x0D1,
+	SEV_CMD_TIO_DEV_CREATE		= 0x0D2,
+	SEV_CMD_TIO_DEV_RECLAIM		= 0x0D3,
+	SEV_CMD_TIO_DEV_CONNECT		= 0x0D4,
+	SEV_CMD_TIO_DEV_DISCONNECT	= 0x0D5,
 	SEV_CMD_MAX,
 };
 
@@ -750,7 +757,8 @@ struct sev_data_snp_init_ex {
 	u32 list_paddr_en:1;
 	u32 rapl_dis:1;
 	u32 ciphertext_hiding_en:1;
-	u32 rsvd:28;
+	u32 tio_en:1;
+	u32 rsvd:27;
 	u32 rsvd1;
 	u64 list_paddr;
 	u16 max_snp_asid;
@@ -850,6 +858,7 @@ struct snp_feature_info {
 } __packed;
 
 #define SNP_CIPHER_TEXT_HIDING_SUPPORTED	BIT(3)
+#define SNP_SEV_TIO_SUPPORTED			BIT(1) /* EBX */
 
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
-- 
cgit v1.2.3


From f7231cff1f3ff8259bef02dc4999bc132abf29cf Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Wed, 3 Dec 2025 08:55:34 +0000
Subject: media: uapi: c3-isp: Fix documentation warning

Building htmldocs generates a warning:

WARNING: include/uapi/linux/media/amlogic/c3-isp-config.h:199
error: Cannot parse struct or union!

Which correctly highlights that the c3_isp_params_block_header symbol
is wrongly documented as a struct while it's a plain #define instead.

Fix this by removing the 'struct' identifier from the documentation of
the c3_isp_params_block_header symbol.

[ribalda: Add Closes:]

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/all/20251127131425.4b5b6644@canb.auug.org.au/
Fixes: 45662082855c ("media: uapi: Convert Amlogic C3 to V4L2 extensible params")
Cc: stable@vger.kernel.org
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/uapi/linux/media/amlogic/c3-isp-config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/media/amlogic/c3-isp-config.h b/include/uapi/linux/media/amlogic/c3-isp-config.h
index 0a3c1cc55ccb..92db5dcdda18 100644
--- a/include/uapi/linux/media/amlogic/c3-isp-config.h
+++ b/include/uapi/linux/media/amlogic/c3-isp-config.h
@@ -186,7 +186,7 @@ enum c3_isp_params_block_type {
 #define C3_ISP_PARAMS_BLOCK_FL_ENABLE	V4L2_ISP_PARAMS_FL_BLOCK_ENABLE
 
 /**
- * struct c3_isp_params_block_header - C3 ISP parameter block header
+ * c3_isp_params_block_header - C3 ISP parameter block header
  *
  * This structure represents the common part of all the ISP configuration
  * blocks and is identical to :c:type:`v4l2_isp_params_block_header`.
-- 
cgit v1.2.3


From 305c8dc477175eb29df18accc95c868acd2cdd4e Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 2 Dec 2025 09:59:38 -0800
Subject: objtool: Consolidate annotation macros

Consolidate __ASM_ANNOTATE into a single macro which is used by both C
and asm.  This also makes the code generation a bit more palatable by
putting it all on a single line.

Turn this:

	911:
	       .pushsection .discard.annotate_insn,"M", @progbits, 8
	       .long 911b - .
	       .long 1
	       .popsection
	       jmp __x86_return_thunk

Into:

	911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 1; .popsection
	jmp __x86_return_thunk

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/c05ff40d3383e85c3b59018ef0b3c7aaf993a60d.1764694625.git.jpoimboe@kernel.org
---
 include/linux/annotate.h | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/annotate.h b/include/linux/annotate.h
index 7c10d34d198c..996126f5f9ec 100644
--- a/include/linux/annotate.h
+++ b/include/linux/annotate.h
@@ -6,41 +6,35 @@
 
 #ifdef CONFIG_OBJTOOL
 
-#ifndef __ASSEMBLY__
-
 #define __ASM_ANNOTATE(section, label, type)				\
-	".pushsection " section ",\"M\", @progbits, 8\n\t"		\
-	".long " __stringify(label) " - .\n\t"				\
-	".long " __stringify(type) "\n\t"				\
-	".popsection\n\t"
+	.pushsection section, "M", @progbits, 8;			\
+	.long label - .;						\
+	.long type;							\
+	.popsection
+
+#ifndef __ASSEMBLY__
 
 #define ASM_ANNOTATE_LABEL(label, type)					\
-	__ASM_ANNOTATE(".discard.annotate_insn", label, type)
+	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) "\n\t"
 
 #define ASM_ANNOTATE(type)						\
-	"911:\n\t"							\
-	ASM_ANNOTATE_LABEL(911b, type)
+	"911: "								\
+	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) "\n\t"
 
 #define ASM_ANNOTATE_DATA(type)						\
-	"912:\n\t"							\
-	__ASM_ANNOTATE(".discard.annotate_data", 912b, type)
+	"912: "								\
+	__stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) "\n\t"
 
 #else /* __ASSEMBLY__ */
 
-.macro __ANNOTATE section, type
-.Lhere_\@:
-	.pushsection \section, "M", @progbits, 8
-	.long	.Lhere_\@ - .
-	.long	\type
-	.popsection
-.endm
-
 .macro ANNOTATE type
-	__ANNOTATE ".discard.annotate_insn", \type
+.Lhere_\@:
+	__ASM_ANNOTATE(".discard.annotate_insn", .Lhere_\@, \type)
 .endm
 
 .macro ANNOTATE_DATA type
-	__ANNOTATE ".discard.annotate_data", \type
+.Lhere_\@:
+	__ASM_ANNOTATE(".discard.annotate_data", .Lhere_\@, \type)
 .endm
 
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From ed3bf863dc9150b56233b01ec073cbbd1fc9c6a3 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 2 Dec 2025 09:59:39 -0800
Subject: objtool: Remove newlines and tabs from annotation macros

Remove newlines and tabs from the annotation macros so the invoking code
can insert them as needed to match the style of the surrounding code.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/66305834c2eb78f082217611b756231ae9c0b555.1764694625.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/alternative.h    | 2 +-
 arch/x86/include/asm/bug.h            | 2 +-
 arch/x86/include/asm/cpufeature.h     | 2 +-
 arch/x86/include/asm/irq_stack.h      | 2 +-
 arch/x86/include/asm/jump_label.h     | 2 +-
 arch/x86/include/asm/nospec-branch.h  | 4 ++--
 arch/x86/include/asm/paravirt_types.h | 2 +-
 arch/x86/include/asm/smap.h           | 8 ++++----
 arch/x86/include/asm/static_call.h    | 2 +-
 arch/x86/kernel/alternative.c         | 4 ++--
 arch/x86/kernel/rethook.c             | 2 +-
 arch/x86/kernel/static_call.c         | 4 ++--
 arch/x86/lib/error-inject.c           | 2 +-
 include/linux/annotate.h              | 6 +++---
 include/linux/objtool.h               | 2 +-
 15 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index df2c8705e17b..03364510d5fe 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -208,7 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 
 #define ALTINSTR_REPLACEMENT(newinstr)		/* replacement */	\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
-	ANNOTATE_DATA_SPECIAL						\
+	ANNOTATE_DATA_SPECIAL "\n"					\
 	"# ALT: replacement\n"						\
 	"774:\n\t" newinstr "\n775:\n"					\
 	".popsection\n"
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index ab5bba6cf7f5..ee23b98353d7 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -70,7 +70,7 @@ extern void __WARN_trap(struct bug_entry *bug, ...);
 
 #define _BUG_FLAGS_ASM(format, file, line, flags, size, extra)		\
 	".pushsection __bug_table,\"aw\"\n\t"				\
-	ANNOTATE_DATA_SPECIAL						\
+	ANNOTATE_DATA_SPECIAL "\n\t"					\
 	"2:\n\t"							\
 	__BUG_ENTRY(format, file, line, flags)				\
 	"\t.org 2b + " size "\n"					\
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index fc5f32d4da6e..d8bc614f92fa 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -101,7 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
 	asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
 		".pushsection .altinstr_aux,\"ax\"\n"
 		"6:\n"
-		ANNOTATE_DATA_SPECIAL
+		ANNOTATE_DATA_SPECIAL "\n"
 		" testb %[bitnum], %a[cap_byte]\n"
 		" jnz %l[t_yes]\n"
 		" jmp %l[t_no]\n"
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 735c3a491f60..8325b79f2ac6 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -101,7 +101,7 @@
 
 #define ASM_CALL_ARG0							\
 	"1: call %c[__func]				\n"		\
-	ANNOTATE_REACHABLE(1b)
+	ANNOTATE_REACHABLE(1b) "			\n"
 
 #define ASM_CALL_ARG1							\
 	"movq	%[arg1], %%rdi				\n"		\
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index e0a6930a4029..05b16299588d 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -15,7 +15,7 @@
 #define JUMP_TABLE_ENTRY(key, label)			\
 	".pushsection __jump_table,  \"aw\" \n\t"	\
 	_ASM_ALIGN "\n\t"				\
-	ANNOTATE_DATA_SPECIAL				\
+	ANNOTATE_DATA_SPECIAL "\n"			\
 	".long 1b - . \n\t"				\
 	".long " label " - . \n\t"			\
 	_ASM_PTR " " key " - . \n\t"			\
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 08ed5a2e46a5..a5d41d8cd70a 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -464,7 +464,7 @@ static inline void call_depth_return_thunk(void) {}
  */
 # define CALL_NOSPEC						\
 	ALTERNATIVE_2(						\
-	ANNOTATE_RETPOLINE_SAFE					\
+	ANNOTATE_RETPOLINE_SAFE "\n"				\
 	"call *%[thunk_target]\n",				\
 	"       jmp    904f;\n"					\
 	"       .align 16\n"					\
@@ -480,7 +480,7 @@ static inline void call_depth_return_thunk(void) {}
 	"904:	call   901b;\n",				\
 	X86_FEATURE_RETPOLINE,					\
 	"lfence;\n"						\
-	ANNOTATE_RETPOLINE_SAFE					\
+	ANNOTATE_RETPOLINE_SAFE "\n"				\
 	"call *%[thunk_target]\n",				\
 	X86_FEATURE_RETPOLINE_LFENCE)
 
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 37a8627d8277..3502939415ad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -249,7 +249,7 @@ extern struct paravirt_patch_template pv_ops;
  * don't need to bother with CFI prefixes.
  */
 #define PARAVIRT_CALL					\
-	ANNOTATE_RETPOLINE_SAFE				\
+	ANNOTATE_RETPOLINE_SAFE "\n\t"			\
 	"call *%[paravirt_opptr];"
 
 /*
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 4f84d421d1cf..cd173facecd2 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -40,7 +40,7 @@ static __always_inline unsigned long smap_save(void)
 	unsigned long flags;
 
 	asm volatile ("# smap_save\n\t"
-		      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
+		      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
 				  "", "pushf; pop %0; clac",
 				  X86_FEATURE_SMAP)
 		      : "=rm" (flags) : : "memory", "cc");
@@ -51,7 +51,7 @@ static __always_inline unsigned long smap_save(void)
 static __always_inline void smap_restore(unsigned long flags)
 {
 	asm volatile ("# smap_restore\n\t"
-		      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
+		      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
 				  "", "push %0; popf",
 				  X86_FEATURE_SMAP)
 		      : : "g" (flags) : "memory", "cc");
@@ -64,9 +64,9 @@ static __always_inline void smap_restore(unsigned long flags)
 	ALTERNATIVE("", "stac", X86_FEATURE_SMAP)
 
 #define ASM_CLAC_UNSAFE \
-	ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP)
+	ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "clac", X86_FEATURE_SMAP)
 #define ASM_STAC_UNSAFE \
-	ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP)
+	ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "stac", X86_FEATURE_SMAP)
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index 41502bd2afd6..4cd725a8fe91 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -36,7 +36,7 @@
 	    ".align 4						\n"	\
 	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
 	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
-	    ANNOTATE_NOENDBR						\
+	    ANNOTATE_NOENDBR "					\n"	\
 	    insns "						\n"	\
 	    ".byte 0x0f, 0xb9, 0xcc				\n"	\
 	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e377b06e70e3..3bda5118969f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2229,7 +2229,7 @@ asm (
 "	.pushsection	.init.text, \"ax\", @progbits\n"
 "	.type		int3_selftest_asm, @function\n"
 "int3_selftest_asm:\n"
-	ANNOTATE_NOENDBR
+	ANNOTATE_NOENDBR "\n"
 	/*
 	 * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
 	 * exception, then the int3_exception_nb notifier emulates a call to
@@ -2247,7 +2247,7 @@ asm (
 "	.pushsection	.init.text, \"ax\", @progbits\n"
 "	.type		int3_selftest_callee, @function\n"
 "int3_selftest_callee:\n"
-	ANNOTATE_NOENDBR
+	ANNOTATE_NOENDBR "\n"
 "	movl	$0x1234, (%" _ASM_ARG1 ")\n"
 	ASM_RET
 "	.size		int3_selftest_callee, . - int3_selftest_callee\n"
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
index 8a1c0111ae79..85e2f2d16a90 100644
--- a/arch/x86/kernel/rethook.c
+++ b/arch/x86/kernel/rethook.c
@@ -25,7 +25,7 @@ asm(
 	".type arch_rethook_trampoline, @function\n"
 	"arch_rethook_trampoline:\n"
 #ifdef CONFIG_X86_64
-	ANNOTATE_NOENDBR	/* This is only jumped from ret instruction */
+	ANNOTATE_NOENDBR "\n"	/* This is only jumped from ret instruction */
 	/* Push a fake return address to tell the unwinder it's a rethook. */
 	"	pushq $arch_rethook_trampoline\n"
 	UNWIND_HINT_FUNC
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 2892cdb14563..61592e41a6b1 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -50,8 +50,8 @@ asm (".global __static_call_return\n\t"
      ".type __static_call_return, @function\n\t"
      ASM_FUNC_ALIGN "\n\t"
      "__static_call_return:\n\t"
-     ANNOTATE_NOENDBR
-     ANNOTATE_RETPOLINE_SAFE
+     ANNOTATE_NOENDBR "\n\t"
+     ANNOTATE_RETPOLINE_SAFE "\n\t"
      "ret; int3\n\t"
      ".size __static_call_return, . - __static_call_return \n\t");
 
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index b5a6d83106bc..512a2538596f 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -13,7 +13,7 @@ asm(
 	".globl just_return_func\n"
 	ASM_FUNC_ALIGN
 	"just_return_func:\n"
-		ANNOTATE_NOENDBR
+		ANNOTATE_NOENDBR "\n"
 		ASM_RET
 	".size just_return_func, .-just_return_func\n"
 );
diff --git a/include/linux/annotate.h b/include/linux/annotate.h
index 996126f5f9ec..5efac5d4f9cf 100644
--- a/include/linux/annotate.h
+++ b/include/linux/annotate.h
@@ -15,15 +15,15 @@
 #ifndef __ASSEMBLY__
 
 #define ASM_ANNOTATE_LABEL(label, type)					\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) "\n\t"
+	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type))
 
 #define ASM_ANNOTATE(type)						\
 	"911: "								\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) "\n\t"
+	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type))
 
 #define ASM_ANNOTATE_DATA(type)						\
 	"912: "								\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) "\n\t"
+	__stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type))
 
 #else /* __ASSEMBLY__ */
 
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index b18ab53561c9..9a00e701454c 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -12,7 +12,7 @@
 #define UNWIND_HINT(type, sp_reg, sp_offset, signal)		\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
-	ANNOTATE_DATA_SPECIAL					\
+	ANNOTATE_DATA_SPECIAL "\n\t"				\
 	/* struct unwind_hint */				\
 	".long 987b - .\n\t"					\
 	".short " __stringify(sp_offset) "\n\t"			\
-- 
cgit v1.2.3


From 2d3451ef1ef679ae496f8e335f4b1305885e8083 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 3 Dec 2025 10:07:38 -0800
Subject: objtool: Simplify .annotate_insn code generation output some more

Remove the superfluous section name quotes, and combine the longs into a
single command.

Before:

  911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 2; .popsection

After:

  911: .pushsection .discard.annotate_insn, "M", @progbits, 8; .long 911b - ., 2; .popsection

No change in functionality.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/hpsfcihgqmhcdrg7pop7z73ptymakgjq7qlxrawrjxilosk43l@xikqif3ievj4
---
 include/linux/annotate.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/annotate.h b/include/linux/annotate.h
index 5efac5d4f9cf..2f1599c9e573 100644
--- a/include/linux/annotate.h
+++ b/include/linux/annotate.h
@@ -8,33 +8,32 @@
 
 #define __ASM_ANNOTATE(section, label, type)				\
 	.pushsection section, "M", @progbits, 8;			\
-	.long label - .;						\
-	.long type;							\
+	.long label - ., type;						\
 	.popsection
 
 #ifndef __ASSEMBLY__
 
 #define ASM_ANNOTATE_LABEL(label, type)					\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type))
+	__stringify(__ASM_ANNOTATE(.discard.annotate_insn, label, type))
 
 #define ASM_ANNOTATE(type)						\
 	"911: "								\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type))
+	__stringify(__ASM_ANNOTATE(.discard.annotate_insn, 911b, type))
 
 #define ASM_ANNOTATE_DATA(type)						\
 	"912: "								\
-	__stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type))
+	__stringify(__ASM_ANNOTATE(.discard.annotate_data, 912b, type))
 
 #else /* __ASSEMBLY__ */
 
 .macro ANNOTATE type
 .Lhere_\@:
-	__ASM_ANNOTATE(".discard.annotate_insn", .Lhere_\@, \type)
+	__ASM_ANNOTATE(.discard.annotate_insn, .Lhere_\@, \type)
 .endm
 
 .macro ANNOTATE_DATA type
 .Lhere_\@:
-	__ASM_ANNOTATE(".discard.annotate_data", .Lhere_\@, \type)
+	__ASM_ANNOTATE(.discard.annotate_data, .Lhere_\@, \type)
 .endm
 
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From faf07e611dfa464b201223a7253e9dc5ee0f3c9e Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Date: Tue, 30 Sep 2025 15:58:02 +0300
Subject: tpm: Cap the number of PCR banks

tpm2_get_pcr_allocation() does not enforce any upper limit on the number of
banks. Cap the limit to eight banks so that out-of-bounds values coming
from external I/O cause only limited harm.

Cc: stable@vger.kernel.org # v5.10+
Fixes: bcfff8384f6c ("tpm: dynamically allocate the allocated_banks array")
Tested-by: Lai Yi <yi1.lai@linux.intel.com>
Reviewed-by: Jonathan McDowell <noodles@meta.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
---
 drivers/char/tpm/tpm-chip.c | 1 -
 drivers/char/tpm/tpm1-cmd.c | 5 -----
 drivers/char/tpm/tpm2-cmd.c | 8 +++-----
 include/linux/tpm.h         | 8 +++++---
 4 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 30d00219f9f3..082b910ddf0d 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -246,7 +246,6 @@ static void tpm_dev_release(struct device *dev)
 
 	kfree(chip->work_space.context_buf);
 	kfree(chip->work_space.session_buf);
-	kfree(chip->allocated_banks);
 #ifdef CONFIG_TCG_TPM2_HMAC
 	kfree(chip->auth);
 #endif
diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c
index cf64c7385105..b49a790f1bd5 100644
--- a/drivers/char/tpm/tpm1-cmd.c
+++ b/drivers/char/tpm/tpm1-cmd.c
@@ -799,11 +799,6 @@ int tpm1_pm_suspend(struct tpm_chip *chip, u32 tpm_suspend_pcr)
  */
 int tpm1_get_pcr_allocation(struct tpm_chip *chip)
 {
-	chip->allocated_banks = kcalloc(1, sizeof(*chip->allocated_banks),
-					GFP_KERNEL);
-	if (!chip->allocated_banks)
-		return -ENOMEM;
-
 	chip->allocated_banks[0].alg_id = TPM_ALG_SHA1;
 	chip->allocated_banks[0].digest_size = hash_digest_size[HASH_ALGO_SHA1];
 	chip->allocated_banks[0].crypto_id = HASH_ALGO_SHA1;
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 5532e53a2dd3..dd502322f499 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -550,11 +550,9 @@ ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip)
 
 	nr_possible_banks = be32_to_cpup(
 		(__be32 *)&buf.data[TPM_HEADER_SIZE + 5]);
-
-	chip->allocated_banks = kcalloc(nr_possible_banks,
-					sizeof(*chip->allocated_banks),
-					GFP_KERNEL);
-	if (!chip->allocated_banks) {
+	if (nr_possible_banks > TPM2_MAX_PCR_BANKS) {
+		pr_err("tpm: out of bank capacity: %u > %u\n",
+		       nr_possible_banks, TPM2_MAX_PCR_BANKS);
 		rc = -ENOMEM;
 		goto out;
 	}
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index b15360ff78d7..53de9488c509 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -26,7 +26,9 @@
 #include <crypto/aes.h>
 
 #define TPM_DIGEST_SIZE 20	/* Max TPM v1.2 PCR size */
-#define TPM_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
+
+#define TPM2_MAX_DIGEST_SIZE	SHA512_DIGEST_SIZE
+#define TPM2_MAX_PCR_BANKS	8
 
 struct tpm_chip;
 struct trusted_key_payload;
@@ -68,7 +70,7 @@ enum tpm2_curves {
 
 struct tpm_digest {
 	u16 alg_id;
-	u8 digest[TPM_MAX_DIGEST_SIZE];
+	u8 digest[TPM2_MAX_DIGEST_SIZE];
 } __packed;
 
 struct tpm_bank_info {
@@ -189,7 +191,7 @@ struct tpm_chip {
 	unsigned int groups_cnt;
 
 	u32 nr_allocated_banks;
-	struct tpm_bank_info *allocated_banks;
+	struct tpm_bank_info allocated_banks[TPM2_MAX_PCR_BANKS];
 #ifdef CONFIG_ACPI
 	acpi_handle acpi_dev_handle;
 	char ppi_version[TPM_PPI_VERSION_LEN + 1];
-- 
cgit v1.2.3


From 7fcf459ac84c42a4ef63a650dccc345602cf4da6 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Date: Tue, 30 Sep 2025 16:02:54 +0300
Subject: tpm: Use -EPERM as fallback error code in tpm_ret_to_err

Using -EFAULT as the tpm_ret_to_err() fallback error code makes it
incompatible with how trusted keys transmute TPM return codes.

Change the fallback to -EPERM in order to gain compatibility with trusted
keys. In addition, map TPM2_RC_HASH to -EINVAL in order to be compatible
with tpm2_seal_trusted() return values.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 53de9488c509..3d8f7d1ce2b8 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -456,8 +456,10 @@ static inline ssize_t tpm_ret_to_err(ssize_t ret)
 		return 0;
 	case TPM2_RC_SESSION_MEMORY:
 		return -ENOMEM;
+	case TPM2_RC_HASH:
+		return -EINVAL;
 	default:
-		return -EFAULT;
+		return -EPERM;
 	}
 }
 
-- 
cgit v1.2.3


From 5b5578c3b06eba4c256bc3a2788f5a65cd9f31ea Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Wed, 29 Oct 2025 14:31:04 +0800
Subject: f2fs: fix to access i_size w/ i_size_read()

It is recommended to use i_size_{read,write}() to access and update
i_size; otherwise, we may read a torn value, because the high 32 bits
and the low 32 bits of the i_size field are not updated atomically on
32-bit architectures.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/trace/events/f2fs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index edbbd869078f..e1fae78d64a5 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -204,7 +204,7 @@ DECLARE_EVENT_CLASS(f2fs__inode,
 		__entry->pino	= F2FS_I(inode)->i_pino;
 		__entry->mode	= inode->i_mode;
 		__entry->nlink	= inode->i_nlink;
-		__entry->size	= inode->i_size;
+		__entry->size	= i_size_read(inode);
 		__entry->blocks	= inode->i_blocks;
 		__entry->advise	= F2FS_I(inode)->i_advise;
 	),
@@ -353,7 +353,7 @@ TRACE_EVENT(f2fs_unlink_enter,
 	TP_fast_assign(
 		__entry->dev	= dir->i_sb->s_dev;
 		__entry->ino	= dir->i_ino;
-		__entry->size	= dir->i_size;
+		__entry->size	= i_size_read(dir);
 		__entry->blocks	= dir->i_blocks;
 		__assign_str(name);
 	),
@@ -433,7 +433,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op,
 	TP_fast_assign(
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
-		__entry->size	= inode->i_size;
+		__entry->size	= i_size_read(inode);
 		__entry->blocks	= inode->i_blocks;
 		__entry->from	= from;
 	),
@@ -1006,7 +1006,7 @@ TRACE_EVENT(f2fs_fallocate,
 		__entry->mode	= mode;
 		__entry->offset	= offset;
 		__entry->len	= len;
-		__entry->size	= inode->i_size;
+		__entry->size	= i_size_read(inode);
 		__entry->blocks = inode->i_blocks;
 		__entry->ret	= ret;
 	),
-- 
cgit v1.2.3


From 2e2e0d679a1fb88a960049496373f415b67f274f Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 28 Oct 2025 19:50:11 +0000
Subject: f2fs: add fadvise tracepoint

This adds a tracepoint in the fadvise call path.

Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c              |  2 ++
 include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7b966f6d40d2..d7047ca6b98d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -5288,6 +5288,8 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
 	struct inode *inode = file_inode(filp);
 	int err;
 
+	trace_f2fs_fadvise(inode, offset, len, advice);
+
 	if (advice == POSIX_FADV_SEQUENTIAL) {
 		if (S_ISFIFO(inode->i_mode))
 			return -ESPIPE;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index e1fae78d64a5..e00611ead024 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -586,6 +586,38 @@ TRACE_EVENT(f2fs_file_write_iter,
 		__entry->ret)
 );
 
+TRACE_EVENT(f2fs_fadvise,
+
+	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int advice),
+
+	TP_ARGS(inode, offset, len, advice),
+
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(loff_t, size)
+		__field(loff_t,	offset)
+		__field(loff_t,	len)
+		__field(int,	advice)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->size	= i_size_read(inode);
+		__entry->offset	= offset;
+		__entry->len	= len;
+		__entry->advice	= advice;
+	),
+
+	TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld offset:%llu, len:%llu, advise:%d",
+		show_dev_ino(__entry),
+		(unsigned long long)__entry->size,
+		__entry->offset,
+		__entry->len,
+		__entry->advice)
+);
+
 TRACE_EVENT(f2fs_map_blocks,
 	TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag,
 		 int ret),
-- 
cgit v1.2.3


From 7ee8bc3942f20964ad730871b885688ea3a2961a Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daehojeong@google.com>
Date: Tue, 11 Nov 2025 09:52:46 -0800
Subject: f2fs: revert summary entry count from 2048 to 512 in 16kb block
 support

The recent increase in the number of Segment Summary Area (SSA) entries
from 512 to 2048 was an unintentional change in logic of 16kb block
support. This commit corrects the issue.

To better utilize the space available from the erroneous 2048-entry
calculation, we are implementing a solution to share the currently
unused SSA space with neighboring segments. This enhances overall
SSA utilization without impacting the established 8MB segment size.

Fixes: d7e9a9037de2 ("f2fs: Support Block Size == Page Size")
Signed-off-by: Daeho Jeong <daehojeong@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          |   2 +
 fs/f2fs/gc.c            | 117 ++++++++++++++++++++++++++++--------------------
 fs/f2fs/recovery.c      |   2 +-
 fs/f2fs/segment.c       |  38 +++++++++++-----
 fs/f2fs/segment.h       |   8 +++-
 fs/f2fs/super.c         |  14 ++++++
 fs/f2fs/sysfs.c         |   7 +++
 include/linux/f2fs_fs.h |   5 ++-
 8 files changed, 130 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9b2777f09ed..860e9c69d3a6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -245,6 +245,7 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_COMPRESSION		0x00002000
 #define F2FS_FEATURE_RO				0x00004000
 #define F2FS_FEATURE_DEVICE_ALIAS		0x00008000
+#define F2FS_FEATURE_PACKED_SSA			0x00010000
 
 #define __F2FS_HAS_FEATURE(raw_super, mask)				\
 	((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -4704,6 +4705,7 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
 F2FS_FEATURE_FUNCS(compression, COMPRESSION);
 F2FS_FEATURE_FUNCS(readonly, RO);
 F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS);
+F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index e04aafee1f2c..384fa7e2085b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1735,7 +1735,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
 						SUM_TYPE_DATA : SUM_TYPE_NODE;
 	unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE;
-	int submitted = 0;
+	int submitted = 0, sum_blk_cnt;
 
 	if (__is_large_section(sbi)) {
 		sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
@@ -1769,22 +1769,28 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 
 	sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
 
+	segno = rounddown(segno, SUMS_PER_BLOCK);
+	sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK);
 	/* readahead multi ssa blocks those have contiguous address */
 	if (__is_large_section(sbi))
 		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
-					end_segno - segno, META_SSA, true);
+					sum_blk_cnt, META_SSA, true);
 
 	/* reference all summary page */
 	while (segno < end_segno) {
-		struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno++);
+		struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno);
+
+		segno += SUMS_PER_BLOCK;
 		if (IS_ERR(sum_folio)) {
 			int err = PTR_ERR(sum_folio);
 
-			end_segno = segno - 1;
-			for (segno = start_segno; segno < end_segno; segno++) {
+			end_segno = segno - SUMS_PER_BLOCK;
+			segno = rounddown(start_segno, SUMS_PER_BLOCK);
+			while (segno < end_segno) {
 				sum_folio = filemap_get_folio(META_MAPPING(sbi),
 						GET_SUM_BLOCK(sbi, segno));
 				folio_put_refs(sum_folio, 2);
+				segno += SUMS_PER_BLOCK;
 			}
 			return err;
 		}
@@ -1793,68 +1799,83 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 
 	blk_start_plug(&plug);
 
-	for (segno = start_segno; segno < end_segno; segno++) {
-		struct f2fs_summary_block *sum;
+	segno = start_segno;
+	while (segno < end_segno) {
+		unsigned int cur_segno;
 
 		/* find segment summary of victim */
 		struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
 					GET_SUM_BLOCK(sbi, segno));
+		unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK)
+					+ SUMS_PER_BLOCK;
+
+		if (block_end_segno > end_segno)
+			block_end_segno = end_segno;
 
 		if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) {
 			f2fs_err(sbi, "%s: segment %u is used by log",
 							__func__, segno);
 			f2fs_bug_on(sbi, 1);
-			goto skip;
+			goto next_block;
 		}
 
-		if (get_valid_blocks(sbi, segno, false) == 0)
-			goto freed;
-		if (gc_type == BG_GC && __is_large_section(sbi) &&
-				migrated >= sbi->migration_granularity)
-			goto skip;
 		if (!folio_test_uptodate(sum_folio) ||
 		    unlikely(f2fs_cp_error(sbi)))
-			goto skip;
+			goto next_block;
 
-		sum = folio_address(sum_folio);
-		if (type != GET_SUM_TYPE((&sum->footer))) {
-			f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA",
-				 segno, type, GET_SUM_TYPE((&sum->footer)));
-			f2fs_stop_checkpoint(sbi, false,
-				STOP_CP_REASON_CORRUPTED_SUMMARY);
-			goto skip;
-		}
+		for (cur_segno = segno; cur_segno < block_end_segno;
+				cur_segno++) {
+			struct f2fs_summary_block *sum;
 
-		/*
-		 * this is to avoid deadlock:
-		 * - lock_page(sum_page)         - f2fs_replace_block
-		 *  - check_valid_map()            - down_write(sentry_lock)
-		 *   - down_read(sentry_lock)     - change_curseg()
-		 *                                  - lock_page(sum_page)
-		 */
-		if (type == SUM_TYPE_NODE)
-			submitted += gc_node_segment(sbi, sum->entries, segno,
-								gc_type);
-		else
-			submitted += gc_data_segment(sbi, sum->entries, gc_list,
-							segno, gc_type,
-							force_migrate);
+			if (get_valid_blocks(sbi, cur_segno, false) == 0)
+				goto freed;
+			if (gc_type == BG_GC && __is_large_section(sbi) &&
+					migrated >= sbi->migration_granularity)
+				continue;
 
-		stat_inc_gc_seg_count(sbi, data_type, gc_type);
-		sbi->gc_reclaimed_segs[sbi->gc_mode]++;
-		migrated++;
+			sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno);
+			if (type != GET_SUM_TYPE((&sum->footer))) {
+				f2fs_err(sbi, "Inconsistent segment (%u) type "
+						"[%d, %d] in SSA and SIT",
+						cur_segno, type,
+						GET_SUM_TYPE((&sum->footer)));
+				f2fs_stop_checkpoint(sbi, false,
+						STOP_CP_REASON_CORRUPTED_SUMMARY);
+				continue;
+			}
 
-freed:
-		if (gc_type == FG_GC &&
-				get_valid_blocks(sbi, segno, false) == 0)
-			seg_freed++;
+			/*
+			 * this is to avoid deadlock:
+			 *  - lock_page(sum_page)     - f2fs_replace_block
+			 *   - check_valid_map()        - down_write(sentry_lock)
+			 *    - down_read(sentry_lock) - change_curseg()
+			 *                               - lock_page(sum_page)
+			 */
+			if (type == SUM_TYPE_NODE)
+				submitted += gc_node_segment(sbi, sum->entries,
+						cur_segno, gc_type);
+			else
+				submitted += gc_data_segment(sbi, sum->entries,
+						gc_list, cur_segno,
+						gc_type, force_migrate);
 
-		if (__is_large_section(sbi))
-			sbi->next_victim_seg[gc_type] =
-				(segno + 1 < sec_end_segno) ?
-					segno + 1 : NULL_SEGNO;
-skip:
+			stat_inc_gc_seg_count(sbi, data_type, gc_type);
+			sbi->gc_reclaimed_segs[sbi->gc_mode]++;
+			migrated++;
+
+freed:
+			if (gc_type == FG_GC &&
+					get_valid_blocks(sbi, cur_segno, false) == 0)
+				seg_freed++;
+
+			if (__is_large_section(sbi))
+				sbi->next_victim_seg[gc_type] =
+					(cur_segno + 1 < sec_end_segno) ?
+					cur_segno + 1 : NULL_SEGNO;
+		}
+next_block:
 		folio_put_refs(sum_folio, 2);
+		segno = block_end_segno;
 	}
 
 	if (submitted)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d7faebaa3c6b..62a0c71b5b75 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -522,7 +522,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
 	sum_folio = f2fs_get_sum_folio(sbi, segno);
 	if (IS_ERR(sum_folio))
 		return PTR_ERR(sum_folio);
-	sum_node = folio_address(sum_folio);
+	sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno);
 	sum = sum_node->entries[blkoff];
 	f2fs_folio_put(sum_folio, true);
 got_it:
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a473cd1fb37d..10d873d1b328 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2712,7 +2712,15 @@ struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno)
 void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
 					void *src, block_t blk_addr)
 {
-	struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
+	struct folio *folio;
+
+	if (SUMS_PER_BLOCK == 1)
+		folio = f2fs_grab_meta_folio(sbi, blk_addr);
+	else
+		folio = f2fs_get_meta_folio_retry(sbi, blk_addr);
+
+	if (IS_ERR(folio))
+		return;
 
 	memcpy(folio_address(folio), src, PAGE_SIZE);
 	folio_mark_dirty(folio);
@@ -2720,9 +2728,21 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
 }
 
 static void write_sum_page(struct f2fs_sb_info *sbi,
-			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+		struct f2fs_summary_block *sum_blk, unsigned int segno)
 {
-	f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr);
+	struct folio *folio;
+
+	if (SUMS_PER_BLOCK == 1)
+		return f2fs_update_meta_page(sbi, (void *)sum_blk,
+				GET_SUM_BLOCK(sbi, segno));
+
+	folio = f2fs_get_sum_folio(sbi, segno);
+	if (IS_ERR(folio))
+		return;
+
+	memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk));
+	folio_mark_dirty(folio);
+	f2fs_folio_put(folio, true);
 }
 
 static void write_current_sum_page(struct f2fs_sb_info *sbi,
@@ -2987,7 +3007,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
 	int ret;
 
 	if (curseg->inited)
-		write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno));
+		write_sum_page(sbi, curseg->sum_blk, segno);
 
 	segno = __get_next_segno(sbi, type);
 	ret = get_new_segment(sbi, &segno, new_sec, pinning);
@@ -3046,7 +3066,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
 	struct folio *sum_folio;
 
 	if (curseg->inited)
-		write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
+		write_sum_page(sbi, curseg->sum_blk, curseg->segno);
 
 	__set_test_and_inuse(sbi, new_segno);
 
@@ -3065,7 +3085,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
 		memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
 		return PTR_ERR(sum_folio);
 	}
-	sum_node = folio_address(sum_folio);
+	sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno);
 	memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
 	f2fs_folio_put(sum_folio, true);
 	return 0;
@@ -3154,8 +3174,7 @@ static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
 		goto out;
 
 	if (get_valid_blocks(sbi, curseg->segno, false)) {
-		write_sum_page(sbi, curseg->sum_blk,
-				GET_SUM_BLOCK(sbi, curseg->segno));
+		write_sum_page(sbi, curseg->sum_blk, curseg->segno);
 	} else {
 		mutex_lock(&DIRTY_I(sbi)->seglist_lock);
 		__set_test_and_free(sbi, curseg->segno, true);
@@ -3833,8 +3852,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
 	if (segment_full) {
 		if (type == CURSEG_COLD_DATA_PINNED &&
 		    !((curseg->segno + 1) % sbi->segs_per_sec)) {
-			write_sum_page(sbi, curseg->sum_blk,
-					GET_SUM_BLOCK(sbi, curseg->segno));
+			write_sum_page(sbi, curseg->sum_blk, curseg->segno);
 			reset_curseg_fields(curseg);
 			goto skip_new_segment;
 		}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 1ce2c8abaf48..e883f14c228f 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -85,8 +85,12 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
 #define GET_ZONE_FROM_SEG(sbi, segno)				\
 	GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
 
-#define GET_SUM_BLOCK(sbi, segno)				\
-	((sbi)->sm_info->ssa_blkaddr + (segno))
+#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE)
+#define GET_SUM_BLOCK(sbi, segno)	\
+	(SM_I(sbi)->ssa_blkaddr + (segno / SUMS_PER_BLOCK))
+#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK)
+#define SUM_BLK_PAGE_ADDR(folio, segno)	\
+	(folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE)
 
 #define GET_SUM_TYPE(footer) ((footer)->entry_type)
 #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8cf98c40b160..c2161b3469b3 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4080,6 +4080,20 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 	if (sanity_check_area_boundary(sbi, folio, index))
 		return -EFSCORRUPTED;
 
+	/*
+	 * Check for legacy summary layout on 16KB+ block devices.
+	 * Modern f2fs-tools packs multiple 4KB summary areas into one block,
+	 * whereas legacy versions used one block per summary, leading
+	 * to a much larger SSA.
+	 */
+	if (SUMS_PER_BLOCK > 1 &&
+		    !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) {
+		f2fs_info(sbi, "Error: Device formatted with a legacy version. "
+			"Please reformat with a tool supporting the packed ssa "
+			"feature for block sizes larger than 4kb.");
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 6d2a4fba68a2..5685b454bfd1 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -235,6 +235,9 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_compression(sbi))
 		len += sysfs_emit_at(buf, len, "%s%s",
 				len ? ", " : "", "compression");
+	if (f2fs_sb_has_packed_ssa(sbi))
+		len += sysfs_emit_at(buf, len, "%s%s",
+				len ? ", " : "", "packed_ssa");
 	len += sysfs_emit_at(buf, len, "%s%s",
 				len ? ", " : "", "pin_file");
 	len += sysfs_emit_at(buf, len, "\n");
@@ -1296,6 +1299,7 @@ F2FS_FEATURE_RO_ATTR(pin_file);
 #ifdef CONFIG_UNICODE
 F2FS_FEATURE_RO_ATTR(linear_lookup);
 #endif
+F2FS_FEATURE_RO_ATTR(packed_ssa);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -1455,6 +1459,7 @@ static struct attribute *f2fs_feat_attrs[] = {
 #ifdef CONFIG_UNICODE
 	BASE_ATTR_LIST(linear_lookup),
 #endif
+	BASE_ATTR_LIST(packed_ssa),
 	NULL,
 };
 ATTRIBUTE_GROUPS(f2fs_feat);
@@ -1490,6 +1495,7 @@ F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
 F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
 F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
 F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS);
+F2FS_SB_FEATURE_RO_ATTR(packed_ssa, PACKED_SSA);
 
 static struct attribute *f2fs_sb_feat_attrs[] = {
 	ATTR_LIST(sb_encryption),
@@ -1507,6 +1513,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
 	ATTR_LIST(sb_compression),
 	ATTR_LIST(sb_readonly),
 	ATTR_LIST(sb_device_alias),
+	ATTR_LIST(sb_packed_ssa),
 	NULL,
 };
 ATTRIBUTE_GROUPS(f2fs_sb_feat);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 6afb4a13b81d..a7880787cad3 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -17,6 +17,7 @@
 #define F2FS_LOG_SECTORS_PER_BLOCK	(PAGE_SHIFT - 9) /* log number for sector/blk */
 #define F2FS_BLKSIZE			PAGE_SIZE /* support only block == page */
 #define F2FS_BLKSIZE_BITS		PAGE_SHIFT /* bits for F2FS_BLKSIZE */
+#define F2FS_SUM_BLKSIZE		4096	/* only support 4096 byte sum block */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 #define F2FS_EXTENSION_LEN		8	/* max size of extension */
 
@@ -441,7 +442,7 @@ struct f2fs_sit_block {
  * from node's page's beginning to get a data block address.
  * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
  */
-#define ENTRIES_IN_SUM		(F2FS_BLKSIZE / 8)
+#define ENTRIES_IN_SUM		(F2FS_SUM_BLKSIZE / 8)
 #define	SUMMARY_SIZE		(7)	/* sizeof(struct f2fs_summary) */
 #define	SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
 #define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
@@ -467,7 +468,7 @@ struct summary_footer {
 	__le32 check_sum;		/* summary checksum */
 } __packed;
 
-#define SUM_JOURNAL_SIZE	(F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
+#define SUM_JOURNAL_SIZE	(F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\
 				SUM_ENTRY_SIZE)
 #define NAT_JOURNAL_ENTRIES	((SUM_JOURNAL_SIZE - 2) /\
 				sizeof(struct nat_journal_entry))
-- 
cgit v1.2.3


From 8d1cb17aca466b361cca17834b8bb1cf3e3d1818 Mon Sep 17 00:00:00 2001
From: YH Lin <yhli@google.com>
Date: Fri, 28 Nov 2025 11:23:57 +0800
Subject: f2fs: optimize trace_f2fs_write_checkpoint with enums

This patch optimizes the tracepoint by replacing its hardcoded message
strings with a new enumeration, f2fs_cp_phase.

1. Defines enum f2fs_cp_phase with values for each checkpoint phase.
2. Updates trace_f2fs_write_checkpoint to accept a u16 phase argument
   instead of a string pointer.
3. Uses __print_symbolic in TP_printk to convert the enum values
   back to their corresponding strings for human-readable trace output.

This change reduces the storage overhead for each trace event
by replacing a variable-length string with a 2-byte integer,
while maintaining the same readable output in ftrace.

Signed-off-by: YH Lin <yhli@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c        |  6 +++---
 fs/f2fs/f2fs.h              |  6 ++++++
 include/trace/events/f2fs.h | 19 ++++++++++++++-----
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 4c401b5b2933..300664269eb6 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1673,7 +1673,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		goto out;
 	}
 
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS);
 
 	err = block_operations(sbi);
 	if (err)
@@ -1681,7 +1681,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	stat_cp_time(cpc, CP_TIME_OP_LOCK);
 
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS);
 
 	f2fs_flush_merged_writes(sbi);
 
@@ -1747,7 +1747,7 @@ stop:
 
 	/* update CP_TIME to trigger checkpoint periodically */
 	f2fs_update_time(sbi, CP_TIME);
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT);
 out:
 	if (cpc->reason != CP_RESIZE)
 		f2fs_up_write(&sbi->cp_global_sem);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 007195a1d4eb..20edbb99b814 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -319,6 +319,12 @@ struct cp_control {
 	struct cp_stats stats;
 };
 
+enum f2fs_cp_phase {
+	CP_PHASE_START_BLOCK_OPS,
+	CP_PHASE_FINISH_BLOCK_OPS,
+	CP_PHASE_FINISH_CHECKPOINT,
+};
+
 /*
  * indicate meta/data type
  */
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index e00611ead024..df4017dcc701 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -50,6 +50,9 @@ TRACE_DEFINE_ENUM(CP_PAUSE);
 TRACE_DEFINE_ENUM(CP_RESIZE);
 TRACE_DEFINE_ENUM(EX_READ);
 TRACE_DEFINE_ENUM(EX_BLOCK_AGE);
+TRACE_DEFINE_ENUM(CP_PHASE_START_BLOCK_OPS);
+TRACE_DEFINE_ENUM(CP_PHASE_FINISH_BLOCK_OPS);
+TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
 
 #define show_block_type(type)						\
 	__print_symbolic(type,						\
@@ -175,6 +178,12 @@ TRACE_DEFINE_ENUM(EX_BLOCK_AGE);
 #define S_ALL_PERM	(S_ISUID | S_ISGID | S_ISVTX |	\
 			S_IRWXU | S_IRWXG | S_IRWXO)
 
+#define show_cp_phase(phase)					\
+	__print_symbolic(phase,						\
+		{ CP_PHASE_START_BLOCK_OPS,		"start block_ops" },			\
+		{ CP_PHASE_FINISH_BLOCK_OPS,	"finish block_ops" },			\
+		{ CP_PHASE_FINISH_CHECKPOINT,	"finish checkpoint" })
+
 struct f2fs_sb_info;
 struct f2fs_io_info;
 struct extent_info;
@@ -1573,26 +1582,26 @@ TRACE_EVENT(f2fs_readpages,
 
 TRACE_EVENT(f2fs_write_checkpoint,
 
-	TP_PROTO(struct super_block *sb, int reason, const char *msg),
+	TP_PROTO(struct super_block *sb, int reason, u16 phase),
 
-	TP_ARGS(sb, reason, msg),
+	TP_ARGS(sb, reason, phase),
 
 	TP_STRUCT__entry(
 		__field(dev_t,	dev)
 		__field(int,	reason)
-		__string(dest_msg, msg)
+		__field(u16, phase)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= sb->s_dev;
 		__entry->reason		= reason;
-		__assign_str(dest_msg);
+		__entry->phase		= phase;
 	),
 
 	TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
 		show_dev(__entry->dev),
 		show_cpreason(__entry->reason),
-		__get_str(dest_msg))
+		show_cp_phase(__entry->phase))
 );
 
 DECLARE_EVENT_CLASS(f2fs_discard,
-- 
cgit v1.2.3


From f345be751b961ce91e0b883345eaa1d0993a4949 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Tue, 2 Dec 2025 11:21:31 -0700
Subject: io_uring/trace: rename io_uring_queue_async_work event "rw" field

The io_uring_queue_async_work tracepoint event stores an int rw field
that represents whether the work item is hashed. Rename it to "hashed"
and change its type to bool to more accurately reflect its value.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 45d15460b495..34b31a855ea4 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @req:	pointer to a submitted request
- * @rw:		type of workqueue, hashed or normal
+ * @hashed:	whether async work is hashed
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(struct io_kiocb *req, int rw),
+	TP_PROTO(struct io_kiocb *req, bool hashed),
 
-	TP_ARGS(req, rw),
+	TP_ARGS(req, hashed),
 
 	TP_STRUCT__entry (
 		__field(  void *,			ctx		)
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  u8,				opcode		)
 		__field(  unsigned long long,		flags		)
 		__field(  struct io_wq_work *,		work		)
-		__field(  int,				rw		)
+		__field(  bool,				hashed		)
 
 		__string( op_str, io_uring_get_opcode(req->opcode)	)
 	),
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
-		__entry->rw		= rw;
+		__entry->hashed		= hashed;
 
 		__assign_str(op_str);
 	),
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
 		__get_str(op_str), __entry->flags,
-		__entry->rw ? "hashed" : "normal", __entry->work)
+		__entry->hashed ? "hashed" : "normal", __entry->work)
 );
 
 /**
-- 
cgit v1.2.3


From 22a1ffea5f805dfa21b64d1c7b5fe39c0c78c997 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 1 Dec 2025 16:43:28 -0500
Subject: block: add IOC_PR_READ_KEYS ioctl

Add a Persistent Reservations ioctl to read the list of currently
registered reservation keys. This calls the pr_ops->read_keys() function
that was previously added in commit c787f1baa503 ("block: Add PR
callouts for read keys and reservation") but was only used by the
in-kernel SCSI target so far.

The IOC_PR_READ_KEYS ioctl is necessary so that userspace applications
that rely on Persistent Reservations ioctls have a way of inspecting the
current state. Cluster managers and validation tests need this
functionality.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioctl.c           | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pr.h |  7 +++++++
 2 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/block/ioctl.c b/block/ioctl.c
index 2b3ab9bfc413..c0802ebf54a6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -423,6 +423,60 @@ static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode,
 	return ops->pr_clear(bdev, c.key);
 }
 
+static int blkdev_pr_read_keys(struct block_device *bdev, blk_mode_t mode,
+		struct pr_read_keys __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_keys *keys_info;
+	struct pr_read_keys read_keys;
+	u64 __user *keys_ptr;
+	size_t keys_info_len;
+	size_t keys_copy_len;
+	int ret;
+
+	if (!blkdev_pr_allowed(bdev, mode))
+		return -EPERM;
+	if (!ops || !ops->pr_read_keys)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&read_keys, arg, sizeof(read_keys)))
+		return -EFAULT;
+
+	keys_info_len = struct_size(keys_info, keys, read_keys.num_keys);
+	if (keys_info_len == SIZE_MAX)
+		return -EINVAL;
+
+	keys_info = kzalloc(keys_info_len, GFP_KERNEL);
+	if (!keys_info)
+		return -ENOMEM;
+
+	keys_info->num_keys = read_keys.num_keys;
+
+	ret = ops->pr_read_keys(bdev, keys_info);
+	if (ret)
+		goto out;
+
+	/* Copy out individual keys */
+	keys_ptr = u64_to_user_ptr(read_keys.keys_ptr);
+	keys_copy_len = min(read_keys.num_keys, keys_info->num_keys) *
+		        sizeof(keys_info->keys[0]);
+
+	if (copy_to_user(keys_ptr, keys_info->keys, keys_copy_len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Copy out the arg struct */
+	read_keys.generation = keys_info->generation;
+	read_keys.num_keys = keys_info->num_keys;
+
+	if (copy_to_user(arg, &read_keys, sizeof(read_keys)))
+		ret = -EFAULT;
+out:
+	kfree(keys_info);
+	return ret;
+}
+
 static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
 		unsigned long arg)
 {
@@ -645,6 +699,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 		return blkdev_pr_preempt(bdev, mode, argp, true);
 	case IOC_PR_CLEAR:
 		return blkdev_pr_clear(bdev, mode, argp);
+	case IOC_PR_READ_KEYS:
+		return blkdev_pr_read_keys(bdev, mode, argp);
 	default:
 		return blk_get_meta_cap(bdev, cmd, argp);
 	}
diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h
index d8126415966f..fcb74eab92c8 100644
--- a/include/uapi/linux/pr.h
+++ b/include/uapi/linux/pr.h
@@ -56,6 +56,12 @@ struct pr_clear {
 	__u32	__pad;
 };
 
+struct pr_read_keys {
+	__u32	generation;
+	__u32	num_keys;
+	__u64	keys_ptr;
+};
+
 #define PR_FL_IGNORE_KEY	(1 << 0)	/* ignore existing key */
 
 #define IOC_PR_REGISTER		_IOW('p', 200, struct pr_registration)
@@ -64,5 +70,6 @@ struct pr_clear {
 #define IOC_PR_PREEMPT		_IOW('p', 203, struct pr_preempt)
 #define IOC_PR_PREEMPT_ABORT	_IOW('p', 204, struct pr_preempt)
 #define IOC_PR_CLEAR		_IOW('p', 205, struct pr_clear)
+#define IOC_PR_READ_KEYS	_IOWR('p', 206, struct pr_read_keys)
 
 #endif /* _UAPI_PR_H */
-- 
cgit v1.2.3


From 3e2cb9ee76c27f57bfdb7b4753b909594d4fa31a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 1 Dec 2025 16:43:29 -0500
Subject: block: add IOC_PR_READ_RESERVATION ioctl

Add a Persistent Reservations ioctl to read the current reservation.
This calls the pr_ops->read_reservation() function that was previously
added in commit c787f1baa503 ("block: Add PR callouts for read keys and
reservation") but was only used by the in-kernel SCSI target so far.

The IOC_PR_READ_RESERVATION ioctl is necessary so that userspace
applications that rely on Persistent Reservations ioctls have a way of
inspecting the current state. Cluster managers and validation tests need
this functionality.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioctl.c           | 28 ++++++++++++++++++++++++++++
 include/uapi/linux/pr.h |  7 +++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/block/ioctl.c b/block/ioctl.c
index c0802ebf54a6..61feed686418 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -477,6 +477,32 @@ out:
 	return ret;
 }
 
+static int blkdev_pr_read_reservation(struct block_device *bdev,
+		blk_mode_t mode, struct pr_read_reservation __user *arg)
+{
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	struct pr_held_reservation rsv = {};
+	struct pr_read_reservation out = {};
+	int ret;
+
+	if (!blkdev_pr_allowed(bdev, mode))
+		return -EPERM;
+	if (!ops || !ops->pr_read_reservation)
+		return -EOPNOTSUPP;
+
+	ret = ops->pr_read_reservation(bdev, &rsv);
+	if (ret)
+		return ret;
+
+	out.key = rsv.key;
+	out.generation = rsv.generation;
+	out.type = rsv.type;
+
+	if (copy_to_user(arg, &out, sizeof(out)))
+		return -EFAULT;
+	return 0;
+}
+
 static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
 		unsigned long arg)
 {
@@ -701,6 +727,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 		return blkdev_pr_clear(bdev, mode, argp);
 	case IOC_PR_READ_KEYS:
 		return blkdev_pr_read_keys(bdev, mode, argp);
+	case IOC_PR_READ_RESERVATION:
+		return blkdev_pr_read_reservation(bdev, mode, argp);
 	default:
 		return blk_get_meta_cap(bdev, cmd, argp);
 	}
diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h
index fcb74eab92c8..847f3051057a 100644
--- a/include/uapi/linux/pr.h
+++ b/include/uapi/linux/pr.h
@@ -62,6 +62,12 @@ struct pr_read_keys {
 	__u64	keys_ptr;
 };
 
+struct pr_read_reservation {
+	__u64	key;
+	__u32	generation;
+	__u32	type;
+};
+
 #define PR_FL_IGNORE_KEY	(1 << 0)	/* ignore existing key */
 
 #define IOC_PR_REGISTER		_IOW('p', 200, struct pr_registration)
@@ -71,5 +77,6 @@ struct pr_read_keys {
 #define IOC_PR_PREEMPT_ABORT	_IOW('p', 204, struct pr_preempt)
 #define IOC_PR_CLEAR		_IOW('p', 205, struct pr_clear)
 #define IOC_PR_READ_KEYS	_IOWR('p', 206, struct pr_read_keys)
+#define IOC_PR_READ_RESERVATION	_IOR('p', 207, struct pr_read_reservation)
 
 #endif /* _UAPI_PR_H */
-- 
cgit v1.2.3


From 71075d25ca5cae732fb57da065fbf14aeb3bcfc7 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Date: Tue, 2 Dec 2025 19:58:09 -0800
Subject: blk-mq: add blk_rq_nr_bvec() helper

Add a new helper function blk_rq_nr_bvec() that returns the number of
bvecs in a request. This count represents the number of iterations
rq_for_each_bvec() would perform on a request.

Drivers need to pre-allocate bvec arrays before iterating through
a request's bvecs. Currently, they manually count bvecs using
rq_for_each_bvec() in a loop, which is repetitive. The new helper
centralizes this logic.

This pattern exists in loop and zloop drivers, where multi-bio requests
require copying bvecs into a contiguous array before creating
an iov_iter for file operations.

Update loop and zloop drivers to use the new helper, eliminating
duplicate code.

This patch also provides a clear API to avoid any potential misuse of
blk_rq_nr_phys_segments() for counting bvecs, since one bvec can span
more than one segment and using blk_rq_nr_phys_segments() can lead to
extra memory allocation:

[ 6155.673749] nullb_bio: 128K bio as ONE bvec: sector=0, size=131072
[ 6155.673846] null_blk: #### null_handle_data_transfer:1375
[ 6155.673850] null_blk: nr_bvec=1 blk_rq_nr_phys_segments=2
[ 6155.674263] null_blk: #### null_handle_data_transfer:1375
[ 6155.674267] null_blk: nr_bvec=1 blk_rq_nr_phys_segments=1

Reviewed-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c   |  5 ++---
 drivers/block/zloop.c  |  5 ++---
 include/linux/blk-mq.h | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ebe751f39742..272bc608e528 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -348,11 +348,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	struct file *file = lo->lo_backing_file;
 	struct bio_vec tmp;
 	unsigned int offset;
-	int nr_bvec = 0;
+	unsigned int nr_bvec;
 	int ret;
 
-	rq_for_each_bvec(tmp, rq, rq_iter)
-		nr_bvec++;
+	nr_bvec = blk_rq_nr_bvec(rq);
 
 	if (rq->bio != rq->biotail) {
 
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 3f50321aa4a7..77bd6081b244 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -394,7 +394,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	struct bio_vec tmp;
 	unsigned long flags;
 	sector_t zone_end;
-	int nr_bvec = 0;
+	unsigned int nr_bvec;
 	int ret;
 
 	atomic_set(&cmd->ref, 2);
@@ -487,8 +487,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
 		spin_unlock_irqrestore(&zone->wp_lock, flags);
 	}
 
-	rq_for_each_bvec(tmp, rq, rq_iter)
-		nr_bvec++;
+	nr_bvec = blk_rq_nr_bvec(rq);
 
 	if (rq->bio != rq->biotail) {
 		struct bio_vec *bvec;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index eb7254b3dddd..cae9e857aea4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1213,6 +1213,24 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
 	return max_t(unsigned short, rq->nr_phys_segments, 1);
 }
 
+/**
+ * blk_rq_nr_bvec - return number of bvecs in a request
+ * @rq: request to calculate bvecs for
+ *
+ * Returns the number of bvecs.
+ */
+static inline unsigned int blk_rq_nr_bvec(struct request *rq)
+{
+	struct req_iterator rq_iter;
+	struct bio_vec bv;
+	unsigned int nr_bvec = 0;
+
+	rq_for_each_bvec(bv, rq, rq_iter)
+		nr_bvec++;
+
+	return nr_bvec;
+}
+
 int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
 		struct scatterlist **last_sg);
 static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
-- 
cgit v1.2.3


From 41f7351fc47283822c4b70b0f42741f52cc1e6f6 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Tue, 2 Dec 2025 11:30:25 -0800
Subject: PM: runtime: Make pm_runtime_barrier() return void

No callers check the return code, and that's a good thing. Doing so
would be racy and unhelpful.

Drop the return code entirely, so we don't make anyone think about its
complexities.

Signed-off-by: Brian Norris <briannorris@chromium.org>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Link: https://patch.msgid.link/20251202193129.1411419-2-briannorris@chromium.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst |  6 ++----
 drivers/base/power/runtime.c       | 14 ++------------
 include/linux/pm_runtime.h         |  4 ++--
 3 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 8246df3cecd7..455b9d135d85 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -443,13 +443,11 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
       necessary to execute the subsystem-level resume callback for the device
       to satisfy that request, otherwise 0 is returned
 
-  `int pm_runtime_barrier(struct device *dev);`
+  `void pm_runtime_barrier(struct device *dev);`
     - check if there's a resume request pending for the device and resume it
       (synchronously) in that case, cancel any other pending runtime PM requests
       regarding it and wait for all runtime PM operations on it in progress to
-      complete; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device to
-      satisfy that request, otherwise 0 is returned
+      complete
 
   `void pm_suspend_ignore_children(struct device *dev, bool enable);`
     - set/unset the power.ignore_children flag of the device
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 62707738caa4..84676cc24221 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1467,30 +1467,20 @@ static void __pm_runtime_barrier(struct device *dev)
  * Next, make sure that all pending requests for the device have been flushed
  * from pm_wq and wait for all runtime PM operations involving the device in
  * progress to complete.
- *
- * Return value:
- * 1, if there was a resume request pending and the device had to be woken up,
- * 0, otherwise
  */
-int pm_runtime_barrier(struct device *dev)
+void pm_runtime_barrier(struct device *dev)
 {
-	int retval = 0;
-
 	pm_runtime_get_noresume(dev);
 	spin_lock_irq(&dev->power.lock);
 
 	if (dev->power.request_pending
-	    && dev->power.request == RPM_REQ_RESUME) {
+	    && dev->power.request == RPM_REQ_RESUME)
 		rpm_resume(dev, 0);
-		retval = 1;
-	}
 
 	__pm_runtime_barrier(dev);
 
 	spin_unlock_irq(&dev->power.lock);
 	pm_runtime_put_noidle(dev);
-
-	return retval;
 }
 EXPORT_SYMBOL_GPL(pm_runtime_barrier);
 
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 911d7a4d32c1..41037c513f06 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -76,7 +76,7 @@ extern int pm_runtime_get_if_active(struct device *dev);
 extern int pm_runtime_get_if_in_use(struct device *dev);
 extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
 extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
-extern int pm_runtime_barrier(struct device *dev);
+extern void pm_runtime_barrier(struct device *dev);
 extern bool pm_runtime_block_if_disabled(struct device *dev);
 extern void pm_runtime_unblock(struct device *dev);
 extern void pm_runtime_enable(struct device *dev);
@@ -284,7 +284,7 @@ static inline int pm_runtime_get_if_active(struct device *dev)
 }
 static inline int __pm_runtime_set_status(struct device *dev,
 					    unsigned int status) { return 0; }
-static inline int pm_runtime_barrier(struct device *dev) { return 0; }
+static inline void pm_runtime_barrier(struct device *dev) {}
 static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; }
 static inline void pm_runtime_unblock(struct device *dev) {}
 static inline void pm_runtime_enable(struct device *dev) {}
-- 
cgit v1.2.3


From 8a32282175c964eb15638e8dfe199fc13c060f67 Mon Sep 17 00:00:00 2001
From: shechenglong <shechenglong@xfusion.com>
Date: Wed, 3 Dec 2025 23:17:49 +0800
Subject: block: fix comment for op_is_zone_mgmt() to include RESET_ALL

REQ_OP_ZONE_RESET_ALL is a zone management request, and op_is_zone_mgmt()
already returns true for it.

Update the comment to remove the misleading exception note so
the documentation matches the implementation.

Fixes: 12a1c9353c47 ("block: fix op_is_zone_mgmt() to handle REQ_OP_ZONE_RESET_ALL")
Signed-off-by: shechenglong <shechenglong@xfusion.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cbbcb9051ec3..5dc061d318a4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -479,10 +479,7 @@ static inline bool op_is_discard(blk_opf_t op)
 }
 
 /*
- * Check if a bio or request operation is a zone management operation, with
- * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
- * due to its different handling in the block layer and device response in
- * case of command failure.
+ * Check if a bio or request operation is a zone management operation.
  */
 static inline bool op_is_zone_mgmt(enum req_op op)
 {
-- 
cgit v1.2.3


From 6e9722e9a7bfe1bbad649937c811076acf86e1fd Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Sun, 30 Nov 2025 21:07:12 +0200
Subject: tpm2-sessions: Fix out of range indexing in name_size

'name_size' does not have any range checks, and it just directly indexes
with TPM_ALG_ID, which could lead to memory corruption at worst.

Address the issue by only processing known values and returning -EINVAL for
unrecognized values.

Also make 'tpm_buf_append_name' and 'tpm_buf_fill_hmac_session' fallible so
that errors are detected before causing any spurious TPM traffic.

Also end the authorization session on failure in both functions, as the
session state would then by definition be corrupted.

Cc: stable@vger.kernel.org # v6.10+
Fixes: 1085b8276bb4 ("tpm: Add the rest of the session HMAC API")
Reviewed-by: Jonathan McDowell <noodles@meta.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm2-cmd.c               |  23 +++++-
 drivers/char/tpm/tpm2-sessions.c          | 132 +++++++++++++++++++++---------
 include/linux/tpm.h                       |  13 +--
 security/keys/trusted-keys/trusted_tpm2.c |  29 +++++--
 4 files changed, 142 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index dd502322f499..be4a9c7f2e1a 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -199,7 +199,11 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 	}
 
 	if (!disable_pcr_integrity) {
-		tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
+		rc = tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
+		if (rc) {
+			tpm_buf_destroy(&buf);
+			return rc;
+		}
 		tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
 	} else {
 		tpm_buf_append_handle(chip, &buf, pcr_idx);
@@ -214,8 +218,14 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 			       chip->allocated_banks[i].digest_size);
 	}
 
-	if (!disable_pcr_integrity)
-		tpm_buf_fill_hmac_session(chip, &buf);
+	if (!disable_pcr_integrity) {
+		rc = tpm_buf_fill_hmac_session(chip, &buf);
+		if (rc) {
+			tpm_buf_destroy(&buf);
+			return rc;
+		}
+	}
+
 	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting extend a PCR value");
 	if (!disable_pcr_integrity)
 		rc = tpm_buf_check_hmac_response(chip, &buf, rc);
@@ -273,7 +283,12 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max)
 						| TPM2_SA_CONTINUE_SESSION,
 						NULL, 0);
 		tpm_buf_append_u16(&buf, num_bytes);
-		tpm_buf_fill_hmac_session(chip, &buf);
+		err = tpm_buf_fill_hmac_session(chip, &buf);
+		if (err) {
+			tpm_buf_destroy(&buf);
+			return err;
+		}
+
 		err = tpm_transmit_cmd(chip, &buf,
 				       offsetof(struct tpm2_get_random_out,
 						buffer),
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 6d03c224e6b2..385014dbca39 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -144,16 +144,23 @@ struct tpm2_auth {
 /*
  * Name Size based on TPM algorithm (assumes no hash bigger than 255)
  */
-static u8 name_size(const u8 *name)
+static int name_size(const u8 *name)
 {
-	static u8 size_map[] = {
-		[TPM_ALG_SHA1] = SHA1_DIGEST_SIZE,
-		[TPM_ALG_SHA256] = SHA256_DIGEST_SIZE,
-		[TPM_ALG_SHA384] = SHA384_DIGEST_SIZE,
-		[TPM_ALG_SHA512] = SHA512_DIGEST_SIZE,
-	};
-	u16 alg = get_unaligned_be16(name);
-	return size_map[alg] + 2;
+	u16 hash_alg = get_unaligned_be16(name);
+
+	switch (hash_alg) {
+	case TPM_ALG_SHA1:
+		return SHA1_DIGEST_SIZE + 2;
+	case TPM_ALG_SHA256:
+		return SHA256_DIGEST_SIZE + 2;
+	case TPM_ALG_SHA384:
+		return SHA384_DIGEST_SIZE + 2;
+	case TPM_ALG_SHA512:
+		return SHA512_DIGEST_SIZE + 2;
+	default:
+		pr_warn("tpm: unsupported name algorithm: 0x%04x\n", hash_alg);
+		return -EINVAL;
+	}
 }
 
 static int tpm2_parse_read_public(char *name, struct tpm_buf *buf)
@@ -161,6 +168,7 @@ static int tpm2_parse_read_public(char *name, struct tpm_buf *buf)
 	struct tpm_header *head = (struct tpm_header *)buf->data;
 	off_t offset = TPM_HEADER_SIZE;
 	u32 tot_len = be32_to_cpu(head->length);
+	int ret;
 	u32 val;
 
 	/* we're starting after the header so adjust the length */
@@ -173,8 +181,13 @@ static int tpm2_parse_read_public(char *name, struct tpm_buf *buf)
 	offset += val;
 	/* name */
 	val = tpm_buf_read_u16(buf, &offset);
-	if (val != name_size(&buf->data[offset]))
+	ret = name_size(&buf->data[offset]);
+	if (ret < 0)
+		return ret;
+
+	if (val != ret)
 		return -EINVAL;
+
 	memcpy(name, &buf->data[offset], val);
 	/* forget the rest */
 	return 0;
@@ -221,46 +234,72 @@ static int tpm2_read_public(struct tpm_chip *chip, u32 handle, char *name)
  * As with most tpm_buf operations, success is assumed because failure
  * will be caused by an incorrect programming model and indicated by a
  * kernel message.
+ *
+ * Ends the authorization session on failure.
  */
-void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
-			 u32 handle, u8 *name)
+int tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
+			u32 handle, u8 *name)
 {
 #ifdef CONFIG_TCG_TPM2_HMAC
 	enum tpm2_mso_type mso = tpm2_handle_mso(handle);
 	struct tpm2_auth *auth;
 	int slot;
+	int ret;
 #endif
 
 	if (!tpm2_chip_auth(chip)) {
 		tpm_buf_append_handle(chip, buf, handle);
-		return;
+		return 0;
 	}
 
 #ifdef CONFIG_TCG_TPM2_HMAC
 	slot = (tpm_buf_length(buf) - TPM_HEADER_SIZE) / 4;
 	if (slot >= AUTH_MAX_NAMES) {
-		dev_err(&chip->dev, "TPM: too many handles\n");
-		return;
+		dev_err(&chip->dev, "too many handles\n");
+		ret = -EIO;
+		goto err;
 	}
 	auth = chip->auth;
-	WARN(auth->session != tpm_buf_length(buf),
-	     "name added in wrong place\n");
+	if (auth->session != tpm_buf_length(buf)) {
+		dev_err(&chip->dev, "session state malformed");
+		ret = -EIO;
+		goto err;
+	}
 	tpm_buf_append_u32(buf, handle);
 	auth->session += 4;
 
 	if (mso == TPM2_MSO_PERSISTENT ||
 	    mso == TPM2_MSO_VOLATILE ||
 	    mso == TPM2_MSO_NVRAM) {
-		if (!name)
-			tpm2_read_public(chip, handle, auth->name[slot]);
+		if (!name) {
+			ret = tpm2_read_public(chip, handle, auth->name[slot]);
+			if (ret)
+				goto err;
+		}
 	} else {
-		if (name)
-			dev_err(&chip->dev, "TPM: Handle does not require name but one is specified\n");
+		if (name) {
+			dev_err(&chip->dev, "handle 0x%08x does not use a name\n",
+				handle);
+			ret = -EIO;
+			goto err;
+		}
 	}
 
 	auth->name_h[slot] = handle;
-	if (name)
-		memcpy(auth->name[slot], name, name_size(name));
+	if (name) {
+		ret = name_size(name);
+		if (ret < 0)
+			goto err;
+
+		memcpy(auth->name[slot], name, ret);
+	}
+#endif
+	return 0;
+
+#ifdef CONFIG_TCG_TPM2_HMAC
+err:
+	tpm2_end_auth_session(chip);
+	return tpm_ret_to_err(ret);
 #endif
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append_name);
@@ -533,11 +572,9 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip,
  * encryption key and encrypts the first parameter of the command
  * buffer with it.
  *
- * As with most tpm_buf operations, success is assumed because failure
- * will be caused by an incorrect programming model and indicated by a
- * kernel message.
+ * Ends the authorization session on failure.
  */
-void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
+int tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 {
 	u32 cc, handles, val;
 	struct tpm2_auth *auth = chip->auth;
@@ -549,9 +586,12 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 	u8 cphash[SHA256_DIGEST_SIZE];
 	struct sha256_ctx sctx;
 	struct hmac_sha256_ctx hctx;
+	int ret;
 
-	if (!auth)
-		return;
+	if (!auth) {
+		ret = -EIO;
+		goto err;
+	}
 
 	/* save the command code in BE format */
 	auth->ordinal = head->ordinal;
@@ -560,9 +600,11 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 
 	i = tpm2_find_cc(chip, cc);
 	if (i < 0) {
-		dev_err(&chip->dev, "Command 0x%x not found in TPM\n", cc);
-		return;
+		dev_err(&chip->dev, "command 0x%08x not found\n", cc);
+		ret = -EIO;
+		goto err;
 	}
+
 	attrs = chip->cc_attrs_tbl[i];
 
 	handles = (attrs >> TPM2_CC_ATTR_CHANDLES) & GENMASK(2, 0);
@@ -576,9 +618,9 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 		u32 handle = tpm_buf_read_u32(buf, &offset_s);
 
 		if (auth->name_h[i] != handle) {
-			dev_err(&chip->dev, "TPM: handle %d wrong for name\n",
-				  i);
-			return;
+			dev_err(&chip->dev, "invalid handle 0x%08x\n", handle);
+			ret = -EIO;
+			goto err;
 		}
 	}
 	/* point offset_s to the start of the sessions */
@@ -609,12 +651,14 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 		offset_s += len;
 	}
 	if (offset_s != offset_p) {
-		dev_err(&chip->dev, "TPM session length is incorrect\n");
-		return;
+		dev_err(&chip->dev, "session length is incorrect\n");
+		ret = -EIO;
+		goto err;
 	}
 	if (!hmac) {
-		dev_err(&chip->dev, "TPM could not find HMAC session\n");
-		return;
+		dev_err(&chip->dev, "could not find HMAC session\n");
+		ret = -EIO;
+		goto err;
 	}
 
 	/* encrypt before HMAC */
@@ -646,8 +690,11 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 		if (mso == TPM2_MSO_PERSISTENT ||
 		    mso == TPM2_MSO_VOLATILE ||
 		    mso == TPM2_MSO_NVRAM) {
-			sha256_update(&sctx, auth->name[i],
-				      name_size(auth->name[i]));
+			ret = name_size(auth->name[i]);
+			if (ret < 0)
+				goto err;
+
+			sha256_update(&sctx, auth->name[i], ret);
 		} else {
 			__be32 h = cpu_to_be32(auth->name_h[i]);
 
@@ -668,6 +715,11 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 	hmac_sha256_update(&hctx, auth->tpm_nonce, sizeof(auth->tpm_nonce));
 	hmac_sha256_update(&hctx, &auth->attrs, 1);
 	hmac_sha256_final(&hctx, hmac);
+	return 0;
+
+err:
+	tpm2_end_auth_session(chip);
+	return ret;
 }
 EXPORT_SYMBOL(tpm_buf_fill_hmac_session);
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 3d8f7d1ce2b8..aa816b144ab3 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -529,8 +529,8 @@ static inline struct tpm2_auth *tpm2_chip_auth(struct tpm_chip *chip)
 #endif
 }
 
-void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
-			 u32 handle, u8 *name);
+int tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
+			u32 handle, u8 *name);
 void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 				 u8 attributes, u8 *passphrase,
 				 int passphraselen);
@@ -563,7 +563,7 @@ static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
 #ifdef CONFIG_TCG_TPM2_HMAC
 
 int tpm2_start_auth_session(struct tpm_chip *chip);
-void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf);
+int tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf);
 int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
 				int rc);
 void tpm2_end_auth_session(struct tpm_chip *chip);
@@ -577,10 +577,13 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip)
 static inline void tpm2_end_auth_session(struct tpm_chip *chip)
 {
 }
-static inline void tpm_buf_fill_hmac_session(struct tpm_chip *chip,
-					     struct tpm_buf *buf)
+
+static inline int tpm_buf_fill_hmac_session(struct tpm_chip *chip,
+					    struct tpm_buf *buf)
 {
+	return 0;
 }
+
 static inline int tpm_buf_check_hmac_response(struct tpm_chip *chip,
 					      struct tpm_buf *buf,
 					      int rc)
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index 8bc6efa8accb..5b205279584b 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -268,7 +268,10 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 		goto out_put;
 	}
 
-	tpm_buf_append_name(chip, &buf, options->keyhandle, NULL);
+	rc = tpm_buf_append_name(chip, &buf, options->keyhandle, NULL);
+	if (rc)
+		goto out;
+
 	tpm_buf_append_hmac_session(chip, &buf, TPM2_SA_DECRYPT,
 				    options->keyauth, TPM_DIGEST_SIZE);
 
@@ -316,7 +319,10 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 		goto out;
 	}
 
-	tpm_buf_fill_hmac_session(chip, &buf);
+	rc = tpm_buf_fill_hmac_session(chip, &buf);
+	if (rc)
+		goto out;
+
 	rc = tpm_transmit_cmd(chip, &buf, 4, "sealing data");
 	rc = tpm_buf_check_hmac_response(chip, &buf, rc);
 	if (rc)
@@ -427,7 +433,10 @@ static int tpm2_load_cmd(struct tpm_chip *chip,
 		return rc;
 	}
 
-	tpm_buf_append_name(chip, &buf, options->keyhandle, NULL);
+	rc = tpm_buf_append_name(chip, &buf, options->keyhandle, NULL);
+	if (rc)
+		goto out;
+
 	tpm_buf_append_hmac_session(chip, &buf, 0, options->keyauth,
 				    TPM_DIGEST_SIZE);
 
@@ -439,7 +448,10 @@ static int tpm2_load_cmd(struct tpm_chip *chip,
 		goto out;
 	}
 
-	tpm_buf_fill_hmac_session(chip, &buf);
+	rc = tpm_buf_fill_hmac_session(chip, &buf);
+	if (rc)
+		goto out;
+
 	rc = tpm_transmit_cmd(chip, &buf, 4, "loading blob");
 	rc = tpm_buf_check_hmac_response(chip, &buf, rc);
 	if (!rc)
@@ -484,7 +496,9 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 		return rc;
 	}
 
-	tpm_buf_append_name(chip, &buf, blob_handle, NULL);
+	rc = tpm_buf_append_name(chip, &buf, options->keyhandle, NULL);
+	if (rc)
+		goto out;
 
 	if (!options->policyhandle) {
 		tpm_buf_append_hmac_session(chip, &buf, TPM2_SA_ENCRYPT,
@@ -509,7 +523,10 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 						NULL, 0);
 	}
 
-	tpm_buf_fill_hmac_session(chip, &buf);
+	rc = tpm_buf_fill_hmac_session(chip, &buf);
+	if (rc)
+		goto out;
+
 	rc = tpm_transmit_cmd(chip, &buf, 6, "unsealing");
 	rc = tpm_buf_check_hmac_response(chip, &buf, rc);
 
-- 
cgit v1.2.3


From bc677a9216e1396322e42692e9c01cce04a7afc0 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Date: Tue, 30 Sep 2025 16:07:35 +0300
Subject: tpm2-sessions: Remove 'attributes' parameter from tpm_buf_append_auth

Remove 'attributes' parameter from 'tpm_buf_append_auth', as it is not used
by the function.

Fixes: 27184f8905ba ("tpm: Opt-in in disable PCR integrity protection")
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Reviewed-by: Jonathan McDowell <noodles@meta.com>
---
 drivers/char/tpm/tpm2-cmd.c      | 2 +-
 drivers/char/tpm/tpm2-sessions.c | 5 ++---
 include/linux/tpm.h              | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 34e3599f094f..ce0a1c6b0596 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -210,7 +210,7 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 		tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
 	} else {
 		tpm_buf_append_handle(chip, &buf, pcr_idx);
-		tpm_buf_append_auth(chip, &buf, 0, NULL, 0);
+		tpm_buf_append_auth(chip, &buf, NULL, 0);
 	}
 
 	tpm_buf_append_u32(&buf, chip->nr_allocated_banks);
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 3f389e2f6f58..4149379665c4 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -311,7 +311,7 @@ err:
 EXPORT_SYMBOL_GPL(tpm_buf_append_name);
 
 void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
-			 u8 attributes, u8 *passphrase, int passphrase_len)
+			 u8 *passphrase, int passphrase_len)
 {
 	/* offset tells us where the sessions area begins */
 	int offset = buf->handles * 4 + TPM_HEADER_SIZE;
@@ -372,8 +372,7 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 #endif
 
 	if (!tpm2_chip_auth(chip)) {
-		tpm_buf_append_auth(chip, buf, attributes, passphrase,
-				    passphrase_len);
+		tpm_buf_append_auth(chip, buf, passphrase, passphrase_len);
 		return;
 	}
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index aa816b144ab3..afa51723296a 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -535,7 +535,7 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 				 u8 attributes, u8 *passphrase,
 				 int passphraselen);
 void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
-			 u8 attributes, u8 *passphrase, int passphraselen);
+			 u8 *passphrase, int passphraselen);
 static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
 						   struct tpm_buf *buf,
 						   u8 attributes,
-- 
cgit v1.2.3


From b7960b90486139022d2d39caad90db252c469bab Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Date: Tue, 30 Sep 2025 23:44:19 +0300
Subject: tpm2-sessions: Open code tpm_buf_append_hmac_session()

Open code 'tpm_buf_append_hmac_session_opt' at its call sites, as it only
masks a call sequence and otherwise does nothing particularly useful.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Reviewed-by: Jonathan McDowell <noodles@meta.com>
---
 drivers/char/tpm/tpm2-cmd.c               | 14 +++++++++++---
 include/linux/tpm.h                       | 23 -----------------------
 security/keys/trusted-keys/trusted_tpm2.c | 12 ++++++++++--
 3 files changed, 21 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index ce0a1c6b0596..3a77be7ebf4a 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -282,9 +282,17 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max)
 
 	do {
 		tpm_buf_reset(&buf, TPM2_ST_SESSIONS, TPM2_CC_GET_RANDOM);
-		tpm_buf_append_hmac_session_opt(chip, &buf, TPM2_SA_ENCRYPT
-						| TPM2_SA_CONTINUE_SESSION,
-						NULL, 0);
+		if (tpm2_chip_auth(chip)) {
+			tpm_buf_append_hmac_session(chip, &buf,
+						    TPM2_SA_ENCRYPT |
+						    TPM2_SA_CONTINUE_SESSION,
+						    NULL, 0);
+		} else  {
+			offset = buf.handles * 4 + TPM_HEADER_SIZE;
+			head = (struct tpm_header *)buf.data;
+			if (tpm_buf_length(&buf) == offset)
+				head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS);
+		}
 		tpm_buf_append_u16(&buf, num_bytes);
 		err = tpm_buf_fill_hmac_session(chip, &buf);
 		if (err) {
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index afa51723296a..202da079d500 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -536,29 +536,6 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
 				 int passphraselen);
 void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
 			 u8 *passphrase, int passphraselen);
-static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
-						   struct tpm_buf *buf,
-						   u8 attributes,
-						   u8 *passphrase,
-						   int passphraselen)
-{
-	struct tpm_header *head;
-	int offset;
-
-	if (tpm2_chip_auth(chip)) {
-		tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, passphraselen);
-	} else  {
-		offset = buf->handles * 4 + TPM_HEADER_SIZE;
-		head = (struct tpm_header *)buf->data;
-
-		/*
-		 * If the only sessions are optional, the command tag must change to
-		 * TPM2_ST_NO_SESSIONS.
-		 */
-		if (tpm_buf_length(buf) == offset)
-			head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS);
-	}
-}
 
 #ifdef CONFIG_TCG_TPM2_HMAC
 
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index 5b205279584b..a7ea4a1c3bed 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -481,8 +481,10 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 			   struct trusted_key_options *options,
 			   u32 blob_handle)
 {
+	struct tpm_header *head;
 	struct tpm_buf buf;
 	u16 data_len;
+	int offset;
 	u8 *data;
 	int rc;
 
@@ -519,8 +521,14 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 		tpm2_buf_append_auth(&buf, options->policyhandle,
 				     NULL /* nonce */, 0, 0,
 				     options->blobauth, options->blobauth_len);
-		tpm_buf_append_hmac_session_opt(chip, &buf, TPM2_SA_ENCRYPT,
-						NULL, 0);
+		if (tpm2_chip_auth(chip)) {
+			tpm_buf_append_hmac_session(chip, &buf, TPM2_SA_ENCRYPT, NULL, 0);
+		} else  {
+			offset = buf.handles * 4 + TPM_HEADER_SIZE;
+			head = (struct tpm_header *)buf.data;
+			if (tpm_buf_length(&buf) == offset)
+				head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS);
+		}
 	}
 
 	rc = tpm_buf_fill_hmac_session(chip, &buf);
-- 
cgit v1.2.3


From fe93446b5ebdaa89a8f97b15668c077921a65140 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Wed, 3 Dec 2025 14:57:57 +0100
Subject: vfs: use UAPI types for new struct delegation definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

Use the fixed-width integer types provided by the UAPI headers instead.

Fixes: 1602bad16d7d ("vfs: expose delegation support to userland")
Fixes: 4be9e04ebf75 ("vfs: add needed headers for new struct delegation definition")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251203-uapi-fcntl-v1-1-490c67bf3425@linutronix.de
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 5e277fd955aa..aadfbf6e0cb3 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -4,11 +4,7 @@
 
 #include <asm/fcntl.h>
 #include <linux/openat2.h>
-#ifdef __KERNEL__
 #include <linux/types.h>
-#else
-#include <stdint.h>
-#endif
 
 #define F_SETLEASE	(F_LINUX_SPECIFIC_BASE + 0)
 #define F_GETLEASE	(F_LINUX_SPECIFIC_BASE + 1)
@@ -90,9 +86,9 @@
 
 /* Argument structure for F_GETDELEG and F_SETDELEG */
 struct delegation {
-	uint32_t	d_flags;	/* Must be 0 */
-	uint16_t	d_type;		/* F_RDLCK, F_WRLCK, F_UNLCK */
-	uint16_t	__pad;		/* Must be 0 */
+	__u32	d_flags;	/* Must be 0 */
+	__u16	d_type;		/* F_RDLCK, F_WRLCK, F_UNLCK */
+	__u16	__pad;		/* Must be 0 */
 };
 
 /*
-- 
cgit v1.2.3


From 02e7769e38c87c92b82db59923d3b0598d153903 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 2 Dec 2025 16:17:51 -0500
Subject: tracing: Fix enabling of tracing on file release

The trace file will pause tracing if the tracing instance has the
"pause-on-trace" option set. This happens when the file is opened, and
it is unpaused when the file is closed. When this was first added, there
was only one user that paused tracing. On open, the check to pause was:

   if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE)))

Where if it is not the snapshot tracer and the "pause-on-trace" option is
set, then it increments a "stop_count" of the trace instance.

On close, the check is:

   if (!iter->snapshot && tr->stop_count)

That is, if it is not the snapshot buffer and it was stopped, it will
re-enable tracing.

Now there are more places that stop tracing. This means that if something
else stops tracing, tr->stop_count will be non-zero, and closing the trace
file will then decrement stop_count even though the open never incremented
it. This causes a warning because, when the user that stopped tracing
enables it again, stop_count goes below zero.

Instead of relying on the stop_count being set to know if the close of
the trace file should enable tracing again, add a new flag to the trace
iterator. The trace iterator is unique per open of the trace file, and if
the open stops tracing, set the trace iterator PAUSE flag. On close, if the
PAUSE flag is set, re-enable tracing.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251202161751.24abaaf1@gandalf.local.home
Fixes: 06e0a548bad0f ("tracing: Do not disable tracing when reading the trace file")
Reported-by: syzbot+ccdec3bfe0beec58a38d@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/692f44a5.a70a0220.2ea503.00c8.GAE@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 1 +
 kernel/trace/trace.c         | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 04307a19cde3..3690221ba3d8 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -138,6 +138,7 @@ enum trace_iter_flags {
 	TRACE_FILE_LAT_FMT	= 1,
 	TRACE_FILE_ANNOTATE	= 2,
 	TRACE_FILE_TIME_IN_NS	= 4,
+	TRACE_FILE_PAUSE	= 8,
 };
 
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9fbb316dcbd..cf725a33d99c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4709,8 +4709,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	 * If pause-on-trace is enabled, then stop the trace while
 	 * dumping, unless this is the "snapshot" file
 	 */
-	if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE)))
+	if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) {
+		iter->iter_flags |= TRACE_FILE_PAUSE;
 		tracing_stop_tr(tr);
+	}
 
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
@@ -4842,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 	if (iter->trace && iter->trace->close)
 		iter->trace->close(iter);
 
-	if (!iter->snapshot && tr->stop_count)
+	if (iter->iter_flags & TRACE_FILE_PAUSE)
 		/* reenable tracing if it was previously enabled */
 		tracing_start_tr(tr);
 
-- 
cgit v1.2.3


From 7bfe3b8ea6e30437e01fcb8e4f56ef6e4d986d0f Mon Sep 17 00:00:00 2001
From: Naman Jain <namjain@linux.microsoft.com>
Date: Thu, 13 Nov 2025 04:41:49 +0000
Subject: Drivers: hv: Introduce mshv_vtl driver

Provide an interface for a Virtual Machine Monitor such as OpenVMM, and
for its use as the OpenHCL paravisor, to control VTL0 (Virtual Trust
Level 0). Expose devices and support IOCTLs for features like VTL creation,
VTL0 memory management, context switching, making hypercalls,
mapping VTL0 address space into VTL2 userspace, and getting new VMBus
messages and channel events in VTL2.

Co-developed-by: Roman Kisel <romank@linux.microsoft.com>
Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Co-developed-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/hyperv/Makefile           |   10 +-
 arch/x86/hyperv/hv_vtl.c           |   30 +
 arch/x86/hyperv/mshv-asm-offsets.c |   37 +
 arch/x86/hyperv/mshv_vtl_asm.S     |  116 +++
 arch/x86/include/asm/mshyperv.h    |   34 +
 drivers/hv/Kconfig                 |   27 +-
 drivers/hv/Makefile                |    7 +-
 drivers/hv/mshv_vtl.h              |   25 +
 drivers/hv/mshv_vtl_main.c         | 1392 ++++++++++++++++++++++++++++++++++++
 include/hyperv/hvgdk_mini.h        |  106 +++
 include/uapi/linux/mshv.h          |   80 +++
 11 files changed, 1861 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/hyperv/mshv-asm-offsets.c
 create mode 100644 arch/x86/hyperv/mshv_vtl_asm.S
 create mode 100644 drivers/hv/mshv_vtl.h
 create mode 100644 drivers/hv/mshv_vtl_main.c

(limited to 'include')

diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 6f5d97cddd80..56292102af62 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y			:= hv_init.o mmu.o nested.o irqdomain.o ivm.o
 obj-$(CONFIG_X86_64)	+= hv_apic.o
-obj-$(CONFIG_HYPERV_VTL_MODE)	+= hv_vtl.o
+obj-$(CONFIG_HYPERV_VTL_MODE)	+= hv_vtl.o mshv_vtl_asm.o
+
+$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
+
+$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
+	$(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
 
 ifdef CONFIG_X86_64
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= hv_spinlock.o
@@ -12,3 +17,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= hv_spinlock.o
   obj-$(CONFIG_CRASH_DUMP)      += hv_crash.o hv_trampoline.o
  endif
 endif
+
+targets += mshv-asm-offsets.s
+clean-files += mshv-asm-offsets.h
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 042e8712d8de..c0edaed0efb3 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -9,12 +9,17 @@
 #include <asm/apic.h>
 #include <asm/boot.h>
 #include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/types.h>
 #include <asm/i8259.h>
 #include <asm/mshyperv.h>
 #include <asm/msr.h>
 #include <asm/realmode.h>
 #include <asm/reboot.h>
+#include <asm/smap.h>
+#include <linux/export.h>
 #include <../kernel/smpboot.h>
+#include "../../kernel/fpu/legacy.h"
 
 extern struct boot_params boot_params;
 static struct real_mode_header hv_vtl_real_mode_header;
@@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void)
 
 	return 0;
 }
+
+DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
+
+void mshv_vtl_return_call_init(u64 vtl_return_offset)
+{
+	static_call_update(__mshv_vtl_return_hypercall,
+			   (void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
+}
+EXPORT_SYMBOL(mshv_vtl_return_call_init);
+
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+{
+	struct hv_vp_assist_page *hvp;
+
+	hvp = hv_vp_assist_page[smp_processor_id()];
+	hvp->vtl_ret_x64rax = vtl0->rax;
+	hvp->vtl_ret_x64rcx = vtl0->rcx;
+
+	kernel_fpu_begin_mask(0);
+	fxrstor(&vtl0->fx_state);
+	__mshv_vtl_return_call(vtl0);
+	fxsave(&vtl0->fx_state);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL(mshv_vtl_return_call);
diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c
new file mode 100644
index 000000000000..882c1db6df16
--- /dev/null
+++ b/arch/x86/hyperv/mshv-asm-offsets.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ *   Naman Jain <namjain@microsoft.com>
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include <asm/mshyperv.h>
+
+static void __used common(void)
+{
+	if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r8,  mshv_vtl_cpu_context, r8);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r9,  mshv_vtl_cpu_context, r9);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
+	}
+}
diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S
new file mode 100644
index 000000000000..f595eefad9ab
--- /dev/null
+++ b/arch/x86/hyperv/mshv_vtl_asm.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Assembly level code for mshv_vtl VTL transition
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ *   Naman Jain <namjain@microsoft.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/static_call_types.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include "mshv-asm-offsets.h"
+
+	.text
+	.section .noinstr.text, "ax"
+/*
+ * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+ *
+ * This function is used to context switch between different Virtual Trust Levels.
+ * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities.
+ * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
+ * against #PFs in NMI context clobbering the guest state.
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+	/* Push callee save registers */
+	pushq %rbp
+	mov %rsp, %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushq %rbx
+
+	/* register switch to VTL0 clobbers all registers except rax/rcx */
+	mov %_ASM_ARG1, %rax
+
+	/* grab rbx/rbp/rsi/rdi/r8-r15 */
+	mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+	mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+	mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+	mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+	mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+	mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+	mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+	mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+	mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+	mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+	mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+	mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+	mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+	mov %rdx, %cr2
+	mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+	/* stash host registers on stack */
+	pushq %rax
+	pushq %rcx
+
+	xor %ecx, %ecx
+
+	/* make a hypercall to switch VTL */
+	call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+	/* stash guest registers on stack, restore saved host copies */
+	pushq %rax
+	pushq %rcx
+	mov 16(%rsp), %rcx
+	mov 24(%rsp), %rax
+
+	mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+	mov %cr2, %rdx
+	mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+	pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+	pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
+	add $16, %rsp
+
+	/* save rbx/rbp/rsi/rdi/r8-r15 */
+	mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+	mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+	mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+	mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+	mov %r8,  MSHV_VTL_CPU_CONTEXT_r8(%rax)
+	mov %r9,  MSHV_VTL_CPU_CONTEXT_r9(%rax)
+	mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+	mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+	mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+	mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+	mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+	mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+	/* pop callee-save registers r12-r15, rbx */
+	pop %rbx
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+
+	pop %rbp
+	RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here.
+ * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid
+ * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0"
+ * which would otherwise have been generated by the macro.
+ */
+	.section	.discard.addressable,"aw"
+	.align 8
+	.type	mshv_vtl_return_sym, @object
+	.size	mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+	.quad	__SCK____mshv_vtl_return_hypercall
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 1342d55c2545..10037125099a 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,7 @@
 #include <asm/paravirt.h>
 #include <asm/msr.h>
 #include <hyperv/hvhdk.h>
+#include <asm/fpu/types.h>
 
 /*
  * Hyper-V always provides a single IO-APIC at this MMIO address.
@@ -269,13 +270,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
 static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
 #endif /* CONFIG_HYPERV */
 
+struct mshv_vtl_cpu_context {
+	union {
+		struct {
+			u64 rax;
+			u64 rcx;
+			u64 rdx;
+			u64 rbx;
+			u64 cr2;
+			u64 rbp;
+			u64 rsi;
+			u64 rdi;
+			u64 r8;
+			u64 r9;
+			u64 r10;
+			u64 r11;
+			u64 r12;
+			u64 r13;
+			u64 r14;
+			u64 r15;
+		};
+		u64 gp_regs[16];
+	};
+
+	struct fxregs_state fx_state;
+};
 
 #ifdef CONFIG_HYPERV_VTL_MODE
 void __init hv_vtl_init_platform(void);
 int __init hv_vtl_early_init(void);
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+void mshv_vtl_return_call_init(u64 vtl_return_offset);
+void mshv_vtl_return_hypercall(void);
+void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
 #else
 static inline void __init hv_vtl_init_platform(void) {}
 static inline int __init hv_vtl_early_init(void) { return 0; }
+static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
+static inline void mshv_vtl_return_hypercall(void) {}
+static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
 #endif
 
 #include <asm-generic/mshyperv.h>
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 0b8c391a0342..d4a8d349200c 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -17,7 +17,8 @@ config HYPERV
 
 config HYPERV_VTL_MODE
 	bool "Enable Linux to boot in VTL context"
-	depends on (X86_64 || ARM64) && HYPERV
+	depends on (X86_64 && HAVE_STATIC_CALL) || ARM64
+	depends on HYPERV
 	depends on SMP
 	default n
 	help
@@ -82,4 +83,28 @@ config MSHV_ROOT
 
 	  If unsure, say N.
 
+config MSHV_VTL
+	tristate "Microsoft Hyper-V VTL driver"
+	depends on X86_64 && HYPERV_VTL_MODE
+	depends on HYPERV_VMBUS
+	# Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
+	# VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
+	# specially with large memory requirements.
+	depends on TRANSPARENT_HUGEPAGE
+	# MTRRs are controlled by VTL0, and are not specific to individual VTLs.
+	# Therefore, do not attempt to access or modify MTRRs here.
+	depends on !MTRR
+	select CPUMASK_OFFSTACK
+	select VIRT_XFER_TO_GUEST_WORK
+	default n
+	help
+	  Select this option to enable Hyper-V VTL driver support.
+	  This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2
+	  userspace to create VTLs and partitions, setup and manage VTL0 memory and
+	  allow userspace to make direct hypercalls. This also allows to map VTL0's address
+	  space to a usermode process in VTL2 and supports getting new VMBus messages and channel
+	  events in VTL2.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 1a1677bf4dac..6d929fb0e13d 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS)	+= hv_vmbus.o
 obj-$(CONFIG_HYPERV_UTILS)	+= hv_utils.o
 obj-$(CONFIG_HYPERV_BALLOON)	+= hv_balloon.o
 obj-$(CONFIG_MSHV_ROOT)		+= mshv_root.o
+obj-$(CONFIG_MSHV_VTL)          += mshv_vtl.o
 
 CFLAGS_hv_trace.o = -I$(src)
 CFLAGS_hv_balloon.o = -I$(src)
@@ -14,7 +15,11 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING)	+= hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
 mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
 	       mshv_root_hv_call.o mshv_portid_table.o
+mshv_vtl-y := mshv_vtl_main.o
 
 # Code that must be built-in
 obj-$(CONFIG_HYPERV) += hv_common.o
-obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
+ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
+	obj-y += mshv_common.o
+endif
diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
new file mode 100644
index 000000000000..a6eea52f7aa2
--- /dev/null
+++ b/drivers/hv/mshv_vtl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _MSHV_VTL_H
+#define _MSHV_VTL_H
+
+#include <linux/mshv.h>
+#include <linux/types.h>
+
+struct mshv_vtl_run {
+	u32 cancel;
+	u32 vtl_ret_action_size;
+	u32 pad[2];
+	char exit_message[MSHV_MAX_RUN_MSG_SIZE];
+	union {
+		struct mshv_vtl_cpu_context cpu_context;
+
+		/*
+		 * Reserving room for the cpu context to grow and to maintain compatibility
+		 * with user mode.
+		 */
+		char reserved[1024];
+	};
+	char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
+};
+
+#endif /* _MSHV_VTL_H */
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
new file mode 100644
index 000000000000..2cebe9de5a5a
--- /dev/null
+++ b/drivers/hv/mshv_vtl_main.c
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ *   Roman Kisel <romank@linux.microsoft.com>
+ *   Saurabh Sengar <ssengar@linux.microsoft.com>
+ *   Naman Jain <namjain@linux.microsoft.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpuhotplug.h>
+#include <linux/count_zeros.h>
+#include <linux/entry-virt.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <asm/debugreg.h>
+#include <asm/mshyperv.h>
+#include <trace/events/ipi.h>
+#include <uapi/asm/mtrr.h>
+#include <uapi/linux/mshv.h>
+#include <hyperv/hvhdk.h>
+
+#include "../../kernel/fpu/legacy.h"
+#include "mshv.h"
+#include "mshv_vtl.h"
+#include "hyperv_vmbus.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
+
+#define MSHV_ENTRY_REASON_LOWER_VTL_CALL     0x1
+#define MSHV_ENTRY_REASON_INTERRUPT          0x2
+#define MSHV_ENTRY_REASON_INTERCEPT          0x3
+
+#define MSHV_REAL_OFF_SHIFT	16
+#define MSHV_PG_OFF_CPU_MASK	(BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1)
+#define MSHV_RUN_PAGE_OFFSET	0
+#define MSHV_REG_PAGE_OFFSET	1
+#define VTL2_VMBUS_SINT_INDEX	7
+
+static struct device *mem_dev;
+
+static struct tasklet_struct msg_dpc;
+static wait_queue_head_t fd_wait_queue;
+static bool has_message;
+static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
+static DEFINE_MUTEX(flag_lock);
+static bool __read_mostly mshv_has_reg_page;
+
+/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */
+#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8)
+
+struct mshv_vtl_hvcall_fd {
+	u8 allow_bitmap[MAX_BITMAP_SIZE];
+	bool allow_map_initialized;
+	/*
+	 * Used to protect hvcall setup in IOCTLs
+	 */
+	struct mutex init_mutex;
+	struct miscdevice *dev;
+};
+
+struct mshv_vtl_poll_file {
+	struct file *file;
+	wait_queue_entry_t wait;
+	wait_queue_head_t *wqh;
+	poll_table pt;
+	int cpu;
+};
+
+struct mshv_vtl {
+	struct device *module_dev;
+	u64 id;
+};
+
+struct mshv_vtl_per_cpu {
+	struct mshv_vtl_run *run;
+	struct page *reg_page;
+};
+
+/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */
+union hv_synic_overlay_page_msr {
+	u64 as_uint64;
+	struct {
+		u64 enabled: 1;
+		u64 reserved: 11;
+		u64 pfn: 52;
+	} __packed;
+};
+
+static struct mutex mshv_vtl_poll_file_lock;
+static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
+static union hv_register_vsm_capabilities mshv_vsm_capabilities;
+
+static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
+static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
+static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+	.use_target_vtl = 1,
+};
+
+static const struct file_operations mshv_vtl_fops;
+
+static long
+mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
+{
+	struct mshv_vtl *vtl;
+	struct file *file;
+	int fd;
+
+	vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
+	if (!vtl)
+		return -ENOMEM;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		kfree(vtl);
+		return fd;
+	}
+	file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
+				  vtl, O_RDWR);
+	if (IS_ERR(file)) {
+		kfree(vtl);
+		return PTR_ERR(file);
+	}
+	vtl->module_dev = module_dev;
+	fd_install(fd, file);
+
+	return fd;
+}
+
+static long
+mshv_ioctl_check_extension(void __user *user_arg)
+{
+	u32 arg;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	switch (arg) {
+	case MSHV_CAP_CORE_API_STABLE:
+		return 0;
+	case MSHV_CAP_REGISTER_PAGE:
+		return mshv_has_reg_page;
+	case MSHV_CAP_VTL_RETURN_ACTION:
+		return mshv_vsm_capabilities.return_action_available;
+	case MSHV_CAP_DR6_SHARED:
+		return mshv_vsm_capabilities.dr6_shared;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static long
+mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	struct miscdevice *misc = filp->private_data;
+
+	switch (ioctl) {
+	case MSHV_CHECK_EXTENSION:
+		return mshv_ioctl_check_extension((void __user *)arg);
+	case MSHV_CREATE_VTL:
+		return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
+	}
+
+	return -ENOTTY;
+}
+
+static const struct file_operations mshv_dev_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= mshv_dev_ioctl,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mshv",
+	.fops = &mshv_dev_fops,
+	.mode = 0600,
+};
+
+static struct mshv_vtl_run *mshv_vtl_this_run(void)
+{
+	return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
+}
+
+static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
+{
+	return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
+}
+
+static struct page *mshv_vtl_cpu_reg_page(int cpu)
+{
+	return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
+}
+
+static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
+{
+	struct hv_register_assoc reg_assoc = {};
+	union hv_synic_overlay_page_msr overlay = {};
+	struct page *reg_page;
+
+	reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
+	if (!reg_page) {
+		WARN(1, "failed to allocate register page\n");
+		return;
+	}
+
+	overlay.enabled = 1;
+	overlay.pfn = page_to_hvpfn(reg_page);
+	reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
+	reg_assoc.value.reg64 = overlay.as_uint64;
+
+	if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+				     1, input_vtl_zero, &reg_assoc)) {
+		WARN(1, "failed to setup register page\n");
+		__free_page(reg_page);
+		return;
+	}
+
+	per_cpu->reg_page = reg_page;
+	mshv_has_reg_page = true;
+}
+
+static void mshv_vtl_synic_enable_regs(unsigned int cpu)
+{
+	union hv_synic_sint sint;
+
+	sint.as_uint64 = 0;
+	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+	sint.masked = false;
+	sint.auto_eoi = hv_recommend_using_aeoi();
+
+	/* Enable intercepts */
+	if (!mshv_vsm_capabilities.intercept_page_available)
+		hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+			   sint.as_uint64);
+
+	/* VTL2 Host VSP SINT is (un)masked when the user mode requests that */
+}
+
+/*
+ * Read the VSM code page offsets and VSM capabilities registers in a single
+ * hypercall and cache them in mshv_vsm_page_offsets / mshv_vsm_capabilities.
+ *
+ * Returns 0 on success or the error from hv_call_get_vp_registers().
+ */
+static int mshv_vtl_get_vsm_regs(void)
+{
+	struct hv_register_assoc regs[2];
+	int err;
+
+	regs[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
+	regs[1].name = HV_REGISTER_VSM_CAPABILITIES;
+
+	err = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+				       ARRAY_SIZE(regs), input_vtl_zero, regs);
+	if (err)
+		return err;
+
+	mshv_vsm_page_offsets.as_uint64 = regs[0].value.reg64;
+	mshv_vsm_capabilities.as_uint64 = regs[1].value.reg64;
+
+	return 0;
+}
+
+/*
+ * Write the VSM partition configuration register for the current partition.
+ *
+ * Enables VTL protection with the default protection mask, zero-on-reset and
+ * the VP-startup / unimplemented-CPUID intercepts; the intercept page is
+ * requested only when the cached capabilities advertise it.
+ *
+ * @dev: device used for the debug message.
+ *
+ * Returns the result of hv_call_set_vp_registers().
+ */
+static int mshv_vtl_configure_vsm_partition(struct device *dev)
+{
+	union hv_register_vsm_partition_config config;
+	/*
+	 * Zero-initialise so that no uninitialised stack bytes (unused parts of
+	 * the value, padding) are handed to the hypercall helper; this matches
+	 * mshv_vtl_configure_reg_page().
+	 */
+	struct hv_register_assoc reg_assoc = {};
+
+	config.as_uint64 = 0;
+	config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
+	config.enable_vtl_protection = 1;
+	config.zero_memory_on_reset = 1;
+	config.intercept_vp_startup = 1;
+	config.intercept_cpuid_unimplemented = 1;
+
+	if (mshv_vsm_capabilities.intercept_page_available) {
+		dev_dbg(dev, "using intercept page\n");
+		config.intercept_page = 1;
+	}
+
+	reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
+	reg_assoc.value.reg64 = config.as_uint64;
+
+	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+				       1, input_vtl_zero, &reg_assoc);
+}
+
+/*
+ * VMBus interrupt handler installed in place of vmbus_isr().
+ *
+ * It first handles traffic on VTL2_VMBUS_SINT_INDEX that is destined for user
+ * space: a pending message (checked on CPU 0 only) schedules msg_dpc, and each
+ * set event flag signals the eventfd registered for that flag, if any.
+ * Afterwards the regular vmbus_isr() is invoked.
+ */
+static void mshv_vtl_vmbus_isr(void)
+{
+	struct hv_per_cpu_context *per_cpu;
+	struct hv_message *msg;
+	u32 message_type;
+	union hv_synic_event_flags *event_flags;
+	struct eventfd_ctx *eventfd;
+	u16 i;
+
+	per_cpu = this_cpu_ptr(hv_context.cpu_context);
+	if (smp_processor_id() == 0) {
+		/* Message slot of the VTL2 VMBus SINT inside the SynIC message page. */
+		msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX;
+		message_type = READ_ONCE(msg->header.message_type);
+		if (message_type != HVMSG_NONE)
+			tasklet_schedule(&msg_dpc);
+	}
+
+	event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page +
+			VTL2_VMBUS_SINT_INDEX;
+	for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) {
+		/* Skip flags that were cleared between the scan and this test. */
+		if (!sync_test_and_clear_bit(i, event_flags->flags))
+			continue;
+		/*
+		 * flag_eventfds[] entries are replaced by the SET_EVENTFD ioctl,
+		 * which calls synchronize_rcu() before dropping the old context.
+		 */
+		rcu_read_lock();
+		eventfd = READ_ONCE(flag_eventfds[i]);
+		if (eventfd)
+			eventfd_signal(eventfd);
+		rcu_read_unlock();
+	}
+
+	vmbus_isr();
+}
+
+/*
+ * CPU hotplug "online" callback (registered in hv_vtl_setup_synic()).
+ *
+ * Allocates the zeroed run page for the current CPU, sets up the register page
+ * when the hypervisor advertises an intercept page, and programs the
+ * interception SINT.
+ *
+ * Returns 0 on success or -ENOMEM if the run page cannot be allocated.
+ *
+ * NOTE(review): no matching teardown callback is registered, so the pages are
+ * not freed and would be reallocated if a CPU came online again - confirm this
+ * is acceptable given that CPU hotplug is stated to be unsupported here.
+ */
+static int mshv_vtl_alloc_context(unsigned int cpu)
+{
+	struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
+
+	per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!per_cpu->run)
+		return -ENOMEM;
+
+	/* A register-page failure only WARNs; it does not fail the CPU online. */
+	if (mshv_vsm_capabilities.intercept_page_available)
+		mshv_vtl_configure_reg_page(per_cpu);
+
+	mshv_vtl_synic_enable_regs(cpu);
+
+	return 0;
+}
+
+/* Dynamic hotplug state returned by cpuhp_setup_state(); used for removal. */
+static int mshv_vtl_cpuhp_online;
+
+/*
+ * Install the VTL VMBus interrupt handler and register the per-CPU online
+ * callback.  If the hotplug registration fails the stock vmbus_isr() handler
+ * is restored and the error is returned; otherwise 0.
+ */
+static int hv_vtl_setup_synic(void)
+{
+	int state;
+
+	/* Use our isr to first filter out packets destined for userspace */
+	hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
+				  mshv_vtl_alloc_context, NULL);
+	if (state >= 0) {
+		mshv_vtl_cpuhp_online = state;
+		return 0;
+	}
+
+	hv_setup_vmbus_handler(vmbus_isr);
+
+	return state;
+}
+
+/*
+ * Undo hv_vtl_setup_synic(): remove the hotplug state recorded in
+ * mshv_vtl_cpuhp_online and restore vmbus_isr() as the VMBus handler.
+ */
+static void hv_vtl_remove_synic(void)
+{
+	cpuhp_remove_state(mshv_vtl_cpuhp_online);
+	hv_setup_vmbus_handler(vmbus_isr);
+}
+
+/*
+ * Read one register (@reg->name) of the current VP via hypercall, using the
+ * input_vtl_normal target; the value is returned in @reg.
+ */
+static int vtl_get_vp_register(struct hv_register_assoc *reg)
+{
+	return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+					1, input_vtl_normal, reg);
+}
+
+/*
+ * Write one register (@reg->name / @reg->value) of the current VP via
+ * hypercall, using the input_vtl_normal target.
+ */
+static int vtl_set_vp_register(struct hv_register_assoc *reg)
+{
+	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+					1, input_vtl_normal, reg);
+}
+
+/*
+ * MSHV_ADD_VTL0_MEMORY: map a PFN range described by user space as
+ * MEMORY_DEVICE_GENERIC device memory on mem_dev.
+ *
+ * @vtl: VTL instance, used for log messages.
+ * @arg: user pointer to a struct mshv_vtl_ram_disposition.
+ *
+ * Returns 0 on success, -ENOMEM if the pagemap cannot be allocated, and
+ * -EFAULT on a bad user pointer, an empty/inverted range or a
+ * devm_memremap_pages() failure.
+ *
+ * NOTE(review): the invalid-range and remap-failure paths both report -EFAULT
+ * rather than -EINVAL / PTR_ERR(addr) - confirm user space relies on this.
+ */
+static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
+{
+	struct mshv_vtl_ram_disposition vtl0_mem;
+	struct dev_pagemap *pgmap;
+	void *addr;
+
+	if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
+		return -EFAULT;
+	/* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */
+	if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
+		dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
+			vtl0_mem.start_pfn, vtl0_mem.last_pfn);
+		return -EFAULT;
+	}
+
+	pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
+	if (!pgmap)
+		return -ENOMEM;
+
+	/* Range is [start_pfn, last_pfn): the end address is inclusive, hence -1. */
+	pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
+	pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
+	pgmap->nr_range = 1;
+	pgmap->type = MEMORY_DEVICE_GENERIC;
+
+	/*
+	 * Determine the highest page order that can be used for the given memory range.
+	 * This works best when the range is aligned; i.e. both the start and the length.
+	 *
+	 * NOTE(review): the shift is not clamped to any maximum order - confirm
+	 * that highly aligned ranges cannot exceed what memremap accepts.
+	 */
+	pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
+	dev_dbg(vtl->module_dev,
+		"Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
+		vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
+
+	addr = devm_memremap_pages(mem_dev, pgmap);
+	if (IS_ERR(addr)) {
+		dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
+		kfree(pgmap);
+		return -EFAULT;
+	}
+
+	/* Don't free pgmap, since it has to stick around until the memory
+	 * is unmapped, which will never happen as there is no scenario
+	 * where VTL0 can be released/shutdown without bringing down VTL2.
+	 */
+	return 0;
+}
+
+/*
+ * Set the cancel flag in @cpu's run page so that the return-to-lower-VTL loop
+ * on that CPU bails out with -EINTR.
+ *
+ * For a remote CPU the flag is set with an exchange and a reschedule IPI is
+ * sent only if the flag was previously clear; for the local CPU a plain
+ * WRITE_ONCE() suffices.  Preemption is disabled for the duration via
+ * get_cpu()/put_cpu().
+ */
+static void mshv_vtl_cancel(int cpu)
+{
+	int here = get_cpu();
+
+	if (here != cpu) {
+		if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
+			smp_send_reschedule(cpu);
+	} else {
+		WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
+	}
+	put_cpu();
+}
+
+/*
+ * Wait-queue callback for a registered poll file: any wakeup on the file's
+ * wait queue cancels the run on the CPU the poll file is bound to.
+ * Always returns 0.
+ */
+static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+	struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
+
+	mshv_vtl_cancel(poll_file->cpu);
+
+	return 0;
+}
+
+/*
+ * poll_table queue callback invoked from vfs_poll(): remember the wait queue
+ * head the file exposes and enqueue our wait entry on it.  Only one wait
+ * queue per poll file is expected, hence the WARN_ON().
+ */
+static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
+
+	WARN_ON(poll_file->wqh);
+	poll_file->wqh = wqh;
+	add_wait_queue(wqh, &poll_file->wait);
+}
+
+/*
+ * MSHV_SET_POLL_FILE: bind a file to a CPU so that any wakeup on the file
+ * cancels that CPU's run (see mshv_vtl_poll_file_wake()).
+ *
+ * @user_input: user pointer to a struct mshv_vtl_set_poll_file {cpu, fd}.
+ *
+ * A previously registered file for the same CPU is detached from its wait
+ * queue and its reference dropped.
+ *
+ * Returns 0 on success, -EFAULT on a bad user pointer, -EINVAL for an
+ * invalid or offline CPU, and -EBADFD if @fd does not name an open file.
+ */
+static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
+{
+	struct file *file, *old_file;
+	struct mshv_vtl_poll_file *poll_file;
+	struct mshv_vtl_set_poll_file input;
+
+	if (copy_from_user(&input, user_input, sizeof(input)))
+		return -EFAULT;
+
+	/*
+	 * Bound the id by nr_cpu_ids (highest possible id + 1) rather than by
+	 * the number of possible CPUs, which is a count and not an id limit.
+	 */
+	if (input.cpu >= nr_cpu_ids || !cpu_online(input.cpu))
+		return -EINVAL;
+	/*
+	 * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+	 * CPU is expected to remain online after above cpu_online() check.
+	 */
+
+	file = fget(input.fd);
+	if (!file)
+		return -EBADFD;
+
+	poll_file = per_cpu_ptr(&mshv_vtl_poll_file, input.cpu);
+	if (!poll_file) {
+		/* Do not leak the reference taken by fget() above. */
+		fput(file);
+		return -EINVAL;
+	}
+
+	mutex_lock(&mshv_vtl_poll_file_lock);
+
+	/* Detach from the wait queue of the previously registered file. */
+	if (poll_file->wqh)
+		remove_wait_queue(poll_file->wqh, &poll_file->wait);
+	poll_file->wqh = NULL;
+
+	old_file = poll_file->file;
+	poll_file->file = file;
+	poll_file->cpu = input.cpu;
+
+	/*
+	 * vfs_poll() calls mshv_vtl_ptable_queue_proc(), which hooks our wait
+	 * entry onto the new file's wait queue.
+	 */
+	init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
+	init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
+	vfs_poll(file, &poll_file->pt);
+
+	mutex_unlock(&mshv_vtl_poll_file_lock);
+
+	if (old_file)
+		fput(old_file);
+
+	return 0;
+}
+
+/*
+ * Static table mapping register names to their corresponding actions.
+ *
+ * Registers listed here are accessed directly from the kernel by
+ * mshv_vtl_get_set_reg() instead of through a hypercall: debug registers via
+ * native_{get,set}_debugreg(), MTRR registers via rdmsrl()/wrmsrl().
+ */
+static const struct {
+	enum hv_register_name reg_name;
+	int debug_reg_num;  /* -1 if not a debug register */
+	u32 msr_addr;       /* 0 if not an MSR */
+} reg_table[] = {
+	/* Debug registers */
+	{HV_X64_REGISTER_DR0, 0, 0},
+	{HV_X64_REGISTER_DR1, 1, 0},
+	{HV_X64_REGISTER_DR2, 2, 0},
+	{HV_X64_REGISTER_DR3, 3, 0},
+	{HV_X64_REGISTER_DR6, 6, 0},
+	/* MTRR MSRs */
+	{HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap},
+	{HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)},
+	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)},
+	{HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
+	{HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
+};
+
+/*
+ * Try to get or set a register directly, without a hypercall.
+ *
+ * @regs: register name and 64-bit value (input for set, output for get).
+ * @set:  true to write the register, false to read it.
+ *
+ * Returns 0 if the register was handled here, 1 if the caller must fall back
+ * to a hypercall (register not in reg_table, or DR6 when it is not shared).
+ */
+static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
+{
+	const enum hv_register_name name = regs->name;
+	u64 *val = &regs->value.reg64;
+	size_t idx;
+
+	/* Locate the first table entry for this register. */
+	for (idx = 0; idx < ARRAY_SIZE(reg_table); idx++) {
+		if (reg_table[idx].reg_name == name)
+			break;
+	}
+	if (idx == ARRAY_SIZE(reg_table))
+		return 1;
+
+	if (reg_table[idx].debug_reg_num == -1) {
+		/* MSR-backed register */
+		if (set)
+			wrmsrl(reg_table[idx].msr_addr, *val);
+		else
+			rdmsrl(reg_table[idx].msr_addr, *val);
+		return 0;
+	}
+
+	/* Debug register; an unshared DR6 has to go through the hypervisor. */
+	if (name == HV_X64_REGISTER_DR6 && !mshv_vsm_capabilities.dr6_shared)
+		return 1;
+
+	if (set)
+		native_set_debugreg(reg_table[idx].debug_reg_num, *val);
+	else
+		*val = native_get_debugreg(reg_table[idx].debug_reg_num);
+
+	return 0;
+}
+
+/*
+ * Return to the lower VTL with the CPU context in @vtl0.
+ *
+ * When the hypervisor supports return actions, the actions that user space
+ * placed in the run page are first copied into the VP assist page and the
+ * run page's action size is reset to 0.
+ */
+static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
+{
+	struct hv_vp_assist_page *hvp;
+
+	hvp = hv_vp_assist_page[smp_processor_id()];
+
+	/*
+	 * Process signal event direct set in the run page, if any.
+	 */
+	if (mshv_vsm_capabilities.return_action_available) {
+		u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
+
+		WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
+
+		/*
+		 * Hypervisor will take care of clearing out the actions
+		 * set in the assist page.
+		 */
+		/* The copy length is capped at the size of the assist-page buffer. */
+		memcpy(hvp->vtl_ret_actions,
+		       mshv_vtl_this_run()->vtl_ret_actions,
+		       min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
+	}
+
+	mshv_vtl_return_call(vtl0);
+}
+
+/*
+ * Fetch a pending intercept message from the SynIC message page of the
+ * current CPU (slot HV_SYNIC_INTERCEPTION_SINT_INDEX).
+ *
+ * Returns false when a message was copied into the run page's exit_message
+ * and end-of-message was signalled; returns true when there is nothing to
+ * deliver (no message page, or the slot holds HVMSG_NONE).
+ */
+static bool mshv_vtl_process_intercept(void)
+{
+	struct hv_per_cpu_context *mshv_cpu;
+	void *synic_message_page;
+	struct hv_message *msg;
+	u32 message_type;
+
+	mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+	synic_message_page = mshv_cpu->hyp_synic_message_page;
+	if (unlikely(!synic_message_page))
+		return true;
+
+	msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
+	message_type = READ_ONCE(msg->header.message_type);
+	if (message_type == HVMSG_NONE)
+		return true;
+
+	/* Copy out first, then release the slot back to the hypervisor. */
+	memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
+	vmbus_signal_eom(msg, message_type);
+
+	return false;
+}
+
+/*
+ * MSHV_RETURN_TO_LOWER_VTL: run the lower VTL on the current CPU until there
+ * is an exit message for user space.
+ *
+ * Each iteration handles pending guest-mode work, checks the run page's
+ * cancel flag with interrupts disabled, switches to the lower VTL and, on
+ * re-entry, inspects the assist page's entry reason.
+ *
+ * Returns 0 once an intercept message has been placed in the run page's
+ * exit_message, -EINTR if the run was cancelled, or the error returned by
+ * xfer_to_guest_mode_handle_work().  An unknown entry reason panics.
+ */
+static int mshv_vtl_ioctl_return_to_lower_vtl(void)
+{
+	preempt_disable();
+	for (;;) {
+		unsigned long irq_flags;
+		struct hv_vp_assist_page *hvp;
+		int ret;
+
+		/* Pending work is handled with preemption enabled. */
+		if (__xfer_to_guest_mode_work_pending()) {
+			preempt_enable();
+			ret = xfer_to_guest_mode_handle_work();
+			if (ret)
+				return ret;
+			preempt_disable();
+		}
+
+		/* Check for cancellation and switch VTL with interrupts off. */
+		local_irq_save(irq_flags);
+		if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
+			local_irq_restore(irq_flags);
+			preempt_enable();
+			return -EINTR;
+		}
+
+		mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
+		local_irq_restore(irq_flags);
+
+		hvp = hv_vp_assist_page[smp_processor_id()];
+		this_cpu_inc(num_vtl0_transitions);
+		switch (hvp->vtl_entry_reason) {
+		case MSHV_ENTRY_REASON_INTERRUPT:
+			/*
+			 * Without an intercept page, intercepts arrive as SynIC
+			 * messages; leave the loop once one has been copied out.
+			 */
+			if (!mshv_vsm_capabilities.intercept_page_available &&
+			    likely(!mshv_vtl_process_intercept()))
+				goto done;
+			break;
+
+		case MSHV_ENTRY_REASON_INTERCEPT:
+			/* The message is delivered through the assist page. */
+			WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
+			memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
+			       sizeof(hvp->intercept_message));
+			goto done;
+
+		default:
+			panic("unknown entry reason: %d", hvp->vtl_entry_reason);
+		}
+	}
+
+done:
+	preempt_enable();
+
+	return 0;
+}
+
+/*
+ * MSHV_GET_VP_REGISTERS: read a single register and copy it back to the
+ * user-supplied register array.
+ *
+ * The register is read directly when mshv_vtl_get_set_reg() can handle it,
+ * otherwise through a hypercall.  Returns 0 on success, -EINVAL unless
+ * exactly one register is requested, -EFAULT on a user copy failure, or the
+ * hypercall error.
+ */
+static long
+mshv_vtl_ioctl_get_regs(void __user *user_args)
+{
+	struct mshv_vp_registers args;
+	struct hv_register_assoc reg;
+
+	if (copy_from_user(&args, user_args, sizeof(args)))
+		return -EFAULT;
+
+	/*  This IOCTL supports processing only one register at a time. */
+	if (args.count != 1)
+		return -EINVAL;
+
+	if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+		return -EFAULT;
+
+	if (mshv_vtl_get_set_reg(&reg, false)) {
+		/* Not handled locally: ask the hypervisor. */
+		long err = vtl_get_vp_register(&reg);
+
+		if (err)
+			return err;
+	}
+
+	if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * MSHV_SET_VP_REGISTERS: write a single register supplied by user space.
+ *
+ * The register is written directly when mshv_vtl_get_set_reg() can handle
+ * it, otherwise through a hypercall.  Returns 0 on success, -EINVAL unless
+ * exactly one register is supplied, -EFAULT on a user copy failure, or the
+ * hypercall error.
+ */
+static long
+mshv_vtl_ioctl_set_regs(void __user *user_args)
+{
+	struct mshv_vp_registers args;
+	struct hv_register_assoc reg;
+
+	if (copy_from_user(&args, user_args, sizeof(args)))
+		return -EFAULT;
+
+	/*  This IOCTL supports processing only one register at a time. */
+	if (args.count != 1)
+		return -EINVAL;
+
+	if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+		return -EFAULT;
+
+	/* No need of hypercall when the register was written directly. */
+	if (!mshv_vtl_get_set_reg(&reg, true))
+		return 0;
+
+	return vtl_set_vp_register(&reg);
+}
+
+/*
+ * ioctl entry point of a VTL file: dispatch to the per-command handler.
+ * Unknown commands are logged and rejected with -ENOTTY.
+ */
+static long
+mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	struct mshv_vtl *vtl = filp->private_data;
+	void __user *uarg = (void __user *)arg;
+
+	switch (ioctl) {
+	case MSHV_SET_POLL_FILE:
+		return mshv_vtl_ioctl_set_poll_file(uarg);
+	case MSHV_GET_VP_REGISTERS:
+		return mshv_vtl_ioctl_get_regs(uarg);
+	case MSHV_SET_VP_REGISTERS:
+		return mshv_vtl_ioctl_set_regs(uarg);
+	case MSHV_RETURN_TO_LOWER_VTL:
+		return mshv_vtl_ioctl_return_to_lower_vtl();
+	case MSHV_ADD_VTL0_MEMORY:
+		return mshv_vtl_ioctl_add_vtl0_mem(vtl, uarg);
+	default:
+		break;
+	}
+
+	dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
+
+	return -ENOTTY;
+}
+
+/*
+ * Page-fault handler for mappings of a VTL file.
+ *
+ * vmf->pgoff encodes the target CPU (bits selected by MSHV_PG_OFF_CPU_MASK)
+ * and the page kind (bits above MSHV_REAL_OFF_SHIFT): the CPU's run page or
+ * its register page.  On success the page is returned in vmf->page with an
+ * additional reference.
+ *
+ * Returns 0 on success and VM_FAULT_SIGBUS when the CPU is offline, the
+ * offset does not name a known page, or the register page is unavailable.
+ */
+static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
+{
+	struct page *page;
+	int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
+	int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
+
+	if (!cpu_online(cpu))
+		return VM_FAULT_SIGBUS;
+	/*
+	 * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+	 * CPU is expected to remain online after above cpu_online() check.
+	 */
+
+	if (real_off == MSHV_RUN_PAGE_OFFSET) {
+		page = virt_to_page(mshv_vtl_cpu_run(cpu));
+	} else if (real_off == MSHV_REG_PAGE_OFFSET) {
+		if (!mshv_has_reg_page)
+			return VM_FAULT_SIGBUS;
+		page = mshv_vtl_cpu_reg_page(cpu);
+		/*
+		 * mshv_has_reg_page is global, but register page setup can
+		 * fail on an individual CPU and leave its slot NULL.
+		 */
+		if (!page)
+			return VM_FAULT_SIGBUS;
+	} else {
+		/*
+		 * No page exists for this offset.  VM_FAULT_NOPAGE would claim
+		 * a PTE had been installed and make the access fault forever.
+		 */
+		return VM_FAULT_SIGBUS;
+	}
+
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+/* VMA operations for mappings of a VTL file: pages are supplied on fault. */
+static const struct vm_operations_struct mshv_vtl_vm_ops = {
+	.fault = mshv_vtl_fault,
+};
+
+/*
+ * mmap handler of a VTL file: only installs mshv_vtl_vm_ops; the actual pages
+ * are resolved lazily in mshv_vtl_fault().  Always returns 0.
+ */
+static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &mshv_vtl_vm_ops;
+
+	return 0;
+}
+
+/* release handler of a VTL file: free the mshv_vtl stored in private_data. */
+static int mshv_vtl_release(struct inode *inode, struct file *filp)
+{
+	struct mshv_vtl *vtl = filp->private_data;
+
+	kfree(vtl);
+
+	return 0;
+}
+
+/* File operations of a VTL file: ioctl, mmap of per-CPU pages, and release. */
+static const struct file_operations mshv_vtl_fops = {
+	.mmap = mshv_vtl_mmap,
+	.release = mshv_vtl_release,
+	.unlocked_ioctl = mshv_vtl_ioctl,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * Program the VTL2 VMBus SINT on the calling CPU: masked when *@mask is
+ * non-zero, unmasked otherwise.
+ */
+static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
+{
+	union hv_synic_sint sint = { .as_uint64 = 0 };
+
+	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+	sint.masked = (*mask != 0);
+	sint.auto_eoi = hv_recommend_using_aeoi();
+
+	hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, sint.as_uint64);
+
+	if (sint.masked)
+		pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+	else
+		pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+}
+
+/*
+ * Cross-CPU callback used by mshv_vtl_sint_read(): copy the pending message
+ * in the VTL2 VMBus SINT slot of this CPU's SynIC message page into @buffer
+ * (a struct hv_message) and signal end-of-message.
+ *
+ * has_message is cleared before the slot is inspected; @buffer is left
+ * untouched when the slot holds HVMSG_NONE.
+ */
+static void mshv_vtl_read_remote(void *buffer)
+{
+	struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+	struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
+					VTL2_VMBUS_SINT_INDEX;
+	u32 message_type = READ_ONCE(msg->header.message_type);
+
+	WRITE_ONCE(has_message, false);
+	if (message_type == HVMSG_NONE)
+		return;
+
+	memcpy(buffer, msg, sizeof(*msg));
+	vmbus_signal_eom(msg, message_type);
+}
+
+/* True while the VTL2 VMBus SINT is masked (message stream paused). */
+static bool vtl_synic_mask_vmbus_sint_masked = true;
+
+/*
+ * read() handler of the mshv_sint device: return one struct hv_message.
+ *
+ * The message is fetched on VMBUS_CONNECT_CPU.  If none is pending the call
+ * returns 0 (EOF) while the SINT is masked, -EAGAIN for O_NONBLOCK files, and
+ * otherwise sleeps until a message arrives or the SINT gets masked.
+ *
+ * Returns sizeof(struct hv_message) on success, -EINVAL if @size is too
+ * small, -EFAULT on a user copy failure, or the error from an interrupted
+ * wait.
+ */
+static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
+{
+	struct hv_message msg = {};
+	int ret;
+
+	if (size < sizeof(msg))
+		return -EINVAL;
+
+	for (;;) {
+		/* msg stays zeroed (HVMSG_NONE) when nothing was pending. */
+		smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
+		if (msg.header.message_type != HVMSG_NONE)
+			break;
+
+		if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+			return 0; /* EOF */
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		ret = wait_event_interruptible(fd_wait_queue,
+					       READ_ONCE(has_message) ||
+						READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
+		if (ret)
+			return ret;
+	}
+
+	if (copy_to_user(arg, &msg, sizeof(msg)))
+		return -EFAULT;
+
+	return sizeof(msg);
+}
+
+/*
+ * poll() handler of the mshv_sint device: readable when a message is pending
+ * or when the SINT is masked (a read would then return EOF).
+ */
+static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
+{
+	poll_wait(filp, &fd_wait_queue, wait);
+
+	if (!READ_ONCE(has_message) && !READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * Tasklet body scheduled from mshv_vtl_vmbus_isr(): flag that a message is
+ * pending and wake readers/pollers of the mshv_sint device.  @data is unused.
+ */
+static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
+{
+	WRITE_ONCE(has_message, true);
+	wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+}
+
+/*
+ * MSHV_SINT_POST_MESSAGE: post a user-supplied message to a connection.
+ *
+ * The payload is bounced through a kernel buffer of
+ * HV_MESSAGE_PAYLOAD_BYTE_COUNT bytes.  Returns the result of
+ * hv_post_message(), -EINVAL if the payload is too large, or -EFAULT on a
+ * user copy failure.
+ */
+static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg)
+{
+	struct mshv_vtl_sint_post_msg message;
+	u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
+
+	if (copy_from_user(&message, arg, sizeof(message)))
+		return -EFAULT;
+	/* Bound the size before it is used as a copy length into payload[]. */
+	if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
+		return -EINVAL;
+	if (copy_from_user(payload, (void __user *)message.payload_ptr,
+			   message.payload_size))
+		return -EFAULT;
+
+	return hv_post_message((union hv_connection_id)message.connection_id,
+			       message.message_type, (void *)payload,
+			       message.payload_size);
+}
+
+/*
+ * MSHV_SINT_SIGNAL_EVENT: issue the HVCALL_SIGNAL_EVENT fast hypercall.
+ *
+ * The 64-bit input carries the connection id in the low half and the flag
+ * number in the high half.  Returns the hypercall status translated to an
+ * errno, or -EFAULT on a user copy failure.
+ */
+static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
+{
+	struct mshv_vtl_signal_event ev;
+	u64 hv_input;
+
+	if (copy_from_user(&ev, arg, sizeof(ev)))
+		return -EFAULT;
+
+	hv_input = ((u64)ev.flag << 32) | ev.connection_id;
+
+	return hv_result_to_errno(hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, hv_input));
+}
+
+/*
+ * MSHV_SINT_SET_EVENTFD: bind (fd >= 0) or unbind (fd < 0) the eventfd that
+ * mshv_vtl_vmbus_isr() signals for a given event flag.
+ *
+ * Returns 0 on success, -EFAULT on a user copy failure, -EINVAL for an
+ * out-of-range flag, or the error from eventfd_ctx_fdget().
+ */
+static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
+{
+	struct mshv_vtl_set_eventfd set_eventfd;
+	struct eventfd_ctx *eventfd, *old_eventfd;
+
+	if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
+		return -EFAULT;
+	if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
+		return -EINVAL;
+
+	eventfd = NULL;
+	if (set_eventfd.fd >= 0) {
+		eventfd = eventfd_ctx_fdget(set_eventfd.fd);
+		if (IS_ERR(eventfd))
+			return PTR_ERR(eventfd);
+	}
+
+	/* flag_lock is held until the function returns (scope-based guard). */
+	guard(mutex)(&flag_lock);
+	old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]);
+	WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);
+
+	if (old_eventfd) {
+		/* Wait for ISR readers under rcu_read_lock() before dropping it. */
+		synchronize_rcu();
+		eventfd_ctx_put(old_eventfd);
+	}
+
+	return 0;
+}
+
+/*
+ * on_each_cpu() trampoline with the exact smp_call_func_t prototype.
+ *
+ * Calling mshv_vtl_synic_mask_vmbus_sint() through a cast function pointer is
+ * an indirect call via an incompatible type, which indirect-call type
+ * checking (kCFI) rejects at run time.  @info points to the u8 mask value.
+ */
+static void mshv_vtl_synic_mask_vmbus_sint_cb(void *info)
+{
+	mshv_vtl_synic_mask_vmbus_sint(info);
+}
+
+/*
+ * MSHV_SINT_PAUSE_MESSAGE_STREAM: mask (mask != 0) or unmask the VTL2 VMBus
+ * SINT on every CPU and record the new state.
+ *
+ * When masking, sleepers on fd_wait_queue are woken so that readers observe
+ * EOF.  Returns 0 on success or -EFAULT on a user copy failure.
+ */
+static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg)
+{
+	static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
+	struct mshv_sint_mask mask;
+
+	if (copy_from_user(&mask, arg, sizeof(mask)))
+		return -EFAULT;
+	/* Serialise concurrent mask changes; released on function exit. */
+	guard(mutex)(&vtl2_vmbus_sint_mask_mutex);
+	/* wait == 1: 'mask' lives on this stack until all CPUs are done. */
+	on_each_cpu(mshv_vtl_synic_mask_vmbus_sint_cb, &mask.mask, 1);
+	WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
+	if (mask.mask)
+		wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+
+	return 0;
+}
+
+/*
+ * ioctl entry point of the mshv_sint device: dispatch to the per-command
+ * handler, or return -ENOIOCTLCMD for unknown commands.
+ */
+static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+
+	switch (cmd) {
+	case MSHV_SINT_POST_MESSAGE:
+		return mshv_vtl_sint_ioctl_post_msg(uarg);
+	case MSHV_SINT_SIGNAL_EVENT:
+		return mshv_vtl_sint_ioctl_signal_event(uarg);
+	case MSHV_SINT_SET_EVENTFD:
+		return mshv_vtl_sint_ioctl_set_eventfd(uarg);
+	case MSHV_SINT_PAUSE_MESSAGE_STREAM:
+		return mshv_vtl_sint_ioctl_pause_msg_stream(uarg);
+	default:
+		break;
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/* File operations of the mshv_sint device: message read, poll and ioctl. */
+static const struct file_operations mshv_vtl_sint_ops = {
+	.unlocked_ioctl = mshv_vtl_sint_ioctl,
+	.poll = mshv_vtl_sint_poll,
+	.read = mshv_vtl_sint_read,
+	.owner = THIS_MODULE,
+};
+
+/* Misc device named "mshv_sint"; the minor number is assigned dynamically. */
+static struct miscdevice mshv_vtl_sint_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mshv_sint",
+	.mode = 0600,
+	.fops = &mshv_vtl_sint_ops,
+};
+
+/*
+ * open() handler of the mshv_hvcall device.
+ *
+ * Requires CAP_SYS_ADMIN.  Allocates a zeroed per-open mshv_vtl_hvcall_fd
+ * (so the allow bitmap starts out empty) and stores it in f->private_data,
+ * replacing the miscdevice pointer found there on entry.
+ *
+ * Returns 0 on success, -EPERM without the capability, or -ENOMEM.
+ */
+static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f)
+{
+	struct miscdevice *dev = f->private_data;
+	struct mshv_vtl_hvcall_fd *fd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	fd = vzalloc(sizeof(*fd));
+	if (!fd)
+		return -ENOMEM;
+	fd->dev = dev;
+	f->private_data = fd;
+	mutex_init(&fd->init_mutex);
+
+	return 0;
+}
+
+/*
+ * release() handler of the mshv_hvcall device: free the per-open state
+ * allocated in mshv_vtl_hvcall_dev_open(), if any.  Always returns 0.
+ */
+static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f)
+{
+	struct mshv_vtl_hvcall_fd *fd = f->private_data;
+
+	if (!fd)
+		return 0;
+
+	vfree(fd);
+	f->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * MSHV_HVCALL_SETUP: install the bitmap of hypercall codes this file is
+ * allowed to issue.  The bitmap can be set only once per open file.
+ *
+ * Returns 0 on success, -EINVAL if the bitmap was already set or the
+ * supplied size is too large, or -EFAULT on a user copy failure.
+ */
+static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd,
+				    struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
+{
+	struct mshv_vtl_hvcall_setup hvcall_setup;
+
+	/* init_mutex is held until the function returns (scope-based guard). */
+	guard(mutex)(&fd->init_mutex);
+
+	if (fd->allow_map_initialized) {
+		dev_err(fd->dev->this_device,
+			"Hypercall allow map has already been set, pid %d\n",
+			current->pid);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&hvcall_setup, hvcall_setup_user,
+			   sizeof(struct mshv_vtl_hvcall_setup))) {
+		return -EFAULT;
+	}
+	/*
+	 * NOTE(review): the bound is an element count (ARRAY_SIZE) while the
+	 * copy below treats bitmap_array_size as a byte count - confirm the
+	 * intended unit against the UAPI definition.
+	 */
+	if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap))
+		return -EINVAL;
+
+	if (copy_from_user(&fd->allow_bitmap,
+			   (void __user *)hvcall_setup.allow_bitmap_ptr,
+			   hvcall_setup.bitmap_array_size)) {
+		return -EFAULT;
+	}
+
+	dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
+		 current->pid);
+	fd->allow_map_initialized = true;
+	return 0;
+}
+
+/* Return true if bit @call_code is set in @fd's hypercall allow bitmap. */
+static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
+{
+	return test_bit(call_code, (unsigned long *)fd->allow_bitmap);
+}
+
+/*
+ * MSHV_HVCALL: issue a hypercall on behalf of user space.
+ *
+ * @fd:          per-open state holding the allow bitmap.
+ * @hvcall_user: user pointer to a struct mshv_vtl_hvcall describing the
+ *               control word and the input/output buffers.
+ *
+ * The call code (low 16 bits of the control word) must be enabled in the
+ * allow bitmap.  Input and output are bounced through one kernel page each;
+ * the hypercall status is written back to @hvcall_user->status.
+ *
+ * Returns 0 on success, -EINVAL if a buffer exceeds HV_HYP_PAGE_SIZE, -EPERM
+ * if the call is not allowed, -ENOMEM if a bounce page cannot be allocated,
+ * or -EFAULT on a user copy failure.
+ */
+static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
+				struct mshv_vtl_hvcall __user *hvcall_user)
+{
+	struct mshv_vtl_hvcall hvcall;
+	void *in, *out;
+	int ret;
+
+	if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
+		return -EFAULT;
+	if (hvcall.input_size > HV_HYP_PAGE_SIZE)
+		return -EINVAL;
+	if (hvcall.output_size > HV_HYP_PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * By default, all hypercalls are not allowed.
+	 * The user mode code has to set up the allow bitmap once.
+	 */
+
+	if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
+		dev_err(fd->dev->this_device,
+			"Hypercall with control data %#llx isn't allowed\n",
+			hvcall.control);
+		return -EPERM;
+	}
+
+	/*
+	 * This may create a problem for Confidential VM (CVM) usecase where we need to use
+	 * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and
+	 * hyperv_pcpu_output_arg) for making a hypervisor call.
+	 *
+	 * TODO: Take care of this when CVM support is added.
+	 */
+	/*
+	 * Use zeroed pages: bytes of 'in' beyond input_size go to the
+	 * hypervisor, and bytes of 'out' that the hypervisor does not write
+	 * are copied to user space, so neither may hold stale kernel data.
+	 */
+	in = (void *)get_zeroed_page(GFP_KERNEL);
+	out = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!in || !out) {
+		/* free_page() ignores a zero address, so one label suffices. */
+		ret = -ENOMEM;
+		goto free_pages;
+	}
+
+	if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
+		ret = -EFAULT;
+		goto free_pages;
+	}
+
+	hvcall.status = hv_do_hypercall(hvcall.control, in, out);
+
+	if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
+		ret = -EFAULT;
+		goto free_pages;
+	}
+	ret = put_user(hvcall.status, &hvcall_user->status);
+free_pages:
+	free_page((unsigned long)in);
+	free_page((unsigned long)out);
+
+	return ret;
+}
+
+/*
+ * ioctl entry point of the mshv_hvcall device: set up the allow bitmap or
+ * issue a hypercall; unknown commands yield -ENOIOCTLCMD.
+ */
+static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	struct mshv_vtl_hvcall_fd *fd = f->private_data;
+	void __user *uarg = (void __user *)arg;
+
+	if (cmd == MSHV_HVCALL_SETUP)
+		return mshv_vtl_hvcall_do_setup(fd, uarg);
+
+	if (cmd == MSHV_HVCALL)
+		return mshv_vtl_hvcall_call(fd, uarg);
+
+	return -ENOIOCTLCMD;
+}
+
+/* File operations of the mshv_hvcall device: open, release and ioctl. */
+static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
+	.unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl,
+	.release = mshv_vtl_hvcall_dev_release,
+	.open = mshv_vtl_hvcall_dev_open,
+	.owner = THIS_MODULE,
+};
+
+/* Misc device named "mshv_hvcall"; the minor number is assigned dynamically. */
+static struct miscdevice mshv_vtl_hvcall_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mshv_hvcall",
+	.nodename = "mshv_hvcall",
+	.mode = 0600,
+	.fops = &mshv_vtl_hvcall_dev_file_ops,
+};
+
+/*
+ * open() handler of the mshv_vtl_low device.
+ *
+ * Requires CAP_SYS_ADMIN; on success the inode is stored in
+ * filp->private_data.  Returns 0 on success or -EPERM.
+ */
+static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
+{
+	pid_t pid = task_pid_vnr(current);
+	uid_t uid = current_uid().val;
+	int ret = 0;
+
+	/* uid_t is unsigned, so print it with %u. */
+	pr_debug("%s: Opening VTL low, task group %d, uid %u\n", __func__, pid, uid);
+
+	if (capable(CAP_SYS_ADMIN)) {
+		filp->private_data = inodep;
+	} else {
+		/* Terminate with a newline so the record is not left open. */
+		pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %u\n",
+		       __func__, pid, uid);
+		ret = -EPERM;
+	}
+
+	return ret;
+}
+
+/*
+ * Decide whether the fault described by @vmf can be served with a mapping of
+ * @size bytes (a power of two is assumed by the mask arithmetic).
+ *
+ * This requires the faulting address and the file offset to have the same
+ * offset within a @size-aligned block, and the aligned block to lie fully
+ * inside the VMA.  On success *@pfn receives the page offset rounded down to
+ * the block boundary; otherwise *@pfn is left untouched.
+ */
+static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
+{
+	const unsigned long offset_mask = size - 1;
+	const unsigned long aligned_start = vmf->address & ~offset_mask;
+	const unsigned long aligned_end = aligned_start + size;
+
+	if ((vmf->address & offset_mask) != ((vmf->pgoff << PAGE_SHIFT) & offset_mask))
+		return false;
+
+	if (aligned_start < vmf->vma->vm_start || aligned_end > vmf->vma->vm_end)
+		return false;
+
+	*pfn = vmf->pgoff & ~(offset_mask >> PAGE_SHIFT);
+
+	return true;
+}
+
+/*
+ * Fault handler for mshv_vtl_low mappings at a given page-table @order.
+ *
+ * The file page offset is used directly as the PFN to map.  Order 0 inserts
+ * a single page; PMD and PUD orders insert a huge mapping when can_fault()
+ * accepts the alignment and otherwise return VM_FAULT_FALLBACK so that a
+ * smaller order is tried.  Any other order yields VM_FAULT_SIGBUS.
+ */
+static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
+{
+	unsigned long pfn = vmf->pgoff;
+	vm_fault_t ret = VM_FAULT_FALLBACK;
+
+	switch (order) {
+	case 0:
+		return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+	case PMD_ORDER:
+		/* can_fault() rewrites pfn to the PMD-aligned base on success. */
+		if (can_fault(vmf, PMD_SIZE, &pfn))
+			ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+		return ret;
+
+	case PUD_ORDER:
+		/* Same as above, for a PUD-sized block. */
+		if (can_fault(vmf, PUD_SIZE, &pfn))
+			ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+		return ret;
+
+	default:
+		return VM_FAULT_SIGBUS;
+	}
+}
+
+/* Base-page fault handler: the order-0 case of mshv_vtl_low_huge_fault(). */
+static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
+{
+	return mshv_vtl_low_huge_fault(vmf, 0);
+}
+
+/* VMA operations for mshv_vtl_low mappings: base-page and huge-page faults. */
+static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
+	.fault = mshv_vtl_low_fault,
+	.huge_fault = mshv_vtl_low_huge_fault,
+};
+
+/*
+ * mmap handler of the mshv_vtl_low device: install the fault handlers and
+ * mark the VMA VM_HUGEPAGE | VM_MIXEDMAP.  Always returns 0.
+ */
+static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &mshv_vtl_low_vm_ops;
+	vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
+
+	return 0;
+}
+
+/* File operations of the mshv_vtl_low device: open and mmap only. */
+static const struct file_operations mshv_vtl_low_file_ops = {
+	.mmap		= mshv_vtl_low_mmap,
+	.open		= mshv_vtl_low_open,
+	.owner		= THIS_MODULE,
+};
+
+/* Misc device named "mshv_vtl_low"; the minor number is assigned dynamically. */
+static struct miscdevice mshv_vtl_low = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mshv_vtl_low",
+	.nodename = "mshv_vtl_low",
+	.mode = 0600,
+	.fops = &mshv_vtl_low_file_ops,
+};
+
+/*
+ * Module init.
+ *
+ * Registers /dev/mshv, reads the VSM capabilities, configures the VSM
+ * partition, installs the SynIC handling, registers the mshv_sint,
+ * mshv_hvcall and mshv_vtl_low devices and finally adds the device that
+ * backs VTL0 memory.  Every failure unwinds the steps completed so far.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init mshv_vtl_init(void)
+{
+	int ret;
+	struct device *dev;
+
+	/*
+	 * Initialise the state used by the ioctl and interrupt paths before
+	 * any device node becomes visible to user space.
+	 */
+	tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
+	init_waitqueue_head(&fd_wait_queue);
+	mutex_init(&mshv_vtl_poll_file_lock);
+
+	/*
+	 * This creates /dev/mshv which provides functionality to create VTLs and partitions.
+	 */
+	ret = misc_register(&mshv_dev);
+	if (ret) {
+		/* Nothing was registered, so there is nothing to unwind. */
+		pr_err("mshv device register failed: %d\n", ret);
+		return ret;
+	}
+	/* this_device is only valid once misc_register() has succeeded. */
+	dev = mshv_dev.this_device;
+
+	if (mshv_vtl_get_vsm_regs()) {
+		dev_emerg(dev, "Unable to get VSM capabilities !!\n");
+		ret = -ENODEV;
+		goto free_dev;
+	}
+	if (mshv_vtl_configure_vsm_partition(dev)) {
+		dev_emerg(dev, "VSM configuration failed !!\n");
+		ret = -ENODEV;
+		goto free_dev;
+	}
+
+	mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);
+	ret = hv_vtl_setup_synic();
+	if (ret)
+		goto free_dev;
+
+	/*
+	 * mshv_sint device adds VMBus relay ioctl support.
+	 * This provides a channel for VTL0 to communicate with VTL2.
+	 */
+	ret = misc_register(&mshv_vtl_sint_dev);
+	if (ret)
+		goto free_synic;
+
+	/*
+	 * mshv_hvcall device adds interface to enable userspace for direct hypercalls support.
+	 */
+	ret = misc_register(&mshv_vtl_hvcall_dev);
+	if (ret)
+		goto free_sint;
+
+	/*
+	 * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2.
+	 * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0.
+	 */
+	ret = misc_register(&mshv_vtl_low);
+	if (ret)
+		goto free_hvcall;
+
+	/*
+	 * "mshv vtl mem dev" device is later used to setup VTL0 memory.
+	 */
+	mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
+	if (!mem_dev) {
+		ret = -ENOMEM;
+		goto free_low;
+	}
+
+	device_initialize(mem_dev);
+	dev_set_name(mem_dev, "mshv vtl mem dev");
+	ret = device_add(mem_dev);
+	if (ret) {
+		dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
+		goto free_mem;
+	}
+
+	return 0;
+
+free_mem:
+	/*
+	 * NOTE(review): the device was initialised and named but is freed with
+	 * kfree() instead of put_device(); mem_dev has no release callback, so
+	 * confirm whether the name allocated by dev_set_name() leaks here.
+	 */
+	kfree(mem_dev);
+free_low:
+	misc_deregister(&mshv_vtl_low);
+free_hvcall:
+	misc_deregister(&mshv_vtl_hvcall_dev);
+free_sint:
+	misc_deregister(&mshv_vtl_sint_dev);
+free_synic:
+	hv_vtl_remove_synic();
+free_dev:
+	misc_deregister(&mshv_dev);
+
+	return ret;
+}
+
+/*
+ * Module exit: tear everything down in the reverse order of mshv_vtl_init().
+ *
+ * NOTE(review): mem_dev is removed with device_del() + kfree() rather than
+ * device_unregister()/put_device() - confirm this is intended given that the
+ * device has no release callback.
+ */
+static void __exit mshv_vtl_exit(void)
+{
+	device_del(mem_dev);
+	kfree(mem_dev);
+	misc_deregister(&mshv_vtl_low);
+	misc_deregister(&mshv_vtl_hvcall_dev);
+	misc_deregister(&mshv_vtl_sint_dev);
+	hv_vtl_remove_synic();
+	misc_deregister(&mshv_dev);
+}
+
+module_init(mshv_vtl_init);
+module_exit(mshv_vtl_exit);
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 7499a679e60a..1d5ce11be8b6 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -885,6 +885,48 @@ struct hv_get_vp_from_apic_id_in {
 	u32 apic_ids[];
 } __packed;
 
+union hv_register_vsm_partition_config {
+	u64 as_uint64;
+	struct {
+		u64 enable_vtl_protection : 1;
+		u64 default_vtl_protection_mask : 4;
+		u64 zero_memory_on_reset : 1;
+		u64 deny_lower_vtl_startup : 1;
+		u64 intercept_acceptance : 1;
+		u64 intercept_enable_vtl_protection : 1;
+		u64 intercept_vp_startup : 1;
+		u64 intercept_cpuid_unimplemented : 1;
+		u64 intercept_unrecoverable_exception : 1;
+		u64 intercept_page : 1;
+		u64 mbz : 51;
+	} __packed;
+};
+
+union hv_register_vsm_capabilities {
+	u64 as_uint64;
+	struct {
+		u64 dr6_shared: 1;
+		u64 mbec_vtl_mask: 16;
+		u64 deny_lower_vtl_startup: 1;
+		u64 supervisor_shadow_stack: 1;
+		u64 hardware_hvpt_available: 1;
+		u64 software_hvpt_available: 1;
+		u64 hardware_hvpt_range_bits: 6;
+		u64 intercept_page_available: 1;
+		u64 return_action_available: 1;
+		u64 reserved: 35;
+	} __packed;
+};
+
+union hv_register_vsm_page_offsets {
+	struct {
+		u64 vtl_call_offset : 12;
+		u64 vtl_return_offset : 12;
+		u64 reserved_mbz : 40;
+	} __packed;
+	u64 as_uint64;
+};
+
 struct hv_nested_enlightenments_control {
 	struct {
 		u32 directhypercall : 1;
@@ -1007,6 +1049,70 @@ enum hv_register_name {
 
 	/* VSM */
 	HV_REGISTER_VSM_VP_STATUS				= 0x000D0003,
+
+	/* Synthetic VSM registers */
+	HV_REGISTER_VSM_CODE_PAGE_OFFSETS	= 0x000D0002,
+	HV_REGISTER_VSM_CAPABILITIES		= 0x000D0006,
+	HV_REGISTER_VSM_PARTITION_CONFIG	= 0x000D0007,
+
+#if defined(CONFIG_X86)
+	/* X64 Debug Registers */
+	HV_X64_REGISTER_DR0	= 0x00050000,
+	HV_X64_REGISTER_DR1	= 0x00050001,
+	HV_X64_REGISTER_DR2	= 0x00050002,
+	HV_X64_REGISTER_DR3	= 0x00050003,
+	HV_X64_REGISTER_DR6	= 0x00050004,
+	HV_X64_REGISTER_DR7	= 0x00050005,
+
+	/* X64 Cache control MSRs */
+	HV_X64_REGISTER_MSR_MTRR_CAP		= 0x0008000D,
+	HV_X64_REGISTER_MSR_MTRR_DEF_TYPE	= 0x0008000E,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0	= 0x00080010,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1	= 0x00080011,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2	= 0x00080012,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3	= 0x00080013,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4	= 0x00080014,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5	= 0x00080015,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6	= 0x00080016,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7	= 0x00080017,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8	= 0x00080018,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9	= 0x00080019,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA	= 0x0008001A,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB	= 0x0008001B,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC	= 0x0008001C,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASED	= 0x0008001D,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE	= 0x0008001E,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF	= 0x0008001F,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0	= 0x00080040,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1	= 0x00080041,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2	= 0x00080042,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3	= 0x00080043,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4	= 0x00080044,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5	= 0x00080045,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6	= 0x00080046,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7	= 0x00080047,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8	= 0x00080048,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9	= 0x00080049,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA	= 0x0008004A,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB	= 0x0008004B,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC	= 0x0008004C,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD	= 0x0008004D,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE	= 0x0008004E,
+	HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF	= 0x0008004F,
+	HV_X64_REGISTER_MSR_MTRR_FIX64K00000	= 0x00080070,
+	HV_X64_REGISTER_MSR_MTRR_FIX16K80000	= 0x00080071,
+	HV_X64_REGISTER_MSR_MTRR_FIX16KA0000	= 0x00080072,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KC0000	= 0x00080073,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KC8000	= 0x00080074,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KD0000	= 0x00080075,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KD8000	= 0x00080076,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KE0000	= 0x00080077,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KE8000	= 0x00080078,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KF0000	= 0x00080079,
+	HV_X64_REGISTER_MSR_MTRR_FIX4KF8000	= 0x0008007A,
+
+	HV_X64_REGISTER_REG_PAGE	= 0x0009001C,
+#endif
 };
 
 /*
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index b645d17cc531..dee3ece28ce5 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -322,4 +322,84 @@ struct mshv_get_set_vp_state {
  * #define MSHV_ROOT_HVCALL			_IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
  */
 
+/* Structure definitions, macros and IOCTLs for mshv_vtl */
+
+#define MSHV_CAP_CORE_API_STABLE        0x0
+#define MSHV_CAP_REGISTER_PAGE          0x1
+#define MSHV_CAP_VTL_RETURN_ACTION      0x2
+#define MSHV_CAP_DR6_SHARED             0x3
+#define MSHV_MAX_RUN_MSG_SIZE                256
+
+struct mshv_vp_registers {
+	__u32 count;	/* supports only 1 register at a time */
+	__u32 reserved; /* Reserved for alignment or future use */
+	__u64 regs_ptr;	/* pointer to struct hv_register_assoc */
+};
+
+struct mshv_vtl_set_eventfd {
+	__s32 fd;
+	__u32 flag;
+};
+
+struct mshv_vtl_signal_event {
+	__u32 connection_id;
+	__u32 flag;
+};
+
+struct mshv_vtl_sint_post_msg {
+	__u64 message_type;
+	__u32 connection_id;
+	__u32 payload_size; /* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */
+	__u64 payload_ptr; /* pointer to message payload (bytes) */
+};
+
+struct mshv_vtl_ram_disposition {
+	__u64 start_pfn;
+	__u64 last_pfn;
+};
+
+struct mshv_vtl_set_poll_file {
+	__u32 cpu;
+	__u32 fd;
+};
+
+struct mshv_vtl_hvcall_setup {
+	__u64 bitmap_array_size; /* stores number of bytes */
+	__u64 allow_bitmap_ptr;
+};
+
+struct mshv_vtl_hvcall {
+	__u64 control;      /* Hypercall control code */
+	__u64 input_size;   /* Size of the input data */
+	__u64 input_ptr;    /* Pointer to the input struct */
+	__u64 status;       /* Status of the hypercall (output) */
+	__u64 output_size;  /* Size of the output data */
+	__u64 output_ptr;   /* Pointer to the output struct */
+};
+
+struct mshv_sint_mask {
+	__u8 mask;
+	__u8 reserved[7];
+};
+
+/* /dev/mshv device IOCTL */
+#define MSHV_CHECK_EXTENSION    _IOW(MSHV_IOCTL, 0x00, __u32)
+
+/* vtl device */
+#define MSHV_CREATE_VTL			_IOR(MSHV_IOCTL, 0x1D, char)
+#define MSHV_ADD_VTL0_MEMORY	_IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition)
+#define MSHV_SET_POLL_FILE		_IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file)
+#define MSHV_RETURN_TO_LOWER_VTL	_IO(MSHV_IOCTL, 0x27)
+#define MSHV_GET_VP_REGISTERS		_IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers)
+#define MSHV_SET_VP_REGISTERS		_IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers)
+
+/* VMBus device IOCTLs */
+#define MSHV_SINT_SIGNAL_EVENT    _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event)
+#define MSHV_SINT_POST_MESSAGE    _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg)
+#define MSHV_SINT_SET_EVENTFD     _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd)
+#define MSHV_SINT_PAUSE_MESSAGE_STREAM     _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask)
+
+/* hv_hvcall device */
+#define MSHV_HVCALL_SETUP        _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
+#define MSHV_HVCALL              _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
 #endif
-- 
cgit v1.2.3


From 9d70ef7a18e0ec1653ac63020a13a5d4dda7cc0d Mon Sep 17 00:00:00 2001
From: Jinank Jain <jinankjain@microsoft.com>
Date: Mon, 24 Nov 2025 14:25:59 +0000
Subject: mshv: adjust interrupt control structure for ARM64

Interrupt control structure (union hv_interrupt_control) has different
fields when it comes to x86 vs ARM64. Bring in the correct structure
from HyperV header files and adjust the existing interrupt routing
code accordingly.

Signed-off-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_eventfd.c      | 6 ++++++
 drivers/hv/mshv_irq.c          | 4 ++++
 drivers/hv/mshv_root_hv_call.c | 6 ++++++
 include/hyperv/hvhdk.h         | 6 ++++++
 4 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 2a80af1d610a..d93a18f09c76 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
 		return -EOPNOTSUPP;
 
+#if IS_ENABLED(CONFIG_X86)
 	if (irq->lapic_control.logical_dest_mode)
 		return -EOPNOTSUPP;
+#endif
 
 	vp = partition->pt_vp_array[irq->lapic_apic_id];
 
@@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
 	unsigned int seq;
 	int idx;
 
+#if IS_ENABLED(CONFIG_X86)
 	WARN_ON(irqfd->irqfd_resampler &&
 		!irq->lapic_control.level_triggered);
+#endif
 
 	idx = srcu_read_lock(&partition->pt_irq_srcu);
 	if (irqfd->irqfd_girq_ent.guest_irq_num) {
@@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
 
 	spin_lock_irq(&pt->pt_irqfds_lock);
+#if IS_ENABLED(CONFIG_X86)
 	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
 	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
 		/*
@@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 		ret = -EINVAL;
 		goto fail;
 	}
+#endif
 	ret = 0;
 	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
 		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index d0fb9ef734f4..798e7e1ab06e 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
 	lirq->lapic_vector = ent->girq_irq_data & 0xFF;
 	lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
 	lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
+#if IS_ENABLED(CONFIG_X86)
 	lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
 	lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
+#elif IS_ENABLED(CONFIG_ARM64)
+	lirq->lapic_control.asserted = 1;
+#endif
 }
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index caf02cfa49c9..598eaff4ff29 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
 	memset(input, 0, sizeof(*input));
 	input->partition_id = partition_id;
 	input->vector = vector;
+	/*
+	 * NOTE: dest_addr only needs to be provided while asserting an
+	 * interrupt on x86 platform
+	 */
+#if IS_ENABLED(CONFIG_X86)
 	input->dest_addr = dest_addr;
+#endif
 	input->control = control;
 	status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
 	local_irq_restore(flags);
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 416c0d45b793..469186df7826 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -579,9 +579,15 @@ union hv_interrupt_control {
 	u64 as_uint64;
 	struct {
 		u32 interrupt_type; /* enum hv_interrupt_type */
+#if IS_ENABLED(CONFIG_X86)
 		u32 level_triggered : 1;
 		u32 logical_dest_mode : 1;
 		u32 rsvd : 30;
+#elif IS_ENABLED(CONFIG_ARM64)
+		u32 rsvd1 : 2;
+		u32 asserted : 1;
+		u32 rsvd2 : 29;
+#endif
 	} __packed;
 };
 
-- 
cgit v1.2.3


From 723c47a221ee407901055c9d9b4434e68c5d650e Mon Sep 17 00:00:00 2001
From: Praveen K Paladugu <prapal@linux.microsoft.com>
Date: Fri, 5 Dec 2025 14:17:06 -0600
Subject: mshv: Add definitions for MSHV sleep state configuration

Add the definitions required to configure sleep states in mshv hypervisor.

Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Co-developed-by: Anatol Belski <anbelski@linux.microsoft.com>
Signed-off-by: Anatol Belski <anbelski@linux.microsoft.com>
Reviewed-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Reviewed-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Acked-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/hyperv/hvgdk_mini.h |  4 +++-
 include/hyperv/hvhdk_mini.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 1d5ce11be8b6..04b18d0e37af 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -465,19 +465,21 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_RESET_DEBUG_SESSION			0x006b
 #define HVCALL_MAP_STATS_PAGE				0x006c
 #define HVCALL_UNMAP_STATS_PAGE				0x006d
+#define HVCALL_SET_SYSTEM_PROPERTY			0x006f
 #define HVCALL_ADD_LOGICAL_PROCESSOR			0x0076
 #define HVCALL_GET_SYSTEM_PROPERTY			0x007b
 #define HVCALL_MAP_DEVICE_INTERRUPT			0x007c
 #define HVCALL_UNMAP_DEVICE_INTERRUPT			0x007d
 #define HVCALL_RETARGET_INTERRUPT			0x007e
 #define HVCALL_NOTIFY_PARTITION_EVENT                   0x0087
+#define HVCALL_ENTER_SLEEP_STATE			0x0084
 #define HVCALL_NOTIFY_PORT_RING_EMPTY			0x008b
 #define HVCALL_REGISTER_INTERCEPT_RESULT		0x0091
 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT			0x0094
 #define HVCALL_CREATE_PORT				0x0095
 #define HVCALL_CONNECT_PORT				0x0096
 #define HVCALL_START_VP					0x0099
-#define HVCALL_GET_VP_INDEX_FROM_APIC_ID			0x009a
+#define HVCALL_GET_VP_INDEX_FROM_APIC_ID		0x009a
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE	0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST	0x00b0
 #define HVCALL_SIGNAL_EVENT_DIRECT			0x00c0
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index f2d7b50de7a4..41a29bf8ec14 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -140,6 +140,7 @@ enum hv_snp_status {
 
 enum hv_system_property {
 	/* Add more values when needed */
+	HV_SYSTEM_PROPERTY_SLEEP_STATE = 3,
 	HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15,
 	HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21,
 	HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47,
@@ -155,6 +156,19 @@ union hv_pfn_range {            /* HV_SPA_PAGE_RANGE */
 	} __packed;
 };
 
+enum hv_sleep_state {
+	HV_SLEEP_STATE_S1 = 1,
+	HV_SLEEP_STATE_S2 = 2,
+	HV_SLEEP_STATE_S3 = 3,
+	HV_SLEEP_STATE_S4 = 4,
+	HV_SLEEP_STATE_S5 = 5,
+	/*
+	 * After hypervisor has received this, any follow up sleep
+	 * state registration requests will be rejected.
+	 */
+	HV_SLEEP_STATE_LOCK = 6
+};
+
 enum hv_dynamic_processor_feature_property {
 	/* Add more values when needed */
 	HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13,
@@ -184,6 +198,32 @@ struct hv_output_get_system_property {
 	};
 } __packed;
 
+struct hv_sleep_state_info {
+	u32 sleep_state; /* enum hv_sleep_state */
+	u8 pm1a_slp_typ;
+	u8 pm1b_slp_typ;
+} __packed;
+
+struct hv_input_set_system_property {
+	u32 property_id; /* enum hv_system_property */
+	u32 reserved;
+	union {
+		/* More fields to be filled in when needed */
+		struct hv_sleep_state_info set_sleep_state_info;
+
+		/*
+		 * Add a reserved field to ensure the union is 8-byte aligned as
+		 * existing members may not be. This is a temporary measure
+		 * until all remaining members are added.
+		 */
+		 u64 reserved0[8];
+	};
+} __packed;
+
+struct hv_input_enter_sleep_state {     /* HV_INPUT_ENTER_SLEEP_STATE */
+	u32 sleep_state;        /* enum hv_sleep_state */
+} __packed;
+
 struct hv_input_map_stats_page {
 	u32 type; /* enum hv_stats_object_type */
 	u32 padding;
-- 
cgit v1.2.3


From 90dfeef1cd38dff19f8b3a752d13bfd79f0f7694 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Dec 2025 11:43:32 +0100
Subject: seqlock: Cure some more scoped_seqlock() optimization fails

Arnd reported an x86 randconfig using gcc-15 tripped over
__scoped_seqlock_bug(). Turns out GCC chose not to inline the
scoped_seqlock helper functions and as such was not able to optimize
properly.

[ mingo: Clang fails the build too in some circumstances. ]

Reported-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: https://patch.msgid.link/20251204104332.GG2528459@noisy.programming.kicks-ass.net
---
 include/linux/seqlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index a8a8661839b6..221123660e71 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1224,7 +1224,7 @@ struct ss_tmp {
 	spinlock_t	*lock_irqsave;
 };
 
-static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
+static __always_inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
 {
 	if (sst->lock)
 		spin_unlock(sst->lock);
@@ -1252,7 +1252,7 @@ static inline void __scoped_seqlock_bug(void) { }
 extern void __scoped_seqlock_bug(void);
 #endif
 
-static inline void
+static __always_inline void
 __scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target)
 {
 	switch (sst->state) {
-- 
cgit v1.2.3


From 5e5ea7f61610239fca058011e7d4f342b34d1558 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 6 Dec 2025 11:13:50 -0800
Subject: iommu/amd: fix SEV-TIO support reporting

Commit eeb934137deb ("iommu/amd: Report SEV-TIO support") was confused
about the config options that expose amd_iommu_sev_tio_supported(), and
made the declaration (and alternative dummy function) conditional on the
CONFIG_AMD_IOMMU config option.

But the code is actually dependent on CONFIG_KVM_AMD_SEV, resulting in

   ERROR: modpost: "amd_iommu_sev_tio_supported" [drivers/crypto/ccp/ccp.ko] undefined!
   make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1

if you have the AMD iommu enabled, but don't enable KVM_AMD_SEV support.

Fix it by moving the declaration into the right #ifdef section in the
header file.

Fixes: eeb934137deb ("iommu/amd: Report SEV-TIO support")
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/amd-iommu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 0f64f09d1f34..edcee9f5335a 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -18,12 +18,10 @@ struct task_struct;
 struct pci_dev;
 
 extern void amd_iommu_detect(void);
-extern bool amd_iommu_sev_tio_supported(void);
 
 #else /* CONFIG_AMD_IOMMU */
 
 static inline void amd_iommu_detect(void) { }
-static inline bool amd_iommu_sev_tio_supported(void) { return false; }
 
 #endif /* CONFIG_AMD_IOMMU */
 
@@ -72,8 +70,10 @@ struct amd_iommu *get_amd_iommu(unsigned int idx);
 
 #ifdef CONFIG_KVM_AMD_SEV
 int amd_iommu_snp_disable(void);
+extern bool amd_iommu_sev_tio_supported(void);
 #else
 static inline int amd_iommu_snp_disable(void) { return 0; }
+static inline bool amd_iommu_sev_tio_supported(void) { return false; }
 #endif
 
 #endif /* _ASM_X86_AMD_IOMMU_H */
-- 
cgit v1.2.3


From a4f2fa516e83f11c3792405599613c12efe6135e Mon Sep 17 00:00:00 2001
From: Joakim Zhang <joakim.zhang@cixtech.com>
Date: Fri, 5 Dec 2025 23:46:20 +0800
Subject: ALSA: hda/core: add addr_offset field for bus address translation

Add an addr_offset field to the bus for DMA address translation.
On some SoCs, such as the ARM64-based CIX SKY1, the host and the
HDAC have different views of memory, so DMA addresses need to be
translated between the host and the HDAC.

Signed-off-by: Joakim Zhang <joakim.zhang@cixtech.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20251205154621.3019640-3-joakim.zhang@cixtech.com
---
 include/sound/hdaudio.h     |  3 +++
 sound/hda/core/bus.c        |  1 +
 sound/hda/core/controller.c | 12 ++++++------
 sound/hda/core/stream.c     | 10 +++++-----
 4 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 4e0c1d8af09f..f11bfc6b9f42 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -380,6 +380,9 @@ struct hdac_bus {
 
 	/* factor used to derive STRIPE control value */
 	unsigned int sdo_limit;
+
+	/* address offset between host and hdac */
+	dma_addr_t addr_offset;
 };
 
 int snd_hdac_bus_init(struct hdac_bus *bus, struct device *dev,
diff --git a/sound/hda/core/bus.c b/sound/hda/core/bus.c
index 9b196c915f37..81498f1e413e 100644
--- a/sound/hda/core/bus.c
+++ b/sound/hda/core/bus.c
@@ -47,6 +47,7 @@ int snd_hdac_bus_init(struct hdac_bus *bus, struct device *dev,
 	INIT_LIST_HEAD(&bus->hlink_list);
 	init_waitqueue_head(&bus->rirb_wq);
 	bus->irq = -1;
+	bus->addr_offset = 0;
 
 	/*
 	 * Default value of '8' is as per the HD audio specification (Rev 1.0a).
diff --git a/sound/hda/core/controller.c b/sound/hda/core/controller.c
index a7c00ad80117..69e11d62bbfa 100644
--- a/sound/hda/core/controller.c
+++ b/sound/hda/core/controller.c
@@ -48,8 +48,8 @@ void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus)
 	/* CORB set up */
 	bus->corb.addr = bus->rb.addr;
 	bus->corb.buf = (__le32 *)bus->rb.area;
-	snd_hdac_chip_writel(bus, CORBLBASE, (u32)bus->corb.addr);
-	snd_hdac_chip_writel(bus, CORBUBASE, upper_32_bits(bus->corb.addr));
+	snd_hdac_chip_writel(bus, CORBLBASE, (u32)(bus->corb.addr + bus->addr_offset));
+	snd_hdac_chip_writel(bus, CORBUBASE, upper_32_bits(bus->corb.addr + bus->addr_offset));
 
 	/* set the corb size to 256 entries (ULI requires explicitly) */
 	snd_hdac_chip_writeb(bus, CORBSIZE, 0x02);
@@ -70,8 +70,8 @@ void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus)
 	bus->rirb.buf = (__le32 *)(bus->rb.area + 2048);
 	bus->rirb.wp = bus->rirb.rp = 0;
 	memset(bus->rirb.cmds, 0, sizeof(bus->rirb.cmds));
-	snd_hdac_chip_writel(bus, RIRBLBASE, (u32)bus->rirb.addr);
-	snd_hdac_chip_writel(bus, RIRBUBASE, upper_32_bits(bus->rirb.addr));
+	snd_hdac_chip_writel(bus, RIRBLBASE, (u32)(bus->rirb.addr + bus->addr_offset));
+	snd_hdac_chip_writel(bus, RIRBUBASE, upper_32_bits(bus->rirb.addr + bus->addr_offset));
 
 	/* set the rirb size to 256 entries (ULI requires explicitly) */
 	snd_hdac_chip_writeb(bus, RIRBSIZE, 0x02);
@@ -625,8 +625,8 @@ bool snd_hdac_bus_init_chip(struct hdac_bus *bus, bool full_reset)
 
 	/* program the position buffer */
 	if (bus->use_posbuf && bus->posbuf.addr) {
-		snd_hdac_chip_writel(bus, DPLBASE, (u32)bus->posbuf.addr);
-		snd_hdac_chip_writel(bus, DPUBASE, upper_32_bits(bus->posbuf.addr));
+		snd_hdac_chip_writel(bus, DPLBASE, (u32)(bus->posbuf.addr + bus->addr_offset));
+		snd_hdac_chip_writel(bus, DPUBASE, upper_32_bits(bus->posbuf.addr + bus->addr_offset));
 	}
 
 	bus->chip_init = true;
diff --git a/sound/hda/core/stream.c b/sound/hda/core/stream.c
index 579ec544ef4a..b471a038b314 100644
--- a/sound/hda/core/stream.c
+++ b/sound/hda/core/stream.c
@@ -288,16 +288,16 @@ int snd_hdac_stream_setup(struct hdac_stream *azx_dev, bool code_loading)
 
 	/* program the BDL address */
 	/* lower BDL address */
-	snd_hdac_stream_writel(azx_dev, SD_BDLPL, (u32)azx_dev->bdl.addr);
+	snd_hdac_stream_writel(azx_dev, SD_BDLPL, (u32)(azx_dev->bdl.addr + bus->addr_offset));
 	/* upper BDL address */
 	snd_hdac_stream_writel(azx_dev, SD_BDLPU,
-			       upper_32_bits(azx_dev->bdl.addr));
+			       upper_32_bits(azx_dev->bdl.addr + bus->addr_offset));
 
 	/* enable the position buffer */
 	if (bus->use_posbuf && bus->posbuf.addr) {
 		if (!(snd_hdac_chip_readl(bus, DPLBASE) & AZX_DPLBASE_ENABLE))
 			snd_hdac_chip_writel(bus, DPLBASE,
-				(u32)bus->posbuf.addr | AZX_DPLBASE_ENABLE);
+				(u32)(bus->posbuf.addr + bus->addr_offset) | AZX_DPLBASE_ENABLE);
 	}
 
 	/* set the interrupt enable bits in the descriptor control register */
@@ -464,8 +464,8 @@ static int setup_bdle(struct hdac_bus *bus,
 
 		addr = snd_sgbuf_get_addr(dmab, ofs);
 		/* program the address field of the BDL entry */
-		bdl[0] = cpu_to_le32((u32)addr);
-		bdl[1] = cpu_to_le32(upper_32_bits(addr));
+		bdl[0] = cpu_to_le32((u32)(addr + bus->addr_offset));
+		bdl[1] = cpu_to_le32(upper_32_bits(addr + bus->addr_offset));
 		/* program the size field of the BDL entry */
 		chunk = snd_sgbuf_get_chunk_size(dmab, ofs, size);
 		/* one BDLE cannot cross 4K boundary on CTHDA chips */
-- 
cgit v1.2.3


From 455a65260f526cedd4680d4836ebdf2eaf1ab4c6 Mon Sep 17 00:00:00 2001
From: Tobias Schumacher <ts@linux.ibm.com>
Date: Thu, 4 Dec 2025 06:05:01 +0100
Subject: genirq: Change hwirq parameter to irq_hw_number_t

The irqdomain implementation internally represents hardware IRQs as
irq_hw_number_t, which is defined as unsigned long int. When providing
an irq_hw_number_t to the generic_handle_domain() functions that expect
an unsigned int hwirq, this can lead to a loss of information. Change
the hwirq parameter to irq_hw_number_t to support the full range of
hwirqs.

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Farhan Ali <alifm@linux.ibm.com>
Signed-off-by: Tobias Schumacher <ts@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 include/linux/irqdesc.h | 6 +++---
 kernel/irq/irqdesc.c    | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 37e0b5b5600a..17902861de76 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -182,9 +182,9 @@ int generic_handle_irq_safe(unsigned int irq);
  * and handle the result interrupt number. Return -EINVAL if
  * conversion failed.
  */
-int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
-int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq);
-int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq);
+int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq);
+int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq);
+int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq);
 #endif
 
 /* Test to see if a driver has successfully requested an irq */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 6acf268f005b..f8e4e13dbe33 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -720,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
  * 		This function must be called from an IRQ context with irq regs
  * 		initialized.
  */
-int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq)
 {
 	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
 }
@@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
  * context). If the interrupt is marked as 'enforce IRQ-context only' then
  * the function must be invoked from hard interrupt context.
  */
-int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq)
 {
 	unsigned long flags;
 	int ret;
@@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
  * 		This function must be called from an NMI context with irq regs
  * 		initialized.
  **/
-int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq)
 {
 	WARN_ON_ONCE(!in_nmi());
 	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
-- 
cgit v1.2.3


From 0f35040de59371ad542b915d7b91176c9910dadc Mon Sep 17 00:00:00 2001
From: Harry Yoo <harry.yoo@oracle.com>
Date: Mon, 8 Dec 2025 00:41:47 +0900
Subject: mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache
 destruction

Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
caches when a cache is destroyed. This is unnecessary; only the RCU
sheaves belonging to the cache being destroyed need to be flushed.

As suggested by Vlastimil Babka, introduce a weaker form of
kvfree_rcu_barrier() that operates on a specific slab cache.

Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and
call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache().

Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on
cache destruction.

The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
5900X machine (1 socket), by loading slub_kunit module.

Before:
  Total calls: 19
  Average latency (us): 18127
  Total time (us): 344414

After:
  Total calls: 19
  Average latency (us): 10066
  Total time (us): 191264

Two performance regressions have been reported:
  - stress module loader test's runtime increases by 50-60% (Daniel)
  - internal graphics test's runtime on Tegra234 increases by 35% (Jon)

They are fixed by this change.

Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
Reported-and-tested-by: Daniel Gomez <da.gomez@samsung.com>
Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
Reported-and-tested-by: Jon Hunter <jonathanh@nvidia.com>
Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h |  7 +++++++
 mm/slab.h            |  1 +
 mm/slab_common.c     | 52 +++++++++++++++++++++++++++++++++++--------------
 mm/slub.c            | 55 ++++++++++++++++++++++++++++------------------------
 4 files changed, 75 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index cf443f064a66..2482992248dc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void)
 	rcu_barrier();
 }
 
+static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	rcu_barrier();
+}
+
 static inline void kfree_rcu_scheduler_running(void) { }
 #else
 void kvfree_rcu_barrier(void);
 
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
+
 void kfree_rcu_scheduler_running(void);
 #endif
 
diff --git a/mm/slab.h b/mm/slab.h
index f730e012553c..e767aa7e91b0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 
 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
 void flush_all_rcu_sheaves(void);
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
 
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 84dfff4f7b1f..dd8a49d6f9cc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 		return;
 
 	/* in-flight kfree_rcu()'s may include objects from our cache */
-	kvfree_rcu_barrier();
+	kvfree_rcu_barrier_on_cache(s);
 
 	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
 	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
@@ -2038,25 +2038,13 @@ unlock_return:
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that a single argument of kvfree_rcu() call has a slow path that
- * triggers synchronize_rcu() following by freeing a pointer. It is done
- * before the return from the function. Therefore for any single-argument
- * call that will result in a kfree() to a cache that is to be destroyed
- * during module exit, it is developer's responsibility to ensure that all
- * such calls have returned before the call to kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
+static inline void __kvfree_rcu_barrier(void)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	struct kfree_rcu_cpu *krcp;
 	bool queued;
 	int i, cpu;
 
-	flush_all_rcu_sheaves();
-
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
 		}
 	}
 }
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	flush_all_rcu_sheaves();
+	__kvfree_rcu_barrier();
+}
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
 
+/**
+ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
+ *                               specific slab cache.
+ * @s: slab cache to wait for
+ *
+ * See the description of kvfree_rcu_barrier() for details.
+ */
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	if (s->cpu_sheaves)
+		flush_rcu_sheaves_on_cache(s);
+	/*
+	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
+	 * on a specific slab cache.
+	 */
+	__kvfree_rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
 }
 
 #endif /* CONFIG_KVFREE_RCU_BATCHED */
-
diff --git a/mm/slub.c b/mm/slub.c
index 2acce22590f8..f22ba8be29e0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
 
 
 /* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves(void)
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
 {
 	struct slub_flush_work *sfw;
-	struct kmem_cache *s;
 	unsigned int cpu;
 
-	cpus_read_lock();
-	mutex_lock(&slab_mutex);
+	mutex_lock(&flush_lock);
 
-	list_for_each_entry(s, &slab_caches, list) {
-		if (!s->cpu_sheaves)
-			continue;
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
 
-		mutex_lock(&flush_lock);
+		/*
+		 * we don't check if rcu_free sheaf exists - racing
+		 * __kfree_rcu_sheaf() might have just removed it.
+		 * by executing flush_rcu_sheaf() on the cpu we make
+		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
+		 */
 
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
+		INIT_WORK(&sfw->work, flush_rcu_sheaf);
+		sfw->s = s;
+		queue_work_on(cpu, flushwq, &sfw->work);
+	}
 
-			/*
-			 * we don't check if rcu_free sheaf exists - racing
-			 * __kfree_rcu_sheaf() might have just removed it.
-			 * by executing flush_rcu_sheaf() on the cpu we make
-			 * sure the __kfree_rcu_sheaf() finished its call_rcu()
-			 */
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		flush_work(&sfw->work);
+	}
 
-			INIT_WORK(&sfw->work, flush_rcu_sheaf);
-			sfw->s = s;
-			queue_work_on(cpu, flushwq, &sfw->work);
-		}
+	mutex_unlock(&flush_lock);
+}
 
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
-			flush_work(&sfw->work);
-		}
+void flush_all_rcu_sheaves(void)
+{
+	struct kmem_cache *s;
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
 
-		mutex_unlock(&flush_lock);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!s->cpu_sheaves)
+			continue;
+		flush_rcu_sheaves_on_cache(s);
 	}
 
 	mutex_unlock(&slab_mutex);
-- 
cgit v1.2.3


From 18223eececd66365c12275f09042e6fcb2ac5748 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@oss.qualcomm.com>
Date: Fri, 12 Sep 2025 09:32:19 +0100
Subject: of: base: Add of_property_read_u8_index

Add support for of_property_read_u8_index(), similar to the other
u16 and u32 variants. Having this helper makes the code more tidy in
some cases, especially when we are parsing multiple of these into
data structures.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@oss.qualcomm.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: Alexey Klimov <alexey.klimov@linaro.org> # sm8550
Link: https://patch.msgid.link/20250912083225.228778-2-srinivas.kandagatla@oss.qualcomm.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/of/property.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/of.h    |  9 +++++++++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/drivers/of/property.c b/drivers/of/property.c
index c1feb631e383..4e3524227720 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -147,6 +147,39 @@ static void *of_find_property_value_of_size(const struct device_node *np,
 	return prop->value;
 }
 
+/**
+ * of_property_read_u8_index - Find and read a u8 from a multi-value property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @index:	index of the u8 in the list of values
+ * @out_value:	pointer to return value, modified only if no error.
+ *
+ * Search for a property in a device node and read nth 8-bit value from
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_value is modified only if a valid u8 value can be decoded.
+ */
+int of_property_read_u8_index(const struct device_node *np,
+				       const char *propname,
+				       u32 index, u8 *out_value)
+{
+	const u8 *val = of_find_property_value_of_size(np, propname,
+					((index + 1) * sizeof(*out_value)),
+					0, NULL);
+
+	if (IS_ERR(val))
+		return PTR_ERR(val);
+
+	*out_value = val[index];
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_property_read_u8_index);
+
 /**
  * of_property_read_u16_index - Find and read a u16 from a multi-value property.
  *
diff --git a/include/linux/of.h b/include/linux/of.h
index 121a288ca92d..57fb598b72d3 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -316,6 +316,9 @@ extern struct property *of_find_property(const struct device_node *np,
 extern bool of_property_read_bool(const struct device_node *np, const char *propname);
 extern int of_property_count_elems_of_size(const struct device_node *np,
 				const char *propname, int elem_size);
+extern int of_property_read_u8_index(const struct device_node *np,
+				       const char *propname,
+				       u32 index, u8 *out_value);
 extern int of_property_read_u16_index(const struct device_node *np,
 				       const char *propname,
 				       u32 index, u16 *out_value);
@@ -646,6 +649,12 @@ static inline int of_property_count_elems_of_size(const struct device_node *np,
 	return -ENOSYS;
 }
 
+static inline int of_property_read_u8_index(const struct device_node *np,
+			const char *propname, u32 index, u8 *out_value)
+{
+	return -ENOSYS;
+}
+
 static inline int of_property_read_u16_index(const struct device_node *np,
 			const char *propname, u32 index, u16 *out_value)
 {
-- 
cgit v1.2.3


From 167efc6dfd621494c6a7e47115dc829dcc0e502c Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Tue, 14 Oct 2025 11:14:48 +0800
Subject: ASoC: SOF: Intel: export hda_sdw_bpt_get_buf_size_alignment

The DMA buffer needs to be a multiple of the data block size and
the FIFO size. Export a function to return the LCM of the data
block size and the FIFO size.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Link: https://patch.msgid.link/20251014031450.3781789-6-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/sound/hda-sdw-bpt.h       |  7 +++++++
 sound/soc/sof/intel/hda-sdw-bpt.c | 13 +++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda-sdw-bpt.h b/include/sound/hda-sdw-bpt.h
index f649549b75d5..9b654c31829a 100644
--- a/include/sound/hda-sdw-bpt.h
+++ b/include/sound/hda-sdw-bpt.h
@@ -30,6 +30,8 @@ int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream,
 int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream,
 		      struct snd_dma_buffer *dmab_tx_bdl, struct hdac_ext_stream *bpt_rx_stream,
 		      struct snd_dma_buffer *dmab_rx_bdl);
+
+unsigned int hda_sdw_bpt_get_buf_size_alignment(unsigned int dma_bandwidth);
 #else
 static inline int hda_sdw_bpt_open(struct device *dev, int link_id,
 				   struct hdac_ext_stream **bpt_tx_stream,
@@ -64,6 +66,11 @@ static inline int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *
 	WARN_ONCE(1, "SoundWire BPT is disabled");
 	return -EOPNOTSUPP;
 }
+
+static inline unsigned int hda_sdw_bpt_get_buf_size_alignment(unsigned int dma_bandwidth)
+{
+	return 0;
+}
 #endif
 
 #endif /* __HDA_SDW_BPT_H */
diff --git a/sound/soc/sof/intel/hda-sdw-bpt.c b/sound/soc/sof/intel/hda-sdw-bpt.c
index ff5abccf0d88..e45dd051ab8c 100644
--- a/sound/soc/sof/intel/hda-sdw-bpt.c
+++ b/sound/soc/sof/intel/hda-sdw-bpt.c
@@ -10,6 +10,7 @@
  * Hardware interface for SoundWire BPT support with HDA DMA
  */
 
+#include <linux/lcm.h>
 #include <sound/hdaudio_ext.h>
 #include <sound/hda-mlink.h>
 #include <sound/hda-sdw-bpt.h>
@@ -236,6 +237,18 @@ static int hda_sdw_bpt_dma_disable(struct device *dev, struct hdac_ext_stream *s
 	return ret;
 }
 
+#define FIFO_ALIGNMENT	64
+
+unsigned int hda_sdw_bpt_get_buf_size_alignment(unsigned int dma_bandwidth)
+{
+	unsigned int num_channels = DIV_ROUND_UP(dma_bandwidth, BPT_FREQUENCY * 32);
+	unsigned int data_block = num_channels * 4;
+	unsigned int alignment = lcm(data_block, FIFO_ALIGNMENT);
+
+	return alignment;
+}
+EXPORT_SYMBOL_NS(hda_sdw_bpt_get_buf_size_alignment, "SND_SOC_SOF_INTEL_HDA_SDW_BPT");
+
 int hda_sdw_bpt_open(struct device *dev, int link_id, struct hdac_ext_stream **bpt_tx_stream,
 		     struct snd_dma_buffer *dmab_tx_bdl, u32 bpt_tx_num_bytes,
 		     u32 tx_dma_bandwidth, struct hdac_ext_stream **bpt_rx_stream,
-- 
cgit v1.2.3


From 31b931bebd11a0f00967114f62c8c38952f483e5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <johannes.goede@oss.qualcomm.com>
Date: Sun, 7 Dec 2025 19:47:56 +0100
Subject: dma-mapping: Fix DMA_BIT_MASK() macro being broken

After commit a50f7456f853 ("dma-mapping: Allow use of DMA_BIT_MASK(64) in
global scope"), the DMA_BIT_MASK() macro is broken when passed non-trivial
expressions for the value of 'n'. This is caused by the new version missing
parentheses around 'n' when evaluating 'n'.

One example of this breakage is the IPU6 driver now crashing due to
it getting DMA-addresses with address bit 32 set even though it has
tried to set a 32 bit DMA mask.

The IPU6 CSI2 engine has a DMA mask of either 31 or 32 bits depending
on whether it is in secure mode or not, and it sets this mask like this:

        mmu_info->aperture_end =
                (dma_addr_t)DMA_BIT_MASK(isp->secure_mode ?
                                         IPU6_MMU_ADDR_BITS :
                                         IPU6_MMU_ADDR_BITS_NON_SECURE);

So the 'n' argument here is "isp->secure_mode ? IPU6_MMU_ADDR_BITS :
IPU6_MMU_ADDR_BITS_NON_SECURE" which gets expanded into:

isp->secure_mode ? IPU6_MMU_ADDR_BITS : IPU6_MMU_ADDR_BITS_NON_SECURE - 1

The -1 is only applied in the non-secure case, causing
the secure mode mask to be 1 bit too large.

Fixes: a50f7456f853 ("dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope")
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251207184756.97904-1-johannes.goede@oss.qualcomm.com
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2ceda49c609f..aa36a0d1d9df 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -90,7 +90,7 @@
  */
 #define DMA_MAPPING_ERROR		(~(dma_addr_t)0)
 
-#define DMA_BIT_MASK(n)	GENMASK_ULL(n - 1, 0)
+#define DMA_BIT_MASK(n)	GENMASK_ULL((n) - 1, 0)
 
 struct dma_iova_state {
 	dma_addr_t addr;
-- 
cgit v1.2.3


From 9a97857db0c5655b8932f86b5d18bb959079b0ee Mon Sep 17 00:00:00 2001
From: Andres J Rosa <andyrosa@gmail.com>
Date: Wed, 3 Dec 2025 10:25:01 -0600
Subject: ALSA: uapi: Fix typo in asound.h comment

Fix 'level-shit' to 'level-shift' in struct snd_cea_861_aud_if comment.

Fixes: 7ba1c40b536e ("ALSA: Add definitions for CEA-861 Audio InfoFrames")
Signed-off-by: Andres J Rosa <andyrosa@gmail.com>
Link: https://patch.msgid.link/20251203162509.1822-1-andyrosa@gmail.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/asound.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index 5a049eeaecce..d3ce75ba938a 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -60,7 +60,7 @@ struct snd_cea_861_aud_if {
 	unsigned char db2_sf_ss; /* sample frequency and size */
 	unsigned char db3; /* not used, all zeros */
 	unsigned char db4_ca; /* channel allocation code */
-	unsigned char db5_dminh_lsv; /* downmix inhibit & level-shit values */
+	unsigned char db5_dminh_lsv; /* downmix inhibit & level-shift values */
 };
 
 /****************************************************************************
-- 
cgit v1.2.3


From 2fb6915fa22dc5524d704afba58a13305dd9f533 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 18 Jul 2025 11:35:00 -0700
Subject: compiler_types.h: add "auto" as a macro for "__auto_type"

"auto" was defined as a keyword back in the K&R days, but as a storage
type specifier.  No one ever used it, since it was and is the default
storage type for local variables.

C++11 recycled the keyword to allow a type to be declared based on the
type of an initializer.  This was finally adopted into standard C in
C23.

gcc and clang provide the "__auto_type" alias keyword as an extension
for pre-C23, however, there is no reason to pollute the bulk of the
source base with this temporary keyword; instead define "auto" as a
macro unless the compiler is running in C23+ mode.

This macro is added in <linux/compiler_types.h> because that header is
included in some of the tools headers, whereas <linux/compiler.h> is
not, as it has a bunch of very kernel-specific things in it.

[ Cc: stable to reduce potential backporting burden. ]

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Cc: <stable@kernel.org>
---
 include/linux/compiler_types.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 3eac51d68426..41172a28ce76 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -13,6 +13,19 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * C23 introduces "auto" as a standard way to define type-inferred
+ * variables, but "auto" has been a (useless) keyword even since K&R C,
+ * so it has always been "namespace reserved."
+ *
+ * Until at some future time we require C23 support, we need the gcc
+ * extension __auto_type, but there is no reason to put that elsewhere
+ * in the source code.
+ */
+#if __STDC_VERSION__ < 202311L
+# define auto __auto_type
+#endif
+
 /*
  * Skipped when running bindgen due to a libclang issue;
  * see https://github.com/rust-lang/rust-bindgen/issues/2244.
-- 
cgit v1.2.3


From b3b8767c290102a8d95b9d12585cc1e03381ce3f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 19 Jul 2025 23:36:58 -0700
Subject: include/linux: change "__auto_type" to "auto"

Replace instances of "__auto_type" with "auto" in include/linux.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
---
 include/linux/cleanup.h  | 6 +++---
 include/linux/compiler.h | 2 +-
 include/linux/minmax.h   | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 0b55a8f6c59e..8d41b917c77d 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -212,10 +212,10 @@
 
 #define __free(_name)	__cleanup(__free_##_name)
 
-#define __get_and_null(p, nullvalue)   \
+#define __get_and_null(p, nullvalue)	    \
 	({                                  \
-		__auto_type __ptr = &(p);   \
-		__auto_type __val = *__ptr; \
+		auto __ptr = &(p);	    \
+		auto __val = *__ptr;	    \
 		*__ptr = nullvalue;         \
 		__val;                      \
 	})
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ff71bebe56f5..04487c9bd751 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -190,7 +190,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #define data_race(expr)							\
 ({									\
 	__kcsan_disable_current();					\
-	__auto_type __v = (expr);					\
+	auto __v = (expr);						\
 	__kcsan_enable_current();					\
 	__v;								\
 })
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index eaaf5c008e4d..a0158db54a04 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -89,7 +89,7 @@
 	__cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
 
 #define __careful_cmp_once(op, x, y, ux, uy) ({		\
-	__auto_type ux = (x); __auto_type uy = (y);	\
+	auto ux = (x); auto uy = (y);			\
 	BUILD_BUG_ON_MSG(!__types_ok(ux, uy),		\
 		#op"("#x", "#y") signedness error");	\
 	__cmp(op, ux, uy); })
@@ -129,7 +129,7 @@
 	__careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
 #define __careful_op3(op, x, y, z, ux, uy, uz) ({			\
-	__auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\
+	auto ux = (x); auto uy = (y); auto uz = (z);			\
 	BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz),			\
 		#op"3("#x", "#y", "#z") signedness error");		\
 	__cmp(op, ux, __cmp(op, uy, uz)); })
@@ -203,7 +203,7 @@
  * This macro checks @val/@lo/@hi to make sure they have compatible
  * signedness.
  */
-#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi)
+#define clamp(val, lo, hi) __careful_clamp(auto, val, lo, hi)
 
 /**
  * clamp_t - return a value clamped to a given range using a given type
-- 
cgit v1.2.3


From 1cba2eba9b73d8dfee6b3e7465f510cace71637c Mon Sep 17 00:00:00 2001
From: Jinhui Guo <guojinhui.liam@bytedance.com>
Date: Thu, 27 Nov 2025 17:25:12 +0800
Subject: mm/sparse: fix sparse_vmemmap_init_nid_early definition without
 CONFIG_SPARSEMEM

When CONFIG_SPARSEMEM is disabled, the macro
sparse_vmemmap_init_nid_early(_nid, _use) passes two arguments, while the
actual function accepts only nid.  Drop the extra argument _use.

Link: https://lkml.kernel.org/r/20251127092512.278-1-guojinhui.liam@bytedance.com
Fixes: d65917c42373 ("mm/sparse: allow for alternate vmemmap section init at boot")
Signed-off-by: Jinhui Guo <guojinhui.liam@bytedance.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4398e027f450..75ef7c9f9307 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2289,7 +2289,7 @@ void sparse_init(void);
 #else
 #define sparse_init()	do {} while (0)
 #define sparse_index_init(_sec, _nid)  do {} while (0)
-#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0)
+#define sparse_vmemmap_init_nid_early(_nid) do {} while (0)
 #define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
 #define pfn_in_present_section pfn_valid
 #define subsection_map_init(_pfn, _nr_pages) do {} while (0)
-- 
cgit v1.2.3


From bdd0d69a32c2aa6437d23e35acc705758b835a75 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 26 Nov 2025 16:06:15 -0500
Subject: mm/huge_memory: change folio_split_supported() to
 folio_check_splittable()

Patch series "Improve folio split related functions", v4.

This patchset improves several folio split related functions to avoid
future misuse.  The changes are:

1. Consolidated folio splittable checks by moving truncated folio check,
   huge zero folio check, and writeback folio check into
   folio_split_supported(). Changed the function return type. Renamed it
   to folio_check_splittable() for clarification.

2. Replaced can_split_folio() with open coded folio_expected_ref_count()
   and folio_ref_count() and introduced folio_cache_ref_count().

3. Changed min_order_for_split() to always return an order.

4. Fixed folio split stats counting.

Motivation
==========
This is based on Wei's observation[1] and solves several potential
issues:
1. Dereferencing NULL folio->mapping in try_folio_split_to_order() if it
   is called on truncated folios.
2. Not handling the negative return value of min_order_for_split() in
   mm/memory-failure.c

There is no bug in the current code.


This patch (of 4):

folio_split_supported() used in try_folio_split_to_order() requires
folio->mapping to be non NULL, but current try_folio_split_to_order() does
not check it.  There is no issue in the current code, since
try_folio_split_to_order() is only used in truncate_inode_partial_folio(),
where folio->mapping is not NULL.

To prevent future misuse, move folio->mapping NULL check (i.e., folio is
truncated) into folio_split_supported().  Since folio->mapping NULL check
returns -EBUSY and folio_split_supported() == false means -EINVAL, change
folio_split_supported() return type from bool to int and return error
numbers accordingly.  Rename folio_split_supported() to
folio_check_splittable() to match the return type change.

While at it, move is_huge_zero_folio() check and folio_test_writeback()
check into folio_check_splittable() and add kernel-doc.

Remove all warnings inside folio_check_splittable() and give warnings
in __folio_split() instead, so that bool warns parameter can be removed.

Link: https://lkml.kernel.org/r/20251126210618.1971206-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20251126210618.1971206-2-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: Balbir Singh <balbirs@nvidia.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  6 ++--
 mm/huge_memory.c        | 76 ++++++++++++++++++++++++++++---------------------
 2 files changed, 46 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1d439de1ca2c..66105a90b4c3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -375,8 +375,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
 int folio_split_unmapped(struct folio *folio, unsigned int new_order);
 int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
-bool folio_split_supported(struct folio *folio, unsigned int new_order,
-		enum split_type split_type, bool warns);
+int folio_check_splittable(struct folio *folio, unsigned int new_order,
+			   enum split_type split_type);
 int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
 		struct list_head *list);
 
@@ -407,7 +407,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o
 static inline int try_folio_split_to_order(struct folio *folio,
 		struct page *page, unsigned int new_order)
 {
-	if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false))
+	if (folio_check_splittable(folio, new_order, SPLIT_TYPE_NON_UNIFORM))
 		return split_huge_page_to_order(&folio->page, new_order);
 	return folio_split(folio, new_order, page, NULL);
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 041b554c7115..8c2516ac9ce7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3688,15 +3688,40 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
 	return 0;
 }
 
-bool folio_split_supported(struct folio *folio, unsigned int new_order,
-		enum split_type split_type, bool warns)
+/**
+ * folio_check_splittable() - check if a folio can be split to a given order
+ * @folio: folio to be split
+ * @new_order: the smallest order of the after split folios (since buddy
+ *             allocator like split generates folios with orders from @folio's
+ *             order - 1 to new_order).
+ * @split_type: uniform or non-uniform split
+ *
+ * folio_check_splittable() checks if @folio can be split to @new_order using
+ * @split_type method. The truncated folio check must come first.
+ *
+ * Context: folio must be locked.
+ *
+ * Return: 0 - @folio can be split to @new_order, otherwise an error number is
+ * returned.
+ */
+int folio_check_splittable(struct folio *folio, unsigned int new_order,
+			   enum split_type split_type)
 {
+	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+	/*
+	 * Folios that just got truncated cannot get split. Signal to the
+	 * caller that there was a race.
+	 *
+	 * TODO: this will also currently refuse folios without a mapping in the
+	 * swapcache (shmem or to-be-anon folios).
+	 */
+	if (!folio->mapping && !folio_test_anon(folio))
+		return -EBUSY;
+
 	if (folio_test_anon(folio)) {
 		/* order-1 is not supported for anonymous THP. */
-		VM_WARN_ONCE(warns && new_order == 1,
-				"Cannot split to order-1 folio");
 		if (new_order == 1)
-			return false;
+			return -EINVAL;
 	} else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) {
 		if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
 		    !mapping_large_folio_support(folio->mapping)) {
@@ -3717,9 +3742,7 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
 			 * case, the mapping does not actually support large
 			 * folios properly.
 			 */
-			VM_WARN_ONCE(warns,
-				"Cannot split file folio to non-0 order");
-			return false;
+			return -EINVAL;
 		}
 	}
 
@@ -3732,12 +3755,16 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
 	 * here.
 	 */
 	if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) {
-		VM_WARN_ONCE(warns,
-			"Cannot split swapcache folio to non-0 order");
-		return false;
+		return -EINVAL;
 	}
 
-	return true;
+	if (is_huge_zero_folio(folio))
+		return -EINVAL;
+
+	if (folio_test_writeback(folio))
+		return -EBUSY;
+
+	return 0;
 }
 
 static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
@@ -3922,7 +3949,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	int remap_flags = 0;
 	int extra_pins, ret;
 	pgoff_t end = 0;
-	bool is_hzp;
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
@@ -3930,31 +3956,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	if (folio != page_folio(split_at) || folio != page_folio(lock_at))
 		return -EINVAL;
 
-	/*
-	 * Folios that just got truncated cannot get split. Signal to the
-	 * caller that there was a race.
-	 *
-	 * TODO: this will also currently refuse shmem folios that are in the
-	 * swapcache.
-	 */
-	if (!is_anon && !folio->mapping)
-		return -EBUSY;
-
 	if (new_order >= old_order)
 		return -EINVAL;
 
-	if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true))
-		return -EINVAL;
-
-	is_hzp = is_huge_zero_folio(folio);
-	if (is_hzp) {
-		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
-		return -EBUSY;
+	ret = folio_check_splittable(folio, new_order, split_type);
+	if (ret) {
+		VM_WARN_ONCE(ret == -EINVAL, "Tried to split an unsplittable folio");
+		return ret;
 	}
 
-	if (folio_test_writeback(folio))
-		return -EBUSY;
-
 	if (is_anon) {
 		/*
 		 * The caller does not necessarily hold an mmap_lock that would
-- 
cgit v1.2.3


From 5842bcbfc316738cbfcbdb4def5a7592aa03ebf2 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 26 Nov 2025 16:06:16 -0500
Subject: mm/huge_memory: replace can_split_folio() with direct refcount
 calculation

can_split_folio() is just a refcount comparison, making sure only the
split caller holds an extra pin.  Open code it with
folio_expected_ref_count() != folio_ref_count() - 1.  For the extra_pins
used by folio_ref_freeze(), add folio_cache_ref_count() to calculate it.
Also replace folio_expected_ref_count() with folio_cache_ref_count() used
by folio_ref_unfreeze(), since they are returning the same values when a
folio is frozen and folio_cache_ref_count() does not have unnecessary
folio_mapcount() in its implementation.

Link: https://lkml.kernel.org/r/20251126210618.1971206-3-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  1 -
 mm/huge_memory.c        | 52 +++++++++++++++++++------------------------------
 mm/vmscan.c             |  3 ++-
 3 files changed, 22 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 66105a90b4c3..8a52e20387b0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -369,7 +369,6 @@ enum split_type {
 	SPLIT_TYPE_NON_UNIFORM,
 };
 
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
 int folio_split_unmapped(struct folio *folio, unsigned int new_order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8c2516ac9ce7..5ce00d53b19e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
 	}
 }
 
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
-{
-	int extra_pins;
-
-	/* Additional pins from page cache */
-	if (folio_test_anon(folio))
-		extra_pins = folio_test_swapcache(folio) ?
-				folio_nr_pages(folio) : 0;
-	else
-		extra_pins = folio_nr_pages(folio);
-	if (pextra_pins)
-		*pextra_pins = extra_pins;
-	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
-					caller_pins;
-}
-
 static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
 {
 	for (; nr_pages; page++, nr_pages--)
@@ -3767,11 +3750,19 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
 	return 0;
 }
 
+/* Number of folio references from the pagecache or the swapcache. */
+static unsigned int folio_cache_ref_count(const struct folio *folio)
+{
+	if (folio_test_anon(folio) && !folio_test_swapcache(folio))
+		return 0;
+	return folio_nr_pages(folio);
+}
+
 static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
 					     struct page *split_at, struct xa_state *xas,
 					     struct address_space *mapping, bool do_lru,
 					     struct list_head *list, enum split_type split_type,
-					     pgoff_t end, int *nr_shmem_dropped, int extra_pins)
+					     pgoff_t end, int *nr_shmem_dropped)
 {
 	struct folio *end_folio = folio_next(folio);
 	struct folio *new_folio, *next;
@@ -3782,10 +3773,9 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 	VM_WARN_ON_ONCE(!mapping && end);
 	/* Prevent deferred_split_scan() touching ->_refcount */
 	ds_queue = folio_split_queue_lock(folio);
-	if (folio_ref_freeze(folio, 1 + extra_pins)) {
+	if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
 		struct swap_cluster_info *ci = NULL;
 		struct lruvec *lruvec;
-		int expected_refs;
 
 		if (old_order > 1) {
 			if (!list_empty(&folio->_deferred_list)) {
@@ -3853,8 +3843,8 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 
 			zone_device_private_split_cb(folio, new_folio);
 
-			expected_refs = folio_expected_ref_count(new_folio) + 1;
-			folio_ref_unfreeze(new_folio, expected_refs);
+			folio_ref_unfreeze(new_folio,
+					   folio_cache_ref_count(new_folio) + 1);
 
 			if (do_lru)
 				lru_add_split_folio(folio, new_folio, lruvec, list);
@@ -3897,8 +3887,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
 		 * Otherwise, a parallel folio_try_get() can grab @folio
 		 * and its caller can see stale page cache entries.
 		 */
-		expected_refs = folio_expected_ref_count(folio) + 1;
-		folio_ref_unfreeze(folio, expected_refs);
+		folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1);
 
 		if (do_lru)
 			unlock_page_lruvec(lruvec);
@@ -3947,7 +3936,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	struct folio *new_folio, *next;
 	int nr_shmem_dropped = 0;
 	int remap_flags = 0;
-	int extra_pins, ret;
+	int ret;
 	pgoff_t end = 0;
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -4028,7 +4017,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	 * Racy check if we can split the page, before unmap_folio() will
 	 * split PMDs
 	 */
-	if (!can_split_folio(folio, 1, &extra_pins)) {
+	if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
 		ret = -EAGAIN;
 		goto out_unlock;
 	}
@@ -4051,8 +4040,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	}
 
 	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
-						true, list, split_type, end, &nr_shmem_dropped,
-						extra_pins);
+						true, list, split_type, end, &nr_shmem_dropped);
 fail:
 	if (mapping)
 		xas_unlock(&xas);
@@ -4126,20 +4114,20 @@ out:
  */
 int folio_split_unmapped(struct folio *folio, unsigned int new_order)
 {
-	int extra_pins, ret = 0;
+	int ret = 0;
 
 	VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
 
-	if (!can_split_folio(folio, 1, &extra_pins))
+	if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
 		return -EAGAIN;
 
 	local_irq_disable();
 	ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
 						NULL, false, NULL, SPLIT_TYPE_UNIFORM,
-						0, NULL, extra_pins);
+						0, NULL);
 	local_irq_enable();
 	return ret;
 }
@@ -4632,7 +4620,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 		 * can be split or not. So skip the check here.
 		 */
 		if (!folio_test_private(folio) &&
-		    !can_split_folio(folio, 0, NULL))
+		    folio_expected_ref_count(folio) != folio_ref_count(folio))
 			goto next;
 
 		if (!folio_trylock(folio))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 92980b072121..3b85652a42b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1284,7 +1284,8 @@ retry:
 					goto keep_locked;
 				if (folio_test_large(folio)) {
 					/* cannot split folio, skip it */
-					if (!can_split_folio(folio, 1, NULL))
+					if (folio_expected_ref_count(folio) !=
+					    folio_ref_count(folio) - 1)
 						goto activate_locked;
 					/*
 					 * Split partially mapped folios right away.
-- 
cgit v1.2.3


From 2f78910659c72807b7ff03a2c0d121901bf55848 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 26 Nov 2025 16:06:17 -0500
Subject: mm/huge_memory: make min_order_for_split() always return an order

min_order_for_split() returns -EBUSY when the folio is truncated and
cannot be split.  In commit 77008e1b2ef7 ("mm/huge_memory: do not change
split_huge_page*() target order silently"), memory_failure() does not
handle it and passes -EBUSY to try_to_split_thp_page() directly.
try_to_split_thp_page() returns -EINVAL since -EBUSY becomes 0xfffffff0 as
new_order is unsigned int in __folio_split() and this large new_order is
rejected as an invalid input.  The code does not cause a bug.
soft_offline_in_use_page() also uses min_order_for_split() but it always
passes 0 as new_order for split.

Fix it by making min_order_for_split() always return an order.  When the
given folio is truncated, namely folio->mapping == NULL, return 0 and let
a subsequent split function handle the situation and return -EBUSY.

Add kernel-doc to min_order_for_split() to clarify its use.

Link: https://lkml.kernel.org/r/20251126210618.1971206-4-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  6 +++---
 mm/huge_memory.c        | 25 +++++++++++++++++++------
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8a52e20387b0..21162493a0a0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -372,7 +372,7 @@ enum split_type {
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
 int folio_split_unmapped(struct folio *folio, unsigned int new_order);
-int min_order_for_split(struct folio *folio);
+unsigned int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
 int folio_check_splittable(struct folio *folio, unsigned int new_order,
 			   enum split_type split_type);
@@ -630,10 +630,10 @@ static inline int split_huge_page(struct page *page)
 	return -EINVAL;
 }
 
-static inline int min_order_for_split(struct folio *folio)
+static inline unsigned int min_order_for_split(struct folio *folio)
 {
 	VM_WARN_ON_ONCE_FOLIO(1, folio);
-	return -EINVAL;
+	return 0;
 }
 
 static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5ce00d53b19e..1a3273491cc5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4219,16 +4219,29 @@ int folio_split(struct folio *folio, unsigned int new_order,
 			     SPLIT_TYPE_NON_UNIFORM);
 }
 
-int min_order_for_split(struct folio *folio)
+/**
+ * min_order_for_split() - get the minimum order @folio can be split to
+ * @folio: folio to split
+ *
+ * min_order_for_split() tells the minimum order @folio can be split to.
+ * If a file-backed folio is truncated, 0 will be returned. Any subsequent
+ * split attempt should get -EBUSY from split checking code.
+ *
+ * Return: @folio's minimum order for split
+ */
+unsigned int min_order_for_split(struct folio *folio)
 {
 	if (folio_test_anon(folio))
 		return 0;
 
-	if (!folio->mapping) {
-		if (folio_test_pmd_mappable(folio))
-			count_vm_event(THP_SPLIT_PAGE_FAILED);
-		return -EBUSY;
-	}
+	/*
+	 * If the folio got truncated, we don't know the previous mapping and
+	 * consequently the old min order. But it doesn't matter, as any split
+	 * attempt will immediately fail with -EBUSY as the folio cannot get
+	 * split until freed.
+	 */
+	if (!folio->mapping)
+		return 0;
 
 	return mapping_min_folio_order(folio->mapping);
 }
-- 
cgit v1.2.3


From 40a4af52e0472dfc114aa78d6f3debec70b42048 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@redhat.com>
Date: Mon, 1 Dec 2025 13:29:22 +0100
Subject: mm: fix CONFIG_STACK_GROWSUP typo in mm.h

Commit 2b6a3f061f11 ("mm: declare VMA flags by bit") significantly
refactors the header file include/linux/mm.h.  In that step, it introduces
a typo in an ifdef, referring to a non-existing config option
STACK_GROWS_UP, whereas the actual config option is called STACK_GROWSUP.

Fix this typo in the mm header file.

Link: https://lkml.kernel.org/r/20251201122922.352480-1-lukas.bulwahn@redhat.com
Fixes: 2b6a3f061f11 ("mm: declare VMA flags by bit")
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@redhat.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2887d3b34d3e..03f7f92d08c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -438,7 +438,7 @@ enum {
 #define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
 #define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
 #define VM_STACK	INIT_VM_FLAG(STACK)
-#ifdef CONFIG_STACK_GROWS_UP
+#ifdef CONFIG_STACK_GROWSUP
 #define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
 #else
 #define VM_STACK_EARLY	VM_NONE
-- 
cgit v1.2.3


From 12eef14bcbac77bd08dc5693ad5818e69993246f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Dec 2025 09:18:57 +0100
Subject: lockref: add a __cond_lock annotation for lockref_put_or_lock

Add a cond_lock annotation for lockref_put_or_lock to make sparse
happy with using it.  Note that for this the return value has to be
double-inverted as the return value convention of lockref_put_or_lock
is inverted compared to _trylock conventions expected by __cond_lock,
as lockref_put_or_lock returns true when it did not need to take the
lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lockref.h | 2 ++
 lib/lockref.c           | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index 676721ee878d..815d871fadfc 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -50,6 +50,8 @@ void lockref_get(struct lockref *lockref);
 int lockref_put_return(struct lockref *lockref);
 bool lockref_get_not_zero(struct lockref *lockref);
 bool lockref_put_or_lock(struct lockref *lockref);
+#define lockref_put_or_lock(_lockref) \
+	(!__cond_lock((_lockref)->lock, !lockref_put_or_lock(_lockref)))
 
 void lockref_mark_dead(struct lockref *lockref);
 bool lockref_get_not_dead(struct lockref *lockref);
diff --git a/lib/lockref.c b/lib/lockref.c
index 5d8e3ef3860e..9210fc6ae714 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -105,6 +105,7 @@ EXPORT_SYMBOL(lockref_put_return);
  * @lockref: pointer to lockref structure
  * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
  */
+#undef lockref_put_or_lock
 bool lockref_put_or_lock(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-- 
cgit v1.2.3


From 55026a9670ce8b7b3d74f7d570de1382cbfb395d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 2 Dec 2025 21:23:27 +0100
Subject: irqdomain: Delete irq_domain_add_tree()

No in-tree users anymore.

[ tglx: Remove the reference in the Chinese documentation as well ]

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251202202327.1444693-1-andriy.shevchenko@linux.intel.com
---
 .../translations/zh_CN/core-api/irq/irq-domain.rst       |  4 ----
 include/linux/irqdomain.h                                | 16 ----------------
 2 files changed, 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/translations/zh_CN/core-api/irq/irq-domain.rst b/Documentation/translations/zh_CN/core-api/irq/irq-domain.rst
index 4a2d3b27aa4d..aaefeda0e164 100644
--- a/Documentation/translations/zh_CN/core-api/irq/irq-domain.rst
+++ b/Documentation/translations/zh_CN/core-api/irq/irq-domain.rst
@@ -109,10 +109,6 @@ irq_domain维护着从hwirq号到Linux IRQ的radix的树状映射。 当一个hw
 如果hwirq号可以非常大，树状映射是一个很好的选择，因为它不需要分配一个和最大hwirq
 号一样大的表。 缺点是，hwirq到IRQ号的查找取决于表中有多少条目。
 
-irq_domain_add_tree()和irq_domain_create_tree()在功能上是等价的，除了第一
-个参数不同——前者接受一个Open Firmware特定的 'struct device_node' ，而后者接受
-一个更通用的抽象 'struct fwnode_handle' 。
-
 很少有驱动应该需要这个映射。
 
 无映射
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 952d3c8dd6b7..62f81bbeb490 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -730,22 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig
 }
 #endif
 
-static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
-						     const struct irq_domain_ops *ops,
-						     void *host_data)
-{
-	struct irq_domain_info info = {
-		.fwnode		= of_fwnode_handle(of_node),
-		.hwirq_max	= ~0U,
-		.ops		= ops,
-		.host_data	= host_data,
-	};
-	struct irq_domain *d;
-
-	d = irq_domain_instantiate(&info);
-	return IS_ERR(d) ? NULL : d;
-}
-
 static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
 						       unsigned int size,
 						       const struct irq_domain_ops *ops,
-- 
cgit v1.2.3


From d927a595ab2f6de4e10b3e3962bc70ab61d8f907 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 25 Sep 2025 12:45:12 +0200
Subject: ceph: add trace points to the MDS client

This patch adds trace points to the Ceph filesystem MDS client:

- request submission (CEPH_MSG_CLIENT_REQUEST) and completion
  (CEPH_MSG_CLIENT_REPLY)
- capabilities (CEPH_MSG_CLIENT_CAPS)

These are the central pieces that are useful for analyzing MDS
latency/performance problems from the client's perspective.

In the long run, all doutc() calls should be replaced with
tracepoints.  This way, the Ceph filesystem can be traced at any time
(without spamming the kernel log).  Additionally, trace points can be
used in BPF programs (which can even dereference the pointer parameters
and extract more values).

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c              |   4 +
 fs/ceph/mds_client.c        |  20 +++-
 fs/ceph/super.c             |   3 +
 include/trace/events/ceph.h | 234 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 259 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/ceph.h

(limited to 'include')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b1a8ff612c41..2f663972da99 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -18,6 +18,7 @@
 #include "crypto.h"
 #include <linux/ceph/decode.h>
 #include <linux/ceph/messenger.h>
+#include <trace/events/ceph.h>
 
 /*
  * Capability management
@@ -4452,6 +4453,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	      session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode,
 	      seq, issue_seq, mseq);
 
+	trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode),
+			       seq, issue_seq, mseq);
+
 	mutex_lock(&session->s_mutex);
 
 	if (!inode) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1740047aef0f..7e4eab824dae 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -24,6 +24,7 @@
 #include <linux/ceph/pagelist.h>
 #include <linux/ceph/auth.h>
 #include <linux/ceph/debugfs.h>
+#include <trace/events/ceph.h>
 
 #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
 
@@ -3288,6 +3289,8 @@ static void complete_request(struct ceph_mds_client *mdsc,
 {
 	req->r_end_latency = ktime_get();
 
+	trace_ceph_mdsc_complete_request(mdsc, req);
+
 	if (req->r_callback)
 		req->r_callback(mdsc, req);
 	complete_all(&req->r_completion);
@@ -3419,6 +3422,8 @@ static int __send_request(struct ceph_mds_session *session,
 {
 	int err;
 
+	trace_ceph_mdsc_send_request(session, req);
+
 	err = __prepare_send_request(session, req, drop_cap_releases);
 	if (!err) {
 		ceph_msg_get(req->r_request);
@@ -3470,6 +3475,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
 		}
 		if (mdsc->mdsmap->m_epoch == 0) {
 			doutc(cl, "no mdsmap, waiting for map\n");
+			trace_ceph_mdsc_suspend_request(mdsc, session, req,
+							ceph_mdsc_suspend_reason_no_mdsmap);
 			list_add(&req->r_wait, &mdsc->waiting_for_map);
 			return;
 		}
@@ -3491,6 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
 			goto finish;
 		}
 		doutc(cl, "no mds or not active, waiting for map\n");
+		trace_ceph_mdsc_suspend_request(mdsc, session, req,
+						ceph_mdsc_suspend_reason_no_active_mds);
 		list_add(&req->r_wait, &mdsc->waiting_for_map);
 		return;
 	}
@@ -3536,9 +3545,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
 		 * it to the mdsc queue.
 		 */
 		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
-			if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
+			if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) {
+				trace_ceph_mdsc_suspend_request(mdsc, session, req,
+								ceph_mdsc_suspend_reason_rejected);
 				list_add(&req->r_wait, &mdsc->waiting_for_map);
-			else
+			} else
 				err = -EACCES;
 			goto out_session;
 		}
@@ -3552,6 +3563,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
 			if (random)
 				req->r_resend_mds = mds;
 		}
+		trace_ceph_mdsc_suspend_request(mdsc, session, req,
+						ceph_mdsc_suspend_reason_session);
 		list_add(&req->r_wait, &session->s_waiting);
 		goto out_session;
 	}
@@ -3652,6 +3665,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 		list_del_init(&req->r_wait);
 		doutc(cl, " wake request %p tid %llu\n", req,
 		      req->r_tid);
+		trace_ceph_mdsc_resume_request(mdsc, req);
 		__do_request(mdsc, req);
 	}
 }
@@ -3678,6 +3692,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 		    req->r_session->s_mds == mds) {
 			doutc(cl, " kicking tid %llu\n", req->r_tid);
 			list_del_init(&req->r_wait);
+			trace_ceph_mdsc_resume_request(mdsc, req);
 			__do_request(mdsc, req);
 		}
 	}
@@ -3724,6 +3739,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
 	doutc(cl, "submit_request on %p for inode %p\n", req, dir);
 	mutex_lock(&mdsc->mutex);
 	__register_request(mdsc, req, dir);
+	trace_ceph_mdsc_submit_request(mdsc, req);
 	__do_request(mdsc, req);
 	err = req->r_err;
 	mutex_unlock(&mdsc->mutex);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f6bf24b5c683..7c1c1dac320d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -30,6 +30,9 @@
 
 #include <uapi/linux/magic.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ceph.h>
+
 static DEFINE_SPINLOCK(ceph_fsc_lock);
 static LIST_HEAD(ceph_fsc_list);
 
diff --git a/include/trace/events/ceph.h b/include/trace/events/ceph.h
new file mode 100644
index 000000000000..08cb0659fbfc
--- /dev/null
+++ b/include/trace/events/ceph.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Ceph filesystem support module tracepoints
+ *
+ * Copyright (C) 2025 IONOS SE. All Rights Reserved.
+ * Written by Max Kellermann (max.kellermann@ionos.com)
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ceph
+
+#if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CEPH_H
+
+#include <linux/tracepoint.h>
+
+#define ceph_mdsc_suspend_reasons						\
+	EM(ceph_mdsc_suspend_reason_no_mdsmap,		"no-mdsmap")		\
+	EM(ceph_mdsc_suspend_reason_no_active_mds,	"no-active-mds")	\
+	EM(ceph_mdsc_suspend_reason_rejected,		"rejected")		\
+	E_(ceph_mdsc_suspend_reason_session,		"session")
+
+#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
+#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
+
+#undef EM
+#undef E_
+#define EM(a, b) a,
+#define E_(a, b) a
+
+enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte);
+
+#endif
+
+/*
+ * Export enum symbols via userspace.
+ */
+#undef EM
+#undef E_
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define E_(a, b) TRACE_DEFINE_ENUM(a);
+
+ceph_mdsc_suspend_reasons;
+
+/*
+ * Now redefine the EM() and E_() macros to map the enums to the strings that
+ * will be printed in the output.
+ */
+#undef EM
+#undef E_
+#define EM(a, b)	{ a, b },
+#define E_(a, b)	{ a, b }
+
+TRACE_EVENT(ceph_mdsc_submit_request,
+	TP_PROTO(struct ceph_mds_client *mdsc,
+		 struct ceph_mds_request *req),
+
+	TP_ARGS(mdsc, req),
+
+	TP_STRUCT__entry(
+		__field(u64,	tid)
+		__field(int,	op)
+		__field(u64,	ino)
+		__field(u64,	snap)
+	),
+
+	TP_fast_assign(
+		struct inode *inode;
+
+		__entry->tid = req->r_tid;
+		__entry->op = req->r_op;
+
+		inode = req->r_inode;
+		if (inode == NULL && req->r_dentry)
+			inode = d_inode(req->r_dentry);
+
+		if (inode) {
+			__entry->ino = ceph_ino(inode);
+			__entry->snap = ceph_snap(inode);
+		} else {
+			__entry->ino = __entry->snap = 0;
+		}
+	),
+
+	TP_printk("R=%llu op=%s ino=%llx,%llx",
+		  __entry->tid,
+		  ceph_mds_op_name(__entry->op),
+		  __entry->ino, __entry->snap)
+);
+
+TRACE_EVENT(ceph_mdsc_suspend_request,
+	TP_PROTO(struct ceph_mds_client *mdsc,
+		     struct ceph_mds_session *session,
+		     struct ceph_mds_request *req,
+		     enum ceph_mdsc_suspend_reason reason),
+
+	TP_ARGS(mdsc, session, req, reason),
+
+	TP_STRUCT__entry(
+		__field(u64,				tid)
+		__field(int,				op)
+		__field(int,				mds)
+		__field(enum ceph_mdsc_suspend_reason,	reason)
+	),
+
+	TP_fast_assign(
+		__entry->tid = req->r_tid;
+		__entry->op = req->r_op;
+		__entry->mds = session ? session->s_mds : -1;
+		__entry->reason = reason;
+	),
+
+	TP_printk("R=%llu op=%s reason=%s",
+		  __entry->tid,
+		  ceph_mds_op_name(__entry->op),
+		  __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons))
+);
+
+TRACE_EVENT(ceph_mdsc_resume_request,
+	TP_PROTO(struct ceph_mds_client *mdsc,
+		 struct ceph_mds_request *req),
+
+	TP_ARGS(mdsc, req),
+
+	TP_STRUCT__entry(
+		__field(u64,				tid)
+		__field(int,				op)
+	),
+
+	TP_fast_assign(
+		__entry->tid = req->r_tid;
+		__entry->op = req->r_op;
+	),
+
+	TP_printk("R=%llu op=%s",
+		  __entry->tid,
+		  ceph_mds_op_name(__entry->op))
+);
+
+TRACE_EVENT(ceph_mdsc_send_request,
+	TP_PROTO(struct ceph_mds_session *session,
+		 struct ceph_mds_request *req),
+
+	TP_ARGS(session, req),
+
+	TP_STRUCT__entry(
+		__field(u64,		tid)
+		__field(int,		op)
+		__field(int,		mds)
+	),
+
+	TP_fast_assign(
+		__entry->tid = req->r_tid;
+		__entry->op = req->r_op;
+		__entry->mds = session->s_mds;
+	),
+
+	TP_printk("R=%llu op=%s mds=%d",
+		  __entry->tid,
+		  ceph_mds_op_name(__entry->op),
+		  __entry->mds)
+);
+
+TRACE_EVENT(ceph_mdsc_complete_request,
+	TP_PROTO(struct ceph_mds_client *mdsc,
+		     struct ceph_mds_request *req),
+
+	TP_ARGS(mdsc, req),
+
+	TP_STRUCT__entry(
+		__field(u64,			tid)
+		__field(int,			op)
+		__field(int,			err)
+		__field(unsigned long,		latency_ns)
+	),
+
+	TP_fast_assign(
+		__entry->tid = req->r_tid;
+		__entry->op = req->r_op;
+		__entry->err = req->r_err;
+		__entry->latency_ns = req->r_end_latency - req->r_start_latency;
+	),
+
+	TP_printk("R=%llu op=%s err=%d latency_ns=%lu",
+		  __entry->tid,
+		  ceph_mds_op_name(__entry->op),
+		  __entry->err,
+		  __entry->latency_ns)
+);
+
+TRACE_EVENT(ceph_handle_caps,
+	TP_PROTO(struct ceph_mds_client *mdsc,
+		 struct ceph_mds_session *session,
+		 int op,
+		 const struct ceph_vino *vino,
+		 struct ceph_inode_info *inode,
+		 u32 seq, u32 mseq, u32 issue_seq),
+
+	TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq),
+
+	TP_STRUCT__entry(
+		__field(int,	mds)
+		__field(int,	op)
+		__field(u64,	ino)
+		__field(u64,	snap)
+		__field(u32,	seq)
+		__field(u32,	mseq)
+		__field(u32,	issue_seq)
+	),
+
+	TP_fast_assign(
+		__entry->mds = session->s_mds;
+		__entry->op = op;
+		__entry->ino = vino->ino;
+		__entry->snap = vino->snap;
+		__entry->seq = seq;
+		__entry->mseq = mseq;
+		__entry->issue_seq = issue_seq;
+	),
+
+	TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u",
+		  __entry->mds,
+		  ceph_cap_op_name(__entry->op),
+		  __entry->ino,
+		  __entry->snap,
+		  __entry->seq,
+		  __entry->issue_seq,
+		  __entry->mseq)
+);
+
+#undef EM
+#undef E_
+#endif /* _TRACE_CEPH_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From e6b4d264c8c883d8451c7b5f20cd96ddf94af3ef Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 1 Dec 2025 21:10:18 +0100
Subject: args: fix documentation to reflect the correct numbers

The macro uses up to 15 arguments.  Reflect this in the top level comment.

Link: https://lkml.kernel.org/r/20251201201018.765475-1-andriy.shevchenko@linux.intel.com
Fixes: d51e783c17ba ("lsm: count the LSMs enabled at compile time")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/args.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/args.h b/include/linux/args.h
index 2e8e65d975c7..0562dc51435e 100644
--- a/include/linux/args.h
+++ b/include/linux/args.h
@@ -6,9 +6,9 @@
 /*
  * How do these macros work?
  *
- * In __COUNT_ARGS() _0 to _12 are just placeholders from the start
+ * In __COUNT_ARGS() _0 to _15 are just placeholders from the start
  * in order to make sure _n is positioned over the correct number
- * from 12 to 0 (depending on X, which is a variadic argument list).
+ * from 15 to 0 (depending on X, which is a variadic argument list).
  * They serve no purpose other than occupying a position. Since each
  * macro parameter must have a distinct identifier, those identifiers
  * are as good as any.
-- 
cgit v1.2.3


From bdae29d6512ddc589200b9ae6bda467bdbab863d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 5 Dec 2025 10:07:53 +0000
Subject: rseq: Always inline rseq_debug_syscall_return()

To get the full benefit of:

  eaa9088d568c ("rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y")

clang needs an __always_inline instead of a plain inline qualifier:

	$ for i in {1..10}; do taskset -c 4 perf5 bench syscall basic -l 100000000 | grep "ops/sec"; done

		 Before	     After
	ops/sec  15424491    15872221   +2.9%

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251205100753.4073221-1-edumazet@google.com
---
 include/linux/rseq_entry.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index c92167ff8a7f..a36b472627de 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -596,7 +596,7 @@ static __always_inline void rseq_exit_to_user_mode_legacy(void)
 
 void __rseq_debug_syscall_return(struct pt_regs *regs);
 
-static inline void rseq_debug_syscall_return(struct pt_regs *regs)
+static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
 {
 	if (static_branch_unlikely(&rseq_debug_enabled))
 		__rseq_debug_syscall_return(regs);
-- 
cgit v1.2.3


From 41b80d43d9a00a302b5559baa7ebafc28dd54793 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Wed, 3 Dec 2025 15:45:51 -0500
Subject: i3c: master: cleanup callback .priv_xfers()

Remove the .priv_xfers() callback from the framework after all master
controller drivers have switched to use the new .i3c_xfers() callback.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Tested-by: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
Link: https://patch.msgid.link/20251203-i3c_xfer_cleanup_master-v2-2-7dd94d04ee2d@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c       | 14 ++------------
 include/linux/i3c/master.h | 12 ++----------
 2 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index f88f7e19203a..ea45a519dd68 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -2819,14 +2819,10 @@ EXPORT_SYMBOL_GPL(i3c_generic_ibi_recycle_slot);
 
 static int i3c_master_check_ops(const struct i3c_master_controller_ops *ops)
 {
-	if (!ops || !ops->bus_init ||
+	if (!ops || !ops->bus_init || !ops->i3c_xfers ||
 	    !ops->send_ccc_cmd || !ops->do_daa || !ops->i2c_xfers)
 		return -EINVAL;
 
-	/* Must provide one of priv_xfers (SDR only) or i3c_xfers (all modes) */
-	if (!ops->priv_xfers && !ops->i3c_xfers)
-		return -EINVAL;
-
 	if (ops->request_ibi &&
 	    (!ops->enable_ibi || !ops->disable_ibi || !ops->free_ibi ||
 	     !ops->recycle_ibi_slot))
@@ -3031,13 +3027,7 @@ int i3c_dev_do_xfers_locked(struct i3c_dev_desc *dev, struct i3c_xfer *xfers,
 	if (mode != I3C_SDR && !(master->this->info.hdr_cap & BIT(mode)))
 		return -EOPNOTSUPP;
 
-	if (master->ops->i3c_xfers)
-		return master->ops->i3c_xfers(dev, xfers, nxfers, mode);
-
-	if (mode != I3C_SDR)
-		return -EINVAL;
-
-	return master->ops->priv_xfers(dev, xfers, nxfers);
+	return master->ops->i3c_xfers(dev, xfers, nxfers, mode);
 }
 
 int i3c_dev_disable_ibi_locked(struct i3c_dev_desc *dev)
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 2fd850f4678b..58d01ed4cce7 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -417,12 +417,8 @@ struct i3c_bus {
  *		      all CCC commands are supported.
  * @send_ccc_cmd: send a CCC command
  *		  This method is mandatory.
- * @priv_xfers: do one or several private I3C SDR transfers
- *		This method is mandatory when i3c_xfers is not implemented. It
- *		is deprecated.
- * @i3c_xfers: do one or several I3C SDR or HDR transfers
- *	       This method is mandatory when priv_xfers is not implemented but
- *	       should be implemented instead of priv_xfers.
+ * @i3c_xfers: do one or several I3C SDR or HDR transfers.
+ *	       This method is mandatory.
  * @attach_i2c_dev: called every time an I2C device is attached to the bus.
  *		    This is a good place to attach master controller specific
  *		    data to I2C devices.
@@ -478,10 +474,6 @@ struct i3c_master_controller_ops {
 				 const struct i3c_ccc_cmd *cmd);
 	int (*send_ccc_cmd)(struct i3c_master_controller *master,
 			    struct i3c_ccc_cmd *cmd);
-	/* Deprecated, please use i3c_xfers() */
-	int (*priv_xfers)(struct i3c_dev_desc *dev,
-			  struct i3c_priv_xfer *xfers,
-			  int nxfers);
 	int (*i3c_xfers)(struct i3c_dev_desc *dev,
 			 struct i3c_xfer *xfers,
 			 int nxfers, enum i3c_xfer_mode mode);
-- 
cgit v1.2.3


From d2ea4d254d04a89e17504af0230c7268e3cac6bf Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sat, 13 Dec 2025 08:45:23 +0100
Subject: file: ensure cleanup

Brown paper bag time. This is a silly oversight where I forgot to drop
the error condition check in the destructor; dropping it ensures we
clean up on early error returns. I have an internal set of unit tests
coming up for this which will catch all such issues going forward.

Reported-by: Chris Mason <clm@fb.com>
Reported-by: Jeff Layton <jlayton@kernel.org>
Fixes: 011703a9acd7 ("file: add FD_{ADD,PREPARE}()")
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/file.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/file.h b/include/linux/file.h
index cf389fde9bc2..27484b444d31 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -161,12 +161,10 @@ typedef struct fd_prepare class_fd_prepare_t;
 /* Do not use directly. */
 static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf)
 {
-	if (unlikely(fdf->err)) {
-		if (likely(fdf->__fd >= 0))
-			put_unused_fd(fdf->__fd);
-		if (unlikely(!IS_ERR_OR_NULL(fdf->__file)))
-			fput(fdf->__file);
-	}
+	if (unlikely(fdf->__fd >= 0))
+		put_unused_fd(fdf->__fd);
+	if (unlikely(!IS_ERR_OR_NULL(fdf->__file)))
+		fput(fdf->__file);
 }
 
 /* Do not use directly. */
@@ -230,7 +228,8 @@ static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf)
 		VFS_WARN_ON_ONCE(fdp->__fd < 0);               \
 		VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \
 		fd_install(fdp->__fd, fdp->__file);            \
-		fdp->__fd;                                     \
+		retain_and_null_ptr(fdp->__file);              \
+		take_fd(fdp->__fd);                            \
 	})
 
 /* Do not use directly. */
-- 
cgit v1.2.3